CUDA에서 어떻게 디바이스에서 호스트로 메모리를 올바르게 복사합니까?

-1

CUDA에서 병렬로 몇 개의 행렬 값을 증가시키고 주 메모리에 다시 복사하려고합니다. 그러나 일단 스레드 함수가 반환되면 그들을 인쇄 할 때 값이 동일합니다. 심지어 단 하나의 스레드로 프로그램을 실행 해 보았지만 운이 없었습니다. 어떤 도움이라도 대단히 감사하겠습니다.CUDA에서 어떻게 디바이스에서 호스트로 메모리를 올바르게 복사합니까?

내 코드 :

#include <stdio.h> 
#include <stdlib.h> 
#include <math.h> 
#include <sys/time.h> 
#include <cuda.h> 

#define BLOCK_SIZE 1024 
#define MAX_N  100000000 
#define MAX_THREADS  1024 

int num_threads; 
int count;    // Count of threads that have updated their partition 
int size; 
//int increment; // VS 
int * inc2; 
//int my_start; 


//Host data 
int * thread_ids; 

//nvcc -arch=sm_20 -o nbody.exe nbody.cu (compilation) 

__global__ void pcyc_red(float * a, float * b, float * c, float * D, float * X, 
        float * a2, float * b2, float * c2, float * D2, 
        int * inc2_dev, int * size_dev, int * num_threads_dev){ 

//__threadfence(); 
int thread_id = threadIdx.x + (blockIdx.x * blockDim.x); 
float k1; 
float k2; 
int i; 

int start = 0; 
//int end = size_dev-1; 
//int inc2_dev = inc2_dev1[0]; 
//int inc_dev = *inc_dev1; 
//int size_dev = size_dev1[0]; 
int nthreads = num_threads_dev[0]; 
//Thread work assignment 
int chunk_size = size_dev[0]/nthreads; 
int my_start = thread_id*(chunk_size); 
int my_end = start + ((thread_id + 1)*chunk_size - 1); 
//__threadfence(); 
__syncthreads(); 
//Forward Reduction 
for(i = my_start; i <= my_end; ++i){ 
    a[i] = a[i]++; 
    b[i] = b[i]++; 
    c[i] = c[i]++; 
    D[i] = D[i]++; 
    X[i] = X[i]++; 
} 

__threadfence(); 
//__syncthreads(); 
}//Device Function 


float* init_vector(int size){ 
float* output; 
output = (float*) calloc(size, sizeof(float)); 
int i; 
for(i = 0; i < size; ++i){ 
    output[i] = 2.0; 
} 
return output; 
} 

float* init_vector_ac(int s){ 
//s will be used for size-1 not to be confused for size. 
float* output; 
output = (float*) calloc(s, sizeof(float)); 
int i; 
for(i = 0; i < s; ++i){ 
    output[i] = -1.0; 
} 
return output; 
} 

// Main program 
int main(int argc, char *argv[]) { 

//num_threads -> atoi(argv[argc-1]); 
//struct timeval start, stop; 
float total_time; 
int i; 

///Host structures 
float* a; 
float* b; 
float* c; 
float* D; 
float* X; 

//increment = 2; // VS 
inc2 = (int*) malloc(sizeof(int)); 
inc2[0] = 1; 
//size = (int*) malloc(sizeof(int)); 
//num_threads = (int*) malloc(sizeof(int)); 
//my_start = 0; 
//wait_flag = false; 

///Device Data 
//SYSTEM * sys_dev; 
float * a_dev; 
float * b_dev; 
float * c_dev; 
float * D_dev; 
float * X_dev; 

float * a2_dev; 
float * b2_dev; 
float * c2_dev; 
float * D2_dev; 
//float * X2_dev; 

//int * inc_dev; 
int * inc2_dev; 
//int * mstart_dev; 
int * size_dev; 
int * num_threads_dev; 
int result_var; 

//int final_inc2; 

cudaEvent_t start, stop; // GPU timing variables 
//struct timeval cpu_start, cpu_stop; // CPU timing variables 
    // float time_array[10]; 

// Timing initializations 
cudaEventCreate(&start); 
cudaEventCreate(&stop); 

if (argc != 3) 
{ 
    printf("Use: <executable_name> <size> <num_threads>\n"); 
    exit(0); 
} 
if ((size = atoi(argv[argc-2])) > MAX_N) 
{ 
    printf("Maximum number of nodes allowed: %d\n", MAX_N); 
    exit(0); 
}; 

if ((num_threads = atoi(argv[argc-1])) > MAX_THREADS) 
{ 
    printf("Maximum number of threads allowed: %d.\n", MAX_THREADS); 
    exit(0); 
}; 

int size_array = (size) * sizeof(float); 
int size_array2 = (size - 1) * sizeof(float); 

// Initialize host tridiagonal matrix 
a = init_vector_ac(size-1); // a[i] = -1.0 
b = init_vector(size);  // b[i] = 2.0 
c = init_vector_ac(size-1); // c[i] = -1.0 
D = init_vector(size);  // D[i] = 2.0 
X = init_vector(size);  // X[i] = 2.0 

//xs = init_vector_err(size); 

// Shift elements of a by 1 
for(i = size-1; i > 0; i--) a[i] = a[i-1]; 
a[0] = 0.0; 


thread_ids = (int*) calloc(num_threads, sizeof(int)); 

count = 0; 

for(i = 0; i < num_threads; ++i){ 
    thread_ids[i] = i; 
} 
//Cuda Operation 

cudaEventRecord(start, 0); 

cudaMalloc((void **) &a_dev, size); 
cudaMalloc((void **) &b_dev, size); 
cudaMalloc((void **) &c_dev, size); 
cudaMalloc((void **) &D_dev, size); 
cudaMalloc((void **) &X_dev, size); 
cudaMalloc((void **) &a2_dev, size); 
cudaMalloc((void **) &b2_dev, size); 
cudaMalloc((void **) &c2_dev, size); 
cudaMalloc((void **) &D2_dev, size); 
//cudaMalloc((void**)&inc_dev, sizeof(int)); 
cudaMalloc((void**)&inc2_dev, sizeof(int)); 
//cudaMalloc((void**)&mstart_dev, sizeof(int)); 
cudaMalloc((void**)&size_dev, sizeof(int)); 
cudaMalloc((void**)&num_threads_dev, sizeof(int)); 


cudaMemcpy(a_dev, a, size_array2, cudaMemcpyHostToDevice); 
cudaMemcpy(b_dev, b, size_array, cudaMemcpyHostToDevice); 
cudaMemcpy(c_dev, c, size_array2, cudaMemcpyHostToDevice); 
cudaMemcpy(D_dev, D, size_array, cudaMemcpyHostToDevice); 
cudaMemcpy(X_dev, X, size_array, cudaMemcpyHostToDevice); 
cudaMemcpy(a2_dev, a, size_array2, cudaMemcpyHostToDevice); 
cudaMemcpy(b2_dev, b, size_array, cudaMemcpyHostToDevice); 
cudaMemcpy(c2_dev, c, size_array2, cudaMemcpyHostToDevice); 
cudaMemcpy(D2_dev, D, size_array, cudaMemcpyHostToDevice); 

//cudaMemcpy(inc_dev, &increment, sizeof(int), cudaMemcpyHostToDevice); 
cudaMemcpy(inc2_dev, inc2, sizeof(int), cudaMemcpyHostToDevice); 
//cudaMemcpy(mstart_dev, &my_start, sizeof(int), cudaMemcpyHostToDevice); 
cudaMemcpy(size_dev, &size, sizeof(int), cudaMemcpyHostToDevice); 
cudaMemcpy(num_threads_dev, &num_threads, sizeof(int), cudaMemcpyHostToDevice); 

cudaDeviceSynchronize(); 
pcyc_red<<<1, num_threads>>>(a_dev, b_dev, c_dev, D_dev, X_dev, 
          a2_dev, b2_dev, c2_dev, D2_dev, 
          inc2_dev, size_dev, num_threads_dev); 
cudaDeviceSynchronize(); 

cudaMemcpy(X, X_dev, size_array, cudaMemcpyDeviceToHost); 
cudaMemcpy(a, a_dev, size_array, cudaMemcpyDeviceToHost); 
cudaMemcpy(b, b_dev, size_array, cudaMemcpyDeviceToHost); 
cudaMemcpy(c, c_dev, size_array, cudaMemcpyDeviceToHost); 
cudaMemcpy(D, D_dev, size_array, cudaMemcpyDeviceToHost); 
cudaMemcpy(inc2, inc2_dev, sizeof(int), cudaMemcpyDeviceToHost); 
cudaMemcpy(&result_var, num_threads_dev, sizeof(int), cudaMemcpyDeviceToHost); 
cudaDeviceSynchronize(); 
cudaEventRecord(stop, 0); 
cudaEventSynchronize(stop); 
cudaEventElapsedTime(&total_time, start, stop); 

printf("Final Var: %d\n\n", inc2[0]); 
printf("Num Threads Var: %d\n\n", result_var); 

for(i = 0; i < size; ++i){ 
    printf("a=%8.4f \n", a[i]); 
    printf("b=%8.4f \n", b[i]); 
    printf("c=%8.4f \n", c[i]); 
    printf("D=%8.4f \n", D[i]); 
    printf("X=%8.4f \n", X[i]); 
} 

printf("Threads = %d, matrix_size = %d, time = %f\n", 
    num_threads, size, total_time); 

cudaFree(a_dev); 
cudaFree(b_dev); 
cudaFree(c_dev); 
cudaFree(D_dev); 
cudaFree(X_dev); 
//cudaFree(inc_dev); 
cudaFree(inc2_dev); 
//cudaFree(mstart_dev); 
//cudaFree(size_dev); 
//cudaFree(num_threads_dev); 

}//end of main

출처

2014-10-01 HarishV

[적절한 cuda 오류 검사]를 추가하여 시작하십시오 (http://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime -api)를 코드에 추가합니다. 'cuda-memcheck'를 사용하여 코드를 실행하여보고 내용을 볼 수도 있습니다. 그 후 누군가가 아무것도 추가하거나 변경하지 않고 복사, 붙여 넣기, 컴파일 및 실행할 수있는 도움이 필요하면 [post a complete code] (http://stackoverflow.com/help/mcve). –

전체 코드를 추가했습니다. – HarishV

memcheck를 실행할 때 "No Cuda-Memcheck results found"가 표시됩니다. – HarishV

이 코드에 proper cuda error checking를 추가합니다.

하나의 문제점은 할당 크기가 배열 크기와 일치하지 않는다는 것입니다. 단 하나의 예를 선택하십시오 :

int size_array = (size) * sizeof(float); 
... 
cudaMalloc((void **) &b_dev, size); // size should probably be size_array here 
...       ^^^^ 
cudaMemcpy(b_dev, b, size_array, cudaMemcpyHostToDevice); // this won't work, will throw error 
        ^^^^^^^^^^

위의 내용은 분명히 오류이며,이 코드에는 여러 가지 유형이 있습니다. 또한 오류 검사에 표시되는 컴퓨터 구성 문제 (CUDA가 제대로 설치되지 않은 경우 등)가있을 수 있습니다.

출처

2014-10-01 19:15:48

내 작업을 계속 제출 한 서버 노드에는 GPU 장치가 없습니다. 그러나 이것이 정말로 도움이되었습니다. – HarishV

CUDA에서 어떻게 디바이스에서 호스트로 메모리를 올바르게 복사합니까?

답변

관련 문제