CUDA에서 불법적 인 메모리 액세스를 피하는 방법

-1

CUDA에서 메모리 액세스 문제가 발생했습니다. 내 코드의 핵심 my_array 배열 장치 어레이의 주소CUDA에서 불법적 인 메모리 액세스를 피하는 방법

long long addr0,addr1; 
addr0=(long long)my_array; 
addr1 = (addr0^(1 << position)); 
long long *r_addr0, *r_addr1; 
r_addr0 = (long long *)addr0; 
r_addr1 = (long long *)addr1; 
i = *r_addr0; 
j = *r_addr1;

이다.

난 그때 하나 r_addr0 하나의 비트 플립 r_addr0에 my_array 배열의 주소를 저장한다. 예 :

0000 0000 1011 0000 0011 1111 1110 0000 0000 0000 0000 0 addr of my_array 0000 0000 1011 0000 0011 1111 1110 0000 0000 0000 0001 1 flip last bit 0000 0000 1011 0100 0011 1111 1110 0000 0000 0000 0000 31 flip 31 bit.

나는 r_addr0 및 r_addr1마다의 주소를 인쇄하고, 첫 번째 31 비트 잘 작동하지만, 난 32 비트 후 불법 메모리 주소 문제가 발생했습니다. 나는 Tesla K80을 사용하고 있는데, 12GB 메모리가 장착되어 있습니다.

사람이

전체 코드는 아래를 참조 CUDA 불법 메모리 액세스를 방지하는 방법을 알고 있나요 :

# include <stdio.h> # include <stdint.h> # include "cuda_runtime.h" //compile nvcc test.cu -o test __global__ void global_latency (int * my_array, int position, int *d_time); int row_bits(int * h_a, long long N, int pos, int * h_time); int main(){ cudaSetDevice(0); long long i, N; int *h_a; int h_time0; int h_time1; int *h_time; N = 2*1024*1024*1024L;//2G elements, 4 bytes per element, 8 GB memory used. printf("\n=====%10.4f GB array with %d GB elements,discover row bits====\n", sizeof(int)*(float)N/1024/1024/1024,N/1024/1024/1024); /* allocate arrays on CPU */ h_a = (int *)malloc(sizeof(int) * N); h_time = (int *)malloc(sizeof(int)*N); /* initialize array elements*/ for (i=0L; i<N; i++){ h_a[i] = i%(1024*1024); } for (int k=0;k<2;k++){ h_time[k]=0; } printf("... ... ...\n... ... ...\n"); for (int pos = 0; pos < 64; pos++){ h_time0=0; h_time1=0; for (int j=0;j<5;j++){ row_bits(h_a,N,pos,h_time); h_time0 +=h_time[0]; h_time1 +=h_time[1]; } printf("position = %d, time0 = %d, time1 = %d\n", pos+1,h_time0/5, h_time1/5); } printf("===============================================\n\n"); free(h_a); return 0; } int row_bits(int * h_a, long long N, int pos, int * h_time) { cudaError_t error_id; int *d_a; /* allocate arrays on GPU */ error_id = cudaMalloc ((void **) &d_a, sizeof(int) * N); if (error_id != cudaSuccess) { printf("Error 1.0 is %s\n", cudaGetErrorString(error_id)); } /* copy array elements from CPU to GPU */ error_id = cudaMemcpy(d_a, h_a, sizeof(int) * N, cudaMemcpyHostToDevice); if (error_id != cudaSuccess) { printf("Error 1.1 is %s\n", cudaGetErrorString(error_id)); } //int *h_time = (int *)malloc(sizeof(int)); int *d_time; error_id = cudaMalloc ((void **) &d_time, 4*sizeof(int)); if (error_id != cudaSuccess) printf("Error 1.2 is %s\n", cudaGetErrorString(error_id)); cudaThreadSynchronize(); /* launch kernel*/ dim3 Db = dim3(1); dim3 Dg = dim3(1,1,1); global_latency <<<Dg, Db>>>(d_a, pos,d_time); cudaThreadSynchronize(); error_id = cudaGetLastError(); if (error_id != cudaSuccess) { printf("Error kernel is %s\n", cudaGetErrorString(error_id)); } /* copy results from GPU to CPU */ cudaThreadSynchronize(); error_id = cudaMemcpy((void *)h_time, (void *)d_time, 4*sizeof(int), cudaMemcpyDeviceToHost); if (error_id != cudaSuccess) { printf("Error 2.0 is %s\n", cudaGetErrorString(error_id)); } cudaThreadSynchronize(); /* free memory on GPU */ cudaFree(d_a); cudaFree(d_time); cudaDeviceReset(); return 0; } __global__ void global_latency (int * my_array, int position, int *d_time) { //int tid = blockIdx.x*blockDim.x+threadIdx.x; int start_time=0; int mid_time=0; int end_time=0; __shared__ int s_tvalue[2];//2: number of threads per block int i, j; s_tvalue[0]=0; s_tvalue[1]=0; long long addr0,addr1; //printf("%p\n",my_array); //int * p = (int *)0x0; //addr0 = (long long)p; addr0=(long long)my_array; //printf("Address i :%p\n",addr0); addr1 = (addr0^(1 << position)); //printf("Address i':%p\n",addr1); //start_time = clock(); long long *r_addr0, *r_addr1; r_addr0 = (long long *)addr0; r_addr1 = (long long *)addr1; start_time = clock(); i = *r_addr0; s_tvalue[0] = i; mid_time = clock(); j = *r_addr1; s_tvalue[1] = j; //printf("%p",p); //k =(int)p; //printf("%d\n",k); //printf("%d",k); //__syncthreads(); end_time = clock(); d_time[0] = mid_time-start_time; d_time[1] = end_time-mid_time; d_time[2] = s_tvalue[0]; //printf("[%p]=%lld\n",addr0,d_time[1]); d_time[3] = s_tvalue[1]; //printf("[%p]=%lld\n",addr1,d_time[2]); }

출처

2016-06-09 Steven Huang

position=0 및 원래 주소의 비트 0이 0, 사용자가 설정하려는

j=*(int*)&(((char*)my_array)[1]);

이는 t의 4 바이트 정렬 그는 int을 입력합니다. 그러면 프로그램이 중단됩니다.

position=3

및 원래 주소 비트 3 당신이 읽으려는 주소가 my_array 전에 어디

j=*(int*)&(((char*)my_array)[-8]);

을 설정하려고, 한 말씀이다. 확실히 불법적 인 메모리 액세스입니다. 실제로 원래 1과 같은 비트를 대칭 이동하는 것은 음의 배열 인덱스를 의미합니다.

또한 당신은 더 나은 당신이 부호 비트와 오버 플로우 문제에 의해 방해되지 않도록 1 << position 대신 unsigned long long 또는 대신 long long의 size_t 및 1ull << position을 사용하십시오.

출처

2016-06-09 19:28:34 kangshiyin

my_array의 주소는 0xb03fe0000입니다. 위치 1은 0xb03fe0001이고 위치 2는 0xb03fe0002, 입니다. 위치 3의 주소는 0xb03fe0004입니다. 이 주소는 아직 my_arryay보다 앞서 있습니다. 코드를 실행하면 "32"위치에 도달하면 주소가 더 이상 변경되지 않고 불법 메모리 주소 액세스가 발생한다고 모든 것이 맞습니다. –

@StevenHuang 원래 1과 같은 비트를 뒤집는 것은 음의 배열 인덱스를 의미합니다. – kangshiyin

@StevenHuang은 '1ull'을 사용합니다. '더 이상 변하지 않습니다'는 넘쳐 흐른 것처럼 보입니다. – kangshiyin

CUDA에서 불법적 인 메모리 액세스를 피하는 방법

답변

관련 문제