전에 cuda-memcheck
을 추가하지 않으면 경고 나 오류 메시지없이 프로그램이 실행되지만 추가 할 때 다음과 같은 오류 메시지가 표시되는 것은 이상한 일입니다. . 여기 'cuda-memcheck'with cuda 8.0
========= Invalid __global__ write of size 8
========= at 0x00000120 in initCurand(curandStateXORWOW*, unsigned long)
========= by thread (9,0,0) in block (3,0,0)
========= Address 0x5005413b0 is out of bounds
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 (cuLaunchKernel + 0x2c5) [0x204115]
========= Host Frame:./main [0x18e11]
========= Host Frame:./main [0x369b3]
========= Host Frame:./main [0x3403]
========= Host Frame:./main [0x308c]
========= Host Frame:./main [0x30b7]
========= Host Frame:./main [0x2ebb]
========= Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xf0) [0x20830]
내 기능, 코드에 대한 간략한 소개, 난 임의의 숫자를 생성하는 장치 변수
weights
에 저장 한 후 개별 번호에서 샘플이 벡터를 사용하려고합니다.
#include<iostream>
#include<curand.h>
#include<curand_kernel.h>
#include<time.h>
using namespace std;
#define num 100
__device__ float weights[num];
// function to define seed
__global__ void initCurand(curandState *state, unsigned long seed){
int idx = threadIdx.x + blockIdx.x * blockDim.x;
curand_init(seed, idx, 0, &state[idx]);
}
__device__ void sampling(float *weight, float max_weight, int *index, curandState *state){
int j;
float u;
do{
j = (int)(curand_uniform(state) * (num + 0.999999));
u = curand_uniform(state); //sample from uniform distribution;
}while(u > weight[j]/max_weight);
*index = j;
}
__global__ void test(int *dev_sample, curandState *state){
int idx = threadIdx.x + blockIdx.x * blockDim.x;\
// generate random numbers from uniform distribution and save them to weights
weights[idx] = curand_uniform(&state[idx]);
// run sampling function, in which, weights is an input for the function on each thread
sampling(weights, 1, dev_sample+idx, &state[idx]);
}
int main(){
// define the seed of random generator
curandState *devState;
cudaMalloc((void**)&devState, num*sizeof(curandState));
int *h_sample;
h_sample = (int*) malloc(num*sizeof(int));
int *d_sample;
cudaMalloc((void**)&d_sample, num*sizeof(float));
initCurand<<<(int)num/32 + 1, 32>>>(devState, 1);
test<<<(int)num/32 + 1, 32>>>(d_sample, devState);
cudaMemcpy(h_sample, d_sample, num*sizeof(float), cudaMemcpyDeviceToHost);
for (int i = 0; i < num; ++i)
{
cout << *(h_sample + i) << endl;
}
//free memory
cudaFree(devState);
free(h_sample);
cudaFree(d_sample);
return 0;
}
글로벌 메모리에 액세스하는 방법이 올바르지 않다면 쿠다를 배우기 시작하십시오. 저를 도와주세요. 감사