Cuda call은 사양에 관계없이 블록 당 8 개 이상의 스레드를 할당하지 않습니다.

C++에서 Eratosthenes의 Sieve 병렬 버전을 만들고 있습니다. 문제는 내 커널 호출 (reduce0)이 지정한 256 대신 블록 당 8 개의 쓰레드를 할당하는 것 같다. 첫 번째 CUDA 버전도 블록 당 512 개의 스레드를 허용하기 때문에 코드에 오류가 있어야합니다. 어떤 도움을 주시면 감사하겠습니다.Cuda call은 사양에 관계없이 블록 당 8 개 이상의 스레드를 할당하지 않습니다.

#include <iostream> 
#include <stdlib.h> 
#include <math.h> 
#include <time.h> 
#include <cutil.h> 
//#include <sieve_kernel.cu> 
using namespace std; 

//////////////////////////////////////////////////// 
int psum(int arg[], double n); 
int call_kernel(int primes[], int n); 
int findsmallest(int arg[], int f, double n); 
int sieve(int n); 
__global__ void reduce0(int *g_idata, int *g_odata); 

//////////////////////////////////////////////////// 
int main(){ 
    int n = pow((double) 2, 8); 
    int total = sieve(n); 
    cout << "# primes" << endl << total << endl; 
    return 0; 
} 
/////////////////////////////////////////////////// 

__global__ void reduce0(int *g_idata, int *g_odata) { 
extern __shared__ int sdata[]; 

// each thread loads one element from global to shared mem 
unsigned int tid = threadIdx.x; 
unsigned int i = blockIdx.x*blockDim.x + threadIdx.x; 
sdata[tid] = g_idata[i]; 
__syncthreads(); 

// do reduction in shared mem 
for (int s = 1; s < blockDim.x; s *= 2) { // step = s x 2 
    if (tid % (s*2) == 0) { // only threadIDs divisible by the step participate 
     sdata[tid] += sdata[tid + s]; 
    } 
    __syncthreads(); 
} 

// write result for this block to global mem 
if (tid == 0) g_odata[blockIdx.x] = sdata[0]; 
} 

///////////////////////////////////////////////////// 

int call_kernel(int *primes, int n){ 
    // Allocate and copy device arrays 
    int *g_idevice; 
    int *g_odevice; 
    int size = n * sizeof(int); 
    cudaMalloc(&g_idevice, size); 
    cudaMemcpy(g_idevice, primes, size, cudaMemcpyHostToDevice); 
    cudaMalloc(&g_odevice, size); 

    // Specify grid/block dimenstions and invoke the kernel 
    dim3 dimGrid(1,1); 
    dim3 dimBlock(256,1); 
    reduce0<<<dimGrid, dimBlock>>>(g_idevice, g_odevice); 

    // Copy device data back to primes 
    cudaMemcpy(primes, g_odevice, size, cudaMemcpyDeviceToHost); 

    //for (int i = 0; i < n; i++) { 
    // cout << i << " " << primes[i] << endl; 
    //} 
    int total = primes[0]; 
    cudaFree(g_idevice); 
    cudaFree(g_odevice); 
    return total; 


} 
///////////////////////////////////////////////////////////////////// 
int findsmallest(int arg[], int f, double n){ 
    int i = f; 
    while(arg[i]!= 1 && i < n) { 
     i++; 
    } 
    return i; 
} 
////////////////////////////////////////////////////////////////////// 
int psum(int arg[], double n){ 
    int total = 0; 
    int i = 2; 
    while(i < n){ 
     if(arg[i] == 1){ 
     total = total + 1; 
     } 
     i++; 
    } 
    return total; 
} 
///////////////////////////////////////////////////////////////////////// 
int sieve(int n){ 
    int* primes = NULL; 
    int mult = 0; 
    int k = 2; 
    int i; int total; 
    //primes = new int[n]; 
    primes = new int[256]; 
    for(i = 0; i < n; i++){ 
     primes[i] = 1; 
    } 
    primes[0] = primes[1] = 0; 

    while (k * k < n){ 
     mult = k * k; 
     while (mult < n) { 
      primes[mult] = 0; 
      mult = mult + k; 
     } 
     k = findsmallest(primes,k+1, n); 
    } 
    total = call_kernel(primes, n); 
    //delete [] primes; 
    //primes = NULL; 
    return total; 
}

출처

2011-04-24 zetatr

커널에 동적으로 할당 된 공유 메모리를 사용하지만, 결과는 커널이 때문에 해당 공유 메모리 버퍼에 불법 메모리 작업의 중단됩니다 그래서 커널 출시, 어떤 할당을 포함하지 않습니다. 당신은 다음과 같이 call_kernel의이 부분을 수정하면 작동 찾아야한다 :

// Specify grid/block dimenstions and invoke the kernel 
dim3 dimGrid(1,1); 
dim3 dimBlock(256,1); 
size_t shmsize = size_t(dimBlock.x * dimBlock.y * dimBlock.z) * sizeof(int); 
reduce0<<<dimGrid, dimBlock, shmsize>>>(g_idevice, g_odevice);

당신의이 있다면 포함 아마도 다음과 같이 함수 호출 주위에 검사를 몇 가지 기본적인 오류 :

reduce0<<<dimGrid, dimBlock>>>(g_idevice, g_odevice); 
if (cudaPeekAtLastError() != cudaSuccess) { 
    cout << "kernel launch error: " << cudaGetErrorString(cudaGetLastError()) << endl; 
} 

// Copy device data back to primes 
cudaError_t err = cudaMemcpy(primes, g_odevice, size, cudaMemcpyDeviceToHost); 
if (err != cudaSuccess) { 
    cout << "CUDA error: " << cudaGetErrorString(err) << endl; 
}

는 것 커널 실행 또는 실행이 오류로 인해 실패했음을 즉시 알게되었습니다.

출처

2011-04-24 07:20:05 talonmies

Cuda call은 사양에 관계없이 블록 당 8 개 이상의 스레드를 할당하지 않습니다.

답변

관련 문제