2014-04-21 3 views

"CUDA Fortran for Scientists and Engineers"에 실린 코드를 실행하려고 합니다. 그런데 실행 중에 세그멘테이션 오류가 발생하며, 그 원인을 이해할 수 없습니다. (제목: CUDA Fortran MPI_Sendrecv 세그멘테이션 오류)

[mpi_rank_0][error_sighandler] Caught error: Segmentation fault (signal 11) 
[mpi_rank_1][error_sighandler] Caught error: Segmentation fault (signal 11) 
[mpi_rank_2][error_sighandler] Caught error: Segmentation fault (signal 11) 
[mpi_rank_3][error_sighandler] Caught error: Segmentation fault (signal 11) 

내 시스템은 64비트 리눅스이고 PGI 컴파일러를 사용합니다. CUDA 드라이버 버전은 4.0이며, 아래 코드는 책에서 가져온 것입니다. 컴파일은 되지만 MPI_Sendrecv가 작동하지 않는 것 같습니다. MVAPICH2 1.8이 설치되어 있습니다. MPI_Sendrecv 호출을 제거하면 코드는 실행됩니다. 컴파일에는 다음 명령을 사용했습니다.

/usr/.../mvapich/bin/mpif90 filename.cuf 


수정: 제안에 따라 -C 옵션을 추가하여 빌드했더니, 컴파일이 다음과 같이

pgfortran-Info-Switch -Mvect -fast forces -O2 
PGF90-S-0155-Kernel region ignored; see -Minfo messages (transposeMVA.cuf: 188) 
    140, Loop not vectorized/parallelized: contains call 
    146, Loop not vectorized/parallelized: contains call 
    157, Loop not vectorized/parallelized: contains call 
    190, Accelerator restriction: function/procedure calls are not supported 
     Loop not vectorized/parallelized: contains call 
    191, Accelerator restriction: function/procedure calls are not supported 
    217, all reduction inlined 
     Loop not vectorized/parallelized: contains call 
    0 inform, 0 warnings, 1 severes, 0 fatal for transposempi 

실패합니다. -C 옵션을 빼면 컴파일은 통과하지만 실행 결과는 여전히 동일합니다.

/mpif90 -g -O0 -Minfo transposeMVA.cuf  pgfortran-Info-Switch -Mvect -fast forces -O2 
    140, Generated vector sse code for the loop 
    146, Loop not vectorized: may not be beneficial 
     Unrolled inner loop 8 times 
    157, Memory copy idiom, loop replaced by call to __c_mcopy4 
    178, Loop not vectorized/parallelized: contains call 
    190, CUDA kernel generated 
     190, !$cuf kernel do <<< (*,*), (128,1) >>> 
    217, all reduction inlined 


!> Tiled matrix-transpose kernel used by the MPI transpose driver.
!!
!! Exposes two tuning constants and one CUDA Fortran kernel:
!!   cudaTileDim - edge length of the square tile handled by one thread block
!!   blockRows   - number of tile columns processed per loop step; the kernel
!!                 is written for a thread block of (cudaTileDim, blockRows)
module transpose_m

    implicit none
    integer, parameter :: cudaTileDim = 32
    integer, parameter :: blockRows = 8

contains

    !> Transpose one cudaTileDim x cudaTileDim tile per thread block.
    !!
    !! Each block reads its tile of idata into shared memory, synchronizes,
    !! and writes the tile to the mirrored block position of odata with the
    !! two tile indices swapped.
    !!
    !! odata : output array, leading dimension ldo (assumed size)
    !! ldo   : leading dimension of odata (passed by value)
    !! idata : input array, leading dimension ldi (assumed size)
    !! ldi   : leading dimension of idata (passed by value)
    !!
    !! The kernel performs no bounds checks, so the launch grid has to cover
    !! the region exactly in whole tiles.
    attributes(global) &
     subroutine cudaTranspose(odata, ldo, idata, ldi)
    ! The leading dimensions are declared before the arrays whose bounds
    ! reference them, as required under implicit none.
    integer, value, intent(in) :: ldo, ldi
    real, intent(out) :: odata(ldo,*)
    real, intent(in) :: idata(ldi,*)
    ! One extra element in the first dimension pads the tile
    ! (presumably to avoid shared-memory bank conflicts - TODO confirm).
    real, shared :: tile(cudaTileDim+1, cudaTileDim)
    integer :: x, y, j

    ! Global indices of this thread's element in the input tile.
    x = (blockIdx%x-1) * cudaTileDim + threadIdx%x
    y = (blockIdx%y-1) * cudaTileDim + threadIdx%y

    ! Each thread loads cudaTileDim/blockRows elements of the tile.
    do j = 0, cudaTileDim-1, blockRows
     tile(threadIdx%x, threadIdx%y+j) = idata(x,y+j)
    end do

    ! The whole tile must be loaded before any thread reads it transposed.
    call syncthreads()

    ! Swap the block indices to address the mirrored output tile.
    x = (blockIdx%y-1) * cudaTileDim + threadIdx%x
    y = (blockIdx%x-1) * cudaTileDim + threadIdx%y

    do j = 0, cudaTileDim-1, blockRows
     odata(x,y+j) = tile(threadIdx%y+j, threadIdx%x)
    end do
    end subroutine cudaTranspose

end module transpose_m

! Main code 

!> Distributed transpose of an nx x ny single-precision matrix over nProcs
!! MPI ranks, one GPU per rank.
!!
!! Each rank holds a slab of nx x mpiTileDimY input elements on its device.
!! Stage 0 transposes the rank's own diagonal tile locally. Each of the
!! remaining nProcs-1 stages packs one tile, exchanges it with a partner rank
!! through MPI_SENDRECV, and transposes the received tile into the output
!! slab. The result is checked against a host-side transpose() and rank 0
!! reports the effective bandwidth.
program transposeMPI
    use cudafor
    use mpi
    use transpose_m

    implicit none

    ! global array size
    integer, parameter :: nx = 2048, ny = 2048

    ! host arrays (global)
    real :: h_idata(nx,ny), h_tdata(ny,nx), gold(ny,nx)

    ! CUDA vars and device arrays
    type (dim3) :: dimGrid, dimBlock
    real, device, allocatable :: &
     d_idata(:,:), d_tdata(:,:), d_sTile(:,:), d_rTile(:,:)

    ! MPI stuff
    integer :: mpiTileDimX, mpiTileDimY
    integer :: myrank, nprocs, ierr, localRank
    integer :: stage, sRank, rRank
    integer :: status(MPI_STATUS_SIZE)
    real(8) :: timeStart, timeStop
    character (len=10) :: localRankStr
    integer :: envStat, readStat

    integer :: i, j, jl, jg, p
    integer :: xOffset, yOffset

    ! for MVAPICH set device before MPI initialization
    ! Fall back to device 0 when the launcher does not export the variable
    ! or its value cannot be parsed as an integer.

    localRank = 0
    call get_environment_variable('MV2_COMM_WORLD_LOCAL_RANK', &
     localRankStr, status=envStat)
    if (envStat == 0) then
     read(localRankStr,'(i10)', iostat=readStat) localRank
     if (readStat /= 0) localRank = 0
    end if
    ierr = cudaSetDevice(localRank)

    ! MPI initialization

    call MPI_init(ierr)
    call MPI_comm_rank(MPI_COMM_WORLD, myrank, ierr)
    call MPI_comm_size(MPI_COMM_WORLD, nProcs, ierr)

    ! check parameters and calculate execution configuration

    if (mod(nx,nProcs) == 0 .and. mod(ny,nProcs) == 0) then
     mpiTileDimX = nx/nProcs
     mpiTileDimY = ny/nProcs
    else
     write(*,*) 'ny must be an integral multiple of nProcs'
     call MPI_Finalize(ierr)
     stop
    end if

    if (mod(mpiTileDimX, cudaTileDim) /= 0 .or. &
     mod(mpiTileDimY, cudaTileDim) /= 0) then
     write(*,*) 'mpiTileDimX and mpitileDimY must be an ', &
      'integral multiple of cudaTileDim'
     call MPI_Finalize(ierr)
     stop
    end if

    if (mod(cudaTileDim, blockRows) /= 0) then
     write(*,*) 'cudaTileDim must be a multiple of blockRows'
     call MPI_Finalize(ierr)
     stop
    end if

    ! One thread block per cudaTileDim x cudaTileDim tile of an MPI tile.
    dimGrid = dim3(mpiTileDimX/cudaTileDim, &
     mpiTileDimY/cudaTileDim, 1)
    dimBlock = dim3(cudaTileDim, blockRows, 1)

    ! write parameters
    ! Long formats are built by concatenation so that no character literal
    ! is continued across a line break.

    if (myrank == 0) then
     write(*,"(/,'Array size: ', i0,'x',i0,/)") nx, ny

     write(*,"('CUDA block size: ', i0,'x',i0," // &
      "', CUDA tile size: ', i0,'x',i0)") &
      cudaTileDim, blockRows, cudaTileDim, cudaTileDim

     write(*,"('dimGrid: ', i0,'x',i0,'x',i0," // &
      "', dimBlock: ', i0,'x',i0,'x',i0,/)") &
      dimGrid%x, dimGrid%y, dimGrid%z, &
      dimBlock%x, dimBlock%y, dimBlock%z

     write(*,"('nprocs: ', i0, ', Local input array size: '," // &
      "i0,'x',i0)") nprocs, nx, mpiTileDimY
     write(*,"('mpiTileDim: ', i0,'x',i0,/)") &
      mpiTileDimX, mpiTileDimY
    end if

    ! initialize data

    ! host - each process has entire array on host (for now)

    do p = 0, nProcs-1
     do jl = 1, mpiTileDimY
      jg = p*mpiTileDimY + jl
      do i = 1, nx
       h_idata(i,jg) = i+(jg-1)*nx
      end do
     end do
    end do

    gold = transpose(h_idata)

    ! device - each process has
    ! nx*mpiTileDimY = ny*mpiTileDimX elements

    allocate(d_idata(nx, mpiTileDimY), &
     d_tdata(ny, mpiTileDimX), &
     d_sTile(mpiTileDimX,mpiTileDimY), &
     d_rTile(mpiTileDimX, mpiTileDimY))

    ! Copy this rank's slab of columns from the host array to the device.
    yOffset = myrank*mpiTileDimY
    d_idata(1:nx,1:mpiTileDimY) = &
     h_idata(1:nx,yOffset+1:yOffset+mpiTileDimY)

    ! Sentinel value, so output elements that are never written fail the check.
    d_tdata = -1.0

    ! ---------
    ! transpose
    ! ---------

    timeStart = MPI_Wtime()

    ! 0th stage - local transpose

    call cudaTranspose<<<dimGrid, dimBlock>>> &
     (d_tdata(myrank*mpiTileDimY+1,1), ny, &
     d_idata(myrank*mpiTileDimX+1,1), nx)

    ! other stages that involve MPI transfers

    do stage = 1, nProcs-1
     ! sRank = the rank to which myrank sends data
     ! rRank = the rank from which myrank receives data
     sRank = modulo(myrank-stage, nProcs)
     rRank = modulo(myrank+stage, nProcs)

     ! pack tile so data to be sent is contiguous

     !$cuf kernel do(2) <<<*,*>>>
     do j = 1, mpiTileDimY
      do i = 1, mpiTileDimX
       d_sTile(i,j) = d_idata(sRank*mpiTileDimX+i,j)
      end do
     end do

     ! NOTE: d_sTile and d_rTile are device arrays. Passing them straight to
     ! MPI requires an MPI library built with CUDA support (for MVAPICH2
     ! this presumably also means running with MV2_USE_CUDA=1 - confirm
     ! against the MVAPICH2 user guide). With a library that lacks it, this
     ! call dereferences device addresses on the host and segfaults; in that
     ! case stage the tiles through host buffers instead.
     call MPI_SENDRECV(d_sTile, mpiTileDimX*mpiTileDimY, &
      MPI_REAL, sRank, myrank, &
      d_rTile, mpiTileDimX*mpiTileDimY, MPI_REAL, &
      rRank, rRank, MPI_COMM_WORLD, status, ierr)

     ! do transpose from receive tile into final array
     ! (no need to unpack)

     call cudaTranspose<<<dimGrid, dimBlock>>> &
      (d_tdata(rRank*mpiTileDimY+1,1), ny, &
      d_rTile, mpiTileDimX)

    end do ! stage

    timeStop = MPI_Wtime()

    ! check results

    h_tdata = d_tdata

    xOffset = myrank*mpiTileDimX
    if (all(h_tdata(1:ny,1:mpiTileDimX) == &
     gold(1:ny, xOffset+1:xOffset+mpiTileDimX))) then
     if (myrank == 0) then
      ! Factor 2: every element is read once and written once (4 bytes each).
      write(*,"('Bandwidth (GB/s): ', f7.2,/)") &
       2.*(nx*ny*4)/(1.0e+9*(timeStop-timeStart))
     end if
    else
     write(*,"('[',i0,'] *** Failed ***',/)") myrank
    end if

    ! cleanup

    deallocate(d_idata, d_tdata, d_sTile, d_rTile)

    call MPI_Finalize(ierr)

end program transposeMPI

디버깅 옵션으로 컴파일 했습니까? 디버거를 사용해 보셨습니까? –


디버그 옵션을 추가하지 않았습니다. – Adjeiinfo


'-C -g -O0 -traceback'을 사용하여 컴파일을 시도하고, 다시 실행하고 출력을 게시하십시오. – milancurcic



다음과 같이 하면 작동합니다. 감사합니다, 로버트.

! Workaround fragment: exchange the tiles through h_sTile / h_rTile instead of
! passing the device arrays d_sTile / d_rTile to MPI directly.
! NOTE(review): h_sTile and h_rTile are not declared in the code shown;
! presumably host arrays shaped like the device tiles - confirm.
! Copy the packed send tile into h_sTile.
h_sTile = d_sTile 

    ! Same exchange as in the main program, with the h_ buffers in place of
    ! the d_ buffers.
    call MPI_SENDRECV(h_sTile, mpiTileDimX*mpiTileDimY, & 
    MPI_REAL, sRank, myrank, & 
    h_rTile, mpiTileDimX*mpiTileDimY, MPI_REAL, & 
    rRank, rRank, MPI_COMM_WORLD, status, ierr) 

    ! copy the received tile into the device receive buffer
    d_rTile = h_rTile 

적절한 MVAPICH 버전을 구해야 할 것 같습니다.

도움 주셔서 감사합니다.