`->`의 루프 의존성은 병렬 처리를 막는다.

모델에 대한 데이터를 보유하고 그 데이터에 대해 여러 함수를 실행하는 Model 클래스가 있습니다. 세부 사항은 그리 중요하지 않으며, 다음과 같은 설계를 따른다는 점만 알면 됩니다.

변수는 클래스 네임스페이스에 저장됩니다.
변수는 클래스의 메서드 중 하나에 의해 초기화되고 해제됩니다.
변수는 다른 여러 메서드에서 사용됩니다.

클래스의 MWE(최소 동작 예제)는 다음과 같습니다.

#include <cstdlib> 


/// Grid model that owns a width x height elevation field `h` and a per-cell 
/// receiver array `rec`. `run()` places the data on the accelerator with OpenACC 
/// data directives and repeatedly calls FindDownstream(). 
/// 
/// NOTE(review): the destructor deletes `h` but no copy constructor is declared, 
/// so copying a Model would presumably end up deleting `h` twice - confirm that 
/// instances are never copied. 
class Model { 
private: 
    int width;  //Grid extent in x (number of columns) 
    int height; //Grid extent in y (number of rows) 
    int size;   //Total number of cells, width*height (set in the constructor) 

    //Cells are addressed as c = y*width+x, so the eight offsets below are, in 
    //order (dx,dy): (-1,0) (-1,-1) (0,-1) (+1,-1) (+1,0) (+1,+1) (0,+1) (-1,+1) 
    int nshift[8];  //Offset from a focal cell's index to its neighbours 
    double *restrict h; //Digital elevation model (height) 
    //NOTE(review): FindDownstream() stores the neighbour slot n (0..7, an index 
    //into nshift/dr) or NO_FLOW here, not a cell index. Allocated only inside run(). 
    int *restrict rec; //Index of receiving cell 

    const int NO_FLOW = -1; //Value written to rec[] when no neighbour is lower 
    const double SQRT2 = 1.414213562373095048801688724209698078569671875376948; 
    //Divisor applied to the height difference for each neighbour, in the same 
    //order as nshift: 1 for the edge-adjacent cells, SQRT2 for the diagonal ones 
    const double dr[8] = {1,SQRT2,1,SQRT2,1,SQRT2,1,SQRT2}; 

private: 
    /// Fills every cell of `h` with rand()/RAND_MAX. The generator is not seeded 
    /// here (the srand call below is commented out). 
    void GenerateRandomTerrain(){ 
    //srand(std::random_device()()); 
    for(int y=0;y<height;y++) 
    for(int x=0;x<width;x++){ 
     const int c = y*width+x; 
     h[c] = rand()/(double)RAND_MAX; 
    } 
    } 


public: 
    /// Builds a width0 x height0 model: sets up the neighbour offsets, allocates 
    /// `h` and fills it with random heights. `rec` is not allocated here. 
    Model(const int width0, const int height0) 
    : nshift{-1,-width0-1,-width0,-width0+1,1,width0+1,width0,width0-1} 
    { 
    width = width0; 
    height = height0; 
    size = width*height; 

    h  = new double[size]; 

    GenerateRandomTerrain(); 
    } 

    /// Releases `h`. `rec` is allocated and released inside run(). 
    ~Model(){ 
    delete[] h; 
    } 

private: 
    /// For every cell with 2 <= x < width-2 and 2 <= y < height-2, finds the 
    /// neighbour slot with the largest positive slope and stores it in rec[c], 
    /// or NO_FLOW if no slope is positive. Cells outside that range are not 
    /// written by this function. The present() clause expects the listed data 
    /// to already be on the device (run() issues the enter data directive). 
    void FindDownstream(){ 
    //! computing receiver array 
    //The loop bounds below read the class members height and width directly. 
    #pragma acc parallel loop collapse(2) independent present(h,rec,width,height) 
    for(int y=2;y<height-2;y++) 
    for(int x=2;x<width-2;x++){ 
     const int c  = y*width+x; 

     //The slope must be greater than zero for there to be downhill flow; 
     //otherwise, the cell is marked NO_FLOW 
     double max_slope = 0; 
     int max_n  = NO_FLOW; 

     //Strict '>' means the first neighbour reaching the maximum slope is kept. 
     #pragma acc loop seq 
     for(int n=0;n<8;n++){ 
     double slope = (h[c] - h[c+nshift[n]])/dr[n]; 
     if(slope>max_slope){ 
      max_slope = slope; 
      max_n  = n; 
     } 
     } 
     rec[c] = max_n; 
    }  
    } 

public: 
    /// Allocates `rec`, opens an OpenACC data region, calls FindDownstream() 
    /// for step = 0..nstep inclusive (nstep+1 calls), then closes the data 
    /// region and frees `rec`. 
    /// `rec` is created on the device and removed with delete(), so its 
    /// contents are not copied back to the host before the host array is freed. 
    void run(const int nstep){ 
    rec = new int[size]; 

    #pragma acc enter data copyin(h[0:size],nshift[0:8],height,width,this) create(rec[0:size]) 

    for(int step=0;step<=nstep;step++) 
     FindDownstream(); 

    //NOTE(review): nshift, height and width are named in the enter data 
    //directive above but not in this exit data directive - confirm intended. 
    #pragma acc exit data copyout(h[0:size]) delete(this,rec) 

    delete[] rec; 
    } 

}; 

int main(int argc, char **argv){ 
    Model model(300,300); 
    model.run(100); 

    return 0; 
}

다음과 같이 컴파일합니다:

pgc++ -acc -ta=tesla,pinned,cc60 -Minfo=accel -fast test.cpp -std=c++11

그러면 다음과 같은 경고가 나옵니다:

51, Loop without integer trip count will be executed in sequential mode 
    Complex loop carried dependence of rec->,nshift prevents parallelization 
    Loop carried dependence of rec-> prevents parallelization 
    Loop carried backward dependence of rec-> prevents vectorization

인터넷을 조금 찾아보니, 이 경고의 전형적인 원인은 포인터 앨리어싱(pointer aliasing)이 의존성을 일으킬 가능성이라고 합니다.

위 코드에서 보듯이 `*restrict`와 `independent`를 사용해 컴파일러에게 문제가 없다고 알려 주었지만, 컴파일러는 이를 무시하고 루프를 병렬화하지 않습니다.

`restrict`를 적절히 사용해 포인터를 함수 인수로 전달하면 이 경고는 사라지지만, 저는 미적인 이유로 그런 설계를 선호하지 않습니다. 또는 사실상 각각이 커널인 모든 메서드를 `run()` 함수 하나로 묶을 수도 있지만, 이 역시 바람직하지 않습니다.

내부 루프에 `independent`를 사용하면 다음과 같은 경고가 나옵니다:

PGCC-W-0155-내부 루프: tile/collapse 루프 중첩 안에 다른 loop 지시문이 있음 (actual_code.cpp: 227)

하지만 루프는 병렬화된 것처럼 보입니다.

PGI 17.9로 컴파일 중입니다.

출처

2017-12-21 Richard

여기서 문제는 `height`와 `width`가 클래스 데이터 멤버라는 점입니다. 따라서 컴파일러는 이 변수들에 대한 외부 참조가 있을 수 있고, 루프가 실행되는 동안 값이 바뀔 수 있다고 가정해야 합니다.

해결 방법은 값을 지역 변수에 복사한 다음, 그 지역 변수를 루프 경계로 사용하는 것입니다.

외부 루프에 `collapse(2)`를 지정했으므로 `independent` 절은 이미 두 루프 모두에 적용됩니다. (다만 `independent`는 `parallel` 계산 영역의 기본값이므로 명시할 필요가 없습니다.) 여러 루프를 collapse할 때는 그 안에 두 번째 `loop` 구문을 둘 수 없습니다.

% cat test.cpp 
#include <cstdlib> 


/// Grid model that owns a width x height elevation field `h` and a per-cell 
/// receiver array `rec`. `run()` places the data on the accelerator with OpenACC 
/// data directives and repeatedly calls FindDownstream(). 
/// 
/// NOTE(review): the destructor deletes `h` but no copy constructor is declared, 
/// so copying a Model would presumably end up deleting `h` twice - confirm that 
/// instances are never copied. 
class Model { 
private: 
    int width;  //Grid extent in x (number of columns) 
    int height; //Grid extent in y (number of rows) 
    int size;   //Total number of cells, width*height (set in the constructor) 

    //Cells are addressed as c = y*width+x, so the eight offsets below are, in 
    //order (dx,dy): (-1,0) (-1,-1) (0,-1) (+1,-1) (+1,0) (+1,+1) (0,+1) (-1,+1) 
    int nshift[8];  //Offset from a focal cell's index to its neighbours 
    double *restrict h; //Digital elevation model (height) 
    //NOTE(review): FindDownstream() stores the neighbour slot n (0..7, an index 
    //into nshift/dr) or NO_FLOW here, not a cell index. Allocated only inside run(). 
    int *restrict rec; //Index of receiving cell 

    const int NO_FLOW = -1; //Value written to rec[] when no neighbour is lower 
    const double SQRT2 = 1.414213562373095048801688724209698078569671875376948; 
    //Divisor applied to the height difference for each neighbour, in the same 
    //order as nshift: 1 for the edge-adjacent cells, SQRT2 for the diagonal ones 
    const double dr[8] = {1,SQRT2,1,SQRT2,1,SQRT2,1,SQRT2}; 

private: 
    /// Fills every cell of `h` with rand()/RAND_MAX. The generator is not seeded 
    /// here (the srand call below is commented out). 
    void GenerateRandomTerrain(){ 
    //srand(std::random_device()()); 
    for(int y=0;y<height;y++) 
    for(int x=0;x<width;x++){ 
     const int c = y*width+x; 
     h[c] = rand()/(double)RAND_MAX; 
    } 
    } 


public: 
    /// Builds a width0 x height0 model: sets up the neighbour offsets, allocates 
    /// `h` and fills it with random heights. `rec` is not allocated here. 
    Model(const int width0, const int height0) : nshift{-1,-width0-1,-width0,-width0+1,1,width0+1,width0,width0-1} 
    { 
    width = width0; 
    height = height0; 
    size = width*height; 

    h  = new double[size]; 

    GenerateRandomTerrain(); 
    } 

    /// Releases `h`. `rec` is allocated and released inside run(). 
    ~Model(){ 
    delete[] h; 
    } 

private: 
    /// For every cell with 2 <= x < width-2 and 2 <= y < height-2, finds the 
    /// neighbour slot with the largest positive slope and stores it in rec[c], 
    /// or NO_FLOW if no slope is positive. Cells outside that range are not 
    /// written by this function. The present() clause expects `h` and `rec` 
    /// to already be on the device (run() issues the enter data directive). 
    void FindDownstream(){ 
    //! computing receiver array 
    //Copy the bounds into locals: as class data members, height and width 
    //could have external references, so the compiler has to assume they may 
    //change while the loops run; local copies give it fixed loop bounds. 
    int hgt = height; 
    int wdt = width; 
    #pragma acc parallel loop collapse(2) present(h,rec) 
    for(int y=2;y<hgt-2;y++) 
    for(int x=2;x<wdt-2;x++){ 
     const int c  = y*wdt+x; 

     //The slope must be greater than zero for there to be downhill flow; 
     //otherwise, the cell is marked NO_FLOW 
     double max_slope = 0; 
     int max_n  = NO_FLOW; 

     //Strict '>' means the first neighbour reaching the maximum slope is kept. 
     #pragma acc loop seq 
     for(int n=0;n<8;n++){ 
     double slope = (h[c] - h[c+nshift[n]])/dr[n]; 
     if(slope>max_slope){ 
      max_slope = slope; 
      max_n  = n; 
     } 
     } 
     rec[c] = max_n; 
    } 
    } 

public: 
    /// Allocates `rec`, opens an OpenACC data region, calls FindDownstream() 
    /// for step = 0..nstep inclusive (nstep+1 calls), then closes the data 
    /// region and frees `rec`. 
    /// `rec` is created on the device and removed with delete(), so its 
    /// contents are not copied back to the host before the host array is freed. 
    void run(const int nstep){ 
    rec = new int[size]; 

    #pragma acc enter data copyin(this,h[0:size],nshift[0:8]) create(rec[0:size]) 

    for(int step=0;step<=nstep;step++) 
     FindDownstream(); 

    //Only h is copied back to the host; rec, nshift and this are just removed. 
    #pragma acc exit data copyout(h[0:size]) delete(rec,nshift,this) 

    delete[] rec; 
    } 

}; 

int main(int argc, char **argv){ 
    Model model(300,300); 
    model.run(100); 

    return 0; 
} 
% pgc++ test.cpp -w --c++11 -Minfo=accel -ta=tesla:cc60 -V17.10; a.out 
Model::FindDownstream(): 
    49, Generating present(h[:]) 
     Accelerator kernel generated 
     Generating Tesla code 
     51, #pragma acc loop gang, vector(128) collapse(2) /* blockIdx.x threadIdx.x */ 
     52, /* blockIdx.x threadIdx.x collapsed */ 
     61, #pragma acc loop seq 
    49, Generating implicit copy(this[:]) 
     Generating present(rec[:]) 
    61, Loop carried scalar dependence for max_slope at line 63 
Model::run(int): 
    74, Generating enter data copyin(nshift[:],h[:size]) 
     Generating enter data create(rec[:size]) 
     Generating enter data copyin(this[:1]) 
    83, Generating exit data delete(this[:1],rec[:1]) 
     Generating exit data copyout(h[:size]) 
     Generating exit data delete(nshift[:])

출처

2017-12-26 15:38:34

감사합니다, 매트! 컴파일러 출력만으로는 그 점을 알아채지 못했을 것입니다. `rec->,nshift`에 대한 언급 때문에 그것이 고쳐야 할 문제처럼 보였습니다. 컴파일러가 `height`와 `width`를 volatile일 수 있다고 판단했다는 사실을 알려 주면 좋겠습니다. – Richard

핵심 메시지는 루프의 반복 횟수를 셀 수 없다는 것이며, 이는 `height`와 `width`가 volatile일 수 있기 때문입니다. `rec->` 의존성 메시지는 계산된 인덱스를 사용하기 때문에 발생합니다. `parallel loop`를 사용하고 있으므로 컴파일러는 이 의존성을 무시합니다. –

`->`의 루프 의존성은 병렬 처리를 막는다.

답변

관련 문제