C++ 11 싱글 쓰레드보다 느린 멀티 쓰레드

-3

저는 멀티태스킹에 관해서는 완전한 초보자입니다. 몇 가지 기본 사항을 읽은 뒤, 이를 객체 시각화를 위한 제 프로젝트에 적용해 보려고 합니다. 문제는 제가 구현한 멀티 스레드 솔루션이 단일 스레드 솔루션보다 느린데 그 이유를 알 수 없다는 점, 그리고 알 수 없는 이유로 예상치 못한 애플리케이션 종료 코드가 반환된다는 점입니다. 더 나은 성능을 얻으려고 시도한 두 가지 사례를 제시하겠습니다. 제가 무엇을 이해하지 못하고 있는지, 그리고 전반적으로 어디에서 실수했는지 알고 싶습니다. 소스 코드의 일부를 제공하고, 마지막에 모든 질문을 요약하겠습니다.

다음은 제 스레드 팩토리 구현입니다 (매우 기본적이지만, 이제 시작 단계일 뿐입니다):

threadfactory.h

#pragma once 

#include <vector> 
#include "ThreadInterface.h" 
#include "../MemoryManagement/MemoryMgr.h" 
#include "../Logging/LoggingDefines.h" 

/**
 * Thread factory: spreads wave-update and vertex-transform work over a
 * configurable number of threads.
 *
 * The multi-threaded entry points start (muiNumberofThreads - 1) worker threads,
 * run the last part of the work on the calling thread and join the workers
 * before returning.
 */
class CThreadFactory : public CThreadIntearface 
{ 
    public: 
     /** Creates a factory that uses one thread. */
     CThreadFactory(); 
     /** Creates a factory and forwards max_threads to Init(). */
     CThreadFactory(BYTE max_threads); 
     /** Calls Clear(). */
     ~CThreadFactory(); 

     /** Stores max_threads as the number of threads to use. */
     void Init(BYTE max_threads); 
     /** Currently has an empty body. */
     void Clear(void); 

     // Wave updates: the row range of the wave grid is divided between the threads.
     virtual void UpdateWavesInternalPoints(CWaves& waves); 
     virtual void UpdateWavesNormals(CWaves& waves); 

     // Vertex update: resizes output to input.size() and writes the transformed
     // position of every input vertex. Runs on the calling thread only when one
     // thread is configured or when there are fewer than 1000 input vertices.
     virtual void TransformVertices(const CObject& object, const vector<TVertex>& input, vector<XMFLOAT3>& output, const CXNAMatrix& matrix); 

     /** Name passed to DEFINE_HEAP for this class. */
     static const char* GetHeapName(void) { return "Thread factory"; } 
#if (defined(DEBUG) | defined(_DEBUG)) 
     /** 
     * Return class name. This function is compiled only in debug mode. 
     * \return class name 
     */ 
     NAME_FUNC(); 
#endif 

    private: 
     /** Joins every joinable thread in the container. */
     void Join(vector<std::thread>& threads); 
     /** Empties the container. */
     void ReleaseThreads(vector<std::thread>& threads); 

    private: 
     // Number of threads the work is divided between (calling thread included).
     UINT muiNumberofThreads; 

    private: 
     // NOTE(review): per the author's notes this declares overloaded operator new/delete - confirm.
     DECLARE_HEAP; 
};

threadfactory.cpp

#include "ThreadFactory.h" 

/**
 * Default constructor: the factory starts out configured for a single thread.
 * The start and end of the construction are traced at debug level.
 */
CThreadFactory::CThreadFactory()
    : muiNumberofThreads(1)
{
    // GetName() is used only inside TRACE because it exists only in debug builds.
    TRACE(LOG_DEBUG, string("Start of initialization of object \"").append(GetName()).append("\""));
    TRACE(LOG_DEBUG, string("End of initialization of object \"").append(GetName()).append("\""));
}

/**
 * Constructs the factory with an explicit thread count.
 * The start and end of the construction are traced at debug level.
 *
 * \param max_threads thread count handed over to Init()
 */
CThreadFactory::CThreadFactory(BYTE max_threads)
{
    // GetName() is used only inside TRACE because it exists only in debug builds.
    TRACE(LOG_DEBUG, string("Start of initialization of object \"").append(GetName()).append("\""));

    Init(max_threads);

    TRACE(LOG_DEBUG, string("End of initialization of object \"").append(GetName()).append("\""));
}

/**
 * Destructor: runs Clear() and traces the start and end of the release at debug level.
 */
CThreadFactory::~CThreadFactory()
{
    // GetName() is used only inside TRACE because it exists only in debug builds.
    TRACE(LOG_DEBUG, string("Start of releasing of object \"").append(GetName()).append("\""));

    Clear();

    TRACE(LOG_DEBUG, string("End of releasing of object \"").append(GetName()).append("\""));
}

/**
 * Sets the number of threads the factory divides its work between.
 *
 * \param max_threads new thread count; the update functions treat 0 and 1 as
 *                    "run everything on the calling thread"
 */
void CThreadFactory::Init(BYTE max_threads)
{
    this->muiNumberofThreads = static_cast<UINT>(max_threads);
}

/**
 * Release hook called from the destructor.
 * Intentionally empty at the moment: the threads are created, joined and
 * released inside each update call, so there is nothing to free here.
 */
void CThreadFactory::Clear(void) 
{ 

} 

/**
 * Waits for all worker threads in the container to finish.
 * Entries that are not joinable (default-constructed or already joined) are skipped.
 *
 * \param threads worker threads to wait for
 */
void CThreadFactory::Join(vector<std::thread>& threads)
{
    for (vector<std::thread>::iterator worker = threads.begin(); worker != threads.end(); ++worker)
    {
        if (!worker->joinable())
            continue;

        worker->join();
    }
}

/**
 * Removes every thread object from the container.
 * The threads must already have been joined: destroying a joinable std::thread
 * terminates the program.
 *
 * \param threads container to empty
 */
void CThreadFactory::ReleaseThreads(vector<std::thread>& threads)
{
    threads.erase(threads.begin(), threads.end());
}

/**
 * Updates the interior points of the wave grid, dividing the rows between the
 * configured threads.
 *
 * With 0 or 1 configured threads the whole interior range [1, RowCount() - 1) is
 * processed on the calling thread. Otherwise (muiNumberofThreads - 1) worker
 * threads each process a band of RowCount()/muiNumberofThreads rows, and the
 * calling thread processes everything from the end of the last worker band up
 * to RowCount() - 1. All workers are joined before the function returns.
 *
 * \param waves wave object to update; it must stay alive and must not be
 *              modified elsewhere until this function returns
 */
void CThreadFactory::UpdateWavesInternalPoints(CWaves& waves)
{
    // Exclusive upper bound of the interior rows (the last row is a boundary row).
    const DWORD dwLastRow = waves.RowCount() - 1;

    if (muiNumberofThreads <= 1)
    {
        waves.UpdateWaveInteriorPoints(1, dwLastRow);
        return;
    }

    const UINT uiWorkerCount = muiNumberofThreads - 1;
    const UINT uiRowsPerThread = waves.RowCount()/muiNumberofThreads;

    vector<std::thread> threads(uiWorkerCount);

    DWORD dwMinRow = 1;
    for (UINT i = 0; i < uiWorkerCount; i++)
    {
        // The callee clamps its upper bound to RowCount() - 1, so a band that
        // reaches past the interior is harmless.
        const DWORD dwMaxRow = dwMinRow + uiRowsPerThread;
        threads[i] = std::thread{ &CWaves::UpdateWaveInteriorPoints, &waves, dwMinRow, dwMaxRow };

        dwMinRow = dwMaxRow;
    }

    // The calling thread takes the last band. It runs to the end of the interior
    // so that the rows left over by the integer division (and the whole grid when
    // RowCount() < muiNumberofThreads) are not silently skipped.
    waves.UpdateWaveInteriorPoints(dwMinRow, dwLastRow);

    Join(threads);
    ReleaseThreads(threads);
}

/**
 * Recomputes the normals (and tangents) of the interior wave vertices, dividing
 * the rows between the configured threads.
 *
 * With 0 or 1 configured threads the whole interior range [1, RowCount() - 1) is
 * processed on the calling thread. Otherwise (muiNumberofThreads - 1) worker
 * threads each process a band of RowCount()/muiNumberofThreads rows, and the
 * calling thread processes everything from the end of the last worker band up
 * to RowCount() - 1. All workers are joined before the function returns.
 *
 * \param waves wave object to update; it must stay alive and must not be
 *              modified elsewhere until this function returns
 */
void CThreadFactory::UpdateWavesNormals(CWaves& waves)
{
    // Exclusive upper bound of the interior rows (the last row is a boundary row).
    const DWORD dwLastRow = waves.RowCount() - 1;

    if (muiNumberofThreads <= 1)
    {
        waves.UpdateWaveNormals(1, dwLastRow);
        return;
    }

    const UINT uiWorkerCount = muiNumberofThreads - 1;
    const UINT uiRowsPerThread = waves.RowCount()/muiNumberofThreads;

    vector<std::thread> threads(uiWorkerCount);

    DWORD dwMinRow = 1;
    for (UINT i = 0; i < uiWorkerCount; i++)
    {
        // The callee clamps its upper bound to RowCount() - 1, so a band that
        // reaches past the interior is harmless.
        const DWORD dwMaxRow = dwMinRow + uiRowsPerThread;
        threads[i] = std::thread{ &CWaves::UpdateWaveNormals, &waves, dwMinRow, dwMaxRow };

        dwMinRow = dwMaxRow;
    }

    // The calling thread takes the last band. It runs to the end of the interior
    // so that the rows left over by the integer division (and the whole grid when
    // RowCount() < muiNumberofThreads) are not silently skipped.
    waves.UpdateWaveNormals(dwMinRow, dwLastRow);

    Join(threads);
    ReleaseThreads(threads);
}

void CThreadFactory::TransformVertices(const CObject& object, const vector<TVertex>& input, vector<XMFLOAT3>& output, const CXNAMatrix& matrix) 
{ 
    if (output.size() != input.size()) 
     output.resize(input.size()); 

    if ((muiNumberofThreads <= 1) || (input.size() < 1000)) 
    { 
     object.TransformVerticesSet(input.begin(), output.begin(), input.size() - 1, matrix); 
    } 
    else 
    { 
     vector<std::thread> threads(muiNumberofThreads - 1); 
     UINT uiThreadVertexCount = input.size()/muiNumberofThreads; 
     UINT uiStartVertexIndex = 0; 

     for (UINT i = 0; i < muiNumberofThreads - 1; i++) 
     { 
      if (uiStartVertexIndex >= input.size()) 
       uiStartVertexIndex = input.size() - 1; 

      threads[i] = move(std::thread{ &CObject::TransformVerticesSet, &object, input.begin() + uiStartVertexIndex, output.begin() + uiStartVertexIndex, uiThreadVertexCount - 1, matrix }); 

      uiStartVertexIndex += uiThreadVertexCount; 
     } 

     object.TransformVerticesSet(input.begin() + uiStartVertexIndex, output.begin() + uiStartVertexIndex, uiThreadVertexCount - 1, matrix); 

     Join(threads); 
     ReleaseThreads(threads); 
    } 
} 

// Debug builds only: counterpart of NAME_FUNC() in the class declaration.
// NOTE(review): per the author's notes NAME_BODY defines the method that returns
// the class name ("Threads" here) - confirm against the macro definition.
#if (defined(DEBUG) | defined(_DEBUG)) 

NAME_BODY(CThreadFactory, "Threads"); 

#endif 

// Counterpart of DECLARE_HEAP in the class declaration, registered under GetHeapName().
// NOTE(review): per the author's notes this defines the overloaded operator new/delete - confirm.
DEFINE_HEAP(CThreadFactory, GetHeapName());

1. 웨이브 업데이트 :

Wave라는 객체를 사용하고 있습니다. 이 객체는 기본적으로 약 40,000개의 정점을 가집니다. 매 프레임마다 이 객체를 갱신하기 위해 다음 함수들을 사용합니다:

문제가 CWaves에 있다고 생각하실 경우를 대비해 해당 코드도 첨부합니다:

/**
 * Advances the wave simulation for the interior points of rows [min_row, max_row).
 *
 * The new height of each point is written into the previous-solution buffer,
 * computed from the old value in that buffer and from the current solution
 * (the point itself and its four direct neighbours), weighted by K1, K2 and K3.
 * The first and last row and column are never written.
 *
 * \param min_row first row to update; values below 1 are raised to 1
 * \param max_row one past the last row to update; values above RowCount() - 1
 *                are lowered to RowCount() - 1
 */
void CWaves::UpdateWaveInteriorPoints(DWORD min_row, DWORD max_row)
{
    // Keep the range inside the interior of the grid.
    const DWORD first_row = (min_row < 1) ? 1 : min_row;
    const DWORD end_row = (max_row > (RowCount() - 1)) ? (RowCount() - 1) : max_row;

    const auto columns = ColumnCount();
    const auto k1 = GetK1();
    const auto k2 = GetK2();
    const auto k3 = GetK3();

    auto& previous = GetPrevSolutionVertices();
    auto& current = mpObjectMesh->mVertices;

    for (DWORD row = first_row; row < end_row; ++row)
    {
        // Offsets of the first vertex of the row above, this row and the row below.
        const auto above = (row - 1)*columns;
        const auto here = row*columns;
        const auto below = (row + 1)*columns;

        for (DWORD col = 1; col < columns - 1; ++col)
        {
            // The previous buffer is discarded after this step, so it can be
            // overwritten in place: the old value is read before it is assigned.
            auto& height = previous[here + col].Position.y;

            height =
                k1*height +
                k2*current[here + col].Position.y +
                k3*(current[below + col].Position.y +
                current[above + col].Position.y +
                current[here + col + 1].Position.y +
                current[here + col - 1].Position.y);
        }
    }
}

/**
 * Recomputes Normal and TangentU for the interior vertices of rows [min_row, max_row).
 *
 * Both vectors are built from the height differences of the left/right and
 * top/bottom neighbours together with the spatial step, and are then normalized.
 * The first and last row and column are never written.
 *
 * \param min_row first row to update; values below 1 are raised to 1
 * \param max_row one past the last row to update; values above RowCount() - 1
 *                are lowered to RowCount() - 1
 */
void CWaves::UpdateWaveNormals(DWORD min_row, DWORD max_row)
{
    // Keep the range inside the interior of the grid.
    const DWORD first_row = (min_row < 1) ? 1 : min_row;
    const DWORD end_row = (max_row > (RowCount() - 1)) ? (RowCount() - 1) : max_row;

    const auto columns = ColumnCount();
    auto& vertices = mpObjectMesh->mVertices;

    for (UINT row = first_row; row < end_row; ++row)
    {
        for (UINT col = 1; col < columns - 1; ++col)
        {
            // Heights of the four direct neighbours.
            const float left = vertices[row*columns + col - 1].Position.y;
            const float right = vertices[row*columns + col + 1].Position.y;
            const float top = vertices[(row - 1)*columns + col].Position.y;
            const float bottom = vertices[(row + 1)*columns + col].Position.y;

            auto& vertex = vertices[row*columns + col];

            vertex.Normal.x = -right + left;
            vertex.Normal.y = 2.0f*GetSpatialStep();
            vertex.Normal.z = bottom - top;
            XMStoreFloat3(&vertex.Normal, XMVector3Normalize(XMLoadFloat3(&vertex.Normal)));

            vertex.TangentU = XMFLOAT3(2.0f*GetSpatialStep(), right - left, 0.0f);
            XMStoreFloat3(&vertex.TangentU, XMVector3Normalize(XMLoadFloat3(&vertex.TangentU)));
        }
    }
}

void CWaves::UpdateWave(float dt) 
{ 
    static float t_base = 0.0f; 
    if ((g_Timer->TotalTime() - t_base) >= 0.25f) 
    { 
     t_base += 0.25f; 

     DWORD i, j; 

     do 
     { 
      i = 5 + rand() % (RowCount() - 5); 
      j = 5 + rand() % (ColumnCount() - 5); 
     } while (!((i > 1) && (i < (RowCount() - 2)) && 
      (j > 1) && (j < (ColumnCount() - 2)))); 

     float r = MathHelper::RandF(1.0f, 2.0f); 

     Disturb(i, j, r); 
    } 

    static float t = 0; 

    // Accumulate time. 
    t += dt; 

    // Only update the simulation at the specified time step. 
    if (t >= TimeStep()) 
    { 
     // Only update interior points; we use zero boundary conditions. 
     if (g_ThreadFactory) 
     { 
      g_ThreadFactory->UpdateWavesInternalPoints(*this); 
     } 
     else 
     { 
      UpdateWaveInteriorPoints(1, RowCount() - 1); 
     } 

     // We just overwrote the previous buffer with the new data, so 
     // this data needs to become the current solution and the old 
     // current solution becomes the new previous solution. 
     std::swap(GetPrevSolutionVertices(), mpObjectMesh->mVertices); 

     t = 0.0f; // reset time 

     if (mShapeDescription.mShapeProperties.bLightedObject) 
     { 
      // 
      // Compute normals using finite difference scheme. 
      // 
      if (g_ThreadFactory) 
      { 
       g_ThreadFactory->UpdateWavesNormals(*this); 
      } 
      else 
      { 
       UpdateWaveNormals(1, RowCount() - 1); 
      } 
     } 
    } 
}

제 생각에는 모든 스레드에 객체 전체를 넘겨주고 있어서 지속적인 잠금이 발생하는 것이 원인인 것 같습니다. 그래서 주어진 변환 행렬로 정점을 변환하는 또 다른 경우에서는 접근 방식을 바꿔 보았습니다. 전체 객체 대신 컨테이너 반복자를 사용합니다. 즉, 정점 벡터 전체를 넘기는 대신 반복자를 넘기도록 했습니다:

/**
 * Transforms the positions of a run of vertices and stores the results.
 *
 * \param input              iterator to the first source vertex
 * \param output             iterator to the first destination element
 * \param number_of_vertices index of the LAST vertex to process (inclusive), so
 *                           number_of_vertices + 1 vertices are transformed
 * \param matrix             transformation matrix passed to TransformPoint()
 */
void CObject::TransformVerticesSet(vector<TVertex>::const_iterator input, vector<XMFLOAT3>::iterator output, UINT number_of_vertices, const CXNAMatrix& matrix) const
{
    // Walk both ranges in lock step; the bound is inclusive by contract.
    for (UINT processed = 0; processed <= number_of_vertices; ++processed, ++input, ++output)
    {
        CMatrixTransformations::TransformPoint(input->Position, matrix, *output);
    }
}

,하지만 결과는 다음과 같습니다 스레드 공장에서 호출

2. 정점 변환

정점 변환 방법은 위에 표시된 이전 솔루션과 동일합니다. 단일 스레드 솔루션보다 느립니다.

이전 코드에서 편집 < 2014년 8월 12일>

나는 다음과 같은 매크로를 사용 :

TRACE - 로깅 시스템에 사용되며, 릴리스 모드에서는 비어 있습니다

NAME_FUNC, NAME_BODY - 클래스 이름을 반환하는 클래스 메서드의 선언 및 정의를 위한 매크로

DECLARE_HEAP, DEFINE_HEAP - 오버로드 된 새 연산자 및 삭제 연산자에 대한 선언 및 정의 만들기

이 것은 다중 스레딩 작업의 성능에 영향을 미치지 않아야합니다. 여기

다음은 애플리케이션을 닫은 후 VS 2013에 표시된 출력입니다 (이 경우에는 앞에서 설명한 멀티 스레딩을 사용하지 않았습니다):

타사 API를 (아마도 DX) 인 것 같다

The thread 0x229c has exited with code 27 (0x1b). 
The thread 0x22dc has exited with code 27 (0x1b). 
The thread 0x11ac has exited with code 27 (0x1b). 
The thread 0x328c has exited with code 27 (0x1b). 
The thread 0x205c has exited with code 27 (0x1b). 
The thread 0xf4c has exited with code 27 (0x1b). 
The thread 0x894 has exited with code 27 (0x1b). 
The thread 0x3094 has exited with code 27 (0x1b). 
The thread 0x2eb4 has exited with code 27 (0x1b). 
The thread 0x2ef8 has exited with code 27 (0x1b). 
The thread 0x22f4 has exited with code 27 (0x1b). 
The thread 0x2810 has exited with code 27 (0x1b). 
The thread 0x29e0 has exited with code 27 (0x1b). 
The thread 0x2e54 has exited with code 27 (0x1b). 
D3D11 WARNING: Process is terminating. Using simple reporting. Please call ReportLiveObjects() at runtime for standard reporting. [ STATE_CREATION WARNING #0: UNKNOWN] 
D3D11 WARNING: Live Producer at 0x012F05A0, Refcount: 8. [ STATE_CREATION WARNING #0: UNKNOWN] 
D3D11 WARNING: Live Object at 0x012F1D38, Refcount: 0. [ STATE_CREATION WARNING #0: UNKNOWN] 
D3D11 WARNING: Live Object at 0x013BA3F8, Refcount: 0. [ STATE_CREATION WARNING #0: UNKNOWN] 

The program '[13272] EngineDX.exe' has exited with code 27 (0x1b).

스레드를 생성하지만 프로세스 관리자에서 한 스레드 만 사용합니다. 그게 문제가 될 수 있습니다 ...

그래서 여기 내 질문이 있습니다 :

제 스레드 팩토리 구현이 그렇게 잘못된 것입니까, 아니면 40,000개 정점의 갱신은 여러 스레드로 분할할 필요가 없는 것입니까?
내가 잠긴 경우 왜 그런지 알고 싶습니다. 버텍스 변환을위한 솔루션은 반복자와 꼭지점을 사용하므로 벡터 컨테이너가 분리되어 있으므로 잠금을해서는 안됩니다.
하나의 이유로 각 함수 호출에 대한 스레드를 작성하기로 결정했습니다. 처음에는 스레드 벡터 컨테이너를 스레드 팩토리의 멤버 클래스로 사용했습니다. 그러나 이로 인해 디버그 모드에서 메모리 누수가 발생합니다 (릴리스 모드에는이 문제가 없었 음). 아무것도하지 않고 순수한 선언. 나는 이유를 결코 알지 못했다. 스레드를 올바르게 해제하는 데 필요한 다른 것이 있습니까?
현재 모든 스레드가 이 오류 코드를 반환하기 때문에 애플리케이션이 코드 27로 종료됩니다. 이것은 무엇을 의미합니까?
디버그 모드에서 8개의 스레드(8스레드 CPU에서 작업 스레드 7개 + 메인 스레드)를 사용하면 8개의 스레드가 모두 무언가를 하고 있는 것을 볼 수 있습니다. 그러나 릴리스 모드에서는 하나의 스레드(메인 스레드)만 사용할 때와 비교해 아무런 변화가 없습니다. 이 동작은 잘못된 것입니까, 아니면 어떤 이유로 예상되는 동작입니까?

죄송합니다. 긴 텍스트입니다. 오해를 피하기 위해 더 자세히 설명하고 싶습니다. 답변 주셔서 감사합니다.

편집 2014년 12월 17일 :

스레드가 사용하는 함수를 (Wave 클래스와 무관하게) 다시 구현해서, 이제 공유되는 객체 참조나 변수가 더 이상 없습니다. 하지만 여전히 효과가 없고, 그 이유를 이해할 수 없습니다. 흥미로운 점은 8개의 스레드를 사용할 때 디버그 실행 파일에서는 Core i7이 100%로 동작하는 것을 볼 수 있지만 프레임 속도에는 아무런 이득이 없다는 것입니다. 릴리스 실행 파일에서는 4개의 스레드만 실행되고 CPU는 25%만 사용됩니다.

새로운 멀티 스레드 기능 :

/**
 * Free-function variant of the wave interior update that does not touch the
 * wave object: everything it needs is passed in.
 *
 * For every interior column of rows [min_row, max_row) the new height is written
 * into the previous vertex field, computed from the old value in that field and
 * from the actual vertex field (the point itself and its four direct
 * neighbours), weighted by k1, k2 and k3.
 *
 * \param previous_vertex_field iterator to the first vertex of the previous solution (written)
 * \param actual_vertex_field   iterator to the first vertex of the current solution (read)
 * \param min_row               first row to update; values below 1 are raised to 1
 * \param max_row               one past the last row to update; NOT clamped here, so
 *                              the caller must keep row max_row inside the field
 * \param k1                    weight of the previous height
 * \param k2                    weight of the current height
 * \param k3                    weight of the sum of the four neighbour heights
 * \param column_count          number of vertices per row
 */
void UpdateWaveInteriorPoints(TVertexFieldIterator previous_vertex_field, TVertexFieldIterator actual_vertex_field, DWORD min_row, DWORD max_row, float k1, float k2, float k3, UINT column_count)
{
    const DWORD first_row = (min_row < 1) ? 1 : min_row;

    for (DWORD row = first_row; row < max_row; ++row)
    {
        // Offsets of the first vertex of the row above, this row and the row below.
        const auto above = (row - 1)*column_count;
        const auto here = row*column_count;
        const auto below = (row + 1)*column_count;

        for (DWORD col = 1; col < column_count - 1; ++col)
        {
            // The previous field is discarded after this step, so it can be
            // overwritten in place: the old value is read before it is assigned.
            auto& height = previous_vertex_field[here + col].Position.y;

            height =
                k1*height +
                k2*actual_vertex_field[here + col].Position.y +
                k3*(actual_vertex_field[below + col].Position.y +
                actual_vertex_field[above + col].Position.y +
                actual_vertex_field[here + col + 1].Position.y +
                actual_vertex_field[here + col - 1].Position.y);
        }
    }
}

기능 스레드 생성 :

TVertexFieldIterator tActualVertexIterator = waves.mpObjectMesh->mVertices.begin(); 
     TVertexFieldIterator tPreviousVertexIterator = waves.GetPrevSolutionVertices().begin(); 
     std::vector<std::thread> threads; 
     //std::vector<std::future<void>> threads; 
     UINT dwWavePartDifference = waves.RowCount()/muiNumberofThreads; 

     DWORD dwMinRow = 1, dwMaxRow = dwWavePartDifference; 
     DWORD dwVertexCount = dwWavePartDifference*waves.ColumnCount(); 

     for (UINT i = 0; i < muiNumberofThreads - 1; i++) 
     { 
      //threads.emplace_back(std::async(std::launch::async, &CWaves::UpdateWaveInteriorPoints, &waves, tPreviousVertexIterator, tActualVertexIterator, dwMinRow, dwMaxRow, waves.GetK1(), waves.GetK2(), waves.GetK3(), waves.ColumnCount())); 
      threads.emplace_back(std::thread(&UpdateWaveInteriorPoints, tPreviousVertexIterator, tActualVertexIterator, dwMinRow, dwMaxRow, waves.GetK1(), waves.GetK2(), waves.GetK3(), waves.ColumnCount())); 

      tActualVertexIterator += dwVertexCount; 
      tPreviousVertexIterator += dwVertexCount; 
     } 

     tPreviousVertexIterator -= waves.ColumnCount(); //row - 1 
     tActualVertexIterator -= waves.ColumnCount(); //row - 1 
     waves.UpdateWaveInteriorPoints(tPreviousVertexIterator, tActualVertexIterator, dwMinRow, dwMaxRow, waves.GetK1(), waves.GetK2(), waves.GetK3(), waves.ColumnCount()); 

     for (UINT i = 0; i < muiNumberofThreads -1; i++) 
     { 
      //threads[i].wait(); 
      threads[i].join(); 
     }

마렉을

출처

2014-12-07 mareknr

: 다음은 그 중 몇 가지에 대한 링크입니다. 스레딩과 실제 작업은 직교합니다 (또는 가능한 한 많이해야 함). – didierc

다른 문제 : 질문의 제목이 너무 일반적입니다. 모든 사람들이 자신의 질문에 전화를했다면 상상해보십시오. 문제가 비슷한 사람들을 찾는 데 유용 할 수 있습니까? 보다 정확한 제목을 찾으십시오. – didierc

* 제공하는 코드는 컴파일되지 않습니다 *.코드를 너무 많이 공유하고 싶지 않거나 질문 본문에 모든 것을 포함시키는 것이 너무 길다는 생각이 들었습니다. 이 경우 관리 가능한 구현으로 문제를 줄이기 바랍니다. – didierc

내가 질문을 뽑아 때, 답 10 개 관련 질문이 있었다 @mareknr 사이드 바는 멀티 스레드 구현이 단일 스레드보다 느린 이유와 관련이있었습니다. 나는 그 중 하나 이상이 귀하의 질문에 대답 할 것이라고 생각합니다. 당신은 "파도"로직 로직을 처리하는 스레드를 분리 할 필요가

Multi-threaded GEMM slower than single threaded one?

C++ Boost Multithread is slower than single thread because of CPU type?

Why is this OpenMP program slower than single-thread?

2 threads slower than 1?

출처

2014-12-10 00:37:23

몇 가지 사항을 변경했지만 결과는 변경되지 않았습니다. 문제점 설명 끝에 EDIT 파트를 참조하십시오. – mareknr