2012-04-05 1 views
1

몇 시간 동안 읽고 이해한 끝에, 마침내 실제로 무언가를 하는 첫 번째 OpenCL 프로그램을 만들었습니다. 이 프로그램은 두 벡터를 더한 뒤 그 결과를 파일에 출력합니다. — OpenCL, 간단한 벡터 덧셈이지만 큰 입력에서는 잘못된 출력이 나옵니다.

#include <iostream> 
#include <vector> 
#include <cstdlib> 
#include <string> 
#include <fstream> 

#define __CL_ENABLE_EXCEPTIONS 
#include <CL/cl.hpp> 

int main(int argc, char *argv[]) 
{ 
    try 
    { 
// get platforms, devices and display their info. 
     std::vector<cl::Platform> platforms; 
     cl::Platform::get(&platforms); 

     std::vector<cl::Platform>::iterator i=platforms.begin(); 

     std::cout<<"OpenCL \tPlatform : "<<i->getInfo<CL_PLATFORM_NAME>()<<std::endl; 
     std::cout<<"\tVendor: "<<i->getInfo<CL_PLATFORM_VENDOR>()<<std::endl; 
     std::cout<<"\tVersion : "<<i->getInfo<CL_PLATFORM_VERSION>()<<std::endl; 
     std::cout<<"\tExtensions : "<<i->getInfo<CL_PLATFORM_EXTENSIONS>()<<std::endl; 

// get devices 
     std::vector<cl::Device> devices; 
     i->getDevices(CL_DEVICE_TYPE_ALL,&devices); 
     int o=99; 
     std::cout<<"\n\n"; 
// iterate over available devices 
     for(std::vector<cl::Device>::iterator j=devices.begin(); j!=devices.end(); j++) 
     { 
      std::cout<<"\tOpenCL\tDevice : " << j->getInfo<CL_DEVICE_NAME>()<<std::endl; 
      std::cout<<"\t\t Type : " << j->getInfo<CL_DEVICE_TYPE>()<<std::endl; 
      std::cout<<"\t\t Vendor : " << j->getInfo<CL_DEVICE_VENDOR>()<<std::endl; 
      std::cout<<"\t\t Driver : " << j->getInfo<CL_DRIVER_VERSION>()<<std::endl; 
      std::cout<<"\t\t Global Mem : " << j->getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>()/(1024*1024)<<" MBytes"<<std::endl; 
      std::cout<<"\t\t Local Mem : " << j->getInfo<CL_DEVICE_LOCAL_MEM_SIZE>()/1024<<" KBbytes"<<std::endl; 
      std::cout<<"\t\t Compute Unit : " << j->getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>()<<std::endl; 
      std::cout<<"\t\t Clock Rate : " << j->getInfo<CL_DEVICE_MAX_CLOCK_FREQUENCY>()<<" MHz"<<std::endl; 
     } 
     std::cout<<"\n\n\n"; 





//MAIN CODE BEGINS HERE 


//get Kernel 
     std::ifstream ifs("vector_add_kernel.cl"); 
     std::string kernelSource((std::istreambuf_iterator<char>(ifs)), std::istreambuf_iterator<char>()); 
     std::cout<<kernelSource; 

    //Create context, select device and command queue. 
     cl::Context context(devices); 
     cl::Device &device=devices.front(); 
     cl::CommandQueue cmdqueue(context,device); 

    // Generate Source vector and push the kernel source in it. 
     cl::Program::Sources sourceCode; 
     sourceCode.push_back(std::make_pair(kernelSource.c_str(), kernelSource.size())); 

    //Generate program using sourceCode 
     cl::Program program=cl::Program(context, sourceCode); 

    //Build program.. 
     try 
     { 
      program.build(devices); 
     } 
     catch(cl::Error &err) 
     { 
      std::cerr<<"Building failed, "<<err.what()<<"("<<err.err()<<")" 
        <<"\nRetrieving build log" 
        <<"\n Build Log Follows \n" 
        <<program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(devices.front()); 
     } 


    //Declare and initialize vectors 
     std::vector<cl_float>B(993448,1.3); 
     std::vector<cl_float>C(993448,1.3); 
     std::vector<cl_float>A(993448,1.3); 
     cl_int N=A.size(); 

    //Declare and intialize proper work group size and global size. Global size raised to the nearest multiple of workGroupSize. 
     int workGroupSize=128; 
     int GlobalSize; 
     if(N%workGroupSize) GlobalSize=N - N%workGroupSize + workGroupSize; 
     else GlobalSize=N; 
    //Declare buffers. 
     cl::Buffer vecA(context, CL_MEM_READ_WRITE, sizeof(cl_float)*N); 
     cl::Buffer vecB(context, CL_MEM_READ_ONLY , (B.size())*sizeof(cl_float)); 
     cl::Buffer vecC(context, CL_MEM_READ_ONLY , (C.size())*sizeof(cl_float)); 
    //Write vectors into buffers 
     cmdqueue.enqueueWriteBuffer(vecB, 0, 0, (B.size())*sizeof(cl_float), &B[0]); 
     cmdqueue.enqueueWriteBuffer(vecB, 0, 0, (C.size())*sizeof(cl_float), &C[0]); 

//Executing kernel 
     cl::Kernel kernel(program, "vector_add"); 
     cl::KernelFunctor kernel_func=kernel.bind(cmdqueue, cl::NDRange(GlobalSize), cl::NDRange(workGroupSize)); 

     kernel_func(vecA, vecB, vecC, N); 

//Reading back values into vector A 
     cmdqueue.enqueueReadBuffer(vecA,true,0,N*sizeof(cl_float), &A[0]); 
     cmdqueue.finish(); 


    //Saving into file. 
     std::ofstream output("vectorAdd.txt"); 
     for(int i=0;i<N;i++) output<<A[i]<<"\n"; 

    } 
    catch(cl::Error& err) 
    { 
     std::cerr << "OpenCL error: " << err.what() << "(" << err.err() << 
        ")" << std::endl; 

     return EXIT_FAILURE; 
    } 

    return EXIT_SUCCESS; 
} 

문제는 N이 작을 때는 올바른 결과인 2.6을 얻지만, 위 코드의 값(993448)처럼 N이 클 때는 1에서 2.4 사이에서 변하는 쓰레기 값이 출력된다는 것입니다.

__kernel void vector_add(__global float *A, __global float *B, __global float *C, int N) {
    // Each work-item handles one element; its global id is the element index.
    const int idx = get_global_id(0);

    // Work-items whose index is not below N have no element to process
    // (possible when the global size is larger than N), so they do nothing.
    if (idx >= N) {
        return;
    }

    // Element-wise sum of the two inputs, stored in the output array.
    A[idx] = C[idx] + B[idx];
}

UPDATE: 여기 커널 코드가 있습니다(위 참조).

이제 코드가 제대로 동작하는 것 같습니다. 위 코드에서 몇 가지 사소한 실수를 수정했습니다. 1) GlobalSize를 초기화하는 부분을 수정했습니다. 2) enqueueWriteBuffer에서의 바보 같은 실수(잘못된 매개변수를 전달함)를 고쳤습니다. 이제 큰 N 값에서도 올바른 결과가 출력됩니다. 또한 데이터 타입을 float에서 double로 변경했습니다.

답변

관련 문제