저는 병렬 프로그래밍의 기본 개념 중 무언가를 제대로 이해하지 못하고 있는 것 같습니다. 아래 커널은 제가 겪고 있는 문제를 재현하는 단순화된(인위적으로 만든) 예제입니다. 이 커널은 "points"의 모든 값을 사용해 하나의 값을 계산하고, 그 값을 "blocks"의 모든 항목에 할당하려고 합니다. 저는 이 배열들의 크기를 한계까지 늘려 보고 싶습니다. "blocks" 배열은 상당히 크게 만들어도 문제가 없지만, "points"가 100,000개 이상의 float 값으로 채워지면 (clEnqueueNDRangeKernel 직후에 clFinish를 호출한 뒤) "invalid command queue" 오류가 발생합니다. 왜 그런지 이해할 수 있도록 도와주실 수 있습니까? — 제목: 전역 메모리 float 배열을 순회하는 OpenCL 커널 루프
/*
 * Test kernel: every work-item runs a loop of pointsCount iterations that
 * only increments a counter, then stores that counter (converted to float)
 * into blocks[] at the work-item's global id in dimension 0.
 * The points buffer is accepted as an argument but is never read.
 */
__kernel void openClTesting (__global float *blocks, __global float *points, int pointsCount)
{
    int tally = 0;
    int step = 0;
    while (step < pointsCount)
    {
        tally++;
        step++;
    }

    /* One output slot per work-item. */
    int slot = get_global_id(0);
    blocks[slot] = tally;
}
장치 정보 일부:
CL_DEVICE_LOCAL_MEM_SIZE = 49,152
CL_DEVICE_GLOBAL_MEM_SIZE = 2,147,483,648
CL_DEVICE_MAX_MEM_ALLOC_SIZE = 536,870,912
호스트 코드:
#include "stdafx.h"
#include "CL\opencl.h"
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <cstring>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>
// Number of floats in the host "points" array; also used as the global work size
// of the kernel launch and passed to the kernel as its loop bound.
#define NUM_POINTS 100000
// Number of floats in the host "blocks" array and its device buffer.
#define NUM_BLOCKS 100000000
/**
 * Bundles the OpenCL handles and launch parameters used by the host program.
 *
 * Every member has a default so that an object whose setup calls failed (or
 * were never made) holds null handles / zero counts instead of indeterminate
 * values.
 */
struct openClData
{
    cl_device_id deviceId = NULL;         // device selected for the run
    cl_uint numDevices = 0;               // device count reported by clGetDeviceIDs
    cl_uint numPlatforms = 0;             // platform count reported by clGetPlatformIDs
    cl_int ret = 0;                       // status of the most recent OpenCL call (0 == CL_SUCCESS)
    cl_platform_id *platforms = NULL;     // array of numPlatforms platform ids
    cl_context context = NULL;
    cl_command_queue commandQueue = NULL;
    cl_program program = NULL;
    cl_kernel kernel = NULL;
    char* kernelCode = NULL;              // NUL-terminated kernel source text
    cl_uint kernelCodeSize = 0;           // length of kernelCode in characters
    size_t globalItemSize = 0;            // global work size (dimension 0)
    size_t localItemSize = 1;             // work-group size (dimension 0)
};
// Returns the OpenCL C source text of the test kernel as a C string.
char* getKernelCode();
// Fetches the build log of oclData.program for oclData.deviceId and writes it to stdout.
void printErrorLog(openClData oclData);
// Writes "<line>, <oclData.ret>" to stdout; used to trace the status of each OpenCL call.
void printRet(openClData oclData, int line);
// Returns the end-of-file stream position of the named file.
int countFileChars(const char *fileName);
int _tmain(int argc, _TCHAR* argv[])
{
openClData oclData;
oclData.globalItemSize = NUM_POINTS;
oclData.kernelCode = getKernelCode();
std::cout << oclData.kernelCode << std::endl;
oclData.kernelCodeSize = strlen(oclData.kernelCode);
int numPoints = NUM_POINTS;
int numBlocks = NUM_BLOCKS;
cl_long localMemSize = 0, globalMemSize = 0, maxAllocMemSize = 0;
float *blocks = new float[numBlocks]{0};
float *points = new float[numPoints]{0};
//prepare platform, device, context and command queue
oclData.ret = clGetPlatformIDs(0, NULL, &oclData.numPlatforms);
printRet(oclData, __LINE__);
oclData.platforms = (cl_platform_id *)malloc(oclData.numPlatforms * sizeof(cl_platform_id));
oclData.ret = clGetPlatformIDs(oclData.numPlatforms, oclData.platforms, NULL);
printRet(oclData, __LINE__);
oclData.ret = clGetDeviceIDs(oclData.platforms[0], CL_DEVICE_TYPE_GPU, 1, &oclData.deviceId, &oclData.numDevices);
printRet(oclData, __LINE__);
oclData.context = clCreateContext(NULL, 1, &oclData.deviceId, NULL, NULL, &oclData.ret);
printRet(oclData, __LINE__);
oclData.commandQueue = clCreateCommandQueue(oclData.context, oclData.deviceId, 0, &oclData.ret);
printRet(oclData, __LINE__);
//prepare cl_mem objects
cl_mem memObjBlocks = clCreateBuffer(oclData.context, CL_MEM_READ_WRITE, sizeof(float) * numBlocks, NULL, &oclData.ret);
printRet(oclData, __LINE__);
cl_mem memObjPoints = clCreateBuffer(oclData.context, CL_MEM_READ_WRITE, sizeof(float) * numPoints, NULL, &oclData.ret);
printRet(oclData, __LINE__);
oclData.ret = clEnqueueWriteBuffer(oclData.commandQueue, memObjBlocks, CL_TRUE, 0, sizeof(float) * numBlocks, blocks, 0, NULL, NULL);
printRet(oclData, __LINE__);
oclData.ret = clEnqueueWriteBuffer(oclData.commandQueue, memObjPoints, CL_TRUE, 0, sizeof(float) * numPoints, points, 0, NULL, NULL);
printRet(oclData, __LINE__);
//prepare program
oclData.program = clCreateProgramWithSource(oclData.context, 1, (const char**)&oclData.kernelCode, (const size_t *)&oclData.kernelCodeSize, &oclData.ret);
printRet(oclData, __LINE__);
oclData.ret = clBuildProgram(oclData.program, 1, &oclData.deviceId, NULL, NULL, NULL);
printRet(oclData, __LINE__);
if (oclData.ret == CL_BUILD_PROGRAM_FAILURE) printErrorLog(oclData);
oclData.kernel = clCreateKernel(oclData.program, "openClTesting", &oclData.ret);
printRet(oclData, __LINE__);
//set arguments
oclData.ret = clSetKernelArg(oclData.kernel, 0, sizeof(cl_mem), &memObjBlocks);
printRet(oclData, __LINE__);
oclData.ret = clSetKernelArg(oclData.kernel, 1, sizeof(cl_mem), &memObjPoints);
printRet(oclData, __LINE__);
oclData.ret = clSetKernelArg(oclData.kernel, 2, sizeof(int), &numPoints);
printRet(oclData, __LINE__);
//run
oclData.ret = clEnqueueNDRangeKernel(oclData.commandQueue, oclData.kernel, 1, NULL, &oclData.globalItemSize, &oclData.localItemSize, 0, NULL, NULL);
printRet(oclData, __LINE__);
oclData.ret = clFinish(oclData.commandQueue);
printRet(oclData, __LINE__);
oclData.ret = clEnqueueReadBuffer(oclData.commandQueue, memObjBlocks, CL_TRUE, 0, sizeof(float) * numBlocks, blocks, 0, NULL, NULL);
printRet(oclData, __LINE__);
oclData.ret = clFinish(oclData.commandQueue);
printRet(oclData, __LINE__);
//print some device info
oclData.ret = clGetDeviceInfo(oclData.deviceId, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(cl_ulong), &localMemSize, 0);
std::cout << "CL_DEVICE_LOCAL_MEM_SIZE = " << localMemSize << '\n';
oclData.ret = clGetDeviceInfo(oclData.deviceId, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(cl_long), &globalMemSize, 0);
std::cout << "CL_DEVICE_GLOBAL_MEM_SIZE = " << globalMemSize << '\n';
oclData.ret = clGetDeviceInfo(oclData.deviceId, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_long), &maxAllocMemSize, 0);
std::cout << "CL_DEVICE_MAX_MEM_ALLOC_SIZE = " << maxAllocMemSize << '\n';
//clean up
oclData.ret = clFlush(oclData.commandQueue);
printRet(oclData, __LINE__);
oclData.ret = clFinish(oclData.commandQueue);
printRet(oclData, __LINE__);
oclData.ret = clReleaseKernel(oclData.kernel);
printRet(oclData, __LINE__);
oclData.ret = clReleaseProgram(oclData.program);
printRet(oclData, __LINE__);
oclData.ret = clReleaseMemObject(memObjBlocks);
printRet(oclData, __LINE__);
oclData.ret = clReleaseMemObject(memObjPoints);
printRet(oclData, __LINE__);
oclData.ret = clReleaseCommandQueue(oclData.commandQueue);
printRet(oclData, __LINE__);
oclData.ret = clReleaseContext(oclData.context);
printRet(oclData, __LINE__);
for (size_t i = 0; i < 10; i++)
{
std::cout << blocks[i] << std::endl;
}
delete blocks;
delete points;
return 0;
}
/**
 * Returns the OpenCL C source of the openClTesting kernel.
 *
 * The text lives in a function-local static array rather than being a string
 * literal bound to a non-const char*, a conversion that C++11 and later do not
 * allow. Every call returns a pointer to the same NUL-terminated buffer, which
 * stays valid for the lifetime of the program.
 *
 * @return pointer to the kernel source text.
 */
char* getKernelCode()
{
    static char kernelCode[] =
        "__kernel void openClTesting (__global float *blocks, __global float *points, int pointsCount)"
        "{"
        " int globalId = get_global_id(0);"
        " int count = 0;"
        " for (int i = 0; i < pointsCount; i++)"
        " {"
        " count++;"
        " }"
        "blocks[globalId] = count;"
        "}";
    return kernelCode;
}
/**
 * Writes the build log of oclData.program for oclData.deviceId to stdout.
 *
 * The log size is queried first and the log is then fetched into a buffer of
 * that size. If either query fails, a diagnostic with the status code goes to
 * stderr instead; an empty log prints nothing.
 *
 * @param oclData  holder of the program and device whose build log is wanted.
 */
void printErrorLog(openClData oclData)
{
    size_t log_size = 0;
    cl_int status = clGetProgramBuildInfo(oclData.program, oclData.deviceId, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
    if (status != CL_SUCCESS)
    {
        std::cerr << "clGetProgramBuildInfo (size query) failed: " << status << std::endl;
        return;
    }
    if (log_size == 0)
    {
        return; // nothing to print
    }

    // std::string owns the buffer, so it is released on every path.
    std::string log(log_size, '\0');
    status = clGetProgramBuildInfo(oclData.program, oclData.deviceId, CL_PROGRAM_BUILD_LOG, log_size, &log[0], NULL);
    if (status != CL_SUCCESS)
    {
        std::cerr << "clGetProgramBuildInfo (log query) failed: " << status << std::endl;
        return;
    }

    // Print as a C string so output stops at the first NUL, as before.
    std::cout << log.c_str();
}
/**
 * Traces one OpenCL call: writes the caller-supplied source line number and
 * the status stored in oclData.ret to stdout as "<line>, <ret>", then ends
 * the line and flushes.
 */
void printRet(openClData oclData, int line)
{
    std::ostream &out = std::cout;
    out << line;
    out << ", ";
    out << oclData.ret;
    out << std::endl;
}
/**
 * Reports the size of a file as the stream position at its end.
 *
 * The file is opened with the default (text) open mode, and the position
 * returned by tellg() after seeking to the end is used as the count.
 *
 * @param fileName  path of the file to measure.
 * @return the end-of-file position, or -1 when fileName is null, the file
 *         cannot be opened, or the position cannot be determined. Sizes that
 *         do not fit in an int are not handled.
 */
int countFileChars(const char *fileName)
{
    if (fileName == nullptr)
    {
        return -1;
    }

    std::ifstream ifs(fileName);
    if (!ifs.is_open())
    {
        return -1;
    }

    ifs.seekg(0, std::ios_base::end);
    // tellg() yields pos_type(-1) on failure; test for that explicitly
    // instead of passing it through an unsigned type.
    const std::streamoff endPos = ifs.tellg();
    if (endPos < 0)
    {
        return -1;
    }
    return static_cast<int>(endPos);
}
사용 중인 OpenCL 구현에서 [`CL_DEVICE_GLOBAL_MEM_SIZE` 및 `CL_DEVICE_MAX_MEM_ALLOC_SIZE`](https://www.khronos.org/registry/OpenCL/sdk/1.0/docs/man/xhtml/clGetDeviceInfo.html)의 결과를 확인해 보셨습니까? 그리고 그 값들이 지금 겪고 계신 한계와 어떤 관련이 있습니까? "invalid command queue" 오류와 관련해서는 호스트 코드(버퍼 할당, 명령 큐에 넣는 부분 등)를 게시해 주실 수 있습니까? 그 정보가 없으면 많은 도움을 드리기 어려울 것 같습니다. – pmdj
시간을 내어 주셔서 감사합니다. 게시물을 업데이트했습니다. – amcmahon