#define CL_TARGET_OPENCL_VERSION 120

#include <utility>
#include <CL/cl.hpp>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <iterator>
#include <string>
#include <vector>

const std::string hw("Hello World!");

inline void checkErr(cl_int err, const char* name)
{
    if(err != CL_SUCCESS) {
        std::cerr << "ERROR: " << name << " (" << err << ")" << std::endl;
        exit(EXIT_FAILURE);
    }
}

void initMatrix(float *mat, int size)
{
    for(int i = 0; i < size; i++) {
        mat[i] = rand() % 50;
    }
}

int main(void)
{
    cl_int err;

    std::vector<cl::Platform> platformList;
    cl::Platform::get(&platformList);
    checkErr(platformList.size() != 0 ? CL_SUCCESS : -1, "cl::Platform::get");
    std::cerr << "Number of platforms: " << platformList.size() << std::endl;

    for(size_t i = 0; i < platformList.size(); i++) {
        std::string platformVendor;
        platformList[i].getInfo((cl_platform_info)CL_PLATFORM_VENDOR, &platformVendor);
        std::cerr << "Platform #" << i << " Vendor: " << platformVendor << std::endl;

        std::string platformName;
        platformList[i].getInfo((cl_platform_info)CL_PLATFORM_NAME, &platformName);
        std::cerr << "Platform #" << i << " Name: " << platformName << std::endl;
    }

    cl_context_properties cprops[3] = {
        CL_CONTEXT_PLATFORM,
        (cl_context_properties)(platformList[0])(),
        0
    };
    cl::Context context(CL_DEVICE_TYPE_GPU, cprops, NULL, NULL, &err);
    checkErr(err, "Context::Context()");

    // Matrices as seen on the OpenCL device
    cl::Buffer deviceA;
    cl::Buffer deviceB;
    cl::Buffer deviceC;

    // Seed rand()
    srand(432414);

    // Allocate host-side memory for the inputs
    int widthA = 1024;
    int widthB = 1024;

    unsigned int sizeA = widthA * widthA;
    unsigned int memSizeA = sizeA * sizeof(float);
    float* hostA = (float*) malloc(memSizeA);

    unsigned int sizeB = widthB * widthB;
    unsigned int memSizeB = sizeB * sizeof(float);
    float* hostB = (float*) malloc(memSizeB);

    initMatrix(hostA, sizeA);
    initMatrix(hostB, sizeB);

    // Allocate host memory for the result
    unsigned int sizeC = widthA * widthB;
    unsigned int memSizeC = sizeC * sizeof(float);
    float* hostC = (float*) malloc(memSizeC);

    std::vector<cl::Device> devices;
    devices = context.getInfo<CL_CONTEXT_DEVICES>();
    checkErr(devices.size() > 0 ? CL_SUCCESS : -1, "devices.size() > 0");

    // Read the kernel source from disk and build it for the context's devices
    std::ifstream file("fresh_kernel.cl");
    checkErr(file.is_open() ? CL_SUCCESS : -1, "fresh_kernel.cl");

    std::string programSourceString(std::istreambuf_iterator<char>(file),
                                    (std::istreambuf_iterator<char>()));
    cl::Program::Sources programSource(1,
        std::make_pair(programSourceString.c_str(), programSourceString.length() + 1));
    cl::Program program(context, programSource);
    err = program.build(devices, "");
    checkErr(err, "Program::build()");

    cl::Kernel kernel(program, "matrix_mult", &err);
    checkErr(err, "Kernel::Kernel()");

    deviceC = cl::Buffer(context, CL_MEM_WRITE_ONLY, memSizeC, NULL, &err);
    deviceA = cl::Buffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, memSizeA, hostA, &err);
    deviceB = cl::Buffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, memSizeB, hostB, &err);

    size_t localWorkSize[2], globalWorkSize[2];

    err  = kernel.setArg(0, deviceC);
    err |= kernel.setArg(1, deviceA);
    err |= kernel.setArg(2, deviceB);
    err |= kernel.setArg(3, widthA);
    err |= kernel.setArg(4, widthB);

    localWorkSize[0] = 16;
    localWorkSize[1] = 16;
    globalWorkSize[0] = 1024;
    globalWorkSize[1] = 1024;

    cl::CommandQueue queue(context, devices[0], 0, &err);
    checkErr(err, "CommandQueue::CommandQueue()");

    cl::Event event;
    err = queue.enqueueNDRangeKernel(kernel, cl::NullRange,
                                     cl::NDRange(globalWorkSize[0], globalWorkSize[1]),
                                     cl::NDRange(localWorkSize[0], localWorkSize[1]),
                                     NULL, &event);
    checkErr(err, "CommandQueue::enqueueNDRangeKernel()");
    event.wait();

    err = queue.enqueueReadBuffer(deviceC, CL_TRUE, 0, memSizeC, hostC);
    checkErr(err, "CommandQueue::enqueueReadBuffer()");

    // Verify the device result against a host-side reference multiplication
    int matches = 0;
    for(int y = 0; y < widthA; y++) {
        for(int x = 0; x < widthA; x++) {
            float value = 0;
            for(int k = 0; k < widthA; k++) {
                float elemA = hostA[y * widthA + k];
                float elemB = hostB[k * widthB + x];
                value += (elemA * elemB);
            }
            if(hostC[y * widthA + x] == value) {
                matches++;
            } else {
                std::cout << "Device gave " << hostC[y * widthA + x]
                          << ", host gave " << value << std::endl;
            }
        }
    }
    std::cout << matches << "/" << widthA * widthA << " correct." << std::endl;

    return EXIT_SUCCESS;
}
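
/*
 * The "matrix_mult" kernel loaded from fresh_kernel.cl is not part of this
 * listing. A minimal sketch of a kernel that would match the argument order
 * set above (C, A, B, widthA, widthB) and the one-work-item-per-output-element
 * launch (1024x1024 global, 16x16 local) could look like the following.
 * This is an assumption for illustration, not the original kernel source.
 *
 * __kernel void matrix_mult(__global float* C,
 *                           __global const float* A,
 *                           __global const float* B,
 *                           int widthA,
 *                           int widthB)
 * {
 *     int x = get_global_id(0);   // column of C
 *     int y = get_global_id(1);   // row of C
 *
 *     // Dot product of row y of A with column x of B, matching the
 *     // host-side verification loop in main().
 *     float value = 0.0f;
 *     for (int k = 0; k < widthA; k++) {
 *         value += A[y * widthA + k] * B[k * widthB + x];
 *     }
 *     C[y * widthA + x] = value;
 * }
 */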