OpenCLTutorial/main.cpp

140 lines
4.3 KiB
C++

#define CL_TARGET_OPENCL_VERSION 120
#include <utility>
#include <CL/cl.hpp>
#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <string>
#include <iterator>
// Greeting string kept at file scope; nothing in the visible part of this
// file reads it.
const std::string hw = "Hello World!";
// Terminates the program when an OpenCL status code signals failure.
//   err  - status code to inspect; CL_SUCCESS means "carry on".
//   name - label of the failed step, echoed in the diagnostic.
// On failure the label and numeric code are written to stderr and the
// process exits with EXIT_FAILURE; on success the call is a no-op.
inline void checkErr(cl_int err, const char* name) {
    if (err == CL_SUCCESS) {
        return;
    }
    std::cerr << "ERROR: " << name << " (" << err << ")" << std::endl;
    exit(EXIT_FAILURE);
}
// Fills the first `size` elements of `mat` with pseudo-random whole numbers
// in the range [0, 49], drawn from rand() in ascending element order.
//   mat  - destination buffer; must hold at least `size` floats.
//   size - number of elements to write; nothing is written when size <= 0.
void initMatrix(float *mat, int size) {
    float *cursor = mat;
    for (int remaining = size; remaining > 0; --remaining) {
        const int sample = rand() % 50;
        *cursor++ = sample;
    }
}
int main(void) {
cl_int err;
std::vector<cl::Platform> platformList;
cl::Platform::get(&platformList);
checkErr(platformList.size() != 0 ? CL_SUCCESS : -1, "cl::Platform::get");
std::cerr << "Number of platforms: " << platformList.size() << std::endl;
for(int i = 0; i < platformList.size(); i++) {
std::string platformVendor;
platformList[i].getInfo((cl_platform_info)CL_PLATFORM_VENDOR, &platformVendor);
std::cerr << "Platform #" << i << " Vendor: " << platformVendor << std::endl;
std::string platformName;
platformList[i].getInfo((cl_platform_info)CL_PLATFORM_NAME, &platformName);
std::cerr << "Platform #" << i << " Name: " << platformName << std::endl;
}
cl_context_properties cprops[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)(platformList[0])(), 0 };
cl::Context context(CL_DEVICE_TYPE_GPU, cprops, NULL, NULL, &err);
checkErr(err, "Context::Context()");
// Matrices as seen on OpenCL device
cl::Buffer deviceA;
cl::Buffer deviceB;
cl::Buffer deviceC;
// Seed rand()
srand(432414);
// Allocate host side of memory for inputs
int widthA = 1024;
int widthB = 1024;
unsigned int sizeA = widthA * widthA;
unsigned int memSizeA = sizeA * sizeof(float);
float* hostA = (float*) malloc(memSizeA);
unsigned int sizeB = widthB * widthB;
unsigned int memSizeB = sizeB * sizeof(float);
float* hostB = (float*) malloc(memSizeB);
initMatrix(hostA, sizeA);
initMatrix(hostB, sizeB);
// Allocate host memory for result
unsigned int sizeC = widthA * widthB;
unsigned int memSizeC = sizeC * sizeof(float);
float* hostC = (float*) malloc(memSizeC);
std::vector<cl::Device> devices;
devices = context.getInfo<CL_CONTEXT_DEVICES>();
checkErr(devices.size() > 0 ? CL_SUCCESS : -1, "devices.size() > 0");
std::ifstream file("fresh_kernel.cl");
checkErr(file.is_open() ? CL_SUCCESS : -1, "fresh_kernel.cl");
std::string programSourceString(std::istreambuf_iterator<char>(file), (std::istreambuf_iterator<char>()));
cl::Program::Sources programSource(1, std::make_pair(programSourceString.c_str(), programSourceString.length() + 1));
cl::Program program(context, programSource);
err = program.build(devices, "");
checkErr(err, "Program::Build()");
cl::Kernel kernel(program, "matrix_mult", &err);
checkErr(err, "Kernel::Kernel()");
deviceC = cl::Buffer(context, CL_MEM_WRITE_ONLY, memSizeC, NULL, &err);
deviceA = cl::Buffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, memSizeA, hostA, &err);
deviceB = cl::Buffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, memSizeB, hostB, &err);
size_t localWorkSize[2], globalWorkSize[2];
err = kernel.setArg(0, deviceC);
err |= kernel.setArg(1, deviceA);
err |= kernel.setArg(2, deviceB);
err |= kernel.setArg(3, widthA);
err |= kernel.setArg(4, widthB);
localWorkSize[0] = 16;
localWorkSize[1] = 16;
globalWorkSize[0] = 1024;
globalWorkSize[1] = 1024;
cl::CommandQueue queue(context, devices[0], 0, &err);
checkErr(err, "CommandQueue::CommandQueue()");
cl::Event event;
err = queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(1024, 1024), cl::NDRange(16, 16), NULL, &event);
checkErr(err, "CommandQueue::enqueueNDRangeKernel()");
event.wait();
err = queue.enqueueReadBuffer(deviceC, CL_TRUE, 0, memSizeC, hostC);
checkErr(err, "CommandQueue::enqueueReadBuffer()");
// Verification
int matches = 0;
for(int y = 0; y < widthA; y++) {
for(int x = 0; x < widthA; x++) {
float value = 0;
for(int k = 0; k < widthA; k++) {
float elemA = hostA[y * widthA + k];
float elemB = hostB[k * widthB + x];
value += (elemA * elemB);
}
if(hostC[y * widthA + x] == value) {
matches++;
} else {
std::cout << "Device gave " << hostC[y * widthA + x] << ", host gave " << value << std::endl;
}
}
}
std::cout << matches << "/" << widthA * widthA << " correct." << std::endl;
return EXIT_SUCCESS;
}