diff --git a/fresh_kernel.cl b/fresh_kernel.cl new file mode 100644 index 0000000..ed70e8f --- /dev/null +++ b/fresh_kernel.cl @@ -0,0 +1,15 @@ +#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable + +__kernel void matrix_mult(__global float* C, __global float* A, __global float* B, int widthA, int widthB) { + int x = get_global_id(0); + int y = get_global_id(1); + + float value = 0; + for(int k = 0; k < widthA; k++) { + float elemA = A[y * widthA + k]; + float elemB = B[k * widthB + x]; + value += (elemA * elemB); + } + + C[y * widthA + x] = value; +} diff --git a/hello b/hello deleted file mode 100755 index ca17254..0000000 Binary files a/hello and /dev/null differ diff --git a/main.cpp b/main.cpp index 06c2576..7771ea3 100644 --- a/main.cpp +++ b/main.cpp @@ -16,6 +16,12 @@ inline void checkErr(cl_int err, const char* name) { } } +void initMatrix(float *mat, int size) { + for(int i = 0; i < size; i++) { + mat[i] = rand() % 50; + } +} + int main(void) { cl_int err; @@ -38,16 +44,40 @@ int main(void) { cl::Context context(CL_DEVICE_TYPE_GPU, cprops, NULL, NULL, &err); checkErr(err, "Context::Context()"); - char* outH = new char [hw.length()+1]; - cl::Buffer outCL(context, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, hw.length()+1, outH, &err); - checkErr(err, "Buffer::Buffer()"); + // Matrices as seen on OpenCL device + cl::Buffer deviceA; + cl::Buffer deviceB; + cl::Buffer deviceC; + + // Seed rand() + srand(432414); + + // Allocate host side of memory for inputs + int widthA = 1024; + int widthB = 1024; + + unsigned int sizeA = widthA * widthA; + unsigned int memSizeA = sizeA * sizeof(float); + float* hostA = (float*) malloc(memSizeA); + + unsigned int sizeB = widthB * widthB; + unsigned int memSizeB = sizeB * sizeof(float); + float* hostB = (float*) malloc(memSizeB); + + initMatrix(hostA, sizeA); + initMatrix(hostB, sizeB); + + // Allocate host memory for result + unsigned int sizeC = widthA * widthB; + unsigned int memSizeC = sizeC * sizeof(float); + float* hostC = (float*) malloc(memSizeC); std::vector devices; devices = context.getInfo(); checkErr(devices.size() > 0 ? CL_SUCCESS : -1, "devices.size() > 0"); - std::ifstream file("lesson1_kernel.cl"); - checkErr(file.is_open() ? CL_SUCCESS : -1, "lesson1_kernel.cl"); + std::ifstream file("fresh_kernel.cl"); + checkErr(file.is_open() ? CL_SUCCESS : -1, "fresh_kernel.cl"); std::string programSourceString(std::istreambuf_iterator(file), (std::istreambuf_iterator())); cl::Program::Sources programSource(1, std::make_pair(programSourceString.c_str(), programSourceString.length() + 1)); @@ -55,21 +85,54 @@ int main(void) { err = program.build(devices, ""); checkErr(err, "Program::Build()"); - cl::Kernel kernel(program, "hello", &err); + cl::Kernel kernel(program, "matrix_mult", &err); checkErr(err, "Kernel::Kernel()"); - err = kernel.setArg(0, outCL); - checkErr(err, "Kernel::setArg()"); + + deviceC = cl::Buffer(context, CL_MEM_WRITE_ONLY, memSizeC, NULL, &err); + deviceA = cl::Buffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, memSizeA, NULL, &err); + deviceB = cl::Buffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, memSizeB, NULL, &err); + + size_t localWorkSize[2], globalWorkSize[2]; + + err = kernel.setArg(0, deviceC); + err |= kernel.setArg(1, deviceA); + err |= kernel.setArg(2, deviceB); + err |= kernel.setArg(3, widthA); + err |= kernel.setArg(4, widthB); + + localWorkSize[0] = 16; + localWorkSize[1] = 16; + globalWorkSize[0] = 1024; + globalWorkSize[1] = 1024; cl::CommandQueue queue(context, devices[0], 0, &err); checkErr(err, "CommandQueue::CommandQueue()"); cl::Event event; - err = queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(hw.length() + 1), cl::NDRange(1, 1), NULL, &event); + err = queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(1024, 1024), cl::NDRange(16, 16), NULL, &event); checkErr(err, "CommandQUeue::enqueueNDRangeKernel()"); event.wait(); - err = queue.enqueueReadBuffer(outCL, CL_TRUE, 0, hw.length() + 1, outH); + err = queue.enqueueReadBuffer(deviceC, CL_TRUE, 0, memSizeC, hostC); checkErr(err, "CommandQueue::enqueueReadBuffer"); - std::cout << outH; + + // Verification + int matches = 0; + for(int y = 0; y < widthA; y++) { + for(int x = 0; x < widthA; x++) { + float value = 0; + for(int k = 0; k < widthA; k++) { + float elemA = hostA[y * widthA + k]; + float elemB = hostB[k * widthB + x]; + value += (elemA * elemB); + } + if(hostC[y * widthA + x] == value) { + matches++; + } else { + std::cout << "Device gave " << hostC[y * widthA + x] << ", host gave " << value << std::endl; + } + } + } + std::cout << matches << "/" << widthA * widthA << " correct." << std::endl; return EXIT_SUCCESS; }