Start on the matrix multiplication kernel
This commit is contained in:
parent
42b2451d22
commit
7e58dc842b
3 changed files with 89 additions and 11 deletions
15
fresh_kernel.cl
Normal file
15
fresh_kernel.cl
Normal file
|
@ -0,0 +1,15 @@
|
|||
#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable
|
||||
|
||||
__kernel void matrix_mult(__global float* C, __global float* A, __global float* B, int widthA, int widthB) {
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
float value = 0;
|
||||
for(int k = 0; k < widthA; k++) {
|
||||
float elemA = A[y * widthA + k];
|
||||
float elemB = B[k * widthB + x];
|
||||
value += (elemA * elemB);
|
||||
}
|
||||
|
||||
C[y * widthA + x] = value;
|
||||
}
|
BIN
hello
BIN
hello
Binary file not shown.
85
main.cpp
85
main.cpp
|
@ -16,6 +16,12 @@ inline void checkErr(cl_int err, const char* name) {
|
|||
}
|
||||
}
|
||||
|
||||
void initMatrix(float *mat, int size) {
|
||||
for(int i = 0; i < size; i++) {
|
||||
mat[i] = rand() % 50;
|
||||
}
|
||||
}
|
||||
|
||||
int main(void) {
|
||||
cl_int err;
|
||||
|
||||
|
@ -38,16 +44,40 @@ int main(void) {
|
|||
cl::Context context(CL_DEVICE_TYPE_GPU, cprops, NULL, NULL, &err);
|
||||
checkErr(err, "Context::Context()");
|
||||
|
||||
char* outH = new char [hw.length()+1];
|
||||
cl::Buffer outCL(context, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, hw.length()+1, outH, &err);
|
||||
checkErr(err, "Buffer::Buffer()");
|
||||
// Matrices as seen on OpenCL device
|
||||
cl::Buffer deviceA;
|
||||
cl::Buffer deviceB;
|
||||
cl::Buffer deviceC;
|
||||
|
||||
// Seed rand()
|
||||
srand(432414);
|
||||
|
||||
// Allocate host side of memory for inputs
|
||||
int widthA = 1024;
|
||||
int widthB = 1024;
|
||||
|
||||
unsigned int sizeA = widthA * widthA;
|
||||
unsigned int memSizeA = sizeA * sizeof(float);
|
||||
float* hostA = (float*) malloc(memSizeA);
|
||||
|
||||
unsigned int sizeB = widthB * widthB;
|
||||
unsigned int memSizeB = sizeB * sizeof(float);
|
||||
float* hostB = (float*) malloc(memSizeB);
|
||||
|
||||
initMatrix(hostA, sizeA);
|
||||
initMatrix(hostB, sizeB);
|
||||
|
||||
// Allocate host memory for result
|
||||
unsigned int sizeC = widthA * widthB;
|
||||
unsigned int memSizeC = sizeC * sizeof(float);
|
||||
float* hostC = (float*) malloc(memSizeC);
|
||||
|
||||
std::vector<cl::Device> devices;
|
||||
devices = context.getInfo<CL_CONTEXT_DEVICES>();
|
||||
checkErr(devices.size() > 0 ? CL_SUCCESS : -1, "devices.size() > 0");
|
||||
|
||||
std::ifstream file("lesson1_kernel.cl");
|
||||
checkErr(file.is_open() ? CL_SUCCESS : -1, "lesson1_kernel.cl");
|
||||
std::ifstream file("fresh_kernel.cl");
|
||||
checkErr(file.is_open() ? CL_SUCCESS : -1, "fresh_kernel.cl");
|
||||
|
||||
std::string programSourceString(std::istreambuf_iterator<char>(file), (std::istreambuf_iterator<char>()));
|
||||
cl::Program::Sources programSource(1, std::make_pair(programSourceString.c_str(), programSourceString.length() + 1));
|
||||
|
@ -55,21 +85,54 @@ int main(void) {
|
|||
err = program.build(devices, "");
|
||||
checkErr(err, "Program::Build()");
|
||||
|
||||
cl::Kernel kernel(program, "hello", &err);
|
||||
cl::Kernel kernel(program, "matrix_mult", &err);
|
||||
checkErr(err, "Kernel::Kernel()");
|
||||
err = kernel.setArg(0, outCL);
|
||||
checkErr(err, "Kernel::setArg()");
|
||||
|
||||
deviceC = cl::Buffer(context, CL_MEM_WRITE_ONLY, memSizeC, NULL, &err);
|
||||
deviceA = cl::Buffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, memSizeA, NULL, &err);
|
||||
deviceB = cl::Buffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, memSizeB, NULL, &err);
|
||||
|
||||
size_t localWorkSize[2], globalWorkSize[2];
|
||||
|
||||
err = kernel.setArg(0, deviceC);
|
||||
err |= kernel.setArg(1, deviceA);
|
||||
err |= kernel.setArg(2, deviceB);
|
||||
err |= kernel.setArg(3, widthA);
|
||||
err |= kernel.setArg(4, widthB);
|
||||
|
||||
localWorkSize[0] = 16;
|
||||
localWorkSize[1] = 16;
|
||||
globalWorkSize[0] = 1024;
|
||||
globalWorkSize[1] = 1024;
|
||||
|
||||
cl::CommandQueue queue(context, devices[0], 0, &err);
|
||||
checkErr(err, "CommandQueue::CommandQueue()");
|
||||
cl::Event event;
|
||||
err = queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(hw.length() + 1), cl::NDRange(1, 1), NULL, &event);
|
||||
err = queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(1024, 1024), cl::NDRange(16, 16), NULL, &event);
|
||||
checkErr(err, "CommandQUeue::enqueueNDRangeKernel()");
|
||||
|
||||
event.wait();
|
||||
err = queue.enqueueReadBuffer(outCL, CL_TRUE, 0, hw.length() + 1, outH);
|
||||
err = queue.enqueueReadBuffer(deviceC, CL_TRUE, 0, memSizeC, hostC);
|
||||
checkErr(err, "CommandQueue::enqueueReadBuffer");
|
||||
std::cout << outH;
|
||||
|
||||
// Verification
|
||||
int matches = 0;
|
||||
for(int y = 0; y < widthA; y++) {
|
||||
for(int x = 0; x < widthA; x++) {
|
||||
float value = 0;
|
||||
for(int k = 0; k < widthA; k++) {
|
||||
float elemA = hostA[y * widthA + k];
|
||||
float elemB = hostB[k * widthB + x];
|
||||
value += (elemA * elemB);
|
||||
}
|
||||
if(hostC[y * widthA + x] == value) {
|
||||
matches++;
|
||||
} else {
|
||||
std::cout << "Device gave " << hostC[y * widthA + x] << ", host gave " << value << std::endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
std::cout << matches << "/" << widthA * widthA << " correct." << std::endl;
|
||||
|
||||
return EXIT_SUCCESS;
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue