diff --git a/fresh_kernel.cl b/fresh_kernel.cl
new file mode 100644
index 0000000..ed70e8f
--- /dev/null
+++ b/fresh_kernel.cl
@@ -0,0 +1,15 @@
+#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable
+
+__kernel void matrix_mult(__global float* C, __global float* A, __global float* B, int widthA, int widthB) {
+   int x = get_global_id(0);
+   int y = get_global_id(1);
+
+   float value = 0;
+   for(int k = 0; k < widthA; k++) {
+      float elemA = A[y * widthA + k];
+      float elemB = B[k * widthB + x];
+      value += (elemA * elemB);
+   }
+
+   C[y * widthA + x] = value;
+}
diff --git a/hello b/hello
deleted file mode 100755
index ca17254..0000000
Binary files a/hello and /dev/null differ
diff --git a/main.cpp b/main.cpp
index 06c2576..7771ea3 100644
--- a/main.cpp
+++ b/main.cpp
@@ -16,6 +16,12 @@ inline void checkErr(cl_int err, const char* name) {
 	}
 }
 
+void initMatrix(float *mat, int size) {
+	for(int i = 0; i < size; i++) {
+		mat[i] = rand() % 50;
+	}
+}
+
 int main(void) {
 	cl_int err;
 
@@ -38,16 +44,40 @@ int main(void) {
 	cl::Context context(CL_DEVICE_TYPE_GPU, cprops, NULL, NULL, &err);
 	checkErr(err, "Context::Context()");
 
-	char* outH = new char [hw.length()+1];
-	cl::Buffer outCL(context, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, hw.length()+1, outH, &err);
-	checkErr(err, "Buffer::Buffer()");
+	// Matrices as seen on OpenCL device
+	cl::Buffer deviceA;
+	cl::Buffer deviceB;
+	cl::Buffer deviceC;
+
+	// Seed rand()
+	srand(432414);
+
+	// Allocate host side of memory for inputs
+	int widthA = 1024;
+	int widthB = 1024;
+
+	unsigned int sizeA = widthA * widthA;
+	unsigned int memSizeA = sizeA * sizeof(float);
+	float* hostA = (float*) malloc(memSizeA);
+
+	unsigned int sizeB = widthB * widthB;
+	unsigned int memSizeB = sizeB * sizeof(float);
+	float* hostB = (float*) malloc(memSizeB);
+
+	initMatrix(hostA, sizeA);
+	initMatrix(hostB, sizeB);
+
+	// Allocate host memory for result
+	unsigned int sizeC = widthA * widthB;
+	unsigned int memSizeC = sizeC * sizeof(float);
+	float* hostC = (float*) malloc(memSizeC);
 
 	std::vector<cl::Device> devices;
 	devices = context.getInfo<CL_CONTEXT_DEVICES>();
 	checkErr(devices.size() > 0 ? CL_SUCCESS : -1, "devices.size() > 0");
 
-	std::ifstream file("lesson1_kernel.cl");
-	checkErr(file.is_open() ? CL_SUCCESS : -1, "lesson1_kernel.cl");
+	std::ifstream file("fresh_kernel.cl");
+	checkErr(file.is_open() ? CL_SUCCESS : -1, "fresh_kernel.cl");
 
 	std::string programSourceString(std::istreambuf_iterator<char>(file), (std::istreambuf_iterator<char>()));
 	cl::Program::Sources programSource(1, std::make_pair(programSourceString.c_str(), programSourceString.length() + 1));
@@ -55,21 +85,54 @@ int main(void) {
 	err = program.build(devices, "");
 	checkErr(err, "Program::Build()");
 
-	cl::Kernel kernel(program, "hello", &err);
+	cl::Kernel kernel(program, "matrix_mult", &err);
 	checkErr(err, "Kernel::Kernel()");
-	err = kernel.setArg(0, outCL);
-	checkErr(err, "Kernel::setArg()");
+
+	deviceC = cl::Buffer(context, CL_MEM_WRITE_ONLY, memSizeC, NULL, &err);
+	deviceA = cl::Buffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, memSizeA, NULL, &err);
+	deviceB = cl::Buffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, memSizeB, NULL, &err);
+
+	size_t localWorkSize[2], globalWorkSize[2];
+
+	err = kernel.setArg(0, deviceC);
+	err |= kernel.setArg(1, deviceA);
+	err |= kernel.setArg(2, deviceB);
+	err |= kernel.setArg(3, widthA);
+	err |= kernel.setArg(4, widthB);
+
+	localWorkSize[0] = 16;
+	localWorkSize[1] = 16;
+	globalWorkSize[0] = 1024;
+	globalWorkSize[1] = 1024;
 
 	cl::CommandQueue queue(context, devices[0], 0, &err);
 	checkErr(err, "CommandQueue::CommandQueue()");
 	cl::Event event;
-	err = queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(hw.length() + 1), cl::NDRange(1, 1), NULL, &event);
+	err = queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(1024, 1024), cl::NDRange(16, 16), NULL, &event);
 	checkErr(err, "CommandQUeue::enqueueNDRangeKernel()");
 
 	event.wait();
-	err = queue.enqueueReadBuffer(outCL, CL_TRUE, 0, hw.length() + 1, outH);
+	err = queue.enqueueReadBuffer(deviceC, CL_TRUE, 0, memSizeC, hostC);
 	checkErr(err, "CommandQueue::enqueueReadBuffer");
-	std::cout << outH;
+
+	// Verification
+	int matches = 0;
+	for(int y = 0; y < widthA; y++) {
+		for(int x = 0; x < widthA; x++) {
+			float value = 0;
+			for(int k = 0; k < widthA; k++) {
+		      float elemA = hostA[y * widthA + k];
+		      float elemB = hostB[k * widthB + x];
+		      value += (elemA * elemB);
+		   }
+			if(hostC[y * widthA + x] == value) {
+				matches++;
+			} else {
+				std::cout << "Device gave " << hostC[y * widthA + x] << ", host gave " << value << std::endl;
+			}
+		}
+	}
+	std::cout << matches << "/" << widthA * widthA << " correct." << std::endl;
 
 	return EXIT_SUCCESS;
 }