If you look at the sample code available for using OpenCL 1.2/2.0 from the primary vendors you will notice that it is all very complicated; some of it is out of date (i.e. using deprecated functions) or difficult to get running.
The demos they provide do some clever things and definitely show you some good tricks, but they are far from a good entry point. Ideally a newcomer to GPGPU wants to be able to open a single-file, compilable demo. It should be simple and cover the basics only. Nothing more.
The user should be able to walk from the top of the file to the bottom without having to jump around and see the working pipeline in-order and clearly. Once they are comfortable with seeing the minimum and can have a play around, then you can show them more than just the top layer.
If those of us in the high-performance computing end of software development haven't learned anything (and sometimes I think we haven't learned anything) from JavaScript, GUI development tools and the rapid pace of the app development world, we should hopefully have at least learned that getting someone onto your platform and working quickly is the best way to keep them. A single hour to gain a basic understanding of how to use something is better than a whole day of stress to gain only slightly more.
But enough ranting. I have written a minimal code demo for OpenCL in this style. It lacks all the options, robustness, safety and control of the Intel samples - but it's basically ~100 lines of code instead of many thousands, and it is enough to show the basic concepts and usage patterns.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 | //Include the OpenCL Headers // On Intel default directory is: C:\Intel\OpenCL\sdk\include\CL // During installation of the Intel SDK it usually placed $(INTELOCLSDKROOT) into your // system environment. So the project should have the default include directory of: $(INTELOCLSDKROOT)\include // On AMD and NVIDIA this is different. When you grab the SDK from their site it will usually tell me // but it is mostly variations on this. Easiest way to find it if the site it confusing is to open a sample // in visual studio and check the include directories listed in the project properties. #include <CL/cl.h> // Standard library to make some things easier. #include <vector> #include <string> #include <fstream> #include <streambuf> #define BUFFER_ENTRY_COUNT 256 int main() { //The important objects for initialising and using OpenCL. cl_platform_id platform = 0; cl_device_id device = 0; cl_context context = 0; cl_command_queue queue = 0; //First get our platform------------------- { //Find which platforms are available cl_uint numPlatforms = 0; cl_int err = clGetPlatformIDs(0, 0, &numPlatforms); if (err != CL_SUCCESS || numPlatforms == 0) return 0; //Fetch the IDs and take our first one. 
std::vector<cl_platform_id> availablePlatforms(numPlatforms); err = clGetPlatformIDs(numPlatforms, &availablePlatforms[0], 0); if (err != CL_SUCCESS) return 0; platform = availablePlatforms[0]; } //Now we need our device------------------ { //You can specify if you want CPU/GPU or Accelerator here, but for simple getting going //we will just take any. cl_device_type deviceType = CL_DEVICE_TYPE_ALL; // Same as above, get the number of devices before we fetch the information. cl_uint num_of_devices = 0; cl_int err = clGetDeviceIDs(platform, deviceType, 0, 0, &num_of_devices ); if (err != CL_SUCCESS || num_of_devices==0) return 0; //Fetch the ids and select the first one. std::vector<cl_device_id> deviceVector(num_of_devices); err = clGetDeviceIDs(platform, deviceType, num_of_devices, &deviceVector[0], 0); if (err != CL_SUCCESS) return 0; device = deviceVector[0]; } //Create the context minimal code. { cl_context_properties contextProps[3]; contextProps[0] = CL_CONTEXT_PLATFORM; contextProps[1] = cl_context_properties(platform); contextProps[2] = 0; cl_int err = 0; context = clCreateContext(&contextProps[0], 1, &device, 0, 0, &err); if (err != CL_SUCCESS) return 0; } //Create a Queue { cl_int err = 0; cl_command_queue_properties props= 0; queue = clCreateCommandQueueWithProperties(context, device, &props, &err); if (err != CL_SUCCESS) return 0; } std::string CLProgramFilename = "./simpleprogram.cl"; std::string CLProgramKernelName = "EmptyKernel"; std::string CLProgramSource = ""; cl_program CLProgram = 0; cl_kernel CLProgramKernel = 0; //Read program source code from file { std::ifstream file(CLProgramFilename); std::string temp; while (std::getline(file, temp)) { CLProgramSource.append(temp); } } //Create Program from source { //Take the source and get the program cl_int err; const char* text = CLProgramSource.c_str(); cl_program program = clCreateProgramWithSource(context, 1, &text, 0, &err); if (err != CL_SUCCESS) return 0; //Build it for your specified device. 
err = clBuildProgram(program, (cl_uint)1, &device, "", 0, 0); if (err != CL_SUCCESS) return 0; //Pull out the kernel(function) we want to use from the program. //Programs can have many kernels CLProgramKernel = clCreateKernel(program, CLProgramKernelName.c_str(), &err); if (err != CL_SUCCESS) return 0; } cl_mem outputBuffer = 0; cl_uint buffSize = BUFFER_ENTRY_COUNT * sizeof(cl_int); //Create an output Buffer { //We are creating a buffer here. The important flags are the CL_MEM_... ones. // In this example we say we want one that the kernel can only write and the CPU // can only request to read. // There are many options here and combining them in different ways has interesting performance effects. cl_int err = 0; outputBuffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_HOST_READ_ONLY, buffSize, NULL, &err); if (err != CL_SUCCESS) return 0; } //Run Kernel { //Set the buffer we write to. This maps to the index of the variable in the function in the kernel. cl_int err = clSetKernelArg(CLProgramKernel, 0, sizeof(cl_mem), (void *)&outputBuffer); //Global size is the total number of things we want to do. //Local size is the chunks we are breaking it into. If global not divisible by local //it will throw an error. cl_uint globalSize = BUFFER_ENTRY_COUNT; cl_uint localSize = 16; err = clEnqueueNDRangeKernel(queue, CLProgramKernel, 1, NULL, &globalSize, &localSize, 0, NULL, NULL); if (err != CL_SUCCESS) return 0; //Ensuring all the work is done before we copy out our buffer to check the kernel ran correctly. err = clFinish(queue); if (err != CL_SUCCESS) return 0; //Validate the output from our buffer cl_int ourOutput[BUFFER_ENTRY_COUNT]; err = clEnqueueReadBuffer(queue, outputBuffer, CL_TRUE, 0, buffSize, ourOutput, 0, NULL, NULL); if (err != CL_SUCCESS) return 0; //Check the array has the magic number in it if (ourOutput[6] != 42) return 0; } //Everything went well. return 0; } |
I hope this is useful to anyone trying to get started with OpenCL. I will do the same for Vulkan Compute and DirectX11/12 if I see this gets any traction.