Showing
5 changed files
with
367 additions
and
0 deletions
.gitignore
0 → 100644
| 1 | +arrayadd |
Makefile
0 → 100644
README.md
0 → 100644
| 1 | +# OpenCL array add. | ||
| 2 | + | ||
| 3 | +Taken from | ||
| 4 | +[here](http://www.heterogeneouscompute.org/wordpress/wp-content/uploads/2011/06/Chapter2.txt). | ||
| 5 | + | ||
| 6 | +## Description | ||
| 7 | + | ||
| 8 | +This is an example of how to implement an element-wise array addition with OpenCL. | ||
| 9 | + | ||
| 10 | +## Requirements | ||
| 11 | + | ||
| 12 | +Some OpenCL-capable hardware and the corresponding OpenCL library exposing the | ||
| 13 | +OpenCL API. I tested this on an Intel GPU (Intel Corporation Haswell-ULT | ||
| 14 | +Integrated Graphics Controller (rev 09)) with the | ||
| 15 | +[beignet](https://www.freedesktop.org/wiki/Software/Beignet/) | ||
| 16 | +open source library. | ||
| 17 | + | ||
| 18 | +## License | ||
| 19 | + | ||
| 20 | +unknown |
README.md.old
0 → 100644
arrayadd.c
0 → 100644
| 1 | +// This program implements a vector addition using OpenCL | ||
| 2 | + | ||
| 3 | +// System includes | ||
| 4 | +#include <stdio.h> | ||
| 5 | +#include <stdlib.h> | ||
| 6 | + | ||
| 7 | +// OpenCL includes | ||
| 8 | +#include <CL/cl.h> | ||
| 9 | + | ||
// OpenCL C source for the "vecadd" kernel, compiled at run time.
//
// Each work-item adds one pair of elements: C[i] = A[i] + B[i], where i is
// the work-item's global id in dimension 0. The kernel performs no bounds
// check, so the global work size must not exceed the number of elements in
// the three buffers.
//
// The text is kept pure ASCII so that it is independent of the encoding of
// this file and of what the OpenCL compiler accepts.
const char* programSource =
"__kernel                                            \n"
"void vecadd(__global const int *A,                  \n"
"            __global const int *B,                  \n"
"            __global int *C)                        \n"
"{                                                   \n"
"                                                    \n"
"   // Get the work-item's unique ID                 \n"
"   size_t idx = get_global_id(0);                   \n"
"                                                    \n"
"   // Add the corresponding locations of            \n"
"   // 'A' and 'B', and store the result in 'C'.     \n"
"   C[idx] = A[idx] + B[idx];                        \n"
"}                                                   \n"
;
| 26 | + | ||
// Minimal boolean type for compilers/headers that do not already provide one.
// The guard skips the definition when <stdbool.h> has been included (it
// defines the macros tested below), when compiling as C++, and for language
// versions newer than C17, where bool/true/false are keywords and redefining
// them would be a compile error.
#if !defined(__cplusplus) && !defined(bool) && \
    !defined(__bool_true_false_are_defined) && \
    (!defined(__STDC_VERSION__) || __STDC_VERSION__ <= 201710L)
typedef enum { false = 0, true = 1 } bool;
#endif
| 28 | + | ||
| 29 | +int main() { | ||
| 30 | + // This code executes on the OpenCL host | ||
| 31 | + | ||
| 32 | + // Host data | ||
| 33 | + int *A = NULL; // Input array | ||
| 34 | + int *B = NULL; // Input array | ||
| 35 | + int *C = NULL; // Output array | ||
| 36 | + | ||
| 37 | + // Elements in each array | ||
| 38 | + const int elements = 2048; | ||
| 39 | + | ||
| 40 | + // Compute the size of the data | ||
| 41 | + size_t datasize = sizeof(int)*elements; | ||
| 42 | + | ||
| 43 | + // Allocate space for input/output data | ||
| 44 | + A = (int*)malloc(datasize); | ||
| 45 | + B = (int*)malloc(datasize); | ||
| 46 | + C = (int*)malloc(datasize); | ||
| 47 | + // Initialize the input data | ||
| 48 | + for(int i = 0; i < elements; i++) { | ||
| 49 | + A[i] = i; | ||
| 50 | + B[i] = i; | ||
| 51 | + } | ||
| 52 | + | ||
| 53 | + // Use this to check the output of each API call | ||
| 54 | + cl_int status; | ||
| 55 | + | ||
| 56 | + //----------------------------------------------------- | ||
| 57 | + // STEP 1: Discover and initialize the platforms | ||
| 58 | + //----------------------------------------------------- | ||
| 59 | + | ||
| 60 | + cl_uint numPlatforms = 0; | ||
| 61 | + cl_platform_id *platforms = NULL; | ||
| 62 | + | ||
| 63 | + // Use clGetPlatformIDs() to retrieve the number of platforms | ||
| 64 | + status = clGetPlatformIDs(0, NULL, &numPlatforms); | ||
| 65 | + | ||
| 66 | + // Allocate enough space for each platform | ||
| 67 | + platforms = | ||
| 68 | + (cl_platform_id*)malloc( | ||
| 69 | + numPlatforms*sizeof(cl_platform_id)); | ||
| 70 | + | ||
| 71 | + // Fill in platforms with clGetPlatformIDs() | ||
| 72 | + status = clGetPlatformIDs(numPlatforms, platforms, | ||
| 73 | + NULL); | ||
| 74 | + | ||
| 75 | + //----------------------------------------------------- | ||
| 76 | + // STEP 2: Discover and initialize the devices | ||
| 77 | + //----------------------------------------------------- | ||
| 78 | + | ||
| 79 | + cl_uint numDevices = 0; | ||
| 80 | + cl_device_id *devices = NULL; | ||
| 81 | + | ||
| 82 | + // Use clGetDeviceIDs() to retrieve the number of | ||
| 83 | + // devices present | ||
| 84 | + status = clGetDeviceIDs( | ||
| 85 | + platforms[0], | ||
| 86 | + CL_DEVICE_TYPE_ALL, | ||
| 87 | + 0, | ||
| 88 | + NULL, | ||
| 89 | + &numDevices); | ||
| 90 | + | ||
| 91 | + // Allocate enough space for each device | ||
| 92 | + devices = | ||
| 93 | + (cl_device_id*)malloc( | ||
| 94 | + numDevices*sizeof(cl_device_id)); | ||
| 95 | + | ||
| 96 | + // Fill in devices with clGetDeviceIDs() | ||
| 97 | + status = clGetDeviceIDs( | ||
| 98 | + platforms[0], | ||
| 99 | + CL_DEVICE_TYPE_ALL, | ||
| 100 | + numDevices, | ||
| 101 | + devices, | ||
| 102 | + NULL); | ||
| 103 | + | ||
| 104 | + //----------------------------------------------------- | ||
| 105 | + // STEP 3: Create a context | ||
| 106 | + //----------------------------------------------------- | ||
| 107 | + | ||
| 108 | + cl_context context = NULL; | ||
| 109 | + | ||
| 110 | + // Create a context using clCreateContext() and | ||
| 111 | + // associate it with the devices | ||
| 112 | + context = clCreateContext( | ||
| 113 | + NULL, | ||
| 114 | + numDevices, | ||
| 115 | + devices, | ||
| 116 | + NULL, | ||
| 117 | + NULL, | ||
| 118 | + &status); | ||
| 119 | + | ||
| 120 | + //----------------------------------------------------- | ||
| 121 | + // STEP 4: Create a command queue | ||
| 122 | + //----------------------------------------------------- | ||
| 123 | + | ||
| 124 | + cl_command_queue cmdQueue; | ||
| 125 | + | ||
| 126 | + // Create a command queue using clCreateCommandQueue(), | ||
| 127 | + // and associate it with the device you want to execute | ||
| 128 | + // on | ||
| 129 | + cmdQueue = clCreateCommandQueue( | ||
| 130 | + context, | ||
| 131 | + devices[0], | ||
| 132 | + 0, | ||
| 133 | + &status); | ||
| 134 | + | ||
| 135 | + //----------------------------------------------------- | ||
| 136 | + // STEP 5: Create device buffers | ||
| 137 | + //----------------------------------------------------- | ||
| 138 | + | ||
| 139 | + cl_mem bufferA; // Input array on the device | ||
| 140 | + cl_mem bufferB; // Input array on the device | ||
| 141 | + cl_mem bufferC; // Output array on the device | ||
| 142 | + | ||
| 143 | + // Use clCreateBuffer() to create a buffer object (d_A) | ||
| 144 | + // that will contain the data from the host array A | ||
| 145 | + bufferA = clCreateBuffer( | ||
| 146 | + context, | ||
| 147 | + CL_MEM_READ_ONLY, | ||
| 148 | + datasize, | ||
| 149 | + NULL, | ||
| 150 | + &status); | ||
| 151 | + | ||
| 152 | + // Use clCreateBuffer() to create a buffer object (d_B) | ||
| 153 | + // that will contain the data from the host array B | ||
| 154 | + bufferB = clCreateBuffer( | ||
| 155 | + context, | ||
| 156 | + CL_MEM_READ_ONLY, | ||
| 157 | + datasize, | ||
| 158 | + NULL, | ||
| 159 | + &status); | ||
| 160 | + | ||
| 161 | + // Use clCreateBuffer() to create a buffer object (d_C) | ||
| 162 | + // with enough space to hold the output data | ||
| 163 | + bufferC = clCreateBuffer( | ||
| 164 | + context, | ||
| 165 | + CL_MEM_WRITE_ONLY, | ||
| 166 | + datasize, | ||
| 167 | + NULL, | ||
| 168 | + &status); | ||
| 169 | + | ||
| 170 | + //----------------------------------------------------- | ||
| 171 | + // STEP 6: Write host data to device buffers | ||
| 172 | + //----------------------------------------------------- | ||
| 173 | + | ||
| 174 | + // Use clEnqueueWriteBuffer() to write input array A to | ||
| 175 | + // the device buffer bufferA | ||
| 176 | + status = clEnqueueWriteBuffer( | ||
| 177 | + cmdQueue, | ||
| 178 | + bufferA, | ||
| 179 | + CL_FALSE, | ||
| 180 | + 0, | ||
| 181 | + datasize, | ||
| 182 | + A, | ||
| 183 | + 0, | ||
| 184 | + NULL, | ||
| 185 | + NULL); | ||
| 186 | + | ||
| 187 | + // Use clEnqueueWriteBuffer() to write input array B to | ||
| 188 | + // the device buffer bufferB | ||
| 189 | + status = clEnqueueWriteBuffer( | ||
| 190 | + cmdQueue, | ||
| 191 | + bufferB, | ||
| 192 | + CL_FALSE, | ||
| 193 | + 0, | ||
| 194 | + datasize, | ||
| 195 | + B, | ||
| 196 | + 0, | ||
| 197 | + NULL, | ||
| 198 | + NULL); | ||
| 199 | + | ||
| 200 | + //----------------------------------------------------- | ||
| 201 | + // STEP 7: Create and compile the program | ||
| 202 | + //----------------------------------------------------- | ||
| 203 | + | ||
| 204 | + // Create a program using clCreateProgramWithSource() | ||
| 205 | + cl_program program = clCreateProgramWithSource( | ||
| 206 | + context, | ||
| 207 | + 1, | ||
| 208 | + (const char**)&programSource, | ||
| 209 | + NULL, | ||
| 210 | + &status); | ||
| 211 | + | ||
| 212 | + // Build (compile) the program for the devices with | ||
| 213 | + // clBuildProgram() | ||
| 214 | + status = clBuildProgram( | ||
| 215 | + program, | ||
| 216 | + numDevices, | ||
| 217 | + devices, | ||
| 218 | + NULL, | ||
| 219 | + NULL, | ||
| 220 | + NULL); | ||
| 221 | + | ||
| 222 | + //----------------------------------------------------- | ||
| 223 | + // STEP 8: Create the kernel | ||
| 224 | + //----------------------------------------------------- | ||
| 225 | + | ||
| 226 | + cl_kernel kernel = NULL; | ||
| 227 | + | ||
| 228 | + // Use clCreateKernel() to create a kernel from the | ||
| 229 | + // vector addition function (named "vecadd") | ||
| 230 | + kernel = clCreateKernel(program, "vecadd", &status); | ||
| 231 | + | ||
| 232 | + //----------------------------------------------------- | ||
| 233 | + // STEP 9: Set the kernel arguments | ||
| 234 | + //----------------------------------------------------- | ||
| 235 | + | ||
| 236 | + // Associate the input and output buffers with the | ||
| 237 | + // kernel | ||
| 238 | + // using clSetKernelArg() | ||
| 239 | + status = clSetKernelArg( | ||
| 240 | + kernel, | ||
| 241 | + 0, | ||
| 242 | + sizeof(cl_mem), | ||
| 243 | + &bufferA); | ||
| 244 | + status |= clSetKernelArg( | ||
| 245 | + kernel, | ||
| 246 | + 1, | ||
| 247 | + sizeof(cl_mem), | ||
| 248 | + &bufferB); | ||
| 249 | + status |= clSetKernelArg( | ||
| 250 | + kernel, | ||
| 251 | + 2, | ||
| 252 | + sizeof(cl_mem), | ||
| 253 | + &bufferC); | ||
| 254 | + | ||
| 255 | + //----------------------------------------------------- | ||
| 256 | + // STEP 10: Configure the work-item structure | ||
| 257 | + //----------------------------------------------------- | ||
| 258 | + | ||
| 259 | + // Define an index space (global work size) of work items for | ||
| 260 | + // execution. A workgroup size (local work size) is not required, | ||
| 261 | + // but can be used. | ||
| 262 | + size_t globalWorkSize[1]; | ||
| 263 | + // There are 'elements' work-items | ||
| 264 | + globalWorkSize[0] = elements; | ||
| 265 | + | ||
| 266 | + //----------------------------------------------------- | ||
| 267 | + // STEP 11: Enqueue the kernel for execution | ||
| 268 | + //----------------------------------------------------- | ||
| 269 | + | ||
| 270 | + // Execute the kernel by using clEnqueueNDRangeKernel(). | ||
| 271 | + // 'globalWorkSize' is the 1D dimension of the work-items | ||
| 272 | + status = clEnqueueNDRangeKernel( | ||
| 273 | + cmdQueue, | ||
| 274 | + kernel, | ||
| 275 | + 1, | ||
| 276 | + NULL, | ||
| 277 | + globalWorkSize, | ||
| 278 | + NULL, | ||
| 279 | + 0, | ||
| 280 | + NULL, | ||
| 281 | + NULL); | ||
| 282 | + | ||
| 283 | + //----------------------------------------------------- | ||
| 284 | + // STEP 12: Read the output buffer back to the host | ||
| 285 | + //----------------------------------------------------- | ||
| 286 | + | ||
| 287 | + // Use clEnqueueReadBuffer() to read the OpenCL output | ||
| 288 | + // buffer (bufferC) | ||
| 289 | + // to the host output array (C) | ||
| 290 | + clEnqueueReadBuffer( | ||
| 291 | + cmdQueue, | ||
| 292 | + bufferC, | ||
| 293 | + CL_TRUE, | ||
| 294 | + 0, | ||
| 295 | + datasize, | ||
| 296 | + C, | ||
| 297 | + 0, | ||
| 298 | + NULL, | ||
| 299 | + NULL); | ||
| 300 | + | ||
| 301 | + // Verify the output | ||
| 302 | + bool result = true; | ||
| 303 | + for(int i = 0; i < elements; i++) { | ||
| 304 | + if(C[i] != i+i) { | ||
| 305 | + result = false; | ||
| 306 | + break; | ||
| 307 | + } | ||
| 308 | + } | ||
| 309 | + if(result) { | ||
| 310 | + printf("Output is correct\n"); | ||
| 311 | + } else { | ||
| 312 | + printf("Output is incorrect\n"); | ||
| 313 | + } | ||
| 314 | + | ||
| 315 | + //----------------------------------------------------- | ||
| 316 | + // STEP 13: Release OpenCL resources | ||
| 317 | + //----------------------------------------------------- | ||
| 318 | + | ||
| 319 | + // Free OpenCL resources | ||
| 320 | + clReleaseKernel(kernel); | ||
| 321 | + clReleaseProgram(program); | ||
| 322 | + clReleaseCommandQueue(cmdQueue); | ||
| 323 | + clReleaseMemObject(bufferA); | ||
| 324 | + clReleaseMemObject(bufferB); | ||
| 325 | + clReleaseMemObject(bufferC); | ||
| 326 | + clReleaseContext(context); | ||
| 327 | + | ||
| 328 | + // Free host resources | ||
| 329 | + free(A); | ||
| 330 | + free(B); | ||
| 331 | + free(C); | ||
| 332 | + free(platforms); | ||
| 333 | + free(devices); | ||
| 334 | +} | ||
| 335 | + | ||
| 336 | +// vim: ft=c ts=4 sw=4: |
Please
register
or
login
to post a comment