Showing
5 changed files
with
367 additions
and
0 deletions
.gitignore
0 → 100644
| 1 | +arrayadd | ... | ... |
Makefile
0 → 100644
README.md
0 → 100644
| 1 | +# OpenCL array add. | |
| 2 | + | |
| 3 | +Taken from | |
| 4 | +[here](http://www.heterogeneouscompute.org/wordpress/wp-content/uploads/2011/06/Chapter2.txt). | |
| 5 | + | |
| 6 | +## Description | |
| 7 | + | |
| 8 | +This is an example of how to implement an array addition with OpenCL. | |
| 9 | + | |
| 10 | +## Requirements | |
| 11 | + | |
| 12 | +OpenCL-capable hardware and a corresponding OpenCL library that exposes the | |
| 13 | +OpenCL API. I tested this on an Intel GPU (Intel Corporation Haswell-ULT | |
| 14 | +Integrated Graphics Controller (rev 09)) with the | |
| 15 | +[beignet](https://www.freedesktop.org/wiki/Software/Beignet/) | |
| 16 | +open source library. | |
| 17 | + | |
| 18 | +## License | |
| 19 | + | |
| 20 | +unknown | ... | ... |
README.md.old
0 → 100644
arrayadd.c
0 → 100644
| 1 | +// This program implements a vector addition using OpenCL | |
| 2 | + | |
| 3 | +// System includes | |
| 4 | +#include <stdio.h> | |
| 5 | +#include <stdlib.h> | |
| 6 | + | |
| 7 | +// OpenCL includes | |
| 8 | +#include <CL/cl.h> | |
| 9 | + | |
// OpenCL C source of the device kernel, compiled at run time by the host.
//
// Each work-item adds one pair of elements: C[i] = A[i] + B[i], where i is
// the work-item's global ID in dimension 0. The kernel performs no bounds
// check, so the global work size must not exceed the number of elements
// held by the three buffers.
//
// The inputs are const-qualified because the kernel only reads them, and the
// text is kept pure ASCII so that it is safe for any OpenCL C compiler.
const char* programSource =
"__kernel\n"
"void vecadd(__global const int *A,\n"
"            __global const int *B,\n"
"            __global int *C)\n"
"{\n"
"\n"
"    // Get the work-item's unique ID\n"
"    int idx = get_global_id(0);\n"
"\n"
"    // Add the corresponding locations of\n"
"    // 'A' and 'B', and store the result in 'C'.\n"
"    C[idx] = A[idx] + B[idx];\n"
"}\n"
;
| 26 | + | |
// Boolean type and the true/false constants. These come from the standard
// header rather than a hand-rolled enum: an enum named 'bool' with members
// 'true'/'false' clashes with <stdbool.h> and does not compile as C23, where
// all three are keywords.
#include <stdbool.h>
| 28 | + | |
| 29 | +int main() { | |
| 30 | + // This code executes on the OpenCL host | |
| 31 | + | |
| 32 | + // Host data | |
| 33 | + int *A = NULL; // Input array | |
| 34 | + int *B = NULL; // Input array | |
| 35 | + int *C = NULL; // Output array | |
| 36 | + | |
| 37 | + // Elements in each array | |
| 38 | + const int elements = 2048; | |
| 39 | + | |
| 40 | + // Compute the size of the data | |
| 41 | + size_t datasize = sizeof(int)*elements; | |
| 42 | + | |
| 43 | + // Allocate space for input/output data | |
| 44 | + A = (int*)malloc(datasize); | |
| 45 | + B = (int*)malloc(datasize); | |
| 46 | + C = (int*)malloc(datasize); | |
| 47 | + // Initialize the input data | |
| 48 | + for(int i = 0; i < elements; i++) { | |
| 49 | + A[i] = i; | |
| 50 | + B[i] = i; | |
| 51 | + } | |
| 52 | + | |
| 53 | + // Use this to check the output of each API call | |
| 54 | + cl_int status; | |
| 55 | + | |
| 56 | + //----------------------------------------------------- | |
| 57 | + // STEP 1: Discover and initialize the platforms | |
| 58 | + //----------------------------------------------------- | |
| 59 | + | |
| 60 | + cl_uint numPlatforms = 0; | |
| 61 | + cl_platform_id *platforms = NULL; | |
| 62 | + | |
| 63 | + // Use clGetPlatformIDs() to retrieve the number of platforms | |
| 64 | + status = clGetPlatformIDs(0, NULL, &numPlatforms); | |
| 65 | + | |
| 66 | + // Allocate enough space for each platform | |
| 67 | + platforms = | |
| 68 | + (cl_platform_id*)malloc( | |
| 69 | + numPlatforms*sizeof(cl_platform_id)); | |
| 70 | + | |
| 71 | + // Fill in platforms with clGetPlatformIDs() | |
| 72 | + status = clGetPlatformIDs(numPlatforms, platforms, | |
| 73 | + NULL); | |
| 74 | + | |
| 75 | + //----------------------------------------------------- | |
| 76 | + // STEP 2: Discover and initialize the devices | |
| 77 | + //----------------------------------------------------- | |
| 78 | + | |
| 79 | + cl_uint numDevices = 0; | |
| 80 | + cl_device_id *devices = NULL; | |
| 81 | + | |
| 82 | + // Use clGetDeviceIDs() to retrieve the number of | |
| 83 | + // devices present | |
| 84 | + status = clGetDeviceIDs( | |
| 85 | + platforms[0], | |
| 86 | + CL_DEVICE_TYPE_ALL, | |
| 87 | + 0, | |
| 88 | + NULL, | |
| 89 | + &numDevices); | |
| 90 | + | |
| 91 | + // Allocate enough space for each device | |
| 92 | + devices = | |
| 93 | + (cl_device_id*)malloc( | |
| 94 | + numDevices*sizeof(cl_device_id)); | |
| 95 | + | |
| 96 | + // Fill in devices with clGetDeviceIDs() | |
| 97 | + status = clGetDeviceIDs( | |
| 98 | + platforms[0], | |
| 99 | + CL_DEVICE_TYPE_ALL, | |
| 100 | + numDevices, | |
| 101 | + devices, | |
| 102 | + NULL); | |
| 103 | + | |
| 104 | + //----------------------------------------------------- | |
| 105 | + // STEP 3: Create a context | |
| 106 | + //----------------------------------------------------- | |
| 107 | + | |
| 108 | + cl_context context = NULL; | |
| 109 | + | |
| 110 | + // Create a context using clCreateContext() and | |
| 111 | + // associate it with the devices | |
| 112 | + context = clCreateContext( | |
| 113 | + NULL, | |
| 114 | + numDevices, | |
| 115 | + devices, | |
| 116 | + NULL, | |
| 117 | + NULL, | |
| 118 | + &status); | |
| 119 | + | |
| 120 | + //----------------------------------------------------- | |
| 121 | + // STEP 4: Create a command queue | |
| 122 | + //----------------------------------------------------- | |
| 123 | + | |
| 124 | + cl_command_queue cmdQueue; | |
| 125 | + | |
| 126 | + // Create a command queue using clCreateCommandQueue(), | |
| 127 | + // and associate it with the device you want to execute | |
| 128 | + // on | |
| 129 | + cmdQueue = clCreateCommandQueue( | |
| 130 | + context, | |
| 131 | + devices[0], | |
| 132 | + 0, | |
| 133 | + &status); | |
| 134 | + | |
| 135 | + //----------------------------------------------------- | |
| 136 | + // STEP 5: Create device buffers | |
| 137 | + //----------------------------------------------------- | |
| 138 | + | |
| 139 | + cl_mem bufferA; // Input array on the device | |
| 140 | + cl_mem bufferB; // Input array on the device | |
| 141 | + cl_mem bufferC; // Output array on the device | |
| 142 | + | |
| 143 | + // Use clCreateBuffer() to create a buffer object (d_A) | |
| 144 | + // that will contain the data from the host array A | |
| 145 | + bufferA = clCreateBuffer( | |
| 146 | + context, | |
| 147 | + CL_MEM_READ_ONLY, | |
| 148 | + datasize, | |
| 149 | + NULL, | |
| 150 | + &status); | |
| 151 | + | |
| 152 | + // Use clCreateBuffer() to create a buffer object (d_B) | |
| 153 | + // that will contain the data from the host array B | |
| 154 | + bufferB = clCreateBuffer( | |
| 155 | + context, | |
| 156 | + CL_MEM_READ_ONLY, | |
| 157 | + datasize, | |
| 158 | + NULL, | |
| 159 | + &status); | |
| 160 | + | |
| 161 | + // Use clCreateBuffer() to create a buffer object (d_C) | |
| 162 | + // with enough space to hold the output data | |
| 163 | + bufferC = clCreateBuffer( | |
| 164 | + context, | |
| 165 | + CL_MEM_WRITE_ONLY, | |
| 166 | + datasize, | |
| 167 | + NULL, | |
| 168 | + &status); | |
| 169 | + | |
| 170 | + //----------------------------------------------------- | |
| 171 | + // STEP 6: Write host data to device buffers | |
| 172 | + //----------------------------------------------------- | |
| 173 | + | |
| 174 | + // Use clEnqueueWriteBuffer() to write input array A to | |
| 175 | + // the device buffer bufferA | |
| 176 | + status = clEnqueueWriteBuffer( | |
| 177 | + cmdQueue, | |
| 178 | + bufferA, | |
| 179 | + CL_FALSE, | |
| 180 | + 0, | |
| 181 | + datasize, | |
| 182 | + A, | |
| 183 | + 0, | |
| 184 | + NULL, | |
| 185 | + NULL); | |
| 186 | + | |
| 187 | + // Use clEnqueueWriteBuffer() to write input array B to | |
| 188 | + // the device buffer bufferB | |
| 189 | + status = clEnqueueWriteBuffer( | |
| 190 | + cmdQueue, | |
| 191 | + bufferB, | |
| 192 | + CL_FALSE, | |
| 193 | + 0, | |
| 194 | + datasize, | |
| 195 | + B, | |
| 196 | + 0, | |
| 197 | + NULL, | |
| 198 | + NULL); | |
| 199 | + | |
| 200 | + //----------------------------------------------------- | |
| 201 | + // STEP 7: Create and compile the program | |
| 202 | + //----------------------------------------------------- | |
| 203 | + | |
| 204 | + // Create a program using clCreateProgramWithSource() | |
| 205 | + cl_program program = clCreateProgramWithSource( | |
| 206 | + context, | |
| 207 | + 1, | |
| 208 | + (const char**)&programSource, | |
| 209 | + NULL, | |
| 210 | + &status); | |
| 211 | + | |
| 212 | + // Build (compile) the program for the devices with | |
| 213 | + // clBuildProgram() | |
| 214 | + status = clBuildProgram( | |
| 215 | + program, | |
| 216 | + numDevices, | |
| 217 | + devices, | |
| 218 | + NULL, | |
| 219 | + NULL, | |
| 220 | + NULL); | |
| 221 | + | |
| 222 | + //----------------------------------------------------- | |
| 223 | + // STEP 8: Create the kernel | |
| 224 | + //----------------------------------------------------- | |
| 225 | + | |
| 226 | + cl_kernel kernel = NULL; | |
| 227 | + | |
| 228 | + // Use clCreateKernel() to create a kernel from the | |
| 229 | + // vector addition function (named "vecadd") | |
| 230 | + kernel = clCreateKernel(program, "vecadd", &status); | |
| 231 | + | |
| 232 | + //----------------------------------------------------- | |
| 233 | + // STEP 9: Set the kernel arguments | |
| 234 | + //----------------------------------------------------- | |
| 235 | + | |
| 236 | + // Associate the input and output buffers with the | |
| 237 | + // kernel | |
| 238 | + // using clSetKernelArg() | |
| 239 | + status = clSetKernelArg( | |
| 240 | + kernel, | |
| 241 | + 0, | |
| 242 | + sizeof(cl_mem), | |
| 243 | + &bufferA); | |
| 244 | + status |= clSetKernelArg( | |
| 245 | + kernel, | |
| 246 | + 1, | |
| 247 | + sizeof(cl_mem), | |
| 248 | + &bufferB); | |
| 249 | + status |= clSetKernelArg( | |
| 250 | + kernel, | |
| 251 | + 2, | |
| 252 | + sizeof(cl_mem), | |
| 253 | + &bufferC); | |
| 254 | + | |
| 255 | + //----------------------------------------------------- | |
| 256 | + // STEP 10: Configure the work-item structure | |
| 257 | + //----------------------------------------------------- | |
| 258 | + | |
| 259 | + // Define an index space (global work size) of work items for | |
| 260 | + // execution. A workgroup size (local work size) is not required, | |
| 261 | + // but can be used. | |
| 262 | + size_t globalWorkSize[1]; | |
| 263 | + // There are 'elements' work-items | |
| 264 | + globalWorkSize[0] = elements; | |
| 265 | + | |
| 266 | + //----------------------------------------------------- | |
| 267 | + // STEP 11: Enqueue the kernel for execution | |
| 268 | + //----------------------------------------------------- | |
| 269 | + | |
| 270 | + // Execute the kernel by using clEnqueueNDRangeKernel(). | |
| 271 | + // 'globalWorkSize' is the 1D dimension of the work-items | |
| 272 | + status = clEnqueueNDRangeKernel( | |
| 273 | + cmdQueue, | |
| 274 | + kernel, | |
| 275 | + 1, | |
| 276 | + NULL, | |
| 277 | + globalWorkSize, | |
| 278 | + NULL, | |
| 279 | + 0, | |
| 280 | + NULL, | |
| 281 | + NULL); | |
| 282 | + | |
| 283 | + //----------------------------------------------------- | |
| 284 | + // STEP 12: Read the output buffer back to the host | |
| 285 | + //----------------------------------------------------- | |
| 286 | + | |
| 287 | + // Use clEnqueueReadBuffer() to read the OpenCL output | |
| 288 | + // buffer (bufferC) | |
| 289 | + // to the host output array (C) | |
| 290 | + clEnqueueReadBuffer( | |
| 291 | + cmdQueue, | |
| 292 | + bufferC, | |
| 293 | + CL_TRUE, | |
| 294 | + 0, | |
| 295 | + datasize, | |
| 296 | + C, | |
| 297 | + 0, | |
| 298 | + NULL, | |
| 299 | + NULL); | |
| 300 | + | |
| 301 | + // Verify the output | |
| 302 | + bool result = true; | |
| 303 | + for(int i = 0; i < elements; i++) { | |
| 304 | + if(C[i] != i+i) { | |
| 305 | + result = false; | |
| 306 | + break; | |
| 307 | + } | |
| 308 | + } | |
| 309 | + if(result) { | |
| 310 | + printf("Output is correct\n"); | |
| 311 | + } else { | |
| 312 | + printf("Output is incorrect\n"); | |
| 313 | + } | |
| 314 | + | |
| 315 | + //----------------------------------------------------- | |
| 316 | + // STEP 13: Release OpenCL resources | |
| 317 | + //----------------------------------------------------- | |
| 318 | + | |
| 319 | + // Free OpenCL resources | |
| 320 | + clReleaseKernel(kernel); | |
| 321 | + clReleaseProgram(program); | |
| 322 | + clReleaseCommandQueue(cmdQueue); | |
| 323 | + clReleaseMemObject(bufferA); | |
| 324 | + clReleaseMemObject(bufferB); | |
| 325 | + clReleaseMemObject(bufferC); | |
| 326 | + clReleaseContext(context); | |
| 327 | + | |
| 328 | + // Free host resources | |
| 329 | + free(A); | |
| 330 | + free(B); | |
| 331 | + free(C); | |
| 332 | + free(platforms); | |
| 333 | + free(devices); | |
| 334 | +} | |
| 335 | + | |
| 336 | +// vim: ft=c ts=4 sw=4: | ... | ... |
Please
register
or
login
to post a comment