Showing
5 changed files
with
367 additions
and
0 deletions
.gitignore
0 → 100644
1 | +arrayadd |
Makefile
0 → 100644
README.md
0 → 100644
1 | +# OpenCL array add. | ||
2 | + | ||
3 | +Taken from | ||
4 | +[here](http://www.heterogeneouscompute.org/wordpress/wp-content/uploads/2011/06/Chapter2.txt). | ||
5 | + | ||
6 | +## Description | ||
7 | + | ||
8 | +This is an example on how to implement an array add with OpenCL. | ||
9 | + | ||
10 | +## Requirements | ||
11 | + | ||
12 | +Some OpenCL capable hardware and the according OpenCL library exposing the | ||
13 | +OpenCL API. I tested this on an Intel GPU (Intel Corporation Haswell-ULT | ||
14 | +Integrated Graphics Controller (rev 09)) with the | ||
15 | +[beignet](https://www.freedesktop.org/wiki/Software/Beignet/) | ||
16 | +open source library. | ||
17 | + | ||
18 | +## License | ||
19 | + | ||
20 | +unknown |
README.md.old
0 → 100644
arrayadd.c
0 → 100644
1 | +// This program implements a vector addition using OpenCL | ||
2 | + | ||
3 | +// System includes | ||
4 | +#include <stdio.h> | ||
5 | +#include <stdlib.h> | ||
6 | + | ||
7 | +// OpenCL includes | ||
8 | +#include <CL/cl.h> | ||
9 | + | ||
10 | +// OpenCL kernel to perform an element-wise add of two arrays | ||
11 | +const char* programSource = | ||
12 | +"__kernel \n" | ||
13 | +"void vecadd(__global int *A, \n" | ||
14 | +" __global int *B, \n" | ||
15 | +" __global int *C) \n" | ||
16 | +"{ \n" | ||
17 | +" \n" | ||
18 | +" // Get the work-item’s unique ID \n" | ||
19 | +" int idx = get_global_id(0); \n" | ||
20 | +" \n" | ||
21 | +" // Add the corresponding locations of \n" | ||
22 | +" // 'A' and 'B', and store the result in 'C'. \n" | ||
23 | +" C[idx] = A[idx] + B[idx]; \n" | ||
24 | +"} \n" | ||
25 | +; | ||
26 | + | ||
27 | +typedef enum {false=0, true} bool; | ||
28 | + | ||
29 | +int main() { | ||
30 | + // This code executes on the OpenCL host | ||
31 | + | ||
32 | + // Host data | ||
33 | + int *A = NULL; // Input array | ||
34 | + int *B = NULL; // Input array | ||
35 | + int *C = NULL; // Output array | ||
36 | + | ||
37 | + // Elements in each array | ||
38 | + const int elements = 2048; | ||
39 | + | ||
40 | + // Compute the size of the data | ||
41 | + size_t datasize = sizeof(int)*elements; | ||
42 | + | ||
43 | + // Allocate space for input/output data | ||
44 | + A = (int*)malloc(datasize); | ||
45 | + B = (int*)malloc(datasize); | ||
46 | + C = (int*)malloc(datasize); | ||
47 | + // Initialize the input data | ||
48 | + for(int i = 0; i < elements; i++) { | ||
49 | + A[i] = i; | ||
50 | + B[i] = i; | ||
51 | + } | ||
52 | + | ||
53 | + // Use this to check the output of each API call | ||
54 | + cl_int status; | ||
55 | + | ||
56 | + //----------------------------------------------------- | ||
57 | + // STEP 1: Discover and initialize the platforms | ||
58 | + //----------------------------------------------------- | ||
59 | + | ||
60 | + cl_uint numPlatforms = 0; | ||
61 | + cl_platform_id *platforms = NULL; | ||
62 | + | ||
63 | + // Use clGetPlatformIDs() to retrieve the number of platforms | ||
64 | + status = clGetPlatformIDs(0, NULL, &numPlatforms); | ||
65 | + | ||
66 | + // Allocate enough space for each platform | ||
67 | + platforms = | ||
68 | + (cl_platform_id*)malloc( | ||
69 | + numPlatforms*sizeof(cl_platform_id)); | ||
70 | + | ||
71 | + // Fill in platforms with clGetPlatformIDs() | ||
72 | + status = clGetPlatformIDs(numPlatforms, platforms, | ||
73 | + NULL); | ||
74 | + | ||
75 | + //----------------------------------------------------- | ||
76 | + // STEP 2: Discover and initialize the devices | ||
77 | + //----------------------------------------------------- | ||
78 | + | ||
79 | + cl_uint numDevices = 0; | ||
80 | + cl_device_id *devices = NULL; | ||
81 | + | ||
82 | + // Use clGetDeviceIDs() to retrieve the number of | ||
83 | + // devices present | ||
84 | + status = clGetDeviceIDs( | ||
85 | + platforms[0], | ||
86 | + CL_DEVICE_TYPE_ALL, | ||
87 | + 0, | ||
88 | + NULL, | ||
89 | + &numDevices); | ||
90 | + | ||
91 | + // Allocate enough space for each device | ||
92 | + devices = | ||
93 | + (cl_device_id*)malloc( | ||
94 | + numDevices*sizeof(cl_device_id)); | ||
95 | + | ||
96 | + // Fill in devices with clGetDeviceIDs() | ||
97 | + status = clGetDeviceIDs( | ||
98 | + platforms[0], | ||
99 | + CL_DEVICE_TYPE_ALL, | ||
100 | + numDevices, | ||
101 | + devices, | ||
102 | + NULL); | ||
103 | + | ||
104 | + //----------------------------------------------------- | ||
105 | + // STEP 3: Create a context | ||
106 | + //----------------------------------------------------- | ||
107 | + | ||
108 | + cl_context context = NULL; | ||
109 | + | ||
110 | + // Create a context using clCreateContext() and | ||
111 | + // associate it with the devices | ||
112 | + context = clCreateContext( | ||
113 | + NULL, | ||
114 | + numDevices, | ||
115 | + devices, | ||
116 | + NULL, | ||
117 | + NULL, | ||
118 | + &status); | ||
119 | + | ||
120 | + //----------------------------------------------------- | ||
121 | + // STEP 4: Create a command queue | ||
122 | + //----------------------------------------------------- | ||
123 | + | ||
124 | + cl_command_queue cmdQueue; | ||
125 | + | ||
126 | + // Create a command queue using clCreateCommandQueue(), | ||
127 | + // and associate it with the device you want to execute | ||
128 | + // on | ||
129 | + cmdQueue = clCreateCommandQueue( | ||
130 | + context, | ||
131 | + devices[0], | ||
132 | + 0, | ||
133 | + &status); | ||
134 | + | ||
135 | + //----------------------------------------------------- | ||
136 | + // STEP 5: Create device buffers | ||
137 | + //----------------------------------------------------- | ||
138 | + | ||
139 | + cl_mem bufferA; // Input array on the device | ||
140 | + cl_mem bufferB; // Input array on the device | ||
141 | + cl_mem bufferC; // Output array on the device | ||
142 | + | ||
143 | + // Use clCreateBuffer() to create a buffer object (d_A) | ||
144 | + // that will contain the data from the host array A | ||
145 | + bufferA = clCreateBuffer( | ||
146 | + context, | ||
147 | + CL_MEM_READ_ONLY, | ||
148 | + datasize, | ||
149 | + NULL, | ||
150 | + &status); | ||
151 | + | ||
152 | + // Use clCreateBuffer() to create a buffer object (d_B) | ||
153 | + // that will contain the data from the host array B | ||
154 | + bufferB = clCreateBuffer( | ||
155 | + context, | ||
156 | + CL_MEM_READ_ONLY, | ||
157 | + datasize, | ||
158 | + NULL, | ||
159 | + &status); | ||
160 | + | ||
161 | + // Use clCreateBuffer() to create a buffer object (d_C) | ||
162 | + // with enough space to hold the output data | ||
163 | + bufferC = clCreateBuffer( | ||
164 | + context, | ||
165 | + CL_MEM_WRITE_ONLY, | ||
166 | + datasize, | ||
167 | + NULL, | ||
168 | + &status); | ||
169 | + | ||
170 | + //----------------------------------------------------- | ||
171 | + // STEP 6: Write host data to device buffers | ||
172 | + //----------------------------------------------------- | ||
173 | + | ||
174 | + // Use clEnqueueWriteBuffer() to write input array A to | ||
175 | + // the device buffer bufferA | ||
176 | + status = clEnqueueWriteBuffer( | ||
177 | + cmdQueue, | ||
178 | + bufferA, | ||
179 | + CL_FALSE, | ||
180 | + 0, | ||
181 | + datasize, | ||
182 | + A, | ||
183 | + 0, | ||
184 | + NULL, | ||
185 | + NULL); | ||
186 | + | ||
187 | + // Use clEnqueueWriteBuffer() to write input array B to | ||
188 | + // the device buffer bufferB | ||
189 | + status = clEnqueueWriteBuffer( | ||
190 | + cmdQueue, | ||
191 | + bufferB, | ||
192 | + CL_FALSE, | ||
193 | + 0, | ||
194 | + datasize, | ||
195 | + B, | ||
196 | + 0, | ||
197 | + NULL, | ||
198 | + NULL); | ||
199 | + | ||
200 | + //----------------------------------------------------- | ||
201 | + // STEP 7: Create and compile the program | ||
202 | + //----------------------------------------------------- | ||
203 | + | ||
204 | + // Create a program using clCreateProgramWithSource() | ||
205 | + cl_program program = clCreateProgramWithSource( | ||
206 | + context, | ||
207 | + 1, | ||
208 | + (const char**)&programSource, | ||
209 | + NULL, | ||
210 | + &status); | ||
211 | + | ||
212 | + // Build (compile) the program for the devices with | ||
213 | + // clBuildProgram() | ||
214 | + status = clBuildProgram( | ||
215 | + program, | ||
216 | + numDevices, | ||
217 | + devices, | ||
218 | + NULL, | ||
219 | + NULL, | ||
220 | + NULL); | ||
221 | + | ||
222 | + //----------------------------------------------------- | ||
223 | + // STEP 8: Create the kernel | ||
224 | + //----------------------------------------------------- | ||
225 | + | ||
226 | + cl_kernel kernel = NULL; | ||
227 | + | ||
228 | + // Use clCreateKernel() to create a kernel from the | ||
229 | + // vector addition function (named "vecadd") | ||
230 | + kernel = clCreateKernel(program, "vecadd", &status); | ||
231 | + | ||
232 | + //----------------------------------------------------- | ||
233 | + // STEP 9: Set the kernel arguments | ||
234 | + //----------------------------------------------------- | ||
235 | + | ||
236 | + // Associate the input and output buffers with the | ||
237 | + // kernel | ||
238 | + // using clSetKernelArg() | ||
239 | + status = clSetKernelArg( | ||
240 | + kernel, | ||
241 | + 0, | ||
242 | + sizeof(cl_mem), | ||
243 | + &bufferA); | ||
244 | + status |= clSetKernelArg( | ||
245 | + kernel, | ||
246 | + 1, | ||
247 | + sizeof(cl_mem), | ||
248 | + &bufferB); | ||
249 | + status |= clSetKernelArg( | ||
250 | + kernel, | ||
251 | + 2, | ||
252 | + sizeof(cl_mem), | ||
253 | + &bufferC); | ||
254 | + | ||
255 | + //----------------------------------------------------- | ||
256 | + // STEP 10: Configure the work-item structure | ||
257 | + //----------------------------------------------------- | ||
258 | + | ||
259 | + // Define an index space (global work size) of work items for | ||
260 | + // execution. A workgroup size (local work size) is not required, | ||
261 | + // but can be used. | ||
262 | + size_t globalWorkSize[1]; | ||
263 | + // There are 'elements' work-items | ||
264 | + globalWorkSize[0] = elements; | ||
265 | + | ||
266 | + //----------------------------------------------------- | ||
267 | + // STEP 11: Enqueue the kernel for execution | ||
268 | + //----------------------------------------------------- | ||
269 | + | ||
270 | + // Execute the kernel by using clEnqueueNDRangeKernel(). | ||
271 | + // 'globalWorkSize' is the 1D dimension of the work-items | ||
272 | + status = clEnqueueNDRangeKernel( | ||
273 | + cmdQueue, | ||
274 | + kernel, | ||
275 | + 1, | ||
276 | + NULL, | ||
277 | + globalWorkSize, | ||
278 | + NULL, | ||
279 | + 0, | ||
280 | + NULL, | ||
281 | + NULL); | ||
282 | + | ||
283 | + //----------------------------------------------------- | ||
284 | + // STEP 12: Read the output buffer back to the host | ||
285 | + //----------------------------------------------------- | ||
286 | + | ||
287 | + // Use clEnqueueReadBuffer() to read the OpenCL output | ||
288 | + // buffer (bufferC) | ||
289 | + // to the host output array (C) | ||
290 | + clEnqueueReadBuffer( | ||
291 | + cmdQueue, | ||
292 | + bufferC, | ||
293 | + CL_TRUE, | ||
294 | + 0, | ||
295 | + datasize, | ||
296 | + C, | ||
297 | + 0, | ||
298 | + NULL, | ||
299 | + NULL); | ||
300 | + | ||
301 | + // Verify the output | ||
302 | + bool result = true; | ||
303 | + for(int i = 0; i < elements; i++) { | ||
304 | + if(C[i] != i+i) { | ||
305 | + result = false; | ||
306 | + break; | ||
307 | + } | ||
308 | + } | ||
309 | + if(result) { | ||
310 | + printf("Output is correct\n"); | ||
311 | + } else { | ||
312 | + printf("Output is incorrect\n"); | ||
313 | + } | ||
314 | + | ||
315 | + //----------------------------------------------------- | ||
316 | + // STEP 13: Release OpenCL resources | ||
317 | + //----------------------------------------------------- | ||
318 | + | ||
319 | + // Free OpenCL resources | ||
320 | + clReleaseKernel(kernel); | ||
321 | + clReleaseProgram(program); | ||
322 | + clReleaseCommandQueue(cmdQueue); | ||
323 | + clReleaseMemObject(bufferA); | ||
324 | + clReleaseMemObject(bufferB); | ||
325 | + clReleaseMemObject(bufferC); | ||
326 | + clReleaseContext(context); | ||
327 | + | ||
328 | + // Free host resources | ||
329 | + free(A); | ||
330 | + free(B); | ||
331 | + free(C); | ||
332 | + free(platforms); | ||
333 | + free(devices); | ||
334 | +} | ||
335 | + | ||
336 | +// vim: ft=c ts=4 sw=4: |
Please
register
or
login
to post a comment