Showing
5 changed files
with
367 additions
and
0 deletions
.gitignore
0 → 100644
1 | +arrayadd | ... | ... |
Makefile
0 → 100644
README.md
0 → 100644
1 | +# OpenCL array add. | |
2 | + | |
3 | +Taken from | |
4 | +[here](http://www.heterogeneouscompute.org/wordpress/wp-content/uploads/2011/06/Chapter2.txt). | |
5 | + | |
6 | +## Description | |
7 | + | |
8 | +This is an example on how to implement an array add with OpenCL. | |
9 | + | |
10 | +## Requirements | |
11 | + | |
12 | +Some OpenCL capable hardware and the according OpenCL library exposing the | |
13 | +OpenCL API. I tested this on an Intel GPU (Intel Corporation Haswell-ULT | |
14 | +Integrated Graphics Controller (rev 09)) with the | |
15 | +[beignet](https://www.freedesktop.org/wiki/Software/Beignet/) | |
16 | +open source library. | |
17 | + | |
18 | +## License | |
19 | + | |
20 | +unknown | ... | ... |
README.md.old
0 → 100644
arrayadd.c
0 → 100644
1 | +// This program implements a vector addition using OpenCL | |
2 | + | |
3 | +// System includes | |
4 | +#include <stdio.h> | |
5 | +#include <stdlib.h> | |
6 | + | |
7 | +// OpenCL includes | |
8 | +#include <CL/cl.h> | |
9 | + | |
10 | +// OpenCL kernel to perform an element-wise add of two arrays | |
11 | +const char* programSource = | |
12 | +"__kernel \n" | |
13 | +"void vecadd(__global int *A, \n" | |
14 | +" __global int *B, \n" | |
15 | +" __global int *C) \n" | |
16 | +"{ \n" | |
17 | +" \n" | |
18 | +" // Get the work-item’s unique ID \n" | |
19 | +" int idx = get_global_id(0); \n" | |
20 | +" \n" | |
21 | +" // Add the corresponding locations of \n" | |
22 | +" // 'A' and 'B', and store the result in 'C'. \n" | |
23 | +" C[idx] = A[idx] + B[idx]; \n" | |
24 | +"} \n" | |
25 | +; | |
26 | + | |
27 | +typedef enum {false=0, true} bool; | |
28 | + | |
29 | +int main() { | |
30 | + // This code executes on the OpenCL host | |
31 | + | |
32 | + // Host data | |
33 | + int *A = NULL; // Input array | |
34 | + int *B = NULL; // Input array | |
35 | + int *C = NULL; // Output array | |
36 | + | |
37 | + // Elements in each array | |
38 | + const int elements = 2048; | |
39 | + | |
40 | + // Compute the size of the data | |
41 | + size_t datasize = sizeof(int)*elements; | |
42 | + | |
43 | + // Allocate space for input/output data | |
44 | + A = (int*)malloc(datasize); | |
45 | + B = (int*)malloc(datasize); | |
46 | + C = (int*)malloc(datasize); | |
47 | + // Initialize the input data | |
48 | + for(int i = 0; i < elements; i++) { | |
49 | + A[i] = i; | |
50 | + B[i] = i; | |
51 | + } | |
52 | + | |
53 | + // Use this to check the output of each API call | |
54 | + cl_int status; | |
55 | + | |
56 | + //----------------------------------------------------- | |
57 | + // STEP 1: Discover and initialize the platforms | |
58 | + //----------------------------------------------------- | |
59 | + | |
60 | + cl_uint numPlatforms = 0; | |
61 | + cl_platform_id *platforms = NULL; | |
62 | + | |
63 | + // Use clGetPlatformIDs() to retrieve the number of platforms | |
64 | + status = clGetPlatformIDs(0, NULL, &numPlatforms); | |
65 | + | |
66 | + // Allocate enough space for each platform | |
67 | + platforms = | |
68 | + (cl_platform_id*)malloc( | |
69 | + numPlatforms*sizeof(cl_platform_id)); | |
70 | + | |
71 | + // Fill in platforms with clGetPlatformIDs() | |
72 | + status = clGetPlatformIDs(numPlatforms, platforms, | |
73 | + NULL); | |
74 | + | |
75 | + //----------------------------------------------------- | |
76 | + // STEP 2: Discover and initialize the devices | |
77 | + //----------------------------------------------------- | |
78 | + | |
79 | + cl_uint numDevices = 0; | |
80 | + cl_device_id *devices = NULL; | |
81 | + | |
82 | + // Use clGetDeviceIDs() to retrieve the number of | |
83 | + // devices present | |
84 | + status = clGetDeviceIDs( | |
85 | + platforms[0], | |
86 | + CL_DEVICE_TYPE_ALL, | |
87 | + 0, | |
88 | + NULL, | |
89 | + &numDevices); | |
90 | + | |
91 | + // Allocate enough space for each device | |
92 | + devices = | |
93 | + (cl_device_id*)malloc( | |
94 | + numDevices*sizeof(cl_device_id)); | |
95 | + | |
96 | + // Fill in devices with clGetDeviceIDs() | |
97 | + status = clGetDeviceIDs( | |
98 | + platforms[0], | |
99 | + CL_DEVICE_TYPE_ALL, | |
100 | + numDevices, | |
101 | + devices, | |
102 | + NULL); | |
103 | + | |
104 | + //----------------------------------------------------- | |
105 | + // STEP 3: Create a context | |
106 | + //----------------------------------------------------- | |
107 | + | |
108 | + cl_context context = NULL; | |
109 | + | |
110 | + // Create a context using clCreateContext() and | |
111 | + // associate it with the devices | |
112 | + context = clCreateContext( | |
113 | + NULL, | |
114 | + numDevices, | |
115 | + devices, | |
116 | + NULL, | |
117 | + NULL, | |
118 | + &status); | |
119 | + | |
120 | + //----------------------------------------------------- | |
121 | + // STEP 4: Create a command queue | |
122 | + //----------------------------------------------------- | |
123 | + | |
124 | + cl_command_queue cmdQueue; | |
125 | + | |
126 | + // Create a command queue using clCreateCommandQueue(), | |
127 | + // and associate it with the device you want to execute | |
128 | + // on | |
129 | + cmdQueue = clCreateCommandQueue( | |
130 | + context, | |
131 | + devices[0], | |
132 | + 0, | |
133 | + &status); | |
134 | + | |
135 | + //----------------------------------------------------- | |
136 | + // STEP 5: Create device buffers | |
137 | + //----------------------------------------------------- | |
138 | + | |
139 | + cl_mem bufferA; // Input array on the device | |
140 | + cl_mem bufferB; // Input array on the device | |
141 | + cl_mem bufferC; // Output array on the device | |
142 | + | |
143 | + // Use clCreateBuffer() to create a buffer object (d_A) | |
144 | + // that will contain the data from the host array A | |
145 | + bufferA = clCreateBuffer( | |
146 | + context, | |
147 | + CL_MEM_READ_ONLY, | |
148 | + datasize, | |
149 | + NULL, | |
150 | + &status); | |
151 | + | |
152 | + // Use clCreateBuffer() to create a buffer object (d_B) | |
153 | + // that will contain the data from the host array B | |
154 | + bufferB = clCreateBuffer( | |
155 | + context, | |
156 | + CL_MEM_READ_ONLY, | |
157 | + datasize, | |
158 | + NULL, | |
159 | + &status); | |
160 | + | |
161 | + // Use clCreateBuffer() to create a buffer object (d_C) | |
162 | + // with enough space to hold the output data | |
163 | + bufferC = clCreateBuffer( | |
164 | + context, | |
165 | + CL_MEM_WRITE_ONLY, | |
166 | + datasize, | |
167 | + NULL, | |
168 | + &status); | |
169 | + | |
170 | + //----------------------------------------------------- | |
171 | + // STEP 6: Write host data to device buffers | |
172 | + //----------------------------------------------------- | |
173 | + | |
174 | + // Use clEnqueueWriteBuffer() to write input array A to | |
175 | + // the device buffer bufferA | |
176 | + status = clEnqueueWriteBuffer( | |
177 | + cmdQueue, | |
178 | + bufferA, | |
179 | + CL_FALSE, | |
180 | + 0, | |
181 | + datasize, | |
182 | + A, | |
183 | + 0, | |
184 | + NULL, | |
185 | + NULL); | |
186 | + | |
187 | + // Use clEnqueueWriteBuffer() to write input array B to | |
188 | + // the device buffer bufferB | |
189 | + status = clEnqueueWriteBuffer( | |
190 | + cmdQueue, | |
191 | + bufferB, | |
192 | + CL_FALSE, | |
193 | + 0, | |
194 | + datasize, | |
195 | + B, | |
196 | + 0, | |
197 | + NULL, | |
198 | + NULL); | |
199 | + | |
200 | + //----------------------------------------------------- | |
201 | + // STEP 7: Create and compile the program | |
202 | + //----------------------------------------------------- | |
203 | + | |
204 | + // Create a program using clCreateProgramWithSource() | |
205 | + cl_program program = clCreateProgramWithSource( | |
206 | + context, | |
207 | + 1, | |
208 | + (const char**)&programSource, | |
209 | + NULL, | |
210 | + &status); | |
211 | + | |
212 | + // Build (compile) the program for the devices with | |
213 | + // clBuildProgram() | |
214 | + status = clBuildProgram( | |
215 | + program, | |
216 | + numDevices, | |
217 | + devices, | |
218 | + NULL, | |
219 | + NULL, | |
220 | + NULL); | |
221 | + | |
222 | + //----------------------------------------------------- | |
223 | + // STEP 8: Create the kernel | |
224 | + //----------------------------------------------------- | |
225 | + | |
226 | + cl_kernel kernel = NULL; | |
227 | + | |
228 | + // Use clCreateKernel() to create a kernel from the | |
229 | + // vector addition function (named "vecadd") | |
230 | + kernel = clCreateKernel(program, "vecadd", &status); | |
231 | + | |
232 | + //----------------------------------------------------- | |
233 | + // STEP 9: Set the kernel arguments | |
234 | + //----------------------------------------------------- | |
235 | + | |
236 | + // Associate the input and output buffers with the | |
237 | + // kernel | |
238 | + // using clSetKernelArg() | |
239 | + status = clSetKernelArg( | |
240 | + kernel, | |
241 | + 0, | |
242 | + sizeof(cl_mem), | |
243 | + &bufferA); | |
244 | + status |= clSetKernelArg( | |
245 | + kernel, | |
246 | + 1, | |
247 | + sizeof(cl_mem), | |
248 | + &bufferB); | |
249 | + status |= clSetKernelArg( | |
250 | + kernel, | |
251 | + 2, | |
252 | + sizeof(cl_mem), | |
253 | + &bufferC); | |
254 | + | |
255 | + //----------------------------------------------------- | |
256 | + // STEP 10: Configure the work-item structure | |
257 | + //----------------------------------------------------- | |
258 | + | |
259 | + // Define an index space (global work size) of work items for | |
260 | + // execution. A workgroup size (local work size) is not required, | |
261 | + // but can be used. | |
262 | + size_t globalWorkSize[1]; | |
263 | + // There are 'elements' work-items | |
264 | + globalWorkSize[0] = elements; | |
265 | + | |
266 | + //----------------------------------------------------- | |
267 | + // STEP 11: Enqueue the kernel for execution | |
268 | + //----------------------------------------------------- | |
269 | + | |
270 | + // Execute the kernel by using clEnqueueNDRangeKernel(). | |
271 | + // 'globalWorkSize' is the 1D dimension of the work-items | |
272 | + status = clEnqueueNDRangeKernel( | |
273 | + cmdQueue, | |
274 | + kernel, | |
275 | + 1, | |
276 | + NULL, | |
277 | + globalWorkSize, | |
278 | + NULL, | |
279 | + 0, | |
280 | + NULL, | |
281 | + NULL); | |
282 | + | |
283 | + //----------------------------------------------------- | |
284 | + // STEP 12: Read the output buffer back to the host | |
285 | + //----------------------------------------------------- | |
286 | + | |
287 | + // Use clEnqueueReadBuffer() to read the OpenCL output | |
288 | + // buffer (bufferC) | |
289 | + // to the host output array (C) | |
290 | + clEnqueueReadBuffer( | |
291 | + cmdQueue, | |
292 | + bufferC, | |
293 | + CL_TRUE, | |
294 | + 0, | |
295 | + datasize, | |
296 | + C, | |
297 | + 0, | |
298 | + NULL, | |
299 | + NULL); | |
300 | + | |
301 | + // Verify the output | |
302 | + bool result = true; | |
303 | + for(int i = 0; i < elements; i++) { | |
304 | + if(C[i] != i+i) { | |
305 | + result = false; | |
306 | + break; | |
307 | + } | |
308 | + } | |
309 | + if(result) { | |
310 | + printf("Output is correct\n"); | |
311 | + } else { | |
312 | + printf("Output is incorrect\n"); | |
313 | + } | |
314 | + | |
315 | + //----------------------------------------------------- | |
316 | + // STEP 13: Release OpenCL resources | |
317 | + //----------------------------------------------------- | |
318 | + | |
319 | + // Free OpenCL resources | |
320 | + clReleaseKernel(kernel); | |
321 | + clReleaseProgram(program); | |
322 | + clReleaseCommandQueue(cmdQueue); | |
323 | + clReleaseMemObject(bufferA); | |
324 | + clReleaseMemObject(bufferB); | |
325 | + clReleaseMemObject(bufferC); | |
326 | + clReleaseContext(context); | |
327 | + | |
328 | + // Free host resources | |
329 | + free(A); | |
330 | + free(B); | |
331 | + free(C); | |
332 | + free(platforms); | |
333 | + free(devices); | |
334 | +} | |
335 | + | |
336 | +// vim: ft=c ts=4 sw=4: | ... | ... |
Please
register
or
login
to post a comment