Commit 3b978b8e0b4bb1ffabe51f924455ea6b1f64ee7b

Authored by Georg Hopp
0 parents

initial checkin

  1 +arrayadd
# Build the OpenCL array-add example (arrayadd) from arrayadd.c.
#
# CFLAGS and LIBS are appended to with += so values passed in from the
# environment or the make command line are kept.
CFLAGS += -std=c99 -I/usr/local/include
LIBS += -L/usr/local/lib64/beignet -lcl
CC = cc

.PHONY: all
all: arrayadd

# Libraries go AFTER the source file: linkers resolve symbols left to right,
# and with --as-needed a library named before the code that uses it is dropped.
arrayadd: arrayadd.c
	$(CC) -O2 -march=native $(CFLAGS) -o $@ $< $(LIBS)
  1 +# OpenCL array add.
  2 +
  3 +Taken from
  4 +[here](http://www.heterogeneouscompute.org/wordpress/wp-content/uploads/2011/06/Chapter2.txt).
  5 +
  6 +## Description
  7 +
  8 +This is an example of how to implement element-wise array addition with OpenCL.
  9 +
  10 +## Requirements
  11 +
  12 +Some OpenCL-capable hardware and the corresponding OpenCL library exposing the
  13 +OpenCL API. I tested this on an Intel GPU (Intel Corporation Haswell-ULT
  14 +Integrated Graphics Controller (rev 09)) with the
  15 +[beignet](https://www.freedesktop.org/wiki/Software/Beignet/)
  16 +open source library.
  17 +
  18 +## License
  19 +
  20 +unknown
  1 +OpenCL tutorial notes
  2 +=====================
  3 +
  4 +URL: http://www.heterogeneouscompute.org/wordpress/wp-content/uploads/2011/06/Chapter2.txt
  1 +// This program implements a vector addition using OpenCL
  2 +
  3 +// System includes
  4 +#include <stdio.h>
  5 +#include <stdlib.h>
  6 +
  7 +// OpenCL includes
  8 +#include <CL/cl.h>
  9 +
// OpenCL C source of the device kernel; main() hands this string to
// clCreateProgramWithSource() and builds it at run time.
//
// vecadd performs an element-wise add of two int arrays: the work-item with
// global id `idx` computes C[idx] = A[idx] + B[idx]. The kernel does no
// bounds check, so the global work size must not exceed the number of
// elements in the three buffers.
//
// The text is kept pure ASCII so the bytes sent to the device compiler do not
// depend on the encoding of this host source file.
const char* programSource =
"__kernel \n"
"void vecadd(__global int *A, \n"
" __global int *B, \n"
" __global int *C) \n"
"{ \n"
" \n"
" // Get the work-item's unique ID \n"
" int idx = get_global_id(0); \n"
" \n"
" // Add the corresponding locations of \n"
" // 'A' and 'B', and store the result in 'C'. \n"
" C[idx] = A[idx] + B[idx]; \n"
"} \n"
;
  26 +
// File-local boolean type: `false` is 0 and `true` is 1.
typedef enum {
    false = 0,
    true  = 1
} bool;
  28 +
  29 +int main() {
  30 + // This code executes on the OpenCL host
  31 +
  32 + // Host data
  33 + int *A = NULL; // Input array
  34 + int *B = NULL; // Input array
  35 + int *C = NULL; // Output array
  36 +
  37 + // Elements in each array
  38 + const int elements = 2048;
  39 +
  40 + // Compute the size of the data
  41 + size_t datasize = sizeof(int)*elements;
  42 +
  43 + // Allocate space for input/output data
  44 + A = (int*)malloc(datasize);
  45 + B = (int*)malloc(datasize);
  46 + C = (int*)malloc(datasize);
  47 + // Initialize the input data
  48 + for(int i = 0; i < elements; i++) {
  49 + A[i] = i;
  50 + B[i] = i;
  51 + }
  52 +
  53 + // Use this to check the output of each API call
  54 + cl_int status;
  55 +
  56 + //-----------------------------------------------------
  57 + // STEP 1: Discover and initialize the platforms
  58 + //-----------------------------------------------------
  59 +
  60 + cl_uint numPlatforms = 0;
  61 + cl_platform_id *platforms = NULL;
  62 +
  63 + // Use clGetPlatformIDs() to retrieve the number of platforms
  64 + status = clGetPlatformIDs(0, NULL, &numPlatforms);
  65 +
  66 + // Allocate enough space for each platform
  67 + platforms =
  68 + (cl_platform_id*)malloc(
  69 + numPlatforms*sizeof(cl_platform_id));
  70 +
  71 + // Fill in platforms with clGetPlatformIDs()
  72 + status = clGetPlatformIDs(numPlatforms, platforms,
  73 + NULL);
  74 +
  75 + //-----------------------------------------------------
  76 + // STEP 2: Discover and initialize the devices
  77 + //-----------------------------------------------------
  78 +
  79 + cl_uint numDevices = 0;
  80 + cl_device_id *devices = NULL;
  81 +
  82 + // Use clGetDeviceIDs() to retrieve the number of
  83 + // devices present
  84 + status = clGetDeviceIDs(
  85 + platforms[0],
  86 + CL_DEVICE_TYPE_ALL,
  87 + 0,
  88 + NULL,
  89 + &numDevices);
  90 +
  91 + // Allocate enough space for each device
  92 + devices =
  93 + (cl_device_id*)malloc(
  94 + numDevices*sizeof(cl_device_id));
  95 +
  96 + // Fill in devices with clGetDeviceIDs()
  97 + status = clGetDeviceIDs(
  98 + platforms[0],
  99 + CL_DEVICE_TYPE_ALL,
  100 + numDevices,
  101 + devices,
  102 + NULL);
  103 +
  104 + //-----------------------------------------------------
  105 + // STEP 3: Create a context
  106 + //-----------------------------------------------------
  107 +
  108 + cl_context context = NULL;
  109 +
  110 + // Create a context using clCreateContext() and
  111 + // associate it with the devices
  112 + context = clCreateContext(
  113 + NULL,
  114 + numDevices,
  115 + devices,
  116 + NULL,
  117 + NULL,
  118 + &status);
  119 +
  120 + //-----------------------------------------------------
  121 + // STEP 4: Create a command queue
  122 + //-----------------------------------------------------
  123 +
  124 + cl_command_queue cmdQueue;
  125 +
  126 + // Create a command queue using clCreateCommandQueue(),
  127 + // and associate it with the device you want to execute
  128 + // on
  129 + cmdQueue = clCreateCommandQueue(
  130 + context,
  131 + devices[0],
  132 + 0,
  133 + &status);
  134 +
  135 + //-----------------------------------------------------
  136 + // STEP 5: Create device buffers
  137 + //-----------------------------------------------------
  138 +
  139 + cl_mem bufferA; // Input array on the device
  140 + cl_mem bufferB; // Input array on the device
  141 + cl_mem bufferC; // Output array on the device
  142 +
  143 + // Use clCreateBuffer() to create a buffer object (d_A)
  144 + // that will contain the data from the host array A
  145 + bufferA = clCreateBuffer(
  146 + context,
  147 + CL_MEM_READ_ONLY,
  148 + datasize,
  149 + NULL,
  150 + &status);
  151 +
  152 + // Use clCreateBuffer() to create a buffer object (d_B)
  153 + // that will contain the data from the host array B
  154 + bufferB = clCreateBuffer(
  155 + context,
  156 + CL_MEM_READ_ONLY,
  157 + datasize,
  158 + NULL,
  159 + &status);
  160 +
  161 + // Use clCreateBuffer() to create a buffer object (d_C)
  162 + // with enough space to hold the output data
  163 + bufferC = clCreateBuffer(
  164 + context,
  165 + CL_MEM_WRITE_ONLY,
  166 + datasize,
  167 + NULL,
  168 + &status);
  169 +
  170 + //-----------------------------------------------------
  171 + // STEP 6: Write host data to device buffers
  172 + //-----------------------------------------------------
  173 +
  174 + // Use clEnqueueWriteBuffer() to write input array A to
  175 + // the device buffer bufferA
  176 + status = clEnqueueWriteBuffer(
  177 + cmdQueue,
  178 + bufferA,
  179 + CL_FALSE,
  180 + 0,
  181 + datasize,
  182 + A,
  183 + 0,
  184 + NULL,
  185 + NULL);
  186 +
  187 + // Use clEnqueueWriteBuffer() to write input array B to
  188 + // the device buffer bufferB
  189 + status = clEnqueueWriteBuffer(
  190 + cmdQueue,
  191 + bufferB,
  192 + CL_FALSE,
  193 + 0,
  194 + datasize,
  195 + B,
  196 + 0,
  197 + NULL,
  198 + NULL);
  199 +
  200 + //-----------------------------------------------------
  201 + // STEP 7: Create and compile the program
  202 + //-----------------------------------------------------
  203 +
  204 + // Create a program using clCreateProgramWithSource()
  205 + cl_program program = clCreateProgramWithSource(
  206 + context,
  207 + 1,
  208 + (const char**)&programSource,
  209 + NULL,
  210 + &status);
  211 +
  212 + // Build (compile) the program for the devices with
  213 + // clBuildProgram()
  214 + status = clBuildProgram(
  215 + program,
  216 + numDevices,
  217 + devices,
  218 + NULL,
  219 + NULL,
  220 + NULL);
  221 +
  222 + //-----------------------------------------------------
  223 + // STEP 8: Create the kernel
  224 + //-----------------------------------------------------
  225 +
  226 + cl_kernel kernel = NULL;
  227 +
  228 + // Use clCreateKernel() to create a kernel from the
  229 + // vector addition function (named "vecadd")
  230 + kernel = clCreateKernel(program, "vecadd", &status);
  231 +
  232 + //-----------------------------------------------------
  233 + // STEP 9: Set the kernel arguments
  234 + //-----------------------------------------------------
  235 +
  236 + // Associate the input and output buffers with the
  237 + // kernel
  238 + // using clSetKernelArg()
  239 + status = clSetKernelArg(
  240 + kernel,
  241 + 0,
  242 + sizeof(cl_mem),
  243 + &bufferA);
  244 + status |= clSetKernelArg(
  245 + kernel,
  246 + 1,
  247 + sizeof(cl_mem),
  248 + &bufferB);
  249 + status |= clSetKernelArg(
  250 + kernel,
  251 + 2,
  252 + sizeof(cl_mem),
  253 + &bufferC);
  254 +
  255 + //-----------------------------------------------------
  256 + // STEP 10: Configure the work-item structure
  257 + //-----------------------------------------------------
  258 +
  259 + // Define an index space (global work size) of work items for
  260 + // execution. A workgroup size (local work size) is not required,
  261 + // but can be used.
  262 + size_t globalWorkSize[1];
  263 + // There are 'elements' work-items
  264 + globalWorkSize[0] = elements;
  265 +
  266 + //-----------------------------------------------------
  267 + // STEP 11: Enqueue the kernel for execution
  268 + //-----------------------------------------------------
  269 +
  270 + // Execute the kernel by using clEnqueueNDRangeKernel().
  271 + // 'globalWorkSize' is the 1D dimension of the work-items
  272 + status = clEnqueueNDRangeKernel(
  273 + cmdQueue,
  274 + kernel,
  275 + 1,
  276 + NULL,
  277 + globalWorkSize,
  278 + NULL,
  279 + 0,
  280 + NULL,
  281 + NULL);
  282 +
  283 + //-----------------------------------------------------
  284 + // STEP 12: Read the output buffer back to the host
  285 + //-----------------------------------------------------
  286 +
  287 + // Use clEnqueueReadBuffer() to read the OpenCL output
  288 + // buffer (bufferC)
  289 + // to the host output array (C)
  290 + clEnqueueReadBuffer(
  291 + cmdQueue,
  292 + bufferC,
  293 + CL_TRUE,
  294 + 0,
  295 + datasize,
  296 + C,
  297 + 0,
  298 + NULL,
  299 + NULL);
  300 +
  301 + // Verify the output
  302 + bool result = true;
  303 + for(int i = 0; i < elements; i++) {
  304 + if(C[i] != i+i) {
  305 + result = false;
  306 + break;
  307 + }
  308 + }
  309 + if(result) {
  310 + printf("Output is correct\n");
  311 + } else {
  312 + printf("Output is incorrect\n");
  313 + }
  314 +
  315 + //-----------------------------------------------------
  316 + // STEP 13: Release OpenCL resources
  317 + //-----------------------------------------------------
  318 +
  319 + // Free OpenCL resources
  320 + clReleaseKernel(kernel);
  321 + clReleaseProgram(program);
  322 + clReleaseCommandQueue(cmdQueue);
  323 + clReleaseMemObject(bufferA);
  324 + clReleaseMemObject(bufferB);
  325 + clReleaseMemObject(bufferC);
  326 + clReleaseContext(context);
  327 +
  328 + // Free host resources
  329 + free(A);
  330 + free(B);
  331 + free(C);
  332 + free(platforms);
  333 + free(devices);
  334 +}
  335 +
  336 +// vim: ft=c ts=4 sw=4:
Please register or login to post a comment