commit f38a238420defdd923d4edece13511e7f6842b7f
parent fb111b34e2a4ecce05efbadf1f62fcd4ce752415
Author: Christian Grothoff <christian@grothoff.org>
Date: Sun, 19 Nov 2023 00:54:50 +0100
proper square example
Diffstat:
4 files changed, 542 insertions(+), 5 deletions(-)
diff --git a/source/Makefile b/source/Makefile
@@ -1,2 +1,5 @@
all:
gcc -g -O0 -D CL_TARGET_OPENCL_VERSION=100 -o foo rsa-test.c lib-gpu-verify.c big-int-test.c -lgcrypt -lOpenCL -lm
+
+square: square.c
+ gcc -g -O0 -D CL_TARGET_OPENCL_VERSION=100 -o square square.c -lOpenCL -lm
diff --git a/source/square.c b/source/square.c
@@ -0,0 +1,529 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <CL/opencl.h>
+#include <errno.h>
+#include <string.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <stdbool.h>
+#include <unistd.h>
+
+
+static cl_platform_id
+select_platform (unsigned int offset,
+ bool print_platforms)
+{
+ cl_uint max_platforms = 4;
+ cl_platform_id platforms[max_platforms];
+ cl_uint num_platforms;
+ cl_int rplat;
+
+ rplat = clGetPlatformIDs (max_platforms,
+ platforms,
+ &num_platforms);
+ if (CL_SUCCESS != rplat)
+ {
+ fprintf (stderr,
+ "Error: Failed to lookup platforms! (%d)\n",
+ rplat);
+ exit (1);
+ }
+ if (print_platforms)
+ {
+ for (unsigned int i = 0; i<num_platforms; i++)
+ {
+ char buf[128];
+ size_t rbuf;
+ static struct
+ {
+ cl_platform_info cpi;
+ const char *name;
+ } param[] = {
+ { CL_PLATFORM_PROFILE, "profile" },
+ { CL_PLATFORM_VENDOR, "vendor" },
+ { CL_PLATFORM_NAME, "name" },
+ { CL_PLATFORM_EXTENSIONS, "extensions" },
+ { 0, NULL }
+ };
+
+ for (unsigned int j = 0; NULL != param[j].name; j++)
+ {
+ cl_int err;
+
+ err = clGetPlatformInfo (platforms[i],
+ param[j].cpi,
+ sizeof (buf),
+ buf,
+ &rbuf);
+ if (err != CL_SUCCESS)
+ {
+ fprintf (stderr,
+ "Error: Failed to get platform info for %s! (%d)\n",
+ param[j].name,
+ err);
+ }
+ else
+ {
+ printf ("#%u %s %.*s\n",
+ i,
+ param[j].name,
+ (int) rbuf,
+ buf);
+ }
+ }
+ }
+ exit (0);
+ }
+ if (offset >= num_platforms)
+ {
+ fprintf (stderr,
+ "Found only %u platforms\n",
+ (unsigned int) num_platforms);
+ exit (1);
+ }
+ return platforms[offset];
+}
+
+
+static cl_device_id
+select_device (cl_platform_id platform)
+{
+ cl_device_id device_id;
+ char buf[1024];
+ size_t len;
+ cl_int err;
+ cl_uint address_bits = 0;
+
+ err = clGetDeviceIDs (platform,
+ CL_DEVICE_TYPE_ALL,
+ 1, /* 1 device */
+ &device_id,
+ NULL);
+ if (CL_SUCCESS != err)
+ {
+ fprintf (stderr,
+ "Error: Failed to find a device! (%d)\n",
+ err);
+ exit (1);
+ }
+
+ err = clGetDeviceInfo (device_id,
+ CL_DRIVER_VERSION,
+ sizeof (buf),
+ buf,
+ &len);
+ if (CL_SUCCESS != err)
+ {
+ fprintf (stderr,
+ "Error: Failed to get device driver version! (%d)\n",
+ err);
+ exit (1);
+ }
+ printf ("Driver version: %.*s\n",
+ (int) len,
+ buf);
+ clGetDeviceInfo (device_id,
+ CL_DEVICE_ADDRESS_BITS,
+ sizeof (address_bits),
+ &address_bits,
+ &len);
+ if (CL_SUCCESS != err)
+ {
+ fprintf (stderr,
+ "Error: Failed to get device address bits! (%d)\n",
+ err);
+ exit (1);
+ }
+ printf ("device address bits: %d\n",
+ (int) address_bits);
+ return device_id;
+}
+
+
+static void
+logger (const char *errinfo,
+ const void *private_info,
+ size_t cb,
+ void *user_data)
+{
+ fprintf (stderr,
+ "<OpenCL>: %s\n",
+ errinfo);
+}
+
+
+static cl_context
+create_compute_context (cl_device_id device_id)
+{
+ cl_int err;
+ cl_context context;
+
+ context = clCreateContext (NULL,
+ 1,
+ &device_id,
+ &logger, NULL,
+ &err);
+ if (CL_SUCCESS != err)
+ {
+ fprintf (stderr,
+ "Error: Failed to create a compute context (%d)\n",
+ err);
+ exit (1);
+ }
+ return context;
+}
+
+
+static cl_command_queue
+create_command_queue (cl_device_id device_id,
+ cl_context context)
+{
+ cl_int err;
+ cl_command_queue commands;
+
+ commands = clCreateCommandQueue (context,
+ device_id,
+ 0, /* properties */
+ &err);
+ if (CL_SUCCESS != err)
+ {
+ fprintf (stderr,
+ "Error: Failed to create a command commands!\n (%d)",
+ err);
+ exit (1);
+ }
+ return commands;
+}
+
+
+static cl_program
+compile_program (cl_device_id device_id,
+ cl_context context,
+ const char *sourcefile)
+{
+ cl_program program;
+ int fd;
+ void *code;
+ struct stat ss;
+ cl_int err;
+
+ fd = open (sourcefile,
+ O_RDONLY);
+ if (-1 == fd)
+ {
+ fprintf (stderr,
+ "Failed to open %s: %s\n",
+ sourcefile,
+ strerror (errno));
+ exit (1);
+ }
+ if (0 != fstat (fd,
+ &ss))
+ {
+ fprintf (stderr,
+ "Failed to stat %s: %s\n",
+ sourcefile,
+ strerror (errno));
+ close (fd);
+ exit (1);
+ }
+ code = mmap (NULL,
+ ss.st_size,
+ PROT_READ,
+ MAP_PRIVATE,
+ fd,
+ 0 /* offset */);
+ close (fd);
+ {
+ size_t sz = ss.st_size;
+
+ program = clCreateProgramWithSource (context,
+ 1, /* 1 source file */
+ (const char **) &code,
+ &sz,
+ &err);
+ if (CL_SUCCESS != err)
+ {
+ fprintf (stderr,
+ "Error: Failed to create compute program (%d)!\n",
+ err);
+ munmap (code,
+ ss.st_size);
+ exit (1);
+ }
+ }
+ err = clBuildProgram (program,
+ 0, /* number of devices */
+ NULL, /* devices */
+ NULL, /* options (char *) */
+ NULL, /* callback */
+ NULL);
+ munmap (code,
+ ss.st_size);
+ if (CL_SUCCESS != err)
+ {
+ size_t len;
+ char buffer[2048];
+
+ fprintf (stderr,
+ "Error: Failed to build program executable (%d)!\n",
+ err);
+ err = clGetProgramBuildInfo (program,
+ device_id,
+ CL_PROGRAM_BUILD_LOG,
+ sizeof(buffer),
+ buffer,
+ &len);
+ if (CL_SUCCESS != err)
+ {
+ fprintf (stderr,
+ "Error: could not get build logs (%d)!\n",
+ err);
+ exit (1);
+ }
+ fprintf (stderr,
+ "<clBuild>: %.*s\n",
+ (int) len,
+ buffer);
+ exit (1);
+ }
+ return program;
+}
+
+
+static cl_kernel
+create_kernel (cl_program program,
+ const char *name)
+{
+ cl_kernel kernel;
+ cl_int err;
+
+ kernel = clCreateKernel (program,
+ name,
+ &err);
+ if (CL_SUCCESS != err)
+ {
+ fprintf (stderr,
+ "Error: Failed to create compute kernel %s: %d!\n",
+ name,
+ err);
+ exit (1);
+ }
+ return kernel;
+}
+
+
+static void
+square_with_kernel (cl_device_id device_id,
+ cl_context context,
+ cl_command_queue commands,
+ cl_kernel kernel,
+ unsigned int problem_size,
+ const float problem[static problem_size],
+ float solution[static problem_size])
+{
+ // Create the input and output arrays in device memory for our calculation
+ cl_int err;
+ cl_mem sq_input;
+ cl_mem sq_output;
+ size_t local;
+
+ sq_input = clCreateBuffer (context,
+ CL_MEM_READ_ONLY
+ | CL_MEM_USE_HOST_PTR,
+ sizeof(float) * problem_size,
+ (void *) problem,
+ &err);
+ if (CL_SUCCESS != err)
+ {
+ fprintf (stderr,
+ "Error: Failed to create input buffer: %d!\n",
+ err);
+ exit (1);
+ }
+ sq_output = clCreateBuffer (context,
+ CL_MEM_WRITE_ONLY,
+ sizeof(float) * problem_size,
+ NULL /* no previous data */,
+ &err);
+ if (CL_SUCCESS != err)
+ {
+ fprintf (stderr,
+ "Error: Failed to create output buffer: %d!\n",
+ err);
+ exit (1);
+ }
+#if DEAD
+ // Write our data set into the input array in device memory
+ err = clEnqueueWriteBuffer (commands,
+ sq_input,
+ CL_TRUE,
+ 0,
+ sizeof(DIGIT_T) * s_len[n - 1],
+ signatures,
+ 0,
+ NULL,
+ NULL);
+ if (err != CL_SUCCESS)
+ {
+ printf ("Error: Failed to write to source array!\n");
+ exit (1);
+ }
+#endif
+ err = 0;
+ err = clSetKernelArg (kernel, 0, sizeof(cl_mem), &sq_input);
+ err |= clSetKernelArg (kernel, 1, sizeof(cl_mem), &sq_output);
+ err |= clSetKernelArg (kernel, 2, sizeof(problem_size), &problem_size);
+ if (err != CL_SUCCESS)
+ {
+ fprintf (stderr,
+ "Failed to set kernel arguments! %d\n",
+ err);
+ exit (1);
+ }
+
+ // Get the maximum work group size for executing the kernel on the device
+ // FIXME: do this once when the kernel is created, not every time we run it!
+ err = clGetKernelWorkGroupInfo (kernel,
+ device_id,
+ CL_KERNEL_WORK_GROUP_SIZE,
+ sizeof(local),
+ &local,
+ NULL);
+ if (CL_SUCCESS != err)
+ {
+ fprintf (stderr,
+ "Error: Failed to retrieve kernel work group info! %d\n", err);
+ exit (1);
+ }
+
+ // Execute the kernel over the entire range of our 1d input data set
+ // using the maximum number of work group items for this device
+ {
+ size_t global_problem_size = problem_size;
+
+#if 0
+ fprintf (stderr,
+ "GPS: %llu - LPS: %llu\n",
+ (unsigned long long) problem_size,
+ (unsigned long long) local);
+#endif
+ local = (local > problem_size) ? problem_size : local;
+ err = clEnqueueNDRangeKernel (commands,
+ kernel,
+ 1 /* work_dim */,
+ NULL /* global work offset */,
+ &global_problem_size, /* array of work_dim values */
+ &local,
+ 0, /* num_events */
+ NULL, /* event wait list */
+ NULL /* event */);
+ }
+ if (CL_SUCCESS != err)
+ {
+ fprintf (stderr,
+ "Error: Failed to execute kernel (%d)!\n",
+ err);
+ exit (1);
+ }
+
+ // Wait for the command commands to get serviced before reading back results
+ clFinish (commands);
+
+ // Read back the results from the device to verify the output
+ err = clEnqueueReadBuffer (commands,
+ sq_output,
+ CL_TRUE /* blocking read */,
+ 0 /* offset */,
+ sizeof (float) * problem_size,
+ solution,
+ 0, /* num events in wait list */
+ NULL /* wait list */,
+ NULL /* event */);
+ if (CL_SUCCESS != err)
+ {
+ fprintf (stderr,
+ "Error: Failed to read output array! %d\n",
+ err);
+ exit (1);
+ }
+ clReleaseMemObject (sq_output);
+ clReleaseMemObject (sq_input);
+}
+
+
+int
+main (int argc,
+ char **argv)
+{
+ unsigned int offset = 0;
+ bool print_platforms = false;
+ cl_platform_id platform;
+ cl_device_id device_id;
+ cl_context context;
+ cl_command_queue commands;
+ cl_program program;
+ cl_kernel kernel;
+
+ /* TODO: replace by getopt-style command-line parsing ... */
+ if ( (NULL != argv[1]) &&
+ (0 == strcmp (argv[1],
+ "list")) )
+ print_platforms = true;
+ if ( (NULL != argv[1]) &&
+ (0 == strncmp (argv[1],
+ "platform=",
+ strlen ("platform="))) )
+ offset = atoi (&argv[1][strlen ("platform=")]);
+
+ platform = select_platform (offset,
+ print_platforms);
+ device_id = select_device (platform);
+ context = create_compute_context (device_id);
+ commands = create_command_queue (device_id,
+ context);
+ program = compile_program (device_id,
+ context,
+ "square.cl");
+ kernel = create_kernel (program,
+ "square");
+ {
+ unsigned int scale = 1024 * 1024;
+ float *inputs = malloc (sizeof (float) * scale);
+ float *squares = malloc (sizeof (float) * scale);
+
+ if ( (NULL == inputs) ||
+ (NULL == squares) )
+ {
+ fprintf (stderr,
+ "allocation failed (%s)\n",
+ strerror (errno));
+ exit (1);
+ }
+ for (unsigned int i = 0; i<scale; i++)
+ inputs[i] = 1.0 * i + i * 0.01;
+ for (unsigned int b = 0; b < 10000; b++)
+ {
+ square_with_kernel (device_id,
+ context,
+ commands,
+ kernel,
+ scale,
+ inputs,
+ squares);
+ }
+ for (unsigned int i = 0; i<scale; i++)
+ if (squares[i] != (inputs[i] * inputs[i]))
+ fprintf (stderr,
+ "Bad computation (%u)\n",
+ i);
+ free (inputs);
+ free (squares);
+ }
+ clReleaseKernel (kernel);
+ clReleaseProgram (program);
+ clReleaseCommandQueue (commands);
+ clReleaseContext (context);
+ return 0;
+}
diff --git a/source/square.cl b/source/square.cl
@@ -0,0 +1 @@
+../xcode/rsa-kernel.cl
+\ No newline at end of file
diff --git a/xcode/rsa-kernel.cl b/xcode/rsa-kernel.cl
@@ -1,9 +1,12 @@
-__kernel void square(__global float* input, __global float* output, const unsigned int count)
+__kernel void
+square (__global float* input,
+ __global float* output,
+ const unsigned int count)
{
- int i = get_global_id(0);
-
- if(i < count)
- output[i] = input[i] * input[i];
+ int i = get_global_id(0);
+
+ if(i < count)
+ output[i] = input[i] * input[i];
}