commit d50b5b70b89f9fd9a7208fa52c8bf0c9aadb9f92
parent b5c2468c4d2b4b9ee2364e6331cbf968c060023d
Author: Cedric <cedric.zwahlen@students.bfh.ch>
Date: Wed, 1 Nov 2023 23:33:32 +0100
The OpenCL kernel compiles, and data can be passed to it. The kernel does not return anything yet.
Diffstat:
10 files changed, 1445 insertions(+), 65 deletions(-)
diff --git a/source/opencl-test.c b/source/opencl-test.c
@@ -29,12 +29,10 @@ int opencl_tests(void) {
cl_mem input; // device memory used for the input array
cl_mem output; // device memory used for the output array
- // Fill our data set with random float values
- //
+
int i = 0;
unsigned int count = DATA_SIZE;
- for(i = 0; i < count; i++)
- data[i] = rand() / (float)RAND_MAX;
+
// Connect to a compute device
//
@@ -46,15 +44,20 @@ int opencl_tests(void) {
return EXIT_FAILURE;
}
- size_t retSize = 0;
- clGetDeviceInfo(device_id, CL_DRIVER_VERSION, 0, NULL, &retSize);
-
- char driver_version[retSize];
- clGetDeviceInfo(device_id, CL_DRIVER_VERSION, retSize, &driver_version, &retSize);
+ size_t retSize_1 = 0;
+ clGetDeviceInfo(device_id, CL_DRIVER_VERSION, 0, NULL, &retSize_1);
+ char driver_version[retSize_1];
+ clGetDeviceInfo(device_id, CL_DRIVER_VERSION, retSize_1, &driver_version, &retSize_1);
+
+ printf("driver version: %s\n", driver_version);
+
+
+ size_t retSize_2 = sizeof(cl_uint);
+ cl_uint address_bits = 0;
+ clGetDeviceInfo(device_id, CL_DEVICE_ADDRESS_BITS, 0, NULL, &retSize_2);
+ clGetDeviceInfo(device_id, CL_DEVICE_ADDRESS_BITS, retSize_2, &address_bits, &retSize_2);
- //char *driver_version;
- //clGetDeviceInfo(device_id, CL_DRIVER_VERSION, NULL, &driver_version, NULL);
- printf("%s\n", driver_version);
+ printf("device address bits: %i\n", address_bits);
// Create a compute context
@@ -78,7 +81,7 @@ int opencl_tests(void) {
// get the kernel from a file instead of a constant
- FILE *fp = fopen("rsa-kernel.cl", "r");
+ FILE *fp = fopen("./verify.cl", "r");
fseek(fp, 0L, SEEK_END);
size_t sz = ftell(fp);
rewind(fp);
@@ -113,7 +116,7 @@ int opencl_tests(void) {
// Create the compute kernel in the program we wish to run
//
- kernel = clCreateKernel(program, "square", &err);
+ kernel = clCreateKernel(program, "single", &err);
if (!kernel || err != CL_SUCCESS)
{
printf("Error: Failed to create compute kernel!\n");
@@ -184,15 +187,7 @@ int opencl_tests(void) {
exit(1);
}
- // Validate our results
- //
- correct = 0;
- for(i = 0; i < count; i++)
- {
- if(results[i] == data[i] * data[i])
- correct++;
- }
-
+
// Print a brief summary detailing the results
//
@@ -207,5 +202,4 @@ int opencl_tests(void) {
clReleaseCommandQueue(commands);
clReleaseContext(context);
-
}
diff --git a/source/rsa-test.c b/source/rsa-test.c
@@ -7,7 +7,7 @@
#include "rsa-test.h"
#include "big-int-test.h"
-
+#include <OpenCL/opencl.h>
#include "ctype.h"
#include "time.h"
@@ -16,9 +16,9 @@
#define NEED_LIBGCRYPT_VERSION "1.10.2"
+#define DATA_SIZE (1024)
-
-void rsa_tests(void) {
+int rsa_tests(void) {
// MARK: UNSAFE init
@@ -75,13 +75,8 @@ void rsa_tests(void) {
// use gcry_rsa_sign – without padding?
gcry_sexp_t resSign;
-
-
gcry_pk_sign(&resSign, toSign, key);
-
- // show_sexp("\n", resSign);
-
// measure time
// do the same thing with our bigNum library – do we even get the same signature?
@@ -112,21 +107,10 @@ void rsa_tests(void) {
unsigned char *sgn = malloc(2048);
gcry_mpi_print(GCRYMPI_FMT_HEX,sgn,2048,&nL,sig_mpi);
- // printf("%s",n);
-
- // use 'real time' – not clock time
-
-
-
- bigNum_tests(n, val, d);
-
-
struct timespec t1, t2;
clock_gettime(CLOCK_REALTIME, &t1);
-
-
if (verify(sgn, e, n, val)) {
printf("\nverification failed\n");
@@ -163,11 +147,287 @@ void rsa_tests(void) {
// try to put it onto the gpu
- return;
+ // MARK: GPU Code
-}
+ int err; // error code returned from api calls
+
+ float data[DATA_SIZE]; // original data set given to device
+ float results[DATA_SIZE]; // results returned from device
+ unsigned int correct; // number of correct results returned
+
+ size_t global; // global domain size for our calculation
+ size_t local; // local domain size for our calculation
+
+ cl_device_id device_id; // compute device id
+ cl_context context; // compute context
+ cl_command_queue commands; // compute command queue
+ cl_program program; // compute program
+ cl_kernel kernel; // compute kernel
+
+ //cl_mem input; // device memory used for the input array
+ //cl_mem output; // device memory used for the output array
+
+
+ //int i = 0;
+ unsigned int count = DATA_SIZE;
+
+
+ // Connect to a compute device
+ //
+ int gpu = 1;
+ err = clGetDeviceIDs(NULL, gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL);
+ if (err != CL_SUCCESS)
+ {
+ printf("Error: Failed to create a device group!\n");
+ return EXIT_FAILURE;
+ }
+
+ size_t retSize_1 = 0;
+ clGetDeviceInfo(device_id, CL_DRIVER_VERSION, 0, NULL, &retSize_1);
+ char driver_version[retSize_1];
+ clGetDeviceInfo(device_id, CL_DRIVER_VERSION, retSize_1, &driver_version, &retSize_1);
+
+ printf("driver version: %s\n", driver_version);
+
+
+ size_t retSize_2 = sizeof(cl_uint);
+ cl_uint address_bits = 0;
+ clGetDeviceInfo(device_id, CL_DEVICE_ADDRESS_BITS, 0, NULL, &retSize_2);
+ clGetDeviceInfo(device_id, CL_DEVICE_ADDRESS_BITS, retSize_2, &address_bits, &retSize_2);
+
+ printf("device address bits: %i\n", address_bits);
+
+
+ // Create a compute context
+ //
+ context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
+ if (!context)
+ {
+ printf("Error: Failed to create a compute context!\n");
+ return EXIT_FAILURE;
+ }
+ // Create a command commands
+ //
+ commands = clCreateCommandQueue(context, device_id, 0, &err);
+ if (!commands)
+ {
+ printf("Error: Failed to create a command commands!\n");
+ return EXIT_FAILURE;
+ }
+
+ // get the kernel from a file instead of a constant
+
+ FILE *fp = fopen("./verify.cl", "r");
+ fseek(fp, 0L, SEEK_END);
+ size_t sz = ftell(fp);
+ rewind(fp);
+
+ char *kernelBuf = malloc(sz);
+ fread(kernelBuf, sizeof(char), sz, fp);
+ fclose(fp);
+
+ // Create the compute program from the source buffer
+ //
+ //program = clCreateProgramWithSource(context, 1, (const char **) & KernelSource, NULL, &err);
+ program = clCreateProgramWithSource(context, 1, (const char **) & kernelBuf, NULL, &err);
+ if (!program)
+ {
+ printf("Error: Failed to create compute program!\n");
+ return EXIT_FAILURE;
+ }
+
+ // Build the program executable
+ //
+ err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
+ if (err != CL_SUCCESS)
+ {
+ size_t len;
+ char buffer[2048];
+
+ printf("Error: Failed to build program executable!\n");
+ clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
+ printf("%s\n", buffer);
+ exit(1);
+ }
+
+ // Create the compute kernel in the program we wish to run
+ //
+ kernel = clCreateKernel(program, "single", &err);
+ if (!kernel || err != CL_SUCCESS)
+ {
+ printf("Error: Failed to create compute kernel!\n");
+ exit(1);
+ }
+
+ // Create the input and output arrays in device memory for our calculation
+ //
+ //input = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * count, NULL, NULL);
+ //output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * count, NULL, NULL);
+
+ cl_mem s_mem;
+ cl_mem e_mem;
+ cl_mem n_mem;
+
+ cl_mem res_mem;
+ cl_mem valid; // needs to be a buffer because it goes out
+
+ DIGIT_T n_buf [MAX_ALLOC_SIZE*2];
+ DIGIT_T e_buf [MAX_ALLOC_SIZE*2];
+ DIGIT_T s_buf [MAX_ALLOC_SIZE*2];
+
+ DIGIT_T res_buf [MAX_ALLOC_SIZE*2];
+
+ mpSetZero(n_buf, MAX_ALLOC_SIZE*2);
+ mpSetZero(e_buf, MAX_ALLOC_SIZE*2);
+ mpSetZero(s_buf, MAX_ALLOC_SIZE*2);
+
+ mpSetZero(res_buf, MAX_ALLOC_SIZE*2);
+
+ mpConvFromHex(n_buf, strlen(n), n);
+ mpConvFromHex(e_buf, strlen(e), e);
+ mpConvFromHex(s_buf, strlen(sgn), sgn);
+
+ size_t sz_n = mpSizeof(n_buf, MAX_ALLOC_SIZE*2);
+ size_t sz_s = mpSizeof(s_buf, MAX_ALLOC_SIZE*2);
+
+
+ unsigned long s_len = mpSizeof(s_buf, MAX_ALLOC_SIZE*2);
+ unsigned long e_len = mpSizeof(e_buf, MAX_ALLOC_SIZE*2);
+ unsigned long n_len = mpSizeof(n_buf, MAX_ALLOC_SIZE*2);
+
+ unsigned long res_len = MAX_ALLOC_SIZE*2;
+
+ unsigned long max_len = max(sz_s,sz_n);
+
+ n_mem = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(char) * n_len, NULL, NULL);
+ e_mem = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(char) * e_len, NULL, NULL);
+ s_mem = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(char) * s_len, NULL, NULL);
+
+ res_mem = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(char) * res_len, NULL, NULL);
+ valid = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(int8_t), NULL, NULL);
+
+
+ global = 4096;
+ //local = global;
+
+
+ if (!s_mem || !e_mem || !n_mem || !res_mem || !valid )
+ {
+ printf("Error: Failed to allocate device memory!\n");
+ exit(1);
+ }
+
+
+ // Write our data set into the input array in device memory
+ //
+ err = clEnqueueWriteBuffer(commands, s_mem, CL_TRUE, 0, s_len, s_buf, 0, NULL, NULL);
+ err |= clEnqueueWriteBuffer(commands, e_mem, CL_TRUE, 0, e_len, e_buf, 0, NULL, NULL);
+ err |= clEnqueueWriteBuffer(commands, n_mem, CL_TRUE, 0, n_len, n_buf, 0, NULL, NULL);
+ //err |= clEnqueueWriteBuffer(commands, res_mem, CL_TRUE, 0, res_len, res_buf, 0, NULL, NULL);
+ if (err != CL_SUCCESS)
+ {
+ printf("Error: Failed to write to source array!\n");
+ exit(1);
+ }
+
+ // Set the arguments to our compute kernel
+ //
+ err = 0;
+ err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &s_mem);
+ err |= clSetKernelArg(kernel, 1, sizeof(unsigned int), &s_len);
+ err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &e_mem);
+ err |= clSetKernelArg(kernel, 3, sizeof(unsigned int), &e_len);
+ err |= clSetKernelArg(kernel, 4, sizeof(cl_mem), &n_mem);
+ err |= clSetKernelArg(kernel, 5, sizeof(unsigned int), &n_len);
+ err |= clSetKernelArg(kernel, 6, sizeof(cl_mem), &res_mem);
+ err |= clSetKernelArg(kernel, 7, sizeof(unsigned int), &res_len);
+ err |= clSetKernelArg(kernel, 8, sizeof(unsigned int), &max_len);
+ err |= clSetKernelArg(kernel, 9, sizeof(cl_mem), &valid);
+
+ //err |= clSetKernelArg(kernel, 2, sizeof(unsigned int), &count);
+ if (err != CL_SUCCESS)
+ {
+ printf("Error: Failed to set kernel arguments! %d\n", err);
+ exit(1);
+ }
+
+ // Get the maximum work group size for executing the kernel on the device
+ //
+ err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL);
+ if (err != CL_SUCCESS)
+ {
+ printf("Error: Failed to retrieve kernel work group info! %d\n", err);
+ exit(1);
+ }
+
+ // Execute the kernel over the entire range of our 1d input data set
+ // using the maximum number of work group items for this device
+ //
+
+ err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
+ if (err)
+ {
+ printf("Error: Failed to execute kernel!\n");
+ return EXIT_FAILURE;
+ }
+
+ // Wait for the command commands to get serviced before reading back results
+ //
+ clFinish(commands);
+
+ // Read back the results from the device to verify the output
+ //
+ err = clEnqueueReadBuffer( commands, res_mem, CL_TRUE, 0, res_len, res_buf, 0, NULL, NULL );
+ if (err != CL_SUCCESS)
+ {
+ printf("Error: Failed to read output array! %d\n", err);
+ exit(1);
+ }
+
+ size_t sz_res = mpSizeof(res_buf, MAX_ALLOC_SIZE*2);
+
+ int sz_mm = strlen(val) + 2;
+
+ unsigned char comp[sz_mm];
+
+ mpConvToHex(res_buf, sz_res, comp, sz_mm);
+
+ printf("%s",comp);
+
+ // Print a brief summary detailing the results
+ //
+ // printf("Computed '%d/%d' correct values!\n", correct, count);
+
+ // Shutdown and cleanup
+ //
+ clReleaseMemObject(res_mem);
+ clReleaseMemObject(e_mem);
+ clReleaseMemObject(n_mem);
+ clReleaseMemObject(s_mem);
+
+ clReleaseProgram(program);
+ clReleaseKernel(kernel);
+ clReleaseCommandQueue(commands);
+ clReleaseContext(context);
+
+
+
+
+
+
+
+
+
+
+
+
+ return 0;
+
+}
+
+/*
//void bigNum_tests(void) {
void bigNum_tests(unsigned char* nn,unsigned char* ee,unsigned char* dd) {
@@ -185,14 +445,7 @@ void bigNum_tests(unsigned char* nn,unsigned char* ee,unsigned char* dd) {
mpSetZero(d, MAX_ALLOC_SIZE*2);
mpSetZero(res, MAX_ALLOC_SIZE*2);
- /*
-
- char* nn = "E08973398DD8F5F5E88776397F4EB005BB5383DE0FB7ABDC7DC775290D052E6D12DFA68626D4D26FAA5829FC97ECFA82510F3080BEB1509E4644F12CBBD832CFC6686F07D9B060ACBEEE34096A13F5F7050593DF5EBA3556D961FF197FC981E6F86CEA874070EFAC6D2C749F2DFA553AB9997702A648528C4EF357385774575F";
-
- char* ee = "010001";
-
- char* dd = "A403C327477634346CA686B57949014B2E8AD2C862B2C7D748096A8B91F736F275D6E8CD15906027314735644D95CD6763CEB49F56AC2F376E1CEE0EBF282DF439906F34D86E085BD5656AD841F313D72D395EFE33CBFF29E4030B3D05A28FB7F18EA27637B07957D32F2BDE8706227D04665EC91BAF8B1AC3EC9144AB7F21";
- */
+
mpConvFromHex(N, strlen(nn), nn);
mpConvFromHex(e, strlen(ee), ee);
mpConvFromHex(d, strlen(dd), dd);
@@ -213,7 +466,7 @@ void bigNum_tests(unsigned char* nn,unsigned char* ee,unsigned char* dd) {
}
-
+*/
int verify(unsigned char* sign, unsigned char* ee, unsigned char* nn, unsigned char* mm) {
diff --git a/source/rsa-test.h b/source/rsa-test.h
@@ -12,7 +12,7 @@
#include <gcrypt.h>
-void rsa_tests(void);
+int rsa_tests(void);
static void show_sexp(const char *prefix, gcry_sexp_t a);
diff --git a/xcode/.DS_Store b/xcode/.DS_Store
Binary files differ.
diff --git a/xcode/lib-gpu-verify.xcodeproj/project.pbxproj b/xcode/lib-gpu-verify.xcodeproj/project.pbxproj
@@ -8,7 +8,7 @@
/* Begin PBXBuildFile section */
6A8A795D2A89357400116D7D /* rsa-kernel.cl in Sources */ = {isa = PBXBuildFile; fileRef = 6A8A795C2A89357400116D7D /* rsa-kernel.cl */; };
- 6A8A795F2A89672700116D7D /* modexp.cl in Sources */ = {isa = PBXBuildFile; fileRef = 6A8A795E2A89672700116D7D /* modexp.cl */; };
+ 6A8A795F2A89672700116D7D /* verify.cl in Sources */ = {isa = PBXBuildFile; fileRef = 6A8A795E2A89672700116D7D /* verify.cl */; };
6AF7487A2ADADEBD00D58E08 /* lib-gpu-verify.c in Sources */ = {isa = PBXBuildFile; fileRef = 6AF748792ADADEBD00D58E08 /* lib-gpu-verify.c */; };
6AF748822ADADF4500D58E08 /* big-int-test.c in Sources */ = {isa = PBXBuildFile; fileRef = 6AF7487D2ADADF4500D58E08 /* big-int-test.c */; };
6AF748832ADADF4500D58E08 /* rsa-test.c in Sources */ = {isa = PBXBuildFile; fileRef = 6AF7487F2ADADF4500D58E08 /* rsa-test.c */; };
@@ -31,7 +31,7 @@
/* Begin PBXFileReference section */
466E0F5F0C932E1A00ED01DB /* lib-gpu-verify */ = {isa = PBXFileReference; explicitFileType = "compiled.mach-o.executable"; includeInIndex = 0; path = "lib-gpu-verify"; sourceTree = BUILT_PRODUCTS_DIR; };
6A8A795C2A89357400116D7D /* rsa-kernel.cl */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.opencl; path = "rsa-kernel.cl"; sourceTree = "<group>"; };
- 6A8A795E2A89672700116D7D /* modexp.cl */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.opencl; path = modexp.cl; sourceTree = "<group>"; };
+ 6A8A795E2A89672700116D7D /* verify.cl */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.opencl; path = verify.cl; sourceTree = "<group>"; };
6AF748792ADADEBD00D58E08 /* lib-gpu-verify.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "lib-gpu-verify.c"; path = "../source/lib-gpu-verify.c"; sourceTree = "<group>"; };
6AF7487B2ADADF4500D58E08 /* big-int-test.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "big-int-test.h"; path = "../source/big-int-test.h"; sourceTree = "<group>"; };
6AF7487D2ADADF4500D58E08 /* big-int-test.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "big-int-test.c"; path = "../source/big-int-test.c"; sourceTree = "<group>"; };
@@ -86,7 +86,7 @@
children = (
6A984F162AC5B18A00F530FD /* Headers */,
6A8A795C2A89357400116D7D /* rsa-kernel.cl */,
- 6A8A795E2A89672700116D7D /* modexp.cl */,
+ 6A8A795E2A89672700116D7D /* verify.cl */,
6AF748792ADADEBD00D58E08 /* lib-gpu-verify.c */,
6AF7487D2ADADF4500D58E08 /* big-int-test.c */,
6AF7487F2ADADF4500D58E08 /* rsa-test.c */,
@@ -159,7 +159,7 @@
files = (
6AF7487A2ADADEBD00D58E08 /* lib-gpu-verify.c in Sources */,
6A8A795D2A89357400116D7D /* rsa-kernel.cl in Sources */,
- 6A8A795F2A89672700116D7D /* modexp.cl in Sources */,
+ 6A8A795F2A89672700116D7D /* verify.cl in Sources */,
6AF748832ADADF4500D58E08 /* rsa-test.c in Sources */,
6AF748862ADADFAD00D58E08 /* opencl-test.c in Sources */,
6AF748822ADADF4500D58E08 /* big-int-test.c in Sources */,
diff --git a/xcode/lib-gpu-verify.xcodeproj/project.xcworkspace/xcuserdata/cedriczwahlen.xcuserdatad/UserInterfaceState.xcuserstate b/xcode/lib-gpu-verify.xcodeproj/project.xcworkspace/xcuserdata/cedriczwahlen.xcuserdatad/UserInterfaceState.xcuserstate
Binary files differ.
diff --git a/xcode/lib-gpu-verify.xcodeproj/xcshareddata/xcschemes/lib-gpu-verify.xcscheme b/xcode/lib-gpu-verify.xcodeproj/xcshareddata/xcschemes/lib-gpu-verify.xcscheme
@@ -35,7 +35,7 @@
selectedLauncherIdentifier = "Xcode.DebuggerFoundation.Launcher.LLDB"
launchStyle = "0"
useCustomWorkingDirectory = "YES"
- customWorkingDirectory = "/Users/cedriczwahlen/Desktop/OpenCL_Hello_World_Example"
+ customWorkingDirectory = "/Users/cedriczwahlen/libgpuverify/xcode"
ignoresPersistentStateOnLaunch = "NO"
debugDocumentVersioning = "YES"
debugServiceExtension = "internal"
diff --git a/xcode/lib-gpu-verify.xcodeproj/xcuserdata/cedriczwahlen.xcuserdatad/xcdebugger/Breakpoints_v2.xcbkptlist b/xcode/lib-gpu-verify.xcodeproj/xcuserdata/cedriczwahlen.xcuserdatad/xcdebugger/Breakpoints_v2.xcbkptlist
@@ -680,5 +680,147 @@
landmarkType = "9">
</BreakpointContent>
</BreakpointProxy>
+ <BreakpointProxy
+ BreakpointExtensionID = "Xcode.Breakpoint.FileBreakpoint">
+ <BreakpointContent
+ uuid = "E34A3BBB-4BEA-4FC9-B0F1-55FB3A180116"
+ shouldBeEnabled = "Yes"
+ ignoreCount = "0"
+ continueAfterRunningActions = "No"
+ filePath = "../source/rsa-test.c"
+ startingColumnNumber = "9223372036854775807"
+ endingColumnNumber = "9223372036854775807"
+ startingLineNumber = "397"
+ endingLineNumber = "397"
+ landmarkName = "rsa_tests()"
+ landmarkType = "9">
+ <Locations>
+ <Location
+ uuid = "E34A3BBB-4BEA-4FC9-B0F1-55FB3A180116 - b0b9078e770c93b0"
+ shouldBeEnabled = "Yes"
+ ignoreCount = "0"
+ continueAfterRunningActions = "No"
+ symbolName = "rsa_tests"
+ moduleName = "lib-gpu-verify"
+ usesParentBreakpointCondition = "Yes"
+ urlString = "file:///Users/cedriczwahlen/libgpuverify/source/rsa-test.c"
+ startingColumnNumber = "9223372036854775807"
+ endingColumnNumber = "9223372036854775807"
+ startingLineNumber = "402"
+ endingLineNumber = "402"
+ offsetFromSymbolStart = "4114">
+ </Location>
+ <Location
+ uuid = "E34A3BBB-4BEA-4FC9-B0F1-55FB3A180116 - b0b9078e770c934d"
+ shouldBeEnabled = "Yes"
+ ignoreCount = "0"
+ continueAfterRunningActions = "No"
+ symbolName = "rsa_tests"
+ moduleName = "lib-gpu-verify"
+ usesParentBreakpointCondition = "Yes"
+ urlString = "file:///Users/cedriczwahlen/libgpuverify/source/rsa-test.c"
+ startingColumnNumber = "9223372036854775807"
+ endingColumnNumber = "9223372036854775807"
+ startingLineNumber = "397"
+ endingLineNumber = "397"
+ offsetFromSymbolStart = "4149">
+ </Location>
+ <Location
+ uuid = "E34A3BBB-4BEA-4FC9-B0F1-55FB3A180116 - b0b9078e770c934d"
+ shouldBeEnabled = "Yes"
+ ignoreCount = "0"
+ continueAfterRunningActions = "No"
+ symbolName = "rsa_tests"
+ moduleName = "lib-gpu-verify"
+ usesParentBreakpointCondition = "Yes"
+ urlString = "file:///Users/cedriczwahlen/libgpuverify/source/rsa-test.c"
+ startingColumnNumber = "9223372036854775807"
+ endingColumnNumber = "9223372036854775807"
+ startingLineNumber = "397"
+ endingLineNumber = "397"
+ offsetFromSymbolStart = "4122">
+ </Location>
+ <Location
+ uuid = "E34A3BBB-4BEA-4FC9-B0F1-55FB3A180116 - b0b9078e770c934d"
+ shouldBeEnabled = "Yes"
+ ignoreCount = "0"
+ continueAfterRunningActions = "No"
+ symbolName = "rsa_tests"
+ moduleName = "lib-gpu-verify"
+ usesParentBreakpointCondition = "Yes"
+ urlString = "file:///Users/cedriczwahlen/libgpuverify/source/rsa-test.c"
+ startingColumnNumber = "9223372036854775807"
+ endingColumnNumber = "9223372036854775807"
+ startingLineNumber = "397"
+ endingLineNumber = "397"
+ offsetFromSymbolStart = "4098">
+ </Location>
+ </Locations>
+ </BreakpointContent>
+ </BreakpointProxy>
+ <BreakpointProxy
+ BreakpointExtensionID = "Xcode.Breakpoint.FileBreakpoint">
+ <BreakpointContent
+ uuid = "8CC35C0B-FC9A-4FA3-841F-C77B239866FA"
+ shouldBeEnabled = "No"
+ ignoreCount = "0"
+ continueAfterRunningActions = "No"
+ filePath = "../source/rsa-test.c"
+ startingColumnNumber = "9223372036854775807"
+ endingColumnNumber = "9223372036854775807"
+ startingLineNumber = "316"
+ endingLineNumber = "316"
+ landmarkName = "rsa_tests()"
+ landmarkType = "9">
+ <Locations>
+ <Location
+ uuid = "8CC35C0B-FC9A-4FA3-841F-C77B239866FA - b0b9078e770c98da"
+ shouldBeEnabled = "Yes"
+ ignoreCount = "0"
+ continueAfterRunningActions = "No"
+ symbolName = "rsa_tests"
+ moduleName = "lib-gpu-verify"
+ usesParentBreakpointCondition = "Yes"
+ urlString = "file:///Users/cedriczwahlen/libgpuverify/source/rsa-test.c"
+ startingColumnNumber = "9223372036854775807"
+ endingColumnNumber = "9223372036854775807"
+ startingLineNumber = "316"
+ endingLineNumber = "316"
+ offsetFromSymbolStart = "2796">
+ </Location>
+ <Location
+ uuid = "8CC35C0B-FC9A-4FA3-841F-C77B239866FA - b0b9078e770c98da"
+ shouldBeEnabled = "Yes"
+ ignoreCount = "0"
+ continueAfterRunningActions = "No"
+ symbolName = "rsa_tests"
+ moduleName = "lib-gpu-verify"
+ usesParentBreakpointCondition = "Yes"
+ urlString = "file:///Users/cedriczwahlen/libgpuverify/source/rsa-test.c"
+ startingColumnNumber = "9223372036854775807"
+ endingColumnNumber = "9223372036854775807"
+ startingLineNumber = "316"
+ endingLineNumber = "316"
+ offsetFromSymbolStart = "2782">
+ </Location>
+ </Locations>
+ </BreakpointContent>
+ </BreakpointProxy>
+ <BreakpointProxy
+ BreakpointExtensionID = "Xcode.Breakpoint.FileBreakpoint">
+ <BreakpointContent
+ uuid = "0B50C23D-36DE-4C9C-B911-72E48EA8C7FD"
+ shouldBeEnabled = "Yes"
+ ignoreCount = "0"
+ continueAfterRunningActions = "No"
+ filePath = "../source/rsa-test.c"
+ startingColumnNumber = "9223372036854775807"
+ endingColumnNumber = "9223372036854775807"
+ startingLineNumber = "382"
+ endingLineNumber = "382"
+ landmarkName = "rsa_tests()"
+ landmarkType = "9">
+ </BreakpointContent>
+ </BreakpointProxy>
</Breakpoints>
</Bucket>
diff --git a/xcode/modexp.cl b/xcode/modexp.cl
@@ -1,5 +0,0 @@
-__kernel void verify() {
-
-}
-
-
diff --git a/xcode/verify.cl b/xcode/verify.cl
@@ -0,0 +1,996 @@
+
+// macros
+
+#define mpDESTROY(b, n) do{if(b)mpSetZero(b,n);}while(0)
+#define max(a,b) (((a) > (b)) ? (a) : (b))
+
+// only for that string conversion
+#define ALLOC_BYTES(b,n) do{assert((n)<=sizeof((b)));zeroise_bytes((b),(n));}while(0)
+#define FREE_BYTES(b,n) zeroise_bytes((b),(n))
+
+
+#define MAX_DIGIT 0xFFFFFFFFUL
+#define MAX_HALF_DIGIT 0xFFFFUL /* NB 'L' */
+#define BITS_PER_DIGIT 32
+#define HIBITMASK 0x80000000UL
+
+#define MAX_FIXED_BIT_LENGTH 8192
+#define MAX_FIXED_DIGITS ((MAX_FIXED_BIT_LENGTH + BITS_PER_DIGIT - 1) / BITS_PER_DIGIT)
+
+#define MAX_ALLOC_SIZE (MAX_FIXED_DIGITS*BYTES_PER_DIGIT)
+
+#define BYTES_PER_DIGIT (BITS_PER_DIGIT / 8)
+
+#define PRIuBIGD PRIu32
+#define PRIxBIGD PRIx32
+#define PRIXBIGD PRIX32
+
+/* MACROS TO DO MODULAR SQUARING AND MULTIPLICATION USING PRE-ALLOCATED TEMPS */
+/* Required lengths |y|=|t1|=|t2|=2*n, |m|=n; but final |y|=n */
+/* Square: y = (y * y) mod m */
+#define mpMODSQUARETEMP(y,m,n,t1,t2) do{mpSquare(t1,y,n);mpDivide(t2,y,t1,n*2,m,n);}while(0)
+/* Mult: y = (y * x) mod m */
+#define mpMODMULTTEMP(y,x,m,n,t1,t2) do{mpMultiply(t1,x,y,n);mpDivide(t2,y,t1,n*2,m,n);}while(0)
+
+#define mpNEXTBITMASK(mask, n) do{if(mask==1){mask=HIBITMASK;n--;}else{mask>>=1;}}while(0)
+
+typedef uint DIGIT_T;
+
+typedef uint16 HALF_DIGIT_T;
+
+
+// forward definitions
+
+int mpModulo(DIGIT_T r[], const DIGIT_T u[], size_t udigits, DIGIT_T v[], size_t vdigits);
+
+int mpModMult(DIGIT_T a[], const DIGIT_T x[], const DIGIT_T y[], DIGIT_T m[], size_t ndigits);
+
+int mpMultiply(DIGIT_T w[], const DIGIT_T u[], const DIGIT_T v[], size_t ndigits);
+DIGIT_T mpAdd(DIGIT_T w[], const DIGIT_T u[], const DIGIT_T v[], size_t ndigits);
+int mpDivide(DIGIT_T q[], DIGIT_T r[], const DIGIT_T u[], size_t udigits, DIGIT_T v[], size_t vdigits);
+int QhatTooBig(DIGIT_T qhat, DIGIT_T rhat, DIGIT_T vn2, DIGIT_T ujn2);
+DIGIT_T mpMultSub(DIGIT_T wn, DIGIT_T w[], const DIGIT_T v[], DIGIT_T q, size_t n);
+DIGIT_T mpShiftLeft(DIGIT_T a[], const DIGIT_T *b, size_t shift, size_t ndigits);
+
+
+void mpSetDigit(DIGIT_T a[], DIGIT_T d, size_t ndigits);
+
+int mpCompare(const DIGIT_T a[], const DIGIT_T b[], size_t ndigits);
+
+
+DIGIT_T mpShiftRight(DIGIT_T a[], const DIGIT_T b[], size_t shift, size_t ndigits);
+int spMultiply(uint p[2], uint x, uint y);
+uint spDivide(uint *pq, uint *pr, const uint u[2], uint v);
+
+int mpSquare(DIGIT_T w[], const DIGIT_T x[], size_t ndigits);
+
+size_t mpBitLength(const DIGIT_T d[], size_t ndigits);
+
+DIGIT_T mpShortDiv(DIGIT_T q[], const DIGIT_T u[], DIGIT_T v,
+ size_t ndigits);
+
+void mpSetEqual(DIGIT_T a[], const DIGIT_T b[], size_t ndigits);
+
+size_t uiceil(float x);
+volatile uint8 zeroise_bytes(volatile void *v, size_t n);
+
+size_t mpSizeof(const DIGIT_T a[], size_t ndigits);
+
+volatile DIGIT_T mpSetZero(volatile DIGIT_T a[], size_t ndigits);
+
+int mpIsZero(const DIGIT_T a[], size_t ndigits);
+
+void mpFail(char *msg);
+
+int mpModExpO(DIGIT_T *yout[], DIGIT_T *x[], DIGIT_T *e[], DIGIT_T *m[], size_t ndigits);
+
+void assert(bool precondition);
+
+
+// implementation
+
+int mpModulo(DIGIT_T r[], const DIGIT_T u[], size_t udigits,
+ DIGIT_T v[], size_t vdigits)
+{
+ /* Computes r = u mod v
+ where r, v are multiprecision integers of length vdigits
+ and u is a multiprecision integer of length udigits.
+ r may overlap v.
+
+ Note that r here is only vdigits long,
+ whereas in mpDivide it is udigits long.
+
+ Use remainder from mpDivide function.
+ */
+
+ size_t nn = max(udigits, vdigits);
+/* Allocate temp storage */
+//#ifdef NO_ALLOCS
+ // [v2.6] increased to two times
+ DIGIT_T qq[MAX_FIXED_DIGITS*2];
+ DIGIT_T rr[MAX_FIXED_DIGITS*2];
+ // assert(nn <= (MAX_FIXED_DIGITS*2));
+/*#else
+ DIGIT_T *qq, *rr;
+ qq = mpAlloc(udigits);
+ rr = mpAlloc(nn);
+#endif
+*/
+
+ /* rr[nn] = u mod v */
+ mpDivide(qq, rr, u, udigits, v, vdigits);
+
+ /* Final r is only vdigits long */
+ mpSetEqual(r, rr, vdigits);
+
+ mpDESTROY(rr, udigits);
+ mpDESTROY(qq, udigits);
+
+ return 0;
+}
+
+int mpModMult(DIGIT_T a[], const DIGIT_T x[], const DIGIT_T y[],
+ DIGIT_T m[], size_t ndigits)
+{ /* Computes a = (x * y) mod m */
+
+/* Double-length temp variable p */
+// #ifdef NO_ALLOCS
+ DIGIT_T p[MAX_FIXED_DIGITS * 2];
+// assert(ndigits <= MAX_FIXED_DIGITS);
+/*#else
+ DIGIT_T *p;
+ p = mpAlloc(ndigits * 2);
+#endif
+*/
+ /* Calc p[2n] = x * y */
+ mpMultiply(p, x, y, ndigits);
+
+ /* Then modulo (NOTE: a is OK at only ndigits long) */
+ mpModulo(a, p, ndigits * 2, m, ndigits);
+
+ mpDESTROY(p, ndigits * 2);
+
+ return 0;
+}
+
+int mpMultiply(DIGIT_T w[], const DIGIT_T u[], const DIGIT_T v[], size_t ndigits)
+{
+ /* Computes product w = u * v
+ where u, v are multiprecision integers of ndigits each
+ and w is a multiprecision integer of 2*ndigits
+
+ Ref: Knuth Vol 2 Ch 4.3.1 p 268 Algorithm M.
+ */
+
+ DIGIT_T k, t[2];
+ size_t i, j, m, n;
+
+ // assert(w != u && w != v);
+
+ m = n = ndigits;
+
+ /* Step M1. Initialise */
+ for (i = 0; i < 2 * m; i++)
+ w[i] = 0;
+
+ for (j = 0; j < n; j++)
+ {
+ /* Step M2. Zero multiplier? */
+ if (v[j] == 0)
+ {
+ w[j + m] = 0;
+ }
+ else
+ {
+ /* Step M3. Initialise i */
+ k = 0;
+ for (i = 0; i < m; i++)
+ {
+ /* Step M4. Multiply and add */
+ /* t = u_i * v_j + w_(i+j) + k */
+ spMultiply(t, u[i], v[j]);
+
+ t[0] += k;
+ if (t[0] < k)
+ t[1]++;
+ t[0] += w[i+j];
+ if (t[0] < w[i+j])
+ t[1]++;
+
+ w[i+j] = t[0];
+ k = t[1];
+ }
+ /* Step M5. Loop on i, set w_(j+m) = k */
+ w[j+m] = k;
+ }
+ } /* Step M6. Loop on j */
+
+ return 0;
+}
+
+DIGIT_T mpAdd(DIGIT_T w[], const DIGIT_T u[], const DIGIT_T v[], size_t ndigits)
+{
+ /* Calculates w = u + v
+ where w, u, v are multiprecision integers of ndigits each
+ Returns carry if overflow. Carry = 0 or 1.
+
+ Ref: Knuth Vol 2 Ch 4.3.1 p 266 Algorithm A.
+ */
+
+ DIGIT_T k;
+ size_t j;
+
+ // assert(w != v);
+
+ /* Step A1. Initialise */
+ k = 0;
+
+ for (j = 0; j < ndigits; j++)
+ {
+ /* Step A2. Add digits w_j = (u_j + v_j + k)
+ Set k = 1 if carry (overflow) occurs
+ */
+ w[j] = u[j] + k;
+ if (w[j] < k)
+ k = 1;
+ else
+ k = 0;
+
+ w[j] += v[j];
+ if (w[j] < v[j])
+ k++;
+
+ } /* Step A3. Loop on j */
+
+ return k; /* w_n = k */
+}
+
+// MARK: This function is causing problems – this function calls mpShiftLeft, mpShiftRight at some point (and so does mpShortDiv) they contain recursions, which are forbidden
+
+int mpDivide(DIGIT_T q[], DIGIT_T r[], const DIGIT_T u[],
+ size_t udigits, DIGIT_T v[], size_t vdigits)
+{ /* Computes quotient q = u / v and remainder r = u mod v
+ where q, r, u are multiple precision digits
+ all of udigits and the divisor v is vdigits.
+
+ Ref: Knuth Vol 2 Ch 4.3.1 p 272 Algorithm D.
+
+ Do without extra storage space, i.e. use r[] for
+ normalised u[], unnormalise v[] at end, and cope with
+ extra digit Uj+n added to u after normalisation.
+
+ WARNING: this trashes q and r first, so cannot do
+ u = u / v or v = u mod v.
+ It also changes v temporarily so cannot make it const.
+ */
+ size_t shift;
+ int n, m, j;
+ DIGIT_T bitmask, overflow;
+ DIGIT_T qhat, rhat, t[2];
+ DIGIT_T *uu, *ww;
+ int qhatOK, cmp;
+
+ /* Clear q and r */
+ mpSetZero(q, udigits);
+ mpSetZero(r, udigits);
+
+ /* Work out exact sizes of u and v */
+ n = (int)mpSizeof(v, vdigits);
+ m = (int)mpSizeof(u, udigits);
+ m -= n;
+
+ /* Catch special cases */
+ if (n == 0)
+ return -1; /* Error: divide by zero */
+
+ if (n == 1)
+ { /* Use short division instead */
+ r[0] = mpShortDiv(q, u, v[0], udigits);
+ return 0;
+ }
+
+ if (m < 0)
+ { /* v > u, so just set q = 0 and r = u */
+ mpSetEqual(r, u, udigits);
+ return 0;
+ }
+
+ if (m == 0)
+ { /* u and v are the same length */
+ cmp = mpCompare(u, v, (size_t)n);
+ if (cmp < 0)
+ { /* v > u, as above */
+ mpSetEqual(r, u, udigits);
+ return 0;
+ }
+ else if (cmp == 0)
+ { /* v == u, so set q = 1 and r = 0 */
+ mpSetDigit(q, 1, udigits);
+ return 0;
+ }
+ }
+
+ /* In Knuth notation, we have:
+ Given
+ u = (Um+n-1 ... U1U0)
+ v = (Vn-1 ... V1V0)
+ Compute
+ q = u/v = (QmQm-1 ... Q0)
+ r = u mod v = (Rn-1 ... R1R0)
+ */
+
+ /* Step D1. Normalise */
+ /* Requires high bit of Vn-1
+ to be set, so find most signif. bit then shift left,
+ i.e. d = 2^shift, u' = u * d, v' = v * d.
+ */
+ bitmask = HIBITMASK;
+ for (shift = 0; shift < BITS_PER_DIGIT; shift++)
+ {
+ if (v[n-1] & bitmask)
+ break;
+ bitmask >>= 1;
+ }
+
+ /* Normalise v in situ - NB only shift non-zero digits */
+ overflow = mpShiftLeft(v, v, shift, n);
+
+ /* Copy normalised dividend u*d into r */
+ overflow = mpShiftLeft(r, u, shift, n + m);
+ uu = r; /* Use ptr to keep notation constant */
+
+ t[0] = overflow; /* Extra digit Um+n */
+
+
+ /* Step D2. Initialise j. Set j = m */
+
+ for (j = m; j >= 0; j--)
+ {
+ /* Step D3. Set Qhat = [(b.Uj+n + Uj+n-1)/Vn-1]
+ and Rhat = remainder */
+ qhatOK = 0;
+ t[1] = t[0]; /* This is Uj+n */
+ t[0] = uu[j+n-1];
+ overflow = spDivide(&qhat, &rhat, t, v[n-1]);
+
+ /* Test Qhat */
+ if (overflow)
+ { /* Qhat == b so set Qhat = b - 1 */
+ qhat = MAX_DIGIT;
+ rhat = uu[j+n-1];
+ rhat += v[n-1];
+ if (rhat < v[n-1]) /* Rhat >= b, so no re-test */
+ qhatOK = 1;
+ }
+ /* [VERSION 2: Added extra test "qhat && "] */
+ if (qhat && !qhatOK && QhatTooBig(qhat, rhat, v[n-2], uu[j+n-2]))
+ { /* If Qhat.Vn-2 > b.Rhat + Uj+n-2
+ decrease Qhat by one, increase Rhat by Vn-1
+ */
+ qhat--;
+ rhat += v[n-1];
+ /* Repeat this test if Rhat < b */
+ if (!(rhat < v[n-1]))
+ if (QhatTooBig(qhat, rhat, v[n-2], uu[j+n-2]))
+ qhat--;
+ }
+
+
+ /* Step D4. Multiply and subtract */
+ ww = &uu[j];
+ overflow = mpMultSub(t[1], ww, v, qhat, (size_t)n);
+
+ /* Step D5. Test remainder. Set Qj = Qhat */
+ q[j] = qhat;
+ if (overflow)
+ { /* Step D6. Add back if D4 was negative */
+ q[j]--;
+ overflow = mpAdd(ww, ww, v, (size_t)n);
+ }
+
+ t[0] = uu[j+n-1]; /* Uj+n on next round */
+
+ } /* Step D7. Loop on j */
+
+ /* Clear high digits in uu */
+ for (j = n; j < m+n; j++)
+ uu[j] = 0;
+
+ /* Step D8. Unnormalise. */
+
+ mpShiftRight(r, r, shift, n);
+ mpShiftRight(v, v, shift, n);
+
+ return 0;
+}
+
+void mpSetDigit(DIGIT_T a[], DIGIT_T d, size_t ndigits)
+{ /* Sets a = d where d is a single digit */
+ size_t i;
+
+ for (i = 1; i < ndigits; i++)
+ {
+ a[i] = 0;
+ }
+ a[0] = d;
+}
+
+DIGIT_T mpShortDiv(DIGIT_T q[], const DIGIT_T u[], DIGIT_T v,
+ size_t ndigits)
+{
+ /* Calculates quotient q = u div v
+ Returns remainder r = u mod v
+ where q, u are multiprecision integers of ndigits each
+ and r, v are single precision digits.
+
+ Makes no assumptions about normalisation.
+
+ Ref: Knuth Vol 2 Ch 4.3.1 Exercise 16 p625
+ */
+ size_t j;
+ DIGIT_T t[2], r;
+ size_t shift;
+ DIGIT_T bitmask, overflow, *uu;
+
+ if (ndigits == 0) return 0;
+ if (v == 0) return 0; /* Divide by zero error */
+
+ /* Normalise first */
+ /* Requires high bit of V
+ to be set, so find most signif. bit then shift left,
+ i.e. d = 2^shift, u' = u * d, v' = v * d.
+ */
+ bitmask = HIBITMASK;
+ for (shift = 0; shift < BITS_PER_DIGIT; shift++)
+ {
+ if (v & bitmask)
+ break;
+ bitmask >>= 1;
+ }
+
+ v <<= shift;
+ overflow = mpShiftLeft(q, u, shift, ndigits);
+ uu = q;
+
+ /* Step S1 - modified for extra digit. */
+ r = overflow; /* New digit Un */
+ j = ndigits;
+ while (j--)
+ {
+ /* Step S2. */
+ t[1] = r;
+ t[0] = uu[j];
+ overflow = spDivide(&q[j], &r, t, v);
+ }
+
+ /* Unnormalise */
+ r >>= shift;
+
+ return r;
+}
+
+int QhatTooBig(DIGIT_T qhat, DIGIT_T rhat,
+ DIGIT_T vn2, DIGIT_T ujn2)
+{ /* Returns true if Qhat is too big
+ i.e. if (Qhat * Vn-2) > (b.Rhat + Uj+n-2)
+ */
+ DIGIT_T t[2];
+
+ spMultiply(t, qhat, vn2);
+ if (t[1] < rhat)
+ return 0;
+ else if (t[1] > rhat)
+ return 1;
+ else if (t[0] > ujn2)
+ return 1;
+
+ return 0;
+}
+
+DIGIT_T mpMultSub(DIGIT_T wn, DIGIT_T w[], const DIGIT_T v[],
+ DIGIT_T q, size_t n)
+{ /* Compute w = w - qv
+ where w = (WnW[n-1]...W[0])
+ return modified Wn.
+ */
+ DIGIT_T k, t[2];
+ size_t i;
+
+ if (q == 0) /* No change */
+ return wn;
+
+ k = 0;
+
+ for (i = 0; i < n; i++)
+ {
+ spMultiply(t, q, v[i]);
+ w[i] -= k;
+ if (w[i] > MAX_DIGIT - k)
+ k = 1;
+ else
+ k = 0;
+ w[i] -= t[0];
+ if (w[i] > MAX_DIGIT - t[0])
+ k++;
+ k += t[1];
+ }
+
+ /* Cope with Wn not stored in array w[0..n-1] */
+ wn -= k;
+
+ return wn;
+}
+
+DIGIT_T mpShiftLeft(DIGIT_T a[], const DIGIT_T *b,
+ size_t shift, size_t ndigits)
+{ /* Computes a = b << shift */
+ /* [v2.1] Modified to cope with shift > BITS_PERDIGIT */
+
+ DIGIT_T carry = 0;
+
+ // this replaces the recursion
+ while (1) {
+
+ size_t i, y, nw, bits;
+ DIGIT_T mask, tempCarry, nextcarry;
+
+ /* Do we shift whole digits? */
+ if (shift >= BITS_PER_DIGIT)
+ {
+ nw = shift / BITS_PER_DIGIT;
+ i = ndigits;
+ while (i--)
+ {
+ if (i >= nw)
+ a[i] = b[i-nw];
+ else
+ a[i] = 0;
+ }
+ /* Call again to shift bits inside digits */
+ bits = shift % BITS_PER_DIGIT;
+ tempCarry = b[ndigits-nw] << bits;
+ if (bits) {
+ carry |= tempCarry;
+ continue;
+ }
+ return carry;
+ }
+ else
+ {
+ bits = shift;
+ }
+
+ /* Construct mask = high bits set */
+ mask = ~(~(DIGIT_T)0 >> bits);
+
+ y = BITS_PER_DIGIT - bits;
+ carry = 0;
+ for (i = 0; i < ndigits; i++)
+ {
+ nextcarry = (b[i] & mask) >> y;
+ a[i] = b[i] << bits | carry;
+ carry = nextcarry;
+ }
+
+ return carry;
+
+ }
+}
+
+DIGIT_T mpShiftRight(DIGIT_T a[], const DIGIT_T b[], size_t shift, size_t ndigits)
+{ /* Computes a = b >> shift */
+ /* [v2.1] Modified to cope with shift > BITS_PERDIGIT */
+
+ DIGIT_T carry = 0;
+
+ while (1) {
+
+ size_t i, y, nw, bits;
+ DIGIT_T mask, tempCarry, nextcarry;
+
+ /* Do we shift whole digits? */
+ if (shift >= BITS_PER_DIGIT)
+ {
+ nw = shift / BITS_PER_DIGIT;
+ for (i = 0; i < ndigits; i++)
+ {
+ if ((i+nw) < ndigits)
+ a[i] = b[i+nw];
+ else
+ a[i] = 0;
+ }
+ /* Call again to shift bits inside digits */
+ bits = shift % BITS_PER_DIGIT;
+ tempCarry = b[nw-1] >> bits;
+ if (bits)
+ carry |= tempCarry;
+ return carry;
+ }
+ else
+ {
+ bits = shift;
+ }
+
+ /* Construct mask to set low bits */
+ /* (thanks to Jesse Chisholm for suggesting this improved technique) */
+ mask = ~(~(DIGIT_T)0 << bits);
+
+ y = BITS_PER_DIGIT - bits;
+ carry = 0;
+ i = ndigits;
+ while (i--)
+ {
+ nextcarry = (b[i] & mask) << y;
+ a[i] = b[i] >> bits | carry;
+ carry = nextcarry;
+ }
+
+ return carry;
+
+ }
+}
+
+
+
+int spMultiply(uint p[2], uint x, uint y)
+{
+ /* Use a 64-bit temp for product */
+ ulong t = (ulong)x * (ulong)y;
+ /* then split into two parts */
+ p[1] = (uint)(t >> 32);
+ p[0] = (uint)(t & 0xFFFFFFFF);
+
+ return 0;
+}
+
+uint spDivide(uint *pq, uint *pr, const uint u[2], uint v)
+{
+ ulong uu, q;
+ uu = (ulong)u[1] << 32 | (ulong)u[0];
+ q = uu / (ulong)v;
+ //r = uu % (uint64_t)v;
+ *pr = (uint)(uu - q * v);
+ *pq = (uint)(q & 0xFFFFFFFF);
+ return (uint)(q >> 32);
+}
+
+int mpCompare(const DIGIT_T a[], const DIGIT_T b[], size_t ndigits)
+{
+ /* if (ndigits == 0) return 0; // deleted [v2.5] */
+
+ while (ndigits--)
+ {
+ if (a[ndigits] > b[ndigits])
+ return 1; /* GT */
+ if (a[ndigits] < b[ndigits])
+ return -1; /* LT */
+ }
+
+ return 0; /* EQ */
+}
+
+void mpSetEqual(DIGIT_T a[], const DIGIT_T b[], size_t ndigits)
+{ /* Sets a = b */
+ size_t i;
+
+ for (i = 0; i < ndigits; i++)
+ {
+ a[i] = b[i];
+ }
+}
+
+volatile DIGIT_T mpSetZero(volatile DIGIT_T a[], size_t ndigits)
+{ /* Sets a = 0 */
+
+ /* Prevent optimiser ignoring this */
+ volatile DIGIT_T optdummy;
+ volatile DIGIT_T *p = a;
+
+ while (ndigits--)
+ a[ndigits] = 0;
+
+ optdummy = *p;
+ return optdummy;
+}
+
+size_t mpSizeof(const DIGIT_T a[], size_t ndigits)
+{
+ while(ndigits--)
+ {
+ if (a[ndigits] != 0)
+ return (++ndigits);
+ }
+ return 0;
+}
+
+volatile uint8 zeroise_bytes(volatile void *v, size_t n)
+{ /* Zeroise byte array b and make sure optimiser does not ignore this */
+ volatile uint8 optdummy;
+ volatile uint8 *b = (uint8*)v;
+ while(n--)
+ b[n] = 0;
+ optdummy = *b;
+ return optdummy;
+}
+
+void mpFail(char *msg)
+{
+ //perror(msg);
+ printf("the program should stop here");
+}
+
+size_t mpBitLength(const DIGIT_T d[], size_t ndigits)
+/* Returns no of significant bits in d */
+{
+ size_t n, i, bits;
+ DIGIT_T mask;
+
+ if (!d || ndigits == 0)
+ return 0;
+
+ n = mpSizeof(d, ndigits);
+ if (0 == n) return 0;
+
+ for (i = 0, mask = HIBITMASK; mask > 0; mask >>= 1, i++)
+ {
+ if (d[n-1] & mask)
+ break;
+ }
+
+ bits = n * BITS_PER_DIGIT - i;
+
+ return bits;
+}
+
+
+void mpModSquareTemp(DIGIT_T *y,DIGIT_T *m,size_t n,DIGIT_T *t1,DIGIT_T *t2) {
+
+ mpSquare(t1,y,n);
+ mpDivide(t2,y,t1,n*2,m,n);
+
+}
+
+void mpModMultTemp(DIGIT_T *y, DIGIT_T *x, DIGIT_T *m, size_t n, DIGIT_T* t1, DIGIT_T *t2) {
+
+ mpMultiply(t1,x,y,n);
+ mpDivide(t2,y,t1,n*2,m,n);
+
+}
+
+
+
+
+
+int mpModExpO(DIGIT_T *yout[], DIGIT_T *x[], DIGIT_T *e[], DIGIT_T *m[], size_t ndigits)
+{ /* Computes y = x^e mod m */
+ /* "Classic" binary left-to-right method */
+ /* [v2.2] removed const restriction on m[] to avoid using an extra alloc'd var
+ (m is changed in-situ during the divide operation then restored) */
+ DIGIT_T mask;
+ size_t n;
+ size_t nn = ndigits * 2;
+ /* Create some double-length temps */
+//#ifdef NO_ALLOCS
+ DIGIT_T t1[MAX_FIXED_DIGITS * 2];
+ DIGIT_T t2[MAX_FIXED_DIGITS * 2];
+ DIGIT_T y[MAX_FIXED_DIGITS * 2];
+ assert(ndigits <= MAX_FIXED_DIGITS);
+/*#else
+ DIGIT_T *t1, *t2, *y;
+ t1 = mpAlloc(nn);
+ t2 = mpAlloc(nn);
+ y = mpAlloc(nn);
+#endif
+ */
+ assert(ndigits != 0);
+
+ n = mpSizeof(*e, ndigits);
+ /* Catch e==0 => x^0=1 */
+ if (0 == n)
+ {
+ mpSetDigit(*yout, 1, ndigits);
+ goto done;
+ }
+ /* Find second-most significant bit in e */
+ for (mask = HIBITMASK; mask > 0; mask >>= 1)
+ {
+ if (*e[n-1] & mask)
+ break;
+ }
+ mpNEXTBITMASK(mask, n);
+
+ /* Set y = x */
+ mpSetEqual(*y, *x, ndigits);
+
+ /* For bit j = k-2 downto 0 */
+ while (n)
+ {
+ /* Square y = y * y mod n */
+ //mpMODSQUARETEMP(*y, *m, ndigits, t1, t2);
+ //mpModSquareTemp(*y, *m, ndigits, t1, t2);
+
+ mpSquare(t1,y,n);
+ mpDivide(t2,y,t1,n*2,m,n);
+
+ if (*e[n-1] & mask)
+ { /* if e(j) == 1 then multiply
+ y = y * x mod n */
+ //mpMODMULTTEMP(*y, *x, *m, ndigits, t1, t2);
+ //mpModMultTemp(*y, *x, *m, ndigits, t1, t2);
+
+ mpMultiply(t1,x,y,n);
+ mpDivide(t2,y,t1,n*2,m,n);
+
+ }
+
+ /* Move to next bit */
+ mpNEXTBITMASK(mask, n);
+ }
+
+ /* Return y */
+ mpSetEqual(*yout, y, ndigits);
+
+done:
+ mpDESTROY(t1, nn);
+ mpDESTROY(t2, nn);
+ mpDESTROY(y, ndigits);
+
+ return 0;
+}
+
+int mpSquare(DIGIT_T w[], const DIGIT_T x[], size_t ndigits)
+/* New in Version 2.0 */
+{
+ /* Computes square w = x * x
+ where x is a multiprecision integer of ndigits
+ and w is a multiprecision integer of 2*ndigits
+
+ Ref: Menezes p596 Algorithm 14.16 with errata.
+ */
+
+ DIGIT_T k, p[2], u[2], cbit, carry;
+ size_t i, j, t, i2, cpos;
+
+ assert(w != x);
+
+ t = ndigits;
+
+ /* 1. For i from 0 to (2t-1) do: w_i = 0 */
+ i2 = t << 1;
+ for (i = 0; i < i2; i++)
+ w[i] = 0;
+
+ carry = 0;
+ cpos = i2-1;
+ /* 2. For i from 0 to (t-1) do: */
+ for (i = 0; i < t; i++)
+ {
+ /* 2.1 (uv) = w_2i + x_i * x_i, w_2i = v, c = u
+ Careful, w_2i may be double-prec
+ */
+ i2 = i << 1; /* 2*i */
+ spMultiply(p, x[i], x[i]);
+ p[0] += w[i2];
+ if (p[0] < w[i2])
+ p[1]++;
+ k = 0; /* p[1] < b, so no overflow here */
+ if (i2 == cpos && carry)
+ {
+ p[1] += carry;
+ if (p[1] < carry)
+ k++;
+ carry = 0;
+ }
+ w[i2] = p[0];
+ u[0] = p[1];
+ u[1] = k;
+
+ /* 2.2 for j from (i+1) to (t-1) do:
+ (uv) = w_{i+j} + 2x_j * x_i + c,
+ w_{i+j} = v, c = u,
+ u is double-prec
+ w_{i+j} is dbl if [i+j] == cpos
+ */
+ k = 0;
+ for (j = i+1; j < t; j++)
+ {
+ /* p = x_j * x_i */
+ spMultiply(p, x[j], x[i]);
+ /* p = 2p <=> p <<= 1 */
+ cbit = (p[0] & HIBITMASK) != 0;
+ k = (p[1] & HIBITMASK) != 0;
+ p[0] <<= 1;
+ p[1] <<= 1;
+ p[1] |= cbit;
+ /* p = p + c */
+ p[0] += u[0];
+ if (p[0] < u[0])
+ {
+ p[1]++;
+ if (p[1] == 0)
+ k++;
+ }
+ p[1] += u[1];
+ if (p[1] < u[1])
+ k++;
+ /* p = p + w_{i+j} */
+ p[0] += w[i+j];
+ if (p[0] < w[i+j])
+ {
+ p[1]++;
+ if (p[1] == 0)
+ k++;
+ }
+ if ((i+j) == cpos && carry)
+ { /* catch overflow from last round */
+ p[1] += carry;
+ if (p[1] < carry)
+ k++;
+ carry = 0;
+ }
+ /* w_{i+j} = v, c = u */
+ w[i+j] = p[0];
+ u[0] = p[1];
+ u[1] = k;
+ }
+ /* 2.3 w_{i+t} = u */
+ w[i+t] = u[0];
+ /* remember overflow in w_{i+t} */
+ carry = u[1];
+ cpos = i+t;
+ }
+
+ /* (NB original step 3 deleted in Menezes errata) */
+
+ /* Return w */
+
+ return 0;
+}
+
+
+
+
+int mpIsZero(const DIGIT_T a[], size_t ndigits)
+{
+ size_t i;
+
+ /* if (ndigits == 0) return -1; // deleted [v2.5] */
+
+ for (i = 0; i < ndigits; i++) /* Start at lsb */
+ {
+ if (a[i] != 0)
+ return 0; /* False */
+ }
+
+ return (!0); /* True */
+}
+
+
+void assert(bool precondition) {
+
+ char str[] = "assert reached, also this message leaks memory";
+
+ if (!precondition)
+ mpFail(str);
+
+
+}
+
+// some might be constants
+__kernel void single(global DIGIT_T* s, const unsigned int s_len,
+ global DIGIT_T* e, const unsigned int e_len,
+ global DIGIT_T* n, const unsigned int n_len,
+ global DIGIT_T* res, const unsigned int res_len,
+ //global DIGIT_T* comp, const unsigned int comp_len,
+ const unsigned int max_len,
+ global int8* valid
+ //const unsigned int count
+ )
+{
+
+
+
+ mpModExpO(&res,&s,&e,&n,max_len);
+
+
+}