The OpenCL kernel compiles, and data can be passed to it. The kernel does not return anything yet. - libgpuverify

commit d50b5b70b89f9fd9a7208fa52c8bf0c9aadb9f92
parent b5c2468c4d2b4b9ee2364e6331cbf968c060023d
Author: Cedric <cedric.zwahlen@students.bfh.ch>
Date:   Wed,  1 Nov 2023 23:33:32 +0100

The OpenCL kernel compiles, and data can be passed to it. The kernel does not return anything yet.

Diffstat:
M source/opencl-test.c  | 42 ++++++++++++++++++------------------------
M source/rsa-test.c  | 313 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------
M source/rsa-test.h  | 2 +-
M xcode/.DS_Store  | 0 
M xcode/lib-gpu-verify.xcodeproj/project.pbxproj  | 8 ++++----
M xcode/lib-gpu-verify.xcodeproj/project.xcworkspace/xcuserdata/cedriczwahlen.xcuserdatad/UserInterfaceState.xcuserstate  | 0 
M xcode/lib-gpu-verify.xcodeproj/xcshareddata/xcschemes/lib-gpu-verify.xcscheme  | 2 +-
M xcode/lib-gpu-verify.xcodeproj/xcuserdata/cedriczwahlen.xcuserdatad/xcdebugger/Breakpoints_v2.xcbkptlist  | 142 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
D xcode/modexp.cl  | 5 -----
A xcode/verify.cl  | 996 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

10 files changed, 1445 insertions(+), 65 deletions(-)
diff --git a/source/opencl-test.c b/source/opencl-test.c
@@ -29,12 +29,10 @@ int opencl_tests(void) {
     cl_mem input;                       // device memory used for the input array
     cl_mem output;                      // device memory used for the output array
     
-    // Fill our data set with random float values
-    //
+   
     int i = 0;
     unsigned int count = DATA_SIZE;
-    for(i = 0; i < count; i++)
-        data[i] = rand() / (float)RAND_MAX;
+    
     
     // Connect to a compute device
     //
@@ -46,15 +44,20 @@ int opencl_tests(void) {
         return EXIT_FAILURE;
     }
     
-    size_t retSize = 0;
-    clGetDeviceInfo(device_id, CL_DRIVER_VERSION, 0, NULL, &retSize);
-
-    char driver_version[retSize];
-    clGetDeviceInfo(device_id, CL_DRIVER_VERSION, retSize, &driver_version, &retSize);
+    size_t retSize_1 = 0;
+    clGetDeviceInfo(device_id, CL_DRIVER_VERSION, 0, NULL, &retSize_1);
+    char driver_version[retSize_1];
+    clGetDeviceInfo(device_id, CL_DRIVER_VERSION, retSize_1, &driver_version, &retSize_1);
+    
+    printf("driver version: %s\n", driver_version);
+    
+    
+    size_t retSize_2 = sizeof(cl_uint);
+    cl_uint address_bits = 0;
+    clGetDeviceInfo(device_id, CL_DEVICE_ADDRESS_BITS, 0, NULL, &retSize_2);
+    clGetDeviceInfo(device_id, CL_DEVICE_ADDRESS_BITS, retSize_2, &address_bits, &retSize_2);
     
-    //char *driver_version;
-    //clGetDeviceInfo(device_id, CL_DRIVER_VERSION, NULL, &driver_version, NULL);
-    printf("%s\n", driver_version);
+    printf("device address bits: %i\n", address_bits);
     
   
     // Create a compute context
@@ -78,7 +81,7 @@ int opencl_tests(void) {
     // get the kernel from a file instead of a constant
     
 
-    FILE *fp = fopen("rsa-kernel.cl", "r");
+    FILE *fp = fopen("./verify.cl", "r");
     fseek(fp, 0L, SEEK_END);
     size_t sz = ftell(fp);
     rewind(fp);
@@ -113,7 +116,7 @@ int opencl_tests(void) {
 
     // Create the compute kernel in the program we wish to run
     //
-    kernel = clCreateKernel(program, "square", &err);
+    kernel = clCreateKernel(program, "single", &err);
     if (!kernel || err != CL_SUCCESS)
     {
         printf("Error: Failed to create compute kernel!\n");
@@ -184,15 +187,7 @@ int opencl_tests(void) {
         exit(1);
     }
     
-    // Validate our results
-    //
-    correct = 0;
-    for(i = 0; i < count; i++)
-    {
-        if(results[i] == data[i] * data[i])
-            correct++;
-    }
-    
+   
     
     // Print a brief summary detailing the results
     //
@@ -207,5 +202,4 @@ int opencl_tests(void) {
     clReleaseCommandQueue(commands);
     clReleaseContext(context);
     
-    
 }
diff --git a/source/rsa-test.c b/source/rsa-test.c
@@ -7,7 +7,7 @@
 
 #include "rsa-test.h"
 #include "big-int-test.h"
-
+#include <OpenCL/opencl.h>
 
 #include "ctype.h"
 #include "time.h"
@@ -16,9 +16,9 @@
 
 #define NEED_LIBGCRYPT_VERSION "1.10.2"
 
+#define DATA_SIZE (1024)
 
-
-void rsa_tests(void) {
+int rsa_tests(void) {
     
     // MARK: UNSAFE init
     
@@ -75,13 +75,8 @@ void rsa_tests(void) {
     // use gcry_rsa_sign  – without padding?
     gcry_sexp_t resSign;
     
-   
-    
     gcry_pk_sign(&resSign, toSign, key);
     
-    
-   // show_sexp("\n", resSign);
-    
     // measure time
     
     // do the same thing with our bigNum library – do we even get the same signature?
@@ -112,21 +107,10 @@ void rsa_tests(void) {
     unsigned char *sgn = malloc(2048);
     gcry_mpi_print(GCRYMPI_FMT_HEX,sgn,2048,&nL,sig_mpi);
     
-  //  printf("%s",n);
-    
-    // use 'real time' – not clock time
-    
-   
-    
-    bigNum_tests(n, val, d);
-    
-    
     struct timespec t1, t2;
   
     clock_gettime(CLOCK_REALTIME, &t1);
     
-   
-    
     if (verify(sgn, e, n, val)) {
         
         printf("\nverification failed\n");
@@ -163,11 +147,287 @@ void rsa_tests(void) {
     
     // try to put it onto the gpu
     
-    return;
+    // MARK: GPU Code
     
-}
+    int err;                            // error code returned from api calls
+      
+    float data[DATA_SIZE];              // original data set given to device
+    float results[DATA_SIZE];           // results returned from device
+    unsigned int correct;               // number of correct results returned
+
+    size_t global;                      // global domain size for our calculation
+    size_t local;                       // local domain size for our calculation
+
+    cl_device_id device_id;             // compute device id
+    cl_context context;                 // compute context
+    cl_command_queue commands;          // compute command queue
+    cl_program program;                 // compute program
+    cl_kernel kernel;                   // compute kernel
+    
+    //cl_mem input;                       // device memory used for the input array
+    //cl_mem output;                      // device memory used for the output array
+    
+   
+    //int i = 0;
+    unsigned int count = DATA_SIZE;
+    
+    
+    // Connect to a compute device
+    //
+    int gpu = 1;
+    err = clGetDeviceIDs(NULL, gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL);
+    if (err != CL_SUCCESS)
+    {
+        printf("Error: Failed to create a device group!\n");
+        return EXIT_FAILURE;
+    }
+    
+    size_t retSize_1 = 0;
+    clGetDeviceInfo(device_id, CL_DRIVER_VERSION, 0, NULL, &retSize_1);
+    char driver_version[retSize_1];
+    clGetDeviceInfo(device_id, CL_DRIVER_VERSION, retSize_1, &driver_version, &retSize_1);
+    
+    printf("driver version: %s\n", driver_version);
+    
+    
+    size_t retSize_2 = sizeof(cl_uint);
+    cl_uint address_bits = 0;
+    clGetDeviceInfo(device_id, CL_DEVICE_ADDRESS_BITS, 0, NULL, &retSize_2);
+    clGetDeviceInfo(device_id, CL_DEVICE_ADDRESS_BITS, retSize_2, &address_bits, &retSize_2);
+    
+    printf("device address bits: %i\n", address_bits);
+    
+  
+    // Create a compute context
+    //
+    context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
+    if (!context)
+    {
+        printf("Error: Failed to create a compute context!\n");
+        return EXIT_FAILURE;
+    }
 
+    // Create a command commands
+    //
+    commands = clCreateCommandQueue(context, device_id, 0, &err);
+    if (!commands)
+    {
+        printf("Error: Failed to create a command commands!\n");
+        return EXIT_FAILURE;
+    }
+    
+    // get the kernel from a file instead of a constant
+    
 
+    FILE *fp = fopen("./verify.cl", "r");
+    fseek(fp, 0L, SEEK_END);
+    size_t sz = ftell(fp);
+    rewind(fp);
+    
+    char *kernelBuf = malloc(sz);
+    fread(kernelBuf, sizeof(char), sz, fp);
+    fclose(fp);
+
+    // Create the compute program from the source buffer
+    //
+    //program = clCreateProgramWithSource(context, 1, (const char **) & KernelSource, NULL, &err);
+    program = clCreateProgramWithSource(context, 1, (const char **) & kernelBuf, NULL, &err);
+    if (!program)
+    {
+        printf("Error: Failed to create compute program!\n");
+        return EXIT_FAILURE;
+    }
+
+    // Build the program executable
+    //
+    err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
+    if (err != CL_SUCCESS)
+    {
+        size_t len;
+        char buffer[2048];
+
+        printf("Error: Failed to build program executable!\n");
+        clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
+        printf("%s\n", buffer);
+        exit(1);
+    }
+
+    // Create the compute kernel in the program we wish to run
+    //
+    kernel = clCreateKernel(program, "single", &err);
+    if (!kernel || err != CL_SUCCESS)
+    {
+        printf("Error: Failed to create compute kernel!\n");
+        exit(1);
+    }
+
+    // Create the input and output arrays in device memory for our calculation
+    //
+    //input = clCreateBuffer(context,  CL_MEM_READ_ONLY,  sizeof(float) * count, NULL, NULL);
+    //output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * count, NULL, NULL);
+    
+    cl_mem s_mem;
+    cl_mem e_mem;
+    cl_mem n_mem;
+    
+    cl_mem res_mem;
+    cl_mem valid; // needs to be a buffer because it goes out
+    
+    DIGIT_T n_buf [MAX_ALLOC_SIZE*2];
+    DIGIT_T e_buf [MAX_ALLOC_SIZE*2];
+    DIGIT_T s_buf [MAX_ALLOC_SIZE*2];
+    
+    DIGIT_T res_buf [MAX_ALLOC_SIZE*2];
+    
+    mpSetZero(n_buf, MAX_ALLOC_SIZE*2);
+    mpSetZero(e_buf, MAX_ALLOC_SIZE*2);
+    mpSetZero(s_buf, MAX_ALLOC_SIZE*2);
+    
+    mpSetZero(res_buf, MAX_ALLOC_SIZE*2);
+  
+    mpConvFromHex(n_buf, strlen(n), n);
+    mpConvFromHex(e_buf, strlen(e), e);
+    mpConvFromHex(s_buf, strlen(sgn), sgn);
+    
+    size_t sz_n = mpSizeof(n_buf, MAX_ALLOC_SIZE*2);
+    size_t sz_s = mpSizeof(s_buf, MAX_ALLOC_SIZE*2);
+    
+    
+    unsigned long s_len = mpSizeof(s_buf, MAX_ALLOC_SIZE*2);
+    unsigned long e_len = mpSizeof(e_buf, MAX_ALLOC_SIZE*2);
+    unsigned long n_len = mpSizeof(n_buf, MAX_ALLOC_SIZE*2);
+    
+    unsigned long res_len = MAX_ALLOC_SIZE*2;
+    
+    unsigned long max_len = max(sz_s,sz_n);
+    
+    n_mem = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(char) * n_len, NULL, NULL);
+    e_mem = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(char) * e_len, NULL, NULL);
+    s_mem = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(char) * s_len, NULL, NULL);
+    
+    res_mem = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(char) * res_len, NULL, NULL);
+    valid = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(int8_t), NULL, NULL);
+    
+    
+    global = 4096;
+    //local = global;
+    
+    
+    if (!s_mem || !e_mem || !n_mem || !res_mem || !valid )
+    {
+        printf("Error: Failed to allocate device memory!\n");
+        exit(1);
+    }
+    
+    
+    // Write our data set into the input array in device memory
+    //
+    err = clEnqueueWriteBuffer(commands, s_mem, CL_TRUE, 0, s_len, s_buf, 0, NULL, NULL);
+    err |= clEnqueueWriteBuffer(commands, e_mem, CL_TRUE, 0, e_len, e_buf, 0, NULL, NULL);
+    err |= clEnqueueWriteBuffer(commands, n_mem, CL_TRUE, 0, n_len, n_buf, 0, NULL, NULL);
+    //err |= clEnqueueWriteBuffer(commands, res_mem, CL_TRUE, 0, res_len, res_buf, 0, NULL, NULL);
+    if (err != CL_SUCCESS)
+    {
+        printf("Error: Failed to write to source array!\n");
+        exit(1);
+    }
+
+    // Set the arguments to our compute kernel
+    //
+    err = 0;
+    err  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &s_mem);
+    err |= clSetKernelArg(kernel, 1, sizeof(unsigned int), &s_len);
+    err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &e_mem);
+    err |= clSetKernelArg(kernel, 3, sizeof(unsigned int), &e_len);
+    err |= clSetKernelArg(kernel, 4, sizeof(cl_mem), &n_mem);
+    err |= clSetKernelArg(kernel, 5, sizeof(unsigned int), &n_len);
+    err |= clSetKernelArg(kernel, 6, sizeof(cl_mem), &res_mem);
+    err |= clSetKernelArg(kernel, 7, sizeof(unsigned int), &res_len);
+    err |= clSetKernelArg(kernel, 8, sizeof(unsigned int), &max_len);
+    err |= clSetKernelArg(kernel, 9, sizeof(cl_mem), &valid);
+    
+    //err |= clSetKernelArg(kernel, 2, sizeof(unsigned int), &count);
+    if (err != CL_SUCCESS)
+    {
+        printf("Error: Failed to set kernel arguments! %d\n", err);
+        exit(1);
+    }
+
+    // Get the maximum work group size for executing the kernel on the device
+    //
+    err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL);
+    if (err != CL_SUCCESS)
+    {
+        printf("Error: Failed to retrieve kernel work group info! %d\n", err);
+        exit(1);
+    }
+
+    // Execute the kernel over the entire range of our 1d input data set
+    // using the maximum number of work group items for this device
+    //
+   
+    err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
+    if (err)
+    {
+        printf("Error: Failed to execute kernel!\n");
+        return EXIT_FAILURE;
+    }
+
+    // Wait for the command commands to get serviced before reading back results
+    //
+    clFinish(commands);
+
+    // Read back the results from the device to verify the output
+    //
+    err = clEnqueueReadBuffer( commands, res_mem, CL_TRUE, 0, res_len, res_buf, 0, NULL, NULL );
+    if (err != CL_SUCCESS)
+    {
+        printf("Error: Failed to read output array! %d\n", err);
+        exit(1);
+    }
+    
+    size_t sz_res = mpSizeof(res_buf, MAX_ALLOC_SIZE*2);
+    
+    int sz_mm = strlen(val) + 2;
+    
+    unsigned char comp[sz_mm];
+    
+    mpConvToHex(res_buf, sz_res, comp, sz_mm);
+   
+    printf("%s",comp);
+    
+    // Print a brief summary detailing the results
+    //
+  //  printf("Computed '%d/%d' correct values!\n", correct, count);
+    
+    // Shutdown and cleanup
+    //
+    clReleaseMemObject(res_mem);
+    clReleaseMemObject(e_mem);
+    clReleaseMemObject(n_mem);
+    clReleaseMemObject(s_mem);
+    
+    clReleaseProgram(program);
+    clReleaseKernel(kernel);
+    clReleaseCommandQueue(commands);
+    clReleaseContext(context);
+    
+    
+    
+    
+    
+    
+    
+    
+    
+    
+    
+    
+    return 0;
+    
+}
+
+/*
 //void bigNum_tests(void) {
 void bigNum_tests(unsigned char* nn,unsigned char* ee,unsigned char* dd) {
     
@@ -185,14 +445,7 @@ void bigNum_tests(unsigned char* nn,unsigned char* ee,unsigned char* dd) {
     mpSetZero(d, MAX_ALLOC_SIZE*2);
     
     mpSetZero(res, MAX_ALLOC_SIZE*2);
-    /*
-    
-    char* nn = "E08973398DD8F5F5E88776397F4EB005BB5383DE0FB7ABDC7DC775290D052E6D12DFA68626D4D26FAA5829FC97ECFA82510F3080BEB1509E4644F12CBBD832CFC6686F07D9B060ACBEEE34096A13F5F7050593DF5EBA3556D961FF197FC981E6F86CEA874070EFAC6D2C749F2DFA553AB9997702A648528C4EF357385774575F";
-    
-    char* ee = "010001";
-    
-    char* dd = "A403C327477634346CA686B57949014B2E8AD2C862B2C7D748096A8B91F736F275D6E8CD15906027314735644D95CD6763CEB49F56AC2F376E1CEE0EBF282DF439906F34D86E085BD5656AD841F313D72D395EFE33CBFF29E4030B3D05A28FB7F18EA27637B07957D32F2BDE8706227D04665EC91BAF8B1AC3EC9144AB7F21";
-    */
+   
     mpConvFromHex(N, strlen(nn), nn);
     mpConvFromHex(e, strlen(ee), ee);
     mpConvFromHex(d, strlen(dd), dd);
@@ -213,7 +466,7 @@ void bigNum_tests(unsigned char* nn,unsigned char* ee,unsigned char* dd) {
     
     
 }
-
+*/
 int verify(unsigned char* sign, unsigned char* ee, unsigned char* nn, unsigned char* mm) {
     
     
diff --git a/source/rsa-test.h b/source/rsa-test.h
@@ -12,7 +12,7 @@
 
 #include <gcrypt.h>
 
-void rsa_tests(void);
+int rsa_tests(void);
 
 static void show_sexp(const char *prefix, gcry_sexp_t a);
 
diff --git a/xcode/.DS_Store b/xcode/.DS_Store
Binary files differ.
diff --git a/xcode/lib-gpu-verify.xcodeproj/project.pbxproj b/xcode/lib-gpu-verify.xcodeproj/project.pbxproj
@@ -8,7 +8,7 @@
 
 /* Begin PBXBuildFile section */
 		6A8A795D2A89357400116D7D /* rsa-kernel.cl in Sources */ = {isa = PBXBuildFile; fileRef = 6A8A795C2A89357400116D7D /* rsa-kernel.cl */; };
-		6A8A795F2A89672700116D7D /* modexp.cl in Sources */ = {isa = PBXBuildFile; fileRef = 6A8A795E2A89672700116D7D /* modexp.cl */; };
+		6A8A795F2A89672700116D7D /* verify.cl in Sources */ = {isa = PBXBuildFile; fileRef = 6A8A795E2A89672700116D7D /* verify.cl */; };
 		6AF7487A2ADADEBD00D58E08 /* lib-gpu-verify.c in Sources */ = {isa = PBXBuildFile; fileRef = 6AF748792ADADEBD00D58E08 /* lib-gpu-verify.c */; };
 		6AF748822ADADF4500D58E08 /* big-int-test.c in Sources */ = {isa = PBXBuildFile; fileRef = 6AF7487D2ADADF4500D58E08 /* big-int-test.c */; };
 		6AF748832ADADF4500D58E08 /* rsa-test.c in Sources */ = {isa = PBXBuildFile; fileRef = 6AF7487F2ADADF4500D58E08 /* rsa-test.c */; };
@@ -31,7 +31,7 @@
 /* Begin PBXFileReference section */
 		466E0F5F0C932E1A00ED01DB /* lib-gpu-verify */ = {isa = PBXFileReference; explicitFileType = "compiled.mach-o.executable"; includeInIndex = 0; path = "lib-gpu-verify"; sourceTree = BUILT_PRODUCTS_DIR; };
 		6A8A795C2A89357400116D7D /* rsa-kernel.cl */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.opencl; path = "rsa-kernel.cl"; sourceTree = "<group>"; };
-		6A8A795E2A89672700116D7D /* modexp.cl */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.opencl; path = modexp.cl; sourceTree = "<group>"; };
+		6A8A795E2A89672700116D7D /* verify.cl */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.opencl; path = verify.cl; sourceTree = "<group>"; };
 		6AF748792ADADEBD00D58E08 /* lib-gpu-verify.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "lib-gpu-verify.c"; path = "../source/lib-gpu-verify.c"; sourceTree = "<group>"; };
 		6AF7487B2ADADF4500D58E08 /* big-int-test.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "big-int-test.h"; path = "../source/big-int-test.h"; sourceTree = "<group>"; };
 		6AF7487D2ADADF4500D58E08 /* big-int-test.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "big-int-test.c"; path = "../source/big-int-test.c"; sourceTree = "<group>"; };
@@ -86,7 +86,7 @@
 			children = (
 				6A984F162AC5B18A00F530FD /* Headers */,
 				6A8A795C2A89357400116D7D /* rsa-kernel.cl */,
-				6A8A795E2A89672700116D7D /* modexp.cl */,
+				6A8A795E2A89672700116D7D /* verify.cl */,
 				6AF748792ADADEBD00D58E08 /* lib-gpu-verify.c */,
 				6AF7487D2ADADF4500D58E08 /* big-int-test.c */,
 				6AF7487F2ADADF4500D58E08 /* rsa-test.c */,
@@ -159,7 +159,7 @@
 			files = (
 				6AF7487A2ADADEBD00D58E08 /* lib-gpu-verify.c in Sources */,
 				6A8A795D2A89357400116D7D /* rsa-kernel.cl in Sources */,
-				6A8A795F2A89672700116D7D /* modexp.cl in Sources */,
+				6A8A795F2A89672700116D7D /* verify.cl in Sources */,
 				6AF748832ADADF4500D58E08 /* rsa-test.c in Sources */,
 				6AF748862ADADFAD00D58E08 /* opencl-test.c in Sources */,
 				6AF748822ADADF4500D58E08 /* big-int-test.c in Sources */,
diff --git a/xcode/lib-gpu-verify.xcodeproj/project.xcworkspace/xcuserdata/cedriczwahlen.xcuserdatad/UserInterfaceState.xcuserstate b/xcode/lib-gpu-verify.xcodeproj/project.xcworkspace/xcuserdata/cedriczwahlen.xcuserdatad/UserInterfaceState.xcuserstate
Binary files differ.
diff --git a/xcode/lib-gpu-verify.xcodeproj/xcshareddata/xcschemes/lib-gpu-verify.xcscheme b/xcode/lib-gpu-verify.xcodeproj/xcshareddata/xcschemes/lib-gpu-verify.xcscheme
@@ -35,7 +35,7 @@
       selectedLauncherIdentifier = "Xcode.DebuggerFoundation.Launcher.LLDB"
       launchStyle = "0"
       useCustomWorkingDirectory = "YES"
-      customWorkingDirectory = "/Users/cedriczwahlen/Desktop/OpenCL_Hello_World_Example"
+      customWorkingDirectory = "/Users/cedriczwahlen/libgpuverify/xcode"
       ignoresPersistentStateOnLaunch = "NO"
       debugDocumentVersioning = "YES"
       debugServiceExtension = "internal"
diff --git a/xcode/lib-gpu-verify.xcodeproj/xcuserdata/cedriczwahlen.xcuserdatad/xcdebugger/Breakpoints_v2.xcbkptlist b/xcode/lib-gpu-verify.xcodeproj/xcuserdata/cedriczwahlen.xcuserdatad/xcdebugger/Breakpoints_v2.xcbkptlist
@@ -680,5 +680,147 @@
             landmarkType = "9">
          </BreakpointContent>
       </BreakpointProxy>
+      <BreakpointProxy
+         BreakpointExtensionID = "Xcode.Breakpoint.FileBreakpoint">
+         <BreakpointContent
+            uuid = "E34A3BBB-4BEA-4FC9-B0F1-55FB3A180116"
+            shouldBeEnabled = "Yes"
+            ignoreCount = "0"
+            continueAfterRunningActions = "No"
+            filePath = "../source/rsa-test.c"
+            startingColumnNumber = "9223372036854775807"
+            endingColumnNumber = "9223372036854775807"
+            startingLineNumber = "397"
+            endingLineNumber = "397"
+            landmarkName = "rsa_tests()"
+            landmarkType = "9">
+            <Locations>
+               <Location
+                  uuid = "E34A3BBB-4BEA-4FC9-B0F1-55FB3A180116 - b0b9078e770c93b0"
+                  shouldBeEnabled = "Yes"
+                  ignoreCount = "0"
+                  continueAfterRunningActions = "No"
+                  symbolName = "rsa_tests"
+                  moduleName = "lib-gpu-verify"
+                  usesParentBreakpointCondition = "Yes"
+                  urlString = "file:///Users/cedriczwahlen/libgpuverify/source/rsa-test.c"
+                  startingColumnNumber = "9223372036854775807"
+                  endingColumnNumber = "9223372036854775807"
+                  startingLineNumber = "402"
+                  endingLineNumber = "402"
+                  offsetFromSymbolStart = "4114">
+               </Location>
+               <Location
+                  uuid = "E34A3BBB-4BEA-4FC9-B0F1-55FB3A180116 - b0b9078e770c934d"
+                  shouldBeEnabled = "Yes"
+                  ignoreCount = "0"
+                  continueAfterRunningActions = "No"
+                  symbolName = "rsa_tests"
+                  moduleName = "lib-gpu-verify"
+                  usesParentBreakpointCondition = "Yes"
+                  urlString = "file:///Users/cedriczwahlen/libgpuverify/source/rsa-test.c"
+                  startingColumnNumber = "9223372036854775807"
+                  endingColumnNumber = "9223372036854775807"
+                  startingLineNumber = "397"
+                  endingLineNumber = "397"
+                  offsetFromSymbolStart = "4149">
+               </Location>
+               <Location
+                  uuid = "E34A3BBB-4BEA-4FC9-B0F1-55FB3A180116 - b0b9078e770c934d"
+                  shouldBeEnabled = "Yes"
+                  ignoreCount = "0"
+                  continueAfterRunningActions = "No"
+                  symbolName = "rsa_tests"
+                  moduleName = "lib-gpu-verify"
+                  usesParentBreakpointCondition = "Yes"
+                  urlString = "file:///Users/cedriczwahlen/libgpuverify/source/rsa-test.c"
+                  startingColumnNumber = "9223372036854775807"
+                  endingColumnNumber = "9223372036854775807"
+                  startingLineNumber = "397"
+                  endingLineNumber = "397"
+                  offsetFromSymbolStart = "4122">
+               </Location>
+               <Location
+                  uuid = "E34A3BBB-4BEA-4FC9-B0F1-55FB3A180116 - b0b9078e770c934d"
+                  shouldBeEnabled = "Yes"
+                  ignoreCount = "0"
+                  continueAfterRunningActions = "No"
+                  symbolName = "rsa_tests"
+                  moduleName = "lib-gpu-verify"
+                  usesParentBreakpointCondition = "Yes"
+                  urlString = "file:///Users/cedriczwahlen/libgpuverify/source/rsa-test.c"
+                  startingColumnNumber = "9223372036854775807"
+                  endingColumnNumber = "9223372036854775807"
+                  startingLineNumber = "397"
+                  endingLineNumber = "397"
+                  offsetFromSymbolStart = "4098">
+               </Location>
+            </Locations>
+         </BreakpointContent>
+      </BreakpointProxy>
+      <BreakpointProxy
+         BreakpointExtensionID = "Xcode.Breakpoint.FileBreakpoint">
+         <BreakpointContent
+            uuid = "8CC35C0B-FC9A-4FA3-841F-C77B239866FA"
+            shouldBeEnabled = "No"
+            ignoreCount = "0"
+            continueAfterRunningActions = "No"
+            filePath = "../source/rsa-test.c"
+            startingColumnNumber = "9223372036854775807"
+            endingColumnNumber = "9223372036854775807"
+            startingLineNumber = "316"
+            endingLineNumber = "316"
+            landmarkName = "rsa_tests()"
+            landmarkType = "9">
+            <Locations>
+               <Location
+                  uuid = "8CC35C0B-FC9A-4FA3-841F-C77B239866FA - b0b9078e770c98da"
+                  shouldBeEnabled = "Yes"
+                  ignoreCount = "0"
+                  continueAfterRunningActions = "No"
+                  symbolName = "rsa_tests"
+                  moduleName = "lib-gpu-verify"
+                  usesParentBreakpointCondition = "Yes"
+                  urlString = "file:///Users/cedriczwahlen/libgpuverify/source/rsa-test.c"
+                  startingColumnNumber = "9223372036854775807"
+                  endingColumnNumber = "9223372036854775807"
+                  startingLineNumber = "316"
+                  endingLineNumber = "316"
+                  offsetFromSymbolStart = "2796">
+               </Location>
+               <Location
+                  uuid = "8CC35C0B-FC9A-4FA3-841F-C77B239866FA - b0b9078e770c98da"
+                  shouldBeEnabled = "Yes"
+                  ignoreCount = "0"
+                  continueAfterRunningActions = "No"
+                  symbolName = "rsa_tests"
+                  moduleName = "lib-gpu-verify"
+                  usesParentBreakpointCondition = "Yes"
+                  urlString = "file:///Users/cedriczwahlen/libgpuverify/source/rsa-test.c"
+                  startingColumnNumber = "9223372036854775807"
+                  endingColumnNumber = "9223372036854775807"
+                  startingLineNumber = "316"
+                  endingLineNumber = "316"
+                  offsetFromSymbolStart = "2782">
+               </Location>
+            </Locations>
+         </BreakpointContent>
+      </BreakpointProxy>
+      <BreakpointProxy
+         BreakpointExtensionID = "Xcode.Breakpoint.FileBreakpoint">
+         <BreakpointContent
+            uuid = "0B50C23D-36DE-4C9C-B911-72E48EA8C7FD"
+            shouldBeEnabled = "Yes"
+            ignoreCount = "0"
+            continueAfterRunningActions = "No"
+            filePath = "../source/rsa-test.c"
+            startingColumnNumber = "9223372036854775807"
+            endingColumnNumber = "9223372036854775807"
+            startingLineNumber = "382"
+            endingLineNumber = "382"
+            landmarkName = "rsa_tests()"
+            landmarkType = "9">
+         </BreakpointContent>
+      </BreakpointProxy>
    </Breakpoints>
 </Bucket>
diff --git a/xcode/modexp.cl b/xcode/modexp.cl
@@ -1,5 +0,0 @@
-__kernel void verify() {
-    
-}
-
-
diff --git a/xcode/verify.cl b/xcode/verify.cl
@@ -0,0 +1,996 @@
+
+// macros
+
+#define mpDESTROY(b, n) do{if(b)mpSetZero(b,n);}while(0)
+#define max(a,b)            (((a) > (b)) ? (a) : (b))
+
+// only for that string conversion
+#define ALLOC_BYTES(b,n) do{assert((n)<=sizeof((b)));zeroise_bytes((b),(n));}while(0)
+#define FREE_BYTES(b,n) zeroise_bytes((b),(n))
+
+
+#define MAX_DIGIT 0xFFFFFFFFUL
+#define MAX_HALF_DIGIT 0xFFFFUL    /* NB 'L' */
+#define BITS_PER_DIGIT 32
+#define HIBITMASK 0x80000000UL
+
+#define MAX_FIXED_BIT_LENGTH 8192
+#define MAX_FIXED_DIGITS ((MAX_FIXED_BIT_LENGTH + BITS_PER_DIGIT - 1) / BITS_PER_DIGIT)
+
+#define MAX_ALLOC_SIZE (MAX_FIXED_DIGITS*BYTES_PER_DIGIT)
+
+#define BYTES_PER_DIGIT (BITS_PER_DIGIT / 8)
+
+#define PRIuBIGD PRIu32
+#define PRIxBIGD PRIx32
+#define PRIXBIGD PRIX32
+
+/* MACROS TO DO MODULAR SQUARING AND MULTIPLICATION USING PRE-ALLOCATED TEMPS */
+/* Required lengths |y|=|t1|=|t2|=2*n, |m|=n; but final |y|=n */
+/* Square: y = (y * y) mod m */
+#define mpMODSQUARETEMP(y,m,n,t1,t2) do{mpSquare(t1,y,n);mpDivide(t2,y,t1,n*2,m,n);}while(0)
+/* Mult:   y = (y * x) mod m */
+#define mpMODMULTTEMP(y,x,m,n,t1,t2) do{mpMultiply(t1,x,y,n);mpDivide(t2,y,t1,n*2,m,n);}while(0)
+
+#define mpNEXTBITMASK(mask, n) do{if(mask==1){mask=HIBITMASK;n--;}else{mask>>=1;}}while(0)
+
+typedef uint DIGIT_T;
+
+typedef uint16 HALF_DIGIT_T;
+
+
+// forward definitions
+
+int mpModulo(DIGIT_T r[], const DIGIT_T u[], size_t udigits, DIGIT_T v[], size_t vdigits);
+
+int mpModMult(DIGIT_T a[], const DIGIT_T x[], const DIGIT_T y[], DIGIT_T m[], size_t ndigits);
+
+int mpMultiply(DIGIT_T w[], const DIGIT_T u[], const DIGIT_T v[], size_t ndigits);
+DIGIT_T mpAdd(DIGIT_T w[], const DIGIT_T u[], const DIGIT_T v[], size_t ndigits);
+int mpDivide(DIGIT_T q[], DIGIT_T r[], const DIGIT_T u[], size_t udigits, DIGIT_T v[], size_t vdigits);
+int QhatTooBig(DIGIT_T qhat, DIGIT_T rhat, DIGIT_T vn2, DIGIT_T ujn2);
+DIGIT_T mpMultSub(DIGIT_T wn, DIGIT_T w[], const DIGIT_T v[], DIGIT_T q, size_t n);
+DIGIT_T mpShiftLeft(DIGIT_T a[], const DIGIT_T *b, size_t shift, size_t ndigits);
+
+
+void mpSetDigit(DIGIT_T a[], DIGIT_T d, size_t ndigits);
+
+int mpCompare(const DIGIT_T a[], const DIGIT_T b[], size_t ndigits);
+
+
+DIGIT_T mpShiftRight(DIGIT_T a[], const DIGIT_T b[], size_t shift, size_t ndigits);
+int spMultiply(uint p[2], uint x, uint y);
+uint spDivide(uint *pq, uint *pr, const uint u[2], uint v);
+
+int mpSquare(DIGIT_T w[], const DIGIT_T x[], size_t ndigits);
+
+size_t mpBitLength(const DIGIT_T d[], size_t ndigits);
+
+DIGIT_T mpShortDiv(DIGIT_T q[], const DIGIT_T u[], DIGIT_T v,
+                   size_t ndigits);
+
+void mpSetEqual(DIGIT_T a[], const DIGIT_T b[], size_t ndigits);
+
+size_t uiceil(float x);
+volatile uint8 zeroise_bytes(volatile void *v, size_t n);
+
+size_t mpSizeof(const DIGIT_T a[], size_t ndigits);
+
+volatile DIGIT_T mpSetZero(volatile DIGIT_T a[], size_t ndigits);
+
+int mpIsZero(const DIGIT_T a[], size_t ndigits);
+
+void mpFail(char *msg);
+
+int mpModExpO(DIGIT_T *yout[], DIGIT_T *x[], DIGIT_T *e[], DIGIT_T *m[], size_t ndigits);
+
+void assert(bool precondition);
+
+
+// implementation
+
+int mpModulo(DIGIT_T r[], const DIGIT_T u[], size_t udigits,
+             DIGIT_T v[], size_t vdigits)
+{
+    /*    Computes r = u mod v
+        where r, v are multiprecision integers of length vdigits
+        and u is a multiprecision integer of length udigits.
+        r may overlap v.
+
+        Note that r here is only vdigits long,
+        whereas in mpDivide it is udigits long.
+
+        Use remainder from mpDivide function.
+    */
+
+    size_t nn = max(udigits, vdigits);
+/* Allocate temp storage */
+//#ifdef NO_ALLOCS
+    // [v2.6] increased to two times
+    DIGIT_T qq[MAX_FIXED_DIGITS*2];
+    DIGIT_T rr[MAX_FIXED_DIGITS*2];
+  //  assert(nn <= (MAX_FIXED_DIGITS*2));
+/*#else
+    DIGIT_T *qq, *rr;
+    qq = mpAlloc(udigits);
+    rr = mpAlloc(nn);
+#endif
+*/
+
+    /* rr[nn] = u mod v */
+    mpDivide(qq, rr, u, udigits, v, vdigits);
+
+    /* Final r is only vdigits long */
+    mpSetEqual(r, rr, vdigits);
+
+    mpDESTROY(rr, udigits);
+    mpDESTROY(qq, udigits);
+
+    return 0;
+}
+
+int mpModMult(DIGIT_T a[], const DIGIT_T x[], const DIGIT_T y[],
+              DIGIT_T m[], size_t ndigits)
+{    /*    Computes a = (x * y) mod m */
+    
+/* Double-length temp variable p */
+// #ifdef NO_ALLOCS
+    DIGIT_T p[MAX_FIXED_DIGITS * 2];
+//    assert(ndigits <= MAX_FIXED_DIGITS);
+/*#else
+    DIGIT_T *p;
+    p = mpAlloc(ndigits * 2);
+#endif
+*/
+    /* Calc p[2n] = x * y */
+    mpMultiply(p, x, y, ndigits);
+
+    /* Then modulo (NOTE: a is OK at only ndigits long) */
+    mpModulo(a, p, ndigits * 2, m, ndigits);
+
+    mpDESTROY(p, ndigits * 2);
+
+    return 0;
+}
+
+int mpMultiply(DIGIT_T w[], const DIGIT_T u[], const DIGIT_T v[], size_t ndigits)
+{
+    /*    Computes product w = u * v
+        where u, v are multiprecision integers of ndigits each
+        and w is a multiprecision integer of 2*ndigits
+
+        Ref: Knuth Vol 2 Ch 4.3.1 p 268 Algorithm M.
+    */
+
+    DIGIT_T k, t[2];
+    size_t i, j, m, n;
+
+   // assert(w != u && w != v);
+
+    m = n = ndigits;
+
+    /* Step M1. Initialise */
+    for (i = 0; i < 2 * m; i++)
+        w[i] = 0;
+
+    for (j = 0; j < n; j++)
+    {
+        /* Step M2. Zero multiplier? */
+        if (v[j] == 0)
+        {
+            w[j + m] = 0;
+        }
+        else
+        {
+            /* Step M3. Initialise i */
+            k = 0;
+            for (i = 0; i < m; i++)
+            {
+                /* Step M4. Multiply and add */
+                /* t = u_i * v_j + w_(i+j) + k */
+                spMultiply(t, u[i], v[j]);
+
+                t[0] += k;
+                if (t[0] < k)
+                    t[1]++;
+                t[0] += w[i+j];
+                if (t[0] < w[i+j])
+                    t[1]++;
+
+                w[i+j] = t[0];
+                k = t[1];
+            }
+            /* Step M5. Loop on i, set w_(j+m) = k */
+            w[j+m] = k;
+        }
+    }    /* Step M6. Loop on j */
+
+    return 0;
+}
+
+DIGIT_T mpAdd(DIGIT_T w[], const DIGIT_T u[], const DIGIT_T v[], size_t ndigits)
+{
+    /*    Calculates w = u + v
+        where w, u, v are multiprecision integers of ndigits each
+        Returns carry if overflow. Carry = 0 or 1.
+
+        Ref: Knuth Vol 2 Ch 4.3.1 p 266 Algorithm A.
+    */
+
+    DIGIT_T k;
+    size_t j;
+
+  //  assert(w != v);
+
+    /* Step A1. Initialise */
+    k = 0;
+
+    for (j = 0; j < ndigits; j++)
+    {
+        /*    Step A2. Add digits w_j = (u_j + v_j + k)
+            Set k = 1 if carry (overflow) occurs
+        */
+        w[j] = u[j] + k;
+        if (w[j] < k)
+            k = 1;
+        else
+            k = 0;
+        
+        w[j] += v[j];
+        if (w[j] < v[j])
+            k++;
+
+    }    /* Step A3. Loop on j */
+
+    return k;    /* w_n = k */
+}
+
+// MARK: This function is causing problems – this function calls mpShiftLeft, mpShiftRight at some point (and so does mpShortDiv) they contain recursions, which are forbidden
+
+int mpDivide(DIGIT_T q[], DIGIT_T r[], const DIGIT_T u[],
+    size_t udigits, DIGIT_T v[], size_t vdigits)
+{    /*    Computes quotient q = u / v and remainder r = u mod v
+        where q, r, u are multiple precision digits
+        all of udigits and the divisor v is vdigits.
+
+        Ref: Knuth Vol 2 Ch 4.3.1 p 272 Algorithm D.
+
+        Do without extra storage space, i.e. use r[] for
+        normalised u[], unnormalise v[] at end, and cope with
+        extra digit Uj+n added to u after normalisation.
+
+        WARNING: this trashes q and r first, so cannot do
+        u = u / v or v = u mod v.
+        It also changes v temporarily so cannot make it const.
+    */
+    size_t shift;
+    int n, m, j;
+    DIGIT_T bitmask, overflow;
+    DIGIT_T qhat, rhat, t[2];
+    DIGIT_T *uu, *ww;
+    int qhatOK, cmp;
+
+    /* Clear q and r */
+    mpSetZero(q, udigits);
+    mpSetZero(r, udigits);
+
+    /* Work out exact sizes of u and v */
+    n = (int)mpSizeof(v, vdigits);
+    m = (int)mpSizeof(u, udigits);
+    m -= n;
+
+    /* Catch special cases */
+    if (n == 0)
+        return -1;    /* Error: divide by zero */
+
+    if (n == 1)
+    {    /* Use short division instead */
+        r[0] = mpShortDiv(q, u, v[0], udigits);
+        return 0;
+    }
+
+    if (m < 0)
+    {    /* v > u, so just set q = 0 and r = u */
+        mpSetEqual(r, u, udigits);
+        return 0;
+    }
+
+    if (m == 0)
+    {    /* u and v are the same length */
+        cmp = mpCompare(u, v, (size_t)n);
+        if (cmp < 0)
+        {    /* v > u, as above */
+            mpSetEqual(r, u, udigits);
+            return 0;
+        }
+        else if (cmp == 0)
+        {    /* v == u, so set q = 1 and r = 0 */
+            mpSetDigit(q, 1, udigits);
+            return 0;
+        }
+    }
+
+    /*    In Knuth notation, we have:
+        Given
+        u = (Um+n-1 ... U1U0)
+        v = (Vn-1 ... V1V0)
+        Compute
+        q = u/v = (QmQm-1 ... Q0)
+        r = u mod v = (Rn-1 ... R1R0)
+    */
+
+    /*    Step D1. Normalise */
+    /*    Requires high bit of Vn-1
+        to be set, so find most signif. bit then shift left,
+        i.e. d = 2^shift, u' = u * d, v' = v * d.
+    */
+    bitmask = HIBITMASK;
+    for (shift = 0; shift < BITS_PER_DIGIT; shift++)
+    {
+        if (v[n-1] & bitmask)
+            break;
+        bitmask >>= 1;
+    }
+
+    /* Normalise v in situ - NB only shift non-zero digits */
+    overflow = mpShiftLeft(v, v, shift, n);
+
+    /* Copy normalised dividend u*d into r */
+    overflow = mpShiftLeft(r, u, shift, n + m);
+    uu = r;    /* Use ptr to keep notation constant */
+
+    t[0] = overflow;    /* Extra digit Um+n */
+
+    
+    /* Step D2. Initialise j. Set j = m */
+    
+    for (j = m; j >= 0; j--)
+    {
+        /* Step D3. Set Qhat = [(b.Uj+n + Uj+n-1)/Vn-1]
+           and Rhat = remainder */
+        qhatOK = 0;
+        t[1] = t[0];    /* This is Uj+n */
+        t[0] = uu[j+n-1];
+        overflow = spDivide(&qhat, &rhat, t, v[n-1]);
+
+        /* Test Qhat */
+        if (overflow)
+        {    /* Qhat == b so set Qhat = b - 1 */
+            qhat = MAX_DIGIT;
+            rhat = uu[j+n-1];
+            rhat += v[n-1];
+            if (rhat < v[n-1])    /* Rhat >= b, so no re-test */
+                qhatOK = 1;
+        }
+        /* [VERSION 2: Added extra test "qhat && "] */
+        if (qhat && !qhatOK && QhatTooBig(qhat, rhat, v[n-2], uu[j+n-2]))
+        {    /* If Qhat.Vn-2 > b.Rhat + Uj+n-2
+               decrease Qhat by one, increase Rhat by Vn-1
+            */
+            qhat--;
+            rhat += v[n-1];
+            /* Repeat this test if Rhat < b */
+            if (!(rhat < v[n-1]))
+                if (QhatTooBig(qhat, rhat, v[n-2], uu[j+n-2]))
+                    qhat--;
+        }
+
+
+        /* Step D4. Multiply and subtract */
+        ww = &uu[j];
+        overflow = mpMultSub(t[1], ww, v, qhat, (size_t)n);
+
+        /* Step D5. Test remainder. Set Qj = Qhat */
+        q[j] = qhat;
+        if (overflow)
+        {    /* Step D6. Add back if D4 was negative */
+            q[j]--;
+            overflow = mpAdd(ww, ww, v, (size_t)n);
+        }
+
+        t[0] = uu[j+n-1];    /* Uj+n on next round */
+
+    }    /* Step D7. Loop on j */
+
+    /* Clear high digits in uu */
+    for (j = n; j < m+n; j++)
+        uu[j] = 0;
+
+    /* Step D8. Unnormalise. */
+
+    mpShiftRight(r, r, shift, n);
+    mpShiftRight(v, v, shift, n);
+
+    return 0;
+}
+
+void mpSetDigit(DIGIT_T a[], DIGIT_T d, size_t ndigits)
+{    /* Sets a = d where d is a single digit */
+    size_t i;
+    
+    for (i = 1; i < ndigits; i++)
+    {
+        a[i] = 0;
+    }
+    a[0] = d;
+}
+
+DIGIT_T mpShortDiv(DIGIT_T q[], const DIGIT_T u[], DIGIT_T v,
+                   size_t ndigits)
+{
+    /*    Calculates quotient q = u div v
+        Returns remainder r = u mod v
+        where q, u are multiprecision integers of ndigits each
+        and r, v are single precision digits.
+
+        Makes no assumptions about normalisation.
+        
+        Ref: Knuth Vol 2 Ch 4.3.1 Exercise 16 p625
+    */
+    size_t j;
+    DIGIT_T t[2], r;
+    size_t shift;
+    DIGIT_T bitmask, overflow, *uu;
+
+    if (ndigits == 0) return 0;
+    if (v == 0)    return 0;    /* Divide by zero error */
+
+    /*    Normalise first */
+    /*    Requires high bit of V
+        to be set, so find most signif. bit then shift left,
+        i.e. d = 2^shift, u' = u * d, v' = v * d.
+    */
+    bitmask = HIBITMASK;
+    for (shift = 0; shift < BITS_PER_DIGIT; shift++)
+    {
+        if (v & bitmask)
+            break;
+        bitmask >>= 1;
+    }
+
+    v <<= shift;
+    overflow = mpShiftLeft(q, u, shift, ndigits);
+    uu = q;
+    
+    /* Step S1 - modified for extra digit. */
+    r = overflow;    /* New digit Un */
+    j = ndigits;
+    while (j--)
+    {
+        /* Step S2. */
+        t[1] = r;
+        t[0] = uu[j];
+        overflow = spDivide(&q[j], &r, t, v);
+    }
+
+    /* Unnormalise */
+    r >>= shift;
+    
+    return r;
+}
+
+int QhatTooBig(DIGIT_T qhat, DIGIT_T rhat,
+                      DIGIT_T vn2, DIGIT_T ujn2)
+{    /*    Returns true if Qhat is too big
+        i.e. if (Qhat * Vn-2) > (b.Rhat + Uj+n-2)
+    */
+    DIGIT_T t[2];
+
+    spMultiply(t, qhat, vn2);
+    if (t[1] < rhat)
+        return 0;
+    else if (t[1] > rhat)
+        return 1;
+    else if (t[0] > ujn2)
+        return 1;
+
+    return 0;
+}
+
+DIGIT_T mpMultSub(DIGIT_T wn, DIGIT_T w[], const DIGIT_T v[],
+                       DIGIT_T q, size_t n)
+{    /*    Compute w = w - qv
+        where w = (WnW[n-1]...W[0])
+        return modified Wn.
+    */
+    DIGIT_T k, t[2];
+    size_t i;
+
+    if (q == 0)    /* No change */
+        return wn;
+
+    k = 0;
+
+    for (i = 0; i < n; i++)
+    {
+        spMultiply(t, q, v[i]);
+        w[i] -= k;
+        if (w[i] > MAX_DIGIT - k)
+            k = 1;
+        else
+            k = 0;
+        w[i] -= t[0];
+        if (w[i] > MAX_DIGIT - t[0])
+            k++;
+        k += t[1];
+    }
+
+    /* Cope with Wn not stored in array w[0..n-1] */
+    wn -= k;
+
+    return wn;
+}
+
+DIGIT_T mpShiftLeft(DIGIT_T a[], const DIGIT_T *b,
+    size_t shift, size_t ndigits)
+{    /* Computes a = b << shift */
+    /* [v2.1] Modified to cope with shift > BITS_PERDIGIT */
+    
+    DIGIT_T carry = 0;
+    
+    // this replaces the recursion
+    while (1) {
+        
+        size_t i, y, nw, bits;
+        DIGIT_T mask, tempCarry, nextcarry;
+
+        /* Do we shift whole digits? */
+        if (shift >= BITS_PER_DIGIT)
+        {
+            nw = shift / BITS_PER_DIGIT;
+            i = ndigits;
+            while (i--)
+            {
+                if (i >= nw)
+                    a[i] = b[i-nw];
+                else
+                    a[i] = 0;
+            }
+            /* Call again to shift bits inside digits */
+            bits = shift % BITS_PER_DIGIT;
+            tempCarry = b[ndigits-nw] << bits;
+            if (bits) {
+                carry |= tempCarry;
+                continue;
+            }
+            return carry;
+        }
+        else
+        {
+            bits = shift;
+        }
+
+        /* Construct mask = high bits set */
+        mask = ~(~(DIGIT_T)0 >> bits);
+        
+        y = BITS_PER_DIGIT - bits;
+        carry = 0;
+        for (i = 0; i < ndigits; i++)
+        {
+            nextcarry = (b[i] & mask) >> y;
+            a[i] = b[i] << bits | carry;
+            carry = nextcarry;
+        }
+
+        return carry;
+        
+    }
+}
+
+DIGIT_T mpShiftRight(DIGIT_T a[], const DIGIT_T b[], size_t shift, size_t ndigits)
+{    /* Computes a = b >> shift */
+    /* [v2.1] Modified to cope with shift > BITS_PERDIGIT */
+    
+    DIGIT_T carry = 0;
+    
+    while (1) {
+       
+        size_t i, y, nw, bits;
+        DIGIT_T mask, tempCarry, nextcarry;
+
+        /* Do we shift whole digits? */
+        if (shift >= BITS_PER_DIGIT)
+        {
+            nw = shift / BITS_PER_DIGIT;
+            for (i = 0; i < ndigits; i++)
+            {
+                if ((i+nw) < ndigits)
+                    a[i] = b[i+nw];
+                else
+                    a[i] = 0;
+            }
+            /* Call again to shift bits inside digits */
+            bits = shift % BITS_PER_DIGIT;
+            tempCarry = b[nw-1] >> bits;
+            if (bits)
+                carry |= tempCarry;
+            return carry;
+        }
+        else
+        {
+            bits = shift;
+        }
+
+        /* Construct mask to set low bits */
+        /* (thanks to Jesse Chisholm for suggesting this improved technique) */
+        mask = ~(~(DIGIT_T)0 << bits);
+        
+        y = BITS_PER_DIGIT - bits;
+        carry = 0;
+        i = ndigits;
+        while (i--)
+        {
+            nextcarry = (b[i] & mask) << y;
+            a[i] = b[i] >> bits | carry;
+            carry = nextcarry;
+        }
+
+        return carry;
+        
+    }
+}
+
+
+
+int spMultiply(uint p[2], uint x, uint y)
+{
+    /* Use a 64-bit temp for product */
+    ulong t = (ulong)x * (ulong)y;
+    /* then split into two parts */
+    p[1] = (uint)(t >> 32);
+    p[0] = (uint)(t & 0xFFFFFFFF);
+
+    return 0;
+}
+
+uint spDivide(uint *pq, uint *pr, const uint u[2], uint v)
+{
+    ulong uu, q;
+    uu = (ulong)u[1] << 32 | (ulong)u[0];
+    q = uu / (ulong)v;
+    //r = uu % (uint64_t)v;
+    *pr = (uint)(uu - q * v);
+    *pq = (uint)(q & 0xFFFFFFFF);
+    return (uint)(q >> 32);
+}
+
+int mpCompare(const DIGIT_T a[], const DIGIT_T b[], size_t ndigits)
+{
+    /* if (ndigits == 0) return 0; // deleted [v2.5] */
+
+    while (ndigits--)
+    {
+        if (a[ndigits] > b[ndigits])
+            return 1;    /* GT */
+        if (a[ndigits] < b[ndigits])
+            return -1;    /* LT */
+    }
+
+    return 0;    /* EQ */
+}
+
+void mpSetEqual(DIGIT_T a[], const DIGIT_T b[], size_t ndigits)
+{    /* Sets a = b */
+    size_t i;
+    
+    for (i = 0; i < ndigits; i++)
+    {
+        a[i] = b[i];
+    }
+}
+
+volatile DIGIT_T mpSetZero(volatile DIGIT_T a[], size_t ndigits)
+{    /* Sets a = 0 */
+
+    /* Prevent optimiser ignoring this */
+    volatile DIGIT_T optdummy;
+    volatile DIGIT_T *p = a;
+
+    while (ndigits--)
+        a[ndigits] = 0;
+    
+    optdummy = *p;
+    return optdummy;
+}
+
+size_t mpSizeof(const DIGIT_T a[], size_t ndigits)
+{
+    while(ndigits--)
+    {
+        if (a[ndigits] != 0)
+            return (++ndigits);
+    }
+    return 0;
+}
+
+volatile uint8 zeroise_bytes(volatile void *v, size_t n)
+{    /* Zeroise byte array b and make sure optimiser does not ignore this */
+    volatile uint8 optdummy;
+    volatile uint8 *b = (uint8*)v;
+    while(n--)
+        b[n] = 0;
+    optdummy = *b;
+    return optdummy;
+}
+
+void mpFail(char *msg)
+{
+    //perror(msg);
+    printf("the program should stop here");
+}
+
+size_t mpBitLength(const DIGIT_T d[], size_t ndigits)
+/* Returns no of significant bits in d */
+{
+    size_t n, i, bits;
+    DIGIT_T mask;
+
+    if (!d || ndigits == 0)
+        return 0;
+
+    n = mpSizeof(d, ndigits);
+    if (0 == n) return 0;
+
+    for (i = 0, mask = HIBITMASK; mask > 0; mask >>= 1, i++)
+    {
+        if (d[n-1] & mask)
+            break;
+    }
+
+    bits = n * BITS_PER_DIGIT - i;
+
+    return bits;
+}
+
+
+void mpModSquareTemp(DIGIT_T *y,DIGIT_T *m,size_t n,DIGIT_T *t1,DIGIT_T *t2) {
+    
+    mpSquare(t1,y,n);
+    mpDivide(t2,y,t1,n*2,m,n);
+    
+}
+
+void mpModMultTemp(DIGIT_T *y, DIGIT_T *x, DIGIT_T *m, size_t n, DIGIT_T* t1, DIGIT_T *t2) {
+    
+    mpMultiply(t1,x,y,n);
+    mpDivide(t2,y,t1,n*2,m,n);
+    
+}
+ 
+ 
+
+
+
+int mpModExpO(DIGIT_T *yout[], DIGIT_T *x[], DIGIT_T *e[], DIGIT_T *m[], size_t ndigits)
+{    /*    Computes y = x^e mod m */
+    /*    "Classic" binary left-to-right method */
+    /*  [v2.2] removed const restriction on m[] to avoid using an extra alloc'd var
+        (m is changed in-situ during the divide operation then restored) */
+    DIGIT_T mask;
+    size_t n;
+    size_t nn = ndigits * 2;
+    /* Create some double-length temps */
+//#ifdef NO_ALLOCS
+    DIGIT_T t1[MAX_FIXED_DIGITS * 2];
+    DIGIT_T t2[MAX_FIXED_DIGITS * 2];
+    DIGIT_T y[MAX_FIXED_DIGITS * 2];
+    assert(ndigits <= MAX_FIXED_DIGITS);
+/*#else
+    DIGIT_T *t1, *t2, *y;
+    t1 = mpAlloc(nn);
+    t2 = mpAlloc(nn);
+    y  = mpAlloc(nn);
+#endif
+    */
+    assert(ndigits != 0);
+
+    n = mpSizeof(*e, ndigits);
+    /* Catch e==0 => x^0=1 */
+    if (0 == n)
+    {
+        mpSetDigit(*yout, 1, ndigits);
+        goto done;
+    }
+    /* Find second-most significant bit in e */
+    for (mask = HIBITMASK; mask > 0; mask >>= 1)
+    {
+        if (*e[n-1] & mask)
+            break;
+    }
+    mpNEXTBITMASK(mask, n);
+
+    /* Set y = x */
+    mpSetEqual(*y, *x, ndigits);
+
+    /* For bit j = k-2 downto 0 */
+    while (n)
+    {
+        /* Square y = y * y mod n */
+        //mpMODSQUARETEMP(*y, *m, ndigits, t1, t2);
+        //mpModSquareTemp(*y, *m, ndigits, t1, t2);
+        
+        mpSquare(t1,y,n);
+        mpDivide(t2,y,t1,n*2,m,n);
+        
+        if (*e[n-1] & mask)
+        {    /*    if e(j) == 1 then multiply
+                y = y * x mod n */
+            //mpMODMULTTEMP(*y, *x, *m, ndigits, t1, t2);
+            //mpModMultTemp(*y, *x, *m, ndigits, t1, t2);
+            
+            mpMultiply(t1,x,y,n);
+            mpDivide(t2,y,t1,n*2,m,n);
+            
+        }
+        
+        /* Move to next bit */
+       mpNEXTBITMASK(mask, n);
+    }
+
+    /* Return y */
+    mpSetEqual(*yout, y, ndigits);
+
+done:
+    mpDESTROY(t1, nn);
+    mpDESTROY(t2, nn);
+    mpDESTROY(y, ndigits);
+
+    return 0;
+}
+
+int mpSquare(DIGIT_T w[], const DIGIT_T x[], size_t ndigits)
+/* New in Version 2.0 */
+{
+    /*    Computes square w = x * x
+        where x is a multiprecision integer of ndigits
+        and w is a multiprecision integer of 2*ndigits
+
+        Ref: Menezes p596 Algorithm 14.16 with errata.
+    */
+
+    DIGIT_T k, p[2], u[2], cbit, carry;
+    size_t i, j, t, i2, cpos;
+
+    assert(w != x);
+
+    t = ndigits;
+
+    /* 1. For i from 0 to (2t-1) do: w_i = 0 */
+    i2 = t << 1;
+    for (i = 0; i < i2; i++)
+        w[i] = 0;
+
+    carry = 0;
+    cpos = i2-1;
+    /* 2. For i from 0 to (t-1) do: */
+    for (i = 0; i < t; i++)
+    {
+        /* 2.1 (uv) = w_2i + x_i * x_i, w_2i = v, c = u
+           Careful, w_2i may be double-prec
+        */
+        i2 = i << 1; /* 2*i */
+        spMultiply(p, x[i], x[i]);
+        p[0] += w[i2];
+        if (p[0] < w[i2])
+            p[1]++;
+        k = 0;    /* p[1] < b, so no overflow here */
+        if (i2 == cpos && carry)
+        {
+            p[1] += carry;
+            if (p[1] < carry)
+                k++;
+            carry = 0;
+        }
+        w[i2] = p[0];
+        u[0] = p[1];
+        u[1] = k;
+
+        /* 2.2 for j from (i+1) to (t-1) do:
+           (uv) = w_{i+j} + 2x_j * x_i + c,
+           w_{i+j} = v, c = u,
+           u is double-prec
+           w_{i+j} is dbl if [i+j] == cpos
+        */
+        k = 0;
+        for (j = i+1; j < t; j++)
+        {
+            /* p = x_j * x_i */
+            spMultiply(p, x[j], x[i]);
+            /* p = 2p <=> p <<= 1 */
+            cbit = (p[0] & HIBITMASK) != 0;
+            k =  (p[1] & HIBITMASK) != 0;
+            p[0] <<= 1;
+            p[1] <<= 1;
+            p[1] |= cbit;
+            /* p = p + c */
+            p[0] += u[0];
+            if (p[0] < u[0])
+            {
+                p[1]++;
+                if (p[1] == 0)
+                    k++;
+            }
+            p[1] += u[1];
+            if (p[1] < u[1])
+                k++;
+            /* p = p + w_{i+j} */
+            p[0] += w[i+j];
+            if (p[0] < w[i+j])
+            {
+                p[1]++;
+                if (p[1] == 0)
+                    k++;
+            }
+            if ((i+j) == cpos && carry)
+            {    /* catch overflow from last round */
+                p[1] += carry;
+                if (p[1] < carry)
+                    k++;
+                carry = 0;
+            }
+            /* w_{i+j} = v, c = u */
+            w[i+j] = p[0];
+            u[0] = p[1];
+            u[1] = k;
+        }
+        /* 2.3 w_{i+t} = u */
+        w[i+t] = u[0];
+        /* remember overflow in w_{i+t} */
+        carry = u[1];
+        cpos = i+t;
+    }
+
+    /* (NB original step 3 deleted in Menezes errata) */
+
+    /* Return w */
+
+    return 0;
+}
+
+
+
+
+int mpIsZero(const DIGIT_T a[], size_t ndigits)
+{
+    size_t i;
+
+    /* if (ndigits == 0) return -1; // deleted [v2.5] */
+
+    for (i = 0; i < ndigits; i++)    /* Start at lsb */
+    {
+        if (a[i] != 0)
+            return 0;    /* False */
+    }
+
+    return (!0);    /* True */
+}
+
+
+void assert(bool precondition) {
+    
+    char str[] = "assert reached, also this message leaks memory";
+    
+    if (!precondition)
+        mpFail(str);
+    
+    
+}
+
+// some might be constants
+__kernel void single(global DIGIT_T* s, const unsigned int s_len,
+                     global DIGIT_T* e, const unsigned int e_len,
+                     global DIGIT_T* n, const unsigned int n_len,
+                     global DIGIT_T* res, const unsigned int res_len,
+                     //global DIGIT_T* comp, const unsigned int comp_len,
+                     const unsigned int max_len,
+                     global int8* valid
+                     //const unsigned int count
+                     )
+{
+    
+    
+    
+    mpModExpO(&res,&s,&e,&n,max_len);
+
+    
+}

	libgpuverify Signature verification on GPUs (WiP)
	Log \| Files \| Refs \| README \| LICENSE

M	source/opencl-test.c	\|	42	++++++++++++++++++------------------------
M	source/rsa-test.c	\|	313	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------
M	source/rsa-test.h	\|	2	+-
M	xcode/.DS_Store	\|	0
M	xcode/lib-gpu-verify.xcodeproj/project.pbxproj	\|	8	++++----
M	xcode/lib-gpu-verify.xcodeproj/project.xcworkspace/xcuserdata/cedriczwahlen.xcuserdatad/UserInterfaceState.xcuserstate	\|	0
M	xcode/lib-gpu-verify.xcodeproj/xcshareddata/xcschemes/lib-gpu-verify.xcscheme	\|	2	+-
M	xcode/lib-gpu-verify.xcodeproj/xcuserdata/cedriczwahlen.xcuserdatad/xcdebugger/Breakpoints_v2.xcbkptlist	\|	142	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
D	xcode/modexp.cl	\|	5	-----
A	xcode/verify.cl	\|	996	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++