libgpuverify

Signature verification on GPUs (WiP)
Log | Files | Refs | README | LICENSE

commit d50b5b70b89f9fd9a7208fa52c8bf0c9aadb9f92
parent b5c2468c4d2b4b9ee2364e6331cbf968c060023d
Author: Cedric <cedric.zwahlen@students.bfh.ch>
Date:   Wed,  1 Nov 2023 23:33:32 +0100

The OpenCL kernel compiles, and data can be passed to it. The kernel does not return anything yet.

Diffstat:
Msource/opencl-test.c | 42++++++++++++++++++------------------------
Msource/rsa-test.c | 313+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------
Msource/rsa-test.h | 2+-
Mxcode/.DS_Store | 0
Mxcode/lib-gpu-verify.xcodeproj/project.pbxproj | 8++++----
Mxcode/lib-gpu-verify.xcodeproj/project.xcworkspace/xcuserdata/cedriczwahlen.xcuserdatad/UserInterfaceState.xcuserstate | 0
Mxcode/lib-gpu-verify.xcodeproj/xcshareddata/xcschemes/lib-gpu-verify.xcscheme | 2+-
Mxcode/lib-gpu-verify.xcodeproj/xcuserdata/cedriczwahlen.xcuserdatad/xcdebugger/Breakpoints_v2.xcbkptlist | 142+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Dxcode/modexp.cl | 5-----
Axcode/verify.cl | 996+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
10 files changed, 1445 insertions(+), 65 deletions(-)

diff --git a/source/opencl-test.c b/source/opencl-test.c @@ -29,12 +29,10 @@ int opencl_tests(void) { cl_mem input; // device memory used for the input array cl_mem output; // device memory used for the output array - // Fill our data set with random float values - // + int i = 0; unsigned int count = DATA_SIZE; - for(i = 0; i < count; i++) - data[i] = rand() / (float)RAND_MAX; + // Connect to a compute device // @@ -46,15 +44,20 @@ int opencl_tests(void) { return EXIT_FAILURE; } - size_t retSize = 0; - clGetDeviceInfo(device_id, CL_DRIVER_VERSION, 0, NULL, &retSize); - - char driver_version[retSize]; - clGetDeviceInfo(device_id, CL_DRIVER_VERSION, retSize, &driver_version, &retSize); + size_t retSize_1 = 0; + clGetDeviceInfo(device_id, CL_DRIVER_VERSION, 0, NULL, &retSize_1); + char driver_version[retSize_1]; + clGetDeviceInfo(device_id, CL_DRIVER_VERSION, retSize_1, &driver_version, &retSize_1); + + printf("driver version: %s\n", driver_version); + + + size_t retSize_2 = sizeof(cl_uint); + cl_uint address_bits = 0; + clGetDeviceInfo(device_id, CL_DEVICE_ADDRESS_BITS, 0, NULL, &retSize_2); + clGetDeviceInfo(device_id, CL_DEVICE_ADDRESS_BITS, retSize_2, &address_bits, &retSize_2); - //char *driver_version; - //clGetDeviceInfo(device_id, CL_DRIVER_VERSION, NULL, &driver_version, NULL); - printf("%s\n", driver_version); + printf("device address bits: %i\n", address_bits); // Create a compute context @@ -78,7 +81,7 @@ int opencl_tests(void) { // get the kernel from a file instead of a constant - FILE *fp = fopen("rsa-kernel.cl", "r"); + FILE *fp = fopen("./verify.cl", "r"); fseek(fp, 0L, SEEK_END); size_t sz = ftell(fp); rewind(fp); @@ -113,7 +116,7 @@ int opencl_tests(void) { // Create the compute kernel in the program we wish to run // - kernel = clCreateKernel(program, "square", &err); + kernel = clCreateKernel(program, "single", &err); if (!kernel || err != CL_SUCCESS) { printf("Error: Failed to create compute kernel!\n"); @@ -184,15 +187,7 @@ int opencl_tests(void) { exit(1); } - // Validate our results - // - correct = 0; - for(i = 0; i < count; i++) - { - if(results[i] == data[i] * data[i]) - correct++; - } - + // Print a brief summary detailing the results // @@ -207,5 +202,4 @@ int opencl_tests(void) { clReleaseCommandQueue(commands); clReleaseContext(context); - } diff --git a/source/rsa-test.c b/source/rsa-test.c @@ -7,7 +7,7 @@ #include "rsa-test.h" #include "big-int-test.h" - +#include <OpenCL/opencl.h> #include "ctype.h" #include "time.h" @@ -16,9 +16,9 @@ #define NEED_LIBGCRYPT_VERSION "1.10.2" +#define DATA_SIZE (1024) - -void rsa_tests(void) { +int rsa_tests(void) { // MARK: UNSAFE init @@ -75,13 +75,8 @@ void rsa_tests(void) { // use gcry_rsa_sign – without padding? gcry_sexp_t resSign; - - gcry_pk_sign(&resSign, toSign, key); - - // show_sexp("\n", resSign); - // measure time // do the same thing with our bigNum library – do we even get the same signature? @@ -112,21 +107,10 @@ void rsa_tests(void) { unsigned char *sgn = malloc(2048); gcry_mpi_print(GCRYMPI_FMT_HEX,sgn,2048,&nL,sig_mpi); - // printf("%s",n); - - // use 'real time' – not clock time - - - - bigNum_tests(n, val, d); - - struct timespec t1, t2; clock_gettime(CLOCK_REALTIME, &t1); - - if (verify(sgn, e, n, val)) { printf("\nverification failed\n"); @@ -163,11 +147,287 @@ void rsa_tests(void) { // try to put it onto the gpu - return; + // MARK: GPU Code -} + int err; // error code returned from api calls + + float data[DATA_SIZE]; // original data set given to device + float results[DATA_SIZE]; // results returned from device + unsigned int correct; // number of correct results returned + + size_t global; // global domain size for our calculation + size_t local; // local domain size for our calculation + + cl_device_id device_id; // compute device id + cl_context context; // compute context + cl_command_queue commands; // compute command queue + cl_program program; // compute program + cl_kernel kernel; // compute kernel + + //cl_mem input; // device memory used for the input array + //cl_mem output; // device memory used for the output array + + + //int i = 0; + unsigned int count = DATA_SIZE; + + + // Connect to a compute device + // + int gpu = 1; + err = clGetDeviceIDs(NULL, gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL); + if (err != CL_SUCCESS) + { + printf("Error: Failed to create a device group!\n"); + return EXIT_FAILURE; + } + + size_t retSize_1 = 0; + clGetDeviceInfo(device_id, CL_DRIVER_VERSION, 0, NULL, &retSize_1); + char driver_version[retSize_1]; + clGetDeviceInfo(device_id, CL_DRIVER_VERSION, retSize_1, &driver_version, &retSize_1); + + printf("driver version: %s\n", driver_version); + + + size_t retSize_2 = sizeof(cl_uint); + cl_uint address_bits = 0; + clGetDeviceInfo(device_id, CL_DEVICE_ADDRESS_BITS, 0, NULL, &retSize_2); + clGetDeviceInfo(device_id, CL_DEVICE_ADDRESS_BITS, retSize_2, &address_bits, &retSize_2); + + printf("device address bits: %i\n", address_bits); + + + // Create a compute context + // + context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); + if (!context) + { + printf("Error: Failed to create a compute context!\n"); + return EXIT_FAILURE; + } + // Create a command commands + // + commands = clCreateCommandQueue(context, device_id, 0, &err); + if (!commands) + { + printf("Error: Failed to create a command commands!\n"); + return EXIT_FAILURE; + } + + // get the kernel from a file instead of a constant + + FILE *fp = fopen("./verify.cl", "r"); + fseek(fp, 0L, SEEK_END); + size_t sz = ftell(fp); + rewind(fp); + + char *kernelBuf = malloc(sz); + fread(kernelBuf, sizeof(char), sz, fp); + fclose(fp); + + // Create the compute program from the source buffer + // + //program = clCreateProgramWithSource(context, 1, (const char **) & KernelSource, NULL, &err); + program = clCreateProgramWithSource(context, 1, (const char **) & kernelBuf, NULL, &err); + if (!program) + { + printf("Error: Failed to create compute program!\n"); + return EXIT_FAILURE; + } + + // Build the program executable + // + err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); + if (err != CL_SUCCESS) + { + size_t len; + char buffer[2048]; + + printf("Error: Failed to build program executable!\n"); + clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); + printf("%s\n", buffer); + exit(1); + } + + // Create the compute kernel in the program we wish to run + // + kernel = clCreateKernel(program, "single", &err); + if (!kernel || err != CL_SUCCESS) + { + printf("Error: Failed to create compute kernel!\n"); + exit(1); + } + + // Create the input and output arrays in device memory for our calculation + // + //input = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * count, NULL, NULL); + //output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * count, NULL, NULL); + + cl_mem s_mem; + cl_mem e_mem; + cl_mem n_mem; + + cl_mem res_mem; + cl_mem valid; // needs to be a buffer because it goes out + + DIGIT_T n_buf [MAX_ALLOC_SIZE*2]; + DIGIT_T e_buf [MAX_ALLOC_SIZE*2]; + DIGIT_T s_buf [MAX_ALLOC_SIZE*2]; + + DIGIT_T res_buf [MAX_ALLOC_SIZE*2]; + + mpSetZero(n_buf, MAX_ALLOC_SIZE*2); + mpSetZero(e_buf, MAX_ALLOC_SIZE*2); + mpSetZero(s_buf, MAX_ALLOC_SIZE*2); + + mpSetZero(res_buf, MAX_ALLOC_SIZE*2); + + mpConvFromHex(n_buf, strlen(n), n); + mpConvFromHex(e_buf, strlen(e), e); + mpConvFromHex(s_buf, strlen(sgn), sgn); + + size_t sz_n = mpSizeof(n_buf, MAX_ALLOC_SIZE*2); + size_t sz_s = mpSizeof(s_buf, MAX_ALLOC_SIZE*2); + + + unsigned long s_len = mpSizeof(s_buf, MAX_ALLOC_SIZE*2); + unsigned long e_len = mpSizeof(e_buf, MAX_ALLOC_SIZE*2); + unsigned long n_len = mpSizeof(n_buf, MAX_ALLOC_SIZE*2); + + unsigned long res_len = MAX_ALLOC_SIZE*2; + + unsigned long max_len = max(sz_s,sz_n); + + n_mem = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(char) * n_len, NULL, NULL); + e_mem = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(char) * e_len, NULL, NULL); + s_mem = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(char) * s_len, NULL, NULL); + + res_mem = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(char) * res_len, NULL, NULL); + valid = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(int8_t), NULL, NULL); + + + global = 4096; + //local = global; + + + if (!s_mem || !e_mem || !n_mem || !res_mem || !valid ) + { + printf("Error: Failed to allocate device memory!\n"); + exit(1); + } + + + // Write our data set into the input array in device memory + // + err = clEnqueueWriteBuffer(commands, s_mem, CL_TRUE, 0, s_len, s_buf, 0, NULL, NULL); + err |= clEnqueueWriteBuffer(commands, e_mem, CL_TRUE, 0, e_len, e_buf, 0, NULL, NULL); + err |= clEnqueueWriteBuffer(commands, n_mem, CL_TRUE, 0, n_len, n_buf, 0, NULL, NULL); + //err |= clEnqueueWriteBuffer(commands, res_mem, CL_TRUE, 0, res_len, res_buf, 0, NULL, NULL); + if (err != CL_SUCCESS) + { + printf("Error: Failed to write to source array!\n"); + exit(1); + } + + // Set the arguments to our compute kernel + // + err = 0; + err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &s_mem); + err |= clSetKernelArg(kernel, 1, sizeof(unsigned int), &s_len); + err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &e_mem); + err |= clSetKernelArg(kernel, 3, sizeof(unsigned int), &e_len); + err |= clSetKernelArg(kernel, 4, sizeof(cl_mem), &n_mem); + err |= clSetKernelArg(kernel, 5, sizeof(unsigned int), &n_len); + err |= clSetKernelArg(kernel, 6, sizeof(cl_mem), &res_mem); + err |= clSetKernelArg(kernel, 7, sizeof(unsigned int), &res_len); + err |= clSetKernelArg(kernel, 8, sizeof(unsigned int), &max_len); + err |= clSetKernelArg(kernel, 9, sizeof(cl_mem), &valid); + + //err |= clSetKernelArg(kernel, 2, sizeof(unsigned int), &count); + if (err != CL_SUCCESS) + { + printf("Error: Failed to set kernel arguments! %d\n", err); + exit(1); + } + + // Get the maximum work group size for executing the kernel on the device + // + err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL); + if (err != CL_SUCCESS) + { + printf("Error: Failed to retrieve kernel work group info! %d\n", err); + exit(1); + } + + // Execute the kernel over the entire range of our 1d input data set + // using the maximum number of work group items for this device + // + + err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global, &local, 0, NULL, NULL); + if (err) + { + printf("Error: Failed to execute kernel!\n"); + return EXIT_FAILURE; + } + + // Wait for the command commands to get serviced before reading back results + // + clFinish(commands); + + // Read back the results from the device to verify the output + // + err = clEnqueueReadBuffer( commands, res_mem, CL_TRUE, 0, res_len, res_buf, 0, NULL, NULL ); + if (err != CL_SUCCESS) + { + printf("Error: Failed to read output array! %d\n", err); + exit(1); + } + + size_t sz_res = mpSizeof(res_buf, MAX_ALLOC_SIZE*2); + + int sz_mm = strlen(val) + 2; + + unsigned char comp[sz_mm]; + + mpConvToHex(res_buf, sz_res, comp, sz_mm); + + printf("%s",comp); + + // Print a brief summary detailing the results + // + // printf("Computed '%d/%d' correct values!\n", correct, count); + + // Shutdown and cleanup + // + clReleaseMemObject(res_mem); + clReleaseMemObject(e_mem); + clReleaseMemObject(n_mem); + clReleaseMemObject(s_mem); + + clReleaseProgram(program); + clReleaseKernel(kernel); + clReleaseCommandQueue(commands); + clReleaseContext(context); + + + + + + + + + + + + + return 0; + +} + +/* //void bigNum_tests(void) { void bigNum_tests(unsigned char* nn,unsigned char* ee,unsigned char* dd) { @@ -185,14 +445,7 @@ void bigNum_tests(unsigned char* nn,unsigned char* ee,unsigned char* dd) { mpSetZero(d, MAX_ALLOC_SIZE*2); mpSetZero(res, MAX_ALLOC_SIZE*2); - /* - - char* nn = "E08973398DD8F5F5E88776397F4EB005BB5383DE0FB7ABDC7DC775290D052E6D12DFA68626D4D26FAA5829FC97ECFA82510F3080BEB1509E4644F12CBBD832CFC6686F07D9B060ACBEEE34096A13F5F7050593DF5EBA3556D961FF197FC981E6F86CEA874070EFAC6D2C749F2DFA553AB9997702A648528C4EF357385774575F"; - - char* ee = "010001"; - - char* dd = "A403C327477634346CA686B57949014B2E8AD2C862B2C7D748096A8B91F736F275D6E8CD15906027314735644D95CD6763CEB49F56AC2F376E1CEE0EBF282DF439906F34D86E085BD5656AD841F313D72D395EFE33CBFF29E4030B3D05A28FB7F18EA27637B07957D32F2BDE8706227D04665EC91BAF8B1AC3EC9144AB7F21"; - */ + mpConvFromHex(N, strlen(nn), nn); mpConvFromHex(e, strlen(ee), ee); mpConvFromHex(d, strlen(dd), dd); @@ -213,7 +466,7 @@ void bigNum_tests(unsigned char* nn,unsigned char* ee,unsigned char* dd) { } - +*/ int verify(unsigned char* sign, unsigned char* ee, unsigned char* nn, unsigned char* mm) { diff --git a/source/rsa-test.h b/source/rsa-test.h @@ -12,7 +12,7 @@ #include <gcrypt.h> -void rsa_tests(void); +int rsa_tests(void); static void show_sexp(const char *prefix, gcry_sexp_t a); diff --git a/xcode/.DS_Store b/xcode/.DS_Store Binary files differ. diff --git a/xcode/lib-gpu-verify.xcodeproj/project.pbxproj b/xcode/lib-gpu-verify.xcodeproj/project.pbxproj @@ -8,7 +8,7 @@ /* Begin PBXBuildFile section */ 6A8A795D2A89357400116D7D /* rsa-kernel.cl in Sources */ = {isa = PBXBuildFile; fileRef = 6A8A795C2A89357400116D7D /* rsa-kernel.cl */; }; - 6A8A795F2A89672700116D7D /* modexp.cl in Sources */ = {isa = PBXBuildFile; fileRef = 6A8A795E2A89672700116D7D /* modexp.cl */; }; + 6A8A795F2A89672700116D7D /* verify.cl in Sources */ = {isa = PBXBuildFile; fileRef = 6A8A795E2A89672700116D7D /* verify.cl */; }; 6AF7487A2ADADEBD00D58E08 /* lib-gpu-verify.c in Sources */ = {isa = PBXBuildFile; fileRef = 6AF748792ADADEBD00D58E08 /* lib-gpu-verify.c */; }; 6AF748822ADADF4500D58E08 /* big-int-test.c in Sources */ = {isa = PBXBuildFile; fileRef = 6AF7487D2ADADF4500D58E08 /* big-int-test.c */; }; 6AF748832ADADF4500D58E08 /* rsa-test.c in Sources */ = {isa = PBXBuildFile; fileRef = 6AF7487F2ADADF4500D58E08 /* rsa-test.c */; }; @@ -31,7 +31,7 @@ /* Begin PBXFileReference section */ 466E0F5F0C932E1A00ED01DB /* lib-gpu-verify */ = {isa = PBXFileReference; explicitFileType = "compiled.mach-o.executable"; includeInIndex = 0; path = "lib-gpu-verify"; sourceTree = BUILT_PRODUCTS_DIR; }; 6A8A795C2A89357400116D7D /* rsa-kernel.cl */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.opencl; path = "rsa-kernel.cl"; sourceTree = "<group>"; }; - 6A8A795E2A89672700116D7D /* modexp.cl */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.opencl; path = modexp.cl; sourceTree = "<group>"; }; + 6A8A795E2A89672700116D7D /* verify.cl */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.opencl; path = verify.cl; sourceTree = "<group>"; }; 6AF748792ADADEBD00D58E08 /* lib-gpu-verify.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "lib-gpu-verify.c"; path = "../source/lib-gpu-verify.c"; sourceTree = "<group>"; }; 6AF7487B2ADADF4500D58E08 /* big-int-test.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "big-int-test.h"; path = "../source/big-int-test.h"; sourceTree = "<group>"; }; 6AF7487D2ADADF4500D58E08 /* big-int-test.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "big-int-test.c"; path = "../source/big-int-test.c"; sourceTree = "<group>"; }; @@ -86,7 +86,7 @@ children = ( 6A984F162AC5B18A00F530FD /* Headers */, 6A8A795C2A89357400116D7D /* rsa-kernel.cl */, - 6A8A795E2A89672700116D7D /* modexp.cl */, + 6A8A795E2A89672700116D7D /* verify.cl */, 6AF748792ADADEBD00D58E08 /* lib-gpu-verify.c */, 6AF7487D2ADADF4500D58E08 /* big-int-test.c */, 6AF7487F2ADADF4500D58E08 /* rsa-test.c */, @@ -159,7 +159,7 @@ files = ( 6AF7487A2ADADEBD00D58E08 /* lib-gpu-verify.c in Sources */, 6A8A795D2A89357400116D7D /* rsa-kernel.cl in Sources */, - 6A8A795F2A89672700116D7D /* modexp.cl in Sources */, + 6A8A795F2A89672700116D7D /* verify.cl in Sources */, 6AF748832ADADF4500D58E08 /* rsa-test.c in Sources */, 6AF748862ADADFAD00D58E08 /* opencl-test.c in Sources */, 6AF748822ADADF4500D58E08 /* big-int-test.c in Sources */, diff --git a/xcode/lib-gpu-verify.xcodeproj/project.xcworkspace/xcuserdata/cedriczwahlen.xcuserdatad/UserInterfaceState.xcuserstate b/xcode/lib-gpu-verify.xcodeproj/project.xcworkspace/xcuserdata/cedriczwahlen.xcuserdatad/UserInterfaceState.xcuserstate Binary files differ. diff --git a/xcode/lib-gpu-verify.xcodeproj/xcshareddata/xcschemes/lib-gpu-verify.xcscheme b/xcode/lib-gpu-verify.xcodeproj/xcshareddata/xcschemes/lib-gpu-verify.xcscheme @@ -35,7 +35,7 @@ selectedLauncherIdentifier = "Xcode.DebuggerFoundation.Launcher.LLDB" launchStyle = "0" useCustomWorkingDirectory = "YES" - customWorkingDirectory = "/Users/cedriczwahlen/Desktop/OpenCL_Hello_World_Example" + customWorkingDirectory = "/Users/cedriczwahlen/libgpuverify/xcode" ignoresPersistentStateOnLaunch = "NO" debugDocumentVersioning = "YES" debugServiceExtension = "internal" diff --git a/xcode/lib-gpu-verify.xcodeproj/xcuserdata/cedriczwahlen.xcuserdatad/xcdebugger/Breakpoints_v2.xcbkptlist b/xcode/lib-gpu-verify.xcodeproj/xcuserdata/cedriczwahlen.xcuserdatad/xcdebugger/Breakpoints_v2.xcbkptlist @@ -680,5 +680,147 @@ landmarkType = "9"> </BreakpointContent> </BreakpointProxy> + <BreakpointProxy + BreakpointExtensionID = "Xcode.Breakpoint.FileBreakpoint"> + <BreakpointContent + uuid = "E34A3BBB-4BEA-4FC9-B0F1-55FB3A180116" + shouldBeEnabled = "Yes" + ignoreCount = "0" + continueAfterRunningActions = "No" + filePath = "../source/rsa-test.c" + startingColumnNumber = "9223372036854775807" + endingColumnNumber = "9223372036854775807" + startingLineNumber = "397" + endingLineNumber = "397" + landmarkName = "rsa_tests()" + landmarkType = "9"> + <Locations> + <Location + uuid = "E34A3BBB-4BEA-4FC9-B0F1-55FB3A180116 - b0b9078e770c93b0" + shouldBeEnabled = "Yes" + ignoreCount = "0" + continueAfterRunningActions = "No" + symbolName = "rsa_tests" + moduleName = "lib-gpu-verify" + usesParentBreakpointCondition = "Yes" + urlString = "file:///Users/cedriczwahlen/libgpuverify/source/rsa-test.c" + startingColumnNumber = "9223372036854775807" + endingColumnNumber = "9223372036854775807" + startingLineNumber = "402" + endingLineNumber = "402" + offsetFromSymbolStart = "4114"> + </Location> + <Location + uuid = "E34A3BBB-4BEA-4FC9-B0F1-55FB3A180116 - b0b9078e770c934d" + shouldBeEnabled = "Yes" + ignoreCount = "0" + continueAfterRunningActions = "No" + symbolName = "rsa_tests" + moduleName = "lib-gpu-verify" + usesParentBreakpointCondition = "Yes" + urlString = "file:///Users/cedriczwahlen/libgpuverify/source/rsa-test.c" + startingColumnNumber = "9223372036854775807" + endingColumnNumber = "9223372036854775807" + startingLineNumber = "397" + endingLineNumber = "397" + offsetFromSymbolStart = "4149"> + </Location> + <Location + uuid = "E34A3BBB-4BEA-4FC9-B0F1-55FB3A180116 - b0b9078e770c934d" + shouldBeEnabled = "Yes" + ignoreCount = "0" + continueAfterRunningActions = "No" + symbolName = "rsa_tests" + moduleName = "lib-gpu-verify" + usesParentBreakpointCondition = "Yes" + urlString = "file:///Users/cedriczwahlen/libgpuverify/source/rsa-test.c" + startingColumnNumber = "9223372036854775807" + endingColumnNumber = "9223372036854775807" + startingLineNumber = "397" + endingLineNumber = "397" + offsetFromSymbolStart = "4122"> + </Location> + <Location + uuid = "E34A3BBB-4BEA-4FC9-B0F1-55FB3A180116 - b0b9078e770c934d" + shouldBeEnabled = "Yes" + ignoreCount = "0" + continueAfterRunningActions = "No" + symbolName = "rsa_tests" + moduleName = "lib-gpu-verify" + usesParentBreakpointCondition = "Yes" + urlString = "file:///Users/cedriczwahlen/libgpuverify/source/rsa-test.c" + startingColumnNumber = "9223372036854775807" + endingColumnNumber = "9223372036854775807" + startingLineNumber = "397" + endingLineNumber = "397" + offsetFromSymbolStart = "4098"> + </Location> + </Locations> + </BreakpointContent> + </BreakpointProxy> + <BreakpointProxy + BreakpointExtensionID = "Xcode.Breakpoint.FileBreakpoint"> + <BreakpointContent + uuid = "8CC35C0B-FC9A-4FA3-841F-C77B239866FA" + shouldBeEnabled = "No" + ignoreCount = "0" + continueAfterRunningActions = "No" + filePath = "../source/rsa-test.c" + startingColumnNumber = "9223372036854775807" + endingColumnNumber = "9223372036854775807" + startingLineNumber = "316" + endingLineNumber = "316" + landmarkName = "rsa_tests()" + landmarkType = "9"> + <Locations> + <Location + uuid = "8CC35C0B-FC9A-4FA3-841F-C77B239866FA - b0b9078e770c98da" + shouldBeEnabled = "Yes" + ignoreCount = "0" + continueAfterRunningActions = "No" + symbolName = "rsa_tests" + moduleName = "lib-gpu-verify" + usesParentBreakpointCondition = "Yes" + urlString = "file:///Users/cedriczwahlen/libgpuverify/source/rsa-test.c" + startingColumnNumber = "9223372036854775807" + endingColumnNumber = "9223372036854775807" + startingLineNumber = "316" + endingLineNumber = "316" + offsetFromSymbolStart = "2796"> + </Location> + <Location + uuid = "8CC35C0B-FC9A-4FA3-841F-C77B239866FA - b0b9078e770c98da" + shouldBeEnabled = "Yes" + ignoreCount = "0" + continueAfterRunningActions = "No" + symbolName = "rsa_tests" + moduleName = "lib-gpu-verify" + usesParentBreakpointCondition = "Yes" + urlString = "file:///Users/cedriczwahlen/libgpuverify/source/rsa-test.c" + startingColumnNumber = "9223372036854775807" + endingColumnNumber = "9223372036854775807" + startingLineNumber = "316" + endingLineNumber = "316" + offsetFromSymbolStart = "2782"> + </Location> + </Locations> + </BreakpointContent> + </BreakpointProxy> + <BreakpointProxy + BreakpointExtensionID = "Xcode.Breakpoint.FileBreakpoint"> + <BreakpointContent + uuid = "0B50C23D-36DE-4C9C-B911-72E48EA8C7FD" + shouldBeEnabled = "Yes" + ignoreCount = "0" + continueAfterRunningActions = "No" + filePath = "../source/rsa-test.c" + startingColumnNumber = "9223372036854775807" + endingColumnNumber = "9223372036854775807" + startingLineNumber = "382" + endingLineNumber = "382" + landmarkName = "rsa_tests()" + landmarkType = "9"> + </BreakpointContent> + </BreakpointProxy> </Breakpoints> </Bucket> diff --git a/xcode/modexp.cl b/xcode/modexp.cl @@ -1,5 +0,0 @@ -__kernel void verify() { - -} - - diff --git a/xcode/verify.cl b/xcode/verify.cl @@ -0,0 +1,996 @@ + +// macros + +#define mpDESTROY(b, n) do{if(b)mpSetZero(b,n);}while(0) +#define max(a,b) (((a) > (b)) ? (a) : (b)) + +// only for that string conversion +#define ALLOC_BYTES(b,n) do{assert((n)<=sizeof((b)));zeroise_bytes((b),(n));}while(0) +#define FREE_BYTES(b,n) zeroise_bytes((b),(n)) + + +#define MAX_DIGIT 0xFFFFFFFFUL +#define MAX_HALF_DIGIT 0xFFFFUL /* NB 'L' */ +#define BITS_PER_DIGIT 32 +#define HIBITMASK 0x80000000UL + +#define MAX_FIXED_BIT_LENGTH 8192 +#define MAX_FIXED_DIGITS ((MAX_FIXED_BIT_LENGTH + BITS_PER_DIGIT - 1) / BITS_PER_DIGIT) + +#define MAX_ALLOC_SIZE (MAX_FIXED_DIGITS*BYTES_PER_DIGIT) + +#define BYTES_PER_DIGIT (BITS_PER_DIGIT / 8) + +#define PRIuBIGD PRIu32 +#define PRIxBIGD PRIx32 +#define PRIXBIGD PRIX32 + +/* MACROS TO DO MODULAR SQUARING AND MULTIPLICATION USING PRE-ALLOCATED TEMPS */ +/* Required lengths |y|=|t1|=|t2|=2*n, |m|=n; but final |y|=n */ +/* Square: y = (y * y) mod m */ +#define mpMODSQUARETEMP(y,m,n,t1,t2) do{mpSquare(t1,y,n);mpDivide(t2,y,t1,n*2,m,n);}while(0) +/* Mult: y = (y * x) mod m */ +#define mpMODMULTTEMP(y,x,m,n,t1,t2) do{mpMultiply(t1,x,y,n);mpDivide(t2,y,t1,n*2,m,n);}while(0) + +#define mpNEXTBITMASK(mask, n) do{if(mask==1){mask=HIBITMASK;n--;}else{mask>>=1;}}while(0) + +typedef uint DIGIT_T; + +typedef uint16 HALF_DIGIT_T; + + +// forward definitions + +int mpModulo(DIGIT_T r[], const DIGIT_T u[], size_t udigits, DIGIT_T v[], size_t vdigits); + +int mpModMult(DIGIT_T a[], const DIGIT_T x[], const DIGIT_T y[], DIGIT_T m[], size_t ndigits); + +int mpMultiply(DIGIT_T w[], const DIGIT_T u[], const DIGIT_T v[], size_t ndigits); +DIGIT_T mpAdd(DIGIT_T w[], const DIGIT_T u[], const DIGIT_T v[], size_t ndigits); +int mpDivide(DIGIT_T q[], DIGIT_T r[], const DIGIT_T u[], size_t udigits, DIGIT_T v[], size_t vdigits); +int QhatTooBig(DIGIT_T qhat, DIGIT_T rhat, DIGIT_T vn2, DIGIT_T ujn2); +DIGIT_T mpMultSub(DIGIT_T wn, DIGIT_T w[], const DIGIT_T v[], DIGIT_T q, size_t n); +DIGIT_T mpShiftLeft(DIGIT_T a[], const DIGIT_T *b, size_t shift, size_t ndigits); + + +void mpSetDigit(DIGIT_T a[], DIGIT_T d, size_t ndigits); + +int mpCompare(const DIGIT_T a[], const DIGIT_T b[], size_t ndigits); + + +DIGIT_T mpShiftRight(DIGIT_T a[], const DIGIT_T b[], size_t shift, size_t ndigits); +int spMultiply(uint p[2], uint x, uint y); +uint spDivide(uint *pq, uint *pr, const uint u[2], uint v); + +int mpSquare(DIGIT_T w[], const DIGIT_T x[], size_t ndigits); + +size_t mpBitLength(const DIGIT_T d[], size_t ndigits); + +DIGIT_T mpShortDiv(DIGIT_T q[], const DIGIT_T u[], DIGIT_T v, + size_t ndigits); + +void mpSetEqual(DIGIT_T a[], const DIGIT_T b[], size_t ndigits); + +size_t uiceil(float x); +volatile uint8 zeroise_bytes(volatile void *v, size_t n); + +size_t mpSizeof(const DIGIT_T a[], size_t ndigits); + +volatile DIGIT_T mpSetZero(volatile DIGIT_T a[], size_t ndigits); + +int mpIsZero(const DIGIT_T a[], size_t ndigits); + +void mpFail(char *msg); + +int mpModExpO(DIGIT_T *yout[], DIGIT_T *x[], DIGIT_T *e[], DIGIT_T *m[], size_t ndigits); + +void assert(bool precondition); + + +// implementation + +int mpModulo(DIGIT_T r[], const DIGIT_T u[], size_t udigits, + DIGIT_T v[], size_t vdigits) +{ + /* Computes r = u mod v + where r, v are multiprecision integers of length vdigits + and u is a multiprecision integer of length udigits. + r may overlap v. + + Note that r here is only vdigits long, + whereas in mpDivide it is udigits long. + + Use remainder from mpDivide function. + */ + + size_t nn = max(udigits, vdigits); +/* Allocate temp storage */ +//#ifdef NO_ALLOCS + // [v2.6] increased to two times + DIGIT_T qq[MAX_FIXED_DIGITS*2]; + DIGIT_T rr[MAX_FIXED_DIGITS*2]; + // assert(nn <= (MAX_FIXED_DIGITS*2)); +/*#else + DIGIT_T *qq, *rr; + qq = mpAlloc(udigits); + rr = mpAlloc(nn); +#endif +*/ + + /* rr[nn] = u mod v */ + mpDivide(qq, rr, u, udigits, v, vdigits); + + /* Final r is only vdigits long */ + mpSetEqual(r, rr, vdigits); + + mpDESTROY(rr, udigits); + mpDESTROY(qq, udigits); + + return 0; +} + +int mpModMult(DIGIT_T a[], const DIGIT_T x[], const DIGIT_T y[], + DIGIT_T m[], size_t ndigits) +{ /* Computes a = (x * y) mod m */ + +/* Double-length temp variable p */ +// #ifdef NO_ALLOCS + DIGIT_T p[MAX_FIXED_DIGITS * 2]; +// assert(ndigits <= MAX_FIXED_DIGITS); +/*#else + DIGIT_T *p; + p = mpAlloc(ndigits * 2); +#endif +*/ + /* Calc p[2n] = x * y */ + mpMultiply(p, x, y, ndigits); + + /* Then modulo (NOTE: a is OK at only ndigits long) */ + mpModulo(a, p, ndigits * 2, m, ndigits); + + mpDESTROY(p, ndigits * 2); + + return 0; +} + +int mpMultiply(DIGIT_T w[], const DIGIT_T u[], const DIGIT_T v[], size_t ndigits) +{ + /* Computes product w = u * v + where u, v are multiprecision integers of ndigits each + and w is a multiprecision integer of 2*ndigits + + Ref: Knuth Vol 2 Ch 4.3.1 p 268 Algorithm M. + */ + + DIGIT_T k, t[2]; + size_t i, j, m, n; + + // assert(w != u && w != v); + + m = n = ndigits; + + /* Step M1. Initialise */ + for (i = 0; i < 2 * m; i++) + w[i] = 0; + + for (j = 0; j < n; j++) + { + /* Step M2. Zero multiplier? */ + if (v[j] == 0) + { + w[j + m] = 0; + } + else + { + /* Step M3. Initialise i */ + k = 0; + for (i = 0; i < m; i++) + { + /* Step M4. Multiply and add */ + /* t = u_i * v_j + w_(i+j) + k */ + spMultiply(t, u[i], v[j]); + + t[0] += k; + if (t[0] < k) + t[1]++; + t[0] += w[i+j]; + if (t[0] < w[i+j]) + t[1]++; + + w[i+j] = t[0]; + k = t[1]; + } + /* Step M5. Loop on i, set w_(j+m) = k */ + w[j+m] = k; + } + } /* Step M6. Loop on j */ + + return 0; +} + +DIGIT_T mpAdd(DIGIT_T w[], const DIGIT_T u[], const DIGIT_T v[], size_t ndigits) +{ + /* Calculates w = u + v + where w, u, v are multiprecision integers of ndigits each + Returns carry if overflow. Carry = 0 or 1. + + Ref: Knuth Vol 2 Ch 4.3.1 p 266 Algorithm A. + */ + + DIGIT_T k; + size_t j; + + // assert(w != v); + + /* Step A1. Initialise */ + k = 0; + + for (j = 0; j < ndigits; j++) + { + /* Step A2. Add digits w_j = (u_j + v_j + k) + Set k = 1 if carry (overflow) occurs + */ + w[j] = u[j] + k; + if (w[j] < k) + k = 1; + else + k = 0; + + w[j] += v[j]; + if (w[j] < v[j]) + k++; + + } /* Step A3. Loop on j */ + + return k; /* w_n = k */ +} + +// MARK: This function is causing problems – this function calls mpShiftLeft, mpShiftRight at some point (and so does mpShortDiv) they contain recursions, which are forbidden + +int mpDivide(DIGIT_T q[], DIGIT_T r[], const DIGIT_T u[], + size_t udigits, DIGIT_T v[], size_t vdigits) +{ /* Computes quotient q = u / v and remainder r = u mod v + where q, r, u are multiple precision digits + all of udigits and the divisor v is vdigits. + + Ref: Knuth Vol 2 Ch 4.3.1 p 272 Algorithm D. + + Do without extra storage space, i.e. use r[] for + normalised u[], unnormalise v[] at end, and cope with + extra digit Uj+n added to u after normalisation. + + WARNING: this trashes q and r first, so cannot do + u = u / v or v = u mod v. + It also changes v temporarily so cannot make it const. + */ + size_t shift; + int n, m, j; + DIGIT_T bitmask, overflow; + DIGIT_T qhat, rhat, t[2]; + DIGIT_T *uu, *ww; + int qhatOK, cmp; + + /* Clear q and r */ + mpSetZero(q, udigits); + mpSetZero(r, udigits); + + /* Work out exact sizes of u and v */ + n = (int)mpSizeof(v, vdigits); + m = (int)mpSizeof(u, udigits); + m -= n; + + /* Catch special cases */ + if (n == 0) + return -1; /* Error: divide by zero */ + + if (n == 1) + { /* Use short division instead */ + r[0] = mpShortDiv(q, u, v[0], udigits); + return 0; + } + + if (m < 0) + { /* v > u, so just set q = 0 and r = u */ + mpSetEqual(r, u, udigits); + return 0; + } + + if (m == 0) + { /* u and v are the same length */ + cmp = mpCompare(u, v, (size_t)n); + if (cmp < 0) + { /* v > u, as above */ + mpSetEqual(r, u, udigits); + return 0; + } + else if (cmp == 0) + { /* v == u, so set q = 1 and r = 0 */ + mpSetDigit(q, 1, udigits); + return 0; + } + } + + /* In Knuth notation, we have: + Given + u = (Um+n-1 ... U1U0) + v = (Vn-1 ... V1V0) + Compute + q = u/v = (QmQm-1 ... Q0) + r = u mod v = (Rn-1 ... R1R0) + */ + + /* Step D1. Normalise */ + /* Requires high bit of Vn-1 + to be set, so find most signif. bit then shift left, + i.e. d = 2^shift, u' = u * d, v' = v * d. + */ + bitmask = HIBITMASK; + for (shift = 0; shift < BITS_PER_DIGIT; shift++) + { + if (v[n-1] & bitmask) + break; + bitmask >>= 1; + } + + /* Normalise v in situ - NB only shift non-zero digits */ + overflow = mpShiftLeft(v, v, shift, n); + + /* Copy normalised dividend u*d into r */ + overflow = mpShiftLeft(r, u, shift, n + m); + uu = r; /* Use ptr to keep notation constant */ + + t[0] = overflow; /* Extra digit Um+n */ + + + /* Step D2. Initialise j. Set j = m */ + + for (j = m; j >= 0; j--) + { + /* Step D3. Set Qhat = [(b.Uj+n + Uj+n-1)/Vn-1] + and Rhat = remainder */ + qhatOK = 0; + t[1] = t[0]; /* This is Uj+n */ + t[0] = uu[j+n-1]; + overflow = spDivide(&qhat, &rhat, t, v[n-1]); + + /* Test Qhat */ + if (overflow) + { /* Qhat == b so set Qhat = b - 1 */ + qhat = MAX_DIGIT; + rhat = uu[j+n-1]; + rhat += v[n-1]; + if (rhat < v[n-1]) /* Rhat >= b, so no re-test */ + qhatOK = 1; + } + /* [VERSION 2: Added extra test "qhat && "] */ + if (qhat && !qhatOK && QhatTooBig(qhat, rhat, v[n-2], uu[j+n-2])) + { /* If Qhat.Vn-2 > b.Rhat + Uj+n-2 + decrease Qhat by one, increase Rhat by Vn-1 + */ + qhat--; + rhat += v[n-1]; + /* Repeat this test if Rhat < b */ + if (!(rhat < v[n-1])) + if (QhatTooBig(qhat, rhat, v[n-2], uu[j+n-2])) + qhat--; + } + + + /* Step D4. Multiply and subtract */ + ww = &uu[j]; + overflow = mpMultSub(t[1], ww, v, qhat, (size_t)n); + + /* Step D5. Test remainder. Set Qj = Qhat */ + q[j] = qhat; + if (overflow) + { /* Step D6. Add back if D4 was negative */ + q[j]--; + overflow = mpAdd(ww, ww, v, (size_t)n); + } + + t[0] = uu[j+n-1]; /* Uj+n on next round */ + + } /* Step D7. Loop on j */ + + /* Clear high digits in uu */ + for (j = n; j < m+n; j++) + uu[j] = 0; + + /* Step D8. Unnormalise. */ + + mpShiftRight(r, r, shift, n); + mpShiftRight(v, v, shift, n); + + return 0; +} + +void mpSetDigit(DIGIT_T a[], DIGIT_T d, size_t ndigits) +{ /* Sets a = d where d is a single digit */ + size_t i; + + for (i = 1; i < ndigits; i++) + { + a[i] = 0; + } + a[0] = d; +} + +DIGIT_T mpShortDiv(DIGIT_T q[], const DIGIT_T u[], DIGIT_T v, + size_t ndigits) +{ + /* Calculates quotient q = u div v + Returns remainder r = u mod v + where q, u are multiprecision integers of ndigits each + and r, v are single precision digits. + + Makes no assumptions about normalisation. + + Ref: Knuth Vol 2 Ch 4.3.1 Exercise 16 p625 + */ + size_t j; + DIGIT_T t[2], r; + size_t shift; + DIGIT_T bitmask, overflow, *uu; + + if (ndigits == 0) return 0; + if (v == 0) return 0; /* Divide by zero error */ + + /* Normalise first */ + /* Requires high bit of V + to be set, so find most signif. bit then shift left, + i.e. d = 2^shift, u' = u * d, v' = v * d. + */ + bitmask = HIBITMASK; + for (shift = 0; shift < BITS_PER_DIGIT; shift++) + { + if (v & bitmask) + break; + bitmask >>= 1; + } + + v <<= shift; + overflow = mpShiftLeft(q, u, shift, ndigits); + uu = q; + + /* Step S1 - modified for extra digit. */ + r = overflow; /* New digit Un */ + j = ndigits; + while (j--) + { + /* Step S2. */ + t[1] = r; + t[0] = uu[j]; + overflow = spDivide(&q[j], &r, t, v); + } + + /* Unnormalise */ + r >>= shift; + + return r; +} + +int QhatTooBig(DIGIT_T qhat, DIGIT_T rhat, + DIGIT_T vn2, DIGIT_T ujn2) +{ /* Returns true if Qhat is too big + i.e. if (Qhat * Vn-2) > (b.Rhat + Uj+n-2) + */ + DIGIT_T t[2]; + + spMultiply(t, qhat, vn2); + if (t[1] < rhat) + return 0; + else if (t[1] > rhat) + return 1; + else if (t[0] > ujn2) + return 1; + + return 0; +} + +DIGIT_T mpMultSub(DIGIT_T wn, DIGIT_T w[], const DIGIT_T v[], + DIGIT_T q, size_t n) +{ /* Compute w = w - qv + where w = (WnW[n-1]...W[0]) + return modified Wn. + */ + DIGIT_T k, t[2]; + size_t i; + + if (q == 0) /* No change */ + return wn; + + k = 0; + + for (i = 0; i < n; i++) + { + spMultiply(t, q, v[i]); + w[i] -= k; + if (w[i] > MAX_DIGIT - k) + k = 1; + else + k = 0; + w[i] -= t[0]; + if (w[i] > MAX_DIGIT - t[0]) + k++; + k += t[1]; + } + + /* Cope with Wn not stored in array w[0..n-1] */ + wn -= k; + + return wn; +} + +DIGIT_T mpShiftLeft(DIGIT_T a[], const DIGIT_T *b, + size_t shift, size_t ndigits) +{ /* Computes a = b << shift */ + /* [v2.1] Modified to cope with shift > BITS_PERDIGIT */ + + DIGIT_T carry = 0; + + // this replaces the recursion + while (1) { + + size_t i, y, nw, bits; + DIGIT_T mask, tempCarry, nextcarry; + + /* Do we shift whole digits? */ + if (shift >= BITS_PER_DIGIT) + { + nw = shift / BITS_PER_DIGIT; + i = ndigits; + while (i--) + { + if (i >= nw) + a[i] = b[i-nw]; + else + a[i] = 0; + } + /* Call again to shift bits inside digits */ + bits = shift % BITS_PER_DIGIT; + tempCarry = b[ndigits-nw] << bits; + if (bits) { + carry |= tempCarry; + continue; + } + return carry; + } + else + { + bits = shift; + } + + /* Construct mask = high bits set */ + mask = ~(~(DIGIT_T)0 >> bits); + + y = BITS_PER_DIGIT - bits; + carry = 0; + for (i = 0; i < ndigits; i++) + { + nextcarry = (b[i] & mask) >> y; + a[i] = b[i] << bits | carry; + carry = nextcarry; + } + + return carry; + + } +} + +DIGIT_T mpShiftRight(DIGIT_T a[], const DIGIT_T b[], size_t shift, size_t ndigits) +{ /* Computes a = b >> shift */ + /* [v2.1] Modified to cope with shift > BITS_PERDIGIT */ + + DIGIT_T carry = 0; + + while (1) { + + size_t i, y, nw, bits; + DIGIT_T mask, tempCarry, nextcarry; + + /* Do we shift whole digits? */ + if (shift >= BITS_PER_DIGIT) + { + nw = shift / BITS_PER_DIGIT; + for (i = 0; i < ndigits; i++) + { + if ((i+nw) < ndigits) + a[i] = b[i+nw]; + else + a[i] = 0; + } + /* Call again to shift bits inside digits */ + bits = shift % BITS_PER_DIGIT; + tempCarry = b[nw-1] >> bits; + if (bits) + carry |= tempCarry; + return carry; + } + else + { + bits = shift; + } + + /* Construct mask to set low bits */ + /* (thanks to Jesse Chisholm for suggesting this improved technique) */ + mask = ~(~(DIGIT_T)0 << bits); + + y = BITS_PER_DIGIT - bits; + carry = 0; + i = ndigits; + while (i--) + { + nextcarry = (b[i] & mask) << y; + a[i] = b[i] >> bits | carry; + carry = nextcarry; + } + + return carry; + + } +} + + + +int spMultiply(uint p[2], uint x, uint y) +{ + /* Use a 64-bit temp for product */ + ulong t = (ulong)x * (ulong)y; + /* then split into two parts */ + p[1] = (uint)(t >> 32); + p[0] = (uint)(t & 0xFFFFFFFF); + + return 0; +} + +uint spDivide(uint *pq, uint *pr, const uint u[2], uint v) +{ + ulong uu, q; + uu = (ulong)u[1] << 32 | (ulong)u[0]; + q = uu / (ulong)v; + //r = uu % (uint64_t)v; + *pr = (uint)(uu - q * v); + *pq = (uint)(q & 0xFFFFFFFF); + return (uint)(q >> 32); +} + +int mpCompare(const DIGIT_T a[], const DIGIT_T b[], size_t ndigits) +{ + /* if (ndigits == 0) return 0; // deleted [v2.5] */ + + while (ndigits--) + { + if (a[ndigits] > b[ndigits]) + return 1; /* GT */ + if (a[ndigits] < b[ndigits]) + return -1; /* LT */ + } + + return 0; /* EQ */ +} + +void mpSetEqual(DIGIT_T a[], const DIGIT_T b[], size_t ndigits) +{ /* Sets a = b */ + size_t i; + + for (i = 0; i < ndigits; i++) + { + a[i] = b[i]; + } +} + +volatile DIGIT_T mpSetZero(volatile DIGIT_T a[], size_t ndigits) +{ /* Sets a = 0 */ + + /* Prevent optimiser ignoring this */ + volatile DIGIT_T optdummy; + volatile DIGIT_T *p = a; + + while (ndigits--) + a[ndigits] = 0; + + optdummy = *p; + return optdummy; +} + +size_t mpSizeof(const DIGIT_T a[], size_t ndigits) +{ + while(ndigits--) + { + if (a[ndigits] != 0) + return (++ndigits); + } + return 0; +} + +volatile uint8 zeroise_bytes(volatile void *v, size_t n) +{ /* Zeroise byte array b and make sure optimiser does not ignore this */ + volatile uint8 optdummy; + volatile uint8 *b = (uint8*)v; + while(n--) + b[n] = 0; + optdummy = *b; + return optdummy; +} + +void mpFail(char *msg) +{ + //perror(msg); + printf("the program should stop here"); +} + +size_t mpBitLength(const DIGIT_T d[], size_t ndigits) +/* Returns no of significant bits in d */ +{ + size_t n, i, bits; + DIGIT_T mask; + + if (!d || ndigits == 0) + return 0; + + n = mpSizeof(d, ndigits); + if (0 == n) return 0; + + for (i = 0, mask = HIBITMASK; mask > 0; mask >>= 1, i++) + { + if (d[n-1] & mask) + break; + } + + bits = n * BITS_PER_DIGIT - i; + + return bits; +} + + +void mpModSquareTemp(DIGIT_T *y,DIGIT_T *m,size_t n,DIGIT_T *t1,DIGIT_T *t2) { + + mpSquare(t1,y,n); + mpDivide(t2,y,t1,n*2,m,n); + +} + +void mpModMultTemp(DIGIT_T *y, DIGIT_T *x, DIGIT_T *m, size_t n, DIGIT_T* t1, DIGIT_T *t2) { + + mpMultiply(t1,x,y,n); + mpDivide(t2,y,t1,n*2,m,n); + +} + + + + + +int mpModExpO(DIGIT_T *yout[], DIGIT_T *x[], DIGIT_T *e[], DIGIT_T *m[], size_t ndigits) +{ /* Computes y = x^e mod m */ + /* "Classic" binary left-to-right method */ + /* [v2.2] removed const restriction on m[] to avoid using an extra alloc'd var + (m is changed in-situ during the divide operation then restored) */ + DIGIT_T mask; + size_t n; + size_t nn = ndigits * 2; + /* Create some double-length temps */ +//#ifdef NO_ALLOCS + DIGIT_T t1[MAX_FIXED_DIGITS * 2]; + DIGIT_T t2[MAX_FIXED_DIGITS * 2]; + DIGIT_T y[MAX_FIXED_DIGITS * 2]; + assert(ndigits <= MAX_FIXED_DIGITS); +/*#else + DIGIT_T *t1, *t2, *y; + t1 = mpAlloc(nn); + t2 = mpAlloc(nn); + y = mpAlloc(nn); +#endif + */ + assert(ndigits != 0); + + n = mpSizeof(*e, ndigits); + /* Catch e==0 => x^0=1 */ + if (0 == n) + { + mpSetDigit(*yout, 1, ndigits); + goto done; + } + /* Find second-most significant bit in e */ + for (mask = HIBITMASK; mask > 0; mask >>= 1) + { + if (*e[n-1] & mask) + break; + } + mpNEXTBITMASK(mask, n); + + /* Set y = x */ + mpSetEqual(*y, *x, ndigits); + + /* For bit j = k-2 downto 0 */ + while (n) + { + /* Square y = y * y mod n */ + //mpMODSQUARETEMP(*y, *m, ndigits, t1, t2); + //mpModSquareTemp(*y, *m, ndigits, t1, t2); + + mpSquare(t1,y,n); + mpDivide(t2,y,t1,n*2,m,n); + + if (*e[n-1] & mask) + { /* if e(j) == 1 then multiply + y = y * x mod n */ + //mpMODMULTTEMP(*y, *x, *m, ndigits, t1, t2); + //mpModMultTemp(*y, *x, *m, ndigits, t1, t2); + + mpMultiply(t1,x,y,n); + mpDivide(t2,y,t1,n*2,m,n); + + } + + /* Move to next bit */ + mpNEXTBITMASK(mask, n); + } + + /* Return y */ + mpSetEqual(*yout, y, ndigits); + +done: + mpDESTROY(t1, nn); + mpDESTROY(t2, nn); + mpDESTROY(y, ndigits); + + return 0; +} + +int mpSquare(DIGIT_T w[], const DIGIT_T x[], size_t ndigits) +/* New in Version 2.0 */ +{ + /* Computes square w = x * x + where x is a multiprecision integer of ndigits + and w is a multiprecision integer of 2*ndigits + + Ref: Menezes p596 Algorithm 14.16 with errata. + */ + + DIGIT_T k, p[2], u[2], cbit, carry; + size_t i, j, t, i2, cpos; + + assert(w != x); + + t = ndigits; + + /* 1. For i from 0 to (2t-1) do: w_i = 0 */ + i2 = t << 1; + for (i = 0; i < i2; i++) + w[i] = 0; + + carry = 0; + cpos = i2-1; + /* 2. For i from 0 to (t-1) do: */ + for (i = 0; i < t; i++) + { + /* 2.1 (uv) = w_2i + x_i * x_i, w_2i = v, c = u + Careful, w_2i may be double-prec + */ + i2 = i << 1; /* 2*i */ + spMultiply(p, x[i], x[i]); + p[0] += w[i2]; + if (p[0] < w[i2]) + p[1]++; + k = 0; /* p[1] < b, so no overflow here */ + if (i2 == cpos && carry) + { + p[1] += carry; + if (p[1] < carry) + k++; + carry = 0; + } + w[i2] = p[0]; + u[0] = p[1]; + u[1] = k; + + /* 2.2 for j from (i+1) to (t-1) do: + (uv) = w_{i+j} + 2x_j * x_i + c, + w_{i+j} = v, c = u, + u is double-prec + w_{i+j} is dbl if [i+j] == cpos + */ + k = 0; + for (j = i+1; j < t; j++) + { + /* p = x_j * x_i */ + spMultiply(p, x[j], x[i]); + /* p = 2p <=> p <<= 1 */ + cbit = (p[0] & HIBITMASK) != 0; + k = (p[1] & HIBITMASK) != 0; + p[0] <<= 1; + p[1] <<= 1; + p[1] |= cbit; + /* p = p + c */ + p[0] += u[0]; + if (p[0] < u[0]) + { + p[1]++; + if (p[1] == 0) + k++; + } + p[1] += u[1]; + if (p[1] < u[1]) + k++; + /* p = p + w_{i+j} */ + p[0] += w[i+j]; + if (p[0] < w[i+j]) + { + p[1]++; + if (p[1] == 0) + k++; + } + if ((i+j) == cpos && carry) + { /* catch overflow from last round */ + p[1] += carry; + if (p[1] < carry) + k++; + carry = 0; + } + /* w_{i+j} = v, c = u */ + w[i+j] = p[0]; + u[0] = p[1]; + u[1] = k; + } + /* 2.3 w_{i+t} = u */ + w[i+t] = u[0]; + /* remember overflow in w_{i+t} */ + carry = u[1]; + cpos = i+t; + } + + /* (NB original step 3 deleted in Menezes errata) */ + + /* Return w */ + + return 0; +} + + + + +int mpIsZero(const DIGIT_T a[], size_t ndigits) +{ + size_t i; + + /* if (ndigits == 0) return -1; // deleted [v2.5] */ + + for (i = 0; i < ndigits; i++) /* Start at lsb */ + { + if (a[i] != 0) + return 0; /* False */ + } + + return (!0); /* True */ +} + + +void assert(bool precondition) { + + char str[] = "assert reached, also this message leaks memory"; + + if (!precondition) + mpFail(str); + + +} + +// some might be constants +__kernel void single(global DIGIT_T* s, const unsigned int s_len, + global DIGIT_T* e, const unsigned int e_len, + global DIGIT_T* n, const unsigned int n_len, + global DIGIT_T* res, const unsigned int res_len, + //global DIGIT_T* comp, const unsigned int comp_len, + const unsigned int max_len, + global int8* valid + //const unsigned int count + ) +{ + + + + mpModExpO(&res,&s,&e,&n,max_len); + + +}