gpuv-montg.cl (12251B)
/*
 * lib-gpu-verify
 *
 * This software contains code derived from or inspired by the BigDigit library,
 * <http://www.di-mgt.com.au/bigdigits.html>
 * which is distributed under the Mozilla Public License, version 2.0.
 *
 * The original code and modifications made to it are subject to the terms and
 * conditions of the Mozilla Public License, version 2.0. A copy of the
 * MPL license can be obtained at
 * https://www.mozilla.org/en-US/MPL/2.0/.
 *
 * Changes and additions to the original code are as follows:
 * - Copied various parts of the BigDigit library into this kernel.
 * - Some functions and macros were changed to accommodate the architecture of OpenCL.
 * - functions were added to extend the functionality required by this OpenCL kernel.
 *
 * Contributors:
 * - Cedric Zwahlen cedric.zwahlen@bfh.ch
 *
 * Please note that this software is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the Mozilla Public License, version 2.0, for the specific language
 * governing permissions and limitations under the License.
 */


/* One big-number "digit" (limb) is a 64-bit unsigned integer. */
typedef ulong gpu_register;

#define BITS_PER_DIGIT (sizeof(gpu_register) * 8)
#define HIBITMASK 0x8000000000000000UL
#define MAX_DIGIT 0xFFFFFFFFFFFFFFFFUL

/* Limbs per operand: 32 x 64-bit limbs = 2048-bit numbers. */
#define R 32

/*
 * Alternative 32-bit limb configuration:
 *
 * typedef uint gpu_register;
 *
 * #define BITS_PER_DIGIT (sizeof(gpu_register) * 8)
 * #define HIBITMASK 0x80000000
 * #define MAX_DIGIT 0xFFFFFFFF
 *
 * #define R 64
 */

/* p[1]:p[0] = x * y — full double-width product of two limbs. */
int mult(gpu_register p[2], gpu_register x, gpu_register y)
{
	p[1] = mul_hi(x, y);
	p[0] = x * y;

	return 0;
}

int multiply( __private gpu_register *w, __private gpu_register *u, __global gpu_register *v, size_t ndigits)
{
	/* Computes product w = u * v
	   where u, v are multiprecision integers of ndigits each
	   and w is a multiprecision integer of 2*ndigits.
	   w must not alias u or v (w is fully zeroed first).
	   Only limbs w[0 .. 2*ndigits-1] are written.

	   Ref: Knuth Vol 2 Ch 4.3.1 p 268 Algorithm M.
	*/

	gpu_register k, t[2];
	size_t i, j, m, n;

	//assert(w != u && w != v);

	m = n = ndigits;

	/* Step M1. Initialise */
	for (i = 0; i < 2 * m; i++)
		w[i] = 0;

	for (j = 0; j < n; j++)
	{
		/* Step M2. Zero multiplier? */
		if (v[j] == 0)
		{
			w[j + m] = 0;
		}
		else
		{
			/* Step M3. Initialise i */
			k = 0;
			for (i = 0; i < m; i++)
			{
				/* Step M4. Multiply and add */
				/* t = u_i * v_j + w_(i+j) + k */
				mult(t, u[i], v[j]);

				/* add carry and partial sum; each addition can carry
				   at most 1 into the high limb */
				t[0] += k;
				if (t[0] < k)
					t[1]++;
				t[0] += w[i+j];
				if (t[0] < w[i+j])
					t[1]++;

				w[i+j] = t[0];
				k = t[1];
			}
			/* Step M5. Loop on i, set w_(j+m) = k */
			w[j+m] = k;
		}
	}	/* Step M6. Loop on j */

	return 0;
}

int square( __private gpu_register *w, __private gpu_register *x, size_t ndigits)
{
	/* Computes square w = x * x
	   where x is a multiprecision integer of ndigits
	   and w is a multiprecision integer of 2*ndigits.
	   Roughly half the limb multiplications of multiply() because the
	   cross terms x_i * x_j (i != j) are computed once and doubled.

	   Ref: Menezes p596 Algorithm 14.16 with errata.
	*/

	gpu_register k, p[2], u[2], cbit, carry;
	size_t i, j, t, i2, cpos;

	t = ndigits;

	/* 1. For i from 0 to (2t-1) do: w_i = 0 */
	i2 = t << 1;
	for (i = 0; i < i2; i++)
		w[i] = 0;

	carry = 0;
	cpos = i2-1;
	/* 2. For i from 0 to (t-1) do: */
	for (i = 0; i < t; i++)
	{
		/* 2.1 (uv) = w_2i + x_i * x_i, w_2i = v, c = u
		   Careful, w_2i may be double-prec
		*/
		i2 = i << 1; /* 2*i */
		mult(p, x[i], x[i]);
		p[0] += w[i2];
		if (p[0] < w[i2])
			p[1]++;
		k = 0;	/* p[1] < b, so no overflow here */
		if (i2 == cpos && carry)
		{
			p[1] += carry;
			if (p[1] < carry)
				k++;
			carry = 0;
		}
		w[i2] = p[0];
		u[0] = p[1];
		u[1] = k;

		/* 2.2 for j from (i+1) to (t-1) do:
		   (uv) = w_{i+j} + 2x_j * x_i + c,
		   w_{i+j} = v, c = u,
		   u is double-prec
		   w_{i+j} is dbl if [i+j] == cpos
		*/
		k = 0;
		for (j = i+1; j < t; j++)
		{
			/* p = x_j * x_i */
			mult(p, x[j], x[i]);
			/* p = 2p <=> p <<= 1 */
			cbit = (p[0] & HIBITMASK) != 0;
			k = (p[1] & HIBITMASK) != 0;
			p[0] <<= 1;
			p[1] <<= 1;
			p[1] |= cbit;
			/* p = p + c */
			p[0] += u[0];
			if (p[0] < u[0])
			{
				p[1]++;
				if (p[1] == 0)
					k++;
			}
			p[1] += u[1];
			if (p[1] < u[1])
				k++;
			/* p = p + w_{i+j} */
			p[0] += w[i+j];
			if (p[0] < w[i+j])
			{
				p[1]++;
				if (p[1] == 0)
					k++;
			}
			if ((i+j) == cpos && carry)
			{	/* catch overflow from last round */
				p[1] += carry;
				if (p[1] < carry)
					k++;
				carry = 0;
			}
			/* w_{i+j} = v, c = u */
			w[i+j] = p[0];
			u[0] = p[1];
			u[1] = k;
		}
		/* 2.3 w_{i+t} = u */
		w[i+t] = u[0];
		/* remember overflow in w_{i+t} */
		carry = u[1];
		cpos = i+t;
	}

	/* (NB original step 3 deleted in Menezes errata) */

	/* Return w */

	return 0;
}

gpu_register add( __private gpu_register *w, __private gpu_register *u, __private gpu_register *v, size_t ndigits)
{
	/* Calculates w = u + v
	   where w, u, v are multiprecision integers of ndigits each.
	   Returns carry if overflow. Carry = 0 or 1.
	   w may alias u, but not v.

	   Ref: Knuth Vol 2 Ch 4.3.1 p 266 Algorithm A.
	*/

	gpu_register k;
	size_t j;

	//assert(w != v);

	/* Step A1. Initialise */
	k = 0;

	for (j = 0; j < ndigits; j++)
	{
		/* Step A2. Add digits w_j = (u_j + v_j + k)
		   Set k = 1 if carry (overflow) occurs
		*/
		w[j] = u[j] + k;
		if (w[j] < k)
			k = 1;
		else
			k = 0;

		w[j] += v[j];
		if (w[j] < v[j])
			k++;

	}	/* Step A3. Loop on j */

	return k;	/* w_n = k */
}

gpu_register subtract(__private gpu_register *w, __private gpu_register *u, __private gpu_register *v, size_t ndigits)
{
	/* Calculates w = u - v where u >= v
	   w, u, v are multiprecision integers of ndigits each.
	   Returns the final borrow: 0 if OK, or 1 if v > u.

	   Ref: Knuth Vol 2 Ch 4.3.1 p 267 Algorithm S.
	*/

	gpu_register k;
	size_t j;

	/* Step S1. Initialise */
	k = 0;

	for (j = 0; j < ndigits; j++)
	{
		/* Step S2. Subtract digits w_j = (u_j - v_j - k)
		   Set k = 1 if borrow occurs.
		*/
		w[j] = u[j] - k;
		if (w[j] > MAX_DIGIT - k)
			k = 1;
		else
			k = 0;

		w[j] -= v[j];
		if (w[j] > MAX_DIGIT - v[j])
			k++;

	}	/* Step S3. Loop on j */

	return k;	/* Should be zero if u >= v */
}

/* Sets a = b (global -> global). */
void equal( __global gpu_register *a, __global gpu_register *b, size_t ndigits)
{
	size_t i;

	for (i = 0; i < ndigits; i++)
	{
		a[i] = b[i];
	}
}

/* Sets a = b (global -> private). */
void equal_lg( __private gpu_register *a, __global gpu_register *b, size_t ndigits)
{
	size_t i;

	for (i = 0; i < ndigits; i++)
	{
		a[i] = b[i];
	}
}

/* Sets a = b (private -> private). */
void equal_ll( __private gpu_register *a, __private gpu_register *b, size_t ndigits)
{
	size_t i;

	for (i = 0; i < ndigits; i++)
	{
		a[i] = b[i];
	}
}

/* Zeroes the first n limbs of a. */
void erase_all( __private gpu_register *a, size_t n)
{
	for (size_t i = 0; i < n; i++)
	{
		a[i] = 0;
	}
}

/* Shifts r down by n whole limbs: r[i] = r[i+n] for i in [0, R].
   The caller must guarantee r has at least R+1+n limbs; the vacated
   high limbs are NOT cleared. */
void shift_right(__private gpu_register *r, int n) {

	for (int i = 0; i < R+1; i++) {

		r[i] = r[i + n];

	}

}

/* Compares two n-limb numbers (private vs private).
   Returns 1 if r > l, -1 if r < l, 0 if equal. */
int compare(__private gpu_register *r, __private gpu_register *l, int n) {

	/* most-significant limb first */
	for (int i = n - 1; i >= 0; i--) {
		if (r[i] > l[i]) return 1;
		if (r[i] < l[i]) return -1;
	}
	return 0;
}

/* Compares two n-limb numbers (private vs global).
   Returns 1 if r > l, -1 if r < l, 0 if equal. */
int compare_g(__private gpu_register *r, __global gpu_register *l, int n) {

	/* most-significant limb first */
	for (int i = n - 1; i >= 0; i--) {
		if (r[i] > l[i]) return 1;
		if (r[i] < l[i]) return -1;
	}
	return 0;
}

/* Returns 1 if bit i of e is set, else 0.
   FIX: the shifted constant must have gpu_register width. The original
   `0x1 << i` shifted a 32-bit int, which is undefined for i >= 32 and
   silently wrong for the upper half of a 64-bit exponent. */
int testbit(gpu_register e, int i) {

	return (e & ((gpu_register)1 << i)) ? 1 : 0;

}

/*
 * Montgomery product: REDC(a * b), i.e. a * b / base^R mod n.
 * ni is the precomputed Montgomery constant (presumably -n^-1 mod
 * base^R, supplied by the host — verify against the host-side setup).
 *
 * ret has R limbs; a is private, b/ni/n are global, all R limbs.
 * tmp_1..tmp_3 are caller-provided scratch buffers of 2R+1 limbs.
 * NOTE: the algorithm relies on limb [2R] of each scratch buffer being
 * zero; the kernel zeroes it once and no routine here ever writes it.
 */
void montMul( __private gpu_register *ret,
	__private gpu_register *a, __global gpu_register *b,
	__global gpu_register *ni, __global gpu_register *n,
	__private gpu_register *tmp_1, __private gpu_register *tmp_2, __private gpu_register *tmp_3
	) {

	multiply(tmp_1,a,b,R);        /* t = a * b                 (2R limbs) */
	multiply(tmp_2,tmp_1,ni,R);   /* m = (t mod base^R) * ni              */
	multiply(tmp_3,tmp_2,n,R);    /* (m mod base^R) * n        (2R limbs) */

	add(tmp_2,tmp_1,tmp_3,R*2+1); /* t + m*n; low R limbs become 0 */

	shift_right(tmp_2, R);        /* divide by base^R */

	erase_all(tmp_3, R+1);
	equal_lg(tmp_3, n, R);        /* tmp_3 = n, zero-padded to R+1 limbs */

	if (compare(tmp_2, tmp_3, R+1) >= 0) {
		/* t < 2n, so t - n < n fits in R limbs; subtracting over R
		   limbs only discards the final borrow, which cancels against
		   limb R of tmp_2.
		   FIX: the original subtracted over R+1 limbs and wrote
		   ret[R] — one limb past the caller's R-limb result buffer. */
		subtract(ret, tmp_2, tmp_3, R);
	} else {
		equal_ll(ret, tmp_2, R);
	}

}

/*
 * Montgomery square: REDC(a * a) — same reduction as montMul but uses
 * the cheaper square() for the initial product.
 * Same buffer contract as montMul.
 */
void montSqr( __private gpu_register *ret,
	__private gpu_register *a,
	__global gpu_register *ni, __global gpu_register *n,
	__private gpu_register *tmp_1, __private gpu_register *tmp_2, __private gpu_register *tmp_3
	) {

	square(tmp_1,a,R);            /* t = a * a                 (2R limbs) */
	multiply(tmp_2,tmp_1,ni,R);
	multiply(tmp_3,tmp_2,n,R);

	add(tmp_2,tmp_1,tmp_3,R*2+1);

	shift_right(tmp_2, R);

	erase_all(tmp_3, R+1);
	equal_lg(tmp_3, n, R);

	if (compare(tmp_2, tmp_3, R+1) >= 0) {
		/* see montMul: result fits in R limbs, subtract over R limbs */
		subtract(ret, tmp_2, tmp_3, R);
	} else {
		equal_ll(ret, tmp_2, R);
	}

}

/*
 * Montgomery finish: REDC(a) = a / base^R mod n, converting a value
 * out of the Montgomery domain (equivalent to montMul with b = 1 in
 * Montgomery representation).
 * Same buffer contract as montMul.
 */
void montFinish( __private gpu_register *ret,
	__private gpu_register *a,
	__global gpu_register *ni, __global gpu_register *n,
	__private gpu_register *tmp_1, __private gpu_register *tmp_2, __private gpu_register *tmp_3
	) {

	erase_all(tmp_1, R*2+1);
	equal_ll(tmp_1,a,R);          /* t = a, zero-extended to 2R+1 limbs */

	multiply(tmp_2,tmp_1,ni,R);
	multiply(tmp_3,tmp_2,n,R);

	add(tmp_2,tmp_1,tmp_3,R*2+1);

	shift_right(tmp_2, R);

	erase_all(tmp_3, R+1);
	equal_lg(tmp_3, n, R);

	if (compare(tmp_2, tmp_3, R+1) >= 0) {
		/* see montMul: result fits in R limbs, subtract over R limbs */
		subtract(ret, tmp_2, tmp_3, R);
	} else {
		equal_ll(ret, tmp_2, R);
	}

}


/*
 * Per-work-item modular exponentiation and compare.
 *
 * Each work item i computes x_i ^ exp[pk] mod n[pk] (square-and-
 * multiply in the Montgomery domain), converts the result out of the
 * Montgomery domain, and — if it equals cmp_i — sets bit i of the
 * packed 32-bit output bitmap `out` atomically.
 *
 * x, m, cmp:  per-work-item operands, R limbs each at offset R*i.
 *             Presumably x and m are already in Montgomery form —
 *             verify against the host-side setup.
 * n, ni, exp: per-key modulus data, indexed via pks[i].
 */
__kernel void mont(__global gpu_register *x, __global gpu_register *m,
	__global gpu_register *n, __global gpu_register *ni,
	__global ulong *exp,
	__global gpu_register *cmp,
	__global uint *pks, __global uint *out // 32 bit output
	)
{

	__private gpu_register res_local[R];
	__private gpu_register x_local[R];
	__private gpu_register tmp_1_local[R * 2 + 1];
	__private gpu_register tmp_2_local[R * 2 + 1];
	__private gpu_register tmp_3_local[R * 2 + 1];

	/* The Montgomery helpers rely on limb [2R] of the scratch buffers
	   being zero; it is initialised here and never written again. */
	for (int z = 0; z < R*2+1; z++) { tmp_1_local[z] = 0; tmp_2_local[z] = 0; tmp_3_local[z] = 0; }

	size_t i = get_global_id(0);

	uint pk = pks[i];

	ulong pk_i = R * pk;  /* limb offset of this item's key data     */
	ulong s_i = R * i;    /* limb offset of this item's operands     */

	/* Bit length of the exponent, i.e. ceil(log2(exp + 1)).
	   FIX: computed with clz instead of float log2 — the float path
	   loses precision beyond the 24-bit mantissa and could mis-count
	   the bits of a large exponent. clz(0UL) is defined as 64 in
	   OpenCL, so exp == 0 yields k == 0 (loop skipped), as before. */
	int k = (int)(64 - clz(exp[pk]));

	equal_lg(x_local,&x[s_i],R);

	/* Left-to-right square-and-multiply over the exponent bits. */
	for (int j = k - 1; j >= 0; j--) {

		montSqr(res_local, x_local, &ni[pk_i], &n[pk_i], tmp_1_local, tmp_2_local, tmp_3_local);

		if (testbit(exp[pk], j)) {

			equal_ll(x_local, res_local, R);
			montMul(res_local, x_local, &m[s_i], &ni[pk_i], &n[pk_i], tmp_1_local, tmp_2_local, tmp_3_local);

		}

		equal_ll(x_local, res_local, R);
	}

	/* Convert the result out of the Montgomery domain. */
	montFinish(res_local, x_local, &ni[pk_i], &n[pk_i], tmp_1_local, tmp_2_local, tmp_3_local);

	if (compare_g(res_local, &cmp[s_i], R) == 0)
	{
		/* Set bit i of the packed output bitmap.
		   FIX: the bit index must be reduced mod 32 to match
		   out_offset = i / 32; the original `1 << i` shifted a 32-bit
		   int by the raw global id — undefined for i >= 32. */
		uint bits = (uint)(sizeof(uint) * 8); /* 32 */

		uint out_offset = (uint)i / bits;

		uint mv = 1u << ((uint)i % bits);

		atomic_or(&out[out_offset], mv);

	}
}