libgpuverify

Signature verification on GPUs (WiP)
Log | Files | Refs | README | LICENSE

gpuv-montg.cl (12251B)


      1 /*
      2  * lib-gpu-verify
      3  *
      4  * This software contains code derived from or inspired by the BigDigit library,
      5  * <http://www.di-mgt.com.au/bigdigits.html>
      6  * which is distributed under the Mozilla Public License, version 2.0.
      7  *
      8  * The original code and modifications made to it are subject to the terms and
      9  * conditions of the Mozilla Public License, version 2.0. A copy of the
     10  * MPL license can be obtained at
     11  * https://www.mozilla.org/en-US/MPL/2.0/.
     12  *
     13  * Changes and additions to the original code are as follows:
     14  *   - Copied various parts of the BigDigit library into this kernel.
     15  *   - Some functions and macros were changed to accomodate the architecture of OpenCL.
     16  *   - functions were added to extend the functionality required by this OpenCL kernel.
     17  *
     18  * Contributors:
     19  *   - Cedric Zwahlen cedric.zwahlen@bfh.ch
     20  *
     21  * Please note that this software is distributed on an "AS IS" BASIS,
     22  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     23  * See the Mozilla Public License, version 2.0, for the specific language
     24  * governing permissions and limitations under the License.
     25  */
     26 
     27 
     28 typedef ulong gpu_register;
     29 
     30 #define BITS_PER_DIGIT (sizeof(gpu_register) * 8)
     31 #define HIBITMASK 0x8000000000000000
     32 #define MAX_DIGIT 0xFFFFFFFFFFFFFFFF
     33 
     34 #define R 32
     35 
     36 /*
     37 typedef uint gpu_register;
     38 
     39 #define BITS_PER_DIGIT (sizeof(gpu_register) * 8)
     40 #define HIBITMASK 0x80000000
     41 #define MAX_DIGIT 0xFFFFFFFF
     42 
     43 #define R 64
     44 */
     45 int mult(gpu_register p[2], gpu_register x, gpu_register y)
     46 {
     47 
     48 
     49     p[1] = mul_hi(x,y);
     50     p[0] = x * y;
     51 
     52     return 0;
     53 }
     54 
     55 int multiply( __private gpu_register *w,  __private gpu_register *u,  __global gpu_register *v, size_t ndigits)
     56 {
     57     /*    Computes product w = u * v
     58         where u, v are multiprecision integers of ndigits each
     59         and w is a multiprecision integer of 2*ndigits
     60 
     61         Ref: Knuth Vol 2 Ch 4.3.1 p 268 Algorithm M.
     62     */
     63 
     64     gpu_register k, t[2];
     65     size_t i, j, m, n;
     66 
     67     //assert(w != u && w != v);
     68 
     69     m = n = ndigits;
     70 
     71     /* Step M1. Initialise */
     72     for (i = 0; i < 2 * m; i++)
     73         w[i] = 0;
     74 
     75     for (j = 0; j < n; j++)
     76     {
     77         /* Step M2. Zero multiplier? */
     78         if (v[j] == 0)
     79         {
     80             w[j + m] = 0;
     81         }
     82         else
     83         {
     84             /* Step M3. Initialise i */
     85             k = 0;
     86             for (i = 0; i < m; i++)
     87             {
     88                 /* Step M4. Multiply and add */
     89                 /* t = u_i * v_j + w_(i+j) + k */
     90                 mult(t, u[i], v[j]);
     91 
     92                 t[0] += k;
     93                 if (t[0] < k)
     94                     t[1]++;
     95                 t[0] += w[i+j];
     96                 if (t[0] < w[i+j])
     97                     t[1]++;
     98 
     99                 w[i+j] = t[0];
    100                 k = t[1];
    101             }
    102             /* Step M5. Loop on i, set w_(j+m) = k */
    103             w[j+m] = k;
    104         }
    105     }    /* Step M6. Loop on j */
    106 
    107     return 0;
    108 }
    109 
    110 int square( __private gpu_register *w,  __private gpu_register *x, size_t ndigits)
    111 {
    112     /*    Computes square w = x * x
    113         where x is a multiprecision integer of ndigits
    114         and w is a multiprecision integer of 2*ndigits
    115 
    116         Ref: Menezes p596 Algorithm 14.16 with errata.
    117     */
    118 
    119     gpu_register k, p[2], u[2], cbit, carry;
    120     size_t i, j, t, i2, cpos;
    121 
    122     t = ndigits;
    123 
    124     /* 1. For i from 0 to (2t-1) do: w_i = 0 */
    125     i2 = t << 1;
    126     for (i = 0; i < i2; i++)
    127         w[i] = 0;
    128 
    129     carry = 0;
    130     cpos = i2-1;
    131     /* 2. For i from 0 to (t-1) do: */
    132     for (i = 0; i < t; i++)
    133     {
    134         /* 2.1 (uv) = w_2i + x_i * x_i, w_2i = v, c = u
    135            Careful, w_2i may be double-prec
    136         */
    137         i2 = i << 1; /* 2*i */
    138         mult(p, x[i], x[i]);
    139         p[0] += w[i2];
    140         if (p[0] < w[i2])
    141             p[1]++;
    142         k = 0;    /* p[1] < b, so no overflow here */
    143         if (i2 == cpos && carry)
    144         {
    145             p[1] += carry;
    146             if (p[1] < carry)
    147                 k++;
    148             carry = 0;
    149         }
    150         w[i2] = p[0];
    151         u[0] = p[1];
    152         u[1] = k;
    153 
    154         /* 2.2 for j from (i+1) to (t-1) do:
    155            (uv) = w_{i+j} + 2x_j * x_i + c,
    156            w_{i+j} = v, c = u,
    157            u is double-prec
    158            w_{i+j} is dbl if [i+j] == cpos
    159         */
    160         k = 0;
    161         for (j = i+1; j < t; j++)
    162         {
    163             /* p = x_j * x_i */
    164             mult(p, x[j], x[i]);
    165             /* p = 2p <=> p <<= 1 */
    166             cbit = (p[0] & HIBITMASK) != 0;
    167             k =  (p[1] & HIBITMASK) != 0;
    168             p[0] <<= 1;
    169             p[1] <<= 1;
    170             p[1] |= cbit;
    171             /* p = p + c */
    172             p[0] += u[0];
    173             if (p[0] < u[0])
    174             {
    175                 p[1]++;
    176                 if (p[1] == 0)
    177                     k++;
    178             }
    179             p[1] += u[1];
    180             if (p[1] < u[1])
    181                 k++;
    182             /* p = p + w_{i+j} */
    183             p[0] += w[i+j];
    184             if (p[0] < w[i+j])
    185             {
    186                 p[1]++;
    187                 if (p[1] == 0)
    188                     k++;
    189             }
    190             if ((i+j) == cpos && carry)
    191             {    /* catch overflow from last round */
    192                 p[1] += carry;
    193                 if (p[1] < carry)
    194                     k++;
    195                 carry = 0;
    196             }
    197             /* w_{i+j} = v, c = u */
    198             w[i+j] = p[0];
    199             u[0] = p[1];
    200             u[1] = k;
    201         }
    202         /* 2.3 w_{i+t} = u */
    203         w[i+t] = u[0];
    204         /* remember overflow in w_{i+t} */
    205         carry = u[1];
    206         cpos = i+t;
    207     }
    208 
    209     /* (NB original step 3 deleted in Menezes errata) */
    210 
    211     /* Return w */
    212 
    213     return 0;
    214 }
    215 
    216 gpu_register add( __private gpu_register *w,  __private gpu_register *u,  __private gpu_register *v, size_t ndigits)
    217 {
    218     /*    Calculates w = u + v
    219         where w, u, v are multiprecision integers of ndigits each
    220         Returns carry if overflow. Carry = 0 or 1.
    221 
    222         Ref: Knuth Vol 2 Ch 4.3.1 p 266 Algorithm A.
    223     */
    224 
    225     gpu_register k;
    226     size_t j;
    227 
    228     //assert(w != v);
    229 
    230     /* Step A1. Initialise */
    231     k = 0;
    232 
    233     for (j = 0; j < ndigits; j++)
    234     {
    235         /*    Step A2. Add digits w_j = (u_j + v_j + k)
    236             Set k = 1 if carry (overflow) occurs
    237         */
    238         w[j] = u[j] + k;
    239         if (w[j] < k)
    240             k = 1;
    241         else
    242             k = 0;
    243 
    244         w[j] += v[j];
    245         if (w[j] < v[j])
    246             k++;
    247 
    248     }    /* Step A3. Loop on j */
    249 
    250     return k;    /* w_n = k */
    251 }
    252 
    253 gpu_register subtract(__private gpu_register *w, __private gpu_register *u, __private gpu_register *v, size_t ndigits)
    254 {
    255     /*    Calculates w = u - v where u >= v
    256         w, u, v are multiprecision integers of ndigits each
    257         Returns 0 if OK, or 1 if v > u.
    258 
    259         Ref: Knuth Vol 2 Ch 4.3.1 p 267 Algorithm S.
    260     */
    261 
    262     gpu_register k;
    263     size_t j;
    264 
    265 
    266 
    267     /* Step S1. Initialise */
    268     k = 0;
    269 
    270     for (j = 0; j < ndigits; j++)
    271     {
    272         /*    Step S2. Subtract digits w_j = (u_j - v_j - k)
    273             Set k = 1 if borrow occurs.
    274         */
    275         w[j] = u[j] - k;
    276         if (w[j] > MAX_DIGIT - k)
    277             k = 1;
    278         else
    279             k = 0;
    280 
    281         w[j] -= v[j];
    282         if (w[j] > MAX_DIGIT - v[j])
    283             k++;
    284 
    285     }    /* Step S3. Loop on j */
    286 
    287     return k;    /* Should be zero if u >= v */
    288 }
    289 
    290 void equal( __global gpu_register *a,  __global gpu_register *b, size_t ndigits)
    291 {    /* Sets a = b */
    292     size_t i;
    293 
    294     for (i = 0; i < ndigits; i++)
    295     {
    296         a[i] = b[i];
    297     }
    298 }
    299 
    300 void equal_lg( __private gpu_register *a,  __global gpu_register *b, size_t ndigits)
    301 {    /* Sets a = b */
    302     size_t i;
    303 
    304     for (i = 0; i < ndigits; i++)
    305     {
    306         a[i] = b[i];
    307     }
    308 }
    309 
    310 void equal_ll( __private gpu_register *a,  __private gpu_register *b, size_t ndigits)
    311 {    /* Sets a = b */
    312     size_t i;
    313 
    314     for (i = 0; i < ndigits; i++)
    315     {
    316         a[i] = b[i];
    317     }
    318 }
    319 
    320 
    321 void erase_all( __private gpu_register *a, size_t n)
    322 {
    323 
    324     for (int i = 0; i < n; i++)
    325     {
    326         a[i] = 0;
    327     }
    328 
    329 }
    330 
    331 void shift_right(__private gpu_register *r, int n) {
    332 
    333     for (int i = 0; i < R+1; i++) {
    334 
    335         r[i] = r[i + n];
    336         //r[i + n] = 0;
    337 
    338     }
    339 
    340 }
    341 
    342 // 1 if r > l ; -1 if r < l; == 0
    343 int compare(__private gpu_register *r, __private gpu_register *l, int n) {
    344 
    345     int x = 0;
    346     for (int i = n - 1; i >= 0; i--) {
    347         x = r[i] > l[i];
    348         if (x) return 1;
    349         x = r[i] < l[i];
    350         if (x) return -1;
    351     }
    352     return 0;
    353 }
    354 
    355 int compare_g(__private gpu_register *r, __global gpu_register *l, int n) {
    356 
    357     int x = 0;
    358     for (int i = n - 1; i >= 0; i--) {
    359         x = r[i] > l[i];
    360         if (x) return 1;
    361         x = r[i] < l[i];
    362         if (x) return -1;
    363     }
    364     return 0;
    365 }
    366 
    367 int testbit(gpu_register e, int i) {
    368 
    369     return (e & (0x1 << (gpu_register)i) ) > 0 ? 1 : 0;
    370 
    371 }
    372 
    373 void montMul( __private gpu_register *ret,
    374               __private gpu_register *a,  __global gpu_register *b,
    375               __global gpu_register *ni,  __global gpu_register *n,
    376               __private gpu_register *tmp_1,  __private gpu_register *tmp_2,  __private gpu_register *tmp_3
    377              ) {
    378 
    379     multiply(tmp_1,a,b,R);
    380     multiply(tmp_2,tmp_1,ni,R);
    381     multiply(tmp_3,tmp_2,n,R);
    382 
    383     add(tmp_2,tmp_1,tmp_3,R*2+1);
    384 
    385     shift_right(tmp_2, R);
    386 
    387     erase_all(tmp_3, R+1);
    388     equal_lg(tmp_3, n, R);
    389 
    390     if (compare(tmp_2, tmp_3, R+1) >= 0) {
    391         subtract(ret, tmp_2, tmp_3, R+1);
    392     } else {
    393         equal_ll(ret, tmp_2, R);
    394     }
    395 
    396 }
    397 
    398 void montSqr( __private gpu_register *ret,
    399               __private gpu_register *a,
    400               __global gpu_register *ni,  __global gpu_register *n,
    401               __private gpu_register *tmp_1,  __private gpu_register *tmp_2,  __private gpu_register *tmp_3
    402              ) {
    403 
    404     square(tmp_1,a,R);
    405     multiply(tmp_2,tmp_1,ni,R);
    406     multiply(tmp_3,tmp_2,n,R);
    407 
    408     add(tmp_2,tmp_1,tmp_3,R*2+1);
    409 
    410     shift_right(tmp_2, R);
    411 
    412     erase_all(tmp_3, R+1);
    413     equal_lg(tmp_3, n, R);
    414 
    415     if (compare(tmp_2, tmp_3, R+1) >= 0) {
    416         subtract(ret, tmp_2, tmp_3, R+1);
    417     } else {
    418         equal_ll(ret, tmp_2, R);
    419     }
    420 
    421 
    422 
    423 }
    424 
    425 void montFinish( __private gpu_register *ret,
    426               __private gpu_register *a,
    427               __global gpu_register *ni,  __global gpu_register *n,
    428               __private gpu_register *tmp_1,  __private gpu_register *tmp_2,  __private gpu_register *tmp_3
    429              ) {
    430 
    431     erase_all(tmp_1, R*2+1);
    432     equal_ll(tmp_1,a,R);
    433 
    434     multiply(tmp_2,tmp_1,ni,R);
    435     multiply(tmp_3,tmp_2,n,R);
    436 
    437     add(tmp_2,tmp_1,tmp_3,R*2+1);
    438 
    439     shift_right(tmp_2, R);
    440 
    441     erase_all(tmp_3, R+1);
    442     equal_lg(tmp_3, n, R);
    443 
    444     if (compare(tmp_2, tmp_3, R+1) >= 0) {
    445         subtract(ret, tmp_2, tmp_3, R+1);
    446     } else {
    447         equal_ll(ret, tmp_2, R);
    448     }
    449 
    450 }
    451 
    452 
    453 __kernel void mont(__global gpu_register *x,  __global gpu_register *m,
    454                    __global gpu_register *n,  __global gpu_register *ni,
    455                    __global ulong *exp,
    456                    __global gpu_register *cmp,
    457                    __global uint *pks, __global uint *out // 32 bit output
    458                    )
    459 {
    460 
    461     __private gpu_register res_local[R];
    462     __private gpu_register x_local[R];
    463     __private gpu_register tmp_1_local[R * 2 + 1];
    464     __private gpu_register tmp_2_local[R * 2 + 1];
    465     __private gpu_register tmp_3_local[R * 2 + 1];
    466 
    467     for (int z = 0; z < R*2+1; z++) { tmp_1_local[z] = 0; tmp_2_local[z] = 0; tmp_3_local[z] = 0; }
    468 
    469     size_t i = get_global_id(0);
    470 
    471     uint pk = pks[i];
    472 
    473 
    474 
    475     ulong pk_i = R * pk;
    476     ulong s_i = R * i;
    477 
    478 //    printf((char __constant *)"%lu\n", ni[pk_i]);
    479 
    480     int k = ceil(log2((float)exp[pk] + (float)1));
    481 
    482     equal_lg(x_local,&x[s_i],R);
    483 
    484     for (int j = k - 1; j >= 0; j--) {
    485 
    486         montSqr(res_local, x_local, &ni[pk_i], &n[pk_i], tmp_1_local, tmp_2_local, tmp_3_local);
    487 
    488         if (testbit(exp[pk], j)) {
    489 
    490             equal_ll(x_local, res_local, R);
    491             montMul(res_local, x_local, &m[s_i], &ni[pk_i], &n[pk_i], tmp_1_local, tmp_2_local, tmp_3_local);
    492 
    493         }
    494 
    495         equal_ll(x_local, res_local, R);
    496     }
    497 
    498     montFinish(res_local, x_local, &ni[pk_i], &n[pk_i], tmp_1_local, tmp_2_local, tmp_3_local);
    499 
    500     if (compare_g(res_local, &cmp[s_i], R) == 0)
    501     {
    502 
    503         uint out_offset = i / (sizeof(uint) * 8); // 32 bit
    504 
    505         uint mv = 1 << i;
    506 
    507         atomic_or(&out[out_offset], mv);
    508 
    509 
    510     }
    511 }