aboutsummaryrefslogtreecommitdiff
path: root/deps/openssl/openssl/crypto/bn/bn_exp.c
diff options
context:
space:
mode:
Diffstat (limited to 'deps/openssl/openssl/crypto/bn/bn_exp.c')
-rw-r--r--deps/openssl/openssl/crypto/bn/bn_exp.c332
1 files changed, 306 insertions, 26 deletions
diff --git a/deps/openssl/openssl/crypto/bn/bn_exp.c b/deps/openssl/openssl/crypto/bn/bn_exp.c
index 27146c89e7..8c46e50bdc 100644
--- a/deps/openssl/openssl/crypto/bn/bn_exp.c
+++ b/deps/openssl/openssl/crypto/bn/bn_exp.c
@@ -122,6 +122,23 @@
# ifndef alloca
# define alloca(s) __builtin_alloca((s))
# endif
+#elif defined(__sun)
+# include <alloca.h>
+#endif
+
+#undef RSAZ_ENABLED
+#if defined(OPENSSL_BN_ASM_MONT) && \
+ (defined(__x86_64) || defined(__x86_64__) || \
+ defined(_M_AMD64) || defined(_M_X64))
+# include "rsaz_exp.h"
+# define RSAZ_ENABLED
+#endif
+
+#undef SPARC_T4_MONT
+#if defined(OPENSSL_BN_ASM_MONT) && (defined(__sparc__) || defined(__sparc))
+# include "sparc_arch.h"
+extern unsigned int OPENSSL_sparcv9cap_P[];
+# define SPARC_T4_MONT
#endif
/* maximum precomputation table size for *variable* sliding windows */
@@ -464,6 +481,23 @@ int BN_mod_exp_mont(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
wstart = bits - 1; /* The top bit of the window */
wend = 0; /* The bottom bit of the window */
+#if 1 /* by Shay Gueron's suggestion */
+ j = m->top; /* borrow j */
+ if (m->d[j - 1] & (((BN_ULONG)1) << (BN_BITS2 - 1))) {
+ if (bn_wexpand(r, j) == NULL)
+ goto err;
+ /* 2^(top*BN_BITS2) - m */
+ r->d[0] = (0 - m->d[0]) & BN_MASK2;
+ for (i = 1; i < j; i++)
+ r->d[i] = (~m->d[i]) & BN_MASK2;
+ r->top = j;
+ /*
+ * Upper words will be zero if the corresponding words of 'm' were
+ * 0xfff[...], so decrement r->top accordingly.
+ */
+ bn_correct_top(r);
+ } else
+#endif
if (!BN_to_montgomery(r, BN_value_one(), mont, ctx))
goto err;
for (;;) {
@@ -515,6 +549,17 @@ int BN_mod_exp_mont(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
if (wstart < 0)
break;
}
+#if defined(SPARC_T4_MONT)
+ if (OPENSSL_sparcv9cap_P[0] & (SPARCV9_VIS3 | SPARCV9_PREFER_FPU)) {
+ j = mont->N.top; /* borrow j */
+ val[0]->d[0] = 1; /* borrow val[0] */
+ for (i = 1; i < j; i++)
+ val[0]->d[i] = 0;
+ val[0]->top = j;
+ if (!BN_mod_mul_montgomery(rr, r, val[0], mont, ctx))
+ goto err;
+ } else
+#endif
if (!BN_from_montgomery(rr, r, mont, ctx))
goto err;
ret = 1;
@@ -526,6 +571,27 @@ int BN_mod_exp_mont(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
return (ret);
}
+#if defined(SPARC_T4_MONT)
+static BN_ULONG bn_get_bits(const BIGNUM *a, int bitpos)
+{
+ BN_ULONG ret = 0;
+ int wordpos;
+
+ wordpos = bitpos / BN_BITS2;
+ bitpos %= BN_BITS2;
+ if (wordpos >= 0 && wordpos < a->top) {
+ ret = a->d[wordpos] & BN_MASK2;
+ if (bitpos) {
+ ret >>= bitpos;
+ if (++wordpos < a->top)
+ ret |= a->d[wordpos] << (BN_BITS2 - bitpos);
+ }
+ }
+
+ return ret & BN_MASK2;
+}
+#endif
+
/*
* BN_mod_exp_mont_consttime() stores the precomputed powers in a specific
* layout so that accessing any of these table values shows the same access
@@ -594,6 +660,9 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
int powerbufLen = 0;
unsigned char *powerbuf = NULL;
BIGNUM tmp, am;
+#if defined(SPARC_T4_MONT)
+ unsigned int t4 = 0;
+#endif
bn_check_top(a);
bn_check_top(p);
@@ -626,21 +695,62 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
goto err;
}
+#ifdef RSAZ_ENABLED
+ /*
+ * If the size of the operands allow it, perform the optimized
+ * RSAZ exponentiation. For further information see
+ * crypto/bn/rsaz_exp.c and accompanying assembly modules.
+ */
+ if ((16 == a->top) && (16 == p->top) && (BN_num_bits(m) == 1024)
+ && rsaz_avx2_eligible()) {
+ if (NULL == bn_wexpand(rr, 16))
+ goto err;
+ RSAZ_1024_mod_exp_avx2(rr->d, a->d, p->d, m->d, mont->RR.d,
+ mont->n0[0]);
+ rr->top = 16;
+ rr->neg = 0;
+ bn_correct_top(rr);
+ ret = 1;
+ goto err;
+ } else if ((8 == a->top) && (8 == p->top) && (BN_num_bits(m) == 512)) {
+ if (NULL == bn_wexpand(rr, 8))
+ goto err;
+ RSAZ_512_mod_exp(rr->d, a->d, p->d, m->d, mont->n0[0], mont->RR.d);
+ rr->top = 8;
+ rr->neg = 0;
+ bn_correct_top(rr);
+ ret = 1;
+ goto err;
+ }
+#endif
+
/* Get the window size to use with size of p. */
window = BN_window_bits_for_ctime_exponent_size(bits);
+#if defined(SPARC_T4_MONT)
+ if (window >= 5 && (top & 15) == 0 && top <= 64 &&
+ (OPENSSL_sparcv9cap_P[1] & (CFR_MONTMUL | CFR_MONTSQR)) ==
+ (CFR_MONTMUL | CFR_MONTSQR) && (t4 = OPENSSL_sparcv9cap_P[0]))
+ window = 5;
+ else
+#endif
#if defined(OPENSSL_BN_ASM_MONT5)
- if (window == 6 && bits <= 1024)
- window = 5; /* ~5% improvement of 2048-bit RSA sign */
+ if (window >= 5) {
+ window = 5; /* ~5% improvement for RSA2048 sign, and even
+ * for RSA4096 */
+ if ((top & 7) == 0)
+ powerbufLen += 2 * top * sizeof(m->d[0]);
+ }
#endif
+ (void)0;
/*
* Allocate a buffer large enough to hold all of the pre-computed powers
* of am, am itself and tmp.
*/
numPowers = 1 << window;
- powerbufLen = sizeof(m->d[0]) * (top * numPowers +
- ((2 * top) >
- numPowers ? (2 * top) : numPowers));
+ powerbufLen += sizeof(m->d[0]) * (top * numPowers +
+ ((2 * top) >
+ numPowers ? (2 * top) : numPowers));
#ifdef alloca
if (powerbufLen < 3072)
powerbufFree =
@@ -670,15 +780,17 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
tmp.flags = am.flags = BN_FLG_STATIC_DATA;
/* prepare a^0 in Montgomery domain */
-#if 1
+#if 1 /* by Shay Gueron's suggestion */
+ if (m->d[top - 1] & (((BN_ULONG)1) << (BN_BITS2 - 1))) {
+ /* 2^(top*BN_BITS2) - m */
+ tmp.d[0] = (0 - m->d[0]) & BN_MASK2;
+ for (i = 1; i < top; i++)
+ tmp.d[i] = (~m->d[i]) & BN_MASK2;
+ tmp.top = top;
+ } else
+#endif
if (!BN_to_montgomery(&tmp, BN_value_one(), mont, ctx))
goto err;
-#else
- tmp.d[0] = (0 - m->d[0]) & BN_MASK2; /* 2^(top*BN_BITS2) - m */
- for (i = 1; i < top; i++)
- tmp.d[i] = (~m->d[i]) & BN_MASK2;
- tmp.top = top;
-#endif
/* prepare a^1 in Montgomery domain */
if (a->neg || BN_ucmp(a, m) >= 0) {
@@ -689,6 +801,138 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
} else if (!BN_to_montgomery(&am, a, mont, ctx))
goto err;
+#if defined(SPARC_T4_MONT)
+ if (t4) {
+ typedef int (*bn_pwr5_mont_f) (BN_ULONG *tp, const BN_ULONG *np,
+ const BN_ULONG *n0, const void *table,
+ int power, int bits);
+ int bn_pwr5_mont_t4_8(BN_ULONG *tp, const BN_ULONG *np,
+ const BN_ULONG *n0, const void *table,
+ int power, int bits);
+ int bn_pwr5_mont_t4_16(BN_ULONG *tp, const BN_ULONG *np,
+ const BN_ULONG *n0, const void *table,
+ int power, int bits);
+ int bn_pwr5_mont_t4_24(BN_ULONG *tp, const BN_ULONG *np,
+ const BN_ULONG *n0, const void *table,
+ int power, int bits);
+ int bn_pwr5_mont_t4_32(BN_ULONG *tp, const BN_ULONG *np,
+ const BN_ULONG *n0, const void *table,
+ int power, int bits);
+ static const bn_pwr5_mont_f pwr5_funcs[4] = {
+ bn_pwr5_mont_t4_8, bn_pwr5_mont_t4_16,
+ bn_pwr5_mont_t4_24, bn_pwr5_mont_t4_32
+ };
+ bn_pwr5_mont_f pwr5_worker = pwr5_funcs[top / 16 - 1];
+
+ typedef int (*bn_mul_mont_f) (BN_ULONG *rp, const BN_ULONG *ap,
+ const void *bp, const BN_ULONG *np,
+ const BN_ULONG *n0);
+ int bn_mul_mont_t4_8(BN_ULONG *rp, const BN_ULONG *ap, const void *bp,
+ const BN_ULONG *np, const BN_ULONG *n0);
+ int bn_mul_mont_t4_16(BN_ULONG *rp, const BN_ULONG *ap,
+ const void *bp, const BN_ULONG *np,
+ const BN_ULONG *n0);
+ int bn_mul_mont_t4_24(BN_ULONG *rp, const BN_ULONG *ap,
+ const void *bp, const BN_ULONG *np,
+ const BN_ULONG *n0);
+ int bn_mul_mont_t4_32(BN_ULONG *rp, const BN_ULONG *ap,
+ const void *bp, const BN_ULONG *np,
+ const BN_ULONG *n0);
+ static const bn_mul_mont_f mul_funcs[4] = {
+ bn_mul_mont_t4_8, bn_mul_mont_t4_16,
+ bn_mul_mont_t4_24, bn_mul_mont_t4_32
+ };
+ bn_mul_mont_f mul_worker = mul_funcs[top / 16 - 1];
+
+ void bn_mul_mont_vis3(BN_ULONG *rp, const BN_ULONG *ap,
+ const void *bp, const BN_ULONG *np,
+ const BN_ULONG *n0, int num);
+ void bn_mul_mont_t4(BN_ULONG *rp, const BN_ULONG *ap,
+ const void *bp, const BN_ULONG *np,
+ const BN_ULONG *n0, int num);
+ void bn_mul_mont_gather5_t4(BN_ULONG *rp, const BN_ULONG *ap,
+ const void *table, const BN_ULONG *np,
+ const BN_ULONG *n0, int num, int power);
+ void bn_flip_n_scatter5_t4(const BN_ULONG *inp, size_t num,
+ void *table, size_t power);
+ void bn_gather5_t4(BN_ULONG *out, size_t num,
+ void *table, size_t power);
+ void bn_flip_t4(BN_ULONG *dst, BN_ULONG *src, size_t num);
+
+ BN_ULONG *np = mont->N.d, *n0 = mont->n0;
+ int stride = 5 * (6 - (top / 16 - 1)); /* multiple of 5, but less
+ * than 32 */
+
+ /*
+ * BN_to_montgomery can contaminate words above .top [in
+ * BN_DEBUG[_DEBUG] build]...
+ */
+ for (i = am.top; i < top; i++)
+ am.d[i] = 0;
+ for (i = tmp.top; i < top; i++)
+ tmp.d[i] = 0;
+
+ bn_flip_n_scatter5_t4(tmp.d, top, powerbuf, 0);
+ bn_flip_n_scatter5_t4(am.d, top, powerbuf, 1);
+ if (!(*mul_worker) (tmp.d, am.d, am.d, np, n0) &&
+ !(*mul_worker) (tmp.d, am.d, am.d, np, n0))
+ bn_mul_mont_vis3(tmp.d, am.d, am.d, np, n0, top);
+ bn_flip_n_scatter5_t4(tmp.d, top, powerbuf, 2);
+
+ for (i = 3; i < 32; i++) {
+ /* Calculate a^i = a^(i-1) * a */
+ if (!(*mul_worker) (tmp.d, tmp.d, am.d, np, n0) &&
+ !(*mul_worker) (tmp.d, tmp.d, am.d, np, n0))
+ bn_mul_mont_vis3(tmp.d, tmp.d, am.d, np, n0, top);
+ bn_flip_n_scatter5_t4(tmp.d, top, powerbuf, i);
+ }
+
+ /* switch to 64-bit domain */
+ np = alloca(top * sizeof(BN_ULONG));
+ top /= 2;
+ bn_flip_t4(np, mont->N.d, top);
+
+ bits--;
+ for (wvalue = 0, i = bits % 5; i >= 0; i--, bits--)
+ wvalue = (wvalue << 1) + BN_is_bit_set(p, bits);
+ bn_gather5_t4(tmp.d, top, powerbuf, wvalue);
+
+ /*
+ * Scan the exponent one window at a time starting from the most
+ * significant bits.
+ */
+ while (bits >= 0) {
+ if (bits < stride)
+ stride = bits + 1;
+ bits -= stride;
+ wvalue = bn_get_bits(p, bits + 1);
+
+ if ((*pwr5_worker) (tmp.d, np, n0, powerbuf, wvalue, stride))
+ continue;
+ /* retry once and fall back */
+ if ((*pwr5_worker) (tmp.d, np, n0, powerbuf, wvalue, stride))
+ continue;
+
+ bits += stride - 5;
+ wvalue >>= stride - 5;
+ wvalue &= 31;
+ bn_mul_mont_t4(tmp.d, tmp.d, tmp.d, np, n0, top);
+ bn_mul_mont_t4(tmp.d, tmp.d, tmp.d, np, n0, top);
+ bn_mul_mont_t4(tmp.d, tmp.d, tmp.d, np, n0, top);
+ bn_mul_mont_t4(tmp.d, tmp.d, tmp.d, np, n0, top);
+ bn_mul_mont_t4(tmp.d, tmp.d, tmp.d, np, n0, top);
+ bn_mul_mont_gather5_t4(tmp.d, tmp.d, powerbuf, np, n0, top,
+ wvalue);
+ }
+
+ bn_flip_t4(tmp.d, tmp.d, top);
+ top *= 2;
+ /* back to 32-bit domain */
+ tmp.top = top;
+ bn_correct_top(&tmp);
+ OPENSSL_cleanse(np, top * sizeof(BN_ULONG));
+ } else
+#endif
#if defined(OPENSSL_BN_ASM_MONT5)
if (window == 5 && top > 1) {
/*
@@ -707,8 +951,15 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
void bn_scatter5(const BN_ULONG *inp, size_t num,
void *table, size_t power);
void bn_gather5(BN_ULONG *out, size_t num, void *table, size_t power);
+ void bn_power5(BN_ULONG *rp, const BN_ULONG *ap,
+ const void *table, const BN_ULONG *np,
+ const BN_ULONG *n0, int num, int power);
+ int bn_get_bits5(const BN_ULONG *ap, int off);
+ int bn_from_montgomery(BN_ULONG *rp, const BN_ULONG *ap,
+ const BN_ULONG *not_used, const BN_ULONG *np,
+ const BN_ULONG *n0, int num);
- BN_ULONG *np = mont->N.d, *n0 = mont->n0;
+ BN_ULONG *np = mont->N.d, *n0 = mont->n0, *np2;
/*
* BN_to_montgomery can contaminate words above .top [in
@@ -719,6 +970,12 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
for (i = tmp.top; i < top; i++)
tmp.d[i] = 0;
+ if (top & 7)
+ np2 = np;
+ else
+ for (np2 = am.d + top, i = 0; i < top; i++)
+ np2[2 * i] = np[i];
+
bn_scatter5(tmp.d, top, powerbuf, 0);
bn_scatter5(am.d, am.top, powerbuf, 1);
bn_mul_mont(tmp.d, am.d, am.d, np, n0, top);
@@ -727,7 +984,7 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
# if 0
for (i = 3; i < 32; i++) {
/* Calculate a^i = a^(i-1) * a */
- bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np, n0, top, i - 1);
+ bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np2, n0, top, i - 1);
bn_scatter5(tmp.d, top, powerbuf, i);
}
# else
@@ -738,7 +995,7 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
}
for (i = 3; i < 8; i += 2) {
int j;
- bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np, n0, top, i - 1);
+ bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np2, n0, top, i - 1);
bn_scatter5(tmp.d, top, powerbuf, i);
for (j = 2 * i; j < 32; j *= 2) {
bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
@@ -746,13 +1003,13 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
}
}
for (; i < 16; i += 2) {
- bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np, n0, top, i - 1);
+ bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np2, n0, top, i - 1);
bn_scatter5(tmp.d, top, powerbuf, i);
bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
bn_scatter5(tmp.d, top, powerbuf, 2 * i);
}
for (; i < 32; i += 2) {
- bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np, n0, top, i - 1);
+ bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np2, n0, top, i - 1);
bn_scatter5(tmp.d, top, powerbuf, i);
}
# endif
@@ -765,20 +1022,34 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
* Scan the exponent one window at a time starting from the most
* significant bits.
*/
- while (bits >= 0) {
- for (wvalue = 0, i = 0; i < 5; i++, bits--)
- wvalue = (wvalue << 1) + BN_is_bit_set(p, bits);
+ if (top & 7)
+ while (bits >= 0) {
+ for (wvalue = 0, i = 0; i < 5; i++, bits--)
+ wvalue = (wvalue << 1) + BN_is_bit_set(p, bits);
- bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
- bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
- bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
- bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
- bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
- bn_mul_mont_gather5(tmp.d, tmp.d, powerbuf, np, n0, top, wvalue);
+ bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
+ bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
+ bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
+ bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
+ bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
+ bn_mul_mont_gather5(tmp.d, tmp.d, powerbuf, np, n0, top,
+ wvalue);
+ } else {
+ while (bits >= 0) {
+ wvalue = bn_get_bits5(p->d, bits - 4);
+ bits -= 5;
+ bn_power5(tmp.d, tmp.d, powerbuf, np2, n0, top, wvalue);
+ }
}
+ ret = bn_from_montgomery(tmp.d, tmp.d, NULL, np2, n0, top);
tmp.top = top;
bn_correct_top(&tmp);
+ if (ret) {
+ if (!BN_copy(rr, &tmp))
+ ret = 0;
+ goto err; /* non-zero ret means it's not error */
+ }
} else
#endif
{
@@ -844,6 +1115,15 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
}
/* Convert the final result from montgomery to standard format */
+#if defined(SPARC_T4_MONT)
+ if (OPENSSL_sparcv9cap_P[0] & (SPARCV9_VIS3 | SPARCV9_PREFER_FPU)) {
+ am.d[0] = 1; /* borrow am */
+ for (i = 1; i < top; i++)
+ am.d[i] = 0;
+ if (!BN_mod_mul_montgomery(rr, &tmp, &am, mont, ctx))
+ goto err;
+ } else
+#endif
if (!BN_from_montgomery(rr, &tmp, mont, ctx))
goto err;
ret = 1;