summary | refs | log | tree | commit | diff
path: root/deps/openssl/openssl/crypto/ec/asm/ecp_nistz256-x86_64.pl
diff options
context:
space:
mode:
Diffstat (limited to 'deps/openssl/openssl/crypto/ec/asm/ecp_nistz256-x86_64.pl')
-rwxr-xr-x deps/openssl/openssl/crypto/ec/asm/ecp_nistz256-x86_64.pl 40
1 files changed, 22 insertions, 18 deletions
diff --git a/deps/openssl/openssl/crypto/ec/asm/ecp_nistz256-x86_64.pl b/deps/openssl/openssl/crypto/ec/asm/ecp_nistz256-x86_64.pl
index a34f03cc5e..84379fce1c 100755
--- a/deps/openssl/openssl/crypto/ec/asm/ecp_nistz256-x86_64.pl
+++ b/deps/openssl/openssl/crypto/ec/asm/ecp_nistz256-x86_64.pl
@@ -30,20 +30,24 @@
# Further optimization by <appro@openssl.org>:
#
-# this/original
-# Opteron +12-49%
-# Bulldozer +14-45%
-# P4 +18-46%
-# Westmere +12-34%
-# Sandy Bridge +9-35%
-# Ivy Bridge +9-35%
-# Haswell +8-37%
-# Broadwell +18-58%
-# Atom +15-50%
-# VIA Nano +43-160%
+# this/original with/without -DECP_NISTZ256_ASM(*)
+# Opteron +12-49% +110-150%
+# Bulldozer +14-45% +175-210%
+# P4 +18-46% n/a :-(
+# Westmere +12-34% +80-87%
+# Sandy Bridge +9-35% +110-120%
+# Ivy Bridge +9-35% +110-125%
+# Haswell +8-37% +140-160%
+# Broadwell +18-58% +145-210%
+# Atom +15-50% +130-180%
+# VIA Nano +43-160% +300-480%
+#
+# (*) "without -DECP_NISTZ256_ASM" refers to build with
+# "enable-ec_nistp_64_gcc_128";
#
# Ranges denote minimum and maximum improvement coefficients depending
-# on benchmark.
+# on benchmark. Lower coefficients are for ECDSA sign, relatively fastest
+# server-side operation. Keep in mind that +100% means 2x improvement.
$flavour = shift;
$output = shift;
@@ -599,7 +603,7 @@ __ecp_nistz256_mul_montq:
adc \$0, $acc0
########################################################################
- # Second reduction step
+ # Second reduction step
mov $acc1, $t1
shl \$32, $acc1
mulq $poly3
@@ -646,7 +650,7 @@ __ecp_nistz256_mul_montq:
adc \$0, $acc1
########################################################################
- # Third reduction step
+ # Third reduction step
mov $acc2, $t1
shl \$32, $acc2
mulq $poly3
@@ -693,7 +697,7 @@ __ecp_nistz256_mul_montq:
adc \$0, $acc2
########################################################################
- # Final reduction step
+ # Final reduction step
mov $acc3, $t1
shl \$32, $acc3
mulq $poly3
@@ -706,7 +710,7 @@ __ecp_nistz256_mul_montq:
mov $acc5, $t1
adc \$0, $acc2
- ########################################################################
+ ########################################################################
# Branch-less conditional subtraction of P
sub \$-1, $acc4 # .Lpoly[0]
mov $acc0, $t2
@@ -2060,7 +2064,7 @@ $code.=<<___;
movq %xmm1, $r_ptr
call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(res_y, S);
___
-{
+{
######## ecp_nistz256_div_by_2(res_y, res_y); ##########################
# operate in 4-5-6-7 "name space" that matches squaring output
#
@@ -2149,7 +2153,7 @@ $code.=<<___;
lea $M(%rsp), $b_ptr
mov $acc4, $acc6 # harmonize sub output and mul input
xor %ecx, %ecx
- mov $acc4, $S+8*0(%rsp) # have to save:-(
+ mov $acc4, $S+8*0(%rsp) # have to save:-(
mov $acc5, $acc2
mov $acc5, $S+8*1(%rsp)
cmovz $acc0, $acc3