Diffstat (limited to 'deps/openssl/openssl/crypto/ec/asm/ecp_nistz256-x86_64.pl')
-rwxr-xr-x  deps/openssl/openssl/crypto/ec/asm/ecp_nistz256-x86_64.pl | 1874
1 file changed, 1749 insertions(+), 125 deletions(-)
diff --git a/deps/openssl/openssl/crypto/ec/asm/ecp_nistz256-x86_64.pl b/deps/openssl/openssl/crypto/ec/asm/ecp_nistz256-x86_64.pl
index 714e852a18..eba6ffd430 100755
--- a/deps/openssl/openssl/crypto/ec/asm/ecp_nistz256-x86_64.pl
+++ b/deps/openssl/openssl/crypto/ec/asm/ecp_nistz256-x86_64.pl
@@ -1,60 +1,44 @@
#! /usr/bin/env perl
-# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
+# Copyright 2014-2018 The OpenSSL Project Authors. All Rights Reserved.
+# Copyright (c) 2014, Intel Corporation. All Rights Reserved.
+# Copyright (c) 2015 CloudFlare, Inc.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
-
-
-##############################################################################
-# #
-# Copyright 2014 Intel Corporation #
-# #
-# Licensed under the Apache License, Version 2.0 (the "License"); #
-# you may not use this file except in compliance with the License. #
-# You may obtain a copy of the License at #
-# #
-# http://www.apache.org/licenses/LICENSE-2.0 #
-# #
-# Unless required by applicable law or agreed to in writing, software #
-# distributed under the License is distributed on an "AS IS" BASIS, #
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. #
-# See the License for the specific language governing permissions and #
-# limitations under the License. #
-# #
-##############################################################################
-# #
-# Developers and authors: #
-# Shay Gueron (1, 2), and Vlad Krasnov (1) #
-# (1) Intel Corporation, Israel Development Center #
-# (2) University of Haifa #
-# Reference: #
-# S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with#
-# 256 Bit Primes" #
-# #
-##############################################################################
+#
+# Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1, 3)
+# (1) Intel Corporation, Israel Development Center, Haifa, Israel
+# (2) University of Haifa, Israel
+# (3) CloudFlare, Inc.
+#
+# Reference:
+# S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with
+# 256 Bit Primes"
# Further optimization by <appro@openssl.org>:
#
# this/original with/without -DECP_NISTZ256_ASM(*)
-# Opteron +12-49% +110-150%
-# Bulldozer +14-45% +175-210%
-# P4 +18-46% n/a :-(
-# Westmere +12-34% +80-87%
-# Sandy Bridge +9-35% +110-120%
-# Ivy Bridge +9-35% +110-125%
-# Haswell +8-37% +140-160%
-# Broadwell +18-58% +145-210%
-# Atom +15-50% +130-180%
-# VIA Nano +43-160% +300-480%
+# Opteron +15-49% +150-195%
+# Bulldozer +18-45% +175-240%
+# P4 +24-46% +100-150%
+# Westmere +18-34% +87-160%
+# Sandy Bridge +14-35% +120-185%
+# Ivy Bridge +11-35% +125-180%
+# Haswell +10-37% +160-200%
+# Broadwell +24-58% +210-270%
+# Atom +20-50% +180-240%
+# VIA Nano +50-160% +480-480%
#
# (*) "without -DECP_NISTZ256_ASM" refers to build with
# "enable-ec_nistp_64_gcc_128";
#
# Ranges denote minimum and maximum improvement coefficients depending
-# on benchmark. Lower coefficients are for ECDSA sign, relatively fastest
-# server-side operation. Keep in mind that +100% means 2x improvement.
+# on benchmark. In "this/original" column lower coefficient is for
+# ECDSA sign, while in "with/without" - for ECDH key agreement, and
+# higher - for ECDSA sign, relatively fastest server-side operation.
+# Keep in mind that +100% means 2x improvement.
$flavour = shift;
$output = shift;
@@ -115,6 +99,12 @@ $code.=<<___;
.long 3,3,3,3,3,3,3,3
.LONE_mont:
.quad 0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe
+
+# Constants for computations modulo ord(p256)
+.Lord:
+.quad 0xf3b9cac2fc632551, 0xbce6faada7179e84, 0xffffffffffffffff, 0xffffffff00000000
+.LordK:
+.quad 0xccd1c8aaee00bc4f
___
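
The two constants added here are .Lord, the order n of the P-256 base point in little-endian 64-bit limbs, and .LordK, the 64-bit Montgomery factor that the reduction steps below multiply by; the assembly relies on n[0] * k == -1 (mod 2^64). A minimal Python check of that relation (my own sketch, not part of the patch):

    ORD_LIMBS = [0xf3b9cac2fc632551, 0xbce6faada7179e84,
                 0xffffffffffffffff, 0xffffffff00000000]   # .Lord, least-significant limb first
    ORD = sum(limb << (64 * i) for i, limb in enumerate(ORD_LIMBS))
    ORD_K = 0xccd1c8aaee00bc4f                              # .LordK

    # n is the order of the P-256 base point.
    assert ORD == 0xffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc632551

    # Word-wise Montgomery reduction needs n[0]*k == -1 (mod 2^64); this is why the
    # "guaranteed to be zero" additions in the reduction steps really are zero.
    assert (ORD_LIMBS[0] * ORD_K) % 2**64 == 2**64 - 1
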
{
@@ -131,8 +121,12 @@ $code.=<<___;
.type ecp_nistz256_mul_by_2,\@function,2
.align 64
ecp_nistz256_mul_by_2:
+.cfi_startproc
push %r12
+.cfi_push %r12
push %r13
+.cfi_push %r13
+.Lmul_by_2_body:
mov 8*0($a_ptr), $a0
xor $t4,$t4
@@ -165,9 +159,15 @@ ecp_nistz256_mul_by_2:
mov $a2, 8*2($r_ptr)
mov $a3, 8*3($r_ptr)
- pop %r13
- pop %r12
+ mov 0(%rsp),%r13
+.cfi_restore %r13
+ mov 8(%rsp),%r12
+.cfi_restore %r12
+ lea 16(%rsp),%rsp
+.cfi_adjust_cfa_offset -16
+.Lmul_by_2_epilogue:
ret
+.cfi_endproc
.size ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2
################################################################################
@@ -176,8 +176,12 @@ ecp_nistz256_mul_by_2:
.type ecp_nistz256_div_by_2,\@function,2
.align 32
ecp_nistz256_div_by_2:
+.cfi_startproc
push %r12
+.cfi_push %r12
push %r13
+.cfi_push %r13
+.Ldiv_by_2_body:
mov 8*0($a_ptr), $a0
mov 8*1($a_ptr), $a1
@@ -225,9 +229,15 @@ ecp_nistz256_div_by_2:
mov $a2, 8*2($r_ptr)
mov $a3, 8*3($r_ptr)
- pop %r13
- pop %r12
+ mov 0(%rsp),%r13
+.cfi_restore %r13
+ mov 8(%rsp),%r12
+.cfi_restore %r12
+ lea 16(%rsp),%rsp
+.cfi_adjust_cfa_offset -16
+.Ldiv_by_2_epilogue:
ret
+.cfi_endproc
.size ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2
################################################################################
@@ -236,8 +246,12 @@ ecp_nistz256_div_by_2:
.type ecp_nistz256_mul_by_3,\@function,2
.align 32
ecp_nistz256_mul_by_3:
+.cfi_startproc
push %r12
+.cfi_push %r12
push %r13
+.cfi_push %r13
+.Lmul_by_3_body:
mov 8*0($a_ptr), $a0
xor $t4, $t4
@@ -291,9 +305,15 @@ ecp_nistz256_mul_by_3:
mov $a2, 8*2($r_ptr)
mov $a3, 8*3($r_ptr)
- pop %r13
- pop %r12
+ mov 0(%rsp),%r13
+.cfi_restore %r13
+ mov 8(%rsp),%r12
+.cfi_restore %r12
+ lea 16(%rsp),%rsp
+.cfi_adjust_cfa_offset -16
+.Lmul_by_3_epilogue:
ret
+.cfi_endproc
.size ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3
################################################################################
@@ -302,8 +322,12 @@ ecp_nistz256_mul_by_3:
.type ecp_nistz256_add,\@function,3
.align 32
ecp_nistz256_add:
+.cfi_startproc
push %r12
+.cfi_push %r12
push %r13
+.cfi_push %r13
+.Ladd_body:
mov 8*0($a_ptr), $a0
xor $t4, $t4
@@ -337,9 +361,15 @@ ecp_nistz256_add:
mov $a2, 8*2($r_ptr)
mov $a3, 8*3($r_ptr)
- pop %r13
- pop %r12
+ mov 0(%rsp),%r13
+.cfi_restore %r13
+ mov 8(%rsp),%r12
+.cfi_restore %r12
+ lea 16(%rsp),%rsp
+.cfi_adjust_cfa_offset -16
+.Ladd_epilogue:
ret
+.cfi_endproc
.size ecp_nistz256_add,.-ecp_nistz256_add
################################################################################
@@ -348,8 +378,12 @@ ecp_nistz256_add:
.type ecp_nistz256_sub,\@function,3
.align 32
ecp_nistz256_sub:
+.cfi_startproc
push %r12
+.cfi_push %r12
push %r13
+.cfi_push %r13
+.Lsub_body:
mov 8*0($a_ptr), $a0
xor $t4, $t4
@@ -383,9 +417,15 @@ ecp_nistz256_sub:
mov $a2, 8*2($r_ptr)
mov $a3, 8*3($r_ptr)
- pop %r13
- pop %r12
+ mov 0(%rsp),%r13
+.cfi_restore %r13
+ mov 8(%rsp),%r12
+.cfi_restore %r12
+ lea 16(%rsp),%rsp
+.cfi_adjust_cfa_offset -16
+.Lsub_epilogue:
ret
+.cfi_endproc
.size ecp_nistz256_sub,.-ecp_nistz256_sub
################################################################################
@@ -394,8 +434,12 @@ ecp_nistz256_sub:
.type ecp_nistz256_neg,\@function,2
.align 32
ecp_nistz256_neg:
+.cfi_startproc
push %r12
+.cfi_push %r12
push %r13
+.cfi_push %r13
+.Lneg_body:
xor $a0, $a0
xor $a1, $a1
@@ -429,9 +473,15 @@ ecp_nistz256_neg:
mov $a2, 8*2($r_ptr)
mov $a3, 8*3($r_ptr)
- pop %r13
- pop %r12
+ mov 0(%rsp),%r13
+.cfi_restore %r13
+ mov 8(%rsp),%r12
+.cfi_restore %r12
+ lea 16(%rsp),%rsp
+.cfi_adjust_cfa_offset -16
+.Lneg_epilogue:
ret
+.cfi_endproc
.size ecp_nistz256_neg,.-ecp_nistz256_neg
___
}
@@ -443,6 +493,1085 @@ my ($poly1,$poly3)=($acc6,$acc7);
$code.=<<___;
################################################################################
+# void ecp_nistz256_ord_mul_mont(
+# uint64_t res[4],
+# uint64_t a[4],
+# uint64_t b[4]);
+
+.globl ecp_nistz256_ord_mul_mont
+.type ecp_nistz256_ord_mul_mont,\@function,3
+.align 32
+ecp_nistz256_ord_mul_mont:
+.cfi_startproc
+___
+$code.=<<___ if ($addx);
+ mov \$0x80100, %ecx
+ and OPENSSL_ia32cap_P+8(%rip), %ecx
+ cmp \$0x80100, %ecx
+ je .Lecp_nistz256_ord_mul_montx
+___
+$code.=<<___;
+ push %rbp
+.cfi_push %rbp
+ push %rbx
+.cfi_push %rbx
+ push %r12
+.cfi_push %r12
+ push %r13
+.cfi_push %r13
+ push %r14
+.cfi_push %r14
+ push %r15
+.cfi_push %r15
+.Lord_mul_body:
+
+ mov 8*0($b_org), %rax
+ mov $b_org, $b_ptr
+ lea .Lord(%rip), %r14
+ mov .LordK(%rip), %r15
+
+ ################################# * b[0]
+ mov %rax, $t0
+ mulq 8*0($a_ptr)
+ mov %rax, $acc0
+ mov $t0, %rax
+ mov %rdx, $acc1
+
+ mulq 8*1($a_ptr)
+ add %rax, $acc1
+ mov $t0, %rax
+ adc \$0, %rdx
+ mov %rdx, $acc2
+
+ mulq 8*2($a_ptr)
+ add %rax, $acc2
+ mov $t0, %rax
+ adc \$0, %rdx
+
+ mov $acc0, $acc5
+ imulq %r15,$acc0
+
+ mov %rdx, $acc3
+ mulq 8*3($a_ptr)
+ add %rax, $acc3
+ mov $acc0, %rax
+ adc \$0, %rdx
+ mov %rdx, $acc4
+
+ ################################# First reduction step
+ mulq 8*0(%r14)
+ mov $acc0, $t1
+ add %rax, $acc5 # guaranteed to be zero
+ mov $acc0, %rax
+ adc \$0, %rdx
+ mov %rdx, $t0
+
+ sub $acc0, $acc2
+ sbb \$0, $acc0 # can't borrow
+
+ mulq 8*1(%r14)
+ add $t0, $acc1
+ adc \$0, %rdx
+ add %rax, $acc1
+ mov $t1, %rax
+ adc %rdx, $acc2
+ mov $t1, %rdx
+ adc \$0, $acc0 # can't overflow
+
+ shl \$32, %rax
+ shr \$32, %rdx
+ sub %rax, $acc3
+ mov 8*1($b_ptr), %rax
+ sbb %rdx, $t1 # can't borrow
+
+ add $acc0, $acc3
+ adc $t1, $acc4
+ adc \$0, $acc5
+
+ ################################# * b[1]
+ mov %rax, $t0
+ mulq 8*0($a_ptr)
+ add %rax, $acc1
+ mov $t0, %rax
+ adc \$0, %rdx
+ mov %rdx, $t1
+
+ mulq 8*1($a_ptr)
+ add $t1, $acc2
+ adc \$0, %rdx
+ add %rax, $acc2
+ mov $t0, %rax
+ adc \$0, %rdx
+ mov %rdx, $t1
+
+ mulq 8*2($a_ptr)
+ add $t1, $acc3
+ adc \$0, %rdx
+ add %rax, $acc3
+ mov $t0, %rax
+ adc \$0, %rdx
+
+ mov $acc1, $t0
+ imulq %r15, $acc1
+
+ mov %rdx, $t1
+ mulq 8*3($a_ptr)
+ add $t1, $acc4
+ adc \$0, %rdx
+ xor $acc0, $acc0
+ add %rax, $acc4
+ mov $acc1, %rax
+ adc %rdx, $acc5
+ adc \$0, $acc0
+
+ ################################# Second reduction step
+ mulq 8*0(%r14)
+ mov $acc1, $t1
+ add %rax, $t0 # guaranteed to be zero
+ mov $acc1, %rax
+ adc %rdx, $t0
+
+ sub $acc1, $acc3
+ sbb \$0, $acc1 # can't borrow
+
+ mulq 8*1(%r14)
+ add $t0, $acc2
+ adc \$0, %rdx
+ add %rax, $acc2
+ mov $t1, %rax
+ adc %rdx, $acc3
+ mov $t1, %rdx
+ adc \$0, $acc1 # can't overflow
+
+ shl \$32, %rax
+ shr \$32, %rdx
+ sub %rax, $acc4
+ mov 8*2($b_ptr), %rax
+ sbb %rdx, $t1 # can't borrow
+
+ add $acc1, $acc4
+ adc $t1, $acc5
+ adc \$0, $acc0
+
+ ################################## * b[2]
+ mov %rax, $t0
+ mulq 8*0($a_ptr)
+ add %rax, $acc2
+ mov $t0, %rax
+ adc \$0, %rdx
+ mov %rdx, $t1
+
+ mulq 8*1($a_ptr)
+ add $t1, $acc3
+ adc \$0, %rdx
+ add %rax, $acc3
+ mov $t0, %rax
+ adc \$0, %rdx
+ mov %rdx, $t1
+
+ mulq 8*2($a_ptr)
+ add $t1, $acc4
+ adc \$0, %rdx
+ add %rax, $acc4
+ mov $t0, %rax
+ adc \$0, %rdx
+
+ mov $acc2, $t0
+ imulq %r15, $acc2
+
+ mov %rdx, $t1
+ mulq 8*3($a_ptr)
+ add $t1, $acc5
+ adc \$0, %rdx
+ xor $acc1, $acc1
+ add %rax, $acc5
+ mov $acc2, %rax
+ adc %rdx, $acc0
+ adc \$0, $acc1
+
+ ################################# Third reduction step
+ mulq 8*0(%r14)
+ mov $acc2, $t1
+ add %rax, $t0 # guaranteed to be zero
+ mov $acc2, %rax
+ adc %rdx, $t0
+
+ sub $acc2, $acc4
+ sbb \$0, $acc2 # can't borrow
+
+ mulq 8*1(%r14)
+ add $t0, $acc3
+ adc \$0, %rdx
+ add %rax, $acc3
+ mov $t1, %rax
+ adc %rdx, $acc4
+ mov $t1, %rdx
+ adc \$0, $acc2 # can't overflow
+
+ shl \$32, %rax
+ shr \$32, %rdx
+ sub %rax, $acc5
+ mov 8*3($b_ptr), %rax
+ sbb %rdx, $t1 # can't borrow
+
+ add $acc2, $acc5
+ adc $t1, $acc0
+ adc \$0, $acc1
+
+ ################################# * b[3]
+ mov %rax, $t0
+ mulq 8*0($a_ptr)
+ add %rax, $acc3
+ mov $t0, %rax
+ adc \$0, %rdx
+ mov %rdx, $t1
+
+ mulq 8*1($a_ptr)
+ add $t1, $acc4
+ adc \$0, %rdx
+ add %rax, $acc4
+ mov $t0, %rax
+ adc \$0, %rdx
+ mov %rdx, $t1
+
+ mulq 8*2($a_ptr)
+ add $t1, $acc5
+ adc \$0, %rdx
+ add %rax, $acc5
+ mov $t0, %rax
+ adc \$0, %rdx
+
+ mov $acc3, $t0
+ imulq %r15, $acc3
+
+ mov %rdx, $t1
+ mulq 8*3($a_ptr)
+ add $t1, $acc0
+ adc \$0, %rdx
+ xor $acc2, $acc2
+ add %rax, $acc0
+ mov $acc3, %rax
+ adc %rdx, $acc1
+ adc \$0, $acc2
+
+ ################################# Last reduction step
+ mulq 8*0(%r14)
+ mov $acc3, $t1
+ add %rax, $t0 # guaranteed to be zero
+ mov $acc3, %rax
+ adc %rdx, $t0
+
+ sub $acc3, $acc5
+ sbb \$0, $acc3 # can't borrow
+
+ mulq 8*1(%r14)
+ add $t0, $acc4
+ adc \$0, %rdx
+ add %rax, $acc4
+ mov $t1, %rax
+ adc %rdx, $acc5
+ mov $t1, %rdx
+ adc \$0, $acc3 # can't overflow
+
+ shl \$32, %rax
+ shr \$32, %rdx
+ sub %rax, $acc0
+ sbb %rdx, $t1 # can't borrow
+
+ add $acc3, $acc0
+ adc $t1, $acc1
+ adc \$0, $acc2
+
+ ################################# Subtract ord
+ mov $acc4, $a_ptr
+ sub 8*0(%r14), $acc4
+ mov $acc5, $acc3
+ sbb 8*1(%r14), $acc5
+ mov $acc0, $t0
+ sbb 8*2(%r14), $acc0
+ mov $acc1, $t1
+ sbb 8*3(%r14), $acc1
+ sbb \$0, $acc2
+
+ cmovc $a_ptr, $acc4
+ cmovc $acc3, $acc5
+ cmovc $t0, $acc0
+ cmovc $t1, $acc1
+
+ mov $acc4, 8*0($r_ptr)
+ mov $acc5, 8*1($r_ptr)
+ mov $acc0, 8*2($r_ptr)
+ mov $acc1, 8*3($r_ptr)
+
+ mov 0(%rsp),%r15
+.cfi_restore %r15
+ mov 8(%rsp),%r14
+.cfi_restore %r14
+ mov 16(%rsp),%r13
+.cfi_restore %r13
+ mov 24(%rsp),%r12
+.cfi_restore %r12
+ mov 32(%rsp),%rbx
+.cfi_restore %rbx
+ mov 40(%rsp),%rbp
+.cfi_restore %rbp
+ lea 48(%rsp),%rsp
+.cfi_adjust_cfa_offset -48
+.Lord_mul_epilogue:
+ ret
+.cfi_endproc
+.size ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont
+
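+# Functionally, the routine above is a 4x64-bit Montgomery multiplication modulo
+# the group order: four multiply-by-b[i] passes, each followed by one word of
+# reduction, then a final conditional subtraction of n. A hypothetical high-level
+# reference model (mine, not part of the patch; R = 2^256), in Python:
+#
+#   N = 0xffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc632551  # group order
+#   R = 2**256
+#
+#   def ord_mul_mont_ref(a: int, b: int) -> int:
+#       """Model of ecp_nistz256_ord_mul_mont: a*b*R^-1 mod N for a, b < N."""
+#       return (a * b * pow(R, -1, N)) % N   # the asm never inverts; model only
+#
+#   def to_limbs(x: int):
+#       """The four little-endian 64-bit words the assembly reads and writes."""
+#       return [(x >> (64 * i)) & (2**64 - 1) for i in range(4)]
+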
+################################################################################
+# void ecp_nistz256_ord_sqr_mont(
+# uint64_t res[4],
+# uint64_t a[4],
+# int rep);
+
+.globl ecp_nistz256_ord_sqr_mont
+.type ecp_nistz256_ord_sqr_mont,\@function,3
+.align 32
+ecp_nistz256_ord_sqr_mont:
+.cfi_startproc
+___
+$code.=<<___ if ($addx);
+ mov \$0x80100, %ecx
+ and OPENSSL_ia32cap_P+8(%rip), %ecx
+ cmp \$0x80100, %ecx
+ je .Lecp_nistz256_ord_sqr_montx
+___
+$code.=<<___;
+ push %rbp
+.cfi_push %rbp
+ push %rbx
+.cfi_push %rbx
+ push %r12
+.cfi_push %r12
+ push %r13
+.cfi_push %r13
+ push %r14
+.cfi_push %r14
+ push %r15
+.cfi_push %r15
+.Lord_sqr_body:
+
+ mov 8*0($a_ptr), $acc0
+ mov 8*1($a_ptr), %rax
+ mov 8*2($a_ptr), $acc6
+ mov 8*3($a_ptr), $acc7
+ lea .Lord(%rip), $a_ptr # pointer to modulus
+ mov $b_org, $b_ptr
+ jmp .Loop_ord_sqr
+
+.align 32
+.Loop_ord_sqr:
+ ################################# a[1:] * a[0]
+ mov %rax, $t1 # put aside a[1]
+ mul $acc0 # a[1] * a[0]
+ mov %rax, $acc1
+ movq $t1, %xmm1 # offload a[1]
+ mov $acc6, %rax
+ mov %rdx, $acc2
+
+ mul $acc0 # a[2] * a[0]
+ add %rax, $acc2
+ mov $acc7, %rax
+ movq $acc6, %xmm2 # offload a[2]
+ adc \$0, %rdx
+ mov %rdx, $acc3
+
+ mul $acc0 # a[3] * a[0]
+ add %rax, $acc3
+ mov $acc7, %rax
+ movq $acc7, %xmm3 # offload a[3]
+ adc \$0, %rdx
+ mov %rdx, $acc4
+
+ ################################# a[3] * a[2]
+ mul $acc6 # a[3] * a[2]
+ mov %rax, $acc5
+ mov $acc6, %rax
+ mov %rdx, $acc6
+
+ ################################# a[2:] * a[1]
+ mul $t1 # a[2] * a[1]
+ add %rax, $acc3
+ mov $acc7, %rax
+ adc \$0, %rdx
+ mov %rdx, $acc7
+
+ mul $t1 # a[3] * a[1]
+ add %rax, $acc4
+ adc \$0, %rdx
+
+ add $acc7, $acc4
+ adc %rdx, $acc5
+ adc \$0, $acc6 # can't overflow
+
+ ################################# *2
+ xor $acc7, $acc7
+ mov $acc0, %rax
+ add $acc1, $acc1
+ adc $acc2, $acc2
+ adc $acc3, $acc3
+ adc $acc4, $acc4
+ adc $acc5, $acc5
+ adc $acc6, $acc6
+ adc \$0, $acc7
+
+ ################################# Missing products
+ mul %rax # a[0] * a[0]
+ mov %rax, $acc0
+ movq %xmm1, %rax
+ mov %rdx, $t1
+
+ mul %rax # a[1] * a[1]
+ add $t1, $acc1
+ adc %rax, $acc2
+ movq %xmm2, %rax
+ adc \$0, %rdx
+ mov %rdx, $t1
+
+ mul %rax # a[2] * a[2]
+ add $t1, $acc3
+ adc %rax, $acc4
+ movq %xmm3, %rax
+ adc \$0, %rdx
+ mov %rdx, $t1
+
+ mov $acc0, $t0
+ imulq 8*4($a_ptr), $acc0 # *= .LordK
+
+ mul %rax # a[3] * a[3]
+ add $t1, $acc5
+ adc %rax, $acc6
+ mov 8*0($a_ptr), %rax # modulus[0]
+ adc %rdx, $acc7 # can't overflow
+
+ ################################# First reduction step
+ mul $acc0
+ mov $acc0, $t1
+ add %rax, $t0 # guaranteed to be zero
+ mov 8*1($a_ptr), %rax # modulus[1]
+ adc %rdx, $t0
+
+ sub $acc0, $acc2
+ sbb \$0, $t1 # can't borrow
+
+ mul $acc0
+ add $t0, $acc1
+ adc \$0, %rdx
+ add %rax, $acc1
+ mov $acc0, %rax
+ adc %rdx, $acc2
+ mov $acc0, %rdx
+ adc \$0, $t1 # can't overflow
+
+ mov $acc1, $t0
+ imulq 8*4($a_ptr), $acc1 # *= .LordK
+
+ shl \$32, %rax
+ shr \$32, %rdx
+ sub %rax, $acc3
+ mov 8*0($a_ptr), %rax
+ sbb %rdx, $acc0 # can't borrow
+
+ add $t1, $acc3
+ adc \$0, $acc0 # can't overflow
+
+ ################################# Second reduction step
+ mul $acc1
+ mov $acc1, $t1
+ add %rax, $t0 # guaranteed to be zero
+ mov 8*1($a_ptr), %rax
+ adc %rdx, $t0
+
+ sub $acc1, $acc3
+ sbb \$0, $t1 # can't borrow
+
+ mul $acc1
+ add $t0, $acc2
+ adc \$0, %rdx
+ add %rax, $acc2
+ mov $acc1, %rax
+ adc %rdx, $acc3
+ mov $acc1, %rdx
+ adc \$0, $t1 # can't overflow
+
+ mov $acc2, $t0
+ imulq 8*4($a_ptr), $acc2 # *= .LordK
+
+ shl \$32, %rax
+ shr \$32, %rdx
+ sub %rax, $acc0
+ mov 8*0($a_ptr), %rax
+ sbb %rdx, $acc1 # can't borrow
+
+ add $t1, $acc0
+ adc \$0, $acc1 # can't overflow
+
+ ################################# Third reduction step
+ mul $acc2
+ mov $acc2, $t1
+ add %rax, $t0 # guaranteed to be zero
+ mov 8*1($a_ptr), %rax
+ adc %rdx, $t0
+
+ sub $acc2, $acc0
+ sbb \$0, $t1 # can't borrow
+
+ mul $acc2
+ add $t0, $acc3
+ adc \$0, %rdx
+ add %rax, $acc3
+ mov $acc2, %rax
+ adc %rdx, $acc0
+ mov $acc2, %rdx
+ adc \$0, $t1 # can't overflow
+
+ mov $acc3, $t0
+ imulq 8*4($a_ptr), $acc3 # *= .LordK
+
+ shl \$32, %rax
+ shr \$32, %rdx
+ sub %rax, $acc1
+ mov 8*0($a_ptr), %rax
+ sbb %rdx, $acc2 # can't borrow
+
+ add $t1, $acc1
+ adc \$0, $acc2 # can't overflow
+
+ ################################# Last reduction step
+ mul $acc3
+ mov $acc3, $t1
+ add %rax, $t0 # guaranteed to be zero
+ mov 8*1($a_ptr), %rax
+ adc %rdx, $t0
+
+ sub $acc3, $acc1
+ sbb \$0, $t1 # can't borrow
+
+ mul $acc3
+ add $t0, $acc0
+ adc \$0, %rdx
+ add %rax, $acc0
+ mov $acc3, %rax
+ adc %rdx, $acc1
+ mov $acc3, %rdx
+ adc \$0, $t1 # can't overflow
+
+ shl \$32, %rax
+ shr \$32, %rdx
+ sub %rax, $acc2
+ sbb %rdx, $acc3 # can't borrow
+
+ add $t1, $acc2
+ adc \$0, $acc3 # can't overflow
+
+ ################################# Add bits [511:256] of the sqr result
+ xor %rdx, %rdx
+ add $acc4, $acc0
+ adc $acc5, $acc1
+ mov $acc0, $acc4
+ adc $acc6, $acc2
+ adc $acc7, $acc3
+ mov $acc1, %rax
+ adc \$0, %rdx
+
+ ################################# Compare to modulus
+ sub 8*0($a_ptr), $acc0
+ mov $acc2, $acc6
+ sbb 8*1($a_ptr), $acc1
+ sbb 8*2($a_ptr), $acc2
+ mov $acc3, $acc7
+ sbb 8*3($a_ptr), $acc3
+ sbb \$0, %rdx
+
+ cmovc $acc4, $acc0
+ cmovnc $acc1, %rax
+ cmovnc $acc2, $acc6
+ cmovnc $acc3, $acc7
+
+ dec $b_ptr
+ jnz .Loop_ord_sqr
+
+ mov $acc0, 8*0($r_ptr)
+ mov %rax, 8*1($r_ptr)
+ pxor %xmm1, %xmm1
+ mov $acc6, 8*2($r_ptr)
+ pxor %xmm2, %xmm2
+ mov $acc7, 8*3($r_ptr)
+ pxor %xmm3, %xmm3
+
+ mov 0(%rsp),%r15
+.cfi_restore %r15
+ mov 8(%rsp),%r14
+.cfi_restore %r14
+ mov 16(%rsp),%r13
+.cfi_restore %r13
+ mov 24(%rsp),%r12
+.cfi_restore %r12
+ mov 32(%rsp),%rbx
+.cfi_restore %rbx
+ mov 40(%rsp),%rbp
+.cfi_restore %rbp
+ lea 48(%rsp),%rsp
+.cfi_adjust_cfa_offset -48
+.Lord_sqr_epilogue:
+ ret
+.cfi_endproc
+.size ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont
+___
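
The third argument of ecp_nistz256_ord_sqr_mont is a repeat count: the loop above squares the Montgomery-form input rep times before storing the result, which is how repeated-squaring chains (for example, inversion modulo n) would call it. A sketch of the intended semantics (an assumption drawn from the signature comment, not code from the patch):

    N = 0xffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc632551  # group order

    def ord_sqr_mont_ref(a: int, rep: int) -> int:
        """Model of ecp_nistz256_ord_sqr_mont: square a (Montgomery form) rep times."""
        r_inv = pow(2**256, -1, N)
        for _ in range(rep):
            a = (a * a * r_inv) % N
        return a
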
+
+$code.=<<___ if ($addx);
+################################################################################
+.type ecp_nistz256_ord_mul_montx,\@function,3
+.align 32
+ecp_nistz256_ord_mul_montx:
+.cfi_startproc
+.Lecp_nistz256_ord_mul_montx:
+ push %rbp
+.cfi_push %rbp
+ push %rbx
+.cfi_push %rbx
+ push %r12
+.cfi_push %r12
+ push %r13
+.cfi_push %r13
+ push %r14
+.cfi_push %r14
+ push %r15
+.cfi_push %r15
+.Lord_mulx_body:
+
+ mov $b_org, $b_ptr
+ mov 8*0($b_org), %rdx
+ mov 8*0($a_ptr), $acc1
+ mov 8*1($a_ptr), $acc2
+ mov 8*2($a_ptr), $acc3
+ mov 8*3($a_ptr), $acc4
+ lea -128($a_ptr), $a_ptr # control u-op density
+ lea .Lord-128(%rip), %r14
+ mov .LordK(%rip), %r15
+
+ ################################# Multiply by b[0]
+ mulx $acc1, $acc0, $acc1
+ mulx $acc2, $t0, $acc2
+ mulx $acc3, $t1, $acc3
+ add $t0, $acc1
+ mulx $acc4, $t0, $acc4
+ mov $acc0, %rdx
+ mulx %r15, %rdx, %rax
+ adc $t1, $acc2
+ adc $t0, $acc3
+ adc \$0, $acc4
+
+ ################################# reduction
+ xor $acc5, $acc5 # $acc5=0, cf=0, of=0
+ mulx 8*0+128(%r14), $t0, $t1
+ adcx $t0, $acc0 # guaranteed to be zero
+ adox $t1, $acc1
+
+ mulx 8*1+128(%r14), $t0, $t1
+ adcx $t0, $acc1
+ adox $t1, $acc2
+
+ mulx 8*2+128(%r14), $t0, $t1
+ adcx $t0, $acc2
+ adox $t1, $acc3
+
+ mulx 8*3+128(%r14), $t0, $t1
+ mov 8*1($b_ptr), %rdx
+ adcx $t0, $acc3
+ adox $t1, $acc4
+ adcx $acc0, $acc4
+ adox $acc0, $acc5
+ adc \$0, $acc5 # cf=0, of=0
+
+ ################################# Multiply by b[1]
+ mulx 8*0+128($a_ptr), $t0, $t1
+ adcx $t0, $acc1
+ adox $t1, $acc2
+
+ mulx 8*1+128($a_ptr), $t0, $t1
+ adcx $t0, $acc2
+ adox $t1, $acc3
+
+ mulx 8*2+128($a_ptr), $t0, $t1
+ adcx $t0, $acc3
+ adox $t1, $acc4
+
+ mulx 8*3+128($a_ptr), $t0, $t1
+ mov $acc1, %rdx
+ mulx %r15, %rdx, %rax
+ adcx $t0, $acc4
+ adox $t1, $acc5
+
+ adcx $acc0, $acc5
+ adox $acc0, $acc0
+ adc \$0, $acc0 # cf=0, of=0
+
+ ################################# reduction
+ mulx 8*0+128(%r14), $t0, $t1
+ adcx $t0, $acc1 # guaranteed to be zero
+ adox $t1, $acc2
+
+ mulx 8*1+128(%r14), $t0, $t1
+ adcx $t0, $acc2
+ adox $t1, $acc3
+
+ mulx 8*2+128(%r14), $t0, $t1
+ adcx $t0, $acc3
+ adox $t1, $acc4
+
+ mulx 8*3+128(%r14), $t0, $t1
+ mov 8*2($b_ptr), %rdx
+ adcx $t0, $acc4
+ adox $t1, $acc5
+ adcx $acc1, $acc5
+ adox $acc1, $acc0
+ adc \$0, $acc0 # cf=0, of=0
+
+ ################################# Multiply by b[2]
+ mulx 8*0+128($a_ptr), $t0, $t1
+ adcx $t0, $acc2
+ adox $t1, $acc3
+
+ mulx 8*1+128($a_ptr), $t0, $t1
+ adcx $t0, $acc3
+ adox $t1, $acc4
+
+ mulx 8*2+128($a_ptr), $t0, $t1
+ adcx $t0, $acc4
+ adox $t1, $acc5
+
+ mulx 8*3+128($a_ptr), $t0, $t1
+ mov $acc2, %rdx
+ mulx %r15, %rdx, %rax
+ adcx $t0, $acc5
+ adox $t1, $acc0
+
+ adcx $acc1, $acc0
+ adox $acc1, $acc1
+ adc \$0, $acc1 # cf=0, of=0
+
+ ################################# reduction
+ mulx 8*0+128(%r14), $t0, $t1
+ adcx $t0, $acc2 # guaranteed to be zero
+ adox $t1, $acc3
+
+ mulx 8*1+128(%r14), $t0, $t1
+ adcx $t0, $acc3
+ adox $t1, $acc4
+
+ mulx 8*2+128(%r14), $t0, $t1
+ adcx $t0, $acc4
+ adox $t1, $acc5
+
+ mulx 8*3+128(%r14), $t0, $t1
+ mov 8*3($b_ptr), %rdx
+ adcx $t0, $acc5
+ adox $t1, $acc0
+ adcx $acc2, $acc0
+ adox $acc2, $acc1
+ adc \$0, $acc1 # cf=0, of=0
+
+ ################################# Multiply by b[3]
+ mulx 8*0+128($a_ptr), $t0, $t1
+ adcx $t0, $acc3
+ adox $t1, $acc4
+
+ mulx 8*1+128($a_ptr), $t0, $t1
+ adcx $t0, $acc4
+ adox $t1, $acc5
+
+ mulx 8*2+128($a_ptr), $t0, $t1
+ adcx $t0, $acc5
+ adox $t1, $acc0
+
+ mulx 8*3+128($a_ptr), $t0, $t1
+ mov $acc3, %rdx
+ mulx %r15, %rdx, %rax
+ adcx $t0, $acc0
+ adox $t1, $acc1
+
+ adcx $acc2, $acc1
+ adox $acc2, $acc2
+ adc \$0, $acc2 # cf=0, of=0
+
+ ################################# reduction
+ mulx 8*0+128(%r14), $t0, $t1
+ adcx $t0, $acc3 # guaranteed to be zero
+ adox $t1, $acc4
+
+ mulx 8*1+128(%r14), $t0, $t1
+ adcx $t0, $acc4
+ adox $t1, $acc5
+
+ mulx 8*2+128(%r14), $t0, $t1
+ adcx $t0, $acc5
+ adox $t1, $acc0
+
+ mulx 8*3+128(%r14), $t0, $t1
+ lea 128(%r14),%r14
+ mov $acc4, $t2
+ adcx $t0, $acc0
+ adox $t1, $acc1
+ mov $acc5, $t3
+ adcx $acc3, $acc1
+ adox $acc3, $acc2
+ adc \$0, $acc2
+
+ #################################
+ # Branch-less conditional subtraction of P
+ mov $acc0, $t0
+ sub 8*0(%r14), $acc4
+ sbb 8*1(%r14), $acc5
+ sbb 8*2(%r14), $acc0
+ mov $acc1, $t1
+ sbb 8*3(%r14), $acc1
+ sbb \$0, $acc2
+
+ cmovc $t2, $acc4
+ cmovc $t3, $acc5
+ cmovc $t0, $acc0
+ cmovc $t1, $acc1
+
+ mov $acc4, 8*0($r_ptr)
+ mov $acc5, 8*1($r_ptr)
+ mov $acc0, 8*2($r_ptr)
+ mov $acc1, 8*3($r_ptr)
+
+ mov 0(%rsp),%r15
+.cfi_restore %r15
+ mov 8(%rsp),%r14
+.cfi_restore %r14
+ mov 16(%rsp),%r13
+.cfi_restore %r13
+ mov 24(%rsp),%r12
+.cfi_restore %r12
+ mov 32(%rsp),%rbx
+.cfi_restore %rbx
+ mov 40(%rsp),%rbp
+.cfi_restore %rbp
+ lea 48(%rsp),%rsp
+.cfi_adjust_cfa_offset -48
+.Lord_mulx_epilogue:
+ ret
+.cfi_endproc
+.size ecp_nistz256_ord_mul_montx,.-ecp_nistz256_ord_mul_montx
+
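+# Both code paths end with the pattern flagged "Branch-less conditional
+# subtraction": the candidate result minus n is always computed, and cmovc puts
+# the pre-subtraction value back if the subtraction borrowed, so no branch ever
+# depends on the data. A semantic model only (the constant-time property comes
+# from cmov, not from this Python sketch):
+#
+#   def cond_sub_n(x: int, n: int) -> int:
+#       """Reduce x (< 2*n) into [0, n): keep x - n unless that subtraction borrows."""
+#       t = x - n
+#       return x if t < 0 else t   # the asm selects with cmovc instead of branching
+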
+.type ecp_nistz256_ord_sqr_montx,\@function,3
+.align 32
+ecp_nistz256_ord_sqr_montx:
+.cfi_startproc
+.Lecp_nistz256_ord_sqr_montx:
+ push %rbp
+.cfi_push %rbp
+ push %rbx
+.cfi_push %rbx
+ push %r12
+.cfi_push %r12
+ push %r13
+.cfi_push %r13
+ push %r14
+.cfi_push %r14
+ push %r15
+.cfi_push %r15
+.Lord_sqrx_body:
+
+ mov $b_org, $b_ptr
+ mov 8*0($a_ptr), %rdx
+ mov 8*1($a_ptr), $acc6
+ mov 8*2($a_ptr), $acc7
+ mov 8*3($a_ptr), $acc0
+ lea .Lord(%rip), $a_ptr
+ jmp .Loop_ord_sqrx
+
+.align 32
+.Loop_ord_sqrx:
+ mulx $acc6, $acc1, $acc2 # a[0]*a[1]
+ mulx $acc7, $t0, $acc3 # a[0]*a[2]
+ mov %rdx, %rax # offload a[0]
+ movq $acc6, %xmm1 # offload a[1]
+ mulx $acc0, $t1, $acc4 # a[0]*a[3]
+ mov $acc6, %rdx
+ add $t0, $acc2
+ movq $acc7, %xmm2 # offload a[2]
+ adc $t1, $acc3
+ adc \$0, $acc4
+ xor $acc5, $acc5 # $acc5=0,cf=0,of=0
+ #################################
+ mulx $acc7, $t0, $t1 # a[1]*a[2]
+ adcx $t0, $acc3
+ adox $t1, $acc4
+
+ mulx $acc0, $t0, $t1 # a[1]*a[3]
+ mov $acc7, %rdx
+ adcx $t0, $acc4
+ adox $t1, $acc5
+ adc \$0, $acc5
+ #################################
+ mulx $acc0, $t0, $acc6 # a[2]*a[3]
+ mov %rax, %rdx
+ movq $acc0, %xmm3 # offload a[3]
+ xor $acc7, $acc7 # $acc7=0,cf=0,of=0
+ adcx $acc1, $acc1 # acc1:6<<1
+ adox $t0, $acc5
+ adcx $acc2, $acc2
+ adox $acc7, $acc6 # of=0
+
+ ################################# a[i]*a[i]
+ mulx %rdx, $acc0, $t1
+ movq %xmm1, %rdx
+ adcx $acc3, $acc3
+ adox $t1, $acc1
+ adcx $acc4, $acc4
+ mulx %rdx, $t0, $t4
+ movq %xmm2, %rdx
+ adcx $acc5, $acc5
+ adox $t0, $acc2
+ adcx $acc6, $acc6
+ mulx %rdx, $t0, $t1
+ .byte 0x67
+ movq %xmm3, %rdx
+ adox $t4, $acc3
+ adcx $acc7, $acc7
+ adox $t0, $acc4
+ adox $t1, $acc5
+ mulx %rdx, $t0, $t4
+ adox $t0, $acc6
+ adox $t4, $acc7
+
+ ################################# reduction
+ mov $acc0, %rdx
+ mulx 8*4($a_ptr), %rdx, $t0
+
+ xor %rax, %rax # cf=0, of=0
+ mulx 8*0($a_ptr), $t0, $t1
+ adcx $t0, $acc0 # guaranteed to be zero
+ adox $t1, $acc1
+ mulx 8*1($a_ptr), $t0, $t1
+ adcx $t0, $acc1
+ adox $t1, $acc2
+ mulx 8*2($a_ptr), $t0, $t1
+ adcx $t0, $acc2
+ adox $t1, $acc3
+ mulx 8*3($a_ptr), $t0, $t1
+ adcx $t0, $acc3
+ adox $t1, $acc0 # of=0
+ adcx %rax, $acc0 # cf=0
+
+ #################################
+ mov $acc1, %rdx
+ mulx 8*4($a_ptr), %rdx, $t0
+
+ mulx 8*0($a_ptr), $t0, $t1
+ adox $t0, $acc1 # guaranteed to be zero
+ adcx $t1, $acc2
+ mulx 8*1($a_ptr), $t0, $t1
+ adox $t0, $acc2
+ adcx $t1, $acc3
+ mulx 8*2($a_ptr), $t0, $t1
+ adox $t0, $acc3
+ adcx $t1, $acc0
+ mulx 8*3($a_ptr), $t0, $t1
+ adox $t0, $acc0
+ adcx $t1, $acc1 # cf=0
+ adox %rax, $acc1 # of=0
+
+ #################################
+ mov $acc2, %rdx
+ mulx 8*4($a_ptr), %rdx, $t0
+
+ mulx 8*0($a_ptr), $t0, $t1
+ adcx $t0, $acc2 # guaranteed to be zero
+ adox $t1, $acc3
+ mulx 8*1($a_ptr), $t0, $t1
+ adcx $t0, $acc3
+ adox $t1, $acc0
+ mulx 8*2($a_ptr), $t0, $t1
+ adcx $t0, $acc0
+ adox $t1, $acc1
+ mulx 8*3($a_ptr), $t0, $t1
+ adcx $t0, $acc1
+ adox $t1, $acc2 # of=0
+ adcx %rax, $acc2 # cf=0
+
+ #################################
+ mov $acc3, %rdx
+ mulx 8*4($a_ptr), %rdx, $t0
+
+ mulx 8*0($a_ptr), $t0, $t1
+ adox $t0, $acc3 # guaranteed to be zero
+ adcx $t1, $acc0
+ mulx 8*1($a_ptr), $t0, $t1
+ adox $t0, $acc0
+ adcx $t1, $acc1
+ mulx 8*2($a_ptr), $t0, $t1
+ adox $t0, $acc1
+ adcx $t1, $acc2
+ mulx 8*3($a_ptr), $t0, $t1
+ adox $t0, $acc2
+ adcx $t1, $acc3
+ adox %rax, $acc3
+
+ ################################# accumulate upper half
+ add $acc0, $acc4 # add $acc4, $acc0
+ adc $acc5, $acc1
+ mov $acc4, %rdx
+ adc $acc6, $acc2
+ adc $acc7, $acc3
+ mov $acc1, $acc6
+ adc \$0, %rax
+
+ ################################# compare to modulus
+ sub 8*0($a_ptr), $acc4
+ mov $acc2, $acc7
+ sbb 8*1($a_ptr), $acc1
+ sbb 8*2($a_ptr), $acc2
+ mov $acc3, $acc0
+ sbb 8*3($a_ptr), $acc3
+ sbb \$0, %rax
+
+ cmovnc $acc4, %rdx
+ cmovnc $acc1, $acc6
+ cmovnc $acc2, $acc7
+ cmovnc $acc3, $acc0
+
+ dec $b_ptr
+ jnz .Loop_ord_sqrx
+
+ mov %rdx, 8*0($r_ptr)
+ mov $acc6, 8*1($r_ptr)
+ pxor %xmm1, %xmm1
+ mov $acc7, 8*2($r_ptr)
+ pxor %xmm2, %xmm2
+ mov $acc0, 8*3($r_ptr)
+ pxor %xmm3, %xmm3
+
+ mov 0(%rsp),%r15
+.cfi_restore %r15
+ mov 8(%rsp),%r14
+.cfi_restore %r14
+ mov 16(%rsp),%r13
+.cfi_restore %r13
+ mov 24(%rsp),%r12
+.cfi_restore %r12
+ mov 32(%rsp),%rbx
+.cfi_restore %rbx
+ mov 40(%rsp),%rbp
+.cfi_restore %rbp
+ lea 48(%rsp),%rsp
+.cfi_adjust_cfa_offset -48
+.Lord_sqrx_epilogue:
+ ret
+.cfi_endproc
+.size ecp_nistz256_ord_sqr_montx,.-ecp_nistz256_ord_sqr_montx
+___
+
+$code.=<<___;
+################################################################################
# void ecp_nistz256_to_mont(
# uint64_t res[4],
# uint64_t in[4]);
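
ecp_nistz256_to_mont converts a field element into Montgomery form over the P-256 field prime p (the field prime, not the group order n used by the ord_* routines above). A small model of what that means (my sketch; the usual realization is a Montgomery multiplication by a precomputed R^2 mod p, which I assume rather than take from this hunk):

    P = 2**256 - 2**224 + 2**192 + 2**96 - 1   # NIST P-256 field prime
    R = 2**256

    def to_mont_ref(x: int) -> int:
        """Model of ecp_nistz256_to_mont: x -> x*R mod p."""
        return (x * R) % P
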
@@ -470,6 +1599,7 @@ $code.=<<___;
.type ecp_nistz256_mul_mont,\@function,3
.align 32
ecp_nistz256_mul_mont:
+.cfi_startproc
___
$code.=<<___ if ($addx);
mov \$0x80100, %ecx
@@ -478,11 +1608,18 @@ ___
$code.=<<___;
.Lmul_mont:
push %rbp
+.cfi_push %rbp
push %rbx
+.cfi_push %rbx
push %r12
+.cfi_push %r12
push %r13
+.cfi_push %r13
push %r14
+.cfi_push %r14
push %r15
+.cfi_push %r15
+.Lmul_body:
___
$code.=<<___ if ($addx);
cmp \$0x80100, %ecx
@@ -515,13 +1652,23 @@ $code.=<<___ if ($addx);
___
$code.=<<___;
.Lmul_mont_done:
- pop %r15
- pop %r14
- pop %r13
- pop %r12
- pop %rbx
- pop %rbp
+ mov 0(%rsp),%r15
+.cfi_restore %r15
+ mov 8(%rsp),%r14
+.cfi_restore %r14
+ mov 16(%rsp),%r13
+.cfi_restore %r13
+ mov 24(%rsp),%r12
+.cfi_restore %r12
+ mov 32(%rsp),%rbx
+.cfi_restore %rbx
+ mov 40(%rsp),%rbp
+.cfi_restore %rbp
+ lea 48(%rsp),%rsp
+.cfi_adjust_cfa_offset -48
+.Lmul_epilogue:
ret
+.cfi_endproc
.size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont
.type __ecp_nistz256_mul_montq,\@abi-omnipotent
@@ -611,7 +1758,7 @@ __ecp_nistz256_mul_montq:
adc \$0, $acc0
########################################################################
- # Second reduction step
+ # Second reduction step
mov $acc1, $t1
shl \$32, $acc1
mulq $poly3
@@ -658,7 +1805,7 @@ __ecp_nistz256_mul_montq:
adc \$0, $acc1
########################################################################
- # Third reduction step
+ # Third reduction step
mov $acc2, $t1
shl \$32, $acc2
mulq $poly3
@@ -705,7 +1852,7 @@ __ecp_nistz256_mul_montq:
adc \$0, $acc2
########################################################################
- # Final reduction step
+ # Final reduction step
mov $acc3, $t1
shl \$32, $acc3
mulq $poly3
@@ -718,7 +1865,7 @@ __ecp_nistz256_mul_montq:
mov $acc5, $t1
adc \$0, $acc2
- ########################################################################
+ ########################################################################
# Branch-less conditional subtraction of P
sub \$-1, $acc4 # .Lpoly[0]
mov $acc0, $t2
@@ -751,6 +1898,7 @@ __ecp_nistz256_mul_montq:
.type ecp_nistz256_sqr_mont,\@function,2
.align 32
ecp_nistz256_sqr_mont:
+.cfi_startproc
___
$code.=<<___ if ($addx);
mov \$0x80100, %ecx
@@ -758,11 +1906,18 @@ $code.=<<___ if ($addx);
___
$code.=<<___;
push %rbp
+.cfi_push %rbp
push %rbx
+.cfi_push %rbx
push %r12
+.cfi_push %r12
push %r13
+.cfi_push %r13
push %r14
+.cfi_push %r14
push %r15
+.cfi_push %r15
+.Lsqr_body:
___
$code.=<<___ if ($addx);
cmp \$0x80100, %ecx
@@ -791,13 +1946,23 @@ $code.=<<___ if ($addx);
___
$code.=<<___;
.Lsqr_mont_done:
- pop %r15
- pop %r14
- pop %r13
- pop %r12
- pop %rbx
- pop %rbp
+ mov 0(%rsp),%r15
+.cfi_restore %r15
+ mov 8(%rsp),%r14
+.cfi_restore %r14
+ mov 16(%rsp),%r13
+.cfi_restore %r13
+ mov 24(%rsp),%r12
+.cfi_restore %r12
+ mov 32(%rsp),%rbx
+.cfi_restore %rbx
+ mov 40(%rsp),%rbp
+.cfi_restore %rbp
+ lea 48(%rsp),%rsp
+.cfi_adjust_cfa_offset -48
+.Lsqr_epilogue:
ret
+.cfi_endproc
.size ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont
.type __ecp_nistz256_sqr_montq,\@abi-omnipotent
@@ -1278,8 +2443,12 @@ $code.=<<___;
.type ecp_nistz256_from_mont,\@function,2
.align 32
ecp_nistz256_from_mont:
+.cfi_startproc
push %r12
+.cfi_push %r12
push %r13
+.cfi_push %r13
+.Lfrom_body:
mov 8*0($in_ptr), %rax
mov .Lpoly+8*3(%rip), $t2
@@ -1360,9 +2529,15 @@ ecp_nistz256_from_mont:
mov $acc2, 8*2($r_ptr)
mov $acc3, 8*3($r_ptr)
- pop %r13
- pop %r12
+ mov 0(%rsp),%r13
+.cfi_restore %r13
+ mov 8(%rsp),%r12
+.cfi_restore %r12
+ lea 16(%rsp),%rsp
+.cfi_adjust_cfa_offset -16
+.Lfrom_epilogue:
ret
+.cfi_endproc
.size ecp_nistz256_from_mont,.-ecp_nistz256_from_mont
___
}
@@ -1488,10 +2663,10 @@ $code.=<<___ if ($win64);
movaps 0x80(%rsp), %xmm14
movaps 0x90(%rsp), %xmm15
lea 0xa8(%rsp), %rsp
-.LSEH_end_ecp_nistz256_gather_w5:
___
$code.=<<___;
ret
+.LSEH_end_ecp_nistz256_gather_w5:
.size ecp_nistz256_gather_w5,.-ecp_nistz256_gather_w5
################################################################################
@@ -1593,10 +2768,10 @@ $code.=<<___ if ($win64);
movaps 0x80(%rsp), %xmm14
movaps 0x90(%rsp), %xmm15
lea 0xa8(%rsp), %rsp
-.LSEH_end_ecp_nistz256_gather_w7:
___
$code.=<<___;
ret
+.LSEH_end_ecp_nistz256_gather_w7:
.size ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7
___
}
@@ -1617,18 +2792,19 @@ ecp_nistz256_avx2_gather_w5:
___
$code.=<<___ if ($win64);
lea -0x88(%rsp), %rax
+ mov %rsp,%r11
.LSEH_begin_ecp_nistz256_avx2_gather_w5:
- .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax), %rsp
- .byte 0xc5,0xf8,0x29,0x70,0xe0 #vmovaps %xmm6, -0x20(%rax)
- .byte 0xc5,0xf8,0x29,0x78,0xf0 #vmovaps %xmm7, -0x10(%rax)
- .byte 0xc5,0x78,0x29,0x40,0x00 #vmovaps %xmm8, 8(%rax)
- .byte 0xc5,0x78,0x29,0x48,0x10 #vmovaps %xmm9, 0x10(%rax)
- .byte 0xc5,0x78,0x29,0x50,0x20 #vmovaps %xmm10, 0x20(%rax)
- .byte 0xc5,0x78,0x29,0x58,0x30 #vmovaps %xmm11, 0x30(%rax)
- .byte 0xc5,0x78,0x29,0x60,0x40 #vmovaps %xmm12, 0x40(%rax)
- .byte 0xc5,0x78,0x29,0x68,0x50 #vmovaps %xmm13, 0x50(%rax)
- .byte 0xc5,0x78,0x29,0x70,0x60 #vmovaps %xmm14, 0x60(%rax)
- .byte 0xc5,0x78,0x29,0x78,0x70 #vmovaps %xmm15, 0x70(%rax)
+ .byte 0x48,0x8d,0x60,0xe0 # lea -0x20(%rax), %rsp
+ .byte 0xc5,0xf8,0x29,0x70,0xe0 # vmovaps %xmm6, -0x20(%rax)
+ .byte 0xc5,0xf8,0x29,0x78,0xf0 # vmovaps %xmm7, -0x10(%rax)
+ .byte 0xc5,0x78,0x29,0x40,0x00 # vmovaps %xmm8, 8(%rax)
+ .byte 0xc5,0x78,0x29,0x48,0x10 # vmovaps %xmm9, 0x10(%rax)
+ .byte 0xc5,0x78,0x29,0x50,0x20 # vmovaps %xmm10, 0x20(%rax)
+ .byte 0xc5,0x78,0x29,0x58,0x30 # vmovaps %xmm11, 0x30(%rax)
+ .byte 0xc5,0x78,0x29,0x60,0x40 # vmovaps %xmm12, 0x40(%rax)
+ .byte 0xc5,0x78,0x29,0x68,0x50 # vmovaps %xmm13, 0x50(%rax)
+ .byte 0xc5,0x78,0x29,0x70,0x60 # vmovaps %xmm14, 0x60(%rax)
+ .byte 0xc5,0x78,0x29,0x78,0x70 # vmovaps %xmm15, 0x70(%rax)
___
$code.=<<___;
vmovdqa .LTwo(%rip), $TWO
@@ -1694,11 +2870,11 @@ $code.=<<___ if ($win64);
movaps 0x70(%rsp), %xmm13
movaps 0x80(%rsp), %xmm14
movaps 0x90(%rsp), %xmm15
- lea 0xa8(%rsp), %rsp
-.LSEH_end_ecp_nistz256_avx2_gather_w5:
+ lea (%r11), %rsp
___
$code.=<<___;
ret
+.LSEH_end_ecp_nistz256_avx2_gather_w5:
.size ecp_nistz256_avx2_gather_w5,.-ecp_nistz256_avx2_gather_w5
___
}
@@ -1721,19 +2897,20 @@ ecp_nistz256_avx2_gather_w7:
vzeroupper
___
$code.=<<___ if ($win64);
+ mov %rsp,%r11
lea -0x88(%rsp), %rax
.LSEH_begin_ecp_nistz256_avx2_gather_w7:
- .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax), %rsp
- .byte 0xc5,0xf8,0x29,0x70,0xe0 #vmovaps %xmm6, -0x20(%rax)
- .byte 0xc5,0xf8,0x29,0x78,0xf0 #vmovaps %xmm7, -0x10(%rax)
- .byte 0xc5,0x78,0x29,0x40,0x00 #vmovaps %xmm8, 8(%rax)
- .byte 0xc5,0x78,0x29,0x48,0x10 #vmovaps %xmm9, 0x10(%rax)
- .byte 0xc5,0x78,0x29,0x50,0x20 #vmovaps %xmm10, 0x20(%rax)
- .byte 0xc5,0x78,0x29,0x58,0x30 #vmovaps %xmm11, 0x30(%rax)
- .byte 0xc5,0x78,0x29,0x60,0x40 #vmovaps %xmm12, 0x40(%rax)
- .byte 0xc5,0x78,0x29,0x68,0x50 #vmovaps %xmm13, 0x50(%rax)
- .byte 0xc5,0x78,0x29,0x70,0x60 #vmovaps %xmm14, 0x60(%rax)
- .byte 0xc5,0x78,0x29,0x78,0x70 #vmovaps %xmm15, 0x70(%rax)
+ .byte 0x48,0x8d,0x60,0xe0 # lea -0x20(%rax), %rsp
+ .byte 0xc5,0xf8,0x29,0x70,0xe0 # vmovaps %xmm6, -0x20(%rax)
+ .byte 0xc5,0xf8,0x29,0x78,0xf0 # vmovaps %xmm7, -0x10(%rax)
+ .byte 0xc5,0x78,0x29,0x40,0x00 # vmovaps %xmm8, 8(%rax)
+ .byte 0xc5,0x78,0x29,0x48,0x10 # vmovaps %xmm9, 0x10(%rax)
+ .byte 0xc5,0x78,0x29,0x50,0x20 # vmovaps %xmm10, 0x20(%rax)
+ .byte 0xc5,0x78,0x29,0x58,0x30 # vmovaps %xmm11, 0x30(%rax)
+ .byte 0xc5,0x78,0x29,0x60,0x40 # vmovaps %xmm12, 0x40(%rax)
+ .byte 0xc5,0x78,0x29,0x68,0x50 # vmovaps %xmm13, 0x50(%rax)
+ .byte 0xc5,0x78,0x29,0x70,0x60 # vmovaps %xmm14, 0x60(%rax)
+ .byte 0xc5,0x78,0x29,0x78,0x70 # vmovaps %xmm15, 0x70(%rax)
___
$code.=<<___;
vmovdqa .LThree(%rip), $THREE
@@ -1814,11 +2991,11 @@ $code.=<<___ if ($win64);
movaps 0x70(%rsp), %xmm13
movaps 0x80(%rsp), %xmm14
movaps 0x90(%rsp), %xmm15
- lea 0xa8(%rsp), %rsp
-.LSEH_end_ecp_nistz256_avx2_gather_w7:
+ lea (%r11), %rsp
___
$code.=<<___;
ret
+.LSEH_end_ecp_nistz256_avx2_gather_w7:
.size ecp_nistz256_avx2_gather_w7,.-ecp_nistz256_avx2_gather_w7
___
} else {
@@ -2022,6 +3199,7 @@ $code.=<<___;
.type ecp_nistz256_point_double,\@function,2
.align 32
ecp_nistz256_point_double:
+.cfi_startproc
___
$code.=<<___ if ($addx);
mov \$0x80100, %ecx
@@ -2038,17 +3216,26 @@ $code.=<<___;
.type ecp_nistz256_point_doublex,\@function,2
.align 32
ecp_nistz256_point_doublex:
+.cfi_startproc
.Lpoint_doublex:
___
}
$code.=<<___;
push %rbp
+.cfi_push %rbp
push %rbx
+.cfi_push %rbx
push %r12
+.cfi_push %r12
push %r13
+.cfi_push %r13
push %r14
+.cfi_push %r14
push %r15
+.cfi_push %r15
sub \$32*5+8, %rsp
+.cfi_adjust_cfa_offset 32*5+8
+.Lpoint_double${x}_body:
.Lpoint_double_shortcut$x:
movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr.x
@@ -2114,7 +3301,7 @@ $code.=<<___;
movq %xmm1, $r_ptr
call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(res_y, S);
___
-{
+{
######## ecp_nistz256_div_by_2(res_y, res_y); ##########################
# operate in 4-5-6-7 "name space" that matches squaring output
#
@@ -2203,7 +3390,7 @@ $code.=<<___;
lea $M(%rsp), $b_ptr
mov $acc4, $acc6 # harmonize sub output and mul input
xor %ecx, %ecx
- mov $acc4, $S+8*0(%rsp) # have to save:-(
+ mov $acc4, $S+8*0(%rsp) # have to save:-(
mov $acc5, $acc2
mov $acc5, $S+8*1(%rsp)
cmovz $acc0, $acc3
@@ -2219,14 +3406,25 @@ $code.=<<___;
movq %xmm1, $r_ptr
call __ecp_nistz256_sub_from$x # p256_sub(res_y, S, res_y);
- add \$32*5+8, %rsp
- pop %r15
- pop %r14
- pop %r13
- pop %r12
- pop %rbx
- pop %rbp
+ lea 32*5+56(%rsp), %rsi
+.cfi_def_cfa %rsi,8
+ mov -48(%rsi),%r15
+.cfi_restore %r15
+ mov -40(%rsi),%r14
+.cfi_restore %r14
+ mov -32(%rsi),%r13
+.cfi_restore %r13
+ mov -24(%rsi),%r12
+.cfi_restore %r12
+ mov -16(%rsi),%rbx
+.cfi_restore %rbx
+ mov -8(%rsi),%rbp
+.cfi_restore %rbp
+ lea (%rsi),%rsp
+.cfi_def_cfa_register %rsp
+.Lpoint_double${x}_epilogue:
ret
+.cfi_endproc
.size ecp_nistz256_point_double$sfx,.-ecp_nistz256_point_double$sfx
___
}
@@ -2252,6 +3450,7 @@ $code.=<<___;
.type ecp_nistz256_point_add,\@function,3
.align 32
ecp_nistz256_point_add:
+.cfi_startproc
___
$code.=<<___ if ($addx);
mov \$0x80100, %ecx
@@ -2268,17 +3467,26 @@ $code.=<<___;
.type ecp_nistz256_point_addx,\@function,3
.align 32
ecp_nistz256_point_addx:
+.cfi_startproc
.Lpoint_addx:
___
}
$code.=<<___;
push %rbp
+.cfi_push %rbp
push %rbx
+.cfi_push %rbx
push %r12
+.cfi_push %r12
push %r13
+.cfi_push %r13
push %r14
+.cfi_push %r14
push %r15
+.cfi_push %r15
sub \$32*18+8, %rsp
+.cfi_adjust_cfa_offset 32*18+8
+.Lpoint_add${x}_body:
movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr
movdqu 0x10($a_ptr), %xmm1
@@ -2587,14 +3795,25 @@ $code.=<<___;
movdqu %xmm3, 0x30($r_ptr)
.Ladd_done$x:
- add \$32*18+8, %rsp
- pop %r15
- pop %r14
- pop %r13
- pop %r12
- pop %rbx
- pop %rbp
+ lea 32*18+56(%rsp), %rsi
+.cfi_def_cfa %rsi,8
+ mov -48(%rsi),%r15
+.cfi_restore %r15
+ mov -40(%rsi),%r14
+.cfi_restore %r14
+ mov -32(%rsi),%r13
+.cfi_restore %r13
+ mov -24(%rsi),%r12
+.cfi_restore %r12
+ mov -16(%rsi),%rbx
+.cfi_restore %rbx
+ mov -8(%rsi),%rbp
+.cfi_restore %rbp
+ lea (%rsi),%rsp
+.cfi_def_cfa_register %rsp
+.Lpoint_add${x}_epilogue:
ret
+.cfi_endproc
.size ecp_nistz256_point_add$sfx,.-ecp_nistz256_point_add$sfx
___
}
@@ -2619,6 +3838,7 @@ $code.=<<___;
.type ecp_nistz256_point_add_affine,\@function,3
.align 32
ecp_nistz256_point_add_affine:
+.cfi_startproc
___
$code.=<<___ if ($addx);
mov \$0x80100, %ecx
@@ -2635,17 +3855,26 @@ $code.=<<___;
.type ecp_nistz256_point_add_affinex,\@function,3
.align 32
ecp_nistz256_point_add_affinex:
+.cfi_startproc
.Lpoint_add_affinex:
___
}
$code.=<<___;
push %rbp
+.cfi_push %rbp
push %rbx
+.cfi_push %rbx
push %r12
+.cfi_push %r12
push %r13
+.cfi_push %r13
push %r14
+.cfi_push %r14
push %r15
+.cfi_push %r15
sub \$32*15+8, %rsp
+.cfi_adjust_cfa_offset 32*15+8
+.Ladd_affine${x}_body:
movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr
mov $b_org, $b_ptr # reassign
@@ -2890,14 +4119,25 @@ $code.=<<___;
movdqu %xmm2, 0x20($r_ptr)
movdqu %xmm3, 0x30($r_ptr)
- add \$32*15+8, %rsp
- pop %r15
- pop %r14
- pop %r13
- pop %r12
- pop %rbx
- pop %rbp
+ lea 32*15+56(%rsp), %rsi
+.cfi_def_cfa %rsi,8
+ mov -48(%rsi),%r15
+.cfi_restore %r15
+ mov -40(%rsi),%r14
+.cfi_restore %r14
+ mov -32(%rsi),%r13
+.cfi_restore %r13
+ mov -24(%rsi),%r12
+.cfi_restore %r12
+ mov -16(%rsi),%rbx
+.cfi_restore %rbx
+ mov -8(%rsi),%rbp
+.cfi_restore %rbp
+ lea (%rsi),%rsp
+.cfi_def_cfa_register %rsp
+.Ladd_affine${x}_epilogue:
ret
+.cfi_endproc
.size ecp_nistz256_point_add_affine$sfx,.-ecp_nistz256_point_add_affine$sfx
___
}
@@ -3048,11 +4288,395 @@ ___
}
}}}
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+# CONTEXT *context,DISPATCHER_CONTEXT *disp)
+if ($win64) {
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___;
+.extern __imp_RtlVirtualUnwind
+
+.type short_handler,\@abi-omnipotent
+.align 16
+short_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ mov 8($disp),%rsi # disp->ImageBase
+ mov 56($disp),%r11 # disp->HandlerData
+
+ mov 0(%r11),%r10d # HandlerData[0]
+ lea (%rsi,%r10),%r10 # end of prologue label
+ cmp %r10,%rbx # context->Rip<end of prologue label
+ jb .Lcommon_seh_tail
+
+ mov 152($context),%rax # pull context->Rsp
+
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # epilogue label
+ cmp %r10,%rbx # context->Rip>=epilogue label
+ jae .Lcommon_seh_tail
+
+ lea 16(%rax),%rax
+
+ mov -8(%rax),%r12
+ mov -16(%rax),%r13
+ mov %r12,216($context) # restore context->R12
+ mov %r13,224($context) # restore context->R13
+
+ jmp .Lcommon_seh_tail
+.size short_handler,.-short_handler
+
+.type full_handler,\@abi-omnipotent
+.align 16
+full_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ mov 8($disp),%rsi # disp->ImageBase
+ mov 56($disp),%r11 # disp->HandlerData
+
+ mov 0(%r11),%r10d # HandlerData[0]
+ lea (%rsi,%r10),%r10 # end of prologue label
+ cmp %r10,%rbx # context->Rip<end of prologue label
+ jb .Lcommon_seh_tail
+
+ mov 152($context),%rax # pull context->Rsp
+
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # epilogue label
+ cmp %r10,%rbx # context->Rip>=epilogue label
+ jae .Lcommon_seh_tail
+
+ mov 8(%r11),%r10d # HandlerData[2]
+ lea (%rax,%r10),%rax
+
+ mov -8(%rax),%rbp
+ mov -16(%rax),%rbx
+ mov -24(%rax),%r12
+ mov -32(%rax),%r13
+ mov -40(%rax),%r14
+ mov -48(%rax),%r15
+ mov %rbx,144($context) # restore context->Rbx
+ mov %rbp,160($context) # restore context->Rbp
+ mov %r12,216($context) # restore context->R12
+ mov %r13,224($context) # restore context->R13
+ mov %r14,232($context) # restore context->R14
+ mov %r15,240($context) # restore context->R15
+
+.Lcommon_seh_tail:
+ mov 8(%rax),%rdi
+ mov 16(%rax),%rsi
+ mov %rax,152($context) # restore context->Rsp
+ mov %rsi,168($context) # restore context->Rsi
+ mov %rdi,176($context) # restore context->Rdi
+
+ mov 40($disp),%rdi # disp->ContextRecord
+ mov $context,%rsi # context
+ mov \$154,%ecx # sizeof(CONTEXT)
+ .long 0xa548f3fc # cld; rep movsq
+
+ mov $disp,%rsi
+ xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
+ mov 8(%rsi),%rdx # arg2, disp->ImageBase
+ mov 0(%rsi),%r8 # arg3, disp->ControlPc
+ mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
+ mov 40(%rsi),%r10 # disp->ContextRecord
+ lea 56(%rsi),%r11 # &disp->HandlerData
+ lea 24(%rsi),%r12 # &disp->EstablisherFrame
+ mov %r10,32(%rsp) # arg5
+ mov %r11,40(%rsp) # arg6
+ mov %r12,48(%rsp) # arg7
+ mov %rcx,56(%rsp) # arg8, (NULL)
+ call *__imp_RtlVirtualUnwind(%rip)
+
+ mov \$1,%eax # ExceptionContinueSearch
+ add \$64,%rsp
+ popfq
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ pop %rdi
+ pop %rsi
+ ret
+.size full_handler,.-full_handler
+
+.section .pdata
+.align 4
+ .rva .LSEH_begin_ecp_nistz256_mul_by_2
+ .rva .LSEH_end_ecp_nistz256_mul_by_2
+ .rva .LSEH_info_ecp_nistz256_mul_by_2
+
+ .rva .LSEH_begin_ecp_nistz256_div_by_2
+ .rva .LSEH_end_ecp_nistz256_div_by_2
+ .rva .LSEH_info_ecp_nistz256_div_by_2
+
+ .rva .LSEH_begin_ecp_nistz256_mul_by_3
+ .rva .LSEH_end_ecp_nistz256_mul_by_3
+ .rva .LSEH_info_ecp_nistz256_mul_by_3
+
+ .rva .LSEH_begin_ecp_nistz256_add
+ .rva .LSEH_end_ecp_nistz256_add
+ .rva .LSEH_info_ecp_nistz256_add
+
+ .rva .LSEH_begin_ecp_nistz256_sub
+ .rva .LSEH_end_ecp_nistz256_sub
+ .rva .LSEH_info_ecp_nistz256_sub
+
+ .rva .LSEH_begin_ecp_nistz256_neg
+ .rva .LSEH_end_ecp_nistz256_neg
+ .rva .LSEH_info_ecp_nistz256_neg
+
+ .rva .LSEH_begin_ecp_nistz256_ord_mul_mont
+ .rva .LSEH_end_ecp_nistz256_ord_mul_mont
+ .rva .LSEH_info_ecp_nistz256_ord_mul_mont
+
+ .rva .LSEH_begin_ecp_nistz256_ord_sqr_mont
+ .rva .LSEH_end_ecp_nistz256_ord_sqr_mont
+ .rva .LSEH_info_ecp_nistz256_ord_sqr_mont
+___
+$code.=<<___ if ($addx);
+ .rva .LSEH_begin_ecp_nistz256_ord_mul_montx
+ .rva .LSEH_end_ecp_nistz256_ord_mul_montx
+ .rva .LSEH_info_ecp_nistz256_ord_mul_montx
+
+ .rva .LSEH_begin_ecp_nistz256_ord_sqr_montx
+ .rva .LSEH_end_ecp_nistz256_ord_sqr_montx
+ .rva .LSEH_info_ecp_nistz256_ord_sqr_montx
+___
+$code.=<<___;
+ .rva .LSEH_begin_ecp_nistz256_to_mont
+ .rva .LSEH_end_ecp_nistz256_to_mont
+ .rva .LSEH_info_ecp_nistz256_to_mont
+
+ .rva .LSEH_begin_ecp_nistz256_mul_mont
+ .rva .LSEH_end_ecp_nistz256_mul_mont
+ .rva .LSEH_info_ecp_nistz256_mul_mont
+
+ .rva .LSEH_begin_ecp_nistz256_sqr_mont
+ .rva .LSEH_end_ecp_nistz256_sqr_mont
+ .rva .LSEH_info_ecp_nistz256_sqr_mont
+
+ .rva .LSEH_begin_ecp_nistz256_from_mont
+ .rva .LSEH_end_ecp_nistz256_from_mont
+ .rva .LSEH_info_ecp_nistz256_from_mont
+
+ .rva .LSEH_begin_ecp_nistz256_gather_w5
+ .rva .LSEH_end_ecp_nistz256_gather_w5
+ .rva .LSEH_info_ecp_nistz256_gather_wX
+
+ .rva .LSEH_begin_ecp_nistz256_gather_w7
+ .rva .LSEH_end_ecp_nistz256_gather_w7
+ .rva .LSEH_info_ecp_nistz256_gather_wX
+___
+$code.=<<___ if ($avx>1);
+ .rva .LSEH_begin_ecp_nistz256_avx2_gather_w5
+ .rva .LSEH_end_ecp_nistz256_avx2_gather_w5
+ .rva .LSEH_info_ecp_nistz256_avx2_gather_wX
+
+ .rva .LSEH_begin_ecp_nistz256_avx2_gather_w7
+ .rva .LSEH_end_ecp_nistz256_avx2_gather_w7
+ .rva .LSEH_info_ecp_nistz256_avx2_gather_wX
+___
+$code.=<<___;
+ .rva .LSEH_begin_ecp_nistz256_point_double
+ .rva .LSEH_end_ecp_nistz256_point_double
+ .rva .LSEH_info_ecp_nistz256_point_double
+
+ .rva .LSEH_begin_ecp_nistz256_point_add
+ .rva .LSEH_end_ecp_nistz256_point_add
+ .rva .LSEH_info_ecp_nistz256_point_add
+
+ .rva .LSEH_begin_ecp_nistz256_point_add_affine
+ .rva .LSEH_end_ecp_nistz256_point_add_affine
+ .rva .LSEH_info_ecp_nistz256_point_add_affine
+___
+$code.=<<___ if ($addx);
+ .rva .LSEH_begin_ecp_nistz256_point_doublex
+ .rva .LSEH_end_ecp_nistz256_point_doublex
+ .rva .LSEH_info_ecp_nistz256_point_doublex
+
+ .rva .LSEH_begin_ecp_nistz256_point_addx
+ .rva .LSEH_end_ecp_nistz256_point_addx
+ .rva .LSEH_info_ecp_nistz256_point_addx
+
+ .rva .LSEH_begin_ecp_nistz256_point_add_affinex
+ .rva .LSEH_end_ecp_nistz256_point_add_affinex
+ .rva .LSEH_info_ecp_nistz256_point_add_affinex
+___
+$code.=<<___;
+
+.section .xdata
+.align 8
+.LSEH_info_ecp_nistz256_mul_by_2:
+ .byte 9,0,0,0
+ .rva short_handler
+ .rva .Lmul_by_2_body,.Lmul_by_2_epilogue # HandlerData[]
+.LSEH_info_ecp_nistz256_div_by_2:
+ .byte 9,0,0,0
+ .rva short_handler
+ .rva .Ldiv_by_2_body,.Ldiv_by_2_epilogue # HandlerData[]
+.LSEH_info_ecp_nistz256_mul_by_3:
+ .byte 9,0,0,0
+ .rva short_handler
+ .rva .Lmul_by_3_body,.Lmul_by_3_epilogue # HandlerData[]
+.LSEH_info_ecp_nistz256_add:
+ .byte 9,0,0,0
+ .rva short_handler
+ .rva .Ladd_body,.Ladd_epilogue # HandlerData[]
+.LSEH_info_ecp_nistz256_sub:
+ .byte 9,0,0,0
+ .rva short_handler
+ .rva .Lsub_body,.Lsub_epilogue # HandlerData[]
+.LSEH_info_ecp_nistz256_neg:
+ .byte 9,0,0,0
+ .rva short_handler
+ .rva .Lneg_body,.Lneg_epilogue # HandlerData[]
+.LSEH_info_ecp_nistz256_ord_mul_mont:
+ .byte 9,0,0,0
+ .rva full_handler
+ .rva .Lord_mul_body,.Lord_mul_epilogue # HandlerData[]
+ .long 48,0
+.LSEH_info_ecp_nistz256_ord_sqr_mont:
+ .byte 9,0,0,0
+ .rva full_handler
+ .rva .Lord_sqr_body,.Lord_sqr_epilogue # HandlerData[]
+ .long 48,0
+___
+$code.=<<___ if ($addx);
+.LSEH_info_ecp_nistz256_ord_mul_montx:
+ .byte 9,0,0,0
+ .rva full_handler
+ .rva .Lord_mulx_body,.Lord_mulx_epilogue # HandlerData[]
+ .long 48,0
+.LSEH_info_ecp_nistz256_ord_sqr_montx:
+ .byte 9,0,0,0
+ .rva full_handler
+ .rva .Lord_sqrx_body,.Lord_sqrx_epilogue # HandlerData[]
+ .long 48,0
+___
+$code.=<<___;
+.LSEH_info_ecp_nistz256_to_mont:
+ .byte 9,0,0,0
+ .rva full_handler
+ .rva .Lmul_body,.Lmul_epilogue # HandlerData[]
+ .long 48,0
+.LSEH_info_ecp_nistz256_mul_mont:
+ .byte 9,0,0,0
+ .rva full_handler
+ .rva .Lmul_body,.Lmul_epilogue # HandlerData[]
+ .long 48,0
+.LSEH_info_ecp_nistz256_sqr_mont:
+ .byte 9,0,0,0
+ .rva full_handler
+ .rva .Lsqr_body,.Lsqr_epilogue # HandlerData[]
+ .long 48,0
+.LSEH_info_ecp_nistz256_from_mont:
+ .byte 9,0,0,0
+ .rva short_handler
+ .rva .Lfrom_body,.Lfrom_epilogue # HandlerData[]
+.LSEH_info_ecp_nistz256_gather_wX:
+ .byte 0x01,0x33,0x16,0x00
+ .byte 0x33,0xf8,0x09,0x00 #movaps 0x90(rsp),xmm15
+ .byte 0x2e,0xe8,0x08,0x00 #movaps 0x80(rsp),xmm14
+ .byte 0x29,0xd8,0x07,0x00 #movaps 0x70(rsp),xmm13
+ .byte 0x24,0xc8,0x06,0x00 #movaps 0x60(rsp),xmm12
+ .byte 0x1f,0xb8,0x05,0x00 #movaps 0x50(rsp),xmm11
+ .byte 0x1a,0xa8,0x04,0x00 #movaps 0x40(rsp),xmm10
+ .byte 0x15,0x98,0x03,0x00 #movaps 0x30(rsp),xmm9
+ .byte 0x10,0x88,0x02,0x00 #movaps 0x20(rsp),xmm8
+ .byte 0x0c,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7
+ .byte 0x08,0x68,0x00,0x00 #movaps 0x00(rsp),xmm6
+ .byte 0x04,0x01,0x15,0x00 #sub rsp,0xa8
+ .align 8
+___
+$code.=<<___ if ($avx>1);
+.LSEH_info_ecp_nistz256_avx2_gather_wX:
+ .byte 0x01,0x36,0x17,0x0b
+ .byte 0x36,0xf8,0x09,0x00 # vmovaps 0x90(rsp),xmm15
+ .byte 0x31,0xe8,0x08,0x00 # vmovaps 0x80(rsp),xmm14
+ .byte 0x2c,0xd8,0x07,0x00 # vmovaps 0x70(rsp),xmm13
+ .byte 0x27,0xc8,0x06,0x00 # vmovaps 0x60(rsp),xmm12
+ .byte 0x22,0xb8,0x05,0x00 # vmovaps 0x50(rsp),xmm11
+ .byte 0x1d,0xa8,0x04,0x00 # vmovaps 0x40(rsp),xmm10
+ .byte 0x18,0x98,0x03,0x00 # vmovaps 0x30(rsp),xmm9
+ .byte 0x13,0x88,0x02,0x00 # vmovaps 0x20(rsp),xmm8
+ .byte 0x0e,0x78,0x01,0x00 # vmovaps 0x10(rsp),xmm7
+ .byte 0x09,0x68,0x00,0x00 # vmovaps 0x00(rsp),xmm6
+ .byte 0x04,0x01,0x15,0x00 # sub rsp,0xa8
+ .byte 0x00,0xb3,0x00,0x00 # set_frame r11
+ .align 8
+___
+$code.=<<___;
+.LSEH_info_ecp_nistz256_point_double:
+ .byte 9,0,0,0
+ .rva full_handler
+ .rva .Lpoint_doubleq_body,.Lpoint_doubleq_epilogue # HandlerData[]
+ .long 32*5+56,0
+.LSEH_info_ecp_nistz256_point_add:
+ .byte 9,0,0,0
+ .rva full_handler
+ .rva .Lpoint_addq_body,.Lpoint_addq_epilogue # HandlerData[]
+ .long 32*18+56,0
+.LSEH_info_ecp_nistz256_point_add_affine:
+ .byte 9,0,0,0
+ .rva full_handler
+ .rva .Ladd_affineq_body,.Ladd_affineq_epilogue # HandlerData[]
+ .long 32*15+56,0
+___
+$code.=<<___ if ($addx);
+.align 8
+.LSEH_info_ecp_nistz256_point_doublex:
+ .byte 9,0,0,0
+ .rva full_handler
+ .rva .Lpoint_doublex_body,.Lpoint_doublex_epilogue # HandlerData[]
+ .long 32*5+56,0
+.LSEH_info_ecp_nistz256_point_addx:
+ .byte 9,0,0,0
+ .rva full_handler
+ .rva .Lpoint_addx_body,.Lpoint_addx_epilogue # HandlerData[]
+ .long 32*18+56,0
+.LSEH_info_ecp_nistz256_point_add_affinex:
+ .byte 9,0,0,0
+ .rva full_handler
+ .rva .Ladd_affinex_body,.Ladd_affinex_epilogue # HandlerData[]
+ .long 32*15+56,0
+___
+}
+
########################################################################
# Convert ecp_nistz256_table.c to layout expected by ecp_nistz_gather_w7
#
-open TABLE,"<ecp_nistz256_table.c" or
-open TABLE,"<${dir}../ecp_nistz256_table.c" or
+open TABLE,"<ecp_nistz256_table.c" or
+open TABLE,"<${dir}../ecp_nistz256_table.c" or
die "failed to open ecp_nistz256_table.c:",$!;
use integer;