diff options
Diffstat (limited to 'deps/openssl/openssl/crypto/chacha/asm/chacha-ppc.pl')
-rwxr-xr-x | deps/openssl/openssl/crypto/chacha/asm/chacha-ppc.pl | 561 |
1 files changed, 478 insertions, 83 deletions
diff --git a/deps/openssl/openssl/crypto/chacha/asm/chacha-ppc.pl b/deps/openssl/openssl/crypto/chacha/asm/chacha-ppc.pl index 181decdad9..f4f8610bf3 100755 --- a/deps/openssl/openssl/crypto/chacha/asm/chacha-ppc.pl +++ b/deps/openssl/openssl/crypto/chacha/asm/chacha-ppc.pl @@ -1,5 +1,5 @@ #! /usr/bin/env perl -# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved. +# Copyright 2016-2018 The OpenSSL Project Authors. All Rights Reserved. # # Licensed under the OpenSSL license (the "License"). You may not use # this file except in compliance with the License. You can obtain a copy @@ -15,18 +15,34 @@ # ==================================================================== # # October 2015 -# +# # ChaCha20 for PowerPC/AltiVec. # +# June 2018 +# +# Add VSX 2.07 code path. Original 3xAltiVec+1xIALU is well-suited for +# processors that can't issue more than one vector instruction per +# cycle. But POWER8 (and POWER9) can issue a pair, and vector-only 4x +# interleave would perform better. Incidentally PowerISA 2.07 (first +# implemented by POWER8) defined new usable instructions, hence 4xVSX +# code path... +# # Performance in cycles per byte out of large buffer. # -# IALU/gcc-4.x 3xAltiVec+1xIALU +# IALU/gcc-4.x 3xAltiVec+1xIALU 4xVSX +# +# Freescale e300 13.6/+115% - - +# PPC74x0/G4e 6.81/+310% 3.81 - +# PPC970/G5 9.29/+160% ? - +# POWER7 8.62/+61% 3.35 - +# POWER8 8.70/+51% 2.91 2.09 +# POWER9 8.80/+29% 4.44(*) 2.45(**) # -# Freescale e300 13.6/+115% - -# PPC74x0/G4e 6.81/+310% 4.66 -# PPC970/G5 9.29/+160% 4.60 -# POWER7 8.62/+61% 4.27 -# POWER8 8.70/+51% 3.96 +# (*) this is trade-off result, it's possible to improve it, but +# then it would negatively affect all others; +# (**) POWER9 seems to be "allergic" to mixing vector and integer +# instructions, which is why switch to vector-only code pays +# off that much; $flavour = shift; @@ -391,19 +407,19 @@ Loop_tail: # byte-by-byte loop ___ {{{ -my ($A0,$B0,$C0,$D0,$A1,$B1,$C1,$D1,$A2,$B2,$C2,$D2,$T0,$T1,$T2) = - map("v$_",(0..14)); -my (@K)=map("v$_",(15..20)); -my ($FOUR,$sixteen,$twenty4,$twenty,$twelve,$twenty5,$seven) = - map("v$_",(21..27)); -my ($inpperm,$outperm,$outmask) = map("v$_",(28..30)); -my @D=("v31",$seven,$T0,$T1,$T2); +my ($A0,$B0,$C0,$D0,$A1,$B1,$C1,$D1,$A2,$B2,$C2,$D2) + = map("v$_",(0..11)); +my @K = map("v$_",(12..17)); +my ($FOUR,$sixteen,$twenty4) = map("v$_",(18..19,23)); +my ($inpperm,$outperm,$outmask) = map("v$_",(24..26)); +my @D = map("v$_",(27..31)); +my ($twelve,$seven,$T0,$T1) = @D; -my $FRAME=$LOCALS+64+13*16+18*$SIZE_T; # 13*16 is for v20-v31 offload +my $FRAME=$LOCALS+64+10*16+18*$SIZE_T; # 10*16 is for v23-v31 offload sub VMXROUND { my $odd = pop; -my ($a,$b,$c,$d,$t)=@_; +my ($a,$b,$c,$d)=@_; ( "&vadduwm ('$a','$a','$b')", @@ -411,24 +427,20 @@ my ($a,$b,$c,$d,$t)=@_; "&vperm ('$d','$d','$d','$sixteen')", "&vadduwm ('$c','$c','$d')", - "&vxor ('$t','$b','$c')", - "&vsrw ('$b','$t','$twenty')", - "&vslw ('$t','$t','$twelve')", - "&vor ('$b','$b','$t')", + "&vxor ('$b','$b','$c')", + "&vrlw ('$b','$b','$twelve')", "&vadduwm ('$a','$a','$b')", "&vxor ('$d','$d','$a')", "&vperm ('$d','$d','$d','$twenty4')", "&vadduwm ('$c','$c','$d')", - "&vxor ('$t','$b','$c')", - "&vsrw ('$b','$t','$twenty5')", - "&vslw ('$t','$t','$seven')", - "&vor ('$b','$b','$t')", - - "&vsldoi ('$c','$c','$c',8)", - "&vsldoi ('$b','$b','$b',$odd?4:12)", - "&vsldoi ('$d','$d','$d',$odd?12:4)" + "&vxor ('$b','$b','$c')", + "&vrlw ('$b','$b','$seven')", + + "&vrldoi ('$c','$c',8)", + "&vrldoi ('$b','$b',$odd?4:12)", + "&vrldoi ('$d','$d',$odd?12:4)" ); } @@ -445,28 +457,22 @@ $code.=<<___; li r10,`15+$LOCALS+64` li r11,`31+$LOCALS+64` mfspr r12,256 - stvx v20,r10,$sp - addi r10,r10,32 - stvx v21,r11,$sp - addi r11,r11,32 - stvx v22,r10,$sp + stvx v23,r10,$sp addi r10,r10,32 - stvx v23,r11,$sp + stvx v24,r11,$sp addi r11,r11,32 - stvx v24,r10,$sp + stvx v25,r10,$sp addi r10,r10,32 - stvx v25,r11,$sp + stvx v26,r11,$sp addi r11,r11,32 - stvx v26,r10,$sp + stvx v27,r10,$sp addi r10,r10,32 - stvx v27,r11,$sp + stvx v28,r11,$sp addi r11,r11,32 - stvx v28,r10,$sp + stvx v29,r10,$sp addi r10,r10,32 - stvx v29,r11,$sp - addi r11,r11,32 - stvx v30,r10,$sp - stvx v31,r11,$sp + stvx v30,r11,$sp + stvx v31,r10,$sp stw r12,`$FRAME-$SIZE_T*18-4`($sp) # save vrsave $PUSH r14,`$FRAME-$SIZE_T*18`($sp) $PUSH r15,`$FRAME-$SIZE_T*17`($sp) @@ -486,9 +492,9 @@ $code.=<<___; $PUSH r29,`$FRAME-$SIZE_T*3`($sp) $PUSH r30,`$FRAME-$SIZE_T*2`($sp) $PUSH r31,`$FRAME-$SIZE_T*1`($sp) - li r12,-1 + li r12,-4096+511 $PUSH r0, `$FRAME+$LRSAVE`($sp) - mtspr 256,r12 # preserve all AltiVec registers + mtspr 256,r12 # preserve 29 AltiVec registers bl Lconsts # returns pointer Lsigma in r12 li @x[0],16 @@ -525,11 +531,6 @@ $code.=<<___; lwz @d[3],12($ctr) vadduwm @K[5],@K[4],@K[5] - vspltisw $twenty,-12 # synthesize constants - vspltisw $twelve,12 - vspltisw $twenty5,-7 - #vspltisw $seven,7 # synthesized in the loop - vxor $T0,$T0,$T0 # 0x00..00 vspltisw $outmask,-1 # 0xff..ff ?lvsr $inpperm,0,$inp # prepare for unaligned load @@ -542,6 +543,7 @@ $code.=<<___; be?vxor $outperm,$outperm,$T1 be?vperm $inpperm,$inpperm,$inpperm,$T0 + li r0,10 # inner loop counter b Loop_outer_vmx .align 4 @@ -559,7 +561,6 @@ Loop_outer_vmx: ori @x[3],@x[3],0x6574 vmr $B0,@K[1] - li r0,10 # inner loop counter lwz @x[4],0($key) # load key to GPR vmr $B1,@K[1] lwz @x[5],4($key) @@ -585,33 +586,45 @@ Loop_outer_vmx: mr @t[1],@x[5] mr @t[2],@x[6] mr @t[3],@x[7] + + vspltisw $twelve,12 # synthesize constants vspltisw $seven,7 mtctr r0 nop Loop_vmx: ___ - my @thread0=&VMXROUND($A0,$B0,$C0,$D0,$T0,0); - my @thread1=&VMXROUND($A1,$B1,$C1,$D1,$T1,0); - my @thread2=&VMXROUND($A2,$B2,$C2,$D2,$T2,0); + my @thread0=&VMXROUND($A0,$B0,$C0,$D0,0); + my @thread1=&VMXROUND($A1,$B1,$C1,$D1,0); + my @thread2=&VMXROUND($A2,$B2,$C2,$D2,0); my @thread3=&ROUND(0,4,8,12); foreach (@thread0) { - eval; eval(shift(@thread3)); - eval(shift(@thread1)); eval(shift(@thread3)); - eval(shift(@thread2)); eval(shift(@thread3)); + eval; + eval(shift(@thread1)); + eval(shift(@thread2)); + + eval(shift(@thread3)); + eval(shift(@thread3)); + eval(shift(@thread3)); } + foreach (@thread3) { eval; } - @thread0=&VMXROUND($A0,$B0,$C0,$D0,$T0,1); - @thread1=&VMXROUND($A1,$B1,$C1,$D1,$T1,1); - @thread2=&VMXROUND($A2,$B2,$C2,$D2,$T2,1); + @thread0=&VMXROUND($A0,$B0,$C0,$D0,1); + @thread1=&VMXROUND($A1,$B1,$C1,$D1,1); + @thread2=&VMXROUND($A2,$B2,$C2,$D2,1); @thread3=&ROUND(0,5,10,15); foreach (@thread0) { - eval; eval(shift(@thread3)); - eval(shift(@thread1)); eval(shift(@thread3)); - eval(shift(@thread2)); eval(shift(@thread3)); + eval; + eval(shift(@thread1)); + eval(shift(@thread2)); + + eval(shift(@thread3)); + eval(shift(@thread3)); + eval(shift(@thread3)); } + foreach (@thread3) { eval; } $code.=<<___; bdnz Loop_vmx @@ -850,28 +863,22 @@ Ldone_vmx: li r10,`15+$LOCALS+64` li r11,`31+$LOCALS+64` mtspr 256,r12 # restore vrsave - lvx v20,r10,$sp + lvx v23,r10,$sp addi r10,r10,32 - lvx v21,r11,$sp + lvx v24,r11,$sp addi r11,r11,32 - lvx v22,r10,$sp + lvx v25,r10,$sp addi r10,r10,32 - lvx v23,r11,$sp + lvx v26,r11,$sp addi r11,r11,32 - lvx v24,r10,$sp + lvx v27,r10,$sp addi r10,r10,32 - lvx v25,r11,$sp + lvx v28,r11,$sp addi r11,r11,32 - lvx v26,r10,$sp - addi r10,r10,32 - lvx v27,r11,$sp - addi r11,r11,32 - lvx v28,r10,$sp + lvx v29,r10,$sp addi r10,r10,32 - lvx v29,r11,$sp - addi r11,r11,32 - lvx v30,r10,$sp - lvx v31,r11,$sp + lvx v30,r11,$sp + lvx v31,r10,$sp $POP r0, `$FRAME+$LRSAVE`($sp) $POP r14,`$FRAME-$SIZE_T*18`($sp) $POP r15,`$FRAME-$SIZE_T*17`($sp) @@ -898,12 +905,395 @@ Ldone_vmx: .byte 0,12,0x04,1,0x80,18,5,0 .long 0 .size .ChaCha20_ctr32_vmx,.-.ChaCha20_ctr32_vmx +___ +}}} +{{{ +my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, + $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3) = map("v$_",(0..15)); +my @K = map("v$_",(16..19)); +my $CTR = "v26"; +my ($xt0,$xt1,$xt2,$xt3) = map("v$_",(27..30)); +my ($sixteen,$twelve,$eight,$seven) = ($xt0,$xt1,$xt2,$xt3); +my $beperm = "v31"; + +my ($x00,$x10,$x20,$x30) = (0, map("r$_",(8..10))); +my $FRAME=$LOCALS+64+7*16; # 7*16 is for v26-v31 offload + +sub VSX_lane_ROUND { +my ($a0,$b0,$c0,$d0)=@_; +my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); +my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); +my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); +my @x=map("\"v$_\"",(0..15)); + + ( + "&vadduwm (@x[$a0],@x[$a0],@x[$b0])", # Q1 + "&vadduwm (@x[$a1],@x[$a1],@x[$b1])", # Q2 + "&vadduwm (@x[$a2],@x[$a2],@x[$b2])", # Q3 + "&vadduwm (@x[$a3],@x[$a3],@x[$b3])", # Q4 + "&vxor (@x[$d0],@x[$d0],@x[$a0])", + "&vxor (@x[$d1],@x[$d1],@x[$a1])", + "&vxor (@x[$d2],@x[$d2],@x[$a2])", + "&vxor (@x[$d3],@x[$d3],@x[$a3])", + "&vrlw (@x[$d0],@x[$d0],'$sixteen')", + "&vrlw (@x[$d1],@x[$d1],'$sixteen')", + "&vrlw (@x[$d2],@x[$d2],'$sixteen')", + "&vrlw (@x[$d3],@x[$d3],'$sixteen')", + + "&vadduwm (@x[$c0],@x[$c0],@x[$d0])", + "&vadduwm (@x[$c1],@x[$c1],@x[$d1])", + "&vadduwm (@x[$c2],@x[$c2],@x[$d2])", + "&vadduwm (@x[$c3],@x[$c3],@x[$d3])", + "&vxor (@x[$b0],@x[$b0],@x[$c0])", + "&vxor (@x[$b1],@x[$b1],@x[$c1])", + "&vxor (@x[$b2],@x[$b2],@x[$c2])", + "&vxor (@x[$b3],@x[$b3],@x[$c3])", + "&vrlw (@x[$b0],@x[$b0],'$twelve')", + "&vrlw (@x[$b1],@x[$b1],'$twelve')", + "&vrlw (@x[$b2],@x[$b2],'$twelve')", + "&vrlw (@x[$b3],@x[$b3],'$twelve')", + + "&vadduwm (@x[$a0],@x[$a0],@x[$b0])", + "&vadduwm (@x[$a1],@x[$a1],@x[$b1])", + "&vadduwm (@x[$a2],@x[$a2],@x[$b2])", + "&vadduwm (@x[$a3],@x[$a3],@x[$b3])", + "&vxor (@x[$d0],@x[$d0],@x[$a0])", + "&vxor (@x[$d1],@x[$d1],@x[$a1])", + "&vxor (@x[$d2],@x[$d2],@x[$a2])", + "&vxor (@x[$d3],@x[$d3],@x[$a3])", + "&vrlw (@x[$d0],@x[$d0],'$eight')", + "&vrlw (@x[$d1],@x[$d1],'$eight')", + "&vrlw (@x[$d2],@x[$d2],'$eight')", + "&vrlw (@x[$d3],@x[$d3],'$eight')", + + "&vadduwm (@x[$c0],@x[$c0],@x[$d0])", + "&vadduwm (@x[$c1],@x[$c1],@x[$d1])", + "&vadduwm (@x[$c2],@x[$c2],@x[$d2])", + "&vadduwm (@x[$c3],@x[$c3],@x[$d3])", + "&vxor (@x[$b0],@x[$b0],@x[$c0])", + "&vxor (@x[$b1],@x[$b1],@x[$c1])", + "&vxor (@x[$b2],@x[$b2],@x[$c2])", + "&vxor (@x[$b3],@x[$b3],@x[$c3])", + "&vrlw (@x[$b0],@x[$b0],'$seven')", + "&vrlw (@x[$b1],@x[$b1],'$seven')", + "&vrlw (@x[$b2],@x[$b2],'$seven')", + "&vrlw (@x[$b3],@x[$b3],'$seven')" + ); +} + +$code.=<<___; + +.globl .ChaCha20_ctr32_vsx +.align 5 +.ChaCha20_ctr32_vsx: + $STU $sp,-$FRAME($sp) + mflr r0 + li r10,`15+$LOCALS+64` + li r11,`31+$LOCALS+64` + mfspr r12,256 + stvx v26,r10,$sp + addi r10,r10,32 + stvx v27,r11,$sp + addi r11,r11,32 + stvx v28,r10,$sp + addi r10,r10,32 + stvx v29,r11,$sp + addi r11,r11,32 + stvx v30,r10,$sp + stvx v31,r11,$sp + stw r12,`$FRAME-4`($sp) # save vrsave + li r12,-4096+63 + $PUSH r0, `$FRAME+$LRSAVE`($sp) + mtspr 256,r12 # preserve 29 AltiVec registers + + bl Lconsts # returns pointer Lsigma in r12 + lvx_4w @K[0],0,r12 # load sigma + addi r12,r12,0x50 + li $x10,16 + li $x20,32 + li $x30,48 + li r11,64 + + lvx_4w @K[1],0,$key # load key + lvx_4w @K[2],$x10,$key + lvx_4w @K[3],0,$ctr # load counter + + vxor $xt0,$xt0,$xt0 + lvx_4w $xt1,r11,r12 + vspltw $CTR,@K[3],0 + vsldoi @K[3],@K[3],$xt0,4 + vsldoi @K[3],$xt0,@K[3],12 # clear @K[3].word[0] + vadduwm $CTR,$CTR,$xt1 + + be?lvsl $beperm,0,$x10 # 0x00..0f + be?vspltisb $xt0,3 # 0x03..03 + be?vxor $beperm,$beperm,$xt0 # swap bytes within words + + li r0,10 # inner loop counter + mtctr r0 + b Loop_outer_vsx + +.align 5 +Loop_outer_vsx: + lvx $xa0,$x00,r12 # load [smashed] sigma + lvx $xa1,$x10,r12 + lvx $xa2,$x20,r12 + lvx $xa3,$x30,r12 + + vspltw $xb0,@K[1],0 # smash the key + vspltw $xb1,@K[1],1 + vspltw $xb2,@K[1],2 + vspltw $xb3,@K[1],3 + + vspltw $xc0,@K[2],0 + vspltw $xc1,@K[2],1 + vspltw $xc2,@K[2],2 + vspltw $xc3,@K[2],3 + + vmr $xd0,$CTR # smash the counter + vspltw $xd1,@K[3],1 + vspltw $xd2,@K[3],2 + vspltw $xd3,@K[3],3 + + vspltisw $sixteen,-16 # synthesize constants + vspltisw $twelve,12 + vspltisw $eight,8 + vspltisw $seven,7 + +Loop_vsx: +___ + foreach (&VSX_lane_ROUND(0, 4, 8,12)) { eval; } + foreach (&VSX_lane_ROUND(0, 5,10,15)) { eval; } +$code.=<<___; + bdnz Loop_vsx + + vadduwm $xd0,$xd0,$CTR + + vmrgew $xt0,$xa0,$xa1 # transpose data + vmrgew $xt1,$xa2,$xa3 + vmrgow $xa0,$xa0,$xa1 + vmrgow $xa2,$xa2,$xa3 + vmrgew $xt2,$xb0,$xb1 + vmrgew $xt3,$xb2,$xb3 + vpermdi $xa1,$xa0,$xa2,0b00 + vpermdi $xa3,$xa0,$xa2,0b11 + vpermdi $xa0,$xt0,$xt1,0b00 + vpermdi $xa2,$xt0,$xt1,0b11 + + vmrgow $xb0,$xb0,$xb1 + vmrgow $xb2,$xb2,$xb3 + vmrgew $xt0,$xc0,$xc1 + vmrgew $xt1,$xc2,$xc3 + vpermdi $xb1,$xb0,$xb2,0b00 + vpermdi $xb3,$xb0,$xb2,0b11 + vpermdi $xb0,$xt2,$xt3,0b00 + vpermdi $xb2,$xt2,$xt3,0b11 + + vmrgow $xc0,$xc0,$xc1 + vmrgow $xc2,$xc2,$xc3 + vmrgew $xt2,$xd0,$xd1 + vmrgew $xt3,$xd2,$xd3 + vpermdi $xc1,$xc0,$xc2,0b00 + vpermdi $xc3,$xc0,$xc2,0b11 + vpermdi $xc0,$xt0,$xt1,0b00 + vpermdi $xc2,$xt0,$xt1,0b11 + + vmrgow $xd0,$xd0,$xd1 + vmrgow $xd2,$xd2,$xd3 + vspltisw $xt0,4 + vadduwm $CTR,$CTR,$xt0 # next counter value + vpermdi $xd1,$xd0,$xd2,0b00 + vpermdi $xd3,$xd0,$xd2,0b11 + vpermdi $xd0,$xt2,$xt3,0b00 + vpermdi $xd2,$xt2,$xt3,0b11 + + vadduwm $xa0,$xa0,@K[0] + vadduwm $xb0,$xb0,@K[1] + vadduwm $xc0,$xc0,@K[2] + vadduwm $xd0,$xd0,@K[3] + + be?vperm $xa0,$xa0,$xa0,$beperm + be?vperm $xb0,$xb0,$xb0,$beperm + be?vperm $xc0,$xc0,$xc0,$beperm + be?vperm $xd0,$xd0,$xd0,$beperm + + ${UCMP}i $len,0x40 + blt Ltail_vsx + + lvx_4w $xt0,$x00,$inp + lvx_4w $xt1,$x10,$inp + lvx_4w $xt2,$x20,$inp + lvx_4w $xt3,$x30,$inp + + vxor $xt0,$xt0,$xa0 + vxor $xt1,$xt1,$xb0 + vxor $xt2,$xt2,$xc0 + vxor $xt3,$xt3,$xd0 + + stvx_4w $xt0,$x00,$out + stvx_4w $xt1,$x10,$out + addi $inp,$inp,0x40 + stvx_4w $xt2,$x20,$out + subi $len,$len,0x40 + stvx_4w $xt3,$x30,$out + addi $out,$out,0x40 + beq Ldone_vsx + + vadduwm $xa0,$xa1,@K[0] + vadduwm $xb0,$xb1,@K[1] + vadduwm $xc0,$xc1,@K[2] + vadduwm $xd0,$xd1,@K[3] + + be?vperm $xa0,$xa0,$xa0,$beperm + be?vperm $xb0,$xb0,$xb0,$beperm + be?vperm $xc0,$xc0,$xc0,$beperm + be?vperm $xd0,$xd0,$xd0,$beperm + + ${UCMP}i $len,0x40 + blt Ltail_vsx + + lvx_4w $xt0,$x00,$inp + lvx_4w $xt1,$x10,$inp + lvx_4w $xt2,$x20,$inp + lvx_4w $xt3,$x30,$inp + + vxor $xt0,$xt0,$xa0 + vxor $xt1,$xt1,$xb0 + vxor $xt2,$xt2,$xc0 + vxor $xt3,$xt3,$xd0 + + stvx_4w $xt0,$x00,$out + stvx_4w $xt1,$x10,$out + addi $inp,$inp,0x40 + stvx_4w $xt2,$x20,$out + subi $len,$len,0x40 + stvx_4w $xt3,$x30,$out + addi $out,$out,0x40 + beq Ldone_vsx + + vadduwm $xa0,$xa2,@K[0] + vadduwm $xb0,$xb2,@K[1] + vadduwm $xc0,$xc2,@K[2] + vadduwm $xd0,$xd2,@K[3] + + be?vperm $xa0,$xa0,$xa0,$beperm + be?vperm $xb0,$xb0,$xb0,$beperm + be?vperm $xc0,$xc0,$xc0,$beperm + be?vperm $xd0,$xd0,$xd0,$beperm + + ${UCMP}i $len,0x40 + blt Ltail_vsx + + lvx_4w $xt0,$x00,$inp + lvx_4w $xt1,$x10,$inp + lvx_4w $xt2,$x20,$inp + lvx_4w $xt3,$x30,$inp + + vxor $xt0,$xt0,$xa0 + vxor $xt1,$xt1,$xb0 + vxor $xt2,$xt2,$xc0 + vxor $xt3,$xt3,$xd0 + + stvx_4w $xt0,$x00,$out + stvx_4w $xt1,$x10,$out + addi $inp,$inp,0x40 + stvx_4w $xt2,$x20,$out + subi $len,$len,0x40 + stvx_4w $xt3,$x30,$out + addi $out,$out,0x40 + beq Ldone_vsx + + vadduwm $xa0,$xa3,@K[0] + vadduwm $xb0,$xb3,@K[1] + vadduwm $xc0,$xc3,@K[2] + vadduwm $xd0,$xd3,@K[3] + + be?vperm $xa0,$xa0,$xa0,$beperm + be?vperm $xb0,$xb0,$xb0,$beperm + be?vperm $xc0,$xc0,$xc0,$beperm + be?vperm $xd0,$xd0,$xd0,$beperm + + ${UCMP}i $len,0x40 + blt Ltail_vsx + + lvx_4w $xt0,$x00,$inp + lvx_4w $xt1,$x10,$inp + lvx_4w $xt2,$x20,$inp + lvx_4w $xt3,$x30,$inp + + vxor $xt0,$xt0,$xa0 + vxor $xt1,$xt1,$xb0 + vxor $xt2,$xt2,$xc0 + vxor $xt3,$xt3,$xd0 + + stvx_4w $xt0,$x00,$out + stvx_4w $xt1,$x10,$out + addi $inp,$inp,0x40 + stvx_4w $xt2,$x20,$out + subi $len,$len,0x40 + stvx_4w $xt3,$x30,$out + addi $out,$out,0x40 + mtctr r0 + bne Loop_outer_vsx + +Ldone_vsx: + lwz r12,`$FRAME-4`($sp) # pull vrsave + li r10,`15+$LOCALS+64` + li r11,`31+$LOCALS+64` + $POP r0, `$FRAME+$LRSAVE`($sp) + mtspr 256,r12 # restore vrsave + lvx v26,r10,$sp + addi r10,r10,32 + lvx v27,r11,$sp + addi r11,r11,32 + lvx v28,r10,$sp + addi r10,r10,32 + lvx v29,r11,$sp + addi r11,r11,32 + lvx v30,r10,$sp + lvx v31,r11,$sp + mtlr r0 + addi $sp,$sp,$FRAME + blr + +.align 4 +Ltail_vsx: + addi r11,$sp,$LOCALS + mtctr $len + stvx_4w $xa0,$x00,r11 # offload block to stack + stvx_4w $xb0,$x10,r11 + stvx_4w $xc0,$x20,r11 + stvx_4w $xd0,$x30,r11 + subi r12,r11,1 # prepare for *++ptr + subi $inp,$inp,1 + subi $out,$out,1 + +Loop_tail_vsx: + lbzu r6,1(r12) + lbzu r7,1($inp) + xor r6,r6,r7 + stbu r6,1($out) + bdnz Loop_tail_vsx + + stvx_4w $K[0],$x00,r11 # wipe copy of the block + stvx_4w $K[0],$x10,r11 + stvx_4w $K[0],$x20,r11 + stvx_4w $K[0],$x30,r11 + + b Ldone_vsx + .long 0 + .byte 0,12,0x04,1,0x80,0,5,0 + .long 0 +.size .ChaCha20_ctr32_vsx,.-.ChaCha20_ctr32_vsx +___ +}}} +$code.=<<___; .align 5 Lconsts: mflr r0 bcl 20,31,\$+4 - mflr r12 #vvvvv "distance between . and _vpaes_consts + mflr r12 #vvvvv "distance between . and Lsigma addi r12,r12,`64-8` mtlr r0 blr @@ -924,10 +1314,14 @@ $code.=<<___ if (!$LITTLE_ENDIAN); # flipped words .long 0x01020300,0x05060704,0x090a0b08,0x0d0e0f0c ___ $code.=<<___; + .long 0x61707865,0x61707865,0x61707865,0x61707865 + .long 0x3320646e,0x3320646e,0x3320646e,0x3320646e + .long 0x79622d32,0x79622d32,0x79622d32,0x79622d32 + .long 0x6b206574,0x6b206574,0x6b206574,0x6b206574 + .long 0,1,2,3 .asciz "ChaCha20 for PowerPC/AltiVec, CRYPTOGAMS by <appro\@openssl.org>" .align 2 ___ -}}} foreach (split("\n",$code)) { s/\`([^\`]*)\`/eval $1/ge; @@ -940,11 +1334,12 @@ foreach (split("\n",$code)) { s/\?lvsr/lvsl/ or s/\?lvsl/lvsr/ or s/\?(vperm\s+v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+)/$1$3$2$4/ or - s/(vsldoi\s+v[0-9]+,\s*)(v[0-9]+,)\s*(v[0-9]+,\s*)([0-9]+)/$1$3$2 16-$4/; + s/vrldoi(\s+v[0-9]+,\s*)(v[0-9]+,)\s*([0-9]+)/vsldoi$1$2$2 16-$3/; } else { # little-endian s/le\?// or s/be\?/#be#/ or - s/\?([a-z]+)/$1/; + s/\?([a-z]+)/$1/ or + s/vrldoi(\s+v[0-9]+,\s*)(v[0-9]+,)\s*([0-9]+)/vsldoi$1$2$2 $3/; } print $_,"\n"; |