#!/usr/bin/env perl
# ghash-armv4.pl revision 291719
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# April 2010
#
# The module implements "4-bit" GCM GHASH function and underlying
# single multiplication operation in GF(2^128). "4-bit" means that it
# uses 256 bytes per-key table [+32 bytes shared table]. There is no
# experimental performance data available yet. The only approximation
# that can be made at this point is based on code size. Inner loop is
# 32 instructions long and on single-issue core should execute in <40
# cycles. Having verified that gcc 3.4 didn't unroll corresponding
# loop, this assembler loop body was found to be ~3x smaller than
# compiler-generated one...
#
# July 2010
#
# Rescheduling for dual-issue pipeline resulted in 8.5% improvement on
# Cortex A8 core and ~25 cycles per processed byte (which was observed
# to be ~3 times faster than gcc-generated code:-)
#
# February 2011
#
# Profiler-assisted and platform-specific optimization resulted in 7%
# improvement on Cortex A8 core and ~23.5 cycles per byte.
#
# March 2011
#
# Add NEON implementation featuring polynomial multiplication, i.e. no
# lookup tables involved. On Cortex A8 it was measured to process one
# byte in 15 cycles or 55% faster than integer-only code.
#
# April 2014
#
# Switch to multiplication algorithm suggested in paper referred
# below and combine it with reduction algorithm from x86 module.
# Performance improvement over previous version varies from 65% on
# Snapdragon S4 to 110% on Cortex A9. In absolute terms Cortex A8
# processes one byte in 8.45 cycles, A9 - in 10.2, Snapdragon S4 -
# in 9.33.
#
# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software
# Polynomial Multiplication on ARM Processors using the NEON Engine.
#
# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf

# ====================================================================
# Note about "528B" variant. In ARM case it makes lesser sense to
# implement it for following reasons:
#
# - performance improvement won't be anywhere near 50%, because 128-
#   bit shift operation is neatly fused with 128-bit xor here, and
#   "528B" variant would eliminate only 4-5 instructions out of 32
#   in the inner loop (meaning that estimated improvement is ~15%);
# - ARM-based systems are often embedded ones and extra memory
#   consumption might be unappreciated (for so little improvement);
#
# Byte order [in]dependence. =========================================
#
# Caller is expected to maintain specific *dword* order in Htable,
# namely with *least* significant dword of 128-bit value at *lower*
# address. This differs completely from C code and has everything to
# do with ldm instruction and order in which dwords are "consumed" by
# algorithm. *Byte* order within these dwords in turn is whatever
# *native* byte order on current platform. See gcm128.c for working
# example...

use strict;
use warnings;

# Scan the command line for the first argument that looks like a file
# name ("name.ext"); every argument before it is consumed and ignored.
# If none is found the generated code goes to the inherited STDOUT.
my $output;
while (($output = shift) && ($output !~ /^\w[\w\-]*\.\w+$/)) {}
if ($output) {
    open STDOUT, '>', $output or die "can't open $output: $!";
}

my $Xi   = "r0";    # argument block
my $Htbl = "r1";
my $inp  = "r2";
my $len  = "r3";

my $Zll = "r4";     # variables
my $Zlh = "r5";
my $Zhl = "r6";
my $Zhh = "r7";
my $Tll = "r8";
my $Tlh = "r9";
my $Thl = "r10";
my $Thh = "r11";
my $nlo = "r12";
################# r13 is stack pointer
my $nhi = "r14";
################# r15 is program counter

my $rem_4bit = $inp;    # used in gcm_gmult_4bit
my $cnt      = $len;

# Accumulates the generated assembly; post-processed and printed at the
# end of the script.
my $code;

# Zsmash(@insns)
#
# Appends to $code the instructions that store the four result words
# Zll, Zlh, Zhl, Zhh to Xi at byte offsets 12, 8, 4 and 0. Three
# preprocessor-selected variants are emitted per word: "rev" + "str" for
# little-endian ARMv7+, a plain "str" for big-endian, and four "strb"
# byte stores otherwise.
#
# After each word's store sequence the next caller-supplied instruction
# from @insns is emitted on its own tab-indented line, which lets the
# caller interleave unrelated instructions with the stores. When @insns
# is exhausted an empty tab-only line is emitted instead.
sub Zsmash {
    my @args = @_;
    my $i    = 12;
    for my $reg ($Zll, $Zlh, $Zhl, $Zhh) {
        $code .= <<___;
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	$reg,$reg
	str	$reg,[$Xi,#$i]
#elif defined(__ARMEB__)
	str	$reg,[$Xi,#$i]
#else
	mov	$Tlh,$reg,lsr#8
	strb	$reg,[$Xi,#$i+3]
	mov	$Thl,$reg,lsr#16
	strb	$Tlh,[$Xi,#$i+2]
	mov	$Thh,$reg,lsr#24
	strb	$Thl,[$Xi,#$i+1]
	strb	$Thh,[$Xi,#$i]
#endif
___
        my $interleaved = shift @args;
        $code .= "\t" . (defined $interleaved ? $interleaved : "") . "\n";
        $i -= 4;
    }
    return;
}

# NOTE: the pc-relative arithmetic below ("#32" in rem_4bit_get, "#48" in
# gcm_ghash_4bit) depends on rem_4bit being exactly 32 bytes and being
# immediately followed by the four-instruction rem_4bit_get, which in
# turn is immediately followed by gcm_ghash_4bit. Do not insert anything
# between them.
$code = <<___;
#include "arm_arch.h"

.text
.code	32

#ifdef __clang__
#define ldrplb  ldrbpl
#define ldrneb  ldrbne
#endif

.type	rem_4bit,%object
.align	5
rem_4bit:
.short	0x0000,0x1C20,0x3840,0x2460
.short	0x7080,0x6CA0,0x48C0,0x54E0
.short	0xE100,0xFD20,0xD940,0xC560
.short	0x9180,0x8DA0,0xA9C0,0xB5E0
.size	rem_4bit,.-rem_4bit

.type	rem_4bit_get,%function
rem_4bit_get:
	sub	$rem_4bit,pc,#8
	sub	$rem_4bit,$rem_4bit,#32	@ &rem_4bit
	b	.Lrem_4bit_got
	nop
.size	rem_4bit_get,.-rem_4bit_get

.global	gcm_ghash_4bit
.type	gcm_ghash_4bit,%function
gcm_ghash_4bit:
	sub	r12,pc,#8
	add	$len,$inp,$len		@ $len to point at the end
	stmdb	sp!,{r3-r11,lr}		@ save $len/end too
	sub	r12,r12,#48		@ &rem_4bit

	ldmia	r12,{r4-r11}		@ copy rem_4bit ...
	stmdb	sp!,{r4-r11}		@ ... to stack

	ldrb	$nlo,[$inp,#15]
	ldrb	$nhi,[$Xi,#15]
.Louter:
	eor	$nlo,$nlo,$nhi
	and	$nhi,$nlo,#0xf0
	and	$nlo,$nlo,#0x0f
	mov	$cnt,#14

	add	$Zhh,$Htbl,$nlo,lsl#4
	ldmia	$Zhh,{$Zll-$Zhh}	@ load Htbl[nlo]
	add	$Thh,$Htbl,$nhi
	ldrb	$nlo,[$inp,#14]

	and	$nhi,$Zll,#0xf		@ rem
	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nhi]
	add	$nhi,$nhi,$nhi
	eor	$Zll,$Tll,$Zll,lsr#4
	ldrh	$Tll,[sp,$nhi]		@ rem_4bit[rem]
	eor	$Zll,$Zll,$Zlh,lsl#28
	ldrb	$nhi,[$Xi,#14]
	eor	$Zlh,$Tlh,$Zlh,lsr#4
	eor	$Zlh,$Zlh,$Zhl,lsl#28
	eor	$Zhl,$Thl,$Zhl,lsr#4
	eor	$Zhl,$Zhl,$Zhh,lsl#28
	eor	$Zhh,$Thh,$Zhh,lsr#4
	eor	$nlo,$nlo,$nhi
	and	$nhi,$nlo,#0xf0
	and	$nlo,$nlo,#0x0f
	eor	$Zhh,$Zhh,$Tll,lsl#16

.Linner:
	add	$Thh,$Htbl,$nlo,lsl#4
	and	$nlo,$Zll,#0xf		@ rem
	subs	$cnt,$cnt,#1
	add	$nlo,$nlo,$nlo
	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nlo]
	eor	$Zll,$Tll,$Zll,lsr#4
	eor	$Zll,$Zll,$Zlh,lsl#28
	eor	$Zlh,$Tlh,$Zlh,lsr#4
	eor	$Zlh,$Zlh,$Zhl,lsl#28
	ldrh	$Tll,[sp,$nlo]		@ rem_4bit[rem]
	eor	$Zhl,$Thl,$Zhl,lsr#4
	ldrplb	$nlo,[$inp,$cnt]
	eor	$Zhl,$Zhl,$Zhh,lsl#28
	eor	$Zhh,$Thh,$Zhh,lsr#4

	add	$Thh,$Htbl,$nhi
	and	$nhi,$Zll,#0xf		@ rem
	eor	$Zhh,$Zhh,$Tll,lsl#16	@ ^= rem_4bit[rem]
	add	$nhi,$nhi,$nhi
	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nhi]
	eor	$Zll,$Tll,$Zll,lsr#4
	ldrplb	$Tll,[$Xi,$cnt]
	eor	$Zll,$Zll,$Zlh,lsl#28
	eor	$Zlh,$Tlh,$Zlh,lsr#4
	ldrh	$Tlh,[sp,$nhi]
	eor	$Zlh,$Zlh,$Zhl,lsl#28
	eor	$Zhl,$Thl,$Zhl,lsr#4
	eor	$Zhl,$Zhl,$Zhh,lsl#28
	eorpl	$nlo,$nlo,$Tll
	eor	$Zhh,$Thh,$Zhh,lsr#4
	andpl	$nhi,$nlo,#0xf0
	andpl	$nlo,$nlo,#0x0f
	eor	$Zhh,$Zhh,$Tlh,lsl#16	@ ^= rem_4bit[rem]
	bpl	.Linner

	ldr	$len,[sp,#32]		@ re-load $len/end
	add	$inp,$inp,#16
	mov	$nhi,$Zll
___
# Store Z back to Xi; the end-of-input compare and the conditional load of
# the next block's last byte are interleaved with the stores.
Zsmash("cmp\t$inp,$len", "ldrneb\t$nlo,[$inp,#15]");
$code .= <<___;
	bne	.Louter

	add	sp,sp,#36
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r11,pc}
#else
	ldmia	sp!,{r4-r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	gcm_ghash_4bit,.-gcm_ghash_4bit

.global	gcm_gmult_4bit
.type	gcm_gmult_4bit,%function
gcm_gmult_4bit:
	stmdb	sp!,{r4-r11,lr}
	ldrb	$nlo,[$Xi,#15]
	b	rem_4bit_get
.Lrem_4bit_got:
	and	$nhi,$nlo,#0xf0
	and	$nlo,$nlo,#0x0f
	mov	$cnt,#14

	add	$Zhh,$Htbl,$nlo,lsl#4
	ldmia	$Zhh,{$Zll-$Zhh}	@ load Htbl[nlo]
	ldrb	$nlo,[$Xi,#14]

	add	$Thh,$Htbl,$nhi
	and	$nhi,$Zll,#0xf		@ rem
	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nhi]
	add	$nhi,$nhi,$nhi
	eor	$Zll,$Tll,$Zll,lsr#4
	ldrh	$Tll,[$rem_4bit,$nhi]	@ rem_4bit[rem]
	eor	$Zll,$Zll,$Zlh,lsl#28
	eor	$Zlh,$Tlh,$Zlh,lsr#4
	eor	$Zlh,$Zlh,$Zhl,lsl#28
	eor	$Zhl,$Thl,$Zhl,lsr#4
	eor	$Zhl,$Zhl,$Zhh,lsl#28
	eor	$Zhh,$Thh,$Zhh,lsr#4
	and	$nhi,$nlo,#0xf0
	eor	$Zhh,$Zhh,$Tll,lsl#16
	and	$nlo,$nlo,#0x0f

.Loop:
	add	$Thh,$Htbl,$nlo,lsl#4
	and	$nlo,$Zll,#0xf		@ rem
	subs	$cnt,$cnt,#1
	add	$nlo,$nlo,$nlo
	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nlo]
	eor	$Zll,$Tll,$Zll,lsr#4
	eor	$Zll,$Zll,$Zlh,lsl#28
	eor	$Zlh,$Tlh,$Zlh,lsr#4
	eor	$Zlh,$Zlh,$Zhl,lsl#28
	ldrh	$Tll,[$rem_4bit,$nlo]	@ rem_4bit[rem]
	eor	$Zhl,$Thl,$Zhl,lsr#4
	ldrplb	$nlo,[$Xi,$cnt]
	eor	$Zhl,$Zhl,$Zhh,lsl#28
	eor	$Zhh,$Thh,$Zhh,lsr#4

	add	$Thh,$Htbl,$nhi
	and	$nhi,$Zll,#0xf		@ rem
	eor	$Zhh,$Zhh,$Tll,lsl#16	@ ^= rem_4bit[rem]
	add	$nhi,$nhi,$nhi
	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nhi]
	eor	$Zll,$Tll,$Zll,lsr#4
	eor	$Zll,$Zll,$Zlh,lsl#28
	eor	$Zlh,$Tlh,$Zlh,lsr#4
	ldrh	$Tll,[$rem_4bit,$nhi]	@ rem_4bit[rem]
	eor	$Zlh,$Zlh,$Zhl,lsl#28
	eor	$Zhl,$Thl,$Zhl,lsr#4
	eor	$Zhl,$Zhl,$Zhh,lsl#28
	eor	$Zhh,$Thh,$Zhh,lsr#4
	andpl	$nhi,$nlo,#0xf0
	andpl	$nlo,$nlo,#0x0f
	eor	$Zhh,$Zhh,$Tll,lsl#16	@ ^= rem_4bit[rem]
	bpl	.Loop
___
# Store Z back to Xi with nothing interleaved (each slot becomes an empty
# tab-only line).
Zsmash();
$code .= <<___;
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r11,pc}
#else
	ldmia	sp!,{r4-r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	gcm_gmult_4bit,.-gcm_gmult_4bit
___
{
    # NEON register allocation. "q<N>#lo" / "q<N>#hi" written in the
    # templates below are rewritten to the matching d-registers by the
    # post-processing loop at the end of the script.
    my ($Xl, $Xm, $Xh, $IN) = map("q$_", (0 .. 3));
    my ($t0, $t1, $t2, $t3) = map("q$_", (8 .. 11));
    my ($Hlo, $Hhi, $Hhl, $k48, $k32, $k16) = map("d$_", (26 .. 31));

    # clmul64x64($r, $a, $b)
    #
    # Appends to $code the NEON sequence that multiplies the two 64-bit
    # polynomial operands $a and $b (d-registers) into the 128-bit
    # q-register $r, built from 8-bit vmull.p8 partial products as
    # annotated on each line. Uses $t0..$t3 as scratch and expects the
    # masks $k48, $k32, $k16 to have been loaded by the caller's code.
    sub clmul64x64 {
        my ($r, $a, $b) = @_;
        $code .= <<___;
	vext.8		$t0#lo, $a, $a, #1	@ A1
	vmull.p8	$t0, $t0#lo, $b		@ F = A1*B
	vext.8		$r#lo, $b, $b, #1	@ B1
	vmull.p8	$r, $a, $r#lo		@ E = A*B1
	vext.8		$t1#lo, $a, $a, #2	@ A2
	vmull.p8	$t1, $t1#lo, $b		@ H = A2*B
	vext.8		$t3#lo, $b, $b, #2	@ B2
	vmull.p8	$t3, $a, $t3#lo		@ G = A*B2
	vext.8		$t2#lo, $a, $a, #3	@ A3
	veor		$t0, $t0, $r		@ L = E + F
	vmull.p8	$t2, $t2#lo, $b		@ J = A3*B
	vext.8		$r#lo, $b, $b, #3	@ B3
	veor		$t1, $t1, $t3		@ M = G + H
	vmull.p8	$r, $a, $r#lo		@ I = A*B3
	veor		$t0#lo, $t0#lo, $t0#hi	@ t0 = (L) (P0 + P1) << 8
	vand		$t0#hi, $t0#hi, $k48
	vext.8		$t3#lo, $b, $b, #4	@ B4
	veor		$t1#lo, $t1#lo, $t1#hi	@ t1 = (M) (P2 + P3) << 16
	vand		$t1#hi, $t1#hi, $k32
	vmull.p8	$t3, $a, $t3#lo		@ K = A*B4
	veor		$t2, $t2, $r		@ N = I + J
	veor		$t0#lo, $t0#lo, $t0#hi
	veor		$t1#lo, $t1#lo, $t1#hi
	veor		$t2#lo, $t2#lo, $t2#hi	@ t2 = (N) (P4 + P5) << 24
	vand		$t2#hi, $t2#hi, $k16
	vext.8		$t0, $t0, $t0, #15
	veor		$t3#lo, $t3#lo, $t3#hi	@ t3 = (K) (P6 + P7) << 32
	vmov.i64	$t3#hi, #0
	vext.8		$t1, $t1, $t1, #14
	veor		$t2#lo, $t2#lo, $t2#hi
	vmull.p8	$r, $a, $b		@ D = A*B
	vext.8		$t3, $t3, $t3, #12
	vext.8		$t2, $t2, $t2, #13
	veor		$t0, $t0, $t1
	veor		$t2, $t2, $t3
	veor		$r, $r, $t0
	veor		$r, $r, $t2
___
        return;
    }

    $code .= <<___;
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

.global	gcm_init_neon
.type	gcm_init_neon,%function
.align	4
gcm_init_neon:
	vld1.64		$IN#hi,[r1,:64]!	@ load H
	vmov.i8		$t0,#0xe1
	vld1.64		$IN#lo,[r1,:64]
	vshl.i64	$t0#hi,#57
	vshr.u64	$t0#lo,#63		@ t0=0xc2....01
	vdup.8		$t1,$IN#hi[7]
	vshr.u64	$Hlo,$IN#lo,#63
	vshr.s8		$t1,#7			@ broadcast carry bit
	vshl.i64	$IN,$IN,#1
	vand		$t0,$t0,$t1
	vorr		$IN#hi,$Hlo		@ H<<<=1
	veor		$IN,$IN,$t0		@ twisted H
	vstmia		r0,{$IN}

	ret					@ bx lr
.size	gcm_init_neon,.-gcm_init_neon

.global	gcm_gmult_neon
.type	gcm_gmult_neon,%function
.align	4
gcm_gmult_neon:
	vld1.64		$IN#hi,[$Xi,:64]!	@ load Xi
	vld1.64		$IN#lo,[$Xi,:64]!
	vmov.i64	$k48,#0x0000ffffffffffff
	vldmia		$Htbl,{$Hlo-$Hhi}	@ load twisted H
	vmov.i64	$k32,#0x00000000ffffffff
#ifdef __ARMEL__
	vrev64.8	$IN,$IN
#endif
	vmov.i64	$k16,#0x000000000000ffff
	veor		$Hhl,$Hlo,$Hhi		@ Karatsuba pre-processing
	mov		$len,#16
	b		.Lgmult_neon
.size	gcm_gmult_neon,.-gcm_gmult_neon

.global	gcm_ghash_neon
.type	gcm_ghash_neon,%function
.align	4
gcm_ghash_neon:
	vld1.64		$Xl#hi,[$Xi,:64]!	@ load Xi
	vld1.64		$Xl#lo,[$Xi,:64]!
	vmov.i64	$k48,#0x0000ffffffffffff
	vldmia		$Htbl,{$Hlo-$Hhi}	@ load twisted H
	vmov.i64	$k32,#0x00000000ffffffff
#ifdef __ARMEL__
	vrev64.8	$Xl,$Xl
#endif
	vmov.i64	$k16,#0x000000000000ffff
	veor		$Hhl,$Hlo,$Hhi		@ Karatsuba pre-processing

.Loop_neon:
	vld1.64		$IN#hi,[$inp]!		@ load inp
	vld1.64		$IN#lo,[$inp]!
#ifdef __ARMEL__
	vrev64.8	$IN,$IN
#endif
	veor		$IN,$Xl			@ inp^=Xi
.Lgmult_neon:
___
    clmul64x64($Xl, $Hlo, "$IN#lo");    # H.lo·Xi.lo
    $code .= <<___;
	veor		$IN#lo,$IN#lo,$IN#hi	@ Karatsuba pre-processing
___
    clmul64x64($Xm, $Hhl, "$IN#lo");    # (H.lo+H.hi)·(Xi.lo+Xi.hi)
    clmul64x64($Xh, $Hhi, "$IN#hi");    # H.hi·Xi.hi
    $code .= <<___;
	veor		$Xm,$Xm,$Xl		@ Karatsuba post-processing
	veor		$Xm,$Xm,$Xh
	veor		$Xl#hi,$Xl#hi,$Xm#lo
	veor		$Xh#lo,$Xh#lo,$Xm#hi	@ Xh|Xl - 256-bit result

	@ equivalent of reduction_avx from ghash-x86_64.pl
	vshl.i64	$t1,$Xl,#57		@ 1st phase
	vshl.i64	$t2,$Xl,#62
	veor		$t2,$t2,$t1		@
	vshl.i64	$t1,$Xl,#63
	veor		$t2, $t2, $t1		@
	veor		$Xl#hi,$Xl#hi,$t2#lo	@
	veor		$Xh#lo,$Xh#lo,$t2#hi

	vshr.u64	$t2,$Xl,#1		@ 2nd phase
	veor		$Xh,$Xh,$Xl
	veor		$Xl,$Xl,$t2		@
	vshr.u64	$t2,$t2,#6
	vshr.u64	$Xl,$Xl,#1		@
	veor		$Xl,$Xl,$Xh		@
	veor		$Xl,$Xl,$t2		@

	subs		$len,#16
	bne		.Loop_neon

#ifdef __ARMEL__
	vrev64.8	$Xl,$Xl
#endif
	sub		$Xi,#16
	vst1.64		$Xl#hi,[$Xi,:64]!	@ write out Xi
	vst1.64		$Xl#lo,[$Xi,:64]

	ret					@ bx lr
.size	gcm_ghash_neon,.-gcm_ghash_neon
#endif
___
}
$code .= <<___;
.asciz  "GHASH for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align  2
___

# Post-process the template line by line and print it:
#   - `...` spans are replaced by the result of evaluating their content
#     as Perl;
#   - then exactly one of the following is applied (the "or" chain stops
#     at the first substitution that matches on the line):
#       * q<N>#lo / q<N>#hi  ->  d<2N> / d<2N+1>
#       * ret                ->  bx lr
#       * bx lr              ->  its raw encoding as a .word
foreach (split("\n", $code)) {
    s/\`([^\`]*)\`/eval $1/geo;

    s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or
    s/\bret\b/bx\tlr/go or
    s/\bbx\s+lr\b/.word\t0xe12fff1e/go;    # make it possible to compile with -march=armv4

    print $_, "\n";
}

# Closing flushes the buffered output; a failure here means the generated
# file is incomplete, so it must not pass silently.
close STDOUT or die "error closing STDOUT: $!";