/* $FreeBSD$ */
/* Do not modify. This file is auto-generated from ghashv8-armx.pl. */
#include "arm_arch.h"

.text
.fpu	neon
.code	32
@ The .byte sequences throughout this file encode the ARMv8 PMULL/PMULL2
@ polynomial multiplies named in the adjacent comments, presumably spelled
@ out numerically for assemblers lacking the crypto-extension mnemonics in
@ 32-bit mode.
@
@ gcm_init_v8 derives the "twisted" hash key and its powers from H.
@ A hedged note, not part of the generated code: the C prototype assumed
@ here, matching callers such as OpenSSL's gcm128.c, is
@	void gcm_init_v8(u128 Htable[16], const u64 H[2]);
@ On return Htable[0..2] hold the twisted H, the packed Karatsuba
@ pre-processed halves, and the twisted H^2, as the stores below show.
.global	gcm_init_v8
.type	gcm_init_v8,%function
.align	4
gcm_init_v8:
	vld1.64	{q9},[r1]		@ load input H
	vmov.i8	q11,#0xe1
	vshl.i64	q11,q11,#57		@ 0xc2.0
	vext.8	q3,q9,q9,#8
	vshr.u64	q10,q11,#63
	vdup.32	q9,d18[1]
	vext.8	q8,q10,q11,#8		@ t0=0xc2....01
	vshr.u64	q10,q3,#63
	vshr.s32	q9,q9,#31		@ broadcast carry bit
	vand	q10,q10,q8
	vshl.i64	q3,q3,#1
	vext.8	q10,q10,q10,#8
	vand	q8,q8,q9
	vorr	q3,q3,q10		@ H<<<=1
	veor	q12,q3,q8		@ twisted H
	vst1.64	{q12},[r0]!		@ store Htable[0]

	@ calculate H^2
	vext.8	q8,q12,q12,#8		@ Karatsuba pre-processing
	.byte	0xa8,0x0e,0xa8,0xf2	@ pmull q0,q12,q12
	veor	q8,q8,q12
	.byte	0xa9,0x4e,0xa9,0xf2	@ pmull2 q2,q12,q12
	.byte	0xa0,0x2e,0xa0,0xf2	@ pmull q1,q8,q8

	vext.8	q9,q0,q2,#8		@ Karatsuba post-processing
	veor	q10,q0,q2
	veor	q1,q1,q9
	veor	q1,q1,q10
	.byte	0x26,0x4e,0xe0,0xf2	@ pmull q10,q0,q11	@ 1st phase

	vmov	d4,d3		@ Xh|Xm - 256-bit result
	vmov	d3,d0		@ Xm is rotated Xl
	veor	q0,q1,q10

	vext.8	q10,q0,q0,#8		@ 2nd phase
	.byte	0x26,0x0e,0xa0,0xf2	@ pmull q0,q0,q11
	veor	q10,q10,q2
	veor	q14,q0,q10

	vext.8	q9,q14,q14,#8		@ Karatsuba pre-processing
	veor	q9,q9,q14
	vext.8	q13,q8,q9,#8		@ pack Karatsuba pre-processed
	vst1.64	{q13-q14},[r0]		@ store Htable[1..2]

	bx	lr
.size	gcm_init_v8,.-gcm_init_v8
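@ gcm_gmult_v8 multiplies Xi by the twisted H and reduces the product
@ modulo the GHASH polynomial, i.e. one GF(2^128) multiplication.
@ A hedged note, not part of the generated code: the C prototype assumed
@ here, matching callers such as OpenSSL's gcm128.c, is
@	void gcm_gmult_v8(u64 Xi[2], const u128 Htable[16]);
@ r0 points at Xi and r1 at the Htable written by gcm_init_v8 above.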
.global	gcm_gmult_v8
.type	gcm_gmult_v8,%function
.align	4
gcm_gmult_v8:
	vld1.64	{q9},[r0]		@ load Xi
	vmov.i8	q11,#0xe1
	vld1.64	{q12-q13},[r1]	@ load twisted H, ...
	vshl.u64	q11,q11,#57
#ifndef __ARMEB__
	vrev64.8	q9,q9
#endif
	vext.8	q3,q9,q9,#8

	.byte	0x86,0x0e,0xa8,0xf2	@ pmull q0,q12,q3	@ H.lo·Xi.lo
	veor	q9,q9,q3		@ Karatsuba pre-processing
	.byte	0x87,0x4e,0xa9,0xf2	@ pmull2 q2,q12,q3	@ H.hi·Xi.hi
	.byte	0xa2,0x2e,0xaa,0xf2	@ pmull q1,q13,q9	@ (H.lo+H.hi)·(Xi.lo+Xi.hi)

	vext.8	q9,q0,q2,#8		@ Karatsuba post-processing
	veor	q10,q0,q2
	veor	q1,q1,q9
	veor	q1,q1,q10
	.byte	0x26,0x4e,0xe0,0xf2	@ pmull q10,q0,q11	@ 1st phase of reduction

	vmov	d4,d3		@ Xh|Xm - 256-bit result
	vmov	d3,d0		@ Xm is rotated Xl
	veor	q0,q1,q10

	vext.8	q10,q0,q0,#8		@ 2nd phase of reduction
	.byte	0x26,0x0e,0xa0,0xf2	@ pmull q0,q0,q11
	veor	q10,q10,q2
	veor	q0,q0,q10

#ifndef __ARMEB__
	vrev64.8	q0,q0
#endif
	vext.8	q0,q0,q0,#8
	vst1.64	{q0},[r0]		@ write out Xi

	bx	lr
.size	gcm_gmult_v8,.-gcm_gmult_v8
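@ gcm_ghash_v8 folds len bytes of input into Xi, processing two blocks per
@ iteration of the modulo-scheduled loop (via H and H^2 from Htable) and
@ finishing odd block counts in a single-block tail.
@ A hedged note, not part of the generated code: the C prototype assumed
@ here, matching callers such as OpenSSL's gcm128.c, is
@	void gcm_ghash_v8(u64 Xi[2], const u128 Htable[16], const u8 *inp, size_t len);
@ r0 points at Xi, r1 at Htable, r2 at the input and r3 holds len, which
@ is expected to be a multiple of the 16-byte block size.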
.global	gcm_ghash_v8
.type	gcm_ghash_v8,%function
.align	4
gcm_ghash_v8:
	vstmdb	sp!,{d8-d15}		@ 32-bit ABI says so
	vld1.64	{q0},[r0]		@ load [rotated] Xi
					@ "[rotated]" means that
					@ the loaded value has to
					@ be rotated in order to
					@ match the algorithm
					@ specification
	subs	r3,r3,#32		@ see if r3 is 32 or larger
	mov	r12,#16		@ r12 is used as post-
					@ increment for input pointer;
					@ as the loop is modulo-scheduled
					@ r12 is zeroed just in time
					@ to preclude overstepping
					@ inp[len], which means that
					@ the last block[s] are actually
					@ loaded twice, but the last
					@ copy is not processed
	vld1.64	{q12-q13},[r1]!	@ load twisted H, ..., H^2
	vmov.i8	q11,#0xe1
	vld1.64	{q14},[r1]
	moveq	r12,#0		@ is it time to zero r12?
	vext.8	q0,q0,q0,#8		@ rotate Xi
	vld1.64	{q8},[r2]!	@ load [rotated] I[0]
	vshl.u64	q11,q11,#57		@ compose 0xc2.0 constant
#ifndef __ARMEB__
	vrev64.8	q8,q8
	vrev64.8	q0,q0
#endif
	vext.8	q3,q8,q8,#8		@ rotate I[0]
	blo	.Lodd_tail_v8		@ r3 was less than 32
	vld1.64	{q9},[r2],r12	@ load [rotated] I[1]
#ifndef __ARMEB__
	vrev64.8	q9,q9
#endif
	vext.8	q7,q9,q9,#8
	veor	q3,q3,q0		@ I[i]^=Xi
	.byte	0x8e,0x8e,0xa8,0xf2	@ pmull q4,q12,q7	@ H·Ii+1
	veor	q9,q9,q7		@ Karatsuba pre-processing
	.byte	0x8f,0xce,0xa9,0xf2	@ pmull2 q6,q12,q7
	b	.Loop_mod2x_v8

.align	4
.Loop_mod2x_v8:
	vext.8	q10,q3,q3,#8
	subs	r3,r3,#32		@ is there more data?
	.byte	0x86,0x0e,0xac,0xf2	@ pmull q0,q14,q3	@ H^2.lo·Xi.lo
	movlo	r12,#0		@ is it time to zero r12?

	.byte	0xa2,0xae,0xaa,0xf2	@ pmull q5,q13,q9
	veor	q10,q10,q3		@ Karatsuba pre-processing
	.byte	0x87,0x4e,0xad,0xf2	@ pmull2 q2,q14,q3	@ H^2.hi·Xi.hi
	veor	q0,q0,q4		@ accumulate
	.byte	0xa5,0x2e,0xab,0xf2	@ pmull2 q1,q13,q10	@ (H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
	vld1.64	{q8},[r2],r12	@ load [rotated] I[i+2]

	veor	q2,q2,q6
	moveq	r12,#0		@ is it time to zero r12?
	veor	q1,q1,q5

	vext.8	q9,q0,q2,#8		@ Karatsuba post-processing
	veor	q10,q0,q2
	veor	q1,q1,q9
	vld1.64	{q9},[r2],r12	@ load [rotated] I[i+3]
#ifndef __ARMEB__
	vrev64.8	q8,q8
#endif
	veor	q1,q1,q10
	.byte	0x26,0x4e,0xe0,0xf2	@ pmull q10,q0,q11	@ 1st phase of reduction

#ifndef __ARMEB__
	vrev64.8	q9,q9
#endif
	vmov	d4,d3		@ Xh|Xm - 256-bit result
	vmov	d3,d0		@ Xm is rotated Xl
	vext.8	q7,q9,q9,#8
	vext.8	q3,q8,q8,#8
	veor	q0,q1,q10
	.byte	0x8e,0x8e,0xa8,0xf2	@ pmull q4,q12,q7	@ H·Ii+1
	veor	q3,q3,q2		@ accumulate q3 early

	vext.8	q10,q0,q0,#8		@ 2nd phase of reduction
	.byte	0x26,0x0e,0xa0,0xf2	@ pmull q0,q0,q11
	veor	q3,q3,q10
	veor	q9,q9,q7		@ Karatsuba pre-processing
	veor	q3,q3,q0
	.byte	0x8f,0xce,0xa9,0xf2	@ pmull2 q6,q12,q7
	bhs	.Loop_mod2x_v8		@ there were at least 32 more bytes

	veor	q2,q2,q10
	vext.8	q3,q8,q8,#8		@ re-construct q3
	adds	r3,r3,#32		@ re-construct r3
	veor	q0,q0,q2		@ re-construct q0
	beq	.Ldone_v8		@ is r3 zero?
.Lodd_tail_v8:
	vext.8	q10,q0,q0,#8
	veor	q3,q3,q0		@ inp^=Xi
	veor	q9,q8,q10		@ q9 is rotated inp^Xi

	.byte	0x86,0x0e,0xa8,0xf2	@ pmull q0,q12,q3	@ H.lo·Xi.lo
	veor	q9,q9,q3		@ Karatsuba pre-processing
	.byte	0x87,0x4e,0xa9,0xf2	@ pmull2 q2,q12,q3	@ H.hi·Xi.hi
	.byte	0xa2,0x2e,0xaa,0xf2	@ pmull q1,q13,q9	@ (H.lo+H.hi)·(Xi.lo+Xi.hi)

	vext.8	q9,q0,q2,#8		@ Karatsuba post-processing
	veor	q10,q0,q2
	veor	q1,q1,q9
	veor	q1,q1,q10
	.byte	0x26,0x4e,0xe0,0xf2	@ pmull q10,q0,q11	@ 1st phase of reduction

	vmov	d4,d3		@ Xh|Xm - 256-bit result
	vmov	d3,d0		@ Xm is rotated Xl
	veor	q0,q1,q10

	vext.8	q10,q0,q0,#8		@ 2nd phase of reduction
	.byte	0x26,0x0e,0xa0,0xf2	@ pmull q0,q0,q11
	veor	q10,q10,q2
	veor	q0,q0,q10

.Ldone_v8:
#ifndef __ARMEB__
	vrev64.8	q0,q0
#endif
	vext.8	q0,q0,q0,#8
	vst1.64	{q0},[r0]		@ write out Xi

	vldmia	sp!,{d8-d15}		@ 32-bit ABI says so
	bx	lr
.size	gcm_ghash_v8,.-gcm_ghash_v8
.asciz	"GHASH for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
.align	2