#include "arm_asm.h" #include "arm_arch.h" #if __ARM_MAX_ARCH__>=8 .fpu neon #ifdef __thumb2__ .syntax unified .thumb # define INST(a,b,c,d) c,0xef,a,b #else .code 32 # define INST(a,b,c,d) a,b,c,0xf2 #endif .text .globl aes_gcm_enc_128_kernel .type aes_gcm_enc_128_kernel,%function .align 4 aes_gcm_enc_128_kernel: cbz r1, .L128_enc_ret stp r19, r20, [sp, #-112]! mov r16, r4 mov r8, r5 stp r21, r22, [sp, #16] stp r23, r24, [sp, #32] stp d8, d9, [sp, #48] stp d10, d11, [sp, #64] stp d12, d13, [sp, #80] stp d14, d15, [sp, #96] ldp r10, r11, [r16] @ ctr96_b64, ctr96_t32 #ifdef __ARMEB__ rev r10, r10 rev r11, r11 #endif ldp r13, r14, [r8, #160] @ load rk10 #ifdef __ARMEB__ ror r13, r13, #32 ror r14, r14, #32 #endif ld1 {v11.16b}, [r3] ext v11.16b, v11.16b, v11.16b, #8 rev64 v11.16b, v11.16b lsr r5, r1, #3 @ byte_len mov r15, r5 ld1 {v18.4s}, [r8], #16 @ load rk0 add r4, r0, r1, lsr #3 @ end_input_ptr sub r5, r5, #1 @ byte_len - 1 lsr r12, r11, #32 ldr q15, [r3, #112] @ load h4l | h4h #ifndef __ARMEB__ ext v15.16b, v15.16b, v15.16b, #8 #endif fmov d1, r10 @ CTR block 1 rev r12, r12 @ rev_ctr32 add r12, r12, #1 @ increment rev_ctr32 orr r11, r11, r11 ld1 {v19.4s}, [r8], #16 @ load rk1 rev r9, r12 @ CTR block 1 add r12, r12, #1 @ CTR block 1 fmov d3, r10 @ CTR block 3 orr r9, r11, r9, lsl #32 @ CTR block 1 ld1 { q0}, [r16] @ special case vector load initial counter so we can start first AES block as quickly as possible fmov v1.d[1], r9 @ CTR block 1 rev r9, r12 @ CTR block 2 fmov d2, r10 @ CTR block 2 orr r9, r11, r9, lsl #32 @ CTR block 2 add r12, r12, #1 @ CTR block 2 fmov v2.d[1], r9 @ CTR block 2 rev r9, r12 @ CTR block 3 orr r9, r11, r9, lsl #32 @ CTR block 3 ld1 {v20.4s}, [r8], #16 @ load rk2 add r12, r12, #1 @ CTR block 3 fmov v3.d[1], r9 @ CTR block 3 ldr q14, [r3, #80] @ load h3l | h3h #ifndef __ARMEB__ ext v14.16b, v14.16b, v14.16b, #8 #endif aese q1, v18.16b aesmc q1, q1 @ AES block 1 - round 0 ld1 {v21.4s}, [r8], #16 @ load rk3 aese q2, v18.16b aesmc q2, q2 @ AES block 2 - round 0 ldr q12, [r3, #32] @ load h1l | h1h #ifndef __ARMEB__ ext v12.16b, v12.16b, v12.16b, #8 #endif aese q0, v18.16b aesmc q0, q0 @ AES block 0 - round 0 ld1 {v22.4s}, [r8], #16 @ load rk4 aese q3, v18.16b aesmc q3, q3 @ AES block 3 - round 0 ld1 {v23.4s}, [r8], #16 @ load rk5 aese q2, v19.16b aesmc q2, q2 @ AES block 2 - round 1 trn2 v17.2d, v14.2d, v15.2d @ h4l | h3l aese q0, v19.16b aesmc q0, q0 @ AES block 0 - round 1 ld1 {v24.4s}, [r8], #16 @ load rk6 aese q1, v19.16b aesmc q1, q1 @ AES block 1 - round 1 ld1 {v25.4s}, [r8], #16 @ load rk7 aese q3, v19.16b aesmc q3, q3 @ AES block 3 - round 1 trn1 q9, v14.2d, v15.2d @ h4h | h3h aese q0, v20.16b aesmc q0, q0 @ AES block 0 - round 2 ld1 {v26.4s}, [r8], #16 @ load rk8 aese q1, v20.16b aesmc q1, q1 @ AES block 1 - round 2 ldr q13, [r3, #64] @ load h2l | h2h #ifndef __ARMEB__ ext v13.16b, v13.16b, v13.16b, #8 #endif aese q3, v20.16b aesmc q3, q3 @ AES block 3 - round 2 aese q2, v20.16b aesmc q2, q2 @ AES block 2 - round 2 eor v17.16b, v17.16b, q9 @ h4k | h3k aese q0, v21.16b aesmc q0, q0 @ AES block 0 - round 3 aese q1, v21.16b aesmc q1, q1 @ AES block 1 - round 3 aese q2, v21.16b aesmc q2, q2 @ AES block 2 - round 3 ld1 {v27.4s}, [r8], #16 @ load rk9 aese q3, v21.16b aesmc q3, q3 @ AES block 3 - round 3 and r5, r5, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail) trn2 v16.2d, v12.2d, v13.2d @ h2l | h1l aese q3, v22.16b aesmc q3, q3 @ AES block 3 - round 4 add r5, r5, r0 aese q2, v22.16b aesmc q2, q2 @ AES 
block 2 - round 4 cmp r0, r5 @ check if we have <= 4 blocks aese q0, v22.16b aesmc q0, q0 @ AES block 0 - round 4 aese q3, v23.16b aesmc q3, q3 @ AES block 3 - round 5 aese q2, v23.16b aesmc q2, q2 @ AES block 2 - round 5 aese q0, v23.16b aesmc q0, q0 @ AES block 0 - round 5 aese q3, v24.16b aesmc q3, q3 @ AES block 3 - round 6 aese q1, v22.16b aesmc q1, q1 @ AES block 1 - round 4 aese q2, v24.16b aesmc q2, q2 @ AES block 2 - round 6 trn1 q8, v12.2d, v13.2d @ h2h | h1h aese q0, v24.16b aesmc q0, q0 @ AES block 0 - round 6 aese q1, v23.16b aesmc q1, q1 @ AES block 1 - round 5 aese q3, v25.16b aesmc q3, q3 @ AES block 3 - round 7 aese q0, v25.16b aesmc q0, q0 @ AES block 0 - round 7 aese q1, v24.16b aesmc q1, q1 @ AES block 1 - round 6 aese q2, v25.16b aesmc q2, q2 @ AES block 2 - round 7 aese q0, v26.16b aesmc q0, q0 @ AES block 0 - round 8 aese q1, v25.16b aesmc q1, q1 @ AES block 1 - round 7 aese q2, v26.16b aesmc q2, q2 @ AES block 2 - round 8 aese q3, v26.16b aesmc q3, q3 @ AES block 3 - round 8 aese q1, v26.16b aesmc q1, q1 @ AES block 1 - round 8 aese q2, v27.16b @ AES block 2 - round 9 aese q0, v27.16b @ AES block 0 - round 9 eor v16.16b, v16.16b, q8 @ h2k | h1k aese q1, v27.16b @ AES block 1 - round 9 aese q3, v27.16b @ AES block 3 - round 9 bge .L128_enc_tail @ handle tail ldp r6, r7, [r0, #0] @ AES block 0 - load plaintext #ifdef __ARMEB__ rev r6, r6 rev r7, r7 #endif ldp r21, r22, [r0, #32] @ AES block 2 - load plaintext #ifdef __ARMEB__ rev r21, r21 rev r22, r22 #endif ldp r19, r20, [r0, #16] @ AES block 1 - load plaintext #ifdef __ARMEB__ rev r19, r19 rev r20, r20 #endif ldp r23, r24, [r0, #48] @ AES block 3 - load plaintext #ifdef __ARMEB__ rev r23, r23 rev r24, r24 #endif eor r6, r6, r13 @ AES block 0 - round 10 low eor r7, r7, r14 @ AES block 0 - round 10 high eor r21, r21, r13 @ AES block 2 - round 10 low fmov d4, r6 @ AES block 0 - mov low eor r19, r19, r13 @ AES block 1 - round 10 low eor r22, r22, r14 @ AES block 2 - round 10 high fmov v4.d[1], r7 @ AES block 0 - mov high fmov d5, r19 @ AES block 1 - mov low eor r20, r20, r14 @ AES block 1 - round 10 high eor r23, r23, r13 @ AES block 3 - round 10 low fmov v5.d[1], r20 @ AES block 1 - mov high fmov d6, r21 @ AES block 2 - mov low eor r24, r24, r14 @ AES block 3 - round 10 high rev r9, r12 @ CTR block 4 fmov v6.d[1], r22 @ AES block 2 - mov high orr r9, r11, r9, lsl #32 @ CTR block 4 eor q4, q4, q0 @ AES block 0 - result fmov d0, r10 @ CTR block 4 add r12, r12, #1 @ CTR block 4 fmov v0.d[1], r9 @ CTR block 4 rev r9, r12 @ CTR block 5 eor q5, q5, q1 @ AES block 1 - result fmov d1, r10 @ CTR block 5 orr r9, r11, r9, lsl #32 @ CTR block 5 add r12, r12, #1 @ CTR block 5 add r0, r0, #64 @ AES input_ptr update fmov v1.d[1], r9 @ CTR block 5 fmov d7, r23 @ AES block 3 - mov low rev r9, r12 @ CTR block 6 st1 { q4}, [r2], #16 @ AES block 0 - store result fmov v7.d[1], r24 @ AES block 3 - mov high orr r9, r11, r9, lsl #32 @ CTR block 6 add r12, r12, #1 @ CTR block 6 eor q6, q6, q2 @ AES block 2 - result st1 { q5}, [r2], #16 @ AES block 1 - store result fmov d2, r10 @ CTR block 6 cmp r0, r5 @ check if we have <= 8 blocks fmov v2.d[1], r9 @ CTR block 6 rev r9, r12 @ CTR block 7 st1 { q6}, [r2], #16 @ AES block 2 - store result orr r9, r11, r9, lsl #32 @ CTR block 7 eor q7, q7, q3 @ AES block 3 - result st1 { q7}, [r2], #16 @ AES block 3 - store result bge .L128_enc_prepretail @ do prepretail .L128_enc_main_loop:@ main loop start ldp r23, r24, [r0, #48] @ AES block 4k+3 - load plaintext #ifdef __ARMEB__ rev r23, r23 rev r24, r24 
#endif rev64 q4, q4 @ GHASH block 4k (only t0 is free) rev64 q6, q6 @ GHASH block 4k+2 (t0, t1, and t2 free) aese q2, v18.16b aesmc q2, q2 @ AES block 4k+6 - round 0 fmov d3, r10 @ CTR block 4k+3 ext v11.16b, v11.16b, v11.16b, #8 @ PRE 0 rev64 q5, q5 @ GHASH block 4k+1 (t0 and t1 free) aese q1, v18.16b aesmc q1, q1 @ AES block 4k+5 - round 0 add r12, r12, #1 @ CTR block 4k+3 fmov v3.d[1], r9 @ CTR block 4k+3 aese q0, v18.16b aesmc q0, q0 @ AES block 4k+4 - round 0 mov d31, v6.d[1] @ GHASH block 4k+2 - mid aese q2, v19.16b aesmc q2, q2 @ AES block 4k+6 - round 1 mov d30, v5.d[1] @ GHASH block 4k+1 - mid aese q1, v19.16b aesmc q1, q1 @ AES block 4k+5 - round 1 eor q4, q4, v11.16b @ PRE 1 aese q3, v18.16b aesmc q3, q3 @ AES block 4k+7 - round 0 eor r24, r24, r14 @ AES block 4k+3 - round 10 high pmull2 v28.1q, q5, v14.2d @ GHASH block 4k+1 - high eor v31.8b, v31.8b, q6 @ GHASH block 4k+2 - mid ldp r6, r7, [r0, #0] @ AES block 4k+4 - load plaintext #ifdef __ARMEB__ rev r6, r6 rev r7, r7 #endif aese q0, v19.16b aesmc q0, q0 @ AES block 4k+4 - round 1 rev r9, r12 @ CTR block 4k+8 eor v30.8b, v30.8b, q5 @ GHASH block 4k+1 - mid mov d8, v4.d[1] @ GHASH block 4k - mid orr r9, r11, r9, lsl #32 @ CTR block 4k+8 pmull2 v9.1q, q4, v15.2d @ GHASH block 4k - high add r12, r12, #1 @ CTR block 4k+8 mov d10, v17.d[1] @ GHASH block 4k - mid aese q0, v20.16b aesmc q0, q0 @ AES block 4k+4 - round 2 pmull v11.1q, q4, v15.1d @ GHASH block 4k - low eor q8, q8, q4 @ GHASH block 4k - mid aese q1, v20.16b aesmc q1, q1 @ AES block 4k+5 - round 2 aese q0, v21.16b aesmc q0, q0 @ AES block 4k+4 - round 3 eor q9, q9, v28.16b @ GHASH block 4k+1 - high pmull v28.1q, q6, v13.1d @ GHASH block 4k+2 - low pmull v10.1q, q8, v10.1d @ GHASH block 4k - mid rev64 q7, q7 @ GHASH block 4k+3 (t0, t1, t2 and t3 free) pmull v30.1q, v30.1d, v17.1d @ GHASH block 4k+1 - mid pmull v29.1q, q5, v14.1d @ GHASH block 4k+1 - low ins v31.d[1], v31.d[0] @ GHASH block 4k+2 - mid pmull2 v8.1q, q6, v13.2d @ GHASH block 4k+2 - high eor r7, r7, r14 @ AES block 4k+4 - round 10 high eor v10.16b, v10.16b, v30.16b @ GHASH block 4k+1 - mid mov d30, v7.d[1] @ GHASH block 4k+3 - mid aese q3, v19.16b aesmc q3, q3 @ AES block 4k+7 - round 1 eor v11.16b, v11.16b, v29.16b @ GHASH block 4k+1 - low aese q2, v20.16b aesmc q2, q2 @ AES block 4k+6 - round 2 eor r6, r6, r13 @ AES block 4k+4 - round 10 low aese q1, v21.16b aesmc q1, q1 @ AES block 4k+5 - round 3 eor v30.8b, v30.8b, q7 @ GHASH block 4k+3 - mid pmull2 v4.1q, q7, v12.2d @ GHASH block 4k+3 - high aese q2, v21.16b aesmc q2, q2 @ AES block 4k+6 - round 3 eor q9, q9, q8 @ GHASH block 4k+2 - high pmull2 v31.1q, v31.2d, v16.2d @ GHASH block 4k+2 - mid pmull v29.1q, q7, v12.1d @ GHASH block 4k+3 - low movi q8, #0xc2 pmull v30.1q, v30.1d, v16.1d @ GHASH block 4k+3 - mid eor v11.16b, v11.16b, v28.16b @ GHASH block 4k+2 - low aese q1, v22.16b aesmc q1, q1 @ AES block 4k+5 - round 4 aese q3, v20.16b aesmc q3, q3 @ AES block 4k+7 - round 2 shl d8, d8, #56 @ mod_constant aese q0, v22.16b aesmc q0, q0 @ AES block 4k+4 - round 4 eor q9, q9, q4 @ GHASH block 4k+3 - high aese q1, v23.16b aesmc q1, q1 @ AES block 4k+5 - round 5 ldp r19, r20, [r0, #16] @ AES block 4k+5 - load plaintext #ifdef __ARMEB__ rev r19, r19 rev r20, r20 #endif aese q3, v21.16b aesmc q3, q3 @ AES block 4k+7 - round 3 eor v10.16b, v10.16b, v31.16b @ GHASH block 4k+2 - mid aese q0, v23.16b aesmc q0, q0 @ AES block 4k+4 - round 5 ldp r21, r22, [r0, #32] @ AES block 4k+6 - load plaintext #ifdef __ARMEB__ rev r21, r21 rev r22, r22 #endif pmull v31.1q, q9, 
q8 @ MODULO - top 64b align with mid eor v11.16b, v11.16b, v29.16b @ GHASH block 4k+3 - low aese q2, v22.16b aesmc q2, q2 @ AES block 4k+6 - round 4 eor r19, r19, r13 @ AES block 4k+5 - round 10 low aese q3, v22.16b aesmc q3, q3 @ AES block 4k+7 - round 4 eor v10.16b, v10.16b, v30.16b @ GHASH block 4k+3 - mid aese q1, v24.16b aesmc q1, q1 @ AES block 4k+5 - round 6 eor r23, r23, r13 @ AES block 4k+3 - round 10 low aese q2, v23.16b aesmc q2, q2 @ AES block 4k+6 - round 5 eor v30.16b, v11.16b, q9 @ MODULO - karatsuba tidy up fmov d4, r6 @ AES block 4k+4 - mov low aese q0, v24.16b aesmc q0, q0 @ AES block 4k+4 - round 6 fmov v4.d[1], r7 @ AES block 4k+4 - mov high add r0, r0, #64 @ AES input_ptr update fmov d7, r23 @ AES block 4k+3 - mov low ext q9, q9, q9, #8 @ MODULO - other top alignment aese q3, v23.16b aesmc q3, q3 @ AES block 4k+7 - round 5 fmov d5, r19 @ AES block 4k+5 - mov low aese q0, v25.16b aesmc q0, q0 @ AES block 4k+4 - round 7 eor v10.16b, v10.16b, v30.16b @ MODULO - karatsuba tidy up aese q2, v24.16b aesmc q2, q2 @ AES block 4k+6 - round 6 eor r20, r20, r14 @ AES block 4k+5 - round 10 high aese q1, v25.16b aesmc q1, q1 @ AES block 4k+5 - round 7 fmov v5.d[1], r20 @ AES block 4k+5 - mov high aese q0, v26.16b aesmc q0, q0 @ AES block 4k+4 - round 8 fmov v7.d[1], r24 @ AES block 4k+3 - mov high aese q3, v24.16b aesmc q3, q3 @ AES block 4k+7 - round 6 cmp r0, r5 @ .LOOP CONTROL aese q1, v26.16b aesmc q1, q1 @ AES block 4k+5 - round 8 eor v10.16b, v10.16b, v31.16b @ MODULO - fold into mid aese q0, v27.16b @ AES block 4k+4 - round 9 eor r21, r21, r13 @ AES block 4k+6 - round 10 low eor r22, r22, r14 @ AES block 4k+6 - round 10 high aese q3, v25.16b aesmc q3, q3 @ AES block 4k+7 - round 7 fmov d6, r21 @ AES block 4k+6 - mov low aese q1, v27.16b @ AES block 4k+5 - round 9 fmov v6.d[1], r22 @ AES block 4k+6 - mov high aese q2, v25.16b aesmc q2, q2 @ AES block 4k+6 - round 7 eor q4, q4, q0 @ AES block 4k+4 - result fmov d0, r10 @ CTR block 4k+8 aese q3, v26.16b aesmc q3, q3 @ AES block 4k+7 - round 8 fmov v0.d[1], r9 @ CTR block 4k+8 rev r9, r12 @ CTR block 4k+9 eor v10.16b, v10.16b, q9 @ MODULO - fold into mid aese q2, v26.16b aesmc q2, q2 @ AES block 4k+6 - round 8 eor q5, q5, q1 @ AES block 4k+5 - result add r12, r12, #1 @ CTR block 4k+9 orr r9, r11, r9, lsl #32 @ CTR block 4k+9 fmov d1, r10 @ CTR block 4k+9 pmull v9.1q, v10.1d, q8 @ MODULO - mid 64b align with low fmov v1.d[1], r9 @ CTR block 4k+9 rev r9, r12 @ CTR block 4k+10 aese q2, v27.16b @ AES block 4k+6 - round 9 st1 { q4}, [r2], #16 @ AES block 4k+4 - store result eor q6, q6, q2 @ AES block 4k+6 - result orr r9, r11, r9, lsl #32 @ CTR block 4k+10 aese q3, v27.16b @ AES block 4k+7 - round 9 add r12, r12, #1 @ CTR block 4k+10 ext v10.16b, v10.16b, v10.16b, #8 @ MODULO - other mid alignment fmov d2, r10 @ CTR block 4k+10 eor v11.16b, v11.16b, q9 @ MODULO - fold into low st1 { q5}, [r2], #16 @ AES block 4k+5 - store result fmov v2.d[1], r9 @ CTR block 4k+10 st1 { q6}, [r2], #16 @ AES block 4k+6 - store result rev r9, r12 @ CTR block 4k+11 orr r9, r11, r9, lsl #32 @ CTR block 4k+11 eor q7, q7, q3 @ AES block 4k+3 - result eor v11.16b, v11.16b, v10.16b @ MODULO - fold into low st1 { q7}, [r2], #16 @ AES block 4k+3 - store result blt .L128_enc_main_loop .L128_enc_prepretail:@ PREPRETAIL rev64 q4, q4 @ GHASH block 4k (only t0 is free) fmov d3, r10 @ CTR block 4k+3 rev64 q5, q5 @ GHASH block 4k+1 (t0 and t1 free) ext v11.16b, v11.16b, v11.16b, #8 @ PRE 0 add r12, r12, #1 @ CTR block 4k+3 fmov v3.d[1], r9 @ CTR block 4k+3 aese q1, 
v18.16b aesmc q1, q1 @ AES block 4k+5 - round 0 rev64 q6, q6 @ GHASH block 4k+2 (t0, t1, and t2 free) pmull v29.1q, q5, v14.1d @ GHASH block 4k+1 - low rev64 q7, q7 @ GHASH block 4k+3 (t0, t1, t2 and t3 free) eor q4, q4, v11.16b @ PRE 1 pmull2 v28.1q, q5, v14.2d @ GHASH block 4k+1 - high aese q3, v18.16b aesmc q3, q3 @ AES block 4k+7 - round 0 mov d30, v5.d[1] @ GHASH block 4k+1 - mid pmull v11.1q, q4, v15.1d @ GHASH block 4k - low mov d8, v4.d[1] @ GHASH block 4k - mid mov d31, v6.d[1] @ GHASH block 4k+2 - mid mov d10, v17.d[1] @ GHASH block 4k - mid aese q1, v19.16b aesmc q1, q1 @ AES block 4k+5 - round 1 eor v30.8b, v30.8b, q5 @ GHASH block 4k+1 - mid eor q8, q8, q4 @ GHASH block 4k - mid pmull2 v9.1q, q4, v15.2d @ GHASH block 4k - high eor v31.8b, v31.8b, q6 @ GHASH block 4k+2 - mid aese q3, v19.16b aesmc q3, q3 @ AES block 4k+7 - round 1 pmull v30.1q, v30.1d, v17.1d @ GHASH block 4k+1 - mid eor v11.16b, v11.16b, v29.16b @ GHASH block 4k+1 - low pmull v10.1q, q8, v10.1d @ GHASH block 4k - mid aese q0, v18.16b aesmc q0, q0 @ AES block 4k+4 - round 0 ins v31.d[1], v31.d[0] @ GHASH block 4k+2 - mid aese q2, v18.16b aesmc q2, q2 @ AES block 4k+6 - round 0 eor v10.16b, v10.16b, v30.16b @ GHASH block 4k+1 - mid mov d30, v7.d[1] @ GHASH block 4k+3 - mid aese q0, v19.16b aesmc q0, q0 @ AES block 4k+4 - round 1 eor q9, q9, v28.16b @ GHASH block 4k+1 - high pmull2 v31.1q, v31.2d, v16.2d @ GHASH block 4k+2 - mid pmull2 v8.1q, q6, v13.2d @ GHASH block 4k+2 - high eor v30.8b, v30.8b, q7 @ GHASH block 4k+3 - mid pmull2 v4.1q, q7, v12.2d @ GHASH block 4k+3 - high pmull v28.1q, q6, v13.1d @ GHASH block 4k+2 - low aese q2, v19.16b aesmc q2, q2 @ AES block 4k+6 - round 1 eor q9, q9, q8 @ GHASH block 4k+2 - high aese q0, v20.16b aesmc q0, q0 @ AES block 4k+4 - round 2 pmull v29.1q, q7, v12.1d @ GHASH block 4k+3 - low movi q8, #0xc2 aese q2, v20.16b aesmc q2, q2 @ AES block 4k+6 - round 2 eor v11.16b, v11.16b, v28.16b @ GHASH block 4k+2 - low aese q3, v20.16b aesmc q3, q3 @ AES block 4k+7 - round 2 pmull v30.1q, v30.1d, v16.1d @ GHASH block 4k+3 - mid eor v10.16b, v10.16b, v31.16b @ GHASH block 4k+2 - mid aese q2, v21.16b aesmc q2, q2 @ AES block 4k+6 - round 3 aese q1, v20.16b aesmc q1, q1 @ AES block 4k+5 - round 2 eor q9, q9, q4 @ GHASH block 4k+3 - high aese q0, v21.16b aesmc q0, q0 @ AES block 4k+4 - round 3 eor v10.16b, v10.16b, v30.16b @ GHASH block 4k+3 - mid shl d8, d8, #56 @ mod_constant aese q1, v21.16b aesmc q1, q1 @ AES block 4k+5 - round 3 eor v11.16b, v11.16b, v29.16b @ GHASH block 4k+3 - low aese q0, v22.16b aesmc q0, q0 @ AES block 4k+4 - round 4 pmull v28.1q, q9, q8 eor v10.16b, v10.16b, q9 @ karatsuba tidy up aese q1, v22.16b aesmc q1, q1 @ AES block 4k+5 - round 4 aese q0, v23.16b aesmc q0, q0 @ AES block 4k+4 - round 5 ext q9, q9, q9, #8 aese q3, v21.16b aesmc q3, q3 @ AES block 4k+7 - round 3 aese q2, v22.16b aesmc q2, q2 @ AES block 4k+6 - round 4 eor v10.16b, v10.16b, v11.16b aese q0, v24.16b aesmc q0, q0 @ AES block 4k+4 - round 6 aese q3, v22.16b aesmc q3, q3 @ AES block 4k+7 - round 4 aese q1, v23.16b aesmc q1, q1 @ AES block 4k+5 - round 5 aese q2, v23.16b aesmc q2, q2 @ AES block 4k+6 - round 5 eor v10.16b, v10.16b, v28.16b aese q3, v23.16b aesmc q3, q3 @ AES block 4k+7 - round 5 aese q1, v24.16b aesmc q1, q1 @ AES block 4k+5 - round 6 aese q2, v24.16b aesmc q2, q2 @ AES block 4k+6 - round 6 aese q3, v24.16b aesmc q3, q3 @ AES block 4k+7 - round 6 eor v10.16b, v10.16b, q9 aese q0, v25.16b aesmc q0, q0 @ AES block 4k+4 - round 7 aese q2, v25.16b aesmc q2, q2 @ AES block 4k+6 - 
round 7 aese q3, v25.16b aesmc q3, q3 @ AES block 4k+7 - round 7 pmull v28.1q, v10.1d, q8 aese q1, v25.16b aesmc q1, q1 @ AES block 4k+5 - round 7 ext v10.16b, v10.16b, v10.16b, #8 aese q3, v26.16b aesmc q3, q3 @ AES block 4k+7 - round 8 aese q0, v26.16b aesmc q0, q0 @ AES block 4k+4 - round 8 eor v11.16b, v11.16b, v28.16b aese q1, v26.16b aesmc q1, q1 @ AES block 4k+5 - round 8 aese q3, v27.16b @ AES block 4k+7 - round 9 aese q2, v26.16b aesmc q2, q2 @ AES block 4k+6 - round 8 aese q0, v27.16b @ AES block 4k+4 - round 9 aese q1, v27.16b @ AES block 4k+5 - round 9 eor v11.16b, v11.16b, v10.16b aese q2, v27.16b @ AES block 4k+6 - round 9 .L128_enc_tail:@ TAIL sub r5, r4, r0 @ main_end_input_ptr is number of bytes left to process ldp r6, r7, [r0], #16 @ AES block 4k+4 - load plaintext #ifdef __ARMEB__ rev r6, r6 rev r7, r7 #endif cmp r5, #48 ext q8, v11.16b, v11.16b, #8 @ prepare final partial tag eor r6, r6, r13 @ AES block 4k+4 - round 10 low eor r7, r7, r14 @ AES block 4k+4 - round 10 high fmov d4, r6 @ AES block 4k+4 - mov low fmov v4.d[1], r7 @ AES block 4k+4 - mov high eor q5, q4, q0 @ AES block 4k+4 - result bgt .L128_enc_blocks_more_than_3 sub r12, r12, #1 movi v11.8b, #0 mov q3, q2 cmp r5, #32 mov q2, q1 movi q9, #0 movi v10.8b, #0 bgt .L128_enc_blocks_more_than_2 mov q3, q1 cmp r5, #16 sub r12, r12, #1 bgt .L128_enc_blocks_more_than_1 sub r12, r12, #1 b .L128_enc_blocks_less_than_1 .L128_enc_blocks_more_than_3:@ blocks left > 3 st1 { q5}, [r2], #16 @ AES final-3 block - store result ldp r6, r7, [r0], #16 @ AES final-2 block - load input low & high #ifdef __ARMEB__ rev r6, r6 rev r7, r7 #endif rev64 q4, q5 @ GHASH final-3 block eor q4, q4, q8 @ feed in partial tag eor r7, r7, r14 @ AES final-2 block - round 10 high eor r6, r6, r13 @ AES final-2 block - round 10 low fmov d5, r6 @ AES final-2 block - mov low movi q8, #0 @ suppress further partial tag feed in fmov v5.d[1], r7 @ AES final-2 block - mov high pmull v11.1q, q4, v15.1d @ GHASH final-3 block - low mov d22, v4.d[1] @ GHASH final-3 block - mid pmull2 v9.1q, q4, v15.2d @ GHASH final-3 block - high mov d10, v17.d[1] @ GHASH final-3 block - mid eor q5, q5, q1 @ AES final-2 block - result eor v22.8b, v22.8b, q4 @ GHASH final-3 block - mid pmull v10.1q, v22.1d, v10.1d @ GHASH final-3 block - mid .L128_enc_blocks_more_than_2:@ blocks left > 2 st1 { q5}, [r2], #16 @ AES final-2 block - store result rev64 q4, q5 @ GHASH final-2 block ldp r6, r7, [r0], #16 @ AES final-1 block - load input low & high #ifdef __ARMEB__ rev r6, r6 rev r7, r7 #endif eor q4, q4, q8 @ feed in partial tag eor r6, r6, r13 @ AES final-1 block - round 10 low fmov d5, r6 @ AES final-1 block - mov low eor r7, r7, r14 @ AES final-1 block - round 10 high pmull2 v20.1q, q4, v14.2d @ GHASH final-2 block - high fmov v5.d[1], r7 @ AES final-1 block - mov high mov d22, v4.d[1] @ GHASH final-2 block - mid pmull v21.1q, q4, v14.1d @ GHASH final-2 block - low eor q9, q9, v20.16b @ GHASH final-2 block - high eor v22.8b, v22.8b, q4 @ GHASH final-2 block - mid eor q5, q5, q2 @ AES final-1 block - result eor v11.16b, v11.16b, v21.16b @ GHASH final-2 block - low pmull v22.1q, v22.1d, v17.1d @ GHASH final-2 block - mid movi q8, #0 @ suppress further partial tag feed in eor v10.16b, v10.16b, v22.16b @ GHASH final-2 block - mid .L128_enc_blocks_more_than_1:@ blocks left > 1 st1 { q5}, [r2], #16 @ AES final-1 block - store result rev64 q4, q5 @ GHASH final-1 block ldp r6, r7, [r0], #16 @ AES final block - load input low & high #ifdef __ARMEB__ rev r6, r6 rev r7, r7 #endif eor q4, q4, 
q8 @ feed in partial tag eor r7, r7, r14 @ AES final block - round 10 high eor r6, r6, r13 @ AES final block - round 10 low fmov d5, r6 @ AES final block - mov low pmull2 v20.1q, q4, v13.2d @ GHASH final-1 block - high fmov v5.d[1], r7 @ AES final block - mov high mov d22, v4.d[1] @ GHASH final-1 block - mid pmull v21.1q, q4, v13.1d @ GHASH final-1 block - low eor v22.8b, v22.8b, q4 @ GHASH final-1 block - mid eor q5, q5, q3 @ AES final block - result ins v22.d[1], v22.d[0] @ GHASH final-1 block - mid pmull2 v22.1q, v22.2d, v16.2d @ GHASH final-1 block - mid eor v11.16b, v11.16b, v21.16b @ GHASH final-1 block - low eor q9, q9, v20.16b @ GHASH final-1 block - high eor v10.16b, v10.16b, v22.16b @ GHASH final-1 block - mid movi q8, #0 @ suppress further partial tag feed in .L128_enc_blocks_less_than_1:@ blocks left <= 1 and r1, r1, #127 @ bit_length %= 128 mvn r13, xzr @ rk10_l = 0xffffffffffffffff mvn r14, xzr @ rk10_h = 0xffffffffffffffff sub r1, r1, #128 @ bit_length -= 128 neg r1, r1 @ bit_length = 128 - #bits in input (in range [1,128]) and r1, r1, #127 @ bit_length %= 128 lsr r14, r14, r1 @ rk10_h is mask for top 64b of last block cmp r1, #64 csel r6, r13, r14, lt csel r7, r14, xzr, lt fmov d0, r6 @ ctr0b is mask for last block fmov v0.d[1], r7 and q5, q5, q0 @ possibly partial last block has zeroes in highest bits rev64 q4, q5 @ GHASH final block eor q4, q4, q8 @ feed in partial tag mov d8, v4.d[1] @ GHASH final block - mid pmull v21.1q, q4, v12.1d @ GHASH final block - low ld1 { v18.16b}, [r2] @ load existing bytes where the possibly partial last block is to be stored eor q8, q8, q4 @ GHASH final block - mid #ifndef __ARMEB__ rev r9, r12 #else mov r9, r12 #endif pmull2 v20.1q, q4, v12.2d @ GHASH final block - high pmull v8.1q, q8, v16.1d @ GHASH final block - mid eor v11.16b, v11.16b, v21.16b @ GHASH final block - low eor q9, q9, v20.16b @ GHASH final block - high eor v10.16b, v10.16b, q8 @ GHASH final block - mid movi q8, #0xc2 eor v30.16b, v11.16b, q9 @ MODULO - karatsuba tidy up shl d8, d8, #56 @ mod_constant eor v10.16b, v10.16b, v30.16b @ MODULO - karatsuba tidy up pmull v31.1q, q9, q8 @ MODULO - top 64b align with mid ext q9, q9, q9, #8 @ MODULO - other top alignment eor v10.16b, v10.16b, v31.16b @ MODULO - fold into mid eor v10.16b, v10.16b, q9 @ MODULO - fold into mid pmull v9.1q, v10.1d, q8 @ MODULO - mid 64b align with low ext v10.16b, v10.16b, v10.16b, #8 @ MODULO - other mid alignment bif q5, v18.16b, q0 @ insert existing bytes in top end of result before storing eor v11.16b, v11.16b, q9 @ MODULO - fold into low st1 { q5}, [r2] @ store all 16B str r9, [r16, #12] @ store the updated counter eor v11.16b, v11.16b, v10.16b @ MODULO - fold into low ext v11.16b, v11.16b, v11.16b, #8 rev64 v11.16b, v11.16b mov r0, r15 st1 { v11.16b }, [r3] ldp r21, r22, [sp, #16] ldp r23, r24, [sp, #32] ldp d8, d9, [sp, #48] ldp d10, d11, [sp, #64] ldp d12, d13, [sp, #80] ldp d14, d15, [sp, #96] ldp r19, r20, [sp], #112 RET .L128_enc_ret: mov r0, #0x0 RET .size aes_gcm_enc_128_kernel,.-aes_gcm_enc_128_kernel .globl aes_gcm_dec_128_kernel .type aes_gcm_dec_128_kernel,%function .align 4 aes_gcm_dec_128_kernel: cbz r1, .L128_dec_ret stp r19, r20, [sp, #-112]! 
mov r16, r4 mov r8, r5 stp r21, r22, [sp, #16] stp r23, r24, [sp, #32] stp d8, d9, [sp, #48] stp d10, d11, [sp, #64] stp d12, d13, [sp, #80] stp d14, d15, [sp, #96] lsr r5, r1, #3 @ byte_len mov r15, r5 ldp r10, r11, [r16] @ ctr96_b64, ctr96_t32 #ifdef __ARMEB__ rev r10, r10 rev r11, r11 #endif ldp r13, r14, [r8, #160] @ load rk10 #ifdef __ARMEB__ ror r14, r14, 32 ror r13, r13, 32 #endif sub r5, r5, #1 @ byte_len - 1 ld1 {v18.4s}, [r8], #16 @ load rk0 and r5, r5, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail) ld1 { q0}, [r16] @ special case vector load initial counter so we can start first AES block as quickly as possible ldr q13, [r3, #64] @ load h2l | h2h #ifndef __ARMEB__ ext v13.16b, v13.16b, v13.16b, #8 #endif lsr r12, r11, #32 fmov d2, r10 @ CTR block 2 ld1 {v19.4s}, [r8], #16 @ load rk1 orr r11, r11, r11 rev r12, r12 @ rev_ctr32 fmov d1, r10 @ CTR block 1 add r12, r12, #1 @ increment rev_ctr32 aese q0, v18.16b aesmc q0, q0 @ AES block 0 - round 0 rev r9, r12 @ CTR block 1 orr r9, r11, r9, lsl #32 @ CTR block 1 ld1 {v20.4s}, [r8], #16 @ load rk2 add r12, r12, #1 @ CTR block 1 fmov v1.d[1], r9 @ CTR block 1 rev r9, r12 @ CTR block 2 add r12, r12, #1 @ CTR block 2 aese q0, v19.16b aesmc q0, q0 @ AES block 0 - round 1 orr r9, r11, r9, lsl #32 @ CTR block 2 fmov v2.d[1], r9 @ CTR block 2 rev r9, r12 @ CTR block 3 fmov d3, r10 @ CTR block 3 orr r9, r11, r9, lsl #32 @ CTR block 3 add r12, r12, #1 @ CTR block 3 fmov v3.d[1], r9 @ CTR block 3 add r4, r0, r1, lsr #3 @ end_input_ptr aese q1, v18.16b aesmc q1, q1 @ AES block 1 - round 0 ld1 {v21.4s}, [r8], #16 @ load rk3 aese q0, v20.16b aesmc q0, q0 @ AES block 0 - round 2 ld1 {v22.4s}, [r8], #16 @ load rk4 aese q2, v18.16b aesmc q2, q2 @ AES block 2 - round 0 ld1 {v23.4s}, [r8], #16 @ load rk5 aese q1, v19.16b aesmc q1, q1 @ AES block 1 - round 1 ld1 {v24.4s}, [r8], #16 @ load rk6 aese q3, v18.16b aesmc q3, q3 @ AES block 3 - round 0 aese q2, v19.16b aesmc q2, q2 @ AES block 2 - round 1 aese q1, v20.16b aesmc q1, q1 @ AES block 1 - round 2 aese q3, v19.16b aesmc q3, q3 @ AES block 3 - round 1 ld1 { v11.16b}, [r3] ext v11.16b, v11.16b, v11.16b, #8 rev64 v11.16b, v11.16b aese q0, v21.16b aesmc q0, q0 @ AES block 0 - round 3 ld1 {v25.4s}, [r8], #16 @ load rk7 aese q1, v21.16b aesmc q1, q1 @ AES block 1 - round 3 aese q3, v20.16b aesmc q3, q3 @ AES block 3 - round 2 aese q2, v20.16b aesmc q2, q2 @ AES block 2 - round 2 ld1 {v26.4s}, [r8], #16 @ load rk8 aese q1, v22.16b aesmc q1, q1 @ AES block 1 - round 4 aese q3, v21.16b aesmc q3, q3 @ AES block 3 - round 3 aese q2, v21.16b aesmc q2, q2 @ AES block 2 - round 3 ldr q14, [r3, #80] @ load h3l | h3h #ifndef __ARMEB__ ext v14.16b, v14.16b, v14.16b, #8 #endif aese q0, v22.16b aesmc q0, q0 @ AES block 0 - round 4 ld1 {v27.4s}, [r8], #16 @ load rk9 aese q1, v23.16b aesmc q1, q1 @ AES block 1 - round 5 aese q2, v22.16b aesmc q2, q2 @ AES block 2 - round 4 aese q3, v22.16b aesmc q3, q3 @ AES block 3 - round 4 aese q0, v23.16b aesmc q0, q0 @ AES block 0 - round 5 aese q2, v23.16b aesmc q2, q2 @ AES block 2 - round 5 ldr q12, [r3, #32] @ load h1l | h1h #ifndef __ARMEB__ ext v12.16b, v12.16b, v12.16b, #8 #endif aese q3, v23.16b aesmc q3, q3 @ AES block 3 - round 5 aese q0, v24.16b aesmc q0, q0 @ AES block 0 - round 6 aese q1, v24.16b aesmc q1, q1 @ AES block 1 - round 6 aese q3, v24.16b aesmc q3, q3 @ AES block 3 - round 6 aese q2, v24.16b aesmc q2, q2 @ AES block 2 - round 6 trn1 q8, v12.2d, v13.2d @ h2h | h1h ldr q15, [r3, #112] @ load h4l | 
h4h #ifndef __ARMEB__ ext v15.16b, v15.16b, v15.16b, #8 #endif trn2 v16.2d, v12.2d, v13.2d @ h2l | h1l add r5, r5, r0 aese q1, v25.16b aesmc q1, q1 @ AES block 1 - round 7 aese q2, v25.16b aesmc q2, q2 @ AES block 2 - round 7 aese q0, v25.16b aesmc q0, q0 @ AES block 0 - round 7 eor v16.16b, v16.16b, q8 @ h2k | h1k aese q3, v25.16b aesmc q3, q3 @ AES block 3 - round 7 aese q1, v26.16b aesmc q1, q1 @ AES block 1 - round 8 trn2 v17.2d, v14.2d, v15.2d @ h4l | h3l aese q2, v26.16b aesmc q2, q2 @ AES block 2 - round 8 aese q3, v26.16b aesmc q3, q3 @ AES block 3 - round 8 aese q0, v26.16b aesmc q0, q0 @ AES block 0 - round 8 trn1 q9, v14.2d, v15.2d @ h4h | h3h aese q2, v27.16b @ AES block 2 - round 9 aese q3, v27.16b @ AES block 3 - round 9 aese q0, v27.16b @ AES block 0 - round 9 cmp r0, r5 @ check if we have <= 4 blocks aese q1, v27.16b @ AES block 1 - round 9 eor v17.16b, v17.16b, q9 @ h4k | h3k bge .L128_dec_tail @ handle tail ld1 {q4, q5}, [r0], #32 @ AES block 0 - load ciphertext; AES block 1 - load ciphertext eor q1, q5, q1 @ AES block 1 - result ld1 {q6}, [r0], #16 @ AES block 2 - load ciphertext eor q0, q4, q0 @ AES block 0 - result rev64 q4, q4 @ GHASH block 0 rev r9, r12 @ CTR block 4 orr r9, r11, r9, lsl #32 @ CTR block 4 add r12, r12, #1 @ CTR block 4 ld1 {q7}, [r0], #16 @ AES block 3 - load ciphertext rev64 q5, q5 @ GHASH block 1 mov r19, v1.d[0] @ AES block 1 - mov low mov r20, v1.d[1] @ AES block 1 - mov high mov r6, v0.d[0] @ AES block 0 - mov low cmp r0, r5 @ check if we have <= 8 blocks mov r7, v0.d[1] @ AES block 0 - mov high fmov d0, r10 @ CTR block 4 fmov v0.d[1], r9 @ CTR block 4 rev r9, r12 @ CTR block 5 eor r19, r19, r13 @ AES block 1 - round 10 low #ifdef __ARMEB__ rev r19, r19 #endif fmov d1, r10 @ CTR block 5 add r12, r12, #1 @ CTR block 5 orr r9, r11, r9, lsl #32 @ CTR block 5 fmov v1.d[1], r9 @ CTR block 5 rev r9, r12 @ CTR block 6 add r12, r12, #1 @ CTR block 6 orr r9, r11, r9, lsl #32 @ CTR block 6 eor r20, r20, r14 @ AES block 1 - round 10 high #ifdef __ARMEB__ rev r20, r20 #endif eor r6, r6, r13 @ AES block 0 - round 10 low #ifdef __ARMEB__ rev r6, r6 #endif eor q2, q6, q2 @ AES block 2 - result eor r7, r7, r14 @ AES block 0 - round 10 high #ifdef __ARMEB__ rev r7, r7 #endif stp r6, r7, [r2], #16 @ AES block 0 - store result stp r19, r20, [r2], #16 @ AES block 1 - store result bge .L128_dec_prepretail @ do prepretail .L128_dec_main_loop:@ main loop start eor q3, q7, q3 @ AES block 4k+3 - result ext v11.16b, v11.16b, v11.16b, #8 @ PRE 0 mov r21, v2.d[0] @ AES block 4k+2 - mov low pmull2 v28.1q, q5, v14.2d @ GHASH block 4k+1 - high mov r22, v2.d[1] @ AES block 4k+2 - mov high aese q1, v18.16b aesmc q1, q1 @ AES block 4k+5 - round 0 fmov d2, r10 @ CTR block 4k+6 rev64 q6, q6 @ GHASH block 4k+2 fmov v2.d[1], r9 @ CTR block 4k+6 rev r9, r12 @ CTR block 4k+7 mov r23, v3.d[0] @ AES block 4k+3 - mov low eor q4, q4, v11.16b @ PRE 1 mov d30, v5.d[1] @ GHASH block 4k+1 - mid aese q1, v19.16b aesmc q1, q1 @ AES block 4k+5 - round 1 rev64 q7, q7 @ GHASH block 4k+3 pmull v29.1q, q5, v14.1d @ GHASH block 4k+1 - low mov r24, v3.d[1] @ AES block 4k+3 - mov high orr r9, r11, r9, lsl #32 @ CTR block 4k+7 pmull v11.1q, q4, v15.1d @ GHASH block 4k - low fmov d3, r10 @ CTR block 4k+7 eor v30.8b, v30.8b, q5 @ GHASH block 4k+1 - mid aese q1, v20.16b aesmc q1, q1 @ AES block 4k+5 - round 2 fmov v3.d[1], r9 @ CTR block 4k+7 aese q2, v18.16b aesmc q2, q2 @ AES block 4k+6 - round 0 mov d10, v17.d[1] @ GHASH block 4k - mid pmull2 v9.1q, q4, v15.2d @ GHASH block 4k - high eor v11.16b, 
v11.16b, v29.16b @ GHASH block 4k+1 - low pmull v29.1q, q7, v12.1d @ GHASH block 4k+3 - low aese q1, v21.16b aesmc q1, q1 @ AES block 4k+5 - round 3 mov d8, v4.d[1] @ GHASH block 4k - mid aese q3, v18.16b aesmc q3, q3 @ AES block 4k+7 - round 0 eor q9, q9, v28.16b @ GHASH block 4k+1 - high aese q0, v18.16b aesmc q0, q0 @ AES block 4k+4 - round 0 pmull v28.1q, q6, v13.1d @ GHASH block 4k+2 - low eor q8, q8, q4 @ GHASH block 4k - mid aese q3, v19.16b aesmc q3, q3 @ AES block 4k+7 - round 1 eor r23, r23, r13 @ AES block 4k+3 - round 10 low #ifdef __ARMEB__ rev r23, r23 #endif pmull v30.1q, v30.1d, v17.1d @ GHASH block 4k+1 - mid eor r22, r22, r14 @ AES block 4k+2 - round 10 high #ifdef __ARMEB__ rev r22, r22 #endif mov d31, v6.d[1] @ GHASH block 4k+2 - mid aese q0, v19.16b aesmc q0, q0 @ AES block 4k+4 - round 1 eor v11.16b, v11.16b, v28.16b @ GHASH block 4k+2 - low pmull v10.1q, q8, v10.1d @ GHASH block 4k - mid aese q3, v20.16b aesmc q3, q3 @ AES block 4k+7 - round 2 eor v31.8b, v31.8b, q6 @ GHASH block 4k+2 - mid aese q0, v20.16b aesmc q0, q0 @ AES block 4k+4 - round 2 aese q1, v22.16b aesmc q1, q1 @ AES block 4k+5 - round 4 eor v10.16b, v10.16b, v30.16b @ GHASH block 4k+1 - mid pmull2 v8.1q, q6, v13.2d @ GHASH block 4k+2 - high aese q0, v21.16b aesmc q0, q0 @ AES block 4k+4 - round 3 ins v31.d[1], v31.d[0] @ GHASH block 4k+2 - mid pmull2 v4.1q, q7, v12.2d @ GHASH block 4k+3 - high aese q2, v19.16b aesmc q2, q2 @ AES block 4k+6 - round 1 mov d30, v7.d[1] @ GHASH block 4k+3 - mid aese q0, v22.16b aesmc q0, q0 @ AES block 4k+4 - round 4 eor q9, q9, q8 @ GHASH block 4k+2 - high pmull2 v31.1q, v31.2d, v16.2d @ GHASH block 4k+2 - mid eor r24, r24, r14 @ AES block 4k+3 - round 10 high #ifdef __ARMEB__ rev r24, r24 #endif aese q2, v20.16b aesmc q2, q2 @ AES block 4k+6 - round 2 eor v30.8b, v30.8b, q7 @ GHASH block 4k+3 - mid aese q1, v23.16b aesmc q1, q1 @ AES block 4k+5 - round 5 eor r21, r21, r13 @ AES block 4k+2 - round 10 low #ifdef __ARMEB__ rev r21, r21 #endif aese q0, v23.16b aesmc q0, q0 @ AES block 4k+4 - round 5 movi q8, #0xc2 aese q2, v21.16b aesmc q2, q2 @ AES block 4k+6 - round 3 eor v11.16b, v11.16b, v29.16b @ GHASH block 4k+3 - low aese q1, v24.16b aesmc q1, q1 @ AES block 4k+5 - round 6 aese q0, v24.16b aesmc q0, q0 @ AES block 4k+4 - round 6 eor v10.16b, v10.16b, v31.16b @ GHASH block 4k+2 - mid aese q2, v22.16b aesmc q2, q2 @ AES block 4k+6 - round 4 stp r21, r22, [r2], #16 @ AES block 4k+2 - store result pmull v30.1q, v30.1d, v16.1d @ GHASH block 4k+3 - mid eor q9, q9, q4 @ GHASH block 4k+3 - high ld1 {q4}, [r0], #16 @ AES block 4k+3 - load ciphertext aese q1, v25.16b aesmc q1, q1 @ AES block 4k+5 - round 7 add r12, r12, #1 @ CTR block 4k+7 aese q0, v25.16b aesmc q0, q0 @ AES block 4k+4 - round 7 shl d8, d8, #56 @ mod_constant aese q2, v23.16b aesmc q2, q2 @ AES block 4k+6 - round 5 eor v10.16b, v10.16b, v30.16b @ GHASH block 4k+3 - mid aese q1, v26.16b aesmc q1, q1 @ AES block 4k+5 - round 8 stp r23, r24, [r2], #16 @ AES block 4k+3 - store result aese q0, v26.16b aesmc q0, q0 @ AES block 4k+4 - round 8 eor v30.16b, v11.16b, q9 @ MODULO - karatsuba tidy up aese q3, v21.16b aesmc q3, q3 @ AES block 4k+7 - round 3 rev r9, r12 @ CTR block 4k+8 pmull v31.1q, q9, q8 @ MODULO - top 64b align with mid ld1 {q5}, [r0], #16 @ AES block 4k+4 - load ciphertext ext q9, q9, q9, #8 @ MODULO - other top alignment aese q0, v27.16b @ AES block 4k+4 - round 9 orr r9, r11, r9, lsl #32 @ CTR block 4k+8 aese q3, v22.16b aesmc q3, q3 @ AES block 4k+7 - round 4 eor v10.16b, v10.16b, v30.16b @ MODULO - 
karatsuba tidy up aese q1, v27.16b @ AES block 4k+5 - round 9 aese q2, v24.16b aesmc q2, q2 @ AES block 4k+6 - round 6 eor q0, q4, q0 @ AES block 4k+4 - result aese q3, v23.16b aesmc q3, q3 @ AES block 4k+7 - round 5 ld1 {q6}, [r0], #16 @ AES block 4k+5 - load ciphertext add r12, r12, #1 @ CTR block 4k+8 eor v10.16b, v10.16b, v31.16b @ MODULO - fold into mid eor q1, q5, q1 @ AES block 4k+5 - result aese q2, v25.16b aesmc q2, q2 @ AES block 4k+6 - round 7 ld1 {q7}, [r0], #16 @ AES block 4k+6 - load ciphertext aese q3, v24.16b aesmc q3, q3 @ AES block 4k+7 - round 6 rev64 q5, q5 @ GHASH block 4k+5 eor v10.16b, v10.16b, q9 @ MODULO - fold into mid mov r7, v0.d[1] @ AES block 4k+4 - mov high aese q2, v26.16b aesmc q2, q2 @ AES block 4k+6 - round 8 mov r6, v0.d[0] @ AES block 4k+4 - mov low aese q3, v25.16b aesmc q3, q3 @ AES block 4k+7 - round 7 fmov d0, r10 @ CTR block 4k+8 pmull v8.1q, v10.1d, q8 @ MODULO - mid 64b align with low fmov v0.d[1], r9 @ CTR block 4k+8 rev r9, r12 @ CTR block 4k+9 aese q2, v27.16b @ AES block 4k+6 - round 9 orr r9, r11, r9, lsl #32 @ CTR block 4k+9 ext v10.16b, v10.16b, v10.16b, #8 @ MODULO - other mid alignment aese q3, v26.16b aesmc q3, q3 @ AES block 4k+7 - round 8 eor r7, r7, r14 @ AES block 4k+4 - round 10 high #ifdef __ARMEB__ rev r7, r7 #endif eor v11.16b, v11.16b, q8 @ MODULO - fold into low mov r20, v1.d[1] @ AES block 4k+5 - mov high eor r6, r6, r13 @ AES block 4k+4 - round 10 low #ifdef __ARMEB__ rev r6, r6 #endif eor q2, q6, q2 @ AES block 4k+6 - result mov r19, v1.d[0] @ AES block 4k+5 - mov low add r12, r12, #1 @ CTR block 4k+9 aese q3, v27.16b @ AES block 4k+7 - round 9 fmov d1, r10 @ CTR block 4k+9 cmp r0, r5 @ .LOOP CONTROL rev64 q4, q4 @ GHASH block 4k+4 eor v11.16b, v11.16b, v10.16b @ MODULO - fold into low fmov v1.d[1], r9 @ CTR block 4k+9 rev r9, r12 @ CTR block 4k+10 add r12, r12, #1 @ CTR block 4k+10 eor r20, r20, r14 @ AES block 4k+5 - round 10 high #ifdef __ARMEB__ rev r20, r20 #endif stp r6, r7, [r2], #16 @ AES block 4k+4 - store result eor r19, r19, r13 @ AES block 4k+5 - round 10 low #ifdef __ARMEB__ rev r19, r19 #endif stp r19, r20, [r2], #16 @ AES block 4k+5 - store result orr r9, r11, r9, lsl #32 @ CTR block 4k+10 blt .L128_dec_main_loop .L128_dec_prepretail:@ PREPRETAIL ext v11.16b, v11.16b, v11.16b, #8 @ PRE 0 mov r21, v2.d[0] @ AES block 4k+2 - mov low mov d30, v5.d[1] @ GHASH block 4k+1 - mid aese q0, v18.16b aesmc q0, q0 @ AES block 4k+4 - round 0 eor q3, q7, q3 @ AES block 4k+3 - result aese q1, v18.16b aesmc q1, q1 @ AES block 4k+5 - round 0 mov r22, v2.d[1] @ AES block 4k+2 - mov high eor q4, q4, v11.16b @ PRE 1 fmov d2, r10 @ CTR block 4k+6 rev64 q6, q6 @ GHASH block 4k+2 aese q0, v19.16b aesmc q0, q0 @ AES block 4k+4 - round 1 fmov v2.d[1], r9 @ CTR block 4k+6 rev r9, r12 @ CTR block 4k+7 mov r23, v3.d[0] @ AES block 4k+3 - mov low eor v30.8b, v30.8b, q5 @ GHASH block 4k+1 - mid pmull v11.1q, q4, v15.1d @ GHASH block 4k - low mov d10, v17.d[1] @ GHASH block 4k - mid mov r24, v3.d[1] @ AES block 4k+3 - mov high aese q1, v19.16b aesmc q1, q1 @ AES block 4k+5 - round 1 mov d31, v6.d[1] @ GHASH block 4k+2 - mid aese q0, v20.16b aesmc q0, q0 @ AES block 4k+4 - round 2 orr r9, r11, r9, lsl #32 @ CTR block 4k+7 pmull v29.1q, q5, v14.1d @ GHASH block 4k+1 - low mov d8, v4.d[1] @ GHASH block 4k - mid fmov d3, r10 @ CTR block 4k+7 aese q2, v18.16b aesmc q2, q2 @ AES block 4k+6 - round 0 fmov v3.d[1], r9 @ CTR block 4k+7 pmull v30.1q, v30.1d, v17.1d @ GHASH block 4k+1 - mid eor v31.8b, v31.8b, q6 @ GHASH block 4k+2 - mid rev64 q7, q7 
@ GHASH block 4k+3 aese q2, v19.16b aesmc q2, q2 @ AES block 4k+6 - round 1 eor q8, q8, q4 @ GHASH block 4k - mid pmull2 v9.1q, q4, v15.2d @ GHASH block 4k - high aese q3, v18.16b aesmc q3, q3 @ AES block 4k+7 - round 0 ins v31.d[1], v31.d[0] @ GHASH block 4k+2 - mid pmull2 v28.1q, q5, v14.2d @ GHASH block 4k+1 - high pmull v10.1q, q8, v10.1d @ GHASH block 4k - mid eor v11.16b, v11.16b, v29.16b @ GHASH block 4k+1 - low pmull v29.1q, q7, v12.1d @ GHASH block 4k+3 - low pmull2 v31.1q, v31.2d, v16.2d @ GHASH block 4k+2 - mid eor q9, q9, v28.16b @ GHASH block 4k+1 - high eor v10.16b, v10.16b, v30.16b @ GHASH block 4k+1 - mid pmull2 v4.1q, q7, v12.2d @ GHASH block 4k+3 - high pmull2 v8.1q, q6, v13.2d @ GHASH block 4k+2 - high mov d30, v7.d[1] @ GHASH block 4k+3 - mid aese q1, v20.16b aesmc q1, q1 @ AES block 4k+5 - round 2 eor v10.16b, v10.16b, v31.16b @ GHASH block 4k+2 - mid pmull v28.1q, q6, v13.1d @ GHASH block 4k+2 - low eor q9, q9, q8 @ GHASH block 4k+2 - high movi q8, #0xc2 aese q3, v19.16b aesmc q3, q3 @ AES block 4k+7 - round 1 eor v30.8b, v30.8b, q7 @ GHASH block 4k+3 - mid eor v11.16b, v11.16b, v28.16b @ GHASH block 4k+2 - low aese q2, v20.16b aesmc q2, q2 @ AES block 4k+6 - round 2 eor q9, q9, q4 @ GHASH block 4k+3 - high aese q3, v20.16b aesmc q3, q3 @ AES block 4k+7 - round 2 eor r23, r23, r13 @ AES block 4k+3 - round 10 low #ifdef __ARMEB__ rev r23, r23 #endif pmull v30.1q, v30.1d, v16.1d @ GHASH block 4k+3 - mid eor r21, r21, r13 @ AES block 4k+2 - round 10 low #ifdef __ARMEB__ rev r21, r21 #endif eor v11.16b, v11.16b, v29.16b @ GHASH block 4k+3 - low aese q2, v21.16b aesmc q2, q2 @ AES block 4k+6 - round 3 aese q1, v21.16b aesmc q1, q1 @ AES block 4k+5 - round 3 shl d8, d8, #56 @ mod_constant aese q0, v21.16b aesmc q0, q0 @ AES block 4k+4 - round 3 aese q2, v22.16b aesmc q2, q2 @ AES block 4k+6 - round 4 eor v10.16b, v10.16b, v30.16b @ GHASH block 4k+3 - mid aese q1, v22.16b aesmc q1, q1 @ AES block 4k+5 - round 4 aese q3, v21.16b aesmc q3, q3 @ AES block 4k+7 - round 3 eor v30.16b, v11.16b, q9 @ MODULO - karatsuba tidy up aese q2, v23.16b aesmc q2, q2 @ AES block 4k+6 - round 5 aese q1, v23.16b aesmc q1, q1 @ AES block 4k+5 - round 5 aese q3, v22.16b aesmc q3, q3 @ AES block 4k+7 - round 4 aese q0, v22.16b aesmc q0, q0 @ AES block 4k+4 - round 4 eor v10.16b, v10.16b, v30.16b @ MODULO - karatsuba tidy up pmull v31.1q, q9, q8 @ MODULO - top 64b align with mid aese q1, v24.16b aesmc q1, q1 @ AES block 4k+5 - round 6 ext q9, q9, q9, #8 @ MODULO - other top alignment aese q3, v23.16b aesmc q3, q3 @ AES block 4k+7 - round 5 aese q0, v23.16b aesmc q0, q0 @ AES block 4k+4 - round 5 eor v10.16b, v10.16b, v31.16b @ MODULO - fold into mid aese q1, v25.16b aesmc q1, q1 @ AES block 4k+5 - round 7 aese q2, v24.16b aesmc q2, q2 @ AES block 4k+6 - round 6 aese q0, v24.16b aesmc q0, q0 @ AES block 4k+4 - round 6 aese q1, v26.16b aesmc q1, q1 @ AES block 4k+5 - round 8 eor v10.16b, v10.16b, q9 @ MODULO - fold into mid aese q3, v24.16b aesmc q3, q3 @ AES block 4k+7 - round 6 aese q0, v25.16b aesmc q0, q0 @ AES block 4k+4 - round 7 aese q1, v27.16b @ AES block 4k+5 - round 9 pmull v8.1q, v10.1d, q8 @ MODULO - mid 64b align with low eor r24, r24, r14 @ AES block 4k+3 - round 10 high #ifdef __ARMEB__ rev r24, r24 #endif aese q2, v25.16b aesmc q2, q2 @ AES block 4k+6 - round 7 ext v10.16b, v10.16b, v10.16b, #8 @ MODULO - other mid alignment aese q3, v25.16b aesmc q3, q3 @ AES block 4k+7 - round 7 aese q0, v26.16b aesmc q0, q0 @ AES block 4k+4 - round 8 eor v11.16b, v11.16b, q8 @ MODULO - fold into 
low aese q2, v26.16b aesmc q2, q2 @ AES block 4k+6 - round 8 aese q3, v26.16b aesmc q3, q3 @ AES block 4k+7 - round 8 eor r22, r22, r14 @ AES block 4k+2 - round 10 high #ifdef __ARMEB__ rev r22, r22 #endif aese q0, v27.16b @ AES block 4k+4 - round 9 stp r21, r22, [r2], #16 @ AES block 4k+2 - store result aese q2, v27.16b @ AES block 4k+6 - round 9 add r12, r12, #1 @ CTR block 4k+7 stp r23, r24, [r2], #16 @ AES block 4k+3 - store result aese q3, v27.16b @ AES block 4k+7 - round 9 eor v11.16b, v11.16b, v10.16b @ MODULO - fold into low .L128_dec_tail:@ TAIL sub r5, r4, r0 @ main_end_input_ptr is number of bytes left to process ld1 { q5}, [r0], #16 @ AES block 4k+4 - load ciphertext eor q0, q5, q0 @ AES block 4k+4 - result mov r7, v0.d[1] @ AES block 4k+4 - mov high mov r6, v0.d[0] @ AES block 4k+4 - mov low cmp r5, #48 eor r7, r7, r14 @ AES block 4k+4 - round 10 high #ifdef __ARMEB__ rev r7, r7 #endif ext q8, v11.16b, v11.16b, #8 @ prepare final partial tag eor r6, r6, r13 @ AES block 4k+4 - round 10 low #ifdef __ARMEB__ rev r6, r6 #endif bgt .L128_dec_blocks_more_than_3 mov q3, q2 sub r12, r12, #1 movi v11.8b, #0 movi q9, #0 mov q2, q1 movi v10.8b, #0 cmp r5, #32 bgt .L128_dec_blocks_more_than_2 cmp r5, #16 mov q3, q1 sub r12, r12, #1 bgt .L128_dec_blocks_more_than_1 sub r12, r12, #1 b .L128_dec_blocks_less_than_1 .L128_dec_blocks_more_than_3:@ blocks left > 3 rev64 q4, q5 @ GHASH final-3 block ld1 { q5}, [r0], #16 @ AES final-2 block - load ciphertext eor q4, q4, q8 @ feed in partial tag mov d10, v17.d[1] @ GHASH final-3 block - mid stp r6, r7, [r2], #16 @ AES final-3 block - store result eor q0, q5, q1 @ AES final-2 block - result mov d22, v4.d[1] @ GHASH final-3 block - mid mov r7, v0.d[1] @ AES final-2 block - mov high pmull v11.1q, q4, v15.1d @ GHASH final-3 block - low mov r6, v0.d[0] @ AES final-2 block - mov low pmull2 v9.1q, q4, v15.2d @ GHASH final-3 block - high eor v22.8b, v22.8b, q4 @ GHASH final-3 block - mid movi q8, #0 @ suppress further partial tag feed in eor r7, r7, r14 @ AES final-2 block - round 10 high #ifdef __ARMEB__ rev r7, r7 #endif pmull v10.1q, v22.1d, v10.1d @ GHASH final-3 block - mid eor r6, r6, r13 @ AES final-2 block - round 10 low #ifdef __ARMEB__ rev r6, r6 #endif .L128_dec_blocks_more_than_2:@ blocks left > 2 rev64 q4, q5 @ GHASH final-2 block ld1 { q5}, [r0], #16 @ AES final-1 block - load ciphertext eor q4, q4, q8 @ feed in partial tag eor q0, q5, q2 @ AES final-1 block - result stp r6, r7, [r2], #16 @ AES final-2 block - store result mov d22, v4.d[1] @ GHASH final-2 block - mid pmull v21.1q, q4, v14.1d @ GHASH final-2 block - low pmull2 v20.1q, q4, v14.2d @ GHASH final-2 block - high mov r6, v0.d[0] @ AES final-1 block - mov low mov r7, v0.d[1] @ AES final-1 block - mov high eor v22.8b, v22.8b, q4 @ GHASH final-2 block - mid movi q8, #0 @ suppress further partial tag feed in pmull v22.1q, v22.1d, v17.1d @ GHASH final-2 block - mid eor r6, r6, r13 @ AES final-1 block - round 10 low #ifdef __ARMEB__ rev r6, r6 #endif eor v11.16b, v11.16b, v21.16b @ GHASH final-2 block - low eor q9, q9, v20.16b @ GHASH final-2 block - high eor v10.16b, v10.16b, v22.16b @ GHASH final-2 block - mid eor r7, r7, r14 @ AES final-1 block - round 10 high #ifdef __ARMEB__ rev r7, r7 #endif .L128_dec_blocks_more_than_1:@ blocks left > 1 rev64 q4, q5 @ GHASH final-1 block ld1 { q5}, [r0], #16 @ AES final block - load ciphertext eor q4, q4, q8 @ feed in partial tag mov d22, v4.d[1] @ GHASH final-1 block - mid eor q0, q5, q3 @ AES final block - result eor v22.8b, v22.8b, q4 @ GHASH 
final-1 block - mid stp r6, r7, [r2], #16 @ AES final-1 block - store result mov r6, v0.d[0] @ AES final block - mov low mov r7, v0.d[1] @ AES final block - mov high ins v22.d[1], v22.d[0] @ GHASH final-1 block - mid pmull v21.1q, q4, v13.1d @ GHASH final-1 block - low pmull2 v20.1q, q4, v13.2d @ GHASH final-1 block - high pmull2 v22.1q, v22.2d, v16.2d @ GHASH final-1 block - mid movi q8, #0 @ suppress further partial tag feed in eor v11.16b, v11.16b, v21.16b @ GHASH final-1 block - low eor q9, q9, v20.16b @ GHASH final-1 block - high eor r7, r7, r14 @ AES final block - round 10 high #ifdef __ARMEB__ rev r7, r7 #endif eor r6, r6, r13 @ AES final block - round 10 low #ifdef __ARMEB__ rev r6, r6 #endif eor v10.16b, v10.16b, v22.16b @ GHASH final-1 block - mid .L128_dec_blocks_less_than_1:@ blocks left <= 1 mvn r14, xzr @ rk10_h = 0xffffffffffffffff and r1, r1, #127 @ bit_length %= 128 mvn r13, xzr @ rk10_l = 0xffffffffffffffff sub r1, r1, #128 @ bit_length -= 128 neg r1, r1 @ bit_length = 128 - #bits in input (in range [1,128]) and r1, r1, #127 @ bit_length %= 128 lsr r14, r14, r1 @ rk10_h is mask for top 64b of last block cmp r1, #64 csel r10, r14, xzr, lt csel r9, r13, r14, lt fmov d0, r9 @ ctr0b is mask for last block mov v0.d[1], r10 and q5, q5, q0 @ possibly partial last block has zeroes in highest bits rev64 q4, q5 @ GHASH final block eor q4, q4, q8 @ feed in partial tag ldp r4, r5, [r2] @ load existing bytes we need to not overwrite and r7, r7, r10 pmull2 v20.1q, q4, v12.2d @ GHASH final block - high mov d8, v4.d[1] @ GHASH final block - mid eor q8, q8, q4 @ GHASH final block - mid eor q9, q9, v20.16b @ GHASH final block - high pmull v8.1q, q8, v16.1d @ GHASH final block - mid pmull v21.1q, q4, v12.1d @ GHASH final block - low bic r4, r4, r9 @ mask out low existing bytes and r6, r6, r9 #ifndef __ARMEB__ rev r9, r12 #else mov r9, r12 #endif eor v10.16b, v10.16b, q8 @ GHASH final block - mid movi q8, #0xc2 eor v11.16b, v11.16b, v21.16b @ GHASH final block - low bic r5, r5, r10 @ mask out high existing bytes shl d8, d8, #56 @ mod_constant eor v30.16b, v11.16b, q9 @ MODULO - karatsuba tidy up pmull v31.1q, q9, q8 @ MODULO - top 64b align with mid eor v10.16b, v10.16b, v30.16b @ MODULO - karatsuba tidy up orr r6, r6, r4 str r9, [r16, #12] @ store the updated counter orr r7, r7, r5 stp r6, r7, [r2] ext q9, q9, q9, #8 @ MODULO - other top alignment eor v10.16b, v10.16b, v31.16b @ MODULO - fold into mid eor v10.16b, v10.16b, q9 @ MODULO - fold into mid pmull v8.1q, v10.1d, q8 @ MODULO - mid 64b align with low ext v10.16b, v10.16b, v10.16b, #8 @ MODULO - other mid alignment eor v11.16b, v11.16b, q8 @ MODULO - fold into low eor v11.16b, v11.16b, v10.16b @ MODULO - fold into low ext v11.16b, v11.16b, v11.16b, #8 rev64 v11.16b, v11.16b mov r0, r15 st1 { v11.16b }, [r3] ldp r21, r22, [sp, #16] ldp r23, r24, [sp, #32] ldp d8, d9, [sp, #48] ldp d10, d11, [sp, #64] ldp d12, d13, [sp, #80] ldp d14, d15, [sp, #96] ldp r19, r20, [sp], #112 RET .L128_dec_ret: mov r0, #0x0 RET .size aes_gcm_dec_128_kernel,.-aes_gcm_dec_128_kernel .globl aes_gcm_enc_192_kernel .type aes_gcm_enc_192_kernel,%function .align 4 aes_gcm_enc_192_kernel: cbz r1, .L192_enc_ret stp r19, r20, [sp, #-112]! 
mov r16, r4 mov r8, r5 stp r21, r22, [sp, #16] stp r23, r24, [sp, #32] stp d8, d9, [sp, #48] stp d10, d11, [sp, #64] stp d12, d13, [sp, #80] stp d14, d15, [sp, #96] ldp r10, r11, [r16] @ ctr96_b64, ctr96_t32 #ifdef __ARMEB__ rev r10, r10 rev r11, r11 #endif ldp r13, r14, [r8, #192] @ load rk12 #ifdef __ARMEB__ ror r13, r13, #32 ror r14, r14, #32 #endif ld1 {v18.4s}, [r8], #16 @ load rk0 ld1 {v19.4s}, [r8], #16 @ load rk1 ld1 {v20.4s}, [r8], #16 @ load rk2 lsr r12, r11, #32 ld1 {v21.4s}, [r8], #16 @ load rk3 orr r11, r11, r11 ld1 {v22.4s}, [r8], #16 @ load rk4 rev r12, r12 @ rev_ctr32 add r12, r12, #1 @ increment rev_ctr32 fmov d3, r10 @ CTR block 3 rev r9, r12 @ CTR block 1 add r12, r12, #1 @ CTR block 1 fmov d1, r10 @ CTR block 1 orr r9, r11, r9, lsl #32 @ CTR block 1 ld1 { q0}, [r16] @ special case vector load initial counter so we can start first AES block as quickly as possible fmov v1.d[1], r9 @ CTR block 1 rev r9, r12 @ CTR block 2 add r12, r12, #1 @ CTR block 2 fmov d2, r10 @ CTR block 2 orr r9, r11, r9, lsl #32 @ CTR block 2 fmov v2.d[1], r9 @ CTR block 2 rev r9, r12 @ CTR block 3 orr r9, r11, r9, lsl #32 @ CTR block 3 ld1 {v23.4s}, [r8], #16 @ load rk5 fmov v3.d[1], r9 @ CTR block 3 ld1 {v24.4s}, [r8], #16 @ load rk6 ld1 {v25.4s}, [r8], #16 @ load rk7 aese q0, v18.16b aesmc q0, q0 @ AES block 0 - round 0 ld1 { v11.16b}, [r3] ext v11.16b, v11.16b, v11.16b, #8 rev64 v11.16b, v11.16b aese q3, v18.16b aesmc q3, q3 @ AES block 3 - round 0 ld1 {v26.4s}, [r8], #16 @ load rk8 aese q1, v18.16b aesmc q1, q1 @ AES block 1 - round 0 ldr q15, [r3, #112] @ load h4l | h4h #ifndef __ARMEB__ ext v15.16b, v15.16b, v15.16b, #8 #endif aese q2, v18.16b aesmc q2, q2 @ AES block 2 - round 0 ld1 {v27.4s}, [r8], #16 @ load rk9 aese q0, v19.16b aesmc q0, q0 @ AES block 0 - round 1 ld1 {v28.4s}, [r8], #16 @ load rk10 aese q1, v19.16b aesmc q1, q1 @ AES block 1 - round 1 ldr q12, [r3, #32] @ load h1l | h1h #ifndef __ARMEB__ ext v12.16b, v12.16b, v12.16b, #8 #endif aese q2, v19.16b aesmc q2, q2 @ AES block 2 - round 1 ld1 {v29.4s}, [r8], #16 @ load rk11 aese q3, v19.16b aesmc q3, q3 @ AES block 3 - round 1 ldr q14, [r3, #80] @ load h3l | h3h #ifndef __ARMEB__ ext v14.16b, v14.16b, v14.16b, #8 #endif aese q0, v20.16b aesmc q0, q0 @ AES block 0 - round 2 aese q2, v20.16b aesmc q2, q2 @ AES block 2 - round 2 aese q3, v20.16b aesmc q3, q3 @ AES block 3 - round 2 aese q0, v21.16b aesmc q0, q0 @ AES block 0 - round 3 trn1 q9, v14.2d, v15.2d @ h4h | h3h aese q2, v21.16b aesmc q2, q2 @ AES block 2 - round 3 aese q1, v20.16b aesmc q1, q1 @ AES block 1 - round 2 trn2 v17.2d, v14.2d, v15.2d @ h4l | h3l aese q0, v22.16b aesmc q0, q0 @ AES block 0 - round 4 aese q3, v21.16b aesmc q3, q3 @ AES block 3 - round 3 aese q1, v21.16b aesmc q1, q1 @ AES block 1 - round 3 aese q0, v23.16b aesmc q0, q0 @ AES block 0 - round 5 aese q2, v22.16b aesmc q2, q2 @ AES block 2 - round 4 aese q1, v22.16b aesmc q1, q1 @ AES block 1 - round 4 aese q0, v24.16b aesmc q0, q0 @ AES block 0 - round 6 aese q3, v22.16b aesmc q3, q3 @ AES block 3 - round 4 aese q2, v23.16b aesmc q2, q2 @ AES block 2 - round 5 aese q1, v23.16b aesmc q1, q1 @ AES block 1 - round 5 aese q3, v23.16b aesmc q3, q3 @ AES block 3 - round 5 aese q2, v24.16b aesmc q2, q2 @ AES block 2 - round 6 ldr q13, [r3, #64] @ load h2l | h2h #ifndef __ARMEB__ ext v13.16b, v13.16b, v13.16b, #8 #endif aese q1, v24.16b aesmc q1, q1 @ AES block 1 - round 6 aese q3, v24.16b aesmc q3, q3 @ AES block 3 - round 6 aese q0, v25.16b aesmc q0, q0 @ AES block 0 - round 7 aese q1, v25.16b aesmc q1, q1 
@ AES block 1 - round 7 trn2 v16.2d, v12.2d, v13.2d @ h2l | h1l aese q3, v25.16b aesmc q3, q3 @ AES block 3 - round 7 aese q0, v26.16b aesmc q0, q0 @ AES block 0 - round 8 aese q2, v25.16b aesmc q2, q2 @ AES block 2 - round 7 trn1 q8, v12.2d, v13.2d @ h2h | h1h aese q1, v26.16b aesmc q1, q1 @ AES block 1 - round 8 aese q3, v26.16b aesmc q3, q3 @ AES block 3 - round 8 aese q2, v26.16b aesmc q2, q2 @ AES block 2 - round 8 aese q0, v27.16b aesmc q0, q0 @ AES block 0 - round 9 aese q3, v27.16b aesmc q3, q3 @ AES block 3 - round 9 aese q2, v27.16b aesmc q2, q2 @ AES block 2 - round 9 aese q1, v27.16b aesmc q1, q1 @ AES block 1 - round 9 aese q0, v28.16b aesmc q0, q0 @ AES block 0 - round 10 aese q2, v28.16b aesmc q2, q2 @ AES block 2 - round 10 aese q1, v28.16b aesmc q1, q1 @ AES block 1 - round 10 lsr r5, r1, #3 @ byte_len mov r15, r5 aese q3, v28.16b aesmc q3, q3 @ AES block 3 - round 10 sub r5, r5, #1 @ byte_len - 1 eor v16.16b, v16.16b, q8 @ h2k | h1k and r5, r5, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail) eor v17.16b, v17.16b, q9 @ h4k | h3k aese q2, v29.16b @ AES block 2 - round 11 add r4, r0, r1, lsr #3 @ end_input_ptr add r5, r5, r0 aese q1, v29.16b @ AES block 1 - round 11 cmp r0, r5 @ check if we have <= 4 blocks aese q0, v29.16b @ AES block 0 - round 11 add r12, r12, #1 @ CTR block 3 aese q3, v29.16b @ AES block 3 - round 11 bge .L192_enc_tail @ handle tail rev r9, r12 @ CTR block 4 ldp r6, r7, [r0, #0] @ AES block 0 - load plaintext #ifdef __ARMEB__ rev r6, r6 rev r7, r7 #endif orr r9, r11, r9, lsl #32 @ CTR block 4 ldp r21, r22, [r0, #32] @ AES block 2 - load plaintext #ifdef __ARMEB__ rev r21, r21 rev r22, r22 #endif ldp r23, r24, [r0, #48] @ AES block 3 - load plaintext #ifdef __ARMEB__ rev r23, r23 rev r24, r24 #endif ldp r19, r20, [r0, #16] @ AES block 1 - load plaintext #ifdef __ARMEB__ rev r19, r19 rev r20, r20 #endif add r0, r0, #64 @ AES input_ptr update cmp r0, r5 @ check if we have <= 8 blocks eor r6, r6, r13 @ AES block 0 - round 12 low eor r7, r7, r14 @ AES block 0 - round 12 high eor r22, r22, r14 @ AES block 2 - round 12 high fmov d4, r6 @ AES block 0 - mov low eor r24, r24, r14 @ AES block 3 - round 12 high fmov v4.d[1], r7 @ AES block 0 - mov high eor r21, r21, r13 @ AES block 2 - round 12 low eor r19, r19, r13 @ AES block 1 - round 12 low fmov d5, r19 @ AES block 1 - mov low eor r20, r20, r14 @ AES block 1 - round 12 high fmov v5.d[1], r20 @ AES block 1 - mov high eor r23, r23, r13 @ AES block 3 - round 12 low fmov d6, r21 @ AES block 2 - mov low add r12, r12, #1 @ CTR block 4 eor q4, q4, q0 @ AES block 0 - result fmov d0, r10 @ CTR block 4 fmov v0.d[1], r9 @ CTR block 4 rev r9, r12 @ CTR block 5 orr r9, r11, r9, lsl #32 @ CTR block 5 add r12, r12, #1 @ CTR block 5 fmov d7, r23 @ AES block 3 - mov low st1 { q4}, [r2], #16 @ AES block 0 - store result fmov v6.d[1], r22 @ AES block 2 - mov high eor q5, q5, q1 @ AES block 1 - result fmov d1, r10 @ CTR block 5 st1 { q5}, [r2], #16 @ AES block 1 - store result fmov v7.d[1], r24 @ AES block 3 - mov high fmov v1.d[1], r9 @ CTR block 5 rev r9, r12 @ CTR block 6 orr r9, r11, r9, lsl #32 @ CTR block 6 add r12, r12, #1 @ CTR block 6 eor q6, q6, q2 @ AES block 2 - result fmov d2, r10 @ CTR block 6 fmov v2.d[1], r9 @ CTR block 6 rev r9, r12 @ CTR block 7 orr r9, r11, r9, lsl #32 @ CTR block 7 st1 { q6}, [r2], #16 @ AES block 2 - store result eor q7, q7, q3 @ AES block 3 - result st1 { q7}, [r2], #16 @ AES block 3 - store result bge .L192_enc_prepretail @ do 
prepretail .L192_enc_main_loop:@ main loop start aese q2, v18.16b aesmc q2, q2 @ AES block 4k+6 - round 0 rev64 q5, q5 @ GHASH block 4k+1 (t0 and t1 free) aese q1, v18.16b aesmc q1, q1 @ AES block 4k+5 - round 0 ldp r19, r20, [r0, #16] @ AES block 4k+5 - load plaintext #ifdef __ARMEB__ rev r19, r19 rev r20, r20 #endif ext v11.16b, v11.16b, v11.16b, #8 @ PRE 0 fmov d3, r10 @ CTR block 4k+3 rev64 q4, q4 @ GHASH block 4k (only t0 is free) aese q2, v19.16b aesmc q2, q2 @ AES block 4k+6 - round 1 fmov v3.d[1], r9 @ CTR block 4k+3 pmull2 v30.1q, q5, v14.2d @ GHASH block 4k+1 - high rev64 q7, q7 @ GHASH block 4k+3 (t0, t1, t2 and t3 free) ldp r21, r22, [r0, #32] @ AES block 4k+6 - load plaintext #ifdef __ARMEB__ rev r21, r21 rev r22, r22 #endif aese q0, v18.16b aesmc q0, q0 @ AES block 4k+4 - round 0 ldp r23, r24, [r0, #48] @ AES block 4k+3 - load plaintext #ifdef __ARMEB__ rev r23, r23 rev r24, r24 #endif pmull v31.1q, q5, v14.1d @ GHASH block 4k+1 - low eor q4, q4, v11.16b @ PRE 1 aese q1, v19.16b aesmc q1, q1 @ AES block 4k+5 - round 1 aese q0, v19.16b aesmc q0, q0 @ AES block 4k+4 - round 1 rev64 q6, q6 @ GHASH block 4k+2 (t0, t1, and t2 free) aese q3, v18.16b aesmc q3, q3 @ AES block 4k+7 - round 0 eor r24, r24, r14 @ AES block 4k+3 - round 12 high pmull v11.1q, q4, v15.1d @ GHASH block 4k - low mov d8, v4.d[1] @ GHASH block 4k - mid aese q0, v20.16b aesmc q0, q0 @ AES block 4k+4 - round 2 aese q3, v19.16b aesmc q3, q3 @ AES block 4k+7 - round 1 eor r21, r21, r13 @ AES block 4k+6 - round 12 low eor q8, q8, q4 @ GHASH block 4k - mid eor v11.16b, v11.16b, v31.16b @ GHASH block 4k+1 - low aese q0, v21.16b aesmc q0, q0 @ AES block 4k+4 - round 3 eor r19, r19, r13 @ AES block 4k+5 - round 12 low aese q1, v20.16b aesmc q1, q1 @ AES block 4k+5 - round 2 mov d31, v6.d[1] @ GHASH block 4k+2 - mid pmull2 v9.1q, q4, v15.2d @ GHASH block 4k - high mov d4, v5.d[1] @ GHASH block 4k+1 - mid aese q2, v20.16b aesmc q2, q2 @ AES block 4k+6 - round 2 aese q1, v21.16b aesmc q1, q1 @ AES block 4k+5 - round 3 mov d10, v17.d[1] @ GHASH block 4k - mid eor q9, q9, v30.16b @ GHASH block 4k+1 - high aese q3, v20.16b aesmc q3, q3 @ AES block 4k+7 - round 2 eor v31.8b, v31.8b, q6 @ GHASH block 4k+2 - mid pmull2 v30.1q, q6, v13.2d @ GHASH block 4k+2 - high aese q0, v22.16b aesmc q0, q0 @ AES block 4k+4 - round 4 eor q4, q4, q5 @ GHASH block 4k+1 - mid aese q3, v21.16b aesmc q3, q3 @ AES block 4k+7 - round 3 pmull2 v5.1q, q7, v12.2d @ GHASH block 4k+3 - high eor r20, r20, r14 @ AES block 4k+5 - round 12 high ins v31.d[1], v31.d[0] @ GHASH block 4k+2 - mid aese q0, v23.16b aesmc q0, q0 @ AES block 4k+4 - round 5 add r12, r12, #1 @ CTR block 4k+3 aese q3, v22.16b aesmc q3, q3 @ AES block 4k+7 - round 4 eor q9, q9, v30.16b @ GHASH block 4k+2 - high pmull v4.1q, q4, v17.1d @ GHASH block 4k+1 - mid eor r22, r22, r14 @ AES block 4k+6 - round 12 high pmull2 v31.1q, v31.2d, v16.2d @ GHASH block 4k+2 - mid eor r23, r23, r13 @ AES block 4k+3 - round 12 low mov d30, v7.d[1] @ GHASH block 4k+3 - mid pmull v10.1q, q8, v10.1d @ GHASH block 4k - mid rev r9, r12 @ CTR block 4k+8 pmull v8.1q, q6, v13.1d @ GHASH block 4k+2 - low orr r9, r11, r9, lsl #32 @ CTR block 4k+8 aese q2, v21.16b aesmc q2, q2 @ AES block 4k+6 - round 3 eor v30.8b, v30.8b, q7 @ GHASH block 4k+3 - mid aese q1, v22.16b aesmc q1, q1 @ AES block 4k+5 - round 4 ldp r6, r7, [r0, #0] @ AES block 4k+4 - load plaintext #ifdef __ARMEB__ rev r6, r6 rev r7, r7 #endif aese q0, v24.16b aesmc q0, q0 @ AES block 4k+4 - round 6 eor v11.16b, v11.16b, q8 @ GHASH block 4k+2 - low aese 
q2, v22.16b aesmc q2, q2 @ AES block 4k+6 - round 4 add r0, r0, #64 @ AES input_ptr update aese q1, v23.16b aesmc q1, q1 @ AES block 4k+5 - round 5 movi q8, #0xc2 pmull v6.1q, q7, v12.1d @ GHASH block 4k+3 - low eor r7, r7, r14 @ AES block 4k+4 - round 12 high eor v10.16b, v10.16b, q4 @ GHASH block 4k+1 - mid aese q2, v23.16b aesmc q2, q2 @ AES block 4k+6 - round 5 eor r6, r6, r13 @ AES block 4k+4 - round 12 low aese q1, v24.16b aesmc q1, q1 @ AES block 4k+5 - round 6 shl d8, d8, #56 @ mod_constant aese q3, v23.16b aesmc q3, q3 @ AES block 4k+7 - round 5 eor q9, q9, q5 @ GHASH block 4k+3 - high aese q0, v25.16b aesmc q0, q0 @ AES block 4k+4 - round 7 fmov d5, r19 @ AES block 4k+5 - mov low aese q1, v25.16b aesmc q1, q1 @ AES block 4k+5 - round 7 eor v10.16b, v10.16b, v31.16b @ GHASH block 4k+2 - mid aese q3, v24.16b aesmc q3, q3 @ AES block 4k+7 - round 6 fmov v5.d[1], r20 @ AES block 4k+5 - mov high aese q0, v26.16b aesmc q0, q0 @ AES block 4k+4 - round 8 eor v11.16b, v11.16b, q6 @ GHASH block 4k+3 - low pmull v30.1q, v30.1d, v16.1d @ GHASH block 4k+3 - mid cmp r0, r5 @ .LOOP CONTROL fmov d4, r6 @ AES block 4k+4 - mov low aese q2, v24.16b aesmc q2, q2 @ AES block 4k+6 - round 6 fmov v4.d[1], r7 @ AES block 4k+4 - mov high aese q1, v26.16b aesmc q1, q1 @ AES block 4k+5 - round 8 fmov d7, r23 @ AES block 4k+3 - mov low eor v10.16b, v10.16b, v30.16b @ GHASH block 4k+3 - mid eor v30.16b, v11.16b, q9 @ MODULO - karatsuba tidy up add r12, r12, #1 @ CTR block 4k+8 aese q2, v25.16b aesmc q2, q2 @ AES block 4k+6 - round 7 fmov v7.d[1], r24 @ AES block 4k+3 - mov high pmull v31.1q, q9, q8 @ MODULO - top 64b align with mid ext q9, q9, q9, #8 @ MODULO - other top alignment fmov d6, r21 @ AES block 4k+6 - mov low aese q3, v25.16b aesmc q3, q3 @ AES block 4k+7 - round 7 aese q0, v27.16b aesmc q0, q0 @ AES block 4k+4 - round 9 eor v10.16b, v10.16b, v30.16b @ MODULO - karatsuba tidy up aese q2, v26.16b aesmc q2, q2 @ AES block 4k+6 - round 8 aese q3, v26.16b aesmc q3, q3 @ AES block 4k+7 - round 8 aese q1, v27.16b aesmc q1, q1 @ AES block 4k+5 - round 9 aese q0, v28.16b aesmc q0, q0 @ AES block 4k+4 - round 10 eor v10.16b, v10.16b, v31.16b @ MODULO - fold into mid aese q3, v27.16b aesmc q3, q3 @ AES block 4k+7 - round 9 aese q2, v27.16b aesmc q2, q2 @ AES block 4k+6 - round 9 aese q0, v29.16b @ AES block 4k+4 - round 11 aese q1, v28.16b aesmc q1, q1 @ AES block 4k+5 - round 10 eor v10.16b, v10.16b, q9 @ MODULO - fold into mid aese q2, v28.16b aesmc q2, q2 @ AES block 4k+6 - round 10 eor q4, q4, q0 @ AES block 4k+4 - result fmov d0, r10 @ CTR block 4k+8 aese q1, v29.16b @ AES block 4k+5 - round 11 fmov v0.d[1], r9 @ CTR block 4k+8 rev r9, r12 @ CTR block 4k+9 pmull v9.1q, v10.1d, q8 @ MODULO - mid 64b align with low fmov v6.d[1], r22 @ AES block 4k+6 - mov high st1 { q4}, [r2], #16 @ AES block 4k+4 - store result aese q3, v28.16b aesmc q3, q3 @ AES block 4k+7 - round 10 orr r9, r11, r9, lsl #32 @ CTR block 4k+9 eor q5, q5, q1 @ AES block 4k+5 - result add r12, r12, #1 @ CTR block 4k+9 fmov d1, r10 @ CTR block 4k+9 aese q2, v29.16b @ AES block 4k+6 - round 11 fmov v1.d[1], r9 @ CTR block 4k+9 rev r9, r12 @ CTR block 4k+10 add r12, r12, #1 @ CTR block 4k+10 ext v10.16b, v10.16b, v10.16b, #8 @ MODULO - other mid alignment orr r9, r11, r9, lsl #32 @ CTR block 4k+10 st1 { q5}, [r2], #16 @ AES block 4k+5 - store result eor v11.16b, v11.16b, q9 @ MODULO - fold into low aese q3, v29.16b @ AES block 4k+7 - round 11 eor q6, q6, q2 @ AES block 4k+6 - result fmov d2, r10 @ CTR block 4k+10 st1 { q6}, [r2], #16 @ AES 
block 4k+6 - store result fmov v2.d[1], r9 @ CTR block 4k+10 rev r9, r12 @ CTR block 4k+11 eor v11.16b, v11.16b, v10.16b @ MODULO - fold into low orr r9, r11, r9, lsl #32 @ CTR block 4k+11 eor q7, q7, q3 @ AES block 4k+3 - result st1 { q7}, [r2], #16 @ AES block 4k+3 - store result blt .L192_enc_main_loop .L192_enc_prepretail:@ PREPRETAIL aese q0, v18.16b aesmc q0, q0 @ AES block 4k+4 - round 0 rev64 q4, q4 @ GHASH block 4k (only t0 is free) fmov d3, r10 @ CTR block 4k+3 ext v11.16b, v11.16b, v11.16b, #8 @ PRE 0 add r12, r12, #1 @ CTR block 4k+3 aese q1, v18.16b aesmc q1, q1 @ AES block 4k+5 - round 0 rev64 q5, q5 @ GHASH block 4k+1 (t0 and t1 free) aese q2, v18.16b aesmc q2, q2 @ AES block 4k+6 - round 0 fmov v3.d[1], r9 @ CTR block 4k+3 eor q4, q4, v11.16b @ PRE 1 mov d10, v17.d[1] @ GHASH block 4k - mid aese q1, v19.16b aesmc q1, q1 @ AES block 4k+5 - round 1 rev64 q6, q6 @ GHASH block 4k+2 (t0, t1, and t2 free) pmull2 v30.1q, q5, v14.2d @ GHASH block 4k+1 - high pmull v11.1q, q4, v15.1d @ GHASH block 4k - low mov d8, v4.d[1] @ GHASH block 4k - mid pmull v31.1q, q5, v14.1d @ GHASH block 4k+1 - low rev64 q7, q7 @ GHASH block 4k+3 (t0, t1, t2 and t3 free) pmull2 v9.1q, q4, v15.2d @ GHASH block 4k - high eor q8, q8, q4 @ GHASH block 4k - mid mov d4, v5.d[1] @ GHASH block 4k+1 - mid eor v11.16b, v11.16b, v31.16b @ GHASH block 4k+1 - low mov d31, v6.d[1] @ GHASH block 4k+2 - mid aese q3, v18.16b aesmc q3, q3 @ AES block 4k+7 - round 0 eor q9, q9, v30.16b @ GHASH block 4k+1 - high pmull2 v30.1q, q6, v13.2d @ GHASH block 4k+2 - high eor q4, q4, q5 @ GHASH block 4k+1 - mid eor v31.8b, v31.8b, q6 @ GHASH block 4k+2 - mid aese q3, v19.16b aesmc q3, q3 @ AES block 4k+7 - round 1 aese q2, v19.16b aesmc q2, q2 @ AES block 4k+6 - round 1 eor q9, q9, v30.16b @ GHASH block 4k+2 - high aese q0, v19.16b aesmc q0, q0 @ AES block 4k+4 - round 1 aese q1, v20.16b aesmc q1, q1 @ AES block 4k+5 - round 2 mov d30, v7.d[1] @ GHASH block 4k+3 - mid pmull2 v5.1q, q7, v12.2d @ GHASH block 4k+3 - high ins v31.d[1], v31.d[0] @ GHASH block 4k+2 - mid aese q0, v20.16b aesmc q0, q0 @ AES block 4k+4 - round 2 pmull v10.1q, q8, v10.1d @ GHASH block 4k - mid eor v30.8b, v30.8b, q7 @ GHASH block 4k+3 - mid aese q1, v21.16b aesmc q1, q1 @ AES block 4k+5 - round 3 pmull2 v31.1q, v31.2d, v16.2d @ GHASH block 4k+2 - mid pmull v4.1q, q4, v17.1d @ GHASH block 4k+1 - mid pmull v30.1q, v30.1d, v16.1d @ GHASH block 4k+3 - mid eor q9, q9, q5 @ GHASH block 4k+3 - high pmull v8.1q, q6, v13.1d @ GHASH block 4k+2 - low aese q0, v21.16b aesmc q0, q0 @ AES block 4k+4 - round 3 eor v10.16b, v10.16b, q4 @ GHASH block 4k+1 - mid aese q3, v20.16b aesmc q3, q3 @ AES block 4k+7 - round 2 aese q2, v20.16b aesmc q2, q2 @ AES block 4k+6 - round 2 eor v11.16b, v11.16b, q8 @ GHASH block 4k+2 - low aese q0, v22.16b aesmc q0, q0 @ AES block 4k+4 - round 4 aese q3, v21.16b aesmc q3, q3 @ AES block 4k+7 - round 3 eor v10.16b, v10.16b, v31.16b @ GHASH block 4k+2 - mid aese q2, v21.16b aesmc q2, q2 @ AES block 4k+6 - round 3 pmull v6.1q, q7, v12.1d @ GHASH block 4k+3 - low movi q8, #0xc2 aese q3, v22.16b aesmc q3, q3 @ AES block 4k+7 - round 4 aese q2, v22.16b aesmc q2, q2 @ AES block 4k+6 - round 4 aese q1, v22.16b aesmc q1, q1 @ AES block 4k+5 - round 4 eor v10.16b, v10.16b, v30.16b @ GHASH block 4k+3 - mid aese q3, v23.16b aesmc q3, q3 @ AES block 4k+7 - round 5 aese q2, v23.16b aesmc q2, q2 @ AES block 4k+6 - round 5 aese q1, v23.16b aesmc q1, q1 @ AES block 4k+5 - round 5 eor v11.16b, v11.16b, q6 @ GHASH block 4k+3 - low aese q0, v23.16b aesmc q0, q0 
@ AES block 4k+4 - round 5 aese q3, v24.16b aesmc q3, q3 @ AES block 4k+7 - round 6 eor v10.16b, v10.16b, q9 @ karatsuba tidy up aese q1, v24.16b aesmc q1, q1 @ AES block 4k+5 - round 6 aese q0, v24.16b aesmc q0, q0 @ AES block 4k+4 - round 6 shl d8, d8, #56 @ mod_constant aese q3, v25.16b aesmc q3, q3 @ AES block 4k+7 - round 7 aese q1, v25.16b aesmc q1, q1 @ AES block 4k+5 - round 7 eor v10.16b, v10.16b, v11.16b aese q0, v25.16b aesmc q0, q0 @ AES block 4k+4 - round 7 pmull v30.1q, q9, q8 aese q2, v24.16b aesmc q2, q2 @ AES block 4k+6 - round 6 ext q9, q9, q9, #8 aese q0, v26.16b aesmc q0, q0 @ AES block 4k+4 - round 8 aese q1, v26.16b aesmc q1, q1 @ AES block 4k+5 - round 8 eor v10.16b, v10.16b, v30.16b aese q2, v25.16b aesmc q2, q2 @ AES block 4k+6 - round 7 aese q3, v26.16b aesmc q3, q3 @ AES block 4k+7 - round 8 aese q0, v27.16b aesmc q0, q0 @ AES block 4k+4 - round 9 aese q2, v26.16b aesmc q2, q2 @ AES block 4k+6 - round 8 eor v10.16b, v10.16b, q9 aese q3, v27.16b aesmc q3, q3 @ AES block 4k+7 - round 9 aese q1, v27.16b aesmc q1, q1 @ AES block 4k+5 - round 9 aese q2, v27.16b aesmc q2, q2 @ AES block 4k+6 - round 9 pmull v30.1q, v10.1d, q8 ext v10.16b, v10.16b, v10.16b, #8 aese q3, v28.16b aesmc q3, q3 @ AES block 4k+7 - round 10 aese q0, v28.16b aesmc q0, q0 @ AES block 4k+4 - round 10 aese q2, v28.16b aesmc q2, q2 @ AES block 4k+6 - round 10 aese q1, v28.16b aesmc q1, q1 @ AES block 4k+5 - round 10 eor v11.16b, v11.16b, v30.16b aese q0, v29.16b @ AES block 4k+4 - round 11 aese q3, v29.16b @ AES block 4k+7 - round 11 aese q2, v29.16b @ AES block 4k+6 - round 11 aese q1, v29.16b @ AES block 4k+5 - round 11 eor v11.16b, v11.16b, v10.16b .L192_enc_tail:@ TAIL sub r5, r4, r0 @ main_end_input_ptr is number of bytes left to process ldp r6, r7, [r0], #16 @ AES block 4k+4 - load plaintext #ifdef __ARMEB__ rev r6, r6 rev r7, r7 #endif eor r6, r6, r13 @ AES block 4k+4 - round 12 low eor r7, r7, r14 @ AES block 4k+4 - round 12 high fmov d4, r6 @ AES block 4k+4 - mov low fmov v4.d[1], r7 @ AES block 4k+4 - mov high cmp r5, #48 eor q5, q4, q0 @ AES block 4k+4 - result ext q8, v11.16b, v11.16b, #8 @ prepare final partial tag bgt .L192_enc_blocks_more_than_3 sub r12, r12, #1 movi v10.8b, #0 mov q3, q2 movi q9, #0 cmp r5, #32 mov q2, q1 movi v11.8b, #0 bgt .L192_enc_blocks_more_than_2 sub r12, r12, #1 mov q3, q1 cmp r5, #16 bgt .L192_enc_blocks_more_than_1 sub r12, r12, #1 b .L192_enc_blocks_less_than_1 .L192_enc_blocks_more_than_3:@ blocks left > 3 st1 { q5}, [r2], #16 @ AES final-3 block - store result ldp r6, r7, [r0], #16 @ AES final-2 block - load input low & high #ifdef __ARMEB__ rev r6, r6 rev r7, r7 #endif rev64 q4, q5 @ GHASH final-3 block eor r6, r6, r13 @ AES final-2 block - round 12 low eor q4, q4, q8 @ feed in partial tag eor r7, r7, r14 @ AES final-2 block - round 12 high fmov d5, r6 @ AES final-2 block - mov low fmov v5.d[1], r7 @ AES final-2 block - mov high mov d22, v4.d[1] @ GHASH final-3 block - mid pmull v11.1q, q4, v15.1d @ GHASH final-3 block - low mov d10, v17.d[1] @ GHASH final-3 block - mid eor v22.8b, v22.8b, q4 @ GHASH final-3 block - mid movi q8, #0 @ suppress further partial tag feed in pmull2 v9.1q, q4, v15.2d @ GHASH final-3 block - high pmull v10.1q, v22.1d, v10.1d @ GHASH final-3 block - mid eor q5, q5, q1 @ AES final-2 block - result .L192_enc_blocks_more_than_2:@ blocks left > 2 st1 { q5}, [r2], #16 @ AES final-2 block - store result rev64 q4, q5 @ GHASH final-2 block ldp r6, r7, [r0], #16 @ AES final-1 block - load input low & high #ifdef __ARMEB__ rev r6, r6 
rev r7, r7 #endif eor q4, q4, q8 @ feed in partial tag eor r7, r7, r14 @ AES final-1 block - round 12 high pmull2 v20.1q, q4, v14.2d @ GHASH final-2 block - high mov d22, v4.d[1] @ GHASH final-2 block - mid pmull v21.1q, q4, v14.1d @ GHASH final-2 block - low eor r6, r6, r13 @ AES final-1 block - round 12 low fmov d5, r6 @ AES final-1 block - mov low fmov v5.d[1], r7 @ AES final-1 block - mov high eor q9, q9, v20.16b @ GHASH final-2 block - high eor v22.8b, v22.8b, q4 @ GHASH final-2 block - mid eor v11.16b, v11.16b, v21.16b @ GHASH final-2 block - low pmull v22.1q, v22.1d, v17.1d @ GHASH final-2 block - mid movi q8, #0 @ suppress further partial tag feed in eor q5, q5, q2 @ AES final-1 block - result eor v10.16b, v10.16b, v22.16b @ GHASH final-2 block - mid .L192_enc_blocks_more_than_1:@ blocks left > 1 st1 { q5}, [r2], #16 @ AES final-1 block - store result ldp r6, r7, [r0], #16 @ AES final block - load input low & high #ifdef __ARMEB__ rev r6, r6 rev r7, r7 #endif rev64 q4, q5 @ GHASH final-1 block eor r6, r6, r13 @ AES final block - round 12 low eor q4, q4, q8 @ feed in partial tag movi q8, #0 @ suppress further partial tag feed in mov d22, v4.d[1] @ GHASH final-1 block - mid eor v22.8b, v22.8b, q4 @ GHASH final-1 block - mid eor r7, r7, r14 @ AES final block - round 12 high fmov d5, r6 @ AES final block - mov low pmull2 v20.1q, q4, v13.2d @ GHASH final-1 block - high fmov v5.d[1], r7 @ AES final block - mov high ins v22.d[1], v22.d[0] @ GHASH final-1 block - mid eor q9, q9, v20.16b @ GHASH final-1 block - high pmull v21.1q, q4, v13.1d @ GHASH final-1 block - low pmull2 v22.1q, v22.2d, v16.2d @ GHASH final-1 block - mid eor q5, q5, q3 @ AES final block - result eor v11.16b, v11.16b, v21.16b @ GHASH final-1 block - low eor v10.16b, v10.16b, v22.16b @ GHASH final-1 block - mid .L192_enc_blocks_less_than_1:@ blocks left <= 1 ld1 { v18.16b}, [r2] @ load existing bytes where the possibly partial last block is to be stored #ifndef __ARMEB__ rev r9, r12 #else mov r9, r12 #endif and r1, r1, #127 @ bit_length %= 128 sub r1, r1, #128 @ bit_length -= 128 mvn r14, xzr @ rk12_h = 0xffffffffffffffff neg r1, r1 @ bit_length = 128 - #bits in input (in range [1,128]) mvn r13, xzr @ rk12_l = 0xffffffffffffffff and r1, r1, #127 @ bit_length %= 128 lsr r14, r14, r1 @ rk12_h is mask for top 64b of last block cmp r1, #64 csel r6, r13, r14, lt csel r7, r14, xzr, lt fmov d0, r6 @ ctr0b is mask for last block fmov v0.d[1], r7 and q5, q5, q0 @ possibly partial last block has zeroes in highest bits rev64 q4, q5 @ GHASH final block eor q4, q4, q8 @ feed in partial tag mov d8, v4.d[1] @ GHASH final block - mid pmull v21.1q, q4, v12.1d @ GHASH final block - low pmull2 v20.1q, q4, v12.2d @ GHASH final block - high eor q8, q8, q4 @ GHASH final block - mid eor v11.16b, v11.16b, v21.16b @ GHASH final block - low eor q9, q9, v20.16b @ GHASH final block - high pmull v8.1q, q8, v16.1d @ GHASH final block - mid eor v10.16b, v10.16b, q8 @ GHASH final block - mid movi q8, #0xc2 eor v30.16b, v11.16b, q9 @ MODULO - karatsuba tidy up shl d8, d8, #56 @ mod_constant bif q5, v18.16b, q0 @ insert existing bytes in top end of result before storing eor v10.16b, v10.16b, v30.16b @ MODULO - karatsuba tidy up pmull v31.1q, q9, q8 @ MODULO - top 64b align with mid ext q9, q9, q9, #8 @ MODULO - other top alignment eor v10.16b, v10.16b, v31.16b @ MODULO - fold into mid eor v10.16b, v10.16b, q9 @ MODULO - fold into mid pmull v9.1q, v10.1d, q8 @ MODULO - mid 64b align with low ext v10.16b, v10.16b, v10.16b, #8 @ MODULO - other mid alignment 
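@ Finish the GHASH modular reduction for the encrypt tail: the aligned mid and high
@ terms are folded into the low half below, the result is byte-reversed back into
@ tag order, and the updated tag (at [r3]) and counter (at [r16, #12]) are stored
@ before the callee-saved registers are restored.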
eor v11.16b, v11.16b, q9 @ MODULO - fold into low
str r9, [r16, #12] @ store the updated counter
st1 { q5}, [r2] @ store all 16B
eor v11.16b, v11.16b, v10.16b @ MODULO - fold into low
ext v11.16b, v11.16b, v11.16b, #8
rev64 v11.16b, v11.16b
mov r0, r15
st1 { v11.16b }, [r3]
ldp r21, r22, [sp, #16]
ldp r23, r24, [sp, #32]
ldp d8, d9, [sp, #48]
ldp d10, d11, [sp, #64]
ldp d12, d13, [sp, #80]
ldp d14, d15, [sp, #96]
ldp r19, r20, [sp], #112
RET
.L192_enc_ret:
mov r0, #0x0
RET
.size aes_gcm_enc_192_kernel,.-aes_gcm_enc_192_kernel
.globl aes_gcm_dec_192_kernel
.type aes_gcm_dec_192_kernel,%function
.align 4
aes_gcm_dec_192_kernel:
cbz r1, .L192_dec_ret
stp r19, r20, [sp, #-112]!
mov r16, r4
mov r8, r5
stp r21, r22, [sp, #16]
stp r23, r24, [sp, #32]
stp d8, d9, [sp, #48]
stp d10, d11, [sp, #64]
stp d12, d13, [sp, #80]
stp d14, d15, [sp, #96]
add r4, r0, r1, lsr #3 @ end_input_ptr
ldp r10, r11, [r16] @ ctr96_b64, ctr96_t32
#ifdef __ARMEB__
rev r10, r10
rev r11, r11
#endif
ldp r13, r14, [r8, #192] @ load rk12
#ifdef __ARMEB__
ror r13, r13, #32
ror r14, r14, #32
#endif
ld1 { q0}, [r16] @ special case vector load initial counter so we can start first AES block as quickly as possible
ld1 {v18.4s}, [r8], #16 @ load rk0
lsr r5, r1, #3 @ byte_len
mov r15, r5
ld1 {v19.4s}, [r8], #16 @ load rk1
lsr r12, r11, #32
orr r11, r11, r11
fmov d3, r10 @ CTR block 3
rev r12, r12 @ rev_ctr32
fmov d1, r10 @ CTR block 1
add r12, r12, #1 @ increment rev_ctr32
ld1 {v20.4s}, [r8], #16 @ load rk2
aese q0, v18.16b
aesmc q0, q0 @ AES block 0 - round 0
rev r9, r12 @ CTR block 1
add r12, r12, #1 @ CTR block 1
orr r9, r11, r9, lsl #32 @ CTR block 1
ld1 {v21.4s}, [r8], #16 @ load rk3
fmov v1.d[1], r9 @ CTR block 1
rev r9, r12 @ CTR block 2
add r12, r12, #1 @ CTR block 2
fmov d2, r10 @ CTR block 2
orr r9, r11, r9, lsl #32 @ CTR block 2
fmov v2.d[1], r9 @ CTR block 2
rev r9, r12 @ CTR block 3
aese q0, v19.16b
aesmc q0, q0 @ AES block 0 - round 1
orr r9, r11, r9, lsl #32 @ CTR block 3
fmov v3.d[1], r9 @ CTR block 3
ld1 {v22.4s}, [r8], #16 @ load rk4
aese q0, v20.16b
aesmc q0, q0 @ AES block 0 - round 2
aese q2, v18.16b
aesmc q2, q2 @ AES block 2 - round 0
ld1 {v23.4s}, [r8], #16 @ load rk5
aese q1, v18.16b
aesmc q1, q1 @ AES block 1 - round 0
ldr q15, [r3, #112] @ load h4l | h4h
#ifndef __ARMEB__
ext v15.16b, v15.16b, v15.16b, #8
#endif
aese q3, v18.16b
aesmc q3, q3 @ AES block 3 - round 0
ldr q13, [r3, #64] @ load h2l | h2h
#ifndef __ARMEB__
ext v13.16b, v13.16b, v13.16b, #8
#endif
aese q2, v19.16b
aesmc q2, q2 @ AES block 2 - round 1
ldr q14, [r3, #80] @ load h3l | h3h
#ifndef __ARMEB__
ext v14.16b, v14.16b, v14.16b, #8
#endif
aese q1, v19.16b
aesmc q1, q1 @ AES block 1 - round 1
aese q3, v19.16b
aesmc q3, q3 @ AES block 3 - round 1
ldr q12, [r3, #32] @ load h1l | h1h
#ifndef __ARMEB__
ext v12.16b, v12.16b, v12.16b, #8
#endif
aese q2, v20.16b
aesmc q2, q2 @ AES block 2 - round 2
ld1 {v24.4s}, [r8], #16 @ load rk6
aese q0, v21.16b
aesmc q0, q0 @ AES block 0 - round 3
ld1 {v25.4s}, [r8], #16 @ load rk7
aese q1, v20.16b
aesmc q1, q1 @ AES block 1 - round 2
ld1 {v26.4s}, [r8], #16 @ load rk8
aese q3, v20.16b
aesmc q3, q3 @ AES block 3 - round 2
ld1 {v27.4s}, [r8], #16 @ load rk9
aese q2, v21.16b
aesmc q2, q2 @ AES block 2 - round 3
ld1 { v11.16b}, [r3]
ext v11.16b, v11.16b, v11.16b, #8
rev64 v11.16b, v11.16b
aese q1, v21.16b
aesmc q1, q1 @ AES block 1 - round 3
add r12, r12, #1 @ CTR block 3
aese q3, v21.16b
aesmc q3, q3 @ AES block 3 - round 3
trn1 q9, v14.2d, v15.2d @ h4h | h3h
aese q0, v22.16b
aesmc q0, q0 @ AES block 0 - round 4
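@ The remaining round keys (rk10, rk11) and the hash powers are loaded while
@ counter blocks 0-3 run their first AES rounds; the trn1/trn2 and eor sequence
@ below builds the combined Karatsuba constants h2k | h1k and h4k | h3k used by
@ the GHASH mid-term multiplies.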
ld1 {v28.4s}, [r8], #16 @ load rk10 aese q1, v22.16b aesmc q1, q1 @ AES block 1 - round 4 trn2 v17.2d, v14.2d, v15.2d @ h4l | h3l aese q2, v22.16b aesmc q2, q2 @ AES block 2 - round 4 aese q3, v22.16b aesmc q3, q3 @ AES block 3 - round 4 trn2 v16.2d, v12.2d, v13.2d @ h2l | h1l aese q0, v23.16b aesmc q0, q0 @ AES block 0 - round 5 ld1 {v29.4s}, [r8], #16 @ load rk11 aese q1, v23.16b aesmc q1, q1 @ AES block 1 - round 5 aese q2, v23.16b aesmc q2, q2 @ AES block 2 - round 5 aese q3, v23.16b aesmc q3, q3 @ AES block 3 - round 5 aese q0, v24.16b aesmc q0, q0 @ AES block 0 - round 6 aese q2, v24.16b aesmc q2, q2 @ AES block 2 - round 6 aese q3, v24.16b aesmc q3, q3 @ AES block 3 - round 6 aese q0, v25.16b aesmc q0, q0 @ AES block 0 - round 7 aese q2, v25.16b aesmc q2, q2 @ AES block 2 - round 7 aese q3, v25.16b aesmc q3, q3 @ AES block 3 - round 7 aese q1, v24.16b aesmc q1, q1 @ AES block 1 - round 6 aese q2, v26.16b aesmc q2, q2 @ AES block 2 - round 8 aese q3, v26.16b aesmc q3, q3 @ AES block 3 - round 8 aese q1, v25.16b aesmc q1, q1 @ AES block 1 - round 7 aese q2, v27.16b aesmc q2, q2 @ AES block 2 - round 9 aese q3, v27.16b aesmc q3, q3 @ AES block 3 - round 9 aese q1, v26.16b aesmc q1, q1 @ AES block 1 - round 8 sub r5, r5, #1 @ byte_len - 1 aese q0, v26.16b aesmc q0, q0 @ AES block 0 - round 8 and r5, r5, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail) aese q3, v28.16b aesmc q3, q3 @ AES block 3 - round 10 add r5, r5, r0 aese q1, v27.16b aesmc q1, q1 @ AES block 1 - round 9 cmp r0, r5 @ check if we have <= 4 blocks aese q0, v27.16b aesmc q0, q0 @ AES block 0 - round 9 trn1 q8, v12.2d, v13.2d @ h2h | h1h aese q3, v29.16b @ AES block 3 - round 11 aese q2, v28.16b aesmc q2, q2 @ AES block 2 - round 10 aese q1, v28.16b aesmc q1, q1 @ AES block 1 - round 10 aese q0, v28.16b aesmc q0, q0 @ AES block 0 - round 10 eor v16.16b, v16.16b, q8 @ h2k | h1k aese q2, v29.16b @ AES block 2 - round 11 aese q1, v29.16b @ AES block 1 - round 11 eor v17.16b, v17.16b, q9 @ h4k | h3k aese q0, v29.16b @ AES block 0 - round 11 bge .L192_dec_tail @ handle tail ld1 {q4, q5}, [r0], #32 @ AES block 0,1 - load ciphertext eor q1, q5, q1 @ AES block 1 - result eor q0, q4, q0 @ AES block 0 - result rev r9, r12 @ CTR block 4 ld1 {q6, q7}, [r0], #32 @ AES block 2,3 - load ciphertext mov r19, v1.d[0] @ AES block 1 - mov low mov r20, v1.d[1] @ AES block 1 - mov high mov r6, v0.d[0] @ AES block 0 - mov low orr r9, r11, r9, lsl #32 @ CTR block 4 add r12, r12, #1 @ CTR block 4 mov r7, v0.d[1] @ AES block 0 - mov high rev64 q4, q4 @ GHASH block 0 fmov d0, r10 @ CTR block 4 rev64 q5, q5 @ GHASH block 1 cmp r0, r5 @ check if we have <= 8 blocks eor r19, r19, r13 @ AES block 1 - round 12 low #ifdef __ARMEB__ rev r19, r19 #endif fmov v0.d[1], r9 @ CTR block 4 rev r9, r12 @ CTR block 5 orr r9, r11, r9, lsl #32 @ CTR block 5 fmov d1, r10 @ CTR block 5 eor r20, r20, r14 @ AES block 1 - round 12 high #ifdef __ARMEB__ rev r20, r20 #endif add r12, r12, #1 @ CTR block 5 fmov v1.d[1], r9 @ CTR block 5 eor r6, r6, r13 @ AES block 0 - round 12 low #ifdef __ARMEB__ rev r6, r6 #endif rev r9, r12 @ CTR block 6 eor r7, r7, r14 @ AES block 0 - round 12 high #ifdef __ARMEB__ rev r7, r7 #endif stp r6, r7, [r2], #16 @ AES block 0 - store result orr r9, r11, r9, lsl #32 @ CTR block 6 stp r19, r20, [r2], #16 @ AES block 1 - store result add r12, r12, #1 @ CTR block 6 eor q2, q6, q2 @ AES block 2 - result bge .L192_dec_prepretail @ do prepretail .L192_dec_main_loop:@ main loop start aese q1, 
v18.16b aesmc q1, q1 @ AES block 4k+5 - round 0 ext v11.16b, v11.16b, v11.16b, #8 @ PRE 0 pmull v31.1q, q5, v14.1d @ GHASH block 4k+1 - low mov r21, v2.d[0] @ AES block 4k+2 - mov low mov r22, v2.d[1] @ AES block 4k+2 - mov high eor q3, q7, q3 @ AES block 4k+3 - result rev64 q7, q7 @ GHASH block 4k+3 aese q1, v19.16b aesmc q1, q1 @ AES block 4k+5 - round 1 fmov d2, r10 @ CTR block 4k+6 aese q0, v18.16b aesmc q0, q0 @ AES block 4k+4 - round 0 eor q4, q4, v11.16b @ PRE 1 pmull2 v30.1q, q5, v14.2d @ GHASH block 4k+1 - high fmov v2.d[1], r9 @ CTR block 4k+6 aese q1, v20.16b aesmc q1, q1 @ AES block 4k+5 - round 2 mov r24, v3.d[1] @ AES block 4k+3 - mov high aese q0, v19.16b aesmc q0, q0 @ AES block 4k+4 - round 1 mov r23, v3.d[0] @ AES block 4k+3 - mov low pmull2 v9.1q, q4, v15.2d @ GHASH block 4k - high fmov d3, r10 @ CTR block 4k+7 mov d8, v4.d[1] @ GHASH block 4k - mid pmull v11.1q, q4, v15.1d @ GHASH block 4k - low mov d10, v17.d[1] @ GHASH block 4k - mid rev r9, r12 @ CTR block 4k+7 aese q2, v18.16b aesmc q2, q2 @ AES block 4k+6 - round 0 orr r9, r11, r9, lsl #32 @ CTR block 4k+7 fmov v3.d[1], r9 @ CTR block 4k+7 eor q8, q8, q4 @ GHASH block 4k - mid mov d4, v5.d[1] @ GHASH block 4k+1 - mid aese q1, v21.16b aesmc q1, q1 @ AES block 4k+5 - round 3 aese q0, v20.16b aesmc q0, q0 @ AES block 4k+4 - round 2 eor r22, r22, r14 @ AES block 4k+2 - round 12 high #ifdef __ARMEB__ rev r22, r22 #endif aese q2, v19.16b aesmc q2, q2 @ AES block 4k+6 - round 1 eor q4, q4, q5 @ GHASH block 4k+1 - mid pmull v10.1q, q8, v10.1d @ GHASH block 4k - mid aese q3, v18.16b aesmc q3, q3 @ AES block 4k+7 - round 0 rev64 q6, q6 @ GHASH block 4k+2 aese q2, v20.16b aesmc q2, q2 @ AES block 4k+6 - round 2 pmull v4.1q, q4, v17.1d @ GHASH block 4k+1 - mid eor v11.16b, v11.16b, v31.16b @ GHASH block 4k+1 - low eor r21, r21, r13 @ AES block 4k+2 - round 12 low #ifdef __ARMEB__ rev r21, r21 #endif aese q1, v22.16b aesmc q1, q1 @ AES block 4k+5 - round 4 aese q0, v21.16b aesmc q0, q0 @ AES block 4k+4 - round 3 eor v10.16b, v10.16b, q4 @ GHASH block 4k+1 - mid mov d31, v6.d[1] @ GHASH block 4k+2 - mid aese q3, v19.16b aesmc q3, q3 @ AES block 4k+7 - round 1 eor q9, q9, v30.16b @ GHASH block 4k+1 - high aese q0, v22.16b aesmc q0, q0 @ AES block 4k+4 - round 4 pmull2 v30.1q, q6, v13.2d @ GHASH block 4k+2 - high eor v31.8b, v31.8b, q6 @ GHASH block 4k+2 - mid pmull v8.1q, q6, v13.1d @ GHASH block 4k+2 - low aese q0, v23.16b aesmc q0, q0 @ AES block 4k+4 - round 5 eor q9, q9, v30.16b @ GHASH block 4k+2 - high mov d30, v7.d[1] @ GHASH block 4k+3 - mid aese q1, v23.16b aesmc q1, q1 @ AES block 4k+5 - round 5 pmull2 v5.1q, q7, v12.2d @ GHASH block 4k+3 - high aese q3, v20.16b aesmc q3, q3 @ AES block 4k+7 - round 2 eor v30.8b, v30.8b, q7 @ GHASH block 4k+3 - mid aese q1, v24.16b aesmc q1, q1 @ AES block 4k+5 - round 6 aese q0, v24.16b aesmc q0, q0 @ AES block 4k+4 - round 6 ins v31.d[1], v31.d[0] @ GHASH block 4k+2 - mid aese q3, v21.16b aesmc q3, q3 @ AES block 4k+7 - round 3 pmull v30.1q, v30.1d, v16.1d @ GHASH block 4k+3 - mid eor v11.16b, v11.16b, q8 @ GHASH block 4k+2 - low aese q0, v25.16b aesmc q0, q0 @ AES block 4k+4 - round 7 pmull2 v31.1q, v31.2d, v16.2d @ GHASH block 4k+2 - mid eor q9, q9, q5 @ GHASH block 4k+3 - high aese q1, v25.16b aesmc q1, q1 @ AES block 4k+5 - round 7 aese q0, v26.16b aesmc q0, q0 @ AES block 4k+4 - round 8 movi q8, #0xc2 pmull v6.1q, q7, v12.1d @ GHASH block 4k+3 - low aese q1, v26.16b aesmc q1, q1 @ AES block 4k+5 - round 8 eor v10.16b, v10.16b, v31.16b @ GHASH block 4k+2 - mid aese q2, v21.16b 
aesmc q2, q2 @ AES block 4k+6 - round 3 aese q0, v27.16b aesmc q0, q0 @ AES block 4k+4 - round 9 eor v11.16b, v11.16b, q6 @ GHASH block 4k+3 - low aese q3, v22.16b aesmc q3, q3 @ AES block 4k+7 - round 4 aese q2, v22.16b aesmc q2, q2 @ AES block 4k+6 - round 4 eor v10.16b, v10.16b, v30.16b @ GHASH block 4k+3 - mid aese q0, v28.16b aesmc q0, q0 @ AES block 4k+4 - round 10 aese q1, v27.16b aesmc q1, q1 @ AES block 4k+5 - round 9 eor v30.16b, v11.16b, q9 @ MODULO - karatsuba tidy up aese q2, v23.16b aesmc q2, q2 @ AES block 4k+6 - round 5 aese q3, v23.16b aesmc q3, q3 @ AES block 4k+7 - round 5 shl d8, d8, #56 @ mod_constant aese q1, v28.16b aesmc q1, q1 @ AES block 4k+5 - round 10 aese q2, v24.16b aesmc q2, q2 @ AES block 4k+6 - round 6 ld1 {q4}, [r0], #16 @ AES block 4k+4 - load ciphertext aese q3, v24.16b aesmc q3, q3 @ AES block 4k+7 - round 6 eor v10.16b, v10.16b, v30.16b @ MODULO - karatsuba tidy up pmull v31.1q, q9, q8 @ MODULO - top 64b align with mid ld1 {q5}, [r0], #16 @ AES block 4k+5 - load ciphertext eor r23, r23, r13 @ AES block 4k+3 - round 12 low #ifdef __ARMEB__ rev r23, r23 #endif aese q2, v25.16b aesmc q2, q2 @ AES block 4k+6 - round 7 ext q9, q9, q9, #8 @ MODULO - other top alignment aese q0, v29.16b @ AES block 4k+4 - round 11 add r12, r12, #1 @ CTR block 4k+7 aese q3, v25.16b aesmc q3, q3 @ AES block 4k+7 - round 7 eor v10.16b, v10.16b, v31.16b @ MODULO - fold into mid aese q2, v26.16b aesmc q2, q2 @ AES block 4k+6 - round 8 ld1 {q6}, [r0], #16 @ AES block 4k+6 - load ciphertext aese q1, v29.16b @ AES block 4k+5 - round 11 ld1 {q7}, [r0], #16 @ AES block 4k+7 - load ciphertext rev r9, r12 @ CTR block 4k+8 aese q3, v26.16b aesmc q3, q3 @ AES block 4k+7 - round 8 stp r21, r22, [r2], #16 @ AES block 4k+2 - store result aese q2, v27.16b aesmc q2, q2 @ AES block 4k+6 - round 9 eor v10.16b, v10.16b, q9 @ MODULO - fold into mid cmp r0, r5 @ .LOOP CONTROL eor q0, q4, q0 @ AES block 4k+4 - result eor r24, r24, r14 @ AES block 4k+3 - round 12 high #ifdef __ARMEB__ rev r24, r24 #endif eor q1, q5, q1 @ AES block 4k+5 - result aese q2, v28.16b aesmc q2, q2 @ AES block 4k+6 - round 10 orr r9, r11, r9, lsl #32 @ CTR block 4k+8 aese q3, v27.16b aesmc q3, q3 @ AES block 4k+7 - round 9 pmull v8.1q, v10.1d, q8 @ MODULO - mid 64b align with low mov r19, v1.d[0] @ AES block 4k+5 - mov low mov r6, v0.d[0] @ AES block 4k+4 - mov low stp r23, r24, [r2], #16 @ AES block 4k+3 - store result rev64 q5, q5 @ GHASH block 4k+5 aese q2, v29.16b @ AES block 4k+6 - round 11 mov r7, v0.d[1] @ AES block 4k+4 - mov high aese q3, v28.16b aesmc q3, q3 @ AES block 4k+7 - round 10 mov r20, v1.d[1] @ AES block 4k+5 - mov high fmov d0, r10 @ CTR block 4k+8 add r12, r12, #1 @ CTR block 4k+8 ext v10.16b, v10.16b, v10.16b, #8 @ MODULO - other mid alignment eor q2, q6, q2 @ AES block 4k+6 - result fmov v0.d[1], r9 @ CTR block 4k+8 rev r9, r12 @ CTR block 4k+9 eor r6, r6, r13 @ AES block 4k+4 - round 12 low #ifdef __ARMEB__ rev r6, r6 #endif orr r9, r11, r9, lsl #32 @ CTR block 4k+9 eor v11.16b, v11.16b, q8 @ MODULO - fold into low fmov d1, r10 @ CTR block 4k+9 add r12, r12, #1 @ CTR block 4k+9 eor r19, r19, r13 @ AES block 4k+5 - round 12 low #ifdef __ARMEB__ rev r19, r19 #endif fmov v1.d[1], r9 @ CTR block 4k+9 rev r9, r12 @ CTR block 4k+10 eor r20, r20, r14 @ AES block 4k+5 - round 12 high #ifdef __ARMEB__ rev r20, r20 #endif eor r7, r7, r14 @ AES block 4k+4 - round 12 high #ifdef __ARMEB__ rev r7, r7 #endif stp r6, r7, [r2], #16 @ AES block 4k+4 - store result eor v11.16b, v11.16b, v10.16b @ MODULO - fold into low 
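@ In the decrypt path GHASH runs over the ciphertext, so the block just loaded
@ for the next iteration (block 4k+4) is byte-reversed here before the loop
@ branches back; its polynomial multiplies start at the top of the next iteration.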
add r12, r12, #1 @ CTR block 4k+10 rev64 q4, q4 @ GHASH block 4k+4 orr r9, r11, r9, lsl #32 @ CTR block 4k+10 aese q3, v29.16b @ AES block 4k+7 - round 11 stp r19, r20, [r2], #16 @ AES block 4k+5 - store result blt .L192_dec_main_loop .L192_dec_prepretail:@ PREPRETAIL mov r22, v2.d[1] @ AES block 4k+2 - mov high ext v11.16b, v11.16b, v11.16b, #8 @ PRE 0 eor q3, q7, q3 @ AES block 4k+3 - result aese q1, v18.16b aesmc q1, q1 @ AES block 4k+5 - round 0 mov r21, v2.d[0] @ AES block 4k+2 - mov low aese q0, v18.16b aesmc q0, q0 @ AES block 4k+4 - round 0 mov d10, v17.d[1] @ GHASH block 4k - mid eor q4, q4, v11.16b @ PRE 1 fmov d2, r10 @ CTR block 4k+6 aese q1, v19.16b aesmc q1, q1 @ AES block 4k+5 - round 1 mov r23, v3.d[0] @ AES block 4k+3 - mov low aese q0, v19.16b aesmc q0, q0 @ AES block 4k+4 - round 1 mov r24, v3.d[1] @ AES block 4k+3 - mov high pmull v11.1q, q4, v15.1d @ GHASH block 4k - low mov d8, v4.d[1] @ GHASH block 4k - mid fmov d3, r10 @ CTR block 4k+7 aese q1, v20.16b aesmc q1, q1 @ AES block 4k+5 - round 2 rev64 q6, q6 @ GHASH block 4k+2 pmull2 v9.1q, q4, v15.2d @ GHASH block 4k - high fmov v2.d[1], r9 @ CTR block 4k+6 rev r9, r12 @ CTR block 4k+7 orr r9, r11, r9, lsl #32 @ CTR block 4k+7 eor q8, q8, q4 @ GHASH block 4k - mid mov d4, v5.d[1] @ GHASH block 4k+1 - mid pmull v31.1q, q5, v14.1d @ GHASH block 4k+1 - low eor r24, r24, r14 @ AES block 4k+3 - round 12 high #ifdef __ARMEB__ rev r24, r24 #endif fmov v3.d[1], r9 @ CTR block 4k+7 aese q0, v20.16b aesmc q0, q0 @ AES block 4k+4 - round 2 eor r21, r21, r13 @ AES block 4k+2 - round 12 low #ifdef __ARMEB__ rev r21, r21 #endif pmull2 v30.1q, q5, v14.2d @ GHASH block 4k+1 - high eor r22, r22, r14 @ AES block 4k+2 - round 12 high #ifdef __ARMEB__ rev r22, r22 #endif eor q4, q4, q5 @ GHASH block 4k+1 - mid pmull v10.1q, q8, v10.1d @ GHASH block 4k - mid eor r23, r23, r13 @ AES block 4k+3 - round 12 low #ifdef __ARMEB__ rev r23, r23 #endif stp r21, r22, [r2], #16 @ AES block 4k+2 - store result rev64 q7, q7 @ GHASH block 4k+3 stp r23, r24, [r2], #16 @ AES block 4k+3 - store result aese q3, v18.16b aesmc q3, q3 @ AES block 4k+7 - round 0 eor q9, q9, v30.16b @ GHASH block 4k+1 - high pmull v4.1q, q4, v17.1d @ GHASH block 4k+1 - mid add r12, r12, #1 @ CTR block 4k+7 pmull2 v30.1q, q6, v13.2d @ GHASH block 4k+2 - high eor v11.16b, v11.16b, v31.16b @ GHASH block 4k+1 - low aese q2, v18.16b aesmc q2, q2 @ AES block 4k+6 - round 0 eor v10.16b, v10.16b, q4 @ GHASH block 4k+1 - mid mov d31, v6.d[1] @ GHASH block 4k+2 - mid aese q3, v19.16b aesmc q3, q3 @ AES block 4k+7 - round 1 aese q2, v19.16b aesmc q2, q2 @ AES block 4k+6 - round 1 eor q9, q9, v30.16b @ GHASH block 4k+2 - high eor v31.8b, v31.8b, q6 @ GHASH block 4k+2 - mid pmull v8.1q, q6, v13.1d @ GHASH block 4k+2 - low aese q2, v20.16b aesmc q2, q2 @ AES block 4k+6 - round 2 mov d30, v7.d[1] @ GHASH block 4k+3 - mid aese q3, v20.16b aesmc q3, q3 @ AES block 4k+7 - round 2 ins v31.d[1], v31.d[0] @ GHASH block 4k+2 - mid pmull v6.1q, q7, v12.1d @ GHASH block 4k+3 - low aese q0, v21.16b aesmc q0, q0 @ AES block 4k+4 - round 3 eor v30.8b, v30.8b, q7 @ GHASH block 4k+3 - mid aese q1, v21.16b aesmc q1, q1 @ AES block 4k+5 - round 3 pmull2 v31.1q, v31.2d, v16.2d @ GHASH block 4k+2 - mid eor v11.16b, v11.16b, q8 @ GHASH block 4k+2 - low aese q0, v22.16b aesmc q0, q0 @ AES block 4k+4 - round 4 pmull2 v5.1q, q7, v12.2d @ GHASH block 4k+3 - high movi q8, #0xc2 pmull v30.1q, v30.1d, v16.1d @ GHASH block 4k+3 - mid aese q2, v21.16b aesmc q2, q2 @ AES block 4k+6 - round 3 shl d8, d8, #56 @ mod_constant 
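@ 0xc2 shifted into the top byte (0xc2 << 56) is the usual GCM reduction
@ constant; the pmulls against it below fold the high and mid GHASH terms back
@ into a 128-bit value modulo the GHASH polynomial.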
eor q9, q9, q5 @ GHASH block 4k+3 - high aese q0, v23.16b aesmc q0, q0 @ AES block 4k+4 - round 5 eor v10.16b, v10.16b, v31.16b @ GHASH block 4k+2 - mid aese q2, v22.16b aesmc q2, q2 @ AES block 4k+6 - round 4 pmull v31.1q, q9, q8 @ MODULO - top 64b align with mid eor v11.16b, v11.16b, q6 @ GHASH block 4k+3 - low aese q0, v24.16b aesmc q0, q0 @ AES block 4k+4 - round 6 aese q3, v21.16b aesmc q3, q3 @ AES block 4k+7 - round 3 eor v10.16b, v10.16b, v30.16b @ GHASH block 4k+3 - mid aese q2, v23.16b aesmc q2, q2 @ AES block 4k+6 - round 5 aese q0, v25.16b aesmc q0, q0 @ AES block 4k+4 - round 7 eor v30.16b, v11.16b, q9 @ MODULO - karatsuba tidy up aese q3, v22.16b aesmc q3, q3 @ AES block 4k+7 - round 4 aese q2, v24.16b aesmc q2, q2 @ AES block 4k+6 - round 6 ext q9, q9, q9, #8 @ MODULO - other top alignment aese q0, v26.16b aesmc q0, q0 @ AES block 4k+4 - round 8 aese q3, v23.16b aesmc q3, q3 @ AES block 4k+7 - round 5 eor v10.16b, v10.16b, v30.16b @ MODULO - karatsuba tidy up aese q1, v22.16b aesmc q1, q1 @ AES block 4k+5 - round 4 aese q2, v25.16b aesmc q2, q2 @ AES block 4k+6 - round 7 aese q0, v27.16b aesmc q0, q0 @ AES block 4k+4 - round 9 aese q1, v23.16b aesmc q1, q1 @ AES block 4k+5 - round 5 aese q3, v24.16b aesmc q3, q3 @ AES block 4k+7 - round 6 eor v10.16b, v10.16b, v31.16b @ MODULO - fold into mid aese q0, v28.16b aesmc q0, q0 @ AES block 4k+4 - round 10 aese q1, v24.16b aesmc q1, q1 @ AES block 4k+5 - round 6 aese q3, v25.16b aesmc q3, q3 @ AES block 4k+7 - round 7 aese q2, v26.16b aesmc q2, q2 @ AES block 4k+6 - round 8 eor v10.16b, v10.16b, q9 @ MODULO - fold into mid aese q1, v25.16b aesmc q1, q1 @ AES block 4k+5 - round 7 aese q3, v26.16b aesmc q3, q3 @ AES block 4k+7 - round 8 aese q2, v27.16b aesmc q2, q2 @ AES block 4k+6 - round 9 aese q1, v26.16b aesmc q1, q1 @ AES block 4k+5 - round 8 aese q3, v27.16b aesmc q3, q3 @ AES block 4k+7 - round 9 pmull v8.1q, v10.1d, q8 @ MODULO - mid 64b align with low aese q1, v27.16b aesmc q1, q1 @ AES block 4k+5 - round 9 aese q2, v28.16b aesmc q2, q2 @ AES block 4k+6 - round 10 aese q3, v28.16b aesmc q3, q3 @ AES block 4k+7 - round 10 ext v10.16b, v10.16b, v10.16b, #8 @ MODULO - other mid alignment aese q1, v28.16b aesmc q1, q1 @ AES block 4k+5 - round 10 aese q0, v29.16b eor v11.16b, v11.16b, q8 @ MODULO - fold into low aese q2, v29.16b aese q1, v29.16b aese q3, v29.16b eor v11.16b, v11.16b, v10.16b @ MODULO - fold into low .L192_dec_tail:@ TAIL sub r5, r4, r0 @ main_end_input_ptr is number of bytes left to process ld1 { q5}, [r0], #16 @ AES block 4k+4 - load ciphertext eor q0, q5, q0 @ AES block 4k+4 - result mov r7, v0.d[1] @ AES block 4k+4 - mov high mov r6, v0.d[0] @ AES block 4k+4 - mov low ext q8, v11.16b, v11.16b, #8 @ prepare final partial tag cmp r5, #48 eor r7, r7, r14 @ AES block 4k+4 - round 12 high #ifdef __ARMEB__ rev r7, r7 #endif eor r6, r6, r13 @ AES block 4k+4 - round 12 low #ifdef __ARMEB__ rev r6, r6 #endif bgt .L192_dec_blocks_more_than_3 movi v11.8b, #0 movi q9, #0 mov q3, q2 mov q2, q1 sub r12, r12, #1 movi v10.8b, #0 cmp r5, #32 bgt .L192_dec_blocks_more_than_2 mov q3, q1 cmp r5, #16 sub r12, r12, #1 bgt .L192_dec_blocks_more_than_1 sub r12, r12, #1 b .L192_dec_blocks_less_than_1 .L192_dec_blocks_more_than_3:@ blocks left > 3 rev64 q4, q5 @ GHASH final-3 block ld1 { q5}, [r0], #16 @ AES final-2 block - load ciphertext stp r6, r7, [r2], #16 @ AES final-3 block - store result eor q4, q4, q8 @ feed in partial tag eor q0, q5, q1 @ AES final-2 block - result pmull v11.1q, q4, v15.1d @ GHASH final-3 block - low mov r6, 
v0.d[0] @ AES final-2 block - mov low mov d22, v4.d[1] @ GHASH final-3 block - mid mov r7, v0.d[1] @ AES final-2 block - mov high mov d10, v17.d[1] @ GHASH final-3 block - mid eor v22.8b, v22.8b, q4 @ GHASH final-3 block - mid pmull2 v9.1q, q4, v15.2d @ GHASH final-3 block - high eor r6, r6, r13 @ AES final-2 block - round 12 low #ifdef __ARMEB__ rev r6, r6 #endif movi q8, #0 @ suppress further partial tag feed in pmull v10.1q, v22.1d, v10.1d @ GHASH final-3 block - mid eor r7, r7, r14 @ AES final-2 block - round 12 high #ifdef __ARMEB__ rev r7, r7 #endif .L192_dec_blocks_more_than_2:@ blocks left > 2 rev64 q4, q5 @ GHASH final-2 block ld1 { q5}, [r0], #16 @ AES final-1 block - load ciphertext eor q4, q4, q8 @ feed in partial tag movi q8, #0 @ suppress further partial tag feed in eor q0, q5, q2 @ AES final-1 block - result mov d22, v4.d[1] @ GHASH final-2 block - mid pmull v21.1q, q4, v14.1d @ GHASH final-2 block - low stp r6, r7, [r2], #16 @ AES final-2 block - store result eor v22.8b, v22.8b, q4 @ GHASH final-2 block - mid mov r7, v0.d[1] @ AES final-1 block - mov high eor v11.16b, v11.16b, v21.16b @ GHASH final-2 block - low mov r6, v0.d[0] @ AES final-1 block - mov low pmull2 v20.1q, q4, v14.2d @ GHASH final-2 block - high pmull v22.1q, v22.1d, v17.1d @ GHASH final-2 block - mid eor q9, q9, v20.16b @ GHASH final-2 block - high eor r7, r7, r14 @ AES final-1 block - round 12 high #ifdef __ARMEB__ rev r7, r7 #endif eor r6, r6, r13 @ AES final-1 block - round 12 low #ifdef __ARMEB__ rev r6, r6 #endif eor v10.16b, v10.16b, v22.16b @ GHASH final-2 block - mid .L192_dec_blocks_more_than_1:@ blocks left > 1 rev64 q4, q5 @ GHASH final-1 block eor q4, q4, q8 @ feed in partial tag ld1 { q5}, [r0], #16 @ AES final block - load ciphertext mov d22, v4.d[1] @ GHASH final-1 block - mid pmull2 v20.1q, q4, v13.2d @ GHASH final-1 block - high eor q0, q5, q3 @ AES final block - result stp r6, r7, [r2], #16 @ AES final-1 block - store result eor v22.8b, v22.8b, q4 @ GHASH final-1 block - mid eor q9, q9, v20.16b @ GHASH final-1 block - high pmull v21.1q, q4, v13.1d @ GHASH final-1 block - low mov r7, v0.d[1] @ AES final block - mov high ins v22.d[1], v22.d[0] @ GHASH final-1 block - mid mov r6, v0.d[0] @ AES final block - mov low pmull2 v22.1q, v22.2d, v16.2d @ GHASH final-1 block - mid movi q8, #0 @ suppress further partial tag feed in eor v11.16b, v11.16b, v21.16b @ GHASH final-1 block - low eor r7, r7, r14 @ AES final block - round 12 high #ifdef __ARMEB__ rev r7, r7 #endif eor r6, r6, r13 @ AES final block - round 12 low #ifdef __ARMEB__ rev r6, r6 #endif eor v10.16b, v10.16b, v22.16b @ GHASH final-1 block - mid .L192_dec_blocks_less_than_1:@ blocks left <= 1 mvn r13, xzr @ rk12_l = 0xffffffffffffffff ldp r4, r5, [r2] @ load existing bytes we need to not overwrite and r1, r1, #127 @ bit_length %= 128 sub r1, r1, #128 @ bit_length -= 128 neg r1, r1 @ bit_length = 128 - #bits in input (in range [1,128]) and r1, r1, #127 @ bit_length %= 128 mvn r14, xzr @ rk12_h = 0xffffffffffffffff lsr r14, r14, r1 @ rk12_h is mask for top 64b of last block cmp r1, #64 csel r9, r13, r14, lt csel r10, r14, xzr, lt fmov d0, r9 @ ctr0b is mask for last block and r6, r6, r9 bic r4, r4, r9 @ mask out low existing bytes orr r6, r6, r4 mov v0.d[1], r10 #ifndef __ARMEB__ rev r9, r12 #else mov r9, r12 #endif and q5, q5, q0 @ possibly partial last block has zeroes in highest bits str r9, [r16, #12] @ store the updated counter rev64 q4, q5 @ GHASH final block eor q4, q4, q8 @ feed in partial tag bic r5, r5, r10 @ mask out high 
existing bytes and r7, r7, r10 pmull2 v20.1q, q4, v12.2d @ GHASH final block - high mov d8, v4.d[1] @ GHASH final block - mid pmull v21.1q, q4, v12.1d @ GHASH final block - low eor q8, q8, q4 @ GHASH final block - mid eor q9, q9, v20.16b @ GHASH final block - high pmull v8.1q, q8, v16.1d @ GHASH final block - mid eor v11.16b, v11.16b, v21.16b @ GHASH final block - low eor v10.16b, v10.16b, q8 @ GHASH final block - mid movi q8, #0xc2 eor v30.16b, v11.16b, q9 @ MODULO - karatsuba tidy up shl d8, d8, #56 @ mod_constant eor v10.16b, v10.16b, v30.16b @ MODULO - karatsuba tidy up pmull v31.1q, q9, q8 @ MODULO - top 64b align with mid orr r7, r7, r5 stp r6, r7, [r2] ext q9, q9, q9, #8 @ MODULO - other top alignment eor v10.16b, v10.16b, v31.16b @ MODULO - fold into mid eor v10.16b, v10.16b, q9 @ MODULO - fold into mid pmull v8.1q, v10.1d, q8 @ MODULO - mid 64b align with low eor v11.16b, v11.16b, q8 @ MODULO - fold into low ext v10.16b, v10.16b, v10.16b, #8 @ MODULO - other mid alignment eor v11.16b, v11.16b, v10.16b @ MODULO - fold into low ext v11.16b, v11.16b, v11.16b, #8 rev64 v11.16b, v11.16b mov r0, r15 st1 { v11.16b }, [r3] ldp r21, r22, [sp, #16] ldp r23, r24, [sp, #32] ldp d8, d9, [sp, #48] ldp d10, d11, [sp, #64] ldp d12, d13, [sp, #80] ldp d14, d15, [sp, #96] ldp r19, r20, [sp], #112 RET .L192_dec_ret: mov r0, #0x0 RET .size aes_gcm_dec_192_kernel,.-aes_gcm_dec_192_kernel .globl aes_gcm_enc_256_kernel .type aes_gcm_enc_256_kernel,%function .align 4 aes_gcm_enc_256_kernel: cbz r1, .L256_enc_ret stp r19, r20, [sp, #-112]! mov r16, r4 mov r8, r5 stp r21, r22, [sp, #16] stp r23, r24, [sp, #32] stp d8, d9, [sp, #48] stp d10, d11, [sp, #64] stp d12, d13, [sp, #80] stp d14, d15, [sp, #96] add r4, r0, r1, lsr #3 @ end_input_ptr lsr r5, r1, #3 @ byte_len mov r15, r5 ldp r10, r11, [r16] @ ctr96_b64, ctr96_t32 #ifdef __ARMEB__ rev r10, r10 rev r11, r11 #endif ldp r13, r14, [r8, #224] @ load rk14 #ifdef __ARMEB__ ror r13, r13, #32 ror r14, r14, #32 #endif ld1 { q0}, [r16] @ special case vector load initial counter so we can start first AES block as quickly as possible sub r5, r5, #1 @ byte_len - 1 ld1 {v18.4s}, [r8], #16 @ load rk0 and r5, r5, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail) ld1 {v19.4s}, [r8], #16 @ load rk1 add r5, r5, r0 lsr r12, r11, #32 fmov d2, r10 @ CTR block 2 orr r11, r11, r11 rev r12, r12 @ rev_ctr32 cmp r0, r5 @ check if we have <= 4 blocks fmov d1, r10 @ CTR block 1 aese q0, v18.16b aesmc q0, q0 @ AES block 0 - round 0 add r12, r12, #1 @ increment rev_ctr32 rev r9, r12 @ CTR block 1 fmov d3, r10 @ CTR block 3 orr r9, r11, r9, lsl #32 @ CTR block 1 add r12, r12, #1 @ CTR block 1 ld1 {v20.4s}, [r8], #16 @ load rk2 fmov v1.d[1], r9 @ CTR block 1 rev r9, r12 @ CTR block 2 add r12, r12, #1 @ CTR block 2 orr r9, r11, r9, lsl #32 @ CTR block 2 ld1 {v21.4s}, [r8], #16 @ load rk3 fmov v2.d[1], r9 @ CTR block 2 rev r9, r12 @ CTR block 3 aese q0, v19.16b aesmc q0, q0 @ AES block 0 - round 1 orr r9, r11, r9, lsl #32 @ CTR block 3 fmov v3.d[1], r9 @ CTR block 3 aese q1, v18.16b aesmc q1, q1 @ AES block 1 - round 0 ld1 {v22.4s}, [r8], #16 @ load rk4 aese q0, v20.16b aesmc q0, q0 @ AES block 0 - round 2 ld1 {v23.4s}, [r8], #16 @ load rk5 aese q2, v18.16b aesmc q2, q2 @ AES block 2 - round 0 ld1 {v24.4s}, [r8], #16 @ load rk6 aese q1, v19.16b aesmc q1, q1 @ AES block 1 - round 1 ldr q14, [r3, #80] @ load h3l | h3h #ifndef __ARMEB__ ext v14.16b, v14.16b, v14.16b, #8 #endif aese q3, v18.16b aesmc q3, q3 @ AES block 3 - round 0 
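@ AES-256 takes 14 rounds here: rk0-rk12 are applied with aese+aesmc, rk13 with
@ a final aese, and the last round key rk14 (held in r13/r14) is merged into the
@ plaintext words before they are XORed with the keystream blocks.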
ld1 {v25.4s}, [r8], #16 @ load rk7 aese q2, v19.16b aesmc q2, q2 @ AES block 2 - round 1 ld1 {v26.4s}, [r8], #16 @ load rk8 aese q1, v20.16b aesmc q1, q1 @ AES block 1 - round 2 ldr q13, [r3, #64] @ load h2l | h2h #ifndef __ARMEB__ ext v13.16b, v13.16b, v13.16b, #8 #endif aese q3, v19.16b aesmc q3, q3 @ AES block 3 - round 1 ld1 {v27.4s}, [r8], #16 @ load rk9 aese q2, v20.16b aesmc q2, q2 @ AES block 2 - round 2 ldr q15, [r3, #112] @ load h4l | h4h #ifndef __ARMEB__ ext v15.16b, v15.16b, v15.16b, #8 #endif aese q1, v21.16b aesmc q1, q1 @ AES block 1 - round 3 ld1 {v28.4s}, [r8], #16 @ load rk10 aese q3, v20.16b aesmc q3, q3 @ AES block 3 - round 2 ld1 {v29.4s}, [r8], #16 @ load rk11 aese q2, v21.16b aesmc q2, q2 @ AES block 2 - round 3 add r12, r12, #1 @ CTR block 3 aese q0, v21.16b aesmc q0, q0 @ AES block 0 - round 3 aese q3, v21.16b aesmc q3, q3 @ AES block 3 - round 3 ld1 { v11.16b}, [r3] ext v11.16b, v11.16b, v11.16b, #8 rev64 v11.16b, v11.16b aese q2, v22.16b aesmc q2, q2 @ AES block 2 - round 4 aese q0, v22.16b aesmc q0, q0 @ AES block 0 - round 4 aese q1, v22.16b aesmc q1, q1 @ AES block 1 - round 4 aese q3, v22.16b aesmc q3, q3 @ AES block 3 - round 4 aese q0, v23.16b aesmc q0, q0 @ AES block 0 - round 5 aese q1, v23.16b aesmc q1, q1 @ AES block 1 - round 5 aese q3, v23.16b aesmc q3, q3 @ AES block 3 - round 5 aese q2, v23.16b aesmc q2, q2 @ AES block 2 - round 5 aese q1, v24.16b aesmc q1, q1 @ AES block 1 - round 6 trn2 v17.2d, v14.2d, v15.2d @ h4l | h3l aese q3, v24.16b aesmc q3, q3 @ AES block 3 - round 6 ld1 {v30.4s}, [r8], #16 @ load rk12 aese q0, v24.16b aesmc q0, q0 @ AES block 0 - round 6 ldr q12, [r3, #32] @ load h1l | h1h #ifndef __ARMEB__ ext v12.16b, v12.16b, v12.16b, #8 #endif aese q2, v24.16b aesmc q2, q2 @ AES block 2 - round 6 ld1 {v31.4s}, [r8], #16 @ load rk13 aese q1, v25.16b aesmc q1, q1 @ AES block 1 - round 7 trn1 q9, v14.2d, v15.2d @ h4h | h3h aese q0, v25.16b aesmc q0, q0 @ AES block 0 - round 7 aese q2, v25.16b aesmc q2, q2 @ AES block 2 - round 7 aese q3, v25.16b aesmc q3, q3 @ AES block 3 - round 7 trn2 v16.2d, v12.2d, v13.2d @ h2l | h1l aese q1, v26.16b aesmc q1, q1 @ AES block 1 - round 8 aese q2, v26.16b aesmc q2, q2 @ AES block 2 - round 8 aese q3, v26.16b aesmc q3, q3 @ AES block 3 - round 8 aese q1, v27.16b aesmc q1, q1 @ AES block 1 - round 9 aese q2, v27.16b aesmc q2, q2 @ AES block 2 - round 9 aese q0, v26.16b aesmc q0, q0 @ AES block 0 - round 8 aese q1, v28.16b aesmc q1, q1 @ AES block 1 - round 10 aese q3, v27.16b aesmc q3, q3 @ AES block 3 - round 9 aese q0, v27.16b aesmc q0, q0 @ AES block 0 - round 9 aese q2, v28.16b aesmc q2, q2 @ AES block 2 - round 10 aese q3, v28.16b aesmc q3, q3 @ AES block 3 - round 10 aese q1, v29.16b aesmc q1, q1 @ AES block 1 - round 11 aese q2, v29.16b aesmc q2, q2 @ AES block 2 - round 11 aese q0, v28.16b aesmc q0, q0 @ AES block 0 - round 10 aese q1, v30.16b aesmc q1, q1 @ AES block 1 - round 12 aese q2, v30.16b aesmc q2, q2 @ AES block 2 - round 12 aese q0, v29.16b aesmc q0, q0 @ AES block 0 - round 11 eor v17.16b, v17.16b, q9 @ h4k | h3k aese q3, v29.16b aesmc q3, q3 @ AES block 3 - round 11 aese q2, v31.16b @ AES block 2 - round 13 trn1 q8, v12.2d, v13.2d @ h2h | h1h aese q0, v30.16b aesmc q0, q0 @ AES block 0 - round 12 aese q3, v30.16b aesmc q3, q3 @ AES block 3 - round 12 aese q1, v31.16b @ AES block 1 - round 13 aese q0, v31.16b @ AES block 0 - round 13 aese q3, v31.16b @ AES block 3 - round 13 eor v16.16b, v16.16b, q8 @ h2k | h1k bge .L256_enc_tail @ handle tail ldp r19, r20, [r0, #16] @ AES block 1 - 
load plaintext #ifdef __ARMEB__ rev r19, r19 rev r20, r20 #endif rev r9, r12 @ CTR block 4 ldp r6, r7, [r0, #0] @ AES block 0 - load plaintext #ifdef __ARMEB__ rev r6, r6 rev r7, r7 #endif ldp r23, r24, [r0, #48] @ AES block 3 - load plaintext #ifdef __ARMEB__ rev r23, r23 rev r24, r24 #endif ldp r21, r22, [r0, #32] @ AES block 2 - load plaintext #ifdef __ARMEB__ rev r21, r21 rev r22, r22 #endif add r0, r0, #64 @ AES input_ptr update eor r19, r19, r13 @ AES block 1 - round 14 low eor r20, r20, r14 @ AES block 1 - round 14 high fmov d5, r19 @ AES block 1 - mov low eor r6, r6, r13 @ AES block 0 - round 14 low eor r7, r7, r14 @ AES block 0 - round 14 high eor r24, r24, r14 @ AES block 3 - round 14 high fmov d4, r6 @ AES block 0 - mov low cmp r0, r5 @ check if we have <= 8 blocks fmov v4.d[1], r7 @ AES block 0 - mov high eor r23, r23, r13 @ AES block 3 - round 14 low eor r21, r21, r13 @ AES block 2 - round 14 low fmov v5.d[1], r20 @ AES block 1 - mov high fmov d6, r21 @ AES block 2 - mov low add r12, r12, #1 @ CTR block 4 orr r9, r11, r9, lsl #32 @ CTR block 4 fmov d7, r23 @ AES block 3 - mov low eor r22, r22, r14 @ AES block 2 - round 14 high fmov v6.d[1], r22 @ AES block 2 - mov high eor q4, q4, q0 @ AES block 0 - result fmov d0, r10 @ CTR block 4 fmov v0.d[1], r9 @ CTR block 4 rev r9, r12 @ CTR block 5 add r12, r12, #1 @ CTR block 5 eor q5, q5, q1 @ AES block 1 - result fmov d1, r10 @ CTR block 5 orr r9, r11, r9, lsl #32 @ CTR block 5 fmov v1.d[1], r9 @ CTR block 5 rev r9, r12 @ CTR block 6 st1 { q4}, [r2], #16 @ AES block 0 - store result fmov v7.d[1], r24 @ AES block 3 - mov high orr r9, r11, r9, lsl #32 @ CTR block 6 eor q6, q6, q2 @ AES block 2 - result st1 { q5}, [r2], #16 @ AES block 1 - store result add r12, r12, #1 @ CTR block 6 fmov d2, r10 @ CTR block 6 fmov v2.d[1], r9 @ CTR block 6 st1 { q6}, [r2], #16 @ AES block 2 - store result rev r9, r12 @ CTR block 7 orr r9, r11, r9, lsl #32 @ CTR block 7 eor q7, q7, q3 @ AES block 3 - result st1 { q7}, [r2], #16 @ AES block 3 - store result bge .L256_enc_prepretail @ do prepretail .L256_enc_main_loop:@ main loop start aese q0, v18.16b aesmc q0, q0 @ AES block 4k+4 - round 0 rev64 q4, q4 @ GHASH block 4k (only t0 is free) aese q1, v18.16b aesmc q1, q1 @ AES block 4k+5 - round 0 fmov d3, r10 @ CTR block 4k+3 aese q2, v18.16b aesmc q2, q2 @ AES block 4k+6 - round 0 ext v11.16b, v11.16b, v11.16b, #8 @ PRE 0 aese q0, v19.16b aesmc q0, q0 @ AES block 4k+4 - round 1 fmov v3.d[1], r9 @ CTR block 4k+3 aese q1, v19.16b aesmc q1, q1 @ AES block 4k+5 - round 1 ldp r23, r24, [r0, #48] @ AES block 4k+7 - load plaintext #ifdef __ARMEB__ rev r23, r23 rev r24, r24 #endif aese q2, v19.16b aesmc q2, q2 @ AES block 4k+6 - round 1 ldp r21, r22, [r0, #32] @ AES block 4k+6 - load plaintext #ifdef __ARMEB__ rev r21, r21 rev r22, r22 #endif aese q0, v20.16b aesmc q0, q0 @ AES block 4k+4 - round 2 eor q4, q4, v11.16b @ PRE 1 aese q1, v20.16b aesmc q1, q1 @ AES block 4k+5 - round 2 aese q3, v18.16b aesmc q3, q3 @ AES block 4k+7 - round 0 eor r23, r23, r13 @ AES block 4k+7 - round 14 low aese q0, v21.16b aesmc q0, q0 @ AES block 4k+4 - round 3 mov d10, v17.d[1] @ GHASH block 4k - mid pmull2 v9.1q, q4, v15.2d @ GHASH block 4k - high eor r22, r22, r14 @ AES block 4k+6 - round 14 high mov d8, v4.d[1] @ GHASH block 4k - mid aese q3, v19.16b aesmc q3, q3 @ AES block 4k+7 - round 1 rev64 q5, q5 @ GHASH block 4k+1 (t0 and t1 free) aese q0, v22.16b aesmc q0, q0 @ AES block 4k+4 - round 4 pmull v11.1q, q4, v15.1d @ GHASH block 4k - low eor q8, q8, q4 @ GHASH block 4k - mid 
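@ Karatsuba mid-term: q8 now holds (hi ^ lo) of GHASH block 4k, which is
@ multiplied by the pre-combined key halves in v17/v16, so each block needs only
@ three 64x64 polynomial multiplies (low, high, mid).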
aese q2, v20.16b aesmc q2, q2 @ AES block 4k+6 - round 2 aese q0, v23.16b aesmc q0, q0 @ AES block 4k+4 - round 5 rev64 q7, q7 @ GHASH block 4k+3 (t0, t1, t2 and t3 free) pmull2 v4.1q, q5, v14.2d @ GHASH block 4k+1 - high pmull v10.1q, q8, v10.1d @ GHASH block 4k - mid rev64 q6, q6 @ GHASH block 4k+2 (t0, t1, and t2 free) pmull v8.1q, q5, v14.1d @ GHASH block 4k+1 - low eor q9, q9, q4 @ GHASH block 4k+1 - high mov d4, v5.d[1] @ GHASH block 4k+1 - mid aese q1, v21.16b aesmc q1, q1 @ AES block 4k+5 - round 3 aese q3, v20.16b aesmc q3, q3 @ AES block 4k+7 - round 2 eor v11.16b, v11.16b, q8 @ GHASH block 4k+1 - low aese q2, v21.16b aesmc q2, q2 @ AES block 4k+6 - round 3 aese q1, v22.16b aesmc q1, q1 @ AES block 4k+5 - round 4 mov d8, v6.d[1] @ GHASH block 4k+2 - mid aese q3, v21.16b aesmc q3, q3 @ AES block 4k+7 - round 3 eor q4, q4, q5 @ GHASH block 4k+1 - mid aese q2, v22.16b aesmc q2, q2 @ AES block 4k+6 - round 4 aese q0, v24.16b aesmc q0, q0 @ AES block 4k+4 - round 6 eor q8, q8, q6 @ GHASH block 4k+2 - mid aese q3, v22.16b aesmc q3, q3 @ AES block 4k+7 - round 4 pmull v4.1q, q4, v17.1d @ GHASH block 4k+1 - mid aese q0, v25.16b aesmc q0, q0 @ AES block 4k+4 - round 7 aese q3, v23.16b aesmc q3, q3 @ AES block 4k+7 - round 5 ins v8.d[1], v8.d[0] @ GHASH block 4k+2 - mid aese q1, v23.16b aesmc q1, q1 @ AES block 4k+5 - round 5 aese q0, v26.16b aesmc q0, q0 @ AES block 4k+4 - round 8 aese q2, v23.16b aesmc q2, q2 @ AES block 4k+6 - round 5 aese q1, v24.16b aesmc q1, q1 @ AES block 4k+5 - round 6 eor v10.16b, v10.16b, q4 @ GHASH block 4k+1 - mid pmull2 v4.1q, q6, v13.2d @ GHASH block 4k+2 - high pmull v5.1q, q6, v13.1d @ GHASH block 4k+2 - low aese q1, v25.16b aesmc q1, q1 @ AES block 4k+5 - round 7 pmull v6.1q, q7, v12.1d @ GHASH block 4k+3 - low eor q9, q9, q4 @ GHASH block 4k+2 - high aese q3, v24.16b aesmc q3, q3 @ AES block 4k+7 - round 6 ldp r19, r20, [r0, #16] @ AES block 4k+5 - load plaintext #ifdef __ARMEB__ rev r19, r19 rev r20, r20 #endif aese q1, v26.16b aesmc q1, q1 @ AES block 4k+5 - round 8 mov d4, v7.d[1] @ GHASH block 4k+3 - mid aese q2, v24.16b aesmc q2, q2 @ AES block 4k+6 - round 6 eor v11.16b, v11.16b, q5 @ GHASH block 4k+2 - low pmull2 v8.1q, q8, v16.2d @ GHASH block 4k+2 - mid pmull2 v5.1q, q7, v12.2d @ GHASH block 4k+3 - high eor q4, q4, q7 @ GHASH block 4k+3 - mid aese q2, v25.16b aesmc q2, q2 @ AES block 4k+6 - round 7 eor r19, r19, r13 @ AES block 4k+5 - round 14 low aese q1, v27.16b aesmc q1, q1 @ AES block 4k+5 - round 9 eor v10.16b, v10.16b, q8 @ GHASH block 4k+2 - mid aese q3, v25.16b aesmc q3, q3 @ AES block 4k+7 - round 7 eor r21, r21, r13 @ AES block 4k+6 - round 14 low aese q0, v27.16b aesmc q0, q0 @ AES block 4k+4 - round 9 movi q8, #0xc2 pmull v4.1q, q4, v16.1d @ GHASH block 4k+3 - mid eor q9, q9, q5 @ GHASH block 4k+3 - high fmov d5, r19 @ AES block 4k+5 - mov low aese q2, v26.16b aesmc q2, q2 @ AES block 4k+6 - round 8 ldp r6, r7, [r0, #0] @ AES block 4k+4 - load plaintext #ifdef __ARMEB__ rev r6, r6 rev r7, r7 #endif aese q0, v28.16b aesmc q0, q0 @ AES block 4k+4 - round 10 shl d8, d8, #56 @ mod_constant aese q3, v26.16b aesmc q3, q3 @ AES block 4k+7 - round 8 eor v11.16b, v11.16b, q6 @ GHASH block 4k+3 - low aese q2, v27.16b aesmc q2, q2 @ AES block 4k+6 - round 9 aese q1, v28.16b aesmc q1, q1 @ AES block 4k+5 - round 10 eor v10.16b, v10.16b, q4 @ GHASH block 4k+3 - mid aese q3, v27.16b aesmc q3, q3 @ AES block 4k+7 - round 9 add r12, r12, #1 @ CTR block 4k+3 aese q0, v29.16b aesmc q0, q0 @ AES block 4k+4 - round 11 eor q4, v11.16b, q9 @ MODULO - 
karatsuba tidy up aese q1, v29.16b aesmc q1, q1 @ AES block 4k+5 - round 11 add r0, r0, #64 @ AES input_ptr update pmull v7.1q, q9, q8 @ MODULO - top 64b align with mid rev r9, r12 @ CTR block 4k+8 ext q9, q9, q9, #8 @ MODULO - other top alignment aese q2, v28.16b aesmc q2, q2 @ AES block 4k+6 - round 10 eor r6, r6, r13 @ AES block 4k+4 - round 14 low aese q1, v30.16b aesmc q1, q1 @ AES block 4k+5 - round 12 eor v10.16b, v10.16b, q4 @ MODULO - karatsuba tidy up aese q3, v28.16b aesmc q3, q3 @ AES block 4k+7 - round 10 eor r7, r7, r14 @ AES block 4k+4 - round 14 high fmov d4, r6 @ AES block 4k+4 - mov low orr r9, r11, r9, lsl #32 @ CTR block 4k+8 eor q7, q9, q7 @ MODULO - fold into mid aese q0, v30.16b aesmc q0, q0 @ AES block 4k+4 - round 12 eor r20, r20, r14 @ AES block 4k+5 - round 14 high aese q2, v29.16b aesmc q2, q2 @ AES block 4k+6 - round 11 eor r24, r24, r14 @ AES block 4k+7 - round 14 high aese q3, v29.16b aesmc q3, q3 @ AES block 4k+7 - round 11 add r12, r12, #1 @ CTR block 4k+8 aese q0, v31.16b @ AES block 4k+4 - round 13 fmov v4.d[1], r7 @ AES block 4k+4 - mov high eor v10.16b, v10.16b, q7 @ MODULO - fold into mid aese q2, v30.16b aesmc q2, q2 @ AES block 4k+6 - round 12 fmov d7, r23 @ AES block 4k+7 - mov low aese q1, v31.16b @ AES block 4k+5 - round 13 fmov v5.d[1], r20 @ AES block 4k+5 - mov high fmov d6, r21 @ AES block 4k+6 - mov low cmp r0, r5 @ .LOOP CONTROL fmov v6.d[1], r22 @ AES block 4k+6 - mov high pmull v9.1q, v10.1d, q8 @ MODULO - mid 64b align with low eor q4, q4, q0 @ AES block 4k+4 - result fmov d0, r10 @ CTR block 4k+8 fmov v0.d[1], r9 @ CTR block 4k+8 rev r9, r12 @ CTR block 4k+9 add r12, r12, #1 @ CTR block 4k+9 eor q5, q5, q1 @ AES block 4k+5 - result fmov d1, r10 @ CTR block 4k+9 orr r9, r11, r9, lsl #32 @ CTR block 4k+9 aese q3, v30.16b aesmc q3, q3 @ AES block 4k+7 - round 12 fmov v1.d[1], r9 @ CTR block 4k+9 aese q2, v31.16b @ AES block 4k+6 - round 13 rev r9, r12 @ CTR block 4k+10 st1 { q4}, [r2], #16 @ AES block 4k+4 - store result orr r9, r11, r9, lsl #32 @ CTR block 4k+10 eor v11.16b, v11.16b, q9 @ MODULO - fold into low fmov v7.d[1], r24 @ AES block 4k+7 - mov high ext v10.16b, v10.16b, v10.16b, #8 @ MODULO - other mid alignment st1 { q5}, [r2], #16 @ AES block 4k+5 - store result add r12, r12, #1 @ CTR block 4k+10 aese q3, v31.16b @ AES block 4k+7 - round 13 eor q6, q6, q2 @ AES block 4k+6 - result fmov d2, r10 @ CTR block 4k+10 st1 { q6}, [r2], #16 @ AES block 4k+6 - store result fmov v2.d[1], r9 @ CTR block 4k+10 rev r9, r12 @ CTR block 4k+11 eor v11.16b, v11.16b, v10.16b @ MODULO - fold into low orr r9, r11, r9, lsl #32 @ CTR block 4k+11 eor q7, q7, q3 @ AES block 4k+7 - result st1 { q7}, [r2], #16 @ AES block 4k+7 - store result blt .L256_enc_main_loop .L256_enc_prepretail:@ PREPRETAIL aese q1, v18.16b aesmc q1, q1 @ AES block 4k+5 - round 0 rev64 q6, q6 @ GHASH block 4k+2 (t0, t1, and t2 free) aese q2, v18.16b aesmc q2, q2 @ AES block 4k+6 - round 0 fmov d3, r10 @ CTR block 4k+3 aese q0, v18.16b aesmc q0, q0 @ AES block 4k+4 - round 0 rev64 q4, q4 @ GHASH block 4k (only t0 is free) fmov v3.d[1], r9 @ CTR block 4k+3 ext v11.16b, v11.16b, v11.16b, #8 @ PRE 0 aese q2, v19.16b aesmc q2, q2 @ AES block 4k+6 - round 1 aese q0, v19.16b aesmc q0, q0 @ AES block 4k+4 - round 1 eor q4, q4, v11.16b @ PRE 1 rev64 q5, q5 @ GHASH block 4k+1 (t0 and t1 free) aese q2, v20.16b aesmc q2, q2 @ AES block 4k+6 - round 2 aese q3, v18.16b aesmc q3, q3 @ AES block 4k+7 - round 0 mov d10, v17.d[1] @ GHASH block 4k - mid aese q1, v19.16b aesmc q1, q1 @ AES block 4k+5 
- round 1 pmull v11.1q, q4, v15.1d @ GHASH block 4k - low mov d8, v4.d[1] @ GHASH block 4k - mid pmull2 v9.1q, q4, v15.2d @ GHASH block 4k - high aese q2, v21.16b aesmc q2, q2 @ AES block 4k+6 - round 3 aese q1, v20.16b aesmc q1, q1 @ AES block 4k+5 - round 2 eor q8, q8, q4 @ GHASH block 4k - mid aese q0, v20.16b aesmc q0, q0 @ AES block 4k+4 - round 2 aese q3, v19.16b aesmc q3, q3 @ AES block 4k+7 - round 1 aese q1, v21.16b aesmc q1, q1 @ AES block 4k+5 - round 3 pmull v10.1q, q8, v10.1d @ GHASH block 4k - mid pmull2 v4.1q, q5, v14.2d @ GHASH block 4k+1 - high pmull v8.1q, q5, v14.1d @ GHASH block 4k+1 - low aese q3, v20.16b aesmc q3, q3 @ AES block 4k+7 - round 2 eor q9, q9, q4 @ GHASH block 4k+1 - high mov d4, v5.d[1] @ GHASH block 4k+1 - mid aese q0, v21.16b aesmc q0, q0 @ AES block 4k+4 - round 3 eor v11.16b, v11.16b, q8 @ GHASH block 4k+1 - low aese q3, v21.16b aesmc q3, q3 @ AES block 4k+7 - round 3 eor q4, q4, q5 @ GHASH block 4k+1 - mid mov d8, v6.d[1] @ GHASH block 4k+2 - mid aese q0, v22.16b aesmc q0, q0 @ AES block 4k+4 - round 4 rev64 q7, q7 @ GHASH block 4k+3 (t0, t1, t2 and t3 free) aese q3, v22.16b aesmc q3, q3 @ AES block 4k+7 - round 4 pmull v4.1q, q4, v17.1d @ GHASH block 4k+1 - mid eor q8, q8, q6 @ GHASH block 4k+2 - mid add r12, r12, #1 @ CTR block 4k+3 pmull v5.1q, q6, v13.1d @ GHASH block 4k+2 - low aese q3, v23.16b aesmc q3, q3 @ AES block 4k+7 - round 5 aese q2, v22.16b aesmc q2, q2 @ AES block 4k+6 - round 4 eor v10.16b, v10.16b, q4 @ GHASH block 4k+1 - mid pmull2 v4.1q, q6, v13.2d @ GHASH block 4k+2 - high eor v11.16b, v11.16b, q5 @ GHASH block 4k+2 - low ins v8.d[1], v8.d[0] @ GHASH block 4k+2 - mid aese q2, v23.16b aesmc q2, q2 @ AES block 4k+6 - round 5 eor q9, q9, q4 @ GHASH block 4k+2 - high mov d4, v7.d[1] @ GHASH block 4k+3 - mid aese q1, v22.16b aesmc q1, q1 @ AES block 4k+5 - round 4 pmull2 v8.1q, q8, v16.2d @ GHASH block 4k+2 - mid eor q4, q4, q7 @ GHASH block 4k+3 - mid pmull2 v5.1q, q7, v12.2d @ GHASH block 4k+3 - high aese q1, v23.16b aesmc q1, q1 @ AES block 4k+5 - round 5 pmull v4.1q, q4, v16.1d @ GHASH block 4k+3 - mid eor v10.16b, v10.16b, q8 @ GHASH block 4k+2 - mid aese q0, v23.16b aesmc q0, q0 @ AES block 4k+4 - round 5 aese q1, v24.16b aesmc q1, q1 @ AES block 4k+5 - round 6 aese q2, v24.16b aesmc q2, q2 @ AES block 4k+6 - round 6 aese q0, v24.16b aesmc q0, q0 @ AES block 4k+4 - round 6 movi q8, #0xc2 aese q3, v24.16b aesmc q3, q3 @ AES block 4k+7 - round 6 aese q1, v25.16b aesmc q1, q1 @ AES block 4k+5 - round 7 eor q9, q9, q5 @ GHASH block 4k+3 - high aese q0, v25.16b aesmc q0, q0 @ AES block 4k+4 - round 7 aese q3, v25.16b aesmc q3, q3 @ AES block 4k+7 - round 7 shl d8, d8, #56 @ mod_constant aese q1, v26.16b aesmc q1, q1 @ AES block 4k+5 - round 8 eor v10.16b, v10.16b, q4 @ GHASH block 4k+3 - mid pmull v6.1q, q7, v12.1d @ GHASH block 4k+3 - low aese q3, v26.16b aesmc q3, q3 @ AES block 4k+7 - round 8 aese q1, v27.16b aesmc q1, q1 @ AES block 4k+5 - round 9 aese q0, v26.16b aesmc q0, q0 @ AES block 4k+4 - round 8 eor v11.16b, v11.16b, q6 @ GHASH block 4k+3 - low aese q3, v27.16b aesmc q3, q3 @ AES block 4k+7 - round 9 eor v10.16b, v10.16b, q9 @ karatsuba tidy up pmull v4.1q, q9, q8 ext q9, q9, q9, #8 aese q3, v28.16b aesmc q3, q3 @ AES block 4k+7 - round 10 aese q2, v25.16b aesmc q2, q2 @ AES block 4k+6 - round 7 eor v10.16b, v10.16b, v11.16b aese q1, v28.16b aesmc q1, q1 @ AES block 4k+5 - round 10 aese q0, v27.16b aesmc q0, q0 @ AES block 4k+4 - round 9 aese q2, v26.16b aesmc q2, q2 @ AES block 4k+6 - round 8 aese q1, v29.16b aesmc q1, 
q1 @ AES block 4k+5 - round 11 eor v10.16b, v10.16b, q4 aese q0, v28.16b aesmc q0, q0 @ AES block 4k+4 - round 10 aese q2, v27.16b aesmc q2, q2 @ AES block 4k+6 - round 9 aese q1, v30.16b aesmc q1, q1 @ AES block 4k+5 - round 12 aese q0, v29.16b aesmc q0, q0 @ AES block 4k+4 - round 11 eor v10.16b, v10.16b, q9 aese q3, v29.16b aesmc q3, q3 @ AES block 4k+7 - round 11 aese q2, v28.16b aesmc q2, q2 @ AES block 4k+6 - round 10 aese q0, v30.16b aesmc q0, q0 @ AES block 4k+4 - round 12 pmull v4.1q, v10.1d, q8 aese q2, v29.16b aesmc q2, q2 @ AES block 4k+6 - round 11 ext v10.16b, v10.16b, v10.16b, #8 aese q3, v30.16b aesmc q3, q3 @ AES block 4k+7 - round 12 aese q1, v31.16b @ AES block 4k+5 - round 13 eor v11.16b, v11.16b, q4 aese q2, v30.16b aesmc q2, q2 @ AES block 4k+6 - round 12 aese q3, v31.16b @ AES block 4k+7 - round 13 aese q0, v31.16b @ AES block 4k+4 - round 13 aese q2, v31.16b @ AES block 4k+6 - round 13 eor v11.16b, v11.16b, v10.16b .L256_enc_tail:@ TAIL ext q8, v11.16b, v11.16b, #8 @ prepare final partial tag sub r5, r4, r0 @ main_end_input_ptr is number of bytes left to process ldp r6, r7, [r0], #16 @ AES block 4k+4 - load plaintext #ifdef __ARMEB__ rev r6, r6 rev r7, r7 #endif eor r6, r6, r13 @ AES block 4k+4 - round 14 low eor r7, r7, r14 @ AES block 4k+4 - round 14 high cmp r5, #48 fmov d4, r6 @ AES block 4k+4 - mov low fmov v4.d[1], r7 @ AES block 4k+4 - mov high eor q5, q4, q0 @ AES block 4k+4 - result bgt .L256_enc_blocks_more_than_3 cmp r5, #32 mov q3, q2 movi v11.8b, #0 movi q9, #0 sub r12, r12, #1 mov q2, q1 movi v10.8b, #0 bgt .L256_enc_blocks_more_than_2 mov q3, q1 sub r12, r12, #1 cmp r5, #16 bgt .L256_enc_blocks_more_than_1 sub r12, r12, #1 b .L256_enc_blocks_less_than_1 .L256_enc_blocks_more_than_3:@ blocks left > 3 st1 { q5}, [r2], #16 @ AES final-3 block - store result ldp r6, r7, [r0], #16 @ AES final-2 block - load input low & high #ifdef __ARMEB__ rev r6, r6 rev r7, r7 #endif rev64 q4, q5 @ GHASH final-3 block eor r6, r6, r13 @ AES final-2 block - round 14 low eor q4, q4, q8 @ feed in partial tag eor r7, r7, r14 @ AES final-2 block - round 14 high mov d22, v4.d[1] @ GHASH final-3 block - mid fmov d5, r6 @ AES final-2 block - mov low fmov v5.d[1], r7 @ AES final-2 block - mov high eor v22.8b, v22.8b, q4 @ GHASH final-3 block - mid movi q8, #0 @ suppress further partial tag feed in mov d10, v17.d[1] @ GHASH final-3 block - mid pmull v11.1q, q4, v15.1d @ GHASH final-3 block - low pmull2 v9.1q, q4, v15.2d @ GHASH final-3 block - high pmull v10.1q, v22.1d, v10.1d @ GHASH final-3 block - mid eor q5, q5, q1 @ AES final-2 block - result .L256_enc_blocks_more_than_2:@ blocks left > 2 st1 { q5}, [r2], #16 @ AES final-2 block - store result ldp r6, r7, [r0], #16 @ AES final-1 block - load input low & high #ifdef __ARMEB__ rev r6, r6 rev r7, r7 #endif rev64 q4, q5 @ GHASH final-2 block eor r6, r6, r13 @ AES final-1 block - round 14 low eor q4, q4, q8 @ feed in partial tag fmov d5, r6 @ AES final-1 block - mov low eor r7, r7, r14 @ AES final-1 block - round 14 high fmov v5.d[1], r7 @ AES final-1 block - mov high movi q8, #0 @ suppress further partial tag feed in pmull2 v20.1q, q4, v14.2d @ GHASH final-2 block - high mov d22, v4.d[1] @ GHASH final-2 block - mid pmull v21.1q, q4, v14.1d @ GHASH final-2 block - low eor v22.8b, v22.8b, q4 @ GHASH final-2 block - mid eor q5, q5, q2 @ AES final-1 block - result eor q9, q9, v20.16b @ GHASH final-2 block - high pmull v22.1q, v22.1d, v17.1d @ GHASH final-2 block - mid eor v11.16b, v11.16b, v21.16b @ GHASH final-2 block - low eor 
v10.16b, v10.16b, v22.16b @ GHASH final-2 block - mid .L256_enc_blocks_more_than_1:@ blocks left > 1 st1 { q5}, [r2], #16 @ AES final-1 block - store result rev64 q4, q5 @ GHASH final-1 block ldp r6, r7, [r0], #16 @ AES final block - load input low & high #ifdef __ARMEB__ rev r6, r6 rev r7, r7 #endif eor q4, q4, q8 @ feed in partial tag movi q8, #0 @ suppress further partial tag feed in eor r6, r6, r13 @ AES final block - round 14 low mov d22, v4.d[1] @ GHASH final-1 block - mid pmull2 v20.1q, q4, v13.2d @ GHASH final-1 block - high eor r7, r7, r14 @ AES final block - round 14 high eor v22.8b, v22.8b, q4 @ GHASH final-1 block - mid eor q9, q9, v20.16b @ GHASH final-1 block - high ins v22.d[1], v22.d[0] @ GHASH final-1 block - mid fmov d5, r6 @ AES final block - mov low fmov v5.d[1], r7 @ AES final block - mov high pmull2 v22.1q, v22.2d, v16.2d @ GHASH final-1 block - mid pmull v21.1q, q4, v13.1d @ GHASH final-1 block - low eor q5, q5, q3 @ AES final block - result eor v10.16b, v10.16b, v22.16b @ GHASH final-1 block - mid eor v11.16b, v11.16b, v21.16b @ GHASH final-1 block - low .L256_enc_blocks_less_than_1:@ blocks left <= 1 and r1, r1, #127 @ bit_length %= 128 mvn r13, xzr @ rk14_l = 0xffffffffffffffff sub r1, r1, #128 @ bit_length -= 128 neg r1, r1 @ bit_length = 128 - #bits in input (in range [1,128]) ld1 { v18.16b}, [r2] @ load existing bytes where the possibly partial last block is to be stored mvn r14, xzr @ rk14_h = 0xffffffffffffffff and r1, r1, #127 @ bit_length %= 128 lsr r14, r14, r1 @ rk14_h is mask for top 64b of last block cmp r1, #64 csel r6, r13, r14, lt csel r7, r14, xzr, lt fmov d0, r6 @ ctr0b is mask for last block fmov v0.d[1], r7 and q5, q5, q0 @ possibly partial last block has zeroes in highest bits rev64 q4, q5 @ GHASH final block eor q4, q4, q8 @ feed in partial tag bif q5, v18.16b, q0 @ insert existing bytes in top end of result before storing pmull2 v20.1q, q4, v12.2d @ GHASH final block - high mov d8, v4.d[1] @ GHASH final block - mid #ifndef __ARMEB__ rev r9, r12 #else mov r9, r12 #endif pmull v21.1q, q4, v12.1d @ GHASH final block - low eor q9, q9, v20.16b @ GHASH final block - high eor q8, q8, q4 @ GHASH final block - mid pmull v8.1q, q8, v16.1d @ GHASH final block - mid eor v11.16b, v11.16b, v21.16b @ GHASH final block - low eor v10.16b, v10.16b, q8 @ GHASH final block - mid movi q8, #0xc2 eor q4, v11.16b, q9 @ MODULO - karatsuba tidy up shl d8, d8, #56 @ mod_constant eor v10.16b, v10.16b, q4 @ MODULO - karatsuba tidy up pmull v7.1q, q9, q8 @ MODULO - top 64b align with mid ext q9, q9, q9, #8 @ MODULO - other top alignment eor v10.16b, v10.16b, q7 @ MODULO - fold into mid eor v10.16b, v10.16b, q9 @ MODULO - fold into mid pmull v9.1q, v10.1d, q8 @ MODULO - mid 64b align with low ext v10.16b, v10.16b, v10.16b, #8 @ MODULO - other mid alignment str r9, [r16, #12] @ store the updated counter st1 { q5}, [r2] @ store all 16B eor v11.16b, v11.16b, q9 @ MODULO - fold into low eor v11.16b, v11.16b, v10.16b @ MODULO - fold into low ext v11.16b, v11.16b, v11.16b, #8 rev64 v11.16b, v11.16b mov r0, r15 st1 { v11.16b }, [r3] ldp r21, r22, [sp, #16] ldp r23, r24, [sp, #32] ldp d8, d9, [sp, #48] ldp d10, d11, [sp, #64] ldp d12, d13, [sp, #80] ldp d14, d15, [sp, #96] ldp r19, r20, [sp], #112 RET .L256_enc_ret: mov r0, #0x0 RET .size aes_gcm_enc_256_kernel,.-aes_gcm_enc_256_kernel .globl aes_gcm_dec_256_kernel .type aes_gcm_dec_256_kernel,%function .align 4 aes_gcm_dec_256_kernel: cbz r1, .L256_dec_ret stp r19, r20, [sp, #-112]! 
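@ Note (added): calling convention for aes_gcm_dec_256_kernel, as inferred from the surrounding code rather than stated by the original source:
@   r0 - input (ciphertext) pointer              r1 - input length in bits
@   r2 - output (plaintext) pointer              r3 - Xi/GHASH state, with the powers of H preloaded at fixed offsets (h1 at #32, h2 at #64, h3 at #80, h4 at #112); the updated tag is stored back here before returning
@   r4 - 16-byte counter block, updated in place before returning
@   r5 - expanded AES-256 key schedule (rk0..rk14, last round key loaded from offset #224)
@ The kernel spills the callee-saved registers r19-r24 and d8-d15 into a 112-byte stack frame and returns the input byte length in r0 (or 0 when the bit length is zero).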
mov r16, r4 mov r8, r5 stp r21, r22, [sp, #16] stp r23, r24, [sp, #32] stp d8, d9, [sp, #48] stp d10, d11, [sp, #64] stp d12, d13, [sp, #80] stp d14, d15, [sp, #96] lsr r5, r1, #3 @ byte_len mov r15, r5 ldp r10, r11, [r16] @ ctr96_b64, ctr96_t32 #ifdef __ARMEB__ rev r10, r10 rev r11, r11 #endif ldp r13, r14, [r8, #224] @ load rk14 #ifdef __ARMEB__ ror r14, r14, #32 ror r13, r13, #32 #endif ld1 {v18.4s}, [r8], #16 @ load rk0 sub r5, r5, #1 @ byte_len - 1 ld1 {v19.4s}, [r8], #16 @ load rk1 and r5, r5, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail) add r4, r0, r1, lsr #3 @ end_input_ptr ld1 {v20.4s}, [r8], #16 @ load rk2 lsr r12, r11, #32 ld1 {v21.4s}, [r8], #16 @ load rk3 orr r11, r11, r11 ld1 {v22.4s}, [r8], #16 @ load rk4 add r5, r5, r0 rev r12, r12 @ rev_ctr32 add r12, r12, #1 @ increment rev_ctr32 fmov d3, r10 @ CTR block 3 rev r9, r12 @ CTR block 1 add r12, r12, #1 @ CTR block 1 fmov d1, r10 @ CTR block 1 orr r9, r11, r9, lsl #32 @ CTR block 1 ld1 { q0}, [r16] @ special case vector load initial counter so we can start first AES block as quickly as possible fmov v1.d[1], r9 @ CTR block 1 rev r9, r12 @ CTR block 2 add r12, r12, #1 @ CTR block 2 fmov d2, r10 @ CTR block 2 orr r9, r11, r9, lsl #32 @ CTR block 2 fmov v2.d[1], r9 @ CTR block 2 rev r9, r12 @ CTR block 3 orr r9, r11, r9, lsl #32 @ CTR block 3 ld1 {v23.4s}, [r8], #16 @ load rk5 fmov v3.d[1], r9 @ CTR block 3 add r12, r12, #1 @ CTR block 3 ld1 {v24.4s}, [r8], #16 @ load rk6 ld1 {v25.4s}, [r8], #16 @ load rk7 ld1 {v26.4s}, [r8], #16 @ load rk8 aese q0, v18.16b aesmc q0, q0 @ AES block 0 - round 0 ldr q14, [r3, #80] @ load h3l | h3h #ifndef __ARMEB__ ext v14.16b, v14.16b, v14.16b, #8 #endif aese q3, v18.16b aesmc q3, q3 @ AES block 3 - round 0 ldr q15, [r3, #112] @ load h4l | h4h #ifndef __ARMEB__ ext v15.16b, v15.16b, v15.16b, #8 #endif aese q1, v18.16b aesmc q1, q1 @ AES block 1 - round 0 ldr q13, [r3, #64] @ load h2l | h2h #ifndef __ARMEB__ ext v13.16b, v13.16b, v13.16b, #8 #endif aese q2, v18.16b aesmc q2, q2 @ AES block 2 - round 0 ld1 {v27.4s}, [r8], #16 @ load rk9 aese q0, v19.16b aesmc q0, q0 @ AES block 0 - round 1 aese q1, v19.16b aesmc q1, q1 @ AES block 1 - round 1 ld1 { v11.16b}, [r3] ext v11.16b, v11.16b, v11.16b, #8 rev64 v11.16b, v11.16b aese q2, v19.16b aesmc q2, q2 @ AES block 2 - round 1 ld1 {v28.4s}, [r8], #16 @ load rk10 aese q3, v19.16b aesmc q3, q3 @ AES block 3 - round 1 ld1 {v29.4s}, [r8], #16 @ load rk11 aese q0, v20.16b aesmc q0, q0 @ AES block 0 - round 2 ldr q12, [r3, #32] @ load h1l | h1h #ifndef __ARMEB__ ext v12.16b, v12.16b, v12.16b, #8 #endif aese q2, v20.16b aesmc q2, q2 @ AES block 2 - round 2 ld1 {v30.4s}, [r8], #16 @ load rk12 aese q3, v20.16b aesmc q3, q3 @ AES block 3 - round 2 aese q0, v21.16b aesmc q0, q0 @ AES block 0 - round 3 aese q1, v20.16b aesmc q1, q1 @ AES block 1 - round 2 aese q3, v21.16b aesmc q3, q3 @ AES block 3 - round 3 aese q0, v22.16b aesmc q0, q0 @ AES block 0 - round 4 cmp r0, r5 @ check if we have <= 4 blocks aese q2, v21.16b aesmc q2, q2 @ AES block 2 - round 3 aese q1, v21.16b aesmc q1, q1 @ AES block 1 - round 3 aese q3, v22.16b aesmc q3, q3 @ AES block 3 - round 4 aese q2, v22.16b aesmc q2, q2 @ AES block 2 - round 4 aese q1, v22.16b aesmc q1, q1 @ AES block 1 - round 4 aese q3, v23.16b aesmc q3, q3 @ AES block 3 - round 5 aese q0, v23.16b aesmc q0, q0 @ AES block 0 - round 5 aese q1, v23.16b aesmc q1, q1 @ AES block 1 - round 5 aese q2, v23.16b aesmc q2, q2 @ AES block 2 - round 5 aese q0, v24.16b aesmc 
q0, q0 @ AES block 0 - round 6 aese q3, v24.16b aesmc q3, q3 @ AES block 3 - round 6 aese q1, v24.16b aesmc q1, q1 @ AES block 1 - round 6 aese q2, v24.16b aesmc q2, q2 @ AES block 2 - round 6 aese q0, v25.16b aesmc q0, q0 @ AES block 0 - round 7 aese q1, v25.16b aesmc q1, q1 @ AES block 1 - round 7 aese q3, v25.16b aesmc q3, q3 @ AES block 3 - round 7 aese q0, v26.16b aesmc q0, q0 @ AES block 0 - round 8 aese q2, v25.16b aesmc q2, q2 @ AES block 2 - round 7 aese q3, v26.16b aesmc q3, q3 @ AES block 3 - round 8 aese q1, v26.16b aesmc q1, q1 @ AES block 1 - round 8 aese q0, v27.16b aesmc q0, q0 @ AES block 0 - round 9 aese q2, v26.16b aesmc q2, q2 @ AES block 2 - round 8 ld1 {v31.4s}, [r8], #16 @ load rk13 aese q1, v27.16b aesmc q1, q1 @ AES block 1 - round 9 aese q0, v28.16b aesmc q0, q0 @ AES block 0 - round 10 aese q3, v27.16b aesmc q3, q3 @ AES block 3 - round 9 aese q1, v28.16b aesmc q1, q1 @ AES block 1 - round 10 aese q2, v27.16b aesmc q2, q2 @ AES block 2 - round 9 aese q3, v28.16b aesmc q3, q3 @ AES block 3 - round 10 aese q0, v29.16b aesmc q0, q0 @ AES block 0 - round 11 aese q2, v28.16b aesmc q2, q2 @ AES block 2 - round 10 aese q3, v29.16b aesmc q3, q3 @ AES block 3 - round 11 aese q1, v29.16b aesmc q1, q1 @ AES block 1 - round 11 aese q2, v29.16b aesmc q2, q2 @ AES block 2 - round 11 trn1 q9, v14.2d, v15.2d @ h4h | h3h trn2 v17.2d, v14.2d, v15.2d @ h4l | h3l trn1 q8, v12.2d, v13.2d @ h2h | h1h trn2 v16.2d, v12.2d, v13.2d @ h2l | h1l aese q1, v30.16b aesmc q1, q1 @ AES block 1 - round 12 aese q0, v30.16b aesmc q0, q0 @ AES block 0 - round 12 aese q2, v30.16b aesmc q2, q2 @ AES block 2 - round 12 aese q3, v30.16b aesmc q3, q3 @ AES block 3 - round 12 eor v17.16b, v17.16b, q9 @ h4k | h3k aese q1, v31.16b @ AES block 1 - round 13 aese q2, v31.16b @ AES block 2 - round 13 eor v16.16b, v16.16b, q8 @ h2k | h1k aese q3, v31.16b @ AES block 3 - round 13 aese q0, v31.16b @ AES block 0 - round 13 bge .L256_dec_tail @ handle tail ld1 {q4, q5}, [r0], #32 @ AES block 0,1 - load ciphertext rev r9, r12 @ CTR block 4 eor q0, q4, q0 @ AES block 0 - result eor q1, q5, q1 @ AES block 1 - result rev64 q5, q5 @ GHASH block 1 ld1 {q6}, [r0], #16 @ AES block 2 - load ciphertext mov r7, v0.d[1] @ AES block 0 - mov high mov r6, v0.d[0] @ AES block 0 - mov low rev64 q4, q4 @ GHASH block 0 add r12, r12, #1 @ CTR block 4 fmov d0, r10 @ CTR block 4 orr r9, r11, r9, lsl #32 @ CTR block 4 fmov v0.d[1], r9 @ CTR block 4 rev r9, r12 @ CTR block 5 add r12, r12, #1 @ CTR block 5 mov r19, v1.d[0] @ AES block 1 - mov low orr r9, r11, r9, lsl #32 @ CTR block 5 mov r20, v1.d[1] @ AES block 1 - mov high eor r7, r7, r14 @ AES block 0 - round 14 high #ifdef __ARMEB__ rev r7, r7 #endif eor r6, r6, r13 @ AES block 0 - round 14 low #ifdef __ARMEB__ rev r6, r6 #endif stp r6, r7, [r2], #16 @ AES block 0 - store result fmov d1, r10 @ CTR block 5 ld1 {q7}, [r0], #16 @ AES block 3 - load ciphertext fmov v1.d[1], r9 @ CTR block 5 rev r9, r12 @ CTR block 6 add r12, r12, #1 @ CTR block 6 eor r19, r19, r13 @ AES block 1 - round 14 low #ifdef __ARMEB__ rev r19, r19 #endif orr r9, r11, r9, lsl #32 @ CTR block 6 eor r20, r20, r14 @ AES block 1 - round 14 high #ifdef __ARMEB__ rev r20, r20 #endif stp r19, r20, [r2], #16 @ AES block 1 - store result eor q2, q6, q2 @ AES block 2 - result cmp r0, r5 @ check if we have <= 8 blocks bge .L256_dec_prepretail @ do prepretail .L256_dec_main_loop:@ main loop start mov r21, v2.d[0] @ AES block 4k+2 - mov low ext v11.16b, v11.16b, v11.16b, #8 @ PRE 0 eor q3, q7, q3 @ AES block 4k+3 - result aese 
q0, v18.16b aesmc q0, q0 @ AES block 4k+4 - round 0 mov r22, v2.d[1] @ AES block 4k+2 - mov high aese q1, v18.16b aesmc q1, q1 @ AES block 4k+5 - round 0 fmov d2, r10 @ CTR block 4k+6 fmov v2.d[1], r9 @ CTR block 4k+6 eor q4, q4, v11.16b @ PRE 1 rev r9, r12 @ CTR block 4k+7 aese q0, v19.16b aesmc q0, q0 @ AES block 4k+4 - round 1 mov r24, v3.d[1] @ AES block 4k+3 - mov high aese q1, v19.16b aesmc q1, q1 @ AES block 4k+5 - round 1 mov r23, v3.d[0] @ AES block 4k+3 - mov low pmull2 v9.1q, q4, v15.2d @ GHASH block 4k - high mov d8, v4.d[1] @ GHASH block 4k - mid fmov d3, r10 @ CTR block 4k+7 aese q0, v20.16b aesmc q0, q0 @ AES block 4k+4 - round 2 orr r9, r11, r9, lsl #32 @ CTR block 4k+7 aese q2, v18.16b aesmc q2, q2 @ AES block 4k+6 - round 0 fmov v3.d[1], r9 @ CTR block 4k+7 aese q1, v20.16b aesmc q1, q1 @ AES block 4k+5 - round 2 eor q8, q8, q4 @ GHASH block 4k - mid aese q0, v21.16b aesmc q0, q0 @ AES block 4k+4 - round 3 eor r22, r22, r14 @ AES block 4k+2 - round 14 high #ifdef __ARMEB__ rev r22, r22 #endif aese q2, v19.16b aesmc q2, q2 @ AES block 4k+6 - round 1 mov d10, v17.d[1] @ GHASH block 4k - mid aese q1, v21.16b aesmc q1, q1 @ AES block 4k+5 - round 3 rev64 q6, q6 @ GHASH block 4k+2 aese q3, v18.16b aesmc q3, q3 @ AES block 4k+7 - round 0 eor r21, r21, r13 @ AES block 4k+2 - round 14 low #ifdef __ARMEB__ rev r21, r21 #endif aese q2, v20.16b aesmc q2, q2 @ AES block 4k+6 - round 2 stp r21, r22, [r2], #16 @ AES block 4k+2 - store result pmull v11.1q, q4, v15.1d @ GHASH block 4k - low pmull2 v4.1q, q5, v14.2d @ GHASH block 4k+1 - high aese q2, v21.16b aesmc q2, q2 @ AES block 4k+6 - round 3 rev64 q7, q7 @ GHASH block 4k+3 pmull v10.1q, q8, v10.1d @ GHASH block 4k - mid eor r23, r23, r13 @ AES block 4k+3 - round 14 low #ifdef __ARMEB__ rev r23, r23 #endif pmull v8.1q, q5, v14.1d @ GHASH block 4k+1 - low eor r24, r24, r14 @ AES block 4k+3 - round 14 high #ifdef __ARMEB__ rev r24, r24 #endif eor q9, q9, q4 @ GHASH block 4k+1 - high aese q2, v22.16b aesmc q2, q2 @ AES block 4k+6 - round 4 aese q3, v19.16b aesmc q3, q3 @ AES block 4k+7 - round 1 mov d4, v5.d[1] @ GHASH block 4k+1 - mid aese q0, v22.16b aesmc q0, q0 @ AES block 4k+4 - round 4 eor v11.16b, v11.16b, q8 @ GHASH block 4k+1 - low aese q2, v23.16b aesmc q2, q2 @ AES block 4k+6 - round 5 add r12, r12, #1 @ CTR block 4k+7 aese q3, v20.16b aesmc q3, q3 @ AES block 4k+7 - round 2 mov d8, v6.d[1] @ GHASH block 4k+2 - mid aese q1, v22.16b aesmc q1, q1 @ AES block 4k+5 - round 4 eor q4, q4, q5 @ GHASH block 4k+1 - mid pmull v5.1q, q6, v13.1d @ GHASH block 4k+2 - low aese q3, v21.16b aesmc q3, q3 @ AES block 4k+7 - round 3 eor q8, q8, q6 @ GHASH block 4k+2 - mid aese q1, v23.16b aesmc q1, q1 @ AES block 4k+5 - round 5 aese q0, v23.16b aesmc q0, q0 @ AES block 4k+4 - round 5 eor v11.16b, v11.16b, q5 @ GHASH block 4k+2 - low pmull v4.1q, q4, v17.1d @ GHASH block 4k+1 - mid rev r9, r12 @ CTR block 4k+8 aese q1, v24.16b aesmc q1, q1 @ AES block 4k+5 - round 6 ins v8.d[1], v8.d[0] @ GHASH block 4k+2 - mid aese q0, v24.16b aesmc q0, q0 @ AES block 4k+4 - round 6 add r12, r12, #1 @ CTR block 4k+8 aese q3, v22.16b aesmc q3, q3 @ AES block 4k+7 - round 4 aese q1, v25.16b aesmc q1, q1 @ AES block 4k+5 - round 7 eor v10.16b, v10.16b, q4 @ GHASH block 4k+1 - mid aese q0, v25.16b aesmc q0, q0 @ AES block 4k+4 - round 7 pmull2 v4.1q, q6, v13.2d @ GHASH block 4k+2 - high mov d6, v7.d[1] @ GHASH block 4k+3 - mid aese q3, v23.16b aesmc q3, q3 @ AES block 4k+7 - round 5 pmull2 v8.1q, q8, v16.2d @ GHASH block 4k+2 - mid aese q0, v26.16b aesmc q0, q0 @ 
AES block 4k+4 - round 8 eor q9, q9, q4 @ GHASH block 4k+2 - high aese q3, v24.16b aesmc q3, q3 @ AES block 4k+7 - round 6 pmull v4.1q, q7, v12.1d @ GHASH block 4k+3 - low orr r9, r11, r9, lsl #32 @ CTR block 4k+8 eor v10.16b, v10.16b, q8 @ GHASH block 4k+2 - mid pmull2 v5.1q, q7, v12.2d @ GHASH block 4k+3 - high aese q0, v27.16b aesmc q0, q0 @ AES block 4k+4 - round 9 eor q6, q6, q7 @ GHASH block 4k+3 - mid aese q1, v26.16b aesmc q1, q1 @ AES block 4k+5 - round 8 aese q2, v24.16b aesmc q2, q2 @ AES block 4k+6 - round 6 eor q9, q9, q5 @ GHASH block 4k+3 - high aese q0, v28.16b aesmc q0, q0 @ AES block 4k+4 - round 10 pmull v6.1q, q6, v16.1d @ GHASH block 4k+3 - mid movi q8, #0xc2 aese q2, v25.16b aesmc q2, q2 @ AES block 4k+6 - round 7 eor v11.16b, v11.16b, q4 @ GHASH block 4k+3 - low aese q0, v29.16b aesmc q0, q0 @ AES block 4k+4 - round 11 aese q3, v25.16b aesmc q3, q3 @ AES block 4k+7 - round 7 shl d8, d8, #56 @ mod_constant aese q2, v26.16b aesmc q2, q2 @ AES block 4k+6 - round 8 eor v10.16b, v10.16b, q6 @ GHASH block 4k+3 - mid aese q0, v30.16b aesmc q0, q0 @ AES block 4k+4 - round 12 pmull v7.1q, q9, q8 @ MODULO - top 64b align with mid eor q6, v11.16b, q9 @ MODULO - karatsuba tidy up aese q1, v27.16b aesmc q1, q1 @ AES block 4k+5 - round 9 ld1 {q4}, [r0], #16 @ AES block 4k+4 - load ciphertext aese q0, v31.16b @ AES block 4k+4 - round 13 ext q9, q9, q9, #8 @ MODULO - other top alignment aese q1, v28.16b aesmc q1, q1 @ AES block 4k+5 - round 10 eor v10.16b, v10.16b, q6 @ MODULO - karatsuba tidy up aese q2, v27.16b aesmc q2, q2 @ AES block 4k+6 - round 9 ld1 {q5}, [r0], #16 @ AES block 4k+5 - load ciphertext aese q3, v26.16b aesmc q3, q3 @ AES block 4k+7 - round 8 eor q0, q4, q0 @ AES block 4k+4 - result aese q1, v29.16b aesmc q1, q1 @ AES block 4k+5 - round 11 stp r23, r24, [r2], #16 @ AES block 4k+3 - store result aese q2, v28.16b aesmc q2, q2 @ AES block 4k+6 - round 10 eor v10.16b, v10.16b, q7 @ MODULO - fold into mid aese q3, v27.16b aesmc q3, q3 @ AES block 4k+7 - round 9 ld1 {q6}, [r0], #16 @ AES block 4k+6 - load ciphertext aese q1, v30.16b aesmc q1, q1 @ AES block 4k+5 - round 12 ld1 {q7}, [r0], #16 @ AES block 4k+7 - load ciphertext aese q2, v29.16b aesmc q2, q2 @ AES block 4k+6 - round 11 mov r7, v0.d[1] @ AES block 4k+4 - mov high aese q3, v28.16b aesmc q3, q3 @ AES block 4k+7 - round 10 eor v10.16b, v10.16b, q9 @ MODULO - fold into mid aese q1, v31.16b @ AES block 4k+5 - round 13 mov r6, v0.d[0] @ AES block 4k+4 - mov low aese q2, v30.16b aesmc q2, q2 @ AES block 4k+6 - round 12 fmov d0, r10 @ CTR block 4k+8 aese q3, v29.16b aesmc q3, q3 @ AES block 4k+7 - round 11 fmov v0.d[1], r9 @ CTR block 4k+8 pmull v8.1q, v10.1d, q8 @ MODULO - mid 64b align with low eor q1, q5, q1 @ AES block 4k+5 - result rev r9, r12 @ CTR block 4k+9 aese q2, v31.16b @ AES block 4k+6 - round 13 orr r9, r11, r9, lsl #32 @ CTR block 4k+9 cmp r0, r5 @ .LOOP CONTROL add r12, r12, #1 @ CTR block 4k+9 eor r6, r6, r13 @ AES block 4k+4 - round 14 low #ifdef __ARMEB__ rev r6, r6 #endif eor r7, r7, r14 @ AES block 4k+4 - round 14 high #ifdef __ARMEB__ rev r7, r7 #endif mov r20, v1.d[1] @ AES block 4k+5 - mov high eor q2, q6, q2 @ AES block 4k+6 - result eor v11.16b, v11.16b, q8 @ MODULO - fold into low aese q3, v30.16b aesmc q3, q3 @ AES block 4k+7 - round 12 mov r19, v1.d[0] @ AES block 4k+5 - mov low fmov d1, r10 @ CTR block 4k+9 ext v10.16b, v10.16b, v10.16b, #8 @ MODULO - other mid alignment fmov v1.d[1], r9 @ CTR block 4k+9 rev r9, r12 @ CTR block 4k+10 add r12, r12, #1 @ CTR block 4k+10 aese q3, 
v31.16b @ AES block 4k+7 - round 13 orr r9, r11, r9, lsl #32 @ CTR block 4k+10 rev64 q5, q5 @ GHASH block 4k+5 eor r20, r20, r14 @ AES block 4k+5 - round 14 high #ifdef __ARMEB__ rev r20, r20 #endif stp r6, r7, [r2], #16 @ AES block 4k+4 - store result eor r19, r19, r13 @ AES block 4k+5 - round 14 low #ifdef __ARMEB__ rev r19, r19 #endif stp r19, r20, [r2], #16 @ AES block 4k+5 - store result rev64 q4, q4 @ GHASH block 4k+4 eor v11.16b, v11.16b, v10.16b @ MODULO - fold into low blt .L256_dec_main_loop .L256_dec_prepretail:@ PREPRETAIL ext v11.16b, v11.16b, v11.16b, #8 @ PRE 0 mov r21, v2.d[0] @ AES block 4k+2 - mov low eor q3, q7, q3 @ AES block 4k+3 - result aese q0, v18.16b aesmc q0, q0 @ AES block 4k+4 - round 0 mov r22, v2.d[1] @ AES block 4k+2 - mov high aese q1, v18.16b aesmc q1, q1 @ AES block 4k+5 - round 0 fmov d2, r10 @ CTR block 4k+6 fmov v2.d[1], r9 @ CTR block 4k+6 rev r9, r12 @ CTR block 4k+7 eor q4, q4, v11.16b @ PRE 1 rev64 q6, q6 @ GHASH block 4k+2 orr r9, r11, r9, lsl #32 @ CTR block 4k+7 mov r23, v3.d[0] @ AES block 4k+3 - mov low aese q1, v19.16b aesmc q1, q1 @ AES block 4k+5 - round 1 mov r24, v3.d[1] @ AES block 4k+3 - mov high pmull v11.1q, q4, v15.1d @ GHASH block 4k - low mov d8, v4.d[1] @ GHASH block 4k - mid fmov d3, r10 @ CTR block 4k+7 pmull2 v9.1q, q4, v15.2d @ GHASH block 4k - high fmov v3.d[1], r9 @ CTR block 4k+7 aese q2, v18.16b aesmc q2, q2 @ AES block 4k+6 - round 0 mov d10, v17.d[1] @ GHASH block 4k - mid aese q0, v19.16b aesmc q0, q0 @ AES block 4k+4 - round 1 eor q8, q8, q4 @ GHASH block 4k - mid pmull2 v4.1q, q5, v14.2d @ GHASH block 4k+1 - high aese q2, v19.16b aesmc q2, q2 @ AES block 4k+6 - round 1 rev64 q7, q7 @ GHASH block 4k+3 aese q3, v18.16b aesmc q3, q3 @ AES block 4k+7 - round 0 pmull v10.1q, q8, v10.1d @ GHASH block 4k - mid eor q9, q9, q4 @ GHASH block 4k+1 - high pmull v8.1q, q5, v14.1d @ GHASH block 4k+1 - low aese q3, v19.16b aesmc q3, q3 @ AES block 4k+7 - round 1 mov d4, v5.d[1] @ GHASH block 4k+1 - mid aese q0, v20.16b aesmc q0, q0 @ AES block 4k+4 - round 2 aese q1, v20.16b aesmc q1, q1 @ AES block 4k+5 - round 2 eor v11.16b, v11.16b, q8 @ GHASH block 4k+1 - low aese q2, v20.16b aesmc q2, q2 @ AES block 4k+6 - round 2 aese q0, v21.16b aesmc q0, q0 @ AES block 4k+4 - round 3 mov d8, v6.d[1] @ GHASH block 4k+2 - mid aese q3, v20.16b aesmc q3, q3 @ AES block 4k+7 - round 2 eor q4, q4, q5 @ GHASH block 4k+1 - mid pmull v5.1q, q6, v13.1d @ GHASH block 4k+2 - low aese q0, v22.16b aesmc q0, q0 @ AES block 4k+4 - round 4 aese q3, v21.16b aesmc q3, q3 @ AES block 4k+7 - round 3 eor q8, q8, q6 @ GHASH block 4k+2 - mid pmull v4.1q, q4, v17.1d @ GHASH block 4k+1 - mid aese q0, v23.16b aesmc q0, q0 @ AES block 4k+4 - round 5 eor v11.16b, v11.16b, q5 @ GHASH block 4k+2 - low aese q3, v22.16b aesmc q3, q3 @ AES block 4k+7 - round 4 pmull2 v5.1q, q7, v12.2d @ GHASH block 4k+3 - high eor v10.16b, v10.16b, q4 @ GHASH block 4k+1 - mid pmull2 v4.1q, q6, v13.2d @ GHASH block 4k+2 - high aese q3, v23.16b aesmc q3, q3 @ AES block 4k+7 - round 5 ins v8.d[1], v8.d[0] @ GHASH block 4k+2 - mid aese q2, v21.16b aesmc q2, q2 @ AES block 4k+6 - round 3 aese q1, v21.16b aesmc q1, q1 @ AES block 4k+5 - round 3 eor q9, q9, q4 @ GHASH block 4k+2 - high pmull v4.1q, q7, v12.1d @ GHASH block 4k+3 - low aese q2, v22.16b aesmc q2, q2 @ AES block 4k+6 - round 4 mov d6, v7.d[1] @ GHASH block 4k+3 - mid aese q1, v22.16b aesmc q1, q1 @ AES block 4k+5 - round 4 pmull2 v8.1q, q8, v16.2d @ GHASH block 4k+2 - mid aese q2, v23.16b aesmc q2, q2 @ AES block 4k+6 - round 5 eor q6, 
q6, q7 @ GHASH block 4k+3 - mid aese q1, v23.16b aesmc q1, q1 @ AES block 4k+5 - round 5 aese q3, v24.16b aesmc q3, q3 @ AES block 4k+7 - round 6 eor v10.16b, v10.16b, q8 @ GHASH block 4k+2 - mid aese q2, v24.16b aesmc q2, q2 @ AES block 4k+6 - round 6 aese q0, v24.16b aesmc q0, q0 @ AES block 4k+4 - round 6 movi q8, #0xc2 aese q1, v24.16b aesmc q1, q1 @ AES block 4k+5 - round 6 eor v11.16b, v11.16b, q4 @ GHASH block 4k+3 - low pmull v6.1q, q6, v16.1d @ GHASH block 4k+3 - mid aese q3, v25.16b aesmc q3, q3 @ AES block 4k+7 - round 7 eor q9, q9, q5 @ GHASH block 4k+3 - high aese q1, v25.16b aesmc q1, q1 @ AES block 4k+5 - round 7 aese q0, v25.16b aesmc q0, q0 @ AES block 4k+4 - round 7 eor v10.16b, v10.16b, q6 @ GHASH block 4k+3 - mid aese q3, v26.16b aesmc q3, q3 @ AES block 4k+7 - round 8 aese q2, v25.16b aesmc q2, q2 @ AES block 4k+6 - round 7 eor q6, v11.16b, q9 @ MODULO - karatsuba tidy up aese q1, v26.16b aesmc q1, q1 @ AES block 4k+5 - round 8 aese q0, v26.16b aesmc q0, q0 @ AES block 4k+4 - round 8 shl d8, d8, #56 @ mod_constant aese q2, v26.16b aesmc q2, q2 @ AES block 4k+6 - round 8 aese q1, v27.16b aesmc q1, q1 @ AES block 4k+5 - round 9 eor v10.16b, v10.16b, q6 @ MODULO - karatsuba tidy up pmull v7.1q, q9, q8 @ MODULO - top 64b align with mid aese q2, v27.16b aesmc q2, q2 @ AES block 4k+6 - round 9 ext q9, q9, q9, #8 @ MODULO - other top alignment aese q3, v27.16b aesmc q3, q3 @ AES block 4k+7 - round 9 aese q0, v27.16b aesmc q0, q0 @ AES block 4k+4 - round 9 eor v10.16b, v10.16b, q7 @ MODULO - fold into mid aese q2, v28.16b aesmc q2, q2 @ AES block 4k+6 - round 10 aese q3, v28.16b aesmc q3, q3 @ AES block 4k+7 - round 10 aese q0, v28.16b aesmc q0, q0 @ AES block 4k+4 - round 10 eor r22, r22, r14 @ AES block 4k+2 - round 14 high #ifdef __ARMEB__ rev r22, r22 #endif aese q1, v28.16b aesmc q1, q1 @ AES block 4k+5 - round 10 eor r23, r23, r13 @ AES block 4k+3 - round 14 low #ifdef __ARMEB__ rev r23, r23 #endif aese q2, v29.16b aesmc q2, q2 @ AES block 4k+6 - round 11 eor v10.16b, v10.16b, q9 @ MODULO - fold into mid aese q0, v29.16b aesmc q0, q0 @ AES block 4k+4 - round 11 add r12, r12, #1 @ CTR block 4k+7 aese q1, v29.16b aesmc q1, q1 @ AES block 4k+5 - round 11 eor r21, r21, r13 @ AES block 4k+2 - round 14 low #ifdef __ARMEB__ rev r21, r21 #endif aese q2, v30.16b aesmc q2, q2 @ AES block 4k+6 - round 12 pmull v8.1q, v10.1d, q8 @ MODULO - mid 64b align with low eor r24, r24, r14 @ AES block 4k+3 - round 14 high #ifdef __ARMEB__ rev r24, r24 #endif aese q3, v29.16b aesmc q3, q3 @ AES block 4k+7 - round 11 stp r21, r22, [r2], #16 @ AES block 4k+2 - store result aese q1, v30.16b aesmc q1, q1 @ AES block 4k+5 - round 12 ext v10.16b, v10.16b, v10.16b, #8 @ MODULO - other mid alignment aese q0, v30.16b aesmc q0, q0 @ AES block 4k+4 - round 12 stp r23, r24, [r2], #16 @ AES block 4k+3 - store result aese q3, v30.16b aesmc q3, q3 @ AES block 4k+7 - round 12 eor v11.16b, v11.16b, q8 @ MODULO - fold into low aese q1, v31.16b @ AES block 4k+5 - round 13 aese q0, v31.16b @ AES block 4k+4 - round 13 aese q3, v31.16b @ AES block 4k+7 - round 13 aese q2, v31.16b @ AES block 4k+6 - round 13 eor v11.16b, v11.16b, v10.16b @ MODULO - fold into low .L256_dec_tail:@ TAIL sub r5, r4, r0 @ main_end_input_ptr is number of bytes left to process ld1 { q5}, [r0], #16 @ AES block 4k+4 - load ciphertext eor q0, q5, q0 @ AES block 4k+4 - result mov r6, v0.d[0] @ AES block 4k+4 - mov low mov r7, v0.d[1] @ AES block 4k+4 - mov high ext q8, v11.16b, v11.16b, #8 @ prepare final partial tag cmp r5, #48 eor r6, r6, 
r13 @ AES block 4k+4 - round 14 low #ifdef __ARMEB__ rev r6, r6 #endif eor r7, r7, r14 @ AES block 4k+4 - round 14 high #ifdef __ARMEB__ rev r7, r7 #endif bgt .L256_dec_blocks_more_than_3 sub r12, r12, #1 mov q3, q2 movi v10.8b, #0 movi v11.8b, #0 cmp r5, #32 movi q9, #0 mov q2, q1 bgt .L256_dec_blocks_more_than_2 sub r12, r12, #1 mov q3, q1 cmp r5, #16 bgt .L256_dec_blocks_more_than_1 sub r12, r12, #1 b .L256_dec_blocks_less_than_1 .L256_dec_blocks_more_than_3:@ blocks left > 3 rev64 q4, q5 @ GHASH final-3 block ld1 { q5}, [r0], #16 @ AES final-2 block - load ciphertext stp r6, r7, [r2], #16 @ AES final-3 block - store result mov d10, v17.d[1] @ GHASH final-3 block - mid eor q4, q4, q8 @ feed in partial tag eor q0, q5, q1 @ AES final-2 block - result mov d22, v4.d[1] @ GHASH final-3 block - mid mov r6, v0.d[0] @ AES final-2 block - mov low mov r7, v0.d[1] @ AES final-2 block - mov high eor v22.8b, v22.8b, q4 @ GHASH final-3 block - mid movi q8, #0 @ suppress further partial tag feed in pmull2 v9.1q, q4, v15.2d @ GHASH final-3 block - high pmull v10.1q, v22.1d, v10.1d @ GHASH final-3 block - mid eor r6, r6, r13 @ AES final-2 block - round 14 low #ifdef __ARMEB__ rev r6, r6 #endif pmull v11.1q, q4, v15.1d @ GHASH final-3 block - low eor r7, r7, r14 @ AES final-2 block - round 14 high #ifdef __ARMEB__ rev r7, r7 #endif .L256_dec_blocks_more_than_2:@ blocks left > 2 rev64 q4, q5 @ GHASH final-2 block ld1 { q5}, [r0], #16 @ AES final-1 block - load ciphertext eor q4, q4, q8 @ feed in partial tag stp r6, r7, [r2], #16 @ AES final-2 block - store result eor q0, q5, q2 @ AES final-1 block - result mov d22, v4.d[1] @ GHASH final-2 block - mid pmull v21.1q, q4, v14.1d @ GHASH final-2 block - low pmull2 v20.1q, q4, v14.2d @ GHASH final-2 block - high eor v22.8b, v22.8b, q4 @ GHASH final-2 block - mid mov r6, v0.d[0] @ AES final-1 block - mov low mov r7, v0.d[1] @ AES final-1 block - mov high eor v11.16b, v11.16b, v21.16b @ GHASH final-2 block - low movi q8, #0 @ suppress further partial tag feed in pmull v22.1q, v22.1d, v17.1d @ GHASH final-2 block - mid eor q9, q9, v20.16b @ GHASH final-2 block - high eor r6, r6, r13 @ AES final-1 block - round 14 low #ifdef __ARMEB__ rev r6, r6 #endif eor v10.16b, v10.16b, v22.16b @ GHASH final-2 block - mid eor r7, r7, r14 @ AES final-1 block - round 14 high #ifdef __ARMEB__ rev r7, r7 #endif .L256_dec_blocks_more_than_1:@ blocks left > 1 stp r6, r7, [r2], #16 @ AES final-1 block - store result rev64 q4, q5 @ GHASH final-1 block ld1 { q5}, [r0], #16 @ AES final block - load ciphertext eor q4, q4, q8 @ feed in partial tag movi q8, #0 @ suppress further partial tag feed in mov d22, v4.d[1] @ GHASH final-1 block - mid eor q0, q5, q3 @ AES final block - result pmull2 v20.1q, q4, v13.2d @ GHASH final-1 block - high eor v22.8b, v22.8b, q4 @ GHASH final-1 block - mid pmull v21.1q, q4, v13.1d @ GHASH final-1 block - low mov r6, v0.d[0] @ AES final block - mov low ins v22.d[1], v22.d[0] @ GHASH final-1 block - mid mov r7, v0.d[1] @ AES final block - mov high pmull2 v22.1q, v22.2d, v16.2d @ GHASH final-1 block - mid eor r6, r6, r13 @ AES final block - round 14 low #ifdef __ARMEB__ rev r6, r6 #endif eor v11.16b, v11.16b, v21.16b @ GHASH final-1 block - low eor q9, q9, v20.16b @ GHASH final-1 block - high eor v10.16b, v10.16b, v22.16b @ GHASH final-1 block - mid eor r7, r7, r14 @ AES final block - round 14 high #ifdef __ARMEB__ rev r7, r7 #endif .L256_dec_blocks_less_than_1:@ blocks left <= 1 and r1, r1, #127 @ bit_length %= 128 mvn r14, xzr @ rk14_h = 0xffffffffffffffff sub 
r1, r1, #128 @ bit_length -= 128 mvn r13, xzr @ rk14_l = 0xffffffffffffffff ldp r4, r5, [r2] @ load existing bytes we need to not overwrite neg r1, r1 @ bit_length = 128 - #bits in input (in range [1,128]) and r1, r1, #127 @ bit_length %= 128 lsr r14, r14, r1 @ rk14_h is mask for top 64b of last block cmp r1, #64 csel r9, r13, r14, lt csel r10, r14, xzr, lt fmov d0, r9 @ ctr0b is mask for last block and r6, r6, r9 mov v0.d[1], r10 bic r4, r4, r9 @ mask out low existing bytes #ifndef __ARMEB__ rev r9, r12 #else mov r9, r12 #endif bic r5, r5, r10 @ mask out high existing bytes orr r6, r6, r4 and r7, r7, r10 orr r7, r7, r5 and q5, q5, q0 @ possibly partial last block has zeroes in highest bits rev64 q4, q5 @ GHASH final block eor q4, q4, q8 @ feed in partial tag pmull v21.1q, q4, v12.1d @ GHASH final block - low mov d8, v4.d[1] @ GHASH final block - mid eor q8, q8, q4 @ GHASH final block - mid pmull2 v20.1q, q4, v12.2d @ GHASH final block - high pmull v8.1q, q8, v16.1d @ GHASH final block - mid eor q9, q9, v20.16b @ GHASH final block - high eor v11.16b, v11.16b, v21.16b @ GHASH final block - low eor v10.16b, v10.16b, q8 @ GHASH final block - mid movi q8, #0xc2 eor q6, v11.16b, q9 @ MODULO - karatsuba tidy up shl d8, d8, #56 @ mod_constant eor v10.16b, v10.16b, q6 @ MODULO - karatsuba tidy up pmull v7.1q, q9, q8 @ MODULO - top 64b align with mid ext q9, q9, q9, #8 @ MODULO - other top alignment eor v10.16b, v10.16b, q7 @ MODULO - fold into mid eor v10.16b, v10.16b, q9 @ MODULO - fold into mid pmull v8.1q, v10.1d, q8 @ MODULO - mid 64b align with low ext v10.16b, v10.16b, v10.16b, #8 @ MODULO - other mid alignment eor v11.16b, v11.16b, q8 @ MODULO - fold into low stp r6, r7, [r2] str r9, [r16, #12] @ store the updated counter eor v11.16b, v11.16b, v10.16b @ MODULO - fold into low ext v11.16b, v11.16b, v11.16b, #8 rev64 v11.16b, v11.16b mov r0, r15 st1 { v11.16b }, [r3] ldp r21, r22, [sp, #16] ldp r23, r24, [sp, #32] ldp d8, d9, [sp, #48] ldp d10, d11, [sp, #64] ldp d12, d13, [sp, #80] ldp d14, d15, [sp, #96] ldp r19, r20, [sp], #112 RET .L256_dec_ret: mov r0, #0x0 RET .size aes_gcm_dec_256_kernel,.-aes_gcm_dec_256_kernel .byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 2 .align 2 #endif
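@ Note (added): a brief summary of the GHASH arithmetic used above, inferred from the instruction-level comments. Each 128x128-bit carry-less multiply is split Karatsuba-style into high, low and mid partial products; the precomputed "h2k | h1k" and "h4k | h3k" values hold the XOR of the two 64-bit halves of each power of H, so the mid term needs only a single pmull per block. The accumulated 256-bit product is then reduced modulo the GHASH polynomial x^128 + x^7 + x^2 + x + 1: in the bit-reflected representation used here the reduction is carried out with the 64-bit mod_constant 0xc2 shifted left by 56, first multiplying the top 64 bits by the constant and folding them into the mid term, then multiplying the realigned mid term and folding it into the low 128 bits ("MODULO - fold into mid/low").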