#include "arm_asm.h"
#include "arm_arch.h"

#if __ARM_MAX_ARCH__>=8
.fpu neon
#ifdef __thumb2__
.syntax unified
.thumb
# define INST(a,b,c,d) c,0xef,a,b
#else
.code 32
# define INST(a,b,c,d) a,b,c,0xf2
#endif

.text
@ ---------------------------------------------------------------------------
@ aes_gcm_enc_128_kernel
@
@ AES-128 counter-mode encryption interleaved with GHASH accumulation.
@ The main loop handles four 16-byte blocks per iteration.
@
@ Inputs, as consumed by the code below:
@   r0  input pointer; plaintext is loaded from here and the pointer advanced
@   r1  input length in bits (r1 >> 3 is used as the byte length);
@       a zero length returns immediately with r0 = 0
@   r2  output pointer; result blocks are stored here
@   r3  pointer to the hash state: bytes 0..15 are loaded into v11 on entry
@       and written back on exit; the h1, h2, h3 and h4 values are read from
@       offsets 32, 64, 80 and 112
@   r4  pointer to the 16-byte counter block; the updated 32-bit counter is
@       stored back at offset 12
@   r5  pointer to the round keys; rk0..rk9 are read sequentially and rk10
@       is read from offset 160
@ Returns:
@   r0  r1 >> 3 (the byte length), or 0 when r1 is zero
@ r19-r24 and d8-d15 are spilled to a 112-byte stack frame and restored
@ before returning.
@
@ Layout: set-up plus the first four keystream blocks, the main loop, the
@ prepretail (hash of the last four stored blocks while the next keystream
@ is computed) and the tail (one to four remaining blocks; the last block
@ may be partial, in which case it is masked and the bytes already present
@ in the output beyond it are kept).
@
@ The instruction order is hand-interleaved (AES rounds, hash multiplies and
@ counter updates); do not reorder without re-checking register lifetimes.
@
@ NOTE(review): the mnemonics and operand forms used here (stp/ldp, cbz,
@ csel, xzr, v/q vector operands) are A64-style, while the integer registers
@ are spelled rN and the header above selects .code 32 or .thumb - confirm
@ which assembler flavour is expected to build this file.
@ ---------------------------------------------------------------------------
.globl aes_gcm_enc_128_kernel
.type aes_gcm_enc_128_kernel,%function
.align 4
aes_gcm_enc_128_kernel:
    cbz r1, .L128_enc_ret
    @ Spill r19-r24 and d8-d15; keep the counter pointer in r16 and the
    @ round-key pointer in r8 so that r4 and r5 can be reused below.
    stp r19, r20, [sp, #-112]!
    mov r16, r4
    mov r8, r5
    stp r21, r22, [sp, #16]
    stp r23, r24, [sp, #32]
    stp d8, d9, [sp, #48]
    stp d10, d11, [sp, #64]
    stp d12, d13, [sp, #80]
    stp d14, d15, [sp, #96]

    ldp r10, r11, [r16] @ ctr96_b64, ctr96_t32
#ifdef __ARMEB__
    rev r10, r10
    rev r11, r11
#endif
    ldp r13, r14, [r8, #160] @ load rk10
#ifdef __ARMEB__
    ror r13, r13, #32
    ror r14, r14, #32
#endif
    ld1 {v11.16b}, [r3]
    ext v11.16b, v11.16b, v11.16b, #8
    rev64 v11.16b, v11.16b
    lsr r5, r1, #3 @ byte_len
    mov r15, r5

    ld1 {v18.4s}, [r8], #16 @ load rk0
    add r4, r0, r1, lsr #3 @ end_input_ptr
    sub r5, r5, #1 @ byte_len - 1

    lsr r12, r11, #32
    ldr q15, [r3, #112] @ load h4l | h4h
#ifndef __ARMEB__
    ext v15.16b, v15.16b, v15.16b, #8
#endif
    fmov d1, r10 @ CTR block 1
    rev r12, r12 @ rev_ctr32

    add r12, r12, #1 @ increment rev_ctr32
    orr r11, r11, r11
    ld1 {v19.4s}, [r8], #16 @ load rk1

    rev r9, r12 @ CTR block 1
    add r12, r12, #1 @ CTR block 1
    fmov d3, r10 @ CTR block 3

    orr r9, r11, r9, lsl #32 @ CTR block 1
    ld1 { q0}, [r16] @ special case vector load initial counter so we can start first AES block as quickly as possible

    fmov v1.d[1], r9 @ CTR block 1
    rev r9, r12 @ CTR block 2

    fmov d2, r10 @ CTR block 2
    orr r9, r11, r9, lsl #32 @ CTR block 2
    add r12, r12, #1 @ CTR block 2

    fmov v2.d[1], r9 @ CTR block 2
    rev r9, r12 @ CTR block 3

    orr r9, r11, r9, lsl #32 @ CTR block 3
    ld1 {v20.4s}, [r8], #16 @ load rk2

    add r12, r12, #1 @ CTR block 3
    fmov v3.d[1], r9 @ CTR block 3

    ldr q14, [r3, #80] @ load h3l | h3h
#ifndef __ARMEB__
    ext v14.16b, v14.16b, v14.16b, #8
#endif
    aese q1, v18.16b
    aesmc q1, q1 @ AES block 1 - round 0
    ld1 {v21.4s}, [r8], #16 @ load rk3

    aese q2, v18.16b
    aesmc q2, q2 @ AES block 2 - round 0
    ldr q12, [r3, #32] @ load h1l | h1h
#ifndef __ARMEB__
    ext v12.16b, v12.16b, v12.16b, #8
#endif

    aese q0, v18.16b
    aesmc q0, q0 @ AES block 0 - round 0
    ld1 {v22.4s}, [r8], #16 @ load rk4

    aese q3, v18.16b
    aesmc q3, q3 @ AES block 3 - round 0
    ld1 {v23.4s}, [r8], #16 @ load rk5

    aese q2, v19.16b
    aesmc q2, q2 @ AES block 2 - round 1
    trn2 v17.2d, v14.2d, v15.2d @ h4l | h3l

    aese q0, v19.16b
    aesmc q0, q0 @ AES block 0 - round 1
    ld1 {v24.4s}, [r8], #16 @ load rk6

    aese q1, v19.16b
    aesmc q1, q1 @ AES block 1 - round 1
    ld1 {v25.4s}, [r8], #16 @ load rk7

    aese q3, v19.16b
    aesmc q3, q3 @ AES block 3 - round 1
    trn1 q9, v14.2d, v15.2d @ h4h | h3h

    aese q0, v20.16b
    aesmc q0, q0 @ AES block 0 - round 2
    ld1 {v26.4s}, [r8], #16 @ load rk8

    aese q1, v20.16b
    aesmc q1, q1 @ AES block 1 - round 2
    ldr q13, [r3, #64] @ load h2l | h2h
#ifndef __ARMEB__
    ext v13.16b, v13.16b, v13.16b, #8
#endif

    aese q3, v20.16b
    aesmc q3, q3 @ AES block 3 - round 2

    aese q2, v20.16b
    aesmc q2, q2 @ AES block 2 - round 2
    eor v17.16b, v17.16b, q9 @ h4k | h3k

    aese q0, v21.16b
    aesmc q0, q0 @ AES block 0 - round 3

    aese q1, v21.16b
    aesmc q1, q1 @ AES block 1 - round 3

    aese q2, v21.16b
    aesmc q2, q2 @ AES block 2 - round 3
    ld1 {v27.4s}, [r8], #16 @ load rk9

    aese q3, v21.16b
    aesmc q3, q3 @ AES block 3 - round 3

    and r5, r5, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
    trn2 v16.2d, v12.2d, v13.2d @ h2l | h1l

    aese q3, v22.16b
    aesmc q3, q3 @ AES block 3 - round 4
    add r5, r5, r0

    aese q2, v22.16b
    aesmc q2, q2 @ AES block 2 - round 4
    cmp r0, r5 @ check if we have <= 4 blocks

    aese q0, v22.16b
    aesmc q0, q0 @ AES block 0 - round 4

    aese q3, v23.16b
    aesmc q3, q3 @ AES block 3 - round 5

    aese q2, v23.16b
    aesmc q2, q2 @ AES block 2 - round 5

    aese q0, v23.16b
    aesmc q0, q0 @ AES block 0 - round 5

    aese q3, v24.16b
    aesmc q3, q3 @ AES block 3 - round 6

    aese q1, v22.16b
    aesmc q1, q1 @ AES block 1 - round 4

    aese q2, v24.16b
    aesmc q2, q2 @ AES block 2 - round 6
    trn1 q8, v12.2d, v13.2d @ h2h | h1h

    aese q0, v24.16b
    aesmc q0, q0 @ AES block 0 - round 6

    aese q1, v23.16b
    aesmc q1, q1 @ AES block 1 - round 5

    aese q3, v25.16b
    aesmc q3, q3 @ AES block 3 - round 7

    aese q0, v25.16b
    aesmc q0, q0 @ AES block 0 - round 7

    aese q1, v24.16b
    aesmc q1, q1 @ AES block 1 - round 6

    aese q2, v25.16b
    aesmc q2, q2 @ AES block 2 - round 7

    aese q0, v26.16b
    aesmc q0, q0 @ AES block 0 - round 8

    aese q1, v25.16b
    aesmc q1, q1 @ AES block 1 - round 7

    aese q2, v26.16b
    aesmc q2, q2 @ AES block 2 - round 8

    aese q3, v26.16b
    aesmc q3, q3 @ AES block 3 - round 8

    aese q1, v26.16b
    aesmc q1, q1 @ AES block 1 - round 8

    aese q2, v27.16b @ AES block 2 - round 9

    aese q0, v27.16b @ AES block 0 - round 9

    eor v16.16b, v16.16b, q8 @ h2k | h1k

    aese q1, v27.16b @ AES block 1 - round 9

    aese q3, v27.16b @ AES block 3 - round 9
    bge .L128_enc_tail @ handle tail

    @ First four blocks: plaintext is XORed with rk10 in the integer
    @ registers, moved to v4-v7 and then XORed with the round-9 keystream.
    ldp r6, r7, [r0, #0] @ AES block 0 - load plaintext
#ifdef __ARMEB__
    rev r6, r6
    rev r7, r7
#endif
    ldp r21, r22, [r0, #32] @ AES block 2 - load plaintext
#ifdef __ARMEB__
    rev r21, r21
    rev r22, r22
#endif
    ldp r19, r20, [r0, #16] @ AES block 1 - load plaintext
#ifdef __ARMEB__
    rev r19, r19
    rev r20, r20
#endif
    ldp r23, r24, [r0, #48] @ AES block 3 - load plaintext
#ifdef __ARMEB__
    rev r23, r23
    rev r24, r24
#endif
    eor r6, r6, r13 @ AES block 0 - round 10 low
    eor r7, r7, r14 @ AES block 0 - round 10 high

    eor r21, r21, r13 @ AES block 2 - round 10 low
    fmov d4, r6 @ AES block 0 - mov low

    eor r19, r19, r13 @ AES block 1 - round 10 low
    eor r22, r22, r14 @ AES block 2 - round 10 high
    fmov v4.d[1], r7 @ AES block 0 - mov high

    fmov d5, r19 @ AES block 1 - mov low
    eor r20, r20, r14 @ AES block 1 - round 10 high

    eor r23, r23, r13 @ AES block 3 - round 10 low
    fmov v5.d[1], r20 @ AES block 1 - mov high

    fmov d6, r21 @ AES block 2 - mov low
    eor r24, r24, r14 @ AES block 3 - round 10 high
    rev r9, r12 @ CTR block 4

    fmov v6.d[1], r22 @ AES block 2 - mov high
    orr r9, r11, r9, lsl #32 @ CTR block 4

    eor q4, q4, q0 @ AES block 0 - result
    fmov d0, r10 @ CTR block 4
    add r12, r12, #1 @ CTR block 4

    fmov v0.d[1], r9 @ CTR block 4
    rev r9, r12 @ CTR block 5

    eor q5, q5, q1 @ AES block 1 - result
    fmov d1, r10 @ CTR block 5
    orr r9, r11, r9, lsl #32 @ CTR block 5

    add r12, r12, #1 @ CTR block 5
    add r0, r0, #64 @ AES input_ptr update
    fmov v1.d[1], r9 @ CTR block 5

    fmov d7, r23 @ AES block 3 - mov low
    rev r9, r12 @ CTR block 6
    st1 { q4}, [r2], #16 @ AES block 0 - store result

    fmov v7.d[1], r24 @ AES block 3 - mov high
    orr r9, r11, r9, lsl #32 @ CTR block 6

    add r12, r12, #1 @ CTR block 6
    eor q6, q6, q2 @ AES block 2 - result
    st1 { q5}, [r2], #16 @ AES block 1 - store result

    fmov d2, r10 @ CTR block 6
    cmp r0, r5 @ check if we have <= 8 blocks

    fmov v2.d[1], r9 @ CTR block 6
    rev r9, r12 @ CTR block 7
    st1 { q6}, [r2], #16 @ AES block 2 - store result

    orr r9, r11, r9, lsl #32 @ CTR block 7

    eor q7, q7, q3 @ AES block 3 - result
    st1 { q7}, [r2], #16 @ AES block 3 - store result
    bge .L128_enc_prepretail @ do prepretail

    @ Main loop: each pass hashes the four blocks stored by the previous
    @ pass (v4-v7) while encrypting and storing the next four.
.L128_enc_main_loop:@ main loop start
    ldp r23, r24, [r0, #48] @ AES block 4k+3 - load plaintext
#ifdef __ARMEB__
    rev r23, r23
    rev r24, r24
#endif
    rev64 q4, q4 @ GHASH block 4k (only t0 is free)
    rev64 q6, q6 @ GHASH block 4k+2 (t0, t1, and t2 free)

    aese q2, v18.16b
    aesmc q2, q2 @ AES block 4k+6 - round 0
    fmov d3, r10 @ CTR block 4k+3

    ext v11.16b, v11.16b, v11.16b, #8 @ PRE 0
    rev64 q5, q5 @ GHASH block 4k+1 (t0 and t1 free)

    aese q1, v18.16b
    aesmc q1, q1 @ AES block 4k+5 - round 0
    add r12, r12, #1 @ CTR block 4k+3
    fmov v3.d[1], r9 @ CTR block 4k+3

    aese q0, v18.16b
    aesmc q0, q0 @ AES block 4k+4 - round 0
    mov d31, v6.d[1] @ GHASH block 4k+2 - mid

    aese q2, v19.16b
    aesmc q2, q2 @ AES block 4k+6 - round 1
    mov d30, v5.d[1] @ GHASH block 4k+1 - mid

    aese q1, v19.16b
    aesmc q1, q1 @ AES block 4k+5 - round 1
    eor q4, q4, v11.16b @ PRE 1

    aese q3, v18.16b
    aesmc q3, q3 @ AES block 4k+7 - round 0
    eor r24, r24, r14 @ AES block 4k+3 - round 10 high

    pmull2 v28.1q, q5, v14.2d @ GHASH block 4k+1 - high
    eor v31.8b, v31.8b, q6 @ GHASH block 4k+2 - mid
    ldp r6, r7, [r0, #0] @ AES block 4k+4 - load plaintext
#ifdef __ARMEB__
    rev r6, r6
    rev r7, r7
#endif
    aese q0, v19.16b
    aesmc q0, q0 @ AES block 4k+4 - round 1
    rev r9, r12 @ CTR block 4k+8

    eor v30.8b, v30.8b, q5 @ GHASH block 4k+1 - mid
    mov d8, v4.d[1] @ GHASH block 4k - mid
    orr r9, r11, r9, lsl #32 @ CTR block 4k+8

    pmull2 v9.1q, q4, v15.2d @ GHASH block 4k - high
    add r12, r12, #1 @ CTR block 4k+8
    mov d10, v17.d[1] @ GHASH block 4k - mid

    aese q0, v20.16b
    aesmc q0, q0 @ AES block 4k+4 - round 2

    pmull v11.1q, q4, v15.1d @ GHASH block 4k - low
    eor q8, q8, q4 @ GHASH block 4k - mid

    aese q1, v20.16b
    aesmc q1, q1 @ AES block 4k+5 - round 2

    aese q0, v21.16b
    aesmc q0, q0 @ AES block 4k+4 - round 3
    eor q9, q9, v28.16b @ GHASH block 4k+1 - high

    pmull v28.1q, q6, v13.1d @ GHASH block 4k+2 - low

    pmull v10.1q, q8, v10.1d @ GHASH block 4k - mid
    rev64 q7, q7 @ GHASH block 4k+3 (t0, t1, t2 and t3 free)

    pmull v30.1q, v30.1d, v17.1d @ GHASH block 4k+1 - mid

    pmull v29.1q, q5, v14.1d @ GHASH block 4k+1 - low
    ins v31.d[1], v31.d[0] @ GHASH block 4k+2 - mid

    pmull2 v8.1q, q6, v13.2d @ GHASH block 4k+2 - high
    eor r7, r7, r14 @ AES block 4k+4 - round 10 high

    eor v10.16b, v10.16b, v30.16b @ GHASH block 4k+1 - mid
    mov d30, v7.d[1] @ GHASH block 4k+3 - mid

    aese q3, v19.16b
    aesmc q3, q3 @ AES block 4k+7 - round 1
    eor v11.16b, v11.16b, v29.16b @ GHASH block 4k+1 - low

    aese q2, v20.16b
    aesmc q2, q2 @ AES block 4k+6 - round 2
    eor r6, r6, r13 @ AES block 4k+4 - round 10 low

    aese q1, v21.16b
    aesmc q1, q1 @ AES block 4k+5 - round 3
    eor v30.8b, v30.8b, q7 @ GHASH block 4k+3 - mid

    pmull2 v4.1q, q7, v12.2d @ GHASH block 4k+3 - high

    aese q2, v21.16b
    aesmc q2, q2 @ AES block 4k+6 - round 3
    eor q9, q9, q8 @ GHASH block 4k+2 - high

    pmull2 v31.1q, v31.2d, v16.2d @ GHASH block 4k+2 - mid

    pmull v29.1q, q7, v12.1d @ GHASH block 4k+3 - low
    movi q8, #0xc2

    pmull v30.1q, v30.1d, v16.1d @ GHASH block 4k+3 - mid
    eor v11.16b, v11.16b, v28.16b @ GHASH block 4k+2 - low

    aese q1, v22.16b
    aesmc q1, q1 @ AES block 4k+5 - round 4

    aese q3, v20.16b
    aesmc q3, q3 @ AES block 4k+7 - round 2
    shl d8, d8, #56 @ mod_constant

    aese q0, v22.16b
    aesmc q0, q0 @ AES block 4k+4 - round 4
    eor q9, q9, q4 @ GHASH block 4k+3 - high

    aese q1, v23.16b
    aesmc q1, q1 @ AES block 4k+5 - round 5
    ldp r19, r20, [r0, #16] @ AES block 4k+5 - load plaintext
#ifdef __ARMEB__
    rev r19, r19
    rev r20, r20
#endif
    aese q3, v21.16b
    aesmc q3, q3 @ AES block 4k+7 - round 3
    eor v10.16b, v10.16b, v31.16b @ GHASH block 4k+2 - mid

    aese q0, v23.16b
    aesmc q0, q0 @ AES block 4k+4 - round 5
    ldp r21, r22, [r0, #32] @ AES block 4k+6 - load plaintext
#ifdef __ARMEB__
    rev r21, r21
    rev r22, r22
#endif
    pmull v31.1q, q9, q8 @ MODULO - top 64b align with mid
    eor v11.16b, v11.16b, v29.16b @ GHASH block 4k+3 - low

    aese q2, v22.16b
    aesmc q2, q2 @ AES block 4k+6 - round 4
    eor r19, r19, r13 @ AES block 4k+5 - round 10 low

    aese q3, v22.16b
    aesmc q3, q3 @ AES block 4k+7 - round 4
    eor v10.16b, v10.16b, v30.16b @ GHASH block 4k+3 - mid

    aese q1, v24.16b
    aesmc q1, q1 @ AES block 4k+5 - round 6
    eor r23, r23, r13 @ AES block 4k+3 - round 10 low

    aese q2, v23.16b
    aesmc q2, q2 @ AES block 4k+6 - round 5
    eor v30.16b, v11.16b, q9 @ MODULO - karatsuba tidy up

    fmov d4, r6 @ AES block 4k+4 - mov low
    aese q0, v24.16b
    aesmc q0, q0 @ AES block 4k+4 - round 6
    fmov v4.d[1], r7 @ AES block 4k+4 - mov high

    add r0, r0, #64 @ AES input_ptr update
    fmov d7, r23 @ AES block 4k+3 - mov low
    ext q9, q9, q9, #8 @ MODULO - other top alignment

    aese q3, v23.16b
    aesmc q3, q3 @ AES block 4k+7 - round 5
    fmov d5, r19 @ AES block 4k+5 - mov low

    aese q0, v25.16b
    aesmc q0, q0 @ AES block 4k+4 - round 7
    eor v10.16b, v10.16b, v30.16b @ MODULO - karatsuba tidy up

    aese q2, v24.16b
    aesmc q2, q2 @ AES block 4k+6 - round 6
    eor r20, r20, r14 @ AES block 4k+5 - round 10 high

    aese q1, v25.16b
    aesmc q1, q1 @ AES block 4k+5 - round 7
    fmov v5.d[1], r20 @ AES block 4k+5 - mov high

    aese q0, v26.16b
    aesmc q0, q0 @ AES block 4k+4 - round 8
    fmov v7.d[1], r24 @ AES block 4k+3 - mov high

    aese q3, v24.16b
    aesmc q3, q3 @ AES block 4k+7 - round 6
    cmp r0, r5 @ .LOOP CONTROL

    aese q1, v26.16b
    aesmc q1, q1 @ AES block 4k+5 - round 8
    eor v10.16b, v10.16b, v31.16b @ MODULO - fold into mid

    aese q0, v27.16b @ AES block 4k+4 - round 9
    eor r21, r21, r13 @ AES block 4k+6 - round 10 low
    eor r22, r22, r14 @ AES block 4k+6 - round 10 high

    aese q3, v25.16b
    aesmc q3, q3 @ AES block 4k+7 - round 7
    fmov d6, r21 @ AES block 4k+6 - mov low

    aese q1, v27.16b @ AES block 4k+5 - round 9
    fmov v6.d[1], r22 @ AES block 4k+6 - mov high

    aese q2, v25.16b
    aesmc q2, q2 @ AES block 4k+6 - round 7
    eor q4, q4, q0 @ AES block 4k+4 - result

    fmov d0, r10 @ CTR block 4k+8
    aese q3, v26.16b
    aesmc q3, q3 @ AES block 4k+7 - round 8

    fmov v0.d[1], r9 @ CTR block 4k+8
    rev r9, r12 @ CTR block 4k+9
    eor v10.16b, v10.16b, q9 @ MODULO - fold into mid

    aese q2, v26.16b
    aesmc q2, q2 @ AES block 4k+6 - round 8
    eor q5, q5, q1 @ AES block 4k+5 - result

    add r12, r12, #1 @ CTR block 4k+9
    orr r9, r11, r9, lsl #32 @ CTR block 4k+9
    fmov d1, r10 @ CTR block 4k+9

    pmull v9.1q, v10.1d, q8 @ MODULO - mid 64b align with low
    fmov v1.d[1], r9 @ CTR block 4k+9
    rev r9, r12 @ CTR block 4k+10

    aese q2, v27.16b @ AES block 4k+6 - round 9
    st1 { q4}, [r2], #16 @ AES block 4k+4 - store result
    eor q6, q6, q2 @ AES block 4k+6 - result
    orr r9, r11, r9, lsl #32 @ CTR block 4k+10

    aese q3, v27.16b @ AES block 4k+7 - round 9
    add r12, r12, #1 @ CTR block 4k+10
    ext v10.16b, v10.16b, v10.16b, #8 @ MODULO - other mid alignment
    fmov d2, r10 @ CTR block 4k+10

    eor v11.16b, v11.16b, q9 @ MODULO - fold into low
    st1 { q5}, [r2], #16 @ AES block 4k+5 - store result

    fmov v2.d[1], r9 @ CTR block 4k+10
    st1 { q6}, [r2], #16 @ AES block 4k+6 - store result
    rev r9, r12 @ CTR block 4k+11

    orr r9, r11, r9, lsl #32 @ CTR block 4k+11
    eor q7, q7, q3 @ AES block 4k+3 - result

    eor v11.16b, v11.16b, v10.16b @ MODULO - fold into low
    st1 { q7}, [r2], #16 @ AES block 4k+3 - store result
    blt .L128_enc_main_loop

    @ Prepretail: hash the last four stored blocks and finish the keystream
    @ for the tail; no plaintext is loaded or stored here.
.L128_enc_prepretail:@ PREPRETAIL
    rev64 q4, q4 @ GHASH block 4k (only t0 is free)
    fmov d3, r10 @ CTR block 4k+3
    rev64 q5, q5 @ GHASH block 4k+1 (t0 and t1 free)

    ext v11.16b, v11.16b, v11.16b, #8 @ PRE 0
    add r12, r12, #1 @ CTR block 4k+3
    fmov v3.d[1], r9 @ CTR block 4k+3

    aese q1, v18.16b
    aesmc q1, q1 @ AES block 4k+5 - round 0
    rev64 q6, q6 @ GHASH block 4k+2 (t0, t1, and t2 free)

    pmull v29.1q, q5, v14.1d @ GHASH block 4k+1 - low

    rev64 q7, q7 @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
    eor q4, q4, v11.16b @ PRE 1

    pmull2 v28.1q, q5, v14.2d @ GHASH block 4k+1 - high

    aese q3, v18.16b
    aesmc q3, q3 @ AES block 4k+7 - round 0
    mov d30, v5.d[1] @ GHASH block 4k+1 - mid

    pmull v11.1q, q4, v15.1d @ GHASH block 4k - low
    mov d8, v4.d[1] @ GHASH block 4k - mid

    mov d31, v6.d[1] @ GHASH block 4k+2 - mid
    mov d10, v17.d[1] @ GHASH block 4k - mid

    aese q1, v19.16b
    aesmc q1, q1 @ AES block 4k+5 - round 1
    eor v30.8b, v30.8b, q5 @ GHASH block 4k+1 - mid

    eor q8, q8, q4 @ GHASH block 4k - mid

    pmull2 v9.1q, q4, v15.2d @ GHASH block 4k - high
    eor v31.8b, v31.8b, q6 @ GHASH block 4k+2 - mid

    aese q3, v19.16b
    aesmc q3, q3 @ AES block 4k+7 - round 1

    pmull v30.1q, v30.1d, v17.1d @ GHASH block 4k+1 - mid
    eor v11.16b, v11.16b, v29.16b @ GHASH block 4k+1 - low

    pmull v10.1q, q8, v10.1d @ GHASH block 4k - mid

    aese q0, v18.16b
    aesmc q0, q0 @ AES block 4k+4 - round 0
    ins v31.d[1], v31.d[0] @ GHASH block 4k+2 - mid

    aese q2, v18.16b
    aesmc q2, q2 @ AES block 4k+6 - round 0

    eor v10.16b, v10.16b, v30.16b @ GHASH block 4k+1 - mid
    mov d30, v7.d[1] @ GHASH block 4k+3 - mid

    aese q0, v19.16b
    aesmc q0, q0 @ AES block 4k+4 - round 1
    eor q9, q9, v28.16b @ GHASH block 4k+1 - high

    pmull2 v31.1q, v31.2d, v16.2d @ GHASH block 4k+2 - mid

    pmull2 v8.1q, q6, v13.2d @ GHASH block 4k+2 - high
    eor v30.8b, v30.8b, q7 @ GHASH block 4k+3 - mid

    pmull2 v4.1q, q7, v12.2d @ GHASH block 4k+3 - high

    pmull v28.1q, q6, v13.1d @ GHASH block 4k+2 - low

    aese q2, v19.16b
    aesmc q2, q2 @ AES block 4k+6 - round 1
    eor q9, q9, q8 @ GHASH block 4k+2 - high

    aese q0, v20.16b
    aesmc q0, q0 @ AES block 4k+4 - round 2

    pmull v29.1q, q7, v12.1d @ GHASH block 4k+3 - low
    movi q8, #0xc2

    aese q2, v20.16b
    aesmc q2, q2 @ AES block 4k+6 - round 2
    eor v11.16b, v11.16b, v28.16b @ GHASH block 4k+2 - low

    aese q3, v20.16b
    aesmc q3, q3 @ AES block 4k+7 - round 2

    pmull v30.1q, v30.1d, v16.1d @ GHASH block 4k+3 - mid
    eor v10.16b, v10.16b, v31.16b @ GHASH block 4k+2 - mid

    aese q2, v21.16b
    aesmc q2, q2 @ AES block 4k+6 - round 3

    aese q1, v20.16b
    aesmc q1, q1 @ AES block 4k+5 - round 2
    eor q9, q9, q4 @ GHASH block 4k+3 - high

    aese q0, v21.16b
    aesmc q0, q0 @ AES block 4k+4 - round 3

    eor v10.16b, v10.16b, v30.16b @ GHASH block 4k+3 - mid
    shl d8, d8, #56 @ mod_constant

    aese q1, v21.16b
    aesmc q1, q1 @ AES block 4k+5 - round 3
    eor v11.16b, v11.16b, v29.16b @ GHASH block 4k+3 - low

    aese q0, v22.16b
    aesmc q0, q0 @ AES block 4k+4 - round 4

    pmull v28.1q, q9, q8
    eor v10.16b, v10.16b, q9 @ karatsuba tidy up

    aese q1, v22.16b
    aesmc q1, q1 @ AES block 4k+5 - round 4

    aese q0, v23.16b
    aesmc q0, q0 @ AES block 4k+4 - round 5
    ext q9, q9, q9, #8

    aese q3, v21.16b
    aesmc q3, q3 @ AES block 4k+7 - round 3

    aese q2, v22.16b
    aesmc q2, q2 @ AES block 4k+6 - round 4
    eor v10.16b, v10.16b, v11.16b

    aese q0, v24.16b
    aesmc q0, q0 @ AES block 4k+4 - round 6

    aese q3, v22.16b
    aesmc q3, q3 @ AES block 4k+7 - round 4

    aese q1, v23.16b
    aesmc q1, q1 @ AES block 4k+5 - round 5

    aese q2, v23.16b
    aesmc q2, q2 @ AES block 4k+6 - round 5
    eor v10.16b, v10.16b, v28.16b

    aese q3, v23.16b
    aesmc q3, q3 @ AES block 4k+7 - round 5

    aese q1, v24.16b
    aesmc q1, q1 @ AES block 4k+5 - round 6

    aese q2, v24.16b
    aesmc q2, q2 @ AES block 4k+6 - round 6

    aese q3, v24.16b
    aesmc q3, q3 @ AES block 4k+7 - round 6
    eor v10.16b, v10.16b, q9

    aese q0, v25.16b
    aesmc q0, q0 @ AES block 4k+4 - round 7

    aese q2, v25.16b
    aesmc q2, q2 @ AES block 4k+6 - round 7

    aese q3, v25.16b
    aesmc q3, q3 @ AES block 4k+7 - round 7

    pmull v28.1q, v10.1d, q8

    aese q1, v25.16b
    aesmc q1, q1 @ AES block 4k+5 - round 7
    ext v10.16b, v10.16b, v10.16b, #8

    aese q3, v26.16b
    aesmc q3, q3 @ AES block 4k+7 - round 8

    aese q0, v26.16b
    aesmc q0, q0 @ AES block 4k+4 - round 8
    eor v11.16b, v11.16b, v28.16b

    aese q1, v26.16b
    aesmc q1, q1 @ AES block 4k+5 - round 8

    aese q3, v27.16b @ AES block 4k+7 - round 9

    aese q2, v26.16b
    aesmc q2, q2 @ AES block 4k+6 - round 8

    aese q0, v27.16b @ AES block 4k+4 - round 9

    aese q1, v27.16b @ AES block 4k+5 - round 9
    eor v11.16b, v11.16b, v10.16b

    aese q2, v27.16b @ AES block 4k+6 - round 9
    @ Tail: r5 becomes the number of bytes still to process; the compares
    @ against 48, 32 and 16 select how many full blocks precede the last one.
.L128_enc_tail:@ TAIL

    sub r5, r4, r0 @ main_end_input_ptr is number of bytes left to process
    ldp r6, r7, [r0], #16 @ AES block 4k+4 - load plaintext
#ifdef __ARMEB__
    rev r6, r6
    rev r7, r7
#endif
    cmp r5, #48

    ext q8, v11.16b, v11.16b, #8 @ prepare final partial tag
    eor r6, r6, r13 @ AES block 4k+4 - round 10 low
    eor r7, r7, r14 @ AES block 4k+4 - round 10 high

    fmov d4, r6 @ AES block 4k+4 - mov low

    fmov v4.d[1], r7 @ AES block 4k+4 - mov high

    eor q5, q4, q0 @ AES block 4k+4 - result

    bgt .L128_enc_blocks_more_than_3

    @ Fewer blocks than keystream prepared: step the counter back once per
    @ unused block and shift the keystream registers so that q3 always holds
    @ the keystream for the final block.
    sub r12, r12, #1
    movi v11.8b, #0
    mov q3, q2

    cmp r5, #32
    mov q2, q1
    movi q9, #0

    movi v10.8b, #0
    bgt .L128_enc_blocks_more_than_2

    mov q3, q1
    cmp r5, #16

    sub r12, r12, #1
    bgt .L128_enc_blocks_more_than_1

    sub r12, r12, #1
    b .L128_enc_blocks_less_than_1
.L128_enc_blocks_more_than_3:@ blocks left > 3
    st1 { q5}, [r2], #16 @ AES final-3 block - store result

    ldp r6, r7, [r0], #16 @ AES final-2 block - load input low & high
#ifdef __ARMEB__
    rev r6, r6
    rev r7, r7
#endif
    rev64 q4, q5 @ GHASH final-3 block

    eor q4, q4, q8 @ feed in partial tag
    eor r7, r7, r14 @ AES final-2 block - round 10 high
    eor r6, r6, r13 @ AES final-2 block - round 10 low

    fmov d5, r6 @ AES final-2 block - mov low

    movi q8, #0 @ suppress further partial tag feed in
    fmov v5.d[1], r7 @ AES final-2 block - mov high

    pmull v11.1q, q4, v15.1d @ GHASH final-3 block - low
    mov d22, v4.d[1] @ GHASH final-3 block - mid

    pmull2 v9.1q, q4, v15.2d @ GHASH final-3 block - high

    mov d10, v17.d[1] @ GHASH final-3 block - mid

    eor q5, q5, q1 @ AES final-2 block - result
    eor v22.8b, v22.8b, q4 @ GHASH final-3 block - mid

    pmull v10.1q, v22.1d, v10.1d @ GHASH final-3 block - mid
.L128_enc_blocks_more_than_2:@ blocks left > 2

    st1 { q5}, [r2], #16 @ AES final-2 block - store result

    rev64 q4, q5 @ GHASH final-2 block
    ldp r6, r7, [r0], #16 @ AES final-1 block - load input low & high
#ifdef __ARMEB__
    rev r6, r6
    rev r7, r7
#endif
    eor q4, q4, q8 @ feed in partial tag

    eor r6, r6, r13 @ AES final-1 block - round 10 low

    fmov d5, r6 @ AES final-1 block - mov low
    eor r7, r7, r14 @ AES final-1 block - round 10 high

    pmull2 v20.1q, q4, v14.2d @ GHASH final-2 block - high
    fmov v5.d[1], r7 @ AES final-1 block - mov high

    mov d22, v4.d[1] @ GHASH final-2 block - mid

    pmull v21.1q, q4, v14.1d @ GHASH final-2 block - low

    eor q9, q9, v20.16b @ GHASH final-2 block - high

    eor v22.8b, v22.8b, q4 @ GHASH final-2 block - mid

    eor q5, q5, q2 @ AES final-1 block - result

    eor v11.16b, v11.16b, v21.16b @ GHASH final-2 block - low

    pmull v22.1q, v22.1d, v17.1d @ GHASH final-2 block - mid

    movi q8, #0 @ suppress further partial tag feed in

    eor v10.16b, v10.16b, v22.16b @ GHASH final-2 block - mid
.L128_enc_blocks_more_than_1:@ blocks left > 1

    st1 { q5}, [r2], #16 @ AES final-1 block - store result

    rev64 q4, q5 @ GHASH final-1 block
    ldp r6, r7, [r0], #16 @ AES final block - load input low & high
#ifdef __ARMEB__
    rev r6, r6
    rev r7, r7
#endif
    eor q4, q4, q8 @ feed in partial tag

    eor r7, r7, r14 @ AES final block - round 10 high
    eor r6, r6, r13 @ AES final block - round 10 low

    fmov d5, r6 @ AES final block - mov low

    pmull2 v20.1q, q4, v13.2d @ GHASH final-1 block - high
    fmov v5.d[1], r7 @ AES final block - mov high

    mov d22, v4.d[1] @ GHASH final-1 block - mid

    pmull v21.1q, q4, v13.1d @ GHASH final-1 block - low

    eor v22.8b, v22.8b, q4 @ GHASH final-1 block - mid

    eor q5, q5, q3 @ AES final block - result

    ins v22.d[1], v22.d[0] @ GHASH final-1 block - mid

    pmull2 v22.1q, v22.2d, v16.2d @ GHASH final-1 block - mid

    eor v11.16b, v11.16b, v21.16b @ GHASH final-1 block - low

    eor q9, q9, v20.16b @ GHASH final-1 block - high

    eor v10.16b, v10.16b, v22.16b @ GHASH final-1 block - mid
    movi q8, #0 @ suppress further partial tag feed in
    @ Last (possibly partial) block: r13/r14 are reused to build a byte mask
    @ from the bit length, the result is masked before hashing, and bif
    @ merges the bytes already in the output back in before the store.
.L128_enc_blocks_less_than_1:@ blocks left <= 1

    and r1, r1, #127 @ bit_length %= 128
    mvn r13, xzr @ rk10_l = 0xffffffffffffffff

    mvn r14, xzr @ rk10_h = 0xffffffffffffffff
    sub r1, r1, #128 @ bit_length -= 128

    neg r1, r1 @ bit_length = 128 - #bits in input (in range [1,128])

    and r1, r1, #127 @ bit_length %= 128

    lsr r14, r14, r1 @ rk10_h is mask for top 64b of last block
    cmp r1, #64

    csel r6, r13, r14, lt
    csel r7, r14, xzr, lt

    fmov d0, r6 @ ctr0b is mask for last block

    fmov v0.d[1], r7

    and q5, q5, q0 @ possibly partial last block has zeroes in highest bits

    rev64 q4, q5 @ GHASH final block

    eor q4, q4, q8 @ feed in partial tag

    mov d8, v4.d[1] @ GHASH final block - mid

    pmull v21.1q, q4, v12.1d @ GHASH final block - low
    ld1 { v18.16b}, [r2] @ load existing bytes where the possibly partial last block is to be stored

    eor q8, q8, q4 @ GHASH final block - mid
#ifndef __ARMEB__
    rev r9, r12
#else
    mov r9, r12
#endif
    pmull2 v20.1q, q4, v12.2d @ GHASH final block - high

    pmull v8.1q, q8, v16.1d @ GHASH final block - mid

    eor v11.16b, v11.16b, v21.16b @ GHASH final block - low

    eor q9, q9, v20.16b @ GHASH final block - high

    eor v10.16b, v10.16b, q8 @ GHASH final block - mid
    movi q8, #0xc2

    eor v30.16b, v11.16b, q9 @ MODULO - karatsuba tidy up

    shl d8, d8, #56 @ mod_constant

    eor v10.16b, v10.16b, v30.16b @ MODULO - karatsuba tidy up

    pmull v31.1q, q9, q8 @ MODULO - top 64b align with mid

    ext q9, q9, q9, #8 @ MODULO - other top alignment

    eor v10.16b, v10.16b, v31.16b @ MODULO - fold into mid

    eor v10.16b, v10.16b, q9 @ MODULO - fold into mid

    pmull v9.1q, v10.1d, q8 @ MODULO - mid 64b align with low

    ext v10.16b, v10.16b, v10.16b, #8 @ MODULO - other mid alignment

    bif q5, v18.16b, q0 @ insert existing bytes in top end of result before storing

    eor v11.16b, v11.16b, q9 @ MODULO - fold into low
    st1 { q5}, [r2] @ store all 16B

    str r9, [r16, #12] @ store the updated counter

    eor v11.16b, v11.16b, v10.16b @ MODULO - fold into low
    ext v11.16b, v11.16b, v11.16b, #8
    rev64 v11.16b, v11.16b
    mov r0, r15 @ return value: byte length saved on entry
    st1 { v11.16b }, [r3]
    @ Restore the registers spilled on entry and release the frame.
    ldp r21, r22, [sp, #16]
    ldp r23, r24, [sp, #32]
    ldp d8, d9, [sp, #48]
    ldp d10, d11, [sp, #64]
    ldp d12, d13, [sp, #80]
    ldp d14, d15, [sp, #96]
    ldp r19, r20, [sp], #112
    RET

.L128_enc_ret:
    mov r0, #0x0 @ zero-length input: nothing processed
    RET
.size aes_gcm_enc_128_kernel,.-aes_gcm_enc_128_kernel
998.globl aes_gcm_dec_128_kernel 999.type aes_gcm_dec_128_kernel,%function 1000.align 4 1001aes_gcm_dec_128_kernel: 1002 cbz r1, .L128_dec_ret 1003 stp r19, r20, [sp, #-112]! 1004 mov r16, r4 1005 mov r8, r5 1006 stp r21, r22, [sp, #16] 1007 stp r23, r24, [sp, #32] 1008 stp d8, d9, [sp, #48] 1009 stp d10, d11, [sp, #64] 1010 stp d12, d13, [sp, #80] 1011 stp d14, d15, [sp, #96] 1012 1013 lsr r5, r1, #3 @ byte_len 1014 mov r15, r5 1015 ldp r10, r11, [r16] @ ctr96_b64, ctr96_t32 1016#ifdef __ARMEB__ 1017 rev r10, r10 1018 rev r11, r11 1019#endif 1020 ldp r13, r14, [r8, #160] @ load rk10 1021#ifdef __ARMEB__ 1022 ror r14, r14, 32 1023 ror r13, r13, 32 1024#endif 1025 sub r5, r5, #1 @ byte_len - 1 1026 ld1 {v18.4s}, [r8], #16 @ load rk0 1027 1028 and r5, r5, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail) 1029 ld1 { q0}, [r16] @ special case vector load initial counter so we can start first AES block as quickly as possible 1030 1031 ldr q13, [r3, #64] @ load h2l | h2h 1032#ifndef __ARMEB__ 1033 ext v13.16b, v13.16b, v13.16b, #8 1034#endif 1035 lsr r12, r11, #32 1036 fmov d2, r10 @ CTR block 2 1037 1038 ld1 {v19.4s}, [r8], #16 @ load rk1 1039 orr r11, r11, r11 1040 rev r12, r12 @ rev_ctr32 1041 1042 fmov d1, r10 @ CTR block 1 1043 add r12, r12, #1 @ increment rev_ctr32 1044 1045 aese q0, v18.16b 1046 aesmc q0, q0 @ AES block 0 - round 0 1047 rev r9, r12 @ CTR block 1 1048 1049 orr r9, r11, r9, lsl #32 @ CTR block 1 1050 ld1 {v20.4s}, [r8], #16 @ load rk2 1051 add r12, r12, #1 @ CTR block 1 1052 1053 fmov v1.d[1], r9 @ CTR block 1 1054 rev r9, r12 @ CTR block 2 1055 add r12, r12, #1 @ CTR block 2 1056 1057 aese q0, v19.16b 1058 aesmc q0, q0 @ AES block 0 - round 1 1059 orr r9, r11, r9, lsl #32 @ CTR 
block 2 1060 1061 fmov v2.d[1], r9 @ CTR block 2 1062 rev r9, r12 @ CTR block 3 1063 1064 fmov d3, r10 @ CTR block 3 1065 orr r9, r11, r9, lsl #32 @ CTR block 3 1066 add r12, r12, #1 @ CTR block 3 1067 1068 fmov v3.d[1], r9 @ CTR block 3 1069 add r4, r0, r1, lsr #3 @ end_input_ptr 1070 1071 aese q1, v18.16b 1072 aesmc q1, q1 @ AES block 1 - round 0 1073 ld1 {v21.4s}, [r8], #16 @ load rk3 1074 1075 aese q0, v20.16b 1076 aesmc q0, q0 @ AES block 0 - round 2 1077 ld1 {v22.4s}, [r8], #16 @ load rk4 1078 1079 aese q2, v18.16b 1080 aesmc q2, q2 @ AES block 2 - round 0 1081 ld1 {v23.4s}, [r8], #16 @ load rk5 1082 1083 aese q1, v19.16b 1084 aesmc q1, q1 @ AES block 1 - round 1 1085 ld1 {v24.4s}, [r8], #16 @ load rk6 1086 1087 aese q3, v18.16b 1088 aesmc q3, q3 @ AES block 3 - round 0 1089 1090 aese q2, v19.16b 1091 aesmc q2, q2 @ AES block 2 - round 1 1092 1093 aese q1, v20.16b 1094 aesmc q1, q1 @ AES block 1 - round 2 1095 1096 aese q3, v19.16b 1097 aesmc q3, q3 @ AES block 3 - round 1 1098 ld1 { v11.16b}, [r3] 1099 ext v11.16b, v11.16b, v11.16b, #8 1100 rev64 v11.16b, v11.16b 1101 1102 aese q0, v21.16b 1103 aesmc q0, q0 @ AES block 0 - round 3 1104 ld1 {v25.4s}, [r8], #16 @ load rk7 1105 1106 aese q1, v21.16b 1107 aesmc q1, q1 @ AES block 1 - round 3 1108 1109 aese q3, v20.16b 1110 aesmc q3, q3 @ AES block 3 - round 2 1111 1112 aese q2, v20.16b 1113 aesmc q2, q2 @ AES block 2 - round 2 1114 ld1 {v26.4s}, [r8], #16 @ load rk8 1115 1116 aese q1, v22.16b 1117 aesmc q1, q1 @ AES block 1 - round 4 1118 1119 aese q3, v21.16b 1120 aesmc q3, q3 @ AES block 3 - round 3 1121 1122 aese q2, v21.16b 1123 aesmc q2, q2 @ AES block 2 - round 3 1124 ldr q14, [r3, #80] @ load h3l | h3h 1125#ifndef __ARMEB__ 1126 ext v14.16b, v14.16b, v14.16b, #8 1127#endif 1128 aese q0, v22.16b 1129 aesmc q0, q0 @ AES block 0 - round 4 1130 ld1 {v27.4s}, [r8], #16 @ load rk9 1131 1132 aese q1, v23.16b 1133 aesmc q1, q1 @ AES block 1 - round 5 1134 1135 aese q2, v22.16b 1136 aesmc q2, q2 @ AES block 2 - 
round 4 1137 1138 aese q3, v22.16b 1139 aesmc q3, q3 @ AES block 3 - round 4 1140 1141 aese q0, v23.16b 1142 aesmc q0, q0 @ AES block 0 - round 5 1143 1144 aese q2, v23.16b 1145 aesmc q2, q2 @ AES block 2 - round 5 1146 ldr q12, [r3, #32] @ load h1l | h1h 1147#ifndef __ARMEB__ 1148 ext v12.16b, v12.16b, v12.16b, #8 1149#endif 1150 aese q3, v23.16b 1151 aesmc q3, q3 @ AES block 3 - round 5 1152 1153 aese q0, v24.16b 1154 aesmc q0, q0 @ AES block 0 - round 6 1155 1156 aese q1, v24.16b 1157 aesmc q1, q1 @ AES block 1 - round 6 1158 1159 aese q3, v24.16b 1160 aesmc q3, q3 @ AES block 3 - round 6 1161 1162 aese q2, v24.16b 1163 aesmc q2, q2 @ AES block 2 - round 6 1164 trn1 q8, v12.2d, v13.2d @ h2h | h1h 1165 1166 ldr q15, [r3, #112] @ load h4l | h4h 1167#ifndef __ARMEB__ 1168 ext v15.16b, v15.16b, v15.16b, #8 1169#endif 1170 trn2 v16.2d, v12.2d, v13.2d @ h2l | h1l 1171 add r5, r5, r0 1172 1173 aese q1, v25.16b 1174 aesmc q1, q1 @ AES block 1 - round 7 1175 1176 aese q2, v25.16b 1177 aesmc q2, q2 @ AES block 2 - round 7 1178 1179 aese q0, v25.16b 1180 aesmc q0, q0 @ AES block 0 - round 7 1181 eor v16.16b, v16.16b, q8 @ h2k | h1k 1182 1183 aese q3, v25.16b 1184 aesmc q3, q3 @ AES block 3 - round 7 1185 1186 aese q1, v26.16b 1187 aesmc q1, q1 @ AES block 1 - round 8 1188 trn2 v17.2d, v14.2d, v15.2d @ h4l | h3l 1189 1190 aese q2, v26.16b 1191 aesmc q2, q2 @ AES block 2 - round 8 1192 1193 aese q3, v26.16b 1194 aesmc q3, q3 @ AES block 3 - round 8 1195 1196 aese q0, v26.16b 1197 aesmc q0, q0 @ AES block 0 - round 8 1198 trn1 q9, v14.2d, v15.2d @ h4h | h3h 1199 1200 aese q2, v27.16b @ AES block 2 - round 9 1201 1202 aese q3, v27.16b @ AES block 3 - round 9 1203 1204 aese q0, v27.16b @ AES block 0 - round 9 1205 cmp r0, r5 @ check if we have <= 4 blocks 1206 1207 aese q1, v27.16b @ AES block 1 - round 9 1208 eor v17.16b, v17.16b, q9 @ h4k | h3k 1209 bge .L128_dec_tail @ handle tail 1210 1211 ld1 {q4, q5}, [r0], #32 @ AES block 0 - load ciphertext; AES block 1 - load 
ciphertext 1212 1213 eor q1, q5, q1 @ AES block 1 - result 1214 ld1 {q6}, [r0], #16 @ AES block 2 - load ciphertext 1215 1216 eor q0, q4, q0 @ AES block 0 - result 1217 rev64 q4, q4 @ GHASH block 0 1218 rev r9, r12 @ CTR block 4 1219 1220 orr r9, r11, r9, lsl #32 @ CTR block 4 1221 add r12, r12, #1 @ CTR block 4 1222 ld1 {q7}, [r0], #16 @ AES block 3 - load ciphertext 1223 1224 rev64 q5, q5 @ GHASH block 1 1225 mov r19, v1.d[0] @ AES block 1 - mov low 1226 1227 mov r20, v1.d[1] @ AES block 1 - mov high 1228 1229 mov r6, v0.d[0] @ AES block 0 - mov low 1230 cmp r0, r5 @ check if we have <= 8 blocks 1231 1232 mov r7, v0.d[1] @ AES block 0 - mov high 1233 1234 fmov d0, r10 @ CTR block 4 1235 1236 fmov v0.d[1], r9 @ CTR block 4 1237 rev r9, r12 @ CTR block 5 1238 eor r19, r19, r13 @ AES block 1 - round 10 low 1239#ifdef __ARMEB__ 1240 rev r19, r19 1241#endif 1242 fmov d1, r10 @ CTR block 5 1243 add r12, r12, #1 @ CTR block 5 1244 orr r9, r11, r9, lsl #32 @ CTR block 5 1245 1246 fmov v1.d[1], r9 @ CTR block 5 1247 rev r9, r12 @ CTR block 6 1248 add r12, r12, #1 @ CTR block 6 1249 1250 orr r9, r11, r9, lsl #32 @ CTR block 6 1251 1252 eor r20, r20, r14 @ AES block 1 - round 10 high 1253#ifdef __ARMEB__ 1254 rev r20, r20 1255#endif 1256 eor r6, r6, r13 @ AES block 0 - round 10 low 1257#ifdef __ARMEB__ 1258 rev r6, r6 1259#endif 1260 eor q2, q6, q2 @ AES block 2 - result 1261 1262 eor r7, r7, r14 @ AES block 0 - round 10 high 1263#ifdef __ARMEB__ 1264 rev r7, r7 1265#endif 1266 stp r6, r7, [r2], #16 @ AES block 0 - store result 1267 1268 stp r19, r20, [r2], #16 @ AES block 1 - store result 1269 bge .L128_dec_prepretail @ do prepretail 1270 1271.L128_dec_main_loop:@ main loop start 1272 eor q3, q7, q3 @ AES block 4k+3 - result 1273 ext v11.16b, v11.16b, v11.16b, #8 @ PRE 0 1274 mov r21, v2.d[0] @ AES block 4k+2 - mov low 1275 1276 pmull2 v28.1q, q5, v14.2d @ GHASH block 4k+1 - high 1277 mov r22, v2.d[1] @ AES block 4k+2 - mov high 1278 1279 aese q1, v18.16b 1280 aesmc q1, q1 
@ AES block 4k+5 - round 0 1281 fmov d2, r10 @ CTR block 4k+6 1282 1283 rev64 q6, q6 @ GHASH block 4k+2 1284 fmov v2.d[1], r9 @ CTR block 4k+6 1285 rev r9, r12 @ CTR block 4k+7 1286 1287 mov r23, v3.d[0] @ AES block 4k+3 - mov low 1288 eor q4, q4, v11.16b @ PRE 1 1289 mov d30, v5.d[1] @ GHASH block 4k+1 - mid 1290 1291 aese q1, v19.16b 1292 aesmc q1, q1 @ AES block 4k+5 - round 1 1293 rev64 q7, q7 @ GHASH block 4k+3 1294 1295 pmull v29.1q, q5, v14.1d @ GHASH block 4k+1 - low 1296 mov r24, v3.d[1] @ AES block 4k+3 - mov high 1297 orr r9, r11, r9, lsl #32 @ CTR block 4k+7 1298 1299 pmull v11.1q, q4, v15.1d @ GHASH block 4k - low 1300 fmov d3, r10 @ CTR block 4k+7 1301 eor v30.8b, v30.8b, q5 @ GHASH block 4k+1 - mid 1302 1303 aese q1, v20.16b 1304 aesmc q1, q1 @ AES block 4k+5 - round 2 1305 fmov v3.d[1], r9 @ CTR block 4k+7 1306 1307 aese q2, v18.16b 1308 aesmc q2, q2 @ AES block 4k+6 - round 0 1309 mov d10, v17.d[1] @ GHASH block 4k - mid 1310 1311 pmull2 v9.1q, q4, v15.2d @ GHASH block 4k - high 1312 eor v11.16b, v11.16b, v29.16b @ GHASH block 4k+1 - low 1313 1314 pmull v29.1q, q7, v12.1d @ GHASH block 4k+3 - low 1315 1316 aese q1, v21.16b 1317 aesmc q1, q1 @ AES block 4k+5 - round 3 1318 mov d8, v4.d[1] @ GHASH block 4k - mid 1319 1320 aese q3, v18.16b 1321 aesmc q3, q3 @ AES block 4k+7 - round 0 1322 eor q9, q9, v28.16b @ GHASH block 4k+1 - high 1323 1324 aese q0, v18.16b 1325 aesmc q0, q0 @ AES block 4k+4 - round 0 1326 1327 pmull v28.1q, q6, v13.1d @ GHASH block 4k+2 - low 1328 eor q8, q8, q4 @ GHASH block 4k - mid 1329 1330 aese q3, v19.16b 1331 aesmc q3, q3 @ AES block 4k+7 - round 1 1332 eor r23, r23, r13 @ AES block 4k+3 - round 10 low 1333#ifdef __ARMEB__ 1334 rev r23, r23 1335#endif 1336 pmull v30.1q, v30.1d, v17.1d @ GHASH block 4k+1 - mid 1337 eor r22, r22, r14 @ AES block 4k+2 - round 10 high 1338#ifdef __ARMEB__ 1339 rev r22, r22 1340#endif 1341 mov d31, v6.d[1] @ GHASH block 4k+2 - mid 1342 1343 aese q0, v19.16b 1344 aesmc q0, q0 @ AES block 4k+4 - 
round 1 1345 eor v11.16b, v11.16b, v28.16b @ GHASH block 4k+2 - low 1346 1347 pmull v10.1q, q8, v10.1d @ GHASH block 4k - mid 1348 1349 aese q3, v20.16b 1350 aesmc q3, q3 @ AES block 4k+7 - round 2 1351 eor v31.8b, v31.8b, q6 @ GHASH block 4k+2 - mid 1352 1353 aese q0, v20.16b 1354 aesmc q0, q0 @ AES block 4k+4 - round 2 1355 1356 aese q1, v22.16b 1357 aesmc q1, q1 @ AES block 4k+5 - round 4 1358 eor v10.16b, v10.16b, v30.16b @ GHASH block 4k+1 - mid 1359 1360 pmull2 v8.1q, q6, v13.2d @ GHASH block 4k+2 - high 1361 1362 aese q0, v21.16b 1363 aesmc q0, q0 @ AES block 4k+4 - round 3 1364 ins v31.d[1], v31.d[0] @ GHASH block 4k+2 - mid 1365 1366 pmull2 v4.1q, q7, v12.2d @ GHASH block 4k+3 - high 1367 1368 aese q2, v19.16b 1369 aesmc q2, q2 @ AES block 4k+6 - round 1 1370 mov d30, v7.d[1] @ GHASH block 4k+3 - mid 1371 1372 aese q0, v22.16b 1373 aesmc q0, q0 @ AES block 4k+4 - round 4 1374 eor q9, q9, q8 @ GHASH block 4k+2 - high 1375 1376 pmull2 v31.1q, v31.2d, v16.2d @ GHASH block 4k+2 - mid 1377 eor r24, r24, r14 @ AES block 4k+3 - round 10 high 1378#ifdef __ARMEB__ 1379 rev r24, r24 1380#endif 1381 aese q2, v20.16b 1382 aesmc q2, q2 @ AES block 4k+6 - round 2 1383 eor v30.8b, v30.8b, q7 @ GHASH block 4k+3 - mid 1384 1385 aese q1, v23.16b 1386 aesmc q1, q1 @ AES block 4k+5 - round 5 1387 eor r21, r21, r13 @ AES block 4k+2 - round 10 low 1388#ifdef __ARMEB__ 1389 rev r21, r21 1390#endif 1391 aese q0, v23.16b 1392 aesmc q0, q0 @ AES block 4k+4 - round 5 1393 movi q8, #0xc2 1394 1395 aese q2, v21.16b 1396 aesmc q2, q2 @ AES block 4k+6 - round 3 1397 eor v11.16b, v11.16b, v29.16b @ GHASH block 4k+3 - low 1398 1399 aese q1, v24.16b 1400 aesmc q1, q1 @ AES block 4k+5 - round 6 1401 1402 aese q0, v24.16b 1403 aesmc q0, q0 @ AES block 4k+4 - round 6 1404 eor v10.16b, v10.16b, v31.16b @ GHASH block 4k+2 - mid 1405 1406 aese q2, v22.16b 1407 aesmc q2, q2 @ AES block 4k+6 - round 4 1408 stp r21, r22, [r2], #16 @ AES block 4k+2 - store result 1409 1410 pmull v30.1q, v30.1d, 
v16.1d @ GHASH block 4k+3 - mid 1411 eor q9, q9, q4 @ GHASH block 4k+3 - high 1412 ld1 {q4}, [r0], #16 @ AES block 4k+3 - load ciphertext 1413 1414 aese q1, v25.16b 1415 aesmc q1, q1 @ AES block 4k+5 - round 7 1416 add r12, r12, #1 @ CTR block 4k+7 1417 1418 aese q0, v25.16b 1419 aesmc q0, q0 @ AES block 4k+4 - round 7 1420 shl d8, d8, #56 @ mod_constant 1421 1422 aese q2, v23.16b 1423 aesmc q2, q2 @ AES block 4k+6 - round 5 1424 eor v10.16b, v10.16b, v30.16b @ GHASH block 4k+3 - mid 1425 1426 aese q1, v26.16b 1427 aesmc q1, q1 @ AES block 4k+5 - round 8 1428 stp r23, r24, [r2], #16 @ AES block 4k+3 - store result 1429 1430 aese q0, v26.16b 1431 aesmc q0, q0 @ AES block 4k+4 - round 8 1432 eor v30.16b, v11.16b, q9 @ MODULO - karatsuba tidy up 1433 1434 aese q3, v21.16b 1435 aesmc q3, q3 @ AES block 4k+7 - round 3 1436 rev r9, r12 @ CTR block 4k+8 1437 1438 pmull v31.1q, q9, q8 @ MODULO - top 64b align with mid 1439 ld1 {q5}, [r0], #16 @ AES block 4k+4 - load ciphertext 1440 ext q9, q9, q9, #8 @ MODULO - other top alignment 1441 1442 aese q0, v27.16b @ AES block 4k+4 - round 9 1443 orr r9, r11, r9, lsl #32 @ CTR block 4k+8 1444 1445 aese q3, v22.16b 1446 aesmc q3, q3 @ AES block 4k+7 - round 4 1447 eor v10.16b, v10.16b, v30.16b @ MODULO - karatsuba tidy up 1448 1449 aese q1, v27.16b @ AES block 4k+5 - round 9 1450 1451 aese q2, v24.16b 1452 aesmc q2, q2 @ AES block 4k+6 - round 6 1453 eor q0, q4, q0 @ AES block 4k+4 - result 1454 1455 aese q3, v23.16b 1456 aesmc q3, q3 @ AES block 4k+7 - round 5 1457 ld1 {q6}, [r0], #16 @ AES block 4k+5 - load ciphertext 1458 1459 add r12, r12, #1 @ CTR block 4k+8 1460 eor v10.16b, v10.16b, v31.16b @ MODULO - fold into mid 1461 eor q1, q5, q1 @ AES block 4k+5 - result 1462 1463 aese q2, v25.16b 1464 aesmc q2, q2 @ AES block 4k+6 - round 7 1465 ld1 {q7}, [r0], #16 @ AES block 4k+6 - load ciphertext 1466 1467 aese q3, v24.16b 1468 aesmc q3, q3 @ AES block 4k+7 - round 6 1469 1470 rev64 q5, q5 @ GHASH block 4k+5 1471 eor v10.16b, 
v10.16b, q9 @ MODULO - fold into mid 1472 mov r7, v0.d[1] @ AES block 4k+4 - mov high 1473 1474 aese q2, v26.16b 1475 aesmc q2, q2 @ AES block 4k+6 - round 8 1476 mov r6, v0.d[0] @ AES block 4k+4 - mov low 1477 1478 aese q3, v25.16b 1479 aesmc q3, q3 @ AES block 4k+7 - round 7 1480 fmov d0, r10 @ CTR block 4k+8 1481 1482 pmull v8.1q, v10.1d, q8 @ MODULO - mid 64b align with low 1483 fmov v0.d[1], r9 @ CTR block 4k+8 1484 rev r9, r12 @ CTR block 4k+9 1485 1486 aese q2, v27.16b @ AES block 4k+6 - round 9 1487 orr r9, r11, r9, lsl #32 @ CTR block 4k+9 1488 ext v10.16b, v10.16b, v10.16b, #8 @ MODULO - other mid alignment 1489 1490 aese q3, v26.16b 1491 aesmc q3, q3 @ AES block 4k+7 - round 8 1492 eor r7, r7, r14 @ AES block 4k+4 - round 10 high 1493#ifdef __ARMEB__ 1494 rev r7, r7 1495#endif 1496 eor v11.16b, v11.16b, q8 @ MODULO - fold into low 1497 mov r20, v1.d[1] @ AES block 4k+5 - mov high 1498 eor r6, r6, r13 @ AES block 4k+4 - round 10 low 1499#ifdef __ARMEB__ 1500 rev r6, r6 1501#endif 1502 eor q2, q6, q2 @ AES block 4k+6 - result 1503 mov r19, v1.d[0] @ AES block 4k+5 - mov low 1504 add r12, r12, #1 @ CTR block 4k+9 1505 1506 aese q3, v27.16b @ AES block 4k+7 - round 9 1507 fmov d1, r10 @ CTR block 4k+9 1508 cmp r0, r5 @ .LOOP CONTROL 1509 1510 rev64 q4, q4 @ GHASH block 4k+4 1511 eor v11.16b, v11.16b, v10.16b @ MODULO - fold into low 1512 fmov v1.d[1], r9 @ CTR block 4k+9 1513 1514 rev r9, r12 @ CTR block 4k+10 1515 add r12, r12, #1 @ CTR block 4k+10 1516 1517 eor r20, r20, r14 @ AES block 4k+5 - round 10 high 1518#ifdef __ARMEB__ 1519 rev r20, r20 1520#endif 1521 stp r6, r7, [r2], #16 @ AES block 4k+4 - store result 1522 1523 eor r19, r19, r13 @ AES block 4k+5 - round 10 low 1524#ifdef __ARMEB__ 1525 rev r19, r19 1526#endif 1527 stp r19, r20, [r2], #16 @ AES block 4k+5 - store result 1528 1529 orr r9, r11, r9, lsl #32 @ CTR block 4k+10 1530 blt .L128_dec_main_loop 1531 1532.L128_dec_prepretail:@ PREPRETAIL 1533 ext v11.16b, v11.16b, v11.16b, #8 @ PRE 0 1534 
mov r21, v2.d[0] @ AES block 4k+2 - mov low 1535 mov d30, v5.d[1] @ GHASH block 4k+1 - mid 1536 1537 aese q0, v18.16b 1538 aesmc q0, q0 @ AES block 4k+4 - round 0 1539 eor q3, q7, q3 @ AES block 4k+3 - result 1540 1541 aese q1, v18.16b 1542 aesmc q1, q1 @ AES block 4k+5 - round 0 1543 mov r22, v2.d[1] @ AES block 4k+2 - mov high 1544 1545 eor q4, q4, v11.16b @ PRE 1 1546 fmov d2, r10 @ CTR block 4k+6 1547 rev64 q6, q6 @ GHASH block 4k+2 1548 1549 aese q0, v19.16b 1550 aesmc q0, q0 @ AES block 4k+4 - round 1 1551 fmov v2.d[1], r9 @ CTR block 4k+6 1552 1553 rev r9, r12 @ CTR block 4k+7 1554 mov r23, v3.d[0] @ AES block 4k+3 - mov low 1555 eor v30.8b, v30.8b, q5 @ GHASH block 4k+1 - mid 1556 1557 pmull v11.1q, q4, v15.1d @ GHASH block 4k - low 1558 mov d10, v17.d[1] @ GHASH block 4k - mid 1559 mov r24, v3.d[1] @ AES block 4k+3 - mov high 1560 1561 aese q1, v19.16b 1562 aesmc q1, q1 @ AES block 4k+5 - round 1 1563 mov d31, v6.d[1] @ GHASH block 4k+2 - mid 1564 1565 aese q0, v20.16b 1566 aesmc q0, q0 @ AES block 4k+4 - round 2 1567 orr r9, r11, r9, lsl #32 @ CTR block 4k+7 1568 1569 pmull v29.1q, q5, v14.1d @ GHASH block 4k+1 - low 1570 mov d8, v4.d[1] @ GHASH block 4k - mid 1571 fmov d3, r10 @ CTR block 4k+7 1572 1573 aese q2, v18.16b 1574 aesmc q2, q2 @ AES block 4k+6 - round 0 1575 fmov v3.d[1], r9 @ CTR block 4k+7 1576 1577 pmull v30.1q, v30.1d, v17.1d @ GHASH block 4k+1 - mid 1578 eor v31.8b, v31.8b, q6 @ GHASH block 4k+2 - mid 1579 1580 rev64 q7, q7 @ GHASH block 4k+3 1581 1582 aese q2, v19.16b 1583 aesmc q2, q2 @ AES block 4k+6 - round 1 1584 eor q8, q8, q4 @ GHASH block 4k - mid 1585 1586 pmull2 v9.1q, q4, v15.2d @ GHASH block 4k - high 1587 1588 aese q3, v18.16b 1589 aesmc q3, q3 @ AES block 4k+7 - round 0 1590 ins v31.d[1], v31.d[0] @ GHASH block 4k+2 - mid 1591 1592 pmull2 v28.1q, q5, v14.2d @ GHASH block 4k+1 - high 1593 1594 pmull v10.1q, q8, v10.1d @ GHASH block 4k - mid 1595 eor v11.16b, v11.16b, v29.16b @ GHASH block 4k+1 - low 1596 1597 pmull v29.1q, 
q7, v12.1d @ GHASH block 4k+3 - low 1598 1599 pmull2 v31.1q, v31.2d, v16.2d @ GHASH block 4k+2 - mid 1600 eor q9, q9, v28.16b @ GHASH block 4k+1 - high 1601 1602 eor v10.16b, v10.16b, v30.16b @ GHASH block 4k+1 - mid 1603 1604 pmull2 v4.1q, q7, v12.2d @ GHASH block 4k+3 - high 1605 1606 pmull2 v8.1q, q6, v13.2d @ GHASH block 4k+2 - high 1607 mov d30, v7.d[1] @ GHASH block 4k+3 - mid 1608 1609 aese q1, v20.16b 1610 aesmc q1, q1 @ AES block 4k+5 - round 2 1611 eor v10.16b, v10.16b, v31.16b @ GHASH block 4k+2 - mid 1612 1613 pmull v28.1q, q6, v13.1d @ GHASH block 4k+2 - low 1614 1615 eor q9, q9, q8 @ GHASH block 4k+2 - high 1616 movi q8, #0xc2 1617 1618 aese q3, v19.16b 1619 aesmc q3, q3 @ AES block 4k+7 - round 1 1620 eor v30.8b, v30.8b, q7 @ GHASH block 4k+3 - mid 1621 1622 eor v11.16b, v11.16b, v28.16b @ GHASH block 4k+2 - low 1623 1624 aese q2, v20.16b 1625 aesmc q2, q2 @ AES block 4k+6 - round 2 1626 eor q9, q9, q4 @ GHASH block 4k+3 - high 1627 1628 aese q3, v20.16b 1629 aesmc q3, q3 @ AES block 4k+7 - round 2 1630 eor r23, r23, r13 @ AES block 4k+3 - round 10 low 1631#ifdef __ARMEB__ 1632 rev r23, r23 1633#endif 1634 pmull v30.1q, v30.1d, v16.1d @ GHASH block 4k+3 - mid 1635 eor r21, r21, r13 @ AES block 4k+2 - round 10 low 1636#ifdef __ARMEB__ 1637 rev r21, r21 1638#endif 1639 eor v11.16b, v11.16b, v29.16b @ GHASH block 4k+3 - low 1640 1641 aese q2, v21.16b 1642 aesmc q2, q2 @ AES block 4k+6 - round 3 1643 1644 aese q1, v21.16b 1645 aesmc q1, q1 @ AES block 4k+5 - round 3 1646 shl d8, d8, #56 @ mod_constant 1647 1648 aese q0, v21.16b 1649 aesmc q0, q0 @ AES block 4k+4 - round 3 1650 1651 aese q2, v22.16b 1652 aesmc q2, q2 @ AES block 4k+6 - round 4 1653 eor v10.16b, v10.16b, v30.16b @ GHASH block 4k+3 - mid 1654 1655 aese q1, v22.16b 1656 aesmc q1, q1 @ AES block 4k+5 - round 4 1657 1658 aese q3, v21.16b 1659 aesmc q3, q3 @ AES block 4k+7 - round 3 1660 eor v30.16b, v11.16b, q9 @ MODULO - karatsuba tidy up 1661 1662 aese q2, v23.16b 1663 aesmc q2, q2 @ AES 
block 4k+6 - round 5 1664 1665 aese q1, v23.16b 1666 aesmc q1, q1 @ AES block 4k+5 - round 5 1667 1668 aese q3, v22.16b 1669 aesmc q3, q3 @ AES block 4k+7 - round 4 1670 1671 aese q0, v22.16b 1672 aesmc q0, q0 @ AES block 4k+4 - round 4 1673 eor v10.16b, v10.16b, v30.16b @ MODULO - karatsuba tidy up 1674 1675 pmull v31.1q, q9, q8 @ MODULO - top 64b align with mid 1676 1677 aese q1, v24.16b 1678 aesmc q1, q1 @ AES block 4k+5 - round 6 1679 ext q9, q9, q9, #8 @ MODULO - other top alignment 1680 1681 aese q3, v23.16b 1682 aesmc q3, q3 @ AES block 4k+7 - round 5 1683 1684 aese q0, v23.16b 1685 aesmc q0, q0 @ AES block 4k+4 - round 5 1686 eor v10.16b, v10.16b, v31.16b @ MODULO - fold into mid 1687 1688 aese q1, v25.16b 1689 aesmc q1, q1 @ AES block 4k+5 - round 7 1690 1691 aese q2, v24.16b 1692 aesmc q2, q2 @ AES block 4k+6 - round 6 1693 1694 aese q0, v24.16b 1695 aesmc q0, q0 @ AES block 4k+4 - round 6 1696 1697 aese q1, v26.16b 1698 aesmc q1, q1 @ AES block 4k+5 - round 8 1699 eor v10.16b, v10.16b, q9 @ MODULO - fold into mid 1700 1701 aese q3, v24.16b 1702 aesmc q3, q3 @ AES block 4k+7 - round 6 1703 1704 aese q0, v25.16b 1705 aesmc q0, q0 @ AES block 4k+4 - round 7 1706 1707 aese q1, v27.16b @ AES block 4k+5 - round 9 1708 1709 pmull v8.1q, v10.1d, q8 @ MODULO - mid 64b align with low 1710 eor r24, r24, r14 @ AES block 4k+3 - round 10 high 1711#ifdef __ARMEB__ 1712 rev r24, r24 1713#endif 1714 aese q2, v25.16b 1715 aesmc q2, q2 @ AES block 4k+6 - round 7 1716 ext v10.16b, v10.16b, v10.16b, #8 @ MODULO - other mid alignment 1717 1718 aese q3, v25.16b 1719 aesmc q3, q3 @ AES block 4k+7 - round 7 1720 1721 aese q0, v26.16b 1722 aesmc q0, q0 @ AES block 4k+4 - round 8 1723 eor v11.16b, v11.16b, q8 @ MODULO - fold into low 1724 1725 aese q2, v26.16b 1726 aesmc q2, q2 @ AES block 4k+6 - round 8 1727 1728 aese q3, v26.16b 1729 aesmc q3, q3 @ AES block 4k+7 - round 8 1730 eor r22, r22, r14 @ AES block 4k+2 - round 10 high 1731#ifdef __ARMEB__ 1732 rev r22, r22 1733#endif 
1734 aese q0, v27.16b @ AES block 4k+4 - round 9 1735 stp r21, r22, [r2], #16 @ AES block 4k+2 - store result 1736 1737 aese q2, v27.16b @ AES block 4k+6 - round 9 1738 add r12, r12, #1 @ CTR block 4k+7 1739 stp r23, r24, [r2], #16 @ AES block 4k+3 - store result 1740 1741 aese q3, v27.16b @ AES block 4k+7 - round 9 1742 eor v11.16b, v11.16b, v10.16b @ MODULO - fold into low 1743.L128_dec_tail:@ TAIL 1744 1745 sub r5, r4, r0 @ main_end_input_ptr is number of bytes left to process 1746 ld1 { q5}, [r0], #16 @ AES block 4k+4 - load ciphertext 1747 1748 eor q0, q5, q0 @ AES block 4k+4 - result 1749 1750 mov r7, v0.d[1] @ AES block 4k+4 - mov high 1751 1752 mov r6, v0.d[0] @ AES block 4k+4 - mov low 1753 1754 cmp r5, #48 1755 1756 eor r7, r7, r14 @ AES block 4k+4 - round 10 high 1757#ifdef __ARMEB__ 1758 rev r7, r7 1759#endif 1760 ext q8, v11.16b, v11.16b, #8 @ prepare final partial tag 1761 eor r6, r6, r13 @ AES block 4k+4 - round 10 low 1762#ifdef __ARMEB__ 1763 rev r6, r6 1764#endif 1765 bgt .L128_dec_blocks_more_than_3 1766 1767 mov q3, q2 1768 sub r12, r12, #1 1769 movi v11.8b, #0 1770 1771 movi q9, #0 1772 mov q2, q1 1773 1774 movi v10.8b, #0 1775 cmp r5, #32 1776 bgt .L128_dec_blocks_more_than_2 1777 1778 cmp r5, #16 1779 1780 mov q3, q1 1781 sub r12, r12, #1 1782 bgt .L128_dec_blocks_more_than_1 1783 1784 sub r12, r12, #1 1785 b .L128_dec_blocks_less_than_1 1786.L128_dec_blocks_more_than_3:@ blocks left > 3 1787 rev64 q4, q5 @ GHASH final-3 block 1788 ld1 { q5}, [r0], #16 @ AES final-2 block - load ciphertext 1789 1790 eor q4, q4, q8 @ feed in partial tag 1791 1792 mov d10, v17.d[1] @ GHASH final-3 block - mid 1793 stp r6, r7, [r2], #16 @ AES final-3 block - store result 1794 eor q0, q5, q1 @ AES final-2 block - result 1795 1796 mov d22, v4.d[1] @ GHASH final-3 block - mid 1797 mov r7, v0.d[1] @ AES final-2 block - mov high 1798 1799 pmull v11.1q, q4, v15.1d @ GHASH final-3 block - low 1800 mov r6, v0.d[0] @ AES final-2 block - mov low 1801 1802 pmull2 v9.1q, 
q4, v15.2d @ GHASH final-3 block - high 1803 1804 eor v22.8b, v22.8b, q4 @ GHASH final-3 block - mid 1805 1806 movi q8, #0 @ suppress further partial tag feed in 1807 eor r7, r7, r14 @ AES final-2 block - round 10 high 1808#ifdef __ARMEB__ 1809 rev r7, r7 1810#endif 1811 pmull v10.1q, v22.1d, v10.1d @ GHASH final-3 block - mid 1812 eor r6, r6, r13 @ AES final-2 block - round 10 low 1813#ifdef __ARMEB__ 1814 rev r6, r6 1815#endif 1816.L128_dec_blocks_more_than_2:@ blocks left > 2 1817 1818 rev64 q4, q5 @ GHASH final-2 block 1819 ld1 { q5}, [r0], #16 @ AES final-1 block - load ciphertext 1820 1821 eor q4, q4, q8 @ feed in partial tag 1822 1823 eor q0, q5, q2 @ AES final-1 block - result 1824 stp r6, r7, [r2], #16 @ AES final-2 block - store result 1825 1826 mov d22, v4.d[1] @ GHASH final-2 block - mid 1827 1828 pmull v21.1q, q4, v14.1d @ GHASH final-2 block - low 1829 1830 pmull2 v20.1q, q4, v14.2d @ GHASH final-2 block - high 1831 mov r6, v0.d[0] @ AES final-1 block - mov low 1832 1833 mov r7, v0.d[1] @ AES final-1 block - mov high 1834 eor v22.8b, v22.8b, q4 @ GHASH final-2 block - mid 1835 1836 movi q8, #0 @ suppress further partial tag feed in 1837 1838 pmull v22.1q, v22.1d, v17.1d @ GHASH final-2 block - mid 1839 1840 eor r6, r6, r13 @ AES final-1 block - round 10 low 1841#ifdef __ARMEB__ 1842 rev r6, r6 1843#endif 1844 eor v11.16b, v11.16b, v21.16b @ GHASH final-2 block - low 1845 1846 eor q9, q9, v20.16b @ GHASH final-2 block - high 1847 1848 eor v10.16b, v10.16b, v22.16b @ GHASH final-2 block - mid 1849 eor r7, r7, r14 @ AES final-1 block - round 10 high 1850#ifdef __ARMEB__ 1851 rev r7, r7 1852#endif 1853.L128_dec_blocks_more_than_1:@ blocks left > 1 1854 1855 rev64 q4, q5 @ GHASH final-1 block 1856 1857 ld1 { q5}, [r0], #16 @ AES final block - load ciphertext 1858 eor q4, q4, q8 @ feed in partial tag 1859 1860 mov d22, v4.d[1] @ GHASH final-1 block - mid 1861 1862 eor q0, q5, q3 @ AES final block - result 1863 1864 eor v22.8b, v22.8b, q4 @ GHASH final-1 
block - mid 1865 1866 stp r6, r7, [r2], #16 @ AES final-1 block - store result 1867 mov r6, v0.d[0] @ AES final block - mov low 1868 1869 mov r7, v0.d[1] @ AES final block - mov high 1870 ins v22.d[1], v22.d[0] @ GHASH final-1 block - mid 1871 1872 pmull v21.1q, q4, v13.1d @ GHASH final-1 block - low 1873 1874 pmull2 v20.1q, q4, v13.2d @ GHASH final-1 block - high 1875 1876 pmull2 v22.1q, v22.2d, v16.2d @ GHASH final-1 block - mid 1877 movi q8, #0 @ suppress further partial tag feed in 1878 1879 eor v11.16b, v11.16b, v21.16b @ GHASH final-1 block - low 1880 1881 eor q9, q9, v20.16b @ GHASH final-1 block - high 1882 eor r7, r7, r14 @ AES final block - round 10 high 1883#ifdef __ARMEB__ 1884 rev r7, r7 1885#endif 1886 eor r6, r6, r13 @ AES final block - round 10 low 1887#ifdef __ARMEB__ 1888 rev r6, r6 1889#endif 1890 eor v10.16b, v10.16b, v22.16b @ GHASH final-1 block - mid 1891.L128_dec_blocks_less_than_1:@ blocks left <= 1 1892 1893 mvn r14, xzr @ rk10_h = 0xffffffffffffffff 1894 and r1, r1, #127 @ bit_length %= 128 1895 1896 mvn r13, xzr @ rk10_l = 0xffffffffffffffff 1897 sub r1, r1, #128 @ bit_length -= 128 1898 1899 neg r1, r1 @ bit_length = 128 - #bits in input (in range [1,128]) 1900 1901 and r1, r1, #127 @ bit_length %= 128 1902 1903 lsr r14, r14, r1 @ rk10_h is mask for top 64b of last block 1904 cmp r1, #64 1905 1906 csel r10, r14, xzr, lt 1907 csel r9, r13, r14, lt 1908 1909 fmov d0, r9 @ ctr0b is mask for last block 1910 1911 mov v0.d[1], r10 1912 1913 and q5, q5, q0 @ possibly partial last block has zeroes in highest bits 1914 1915 rev64 q4, q5 @ GHASH final block 1916 1917 eor q4, q4, q8 @ feed in partial tag 1918 1919 ldp r4, r5, [r2] @ load existing bytes we need to not overwrite 1920 1921 and r7, r7, r10 1922 1923 pmull2 v20.1q, q4, v12.2d @ GHASH final block - high 1924 mov d8, v4.d[1] @ GHASH final block - mid 1925 1926 eor q8, q8, q4 @ GHASH final block - mid 1927 eor q9, q9, v20.16b @ GHASH final block - high 1928 1929 pmull v8.1q, q8, v16.1d @ 
GHASH final block - mid 1930 1931 pmull v21.1q, q4, v12.1d @ GHASH final block - low 1932 bic r4, r4, r9 @ mask out low existing bytes 1933 and r6, r6, r9 1934 1935#ifndef __ARMEB__ 1936 rev r9, r12 1937#else 1938 mov r9, r12 1939#endif 1940 1941 eor v10.16b, v10.16b, q8 @ GHASH final block - mid 1942 movi q8, #0xc2 1943 1944 eor v11.16b, v11.16b, v21.16b @ GHASH final block - low 1945 1946 bic r5, r5, r10 @ mask out high existing bytes 1947 shl d8, d8, #56 @ mod_constant 1948 1949 eor v30.16b, v11.16b, q9 @ MODULO - karatsuba tidy up 1950 1951 pmull v31.1q, q9, q8 @ MODULO - top 64b align with mid 1952 1953 eor v10.16b, v10.16b, v30.16b @ MODULO - karatsuba tidy up 1954 1955 orr r6, r6, r4 1956 str r9, [r16, #12] @ store the updated counter 1957 1958 orr r7, r7, r5 1959 stp r6, r7, [r2] 1960 ext q9, q9, q9, #8 @ MODULO - other top alignment 1961 1962 eor v10.16b, v10.16b, v31.16b @ MODULO - fold into mid 1963 1964 eor v10.16b, v10.16b, q9 @ MODULO - fold into mid 1965 1966 pmull v8.1q, v10.1d, q8 @ MODULO - mid 64b align with low 1967 ext v10.16b, v10.16b, v10.16b, #8 @ MODULO - other mid alignment 1968 1969 eor v11.16b, v11.16b, q8 @ MODULO - fold into low 1970 1971 eor v11.16b, v11.16b, v10.16b @ MODULO - fold into low 1972 ext v11.16b, v11.16b, v11.16b, #8 1973 rev64 v11.16b, v11.16b 1974 mov r0, r15 1975 st1 { v11.16b }, [r3] 1976 1977 ldp r21, r22, [sp, #16] 1978 ldp r23, r24, [sp, #32] 1979 ldp d8, d9, [sp, #48] 1980 ldp d10, d11, [sp, #64] 1981 ldp d12, d13, [sp, #80] 1982 ldp d14, d15, [sp, #96] 1983 ldp r19, r20, [sp], #112 1984 RET 1985 1986.L128_dec_ret: 1987 mov r0, #0x0 1988 RET 1989.size aes_gcm_dec_128_kernel,.-aes_gcm_dec_128_kernel 1990.globl aes_gcm_enc_192_kernel 1991.type aes_gcm_enc_192_kernel,%function 1992.align 4 1993aes_gcm_enc_192_kernel: 1994 cbz r1, .L192_enc_ret 1995 stp r19, r20, [sp, #-112]! 
1996 mov r16, r4 1997 mov r8, r5 1998 stp r21, r22, [sp, #16] 1999 stp r23, r24, [sp, #32] 2000 stp d8, d9, [sp, #48] 2001 stp d10, d11, [sp, #64] 2002 stp d12, d13, [sp, #80] 2003 stp d14, d15, [sp, #96] 2004 2005 ldp r10, r11, [r16] @ ctr96_b64, ctr96_t32 2006#ifdef __ARMEB__ 2007 rev r10, r10 2008 rev r11, r11 2009#endif 2010 ldp r13, r14, [r8, #192] @ load rk12 2011#ifdef __ARMEB__ 2012 ror r13, r13, #32 2013 ror r14, r14, #32 2014#endif 2015 ld1 {v18.4s}, [r8], #16 @ load rk0 2016 2017 ld1 {v19.4s}, [r8], #16 @ load rk1 2018 2019 ld1 {v20.4s}, [r8], #16 @ load rk2 2020 2021 lsr r12, r11, #32 2022 ld1 {v21.4s}, [r8], #16 @ load rk3 2023 orr r11, r11, r11 2024 2025 ld1 {v22.4s}, [r8], #16 @ load rk4 2026 rev r12, r12 @ rev_ctr32 2027 2028 add r12, r12, #1 @ increment rev_ctr32 2029 fmov d3, r10 @ CTR block 3 2030 2031 rev r9, r12 @ CTR block 1 2032 add r12, r12, #1 @ CTR block 1 2033 fmov d1, r10 @ CTR block 1 2034 2035 orr r9, r11, r9, lsl #32 @ CTR block 1 2036 ld1 { q0}, [r16] @ special case vector load initial counter so we can start first AES block as quickly as possible 2037 2038 fmov v1.d[1], r9 @ CTR block 1 2039 rev r9, r12 @ CTR block 2 2040 add r12, r12, #1 @ CTR block 2 2041 2042 fmov d2, r10 @ CTR block 2 2043 orr r9, r11, r9, lsl #32 @ CTR block 2 2044 2045 fmov v2.d[1], r9 @ CTR block 2 2046 rev r9, r12 @ CTR block 3 2047 2048 orr r9, r11, r9, lsl #32 @ CTR block 3 2049 ld1 {v23.4s}, [r8], #16 @ load rk5 2050 2051 fmov v3.d[1], r9 @ CTR block 3 2052 2053 ld1 {v24.4s}, [r8], #16 @ load rk6 2054 2055 ld1 {v25.4s}, [r8], #16 @ load rk7 2056 2057 aese q0, v18.16b 2058 aesmc q0, q0 @ AES block 0 - round 0 2059 ld1 { v11.16b}, [r3] 2060 ext v11.16b, v11.16b, v11.16b, #8 2061 rev64 v11.16b, v11.16b 2062 2063 aese q3, v18.16b 2064 aesmc q3, q3 @ AES block 3 - round 0 2065 ld1 {v26.4s}, [r8], #16 @ load rk8 2066 2067 aese q1, v18.16b 2068 aesmc q1, q1 @ AES block 1 - round 0 2069 ldr q15, [r3, #112] @ load h4l | h4h 2070#ifndef __ARMEB__ 2071 ext v15.16b, 
v15.16b, v15.16b, #8 2072#endif 2073 aese q2, v18.16b 2074 aesmc q2, q2 @ AES block 2 - round 0 2075 ld1 {v27.4s}, [r8], #16 @ load rk9 2076 2077 aese q0, v19.16b 2078 aesmc q0, q0 @ AES block 0 - round 1 2079 ld1 {v28.4s}, [r8], #16 @ load rk10 2080 2081 aese q1, v19.16b 2082 aesmc q1, q1 @ AES block 1 - round 1 2083 ldr q12, [r3, #32] @ load h1l | h1h 2084#ifndef __ARMEB__ 2085 ext v12.16b, v12.16b, v12.16b, #8 2086#endif 2087 aese q2, v19.16b 2088 aesmc q2, q2 @ AES block 2 - round 1 2089 ld1 {v29.4s}, [r8], #16 @ load rk11 2090 2091 aese q3, v19.16b 2092 aesmc q3, q3 @ AES block 3 - round 1 2093 ldr q14, [r3, #80] @ load h3l | h3h 2094#ifndef __ARMEB__ 2095 ext v14.16b, v14.16b, v14.16b, #8 2096#endif 2097 aese q0, v20.16b 2098 aesmc q0, q0 @ AES block 0 - round 2 2099 2100 aese q2, v20.16b 2101 aesmc q2, q2 @ AES block 2 - round 2 2102 2103 aese q3, v20.16b 2104 aesmc q3, q3 @ AES block 3 - round 2 2105 2106 aese q0, v21.16b 2107 aesmc q0, q0 @ AES block 0 - round 3 2108 trn1 q9, v14.2d, v15.2d @ h4h | h3h 2109 2110 aese q2, v21.16b 2111 aesmc q2, q2 @ AES block 2 - round 3 2112 2113 aese q1, v20.16b 2114 aesmc q1, q1 @ AES block 1 - round 2 2115 trn2 v17.2d, v14.2d, v15.2d @ h4l | h3l 2116 2117 aese q0, v22.16b 2118 aesmc q0, q0 @ AES block 0 - round 4 2119 2120 aese q3, v21.16b 2121 aesmc q3, q3 @ AES block 3 - round 3 2122 2123 aese q1, v21.16b 2124 aesmc q1, q1 @ AES block 1 - round 3 2125 2126 aese q0, v23.16b 2127 aesmc q0, q0 @ AES block 0 - round 5 2128 2129 aese q2, v22.16b 2130 aesmc q2, q2 @ AES block 2 - round 4 2131 2132 aese q1, v22.16b 2133 aesmc q1, q1 @ AES block 1 - round 4 2134 2135 aese q0, v24.16b 2136 aesmc q0, q0 @ AES block 0 - round 6 2137 2138 aese q3, v22.16b 2139 aesmc q3, q3 @ AES block 3 - round 4 2140 2141 aese q2, v23.16b 2142 aesmc q2, q2 @ AES block 2 - round 5 2143 2144 aese q1, v23.16b 2145 aesmc q1, q1 @ AES block 1 - round 5 2146 2147 aese q3, v23.16b 2148 aesmc q3, q3 @ AES block 3 - round 5 2149 2150 aese q2, v24.16b 
2151 aesmc q2, q2 @ AES block 2 - round 6 2152 ldr q13, [r3, #64] @ load h2l | h2h 2153#ifndef __ARMEB__ 2154 ext v13.16b, v13.16b, v13.16b, #8 2155#endif 2156 aese q1, v24.16b 2157 aesmc q1, q1 @ AES block 1 - round 6 2158 2159 aese q3, v24.16b 2160 aesmc q3, q3 @ AES block 3 - round 6 2161 2162 aese q0, v25.16b 2163 aesmc q0, q0 @ AES block 0 - round 7 2164 2165 aese q1, v25.16b 2166 aesmc q1, q1 @ AES block 1 - round 7 2167 trn2 v16.2d, v12.2d, v13.2d @ h2l | h1l 2168 2169 aese q3, v25.16b 2170 aesmc q3, q3 @ AES block 3 - round 7 2171 2172 aese q0, v26.16b 2173 aesmc q0, q0 @ AES block 0 - round 8 2174 2175 aese q2, v25.16b 2176 aesmc q2, q2 @ AES block 2 - round 7 2177 trn1 q8, v12.2d, v13.2d @ h2h | h1h 2178 2179 aese q1, v26.16b 2180 aesmc q1, q1 @ AES block 1 - round 8 2181 2182 aese q3, v26.16b 2183 aesmc q3, q3 @ AES block 3 - round 8 2184 2185 aese q2, v26.16b 2186 aesmc q2, q2 @ AES block 2 - round 8 2187 2188 aese q0, v27.16b 2189 aesmc q0, q0 @ AES block 0 - round 9 2190 2191 aese q3, v27.16b 2192 aesmc q3, q3 @ AES block 3 - round 9 2193 2194 aese q2, v27.16b 2195 aesmc q2, q2 @ AES block 2 - round 9 2196 2197 aese q1, v27.16b 2198 aesmc q1, q1 @ AES block 1 - round 9 2199 2200 aese q0, v28.16b 2201 aesmc q0, q0 @ AES block 0 - round 10 2202 2203 aese q2, v28.16b 2204 aesmc q2, q2 @ AES block 2 - round 10 2205 2206 aese q1, v28.16b 2207 aesmc q1, q1 @ AES block 1 - round 10 2208 lsr r5, r1, #3 @ byte_len 2209 mov r15, r5 2210 2211 aese q3, v28.16b 2212 aesmc q3, q3 @ AES block 3 - round 10 2213 sub r5, r5, #1 @ byte_len - 1 2214 2215 eor v16.16b, v16.16b, q8 @ h2k | h1k 2216 and r5, r5, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail) 2217 2218 eor v17.16b, v17.16b, q9 @ h4k | h3k 2219 2220 aese q2, v29.16b @ AES block 2 - round 11 2221 add r4, r0, r1, lsr #3 @ end_input_ptr 2222 add r5, r5, r0 2223 2224 aese q1, v29.16b @ AES block 1 - round 11 2225 cmp r0, r5 @ check if we have <= 4 blocks 
2226 2227 aese q0, v29.16b @ AES block 0 - round 11 2228 add r12, r12, #1 @ CTR block 3 2229 2230 aese q3, v29.16b @ AES block 3 - round 11 2231 bge .L192_enc_tail @ handle tail 2232 2233 rev r9, r12 @ CTR block 4 2234 ldp r6, r7, [r0, #0] @ AES block 0 - load plaintext 2235#ifdef __ARMEB__ 2236 rev r6, r6 2237 rev r7, r7 2238#endif 2239 orr r9, r11, r9, lsl #32 @ CTR block 4 2240 ldp r21, r22, [r0, #32] @ AES block 2 - load plaintext 2241#ifdef __ARMEB__ 2242 rev r21, r21 2243 rev r22, r22 2244#endif 2245 ldp r23, r24, [r0, #48] @ AES block 3 - load plaintext 2246#ifdef __ARMEB__ 2247 rev r23, r23 2248 rev r24, r24 2249#endif 2250 ldp r19, r20, [r0, #16] @ AES block 1 - load plaintext 2251#ifdef __ARMEB__ 2252 rev r19, r19 2253 rev r20, r20 2254#endif 2255 add r0, r0, #64 @ AES input_ptr update 2256 cmp r0, r5 @ check if we have <= 8 blocks 2257 2258 eor r6, r6, r13 @ AES block 0 - round 12 low 2259 2260 eor r7, r7, r14 @ AES block 0 - round 12 high 2261 eor r22, r22, r14 @ AES block 2 - round 12 high 2262 fmov d4, r6 @ AES block 0 - mov low 2263 2264 eor r24, r24, r14 @ AES block 3 - round 12 high 2265 fmov v4.d[1], r7 @ AES block 0 - mov high 2266 2267 eor r21, r21, r13 @ AES block 2 - round 12 low 2268 eor r19, r19, r13 @ AES block 1 - round 12 low 2269 2270 fmov d5, r19 @ AES block 1 - mov low 2271 eor r20, r20, r14 @ AES block 1 - round 12 high 2272 2273 fmov v5.d[1], r20 @ AES block 1 - mov high 2274 2275 eor r23, r23, r13 @ AES block 3 - round 12 low 2276 fmov d6, r21 @ AES block 2 - mov low 2277 2278 add r12, r12, #1 @ CTR block 4 2279 eor q4, q4, q0 @ AES block 0 - result 2280 fmov d0, r10 @ CTR block 4 2281 2282 fmov v0.d[1], r9 @ CTR block 4 2283 rev r9, r12 @ CTR block 5 2284 2285 orr r9, r11, r9, lsl #32 @ CTR block 5 2286 add r12, r12, #1 @ CTR block 5 2287 2288 fmov d7, r23 @ AES block 3 - mov low 2289 st1 { q4}, [r2], #16 @ AES block 0 - store result 2290 2291 fmov v6.d[1], r22 @ AES block 2 - mov high 2292 2293 eor q5, q5, q1 @ AES block 1 - 
result 2294 fmov d1, r10 @ CTR block 5 2295 st1 { q5}, [r2], #16 @ AES block 1 - store result 2296 2297 fmov v7.d[1], r24 @ AES block 3 - mov high 2298 2299 fmov v1.d[1], r9 @ CTR block 5 2300 rev r9, r12 @ CTR block 6 2301 2302 orr r9, r11, r9, lsl #32 @ CTR block 6 2303 2304 add r12, r12, #1 @ CTR block 6 2305 eor q6, q6, q2 @ AES block 2 - result 2306 fmov d2, r10 @ CTR block 6 2307 2308 fmov v2.d[1], r9 @ CTR block 6 2309 rev r9, r12 @ CTR block 7 2310 2311 orr r9, r11, r9, lsl #32 @ CTR block 7 2312 st1 { q6}, [r2], #16 @ AES block 2 - store result 2313 2314 eor q7, q7, q3 @ AES block 3 - result 2315 st1 { q7}, [r2], #16 @ AES block 3 - store result 2316 bge .L192_enc_prepretail @ do prepretail 2317 2318.L192_enc_main_loop:@ main loop start 2319 aese q2, v18.16b 2320 aesmc q2, q2 @ AES block 4k+6 - round 0 2321 rev64 q5, q5 @ GHASH block 4k+1 (t0 and t1 free) 2322 2323 aese q1, v18.16b 2324 aesmc q1, q1 @ AES block 4k+5 - round 0 2325 ldp r19, r20, [r0, #16] @ AES block 4k+5 - load plaintext 2326#ifdef __ARMEB__ 2327 rev r19, r19 2328 rev r20, r20 2329#endif 2330 ext v11.16b, v11.16b, v11.16b, #8 @ PRE 0 2331 fmov d3, r10 @ CTR block 4k+3 2332 rev64 q4, q4 @ GHASH block 4k (only t0 is free) 2333 2334 aese q2, v19.16b 2335 aesmc q2, q2 @ AES block 4k+6 - round 1 2336 fmov v3.d[1], r9 @ CTR block 4k+3 2337 2338 pmull2 v30.1q, q5, v14.2d @ GHASH block 4k+1 - high 2339 rev64 q7, q7 @ GHASH block 4k+3 (t0, t1, t2 and t3 free) 2340 ldp r21, r22, [r0, #32] @ AES block 4k+6 - load plaintext 2341#ifdef __ARMEB__ 2342 rev r21, r21 2343 rev r22, r22 2344#endif 2345 aese q0, v18.16b 2346 aesmc q0, q0 @ AES block 4k+4 - round 0 2347 ldp r23, r24, [r0, #48] @ AES block 4k+3 - load plaintext 2348#ifdef __ARMEB__ 2349 rev r23, r23 2350 rev r24, r24 2351#endif 2352 pmull v31.1q, q5, v14.1d @ GHASH block 4k+1 - low 2353 eor q4, q4, v11.16b @ PRE 1 2354 2355 aese q1, v19.16b 2356 aesmc q1, q1 @ AES block 4k+5 - round 1 2357 2358 aese q0, v19.16b 2359 aesmc q0, q0 @ AES block 
4k+4 - round 1 2360 rev64 q6, q6 @ GHASH block 4k+2 (t0, t1, and t2 free) 2361 2362 aese q3, v18.16b 2363 aesmc q3, q3 @ AES block 4k+7 - round 0 2364 eor r24, r24, r14 @ AES block 4k+3 - round 12 high 2365 2366 pmull v11.1q, q4, v15.1d @ GHASH block 4k - low 2367 mov d8, v4.d[1] @ GHASH block 4k - mid 2368 2369 aese q0, v20.16b 2370 aesmc q0, q0 @ AES block 4k+4 - round 2 2371 2372 aese q3, v19.16b 2373 aesmc q3, q3 @ AES block 4k+7 - round 1 2374 eor r21, r21, r13 @ AES block 4k+6 - round 12 low 2375 2376 eor q8, q8, q4 @ GHASH block 4k - mid 2377 eor v11.16b, v11.16b, v31.16b @ GHASH block 4k+1 - low 2378 2379 aese q0, v21.16b 2380 aesmc q0, q0 @ AES block 4k+4 - round 3 2381 eor r19, r19, r13 @ AES block 4k+5 - round 12 low 2382 2383 aese q1, v20.16b 2384 aesmc q1, q1 @ AES block 4k+5 - round 2 2385 mov d31, v6.d[1] @ GHASH block 4k+2 - mid 2386 2387 pmull2 v9.1q, q4, v15.2d @ GHASH block 4k - high 2388 mov d4, v5.d[1] @ GHASH block 4k+1 - mid 2389 2390 aese q2, v20.16b 2391 aesmc q2, q2 @ AES block 4k+6 - round 2 2392 2393 aese q1, v21.16b 2394 aesmc q1, q1 @ AES block 4k+5 - round 3 2395 2396 mov d10, v17.d[1] @ GHASH block 4k - mid 2397 eor q9, q9, v30.16b @ GHASH block 4k+1 - high 2398 2399 aese q3, v20.16b 2400 aesmc q3, q3 @ AES block 4k+7 - round 2 2401 eor v31.8b, v31.8b, q6 @ GHASH block 4k+2 - mid 2402 2403 pmull2 v30.1q, q6, v13.2d @ GHASH block 4k+2 - high 2404 2405 aese q0, v22.16b 2406 aesmc q0, q0 @ AES block 4k+4 - round 4 2407 eor q4, q4, q5 @ GHASH block 4k+1 - mid 2408 2409 aese q3, v21.16b 2410 aesmc q3, q3 @ AES block 4k+7 - round 3 2411 2412 pmull2 v5.1q, q7, v12.2d @ GHASH block 4k+3 - high 2413 eor r20, r20, r14 @ AES block 4k+5 - round 12 high 2414 ins v31.d[1], v31.d[0] @ GHASH block 4k+2 - mid 2415 2416 aese q0, v23.16b 2417 aesmc q0, q0 @ AES block 4k+4 - round 5 2418 add r12, r12, #1 @ CTR block 4k+3 2419 2420 aese q3, v22.16b 2421 aesmc q3, q3 @ AES block 4k+7 - round 4 2422 eor q9, q9, v30.16b @ GHASH block 4k+2 - high 2423 2424 
pmull v4.1q, q4, v17.1d @ GHASH block 4k+1 - mid 2425 eor r22, r22, r14 @ AES block 4k+6 - round 12 high 2426 2427 pmull2 v31.1q, v31.2d, v16.2d @ GHASH block 4k+2 - mid 2428 eor r23, r23, r13 @ AES block 4k+3 - round 12 low 2429 mov d30, v7.d[1] @ GHASH block 4k+3 - mid 2430 2431 pmull v10.1q, q8, v10.1d @ GHASH block 4k - mid 2432 rev r9, r12 @ CTR block 4k+8 2433 2434 pmull v8.1q, q6, v13.1d @ GHASH block 4k+2 - low 2435 orr r9, r11, r9, lsl #32 @ CTR block 4k+8 2436 2437 aese q2, v21.16b 2438 aesmc q2, q2 @ AES block 4k+6 - round 3 2439 eor v30.8b, v30.8b, q7 @ GHASH block 4k+3 - mid 2440 2441 aese q1, v22.16b 2442 aesmc q1, q1 @ AES block 4k+5 - round 4 2443 ldp r6, r7, [r0, #0] @ AES block 4k+4 - load plaintext 2444#ifdef __ARMEB__ 2445 rev r6, r6 2446 rev r7, r7 2447#endif 2448 aese q0, v24.16b 2449 aesmc q0, q0 @ AES block 4k+4 - round 6 2450 eor v11.16b, v11.16b, q8 @ GHASH block 4k+2 - low 2451 2452 aese q2, v22.16b 2453 aesmc q2, q2 @ AES block 4k+6 - round 4 2454 add r0, r0, #64 @ AES input_ptr update 2455 2456 aese q1, v23.16b 2457 aesmc q1, q1 @ AES block 4k+5 - round 5 2458 movi q8, #0xc2 2459 2460 pmull v6.1q, q7, v12.1d @ GHASH block 4k+3 - low 2461 eor r7, r7, r14 @ AES block 4k+4 - round 12 high 2462 eor v10.16b, v10.16b, q4 @ GHASH block 4k+1 - mid 2463 2464 aese q2, v23.16b 2465 aesmc q2, q2 @ AES block 4k+6 - round 5 2466 eor r6, r6, r13 @ AES block 4k+4 - round 12 low 2467 2468 aese q1, v24.16b 2469 aesmc q1, q1 @ AES block 4k+5 - round 6 2470 shl d8, d8, #56 @ mod_constant 2471 2472 aese q3, v23.16b 2473 aesmc q3, q3 @ AES block 4k+7 - round 5 2474 eor q9, q9, q5 @ GHASH block 4k+3 - high 2475 2476 aese q0, v25.16b 2477 aesmc q0, q0 @ AES block 4k+4 - round 7 2478 fmov d5, r19 @ AES block 4k+5 - mov low 2479 2480 aese q1, v25.16b 2481 aesmc q1, q1 @ AES block 4k+5 - round 7 2482 eor v10.16b, v10.16b, v31.16b @ GHASH block 4k+2 - mid 2483 2484 aese q3, v24.16b 2485 aesmc q3, q3 @ AES block 4k+7 - round 6 2486 fmov v5.d[1], r20 @ AES block 
4k+5 - mov high 2487 2488 aese q0, v26.16b 2489 aesmc q0, q0 @ AES block 4k+4 - round 8 2490 eor v11.16b, v11.16b, q6 @ GHASH block 4k+3 - low 2491 2492 pmull v30.1q, v30.1d, v16.1d @ GHASH block 4k+3 - mid 2493 cmp r0, r5 @ .LOOP CONTROL 2494 fmov d4, r6 @ AES block 4k+4 - mov low 2495 2496 aese q2, v24.16b 2497 aesmc q2, q2 @ AES block 4k+6 - round 6 2498 fmov v4.d[1], r7 @ AES block 4k+4 - mov high 2499 2500 aese q1, v26.16b 2501 aesmc q1, q1 @ AES block 4k+5 - round 8 2502 fmov d7, r23 @ AES block 4k+3 - mov low 2503 2504 eor v10.16b, v10.16b, v30.16b @ GHASH block 4k+3 - mid 2505 eor v30.16b, v11.16b, q9 @ MODULO - karatsuba tidy up 2506 add r12, r12, #1 @ CTR block 4k+8 2507 2508 aese q2, v25.16b 2509 aesmc q2, q2 @ AES block 4k+6 - round 7 2510 fmov v7.d[1], r24 @ AES block 4k+3 - mov high 2511 2512 pmull v31.1q, q9, q8 @ MODULO - top 64b align with mid 2513 ext q9, q9, q9, #8 @ MODULO - other top alignment 2514 fmov d6, r21 @ AES block 4k+6 - mov low 2515 2516 aese q3, v25.16b 2517 aesmc q3, q3 @ AES block 4k+7 - round 7 2518 2519 aese q0, v27.16b 2520 aesmc q0, q0 @ AES block 4k+4 - round 9 2521 eor v10.16b, v10.16b, v30.16b @ MODULO - karatsuba tidy up 2522 2523 aese q2, v26.16b 2524 aesmc q2, q2 @ AES block 4k+6 - round 8 2525 2526 aese q3, v26.16b 2527 aesmc q3, q3 @ AES block 4k+7 - round 8 2528 2529 aese q1, v27.16b 2530 aesmc q1, q1 @ AES block 4k+5 - round 9 2531 2532 aese q0, v28.16b 2533 aesmc q0, q0 @ AES block 4k+4 - round 10 2534 eor v10.16b, v10.16b, v31.16b @ MODULO - fold into mid 2535 2536 aese q3, v27.16b 2537 aesmc q3, q3 @ AES block 4k+7 - round 9 2538 2539 aese q2, v27.16b 2540 aesmc q2, q2 @ AES block 4k+6 - round 9 2541 2542 aese q0, v29.16b @ AES block 4k+4 - round 11 2543 2544 aese q1, v28.16b 2545 aesmc q1, q1 @ AES block 4k+5 - round 10 2546 eor v10.16b, v10.16b, q9 @ MODULO - fold into mid 2547 2548 aese q2, v28.16b 2549 aesmc q2, q2 @ AES block 4k+6 - round 10 2550 2551 eor q4, q4, q0 @ AES block 4k+4 - result 2552 fmov d0, r10 
@ CTR block 4k+8 2553 2554 aese q1, v29.16b @ AES block 4k+5 - round 11 2555 fmov v0.d[1], r9 @ CTR block 4k+8 2556 rev r9, r12 @ CTR block 4k+9 2557 2558 pmull v9.1q, v10.1d, q8 @ MODULO - mid 64b align with low 2559 fmov v6.d[1], r22 @ AES block 4k+6 - mov high 2560 st1 { q4}, [r2], #16 @ AES block 4k+4 - store result 2561 2562 aese q3, v28.16b 2563 aesmc q3, q3 @ AES block 4k+7 - round 10 2564 orr r9, r11, r9, lsl #32 @ CTR block 4k+9 2565 2566 eor q5, q5, q1 @ AES block 4k+5 - result 2567 add r12, r12, #1 @ CTR block 4k+9 2568 fmov d1, r10 @ CTR block 4k+9 2569 2570 aese q2, v29.16b @ AES block 4k+6 - round 11 2571 fmov v1.d[1], r9 @ CTR block 4k+9 2572 rev r9, r12 @ CTR block 4k+10 2573 2574 add r12, r12, #1 @ CTR block 4k+10 2575 ext v10.16b, v10.16b, v10.16b, #8 @ MODULO - other mid alignment 2576 orr r9, r11, r9, lsl #32 @ CTR block 4k+10 2577 2578 st1 { q5}, [r2], #16 @ AES block 4k+5 - store result 2579 eor v11.16b, v11.16b, q9 @ MODULO - fold into low 2580 2581 aese q3, v29.16b @ AES block 4k+7 - round 11 2582 eor q6, q6, q2 @ AES block 4k+6 - result 2583 fmov d2, r10 @ CTR block 4k+10 2584 2585 st1 { q6}, [r2], #16 @ AES block 4k+6 - store result 2586 fmov v2.d[1], r9 @ CTR block 4k+10 2587 rev r9, r12 @ CTR block 4k+11 2588 2589 eor v11.16b, v11.16b, v10.16b @ MODULO - fold into low 2590 orr r9, r11, r9, lsl #32 @ CTR block 4k+11 2591 2592 eor q7, q7, q3 @ AES block 4k+3 - result 2593 st1 { q7}, [r2], #16 @ AES block 4k+3 - store result 2594 blt .L192_enc_main_loop 2595 2596.L192_enc_prepretail:@ PREPRETAIL 2597 aese q0, v18.16b 2598 aesmc q0, q0 @ AES block 4k+4 - round 0 2599 rev64 q4, q4 @ GHASH block 4k (only t0 is free) 2600 2601 fmov d3, r10 @ CTR block 4k+3 2602 ext v11.16b, v11.16b, v11.16b, #8 @ PRE 0 2603 add r12, r12, #1 @ CTR block 4k+3 2604 2605 aese q1, v18.16b 2606 aesmc q1, q1 @ AES block 4k+5 - round 0 2607 rev64 q5, q5 @ GHASH block 4k+1 (t0 and t1 free) 2608 2609 aese q2, v18.16b 2610 aesmc q2, q2 @ AES block 4k+6 - round 0 2611 2612 
fmov v3.d[1], r9 @ CTR block 4k+3 2613 eor q4, q4, v11.16b @ PRE 1 2614 mov d10, v17.d[1] @ GHASH block 4k - mid 2615 2616 aese q1, v19.16b 2617 aesmc q1, q1 @ AES block 4k+5 - round 1 2618 rev64 q6, q6 @ GHASH block 4k+2 (t0, t1, and t2 free) 2619 2620 pmull2 v30.1q, q5, v14.2d @ GHASH block 4k+1 - high 2621 2622 pmull v11.1q, q4, v15.1d @ GHASH block 4k - low 2623 mov d8, v4.d[1] @ GHASH block 4k - mid 2624 2625 pmull v31.1q, q5, v14.1d @ GHASH block 4k+1 - low 2626 rev64 q7, q7 @ GHASH block 4k+3 (t0, t1, t2 and t3 free) 2627 2628 pmull2 v9.1q, q4, v15.2d @ GHASH block 4k - high 2629 2630 eor q8, q8, q4 @ GHASH block 4k - mid 2631 mov d4, v5.d[1] @ GHASH block 4k+1 - mid 2632 2633 eor v11.16b, v11.16b, v31.16b @ GHASH block 4k+1 - low 2634 mov d31, v6.d[1] @ GHASH block 4k+2 - mid 2635 2636 aese q3, v18.16b 2637 aesmc q3, q3 @ AES block 4k+7 - round 0 2638 eor q9, q9, v30.16b @ GHASH block 4k+1 - high 2639 2640 pmull2 v30.1q, q6, v13.2d @ GHASH block 4k+2 - high 2641 2642 eor q4, q4, q5 @ GHASH block 4k+1 - mid 2643 eor v31.8b, v31.8b, q6 @ GHASH block 4k+2 - mid 2644 2645 aese q3, v19.16b 2646 aesmc q3, q3 @ AES block 4k+7 - round 1 2647 2648 aese q2, v19.16b 2649 aesmc q2, q2 @ AES block 4k+6 - round 1 2650 eor q9, q9, v30.16b @ GHASH block 4k+2 - high 2651 2652 aese q0, v19.16b 2653 aesmc q0, q0 @ AES block 4k+4 - round 1 2654 2655 aese q1, v20.16b 2656 aesmc q1, q1 @ AES block 4k+5 - round 2 2657 mov d30, v7.d[1] @ GHASH block 4k+3 - mid 2658 2659 pmull2 v5.1q, q7, v12.2d @ GHASH block 4k+3 - high 2660 ins v31.d[1], v31.d[0] @ GHASH block 4k+2 - mid 2661 2662 aese q0, v20.16b 2663 aesmc q0, q0 @ AES block 4k+4 - round 2 2664 2665 pmull v10.1q, q8, v10.1d @ GHASH block 4k - mid 2666 eor v30.8b, v30.8b, q7 @ GHASH block 4k+3 - mid 2667 2668 aese q1, v21.16b 2669 aesmc q1, q1 @ AES block 4k+5 - round 3 2670 2671 pmull2 v31.1q, v31.2d, v16.2d @ GHASH block 4k+2 - mid 2672 2673 pmull v4.1q, q4, v17.1d @ GHASH block 4k+1 - mid 2674 2675 pmull v30.1q, v30.1d, 
v16.1d @ GHASH block 4k+3 - mid 2676 eor q9, q9, q5 @ GHASH block 4k+3 - high 2677 2678 pmull v8.1q, q6, v13.1d @ GHASH block 4k+2 - low 2679 2680 aese q0, v21.16b 2681 aesmc q0, q0 @ AES block 4k+4 - round 3 2682 eor v10.16b, v10.16b, q4 @ GHASH block 4k+1 - mid 2683 2684 aese q3, v20.16b 2685 aesmc q3, q3 @ AES block 4k+7 - round 2 2686 2687 aese q2, v20.16b 2688 aesmc q2, q2 @ AES block 4k+6 - round 2 2689 eor v11.16b, v11.16b, q8 @ GHASH block 4k+2 - low 2690 2691 aese q0, v22.16b 2692 aesmc q0, q0 @ AES block 4k+4 - round 4 2693 2694 aese q3, v21.16b 2695 aesmc q3, q3 @ AES block 4k+7 - round 3 2696 eor v10.16b, v10.16b, v31.16b @ GHASH block 4k+2 - mid 2697 2698 aese q2, v21.16b 2699 aesmc q2, q2 @ AES block 4k+6 - round 3 2700 2701 pmull v6.1q, q7, v12.1d @ GHASH block 4k+3 - low 2702 movi q8, #0xc2 2703 2704 aese q3, v22.16b 2705 aesmc q3, q3 @ AES block 4k+7 - round 4 2706 2707 aese q2, v22.16b 2708 aesmc q2, q2 @ AES block 4k+6 - round 4 2709 2710 aese q1, v22.16b 2711 aesmc q1, q1 @ AES block 4k+5 - round 4 2712 eor v10.16b, v10.16b, v30.16b @ GHASH block 4k+3 - mid 2713 2714 aese q3, v23.16b 2715 aesmc q3, q3 @ AES block 4k+7 - round 5 2716 2717 aese q2, v23.16b 2718 aesmc q2, q2 @ AES block 4k+6 - round 5 2719 2720 aese q1, v23.16b 2721 aesmc q1, q1 @ AES block 4k+5 - round 5 2722 eor v11.16b, v11.16b, q6 @ GHASH block 4k+3 - low 2723 2724 aese q0, v23.16b 2725 aesmc q0, q0 @ AES block 4k+4 - round 5 2726 2727 aese q3, v24.16b 2728 aesmc q3, q3 @ AES block 4k+7 - round 6 2729 eor v10.16b, v10.16b, q9 @ karatsuba tidy up 2730 2731 aese q1, v24.16b 2732 aesmc q1, q1 @ AES block 4k+5 - round 6 2733 2734 aese q0, v24.16b 2735 aesmc q0, q0 @ AES block 4k+4 - round 6 2736 shl d8, d8, #56 @ mod_constant 2737 2738 aese q3, v25.16b 2739 aesmc q3, q3 @ AES block 4k+7 - round 7 2740 2741 aese q1, v25.16b 2742 aesmc q1, q1 @ AES block 4k+5 - round 7 2743 eor v10.16b, v10.16b, v11.16b 2744 2745 aese q0, v25.16b 2746 aesmc q0, q0 @ AES block 4k+4 - round 7 2747 2748 
pmull v30.1q, q9, q8 2749 2750 aese q2, v24.16b 2751 aesmc q2, q2 @ AES block 4k+6 - round 6 2752 ext q9, q9, q9, #8 2753 2754 aese q0, v26.16b 2755 aesmc q0, q0 @ AES block 4k+4 - round 8 2756 2757 aese q1, v26.16b 2758 aesmc q1, q1 @ AES block 4k+5 - round 8 2759 eor v10.16b, v10.16b, v30.16b 2760 2761 aese q2, v25.16b 2762 aesmc q2, q2 @ AES block 4k+6 - round 7 2763 2764 aese q3, v26.16b 2765 aesmc q3, q3 @ AES block 4k+7 - round 8 2766 2767 aese q0, v27.16b 2768 aesmc q0, q0 @ AES block 4k+4 - round 9 2769 2770 aese q2, v26.16b 2771 aesmc q2, q2 @ AES block 4k+6 - round 8 2772 eor v10.16b, v10.16b, q9 2773 2774 aese q3, v27.16b 2775 aesmc q3, q3 @ AES block 4k+7 - round 9 2776 2777 aese q1, v27.16b 2778 aesmc q1, q1 @ AES block 4k+5 - round 9 2779 2780 aese q2, v27.16b 2781 aesmc q2, q2 @ AES block 4k+6 - round 9 2782 2783 pmull v30.1q, v10.1d, q8 2784 2785 ext v10.16b, v10.16b, v10.16b, #8 2786 2787 aese q3, v28.16b 2788 aesmc q3, q3 @ AES block 4k+7 - round 10 2789 2790 aese q0, v28.16b 2791 aesmc q0, q0 @ AES block 4k+4 - round 10 2792 2793 aese q2, v28.16b 2794 aesmc q2, q2 @ AES block 4k+6 - round 10 2795 2796 aese q1, v28.16b 2797 aesmc q1, q1 @ AES block 4k+5 - round 10 2798 eor v11.16b, v11.16b, v30.16b 2799 2800 aese q0, v29.16b @ AES block 4k+4 - round 11 2801 2802 aese q3, v29.16b @ AES block 4k+7 - round 11 2803 2804 aese q2, v29.16b @ AES block 4k+6 - round 11 2805 2806 aese q1, v29.16b @ AES block 4k+5 - round 11 2807 eor v11.16b, v11.16b, v10.16b 2808.L192_enc_tail:@ TAIL 2809 2810 sub r5, r4, r0 @ main_end_input_ptr is number of bytes left to process 2811 ldp r6, r7, [r0], #16 @ AES block 4k+4 - load plaintext 2812#ifdef __ARMEB__ 2813 rev r6, r6 2814 rev r7, r7 2815#endif 2816 eor r6, r6, r13 @ AES block 4k+4 - round 12 low 2817 eor r7, r7, r14 @ AES block 4k+4 - round 12 high 2818 2819 fmov d4, r6 @ AES block 4k+4 - mov low 2820 2821 fmov v4.d[1], r7 @ AES block 4k+4 - mov high 2822 cmp r5, #48 2823 2824 eor q5, q4, q0 @ AES block 4k+4 - 
result 2825 2826 ext q8, v11.16b, v11.16b, #8 @ prepare final partial tag 2827 bgt .L192_enc_blocks_more_than_3 2828 2829 sub r12, r12, #1 2830 movi v10.8b, #0 2831 2832 mov q3, q2 2833 movi q9, #0 2834 cmp r5, #32 2835 2836 mov q2, q1 2837 movi v11.8b, #0 2838 bgt .L192_enc_blocks_more_than_2 2839 2840 sub r12, r12, #1 2841 2842 mov q3, q1 2843 cmp r5, #16 2844 bgt .L192_enc_blocks_more_than_1 2845 2846 sub r12, r12, #1 2847 b .L192_enc_blocks_less_than_1 2848.L192_enc_blocks_more_than_3:@ blocks left > 3 2849 st1 { q5}, [r2], #16 @ AES final-3 block - store result 2850 2851 ldp r6, r7, [r0], #16 @ AES final-2 block - load input low & high 2852#ifdef __ARMEB__ 2853 rev r6, r6 2854 rev r7, r7 2855#endif 2856 rev64 q4, q5 @ GHASH final-3 block 2857 2858 eor r6, r6, r13 @ AES final-2 block - round 12 low 2859 eor q4, q4, q8 @ feed in partial tag 2860 2861 eor r7, r7, r14 @ AES final-2 block - round 12 high 2862 fmov d5, r6 @ AES final-2 block - mov low 2863 2864 fmov v5.d[1], r7 @ AES final-2 block - mov high 2865 2866 mov d22, v4.d[1] @ GHASH final-3 block - mid 2867 2868 pmull v11.1q, q4, v15.1d @ GHASH final-3 block - low 2869 2870 mov d10, v17.d[1] @ GHASH final-3 block - mid 2871 2872 eor v22.8b, v22.8b, q4 @ GHASH final-3 block - mid 2873 2874 movi q8, #0 @ suppress further partial tag feed in 2875 2876 pmull2 v9.1q, q4, v15.2d @ GHASH final-3 block - high 2877 2878 pmull v10.1q, v22.1d, v10.1d @ GHASH final-3 block - mid 2879 eor q5, q5, q1 @ AES final-2 block - result 2880.L192_enc_blocks_more_than_2:@ blocks left > 2 2881 2882 st1 { q5}, [r2], #16 @ AES final-2 block - store result 2883 2884 rev64 q4, q5 @ GHASH final-2 block 2885 ldp r6, r7, [r0], #16 @ AES final-1 block - load input low & high 2886#ifdef __ARMEB__ 2887 rev r6, r6 2888 rev r7, r7 2889#endif 2890 eor q4, q4, q8 @ feed in partial tag 2891 2892 eor r7, r7, r14 @ AES final-1 block - round 12 high 2893 2894 pmull2 v20.1q, q4, v14.2d @ GHASH final-2 block - high 2895 mov d22, v4.d[1] @ GHASH 
final-2 block - mid 2896 2897 pmull v21.1q, q4, v14.1d @ GHASH final-2 block - low 2898 eor r6, r6, r13 @ AES final-1 block - round 12 low 2899 2900 fmov d5, r6 @ AES final-1 block - mov low 2901 2902 fmov v5.d[1], r7 @ AES final-1 block - mov high 2903 eor q9, q9, v20.16b @ GHASH final-2 block - high 2904 eor v22.8b, v22.8b, q4 @ GHASH final-2 block - mid 2905 2906 eor v11.16b, v11.16b, v21.16b @ GHASH final-2 block - low 2907 2908 pmull v22.1q, v22.1d, v17.1d @ GHASH final-2 block - mid 2909 2910 movi q8, #0 @ suppress further partial tag feed in 2911 2912 eor q5, q5, q2 @ AES final-1 block - result 2913 2914 eor v10.16b, v10.16b, v22.16b @ GHASH final-2 block - mid 2915.L192_enc_blocks_more_than_1:@ blocks left > 1 2916 2917 st1 { q5}, [r2], #16 @ AES final-1 block - store result 2918 2919 ldp r6, r7, [r0], #16 @ AES final block - load input low & high 2920#ifdef __ARMEB__ 2921 rev r6, r6 2922 rev r7, r7 2923#endif 2924 rev64 q4, q5 @ GHASH final-1 block 2925 2926 eor r6, r6, r13 @ AES final block - round 12 low 2927 eor q4, q4, q8 @ feed in partial tag 2928 movi q8, #0 @ suppress further partial tag feed in 2929 2930 mov d22, v4.d[1] @ GHASH final-1 block - mid 2931 2932 eor v22.8b, v22.8b, q4 @ GHASH final-1 block - mid 2933 eor r7, r7, r14 @ AES final block - round 12 high 2934 fmov d5, r6 @ AES final block - mov low 2935 2936 pmull2 v20.1q, q4, v13.2d @ GHASH final-1 block - high 2937 fmov v5.d[1], r7 @ AES final block - mov high 2938 2939 ins v22.d[1], v22.d[0] @ GHASH final-1 block - mid 2940 2941 eor q9, q9, v20.16b @ GHASH final-1 block - high 2942 2943 pmull v21.1q, q4, v13.1d @ GHASH final-1 block - low 2944 2945 pmull2 v22.1q, v22.2d, v16.2d @ GHASH final-1 block - mid 2946 2947 eor q5, q5, q3 @ AES final block - result 2948 2949 eor v11.16b, v11.16b, v21.16b @ GHASH final-1 block - low 2950 2951 eor v10.16b, v10.16b, v22.16b @ GHASH final-1 block - mid 2952.L192_enc_blocks_less_than_1:@ blocks left <= 1 2953 2954 ld1 { v18.16b}, [r2] @ load existing 
bytes where the possibly partial last block is to be stored 2955#ifndef __ARMEB__ 2956 rev r9, r12 2957#else 2958 mov r9, r12 2959#endif 2960 and r1, r1, #127 @ bit_length %= 128 2961 2962 sub r1, r1, #128 @ bit_length -= 128 2963 mvn r14, xzr @ rk12_h = 0xffffffffffffffff 2964 2965 neg r1, r1 @ bit_length = 128 - #bits in input (in range [1,128]) 2966 mvn r13, xzr @ rk12_l = 0xffffffffffffffff 2967 2968 and r1, r1, #127 @ bit_length %= 128 2969 2970 lsr r14, r14, r1 @ rk12_h is mask for top 64b of last block 2971 cmp r1, #64 2972 2973 csel r6, r13, r14, lt 2974 csel r7, r14, xzr, lt 2975 2976 fmov d0, r6 @ ctr0b is mask for last block 2977 2978 fmov v0.d[1], r7 2979 2980 and q5, q5, q0 @ possibly partial last block has zeroes in highest bits 2981 2982 rev64 q4, q5 @ GHASH final block 2983 2984 eor q4, q4, q8 @ feed in partial tag 2985 2986 mov d8, v4.d[1] @ GHASH final block - mid 2987 2988 pmull v21.1q, q4, v12.1d @ GHASH final block - low 2989 2990 pmull2 v20.1q, q4, v12.2d @ GHASH final block - high 2991 2992 eor q8, q8, q4 @ GHASH final block - mid 2993 2994 eor v11.16b, v11.16b, v21.16b @ GHASH final block - low 2995 2996 eor q9, q9, v20.16b @ GHASH final block - high 2997 2998 pmull v8.1q, q8, v16.1d @ GHASH final block - mid 2999 3000 eor v10.16b, v10.16b, q8 @ GHASH final block - mid 3001 movi q8, #0xc2 3002 3003 eor v30.16b, v11.16b, q9 @ MODULO - karatsuba tidy up 3004 3005 shl d8, d8, #56 @ mod_constant 3006 3007 bif q5, v18.16b, q0 @ insert existing bytes in top end of result before storing 3008 3009 eor v10.16b, v10.16b, v30.16b @ MODULO - karatsuba tidy up 3010 3011 pmull v31.1q, q9, q8 @ MODULO - top 64b align with mid 3012 3013 ext q9, q9, q9, #8 @ MODULO - other top alignment 3014 3015 eor v10.16b, v10.16b, v31.16b @ MODULO - fold into mid 3016 3017 eor v10.16b, v10.16b, q9 @ MODULO - fold into mid 3018 3019 pmull v9.1q, v10.1d, q8 @ MODULO - mid 64b align with low 3020 3021 ext v10.16b, v10.16b, v10.16b, #8 @ MODULO - other mid alignment 3022 
3023 eor v11.16b, v11.16b, q9 @ MODULO - fold into low 3024 str r9, [r16, #12] @ store the updated counter 3025 3026 st1 { q5}, [r2] @ store all 16B 3027 3028 eor v11.16b, v11.16b, v10.16b @ MODULO - fold into low 3029 ext v11.16b, v11.16b, v11.16b, #8 3030 rev64 v11.16b, v11.16b 3031 mov r0, r15 3032 st1 { v11.16b }, [r3] 3033 3034 ldp r21, r22, [sp, #16] 3035 ldp r23, r24, [sp, #32] 3036 ldp d8, d9, [sp, #48] 3037 ldp d10, d11, [sp, #64] 3038 ldp d12, d13, [sp, #80] 3039 ldp d14, d15, [sp, #96] 3040 ldp r19, r20, [sp], #112 3041 RET 3042 3043.L192_enc_ret: 3044 mov r0, #0x0 3045 RET 3046.size aes_gcm_enc_192_kernel,.-aes_gcm_enc_192_kernel 3047.globl aes_gcm_dec_192_kernel 3048.type aes_gcm_dec_192_kernel,%function 3049.align 4 3050aes_gcm_dec_192_kernel: 3051 cbz r1, .L192_dec_ret 3052 stp r19, r20, [sp, #-112]! 3053 mov r16, r4 3054 mov r8, r5 3055 stp r21, r22, [sp, #16] 3056 stp r23, r24, [sp, #32] 3057 stp d8, d9, [sp, #48] 3058 stp d10, d11, [sp, #64] 3059 stp d12, d13, [sp, #80] 3060 stp d14, d15, [sp, #96] 3061 3062 add r4, r0, r1, lsr #3 @ end_input_ptr 3063 ldp r10, r11, [r16] @ ctr96_b64, ctr96_t32 3064#ifdef __ARMEB__ 3065 rev r10, r10 3066 rev r11, r11 3067#endif 3068 ldp r13, r14, [r8, #192] @ load rk12 3069#ifdef __ARMEB__ 3070 ror r13, r13, #32 3071 ror r14, r14, #32 3072#endif 3073 ld1 { q0}, [r16] @ special case vector load initial counter so we can start first AES block as quickly as possible 3074 3075 ld1 {v18.4s}, [r8], #16 @ load rk0 3076 3077 lsr r5, r1, #3 @ byte_len 3078 mov r15, r5 3079 ld1 {v19.4s}, [r8], #16 @ load rk1 3080 3081 lsr r12, r11, #32 3082 orr r11, r11, r11 3083 fmov d3, r10 @ CTR block 3 3084 3085 rev r12, r12 @ rev_ctr32 3086 fmov d1, r10 @ CTR block 1 3087 3088 add r12, r12, #1 @ increment rev_ctr32 3089 ld1 {v20.4s}, [r8], #16 @ load rk2 3090 3091 aese q0, v18.16b 3092 aesmc q0, q0 @ AES block 0 - round 0 3093 rev r9, r12 @ CTR block 1 3094 3095 add r12, r12, #1 @ CTR block 1 3096 orr r9, r11, r9, lsl #32 @ CTR block 1 
3097 ld1 {v21.4s}, [r8], #16 @ load rk3 3098 3099 fmov v1.d[1], r9 @ CTR block 1 3100 rev r9, r12 @ CTR block 2 3101 add r12, r12, #1 @ CTR block 2 3102 3103 fmov d2, r10 @ CTR block 2 3104 orr r9, r11, r9, lsl #32 @ CTR block 2 3105 3106 fmov v2.d[1], r9 @ CTR block 2 3107 rev r9, r12 @ CTR block 3 3108 3109 aese q0, v19.16b 3110 aesmc q0, q0 @ AES block 0 - round 1 3111 orr r9, r11, r9, lsl #32 @ CTR block 3 3112 3113 fmov v3.d[1], r9 @ CTR block 3 3114 3115 ld1 {v22.4s}, [r8], #16 @ load rk4 3116 3117 aese q0, v20.16b 3118 aesmc q0, q0 @ AES block 0 - round 2 3119 3120 aese q2, v18.16b 3121 aesmc q2, q2 @ AES block 2 - round 0 3122 ld1 {v23.4s}, [r8], #16 @ load rk5 3123 3124 aese q1, v18.16b 3125 aesmc q1, q1 @ AES block 1 - round 0 3126 ldr q15, [r3, #112] @ load h4l | h4h 3127#ifndef __ARMEB__ 3128 ext v15.16b, v15.16b, v15.16b, #8 3129#endif 3130 aese q3, v18.16b 3131 aesmc q3, q3 @ AES block 3 - round 0 3132 ldr q13, [r3, #64] @ load h2l | h2h 3133#ifndef __ARMEB__ 3134 ext v13.16b, v13.16b, v13.16b, #8 3135#endif 3136 aese q2, v19.16b 3137 aesmc q2, q2 @ AES block 2 - round 1 3138 ldr q14, [r3, #80] @ load h3l | h3h 3139#ifndef __ARMEB__ 3140 ext v14.16b, v14.16b, v14.16b, #8 3141#endif 3142 aese q1, v19.16b 3143 aesmc q1, q1 @ AES block 1 - round 1 3144 3145 aese q3, v19.16b 3146 aesmc q3, q3 @ AES block 3 - round 1 3147 ldr q12, [r3, #32] @ load h1l | h1h 3148#ifndef __ARMEB__ 3149 ext v12.16b, v12.16b, v12.16b, #8 3150#endif 3151 aese q2, v20.16b 3152 aesmc q2, q2 @ AES block 2 - round 2 3153 ld1 {v24.4s}, [r8], #16 @ load rk6 3154 3155 aese q0, v21.16b 3156 aesmc q0, q0 @ AES block 0 - round 3 3157 ld1 {v25.4s}, [r8], #16 @ load rk7 3158 3159 aese q1, v20.16b 3160 aesmc q1, q1 @ AES block 1 - round 2 3161 ld1 {v26.4s}, [r8], #16 @ load rk8 3162 3163 aese q3, v20.16b 3164 aesmc q3, q3 @ AES block 3 - round 2 3165 ld1 {v27.4s}, [r8], #16 @ load rk9 3166 3167 aese q2, v21.16b 3168 aesmc q2, q2 @ AES block 2 - round 3 3169 ld1 { v11.16b}, [r3] 3170 ext 
v11.16b, v11.16b, v11.16b, #8 3171 rev64 v11.16b, v11.16b 3172 3173 aese q1, v21.16b 3174 aesmc q1, q1 @ AES block 1 - round 3 3175 add r12, r12, #1 @ CTR block 3 3176 3177 aese q3, v21.16b 3178 aesmc q3, q3 @ AES block 3 - round 3 3179 trn1 q9, v14.2d, v15.2d @ h4h | h3h 3180 3181 aese q0, v22.16b 3182 aesmc q0, q0 @ AES block 0 - round 4 3183 ld1 {v28.4s}, [r8], #16 @ load rk10 3184 3185 aese q1, v22.16b 3186 aesmc q1, q1 @ AES block 1 - round 4 3187 trn2 v17.2d, v14.2d, v15.2d @ h4l | h3l 3188 3189 aese q2, v22.16b 3190 aesmc q2, q2 @ AES block 2 - round 4 3191 3192 aese q3, v22.16b 3193 aesmc q3, q3 @ AES block 3 - round 4 3194 trn2 v16.2d, v12.2d, v13.2d @ h2l | h1l 3195 3196 aese q0, v23.16b 3197 aesmc q0, q0 @ AES block 0 - round 5 3198 ld1 {v29.4s}, [r8], #16 @ load rk11 3199 3200 aese q1, v23.16b 3201 aesmc q1, q1 @ AES block 1 - round 5 3202 3203 aese q2, v23.16b 3204 aesmc q2, q2 @ AES block 2 - round 5 3205 3206 aese q3, v23.16b 3207 aesmc q3, q3 @ AES block 3 - round 5 3208 3209 aese q0, v24.16b 3210 aesmc q0, q0 @ AES block 0 - round 6 3211 3212 aese q2, v24.16b 3213 aesmc q2, q2 @ AES block 2 - round 6 3214 3215 aese q3, v24.16b 3216 aesmc q3, q3 @ AES block 3 - round 6 3217 3218 aese q0, v25.16b 3219 aesmc q0, q0 @ AES block 0 - round 7 3220 3221 aese q2, v25.16b 3222 aesmc q2, q2 @ AES block 2 - round 7 3223 3224 aese q3, v25.16b 3225 aesmc q3, q3 @ AES block 3 - round 7 3226 3227 aese q1, v24.16b 3228 aesmc q1, q1 @ AES block 1 - round 6 3229 3230 aese q2, v26.16b 3231 aesmc q2, q2 @ AES block 2 - round 8 3232 3233 aese q3, v26.16b 3234 aesmc q3, q3 @ AES block 3 - round 8 3235 3236 aese q1, v25.16b 3237 aesmc q1, q1 @ AES block 1 - round 7 3238 3239 aese q2, v27.16b 3240 aesmc q2, q2 @ AES block 2 - round 9 3241 3242 aese q3, v27.16b 3243 aesmc q3, q3 @ AES block 3 - round 9 3244 3245 aese q1, v26.16b 3246 aesmc q1, q1 @ AES block 1 - round 8 3247 sub r5, r5, #1 @ byte_len - 1 3248 3249 aese q0, v26.16b 3250 aesmc q0, q0 @ AES block 0 - round 8 
3251 and r5, r5, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail) 3252 3253 aese q3, v28.16b 3254 aesmc q3, q3 @ AES block 3 - round 10 3255 add r5, r5, r0 3256 3257 aese q1, v27.16b 3258 aesmc q1, q1 @ AES block 1 - round 9 3259 cmp r0, r5 @ check if we have <= 4 blocks 3260 3261 aese q0, v27.16b 3262 aesmc q0, q0 @ AES block 0 - round 9 3263 trn1 q8, v12.2d, v13.2d @ h2h | h1h 3264 3265 aese q3, v29.16b @ AES block 3 - round 11 3266 3267 aese q2, v28.16b 3268 aesmc q2, q2 @ AES block 2 - round 10 3269 3270 aese q1, v28.16b 3271 aesmc q1, q1 @ AES block 1 - round 10 3272 3273 aese q0, v28.16b 3274 aesmc q0, q0 @ AES block 0 - round 10 3275 eor v16.16b, v16.16b, q8 @ h2k | h1k 3276 3277 aese q2, v29.16b @ AES block 2 - round 11 3278 3279 aese q1, v29.16b @ AES block 1 - round 11 3280 eor v17.16b, v17.16b, q9 @ h4k | h3k 3281 3282 aese q0, v29.16b @ AES block 0 - round 11 3283 bge .L192_dec_tail @ handle tail 3284 3285 ld1 {q4, q5}, [r0], #32 @ AES block 0,1 - load ciphertext 3286 3287 eor q1, q5, q1 @ AES block 1 - result 3288 3289 eor q0, q4, q0 @ AES block 0 - result 3290 rev r9, r12 @ CTR block 4 3291 ld1 {q6, q7}, [r0], #32 @ AES block 2,3 - load ciphertext 3292 3293 mov r19, v1.d[0] @ AES block 1 - mov low 3294 3295 mov r20, v1.d[1] @ AES block 1 - mov high 3296 3297 mov r6, v0.d[0] @ AES block 0 - mov low 3298 orr r9, r11, r9, lsl #32 @ CTR block 4 3299 add r12, r12, #1 @ CTR block 4 3300 3301 mov r7, v0.d[1] @ AES block 0 - mov high 3302 rev64 q4, q4 @ GHASH block 0 3303 3304 fmov d0, r10 @ CTR block 4 3305 rev64 q5, q5 @ GHASH block 1 3306 cmp r0, r5 @ check if we have <= 8 blocks 3307 3308 eor r19, r19, r13 @ AES block 1 - round 12 low 3309#ifdef __ARMEB__ 3310 rev r19, r19 3311#endif 3312 fmov v0.d[1], r9 @ CTR block 4 3313 rev r9, r12 @ CTR block 5 3314 3315 orr r9, r11, r9, lsl #32 @ CTR block 5 3316 fmov d1, r10 @ CTR block 5 3317 eor r20, r20, r14 @ AES block 1 - round 12 high 3318#ifdef 
__ARMEB__ 3319 rev r20, r20 3320#endif 3321 add r12, r12, #1 @ CTR block 5 3322 fmov v1.d[1], r9 @ CTR block 5 3323 eor r6, r6, r13 @ AES block 0 - round 12 low 3324#ifdef __ARMEB__ 3325 rev r6, r6 3326#endif 3327 rev r9, r12 @ CTR block 6 3328 eor r7, r7, r14 @ AES block 0 - round 12 high 3329#ifdef __ARMEB__ 3330 rev r7, r7 3331#endif 3332 stp r6, r7, [r2], #16 @ AES block 0 - store result 3333 orr r9, r11, r9, lsl #32 @ CTR block 6 3334 3335 stp r19, r20, [r2], #16 @ AES block 1 - store result 3336 3337 add r12, r12, #1 @ CTR block 6 3338 eor q2, q6, q2 @ AES block 2 - result 3339 bge .L192_dec_prepretail @ do prepretail 3340 3341.L192_dec_main_loop:@ main loop start 3342 aese q1, v18.16b 3343 aesmc q1, q1 @ AES block 4k+5 - round 0 3344 ext v11.16b, v11.16b, v11.16b, #8 @ PRE 0 3345 3346 pmull v31.1q, q5, v14.1d @ GHASH block 4k+1 - low 3347 mov r21, v2.d[0] @ AES block 4k+2 - mov low 3348 3349 mov r22, v2.d[1] @ AES block 4k+2 - mov high 3350 eor q3, q7, q3 @ AES block 4k+3 - result 3351 rev64 q7, q7 @ GHASH block 4k+3 3352 3353 aese q1, v19.16b 3354 aesmc q1, q1 @ AES block 4k+5 - round 1 3355 fmov d2, r10 @ CTR block 4k+6 3356 3357 aese q0, v18.16b 3358 aesmc q0, q0 @ AES block 4k+4 - round 0 3359 eor q4, q4, v11.16b @ PRE 1 3360 3361 pmull2 v30.1q, q5, v14.2d @ GHASH block 4k+1 - high 3362 fmov v2.d[1], r9 @ CTR block 4k+6 3363 3364 aese q1, v20.16b 3365 aesmc q1, q1 @ AES block 4k+5 - round 2 3366 mov r24, v3.d[1] @ AES block 4k+3 - mov high 3367 3368 aese q0, v19.16b 3369 aesmc q0, q0 @ AES block 4k+4 - round 1 3370 mov r23, v3.d[0] @ AES block 4k+3 - mov low 3371 3372 pmull2 v9.1q, q4, v15.2d @ GHASH block 4k - high 3373 fmov d3, r10 @ CTR block 4k+7 3374 mov d8, v4.d[1] @ GHASH block 4k - mid 3375 3376 pmull v11.1q, q4, v15.1d @ GHASH block 4k - low 3377 mov d10, v17.d[1] @ GHASH block 4k - mid 3378 rev r9, r12 @ CTR block 4k+7 3379 3380 aese q2, v18.16b 3381 aesmc q2, q2 @ AES block 4k+6 - round 0 3382 orr r9, r11, r9, lsl #32 @ CTR block 4k+7 3383 
3384 fmov v3.d[1], r9 @ CTR block 4k+7 3385 eor q8, q8, q4 @ GHASH block 4k - mid 3386 mov d4, v5.d[1] @ GHASH block 4k+1 - mid 3387 3388 aese q1, v21.16b 3389 aesmc q1, q1 @ AES block 4k+5 - round 3 3390 3391 aese q0, v20.16b 3392 aesmc q0, q0 @ AES block 4k+4 - round 2 3393 eor r22, r22, r14 @ AES block 4k+2 - round 12 high 3394#ifdef __ARMEB__ 3395 rev r22, r22 3396#endif 3397 aese q2, v19.16b 3398 aesmc q2, q2 @ AES block 4k+6 - round 1 3399 eor q4, q4, q5 @ GHASH block 4k+1 - mid 3400 3401 pmull v10.1q, q8, v10.1d @ GHASH block 4k - mid 3402 3403 aese q3, v18.16b 3404 aesmc q3, q3 @ AES block 4k+7 - round 0 3405 rev64 q6, q6 @ GHASH block 4k+2 3406 3407 aese q2, v20.16b 3408 aesmc q2, q2 @ AES block 4k+6 - round 2 3409 3410 pmull v4.1q, q4, v17.1d @ GHASH block 4k+1 - mid 3411 eor v11.16b, v11.16b, v31.16b @ GHASH block 4k+1 - low 3412 eor r21, r21, r13 @ AES block 4k+2 - round 12 low 3413#ifdef __ARMEB__ 3414 rev r21, r21 3415#endif 3416 aese q1, v22.16b 3417 aesmc q1, q1 @ AES block 4k+5 - round 4 3418 3419 aese q0, v21.16b 3420 aesmc q0, q0 @ AES block 4k+4 - round 3 3421 3422 eor v10.16b, v10.16b, q4 @ GHASH block 4k+1 - mid 3423 mov d31, v6.d[1] @ GHASH block 4k+2 - mid 3424 3425 aese q3, v19.16b 3426 aesmc q3, q3 @ AES block 4k+7 - round 1 3427 eor q9, q9, v30.16b @ GHASH block 4k+1 - high 3428 3429 aese q0, v22.16b 3430 aesmc q0, q0 @ AES block 4k+4 - round 4 3431 3432 pmull2 v30.1q, q6, v13.2d @ GHASH block 4k+2 - high 3433 eor v31.8b, v31.8b, q6 @ GHASH block 4k+2 - mid 3434 3435 pmull v8.1q, q6, v13.1d @ GHASH block 4k+2 - low 3436 3437 aese q0, v23.16b 3438 aesmc q0, q0 @ AES block 4k+4 - round 5 3439 3440 eor q9, q9, v30.16b @ GHASH block 4k+2 - high 3441 mov d30, v7.d[1] @ GHASH block 4k+3 - mid 3442 3443 aese q1, v23.16b 3444 aesmc q1, q1 @ AES block 4k+5 - round 5 3445 3446 pmull2 v5.1q, q7, v12.2d @ GHASH block 4k+3 - high 3447 3448 aese q3, v20.16b 3449 aesmc q3, q3 @ AES block 4k+7 - round 2 3450 eor v30.8b, v30.8b, q7 @ GHASH block 4k+3 - 
mid 3451 3452 aese q1, v24.16b 3453 aesmc q1, q1 @ AES block 4k+5 - round 6 3454 3455 aese q0, v24.16b 3456 aesmc q0, q0 @ AES block 4k+4 - round 6 3457 ins v31.d[1], v31.d[0] @ GHASH block 4k+2 - mid 3458 3459 aese q3, v21.16b 3460 aesmc q3, q3 @ AES block 4k+7 - round 3 3461 3462 pmull v30.1q, v30.1d, v16.1d @ GHASH block 4k+3 - mid 3463 eor v11.16b, v11.16b, q8 @ GHASH block 4k+2 - low 3464 3465 aese q0, v25.16b 3466 aesmc q0, q0 @ AES block 4k+4 - round 7 3467 3468 pmull2 v31.1q, v31.2d, v16.2d @ GHASH block 4k+2 - mid 3469 eor q9, q9, q5 @ GHASH block 4k+3 - high 3470 3471 aese q1, v25.16b 3472 aesmc q1, q1 @ AES block 4k+5 - round 7 3473 3474 aese q0, v26.16b 3475 aesmc q0, q0 @ AES block 4k+4 - round 8 3476 movi q8, #0xc2 3477 3478 pmull v6.1q, q7, v12.1d @ GHASH block 4k+3 - low 3479 3480 aese q1, v26.16b 3481 aesmc q1, q1 @ AES block 4k+5 - round 8 3482 eor v10.16b, v10.16b, v31.16b @ GHASH block 4k+2 - mid 3483 3484 aese q2, v21.16b 3485 aesmc q2, q2 @ AES block 4k+6 - round 3 3486 3487 aese q0, v27.16b 3488 aesmc q0, q0 @ AES block 4k+4 - round 9 3489 eor v11.16b, v11.16b, q6 @ GHASH block 4k+3 - low 3490 3491 aese q3, v22.16b 3492 aesmc q3, q3 @ AES block 4k+7 - round 4 3493 3494 aese q2, v22.16b 3495 aesmc q2, q2 @ AES block 4k+6 - round 4 3496 eor v10.16b, v10.16b, v30.16b @ GHASH block 4k+3 - mid 3497 3498 aese q0, v28.16b 3499 aesmc q0, q0 @ AES block 4k+4 - round 10 3500 3501 aese q1, v27.16b 3502 aesmc q1, q1 @ AES block 4k+5 - round 9 3503 eor v30.16b, v11.16b, q9 @ MODULO - karatsuba tidy up 3504 3505 aese q2, v23.16b 3506 aesmc q2, q2 @ AES block 4k+6 - round 5 3507 3508 aese q3, v23.16b 3509 aesmc q3, q3 @ AES block 4k+7 - round 5 3510 shl d8, d8, #56 @ mod_constant 3511 3512 aese q1, v28.16b 3513 aesmc q1, q1 @ AES block 4k+5 - round 10 3514 3515 aese q2, v24.16b 3516 aesmc q2, q2 @ AES block 4k+6 - round 6 3517 ld1 {q4}, [r0], #16 @ AES block 4k+4 - load ciphertext 3518 3519 aese q3, v24.16b 3520 aesmc q3, q3 @ AES block 4k+7 - round 6 3521 
eor v10.16b, v10.16b, v30.16b @ MODULO - karatsuba tidy up 3522 3523 pmull v31.1q, q9, q8 @ MODULO - top 64b align with mid 3524 ld1 {q5}, [r0], #16 @ AES block 4k+5 - load ciphertext 3525 eor r23, r23, r13 @ AES block 4k+3 - round 12 low 3526#ifdef __ARMEB__ 3527 rev r23, r23 3528#endif 3529 aese q2, v25.16b 3530 aesmc q2, q2 @ AES block 4k+6 - round 7 3531 ext q9, q9, q9, #8 @ MODULO - other top alignment 3532 3533 aese q0, v29.16b @ AES block 4k+4 - round 11 3534 add r12, r12, #1 @ CTR block 4k+7 3535 3536 aese q3, v25.16b 3537 aesmc q3, q3 @ AES block 4k+7 - round 7 3538 eor v10.16b, v10.16b, v31.16b @ MODULO - fold into mid 3539 3540 aese q2, v26.16b 3541 aesmc q2, q2 @ AES block 4k+6 - round 8 3542 ld1 {q6}, [r0], #16 @ AES block 4k+6 - load ciphertext 3543 3544 aese q1, v29.16b @ AES block 4k+5 - round 11 3545 ld1 {q7}, [r0], #16 @ AES block 4k+7 - load ciphertext 3546 rev r9, r12 @ CTR block 4k+8 3547 3548 aese q3, v26.16b 3549 aesmc q3, q3 @ AES block 4k+7 - round 8 3550 stp r21, r22, [r2], #16 @ AES block 4k+2 - store result 3551 3552 aese q2, v27.16b 3553 aesmc q2, q2 @ AES block 4k+6 - round 9 3554 eor v10.16b, v10.16b, q9 @ MODULO - fold into mid 3555 3556 cmp r0, r5 @ .LOOP CONTROL 3557 3558 eor q0, q4, q0 @ AES block 4k+4 - result 3559 eor r24, r24, r14 @ AES block 4k+3 - round 12 high 3560#ifdef __ARMEB__ 3561 rev r24, r24 3562#endif 3563 eor q1, q5, q1 @ AES block 4k+5 - result 3564 3565 aese q2, v28.16b 3566 aesmc q2, q2 @ AES block 4k+6 - round 10 3567 orr r9, r11, r9, lsl #32 @ CTR block 4k+8 3568 3569 aese q3, v27.16b 3570 aesmc q3, q3 @ AES block 4k+7 - round 9 3571 3572 pmull v8.1q, v10.1d, q8 @ MODULO - mid 64b align with low 3573 mov r19, v1.d[0] @ AES block 4k+5 - mov low 3574 3575 mov r6, v0.d[0] @ AES block 4k+4 - mov low 3576 stp r23, r24, [r2], #16 @ AES block 4k+3 - store result 3577 rev64 q5, q5 @ GHASH block 4k+5 3578 3579 aese q2, v29.16b @ AES block 4k+6 - round 11 3580 mov r7, v0.d[1] @ AES block 4k+4 - mov high 3581 3582 aese 
q3, v28.16b 3583 aesmc q3, q3 @ AES block 4k+7 - round 10 3584 mov r20, v1.d[1] @ AES block 4k+5 - mov high 3585 3586 fmov d0, r10 @ CTR block 4k+8 3587 add r12, r12, #1 @ CTR block 4k+8 3588 ext v10.16b, v10.16b, v10.16b, #8 @ MODULO - other mid alignment 3589 3590 eor q2, q6, q2 @ AES block 4k+6 - result 3591 fmov v0.d[1], r9 @ CTR block 4k+8 3592 rev r9, r12 @ CTR block 4k+9 3593 3594 eor r6, r6, r13 @ AES block 4k+4 - round 12 low 3595#ifdef __ARMEB__ 3596 rev r6, r6 3597#endif 3598 orr r9, r11, r9, lsl #32 @ CTR block 4k+9 3599 eor v11.16b, v11.16b, q8 @ MODULO - fold into low 3600 3601 fmov d1, r10 @ CTR block 4k+9 3602 add r12, r12, #1 @ CTR block 4k+9 3603 eor r19, r19, r13 @ AES block 4k+5 - round 12 low 3604#ifdef __ARMEB__ 3605 rev r19, r19 3606#endif 3607 fmov v1.d[1], r9 @ CTR block 4k+9 3608 rev r9, r12 @ CTR block 4k+10 3609 eor r20, r20, r14 @ AES block 4k+5 - round 12 high 3610#ifdef __ARMEB__ 3611 rev r20, r20 3612#endif 3613 eor r7, r7, r14 @ AES block 4k+4 - round 12 high 3614#ifdef __ARMEB__ 3615 rev r7, r7 3616#endif 3617 stp r6, r7, [r2], #16 @ AES block 4k+4 - store result 3618 eor v11.16b, v11.16b, v10.16b @ MODULO - fold into low 3619 3620 add r12, r12, #1 @ CTR block 4k+10 3621 rev64 q4, q4 @ GHASH block 4k+4 3622 orr r9, r11, r9, lsl #32 @ CTR block 4k+10 3623 3624 aese q3, v29.16b @ AES block 4k+7 - round 11 3625 stp r19, r20, [r2], #16 @ AES block 4k+5 - store result 3626 blt .L192_dec_main_loop 3627 3628.L192_dec_prepretail:@ PREPRETAIL 3629 mov r22, v2.d[1] @ AES block 4k+2 - mov high 3630 ext v11.16b, v11.16b, v11.16b, #8 @ PRE 0 3631 eor q3, q7, q3 @ AES block 4k+3 - result 3632 3633 aese q1, v18.16b 3634 aesmc q1, q1 @ AES block 4k+5 - round 0 3635 mov r21, v2.d[0] @ AES block 4k+2 - mov low 3636 3637 aese q0, v18.16b 3638 aesmc q0, q0 @ AES block 4k+4 - round 0 3639 mov d10, v17.d[1] @ GHASH block 4k - mid 3640 3641 eor q4, q4, v11.16b @ PRE 1 3642 fmov d2, r10 @ CTR block 4k+6 3643 3644 aese q1, v19.16b 3645 aesmc q1, q1 @ AES 
block 4k+5 - round 1 3646 mov r23, v3.d[0] @ AES block 4k+3 - mov low 3647 3648 aese q0, v19.16b 3649 aesmc q0, q0 @ AES block 4k+4 - round 1 3650 mov r24, v3.d[1] @ AES block 4k+3 - mov high 3651 3652 pmull v11.1q, q4, v15.1d @ GHASH block 4k - low 3653 mov d8, v4.d[1] @ GHASH block 4k - mid 3654 fmov d3, r10 @ CTR block 4k+7 3655 3656 aese q1, v20.16b 3657 aesmc q1, q1 @ AES block 4k+5 - round 2 3658 rev64 q6, q6 @ GHASH block 4k+2 3659 3660 pmull2 v9.1q, q4, v15.2d @ GHASH block 4k - high 3661 fmov v2.d[1], r9 @ CTR block 4k+6 3662 rev r9, r12 @ CTR block 4k+7 3663 3664 orr r9, r11, r9, lsl #32 @ CTR block 4k+7 3665 eor q8, q8, q4 @ GHASH block 4k - mid 3666 mov d4, v5.d[1] @ GHASH block 4k+1 - mid 3667 3668 pmull v31.1q, q5, v14.1d @ GHASH block 4k+1 - low 3669 eor r24, r24, r14 @ AES block 4k+3 - round 12 high 3670#ifdef __ARMEB__ 3671 rev r24, r24 3672#endif 3673 fmov v3.d[1], r9 @ CTR block 4k+7 3674 3675 aese q0, v20.16b 3676 aesmc q0, q0 @ AES block 4k+4 - round 2 3677 eor r21, r21, r13 @ AES block 4k+2 - round 12 low 3678#ifdef __ARMEB__ 3679 rev r21, r21 3680#endif 3681 pmull2 v30.1q, q5, v14.2d @ GHASH block 4k+1 - high 3682 eor r22, r22, r14 @ AES block 4k+2 - round 12 high 3683#ifdef __ARMEB__ 3684 rev r22, r22 3685#endif 3686 eor q4, q4, q5 @ GHASH block 4k+1 - mid 3687 3688 pmull v10.1q, q8, v10.1d @ GHASH block 4k - mid 3689 eor r23, r23, r13 @ AES block 4k+3 - round 12 low 3690#ifdef __ARMEB__ 3691 rev r23, r23 3692#endif 3693 stp r21, r22, [r2], #16 @ AES block 4k+2 - store result 3694 3695 rev64 q7, q7 @ GHASH block 4k+3 3696 stp r23, r24, [r2], #16 @ AES block 4k+3 - store result 3697 3698 aese q3, v18.16b 3699 aesmc q3, q3 @ AES block 4k+7 - round 0 3700 eor q9, q9, v30.16b @ GHASH block 4k+1 - high 3701 3702 pmull v4.1q, q4, v17.1d @ GHASH block 4k+1 - mid 3703 add r12, r12, #1 @ CTR block 4k+7 3704 3705 pmull2 v30.1q, q6, v13.2d @ GHASH block 4k+2 - high 3706 eor v11.16b, v11.16b, v31.16b @ GHASH block 4k+1 - low 3707 3708 aese q2, v18.16b 
3709 aesmc q2, q2 @ AES block 4k+6 - round 0 3710 3711 eor v10.16b, v10.16b, q4 @ GHASH block 4k+1 - mid 3712 mov d31, v6.d[1] @ GHASH block 4k+2 - mid 3713 3714 aese q3, v19.16b 3715 aesmc q3, q3 @ AES block 4k+7 - round 1 3716 3717 aese q2, v19.16b 3718 aesmc q2, q2 @ AES block 4k+6 - round 1 3719 eor q9, q9, v30.16b @ GHASH block 4k+2 - high 3720 3721 eor v31.8b, v31.8b, q6 @ GHASH block 4k+2 - mid 3722 3723 pmull v8.1q, q6, v13.1d @ GHASH block 4k+2 - low 3724 3725 aese q2, v20.16b 3726 aesmc q2, q2 @ AES block 4k+6 - round 2 3727 mov d30, v7.d[1] @ GHASH block 4k+3 - mid 3728 3729 aese q3, v20.16b 3730 aesmc q3, q3 @ AES block 4k+7 - round 2 3731 ins v31.d[1], v31.d[0] @ GHASH block 4k+2 - mid 3732 3733 pmull v6.1q, q7, v12.1d @ GHASH block 4k+3 - low 3734 3735 aese q0, v21.16b 3736 aesmc q0, q0 @ AES block 4k+4 - round 3 3737 eor v30.8b, v30.8b, q7 @ GHASH block 4k+3 - mid 3738 3739 aese q1, v21.16b 3740 aesmc q1, q1 @ AES block 4k+5 - round 3 3741 3742 pmull2 v31.1q, v31.2d, v16.2d @ GHASH block 4k+2 - mid 3743 eor v11.16b, v11.16b, q8 @ GHASH block 4k+2 - low 3744 3745 aese q0, v22.16b 3746 aesmc q0, q0 @ AES block 4k+4 - round 4 3747 3748 pmull2 v5.1q, q7, v12.2d @ GHASH block 4k+3 - high 3749 movi q8, #0xc2 3750 3751 pmull v30.1q, v30.1d, v16.1d @ GHASH block 4k+3 - mid 3752 3753 aese q2, v21.16b 3754 aesmc q2, q2 @ AES block 4k+6 - round 3 3755 3756 shl d8, d8, #56 @ mod_constant 3757 eor q9, q9, q5 @ GHASH block 4k+3 - high 3758 3759 aese q0, v23.16b 3760 aesmc q0, q0 @ AES block 4k+4 - round 5 3761 eor v10.16b, v10.16b, v31.16b @ GHASH block 4k+2 - mid 3762 3763 aese q2, v22.16b 3764 aesmc q2, q2 @ AES block 4k+6 - round 4 3765 3766 pmull v31.1q, q9, q8 @ MODULO - top 64b align with mid 3767 eor v11.16b, v11.16b, q6 @ GHASH block 4k+3 - low 3768 3769 aese q0, v24.16b 3770 aesmc q0, q0 @ AES block 4k+4 - round 6 3771 3772 aese q3, v21.16b 3773 aesmc q3, q3 @ AES block 4k+7 - round 3 3774 eor v10.16b, v10.16b, v30.16b @ GHASH block 4k+3 - mid 3775 3776 
aese q2, v23.16b 3777 aesmc q2, q2 @ AES block 4k+6 - round 5 3778 3779 aese q0, v25.16b 3780 aesmc q0, q0 @ AES block 4k+4 - round 7 3781 eor v30.16b, v11.16b, q9 @ MODULO - karatsuba tidy up 3782 3783 aese q3, v22.16b 3784 aesmc q3, q3 @ AES block 4k+7 - round 4 3785 3786 aese q2, v24.16b 3787 aesmc q2, q2 @ AES block 4k+6 - round 6 3788 ext q9, q9, q9, #8 @ MODULO - other top alignment 3789 3790 aese q0, v26.16b 3791 aesmc q0, q0 @ AES block 4k+4 - round 8 3792 3793 aese q3, v23.16b 3794 aesmc q3, q3 @ AES block 4k+7 - round 5 3795 eor v10.16b, v10.16b, v30.16b @ MODULO - karatsuba tidy up 3796 3797 aese q1, v22.16b 3798 aesmc q1, q1 @ AES block 4k+5 - round 4 3799 3800 aese q2, v25.16b 3801 aesmc q2, q2 @ AES block 4k+6 - round 7 3802 3803 aese q0, v27.16b 3804 aesmc q0, q0 @ AES block 4k+4 - round 9 3805 3806 aese q1, v23.16b 3807 aesmc q1, q1 @ AES block 4k+5 - round 5 3808 3809 aese q3, v24.16b 3810 aesmc q3, q3 @ AES block 4k+7 - round 6 3811 eor v10.16b, v10.16b, v31.16b @ MODULO - fold into mid 3812 3813 aese q0, v28.16b 3814 aesmc q0, q0 @ AES block 4k+4 - round 10 3815 3816 aese q1, v24.16b 3817 aesmc q1, q1 @ AES block 4k+5 - round 6 3818 3819 aese q3, v25.16b 3820 aesmc q3, q3 @ AES block 4k+7 - round 7 3821 3822 aese q2, v26.16b 3823 aesmc q2, q2 @ AES block 4k+6 - round 8 3824 eor v10.16b, v10.16b, q9 @ MODULO - fold into mid 3825 3826 aese q1, v25.16b 3827 aesmc q1, q1 @ AES block 4k+5 - round 7 3828 3829 aese q3, v26.16b 3830 aesmc q3, q3 @ AES block 4k+7 - round 8 3831 3832 aese q2, v27.16b 3833 aesmc q2, q2 @ AES block 4k+6 - round 9 3834 3835 aese q1, v26.16b 3836 aesmc q1, q1 @ AES block 4k+5 - round 8 3837 3838 aese q3, v27.16b 3839 aesmc q3, q3 @ AES block 4k+7 - round 9 3840 3841 pmull v8.1q, v10.1d, q8 @ MODULO - mid 64b align with low 3842 3843 aese q1, v27.16b 3844 aesmc q1, q1 @ AES block 4k+5 - round 9 3845 3846 aese q2, v28.16b 3847 aesmc q2, q2 @ AES block 4k+6 - round 10 3848 3849 aese q3, v28.16b 3850 aesmc q3, q3 @ AES block 4k+7 
- round 10 3851 ext v10.16b, v10.16b, v10.16b, #8 @ MODULO - other mid alignment 3852 3853 aese q1, v28.16b 3854 aesmc q1, q1 @ AES block 4k+5 - round 10 3855 3856 aese q0, v29.16b 3857 eor v11.16b, v11.16b, q8 @ MODULO - fold into low 3858 3859 aese q2, v29.16b 3860 3861 aese q1, v29.16b 3862 3863 aese q3, v29.16b 3864 3865 eor v11.16b, v11.16b, v10.16b @ MODULO - fold into low 3866.L192_dec_tail:@ TAIL 3867 3868 sub r5, r4, r0 @ main_end_input_ptr is number of bytes left to process 3869 ld1 { q5}, [r0], #16 @ AES block 4k+4 - load ciphertext 3870 3871 eor q0, q5, q0 @ AES block 4k+4 - result 3872 3873 mov r7, v0.d[1] @ AES block 4k+4 - mov high 3874 3875 mov r6, v0.d[0] @ AES block 4k+4 - mov low 3876 3877 ext q8, v11.16b, v11.16b, #8 @ prepare final partial tag 3878 3879 cmp r5, #48 3880 3881 eor r7, r7, r14 @ AES block 4k+4 - round 12 high 3882#ifdef __ARMEB__ 3883 rev r7, r7 3884#endif 3885 eor r6, r6, r13 @ AES block 4k+4 - round 12 low 3886#ifdef __ARMEB__ 3887 rev r6, r6 3888#endif 3889 bgt .L192_dec_blocks_more_than_3 3890 3891 movi v11.8b, #0 3892 movi q9, #0 3893 3894 mov q3, q2 3895 mov q2, q1 3896 sub r12, r12, #1 3897 3898 movi v10.8b, #0 3899 cmp r5, #32 3900 bgt .L192_dec_blocks_more_than_2 3901 3902 mov q3, q1 3903 cmp r5, #16 3904 sub r12, r12, #1 3905 3906 bgt .L192_dec_blocks_more_than_1 3907 3908 sub r12, r12, #1 3909 b .L192_dec_blocks_less_than_1 3910.L192_dec_blocks_more_than_3:@ blocks left > 3 3911 rev64 q4, q5 @ GHASH final-3 block 3912 ld1 { q5}, [r0], #16 @ AES final-2 block - load ciphertext 3913 3914 stp r6, r7, [r2], #16 @ AES final-3 block - store result 3915 3916 eor q4, q4, q8 @ feed in partial tag 3917 3918 eor q0, q5, q1 @ AES final-2 block - result 3919 3920 pmull v11.1q, q4, v15.1d @ GHASH final-3 block - low 3921 mov r6, v0.d[0] @ AES final-2 block - mov low 3922 mov d22, v4.d[1] @ GHASH final-3 block - mid 3923 3924 mov r7, v0.d[1] @ AES final-2 block - mov high 3925 3926 mov d10, v17.d[1] @ GHASH final-3 block - mid 3927 
eor v22.8b, v22.8b, q4 @ GHASH final-3 block - mid 3928 3929 pmull2 v9.1q, q4, v15.2d @ GHASH final-3 block - high 3930 3931 eor r6, r6, r13 @ AES final-2 block - round 12 low 3932#ifdef __ARMEB__ 3933 rev r6, r6 3934#endif 3935 movi q8, #0 @ suppress further partial tag feed in 3936 3937 pmull v10.1q, v22.1d, v10.1d @ GHASH final-3 block - mid 3938 eor r7, r7, r14 @ AES final-2 block - round 12 high 3939#ifdef __ARMEB__ 3940 rev r7, r7 3941#endif 3942.L192_dec_blocks_more_than_2:@ blocks left > 2 3943 3944 rev64 q4, q5 @ GHASH final-2 block 3945 ld1 { q5}, [r0], #16 @ AES final-1 block - load ciphertext 3946 3947 eor q4, q4, q8 @ feed in partial tag 3948 3949 movi q8, #0 @ suppress further partial tag feed in 3950 3951 eor q0, q5, q2 @ AES final-1 block - result 3952 3953 mov d22, v4.d[1] @ GHASH final-2 block - mid 3954 3955 pmull v21.1q, q4, v14.1d @ GHASH final-2 block - low 3956 3957 stp r6, r7, [r2], #16 @ AES final-2 block - store result 3958 3959 eor v22.8b, v22.8b, q4 @ GHASH final-2 block - mid 3960 mov r7, v0.d[1] @ AES final-1 block - mov high 3961 3962 eor v11.16b, v11.16b, v21.16b @ GHASH final-2 block - low 3963 mov r6, v0.d[0] @ AES final-1 block - mov low 3964 3965 pmull2 v20.1q, q4, v14.2d @ GHASH final-2 block - high 3966 3967 pmull v22.1q, v22.1d, v17.1d @ GHASH final-2 block - mid 3968 3969 eor q9, q9, v20.16b @ GHASH final-2 block - high 3970 eor r7, r7, r14 @ AES final-1 block - round 12 high 3971#ifdef __ARMEB__ 3972 rev r7, r7 3973#endif 3974 eor r6, r6, r13 @ AES final-1 block - round 12 low 3975#ifdef __ARMEB__ 3976 rev r6, r6 3977#endif 3978 eor v10.16b, v10.16b, v22.16b @ GHASH final-2 block - mid 3979.L192_dec_blocks_more_than_1:@ blocks left > 1 3980 3981 rev64 q4, q5 @ GHASH final-1 block 3982 3983 eor q4, q4, q8 @ feed in partial tag 3984 ld1 { q5}, [r0], #16 @ AES final block - load ciphertext 3985 3986 mov d22, v4.d[1] @ GHASH final-1 block - mid 3987 3988 pmull2 v20.1q, q4, v13.2d @ GHASH final-1 block - high 3989 3990 eor q0, 
q5, q3 @ AES final block - result 3991 stp r6, r7, [r2], #16 @ AES final-1 block - store result 3992 3993 eor v22.8b, v22.8b, q4 @ GHASH final-1 block - mid 3994 3995 eor q9, q9, v20.16b @ GHASH final-1 block - high 3996 3997 pmull v21.1q, q4, v13.1d @ GHASH final-1 block - low 3998 mov r7, v0.d[1] @ AES final block - mov high 3999 4000 ins v22.d[1], v22.d[0] @ GHASH final-1 block - mid 4001 mov r6, v0.d[0] @ AES final block - mov low 4002 4003 pmull2 v22.1q, v22.2d, v16.2d @ GHASH final-1 block - mid 4004 4005 movi q8, #0 @ suppress further partial tag feed in 4006 eor v11.16b, v11.16b, v21.16b @ GHASH final-1 block - low 4007 eor r7, r7, r14 @ AES final block - round 12 high 4008#ifdef __ARMEB__ 4009 rev r7, r7 4010#endif 4011 eor r6, r6, r13 @ AES final block - round 12 low 4012#ifdef __ARMEB__ 4013 rev r6, r6 4014#endif 4015 eor v10.16b, v10.16b, v22.16b @ GHASH final-1 block - mid 4016.L192_dec_blocks_less_than_1:@ blocks left <= 1 4017 4018 mvn r13, xzr @ rk12_l = 0xffffffffffffffff 4019 ldp r4, r5, [r2] @ load existing bytes we need to not overwrite 4020 and r1, r1, #127 @ bit_length %= 128 4021 4022 sub r1, r1, #128 @ bit_length -= 128 4023 4024 neg r1, r1 @ bit_length = 128 - #bits in input (in range [1,128]) 4025 4026 and r1, r1, #127 @ bit_length %= 128 4027 mvn r14, xzr @ rk12_h = 0xffffffffffffffff 4028 4029 lsr r14, r14, r1 @ rk12_h is mask for top 64b of last block 4030 cmp r1, #64 4031 4032 csel r9, r13, r14, lt 4033 csel r10, r14, xzr, lt 4034 4035 fmov d0, r9 @ ctr0b is mask for last block 4036 and r6, r6, r9 4037 bic r4, r4, r9 @ mask out low existing bytes 4038 4039 orr r6, r6, r4 4040 mov v0.d[1], r10 4041#ifndef __ARMEB__ 4042 rev r9, r12 4043#else 4044 mov r9, r12 4045#endif 4046 4047 and q5, q5, q0 @ possibly partial last block has zeroes in highest bits 4048 str r9, [r16, #12] @ store the updated counter 4049 4050 rev64 q4, q5 @ GHASH final block 4051 4052 eor q4, q4, q8 @ feed in partial tag 4053 bic r5, r5, r10 @ mask out high existing 
bytes 4054 4055 and r7, r7, r10 4056 4057 pmull2 v20.1q, q4, v12.2d @ GHASH final block - high 4058 mov d8, v4.d[1] @ GHASH final block - mid 4059 4060 pmull v21.1q, q4, v12.1d @ GHASH final block - low 4061 4062 eor q8, q8, q4 @ GHASH final block - mid 4063 4064 eor q9, q9, v20.16b @ GHASH final block - high 4065 4066 pmull v8.1q, q8, v16.1d @ GHASH final block - mid 4067 4068 eor v11.16b, v11.16b, v21.16b @ GHASH final block - low 4069 4070 eor v10.16b, v10.16b, q8 @ GHASH final block - mid 4071 movi q8, #0xc2 4072 4073 eor v30.16b, v11.16b, q9 @ MODULO - karatsuba tidy up 4074 4075 shl d8, d8, #56 @ mod_constant 4076 4077 eor v10.16b, v10.16b, v30.16b @ MODULO - karatsuba tidy up 4078 4079 pmull v31.1q, q9, q8 @ MODULO - top 64b align with mid 4080 orr r7, r7, r5 4081 stp r6, r7, [r2] 4082 4083 ext q9, q9, q9, #8 @ MODULO - other top alignment 4084 4085 eor v10.16b, v10.16b, v31.16b @ MODULO - fold into mid 4086 4087 eor v10.16b, v10.16b, q9 @ MODULO - fold into mid 4088 4089 pmull v8.1q, v10.1d, q8 @ MODULO - mid 64b align with low 4090 4091 eor v11.16b, v11.16b, q8 @ MODULO - fold into low 4092 4093 ext v10.16b, v10.16b, v10.16b, #8 @ MODULO - other mid alignment 4094 4095 eor v11.16b, v11.16b, v10.16b @ MODULO - fold into low 4096 ext v11.16b, v11.16b, v11.16b, #8 4097 rev64 v11.16b, v11.16b 4098 mov r0, r15 4099 st1 { v11.16b }, [r3] 4100 4101 ldp r21, r22, [sp, #16] 4102 ldp r23, r24, [sp, #32] 4103 ldp d8, d9, [sp, #48] 4104 ldp d10, d11, [sp, #64] 4105 ldp d12, d13, [sp, #80] 4106 ldp d14, d15, [sp, #96] 4107 ldp r19, r20, [sp], #112 4108 RET 4109 4110.L192_dec_ret: 4111 mov r0, #0x0 4112 RET 4113.size aes_gcm_dec_192_kernel,.-aes_gcm_dec_192_kernel 4114.globl aes_gcm_enc_256_kernel 4115.type aes_gcm_enc_256_kernel,%function 4116.align 4 4117aes_gcm_enc_256_kernel: 4118 cbz r1, .L256_enc_ret 4119 stp r19, r20, [sp, #-112]! 
4120 mov r16, r4 4121 mov r8, r5 4122 stp r21, r22, [sp, #16] 4123 stp r23, r24, [sp, #32] 4124 stp d8, d9, [sp, #48] 4125 stp d10, d11, [sp, #64] 4126 stp d12, d13, [sp, #80] 4127 stp d14, d15, [sp, #96] 4128 4129 add r4, r0, r1, lsr #3 @ end_input_ptr 4130 lsr r5, r1, #3 @ byte_len 4131 mov r15, r5 4132 ldp r10, r11, [r16] @ ctr96_b64, ctr96_t32 4133#ifdef __ARMEB__ 4134 rev r10, r10 4135 rev r11, r11 4136#endif 4137 ldp r13, r14, [r8, #224] @ load rk14 4138#ifdef __ARMEB__ 4139 ror r13, r13, #32 4140 ror r14, r14, #32 4141#endif 4142 ld1 { q0}, [r16] @ special case vector load initial counter so we can start first AES block as quickly as possible 4143 sub r5, r5, #1 @ byte_len - 1 4144 4145 ld1 {v18.4s}, [r8], #16 @ load rk0 4146 and r5, r5, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail) 4147 4148 ld1 {v19.4s}, [r8], #16 @ load rk1 4149 add r5, r5, r0 4150 4151 lsr r12, r11, #32 4152 fmov d2, r10 @ CTR block 2 4153 orr r11, r11, r11 4154 4155 rev r12, r12 @ rev_ctr32 4156 cmp r0, r5 @ check if we have <= 4 blocks 4157 fmov d1, r10 @ CTR block 1 4158 4159 aese q0, v18.16b 4160 aesmc q0, q0 @ AES block 0 - round 0 4161 add r12, r12, #1 @ increment rev_ctr32 4162 4163 rev r9, r12 @ CTR block 1 4164 fmov d3, r10 @ CTR block 3 4165 4166 orr r9, r11, r9, lsl #32 @ CTR block 1 4167 add r12, r12, #1 @ CTR block 1 4168 ld1 {v20.4s}, [r8], #16 @ load rk2 4169 4170 fmov v1.d[1], r9 @ CTR block 1 4171 rev r9, r12 @ CTR block 2 4172 add r12, r12, #1 @ CTR block 2 4173 4174 orr r9, r11, r9, lsl #32 @ CTR block 2 4175 ld1 {v21.4s}, [r8], #16 @ load rk3 4176 4177 fmov v2.d[1], r9 @ CTR block 2 4178 rev r9, r12 @ CTR block 3 4179 4180 aese q0, v19.16b 4181 aesmc q0, q0 @ AES block 0 - round 1 4182 orr r9, r11, r9, lsl #32 @ CTR block 3 4183 4184 fmov v3.d[1], r9 @ CTR block 3 4185 4186 aese q1, v18.16b 4187 aesmc q1, q1 @ AES block 1 - round 0 4188 ld1 {v22.4s}, [r8], #16 @ load rk4 4189 4190 aese q0, v20.16b 4191 
aesmc q0, q0 @ AES block 0 - round 2 4192 ld1 {v23.4s}, [r8], #16 @ load rk5 4193 4194 aese q2, v18.16b 4195 aesmc q2, q2 @ AES block 2 - round 0 4196 ld1 {v24.4s}, [r8], #16 @ load rk6 4197 4198 aese q1, v19.16b 4199 aesmc q1, q1 @ AES block 1 - round 1 4200 ldr q14, [r3, #80] @ load h3l | h3h 4201#ifndef __ARMEB__ 4202 ext v14.16b, v14.16b, v14.16b, #8 4203#endif 4204 aese q3, v18.16b 4205 aesmc q3, q3 @ AES block 3 - round 0 4206 ld1 {v25.4s}, [r8], #16 @ load rk7 4207 4208 aese q2, v19.16b 4209 aesmc q2, q2 @ AES block 2 - round 1 4210 ld1 {v26.4s}, [r8], #16 @ load rk8 4211 4212 aese q1, v20.16b 4213 aesmc q1, q1 @ AES block 1 - round 2 4214 ldr q13, [r3, #64] @ load h2l | h2h 4215#ifndef __ARMEB__ 4216 ext v13.16b, v13.16b, v13.16b, #8 4217#endif 4218 aese q3, v19.16b 4219 aesmc q3, q3 @ AES block 3 - round 1 4220 ld1 {v27.4s}, [r8], #16 @ load rk9 4221 4222 aese q2, v20.16b 4223 aesmc q2, q2 @ AES block 2 - round 2 4224 ldr q15, [r3, #112] @ load h4l | h4h 4225#ifndef __ARMEB__ 4226 ext v15.16b, v15.16b, v15.16b, #8 4227#endif 4228 aese q1, v21.16b 4229 aesmc q1, q1 @ AES block 1 - round 3 4230 ld1 {v28.4s}, [r8], #16 @ load rk10 4231 4232 aese q3, v20.16b 4233 aesmc q3, q3 @ AES block 3 - round 2 4234 ld1 {v29.4s}, [r8], #16 @ load rk11 4235 4236 aese q2, v21.16b 4237 aesmc q2, q2 @ AES block 2 - round 3 4238 add r12, r12, #1 @ CTR block 3 4239 4240 aese q0, v21.16b 4241 aesmc q0, q0 @ AES block 0 - round 3 4242 4243 aese q3, v21.16b 4244 aesmc q3, q3 @ AES block 3 - round 3 4245 ld1 { v11.16b}, [r3] 4246 ext v11.16b, v11.16b, v11.16b, #8 4247 rev64 v11.16b, v11.16b 4248 4249 aese q2, v22.16b 4250 aesmc q2, q2 @ AES block 2 - round 4 4251 4252 aese q0, v22.16b 4253 aesmc q0, q0 @ AES block 0 - round 4 4254 4255 aese q1, v22.16b 4256 aesmc q1, q1 @ AES block 1 - round 4 4257 4258 aese q3, v22.16b 4259 aesmc q3, q3 @ AES block 3 - round 4 4260 4261 aese q0, v23.16b 4262 aesmc q0, q0 @ AES block 0 - round 5 4263 4264 aese q1, v23.16b 4265 aesmc q1, q1 @ AES 
block 1 - round 5 4266 4267 aese q3, v23.16b 4268 aesmc q3, q3 @ AES block 3 - round 5 4269 4270 aese q2, v23.16b 4271 aesmc q2, q2 @ AES block 2 - round 5 4272 4273 aese q1, v24.16b 4274 aesmc q1, q1 @ AES block 1 - round 6 4275 trn2 v17.2d, v14.2d, v15.2d @ h4l | h3l 4276 4277 aese q3, v24.16b 4278 aesmc q3, q3 @ AES block 3 - round 6 4279 ld1 {v30.4s}, [r8], #16 @ load rk12 4280 4281 aese q0, v24.16b 4282 aesmc q0, q0 @ AES block 0 - round 6 4283 ldr q12, [r3, #32] @ load h1l | h1h 4284#ifndef __ARMEB__ 4285 ext v12.16b, v12.16b, v12.16b, #8 4286#endif 4287 aese q2, v24.16b 4288 aesmc q2, q2 @ AES block 2 - round 6 4289 ld1 {v31.4s}, [r8], #16 @ load rk13 4290 4291 aese q1, v25.16b 4292 aesmc q1, q1 @ AES block 1 - round 7 4293 trn1 q9, v14.2d, v15.2d @ h4h | h3h 4294 4295 aese q0, v25.16b 4296 aesmc q0, q0 @ AES block 0 - round 7 4297 4298 aese q2, v25.16b 4299 aesmc q2, q2 @ AES block 2 - round 7 4300 4301 aese q3, v25.16b 4302 aesmc q3, q3 @ AES block 3 - round 7 4303 trn2 v16.2d, v12.2d, v13.2d @ h2l | h1l 4304 4305 aese q1, v26.16b 4306 aesmc q1, q1 @ AES block 1 - round 8 4307 4308 aese q2, v26.16b 4309 aesmc q2, q2 @ AES block 2 - round 8 4310 4311 aese q3, v26.16b 4312 aesmc q3, q3 @ AES block 3 - round 8 4313 4314 aese q1, v27.16b 4315 aesmc q1, q1 @ AES block 1 - round 9 4316 4317 aese q2, v27.16b 4318 aesmc q2, q2 @ AES block 2 - round 9 4319 4320 aese q0, v26.16b 4321 aesmc q0, q0 @ AES block 0 - round 8 4322 4323 aese q1, v28.16b 4324 aesmc q1, q1 @ AES block 1 - round 10 4325 4326 aese q3, v27.16b 4327 aesmc q3, q3 @ AES block 3 - round 9 4328 4329 aese q0, v27.16b 4330 aesmc q0, q0 @ AES block 0 - round 9 4331 4332 aese q2, v28.16b 4333 aesmc q2, q2 @ AES block 2 - round 10 4334 4335 aese q3, v28.16b 4336 aesmc q3, q3 @ AES block 3 - round 10 4337 4338 aese q1, v29.16b 4339 aesmc q1, q1 @ AES block 1 - round 11 4340 4341 aese q2, v29.16b 4342 aesmc q2, q2 @ AES block 2 - round 11 4343 4344 aese q0, v28.16b 4345 aesmc q0, q0 @ AES block 0 - round 
10 4346 4347 aese q1, v30.16b 4348 aesmc q1, q1 @ AES block 1 - round 12 4349 4350 aese q2, v30.16b 4351 aesmc q2, q2 @ AES block 2 - round 12 4352 4353 aese q0, v29.16b 4354 aesmc q0, q0 @ AES block 0 - round 11 4355 eor v17.16b, v17.16b, q9 @ h4k | h3k 4356 4357 aese q3, v29.16b 4358 aesmc q3, q3 @ AES block 3 - round 11 4359 4360 aese q2, v31.16b @ AES block 2 - round 13 4361 trn1 q8, v12.2d, v13.2d @ h2h | h1h 4362 4363 aese q0, v30.16b 4364 aesmc q0, q0 @ AES block 0 - round 12 4365 4366 aese q3, v30.16b 4367 aesmc q3, q3 @ AES block 3 - round 12 4368 4369 aese q1, v31.16b @ AES block 1 - round 13 4370 4371 aese q0, v31.16b @ AES block 0 - round 13 4372 4373 aese q3, v31.16b @ AES block 3 - round 13 4374 eor v16.16b, v16.16b, q8 @ h2k | h1k 4375 bge .L256_enc_tail @ handle tail 4376 4377 ldp r19, r20, [r0, #16] @ AES block 1 - load plaintext 4378#ifdef __ARMEB__ 4379 rev r19, r19 4380 rev r20, r20 4381#endif 4382 rev r9, r12 @ CTR block 4 4383 ldp r6, r7, [r0, #0] @ AES block 0 - load plaintext 4384#ifdef __ARMEB__ 4385 rev r6, r6 4386 rev r7, r7 4387#endif 4388 ldp r23, r24, [r0, #48] @ AES block 3 - load plaintext 4389#ifdef __ARMEB__ 4390 rev r23, r23 4391 rev r24, r24 4392#endif 4393 ldp r21, r22, [r0, #32] @ AES block 2 - load plaintext 4394#ifdef __ARMEB__ 4395 rev r21, r21 4396 rev r22, r22 4397#endif 4398 add r0, r0, #64 @ AES input_ptr update 4399 4400 eor r19, r19, r13 @ AES block 1 - round 14 low 4401 eor r20, r20, r14 @ AES block 1 - round 14 high 4402 4403 fmov d5, r19 @ AES block 1 - mov low 4404 eor r6, r6, r13 @ AES block 0 - round 14 low 4405 4406 eor r7, r7, r14 @ AES block 0 - round 14 high 4407 eor r24, r24, r14 @ AES block 3 - round 14 high 4408 fmov d4, r6 @ AES block 0 - mov low 4409 4410 cmp r0, r5 @ check if we have <= 8 blocks 4411 fmov v4.d[1], r7 @ AES block 0 - mov high 4412 eor r23, r23, r13 @ AES block 3 - round 14 low 4413 4414 eor r21, r21, r13 @ AES block 2 - round 14 low 4415 fmov v5.d[1], r20 @ AES block 1 - mov high 4416 
4417 fmov d6, r21 @ AES block 2 - mov low 4418 add r12, r12, #1 @ CTR block 4 4419 4420 orr r9, r11, r9, lsl #32 @ CTR block 4 4421 fmov d7, r23 @ AES block 3 - mov low 4422 eor r22, r22, r14 @ AES block 2 - round 14 high 4423 4424 fmov v6.d[1], r22 @ AES block 2 - mov high 4425 4426 eor q4, q4, q0 @ AES block 0 - result 4427 fmov d0, r10 @ CTR block 4 4428 4429 fmov v0.d[1], r9 @ CTR block 4 4430 rev r9, r12 @ CTR block 5 4431 add r12, r12, #1 @ CTR block 5 4432 4433 eor q5, q5, q1 @ AES block 1 - result 4434 fmov d1, r10 @ CTR block 5 4435 orr r9, r11, r9, lsl #32 @ CTR block 5 4436 4437 fmov v1.d[1], r9 @ CTR block 5 4438 rev r9, r12 @ CTR block 6 4439 st1 { q4}, [r2], #16 @ AES block 0 - store result 4440 4441 fmov v7.d[1], r24 @ AES block 3 - mov high 4442 orr r9, r11, r9, lsl #32 @ CTR block 6 4443 eor q6, q6, q2 @ AES block 2 - result 4444 4445 st1 { q5}, [r2], #16 @ AES block 1 - store result 4446 4447 add r12, r12, #1 @ CTR block 6 4448 fmov d2, r10 @ CTR block 6 4449 4450 fmov v2.d[1], r9 @ CTR block 6 4451 st1 { q6}, [r2], #16 @ AES block 2 - store result 4452 rev r9, r12 @ CTR block 7 4453 4454 orr r9, r11, r9, lsl #32 @ CTR block 7 4455 4456 eor q7, q7, q3 @ AES block 3 - result 4457 st1 { q7}, [r2], #16 @ AES block 3 - store result 4458 bge .L256_enc_prepretail @ do prepretail 4459 4460.L256_enc_main_loop:@ main loop start 4461 aese q0, v18.16b 4462 aesmc q0, q0 @ AES block 4k+4 - round 0 4463 rev64 q4, q4 @ GHASH block 4k (only t0 is free) 4464 4465 aese q1, v18.16b 4466 aesmc q1, q1 @ AES block 4k+5 - round 0 4467 fmov d3, r10 @ CTR block 4k+3 4468 4469 aese q2, v18.16b 4470 aesmc q2, q2 @ AES block 4k+6 - round 0 4471 ext v11.16b, v11.16b, v11.16b, #8 @ PRE 0 4472 4473 aese q0, v19.16b 4474 aesmc q0, q0 @ AES block 4k+4 - round 1 4475 fmov v3.d[1], r9 @ CTR block 4k+3 4476 4477 aese q1, v19.16b 4478 aesmc q1, q1 @ AES block 4k+5 - round 1 4479 ldp r23, r24, [r0, #48] @ AES block 4k+7 - load plaintext 4480#ifdef __ARMEB__ 4481 rev r23, r23 4482 rev 
r24, r24 4483#endif 4484 aese q2, v19.16b 4485 aesmc q2, q2 @ AES block 4k+6 - round 1 4486 ldp r21, r22, [r0, #32] @ AES block 4k+6 - load plaintext 4487#ifdef __ARMEB__ 4488 rev r21, r21 4489 rev r22, r22 4490#endif 4491 aese q0, v20.16b 4492 aesmc q0, q0 @ AES block 4k+4 - round 2 4493 eor q4, q4, v11.16b @ PRE 1 4494 4495 aese q1, v20.16b 4496 aesmc q1, q1 @ AES block 4k+5 - round 2 4497 4498 aese q3, v18.16b 4499 aesmc q3, q3 @ AES block 4k+7 - round 0 4500 eor r23, r23, r13 @ AES block 4k+7 - round 14 low 4501 4502 aese q0, v21.16b 4503 aesmc q0, q0 @ AES block 4k+4 - round 3 4504 mov d10, v17.d[1] @ GHASH block 4k - mid 4505 4506 pmull2 v9.1q, q4, v15.2d @ GHASH block 4k - high 4507 eor r22, r22, r14 @ AES block 4k+6 - round 14 high 4508 mov d8, v4.d[1] @ GHASH block 4k - mid 4509 4510 aese q3, v19.16b 4511 aesmc q3, q3 @ AES block 4k+7 - round 1 4512 rev64 q5, q5 @ GHASH block 4k+1 (t0 and t1 free) 4513 4514 aese q0, v22.16b 4515 aesmc q0, q0 @ AES block 4k+4 - round 4 4516 4517 pmull v11.1q, q4, v15.1d @ GHASH block 4k - low 4518 eor q8, q8, q4 @ GHASH block 4k - mid 4519 4520 aese q2, v20.16b 4521 aesmc q2, q2 @ AES block 4k+6 - round 2 4522 4523 aese q0, v23.16b 4524 aesmc q0, q0 @ AES block 4k+4 - round 5 4525 rev64 q7, q7 @ GHASH block 4k+3 (t0, t1, t2 and t3 free) 4526 4527 pmull2 v4.1q, q5, v14.2d @ GHASH block 4k+1 - high 4528 4529 pmull v10.1q, q8, v10.1d @ GHASH block 4k - mid 4530 rev64 q6, q6 @ GHASH block 4k+2 (t0, t1, and t2 free) 4531 4532 pmull v8.1q, q5, v14.1d @ GHASH block 4k+1 - low 4533 4534 eor q9, q9, q4 @ GHASH block 4k+1 - high 4535 mov d4, v5.d[1] @ GHASH block 4k+1 - mid 4536 4537 aese q1, v21.16b 4538 aesmc q1, q1 @ AES block 4k+5 - round 3 4539 4540 aese q3, v20.16b 4541 aesmc q3, q3 @ AES block 4k+7 - round 2 4542 eor v11.16b, v11.16b, q8 @ GHASH block 4k+1 - low 4543 4544 aese q2, v21.16b 4545 aesmc q2, q2 @ AES block 4k+6 - round 3 4546 4547 aese q1, v22.16b 4548 aesmc q1, q1 @ AES block 4k+5 - round 4 4549 mov d8, v6.d[1] @ 
GHASH block 4k+2 - mid 4550 4551 aese q3, v21.16b 4552 aesmc q3, q3 @ AES block 4k+7 - round 3 4553 eor q4, q4, q5 @ GHASH block 4k+1 - mid 4554 4555 aese q2, v22.16b 4556 aesmc q2, q2 @ AES block 4k+6 - round 4 4557 4558 aese q0, v24.16b 4559 aesmc q0, q0 @ AES block 4k+4 - round 6 4560 eor q8, q8, q6 @ GHASH block 4k+2 - mid 4561 4562 aese q3, v22.16b 4563 aesmc q3, q3 @ AES block 4k+7 - round 4 4564 4565 pmull v4.1q, q4, v17.1d @ GHASH block 4k+1 - mid 4566 4567 aese q0, v25.16b 4568 aesmc q0, q0 @ AES block 4k+4 - round 7 4569 4570 aese q3, v23.16b 4571 aesmc q3, q3 @ AES block 4k+7 - round 5 4572 ins v8.d[1], v8.d[0] @ GHASH block 4k+2 - mid 4573 4574 aese q1, v23.16b 4575 aesmc q1, q1 @ AES block 4k+5 - round 5 4576 4577 aese q0, v26.16b 4578 aesmc q0, q0 @ AES block 4k+4 - round 8 4579 4580 aese q2, v23.16b 4581 aesmc q2, q2 @ AES block 4k+6 - round 5 4582 4583 aese q1, v24.16b 4584 aesmc q1, q1 @ AES block 4k+5 - round 6 4585 eor v10.16b, v10.16b, q4 @ GHASH block 4k+1 - mid 4586 4587 pmull2 v4.1q, q6, v13.2d @ GHASH block 4k+2 - high 4588 4589 pmull v5.1q, q6, v13.1d @ GHASH block 4k+2 - low 4590 4591 aese q1, v25.16b 4592 aesmc q1, q1 @ AES block 4k+5 - round 7 4593 4594 pmull v6.1q, q7, v12.1d @ GHASH block 4k+3 - low 4595 eor q9, q9, q4 @ GHASH block 4k+2 - high 4596 4597 aese q3, v24.16b 4598 aesmc q3, q3 @ AES block 4k+7 - round 6 4599 ldp r19, r20, [r0, #16] @ AES block 4k+5 - load plaintext 4600#ifdef __ARMEB__ 4601 rev r19, r19 4602 rev r20, r20 4603#endif 4604 aese q1, v26.16b 4605 aesmc q1, q1 @ AES block 4k+5 - round 8 4606 mov d4, v7.d[1] @ GHASH block 4k+3 - mid 4607 4608 aese q2, v24.16b 4609 aesmc q2, q2 @ AES block 4k+6 - round 6 4610 eor v11.16b, v11.16b, q5 @ GHASH block 4k+2 - low 4611 4612 pmull2 v8.1q, q8, v16.2d @ GHASH block 4k+2 - mid 4613 4614 pmull2 v5.1q, q7, v12.2d @ GHASH block 4k+3 - high 4615 eor q4, q4, q7 @ GHASH block 4k+3 - mid 4616 4617 aese q2, v25.16b 4618 aesmc q2, q2 @ AES block 4k+6 - round 7 4619 eor r19, r19, r13 
@ AES block 4k+5 - round 14 low 4620 4621 aese q1, v27.16b 4622 aesmc q1, q1 @ AES block 4k+5 - round 9 4623 eor v10.16b, v10.16b, q8 @ GHASH block 4k+2 - mid 4624 4625 aese q3, v25.16b 4626 aesmc q3, q3 @ AES block 4k+7 - round 7 4627 eor r21, r21, r13 @ AES block 4k+6 - round 14 low 4628 4629 aese q0, v27.16b 4630 aesmc q0, q0 @ AES block 4k+4 - round 9 4631 movi q8, #0xc2 4632 4633 pmull v4.1q, q4, v16.1d @ GHASH block 4k+3 - mid 4634 eor q9, q9, q5 @ GHASH block 4k+3 - high 4635 fmov d5, r19 @ AES block 4k+5 - mov low 4636 4637 aese q2, v26.16b 4638 aesmc q2, q2 @ AES block 4k+6 - round 8 4639 ldp r6, r7, [r0, #0] @ AES block 4k+4 - load plaintext 4640#ifdef __ARMEB__ 4641 rev r6, r6 4642 rev r7, r7 4643#endif 4644 aese q0, v28.16b 4645 aesmc q0, q0 @ AES block 4k+4 - round 10 4646 shl d8, d8, #56 @ mod_constant 4647 4648 aese q3, v26.16b 4649 aesmc q3, q3 @ AES block 4k+7 - round 8 4650 eor v11.16b, v11.16b, q6 @ GHASH block 4k+3 - low 4651 4652 aese q2, v27.16b 4653 aesmc q2, q2 @ AES block 4k+6 - round 9 4654 4655 aese q1, v28.16b 4656 aesmc q1, q1 @ AES block 4k+5 - round 10 4657 eor v10.16b, v10.16b, q4 @ GHASH block 4k+3 - mid 4658 4659 aese q3, v27.16b 4660 aesmc q3, q3 @ AES block 4k+7 - round 9 4661 add r12, r12, #1 @ CTR block 4k+3 4662 4663 aese q0, v29.16b 4664 aesmc q0, q0 @ AES block 4k+4 - round 11 4665 eor q4, v11.16b, q9 @ MODULO - karatsuba tidy up 4666 4667 aese q1, v29.16b 4668 aesmc q1, q1 @ AES block 4k+5 - round 11 4669 add r0, r0, #64 @ AES input_ptr update 4670 4671 pmull v7.1q, q9, q8 @ MODULO - top 64b align with mid 4672 rev r9, r12 @ CTR block 4k+8 4673 ext q9, q9, q9, #8 @ MODULO - other top alignment 4674 4675 aese q2, v28.16b 4676 aesmc q2, q2 @ AES block 4k+6 - round 10 4677 eor r6, r6, r13 @ AES block 4k+4 - round 14 low 4678 4679 aese q1, v30.16b 4680 aesmc q1, q1 @ AES block 4k+5 - round 12 4681 eor v10.16b, v10.16b, q4 @ MODULO - karatsuba tidy up 4682 4683 aese q3, v28.16b 4684 aesmc q3, q3 @ AES block 4k+7 - round 10 4685 
eor r7, r7, r14 @ AES block 4k+4 - round 14 high 4686 4687 fmov d4, r6 @ AES block 4k+4 - mov low 4688 orr r9, r11, r9, lsl #32 @ CTR block 4k+8 4689 eor q7, q9, q7 @ MODULO - fold into mid 4690 4691 aese q0, v30.16b 4692 aesmc q0, q0 @ AES block 4k+4 - round 12 4693 eor r20, r20, r14 @ AES block 4k+5 - round 14 high 4694 4695 aese q2, v29.16b 4696 aesmc q2, q2 @ AES block 4k+6 - round 11 4697 eor r24, r24, r14 @ AES block 4k+7 - round 14 high 4698 4699 aese q3, v29.16b 4700 aesmc q3, q3 @ AES block 4k+7 - round 11 4701 add r12, r12, #1 @ CTR block 4k+8 4702 4703 aese q0, v31.16b @ AES block 4k+4 - round 13 4704 fmov v4.d[1], r7 @ AES block 4k+4 - mov high 4705 eor v10.16b, v10.16b, q7 @ MODULO - fold into mid 4706 4707 aese q2, v30.16b 4708 aesmc q2, q2 @ AES block 4k+6 - round 12 4709 fmov d7, r23 @ AES block 4k+7 - mov low 4710 4711 aese q1, v31.16b @ AES block 4k+5 - round 13 4712 fmov v5.d[1], r20 @ AES block 4k+5 - mov high 4713 4714 fmov d6, r21 @ AES block 4k+6 - mov low 4715 cmp r0, r5 @ .LOOP CONTROL 4716 4717 fmov v6.d[1], r22 @ AES block 4k+6 - mov high 4718 4719 pmull v9.1q, v10.1d, q8 @ MODULO - mid 64b align with low 4720 eor q4, q4, q0 @ AES block 4k+4 - result 4721 fmov d0, r10 @ CTR block 4k+8 4722 4723 fmov v0.d[1], r9 @ CTR block 4k+8 4724 rev r9, r12 @ CTR block 4k+9 4725 add r12, r12, #1 @ CTR block 4k+9 4726 4727 eor q5, q5, q1 @ AES block 4k+5 - result 4728 fmov d1, r10 @ CTR block 4k+9 4729 orr r9, r11, r9, lsl #32 @ CTR block 4k+9 4730 4731 aese q3, v30.16b 4732 aesmc q3, q3 @ AES block 4k+7 - round 12 4733 fmov v1.d[1], r9 @ CTR block 4k+9 4734 4735 aese q2, v31.16b @ AES block 4k+6 - round 13 4736 rev r9, r12 @ CTR block 4k+10 4737 st1 { q4}, [r2], #16 @ AES block 4k+4 - store result 4738 4739 orr r9, r11, r9, lsl #32 @ CTR block 4k+10 4740 eor v11.16b, v11.16b, q9 @ MODULO - fold into low 4741 fmov v7.d[1], r24 @ AES block 4k+7 - mov high 4742 4743 ext v10.16b, v10.16b, v10.16b, #8 @ MODULO - other mid alignment 4744 st1 { q5}, [r2], 
#16 @ AES block 4k+5 - store result 4745 add r12, r12, #1 @ CTR block 4k+10 4746 4747 aese q3, v31.16b @ AES block 4k+7 - round 13 4748 eor q6, q6, q2 @ AES block 4k+6 - result 4749 fmov d2, r10 @ CTR block 4k+10 4750 4751 st1 { q6}, [r2], #16 @ AES block 4k+6 - store result 4752 fmov v2.d[1], r9 @ CTR block 4k+10 4753 rev r9, r12 @ CTR block 4k+11 4754 4755 eor v11.16b, v11.16b, v10.16b @ MODULO - fold into low 4756 orr r9, r11, r9, lsl #32 @ CTR block 4k+11 4757 4758 eor q7, q7, q3 @ AES block 4k+7 - result 4759 st1 { q7}, [r2], #16 @ AES block 4k+7 - store result 4760 blt .L256_enc_main_loop 4761 4762.L256_enc_prepretail:@ PREPRETAIL 4763 aese q1, v18.16b 4764 aesmc q1, q1 @ AES block 4k+5 - round 0 4765 rev64 q6, q6 @ GHASH block 4k+2 (t0, t1, and t2 free) 4766 4767 aese q2, v18.16b 4768 aesmc q2, q2 @ AES block 4k+6 - round 0 4769 fmov d3, r10 @ CTR block 4k+3 4770 4771 aese q0, v18.16b 4772 aesmc q0, q0 @ AES block 4k+4 - round 0 4773 rev64 q4, q4 @ GHASH block 4k (only t0 is free) 4774 4775 fmov v3.d[1], r9 @ CTR block 4k+3 4776 ext v11.16b, v11.16b, v11.16b, #8 @ PRE 0 4777 4778 aese q2, v19.16b 4779 aesmc q2, q2 @ AES block 4k+6 - round 1 4780 4781 aese q0, v19.16b 4782 aesmc q0, q0 @ AES block 4k+4 - round 1 4783 4784 eor q4, q4, v11.16b @ PRE 1 4785 rev64 q5, q5 @ GHASH block 4k+1 (t0 and t1 free) 4786 4787 aese q2, v20.16b 4788 aesmc q2, q2 @ AES block 4k+6 - round 2 4789 4790 aese q3, v18.16b 4791 aesmc q3, q3 @ AES block 4k+7 - round 0 4792 mov d10, v17.d[1] @ GHASH block 4k - mid 4793 4794 aese q1, v19.16b 4795 aesmc q1, q1 @ AES block 4k+5 - round 1 4796 4797 pmull v11.1q, q4, v15.1d @ GHASH block 4k - low 4798 mov d8, v4.d[1] @ GHASH block 4k - mid 4799 4800 pmull2 v9.1q, q4, v15.2d @ GHASH block 4k - high 4801 4802 aese q2, v21.16b 4803 aesmc q2, q2 @ AES block 4k+6 - round 3 4804 4805 aese q1, v20.16b 4806 aesmc q1, q1 @ AES block 4k+5 - round 2 4807 eor q8, q8, q4 @ GHASH block 4k - mid 4808 4809 aese q0, v20.16b 4810 aesmc q0, q0 @ AES block 
4k+4 - round 2 4811 4812 aese q3, v19.16b 4813 aesmc q3, q3 @ AES block 4k+7 - round 1 4814 4815 aese q1, v21.16b 4816 aesmc q1, q1 @ AES block 4k+5 - round 3 4817 4818 pmull v10.1q, q8, v10.1d @ GHASH block 4k - mid 4819 4820 pmull2 v4.1q, q5, v14.2d @ GHASH block 4k+1 - high 4821 4822 pmull v8.1q, q5, v14.1d @ GHASH block 4k+1 - low 4823 4824 aese q3, v20.16b 4825 aesmc q3, q3 @ AES block 4k+7 - round 2 4826 4827 eor q9, q9, q4 @ GHASH block 4k+1 - high 4828 mov d4, v5.d[1] @ GHASH block 4k+1 - mid 4829 4830 aese q0, v21.16b 4831 aesmc q0, q0 @ AES block 4k+4 - round 3 4832 eor v11.16b, v11.16b, q8 @ GHASH block 4k+1 - low 4833 4834 aese q3, v21.16b 4835 aesmc q3, q3 @ AES block 4k+7 - round 3 4836 4837 eor q4, q4, q5 @ GHASH block 4k+1 - mid 4838 mov d8, v6.d[1] @ GHASH block 4k+2 - mid 4839 4840 aese q0, v22.16b 4841 aesmc q0, q0 @ AES block 4k+4 - round 4 4842 rev64 q7, q7 @ GHASH block 4k+3 (t0, t1, t2 and t3 free) 4843 4844 aese q3, v22.16b 4845 aesmc q3, q3 @ AES block 4k+7 - round 4 4846 4847 pmull v4.1q, q4, v17.1d @ GHASH block 4k+1 - mid 4848 eor q8, q8, q6 @ GHASH block 4k+2 - mid 4849 add r12, r12, #1 @ CTR block 4k+3 4850 4851 pmull v5.1q, q6, v13.1d @ GHASH block 4k+2 - low 4852 4853 aese q3, v23.16b 4854 aesmc q3, q3 @ AES block 4k+7 - round 5 4855 4856 aese q2, v22.16b 4857 aesmc q2, q2 @ AES block 4k+6 - round 4 4858 eor v10.16b, v10.16b, q4 @ GHASH block 4k+1 - mid 4859 4860 pmull2 v4.1q, q6, v13.2d @ GHASH block 4k+2 - high 4861 4862 eor v11.16b, v11.16b, q5 @ GHASH block 4k+2 - low 4863 ins v8.d[1], v8.d[0] @ GHASH block 4k+2 - mid 4864 4865 aese q2, v23.16b 4866 aesmc q2, q2 @ AES block 4k+6 - round 5 4867 4868 eor q9, q9, q4 @ GHASH block 4k+2 - high 4869 mov d4, v7.d[1] @ GHASH block 4k+3 - mid 4870 4871 aese q1, v22.16b 4872 aesmc q1, q1 @ AES block 4k+5 - round 4 4873 4874 pmull2 v8.1q, q8, v16.2d @ GHASH block 4k+2 - mid 4875 4876 eor q4, q4, q7 @ GHASH block 4k+3 - mid 4877 4878 pmull2 v5.1q, q7, v12.2d @ GHASH block 4k+3 - high 4879 
4880 aese q1, v23.16b 4881 aesmc q1, q1 @ AES block 4k+5 - round 5 4882 4883 pmull v4.1q, q4, v16.1d @ GHASH block 4k+3 - mid 4884 eor v10.16b, v10.16b, q8 @ GHASH block 4k+2 - mid 4885 4886 aese q0, v23.16b 4887 aesmc q0, q0 @ AES block 4k+4 - round 5 4888 4889 aese q1, v24.16b 4890 aesmc q1, q1 @ AES block 4k+5 - round 6 4891 4892 aese q2, v24.16b 4893 aesmc q2, q2 @ AES block 4k+6 - round 6 4894 4895 aese q0, v24.16b 4896 aesmc q0, q0 @ AES block 4k+4 - round 6 4897 movi q8, #0xc2 4898 4899 aese q3, v24.16b 4900 aesmc q3, q3 @ AES block 4k+7 - round 6 4901 4902 aese q1, v25.16b 4903 aesmc q1, q1 @ AES block 4k+5 - round 7 4904 eor q9, q9, q5 @ GHASH block 4k+3 - high 4905 4906 aese q0, v25.16b 4907 aesmc q0, q0 @ AES block 4k+4 - round 7 4908 4909 aese q3, v25.16b 4910 aesmc q3, q3 @ AES block 4k+7 - round 7 4911 shl d8, d8, #56 @ mod_constant 4912 4913 aese q1, v26.16b 4914 aesmc q1, q1 @ AES block 4k+5 - round 8 4915 eor v10.16b, v10.16b, q4 @ GHASH block 4k+3 - mid 4916 4917 pmull v6.1q, q7, v12.1d @ GHASH block 4k+3 - low 4918 4919 aese q3, v26.16b 4920 aesmc q3, q3 @ AES block 4k+7 - round 8 4921 4922 aese q1, v27.16b 4923 aesmc q1, q1 @ AES block 4k+5 - round 9 4924 4925 aese q0, v26.16b 4926 aesmc q0, q0 @ AES block 4k+4 - round 8 4927 eor v11.16b, v11.16b, q6 @ GHASH block 4k+3 - low 4928 4929 aese q3, v27.16b 4930 aesmc q3, q3 @ AES block 4k+7 - round 9 4931 4932 eor v10.16b, v10.16b, q9 @ karatsuba tidy up 4933 4934 pmull v4.1q, q9, q8 4935 ext q9, q9, q9, #8 4936 4937 aese q3, v28.16b 4938 aesmc q3, q3 @ AES block 4k+7 - round 10 4939 4940 aese q2, v25.16b 4941 aesmc q2, q2 @ AES block 4k+6 - round 7 4942 eor v10.16b, v10.16b, v11.16b 4943 4944 aese q1, v28.16b 4945 aesmc q1, q1 @ AES block 4k+5 - round 10 4946 4947 aese q0, v27.16b 4948 aesmc q0, q0 @ AES block 4k+4 - round 9 4949 4950 aese q2, v26.16b 4951 aesmc q2, q2 @ AES block 4k+6 - round 8 4952 4953 aese q1, v29.16b 4954 aesmc q1, q1 @ AES block 4k+5 - round 11 4955 eor v10.16b, v10.16b, q4 
4956 4957 aese q0, v28.16b 4958 aesmc q0, q0 @ AES block 4k+4 - round 10 4959 4960 aese q2, v27.16b 4961 aesmc q2, q2 @ AES block 4k+6 - round 9 4962 4963 aese q1, v30.16b 4964 aesmc q1, q1 @ AES block 4k+5 - round 12 4965 4966 aese q0, v29.16b 4967 aesmc q0, q0 @ AES block 4k+4 - round 11 4968 eor v10.16b, v10.16b, q9 4969 4970 aese q3, v29.16b 4971 aesmc q3, q3 @ AES block 4k+7 - round 11 4972 4973 aese q2, v28.16b 4974 aesmc q2, q2 @ AES block 4k+6 - round 10 4975 4976 aese q0, v30.16b 4977 aesmc q0, q0 @ AES block 4k+4 - round 12 4978 4979 pmull v4.1q, v10.1d, q8 4980 4981 aese q2, v29.16b 4982 aesmc q2, q2 @ AES block 4k+6 - round 11 4983 ext v10.16b, v10.16b, v10.16b, #8 4984 4985 aese q3, v30.16b 4986 aesmc q3, q3 @ AES block 4k+7 - round 12 4987 4988 aese q1, v31.16b @ AES block 4k+5 - round 13 4989 eor v11.16b, v11.16b, q4 4990 4991 aese q2, v30.16b 4992 aesmc q2, q2 @ AES block 4k+6 - round 12 4993 4994 aese q3, v31.16b @ AES block 4k+7 - round 13 4995 4996 aese q0, v31.16b @ AES block 4k+4 - round 13 4997 4998 aese q2, v31.16b @ AES block 4k+6 - round 13 4999 eor v11.16b, v11.16b, v10.16b 5000.L256_enc_tail:@ TAIL 5001 5002 ext q8, v11.16b, v11.16b, #8 @ prepare final partial tag 5003 sub r5, r4, r0 @ main_end_input_ptr is number of bytes left to process 5004 ldp r6, r7, [r0], #16 @ AES block 4k+4 - load plaintext 5005#ifdef __ARMEB__ 5006 rev r6, r6 5007 rev r7, r7 5008#endif 5009 eor r6, r6, r13 @ AES block 4k+4 - round 14 low 5010 eor r7, r7, r14 @ AES block 4k+4 - round 14 high 5011 5012 cmp r5, #48 5013 fmov d4, r6 @ AES block 4k+4 - mov low 5014 5015 fmov v4.d[1], r7 @ AES block 4k+4 - mov high 5016 5017 eor q5, q4, q0 @ AES block 4k+4 - result 5018 bgt .L256_enc_blocks_more_than_3 5019 5020 cmp r5, #32 5021 mov q3, q2 5022 movi v11.8b, #0 5023 5024 movi q9, #0 5025 sub r12, r12, #1 5026 5027 mov q2, q1 5028 movi v10.8b, #0 5029 bgt .L256_enc_blocks_more_than_2 5030 5031 mov q3, q1 5032 sub r12, r12, #1 5033 cmp r5, #16 5034 5035 bgt 
.L256_enc_blocks_more_than_1 5036 5037 sub r12, r12, #1 5038 b .L256_enc_blocks_less_than_1 5039.L256_enc_blocks_more_than_3:@ blocks left > 3 5040 st1 { q5}, [r2], #16 @ AES final-3 block - store result 5041 5042 ldp r6, r7, [r0], #16 @ AES final-2 block - load input low & high 5043#ifdef __ARMEB__ 5044 rev r6, r6 5045 rev r7, r7 5046#endif 5047 rev64 q4, q5 @ GHASH final-3 block 5048 5049 eor r6, r6, r13 @ AES final-2 block - round 14 low 5050 eor q4, q4, q8 @ feed in partial tag 5051 5052 eor r7, r7, r14 @ AES final-2 block - round 14 high 5053 5054 mov d22, v4.d[1] @ GHASH final-3 block - mid 5055 fmov d5, r6 @ AES final-2 block - mov low 5056 5057 fmov v5.d[1], r7 @ AES final-2 block - mov high 5058 5059 eor v22.8b, v22.8b, q4 @ GHASH final-3 block - mid 5060 movi q8, #0 @ suppress further partial tag feed in 5061 5062 mov d10, v17.d[1] @ GHASH final-3 block - mid 5063 5064 pmull v11.1q, q4, v15.1d @ GHASH final-3 block - low 5065 5066 pmull2 v9.1q, q4, v15.2d @ GHASH final-3 block - high 5067 5068 pmull v10.1q, v22.1d, v10.1d @ GHASH final-3 block - mid 5069 eor q5, q5, q1 @ AES final-2 block - result 5070.L256_enc_blocks_more_than_2:@ blocks left > 2 5071 5072 st1 { q5}, [r2], #16 @ AES final-2 block - store result 5073 5074 ldp r6, r7, [r0], #16 @ AES final-1 block - load input low & high 5075#ifdef __ARMEB__ 5076 rev r6, r6 5077 rev r7, r7 5078#endif 5079 rev64 q4, q5 @ GHASH final-2 block 5080 5081 eor r6, r6, r13 @ AES final-1 block - round 14 low 5082 eor q4, q4, q8 @ feed in partial tag 5083 5084 fmov d5, r6 @ AES final-1 block - mov low 5085 eor r7, r7, r14 @ AES final-1 block - round 14 high 5086 5087 fmov v5.d[1], r7 @ AES final-1 block - mov high 5088 5089 movi q8, #0 @ suppress further partial tag feed in 5090 5091 pmull2 v20.1q, q4, v14.2d @ GHASH final-2 block - high 5092 mov d22, v4.d[1] @ GHASH final-2 block - mid 5093 5094 pmull v21.1q, q4, v14.1d @ GHASH final-2 block - low 5095 5096 eor v22.8b, v22.8b, q4 @ GHASH final-2 block - mid 5097 
5098 eor q5, q5, q2 @ AES final-1 block - result 5099 5100 eor q9, q9, v20.16b @ GHASH final-2 block - high 5101 5102 pmull v22.1q, v22.1d, v17.1d @ GHASH final-2 block - mid 5103 5104 eor v11.16b, v11.16b, v21.16b @ GHASH final-2 block - low 5105 5106 eor v10.16b, v10.16b, v22.16b @ GHASH final-2 block - mid 5107.L256_enc_blocks_more_than_1:@ blocks left > 1 5108 5109 st1 { q5}, [r2], #16 @ AES final-1 block - store result 5110 5111 rev64 q4, q5 @ GHASH final-1 block 5112 5113 ldp r6, r7, [r0], #16 @ AES final block - load input low & high 5114#ifdef __ARMEB__ 5115 rev r6, r6 5116 rev r7, r7 5117#endif 5118 eor q4, q4, q8 @ feed in partial tag 5119 5120 movi q8, #0 @ suppress further partial tag feed in 5121 5122 eor r6, r6, r13 @ AES final block - round 14 low 5123 mov d22, v4.d[1] @ GHASH final-1 block - mid 5124 5125 pmull2 v20.1q, q4, v13.2d @ GHASH final-1 block - high 5126 eor r7, r7, r14 @ AES final block - round 14 high 5127 5128 eor v22.8b, v22.8b, q4 @ GHASH final-1 block - mid 5129 5130 eor q9, q9, v20.16b @ GHASH final-1 block - high 5131 5132 ins v22.d[1], v22.d[0] @ GHASH final-1 block - mid 5133 fmov d5, r6 @ AES final block - mov low 5134 5135 fmov v5.d[1], r7 @ AES final block - mov high 5136 5137 pmull2 v22.1q, v22.2d, v16.2d @ GHASH final-1 block - mid 5138 5139 pmull v21.1q, q4, v13.1d @ GHASH final-1 block - low 5140 5141 eor q5, q5, q3 @ AES final block - result 5142 eor v10.16b, v10.16b, v22.16b @ GHASH final-1 block - mid 5143 5144 eor v11.16b, v11.16b, v21.16b @ GHASH final-1 block - low 5145.L256_enc_blocks_less_than_1:@ blocks left <= 1 5146 5147 and r1, r1, #127 @ bit_length %= 128 5148 5149 mvn r13, xzr @ rk14_l = 0xffffffffffffffff 5150 sub r1, r1, #128 @ bit_length -= 128 5151 5152 neg r1, r1 @ bit_length = 128 - #bits in input (in range [1,128]) 5153 ld1 { v18.16b}, [r2] @ load existing bytes where the possibly partial last block is to be stored 5154 5155 mvn r14, xzr @ rk14_h = 0xffffffffffffffff 5156 and r1, r1, #127 @ bit_length 
%= 128 5157 5158 lsr r14, r14, r1 @ rk14_h is mask for top 64b of last block 5159 cmp r1, #64 5160 5161 csel r6, r13, r14, lt 5162 csel r7, r14, xzr, lt 5163 5164 fmov d0, r6 @ ctr0b is mask for last block 5165 5166 fmov v0.d[1], r7 5167 5168 and q5, q5, q0 @ possibly partial last block has zeroes in highest bits 5169 5170 rev64 q4, q5 @ GHASH final block 5171 5172 eor q4, q4, q8 @ feed in partial tag 5173 5174 bif q5, v18.16b, q0 @ insert existing bytes in top end of result before storing 5175 5176 pmull2 v20.1q, q4, v12.2d @ GHASH final block - high 5177 mov d8, v4.d[1] @ GHASH final block - mid 5178#ifndef __ARMEB__ 5179 rev r9, r12 5180#else 5181 mov r9, r12 5182#endif 5183 5184 pmull v21.1q, q4, v12.1d @ GHASH final block - low 5185 5186 eor q9, q9, v20.16b @ GHASH final block - high 5187 eor q8, q8, q4 @ GHASH final block - mid 5188 5189 pmull v8.1q, q8, v16.1d @ GHASH final block - mid 5190 5191 eor v11.16b, v11.16b, v21.16b @ GHASH final block - low 5192 5193 eor v10.16b, v10.16b, q8 @ GHASH final block - mid 5194 movi q8, #0xc2 5195 5196 eor q4, v11.16b, q9 @ MODULO - karatsuba tidy up 5197 5198 shl d8, d8, #56 @ mod_constant 5199 5200 eor v10.16b, v10.16b, q4 @ MODULO - karatsuba tidy up 5201 5202 pmull v7.1q, q9, q8 @ MODULO - top 64b align with mid 5203 5204 ext q9, q9, q9, #8 @ MODULO - other top alignment 5205 5206 eor v10.16b, v10.16b, q7 @ MODULO - fold into mid 5207 5208 eor v10.16b, v10.16b, q9 @ MODULO - fold into mid 5209 5210 pmull v9.1q, v10.1d, q8 @ MODULO - mid 64b align with low 5211 5212 ext v10.16b, v10.16b, v10.16b, #8 @ MODULO - other mid alignment 5213 5214 str r9, [r16, #12] @ store the updated counter 5215 5216 st1 { q5}, [r2] @ store all 16B 5217 eor v11.16b, v11.16b, q9 @ MODULO - fold into low 5218 5219 eor v11.16b, v11.16b, v10.16b @ MODULO - fold into low 5220 ext v11.16b, v11.16b, v11.16b, #8 5221 rev64 v11.16b, v11.16b 5222 mov r0, r15 5223 st1 { v11.16b }, [r3] 5224 5225 ldp r21, r22, [sp, #16] 5226 ldp r23, r24, [sp, #32] 
5227 ldp d8, d9, [sp, #48] 5228 ldp d10, d11, [sp, #64] 5229 ldp d12, d13, [sp, #80] 5230 ldp d14, d15, [sp, #96] 5231 ldp r19, r20, [sp], #112 5232 RET 5233 5234.L256_enc_ret: 5235 mov r0, #0x0 5236 RET 5237.size aes_gcm_enc_256_kernel,.-aes_gcm_enc_256_kernel 5238.globl aes_gcm_dec_256_kernel 5239.type aes_gcm_dec_256_kernel,%function 5240.align 4 5241aes_gcm_dec_256_kernel: 5242 cbz r1, .L256_dec_ret 5243 stp r19, r20, [sp, #-112]! 5244 mov r16, r4 5245 mov r8, r5 5246 stp r21, r22, [sp, #16] 5247 stp r23, r24, [sp, #32] 5248 stp d8, d9, [sp, #48] 5249 stp d10, d11, [sp, #64] 5250 stp d12, d13, [sp, #80] 5251 stp d14, d15, [sp, #96] 5252 5253 lsr r5, r1, #3 @ byte_len 5254 mov r15, r5 5255 ldp r10, r11, [r16] @ ctr96_b64, ctr96_t32 5256#ifdef __ARMEB__ 5257 rev r10, r10 5258 rev r11, r11 5259#endif 5260 ldp r13, r14, [r8, #224] @ load rk14 5261#ifdef __ARMEB__ 5262 ror r14, r14, #32 5263 ror r13, r13, #32 5264#endif 5265 ld1 {v18.4s}, [r8], #16 @ load rk0 5266 sub r5, r5, #1 @ byte_len - 1 5267 5268 ld1 {v19.4s}, [r8], #16 @ load rk1 5269 and r5, r5, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail) 5270 5271 add r4, r0, r1, lsr #3 @ end_input_ptr 5272 ld1 {v20.4s}, [r8], #16 @ load rk2 5273 5274 lsr r12, r11, #32 5275 ld1 {v21.4s}, [r8], #16 @ load rk3 5276 orr r11, r11, r11 5277 5278 ld1 {v22.4s}, [r8], #16 @ load rk4 5279 add r5, r5, r0 5280 rev r12, r12 @ rev_ctr32 5281 5282 add r12, r12, #1 @ increment rev_ctr32 5283 fmov d3, r10 @ CTR block 3 5284 5285 rev r9, r12 @ CTR block 1 5286 add r12, r12, #1 @ CTR block 1 5287 fmov d1, r10 @ CTR block 1 5288 5289 orr r9, r11, r9, lsl #32 @ CTR block 1 5290 ld1 { q0}, [r16] @ special case vector load initial counter so we can start first AES block as quickly as possible 5291 5292 fmov v1.d[1], r9 @ CTR block 1 5293 rev r9, r12 @ CTR block 2 5294 add r12, r12, #1 @ CTR block 2 5295 5296 fmov d2, r10 @ CTR block 2 5297 orr r9, r11, r9, lsl #32 @ CTR block 2 
5298 5299 fmov v2.d[1], r9 @ CTR block 2 5300 rev r9, r12 @ CTR block 3 5301 5302 orr r9, r11, r9, lsl #32 @ CTR block 3 5303 ld1 {v23.4s}, [r8], #16 @ load rk5 5304 5305 fmov v3.d[1], r9 @ CTR block 3 5306 add r12, r12, #1 @ CTR block 3 5307 5308 ld1 {v24.4s}, [r8], #16 @ load rk6 5309 5310 ld1 {v25.4s}, [r8], #16 @ load rk7 5311 5312 ld1 {v26.4s}, [r8], #16 @ load rk8 5313 5314 aese q0, v18.16b 5315 aesmc q0, q0 @ AES block 0 - round 0 5316 ldr q14, [r3, #80] @ load h3l | h3h 5317#ifndef __ARMEB__ 5318 ext v14.16b, v14.16b, v14.16b, #8 5319#endif 5320 5321 aese q3, v18.16b 5322 aesmc q3, q3 @ AES block 3 - round 0 5323 ldr q15, [r3, #112] @ load h4l | h4h 5324#ifndef __ARMEB__ 5325 ext v15.16b, v15.16b, v15.16b, #8 5326#endif 5327 5328 aese q1, v18.16b 5329 aesmc q1, q1 @ AES block 1 - round 0 5330 ldr q13, [r3, #64] @ load h2l | h2h 5331#ifndef __ARMEB__ 5332 ext v13.16b, v13.16b, v13.16b, #8 5333#endif 5334 5335 aese q2, v18.16b 5336 aesmc q2, q2 @ AES block 2 - round 0 5337 ld1 {v27.4s}, [r8], #16 @ load rk9 5338 5339 aese q0, v19.16b 5340 aesmc q0, q0 @ AES block 0 - round 1 5341 5342 aese q1, v19.16b 5343 aesmc q1, q1 @ AES block 1 - round 1 5344 ld1 { v11.16b}, [r3] 5345 ext v11.16b, v11.16b, v11.16b, #8 5346 rev64 v11.16b, v11.16b 5347 5348 aese q2, v19.16b 5349 aesmc q2, q2 @ AES block 2 - round 1 5350 ld1 {v28.4s}, [r8], #16 @ load rk10 5351 5352 aese q3, v19.16b 5353 aesmc q3, q3 @ AES block 3 - round 1 5354 ld1 {v29.4s}, [r8], #16 @ load rk11 5355 5356 aese q0, v20.16b 5357 aesmc q0, q0 @ AES block 0 - round 2 5358 ldr q12, [r3, #32] @ load h1l | h1h 5359#ifndef __ARMEB__ 5360 ext v12.16b, v12.16b, v12.16b, #8 5361#endif 5362 aese q2, v20.16b 5363 aesmc q2, q2 @ AES block 2 - round 2 5364 ld1 {v30.4s}, [r8], #16 @ load rk12 5365 5366 aese q3, v20.16b 5367 aesmc q3, q3 @ AES block 3 - round 2 5368 5369 aese q0, v21.16b 5370 aesmc q0, q0 @ AES block 0 - round 3 5371 5372 aese q1, v20.16b 5373 aesmc q1, q1 @ AES block 1 - round 2 5374 5375 aese q3, 
v21.16b 5376 aesmc q3, q3 @ AES block 3 - round 3 5377 5378 aese q0, v22.16b 5379 aesmc q0, q0 @ AES block 0 - round 4 5380 cmp r0, r5 @ check if we have <= 4 blocks 5381 5382 aese q2, v21.16b 5383 aesmc q2, q2 @ AES block 2 - round 3 5384 5385 aese q1, v21.16b 5386 aesmc q1, q1 @ AES block 1 - round 3 5387 5388 aese q3, v22.16b 5389 aesmc q3, q3 @ AES block 3 - round 4 5390 5391 aese q2, v22.16b 5392 aesmc q2, q2 @ AES block 2 - round 4 5393 5394 aese q1, v22.16b 5395 aesmc q1, q1 @ AES block 1 - round 4 5396 5397 aese q3, v23.16b 5398 aesmc q3, q3 @ AES block 3 - round 5 5399 5400 aese q0, v23.16b 5401 aesmc q0, q0 @ AES block 0 - round 5 5402 5403 aese q1, v23.16b 5404 aesmc q1, q1 @ AES block 1 - round 5 5405 5406 aese q2, v23.16b 5407 aesmc q2, q2 @ AES block 2 - round 5 5408 5409 aese q0, v24.16b 5410 aesmc q0, q0 @ AES block 0 - round 6 5411 5412 aese q3, v24.16b 5413 aesmc q3, q3 @ AES block 3 - round 6 5414 5415 aese q1, v24.16b 5416 aesmc q1, q1 @ AES block 1 - round 6 5417 5418 aese q2, v24.16b 5419 aesmc q2, q2 @ AES block 2 - round 6 5420 5421 aese q0, v25.16b 5422 aesmc q0, q0 @ AES block 0 - round 7 5423 5424 aese q1, v25.16b 5425 aesmc q1, q1 @ AES block 1 - round 7 5426 5427 aese q3, v25.16b 5428 aesmc q3, q3 @ AES block 3 - round 7 5429 5430 aese q0, v26.16b 5431 aesmc q0, q0 @ AES block 0 - round 8 5432 5433 aese q2, v25.16b 5434 aesmc q2, q2 @ AES block 2 - round 7 5435 5436 aese q3, v26.16b 5437 aesmc q3, q3 @ AES block 3 - round 8 5438 5439 aese q1, v26.16b 5440 aesmc q1, q1 @ AES block 1 - round 8 5441 5442 aese q0, v27.16b 5443 aesmc q0, q0 @ AES block 0 - round 9 5444 5445 aese q2, v26.16b 5446 aesmc q2, q2 @ AES block 2 - round 8 5447 ld1 {v31.4s}, [r8], #16 @ load rk13 5448 5449 aese q1, v27.16b 5450 aesmc q1, q1 @ AES block 1 - round 9 5451 5452 aese q0, v28.16b 5453 aesmc q0, q0 @ AES block 0 - round 10 5454 5455 aese q3, v27.16b 5456 aesmc q3, q3 @ AES block 3 - round 9 5457 5458 aese q1, v28.16b 5459 aesmc q1, q1 @ AES block 1 - round 
10 5460 5461 aese q2, v27.16b 5462 aesmc q2, q2 @ AES block 2 - round 9 5463 5464 aese q3, v28.16b 5465 aesmc q3, q3 @ AES block 3 - round 10 5466 5467 aese q0, v29.16b 5468 aesmc q0, q0 @ AES block 0 - round 11 5469 5470 aese q2, v28.16b 5471 aesmc q2, q2 @ AES block 2 - round 10 5472 5473 aese q3, v29.16b 5474 aesmc q3, q3 @ AES block 3 - round 11 5475 5476 aese q1, v29.16b 5477 aesmc q1, q1 @ AES block 1 - round 11 5478 5479 aese q2, v29.16b 5480 aesmc q2, q2 @ AES block 2 - round 11 5481 5482 trn1 q9, v14.2d, v15.2d @ h4h | h3h 5483 5484 trn2 v17.2d, v14.2d, v15.2d @ h4l | h3l 5485 5486 trn1 q8, v12.2d, v13.2d @ h2h | h1h 5487 trn2 v16.2d, v12.2d, v13.2d @ h2l | h1l 5488 5489 aese q1, v30.16b 5490 aesmc q1, q1 @ AES block 1 - round 12 5491 5492 aese q0, v30.16b 5493 aesmc q0, q0 @ AES block 0 - round 12 5494 5495 aese q2, v30.16b 5496 aesmc q2, q2 @ AES block 2 - round 12 5497 5498 aese q3, v30.16b 5499 aesmc q3, q3 @ AES block 3 - round 12 5500 eor v17.16b, v17.16b, q9 @ h4k | h3k 5501 5502 aese q1, v31.16b @ AES block 1 - round 13 5503 5504 aese q2, v31.16b @ AES block 2 - round 13 5505 eor v16.16b, v16.16b, q8 @ h2k | h1k 5506 5507 aese q3, v31.16b @ AES block 3 - round 13 5508 5509 aese q0, v31.16b @ AES block 0 - round 13 5510 bge .L256_dec_tail @ handle tail 5511 5512 ld1 {q4, q5}, [r0], #32 @ AES block 0,1 - load ciphertext 5513 5514 rev r9, r12 @ CTR block 4 5515 5516 eor q0, q4, q0 @ AES block 0 - result 5517 5518 eor q1, q5, q1 @ AES block 1 - result 5519 rev64 q5, q5 @ GHASH block 1 5520 ld1 {q6}, [r0], #16 @ AES block 2 - load ciphertext 5521 5522 mov r7, v0.d[1] @ AES block 0 - mov high 5523 5524 mov r6, v0.d[0] @ AES block 0 - mov low 5525 rev64 q4, q4 @ GHASH block 0 5526 add r12, r12, #1 @ CTR block 4 5527 5528 fmov d0, r10 @ CTR block 4 5529 orr r9, r11, r9, lsl #32 @ CTR block 4 5530 5531 fmov v0.d[1], r9 @ CTR block 4 5532 rev r9, r12 @ CTR block 5 5533 add r12, r12, #1 @ CTR block 5 5534 5535 mov r19, v1.d[0] @ AES block 1 - mov low 5536 
5537 orr r9, r11, r9, lsl #32 @ CTR block 5 5538 mov r20, v1.d[1] @ AES block 1 - mov high 5539 eor r7, r7, r14 @ AES block 0 - round 14 high 5540#ifdef __ARMEB__ 5541 rev r7, r7 5542#endif 5543 eor r6, r6, r13 @ AES block 0 - round 14 low 5544#ifdef __ARMEB__ 5545 rev r6, r6 5546#endif 5547 stp r6, r7, [r2], #16 @ AES block 0 - store result 5548 fmov d1, r10 @ CTR block 5 5549 5550 ld1 {q7}, [r0], #16 @ AES block 3 - load ciphertext 5551 5552 fmov v1.d[1], r9 @ CTR block 5 5553 rev r9, r12 @ CTR block 6 5554 add r12, r12, #1 @ CTR block 6 5555 5556 eor r19, r19, r13 @ AES block 1 - round 14 low 5557#ifdef __ARMEB__ 5558 rev r19, r19 5559#endif 5560 orr r9, r11, r9, lsl #32 @ CTR block 6 5561 5562 eor r20, r20, r14 @ AES block 1 - round 14 high 5563#ifdef __ARMEB__ 5564 rev r20, r20 5565#endif 5566 stp r19, r20, [r2], #16 @ AES block 1 - store result 5567 5568 eor q2, q6, q2 @ AES block 2 - result 5569 cmp r0, r5 @ check if we have <= 8 blocks 5570 bge .L256_dec_prepretail @ do prepretail 5571 5572.L256_dec_main_loop:@ main loop start 5573 mov r21, v2.d[0] @ AES block 4k+2 - mov low 5574 ext v11.16b, v11.16b, v11.16b, #8 @ PRE 0 5575 eor q3, q7, q3 @ AES block 4k+3 - result 5576 5577 aese q0, v18.16b 5578 aesmc q0, q0 @ AES block 4k+4 - round 0 5579 mov r22, v2.d[1] @ AES block 4k+2 - mov high 5580 5581 aese q1, v18.16b 5582 aesmc q1, q1 @ AES block 4k+5 - round 0 5583 fmov d2, r10 @ CTR block 4k+6 5584 5585 fmov v2.d[1], r9 @ CTR block 4k+6 5586 eor q4, q4, v11.16b @ PRE 1 5587 rev r9, r12 @ CTR block 4k+7 5588 5589 aese q0, v19.16b 5590 aesmc q0, q0 @ AES block 4k+4 - round 1 5591 mov r24, v3.d[1] @ AES block 4k+3 - mov high 5592 5593 aese q1, v19.16b 5594 aesmc q1, q1 @ AES block 4k+5 - round 1 5595 mov r23, v3.d[0] @ AES block 4k+3 - mov low 5596 5597 pmull2 v9.1q, q4, v15.2d @ GHASH block 4k - high 5598 mov d8, v4.d[1] @ GHASH block 4k - mid 5599 fmov d3, r10 @ CTR block 4k+7 5600 5601 aese q0, v20.16b 5602 aesmc q0, q0 @ AES block 4k+4 - round 2 5603 orr r9, 
r11, r9, lsl #32 @ CTR block 4k+7 5604 5605 aese q2, v18.16b 5606 aesmc q2, q2 @ AES block 4k+6 - round 0 5607 fmov v3.d[1], r9 @ CTR block 4k+7 5608 5609 aese q1, v20.16b 5610 aesmc q1, q1 @ AES block 4k+5 - round 2 5611 eor q8, q8, q4 @ GHASH block 4k - mid 5612 5613 aese q0, v21.16b 5614 aesmc q0, q0 @ AES block 4k+4 - round 3 5615 eor r22, r22, r14 @ AES block 4k+2 - round 14 high 5616#ifdef __ARMEB__ 5617 rev r22, r22 5618#endif 5619 aese q2, v19.16b 5620 aesmc q2, q2 @ AES block 4k+6 - round 1 5621 mov d10, v17.d[1] @ GHASH block 4k - mid 5622 5623 aese q1, v21.16b 5624 aesmc q1, q1 @ AES block 4k+5 - round 3 5625 rev64 q6, q6 @ GHASH block 4k+2 5626 5627 aese q3, v18.16b 5628 aesmc q3, q3 @ AES block 4k+7 - round 0 5629 eor r21, r21, r13 @ AES block 4k+2 - round 14 low 5630#ifdef __ARMEB__ 5631 rev r21, r21 5632#endif 5633 aese q2, v20.16b 5634 aesmc q2, q2 @ AES block 4k+6 - round 2 5635 stp r21, r22, [r2], #16 @ AES block 4k+2 - store result 5636 5637 pmull v11.1q, q4, v15.1d @ GHASH block 4k - low 5638 5639 pmull2 v4.1q, q5, v14.2d @ GHASH block 4k+1 - high 5640 5641 aese q2, v21.16b 5642 aesmc q2, q2 @ AES block 4k+6 - round 3 5643 rev64 q7, q7 @ GHASH block 4k+3 5644 5645 pmull v10.1q, q8, v10.1d @ GHASH block 4k - mid 5646 eor r23, r23, r13 @ AES block 4k+3 - round 14 low 5647#ifdef __ARMEB__ 5648 rev r23, r23 5649#endif 5650 pmull v8.1q, q5, v14.1d @ GHASH block 4k+1 - low 5651 eor r24, r24, r14 @ AES block 4k+3 - round 14 high 5652#ifdef __ARMEB__ 5653 rev r24, r24 5654#endif 5655 eor q9, q9, q4 @ GHASH block 4k+1 - high 5656 5657 aese q2, v22.16b 5658 aesmc q2, q2 @ AES block 4k+6 - round 4 5659 5660 aese q3, v19.16b 5661 aesmc q3, q3 @ AES block 4k+7 - round 1 5662 mov d4, v5.d[1] @ GHASH block 4k+1 - mid 5663 5664 aese q0, v22.16b 5665 aesmc q0, q0 @ AES block 4k+4 - round 4 5666 eor v11.16b, v11.16b, q8 @ GHASH block 4k+1 - low 5667 5668 aese q2, v23.16b 5669 aesmc q2, q2 @ AES block 4k+6 - round 5 5670 add r12, r12, #1 @ CTR block 4k+7 5671 5672 
aese q3, v20.16b 5673 aesmc q3, q3 @ AES block 4k+7 - round 2 5674 mov d8, v6.d[1] @ GHASH block 4k+2 - mid 5675 5676 aese q1, v22.16b 5677 aesmc q1, q1 @ AES block 4k+5 - round 4 5678 eor q4, q4, q5 @ GHASH block 4k+1 - mid 5679 5680 pmull v5.1q, q6, v13.1d @ GHASH block 4k+2 - low 5681 5682 aese q3, v21.16b 5683 aesmc q3, q3 @ AES block 4k+7 - round 3 5684 eor q8, q8, q6 @ GHASH block 4k+2 - mid 5685 5686 aese q1, v23.16b 5687 aesmc q1, q1 @ AES block 4k+5 - round 5 5688 5689 aese q0, v23.16b 5690 aesmc q0, q0 @ AES block 4k+4 - round 5 5691 eor v11.16b, v11.16b, q5 @ GHASH block 4k+2 - low 5692 5693 pmull v4.1q, q4, v17.1d @ GHASH block 4k+1 - mid 5694 rev r9, r12 @ CTR block 4k+8 5695 5696 aese q1, v24.16b 5697 aesmc q1, q1 @ AES block 4k+5 - round 6 5698 ins v8.d[1], v8.d[0] @ GHASH block 4k+2 - mid 5699 5700 aese q0, v24.16b 5701 aesmc q0, q0 @ AES block 4k+4 - round 6 5702 add r12, r12, #1 @ CTR block 4k+8 5703 5704 aese q3, v22.16b 5705 aesmc q3, q3 @ AES block 4k+7 - round 4 5706 5707 aese q1, v25.16b 5708 aesmc q1, q1 @ AES block 4k+5 - round 7 5709 eor v10.16b, v10.16b, q4 @ GHASH block 4k+1 - mid 5710 5711 aese q0, v25.16b 5712 aesmc q0, q0 @ AES block 4k+4 - round 7 5713 5714 pmull2 v4.1q, q6, v13.2d @ GHASH block 4k+2 - high 5715 mov d6, v7.d[1] @ GHASH block 4k+3 - mid 5716 5717 aese q3, v23.16b 5718 aesmc q3, q3 @ AES block 4k+7 - round 5 5719 5720 pmull2 v8.1q, q8, v16.2d @ GHASH block 4k+2 - mid 5721 5722 aese q0, v26.16b 5723 aesmc q0, q0 @ AES block 4k+4 - round 8 5724 eor q9, q9, q4 @ GHASH block 4k+2 - high 5725 5726 aese q3, v24.16b 5727 aesmc q3, q3 @ AES block 4k+7 - round 6 5728 5729 pmull v4.1q, q7, v12.1d @ GHASH block 4k+3 - low 5730 orr r9, r11, r9, lsl #32 @ CTR block 4k+8 5731 eor v10.16b, v10.16b, q8 @ GHASH block 4k+2 - mid 5732 5733 pmull2 v5.1q, q7, v12.2d @ GHASH block 4k+3 - high 5734 5735 aese q0, v27.16b 5736 aesmc q0, q0 @ AES block 4k+4 - round 9 5737 eor q6, q6, q7 @ GHASH block 4k+3 - mid 5738 5739 aese q1, v26.16b 5740 
aesmc q1, q1 @ AES block 4k+5 - round 8 5741 5742 aese q2, v24.16b 5743 aesmc q2, q2 @ AES block 4k+6 - round 6 5744 eor q9, q9, q5 @ GHASH block 4k+3 - high 5745 5746 aese q0, v28.16b 5747 aesmc q0, q0 @ AES block 4k+4 - round 10 5748 5749 pmull v6.1q, q6, v16.1d @ GHASH block 4k+3 - mid 5750 movi q8, #0xc2 5751 5752 aese q2, v25.16b 5753 aesmc q2, q2 @ AES block 4k+6 - round 7 5754 eor v11.16b, v11.16b, q4 @ GHASH block 4k+3 - low 5755 5756 aese q0, v29.16b 5757 aesmc q0, q0 @ AES block 4k+4 - round 11 5758 5759 aese q3, v25.16b 5760 aesmc q3, q3 @ AES block 4k+7 - round 7 5761 shl d8, d8, #56 @ mod_constant 5762 5763 aese q2, v26.16b 5764 aesmc q2, q2 @ AES block 4k+6 - round 8 5765 eor v10.16b, v10.16b, q6 @ GHASH block 4k+3 - mid 5766 5767 aese q0, v30.16b 5768 aesmc q0, q0 @ AES block 4k+4 - round 12 5769 5770 pmull v7.1q, q9, q8 @ MODULO - top 64b align with mid 5771 eor q6, v11.16b, q9 @ MODULO - karatsuba tidy up 5772 5773 aese q1, v27.16b 5774 aesmc q1, q1 @ AES block 4k+5 - round 9 5775 ld1 {q4}, [r0], #16 @ AES block 4k+4 - load ciphertext 5776 5777 aese q0, v31.16b @ AES block 4k+4 - round 13 5778 ext q9, q9, q9, #8 @ MODULO - other top alignment 5779 5780 aese q1, v28.16b 5781 aesmc q1, q1 @ AES block 4k+5 - round 10 5782 eor v10.16b, v10.16b, q6 @ MODULO - karatsuba tidy up 5783 5784 aese q2, v27.16b 5785 aesmc q2, q2 @ AES block 4k+6 - round 9 5786 ld1 {q5}, [r0], #16 @ AES block 4k+5 - load ciphertext 5787 5788 aese q3, v26.16b 5789 aesmc q3, q3 @ AES block 4k+7 - round 8 5790 eor q0, q4, q0 @ AES block 4k+4 - result 5791 5792 aese q1, v29.16b 5793 aesmc q1, q1 @ AES block 4k+5 - round 11 5794 stp r23, r24, [r2], #16 @ AES block 4k+3 - store result 5795 5796 aese q2, v28.16b 5797 aesmc q2, q2 @ AES block 4k+6 - round 10 5798 eor v10.16b, v10.16b, q7 @ MODULO - fold into mid 5799 5800 aese q3, v27.16b 5801 aesmc q3, q3 @ AES block 4k+7 - round 9 5802 ld1 {q6}, [r0], #16 @ AES block 4k+6 - load ciphertext 5803 5804 aese q1, v30.16b 5805 aesmc q1, q1 
@ AES block 4k+5 - round 12 5806 ld1 {q7}, [r0], #16 @ AES block 4k+7 - load ciphertext 5807 5808 aese q2, v29.16b 5809 aesmc q2, q2 @ AES block 4k+6 - round 11 5810 mov r7, v0.d[1] @ AES block 4k+4 - mov high 5811 5812 aese q3, v28.16b 5813 aesmc q3, q3 @ AES block 4k+7 - round 10 5814 eor v10.16b, v10.16b, q9 @ MODULO - fold into mid 5815 5816 aese q1, v31.16b @ AES block 4k+5 - round 13 5817 mov r6, v0.d[0] @ AES block 4k+4 - mov low 5818 5819 aese q2, v30.16b 5820 aesmc q2, q2 @ AES block 4k+6 - round 12 5821 fmov d0, r10 @ CTR block 4k+8 5822 5823 aese q3, v29.16b 5824 aesmc q3, q3 @ AES block 4k+7 - round 11 5825 fmov v0.d[1], r9 @ CTR block 4k+8 5826 5827 pmull v8.1q, v10.1d, q8 @ MODULO - mid 64b align with low 5828 eor q1, q5, q1 @ AES block 4k+5 - result 5829 rev r9, r12 @ CTR block 4k+9 5830 5831 aese q2, v31.16b @ AES block 4k+6 - round 13 5832 orr r9, r11, r9, lsl #32 @ CTR block 4k+9 5833 cmp r0, r5 @ .LOOP CONTROL 5834 5835 add r12, r12, #1 @ CTR block 4k+9 5836 5837 eor r6, r6, r13 @ AES block 4k+4 - round 14 low 5838#ifdef __ARMEB__ 5839 rev r6, r6 5840#endif 5841 eor r7, r7, r14 @ AES block 4k+4 - round 14 high 5842#ifdef __ARMEB__ 5843 rev r7, r7 5844#endif 5845 mov r20, v1.d[1] @ AES block 4k+5 - mov high 5846 eor q2, q6, q2 @ AES block 4k+6 - result 5847 eor v11.16b, v11.16b, q8 @ MODULO - fold into low 5848 5849 aese q3, v30.16b 5850 aesmc q3, q3 @ AES block 4k+7 - round 12 5851 mov r19, v1.d[0] @ AES block 4k+5 - mov low 5852 5853 fmov d1, r10 @ CTR block 4k+9 5854 ext v10.16b, v10.16b, v10.16b, #8 @ MODULO - other mid alignment 5855 5856 fmov v1.d[1], r9 @ CTR block 4k+9 5857 rev r9, r12 @ CTR block 4k+10 5858 add r12, r12, #1 @ CTR block 4k+10 5859 5860 aese q3, v31.16b @ AES block 4k+7 - round 13 5861 orr r9, r11, r9, lsl #32 @ CTR block 4k+10 5862 5863 rev64 q5, q5 @ GHASH block 4k+5 5864 eor r20, r20, r14 @ AES block 4k+5 - round 14 high 5865#ifdef __ARMEB__ 5866 rev r20, r20 5867#endif 5868 stp r6, r7, [r2], #16 @ AES block 4k+4 - store 
result 5869 5870 eor r19, r19, r13 @ AES block 4k+5 - round 14 low 5871#ifdef __ARMEB__ 5872 rev r19, r19 5873#endif 5874 stp r19, r20, [r2], #16 @ AES block 4k+5 - store result 5875 5876 rev64 q4, q4 @ GHASH block 4k+4 5877 eor v11.16b, v11.16b, v10.16b @ MODULO - fold into low 5878 blt .L256_dec_main_loop 5879 5880 5881.L256_dec_prepretail:@ PREPRETAIL 5882 ext v11.16b, v11.16b, v11.16b, #8 @ PRE 0 5883 mov r21, v2.d[0] @ AES block 4k+2 - mov low 5884 eor q3, q7, q3 @ AES block 4k+3 - result 5885 5886 aese q0, v18.16b 5887 aesmc q0, q0 @ AES block 4k+4 - round 0 5888 mov r22, v2.d[1] @ AES block 4k+2 - mov high 5889 5890 aese q1, v18.16b 5891 aesmc q1, q1 @ AES block 4k+5 - round 0 5892 fmov d2, r10 @ CTR block 4k+6 5893 5894 fmov v2.d[1], r9 @ CTR block 4k+6 5895 rev r9, r12 @ CTR block 4k+7 5896 eor q4, q4, v11.16b @ PRE 1 5897 5898 rev64 q6, q6 @ GHASH block 4k+2 5899 orr r9, r11, r9, lsl #32 @ CTR block 4k+7 5900 mov r23, v3.d[0] @ AES block 4k+3 - mov low 5901 5902 aese q1, v19.16b 5903 aesmc q1, q1 @ AES block 4k+5 - round 1 5904 mov r24, v3.d[1] @ AES block 4k+3 - mov high 5905 5906 pmull v11.1q, q4, v15.1d @ GHASH block 4k - low 5907 mov d8, v4.d[1] @ GHASH block 4k - mid 5908 fmov d3, r10 @ CTR block 4k+7 5909 5910 pmull2 v9.1q, q4, v15.2d @ GHASH block 4k - high 5911 fmov v3.d[1], r9 @ CTR block 4k+7 5912 5913 aese q2, v18.16b 5914 aesmc q2, q2 @ AES block 4k+6 - round 0 5915 mov d10, v17.d[1] @ GHASH block 4k - mid 5916 5917 aese q0, v19.16b 5918 aesmc q0, q0 @ AES block 4k+4 - round 1 5919 eor q8, q8, q4 @ GHASH block 4k - mid 5920 5921 pmull2 v4.1q, q5, v14.2d @ GHASH block 4k+1 - high 5922 5923 aese q2, v19.16b 5924 aesmc q2, q2 @ AES block 4k+6 - round 1 5925 rev64 q7, q7 @ GHASH block 4k+3 5926 5927 aese q3, v18.16b 5928 aesmc q3, q3 @ AES block 4k+7 - round 0 5929 5930 pmull v10.1q, q8, v10.1d @ GHASH block 4k - mid 5931 eor q9, q9, q4 @ GHASH block 4k+1 - high 5932 5933 pmull v8.1q, q5, v14.1d @ GHASH block 4k+1 - low 5934 5935 aese q3, v19.16b 
5936 aesmc q3, q3 @ AES block 4k+7 - round 1 5937 mov d4, v5.d[1] @ GHASH block 4k+1 - mid 5938 5939 aese q0, v20.16b 5940 aesmc q0, q0 @ AES block 4k+4 - round 2 5941 5942 aese q1, v20.16b 5943 aesmc q1, q1 @ AES block 4k+5 - round 2 5944 eor v11.16b, v11.16b, q8 @ GHASH block 4k+1 - low 5945 5946 aese q2, v20.16b 5947 aesmc q2, q2 @ AES block 4k+6 - round 2 5948 5949 aese q0, v21.16b 5950 aesmc q0, q0 @ AES block 4k+4 - round 3 5951 mov d8, v6.d[1] @ GHASH block 4k+2 - mid 5952 5953 aese q3, v20.16b 5954 aesmc q3, q3 @ AES block 4k+7 - round 2 5955 eor q4, q4, q5 @ GHASH block 4k+1 - mid 5956 5957 pmull v5.1q, q6, v13.1d @ GHASH block 4k+2 - low 5958 5959 aese q0, v22.16b 5960 aesmc q0, q0 @ AES block 4k+4 - round 4 5961 5962 aese q3, v21.16b 5963 aesmc q3, q3 @ AES block 4k+7 - round 3 5964 eor q8, q8, q6 @ GHASH block 4k+2 - mid 5965 5966 pmull v4.1q, q4, v17.1d @ GHASH block 4k+1 - mid 5967 5968 aese q0, v23.16b 5969 aesmc q0, q0 @ AES block 4k+4 - round 5 5970 eor v11.16b, v11.16b, q5 @ GHASH block 4k+2 - low 5971 5972 aese q3, v22.16b 5973 aesmc q3, q3 @ AES block 4k+7 - round 4 5974 5975 pmull2 v5.1q, q7, v12.2d @ GHASH block 4k+3 - high 5976 eor v10.16b, v10.16b, q4 @ GHASH block 4k+1 - mid 5977 5978 pmull2 v4.1q, q6, v13.2d @ GHASH block 4k+2 - high 5979 5980 aese q3, v23.16b 5981 aesmc q3, q3 @ AES block 4k+7 - round 5 5982 ins v8.d[1], v8.d[0] @ GHASH block 4k+2 - mid 5983 5984 aese q2, v21.16b 5985 aesmc q2, q2 @ AES block 4k+6 - round 3 5986 5987 aese q1, v21.16b 5988 aesmc q1, q1 @ AES block 4k+5 - round 3 5989 eor q9, q9, q4 @ GHASH block 4k+2 - high 5990 5991 pmull v4.1q, q7, v12.1d @ GHASH block 4k+3 - low 5992 5993 aese q2, v22.16b 5994 aesmc q2, q2 @ AES block 4k+6 - round 4 5995 mov d6, v7.d[1] @ GHASH block 4k+3 - mid 5996 5997 aese q1, v22.16b 5998 aesmc q1, q1 @ AES block 4k+5 - round 4 5999 6000 pmull2 v8.1q, q8, v16.2d @ GHASH block 4k+2 - mid 6001 6002 aese q2, v23.16b 6003 aesmc q2, q2 @ AES block 4k+6 - round 5 6004 eor q6, q6, q7 @ 
GHASH block 4k+3 - mid 6005 6006 aese q1, v23.16b 6007 aesmc q1, q1 @ AES block 4k+5 - round 5 6008 6009 aese q3, v24.16b 6010 aesmc q3, q3 @ AES block 4k+7 - round 6 6011 eor v10.16b, v10.16b, q8 @ GHASH block 4k+2 - mid 6012 6013 aese q2, v24.16b 6014 aesmc q2, q2 @ AES block 4k+6 - round 6 6015 6016 aese q0, v24.16b 6017 aesmc q0, q0 @ AES block 4k+4 - round 6 6018 movi q8, #0xc2 6019 6020 aese q1, v24.16b 6021 aesmc q1, q1 @ AES block 4k+5 - round 6 6022 eor v11.16b, v11.16b, q4 @ GHASH block 4k+3 - low 6023 6024 pmull v6.1q, q6, v16.1d @ GHASH block 4k+3 - mid 6025 6026 aese q3, v25.16b 6027 aesmc q3, q3 @ AES block 4k+7 - round 7 6028 eor q9, q9, q5 @ GHASH block 4k+3 - high 6029 6030 aese q1, v25.16b 6031 aesmc q1, q1 @ AES block 4k+5 - round 7 6032 6033 aese q0, v25.16b 6034 aesmc q0, q0 @ AES block 4k+4 - round 7 6035 eor v10.16b, v10.16b, q6 @ GHASH block 4k+3 - mid 6036 6037 aese q3, v26.16b 6038 aesmc q3, q3 @ AES block 4k+7 - round 8 6039 6040 aese q2, v25.16b 6041 aesmc q2, q2 @ AES block 4k+6 - round 7 6042 eor q6, v11.16b, q9 @ MODULO - karatsuba tidy up 6043 6044 aese q1, v26.16b 6045 aesmc q1, q1 @ AES block 4k+5 - round 8 6046 6047 aese q0, v26.16b 6048 aesmc q0, q0 @ AES block 4k+4 - round 8 6049 shl d8, d8, #56 @ mod_constant 6050 6051 aese q2, v26.16b 6052 aesmc q2, q2 @ AES block 4k+6 - round 8 6053 6054 aese q1, v27.16b 6055 aesmc q1, q1 @ AES block 4k+5 - round 9 6056 eor v10.16b, v10.16b, q6 @ MODULO - karatsuba tidy up 6057 6058 pmull v7.1q, q9, q8 @ MODULO - top 64b align with mid 6059 6060 aese q2, v27.16b 6061 aesmc q2, q2 @ AES block 4k+6 - round 9 6062 ext q9, q9, q9, #8 @ MODULO - other top alignment 6063 6064 aese q3, v27.16b 6065 aesmc q3, q3 @ AES block 4k+7 - round 9 6066 6067 aese q0, v27.16b 6068 aesmc q0, q0 @ AES block 4k+4 - round 9 6069 eor v10.16b, v10.16b, q7 @ MODULO - fold into mid 6070 6071 aese q2, v28.16b 6072 aesmc q2, q2 @ AES block 4k+6 - round 10 6073 6074 aese q3, v28.16b 6075 aesmc q3, q3 @ AES block 4k+7 - 
round 10 6076 6077 aese q0, v28.16b 6078 aesmc q0, q0 @ AES block 4k+4 - round 10 6079 eor r22, r22, r14 @ AES block 4k+2 - round 14 high 6080#ifdef __ARMEB__ 6081 rev r22, r22 6082#endif 6083 aese q1, v28.16b 6084 aesmc q1, q1 @ AES block 4k+5 - round 10 6085 eor r23, r23, r13 @ AES block 4k+3 - round 14 low 6086#ifdef __ARMEB__ 6087 rev r23, r23 6088#endif 6089 aese q2, v29.16b 6090 aesmc q2, q2 @ AES block 4k+6 - round 11 6091 eor v10.16b, v10.16b, q9 @ MODULO - fold into mid 6092 6093 aese q0, v29.16b 6094 aesmc q0, q0 @ AES block 4k+4 - round 11 6095 add r12, r12, #1 @ CTR block 4k+7 6096 6097 aese q1, v29.16b 6098 aesmc q1, q1 @ AES block 4k+5 - round 11 6099 eor r21, r21, r13 @ AES block 4k+2 - round 14 low 6100#ifdef __ARMEB__ 6101 rev r21, r21 6102#endif 6103 6104 aese q2, v30.16b 6105 aesmc q2, q2 @ AES block 4k+6 - round 12 6106 6107 pmull v8.1q, v10.1d, q8 @ MODULO - mid 64b align with low 6108 eor r24, r24, r14 @ AES block 4k+3 - round 14 high 6109#ifdef __ARMEB__ 6110 rev r24, r24 6111#endif 6112 6113 aese q3, v29.16b 6114 aesmc q3, q3 @ AES block 4k+7 - round 11 6115 stp r21, r22, [r2], #16 @ AES block 4k+2 - store result 6116 6117 aese q1, v30.16b 6118 aesmc q1, q1 @ AES block 4k+5 - round 12 6119 ext v10.16b, v10.16b, v10.16b, #8 @ MODULO - other mid alignment 6120 6121 aese q0, v30.16b 6122 aesmc q0, q0 @ AES block 4k+4 - round 12 6123 stp r23, r24, [r2], #16 @ AES block 4k+3 - store result 6124 6125 aese q3, v30.16b 6126 aesmc q3, q3 @ AES block 4k+7 - round 12 6127 eor v11.16b, v11.16b, q8 @ MODULO - fold into low 6128 6129 aese q1, v31.16b @ AES block 4k+5 - round 13 6130 6131 aese q0, v31.16b @ AES block 4k+4 - round 13 6132 6133 aese q3, v31.16b @ AES block 4k+7 - round 13 6134 6135 aese q2, v31.16b @ AES block 4k+6 - round 13 6136 eor v11.16b, v11.16b, v10.16b @ MODULO - fold into low 6137.L256_dec_tail:@ TAIL 6138 6139 sub r5, r4, r0 @ main_end_input_ptr is number of bytes left to process 6140 ld1 { q5}, [r0], #16 @ AES block 4k+4 - load 
ciphertext 6141 6142 eor q0, q5, q0 @ AES block 4k+4 - result 6143 6144 mov r6, v0.d[0] @ AES block 4k+4 - mov low 6145 6146 mov r7, v0.d[1] @ AES block 4k+4 - mov high 6147 ext q8, v11.16b, v11.16b, #8 @ prepare final partial tag 6148 6149 cmp r5, #48 6150 6151 eor r6, r6, r13 @ AES block 4k+4 - round 14 low 6152#ifdef __ARMEB__ 6153 rev r6, r6 6154#endif 6155 6156 eor r7, r7, r14 @ AES block 4k+4 - round 14 high 6157#ifdef __ARMEB__ 6158 rev r7, r7 6159#endif 6160 bgt .L256_dec_blocks_more_than_3 6161 6162 sub r12, r12, #1 6163 mov q3, q2 6164 movi v10.8b, #0 6165 6166 movi v11.8b, #0 6167 cmp r5, #32 6168 6169 movi q9, #0 6170 mov q2, q1 6171 bgt .L256_dec_blocks_more_than_2 6172 6173 sub r12, r12, #1 6174 6175 mov q3, q1 6176 cmp r5, #16 6177 bgt .L256_dec_blocks_more_than_1 6178 6179 sub r12, r12, #1 6180 b .L256_dec_blocks_less_than_1 6181.L256_dec_blocks_more_than_3:@ blocks left > 3 6182 rev64 q4, q5 @ GHASH final-3 block 6183 ld1 { q5}, [r0], #16 @ AES final-2 block - load ciphertext 6184 6185 stp r6, r7, [r2], #16 @ AES final-3 block - store result 6186 6187 mov d10, v17.d[1] @ GHASH final-3 block - mid 6188 6189 eor q4, q4, q8 @ feed in partial tag 6190 6191 eor q0, q5, q1 @ AES final-2 block - result 6192 6193 mov d22, v4.d[1] @ GHASH final-3 block - mid 6194 6195 mov r6, v0.d[0] @ AES final-2 block - mov low 6196 6197 mov r7, v0.d[1] @ AES final-2 block - mov high 6198 6199 eor v22.8b, v22.8b, q4 @ GHASH final-3 block - mid 6200 6201 movi q8, #0 @ suppress further partial tag feed in 6202 6203 pmull2 v9.1q, q4, v15.2d @ GHASH final-3 block - high 6204 6205 pmull v10.1q, v22.1d, v10.1d @ GHASH final-3 block - mid 6206 eor r6, r6, r13 @ AES final-2 block - round 14 low 6207#ifdef __ARMEB__ 6208 rev r6, r6 6209#endif 6210 6211 pmull v11.1q, q4, v15.1d @ GHASH final-3 block - low 6212 eor r7, r7, r14 @ AES final-2 block - round 14 high 6213#ifdef __ARMEB__ 6214 rev r7, r7 6215#endif 6216.L256_dec_blocks_more_than_2:@ blocks left > 2 6217 6218 rev64 q4, q5 
@ GHASH final-2 block 6219 ld1 { q5}, [r0], #16 @ AES final-1 block - load ciphertext 6220 6221 eor q4, q4, q8 @ feed in partial tag 6222 stp r6, r7, [r2], #16 @ AES final-2 block - store result 6223 6224 eor q0, q5, q2 @ AES final-1 block - result 6225 6226 mov d22, v4.d[1] @ GHASH final-2 block - mid 6227 6228 pmull v21.1q, q4, v14.1d @ GHASH final-2 block - low 6229 6230 pmull2 v20.1q, q4, v14.2d @ GHASH final-2 block - high 6231 6232 eor v22.8b, v22.8b, q4 @ GHASH final-2 block - mid 6233 mov r6, v0.d[0] @ AES final-1 block - mov low 6234 6235 mov r7, v0.d[1] @ AES final-1 block - mov high 6236 eor v11.16b, v11.16b, v21.16b @ GHASH final-2 block - low 6237 movi q8, #0 @ suppress further partial tag feed in 6238 6239 pmull v22.1q, v22.1d, v17.1d @ GHASH final-2 block - mid 6240 6241 eor q9, q9, v20.16b @ GHASH final-2 block - high 6242 eor r6, r6, r13 @ AES final-1 block - round 14 low 6243#ifdef __ARMEB__ 6244 rev r6, r6 6245#endif 6246 6247 eor v10.16b, v10.16b, v22.16b @ GHASH final-2 block - mid 6248 eor r7, r7, r14 @ AES final-1 block - round 14 high 6249#ifdef __ARMEB__ 6250 rev r7, r7 6251#endif 6252.L256_dec_blocks_more_than_1:@ blocks left > 1 6253 6254 stp r6, r7, [r2], #16 @ AES final-1 block - store result 6255 rev64 q4, q5 @ GHASH final-1 block 6256 6257 ld1 { q5}, [r0], #16 @ AES final block - load ciphertext 6258 6259 eor q4, q4, q8 @ feed in partial tag 6260 movi q8, #0 @ suppress further partial tag feed in 6261 6262 mov d22, v4.d[1] @ GHASH final-1 block - mid 6263 6264 eor q0, q5, q3 @ AES final block - result 6265 6266 pmull2 v20.1q, q4, v13.2d @ GHASH final-1 block - high 6267 6268 eor v22.8b, v22.8b, q4 @ GHASH final-1 block - mid 6269 6270 pmull v21.1q, q4, v13.1d @ GHASH final-1 block - low 6271 mov r6, v0.d[0] @ AES final block - mov low 6272 6273 ins v22.d[1], v22.d[0] @ GHASH final-1 block - mid 6274 6275 mov r7, v0.d[1] @ AES final block - mov high 6276 6277 pmull2 v22.1q, v22.2d, v16.2d @ GHASH final-1 block - mid 6278 eor r6, r6, 
r13 @ AES final block - round 14 low 6279#ifdef __ARMEB__ 6280 rev r6, r6 6281#endif 6282 eor v11.16b, v11.16b, v21.16b @ GHASH final-1 block - low 6283 6284 eor q9, q9, v20.16b @ GHASH final-1 block - high 6285 6286 eor v10.16b, v10.16b, v22.16b @ GHASH final-1 block - mid 6287 eor r7, r7, r14 @ AES final block - round 14 high 6288#ifdef __ARMEB__ 6289 rev r7, r7 6290#endif 6291.L256_dec_blocks_less_than_1:@ blocks left <= 1 6292 6293 and r1, r1, #127 @ bit_length %= 128 6294 mvn r14, xzr @ rk14_h = 0xffffffffffffffff 6295 6296 sub r1, r1, #128 @ bit_length -= 128 6297 mvn r13, xzr @ rk14_l = 0xffffffffffffffff 6298 6299 ldp r4, r5, [r2] @ load existing bytes we need to not overwrite 6300 neg r1, r1 @ bit_length = 128 - #bits in input (in range [1,128]) 6301 6302 and r1, r1, #127 @ bit_length %= 128 6303 6304 lsr r14, r14, r1 @ rk14_h is mask for top 64b of last block 6305 cmp r1, #64 6306 6307 csel r9, r13, r14, lt 6308 csel r10, r14, xzr, lt 6309 6310 fmov d0, r9 @ ctr0b is mask for last block 6311 and r6, r6, r9 6312 6313 mov v0.d[1], r10 6314 bic r4, r4, r9 @ mask out low existing bytes 6315 6316#ifndef __ARMEB__ 6317 rev r9, r12 6318#else 6319 mov r9, r12 6320#endif 6321 6322 bic r5, r5, r10 @ mask out high existing bytes 6323 6324 orr r6, r6, r4 6325 6326 and r7, r7, r10 6327 6328 orr r7, r7, r5 6329 6330 and q5, q5, q0 @ possibly partial last block has zeroes in highest bits 6331 6332 rev64 q4, q5 @ GHASH final block 6333 6334 eor q4, q4, q8 @ feed in partial tag 6335 6336 pmull v21.1q, q4, v12.1d @ GHASH final block - low 6337 6338 mov d8, v4.d[1] @ GHASH final block - mid 6339 6340 eor q8, q8, q4 @ GHASH final block - mid 6341 6342 pmull2 v20.1q, q4, v12.2d @ GHASH final block - high 6343 6344 pmull v8.1q, q8, v16.1d @ GHASH final block - mid 6345 6346 eor q9, q9, v20.16b @ GHASH final block - high 6347 6348 eor v11.16b, v11.16b, v21.16b @ GHASH final block - low 6349 6350 eor v10.16b, v10.16b, q8 @ GHASH final block - mid 6351 movi q8, #0xc2 6352 6353 
eor q6, v11.16b, q9 @ MODULO - karatsuba tidy up 6354 6355 shl d8, d8, #56 @ mod_constant 6356 6357 eor v10.16b, v10.16b, q6 @ MODULO - karatsuba tidy up 6358 6359 pmull v7.1q, q9, q8 @ MODULO - top 64b align with mid 6360 6361 ext q9, q9, q9, #8 @ MODULO - other top alignment 6362 6363 eor v10.16b, v10.16b, q7 @ MODULO - fold into mid 6364 6365 eor v10.16b, v10.16b, q9 @ MODULO - fold into mid 6366 6367 pmull v8.1q, v10.1d, q8 @ MODULO - mid 64b align with low 6368 6369 ext v10.16b, v10.16b, v10.16b, #8 @ MODULO - other mid alignment 6370 6371 eor v11.16b, v11.16b, q8 @ MODULO - fold into low 6372 6373 stp r6, r7, [r2] 6374 6375 str r9, [r16, #12] @ store the updated counter 6376 6377 eor v11.16b, v11.16b, v10.16b @ MODULO - fold into low 6378 ext v11.16b, v11.16b, v11.16b, #8 6379 rev64 v11.16b, v11.16b 6380 mov r0, r15 6381 st1 { v11.16b }, [r3] 6382 6383 ldp r21, r22, [sp, #16] 6384 ldp r23, r24, [sp, #32] 6385 ldp d8, d9, [sp, #48] 6386 ldp d10, d11, [sp, #64] 6387 ldp d12, d13, [sp, #80] 6388 ldp d14, d15, [sp, #96] 6389 ldp r19, r20, [sp], #112 6390 RET 6391 6392.L256_dec_ret: 6393 mov r0, #0x0 6394 RET 6395.size aes_gcm_dec_256_kernel,.-aes_gcm_dec_256_kernel 6396.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 6397.align 2 6398.align 2 6399#endif 6400