#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements support for ARMv8 AES instructions. The
# module is endian-agnostic in the sense that it supports both big-
# and little-endian cases. It also supports both 32- and 64-bit modes
# of operation. The latter is achieved by limiting the number of
# utilized registers to 16, which implies additional NEON load and
# integer instructions. This has no effect on the mighty Apple A7,
# where results are literally equal to the theoretical estimates based
# on AES instruction latencies and issue rates. On Cortex-A53, an
# in-order execution core, this costs up to 10-15%, which is partially
# compensated by implementing a dedicated code path for the 128-bit
# CBC encrypt case. On Cortex-A57, parallelizable-mode performance
# seems to be limited by the sheer number of NEON instructions...
#
# Performance in cycles per byte processed with 128-bit key:
#
#		CBC enc		CBC dec		CTR
# Apple A7	2.39		1.20		1.20
# Cortex-A53	1.32		1.29		1.46
# Cortex-A57(*)	1.95		0.85		0.93
# Denver	1.96		0.86		0.80
#
# (*)	original 3.64/1.34/1.32 results were for the r0p0 revision
#	and are still the same even for the updated module;

$flavour = shift;
open STDOUT,">".shift;

$prefix="aes_v8";

$code=<<___;
#include "arm_arch.h"

#if __ARM_MAX_ARCH__>=7
.text
___
$code.=".arch	armv8-a+crypto\n"			if ($flavour =~ /64/);
$code.=".arch	armv7-a\n.fpu	neon\n.code	32\n"	if ($flavour !~ /64/);
		#^^^^^^ this is done to simplify adoption by not depending
		#	on the latest binutils.

# Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax:
# NEON uses mostly 32-bit mnemonics, integer code mostly 64-bit. The
# goal is to maintain both 32- and 64-bit code within a single module
# and transliterate common code to either flavour with regex voodoo.
#
{{{
my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
	$flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));


$code.=<<___;
.align	5
rcon:
.long	0x01,0x01,0x01,0x01
.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	// rotate-n-splat
.long	0x1b,0x1b,0x1b,0x1b

.globl	${prefix}_set_encrypt_key
.type	${prefix}_set_encrypt_key,%function
.align	5
${prefix}_set_encrypt_key:
.Lenc_key:
___
$code.=<<___	if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___;
	mov	$ptr,#-1
	cmp	$inp,#0
	b.eq	.Lenc_key_abort
	cmp	$out,#0
	b.eq	.Lenc_key_abort
	mov	$ptr,#-2
	cmp	$bits,#128
	b.lt	.Lenc_key_abort
	cmp	$bits,#256
	b.gt	.Lenc_key_abort
	tst	$bits,#0x3f
	b.ne	.Lenc_key_abort

	adr	$ptr,rcon
	cmp	$bits,#192

	veor	$zero,$zero,$zero
	vld1.8	{$in0},[$inp],#16
	mov	$bits,#8		// reuse $bits
	vld1.32	{$rcon,$mask},[$ptr],#32

	b.lt	.Loop128
	b.eq	.L192
	b	.L256

.align	4
.Loop128:
	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key
	b.ne	.Loop128

	vld1.32	{$rcon},[$ptr]

	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key

	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	veor	$in0,$in0,$key
	vst1.32	{$in0},[$out]
	add	$out,$out,#0x50

	mov	$rounds,#10
	b	.Ldone

.align	4
.L192:
	vld1.8	{$in1},[$inp],#8
	vmov.i8	$key,#8			// borrow $key
	vst1.32	{$in0},[$out],#16
	vsub.i8	$mask,$mask,$key	// adjust the mask

.Loop192:
	vtbl.8	$key,{$in1},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in1},[$out],#8
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp

	vdup.32	$tmp,${in0}[3]
	veor	$tmp,$tmp,$in1
	veor	$key,$key,$rcon
	vext.8	$in1,$zero,$in1,#12
	vshl.u8	$rcon,$rcon,#1
	veor	$in1,$in1,$tmp
	veor	$in0,$in0,$key
	veor	$in1,$in1,$key
	vst1.32	{$in0},[$out],#16
	b.ne	.Loop192

	mov	$rounds,#12
	add	$out,$out,#0x20
	b	.Ldone

.align	4
.L256:
	vld1.8	{$in1},[$inp]
	mov	$bits,#7
	mov	$rounds,#14
	vst1.32	{$in0},[$out],#16

.Loop256:
	vtbl.8	$key,{$in1},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in1},[$out],#16
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key
	vst1.32	{$in0},[$out],#16
	b.eq	.Ldone

	vdup.32	$key,${in0}[3]		// just splat
	vext.8	$tmp,$zero,$in1,#12
	aese	$key,$zero

	veor	$in1,$in1,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in1,$in1,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in1,$in1,$tmp

	veor	$in1,$in1,$key
	b	.Loop256

.Ldone:
	str	$rounds,[$out]
	mov	$ptr,#0

.Lenc_key_abort:
	mov	x0,$ptr			// return value
	`"ldr	x29,[sp],#16"	if ($flavour =~ /64/)`
	ret
.size	${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key

.globl	${prefix}_set_decrypt_key
.type	${prefix}_set_decrypt_key,%function
.align	5
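// Note: the decrypt key schedule below is derived from the encrypt
// schedule produced by .Lenc_key; the round keys are reversed in place
// and InvMixColumns (aesimc) is applied to all but the outermost two.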
${prefix}_set_decrypt_key:
___
$code.=<<___	if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	stmdb	sp!,{r4,lr}
___
$code.=<<___;
	bl	.Lenc_key

	cmp	x0,#0
	b.ne	.Ldec_key_abort

	sub	$out,$out,#240		// restore original $out
	mov	x4,#-16
	add	$inp,$out,x12,lsl#4	// end of key schedule

	vld1.32	{v0.16b},[$out]
	vld1.32	{v1.16b},[$inp]
	vst1.32	{v0.16b},[$inp],x4
	vst1.32	{v1.16b},[$out],#16

.Loop_imc:
	vld1.32	{v0.16b},[$out]
	vld1.32	{v1.16b},[$inp]
	aesimc	v0.16b,v0.16b
	aesimc	v1.16b,v1.16b
	vst1.32	{v0.16b},[$inp],x4
	vst1.32	{v1.16b},[$out],#16
	cmp	$inp,$out
	b.hi	.Loop_imc

	vld1.32	{v0.16b},[$out]
	aesimc	v0.16b,v0.16b
	vst1.32	{v0.16b},[$inp]

	eor	x0,x0,x0		// return value
.Ldec_key_abort:
___
$code.=<<___	if ($flavour !~ /64/);
	ldmia	sp!,{r4,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldp	x29,x30,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
___
}}}
{{{
sub gen_block () {
my $dir = shift;
my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");
my ($inp,$out,$key)=map("x$_",(0..2));
my $rounds="w3";
my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3));

$code.=<<___;
.globl	${prefix}_${dir}crypt
.type	${prefix}_${dir}crypt,%function
.align	5
${prefix}_${dir}crypt:
	ldr	$rounds,[$key,#240]
	vld1.32	{$rndkey0},[$key],#16
	vld1.8	{$inout},[$inp]
	sub	$rounds,$rounds,#2
	vld1.32	{$rndkey1},[$key],#16

.Loop_${dir}c:
	aes$e	$inout,$rndkey0
	aes$mc	$inout,$inout
	vld1.32	{$rndkey0},[$key],#16
	subs	$rounds,$rounds,#2
	aes$e	$inout,$rndkey1
	aes$mc	$inout,$inout
	vld1.32	{$rndkey1},[$key],#16
	b.gt	.Loop_${dir}c

	aes$e	$inout,$rndkey0
	aes$mc	$inout,$inout
	vld1.32	{$rndkey0},[$key]
	aes$e	$inout,$rndkey1
	veor	$inout,$inout,$rndkey0

	vst1.8	{$inout},[$out]
	ret
.size	${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
___
}
&gen_block("en");
&gen_block("de");
}}}
{{{
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));

my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
my ($key4,$key5,$key6,$key7)=("x6","x12","x14",$key);

### q8-q15	preloaded key schedule

$code.=<<___;
.globl	${prefix}_cbc_encrypt
.type	${prefix}_cbc_encrypt,%function
.align	5
${prefix}_cbc_encrypt:
___
$code.=<<___	if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	mov	ip,sp
	stmdb	sp!,{r4-r8,lr}
	vstmdb	sp!,{d8-d15}		@ ABI specification says so
	ldmia	ip,{r4-r5}		@ load remaining args
___
$code.=<<___;
	subs	$len,$len,#16
	mov	$step,#16
	b.lo	.Lcbc_abort
	cclr	$step,eq

	cmp	$enc,#0			// en- or decrypting?
	ldr	$rounds,[$key,#240]
	and	$len,$len,#-16
	vld1.8	{$ivec},[$ivp]
	vld1.8	{$dat},[$inp],$step

	vld1.32	{q8-q9},[$key]		// load key schedule...
	sub	$rounds,$rounds,#6
	add	$key_,$key,x5,lsl#4	// pointer to last 7 round keys
	sub	$rounds,$rounds,#2
	vld1.32	{q10-q11},[$key_],#32
	vld1.32	{q12-q13},[$key_],#32
	vld1.32	{q14-q15},[$key_],#32
	vld1.32	{$rndlast},[$key_]

	add	$key_,$key,#32
	mov	$cnt,$rounds
	b.eq	.Lcbc_dec

	cmp	$rounds,#2
	veor	$dat,$dat,$ivec
	veor	$rndzero_n_last,q8,$rndlast
	b.eq	.Lcbc_enc128

	vld1.32	{$in0-$in1},[$key_]
	add	$key_,$key,#16
	add	$key4,$key,#16*4
	add	$key5,$key,#16*5
	aese	$dat,q8
	aesmc	$dat,$dat
	add	$key6,$key,#16*6
	add	$key7,$key,#16*7
	b	.Lenter_cbc_enc

.align	4
.Loop_cbc_enc:
	aese	$dat,q8
	aesmc	$dat,$dat
	vst1.8	{$ivec},[$out],#16
.Lenter_cbc_enc:
	aese	$dat,q9
	aesmc	$dat,$dat
	aese	$dat,$in0
	aesmc	$dat,$dat
	vld1.32	{q8},[$key4]
	cmp	$rounds,#4
	aese	$dat,$in1
	aesmc	$dat,$dat
	vld1.32	{q9},[$key5]
	b.eq	.Lcbc_enc192

	aese	$dat,q8
	aesmc	$dat,$dat
	vld1.32	{q8},[$key6]
	aese	$dat,q9
	aesmc	$dat,$dat
	vld1.32	{q9},[$key7]
	nop

.Lcbc_enc192:
	aese	$dat,q8
	aesmc	$dat,$dat
	subs	$len,$len,#16
	aese	$dat,q9
	aesmc	$dat,$dat
	cclr	$step,eq
	aese	$dat,q10
	aesmc	$dat,$dat
	aese	$dat,q11
	aesmc	$dat,$dat
	vld1.8	{q8},[$inp],$step
	aese	$dat,q12
	aesmc	$dat,$dat
	veor	q8,q8,$rndzero_n_last
	aese	$dat,q13
	aesmc	$dat,$dat
	vld1.32	{q9},[$key_]		// re-pre-load rndkey[1]
	aese	$dat,q14
	aesmc	$dat,$dat
	aese	$dat,q15
	veor	$ivec,$dat,$rndlast
	b.hs	.Loop_cbc_enc

	vst1.8	{$ivec},[$out],#16
	b	.Lcbc_done

.align	5
.Lcbc_enc128:
	vld1.32	{$in0-$in1},[$key_]
	aese	$dat,q8
	aesmc	$dat,$dat
	b	.Lenter_cbc_enc128
.Loop_cbc_enc128:
	aese	$dat,q8
	aesmc	$dat,$dat
	vst1.8	{$ivec},[$out],#16
.Lenter_cbc_enc128:
	aese	$dat,q9
	aesmc	$dat,$dat
	subs	$len,$len,#16
	aese	$dat,$in0
	aesmc	$dat,$dat
	cclr	$step,eq
	aese	$dat,$in1
	aesmc	$dat,$dat
	aese	$dat,q10
	aesmc	$dat,$dat
	aese	$dat,q11
	aesmc	$dat,$dat
	vld1.8	{q8},[$inp],$step
	aese	$dat,q12
	aesmc	$dat,$dat
	aese	$dat,q13
	aesmc	$dat,$dat
	aese	$dat,q14
	aesmc	$dat,$dat
	veor	q8,q8,$rndzero_n_last
	aese	$dat,q15
	veor	$ivec,$dat,$rndlast
	b.hs	.Loop_cbc_enc128

	vst1.8	{$ivec},[$out],#16
	b	.Lcbc_done
___
{
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
$code.=<<___;
.align	5
.Lcbc_dec:
	vld1.8	{$dat2},[$inp],#16
	subs	$len,$len,#32		// bias
	add	$cnt,$rounds,#2
	vorr	$in1,$dat,$dat
	vorr	$dat1,$dat,$dat
	vorr	$in2,$dat2,$dat2
	b.lo	.Lcbc_dec_tail

	vorr	$dat1,$dat2,$dat2
	vld1.8	{$dat2},[$inp],#16
	vorr	$in0,$dat,$dat
	vorr	$in1,$dat1,$dat1
	vorr	$in2,$dat2,$dat2

.Loop3x_cbc_dec:
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop3x_cbc_dec

	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	veor	$tmp0,$ivec,$rndlast
	subs	$len,$len,#0x30
	veor	$tmp1,$in0,$rndlast
	mov.lo	x6,$len			// x6, $cnt, is zero at this point
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	veor	$tmp2,$in1,$rndlast
	add	$inp,$inp,x6		// $inp is adjusted in such way that
					// at exit from the loop $dat1-$dat2
					// are loaded with last "words"
	vorr	$ivec,$in2,$in2
	mov	$key_,$key
	aesd	$dat0,q12
	aesimc	$dat0,$dat0
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	vld1.8	{$in0},[$inp],#16
	aesd	$dat0,q13
	aesimc	$dat0,$dat0
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	vld1.8	{$in1},[$inp],#16
	aesd	$dat0,q14
	aesimc	$dat0,$dat0
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	vld1.8	{$in2},[$inp],#16
	aesd	$dat0,q15
	aesd	$dat1,q15
	aesd	$dat2,q15
	vld1.32	{q8},[$key_],#16	// re-pre-load rndkey[0]
	add	$cnt,$rounds,#2
	veor	$tmp0,$tmp0,$dat0
	veor	$tmp1,$tmp1,$dat1
	veor	$dat2,$dat2,$tmp2
	vld1.32	{q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8	{$tmp0},[$out],#16
	vorr	$dat0,$in0,$in0
	vst1.8	{$tmp1},[$out],#16
	vorr	$dat1,$in1,$in1
	vst1.8	{$dat2},[$out],#16
	vorr	$dat2,$in2,$in2
	b.hs	.Loop3x_cbc_dec

	cmn	$len,#0x30
	b.eq	.Lcbc_done
	nop

.Lcbc_dec_tail:
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Lcbc_dec_tail

	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	cmn	$len,#0x20
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	veor	$tmp1,$ivec,$rndlast
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	veor	$tmp2,$in1,$rndlast
	aesd	$dat1,q15
	aesd	$dat2,q15
	b.eq	.Lcbc_dec_one
	veor	$tmp1,$tmp1,$dat1
	veor	$tmp2,$tmp2,$dat2
	vorr	$ivec,$in2,$in2
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$tmp2},[$out],#16
	b	.Lcbc_done

.Lcbc_dec_one:
	veor	$tmp1,$tmp1,$dat2
	vorr	$ivec,$in2,$in2
	vst1.8	{$tmp1},[$out],#16

.Lcbc_done:
	vst1.8	{$ivec},[$ivp]
.Lcbc_abort:
___
}
$code.=<<___	if ($flavour !~ /64/);
	vldmia	sp!,{d8-d15}
	ldmia	sp!,{r4-r8,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldr	x29,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
___
}}}
{{{
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
my ($rounds,$cnt,$key_)=("w5","w6","x7");
my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
my $step="x12";		# aliases with $tctr2

my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));

my ($dat,$tmp)=($dat0,$tmp0);

### q8-q15	preloaded key schedule

$code.=<<___;
.globl	${prefix}_ctr32_encrypt_blocks
.type	${prefix}_ctr32_encrypt_blocks,%function
.align	5
${prefix}_ctr32_encrypt_blocks:
___
$code.=<<___	if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	mov	ip,sp
	stmdb	sp!,{r4-r10,lr}
	vstmdb	sp!,{d8-d15}		@ ABI specification says so
	ldr	r4, [ip]		@ load remaining arg
___
$code.=<<___;
	ldr	$rounds,[$key,#240]

	ldr	$ctr, [$ivp, #12]
	vld1.32	{$dat0},[$ivp]

	vld1.32	{q8-q9},[$key]		// load key schedule...
	sub	$rounds,$rounds,#4
	mov	$step,#16
	cmp	$len,#2
	add	$key_,$key,x5,lsl#4	// pointer to last 5 round keys
	sub	$rounds,$rounds,#2
	vld1.32	{q12-q13},[$key_],#32
	vld1.32	{q14-q15},[$key_],#32
	vld1.32	{$rndlast},[$key_]
	add	$key_,$key,#32
	mov	$cnt,$rounds
	cclr	$step,lo
#ifndef __ARMEB__
	rev	$ctr, $ctr
#endif
	vorr	$dat1,$dat0,$dat0
	add	$tctr1, $ctr, #1
	vorr	$dat2,$dat0,$dat0
	add	$ctr, $ctr, #2
	vorr	$ivec,$dat0,$dat0
	rev	$tctr1, $tctr1
	vmov.32	${dat1}[3],$tctr1
	b.ls	.Lctr32_tail
	rev	$tctr2, $ctr
	sub	$len,$len,#3		// bias
	vmov.32	${dat2}[3],$tctr2
	b	.Loop3x_ctr32

.align	4
.Loop3x_ctr32:
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop3x_ctr32

	aese	$dat0,q8
	aesmc	$tmp0,$dat0
	aese	$dat1,q8
	aesmc	$tmp1,$dat1
	vld1.8	{$in0},[$inp],#16
	vorr	$dat0,$ivec,$ivec
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	vld1.8	{$in1},[$inp],#16
	vorr	$dat1,$ivec,$ivec
	aese	$tmp0,q9
	aesmc	$tmp0,$tmp0
	aese	$tmp1,q9
	aesmc	$tmp1,$tmp1
	vld1.8	{$in2},[$inp],#16
	mov	$key_,$key
	aese	$dat2,q9
	aesmc	$tmp2,$dat2
	vorr	$dat2,$ivec,$ivec
	add	$tctr0,$ctr,#1
	aese	$tmp0,q12
	aesmc	$tmp0,$tmp0
	aese	$tmp1,q12
	aesmc	$tmp1,$tmp1
	veor	$in0,$in0,$rndlast
	add	$tctr1,$ctr,#2
	aese	$tmp2,q12
	aesmc	$tmp2,$tmp2
	veor	$in1,$in1,$rndlast
	add	$ctr,$ctr,#3
	aese	$tmp0,q13
	aesmc	$tmp0,$tmp0
	aese	$tmp1,q13
	aesmc	$tmp1,$tmp1
	veor	$in2,$in2,$rndlast
	rev	$tctr0,$tctr0
	aese	$tmp2,q13
	aesmc	$tmp2,$tmp2
	vmov.32	${dat0}[3], $tctr0
	rev	$tctr1,$tctr1
	aese	$tmp0,q14
	aesmc	$tmp0,$tmp0
	aese	$tmp1,q14
	aesmc	$tmp1,$tmp1
	vmov.32	${dat1}[3], $tctr1
	rev	$tctr2,$ctr
	aese	$tmp2,q14
	aesmc	$tmp2,$tmp2
	vmov.32	${dat2}[3], $tctr2
	subs	$len,$len,#3
	aese	$tmp0,q15
	aese	$tmp1,q15
	aese	$tmp2,q15

	veor	$in0,$in0,$tmp0
	vld1.32	{q8},[$key_],#16	// re-pre-load rndkey[0]
	vst1.8	{$in0},[$out],#16
	veor	$in1,$in1,$tmp1
	mov	$cnt,$rounds
	vst1.8	{$in1},[$out],#16
	veor	$in2,$in2,$tmp2
	vld1.32	{q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8	{$in2},[$out],#16
	b.hs	.Loop3x_ctr32

	adds	$len,$len,#3
	b.eq	.Lctr32_done
	cmp	$len,#1
	mov	$step,#16
	cclr	$step,eq

.Lctr32_tail:
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	vld1.32	{q9},[$key_],#16
	b.gt	.Lctr32_tail

	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	vld1.8	{$in0},[$inp],$step
	aese	$dat0,q12
	aesmc	$dat0,$dat0
	aese	$dat1,q12
	aesmc	$dat1,$dat1
	vld1.8	{$in1},[$inp]
	aese	$dat0,q13
	aesmc	$dat0,$dat0
	aese	$dat1,q13
	aesmc	$dat1,$dat1
	veor	$in0,$in0,$rndlast
	aese	$dat0,q14
	aesmc	$dat0,$dat0
	aese	$dat1,q14
	aesmc	$dat1,$dat1
	veor	$in1,$in1,$rndlast
	aese	$dat0,q15
	aese	$dat1,q15

	cmp	$len,#1
	veor	$in0,$in0,$dat0
	veor	$in1,$in1,$dat1
	vst1.8	{$in0},[$out],#16
	b.eq	.Lctr32_done
	vst1.8	{$in1},[$out]

.Lctr32_done:
___
$code.=<<___	if ($flavour !~ /64/);
	vldmia	sp!,{d8-d15}
	ldmia	sp!,{r4-r10,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldr	x29,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
___
}}}
$code.=<<___;
#endif
___
########################################
if ($flavour =~ /64/) {			######## 64-bit code
    my %opcode = (
	"aesd"	=>	0x4e285800,	"aese"	=>	0x4e284800,
	"aesimc"=>	0x4e287800,	"aesmc"	=>	0x4e286800	);

    local *unaes = sub {
	my ($mnemonic,$arg)=@_;

	$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o	&&
	sprintf ".inst\t0x%08x\t//%s %s",
			$opcode{$mnemonic}|$1|($2<<5),
			$mnemonic,$arg;
    };

    foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/geo;

	s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo;	# old->new registers
	s/@\s/\/\//o;			# old->new style commentary

	#s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
	s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel	$1$2,$1zr,$1$2,$3/o	or
	s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel	$2,$3,$2,$1/o	or
	s/vmov\.i8/movi/o	or	# fix up legacy mnemonics
	s/vext\.8/ext/o		or
	s/vrev32\.8/rev32/o	or
	s/vtst\.8/cmtst/o	or
	s/vshr/ushr/o		or
	s/^(\s+)v/$1/o		or	# strip off v prefix
	s/\bbx\s+lr\b/ret/o;

	# fix up remaining legacy suffixes
	s/\.[ui]?8//o;
	m/\],#8/o and s/\.16b/\.8b/go;
	s/\.[ui]?32//o and s/\.16b/\.4s/go;
	s/\.[ui]?64//o and s/\.16b/\.2d/go;
	s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;

	print $_,"\n";
    }
} else {				######## 32-bit code
    my %opcode = (
	"aesd"	=>	0xf3b00340,	"aese"	=>	0xf3b00300,
	"aesimc"=>	0xf3b003c0,	"aesmc"	=>	0xf3b00380	);

    local *unaes = sub {
	my ($mnemonic,$arg)=@_;

	if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
	    my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
					 |(($2&7)<<1) |(($2&8)<<2);
	    # ARMv7 instructions are always encoded little-endian.
	    # The correct solution is to use the .inst directive, but
	    # older assemblers don't implement it:-(
	    sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
	}
    };

    sub unvtbl {
	my $arg=shift;

	$arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
	sprintf	"vtbl.8	d%d,{q%d},d%d\n\t".
		"vtbl.8	d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
    }

    sub unvdup32 {
	my $arg=shift;

	$arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
	sprintf	"vdup.32	q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
    }

    sub unvmov32 {
	my $arg=shift;

	$arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
	sprintf	"vmov.32	d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
    }

    foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/geo;

	s/\b[wx]([0-9]+)\b/r$1/go;		# new->old registers
	s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;	# new->old registers
	s/\/\/\s?/@ /o;				# new->old style commentary

	# fix up remaining new-style suffixes
	s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo	or
	s/\],#[0-9]+/]!/o;

	s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
	s/cclr\s+([^,]+),\s*([a-z]+)/mov$2	$1,#0/o	or
	s/vtbl\.8\s+(.*)/unvtbl($1)/geo			or
	s/vdup\.32\s+(.*)/unvdup32($1)/geo		or
	s/vmov\.32\s+(.*)/unvmov32($1)/geo		or
	s/^(\s+)b\./$1b/o				or
	s/^(\s+)mov\./$1mov/o				or
	s/^(\s+)ret/$1bx\tlr/o;

	print $_,"\n";
    }
}

close STDOUT;
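# For reference only: the C-level prototypes these routines are expected
# to be called with, inferred from the argument registers assigned above
# and from the way OpenSSL declares its hardware-assisted AES routines.
# This is an informal sketch, not part of the generated code; AES_KEY is
# assumed to be the usual OpenSSL key-schedule structure.
#
#   int  aes_v8_set_encrypt_key(const unsigned char *user_key, const int bits,
#                               AES_KEY *key);
#   int  aes_v8_set_decrypt_key(const unsigned char *user_key, const int bits,
#                               AES_KEY *key);
#   void aes_v8_encrypt(const unsigned char *in, unsigned char *out,
#                       const AES_KEY *key);
#   void aes_v8_decrypt(const unsigned char *in, unsigned char *out,
#                       const AES_KEY *key);
#   void aes_v8_cbc_encrypt(const unsigned char *in, unsigned char *out,
#                           size_t length, const AES_KEY *key,
#                           unsigned char *ivec, const int enc);
#   void aes_v8_ctr32_encrypt_blocks(const unsigned char *in,
#                                    unsigned char *out, size_t len,
#                                    const AES_KEY *key,
#                                    const unsigned char ivec[16]);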