#! /usr/bin/env perl
# Copyright 2009-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements support for Intel AES-NI extension. In
# OpenSSL context it's used with Intel engine, but can also be used as
# drop-in replacement for crypto/aes/asm/aes-x86_64.pl [see below for
# details].
#
# Performance.
#
# Given aes(enc|dec) instructions' latency asymptotic performance for
# non-parallelizable modes such as CBC encrypt is 3.75 cycles per byte
# processed with 128-bit key. And given their throughput asymptotic
# performance for parallelizable modes is 1.25 cycles per byte. Being
# asymptotic limit it's not something you commonly achieve in reality,
# but how close does one get? Below are results collected for
# different modes and block sizes. Pairs of numbers are for en-/
# decryption.
#
#       16-byte     64-byte     256-byte    1-KB        8-KB
# ECB   4.25/4.25   1.38/1.38   1.28/1.28   1.26/1.26   1.26/1.26
# CTR   5.42/5.42   1.92/1.92   1.44/1.44   1.28/1.28   1.26/1.26
# CBC   4.38/4.43   4.15/1.43   4.07/1.32   4.07/1.29   4.06/1.28
# CCM   5.66/9.42   4.42/5.41   4.16/4.40   4.09/4.15   4.06/4.07
# OFB   5.42/5.42   4.64/4.64   4.44/4.44   4.39/4.39   4.38/4.38
# CFB   5.73/5.85   5.56/5.62   5.48/5.56   5.47/5.55   5.47/5.55
#
# ECB, CTR, CBC and CCM results are free from EVP overhead.
# This means
# that otherwise used 'openssl speed -evp aes-128-??? -engine aesni
# [-decrypt]' will exhibit 10-15% worse results for smaller blocks.
# The results were collected with specially crafted speed.c benchmark
# in order to compare them with results reported in "Intel Advanced
# Encryption Standard (AES) New Instruction Set" White Paper Revision
# 3.0 dated May 2010. All above results are consistently better. This
# module also provides better performance for block sizes smaller than
# 128 bytes in points *not* represented in the above table.
#
# Looking at the results for 8-KB buffer.
#
# CFB and OFB results are far from the limit, because implementation
# uses "generic" CRYPTO_[c|o]fb128_encrypt interfaces relying on
# single-block aesni_encrypt, which is not the most optimal way to go.
# CBC encrypt result is unexpectedly high and there is no documented
# explanation for it. Seemingly there is a small penalty for feeding
# the result back to AES unit the way it's done in CBC mode. There is
# nothing one can do and the result appears optimal. CCM result is
# identical to CBC, because CBC-MAC is essentially CBC encrypt without
# saving output. CCM CTR "stays invisible," because it's neatly
# interleaved with CBC-MAC. This provides ~30% improvement over
# "straightforward" CCM implementation with CTR and CBC-MAC performed
# disjointly. Parallelizable modes practically achieve the theoretical
# limit.
#
# Looking at how results vary with buffer size.
#
# Curves are practically saturated at 1-KB buffer size. In most cases
# "256-byte" performance is >95%, and "64-byte" is ~90% of "8-KB" one.
# CTR curve doesn't follow this pattern and is "slowest" changing one
# with "256-byte" result being 87% of "8-KB." This is because overhead
# in CTR mode is most computationally intensive.
# Small-block CCM
# decrypt is slower than encrypt, because first CTR and last CBC-MAC
# iterations can't be interleaved.
#
# Results for 192- and 256-bit keys.
#
# EVP-free results were observed to scale perfectly with number of
# rounds for larger block sizes, i.e. 192-bit result being 10/12 times
# lower and 256-bit one - 10/14. Well, in CBC encrypt case differences
# are a tad smaller, because the above mentioned penalty biases all
# results by same constant value. In similar way function call
# overhead affects small-block performance, as well as OFB and CFB
# results. Differences are not large, most common coefficients are
# 10/11.7 and 10/13.4 (as opposed to 10/12.0 and 10/14.0), but one
# can observe even 10/11.2 and 10/12.4 (CTR, OFB, CFB)...

# January 2011
#
# While Westmere processor features 6 cycles latency for aes[enc|dec]
# instructions, which can be scheduled every second cycle, Sandy
# Bridge spends 8 cycles per instruction, but it can schedule them
# every cycle. This means that code targeting Westmere would perform
# suboptimally on Sandy Bridge. Therefore this update.
#
# In addition, non-parallelizable CBC encrypt (as well as CCM) is
# optimized. Relative improvement might appear modest, 8% on Westmere,
# but in absolute terms it's 3.77 cycles per byte encrypted with
# 128-bit key on Westmere, and 5.07 - on Sandy Bridge. These numbers
# should be compared to asymptotic limits of 3.75 for Westmere and
# 5.00 for Sandy Bridge. Actually, the fact that they get this close
# to asymptotic limits is quite amazing. Indeed, the limit is
# calculated as latency times number of rounds, 10 for 128-bit key,
# and divided by 16, the number of bytes in block, or in other words
# it accounts *solely* for aesenc instructions.
# But there are extra
# instructions, and numbers so close to the asymptotic limits mean
# that it's as if it takes as little as *one* additional cycle to
# execute all of them. How is it possible? It is possible thanks to
# out-of-order execution logic, which manages to overlap post-
# processing of previous block, things like saving the output, with
# actual encryption of current block, as well as pre-processing of
# current block, things like fetching input and xor-ing it with
# 0-round element of the key schedule, with actual encryption of
# previous block. Keep this in mind...
#
# For parallelizable modes, such as ECB, CBC decrypt, CTR, higher
# performance is achieved by interleaving instructions working on
# independent blocks. In which case asymptotic limit for such modes
# can be obtained by dividing above mentioned numbers by AES
# instructions' interleave factor. Westmere can execute at most 3
# instructions at a time, meaning that optimal interleave factor is 3,
# and that's where the "magic" number of 1.25 comes from. "Optimal
# interleave factor" means that increase of interleave factor does
# not improve performance. The formula has proven to reflect reality
# pretty well on Westmere... Sandy Bridge on the other hand can
# execute up to 8 AES instructions at a time, so how does varying
# interleave factor affect the performance? Here is table for ECB
# (numbers are cycles per byte processed with 128-bit key):
#
# instruction interleave factor         3x      6x      8x
# theoretical asymptotic limit          1.67    0.83    0.625
# measured performance for 8KB block    1.05    0.86    0.84
#
# "as if" interleave factor             4.7x    5.8x    6.0x
#
# Further data for other parallelizable modes:
#
# CBC decrypt                           1.16    0.93    0.74
# CTR                                   1.14    0.91    0.74
#
# Well, given 3x column it's probably inappropriate to call the limit
# asymptotic, if it can be surpassed, isn't it? What happens there?
# Rewind to CBC paragraph for the answer. Yes, out-of-order execution
# magic is responsible for this. Processor overlaps not only the
# additional instructions with AES ones, but even AES instructions
# processing adjacent triplets of independent blocks. In the 6x case
# additional instructions still claim disproportionally small amount
# of additional cycles, but in 8x case number of instructions must be
# a tad too high for out-of-order logic to cope with, and AES unit
# remains underutilized... As you can see 8x interleave is hardly
# justifiable, so there is no need to feel bad that 32-bit aesni-x86.pl
# utilizes 6x interleave because of limited register bank capacity.
#
# Higher interleave factors do have negative impact on Westmere
# performance. While for ECB mode it's negligible ~1.5%, other
# parallelizables perform ~5% worse, which is outweighed by ~25%
# improvement on Sandy Bridge. To balance regression on Westmere
# CTR mode was implemented with 6x aesenc interleave factor.

# April 2011
#
# Add aesni_xts_[en|de]crypt. Westmere spends 1.25 cycles processing
# one byte out of 8KB with 128-bit key, Sandy Bridge - 0.90. Just like
# in CTR mode AES instruction interleave factor was chosen to be 6x.

# November 2015
#
# Add aesni_ocb_[en|de]crypt. AES instruction interleave factor was
# chosen to be 6x.

######################################################################
# Current large-block performance in cycles per byte processed with
# 128-bit key (less is better).
#
#               CBC en-/decrypt CTR     XTS     ECB     OCB
# Westmere      3.77/1.25       1.25    1.25    1.26
# * Bridge      5.07/0.74       0.75    0.90    0.85    0.98
# Haswell       4.44/0.63       0.63    0.73    0.63    0.70
# Skylake       2.62/0.63       0.63    0.63    0.63
# Silvermont    5.75/3.54       3.56    4.12    3.87(*) 4.11
# Knights L     2.54/0.77       0.78    0.85    -       1.50
# Goldmont      3.82/1.26       1.26    1.29    1.29    1.50
# Bulldozer     5.77/0.70       0.72    0.90    0.70    0.95
# Ryzen         2.71/0.35       0.35    0.44    0.38    0.49
#
# (*)   Atom Silvermont ECB result is suboptimal because of penalties
#       incurred by operations on %xmm8-15. As ECB is not considered
#       critical, nothing was done to mitigate the problem.

# ---------------------------------------------------------------------
# Driver setup.
#
# Usage: <script> [flavour] output
#   flavour - assembler/ABI flavour handed through to x86_64-xlate.pl
#   output  - output file name; if the first argument contains a dot it
#             is taken to be the output file and no flavour is passed.
#
# Everything printed to STDOUT from here on is piped through
# x86_64-xlate.pl, which is looked up next to this script and then under
# ../../../perlasm/ relative to it.
# ---------------------------------------------------------------------

$PREFIX="aes_hw";	# if $PREFIX is set to "AES", the script
			# generates drop-in replacement for
			# crypto/aes/asm/aes-x86_64.pl:-)

$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Win64 is selected either by flavour name or by a ".asm" output suffix.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# $dir is the directory part of $0, including the trailing separator.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Fail loudly if the translator cannot be started; otherwise STDOUT would
# be aliased to an unopened handle and the generated code silently lost.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# Both arms currently select movups; the ternary is kept as the single
# place where the key-load instruction is chosen per $PREFIX.
$movkey = $PREFIX eq "aes_hw" ? "movups" : "movups";
@_4args=$win64?	("%rcx","%rdx","%r8", "%r9") :	# Win64 order
		("%rdi","%rsi","%rdx","%rcx");	# Unix order

$code=".text\n";
$code.=".extern OPENSSL_ia32cap_P\n";

$rounds="%eax";	# input to and changed by aesni_[en|de]cryptN !!!
# this is natural Unix argument order for public $PREFIX_[ecb|cbc]_encrypt ...
$inp="%rdi";
$out="%rsi";
$len="%rdx";
$key="%rcx";	# input to and changed by aesni_[en|de]cryptN !!!
$ivp="%r8";	# cbc, ctr, ...
# Scratch general-purpose registers used by the emitted code to keep
# copies of $rounds and $key, both of which the cipher templates modify.
$rnds_="%r10d"; # backup copy for $rounds
$key_="%r11"; # backup copy for $key

# %xmm register layout
$rndkey0="%xmm0"; $rndkey1="%xmm1";
$inout0="%xmm2"; $inout1="%xmm3";
$inout2="%xmm4"; $inout3="%xmm5";
$inout4="%xmm6"; $inout5="%xmm7";
$inout6="%xmm8"; $inout7="%xmm9";

# Note that $in2/$in1/$in0/$iv name the same registers (%xmm6..%xmm9) as
# $inout4..$inout7 above.
$in2="%xmm6"; $in1="%xmm7"; # used in CBC decrypt, CTR, ...
$in0="%xmm8"; $iv="%xmm9";

# Inline version of internal aesni_[en|de]crypt1.
#
# Why folded loop? Because aes[enc|dec] is slow enough to accommodate
# cycles which take care of loop variables...
#
# aesni_generate1($p,$key,$rounds,$inout,$ivec) appends a single-block
# cipher sequence to $code:
#   $p      - "enc" or "dec"; spliced into the aes${p}/aes${p}last
#             mnemonics and into the loop label
#   $key    - register holding the key schedule pointer (advanced by the
#             emitted code)
#   $rounds - register holding the round count (decremented to zero by
#             the emitted code)
#   $inout  - xmm register with the block to process; defaults to $inout0
#   $ivec   - optional xmm register; when given, round key 0 is xored
#             into $ivec and $ivec into $inout, instead of xoring round
#             key 0 straight into $inout
# $sn is bumped on every call so that each expansion gets its own
# .Loop_${p}1_$sn label.
{ my $sn;
sub aesni_generate1 {
my ($p,$key,$rounds,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout));
++$sn;
$code.=<<___;
	$movkey ($key),$rndkey0
	$movkey 16($key),$rndkey1
___
$code.=<<___ if (defined($ivec));
	xorps $rndkey0,$ivec
	lea 32($key),$key
	xorps $ivec,$inout
___
$code.=<<___ if (!defined($ivec));
	lea 32($key),$key
	xorps $rndkey0,$inout
___
$code.=<<___;
.Loop_${p}1_$sn:
	aes${p} $rndkey1,$inout
	dec $rounds
	$movkey ($key),$rndkey1
	lea 16($key),$key
	jnz .Loop_${p}1_$sn # loop body is 16 bytes
	aes${p}last $rndkey1,$inout
___
}}
# void $PREFIX_[en|de]crypt (const void *inp,void *out,const AES_KEY *key);
#
# Emits the public single-block ${PREFIX}_encrypt and ${PREFIX}_decrypt
# entry points. $inp/$out/$key are rebound, for this scope only, to the
# first three argument registers of the selected ABI (@_4args). The
# emitted code reads the round count from offset 240 of the key
# structure, runs the inline single-block sequence, and zeroes the xmm
# registers it used.
{ my ($inp,$out,$key) = @_4args;

$code.=<<___;
.globl ${PREFIX}_encrypt
.type ${PREFIX}_encrypt,\@abi-omnipotent
.align 16
${PREFIX}_encrypt:
	movups ($inp),$inout0 # load input
	mov 240($key),$rounds # key->rounds
___
	&aesni_generate1("enc",$key,$rounds);
$code.=<<___;
	pxor $rndkey0,$rndkey0 # clear register bank
	pxor $rndkey1,$rndkey1
	movups $inout0,($out) # output
	pxor $inout0,$inout0
	ret
.size ${PREFIX}_encrypt,.-${PREFIX}_encrypt

.globl ${PREFIX}_decrypt
.type ${PREFIX}_decrypt,\@abi-omnipotent
.align 16
${PREFIX}_decrypt:
	movups ($inp),$inout0 # load input
	mov 240($key),$rounds # key->rounds
___
	&aesni_generate1("dec",$key,$rounds);
$code.=<<___;
	pxor $rndkey0,$rndkey0 # clear register bank
	pxor $rndkey1,$rndkey1
	movups $inout0,($out) # output
	pxor $inout0,$inout0
	ret
.size ${PREFIX}_decrypt, .-${PREFIX}_decrypt
___
}

# _aesni_[en|de]cryptN are private interfaces, N denotes interleave
# factor. Why 3x subroutine were originally used in loops? Even though
# aes[enc|dec] latency was originally 6, it could be scheduled only
# every *2nd* cycle. Thus 3x interleave was the one providing optimal
# utilization, i.e. when subroutine's throughput is virtually same as
# of non-interleaved subroutine [for number of input blocks up to 3].
# This is why it originally made no sense to implement 2x subroutine.
# But times change and it became appropriate to spend extra 192 bytes
# on 2x subroutine on Atom Silvermont account. For processors that
# can schedule aes[enc|dec] every cycle optimal interleave factor
# equals to corresponding instructions latency. 8x is optimal for
# * Bridge and "super-optimal" for other Intel CPUs...

# aesni_generate2($dir) appends the private routine _aesni_${dir}rypt2
# to $code; $dir is "enc" or "dec", giving _aesni_encrypt2 or
# _aesni_decrypt2. The template scales $rounds by 16, points $key past
# the round keys it will consume, and then walks the schedule with a
# negative index in %rax that counts up to zero, two round keys per loop
# iteration.
sub aesni_generate2 {
my $dir=shift;
# As already mentioned it takes in $key and $rounds, which are *not*
# preserved. $inout[0-1] is cipher/clear text...
$code.=<<___;
.type _aesni_${dir}rypt2,\@abi-omnipotent
.align 16
_aesni_${dir}rypt2:
	$movkey ($key),$rndkey0
	shl \$4,$rounds
	$movkey 16($key),$rndkey1
	xorps $rndkey0,$inout0
	xorps $rndkey0,$inout1
	$movkey 32($key),$rndkey0
	lea 32($key,$rounds),$key
	neg %rax # $rounds
	add \$16,%rax

.L${dir}_loop2:
	aes${dir} $rndkey1,$inout0
	aes${dir} $rndkey1,$inout1
	$movkey ($key,%rax),$rndkey1
	add \$32,%rax
	aes${dir} $rndkey0,$inout0
	aes${dir} $rndkey0,$inout1
	$movkey -16($key,%rax),$rndkey0
	jnz .L${dir}_loop2

	aes${dir} $rndkey1,$inout0
	aes${dir} $rndkey1,$inout1
	aes${dir}last $rndkey0,$inout0
	aes${dir}last $rndkey0,$inout1
	ret
.size _aesni_${dir}rypt2,.-_aesni_${dir}rypt2
___
}
# aesni_generate3($dir) appends _aesni_${dir}rypt3 to $code. Same
# structure as the 2x template, with a third block in $inout2.
sub aesni_generate3 {
my $dir=shift;
# As already mentioned it takes in $key and $rounds, which are *not*
# preserved. $inout[0-2] is cipher/clear text...
$code.=<<___;
.type _aesni_${dir}rypt3,\@abi-omnipotent
.align 16
_aesni_${dir}rypt3:
	$movkey ($key),$rndkey0
	shl \$4,$rounds
	$movkey 16($key),$rndkey1
	xorps $rndkey0,$inout0
	xorps $rndkey0,$inout1
	xorps $rndkey0,$inout2
	$movkey 32($key),$rndkey0
	lea 32($key,$rounds),$key
	neg %rax # $rounds
	add \$16,%rax

.L${dir}_loop3:
	aes${dir} $rndkey1,$inout0
	aes${dir} $rndkey1,$inout1
	aes${dir} $rndkey1,$inout2
	$movkey ($key,%rax),$rndkey1
	add \$32,%rax
	aes${dir} $rndkey0,$inout0
	aes${dir} $rndkey0,$inout1
	aes${dir} $rndkey0,$inout2
	$movkey -16($key,%rax),$rndkey0
	jnz .L${dir}_loop3

	aes${dir} $rndkey1,$inout0
	aes${dir} $rndkey1,$inout1
	aes${dir} $rndkey1,$inout2
	aes${dir}last $rndkey0,$inout0
	aes${dir}last $rndkey0,$inout1
	aes${dir}last $rndkey0,$inout2
	ret
.size _aesni_${dir}rypt3,.-_aesni_${dir}rypt3
___
}
# 4x interleave is implemented to improve small block performance,
# most notably
# [and naturally] 4 block by ~30%. One can argue that one
# should have implemented 5x as well, but improvement would be <20%,
# so it's not worth it...
#
# aesni_generate4($dir) appends _aesni_${dir}rypt4 to $code; $dir is
# "enc" or "dec". Same structure as the 2x/3x templates, with four
# blocks in $inout0..$inout3.
# NOTE(review): the ".byte 0x0f,0x1f,0x00" in the prologue looks like a
# 3-byte NOP used as padding - confirm.
sub aesni_generate4 {
my $dir=shift;
# As already mentioned it takes in $key and $rounds, which are *not*
# preserved. $inout[0-3] is cipher/clear text...
$code.=<<___;
.type _aesni_${dir}rypt4,\@abi-omnipotent
.align 16
_aesni_${dir}rypt4:
	$movkey ($key),$rndkey0
	shl \$4,$rounds
	$movkey 16($key),$rndkey1
	xorps $rndkey0,$inout0
	xorps $rndkey0,$inout1
	xorps $rndkey0,$inout2
	xorps $rndkey0,$inout3
	$movkey 32($key),$rndkey0
	lea 32($key,$rounds),$key
	neg %rax # $rounds
	.byte 0x0f,0x1f,0x00
	add \$16,%rax

.L${dir}_loop4:
	aes${dir} $rndkey1,$inout0
	aes${dir} $rndkey1,$inout1
	aes${dir} $rndkey1,$inout2
	aes${dir} $rndkey1,$inout3
	$movkey ($key,%rax),$rndkey1
	add \$32,%rax
	aes${dir} $rndkey0,$inout0
	aes${dir} $rndkey0,$inout1
	aes${dir} $rndkey0,$inout2
	aes${dir} $rndkey0,$inout3
	$movkey -16($key,%rax),$rndkey0
	jnz .L${dir}_loop4

	aes${dir} $rndkey1,$inout0
	aes${dir} $rndkey1,$inout1
	aes${dir} $rndkey1,$inout2
	aes${dir} $rndkey1,$inout3
	aes${dir}last $rndkey0,$inout0
	aes${dir}last $rndkey0,$inout1
	aes${dir}last $rndkey0,$inout2
	aes${dir}last $rndkey0,$inout3
	ret
.size _aesni_${dir}rypt4,.-_aesni_${dir}rypt4
___
}
# aesni_generate6($dir) appends _aesni_${dir}rypt6 to $code. Unlike the
# smaller templates, the prologue interleaves the round-0 xors with the
# first aes${dir} round on $inout0..$inout2 and then jumps into the
# middle of the loop body (.L${dir}_loop6_enter), where the same round is
# applied to $inout3..$inout5.
sub aesni_generate6 {
my $dir=shift;
# As already mentioned it takes in $key and $rounds, which are *not*
# preserved. $inout[0-5] is cipher/clear text...
$code.=<<___;
.type _aesni_${dir}rypt6,\@abi-omnipotent
.align 16
_aesni_${dir}rypt6:
	$movkey ($key),$rndkey0
	shl \$4,$rounds
	$movkey 16($key),$rndkey1
	xorps $rndkey0,$inout0
	pxor $rndkey0,$inout1
	pxor $rndkey0,$inout2
	aes${dir} $rndkey1,$inout0
	lea 32($key,$rounds),$key
	neg %rax # $rounds
	aes${dir} $rndkey1,$inout1
	pxor $rndkey0,$inout3
	pxor $rndkey0,$inout4
	aes${dir} $rndkey1,$inout2
	pxor $rndkey0,$inout5
	$movkey ($key,%rax),$rndkey0
	add \$16,%rax
	jmp .L${dir}_loop6_enter
.align 16
.L${dir}_loop6:
	aes${dir} $rndkey1,$inout0
	aes${dir} $rndkey1,$inout1
	aes${dir} $rndkey1,$inout2
.L${dir}_loop6_enter:
	aes${dir} $rndkey1,$inout3
	aes${dir} $rndkey1,$inout4
	aes${dir} $rndkey1,$inout5
	$movkey ($key,%rax),$rndkey1
	add \$32,%rax
	aes${dir} $rndkey0,$inout0
	aes${dir} $rndkey0,$inout1
	aes${dir} $rndkey0,$inout2
	aes${dir} $rndkey0,$inout3
	aes${dir} $rndkey0,$inout4
	aes${dir} $rndkey0,$inout5
	$movkey -16($key,%rax),$rndkey0
	jnz .L${dir}_loop6

	aes${dir} $rndkey1,$inout0
	aes${dir} $rndkey1,$inout1
	aes${dir} $rndkey1,$inout2
	aes${dir} $rndkey1,$inout3
	aes${dir} $rndkey1,$inout4
	aes${dir} $rndkey1,$inout5
	aes${dir}last $rndkey0,$inout0
	aes${dir}last $rndkey0,$inout1
	aes${dir}last $rndkey0,$inout2
	aes${dir}last $rndkey0,$inout3
	aes${dir}last $rndkey0,$inout4
	aes${dir}last $rndkey0,$inout5
	ret
.size _aesni_${dir}rypt6,.-_aesni_${dir}rypt6
___
}
# aesni_generate8($dir) appends _aesni_${dir}rypt8 to $code. The prologue
# applies the first aes${dir} round to $inout0 and $inout1 while the
# round-0 xors are still being issued, then jumps to
# .L${dir}_loop8_inner, where that round is applied to $inout2..$inout7.
# The .L${dir}_loop8_enter label is not referenced inside this
# subroutine; presumably it is an entry point for code elsewhere in the
# file - TODO confirm.
sub aesni_generate8 {
my $dir=shift;
# As already mentioned it takes in $key and $rounds, which are *not*
# preserved. $inout[0-7] is cipher/clear text...
$code.=<<___;
.type _aesni_${dir}rypt8,\@abi-omnipotent
.align 16
_aesni_${dir}rypt8:
	$movkey ($key),$rndkey0
	shl \$4,$rounds
	$movkey 16($key),$rndkey1
	xorps $rndkey0,$inout0
	xorps $rndkey0,$inout1
	pxor $rndkey0,$inout2
	pxor $rndkey0,$inout3
	pxor $rndkey0,$inout4
	lea 32($key,$rounds),$key
	neg %rax # $rounds
	aes${dir} $rndkey1,$inout0
	pxor $rndkey0,$inout5
	pxor $rndkey0,$inout6
	aes${dir} $rndkey1,$inout1
	pxor $rndkey0,$inout7
	$movkey ($key,%rax),$rndkey0
	add \$16,%rax
	jmp .L${dir}_loop8_inner
.align 16
.L${dir}_loop8:
	aes${dir} $rndkey1,$inout0
	aes${dir} $rndkey1,$inout1
.L${dir}_loop8_inner:
	aes${dir} $rndkey1,$inout2
	aes${dir} $rndkey1,$inout3
	aes${dir} $rndkey1,$inout4
	aes${dir} $rndkey1,$inout5
	aes${dir} $rndkey1,$inout6
	aes${dir} $rndkey1,$inout7
.L${dir}_loop8_enter:
	$movkey ($key,%rax),$rndkey1
	add \$32,%rax
	aes${dir} $rndkey0,$inout0
	aes${dir} $rndkey0,$inout1
	aes${dir} $rndkey0,$inout2
	aes${dir} $rndkey0,$inout3
	aes${dir} $rndkey0,$inout4
	aes${dir} $rndkey0,$inout5
	aes${dir} $rndkey0,$inout6
	aes${dir} $rndkey0,$inout7
	$movkey -16($key,%rax),$rndkey0
	jnz .L${dir}_loop8

	aes${dir} $rndkey1,$inout0
	aes${dir} $rndkey1,$inout1
	aes${dir} $rndkey1,$inout2
	aes${dir} $rndkey1,$inout3
	aes${dir} $rndkey1,$inout4
	aes${dir} $rndkey1,$inout5
	aes${dir} $rndkey1,$inout6
	aes${dir} $rndkey1,$inout7
	aes${dir}last $rndkey0,$inout0
	aes${dir}last $rndkey0,$inout1
	aes${dir}last $rndkey0,$inout2
	aes${dir}last $rndkey0,$inout3
	aes${dir}last $rndkey0,$inout4
	aes${dir}last $rndkey0,$inout5
	aes${dir}last $rndkey0,$inout6
	aes${dir}last $rndkey0,$inout7
	ret
.size _aesni_${dir}rypt8,.-_aesni_${dir}rypt8
___
}
# Instantiate the interleaved helpers. The "dec" variant is emitted
# unconditionally, the "enc" variant only when $PREFIX is "aes_hw".
# (The last statement below continues on the following source line.)
&aesni_generate2("enc") if ($PREFIX eq "aes_hw");
&aesni_generate2("dec");
&aesni_generate3("enc") if ($PREFIX eq
"aes_hw"); 583&aesni_generate3("dec"); 584&aesni_generate4("enc") if ($PREFIX eq "aes_hw"); 585&aesni_generate4("dec"); 586&aesni_generate6("enc") if ($PREFIX eq "aes_hw"); 587&aesni_generate6("dec"); 588&aesni_generate8("enc") if ($PREFIX eq "aes_hw"); 589&aesni_generate8("dec"); 590 591if ($PREFIX eq "aes_hw") { 592######################################################################## 593# void aesni_ecb_encrypt (const void *in, void *out, 594# size_t length, const AES_KEY *key, 595# int enc); 596$code.=<<___; 597.globl ${PREFIX}_ecb_encrypt 598.type ${PREFIX}_ecb_encrypt,\@function,5 599.align 16 600${PREFIX}_ecb_encrypt: 601___ 602$code.=<<___ if ($win64); 603 lea -0x58(%rsp),%rsp 604 movaps %xmm6,(%rsp) # offload $inout4..7 605 movaps %xmm7,0x10(%rsp) 606 movaps %xmm8,0x20(%rsp) 607 movaps %xmm9,0x30(%rsp) 608.Lecb_enc_body: 609___ 610$code.=<<___; 611 and \$-16,$len # if ($len<16) 612 jz .Lecb_ret # return 613 614 mov 240($key),$rounds # key->rounds 615 $movkey ($key),$rndkey0 616 mov $key,$key_ # backup $key 617 mov $rounds,$rnds_ # backup $rounds 618 test %r8d,%r8d # 5th argument 619 jz .Lecb_decrypt 620#--------------------------- ECB ENCRYPT ------------------------------# 621 cmp \$0x80,$len # if ($len<8*16) 622 jb .Lecb_enc_tail # short input 623 624 movdqu ($inp),$inout0 # load 8 input blocks 625 movdqu 0x10($inp),$inout1 626 movdqu 0x20($inp),$inout2 627 movdqu 0x30($inp),$inout3 628 movdqu 0x40($inp),$inout4 629 movdqu 0x50($inp),$inout5 630 movdqu 0x60($inp),$inout6 631 movdqu 0x70($inp),$inout7 632 lea 0x80($inp),$inp # $inp+=8*16 633 sub \$0x80,$len # $len-=8*16 (can be zero) 634 jmp .Lecb_enc_loop8_enter 635.align 16 636.Lecb_enc_loop8: 637 movups $inout0,($out) # store 8 output blocks 638 mov $key_,$key # restore $key 639 movdqu ($inp),$inout0 # load 8 input blocks 640 mov $rnds_,$rounds # restore $rounds 641 movups $inout1,0x10($out) 642 movdqu 0x10($inp),$inout1 643 movups $inout2,0x20($out) 644 movdqu 0x20($inp),$inout2 645 movups 
$inout3,0x30($out) 646 movdqu 0x30($inp),$inout3 647 movups $inout4,0x40($out) 648 movdqu 0x40($inp),$inout4 649 movups $inout5,0x50($out) 650 movdqu 0x50($inp),$inout5 651 movups $inout6,0x60($out) 652 movdqu 0x60($inp),$inout6 653 movups $inout7,0x70($out) 654 lea 0x80($out),$out # $out+=8*16 655 movdqu 0x70($inp),$inout7 656 lea 0x80($inp),$inp # $inp+=8*16 657.Lecb_enc_loop8_enter: 658 659 call _aesni_encrypt8 660 661 sub \$0x80,$len 662 jnc .Lecb_enc_loop8 # loop if $len-=8*16 didn't borrow 663 664 movups $inout0,($out) # store 8 output blocks 665 mov $key_,$key # restore $key 666 movups $inout1,0x10($out) 667 mov $rnds_,$rounds # restore $rounds 668 movups $inout2,0x20($out) 669 movups $inout3,0x30($out) 670 movups $inout4,0x40($out) 671 movups $inout5,0x50($out) 672 movups $inout6,0x60($out) 673 movups $inout7,0x70($out) 674 lea 0x80($out),$out # $out+=8*16 675 add \$0x80,$len # restore real remaining $len 676 jz .Lecb_ret # done if ($len==0) 677 678.Lecb_enc_tail: # $len is less than 8*16 679 movups ($inp),$inout0 680 cmp \$0x20,$len 681 jb .Lecb_enc_one 682 movups 0x10($inp),$inout1 683 je .Lecb_enc_two 684 movups 0x20($inp),$inout2 685 cmp \$0x40,$len 686 jb .Lecb_enc_three 687 movups 0x30($inp),$inout3 688 je .Lecb_enc_four 689 movups 0x40($inp),$inout4 690 cmp \$0x60,$len 691 jb .Lecb_enc_five 692 movups 0x50($inp),$inout5 693 je .Lecb_enc_six 694 movdqu 0x60($inp),$inout6 695 xorps $inout7,$inout7 696 call _aesni_encrypt8 697 movups $inout0,($out) # store 7 output blocks 698 movups $inout1,0x10($out) 699 movups $inout2,0x20($out) 700 movups $inout3,0x30($out) 701 movups $inout4,0x40($out) 702 movups $inout5,0x50($out) 703 movups $inout6,0x60($out) 704 jmp .Lecb_ret 705.align 16 706.Lecb_enc_one: 707___ 708 &aesni_generate1("enc",$key,$rounds); 709$code.=<<___; 710 movups $inout0,($out) # store one output block 711 jmp .Lecb_ret 712.align 16 713.Lecb_enc_two: 714 call _aesni_encrypt2 715 movups $inout0,($out) # store 2 output blocks 716 movups 
$inout1,0x10($out) 717 jmp .Lecb_ret 718.align 16 719.Lecb_enc_three: 720 call _aesni_encrypt3 721 movups $inout0,($out) # store 3 output blocks 722 movups $inout1,0x10($out) 723 movups $inout2,0x20($out) 724 jmp .Lecb_ret 725.align 16 726.Lecb_enc_four: 727 call _aesni_encrypt4 728 movups $inout0,($out) # store 4 output blocks 729 movups $inout1,0x10($out) 730 movups $inout2,0x20($out) 731 movups $inout3,0x30($out) 732 jmp .Lecb_ret 733.align 16 734.Lecb_enc_five: 735 xorps $inout5,$inout5 736 call _aesni_encrypt6 737 movups $inout0,($out) # store 5 output blocks 738 movups $inout1,0x10($out) 739 movups $inout2,0x20($out) 740 movups $inout3,0x30($out) 741 movups $inout4,0x40($out) 742 jmp .Lecb_ret 743.align 16 744.Lecb_enc_six: 745 call _aesni_encrypt6 746 movups $inout0,($out) # store 6 output blocks 747 movups $inout1,0x10($out) 748 movups $inout2,0x20($out) 749 movups $inout3,0x30($out) 750 movups $inout4,0x40($out) 751 movups $inout5,0x50($out) 752 jmp .Lecb_ret 753#--------------------------- ECB DECRYPT ------------------------------# 754.align 16 755.Lecb_decrypt: 756 cmp \$0x80,$len # if ($len<8*16) 757 jb .Lecb_dec_tail # short input 758 759 movdqu ($inp),$inout0 # load 8 input blocks 760 movdqu 0x10($inp),$inout1 761 movdqu 0x20($inp),$inout2 762 movdqu 0x30($inp),$inout3 763 movdqu 0x40($inp),$inout4 764 movdqu 0x50($inp),$inout5 765 movdqu 0x60($inp),$inout6 766 movdqu 0x70($inp),$inout7 767 lea 0x80($inp),$inp # $inp+=8*16 768 sub \$0x80,$len # $len-=8*16 (can be zero) 769 jmp .Lecb_dec_loop8_enter 770.align 16 771.Lecb_dec_loop8: 772 movups $inout0,($out) # store 8 output blocks 773 mov $key_,$key # restore $key 774 movdqu ($inp),$inout0 # load 8 input blocks 775 mov $rnds_,$rounds # restore $rounds 776 movups $inout1,0x10($out) 777 movdqu 0x10($inp),$inout1 778 movups $inout2,0x20($out) 779 movdqu 0x20($inp),$inout2 780 movups $inout3,0x30($out) 781 movdqu 0x30($inp),$inout3 782 movups $inout4,0x40($out) 783 movdqu 0x40($inp),$inout4 784 movups 
$inout5,0x50($out) 785 movdqu 0x50($inp),$inout5 786 movups $inout6,0x60($out) 787 movdqu 0x60($inp),$inout6 788 movups $inout7,0x70($out) 789 lea 0x80($out),$out # $out+=8*16 790 movdqu 0x70($inp),$inout7 791 lea 0x80($inp),$inp # $inp+=8*16 792.Lecb_dec_loop8_enter: 793 794 call _aesni_decrypt8 795 796 $movkey ($key_),$rndkey0 797 sub \$0x80,$len 798 jnc .Lecb_dec_loop8 # loop if $len-=8*16 didn't borrow 799 800 movups $inout0,($out) # store 8 output blocks 801 pxor $inout0,$inout0 # clear register bank 802 mov $key_,$key # restore $key 803 movups $inout1,0x10($out) 804 pxor $inout1,$inout1 805 mov $rnds_,$rounds # restore $rounds 806 movups $inout2,0x20($out) 807 pxor $inout2,$inout2 808 movups $inout3,0x30($out) 809 pxor $inout3,$inout3 810 movups $inout4,0x40($out) 811 pxor $inout4,$inout4 812 movups $inout5,0x50($out) 813 pxor $inout5,$inout5 814 movups $inout6,0x60($out) 815 pxor $inout6,$inout6 816 movups $inout7,0x70($out) 817 pxor $inout7,$inout7 818 lea 0x80($out),$out # $out+=8*16 819 add \$0x80,$len # restore real remaining $len 820 jz .Lecb_ret # done if ($len==0) 821 822.Lecb_dec_tail: 823 movups ($inp),$inout0 824 cmp \$0x20,$len 825 jb .Lecb_dec_one 826 movups 0x10($inp),$inout1 827 je .Lecb_dec_two 828 movups 0x20($inp),$inout2 829 cmp \$0x40,$len 830 jb .Lecb_dec_three 831 movups 0x30($inp),$inout3 832 je .Lecb_dec_four 833 movups 0x40($inp),$inout4 834 cmp \$0x60,$len 835 jb .Lecb_dec_five 836 movups 0x50($inp),$inout5 837 je .Lecb_dec_six 838 movups 0x60($inp),$inout6 839 $movkey ($key),$rndkey0 840 xorps $inout7,$inout7 841 call _aesni_decrypt8 842 movups $inout0,($out) # store 7 output blocks 843 pxor $inout0,$inout0 # clear register bank 844 movups $inout1,0x10($out) 845 pxor $inout1,$inout1 846 movups $inout2,0x20($out) 847 pxor $inout2,$inout2 848 movups $inout3,0x30($out) 849 pxor $inout3,$inout3 850 movups $inout4,0x40($out) 851 pxor $inout4,$inout4 852 movups $inout5,0x50($out) 853 pxor $inout5,$inout5 854 movups $inout6,0x60($out) 855 
pxor $inout6,$inout6 856 pxor $inout7,$inout7 857 jmp .Lecb_ret 858.align 16 859.Lecb_dec_one: 860___ 861 &aesni_generate1("dec",$key,$rounds); 862$code.=<<___; 863 movups $inout0,($out) # store one output block 864 pxor $inout0,$inout0 # clear register bank 865 jmp .Lecb_ret 866.align 16 867.Lecb_dec_two: 868 call _aesni_decrypt2 869 movups $inout0,($out) # store 2 output blocks 870 pxor $inout0,$inout0 # clear register bank 871 movups $inout1,0x10($out) 872 pxor $inout1,$inout1 873 jmp .Lecb_ret 874.align 16 875.Lecb_dec_three: 876 call _aesni_decrypt3 877 movups $inout0,($out) # store 3 output blocks 878 pxor $inout0,$inout0 # clear register bank 879 movups $inout1,0x10($out) 880 pxor $inout1,$inout1 881 movups $inout2,0x20($out) 882 pxor $inout2,$inout2 883 jmp .Lecb_ret 884.align 16 885.Lecb_dec_four: 886 call _aesni_decrypt4 887 movups $inout0,($out) # store 4 output blocks 888 pxor $inout0,$inout0 # clear register bank 889 movups $inout1,0x10($out) 890 pxor $inout1,$inout1 891 movups $inout2,0x20($out) 892 pxor $inout2,$inout2 893 movups $inout3,0x30($out) 894 pxor $inout3,$inout3 895 jmp .Lecb_ret 896.align 16 897.Lecb_dec_five: 898 xorps $inout5,$inout5 899 call _aesni_decrypt6 900 movups $inout0,($out) # store 5 output blocks 901 pxor $inout0,$inout0 # clear register bank 902 movups $inout1,0x10($out) 903 pxor $inout1,$inout1 904 movups $inout2,0x20($out) 905 pxor $inout2,$inout2 906 movups $inout3,0x30($out) 907 pxor $inout3,$inout3 908 movups $inout4,0x40($out) 909 pxor $inout4,$inout4 910 pxor $inout5,$inout5 911 jmp .Lecb_ret 912.align 16 913.Lecb_dec_six: 914 call _aesni_decrypt6 915 movups $inout0,($out) # store 6 output blocks 916 pxor $inout0,$inout0 # clear register bank 917 movups $inout1,0x10($out) 918 pxor $inout1,$inout1 919 movups $inout2,0x20($out) 920 pxor $inout2,$inout2 921 movups $inout3,0x30($out) 922 pxor $inout3,$inout3 923 movups $inout4,0x40($out) 924 pxor $inout4,$inout4 925 movups $inout5,0x50($out) 926 pxor $inout5,$inout5 927 
.Lecb_ret:
	xorps	$rndkey0,$rndkey0	# %xmm0
	pxor	$rndkey1,$rndkey1
___
$code.=<<___ if ($win64);
	movaps	(%rsp),%xmm6
	movaps	%xmm0,(%rsp)	# clear stack
	movaps	0x10(%rsp),%xmm7
	movaps	%xmm0,0x10(%rsp)
	movaps	0x20(%rsp),%xmm8
	movaps	%xmm0,0x20(%rsp)
	movaps	0x30(%rsp),%xmm9
	movaps	%xmm0,0x30(%rsp)
	lea	0x58(%rsp),%rsp
.Lecb_enc_ret:
___
$code.=<<___;
	ret
.size	${PREFIX}_ecb_encrypt,.-${PREFIX}_ecb_encrypt
___

{
######################################################################
# void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
#                         size_t blocks, const AES_KEY *key,
#                         const char *ivec,char *cmac);
#
# Handles only complete blocks, operates on 64-bit counter and
# does not update *ivec! Nor does it finalize CMAC value
# (see engine/eng_aesni.c for details)
#
# Implementation notes
#
# * Every iteration handles one 16-byte block and pushes two states
#   through the AES rounds side by side: the counter block ($inout0)
#   and the running MAC ($inout1), so each round key is loaded once
#   and used for both.
# * $iv holds the counter after a pshufb with .Lbswap_mask; it is
#   advanced with paddq by .Lincrement64 and shuffled with the same
#   mask again whenever a fresh counter block is copied to $inout0.
#   Both constants are defined elsewhere in this file; presumably a
#   byte-reversal mask and a 64-bit "+1" -- confirm there.
# * The "twisted" round counter is a negative byte offset kept in
#   %rax (backup in %r10). It indexes round keys relative to the end
#   of the key schedule and is stepped by 32 until it reaches zero.
#   This relies on $rounds/$rnds_ being %eax/%r10d, which is assigned
#   outside this section -- TODO confirm.
# * Before returning, the XMM registers that held round keys, counter,
#   MAC and data are zeroed; the Win64 epilogue also overwrites the
#   stack slots it restores from with the already-zeroed %xmm0.
#
{
my $cmac="%r9";	# 6th argument

# %xmm6, %xmm7 and %xmm9 (together with %xmm8) are the registers the
# Win64 prologues below save and the epilogues restore.
my $increment="%xmm9";
my $iv="%xmm6";
my $bswap_mask="%xmm7";

# ---- encrypt: out = in ^ E(counter), cmac = E(cmac ^ in) ------------
$code.=<<___;
.globl	${PREFIX}_ccm64_encrypt_blocks
.type	${PREFIX}_ccm64_encrypt_blocks,\@function,6
.align	16
${PREFIX}_ccm64_encrypt_blocks:
___
# Win64 only: allocate 0x58 bytes and save %xmm6-%xmm9, which the
# Win64 calling convention treats as callee-saved.
$code.=<<___ if ($win64);
	lea	-0x58(%rsp),%rsp
	movaps	%xmm6,(%rsp)	# $iv
	movaps	%xmm7,0x10(%rsp)	# $bswap_mask
	movaps	%xmm8,0x20(%rsp)	# $in0
	movaps	%xmm9,0x30(%rsp)	# $increment
.Lccm64_enc_body:
___
# Set-up followed by the per-block loop. The flags written by
# "dec $len" are still live at the closing jnz: only SSE/AES moves and
# lea instructions sit in between, and none of them touch the flags.
$code.=<<___;
	mov	240($key),$rounds	# key->rounds
	movdqu	($ivp),$iv
	movdqa	.Lincrement64(%rip),$increment
	movdqa	.Lbswap_mask(%rip),$bswap_mask

	shl	\$4,$rounds
	mov	\$16,$rnds_
	lea	0($key),$key_
	movdqu	($cmac),$inout1
	movdqa	$iv,$inout0
	lea	32($key,$rounds),$key	# end of key schedule
	pshufb	$bswap_mask,$iv
	sub	%rax,%r10	# twisted $rounds
	jmp	.Lccm64_enc_outer
.align	16
.Lccm64_enc_outer:
	$movkey	($key_),$rndkey0
	mov	%r10,%rax
	movups	($inp),$in0	# load inp

	xorps	$rndkey0,$inout0	# counter
	$movkey	16($key_),$rndkey1
	xorps	$in0,$rndkey0
	xorps	$rndkey0,$inout1	# cmac^=inp
	$movkey	32($key_),$rndkey0

.Lccm64_enc2_loop:
	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	$movkey	($key,%rax),$rndkey1
	add	\$32,%rax
	aesenc	$rndkey0,$inout0
	aesenc	$rndkey0,$inout1
	$movkey	-16($key,%rax),$rndkey0
	jnz	.Lccm64_enc2_loop
	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	paddq	$increment,$iv
	dec	$len	# $len-- ($len is in blocks)
	aesenclast	$rndkey0,$inout0
	aesenclast	$rndkey0,$inout1

	lea	16($inp),$inp
	xorps	$inout0,$in0	# inp ^= E(iv)
	movdqa	$iv,$inout0
	movups	$in0,($out)	# save output
	pshufb	$bswap_mask,$inout0
	lea	16($out),$out	# $out+=16
	jnz	.Lccm64_enc_outer	# loop if ($len!=0)

	pxor	$rndkey0,$rndkey0	# clear register bank
	pxor	$rndkey1,$rndkey1
	pxor	$inout0,$inout0
	movups	$inout1,($cmac)	# store resulting mac
	pxor	$inout1,$inout1
	pxor	$in0,$in0
	pxor	$iv,$iv
___
# Win64 only: restore %xmm6-%xmm9 and wipe each save slot with %xmm0,
# which was zeroed just above as $rndkey0.
$code.=<<___ if ($win64);
	movaps	(%rsp),%xmm6
	movaps	%xmm0,(%rsp)	# clear stack
	movaps	0x10(%rsp),%xmm7
	movaps	%xmm0,0x10(%rsp)
	movaps	0x20(%rsp),%xmm8
	movaps	%xmm0,0x20(%rsp)
	movaps	0x30(%rsp),%xmm9
	movaps	%xmm0,0x30(%rsp)
	lea	0x58(%rsp),%rsp
.Lccm64_enc_ret:
___
$code.=<<___;
	ret
.size	${PREFIX}_ccm64_encrypt_blocks,.-${PREFIX}_ccm64_encrypt_blocks
___
######################################################################
# ---- decrypt: out = in ^ E(counter), cmac = E(cmac ^ out) -----------
# The MAC is taken over the recovered output, so it lags the keystream
# by one block: the first counter is encrypted on its own, every later
# iteration encrypts the next counter together with the MAC update for
# the block just written, and the last MAC update happens alone at
# .Lccm64_dec_break.
$code.=<<___;
.globl	${PREFIX}_ccm64_decrypt_blocks
.type	${PREFIX}_ccm64_decrypt_blocks,\@function,6
.align	16
${PREFIX}_ccm64_decrypt_blocks:
___
# Win64 only: same 0x58-byte save area as the encrypt routine.
$code.=<<___ if ($win64);
	lea	-0x58(%rsp),%rsp
	movaps	%xmm6,(%rsp)	# $iv
	movaps	%xmm7,0x10(%rsp)	# $bswap_mask
	movaps	%xmm8,0x20(%rsp)	# $in0
	movaps	%xmm9,0x30(%rsp)	# $increment
.Lccm64_dec_body:
___
$code.=<<___;
	mov	240($key),$rounds	# key->rounds
	movups	($ivp),$iv
	movdqu	($cmac),$inout1
	movdqa	.Lincrement64(%rip),$increment
	movdqa	.Lbswap_mask(%rip),$bswap_mask

	movaps	$iv,$inout0
	mov	$rounds,$rnds_
	mov	$key,$key_
	pshufb	$bswap_mask,$iv
___
	# E(first counter). aesni_generate1 is defined elsewhere in this
	# file; presumably it encrypts $inout0 in place when no register
	# is named -- the loop below consumes $inout0 as E(iv).
	&aesni_generate1("enc",$key,$rounds);
$code.=<<___;
	shl	\$4,$rnds_
	mov	\$16,$rounds
	movups	($inp),$in0	# load inp
	paddq	$increment,$iv
	lea	16($inp),$inp	# $inp+=16
	sub	%r10,%rax	# twisted $rounds
	lea	32($key_,$rnds_),$key	# end of key schedule
	mov	%rax,%r10
	jmp	.Lccm64_dec_outer
.align	16
.Lccm64_dec_outer:
	xorps	$inout0,$in0	# inp ^= E(iv)
	movdqa	$iv,$inout0
	movups	$in0,($out)	# save output
	lea	16($out),$out	# $out+=16
	pshufb	$bswap_mask,$inout0

	sub	\$1,$len	# $len-- ($len is in blocks)
	jz	.Lccm64_dec_break	# if ($len==0) break

	$movkey	($key_),$rndkey0
	mov	%r10,%rax
	$movkey	16($key_),$rndkey1
	xorps	$rndkey0,$in0
	xorps	$rndkey0,$inout0
	xorps	$in0,$inout1	# cmac^=out
	$movkey	32($key_),$rndkey0
	jmp	.Lccm64_dec2_loop
.align	16
.Lccm64_dec2_loop:
	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	$movkey	($key,%rax),$rndkey1
	add	\$32,%rax
	aesenc	$rndkey0,$inout0
	aesenc	$rndkey0,$inout1
	$movkey	-16($key,%rax),$rndkey0
	jnz	.Lccm64_dec2_loop
	movups	($inp),$in0	# load input
	paddq	$increment,$iv
	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	aesenclast	$rndkey0,$inout0
	aesenclast	$rndkey0,$inout1
	lea	16($inp),$inp	# $inp+=16
	jmp	.Lccm64_dec_outer

.align	16
.Lccm64_dec_break:
	#xorps	$in0,$inout1	# cmac^=out
	mov	240($key_),$rounds
___
	# Final MAC update for the last output block.
	# NOTE(review): the extra ($inout1,$in0) arguments presumably make
	# the helper encrypt $inout1 and fold $in0 into it along with the
	# round-0 key, which is why the xorps above is commented out --
	# confirm against aesni_generate1.
	&aesni_generate1("enc",$key_,$rounds,$inout1,$in0);
$code.=<<___;
	pxor	$rndkey0,$rndkey0	# clear register bank
	pxor	$rndkey1,$rndkey1
	pxor	$inout0,$inout0
	movups	$inout1,($cmac)	# store resulting mac
	pxor	$inout1,$inout1
	pxor	$in0,$in0
	pxor	$iv,$iv
___
# Win64 only: restore %xmm6-%xmm9 and wipe the save slots.
$code.=<<___ if ($win64);
	movaps	(%rsp),%xmm6
	movaps	%xmm0,(%rsp)	# clear stack
	movaps	0x10(%rsp),%xmm7
	movaps	%xmm0,0x10(%rsp)
	movaps	0x20(%rsp),%xmm8
	movaps	%xmm0,0x20(%rsp)
	movaps	0x30(%rsp),%xmm9
	movaps	%xmm0,0x30(%rsp)
	lea	0x58(%rsp),%rsp
.Lccm64_dec_ret:
___
$code.=<<___;
	ret
.size	${PREFIX}_ccm64_decrypt_blocks,.-${PREFIX}_ccm64_decrypt_blocks
___
}
######################################################################
# void aesni_ctr32_encrypt_blocks (const void *in, void *out,
#                         size_t blocks, const AES_KEY *key,
#                         const char *ivec);
#
# Handles only complete blocks, operates on 32-bit counter and
# does not update *ivec! (see crypto/modes/ctr128.c for details)
#
# Overhaul based on suggestions from Shay Gueron and Vlad Krasnov,
# http://rt.openssl.org/Ticket/Display.html?id=3021&user=guest&pass=guest.
# Keywords are full unroll and modulo-schedule counter calculations
# with zero-round key xor.
{
# Register and frame layout for ${PREFIX}_ctr32_encrypt_blocks:
#   $in0..$in5  - %xmm10..%xmm15, used for input blocks and for
#                 pre-loading the next batch of counter blocks
#   $key0       - %ebp, loaded from bytes 12..15 of the round-0 key
#   $ctr        - 32-bit view of the $ivp register; after the IV has
#                 been read it holds the byte-swapped counter word
#   $frame_size - 0x80 bytes for eight 16-byte counter blocks, plus
#                 160 bytes on Win64 for saving %xmm6-%xmm15
my ($in0,$in1,$in2,$in3,$in4,$in5)=map("%xmm$_",(10..15));
my ($key0,$ctr)=("%ebp","${ivp}d");
my $frame_size = 0x80 + ($win64?160:0);

# Entry point. Code paths:
#   * blocks == 1: encrypt the IV with aesni_generate1 and xor it into
#     the single input block, without building a stack frame;
#   * .Lctr32_loop6: six blocks per iteration with movbe counter
#     stores, taken when OPENSSL_ia32cap_P reports MOVBE without XSAVE;
#   * .Lctr32_loop8: eight blocks per iteration otherwise;
#   * .Lctr32_tail: fewer than eight blocks, either from the start or
#     left over by one of the bulk loops.
# NOTE(review): a block count of 0 is not special-cased; it falls
# through .Lctr32_tail into .Lctr32_loop3, which stores its first
# block unconditionally. Callers presumably never pass 0 -- confirm.
$code.=<<___;
.globl	${PREFIX}_ctr32_encrypt_blocks
.type	${PREFIX}_ctr32_encrypt_blocks,\@function,5
.align	16
${PREFIX}_ctr32_encrypt_blocks:
.cfi_startproc
	cmp	\$1,$len
	jne	.Lctr32_bulk

	# handle single block without allocating stack frame,
	# useful when handling edges
	movups	($ivp),$inout0
	movups	($inp),$inout1
	mov	240($key),%edx	# key->rounds
___
	# Single-block case: E(IV), with the round count passed in %edx.
	# aesni_generate1 is defined elsewhere in this file.
	&aesni_generate1("enc",$key,"%edx");
# Xor the keystream into the input block, store it, and wipe the
# registers used; then set up the frame for the bulk paths with $key_
# remembering the incoming %rsp.
$code.=<<___;
	pxor	$rndkey0,$rndkey0	# clear register bank
	pxor	$rndkey1,$rndkey1
	xorps	$inout1,$inout0
	pxor	$inout1,$inout1
	movups	$inout0,($out)
	xorps	$inout0,$inout0
	jmp	.Lctr32_epilogue

.align	16
.Lctr32_bulk:
	lea	(%rsp),$key_	# use $key_ as frame pointer
.cfi_def_cfa_register	$key_
	push	%rbp
.cfi_push	%rbp
	sub	\$$frame_size,%rsp
	and	\$-16,%rsp	# Linux kernel stack can be incorrectly seeded
___
# Win64 only: save %xmm6-%xmm15 below the frame pointer (ten 16-byte
# slots, matching the extra 160 bytes in $frame_size).
$code.=<<___ if ($win64);
	movaps	%xmm6,-0xa8($key_)	# offload everything
	movaps	%xmm7,-0x98($key_)
	movaps	%xmm8,-0x88($key_)
	movaps	%xmm9,-0x78($key_)
	movaps	%xmm10,-0x68($key_)
	movaps	%xmm11,-0x58($key_)
	movaps	%xmm12,-0x48($key_)
	movaps	%xmm13,-0x38($key_)
	movaps	%xmm14,-0x28($key_)
	movaps	%xmm15,-0x18($key_)
.Lctr32_body:
___
# Build eight consecutive counter blocks in the 0x80-byte frame, each
# already xor-ed with the round-0 key. Only the last dword differs
# between them, so it is computed in general-purpose registers as
# bswap(counter+i) ^ $key0 and patched in with pinsrd or a 32-bit
# store at offset +12. The bulk loops and the first half of the tail
# follow in the same heredoc.
$code.=<<___;

	# 8 16-byte words on top of stack are counter values
	# xor-ed with zero-round key

	movdqu	($ivp),$inout0
	movdqu	($key),$rndkey0
	mov	12($ivp),$ctr	# counter LSB
	pxor	$rndkey0,$inout0
	mov	12($key),$key0	# 0-round key LSB
	movdqa	$inout0,0x00(%rsp)	# populate counter block
	bswap	$ctr
	movdqa	$inout0,$inout1
	movdqa	$inout0,$inout2
	movdqa	$inout0,$inout3
	movdqa	$inout0,0x40(%rsp)
	movdqa	$inout0,0x50(%rsp)
	movdqa	$inout0,0x60(%rsp)
	mov	%rdx,%r10	# about to borrow %rdx
	movdqa	$inout0,0x70(%rsp)

	lea	1($ctr),%rax
	lea	2($ctr),%rdx
	bswap	%eax
	bswap	%edx
	xor	$key0,%eax
	xor	$key0,%edx
	pinsrd	\$3,%eax,$inout1
	lea	3($ctr),%rax
	movdqa	$inout1,0x10(%rsp)
	pinsrd	\$3,%edx,$inout2
	bswap	%eax
	mov	%r10,%rdx	# restore %rdx
	lea	4($ctr),%r10
	movdqa	$inout2,0x20(%rsp)
	xor	$key0,%eax
	bswap	%r10d
	pinsrd	\$3,%eax,$inout3
	xor	$key0,%r10d
	movdqa	$inout3,0x30(%rsp)
	lea	5($ctr),%r9
	mov	%r10d,0x40+12(%rsp)
	bswap	%r9d
	lea	6($ctr),%r10
	mov	240($key),$rounds	# key->rounds
	xor	$key0,%r9d
	bswap	%r10d
	mov	%r9d,0x50+12(%rsp)
	xor	$key0,%r10d
	lea	7($ctr),%r9
	mov	%r10d,0x60+12(%rsp)
	bswap	%r9d
	leaq	OPENSSL_ia32cap_P(%rip),%r10
	mov	4(%r10),%r10d
	xor	$key0,%r9d
	and	\$`1<<26|1<<22`,%r10d	# isolate XSAVE+MOVBE
	mov	%r9d,0x70+12(%rsp)

	$movkey	0x10($key),$rndkey1

	movdqa	0x40(%rsp),$inout4
	movdqa	0x50(%rsp),$inout5

	cmp	\$8,$len	# $len is in blocks
	jb	.Lctr32_tail	# short input if ($len<8)

	sub	\$6,$len	# $len is biased by -6
	cmp	\$`1<<22`,%r10d	# check for MOVBE without XSAVE
	je	.Lctr32_6x	# [which denotes Atom Silvermont]

	lea	0x80($key),$key	# size optimization
	sub	\$2,$len	# $len is biased by -8
	jmp	.Lctr32_loop8

.align	16
.Lctr32_6x:
	shl	\$4,$rounds
	mov	\$48,$rnds_
	bswap	$key0
	lea	32($key,$rounds),$key	# end of key schedule
	sub	%rax,%r10	# twisted $rounds
	jmp	.Lctr32_loop6

.align	16
.Lctr32_loop6:
	add	\$6,$ctr	# next counter value
	$movkey	-48($key,$rnds_),$rndkey0
	aesenc	$rndkey1,$inout0
	mov	$ctr,%eax
	xor	$key0,%eax
	aesenc	$rndkey1,$inout1
	movbe	%eax,`0x00+12`(%rsp)	# store next counter value
	lea	1($ctr),%eax
	aesenc	$rndkey1,$inout2
	xor	$key0,%eax
	movbe	%eax,`0x10+12`(%rsp)
	aesenc	$rndkey1,$inout3
	lea	2($ctr),%eax
	xor	$key0,%eax
	aesenc	$rndkey1,$inout4
	movbe	%eax,`0x20+12`(%rsp)
	lea	3($ctr),%eax
	aesenc	$rndkey1,$inout5
	$movkey	-32($key,$rnds_),$rndkey1
	xor	$key0,%eax

	aesenc	$rndkey0,$inout0
	movbe	%eax,`0x30+12`(%rsp)
	lea	4($ctr),%eax
	aesenc	$rndkey0,$inout1
	xor	$key0,%eax
	movbe	%eax,`0x40+12`(%rsp)
	aesenc	$rndkey0,$inout2
	lea	5($ctr),%eax
	xor	$key0,%eax
	aesenc	$rndkey0,$inout3
	movbe	%eax,`0x50+12`(%rsp)
	mov	%r10,%rax	# mov $rnds_,$rounds
	aesenc	$rndkey0,$inout4
	aesenc	$rndkey0,$inout5
	$movkey	-16($key,$rnds_),$rndkey0

	call	.Lenc_loop6

	movdqu	($inp),$inout6	# load 6 input blocks
	movdqu	0x10($inp),$inout7
	movdqu	0x20($inp),$in0
	movdqu	0x30($inp),$in1
	movdqu	0x40($inp),$in2
	movdqu	0x50($inp),$in3
	lea	0x60($inp),$inp	# $inp+=6*16
	$movkey	-64($key,$rnds_),$rndkey1
	pxor	$inout0,$inout6	# inp^=E(ctr)
	movaps	0x00(%rsp),$inout0	# load next counter [xor-ed with 0 round]
	pxor	$inout1,$inout7
	movaps	0x10(%rsp),$inout1
	pxor	$inout2,$in0
	movaps	0x20(%rsp),$inout2
	pxor	$inout3,$in1
	movaps	0x30(%rsp),$inout3
	pxor	$inout4,$in2
	movaps	0x40(%rsp),$inout4
	pxor	$inout5,$in3
	movaps	0x50(%rsp),$inout5
	movdqu	$inout6,($out)	# store 6 output blocks
	movdqu	$inout7,0x10($out)
	movdqu	$in0,0x20($out)
	movdqu	$in1,0x30($out)
	movdqu	$in2,0x40($out)
	movdqu	$in3,0x50($out)
	lea	0x60($out),$out	# $out+=6*16

	sub	\$6,$len
	jnc	.Lctr32_loop6	# loop if $len-=6 didn't borrow

	add	\$6,$len	# restore real remaining $len
	jz	.Lctr32_done	# done if ($len==0)

	lea	-48($rnds_),$rounds
	lea	-80($key,$rnds_),$key	# restore $key
	neg	$rounds
	shr	\$4,$rounds	# restore $rounds
	jmp	.Lctr32_tail

.align	32
.Lctr32_loop8:
	add	\$8,$ctr	# next counter value
	movdqa	0x60(%rsp),$inout6
	aesenc	$rndkey1,$inout0
	mov	$ctr,%r9d
	movdqa	0x70(%rsp),$inout7
	aesenc	$rndkey1,$inout1
	bswap	%r9d
	$movkey	0x20-0x80($key),$rndkey0
	aesenc	$rndkey1,$inout2
	xor	$key0,%r9d
	nop
	aesenc	$rndkey1,$inout3
	mov	%r9d,0x00+12(%rsp)	# store next counter value
	lea	1($ctr),%r9
	aesenc	$rndkey1,$inout4
	aesenc	$rndkey1,$inout5
	aesenc	$rndkey1,$inout6
	aesenc	$rndkey1,$inout7
	$movkey	0x30-0x80($key),$rndkey1
___
# Rounds 2..7 of the eight-way loop are generated here: odd $i uses
# $rndkey1, even $i uses $rndkey0. Each step also stores the counter
# dword for frame slot $i-1 and starts computing the one for slot $i,
# then loads the next round key into the register it just used.
for($i=2;$i<8;$i++) {
my $rndkeyx = ($i&1)?$rndkey1:$rndkey0;
$code.=<<___;
	bswap	%r9d
	aesenc	$rndkeyx,$inout0
	aesenc	$rndkeyx,$inout1
	xor	$key0,%r9d
	.byte	0x66,0x90
	aesenc	$rndkeyx,$inout2
	aesenc	$rndkeyx,$inout3
	mov	%r9d,`0x10*($i-1)`+12(%rsp)
	lea	$i($ctr),%r9
	aesenc	$rndkeyx,$inout4
	aesenc	$rndkeyx,$inout5
	aesenc	$rndkeyx,$inout6
	aesenc	$rndkeyx,$inout7
	$movkey	`0x20+0x10*$i`-0x80($key),$rndkeyx
___
}
# Remaining rounds of the eight-way loop. key->rounds is compared with
# 11 once; the jb/je further down reuse those flags to run zero, two
# or four extra rounds before .Lctr32_enc_done. There the input blocks
# are pre-xored with the last round key so that aesenclast produces
# the output directly. The rest of this heredoc is the tail handling
# (7-, 4- and 3-block variants) and the start of the common exit.
$code.=<<___;
	bswap	%r9d
	aesenc	$rndkey0,$inout0
	aesenc	$rndkey0,$inout1
	aesenc	$rndkey0,$inout2
	xor	$key0,%r9d
	movdqu	0x00($inp),$in0	# start loading input
	aesenc	$rndkey0,$inout3
	mov	%r9d,0x70+12(%rsp)
	cmp	\$11,$rounds
	aesenc	$rndkey0,$inout4
	aesenc	$rndkey0,$inout5
	aesenc	$rndkey0,$inout6
	aesenc	$rndkey0,$inout7
	$movkey	0xa0-0x80($key),$rndkey0

	jb	.Lctr32_enc_done

	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	aesenc	$rndkey1,$inout2
	aesenc	$rndkey1,$inout3
	aesenc	$rndkey1,$inout4
	aesenc	$rndkey1,$inout5
	aesenc	$rndkey1,$inout6
	aesenc	$rndkey1,$inout7
	$movkey	0xb0-0x80($key),$rndkey1

	aesenc	$rndkey0,$inout0
	aesenc	$rndkey0,$inout1
	aesenc	$rndkey0,$inout2
	aesenc	$rndkey0,$inout3
	aesenc	$rndkey0,$inout4
	aesenc	$rndkey0,$inout5
	aesenc	$rndkey0,$inout6
	aesenc	$rndkey0,$inout7
	$movkey	0xc0-0x80($key),$rndkey0
	je	.Lctr32_enc_done

	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	aesenc	$rndkey1,$inout2
	aesenc	$rndkey1,$inout3
	aesenc	$rndkey1,$inout4
	aesenc	$rndkey1,$inout5
	aesenc	$rndkey1,$inout6
	aesenc	$rndkey1,$inout7
	$movkey	0xd0-0x80($key),$rndkey1

	aesenc	$rndkey0,$inout0
	aesenc	$rndkey0,$inout1
	aesenc	$rndkey0,$inout2
	aesenc	$rndkey0,$inout3
	aesenc	$rndkey0,$inout4
	aesenc	$rndkey0,$inout5
	aesenc	$rndkey0,$inout6
	aesenc	$rndkey0,$inout7
	$movkey	0xe0-0x80($key),$rndkey0
	jmp	.Lctr32_enc_done

.align	16
.Lctr32_enc_done:
	movdqu	0x10($inp),$in1
	pxor	$rndkey0,$in0	# input^=round[last]
	movdqu	0x20($inp),$in2
	pxor	$rndkey0,$in1
	movdqu	0x30($inp),$in3
	pxor	$rndkey0,$in2
	movdqu	0x40($inp),$in4
	pxor	$rndkey0,$in3
	movdqu	0x50($inp),$in5
	pxor	$rndkey0,$in4
	pxor	$rndkey0,$in5
	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	aesenc	$rndkey1,$inout2
	aesenc	$rndkey1,$inout3
	aesenc	$rndkey1,$inout4
	aesenc	$rndkey1,$inout5
	aesenc	$rndkey1,$inout6
	aesenc	$rndkey1,$inout7
	movdqu	0x60($inp),$rndkey1	# borrow $rndkey1 for inp[6]
	lea	0x80($inp),$inp	# $inp+=8*16

	aesenclast	$in0,$inout0	# $inN is inp[N]^round[last]
	pxor	$rndkey0,$rndkey1	# borrowed $rndkey
	movdqu	0x70-0x80($inp),$in0
	aesenclast	$in1,$inout1
	pxor	$rndkey0,$in0
	movdqa	0x00(%rsp),$in1	# load next counter block
	aesenclast	$in2,$inout2
	aesenclast	$in3,$inout3
	movdqa	0x10(%rsp),$in2
	movdqa	0x20(%rsp),$in3
	aesenclast	$in4,$inout4
	aesenclast	$in5,$inout5
	movdqa	0x30(%rsp),$in4
	movdqa	0x40(%rsp),$in5
	aesenclast	$rndkey1,$inout6
	movdqa	0x50(%rsp),$rndkey0
	$movkey	0x10-0x80($key),$rndkey1#real 1st-round key
	aesenclast	$in0,$inout7

	movups	$inout0,($out)	# store 8 output blocks
	movdqa	$in1,$inout0
	movups	$inout1,0x10($out)
	movdqa	$in2,$inout1
	movups	$inout2,0x20($out)
	movdqa	$in3,$inout2
	movups	$inout3,0x30($out)
	movdqa	$in4,$inout3
	movups	$inout4,0x40($out)
	movdqa	$in5,$inout4
	movups	$inout5,0x50($out)
	movdqa	$rndkey0,$inout5
	movups	$inout6,0x60($out)
	movups	$inout7,0x70($out)
	lea	0x80($out),$out	# $out+=8*16

	sub	\$8,$len
	jnc	.Lctr32_loop8	# loop if $len-=8 didn't borrow

	add	\$8,$len	# restore real remaining $len
	jz	.Lctr32_done	# done if ($len==0)
	lea	-0x80($key),$key

.Lctr32_tail:
	# note that at this point $inout0..5 are populated with
	# counter values xor-ed with 0-round key
	lea	16($key),$key
	cmp	\$4,$len
	jb	.Lctr32_loop3
	je	.Lctr32_loop4

	# if ($len>4) compute 7 E(counter)
	shl	\$4,$rounds
	movdqa	0x60(%rsp),$inout6
	pxor	$inout7,$inout7

	$movkey	16($key),$rndkey0
	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	lea	32-16($key,$rounds),$key# prepare for .Lenc_loop8_enter
	neg	%rax
	aesenc	$rndkey1,$inout2
	add	\$16,%rax	# prepare for .Lenc_loop8_enter
	movups	($inp),$in0
	aesenc	$rndkey1,$inout3
	aesenc	$rndkey1,$inout4
	movups	0x10($inp),$in1	# pre-load input
	movups	0x20($inp),$in2
	aesenc	$rndkey1,$inout5
	aesenc	$rndkey1,$inout6

	call	.Lenc_loop8_enter

	movdqu	0x30($inp),$in3
	pxor	$in0,$inout0
	movdqu	0x40($inp),$in0
	pxor	$in1,$inout1
	movdqu	$inout0,($out)	# store output
	pxor	$in2,$inout2
	movdqu	$inout1,0x10($out)
	pxor	$in3,$inout3
	movdqu	$inout2,0x20($out)
	pxor	$in0,$inout4
	movdqu	$inout3,0x30($out)
	movdqu	$inout4,0x40($out)
	cmp	\$6,$len
	jb	.Lctr32_done	# $len was 5, stop store

	movups	0x50($inp),$in1
	xorps	$in1,$inout5
	movups	$inout5,0x50($out)
	je	.Lctr32_done	# $len was 6, stop store

	movups	0x60($inp),$in2
	xorps	$in2,$inout6
	movups	$inout6,0x60($out)
	jmp	.Lctr32_done	# $len was 7, stop store

.align	32
.Lctr32_loop4:
	aesenc	$rndkey1,$inout0
	lea	16($key),$key
	dec	$rounds
	aesenc	$rndkey1,$inout1
	aesenc	$rndkey1,$inout2
	aesenc	$rndkey1,$inout3
	$movkey	($key),$rndkey1
	jnz	.Lctr32_loop4
	aesenclast	$rndkey1,$inout0
	aesenclast	$rndkey1,$inout1
	movups	($inp),$in0	# load input
	movups	0x10($inp),$in1
	aesenclast	$rndkey1,$inout2
	aesenclast	$rndkey1,$inout3
	movups	0x20($inp),$in2
	movups	0x30($inp),$in3

	xorps	$in0,$inout0
	movups	$inout0,($out)	# store output
	xorps	$in1,$inout1
	movups	$inout1,0x10($out)
	pxor	$in2,$inout2
	movdqu	$inout2,0x20($out)
	pxor	$in3,$inout3
	movdqu	$inout3,0x30($out)
	jmp	.Lctr32_done	# $len was 4, stop store

.align	32
.Lctr32_loop3:
	aesenc	$rndkey1,$inout0
	lea	16($key),$key
	dec	$rounds
	aesenc	$rndkey1,$inout1
	aesenc	$rndkey1,$inout2
	$movkey	($key),$rndkey1
	jnz	.Lctr32_loop3
	aesenclast	$rndkey1,$inout0
	aesenclast	$rndkey1,$inout1
	aesenclast	$rndkey1,$inout2

	movups	($inp),$in0	# load input
	xorps	$in0,$inout0
	movups	$inout0,($out)	# store output
	cmp	\$2,$len
	jb	.Lctr32_done	# $len was 1, stop store

	movups	0x10($inp),$in1
	xorps	$in1,$inout1
	movups	$inout1,0x10($out)
	je	.Lctr32_done	# $len was 2, stop store

	movups	0x20($inp),$in2
	xorps	$in2,$inout2
	movups	$inout2,0x20($out)	# $len was 3, stop store

.Lctr32_done:
	xorps	%xmm0,%xmm0	# clear register bank
	xor	$key0,$key0
	pxor	%xmm1,%xmm1
	pxor	%xmm2,%xmm2
	pxor	%xmm3,%xmm3
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
___
# Non-Win64: nothing has to be restored, so %xmm6-%xmm15 are simply
# zeroed, interleaved with wiping the eight counter slots of the frame.
$code.=<<___ if (!$win64);
	pxor	%xmm6,%xmm6
	pxor	%xmm7,%xmm7
	movaps	%xmm0,0x00(%rsp)	# clear stack
	pxor	%xmm8,%xmm8
	movaps	%xmm0,0x10(%rsp)
	pxor	%xmm9,%xmm9
	movaps	%xmm0,0x20(%rsp)
	pxor	%xmm10,%xmm10
	movaps	%xmm0,0x30(%rsp)
	pxor	%xmm11,%xmm11
	movaps	%xmm0,0x40(%rsp)
	pxor	%xmm12,%xmm12
	movaps	%xmm0,0x50(%rsp)
	pxor	%xmm13,%xmm13
	movaps	%xmm0,0x60(%rsp)
	pxor	%xmm14,%xmm14
	movaps	%xmm0,0x70(%rsp)
	pxor	%xmm15,%xmm15
___
# Win64: restore %xmm6-%xmm15 from the save area, overwriting each
# save slot with zero right after it is read, then wipe the eight
# counter slots as well.
$code.=<<___ if ($win64);
	movaps	-0xa8($key_),%xmm6
	movaps	%xmm0,-0xa8($key_)	# clear stack
	movaps	-0x98($key_),%xmm7
	movaps	%xmm0,-0x98($key_)
	movaps	-0x88($key_),%xmm8
	movaps	%xmm0,-0x88($key_)
	movaps	-0x78($key_),%xmm9
	movaps	%xmm0,-0x78($key_)
	movaps	-0x68($key_),%xmm10
	movaps	%xmm0,-0x68($key_)
	movaps	-0x58($key_),%xmm11
	movaps	%xmm0,-0x58($key_)
	movaps	-0x48($key_),%xmm12
	movaps	%xmm0,-0x48($key_)
	movaps	-0x38($key_),%xmm13
	movaps	%xmm0,-0x38($key_)
	movaps	-0x28($key_),%xmm14
	movaps	%xmm0,-0x28($key_)
	movaps	-0x18($key_),%xmm15
	movaps	%xmm0,-0x18($key_)
	movaps	%xmm0,0x00(%rsp)
	movaps	%xmm0,0x10(%rsp)
	movaps	%xmm0,0x20(%rsp)
	movaps	%xmm0,0x30(%rsp)
	movaps	%xmm0,0x40(%rsp)
	movaps	%xmm0,0x50(%rsp)
	movaps	%xmm0,0x60(%rsp)
	movaps	%xmm0,0x70(%rsp)
___
# Common exit: reload %rbp from just below the frame pointer and
# restore %rsp from $key_. The single-block path joins at
# .Lctr32_epilogue, after the frame teardown it never needed.
$code.=<<___;
	mov	-8($key_),%rbp
.cfi_restore	%rbp
	lea	($key_),%rsp
.cfi_def_cfa_register	%rsp
.Lctr32_epilogue:
	ret
.cfi_endproc
.size	${PREFIX}_ctr32_encrypt_blocks,.-${PREFIX}_ctr32_encrypt_blocks
___
}

######################################################################
# void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len,
#	const AES_KEY *key1, const AES_KEY *key2,
#	const unsigned char iv[16]);
#
{
my @tweak=map("%xmm$_",(10..15));
my ($twmask,$twres,$twtmp)=("%xmm8","%xmm9",@tweak[4]);
my ($key2,$ivp,$len_)=("%r8","%r9","%r9");
my $frame_size = 0x70 +
($win64?160:0); 1751my $key_ = "%rbp"; # override so that we can use %r11 as FP 1752 1753$code.=<<___; 1754.globl ${PREFIX}_xts_encrypt 1755.type ${PREFIX}_xts_encrypt,\@function,6 1756.align 16 1757${PREFIX}_xts_encrypt: 1758.cfi_startproc 1759 lea (%rsp),%r11 # frame pointer 1760.cfi_def_cfa_register %r11 1761 push %rbp 1762.cfi_push %rbp 1763 sub \$$frame_size,%rsp 1764 and \$-16,%rsp # Linux kernel stack can be incorrectly seeded 1765___ 1766$code.=<<___ if ($win64); 1767 movaps %xmm6,-0xa8(%r11) # offload everything 1768 movaps %xmm7,-0x98(%r11) 1769 movaps %xmm8,-0x88(%r11) 1770 movaps %xmm9,-0x78(%r11) 1771 movaps %xmm10,-0x68(%r11) 1772 movaps %xmm11,-0x58(%r11) 1773 movaps %xmm12,-0x48(%r11) 1774 movaps %xmm13,-0x38(%r11) 1775 movaps %xmm14,-0x28(%r11) 1776 movaps %xmm15,-0x18(%r11) 1777.Lxts_enc_body: 1778___ 1779$code.=<<___; 1780 movups ($ivp),$inout0 # load clear-text tweak 1781 mov 240(%r8),$rounds # key2->rounds 1782 mov 240($key),$rnds_ # key1->rounds 1783___ 1784 # generate the tweak 1785 &aesni_generate1("enc",$key2,$rounds,$inout0); 1786$code.=<<___; 1787 $movkey ($key),$rndkey0 # zero round key 1788 mov $key,$key_ # backup $key 1789 mov $rnds_,$rounds # backup $rounds 1790 shl \$4,$rnds_ 1791 mov $len,$len_ # backup $len 1792 and \$-16,$len 1793 1794 $movkey 16($key,$rnds_),$rndkey1 # last round key 1795 1796 movdqa .Lxts_magic(%rip),$twmask 1797 movdqa $inout0,@tweak[5] 1798 pshufd \$0x5f,$inout0,$twres 1799 pxor $rndkey0,$rndkey1 1800___ 1801 # alternative tweak calculation algorithm is based on suggestions 1802 # by Shay Gueron. psrad doesn't conflict with AES-NI instructions 1803 # and should help in the future... 
1804 for ($i=0;$i<4;$i++) { 1805 $code.=<<___; 1806 movdqa $twres,$twtmp 1807 paddd $twres,$twres 1808 movdqa @tweak[5],@tweak[$i] 1809 psrad \$31,$twtmp # broadcast upper bits 1810 paddq @tweak[5],@tweak[5] 1811 pand $twmask,$twtmp 1812 pxor $rndkey0,@tweak[$i] 1813 pxor $twtmp,@tweak[5] 1814___ 1815 } 1816$code.=<<___; 1817 movdqa @tweak[5],@tweak[4] 1818 psrad \$31,$twres 1819 paddq @tweak[5],@tweak[5] 1820 pand $twmask,$twres 1821 pxor $rndkey0,@tweak[4] 1822 pxor $twres,@tweak[5] 1823 movaps $rndkey1,0x60(%rsp) # save round[0]^round[last] 1824 1825 sub \$16*6,$len 1826 jc .Lxts_enc_short # if $len-=6*16 borrowed 1827 1828 mov \$16+96,$rounds 1829 lea 32($key_,$rnds_),$key # end of key schedule 1830 sub %r10,%rax # twisted $rounds 1831 $movkey 16($key_),$rndkey1 1832 mov %rax,%r10 # backup twisted $rounds 1833 lea .Lxts_magic(%rip),%r8 1834 jmp .Lxts_enc_grandloop 1835 1836.align 32 1837.Lxts_enc_grandloop: 1838 movdqu `16*0`($inp),$inout0 # load input 1839 movdqa $rndkey0,$twmask 1840 movdqu `16*1`($inp),$inout1 1841 pxor @tweak[0],$inout0 # input^=tweak^round[0] 1842 movdqu `16*2`($inp),$inout2 1843 pxor @tweak[1],$inout1 1844 aesenc $rndkey1,$inout0 1845 movdqu `16*3`($inp),$inout3 1846 pxor @tweak[2],$inout2 1847 aesenc $rndkey1,$inout1 1848 movdqu `16*4`($inp),$inout4 1849 pxor @tweak[3],$inout3 1850 aesenc $rndkey1,$inout2 1851 movdqu `16*5`($inp),$inout5 1852 pxor @tweak[5],$twmask # round[0]^=tweak[5] 1853 movdqa 0x60(%rsp),$twres # load round[0]^round[last] 1854 pxor @tweak[4],$inout4 1855 aesenc $rndkey1,$inout3 1856 $movkey 32($key_),$rndkey0 1857 lea `16*6`($inp),$inp 1858 pxor $twmask,$inout5 1859 1860 pxor $twres,@tweak[0] # calculate tweaks^round[last] 1861 aesenc $rndkey1,$inout4 1862 pxor $twres,@tweak[1] 1863 movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks^round[last] 1864 aesenc $rndkey1,$inout5 1865 $movkey 48($key_),$rndkey1 1866 pxor $twres,@tweak[2] 1867 1868 aesenc $rndkey0,$inout0 1869 pxor $twres,@tweak[3] 1870 movdqa 
@tweak[1],`16*1`(%rsp) 1871 aesenc $rndkey0,$inout1 1872 pxor $twres,@tweak[4] 1873 movdqa @tweak[2],`16*2`(%rsp) 1874 aesenc $rndkey0,$inout2 1875 aesenc $rndkey0,$inout3 1876 pxor $twres,$twmask 1877 movdqa @tweak[4],`16*4`(%rsp) 1878 aesenc $rndkey0,$inout4 1879 aesenc $rndkey0,$inout5 1880 $movkey 64($key_),$rndkey0 1881 movdqa $twmask,`16*5`(%rsp) 1882 pshufd \$0x5f,@tweak[5],$twres 1883 jmp .Lxts_enc_loop6 1884.align 32 1885.Lxts_enc_loop6: 1886 aesenc $rndkey1,$inout0 1887 aesenc $rndkey1,$inout1 1888 aesenc $rndkey1,$inout2 1889 aesenc $rndkey1,$inout3 1890 aesenc $rndkey1,$inout4 1891 aesenc $rndkey1,$inout5 1892 $movkey -64($key,%rax),$rndkey1 1893 add \$32,%rax 1894 1895 aesenc $rndkey0,$inout0 1896 aesenc $rndkey0,$inout1 1897 aesenc $rndkey0,$inout2 1898 aesenc $rndkey0,$inout3 1899 aesenc $rndkey0,$inout4 1900 aesenc $rndkey0,$inout5 1901 $movkey -80($key,%rax),$rndkey0 1902 jnz .Lxts_enc_loop6 1903 1904 movdqa (%r8),$twmask # start calculating next tweak 1905 movdqa $twres,$twtmp 1906 paddd $twres,$twres 1907 aesenc $rndkey1,$inout0 1908 paddq @tweak[5],@tweak[5] 1909 psrad \$31,$twtmp 1910 aesenc $rndkey1,$inout1 1911 pand $twmask,$twtmp 1912 $movkey ($key_),@tweak[0] # load round[0] 1913 aesenc $rndkey1,$inout2 1914 aesenc $rndkey1,$inout3 1915 aesenc $rndkey1,$inout4 1916 pxor $twtmp,@tweak[5] 1917 movaps @tweak[0],@tweak[1] # copy round[0] 1918 aesenc $rndkey1,$inout5 1919 $movkey -64($key),$rndkey1 1920 1921 movdqa $twres,$twtmp 1922 aesenc $rndkey0,$inout0 1923 paddd $twres,$twres 1924 pxor @tweak[5],@tweak[0] 1925 aesenc $rndkey0,$inout1 1926 psrad \$31,$twtmp 1927 paddq @tweak[5],@tweak[5] 1928 aesenc $rndkey0,$inout2 1929 aesenc $rndkey0,$inout3 1930 pand $twmask,$twtmp 1931 movaps @tweak[1],@tweak[2] 1932 aesenc $rndkey0,$inout4 1933 pxor $twtmp,@tweak[5] 1934 movdqa $twres,$twtmp 1935 aesenc $rndkey0,$inout5 1936 $movkey -48($key),$rndkey0 1937 1938 paddd $twres,$twres 1939 aesenc $rndkey1,$inout0 1940 pxor @tweak[5],@tweak[1] 1941 psrad 
\$31,$twtmp 1942 aesenc $rndkey1,$inout1 1943 paddq @tweak[5],@tweak[5] 1944 pand $twmask,$twtmp 1945 aesenc $rndkey1,$inout2 1946 aesenc $rndkey1,$inout3 1947 movdqa @tweak[3],`16*3`(%rsp) 1948 pxor $twtmp,@tweak[5] 1949 aesenc $rndkey1,$inout4 1950 movaps @tweak[2],@tweak[3] 1951 movdqa $twres,$twtmp 1952 aesenc $rndkey1,$inout5 1953 $movkey -32($key),$rndkey1 1954 1955 paddd $twres,$twres 1956 aesenc $rndkey0,$inout0 1957 pxor @tweak[5],@tweak[2] 1958 psrad \$31,$twtmp 1959 aesenc $rndkey0,$inout1 1960 paddq @tweak[5],@tweak[5] 1961 pand $twmask,$twtmp 1962 aesenc $rndkey0,$inout2 1963 aesenc $rndkey0,$inout3 1964 aesenc $rndkey0,$inout4 1965 pxor $twtmp,@tweak[5] 1966 movaps @tweak[3],@tweak[4] 1967 aesenc $rndkey0,$inout5 1968 1969 movdqa $twres,$rndkey0 1970 paddd $twres,$twres 1971 aesenc $rndkey1,$inout0 1972 pxor @tweak[5],@tweak[3] 1973 psrad \$31,$rndkey0 1974 aesenc $rndkey1,$inout1 1975 paddq @tweak[5],@tweak[5] 1976 pand $twmask,$rndkey0 1977 aesenc $rndkey1,$inout2 1978 aesenc $rndkey1,$inout3 1979 pxor $rndkey0,@tweak[5] 1980 $movkey ($key_),$rndkey0 1981 aesenc $rndkey1,$inout4 1982 aesenc $rndkey1,$inout5 1983 $movkey 16($key_),$rndkey1 1984 1985 pxor @tweak[5],@tweak[4] 1986 aesenclast `16*0`(%rsp),$inout0 1987 psrad \$31,$twres 1988 paddq @tweak[5],@tweak[5] 1989 aesenclast `16*1`(%rsp),$inout1 1990 aesenclast `16*2`(%rsp),$inout2 1991 pand $twmask,$twres 1992 mov %r10,%rax # restore $rounds 1993 aesenclast `16*3`(%rsp),$inout3 1994 aesenclast `16*4`(%rsp),$inout4 1995 aesenclast `16*5`(%rsp),$inout5 1996 pxor $twres,@tweak[5] 1997 1998 lea `16*6`($out),$out # $out+=6*16 1999 movups $inout0,`-16*6`($out) # store 6 output blocks 2000 movups $inout1,`-16*5`($out) 2001 movups $inout2,`-16*4`($out) 2002 movups $inout3,`-16*3`($out) 2003 movups $inout4,`-16*2`($out) 2004 movups $inout5,`-16*1`($out) 2005 sub \$16*6,$len 2006 jnc .Lxts_enc_grandloop # loop if $len-=6*16 didn't borrow 2007 2008 mov \$16+96,$rounds 2009 sub $rnds_,$rounds 2010 mov 
$key_,$key # restore $key 2011 shr \$4,$rounds # restore original value 2012 2013.Lxts_enc_short: 2014 # at the point @tweak[0..5] are populated with tweak values 2015 mov $rounds,$rnds_ # backup $rounds 2016 pxor $rndkey0,@tweak[0] 2017 add \$16*6,$len # restore real remaining $len 2018 jz .Lxts_enc_done # done if ($len==0) 2019 2020 pxor $rndkey0,@tweak[1] 2021 cmp \$0x20,$len 2022 jb .Lxts_enc_one # $len is 1*16 2023 pxor $rndkey0,@tweak[2] 2024 je .Lxts_enc_two # $len is 2*16 2025 2026 pxor $rndkey0,@tweak[3] 2027 cmp \$0x40,$len 2028 jb .Lxts_enc_three # $len is 3*16 2029 pxor $rndkey0,@tweak[4] 2030 je .Lxts_enc_four # $len is 4*16 2031 2032 movdqu ($inp),$inout0 # $len is 5*16 2033 movdqu 16*1($inp),$inout1 2034 movdqu 16*2($inp),$inout2 2035 pxor @tweak[0],$inout0 2036 movdqu 16*3($inp),$inout3 2037 pxor @tweak[1],$inout1 2038 movdqu 16*4($inp),$inout4 2039 lea 16*5($inp),$inp # $inp+=5*16 2040 pxor @tweak[2],$inout2 2041 pxor @tweak[3],$inout3 2042 pxor @tweak[4],$inout4 2043 pxor $inout5,$inout5 2044 2045 call _aesni_encrypt6 2046 2047 xorps @tweak[0],$inout0 2048 movdqa @tweak[5],@tweak[0] 2049 xorps @tweak[1],$inout1 2050 xorps @tweak[2],$inout2 2051 movdqu $inout0,($out) # store 5 output blocks 2052 xorps @tweak[3],$inout3 2053 movdqu $inout1,16*1($out) 2054 xorps @tweak[4],$inout4 2055 movdqu $inout2,16*2($out) 2056 movdqu $inout3,16*3($out) 2057 movdqu $inout4,16*4($out) 2058 lea 16*5($out),$out # $out+=5*16 2059 jmp .Lxts_enc_done 2060 2061.align 16 2062.Lxts_enc_one: 2063 movups ($inp),$inout0 2064 lea 16*1($inp),$inp # inp+=1*16 2065 xorps @tweak[0],$inout0 2066___ 2067 &aesni_generate1("enc",$key,$rounds); 2068$code.=<<___; 2069 xorps @tweak[0],$inout0 2070 movdqa @tweak[1],@tweak[0] 2071 movups $inout0,($out) # store one output block 2072 lea 16*1($out),$out # $out+=1*16 2073 jmp .Lxts_enc_done 2074 2075.align 16 2076.Lxts_enc_two: 2077 movups ($inp),$inout0 2078 movups 16($inp),$inout1 2079 lea 32($inp),$inp # $inp+=2*16 2080 xorps 
@tweak[0],$inout0 2081 xorps @tweak[1],$inout1 2082 2083 call _aesni_encrypt2 2084 2085 xorps @tweak[0],$inout0 2086 movdqa @tweak[2],@tweak[0] 2087 xorps @tweak[1],$inout1 2088 movups $inout0,($out) # store 2 output blocks 2089 movups $inout1,16*1($out) 2090 lea 16*2($out),$out # $out+=2*16 2091 jmp .Lxts_enc_done 2092 2093.align 16 2094.Lxts_enc_three: 2095 movups ($inp),$inout0 2096 movups 16*1($inp),$inout1 2097 movups 16*2($inp),$inout2 2098 lea 16*3($inp),$inp # $inp+=3*16 2099 xorps @tweak[0],$inout0 2100 xorps @tweak[1],$inout1 2101 xorps @tweak[2],$inout2 2102 2103 call _aesni_encrypt3 2104 2105 xorps @tweak[0],$inout0 2106 movdqa @tweak[3],@tweak[0] 2107 xorps @tweak[1],$inout1 2108 xorps @tweak[2],$inout2 2109 movups $inout0,($out) # store 3 output blocks 2110 movups $inout1,16*1($out) 2111 movups $inout2,16*2($out) 2112 lea 16*3($out),$out # $out+=3*16 2113 jmp .Lxts_enc_done 2114 2115.align 16 2116.Lxts_enc_four: 2117 movups ($inp),$inout0 2118 movups 16*1($inp),$inout1 2119 movups 16*2($inp),$inout2 2120 xorps @tweak[0],$inout0 2121 movups 16*3($inp),$inout3 2122 lea 16*4($inp),$inp # $inp+=4*16 2123 xorps @tweak[1],$inout1 2124 xorps @tweak[2],$inout2 2125 xorps @tweak[3],$inout3 2126 2127 call _aesni_encrypt4 2128 2129 pxor @tweak[0],$inout0 2130 movdqa @tweak[4],@tweak[0] 2131 pxor @tweak[1],$inout1 2132 pxor @tweak[2],$inout2 2133 movdqu $inout0,($out) # store 4 output blocks 2134 pxor @tweak[3],$inout3 2135 movdqu $inout1,16*1($out) 2136 movdqu $inout2,16*2($out) 2137 movdqu $inout3,16*3($out) 2138 lea 16*4($out),$out # $out+=4*16 2139 jmp .Lxts_enc_done 2140 2141.align 16 2142.Lxts_enc_done: 2143 and \$15,$len_ # see if $len%16 is 0 2144 jz .Lxts_enc_ret 2145 mov $len_,$len 2146 2147.Lxts_enc_steal: 2148 movzb ($inp),%eax # borrow $rounds ... 2149 movzb -16($out),%ecx # ... 
and $key 2150 lea 1($inp),$inp 2151 mov %al,-16($out) 2152 mov %cl,0($out) 2153 lea 1($out),$out 2154 sub \$1,$len 2155 jnz .Lxts_enc_steal 2156 2157 sub $len_,$out # rewind $out 2158 mov $key_,$key # restore $key 2159 mov $rnds_,$rounds # restore $rounds 2160 2161 movups -16($out),$inout0 2162 xorps @tweak[0],$inout0 2163___ 2164 &aesni_generate1("enc",$key,$rounds); 2165$code.=<<___; 2166 xorps @tweak[0],$inout0 2167 movups $inout0,-16($out) 2168 2169.Lxts_enc_ret: 2170 xorps %xmm0,%xmm0 # clear register bank 2171 pxor %xmm1,%xmm1 2172 pxor %xmm2,%xmm2 2173 pxor %xmm3,%xmm3 2174 pxor %xmm4,%xmm4 2175 pxor %xmm5,%xmm5 2176___ 2177$code.=<<___ if (!$win64); 2178 pxor %xmm6,%xmm6 2179 pxor %xmm7,%xmm7 2180 movaps %xmm0,0x00(%rsp) # clear stack 2181 pxor %xmm8,%xmm8 2182 movaps %xmm0,0x10(%rsp) 2183 pxor %xmm9,%xmm9 2184 movaps %xmm0,0x20(%rsp) 2185 pxor %xmm10,%xmm10 2186 movaps %xmm0,0x30(%rsp) 2187 pxor %xmm11,%xmm11 2188 movaps %xmm0,0x40(%rsp) 2189 pxor %xmm12,%xmm12 2190 movaps %xmm0,0x50(%rsp) 2191 pxor %xmm13,%xmm13 2192 movaps %xmm0,0x60(%rsp) 2193 pxor %xmm14,%xmm14 2194 pxor %xmm15,%xmm15 2195___ 2196$code.=<<___ if ($win64); 2197 movaps -0xa8(%r11),%xmm6 2198 movaps %xmm0,-0xa8(%r11) # clear stack 2199 movaps -0x98(%r11),%xmm7 2200 movaps %xmm0,-0x98(%r11) 2201 movaps -0x88(%r11),%xmm8 2202 movaps %xmm0,-0x88(%r11) 2203 movaps -0x78(%r11),%xmm9 2204 movaps %xmm0,-0x78(%r11) 2205 movaps -0x68(%r11),%xmm10 2206 movaps %xmm0,-0x68(%r11) 2207 movaps -0x58(%r11),%xmm11 2208 movaps %xmm0,-0x58(%r11) 2209 movaps -0x48(%r11),%xmm12 2210 movaps %xmm0,-0x48(%r11) 2211 movaps -0x38(%r11),%xmm13 2212 movaps %xmm0,-0x38(%r11) 2213 movaps -0x28(%r11),%xmm14 2214 movaps %xmm0,-0x28(%r11) 2215 movaps -0x18(%r11),%xmm15 2216 movaps %xmm0,-0x18(%r11) 2217 movaps %xmm0,0x00(%rsp) 2218 movaps %xmm0,0x10(%rsp) 2219 movaps %xmm0,0x20(%rsp) 2220 movaps %xmm0,0x30(%rsp) 2221 movaps %xmm0,0x40(%rsp) 2222 movaps %xmm0,0x50(%rsp) 2223 movaps %xmm0,0x60(%rsp) 2224___ 
$code.=<<___;
	mov	-8(%r11),%rbp
.cfi_restore	%rbp
	lea	(%r11),%rsp
.cfi_def_cfa_register	%rsp
.Lxts_enc_epilogue:
	ret
.cfi_endproc
.size	${PREFIX}_xts_encrypt,.-${PREFIX}_xts_encrypt
___

# ${PREFIX}_xts_decrypt -- XTS decryption entry point, declared to the
# translator as a six-argument function.  As laid out below it
#   - loads the clear-text tweak from ($ivp) and encrypts it with $key2;
#   - holds back one full block when $len is not a multiple of 16;
#   - decrypts six blocks per .Lxts_dec_grandloop iteration, computing the
#     next six tweak values in between the AES rounds;
#   - handles 1..5 leftover blocks in .Lxts_dec_short;
#   - finishes a trailing partial block in .Lxts_dec_done2/.Lxts_dec_steal;
#   - zeroes the xmm registers and the stack scratch area before returning.
$code.=<<___;
.globl	${PREFIX}_xts_decrypt
.type	${PREFIX}_xts_decrypt,\@function,6
.align	16
${PREFIX}_xts_decrypt:
.cfi_startproc
	lea	(%rsp),%r11			# frame pointer
.cfi_def_cfa_register	%r11
	push	%rbp
.cfi_push	%rbp
	sub	\$$frame_size,%rsp
	and	\$-16,%rsp	# Linux kernel stack can be incorrectly seeded
___
# Win64 only: spill xmm6-xmm15 into the frame, addressed through %r11.
$code.=<<___ if ($win64);
	movaps	%xmm6,-0xa8(%r11)		# offload everything
	movaps	%xmm7,-0x98(%r11)
	movaps	%xmm8,-0x88(%r11)
	movaps	%xmm9,-0x78(%r11)
	movaps	%xmm10,-0x68(%r11)
	movaps	%xmm11,-0x58(%r11)
	movaps	%xmm12,-0x48(%r11)
	movaps	%xmm13,-0x38(%r11)
	movaps	%xmm14,-0x28(%r11)
	movaps	%xmm15,-0x18(%r11)
.Lxts_dec_body:
___
$code.=<<___;
	movups	($ivp),$inout0			# load clear-text tweak
	mov	240($key2),$rounds		# key2->rounds
	mov	240($key),$rnds_		# key1->rounds
___
	# generate the tweak
	&aesni_generate1("enc",$key2,$rounds,$inout0);
$code.=<<___;
	xor	%eax,%eax			# if ($len%16) len-=16;
	test	\$15,$len
	setnz	%al
	shl	\$4,%rax
	sub	%rax,$len

	$movkey	($key),$rndkey0			# zero round key
	mov	$key,$key_			# backup $key
	mov	$rnds_,$rounds			# backup $rounds
	shl	\$4,$rnds_
	mov	$len,$len_			# backup $len
	and	\$-16,$len

	$movkey	16($key,$rnds_),$rndkey1	# last round key

	movdqa	.Lxts_magic(%rip),$twmask
	movdqa	$inout0,@tweak[5]
	pshufd	\$0x5f,$inout0,$twres
	pxor	$rndkey0,$rndkey1
___
    # Emit the computation of the first four tweak values, each already
    # xor-ed with round[0].
    for ($i=0;$i<4;$i++) {
    $code.=<<___;
	movdqa	$twres,$twtmp
	paddd	$twres,$twres
	movdqa	@tweak[5],@tweak[$i]
	psrad	\$31,$twtmp			# broadcast upper bits
	paddq	@tweak[5],@tweak[5]
	pand	$twmask,$twtmp
	pxor	$rndkey0,@tweak[$i]
	pxor	$twtmp,@tweak[5]
___
    }
$code.=<<___;
	movdqa	@tweak[5],@tweak[4]
	psrad	\$31,$twres
	paddq	@tweak[5],@tweak[5]
	pand	$twmask,$twres
	pxor	$rndkey0,@tweak[4]
	pxor	$twres,@tweak[5]
	movaps	$rndkey1,0x60(%rsp)		# save round[0]^round[last]

	sub	\$16*6,$len
	jc	.Lxts_dec_short			# if $len-=6*16 borrowed

	mov	\$16+96,$rounds
	lea	32($key_,$rnds_),$key		# end of key schedule
	sub	%r10,%rax			# twisted $rounds
	$movkey	16($key_),$rndkey1
	mov	%rax,%r10			# backup twisted $rounds
	lea	.Lxts_magic(%rip),%r8
	jmp	.Lxts_dec_grandloop

.align	32
.Lxts_dec_grandloop:
	movdqu	`16*0`($inp),$inout0		# load input
	movdqa	$rndkey0,$twmask
	movdqu	`16*1`($inp),$inout1
	pxor	@tweak[0],$inout0		# input^=tweak^round[0]
	movdqu	`16*2`($inp),$inout2
	pxor	@tweak[1],$inout1
	aesdec	$rndkey1,$inout0
	movdqu	`16*3`($inp),$inout3
	pxor	@tweak[2],$inout2
	aesdec	$rndkey1,$inout1
	movdqu	`16*4`($inp),$inout4
	pxor	@tweak[3],$inout3
	aesdec	$rndkey1,$inout2
	movdqu	`16*5`($inp),$inout5
	pxor	@tweak[5],$twmask		# round[0]^=tweak[5]
	movdqa	0x60(%rsp),$twres		# load round[0]^round[last]
	pxor	@tweak[4],$inout4
	aesdec	$rndkey1,$inout3
	$movkey	32($key_),$rndkey0
	lea	`16*6`($inp),$inp
	pxor	$twmask,$inout5

	pxor	$twres,@tweak[0]		# calculate tweaks^round[last]
	aesdec	$rndkey1,$inout4
	pxor	$twres,@tweak[1]
	movdqa	@tweak[0],`16*0`(%rsp)		# put aside tweaks^last round key
	aesdec	$rndkey1,$inout5
	$movkey	48($key_),$rndkey1
	pxor	$twres,@tweak[2]

	aesdec	$rndkey0,$inout0
	pxor	$twres,@tweak[3]
	movdqa	@tweak[1],`16*1`(%rsp)
	aesdec	$rndkey0,$inout1
	pxor	$twres,@tweak[4]
	movdqa	@tweak[2],`16*2`(%rsp)
	aesdec	$rndkey0,$inout2
	aesdec	$rndkey0,$inout3
	pxor	$twres,$twmask
	movdqa	@tweak[4],`16*4`(%rsp)
	aesdec	$rndkey0,$inout4
	aesdec	$rndkey0,$inout5
	$movkey	64($key_),$rndkey0
	movdqa	$twmask,`16*5`(%rsp)
	pshufd	\$0x5f,@tweak[5],$twres
	jmp	.Lxts_dec_loop6
.align	32
.Lxts_dec_loop6:
	aesdec	$rndkey1,$inout0
	aesdec	$rndkey1,$inout1
	aesdec	$rndkey1,$inout2
	aesdec	$rndkey1,$inout3
	aesdec	$rndkey1,$inout4
	aesdec	$rndkey1,$inout5
	$movkey	-64($key,%rax),$rndkey1
	add	\$32,%rax

	aesdec	$rndkey0,$inout0
	aesdec	$rndkey0,$inout1
	aesdec	$rndkey0,$inout2
	aesdec	$rndkey0,$inout3
	aesdec	$rndkey0,$inout4
	aesdec	$rndkey0,$inout5
	$movkey	-80($key,%rax),$rndkey0
	jnz	.Lxts_dec_loop6

	movdqa	(%r8),$twmask			# start calculating next tweak
	movdqa	$twres,$twtmp
	paddd	$twres,$twres
	aesdec	$rndkey1,$inout0
	paddq	@tweak[5],@tweak[5]
	psrad	\$31,$twtmp
	aesdec	$rndkey1,$inout1
	pand	$twmask,$twtmp
	$movkey	($key_),@tweak[0]		# load round[0]
	aesdec	$rndkey1,$inout2
	aesdec	$rndkey1,$inout3
	aesdec	$rndkey1,$inout4
	pxor	$twtmp,@tweak[5]
	movaps	@tweak[0],@tweak[1]		# copy round[0]
	aesdec	$rndkey1,$inout5
	$movkey	-64($key),$rndkey1

	movdqa	$twres,$twtmp
	aesdec	$rndkey0,$inout0
	paddd	$twres,$twres
	pxor	@tweak[5],@tweak[0]
	aesdec	$rndkey0,$inout1
	psrad	\$31,$twtmp
	paddq	@tweak[5],@tweak[5]
	aesdec	$rndkey0,$inout2
	aesdec	$rndkey0,$inout3
	pand	$twmask,$twtmp
	movaps	@tweak[1],@tweak[2]
	aesdec	$rndkey0,$inout4
	pxor	$twtmp,@tweak[5]
	movdqa	$twres,$twtmp
	aesdec	$rndkey0,$inout5
	$movkey	-48($key),$rndkey0

	paddd	$twres,$twres
	aesdec	$rndkey1,$inout0
	pxor	@tweak[5],@tweak[1]
	psrad	\$31,$twtmp
	aesdec	$rndkey1,$inout1
	paddq	@tweak[5],@tweak[5]
	pand	$twmask,$twtmp
	aesdec	$rndkey1,$inout2
	aesdec	$rndkey1,$inout3
	movdqa	@tweak[3],`16*3`(%rsp)
	pxor	$twtmp,@tweak[5]
	aesdec	$rndkey1,$inout4
	movaps	@tweak[2],@tweak[3]
	movdqa	$twres,$twtmp
	aesdec	$rndkey1,$inout5
	$movkey	-32($key),$rndkey1

	paddd	$twres,$twres
	aesdec	$rndkey0,$inout0
	pxor	@tweak[5],@tweak[2]
	psrad	\$31,$twtmp
	aesdec	$rndkey0,$inout1
	paddq	@tweak[5],@tweak[5]
	pand	$twmask,$twtmp
	aesdec	$rndkey0,$inout2
	aesdec	$rndkey0,$inout3
	aesdec	$rndkey0,$inout4
	pxor	$twtmp,@tweak[5]
	movaps	@tweak[3],@tweak[4]
	aesdec	$rndkey0,$inout5

	movdqa	$twres,$rndkey0
	paddd	$twres,$twres
	aesdec	$rndkey1,$inout0
	pxor	@tweak[5],@tweak[3]
	psrad	\$31,$rndkey0
	aesdec	$rndkey1,$inout1
	paddq	@tweak[5],@tweak[5]
	pand	$twmask,$rndkey0
	aesdec	$rndkey1,$inout2
	aesdec	$rndkey1,$inout3
	pxor	$rndkey0,@tweak[5]
	$movkey	($key_),$rndkey0
	aesdec	$rndkey1,$inout4
	aesdec	$rndkey1,$inout5
	$movkey	16($key_),$rndkey1

	pxor	@tweak[5],@tweak[4]
	aesdeclast	`16*0`(%rsp),$inout0
	psrad	\$31,$twres
	paddq	@tweak[5],@tweak[5]
	aesdeclast	`16*1`(%rsp),$inout1
	aesdeclast	`16*2`(%rsp),$inout2
	pand	$twmask,$twres
	mov	%r10,%rax			# restore $rounds
	aesdeclast	`16*3`(%rsp),$inout3
	aesdeclast	`16*4`(%rsp),$inout4
	aesdeclast	`16*5`(%rsp),$inout5
	pxor	$twres,@tweak[5]

	lea	`16*6`($out),$out		# $out+=6*16
	movups	$inout0,`-16*6`($out)		# store 6 output blocks
	movups	$inout1,`-16*5`($out)
	movups	$inout2,`-16*4`($out)
	movups	$inout3,`-16*3`($out)
	movups	$inout4,`-16*2`($out)
	movups	$inout5,`-16*1`($out)
	sub	\$16*6,$len
	jnc	.Lxts_dec_grandloop		# loop if $len-=6*16 didn't borrow

	mov	\$16+96,$rounds
	sub	$rnds_,$rounds
	mov	$key_,$key			# restore $key
	shr	\$4,$rounds			# restore original value

.Lxts_dec_short:
	# at this point @tweak[0..5] are populated with tweak values
	mov	$rounds,$rnds_			# backup $rounds
	pxor	$rndkey0,@tweak[0]
	pxor	$rndkey0,@tweak[1]
	add	\$16*6,$len			# restore real remaining $len
	jz	.Lxts_dec_done			# done if ($len==0)

	pxor	$rndkey0,@tweak[2]
	cmp	\$0x20,$len
	jb	.Lxts_dec_one			# $len is 1*16
	pxor	$rndkey0,@tweak[3]
	je	.Lxts_dec_two			# $len is 2*16

	pxor	$rndkey0,@tweak[4]
	cmp	\$0x40,$len
	jb	.Lxts_dec_three			# $len is 3*16
	je	.Lxts_dec_four			# $len is 4*16

	movdqu	($inp),$inout0			# $len is 5*16
	movdqu	16*1($inp),$inout1
	movdqu	16*2($inp),$inout2
	pxor	@tweak[0],$inout0
	movdqu	16*3($inp),$inout3
	pxor	@tweak[1],$inout1
	movdqu	16*4($inp),$inout4
	lea	16*5($inp),$inp			# $inp+=5*16
	pxor	@tweak[2],$inout2
	pxor	@tweak[3],$inout3
	pxor	@tweak[4],$inout4

	call	_aesni_decrypt6

	xorps	@tweak[0],$inout0
	xorps	@tweak[1],$inout1
	xorps	@tweak[2],$inout2
	movdqu	$inout0,($out)			# store 5 output blocks
	xorps	@tweak[3],$inout3
	movdqu	$inout1,16*1($out)
	xorps	@tweak[4],$inout4
	movdqu	$inout2,16*2($out)
	pxor	$twtmp,$twtmp
	movdqu	$inout3,16*3($out)
	pcmpgtd	@tweak[5],$twtmp
	movdqu	$inout4,16*4($out)
	lea	16*5($out),$out			# $out+=5*16
	pshufd	\$0x13,$twtmp,@tweak[1]		# $twres
	and	\$15,$len_
	jz	.Lxts_dec_ret

	movdqa	@tweak[5],@tweak[0]
	paddq	@tweak[5],@tweak[5]		# psllq 1,$tweak
	pand	$twmask,@tweak[1]		# isolate carry and residue
	pxor	@tweak[5],@tweak[1]
	jmp	.Lxts_dec_done2

.align	16
.Lxts_dec_one:
	movups	($inp),$inout0
	lea	16*1($inp),$inp			# $inp+=1*16
	xorps	@tweak[0],$inout0
___
	&aesni_generate1("dec",$key,$rounds);
$code.=<<___;
	xorps	@tweak[0],$inout0
	movdqa	@tweak[1],@tweak[0]
	movups	$inout0,($out)			# store one output block
	movdqa	@tweak[2],@tweak[1]
	lea	16*1($out),$out			# $out+=1*16
	jmp	.Lxts_dec_done

.align	16
.Lxts_dec_two:
	movups	($inp),$inout0
	movups	16($inp),$inout1
	lea	32($inp),$inp			# $inp+=2*16
	xorps	@tweak[0],$inout0
	xorps	@tweak[1],$inout1

	call	_aesni_decrypt2

	xorps	@tweak[0],$inout0
	movdqa	@tweak[2],@tweak[0]
	xorps	@tweak[1],$inout1
	movdqa	@tweak[3],@tweak[1]
	movups	$inout0,($out)			# store 2 output blocks
	movups	$inout1,16*1($out)
	lea	16*2($out),$out			# $out+=2*16
	jmp	.Lxts_dec_done

.align	16
.Lxts_dec_three:
	movups	($inp),$inout0
	movups	16*1($inp),$inout1
	movups	16*2($inp),$inout2
	lea	16*3($inp),$inp			# $inp+=3*16
	xorps	@tweak[0],$inout0
	xorps	@tweak[1],$inout1
	xorps	@tweak[2],$inout2

	call	_aesni_decrypt3

	xorps	@tweak[0],$inout0
	movdqa	@tweak[3],@tweak[0]
	xorps	@tweak[1],$inout1
	movdqa	@tweak[4],@tweak[1]
	xorps	@tweak[2],$inout2
	movups	$inout0,($out)			# store 3 output blocks
	movups	$inout1,16*1($out)
	movups	$inout2,16*2($out)
	lea	16*3($out),$out			# $out+=3*16
	jmp	.Lxts_dec_done

.align	16
.Lxts_dec_four:
	movups	($inp),$inout0
	movups	16*1($inp),$inout1
	movups	16*2($inp),$inout2
	xorps	@tweak[0],$inout0
	movups	16*3($inp),$inout3
	lea	16*4($inp),$inp			# $inp+=4*16
	xorps	@tweak[1],$inout1
	xorps	@tweak[2],$inout2
	xorps	@tweak[3],$inout3

	call	_aesni_decrypt4

	pxor	@tweak[0],$inout0
	movdqa	@tweak[4],@tweak[0]
	pxor	@tweak[1],$inout1
	movdqa	@tweak[5],@tweak[1]
	pxor	@tweak[2],$inout2
	movdqu	$inout0,($out)			# store 4 output blocks
	pxor	@tweak[3],$inout3
	movdqu	$inout1,16*1($out)
	movdqu	$inout2,16*2($out)
	movdqu	$inout3,16*3($out)
	lea	16*4($out),$out			# $out+=4*16
	jmp	.Lxts_dec_done

.align	16
.Lxts_dec_done:
	and	\$15,$len_			# see if $len%16 is 0
	jz	.Lxts_dec_ret
.Lxts_dec_done2:
	mov	$len_,$len
	mov	$key_,$key			# restore $key
	mov	$rnds_,$rounds			# restore $rounds

	movups	($inp),$inout0
	xorps	@tweak[1],$inout0
___
	&aesni_generate1("dec",$key,$rounds);
$code.=<<___;
	xorps	@tweak[1],$inout0
	movups	$inout0,($out)

	# per byte: the trailing input byte replaces a byte of the block just
	# stored at out, and the byte it replaces is emitted at out+16
.Lxts_dec_steal:
	movzb	16($inp),%eax			# borrow $rounds ...
	movzb	($out),%ecx			# ... and $key
	lea	1($inp),$inp
	mov	%al,($out)
	mov	%cl,16($out)
	lea	1($out),$out
	sub	\$1,$len
	jnz	.Lxts_dec_steal

	sub	$len_,$out			# rewind $out
	mov	$key_,$key			# restore $key
	mov	$rnds_,$rounds			# restore $rounds

	movups	($out),$inout0
	xorps	@tweak[0],$inout0
___
	&aesni_generate1("dec",$key,$rounds);
$code.=<<___;
	xorps	@tweak[0],$inout0
	movups	$inout0,($out)

.Lxts_dec_ret:
	xorps	%xmm0,%xmm0			# clear register bank
	pxor	%xmm1,%xmm1
	pxor	%xmm2,%xmm2
	pxor	%xmm3,%xmm3
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
___
# Non-Win64: zero xmm6-xmm15 and the 0x00-0x60 stack slots.
$code.=<<___ if (!$win64);
	pxor	%xmm6,%xmm6
	pxor	%xmm7,%xmm7
	movaps	%xmm0,0x00(%rsp)		# clear stack
	pxor	%xmm8,%xmm8
	movaps	%xmm0,0x10(%rsp)
	pxor	%xmm9,%xmm9
	movaps	%xmm0,0x20(%rsp)
	pxor	%xmm10,%xmm10
	movaps	%xmm0,0x30(%rsp)
	pxor	%xmm11,%xmm11
	movaps	%xmm0,0x40(%rsp)
	pxor	%xmm12,%xmm12
	movaps	%xmm0,0x50(%rsp)
	pxor	%xmm13,%xmm13
	movaps	%xmm0,0x60(%rsp)
	pxor	%xmm14,%xmm14
	pxor	%xmm15,%xmm15
___
# Win64: reload xmm6-xmm15 from the frame, zeroing each save slot as it is
# read, then zero the 0x00-0x60 stack slots.
$code.=<<___ if ($win64);
	movaps	-0xa8(%r11),%xmm6
	movaps	%xmm0,-0xa8(%r11)		# clear stack
	movaps	-0x98(%r11),%xmm7
	movaps	%xmm0,-0x98(%r11)
	movaps	-0x88(%r11),%xmm8
	movaps	%xmm0,-0x88(%r11)
	movaps	-0x78(%r11),%xmm9
	movaps	%xmm0,-0x78(%r11)
	movaps	-0x68(%r11),%xmm10
	movaps	%xmm0,-0x68(%r11)
	movaps	-0x58(%r11),%xmm11
	movaps	%xmm0,-0x58(%r11)
	movaps	-0x48(%r11),%xmm12
	movaps	%xmm0,-0x48(%r11)
	movaps	-0x38(%r11),%xmm13
	movaps	%xmm0,-0x38(%r11)
	movaps	-0x28(%r11),%xmm14
	movaps	%xmm0,-0x28(%r11)
	movaps	-0x18(%r11),%xmm15
	movaps	%xmm0,-0x18(%r11)
	movaps	%xmm0,0x00(%rsp)
	movaps	%xmm0,0x10(%rsp)
	movaps	%xmm0,0x20(%rsp)
	movaps	%xmm0,0x30(%rsp)
	movaps	%xmm0,0x40(%rsp)
	movaps	%xmm0,0x50(%rsp)
	movaps	%xmm0,0x60(%rsp)
___
$code.=<<___;
	mov	-8(%r11),%rbp
.cfi_restore	%rbp
	lea	(%r11),%rsp
.cfi_def_cfa_register	%rsp
.Lxts_dec_epilogue:
	ret
.cfi_endproc
.size	${PREFIX}_xts_decrypt,.-${PREFIX}_xts_decrypt
___
}

######################################################################
# void aesni_ocb_[en|de]crypt(const char *inp, char *out, size_t blocks,
#	const AES_KEY *key, unsigned int start_block_num,
#	unsigned char offset_i[16], const unsigned char L_[][16],
#	unsigned char checksum[16]);
#
{
my @offset=map("%xmm$_",(10..15));
my ($checksum,$rndkey0l)=("%xmm8","%xmm9");
my ($block_num,$offset_p)=("%r8","%r9");		# 5th and 6th arguments
my ($L_p,$checksum_p) = ("%rbx","%rbp");
my ($i1,$i3,$i5) = ("%r12","%r13","%r14");
my $seventh_arg = $win64 ?
56 : 8;
my $blocks = $len;

# ${PREFIX}_ocb_encrypt -- OCB encryption entry point.  As laid out below it
#   - saves rbx, rbp and r12-r14 (plus xmm6-xmm15 on Win64);
#   - fetches the 7th and 8th arguments from the caller's stack;
#   - encrypts one leading block on its own when the starting block number
#     is even, so that the bulk code always starts on an odd block number;
#   - encrypts six blocks per .Locb_enc_grandloop iteration and 1..5
#     leftover blocks through the __ocb_encrypt6/4/1 helpers;
#   - stores the checksum and the last offset back through their pointers
#     and zeroes the xmm registers before returning.
$code.=<<___;
.globl	${PREFIX}_ocb_encrypt
.type	${PREFIX}_ocb_encrypt,\@function,6
.align	32
${PREFIX}_ocb_encrypt:
.cfi_startproc
	lea	(%rsp),%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
___
# Win64 only: reserve 0xa0 bytes and spill xmm6-xmm15 there.
$code.=<<___ if ($win64);
	lea	-0xa0(%rsp),%rsp
	movaps	%xmm6,0x00(%rsp)		# offload everything
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
	movaps	%xmm10,0x40(%rsp)
	movaps	%xmm11,0x50(%rsp)
	movaps	%xmm12,0x60(%rsp)
	movaps	%xmm13,0x70(%rsp)
	movaps	%xmm14,0x80(%rsp)
	movaps	%xmm15,0x90(%rsp)
.Locb_enc_body:
___
$code.=<<___;
	mov	$seventh_arg(%rax),$L_p		# 7th argument
	mov	$seventh_arg+8(%rax),$checksum_p# 8th argument

	mov	240($key),$rnds_
	mov	$key,$key_
	shl	\$4,$rnds_
	$movkey	($key),$rndkey0l		# round[0]
	$movkey	16($key,$rnds_),$rndkey1	# round[last]

	movdqu	($offset_p),@offset[5]		# load last offset_i
	pxor	$rndkey1,$rndkey0l		# round[0] ^ round[last]
	pxor	$rndkey1,@offset[5]		# offset_i ^ round[last]

	mov	\$16+32,$rounds
	lea	32($key_,$rnds_),$key
	$movkey	16($key_),$rndkey1		# round[1]
	sub	%r10,%rax			# twisted $rounds
	mov	%rax,%r10			# backup twisted $rounds

	movdqu	($L_p),@offset[0]		# L_0 for all odd-numbered blocks
	movdqu	($checksum_p),$checksum		# load checksum

	test	\$1,$block_num			# is first block number odd?
	jnz	.Locb_enc_odd

	bsf	$block_num,$i1
	add	\$1,$block_num
	shl	\$4,$i1
	movdqu	($L_p,$i1),$inout5		# borrow
	movdqu	($inp),$inout0
	lea	16($inp),$inp

	call	__ocb_encrypt1

	movdqa	$inout5,@offset[5]
	movups	$inout0,($out)
	lea	16($out),$out
	sub	\$1,$blocks
	jz	.Locb_enc_done

.Locb_enc_odd:
	lea	1($block_num),$i1		# even-numbered blocks
	lea	3($block_num),$i3
	lea	5($block_num),$i5
	lea	6($block_num),$block_num
	bsf	$i1,$i1				# ntz(block)
	bsf	$i3,$i3
	bsf	$i5,$i5
	shl	\$4,$i1				# ntz(block) -> table offset
	shl	\$4,$i3
	shl	\$4,$i5

	sub	\$6,$blocks
	jc	.Locb_enc_short
	jmp	.Locb_enc_grandloop

.align	32
.Locb_enc_grandloop:
	movdqu	`16*0`($inp),$inout0		# load input
	movdqu	`16*1`($inp),$inout1
	movdqu	`16*2`($inp),$inout2
	movdqu	`16*3`($inp),$inout3
	movdqu	`16*4`($inp),$inout4
	movdqu	`16*5`($inp),$inout5
	lea	`16*6`($inp),$inp

	call	__ocb_encrypt6

	movups	$inout0,`16*0`($out)		# store output
	movups	$inout1,`16*1`($out)
	movups	$inout2,`16*2`($out)
	movups	$inout3,`16*3`($out)
	movups	$inout4,`16*4`($out)
	movups	$inout5,`16*5`($out)
	lea	`16*6`($out),$out
	sub	\$6,$blocks
	jnc	.Locb_enc_grandloop

.Locb_enc_short:
	add	\$6,$blocks
	jz	.Locb_enc_done

	movdqu	`16*0`($inp),$inout0
	cmp	\$2,$blocks
	jb	.Locb_enc_one
	movdqu	`16*1`($inp),$inout1
	je	.Locb_enc_two

	movdqu	`16*2`($inp),$inout2
	cmp	\$4,$blocks
	jb	.Locb_enc_three
	movdqu	`16*3`($inp),$inout3
	je	.Locb_enc_four

	movdqu	`16*4`($inp),$inout4
	pxor	$inout5,$inout5			# unused lane zeroed: leaves checksum untouched

	call	__ocb_encrypt6

	movdqa	@offset[4],@offset[5]
	movups	$inout0,`16*0`($out)
	movups	$inout1,`16*1`($out)
	movups	$inout2,`16*2`($out)
	movups	$inout3,`16*3`($out)
	movups	$inout4,`16*4`($out)

	jmp	.Locb_enc_done

.align	16
.Locb_enc_one:
	movdqa	@offset[0],$inout5		# borrow

	call	__ocb_encrypt1

	movdqa	$inout5,@offset[5]
	movups	$inout0,`16*0`($out)
	jmp	.Locb_enc_done

.align	16
.Locb_enc_two:
	pxor	$inout2,$inout2			# unused lanes zeroed: leave checksum untouched
	pxor	$inout3,$inout3

	call	__ocb_encrypt4

	movdqa	@offset[1],@offset[5]
	movups	$inout0,`16*0`($out)
	movups	$inout1,`16*1`($out)

	jmp	.Locb_enc_done

.align	16
.Locb_enc_three:
	pxor	$inout3,$inout3			# unused lane zeroed: leaves checksum untouched

	call	__ocb_encrypt4

	movdqa	@offset[2],@offset[5]
	movups	$inout0,`16*0`($out)
	movups	$inout1,`16*1`($out)
	movups	$inout2,`16*2`($out)

	jmp	.Locb_enc_done

.align	16
.Locb_enc_four:
	call	__ocb_encrypt4

	movdqa	@offset[3],@offset[5]
	movups	$inout0,`16*0`($out)
	movups	$inout1,`16*1`($out)
	movups	$inout2,`16*2`($out)
	movups	$inout3,`16*3`($out)

.Locb_enc_done:
	pxor	$rndkey0,@offset[5]		# "remove" round[last]
	movdqu	$checksum,($checksum_p)		# store checksum
	movdqu	@offset[5],($offset_p)		# store last offset_i

	xorps	%xmm0,%xmm0			# clear register bank
	pxor	%xmm1,%xmm1
	pxor	%xmm2,%xmm2
	pxor	%xmm3,%xmm3
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
___
# Non-Win64: zero xmm6-xmm15 and point %rax just above the five saved
# general-purpose registers.
$code.=<<___ if (!$win64);
	pxor	%xmm6,%xmm6
	pxor	%xmm7,%xmm7
	pxor	%xmm8,%xmm8
	pxor	%xmm9,%xmm9
	pxor	%xmm10,%xmm10
	pxor	%xmm11,%xmm11
	pxor	%xmm12,%xmm12
	pxor	%xmm13,%xmm13
	pxor	%xmm14,%xmm14
	pxor	%xmm15,%xmm15
	lea	0x28(%rsp),%rax
.cfi_def_cfa	%rax,8
___
# Win64: reload xmm6-xmm15, zeroing each save slot as it is read, and
# point %rax just above the five saved general-purpose registers.
$code.=<<___ if ($win64);
	movaps	0x00(%rsp),%xmm6
	movaps	%xmm0,0x00(%rsp)		# clear stack
	movaps	0x10(%rsp),%xmm7
	movaps	%xmm0,0x10(%rsp)
	movaps	0x20(%rsp),%xmm8
	movaps	%xmm0,0x20(%rsp)
	movaps	0x30(%rsp),%xmm9
	movaps	%xmm0,0x30(%rsp)
	movaps	0x40(%rsp),%xmm10
	movaps	%xmm0,0x40(%rsp)
	movaps	0x50(%rsp),%xmm11
	movaps	%xmm0,0x50(%rsp)
	movaps	0x60(%rsp),%xmm12
	movaps	%xmm0,0x60(%rsp)
	movaps	0x70(%rsp),%xmm13
	movaps	%xmm0,0x70(%rsp)
	movaps	0x80(%rsp),%xmm14
	movaps	%xmm0,0x80(%rsp)
	movaps	0x90(%rsp),%xmm15
	movaps	%xmm0,0x90(%rsp)
	lea	0xa0+0x28(%rsp),%rax
.Locb_enc_pop:
___
$code.=<<___;
	mov	-40(%rax),%r14
.cfi_restore	%r14
	mov	-32(%rax),%r13
.cfi_restore	%r13
	mov	-24(%rax),%r12
.cfi_restore	%r12
	mov	-16(%rax),%rbp
.cfi_restore	%rbp
	mov	-8(%rax),%rbx
.cfi_restore	%rbx
	lea	(%rax),%rsp
.cfi_def_cfa_register	%rsp
.Locb_enc_epilogue:
	ret
.cfi_endproc
.size	${PREFIX}_ocb_encrypt,.-${PREFIX}_ocb_encrypt

# Six-block worker, called from the grand loop and from the five-block tail.
# It folds the six inputs into the checksum, builds the per-block offsets
# from the L_ table, and prepares the table indices for the next call.
.type	__ocb_encrypt6,\@abi-omnipotent
.align	32
__ocb_encrypt6:
	pxor	$rndkey0l,@offset[5]		# offset_i ^ round[0]
	movdqu	($L_p,$i1),@offset[1]
	movdqa	@offset[0],@offset[2]
	movdqu	($L_p,$i3),@offset[3]
	movdqa	@offset[0],@offset[4]
	pxor	@offset[5],@offset[0]
	movdqu	($L_p,$i5),@offset[5]
	pxor	@offset[0],@offset[1]
	pxor	$inout0,$checksum		# accumulate checksum
	pxor	@offset[0],$inout0		# input ^ round[0] ^ offset_i
	pxor	@offset[1],@offset[2]
	pxor	$inout1,$checksum
	pxor	@offset[1],$inout1
	pxor	@offset[2],@offset[3]
	pxor	$inout2,$checksum
	pxor	@offset[2],$inout2
	pxor	@offset[3],@offset[4]
	pxor	$inout3,$checksum
	pxor	@offset[3],$inout3
	pxor	@offset[4],@offset[5]
	pxor	$inout4,$checksum
	pxor	@offset[4],$inout4
	pxor	$inout5,$checksum
	pxor	@offset[5],$inout5
	$movkey	32($key_),$rndkey0

	lea	1($block_num),$i1		# even-numbered blocks
	lea	3($block_num),$i3
	lea	5($block_num),$i5
	add	\$6,$block_num
	pxor	$rndkey0l,@offset[0]		# offset_i ^ round[last]
	bsf	$i1,$i1				# ntz(block)
	bsf	$i3,$i3
	bsf	$i5,$i5

	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	aesenc	$rndkey1,$inout2
	aesenc	$rndkey1,$inout3
	pxor	$rndkey0l,@offset[1]
	pxor	$rndkey0l,@offset[2]
	aesenc	$rndkey1,$inout4
	pxor	$rndkey0l,@offset[3]
	pxor	$rndkey0l,@offset[4]
	aesenc	$rndkey1,$inout5
	$movkey	48($key_),$rndkey1
	pxor	$rndkey0l,@offset[5]

	aesenc	$rndkey0,$inout0
	aesenc	$rndkey0,$inout1
	aesenc	$rndkey0,$inout2
	aesenc	$rndkey0,$inout3
	aesenc	$rndkey0,$inout4
	aesenc	$rndkey0,$inout5
	$movkey	64($key_),$rndkey0
	shl	\$4,$i1				# ntz(block) -> table offset
	shl	\$4,$i3
	jmp	.Locb_enc_loop6

.align	32
.Locb_enc_loop6:
	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	aesenc	$rndkey1,$inout2
	aesenc	$rndkey1,$inout3
	aesenc	$rndkey1,$inout4
	aesenc	$rndkey1,$inout5
	$movkey	($key,%rax),$rndkey1
	add	\$32,%rax

	aesenc	$rndkey0,$inout0
	aesenc	$rndkey0,$inout1
	aesenc	$rndkey0,$inout2
	aesenc	$rndkey0,$inout3
	aesenc	$rndkey0,$inout4
	aesenc	$rndkey0,$inout5
	$movkey	-16($key,%rax),$rndkey0
	jnz	.Locb_enc_loop6

	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	aesenc	$rndkey1,$inout2
	aesenc	$rndkey1,$inout3
	aesenc	$rndkey1,$inout4
	aesenc	$rndkey1,$inout5
	$movkey	16($key_),$rndkey1
	shl	\$4,$i5

	aesenclast	@offset[0],$inout0
	movdqu	($L_p),@offset[0]		# L_0 for all odd-numbered blocks
	mov	%r10,%rax			# restore twisted rounds
	aesenclast	@offset[1],$inout1
	aesenclast	@offset[2],$inout2
	aesenclast	@offset[3],$inout3
	aesenclast	@offset[4],$inout4
	aesenclast	@offset[5],$inout5
	ret
.size	__ocb_encrypt6,.-__ocb_encrypt6

# Four-block worker, called for the two-, three- and four-block tails.
.type	__ocb_encrypt4,\@abi-omnipotent
.align	32
__ocb_encrypt4:
	pxor	$rndkey0l,@offset[5]		# offset_i ^ round[0]
	movdqu	($L_p,$i1),@offset[1]
	movdqa	@offset[0],@offset[2]
	movdqu	($L_p,$i3),@offset[3]
	pxor	@offset[5],@offset[0]
	pxor	@offset[0],@offset[1]
	pxor	$inout0,$checksum		# accumulate checksum
	pxor	@offset[0],$inout0		# input ^ round[0] ^ offset_i
	pxor	@offset[1],@offset[2]
	pxor	$inout1,$checksum
	pxor	@offset[1],$inout1
	pxor	@offset[2],@offset[3]
	pxor	$inout2,$checksum
	pxor	@offset[2],$inout2
	pxor	$inout3,$checksum
	pxor	@offset[3],$inout3
	$movkey	32($key_),$rndkey0

	pxor	$rndkey0l,@offset[0]		# offset_i ^ round[last]
	pxor	$rndkey0l,@offset[1]
	pxor	$rndkey0l,@offset[2]
	pxor	$rndkey0l,@offset[3]

	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	aesenc	$rndkey1,$inout2
	aesenc	$rndkey1,$inout3
	$movkey	48($key_),$rndkey1

	aesenc	$rndkey0,$inout0
	aesenc	$rndkey0,$inout1
	aesenc	$rndkey0,$inout2
	aesenc	$rndkey0,$inout3
	$movkey	64($key_),$rndkey0
	jmp	.Locb_enc_loop4

.align	32
.Locb_enc_loop4:
	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	aesenc	$rndkey1,$inout2
	aesenc	$rndkey1,$inout3
	$movkey	($key,%rax),$rndkey1
	add	\$32,%rax

	aesenc	$rndkey0,$inout0
	aesenc	$rndkey0,$inout1
	aesenc	$rndkey0,$inout2
	aesenc	$rndkey0,$inout3
	$movkey	-16($key,%rax),$rndkey0
	jnz	.Locb_enc_loop4

	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	aesenc	$rndkey1,$inout2
	aesenc	$rndkey1,$inout3
	$movkey	16($key_),$rndkey1
	mov	%r10,%rax			# restore twisted rounds

	aesenclast	@offset[0],$inout0
	aesenclast	@offset[1],$inout1
	aesenclast	@offset[2],$inout2
	aesenclast	@offset[3],$inout3
	ret
.size	__ocb_encrypt4,.-__ocb_encrypt4

# Single-block worker.  The caller passes the L_ table entry in the sixth
# data register and reads the updated offset back from that same register.
.type	__ocb_encrypt1,\@abi-omnipotent
.align	32
__ocb_encrypt1:
	pxor	@offset[5],$inout5		# offset_i
	pxor	$rndkey0l,$inout5		# offset_i ^ round[0]
	pxor	$inout0,$checksum		# accumulate checksum
	pxor	$inout5,$inout0			# input ^ round[0] ^ offset_i
	$movkey	32($key_),$rndkey0

	aesenc	$rndkey1,$inout0
	$movkey	48($key_),$rndkey1
	pxor	$rndkey0l,$inout5		# offset_i ^ round[last]

	aesenc	$rndkey0,$inout0
	$movkey	64($key_),$rndkey0
	jmp	.Locb_enc_loop1

.align	32
.Locb_enc_loop1:
	aesenc	$rndkey1,$inout0
	$movkey	($key,%rax),$rndkey1
	add	\$32,%rax

	aesenc	$rndkey0,$inout0
	$movkey	-16($key,%rax),$rndkey0
	jnz	.Locb_enc_loop1

	aesenc	$rndkey1,$inout0
	$movkey	16($key_),$rndkey1		# redundant in tail
	mov	%r10,%rax			# restore twisted rounds

	aesenclast	$inout5,$inout0
	ret
.size	__ocb_encrypt1,.-__ocb_encrypt1

.globl	${PREFIX}_ocb_decrypt
.type	${PREFIX}_ocb_decrypt,\@function,6
.align	32
${PREFIX}_ocb_decrypt:
.cfi_startproc
	lea	(%rsp),%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
___
$code.=<<___ if ($win64);
	lea	-0xa0(%rsp),%rsp
	movaps	%xmm6,0x00(%rsp)		# offload everything
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
	movaps	%xmm10,0x40(%rsp)
	movaps	%xmm11,0x50(%rsp)
	movaps	%xmm12,0x60(%rsp)
	movaps	%xmm13,0x70(%rsp)
	movaps	%xmm14,0x80(%rsp)
	movaps	%xmm15,0x90(%rsp)
.Locb_dec_body:
___
$code.=<<___;
	mov	$seventh_arg(%rax),$L_p		# 7th argument
	mov	$seventh_arg+8(%rax),$checksum_p# 8th argument

	mov	240($key),$rnds_
	mov	$key,$key_
	shl	\$4,$rnds_
	$movkey	($key),$rndkey0l		# round[0]
	$movkey	16($key,$rnds_),$rndkey1	# round[last]

	movdqu	($offset_p),@offset[5]		# load last offset_i
	pxor	$rndkey1,$rndkey0l		# round[0] ^ round[last]
	pxor	$rndkey1,@offset[5]		# offset_i ^ round[last]

	mov	\$16+32,$rounds
	lea	32($key_,$rnds_),$key
	$movkey	16($key_),$rndkey1		# round[1]
	sub	%r10,%rax			# twisted $rounds
	mov	%rax,%r10			# backup twisted $rounds

	movdqu	($L_p),@offset[0]	# L_0 for all odd-numbered blocks
	movdqu	($checksum_p),$checksum	# load checksum

	test	\$1,$block_num	# is first block number odd?
	jnz	.Locb_dec_odd

	# First block number is even: handle that block on its own so
	# that the bulk code below always starts on an odd block number.
	bsf	$block_num,$i1
	add	\$1,$block_num
	shl	\$4,$i1
	movdqu	($L_p,$i1),$inout5	# borrow
	movdqu	($inp),$inout0
	lea	16($inp),$inp

	call	__ocb_decrypt1

	movdqa	$inout5,@offset[5]
	movups	$inout0,($out)
	xorps	$inout0,$checksum	# accumulate checksum
	lea	16($out),$out
	sub	\$1,$blocks
	jz	.Locb_dec_done

.Locb_dec_odd:
	lea	1($block_num),$i1	# even-numbered blocks
	lea	3($block_num),$i3
	lea	5($block_num),$i5
	lea	6($block_num),$block_num
	bsf	$i1,$i1	# ntz(block)
	bsf	$i3,$i3
	bsf	$i5,$i5
	shl	\$4,$i1	# ntz(block) -> table offset
	shl	\$4,$i3
	shl	\$4,$i5

	sub	\$6,$blocks
	jc	.Locb_dec_short
	jmp	.Locb_dec_grandloop

.align	32
.Locb_dec_grandloop:
	# main loop: six blocks per iteration
	movdqu	`16*0`($inp),$inout0	# load input
	movdqu	`16*1`($inp),$inout1
	movdqu	`16*2`($inp),$inout2
	movdqu	`16*3`($inp),$inout3
	movdqu	`16*4`($inp),$inout4
	movdqu	`16*5`($inp),$inout5
	lea	`16*6`($inp),$inp

	call	__ocb_decrypt6

	movups	$inout0,`16*0`($out)	# store output
	pxor	$inout0,$checksum	# accumulate checksum
	movups	$inout1,`16*1`($out)
	pxor	$inout1,$checksum
	movups	$inout2,`16*2`($out)
	pxor	$inout2,$checksum
	movups	$inout3,`16*3`($out)
	pxor	$inout3,$checksum
	movups	$inout4,`16*4`($out)
	pxor	$inout4,$checksum
	movups	$inout5,`16*5`($out)
	pxor	$inout5,$checksum
	lea	`16*6`($out),$out
	sub	\$6,$blocks
	jnc	.Locb_dec_grandloop

.Locb_dec_short:
	# fewer than six blocks left: undo the bias and dispatch on the
	# remaining count; unused lanes are zeroed before the helper
	# is called and only the live lanes are stored afterwards
	add	\$6,$blocks
	jz	.Locb_dec_done

	movdqu	`16*0`($inp),$inout0
	cmp	\$2,$blocks
	jb	.Locb_dec_one
	movdqu	`16*1`($inp),$inout1
	je	.Locb_dec_two

	movdqu	`16*2`($inp),$inout2
	cmp	\$4,$blocks
	jb	.Locb_dec_three
	movdqu	`16*3`($inp),$inout3
	je	.Locb_dec_four

	movdqu	`16*4`($inp),$inout4
	pxor	$inout5,$inout5

	call	__ocb_decrypt6

	movdqa	@offset[4],@offset[5]	# offset of the last live block is kept
	movups	$inout0,`16*0`($out)	# store output
	pxor	$inout0,$checksum	# accumulate checksum
	movups	$inout1,`16*1`($out)
	pxor	$inout1,$checksum
	movups	$inout2,`16*2`($out)
	pxor	$inout2,$checksum
	movups	$inout3,`16*3`($out)
	pxor	$inout3,$checksum
	movups	$inout4,`16*4`($out)
	pxor	$inout4,$checksum

	jmp	.Locb_dec_done

.align	16
.Locb_dec_one:
	movdqa	@offset[0],$inout5	# borrow

	call	__ocb_decrypt1

	movdqa	$inout5,@offset[5]
	movups	$inout0,`16*0`($out)	# store output
	xorps	$inout0,$checksum	# accumulate checksum
	jmp	.Locb_dec_done

.align	16
.Locb_dec_two:
	pxor	$inout2,$inout2
	pxor	$inout3,$inout3

	call	__ocb_decrypt4

	movdqa	@offset[1],@offset[5]	# offset of the last live block is kept
	movups	$inout0,`16*0`($out)	# store output
	xorps	$inout0,$checksum	# accumulate checksum
	movups	$inout1,`16*1`($out)
	xorps	$inout1,$checksum

	jmp	.Locb_dec_done

.align	16
.Locb_dec_three:
	pxor	$inout3,$inout3

	call	__ocb_decrypt4

	movdqa	@offset[2],@offset[5]	# offset of the last live block is kept
	movups	$inout0,`16*0`($out)	# store output
	xorps	$inout0,$checksum	# accumulate checksum
	movups	$inout1,`16*1`($out)
	xorps	$inout1,$checksum
	movups	$inout2,`16*2`($out)
	xorps	$inout2,$checksum

	jmp	.Locb_dec_done

.align	16
.Locb_dec_four:
	call	__ocb_decrypt4

	movdqa	@offset[3],@offset[5]	# offset of the last live block is kept
	movups	$inout0,`16*0`($out)	# store output
	pxor	$inout0,$checksum	# accumulate checksum
	movups	$inout1,`16*1`($out)
	pxor	$inout1,$checksum
	movups	$inout2,`16*2`($out)
	pxor	$inout2,$checksum
	movups	$inout3,`16*3`($out)
	pxor	$inout3,$checksum

.Locb_dec_done:
	pxor	$rndkey0,@offset[5]	# "remove" round[last]
	movdqu	$checksum,($checksum_p)	# store checksum
	movdqu	@offset[5],($offset_p)	# store last offset_i

	xorps	%xmm0,%xmm0	# clear register bank
	pxor	%xmm1,%xmm1
	pxor	%xmm2,%xmm2
	pxor	%xmm3,%xmm3
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
___
$code.=<<___ if (!$win64);
	pxor	%xmm6,%xmm6
	pxor	%xmm7,%xmm7
	pxor	%xmm8,%xmm8
	pxor	%xmm9,%xmm9
	pxor	%xmm10,%xmm10
	pxor	%xmm11,%xmm11
	pxor	%xmm12,%xmm12
	pxor	%xmm13,%xmm13
	pxor	%xmm14,%xmm14
	pxor	%xmm15,%xmm15
	lea	0x28(%rsp),%rax
.cfi_def_cfa	%rax,8
___
$code.=<<___ if ($win64);
	movaps	0x00(%rsp),%xmm6
	movaps	%xmm0,0x00(%rsp)	# clear stack
	movaps	0x10(%rsp),%xmm7
	movaps	%xmm0,0x10(%rsp)
	movaps	0x20(%rsp),%xmm8
	movaps	%xmm0,0x20(%rsp)
	movaps	0x30(%rsp),%xmm9
	movaps	%xmm0,0x30(%rsp)
	movaps	0x40(%rsp),%xmm10
	movaps	%xmm0,0x40(%rsp)
	movaps	0x50(%rsp),%xmm11
	movaps	%xmm0,0x50(%rsp)
	movaps	0x60(%rsp),%xmm12
	movaps	%xmm0,0x60(%rsp)
	movaps	0x70(%rsp),%xmm13
	movaps	%xmm0,0x70(%rsp)
	movaps	0x80(%rsp),%xmm14
	movaps	%xmm0,0x80(%rsp)
	movaps	0x90(%rsp),%xmm15
	movaps	%xmm0,0x90(%rsp)
	lea	0xa0+0x28(%rsp),%rax
.Locb_dec_pop:
___
$code.=<<___;
	# rax again equals the entry stack pointer; reload the five
	# pushed registers relative to it
	mov	-40(%rax),%r14
.cfi_restore	%r14
	mov	-32(%rax),%r13
.cfi_restore	%r13
	mov	-24(%rax),%r12
.cfi_restore	%r12
	mov	-16(%rax),%rbp
.cfi_restore	%rbp
	mov	-8(%rax),%rbx
.cfi_restore	%rbx
	lea	(%rax),%rsp
.cfi_def_cfa_register	%rsp
.Locb_dec_epilogue:
	ret
.cfi_endproc
.size	${PREFIX}_ocb_decrypt,.-${PREFIX}_ocb_decrypt

# __ocb_decrypt6 - decrypt six blocks in parallel.  The six per-block
# offsets are chained from the running offset, L_0 and the three
# L-table entries selected by the precomputed table offsets; the table
# offsets for the following six blocks are computed while the rounds
# execute.
.type	__ocb_decrypt6,\@abi-omnipotent
.align	32
__ocb_decrypt6:
	pxor	$rndkey0l,@offset[5]	# offset_i ^ round[0]
	movdqu	($L_p,$i1),@offset[1]
	movdqa	@offset[0],@offset[2]
	movdqu	($L_p,$i3),@offset[3]
	movdqa	@offset[0],@offset[4]
	pxor	@offset[5],@offset[0]
	movdqu	($L_p,$i5),@offset[5]
	pxor	@offset[0],@offset[1]
	pxor	@offset[0],$inout0	# input ^ round[0] ^ offset_i
	pxor	@offset[1],@offset[2]
	pxor	@offset[1],$inout1
	pxor	@offset[2],@offset[3]
	pxor	@offset[2],$inout2
	pxor	@offset[3],@offset[4]
	pxor	@offset[3],$inout3
	pxor	@offset[4],@offset[5]
	pxor	@offset[4],$inout4
	pxor	@offset[5],$inout5
	$movkey	32($key_),$rndkey0

	lea	1($block_num),$i1	# even-numbered blocks
	lea	3($block_num),$i3
	lea	5($block_num),$i5
	add	\$6,$block_num
	pxor	$rndkey0l,@offset[0]	# offset_i ^ round[last]
	bsf	$i1,$i1	# ntz(block)
	bsf	$i3,$i3
	bsf	$i5,$i5

	aesdec	$rndkey1,$inout0
	aesdec	$rndkey1,$inout1
	aesdec	$rndkey1,$inout2
	aesdec	$rndkey1,$inout3
	pxor	$rndkey0l,@offset[1]
	pxor	$rndkey0l,@offset[2]
	aesdec	$rndkey1,$inout4
	pxor	$rndkey0l,@offset[3]
	pxor	$rndkey0l,@offset[4]
	aesdec	$rndkey1,$inout5
	$movkey	48($key_),$rndkey1
	pxor	$rndkey0l,@offset[5]

	aesdec	$rndkey0,$inout0
	aesdec	$rndkey0,$inout1
	aesdec	$rndkey0,$inout2
	aesdec	$rndkey0,$inout3
	aesdec	$rndkey0,$inout4
	aesdec	$rndkey0,$inout5
	$movkey	64($key_),$rndkey0
	shl	\$4,$i1	# ntz(block) -> table offset
	shl	\$4,$i3
	jmp	.Locb_dec_loop6

.align	32
.Locb_dec_loop6:
	# two rounds per iteration; rax is stepped by 32 up to zero
	aesdec	$rndkey1,$inout0
	aesdec	$rndkey1,$inout1
	aesdec	$rndkey1,$inout2
	aesdec	$rndkey1,$inout3
	aesdec	$rndkey1,$inout4
	aesdec	$rndkey1,$inout5
	$movkey	($key,%rax),$rndkey1
	add	\$32,%rax

	aesdec	$rndkey0,$inout0
	aesdec	$rndkey0,$inout1
	aesdec	$rndkey0,$inout2
	aesdec	$rndkey0,$inout3
	aesdec	$rndkey0,$inout4
	aesdec	$rndkey0,$inout5
	$movkey	-16($key,%rax),$rndkey0
	jnz	.Locb_dec_loop6

	aesdec	$rndkey1,$inout0
	aesdec	$rndkey1,$inout1
	aesdec	$rndkey1,$inout2
	aesdec	$rndkey1,$inout3
	aesdec	$rndkey1,$inout4
	aesdec	$rndkey1,$inout5
	$movkey	16($key_),$rndkey1
	shl	\$4,$i5

	aesdeclast	@offset[0],$inout0
	movdqu	($L_p),@offset[0]	# L_0 for all odd-numbered blocks
	mov	%r10,%rax	# restore twisted rounds
	aesdeclast	@offset[1],$inout1
	aesdeclast	@offset[2],$inout2
	aesdeclast	@offset[3],$inout3
	aesdeclast	@offset[4],$inout4
	aesdeclast	@offset[5],$inout5
	ret
.size	__ocb_decrypt6,.-__ocb_decrypt6

# __ocb_decrypt4 - decrypt up to four blocks in parallel.  Same offset
# chaining as the six-block helper, but it neither advances the block
# number nor recomputes the table offsets, and it does not reload L_0;
# the callers in this file use it only for the final 2..4 blocks.
.type	__ocb_decrypt4,\@abi-omnipotent
.align	32
__ocb_decrypt4:
	pxor	$rndkey0l,@offset[5]	# offset_i ^ round[0]
	movdqu	($L_p,$i1),@offset[1]
	movdqa	@offset[0],@offset[2]
	movdqu	($L_p,$i3),@offset[3]
	pxor	@offset[5],@offset[0]
	pxor	@offset[0],@offset[1]
	pxor	@offset[0],$inout0	# input ^ round[0] ^ offset_i
	pxor	@offset[1],@offset[2]
	pxor	@offset[1],$inout1
	pxor	@offset[2],@offset[3]
	pxor	@offset[2],$inout2
	pxor	@offset[3],$inout3
	$movkey	32($key_),$rndkey0

	pxor	$rndkey0l,@offset[0]	# offset_i ^ round[last]
	pxor	$rndkey0l,@offset[1]
	pxor	$rndkey0l,@offset[2]
	pxor	$rndkey0l,@offset[3]

	aesdec	$rndkey1,$inout0
	aesdec	$rndkey1,$inout1
	aesdec	$rndkey1,$inout2
	aesdec	$rndkey1,$inout3
	$movkey	48($key_),$rndkey1

	aesdec	$rndkey0,$inout0
	aesdec	$rndkey0,$inout1
	aesdec	$rndkey0,$inout2
	aesdec	$rndkey0,$inout3
	$movkey	64($key_),$rndkey0
	jmp	.Locb_dec_loop4

.align	32
.Locb_dec_loop4:
	aesdec	$rndkey1,$inout0
	aesdec	$rndkey1,$inout1
	aesdec	$rndkey1,$inout2
	aesdec	$rndkey1,$inout3
	$movkey	($key,%rax),$rndkey1
	add	\$32,%rax

	aesdec	$rndkey0,$inout0
	aesdec	$rndkey0,$inout1
	aesdec	$rndkey0,$inout2
	aesdec	$rndkey0,$inout3
	$movkey	-16($key,%rax),$rndkey0
	jnz	.Locb_dec_loop4

	aesdec	$rndkey1,$inout0
	aesdec	$rndkey1,$inout1
	aesdec	$rndkey1,$inout2
	aesdec	$rndkey1,$inout3
	$movkey	16($key_),$rndkey1
	mov	%r10,%rax	# restore twisted rounds

	aesdeclast	@offset[0],$inout0
	aesdeclast	@offset[1],$inout1
	aesdeclast	@offset[2],$inout2
	aesdeclast	@offset[3],$inout3
	ret
.size	__ocb_decrypt4,.-__ocb_decrypt4

# __ocb_decrypt1 - decrypt one block.  The caller loads the sixth data
# register with the L-table entry for the block ("borrow"); on return
# it holds the new offset ^ round[last], which the caller copies back
# into the running offset.  The checksum is not touched here; callers
# add the decrypted output to it.
.type	__ocb_decrypt1,\@abi-omnipotent
.align	32
__ocb_decrypt1:
	pxor	@offset[5],$inout5	# offset_i
	pxor	$rndkey0l,$inout5	# offset_i ^ round[0]
	pxor	$inout5,$inout0	# input ^ round[0] ^ offset_i
	$movkey	32($key_),$rndkey0

	aesdec	$rndkey1,$inout0
	$movkey	48($key_),$rndkey1
	pxor	$rndkey0l,$inout5	# offset_i ^ round[last]

	aesdec	$rndkey0,$inout0
	$movkey	64($key_),$rndkey0
	jmp	.Locb_dec_loop1

.align	32
.Locb_dec_loop1:
	aesdec	$rndkey1,$inout0
	$movkey	($key,%rax),$rndkey1
	add	\$32,%rax

	aesdec	$rndkey0,$inout0
	$movkey	-16($key,%rax),$rndkey0
	jnz	.Locb_dec_loop1

	aesdec	$rndkey1,$inout0
	$movkey	16($key_),$rndkey1	# redundant in tail
	mov	%r10,%rax	# restore twisted rounds

	aesdeclast	$inout5,$inout0
	ret
.size	__ocb_decrypt1,.-__ocb_decrypt1
___
} }}

########################################################################
# void $PREFIX_cbc_encrypt (const void *inp, void *out,
#			    size_t length, const AES_KEY *key,
#			    unsigned char *ivp,const int enc);
#
# A zero length returns immediately.  A zero 6th argument (enc) selects
# the decrypt path, anything else the encrypt path.  The decrypt path
# handles exactly one block without a stack frame and sets up a frame
# for every other length.
{
# 0x10 bytes of scratch at (%rsp) for a partial final block, plus room
# for %xmm6-%xmm15 at 0x10..0xa0 on Win64.
my $frame_size = 0x10 + ($win64?0xa0:0);	# used in decrypt
# Copies of the ciphertext blocks (and the IV) that are XORed into the
# decrypted output.
my ($iv,$in0,$in1,$in2,$in3,$in4)=map("%xmm$_",(10..15));

$code.=<<___;
.globl	${PREFIX}_cbc_encrypt
.type	${PREFIX}_cbc_encrypt,\@function,6
.align	16
${PREFIX}_cbc_encrypt:
.cfi_startproc
	test	$len,$len	# check length
	jz	.Lcbc_ret

	mov	240($key),$rnds_	# key->rounds
	mov	$key,$key_	# backup $key
	test	%r9d,%r9d	# 6th argument
	jz	.Lcbc_decrypt
#--------------------------- CBC ENCRYPT ------------------------------#
	movups	($ivp),$inout0	# load iv as initial state
	mov	$rnds_,$rounds
	cmp	\$16,$len
	jb	.Lcbc_enc_tail
	sub	\$16,$len	# bias the length by one block
	jmp	.Lcbc_enc_loop
.align	16
.Lcbc_enc_loop:
	movups	($inp),$inout1	# load input
	lea	16($inp),$inp
	#xorps	$inout1,$inout0
___
	&aesni_generate1("enc",$key,$rounds,$inout0,$inout1);
$code.=<<___;
	mov	$rnds_,$rounds	# restore $rounds
	mov	$key_,$key	# restore $key
	movups	$inout0,0($out)	# store output
	lea	16($out),$out
	sub	\$16,$len
	jnc	.Lcbc_enc_loop
	add	\$16,$len	# undo the bias; non-zero means a partial block
	jnz	.Lcbc_enc_tail
	pxor	$rndkey0,$rndkey0	# clear register bank
	pxor	$rndkey1,$rndkey1
	movups	$inout0,($ivp)	# last ciphertext block is the next iv
	pxor	$inout0,$inout0
	pxor	$inout1,$inout1
	jmp	.Lcbc_ret

.Lcbc_enc_tail:
	# Partial final block: copy the remaining input bytes to the
	# output buffer, zero-fill up to 16 bytes, then run the loop
	# once more with input and output both pointing at that block.
	mov	$len,%rcx	# zaps $key
	xchg	$inp,$out	# $inp is %rsi and $out is %rdi now
	.long	0x9066A4F3	# rep movsb
	mov	\$16,%ecx	# zero tail
	sub	$len,%rcx
	xor	%eax,%eax
	.long	0x9066AAF3	# rep stosb
	lea	-16(%rdi),%rdi	# rewind $out by 1 block
	mov	$rnds_,$rounds	# restore $rounds
	mov	%rdi,%rsi	# $inp and $out are the same
	mov	$key_,$key	# restore $key
	xor	$len,$len	# zero length: the loop body runs exactly once more
	jmp	.Lcbc_enc_loop	# one more spin
#--------------------------- CBC DECRYPT ------------------------------#
.align	16
.Lcbc_decrypt:
	cmp	\$16,$len
	jne	.Lcbc_decrypt_bulk

	# handle single block without allocating stack frame,
	# useful in ciphertext stealing mode
	movdqu	($inp),$inout0	# load input
	movdqu	($ivp),$inout1	# load iv
	movdqa	$inout0,$inout2	# future iv
___
	&aesni_generate1("dec",$key,$rnds_);
$code.=<<___;
	pxor	$rndkey0,$rndkey0	# clear register bank
	pxor	$rndkey1,$rndkey1
	movdqu	$inout2,($ivp)	# store iv
	xorps	$inout1,$inout0	# ^=iv
	pxor	$inout1,$inout1
	movups	$inout0,($out)	# store output
	pxor	$inout0,$inout0
	jmp	.Lcbc_ret
.align	16
.Lcbc_decrypt_bulk:
	lea	(%rsp),%r11	# frame pointer
.cfi_def_cfa_register	%r11
	push	%rbp
.cfi_push	%rbp
	sub	\$$frame_size,%rsp
	and	\$-16,%rsp	# Linux kernel stack can be incorrectly seeded
___
$code.=<<___ if ($win64);
	movaps	%xmm6,0x10(%rsp)
	movaps	%xmm7,0x20(%rsp)
	movaps	%xmm8,0x30(%rsp)
	movaps	%xmm9,0x40(%rsp)
	movaps	%xmm10,0x50(%rsp)
	movaps	%xmm11,0x60(%rsp)
	movaps	%xmm12,0x70(%rsp)
	movaps	%xmm13,0x80(%rsp)
	movaps	%xmm14,0x90(%rsp)
	movaps	%xmm15,0xa0(%rsp)
.Lcbc_decrypt_body:
___

# From here on %rbp (saved above) serves both as the key backup and as
# the look-ahead input pointer of the 8-way loop.
my $inp_=$key_="%rbp";				# reassign $key_

$code.=<<___;
	mov	$key,$key_	# [re-]backup $key [after reassignment]
	movups	($ivp),$iv
	mov	$rnds_,$rounds
	cmp	\$0x50,$len
	jbe	.Lcbc_dec_tail

	$movkey	($key),$rndkey0
	movdqu	0x00($inp),$inout0	# load input
	movdqu	0x10($inp),$inout1
	movdqa	$inout0,$in0
	movdqu	0x20($inp),$inout2
	movdqa	$inout1,$in1
	movdqu	0x30($inp),$inout3
	movdqa	$inout2,$in2
	movdqu	0x40($inp),$inout4
	movdqa	$inout3,$in3
	movdqu	0x50($inp),$inout5
	movdqa	$inout4,$in4
	leaq	OPENSSL_ia32cap_P(%rip),%r9
	mov	4(%r9),%r9d	# second capability word
	cmp	\$0x70,$len
	jbe	.Lcbc_dec_six_or_seven

	and	\$`1<<26|1<<22`,%r9d	# isolate XSAVE+MOVBE
	sub	\$0x50,$len	# $len is biased by -5*16
	cmp	\$`1<<22`,%r9d	# check for MOVBE without XSAVE
	je	.Lcbc_dec_loop6_enter	# [which denotes Atom Silvermont]
	sub	\$0x20,$len	# $len is biased by -7*16
	lea	0x70($key),$key	# size optimization
	jmp	.Lcbc_dec_loop8_enter
.align	16
.Lcbc_dec_loop8:
	movups	$inout7,($out)	# eighth block of the previous iteration
	lea	0x10($out),$out
.Lcbc_dec_loop8_enter:
	movdqu	0x60($inp),$inout6
	pxor	$rndkey0,$inout0
	movdqu	0x70($inp),$inout7
	pxor	$rndkey0,$inout1
	$movkey	0x10-0x70($key),$rndkey1
	pxor	$rndkey0,$inout2
	mov	\$-1,$inp_
	cmp	\$0x70,$len	# is there at least 0x60 bytes ahead?
	pxor	$rndkey0,$inout3
	pxor	$rndkey0,$inout4
	pxor	$rndkey0,$inout5
	pxor	$rndkey0,$inout6

	aesdec	$rndkey1,$inout0
	pxor	$rndkey0,$inout7
	$movkey	0x20-0x70($key),$rndkey0
	aesdec	$rndkey1,$inout1
	aesdec	$rndkey1,$inout2
	aesdec	$rndkey1,$inout3
	aesdec	$rndkey1,$inout4
	aesdec	$rndkey1,$inout5
	aesdec	$rndkey1,$inout6
	# Branch-free pointer select using the carry of the cmp above:
	# the look-ahead pointer becomes input+128 when the biased
	# length is at least 0x70 (unsigned), otherwise it stays equal
	# to the current input pointer.
	adc	\$0,$inp_
	and	\$128,$inp_
	aesdec	$rndkey1,$inout7
	add	$inp,$inp_
	$movkey	0x30-0x70($key),$rndkey1
___
# Emit the remaining unrolled 8-way rounds, alternating between the two
# round-key registers.  The cmp emitted at $i==7 compares the round
# count with 11; none of the instructions emitted after it modify the
# flags, so both the jb at $i==7 (count below 11) and the je at $i==9
# (count equal to 11) consume that one comparison, and larger counts
# fall through to the unconditional jmp at $i==11.
for($i=1;$i<12;$i++) {
my $rndkeyx = ($i&1)?$rndkey0:$rndkey1;
$code.=<<___ if ($i==7);
	cmp	\$11,$rounds
___
$code.=<<___;
	aesdec	$rndkeyx,$inout0
	aesdec	$rndkeyx,$inout1
	aesdec	$rndkeyx,$inout2
	aesdec	$rndkeyx,$inout3
	aesdec	$rndkeyx,$inout4
	aesdec	$rndkeyx,$inout5
	aesdec	$rndkeyx,$inout6
	aesdec	$rndkeyx,$inout7
	$movkey	`0x30+0x10*$i`-0x70($key),$rndkeyx
___
$code.=<<___ if ($i<6 || (!($i&1) && $i>7));
	nop
___
$code.=<<___ if ($i==7);
	jb	.Lcbc_dec_done
___
$code.=<<___ if ($i==9);
	je	.Lcbc_dec_done
___
$code.=<<___ if ($i==11);
	jmp	.Lcbc_dec_done
___
}
$code.=<<___;
.align	16
.Lcbc_dec_done:
	# Last two rounds.  The final round key is XORed into the iv
	# and the saved ciphertext blocks, which are then used as the
	# key operand of the final round, folding the CBC xor into it.
	aesdec	$rndkey1,$inout0
	aesdec	$rndkey1,$inout1
	pxor	$rndkey0,$iv
	pxor	$rndkey0,$in0
	aesdec	$rndkey1,$inout2
	aesdec	$rndkey1,$inout3
	pxor	$rndkey0,$in1
	pxor	$rndkey0,$in2
	aesdec	$rndkey1,$inout4
	aesdec	$rndkey1,$inout5
	pxor	$rndkey0,$in3
	pxor	$rndkey0,$in4
	aesdec	$rndkey1,$inout6
	aesdec	$rndkey1,$inout7
	movdqu	0x50($inp),$rndkey1

	aesdeclast	$iv,$inout0
	movdqu	0x60($inp),$iv	# borrow $iv
	pxor	$rndkey0,$rndkey1
	aesdeclast	$in0,$inout1
	pxor	$rndkey0,$iv
	movdqu	0x70($inp),$rndkey0	# next IV
	aesdeclast	$in1,$inout2
	lea	0x80($inp),$inp
	movdqu	0x00($inp_),$in0	# preload blocks through the look-ahead pointer
	aesdeclast	$in2,$inout3
	aesdeclast	$in3,$inout4
	movdqu	0x10($inp_),$in1
	movdqu	0x20($inp_),$in2
	aesdeclast	$in4,$inout5
	aesdeclast	$rndkey1,$inout6
	movdqu	0x30($inp_),$in3
	movdqu	0x40($inp_),$in4
	aesdeclast	$iv,$inout7
	movdqa	$rndkey0,$iv	# return $iv
	movdqu	0x50($inp_),$rndkey1
	$movkey	-0x70($key),$rndkey0

	movups	$inout0,($out)	# store output
	movdqa	$in0,$inout0
	movups	$inout1,0x10($out)
	movdqa	$in1,$inout1
	movups	$inout2,0x20($out)
	movdqa	$in2,$inout2
	movups	$inout3,0x30($out)
	movdqa	$in3,$inout3
	movups	$inout4,0x40($out)
	movdqa	$in4,$inout4
	movups	$inout5,0x50($out)
	movdqa	$rndkey1,$inout5
	movups	$inout6,0x60($out)
	lea	0x70($out),$out

	sub	\$0x80,$len
	ja	.Lcbc_dec_loop8

	movaps	$inout7,$inout0
	lea	-0x70($key),$key	# undo the size optimization bias
	add	\$0x70,$len
	jle	.Lcbc_dec_clear_tail_collected
	movups	$inout7,($out)
	lea	0x10($out),$out
	cmp	\$0x50,$len
	jbe	.Lcbc_dec_tail

	movaps	$in0,$inout0
.Lcbc_dec_six_or_seven:
	cmp	\$0x60,$len
	ja	.Lcbc_dec_seven

	movaps	$inout5,$inout6	# keep sixth ciphertext block, it is the next iv
	call	_aesni_decrypt6
	pxor	$iv,$inout0	# ^= IV
	movaps	$inout6,$iv
	pxor	$in0,$inout1
	movdqu	$inout0,($out)
	pxor	$in1,$inout2
	movdqu	$inout1,0x10($out)
	pxor	$inout1,$inout1	# clear register bank
	pxor	$in2,$inout3
	movdqu	$inout2,0x20($out)
	pxor	$inout2,$inout2
	pxor	$in3,$inout4
	movdqu	$inout3,0x30($out)
	pxor	$inout3,$inout3
	pxor	$in4,$inout5
	movdqu	$inout4,0x40($out)
	pxor	$inout4,$inout4
	lea	0x50($out),$out
	movdqa	$inout5,$inout0	# last block is stored by the common tail
	pxor	$inout5,$inout5
	jmp	.Lcbc_dec_tail_collected

.align	16
.Lcbc_dec_seven:
	movups	0x60($inp),$inout6
	xorps	$inout7,$inout7
	call	_aesni_decrypt8
	movups	0x50($inp),$inout7
	pxor	$iv,$inout0	# ^= IV
	movups	0x60($inp),$iv
	pxor	$in0,$inout1
	movdqu	$inout0,($out)
	pxor	$in1,$inout2
	movdqu	$inout1,0x10($out)
	pxor	$inout1,$inout1	# clear register bank
	pxor	$in2,$inout3
	movdqu	$inout2,0x20($out)
	pxor	$inout2,$inout2
	pxor	$in3,$inout4
	movdqu	$inout3,0x30($out)
	pxor	$inout3,$inout3
	pxor	$in4,$inout5
	movdqu	$inout4,0x40($out)
	pxor	$inout4,$inout4
	pxor	$inout7,$inout6
	movdqu	$inout5,0x50($out)
	pxor	$inout5,$inout5
	lea	0x60($out),$out
	movdqa	$inout6,$inout0
	pxor	$inout6,$inout6
	pxor	$inout7,$inout7
	jmp	.Lcbc_dec_tail_collected

.align	16
.Lcbc_dec_loop6:
	movups	$inout5,($out)	# sixth block of the previous iteration
	lea	0x10($out),$out
	movdqu	0x00($inp),$inout0	# load input
	movdqu	0x10($inp),$inout1
	movdqa	$inout0,$in0
	movdqu	0x20($inp),$inout2
	movdqa	$inout1,$in1
	movdqu	0x30($inp),$inout3
	movdqa	$inout2,$in2
	movdqu	0x40($inp),$inout4
	movdqa	$inout3,$in3
	movdqu	0x50($inp),$inout5
	movdqa	$inout4,$in4
.Lcbc_dec_loop6_enter:
	lea	0x60($inp),$inp
	movdqa	$inout5,$inout6	# keep sixth ciphertext block, it is the next iv

	call	_aesni_decrypt6

	pxor	$iv,$inout0	# ^= IV
	movdqa	$inout6,$iv
	pxor	$in0,$inout1
	movdqu	$inout0,($out)
	pxor	$in1,$inout2
	movdqu	$inout1,0x10($out)
	pxor	$in2,$inout3
	movdqu	$inout2,0x20($out)
	pxor	$in3,$inout4
	mov	$key_,$key
	movdqu	$inout3,0x30($out)
	pxor	$in4,$inout5
	mov	$rnds_,$rounds
	movdqu	$inout4,0x40($out)
	lea	0x50($out),$out
	sub	\$0x60,$len
	ja	.Lcbc_dec_loop6

	movdqa	$inout5,$inout0
	add	\$0x50,$len
	jle	.Lcbc_dec_clear_tail_collected
	movups	$inout5,($out)
	lea	0x10($out),$out

.Lcbc_dec_tail:
	# at most five blocks left: load them one at a time, keeping a
	# copy of each ciphertext block for the chaining xor
	movups	($inp),$inout0
	sub	\$0x10,$len
	jbe	.Lcbc_dec_one	# $len is 1*16 or less

	movups	0x10($inp),$inout1
	movaps	$inout0,$in0
	sub	\$0x10,$len
	jbe	.Lcbc_dec_two	# $len is 2*16 or less

	movups	0x20($inp),$inout2
	movaps	$inout1,$in1
	sub	\$0x10,$len
	jbe	.Lcbc_dec_three	# $len is 3*16 or less

	movups	0x30($inp),$inout3
	movaps	$inout2,$in2
	sub	\$0x10,$len
	jbe	.Lcbc_dec_four	# $len is 4*16 or less

	movups	0x40($inp),$inout4	# $len is 5*16 or less
	movaps	$inout3,$in3
	movaps	$inout4,$in4
	xorps	$inout5,$inout5
	call	_aesni_decrypt6
	pxor	$iv,$inout0
	movaps	$in4,$iv
	pxor	$in0,$inout1
	movdqu	$inout0,($out)
	pxor	$in1,$inout2
	movdqu	$inout1,0x10($out)
	pxor	$inout1,$inout1	# clear register bank
	pxor	$in2,$inout3
	movdqu	$inout2,0x20($out)
	pxor	$inout2,$inout2
	pxor	$in3,$inout4
	movdqu	$inout3,0x30($out)
	pxor	$inout3,$inout3
	lea	0x40($out),$out
	movdqa	$inout4,$inout0
	pxor	$inout4,$inout4
	pxor	$inout5,$inout5
	sub	\$0x10,$len
	jmp	.Lcbc_dec_tail_collected

.align	16
.Lcbc_dec_one:
	movaps	$inout0,$in0
___
	&aesni_generate1("dec",$key,$rounds);
$code.=<<___;
	xorps	$iv,$inout0
	movaps	$in0,$iv
	jmp	.Lcbc_dec_tail_collected
.align	16
.Lcbc_dec_two:
	movaps	$inout1,$in1
	call	_aesni_decrypt2
	pxor	$iv,$inout0
	movaps	$in1,$iv
	pxor	$in0,$inout1
	movdqu	$inout0,($out)
	movdqa	$inout1,$inout0
	pxor	$inout1,$inout1	# clear register bank
	lea	0x10($out),$out
	jmp	.Lcbc_dec_tail_collected
.align	16
.Lcbc_dec_three:
	movaps	$inout2,$in2
	call	_aesni_decrypt3
	pxor	$iv,$inout0
	movaps	$in2,$iv
	pxor	$in0,$inout1
	movdqu	$inout0,($out)
	pxor	$in1,$inout2
	movdqu	$inout1,0x10($out)
	pxor	$inout1,$inout1	# clear register bank
	movdqa	$inout2,$inout0
	pxor	$inout2,$inout2
	lea	0x20($out),$out
	jmp	.Lcbc_dec_tail_collected
.align	16
.Lcbc_dec_four:
	movaps	$inout3,$in3
	call	_aesni_decrypt4
	pxor	$iv,$inout0
	movaps	$in3,$iv
	pxor	$in0,$inout1
	movdqu	$inout0,($out)
	pxor	$in1,$inout2
	movdqu	$inout1,0x10($out)
	pxor	$inout1,$inout1	# clear register bank
	pxor	$in2,$inout3
	movdqu	$inout2,0x20($out)
	pxor	$inout2,$inout2
	movdqa	$inout3,$inout0
	pxor	$inout3,$inout3
	lea	0x30($out),$out
	jmp	.Lcbc_dec_tail_collected

.align	16
.Lcbc_dec_clear_tail_collected:
	pxor	$inout1,$inout1	# clear register bank
	pxor	$inout2,$inout2
	pxor	$inout3,$inout3
___
$code.=<<___ if (!$win64);
	pxor	$inout4,$inout4	# %xmm6..9
	pxor	$inout5,$inout5
	pxor	$inout6,$inout6
	pxor	$inout7,$inout7
___
$code.=<<___;
.Lcbc_dec_tail_collected:
	# common exit: the last decrypted block is still in the first
	# data register and has not been stored yet
	movups	$iv,($ivp)
	and	\$15,$len
	jnz	.Lcbc_dec_tail_partial
	movups	$inout0,($out)
	pxor	$inout0,$inout0
	jmp	.Lcbc_dec_ret
.align	16
.Lcbc_dec_tail_partial:
	# spill the block to the stack scratch slot and copy
	# 16 - (length mod 16) bytes of it to the output, then wipe
	# the scratch slot
	movaps	$inout0,(%rsp)
	pxor	$inout0,$inout0
	mov	\$16,%rcx
	mov	$out,%rdi
	sub	$len,%rcx
	lea	(%rsp),%rsi
	.long	0x9066A4F3	# rep movsb
	movdqa	$inout0,(%rsp)

.Lcbc_dec_ret:
	xorps	$rndkey0,$rndkey0	# %xmm0
	pxor	$rndkey1,$rndkey1
___
$code.=<<___ if ($win64);
	movaps	0x10(%rsp),%xmm6
	movaps	%xmm0,0x10(%rsp)	# clear stack
	movaps	0x20(%rsp),%xmm7
	movaps	%xmm0,0x20(%rsp)
	movaps	0x30(%rsp),%xmm8
	movaps	%xmm0,0x30(%rsp)
	movaps	0x40(%rsp),%xmm9
	movaps	%xmm0,0x40(%rsp)
	movaps	0x50(%rsp),%xmm10
	movaps	%xmm0,0x50(%rsp)
	movaps	0x60(%rsp),%xmm11
	movaps	%xmm0,0x60(%rsp)
	movaps	0x70(%rsp),%xmm12
	movaps	%xmm0,0x70(%rsp)
	movaps	0x80(%rsp),%xmm13
	movaps	%xmm0,0x80(%rsp)
	movaps	0x90(%rsp),%xmm14
	movaps	%xmm0,0x90(%rsp)
	movaps	0xa0(%rsp),%xmm15
	movaps	%xmm0,0xa0(%rsp)
___
$code.=<<___;
	mov	-8(%r11),%rbp
.cfi_restore	%rbp
	lea	(%r11),%rsp
.cfi_def_cfa_register	%rsp
.Lcbc_ret:
	ret
.cfi_endproc
.size	${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
___
}
# int ${PREFIX}_set_decrypt_key(const unsigned char *inp,
#				int bits, AES_KEY *key)
#
# input:	$inp	user-supplied key
#		$bits	$inp length in bits
#		$key	pointer to key schedule
# output:	%eax	0 denoting success, -1 or -2 - failure (see C)
#		*$key	key schedule
#
# The encryption schedule is built first by calling
# __aesni_set_encrypt_key; on success the round keys are then reversed
# in place, with aesimc applied to every key except the first and the
# last, which are only swapped.
#
{ my ($inp,$bits,$key) = @_4args;
  $bits =~ s/%r/%e/;

$code.=<<___;
.globl	${PREFIX}_set_decrypt_key
.type	${PREFIX}_set_decrypt_key,\@abi-omnipotent
.align	16
${PREFIX}_set_decrypt_key:
.cfi_startproc
	.byte	0x48,0x83,0xEC,0x08	# sub rsp,8
.cfi_adjust_cfa_offset	8
	call	__aesni_set_encrypt_key
	shl	\$4,$bits	# rounds-1 after _aesni_set_encrypt_key
	test	%eax,%eax	# non-zero status is passed straight back
	jnz	.Ldec_key_ret
	lea	16($key,$bits),$inp	# points at the end of key schedule

	$movkey	($key),%xmm0	# just swap
	$movkey	($inp),%xmm1
	$movkey	%xmm0,($inp)
	$movkey	%xmm1,($key)
	lea	16($key),$key
	lea	-16($inp),$inp

.Ldec_key_inverse:
	# walk the two pointers towards each other until they meet
	$movkey	($key),%xmm0	# swap and inverse
	$movkey	($inp),%xmm1
	aesimc	%xmm0,%xmm0
	aesimc	%xmm1,%xmm1
	lea	16($key),$key
	lea	-16($inp),$inp
	$movkey	%xmm0,16($inp)
	$movkey	%xmm1,-16($key)
	cmp	$key,$inp
	ja	.Ldec_key_inverse

	$movkey	($key),%xmm0	# inverse middle
	aesimc	%xmm0,%xmm0
	pxor	%xmm1,%xmm1
	$movkey	%xmm0,($inp)
	pxor	%xmm0,%xmm0
.Ldec_key_ret:
	add	\$8,%rsp
.cfi_adjust_cfa_offset	-8
	ret
.cfi_endproc
.LSEH_end_set_decrypt_key:
.size	${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key
___

# This is based on submission from Intel by
#	Huang Ying
#	Vinodh Gopal
#	Kahraman Akdemir
#
# Aggressively optimized in respect to aeskeygenassist's critical path
# and is contained in %xmm0-5 to meet Win64 ABI requirement.
#
# int ${PREFIX}_set_encrypt_key(const unsigned char *inp,
#				int bits, AES_KEY * const key);
#
# input:	$inp	user-supplied key
#		$bits	$inp length in bits
#		$key	pointer to key schedule
# output:	%eax	0 denoting success, -1 or -2 - failure (see C)
#		$bits	rounds-1 (used in aesni_set_decrypt_key)
#		*$key	key schedule
#		$key	pointer to key schedule (used in
#			aesni_set_decrypt_key)
#
# Subroutine is frame-less, which means that only volatile registers
# are used. Note that it's declared "abi-omnipotent", which means that
# amount of volatile registers is smaller on Windows.
#
# -1 is returned when either pointer argument is NULL, -2 when $bits is
# not one of 128, 192 or 256.  For each key size there are two code
# paths: the default one built on aeskeygenassist, and an "_alt" one
# built on pshufb and aesenclast that is selected when the capability
# word shows AVX without XOP.
#
$code.=<<___;
.globl	${PREFIX}_set_encrypt_key
.type	${PREFIX}_set_encrypt_key,\@abi-omnipotent
.align	16
${PREFIX}_set_encrypt_key:
__aesni_set_encrypt_key:
.cfi_startproc
	.byte	0x48,0x83,0xEC,0x08	# sub rsp,8
.cfi_adjust_cfa_offset	8
	mov	\$-1,%rax	# status for a NULL argument
	test	$inp,$inp
	jz	.Lenc_key_ret
	test	$key,$key
	jz	.Lenc_key_ret

	movups	($inp),%xmm0	# pull first 128 bits of *userKey
	xorps	%xmm4,%xmm4	# low dword of xmm4 is assumed 0
	leaq	OPENSSL_ia32cap_P(%rip),%r10
	movl	4(%r10),%r10d
	and	\$`1<<28|1<<11`,%r10d	# AVX and XOP bits
	lea	16($key),%rax	# %rax is used as modifiable copy of $key
	cmp	\$256,$bits
	je	.L14rounds
	cmp	\$192,$bits
	je	.L12rounds
	cmp	\$128,$bits
	jne	.Lbad_keybits

.L10rounds:
	mov	\$9,$bits	# 10 rounds for 128-bit key
	cmp	\$`1<<28`,%r10d	# AVX, but no XOP
	je	.L10rounds_alt

	$movkey	%xmm0,($key)	# round 0
	aeskeygenassist	\$0x1,%xmm0,%xmm1	# round 1
	call	.Lkey_expansion_128_cold
	aeskeygenassist	\$0x2,%xmm0,%xmm1	# round 2
	call	.Lkey_expansion_128
	aeskeygenassist	\$0x4,%xmm0,%xmm1	# round 3
	call	.Lkey_expansion_128
	aeskeygenassist	\$0x8,%xmm0,%xmm1	# round 4
	call	.Lkey_expansion_128
	aeskeygenassist	\$0x10,%xmm0,%xmm1	# round 5
	call	.Lkey_expansion_128
	aeskeygenassist	\$0x20,%xmm0,%xmm1	# round 6
	call	.Lkey_expansion_128
	aeskeygenassist	\$0x40,%xmm0,%xmm1	# round 7
	call	.Lkey_expansion_128
	aeskeygenassist	\$0x80,%xmm0,%xmm1	# round 8
	call	.Lkey_expansion_128
	aeskeygenassist	\$0x1b,%xmm0,%xmm1	# round 9
	call	.Lkey_expansion_128
	aeskeygenassist	\$0x36,%xmm0,%xmm1	# round 10
	call	.Lkey_expansion_128
	$movkey	%xmm0,(%rax)
	mov	$bits,80(%rax)	# 240($key)
	xor	%eax,%eax
	jmp	.Lenc_key_ret

.align	16
.L10rounds_alt:
	movdqa	.Lkey_rotate(%rip),%xmm5
	mov	\$8,%r10d	# loop count
	movdqa	.Lkey_rcon1(%rip),%xmm4
	movdqa	%xmm0,%xmm2
	movdqu	%xmm0,($key)
	jmp	.Loop_key128

.align	16
.Loop_key128:
	pshufb	%xmm5,%xmm0
	aesenclast	%xmm4,%xmm0
	pslld	\$1,%xmm4
	lea	16(%rax),%rax

	movdqa	%xmm2,%xmm3
	pslldq	\$4,%xmm2
	pxor	%xmm2,%xmm3
	pslldq	\$4,%xmm2
	pxor	%xmm2,%xmm3
	pslldq	\$4,%xmm2
	pxor	%xmm3,%xmm2

	pxor	%xmm2,%xmm0
	movdqu	%xmm0,-16(%rax)
	movdqa	%xmm0,%xmm2

	dec	%r10d
	jnz	.Loop_key128

	# the last two round keys are produced outside the loop with
	# a freshly loaded constant
	movdqa	.Lkey_rcon1b(%rip),%xmm4

	pshufb	%xmm5,%xmm0
	aesenclast	%xmm4,%xmm0
	pslld	\$1,%xmm4

	movdqa	%xmm2,%xmm3
	pslldq	\$4,%xmm2
	pxor	%xmm2,%xmm3
	pslldq	\$4,%xmm2
	pxor	%xmm2,%xmm3
	pslldq	\$4,%xmm2
	pxor	%xmm3,%xmm2

	pxor	%xmm2,%xmm0
	movdqu	%xmm0,(%rax)

	movdqa	%xmm0,%xmm2
	pshufb	%xmm5,%xmm0
	aesenclast	%xmm4,%xmm0

	movdqa	%xmm2,%xmm3
	pslldq	\$4,%xmm2
	pxor	%xmm2,%xmm3
	pslldq	\$4,%xmm2
	pxor	%xmm2,%xmm3
	pslldq	\$4,%xmm2
	pxor	%xmm3,%xmm2

	pxor	%xmm2,%xmm0
	movdqu	%xmm0,16(%rax)

	mov	$bits,96(%rax)	# 240($key)
	xor	%eax,%eax
	jmp	.Lenc_key_ret

.align	16
.L12rounds:
	movq	16($inp),%xmm2	# remaining 1/3 of *userKey
	mov	\$11,$bits	# 12 rounds for 192
	cmp	\$`1<<28`,%r10d	# AVX, but no XOP
	je	.L12rounds_alt

	$movkey	%xmm0,($key)	# round 0
	aeskeygenassist	\$0x1,%xmm2,%xmm1	# round 1,2
	call	.Lkey_expansion_192a_cold
	aeskeygenassist	\$0x2,%xmm2,%xmm1	# round 2,3
	call	.Lkey_expansion_192b
	aeskeygenassist	\$0x4,%xmm2,%xmm1	# round 4,5
	call	.Lkey_expansion_192a
	aeskeygenassist	\$0x8,%xmm2,%xmm1	# round 5,6
	call	.Lkey_expansion_192b
	aeskeygenassist	\$0x10,%xmm2,%xmm1	# round 7,8
	call	.Lkey_expansion_192a
	aeskeygenassist	\$0x20,%xmm2,%xmm1	# round 8,9
	call	.Lkey_expansion_192b
	aeskeygenassist	\$0x40,%xmm2,%xmm1	# round 10,11
	call	.Lkey_expansion_192a
	aeskeygenassist	\$0x80,%xmm2,%xmm1	# round 11,12
	call	.Lkey_expansion_192b
	$movkey	%xmm0,(%rax)
	mov	$bits,48(%rax)	# 240($key)
	xor	%rax, %rax
	jmp	.Lenc_key_ret

.align	16
.L12rounds_alt:
	movdqa	.Lkey_rotate192(%rip),%xmm5
	movdqa	.Lkey_rcon1(%rip),%xmm4
	mov	\$8,%r10d	# loop count
	movdqu	%xmm0,($key)
	jmp	.Loop_key192

.align	16
.Loop_key192:
	# each iteration writes 24 bytes of schedule: 8 bytes here and
	# 16 bytes at the bottom of the loop
	movq	%xmm2,0(%rax)
	movdqa	%xmm2,%xmm1
	pshufb	%xmm5,%xmm2
	aesenclast	%xmm4,%xmm2
	pslld	\$1, %xmm4
	lea	24(%rax),%rax

	movdqa	%xmm0,%xmm3
	pslldq	\$4,%xmm0
	pxor	%xmm0,%xmm3
	pslldq	\$4,%xmm0
	pxor	%xmm0,%xmm3
	pslldq	\$4,%xmm0
	pxor	%xmm3,%xmm0

	pshufd	\$0xff,%xmm0,%xmm3
	pxor	%xmm1,%xmm3
	pslldq	\$4,%xmm1
	pxor	%xmm1,%xmm3

	pxor	%xmm2,%xmm0
	pxor	%xmm3,%xmm2
	movdqu	%xmm0,-16(%rax)

	dec	%r10d
	jnz	.Loop_key192

	mov	$bits,32(%rax)	# 240($key)
	xor	%eax,%eax
	jmp	.Lenc_key_ret

.align	16
.L14rounds:
	movups	16($inp),%xmm2	# remaining half of *userKey
	mov	\$13,$bits	# 14 rounds for 256
	lea	16(%rax),%rax
	cmp	\$`1<<28`,%r10d	# AVX, but no XOP
	je	.L14rounds_alt

	$movkey	%xmm0,($key)	# round 0
	$movkey	%xmm2,16($key)	# round 1
	aeskeygenassist	\$0x1,%xmm2,%xmm1	# round 2
	call	.Lkey_expansion_256a_cold
	aeskeygenassist	\$0x1,%xmm0,%xmm1	# round 3
	call	.Lkey_expansion_256b
	aeskeygenassist	\$0x2,%xmm2,%xmm1	# round 4
	call	.Lkey_expansion_256a
	aeskeygenassist	\$0x2,%xmm0,%xmm1	# round 5
	call	.Lkey_expansion_256b
	aeskeygenassist	\$0x4,%xmm2,%xmm1	# round 6
	call	.Lkey_expansion_256a
	aeskeygenassist	\$0x4,%xmm0,%xmm1	# round 7
	call	.Lkey_expansion_256b
	aeskeygenassist	\$0x8,%xmm2,%xmm1	# round 8
	call	.Lkey_expansion_256a
	aeskeygenassist	\$0x8,%xmm0,%xmm1	# round 9
	call	.Lkey_expansion_256b
	aeskeygenassist	\$0x10,%xmm2,%xmm1	# round 10
	call	.Lkey_expansion_256a
	aeskeygenassist	\$0x10,%xmm0,%xmm1	# round 11
	call	.Lkey_expansion_256b
	aeskeygenassist	\$0x20,%xmm2,%xmm1	# round 12
	call	.Lkey_expansion_256a
	aeskeygenassist	\$0x20,%xmm0,%xmm1	# round 13
	call	.Lkey_expansion_256b
	aeskeygenassist	\$0x40,%xmm2,%xmm1	# round 14
	call	.Lkey_expansion_256a
	$movkey	%xmm0,(%rax)
	mov	$bits,16(%rax)	# 240($key)
	xor	%rax,%rax
	jmp	.Lenc_key_ret

.align	16
.L14rounds_alt:
	movdqa	.Lkey_rotate(%rip),%xmm5
	movdqa	.Lkey_rcon1(%rip),%xmm4
	mov	\$7,%r10d	# loop count
	movdqu	%xmm0,0($key)
	movdqa	%xmm2,%xmm1
	movdqu	%xmm2,16($key)
	jmp	.Loop_key256

.align	16
.Loop_key256:
	# one full iteration writes two round keys; the loop is left
	# in the middle of its last pass, after the first of the two
	pshufb	%xmm5,%xmm2
	aesenclast	%xmm4,%xmm2

	movdqa	%xmm0,%xmm3
	pslldq	\$4,%xmm0
	pxor	%xmm0,%xmm3
	pslldq	\$4,%xmm0
	pxor	%xmm0,%xmm3
	pslldq	\$4,%xmm0
	pxor	%xmm3,%xmm0
	pslld	\$1,%xmm4

	pxor	%xmm2,%xmm0
	movdqu	%xmm0,(%rax)

	dec	%r10d
	jz	.Ldone_key256

	pshufd	\$0xff,%xmm0,%xmm2
	pxor	%xmm3,%xmm3
	aesenclast	%xmm3,%xmm2

	movdqa	%xmm1,%xmm3
	pslldq	\$4,%xmm1
	pxor	%xmm1,%xmm3
	pslldq	\$4,%xmm1
	pxor	%xmm1,%xmm3
	pslldq	\$4,%xmm1
	pxor	%xmm3,%xmm1

	pxor	%xmm1,%xmm2
	movdqu	%xmm2,16(%rax)
	lea	32(%rax),%rax
	movdqa	%xmm2,%xmm1

	jmp	.Loop_key256

.Ldone_key256:
	mov	$bits,16(%rax)	# 240($key)
	xor	%eax,%eax
	jmp	.Lenc_key_ret

.align	16
.Lbad_keybits:
	mov	\$-2,%rax	# unsupported key length
.Lenc_key_ret:
	# common exit: wipe the six vector registers used above
	pxor	%xmm0,%xmm0
	pxor	%xmm1,%xmm1
	pxor	%xmm2,%xmm2
	pxor	%xmm3,%xmm3
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
	add	\$8,%rsp
.cfi_adjust_cfa_offset	-8
	ret
.cfi_endproc
.LSEH_end_set_encrypt_key:

# Local expansion helpers, reached with call from the code above.  The
# plain entry point first stores the pending round key at (rax) and
# advances rax; the _cold entry point skips that store and is used for
# the first step of a schedule.
.align	16
.Lkey_expansion_128:
	$movkey	%xmm0,(%rax)
	lea	16(%rax),%rax
.Lkey_expansion_128_cold:
	shufps	\$0b00010000,%xmm0,%xmm4
	xorps	%xmm4, %xmm0
	shufps	\$0b10001100,%xmm0,%xmm4
	xorps	%xmm4, %xmm0
	shufps	\$0b11111111,%xmm1,%xmm1	# critical path
	xorps	%xmm1,%xmm0
	ret

.align	16
.Lkey_expansion_192a:
	$movkey	%xmm0,(%rax)
	lea	16(%rax),%rax
.Lkey_expansion_192a_cold:
	movaps	%xmm2, %xmm5
.Lkey_expansion_192b_warm:
	shufps	\$0b00010000,%xmm0,%xmm4
	movdqa	%xmm2,%xmm3
	xorps	%xmm4,%xmm0
	shufps	\$0b10001100,%xmm0,%xmm4
	pslldq	\$4,%xmm3
	xorps	%xmm4,%xmm0
	pshufd	\$0b01010101,%xmm1,%xmm1	# critical path
	pxor	%xmm3,%xmm2
	pxor	%xmm1,%xmm0
	pshufd	\$0b11111111,%xmm0,%xmm3
	pxor	%xmm3,%xmm2
	ret

.align	16
.Lkey_expansion_192b:
	# stores 32 bytes of schedule, then shares the tail of the
	# 192a helper
	movaps	%xmm0,%xmm3
	shufps	\$0b01000100,%xmm0,%xmm5
	$movkey	%xmm5,(%rax)
	shufps	\$0b01001110,%xmm2,%xmm3
	$movkey	%xmm3,16(%rax)
	lea	32(%rax),%rax
	jmp	.Lkey_expansion_192b_warm

.align	16
.Lkey_expansion_256a:
	$movkey	%xmm2,(%rax)
	lea	16(%rax),%rax
.Lkey_expansion_256a_cold:
	shufps	\$0b00010000,%xmm0,%xmm4
	xorps	%xmm4,%xmm0
	shufps	\$0b10001100,%xmm0,%xmm4
	xorps	%xmm4,%xmm0
	shufps	\$0b11111111,%xmm1,%xmm1	# critical path
	xorps	%xmm1,%xmm0
	ret

.align
16 4687.Lkey_expansion_256b: 4688 $movkey %xmm0,(%rax) 4689 lea 16(%rax),%rax 4690 4691 shufps \$0b00010000,%xmm2,%xmm4 4692 xorps %xmm4,%xmm2 4693 shufps \$0b10001100,%xmm2,%xmm4 4694 xorps %xmm4,%xmm2 4695 shufps \$0b10101010,%xmm1,%xmm1 # critical path 4696 xorps %xmm1,%xmm2 4697 ret 4698.size ${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key 4699.size __aesni_set_encrypt_key,.-__aesni_set_encrypt_key 4700___ 4701} 4702 4703$code.=<<___; 4704.align 64 4705.Lbswap_mask: 4706 .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 4707.Lincrement32: 4708 .long 6,6,6,0 4709.Lincrement64: 4710 .long 1,0,0,0 4711.Lxts_magic: 4712 .long 0x87,0,1,0 4713.Lincrement1: 4714 .byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 4715.Lkey_rotate: 4716 .long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d 4717.Lkey_rotate192: 4718 .long 0x04070605,0x04070605,0x04070605,0x04070605 4719.Lkey_rcon1: 4720 .long 1,1,1,1 4721.Lkey_rcon1b: 4722 .long 0x1b,0x1b,0x1b,0x1b 4723 4724.asciz "AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>" 4725.align 64 4726___ 4727 4728# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, 4729# CONTEXT *context,DISPATCHER_CONTEXT *disp) 4730if ($win64) { 4731$rec="%rcx"; 4732$frame="%rdx"; 4733$context="%r8"; 4734$disp="%r9"; 4735 4736$code.=<<___; 4737.extern __imp_RtlVirtualUnwind 4738___ 4739$code.=<<___ if ($PREFIX eq "aes_hw"); 4740.type ecb_ccm64_se_handler,\@abi-omnipotent 4741.align 16 4742ecb_ccm64_se_handler: 4743 push %rsi 4744 push %rdi 4745 push %rbx 4746 push %rbp 4747 push %r12 4748 push %r13 4749 push %r14 4750 push %r15 4751 pushfq 4752 sub \$64,%rsp 4753 4754 mov 120($context),%rax # pull context->Rax 4755 mov 248($context),%rbx # pull context->Rip 4756 4757 mov 8($disp),%rsi # disp->ImageBase 4758 mov 56($disp),%r11 # disp->HandlerData 4759 4760 mov 0(%r11),%r10d # HandlerData[0] 4761 lea (%rsi,%r10),%r10 # prologue label 4762 cmp %r10,%rbx # context->Rip<prologue label 4763 jb .Lcommon_seh_tail 4764 4765 mov 152($context),%rax # pull 
context->Rsp 4766 4767 mov 4(%r11),%r10d # HandlerData[1] 4768 lea (%rsi,%r10),%r10 # epilogue label 4769 cmp %r10,%rbx # context->Rip>=epilogue label 4770 jae .Lcommon_seh_tail 4771 4772 lea 0(%rax),%rsi # %xmm save area 4773 lea 512($context),%rdi # &context.Xmm6 4774 mov \$8,%ecx # 4*sizeof(%xmm0)/sizeof(%rax) 4775 .long 0xa548f3fc # cld; rep movsq 4776 lea 0x58(%rax),%rax # adjust stack pointer 4777 4778 jmp .Lcommon_seh_tail 4779.size ${PREFIX}_ccm64_se_handler,.-${PREFIX}_ccm64_se_handler 4780 4781.type ctr_xts_se_handler,\@abi-omnipotent 4782.align 16 4783ctr_xts_se_handler: 4784 push %rsi 4785 push %rdi 4786 push %rbx 4787 push %rbp 4788 push %r12 4789 push %r13 4790 push %r14 4791 push %r15 4792 pushfq 4793 sub \$64,%rsp 4794 4795 mov 120($context),%rax # pull context->Rax 4796 mov 248($context),%rbx # pull context->Rip 4797 4798 mov 8($disp),%rsi # disp->ImageBase 4799 mov 56($disp),%r11 # disp->HandlerData 4800 4801 mov 0(%r11),%r10d # HandlerData[0] 4802 lea (%rsi,%r10),%r10 # prologue lable 4803 cmp %r10,%rbx # context->Rip<prologue label 4804 jb .Lcommon_seh_tail 4805 4806 mov 152($context),%rax # pull context->Rsp 4807 4808 mov 4(%r11),%r10d # HandlerData[1] 4809 lea (%rsi,%r10),%r10 # epilogue label 4810 cmp %r10,%rbx # context->Rip>=epilogue label 4811 jae .Lcommon_seh_tail 4812 4813 mov 208($context),%rax # pull context->R11 4814 4815 lea -0xa8(%rax),%rsi # %xmm save area 4816 lea 512($context),%rdi # & context.Xmm6 4817 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) 4818 .long 0xa548f3fc # cld; rep movsq 4819 4820 mov -8(%rax),%rbp # restore saved %rbp 4821 mov %rbp,160($context) # restore context->Rbp 4822 jmp .Lcommon_seh_tail 4823.size ctr_xts_se_handler,.-ctr_xts_se_handler 4824 4825.type ocb_se_handler,\@abi-omnipotent 4826.align 16 4827ocb_se_handler: 4828 push %rsi 4829 push %rdi 4830 push %rbx 4831 push %rbp 4832 push %r12 4833 push %r13 4834 push %r14 4835 push %r15 4836 pushfq 4837 sub \$64,%rsp 4838 4839 mov 120($context),%rax # pull 
context->Rax 4840 mov 248($context),%rbx # pull context->Rip 4841 4842 mov 8($disp),%rsi # disp->ImageBase 4843 mov 56($disp),%r11 # disp->HandlerData 4844 4845 mov 0(%r11),%r10d # HandlerData[0] 4846 lea (%rsi,%r10),%r10 # prologue lable 4847 cmp %r10,%rbx # context->Rip<prologue label 4848 jb .Lcommon_seh_tail 4849 4850 mov 4(%r11),%r10d # HandlerData[1] 4851 lea (%rsi,%r10),%r10 # epilogue label 4852 cmp %r10,%rbx # context->Rip>=epilogue label 4853 jae .Lcommon_seh_tail 4854 4855 mov 8(%r11),%r10d # HandlerData[2] 4856 lea (%rsi,%r10),%r10 4857 cmp %r10,%rbx # context->Rip>=pop label 4858 jae .Locb_no_xmm 4859 4860 mov 152($context),%rax # pull context->Rsp 4861 4862 lea (%rax),%rsi # %xmm save area 4863 lea 512($context),%rdi # & context.Xmm6 4864 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) 4865 .long 0xa548f3fc # cld; rep movsq 4866 lea 0xa0+0x28(%rax),%rax 4867 4868.Locb_no_xmm: 4869 mov -8(%rax),%rbx 4870 mov -16(%rax),%rbp 4871 mov -24(%rax),%r12 4872 mov -32(%rax),%r13 4873 mov -40(%rax),%r14 4874 4875 mov %rbx,144($context) # restore context->Rbx 4876 mov %rbp,160($context) # restore context->Rbp 4877 mov %r12,216($context) # restore context->R12 4878 mov %r13,224($context) # restore context->R13 4879 mov %r14,232($context) # restore context->R14 4880 4881 jmp .Lcommon_seh_tail 4882.size ocb_se_handler,.-ocb_se_handler 4883___ 4884$code.=<<___; 4885.type cbc_se_handler,\@abi-omnipotent 4886.align 16 4887cbc_se_handler: 4888 push %rsi 4889 push %rdi 4890 push %rbx 4891 push %rbp 4892 push %r12 4893 push %r13 4894 push %r14 4895 push %r15 4896 pushfq 4897 sub \$64,%rsp 4898 4899 mov 152($context),%rax # pull context->Rsp 4900 mov 248($context),%rbx # pull context->Rip 4901 4902 lea .Lcbc_decrypt_bulk(%rip),%r10 4903 cmp %r10,%rbx # context->Rip<"prologue" label 4904 jb .Lcommon_seh_tail 4905 4906 mov 120($context),%rax # pull context->Rax 4907 4908 lea .Lcbc_decrypt_body(%rip),%r10 4909 cmp %r10,%rbx # context->Rip<cbc_decrypt_body 4910 jb 
.Lcommon_seh_tail 4911 4912 mov 152($context),%rax # pull context->Rsp 4913 4914 lea .Lcbc_ret(%rip),%r10 4915 cmp %r10,%rbx # context->Rip>="epilogue" label 4916 jae .Lcommon_seh_tail 4917 4918 lea 16(%rax),%rsi # %xmm save area 4919 lea 512($context),%rdi # &context.Xmm6 4920 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) 4921 .long 0xa548f3fc # cld; rep movsq 4922 4923 mov 208($context),%rax # pull context->R11 4924 4925 mov -8(%rax),%rbp # restore saved %rbp 4926 mov %rbp,160($context) # restore context->Rbp 4927 4928.Lcommon_seh_tail: 4929 mov 8(%rax),%rdi 4930 mov 16(%rax),%rsi 4931 mov %rax,152($context) # restore context->Rsp 4932 mov %rsi,168($context) # restore context->Rsi 4933 mov %rdi,176($context) # restore context->Rdi 4934 4935 mov 40($disp),%rdi # disp->ContextRecord 4936 mov $context,%rsi # context 4937 mov \$154,%ecx # sizeof(CONTEXT) 4938 .long 0xa548f3fc # cld; rep movsq 4939 4940 mov $disp,%rsi 4941 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER 4942 mov 8(%rsi),%rdx # arg2, disp->ImageBase 4943 mov 0(%rsi),%r8 # arg3, disp->ControlPc 4944 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry 4945 mov 40(%rsi),%r10 # disp->ContextRecord 4946 lea 56(%rsi),%r11 # &disp->HandlerData 4947 lea 24(%rsi),%r12 # &disp->EstablisherFrame 4948 mov %r10,32(%rsp) # arg5 4949 mov %r11,40(%rsp) # arg6 4950 mov %r12,48(%rsp) # arg7 4951 mov %rcx,56(%rsp) # arg8, (NULL) 4952 call *__imp_RtlVirtualUnwind(%rip) 4953 4954 mov \$1,%eax # ExceptionContinueSearch 4955 add \$64,%rsp 4956 popfq 4957 pop %r15 4958 pop %r14 4959 pop %r13 4960 pop %r12 4961 pop %rbp 4962 pop %rbx 4963 pop %rdi 4964 pop %rsi 4965 ret 4966.size cbc_se_handler,.-cbc_se_handler 4967 4968.section .pdata 4969.align 4 4970___ 4971$code.=<<___ if ($PREFIX eq "aes_hw"); 4972 .rva .LSEH_begin_${PREFIX}_ecb_encrypt 4973 .rva .LSEH_end_${PREFIX}_ecb_encrypt 4974 .rva .LSEH_info_ecb 4975 4976 .rva .LSEH_begin_${PREFIX}_ccm64_encrypt_blocks 4977 .rva .LSEH_end_${PREFIX}_ccm64_encrypt_blocks 4978 .rva 
.LSEH_info_ccm64_enc 4979 4980 .rva .LSEH_begin_${PREFIX}_ccm64_decrypt_blocks 4981 .rva .LSEH_end_${PREFIX}_ccm64_decrypt_blocks 4982 .rva .LSEH_info_ccm64_dec 4983 4984 .rva .LSEH_begin_${PREFIX}_ctr32_encrypt_blocks 4985 .rva .LSEH_end_${PREFIX}_ctr32_encrypt_blocks 4986 .rva .LSEH_info_ctr32 4987 4988 .rva .LSEH_begin_${PREFIX}_xts_encrypt 4989 .rva .LSEH_end_${PREFIX}_xts_encrypt 4990 .rva .LSEH_info_xts_enc 4991 4992 .rva .LSEH_begin_${PREFIX}_xts_decrypt 4993 .rva .LSEH_end_${PREFIX}_xts_decrypt 4994 .rva .LSEH_info_xts_dec 4995 4996 .rva .LSEH_begin_${PREFIX}_ocb_encrypt 4997 .rva .LSEH_end_${PREFIX}_ocb_encrypt 4998 .rva .LSEH_info_ocb_enc 4999 5000 .rva .LSEH_begin_${PREFIX}_ocb_decrypt 5001 .rva .LSEH_end_${PREFIX}_ocb_decrypt 5002 .rva .LSEH_info_ocb_dec 5003___ 5004$code.=<<___; 5005 .rva .LSEH_begin_${PREFIX}_cbc_encrypt 5006 .rva .LSEH_end_${PREFIX}_cbc_encrypt 5007 .rva .LSEH_info_cbc 5008 5009 .rva ${PREFIX}_set_decrypt_key 5010 .rva .LSEH_end_set_decrypt_key 5011 .rva .LSEH_info_key 5012 5013 .rva ${PREFIX}_set_encrypt_key 5014 .rva .LSEH_end_set_encrypt_key 5015 .rva .LSEH_info_key 5016.section .xdata 5017.align 8 5018___ 5019$code.=<<___ if ($PREFIX eq "aes_hw"); 5020.LSEH_info_ecb: 5021 .byte 9,0,0,0 5022 .rva ecb_ccm64_se_handler 5023 .rva .Lecb_enc_body,.Lecb_enc_ret # HandlerData[] 5024.LSEH_info_ccm64_enc: 5025 .byte 9,0,0,0 5026 .rva ecb_ccm64_se_handler 5027 .rva .Lccm64_enc_body,.Lccm64_enc_ret # HandlerData[] 5028.LSEH_info_ccm64_dec: 5029 .byte 9,0,0,0 5030 .rva ecb_ccm64_se_handler 5031 .rva .Lccm64_dec_body,.Lccm64_dec_ret # HandlerData[] 5032.LSEH_info_ctr32: 5033 .byte 9,0,0,0 5034 .rva ctr_xts_se_handler 5035 .rva .Lctr32_body,.Lctr32_epilogue # HandlerData[] 5036.LSEH_info_xts_enc: 5037 .byte 9,0,0,0 5038 .rva ctr_xts_se_handler 5039 .rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[] 5040.LSEH_info_xts_dec: 5041 .byte 9,0,0,0 5042 .rva ctr_xts_se_handler 5043 .rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[] 
.LSEH_info_ocb_enc:
	.byte	9,0,0,0
	.rva	ocb_se_handler
	.rva	.Locb_enc_body,.Locb_enc_epilogue	# HandlerData[]
	.rva	.Locb_enc_pop
	.long	0
.LSEH_info_ocb_dec:
	.byte	9,0,0,0
	.rva	ocb_se_handler
	.rva	.Locb_dec_body,.Locb_dec_epilogue	# HandlerData[]
	.rva	.Locb_dec_pop
	.long	0
___
# Unwind info appended for every $PREFIX value (this heredoc carries no
# $PREFIX condition): the CBC entry and the entry shared by both key-setup
# routines.
$code.=<<___;
.LSEH_info_cbc:
	.byte	9,0,0,0
	.rva	cbc_se_handler
.LSEH_info_key:
	.byte	0x01,0x04,0x01,0x00
	.byte	0x04,0x02,0x00,0x00	# sub rsp,8
___
}

# rex(\@opcode, $dst, $src)
#
# Append a REX prefix byte to the opcode list referenced by the first
# argument when either register number is 8 or above:
#   $dst >= 8  sets bit 0x04 (REX.R, extends the ModR/M reg field)
#   $src >= 8  sets bit 0x01 (REX.B, extends the ModR/M r/m field)
# Nothing is appended when both registers are below 8.
sub rex {
    my ($opcode, $dst, $src) = @_;
    my $rex = 0;

    $rex |= 0x04 if ($dst >= 8);
    $rex |= 0x01 if ($src >= 8);
    push @$opcode, $rex | 0x40 if ($rex);
}

# aesni($line)
#
# Translate one AES-NI instruction, given in AT&T syntax, into an equivalent
# ".byte" directive so that the output assembles even with toolchains that do
# not know the mnemonics.  Three operand shapes are recognised:
#
#   aeskeygenassist $imm,%xmmS,%xmmD     66 [REX] 0F 3A DF /r ib
#   aesXXX          %xmmS,%xmmD          66 [REX] 0F 38 xx /r
#   aesXXX          off(%rsp),%xmmD      66 [REX] 0F 38 xx /r SIB disp8
#
# Returns the ".byte" string on success.  Text that matches none of the
# shapes, or whose mnemonic is not in the relevant opcode table, is returned
# unchanged so that it reaches the assembler instead of being dropped.
sub aesni {
    my $line   = shift;
    my @opcode = (0x66);    # every encoding below starts with the 0x66 prefix

    if ($line=~/(aeskeygenassist)\s+\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
        # Copy the captures before any other regex can overwrite them.
        my ($imm, $src, $dst) = ($2, $3, $4);

        rex(\@opcode, $dst, $src);
        push @opcode, 0x0f, 0x3a, 0xdf;
        push @opcode, 0xc0 | ($src & 7) | (($dst & 7) << 3);    # ModR/M
        # Immediates with a leading 0 (0x.., 0b.., 0..) go through oct().
        push @opcode, $imm=~/^0/ ? oct($imm) : $imm;
        return ".byte\t".join(',', @opcode);
    }
    elsif ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
        my ($mnemonic, $src, $dst) = ($1, $2, $3);
        my %opcodelet = (
            "aesimc" => 0xdb,
            "aesenc" => 0xdc, "aesenclast" => 0xdd,
            "aesdec" => 0xde, "aesdeclast" => 0xdf
        );

        # Unknown mnemonic: hand the text back untouched.  Returning undef
        # here would make the calling s///e erase the instruction silently.
        return $line if (!defined($opcodelet{$mnemonic}));

        rex(\@opcode, $dst, $src);
        push @opcode, 0x0f, 0x38, $opcodelet{$mnemonic};
        push @opcode, 0xc0 | ($src & 7) | (($dst & 7) << 3);    # ModR/M
        return ".byte\t".join(',', @opcode);
    }
    elsif ($line=~/(aes[a-z]+)\s+([0x1-9a-fA-F]*)\(%rsp\),\s*%xmm([0-9]+)/) {
        my ($mnemonic, $off, $dst) = ($1, $2, $3);
        my %opcodelet = (
            "aesenc" => 0xdc, "aesenclast" => 0xdd,
            "aesdec" => 0xde, "aesdeclast" => 0xdf
        );

        # Same pass-through policy as the register form above.
        return $line if (!defined($opcodelet{$mnemonic}));

        push @opcode, 0x44 if ($dst >= 8);    # REX.R for %xmm8 and up
        push @opcode, 0x0f, 0x38, $opcodelet{$mnemonic};
        push @opcode, 0x44 | (($dst & 7) << 3), 0x24;    # ModR/M, SIB
        # NOTE(review): only a single displacement byte is emitted and the
        # offset is masked to 8 bits; offsets above 0x7f look like they would
        # need a 32-bit displacement form that is not generated here.
        push @opcode, ($off=~/^0/ ? oct($off) : $off) & 0xff;
        return ".byte\t".join(',', @opcode);
    }
    return $line;
}

# movbe($disp)
#
# Return the ".byte" encoding of "movbe %eax,$disp(%rsp)".  The argument is
# the decimal displacement captured by the substitution below and is appended
# verbatim as the final byte.
sub movbe {
    my $disp = shift;
    return ".byte 0x0f,0x38,0xf1,0x44,0x24,".$disp;
}

# Post-process the accumulated assembly text:
#   1. evaluate `...` spans as Perl expressions (constant folding),
#   2. replace AES-NI instructions that use %xmm operands with .byte data,
#   3. replace "movbe %eax,N(%rsp)" with .byte data.
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;
#$code =~ s/\bmovbe\s+%eax/bswap %eax; mov %eax/gm;	# debugging artefact
$code =~ s/\bmovbe\s+%eax,\s*([0-9]+)\(%rsp\)/movbe($1)/gem;

print $code;

# Buffered output is flushed on close; a failure here means the generated
# assembly is incomplete, so it must not go unnoticed.
close STDOUT or die "error closing STDOUT: $!";