#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# AES for s390x.

# April 2007.
#
# Software performance improvement over gcc-generated code is ~70% and
# in absolute terms is ~73 cycles per byte processed with 128-bit key.
# You're likely to exclaim "why so slow?" Keep in mind that z-CPUs are
# *strictly* in-order execution and issued instruction [in this case
# load value from memory is critical] has to complete before execution
# flow proceeds. S-boxes are compressed to 2KB[+256B].
#
# As for hardware acceleration support. It's basically a "teaser," as
# it can and should be improved in several ways. Most notably support
# for CBC is not utilized, nor multiple blocks are ever processed.
# Then software key schedule can be postponed till hardware support
# detection... Performance improvement over assembler is reportedly
# ~2.5x, but can reach >8x [naturally on larger chunks] if proper
# support is implemented.

# May 2007.
#
# Implement AES_set_[en|de]crypt_key. Key schedule setup is avoided
# for 128-bit keys, if hardware support is detected.

# January 2009.
#
# Add support for hardware AES192/256 and reschedule instructions to
# minimize/avoid Address Generation Interlock hazard and to favour
# dual-issue z10 pipeline. This gave ~25% improvement on z10 and
# almost 50% on z9. The gain is smaller on z10, because being dual-
# issue z10 makes it impossible to eliminate the interlock condition:
# critical path is not long enough. Yet it spends ~24 cycles per byte
# processed with 128-bit key.
#
# Unlike previous version hardware support detection takes place only
# at the moment of key schedule setup, which is denoted in key->rounds.
# This is done, because deferred key setup can't be made MT-safe, not
# for keys longer than 128 bits.
#
# Add AES_cbc_encrypt, which gives incredible performance improvement,
# it was measured to be ~6.6x. It's less than previously mentioned 8x,
# because software implementation was optimized.

# May 2010.
#
# Add AES_ctr32_encrypt. If hardware-assisted, it provides up to 4.3x
# performance improvement over "generic" counter mode routine relying
# on single-block, also hardware-assisted, AES_encrypt. "Up to" refers
# to the fact that exact throughput value depends on current stack
# frame alignment within 4KB page. In worst case you get ~75% of the
# maximum, but *on average* it would be as much as ~98%. Meaning that
# worst case is unlikely, it's like hitting ravine on plateau.

# November 2010.
#
# Adapt for -m31 build. If kernel supports what's called "highgprs"
# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
# instructions and achieve "64-bit" performance even in 31-bit legacy
# application context. The feature is not specific to any particular
# processor, as long as it's "z-CPU". Latter implies that the code
# remains z/Architecture specific. On z990 it was measured to perform
# 2x better than code generated by gcc 4.3.

# December 2010.
#
# Add support for z196 "cipher message with counter" instruction.
# Note however that it's disengaged, because it was measured to
# perform ~12% worse than vanilla km-based code...

# February 2011.
#
# Add AES_xts_[en|de]crypt. This includes support for z196 km-xts-aes
# instructions, which deliver ~70% improvement at 8KB block size over
# vanilla km-based code, 37% - at most like 512-bytes block size.
# Command-line handling: the first argument is the build "flavour".  A
# flavour matching /3[12]/ selects 4-byte SIZE_T and the plain load/store
# mnemonics; anything else selects 8-byte SIZE_T and the "g" forms.
$flavour = shift;

if ($flavour =~ /3[12]/) {
	$SIZE_T=4;
	$g="";
} else {
	$SIZE_T=8;
	$g="g";
}

# Skip arguments until one looks like a file name (word characters, a dot
# and an extension); $output stays false when no such argument is given.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}

# Redirect STDOUT only when an output file was actually named.  The
# three-argument form keeps the name from being parsed as part of the
# mode, and the result is checked: the previous unchecked two-argument
# open ignored a failure to create the file.
if (defined($output) && length($output)) {
	open(STDOUT, '>', $output) or die "can't open $output: $!";
}

$softonly=0;	# allow hardware support

# Symbolic register names used throughout the generated assembly.  Several
# names deliberately alias the same register.
$t0="%r0";	$mask="%r0";
$t1="%r1";
$t2="%r2";	$inp="%r2";
$t3="%r3";	$out="%r3";	$bits="%r3";
$key="%r4";
$i1="%r5";
$i2="%r6";
$i3="%r7";
$s0="%r8";
$s1="%r9";
$s2="%r10";
$s3="%r11";
$tbl="%r12";
$rounds="%r13";
$ra="%r14";
$sp="%r15";

$stdframe=16*$SIZE_T+4*8;

# Append one ".long w,w" line to $code for every argument, i.e. each
# 32-bit table word is emitted twice so that a table entry occupies
# 8 bytes.  Stops at the first undefined argument.
# The former empty prototype "()" was dropped: the sub always takes
# arguments, and it only worked because every caller uses the "&" call
# form, which bypasses prototype checking.
sub _data_word
{ my $i;
    while(defined($i=shift)) { $code.=sprintf".long\t0x%08x,0x%08x\n",$i,$i; }
}

# Open the encryption table object; its word entries follow as the
# argument list of _data_word.
$code=<<___;
.text

.type	AES_Te,\@object
.align	256
AES_Te:
___
&_data_word(
	0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d,
	0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554,
	0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d,
	0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a,
	0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87,
	0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b,
	0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea,
	0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b,
	0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a,
	0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f,
	0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108,
	0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f,
	0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e,
	0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5,
	0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d,
	0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f,
	0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e,
	0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb,
	0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce,
	0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497,
	0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c,
	0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed,
	0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b,
# AES_Te words, continued: remaining arguments of the _data_word call.
	0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a,
	0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16,
	0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594,
	0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81,
	0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3,
	0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a,
	0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504,
	0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163,
	0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d,
	0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f,
	0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739,
	0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47,
	0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395,
	0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f,
	0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883,
	0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c,
	0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76,
	0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e,
	0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4,
	0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6,
	0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b,
	0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7,
	0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0,
	0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25,
	0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818,
	0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72,
	0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651,
	0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21,
	0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85,
	0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa,
	0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12,
	0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0,
	0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9,
	0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133,
	0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7,
	0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920,
	0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a,
	0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17,
	0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8,
	0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11,
	0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a);
# Finish the AES_Te object: the 256 bytes labelled Te4, the rcon[] words
# (the key schedule reads both through AES_Te+2048) and padding to a
# 256-byte boundary.  The same here-document then opens the public
# AES_encrypt symbol.
$code.=<<___;
# Te4[256]
.byte	0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
.byte	0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
.byte	0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
.byte	0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
.byte	0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
.byte	0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
.byte	0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
.byte	0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
.byte	0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
.byte	0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
.byte	0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
.byte	0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
.byte	0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
.byte	0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
.byte	0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
.byte	0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
.byte	0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
.byte	0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
.byte	0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
.byte	0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
.byte	0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
.byte	0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
.byte	0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
.byte	0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
.byte	0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
.byte	0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
.byte	0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
.byte	0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
.byte	0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
.byte	0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
.byte	0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
.byte	0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
# rcon[]
.long	0x01000000, 0x02000000, 0x04000000, 0x08000000
.long	0x10000000, 0x20000000, 0x40000000, 0x80000000
.long	0x1B000000, 0x36000000, 0, 0, 0, 0, 0, 0
.align	256
.size	AES_Te,.-AES_Te

# void AES_encrypt(const unsigned char *inp, unsigned char *out,
# 		 const AES_KEY *key) {
.globl	AES_encrypt
.type	AES_encrypt,\@function
AES_encrypt:
___
# Hardware path (omitted when $softonly is set): the word at offset 240
# of the key structure is compared with 16.  Smaller values branch to the
# software path at .Lesoft; otherwise (the hardware key setup stores its
# "km code" at that offset) a single 16-byte block is processed by the km
# instruction with the key structure as parameter block.
$code.=<<___ if (!$softonly);
	l	%r0,240($key)
	lhi	%r1,16
	clr	%r0,%r1
	jl	.Lesoft

	la	%r1,0($key)
	#la	%r2,0($inp)
	la	%r4,0($out)
	lghi	%r3,16		# single block length
	.long	0xb92e0042	# km %r4,%r2
	brc	1,.-4		# can this happen?
	br	%r14
.align	64
.Lesoft:
___
# Software path: save registers, load the input block as four 32-bit
# words into $s0-$s3, point $tbl at AES_Te and call _s390x_AES_encrypt;
# the four result words are stored through the output pointer reloaded
# from the save area.
# _s390x_AES_encrypt follows in the same here-document.  It takes the
# state in $s0-$s3, the key schedule in $key and the table in $tbl, runs
# (rounds-1) table-lookup rounds in .Lenc_loop followed by a final round
# built from single-byte loads, and returns the state in $s0-$s3.  It
# advances $key and uses $ra as scratch (saved and reloaded via the
# stack frame).
$code.=<<___;
	stm${g}	%r3,$ra,3*$SIZE_T($sp)

	llgf	$s0,0($inp)
	llgf	$s1,4($inp)
	llgf	$s2,8($inp)
	llgf	$s3,12($inp)

	larl	$tbl,AES_Te
	bras	$ra,_s390x_AES_encrypt

	l${g}	$out,3*$SIZE_T($sp)
	st	$s0,0($out)
	st	$s1,4($out)
	st	$s2,8($out)
	st	$s3,12($out)

	lm${g}	%r6,$ra,6*$SIZE_T($sp)
	br	$ra
.size	AES_encrypt,.-AES_encrypt

.type   _s390x_AES_encrypt,\@function
.align	16
_s390x_AES_encrypt:
	st${g}	$ra,15*$SIZE_T($sp)
	x	$s0,0($key)
	x	$s1,4($key)
	x	$s2,8($key)
	x	$s3,12($key)
	l	$rounds,240($key)
	llill	$mask,`0xff<<3`
	aghi	$rounds,-1
	j	.Lenc_loop
.align	16
.Lenc_loop:
	sllg	$t1,$s0,`0+3`
	srlg	$t2,$s0,`8-3`
	srlg	$t3,$s0,`16-3`
	srl	$s0,`24-3`
	nr	$s0,$mask
	ngr	$t1,$mask
	nr	$t2,$mask
	nr	$t3,$mask

	srlg	$i1,$s1,`16-3`	# i0
	sllg	$i2,$s1,`0+3`
	srlg	$i3,$s1,`8-3`
	srl	$s1,`24-3`
	nr	$i1,$mask
	nr	$s1,$mask
	ngr	$i2,$mask
	nr	$i3,$mask

	l	$s0,0($s0,$tbl)	# Te0[s0>>24]
	l	$t1,1($t1,$tbl)	# Te3[s0>>0]
	l	$t2,2($t2,$tbl) # Te2[s0>>8]
	l	$t3,3($t3,$tbl)	# Te1[s0>>16]

	x	$s0,3($i1,$tbl)	# Te1[s1>>16]
	l	$s1,0($s1,$tbl)	# Te0[s1>>24]
	x	$t2,1($i2,$tbl)	# Te3[s1>>0]
	x	$t3,2($i3,$tbl)	# Te2[s1>>8]

	srlg	$i1,$s2,`8-3`	# i0
	srlg	$i2,$s2,`16-3`	# i1
	nr	$i1,$mask
	nr	$i2,$mask
	sllg	$i3,$s2,`0+3`
	srl	$s2,`24-3`
	nr	$s2,$mask
	ngr	$i3,$mask

	xr	$s1,$t1
	srlg	$ra,$s3,`8-3`	# i1
	sllg	$t1,$s3,`0+3`	# i0
	nr	$ra,$mask
	la	$key,16($key)
	ngr	$t1,$mask

	x	$s0,2($i1,$tbl)	# Te2[s2>>8]
	x	$s1,3($i2,$tbl)	# Te1[s2>>16]
	l	$s2,0($s2,$tbl)	# Te0[s2>>24]
	x	$t3,1($i3,$tbl)	# Te3[s2>>0]

	srlg	$i3,$s3,`16-3`	# i2
	xr	$s2,$t2
	srl	$s3,`24-3`
	nr	$i3,$mask
	nr	$s3,$mask

	x	$s0,0($key)
	x	$s1,4($key)
	x	$s2,8($key)
	x	$t3,12($key)

	x	$s0,1($t1,$tbl)	# Te3[s3>>0]
	x	$s1,2($ra,$tbl)	# Te2[s3>>8]
	x	$s2,3($i3,$tbl)	# Te1[s3>>16]
	l	$s3,0($s3,$tbl)	# Te0[s3>>24]
	xr	$s3,$t3

	brct	$rounds,.Lenc_loop
	.align	16

	sllg	$t1,$s0,`0+3`
	srlg	$t2,$s0,`8-3`
	ngr	$t1,$mask
	srlg	$t3,$s0,`16-3`
	srl	$s0,`24-3`
	nr	$s0,$mask
	nr	$t2,$mask
	nr	$t3,$mask

	srlg	$i1,$s1,`16-3`	# i0
	sllg	$i2,$s1,`0+3`
	ngr	$i2,$mask
	srlg	$i3,$s1,`8-3`
	srl	$s1,`24-3`
	nr	$i1,$mask
	nr	$s1,$mask
	nr	$i3,$mask

	llgc	$s0,2($s0,$tbl)	# Te4[s0>>24]
	llgc	$t1,2($t1,$tbl)	# Te4[s0>>0]
	sll	$s0,24
	llgc	$t2,2($t2,$tbl)	# Te4[s0>>8]
	llgc	$t3,2($t3,$tbl)	# Te4[s0>>16]
	sll	$t2,8
	sll	$t3,16

	llgc	$i1,2($i1,$tbl)	# Te4[s1>>16]
	llgc	$s1,2($s1,$tbl)	# Te4[s1>>24]
	llgc	$i2,2($i2,$tbl)	# Te4[s1>>0]
	llgc	$i3,2($i3,$tbl)	# Te4[s1>>8]
	sll	$i1,16
	sll	$s1,24
	sll	$i3,8
	or	$s0,$i1
	or	$s1,$t1
	or	$t2,$i2
	or	$t3,$i3

	srlg	$i1,$s2,`8-3`	# i0
	srlg	$i2,$s2,`16-3`	# i1
	nr	$i1,$mask
	nr	$i2,$mask
	sllg	$i3,$s2,`0+3`
	srl	$s2,`24-3`
	ngr	$i3,$mask
	nr	$s2,$mask

	sllg	$t1,$s3,`0+3`	# i0
	srlg	$ra,$s3,`8-3`	# i1
	ngr	$t1,$mask

	llgc	$i1,2($i1,$tbl)	# Te4[s2>>8]
	llgc	$i2,2($i2,$tbl)	# Te4[s2>>16]
	sll	$i1,8
	llgc	$s2,2($s2,$tbl)	# Te4[s2>>24]
	llgc	$i3,2($i3,$tbl)	# Te4[s2>>0]
	sll	$i2,16
	nr	$ra,$mask
	sll	$s2,24
	or	$s0,$i1
	or	$s1,$i2
	or	$s2,$t2
	or	$t3,$i3

	srlg	$i3,$s3,`16-3`	# i2
	srl	$s3,`24-3`
	nr	$i3,$mask
	nr	$s3,$mask

	l	$t0,16($key)
	l	$t2,20($key)

	llgc	$i1,2($t1,$tbl)	# Te4[s3>>0]
	llgc	$i2,2($ra,$tbl)	# Te4[s3>>8]
	llgc	$i3,2($i3,$tbl)	# Te4[s3>>16]
	llgc	$s3,2($s3,$tbl)	# Te4[s3>>24]
	sll	$i2,8
	sll	$i3,16
	sll	$s3,24
	or	$s0,$i1
	or	$s1,$i2
	or	$s2,$i3
	or	$s3,$t3

	l${g}	$ra,15*$SIZE_T($sp)
	xr	$s0,$t0
	xr	$s1,$t2
	x	$s2,24($key)
	x	$s3,28($key)

	br	$ra
.size	_s390x_AES_encrypt,.-_s390x_AES_encrypt
___

# Open the decryption table object AES_Td; its word entries are emitted
# by the _data_word call that follows (each word twice, as for AES_Te).
$code.=<<___;
.type	AES_Td,\@object
.align	256
AES_Td:
___
&_data_word(
	0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96,
	0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393,
	0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25,
	0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f,
	0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1,
	0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6,
	0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da,
	0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844,
	0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd,
	0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4,
	0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45,
	0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94,
	0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7,
	0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a,
	0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5,
	0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c,
	0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1,
	0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a,
	0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75,
	0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051,
	0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46,
	0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff,
	0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77,
# AES_Td words, continued: remaining arguments of the _data_word call.
	0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb,
	0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000,
	0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e,
	0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927,
	0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a,
	0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e,
	0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16,
	0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d,
	0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8,
	0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd,
	0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34,
	0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163,
	0xd731dcca, 0x42638510, 0x13972240, 0x84c61120,
	0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d,
	0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0,
	0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422,
	0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef,
	0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36,
	0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4,
	0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662,
	0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5,
	0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3,
	0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b,
	0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8,
	0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6,
	0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6,
	0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0,
	0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815,
	0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f,
	0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df,
	0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f,
	0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e,
	0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713,
	0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89,
	0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c,
	0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf,
	0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86,
	0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f,
	0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541,
	0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190,
	0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742);
# Finish the AES_Td object with the 256 bytes labelled Td4 (the final
# decryption round reads them at offset 2048 from AES_Td), then open the
# public AES_decrypt symbol.
$code.=<<___;
# Td4[256]
.byte	0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
.byte	0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
.byte	0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
.byte	0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
.byte	0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
.byte	0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
.byte	0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
.byte	0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
.byte	0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
.byte	0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
.byte	0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
.byte	0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
.byte	0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
.byte	0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
.byte	0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
.byte	0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
.byte	0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
.byte	0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
.byte	0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
.byte	0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
.byte	0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
.byte	0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
.byte	0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
.byte	0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
.byte	0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
.byte	0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
.byte	0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
.byte	0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
.byte	0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
.byte	0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
.byte	0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
.byte	0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
.size	AES_Td,.-AES_Td

# void AES_decrypt(const unsigned char *inp, unsigned char *out,
# 		 const AES_KEY *key) {
.globl	AES_decrypt
.type	AES_decrypt,\@function
AES_decrypt:
___
# Hardware path (omitted when $softonly is set): mirrors AES_encrypt.  A
# value below 16 at offset 240 of the key structure branches to the
# software path at .Ldsoft; otherwise one 16-byte block is processed by
# the km instruction.
$code.=<<___ if (!$softonly);
	l	%r0,240($key)
	lhi	%r1,16
	clr	%r0,%r1
	jl	.Ldsoft

	la	%r1,0($key)
	#la	%r2,0($inp)
	la	%r4,0($out)
	lghi	%r3,16		# single block length
	.long	0xb92e0042	# km %r4,%r2
	brc	1,.-4		# can this happen?
	br	%r14
.align	64
.Ldsoft:
___
# Software path: load the input block into $s0-$s3, point $tbl at AES_Td
# and call _s390x_AES_decrypt, then store the four result words through
# the saved output pointer.
# _s390x_AES_decrypt follows in the same here-document.  It has the same
# register contract as _s390x_AES_encrypt: (rounds-1) table-lookup rounds
# in .Ldec_loop, then a final round using byte loads from the Td4 area at
# 2048($tbl).
$code.=<<___;
	stm${g}	%r3,$ra,3*$SIZE_T($sp)

	llgf	$s0,0($inp)
	llgf	$s1,4($inp)
	llgf	$s2,8($inp)
	llgf	$s3,12($inp)

	larl	$tbl,AES_Td
	bras	$ra,_s390x_AES_decrypt

	l${g}	$out,3*$SIZE_T($sp)
	st	$s0,0($out)
	st	$s1,4($out)
	st	$s2,8($out)
	st	$s3,12($out)

	lm${g}	%r6,$ra,6*$SIZE_T($sp)
	br	$ra
.size	AES_decrypt,.-AES_decrypt

.type   _s390x_AES_decrypt,\@function
.align	16
_s390x_AES_decrypt:
	st${g}	$ra,15*$SIZE_T($sp)
	x	$s0,0($key)
	x	$s1,4($key)
	x	$s2,8($key)
	x	$s3,12($key)
	l	$rounds,240($key)
	llill	$mask,`0xff<<3`
	aghi	$rounds,-1
	j	.Ldec_loop
.align	16
.Ldec_loop:
	srlg	$t1,$s0,`16-3`
	srlg	$t2,$s0,`8-3`
	sllg	$t3,$s0,`0+3`
	srl	$s0,`24-3`
	nr	$s0,$mask
	nr	$t1,$mask
	nr	$t2,$mask
	ngr	$t3,$mask

	sllg	$i1,$s1,`0+3`	# i0
	srlg	$i2,$s1,`16-3`
	srlg	$i3,$s1,`8-3`
	srl	$s1,`24-3`
	ngr	$i1,$mask
	nr	$s1,$mask
	nr	$i2,$mask
	nr	$i3,$mask

	l	$s0,0($s0,$tbl)	# Td0[s0>>24]
	l	$t1,3($t1,$tbl)	# Td1[s0>>16]
	l	$t2,2($t2,$tbl)	# Td2[s0>>8]
	l	$t3,1($t3,$tbl)	# Td3[s0>>0]

	x	$s0,1($i1,$tbl)	# Td3[s1>>0]
	l	$s1,0($s1,$tbl)	# Td0[s1>>24]
	x	$t2,3($i2,$tbl)	# Td1[s1>>16]
	x	$t3,2($i3,$tbl)	# Td2[s1>>8]

	srlg	$i1,$s2,`8-3`	# i0
	sllg	$i2,$s2,`0+3`	# i1
	srlg	$i3,$s2,`16-3`
	srl	$s2,`24-3`
	nr	$i1,$mask
	ngr	$i2,$mask
	nr	$s2,$mask
	nr	$i3,$mask

	xr	$s1,$t1
	srlg	$ra,$s3,`8-3`	# i1
	srlg	$t1,$s3,`16-3`	# i0
	nr	$ra,$mask
	la	$key,16($key)
	nr	$t1,$mask

	x	$s0,2($i1,$tbl)	# Td2[s2>>8]
	x	$s1,1($i2,$tbl)	# Td3[s2>>0]
	l	$s2,0($s2,$tbl)	# Td0[s2>>24]
	x	$t3,3($i3,$tbl)	# Td1[s2>>16]

	sllg	$i3,$s3,`0+3`	# i2
	srl	$s3,`24-3`
	ngr	$i3,$mask
	nr	$s3,$mask

	xr	$s2,$t2
	x	$s0,0($key)
	x	$s1,4($key)
	x	$s2,8($key)
	x	$t3,12($key)

	x	$s0,3($t1,$tbl)	# Td1[s3>>16]
	x	$s1,2($ra,$tbl)	# Td2[s3>>8]
	x	$s2,1($i3,$tbl)	# Td3[s3>>0]
	l	$s3,0($s3,$tbl)	# Td0[s3>>24]
	xr	$s3,$t3

	brct	$rounds,.Ldec_loop
	.align	16

	l	$t1,`2048+0`($tbl)	# prefetch Td4
	l	$t2,`2048+64`($tbl)
	l	$t3,`2048+128`($tbl)
	l	$i1,`2048+192`($tbl)
	llill	$mask,0xff

	srlg	$i3,$s0,24	# i0
	srlg	$t1,$s0,16
	srlg	$t2,$s0,8
	nr	$s0,$mask	# i3
	nr	$t1,$mask

	srlg	$i1,$s1,24
	nr	$t2,$mask
	srlg	$i2,$s1,16
	srlg	$ra,$s1,8
	nr	$s1,$mask	# i0
	nr	$i2,$mask
	nr	$ra,$mask

	llgc	$i3,2048($i3,$tbl)	# Td4[s0>>24]
	llgc	$t1,2048($t1,$tbl)	# Td4[s0>>16]
	llgc	$t2,2048($t2,$tbl)	# Td4[s0>>8]
	sll	$t1,16
	llgc	$t3,2048($s0,$tbl)	# Td4[s0>>0]
	sllg	$s0,$i3,24
	sll	$t2,8

	llgc	$s1,2048($s1,$tbl)	# Td4[s1>>0]
	llgc	$i1,2048($i1,$tbl)	# Td4[s1>>24]
	llgc	$i2,2048($i2,$tbl)	# Td4[s1>>16]
	sll	$i1,24
	llgc	$i3,2048($ra,$tbl)	# Td4[s1>>8]
	sll	$i2,16
	sll	$i3,8
	or	$s0,$s1
	or	$t1,$i1
	or	$t2,$i2
	or	$t3,$i3

	srlg	$i1,$s2,8	# i0
	srlg	$i2,$s2,24
	srlg	$i3,$s2,16
	nr	$s2,$mask	# i1
	nr	$i1,$mask
	nr	$i3,$mask
	llgc	$i1,2048($i1,$tbl)	# Td4[s2>>8]
	llgc	$s1,2048($s2,$tbl)	# Td4[s2>>0]
	llgc	$i2,2048($i2,$tbl)	# Td4[s2>>24]
	llgc	$i3,2048($i3,$tbl)	# Td4[s2>>16]
	sll	$i1,8
	sll	$i2,24
	or	$s0,$i1
	sll	$i3,16
	or	$t2,$i2
	or	$t3,$i3

	srlg	$i1,$s3,16	# i0
	srlg	$i2,$s3,8	# i1
	srlg	$i3,$s3,24
	nr	$s3,$mask	# i2
	nr	$i1,$mask
	nr	$i2,$mask

	l${g}	$ra,15*$SIZE_T($sp)
	or	$s1,$t1
	l	$t0,16($key)
	l	$t1,20($key)

	llgc	$i1,2048($i1,$tbl)	# Td4[s3>>16]
	llgc	$i2,2048($i2,$tbl)	# Td4[s3>>8]
	sll	$i1,16
	llgc	$s2,2048($s3,$tbl)	# Td4[s3>>0]
	llgc	$s3,2048($i3,$tbl)	# Td4[s3>>24]
	sll	$i2,8
	sll	$s3,24
	or	$s0,$i1
	or	$s1,$i2
	or	$s2,$t2
	or	$s3,$t3

	xr	$s0,$t0
	xr	$s1,$t1
	x	$s2,24($key)
	x	$s3,28($key)

	br	$ra
.size	_s390x_AES_decrypt,.-_s390x_AES_decrypt
___

# Key setup entry point private_AES_set_encrypt_key, with the local alias
# _s390x_AES_set_encrypt_key used by the decrypt-key routine.  It returns
# -1 in %r2 when the input or key pointer is zero and -2 when bits is not
# 128, 192 or 256; valid requests continue at .Lproceed.
$code.=<<___;
# void AES_set_encrypt_key(const unsigned char *in, int bits,
# 		 AES_KEY *key) {
.globl	private_AES_set_encrypt_key
.type	private_AES_set_encrypt_key,\@function
.align	16
private_AES_set_encrypt_key:
_s390x_AES_set_encrypt_key:
	lghi	$t0,0
	cl${g}r	$inp,$t0
	je	.Lminus1
	cl${g}r	$key,$t0
	je	.Lminus1

	lghi	$t0,128
	clr	$bits,$t0
	je	.Lproceed
	lghi	$t0,192
	clr	$bits,$t0
	je	.Lproceed
	lghi	$t0,256
	clr	$bits,$t0
	je	.Lproceed
	lghi	%r2,-2
	br	%r14

.align	16
.Lproceed:
___
# Hardware key setup (omitted when $softonly is set): derive the km
# function code 18/19/20 from the bit length, then test
# OPENSSL_s390xcap_P.  If either tested capability bit is clear, control
# goes to the software schedule at .Lekey_internal; otherwise the raw key
# is copied into the key structure and the code is stored at offset 240.
$code.=<<___ if (!$softonly);
	# convert bits to km code, [128,192,256]->[18,19,20]
	lhi	%r5,-128
	lhi	%r0,18
	ar	%r5,$bits
	srl	%r5,6
	ar	%r5,%r0

	larl	%r1,OPENSSL_s390xcap_P
	lg	%r0,0(%r1)
	tmhl	%r0,0x4000	# check for message-security assist
	jz	.Lekey_internal

	llihh	%r0,0x8000
	srlg	%r0,%r0,0(%r5)
	ng	%r0,48(%r1)	# check kmc capability vector
	jz	.Lekey_internal

	lmg	%r0,%r1,0($inp)	# just copy 128 bits...
	stmg	%r0,%r1,0($key)
	lhi	%r0,192
	cr	$bits,%r0
	jl	1f
	lg	%r1,16($inp)
	stg	%r1,16($key)
	je	1f
	lg	%r1,24($inp)
	stg	%r1,24($key)
1:	st	$bits,236($key)	# save bits [for debugging purposes]
	lgr	$t0,%r5
	st	%r5,240($key)	# save km code
	lghi	%r2,0
	br	%r14
___
# Software key schedule (.Lekey_internal): copies the user key into the
# key structure and expands it into round keys using the Te4 bytes and
# rcon[] words addressed through AES_Te+2048.  There is one loop per key
# size (.L128_loop, .L192_loop, .L256_loop).  The round count (10, 12 or
# 14) is stored at offset 240 of the key structure and is also left in
# $t0 on return, which private_AES_set_decrypt_key relies on; %r2 is set
# to 0.
# The same here-document ends with .Lminus1 (returns -1) and opens
# private_AES_set_decrypt_key, which saves only $ra before calling the
# encrypt-key routine.
$code.=<<___;
.align	16
.Lekey_internal:
	stm${g}	%r4,%r13,4*$SIZE_T($sp)	# all non-volatile regs and $key

	larl	$tbl,AES_Te+2048

	llgf	$s0,0($inp)
	llgf	$s1,4($inp)
	llgf	$s2,8($inp)
	llgf	$s3,12($inp)
	st	$s0,0($key)
	st	$s1,4($key)
	st	$s2,8($key)
	st	$s3,12($key)
	lghi	$t0,128
	cr	$bits,$t0
	jne	.Lnot128

	llill	$mask,0xff
	lghi	$t3,0			# i=0
	lghi	$rounds,10
	st	$rounds,240($key)

	llgfr	$t2,$s3			# temp=rk[3]
	srlg	$i1,$s3,8
	srlg	$i2,$s3,16
	srlg	$i3,$s3,24
	nr	$t2,$mask
	nr	$i1,$mask
	nr	$i2,$mask

.align	16
.L128_loop:
	la	$t2,0($t2,$tbl)
	la	$i1,0($i1,$tbl)
	la	$i2,0($i2,$tbl)
	la	$i3,0($i3,$tbl)
	icm	$t2,2,0($t2)		# Te4[rk[3]>>0]<<8
	icm	$t2,4,0($i1)		# Te4[rk[3]>>8]<<16
	icm	$t2,8,0($i2)		# Te4[rk[3]>>16]<<24
	icm	$t2,1,0($i3)		# Te4[rk[3]>>24]
	x	$t2,256($t3,$tbl)	# rcon[i]
	xr	$s0,$t2			# rk[4]=rk[0]^...
	xr	$s1,$s0			# rk[5]=rk[1]^rk[4]
	xr	$s2,$s1			# rk[6]=rk[2]^rk[5]
	xr	$s3,$s2			# rk[7]=rk[3]^rk[6]

	llgfr	$t2,$s3			# temp=rk[3]
	srlg	$i1,$s3,8
	srlg	$i2,$s3,16
	nr	$t2,$mask
	nr	$i1,$mask
	srlg	$i3,$s3,24
	nr	$i2,$mask

	st	$s0,16($key)
	st	$s1,20($key)
	st	$s2,24($key)
	st	$s3,28($key)
	la	$key,16($key)		# key+=4
	la	$t3,4($t3)		# i++
	brct	$rounds,.L128_loop
	lghi	$t0,10
	lghi	%r2,0
	lm${g}	%r4,%r13,4*$SIZE_T($sp)
	br	$ra

.align	16
.Lnot128:
	llgf	$t0,16($inp)
	llgf	$t1,20($inp)
	st	$t0,16($key)
	st	$t1,20($key)
	lghi	$t0,192
	cr	$bits,$t0
	jne	.Lnot192

	llill	$mask,0xff
	lghi	$t3,0			# i=0
	lghi	$rounds,12
	st	$rounds,240($key)
	lghi	$rounds,8

	srlg	$i1,$t1,8
	srlg	$i2,$t1,16
	srlg	$i3,$t1,24
	nr	$t1,$mask
	nr	$i1,$mask
	nr	$i2,$mask

.align	16
.L192_loop:
	la	$t1,0($t1,$tbl)
	la	$i1,0($i1,$tbl)
	la	$i2,0($i2,$tbl)
	la	$i3,0($i3,$tbl)
	icm	$t1,2,0($t1)		# Te4[rk[5]>>0]<<8
	icm	$t1,4,0($i1)		# Te4[rk[5]>>8]<<16
	icm	$t1,8,0($i2)		# Te4[rk[5]>>16]<<24
	icm	$t1,1,0($i3)		# Te4[rk[5]>>24]
	x	$t1,256($t3,$tbl)	# rcon[i]
	xr	$s0,$t1			# rk[6]=rk[0]^...
	xr	$s1,$s0			# rk[7]=rk[1]^rk[6]
	xr	$s2,$s1			# rk[8]=rk[2]^rk[7]
	xr	$s3,$s2			# rk[9]=rk[3]^rk[8]

	st	$s0,24($key)
	st	$s1,28($key)
	st	$s2,32($key)
	st	$s3,36($key)
	brct	$rounds,.L192_continue
	lghi	$t0,12
	lghi	%r2,0
	lm${g}	%r4,%r13,4*$SIZE_T($sp)
	br	$ra

.align	16
.L192_continue:
	lgr	$t1,$s3
	x	$t1,16($key)		# rk[10]=rk[4]^rk[9]
	st	$t1,40($key)
	x	$t1,20($key)		# rk[11]=rk[5]^rk[10]
	st	$t1,44($key)

	srlg	$i1,$t1,8
	srlg	$i2,$t1,16
	srlg	$i3,$t1,24
	nr	$t1,$mask
	nr	$i1,$mask
	nr	$i2,$mask

	la	$key,24($key)		# key+=6
	la	$t3,4($t3)		# i++
	j	.L192_loop

.align	16
.Lnot192:
	llgf	$t0,24($inp)
	llgf	$t1,28($inp)
	st	$t0,24($key)
	st	$t1,28($key)
	llill	$mask,0xff
	lghi	$t3,0			# i=0
	lghi	$rounds,14
	st	$rounds,240($key)
	lghi	$rounds,7

	srlg	$i1,$t1,8
	srlg	$i2,$t1,16
	srlg	$i3,$t1,24
	nr	$t1,$mask
	nr	$i1,$mask
	nr	$i2,$mask

.align	16
.L256_loop:
	la	$t1,0($t1,$tbl)
	la	$i1,0($i1,$tbl)
	la	$i2,0($i2,$tbl)
	la	$i3,0($i3,$tbl)
	icm	$t1,2,0($t1)		# Te4[rk[7]>>0]<<8
	icm	$t1,4,0($i1)		# Te4[rk[7]>>8]<<16
	icm	$t1,8,0($i2)		# Te4[rk[7]>>16]<<24
	icm	$t1,1,0($i3)		# Te4[rk[7]>>24]
	x	$t1,256($t3,$tbl)	# rcon[i]
	xr	$s0,$t1			# rk[8]=rk[0]^...
	xr	$s1,$s0			# rk[9]=rk[1]^rk[8]
	xr	$s2,$s1			# rk[10]=rk[2]^rk[9]
	xr	$s3,$s2			# rk[11]=rk[3]^rk[10]
	st	$s0,32($key)
	st	$s1,36($key)
	st	$s2,40($key)
	st	$s3,44($key)
	brct	$rounds,.L256_continue
	lghi	$t0,14
	lghi	%r2,0
	lm${g}	%r4,%r13,4*$SIZE_T($sp)
	br	$ra

.align	16
.L256_continue:
	lgr	$t1,$s3			# temp=rk[11]
	srlg	$i1,$s3,8
	srlg	$i2,$s3,16
	srlg	$i3,$s3,24
	nr	$t1,$mask
	nr	$i1,$mask
	nr	$i2,$mask
	la	$t1,0($t1,$tbl)
	la	$i1,0($i1,$tbl)
	la	$i2,0($i2,$tbl)
	la	$i3,0($i3,$tbl)
	llgc	$t1,0($t1)		# Te4[rk[11]>>0]
	icm	$t1,2,0($i1)		# Te4[rk[11]>>8]<<8
	icm	$t1,4,0($i2)		# Te4[rk[11]>>16]<<16
	icm	$t1,8,0($i3)		# Te4[rk[11]>>24]<<24
	x	$t1,16($key)		# rk[12]=rk[4]^...
	st	$t1,48($key)
	x	$t1,20($key)		# rk[13]=rk[5]^rk[12]
	st	$t1,52($key)
	x	$t1,24($key)		# rk[14]=rk[6]^rk[13]
	st	$t1,56($key)
	x	$t1,28($key)		# rk[15]=rk[7]^rk[14]
	st	$t1,60($key)

	srlg	$i1,$t1,8
	srlg	$i2,$t1,16
	srlg	$i3,$t1,24
	nr	$t1,$mask
	nr	$i1,$mask
	nr	$i2,$mask

	la	$key,32($key)		# key+=8
	la	$t3,4($t3)		# i++
	j	.L256_loop

.Lminus1:
	lghi	%r2,-1
	br	$ra
.size	private_AES_set_encrypt_key,.-private_AES_set_encrypt_key

# void AES_set_decrypt_key(const unsigned char *in, int bits,
# 		 AES_KEY *key) {
.globl	private_AES_set_decrypt_key
.type	private_AES_set_decrypt_key,\@function
.align	16
private_AES_set_decrypt_key:
	#st${g}	$key,4*$SIZE_T($sp)	# I rely on AES_set_encrypt_key to
	st${g}	$ra,14*$SIZE_T($sp)	# save non-volatile registers and $key!
	bras	$ra,_s390x_AES_set_encrypt_key
	#l${g}	$key,4*$SIZE_T($sp)
	l${g}	$ra,14*$SIZE_T($sp)
	ltgr	%r2,%r2
	bnzr	$ra
___
# Hardware-keyed case (omitted when $softonly is set): $t0 still holds
# the value the encrypt-key routine left there.  When it is 16 or more,
# bit 0x80 (the "decrypt" bit) is set in it, it is stored back at offset
# 240 of the key structure, and the routine returns without touching the
# schedule.  Smaller values (software round counts) continue at .Lgo.
$code.=<<___ if (!$softonly);
	#l	$t0,240($key)
	lhi	$t1,16
	cr	$t0,$t1
	jl	.Lgo
	oill	$t0,0x80	# set "decrypt" bit
	st	$t0,240($key)
	br	$ra
___
# Software case, step 1 (.Linv): reverse the order of the 16-byte round
# keys in place.  $i1 walks up from the first round key and $i2 walks
# down from key+16*rounds, swapping one pair per iteration for rounds/2
# iterations.
$code.=<<___;
.align	16
.Lgo:	lgr	$rounds,$t0	#llgf	$rounds,240($key)
	la	$i1,0($key)
	sllg	$i2,$rounds,4
	la	$i2,0($i2,$key)
	srl	$rounds,1
	lghi	$t1,-16

.align	16
.Linv:	lmg	$s0,$s1,0($i1)
	lmg	$s2,$s3,0($i2)
	stmg	$s0,$s1,0($i2)
	stmg	$s2,$s3,0($i1)
	la	$i1,16($i1)
	la	$i2,0($t1,$i2)
	brct	$rounds,.Linv
___
# The index registers are free after .Linv; reuse them for the three
# byte-replicated constants needed by the .Lmix loop.
$mask80=$i1;
$mask1b=$i2;
$maskfe=$i3;
# Software case, step 2 (.Lmix): rewrite (rounds-1)*4 words in place,
# starting at offset 16 of the key structure.  For each word tp1 the loop
# derives tp2, tp4 and tp8 by repeated byte-wise doubling with the
# 0x80/0x1b/0xfe masks and combines them with rotations, as annotated.
# NOTE(review): this looks like InvMixColumns applied to the inner round
# keys - confirm.  The registers saved by the encrypt-key routine are
# restored and 0 is returned in %r2.
$code.=<<___;
	llgf	$rounds,240($key)
	aghi	$rounds,-1
	sll	$rounds,2	# (rounds-1)*4
	llilh	$mask80,0x8080
	llilh	$mask1b,0x1b1b
	llilh	$maskfe,0xfefe
	oill	$mask80,0x8080
	oill	$mask1b,0x1b1b
	oill	$maskfe,0xfefe

.align	16
.Lmix:	l	$s0,16($key)	# tp1
	lr	$s1,$s0
	ngr	$s1,$mask80
	srlg	$t1,$s1,7
	slr	$s1,$t1
	nr	$s1,$mask1b
	sllg	$t1,$s0,1
	nr	$t1,$maskfe
	xr	$s1,$t1		# tp2

	lr	$s2,$s1
	ngr	$s2,$mask80
	srlg	$t1,$s2,7
	slr	$s2,$t1
	nr	$s2,$mask1b
	sllg	$t1,$s1,1
	nr	$t1,$maskfe
	xr	$s2,$t1		# tp4

	lr	$s3,$s2
	ngr	$s3,$mask80
	srlg	$t1,$s3,7
	slr	$s3,$t1
	nr	$s3,$mask1b
	sllg	$t1,$s2,1
	nr	$t1,$maskfe
	xr	$s3,$t1		# tp8

	xr	$s1,$s0		# tp2^tp1
	xr	$s2,$s0		# tp4^tp1
	rll	$s0,$s0,24	# = ROTATE(tp1,8)
	xr	$s2,$s3		# ^=tp8
	xr	$s0,$s1		# ^=tp2^tp1
	xr	$s1,$s3		# tp2^tp1^tp8
	xr	$s0,$s2		# ^=tp4^tp1^tp8
	rll	$s1,$s1,8
	rll	$s2,$s2,16
	xr	$s0,$s1		# ^= ROTATE(tp8^tp2^tp1,24)
	rll	$s3,$s3,24
	xr	$s0,$s2    	# ^= ROTATE(tp8^tp4^tp1,16)
	xr	$s0,$s3		# ^= ROTATE(tp8,8)

	st	$s0,16($key)
	la	$key,4($key)
	brct	$rounds,.Lmix

	lm${g}	%r6,%r13,6*$SIZE_T($sp)# as was saved by AES_set_encrypt_key!
	lghi	%r2,0
	br	$ra
.size	private_AES_set_decrypt_key,.-private_AES_set_decrypt_key
___

########################################################################
# void AES_cbc_encrypt(const unsigned char *in, unsigned char *out,
#                     size_t length, const AES_KEY *key,
#                     unsigned char *ivec, const int enc)
# The register names below are lexical to this block and shadow the
# file-level ones: the entry code exchanges %r3 and %r4, so $out is %r4
# and $len is %r3, and the key pointer is in %r5.
{
my $inp="%r2";
my $out="%r4";	# length and out are swapped
my $len="%r3";
my $key="%r5";
my $ivp="%r6";

$code.=<<___;
.globl	AES_cbc_encrypt
.type	AES_cbc_encrypt,\@function
.align	16
AES_cbc_encrypt:
	xgr	%r3,%r4		# flip %r3 and %r4, out and len
	xgr	%r4,%r3
	xgr	%r3,%r4
___
# Hardware path (omitted when $softonly is set): taken unless the word at
# offset 240 of the key structure is below 16.  It builds an ivec||key
# parameter block at 16($sp), runs kmc over the whole-block part of the
# input and copies the updated ivec back to the caller.  A trailing
# partial block is handled at .Lkmc_truncated through a 16-byte scratch
# buffer on the stack.
$code.=<<___ if (!$softonly);
	lhi	%r0,16
	cl	%r0,240($key)
	jh	.Lcbc_software

	lg	%r0,0($ivp)	# copy ivec
	lg	%r1,8($ivp)
	stmg	%r0,%r1,16($sp)
	lmg	%r0,%r1,0($key)	# copy key, cover 256 bit
	stmg	%r0,%r1,32($sp)
	lmg	%r0,%r1,16($key)
	stmg	%r0,%r1,48($sp)
	l	%r0,240($key)	# load kmc code
	lghi	$key,15		# res=len%16, len-=res;
	ngr	$key,$len
	sl${g}r	$len,$key
	la	%r1,16($sp)	# parameter block - ivec || key
	jz	.Lkmc_truncated
	.long	0xb92f0042	# kmc %r4,%r2
	brc	1,.-4		# pay attention to "partial completion"
	ltr	$key,$key
	jnz	.Lkmc_truncated
.Lkmc_done:
	lmg	%r0,%r1,16($sp)	# copy ivec to caller
	stg	%r0,0($ivp)
	stg	%r1,8($ivp)
	br	$ra
.align	16
.Lkmc_truncated:
	ahi	$key,-1		# it's the way it's encoded in mvc
	tmll	%r0,0x80
	jnz	.Lkmc_truncated_dec
	lghi	%r1,0
	stg	%r1,16*$SIZE_T($sp)
	stg	%r1,16*$SIZE_T+8($sp)
	bras	%r1,1f
	mvc	16*$SIZE_T(1,$sp),0($inp)
1:	ex	$key,0(%r1)
	la	%r1,16($sp)	# restore parameter block
	la	$inp,16*$SIZE_T($sp)
	lghi	$len,16
	.long	0xb92f0042	# kmc %r4,%r2
	j	.Lkmc_done
.align	16
1236.Lkmc_truncated_dec: 1237 st${g} $out,4*$SIZE_T($sp) 1238 la $out,16*$SIZE_T($sp) 1239 lghi $len,16 1240 .long 0xb92f0042 # kmc %r4,%r2 1241 l${g} $out,4*$SIZE_T($sp) 1242 bras %r1,2f 1243 mvc 0(1,$out),16*$SIZE_T($sp) 12442: ex $key,0(%r1) 1245 j .Lkmc_done 1246.align 16 1247.Lcbc_software: 1248___ 1249$code.=<<___; 1250 stm${g} $key,$ra,5*$SIZE_T($sp) 1251 lhi %r0,0 1252 cl %r0,`$stdframe+$SIZE_T-4`($sp) 1253 je .Lcbc_decrypt 1254 1255 larl $tbl,AES_Te 1256 1257 llgf $s0,0($ivp) 1258 llgf $s1,4($ivp) 1259 llgf $s2,8($ivp) 1260 llgf $s3,12($ivp) 1261 1262 lghi $t0,16 1263 sl${g}r $len,$t0 1264 brc 4,.Lcbc_enc_tail # if borrow 1265.Lcbc_enc_loop: 1266 stm${g} $inp,$out,2*$SIZE_T($sp) 1267 x $s0,0($inp) 1268 x $s1,4($inp) 1269 x $s2,8($inp) 1270 x $s3,12($inp) 1271 lgr %r4,$key 1272 1273 bras $ra,_s390x_AES_encrypt 1274 1275 lm${g} $inp,$key,2*$SIZE_T($sp) 1276 st $s0,0($out) 1277 st $s1,4($out) 1278 st $s2,8($out) 1279 st $s3,12($out) 1280 1281 la $inp,16($inp) 1282 la $out,16($out) 1283 lghi $t0,16 1284 lt${g}r $len,$len 1285 jz .Lcbc_enc_done 1286 sl${g}r $len,$t0 1287 brc 4,.Lcbc_enc_tail # if borrow 1288 j .Lcbc_enc_loop 1289.align 16 1290.Lcbc_enc_done: 1291 l${g} $ivp,6*$SIZE_T($sp) 1292 st $s0,0($ivp) 1293 st $s1,4($ivp) 1294 st $s2,8($ivp) 1295 st $s3,12($ivp) 1296 1297 lm${g} %r7,$ra,7*$SIZE_T($sp) 1298 br $ra 1299 1300.align 16 1301.Lcbc_enc_tail: 1302 aghi $len,15 1303 lghi $t0,0 1304 stg $t0,16*$SIZE_T($sp) 1305 stg $t0,16*$SIZE_T+8($sp) 1306 bras $t1,3f 1307 mvc 16*$SIZE_T(1,$sp),0($inp) 13083: ex $len,0($t1) 1309 lghi $len,0 1310 la $inp,16*$SIZE_T($sp) 1311 j .Lcbc_enc_loop 1312 1313.align 16 1314.Lcbc_decrypt: 1315 larl $tbl,AES_Td 1316 1317 lg $t0,0($ivp) 1318 lg $t1,8($ivp) 1319 stmg $t0,$t1,16*$SIZE_T($sp) 1320 1321.Lcbc_dec_loop: 1322 stm${g} $inp,$out,2*$SIZE_T($sp) 1323 llgf $s0,0($inp) 1324 llgf $s1,4($inp) 1325 llgf $s2,8($inp) 1326 llgf $s3,12($inp) 1327 lgr %r4,$key 1328 1329 bras $ra,_s390x_AES_decrypt 1330 1331 lm${g} 
$inp,$key,2*$SIZE_T($sp) 1332 sllg $s0,$s0,32 1333 sllg $s2,$s2,32 1334 lr $s0,$s1 1335 lr $s2,$s3 1336 1337 lg $t0,0($inp) 1338 lg $t1,8($inp) 1339 xg $s0,16*$SIZE_T($sp) 1340 xg $s2,16*$SIZE_T+8($sp) 1341 lghi $s1,16 1342 sl${g}r $len,$s1 1343 brc 4,.Lcbc_dec_tail # if borrow 1344 brc 2,.Lcbc_dec_done # if zero 1345 stg $s0,0($out) 1346 stg $s2,8($out) 1347 stmg $t0,$t1,16*$SIZE_T($sp) 1348 1349 la $inp,16($inp) 1350 la $out,16($out) 1351 j .Lcbc_dec_loop 1352 1353.Lcbc_dec_done: 1354 stg $s0,0($out) 1355 stg $s2,8($out) 1356.Lcbc_dec_exit: 1357 lm${g} %r6,$ra,6*$SIZE_T($sp) 1358 stmg $t0,$t1,0($ivp) 1359 1360 br $ra 1361 1362.align 16 1363.Lcbc_dec_tail: 1364 aghi $len,15 1365 stg $s0,16*$SIZE_T($sp) 1366 stg $s2,16*$SIZE_T+8($sp) 1367 bras $s1,4f 1368 mvc 0(1,$out),16*$SIZE_T($sp) 13694: ex $len,0($s1) 1370 j .Lcbc_dec_exit 1371.size AES_cbc_encrypt,.-AES_cbc_encrypt 1372___ 1373} 1374######################################################################## 1375# void AES_ctr32_encrypt(const unsigned char *in, unsigned char *out, 1376# size_t blocks, const AES_KEY *key, 1377# const unsigned char *ivec) 1378{ 1379my $inp="%r2"; 1380my $out="%r4"; # blocks and out are swapped 1381my $len="%r3"; 1382my $key="%r5"; my $iv0="%r5"; 1383my $ivp="%r6"; 1384my $fp ="%r7"; 1385 1386$code.=<<___; 1387.globl AES_ctr32_encrypt 1388.type AES_ctr32_encrypt,\@function 1389.align 16 1390AES_ctr32_encrypt: 1391 xgr %r3,%r4 # flip %r3 and %r4, $out and $len 1392 xgr %r4,%r3 1393 xgr %r3,%r4 1394 llgfr $len,$len # safe in ctr32 subroutine even in 64-bit case 1395___ 1396$code.=<<___ if (!$softonly); 1397 l %r0,240($key) 1398 lhi %r1,16 1399 clr %r0,%r1 1400 jl .Lctr32_software 1401 1402 stm${g} %r6,$s3,6*$SIZE_T($sp) 1403 1404 slgr $out,$inp 1405 la %r1,0($key) # %r1 is permanent copy of $key 1406 lg $iv0,0($ivp) # load ivec 1407 lg $ivp,8($ivp) 1408 1409 # prepare and allocate stack frame at the top of 4K page 1410 # with 1K reserved for eventual signal handling 1411 lghi 
$s0,-1024-256-16# guarantee at least 256-bytes buffer 1412 lghi $s1,-4096 1413 algr $s0,$sp 1414 lgr $fp,$sp 1415 ngr $s0,$s1 # align at page boundary 1416 slgr $fp,$s0 # total buffer size 1417 lgr $s2,$sp 1418 lghi $s1,1024+16 # sl[g]fi is extended-immediate facility 1419 slgr $fp,$s1 # deduct reservation to get usable buffer size 1420 # buffer size is at lest 256 and at most 3072+256-16 1421 1422 la $sp,1024($s0) # alloca 1423 srlg $fp,$fp,4 # convert bytes to blocks, minimum 16 1424 st${g} $s2,0($sp) # back-chain 1425 st${g} $fp,$SIZE_T($sp) 1426 1427 slgr $len,$fp 1428 brc 1,.Lctr32_hw_switch # not zero, no borrow 1429 algr $fp,$len # input is shorter than allocated buffer 1430 lghi $len,0 1431 st${g} $fp,$SIZE_T($sp) 1432 1433.Lctr32_hw_switch: 1434___ 1435$code.=<<___ if (0); ######### kmctr code was measured to be ~12% slower 1436 larl $s0,OPENSSL_s390xcap_P 1437 lg $s0,8($s0) 1438 tmhh $s0,0x0004 # check for message_security-assist-4 1439 jz .Lctr32_km_loop 1440 1441 llgfr $s0,%r0 1442 lgr $s1,%r1 1443 larl %r1,OPENSSL_s390xcap_P 1444 llihh %r0,0x8000 # check if kmctr supports the function code 1445 srlg %r0,%r0,0($s0) 1446 ng %r0,64(%r1) # check kmctr capability vector 1447 lgr %r0,$s0 1448 lgr %r1,$s1 1449 jz .Lctr32_km_loop 1450 1451####### kmctr code 1452 algr $out,$inp # restore $out 1453 lgr $s1,$len # $s1 undertakes $len 1454 j .Lctr32_kmctr_loop 1455.align 16 1456.Lctr32_kmctr_loop: 1457 la $s2,16($sp) 1458 lgr $s3,$fp 1459.Lctr32_kmctr_prepare: 1460 stg $iv0,0($s2) 1461 stg $ivp,8($s2) 1462 la $s2,16($s2) 1463 ahi $ivp,1 # 32-bit increment, preserves upper half 1464 brct $s3,.Lctr32_kmctr_prepare 1465 1466 #la $inp,0($inp) # inp 1467 sllg $len,$fp,4 # len 1468 #la $out,0($out) # out 1469 la $s2,16($sp) # iv 1470 .long 0xb92da042 # kmctr $out,$s2,$inp 1471 brc 1,.-4 # pay attention to "partial completion" 1472 1473 slgr $s1,$fp 1474 brc 1,.Lctr32_kmctr_loop # not zero, no borrow 1475 algr $fp,$s1 1476 lghi $s1,0 1477 brc 4+1,.Lctr32_kmctr_loop # not 
zero 1478 1479 l${g} $sp,0($sp) 1480 lm${g} %r6,$s3,6*$SIZE_T($sp) 1481 br $ra 1482.align 16 1483___ 1484$code.=<<___; 1485.Lctr32_km_loop: 1486 la $s2,16($sp) 1487 lgr $s3,$fp 1488.Lctr32_km_prepare: 1489 stg $iv0,0($s2) 1490 stg $ivp,8($s2) 1491 la $s2,16($s2) 1492 ahi $ivp,1 # 32-bit increment, preserves upper half 1493 brct $s3,.Lctr32_km_prepare 1494 1495 la $s0,16($sp) # inp 1496 sllg $s1,$fp,4 # len 1497 la $s2,16($sp) # out 1498 .long 0xb92e00a8 # km %r10,%r8 1499 brc 1,.-4 # pay attention to "partial completion" 1500 1501 la $s2,16($sp) 1502 lgr $s3,$fp 1503 slgr $s2,$inp 1504.Lctr32_km_xor: 1505 lg $s0,0($inp) 1506 lg $s1,8($inp) 1507 xg $s0,0($s2,$inp) 1508 xg $s1,8($s2,$inp) 1509 stg $s0,0($out,$inp) 1510 stg $s1,8($out,$inp) 1511 la $inp,16($inp) 1512 brct $s3,.Lctr32_km_xor 1513 1514 slgr $len,$fp 1515 brc 1,.Lctr32_km_loop # not zero, no borrow 1516 algr $fp,$len 1517 lghi $len,0 1518 brc 4+1,.Lctr32_km_loop # not zero 1519 1520 l${g} $s0,0($sp) 1521 l${g} $s1,$SIZE_T($sp) 1522 la $s2,16($sp) 1523.Lctr32_km_zap: 1524 stg $s0,0($s2) 1525 stg $s0,8($s2) 1526 la $s2,16($s2) 1527 brct $s1,.Lctr32_km_zap 1528 1529 la $sp,0($s0) 1530 lm${g} %r6,$s3,6*$SIZE_T($sp) 1531 br $ra 1532.align 16 1533.Lctr32_software: 1534___ 1535$code.=<<___; 1536 stm${g} $key,$ra,5*$SIZE_T($sp) 1537 sl${g}r $inp,$out 1538 larl $tbl,AES_Te 1539 llgf $t1,12($ivp) 1540 1541.Lctr32_loop: 1542 stm${g} $inp,$out,2*$SIZE_T($sp) 1543 llgf $s0,0($ivp) 1544 llgf $s1,4($ivp) 1545 llgf $s2,8($ivp) 1546 lgr $s3,$t1 1547 st $t1,16*$SIZE_T($sp) 1548 lgr %r4,$key 1549 1550 bras $ra,_s390x_AES_encrypt 1551 1552 lm${g} $inp,$ivp,2*$SIZE_T($sp) 1553 llgf $t1,16*$SIZE_T($sp) 1554 x $s0,0($inp,$out) 1555 x $s1,4($inp,$out) 1556 x $s2,8($inp,$out) 1557 x $s3,12($inp,$out) 1558 stm $s0,$s3,0($out) 1559 1560 la $out,16($out) 1561 ahi $t1,1 # 32-bit increment 1562 brct $len,.Lctr32_loop 1563 1564 lm${g} %r6,$ra,6*$SIZE_T($sp) 1565 br $ra 1566.size AES_ctr32_encrypt,.-AES_ctr32_encrypt 1567___ 1568} 1569 
########################################################################
# void AES_xts_encrypt(const char *inp,char *out,size_t len,
#	const AES_KEY *key1, const AES_KEY *key2,
#	const unsigned char iv[16]);
#
{
# Register names shared by _s390x_xts_km, AES_xts_encrypt and
# AES_xts_decrypt.  Both public routines start by exchanging %r3 and %r4,
# hence the out/len mapping.  $tweak is not a register but the byte offset
# from the stack pointer at which the 16-byte tweak is kept; it is a Perl
# number, so expressions such as $tweak+8 or $tweak-16 interpolate into
# plain displacement arithmetic for the assembler.
my $inp="%r2";
my $out="%r4";	# len and out are swapped
my $len="%r3";
my $key1="%r5";	# $i1
my $key2="%r6";	# $i2
my $fp="%r7";	# $i3
my $tweak=16*$SIZE_T+16;	# or $stdframe-16, bottom of the frame...

# _s390x_xts_km: internal helper that processes all whole blocks for both
# XTS directions.  As used by the callers below it expects the km function
# code in %r0, the key pointer in $key1, the byte count (multiple of 16) in
# $len, $out holding out-inp, and the initial tweak at $tweak($sp).  On
# return $len holds the saved original length modulo 16 with the condition
# code reflecting whether it is zero, and $s0/$s1 hold the tweak in the
# byte-reversed (lrvg) form.
$code.=<<___;
.type	_s390x_xts_km,\@function
.align	16
_s390x_xts_km:
___
# First variant: tests the km capability vector in OPENSSL_s390xcap_P for
# bit 32+function code.  When the bit is set, the function code is switched
# to its xts form (bit 32 ORed in), a parameter block holding the key is
# placed directly below the tweak, and a single km processes the data.  The
# key copy is overwritten afterwards.  When the bit is clear control falls
# to .Lxts_km_vanilla.
$code.=<<___ if(1);
	llgfr	$s0,%r0			# put aside the function code
	lghi	$s1,0x7f
	nr	$s1,%r0
	larl	%r1,OPENSSL_s390xcap_P
	llihh	%r0,0x8000
	srlg	%r0,%r0,32($s1)		# check for 32+function code
	ng	%r0,32(%r1)		# check km capability vector
	lgr	%r0,$s0			# restore the function code
	la	%r1,0($key1)		# restore $key1
	jz	.Lxts_km_vanilla

	lmg	$i2,$i3,$tweak($sp)	# put aside the tweak value
	algr	$out,$inp

	oill	%r0,32			# switch to xts function code
	aghi	$s1,-18			#
	sllg	$s1,$s1,3		# (function code - 18)*8, 0 or 16
	la	%r1,$tweak-16($sp)
	slgr	%r1,$s1			# parameter block position
	lmg	$s0,$s3,0($key1)	# load 256 bits of key material,
	stmg	$s0,$s3,0(%r1)		# and copy it to parameter block.
					# yes, it contains junk and overlaps
					# with the tweak in 128-bit case.
					# it's done to avoid conditional
					# branch.
	stmg	$i2,$i3,$tweak($sp)	# "re-seat" the tweak value

	.long	0xb92e0042		# km %r4,%r2
	brc	1,.-4			# pay attention to "partial completion"

	lrvg	$s0,$tweak+0($sp)	# load the last tweak
	lrvg	$s1,$tweak+8($sp)
	stmg	%r0,%r3,$tweak-32($sp)	# wipe copy of the key

	nill	%r0,0xffdf		# switch back to original function code
	la	%r1,0($key1)		# restore pointer to $key1
	slgr	$out,$inp

	llgc	$len,2*$SIZE_T-1($sp)
	nill	$len,0x0f		# $len%=16
	br	$ra

.align	16
.Lxts_km_vanilla:
___
# Fallback variant built on plain km.  A buffer is carved out of the stack
# and used as a vector of ascending tweak values.  Each pass
#   1. multiplies the tweak by x (shift left by one with 0x87 feedback),
#      stores it in the vector and writes input^tweak to the output
#      (.Lxts_km_prepare / .Lxts_km_start);
#   2. runs km in place over that stretch of output;
#   3. XORs the tweak vector into the output again (.Lxts_km_xor).
# Afterwards the tweak vector is overwritten with the back-chain value and
# the stack pointer is restored.  When a residue remains, one more tweak is
# generated before returning with a non-zero condition code.
$code.=<<___;
	# prepare and allocate stack frame at the top of 4K page
	# with 1K reserved for eventual signal handling
	lghi	$s0,-1024-256-16# guarantee at least 256-bytes buffer
	lghi	$s1,-4096
	algr	$s0,$sp
	lgr	$fp,$sp
	ngr	$s0,$s1		# align at page boundary
	slgr	$fp,$s0		# total buffer size
	lgr	$s2,$sp
	lghi	$s1,1024+16	# sl[g]fi is extended-immediate facility
	slgr	$fp,$s1		# deduct reservation to get usable buffer size
	# buffer size is at lest 256 and at most 3072+256-16

	la	$sp,1024($s0)	# alloca
	nill	$fp,0xfff0	# round to 16*n
	st${g}	$s2,0($sp)	# back-chain
	nill	$len,0xfff0	# redundant
	st${g}	$fp,$SIZE_T($sp)

	slgr	$len,$fp
	brc	1,.Lxts_km_go	# not zero, no borrow
	algr	$fp,$len	# input is shorter than allocated buffer
	lghi	$len,0
	st${g}	$fp,$SIZE_T($sp)

.Lxts_km_go:
	lrvg	$s0,$tweak+0($s2)	# load the tweak value in little-endian
	lrvg	$s1,$tweak+8($s2)

	la	$s2,16($sp)		# vector of ascending tweak values
	slgr	$s2,$inp
	srlg	$s3,$fp,4
	j	.Lxts_km_start

.Lxts_km_loop:
	la	$s2,16($sp)
	slgr	$s2,$inp
	srlg	$s3,$fp,4
.Lxts_km_prepare:
	lghi	$i1,0x87
	srag	$i2,$s1,63		# broadcast upper bit
	ngr	$i1,$i2			# rem
	algr	$s0,$s0
	alcgr	$s1,$s1
	xgr	$s0,$i1
.Lxts_km_start:
	lrvgr	$i1,$s0			# flip byte order
	lrvgr	$i2,$s1
	stg	$i1,0($s2,$inp)
	stg	$i2,8($s2,$inp)
	xg	$i1,0($inp)
	xg	$i2,8($inp)
	stg	$i1,0($out,$inp)
	stg	$i2,8($out,$inp)
	la	$inp,16($inp)
	brct	$s3,.Lxts_km_prepare

	slgr	$inp,$fp		# rewind $inp
	la	$s2,0($out,$inp)
	lgr	$s3,$fp
	.long	0xb92e00aa		# km $s2,$s2
	brc	1,.-4			# pay attention to "partial completion"

	la	$s2,16($sp)
	slgr	$s2,$inp
	srlg	$s3,$fp,4
.Lxts_km_xor:
	lg	$i1,0($out,$inp)
	lg	$i2,8($out,$inp)
	xg	$i1,0($s2,$inp)
	xg	$i2,8($s2,$inp)
	stg	$i1,0($out,$inp)
	stg	$i2,8($out,$inp)
	la	$inp,16($inp)
	brct	$s3,.Lxts_km_xor

	slgr	$len,$fp
	brc	1,.Lxts_km_loop		# not zero, no borrow
	algr	$fp,$len
	lghi	$len,0
	brc	4+1,.Lxts_km_loop	# not zero

	l${g}	$i1,0($sp)		# back-chain
	llgf	$fp,`2*$SIZE_T-4`($sp)	# bytes used
	la	$i2,16($sp)
	srlg	$fp,$fp,4
.Lxts_km_zap:
	stg	$i1,0($i2)
	stg	$i1,8($i2)
	la	$i2,16($i2)
	brct	$fp,.Lxts_km_zap

	la	$sp,0($i1)
	llgc	$len,2*$SIZE_T-1($i1)
	nill	$len,0x0f		# $len%=16
	bzr	$ra

	# generate one more tweak...
	lghi	$i1,0x87
	srag	$i2,$s1,63		# broadcast upper bit
	ngr	$i1,$i2			# rem
	algr	$s0,$s0
	alcgr	$s1,$s1
	xgr	$s0,$i1

	ltr	$len,$len		# clear zero flag
	br	$ra
.size	_s390x_xts_km,.-_s390x_xts_km

.globl	AES_xts_encrypt
.type	AES_xts_encrypt,\@function
.align	16
AES_xts_encrypt:
	xgr	%r3,%r4			# flip %r3 and %r4, $out and $len
	xgr	%r4,%r3
	xgr	%r3,%r4
___
# 31-bit builds only: clear the upper half of the 64-bit length register.
$code.=<<___ if ($SIZE_T==4);
	llgfr	$len,$len
___
# Common entry: a copy of the byte length is parked at 1*$SIZE_T($sp); its
# low byte is re-read later to obtain len%16.  $len itself becomes the block
# count, and the routine returns at once when that count is zero.
$code.=<<___;
	st${g}	$len,1*$SIZE_T($sp)	# save copy of $len
	srag	$len,$len,4		# formally wrong, because it expands
					# sign byte, but who can afford asking
					# to process more than 2^63-1 bytes?
					# I use it, because it sets condition
					# code...
	bcr	8,$ra			# abort if zero (i.e. less than 16)
___
# AES_xts_encrypt, hardware path (emitted unless $softonly), taken when the
# word at 240($key2) is at least 16.  The iv is copied to $tweak($sp) and
# encrypted in place with km under key2, _s390x_xts_km handles the whole
# blocks, and a residue is finished with ciphertext stealing: bytes are
# swapped between the last full output block and the tail, then that block
# is encrypted once more under the next tweak.  Both halves of the on-stack
# tweak are overwritten before returning.
$code.=<<___ if (!$softonly);
	llgf	%r0,240($key2)
	lhi	%r1,16
	clr	%r0,%r1
	jl	.Lxts_enc_software

	st${g}	$ra,5*$SIZE_T($sp)
	stm${g}	%r6,$s3,6*$SIZE_T($sp)

	sllg	$len,$len,4		# $len&=~15
	slgr	$out,$inp

	# generate the tweak value
	l${g}	$s3,$stdframe($sp)	# pointer to iv
	la	$s2,$tweak($sp)
	lmg	$s0,$s1,0($s3)
	lghi	$s3,16
	stmg	$s0,$s1,0($s2)
	la	%r1,0($key2)		# $key2 is not needed anymore
	.long	0xb92e00aa		# km $s2,$s2, generate the tweak
	brc	1,.-4			# can this happen?

	l	%r0,240($key1)
	la	%r1,0($key1)		# $key1 is not needed anymore
	bras	$ra,_s390x_xts_km
	jz	.Lxts_enc_km_done

	aghi	$inp,-16		# take one step back
	la	$i3,0($out,$inp)	# put aside real $out
.Lxts_enc_km_steal:
	llgc	$i1,16($inp)
	llgc	$i2,0($out,$inp)
	stc	$i1,0($out,$inp)
	stc	$i2,16($out,$inp)
	la	$inp,1($inp)
	brct	$len,.Lxts_enc_km_steal

	la	$s2,0($i3)
	lghi	$s3,16
	lrvgr	$i1,$s0			# flip byte order
	lrvgr	$i2,$s1
	xg	$i1,0($s2)
	xg	$i2,8($s2)
	stg	$i1,0($s2)
	stg	$i2,8($s2)
	.long	0xb92e00aa		# km $s2,$s2
	brc	1,.-4			# can this happen?
	lrvgr	$i1,$s0			# flip byte order
	lrvgr	$i2,$s1
	xg	$i1,0($i3)
	xg	$i2,8($i3)
	stg	$i1,0($i3)
	stg	$i2,8($i3)

.Lxts_enc_km_done:
	stg	$sp,$tweak+0($sp)	# wipe tweak
	stg	$sp,$tweak+8($sp)
	l${g}	$ra,5*$SIZE_T($sp)
	lm${g}	%r6,$s3,6*$SIZE_T($sp)
	br	$ra
.align	16
.Lxts_enc_software:
___
# AES_xts_encrypt, software path.  The tweak is produced by running
# _s390x_AES_encrypt over the iv with key2 and is kept at $tweak($sp); every
# further block multiplies it by x (0x87 feedback) in little-endian form.
# Each block is XORed with the tweak, encrypted with key1 and XORed again.
# A residue is finished with ciphertext stealing under one more tweak.
# The exit sequence must overwrite BOTH 8-byte halves of the tweak, i.e. the
# stores go to $tweak+0 and $tweak+8.  The variable name matters: an
# undeclared Perl variable inside the heredoc interpolates as an empty
# string when strict vars is off, which silently retargets the store to a
# small fixed displacement and leaves half of the tweak on the stack.
$code.=<<___;
	stm${g}	%r6,$ra,6*$SIZE_T($sp)

	slgr	$out,$inp

	l${g}	$s3,$stdframe($sp)	# ivp
	llgf	$s0,0($s3)		# load iv
	llgf	$s1,4($s3)
	llgf	$s2,8($s3)
	llgf	$s3,12($s3)
	stm${g}	%r2,%r5,2*$SIZE_T($sp)
	la	$key,0($key2)
	larl	$tbl,AES_Te
	bras	$ra,_s390x_AES_encrypt	# generate the tweak
	lm${g}	%r2,%r5,2*$SIZE_T($sp)
	stm	$s0,$s3,$tweak($sp)	# save the tweak
	j	.Lxts_enc_enter

.align	16
.Lxts_enc_loop:
	lrvg	$s1,$tweak+0($sp)	# load the tweak in little-endian
	lrvg	$s3,$tweak+8($sp)
	lghi	%r1,0x87
	srag	%r0,$s3,63		# broadcast upper bit
	ngr	%r1,%r0			# rem
	algr	$s1,$s1
	alcgr	$s3,$s3
	xgr	$s1,%r1
	lrvgr	$s1,$s1			# flip byte order
	lrvgr	$s3,$s3
	srlg	$s0,$s1,32		# smash the tweak to 4x32-bits
	stg	$s1,$tweak+0($sp)	# save the tweak
	llgfr	$s1,$s1
	srlg	$s2,$s3,32
	stg	$s3,$tweak+8($sp)
	llgfr	$s3,$s3
	la	$inp,16($inp)		# $inp+=16
.Lxts_enc_enter:
	x	$s0,0($inp)		# ^=*($inp)
	x	$s1,4($inp)
	x	$s2,8($inp)
	x	$s3,12($inp)
	stm${g}	%r2,%r3,2*$SIZE_T($sp)	# only two registers are changing
	la	$key,0($key1)
	bras	$ra,_s390x_AES_encrypt
	lm${g}	%r2,%r5,2*$SIZE_T($sp)
	x	$s0,$tweak+0($sp)	# ^=tweak
	x	$s1,$tweak+4($sp)
	x	$s2,$tweak+8($sp)
	x	$s3,$tweak+12($sp)
	st	$s0,0($out,$inp)
	st	$s1,4($out,$inp)
	st	$s2,8($out,$inp)
	st	$s3,12($out,$inp)
	brct${g}	$len,.Lxts_enc_loop

	llgc	$len,`2*$SIZE_T-1`($sp)
	nill	$len,0x0f		# $len%16
	jz	.Lxts_enc_done

	la	$i3,0($inp,$out)	# put aside real $out
.Lxts_enc_steal:
	llgc	%r0,16($inp)
	llgc	%r1,0($out,$inp)
	stc	%r0,0($out,$inp)
	stc	%r1,16($out,$inp)
	la	$inp,1($inp)
	brct	$len,.Lxts_enc_steal
	la	$out,0($i3)		# restore real $out

	# generate last tweak...
	lrvg	$s1,$tweak+0($sp)	# load the tweak in little-endian
	lrvg	$s3,$tweak+8($sp)
	lghi	%r1,0x87
	srag	%r0,$s3,63		# broadcast upper bit
	ngr	%r1,%r0			# rem
	algr	$s1,$s1
	alcgr	$s3,$s3
	xgr	$s1,%r1
	lrvgr	$s1,$s1			# flip byte order
	lrvgr	$s3,$s3
	srlg	$s0,$s1,32		# smash the tweak to 4x32-bits
	stg	$s1,$tweak+0($sp)	# save the tweak
	llgfr	$s1,$s1
	srlg	$s2,$s3,32
	stg	$s3,$tweak+8($sp)
	llgfr	$s3,$s3

	x	$s0,0($out)		# ^=*(inp)|stolen cipther-text
	x	$s1,4($out)
	x	$s2,8($out)
	x	$s3,12($out)
	st${g}	$out,4*$SIZE_T($sp)
	la	$key,0($key1)
	bras	$ra,_s390x_AES_encrypt
	l${g}	$out,4*$SIZE_T($sp)
	x	$s0,`$tweak+0`($sp)	# ^=tweak
	x	$s1,`$tweak+4`($sp)
	x	$s2,`$tweak+8`($sp)
	x	$s3,`$tweak+12`($sp)
	st	$s0,0($out)
	st	$s1,4($out)
	st	$s2,8($out)
	st	$s3,12($out)

.Lxts_enc_done:
	stg	$sp,$tweak+0($sp)	# wipe tweak
	stg	$sp,$tweak+8($sp)
	lm${g}	%r6,$ra,6*$SIZE_T($sp)
	br	$ra
.size	AES_xts_encrypt,.-AES_xts_encrypt
___
# void AES_xts_decrypt(const char *inp,char *out,size_t len,
#	const AES_KEY *key1, const AES_KEY *key2,
#	const unsigned char iv[16]);
#
$code.=<<___;
.globl	AES_xts_decrypt
.type	AES_xts_decrypt,\@function
.align	16
AES_xts_decrypt:
	xgr	%r3,%r4			# flip %r3 and %r4, $out and $len
	xgr	%r4,%r3
	xgr	%r3,%r4
___
# 31-bit builds only: clear the upper half of the 64-bit length register.
$code.=<<___ if ($SIZE_T==4);
	llgfr	$len,$len
___
# Common entry: a copy of the byte length is parked at 1*$SIZE_T($sp) and the
# routine returns at once for fewer than 16 bytes.  $len is left reduced by
# 16 when a residue exists (the last full block is then handled together
# with the tail), and restored to the full value otherwise.
$code.=<<___;
	st${g}	$len,1*$SIZE_T($sp)	# save copy of $len
	aghi	$len,-16
	bcr	4,$ra			# abort if less than zero. formally
					# wrong, because $len is unsigned,
					# but who can afford asking to
					# process more than 2^63-1 bytes?
	tmll	$len,0x0f
	jnz	.Lxts_dec_proceed
	aghi	$len,16
.Lxts_dec_proceed:
___
# AES_xts_decrypt, hardware path (emitted unless $softonly), taken when the
# word at 240($key2) is at least 16.  The tweak is generated as in the
# encrypt routine.  _s390x_xts_km handles the leading whole blocks, or is
# skipped via .Lxts_dec_km_short when none remain.  With a residue, the last
# full block is decrypted under the second (next) tweak, bytes are swapped
# with the tail, and the resulting block is decrypted under the first tweak,
# whose byte-reversed copy is held in $s2/$s3.
$code.=<<___ if (!$softonly);
	llgf	%r0,240($key2)
	lhi	%r1,16
	clr	%r0,%r1
	jl	.Lxts_dec_software

	st${g}	$ra,5*$SIZE_T($sp)
	stm${g}	%r6,$s3,6*$SIZE_T($sp)

	nill	$len,0xfff0		# $len&=~15
	slgr	$out,$inp

	# generate the tweak value
	l${g}	$s3,$stdframe($sp)	# pointer to iv
	la	$s2,$tweak($sp)
	lmg	$s0,$s1,0($s3)
	lghi	$s3,16
	stmg	$s0,$s1,0($s2)
	la	%r1,0($key2)		# $key2 is not needed past this point
	.long	0xb92e00aa		# km $s2,$s2, generate the tweak
	brc	1,.-4			# can this happen?

	l	%r0,240($key1)
	la	%r1,0($key1)		# $key1 is not needed anymore

	ltgr	$len,$len
	jz	.Lxts_dec_km_short
	bras	$ra,_s390x_xts_km
	jz	.Lxts_dec_km_done

	lrvgr	$s2,$s0			# make copy in reverse byte order
	lrvgr	$s3,$s1
	j	.Lxts_dec_km_2ndtweak

.Lxts_dec_km_short:
	llgc	$len,`2*$SIZE_T-1`($sp)
	nill	$len,0x0f		# $len%=16
	lrvg	$s0,$tweak+0($sp)	# load the tweak
	lrvg	$s1,$tweak+8($sp)
	lrvgr	$s2,$s0			# make copy in reverse byte order
	lrvgr	$s3,$s1

.Lxts_dec_km_2ndtweak:
	lghi	$i1,0x87
	srag	$i2,$s1,63		# broadcast upper bit
	ngr	$i1,$i2			# rem
	algr	$s0,$s0
	alcgr	$s1,$s1
	xgr	$s0,$i1
	lrvgr	$i1,$s0			# flip byte order
	lrvgr	$i2,$s1

	xg	$i1,0($inp)
	xg	$i2,8($inp)
	stg	$i1,0($out,$inp)
	stg	$i2,8($out,$inp)
	la	$i2,0($out,$inp)
	lghi	$i3,16
	.long	0xb92e0066		# km $i2,$i2
	brc	1,.-4			# can this happen?
	lrvgr	$i1,$s0
	lrvgr	$i2,$s1
	xg	$i1,0($out,$inp)
	xg	$i2,8($out,$inp)
	stg	$i1,0($out,$inp)
	stg	$i2,8($out,$inp)

	la	$i3,0($out,$inp)	# put aside real $out
.Lxts_dec_km_steal:
	llgc	$i1,16($inp)
	llgc	$i2,0($out,$inp)
	stc	$i1,0($out,$inp)
	stc	$i2,16($out,$inp)
	la	$inp,1($inp)
	brct	$len,.Lxts_dec_km_steal

	lgr	$s0,$s2
	lgr	$s1,$s3
	xg	$s0,0($i3)
	xg	$s1,8($i3)
	stg	$s0,0($i3)
	stg	$s1,8($i3)
	la	$s0,0($i3)
	lghi	$s1,16
	.long	0xb92e0088		# km $s0,$s0
	brc	1,.-4			# can this happen?
	xg	$s2,0($i3)
	xg	$s3,8($i3)
	stg	$s2,0($i3)
	stg	$s3,8($i3)
.Lxts_dec_km_done:
	stg	$sp,$tweak+0($sp)	# wipe tweak
	stg	$sp,$tweak+8($sp)
	l${g}	$ra,5*$SIZE_T($sp)
	lm${g}	%r6,$s3,6*$SIZE_T($sp)
	br	$ra
.align	16
.Lxts_dec_software:
___
# AES_xts_decrypt, software path.  The tweak is still generated with
# _s390x_AES_encrypt (table AES_Te) under key2; the data blocks go through
# _s390x_AES_decrypt (table AES_Td) under key1.  With a residue two tweaks
# are needed: the first stays at $tweak($sp), the second is kept in the 16
# bytes directly below it at $tweak-16($sp).  The last full block is
# decrypted under the second tweak, bytes are swapped with the tail, and the
# resulting block is decrypted under the first tweak.
# On exit the second tweak (when used) and BOTH 8-byte halves of the first
# tweak are overwritten, so the final stores go to $tweak+0 and $tweak+8.
# As in the encrypt routine the variable name matters: an undeclared Perl
# variable inside the heredoc interpolates as an empty string when strict
# vars is off and would silently retarget the store.
$code.=<<___;
	stm${g}	%r6,$ra,6*$SIZE_T($sp)

	srlg	$len,$len,4
	slgr	$out,$inp

	l${g}	$s3,$stdframe($sp)	# ivp
	llgf	$s0,0($s3)		# load iv
	llgf	$s1,4($s3)
	llgf	$s2,8($s3)
	llgf	$s3,12($s3)
	stm${g}	%r2,%r5,2*$SIZE_T($sp)
	la	$key,0($key2)
	larl	$tbl,AES_Te
	bras	$ra,_s390x_AES_encrypt	# generate the tweak
	lm${g}	%r2,%r5,2*$SIZE_T($sp)
	larl	$tbl,AES_Td
	lt${g}r	$len,$len
	stm	$s0,$s3,$tweak($sp)	# save the tweak
	jz	.Lxts_dec_short
	j	.Lxts_dec_enter

.align	16
.Lxts_dec_loop:
	lrvg	$s1,$tweak+0($sp)	# load the tweak in little-endian
	lrvg	$s3,$tweak+8($sp)
	lghi	%r1,0x87
	srag	%r0,$s3,63		# broadcast upper bit
	ngr	%r1,%r0			# rem
	algr	$s1,$s1
	alcgr	$s3,$s3
	xgr	$s1,%r1
	lrvgr	$s1,$s1			# flip byte order
	lrvgr	$s3,$s3
	srlg	$s0,$s1,32		# smash the tweak to 4x32-bits
	stg	$s1,$tweak+0($sp)	# save the tweak
	llgfr	$s1,$s1
	srlg	$s2,$s3,32
	stg	$s3,$tweak+8($sp)
	llgfr	$s3,$s3
.Lxts_dec_enter:
	x	$s0,0($inp)		# tweak^=*(inp)
	x	$s1,4($inp)
	x	$s2,8($inp)
	x	$s3,12($inp)
	stm${g}	%r2,%r3,2*$SIZE_T($sp)	# only two registers are changing
	la	$key,0($key1)
	bras	$ra,_s390x_AES_decrypt
	lm${g}	%r2,%r5,2*$SIZE_T($sp)
	x	$s0,$tweak+0($sp)	# ^=tweak
	x	$s1,$tweak+4($sp)
	x	$s2,$tweak+8($sp)
	x	$s3,$tweak+12($sp)
	st	$s0,0($out,$inp)
	st	$s1,4($out,$inp)
	st	$s2,8($out,$inp)
	st	$s3,12($out,$inp)
	la	$inp,16($inp)
	brct${g}	$len,.Lxts_dec_loop

	llgc	$len,`2*$SIZE_T-1`($sp)
	nill	$len,0x0f		# $len%16
	jz	.Lxts_dec_done

	# generate pair of tweaks...
	lrvg	$s1,$tweak+0($sp)	# load the tweak in little-endian
	lrvg	$s3,$tweak+8($sp)
	lghi	%r1,0x87
	srag	%r0,$s3,63		# broadcast upper bit
	ngr	%r1,%r0			# rem
	algr	$s1,$s1
	alcgr	$s3,$s3
	xgr	$s1,%r1
	lrvgr	$i2,$s1			# flip byte order
	lrvgr	$i3,$s3
	stmg	$i2,$i3,$tweak($sp)	# save the 1st tweak
	j	.Lxts_dec_2ndtweak

.align	16
.Lxts_dec_short:
	llgc	$len,`2*$SIZE_T-1`($sp)
	nill	$len,0x0f		# $len%16
	lrvg	$s1,$tweak+0($sp)	# load the tweak in little-endian
	lrvg	$s3,$tweak+8($sp)
.Lxts_dec_2ndtweak:
	lghi	%r1,0x87
	srag	%r0,$s3,63		# broadcast upper bit
	ngr	%r1,%r0			# rem
	algr	$s1,$s1
	alcgr	$s3,$s3
	xgr	$s1,%r1
	lrvgr	$s1,$s1			# flip byte order
	lrvgr	$s3,$s3
	srlg	$s0,$s1,32		# smash the tweak to 4x32-bits
	stg	$s1,$tweak-16+0($sp)	# save the 2nd tweak
	llgfr	$s1,$s1
	srlg	$s2,$s3,32
	stg	$s3,$tweak-16+8($sp)
	llgfr	$s3,$s3

	x	$s0,0($inp)		# tweak_the_2nd^=*(inp)
	x	$s1,4($inp)
	x	$s2,8($inp)
	x	$s3,12($inp)
	stm${g}	%r2,%r3,2*$SIZE_T($sp)
	la	$key,0($key1)
	bras	$ra,_s390x_AES_decrypt
	lm${g}	%r2,%r5,2*$SIZE_T($sp)
	x	$s0,$tweak-16+0($sp)	# ^=tweak_the_2nd
	x	$s1,$tweak-16+4($sp)
	x	$s2,$tweak-16+8($sp)
	x	$s3,$tweak-16+12($sp)
	st	$s0,0($out,$inp)
	st	$s1,4($out,$inp)
	st	$s2,8($out,$inp)
	st	$s3,12($out,$inp)

	la	$i3,0($out,$inp)	# put aside real $out
.Lxts_dec_steal:
	llgc	%r0,16($inp)
	llgc	%r1,0($out,$inp)
	stc	%r0,0($out,$inp)
	stc	%r1,16($out,$inp)
	la	$inp,1($inp)
	brct	$len,.Lxts_dec_steal
	la	$out,0($i3)		# restore real $out

	lm	$s0,$s3,$tweak($sp)	# load the 1st tweak
	x	$s0,0($out)		# tweak^=*(inp)|stolen cipher-text
	x	$s1,4($out)
	x	$s2,8($out)
	x	$s3,12($out)
	st${g}	$out,4*$SIZE_T($sp)
	la	$key,0($key1)
	bras	$ra,_s390x_AES_decrypt
	l${g}	$out,4*$SIZE_T($sp)
	x	$s0,$tweak+0($sp)	# ^=tweak
	x	$s1,$tweak+4($sp)
	x	$s2,$tweak+8($sp)
	x	$s3,$tweak+12($sp)
	st	$s0,0($out)
	st	$s1,4($out)
	st	$s2,8($out)
	st	$s3,12($out)
	stg	$sp,$tweak-16+0($sp)	# wipe 2nd tweak
	stg	$sp,$tweak-16+8($sp)
.Lxts_dec_done:
	stg	$sp,$tweak+0($sp)	# wipe tweak
	stg	$sp,$tweak+8($sp)
	lm${g}	%r6,$ra,6*$SIZE_T($sp)
	br	$ra
.size	AES_xts_decrypt,.-AES_xts_decrypt
___
}
# File trailer: identification string and the common symbol that the
# capability checks above read through larl.
$code.=<<___;
.string	"AES for s390x, CRYPTOGAMS by <appro\@openssl.org>"
.comm	OPENSSL_s390xcap_P,80,8
___

# Evaluate every back-quoted expression in the accumulated text as Perl (this
# is how constructs such as 2*$SIZE_T-1 become plain numbers), then emit the
# assembly on standard output.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT;	# force flush