/*
* Copyright (c) 2016, Intel Corporation.
*
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*
*/

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "runtime/stubRoutines.hpp"
#include "macroAssembler_x86.hpp"

// Emits the SHA-1 compression function using the x86 SHA extensions
// (sha1rnds4 / sha1nexte / sha1msg1 / sha1msg2).
//
// Register contract (established by the caller / stub generator):
//   abcd       - holds working state words a,b,c,d (one dword each)
//   e0, e1     - alternate holders for state word e between round groups
//   msg0..msg3 - the four 16-byte message chunks of the current 64-byte block
//   shuf_mask  - scratch; reloaded below with the masks needed for setup
//   buf        - pointer to the input data
//   state      - pointer to the 20-byte SHA-1 state (a,b,c,d at [0], e at [16])
//   ofs/limit  - only meaningful when multi_block is true; ofs is advanced by
//                64 per block and compared against limit, and the final ofs is
//                returned in rax (see the Java signature below)
//   rsp        - a register pointing at 32 bytes of scratch space used to save
//                the state across the 80 rounds (stores at [0] and [16])
//
// ofs and limit are used for multi-block byte array.
// int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
void MacroAssembler::fast_sha1(XMMRegister abcd, XMMRegister e0, XMMRegister e1, XMMRegister msg0,
  XMMRegister msg1, XMMRegister msg2, XMMRegister msg3, XMMRegister shuf_mask,
  Register buf, Register state, Register ofs, Register limit, Register rsp, bool multi_block) {

  Label start, done_hash, loop0;

  address upper_word_mask = StubRoutines::x86::upper_word_mask_addr();
  address shuffle_byte_flip_mask = StubRoutines::x86::shuffle_byte_flip_mask_addr();

  bind(start);
  // Load a,b,c,d and insert e into the top dword of e0, masking the rest off.
  movdqu(abcd, Address(state, 0));
  pinsrd(e0, Address(state, 16), 3);
  movdqu(shuf_mask, ExternalAddress(upper_word_mask)); // 0xFFFFFFFF000000000000000000000000
  pand(e0, shuf_mask);
  // The SHA instructions want the state words in reversed dword order.
  pshufd(abcd, abcd, 0x1B);
  // From here on shuf_mask holds the big-endian byte-flip mask applied to
  // every 16-byte message load.
  movdqu(shuf_mask, ExternalAddress(shuffle_byte_flip_mask)); //0x000102030405060708090a0b0c0d0e0f

  bind(loop0);
  // Save hash values for addition after rounds
  movdqu(Address(rsp, 0), e0);
  movdqu(Address(rsp, 16), abcd);


  // Rounds 0 - 3
  movdqu(msg0, Address(buf, 0));
  pshufb(msg0, shuf_mask);
  paddd(e0, msg0);
  movdqa(e1, abcd);
  sha1rnds4(abcd, e0, 0);

  // Rounds 4 - 7
  movdqu(msg1, Address(buf, 16));
  pshufb(msg1, shuf_mask);
  sha1nexte(e1, msg1);
  movdqa(e0, abcd);
  sha1rnds4(abcd, e1, 0);
  sha1msg1(msg0, msg1);

  // Rounds 8 - 11
  movdqu(msg2, Address(buf, 32));
  pshufb(msg2, shuf_mask);
  sha1nexte(e0, msg2);
  movdqa(e1, abcd);
  sha1rnds4(abcd, e0, 0);
  sha1msg1(msg1, msg2);
  pxor(msg0, msg2);

  // Rounds 12 - 15
  movdqu(msg3, Address(buf, 48));
  pshufb(msg3, shuf_mask);
  sha1nexte(e1, msg3);
  movdqa(e0, abcd);
  sha1msg2(msg0, msg3);
  sha1rnds4(abcd, e1, 0);
  sha1msg1(msg2, msg3);
  pxor(msg1, msg3);

  // Rounds 16 - 19
  sha1nexte(e0, msg0);
  movdqa(e1, abcd);
  sha1msg2(msg1, msg0);
  sha1rnds4(abcd, e0, 0);
  sha1msg1(msg3, msg0);
  pxor(msg2, msg0);

  // Rounds 20 - 23
  sha1nexte(e1, msg1);
  movdqa(e0, abcd);
  sha1msg2(msg2, msg1);
  sha1rnds4(abcd, e1, 1);
  sha1msg1(msg0, msg1);
  pxor(msg3, msg1);

  // Rounds 24 - 27
  sha1nexte(e0, msg2);
  movdqa(e1, abcd);
  sha1msg2(msg3, msg2);
  sha1rnds4(abcd, e0, 1);
  sha1msg1(msg1, msg2);
  pxor(msg0, msg2);

  // Rounds 28 - 31
  sha1nexte(e1, msg3);
  movdqa(e0, abcd);
  sha1msg2(msg0, msg3);
  sha1rnds4(abcd, e1, 1);
  sha1msg1(msg2, msg3);
  pxor(msg1, msg3);

  // Rounds 32 - 35
  sha1nexte(e0, msg0);
  movdqa(e1, abcd);
  sha1msg2(msg1, msg0);
  sha1rnds4(abcd, e0, 1);
  sha1msg1(msg3, msg0);
  pxor(msg2, msg0);

  // Rounds 36 - 39
  sha1nexte(e1, msg1);
  movdqa(e0, abcd);
  sha1msg2(msg2, msg1);
  sha1rnds4(abcd, e1, 1);
  sha1msg1(msg0, msg1);
  pxor(msg3, msg1);

  // Rounds 40 - 43
  sha1nexte(e0, msg2);
  movdqa(e1, abcd);
  sha1msg2(msg3, msg2);
  sha1rnds4(abcd, e0, 2);
  sha1msg1(msg1, msg2);
  pxor(msg0, msg2);

  // Rounds 44 - 47
  sha1nexte(e1, msg3);
  movdqa(e0, abcd);
  sha1msg2(msg0, msg3);
  sha1rnds4(abcd, e1, 2);
  sha1msg1(msg2, msg3);
  pxor(msg1, msg3);

  // Rounds 48 - 51
  sha1nexte(e0, msg0);
  movdqa(e1, abcd);
  sha1msg2(msg1, msg0);
  sha1rnds4(abcd, e0, 2);
  sha1msg1(msg3, msg0);
  pxor(msg2, msg0);

  // Rounds 52 - 55
  sha1nexte(e1, msg1);
  movdqa(e0, abcd);
  sha1msg2(msg2, msg1);
  sha1rnds4(abcd, e1, 2);
  sha1msg1(msg0, msg1);
  pxor(msg3, msg1);

  // Rounds 56 - 59
  sha1nexte(e0, msg2);
  movdqa(e1, abcd);
  sha1msg2(msg3, msg2);
  sha1rnds4(abcd, e0, 2);
  sha1msg1(msg1, msg2);
  pxor(msg0, msg2);

  // Rounds 60 - 63
  sha1nexte(e1, msg3);
  movdqa(e0, abcd);
  sha1msg2(msg0, msg3);
  sha1rnds4(abcd, e1, 3);
  sha1msg1(msg2, msg3);
  pxor(msg1, msg3);

  // Rounds 64 - 67
  sha1nexte(e0, msg0);
  movdqa(e1, abcd);
  sha1msg2(msg1, msg0);
  sha1rnds4(abcd, e0, 3);
  sha1msg1(msg3, msg0);
  pxor(msg2, msg0);

  // Rounds 68 - 71
  sha1nexte(e1, msg1);
  movdqa(e0, abcd);
  sha1msg2(msg2, msg1);
  sha1rnds4(abcd, e1, 3);
  pxor(msg3, msg1);

  // Rounds 72 - 75
  sha1nexte(e0, msg2);
  movdqa(e1, abcd);
  sha1msg2(msg3, msg2);
  sha1rnds4(abcd, e0, 3);

  // Rounds 76 - 79
  sha1nexte(e1, msg3);
  movdqa(e0, abcd);
  sha1rnds4(abcd, e1, 3);

  // add current hash values with previously saved
  movdqu(msg0, Address(rsp, 0));
  sha1nexte(e0, msg0);
  movdqu(msg0, Address(rsp, 16));
  paddd(abcd, msg0);

  if (multi_block) {
    // increment data pointer and loop if more to process
    addptr(buf, 64);
    addptr(ofs, 64);
    cmpptr(ofs, limit);
    jcc(Assembler::belowEqual, loop0);
    movptr(rax, ofs); //return ofs
  }
  // write hash values back in the correct order
  pshufd(abcd, abcd, 0x1b);
  movdqu(Address(state, 0), abcd);
  pextrd(Address(state, 16), e0, 3);

  bind(done_hash);

}

// xmm0 (msg) is used as an implicit argument to sha256rnds2
// and state0 and state1 can never use xmm0 register.
// ofs and limit are used for multi-block byte array.
// int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
//
// Emits the SHA-256 compression function using the x86 SHA extensions
// (sha256rnds2 / sha256msg1 / sha256msg2). On 64-bit, shuf_mask caches the
// byte-flip mask in a register; on 32-bit it is applied from memory each time.
// rax is clobbered (it holds the K256 table pointer, and the returned ofs when
// multi_block is true). rsp names a register pointing at 32 bytes of scratch
// space used to save the state across the 64 rounds.
#ifdef _LP64
void MacroAssembler::fast_sha256(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
  XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
  Register buf, Register state, Register ofs, Register limit, Register rsp,
  bool multi_block, XMMRegister shuf_mask) {
#else
void MacroAssembler::fast_sha256(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
  XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
  Register buf, Register state, Register ofs, Register limit, Register rsp,
  bool multi_block) {
#endif
  Label start, done_hash, loop0;

  address K256 = StubRoutines::x86::k256_addr();
  address pshuffle_byte_flip_mask = StubRoutines::x86::pshuffle_byte_flip_mask_addr();

  bind(start);
  movdqu(state0, Address(state, 0));
  movdqu(state1, Address(state, 16));

  // Rearrange the eight state dwords into the {CDGH, ABEF} layout expected
  // by sha256rnds2.
  pshufd(state0, state0, 0xB1);
  pshufd(state1, state1, 0x1B);
  movdqa(msgtmp4, state0);
  palignr(state0, state1, 8);
  pblendw(state1, msgtmp4, 0xF0);

#ifdef _LP64
  movdqu(shuf_mask, ExternalAddress(pshuffle_byte_flip_mask));
#endif
  lea(rax, ExternalAddress(K256));

  bind(loop0);
  // Save the input state so it can be added back after the 64 rounds.
  movdqu(Address(rsp, 0), state0);
  movdqu(Address(rsp, 16), state1);

  // Rounds 0-3
  movdqu(msg, Address(buf, 0));
#ifdef _LP64
  pshufb(msg, shuf_mask);
#else
  pshufb(msg, ExternalAddress(pshuffle_byte_flip_mask));
#endif
  movdqa(msgtmp0, msg);
  paddd(msg, Address(rax, 0));
  sha256rnds2(state1, state0);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);

  // Rounds 4-7
  movdqu(msg, Address(buf, 16));
#ifdef _LP64
  pshufb(msg, shuf_mask);
#else
  pshufb(msg, ExternalAddress(pshuffle_byte_flip_mask));
#endif
  movdqa(msgtmp1, msg);
  paddd(msg, Address(rax, 16));
  sha256rnds2(state1, state0);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);
  sha256msg1(msgtmp0, msgtmp1);

  // Rounds 8-11
  movdqu(msg, Address(buf, 32));
#ifdef _LP64
  pshufb(msg, shuf_mask);
#else
  pshufb(msg, ExternalAddress(pshuffle_byte_flip_mask));
#endif
  movdqa(msgtmp2, msg);
  paddd(msg, Address(rax, 32));
  sha256rnds2(state1, state0);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);
  sha256msg1(msgtmp1, msgtmp2);

  // Rounds 12-15
  movdqu(msg, Address(buf, 48));
#ifdef _LP64
  pshufb(msg, shuf_mask);
#else
  pshufb(msg, ExternalAddress(pshuffle_byte_flip_mask));
#endif
  movdqa(msgtmp3, msg);
  paddd(msg, Address(rax, 48));
  sha256rnds2(state1, state0);
  movdqa(msgtmp4, msgtmp3);
  palignr(msgtmp4, msgtmp2, 4);
  paddd(msgtmp0, msgtmp4);
  sha256msg2(msgtmp0, msgtmp3);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);
  sha256msg1(msgtmp2, msgtmp3);

  // Rounds 16-19
  movdqa(msg, msgtmp0);
  paddd(msg, Address(rax, 64));
  sha256rnds2(state1, state0);
  movdqa(msgtmp4, msgtmp0);
  palignr(msgtmp4, msgtmp3, 4);
  paddd(msgtmp1, msgtmp4);
  sha256msg2(msgtmp1, msgtmp0);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);
  sha256msg1(msgtmp3, msgtmp0);

  // Rounds 20-23
  movdqa(msg, msgtmp1);
  paddd(msg, Address(rax, 80));
  sha256rnds2(state1, state0);
  movdqa(msgtmp4, msgtmp1);
  palignr(msgtmp4, msgtmp0, 4);
  paddd(msgtmp2, msgtmp4);
  sha256msg2(msgtmp2, msgtmp1);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);
  sha256msg1(msgtmp0, msgtmp1);

  // Rounds 24-27
  movdqa(msg, msgtmp2);
  paddd(msg, Address(rax, 96));
  sha256rnds2(state1, state0);
  movdqa(msgtmp4, msgtmp2);
  palignr(msgtmp4, msgtmp1, 4);
  paddd(msgtmp3, msgtmp4);
  sha256msg2(msgtmp3, msgtmp2);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);
  sha256msg1(msgtmp1, msgtmp2);

  // Rounds 28-31
  movdqa(msg, msgtmp3);
  paddd(msg, Address(rax, 112));
  sha256rnds2(state1, state0);
  movdqa(msgtmp4, msgtmp3);
  palignr(msgtmp4, msgtmp2, 4);
  paddd(msgtmp0, msgtmp4);
  sha256msg2(msgtmp0, msgtmp3);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);
  sha256msg1(msgtmp2, msgtmp3);

  // Rounds 32-35
  movdqa(msg, msgtmp0);
  paddd(msg, Address(rax, 128));
  sha256rnds2(state1, state0);
  movdqa(msgtmp4, msgtmp0);
  palignr(msgtmp4, msgtmp3, 4);
  paddd(msgtmp1, msgtmp4);
  sha256msg2(msgtmp1, msgtmp0);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);
  sha256msg1(msgtmp3, msgtmp0);

  // Rounds 36-39
  movdqa(msg, msgtmp1);
  paddd(msg, Address(rax, 144));
  sha256rnds2(state1, state0);
  movdqa(msgtmp4, msgtmp1);
  palignr(msgtmp4, msgtmp0, 4);
  paddd(msgtmp2, msgtmp4);
  sha256msg2(msgtmp2, msgtmp1);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);
  sha256msg1(msgtmp0, msgtmp1);

  // Rounds 40-43
  movdqa(msg, msgtmp2);
  paddd(msg, Address(rax, 160));
  sha256rnds2(state1, state0);
  movdqa(msgtmp4, msgtmp2);
  palignr(msgtmp4, msgtmp1, 4);
  paddd(msgtmp3, msgtmp4);
  sha256msg2(msgtmp3, msgtmp2);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);
  sha256msg1(msgtmp1, msgtmp2);

  // Rounds 44-47
  movdqa(msg, msgtmp3);
  paddd(msg, Address(rax, 176));
  sha256rnds2(state1, state0);
  movdqa(msgtmp4, msgtmp3);
  palignr(msgtmp4, msgtmp2, 4);
  paddd(msgtmp0, msgtmp4);
  sha256msg2(msgtmp0, msgtmp3);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);
  sha256msg1(msgtmp2, msgtmp3);

  // Rounds 48-51
  movdqa(msg, msgtmp0);
  paddd(msg, Address(rax, 192));
  sha256rnds2(state1, state0);
  movdqa(msgtmp4, msgtmp0);
  palignr(msgtmp4, msgtmp3, 4);
  paddd(msgtmp1, msgtmp4);
  sha256msg2(msgtmp1, msgtmp0);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);
  sha256msg1(msgtmp3, msgtmp0);

  // Rounds 52-55
  movdqa(msg, msgtmp1);
  paddd(msg, Address(rax, 208));
  sha256rnds2(state1, state0);
  movdqa(msgtmp4, msgtmp1);
  palignr(msgtmp4, msgtmp0, 4);
  paddd(msgtmp2, msgtmp4);
  sha256msg2(msgtmp2, msgtmp1);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);

  // Rounds 56-59
  movdqa(msg, msgtmp2);
  paddd(msg, Address(rax, 224));
  sha256rnds2(state1, state0);
  movdqa(msgtmp4, msgtmp2);
  palignr(msgtmp4, msgtmp1, 4);
  paddd(msgtmp3, msgtmp4);
  sha256msg2(msgtmp3, msgtmp2);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);

  // Rounds 60-63
  movdqa(msg, msgtmp3);
  paddd(msg, Address(rax, 240));
  sha256rnds2(state1, state0);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);
  // Add the saved input state back in (Davies-Meyer feed-forward).
  movdqu(msg, Address(rsp, 0));
  paddd(state0, msg);
  movdqu(msg, Address(rsp, 16));
  paddd(state1, msg);

  if (multi_block) {
    // increment data pointer and loop if more to process
    addptr(buf, 64);
    addptr(ofs, 64);
    cmpptr(ofs, limit);
    jcc(Assembler::belowEqual, loop0);
    movptr(rax, ofs); //return ofs
  }

  // Undo the {CDGH, ABEF} layout and write hash values back in the correct order.
  pshufd(state0, state0, 0x1B);
  pshufd(state1, state1, 0xB1);
  movdqa(msgtmp4, state0);
  pblendw(state0, state1, 0xF0);
  palignr(state1, msgtmp4, 8);

  movdqu(Address(state, 0), state0);
  movdqu(Address(state, 16), state1);

  bind(done_hash);

}

#ifdef _LP64
/*
  The algorithm below is based on Intel publication:
  "Fast SHA-256 Implementations on Intel(R) Architecture Processors" by Jim Guilford, Kirk Yap and Vinodh Gopal.
  The assembly code was originally provided by Sean Gulley and in many places preserves
  the original assembly NAMES and comments to simplify matching Java assembly with its original.
  The Java version was substantially redesigned to replace 1200 assembly instruction with
  much shorter run-time generator of the same code in memory.
*/

// Emits one SHA-256 round (of the final 16, no message scheduling).
// reg_old_h is the previous round's h alias: completing its update is
// deferred into the next round to hide latency; rounds with iter%4 == 3
// finish their own update inline instead. Scratch: r12-r15, rcx.
// The pre-added (K+W) values are read from the stack at rsp+rdx+4*iter.
void MacroAssembler::sha256_AVX2_one_round_compute(
    Register reg_old_h,
    Register reg_a,
    Register reg_b,
    Register reg_c,
    Register reg_d,
    Register reg_e,
    Register reg_f,
    Register reg_g,
    Register reg_h,
    int iter) {
  const Register& reg_y0 = r13;
  const Register& reg_y1 = r14;
  const Register& reg_y2 = r15;
  const Register& reg_y3 = rcx;
  const Register& reg_T1 = r12;
  //;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND iter ;;;;;;;;;;;;;;;;;;;;;;;;;;;
  if (iter%4 > 0) {
    addl(reg_old_h, reg_y2);   // reg_h = k + w + reg_h + S0 + S1 + CH = t1 + S0; --
  }
  movl(reg_y2, reg_f);         // reg_y2 = reg_f ; CH
  rorxd(reg_y0, reg_e, 25);    // reg_y0 = reg_e >> 25 ; S1A
  rorxd(reg_y1, reg_e, 11);    // reg_y1 = reg_e >> 11 ; S1B
  xorl(reg_y2, reg_g);         // reg_y2 = reg_f^reg_g ; CH

  xorl(reg_y0, reg_y1);        // reg_y0 = (reg_e>>25) ^ (reg_e>>11) ; S1
  rorxd(reg_y1, reg_e, 6);     // reg_y1 = (reg_e >> 6) ; S1
  andl(reg_y2, reg_e);         // reg_y2 = (reg_f^reg_g)&reg_e ; CH

  if (iter%4 > 0) {
    addl(reg_old_h, reg_y3);   // reg_h = t1 + S0 + MAJ ; --
  }

  xorl(reg_y0, reg_y1);        // reg_y0 = (reg_e>>25) ^ (reg_e>>11) ^ (reg_e>>6) ; S1
  rorxd(reg_T1, reg_a, 13);    // reg_T1 = reg_a >> 13 ; S0B
  xorl(reg_y2, reg_g);         // reg_y2 = CH = ((reg_f^reg_g)&reg_e)^reg_g ; CH
  rorxd(reg_y1, reg_a, 22);    // reg_y1 = reg_a >> 22 ; S0A
  movl(reg_y3, reg_a);         // reg_y3 = reg_a ; MAJA

  xorl(reg_y1, reg_T1);        // reg_y1 = (reg_a>>22) ^ (reg_a>>13) ; S0
  rorxd(reg_T1, reg_a, 2);     // reg_T1 = (reg_a >> 2) ; S0
  addl(reg_h, Address(rsp, rdx, Address::times_1, 4*iter)); // reg_h = k + w + reg_h ; --
  orl(reg_y3, reg_c);          // reg_y3 = reg_a|reg_c ; MAJA

  xorl(reg_y1, reg_T1);        // reg_y1 = (reg_a>>22) ^ (reg_a>>13) ^ (reg_a>>2) ; S0
  movl(reg_T1, reg_a);         // reg_T1 = reg_a ; MAJB
  andl(reg_y3, reg_b);         // reg_y3 = (reg_a|reg_c)&reg_b ; MAJA
  andl(reg_T1, reg_c);         // reg_T1 = reg_a&reg_c ; MAJB
  addl(reg_y2, reg_y0);        // reg_y2 = S1 + CH ; --


  addl(reg_d, reg_h);          // reg_d = k + w + reg_h + reg_d ; --
  orl(reg_y3, reg_T1);         // reg_y3 = MAJ = (reg_a|reg_c)&reg_b)|(reg_a&reg_c) ; MAJ
  addl(reg_h, reg_y1);         // reg_h = k + w + reg_h + S0 ; --

  addl(reg_d, reg_y2);         // reg_d = k + w + reg_h + reg_d + S1 + CH = reg_d + t1 ; --


  if (iter%4 == 3) {
    addl(reg_h, reg_y2);       // reg_h = k + w + reg_h + S0 + S1 + CH = t1 + S0; --
    addl(reg_h, reg_y3);       // reg_h = t1 + S0 + MAJ ; --
  }
}

// Emits four rounds starting with a in rax (first half of the 8-register
// rotation a..h = rax,rbx,rdi,rsi,r8,r9,r10,r11).
void MacroAssembler::sha256_AVX2_four_rounds_compute_first(int start) {
  sha256_AVX2_one_round_compute(rax, rax, rbx, rdi, rsi, r8,  r9,  r10, r11, start + 0);
  sha256_AVX2_one_round_compute(r11, r11, rax, rbx, rdi, rsi, r8,  r9,  r10, start + 1);
  sha256_AVX2_one_round_compute(r10, r10, r11, rax, rbx, rdi, rsi, r8,  r9,  start + 2);
  sha256_AVX2_one_round_compute(r9,  r9,  r10, r11, rax, rbx, rdi, rsi, r8,  start + 3);
}

// Emits four rounds covering the second half of the 8-register rotation.
void MacroAssembler::sha256_AVX2_four_rounds_compute_last(int start) {
  sha256_AVX2_one_round_compute(r8,  r8,  r9,  r10, r11, rax, rbx, rdi, rsi, start + 0);
  sha256_AVX2_one_round_compute(rsi, rsi, r8,  r9,  r10, r11, rax, rbx, rdi, start + 1);
  sha256_AVX2_one_round_compute(rdi, rdi, rsi, r8,  r9,  r10, r11, rax, rbx, start + 2);
  sha256_AVX2_one_round_compute(rbx, rbx, rdi, rsi, r8,  r9,  r10, r11, rax, start + 3);
}

// Emits one SHA-256 round interleaved with one quarter of the AVX2 message
// schedule for the next 4 dwords (the iter%4 branch below selects which
// quarter). Scratch: r12-r15, rcx, and ymm0-ymm3/ymm8/ymm11 plus the
// shuffle constants in ymm10/ymm12.
void MacroAssembler::sha256_AVX2_one_round_and_sched(
    XMMRegister xmm_0,     /* == ymm4 on 0, 1, 2, 3 iterations, then rotate 4 registers left on 4, 8, 12 iterations */
    XMMRegister xmm_1,     /* ymm5 */  /* full cycle is 16 iterations */
    XMMRegister xmm_2,     /* ymm6 */
    XMMRegister xmm_3,     /* ymm7 */
    Register    reg_a,     /* == rax on 0 iteration, then rotate 8 register right on each next iteration */
    Register    reg_b,     /* rbx */    /* full cycle is 8 iterations */
    Register    reg_c,     /* rdi */
    Register    reg_d,     /* rsi */
    Register    reg_e,     /* r8 */
    Register    reg_f,     /* r9d */
    Register    reg_g,     /* r10d */
    Register    reg_h,     /* r11d */
    int iter)
{
  movl(rcx, reg_a);        // rcx = reg_a ; MAJA
  rorxd(r13, reg_e, 25);   // r13 = reg_e >> 25 ; S1A
  rorxd(r14, reg_e, 11);   // r14 = reg_e >> 11 ; S1B
  addl(reg_h, Address(rsp, rdx, Address::times_1, 4*iter));
  orl(rcx, reg_c);         // rcx = reg_a|reg_c ; MAJA

  movl(r15, reg_f);        // r15 = reg_f ; CH
  rorxd(r12, reg_a, 13);   // r12 = reg_a >> 13 ; S0B
  xorl(r13, r14);          // r13 = (reg_e>>25) ^ (reg_e>>11) ; S1
  xorl(r15, reg_g);        // r15 = reg_f^reg_g ; CH

  rorxd(r14, reg_e, 6);    // r14 = (reg_e >> 6) ; S1
  andl(r15, reg_e);        // r15 = (reg_f^reg_g)&reg_e ; CH

  xorl(r13, r14);          // r13 = (reg_e>>25) ^ (reg_e>>11) ^ (reg_e>>6) ; S1
  rorxd(r14, reg_a, 22);   // r14 = reg_a >> 22 ; S0A
  addl(reg_d, reg_h);      // reg_d = k + w + reg_h + reg_d ; --

  andl(rcx, reg_b);        // rcx = (reg_a|reg_c)&reg_b ; MAJA
  xorl(r14, r12);          // r14 = (reg_a>>22) ^ (reg_a>>13) ; S0

  rorxd(r12, reg_a, 2);    // r12 = (reg_a >> 2) ; S0
  xorl(r15, reg_g);        // r15 = CH = ((reg_f^reg_g)&reg_e)^reg_g ; CH

  xorl(r14, r12);          // r14 = (reg_a>>22) ^ (reg_a>>13) ^ (reg_a>>2) ; S0
  movl(r12, reg_a);        // r12 = reg_a ; MAJB
  andl(r12, reg_c);        // r12 = reg_a&reg_c ; MAJB
  addl(r15, r13);          // r15 = S1 + CH ; --

  orl(rcx, r12);           // rcx = MAJ = (reg_a|reg_c)&reg_b)|(reg_a&reg_c) ; MAJ
  addl(reg_h, r14);        // reg_h = k + w + reg_h + S0 ; --
  addl(reg_d, r15);        // reg_d = k + w + reg_h + reg_d + S1 + CH = reg_d + t1 ; --

  addl(reg_h, r15);        // reg_h = k + w + reg_h + S0 + S1 + CH = t1 + S0; --
  addl(reg_h, rcx);        // reg_h = t1 + S0 + MAJ ; --

  if (iter%4 == 0) {
    vpalignr(xmm0, xmm_3, xmm_2, 4, AVX_256bit);   // ymm0 = W[-7]
    vpaddd(xmm0, xmm0, xmm_0, AVX_256bit);         // ymm0 = W[-7] + W[-16]; y1 = (e >> 6) ; S1
    vpalignr(xmm1, xmm_1, xmm_0, 4, AVX_256bit);   // ymm1 = W[-15]
    vpsrld(xmm2, xmm1, 7, AVX_256bit);
    vpslld(xmm3, xmm1, 32-7, AVX_256bit);
    vpor(xmm3, xmm3, xmm2, AVX_256bit);            // ymm3 = W[-15] ror 7
    vpsrld(xmm2, xmm1, 18, AVX_256bit);
  } else if (iter%4 == 1 ) {
    vpsrld(xmm8, xmm1, 3, AVX_256bit);             // ymm8 = W[-15] >> 3
    vpslld(xmm1, xmm1, 32-18, AVX_256bit);
    vpxor(xmm3, xmm3, xmm1, AVX_256bit);
    vpxor(xmm3, xmm3, xmm2, AVX_256bit);           // ymm3 = W[-15] ror 7 ^ W[-15] ror 18
    vpxor(xmm1, xmm3, xmm8, AVX_256bit);           // ymm1 = s0
    vpshufd(xmm2, xmm_3, 0xFA, AVX_256bit);        // 11111010b ; ymm2 = W[-2] {BBAA}
    vpaddd(xmm0, xmm0, xmm1, AVX_256bit);          // ymm0 = W[-16] + W[-7] + s0
    vpsrld(xmm8, xmm2, 10, AVX_256bit);            // ymm8 = W[-2] >> 10 {BBAA}
  } else if (iter%4 == 2) {
    vpsrlq(xmm3, xmm2, 19, AVX_256bit);            // ymm3 = W[-2] ror 19 {xBxA}
    vpsrlq(xmm2, xmm2, 17, AVX_256bit);            // ymm2 = W[-2] ror 17 {xBxA}
    vpxor(xmm2, xmm2, xmm3, AVX_256bit);
    vpxor(xmm8, xmm8, xmm2, AVX_256bit);           // ymm8 = s1 {xBxA}
    vpshufb(xmm8, xmm8, xmm10, AVX_256bit);        // ymm8 = s1 {00BA}
    vpaddd(xmm0, xmm0, xmm8, AVX_256bit);          // ymm0 = {..., ..., W[1], W[0]}
    vpshufd(xmm2, xmm0, 0x50, AVX_256bit);         // 01010000b ; ymm2 = W[-2] {DDCC}
  } else if (iter%4 == 3) {
    vpsrld(xmm11, xmm2, 10, AVX_256bit);           // ymm11 = W[-2] >> 10 {DDCC}
    vpsrlq(xmm3, xmm2, 19, AVX_256bit);            // ymm3 = W[-2] ror 19 {xDxC}
    vpsrlq(xmm2, xmm2, 17, AVX_256bit);            // ymm2 = W[-2] ror 17 {xDxC}
    vpxor(xmm2, xmm2, xmm3, AVX_256bit);
    vpxor(xmm11, xmm11, xmm2, AVX_256bit);         // ymm11 = s1 {xDxC}
    vpshufb(xmm11, xmm11, xmm12, AVX_256bit);      // ymm11 = s1 {DC00}
    vpaddd(xmm_0, xmm11, xmm0, AVX_256bit);        // xmm_0 = {W[3], W[2], W[1], W[0]}
  }
}

// Emits: [r1 + disp] += r2 (32-bit), leaving the sum in both r2 and memory.
void MacroAssembler::addm(int disp, Register r1, Register r2) {
  addl(r2, Address(r1, disp));
  movl(Address(r1, disp), r2);
}

// Emits: [r1 + disp] += r2 (64-bit), leaving the sum in both r2 and memory.
void MacroAssembler::addmq(int disp, Register r1, Register r2) {
  addq(r2, Address(r1, disp));
  movq(Address(r1, disp), r2);
}

// Emits an AVX2 SHA-256 compression that processes two 64-byte blocks per
// iteration: the message schedules of both blocks are computed together
// (loop1), then the second block is compressed from the pre-scheduled
// values (loop3). An odd trailing block goes through do_last_block.
// The incoming ofs/limit arguments are re-derived from the saved argument
// registers (the generator ignores the buf/state/ofs/limit/xmm parameters
// and works on the native ABI registers directly). Returns updated ofs in
// rax when multi_block is true.
void MacroAssembler::sha256_AVX2(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
  XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
  Register buf, Register state, Register ofs, Register limit, Register rsp,
  bool multi_block, XMMRegister shuf_mask) {

  Label loop0, loop1, loop2, loop3,
        last_block_enter, do_last_block, only_one_block, done_hash,
        compute_size, compute_size_end,
        compute_size1, compute_size_end1;

  address K256_W = StubRoutines::x86::k256_W_addr();
  address pshuffle_byte_flip_mask = StubRoutines::x86::pshuffle_byte_flip_mask_addr();
  address pshuffle_byte_flip_mask_addr = 0;

  const XMMRegister& SHUF_00BA        = xmm10;    // ymm10: shuffle xBxA -> 00BA
  const XMMRegister& SHUF_DC00        = xmm12;    // ymm12: shuffle xDxC -> DC00
  const XMMRegister& BYTE_FLIP_MASK   = xmm13;    // ymm13

  const XMMRegister& X_BYTE_FLIP_MASK = xmm13;    //XMM version of BYTE_FLIP_MASK

  const Register& NUM_BLKS = r8;    // 3rd arg
  const Register& CTX      = rdx;   // 2nd arg
  const Register& INP      = rcx;   // 1st arg

  const Register& c        = rdi;
  const Register& d        = rsi;
  const Register& e        = r8;    // clobbers NUM_BLKS
  const Register& y3       = rcx;   // clobbers INP

  const Register& TBL      = rbp;
  const Register& SRND     = CTX;   // SRND is same register as CTX

  const Register& a        = rax;
  const Register& b        = rbx;
  const Register& f        = r9;
  const Register& g        = r10;
  const Register& h        = r11;

  const Register& T1       = r12;
  const Register& y0       = r13;
  const Register& y1       = r14;
  const Register& y2       = r15;


  // Layout of the aligned stack frame built below.
  enum {
    _XFER_SIZE     = 2*64*4, // 2 blocks, 64 rounds, 4 bytes/round
    _INP_END_SIZE  = 8,
    _INP_SIZE      = 8,
    _CTX_SIZE      = 8,
    _RSP_SIZE      = 8,

    _XFER          = 0,
    _INP_END       = _XFER     + _XFER_SIZE,
    _INP           = _INP_END  + _INP_END_SIZE,
    _CTX           = _INP      + _INP_SIZE,
    _RSP           = _CTX      + _CTX_SIZE,
    STACK_SIZE     = _RSP      + _RSP_SIZE
  };

#ifndef _WIN64
  push(rcx);    // linux: this is limit, need at the end
  push(rdx);    // linux: this is ofs
#else
  push(r8);     // win64: this is ofs
  push(r9);     // win64: this is limit, we need them again at the very end
#endif


  push(rbx);
#ifdef _WIN64
  push(rsi);
  push(rdi);
#endif
  push(rbp);
  push(r12);
  push(r13);
  push(r14);
  push(r15);

  // Build a 32-byte-aligned frame; the original rsp is saved at _RSP.
  movq(rax, rsp);
  subq(rsp, STACK_SIZE);
  andq(rsp, -32);
  movq(Address(rsp, _RSP), rax);

#ifndef _WIN64
  // copy linux params to win64 params, therefore the rest of code will be the same for both
  movq(r9,  rcx);
  movq(r8,  rdx);
  movq(rdx, rsi);
  movq(rcx, rdi);
#endif

  // setting original assembly ABI
  /** message to encrypt in INP */
  lea(INP, Address(rcx, 0));    // rcx == message (buf)     ;; linux: INP = buf = rdi
  /** digest in CTX             */
  movq(CTX, rdx);               // rdx = digest (state)     ;; linux: CTX = state = rsi

  /** NUM_BLK is the length of message, need to set it from ofs and limit */
  if (multi_block) {

    // Win64: cannot directly update NUM_BLKS, since NUM_BLKS = ofs = r8
    // on entry r8 = ofs
    // on exit r8 = NUM_BLKS

    xorq(rax, rax);

    // NUM_BLKS (in bytes) = limit - ofs, rounded up in 64-byte steps.
    bind(compute_size);
    cmpptr(r8, r9); // assume the original ofs <= limit ;; linux: cmp rcx, rdx
    jccb(Assembler::aboveEqual, compute_size_end);
    addq(r8, 64);   //;; linux: ofs = rdx
    addq(rax, 64);
    jmpb(compute_size);

    bind(compute_size_end);
    movq(NUM_BLKS, rax);  // NUM_BLK (r8)  ;; linux: NUM_BLK = rdx

    cmpq(NUM_BLKS, 0);
    jcc(Assembler::equal, done_hash);

  } else {
    xorq(NUM_BLKS, NUM_BLKS);
    addq(NUM_BLKS, 64);
  }//if (!multi_block)

  lea(NUM_BLKS, Address(INP, NUM_BLKS, Address::times_1, -64)); // pointer to the last block
  movq(Address(rsp, _INP_END), NUM_BLKS);

  cmpptr(INP, NUM_BLKS);                  //cmp INP, NUM_BLKS
  jcc(Assembler::equal, only_one_block);  //je only_one_block

  // load initial digest
  movl(a, Address(CTX, 4*0));
  movl(b, Address(CTX, 4*1));
  movl(c, Address(CTX, 4*2));
  movl(d, Address(CTX, 4*3));
  movl(e, Address(CTX, 4*4));
  movl(f, Address(CTX, 4*5));
  // load g - r10 after it is used as scratch
  movl(h, Address(CTX, 4*7));

  pshuffle_byte_flip_mask_addr = pshuffle_byte_flip_mask;
  vmovdqu(BYTE_FLIP_MASK, ExternalAddress(pshuffle_byte_flip_mask_addr + 0));  //[PSHUFFLE_BYTE_FLIP_MASK wrt rip]
  vmovdqu(SHUF_00BA,      ExternalAddress(pshuffle_byte_flip_mask_addr + 32)); //[_SHUF_00BA wrt rip]
  vmovdqu(SHUF_DC00,      ExternalAddress(pshuffle_byte_flip_mask_addr + 64)); //[_SHUF_DC00 wrt rip]

  movl(g, Address(CTX, 4*6));

  movq(Address(rsp, _CTX), CTX); // store

bind(loop0);
  lea(TBL, ExternalAddress(K256_W));

  // assume buffers not aligned

  // Load first 16 dwords from two blocks
  vmovdqu(xmm0, Address(INP, 0*32));
  vmovdqu(xmm1, Address(INP, 1*32));
  vmovdqu(xmm2, Address(INP, 2*32));
  vmovdqu(xmm3, Address(INP, 3*32));

  // byte swap data
  vpshufb(xmm0, xmm0, BYTE_FLIP_MASK, AVX_256bit);
  vpshufb(xmm1, xmm1, BYTE_FLIP_MASK, AVX_256bit);
  vpshufb(xmm2, xmm2, BYTE_FLIP_MASK, AVX_256bit);
  vpshufb(xmm3, xmm3, BYTE_FLIP_MASK, AVX_256bit);

  // transpose data into high/low halves
  vperm2i128(xmm4, xmm0, xmm2, 0x20);
  vperm2i128(xmm5, xmm0, xmm2, 0x31);
  vperm2i128(xmm6, xmm1, xmm3, 0x20);
  vperm2i128(xmm7, xmm1, xmm3, 0x31);

bind(last_block_enter);
  addq(INP, 64);
  movq(Address(rsp, _INP), INP);

  //;; schedule 48 input dwords, by doing 3 rounds of 12 each
  xorq(SRND, SRND);

align(16);
bind(loop1);
  vpaddd(xmm9, xmm4, Address(TBL, SRND, Address::times_1, 0*32), AVX_256bit);
  vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 0*32), xmm9);
  sha256_AVX2_one_round_and_sched(xmm4, xmm5, xmm6, xmm7, rax, rbx, rdi, rsi, r8,  r9, r10, r11, 0);
  sha256_AVX2_one_round_and_sched(xmm4, xmm5, xmm6, xmm7, r11, rax, rbx, rdi, rsi, r8, r9,  r10, 1);
  sha256_AVX2_one_round_and_sched(xmm4, xmm5, xmm6, xmm7, r10, r11, rax, rbx, rdi, rsi, r8, r9,  2);
  sha256_AVX2_one_round_and_sched(xmm4, xmm5, xmm6, xmm7, r9,  r10, r11, rax, rbx, rdi, rsi, r8, 3);

  vpaddd(xmm9, xmm5, Address(TBL, SRND, Address::times_1, 1*32), AVX_256bit);
  vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 1*32), xmm9);
  sha256_AVX2_one_round_and_sched(xmm5, xmm6, xmm7, xmm4, r8,  r9,  r10, r11, rax, rbx, rdi, rsi, 8+0);
  sha256_AVX2_one_round_and_sched(xmm5, xmm6, xmm7, xmm4, rsi, r8,  r9,  r10, r11, rax, rbx, rdi, 8+1);
  sha256_AVX2_one_round_and_sched(xmm5, xmm6, xmm7, xmm4, rdi, rsi, r8,  r9,  r10, r11, rax, rbx, 8+2);
  sha256_AVX2_one_round_and_sched(xmm5, xmm6, xmm7, xmm4, rbx, rdi, rsi, r8,  r9,  r10, r11, rax, 8+3);

  vpaddd(xmm9, xmm6, Address(TBL, SRND, Address::times_1, 2*32), AVX_256bit);
  vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 2*32), xmm9);
  sha256_AVX2_one_round_and_sched(xmm6, xmm7, xmm4, xmm5, rax, rbx, rdi, rsi, r8,  r9, r10, r11, 16+0);
  sha256_AVX2_one_round_and_sched(xmm6, xmm7, xmm4, xmm5, r11, rax, rbx, rdi, rsi, r8, r9,  r10, 16+1);
  sha256_AVX2_one_round_and_sched(xmm6, xmm7, xmm4, xmm5, r10, r11, rax, rbx, rdi, rsi, r8, r9,  16+2);
  sha256_AVX2_one_round_and_sched(xmm6, xmm7, xmm4, xmm5, r9,  r10, r11, rax, rbx, rdi, rsi, r8, 16+3);

  vpaddd(xmm9, xmm7, Address(TBL, SRND, Address::times_1, 3*32), AVX_256bit);
  vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 3*32), xmm9);

  sha256_AVX2_one_round_and_sched(xmm7, xmm4, xmm5, xmm6, r8,  r9,  r10, r11, rax, rbx, rdi, rsi, 24+0);
  sha256_AVX2_one_round_and_sched(xmm7, xmm4, xmm5, xmm6, rsi, r8,  r9,  r10, r11, rax, rbx, rdi, 24+1);
  sha256_AVX2_one_round_and_sched(xmm7, xmm4, xmm5, xmm6, rdi, rsi, r8,  r9,  r10, r11, rax, rbx, 24+2);
  sha256_AVX2_one_round_and_sched(xmm7, xmm4, xmm5, xmm6, rbx, rdi, rsi, r8,  r9,  r10, r11, rax, 24+3);

  addq(SRND, 4*32);
  cmpq(SRND, 3 * 4*32);
  jcc(Assembler::below, loop1);

bind(loop2);
  // Do last 16 rounds with no scheduling
  vpaddd(xmm9, xmm4, Address(TBL, SRND, Address::times_1, 0*32), AVX_256bit);
  vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 0*32), xmm9);
  sha256_AVX2_four_rounds_compute_first(0);

  vpaddd(xmm9, xmm5, Address(TBL, SRND, Address::times_1, 1*32), AVX_256bit);
  vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 1*32), xmm9);
  sha256_AVX2_four_rounds_compute_last(0 + 8);

  addq(SRND, 2*32);

  vmovdqu(xmm4, xmm6);
  vmovdqu(xmm5, xmm7);

  cmpq(SRND, 4 * 4*32);
  jcc(Assembler::below, loop2);

  movq(CTX, Address(rsp, _CTX));
  movq(INP, Address(rsp, _INP));

  addm(4*0, CTX, a);
  addm(4*1, CTX, b);
  addm(4*2, CTX, c);
  addm(4*3, CTX, d);
  addm(4*4, CTX, e);
  addm(4*5, CTX, f);
  addm(4*6, CTX, g);
  addm(4*7, CTX, h);

  cmpq(INP, Address(rsp, _INP_END));
  jcc(Assembler::above, done_hash);

  //Do second block using previously scheduled results
  xorq(SRND, SRND);
align(16);
bind(loop3);
  sha256_AVX2_four_rounds_compute_first(4);
  sha256_AVX2_four_rounds_compute_last(4+8);

  addq(SRND, 2*32);
  cmpq(SRND, 4 * 4*32);
  jcc(Assembler::below, loop3);

  movq(CTX, Address(rsp, _CTX));
  movq(INP, Address(rsp, _INP));
  addq(INP, 64);

  addm(4*0, CTX, a);
  addm(4*1, CTX, b);
  addm(4*2, CTX, c);
  addm(4*3, CTX, d);
  addm(4*4, CTX, e);
  addm(4*5, CTX, f);
  addm(4*6, CTX, g);
  addm(4*7, CTX, h);

  cmpq(INP, Address(rsp, _INP_END));
  jcc(Assembler::below, loop0);
  jccb(Assembler::above, done_hash);

bind(do_last_block);
  lea(TBL, ExternalAddress(K256_W));

  // Single trailing block: load 64 bytes with 128-bit operations.
  movdqu(xmm4, Address(INP, 0*16));
  movdqu(xmm5, Address(INP, 1*16));
  movdqu(xmm6, Address(INP, 2*16));
  movdqu(xmm7, Address(INP, 3*16));

  vpshufb(xmm4, xmm4, xmm13, AVX_128bit);
  vpshufb(xmm5, xmm5, xmm13, AVX_128bit);
  vpshufb(xmm6, xmm6, xmm13, AVX_128bit);
  vpshufb(xmm7, xmm7, xmm13, AVX_128bit);

  jmp(last_block_enter);

bind(only_one_block);

  // load initial digest ;; table should be preloaded with following values
  movl(a, Address(CTX, 4*0));   // 0x6a09e667
  movl(b, Address(CTX, 4*1));   // 0xbb67ae85
  movl(c, Address(CTX, 4*2));   // 0x3c6ef372
  movl(d, Address(CTX, 4*3));   // 0xa54ff53a
  movl(e, Address(CTX, 4*4));   // 0x510e527f
  movl(f, Address(CTX, 4*5));   // 0x9b05688c
  // load g - r10 after use as scratch
  movl(h, Address(CTX, 4*7));   // 0x5be0cd19


  pshuffle_byte_flip_mask_addr = pshuffle_byte_flip_mask;
  vmovdqu(BYTE_FLIP_MASK, ExternalAddress(pshuffle_byte_flip_mask_addr + 0));  //[PSHUFFLE_BYTE_FLIP_MASK wrt rip]
  vmovdqu(SHUF_00BA,      ExternalAddress(pshuffle_byte_flip_mask_addr + 32)); //[_SHUF_00BA wrt rip]
  vmovdqu(SHUF_DC00,      ExternalAddress(pshuffle_byte_flip_mask_addr + 64)); //[_SHUF_DC00 wrt rip]

  movl(g, Address(CTX, 4*6));   // 0x1f83d9ab

  movq(Address(rsp, _CTX), CTX);
  jmpb(do_last_block);

bind(done_hash);

  // Restore the caller's stack pointer and saved registers.
  movq(rsp, Address(rsp, _RSP));

  pop(r15);
  pop(r14);
  pop(r13);
  pop(r12);
  pop(rbp);
#ifdef _WIN64
  pop(rdi);
  pop(rsi);
#endif
  pop(rbx);

#ifdef _WIN64
  pop(r9);
  pop(r8);
#else
  pop(rdx);
  pop(rcx);
#endif

  if (multi_block) {
    // Recompute the final ofs (= original ofs rounded up to limit in
    // 64-byte steps) as the return value in rax.
#ifdef _WIN64
    const Register& limit_end = r9;
    const Register& ofs_end   = r8;
#else
    const Register& limit_end = rcx;
    const Register& ofs_end   = rdx;
#endif
    movq(rax, ofs_end);

bind(compute_size1);
    cmpptr(rax, limit_end); // assume the original ofs <= limit
    jccb(Assembler::aboveEqual, compute_size_end1);
    addq(rax, 64);
    jmpb(compute_size1);

bind(compute_size_end1);
  }
}

void
MacroAssembler::sha512_AVX2_one_round_compute(Register old_h, Register a, Register b, Register c,
                                              Register d, Register e, Register f, Register g, Register h,
                                              int iteration)
{
  // Emits the GPR instruction sequence for one SHA-512 round (no message
  // scheduling).  a..h are the working state registers for this round; the
  // caller rotates the register assignment between rounds.  old_h is the
  // register that held 'h' in the PREVIOUS round: its final two additions
  // (S1+CH and MAJ) are deferred into this round (see the iteration % 4 > 0
  // blocks below) to interleave better with this round's dependency chain.
  // NOTE: comments below write "e >> 41" etc. mirroring the original Intel
  // reference code, but rorxq is a rotate, not a shift.

  // Fixed scratch registers (saved/restored by sha512_AVX2's prologue/epilogue).
  const Register& y0 = r13;
  const Register& y1 = r14;
  const Register& y2 = r15;
#ifdef _WIN64
  const Register& y3 = rcx;
#else
  const Register& y3 = rdi;
#endif
  const Register& T1 = r12;

  // Complete the previous round's h: add its S1+CH term (left in y2).
  if (iteration % 4 > 0) {
    addq(old_h, y2);   //h = k + w + h + S0 + S1 + CH = t1 + S0;
  }
  movq(y2, f);         //y2 = f; CH
  rorxq(y0, e, 41);    //y0 = e >> 41; S1A
  rorxq(y1, e, 18);    //y1 = e >> 18; S1B
  xorq(y2, g);         //y2 = f^g; CH

  xorq(y0, y1);        //y0 = (e >> 41) ^ (e >> 18); S1
  rorxq(y1, e, 14);    //y1 = (e >> 14); S1
  andq(y2, e);         //y2 = (f^g)&e; CH

  // Complete the previous round's h: add its MAJ term (left in y3).
  if (iteration % 4 > 0 ) {
    addq(old_h, y3);   //h = t1 + S0 + MAJ
  }
  xorq(y0, y1);        //y0 = (e >> 41) ^ (e >> 18) ^ (e >> 14); S1
  rorxq(T1, a, 34);    //T1 = a >> 34; S0B
  xorq(y2, g);         //y2 = CH = ((f^g)&e) ^g; CH
  rorxq(y1, a, 39);    //y1 = a >> 39; S0A
  movq(y3, a);         //y3 = a; MAJA

  xorq(y1, T1);        //y1 = (a >> 39) ^ (a >> 34); S0
  rorxq(T1, a, 28);    //T1 = (a >> 28); S0
  // K[t] + W[t] was precomputed by the caller into the _XFER stack slots
  // (rsp + 0..24); pick up this round's qword.
  addq(h, Address(rsp, (8 * iteration))); //h = k + w + h; --
  orq(y3, c);          //y3 = a | c; MAJA

  xorq(y1, T1);        //y1 = (a >> 39) ^ (a >> 34) ^ (a >> 28); S0
  movq(T1, a);         //T1 = a; MAJB
  andq(y3, b);         //y3 = (a | c)&b; MAJA
  andq(T1, c);         //T1 = a&c; MAJB
  addq(y2, y0);        //y2 = S1 + CH; --

  addq(d, h);          //d = k + w + h + d; --
  orq(y3, T1);         //y3 = MAJ = (a | c)&b) | (a&c); MAJ
  addq(h, y1);         //h = k + w + h + S0; --

  addq(d, y2);         //d = k + w + h + d + S1 + CH = d + t1; --

  // Last round of the group of four: nothing follows to absorb the deferred
  // additions, so finish h here instead of leaving y2/y3 for the next round.
  if (iteration % 4 == 3) {
    addq(h, y2);       //h = k + w + h + S0 + S1 + CH = t1 + S0; --
    addq(h, y3);       //h = t1 + S0 + MAJ; --
  }
}

// One SHA-512 round fused with four-way message scheduling (parameters
// continue on the following lines; xmm4..xmm7 hold the sliding W[] window).
void MacroAssembler::sha512_AVX2_one_round_and_schedule(
  XMMRegister xmm4,
// ymm4
  XMMRegister xmm5, // ymm5
  XMMRegister xmm6, // ymm6
  XMMRegister xmm7, // ymm7
  Register a, //rax
  Register b, //rbx
  Register c, //rdi
  Register d, //rsi
  Register e, //r8
  Register f, //r9
  Register g, //r10
  Register h, //r11
  int iteration)
{
  // Emits one SHA-512 round (same GPR sequence as
  // sha512_AVX2_one_round_compute, but without the deferred-h trick) fused
  // with one quarter of the AVX2 message schedule: across iterations 0..3 the
  // vector code derives the next four W[] qwords from the window held in
  // xmm4..xmm7 (ymm registers), using xmm0..xmm3 and xmm8 as scratch and
  // xmm10 as the DC00 lane mask loaded by sha512_AVX2.
  // NOTE: ">>" in the S0/S1 comments denotes rotate (rorxq / vpsrlq+vpsllq+vpor
  // pairs), mirroring the original Intel reference code.

  const Register& y0 = r13;
  const Register& y1 = r14;
  const Register& y2 = r15;
#ifdef _WIN64
  const Register& y3 = rcx;
#else
  const Register& y3 = rdi;
#endif
  const Register& T1 = r12;

  if (iteration % 4 == 0) {
    // Extract w[t - 7]
    // xmm0 = W[-7]
    vperm2f128(xmm0, xmm7, xmm6, 3);
    vpalignr(xmm0, xmm0, xmm6, 8, AVX_256bit);

    // Calculate w[t - 16] + w[t - 7]
    vpaddq(xmm0, xmm0, xmm4, AVX_256bit); //xmm0 = W[-7] + W[-16]
    // Extract w[t - 15]
    //xmm1 = W[-15]
    vperm2f128(xmm1, xmm5, xmm4, 3);
    vpalignr(xmm1, xmm1, xmm4, 8, AVX_256bit);

    // Calculate sigma0
    // Calculate w[t - 15] ror 1
    vpsrlq(xmm2, xmm1, 1, AVX_256bit);
    vpsllq(xmm3, xmm1, (64 - 1), AVX_256bit);
    vpor(xmm3, xmm3, xmm2, AVX_256bit); //xmm3 = W[-15] ror 1
    // Calculate w[t - 15] shr 7
    vpsrlq(xmm8, xmm1, 7, AVX_256bit); //xmm8 = W[-15] >> 7

  } else if (iteration % 4 == 1) {
    //Calculate w[t - 15] ror 8
    vpsrlq(xmm2, xmm1, 8, AVX_256bit);
    vpsllq(xmm1, xmm1, (64 - 8), AVX_256bit);
    vpor(xmm1, xmm1, xmm2, AVX_256bit); //xmm1 = W[-15] ror 8

    //XOR the three components
    vpxor(xmm3, xmm3, xmm8, AVX_256bit); //xmm3 = W[-15] ror 1 ^ W[-15] >> 7
    vpxor(xmm1, xmm3, xmm1, AVX_256bit); //xmm1 = s0

    //Add three components, w[t - 16], w[t - 7] and sigma0
    vpaddq(xmm0, xmm0, xmm1, AVX_256bit); //xmm0 = W[-16] + W[-7] + s0

    // Move to appropriate lanes for calculating w[16] and w[17]
    vperm2f128(xmm4, xmm0, xmm0, 0); //xmm4 = W[-16] + W[-7] + s0{ BABA }

    //Move to appropriate lanes for calculating w[18] and w[19]
    vpand(xmm0, xmm0, xmm10, AVX_256bit); //xmm0 = W[-16] + W[-7] + s0{ DC00 }
    //Calculate w[16] and w[17] in both 128 bit lanes
    //Calculate sigma1 for w[16] and w[17] on both 128 bit lanes
    vperm2f128(xmm2, xmm7, xmm7, 17); //xmm2 = W[-2] {BABA}
    vpsrlq(xmm8, xmm2, 6, AVX_256bit); //xmm8 = W[-2] >> 6 {BABA}

  } else if (iteration % 4 == 2) {
    vpsrlq(xmm3, xmm2, 19, AVX_256bit); //xmm3 = W[-2] >> 19 {BABA}
    vpsllq(xmm1, xmm2, (64 - 19), AVX_256bit); //xmm1 = W[-2] << 19 {BABA}
    vpor(xmm3, xmm3, xmm1, AVX_256bit); //xmm3 = W[-2] ror 19 {BABA}
    vpxor(xmm8, xmm8, xmm3, AVX_256bit);// xmm8 = W[-2] ror 19 ^ W[-2] >> 6 {BABA}
    vpsrlq(xmm3, xmm2, 61, AVX_256bit); //xmm3 = W[-2] >> 61 {BABA}
    vpsllq(xmm1, xmm2, (64 - 61), AVX_256bit); //xmm1 = W[-2] << 61 {BABA}
    vpor(xmm3, xmm3, xmm1, AVX_256bit); //xmm3 = W[-2] ror 61 {BABA}
    vpxor(xmm8, xmm8, xmm3, AVX_256bit); //xmm8 = s1 = (W[-2] ror 19) ^ (W[-2] ror 61) ^ (W[-2] >> 6) { BABA }

    //Add sigma1 to the other components to get w[16] and w[17]
    vpaddq(xmm4, xmm4, xmm8, AVX_256bit); //xmm4 = { W[1], W[0], W[1], W[0] }

    //Calculate sigma1 for w[18] and w[19] for upper 128 bit lane
    vpsrlq(xmm8, xmm4, 6, AVX_256bit); //xmm8 = W[-2] >> 6 {DC--}

  } else if (iteration % 4 == 3){
    vpsrlq(xmm3, xmm4, 19, AVX_256bit); //xmm3 = W[-2] >> 19 {DC--}
    vpsllq(xmm1, xmm4, (64 - 19), AVX_256bit); //xmm1 = W[-2] << 19 {DC--}
    vpor(xmm3, xmm3, xmm1, AVX_256bit); //xmm3 = W[-2] ror 19 {DC--}
    vpxor(xmm8, xmm8, xmm3, AVX_256bit); //xmm8 = W[-2] ror 19 ^ W[-2] >> 6 {DC--}
    vpsrlq(xmm3, xmm4, 61, AVX_256bit); //xmm3 = W[-2] >> 61 {DC--}
    vpsllq(xmm1, xmm4, (64 - 61), AVX_256bit); //xmm1 = W[-2] << 61 {DC--}
    vpor(xmm3, xmm3, xmm1, AVX_256bit); //xmm3 = W[-2] ror 61 {DC--}
    vpxor(xmm8, xmm8, xmm3, AVX_256bit); //xmm8 = s1 = (W[-2] ror 19) ^ (W[-2] ror 61) ^ (W[-2] >> 6) { DC-- }

    //Add the sigma0 + w[t - 7] + w[t - 16] for w[18] and w[19] to newly calculated sigma1 to get w[18] and w[19]
    vpaddq(xmm2, xmm0, xmm8, AVX_256bit); //xmm2 = { W[3], W[2], --, -- }

    //Form w[19], w[18], w[17], w[16]
    vpblendd(xmm4, xmm4, xmm2, 0xF0, AVX_256bit); //xmm4 = { W[3], W[2], W[1], W[0] }
  }

  // Scalar round computation, interleaved after the vector schedule step.
  movq(y3, a); //y3 = a; MAJA
  rorxq(y0, e, 41); // y0 = e >> 41; S1A
  rorxq(y1, e, 18); //y1 = e >> 18; S1B
  addq(h, Address(rsp, (iteration * 8))); //h = k + w + h; --
  orq(y3, c); //y3 = a | c; MAJA
  movq(y2, f); //y2 = f; CH

  xorq(y2, g); //y2 = f^g; CH

  rorxq(T1, a, 34); //T1 = a >> 34; S0B
  xorq(y0, y1); //y0 = (e >> 41) ^ (e >> 18); S1

  rorxq(y1, e, 14); //y1 = (e >> 14); S1

  andq(y2, e); //y2 = (f^g) & e; CH
  addq(d, h); //d = k + w + h + d; --

  andq(y3, b); //y3 = (a | c)&b; MAJA
  xorq(y0, y1); //y0 = (e >> 41) ^ (e >> 18) ^ (e >> 14); S1
  rorxq(y1, a, 39); //y1 = a >> 39; S0A

  xorq(y1, T1); //y1 = (a >> 39) ^ (a >> 34); S0
  rorxq(T1, a, 28); //T1 = (a >> 28); S0
  xorq(y2, g); //y2 = CH = ((f^g)&e) ^ g; CH

  xorq(y1, T1); //y1 = (a >> 39) ^ (a >> 34) ^ (a >> 28); S0
  movq(T1, a); //T1 = a; MAJB

  andq(T1, c); //T1 = a&c; MAJB
  addq(y2, y0); //y2 = S1 + CH; --

  orq(y3, T1); //y3 = MAJ = (a | c)&b) | (a&c); MAJ
  addq(h, y1); //h = k + w + h + S0; --

  addq(d, y2); //d = k + w + h + d + S1 + CH = d + t1; --
  addq(h, y2); //h = k + w + h + S0 + S1 + CH = t1 + S0; --
  addq(h, y3); //h = t1 + S0 + MAJ; --
}

// ofs and limit support multi-block digests, mirroring sha256_AVX2 above.
// Emits the full SHA-512 compression driver: stack frame setup, block-count
// computation (multi_block), digest load, the scheduled/unscheduled round
// loops, digest store-back, and epilogue.
void MacroAssembler::sha512_AVX2(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
                                 XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
                                 Register buf, Register state, Register ofs, Register limit, Register rsp,
                                 bool multi_block, XMMRegister shuf_mask)
{

  Label loop0, loop1, loop2, done_hash,
        compute_block_size,
compute_size,
        compute_block_size_end, compute_size_end;

  address K512_W = StubRoutines::x86::k512_W_addr();
  address pshuffle_byte_flip_mask_sha512 = StubRoutines::x86::pshuffle_byte_flip_mask_addr_sha512();
  address pshuffle_byte_flip_mask_addr = 0;

  const XMMRegister& XFER = xmm0; // YTMP0 (alias; the code below refers to xmm0 directly)
  const XMMRegister& BYTE_FLIP_MASK = xmm9; // ymm9
  const XMMRegister& YMM_MASK_LO = xmm10; // ymm10
#ifdef _WIN64
  const Register& INP = rcx; //1st arg
  const Register& CTX = rdx; //2nd arg
  const Register& NUM_BLKS = r8; //3rd arg
  const Register& c = rdi;
  const Register& d = rsi;
  const Register& e = r8;
  const Register& y3 = rcx;
  const Register& offset = r8;
  const Register& input_limit = r9;
#else
  const Register& INP = rdi; //1st arg
  const Register& CTX = rsi; //2nd arg
  const Register& NUM_BLKS = rdx; //3rd arg
  const Register& c = rcx;
  const Register& d = r8;
  const Register& e = rdx;
  const Register& y3 = rdi;
  const Register& offset = rdx;
  const Register& input_limit = rcx;
#endif

  const Register& TBL = rbp;

  const Register& a = rax;
  const Register& b = rbx;

  const Register& f = r9;
  const Register& g = r10;
  const Register& h = r11;

  // Local variables as defined in the original Intel assembly file:
  // sizes of the stack-frame slots carved out below rsp.
  enum
  {
    _XFER_SIZE = 4 * 8, // resq 4 => reserve 4 quadwords. Hence 4 * 8
    _SRND_SIZE = 8, // resq 1
    _INP_SIZE = 8,
    _INP_END_SIZE = 8,
    _RSP_SAVE_SIZE = 8, // defined as resq 1
#ifdef _WIN64
    _GPR_SAVE_SIZE = 8 * 8, // defined as resq 8
#else
    _GPR_SAVE_SIZE = 6 * 8 // resq 6
#endif
  };

  // Byte offsets of each slot within the frame.
  enum
  {
    _XFER = 0,
    _SRND = _XFER + _XFER_SIZE, // 32
    _INP = _SRND + _SRND_SIZE, // 40
    _INP_END = _INP + _INP_SIZE, // 48
    _RSP = _INP_END + _INP_END_SIZE, // 56
    _GPR = _RSP + _RSP_SAVE_SIZE, // 64
    _STACK_SIZE = _GPR + _GPR_SAVE_SIZE // 128 for windows and 112 for linux.
  };

  // Saving offset and limit as it will help with blocksize calculation for
  // multiblock SHA512; they are popped again just before the compute_size loop.
#ifdef _WIN64
  push(r8);    // win64: this is ofs
  push(r9);    // win64: this is limit, we need them again at the very end.
#else
  push(rdx);   // linux: this is ofs, need at the end for multiblock calculation
  push(rcx);   // linux: This is the limit.
#endif

  // Allocate stack space; keep the original rsp in the _RSP slot so the
  // epilogue can undo the 32-byte alignment below.
  movq(rax, rsp);
  subq(rsp, _STACK_SIZE);
  andq(rsp, -32);
  movq(Address(rsp, _RSP), rax);

  // Save callee-saved GPRs clobbered as round scratch (r12-r15, rbx, rbp).
  movq(Address(rsp, _GPR), rbp);
  movq(Address(rsp, (_GPR + 8)), rbx);
  movq(Address(rsp, (_GPR + 16)), r12);
  movq(Address(rsp, (_GPR + 24)), r13);
  movq(Address(rsp, (_GPR + 32)), r14);
  movq(Address(rsp, (_GPR + 40)), r15);

#ifdef _WIN64
  movq(Address(rsp, (_GPR + 48)), rsi);
  movq(Address(rsp, (_GPR + 56)), rdi);
#endif

  // NOTE(review): these two blends have no stated purpose here — presumably a
  // deliberate ymm-state touch carried over from the reference code; confirm
  // against the upstream Intel sha512 assembly before removing.
  vpblendd(xmm0, xmm0, xmm1, 0xF0, AVX_128bit);
  vpblendd(xmm0, xmm0, xmm1, 0xF0, AVX_256bit);

  if (multi_block) {
    // Count how many 128-byte blocks fit between ofs and limit:
    // rax accumulates 128 per step until offset reaches input_limit.
    xorq(rax, rax);
    bind(compute_block_size);
    cmpptr(offset, input_limit); // Assuming that offset is less than limit.
    jccb(Assembler::aboveEqual, compute_block_size_end);
    addq(offset, 128);
    addq(rax, 128);
    jmpb(compute_block_size);

    bind(compute_block_size_end);
    movq(NUM_BLKS, rax);

    cmpq(NUM_BLKS, 0);
    jcc(Assembler::equal, done_hash);
  } else {
    xorq(NUM_BLKS, NUM_BLKS); //If single block.
    addq(NUM_BLKS, 128);
  }

  addq(NUM_BLKS, INP); //pointer to end of data
  movq(Address(rsp, _INP_END), NUM_BLKS);

  // Load initial digest (eight 64-bit state words) from the state array.
  movq(a, Address(CTX, 8 * 0));
  movq(b, Address(CTX, 8 * 1));
  movq(c, Address(CTX, 8 * 2));
  movq(d, Address(CTX, 8 * 3));
  movq(e, Address(CTX, 8 * 4));
  movq(f, Address(CTX, 8 * 5));
  // load g - r10 after it is used as scratch
  movq(h, Address(CTX, 8 * 7));

  pshuffle_byte_flip_mask_addr = pshuffle_byte_flip_mask_sha512;
  vmovdqu(BYTE_FLIP_MASK, ExternalAddress(pshuffle_byte_flip_mask_addr + 0)); //PSHUFFLE_BYTE_FLIP_MASK wrt rip
  vmovdqu(YMM_MASK_LO, ExternalAddress(pshuffle_byte_flip_mask_addr + 32));

  movq(g, Address(CTX, 8 * 6));

  bind(loop0);
  lea(TBL, ExternalAddress(K512_W));

  // Byte swap the 16 input qwords (one 128-byte block, four 32-byte loads).
  vmovdqu(xmm4, Address(INP, 32 * 0));
  vpshufb(xmm4, xmm4, BYTE_FLIP_MASK, AVX_256bit);
  vmovdqu(xmm5, Address(INP, 32 * 1));
  vpshufb(xmm5, xmm5, BYTE_FLIP_MASK, AVX_256bit);
  vmovdqu(xmm6, Address(INP, 32 * 2));
  vpshufb(xmm6, xmm6, BYTE_FLIP_MASK, AVX_256bit);
  vmovdqu(xmm7, Address(INP, 32 * 3));
  vpshufb(xmm7, xmm7, BYTE_FLIP_MASK, AVX_256bit);

  movq(Address(rsp, _INP), INP);

  // _SRND counts the remaining loop1 iterations (4 x 16 = 64 scheduled rounds).
  movslq(Address(rsp, _SRND), 4);
  align(16);

  // Schedule 64 input rounds, by calling sha512_AVX2_one_round_and_schedule
  bind(loop1);
  // Precompute K[t]+W[t] into the _XFER slots; each round reads its qword
  // back via Address(rsp, iteration * 8).
  vpaddq(xmm0, xmm4, Address(TBL, 0 * 32), AVX_256bit);
  vmovdqu(Address(rsp, _XFER), xmm0);
  //four rounds and schedule
  sha512_AVX2_one_round_and_schedule(xmm4, xmm5, xmm6, xmm7, a, b, c, d, e, f, g, h, 0);
  sha512_AVX2_one_round_and_schedule(xmm4, xmm5, xmm6, xmm7, h, a, b, c, d, e, f, g, 1);
  sha512_AVX2_one_round_and_schedule(xmm4, xmm5, xmm6, xmm7, g, h, a, b, c, d, e, f, 2);
  sha512_AVX2_one_round_and_schedule(xmm4, xmm5, xmm6, xmm7, f, g, h, a, b, c, d, e, 3);

  vpaddq(xmm0, xmm5, Address(TBL, 1 * 32), AVX_256bit);
  vmovdqu(Address(rsp, _XFER), xmm0);
  //four rounds and schedule
  sha512_AVX2_one_round_and_schedule(xmm5, xmm6, xmm7, xmm4, e, f, g, h, a, b, c, d, 0);
  sha512_AVX2_one_round_and_schedule(xmm5, xmm6, xmm7, xmm4, d, e, f, g, h, a, b, c, 1);
  sha512_AVX2_one_round_and_schedule(xmm5, xmm6, xmm7, xmm4, c, d, e, f, g, h, a, b, 2);
  sha512_AVX2_one_round_and_schedule(xmm5, xmm6, xmm7, xmm4, b, c, d, e, f, g, h, a, 3);

  vpaddq(xmm0, xmm6, Address(TBL, 2 * 32), AVX_256bit);
  vmovdqu(Address(rsp, _XFER), xmm0);
  //four rounds and schedule
  sha512_AVX2_one_round_and_schedule(xmm6, xmm7, xmm4, xmm5, a, b, c, d, e, f, g, h, 0);
  sha512_AVX2_one_round_and_schedule(xmm6, xmm7, xmm4, xmm5, h, a, b, c, d, e, f, g, 1);
  sha512_AVX2_one_round_and_schedule(xmm6, xmm7, xmm4, xmm5, g, h, a, b, c, d, e, f, 2);
  sha512_AVX2_one_round_and_schedule(xmm6, xmm7, xmm4, xmm5, f, g, h, a, b, c, d, e, 3);

  vpaddq(xmm0, xmm7, Address(TBL, 3 * 32), AVX_256bit);
  vmovdqu(Address(rsp, _XFER), xmm0);
  addq(TBL, 4 * 32);
  //four rounds and schedule
  sha512_AVX2_one_round_and_schedule(xmm7, xmm4, xmm5, xmm6, e, f, g, h, a, b, c, d, 0);
  sha512_AVX2_one_round_and_schedule(xmm7, xmm4, xmm5, xmm6, d, e, f, g, h, a, b, c, 1);
  sha512_AVX2_one_round_and_schedule(xmm7, xmm4, xmm5, xmm6, c, d, e, f, g, h, a, b, 2);
  sha512_AVX2_one_round_and_schedule(xmm7, xmm4, xmm5, xmm6, b, c, d, e, f, g, h, a, 3);

  subq(Address(rsp, _SRND), 1);
  jcc(Assembler::notEqual, loop1);

  // Last 16 rounds need no further message scheduling: 2 x 8 rounds.
  movslq(Address(rsp, _SRND), 2);

  bind(loop2);
  vpaddq(xmm0, xmm4, Address(TBL, 0 * 32), AVX_256bit);
  vmovdqu(Address(rsp, _XFER), xmm0);
  //four rounds and compute.
  sha512_AVX2_one_round_compute(a, a, b, c, d, e, f, g, h, 0);
  sha512_AVX2_one_round_compute(h, h, a, b, c, d, e, f, g, 1);
  sha512_AVX2_one_round_compute(g, g, h, a, b, c, d, e, f, 2);
  sha512_AVX2_one_round_compute(f, f, g, h, a, b, c, d, e, 3);

  vpaddq(xmm0, xmm5, Address(TBL, 1 * 32), AVX_256bit);
  vmovdqu(Address(rsp, _XFER), xmm0);
  addq(TBL, 2 * 32);
  // four rounds and compute.
  sha512_AVX2_one_round_compute(e, e, f, g, h, a, b, c, d, 0);
  sha512_AVX2_one_round_compute(d, d, e, f, g, h, a, b, c, 1);
  sha512_AVX2_one_round_compute(c, c, d, e, f, g, h, a, b, 2);
  sha512_AVX2_one_round_compute(b, b, c, d, e, f, g, h, a, 3);

  // Slide the W window so the second loop2 pass consumes the upper half.
  vmovdqu(xmm4, xmm6);
  vmovdqu(xmm5, xmm7);

  subq(Address(rsp, _SRND), 1);
  jcc(Assembler::notEqual, loop2);

  // Add the working variables back into the digest in memory.
  addmq(8 * 0, CTX, a);
  addmq(8 * 1, CTX, b);
  addmq(8 * 2, CTX, c);
  addmq(8 * 3, CTX, d);
  addmq(8 * 4, CTX, e);
  addmq(8 * 5, CTX, f);
  addmq(8 * 6, CTX, g);
  addmq(8 * 7, CTX, h);

  // Advance to the next 128-byte block, if any.
  movq(INP, Address(rsp, _INP));
  addq(INP, 128);
  cmpq(INP, Address(rsp, _INP_END));
  jcc(Assembler::notEqual, loop0);

  bind(done_hash);

  //Restore GPRs
  movq(rbp, Address(rsp, (_GPR + 0)));
  movq(rbx, Address(rsp, (_GPR + 8)));
  movq(r12, Address(rsp, (_GPR + 16)));
  movq(r13, Address(rsp, (_GPR + 24)));
  movq(r14, Address(rsp, (_GPR + 32)));
  movq(r15, Address(rsp, (_GPR + 40)));

#ifdef _WIN64
  movq(rsi, Address(rsp, (_GPR + 48)));
  movq(rdi, Address(rsp, (_GPR + 56)));
#endif

  //Restore Stack Pointer
  movq(rsp, Address(rsp, _RSP));

  // Recover the ofs/limit values pushed in the prologue.
#ifdef _WIN64
  pop(r9);
  pop(r8);
#else
  pop(rcx);
  pop(rdx);
#endif

  if (multi_block) {
#ifdef _WIN64
    const Register& limit_end = r9;
    const Register& ofs_end = r8;
#else
    const Register& limit_end = rcx;
    const Register& ofs_end = rdx;
#endif
    // Return value (rax) = smallest ofs + k*128 that is >= limit, i.e. the
    // updated offset after the blocks consumed above (assumes ofs <= limit,
    // mirroring sha256_AVX2's compute_size1 loop).
    movq(rax, ofs_end);
    bind(compute_size);
    cmpptr(rax, limit_end);
    jccb(Assembler::aboveEqual, compute_size_end);
    addq(rax, 128);
    jmpb(compute_size);
    bind(compute_size_end);
  }
}

#endif //#ifdef _LP64