1/*
2* Copyright (c) 2016, Intel Corporation.
3*
4* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5*
6* This code is free software; you can redistribute it and/or modify it
7* under the terms of the GNU General Public License version 2 only, as
8* published by the Free Software Foundation.
9*
10* This code is distributed in the hope that it will be useful, but WITHOUT
11* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12* FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
13* version 2 for more details (a copy is included in the LICENSE file that
14* accompanied this code).
15*
16* You should have received a copy of the GNU General Public License version
17* 2 along with this work; if not, write to the Free Software Foundation,
18* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
19*
20* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
21* or visit www.oracle.com if you need additional information or have any
22* questions.
23*
24*/
25
26#include "precompiled.hpp"
27#include "asm/assembler.hpp"
28#include "asm/assembler.inline.hpp"
29#include "runtime/stubRoutines.hpp"
30#include "macroAssembler_x86.hpp"
31
32// ofs and limit are used for multi-block byte array.
33// int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
// Emits the SHA-1 block compression loop using the x86 SHA extensions
// (sha1rnds4 / sha1nexte / sha1msg1 / sha1msg2).
//
// In:  buf    - pointer to the 64-byte message block(s)
//      state  - pointer to the 20-byte SHA-1 state (a,b,c,d,e)
//      ofs/limit - byte-array window bounds, used only when multi_block
//      rsp    - scratch area with at least 32 bytes to save the running hash
// Out: state updated in place; when multi_block, rax = updated ofs
//      (the Java stub's return value).
// NOTE(review): all XMM arguments are used as scratch; the caller is
// assumed to have nothing live in them — confirm at the call sites.
void MacroAssembler::fast_sha1(XMMRegister abcd, XMMRegister e0, XMMRegister e1, XMMRegister msg0,
  XMMRegister msg1, XMMRegister msg2, XMMRegister msg3, XMMRegister shuf_mask,
  Register buf, Register state, Register ofs, Register limit, Register rsp, bool multi_block) {

  Label start, done_hash, loop0;

  address upper_word_mask = StubRoutines::x86::upper_word_mask_addr();
  address shuffle_byte_flip_mask = StubRoutines::x86::shuffle_byte_flip_mask_addr();

  bind(start);
  // Load initial digest: abcd = state[0..3], e into the top dword of e0.
  movdqu(abcd, Address(state, 0));
  pinsrd(e0, Address(state, 16), 3);
  movdqu(shuf_mask, ExternalAddress(upper_word_mask)); // 0xFFFFFFFF000000000000000000000000
  pand(e0, shuf_mask);                                 // keep only e in the upper dword
  pshufd(abcd, abcd, 0x1B);                            // reverse dword order for the SHA ISA layout
  movdqu(shuf_mask, ExternalAddress(shuffle_byte_flip_mask)); //0x000102030405060708090a0b0c0d0e0f

  bind(loop0);
  // Save hash values for addition after rounds
  movdqu(Address(rsp, 0), e0);
  movdqu(Address(rsp, 16), abcd);


  // Rounds 0 - 3
  movdqu(msg0, Address(buf, 0));
  pshufb(msg0, shuf_mask);     // byte-swap message words to big-endian
  paddd(e0, msg0);
  movdqa(e1, abcd);
  sha1rnds4(abcd, e0, 0);      // imm 0 selects the rounds 0-19 constant/function

  // Rounds 4 - 7
  movdqu(msg1, Address(buf, 16));
  pshufb(msg1, shuf_mask);
  sha1nexte(e1, msg1);
  movdqa(e0, abcd);
  sha1rnds4(abcd, e1, 0);
  sha1msg1(msg0, msg1);

  // Rounds 8 - 11
  movdqu(msg2, Address(buf, 32));
  pshufb(msg2, shuf_mask);
  sha1nexte(e0, msg2);
  movdqa(e1, abcd);
  sha1rnds4(abcd, e0, 0);
  sha1msg1(msg1, msg2);
  pxor(msg0, msg2);

  // Rounds 12 - 15
  movdqu(msg3, Address(buf, 48));
  pshufb(msg3, shuf_mask);
  sha1nexte(e1, msg3);
  movdqa(e0, abcd);
  sha1msg2(msg0, msg3);
  sha1rnds4(abcd, e1, 0);
  sha1msg1(msg2, msg3);
  pxor(msg1, msg3);

  // Rounds 16 - 19
  sha1nexte(e0, msg0);
  movdqa(e1, abcd);
  sha1msg2(msg1, msg0);
  sha1rnds4(abcd, e0, 0);
  sha1msg1(msg3, msg0);
  pxor(msg2, msg0);

  // Rounds 20 - 23
  sha1nexte(e1, msg1);
  movdqa(e0, abcd);
  sha1msg2(msg2, msg1);
  sha1rnds4(abcd, e1, 1);      // imm 1 selects the rounds 20-39 constant/function
  sha1msg1(msg0, msg1);
  pxor(msg3, msg1);

  // Rounds 24 - 27
  sha1nexte(e0, msg2);
  movdqa(e1, abcd);
  sha1msg2(msg3, msg2);
  sha1rnds4(abcd, e0, 1);
  sha1msg1(msg1, msg2);
  pxor(msg0, msg2);

  // Rounds 28 - 31
  sha1nexte(e1, msg3);
  movdqa(e0, abcd);
  sha1msg2(msg0, msg3);
  sha1rnds4(abcd, e1, 1);
  sha1msg1(msg2, msg3);
  pxor(msg1, msg3);

  // Rounds 32 - 35
  sha1nexte(e0, msg0);
  movdqa(e1, abcd);
  sha1msg2(msg1, msg0);
  sha1rnds4(abcd, e0, 1);
  sha1msg1(msg3, msg0);
  pxor(msg2, msg0);

  // Rounds 36 - 39
  sha1nexte(e1, msg1);
  movdqa(e0, abcd);
  sha1msg2(msg2, msg1);
  sha1rnds4(abcd, e1, 1);
  sha1msg1(msg0, msg1);
  pxor(msg3, msg1);

  // Rounds 40 - 43
  sha1nexte(e0, msg2);
  movdqa(e1, abcd);
  sha1msg2(msg3, msg2);
  sha1rnds4(abcd, e0, 2);      // imm 2 selects the rounds 40-59 constant/function
  sha1msg1(msg1, msg2);
  pxor(msg0, msg2);

  // Rounds 44 - 47
  sha1nexte(e1, msg3);
  movdqa(e0, abcd);
  sha1msg2(msg0, msg3);
  sha1rnds4(abcd, e1, 2);
  sha1msg1(msg2, msg3);
  pxor(msg1, msg3);

  // Rounds 48 - 51
  sha1nexte(e0, msg0);
  movdqa(e1, abcd);
  sha1msg2(msg1, msg0);
  sha1rnds4(abcd, e0, 2);
  sha1msg1(msg3, msg0);
  pxor(msg2, msg0);

  // Rounds 52 - 55
  sha1nexte(e1, msg1);
  movdqa(e0, abcd);
  sha1msg2(msg2, msg1);
  sha1rnds4(abcd, e1, 2);
  sha1msg1(msg0, msg1);
  pxor(msg3, msg1);

  // Rounds 56 - 59
  sha1nexte(e0, msg2);
  movdqa(e1, abcd);
  sha1msg2(msg3, msg2);
  sha1rnds4(abcd, e0, 2);
  sha1msg1(msg1, msg2);
  pxor(msg0, msg2);

  // Rounds 60 - 63
  sha1nexte(e1, msg3);
  movdqa(e0, abcd);
  sha1msg2(msg0, msg3);
  sha1rnds4(abcd, e1, 3);      // imm 3 selects the rounds 60-79 constant/function
  sha1msg1(msg2, msg3);
  pxor(msg1, msg3);

  // Rounds 64 - 67
  sha1nexte(e0, msg0);
  movdqa(e1, abcd);
  sha1msg2(msg1, msg0);
  sha1rnds4(abcd, e0, 3);
  sha1msg1(msg3, msg0);
  pxor(msg2, msg0);

  // Rounds 68 - 71
  sha1nexte(e1, msg1);
  movdqa(e0, abcd);
  sha1msg2(msg2, msg1);
  sha1rnds4(abcd, e1, 3);
  pxor(msg3, msg1);

  // Rounds 72 - 75
  sha1nexte(e0, msg2);
  movdqa(e1, abcd);
  sha1msg2(msg3, msg2);
  sha1rnds4(abcd, e0, 3);

  // Rounds 76 - 79
  sha1nexte(e1, msg3);
  movdqa(e0, abcd);
  sha1rnds4(abcd, e1, 3);

  // add current hash values with previously saved
  movdqu(msg0, Address(rsp, 0));
  sha1nexte(e0, msg0);         // folds saved e into the new e
  movdqu(msg0, Address(rsp, 16));
  paddd(abcd, msg0);

  if (multi_block) {
    // increment data pointer and loop if more to process
    addptr(buf, 64);
    addptr(ofs, 64);
    cmpptr(ofs, limit);
    jcc(Assembler::belowEqual, loop0);
    movptr(rax, ofs); //return ofs
  }
  // write hash values back in the correct order
  pshufd(abcd, abcd, 0x1b);    // undo the initial dword reversal
  movdqu(Address(state, 0), abcd);
  pextrd(Address(state, 16), e0, 3);  // e lives in the top dword of e0

  bind(done_hash);

}
235
236// xmm0 (msg) is used as an implicit argument to sh256rnds2
237// and state0 and state1 can never use xmm0 register.
238// ofs and limit are used for multi-block byte array.
239// int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
// Emits the SHA-256 block compression loop using the x86 SHA extensions
// (sha256rnds2 / sha256msg1 / sha256msg2).
//
// In:  buf    - pointer to the 64-byte message block(s)
//      state  - pointer to the 32-byte SHA-256 state (h0..h7)
//      ofs/limit - byte-array window bounds, used only when multi_block
//      rsp    - scratch area with at least 32 bytes to save the running hash
// Out: state updated in place; when multi_block, rax = updated ofs.
// On 64-bit a register (shuf_mask) caches the byte-flip mask; on 32-bit
// the mask is re-read from memory each use (pshufb with memory operand).
// rax is also used as the pointer to the K256 round-constant table.
#ifdef _LP64
void MacroAssembler::fast_sha256(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
  XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
  Register buf, Register state, Register ofs, Register limit, Register rsp,
  bool multi_block, XMMRegister shuf_mask) {
#else
void MacroAssembler::fast_sha256(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
  XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
  Register buf, Register state, Register ofs, Register limit, Register rsp,
  bool multi_block) {
#endif
  Label start, done_hash, loop0;

  address K256 = StubRoutines::x86::k256_addr();
  address pshuffle_byte_flip_mask = StubRoutines::x86::pshuffle_byte_flip_mask_addr();

  bind(start);
  movdqu(state0, Address(state, 0));
  movdqu(state1, Address(state, 16));

  // Rearrange the eight state words into the (CDGH, ABEF) layout
  // that sha256rnds2 expects.
  pshufd(state0, state0, 0xB1);
  pshufd(state1, state1, 0x1B);
  movdqa(msgtmp4, state0);
  palignr(state0, state1, 8);
  pblendw(state1, msgtmp4, 0xF0);

#ifdef _LP64
  movdqu(shuf_mask, ExternalAddress(pshuffle_byte_flip_mask));
#endif
  lea(rax, ExternalAddress(K256));   // rax -> round-constant table

  bind(loop0);
  // Save the current hash so it can be added back after the 64 rounds.
  movdqu(Address(rsp, 0), state0);
  movdqu(Address(rsp, 16), state1);

  // Rounds 0-3
  movdqu(msg, Address(buf, 0));
#ifdef _LP64
  pshufb(msg, shuf_mask);
#else
  pshufb(msg, ExternalAddress(pshuffle_byte_flip_mask));
#endif
  movdqa(msgtmp0, msg);
  paddd(msg, Address(rax, 0));
  sha256rnds2(state1, state0);
  pshufd(msg, msg, 0x0E);      // move upper qword (next two rounds' wk) down
  sha256rnds2(state0, state1);

  // Rounds 4-7
  movdqu(msg, Address(buf, 16));
#ifdef _LP64
  pshufb(msg, shuf_mask);
#else
  pshufb(msg, ExternalAddress(pshuffle_byte_flip_mask));
#endif
  movdqa(msgtmp1, msg);
  paddd(msg, Address(rax, 16));
  sha256rnds2(state1, state0);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);
  sha256msg1(msgtmp0, msgtmp1);

  // Rounds 8-11
  movdqu(msg, Address(buf, 32));
#ifdef _LP64
  pshufb(msg, shuf_mask);
#else
  pshufb(msg, ExternalAddress(pshuffle_byte_flip_mask));
#endif
  movdqa(msgtmp2, msg);
  paddd(msg, Address(rax, 32));
  sha256rnds2(state1, state0);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);
  sha256msg1(msgtmp1, msgtmp2);

  // Rounds 12-15
  movdqu(msg, Address(buf, 48));
#ifdef _LP64
  pshufb(msg, shuf_mask);
#else
  pshufb(msg, ExternalAddress(pshuffle_byte_flip_mask));
#endif
  movdqa(msgtmp3, msg);
  paddd(msg, Address(rax, 48));
  sha256rnds2(state1, state0);
  movdqa(msgtmp4, msgtmp3);
  palignr(msgtmp4, msgtmp2, 4);     // W[-7] term for the message schedule
  paddd(msgtmp0, msgtmp4);
  sha256msg2(msgtmp0, msgtmp3);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);
  sha256msg1(msgtmp2, msgtmp3);

  // Rounds 16-19
  movdqa(msg, msgtmp0);
  paddd(msg, Address(rax, 64));
  sha256rnds2(state1, state0);
  movdqa(msgtmp4, msgtmp0);
  palignr(msgtmp4, msgtmp3, 4);
  paddd(msgtmp1, msgtmp4);
  sha256msg2(msgtmp1, msgtmp0);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);
  sha256msg1(msgtmp3, msgtmp0);

  // Rounds 20-23
  movdqa(msg, msgtmp1);
  paddd(msg, Address(rax, 80));
  sha256rnds2(state1, state0);
  movdqa(msgtmp4, msgtmp1);
  palignr(msgtmp4, msgtmp0, 4);
  paddd(msgtmp2, msgtmp4);
  sha256msg2(msgtmp2, msgtmp1);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);
  sha256msg1(msgtmp0, msgtmp1);

  // Rounds 24-27
  movdqa(msg, msgtmp2);
  paddd(msg, Address(rax, 96));
  sha256rnds2(state1, state0);
  movdqa(msgtmp4, msgtmp2);
  palignr(msgtmp4, msgtmp1, 4);
  paddd(msgtmp3, msgtmp4);
  sha256msg2(msgtmp3, msgtmp2);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);
  sha256msg1(msgtmp1, msgtmp2);

  // Rounds 28-31
  movdqa(msg, msgtmp3);
  paddd(msg, Address(rax, 112));
  sha256rnds2(state1, state0);
  movdqa(msgtmp4, msgtmp3);
  palignr(msgtmp4, msgtmp2, 4);
  paddd(msgtmp0, msgtmp4);
  sha256msg2(msgtmp0, msgtmp3);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);
  sha256msg1(msgtmp2, msgtmp3);

  // Rounds 32-35
  movdqa(msg, msgtmp0);
  paddd(msg, Address(rax, 128));
  sha256rnds2(state1, state0);
  movdqa(msgtmp4, msgtmp0);
  palignr(msgtmp4, msgtmp3, 4);
  paddd(msgtmp1, msgtmp4);
  sha256msg2(msgtmp1, msgtmp0);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);
  sha256msg1(msgtmp3, msgtmp0);

  // Rounds 36-39
  movdqa(msg, msgtmp1);
  paddd(msg, Address(rax, 144));
  sha256rnds2(state1, state0);
  movdqa(msgtmp4, msgtmp1);
  palignr(msgtmp4, msgtmp0, 4);
  paddd(msgtmp2, msgtmp4);
  sha256msg2(msgtmp2, msgtmp1);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);
  sha256msg1(msgtmp0, msgtmp1);

  // Rounds 40-43
  movdqa(msg, msgtmp2);
  paddd(msg, Address(rax, 160));
  sha256rnds2(state1, state0);
  movdqa(msgtmp4, msgtmp2);
  palignr(msgtmp4, msgtmp1, 4);
  paddd(msgtmp3, msgtmp4);
  sha256msg2(msgtmp3, msgtmp2);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);
  sha256msg1(msgtmp1, msgtmp2);

  // Rounds 44-47
  movdqa(msg, msgtmp3);
  paddd(msg, Address(rax, 176));
  sha256rnds2(state1, state0);
  movdqa(msgtmp4, msgtmp3);
  palignr(msgtmp4, msgtmp2, 4);
  paddd(msgtmp0, msgtmp4);
  sha256msg2(msgtmp0, msgtmp3);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);
  sha256msg1(msgtmp2, msgtmp3);

  // Rounds 48-51
  movdqa(msg, msgtmp0);
  paddd(msg, Address(rax, 192));
  sha256rnds2(state1, state0);
  movdqa(msgtmp4, msgtmp0);
  palignr(msgtmp4, msgtmp3, 4);
  paddd(msgtmp1, msgtmp4);
  sha256msg2(msgtmp1, msgtmp0);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);
  sha256msg1(msgtmp3, msgtmp0);

  // Rounds 52-55 (no more sha256msg1 - schedule is nearly exhausted)
  movdqa(msg, msgtmp1);
  paddd(msg, Address(rax, 208));
  sha256rnds2(state1, state0);
  movdqa(msgtmp4, msgtmp1);
  palignr(msgtmp4, msgtmp0, 4);
  paddd(msgtmp2, msgtmp4);
  sha256msg2(msgtmp2, msgtmp1);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);

  // Rounds 56-59
  movdqa(msg, msgtmp2);
  paddd(msg, Address(rax, 224));
  sha256rnds2(state1, state0);
  movdqa(msgtmp4, msgtmp2);
  palignr(msgtmp4, msgtmp1, 4);
  paddd(msgtmp3, msgtmp4);
  sha256msg2(msgtmp3, msgtmp2);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);

  // Rounds 60-63
  movdqa(msg, msgtmp3);
  paddd(msg, Address(rax, 240));
  sha256rnds2(state1, state0);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);
  // Add the hash saved before the rounds back into the new state.
  movdqu(msg, Address(rsp, 0));
  paddd(state0, msg);
  movdqu(msg, Address(rsp, 16));
  paddd(state1, msg);

  if (multi_block) {
    // increment data pointer and loop if more to process
    addptr(buf, 64);
    addptr(ofs, 64);
    cmpptr(ofs, limit);
    jcc(Assembler::belowEqual, loop0);
    movptr(rax, ofs); //return ofs
  }

  // Undo the (CDGH, ABEF) layout before storing the digest.
  pshufd(state0, state0, 0x1B);
  pshufd(state1, state1, 0xB1);
  movdqa(msgtmp4, state0);
  pblendw(state0, state1, 0xF0);
  palignr(state1, msgtmp4, 8);

  movdqu(Address(state, 0), state0);
  movdqu(Address(state, 16), state1);

  bind(done_hash);

}
496
497#ifdef _LP64
498/*
499  The algorithm below is based on Intel publication:
  "Fast SHA-256 Implementations on Intel(R) Architecture Processors" by Jim Guilford, Kirk Yap and Vinodh Gopal.
501  The assembly code was originally provided by Sean Gulley and in many places preserves
502  the original assembly NAMES and comments to simplify matching Java assembly with its original.
503  The Java version was substantially redesigned to replace 1200 assembly instruction with
504  much shorter run-time generator of the same code in memory.
505*/
506
// Emits one scalar SHA-256 round for the AVX2 path.
// k+w for this round is read from the stack XFER area at rsp+rdx+4*iter.
// Scratch registers are fixed: y0=r13, y1=r14, y2=r15, y3=rcx, T1=r12.
// To expose instruction-level parallelism, the final additions into h of
// the previous round (S1+CH and MAJ) are deferred into the start of the
// next round (the iter%4 > 0 blocks) and flushed at iter%4 == 3.
// NOTE(review): rorxd here is this assembler's 32-bit RORX (rotate right,
// flags untouched) — the comments describe rotates, not shifts.
void MacroAssembler::sha256_AVX2_one_round_compute(
    Register  reg_old_h,
    Register  reg_a,
    Register  reg_b,
    Register  reg_c,
    Register  reg_d,
    Register  reg_e,
    Register  reg_f,
    Register  reg_g,
    Register  reg_h,
    int iter) {
  const Register& reg_y0     = r13;
  const Register& reg_y1     = r14;
  const Register& reg_y2     = r15;
  const Register& reg_y3     = rcx;
  const Register& reg_T1     = r12;
  //;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND iter ;;;;;;;;;;;;;;;;;;;;;;;;;;;
  if (iter%4 > 0) {
    addl(reg_old_h, reg_y2);   // reg_h = k + w + reg_h + S0 + S1 + CH = t1 + S0; --
  }
  movl(reg_y2, reg_f);         // reg_y2 = reg_f                                ; CH
  rorxd(reg_y0, reg_e, 25);    // reg_y0 = reg_e >> 25   ; S1A
  rorxd(reg_y1, reg_e, 11);    // reg_y1 = reg_e >> 11    ; S1B
  xorl(reg_y2, reg_g);         // reg_y2 = reg_f^reg_g                              ; CH

  xorl(reg_y0, reg_y1);        // reg_y0 = (reg_e>>25) ^ (reg_e>>11)  ; S1
  rorxd(reg_y1, reg_e, 6);     // reg_y1 = (reg_e >> 6)    ; S1
  andl(reg_y2, reg_e);         // reg_y2 = (reg_f^reg_g)&reg_e                          ; CH

  if (iter%4 > 0) {
    addl(reg_old_h, reg_y3);   // reg_h = t1 + S0 + MAJ                     ; --
  }

  xorl(reg_y0, reg_y1);       // reg_y0 = (reg_e>>25) ^ (reg_e>>11) ^ (reg_e>>6) ; S1
  rorxd(reg_T1, reg_a, 13);   // reg_T1 = reg_a >> 13    ; S0B
  xorl(reg_y2, reg_g);        // reg_y2 = CH = ((reg_f^reg_g)&reg_e)^reg_g                 ; CH
  rorxd(reg_y1, reg_a, 22);   // reg_y1 = reg_a >> 22    ; S0A
  movl(reg_y3, reg_a);        // reg_y3 = reg_a                                ; MAJA

  xorl(reg_y1, reg_T1);       // reg_y1 = (reg_a>>22) ^ (reg_a>>13)  ; S0
  rorxd(reg_T1, reg_a, 2);    // reg_T1 = (reg_a >> 2)    ; S0
  addl(reg_h, Address(rsp, rdx, Address::times_1, 4*iter)); // reg_h = k + w + reg_h ; --
  orl(reg_y3, reg_c);         // reg_y3 = reg_a|reg_c                              ; MAJA

  xorl(reg_y1, reg_T1);       // reg_y1 = (reg_a>>22) ^ (reg_a>>13) ^ (reg_a>>2) ; S0
  movl(reg_T1, reg_a);        // reg_T1 = reg_a                                ; MAJB
  andl(reg_y3, reg_b);        // reg_y3 = (reg_a|reg_c)&reg_b                          ; MAJA
  andl(reg_T1, reg_c);        // reg_T1 = reg_a&reg_c                              ; MAJB
  addl(reg_y2, reg_y0);       // reg_y2 = S1 + CH                          ; --


  addl(reg_d, reg_h);         // reg_d = k + w + reg_h + reg_d                     ; --
  orl(reg_y3, reg_T1);        // reg_y3 = MAJ = (reg_a|reg_c)&reg_b)|(reg_a&reg_c)             ; MAJ
  addl(reg_h, reg_y1);        // reg_h = k + w + reg_h + S0                    ; --

  addl(reg_d, reg_y2);        // reg_d = k + w + reg_h + reg_d + S1 + CH = reg_d + t1  ; --


  if (iter%4 == 3) {
    // Last round of the group: no next round to defer into, flush now.
    addl(reg_h, reg_y2);      // reg_h = k + w + reg_h + S0 + S1 + CH = t1 + S0; --
    addl(reg_h, reg_y3);      // reg_h = t1 + S0 + MAJ                     ; --
  }
}
570
571void MacroAssembler::sha256_AVX2_four_rounds_compute_first(int start) {
572    sha256_AVX2_one_round_compute(rax, rax, rbx, rdi, rsi,  r8,  r9, r10, r11, start + 0);
573    sha256_AVX2_one_round_compute(r11, r11, rax, rbx, rdi, rsi,  r8,  r9, r10, start + 1);
574    sha256_AVX2_one_round_compute(r10, r10, r11, rax, rbx, rdi, rsi,  r8,  r9, start + 2);
575    sha256_AVX2_one_round_compute(r9,  r9,  r10, r11, rax, rbx, rdi, rsi,  r8, start + 3);
576}
577
578void MacroAssembler::sha256_AVX2_four_rounds_compute_last(int start) {
579    sha256_AVX2_one_round_compute(r8,  r8,   r9, r10, r11, rax, rbx, rdi, rsi, start + 0);
580    sha256_AVX2_one_round_compute(rsi, rsi,  r8,  r9, r10, r11, rax, rbx, rdi, start + 1);
581    sha256_AVX2_one_round_compute(rdi, rdi, rsi,  r8,  r9, r10, r11, rax, rbx, start + 2);
582    sha256_AVX2_one_round_compute(rbx, rbx, rdi, rsi,  r8,  r9, r10, r11, rax, start + 3);
583}
584
// Emits one scalar SHA-256 round interleaved with one quarter of the AVX2
// message-schedule computation for a future 4-word group (ymm registers).
// The scalar part mirrors sha256_AVX2_one_round_compute but with the
// deferred additions performed inline; the vector part is spread over four
// consecutive iterations (iter%4 == 0..3) so scalar and SIMD execution
// overlap. Scratch: rcx (y3), r12 (T1), r13 (y0), r14 (y1), r15 (y2),
// plus xmm0-xmm3, xmm8, xmm11; xmm10/xmm12 hold the SHUF_00BA/SHUF_DC00
// shuffle constants loaded by the caller.
void MacroAssembler::sha256_AVX2_one_round_and_sched(
        XMMRegister  xmm_0,     /* == ymm4 on 0, 1, 2, 3 iterations, then rotate 4 registers left on 4, 8, 12 iterations */
        XMMRegister  xmm_1,     /* ymm5 */  /* full cycle is 16 iterations */
        XMMRegister  xmm_2,     /* ymm6 */
        XMMRegister  xmm_3,     /* ymm7 */
        Register  reg_a,        /* == rax on 0 iteration, then rotate 8 register right on each next iteration */
        Register  reg_b,        /* rbx */    /* full cycle is 8 iterations */
        Register  reg_c,        /* rdi */
        Register  reg_d,        /* rsi */
        Register  reg_e,        /* r8 */
        Register  reg_f,        /* r9d */
        Register  reg_g,        /* r10d */
        Register  reg_h,        /* r11d */
        int iter)
{
  movl(rcx, reg_a);           // rcx = reg_a               ; MAJA
  rorxd(r13, reg_e, 25);      // r13 = reg_e >> 25    ; S1A
  rorxd(r14, reg_e, 11);      //  r14 = reg_e >> 11    ; S1B
  addl(reg_h, Address(rsp, rdx, Address::times_1, 4*iter));  // reg_h += k + w (from XFER area)
  orl(rcx, reg_c);            // rcx = reg_a|reg_c          ; MAJA

  movl(r15, reg_f);           // r15 = reg_f               ; CH
  rorxd(r12, reg_a, 13);      // r12 = reg_a >> 13      ; S0B
  xorl(r13, r14);             // r13 = (reg_e>>25) ^ (reg_e>>11)  ; S1
  xorl(r15, reg_g);           // r15 = reg_f^reg_g         ; CH

  rorxd(r14, reg_e, 6);       // r14 = (reg_e >> 6)    ; S1
  andl(r15, reg_e);           // r15 = (reg_f^reg_g)&reg_e ; CH

  xorl(r13, r14);             // r13 = (reg_e>>25) ^ (reg_e>>11) ^ (reg_e>>6) ; S1
  rorxd(r14, reg_a, 22);      // r14 = reg_a >> 22    ; S0A
  addl(reg_d, reg_h);         // reg_d = k + w + reg_h + reg_d                     ; --

  andl(rcx, reg_b);          // rcx = (reg_a|reg_c)&reg_b                          ; MAJA
  xorl(r14, r12);            // r14 = (reg_a>>22) ^ (reg_a>>13)  ; S0

  rorxd(r12, reg_a, 2);      // r12 = (reg_a >> 2)    ; S0
  xorl(r15, reg_g);          // r15 = CH = ((reg_f^reg_g)&reg_e)^reg_g                 ; CH

  xorl(r14, r12);            // r14 = (reg_a>>22) ^ (reg_a>>13) ^ (reg_a>>2) ; S0
  movl(r12, reg_a);          // r12 = reg_a                                ; MAJB
  andl(r12, reg_c);          // r12 = reg_a&reg_c                              ; MAJB
  addl(r15, r13);            // r15 = S1 + CH                          ; --

  orl(rcx, r12);             // rcx = MAJ = (reg_a|reg_c)&reg_b)|(reg_a&reg_c)             ; MAJ
  addl(reg_h, r14);          // reg_h = k + w + reg_h + S0                    ; --
  addl(reg_d, r15);          // reg_d = k + w + reg_h + reg_d + S1 + CH = reg_d + t1  ; --

  addl(reg_h, r15);          // reg_h = k + w + reg_h + S0 + S1 + CH = t1 + S0; --
  addl(reg_h, rcx);          // reg_h = t1 + S0 + MAJ                     ; --

  // Message-schedule quarter for this iteration (one of four steps that
  // together produce the next four schedule words into xmm_0).
  if (iter%4 == 0) {
    vpalignr(xmm0, xmm_3, xmm_2, 4, AVX_256bit);   // ymm0 = W[-7]
    vpaddd(xmm0, xmm0, xmm_0, AVX_256bit);         // ymm0 = W[-7] + W[-16]; y1 = (e >> 6)     ; S1
    vpalignr(xmm1, xmm_1, xmm_0, 4, AVX_256bit);   // ymm1 = W[-15]
    vpsrld(xmm2, xmm1, 7, AVX_256bit);
    vpslld(xmm3, xmm1, 32-7, AVX_256bit);
    vpor(xmm3, xmm3, xmm2, AVX_256bit);            // ymm3 = W[-15] ror 7
    vpsrld(xmm2, xmm1,18, AVX_256bit);
  } else if (iter%4 == 1 ) {
    vpsrld(xmm8, xmm1, 3, AVX_256bit);             // ymm8 = W[-15] >> 3
    vpslld(xmm1, xmm1, 32-18, AVX_256bit);
    vpxor(xmm3, xmm3, xmm1, AVX_256bit);
    vpxor(xmm3, xmm3, xmm2, AVX_256bit);           // ymm3 = W[-15] ror 7 ^ W[-15] ror 18
    vpxor(xmm1, xmm3, xmm8, AVX_256bit);           // ymm1 = s0
    vpshufd(xmm2, xmm_3, 0xFA, AVX_256bit);        // 11111010b ; ymm2 = W[-2] {BBAA}
    vpaddd(xmm0, xmm0, xmm1, AVX_256bit);          // ymm0 = W[-16] + W[-7] + s0
    vpsrld(xmm8, xmm2, 10, AVX_256bit);            // ymm8 = W[-2] >> 10 {BBAA}
  } else if (iter%4 == 2) {
    vpsrlq(xmm3, xmm2, 19, AVX_256bit);            // ymm3 = W[-2] ror 19 {xBxA}
    vpsrlq(xmm2, xmm2, 17, AVX_256bit);            // ymm2 = W[-2] ror 17 {xBxA}
    vpxor(xmm2, xmm2, xmm3, AVX_256bit);
    vpxor(xmm8, xmm8, xmm2, AVX_256bit);           // ymm8 = s1 {xBxA}
    vpshufb(xmm8, xmm8, xmm10, AVX_256bit);        // ymm8 = s1 {00BA}
    vpaddd(xmm0, xmm0, xmm8, AVX_256bit);          // ymm0 = {..., ..., W[1], W[0]}
    vpshufd(xmm2, xmm0, 0x50, AVX_256bit);         // 01010000b ; ymm2 = W[-2] {DDCC}
  } else if (iter%4 == 3) {
    vpsrld(xmm11, xmm2, 10, AVX_256bit);           // ymm11 = W[-2] >> 10 {DDCC}
    vpsrlq(xmm3, xmm2, 19, AVX_256bit);            // ymm3 = W[-2] ror 19 {xDxC}
    vpsrlq(xmm2, xmm2, 17, AVX_256bit);            // ymm2 = W[-2] ror 17 {xDxC}
    vpxor(xmm2, xmm2, xmm3, AVX_256bit);
    vpxor(xmm11, xmm11, xmm2, AVX_256bit);         // ymm11 = s1 {xDxC}
    vpshufb(xmm11, xmm11, xmm12, AVX_256bit);      // ymm11 = s1 {DC00}
    vpaddd(xmm_0, xmm11, xmm0, AVX_256bit);        // xmm_0 = {W[3], W[2], W[1], W[0]}
  }
}
671
672void MacroAssembler::addm(int disp, Register r1, Register r2) {
673  addl(r2, Address(r1, disp));
674  movl(Address(r1, disp), r2);
675}
676
677void MacroAssembler::addmq(int disp, Register r1, Register r2) {
678  addq(r2, Address(r1, disp));
679  movq(Address(r1, disp), r2);
680}
681
682void MacroAssembler::sha256_AVX2(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
683  XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
684  Register buf, Register state, Register ofs, Register limit, Register rsp,
685  bool multi_block, XMMRegister shuf_mask) {
686
687  Label loop0, loop1, loop2, loop3,
688        last_block_enter, do_last_block, only_one_block, done_hash,
689        compute_size, compute_size_end,
690        compute_size1, compute_size_end1;
691
692  address K256_W = StubRoutines::x86::k256_W_addr();
693  address pshuffle_byte_flip_mask = StubRoutines::x86::pshuffle_byte_flip_mask_addr();
694  address pshuffle_byte_flip_mask_addr = 0;
695
696const XMMRegister& SHUF_00BA        = xmm10;    // ymm10: shuffle xBxA -> 00BA
697const XMMRegister& SHUF_DC00        = xmm12;    // ymm12: shuffle xDxC -> DC00
698const XMMRegister& BYTE_FLIP_MASK   = xmm13;   // ymm13
699
700const XMMRegister& X_BYTE_FLIP_MASK = xmm13;   //XMM version of BYTE_FLIP_MASK
701
702const Register& NUM_BLKS = r8;   // 3rd arg
703const Register& CTX      = rdx;  // 2nd arg
704const Register& INP      = rcx;  // 1st arg
705
706const Register& c        = rdi;
707const Register& d        = rsi;
708const Register& e        = r8;    // clobbers NUM_BLKS
709const Register& y3       = rcx;  // clobbers INP
710
711const Register& TBL      = rbp;
712const Register& SRND     = CTX;   // SRND is same register as CTX
713
714const Register& a        = rax;
715const Register& b        = rbx;
716const Register& f        = r9;
717const Register& g        = r10;
718const Register& h        = r11;
719
720const Register& T1       = r12;
721const Register& y0       = r13;
722const Register& y1       = r14;
723const Register& y2       = r15;
724
725
726enum {
727  _XFER_SIZE = 2*64*4, // 2 blocks, 64 rounds, 4 bytes/round
728  _INP_END_SIZE = 8,
729  _INP_SIZE = 8,
730  _CTX_SIZE = 8,
731  _RSP_SIZE = 8,
732
733  _XFER = 0,
734  _INP_END   = _XFER     + _XFER_SIZE,
735  _INP       = _INP_END  + _INP_END_SIZE,
736  _CTX       = _INP      + _INP_SIZE,
737  _RSP       = _CTX      + _CTX_SIZE,
738  STACK_SIZE = _RSP      + _RSP_SIZE
739};
740
741#ifndef _WIN64
742  push(rcx);    // linux: this is limit, need at the end
743  push(rdx);    // linux: this is ofs
744#else
745  push(r8);     // win64: this is ofs
746  push(r9);     // win64: this is limit, we need them again at the very and
747#endif
748
749
750  push(rbx);
751#ifdef _WIN64
752  push(rsi);
753  push(rdi);
754#endif
755  push(rbp);
756  push(r12);
757  push(r13);
758  push(r14);
759  push(r15);
760
761  movq(rax, rsp);
762  subq(rsp, STACK_SIZE);
763  andq(rsp, -32);
764  movq(Address(rsp, _RSP), rax);
765
766#ifndef _WIN64
767  // copy linux params to win64 params, therefore the rest of code will be the same for both
768  movq(r9,  rcx);
769  movq(r8,  rdx);
770  movq(rdx, rsi);
771  movq(rcx, rdi);
772#endif
773
774  // setting original assembly ABI
775  /** message to encrypt in INP */
776  lea(INP, Address(rcx, 0));    // rcx == message (buf)     ;; linux: INP = buf = rdi
777  /** digest in CTX             */
778  movq(CTX, rdx);               // rdx = digest  (state)    ;; linux: CTX = state = rsi
779
780  /** NUM_BLK is the length of message, need to set it from ofs and limit  */
781  if (multi_block) {
782
783    // Win64: cannot directly update NUM_BLKS, since NUM_BLKS = ofs = r8
784    // on entry r8 = ofs
785    // on exit  r8 = NUM_BLKS
786
787    xorq(rax, rax);
788
789    bind(compute_size);
790    cmpptr(r8, r9); // assume the original ofs <= limit ;; linux:  cmp rcx, rdx
791    jccb(Assembler::aboveEqual, compute_size_end);
792    addq(r8, 64);                                          //;; linux: ofs = rdx
793    addq(rax, 64);
794    jmpb(compute_size);
795
796    bind(compute_size_end);
797    movq(NUM_BLKS, rax);  // NUM_BLK (r8)                  ;; linux: NUM_BLK = rdx
798
799    cmpq(NUM_BLKS, 0);
800    jcc(Assembler::equal, done_hash);
801
802    } else {
803    xorq(NUM_BLKS, NUM_BLKS);
804    addq(NUM_BLKS, 64);
805  }//if (!multi_block)
806
807  lea(NUM_BLKS, Address(INP, NUM_BLKS, Address::times_1, -64)); // pointer to the last block
808  movq(Address(rsp, _INP_END), NUM_BLKS);  //
809
810  cmpptr(INP, NUM_BLKS);                   //cmp INP, NUM_BLKS
811  jcc(Assembler::equal, only_one_block);   //je only_one_block
812
813  // load initial digest
814  movl(a, Address(CTX, 4*0));
815  movl(b, Address(CTX, 4*1));
816  movl(c, Address(CTX, 4*2));
817  movl(d, Address(CTX, 4*3));
818  movl(e, Address(CTX, 4*4));
819  movl(f, Address(CTX, 4*5));
820  // load g - r10 after it is used as scratch
821  movl(h, Address(CTX, 4*7));
822
823  pshuffle_byte_flip_mask_addr = pshuffle_byte_flip_mask;
824  vmovdqu(BYTE_FLIP_MASK, ExternalAddress(pshuffle_byte_flip_mask_addr +0)); //[PSHUFFLE_BYTE_FLIP_MASK wrt rip]
825  vmovdqu(SHUF_00BA, ExternalAddress(pshuffle_byte_flip_mask_addr + 32));     //[_SHUF_00BA wrt rip]
826  vmovdqu(SHUF_DC00, ExternalAddress(pshuffle_byte_flip_mask_addr + 64));     //[_SHUF_DC00 wrt rip]
827
828  movl(g, Address(CTX, 4*6));
829
830  movq(Address(rsp, _CTX), CTX);           // store
831
832bind(loop0);
833  lea(TBL, ExternalAddress(K256_W));
834
835  // assume buffers not aligned
836
837  // Load first 16 dwords from two blocks
838  vmovdqu(xmm0, Address(INP, 0*32));
839  vmovdqu(xmm1, Address(INP, 1*32));
840  vmovdqu(xmm2, Address(INP, 2*32));
841  vmovdqu(xmm3, Address(INP, 3*32));
842
843  // byte swap data
844  vpshufb(xmm0, xmm0, BYTE_FLIP_MASK, AVX_256bit);
845  vpshufb(xmm1, xmm1, BYTE_FLIP_MASK, AVX_256bit);
846  vpshufb(xmm2, xmm2, BYTE_FLIP_MASK, AVX_256bit);
847  vpshufb(xmm3, xmm3, BYTE_FLIP_MASK, AVX_256bit);
848
849  // transpose data into high/low halves
850  vperm2i128(xmm4, xmm0, xmm2, 0x20);
851  vperm2i128(xmm5, xmm0, xmm2, 0x31);
852  vperm2i128(xmm6, xmm1, xmm3, 0x20);
853  vperm2i128(xmm7, xmm1, xmm3, 0x31);
854
855bind(last_block_enter);
856  addq(INP, 64);
857  movq(Address(rsp, _INP), INP);
858
859  //;; schedule 48 input dwords, by doing 3 rounds of 12 each
860  xorq(SRND, SRND);
861
862align(16);
863bind(loop1);
864  vpaddd(xmm9, xmm4, Address(TBL, SRND, Address::times_1, 0*32), AVX_256bit);
865  vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 0*32), xmm9);
866  sha256_AVX2_one_round_and_sched(xmm4, xmm5, xmm6, xmm7, rax, rbx, rdi, rsi, r8,  r9,  r10, r11, 0);
867  sha256_AVX2_one_round_and_sched(xmm4, xmm5, xmm6, xmm7, r11, rax, rbx, rdi, rsi, r8,  r9,  r10, 1);
868  sha256_AVX2_one_round_and_sched(xmm4, xmm5, xmm6, xmm7, r10, r11, rax, rbx, rdi, rsi, r8,  r9,  2);
869  sha256_AVX2_one_round_and_sched(xmm4, xmm5, xmm6, xmm7, r9,  r10, r11, rax, rbx, rdi, rsi, r8,  3);
870
871  vpaddd(xmm9, xmm5, Address(TBL, SRND, Address::times_1, 1*32), AVX_256bit);
872  vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 1*32), xmm9);
873  sha256_AVX2_one_round_and_sched(xmm5, xmm6, xmm7, xmm4, r8,  r9,  r10, r11, rax, rbx, rdi, rsi,  8+0);
874  sha256_AVX2_one_round_and_sched(xmm5, xmm6, xmm7, xmm4, rsi, r8,  r9,  r10, r11, rax, rbx, rdi,  8+1);
875  sha256_AVX2_one_round_and_sched(xmm5, xmm6, xmm7, xmm4, rdi, rsi, r8,  r9,  r10, r11, rax, rbx,  8+2);
876  sha256_AVX2_one_round_and_sched(xmm5, xmm6, xmm7, xmm4, rbx, rdi, rsi, r8,  r9,  r10, r11, rax,  8+3);
877
878  vpaddd(xmm9, xmm6, Address(TBL, SRND, Address::times_1, 2*32), AVX_256bit);
879  vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 2*32), xmm9);
880  sha256_AVX2_one_round_and_sched(xmm6, xmm7, xmm4, xmm5, rax, rbx, rdi, rsi, r8,  r9,  r10, r11, 16+0);
881  sha256_AVX2_one_round_and_sched(xmm6, xmm7, xmm4, xmm5, r11, rax, rbx, rdi, rsi, r8,  r9,  r10, 16+1);
882  sha256_AVX2_one_round_and_sched(xmm6, xmm7, xmm4, xmm5, r10, r11, rax, rbx, rdi, rsi, r8,  r9,  16+2);
883  sha256_AVX2_one_round_and_sched(xmm6, xmm7, xmm4, xmm5, r9,  r10, r11, rax, rbx, rdi, rsi, r8,  16+3);
884
885  vpaddd(xmm9, xmm7, Address(TBL, SRND, Address::times_1, 3*32), AVX_256bit);
886  vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 3*32), xmm9);
887
888  sha256_AVX2_one_round_and_sched(xmm7, xmm4, xmm5, xmm6, r8,  r9,  r10, r11, rax, rbx, rdi, rsi,  24+0);
889  sha256_AVX2_one_round_and_sched(xmm7, xmm4, xmm5, xmm6, rsi, r8,  r9,  r10, r11, rax, rbx, rdi,  24+1);
890  sha256_AVX2_one_round_and_sched(xmm7, xmm4, xmm5, xmm6, rdi, rsi, r8,  r9,  r10, r11, rax, rbx,  24+2);
891  sha256_AVX2_one_round_and_sched(xmm7, xmm4, xmm5, xmm6, rbx, rdi, rsi, r8,  r9,  r10, r11, rax,  24+3);
892
893  addq(SRND, 4*32);
894  cmpq(SRND, 3 * 4*32);
895  jcc(Assembler::below, loop1);
896
897bind(loop2);
898  // Do last 16 rounds with no scheduling
899  vpaddd(xmm9, xmm4, Address(TBL, SRND, Address::times_1, 0*32), AVX_256bit);
900  vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 0*32), xmm9);
901  sha256_AVX2_four_rounds_compute_first(0);
902
903  vpaddd(xmm9, xmm5, Address(TBL, SRND, Address::times_1, 1*32), AVX_256bit);
904  vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 1*32), xmm9);
905  sha256_AVX2_four_rounds_compute_last(0 + 8);
906
907  addq(SRND, 2*32);
908
909  vmovdqu(xmm4, xmm6);
910  vmovdqu(xmm5, xmm7);
911
912  cmpq(SRND, 4 * 4*32);
913  jcc(Assembler::below, loop2);
914
915  movq(CTX, Address(rsp, _CTX));
916  movq(INP, Address(rsp, _INP));
917
918  addm(4*0, CTX, a);
919  addm(4*1, CTX, b);
920  addm(4*2, CTX, c);
921  addm(4*3, CTX, d);
922  addm(4*4, CTX, e);
923  addm(4*5, CTX, f);
924  addm(4*6, CTX, g);
925  addm(4*7, CTX, h);
926
927  cmpq(INP, Address(rsp, _INP_END));
928  jcc(Assembler::above, done_hash);
929
930  //Do second block using previously scheduled results
931  xorq(SRND, SRND);
932align(16);
933bind(loop3);
934  sha256_AVX2_four_rounds_compute_first(4);
935  sha256_AVX2_four_rounds_compute_last(4+8);
936
937  addq(SRND, 2*32);
938  cmpq(SRND, 4 * 4*32);
939  jcc(Assembler::below, loop3);
940
941  movq(CTX, Address(rsp, _CTX));
942  movq(INP, Address(rsp, _INP));
943  addq(INP, 64);
944
945  addm(4*0, CTX, a);
946  addm(4*1, CTX, b);
947  addm(4*2, CTX, c);
948  addm(4*3, CTX, d);
949  addm(4*4, CTX, e);
950  addm(4*5, CTX, f);
951  addm(4*6, CTX, g);
952  addm(4*7, CTX, h);
953
954  cmpq(INP, Address(rsp, _INP_END));
955  jcc(Assembler::below, loop0);
956  jccb(Assembler::above, done_hash);
957
958bind(do_last_block);
959  lea(TBL, ExternalAddress(K256_W));
960
961  movdqu(xmm4, Address(INP, 0*16));
962  movdqu(xmm5, Address(INP, 1*16));
963  movdqu(xmm6, Address(INP, 2*16));
964  movdqu(xmm7, Address(INP, 3*16));
965
966  vpshufb(xmm4, xmm4, xmm13, AVX_128bit);
967  vpshufb(xmm5, xmm5, xmm13, AVX_128bit);
968  vpshufb(xmm6, xmm6, xmm13, AVX_128bit);
969  vpshufb(xmm7, xmm7, xmm13, AVX_128bit);
970
971  jmp(last_block_enter);
972
973bind(only_one_block);
974
975  // load initial digest ;; table should be preloaded with following values
976  movl(a, Address(CTX, 4*0));   // 0x6a09e667
977  movl(b, Address(CTX, 4*1));   // 0xbb67ae85
978  movl(c, Address(CTX, 4*2));   // 0x3c6ef372
979  movl(d, Address(CTX, 4*3));   // 0xa54ff53a
980  movl(e, Address(CTX, 4*4));   // 0x510e527f
981  movl(f, Address(CTX, 4*5));   // 0x9b05688c
982  // load g - r10 after use as scratch
983  movl(h, Address(CTX, 4*7));   // 0x5be0cd19
984
985
986  pshuffle_byte_flip_mask_addr = pshuffle_byte_flip_mask;
987  vmovdqu(BYTE_FLIP_MASK, ExternalAddress(pshuffle_byte_flip_mask_addr + 0)); //[PSHUFFLE_BYTE_FLIP_MASK wrt rip]
988  vmovdqu(SHUF_00BA, ExternalAddress(pshuffle_byte_flip_mask_addr + 32));     //[_SHUF_00BA wrt rip]
989  vmovdqu(SHUF_DC00, ExternalAddress(pshuffle_byte_flip_mask_addr + 64));     //[_SHUF_DC00 wrt rip]
990
991  movl(g, Address(CTX, 4*6));   // 0x1f83d9ab
992
993  movq(Address(rsp, _CTX), CTX);
994  jmpb(do_last_block);
995
996bind(done_hash);
997
998  movq(rsp, Address(rsp, _RSP));
999
1000  pop(r15);
1001  pop(r14);
1002  pop(r13);
1003  pop(r12);
1004  pop(rbp);
1005#ifdef _WIN64
1006  pop(rdi);
1007  pop(rsi);
1008#endif
1009  pop(rbx);
1010
1011#ifdef _WIN64
1012  pop(r9);
1013  pop(r8);
1014#else
1015  pop(rdx);
1016  pop(rcx);
1017#endif
1018
1019  if (multi_block) {
1020#ifdef _WIN64
1021const Register& limit_end = r9;
1022const Register& ofs_end   = r8;
1023#else
1024const Register& limit_end = rcx;
1025const Register& ofs_end   = rdx;
1026#endif
1027    movq(rax, ofs_end);
1028
1029bind(compute_size1);
1030    cmpptr(rax, limit_end); // assume the original ofs <= limit
1031    jccb(Assembler::aboveEqual, compute_size_end1);
1032    addq(rax, 64);
1033    jmpb(compute_size1);
1034
1035bind(compute_size_end1);
1036  }
1037}
1038
//-----------------------------------------------------------------------------
// Emits one round of the SHA-512 compression function (compute only, no
// message scheduling) for the AVX2 code path.
//
// a..h are the SHA-512 working variables; the caller rotates them one
// position per successive round.  The precomputed value k[t] + w[t] for this
// round is read from the XFER spill area at rsp + 8*iteration.
//
// The rounds are software-pipelined: the two final additions that complete a
// round's new 'h' are deferred into the start of the NEXT emitted round,
// where they target old_h (the register that held 'h' one rotation earlier).
// Hence iterations with iteration % 4 > 0 begin by finishing the previous
// round, and only iteration % 4 == 3 completes its own round inline — the
// caller is expected to emit rounds in groups of four.
//
// Scratch: r12-r15 and y3 (rcx on Win64, rdi elsewhere); flags clobbered.
void MacroAssembler::sha512_AVX2_one_round_compute(Register  old_h, Register a, Register b, Register c,
                                                   Register d, Register e, Register f, Register g, Register h,
                                                   int iteration)
{

    const Register& y0 = r13;
    const Register& y1 = r14;
    const Register& y2 = r15;
#ifdef _WIN64
    // On Win64 rdi carries working variable 'c' (see sha512_AVX2), so use rcx.
    const Register& y3 = rcx;
#else
    const Register& y3 = rdi;
#endif
    const Register& T1 = r12;

    if (iteration % 4 > 0) {
      // Finish the previous round (addition deferred from the prior call).
      addq(old_h, y2); //h = k + w + h + S0 + S1 + CH = t1 + S0;
    }
    movq(y2, f); //y2 = f; CH
    rorxq(y0, e, 41); //y0 = e >> 41; S1A
    rorxq(y1, e, 18); //y1 = e >> 18; S1B
    xorq(y2, g); //y2 = f^g; CH

    xorq(y0, y1); //y0 = (e >> 41) ^ (e >> 18); S1
    rorxq(y1, e, 14); //y1 = (e >> 14); S1
    andq(y2, e); //y2 = (f^g)&e; CH

    if (iteration % 4 > 0 ) {
      // Second deferred addition completing the previous round's 'h'.
      addq(old_h, y3); //h = t1 + S0 + MAJ
    }
    xorq(y0, y1); //y0 = (e >> 41) ^ (e >> 18) ^ (e >> 14); S1
    rorxq(T1, a, 34); //T1 = a >> 34; S0B
    xorq(y2, g); //y2 = CH = ((f^g)&e) ^g; CH
    rorxq(y1, a, 39); //y1 = a >> 39; S0A
    movq(y3, a); //y3 = a; MAJA

    xorq(y1, T1); //y1 = (a >> 39) ^ (a >> 34); S0
    rorxq(T1, a, 28); //T1 = (a >> 28); S0
    addq(h, Address(rsp, (8 * iteration))); //h = k + w + h; --
    orq(y3, c); //y3 = a | c; MAJA

    xorq(y1, T1); //y1 = (a >> 39) ^ (a >> 34) ^ (a >> 28); S0
    movq(T1, a); //T1 = a; MAJB
    andq(y3, b); //y3 = (a | c)&b; MAJA
    andq(T1, c); //T1 = a&c; MAJB
    addq(y2, y0); //y2 = S1 + CH; --

    addq(d, h); //d = k + w + h + d; --
    orq(y3, T1); //y3 = MAJ = (a | c)&b) | (a&c); MAJ
    addq(h, y1); //h = k + w + h + S0; --

    addq(d, y2); //d = k + w + h + d + S1 + CH = d + t1; --

    if (iteration % 4 == 3) {
      // Last round of the group: no next round to defer into, so complete
      // 'h' here instead of leaving the additions for old_h.
      addq(h, y2); //h = k + w + h + S0 + S1 + CH = t1 + S0; --
      addq(h, y3); //h = t1 + S0 + MAJ; --
    }
}
1097
//-----------------------------------------------------------------------------
// Emits one SHA-512 round interleaved with one quarter of the message
// schedule that produces the next four message qwords.
//
// ymm4..ymm7 hold the 16 most recent message qwords w[t-16..t-1], four per
// register; the caller rotates the vector registers after each group of four
// rounds.  The vector work is split across the group by iteration % 4:
//   0: form w[t-16] + w[t-7] and start sigma0(w[t-15])  (ror 1, shr 7)
//   1: finish sigma0, add it in, and position operands for sigma1 of the
//      first two new qwords (w[16], w[17] of the group)
//   2: sigma1 for w[16]/w[17], producing those two qwords in ymm4
//   3: sigma1 for w[18]/w[19], then blend all four new qwords into ymm4
// Expects the DC00 lane mask in ymm10 (loaded by sha512_AVX2); uses
// ymm0-ymm3 and ymm8 as vector scratch.
//
// The scalar tail is one complete (non-pipelined) SHA-512 round on a..h;
// k[t] + w[t] is read from the XFER area at rsp + 8*iteration.
// Scratch: r12-r15 and y3 (rcx on Win64, rdi elsewhere); flags clobbered.
void MacroAssembler::sha512_AVX2_one_round_and_schedule(
    XMMRegister xmm4, // ymm4: w[t-16..t-13] on entry
    XMMRegister xmm5, // ymm5: w[t-12..t-9]
    XMMRegister xmm6, // ymm6: w[t-8..t-5]
    XMMRegister xmm7, // ymm7: w[t-4..t-1]
    Register a, //rax
    Register b, //rbx
    Register c, //rdi
    Register d, //rsi
    Register e, //r8
    Register f, //r9
    Register g, //r10
    Register h, //r11
    int iteration)
{

    const Register& y0 = r13;
    const Register& y1 = r14;
    const Register& y2 = r15;
#ifdef _WIN64
    // On Win64 rdi carries working variable 'c' (see sha512_AVX2), so use rcx.
    const Register& y3 = rcx;
#else
    const Register& y3 = rdi;
#endif
    const Register& T1 = r12;

    if (iteration % 4 == 0) {
      // Extract w[t - 7]
      // xmm0 = W[-7]
      vperm2f128(xmm0, xmm7, xmm6, 3);
      vpalignr(xmm0, xmm0, xmm6, 8, AVX_256bit);

      // Calculate w[t - 16] + w[t - 7]
      vpaddq(xmm0, xmm0, xmm4, AVX_256bit); //xmm0 = W[-7] + W[-16]
      // Extract w[t - 15]
      //xmm1 = W[-15]
      vperm2f128(xmm1, xmm5, xmm4, 3);
      vpalignr(xmm1, xmm1, xmm4, 8, AVX_256bit);

      // Calculate sigma0
      // Calculate w[t - 15] ror 1
      vpsrlq(xmm2, xmm1, 1, AVX_256bit);
      vpsllq(xmm3, xmm1, (64 - 1), AVX_256bit);
      vpor(xmm3, xmm3, xmm2, AVX_256bit); //xmm3 = W[-15] ror 1
      // Calculate w[t - 15] shr 7
      vpsrlq(xmm8, xmm1, 7, AVX_256bit); //xmm8 = W[-15] >> 7

    } else if (iteration % 4 == 1) {
      //Calculate w[t - 15] ror 8
      vpsrlq(xmm2, xmm1, 8, AVX_256bit);
      vpsllq(xmm1, xmm1, (64 - 8), AVX_256bit);
      vpor(xmm1, xmm1, xmm2, AVX_256bit); //xmm1 = W[-15] ror 8

      //XOR the three components
      vpxor(xmm3, xmm3, xmm8, AVX_256bit); //xmm3 = W[-15] ror 1 ^ W[-15] >> 7
      vpxor(xmm1, xmm3, xmm1, AVX_256bit); //xmm1 = s0

      //Add three components, w[t - 16], w[t - 7] and sigma0
      vpaddq(xmm0, xmm0, xmm1, AVX_256bit); //xmm0 = W[-16] + W[-7] + s0

      // Move to appropriate lanes for calculating w[16] and w[17]
      vperm2f128(xmm4, xmm0, xmm0, 0); //xmm4 = W[-16] + W[-7] + s0{ BABA }

      //Move to appropriate lanes for calculating w[18] and w[19]
      vpand(xmm0, xmm0, xmm10, AVX_256bit); //xmm0 = W[-16] + W[-7] + s0{ DC00 }
      //Calculate w[16] and w[17] in both 128 bit lanes
      //Calculate sigma1 for w[16] and w[17] on both 128 bit lanes
      vperm2f128(xmm2, xmm7, xmm7, 17); //xmm2 = W[-2] {BABA}
      vpsrlq(xmm8, xmm2, 6, AVX_256bit); //xmm8 = W[-2] >> 6 {BABA}

    } else if (iteration % 4 == 2) {
      vpsrlq(xmm3, xmm2, 19, AVX_256bit); //xmm3 = W[-2] >> 19 {BABA}
      vpsllq(xmm1, xmm2, (64 - 19), AVX_256bit); //xmm1 = W[-2] << 19 {BABA}
      vpor(xmm3, xmm3, xmm1, AVX_256bit); //xmm3 = W[-2] ror 19 {BABA}
      vpxor(xmm8, xmm8, xmm3, AVX_256bit);// xmm8 = W[-2] ror 19 ^ W[-2] >> 6 {BABA}
      vpsrlq(xmm3, xmm2, 61, AVX_256bit); //xmm3 = W[-2] >> 61 {BABA}
      vpsllq(xmm1, xmm2, (64 - 61), AVX_256bit); //xmm1 = W[-2] << 61 {BABA}
      vpor(xmm3, xmm3, xmm1, AVX_256bit); //xmm3 = W[-2] ror 61 {BABA}
      vpxor(xmm8, xmm8, xmm3, AVX_256bit); //xmm8 = s1 = (W[-2] ror 19) ^ (W[-2] ror 61) ^ (W[-2] >> 6) { BABA }

      //Add sigma1 to the other components to get w[16] and w[17]
      vpaddq(xmm4, xmm4, xmm8, AVX_256bit); //xmm4 = { W[1], W[0], W[1], W[0] }

      //Calculate sigma1 for w[18] and w[19] for upper 128 bit lane
      vpsrlq(xmm8, xmm4, 6, AVX_256bit); //xmm8 = W[-2] >> 6 {DC--}

    } else if (iteration % 4 == 3){
      vpsrlq(xmm3, xmm4, 19, AVX_256bit); //xmm3 = W[-2] >> 19 {DC--}
      vpsllq(xmm1, xmm4, (64 - 19), AVX_256bit); //xmm1 = W[-2] << 19 {DC--}
      vpor(xmm3, xmm3, xmm1, AVX_256bit); //xmm3 = W[-2] ror 19 {DC--}
      vpxor(xmm8, xmm8, xmm3, AVX_256bit); //xmm8 = W[-2] ror 19 ^ W[-2] >> 6 {DC--}
      vpsrlq(xmm3, xmm4, 61, AVX_256bit); //xmm3 = W[-2] >> 61 {DC--}
      vpsllq(xmm1, xmm4, (64 - 61), AVX_256bit); //xmm1 = W[-2] << 61 {DC--}
      vpor(xmm3, xmm3, xmm1, AVX_256bit); //xmm3 = W[-2] ror 61 {DC--}
      vpxor(xmm8, xmm8, xmm3, AVX_256bit); //xmm8 = s1 = (W[-2] ror 19) ^ (W[-2] ror 61) ^ (W[-2] >> 6) { DC-- }

      //Add the sigma0 + w[t - 7] + w[t - 16] for w[18] and w[19] to newly calculated sigma1 to get w[18] and w[19]
      vpaddq(xmm2, xmm0, xmm8, AVX_256bit); //xmm2 = { W[3], W[2], --, -- }

      //Form w[19, w[18], w17], w[16]
      vpblendd(xmm4, xmm4, xmm2, 0xF0, AVX_256bit); //xmm4 = { W[3], W[2], W[1], W[0] }
    }

    // Scalar part: one full SHA-512 round on a..h (not pipelined — this round
    // completes 'h' inline, unlike sha512_AVX2_one_round_compute).
    movq(y3, a); //y3 = a; MAJA
    rorxq(y0, e, 41); // y0 = e >> 41; S1A
    rorxq(y1, e, 18); //y1 = e >> 18; S1B
    addq(h, Address(rsp, (iteration * 8))); //h = k + w + h; --
    orq(y3, c); //y3 = a | c; MAJA
    movq(y2, f); //y2 = f; CH

    xorq(y2, g); //y2 = f^g; CH

    rorxq(T1, a, 34); //T1 = a >> 34; S0B
    xorq(y0, y1); //y0 = (e >> 41) ^ (e >> 18); S1

    rorxq(y1, e, 14); //y1 = (e >> 14); S1

    andq(y2, e); //y2 = (f^g) & e; CH
    addq(d, h); //d = k + w + h + d; --

    andq(y3, b); //y3 = (a | c)&b; MAJA
    xorq(y0, y1); //y0 = (e >> 41) ^ (e >> 18) ^ (e >> 14); S1
    rorxq(y1, a, 39); //y1 = a >> 39; S0A

    xorq(y1, T1); //y1 = (a >> 39) ^ (a >> 34); S0
    rorxq(T1, a, 28); //T1 = (a >> 28); S0
    xorq(y2, g); //y2 = CH = ((f^g)&e) ^ g; CH

    xorq(y1, T1); //y1 = (a >> 39) ^ (a >> 34) ^ (a >> 28); S0
    movq(T1, a); //T1 = a; MAJB

    andq(T1, c); //T1 = a&c; MAJB
    addq(y2, y0); //y2 = S1 + CH; --

    orq(y3, T1); //y3 = MAJ = (a | c)&b) | (a&c); MAJ
    addq(h, y1); //h = k + w + h + S0; --

    addq(d, y2); //d = k + w + h + d + S1 + CH = d + t1; --
    addq(h, y2); //h = k + w + h + S0 + S1 + CH = t1 + S0; --
    addq(h, y3); //h = t1 + S0 + MAJ; --
}
1239
// Generates the SHA-512 compression function using AVX2, processing the
// input in 128-byte blocks.
//
// Per-block structure (label loop0):
//   - byte-swap the 16 input qwords into ymm4..ymm7,
//   - loop1: 4 passes, each emitting 16 rounds interleaved with message
//     scheduling (sha512_AVX2_one_round_and_schedule),
//   - loop2: 2 passes of 8 rounds with no scheduling
//     (sha512_AVX2_one_round_compute),
//   - add the working variables back into the digest at CTX, advance INP.
//
// NOTE(review): the XMMRegister/Register parameters mirror the SHA-1 and
// SHA-256 entry points but are not referenced below; this implementation
// names the ABI registers directly via the const Register& aliases.
//
// For multi_block, ofs/limit are saved on entry and used twice: first to
// compute the number of bytes to hash, then at the very end to leave the
// updated offset in rax (presumably the stub's int return value — confirm
// against the stub generator).
void MacroAssembler::sha512_AVX2(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
                                 XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
                                 Register buf, Register state, Register ofs, Register limit, Register rsp,
                                 bool multi_block, XMMRegister shuf_mask)
{

    Label loop0, loop1, loop2, done_hash,
    compute_block_size, compute_size,
    compute_block_size_end, compute_size_end;

    address K512_W = StubRoutines::x86::k512_W_addr();
    address pshuffle_byte_flip_mask_sha512 = StubRoutines::x86::pshuffle_byte_flip_mask_addr_sha512();
    address pshuffle_byte_flip_mask_addr = 0;

    const XMMRegister& XFER = xmm0; // YTMP0
    const XMMRegister& BYTE_FLIP_MASK = xmm9; // ymm9
    const XMMRegister& YMM_MASK_LO = xmm10; // ymm10, DC00 lane mask for the schedule
#ifdef _WIN64
    const Register& INP = rcx; //1st arg
    const Register& CTX = rdx; //2nd arg
    const Register& NUM_BLKS = r8; //3rd arg
    const Register& c = rdi;
    const Register& d = rsi;
    const Register& e = r8;
    const Register& y3 = rcx;
    const Register& offset = r8;
    const Register& input_limit = r9;
#else
    const Register& INP = rdi; //1st arg
    const Register& CTX = rsi; //2nd arg
    const Register& NUM_BLKS = rdx; //3rd arg
    const Register& c  = rcx;
    const Register& d  = r8;
    const Register& e  = rdx;
    const Register& y3 = rdi;
    const Register& offset = rdx;
    const Register& input_limit = rcx;
#endif

    const Register& TBL = rbp; // pointer into the K512 round-constant table

    const Register& a = rax;
    const Register& b = rbx;

    const Register& f = r9;
    const Register& g = r10;
    const Register& h = r11;

    //Local variables as defined in assembly file.
    enum
    {
      _XFER_SIZE = 4 * 8, // resq 4 => reserve 4 quadwords. Hence 4 * 8
      _SRND_SIZE = 8, // resq 1
      _INP_SIZE = 8,
      _INP_END_SIZE = 8,
      _RSP_SAVE_SIZE = 8,  // defined as resq 1
#ifdef _WIN64
      _GPR_SAVE_SIZE = 8 * 8, // defined as resq 8
#else
      _GPR_SAVE_SIZE = 6 * 8 // resq 6
#endif
    };

    // Offsets of the locals within the aligned stack frame.
    enum
    {
      _XFER = 0,
      _SRND = _XFER + _XFER_SIZE, // 32
      _INP = _SRND + _SRND_SIZE, // 40
      _INP_END = _INP + _INP_SIZE, // 48
      _RSP = _INP_END + _INP_END_SIZE, // 56
      _GPR = _RSP + _RSP_SAVE_SIZE, // 64
      _STACK_SIZE = _GPR + _GPR_SAVE_SIZE // 128 for windows and 112 for linux.
    };

//Saving offset and limit as it will help with blocksize calculation for multiblock SHA512.
#ifdef _WIN64
    push(r8);    // win64: this is ofs
    push(r9);    // win64: this is limit, we need them again at the very end.
#else
    push(rdx);   // linux : this is ofs, need at the end for multiblock calculation
    push(rcx);   // linux: This is the limit.
#endif

    //Allocate Stack Space; keep the original rsp in _RSP so it can be
    //restored after the 32-byte alignment below.
    movq(rax, rsp);
    subq(rsp, _STACK_SIZE);
    andq(rsp, -32);
    movq(Address(rsp, _RSP), rax);

    //Save GPRs
    movq(Address(rsp, _GPR), rbp);
    movq(Address(rsp, (_GPR + 8)), rbx);
    movq(Address(rsp, (_GPR + 16)), r12);
    movq(Address(rsp, (_GPR + 24)), r13);
    movq(Address(rsp, (_GPR + 32)), r14);
    movq(Address(rsp, (_GPR + 40)), r15);

#ifdef _WIN64
    // rsi/rdi are callee-saved on Win64 but used as working variables here.
    movq(Address(rsp, (_GPR + 48)), rsi);
    movq(Address(rsp, (_GPR + 56)), rdi);
#endif

    // NOTE(review): these two blends have no data effect on the algorithm's
    // state; presumably they warm up the AVX/VEX upper-lane state before the
    // 256-bit code below — confirm intent.
    vpblendd(xmm0, xmm0, xmm1, 0xF0, AVX_128bit);
    vpblendd(xmm0, xmm0, xmm1, 0xF0, AVX_256bit);

    if (multi_block) {
      // rax = number of bytes to hash = count of whole 128-byte blocks
      // between ofs and limit.
      xorq(rax, rax);
      bind(compute_block_size);
      cmpptr(offset, input_limit); // Assuming that offset is less than limit.
      jccb(Assembler::aboveEqual, compute_block_size_end);
      addq(offset, 128);
      addq(rax, 128);
      jmpb(compute_block_size);

      bind(compute_block_size_end);
      movq(NUM_BLKS, rax);

      cmpq(NUM_BLKS, 0);
      jcc(Assembler::equal, done_hash);
    } else {
      xorq(NUM_BLKS, NUM_BLKS); //If single block.
      addq(NUM_BLKS, 128);
    }

    addq(NUM_BLKS, INP); //pointer to end of data
    movq(Address(rsp, _INP_END), NUM_BLKS);

    //load initial digest
    movq(a, Address(CTX, 8 * 0));
    movq(b, Address(CTX, 8 * 1));
    movq(c, Address(CTX, 8 * 2));
    movq(d, Address(CTX, 8 * 3));
    movq(e, Address(CTX, 8 * 4));
    movq(f, Address(CTX, 8 * 5));
    // load g - r10 after it is used as scratch
    movq(h, Address(CTX, 8 * 7));

    pshuffle_byte_flip_mask_addr = pshuffle_byte_flip_mask_sha512;
    vmovdqu(BYTE_FLIP_MASK, ExternalAddress(pshuffle_byte_flip_mask_addr + 0)); //PSHUFFLE_BYTE_FLIP_MASK wrt rip
    vmovdqu(YMM_MASK_LO, ExternalAddress(pshuffle_byte_flip_mask_addr + 32));

    movq(g, Address(CTX, 8 * 6));

    // Top of the per-128-byte-block loop.
    bind(loop0);
    lea(TBL, ExternalAddress(K512_W));

    //byte swap first 16 dwords
    vmovdqu(xmm4, Address(INP, 32 * 0));
    vpshufb(xmm4, xmm4, BYTE_FLIP_MASK, AVX_256bit);
    vmovdqu(xmm5, Address(INP, 32 * 1));
    vpshufb(xmm5, xmm5, BYTE_FLIP_MASK, AVX_256bit);
    vmovdqu(xmm6, Address(INP, 32 * 2));
    vpshufb(xmm6, xmm6, BYTE_FLIP_MASK, AVX_256bit);
    vmovdqu(xmm7, Address(INP, 32 * 3));
    vpshufb(xmm7, xmm7, BYTE_FLIP_MASK, AVX_256bit);

    movq(Address(rsp, _INP), INP);

    // _SRND = 4: loop1 below runs four passes of 16 rounds each.
    movslq(Address(rsp, _SRND), 4);
    align(16);

    //Schedule 64 input dwords, by calling sha512_AVX2_one_round_and_schedule
    bind(loop1);
    vpaddq(xmm0, xmm4, Address(TBL, 0 * 32), AVX_256bit);
    vmovdqu(Address(rsp, _XFER), xmm0);
    //four rounds and schedule
    sha512_AVX2_one_round_and_schedule(xmm4, xmm5, xmm6, xmm7, a, b, c, d, e, f, g, h, 0);
    sha512_AVX2_one_round_and_schedule(xmm4, xmm5, xmm6, xmm7, h, a, b, c, d, e, f, g, 1);
    sha512_AVX2_one_round_and_schedule(xmm4, xmm5, xmm6, xmm7, g, h, a, b, c, d, e, f, 2);
    sha512_AVX2_one_round_and_schedule(xmm4, xmm5, xmm6, xmm7, f, g, h, a, b, c, d, e, 3);

    vpaddq(xmm0, xmm5, Address(TBL, 1 * 32), AVX_256bit);
    vmovdqu(Address(rsp, _XFER), xmm0);
    //four rounds and schedule
    sha512_AVX2_one_round_and_schedule(xmm5, xmm6, xmm7, xmm4, e, f, g, h, a, b, c, d, 0);
    sha512_AVX2_one_round_and_schedule(xmm5, xmm6, xmm7, xmm4, d, e, f, g, h, a, b, c, 1);
    sha512_AVX2_one_round_and_schedule(xmm5, xmm6, xmm7, xmm4, c, d, e, f, g, h, a, b, 2);
    sha512_AVX2_one_round_and_schedule(xmm5, xmm6, xmm7, xmm4, b, c, d, e, f, g, h, a, 3);

    vpaddq(xmm0, xmm6, Address(TBL, 2 * 32), AVX_256bit);
    vmovdqu(Address(rsp, _XFER), xmm0);
    //four rounds and schedule
    sha512_AVX2_one_round_and_schedule(xmm6, xmm7, xmm4, xmm5, a, b, c, d, e, f, g, h, 0);
    sha512_AVX2_one_round_and_schedule(xmm6, xmm7, xmm4, xmm5, h, a, b, c, d, e, f, g, 1);
    sha512_AVX2_one_round_and_schedule(xmm6, xmm7, xmm4, xmm5, g, h, a, b, c, d, e, f, 2);
    sha512_AVX2_one_round_and_schedule(xmm6, xmm7, xmm4, xmm5, f, g, h, a, b, c, d, e, 3);

    vpaddq(xmm0, xmm7, Address(TBL, 3 * 32), AVX_256bit);
    vmovdqu(Address(rsp, _XFER), xmm0);
    addq(TBL, 4 * 32); // advance past the 16 constants consumed this pass
    //four rounds and schedule
    sha512_AVX2_one_round_and_schedule(xmm7, xmm4, xmm5, xmm6, e, f, g, h, a, b, c, d, 0);
    sha512_AVX2_one_round_and_schedule(xmm7, xmm4, xmm5, xmm6, d, e, f, g, h, a, b, c, 1);
    sha512_AVX2_one_round_and_schedule(xmm7, xmm4, xmm5, xmm6, c, d, e, f, g, h, a, b, 2);
    sha512_AVX2_one_round_and_schedule(xmm7, xmm4, xmm5, xmm6, b, c, d, e, f, g, h, a, 3);

    subq(Address(rsp, _SRND), 1);
    jcc(Assembler::notEqual, loop1);

    // _SRND = 2: loop2 runs two passes of 8 compute-only rounds.
    movslq(Address(rsp, _SRND), 2);

    bind(loop2);
    vpaddq(xmm0, xmm4, Address(TBL, 0 * 32), AVX_256bit);
    vmovdqu(Address(rsp, _XFER), xmm0);
    //four rounds and compute.
    sha512_AVX2_one_round_compute(a, a, b, c, d, e, f, g, h, 0);
    sha512_AVX2_one_round_compute(h, h, a, b, c, d, e, f, g, 1);
    sha512_AVX2_one_round_compute(g, g, h, a, b, c, d, e, f, 2);
    sha512_AVX2_one_round_compute(f, f, g, h, a, b, c, d, e, 3);

    vpaddq(xmm0, xmm5, Address(TBL, 1 * 32), AVX_256bit);
    vmovdqu(Address(rsp, _XFER), xmm0);
    addq(TBL, 2 * 32);
    // four rounds and compute.
    sha512_AVX2_one_round_compute(e, e, f, g, h, a, b, c, d, 0);
    sha512_AVX2_one_round_compute(d, d, e, f, g, h, a, b, c, 1);
    sha512_AVX2_one_round_compute(c, c, d, e, f, g, h, a, b, 2);
    sha512_AVX2_one_round_compute(b, b, c, d, e, f, g, h, a, 3);

    // Rotate the remaining scheduled message qwords down for the next pass.
    vmovdqu(xmm4, xmm6);
    vmovdqu(xmm5, xmm7);

    subq(Address(rsp, _SRND), 1);
    jcc(Assembler::notEqual, loop2);

    // Add the working variables back into the digest state.
    addmq(8 * 0, CTX, a);
    addmq(8 * 1, CTX, b);
    addmq(8 * 2, CTX, c);
    addmq(8 * 3, CTX, d);
    addmq(8 * 4, CTX, e);
    addmq(8 * 5, CTX, f);
    addmq(8 * 6, CTX, g);
    addmq(8 * 7, CTX, h);

    movq(INP, Address(rsp, _INP));
    addq(INP, 128);
    cmpq(INP, Address(rsp, _INP_END));
    jcc(Assembler::notEqual, loop0);

    bind(done_hash);

    //Restore GPRs
    movq(rbp, Address(rsp, (_GPR + 0)));
    movq(rbx, Address(rsp, (_GPR + 8)));
    movq(r12, Address(rsp, (_GPR + 16)));
    movq(r13, Address(rsp, (_GPR + 24)));
    movq(r14, Address(rsp, (_GPR + 32)));
    movq(r15, Address(rsp, (_GPR + 40)));

#ifdef _WIN64
    movq(rsi, Address(rsp, (_GPR + 48)));
    movq(rdi, Address(rsp, (_GPR + 56)));
#endif

    //Restore Stack Pointer
    movq(rsp, Address(rsp, _RSP));

    // Recover the ofs/limit values pushed on entry.
#ifdef _WIN64
    pop(r9);
    pop(r8);
#else
    pop(rcx);
    pop(rdx);
#endif

    if (multi_block) {
#ifdef _WIN64
      const Register& limit_end = r9;
      const Register& ofs_end = r8;
#else
      const Register& limit_end = rcx;
      const Register& ofs_end   = rdx;
#endif
      // rax = original ofs advanced by 128 per processed block until it
      // reaches limit; left in rax for the caller.
      movq(rax, ofs_end);
      bind(compute_size);
      cmpptr(rax, limit_end); // assumes the original ofs <= limit
      jccb(Assembler::aboveEqual, compute_size_end);
      addq(rax, 128);
      jmpb(compute_size);
      bind(compute_size_end);
    }
}
1523
1524#endif //#ifdef _LP64
1525
1526