; Exported entry points of this module.
; Three routines (hash_many, compress_in_place, compress_xof; all SSE2) are
; each declared public under two spellings: one with a leading underscore
; and one without.
; NOTE(review): presumably both spellings of a routine label the same entry
; point so that either symbol decoration resolves at link time -- confirm
; against the corresponding PROC definitions.
1public _llvm_blake3_hash_many_sse2
2public llvm_blake3_hash_many_sse2
3public llvm_blake3_compress_in_place_sse2
4public _llvm_blake3_compress_in_place_sse2
5public llvm_blake3_compress_xof_sse2
6public _llvm_blake3_compress_xof_sse2
7
8_TEXT   SEGMENT ALIGN(16) 'CODE'
9
10ALIGN   16
11llvm_blake3_hash_many_sse2 PROC
12_llvm_blake3_hash_many_sse2 PROC
13        push    r15
14        push    r14
15        push    r13
16        push    r12
17        push    rsi
18        push    rdi
19        push    rbx
20        push    rbp
21        mov     rbp, rsp
22        sub     rsp, 528
23        and     rsp, 0FFFFFFFFFFFFFFC0H
24        movdqa  xmmword ptr [rsp+170H], xmm6
25        movdqa  xmmword ptr [rsp+180H], xmm7
26        movdqa  xmmword ptr [rsp+190H], xmm8
27        movdqa  xmmword ptr [rsp+1A0H], xmm9
28        movdqa  xmmword ptr [rsp+1B0H], xmm10
29        movdqa  xmmword ptr [rsp+1C0H], xmm11
30        movdqa  xmmword ptr [rsp+1D0H], xmm12
31        movdqa  xmmword ptr [rsp+1E0H], xmm13
32        movdqa  xmmword ptr [rsp+1F0H], xmm14
33        movdqa  xmmword ptr [rsp+200H], xmm15
34        mov     rdi, rcx
35        mov     rsi, rdx
36        mov     rdx, r8
37        mov     rcx, r9
38        mov     r8, qword ptr [rbp+68H]
39        movzx   r9, byte ptr [rbp+70H]
40        neg     r9d
41        movd    xmm0, r9d
42        pshufd  xmm0, xmm0, 00H
43        movdqa  xmmword ptr [rsp+130H], xmm0
44        movdqa  xmm1, xmm0
45        pand    xmm1, xmmword ptr [ADD0]
46        pand    xmm0, xmmword ptr [ADD1]
47        movdqa  xmmword ptr [rsp+150H], xmm0
48        movd    xmm0, r8d
49        pshufd  xmm0, xmm0, 00H
50        paddd   xmm0, xmm1
51        movdqa  xmmword ptr [rsp+110H], xmm0
52        pxor    xmm0, xmmword ptr [CMP_MSB_MASK]
53        pxor    xmm1, xmmword ptr [CMP_MSB_MASK]
54        pcmpgtd xmm1, xmm0
55        shr     r8, 32
56        movd    xmm2, r8d
57        pshufd  xmm2, xmm2, 00H
58        psubd   xmm2, xmm1
59        movdqa  xmmword ptr [rsp+120H], xmm2
60        mov     rbx, qword ptr [rbp+90H]
61        mov     r15, rdx
62        shl     r15, 6
63        movzx   r13d, byte ptr [rbp+78H]
64        movzx   r12d, byte ptr [rbp+88H]
65        cmp     rsi, 4
66        jc      final3blocks
67outerloop4:
68        movdqu  xmm3, xmmword ptr [rcx]
69        pshufd  xmm0, xmm3, 00H
70        pshufd  xmm1, xmm3, 55H
71        pshufd  xmm2, xmm3, 0AAH
72        pshufd  xmm3, xmm3, 0FFH
73        movdqu  xmm7, xmmword ptr [rcx+10H]
74        pshufd  xmm4, xmm7, 00H
75        pshufd  xmm5, xmm7, 55H
76        pshufd  xmm6, xmm7, 0AAH
77        pshufd  xmm7, xmm7, 0FFH
78        mov     r8, qword ptr [rdi]
79        mov     r9, qword ptr [rdi+8H]
80        mov     r10, qword ptr [rdi+10H]
81        mov     r11, qword ptr [rdi+18H]
82        movzx   eax, byte ptr [rbp+80H]
83        or      eax, r13d
84        xor     edx, edx
85innerloop4:
86        mov     r14d, eax
87        or      eax, r12d
88        add     rdx, 64
89        cmp     rdx, r15
90        cmovne  eax, r14d
91        movdqu  xmm8, xmmword ptr [r8+rdx-40H]
92        movdqu  xmm9, xmmword ptr [r9+rdx-40H]
93        movdqu  xmm10, xmmword ptr [r10+rdx-40H]
94        movdqu  xmm11, xmmword ptr [r11+rdx-40H]
95        movdqa  xmm12, xmm8
96        punpckldq xmm8, xmm9
97        punpckhdq xmm12, xmm9
98        movdqa  xmm14, xmm10
99        punpckldq xmm10, xmm11
100        punpckhdq xmm14, xmm11
101        movdqa  xmm9, xmm8
102        punpcklqdq xmm8, xmm10
103        punpckhqdq xmm9, xmm10
104        movdqa  xmm13, xmm12
105        punpcklqdq xmm12, xmm14
106        punpckhqdq xmm13, xmm14
107        movdqa  xmmword ptr [rsp], xmm8
108        movdqa  xmmword ptr [rsp+10H], xmm9
109        movdqa  xmmword ptr [rsp+20H], xmm12
110        movdqa  xmmword ptr [rsp+30H], xmm13
111        movdqu  xmm8, xmmword ptr [r8+rdx-30H]
112        movdqu  xmm9, xmmword ptr [r9+rdx-30H]
113        movdqu  xmm10, xmmword ptr [r10+rdx-30H]
114        movdqu  xmm11, xmmword ptr [r11+rdx-30H]
115        movdqa  xmm12, xmm8
116        punpckldq xmm8, xmm9
117        punpckhdq xmm12, xmm9
118        movdqa  xmm14, xmm10
119        punpckldq xmm10, xmm11
120        punpckhdq xmm14, xmm11
121        movdqa  xmm9, xmm8
122        punpcklqdq xmm8, xmm10
123        punpckhqdq xmm9, xmm10
124        movdqa  xmm13, xmm12
125        punpcklqdq xmm12, xmm14
126        punpckhqdq xmm13, xmm14
127        movdqa  xmmword ptr [rsp+40H], xmm8
128        movdqa  xmmword ptr [rsp+50H], xmm9
129        movdqa  xmmword ptr [rsp+60H], xmm12
130        movdqa  xmmword ptr [rsp+70H], xmm13
131        movdqu  xmm8, xmmword ptr [r8+rdx-20H]
132        movdqu  xmm9, xmmword ptr [r9+rdx-20H]
133        movdqu  xmm10, xmmword ptr [r10+rdx-20H]
134        movdqu  xmm11, xmmword ptr [r11+rdx-20H]
135        movdqa  xmm12, xmm8
136        punpckldq xmm8, xmm9
137        punpckhdq xmm12, xmm9
138        movdqa  xmm14, xmm10
139        punpckldq xmm10, xmm11
140        punpckhdq xmm14, xmm11
141        movdqa  xmm9, xmm8
142        punpcklqdq xmm8, xmm10
143        punpckhqdq xmm9, xmm10
144        movdqa  xmm13, xmm12
145        punpcklqdq xmm12, xmm14
146        punpckhqdq xmm13, xmm14
147        movdqa  xmmword ptr [rsp+80H], xmm8
148        movdqa  xmmword ptr [rsp+90H], xmm9
149        movdqa  xmmword ptr [rsp+0A0H], xmm12
150        movdqa  xmmword ptr [rsp+0B0H], xmm13
151        movdqu  xmm8, xmmword ptr [r8+rdx-10H]
152        movdqu  xmm9, xmmword ptr [r9+rdx-10H]
153        movdqu  xmm10, xmmword ptr [r10+rdx-10H]
154        movdqu  xmm11, xmmword ptr [r11+rdx-10H]
155        movdqa  xmm12, xmm8
156        punpckldq xmm8, xmm9
157        punpckhdq xmm12, xmm9
158        movdqa  xmm14, xmm10
159        punpckldq xmm10, xmm11
160        punpckhdq xmm14, xmm11
161        movdqa  xmm9, xmm8
162        punpcklqdq xmm8, xmm10
163        punpckhqdq xmm9, xmm10
164        movdqa  xmm13, xmm12
165        punpcklqdq xmm12, xmm14
166        punpckhqdq xmm13, xmm14
167        movdqa  xmmword ptr [rsp+0C0H], xmm8
168        movdqa  xmmword ptr [rsp+0D0H], xmm9
169        movdqa  xmmword ptr [rsp+0E0H], xmm12
170        movdqa  xmmword ptr [rsp+0F0H], xmm13
171        movdqa  xmm9, xmmword ptr [BLAKE3_IV_1]
172        movdqa  xmm10, xmmword ptr [BLAKE3_IV_2]
173        movdqa  xmm11, xmmword ptr [BLAKE3_IV_3]
174        movdqa  xmm12, xmmword ptr [rsp+110H]
175        movdqa  xmm13, xmmword ptr [rsp+120H]
176        movdqa  xmm14, xmmword ptr [BLAKE3_BLOCK_LEN]
177        movd    xmm15, eax
178        pshufd  xmm15, xmm15, 00H
179        prefetcht0 byte ptr [r8+rdx+80H]
180        prefetcht0 byte ptr [r9+rdx+80H]
181        prefetcht0 byte ptr [r10+rdx+80H]
182        prefetcht0 byte ptr [r11+rdx+80H]
183        paddd   xmm0, xmmword ptr [rsp]
184        paddd   xmm1, xmmword ptr [rsp+20H]
185        paddd   xmm2, xmmword ptr [rsp+40H]
186        paddd   xmm3, xmmword ptr [rsp+60H]
187        paddd   xmm0, xmm4
188        paddd   xmm1, xmm5
189        paddd   xmm2, xmm6
190        paddd   xmm3, xmm7
191        pxor    xmm12, xmm0
192        pxor    xmm13, xmm1
193        pxor    xmm14, xmm2
194        pxor    xmm15, xmm3
195        pshuflw xmm12, xmm12, 0B1H
196        pshufhw xmm12, xmm12, 0B1H
197        pshuflw xmm13, xmm13, 0B1H
198        pshufhw xmm13, xmm13, 0B1H
199        pshuflw xmm14, xmm14, 0B1H
200        pshufhw xmm14, xmm14, 0B1H
201        pshuflw xmm15, xmm15, 0B1H
202        pshufhw xmm15, xmm15, 0B1H
203        movdqa  xmm8, xmmword ptr [BLAKE3_IV_0]
204        paddd   xmm8, xmm12
205        paddd   xmm9, xmm13
206        paddd   xmm10, xmm14
207        paddd   xmm11, xmm15
208        pxor    xmm4, xmm8
209        pxor    xmm5, xmm9
210        pxor    xmm6, xmm10
211        pxor    xmm7, xmm11
212        movdqa  xmmword ptr [rsp+100H], xmm8
213        movdqa  xmm8, xmm4
214        psrld   xmm8, 12
215        pslld   xmm4, 20
216        por     xmm4, xmm8
217        movdqa  xmm8, xmm5
218        psrld   xmm8, 12
219        pslld   xmm5, 20
220        por     xmm5, xmm8
221        movdqa  xmm8, xmm6
222        psrld   xmm8, 12
223        pslld   xmm6, 20
224        por     xmm6, xmm8
225        movdqa  xmm8, xmm7
226        psrld   xmm8, 12
227        pslld   xmm7, 20
228        por     xmm7, xmm8
229        paddd   xmm0, xmmword ptr [rsp+10H]
230        paddd   xmm1, xmmword ptr [rsp+30H]
231        paddd   xmm2, xmmword ptr [rsp+50H]
232        paddd   xmm3, xmmword ptr [rsp+70H]
233        paddd   xmm0, xmm4
234        paddd   xmm1, xmm5
235        paddd   xmm2, xmm6
236        paddd   xmm3, xmm7
237        pxor    xmm12, xmm0
238        pxor    xmm13, xmm1
239        pxor    xmm14, xmm2
240        pxor    xmm15, xmm3
241        movdqa  xmm8, xmm12
242        psrld   xmm12, 8
243        pslld   xmm8, 24
244        pxor    xmm12, xmm8
245        movdqa  xmm8, xmm13
246        psrld   xmm13, 8
247        pslld   xmm8, 24
248        pxor    xmm13, xmm8
249        movdqa  xmm8, xmm14
250        psrld   xmm14, 8
251        pslld   xmm8, 24
252        pxor    xmm14, xmm8
253        movdqa  xmm8, xmm15
254        psrld   xmm15, 8
255        pslld   xmm8, 24
256        pxor    xmm15, xmm8
257        movdqa  xmm8, xmmword ptr [rsp+100H]
258        paddd   xmm8, xmm12
259        paddd   xmm9, xmm13
260        paddd   xmm10, xmm14
261        paddd   xmm11, xmm15
262        pxor    xmm4, xmm8
263        pxor    xmm5, xmm9
264        pxor    xmm6, xmm10
265        pxor    xmm7, xmm11
266        movdqa  xmmword ptr [rsp+100H], xmm8
267        movdqa  xmm8, xmm4
268        psrld   xmm8, 7
269        pslld   xmm4, 25
270        por     xmm4, xmm8
271        movdqa  xmm8, xmm5
272        psrld   xmm8, 7
273        pslld   xmm5, 25
274        por     xmm5, xmm8
275        movdqa  xmm8, xmm6
276        psrld   xmm8, 7
277        pslld   xmm6, 25
278        por     xmm6, xmm8
279        movdqa  xmm8, xmm7
280        psrld   xmm8, 7
281        pslld   xmm7, 25
282        por     xmm7, xmm8
283        paddd   xmm0, xmmword ptr [rsp+80H]
284        paddd   xmm1, xmmword ptr [rsp+0A0H]
285        paddd   xmm2, xmmword ptr [rsp+0C0H]
286        paddd   xmm3, xmmword ptr [rsp+0E0H]
287        paddd   xmm0, xmm5
288        paddd   xmm1, xmm6
289        paddd   xmm2, xmm7
290        paddd   xmm3, xmm4
291        pxor    xmm15, xmm0
292        pxor    xmm12, xmm1
293        pxor    xmm13, xmm2
294        pxor    xmm14, xmm3
295        pshuflw xmm15, xmm15, 0B1H
296        pshufhw xmm15, xmm15, 0B1H
297        pshuflw xmm12, xmm12, 0B1H
298        pshufhw xmm12, xmm12, 0B1H
299        pshuflw xmm13, xmm13, 0B1H
300        pshufhw xmm13, xmm13, 0B1H
301        pshuflw xmm14, xmm14, 0B1H
302        pshufhw xmm14, xmm14, 0B1H
303        paddd   xmm10, xmm15
304        paddd   xmm11, xmm12
305        movdqa  xmm8, xmmword ptr [rsp+100H]
306        paddd   xmm8, xmm13
307        paddd   xmm9, xmm14
308        pxor    xmm5, xmm10
309        pxor    xmm6, xmm11
310        pxor    xmm7, xmm8
311        pxor    xmm4, xmm9
312        movdqa  xmmword ptr [rsp+100H], xmm8
313        movdqa  xmm8, xmm5
314        psrld   xmm8, 12
315        pslld   xmm5, 20
316        por     xmm5, xmm8
317        movdqa  xmm8, xmm6
318        psrld   xmm8, 12
319        pslld   xmm6, 20
320        por     xmm6, xmm8
321        movdqa  xmm8, xmm7
322        psrld   xmm8, 12
323        pslld   xmm7, 20
324        por     xmm7, xmm8
325        movdqa  xmm8, xmm4
326        psrld   xmm8, 12
327        pslld   xmm4, 20
328        por     xmm4, xmm8
329        paddd   xmm0, xmmword ptr [rsp+90H]
330        paddd   xmm1, xmmword ptr [rsp+0B0H]
331        paddd   xmm2, xmmword ptr [rsp+0D0H]
332        paddd   xmm3, xmmword ptr [rsp+0F0H]
333        paddd   xmm0, xmm5
334        paddd   xmm1, xmm6
335        paddd   xmm2, xmm7
336        paddd   xmm3, xmm4
337        pxor    xmm15, xmm0
338        pxor    xmm12, xmm1
339        pxor    xmm13, xmm2
340        pxor    xmm14, xmm3
341        movdqa  xmm8, xmm15
342        psrld   xmm15, 8
343        pslld   xmm8, 24
344        pxor    xmm15, xmm8
345        movdqa  xmm8, xmm12
346        psrld   xmm12, 8
347        pslld   xmm8, 24
348        pxor    xmm12, xmm8
349        movdqa  xmm8, xmm13
350        psrld   xmm13, 8
351        pslld   xmm8, 24
352        pxor    xmm13, xmm8
353        movdqa  xmm8, xmm14
354        psrld   xmm14, 8
355        pslld   xmm8, 24
356        pxor    xmm14, xmm8
357        paddd   xmm10, xmm15
358        paddd   xmm11, xmm12
359        movdqa  xmm8, xmmword ptr [rsp+100H]
360        paddd   xmm8, xmm13
361        paddd   xmm9, xmm14
362        pxor    xmm5, xmm10
363        pxor    xmm6, xmm11
364        pxor    xmm7, xmm8
365        pxor    xmm4, xmm9
366        movdqa  xmmword ptr [rsp+100H], xmm8
367        movdqa  xmm8, xmm5
368        psrld   xmm8, 7
369        pslld   xmm5, 25
370        por     xmm5, xmm8
371        movdqa  xmm8, xmm6
372        psrld   xmm8, 7
373        pslld   xmm6, 25
374        por     xmm6, xmm8
375        movdqa  xmm8, xmm7
376        psrld   xmm8, 7
377        pslld   xmm7, 25
378        por     xmm7, xmm8
379        movdqa  xmm8, xmm4
380        psrld   xmm8, 7
381        pslld   xmm4, 25
382        por     xmm4, xmm8
383        paddd   xmm0, xmmword ptr [rsp+20H]
384        paddd   xmm1, xmmword ptr [rsp+30H]
385        paddd   xmm2, xmmword ptr [rsp+70H]
386        paddd   xmm3, xmmword ptr [rsp+40H]
387        paddd   xmm0, xmm4
388        paddd   xmm1, xmm5
389        paddd   xmm2, xmm6
390        paddd   xmm3, xmm7
391        pxor    xmm12, xmm0
392        pxor    xmm13, xmm1
393        pxor    xmm14, xmm2
394        pxor    xmm15, xmm3
395        pshuflw xmm12, xmm12, 0B1H
396        pshufhw xmm12, xmm12, 0B1H
397        pshuflw xmm13, xmm13, 0B1H
398        pshufhw xmm13, xmm13, 0B1H
399        pshuflw xmm14, xmm14, 0B1H
400        pshufhw xmm14, xmm14, 0B1H
401        pshuflw xmm15, xmm15, 0B1H
402        pshufhw xmm15, xmm15, 0B1H
403        movdqa  xmm8, xmmword ptr [rsp+100H]
404        paddd   xmm8, xmm12
405        paddd   xmm9, xmm13
406        paddd   xmm10, xmm14
407        paddd   xmm11, xmm15
408        pxor    xmm4, xmm8
409        pxor    xmm5, xmm9
410        pxor    xmm6, xmm10
411        pxor    xmm7, xmm11
412        movdqa  xmmword ptr [rsp+100H], xmm8
413        movdqa  xmm8, xmm4
414        psrld   xmm8, 12
415        pslld   xmm4, 20
416        por     xmm4, xmm8
417        movdqa  xmm8, xmm5
418        psrld   xmm8, 12
419        pslld   xmm5, 20
420        por     xmm5, xmm8
421        movdqa  xmm8, xmm6
422        psrld   xmm8, 12
423        pslld   xmm6, 20
424        por     xmm6, xmm8
425        movdqa  xmm8, xmm7
426        psrld   xmm8, 12
427        pslld   xmm7, 20
428        por     xmm7, xmm8
429        paddd   xmm0, xmmword ptr [rsp+60H]
430        paddd   xmm1, xmmword ptr [rsp+0A0H]
431        paddd   xmm2, xmmword ptr [rsp]
432        paddd   xmm3, xmmword ptr [rsp+0D0H]
433        paddd   xmm0, xmm4
434        paddd   xmm1, xmm5
435        paddd   xmm2, xmm6
436        paddd   xmm3, xmm7
437        pxor    xmm12, xmm0
438        pxor    xmm13, xmm1
439        pxor    xmm14, xmm2
440        pxor    xmm15, xmm3
441        movdqa  xmm8, xmm12
442        psrld   xmm12, 8
443        pslld   xmm8, 24
444        pxor    xmm12, xmm8
445        movdqa  xmm8, xmm13
446        psrld   xmm13, 8
447        pslld   xmm8, 24
448        pxor    xmm13, xmm8
449        movdqa  xmm8, xmm14
450        psrld   xmm14, 8
451        pslld   xmm8, 24
452        pxor    xmm14, xmm8
453        movdqa  xmm8, xmm15
454        psrld   xmm15, 8
455        pslld   xmm8, 24
456        pxor    xmm15, xmm8
457        movdqa  xmm8, xmmword ptr [rsp+100H]
458        paddd   xmm8, xmm12
459        paddd   xmm9, xmm13
460        paddd   xmm10, xmm14
461        paddd   xmm11, xmm15
462        pxor    xmm4, xmm8
463        pxor    xmm5, xmm9
464        pxor    xmm6, xmm10
465        pxor    xmm7, xmm11
466        movdqa  xmmword ptr [rsp+100H], xmm8
467        movdqa  xmm8, xmm4
468        psrld   xmm8, 7
469        pslld   xmm4, 25
470        por     xmm4, xmm8
471        movdqa  xmm8, xmm5
472        psrld   xmm8, 7
473        pslld   xmm5, 25
474        por     xmm5, xmm8
475        movdqa  xmm8, xmm6
476        psrld   xmm8, 7
477        pslld   xmm6, 25
478        por     xmm6, xmm8
479        movdqa  xmm8, xmm7
480        psrld   xmm8, 7
481        pslld   xmm7, 25
482        por     xmm7, xmm8
483        paddd   xmm0, xmmword ptr [rsp+10H]
484        paddd   xmm1, xmmword ptr [rsp+0C0H]
485        paddd   xmm2, xmmword ptr [rsp+90H]
486        paddd   xmm3, xmmword ptr [rsp+0F0H]
487        paddd   xmm0, xmm5
488        paddd   xmm1, xmm6
489        paddd   xmm2, xmm7
490        paddd   xmm3, xmm4
491        pxor    xmm15, xmm0
492        pxor    xmm12, xmm1
493        pxor    xmm13, xmm2
494        pxor    xmm14, xmm3
495        pshuflw xmm15, xmm15, 0B1H
496        pshufhw xmm15, xmm15, 0B1H
497        pshuflw xmm12, xmm12, 0B1H
498        pshufhw xmm12, xmm12, 0B1H
499        pshuflw xmm13, xmm13, 0B1H
500        pshufhw xmm13, xmm13, 0B1H
501        pshuflw xmm14, xmm14, 0B1H
502        pshufhw xmm14, xmm14, 0B1H
503        paddd   xmm10, xmm15
504        paddd   xmm11, xmm12
505        movdqa  xmm8, xmmword ptr [rsp+100H]
506        paddd   xmm8, xmm13
507        paddd   xmm9, xmm14
508        pxor    xmm5, xmm10
509        pxor    xmm6, xmm11
510        pxor    xmm7, xmm8
511        pxor    xmm4, xmm9
512        movdqa  xmmword ptr [rsp+100H], xmm8
513        movdqa  xmm8, xmm5
514        psrld   xmm8, 12
515        pslld   xmm5, 20
516        por     xmm5, xmm8
517        movdqa  xmm8, xmm6
518        psrld   xmm8, 12
519        pslld   xmm6, 20
520        por     xmm6, xmm8
521        movdqa  xmm8, xmm7
522        psrld   xmm8, 12
523        pslld   xmm7, 20
524        por     xmm7, xmm8
525        movdqa  xmm8, xmm4
526        psrld   xmm8, 12
527        pslld   xmm4, 20
528        por     xmm4, xmm8
529        paddd   xmm0, xmmword ptr [rsp+0B0H]
530        paddd   xmm1, xmmword ptr [rsp+50H]
531        paddd   xmm2, xmmword ptr [rsp+0E0H]
532        paddd   xmm3, xmmword ptr [rsp+80H]
533        paddd   xmm0, xmm5
534        paddd   xmm1, xmm6
535        paddd   xmm2, xmm7
536        paddd   xmm3, xmm4
537        pxor    xmm15, xmm0
538        pxor    xmm12, xmm1
539        pxor    xmm13, xmm2
540        pxor    xmm14, xmm3
541        movdqa  xmm8, xmm15
542        psrld   xmm15, 8
543        pslld   xmm8, 24
544        pxor    xmm15, xmm8
545        movdqa  xmm8, xmm12
546        psrld   xmm12, 8
547        pslld   xmm8, 24
548        pxor    xmm12, xmm8
549        movdqa  xmm8, xmm13
550        psrld   xmm13, 8
551        pslld   xmm8, 24
552        pxor    xmm13, xmm8
553        movdqa  xmm8, xmm14
554        psrld   xmm14, 8
555        pslld   xmm8, 24
556        pxor    xmm14, xmm8
557        paddd   xmm10, xmm15
558        paddd   xmm11, xmm12
559        movdqa  xmm8, xmmword ptr [rsp+100H]
560        paddd   xmm8, xmm13
561        paddd   xmm9, xmm14
562        pxor    xmm5, xmm10
563        pxor    xmm6, xmm11
564        pxor    xmm7, xmm8
565        pxor    xmm4, xmm9
566        movdqa  xmmword ptr [rsp+100H], xmm8
567        movdqa  xmm8, xmm5
568        psrld   xmm8, 7
569        pslld   xmm5, 25
570        por     xmm5, xmm8
571        movdqa  xmm8, xmm6
572        psrld   xmm8, 7
573        pslld   xmm6, 25
574        por     xmm6, xmm8
575        movdqa  xmm8, xmm7
576        psrld   xmm8, 7
577        pslld   xmm7, 25
578        por     xmm7, xmm8
579        movdqa  xmm8, xmm4
580        psrld   xmm8, 7
581        pslld   xmm4, 25
582        por     xmm4, xmm8
583        paddd   xmm0, xmmword ptr [rsp+30H]
584        paddd   xmm1, xmmword ptr [rsp+0A0H]
585        paddd   xmm2, xmmword ptr [rsp+0D0H]
586        paddd   xmm3, xmmword ptr [rsp+70H]
587        paddd   xmm0, xmm4
588        paddd   xmm1, xmm5
589        paddd   xmm2, xmm6
590        paddd   xmm3, xmm7
591        pxor    xmm12, xmm0
592        pxor    xmm13, xmm1
593        pxor    xmm14, xmm2
594        pxor    xmm15, xmm3
595        pshuflw xmm12, xmm12, 0B1H
596        pshufhw xmm12, xmm12, 0B1H
597        pshuflw xmm13, xmm13, 0B1H
598        pshufhw xmm13, xmm13, 0B1H
599        pshuflw xmm14, xmm14, 0B1H
600        pshufhw xmm14, xmm14, 0B1H
601        pshuflw xmm15, xmm15, 0B1H
602        pshufhw xmm15, xmm15, 0B1H
603        movdqa  xmm8, xmmword ptr [rsp+100H]
604        paddd   xmm8, xmm12
605        paddd   xmm9, xmm13
606        paddd   xmm10, xmm14
607        paddd   xmm11, xmm15
608        pxor    xmm4, xmm8
609        pxor    xmm5, xmm9
610        pxor    xmm6, xmm10
611        pxor    xmm7, xmm11
612        movdqa  xmmword ptr [rsp+100H], xmm8
613        movdqa  xmm8, xmm4
614        psrld   xmm8, 12
615        pslld   xmm4, 20
616        por     xmm4, xmm8
617        movdqa  xmm8, xmm5
618        psrld   xmm8, 12
619        pslld   xmm5, 20
620        por     xmm5, xmm8
621        movdqa  xmm8, xmm6
622        psrld   xmm8, 12
623        pslld   xmm6, 20
624        por     xmm6, xmm8
625        movdqa  xmm8, xmm7
626        psrld   xmm8, 12
627        pslld   xmm7, 20
628        por     xmm7, xmm8
629        paddd   xmm0, xmmword ptr [rsp+40H]
630        paddd   xmm1, xmmword ptr [rsp+0C0H]
631        paddd   xmm2, xmmword ptr [rsp+20H]
632        paddd   xmm3, xmmword ptr [rsp+0E0H]
633        paddd   xmm0, xmm4
634        paddd   xmm1, xmm5
635        paddd   xmm2, xmm6
636        paddd   xmm3, xmm7
637        pxor    xmm12, xmm0
638        pxor    xmm13, xmm1
639        pxor    xmm14, xmm2
640        pxor    xmm15, xmm3
641        movdqa  xmm8, xmm12
642        psrld   xmm12, 8
643        pslld   xmm8, 24
644        pxor    xmm12, xmm8
645        movdqa  xmm8, xmm13
646        psrld   xmm13, 8
647        pslld   xmm8, 24
648        pxor    xmm13, xmm8
649        movdqa  xmm8, xmm14
650        psrld   xmm14, 8
651        pslld   xmm8, 24
652        pxor    xmm14, xmm8
653        movdqa  xmm8, xmm15
654        psrld   xmm15, 8
655        pslld   xmm8, 24
656        pxor    xmm15, xmm8
657        movdqa  xmm8, xmmword ptr [rsp+100H]
658        paddd   xmm8, xmm12
659        paddd   xmm9, xmm13
660        paddd   xmm10, xmm14
661        paddd   xmm11, xmm15
662        pxor    xmm4, xmm8
663        pxor    xmm5, xmm9
664        pxor    xmm6, xmm10
665        pxor    xmm7, xmm11
666        movdqa  xmmword ptr [rsp+100H], xmm8
667        movdqa  xmm8, xmm4
668        psrld   xmm8, 7
669        pslld   xmm4, 25
670        por     xmm4, xmm8
671        movdqa  xmm8, xmm5
672        psrld   xmm8, 7
673        pslld   xmm5, 25
674        por     xmm5, xmm8
675        movdqa  xmm8, xmm6
676        psrld   xmm8, 7
677        pslld   xmm6, 25
678        por     xmm6, xmm8
679        movdqa  xmm8, xmm7
680        psrld   xmm8, 7
681        pslld   xmm7, 25
682        por     xmm7, xmm8
683        paddd   xmm0, xmmword ptr [rsp+60H]
684        paddd   xmm1, xmmword ptr [rsp+90H]
685        paddd   xmm2, xmmword ptr [rsp+0B0H]
686        paddd   xmm3, xmmword ptr [rsp+80H]
687        paddd   xmm0, xmm5
688        paddd   xmm1, xmm6
689        paddd   xmm2, xmm7
690        paddd   xmm3, xmm4
691        pxor    xmm15, xmm0
692        pxor    xmm12, xmm1
693        pxor    xmm13, xmm2
694        pxor    xmm14, xmm3
695        pshuflw xmm15, xmm15, 0B1H
696        pshufhw xmm15, xmm15, 0B1H
697        pshuflw xmm12, xmm12, 0B1H
698        pshufhw xmm12, xmm12, 0B1H
699        pshuflw xmm13, xmm13, 0B1H
700        pshufhw xmm13, xmm13, 0B1H
701        pshuflw xmm14, xmm14, 0B1H
702        pshufhw xmm14, xmm14, 0B1H
703        paddd   xmm10, xmm15
704        paddd   xmm11, xmm12
705        movdqa  xmm8, xmmword ptr [rsp+100H]
706        paddd   xmm8, xmm13
707        paddd   xmm9, xmm14
708        pxor    xmm5, xmm10
709        pxor    xmm6, xmm11
710        pxor    xmm7, xmm8
711        pxor    xmm4, xmm9
712        movdqa  xmmword ptr [rsp+100H], xmm8
713        movdqa  xmm8, xmm5
714        psrld   xmm8, 12
715        pslld   xmm5, 20
716        por     xmm5, xmm8
717        movdqa  xmm8, xmm6
718        psrld   xmm8, 12
719        pslld   xmm6, 20
720        por     xmm6, xmm8
721        movdqa  xmm8, xmm7
722        psrld   xmm8, 12
723        pslld   xmm7, 20
724        por     xmm7, xmm8
725        movdqa  xmm8, xmm4
726        psrld   xmm8, 12
727        pslld   xmm4, 20
728        por     xmm4, xmm8
729        paddd   xmm0, xmmword ptr [rsp+50H]
730        paddd   xmm1, xmmword ptr [rsp]
731        paddd   xmm2, xmmword ptr [rsp+0F0H]
732        paddd   xmm3, xmmword ptr [rsp+10H]
733        paddd   xmm0, xmm5
734        paddd   xmm1, xmm6
735        paddd   xmm2, xmm7
736        paddd   xmm3, xmm4
737        pxor    xmm15, xmm0
738        pxor    xmm12, xmm1
739        pxor    xmm13, xmm2
740        pxor    xmm14, xmm3
741        movdqa  xmm8, xmm15
742        psrld   xmm15, 8
743        pslld   xmm8, 24
744        pxor    xmm15, xmm8
745        movdqa  xmm8, xmm12
746        psrld   xmm12, 8
747        pslld   xmm8, 24
748        pxor    xmm12, xmm8
749        movdqa  xmm8, xmm13
750        psrld   xmm13, 8
751        pslld   xmm8, 24
752        pxor    xmm13, xmm8
753        movdqa  xmm8, xmm14
754        psrld   xmm14, 8
755        pslld   xmm8, 24
756        pxor    xmm14, xmm8
757        paddd   xmm10, xmm15
758        paddd   xmm11, xmm12
759        movdqa  xmm8, xmmword ptr [rsp+100H]
760        paddd   xmm8, xmm13
761        paddd   xmm9, xmm14
762        pxor    xmm5, xmm10
763        pxor    xmm6, xmm11
764        pxor    xmm7, xmm8
765        pxor    xmm4, xmm9
766        movdqa  xmmword ptr [rsp+100H], xmm8
767        movdqa  xmm8, xmm5
768        psrld   xmm8, 7
769        pslld   xmm5, 25
770        por     xmm5, xmm8
771        movdqa  xmm8, xmm6
772        psrld   xmm8, 7
773        pslld   xmm6, 25
774        por     xmm6, xmm8
775        movdqa  xmm8, xmm7
776        psrld   xmm8, 7
777        pslld   xmm7, 25
778        por     xmm7, xmm8
779        movdqa  xmm8, xmm4
780        psrld   xmm8, 7
781        pslld   xmm4, 25
782        por     xmm4, xmm8
783        paddd   xmm0, xmmword ptr [rsp+0A0H]
784        paddd   xmm1, xmmword ptr [rsp+0C0H]
785        paddd   xmm2, xmmword ptr [rsp+0E0H]
786        paddd   xmm3, xmmword ptr [rsp+0D0H]
787        paddd   xmm0, xmm4
788        paddd   xmm1, xmm5
789        paddd   xmm2, xmm6
790        paddd   xmm3, xmm7
791        pxor    xmm12, xmm0
792        pxor    xmm13, xmm1
793        pxor    xmm14, xmm2
794        pxor    xmm15, xmm3
795        pshuflw xmm12, xmm12, 0B1H
796        pshufhw xmm12, xmm12, 0B1H
797        pshuflw xmm13, xmm13, 0B1H
798        pshufhw xmm13, xmm13, 0B1H
799        pshuflw xmm14, xmm14, 0B1H
800        pshufhw xmm14, xmm14, 0B1H
801        pshuflw xmm15, xmm15, 0B1H
802        pshufhw xmm15, xmm15, 0B1H
803        movdqa  xmm8, xmmword ptr [rsp+100H]
804        paddd   xmm8, xmm12
805        paddd   xmm9, xmm13
806        paddd   xmm10, xmm14
807        paddd   xmm11, xmm15
808        pxor    xmm4, xmm8
809        pxor    xmm5, xmm9
810        pxor    xmm6, xmm10
811        pxor    xmm7, xmm11
812        movdqa  xmmword ptr [rsp+100H], xmm8
813        movdqa  xmm8, xmm4
814        psrld   xmm8, 12
815        pslld   xmm4, 20
816        por     xmm4, xmm8
817        movdqa  xmm8, xmm5
818        psrld   xmm8, 12
819        pslld   xmm5, 20
820        por     xmm5, xmm8
821        movdqa  xmm8, xmm6
822        psrld   xmm8, 12
823        pslld   xmm6, 20
824        por     xmm6, xmm8
825        movdqa  xmm8, xmm7
826        psrld   xmm8, 12
827        pslld   xmm7, 20
828        por     xmm7, xmm8
829        paddd   xmm0, xmmword ptr [rsp+70H]
830        paddd   xmm1, xmmword ptr [rsp+90H]
831        paddd   xmm2, xmmword ptr [rsp+30H]
832        paddd   xmm3, xmmword ptr [rsp+0F0H]
833        paddd   xmm0, xmm4
834        paddd   xmm1, xmm5
835        paddd   xmm2, xmm6
836        paddd   xmm3, xmm7
837        pxor    xmm12, xmm0
838        pxor    xmm13, xmm1
839        pxor    xmm14, xmm2
840        pxor    xmm15, xmm3
841        movdqa  xmm8, xmm12
842        psrld   xmm12, 8
843        pslld   xmm8, 24
844        pxor    xmm12, xmm8
845        movdqa  xmm8, xmm13
846        psrld   xmm13, 8
847        pslld   xmm8, 24
848        pxor    xmm13, xmm8
849        movdqa  xmm8, xmm14
850        psrld   xmm14, 8
851        pslld   xmm8, 24
852        pxor    xmm14, xmm8
853        movdqa  xmm8, xmm15
854        psrld   xmm15, 8
855        pslld   xmm8, 24
856        pxor    xmm15, xmm8
857        movdqa  xmm8, xmmword ptr [rsp+100H]
858        paddd   xmm8, xmm12
859        paddd   xmm9, xmm13
860        paddd   xmm10, xmm14
861        paddd   xmm11, xmm15
862        pxor    xmm4, xmm8
863        pxor    xmm5, xmm9
864        pxor    xmm6, xmm10
865        pxor    xmm7, xmm11
866        movdqa  xmmword ptr [rsp+100H], xmm8
867        movdqa  xmm8, xmm4
868        psrld   xmm8, 7
869        pslld   xmm4, 25
870        por     xmm4, xmm8
871        movdqa  xmm8, xmm5
872        psrld   xmm8, 7
873        pslld   xmm5, 25
874        por     xmm5, xmm8
875        movdqa  xmm8, xmm6
876        psrld   xmm8, 7
877        pslld   xmm6, 25
878        por     xmm6, xmm8
879        movdqa  xmm8, xmm7
880        psrld   xmm8, 7
881        pslld   xmm7, 25
882        por     xmm7, xmm8
883        paddd   xmm0, xmmword ptr [rsp+40H]
884        paddd   xmm1, xmmword ptr [rsp+0B0H]
885        paddd   xmm2, xmmword ptr [rsp+50H]
886        paddd   xmm3, xmmword ptr [rsp+10H]
887        paddd   xmm0, xmm5
888        paddd   xmm1, xmm6
889        paddd   xmm2, xmm7
890        paddd   xmm3, xmm4
891        pxor    xmm15, xmm0
892        pxor    xmm12, xmm1
893        pxor    xmm13, xmm2
894        pxor    xmm14, xmm3
895        pshuflw xmm15, xmm15, 0B1H
896        pshufhw xmm15, xmm15, 0B1H
897        pshuflw xmm12, xmm12, 0B1H
898        pshufhw xmm12, xmm12, 0B1H
899        pshuflw xmm13, xmm13, 0B1H
900        pshufhw xmm13, xmm13, 0B1H
901        pshuflw xmm14, xmm14, 0B1H
902        pshufhw xmm14, xmm14, 0B1H
903        paddd   xmm10, xmm15
904        paddd   xmm11, xmm12
905        movdqa  xmm8, xmmword ptr [rsp+100H]
906        paddd   xmm8, xmm13
907        paddd   xmm9, xmm14
908        pxor    xmm5, xmm10
909        pxor    xmm6, xmm11
910        pxor    xmm7, xmm8
911        pxor    xmm4, xmm9
912        movdqa  xmmword ptr [rsp+100H], xmm8
913        movdqa  xmm8, xmm5
914        psrld   xmm8, 12
915        pslld   xmm5, 20
916        por     xmm5, xmm8
917        movdqa  xmm8, xmm6
918        psrld   xmm8, 12
919        pslld   xmm6, 20
920        por     xmm6, xmm8
921        movdqa  xmm8, xmm7
922        psrld   xmm8, 12
923        pslld   xmm7, 20
924        por     xmm7, xmm8
925        movdqa  xmm8, xmm4
926        psrld   xmm8, 12
927        pslld   xmm4, 20
928        por     xmm4, xmm8
929        paddd   xmm0, xmmword ptr [rsp]
930        paddd   xmm1, xmmword ptr [rsp+20H]
931        paddd   xmm2, xmmword ptr [rsp+80H]
932        paddd   xmm3, xmmword ptr [rsp+60H]
933        paddd   xmm0, xmm5
934        paddd   xmm1, xmm6
935        paddd   xmm2, xmm7
936        paddd   xmm3, xmm4
937        pxor    xmm15, xmm0
938        pxor    xmm12, xmm1
939        pxor    xmm13, xmm2
940        pxor    xmm14, xmm3
941        movdqa  xmm8, xmm15
942        psrld   xmm15, 8
943        pslld   xmm8, 24
944        pxor    xmm15, xmm8
945        movdqa  xmm8, xmm12
946        psrld   xmm12, 8
947        pslld   xmm8, 24
948        pxor    xmm12, xmm8
949        movdqa  xmm8, xmm13
950        psrld   xmm13, 8
951        pslld   xmm8, 24
952        pxor    xmm13, xmm8
953        movdqa  xmm8, xmm14
954        psrld   xmm14, 8
955        pslld   xmm8, 24
956        pxor    xmm14, xmm8
957        paddd   xmm10, xmm15
958        paddd   xmm11, xmm12
959        movdqa  xmm8, xmmword ptr [rsp+100H]
960        paddd   xmm8, xmm13
961        paddd   xmm9, xmm14
962        pxor    xmm5, xmm10
963        pxor    xmm6, xmm11
964        pxor    xmm7, xmm8
965        pxor    xmm4, xmm9
966        movdqa  xmmword ptr [rsp+100H], xmm8
967        movdqa  xmm8, xmm5
968        psrld   xmm8, 7
969        pslld   xmm5, 25
970        por     xmm5, xmm8
971        movdqa  xmm8, xmm6
972        psrld   xmm8, 7
973        pslld   xmm6, 25
974        por     xmm6, xmm8
975        movdqa  xmm8, xmm7
976        psrld   xmm8, 7
977        pslld   xmm7, 25
978        por     xmm7, xmm8
979        movdqa  xmm8, xmm4
980        psrld   xmm8, 7
981        pslld   xmm4, 25
982        por     xmm4, xmm8
983        paddd   xmm0, xmmword ptr [rsp+0C0H]
984        paddd   xmm1, xmmword ptr [rsp+90H]
985        paddd   xmm2, xmmword ptr [rsp+0F0H]
986        paddd   xmm3, xmmword ptr [rsp+0E0H]
987        paddd   xmm0, xmm4
988        paddd   xmm1, xmm5
989        paddd   xmm2, xmm6
990        paddd   xmm3, xmm7
991        pxor    xmm12, xmm0
992        pxor    xmm13, xmm1
993        pxor    xmm14, xmm2
994        pxor    xmm15, xmm3
995        pshuflw xmm12, xmm12, 0B1H
996        pshufhw xmm12, xmm12, 0B1H
997        pshuflw xmm13, xmm13, 0B1H
998        pshufhw xmm13, xmm13, 0B1H
999        pshuflw xmm14, xmm14, 0B1H
1000        pshufhw xmm14, xmm14, 0B1H
1001        pshuflw xmm15, xmm15, 0B1H
1002        pshufhw xmm15, xmm15, 0B1H
1003        movdqa  xmm8, xmmword ptr [rsp+100H]
1004        paddd   xmm8, xmm12
1005        paddd   xmm9, xmm13
1006        paddd   xmm10, xmm14
1007        paddd   xmm11, xmm15
1008        pxor    xmm4, xmm8
1009        pxor    xmm5, xmm9
1010        pxor    xmm6, xmm10
1011        pxor    xmm7, xmm11
1012        movdqa  xmmword ptr [rsp+100H], xmm8
1013        movdqa  xmm8, xmm4
1014        psrld   xmm8, 12
1015        pslld   xmm4, 20
1016        por     xmm4, xmm8
1017        movdqa  xmm8, xmm5
1018        psrld   xmm8, 12
1019        pslld   xmm5, 20
1020        por     xmm5, xmm8
1021        movdqa  xmm8, xmm6
1022        psrld   xmm8, 12
1023        pslld   xmm6, 20
1024        por     xmm6, xmm8
1025        movdqa  xmm8, xmm7
1026        psrld   xmm8, 12
1027        pslld   xmm7, 20
1028        por     xmm7, xmm8
1029        paddd   xmm0, xmmword ptr [rsp+0D0H]
1030        paddd   xmm1, xmmword ptr [rsp+0B0H]
1031        paddd   xmm2, xmmword ptr [rsp+0A0H]
1032        paddd   xmm3, xmmword ptr [rsp+80H]
1033        paddd   xmm0, xmm4
1034        paddd   xmm1, xmm5
1035        paddd   xmm2, xmm6
1036        paddd   xmm3, xmm7
1037        pxor    xmm12, xmm0
1038        pxor    xmm13, xmm1
1039        pxor    xmm14, xmm2
1040        pxor    xmm15, xmm3
1041        movdqa  xmm8, xmm12
1042        psrld   xmm12, 8
1043        pslld   xmm8, 24
1044        pxor    xmm12, xmm8
1045        movdqa  xmm8, xmm13
1046        psrld   xmm13, 8
1047        pslld   xmm8, 24
1048        pxor    xmm13, xmm8
1049        movdqa  xmm8, xmm14
1050        psrld   xmm14, 8
1051        pslld   xmm8, 24
1052        pxor    xmm14, xmm8
1053        movdqa  xmm8, xmm15
1054        psrld   xmm15, 8
1055        pslld   xmm8, 24
1056        pxor    xmm15, xmm8
1057        movdqa  xmm8, xmmword ptr [rsp+100H]
1058        paddd   xmm8, xmm12
1059        paddd   xmm9, xmm13
1060        paddd   xmm10, xmm14
1061        paddd   xmm11, xmm15
1062        pxor    xmm4, xmm8
1063        pxor    xmm5, xmm9
1064        pxor    xmm6, xmm10
1065        pxor    xmm7, xmm11
1066        movdqa  xmmword ptr [rsp+100H], xmm8
1067        movdqa  xmm8, xmm4
1068        psrld   xmm8, 7
1069        pslld   xmm4, 25
1070        por     xmm4, xmm8
1071        movdqa  xmm8, xmm5
1072        psrld   xmm8, 7
1073        pslld   xmm5, 25
1074        por     xmm5, xmm8
1075        movdqa  xmm8, xmm6
1076        psrld   xmm8, 7
1077        pslld   xmm6, 25
1078        por     xmm6, xmm8
1079        movdqa  xmm8, xmm7
1080        psrld   xmm8, 7
1081        pslld   xmm7, 25
1082        por     xmm7, xmm8
1083        paddd   xmm0, xmmword ptr [rsp+70H]
1084        paddd   xmm1, xmmword ptr [rsp+50H]
1085        paddd   xmm2, xmmword ptr [rsp]
1086        paddd   xmm3, xmmword ptr [rsp+60H]
1087        paddd   xmm0, xmm5
1088        paddd   xmm1, xmm6
1089        paddd   xmm2, xmm7
1090        paddd   xmm3, xmm4
1091        pxor    xmm15, xmm0
1092        pxor    xmm12, xmm1
1093        pxor    xmm13, xmm2
1094        pxor    xmm14, xmm3
1095        pshuflw xmm15, xmm15, 0B1H
1096        pshufhw xmm15, xmm15, 0B1H
1097        pshuflw xmm12, xmm12, 0B1H
1098        pshufhw xmm12, xmm12, 0B1H
1099        pshuflw xmm13, xmm13, 0B1H
1100        pshufhw xmm13, xmm13, 0B1H
1101        pshuflw xmm14, xmm14, 0B1H
1102        pshufhw xmm14, xmm14, 0B1H
1103        paddd   xmm10, xmm15
1104        paddd   xmm11, xmm12
1105        movdqa  xmm8, xmmword ptr [rsp+100H]
1106        paddd   xmm8, xmm13
1107        paddd   xmm9, xmm14
1108        pxor    xmm5, xmm10
1109        pxor    xmm6, xmm11
1110        pxor    xmm7, xmm8
1111        pxor    xmm4, xmm9
1112        movdqa  xmmword ptr [rsp+100H], xmm8
1113        movdqa  xmm8, xmm5
1114        psrld   xmm8, 12
1115        pslld   xmm5, 20
1116        por     xmm5, xmm8
1117        movdqa  xmm8, xmm6
1118        psrld   xmm8, 12
1119        pslld   xmm6, 20
1120        por     xmm6, xmm8
1121        movdqa  xmm8, xmm7
1122        psrld   xmm8, 12
1123        pslld   xmm7, 20
1124        por     xmm7, xmm8
1125        movdqa  xmm8, xmm4
1126        psrld   xmm8, 12
1127        pslld   xmm4, 20
1128        por     xmm4, xmm8
1129        paddd   xmm0, xmmword ptr [rsp+20H]
1130        paddd   xmm1, xmmword ptr [rsp+30H]
1131        paddd   xmm2, xmmword ptr [rsp+10H]
1132        paddd   xmm3, xmmword ptr [rsp+40H]
1133        paddd   xmm0, xmm5
1134        paddd   xmm1, xmm6
1135        paddd   xmm2, xmm7
1136        paddd   xmm3, xmm4
1137        pxor    xmm15, xmm0
1138        pxor    xmm12, xmm1
1139        pxor    xmm13, xmm2
1140        pxor    xmm14, xmm3
1141        movdqa  xmm8, xmm15
1142        psrld   xmm15, 8
1143        pslld   xmm8, 24
1144        pxor    xmm15, xmm8
1145        movdqa  xmm8, xmm12
1146        psrld   xmm12, 8
1147        pslld   xmm8, 24
1148        pxor    xmm12, xmm8
1149        movdqa  xmm8, xmm13
1150        psrld   xmm13, 8
1151        pslld   xmm8, 24
1152        pxor    xmm13, xmm8
1153        movdqa  xmm8, xmm14
1154        psrld   xmm14, 8
1155        pslld   xmm8, 24
1156        pxor    xmm14, xmm8
1157        paddd   xmm10, xmm15
1158        paddd   xmm11, xmm12
1159        movdqa  xmm8, xmmword ptr [rsp+100H]
1160        paddd   xmm8, xmm13
1161        paddd   xmm9, xmm14
1162        pxor    xmm5, xmm10
1163        pxor    xmm6, xmm11
1164        pxor    xmm7, xmm8
1165        pxor    xmm4, xmm9
1166        movdqa  xmmword ptr [rsp+100H], xmm8
1167        movdqa  xmm8, xmm5
1168        psrld   xmm8, 7
1169        pslld   xmm5, 25
1170        por     xmm5, xmm8
1171        movdqa  xmm8, xmm6
1172        psrld   xmm8, 7
1173        pslld   xmm6, 25
1174        por     xmm6, xmm8
1175        movdqa  xmm8, xmm7
1176        psrld   xmm8, 7
1177        pslld   xmm7, 25
1178        por     xmm7, xmm8
1179        movdqa  xmm8, xmm4
1180        psrld   xmm8, 7
1181        pslld   xmm4, 25
1182        por     xmm4, xmm8
1183        paddd   xmm0, xmmword ptr [rsp+90H]
1184        paddd   xmm1, xmmword ptr [rsp+0B0H]
1185        paddd   xmm2, xmmword ptr [rsp+80H]
1186        paddd   xmm3, xmmword ptr [rsp+0F0H]
1187        paddd   xmm0, xmm4
1188        paddd   xmm1, xmm5
1189        paddd   xmm2, xmm6
1190        paddd   xmm3, xmm7
1191        pxor    xmm12, xmm0
1192        pxor    xmm13, xmm1
1193        pxor    xmm14, xmm2
1194        pxor    xmm15, xmm3
1195        pshuflw xmm12, xmm12, 0B1H
1196        pshufhw xmm12, xmm12, 0B1H
1197        pshuflw xmm13, xmm13, 0B1H
1198        pshufhw xmm13, xmm13, 0B1H
1199        pshuflw xmm14, xmm14, 0B1H
1200        pshufhw xmm14, xmm14, 0B1H
1201        pshuflw xmm15, xmm15, 0B1H
1202        pshufhw xmm15, xmm15, 0B1H
1203        movdqa  xmm8, xmmword ptr [rsp+100H]
1204        paddd   xmm8, xmm12
1205        paddd   xmm9, xmm13
1206        paddd   xmm10, xmm14
1207        paddd   xmm11, xmm15
1208        pxor    xmm4, xmm8
1209        pxor    xmm5, xmm9
1210        pxor    xmm6, xmm10
1211        pxor    xmm7, xmm11
1212        movdqa  xmmword ptr [rsp+100H], xmm8
1213        movdqa  xmm8, xmm4
1214        psrld   xmm8, 12
1215        pslld   xmm4, 20
1216        por     xmm4, xmm8
1217        movdqa  xmm8, xmm5
1218        psrld   xmm8, 12
1219        pslld   xmm5, 20
1220        por     xmm5, xmm8
1221        movdqa  xmm8, xmm6
1222        psrld   xmm8, 12
1223        pslld   xmm6, 20
1224        por     xmm6, xmm8
1225        movdqa  xmm8, xmm7
1226        psrld   xmm8, 12
1227        pslld   xmm7, 20
1228        por     xmm7, xmm8
1229        paddd   xmm0, xmmword ptr [rsp+0E0H]
1230        paddd   xmm1, xmmword ptr [rsp+50H]
1231        paddd   xmm2, xmmword ptr [rsp+0C0H]
1232        paddd   xmm3, xmmword ptr [rsp+10H]
1233        paddd   xmm0, xmm4
1234        paddd   xmm1, xmm5
1235        paddd   xmm2, xmm6
1236        paddd   xmm3, xmm7
1237        pxor    xmm12, xmm0
1238        pxor    xmm13, xmm1
1239        pxor    xmm14, xmm2
1240        pxor    xmm15, xmm3
1241        movdqa  xmm8, xmm12
1242        psrld   xmm12, 8
1243        pslld   xmm8, 24
1244        pxor    xmm12, xmm8
1245        movdqa  xmm8, xmm13
1246        psrld   xmm13, 8
1247        pslld   xmm8, 24
1248        pxor    xmm13, xmm8
1249        movdqa  xmm8, xmm14
1250        psrld   xmm14, 8
1251        pslld   xmm8, 24
1252        pxor    xmm14, xmm8
1253        movdqa  xmm8, xmm15
1254        psrld   xmm15, 8
1255        pslld   xmm8, 24
1256        pxor    xmm15, xmm8
1257        movdqa  xmm8, xmmword ptr [rsp+100H]
1258        paddd   xmm8, xmm12
1259        paddd   xmm9, xmm13
1260        paddd   xmm10, xmm14
1261        paddd   xmm11, xmm15
1262        pxor    xmm4, xmm8
1263        pxor    xmm5, xmm9
1264        pxor    xmm6, xmm10
1265        pxor    xmm7, xmm11
1266        movdqa  xmmword ptr [rsp+100H], xmm8
1267        movdqa  xmm8, xmm4
1268        psrld   xmm8, 7
1269        pslld   xmm4, 25
1270        por     xmm4, xmm8
1271        movdqa  xmm8, xmm5
1272        psrld   xmm8, 7
1273        pslld   xmm5, 25
1274        por     xmm5, xmm8
1275        movdqa  xmm8, xmm6
1276        psrld   xmm8, 7
1277        pslld   xmm6, 25
1278        por     xmm6, xmm8
1279        movdqa  xmm8, xmm7
1280        psrld   xmm8, 7
1281        pslld   xmm7, 25
1282        por     xmm7, xmm8
1283        paddd   xmm0, xmmword ptr [rsp+0D0H]
1284        paddd   xmm1, xmmword ptr [rsp]
1285        paddd   xmm2, xmmword ptr [rsp+20H]
1286        paddd   xmm3, xmmword ptr [rsp+40H]
1287        paddd   xmm0, xmm5
1288        paddd   xmm1, xmm6
1289        paddd   xmm2, xmm7
1290        paddd   xmm3, xmm4
1291        pxor    xmm15, xmm0
1292        pxor    xmm12, xmm1
1293        pxor    xmm13, xmm2
1294        pxor    xmm14, xmm3
1295        pshuflw xmm15, xmm15, 0B1H
1296        pshufhw xmm15, xmm15, 0B1H
1297        pshuflw xmm12, xmm12, 0B1H
1298        pshufhw xmm12, xmm12, 0B1H
1299        pshuflw xmm13, xmm13, 0B1H
1300        pshufhw xmm13, xmm13, 0B1H
1301        pshuflw xmm14, xmm14, 0B1H
1302        pshufhw xmm14, xmm14, 0B1H
1303        paddd   xmm10, xmm15
1304        paddd   xmm11, xmm12
1305        movdqa  xmm8, xmmword ptr [rsp+100H]
1306        paddd   xmm8, xmm13
1307        paddd   xmm9, xmm14
1308        pxor    xmm5, xmm10
1309        pxor    xmm6, xmm11
1310        pxor    xmm7, xmm8
1311        pxor    xmm4, xmm9
1312        movdqa  xmmword ptr [rsp+100H], xmm8
1313        movdqa  xmm8, xmm5
1314        psrld   xmm8, 12
1315        pslld   xmm5, 20
1316        por     xmm5, xmm8
1317        movdqa  xmm8, xmm6
1318        psrld   xmm8, 12
1319        pslld   xmm6, 20
1320        por     xmm6, xmm8
1321        movdqa  xmm8, xmm7
1322        psrld   xmm8, 12
1323        pslld   xmm7, 20
1324        por     xmm7, xmm8
1325        movdqa  xmm8, xmm4
1326        psrld   xmm8, 12
1327        pslld   xmm4, 20
1328        por     xmm4, xmm8
1329        paddd   xmm0, xmmword ptr [rsp+30H]
1330        paddd   xmm1, xmmword ptr [rsp+0A0H]
1331        paddd   xmm2, xmmword ptr [rsp+60H]
1332        paddd   xmm3, xmmword ptr [rsp+70H]
1333        paddd   xmm0, xmm5
1334        paddd   xmm1, xmm6
1335        paddd   xmm2, xmm7
1336        paddd   xmm3, xmm4
1337        pxor    xmm15, xmm0
1338        pxor    xmm12, xmm1
1339        pxor    xmm13, xmm2
1340        pxor    xmm14, xmm3
1341        movdqa  xmm8, xmm15
1342        psrld   xmm15, 8
1343        pslld   xmm8, 24
1344        pxor    xmm15, xmm8
1345        movdqa  xmm8, xmm12
1346        psrld   xmm12, 8
1347        pslld   xmm8, 24
1348        pxor    xmm12, xmm8
1349        movdqa  xmm8, xmm13
1350        psrld   xmm13, 8
1351        pslld   xmm8, 24
1352        pxor    xmm13, xmm8
1353        movdqa  xmm8, xmm14
1354        psrld   xmm14, 8
1355        pslld   xmm8, 24
1356        pxor    xmm14, xmm8
1357        paddd   xmm10, xmm15
1358        paddd   xmm11, xmm12
1359        movdqa  xmm8, xmmword ptr [rsp+100H]
1360        paddd   xmm8, xmm13
1361        paddd   xmm9, xmm14
1362        pxor    xmm5, xmm10
1363        pxor    xmm6, xmm11
1364        pxor    xmm7, xmm8
1365        pxor    xmm4, xmm9
1366        movdqa  xmmword ptr [rsp+100H], xmm8
1367        movdqa  xmm8, xmm5
1368        psrld   xmm8, 7
1369        pslld   xmm5, 25
1370        por     xmm5, xmm8
1371        movdqa  xmm8, xmm6
1372        psrld   xmm8, 7
1373        pslld   xmm6, 25
1374        por     xmm6, xmm8
1375        movdqa  xmm8, xmm7
1376        psrld   xmm8, 7
1377        pslld   xmm7, 25
1378        por     xmm7, xmm8
1379        movdqa  xmm8, xmm4
1380        psrld   xmm8, 7
1381        pslld   xmm4, 25
1382        por     xmm4, xmm8
1383        paddd   xmm0, xmmword ptr [rsp+0B0H]
1384        paddd   xmm1, xmmword ptr [rsp+50H]
1385        paddd   xmm2, xmmword ptr [rsp+10H]
1386        paddd   xmm3, xmmword ptr [rsp+80H]
1387        paddd   xmm0, xmm4
1388        paddd   xmm1, xmm5
1389        paddd   xmm2, xmm6
1390        paddd   xmm3, xmm7
1391        pxor    xmm12, xmm0
1392        pxor    xmm13, xmm1
1393        pxor    xmm14, xmm2
1394        pxor    xmm15, xmm3
1395        pshuflw xmm12, xmm12, 0B1H
1396        pshufhw xmm12, xmm12, 0B1H
1397        pshuflw xmm13, xmm13, 0B1H
1398        pshufhw xmm13, xmm13, 0B1H
1399        pshuflw xmm14, xmm14, 0B1H
1400        pshufhw xmm14, xmm14, 0B1H
1401        pshuflw xmm15, xmm15, 0B1H
1402        pshufhw xmm15, xmm15, 0B1H
1403        movdqa  xmm8, xmmword ptr [rsp+100H]
1404        paddd   xmm8, xmm12
1405        paddd   xmm9, xmm13
1406        paddd   xmm10, xmm14
1407        paddd   xmm11, xmm15
1408        pxor    xmm4, xmm8
1409        pxor    xmm5, xmm9
1410        pxor    xmm6, xmm10
1411        pxor    xmm7, xmm11
1412        movdqa  xmmword ptr [rsp+100H], xmm8
1413        movdqa  xmm8, xmm4
1414        psrld   xmm8, 12
1415        pslld   xmm4, 20
1416        por     xmm4, xmm8
1417        movdqa  xmm8, xmm5
1418        psrld   xmm8, 12
1419        pslld   xmm5, 20
1420        por     xmm5, xmm8
1421        movdqa  xmm8, xmm6
1422        psrld   xmm8, 12
1423        pslld   xmm6, 20
1424        por     xmm6, xmm8
1425        movdqa  xmm8, xmm7
1426        psrld   xmm8, 12
1427        pslld   xmm7, 20
1428        por     xmm7, xmm8
1429        paddd   xmm0, xmmword ptr [rsp+0F0H]
1430        paddd   xmm1, xmmword ptr [rsp]
1431        paddd   xmm2, xmmword ptr [rsp+90H]
1432        paddd   xmm3, xmmword ptr [rsp+60H]
1433        paddd   xmm0, xmm4
1434        paddd   xmm1, xmm5
1435        paddd   xmm2, xmm6
1436        paddd   xmm3, xmm7
1437        pxor    xmm12, xmm0
1438        pxor    xmm13, xmm1
1439        pxor    xmm14, xmm2
1440        pxor    xmm15, xmm3
1441        movdqa  xmm8, xmm12
1442        psrld   xmm12, 8
1443        pslld   xmm8, 24
1444        pxor    xmm12, xmm8
1445        movdqa  xmm8, xmm13
1446        psrld   xmm13, 8
1447        pslld   xmm8, 24
1448        pxor    xmm13, xmm8
1449        movdqa  xmm8, xmm14
1450        psrld   xmm14, 8
1451        pslld   xmm8, 24
1452        pxor    xmm14, xmm8
1453        movdqa  xmm8, xmm15
1454        psrld   xmm15, 8
1455        pslld   xmm8, 24
1456        pxor    xmm15, xmm8
1457        movdqa  xmm8, xmmword ptr [rsp+100H]
1458        paddd   xmm8, xmm12
1459        paddd   xmm9, xmm13
1460        paddd   xmm10, xmm14
1461        paddd   xmm11, xmm15
1462        pxor    xmm4, xmm8
1463        pxor    xmm5, xmm9
1464        pxor    xmm6, xmm10
1465        pxor    xmm7, xmm11
1466        movdqa  xmmword ptr [rsp+100H], xmm8
1467        movdqa  xmm8, xmm4
1468        psrld   xmm8, 7
1469        pslld   xmm4, 25
1470        por     xmm4, xmm8
1471        movdqa  xmm8, xmm5
1472        psrld   xmm8, 7
1473        pslld   xmm5, 25
1474        por     xmm5, xmm8
1475        movdqa  xmm8, xmm6
1476        psrld   xmm8, 7
1477        pslld   xmm6, 25
1478        por     xmm6, xmm8
1479        movdqa  xmm8, xmm7
1480        psrld   xmm8, 7
1481        pslld   xmm7, 25
1482        por     xmm7, xmm8
1483        paddd   xmm0, xmmword ptr [rsp+0E0H]
1484        paddd   xmm1, xmmword ptr [rsp+20H]
1485        paddd   xmm2, xmmword ptr [rsp+30H]
1486        paddd   xmm3, xmmword ptr [rsp+70H]
1487        paddd   xmm0, xmm5
1488        paddd   xmm1, xmm6
1489        paddd   xmm2, xmm7
1490        paddd   xmm3, xmm4
1491        pxor    xmm15, xmm0
1492        pxor    xmm12, xmm1
1493        pxor    xmm13, xmm2
1494        pxor    xmm14, xmm3
1495        pshuflw xmm15, xmm15, 0B1H
1496        pshufhw xmm15, xmm15, 0B1H
1497        pshuflw xmm12, xmm12, 0B1H
1498        pshufhw xmm12, xmm12, 0B1H
1499        pshuflw xmm13, xmm13, 0B1H
1500        pshufhw xmm13, xmm13, 0B1H
1501        pshuflw xmm14, xmm14, 0B1H
1502        pshufhw xmm14, xmm14, 0B1H
1503        paddd   xmm10, xmm15
1504        paddd   xmm11, xmm12
1505        movdqa  xmm8, xmmword ptr [rsp+100H]
1506        paddd   xmm8, xmm13
1507        paddd   xmm9, xmm14
1508        pxor    xmm5, xmm10
1509        pxor    xmm6, xmm11
1510        pxor    xmm7, xmm8
1511        pxor    xmm4, xmm9
1512        movdqa  xmmword ptr [rsp+100H], xmm8
1513        movdqa  xmm8, xmm5
1514        psrld   xmm8, 12
1515        pslld   xmm5, 20
1516        por     xmm5, xmm8
1517        movdqa  xmm8, xmm6
1518        psrld   xmm8, 12
1519        pslld   xmm6, 20
1520        por     xmm6, xmm8
1521        movdqa  xmm8, xmm7
1522        psrld   xmm8, 12
1523        pslld   xmm7, 20
1524        por     xmm7, xmm8
1525        movdqa  xmm8, xmm4
1526        psrld   xmm8, 12
1527        pslld   xmm4, 20
1528        por     xmm4, xmm8
1529        paddd   xmm0, xmmword ptr [rsp+0A0H]
1530        paddd   xmm1, xmmword ptr [rsp+0C0H]
1531        paddd   xmm2, xmmword ptr [rsp+40H]
1532        paddd   xmm3, xmmword ptr [rsp+0D0H]
1533        paddd   xmm0, xmm5
1534        paddd   xmm1, xmm6
1535        paddd   xmm2, xmm7
1536        paddd   xmm3, xmm4
1537        pxor    xmm15, xmm0
1538        pxor    xmm12, xmm1
1539        pxor    xmm13, xmm2
1540        pxor    xmm14, xmm3
1541        movdqa  xmm8, xmm15
1542        psrld   xmm15, 8
1543        pslld   xmm8, 24
1544        pxor    xmm15, xmm8
1545        movdqa  xmm8, xmm12
1546        psrld   xmm12, 8
1547        pslld   xmm8, 24
1548        pxor    xmm12, xmm8
1549        movdqa  xmm8, xmm13
1550        psrld   xmm13, 8
1551        pslld   xmm8, 24
1552        pxor    xmm13, xmm8
1553        movdqa  xmm8, xmm14
1554        psrld   xmm14, 8
1555        pslld   xmm8, 24
1556        pxor    xmm14, xmm8
1557        paddd   xmm10, xmm15
1558        paddd   xmm11, xmm12
1559        movdqa  xmm8, xmmword ptr [rsp+100H]
1560        paddd   xmm8, xmm13
1561        paddd   xmm9, xmm14
1562        pxor    xmm5, xmm10
1563        pxor    xmm6, xmm11
1564        pxor    xmm7, xmm8
1565        pxor    xmm4, xmm9
1566        pxor    xmm0, xmm8
1567        pxor    xmm1, xmm9
1568        pxor    xmm2, xmm10
1569        pxor    xmm3, xmm11
1570        movdqa  xmm8, xmm5
1571        psrld   xmm8, 7
1572        pslld   xmm5, 25
1573        por     xmm5, xmm8
1574        movdqa  xmm8, xmm6
1575        psrld   xmm8, 7
1576        pslld   xmm6, 25
1577        por     xmm6, xmm8
1578        movdqa  xmm8, xmm7
1579        psrld   xmm8, 7
1580        pslld   xmm7, 25
1581        por     xmm7, xmm8
1582        movdqa  xmm8, xmm4
1583        psrld   xmm8, 7
1584        pslld   xmm4, 25
1585        por     xmm4, xmm8
1586        pxor    xmm4, xmm12
1587        pxor    xmm5, xmm13
1588        pxor    xmm6, xmm14
1589        pxor    xmm7, xmm15
1590        mov     eax, r13d
1591        jne     innerloop4
1592        movdqa  xmm9, xmm0
1593        punpckldq xmm0, xmm1
1594        punpckhdq xmm9, xmm1
1595        movdqa  xmm11, xmm2
1596        punpckldq xmm2, xmm3
1597        punpckhdq xmm11, xmm3
1598        movdqa  xmm1, xmm0
1599        punpcklqdq xmm0, xmm2
1600        punpckhqdq xmm1, xmm2
1601        movdqa  xmm3, xmm9
1602        punpcklqdq xmm9, xmm11
1603        punpckhqdq xmm3, xmm11
1604        movdqu  xmmword ptr [rbx], xmm0
1605        movdqu  xmmword ptr [rbx+20H], xmm1
1606        movdqu  xmmword ptr [rbx+40H], xmm9
1607        movdqu  xmmword ptr [rbx+60H], xmm3
1608        movdqa  xmm9, xmm4
1609        punpckldq xmm4, xmm5
1610        punpckhdq xmm9, xmm5
1611        movdqa  xmm11, xmm6
1612        punpckldq xmm6, xmm7
1613        punpckhdq xmm11, xmm7
1614        movdqa  xmm5, xmm4
1615        punpcklqdq xmm4, xmm6
1616        punpckhqdq xmm5, xmm6
1617        movdqa  xmm7, xmm9
1618        punpcklqdq xmm9, xmm11
1619        punpckhqdq xmm7, xmm11
1620        movdqu  xmmword ptr [rbx+10H], xmm4
1621        movdqu  xmmword ptr [rbx+30H], xmm5
1622        movdqu  xmmword ptr [rbx+50H], xmm9
1623        movdqu  xmmword ptr [rbx+70H], xmm7
1624        movdqa  xmm1, xmmword ptr [rsp+110H]
1625        movdqa  xmm0, xmm1
1626        paddd   xmm1, xmmword ptr [rsp+150H]
1627        movdqa  xmmword ptr [rsp+110H], xmm1
1628        pxor    xmm0, xmmword ptr [CMP_MSB_MASK]
1629        pxor    xmm1, xmmword ptr [CMP_MSB_MASK]
1630        pcmpgtd xmm0, xmm1
1631        movdqa  xmm1, xmmword ptr [rsp+120H]
1632        psubd   xmm1, xmm0
1633        movdqa  xmmword ptr [rsp+120H], xmm1
1634        add     rbx, 128
1635        add     rdi, 32
1636        sub     rsi, 4
1637        cmp     rsi, 4
1638        jnc     outerloop4
1639        test    rsi, rsi
1640        jne     final3blocks
; ---------------------------------------------------------------------
; unwind: shared exit path of llvm_blake3_hash_many_sse2.
; In the code visible around it, this label is reached by fall-through
; when the 4-wide loop ends with rsi == 0, and by the `je unwind` in
; final1block when no single input is left.
; It restores what the prologue saved, in reverse order, and returns.
; ---------------------------------------------------------------------
1641unwind:
        ; Reload xmm6-xmm15 from the spill slots rsp+170H..rsp+200H that the
        ; prologue filled.  rsp is still the frame pointer the prologue
        ; aligned down to 64 bytes, so the aligned movdqa loads are valid.
1642        movdqa  xmm6, xmmword ptr [rsp+170H]
1643        movdqa  xmm7, xmmword ptr [rsp+180H]
1644        movdqa  xmm8, xmmword ptr [rsp+190H]
1645        movdqa  xmm9, xmmword ptr [rsp+1A0H]
1646        movdqa  xmm10, xmmword ptr [rsp+1B0H]
1647        movdqa  xmm11, xmmword ptr [rsp+1C0H]
1648        movdqa  xmm12, xmmword ptr [rsp+1D0H]
1649        movdqa  xmm13, xmmword ptr [rsp+1E0H]
1650        movdqa  xmm14, xmmword ptr [rsp+1F0H]
1651        movdqa  xmm15, xmmword ptr [rsp+200H]
        ; rbp was set to rsp right after the eight pushes, before the frame
        ; was allocated and aligned; copying it back discards the frame.
1652        mov     rsp, rbp
        ; Pop the general-purpose registers in the reverse order of the
        ; prologue pushes (r15, r14, r13, r12, rsi, rdi, rbx, rbp).
1653        pop     rbp
1654        pop     rbx
1655        pop     rdi
1656        pop     rsi
1657        pop     r12
1658        pop     r13
1659        pop     r14
1660        pop     r15
1661        ret
1662ALIGN   16
; ---------------------------------------------------------------------
; final3blocks: tail path entered (in the visible code) when 1-3 inputs
; remain after the 4-wide loop.  If bit 1 of the remaining count in esi
; is set, two inputs are processed side by side: lane A keeps its four
; state rows in xmm0-xmm3, lane B in xmm8-xmm11.  Otherwise control
; skips to final1block.  After the pair is written out, execution falls
; through into final1block.
;
; Registers as used below: rdi = array of input pointers, rbx = output
; cursor, rcx = address of the 32-byte starting state, rdx = byte offset
; into the inputs, rsi = inputs remaining.
; NOTE(review): r12d, r13d and r15 are set up outside this excerpt.  From
; their use here r15 looks like the per-input byte length and r12d/r13d
; like flag bits for the last/first block -- confirm against the setup.
;
; Stack scratch used: [rsp]/[rsp+10H] hold the {lo, hi} counter pair of
; lane A / lane B; [rsp+20H]..[rsp+50H] hold spilled message vectors.
; ---------------------------------------------------------------------
1663final3blocks:
1664        test    esi, 2H                         ; bit 1 of the remaining count set?
1665        je      final1block                     ; no: at most one input is left
        ; Both lanes start from the same 32 bytes at [rcx] (rows 0 and 1).
1666        movups  xmm0, xmmword ptr [rcx]
1667        movups  xmm1, xmmword ptr [rcx+10H]
1668        movaps  xmm8, xmm0
1669        movaps  xmm9, xmm1
        ; The vectors at rsp+110H / rsp+120H hold the low / high 32-bit halves
        ; of the per-lane 64-bit counters (initialised in the prologue from the
        ; stack argument at [rbp+68H], advanced by the 4-wide loop).
        ; Element 0 of each forms the {lo, hi} pair for lane A ...
1670        movd    xmm13, dword ptr [rsp+110H]
1671        movd    xmm14, dword ptr [rsp+120H]
1672        punpckldq xmm13, xmm14
1673        movaps  xmmword ptr [rsp], xmm13
        ; ... and element 1 of each forms the pair for lane B.
1674        movd    xmm14, dword ptr [rsp+114H]
1675        movd    xmm13, dword ptr [rsp+124H]
1676        punpckldq xmm14, xmm13
1677        movaps  xmmword ptr [rsp+10H], xmm14
1678        mov     r8, qword ptr [rdi]             ; r8 = pointer to input A
1679        mov     r9, qword ptr [rdi+8H]          ; r9 = pointer to input B
1680        movzx   eax, byte ptr [rbp+80H]         ; byte-sized stack argument (base flag bits)
1681        or      eax, r13d                       ; plus the bits in r13d for the first block
1682        xor     edx, edx                        ; rdx = 0: start of the inputs
1683innerloop2:
        ; One iteration per 64-byte block of both inputs.  eax carries the
        ; flag bits for this block; the r12d bits are kept only when the
        ; offset after this block equals r15 (cmovne undoes the OR otherwise).
1684        mov     r14d, eax
1685        or      eax, r12d
1686        add     rdx, 64
1687        cmp     rdx, r15
1688        cmovne  eax, r14d
        ; Row 2 of both lanes is the constant vector BLAKE3_IV.
1689        movaps  xmm2, xmmword ptr [BLAKE3_IV]
1690        movaps  xmm10, xmm2
        ; Load the current 64-byte block of input A (rdx already points past
        ; it, hence the negative displacements) and de-interleave it:
        ; shufps 136 (88H) keeps the even-indexed dwords, 221 (0DDH) the odd.
1691        movups  xmm4, xmmword ptr [r8+rdx-40H]
1692        movups  xmm5, xmmword ptr [r8+rdx-30H]
1693        movaps  xmm3, xmm4
1694        shufps  xmm4, xmm5, 136                 ; xmm4 = words 0,2,4,6
1695        shufps  xmm3, xmm5, 221                 ; words 1,3,5,7
1696        movaps  xmm5, xmm3                      ; xmm5 = words 1,3,5,7
1697        movups  xmm6, xmmword ptr [r8+rdx-20H]
1698        movups  xmm7, xmmword ptr [r8+rdx-10H]
1699        movaps  xmm3, xmm6
1700        shufps  xmm6, xmm7, 136                 ; words 8,10,12,14
1701        pshufd  xmm6, xmm6, 93H                 ; rotated one lane: 14,8,10,12
1702        shufps  xmm3, xmm7, 221                 ; words 9,11,13,15
1703        pshufd  xmm7, xmm3, 93H                 ; rotated one lane: 15,9,11,13
        ; Same layout for input B, in xmm12-xmm15 (xmm11 is the scratch).
1704        movups  xmm12, xmmword ptr [r9+rdx-40H]
1705        movups  xmm13, xmmword ptr [r9+rdx-30H]
1706        movaps  xmm11, xmm12
1707        shufps  xmm12, xmm13, 136
1708        shufps  xmm11, xmm13, 221
1709        movaps  xmm13, xmm11
1710        movups  xmm14, xmmword ptr [r9+rdx-20H]
1711        movups  xmm15, xmmword ptr [r9+rdx-10H]
1712        movaps  xmm11, xmm14
1713        shufps  xmm14, xmm15, 136
1714        pshufd  xmm14, xmm14, 93H
1715        shufps  xmm11, xmm15, 221
1716        pshufd  xmm15, xmm11, 93H
        ; Build row 3 of each lane: {counter lo, counter hi, 64, flags}.
        ; rax becomes (flags << 32) | 64 and supplies the upper two dwords.
1717        shl     rax, 20H
1718        or      rax, 40H
1719        movd    xmm3, rax
1720        movdqa  xmmword ptr [rsp+20H], xmm3
1721        movaps  xmm3, xmmword ptr [rsp]
1722        movaps  xmm11, xmmword ptr [rsp+10H]
1723        punpcklqdq xmm3, xmmword ptr [rsp+20H]
1724        punpcklqdq xmm11, xmmword ptr [rsp+20H]
        ; al = round counter (7 rounds).  Only the low byte of eax is written;
        ; eax is reloaded from r13d after the round loop.
1725        mov     al, 7
1726roundloop2:
        ; ---- column step, first half: a += m; a += b; d ^= a; d = rotr(d,16);
        ;      c += d; b ^= c; b = rotr(b,12) -------------------------------
1727        paddd   xmm0, xmm4
1728        paddd   xmm8, xmm12
        ; Spill the first message vector of each lane: xmm4 is reused as
        ; rotate scratch below and both are needed for the permutation later.
1729        movaps  xmmword ptr [rsp+20H], xmm4
1730        movaps  xmmword ptr [rsp+30H], xmm12
1731        paddd   xmm0, xmm1
1732        paddd   xmm8, xmm9
1733        pxor    xmm3, xmm0
1734        pxor    xmm11, xmm8
        ; pshuflw/pshufhw with 0B1H swap the 16-bit halves of every dword,
        ; i.e. rotate each dword by 16 bits.
1735        pshuflw xmm3, xmm3, 0B1H
1736        pshufhw xmm3, xmm3, 0B1H
1737        pshuflw xmm11, xmm11, 0B1H
1738        pshufhw xmm11, xmm11, 0B1H
1739        paddd   xmm2, xmm3
1740        paddd   xmm10, xmm11
1741        pxor    xmm1, xmm2
1742        pxor    xmm9, xmm10
        ; rotate right by 12: (x << 20) | (x >> 12)
1743        movdqa  xmm4, xmm1
1744        pslld   xmm1, 20
1745        psrld   xmm4, 12
1746        por     xmm1, xmm4
1747        movdqa  xmm4, xmm9
1748        pslld   xmm9, 20
1749        psrld   xmm4, 12
1750        por     xmm9, xmm4
        ; ---- column step, second half: a += m; a += b; d ^= a; d = rotr(d,8);
        ;      c += d; b ^= c; b = rotr(b,7) --------------------------------
1751        paddd   xmm0, xmm5
1752        paddd   xmm8, xmm13
        ; Spill the second message vector of each lane (xmm13 becomes scratch).
1753        movaps  xmmword ptr [rsp+40H], xmm5
1754        movaps  xmmword ptr [rsp+50H], xmm13
1755        paddd   xmm0, xmm1
1756        paddd   xmm8, xmm9
1757        pxor    xmm3, xmm0
1758        pxor    xmm11, xmm8
        ; rotate right by 8: (x >> 8) ^ (x << 24); the halves do not overlap,
        ; so pxor combines them just like por would.
1759        movdqa  xmm13, xmm3
1760        psrld   xmm3, 8
1761        pslld   xmm13, 24
1762        pxor    xmm3, xmm13
1763        movdqa  xmm13, xmm11
1764        psrld   xmm11, 8
1765        pslld   xmm13, 24
1766        pxor    xmm11, xmm13
1767        paddd   xmm2, xmm3
1768        paddd   xmm10, xmm11
1769        pxor    xmm1, xmm2
1770        pxor    xmm9, xmm10
        ; rotate right by 7: (x << 25) | (x >> 7)
1771        movdqa  xmm4, xmm1
1772        pslld   xmm1, 25
1773        psrld   xmm4, 7
1774        por     xmm1, xmm4
1775        movdqa  xmm4, xmm9
1776        pslld   xmm9, 25
1777        psrld   xmm4, 7
1778        por     xmm9, xmm4
        ; Rotate rows a, d and c by one, two and three lanes (row b stays put)
        ; so that the diagonals of the state line up as columns.
1779        pshufd  xmm0, xmm0, 93H
1780        pshufd  xmm8, xmm8, 93H
1781        pshufd  xmm3, xmm3, 4EH
1782        pshufd  xmm11, xmm11, 4EH
1783        pshufd  xmm2, xmm2, 39H
1784        pshufd  xmm10, xmm10, 39H
        ; ---- diagonal step, first half (same mixing, third message vector) --
1785        paddd   xmm0, xmm6
1786        paddd   xmm8, xmm14
1787        paddd   xmm0, xmm1
1788        paddd   xmm8, xmm9
1789        pxor    xmm3, xmm0
1790        pxor    xmm11, xmm8
1791        pshuflw xmm3, xmm3, 0B1H
1792        pshufhw xmm3, xmm3, 0B1H
1793        pshuflw xmm11, xmm11, 0B1H
1794        pshufhw xmm11, xmm11, 0B1H
1795        paddd   xmm2, xmm3
1796        paddd   xmm10, xmm11
1797        pxor    xmm1, xmm2
1798        pxor    xmm9, xmm10
1799        movdqa  xmm4, xmm1
1800        pslld   xmm1, 20
1801        psrld   xmm4, 12
1802        por     xmm1, xmm4
1803        movdqa  xmm4, xmm9
1804        pslld   xmm9, 20
1805        psrld   xmm4, 12
1806        por     xmm9, xmm4
        ; ---- diagonal step, second half (fourth message vector) -------------
1807        paddd   xmm0, xmm7
1808        paddd   xmm8, xmm15
1809        paddd   xmm0, xmm1
1810        paddd   xmm8, xmm9
1811        pxor    xmm3, xmm0
1812        pxor    xmm11, xmm8
1813        movdqa  xmm13, xmm3
1814        psrld   xmm3, 8
1815        pslld   xmm13, 24
1816        pxor    xmm3, xmm13
1817        movdqa  xmm13, xmm11
1818        psrld   xmm11, 8
1819        pslld   xmm13, 24
1820        pxor    xmm11, xmm13
1821        paddd   xmm2, xmm3
1822        paddd   xmm10, xmm11
1823        pxor    xmm1, xmm2
1824        pxor    xmm9, xmm10
1825        movdqa  xmm4, xmm1
1826        pslld   xmm1, 25
1827        psrld   xmm4, 7
1828        por     xmm1, xmm4
1829        movdqa  xmm4, xmm9
1830        pslld   xmm9, 25
1831        psrld   xmm4, 7
1832        por     xmm9, xmm4
        ; Undo the diagonal rotation (39H / 4EH / 93H are the inverse shuffles).
1833        pshufd  xmm0, xmm0, 39H
1834        pshufd  xmm8, xmm8, 39H
1835        pshufd  xmm3, xmm3, 4EH
1836        pshufd  xmm11, xmm11, 4EH
1837        pshufd  xmm2, xmm2, 93H
1838        pshufd  xmm10, xmm10, 93H
1839        dec     al
1840        je      endroundloop2                   ; all 7 rounds done
        ; Not the last round: rebuild the four message vectors of each lane
        ; in the word order the next round expects.  Inputs are the copies
        ; spilled at [rsp+20H]/[rsp+40H] (lane A) and [rsp+30H]/[rsp+50H]
        ; (lane B) plus the still-live xmm6/xmm7 and xmm14/xmm15.
        ; Each pand/pand/por triple merges two registers under a pair of
        ; PBLENDW_* masks defined outside this excerpt.
        ; NOTE(review): presumably a stand-in for the SSE4.1 pblendw
        ; instruction with that immediate -- confirm against the mask data.
        ; -- lane A --
1841        movdqa  xmm12, xmmword ptr [rsp+20H]
1842        movdqa  xmm5, xmmword ptr [rsp+40H]
1843        pshufd  xmm13, xmm12, 0FH
1844        shufps  xmm12, xmm5, 214
1845        pshufd  xmm4, xmm12, 39H                ; xmm4 = next first message vector (lane A)
1846        movdqa  xmm12, xmm6
1847        shufps  xmm12, xmm7, 250
1848        pand    xmm13, xmmword ptr [PBLENDW_0x33_MASK]
1849        pand    xmm12, xmmword ptr [PBLENDW_0xCC_MASK]
1850        por     xmm13, xmm12
1851        movdqa  xmmword ptr [rsp+20H], xmm13    ; next second vector, parked until the end
1852        movdqa  xmm12, xmm7
1853        punpcklqdq xmm12, xmm5
1854        movdqa  xmm13, xmm6
1855        pand    xmm12, xmmword ptr [PBLENDW_0x3F_MASK]
1856        pand    xmm13, xmmword ptr [PBLENDW_0xC0_MASK]
1857        por     xmm12, xmm13
1858        pshufd  xmm12, xmm12, 78H
1859        punpckhdq xmm5, xmm7
1860        punpckldq xmm6, xmm5
1861        pshufd  xmm7, xmm6, 1EH                 ; xmm7 = next fourth vector (lane A)
1862        movdqa  xmmword ptr [rsp+40H], xmm12    ; next third vector, parked until the end
        ; -- lane B: same permutation; xmm5/xmm6 serve as scratch, which is
        ;    why the lane A results above were parked on the stack --
1863        movdqa  xmm5, xmmword ptr [rsp+30H]
1864        movdqa  xmm13, xmmword ptr [rsp+50H]
1865        pshufd  xmm6, xmm5, 0FH
1866        shufps  xmm5, xmm13, 214
1867        pshufd  xmm12, xmm5, 39H                ; xmm12 = next first vector (lane B)
1868        movdqa  xmm5, xmm14
1869        shufps  xmm5, xmm15, 250
1870        pand    xmm6, xmmword ptr [PBLENDW_0x33_MASK]
1871        pand    xmm5, xmmword ptr [PBLENDW_0xCC_MASK]
1872        por     xmm6, xmm5
1873        movdqa  xmm5, xmm15
1874        punpcklqdq xmm5, xmm13
        ; No free register is left: state row xmm2 is parked in [rsp+30H]
        ; (its old content was consumed above) while xmm2 acts as scratch.
1875        movdqa  xmmword ptr [rsp+30H], xmm2
1876        movdqa  xmm2, xmm14
1877        pand    xmm5, xmmword ptr [PBLENDW_0x3F_MASK]
1878        pand    xmm2, xmmword ptr [PBLENDW_0xC0_MASK]
1879        por     xmm5, xmm2
1880        movdqa  xmm2, xmmword ptr [rsp+30H]     ; state row restored
1881        pshufd  xmm5, xmm5, 78H
1882        punpckhdq xmm13, xmm15
1883        punpckldq xmm14, xmm13
1884        pshufd  xmm15, xmm14, 1EH               ; xmm15 = next fourth vector (lane B)
        ; Move the results into the registers roundloop2 expects.
1885        movdqa  xmm13, xmm6                     ; lane B second vector
1886        movdqa  xmm14, xmm5                     ; lane B third vector
1887        movdqa  xmm5, xmmword ptr [rsp+20H]     ; lane A second vector
1888        movdqa  xmm6, xmmword ptr [rsp+40H]     ; lane A third vector
1889        jmp     roundloop2
1890endroundloop2:
        ; Fold the lower half of the state into the upper half (row0 ^= row2,
        ; row1 ^= row3); xmm0:xmm1 and xmm8:xmm9 carry over to the next block.
1891        pxor    xmm0, xmm2
1892        pxor    xmm1, xmm3
1893        pxor    xmm8, xmm10
1894        pxor    xmm9, xmm11
        ; Later blocks start from r13d only; the byte from [rbp+80H] that was
        ; OR-ed in before the loop is not re-applied.
1895        mov     eax, r13d
1896        cmp     rdx, r15
1897        jne     innerloop2                      ; more 64-byte blocks in this input pair
        ; Write the two 32-byte results back to back at rbx.
1898        movups  xmmword ptr [rbx], xmm0
1899        movups  xmmword ptr [rbx+10H], xmm1
1900        movups  xmmword ptr [rbx+20H], xmm8
1901        movups  xmmword ptr [rbx+30H], xmm9
        ; [rsp+130H] holds the negated byte argument from [rbp+70H] (stored by
        ; the prologue), so after neg eax is that byte again.  With a value
        ; of 1 the loads below read element 2 of the lo/hi counter vectors
        ; (byte offset 8) and make it element 0, which final1block reads next;
        ; with 0 element 0 is rewritten unchanged.
        ; NOTE(review): assumes the byte is only ever 0 or 1 -- confirm with
        ; the caller.
1902        mov     eax, dword ptr [rsp+130H]
1903        neg     eax
1904        mov    r10d, dword ptr [rsp+110H+8*rax]
1905        mov    r11d, dword ptr [rsp+120H+8*rax]
1906        mov dword ptr [rsp+110H], r10d
1907        mov dword ptr [rsp+120H], r11d
1908        add     rdi, 16                         ; skip the two input pointers just used
1909        add     rbx, 64                         ; skip the two 32-byte outputs just written
1910        sub     rsi, 2                          ; two fewer inputs remain
1911final1block:
1912        test    esi, 1H
1913        je      unwind
1914        movups  xmm0, xmmword ptr [rcx]
1915        movups  xmm1, xmmword ptr [rcx+10H]
1916        movd    xmm13, dword ptr [rsp+110H]
1917        movd    xmm14, dword ptr [rsp+120H]
1918        punpckldq xmm13, xmm14
1919        mov     r8, qword ptr [rdi]
1920        movzx   eax, byte ptr [rbp+80H]
1921        or      eax, r13d
1922        xor     edx, edx
1923innerloop1:
1924        mov     r14d, eax
1925        or      eax, r12d
1926        add     rdx, 64
1927        cmp     rdx, r15
1928        cmovne  eax, r14d
1929        movaps  xmm2, xmmword ptr [BLAKE3_IV]
1930        shl     rax, 32
1931        or      rax, 64
1932        movd    xmm12, rax
1933        movdqa  xmm3, xmm13
1934        punpcklqdq xmm3, xmm12
1935        movups  xmm4, xmmword ptr [r8+rdx-40H]
1936        movups  xmm5, xmmword ptr [r8+rdx-30H]
1937        movaps  xmm8, xmm4
1938        shufps  xmm4, xmm5, 136
1939        shufps  xmm8, xmm5, 221
1940        movaps  xmm5, xmm8
1941        movups  xmm6, xmmword ptr [r8+rdx-20H]
1942        movups  xmm7, xmmword ptr [r8+rdx-10H]
1943        movaps  xmm8, xmm6
1944        shufps  xmm6, xmm7, 136
1945        pshufd  xmm6, xmm6, 93H
1946        shufps  xmm8, xmm7, 221
1947        pshufd  xmm7, xmm8, 93H
1948        mov     al, 7
1949roundloop1:
1950        paddd   xmm0, xmm4
1951        paddd   xmm0, xmm1
1952        pxor    xmm3, xmm0
1953        pshuflw xmm3, xmm3, 0B1H
1954        pshufhw xmm3, xmm3, 0B1H
1955        paddd   xmm2, xmm3
1956        pxor    xmm1, xmm2
1957        movdqa  xmm11, xmm1
1958        pslld   xmm1, 20
1959        psrld   xmm11, 12
1960        por     xmm1, xmm11
1961        paddd   xmm0, xmm5
1962        paddd   xmm0, xmm1
1963        pxor    xmm3, xmm0
1964        movdqa  xmm14, xmm3
1965        psrld   xmm3, 8
1966        pslld   xmm14, 24
1967        pxor    xmm3, xmm14
1968        paddd   xmm2, xmm3
1969        pxor    xmm1, xmm2
1970        movdqa  xmm11, xmm1
1971        pslld   xmm1, 25
1972        psrld   xmm11, 7
1973        por     xmm1, xmm11
1974        pshufd  xmm0, xmm0, 93H
1975        pshufd  xmm3, xmm3, 4EH
1976        pshufd  xmm2, xmm2, 39H
1977        paddd   xmm0, xmm6
1978        paddd   xmm0, xmm1
1979        pxor    xmm3, xmm0
1980        pshuflw xmm3, xmm3, 0B1H
1981        pshufhw xmm3, xmm3, 0B1H
1982        paddd   xmm2, xmm3
1983        pxor    xmm1, xmm2
1984        movdqa  xmm11, xmm1
1985        pslld   xmm1, 20
1986        psrld   xmm11, 12
1987        por     xmm1, xmm11
1988        paddd   xmm0, xmm7
1989        paddd   xmm0, xmm1
1990        pxor    xmm3, xmm0
1991        movdqa  xmm14, xmm3
1992        psrld   xmm3, 8
1993        pslld   xmm14, 24
1994        pxor    xmm3, xmm14
1995        paddd   xmm2, xmm3
1996        pxor    xmm1, xmm2
1997        movdqa  xmm11, xmm1
1998        pslld   xmm1, 25
1999        psrld   xmm11, 7
2000        por     xmm1, xmm11
2001        pshufd  xmm0, xmm0, 39H
2002        pshufd  xmm3, xmm3, 4EH
2003        pshufd  xmm2, xmm2, 93H
2004        dec     al
2005        jz      endroundloop1
2006        movdqa  xmm8, xmm4
2007        shufps  xmm8, xmm5, 214
2008        pshufd  xmm9, xmm4, 0FH
2009        pshufd  xmm4, xmm8, 39H
2010        movdqa  xmm8, xmm6
2011        shufps  xmm8, xmm7, 250
2012        pand    xmm9, xmmword ptr [PBLENDW_0x33_MASK]
2013        pand    xmm8, xmmword ptr [PBLENDW_0xCC_MASK]
2014        por     xmm9, xmm8
2015        movdqa  xmm8, xmm7
2016        punpcklqdq xmm8, xmm5
2017        movdqa  xmm10, xmm6
2018        pand    xmm8, xmmword ptr [PBLENDW_0x3F_MASK]
2019        pand    xmm10, xmmword ptr [PBLENDW_0xC0_MASK]
2020        por     xmm8, xmm10
2021        pshufd  xmm8, xmm8, 78H
2022        punpckhdq xmm5, xmm7
2023        punpckldq xmm6, xmm5
2024        pshufd  xmm7, xmm6, 1EH
2025        movdqa  xmm5, xmm9
2026        movdqa  xmm6, xmm8
2027        jmp     roundloop1
2028endroundloop1:
2029        pxor    xmm0, xmm2
2030        pxor    xmm1, xmm3
2031        mov     eax, r13d
2032        cmp     rdx, r15
2033        jne     innerloop1
2034        movups  xmmword ptr [rbx], xmm0
2035        movups  xmmword ptr [rbx+10H], xmm1
2036        jmp     unwind
2037_llvm_blake3_hash_many_sse2 ENDP
2038llvm_blake3_hash_many_sse2 ENDP
2039
;-----------------------------------------------------------------------
; llvm_blake3_compress_in_place_sse2 (also exported with a leading '_')
;
; Runs the 7-iteration round loop below over one 64-byte message block
; using SSE2 only, then overwrites the 32 bytes at [rcx] with the result.
;
; Inputs, as consumed by the code (the register/stack layout matches the
; Microsoft x64 calling convention):
;   rcx            -> 32 bytes of state; read at entry, overwritten at exit
;   rdx            -> 64 bytes of message; read with unaligned loads
;   r8b            =  byte that becomes dword 2 of the fourth state row
;                     (presumably the block length - confirm with caller)
;   r9             =  64-bit value that becomes dwords 0-1 of the fourth row
;                     (presumably the counter - confirm with caller)
;   [rsp+28H] on entry (= [rsp+0A0H] after the frame is allocated)
;                  =  byte that becomes dword 3 of the fourth state row
;                     (presumably the flags - confirm with caller)
; Output:  nothing in a register; 32 bytes are stored to [rcx].
; Clobbers: rax, r8, xmm0-xmm5, flags.
;   xmm6-xmm9, xmm11, xmm14 and xmm15 are saved on entry and restored
;   before returning (xmm15 is not otherwise referenced in this procedure).
;
; NOTE(review): rsp is adjusted without any unwind directives on the PROC;
; confirm whether unwind data is required for this build.
;-----------------------------------------------------------------------
2040llvm_blake3_compress_in_place_sse2 PROC
2041_llvm_blake3_compress_in_place_sse2 PROC
        ; Allocate 120 bytes: 7 x 16-byte save slots plus 8 bytes of padding.
        ; The movdqa stores below need a 16-byte aligned rsp, which holds if
        ; rsp was 8 mod 16 on entry (assumed standard call-site alignment).
2042        sub     rsp, 120
2043        movdqa  xmmword ptr [rsp], xmm6
2044        movdqa  xmmword ptr [rsp+10H], xmm7
2045        movdqa  xmmword ptr [rsp+20H], xmm8
2046        movdqa  xmmword ptr [rsp+30H], xmm9
2047        movdqa  xmmword ptr [rsp+40H], xmm11
2048        movdqa  xmmword ptr [rsp+50H], xmm14
2049        movdqa  xmmword ptr [rsp+60H], xmm15
        ; Build the four state rows: xmm0/xmm1 = the 32 bytes at [rcx],
        ; xmm2 = the four BLAKE3_IV constants, xmm3 = assembled below.
2050        movups  xmm0, xmmword ptr [rcx]
2051        movups  xmm1, xmmword ptr [rcx+10H]
2052        movaps  xmm2, xmmword ptr [BLAKE3_IV]
        ; r8 = (stack byte << 32) + zero-extended r8b, i.e. two dwords.
2053        movzx   eax, byte ptr [rsp+0A0H]
2054        movzx   r8d, r8b
2055        shl     rax, 32
2056        add     r8, rax
        ; xmm3 = { r9 low dword, r9 high dword, r8b, stack byte }.
2057        movd    xmm3, r9
2058        movd    xmm4, r8
2059        punpcklqdq xmm3, xmm4
        ; Load message dwords 0-7 and split them: shufps 136 (88H) keeps the
        ; even-indexed dwords in xmm4, shufps 221 (0DDH) the odd ones in xmm5.
2060        movups  xmm4, xmmword ptr [rdx]
2061        movups  xmm5, xmmword ptr [rdx+10H]
2062        movaps  xmm8, xmm4
2063        shufps  xmm4, xmm5, 136
2064        shufps  xmm8, xmm5, 221
2065        movaps  xmm5, xmm8
        ; Same even/odd split for message dwords 8-15, then pshufd 93H
        ; rotates each result by one dword position (last dword moves first).
2066        movups  xmm6, xmmword ptr [rdx+20H]
2067        movups  xmm7, xmmword ptr [rdx+30H]
2068        movaps  xmm8, xmm6
2069        shufps  xmm6, xmm7, 136
2070        pshufd  xmm6, xmm6, 93H
2071        shufps  xmm8, xmm7, 221
2072        pshufd  xmm7, xmm8, 93H
        ; al = remaining round count. The low byte of rax is free to reuse:
        ; its previous contents were already folded into r8 above.
2073        mov     al, 7
        ; ---- top of the round loop --------------------------------------
2074@@:
        ; First half-round, all four lanes in parallel:
        ; row0 += msg(xmm4) + row1; row3 ^= row0; rotate row3 by 16 bits
        ; (pshuflw/pshufhw 0B1H swap the 16-bit halves of every dword).
2075        paddd   xmm0, xmm4
2076        paddd   xmm0, xmm1
2077        pxor    xmm3, xmm0
2078        pshuflw xmm3, xmm3, 0B1H
2079        pshufhw xmm3, xmm3, 0B1H
        ; row2 += row3; row1 ^= row2; rotate row1 right by 12 (xmm11 = temp).
2080        paddd   xmm2, xmm3
2081        pxor    xmm1, xmm2
2082        movdqa  xmm11, xmm1
2083        pslld   xmm1, 20
2084        psrld   xmm11, 12
2085        por     xmm1, xmm11
        ; row0 += msg(xmm5) + row1; row3 ^= row0; rotate row3 right by 8
        ; (xmm14 = temp).
2086        paddd   xmm0, xmm5
2087        paddd   xmm0, xmm1
2088        pxor    xmm3, xmm0
2089        movdqa  xmm14, xmm3
2090        psrld   xmm3, 8
2091        pslld   xmm14, 24
2092        pxor    xmm3, xmm14
        ; row2 += row3; row1 ^= row2; rotate row1 right by 7.
2093        paddd   xmm2, xmm3
2094        pxor    xmm1, xmm2
2095        movdqa  xmm11, xmm1
2096        pslld   xmm1, 25
2097        psrld   xmm11, 7
2098        por     xmm1, xmm11
        ; Rotate the dwords of row0, row3 and row2 by one, two and three
        ; positions so the second half-round pairs different lanes together.
2099        pshufd  xmm0, xmm0, 93H
2100        pshufd  xmm3, xmm3, 4EH
2101        pshufd  xmm2, xmm2, 39H
        ; Second half-round: identical mixing steps, message words from
        ; xmm6 and xmm7.
2102        paddd   xmm0, xmm6
2103        paddd   xmm0, xmm1
2104        pxor    xmm3, xmm0
2105        pshuflw xmm3, xmm3, 0B1H
2106        pshufhw xmm3, xmm3, 0B1H
2107        paddd   xmm2, xmm3
2108        pxor    xmm1, xmm2
2109        movdqa  xmm11, xmm1
2110        pslld   xmm1, 20
2111        psrld   xmm11, 12
2112        por     xmm1, xmm11
2113        paddd   xmm0, xmm7
2114        paddd   xmm0, xmm1
2115        pxor    xmm3, xmm0
2116        movdqa  xmm14, xmm3
2117        psrld   xmm3, 8
2118        pslld   xmm14, 24
2119        pxor    xmm3, xmm14
2120        paddd   xmm2, xmm3
2121        pxor    xmm1, xmm2
2122        movdqa  xmm11, xmm1
2123        pslld   xmm1, 25
2124        psrld   xmm11, 7
2125        por     xmm1, xmm11
        ; Undo the lane rotation applied before the second half-round.
2126        pshufd  xmm0, xmm0, 39H
2127        pshufd  xmm3, xmm3, 4EH
2128        pshufd  xmm2, xmm2, 93H
        ; Leave after the 7th round; the message permutation below therefore
        ; runs only between rounds (6 times).
2129        dec     al
2130        jz      @F
        ; Re-order the 16 message dwords held in xmm4-xmm7 for the next
        ; round. Each pand/pand/por triple with a complementary mask pair
        ; merges selected dwords from two registers (a blend done in SSE2).
2131        movdqa  xmm8, xmm4
2132        shufps  xmm8, xmm5, 214
2133        pshufd  xmm9, xmm4, 0FH
2134        pshufd  xmm4, xmm8, 39H
2135        movdqa  xmm8, xmm6
2136        shufps  xmm8, xmm7, 250
2137        pand    xmm9, xmmword ptr [PBLENDW_0x33_MASK]
2138        pand    xmm8, xmmword ptr [PBLENDW_0xCC_MASK]
2139        por     xmm9, xmm8
2140        movdqa  xmm8, xmm7
2141        punpcklqdq xmm8, xmm5
2142        movdqa  xmm14, xmm6
2143        pand    xmm8, xmmword ptr [PBLENDW_0x3F_MASK]
2144        pand    xmm14, xmmword ptr [PBLENDW_0xC0_MASK]
2145        por     xmm8, xmm14
2146        pshufd  xmm8, xmm8, 78H
2147        punpckhdq xmm5, xmm7
2148        punpckldq xmm6, xmm5
2149        pshufd  xmm7, xmm6, 1EH
        ; xmm9/xmm8 were staging registers; install them as the new xmm5/xmm6.
2150        movdqa  xmm5, xmm9
2151        movdqa  xmm6, xmm8
2152        jmp     @B
        ; ---- rounds finished --------------------------------------------
2153@@:
        ; Fold the lower two rows into the upper two and store the 32-byte
        ; result over the input state at [rcx].
2154        pxor    xmm0, xmm2
2155        pxor    xmm1, xmm3
2156        movups  xmmword ptr [rcx], xmm0
2157        movups  xmmword ptr [rcx+10H], xmm1
        ; Restore the saved XMM registers and release the frame.
2158        movdqa  xmm6, xmmword ptr [rsp]
2159        movdqa  xmm7, xmmword ptr [rsp+10H]
2160        movdqa  xmm8, xmmword ptr [rsp+20H]
2161        movdqa  xmm9, xmmword ptr [rsp+30H]
2162        movdqa  xmm11, xmmword ptr [rsp+40H]
2163        movdqa  xmm14, xmmword ptr [rsp+50H]
2164        movdqa  xmm15, xmmword ptr [rsp+60H]
2165        add     rsp, 120
2166        ret
2167_llvm_blake3_compress_in_place_sse2 ENDP
2168llvm_blake3_compress_in_place_sse2 ENDP
2169
2170ALIGN 16
;-----------------------------------------------------------------------
; llvm_blake3_compress_xof_sse2 (also exported with a leading '_')
;
; Runs the 7-iteration round loop below over one 64-byte message block
; using SSE2 only and writes a 64-byte result to a separate output
; buffer. The 32 bytes at [rcx] are only read, never written.
;
; Inputs, as consumed by the code (the register/stack layout matches the
; Microsoft x64 calling convention):
;   rcx            -> 32 bytes of state; read at entry and again at exit
;   rdx            -> 64 bytes of message; read with unaligned loads
;   r8b            =  byte that becomes dword 2 of the fourth state row
;                     (presumably the block length - confirm with caller)
;   r9             =  64-bit value that becomes dwords 0-1 of the fourth row
;                     (presumably the counter - confirm with caller)
;   [rsp+28H] on entry (= [rsp+0A0H] after the frame is allocated)
;                  =  byte that becomes dword 3 of the fourth state row
;                     (presumably the flags - confirm with caller)
;   [rsp+30H] on entry (= [rsp+0A8H] after the frame is allocated)
;                  -> 64-byte output buffer; written with unaligned stores
; Output:  nothing in a register; 64 bytes are stored to the output buffer.
; Clobbers: rax, r8, r10, xmm0-xmm5, flags.
;   xmm6-xmm9, xmm11, xmm14 and xmm15 are saved on entry and restored
;   before returning (xmm15 is not otherwise referenced in this procedure).
;
; NOTE(review): rsp is adjusted without any unwind directives on the PROC;
; confirm whether unwind data is required for this build.
;-----------------------------------------------------------------------
2171llvm_blake3_compress_xof_sse2 PROC
2172_llvm_blake3_compress_xof_sse2 PROC
        ; Allocate 120 bytes: 7 x 16-byte save slots plus 8 bytes of padding.
        ; The movdqa stores below need a 16-byte aligned rsp, which holds if
        ; rsp was 8 mod 16 on entry (assumed standard call-site alignment).
2173        sub     rsp, 120
2174        movdqa  xmmword ptr [rsp], xmm6
2175        movdqa  xmmword ptr [rsp+10H], xmm7
2176        movdqa  xmmword ptr [rsp+20H], xmm8
2177        movdqa  xmmword ptr [rsp+30H], xmm9
2178        movdqa  xmmword ptr [rsp+40H], xmm11
2179        movdqa  xmmword ptr [rsp+50H], xmm14
2180        movdqa  xmmword ptr [rsp+60H], xmm15
        ; Build the four state rows: xmm0/xmm1 = the 32 bytes at [rcx],
        ; xmm2 = the four BLAKE3_IV constants, xmm3 = assembled below.
2181        movups  xmm0, xmmword ptr [rcx]
2182        movups  xmm1, xmmword ptr [rcx+10H]
2183        movaps  xmm2, xmmword ptr [BLAKE3_IV]
        ; r8 = (stack byte << 32) + zero-extended r8b; r10 = output pointer
        ; taken from the sixth argument slot.
2184        movzx   eax, byte ptr [rsp+0A0H]
2185        movzx   r8d, r8b
2186        mov     r10, qword ptr [rsp+0A8H]
2187        shl     rax, 32
2188        add     r8, rax
        ; xmm3 = { r9 low dword, r9 high dword, r8b, stack byte }.
2189        movd    xmm3, r9
2190        movd    xmm4, r8
2191        punpcklqdq xmm3, xmm4
        ; Load message dwords 0-7 and split them: shufps 136 (88H) keeps the
        ; even-indexed dwords in xmm4, shufps 221 (0DDH) the odd ones in xmm5.
2192        movups  xmm4, xmmword ptr [rdx]
2193        movups  xmm5, xmmword ptr [rdx+10H]
2194        movaps  xmm8, xmm4
2195        shufps  xmm4, xmm5, 136
2196        shufps  xmm8, xmm5, 221
2197        movaps  xmm5, xmm8
        ; Same even/odd split for message dwords 8-15, then pshufd 93H
        ; rotates each result by one dword position (last dword moves first).
2198        movups  xmm6, xmmword ptr [rdx+20H]
2199        movups  xmm7, xmmword ptr [rdx+30H]
2200        movaps  xmm8, xmm6
2201        shufps  xmm6, xmm7, 136
2202        pshufd  xmm6, xmm6, 93H
2203        shufps  xmm8, xmm7, 221
2204        pshufd  xmm7, xmm8, 93H
        ; al = remaining round count. The low byte of rax is free to reuse:
        ; its previous contents were already folded into r8 above.
2205        mov     al, 7
        ; ---- top of the round loop --------------------------------------
2206@@:
        ; First half-round, all four lanes in parallel:
        ; row0 += msg(xmm4) + row1; row3 ^= row0; rotate row3 by 16 bits
        ; (pshuflw/pshufhw 0B1H swap the 16-bit halves of every dword).
2207        paddd   xmm0, xmm4
2208        paddd   xmm0, xmm1
2209        pxor    xmm3, xmm0
2210        pshuflw xmm3, xmm3, 0B1H
2211        pshufhw xmm3, xmm3, 0B1H
        ; row2 += row3; row1 ^= row2; rotate row1 right by 12 (xmm11 = temp).
2212        paddd   xmm2, xmm3
2213        pxor    xmm1, xmm2
2214        movdqa  xmm11, xmm1
2215        pslld   xmm1, 20
2216        psrld   xmm11, 12
2217        por     xmm1, xmm11
        ; row0 += msg(xmm5) + row1; row3 ^= row0; rotate row3 right by 8
        ; (xmm14 = temp).
2218        paddd   xmm0, xmm5
2219        paddd   xmm0, xmm1
2220        pxor    xmm3, xmm0
2221        movdqa  xmm14, xmm3
2222        psrld   xmm3, 8
2223        pslld   xmm14, 24
2224        pxor    xmm3, xmm14
        ; row2 += row3; row1 ^= row2; rotate row1 right by 7.
2225        paddd   xmm2, xmm3
2226        pxor    xmm1, xmm2
2227        movdqa  xmm11, xmm1
2228        pslld   xmm1, 25
2229        psrld   xmm11, 7
2230        por     xmm1, xmm11
        ; Rotate the dwords of row0, row3 and row2 by one, two and three
        ; positions so the second half-round pairs different lanes together.
2231        pshufd  xmm0, xmm0, 93H
2232        pshufd  xmm3, xmm3, 4EH
2233        pshufd  xmm2, xmm2, 39H
        ; Second half-round: identical mixing steps, message words from
        ; xmm6 and xmm7.
2234        paddd   xmm0, xmm6
2235        paddd   xmm0, xmm1
2236        pxor    xmm3, xmm0
2237        pshuflw xmm3, xmm3, 0B1H
2238        pshufhw xmm3, xmm3, 0B1H
2239        paddd   xmm2, xmm3
2240        pxor    xmm1, xmm2
2241        movdqa  xmm11, xmm1
2242        pslld   xmm1, 20
2243        psrld   xmm11, 12
2244        por     xmm1, xmm11
2245        paddd   xmm0, xmm7
2246        paddd   xmm0, xmm1
2247        pxor    xmm3, xmm0
2248        movdqa  xmm14, xmm3
2249        psrld   xmm3, 8
2250        pslld   xmm14, 24
2251        pxor    xmm3, xmm14
2252        paddd   xmm2, xmm3
2253        pxor    xmm1, xmm2
2254        movdqa  xmm11, xmm1
2255        pslld   xmm1, 25
2256        psrld   xmm11, 7
2257        por     xmm1, xmm11
        ; Undo the lane rotation applied before the second half-round.
2258        pshufd  xmm0, xmm0, 39H
2259        pshufd  xmm3, xmm3, 4EH
2260        pshufd  xmm2, xmm2, 93H
        ; Leave after the 7th round; the message permutation below therefore
        ; runs only between rounds (6 times).
2261        dec     al
2262        jz      @F
        ; Re-order the 16 message dwords held in xmm4-xmm7 for the next
        ; round. Each pand/pand/por triple with a complementary mask pair
        ; merges selected dwords from two registers (a blend done in SSE2).
2263        movdqa  xmm8, xmm4
2264        shufps  xmm8, xmm5, 214
2265        pshufd  xmm9, xmm4, 0FH
2266        pshufd  xmm4, xmm8, 39H
2267        movdqa  xmm8, xmm6
2268        shufps  xmm8, xmm7, 250
2269        pand    xmm9, xmmword ptr [PBLENDW_0x33_MASK]
2270        pand    xmm8, xmmword ptr [PBLENDW_0xCC_MASK]
2271        por     xmm9, xmm8
2272        movdqa  xmm8, xmm7
2273        punpcklqdq xmm8, xmm5
2274        movdqa  xmm14, xmm6
2275        pand    xmm8, xmmword ptr [PBLENDW_0x3F_MASK]
2276        pand    xmm14, xmmword ptr [PBLENDW_0xC0_MASK]
2277        por     xmm8, xmm14
2278        pshufd  xmm8, xmm8, 78H
2279        punpckhdq xmm5, xmm7
2280        punpckldq xmm6, xmm5
2281        pshufd  xmm7, xmm6, 1EH
        ; xmm9/xmm8 were staging registers; install them as the new xmm5/xmm6.
2282        movdqa  xmm5, xmm9
2283        movdqa  xmm6, xmm8
2284        jmp     @B
        ; ---- rounds finished --------------------------------------------
2285@@:
        ; Reload the original 32 input bytes; the message registers
        ; xmm4/xmm5 are no longer needed and serve as temporaries.
2286        movdqu  xmm4, xmmword ptr [rcx]
2287        movdqu  xmm5, xmmword ptr [rcx+10H]
        ; Output bytes  0-31: row0 ^ row2, row1 ^ row3.
        ; Output bytes 32-63: row2 ^ input[0..15], row3 ^ input[16..31].
        ; The first two pxor must precede the last two because xmm2/xmm3
        ; are consumed unmodified by them.
2288        pxor    xmm0, xmm2
2289        pxor    xmm1, xmm3
2290        pxor    xmm2, xmm4
2291        pxor    xmm3, xmm5
2292        movups  xmmword ptr [r10], xmm0
2293        movups  xmmword ptr [r10+10H], xmm1
2294        movups  xmmword ptr [r10+20H], xmm2
2295        movups  xmmword ptr [r10+30H], xmm3
        ; Restore the saved XMM registers and release the frame.
2296        movdqa  xmm6, xmmword ptr [rsp]
2297        movdqa  xmm7, xmmword ptr [rsp+10H]
2298        movdqa  xmm8, xmmword ptr [rsp+20H]
2299        movdqa  xmm9, xmmword ptr [rsp+30H]
2300        movdqa  xmm11, xmmword ptr [rsp+40H]
2301        movdqa  xmm14, xmmword ptr [rsp+50H]
2302        movdqa  xmm15, xmmword ptr [rsp+60H]
2303        add     rsp, 120
2304        ret
2305_llvm_blake3_compress_xof_sse2 ENDP
2306llvm_blake3_compress_xof_sse2 ENDP
2307
2308_TEXT ENDS
2309
2310
;-----------------------------------------------------------------------
; Read-only constant tables used by the SSE2 routines in this file.
;
; The segment starts 64-byte aligned and every table below is a multiple
; of 16 bytes long, so each label lands on a 16-byte boundary. That
; matters because the code uses these labels as xmmword memory operands
; of instructions such as pand, pxor and movaps, which require 16-byte
; aligned memory. Keep every table a multiple of 16 bytes when editing.
;-----------------------------------------------------------------------
2311_RDATA SEGMENT READONLY PAGE ALIAS(".rdata") 'CONST'
2312ALIGN   64
; Four 32-bit initialization constants, loaded as one vector into the
; third state row (xmm2) by the compress routines.
2313BLAKE3_IV:
2314        dd 6A09E667H, 0BB67AE85H, 3C6EF372H, 0A54FF53AH
2315
; Per-lane offsets 0..3; and-ed with a broadcast mask at the start of
; hash_many before being added to the broadcast low counter dword.
2316ADD0:
2317        dd 0, 1, 2, 3
2318
; The value 4 in every lane; and-ed with the same broadcast mask at the
; start of hash_many (presumably the per-iteration counter step - confirm
; against the part of hash_many that consumes [rsp+150H]).
2319ADD1:
2320        dd 4 dup (4)
2321
; Each of the four BLAKE3_IV words broadcast to all four lanes. Not
; referenced in the portion of the file visible here; presumably used by
; the multi-lane path of hash_many.
2322BLAKE3_IV_0:
2323        dd 4 dup (6A09E667H)
2324
2325BLAKE3_IV_1:
2326        dd 4 dup (0BB67AE85H)
2327
2328BLAKE3_IV_2:
2329        dd 4 dup (3C6EF372H)
2330
2331BLAKE3_IV_3:
2332        dd 4 dup (0A54FF53AH)
2333
; The value 64 in every lane (the single-block paths use the same literal
; 64 as the block length in their scalar set-up).
2334BLAKE3_BLOCK_LEN:
2335        dd 4 dup (64)
2336
; Sign-bit mask. hash_many xors both pcmpgtd operands with it so the
; signed compare behaves as an unsigned one. Eight dwords are defined;
; the xmmword references visible in this file read only the first four.
2337CMP_MSB_MASK:
2338        dd 8 dup(80000000H)
2339
; Dword-select masks used in complementary pairs (0x33/0xCC and
; 0x3F/0xC0) by pand/pand/por sequences in the message permutation, to
; merge dwords from two registers. Within each pair the masks are exact
; bitwise complements of one another.
;   0x33 keeps dwords 0 and 2      0xCC keeps dwords 1 and 3
;   0x3F keeps dwords 0, 1 and 2   0xC0 keeps dword 3
2340PBLENDW_0x33_MASK:
2341       dd 0FFFFFFFFH, 000000000H, 0FFFFFFFFH, 000000000H
2342PBLENDW_0xCC_MASK:
2343       dd 000000000H, 0FFFFFFFFH, 000000000H, 0FFFFFFFFH
2344PBLENDW_0x3F_MASK:
2345	dd 0FFFFFFFFH, 0FFFFFFFFH, 0FFFFFFFFH, 000000000H
2346PBLENDW_0xC0_MASK:
2347       dd 000000000H, 000000000H, 000000000H, 0FFFFFFFFH
2348
2349_RDATA ENDS
2350END
2351