rsaz-x86_64.pl revision 356290
1#!/usr/bin/env perl
2
3##############################################################################
4#                                                                            #
5#  Copyright (c) 2012, Intel Corporation                                     #
6#                                                                            #
7#  All rights reserved.                                                      #
8#                                                                            #
9#  Redistribution and use in source and binary forms, with or without        #
10#  modification, are permitted provided that the following conditions are    #
11#  met:                                                                      #
12#                                                                            #
13#  *  Redistributions of source code must retain the above copyright         #
14#     notice, this list of conditions and the following disclaimer.          #
15#                                                                            #
16#  *  Redistributions in binary form must reproduce the above copyright      #
17#     notice, this list of conditions and the following disclaimer in the    #
18#     documentation and/or other materials provided with the                 #
19#     distribution.                                                          #
20#                                                                            #
21#  *  Neither the name of the Intel Corporation nor the names of its         #
22#     contributors may be used to endorse or promote products derived from   #
23#     this software without specific prior written permission.               #
24#                                                                            #
25#                                                                            #
26#  THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY          #
27#  EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE         #
28#  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR        #
29#  PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR            #
30#  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,     #
31#  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,       #
32#  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR        #
33#  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF    #
34#  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING      #
35#  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS        #
36#  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.              #
37#                                                                            #
38##############################################################################
39# Developers and authors:                                                    #
40# Shay Gueron (1, 2), and Vlad Krasnov (1)                                   #
41# (1) Intel Architecture Group, Microprocessor and Chipset Development,      #
42#     Israel Development Center, Haifa, Israel                               #
43# (2) University of Haifa                                                    #
44##############################################################################
45# Reference:                                                                 #
46# [1] S. Gueron, "Efficient Software Implementations of Modular              #
47#     Exponentiation", http://eprint.iacr.org/2011/239                       #
48# [2] S. Gueron, V. Krasnov. "Speeding up Big-Numbers Squaring".             #
49#     IEEE Proceedings of 9th International Conference on Information        #
50#     Technology: New Generations (ITNG 2012), 821-823 (2012).               #
51# [3] S. Gueron, Efficient Software Implementations of Modular Exponentiation#
52#     Journal of Cryptographic Engineering 2:31-43 (2012).                   #
53# [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis    #
54#     resistant 512-bit and 1024-bit modular exponentiation for optimizing   #
55#     RSA1024 and RSA2048 on x86_64 platforms",                              #
56#     http://rt.openssl.org/Ticket/Display.html?id=2582&user=guest&pass=guest#
57##############################################################################
58
59# While the original submission covers 512- and 1024-bit exponentiation,
60# this module is limited to the 512-bit version only (and as such
61# accelerates RSA1024 signing). This is because the improvement for longer
62# keys is not high enough to justify the effort; the highest measured
63# gain was ~5% on Westmere. [This is relative to OpenSSL 1.0.2, still
64# upcoming at the time of this writing!] Nor does this module implement
65# a "monolithic" complete-exponentiation jumbo-subroutine; it adheres
66# to a more modular mixture of C and assembly. And it's optimized even
67# for processors other than the Intel Core family (see the table below
68# for improvement coefficients).
69# 						<appro@openssl.org>
70#
71# RSA1024 sign/sec	this/original	|this/rsax(*)	this/fips(*)
72#			----------------+---------------------------
73# Opteron		+13%		|+5%		+20%
74# Bulldozer		-0%		|-1%		+10%
75# P4			+11%		|+7%		+8%
76# Westmere		+5%		|+14%		+17%
77# Sandy Bridge		+2%		|+12%		+29%
78# Ivy Bridge		+1%		|+11%		+35%
79# Haswell(**)		-0%		|+12%		+39%
80# Atom			+13%		|+11%		+4%
81# VIA Nano		+70%		|+9%		+25%
82#
83# (*)	rsax engine and fips numbers are presented for reference
84#	purposes;
85# (**)	MULX was attempted, but found to give only marginal improvement;
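#
# Editor's sketch of how the pieces fit together (the exact windowing
# strategy lives in the accompanying C code, rsaz_exp.c, not here): the
# module exports rsaz_512_sqr, rsaz_512_mul, rsaz_512_mul_gather4,
# rsaz_512_mul_scatter4, rsaz_512_mul_by_one, rsaz_512_scatter4 and
# rsaz_512_gather4.  Roughly speaking, the caller is expected to build a
# 16-entry table of powers of the base (scatter4), walk the exponent
# selecting table entries with the cache-timing-safe gather routines
# while squaring and multiplying in Montgomery form, and finally convert
# the result back with mul_by_one.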
86
87$flavour = shift;
88$output  = shift;
89if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
90
91$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
92
93$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
94( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
95( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
96die "can't locate x86_64-xlate.pl";
97
98open OUT,"| \"$^X\" $xlate $flavour $output";
99*STDOUT=*OUT;
100
101if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
102		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
103	$addx = ($1>=2.23);
104}
105
106if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
107	    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
108	$addx = ($1>=2.10);
109}
110
111if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
112	    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
113	$addx = ($1>=12);
114}
115
116if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9])\.([0-9]+)/) {
117	my $ver = $2 + $3/100.0;	# 3.1->3.01, 3.10->3.10
118	$addx = ($ver>=3.03);
119}
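# $addx only controls whether the MULX/ADCX/ADOX code paths below are
# assembled at all (older assemblers cannot encode them); at run time the
# generated code additionally tests OPENSSL_ia32cap_P before taking them.
# The 0x80100 mask checked against OPENSSL_ia32cap_P+8 combines the BMI2
# bit (MULX, 0x100) and the ADX bit (ADCX/ADOX, 0x80000) of the third
# capability word.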
120
121($out, $inp, $mod) = ("%rdi", "%rsi", "%rbp");	# common internal API
122{
123my ($out,$inp,$mod,$n0,$times) = ("%rdi","%rsi","%rdx","%rcx","%r8d");
124
125$code.=<<___;
126.text
127
128.extern	OPENSSL_ia32cap_P
129
130.globl	rsaz_512_sqr
131.type	rsaz_512_sqr,\@function,5
132.align	32
133rsaz_512_sqr:				# 25-29% faster than rsaz_512_mul
134	push	%rbx
135	push	%rbp
136	push	%r12
137	push	%r13
138	push	%r14
139	push	%r15
140
141	subq	\$128+24, %rsp
142.Lsqr_body:
143	movq	$mod, %xmm1		# common off-load
144	movq	($inp), %rdx
145	movq	8($inp), %rax
146	movq	$n0, 128(%rsp)
147___
148$code.=<<___ if ($addx);
149	movl	\$0x80100,%r11d
150	andl	OPENSSL_ia32cap_P+8(%rip),%r11d
151	cmpl	\$0x80100,%r11d		# check for MULX and ADCX/ADOX
152	je	.Loop_sqrx
153___
154$code.=<<___;
155	jmp	.Loop_sqr
156
157.align	32
158.Loop_sqr:
159	movl	$times,128+8(%rsp)
160#first iteration
161	movq	%rdx, %rbx		# 0($inp)
162	mov	%rax, %rbp		# 8($inp)
163	mulq	%rdx
164	movq	%rax, %r8
165	movq	16($inp), %rax
166	movq	%rdx, %r9
167
168	mulq	%rbx
169	addq	%rax, %r9
170	movq	24($inp), %rax
171	movq	%rdx, %r10
172	adcq	\$0, %r10
173
174	mulq	%rbx
175	addq	%rax, %r10
176	movq	32($inp), %rax
177	movq	%rdx, %r11
178	adcq	\$0, %r11
179
180	mulq	%rbx
181	addq	%rax, %r11
182	movq	40($inp), %rax
183	movq	%rdx, %r12
184	adcq	\$0, %r12
185
186	mulq	%rbx
187	addq	%rax, %r12
188	movq	48($inp), %rax
189	movq	%rdx, %r13
190	adcq	\$0, %r13
191
192	mulq	%rbx
193	addq	%rax, %r13
194	movq	56($inp), %rax
195	movq	%rdx, %r14
196	adcq	\$0, %r14
197
198	mulq	%rbx
199	addq	%rax, %r14
200	movq	%rbx, %rax
201	adcq	\$0, %rdx
202
203	xorq	%rcx,%rcx		# rcx:r8 = r8 << 1
204	addq	%r8, %r8
205	 movq	%rdx, %r15
206	adcq	\$0, %rcx
207
208	mulq	%rax
209	addq	%r8, %rdx
210	adcq	\$0, %rcx
211
212	movq	%rax, (%rsp)
213	movq	%rdx, 8(%rsp)
214
215#second iteration
216	movq	16($inp), %rax
217	mulq	%rbp
218	addq	%rax, %r10
219	movq	24($inp), %rax
220	movq	%rdx, %rbx
221	adcq	\$0, %rbx
222
223	mulq	%rbp
224	addq	%rax, %r11
225	movq	32($inp), %rax
226	adcq	\$0, %rdx
227	addq	%rbx, %r11
228	movq	%rdx, %rbx
229	adcq	\$0, %rbx
230
231	mulq	%rbp
232	addq	%rax, %r12
233	movq	40($inp), %rax
234	adcq	\$0, %rdx
235	addq	%rbx, %r12
236	movq	%rdx, %rbx
237	adcq	\$0, %rbx
238
239	mulq	%rbp
240	addq	%rax, %r13
241	movq	48($inp), %rax
242	adcq	\$0, %rdx
243	addq	%rbx, %r13
244	movq	%rdx, %rbx
245	adcq	\$0, %rbx
246
247	mulq	%rbp
248	addq	%rax, %r14
249	movq	56($inp), %rax
250	adcq	\$0, %rdx
251	addq	%rbx, %r14
252	movq	%rdx, %rbx
253	adcq	\$0, %rbx
254
255	mulq	%rbp
256	addq	%rax, %r15
257	movq	%rbp, %rax
258	adcq	\$0, %rdx
259	addq	%rbx, %r15
260	adcq	\$0, %rdx
261
262	xorq	%rbx, %rbx		# rbx:r10:r9 = r10:r9 << 1
263	addq	%r9, %r9
264	 movq	%rdx, %r8
265	adcq	%r10, %r10
266	adcq	\$0, %rbx
267
268	mulq	%rax
269	# rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
270	addq	%rcx, %rax
271	 movq	16($inp), %rbp
272	addq	%rax, %r9
273	 movq	24($inp), %rax
274	adcq	%rdx, %r10
275	adcq	\$0, %rbx
276
277	movq	%r9, 16(%rsp)
278	movq	%r10, 24(%rsp)
279
280#third iteration
281	mulq	%rbp
282	addq	%rax, %r12
283	movq	32($inp), %rax
284	movq	%rdx, %rcx
285	adcq	\$0, %rcx
286
287	mulq	%rbp
288	addq	%rax, %r13
289	movq	40($inp), %rax
290	adcq	\$0, %rdx
291	addq	%rcx, %r13
292	movq	%rdx, %rcx
293	adcq	\$0, %rcx
294
295	mulq	%rbp
296	addq	%rax, %r14
297	movq	48($inp), %rax
298	adcq	\$0, %rdx
299	addq	%rcx, %r14
300	movq	%rdx, %rcx
301	adcq	\$0, %rcx
302
303	mulq	%rbp
304	addq	%rax, %r15
305	movq	56($inp), %rax
306	adcq	\$0, %rdx
307	addq	%rcx, %r15
308	movq	%rdx, %rcx
309	adcq	\$0, %rcx
310
311	mulq	%rbp
312	addq	%rax, %r8
313	movq	%rbp, %rax
314	adcq	\$0, %rdx
315	addq	%rcx, %r8
316	adcq	\$0, %rdx
317
318	xorq	%rcx, %rcx		# rcx:r12:r11 = r12:r11 << 1
319	addq	%r11, %r11
320	 movq	%rdx, %r9
321	adcq	%r12, %r12
322	adcq	\$0, %rcx
323
324	mulq	%rax
325	# rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
326	addq	%rbx, %rax
327	 movq	24($inp), %r10
328	addq	%rax, %r11
329	 movq	32($inp), %rax
330	adcq	%rdx, %r12
331	adcq	\$0, %rcx
332
333	movq	%r11, 32(%rsp)
334	movq	%r12, 40(%rsp)
335
336#fourth iteration
337	mov	%rax, %r11		# 32($inp)
338	mulq	%r10
339	addq	%rax, %r14
340	movq	40($inp), %rax
341	movq	%rdx, %rbx
342	adcq	\$0, %rbx
343
344	mov	%rax, %r12		# 40($inp)
345	mulq	%r10
346	addq	%rax, %r15
347	movq	48($inp), %rax
348	adcq	\$0, %rdx
349	addq	%rbx, %r15
350	movq	%rdx, %rbx
351	adcq	\$0, %rbx
352
353	mov	%rax, %rbp		# 48($inp)
354	mulq	%r10
355	addq	%rax, %r8
356	movq	56($inp), %rax
357	adcq	\$0, %rdx
358	addq	%rbx, %r8
359	movq	%rdx, %rbx
360	adcq	\$0, %rbx
361
362	mulq	%r10
363	addq	%rax, %r9
364	movq	%r10, %rax
365	adcq	\$0, %rdx
366	addq	%rbx, %r9
367	adcq	\$0, %rdx
368
369	xorq	%rbx, %rbx		# rbx:r13:r14 = r13:r14 << 1
370	addq	%r13, %r13
371	 movq	%rdx, %r10
372	adcq	%r14, %r14
373	adcq	\$0, %rbx
374
375	mulq	%rax
376	# rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
377	addq	%rcx, %rax
378	addq	%rax, %r13
379	 movq	%r12, %rax		# 40($inp)
380	adcq	%rdx, %r14
381	adcq	\$0, %rbx
382
383	movq	%r13, 48(%rsp)
384	movq	%r14, 56(%rsp)
385
386#fifth iteration
387	mulq	%r11
388	addq	%rax, %r8
389	movq	%rbp, %rax		# 48($inp)
390	movq	%rdx, %rcx
391	adcq	\$0, %rcx
392
393	mulq	%r11
394	addq	%rax, %r9
395	movq	56($inp), %rax
396	adcq	\$0, %rdx
397	addq	%rcx, %r9
398	movq	%rdx, %rcx
399	adcq	\$0, %rcx
400
401	mov	%rax, %r14		# 56($inp)
402	mulq	%r11
403	addq	%rax, %r10
404	movq	%r11, %rax
405	adcq	\$0, %rdx
406	addq	%rcx, %r10
407	adcq	\$0, %rdx
408
409	xorq	%rcx, %rcx		# rcx:r8:r15 = r8:r15 << 1
410	addq	%r15, %r15
411	 movq	%rdx, %r11
412	adcq	%r8, %r8
413	adcq	\$0, %rcx
414
415	mulq	%rax
416	# rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
417	addq	%rbx, %rax
418	addq	%rax, %r15
419	 movq	%rbp, %rax		# 48($inp)
420	adcq	%rdx, %r8
421	adcq	\$0, %rcx
422
423	movq	%r15, 64(%rsp)
424	movq	%r8, 72(%rsp)
425
426#sixth iteration
427	mulq	%r12
428	addq	%rax, %r10
429	movq	%r14, %rax		# 56($inp)
430	movq	%rdx, %rbx
431	adcq	\$0, %rbx
432
433	mulq	%r12
434	addq	%rax, %r11
435	movq	%r12, %rax
436	adcq	\$0, %rdx
437	addq	%rbx, %r11
438	adcq	\$0, %rdx
439
440	xorq	%rbx, %rbx		# rbx:r10:r9 = r10:r9 << 1
441	addq	%r9, %r9
442	 movq	%rdx, %r12
443	adcq	%r10, %r10
444	adcq	\$0, %rbx
445
446	mulq	%rax
447	# rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
448	addq	%rcx, %rax
449	addq	%rax, %r9
450	 movq	%r14, %rax		# 56($inp)
451	adcq	%rdx, %r10
452	adcq	\$0, %rbx
453
454	movq	%r9, 80(%rsp)
455	movq	%r10, 88(%rsp)
456
457#seventh iteration
458	mulq	%rbp
459	addq	%rax, %r12
460	movq	%rbp, %rax
461	adcq	\$0, %rdx
462
463	xorq	%rcx, %rcx		# rcx:r12:r11 = r12:r11 << 1
464	addq	%r11, %r11
465	 movq	%rdx, %r13
466	adcq	%r12, %r12
467	adcq	\$0, %rcx
468
469	mulq	%rax
470	# rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
471	addq	%rbx, %rax
472	addq	%rax, %r11
473	 movq	%r14, %rax		# 56($inp)
474	adcq	%rdx, %r12
475	adcq	\$0, %rcx
476
477	movq	%r11, 96(%rsp)
478	movq	%r12, 104(%rsp)
479
480#eighth iteration
481	xorq	%rbx, %rbx		# rbx:r13 = r13 << 1
482	addq	%r13, %r13
483	adcq	\$0, %rbx
484
485	mulq	%rax
486	# rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
487	addq	%rcx, %rax
488	addq	%r13, %rax
489	adcq	%rbx, %rdx
490
491	movq	(%rsp), %r8
492	movq	8(%rsp), %r9
493	movq	16(%rsp), %r10
494	movq	24(%rsp), %r11
495	movq	32(%rsp), %r12
496	movq	40(%rsp), %r13
497	movq	48(%rsp), %r14
498	movq	56(%rsp), %r15
499	movq	%xmm1, %rbp
500
501	movq	%rax, 112(%rsp)
502	movq	%rdx, 120(%rsp)
503
504	call	__rsaz_512_reduce
505
506	addq	64(%rsp), %r8
507	adcq	72(%rsp), %r9
508	adcq	80(%rsp), %r10
509	adcq	88(%rsp), %r11
510	adcq	96(%rsp), %r12
511	adcq	104(%rsp), %r13
512	adcq	112(%rsp), %r14
513	adcq	120(%rsp), %r15
514	sbbq	%rcx, %rcx
515
516	call	__rsaz_512_subtract
517
518	movq	%r8, %rdx
519	movq	%r9, %rax
520	movl	128+8(%rsp), $times
521	movq	$out, $inp
522
523	decl	$times
524	jnz	.Loop_sqr
525___
526if ($addx) {
527$code.=<<___;
528	jmp	.Lsqr_tail
529
530.align	32
531.Loop_sqrx:
532	movl	$times,128+8(%rsp)
533	movq	$out, %xmm0		# off-load
534#first iteration
535	mulx	%rax, %r8, %r9
536	mov	%rax, %rbx
537
538	mulx	16($inp), %rcx, %r10
539	xor	%rbp, %rbp		# cf=0, of=0
540
541	mulx	24($inp), %rax, %r11
542	adcx	%rcx, %r9
543
544	.byte	0xc4,0x62,0xf3,0xf6,0xa6,0x20,0x00,0x00,0x00	# mulx	32($inp), %rcx, %r12
545	adcx	%rax, %r10
546
547	.byte	0xc4,0x62,0xfb,0xf6,0xae,0x28,0x00,0x00,0x00	# mulx	40($inp), %rax, %r13
548	adcx	%rcx, %r11
549
550	mulx	48($inp), %rcx, %r14
551	adcx	%rax, %r12
552	adcx	%rcx, %r13
553
554	mulx	56($inp), %rax, %r15
555	adcx	%rax, %r14
556	adcx	%rbp, %r15		# %rbp is 0
557
558	mulx	%rdx, %rax, $out
559	 mov	%rbx, %rdx		# 8($inp)
560	xor	%rcx, %rcx
561	adox	%r8, %r8
562	adcx	$out, %r8
563	adox	%rbp, %rcx
564	adcx	%rbp, %rcx
565
566	mov	%rax, (%rsp)
567	mov	%r8, 8(%rsp)
568
569#second iteration
570	.byte	0xc4,0xe2,0xfb,0xf6,0x9e,0x10,0x00,0x00,0x00	# mulx	16($inp), %rax, %rbx
571	adox	%rax, %r10
572	adcx	%rbx, %r11
573
574	mulx	24($inp), $out, %r8
575	adox	$out, %r11
576	.byte	0x66
577	adcx	%r8, %r12
578
579	mulx	32($inp), %rax, %rbx
580	adox	%rax, %r12
581	adcx	%rbx, %r13
582
583	mulx	40($inp), $out, %r8
584	adox	$out, %r13
585	adcx	%r8, %r14
586
587	.byte	0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00	# mulx	48($inp), %rax, %rbx
588	adox	%rax, %r14
589	adcx	%rbx, %r15
590
591	.byte	0xc4,0x62,0xc3,0xf6,0x86,0x38,0x00,0x00,0x00	# mulx	56($inp), $out, %r8
592	adox	$out, %r15
593	adcx	%rbp, %r8
594	 mulx	%rdx, %rax, $out
595	adox	%rbp, %r8
596	 .byte	0x48,0x8b,0x96,0x10,0x00,0x00,0x00		# mov	16($inp), %rdx
597
598	xor	%rbx, %rbx
599	 adox	%r9, %r9
600	# rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
601	adcx	%rcx, %rax
602	adox	%r10, %r10
603	adcx	%rax, %r9
604	adox	%rbp, %rbx
605	adcx	$out, %r10
606	adcx	%rbp, %rbx
607
608	mov	%r9, 16(%rsp)
609	.byte	0x4c,0x89,0x94,0x24,0x18,0x00,0x00,0x00		# mov	%r10, 24(%rsp)
610
611#third iteration
612	mulx	24($inp), $out, %r9
613	adox	$out, %r12
614	adcx	%r9, %r13
615
616	mulx	32($inp), %rax, %rcx
617	adox	%rax, %r13
618	adcx	%rcx, %r14
619
620	.byte	0xc4,0x62,0xc3,0xf6,0x8e,0x28,0x00,0x00,0x00	# mulx	40($inp), $out, %r9
621	adox	$out, %r14
622	adcx	%r9, %r15
623
624	.byte	0xc4,0xe2,0xfb,0xf6,0x8e,0x30,0x00,0x00,0x00	# mulx	48($inp), %rax, %rcx
625	adox	%rax, %r15
626	adcx	%rcx, %r8
627
628	mulx	56($inp), $out, %r9
629	adox	$out, %r8
630	adcx	%rbp, %r9
631	 mulx	%rdx, %rax, $out
632	adox	%rbp, %r9
633	 mov	24($inp), %rdx
634
635	xor	%rcx, %rcx
636	 adox	%r11, %r11
637	# rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
638	adcx	%rbx, %rax
639	adox	%r12, %r12
640	adcx	%rax, %r11
641	adox	%rbp, %rcx
642	adcx	$out, %r12
643	adcx	%rbp, %rcx
644
645	mov	%r11, 32(%rsp)
646	mov	%r12, 40(%rsp)
647
648#fourth iteration
649	mulx	32($inp), %rax, %rbx
650	adox	%rax, %r14
651	adcx	%rbx, %r15
652
653	mulx	40($inp), $out, %r10
654	adox	$out, %r15
655	adcx	%r10, %r8
656
657	mulx	48($inp), %rax, %rbx
658	adox	%rax, %r8
659	adcx	%rbx, %r9
660
661	mulx	56($inp), $out, %r10
662	adox	$out, %r9
663	adcx	%rbp, %r10
664	 mulx	%rdx, %rax, $out
665	adox	%rbp, %r10
666	 mov	32($inp), %rdx
667
668	xor	%rbx, %rbx
669	 adox	%r13, %r13
670	# rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
671	adcx	%rcx, %rax
672	adox	%r14, %r14
673	adcx	%rax, %r13
674	adox	%rbp, %rbx
675	adcx	$out, %r14
676	adcx	%rbp, %rbx
677
678	mov	%r13, 48(%rsp)
679	mov	%r14, 56(%rsp)
680
681#fifth iteration
682	mulx	40($inp), $out, %r11
683	adox	$out, %r8
684	adcx	%r11, %r9
685
686	mulx	48($inp), %rax, %rcx
687	adox	%rax, %r9
688	adcx	%rcx, %r10
689
690	mulx	56($inp), $out, %r11
691	adox	$out, %r10
692	adcx	%rbp, %r11
693	 mulx	%rdx, %rax, $out
694	 mov	40($inp), %rdx
695	adox	%rbp, %r11
696
697	xor	%rcx, %rcx
698	 adox	%r15, %r15
699	# rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
700	adcx	%rbx, %rax
701	adox	%r8, %r8
702	adcx	%rax, %r15
703	adox	%rbp, %rcx
704	adcx	$out, %r8
705	adcx	%rbp, %rcx
706
707	mov	%r15, 64(%rsp)
708	mov	%r8, 72(%rsp)
709
710#sixth iteration
711	.byte	0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00	# mulx	48($inp), %rax, %rbx
712	adox	%rax, %r10
713	adcx	%rbx, %r11
714
715	.byte	0xc4,0x62,0xc3,0xf6,0xa6,0x38,0x00,0x00,0x00	# mulx	56($inp), $out, %r12
716	adox	$out, %r11
717	adcx	%rbp, %r12
718	 mulx	%rdx, %rax, $out
719	adox	%rbp, %r12
720	 mov	48($inp), %rdx
721
722	xor	%rbx, %rbx
723	 adox	%r9, %r9
724	# rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
725	adcx	%rcx, %rax
726	adox	%r10, %r10
727	adcx	%rax, %r9
728	adcx	$out, %r10
729	adox	%rbp, %rbx
730	adcx	%rbp, %rbx
731
732	mov	%r9, 80(%rsp)
733	mov	%r10, 88(%rsp)
734
735#seventh iteration
736	.byte	0xc4,0x62,0xfb,0xf6,0xae,0x38,0x00,0x00,0x00	# mulx	56($inp), %rax, %r13
737	adox	%rax, %r12
738	adox	%rbp, %r13
739
740	mulx	%rdx, %rax, $out
741	xor	%rcx, %rcx
742	 mov	56($inp), %rdx
743	 adox	%r11, %r11
744	# rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
745	adcx	%rbx, %rax
746	adox	%r12, %r12
747	adcx	%rax, %r11
748	adox	%rbp, %rcx
749	adcx	$out, %r12
750	adcx	%rbp, %rcx
751
752	.byte	0x4c,0x89,0x9c,0x24,0x60,0x00,0x00,0x00		# mov	%r11, 96(%rsp)
753	.byte	0x4c,0x89,0xa4,0x24,0x68,0x00,0x00,0x00		# mov	%r12, 104(%rsp)
754
755#eighth iteration
756	mulx	%rdx, %rax, %rdx
757	xor	%rbx, %rbx
758	 adox	%r13, %r13
759	# rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
760	adcx	%rcx, %rax
761	adox	%rbp, %rbx
762	adcx	%r13, %rax
763	adcx	%rdx, %rbx
764
765	movq	%xmm0, $out
766	movq	%xmm1, %rbp
767
768	movq	128(%rsp), %rdx		# pull $n0
769	movq	(%rsp), %r8
770	movq	8(%rsp), %r9
771	movq	16(%rsp), %r10
772	movq	24(%rsp), %r11
773	movq	32(%rsp), %r12
774	movq	40(%rsp), %r13
775	movq	48(%rsp), %r14
776	movq	56(%rsp), %r15
777
778	movq	%rax, 112(%rsp)
779	movq	%rbx, 120(%rsp)
780
781	call	__rsaz_512_reducex
782
783	addq	64(%rsp), %r8
784	adcq	72(%rsp), %r9
785	adcq	80(%rsp), %r10
786	adcq	88(%rsp), %r11
787	adcq	96(%rsp), %r12
788	adcq	104(%rsp), %r13
789	adcq	112(%rsp), %r14
790	adcq	120(%rsp), %r15
791	sbbq	%rcx, %rcx
792
793	call	__rsaz_512_subtract
794
795	movq	%r8, %rdx
796	movq	%r9, %rax
797	movl	128+8(%rsp), $times
798	movq	$out, $inp
799
800	decl	$times
801	jnz	.Loop_sqrx
802
803.Lsqr_tail:
804___
805}
806$code.=<<___;
807
808	leaq	128+24+48(%rsp), %rax
809	movq	-48(%rax), %r15
810	movq	-40(%rax), %r14
811	movq	-32(%rax), %r13
812	movq	-24(%rax), %r12
813	movq	-16(%rax), %rbp
814	movq	-8(%rax), %rbx
815	leaq	(%rax), %rsp
816.Lsqr_epilogue:
817	ret
818.size	rsaz_512_sqr,.-rsaz_512_sqr
819___
820}
821{
822my ($out,$ap,$bp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8");
823$code.=<<___;
824.globl	rsaz_512_mul
825.type	rsaz_512_mul,\@function,5
826.align	32
827rsaz_512_mul:
828	push	%rbx
829	push	%rbp
830	push	%r12
831	push	%r13
832	push	%r14
833	push	%r15
834
835	subq	\$128+24, %rsp
836.Lmul_body:
837	movq	$out, %xmm0		# off-load arguments
838	movq	$mod, %xmm1
839	movq	$n0, 128(%rsp)
840___
841$code.=<<___ if ($addx);
842	movl	\$0x80100,%r11d
843	andl	OPENSSL_ia32cap_P+8(%rip),%r11d
844	cmpl	\$0x80100,%r11d		# check for MULX and ADCX/ADOX
845	je	.Lmulx
846___
847$code.=<<___;
848	movq	($bp), %rbx		# pass b[0]
849	movq	$bp, %rbp		# pass argument
850	call	__rsaz_512_mul
851
852	movq	%xmm0, $out
853	movq	%xmm1, %rbp
854
855	movq	(%rsp), %r8
856	movq	8(%rsp), %r9
857	movq	16(%rsp), %r10
858	movq	24(%rsp), %r11
859	movq	32(%rsp), %r12
860	movq	40(%rsp), %r13
861	movq	48(%rsp), %r14
862	movq	56(%rsp), %r15
863
864	call	__rsaz_512_reduce
865___
866$code.=<<___ if ($addx);
867	jmp	.Lmul_tail
868
869.align	32
870.Lmulx:
871	movq	$bp, %rbp		# pass argument
872	movq	($bp), %rdx		# pass b[0]
873	call	__rsaz_512_mulx
874
875	movq	%xmm0, $out
876	movq	%xmm1, %rbp
877
878	movq	128(%rsp), %rdx		# pull $n0
879	movq	(%rsp), %r8
880	movq	8(%rsp), %r9
881	movq	16(%rsp), %r10
882	movq	24(%rsp), %r11
883	movq	32(%rsp), %r12
884	movq	40(%rsp), %r13
885	movq	48(%rsp), %r14
886	movq	56(%rsp), %r15
887
888	call	__rsaz_512_reducex
889.Lmul_tail:
890___
891$code.=<<___;
892	addq	64(%rsp), %r8
893	adcq	72(%rsp), %r9
894	adcq	80(%rsp), %r10
895	adcq	88(%rsp), %r11
896	adcq	96(%rsp), %r12
897	adcq	104(%rsp), %r13
898	adcq	112(%rsp), %r14
899	adcq	120(%rsp), %r15
900	sbbq	%rcx, %rcx
901
902	call	__rsaz_512_subtract
903
904	leaq	128+24+48(%rsp), %rax
905	movq	-48(%rax), %r15
906	movq	-40(%rax), %r14
907	movq	-32(%rax), %r13
908	movq	-24(%rax), %r12
909	movq	-16(%rax), %rbp
910	movq	-8(%rax), %rbx
911	leaq	(%rax), %rsp
912.Lmul_epilogue:
913	ret
914.size	rsaz_512_mul,.-rsaz_512_mul
915___
916}
917{
918my ($out,$ap,$bp,$mod,$n0,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
919$code.=<<___;
920.globl	rsaz_512_mul_gather4
921.type	rsaz_512_mul_gather4,\@function,6
922.align	32
923rsaz_512_mul_gather4:
924	push	%rbx
925	push	%rbp
926	push	%r12
927	push	%r13
928	push	%r14
929	push	%r15
930
931	subq	\$`128+24+($win64?0xb0:0)`, %rsp
932___
933$code.=<<___	if ($win64);
934	movaps	%xmm6,0xa0(%rsp)
935	movaps	%xmm7,0xb0(%rsp)
936	movaps	%xmm8,0xc0(%rsp)
937	movaps	%xmm9,0xd0(%rsp)
938	movaps	%xmm10,0xe0(%rsp)
939	movaps	%xmm11,0xf0(%rsp)
940	movaps	%xmm12,0x100(%rsp)
941	movaps	%xmm13,0x110(%rsp)
942	movaps	%xmm14,0x120(%rsp)
943	movaps	%xmm15,0x130(%rsp)
944___
945$code.=<<___;
946.Lmul_gather4_body:
947	movd	$pwr,%xmm8
948	movdqa	.Linc+16(%rip),%xmm1	# 00000002000000020000000200000002
949	movdqa	.Linc(%rip),%xmm0	# 00000001000000010000000000000000
950
951	pshufd	\$0,%xmm8,%xmm8		# broadcast $power
952	movdqa	%xmm1,%xmm7
953	movdqa	%xmm1,%xmm2
954___
955########################################################################
956# calculate mask by comparing 0..15 to $power
957#
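# A note on the layout and why this is done with masks rather than an
# indexed load: the table at $bp interleaves all 16 powers, so the entry
# for power j of limb i sits at 128*i + 8*j and each 16-byte load below
# covers two adjacent powers.  %xmm0..%xmm7 become 64-bit lane masks
# (a lane is all ones iff its index equals $power), and the pand/por
# cascade touches every one of the 16 entries on every pass.  The memory
# access pattern is therefore independent of the secret $power index;
# only the mask decides which qword survives.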
958for($i=0;$i<4;$i++) {
959$code.=<<___;
960	paddd	%xmm`$i`,%xmm`$i+1`
961	pcmpeqd	%xmm8,%xmm`$i`
962	movdqa	%xmm7,%xmm`$i+3`
963___
964}
965for(;$i<7;$i++) {
966$code.=<<___;
967	paddd	%xmm`$i`,%xmm`$i+1`
968	pcmpeqd	%xmm8,%xmm`$i`
969___
970}
971$code.=<<___;
972	pcmpeqd	%xmm8,%xmm7
973
974	movdqa	16*0($bp),%xmm8
975	movdqa	16*1($bp),%xmm9
976	movdqa	16*2($bp),%xmm10
977	movdqa	16*3($bp),%xmm11
978	pand	%xmm0,%xmm8
979	movdqa	16*4($bp),%xmm12
980	pand	%xmm1,%xmm9
981	movdqa	16*5($bp),%xmm13
982	pand	%xmm2,%xmm10
983	movdqa	16*6($bp),%xmm14
984	pand	%xmm3,%xmm11
985	movdqa	16*7($bp),%xmm15
986	leaq	128($bp), %rbp
987	pand	%xmm4,%xmm12
988	pand	%xmm5,%xmm13
989	pand	%xmm6,%xmm14
990	pand	%xmm7,%xmm15
991	por	%xmm10,%xmm8
992	por	%xmm11,%xmm9
993	por	%xmm12,%xmm8
994	por	%xmm13,%xmm9
995	por	%xmm14,%xmm8
996	por	%xmm15,%xmm9
997
998	por	%xmm9,%xmm8
999	pshufd	\$0x4e,%xmm8,%xmm9
1000	por	%xmm9,%xmm8
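	# the selected table word now occupies both qwords of %xmm8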
1001___
1002$code.=<<___ if ($addx);
1003	movl	\$0x80100,%r11d
1004	andl	OPENSSL_ia32cap_P+8(%rip),%r11d
1005	cmpl	\$0x80100,%r11d		# check for MULX and ADCX/ADOX
1006	je	.Lmulx_gather
1007___
1008$code.=<<___;
1009	movq	%xmm8,%rbx
1010
1011	movq	$n0, 128(%rsp)		# off-load arguments
1012	movq	$out, 128+8(%rsp)
1013	movq	$mod, 128+16(%rsp)
1014
1015	movq	($ap), %rax
1016	 movq	8($ap), %rcx
1017	mulq	%rbx			# 0 iteration
1018	movq	%rax, (%rsp)
1019	movq	%rcx, %rax
1020	movq	%rdx, %r8
1021
1022	mulq	%rbx
1023	addq	%rax, %r8
1024	movq	16($ap), %rax
1025	movq	%rdx, %r9
1026	adcq	\$0, %r9
1027
1028	mulq	%rbx
1029	addq	%rax, %r9
1030	movq	24($ap), %rax
1031	movq	%rdx, %r10
1032	adcq	\$0, %r10
1033
1034	mulq	%rbx
1035	addq	%rax, %r10
1036	movq	32($ap), %rax
1037	movq	%rdx, %r11
1038	adcq	\$0, %r11
1039
1040	mulq	%rbx
1041	addq	%rax, %r11
1042	movq	40($ap), %rax
1043	movq	%rdx, %r12
1044	adcq	\$0, %r12
1045
1046	mulq	%rbx
1047	addq	%rax, %r12
1048	movq	48($ap), %rax
1049	movq	%rdx, %r13
1050	adcq	\$0, %r13
1051
1052	mulq	%rbx
1053	addq	%rax, %r13
1054	movq	56($ap), %rax
1055	movq	%rdx, %r14
1056	adcq	\$0, %r14
1057
1058	mulq	%rbx
1059	addq	%rax, %r14
1060	 movq	($ap), %rax
1061	movq	%rdx, %r15
1062	adcq	\$0, %r15
1063
1064	leaq	8(%rsp), %rdi
1065	movl	\$7, %ecx
1066	jmp	.Loop_mul_gather
1067
1068.align	32
1069.Loop_mul_gather:
1070	movdqa	16*0(%rbp),%xmm8
1071	movdqa	16*1(%rbp),%xmm9
1072	movdqa	16*2(%rbp),%xmm10
1073	movdqa	16*3(%rbp),%xmm11
1074	pand	%xmm0,%xmm8
1075	movdqa	16*4(%rbp),%xmm12
1076	pand	%xmm1,%xmm9
1077	movdqa	16*5(%rbp),%xmm13
1078	pand	%xmm2,%xmm10
1079	movdqa	16*6(%rbp),%xmm14
1080	pand	%xmm3,%xmm11
1081	movdqa	16*7(%rbp),%xmm15
1082	leaq	128(%rbp), %rbp
1083	pand	%xmm4,%xmm12
1084	pand	%xmm5,%xmm13
1085	pand	%xmm6,%xmm14
1086	pand	%xmm7,%xmm15
1087	por	%xmm10,%xmm8
1088	por	%xmm11,%xmm9
1089	por	%xmm12,%xmm8
1090	por	%xmm13,%xmm9
1091	por	%xmm14,%xmm8
1092	por	%xmm15,%xmm9
1093
1094	por	%xmm9,%xmm8
1095	pshufd	\$0x4e,%xmm8,%xmm9
1096	por	%xmm9,%xmm8
1097	movq	%xmm8,%rbx
1098
1099	mulq	%rbx
1100	addq	%rax, %r8
1101	movq	8($ap), %rax
1102	movq	%r8, (%rdi)
1103	movq	%rdx, %r8
1104	adcq	\$0, %r8
1105
1106	mulq	%rbx
1107	addq	%rax, %r9
1108	movq	16($ap), %rax
1109	adcq	\$0, %rdx
1110	addq	%r9, %r8
1111	movq	%rdx, %r9
1112	adcq	\$0, %r9
1113
1114	mulq	%rbx
1115	addq	%rax, %r10
1116	movq	24($ap), %rax
1117	adcq	\$0, %rdx
1118	addq	%r10, %r9
1119	movq	%rdx, %r10
1120	adcq	\$0, %r10
1121
1122	mulq	%rbx
1123	addq	%rax, %r11
1124	movq	32($ap), %rax
1125	adcq	\$0, %rdx
1126	addq	%r11, %r10
1127	movq	%rdx, %r11
1128	adcq	\$0, %r11
1129
1130	mulq	%rbx
1131	addq	%rax, %r12
1132	movq	40($ap), %rax
1133	adcq	\$0, %rdx
1134	addq	%r12, %r11
1135	movq	%rdx, %r12
1136	adcq	\$0, %r12
1137
1138	mulq	%rbx
1139	addq	%rax, %r13
1140	movq	48($ap), %rax
1141	adcq	\$0, %rdx
1142	addq	%r13, %r12
1143	movq	%rdx, %r13
1144	adcq	\$0, %r13
1145
1146	mulq	%rbx
1147	addq	%rax, %r14
1148	movq	56($ap), %rax
1149	adcq	\$0, %rdx
1150	addq	%r14, %r13
1151	movq	%rdx, %r14
1152	adcq	\$0, %r14
1153
1154	mulq	%rbx
1155	addq	%rax, %r15
1156	 movq	($ap), %rax
1157	adcq	\$0, %rdx
1158	addq	%r15, %r14
1159	movq	%rdx, %r15
1160	adcq	\$0, %r15
1161
1162	leaq	8(%rdi), %rdi
1163
1164	decl	%ecx
1165	jnz	.Loop_mul_gather
1166
1167	movq	%r8, (%rdi)
1168	movq	%r9, 8(%rdi)
1169	movq	%r10, 16(%rdi)
1170	movq	%r11, 24(%rdi)
1171	movq	%r12, 32(%rdi)
1172	movq	%r13, 40(%rdi)
1173	movq	%r14, 48(%rdi)
1174	movq	%r15, 56(%rdi)
1175
1176	movq	128+8(%rsp), $out
1177	movq	128+16(%rsp), %rbp
1178
1179	movq	(%rsp), %r8
1180	movq	8(%rsp), %r9
1181	movq	16(%rsp), %r10
1182	movq	24(%rsp), %r11
1183	movq	32(%rsp), %r12
1184	movq	40(%rsp), %r13
1185	movq	48(%rsp), %r14
1186	movq	56(%rsp), %r15
1187
1188	call	__rsaz_512_reduce
1189___
1190$code.=<<___ if ($addx);
1191	jmp	.Lmul_gather_tail
1192
1193.align	32
1194.Lmulx_gather:
1195	movq	%xmm8,%rdx
1196
1197	mov	$n0, 128(%rsp)		# off-load arguments
1198	mov	$out, 128+8(%rsp)
1199	mov	$mod, 128+16(%rsp)
1200
1201	mulx	($ap), %rbx, %r8	# 0 iteration
1202	mov	%rbx, (%rsp)
1203	xor	%edi, %edi		# cf=0, of=0
1204
1205	mulx	8($ap), %rax, %r9
1206
1207	mulx	16($ap), %rbx, %r10
1208	adcx	%rax, %r8
1209
1210	mulx	24($ap), %rax, %r11
1211	adcx	%rbx, %r9
1212
1213	mulx	32($ap), %rbx, %r12
1214	adcx	%rax, %r10
1215
1216	mulx	40($ap), %rax, %r13
1217	adcx	%rbx, %r11
1218
1219	mulx	48($ap), %rbx, %r14
1220	adcx	%rax, %r12
1221
1222	mulx	56($ap), %rax, %r15
1223	adcx	%rbx, %r13
1224	adcx	%rax, %r14
1225	.byte	0x67
1226	mov	%r8, %rbx
1227	adcx	%rdi, %r15		# %rdi is 0
1228
1229	mov	\$-7, %rcx
1230	jmp	.Loop_mulx_gather
1231
1232.align	32
1233.Loop_mulx_gather:
1234	movdqa	16*0(%rbp),%xmm8
1235	movdqa	16*1(%rbp),%xmm9
1236	movdqa	16*2(%rbp),%xmm10
1237	movdqa	16*3(%rbp),%xmm11
1238	pand	%xmm0,%xmm8
1239	movdqa	16*4(%rbp),%xmm12
1240	pand	%xmm1,%xmm9
1241	movdqa	16*5(%rbp),%xmm13
1242	pand	%xmm2,%xmm10
1243	movdqa	16*6(%rbp),%xmm14
1244	pand	%xmm3,%xmm11
1245	movdqa	16*7(%rbp),%xmm15
1246	leaq	128(%rbp), %rbp
1247	pand	%xmm4,%xmm12
1248	pand	%xmm5,%xmm13
1249	pand	%xmm6,%xmm14
1250	pand	%xmm7,%xmm15
1251	por	%xmm10,%xmm8
1252	por	%xmm11,%xmm9
1253	por	%xmm12,%xmm8
1254	por	%xmm13,%xmm9
1255	por	%xmm14,%xmm8
1256	por	%xmm15,%xmm9
1257
1258	por	%xmm9,%xmm8
1259	pshufd	\$0x4e,%xmm8,%xmm9
1260	por	%xmm9,%xmm8
1261	movq	%xmm8,%rdx
1262
1263	.byte	0xc4,0x62,0xfb,0xf6,0x86,0x00,0x00,0x00,0x00	# mulx	($ap), %rax, %r8
1264	adcx	%rax, %rbx
1265	adox	%r9, %r8
1266
1267	mulx	8($ap), %rax, %r9
1268	adcx	%rax, %r8
1269	adox	%r10, %r9
1270
1271	mulx	16($ap), %rax, %r10
1272	adcx	%rax, %r9
1273	adox	%r11, %r10
1274
1275	.byte	0xc4,0x62,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00	# mulx	24($ap), %rax, %r11
1276	adcx	%rax, %r10
1277	adox	%r12, %r11
1278
1279	mulx	32($ap), %rax, %r12
1280	adcx	%rax, %r11
1281	adox	%r13, %r12
1282
1283	mulx	40($ap), %rax, %r13
1284	adcx	%rax, %r12
1285	adox	%r14, %r13
1286
1287	.byte	0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00	# mulx	48($ap), %rax, %r14
1288	adcx	%rax, %r13
1289	.byte	0x67
1290	adox	%r15, %r14
1291
1292	mulx	56($ap), %rax, %r15
1293	 mov	%rbx, 64(%rsp,%rcx,8)
1294	adcx	%rax, %r14
1295	adox	%rdi, %r15
1296	mov	%r8, %rbx
1297	adcx	%rdi, %r15		# cf=0
1298
1299	inc	%rcx			# of=0
1300	jnz	.Loop_mulx_gather
1301
1302	mov	%r8, 64(%rsp)
1303	mov	%r9, 64+8(%rsp)
1304	mov	%r10, 64+16(%rsp)
1305	mov	%r11, 64+24(%rsp)
1306	mov	%r12, 64+32(%rsp)
1307	mov	%r13, 64+40(%rsp)
1308	mov	%r14, 64+48(%rsp)
1309	mov	%r15, 64+56(%rsp)
1310
1311	mov	128(%rsp), %rdx		# pull arguments
1312	mov	128+8(%rsp), $out
1313	mov	128+16(%rsp), %rbp
1314
1315	mov	(%rsp), %r8
1316	mov	8(%rsp), %r9
1317	mov	16(%rsp), %r10
1318	mov	24(%rsp), %r11
1319	mov	32(%rsp), %r12
1320	mov	40(%rsp), %r13
1321	mov	48(%rsp), %r14
1322	mov	56(%rsp), %r15
1323
1324	call	__rsaz_512_reducex
1325
1326.Lmul_gather_tail:
1327___
1328$code.=<<___;
1329	addq	64(%rsp), %r8
1330	adcq	72(%rsp), %r9
1331	adcq	80(%rsp), %r10
1332	adcq	88(%rsp), %r11
1333	adcq	96(%rsp), %r12
1334	adcq	104(%rsp), %r13
1335	adcq	112(%rsp), %r14
1336	adcq	120(%rsp), %r15
1337	sbbq	%rcx, %rcx
1338
1339	call	__rsaz_512_subtract
1340
1341	leaq	128+24+48(%rsp), %rax
1342___
1343$code.=<<___	if ($win64);
1344	movaps	0xa0-0xc8(%rax),%xmm6
1345	movaps	0xb0-0xc8(%rax),%xmm7
1346	movaps	0xc0-0xc8(%rax),%xmm8
1347	movaps	0xd0-0xc8(%rax),%xmm9
1348	movaps	0xe0-0xc8(%rax),%xmm10
1349	movaps	0xf0-0xc8(%rax),%xmm11
1350	movaps	0x100-0xc8(%rax),%xmm12
1351	movaps	0x110-0xc8(%rax),%xmm13
1352	movaps	0x120-0xc8(%rax),%xmm14
1353	movaps	0x130-0xc8(%rax),%xmm15
1354	lea	0xb0(%rax),%rax
1355___
1356$code.=<<___;
1357	movq	-48(%rax), %r15
1358	movq	-40(%rax), %r14
1359	movq	-32(%rax), %r13
1360	movq	-24(%rax), %r12
1361	movq	-16(%rax), %rbp
1362	movq	-8(%rax), %rbx
1363	leaq	(%rax), %rsp
1364.Lmul_gather4_epilogue:
1365	ret
1366.size	rsaz_512_mul_gather4,.-rsaz_512_mul_gather4
1367___
1368}
1369{
1370my ($out,$ap,$mod,$n0,$tbl,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
1371$code.=<<___;
1372.globl	rsaz_512_mul_scatter4
1373.type	rsaz_512_mul_scatter4,\@function,6
1374.align	32
1375rsaz_512_mul_scatter4:
1376	push	%rbx
1377	push	%rbp
1378	push	%r12
1379	push	%r13
1380	push	%r14
1381	push	%r15
1382
1383	mov	$pwr, $pwr
1384	subq	\$128+24, %rsp
1385.Lmul_scatter4_body:
1386	leaq	($tbl,$pwr,8), $tbl
1387	movq	$out, %xmm0		# off-load arguments
1388	movq	$mod, %xmm1
1389	movq	$tbl, %xmm2
1390	movq	$n0, 128(%rsp)
1391
1392	movq	$out, %rbp
1393___
1394$code.=<<___ if ($addx);
1395	movl	\$0x80100,%r11d
1396	andl	OPENSSL_ia32cap_P+8(%rip),%r11d
1397	cmpl	\$0x80100,%r11d		# check for MULX and ADCX/ADOX
1398	je	.Lmulx_scatter
1399___
1400$code.=<<___;
1401	movq	($out),%rbx		# pass b[0]
1402	call	__rsaz_512_mul
1403
1404	movq	%xmm0, $out
1405	movq	%xmm1, %rbp
1406
1407	movq	(%rsp), %r8
1408	movq	8(%rsp), %r9
1409	movq	16(%rsp), %r10
1410	movq	24(%rsp), %r11
1411	movq	32(%rsp), %r12
1412	movq	40(%rsp), %r13
1413	movq	48(%rsp), %r14
1414	movq	56(%rsp), %r15
1415
1416	call	__rsaz_512_reduce
1417___
1418$code.=<<___ if ($addx);
1419	jmp	.Lmul_scatter_tail
1420
1421.align	32
1422.Lmulx_scatter:
1423	movq	($out), %rdx		# pass b[0]
1424	call	__rsaz_512_mulx
1425
1426	movq	%xmm0, $out
1427	movq	%xmm1, %rbp
1428
1429	movq	128(%rsp), %rdx		# pull $n0
1430	movq	(%rsp), %r8
1431	movq	8(%rsp), %r9
1432	movq	16(%rsp), %r10
1433	movq	24(%rsp), %r11
1434	movq	32(%rsp), %r12
1435	movq	40(%rsp), %r13
1436	movq	48(%rsp), %r14
1437	movq	56(%rsp), %r15
1438
1439	call	__rsaz_512_reducex
1440
1441.Lmul_scatter_tail:
1442___
1443$code.=<<___;
1444	addq	64(%rsp), %r8
1445	adcq	72(%rsp), %r9
1446	adcq	80(%rsp), %r10
1447	adcq	88(%rsp), %r11
1448	adcq	96(%rsp), %r12
1449	adcq	104(%rsp), %r13
1450	adcq	112(%rsp), %r14
1451	adcq	120(%rsp), %r15
1452	movq	%xmm2, $inp
1453	sbbq	%rcx, %rcx
1454
1455	call	__rsaz_512_subtract
1456
1457	movq	%r8, 128*0($inp)	# scatter
1458	movq	%r9, 128*1($inp)
1459	movq	%r10, 128*2($inp)
1460	movq	%r11, 128*3($inp)
1461	movq	%r12, 128*4($inp)
1462	movq	%r13, 128*5($inp)
1463	movq	%r14, 128*6($inp)
1464	movq	%r15, 128*7($inp)
1465
1466	leaq	128+24+48(%rsp), %rax
1467	movq	-48(%rax), %r15
1468	movq	-40(%rax), %r14
1469	movq	-32(%rax), %r13
1470	movq	-24(%rax), %r12
1471	movq	-16(%rax), %rbp
1472	movq	-8(%rax), %rbx
1473	leaq	(%rax), %rsp
1474.Lmul_scatter4_epilogue:
1475	ret
1476.size	rsaz_512_mul_scatter4,.-rsaz_512_mul_scatter4
1477___
1478}
1479{
1480my ($out,$inp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx");
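# rsaz_512_mul_by_one Montgomery-reduces the 512-bit input against $mod
# with an implicitly zero upper half, i.e. it multiplies by 1 in
# Montgomery terms, in effect converting a value out of the Montgomery
# domain at the end of an exponentiation.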
1481$code.=<<___;
1482.globl	rsaz_512_mul_by_one
1483.type	rsaz_512_mul_by_one,\@function,4
1484.align	32
1485rsaz_512_mul_by_one:
1486	push	%rbx
1487	push	%rbp
1488	push	%r12
1489	push	%r13
1490	push	%r14
1491	push	%r15
1492
1493	subq	\$128+24, %rsp
1494.Lmul_by_one_body:
1495___
1496$code.=<<___ if ($addx);
1497	movl	OPENSSL_ia32cap_P+8(%rip),%eax
1498___
1499$code.=<<___;
1500	movq	$mod, %rbp	# reassign argument
1501	movq	$n0, 128(%rsp)
1502
1503	movq	($inp), %r8
1504	pxor	%xmm0, %xmm0
1505	movq	8($inp), %r9
1506	movq	16($inp), %r10
1507	movq	24($inp), %r11
1508	movq	32($inp), %r12
1509	movq	40($inp), %r13
1510	movq	48($inp), %r14
1511	movq	56($inp), %r15
1512
1513	movdqa	%xmm0, (%rsp)
1514	movdqa	%xmm0, 16(%rsp)
1515	movdqa	%xmm0, 32(%rsp)
1516	movdqa	%xmm0, 48(%rsp)
1517	movdqa	%xmm0, 64(%rsp)
1518	movdqa	%xmm0, 80(%rsp)
1519	movdqa	%xmm0, 96(%rsp)
1520___
1521$code.=<<___ if ($addx);
1522	andl	\$0x80100,%eax
1523	cmpl	\$0x80100,%eax		# check for MULX and ADCX/ADOX
1524	je	.Lby_one_callx
1525___
1526$code.=<<___;
1527	call	__rsaz_512_reduce
1528___
1529$code.=<<___ if ($addx);
1530	jmp	.Lby_one_tail
1531.align	32
1532.Lby_one_callx:
1533	movq	128(%rsp), %rdx		# pull $n0
1534	call	__rsaz_512_reducex
1535.Lby_one_tail:
1536___
1537$code.=<<___;
1538	movq	%r8, ($out)
1539	movq	%r9, 8($out)
1540	movq	%r10, 16($out)
1541	movq	%r11, 24($out)
1542	movq	%r12, 32($out)
1543	movq	%r13, 40($out)
1544	movq	%r14, 48($out)
1545	movq	%r15, 56($out)
1546
1547	leaq	128+24+48(%rsp), %rax
1548	movq	-48(%rax), %r15
1549	movq	-40(%rax), %r14
1550	movq	-32(%rax), %r13
1551	movq	-24(%rax), %r12
1552	movq	-16(%rax), %rbp
1553	movq	-8(%rax), %rbx
1554	leaq	(%rax), %rsp
1555.Lmul_by_one_epilogue:
1556	ret
1557.size	rsaz_512_mul_by_one,.-rsaz_512_mul_by_one
1558___
1559}
1560{	# __rsaz_512_reduce
1561	#
1562	# input:	%r8-%r15, %rbp - mod, 128(%rsp) - n0
1563	# output:	%r8-%r15
1564	# clobbers:	everything except %rbp and %rdi
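	#
	# This is a word-by-word Montgomery reduction; n0 is the usual
	# constant -mod^-1 mod 2^64.  A rough model of each of the eight
	# iterations below (acc = running value, least significant word
	# first):
	#
	#	m    = (acc[0] * n0) mod 2^64
	#	acc += m * mod			# acc[0] becomes zero
	#	acc  = acc >> 64		# drop the zeroed word
	#
	# Only the lower eight words travel through %r8-%r15; the upper
	# half of the product stays on the stack, which is why the callers
	# add 64(%rsp)..120(%rsp) back in afterwards.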
1565$code.=<<___;
1566.type	__rsaz_512_reduce,\@abi-omnipotent
1567.align	32
1568__rsaz_512_reduce:
1569	movq	%r8, %rbx
1570	imulq	128+8(%rsp), %rbx
1571	movq	0(%rbp), %rax
1572	movl	\$8, %ecx
1573	jmp	.Lreduction_loop
1574
1575.align	32
1576.Lreduction_loop:
1577	mulq	%rbx
1578	movq	8(%rbp), %rax
1579	negq	%r8
1580	movq	%rdx, %r8
1581	adcq	\$0, %r8
1582
1583	mulq	%rbx
1584	addq	%rax, %r9
1585	movq	16(%rbp), %rax
1586	adcq	\$0, %rdx
1587	addq	%r9, %r8
1588	movq	%rdx, %r9
1589	adcq	\$0, %r9
1590
1591	mulq	%rbx
1592	addq	%rax, %r10
1593	movq	24(%rbp), %rax
1594	adcq	\$0, %rdx
1595	addq	%r10, %r9
1596	movq	%rdx, %r10
1597	adcq	\$0, %r10
1598
1599	mulq	%rbx
1600	addq	%rax, %r11
1601	movq	32(%rbp), %rax
1602	adcq	\$0, %rdx
1603	addq	%r11, %r10
1604	 movq	128+8(%rsp), %rsi
1605	#movq	%rdx, %r11
1606	#adcq	\$0, %r11
1607	adcq	\$0, %rdx
1608	movq	%rdx, %r11
1609
1610	mulq	%rbx
1611	addq	%rax, %r12
1612	movq	40(%rbp), %rax
1613	adcq	\$0, %rdx
1614	 imulq	%r8, %rsi
1615	addq	%r12, %r11
1616	movq	%rdx, %r12
1617	adcq	\$0, %r12
1618
1619	mulq	%rbx
1620	addq	%rax, %r13
1621	movq	48(%rbp), %rax
1622	adcq	\$0, %rdx
1623	addq	%r13, %r12
1624	movq	%rdx, %r13
1625	adcq	\$0, %r13
1626
1627	mulq	%rbx
1628	addq	%rax, %r14
1629	movq	56(%rbp), %rax
1630	adcq	\$0, %rdx
1631	addq	%r14, %r13
1632	movq	%rdx, %r14
1633	adcq	\$0, %r14
1634
1635	mulq	%rbx
1636	 movq	%rsi, %rbx
1637	addq	%rax, %r15
1638	 movq	0(%rbp), %rax
1639	adcq	\$0, %rdx
1640	addq	%r15, %r14
1641	movq	%rdx, %r15
1642	adcq	\$0, %r15
1643
1644	decl	%ecx
1645	jne	.Lreduction_loop
1646
1647	ret
1648.size	__rsaz_512_reduce,.-__rsaz_512_reduce
1649___
1650}
1651if ($addx) {
1652	# __rsaz_512_reducex
1653	#
1654	# input:	%r8-%r15, %rbp - mod, 128(%rsp) - n0
1655	# output:	%r8-%r15
1656	# clobbers:	everything except %rbp and %rdi
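	#
	# MULX/ADCX/ADOX flavour of __rsaz_512_reduce: the same eight-pass
	# word-by-word Montgomery reduction, but with the two carry chains
	# (ADCX and ADOX) running in parallel.  The caller preloads n0 in
	# %rdx so the first multiplier can be formed with a single IMUL;
	# subsequent multipliers are formed mid-loop against n0 kept at
	# 128+8(%rsp).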
1657$code.=<<___;
1658.type	__rsaz_512_reducex,\@abi-omnipotent
1659.align	32
1660__rsaz_512_reducex:
1661	#movq	128+8(%rsp), %rdx		# pull $n0
1662	imulq	%r8, %rdx
1663	xorq	%rsi, %rsi			# cf=0,of=0
1664	movl	\$8, %ecx
1665	jmp	.Lreduction_loopx
1666
1667.align	32
1668.Lreduction_loopx:
1669	mov	%r8, %rbx
1670	mulx	0(%rbp), %rax, %r8
1671	adcx	%rbx, %rax
1672	adox	%r9, %r8
1673
1674	mulx	8(%rbp), %rax, %r9
1675	adcx	%rax, %r8
1676	adox	%r10, %r9
1677
1678	mulx	16(%rbp), %rbx, %r10
1679	adcx	%rbx, %r9
1680	adox	%r11, %r10
1681
1682	mulx	24(%rbp), %rbx, %r11
1683	adcx	%rbx, %r10
1684	adox	%r12, %r11
1685
1686	.byte	0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00	# mulx	32(%rbp), %rbx, %r12
1687	 mov	%rdx, %rax
1688	 mov	%r8, %rdx
1689	adcx	%rbx, %r11
1690	adox	%r13, %r12
1691
1692	 mulx	128+8(%rsp), %rbx, %rdx
1693	 mov	%rax, %rdx
1694
1695	mulx	40(%rbp), %rax, %r13
1696	adcx	%rax, %r12
1697	adox	%r14, %r13
1698
1699	.byte	0xc4,0x62,0xfb,0xf6,0xb5,0x30,0x00,0x00,0x00	# mulx	48(%rbp), %rax, %r14
1700	adcx	%rax, %r13
1701	adox	%r15, %r14
1702
1703	mulx	56(%rbp), %rax, %r15
1704	 mov	%rbx, %rdx
1705	adcx	%rax, %r14
1706	adox	%rsi, %r15			# %rsi is 0
1707	adcx	%rsi, %r15			# cf=0
1708
1709	decl	%ecx				# of=0
1710	jne	.Lreduction_loopx
1711
1712	ret
1713.size	__rsaz_512_reducex,.-__rsaz_512_reducex
1714___
1715}
1716{	# __rsaz_512_subtract
1717	# input: %r8-%r15, %rdi - $out, %rbp - $mod, %rcx - mask
1718	# output:
1719	# clobbers: everything but %rdi, %rsi and %rbp
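	#
	# The mask in %rcx is expected to be either 0 or all ones (the
	# callers produce it with sbbq %rcx,%rcx).  %r8-%r15 are stored to
	# $out, then ~mod+1 (the two's complement of the modulus), ANDed
	# with the mask, is added back in: the modulus is subtracted
	# exactly when the mask is all ones, and the store/add pattern is
	# identical either way, so there is no secret-dependent branch.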
1720$code.=<<___;
1721.type	__rsaz_512_subtract,\@abi-omnipotent
1722.align	32
1723__rsaz_512_subtract:
1724	movq	%r8, ($out)
1725	movq	%r9, 8($out)
1726	movq	%r10, 16($out)
1727	movq	%r11, 24($out)
1728	movq	%r12, 32($out)
1729	movq	%r13, 40($out)
1730	movq	%r14, 48($out)
1731	movq	%r15, 56($out)
1732
1733	movq	0($mod), %r8
1734	movq	8($mod), %r9
1735	negq	%r8
1736	notq	%r9
1737	andq	%rcx, %r8
1738	movq	16($mod), %r10
1739	andq	%rcx, %r9
1740	notq	%r10
1741	movq	24($mod), %r11
1742	andq	%rcx, %r10
1743	notq	%r11
1744	movq	32($mod), %r12
1745	andq	%rcx, %r11
1746	notq	%r12
1747	movq	40($mod), %r13
1748	andq	%rcx, %r12
1749	notq	%r13
1750	movq	48($mod), %r14
1751	andq	%rcx, %r13
1752	notq	%r14
1753	movq	56($mod), %r15
1754	andq	%rcx, %r14
1755	notq	%r15
1756	andq	%rcx, %r15
1757
1758	addq	($out), %r8
1759	adcq	8($out), %r9
1760	adcq	16($out), %r10
1761	adcq	24($out), %r11
1762	adcq	32($out), %r12
1763	adcq	40($out), %r13
1764	adcq	48($out), %r14
1765	adcq	56($out), %r15
1766
1767	movq	%r8, ($out)
1768	movq	%r9, 8($out)
1769	movq	%r10, 16($out)
1770	movq	%r11, 24($out)
1771	movq	%r12, 32($out)
1772	movq	%r13, 40($out)
1773	movq	%r14, 48($out)
1774	movq	%r15, 56($out)
1775
1776	ret
1777.size	__rsaz_512_subtract,.-__rsaz_512_subtract
1778___
1779}
1780{	# __rsaz_512_mul
1781	#
1782	# input: %rsi - ap, %rbp - bp
1783	# output:
1784	# clobbers: everything
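	#
	# Plain schoolbook 512x512->1024-bit multiply: the first pass
	# multiplies all eight words of ap by b[0], then .Loop_mul runs
	# seven more times, each pass accumulating ap[]*b[i] and retiring
	# one finished product word to the stack.  Note the leaq 8(%rsp):
	# the product lands in the scratch area of the caller's frame,
	# just above the return address pushed by the call.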
1785my ($ap,$bp) = ("%rsi","%rbp");
1786$code.=<<___;
1787.type	__rsaz_512_mul,\@abi-omnipotent
1788.align	32
1789__rsaz_512_mul:
1790	leaq	8(%rsp), %rdi
1791
1792	movq	($ap), %rax
1793	mulq	%rbx
1794	movq	%rax, (%rdi)
1795	movq	8($ap), %rax
1796	movq	%rdx, %r8
1797
1798	mulq	%rbx
1799	addq	%rax, %r8
1800	movq	16($ap), %rax
1801	movq	%rdx, %r9
1802	adcq	\$0, %r9
1803
1804	mulq	%rbx
1805	addq	%rax, %r9
1806	movq	24($ap), %rax
1807	movq	%rdx, %r10
1808	adcq	\$0, %r10
1809
1810	mulq	%rbx
1811	addq	%rax, %r10
1812	movq	32($ap), %rax
1813	movq	%rdx, %r11
1814	adcq	\$0, %r11
1815
1816	mulq	%rbx
1817	addq	%rax, %r11
1818	movq	40($ap), %rax
1819	movq	%rdx, %r12
1820	adcq	\$0, %r12
1821
1822	mulq	%rbx
1823	addq	%rax, %r12
1824	movq	48($ap), %rax
1825	movq	%rdx, %r13
1826	adcq	\$0, %r13
1827
1828	mulq	%rbx
1829	addq	%rax, %r13
1830	movq	56($ap), %rax
1831	movq	%rdx, %r14
1832	adcq	\$0, %r14
1833
1834	mulq	%rbx
1835	addq	%rax, %r14
1836	 movq	($ap), %rax
1837	movq	%rdx, %r15
1838	adcq	\$0, %r15
1839
1840	leaq	8($bp), $bp
1841	leaq	8(%rdi), %rdi
1842
1843	movl	\$7, %ecx
1844	jmp	.Loop_mul
1845
1846.align	32
1847.Loop_mul:
1848	movq	($bp), %rbx
1849	mulq	%rbx
1850	addq	%rax, %r8
1851	movq	8($ap), %rax
1852	movq	%r8, (%rdi)
1853	movq	%rdx, %r8
1854	adcq	\$0, %r8
1855
1856	mulq	%rbx
1857	addq	%rax, %r9
1858	movq	16($ap), %rax
1859	adcq	\$0, %rdx
1860	addq	%r9, %r8
1861	movq	%rdx, %r9
1862	adcq	\$0, %r9
1863
1864	mulq	%rbx
1865	addq	%rax, %r10
1866	movq	24($ap), %rax
1867	adcq	\$0, %rdx
1868	addq	%r10, %r9
1869	movq	%rdx, %r10
1870	adcq	\$0, %r10
1871
1872	mulq	%rbx
1873	addq	%rax, %r11
1874	movq	32($ap), %rax
1875	adcq	\$0, %rdx
1876	addq	%r11, %r10
1877	movq	%rdx, %r11
1878	adcq	\$0, %r11
1879
1880	mulq	%rbx
1881	addq	%rax, %r12
1882	movq	40($ap), %rax
1883	adcq	\$0, %rdx
1884	addq	%r12, %r11
1885	movq	%rdx, %r12
1886	adcq	\$0, %r12
1887
1888	mulq	%rbx
1889	addq	%rax, %r13
1890	movq	48($ap), %rax
1891	adcq	\$0, %rdx
1892	addq	%r13, %r12
1893	movq	%rdx, %r13
1894	adcq	\$0, %r13
1895
1896	mulq	%rbx
1897	addq	%rax, %r14
1898	movq	56($ap), %rax
1899	adcq	\$0, %rdx
1900	addq	%r14, %r13
1901	movq	%rdx, %r14
1902	 leaq	8($bp), $bp
1903	adcq	\$0, %r14
1904
1905	mulq	%rbx
1906	addq	%rax, %r15
1907	 movq	($ap), %rax
1908	adcq	\$0, %rdx
1909	addq	%r15, %r14
1910	movq	%rdx, %r15
1911	adcq	\$0, %r15
1912
1913	leaq	8(%rdi), %rdi
1914
1915	decl	%ecx
1916	jnz	.Loop_mul
1917
1918	movq	%r8, (%rdi)
1919	movq	%r9, 8(%rdi)
1920	movq	%r10, 16(%rdi)
1921	movq	%r11, 24(%rdi)
1922	movq	%r12, 32(%rdi)
1923	movq	%r13, 40(%rdi)
1924	movq	%r14, 48(%rdi)
1925	movq	%r15, 56(%rdi)
1926
1927	ret
1928.size	__rsaz_512_mul,.-__rsaz_512_mul
1929___
1930}
1931if ($addx) {
1932	# __rsaz_512_mulx
1933	#
1934	# input: %rsi - ap, %rbp - bp
1935	# output:
1936	# clobbers: everything
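	#
	# MULX/ADCX/ADOX flavour of __rsaz_512_mul.  MULX leaves CF/OF
	# untouched, so two accumulation chains can run in parallel: ADCX
	# adds the low halves of the new partial products into the current
	# window while ADOX folds in the previous high words.  b[0] is
	# preloaded in %rdx by the caller; the first and last passes are
	# peeled off around the six-iteration .Loop_mulx.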
1937my ($ap,$bp,$zero) = ("%rsi","%rbp","%rdi");
1938$code.=<<___;
1939.type	__rsaz_512_mulx,\@abi-omnipotent
1940.align	32
1941__rsaz_512_mulx:
1942	mulx	($ap), %rbx, %r8	# initial %rdx preloaded by caller
1943	mov	\$-6, %rcx
1944
1945	mulx	8($ap), %rax, %r9
1946	movq	%rbx, 8(%rsp)
1947
1948	mulx	16($ap), %rbx, %r10
1949	adc	%rax, %r8
1950
1951	mulx	24($ap), %rax, %r11
1952	adc	%rbx, %r9
1953
1954	mulx	32($ap), %rbx, %r12
1955	adc	%rax, %r10
1956
1957	mulx	40($ap), %rax, %r13
1958	adc	%rbx, %r11
1959
1960	mulx	48($ap), %rbx, %r14
1961	adc	%rax, %r12
1962
1963	mulx	56($ap), %rax, %r15
1964	 mov	8($bp), %rdx
1965	adc	%rbx, %r13
1966	adc	%rax, %r14
1967	adc	\$0, %r15
1968
1969	xor	$zero, $zero		# cf=0,of=0
1970	jmp	.Loop_mulx
1971
1972.align	32
1973.Loop_mulx:
1974	movq	%r8, %rbx
1975	mulx	($ap), %rax, %r8
1976	adcx	%rax, %rbx
1977	adox	%r9, %r8
1978
1979	mulx	8($ap), %rax, %r9
1980	adcx	%rax, %r8
1981	adox	%r10, %r9
1982
1983	mulx	16($ap), %rax, %r10
1984	adcx	%rax, %r9
1985	adox	%r11, %r10
1986
1987	mulx	24($ap), %rax, %r11
1988	adcx	%rax, %r10
1989	adox	%r12, %r11
1990
1991	.byte	0x3e,0xc4,0x62,0xfb,0xf6,0xa6,0x20,0x00,0x00,0x00	# mulx	32($ap), %rax, %r12
1992	adcx	%rax, %r11
1993	adox	%r13, %r12
1994
1995	mulx	40($ap), %rax, %r13
1996	adcx	%rax, %r12
1997	adox	%r14, %r13
1998
1999	mulx	48($ap), %rax, %r14
2000	adcx	%rax, %r13
2001	adox	%r15, %r14
2002
2003	mulx	56($ap), %rax, %r15
2004	 movq	64($bp,%rcx,8), %rdx
2005	 movq	%rbx, 8+64-8(%rsp,%rcx,8)
2006	adcx	%rax, %r14
2007	adox	$zero, %r15
2008	adcx	$zero, %r15		# cf=0
2009
2010	inc	%rcx			# of=0
2011	jnz	.Loop_mulx
2012
2013	movq	%r8, %rbx
2014	mulx	($ap), %rax, %r8
2015	adcx	%rax, %rbx
2016	adox	%r9, %r8
2017
2018	.byte	0xc4,0x62,0xfb,0xf6,0x8e,0x08,0x00,0x00,0x00	# mulx	8($ap), %rax, %r9
2019	adcx	%rax, %r8
2020	adox	%r10, %r9
2021
2022	.byte	0xc4,0x62,0xfb,0xf6,0x96,0x10,0x00,0x00,0x00	# mulx	16($ap), %rax, %r10
2023	adcx	%rax, %r9
2024	adox	%r11, %r10
2025
2026	mulx	24($ap), %rax, %r11
2027	adcx	%rax, %r10
2028	adox	%r12, %r11
2029
2030	mulx	32($ap), %rax, %r12
2031	adcx	%rax, %r11
2032	adox	%r13, %r12
2033
2034	mulx	40($ap), %rax, %r13
2035	adcx	%rax, %r12
2036	adox	%r14, %r13
2037
2038	.byte	0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00	# mulx	48($ap), %rax, %r14
2039	adcx	%rax, %r13
2040	adox	%r15, %r14
2041
2042	.byte	0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00	# mulx	56($ap), %rax, %r15
2043	adcx	%rax, %r14
2044	adox	$zero, %r15
2045	adcx	$zero, %r15
2046
2047	mov	%rbx, 8+64-8(%rsp)
2048	mov	%r8, 8+64(%rsp)
2049	mov	%r9, 8+64+8(%rsp)
2050	mov	%r10, 8+64+16(%rsp)
2051	mov	%r11, 8+64+24(%rsp)
2052	mov	%r12, 8+64+32(%rsp)
2053	mov	%r13, 8+64+40(%rsp)
2054	mov	%r14, 8+64+48(%rsp)
2055	mov	%r15, 8+64+56(%rsp)
2056
2057	ret
2058.size	__rsaz_512_mulx,.-__rsaz_512_mulx
2059___
2060}
2061{
2062my ($out,$inp,$power)= $win64 ? ("%rcx","%rdx","%r8d") : ("%rdi","%rsi","%edx");
2063$code.=<<___;
2064.globl	rsaz_512_scatter4
2065.type	rsaz_512_scatter4,\@abi-omnipotent
2066.align	16
2067rsaz_512_scatter4:
2068	leaq	($out,$power,8), $out
2069	movl	\$8, %r9d
2070	jmp	.Loop_scatter
2071.align	16
2072.Loop_scatter:
2073	movq	($inp), %rax
2074	leaq	8($inp), $inp
2075	movq	%rax, ($out)
2076	leaq	128($out), $out
2077	decl	%r9d
2078	jnz	.Loop_scatter
2079	ret
2080.size	rsaz_512_scatter4,.-rsaz_512_scatter4
2081
2082.globl	rsaz_512_gather4
2083.type	rsaz_512_gather4,\@abi-omnipotent
2084.align	16
2085rsaz_512_gather4:
2086___
2087$code.=<<___	if ($win64);
2088.LSEH_begin_rsaz_512_gather4:
2089	.byte	0x48,0x81,0xec,0xa8,0x00,0x00,0x00	# sub    $0xa8,%rsp
2090	.byte	0x0f,0x29,0x34,0x24			# movaps %xmm6,(%rsp)
2091	.byte	0x0f,0x29,0x7c,0x24,0x10		# movaps %xmm7,0x10(%rsp)
2092	.byte	0x44,0x0f,0x29,0x44,0x24,0x20		# movaps %xmm8,0x20(%rsp)
2093	.byte	0x44,0x0f,0x29,0x4c,0x24,0x30		# movaps %xmm9,0x30(%rsp)
2094	.byte	0x44,0x0f,0x29,0x54,0x24,0x40		# movaps %xmm10,0x40(%rsp)
2095	.byte	0x44,0x0f,0x29,0x5c,0x24,0x50		# movaps %xmm11,0x50(%rsp)
2096	.byte	0x44,0x0f,0x29,0x64,0x24,0x60		# movaps %xmm12,0x60(%rsp)
2097	.byte	0x44,0x0f,0x29,0x6c,0x24,0x70		# movaps %xmm13,0x70(%rsp)
2098	.byte	0x44,0x0f,0x29,0xb4,0x24,0x80,0,0,0	# movaps %xmm14,0x80(%rsp)
2099	.byte	0x44,0x0f,0x29,0xbc,0x24,0x90,0,0,0	# movaps %xmm15,0x90(%rsp)
2100___
2101$code.=<<___;
2102	movd	$power,%xmm8
2103	movdqa	.Linc+16(%rip),%xmm1	# 00000002000000020000000200000002
2104	movdqa	.Linc(%rip),%xmm0	# 00000001000000010000000000000000
2105
2106	pshufd	\$0,%xmm8,%xmm8		# broadcast $power
2107	movdqa	%xmm1,%xmm7
2108	movdqa	%xmm1,%xmm2
2109___
2110########################################################################
2111# calculate mask by comparing 0..15 to $power
2112#
2113for($i=0;$i<4;$i++) {
2114$code.=<<___;
2115	paddd	%xmm`$i`,%xmm`$i+1`
2116	pcmpeqd	%xmm8,%xmm`$i`
2117	movdqa	%xmm7,%xmm`$i+3`
2118___
2119}
2120for(;$i<7;$i++) {
2121$code.=<<___;
2122	paddd	%xmm`$i`,%xmm`$i+1`
2123	pcmpeqd	%xmm8,%xmm`$i`
2124___
2125}
2126$code.=<<___;
2127	pcmpeqd	%xmm8,%xmm7
2128	movl	\$8, %r9d
2129	jmp	.Loop_gather
2130.align	16
2131.Loop_gather:
2132	movdqa	16*0($inp),%xmm8
2133	movdqa	16*1($inp),%xmm9
2134	movdqa	16*2($inp),%xmm10
2135	movdqa	16*3($inp),%xmm11
2136	pand	%xmm0,%xmm8
2137	movdqa	16*4($inp),%xmm12
2138	pand	%xmm1,%xmm9
2139	movdqa	16*5($inp),%xmm13
2140	pand	%xmm2,%xmm10
2141	movdqa	16*6($inp),%xmm14
2142	pand	%xmm3,%xmm11
2143	movdqa	16*7($inp),%xmm15
2144	leaq	128($inp), $inp
2145	pand	%xmm4,%xmm12
2146	pand	%xmm5,%xmm13
2147	pand	%xmm6,%xmm14
2148	pand	%xmm7,%xmm15
2149	por	%xmm10,%xmm8
2150	por	%xmm11,%xmm9
2151	por	%xmm12,%xmm8
2152	por	%xmm13,%xmm9
2153	por	%xmm14,%xmm8
2154	por	%xmm15,%xmm9
2155
2156	por	%xmm9,%xmm8
2157	pshufd	\$0x4e,%xmm8,%xmm9
2158	por	%xmm9,%xmm8
2159	movq	%xmm8,($out)
2160	leaq	8($out), $out
2161	decl	%r9d
2162	jnz	.Loop_gather
2163___
2164$code.=<<___	if ($win64);
2165	movaps	0x00(%rsp),%xmm6
2166	movaps	0x10(%rsp),%xmm7
2167	movaps	0x20(%rsp),%xmm8
2168	movaps	0x30(%rsp),%xmm9
2169	movaps	0x40(%rsp),%xmm10
2170	movaps	0x50(%rsp),%xmm11
2171	movaps	0x60(%rsp),%xmm12
2172	movaps	0x70(%rsp),%xmm13
2173	movaps	0x80(%rsp),%xmm14
2174	movaps	0x90(%rsp),%xmm15
2175	add	\$0xa8,%rsp
2176___
2177$code.=<<___;
2178	ret
2179.LSEH_end_rsaz_512_gather4:
2180.size	rsaz_512_gather4,.-rsaz_512_gather4
2181
2182.align	64
2183.Linc:
2184	.long	0,0, 1,1
2185	.long	2,2, 2,2
2186___
2187}
2188
2189# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
2190#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
2191if ($win64) {
2192$rec="%rcx";
2193$frame="%rdx";
2194$context="%r8";
2195$disp="%r9";
2196
2197$code.=<<___;
2198.extern	__imp_RtlVirtualUnwind
2199.type	se_handler,\@abi-omnipotent
2200.align	16
2201se_handler:
2202	push	%rsi
2203	push	%rdi
2204	push	%rbx
2205	push	%rbp
2206	push	%r12
2207	push	%r13
2208	push	%r14
2209	push	%r15
2210	pushfq
2211	sub	\$64,%rsp
2212
2213	mov	120($context),%rax	# pull context->Rax
2214	mov	248($context),%rbx	# pull context->Rip
2215
2216	mov	8($disp),%rsi		# disp->ImageBase
2217	mov	56($disp),%r11		# disp->HandlerData
2218
2219	mov	0(%r11),%r10d		# HandlerData[0]
2220	lea	(%rsi,%r10),%r10	# end of prologue label
2221	cmp	%r10,%rbx		# context->Rip<end of prologue label
2222	jb	.Lcommon_seh_tail
2223
2224	mov	152($context),%rax	# pull context->Rsp
2225
2226	mov	4(%r11),%r10d		# HandlerData[1]
2227	lea	(%rsi,%r10),%r10	# epilogue label
2228	cmp	%r10,%rbx		# context->Rip>=epilogue label
2229	jae	.Lcommon_seh_tail
2230
2231	lea	128+24+48(%rax),%rax
2232
2233	lea	.Lmul_gather4_epilogue(%rip),%rbx
2234	cmp	%r10,%rbx
2235	jne	.Lse_not_in_mul_gather4
2236
2237	lea	0xb0(%rax),%rax
2238
2239	lea	-48-0xa8(%rax),%rsi
2240	lea	512($context),%rdi
2241	mov	\$20,%ecx
2242	.long	0xa548f3fc		# cld; rep movsq
2243
2244.Lse_not_in_mul_gather4:
2245	mov	-8(%rax),%rbx
2246	mov	-16(%rax),%rbp
2247	mov	-24(%rax),%r12
2248	mov	-32(%rax),%r13
2249	mov	-40(%rax),%r14
2250	mov	-48(%rax),%r15
2251	mov	%rbx,144($context)	# restore context->Rbx
2252	mov	%rbp,160($context)	# restore context->Rbp
2253	mov	%r12,216($context)	# restore context->R12
2254	mov	%r13,224($context)	# restore context->R13
2255	mov	%r14,232($context)	# restore context->R14
2256	mov	%r15,240($context)	# restore context->R15
2257
2258.Lcommon_seh_tail:
2259	mov	8(%rax),%rdi
2260	mov	16(%rax),%rsi
2261	mov	%rax,152($context)	# restore context->Rsp
2262	mov	%rsi,168($context)	# restore context->Rsi
2263	mov	%rdi,176($context)	# restore context->Rdi
2264
2265	mov	40($disp),%rdi		# disp->ContextRecord
2266	mov	$context,%rsi		# context
2267	mov	\$154,%ecx		# sizeof(CONTEXT)
2268	.long	0xa548f3fc		# cld; rep movsq
2269
2270	mov	$disp,%rsi
2271	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
2272	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
2273	mov	0(%rsi),%r8		# arg3, disp->ControlPc
2274	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
2275	mov	40(%rsi),%r10		# disp->ContextRecord
2276	lea	56(%rsi),%r11		# &disp->HandlerData
2277	lea	24(%rsi),%r12		# &disp->EstablisherFrame
2278	mov	%r10,32(%rsp)		# arg5
2279	mov	%r11,40(%rsp)		# arg6
2280	mov	%r12,48(%rsp)		# arg7
2281	mov	%rcx,56(%rsp)		# arg8, (NULL)
2282	call	*__imp_RtlVirtualUnwind(%rip)
2283
2284	mov	\$1,%eax		# ExceptionContinueSearch
2285	add	\$64,%rsp
2286	popfq
2287	pop	%r15
2288	pop	%r14
2289	pop	%r13
2290	pop	%r12
2291	pop	%rbp
2292	pop	%rbx
2293	pop	%rdi
2294	pop	%rsi
2295	ret
2296.size	se_handler,.-se_handler
2297
2298.section	.pdata
2299.align	4
2300	.rva	.LSEH_begin_rsaz_512_sqr
2301	.rva	.LSEH_end_rsaz_512_sqr
2302	.rva	.LSEH_info_rsaz_512_sqr
2303
2304	.rva	.LSEH_begin_rsaz_512_mul
2305	.rva	.LSEH_end_rsaz_512_mul
2306	.rva	.LSEH_info_rsaz_512_mul
2307
2308	.rva	.LSEH_begin_rsaz_512_mul_gather4
2309	.rva	.LSEH_end_rsaz_512_mul_gather4
2310	.rva	.LSEH_info_rsaz_512_mul_gather4
2311
2312	.rva	.LSEH_begin_rsaz_512_mul_scatter4
2313	.rva	.LSEH_end_rsaz_512_mul_scatter4
2314	.rva	.LSEH_info_rsaz_512_mul_scatter4
2315
2316	.rva	.LSEH_begin_rsaz_512_mul_by_one
2317	.rva	.LSEH_end_rsaz_512_mul_by_one
2318	.rva	.LSEH_info_rsaz_512_mul_by_one
2319
2320	.rva	.LSEH_begin_rsaz_512_gather4
2321	.rva	.LSEH_end_rsaz_512_gather4
2322	.rva	.LSEH_info_rsaz_512_gather4
2323
2324.section	.xdata
2325.align	8
2326.LSEH_info_rsaz_512_sqr:
2327	.byte	9,0,0,0
2328	.rva	se_handler
2329	.rva	.Lsqr_body,.Lsqr_epilogue			# HandlerData[]
2330.LSEH_info_rsaz_512_mul:
2331	.byte	9,0,0,0
2332	.rva	se_handler
2333	.rva	.Lmul_body,.Lmul_epilogue			# HandlerData[]
2334.LSEH_info_rsaz_512_mul_gather4:
2335	.byte	9,0,0,0
2336	.rva	se_handler
2337	.rva	.Lmul_gather4_body,.Lmul_gather4_epilogue	# HandlerData[]
2338.LSEH_info_rsaz_512_mul_scatter4:
2339	.byte	9,0,0,0
2340	.rva	se_handler
2341	.rva	.Lmul_scatter4_body,.Lmul_scatter4_epilogue	# HandlerData[]
2342.LSEH_info_rsaz_512_mul_by_one:
2343	.byte	9,0,0,0
2344	.rva	se_handler
2345	.rva	.Lmul_by_one_body,.Lmul_by_one_epilogue		# HandlerData[]
2346.LSEH_info_rsaz_512_gather4:
2347	.byte	0x01,0x46,0x16,0x00
2348	.byte	0x46,0xf8,0x09,0x00	# vmovaps 0x90(rsp),xmm15
2349	.byte	0x3d,0xe8,0x08,0x00	# vmovaps 0x80(rsp),xmm14
2350	.byte	0x34,0xd8,0x07,0x00	# vmovaps 0x70(rsp),xmm13
2351	.byte	0x2e,0xc8,0x06,0x00	# vmovaps 0x60(rsp),xmm12
2352	.byte	0x28,0xb8,0x05,0x00	# vmovaps 0x50(rsp),xmm11
2353	.byte	0x22,0xa8,0x04,0x00	# vmovaps 0x40(rsp),xmm10
2354	.byte	0x1c,0x98,0x03,0x00	# vmovaps 0x30(rsp),xmm9
2355	.byte	0x16,0x88,0x02,0x00	# vmovaps 0x20(rsp),xmm8
2356	.byte	0x10,0x78,0x01,0x00	# vmovaps 0x10(rsp),xmm7
2357	.byte	0x0b,0x68,0x00,0x00	# vmovaps 0x00(rsp),xmm6
2358	.byte	0x07,0x01,0x15,0x00	# sub     rsp,0xa8
2359___
2360}
2361
2362$code =~ s/\`([^\`]*)\`/eval $1/gem;
2363print $code;
2364close STDOUT;
2365