# rsaz-x86_64.pl revision 296279
#!/usr/bin/env perl

##############################################################################
#                                                                            #
#  Copyright (c) 2012, Intel Corporation                                     #
#                                                                            #
#  All rights reserved.                                                      #
#                                                                            #
#  Redistribution and use in source and binary forms, with or without        #
#  modification, are permitted provided that the following conditions are    #
#  met:                                                                      #
#                                                                            #
#  *  Redistributions of source code must retain the above copyright         #
#     notice, this list of conditions and the following disclaimer.          #
#                                                                            #
#  *  Redistributions in binary form must reproduce the above copyright      #
#     notice, this list of conditions and the following disclaimer in the    #
#     documentation and/or other materials provided with the                 #
#     distribution.                                                          #
#                                                                            #
#  *  Neither the name of the Intel Corporation nor the names of its         #
#     contributors may be used to endorse or promote products derived from   #
#     this software without specific prior written permission.               #
#                                                                            #
#                                                                            #
#  THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY          #
#  EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE         #
#  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR        #
#  PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR            #
#  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,     #
#  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,       #
#  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR        #
#  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF    #
#  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING      #
#  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS        #
#  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.              #
#                                                                            #
##############################################################################
# Developers and authors:                                                    #
# Shay Gueron (1, 2), and Vlad Krasnov (1)                                   #
# (1) Intel Architecture Group, Microprocessor and Chipset Development,      #
#     Israel Development Center, Haifa, Israel                               #
# (2) University of Haifa                                                    #
##############################################################################
# Reference:                                                                 #
# [1] S. Gueron, "Efficient Software Implementations of Modular              #
#     Exponentiation", http://eprint.iacr.org/2011/239                       #
# [2] S. Gueron, V. Krasnov. "Speeding up Big-Numbers Squaring".             #
#     IEEE Proceedings of 9th International Conference on Information        #
#     Technology: New Generations (ITNG 2012), 821-823 (2012).               #
# [3] S. Gueron, Efficient Software Implementations of Modular Exponentiation#
#     Journal of Cryptographic Engineering 2:31-43 (2012).                   #
# [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis    #
#     resistant 512-bit and 1024-bit modular exponentiation for optimizing   #
#     RSA1024 and RSA2048 on x86_64 platforms",                              #
#     http://rt.openssl.org/Ticket/Display.html?id=2582&user=guest&pass=guest#
##############################################################################

# While original submission covers 512- and 1024-bit exponentiation,
# this module is limited to 512-bit version only (and as such
# accelerates RSA1024 sign). This is because improvement for longer
# keys is not high enough to justify the effort, highest measured
# was ~5% on Westmere. [This is relative to OpenSSL 1.0.2, upcoming
# for the moment of this writing!] Nor does this module implement
# "monolithic" complete exponentiation jumbo-subroutine, but adheres
# to more modular mixture of C and assembly. And it's optimized even
# for processors other than Intel Core family (see table below for
# improvement coefficients).
# 						<appro@openssl.org>
#
# RSA1024 sign/sec	this/original	|this/rsax(*)	this/fips(*)
#			----------------+---------------------------
# Opteron		+13%		|+5%		+20%
# Bulldozer		-0%		|-1%		+10%
# P4			+11%		|+7%		+8%
# Westmere		+5%		|+14%		+17%
# Sandy Bridge		+2%		|+12%		+29%
# Ivy Bridge		+1%		|+11%		+35%
# Haswell(**)		-0%		|+12%		+39%
# Atom			+13%		|+11%		+4%
# VIA Nano		+70%		|+9%		+25%
#
# (*)	rsax engine and fips numbers are presented for reference
#	purposes;
# (**)	MULX was attempted, but found to give only marginal improvement;

# Command-line handling and toolchain probing.
# argv: <flavour> <output-file>; if only one arg with a '.' it is the output.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Win64 ABI is selected by nasm/masm/mingw64 flavours or an .asm output name.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the perlasm translator next to this script or in ../../perlasm,
# then pipe all generated code through it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

# Probe the assembler for MULX/ADCX/ADOX (ADX) support; $addx gates the
# emission of the faster mulx-based code paths below.
# GNU as >= 2.23 knows the ADX instructions.
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$addx = ($1>=2.23);
}

# NASM >= 2.10 (Win64 only).
if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$addx = ($1>=2.10);
}

# MASM ml64 >= version 12 (Win64 only).
if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$addx = ($1>=12);
}

# clang's integrated assembler >= 3.03 also supports ADX.
if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9])\.([0-9]+)/) {
	my $ver = $2 + $3/100.0;	# 3.1->3.01, 3.10->3.10
	$addx = ($ver>=3.03);
}
($out, $inp, $mod) = ("%rdi", "%rsi", "%rbp");	# common internal API
{
# void rsaz_512_sqr(u64 *out, const u64 *inp, const u64 *mod, u64 n0, int times);
# Squares a 512-bit number $times times (Montgomery domain), using either a
# schoolbook mulq path or, when ADX is available at run time, a mulx path.
my ($out,$inp,$mod,$n0,$times) = ("%rdi","%rsi","%rdx","%rcx","%r8d");

$code.=<<___;
.text

.extern	OPENSSL_ia32cap_P

.globl	rsaz_512_sqr
.type	rsaz_512_sqr,\@function,5
.align	32
rsaz_512_sqr:				# 25-29% faster than rsaz_512_mul
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	subq	\$128+24, %rsp
.Lsqr_body:
	movq	$mod, %rbp		# common argument
	movq	($inp), %rdx
	movq	8($inp), %rax
	movq	$n0, 128(%rsp)
___
$code.=<<___ if ($addx);
	movl	\$0x80100,%r11d
	andl	OPENSSL_ia32cap_P+8(%rip),%r11d
	cmpl	\$0x80100,%r11d		# check for MULX and ADO/CX
	je	.Loop_sqrx
___
$code.=<<___;
	jmp	.Loop_sqr

.align	32
.Loop_sqr:
	movl	$times,128+8(%rsp)
#first iteration
	movq	%rdx, %rbx
	mulq	%rdx
	movq	%rax, %r8
	movq	16($inp), %rax
	movq	%rdx, %r9

	mulq	%rbx
	addq	%rax, %r9
	movq	24($inp), %rax
	movq	%rdx, %r10
	adcq	\$0, %r10

	mulq	%rbx
	addq	%rax, %r10
	movq	32($inp), %rax
	movq	%rdx, %r11
	adcq	\$0, %r11

	mulq	%rbx
	addq	%rax, %r11
	movq	40($inp), %rax
	movq	%rdx, %r12
	adcq	\$0, %r12

	mulq	%rbx
	addq	%rax, %r12
	movq	48($inp), %rax
	movq	%rdx, %r13
	adcq	\$0, %r13

	mulq	%rbx
	addq	%rax, %r13
	movq	56($inp), %rax
	movq	%rdx, %r14
	adcq	\$0, %r14

	mulq	%rbx
	addq	%rax, %r14
	movq	%rbx, %rax
	movq	%rdx, %r15
	adcq	\$0, %r15

	addq	%r8, %r8		#shlq	\$1, %r8
	movq	%r9, %rcx
	adcq	%r9, %r9		#shld	\$1, %r8, %r9

	mulq	%rax
	movq	%rax, (%rsp)
	addq	%rdx, %r8
	adcq	\$0, %r9

	movq	%r8, 8(%rsp)
	shrq	\$63, %rcx

#second iteration
	movq	8($inp), %r8
	movq	16($inp), %rax
	mulq	%r8
	addq	%rax, %r10
	movq	24($inp), %rax
	movq	%rdx, %rbx
	adcq	\$0, %rbx

	mulq	%r8
	addq	%rax, %r11
	movq	32($inp), %rax
	adcq	\$0, %rdx
	addq	%rbx, %r11
	movq	%rdx, %rbx
	adcq	\$0, %rbx

	mulq	%r8
	addq	%rax, %r12
	movq	40($inp), %rax
	adcq	\$0, %rdx
	addq	%rbx, %r12
	movq	%rdx, %rbx
	adcq	\$0, %rbx

	mulq	%r8
	addq	%rax, %r13
	movq	48($inp), %rax
	adcq	\$0, %rdx
	addq	%rbx, %r13
	movq	%rdx, %rbx
	adcq	\$0, %rbx

	mulq	%r8
	addq	%rax, %r14
	movq	56($inp), %rax
	adcq	\$0, %rdx
	addq	%rbx, %r14
	movq	%rdx, %rbx
	adcq	\$0, %rbx

	mulq	%r8
	addq	%rax, %r15
	movq	%r8, %rax
	adcq	\$0, %rdx
	addq	%rbx, %r15
	movq	%rdx, %r8
	movq	%r10, %rdx
	adcq	\$0, %r8

	add	%rdx, %rdx
	lea	(%rcx,%r10,2), %r10	#shld	\$1, %rcx, %r10
	movq	%r11, %rbx
	adcq	%r11, %r11		#shld	\$1, %r10, %r11

	mulq	%rax
	addq	%rax, %r9
	adcq	%rdx, %r10
	adcq	\$0, %r11

	movq	%r9, 16(%rsp)
	movq	%r10, 24(%rsp)
	shrq	\$63, %rbx

#third iteration
	movq	16($inp), %r9
	movq	24($inp), %rax
	mulq	%r9
	addq	%rax, %r12
	movq	32($inp), %rax
	movq	%rdx, %rcx
	adcq	\$0, %rcx

	mulq	%r9
	addq	%rax, %r13
	movq	40($inp), %rax
	adcq	\$0, %rdx
	addq	%rcx, %r13
	movq	%rdx, %rcx
	adcq	\$0, %rcx

	mulq	%r9
	addq	%rax, %r14
	movq	48($inp), %rax
	adcq	\$0, %rdx
	addq	%rcx, %r14
	movq	%rdx, %rcx
	adcq	\$0, %rcx

	mulq	%r9
	 movq	%r12, %r10
	 lea	(%rbx,%r12,2), %r12	#shld	\$1, %rbx, %r12
	addq	%rax, %r15
	movq	56($inp), %rax
	adcq	\$0, %rdx
	addq	%rcx, %r15
	movq	%rdx, %rcx
	adcq	\$0, %rcx

	mulq	%r9
	 shrq	\$63, %r10
	addq	%rax, %r8
	movq	%r9, %rax
	adcq	\$0, %rdx
	addq	%rcx, %r8
	movq	%rdx, %r9
	adcq	\$0, %r9

	movq	%r13, %rcx
	leaq	(%r10,%r13,2), %r13	#shld	\$1, %r12, %r13

	mulq	%rax
	addq	%rax, %r11
	adcq	%rdx, %r12
	adcq	\$0, %r13

	movq	%r11, 32(%rsp)
	movq	%r12, 40(%rsp)
	shrq	\$63, %rcx

#fourth iteration
	movq	24($inp), %r10
	movq	32($inp), %rax
	mulq	%r10
	addq	%rax, %r14
	movq	40($inp), %rax
	movq	%rdx, %rbx
	adcq	\$0, %rbx

	mulq	%r10
	addq	%rax, %r15
	movq	48($inp), %rax
	adcq	\$0, %rdx
	addq	%rbx, %r15
	movq	%rdx, %rbx
	adcq	\$0, %rbx

	mulq	%r10
	 movq	%r14, %r12
	 leaq	(%rcx,%r14,2), %r14	#shld	\$1, %rcx, %r14
	addq	%rax, %r8
	movq	56($inp), %rax
	adcq	\$0, %rdx
	addq	%rbx, %r8
	movq	%rdx, %rbx
	adcq	\$0, %rbx

	mulq	%r10
	 shrq	\$63, %r12
	addq	%rax, %r9
	movq	%r10, %rax
	adcq	\$0, %rdx
	addq	%rbx, %r9
	movq	%rdx, %r10
	adcq	\$0, %r10

	movq	%r15, %rbx
	leaq	(%r12,%r15,2),%r15	#shld	\$1, %r14, %r15

	mulq	%rax
	addq	%rax, %r13
	adcq	%rdx, %r14
	adcq	\$0, %r15

	movq	%r13, 48(%rsp)
	movq	%r14, 56(%rsp)
	shrq	\$63, %rbx

#fifth iteration
	movq	32($inp), %r11
	movq	40($inp), %rax
	mulq	%r11
	addq	%rax, %r8
	movq	48($inp), %rax
	movq	%rdx, %rcx
	adcq	\$0, %rcx

	mulq	%r11
	addq	%rax, %r9
	movq	56($inp), %rax
	adcq	\$0, %rdx
	 movq	%r8, %r12
	 leaq	(%rbx,%r8,2), %r8	#shld	\$1, %rbx, %r8
	addq	%rcx, %r9
	movq	%rdx, %rcx
	adcq	\$0, %rcx

	mulq	%r11
	 shrq	\$63, %r12
	addq	%rax, %r10
	movq	%r11, %rax
	adcq	\$0, %rdx
	addq	%rcx, %r10
	movq	%rdx, %r11
	adcq	\$0, %r11

	movq	%r9, %rcx
	leaq	(%r12,%r9,2), %r9	#shld	\$1, %r8, %r9

	mulq	%rax
	addq	%rax, %r15
	adcq	%rdx, %r8
	adcq	\$0, %r9

	movq	%r15, 64(%rsp)
	movq	%r8, 72(%rsp)
	shrq	\$63, %rcx

#sixth iteration
	movq	40($inp), %r12
	movq	48($inp), %rax
	mulq	%r12
	addq	%rax, %r10
	movq	56($inp), %rax
	movq	%rdx, %rbx
	adcq	\$0, %rbx

	mulq	%r12
	addq	%rax, %r11
	movq	%r12, %rax
	 movq	%r10, %r15
	 leaq	(%rcx,%r10,2), %r10	#shld	\$1, %rcx, %r10
	adcq	\$0, %rdx
	 shrq	\$63, %r15
	addq	%rbx, %r11
	movq	%rdx, %r12
	adcq	\$0, %r12

	movq	%r11, %rbx
	leaq	(%r15,%r11,2), %r11	#shld	\$1, %r10, %r11

	mulq	%rax
	addq	%rax, %r9
	adcq	%rdx, %r10
	adcq	\$0, %r11

	movq	%r9, 80(%rsp)
	movq	%r10, 88(%rsp)

#seventh iteration
	movq	48($inp), %r13
	movq	56($inp), %rax
	mulq	%r13
	addq	%rax, %r12
	movq	%r13, %rax
	movq	%rdx, %r13
	adcq	\$0, %r13

	xorq	%r14, %r14
	shlq	\$1, %rbx
	adcq	%r12, %r12		#shld	\$1, %rbx, %r12
	adcq	%r13, %r13		#shld	\$1, %r12, %r13
	adcq	%r14, %r14		#shld	\$1, %r13, %r14

	mulq	%rax
	addq	%rax, %r11
	adcq	%rdx, %r12
	adcq	\$0, %r13

	movq	%r11, 96(%rsp)
	movq	%r12, 104(%rsp)

#eighth iteration
	movq	56($inp), %rax
	mulq	%rax
	addq	%rax, %r13
	adcq	\$0, %rdx

	addq	%rdx, %r14

	movq	%r13, 112(%rsp)
	movq	%r14, 120(%rsp)

	movq	(%rsp), %r8
	movq	8(%rsp), %r9
	movq	16(%rsp), %r10
	movq	24(%rsp), %r11
	movq	32(%rsp), %r12
	movq	40(%rsp), %r13
	movq	48(%rsp), %r14
	movq	56(%rsp), %r15

	call	__rsaz_512_reduce

	addq	64(%rsp), %r8
	adcq	72(%rsp), %r9
	adcq	80(%rsp), %r10
	adcq	88(%rsp), %r11
	adcq	96(%rsp), %r12
	adcq	104(%rsp), %r13
	adcq	112(%rsp), %r14
	adcq	120(%rsp), %r15
	sbbq	%rcx, %rcx

	call	__rsaz_512_subtract

	movq	%r8, %rdx
	movq	%r9, %rax
	movl	128+8(%rsp), $times
	movq	$out, $inp

	decl	$times
	jnz	.Loop_sqr
___
if ($addx) {
# MULX/ADCX/ADOX variant of the same squaring loop; taken when the CPU
# reports ADX support at run time (see the OPENSSL_ia32cap_P check above).
$code.=<<___;
	jmp	.Lsqr_tail

.align	32
.Loop_sqrx:
	movl	$times,128+8(%rsp)
	movq	$out, %xmm0		# off-load
	movq	%rbp, %xmm1		# off-load
#first iteration
	mulx	%rax, %r8, %r9

	mulx	16($inp), %rcx, %r10
	xor	%rbp, %rbp		# cf=0, of=0

	mulx	24($inp), %rax, %r11
	adcx	%rcx, %r9

	mulx	32($inp), %rcx, %r12
	adcx	%rax, %r10

	mulx	40($inp), %rax, %r13
	adcx	%rcx, %r11

	.byte	0xc4,0x62,0xf3,0xf6,0xb6,0x30,0x00,0x00,0x00	# mulx	48($inp), %rcx, %r14
	adcx	%rax, %r12
	adcx	%rcx, %r13

	.byte	0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00	# mulx	56($inp), %rax, %r15
	adcx	%rax, %r14
	adcx	%rbp, %r15		# %rbp is 0

	mov	%r9, %rcx
	shld	\$1, %r8, %r9
	shl	\$1, %r8

	xor	%ebp, %ebp
	mulx	%rdx, %rax, %rdx
	adcx	%rdx, %r8
	 mov	8($inp), %rdx
	adcx	%rbp, %r9

	mov	%rax, (%rsp)
	mov	%r8, 8(%rsp)

#second iteration
	mulx	16($inp), %rax, %rbx
	adox	%rax, %r10
	adcx	%rbx, %r11

	.byte	0xc4,0x62,0xc3,0xf6,0x86,0x18,0x00,0x00,0x00	# mulx	24($inp), $out, %r8
	adox	$out, %r11
	adcx	%r8, %r12

	mulx	32($inp), %rax, %rbx
	adox	%rax, %r12
	adcx	%rbx, %r13

	mulx	40($inp), $out, %r8
	adox	$out, %r13
	adcx	%r8, %r14

	.byte	0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00	# mulx	48($inp), %rax, %rbx
	adox	%rax, %r14
	adcx	%rbx, %r15

	.byte	0xc4,0x62,0xc3,0xf6,0x86,0x38,0x00,0x00,0x00	# mulx	56($inp), $out, %r8
	adox	$out, %r15
	adcx	%rbp, %r8
	adox	%rbp, %r8

	mov	%r11, %rbx
	shld	\$1, %r10, %r11
	shld	\$1, %rcx, %r10

	xor	%ebp,%ebp
	mulx	%rdx, %rax, %rcx
	 mov	16($inp), %rdx
	adcx	%rax, %r9
	adcx	%rcx, %r10
	adcx	%rbp, %r11

	mov	%r9, 16(%rsp)
	.byte	0x4c,0x89,0x94,0x24,0x18,0x00,0x00,0x00		# mov	%r10, 24(%rsp)

#third iteration
	.byte	0xc4,0x62,0xc3,0xf6,0x8e,0x18,0x00,0x00,0x00	# mulx	24($inp), $out, %r9
	adox	$out, %r12
	adcx	%r9, %r13

	mulx	32($inp), %rax, %rcx
	adox	%rax, %r13
	adcx	%rcx, %r14

	mulx	40($inp), $out, %r9
	adox	$out, %r14
	adcx	%r9, %r15

	.byte	0xc4,0xe2,0xfb,0xf6,0x8e,0x30,0x00,0x00,0x00	# mulx	48($inp), %rax, %rcx
	adox	%rax, %r15
	adcx	%rcx, %r8

	.byte	0xc4,0x62,0xc3,0xf6,0x8e,0x38,0x00,0x00,0x00	# mulx	56($inp), $out, %r9
	adox	$out, %r8
	adcx	%rbp, %r9
	adox	%rbp, %r9

	mov	%r13, %rcx
	shld	\$1, %r12, %r13
	shld	\$1, %rbx, %r12

	xor	%ebp, %ebp
	mulx	%rdx, %rax, %rdx
	adcx	%rax, %r11
	adcx	%rdx, %r12
	 mov	24($inp), %rdx
	adcx	%rbp, %r13

	mov	%r11, 32(%rsp)
	.byte	0x4c,0x89,0xa4,0x24,0x28,0x00,0x00,0x00		# mov	%r12, 40(%rsp)

#fourth iteration
	.byte	0xc4,0xe2,0xfb,0xf6,0x9e,0x20,0x00,0x00,0x00	# mulx	32($inp), %rax, %rbx
	adox	%rax, %r14
	adcx	%rbx, %r15

	mulx	40($inp), $out, %r10
	adox	$out, %r15
	adcx	%r10, %r8

	mulx	48($inp), %rax, %rbx
	adox	%rax, %r8
	adcx	%rbx, %r9

	mulx	56($inp), $out, %r10
	adox	$out, %r9
	adcx	%rbp, %r10
	adox	%rbp, %r10

	.byte	0x66
	mov	%r15, %rbx
	shld	\$1, %r14, %r15
	shld	\$1, %rcx, %r14

	xor	%ebp, %ebp
	mulx	%rdx, %rax, %rdx
	adcx	%rax, %r13
	adcx	%rdx, %r14
	 mov	32($inp), %rdx
	adcx	%rbp, %r15

	mov	%r13, 48(%rsp)
	mov	%r14, 56(%rsp)

#fifth iteration
	.byte	0xc4,0x62,0xc3,0xf6,0x9e,0x28,0x00,0x00,0x00	# mulx	40($inp), $out, %r11
	adox	$out, %r8
	adcx	%r11, %r9

	mulx	48($inp), %rax, %rcx
	adox	%rax, %r9
	adcx	%rcx, %r10

	mulx	56($inp), $out, %r11
	adox	$out, %r10
	adcx	%rbp, %r11
	adox	%rbp, %r11

	mov	%r9, %rcx
	shld	\$1, %r8, %r9
	shld	\$1, %rbx, %r8

	xor	%ebp, %ebp
	mulx	%rdx, %rax, %rdx
	adcx	%rax, %r15
	adcx	%rdx, %r8
	 mov	40($inp), %rdx
	adcx	%rbp, %r9

	mov	%r15, 64(%rsp)
	mov	%r8, 72(%rsp)

#sixth iteration
	.byte	0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00	# mulx	48($inp), %rax, %rbx
	adox	%rax, %r10
	adcx	%rbx, %r11

	.byte	0xc4,0x62,0xc3,0xf6,0xa6,0x38,0x00,0x00,0x00	# mulx	56($inp), $out, %r12
	adox	$out, %r11
	adcx	%rbp, %r12
	adox	%rbp, %r12

	mov	%r11, %rbx
	shld	\$1, %r10, %r11
	shld	\$1, %rcx, %r10

	xor	%ebp, %ebp
	mulx	%rdx, %rax, %rdx
	adcx	%rax, %r9
	adcx	%rdx, %r10
	 mov	48($inp), %rdx
	adcx	%rbp, %r11

	mov	%r9, 80(%rsp)
	mov	%r10, 88(%rsp)

#seventh iteration
	.byte	0xc4,0x62,0xfb,0xf6,0xae,0x38,0x00,0x00,0x00	# mulx	56($inp), %rax, %r13
	adox	%rax, %r12
	adox	%rbp, %r13

	xor	%r14, %r14
	shld	\$1, %r13, %r14
	shld	\$1, %r12, %r13
	shld	\$1, %rbx, %r12

	xor	%ebp, %ebp
	mulx	%rdx, %rax, %rdx
	adcx	%rax, %r11
	adcx	%rdx, %r12
	 mov	56($inp), %rdx
	adcx	%rbp, %r13

	.byte	0x4c,0x89,0x9c,0x24,0x60,0x00,0x00,0x00		# mov	%r11, 96(%rsp)
	.byte	0x4c,0x89,0xa4,0x24,0x68,0x00,0x00,0x00		# mov	%r12, 104(%rsp)

#eighth iteration
	mulx	%rdx, %rax, %rdx
	adox	%rax, %r13
	adox	%rbp, %rdx

	.byte	0x66
	add	%rdx, %r14

	movq	%r13, 112(%rsp)
	movq	%r14, 120(%rsp)
	movq	%xmm0, $out
	movq	%xmm1, %rbp

	movq	128(%rsp), %rdx		# pull $n0
	movq	(%rsp), %r8
	movq	8(%rsp), %r9
	movq	16(%rsp), %r10
	movq	24(%rsp), %r11
	movq	32(%rsp), %r12
	movq	40(%rsp), %r13
	movq	48(%rsp), %r14
	movq	56(%rsp), %r15

	call	__rsaz_512_reducex

	addq	64(%rsp), %r8
	adcq	72(%rsp), %r9
	adcq	80(%rsp), %r10
	adcq	88(%rsp), %r11
	adcq	96(%rsp), %r12
	adcq	104(%rsp), %r13
	adcq	112(%rsp), %r14
	adcq	120(%rsp), %r15
	sbbq	%rcx, %rcx

	call	__rsaz_512_subtract

	movq	%r8, %rdx
	movq	%r9, %rax
	movl	128+8(%rsp), $times
	movq	$out, $inp

	decl	$times
	jnz	.Loop_sqrx

.Lsqr_tail:
___
}
$code.=<<___;

	leaq	128+24+48(%rsp), %rax
	movq	-48(%rax), %r15
	movq	-40(%rax), %r14
	movq	-32(%rax), %r13
	movq	-24(%rax), %r12
	movq	-16(%rax), %rbp
	movq	-8(%rax), %rbx
	leaq	(%rax), %rsp
.Lsqr_epilogue:
	ret
.size	rsaz_512_sqr,.-rsaz_512_sqr
___
}
{
# void rsaz_512_mul(u64 *out, const u64 *a, const u64 *b, const u64 *mod, u64 n0);
# 512x512-bit Montgomery multiplication; dispatches at run time between the
# mulq path (__rsaz_512_mul/__rsaz_512_reduce) and the ADX mulx path
# (__rsaz_512_mulx/__rsaz_512_reducex).
my ($out,$ap,$bp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8");
$code.=<<___;
.globl	rsaz_512_mul
.type	rsaz_512_mul,\@function,5
.align	32
rsaz_512_mul:
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	subq	\$128+24, %rsp
.Lmul_body:
	movq	$out, %xmm0		# off-load arguments
	movq	$mod, %xmm1
	movq	$n0, 128(%rsp)
___
$code.=<<___ if ($addx);
	movl	\$0x80100,%r11d
	andl	OPENSSL_ia32cap_P+8(%rip),%r11d
	cmpl	\$0x80100,%r11d		# check for MULX and ADO/CX
	je	.Lmulx
___
$code.=<<___;
	movq	($bp), %rbx		# pass b[0]
	movq	$bp, %rbp		# pass argument
	call	__rsaz_512_mul

	movq	%xmm0, $out
	movq	%xmm1, %rbp

	movq	(%rsp), %r8
	movq	8(%rsp), %r9
	movq	16(%rsp), %r10
	movq	24(%rsp), %r11
	movq	32(%rsp), %r12
	movq	40(%rsp), %r13
	movq	48(%rsp), %r14
	movq	56(%rsp), %r15

	call	__rsaz_512_reduce
___
$code.=<<___ if ($addx);
	jmp	.Lmul_tail

.align	32
.Lmulx:
	movq	$bp, %rbp		# pass argument
	movq	($bp), %rdx		# pass b[0]
	call	__rsaz_512_mulx

	movq	%xmm0, $out
	movq	%xmm1, %rbp

	movq	128(%rsp), %rdx		# pull $n0
	movq	(%rsp), %r8
	movq	8(%rsp), %r9
	movq	16(%rsp), %r10
	movq	24(%rsp), %r11
	movq	32(%rsp), %r12
	movq	40(%rsp), %r13
	movq	48(%rsp), %r14
	movq	56(%rsp), %r15

	call	__rsaz_512_reducex
.Lmul_tail:
___
$code.=<<___;
	addq	64(%rsp), %r8
	adcq	72(%rsp), %r9
	adcq	80(%rsp), %r10
	adcq	88(%rsp), %r11
	adcq	96(%rsp), %r12
	adcq	104(%rsp), %r13
	adcq	112(%rsp), %r14
	adcq	120(%rsp), %r15
	sbbq	%rcx, %rcx

	call	__rsaz_512_subtract

	leaq	128+24+48(%rsp), %rax
	movq	-48(%rax), %r15
	movq	-40(%rax), %r14
	movq	-32(%rax), %r13
	movq	-24(%rax), %r12
	movq	-16(%rax), %rbp
	movq	-8(%rax), %rbx
	leaq	(%rax), %rsp
.Lmul_epilogue:
	ret
.size	rsaz_512_mul,.-rsaz_512_mul
___
}
904289848Sjkim{
# rsaz_512_mul_gather4(%rdi=out, %rsi=ap, %rdx=bp, %rcx=mod, %r8=n0, %r9d=power)
#
# out[] = ap[] * bp_entry[power], Montgomery-reduced mod "mod".  bp points
# at a table of 16 interleaved 512-bit entries: each 128-byte row holds
# limb k of all 16 entries (matches the 128*k scatter stride used by
# rsaz_512_mul_scatter4).  Every limb of b[] is gathered by reading the
# whole row and selecting entry "power" via SSE2 compare-generated masks,
# so the memory-access pattern is independent of the secret index
# (cache-timing hardening; presumably the CVE-2016-0702 "CacheBleed"
# rework -- confirm against upstream history).
905289848Sjkimmy ($out,$ap,$bp,$mod,$n0,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
906289848Sjkim$code.=<<___;
907289848Sjkim.globl	rsaz_512_mul_gather4
908289848Sjkim.type	rsaz_512_mul_gather4,\@function,6
909289848Sjkim.align	32
910289848Sjkimrsaz_512_mul_gather4:
911289848Sjkim	push	%rbx
912289848Sjkim	push	%rbp
913289848Sjkim	push	%r12
914289848Sjkim	push	%r13
915289848Sjkim	push	%r14
916289848Sjkim	push	%r15
917289848Sjkim
918296279Sjkim	subq	\$`128+24+($win64?0xb0:0)`, %rsp
919296279Sjkim___
# Win64 ABI: %xmm6-%xmm15 are callee-saved; spill them above the scalar frame.
920296279Sjkim$code.=<<___	if ($win64);
921296279Sjkim	movaps	%xmm6,0xa0(%rsp)
922296279Sjkim	movaps	%xmm7,0xb0(%rsp)
923296279Sjkim	movaps	%xmm8,0xc0(%rsp)
924296279Sjkim	movaps	%xmm9,0xd0(%rsp)
925296279Sjkim	movaps	%xmm10,0xe0(%rsp)
926296279Sjkim	movaps	%xmm11,0xf0(%rsp)
927296279Sjkim	movaps	%xmm12,0x100(%rsp)
928296279Sjkim	movaps	%xmm13,0x110(%rsp)
929296279Sjkim	movaps	%xmm14,0x120(%rsp)
930296279Sjkim	movaps	%xmm15,0x130(%rsp)
931296279Sjkim___
932296279Sjkim$code.=<<___;
933289848Sjkim.Lmul_gather4_body:
934296279Sjkim	movd	$pwr,%xmm8
935296279Sjkim	movdqa	.Linc+16(%rip),%xmm1	# 00000002000000020000000200000002
936296279Sjkim	movdqa	.Linc(%rip),%xmm0	# 00000001000000010000000000000000
937296279Sjkim
938296279Sjkim	pshufd	\$0,%xmm8,%xmm8		# broadcast $power
939296279Sjkim	movdqa	%xmm1,%xmm7
940296279Sjkim	movdqa	%xmm1,%xmm2
941289848Sjkim___
942296279Sjkim########################################################################
943296279Sjkim# calculate mask by comparing 0..15 to $power
944296279Sjkim#
945296279Sjkimfor($i=0;$i<4;$i++) {
946296279Sjkim$code.=<<___;
947296279Sjkim	paddd	%xmm`$i`,%xmm`$i+1`
948296279Sjkim	pcmpeqd	%xmm8,%xmm`$i`
949296279Sjkim	movdqa	%xmm7,%xmm`$i+3`
950296279Sjkim___
951296279Sjkim}
952296279Sjkimfor(;$i<7;$i++) {
953296279Sjkim$code.=<<___;
954296279Sjkim	paddd	%xmm`$i`,%xmm`$i+1`
955296279Sjkim	pcmpeqd	%xmm8,%xmm`$i`
956296279Sjkim___
957296279Sjkim}
958296279Sjkim$code.=<<___;
959296279Sjkim	pcmpeqd	%xmm8,%xmm7
960296279Sjkim
	# read the entire 128-byte table row and AND with the selection masks;
	# only the lane matching "power" survives
961296279Sjkim	movdqa	16*0($bp),%xmm8
962296279Sjkim	movdqa	16*1($bp),%xmm9
963296279Sjkim	movdqa	16*2($bp),%xmm10
964296279Sjkim	movdqa	16*3($bp),%xmm11
965296279Sjkim	pand	%xmm0,%xmm8
966296279Sjkim	movdqa	16*4($bp),%xmm12
967296279Sjkim	pand	%xmm1,%xmm9
968296279Sjkim	movdqa	16*5($bp),%xmm13
969296279Sjkim	pand	%xmm2,%xmm10
970296279Sjkim	movdqa	16*6($bp),%xmm14
971296279Sjkim	pand	%xmm3,%xmm11
972296279Sjkim	movdqa	16*7($bp),%xmm15
973296279Sjkim	leaq	128($bp), %rbp
974296279Sjkim	pand	%xmm4,%xmm12
975296279Sjkim	pand	%xmm5,%xmm13
976296279Sjkim	pand	%xmm6,%xmm14
977296279Sjkim	pand	%xmm7,%xmm15
978296279Sjkim	por	%xmm10,%xmm8
979296279Sjkim	por	%xmm11,%xmm9
980296279Sjkim	por	%xmm12,%xmm8
981296279Sjkim	por	%xmm13,%xmm9
982296279Sjkim	por	%xmm14,%xmm8
983296279Sjkim	por	%xmm15,%xmm9
984296279Sjkim
	# condense both accumulators into the single selected 64-bit limb
985296279Sjkim	por	%xmm9,%xmm8
986296279Sjkim	pshufd	\$0x4e,%xmm8,%xmm9
987296279Sjkim	por	%xmm9,%xmm8
988296279Sjkim___
989289848Sjkim$code.=<<___ if ($addx);
990289848Sjkim	movl	\$0x80100,%r11d
991289848Sjkim	andl	OPENSSL_ia32cap_P+8(%rip),%r11d
992289848Sjkim	cmpl	\$0x80100,%r11d		# check for MULX and ADO/CX
993289848Sjkim	je	.Lmulx_gather
994289848Sjkim___
995289848Sjkim$code.=<<___;
996296279Sjkim	movq	%xmm8,%rbx
997289848Sjkim
998296279Sjkim	movq	$n0, 128(%rsp)		# off-load arguments
999296279Sjkim	movq	$out, 128+8(%rsp)
1000296279Sjkim	movq	$mod, 128+16(%rsp)
1001296279Sjkim
1002289848Sjkim	movq	($ap), %rax
1003289848Sjkim	 movq	8($ap), %rcx
1004289848Sjkim	mulq	%rbx			# 0 iteration
1005289848Sjkim	movq	%rax, (%rsp)
1006289848Sjkim	movq	%rcx, %rax
1007289848Sjkim	movq	%rdx, %r8
1008289848Sjkim
1009289848Sjkim	mulq	%rbx
1010289848Sjkim	addq	%rax, %r8
1011289848Sjkim	movq	16($ap), %rax
1012289848Sjkim	movq	%rdx, %r9
1013289848Sjkim	adcq	\$0, %r9
1014289848Sjkim
1015289848Sjkim	mulq	%rbx
1016289848Sjkim	addq	%rax, %r9
1017289848Sjkim	movq	24($ap), %rax
1018289848Sjkim	movq	%rdx, %r10
1019289848Sjkim	adcq	\$0, %r10
1020289848Sjkim
1021289848Sjkim	mulq	%rbx
1022289848Sjkim	addq	%rax, %r10
1023289848Sjkim	movq	32($ap), %rax
1024289848Sjkim	movq	%rdx, %r11
1025289848Sjkim	adcq	\$0, %r11
1026289848Sjkim
1027289848Sjkim	mulq	%rbx
1028289848Sjkim	addq	%rax, %r11
1029289848Sjkim	movq	40($ap), %rax
1030289848Sjkim	movq	%rdx, %r12
1031289848Sjkim	adcq	\$0, %r12
1032289848Sjkim
1033289848Sjkim	mulq	%rbx
1034289848Sjkim	addq	%rax, %r12
1035289848Sjkim	movq	48($ap), %rax
1036289848Sjkim	movq	%rdx, %r13
1037289848Sjkim	adcq	\$0, %r13
1038289848Sjkim
1039289848Sjkim	mulq	%rbx
1040289848Sjkim	addq	%rax, %r13
1041289848Sjkim	movq	56($ap), %rax
1042289848Sjkim	movq	%rdx, %r14
1043289848Sjkim	adcq	\$0, %r14
1044289848Sjkim
1045289848Sjkim	mulq	%rbx
1046289848Sjkim	addq	%rax, %r14
1047289848Sjkim	 movq	($ap), %rax
1048289848Sjkim	movq	%rdx, %r15
1049289848Sjkim	adcq	\$0, %r15
1050289848Sjkim
1051289848Sjkim	leaq	8(%rsp), %rdi
1052289848Sjkim	movl	\$7, %ecx
1053289848Sjkim	jmp	.Loop_mul_gather
1054289848Sjkim
1055289848Sjkim.align	32
1056289848Sjkim.Loop_mul_gather:
	# re-gather the next limb of b with the same data-independent pattern
1057296279Sjkim	movdqa	16*0(%rbp),%xmm8
1058296279Sjkim	movdqa	16*1(%rbp),%xmm9
1059296279Sjkim	movdqa	16*2(%rbp),%xmm10
1060296279Sjkim	movdqa	16*3(%rbp),%xmm11
1061296279Sjkim	pand	%xmm0,%xmm8
1062296279Sjkim	movdqa	16*4(%rbp),%xmm12
1063296279Sjkim	pand	%xmm1,%xmm9
1064296279Sjkim	movdqa	16*5(%rbp),%xmm13
1065296279Sjkim	pand	%xmm2,%xmm10
1066296279Sjkim	movdqa	16*6(%rbp),%xmm14
1067296279Sjkim	pand	%xmm3,%xmm11
1068296279Sjkim	movdqa	16*7(%rbp),%xmm15
1069296279Sjkim	leaq	128(%rbp), %rbp
1070296279Sjkim	pand	%xmm4,%xmm12
1071296279Sjkim	pand	%xmm5,%xmm13
1072296279Sjkim	pand	%xmm6,%xmm14
1073296279Sjkim	pand	%xmm7,%xmm15
1074296279Sjkim	por	%xmm10,%xmm8
1075296279Sjkim	por	%xmm11,%xmm9
1076296279Sjkim	por	%xmm12,%xmm8
1077296279Sjkim	por	%xmm13,%xmm9
1078296279Sjkim	por	%xmm14,%xmm8
1079296279Sjkim	por	%xmm15,%xmm9
1080296279Sjkim
1081296279Sjkim	por	%xmm9,%xmm8
1082296279Sjkim	pshufd	\$0x4e,%xmm8,%xmm9
1083296279Sjkim	por	%xmm9,%xmm8
1084296279Sjkim	movq	%xmm8,%rbx
1085296279Sjkim
1086289848Sjkim	mulq	%rbx
1087289848Sjkim	addq	%rax, %r8
1088289848Sjkim	movq	8($ap), %rax
1089289848Sjkim	movq	%r8, (%rdi)
1090289848Sjkim	movq	%rdx, %r8
1091289848Sjkim	adcq	\$0, %r8
1092289848Sjkim
1093289848Sjkim	mulq	%rbx
1094289848Sjkim	addq	%rax, %r9
1095289848Sjkim	movq	16($ap), %rax
1096289848Sjkim	adcq	\$0, %rdx
1097289848Sjkim	addq	%r9, %r8
1098289848Sjkim	movq	%rdx, %r9
1099289848Sjkim	adcq	\$0, %r9
1100289848Sjkim
1101289848Sjkim	mulq	%rbx
1102289848Sjkim	addq	%rax, %r10
1103289848Sjkim	movq	24($ap), %rax
1104289848Sjkim	adcq	\$0, %rdx
1105289848Sjkim	addq	%r10, %r9
1106289848Sjkim	movq	%rdx, %r10
1107289848Sjkim	adcq	\$0, %r10
1108289848Sjkim
1109289848Sjkim	mulq	%rbx
1110289848Sjkim	addq	%rax, %r11
1111289848Sjkim	movq	32($ap), %rax
1112289848Sjkim	adcq	\$0, %rdx
1113289848Sjkim	addq	%r11, %r10
1114289848Sjkim	movq	%rdx, %r11
1115289848Sjkim	adcq	\$0, %r11
1116289848Sjkim
1117289848Sjkim	mulq	%rbx
1118289848Sjkim	addq	%rax, %r12
1119289848Sjkim	movq	40($ap), %rax
1120289848Sjkim	adcq	\$0, %rdx
1121289848Sjkim	addq	%r12, %r11
1122289848Sjkim	movq	%rdx, %r12
1123289848Sjkim	adcq	\$0, %r12
1124289848Sjkim
1125289848Sjkim	mulq	%rbx
1126289848Sjkim	addq	%rax, %r13
1127289848Sjkim	movq	48($ap), %rax
1128289848Sjkim	adcq	\$0, %rdx
1129289848Sjkim	addq	%r13, %r12
1130289848Sjkim	movq	%rdx, %r13
1131289848Sjkim	adcq	\$0, %r13
1132289848Sjkim
1133289848Sjkim	mulq	%rbx
1134289848Sjkim	addq	%rax, %r14
1135289848Sjkim	movq	56($ap), %rax
1136289848Sjkim	adcq	\$0, %rdx
1137289848Sjkim	addq	%r14, %r13
1138289848Sjkim	movq	%rdx, %r14
1139289848Sjkim	adcq	\$0, %r14
1140289848Sjkim
1141289848Sjkim	mulq	%rbx
1142289848Sjkim	addq	%rax, %r15
1143289848Sjkim	 movq	($ap), %rax
1144289848Sjkim	adcq	\$0, %rdx
1145289848Sjkim	addq	%r15, %r14
1146289848Sjkim	movq	%rdx, %r15
1147289848Sjkim	adcq	\$0, %r15
1148289848Sjkim
1149289848Sjkim	leaq	8(%rdi), %rdi
1150289848Sjkim
1151289848Sjkim	decl	%ecx
1152289848Sjkim	jnz	.Loop_mul_gather
1153289848Sjkim
1154289848Sjkim	movq	%r8, (%rdi)
1155289848Sjkim	movq	%r9, 8(%rdi)
1156289848Sjkim	movq	%r10, 16(%rdi)
1157289848Sjkim	movq	%r11, 24(%rdi)
1158289848Sjkim	movq	%r12, 32(%rdi)
1159289848Sjkim	movq	%r13, 40(%rdi)
1160289848Sjkim	movq	%r14, 48(%rdi)
1161289848Sjkim	movq	%r15, 56(%rdi)
1162289848Sjkim
1163296279Sjkim	movq	128+8(%rsp), $out
1164296279Sjkim	movq	128+16(%rsp), %rbp
1165289848Sjkim
1166289848Sjkim	movq	(%rsp), %r8
1167289848Sjkim	movq	8(%rsp), %r9
1168289848Sjkim	movq	16(%rsp), %r10
1169289848Sjkim	movq	24(%rsp), %r11
1170289848Sjkim	movq	32(%rsp), %r12
1171289848Sjkim	movq	40(%rsp), %r13
1172289848Sjkim	movq	48(%rsp), %r14
1173289848Sjkim	movq	56(%rsp), %r15
1174289848Sjkim
1175289848Sjkim	call	__rsaz_512_reduce
1176289848Sjkim___
1177289848Sjkim$code.=<<___ if ($addx);
1178289848Sjkim	jmp	.Lmul_gather_tail
1179289848Sjkim
1180289848Sjkim.align	32
1181289848Sjkim.Lmulx_gather:
	# MULX/ADCX/ADOX flavour of the same gather-multiply
1182296279Sjkim	movq	%xmm8,%rdx
1183289848Sjkim
1184296279Sjkim	mov	$n0, 128(%rsp)		# off-load arguments
1185296279Sjkim	mov	$out, 128+8(%rsp)
1186296279Sjkim	mov	$mod, 128+16(%rsp)
1187296279Sjkim
1188289848Sjkim	mulx	($ap), %rbx, %r8	# 0 iteration
1189289848Sjkim	mov	%rbx, (%rsp)
1190289848Sjkim	xor	%edi, %edi		# cf=0, of=0
1191289848Sjkim
1192289848Sjkim	mulx	8($ap), %rax, %r9
1193289848Sjkim
1194289848Sjkim	mulx	16($ap), %rbx, %r10
1195289848Sjkim	adcx	%rax, %r8
1196289848Sjkim
1197289848Sjkim	mulx	24($ap), %rax, %r11
1198289848Sjkim	adcx	%rbx, %r9
1199289848Sjkim
1200289848Sjkim	mulx	32($ap), %rbx, %r12
1201289848Sjkim	adcx	%rax, %r10
1202289848Sjkim
1203289848Sjkim	mulx	40($ap), %rax, %r13
1204289848Sjkim	adcx	%rbx, %r11
1205289848Sjkim
1206289848Sjkim	mulx	48($ap), %rbx, %r14
1207289848Sjkim	adcx	%rax, %r12
1208289848Sjkim
1209289848Sjkim	mulx	56($ap), %rax, %r15
1210289848Sjkim	adcx	%rbx, %r13
1211289848Sjkim	adcx	%rax, %r14
1212296279Sjkim	.byte	0x67
1213289848Sjkim	mov	%r8, %rbx
1214289848Sjkim	adcx	%rdi, %r15		# %rdi is 0
1215289848Sjkim
1216289848Sjkim	mov	\$-7, %rcx
1217289848Sjkim	jmp	.Loop_mulx_gather
1218289848Sjkim
1219289848Sjkim.align	32
1220289848Sjkim.Loop_mulx_gather:
	# constant-time gather of the next b limb (identical to the mulq path)
1221296279Sjkim	movdqa	16*0(%rbp),%xmm8
1222296279Sjkim	movdqa	16*1(%rbp),%xmm9
1223296279Sjkim	movdqa	16*2(%rbp),%xmm10
1224296279Sjkim	movdqa	16*3(%rbp),%xmm11
1225296279Sjkim	pand	%xmm0,%xmm8
1226296279Sjkim	movdqa	16*4(%rbp),%xmm12
1227296279Sjkim	pand	%xmm1,%xmm9
1228296279Sjkim	movdqa	16*5(%rbp),%xmm13
1229296279Sjkim	pand	%xmm2,%xmm10
1230296279Sjkim	movdqa	16*6(%rbp),%xmm14
1231296279Sjkim	pand	%xmm3,%xmm11
1232296279Sjkim	movdqa	16*7(%rbp),%xmm15
1233296279Sjkim	leaq	128(%rbp), %rbp
1234296279Sjkim	pand	%xmm4,%xmm12
1235296279Sjkim	pand	%xmm5,%xmm13
1236296279Sjkim	pand	%xmm6,%xmm14
1237296279Sjkim	pand	%xmm7,%xmm15
1238296279Sjkim	por	%xmm10,%xmm8
1239296279Sjkim	por	%xmm11,%xmm9
1240296279Sjkim	por	%xmm12,%xmm8
1241296279Sjkim	por	%xmm13,%xmm9
1242296279Sjkim	por	%xmm14,%xmm8
1243296279Sjkim	por	%xmm15,%xmm9
1244296279Sjkim
1245296279Sjkim	por	%xmm9,%xmm8
1246296279Sjkim	pshufd	\$0x4e,%xmm8,%xmm9
1247296279Sjkim	por	%xmm9,%xmm8
1248296279Sjkim	movq	%xmm8,%rdx
1249296279Sjkim
1250296279Sjkim	.byte	0xc4,0x62,0xfb,0xf6,0x86,0x00,0x00,0x00,0x00	# mulx	($ap), %rax, %r8
1251289848Sjkim	adcx	%rax, %rbx
1252289848Sjkim	adox	%r9, %r8
1253289848Sjkim
1254289848Sjkim	mulx	8($ap), %rax, %r9
1255289848Sjkim	adcx	%rax, %r8
1256289848Sjkim	adox	%r10, %r9
1257289848Sjkim
1258289848Sjkim	mulx	16($ap), %rax, %r10
1259289848Sjkim	adcx	%rax, %r9
1260289848Sjkim	adox	%r11, %r10
1261289848Sjkim
1262289848Sjkim	.byte	0xc4,0x62,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00	# mulx	24($ap), %rax, %r11
1263289848Sjkim	adcx	%rax, %r10
1264289848Sjkim	adox	%r12, %r11
1265289848Sjkim
1266289848Sjkim	mulx	32($ap), %rax, %r12
1267289848Sjkim	adcx	%rax, %r11
1268289848Sjkim	adox	%r13, %r12
1269289848Sjkim
1270289848Sjkim	mulx	40($ap), %rax, %r13
1271289848Sjkim	adcx	%rax, %r12
1272289848Sjkim	adox	%r14, %r13
1273289848Sjkim
1274289848Sjkim	.byte	0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00	# mulx	48($ap), %rax, %r14
1275289848Sjkim	adcx	%rax, %r13
1276296279Sjkim	.byte	0x67
1277289848Sjkim	adox	%r15, %r14
1278289848Sjkim
1279289848Sjkim	mulx	56($ap), %rax, %r15
1280289848Sjkim	 mov	%rbx, 64(%rsp,%rcx,8)
1281289848Sjkim	adcx	%rax, %r14
1282289848Sjkim	adox	%rdi, %r15
1283289848Sjkim	mov	%r8, %rbx
1284289848Sjkim	adcx	%rdi, %r15		# cf=0
1285289848Sjkim
1286289848Sjkim	inc	%rcx			# of=0
1287289848Sjkim	jnz	.Loop_mulx_gather
1288289848Sjkim
1289289848Sjkim	mov	%r8, 64(%rsp)
1290289848Sjkim	mov	%r9, 64+8(%rsp)
1291289848Sjkim	mov	%r10, 64+16(%rsp)
1292289848Sjkim	mov	%r11, 64+24(%rsp)
1293289848Sjkim	mov	%r12, 64+32(%rsp)
1294289848Sjkim	mov	%r13, 64+40(%rsp)
1295289848Sjkim	mov	%r14, 64+48(%rsp)
1296289848Sjkim	mov	%r15, 64+56(%rsp)
1297289848Sjkim
1298296279Sjkim	mov	128(%rsp), %rdx		# pull arguments
1299296279Sjkim	mov	128+8(%rsp), $out
1300296279Sjkim	mov	128+16(%rsp), %rbp
1301289848Sjkim
1302289848Sjkim	mov	(%rsp), %r8
1303289848Sjkim	mov	8(%rsp), %r9
1304289848Sjkim	mov	16(%rsp), %r10
1305289848Sjkim	mov	24(%rsp), %r11
1306289848Sjkim	mov	32(%rsp), %r12
1307289848Sjkim	mov	40(%rsp), %r13
1308289848Sjkim	mov	48(%rsp), %r14
1309289848Sjkim	mov	56(%rsp), %r15
1310289848Sjkim
1311289848Sjkim	call	__rsaz_512_reducex
1312289848Sjkim
1313289848Sjkim.Lmul_gather_tail:
1314289848Sjkim___
1315289848Sjkim$code.=<<___;
1316289848Sjkim	addq	64(%rsp), %r8
1317289848Sjkim	adcq	72(%rsp), %r9
1318289848Sjkim	adcq	80(%rsp), %r10
1319289848Sjkim	adcq	88(%rsp), %r11
1320289848Sjkim	adcq	96(%rsp), %r12
1321289848Sjkim	adcq	104(%rsp), %r13
1322289848Sjkim	adcq	112(%rsp), %r14
1323289848Sjkim	adcq	120(%rsp), %r15
1324289848Sjkim	sbbq	%rcx, %rcx
1325289848Sjkim
1326289848Sjkim	call	__rsaz_512_subtract
1327289848Sjkim
1328289848Sjkim	leaq	128+24+48(%rsp), %rax
1329296279Sjkim___
# Win64: restore callee-saved %xmm6-%xmm15 (offsets are relative to %rax).
1330296279Sjkim$code.=<<___	if ($win64);
1331296279Sjkim	movaps	0xa0-0xc8(%rax),%xmm6
1332296279Sjkim	movaps	0xb0-0xc8(%rax),%xmm7
1333296279Sjkim	movaps	0xc0-0xc8(%rax),%xmm8
1334296279Sjkim	movaps	0xd0-0xc8(%rax),%xmm9
1335296279Sjkim	movaps	0xe0-0xc8(%rax),%xmm10
1336296279Sjkim	movaps	0xf0-0xc8(%rax),%xmm11
1337296279Sjkim	movaps	0x100-0xc8(%rax),%xmm12
1338296279Sjkim	movaps	0x110-0xc8(%rax),%xmm13
1339296279Sjkim	movaps	0x120-0xc8(%rax),%xmm14
1340296279Sjkim	movaps	0x130-0xc8(%rax),%xmm15
1341296279Sjkim	lea	0xb0(%rax),%rax
1342296279Sjkim___
1343296279Sjkim$code.=<<___;
1344289848Sjkim	movq	-48(%rax), %r15
1345289848Sjkim	movq	-40(%rax), %r14
1346289848Sjkim	movq	-32(%rax), %r13
1347289848Sjkim	movq	-24(%rax), %r12
1348289848Sjkim	movq	-16(%rax), %rbp
1349289848Sjkim	movq	-8(%rax), %rbx
1350289848Sjkim	leaq	(%rax), %rsp
1351289848Sjkim.Lmul_gather4_epilogue:
1352289848Sjkim	ret
1353289848Sjkim.size	rsaz_512_mul_gather4,.-rsaz_512_mul_gather4
1354289848Sjkim___
1355289848Sjkim}
1356289848Sjkim{
# rsaz_512_mul_scatter4(%rdi=out, %rsi=ap, %rdx=mod, %rcx=n0, %r8=tbl, %r9d=pwr)
#
# out[] = out[] * ap[], Montgomery-reduced, and the 8-limb result is
# additionally scattered into gather table "tbl" at entry "pwr": limb k
# is stored at tbl + pwr*8 + 128*k, matching the interleaved row layout
# read back by rsaz_512_mul_gather4.
1357289848Sjkimmy ($out,$ap,$mod,$n0,$tbl,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
1358289848Sjkim$code.=<<___;
1359289848Sjkim.globl	rsaz_512_mul_scatter4
1360289848Sjkim.type	rsaz_512_mul_scatter4,\@function,6
1361289848Sjkim.align	32
1362289848Sjkimrsaz_512_mul_scatter4:
1363289848Sjkim	push	%rbx
1364289848Sjkim	push	%rbp
1365289848Sjkim	push	%r12
1366289848Sjkim	push	%r13
1367289848Sjkim	push	%r14
1368289848Sjkim	push	%r15
1369289848Sjkim
	# 32-bit self-move zero-extends pwr to the full 64-bit register;
	# the following leaq indexes with it (the perlasm translator
	# presumably widens the 32-bit index register -- confirm against
	# x86_64-xlate.pl)
1370289848Sjkim	mov	$pwr, $pwr
1371289848Sjkim	subq	\$128+24, %rsp
1372289848Sjkim.Lmul_scatter4_body:
1373296279Sjkim	leaq	($tbl,$pwr,8), $tbl
1374289848Sjkim	movq	$out, %xmm0		# off-load arguments
1375289848Sjkim	movq	$mod, %xmm1
1376289848Sjkim	movq	$tbl, %xmm2
1377289848Sjkim	movq	$n0, 128(%rsp)
1378289848Sjkim
1379289848Sjkim	movq	$out, %rbp
1380289848Sjkim___
1381289848Sjkim$code.=<<___ if ($addx);
1382289848Sjkim	movl	\$0x80100,%r11d
1383289848Sjkim	andl	OPENSSL_ia32cap_P+8(%rip),%r11d
1384289848Sjkim	cmpl	\$0x80100,%r11d		# check for MULX and ADO/CX
1385289848Sjkim	je	.Lmulx_scatter
1386289848Sjkim___
1387289848Sjkim$code.=<<___;
1388289848Sjkim	movq	($out),%rbx		# pass b[0]
1389289848Sjkim	call	__rsaz_512_mul
1390289848Sjkim
1391289848Sjkim	movq	%xmm0, $out
1392289848Sjkim	movq	%xmm1, %rbp
1393289848Sjkim
1394289848Sjkim	movq	(%rsp), %r8
1395289848Sjkim	movq	8(%rsp), %r9
1396289848Sjkim	movq	16(%rsp), %r10
1397289848Sjkim	movq	24(%rsp), %r11
1398289848Sjkim	movq	32(%rsp), %r12
1399289848Sjkim	movq	40(%rsp), %r13
1400289848Sjkim	movq	48(%rsp), %r14
1401289848Sjkim	movq	56(%rsp), %r15
1402289848Sjkim
1403289848Sjkim	call	__rsaz_512_reduce
1404289848Sjkim___
1405289848Sjkim$code.=<<___ if ($addx);
1406289848Sjkim	jmp	.Lmul_scatter_tail
1407289848Sjkim
1408289848Sjkim.align	32
1409289848Sjkim.Lmulx_scatter:
1410289848Sjkim	movq	($out), %rdx		# pass b[0]
1411289848Sjkim	call	__rsaz_512_mulx
1412289848Sjkim
1413289848Sjkim	movq	%xmm0, $out
1414289848Sjkim	movq	%xmm1, %rbp
1415289848Sjkim
1416289848Sjkim	movq	128(%rsp), %rdx		# pull $n0
1417289848Sjkim	movq	(%rsp), %r8
1418289848Sjkim	movq	8(%rsp), %r9
1419289848Sjkim	movq	16(%rsp), %r10
1420289848Sjkim	movq	24(%rsp), %r11
1421289848Sjkim	movq	32(%rsp), %r12
1422289848Sjkim	movq	40(%rsp), %r13
1423289848Sjkim	movq	48(%rsp), %r14
1424289848Sjkim	movq	56(%rsp), %r15
1425289848Sjkim
1426289848Sjkim	call	__rsaz_512_reducex
1427289848Sjkim
1428289848Sjkim.Lmul_scatter_tail:
1429289848Sjkim___
1430289848Sjkim$code.=<<___;
1431289848Sjkim	addq	64(%rsp), %r8
1432289848Sjkim	adcq	72(%rsp), %r9
1433289848Sjkim	adcq	80(%rsp), %r10
1434289848Sjkim	adcq	88(%rsp), %r11
1435289848Sjkim	adcq	96(%rsp), %r12
1436289848Sjkim	adcq	104(%rsp), %r13
1437289848Sjkim	adcq	112(%rsp), %r14
1438289848Sjkim	adcq	120(%rsp), %r15
	# retrieve table pointer saved in xmm2; "inp" here names a register
	# variable from an enclosing scope (presumably rsi -- confirm against
	# the top of the file)
1439289848Sjkim	movq	%xmm2, $inp
1440289848Sjkim	sbbq	%rcx, %rcx
1441289848Sjkim
1442289848Sjkim	call	__rsaz_512_subtract
1443289848Sjkim
1444296279Sjkim	movq	%r8, 128*0($inp)	# scatter
1445296279Sjkim	movq	%r9, 128*1($inp)
1446296279Sjkim	movq	%r10, 128*2($inp)
1447296279Sjkim	movq	%r11, 128*3($inp)
1448296279Sjkim	movq	%r12, 128*4($inp)
1449296279Sjkim	movq	%r13, 128*5($inp)
1450296279Sjkim	movq	%r14, 128*6($inp)
1451296279Sjkim	movq	%r15, 128*7($inp)
1452289848Sjkim
1453289848Sjkim	leaq	128+24+48(%rsp), %rax
1454289848Sjkim	movq	-48(%rax), %r15
1455289848Sjkim	movq	-40(%rax), %r14
1456289848Sjkim	movq	-32(%rax), %r13
1457289848Sjkim	movq	-24(%rax), %r12
1458289848Sjkim	movq	-16(%rax), %rbp
1459289848Sjkim	movq	-8(%rax), %rbx
1460289848Sjkim	leaq	(%rax), %rsp
1461289848Sjkim.Lmul_scatter4_epilogue:
1462289848Sjkim	ret
1463289848Sjkim.size	rsaz_512_mul_scatter4,.-rsaz_512_mul_scatter4
1464289848Sjkim___
1465289848Sjkim}
1466289848Sjkim{
# rsaz_512_mul_by_one(%rdi=out, %rsi=inp, %rdx=mod, %rcx=n0)
#
# out[] = Montgomery reduction of inp[] (multiplication by 1 in the
# Montgomery domain, i.e. presumably the conversion out of Montgomery
# form -- confirm against callers).  Loads the 8 limbs into r8-r15,
# runs __rsaz_512_reduce(x) and stores the result; no final conditional
# subtraction is performed here.
1467289848Sjkimmy ($out,$inp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx");
1468289848Sjkim$code.=<<___;
1469289848Sjkim.globl	rsaz_512_mul_by_one
1470289848Sjkim.type	rsaz_512_mul_by_one,\@function,4
1471289848Sjkim.align	32
1472289848Sjkimrsaz_512_mul_by_one:
1473289848Sjkim	push	%rbx
1474289848Sjkim	push	%rbp
1475289848Sjkim	push	%r12
1476289848Sjkim	push	%r13
1477289848Sjkim	push	%r14
1478289848Sjkim	push	%r15
1479289848Sjkim
1480289848Sjkim	subq	\$128+24, %rsp
1481289848Sjkim.Lmul_by_one_body:
1482289848Sjkim___
1483289848Sjkim$code.=<<___ if ($addx);
1484289848Sjkim	movl	OPENSSL_ia32cap_P+8(%rip),%eax
1485289848Sjkim___
1486289848Sjkim$code.=<<___;
1487289848Sjkim	movq	$mod, %rbp	# reassign argument
	# n0; the reduce helpers read it at 128+8 off their own rsp
	# (the call pushes 8 bytes of return address)
1488289848Sjkim	movq	$n0, 128(%rsp)
1489289848Sjkim
1490289848Sjkim	movq	($inp), %r8
1491289848Sjkim	pxor	%xmm0, %xmm0
1492289848Sjkim	movq	8($inp), %r9
1493289848Sjkim	movq	16($inp), %r10
1494289848Sjkim	movq	24($inp), %r11
1495289848Sjkim	movq	32($inp), %r12
1496289848Sjkim	movq	40($inp), %r13
1497289848Sjkim	movq	48($inp), %r14
1498289848Sjkim	movq	56($inp), %r15
1499289848Sjkim
	# clear 112 bytes of the scratch buffer below the saved n0
1500289848Sjkim	movdqa	%xmm0, (%rsp)
1501289848Sjkim	movdqa	%xmm0, 16(%rsp)
1502289848Sjkim	movdqa	%xmm0, 32(%rsp)
1503289848Sjkim	movdqa	%xmm0, 48(%rsp)
1504289848Sjkim	movdqa	%xmm0, 64(%rsp)
1505289848Sjkim	movdqa	%xmm0, 80(%rsp)
1506289848Sjkim	movdqa	%xmm0, 96(%rsp)
1507289848Sjkim___
1508289848Sjkim$code.=<<___ if ($addx);
1509289848Sjkim	andl	\$0x80100,%eax
1510289848Sjkim	cmpl	\$0x80100,%eax		# check for MULX and ADO/CX
1511289848Sjkim	je	.Lby_one_callx
1512289848Sjkim___
1513289848Sjkim$code.=<<___;
1514289848Sjkim	call	__rsaz_512_reduce
1515289848Sjkim___
1516289848Sjkim$code.=<<___ if ($addx);
1517289848Sjkim	jmp	.Lby_one_tail
1518289848Sjkim.align	32
1519289848Sjkim.Lby_one_callx:
1520289848Sjkim	movq	128(%rsp), %rdx		# pull $n0
1521289848Sjkim	call	__rsaz_512_reducex
1522289848Sjkim.Lby_one_tail:
1523289848Sjkim___
1524289848Sjkim$code.=<<___;
1525289848Sjkim	movq	%r8, ($out)
1526289848Sjkim	movq	%r9, 8($out)
1527289848Sjkim	movq	%r10, 16($out)
1528289848Sjkim	movq	%r11, 24($out)
1529289848Sjkim	movq	%r12, 32($out)
1530289848Sjkim	movq	%r13, 40($out)
1531289848Sjkim	movq	%r14, 48($out)
1532289848Sjkim	movq	%r15, 56($out)
1533289848Sjkim
1534289848Sjkim	leaq	128+24+48(%rsp), %rax
1535289848Sjkim	movq	-48(%rax), %r15
1536289848Sjkim	movq	-40(%rax), %r14
1537289848Sjkim	movq	-32(%rax), %r13
1538289848Sjkim	movq	-24(%rax), %r12
1539289848Sjkim	movq	-16(%rax), %rbp
1540289848Sjkim	movq	-8(%rax), %rbx
1541289848Sjkim	leaq	(%rax), %rsp
1542289848Sjkim.Lmul_by_one_epilogue:
1543289848Sjkim	ret
1544289848Sjkim.size	rsaz_512_mul_by_one,.-rsaz_512_mul_by_one
1545289848Sjkim___
1546289848Sjkim}
1547289848Sjkim{	# __rsaz_512_reduce
1548289848Sjkim	#
1549289848Sjkim	# input:	%r8-%r15, %rbp - mod, 128(%rsp) - n0
1550289848Sjkim	# output:	%r8-%r15
1551289848Sjkim	# clobbers:	everything except %rbp and %rdi
	#
	# Montgomery reduction, 8 iterations.  Each iteration computes
	# rbx = r8 * n0 mod 2^64 and folds rbx*mod into (r8..r15), shifting
	# the limb vector right by one limb; the low limb cancels by
	# construction of rbx (Montgomery invariant n0 = -1/mod mod 2^64 --
	# presumed, established by the caller).
1552289848Sjkim$code.=<<___;
1553289848Sjkim.type	__rsaz_512_reduce,\@abi-omnipotent
1554289848Sjkim.align	32
1555289848Sjkim__rsaz_512_reduce:
1556289848Sjkim	movq	%r8, %rbx
	# n0 is at 128(rsp) in the caller's frame; +8 skips our return address
1557289848Sjkim	imulq	128+8(%rsp), %rbx
1558289848Sjkim	movq	0(%rbp), %rax
1559289848Sjkim	movl	\$8, %ecx
1560289848Sjkim	jmp	.Lreduction_loop
1561289848Sjkim
1562289848Sjkim.align	32
1563289848Sjkim.Lreduction_loop:
	# rax*rbx low half cancels r8; negq recreates the carry that the
	# discarded low-half addition would have produced
1564289848Sjkim	mulq	%rbx
1565289848Sjkim	movq	8(%rbp), %rax
1566289848Sjkim	negq	%r8
1567289848Sjkim	movq	%rdx, %r8
1568289848Sjkim	adcq	\$0, %r8
1569289848Sjkim
1570289848Sjkim	mulq	%rbx
1571289848Sjkim	addq	%rax, %r9
1572289848Sjkim	movq	16(%rbp), %rax
1573289848Sjkim	adcq	\$0, %rdx
1574289848Sjkim	addq	%r9, %r8
1575289848Sjkim	movq	%rdx, %r9
1576289848Sjkim	adcq	\$0, %r9
1577289848Sjkim
1578289848Sjkim	mulq	%rbx
1579289848Sjkim	addq	%rax, %r10
1580289848Sjkim	movq	24(%rbp), %rax
1581289848Sjkim	adcq	\$0, %rdx
1582289848Sjkim	addq	%r10, %r9
1583289848Sjkim	movq	%rdx, %r10
1584289848Sjkim	adcq	\$0, %r10
1585289848Sjkim
1586289848Sjkim	mulq	%rbx
1587289848Sjkim	addq	%rax, %r11
1588289848Sjkim	movq	32(%rbp), %rax
1589289848Sjkim	adcq	\$0, %rdx
1590289848Sjkim	addq	%r11, %r10
	# interleaved: preload n0 and start forming next iteration's
	# multiplier in rsi (completed by the imulq below)
1591289848Sjkim	 movq	128+8(%rsp), %rsi
1592289848Sjkim	#movq	%rdx, %r11
1593289848Sjkim	#adcq	\$0, %r11
1594289848Sjkim	adcq	\$0, %rdx
1595289848Sjkim	movq	%rdx, %r11
1596289848Sjkim
1597289848Sjkim	mulq	%rbx
1598289848Sjkim	addq	%rax, %r12
1599289848Sjkim	movq	40(%rbp), %rax
1600289848Sjkim	adcq	\$0, %rdx
1601289848Sjkim	 imulq	%r8, %rsi
1602289848Sjkim	addq	%r12, %r11
1603289848Sjkim	movq	%rdx, %r12
1604289848Sjkim	adcq	\$0, %r12
1605289848Sjkim
1606289848Sjkim	mulq	%rbx
1607289848Sjkim	addq	%rax, %r13
1608289848Sjkim	movq	48(%rbp), %rax
1609289848Sjkim	adcq	\$0, %rdx
1610289848Sjkim	addq	%r13, %r12
1611289848Sjkim	movq	%rdx, %r13
1612289848Sjkim	adcq	\$0, %r13
1613289848Sjkim
1614289848Sjkim	mulq	%rbx
1615289848Sjkim	addq	%rax, %r14
1616289848Sjkim	movq	56(%rbp), %rax
1617289848Sjkim	adcq	\$0, %rdx
1618289848Sjkim	addq	%r14, %r13
1619289848Sjkim	movq	%rdx, %r14
1620289848Sjkim	adcq	\$0, %r14
1621289848Sjkim
1622289848Sjkim	mulq	%rbx
1623289848Sjkim	 movq	%rsi, %rbx
1624289848Sjkim	addq	%rax, %r15
1625289848Sjkim	 movq	0(%rbp), %rax
1626289848Sjkim	adcq	\$0, %rdx
1627289848Sjkim	addq	%r15, %r14
1628289848Sjkim	movq	%rdx, %r15
1629289848Sjkim	adcq	\$0, %r15
1630289848Sjkim
1631289848Sjkim	decl	%ecx
1632289848Sjkim	jne	.Lreduction_loop
1633289848Sjkim
1634289848Sjkim	ret
1635289848Sjkim.size	__rsaz_512_reduce,.-__rsaz_512_reduce
1636289848Sjkim___
1637289848Sjkim}
1638289848Sjkimif ($addx) {
1639289848Sjkim	# __rsaz_512_reducex
1640289848Sjkim	#
1641289848Sjkim	# input:	%r8-%r15, %rbp - mod, 128(%rsp) - n0
1642289848Sjkim	# output:	%r8-%r15
1643289848Sjkim	# clobbers:	everything except %rbp and %rdi
	#
	# MULX/ADCX/ADOX flavour of __rsaz_512_reduce: two independent carry
	# chains (CF via adcx, OF via adox) let the 8 Montgomery folding
	# steps per iteration run without serialising on one flag.
	# %rdx is preloaded with the first multiplier ingredient (n0) by the
	# caller; %rsi is kept at zero throughout.
1644289848Sjkim$code.=<<___;
1645289848Sjkim.type	__rsaz_512_reducex,\@abi-omnipotent
1646289848Sjkim.align	32
1647289848Sjkim__rsaz_512_reducex:
1648289848Sjkim	#movq	128+8(%rsp), %rdx		# pull $n0
	# rdx = r8 * n0 mod 2^64: multiplier that cancels the low limb
1649289848Sjkim	imulq	%r8, %rdx
1650289848Sjkim	xorq	%rsi, %rsi			# cf=0,of=0
1651289848Sjkim	movl	\$8, %ecx
1652289848Sjkim	jmp	.Lreduction_loopx
1653289848Sjkim
1654289848Sjkim.align	32
1655289848Sjkim.Lreduction_loopx:
1656289848Sjkim	mov	%r8, %rbx
1657289848Sjkim	mulx	0(%rbp), %rax, %r8
1658289848Sjkim	adcx	%rbx, %rax
1659289848Sjkim	adox	%r9, %r8
1660289848Sjkim
1661289848Sjkim	mulx	8(%rbp), %rax, %r9
1662289848Sjkim	adcx	%rax, %r8
1663289848Sjkim	adox	%r10, %r9
1664289848Sjkim
1665289848Sjkim	mulx	16(%rbp), %rbx, %r10
1666289848Sjkim	adcx	%rbx, %r9
1667289848Sjkim	adox	%r11, %r10
1668289848Sjkim
1669289848Sjkim	mulx	24(%rbp), %rbx, %r11
1670289848Sjkim	adcx	%rbx, %r10
1671289848Sjkim	adox	%r12, %r11
1672289848Sjkim
1673289848Sjkim	.byte	0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00	# mulx	32(%rbp), %rbx, %r12
1674289848Sjkim	 mov	%rdx, %rax
1675289848Sjkim	 mov	%r8, %rdx
1676289848Sjkim	adcx	%rbx, %r11
1677289848Sjkim	adox	%r13, %r12
1678289848Sjkim
	# rbx = low(new r8 * n0): next iteration's multiplier
	# (n0 read at 128+8 because the call pushed a return address)
1679289848Sjkim	 mulx	128+8(%rsp), %rbx, %rdx
1680289848Sjkim	 mov	%rax, %rdx
1681289848Sjkim
1682289848Sjkim	mulx	40(%rbp), %rax, %r13
1683289848Sjkim	adcx	%rax, %r12
1684289848Sjkim	adox	%r14, %r13
1685289848Sjkim
1686289848Sjkim	.byte	0xc4,0x62,0xfb,0xf6,0xb5,0x30,0x00,0x00,0x00	# mulx	48(%rbp), %rax, %r14
1687289848Sjkim	adcx	%rax, %r13
1688289848Sjkim	adox	%r15, %r14
1689289848Sjkim
1690289848Sjkim	mulx	56(%rbp), %rax, %r15
1691289848Sjkim	 mov	%rbx, %rdx
1692289848Sjkim	adcx	%rax, %r14
1693289848Sjkim	adox	%rsi, %r15			# %rsi is 0
1694289848Sjkim	adcx	%rsi, %r15			# cf=0
1695289848Sjkim
1696289848Sjkim	decl	%ecx				# of=0
1697289848Sjkim	jne	.Lreduction_loopx
1698289848Sjkim
1699289848Sjkim	ret
1700289848Sjkim.size	__rsaz_512_reducex,.-__rsaz_512_reducex
1701289848Sjkim___
1702289848Sjkim}
1703289848Sjkim{	# __rsaz_512_subtract
1704289848Sjkim	# input: %r8-%r15, %rdi - $out, %rbp - $mod, %rcx - mask
1705289848Sjkim	# output:
1706289848Sjkim	# clobbers: everything but %rdi, %rsi and %rbp
	#
	# Constant-time conditional subtraction of the modulus.  mask is
	# either 0 or all-ones (the sbb result of the caller's final carry):
	# out[] = r[] + ((-mod) & mask), i.e. mod is subtracted iff the mask
	# is set, with an identical instruction stream either way.
1707289848Sjkim$code.=<<___;
1708289848Sjkim.type	__rsaz_512_subtract,\@abi-omnipotent
1709289848Sjkim.align	32
1710289848Sjkim__rsaz_512_subtract:
1711289848Sjkim	movq	%r8, ($out)
1712289848Sjkim	movq	%r9, 8($out)
1713289848Sjkim	movq	%r10, 16($out)
1714289848Sjkim	movq	%r11, 24($out)
1715289848Sjkim	movq	%r12, 32($out)
1716289848Sjkim	movq	%r13, 40($out)
1717289848Sjkim	movq	%r14, 48($out)
1718289848Sjkim	movq	%r15, 56($out)
1719289848Sjkim
	# form (-mod) & mask limb by limb: neg on limb 0, not on the rest;
	# the +1 of the two's complement never carries out of limb 0 because
	# limb 0 of a Montgomery modulus is odd (presumed, not visible here)
1720289848Sjkim	movq	0($mod), %r8
1721289848Sjkim	movq	8($mod), %r9
1722289848Sjkim	negq	%r8
1723289848Sjkim	notq	%r9
1724289848Sjkim	andq	%rcx, %r8
1725289848Sjkim	movq	16($mod), %r10
1726289848Sjkim	andq	%rcx, %r9
1727289848Sjkim	notq	%r10
1728289848Sjkim	movq	24($mod), %r11
1729289848Sjkim	andq	%rcx, %r10
1730289848Sjkim	notq	%r11
1731289848Sjkim	movq	32($mod), %r12
1732289848Sjkim	andq	%rcx, %r11
1733289848Sjkim	notq	%r12
1734289848Sjkim	movq	40($mod), %r13
1735289848Sjkim	andq	%rcx, %r12
1736289848Sjkim	notq	%r13
1737289848Sjkim	movq	48($mod), %r14
1738289848Sjkim	andq	%rcx, %r13
1739289848Sjkim	notq	%r14
1740289848Sjkim	movq	56($mod), %r15
1741289848Sjkim	andq	%rcx, %r14
1742289848Sjkim	notq	%r15
1743289848Sjkim	andq	%rcx, %r15
1744289848Sjkim
1745289848Sjkim	addq	($out), %r8
1746289848Sjkim	adcq	8($out), %r9
1747289848Sjkim	adcq	16($out), %r10
1748289848Sjkim	adcq	24($out), %r11
1749289848Sjkim	adcq	32($out), %r12
1750289848Sjkim	adcq	40($out), %r13
1751289848Sjkim	adcq	48($out), %r14
1752289848Sjkim	adcq	56($out), %r15
1753289848Sjkim
1754289848Sjkim	movq	%r8, ($out)
1755289848Sjkim	movq	%r9, 8($out)
1756289848Sjkim	movq	%r10, 16($out)
1757289848Sjkim	movq	%r11, 24($out)
1758289848Sjkim	movq	%r12, 32($out)
1759289848Sjkim	movq	%r13, 40($out)
1760289848Sjkim	movq	%r14, 48($out)
1761289848Sjkim	movq	%r15, 56($out)
1762289848Sjkim
1763289848Sjkim	ret
1764289848Sjkim.size	__rsaz_512_subtract,.-__rsaz_512_subtract
1765289848Sjkim___
1766289848Sjkim}
1767289848Sjkim{	# __rsaz_512_mul
1768289848Sjkim	#
1769289848Sjkim	# input: %rsi - ap, %rbp - bp
1770289848Sjkim	# output:
1771289848Sjkim	# clobbers: everything
	#
	# Schoolbook 8x8-limb multiply: 16-limb product written to 8(%rsp).
	# b[0] must be preloaded in %rbx by the caller (every caller does
	# "movq (bp), %rbx" first); the remaining 7 b limbs are fetched from
	# bp inside .Loop_mul, one multiply-accumulate column per iteration.
1772289848Sjkimmy ($ap,$bp) = ("%rsi","%rbp");
1773289848Sjkim$code.=<<___;
1774289848Sjkim.type	__rsaz_512_mul,\@abi-omnipotent
1775289848Sjkim.align	32
1776289848Sjkim__rsaz_512_mul:
1777289848Sjkim	leaq	8(%rsp), %rdi
1778289848Sjkim
1779289848Sjkim	movq	($ap), %rax
1780289848Sjkim	mulq	%rbx
1781289848Sjkim	movq	%rax, (%rdi)
1782289848Sjkim	movq	8($ap), %rax
1783289848Sjkim	movq	%rdx, %r8
1784289848Sjkim
1785289848Sjkim	mulq	%rbx
1786289848Sjkim	addq	%rax, %r8
1787289848Sjkim	movq	16($ap), %rax
1788289848Sjkim	movq	%rdx, %r9
1789289848Sjkim	adcq	\$0, %r9
1790289848Sjkim
1791289848Sjkim	mulq	%rbx
1792289848Sjkim	addq	%rax, %r9
1793289848Sjkim	movq	24($ap), %rax
1794289848Sjkim	movq	%rdx, %r10
1795289848Sjkim	adcq	\$0, %r10
1796289848Sjkim
1797289848Sjkim	mulq	%rbx
1798289848Sjkim	addq	%rax, %r10
1799289848Sjkim	movq	32($ap), %rax
1800289848Sjkim	movq	%rdx, %r11
1801289848Sjkim	adcq	\$0, %r11
1802289848Sjkim
1803289848Sjkim	mulq	%rbx
1804289848Sjkim	addq	%rax, %r11
1805289848Sjkim	movq	40($ap), %rax
1806289848Sjkim	movq	%rdx, %r12
1807289848Sjkim	adcq	\$0, %r12
1808289848Sjkim
1809289848Sjkim	mulq	%rbx
1810289848Sjkim	addq	%rax, %r12
1811289848Sjkim	movq	48($ap), %rax
1812289848Sjkim	movq	%rdx, %r13
1813289848Sjkim	adcq	\$0, %r13
1814289848Sjkim
1815289848Sjkim	mulq	%rbx
1816289848Sjkim	addq	%rax, %r13
1817289848Sjkim	movq	56($ap), %rax
1818289848Sjkim	movq	%rdx, %r14
1819289848Sjkim	adcq	\$0, %r14
1820289848Sjkim
1821289848Sjkim	mulq	%rbx
1822289848Sjkim	addq	%rax, %r14
1823289848Sjkim	 movq	($ap), %rax
1824289848Sjkim	movq	%rdx, %r15
1825289848Sjkim	adcq	\$0, %r15
1826289848Sjkim
1827289848Sjkim	leaq	8($bp), $bp
1828289848Sjkim	leaq	8(%rdi), %rdi
1829289848Sjkim
1830289848Sjkim	movl	\$7, %ecx
1831289848Sjkim	jmp	.Loop_mul
1832289848Sjkim
1833289848Sjkim.align	32
1834289848Sjkim.Loop_mul:
	# one column per iteration: flush the lowest finished limb to the
	# product buffer, carry the 8-limb running window in r8-r15
1835289848Sjkim	movq	($bp), %rbx
1836289848Sjkim	mulq	%rbx
1837289848Sjkim	addq	%rax, %r8
1838289848Sjkim	movq	8($ap), %rax
1839289848Sjkim	movq	%r8, (%rdi)
1840289848Sjkim	movq	%rdx, %r8
1841289848Sjkim	adcq	\$0, %r8
1842289848Sjkim
1843289848Sjkim	mulq	%rbx
1844289848Sjkim	addq	%rax, %r9
1845289848Sjkim	movq	16($ap), %rax
1846289848Sjkim	adcq	\$0, %rdx
1847289848Sjkim	addq	%r9, %r8
1848289848Sjkim	movq	%rdx, %r9
1849289848Sjkim	adcq	\$0, %r9
1850289848Sjkim
1851289848Sjkim	mulq	%rbx
1852289848Sjkim	addq	%rax, %r10
1853289848Sjkim	movq	24($ap), %rax
1854289848Sjkim	adcq	\$0, %rdx
1855289848Sjkim	addq	%r10, %r9
1856289848Sjkim	movq	%rdx, %r10
1857289848Sjkim	adcq	\$0, %r10
1858289848Sjkim
1859289848Sjkim	mulq	%rbx
1860289848Sjkim	addq	%rax, %r11
1861289848Sjkim	movq	32($ap), %rax
1862289848Sjkim	adcq	\$0, %rdx
1863289848Sjkim	addq	%r11, %r10
1864289848Sjkim	movq	%rdx, %r11
1865289848Sjkim	adcq	\$0, %r11
1866289848Sjkim
1867289848Sjkim	mulq	%rbx
1868289848Sjkim	addq	%rax, %r12
1869289848Sjkim	movq	40($ap), %rax
1870289848Sjkim	adcq	\$0, %rdx
1871289848Sjkim	addq	%r12, %r11
1872289848Sjkim	movq	%rdx, %r12
1873289848Sjkim	adcq	\$0, %r12
1874289848Sjkim
1875289848Sjkim	mulq	%rbx
1876289848Sjkim	addq	%rax, %r13
1877289848Sjkim	movq	48($ap), %rax
1878289848Sjkim	adcq	\$0, %rdx
1879289848Sjkim	addq	%r13, %r12
1880289848Sjkim	movq	%rdx, %r13
1881289848Sjkim	adcq	\$0, %r13
1882289848Sjkim
1883289848Sjkim	mulq	%rbx
1884289848Sjkim	addq	%rax, %r14
1885289848Sjkim	movq	56($ap), %rax
1886289848Sjkim	adcq	\$0, %rdx
1887289848Sjkim	addq	%r14, %r13
1888289848Sjkim	movq	%rdx, %r14
1889289848Sjkim	 leaq	8($bp), $bp
1890289848Sjkim	adcq	\$0, %r14
1891289848Sjkim
1892289848Sjkim	mulq	%rbx
1893289848Sjkim	addq	%rax, %r15
1894289848Sjkim	 movq	($ap), %rax
1895289848Sjkim	adcq	\$0, %rdx
1896289848Sjkim	addq	%r15, %r14
1897289848Sjkim	movq	%rdx, %r15
1898289848Sjkim	adcq	\$0, %r15
1899289848Sjkim
1900289848Sjkim	leaq	8(%rdi), %rdi
1901289848Sjkim
1902289848Sjkim	decl	%ecx
1903289848Sjkim	jnz	.Loop_mul
1904289848Sjkim
1905289848Sjkim	movq	%r8, (%rdi)
1906289848Sjkim	movq	%r9, 8(%rdi)
1907289848Sjkim	movq	%r10, 16(%rdi)
1908289848Sjkim	movq	%r11, 24(%rdi)
1909289848Sjkim	movq	%r12, 32(%rdi)
1910289848Sjkim	movq	%r13, 40(%rdi)
1911289848Sjkim	movq	%r14, 48(%rdi)
1912289848Sjkim	movq	%r15, 56(%rdi)
1913289848Sjkim
1914289848Sjkim	ret
1915289848Sjkim.size	__rsaz_512_mul,.-__rsaz_512_mul
1916289848Sjkim___
1917289848Sjkim}
if ($addx) {
	# __rsaz_512_mulx
	#
	# ADX/BMI2 (mulx/adcx/adox) code path of the 512-bit multiply:
	# computes the 1024-bit product of the 8-qword vectors at $ap and
	# $bp into the 16-qword scratch area starting at 8(%rsp).  Two
	# independent carry chains are maintained in parallel - CF via
	# adcx, OF via adox.  The loop counter runs from -6 up to 0 so the
	# back-edge "inc %rcx" never overflows (OF stays clear) and inc
	# preserves CF, keeping both chains intact across iterations.
	# The .byte sequences are hand-encoded mulx forms kept verbatim
	# for toolchain/alignment reasons.
	#
	# input: %rsi - ap, %rbp - bp, %rdx - bp[0] preloaded by caller
	# output: 16-qword product at 8(%rsp)
	# clobbers: everything
my ($ap,$bp,$zero) = ("%rsi","%rbp","%rdi");
$code.=<<___;
.type	__rsaz_512_mulx,\@abi-omnipotent
.align	32
__rsaz_512_mulx:
	mulx	($ap), %rbx, %r8	# initial %rdx preloaded by caller
	mov	\$-6, %rcx

	mulx	8($ap), %rax, %r9
	movq	%rbx, 8(%rsp)

	mulx	16($ap), %rbx, %r10
	adc	%rax, %r8

	mulx	24($ap), %rax, %r11
	adc	%rbx, %r9

	mulx	32($ap), %rbx, %r12
	adc	%rax, %r10

	mulx	40($ap), %rax, %r13
	adc	%rbx, %r11

	mulx	48($ap), %rbx, %r14
	adc	%rax, %r12

	mulx	56($ap), %rax, %r15
	 mov	8($bp), %rdx
	adc	%rbx, %r13
	adc	%rax, %r14
	adc	\$0, %r15

	xor	$zero, $zero		# cf=0,of=0
	jmp	.Loop_mulx

.align	32
.Loop_mulx:
	movq	%r8, %rbx
	mulx	($ap), %rax, %r8
	adcx	%rax, %rbx
	adox	%r9, %r8

	mulx	8($ap), %rax, %r9
	adcx	%rax, %r8
	adox	%r10, %r9

	mulx	16($ap), %rax, %r10
	adcx	%rax, %r9
	adox	%r11, %r10

	mulx	24($ap), %rax, %r11
	adcx	%rax, %r10
	adox	%r12, %r11

	.byte	0x3e,0xc4,0x62,0xfb,0xf6,0xa6,0x20,0x00,0x00,0x00	# mulx	32($ap), %rax, %r12
	adcx	%rax, %r11
	adox	%r13, %r12

	mulx	40($ap), %rax, %r13
	adcx	%rax, %r12
	adox	%r14, %r13

	mulx	48($ap), %rax, %r14
	adcx	%rax, %r13
	adox	%r15, %r14

	mulx	56($ap), %rax, %r15
	 movq	64($bp,%rcx,8), %rdx
	 movq	%rbx, 8+64-8(%rsp,%rcx,8)
	adcx	%rax, %r14
	adox	$zero, %r15
	adcx	$zero, %r15		# cf=0

	inc	%rcx			# of=0
	jnz	.Loop_mulx

	movq	%r8, %rbx
	mulx	($ap), %rax, %r8
	adcx	%rax, %rbx
	adox	%r9, %r8

	.byte	0xc4,0x62,0xfb,0xf6,0x8e,0x08,0x00,0x00,0x00	# mulx	8($ap), %rax, %r9
	adcx	%rax, %r8
	adox	%r10, %r9

	.byte	0xc4,0x62,0xfb,0xf6,0x96,0x10,0x00,0x00,0x00	# mulx	16($ap), %rax, %r10
	adcx	%rax, %r9
	adox	%r11, %r10

	mulx	24($ap), %rax, %r11
	adcx	%rax, %r10
	adox	%r12, %r11

	mulx	32($ap), %rax, %r12
	adcx	%rax, %r11
	adox	%r13, %r12

	mulx	40($ap), %rax, %r13
	adcx	%rax, %r12
	adox	%r14, %r13

	.byte	0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00	# mulx	48($ap), %rax, %r14
	adcx	%rax, %r13
	adox	%r15, %r14

	.byte	0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00	# mulx	56($ap), %rax, %r15
	adcx	%rax, %r14
	adox	$zero, %r15
	adcx	$zero, %r15

	mov	%rbx, 8+64-8(%rsp)
	mov	%r8, 8+64(%rsp)
	mov	%r9, 8+64+8(%rsp)
	mov	%r10, 8+64+16(%rsp)
	mov	%r11, 8+64+24(%rsp)
	mov	%r12, 8+64+32(%rsp)
	mov	%r13, 8+64+40(%rsp)
	mov	%r14, 8+64+48(%rsp)
	mov	%r15, 8+64+56(%rsp)

	ret
.size	__rsaz_512_mulx,.-__rsaz_512_mulx
___
}
{
# rsaz_512_scatter4: $out - table base, $inp - 8-qword source, $power -
#	column index; stores inp[i] at out + power*8 + i*128.
# rsaz_512_gather4: $out - 8-qword destination, $inp - table base,
#	$power - column index.  Every one of the 16 columns is read each
#	iteration and combined under pcmpeqd-generated masks, so the
#	memory access pattern is independent of $power (constant-time
#	gather).  On Win64 the prologue/epilogue save/restore xmm6-15;
#	the prologue is hand-encoded so .LSEH_info_rsaz_512_gather4 can
#	describe it with compact unwind codes.
my ($out,$inp,$power)= $win64 ? ("%rcx","%rdx","%r8d") : ("%rdi","%rsi","%edx");
$code.=<<___;
.globl	rsaz_512_scatter4
.type	rsaz_512_scatter4,\@abi-omnipotent
.align	16
rsaz_512_scatter4:
	leaq	($out,$power,8), $out
	movl	\$8, %r9d
	jmp	.Loop_scatter
.align	16
.Loop_scatter:
	movq	($inp), %rax
	leaq	8($inp), $inp
	movq	%rax, ($out)
	leaq	128($out), $out
	decl	%r9d
	jnz	.Loop_scatter
	ret
.size	rsaz_512_scatter4,.-rsaz_512_scatter4

.globl	rsaz_512_gather4
.type	rsaz_512_gather4,\@abi-omnipotent
.align	16
rsaz_512_gather4:
___
$code.=<<___	if ($win64);
.LSEH_begin_rsaz_512_gather4:
	.byte	0x48,0x81,0xec,0xa8,0x00,0x00,0x00	# sub    $0xa8,%rsp
	.byte	0x0f,0x29,0x34,0x24			# movaps %xmm6,(%rsp)
	.byte	0x0f,0x29,0x7c,0x24,0x10		# movaps %xmm7,0x10(%rsp)
	.byte	0x44,0x0f,0x29,0x44,0x24,0x20		# movaps %xmm8,0x20(%rsp)
	.byte	0x44,0x0f,0x29,0x4c,0x24,0x30		# movaps %xmm9,0x30(%rsp)
	.byte	0x44,0x0f,0x29,0x54,0x24,0x40		# movaps %xmm10,0x40(%rsp)
	.byte	0x44,0x0f,0x29,0x5c,0x24,0x50		# movaps %xmm11,0x50(%rsp)
	.byte	0x44,0x0f,0x29,0x64,0x24,0x60		# movaps %xmm12,0x60(%rsp)
	.byte	0x44,0x0f,0x29,0x6c,0x24,0x70		# movaps %xmm13,0x70(%rsp)
	.byte	0x44,0x0f,0x29,0xb4,0x24,0x80,0,0,0	# movaps %xmm14,0x80(%rsp)
	.byte	0x44,0x0f,0x29,0xbc,0x24,0x90,0,0,0	# movaps %xmm15,0x90(%rsp)
___
$code.=<<___;
	movd	$power,%xmm8
	movdqa	.Linc+16(%rip),%xmm1	# 00000002000000020000000200000002
	movdqa	.Linc(%rip),%xmm0	# 00000001000000010000000000000000

	pshufd	\$0,%xmm8,%xmm8		# broadcast $power
	movdqa	%xmm1,%xmm7
	movdqa	%xmm1,%xmm2
___
########################################################################
# calculate mask by comparing 0..15 to $power
#
for($i=0;$i<4;$i++) {
$code.=<<___;
	paddd	%xmm`$i`,%xmm`$i+1`
	pcmpeqd	%xmm8,%xmm`$i`
	movdqa	%xmm7,%xmm`$i+3`
___
}
for(;$i<7;$i++) {
$code.=<<___;
	paddd	%xmm`$i`,%xmm`$i+1`
	pcmpeqd	%xmm8,%xmm`$i`
___
}
$code.=<<___;
	pcmpeqd	%xmm8,%xmm7
	movl	\$8, %r9d
	jmp	.Loop_gather
.align	16
.Loop_gather:
	movdqa	16*0($inp),%xmm8
	movdqa	16*1($inp),%xmm9
	movdqa	16*2($inp),%xmm10
	movdqa	16*3($inp),%xmm11
	pand	%xmm0,%xmm8
	movdqa	16*4($inp),%xmm12
	pand	%xmm1,%xmm9
	movdqa	16*5($inp),%xmm13
	pand	%xmm2,%xmm10
	movdqa	16*6($inp),%xmm14
	pand	%xmm3,%xmm11
	movdqa	16*7($inp),%xmm15
	leaq	128($inp), $inp
	pand	%xmm4,%xmm12
	pand	%xmm5,%xmm13
	pand	%xmm6,%xmm14
	pand	%xmm7,%xmm15
	por	%xmm10,%xmm8
	por	%xmm11,%xmm9
	por	%xmm12,%xmm8
	por	%xmm13,%xmm9
	por	%xmm14,%xmm8
	por	%xmm15,%xmm9

	por	%xmm9,%xmm8
	pshufd	\$0x4e,%xmm8,%xmm9
	por	%xmm9,%xmm8
	movq	%xmm8,($out)
	leaq	8($out), $out
	decl	%r9d
	jnz	.Loop_gather
___
$code.=<<___	if ($win64);
	movaps	0x00(%rsp),%xmm6
	movaps	0x10(%rsp),%xmm7
	movaps	0x20(%rsp),%xmm8
	movaps	0x30(%rsp),%xmm9
	movaps	0x40(%rsp),%xmm10
	movaps	0x50(%rsp),%xmm11
	movaps	0x60(%rsp),%xmm12
	movaps	0x70(%rsp),%xmm13
	movaps	0x80(%rsp),%xmm14
	movaps	0x90(%rsp),%xmm15
	add	\$0xa8,%rsp
___
$code.=<<___;
	ret
.LSEH_end_rsaz_512_gather4:
.size	rsaz_512_gather4,.-rsaz_512_gather4

.align	64
.Linc:
	.long	0,0, 1,1
	.long	2,2, 2,2
___
}
2175289848Sjkim
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
#
# Win64 structured-exception handler shared by the rsaz_512_* routines.
# HandlerData[0]/[1] (resolved against disp->ImageBase) delimit the
# covered function's body: before the body or at/after the epilogue the
# caller's frame is already intact and we fall through to the common
# tail.  Inside the body, the saved %rbx/%rbp/%r12-%r15 are restored
# into the CONTEXT record; for rsaz_512_mul_gather4 (recognized by its
# epilogue label) an extra 0xa8+48 bytes sit on the stack and the ten
# xmm registers saved there are copied back into the CONTEXT as well.
# The handler then forwards to RtlVirtualUnwind and reports
# ExceptionContinueSearch.  The .pdata/.xdata sections register the
# handler for each routine; rsaz_512_gather4 instead uses compact
# unwind codes (UNW_FLAG version-1 opcodes) matching its hand-encoded
# prologue.
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<end of prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	lea	128+24+48(%rax),%rax

	lea	.Lmul_gather4_epilogue(%rip),%rbx
	cmp	%r10,%rbx
	jne	.Lse_not_in_mul_gather4

	lea	0xb0(%rax),%rax

	lea	-48-0xa8(%rax),%rsi
	lea	512($context),%rdi
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq

.Lse_not_in_mul_gather4:
	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_rsaz_512_sqr
	.rva	.LSEH_end_rsaz_512_sqr
	.rva	.LSEH_info_rsaz_512_sqr

	.rva	.LSEH_begin_rsaz_512_mul
	.rva	.LSEH_end_rsaz_512_mul
	.rva	.LSEH_info_rsaz_512_mul

	.rva	.LSEH_begin_rsaz_512_mul_gather4
	.rva	.LSEH_end_rsaz_512_mul_gather4
	.rva	.LSEH_info_rsaz_512_mul_gather4

	.rva	.LSEH_begin_rsaz_512_mul_scatter4
	.rva	.LSEH_end_rsaz_512_mul_scatter4
	.rva	.LSEH_info_rsaz_512_mul_scatter4

	.rva	.LSEH_begin_rsaz_512_mul_by_one
	.rva	.LSEH_end_rsaz_512_mul_by_one
	.rva	.LSEH_info_rsaz_512_mul_by_one

	.rva	.LSEH_begin_rsaz_512_gather4
	.rva	.LSEH_end_rsaz_512_gather4
	.rva	.LSEH_info_rsaz_512_gather4

.section	.xdata
.align	8
.LSEH_info_rsaz_512_sqr:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lsqr_body,.Lsqr_epilogue			# HandlerData[]
.LSEH_info_rsaz_512_mul:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lmul_body,.Lmul_epilogue			# HandlerData[]
.LSEH_info_rsaz_512_mul_gather4:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lmul_gather4_body,.Lmul_gather4_epilogue	# HandlerData[]
.LSEH_info_rsaz_512_mul_scatter4:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lmul_scatter4_body,.Lmul_scatter4_epilogue	# HandlerData[]
.LSEH_info_rsaz_512_mul_by_one:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lmul_by_one_body,.Lmul_by_one_epilogue		# HandlerData[]
.LSEH_info_rsaz_512_gather4:
	.byte	0x01,0x46,0x16,0x00
	.byte	0x46,0xf8,0x09,0x00	# vmovaps 0x90(rsp),xmm15
	.byte	0x3d,0xe8,0x08,0x00	# vmovaps 0x80(rsp),xmm14
	.byte	0x34,0xd8,0x07,0x00	# vmovaps 0x70(rsp),xmm13
	.byte	0x2e,0xc8,0x06,0x00	# vmovaps 0x60(rsp),xmm12
	.byte	0x28,0xb8,0x05,0x00	# vmovaps 0x50(rsp),xmm11
	.byte	0x22,0xa8,0x04,0x00	# vmovaps 0x40(rsp),xmm10
	.byte	0x1c,0x98,0x03,0x00	# vmovaps 0x30(rsp),xmm9
	.byte	0x16,0x88,0x02,0x00	# vmovaps 0x20(rsp),xmm8
	.byte	0x10,0x78,0x01,0x00	# vmovaps 0x10(rsp),xmm7
	.byte	0x0b,0x68,0x00,0x00	# vmovaps 0x00(rsp),xmm6
	.byte	0x07,0x01,0x15,0x00	# sub     rsp,0xa8
___
}
2348289848Sjkim
# Interpolate the remaining `...` expressions (used above to compose
# %xmm register names), then emit the generated assembly on stdout and
# flush/close it so truncated output is detected.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT;
2352