1289848Sjkim#!/usr/bin/env perl
2289848Sjkim
3289848Sjkim##############################################################################
4289848Sjkim#                                                                            #
5289848Sjkim#  Copyright (c) 2012, Intel Corporation                                     #
6289848Sjkim#                                                                            #
7289848Sjkim#  All rights reserved.                                                      #
8289848Sjkim#                                                                            #
9289848Sjkim#  Redistribution and use in source and binary forms, with or without        #
10289848Sjkim#  modification, are permitted provided that the following conditions are    #
11289848Sjkim#  met:                                                                      #
12289848Sjkim#                                                                            #
13289848Sjkim#  *  Redistributions of source code must retain the above copyright         #
14289848Sjkim#     notice, this list of conditions and the following disclaimer.          #
15289848Sjkim#                                                                            #
16289848Sjkim#  *  Redistributions in binary form must reproduce the above copyright      #
17289848Sjkim#     notice, this list of conditions and the following disclaimer in the    #
18289848Sjkim#     documentation and/or other materials provided with the                 #
19289848Sjkim#     distribution.                                                          #
20289848Sjkim#                                                                            #
21289848Sjkim#  *  Neither the name of the Intel Corporation nor the names of its         #
22289848Sjkim#     contributors may be used to endorse or promote products derived from   #
23289848Sjkim#     this software without specific prior written permission.               #
24289848Sjkim#                                                                            #
25289848Sjkim#                                                                            #
26289848Sjkim#  THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY          #
27289848Sjkim#  EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE         #
28289848Sjkim#  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR        #
29289848Sjkim#  PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR            #
30289848Sjkim#  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,     #
31289848Sjkim#  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,       #
32289848Sjkim#  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR        #
33289848Sjkim#  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF    #
34289848Sjkim#  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING      #
35289848Sjkim#  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS        #
36289848Sjkim#  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.              #
37289848Sjkim#                                                                            #
38289848Sjkim##############################################################################
39289848Sjkim# Developers and authors:                                                    #
40289848Sjkim# Shay Gueron (1, 2), and Vlad Krasnov (1)                                   #
41289848Sjkim# (1) Intel Architecture Group, Microprocessor and Chipset Development,      #
42289848Sjkim#     Israel Development Center, Haifa, Israel                               #
43289848Sjkim# (2) University of Haifa                                                    #
44289848Sjkim##############################################################################
45289848Sjkim# Reference:                                                                 #
46289848Sjkim# [1] S. Gueron, "Efficient Software Implementations of Modular              #
47289848Sjkim#     Exponentiation", http://eprint.iacr.org/2011/239                       #
48289848Sjkim# [2] S. Gueron, V. Krasnov. "Speeding up Big-Numbers Squaring".             #
49289848Sjkim#     IEEE Proceedings of 9th International Conference on Information        #
50289848Sjkim#     Technology: New Generations (ITNG 2012), 821-823 (2012).               #
51289848Sjkim# [3] S. Gueron, Efficient Software Implementations of Modular Exponentiation#
52289848Sjkim#     Journal of Cryptographic Engineering 2:31-43 (2012).                   #
53289848Sjkim# [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis    #
54289848Sjkim#     resistant 512-bit and 1024-bit modular exponentiation for optimizing   #
55289848Sjkim#     RSA1024 and RSA2048 on x86_64 platforms",                              #
56289848Sjkim#     http://rt.openssl.org/Ticket/Display.html?id=2582&user=guest&pass=guest#
57289848Sjkim##############################################################################
58289848Sjkim
59289848Sjkim# While original submission covers 512- and 1024-bit exponentiation,
60289848Sjkim# this module is limited to 512-bit version only (and as such
61289848Sjkim# accelerates RSA1024 sign). This is because improvement for longer
62289848Sjkim# keys is not high enough to justify the effort, highest measured
63289848Sjkim# was ~5% on Westmere. [This is relative to OpenSSL 1.0.2, upcoming
64289848Sjkim# for the moment of this writing!] Nor does this module implement
65289848Sjkim# "monolithic" complete exponentiation jumbo-subroutine, but adheres
66289848Sjkim# to more modular mixture of C and assembly. And it's optimized even
67289848Sjkim# for processors other than Intel Core family (see table below for
68289848Sjkim# improvement coefficients).
69289848Sjkim# 						<appro@openssl.org>
70289848Sjkim#
71289848Sjkim# RSA1024 sign/sec	this/original	|this/rsax(*)	this/fips(*)
72289848Sjkim#			----------------+---------------------------
73289848Sjkim# Opteron		+13%		|+5%		+20%
74289848Sjkim# Bulldozer		-0%		|-1%		+10%
75289848Sjkim# P4			+11%		|+7%		+8%
76289848Sjkim# Westmere		+5%		|+14%		+17%
77289848Sjkim# Sandy Bridge		+2%		|+12%		+29%
78289848Sjkim# Ivy Bridge		+1%		|+11%		+35%
79289848Sjkim# Haswell(**)		-0%		|+12%		+39%
80289848Sjkim# Atom			+13%		|+11%		+4%
81289848Sjkim# VIA Nano		+70%		|+9%		+25%
82289848Sjkim#
83289848Sjkim# (*)	rsax engine and fips numbers are presented for reference
84289848Sjkim#	purposes;
85289848Sjkim# (**)	MULX was attempted, but found to give only marginal improvement;
86289848Sjkim
87289848Sjkim$flavour = shift;
88289848Sjkim$output  = shift;
89289848Sjkimif ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
90289848Sjkim
91289848Sjkim$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
92289848Sjkim
93289848Sjkim$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
94289848Sjkim( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
95289848Sjkim( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
96289848Sjkimdie "can't locate x86_64-xlate.pl";
97289848Sjkim
98289848Sjkimopen OUT,"| \"$^X\" $xlate $flavour $output";
99289848Sjkim*STDOUT=*OUT;
100289848Sjkim
101289848Sjkimif (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
102289848Sjkim		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
103289848Sjkim	$addx = ($1>=2.23);
104289848Sjkim}
105289848Sjkim
106289848Sjkimif (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
107289848Sjkim	    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
108289848Sjkim	$addx = ($1>=2.10);
109289848Sjkim}
110289848Sjkim
111289848Sjkimif (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
112289848Sjkim	    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
113289848Sjkim	$addx = ($1>=12);
114289848Sjkim}
115289848Sjkim
116295009Sjkimif (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9])\.([0-9]+)/) {
117289848Sjkim	my $ver = $2 + $3/100.0;	# 3.1->3.01, 3.10->3.10
118289848Sjkim	$addx = ($ver>=3.03);
119289848Sjkim}
120289848Sjkim
121289848Sjkim($out, $inp, $mod) = ("%rdi", "%rsi", "%rbp");	# common internal API
122289848Sjkim{
123289848Sjkimmy ($out,$inp,$mod,$n0,$times) = ("%rdi","%rsi","%rdx","%rcx","%r8d");
124289848Sjkim
125289848Sjkim$code.=<<___;
126289848Sjkim.text
127289848Sjkim
128289848Sjkim.extern	OPENSSL_ia32cap_P
129289848Sjkim
130289848Sjkim.globl	rsaz_512_sqr
131289848Sjkim.type	rsaz_512_sqr,\@function,5
132289848Sjkim.align	32
133289848Sjkimrsaz_512_sqr:				# 25-29% faster than rsaz_512_mul
134289848Sjkim	push	%rbx
135289848Sjkim	push	%rbp
136289848Sjkim	push	%r12
137289848Sjkim	push	%r13
138289848Sjkim	push	%r14
139289848Sjkim	push	%r15
140289848Sjkim
141289848Sjkim	subq	\$128+24, %rsp
142289848Sjkim.Lsqr_body:
143356290Sjkim	movq	$mod, %xmm1		# common off-load
144289848Sjkim	movq	($inp), %rdx
145289848Sjkim	movq	8($inp), %rax
146289848Sjkim	movq	$n0, 128(%rsp)
147289848Sjkim___
148289848Sjkim$code.=<<___ if ($addx);
149289848Sjkim	movl	\$0x80100,%r11d
150289848Sjkim	andl	OPENSSL_ia32cap_P+8(%rip),%r11d
151289848Sjkim	cmpl	\$0x80100,%r11d		# check for MULX and ADO/CX
152289848Sjkim	je	.Loop_sqrx
153289848Sjkim___
154289848Sjkim$code.=<<___;
155289848Sjkim	jmp	.Loop_sqr
156289848Sjkim
157289848Sjkim.align	32
158289848Sjkim.Loop_sqr:
159289848Sjkim	movl	$times,128+8(%rsp)
160289848Sjkim#first iteration
161356290Sjkim	movq	%rdx, %rbx		# 0($inp)
162356290Sjkim	mov	%rax, %rbp		# 8($inp)
163289848Sjkim	mulq	%rdx
164289848Sjkim	movq	%rax, %r8
165289848Sjkim	movq	16($inp), %rax
166289848Sjkim	movq	%rdx, %r9
167289848Sjkim
168289848Sjkim	mulq	%rbx
169289848Sjkim	addq	%rax, %r9
170289848Sjkim	movq	24($inp), %rax
171289848Sjkim	movq	%rdx, %r10
172289848Sjkim	adcq	\$0, %r10
173289848Sjkim
174289848Sjkim	mulq	%rbx
175289848Sjkim	addq	%rax, %r10
176289848Sjkim	movq	32($inp), %rax
177289848Sjkim	movq	%rdx, %r11
178289848Sjkim	adcq	\$0, %r11
179289848Sjkim
180289848Sjkim	mulq	%rbx
181289848Sjkim	addq	%rax, %r11
182289848Sjkim	movq	40($inp), %rax
183289848Sjkim	movq	%rdx, %r12
184289848Sjkim	adcq	\$0, %r12
185289848Sjkim
186289848Sjkim	mulq	%rbx
187289848Sjkim	addq	%rax, %r12
188289848Sjkim	movq	48($inp), %rax
189289848Sjkim	movq	%rdx, %r13
190289848Sjkim	adcq	\$0, %r13
191289848Sjkim
192289848Sjkim	mulq	%rbx
193289848Sjkim	addq	%rax, %r13
194289848Sjkim	movq	56($inp), %rax
195289848Sjkim	movq	%rdx, %r14
196289848Sjkim	adcq	\$0, %r14
197289848Sjkim
198289848Sjkim	mulq	%rbx
199289848Sjkim	addq	%rax, %r14
200289848Sjkim	movq	%rbx, %rax
201356290Sjkim	adcq	\$0, %rdx
202289848Sjkim
203356290Sjkim	xorq	%rcx,%rcx		# rcx:r8 = r8 << 1
204356290Sjkim	addq	%r8, %r8
205356290Sjkim	 movq	%rdx, %r15
206356290Sjkim	adcq	\$0, %rcx
207289848Sjkim
208289848Sjkim	mulq	%rax
209356290Sjkim	addq	%r8, %rdx
210356290Sjkim	adcq	\$0, %rcx
211356290Sjkim
212289848Sjkim	movq	%rax, (%rsp)
213356290Sjkim	movq	%rdx, 8(%rsp)
214289848Sjkim
215289848Sjkim#second iteration
216289848Sjkim	movq	16($inp), %rax
217356290Sjkim	mulq	%rbp
218289848Sjkim	addq	%rax, %r10
219289848Sjkim	movq	24($inp), %rax
220289848Sjkim	movq	%rdx, %rbx
221289848Sjkim	adcq	\$0, %rbx
222289848Sjkim
223356290Sjkim	mulq	%rbp
224289848Sjkim	addq	%rax, %r11
225289848Sjkim	movq	32($inp), %rax
226289848Sjkim	adcq	\$0, %rdx
227289848Sjkim	addq	%rbx, %r11
228289848Sjkim	movq	%rdx, %rbx
229289848Sjkim	adcq	\$0, %rbx
230289848Sjkim
231356290Sjkim	mulq	%rbp
232289848Sjkim	addq	%rax, %r12
233289848Sjkim	movq	40($inp), %rax
234289848Sjkim	adcq	\$0, %rdx
235289848Sjkim	addq	%rbx, %r12
236289848Sjkim	movq	%rdx, %rbx
237289848Sjkim	adcq	\$0, %rbx
238289848Sjkim
239356290Sjkim	mulq	%rbp
240289848Sjkim	addq	%rax, %r13
241289848Sjkim	movq	48($inp), %rax
242289848Sjkim	adcq	\$0, %rdx
243289848Sjkim	addq	%rbx, %r13
244289848Sjkim	movq	%rdx, %rbx
245289848Sjkim	adcq	\$0, %rbx
246289848Sjkim
247356290Sjkim	mulq	%rbp
248289848Sjkim	addq	%rax, %r14
249289848Sjkim	movq	56($inp), %rax
250289848Sjkim	adcq	\$0, %rdx
251289848Sjkim	addq	%rbx, %r14
252289848Sjkim	movq	%rdx, %rbx
253289848Sjkim	adcq	\$0, %rbx
254289848Sjkim
255356290Sjkim	mulq	%rbp
256289848Sjkim	addq	%rax, %r15
257356290Sjkim	movq	%rbp, %rax
258289848Sjkim	adcq	\$0, %rdx
259289848Sjkim	addq	%rbx, %r15
260356290Sjkim	adcq	\$0, %rdx
261289848Sjkim
262356290Sjkim	xorq	%rbx, %rbx		# rbx:r10:r9 = r10:r9 << 1
263356290Sjkim	addq	%r9, %r9
264356290Sjkim	 movq	%rdx, %r8
265356290Sjkim	adcq	%r10, %r10
266356290Sjkim	adcq	\$0, %rbx
267289848Sjkim
268289848Sjkim	mulq	%rax
269356290Sjkim	# rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
270356290Sjkim	addq	%rcx, %rax
271356290Sjkim	 movq	16($inp), %rbp
272289848Sjkim	addq	%rax, %r9
273356290Sjkim	 movq	24($inp), %rax
274289848Sjkim	adcq	%rdx, %r10
275356290Sjkim	adcq	\$0, %rbx
276289848Sjkim
277289848Sjkim	movq	%r9, 16(%rsp)
278289848Sjkim	movq	%r10, 24(%rsp)
279356290Sjkim
280289848Sjkim#third iteration
281356290Sjkim	mulq	%rbp
282289848Sjkim	addq	%rax, %r12
283289848Sjkim	movq	32($inp), %rax
284289848Sjkim	movq	%rdx, %rcx
285289848Sjkim	adcq	\$0, %rcx
286289848Sjkim
287356290Sjkim	mulq	%rbp
288289848Sjkim	addq	%rax, %r13
289289848Sjkim	movq	40($inp), %rax
290289848Sjkim	adcq	\$0, %rdx
291289848Sjkim	addq	%rcx, %r13
292289848Sjkim	movq	%rdx, %rcx
293289848Sjkim	adcq	\$0, %rcx
294289848Sjkim
295356290Sjkim	mulq	%rbp
296289848Sjkim	addq	%rax, %r14
297289848Sjkim	movq	48($inp), %rax
298289848Sjkim	adcq	\$0, %rdx
299289848Sjkim	addq	%rcx, %r14
300289848Sjkim	movq	%rdx, %rcx
301289848Sjkim	adcq	\$0, %rcx
302289848Sjkim
303356290Sjkim	mulq	%rbp
304289848Sjkim	addq	%rax, %r15
305289848Sjkim	movq	56($inp), %rax
306289848Sjkim	adcq	\$0, %rdx
307289848Sjkim	addq	%rcx, %r15
308289848Sjkim	movq	%rdx, %rcx
309289848Sjkim	adcq	\$0, %rcx
310289848Sjkim
311356290Sjkim	mulq	%rbp
312289848Sjkim	addq	%rax, %r8
313356290Sjkim	movq	%rbp, %rax
314289848Sjkim	adcq	\$0, %rdx
315289848Sjkim	addq	%rcx, %r8
316356290Sjkim	adcq	\$0, %rdx
317289848Sjkim
318356290Sjkim	xorq	%rcx, %rcx		# rcx:r12:r11 = r12:r11 << 1
319356290Sjkim	addq	%r11, %r11
320356290Sjkim	 movq	%rdx, %r9
321356290Sjkim	adcq	%r12, %r12
322356290Sjkim	adcq	\$0, %rcx
323289848Sjkim
324289848Sjkim	mulq	%rax
325356290Sjkim	# rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
326356290Sjkim	addq	%rbx, %rax
327356290Sjkim	 movq	24($inp), %r10
328289848Sjkim	addq	%rax, %r11
329356290Sjkim	 movq	32($inp), %rax
330289848Sjkim	adcq	%rdx, %r12
331356290Sjkim	adcq	\$0, %rcx
332289848Sjkim
333289848Sjkim	movq	%r11, 32(%rsp)
334289848Sjkim	movq	%r12, 40(%rsp)
335289848Sjkim
336289848Sjkim#fourth iteration
337356290Sjkim	mov	%rax, %r11		# 32($inp)
338289848Sjkim	mulq	%r10
339289848Sjkim	addq	%rax, %r14
340289848Sjkim	movq	40($inp), %rax
341289848Sjkim	movq	%rdx, %rbx
342289848Sjkim	adcq	\$0, %rbx
343289848Sjkim
344356290Sjkim	mov	%rax, %r12		# 40($inp)
345289848Sjkim	mulq	%r10
346289848Sjkim	addq	%rax, %r15
347289848Sjkim	movq	48($inp), %rax
348289848Sjkim	adcq	\$0, %rdx
349289848Sjkim	addq	%rbx, %r15
350289848Sjkim	movq	%rdx, %rbx
351289848Sjkim	adcq	\$0, %rbx
352289848Sjkim
353356290Sjkim	mov	%rax, %rbp		# 48($inp)
354289848Sjkim	mulq	%r10
355289848Sjkim	addq	%rax, %r8
356289848Sjkim	movq	56($inp), %rax
357289848Sjkim	adcq	\$0, %rdx
358289848Sjkim	addq	%rbx, %r8
359289848Sjkim	movq	%rdx, %rbx
360289848Sjkim	adcq	\$0, %rbx
361289848Sjkim
362289848Sjkim	mulq	%r10
363289848Sjkim	addq	%rax, %r9
364289848Sjkim	movq	%r10, %rax
365289848Sjkim	adcq	\$0, %rdx
366289848Sjkim	addq	%rbx, %r9
367356290Sjkim	adcq	\$0, %rdx
368289848Sjkim
369356290Sjkim	xorq	%rbx, %rbx		# rbx:r13:r14 = r13:r14 << 1
370356290Sjkim	addq	%r13, %r13
371356290Sjkim	 movq	%rdx, %r10
372356290Sjkim	adcq	%r14, %r14
373356290Sjkim	adcq	\$0, %rbx
374289848Sjkim
375289848Sjkim	mulq	%rax
376356290Sjkim	# rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
377356290Sjkim	addq	%rcx, %rax
378289848Sjkim	addq	%rax, %r13
379356290Sjkim	 movq	%r12, %rax		# 40($inp)
380289848Sjkim	adcq	%rdx, %r14
381356290Sjkim	adcq	\$0, %rbx
382289848Sjkim
383289848Sjkim	movq	%r13, 48(%rsp)
384289848Sjkim	movq	%r14, 56(%rsp)
385289848Sjkim
386289848Sjkim#fifth iteration
387289848Sjkim	mulq	%r11
388289848Sjkim	addq	%rax, %r8
389356290Sjkim	movq	%rbp, %rax		# 48($inp)
390289848Sjkim	movq	%rdx, %rcx
391289848Sjkim	adcq	\$0, %rcx
392289848Sjkim
393289848Sjkim	mulq	%r11
394289848Sjkim	addq	%rax, %r9
395289848Sjkim	movq	56($inp), %rax
396289848Sjkim	adcq	\$0, %rdx
397289848Sjkim	addq	%rcx, %r9
398289848Sjkim	movq	%rdx, %rcx
399289848Sjkim	adcq	\$0, %rcx
400289848Sjkim
401356290Sjkim	mov	%rax, %r14		# 56($inp)
402289848Sjkim	mulq	%r11
403289848Sjkim	addq	%rax, %r10
404289848Sjkim	movq	%r11, %rax
405289848Sjkim	adcq	\$0, %rdx
406289848Sjkim	addq	%rcx, %r10
407356290Sjkim	adcq	\$0, %rdx
408289848Sjkim
409356290Sjkim	xorq	%rcx, %rcx		# rcx:r8:r15 = r8:r15 << 1
410356290Sjkim	addq	%r15, %r15
411356290Sjkim	 movq	%rdx, %r11
412356290Sjkim	adcq	%r8, %r8
413356290Sjkim	adcq	\$0, %rcx
414289848Sjkim
415289848Sjkim	mulq	%rax
416356290Sjkim	# rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
417356290Sjkim	addq	%rbx, %rax
418289848Sjkim	addq	%rax, %r15
419356290Sjkim	 movq	%rbp, %rax		# 48($inp)
420289848Sjkim	adcq	%rdx, %r8
421356290Sjkim	adcq	\$0, %rcx
422289848Sjkim
423289848Sjkim	movq	%r15, 64(%rsp)
424289848Sjkim	movq	%r8, 72(%rsp)
425289848Sjkim
426289848Sjkim#sixth iteration
427289848Sjkim	mulq	%r12
428289848Sjkim	addq	%rax, %r10
429356290Sjkim	movq	%r14, %rax		# 56($inp)
430289848Sjkim	movq	%rdx, %rbx
431289848Sjkim	adcq	\$0, %rbx
432289848Sjkim
433289848Sjkim	mulq	%r12
434289848Sjkim	addq	%rax, %r11
435289848Sjkim	movq	%r12, %rax
436289848Sjkim	adcq	\$0, %rdx
437289848Sjkim	addq	%rbx, %r11
438356290Sjkim	adcq	\$0, %rdx
439289848Sjkim
440356290Sjkim	xorq	%rbx, %rbx		# rbx:r10:r9 = r10:r9 << 1
441356290Sjkim	addq	%r9, %r9
442356290Sjkim	 movq	%rdx, %r12
443356290Sjkim	adcq	%r10, %r10
444356290Sjkim	adcq	\$0, %rbx
445289848Sjkim
446289848Sjkim	mulq	%rax
447356290Sjkim	# rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
448356290Sjkim	addq	%rcx, %rax
449289848Sjkim	addq	%rax, %r9
450356290Sjkim	 movq	%r14, %rax		# 56($inp)
451289848Sjkim	adcq	%rdx, %r10
452356290Sjkim	adcq	\$0, %rbx
453289848Sjkim
454289848Sjkim	movq	%r9, 80(%rsp)
455289848Sjkim	movq	%r10, 88(%rsp)
456289848Sjkim
457289848Sjkim#seventh iteration
458356290Sjkim	mulq	%rbp
459289848Sjkim	addq	%rax, %r12
460356290Sjkim	movq	%rbp, %rax
461356290Sjkim	adcq	\$0, %rdx
462289848Sjkim
463356290Sjkim	xorq	%rcx, %rcx		# rcx:r12:r11 = r12:r11 << 1
464356290Sjkim	addq	%r11, %r11
465356290Sjkim	 movq	%rdx, %r13
466356290Sjkim	adcq	%r12, %r12
467356290Sjkim	adcq	\$0, %rcx
468289848Sjkim
469289848Sjkim	mulq	%rax
470356290Sjkim	# rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
471356290Sjkim	addq	%rbx, %rax
472289848Sjkim	addq	%rax, %r11
473356290Sjkim	 movq	%r14, %rax		# 56($inp)
474289848Sjkim	adcq	%rdx, %r12
475356290Sjkim	adcq	\$0, %rcx
476289848Sjkim
477289848Sjkim	movq	%r11, 96(%rsp)
478289848Sjkim	movq	%r12, 104(%rsp)
479289848Sjkim
480289848Sjkim#eighth iteration
481356290Sjkim	xorq	%rbx, %rbx		# rbx:r13 = r13 << 1
482356290Sjkim	addq	%r13, %r13
483356290Sjkim	adcq	\$0, %rbx
484356290Sjkim
485289848Sjkim	mulq	%rax
486356290Sjkim	# rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
487356290Sjkim	addq	%rcx, %rax
488356290Sjkim	addq	%r13, %rax
489356290Sjkim	adcq	%rbx, %rdx
490289848Sjkim
491289848Sjkim	movq	(%rsp), %r8
492289848Sjkim	movq	8(%rsp), %r9
493289848Sjkim	movq	16(%rsp), %r10
494289848Sjkim	movq	24(%rsp), %r11
495289848Sjkim	movq	32(%rsp), %r12
496289848Sjkim	movq	40(%rsp), %r13
497289848Sjkim	movq	48(%rsp), %r14
498289848Sjkim	movq	56(%rsp), %r15
499356290Sjkim	movq	%xmm1, %rbp
500289848Sjkim
501356290Sjkim	movq	%rax, 112(%rsp)
502356290Sjkim	movq	%rdx, 120(%rsp)
503356290Sjkim
504289848Sjkim	call	__rsaz_512_reduce
505289848Sjkim
506289848Sjkim	addq	64(%rsp), %r8
507289848Sjkim	adcq	72(%rsp), %r9
508289848Sjkim	adcq	80(%rsp), %r10
509289848Sjkim	adcq	88(%rsp), %r11
510289848Sjkim	adcq	96(%rsp), %r12
511289848Sjkim	adcq	104(%rsp), %r13
512289848Sjkim	adcq	112(%rsp), %r14
513289848Sjkim	adcq	120(%rsp), %r15
514289848Sjkim	sbbq	%rcx, %rcx
515289848Sjkim
516289848Sjkim	call	__rsaz_512_subtract
517289848Sjkim
518289848Sjkim	movq	%r8, %rdx
519289848Sjkim	movq	%r9, %rax
520289848Sjkim	movl	128+8(%rsp), $times
521289848Sjkim	movq	$out, $inp
522289848Sjkim
523289848Sjkim	decl	$times
524289848Sjkim	jnz	.Loop_sqr
525289848Sjkim___
526289848Sjkimif ($addx) {
527289848Sjkim$code.=<<___;
528289848Sjkim	jmp	.Lsqr_tail
529289848Sjkim
530289848Sjkim.align	32
531289848Sjkim.Loop_sqrx:
532289848Sjkim	movl	$times,128+8(%rsp)
533289848Sjkim	movq	$out, %xmm0		# off-load
534356290Sjkim#first iteration
535289848Sjkim	mulx	%rax, %r8, %r9
536356290Sjkim	mov	%rax, %rbx
537289848Sjkim
538289848Sjkim	mulx	16($inp), %rcx, %r10
539289848Sjkim	xor	%rbp, %rbp		# cf=0, of=0
540289848Sjkim
541289848Sjkim	mulx	24($inp), %rax, %r11
542289848Sjkim	adcx	%rcx, %r9
543289848Sjkim
544356290Sjkim	.byte	0xc4,0x62,0xf3,0xf6,0xa6,0x20,0x00,0x00,0x00	# mulx	32($inp), %rcx, %r12
545289848Sjkim	adcx	%rax, %r10
546289848Sjkim
547356290Sjkim	.byte	0xc4,0x62,0xfb,0xf6,0xae,0x28,0x00,0x00,0x00	# mulx	40($inp), %rax, %r13
548289848Sjkim	adcx	%rcx, %r11
549289848Sjkim
550356290Sjkim	mulx	48($inp), %rcx, %r14
551289848Sjkim	adcx	%rax, %r12
552289848Sjkim	adcx	%rcx, %r13
553289848Sjkim
554356290Sjkim	mulx	56($inp), %rax, %r15
555289848Sjkim	adcx	%rax, %r14
556289848Sjkim	adcx	%rbp, %r15		# %rbp is 0
557289848Sjkim
558356290Sjkim	mulx	%rdx, %rax, $out
559356290Sjkim	 mov	%rbx, %rdx		# 8($inp)
560356290Sjkim	xor	%rcx, %rcx
561356290Sjkim	adox	%r8, %r8
562356290Sjkim	adcx	$out, %r8
563356290Sjkim	adox	%rbp, %rcx
564356290Sjkim	adcx	%rbp, %rcx
565289848Sjkim
566289848Sjkim	mov	%rax, (%rsp)
567289848Sjkim	mov	%r8, 8(%rsp)
568289848Sjkim
569356290Sjkim#second iteration
570356290Sjkim	.byte	0xc4,0xe2,0xfb,0xf6,0x9e,0x10,0x00,0x00,0x00	# mulx	16($inp), %rax, %rbx
571289848Sjkim	adox	%rax, %r10
572289848Sjkim	adcx	%rbx, %r11
573289848Sjkim
574356290Sjkim	mulx	24($inp), $out, %r8
575289848Sjkim	adox	$out, %r11
576356290Sjkim	.byte	0x66
577289848Sjkim	adcx	%r8, %r12
578289848Sjkim
579289848Sjkim	mulx	32($inp), %rax, %rbx
580289848Sjkim	adox	%rax, %r12
581289848Sjkim	adcx	%rbx, %r13
582289848Sjkim
583289848Sjkim	mulx	40($inp), $out, %r8
584289848Sjkim	adox	$out, %r13
585289848Sjkim	adcx	%r8, %r14
586289848Sjkim
587289848Sjkim	.byte	0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00	# mulx	48($inp), %rax, %rbx
588289848Sjkim	adox	%rax, %r14
589289848Sjkim	adcx	%rbx, %r15
590289848Sjkim
591289848Sjkim	.byte	0xc4,0x62,0xc3,0xf6,0x86,0x38,0x00,0x00,0x00	# mulx	56($inp), $out, %r8
592289848Sjkim	adox	$out, %r15
593289848Sjkim	adcx	%rbp, %r8
594356290Sjkim	 mulx	%rdx, %rax, $out
595289848Sjkim	adox	%rbp, %r8
596356290Sjkim	 .byte	0x48,0x8b,0x96,0x10,0x00,0x00,0x00		# mov	16($inp), %rdx
597289848Sjkim
598356290Sjkim	xor	%rbx, %rbx
599356290Sjkim	 adox	%r9, %r9
600356290Sjkim	# rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
601356290Sjkim	adcx	%rcx, %rax
602356290Sjkim	adox	%r10, %r10
603289848Sjkim	adcx	%rax, %r9
604356290Sjkim	adox	%rbp, %rbx
605356290Sjkim	adcx	$out, %r10
606356290Sjkim	adcx	%rbp, %rbx
607289848Sjkim
608289848Sjkim	mov	%r9, 16(%rsp)
609289848Sjkim	.byte	0x4c,0x89,0x94,0x24,0x18,0x00,0x00,0x00		# mov	%r10, 24(%rsp)
610356290Sjkim
611356290Sjkim#third iteration
612356290Sjkim	mulx	24($inp), $out, %r9
613289848Sjkim	adox	$out, %r12
614289848Sjkim	adcx	%r9, %r13
615289848Sjkim
616289848Sjkim	mulx	32($inp), %rax, %rcx
617289848Sjkim	adox	%rax, %r13
618289848Sjkim	adcx	%rcx, %r14
619289848Sjkim
620356290Sjkim	.byte	0xc4,0x62,0xc3,0xf6,0x8e,0x28,0x00,0x00,0x00	# mulx	40($inp), $out, %r9
621289848Sjkim	adox	$out, %r14
622289848Sjkim	adcx	%r9, %r15
623289848Sjkim
624289848Sjkim	.byte	0xc4,0xe2,0xfb,0xf6,0x8e,0x30,0x00,0x00,0x00	# mulx	48($inp), %rax, %rcx
625289848Sjkim	adox	%rax, %r15
626289848Sjkim	adcx	%rcx, %r8
627289848Sjkim
628356290Sjkim	mulx	56($inp), $out, %r9
629289848Sjkim	adox	$out, %r8
630289848Sjkim	adcx	%rbp, %r9
631356290Sjkim	 mulx	%rdx, %rax, $out
632289848Sjkim	adox	%rbp, %r9
633356290Sjkim	 mov	24($inp), %rdx
634289848Sjkim
635356290Sjkim	xor	%rcx, %rcx
636356290Sjkim	 adox	%r11, %r11
637356290Sjkim	# rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
638356290Sjkim	adcx	%rbx, %rax
639356290Sjkim	adox	%r12, %r12
640289848Sjkim	adcx	%rax, %r11
641356290Sjkim	adox	%rbp, %rcx
642356290Sjkim	adcx	$out, %r12
643356290Sjkim	adcx	%rbp, %rcx
644289848Sjkim
645289848Sjkim	mov	%r11, 32(%rsp)
646356290Sjkim	mov	%r12, 40(%rsp)
647356290Sjkim
648356290Sjkim#fourth iteration
649356290Sjkim	mulx	32($inp), %rax, %rbx
650289848Sjkim	adox	%rax, %r14
651289848Sjkim	adcx	%rbx, %r15
652289848Sjkim
653289848Sjkim	mulx	40($inp), $out, %r10
654289848Sjkim	adox	$out, %r15
655289848Sjkim	adcx	%r10, %r8
656289848Sjkim
657289848Sjkim	mulx	48($inp), %rax, %rbx
658289848Sjkim	adox	%rax, %r8
659289848Sjkim	adcx	%rbx, %r9
660289848Sjkim
661289848Sjkim	mulx	56($inp), $out, %r10
662289848Sjkim	adox	$out, %r9
663289848Sjkim	adcx	%rbp, %r10
664356290Sjkim	 mulx	%rdx, %rax, $out
665289848Sjkim	adox	%rbp, %r10
666356290Sjkim	 mov	32($inp), %rdx
667289848Sjkim
668356290Sjkim	xor	%rbx, %rbx
669356290Sjkim	 adox	%r13, %r13
670356290Sjkim	# rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
671356290Sjkim	adcx	%rcx, %rax
672356290Sjkim	adox	%r14, %r14
673289848Sjkim	adcx	%rax, %r13
674356290Sjkim	adox	%rbp, %rbx
675356290Sjkim	adcx	$out, %r14
676356290Sjkim	adcx	%rbp, %rbx
677289848Sjkim
678289848Sjkim	mov	%r13, 48(%rsp)
679289848Sjkim	mov	%r14, 56(%rsp)
680356290Sjkim
681356290Sjkim#fifth iteration
682356290Sjkim	mulx	40($inp), $out, %r11
683289848Sjkim	adox	$out, %r8
684289848Sjkim	adcx	%r11, %r9
685289848Sjkim
686289848Sjkim	mulx	48($inp), %rax, %rcx
687289848Sjkim	adox	%rax, %r9
688289848Sjkim	adcx	%rcx, %r10
689289848Sjkim
690289848Sjkim	mulx	56($inp), $out, %r11
691289848Sjkim	adox	$out, %r10
692289848Sjkim	adcx	%rbp, %r11
693356290Sjkim	 mulx	%rdx, %rax, $out
694356290Sjkim	 mov	40($inp), %rdx
695289848Sjkim	adox	%rbp, %r11
696289848Sjkim
697356290Sjkim	xor	%rcx, %rcx
698356290Sjkim	 adox	%r15, %r15
699356290Sjkim	# rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
700356290Sjkim	adcx	%rbx, %rax
701356290Sjkim	adox	%r8, %r8
702289848Sjkim	adcx	%rax, %r15
703356290Sjkim	adox	%rbp, %rcx
704356290Sjkim	adcx	$out, %r8
705356290Sjkim	adcx	%rbp, %rcx
706289848Sjkim
707289848Sjkim	mov	%r15, 64(%rsp)
708289848Sjkim	mov	%r8, 72(%rsp)
709289848Sjkim
710289848Sjkim#sixth iteration
711289848Sjkim	.byte	0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00	# mulx	48($inp), %rax, %rbx
712289848Sjkim	adox	%rax, %r10
713289848Sjkim	adcx	%rbx, %r11
714289848Sjkim
715289848Sjkim	.byte	0xc4,0x62,0xc3,0xf6,0xa6,0x38,0x00,0x00,0x00	# mulx	56($inp), $out, %r12
716289848Sjkim	adox	$out, %r11
717289848Sjkim	adcx	%rbp, %r12
718356290Sjkim	 mulx	%rdx, %rax, $out
719289848Sjkim	adox	%rbp, %r12
720356290Sjkim	 mov	48($inp), %rdx
721289848Sjkim
722356290Sjkim	xor	%rbx, %rbx
723356290Sjkim	 adox	%r9, %r9
724356290Sjkim	# rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
725356290Sjkim	adcx	%rcx, %rax
726356290Sjkim	adox	%r10, %r10
727289848Sjkim	adcx	%rax, %r9
728356290Sjkim	adcx	$out, %r10
729356290Sjkim	adox	%rbp, %rbx
730356290Sjkim	adcx	%rbp, %rbx
731289848Sjkim
732289848Sjkim	mov	%r9, 80(%rsp)
733289848Sjkim	mov	%r10, 88(%rsp)
734289848Sjkim
735289848Sjkim#seventh iteration
736289848Sjkim	.byte	0xc4,0x62,0xfb,0xf6,0xae,0x38,0x00,0x00,0x00	# mulx	56($inp), %rax, %r13
737289848Sjkim	adox	%rax, %r12
738289848Sjkim	adox	%rbp, %r13
739289848Sjkim
740356290Sjkim	mulx	%rdx, %rax, $out
741356290Sjkim	xor	%rcx, %rcx
742356290Sjkim	 mov	56($inp), %rdx
743356290Sjkim	 adox	%r11, %r11
744356290Sjkim	# rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
745356290Sjkim	adcx	%rbx, %rax
746356290Sjkim	adox	%r12, %r12
747289848Sjkim	adcx	%rax, %r11
748356290Sjkim	adox	%rbp, %rcx
749356290Sjkim	adcx	$out, %r12
750356290Sjkim	adcx	%rbp, %rcx
751289848Sjkim
752289848Sjkim	.byte	0x4c,0x89,0x9c,0x24,0x60,0x00,0x00,0x00		# mov	%r11, 96(%rsp)
753289848Sjkim	.byte	0x4c,0x89,0xa4,0x24,0x68,0x00,0x00,0x00		# mov	%r12, 104(%rsp)
754289848Sjkim
755289848Sjkim#eighth iteration
756289848Sjkim	mulx	%rdx, %rax, %rdx
757356290Sjkim	xor	%rbx, %rbx
758356290Sjkim	 adox	%r13, %r13
759356290Sjkim	# rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
760356290Sjkim	adcx	%rcx, %rax
761356290Sjkim	adox	%rbp, %rbx
762356290Sjkim	adcx	%r13, %rax
763356290Sjkim	adcx	%rdx, %rbx
764289848Sjkim
765289848Sjkim	movq	%xmm0, $out
766289848Sjkim	movq	%xmm1, %rbp
767289848Sjkim
768289848Sjkim	movq	128(%rsp), %rdx		# pull $n0
769289848Sjkim	movq	(%rsp), %r8
770289848Sjkim	movq	8(%rsp), %r9
771289848Sjkim	movq	16(%rsp), %r10
772289848Sjkim	movq	24(%rsp), %r11
773289848Sjkim	movq	32(%rsp), %r12
774289848Sjkim	movq	40(%rsp), %r13
775289848Sjkim	movq	48(%rsp), %r14
776289848Sjkim	movq	56(%rsp), %r15
777289848Sjkim
778356290Sjkim	movq	%rax, 112(%rsp)
779356290Sjkim	movq	%rbx, 120(%rsp)
780356290Sjkim
781289848Sjkim	call	__rsaz_512_reducex
782289848Sjkim
783289848Sjkim	addq	64(%rsp), %r8
784289848Sjkim	adcq	72(%rsp), %r9
785289848Sjkim	adcq	80(%rsp), %r10
786289848Sjkim	adcq	88(%rsp), %r11
787289848Sjkim	adcq	96(%rsp), %r12
788289848Sjkim	adcq	104(%rsp), %r13
789289848Sjkim	adcq	112(%rsp), %r14
790289848Sjkim	adcq	120(%rsp), %r15
791289848Sjkim	sbbq	%rcx, %rcx
792289848Sjkim
793289848Sjkim	call	__rsaz_512_subtract
794289848Sjkim
795289848Sjkim	movq	%r8, %rdx
796289848Sjkim	movq	%r9, %rax
797289848Sjkim	movl	128+8(%rsp), $times
798289848Sjkim	movq	$out, $inp
799289848Sjkim
800289848Sjkim	decl	$times
801289848Sjkim	jnz	.Loop_sqrx
802289848Sjkim
803289848Sjkim.Lsqr_tail:
804289848Sjkim___
805289848Sjkim}
806289848Sjkim$code.=<<___;
807289848Sjkim
808289848Sjkim	leaq	128+24+48(%rsp), %rax
809289848Sjkim	movq	-48(%rax), %r15
810289848Sjkim	movq	-40(%rax), %r14
811289848Sjkim	movq	-32(%rax), %r13
812289848Sjkim	movq	-24(%rax), %r12
813289848Sjkim	movq	-16(%rax), %rbp
814289848Sjkim	movq	-8(%rax), %rbx
815289848Sjkim	leaq	(%rax), %rsp
816289848Sjkim.Lsqr_epilogue:
817289848Sjkim	ret
818289848Sjkim.size	rsaz_512_sqr,.-rsaz_512_sqr
819289848Sjkim___
820289848Sjkim}
821289848Sjkim{
822289848Sjkimmy ($out,$ap,$bp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8");
823289848Sjkim$code.=<<___;
824289848Sjkim.globl	rsaz_512_mul
825289848Sjkim.type	rsaz_512_mul,\@function,5
826289848Sjkim.align	32
827289848Sjkimrsaz_512_mul:
828289848Sjkim	push	%rbx
829289848Sjkim	push	%rbp
830289848Sjkim	push	%r12
831289848Sjkim	push	%r13
832289848Sjkim	push	%r14
833289848Sjkim	push	%r15
834289848Sjkim
835289848Sjkim	subq	\$128+24, %rsp
836289848Sjkim.Lmul_body:
837289848Sjkim	movq	$out, %xmm0		# off-load arguments
838289848Sjkim	movq	$mod, %xmm1
839289848Sjkim	movq	$n0, 128(%rsp)
840289848Sjkim___
841289848Sjkim$code.=<<___ if ($addx);
842289848Sjkim	movl	\$0x80100,%r11d
843289848Sjkim	andl	OPENSSL_ia32cap_P+8(%rip),%r11d
844289848Sjkim	cmpl	\$0x80100,%r11d		# check for MULX and ADO/CX
845289848Sjkim	je	.Lmulx
846289848Sjkim___
847289848Sjkim$code.=<<___;
848289848Sjkim	movq	($bp), %rbx		# pass b[0]
849289848Sjkim	movq	$bp, %rbp		# pass argument
850289848Sjkim	call	__rsaz_512_mul
851289848Sjkim
852289848Sjkim	movq	%xmm0, $out
853289848Sjkim	movq	%xmm1, %rbp
854289848Sjkim
855289848Sjkim	movq	(%rsp), %r8
856289848Sjkim	movq	8(%rsp), %r9
857289848Sjkim	movq	16(%rsp), %r10
858289848Sjkim	movq	24(%rsp), %r11
859289848Sjkim	movq	32(%rsp), %r12
860289848Sjkim	movq	40(%rsp), %r13
861289848Sjkim	movq	48(%rsp), %r14
862289848Sjkim	movq	56(%rsp), %r15
863289848Sjkim
864289848Sjkim	call	__rsaz_512_reduce
865289848Sjkim___
866289848Sjkim$code.=<<___ if ($addx);
867289848Sjkim	jmp	.Lmul_tail
868289848Sjkim
869289848Sjkim.align	32
870289848Sjkim.Lmulx:
871289848Sjkim	movq	$bp, %rbp		# pass argument
872289848Sjkim	movq	($bp), %rdx		# pass b[0]
873289848Sjkim	call	__rsaz_512_mulx
874289848Sjkim
875289848Sjkim	movq	%xmm0, $out
876289848Sjkim	movq	%xmm1, %rbp
877289848Sjkim
878289848Sjkim	movq	128(%rsp), %rdx		# pull $n0
879289848Sjkim	movq	(%rsp), %r8
880289848Sjkim	movq	8(%rsp), %r9
881289848Sjkim	movq	16(%rsp), %r10
882289848Sjkim	movq	24(%rsp), %r11
883289848Sjkim	movq	32(%rsp), %r12
884289848Sjkim	movq	40(%rsp), %r13
885289848Sjkim	movq	48(%rsp), %r14
886289848Sjkim	movq	56(%rsp), %r15
887289848Sjkim
888289848Sjkim	call	__rsaz_512_reducex
889289848Sjkim.Lmul_tail:
890289848Sjkim___
891289848Sjkim$code.=<<___;
892289848Sjkim	addq	64(%rsp), %r8
893289848Sjkim	adcq	72(%rsp), %r9
894289848Sjkim	adcq	80(%rsp), %r10
895289848Sjkim	adcq	88(%rsp), %r11
896289848Sjkim	adcq	96(%rsp), %r12
897289848Sjkim	adcq	104(%rsp), %r13
898289848Sjkim	adcq	112(%rsp), %r14
899289848Sjkim	adcq	120(%rsp), %r15
900289848Sjkim	sbbq	%rcx, %rcx
901289848Sjkim
902289848Sjkim	call	__rsaz_512_subtract
903289848Sjkim
904289848Sjkim	leaq	128+24+48(%rsp), %rax
905289848Sjkim	movq	-48(%rax), %r15
906289848Sjkim	movq	-40(%rax), %r14
907289848Sjkim	movq	-32(%rax), %r13
908289848Sjkim	movq	-24(%rax), %r12
909289848Sjkim	movq	-16(%rax), %rbp
910289848Sjkim	movq	-8(%rax), %rbx
911289848Sjkim	leaq	(%rax), %rsp
912289848Sjkim.Lmul_epilogue:
913289848Sjkim	ret
914289848Sjkim.size	rsaz_512_mul,.-rsaz_512_mul
915289848Sjkim___
916289848Sjkim}
917289848Sjkim{
918289848Sjkimmy ($out,$ap,$bp,$mod,$n0,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
# rsaz_512_mul_gather4(out, ap, bp, mod, n0, pwr): multiply ap[] by the
# pwr-th 512-bit entry gathered from the table at bp, then Montgomery-
# reduce.  Each limb of the selected entry is produced by loading ALL
# table lines and selecting with SSE2 compare/and/or masks, so the load
# addresses do not depend on pwr (the memory access pattern is
# power-independent).  On Win64, xmm6-xmm15 are callee-saved and are
# spilled to extra stack space first.
919289848Sjkim$code.=<<___;
920289848Sjkim.globl	rsaz_512_mul_gather4
921289848Sjkim.type	rsaz_512_mul_gather4,\@function,6
922289848Sjkim.align	32
923289848Sjkimrsaz_512_mul_gather4:
924289848Sjkim	push	%rbx
925289848Sjkim	push	%rbp
926289848Sjkim	push	%r12
927289848Sjkim	push	%r13
928289848Sjkim	push	%r14
929289848Sjkim	push	%r15
930289848Sjkim
931296279Sjkim	subq	\$`128+24+($win64?0xb0:0)`, %rsp
932296279Sjkim___
933296279Sjkim$code.=<<___	if ($win64);
934296279Sjkim	movaps	%xmm6,0xa0(%rsp)
935296279Sjkim	movaps	%xmm7,0xb0(%rsp)
936296279Sjkim	movaps	%xmm8,0xc0(%rsp)
937296279Sjkim	movaps	%xmm9,0xd0(%rsp)
938296279Sjkim	movaps	%xmm10,0xe0(%rsp)
939296279Sjkim	movaps	%xmm11,0xf0(%rsp)
940296279Sjkim	movaps	%xmm12,0x100(%rsp)
941296279Sjkim	movaps	%xmm13,0x110(%rsp)
942296279Sjkim	movaps	%xmm14,0x120(%rsp)
943296279Sjkim	movaps	%xmm15,0x130(%rsp)
944296279Sjkim___
945296279Sjkim$code.=<<___;
946289848Sjkim.Lmul_gather4_body:
947296279Sjkim	movd	$pwr,%xmm8
948296279Sjkim	movdqa	.Linc+16(%rip),%xmm1	# 00000002000000020000000200000002
949296279Sjkim	movdqa	.Linc(%rip),%xmm0	# 00000001000000010000000000000000
950296279Sjkim
951296279Sjkim	pshufd	\$0,%xmm8,%xmm8		# broadcast $power
952296279Sjkim	movdqa	%xmm1,%xmm7
953296279Sjkim	movdqa	%xmm1,%xmm2
954289848Sjkim___
955296279Sjkim########################################################################
956296279Sjkim# calculate mask by comparing 0..15 to $power
957296279Sjkim#
958296279Sjkimfor($i=0;$i<4;$i++) {
959296279Sjkim$code.=<<___;
960296279Sjkim	paddd	%xmm`$i`,%xmm`$i+1`
961296279Sjkim	pcmpeqd	%xmm8,%xmm`$i`
962296279Sjkim	movdqa	%xmm7,%xmm`$i+3`
963296279Sjkim___
964296279Sjkim}
965296279Sjkimfor(;$i<7;$i++) {
966296279Sjkim$code.=<<___;
967296279Sjkim	paddd	%xmm`$i`,%xmm`$i+1`
968296279Sjkim	pcmpeqd	%xmm8,%xmm`$i`
969296279Sjkim___
970296279Sjkim}
971296279Sjkim$code.=<<___;
972296279Sjkim	pcmpeqd	%xmm8,%xmm7
973296279Sjkim
	# gather b[0]: AND every table line with its lane mask, OR together
974296279Sjkim	movdqa	16*0($bp),%xmm8
975296279Sjkim	movdqa	16*1($bp),%xmm9
976296279Sjkim	movdqa	16*2($bp),%xmm10
977296279Sjkim	movdqa	16*3($bp),%xmm11
978296279Sjkim	pand	%xmm0,%xmm8
979296279Sjkim	movdqa	16*4($bp),%xmm12
980296279Sjkim	pand	%xmm1,%xmm9
981296279Sjkim	movdqa	16*5($bp),%xmm13
982296279Sjkim	pand	%xmm2,%xmm10
983296279Sjkim	movdqa	16*6($bp),%xmm14
984296279Sjkim	pand	%xmm3,%xmm11
985296279Sjkim	movdqa	16*7($bp),%xmm15
986296279Sjkim	leaq	128($bp), %rbp
987296279Sjkim	pand	%xmm4,%xmm12
988296279Sjkim	pand	%xmm5,%xmm13
989296279Sjkim	pand	%xmm6,%xmm14
990296279Sjkim	pand	%xmm7,%xmm15
991296279Sjkim	por	%xmm10,%xmm8
992296279Sjkim	por	%xmm11,%xmm9
993296279Sjkim	por	%xmm12,%xmm8
994296279Sjkim	por	%xmm13,%xmm9
995296279Sjkim	por	%xmm14,%xmm8
996296279Sjkim	por	%xmm15,%xmm9
997296279Sjkim
998296279Sjkim	por	%xmm9,%xmm8
999296279Sjkim	pshufd	\$0x4e,%xmm8,%xmm9	# swap qword halves
1000296279Sjkim	por	%xmm9,%xmm8
1001296279Sjkim___
1002289848Sjkim$code.=<<___ if ($addx);
1003289848Sjkim	movl	\$0x80100,%r11d
1004289848Sjkim	andl	OPENSSL_ia32cap_P+8(%rip),%r11d
1005289848Sjkim	cmpl	\$0x80100,%r11d		# check for MULX and ADO/CX
1006289848Sjkim	je	.Lmulx_gather
1007289848Sjkim___
1008289848Sjkim$code.=<<___;
1009296279Sjkim	movq	%xmm8,%rbx		# gathered b[0]
1010289848Sjkim
1011296279Sjkim	movq	$n0, 128(%rsp)		# off-load arguments
1012296279Sjkim	movq	$out, 128+8(%rsp)
1013296279Sjkim	movq	$mod, 128+16(%rsp)
1014296279Sjkim
1015289848Sjkim	movq	($ap), %rax
1016289848Sjkim	 movq	8($ap), %rcx
1017289848Sjkim	mulq	%rbx			# 0 iteration
1018289848Sjkim	movq	%rax, (%rsp)
1019289848Sjkim	movq	%rcx, %rax
1020289848Sjkim	movq	%rdx, %r8
1021289848Sjkim
1022289848Sjkim	mulq	%rbx
1023289848Sjkim	addq	%rax, %r8
1024289848Sjkim	movq	16($ap), %rax
1025289848Sjkim	movq	%rdx, %r9
1026289848Sjkim	adcq	\$0, %r9
1027289848Sjkim
1028289848Sjkim	mulq	%rbx
1029289848Sjkim	addq	%rax, %r9
1030289848Sjkim	movq	24($ap), %rax
1031289848Sjkim	movq	%rdx, %r10
1032289848Sjkim	adcq	\$0, %r10
1033289848Sjkim
1034289848Sjkim	mulq	%rbx
1035289848Sjkim	addq	%rax, %r10
1036289848Sjkim	movq	32($ap), %rax
1037289848Sjkim	movq	%rdx, %r11
1038289848Sjkim	adcq	\$0, %r11
1039289848Sjkim
1040289848Sjkim	mulq	%rbx
1041289848Sjkim	addq	%rax, %r11
1042289848Sjkim	movq	40($ap), %rax
1043289848Sjkim	movq	%rdx, %r12
1044289848Sjkim	adcq	\$0, %r12
1045289848Sjkim
1046289848Sjkim	mulq	%rbx
1047289848Sjkim	addq	%rax, %r12
1048289848Sjkim	movq	48($ap), %rax
1049289848Sjkim	movq	%rdx, %r13
1050289848Sjkim	adcq	\$0, %r13
1051289848Sjkim
1052289848Sjkim	mulq	%rbx
1053289848Sjkim	addq	%rax, %r13
1054289848Sjkim	movq	56($ap), %rax
1055289848Sjkim	movq	%rdx, %r14
1056289848Sjkim	adcq	\$0, %r14
1057289848Sjkim
1058289848Sjkim	mulq	%rbx
1059289848Sjkim	addq	%rax, %r14
1060289848Sjkim	 movq	($ap), %rax
1061289848Sjkim	movq	%rdx, %r15
1062289848Sjkim	adcq	\$0, %r15
1063289848Sjkim
1064289848Sjkim	leaq	8(%rsp), %rdi		# tmp output pointer
1065289848Sjkim	movl	\$7, %ecx		# 7 remaining b limbs
1066289848Sjkim	jmp	.Loop_mul_gather
1067289848Sjkim
1068289848Sjkim.align	32
1069289848Sjkim.Loop_mul_gather:
	# gather next b[i] (same mask-select pattern as above)
1070296279Sjkim	movdqa	16*0(%rbp),%xmm8
1071296279Sjkim	movdqa	16*1(%rbp),%xmm9
1072296279Sjkim	movdqa	16*2(%rbp),%xmm10
1073296279Sjkim	movdqa	16*3(%rbp),%xmm11
1074296279Sjkim	pand	%xmm0,%xmm8
1075296279Sjkim	movdqa	16*4(%rbp),%xmm12
1076296279Sjkim	pand	%xmm1,%xmm9
1077296279Sjkim	movdqa	16*5(%rbp),%xmm13
1078296279Sjkim	pand	%xmm2,%xmm10
1079296279Sjkim	movdqa	16*6(%rbp),%xmm14
1080296279Sjkim	pand	%xmm3,%xmm11
1081296279Sjkim	movdqa	16*7(%rbp),%xmm15
1082296279Sjkim	leaq	128(%rbp), %rbp
1083296279Sjkim	pand	%xmm4,%xmm12
1084296279Sjkim	pand	%xmm5,%xmm13
1085296279Sjkim	pand	%xmm6,%xmm14
1086296279Sjkim	pand	%xmm7,%xmm15
1087296279Sjkim	por	%xmm10,%xmm8
1088296279Sjkim	por	%xmm11,%xmm9
1089296279Sjkim	por	%xmm12,%xmm8
1090296279Sjkim	por	%xmm13,%xmm9
1091296279Sjkim	por	%xmm14,%xmm8
1092296279Sjkim	por	%xmm15,%xmm9
1093296279Sjkim
1094296279Sjkim	por	%xmm9,%xmm8
1095296279Sjkim	pshufd	\$0x4e,%xmm8,%xmm9
1096296279Sjkim	por	%xmm9,%xmm8
1097296279Sjkim	movq	%xmm8,%rbx
1098296279Sjkim
1099289848Sjkim	mulq	%rbx
1100289848Sjkim	addq	%rax, %r8
1101289848Sjkim	movq	8($ap), %rax
1102289848Sjkim	movq	%r8, (%rdi)
1103289848Sjkim	movq	%rdx, %r8
1104289848Sjkim	adcq	\$0, %r8
1105289848Sjkim
1106289848Sjkim	mulq	%rbx
1107289848Sjkim	addq	%rax, %r9
1108289848Sjkim	movq	16($ap), %rax
1109289848Sjkim	adcq	\$0, %rdx
1110289848Sjkim	addq	%r9, %r8
1111289848Sjkim	movq	%rdx, %r9
1112289848Sjkim	adcq	\$0, %r9
1113289848Sjkim
1114289848Sjkim	mulq	%rbx
1115289848Sjkim	addq	%rax, %r10
1116289848Sjkim	movq	24($ap), %rax
1117289848Sjkim	adcq	\$0, %rdx
1118289848Sjkim	addq	%r10, %r9
1119289848Sjkim	movq	%rdx, %r10
1120289848Sjkim	adcq	\$0, %r10
1121289848Sjkim
1122289848Sjkim	mulq	%rbx
1123289848Sjkim	addq	%rax, %r11
1124289848Sjkim	movq	32($ap), %rax
1125289848Sjkim	adcq	\$0, %rdx
1126289848Sjkim	addq	%r11, %r10
1127289848Sjkim	movq	%rdx, %r11
1128289848Sjkim	adcq	\$0, %r11
1129289848Sjkim
1130289848Sjkim	mulq	%rbx
1131289848Sjkim	addq	%rax, %r12
1132289848Sjkim	movq	40($ap), %rax
1133289848Sjkim	adcq	\$0, %rdx
1134289848Sjkim	addq	%r12, %r11
1135289848Sjkim	movq	%rdx, %r12
1136289848Sjkim	adcq	\$0, %r12
1137289848Sjkim
1138289848Sjkim	mulq	%rbx
1139289848Sjkim	addq	%rax, %r13
1140289848Sjkim	movq	48($ap), %rax
1141289848Sjkim	adcq	\$0, %rdx
1142289848Sjkim	addq	%r13, %r12
1143289848Sjkim	movq	%rdx, %r13
1144289848Sjkim	adcq	\$0, %r13
1145289848Sjkim
1146289848Sjkim	mulq	%rbx
1147289848Sjkim	addq	%rax, %r14
1148289848Sjkim	movq	56($ap), %rax
1149289848Sjkim	adcq	\$0, %rdx
1150289848Sjkim	addq	%r14, %r13
1151289848Sjkim	movq	%rdx, %r14
1152289848Sjkim	adcq	\$0, %r14
1153289848Sjkim
1154289848Sjkim	mulq	%rbx
1155289848Sjkim	addq	%rax, %r15
1156289848Sjkim	 movq	($ap), %rax
1157289848Sjkim	adcq	\$0, %rdx
1158289848Sjkim	addq	%r15, %r14
1159289848Sjkim	movq	%rdx, %r15
1160289848Sjkim	adcq	\$0, %r15
1161289848Sjkim
1162289848Sjkim	leaq	8(%rdi), %rdi
1163289848Sjkim
1164289848Sjkim	decl	%ecx
1165289848Sjkim	jnz	.Loop_mul_gather
1166289848Sjkim
1167289848Sjkim	movq	%r8, (%rdi)
1168289848Sjkim	movq	%r9, 8(%rdi)
1169289848Sjkim	movq	%r10, 16(%rdi)
1170289848Sjkim	movq	%r11, 24(%rdi)
1171289848Sjkim	movq	%r12, 32(%rdi)
1172289848Sjkim	movq	%r13, 40(%rdi)
1173289848Sjkim	movq	%r14, 48(%rdi)
1174289848Sjkim	movq	%r15, 56(%rdi)
1175289848Sjkim
1176296279Sjkim	movq	128+8(%rsp), $out
1177296279Sjkim	movq	128+16(%rsp), %rbp
1178289848Sjkim
1179289848Sjkim	movq	(%rsp), %r8
1180289848Sjkim	movq	8(%rsp), %r9
1181289848Sjkim	movq	16(%rsp), %r10
1182289848Sjkim	movq	24(%rsp), %r11
1183289848Sjkim	movq	32(%rsp), %r12
1184289848Sjkim	movq	40(%rsp), %r13
1185289848Sjkim	movq	48(%rsp), %r14
1186289848Sjkim	movq	56(%rsp), %r15
1187289848Sjkim
1188289848Sjkim	call	__rsaz_512_reduce
1189289848Sjkim___
1190289848Sjkim$code.=<<___ if ($addx);
1191289848Sjkim	jmp	.Lmul_gather_tail
1192289848Sjkim
1193289848Sjkim.align	32
1194289848Sjkim.Lmulx_gather:
1195296279Sjkim	movq	%xmm8,%rdx		# gathered b[0]
1196289848Sjkim
1197296279Sjkim	mov	$n0, 128(%rsp)		# off-load arguments
1198296279Sjkim	mov	$out, 128+8(%rsp)
1199296279Sjkim	mov	$mod, 128+16(%rsp)
1200296279Sjkim
1201289848Sjkim	mulx	($ap), %rbx, %r8	# 0 iteration
1202289848Sjkim	mov	%rbx, (%rsp)
1203289848Sjkim	xor	%edi, %edi		# cf=0, of=0
1204289848Sjkim
1205289848Sjkim	mulx	8($ap), %rax, %r9
1206289848Sjkim
1207289848Sjkim	mulx	16($ap), %rbx, %r10
1208289848Sjkim	adcx	%rax, %r8
1209289848Sjkim
1210289848Sjkim	mulx	24($ap), %rax, %r11
1211289848Sjkim	adcx	%rbx, %r9
1212289848Sjkim
1213289848Sjkim	mulx	32($ap), %rbx, %r12
1214289848Sjkim	adcx	%rax, %r10
1215289848Sjkim
1216289848Sjkim	mulx	40($ap), %rax, %r13
1217289848Sjkim	adcx	%rbx, %r11
1218289848Sjkim
1219289848Sjkim	mulx	48($ap), %rbx, %r14
1220289848Sjkim	adcx	%rax, %r12
1221289848Sjkim
1222289848Sjkim	mulx	56($ap), %rax, %r15
1223289848Sjkim	adcx	%rbx, %r13
1224289848Sjkim	adcx	%rax, %r14
1225296279Sjkim	.byte	0x67
1226289848Sjkim	mov	%r8, %rbx
1227289848Sjkim	adcx	%rdi, %r15		# %rdi is 0
1228289848Sjkim
1229289848Sjkim	mov	\$-7, %rcx		# loop counter, counts up to 0
1230289848Sjkim	jmp	.Loop_mulx_gather
1231289848Sjkim
1232289848Sjkim.align	32
1233289848Sjkim.Loop_mulx_gather:
	# gather next b[i] (same mask-select pattern as above)
1234296279Sjkim	movdqa	16*0(%rbp),%xmm8
1235296279Sjkim	movdqa	16*1(%rbp),%xmm9
1236296279Sjkim	movdqa	16*2(%rbp),%xmm10
1237296279Sjkim	movdqa	16*3(%rbp),%xmm11
1238296279Sjkim	pand	%xmm0,%xmm8
1239296279Sjkim	movdqa	16*4(%rbp),%xmm12
1240296279Sjkim	pand	%xmm1,%xmm9
1241296279Sjkim	movdqa	16*5(%rbp),%xmm13
1242296279Sjkim	pand	%xmm2,%xmm10
1243296279Sjkim	movdqa	16*6(%rbp),%xmm14
1244296279Sjkim	pand	%xmm3,%xmm11
1245296279Sjkim	movdqa	16*7(%rbp),%xmm15
1246296279Sjkim	leaq	128(%rbp), %rbp
1247296279Sjkim	pand	%xmm4,%xmm12
1248296279Sjkim	pand	%xmm5,%xmm13
1249296279Sjkim	pand	%xmm6,%xmm14
1250296279Sjkim	pand	%xmm7,%xmm15
1251296279Sjkim	por	%xmm10,%xmm8
1252296279Sjkim	por	%xmm11,%xmm9
1253296279Sjkim	por	%xmm12,%xmm8
1254296279Sjkim	por	%xmm13,%xmm9
1255296279Sjkim	por	%xmm14,%xmm8
1256296279Sjkim	por	%xmm15,%xmm9
1257296279Sjkim
1258296279Sjkim	por	%xmm9,%xmm8
1259296279Sjkim	pshufd	\$0x4e,%xmm8,%xmm9
1260296279Sjkim	por	%xmm9,%xmm8
1261296279Sjkim	movq	%xmm8,%rdx
1262296279Sjkim
1263296279Sjkim	.byte	0xc4,0x62,0xfb,0xf6,0x86,0x00,0x00,0x00,0x00	# mulx	($ap), %rax, %r8
1264289848Sjkim	adcx	%rax, %rbx
1265289848Sjkim	adox	%r9, %r8
1266289848Sjkim
1267289848Sjkim	mulx	8($ap), %rax, %r9
1268289848Sjkim	adcx	%rax, %r8
1269289848Sjkim	adox	%r10, %r9
1270289848Sjkim
1271289848Sjkim	mulx	16($ap), %rax, %r10
1272289848Sjkim	adcx	%rax, %r9
1273289848Sjkim	adox	%r11, %r10
1274289848Sjkim
1275289848Sjkim	.byte	0xc4,0x62,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00	# mulx	24($ap), %rax, %r11
1276289848Sjkim	adcx	%rax, %r10
1277289848Sjkim	adox	%r12, %r11
1278289848Sjkim
1279289848Sjkim	mulx	32($ap), %rax, %r12
1280289848Sjkim	adcx	%rax, %r11
1281289848Sjkim	adox	%r13, %r12
1282289848Sjkim
1283289848Sjkim	mulx	40($ap), %rax, %r13
1284289848Sjkim	adcx	%rax, %r12
1285289848Sjkim	adox	%r14, %r13
1286289848Sjkim
1287289848Sjkim	.byte	0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00	# mulx	48($ap), %rax, %r14
1288289848Sjkim	adcx	%rax, %r13
1289296279Sjkim	.byte	0x67
1290289848Sjkim	adox	%r15, %r14
1291289848Sjkim
1292289848Sjkim	mulx	56($ap), %rax, %r15
1293289848Sjkim	 mov	%rbx, 64(%rsp,%rcx,8)
1294289848Sjkim	adcx	%rax, %r14
1295289848Sjkim	adox	%rdi, %r15
1296289848Sjkim	mov	%r8, %rbx
1297289848Sjkim	adcx	%rdi, %r15		# cf=0
1298289848Sjkim
1299289848Sjkim	inc	%rcx			# of=0
1300289848Sjkim	jnz	.Loop_mulx_gather
1301289848Sjkim
1302289848Sjkim	mov	%r8, 64(%rsp)
1303289848Sjkim	mov	%r9, 64+8(%rsp)
1304289848Sjkim	mov	%r10, 64+16(%rsp)
1305289848Sjkim	mov	%r11, 64+24(%rsp)
1306289848Sjkim	mov	%r12, 64+32(%rsp)
1307289848Sjkim	mov	%r13, 64+40(%rsp)
1308289848Sjkim	mov	%r14, 64+48(%rsp)
1309289848Sjkim	mov	%r15, 64+56(%rsp)
1310289848Sjkim
1311296279Sjkim	mov	128(%rsp), %rdx		# pull arguments
1312296279Sjkim	mov	128+8(%rsp), $out
1313296279Sjkim	mov	128+16(%rsp), %rbp
1314289848Sjkim
1315289848Sjkim	mov	(%rsp), %r8
1316289848Sjkim	mov	8(%rsp), %r9
1317289848Sjkim	mov	16(%rsp), %r10
1318289848Sjkim	mov	24(%rsp), %r11
1319289848Sjkim	mov	32(%rsp), %r12
1320289848Sjkim	mov	40(%rsp), %r13
1321289848Sjkim	mov	48(%rsp), %r14
1322289848Sjkim	mov	56(%rsp), %r15
1323289848Sjkim
1324289848Sjkim	call	__rsaz_512_reducex
1325289848Sjkim
1326289848Sjkim.Lmul_gather_tail:
1327289848Sjkim___
1328289848Sjkim$code.=<<___;
1329289848Sjkim	addq	64(%rsp), %r8		# add upper half of the product
1330289848Sjkim	adcq	72(%rsp), %r9
1331289848Sjkim	adcq	80(%rsp), %r10
1332289848Sjkim	adcq	88(%rsp), %r11
1333289848Sjkim	adcq	96(%rsp), %r12
1334289848Sjkim	adcq	104(%rsp), %r13
1335289848Sjkim	adcq	112(%rsp), %r14
1336289848Sjkim	adcq	120(%rsp), %r15
1337289848Sjkim	sbbq	%rcx, %rcx		# rcx = carry ? -1 : 0 (mask)
1338289848Sjkim
1339289848Sjkim	call	__rsaz_512_subtract
1340289848Sjkim
1341289848Sjkim	leaq	128+24+48(%rsp), %rax
1342296279Sjkim___
1343296279Sjkim$code.=<<___	if ($win64);
1344296279Sjkim	movaps	0xa0-0xc8(%rax),%xmm6
1345296279Sjkim	movaps	0xb0-0xc8(%rax),%xmm7
1346296279Sjkim	movaps	0xc0-0xc8(%rax),%xmm8
1347296279Sjkim	movaps	0xd0-0xc8(%rax),%xmm9
1348296279Sjkim	movaps	0xe0-0xc8(%rax),%xmm10
1349296279Sjkim	movaps	0xf0-0xc8(%rax),%xmm11
1350296279Sjkim	movaps	0x100-0xc8(%rax),%xmm12
1351296279Sjkim	movaps	0x110-0xc8(%rax),%xmm13
1352296279Sjkim	movaps	0x120-0xc8(%rax),%xmm14
1353296279Sjkim	movaps	0x130-0xc8(%rax),%xmm15
1354296279Sjkim	lea	0xb0(%rax),%rax
1355296279Sjkim___
1356296279Sjkim$code.=<<___;
1357289848Sjkim	movq	-48(%rax), %r15
1358289848Sjkim	movq	-40(%rax), %r14
1359289848Sjkim	movq	-32(%rax), %r13
1360289848Sjkim	movq	-24(%rax), %r12
1361289848Sjkim	movq	-16(%rax), %rbp
1362289848Sjkim	movq	-8(%rax), %rbx
1363289848Sjkim	leaq	(%rax), %rsp
1364289848Sjkim.Lmul_gather4_epilogue:
1365289848Sjkim	ret
1366289848Sjkim.size	rsaz_512_mul_gather4,.-rsaz_512_mul_gather4
1367289848Sjkim___
1368289848Sjkim}
1369289848Sjkim{
1370289848Sjkimmy ($out,$ap,$mod,$n0,$tbl,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
# rsaz_512_mul_scatter4(out, ap, mod, n0, tbl, pwr): Montgomery-multiply
# ap by the value currently in out (out doubles as the b operand), write
# the result to out, and scatter its eight limbs into tbl at entry pwr
# with a 128-byte stride between limbs (matching the gather layout used
# by rsaz_512_mul_gather4 above).
1371289848Sjkim$code.=<<___;
1372289848Sjkim.globl	rsaz_512_mul_scatter4
1373289848Sjkim.type	rsaz_512_mul_scatter4,\@function,6
1374289848Sjkim.align	32
1375289848Sjkimrsaz_512_mul_scatter4:
1376289848Sjkim	push	%rbx
1377289848Sjkim	push	%rbp
1378289848Sjkim	push	%r12
1379289848Sjkim	push	%r13
1380289848Sjkim	push	%r14
1381289848Sjkim	push	%r15
1382289848Sjkim
1383289848Sjkim	mov	$pwr, $pwr		# zero-extend 32-bit pwr into full r9
1384289848Sjkim	subq	\$128+24, %rsp
1385289848Sjkim.Lmul_scatter4_body:
1386289848Sjkim	leaq	($tbl,$pwr,8), $tbl
1387289848Sjkim	movq	$out, %xmm0		# off-load arguments
1388289848Sjkim	movq	$mod, %xmm1
1389289848Sjkim	movq	$tbl, %xmm2
1390289848Sjkim	movq	$n0, 128(%rsp)
1391289848Sjkim
1392289848Sjkim	movq	$out, %rbp
1393289848Sjkim___
1394289848Sjkim$code.=<<___ if ($addx);
1395289848Sjkim	movl	\$0x80100,%r11d
1396289848Sjkim	andl	OPENSSL_ia32cap_P+8(%rip),%r11d
1397289848Sjkim	cmpl	\$0x80100,%r11d		# check for MULX and ADO/CX
1398289848Sjkim	je	.Lmulx_scatter
1399289848Sjkim___
1400289848Sjkim$code.=<<___;
1401289848Sjkim	movq	($out),%rbx		# pass b[0]
1402289848Sjkim	call	__rsaz_512_mul
1403289848Sjkim
1404289848Sjkim	movq	%xmm0, $out
1405289848Sjkim	movq	%xmm1, %rbp
1406289848Sjkim
1407289848Sjkim	movq	(%rsp), %r8
1408289848Sjkim	movq	8(%rsp), %r9
1409289848Sjkim	movq	16(%rsp), %r10
1410289848Sjkim	movq	24(%rsp), %r11
1411289848Sjkim	movq	32(%rsp), %r12
1412289848Sjkim	movq	40(%rsp), %r13
1413289848Sjkim	movq	48(%rsp), %r14
1414289848Sjkim	movq	56(%rsp), %r15
1415289848Sjkim
1416289848Sjkim	call	__rsaz_512_reduce
1417289848Sjkim___
1418289848Sjkim$code.=<<___ if ($addx);
1419289848Sjkim	jmp	.Lmul_scatter_tail
1420289848Sjkim
1421289848Sjkim.align	32
1422289848Sjkim.Lmulx_scatter:
1423289848Sjkim	movq	($out), %rdx		# pass b[0]
1424289848Sjkim	call	__rsaz_512_mulx
1425289848Sjkim
1426289848Sjkim	movq	%xmm0, $out
1427289848Sjkim	movq	%xmm1, %rbp
1428289848Sjkim
1429289848Sjkim	movq	128(%rsp), %rdx		# pull $n0
1430289848Sjkim	movq	(%rsp), %r8
1431289848Sjkim	movq	8(%rsp), %r9
1432289848Sjkim	movq	16(%rsp), %r10
1433289848Sjkim	movq	24(%rsp), %r11
1434289848Sjkim	movq	32(%rsp), %r12
1435289848Sjkim	movq	40(%rsp), %r13
1436289848Sjkim	movq	48(%rsp), %r14
1437289848Sjkim	movq	56(%rsp), %r15
1438289848Sjkim
1439289848Sjkim	call	__rsaz_512_reducex
1440289848Sjkim
1441289848Sjkim.Lmul_scatter_tail:
1442289848Sjkim___
1443289848Sjkim$code.=<<___;
1444289848Sjkim	addq	64(%rsp), %r8		# add upper half of the product
1445289848Sjkim	adcq	72(%rsp), %r9
1446289848Sjkim	adcq	80(%rsp), %r10
1447289848Sjkim	adcq	88(%rsp), %r11
1448289848Sjkim	adcq	96(%rsp), %r12
1449289848Sjkim	adcq	104(%rsp), %r13
1450289848Sjkim	adcq	112(%rsp), %r14
1451289848Sjkim	adcq	120(%rsp), %r15
1452289848Sjkim	movq	%xmm2, $inp
1453289848Sjkim	sbbq	%rcx, %rcx		# rcx = carry ? -1 : 0 (mask)
1454289848Sjkim
1455289848Sjkim	call	__rsaz_512_subtract
1456289848Sjkim
1457296279Sjkim	movq	%r8, 128*0($inp)	# scatter
1458296279Sjkim	movq	%r9, 128*1($inp)
1459296279Sjkim	movq	%r10, 128*2($inp)
1460296279Sjkim	movq	%r11, 128*3($inp)
1461296279Sjkim	movq	%r12, 128*4($inp)
1462296279Sjkim	movq	%r13, 128*5($inp)
1463296279Sjkim	movq	%r14, 128*6($inp)
1464296279Sjkim	movq	%r15, 128*7($inp)
1465289848Sjkim
1466289848Sjkim	leaq	128+24+48(%rsp), %rax
1467289848Sjkim	movq	-48(%rax), %r15
1468289848Sjkim	movq	-40(%rax), %r14
1469289848Sjkim	movq	-32(%rax), %r13
1470289848Sjkim	movq	-24(%rax), %r12
1471289848Sjkim	movq	-16(%rax), %rbp
1472289848Sjkim	movq	-8(%rax), %rbx
1473289848Sjkim	leaq	(%rax), %rsp
1474289848Sjkim.Lmul_scatter4_epilogue:
1475289848Sjkim	ret
1476289848Sjkim.size	rsaz_512_mul_scatter4,.-rsaz_512_mul_scatter4
1477289848Sjkim___
1478289848Sjkim}
1479289848Sjkim{
1480289848Sjkimmy ($out,$inp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx");
# rsaz_512_mul_by_one(out, inp, mod, n0): Montgomery-reduce inp with a
# zeroed upper product half, i.e. multiply by 1 in Montgomery terms
# (used to convert a value out of Montgomery representation).
1481289848Sjkim$code.=<<___;
1482289848Sjkim.globl	rsaz_512_mul_by_one
1483289848Sjkim.type	rsaz_512_mul_by_one,\@function,4
1484289848Sjkim.align	32
1485289848Sjkimrsaz_512_mul_by_one:
1486289848Sjkim	push	%rbx
1487289848Sjkim	push	%rbp
1488289848Sjkim	push	%r12
1489289848Sjkim	push	%r13
1490289848Sjkim	push	%r14
1491289848Sjkim	push	%r15
1492289848Sjkim
1493289848Sjkim	subq	\$128+24, %rsp
1494289848Sjkim.Lmul_by_one_body:
1495289848Sjkim___
1496289848Sjkim$code.=<<___ if ($addx);
1497289848Sjkim	movl	OPENSSL_ia32cap_P+8(%rip),%eax
1498289848Sjkim___
1499289848Sjkim$code.=<<___;
1500289848Sjkim	movq	$mod, %rbp	# reassign argument
1501289848Sjkim	movq	$n0, 128(%rsp)
1502289848Sjkim
1503289848Sjkim	movq	($inp), %r8
1504289848Sjkim	pxor	%xmm0, %xmm0
1505289848Sjkim	movq	8($inp), %r9
1506289848Sjkim	movq	16($inp), %r10
1507289848Sjkim	movq	24($inp), %r11
1508289848Sjkim	movq	32($inp), %r12
1509289848Sjkim	movq	40($inp), %r13
1510289848Sjkim	movq	48($inp), %r14
1511289848Sjkim	movq	56($inp), %r15
1512289848Sjkim
1513289848Sjkim	movdqa	%xmm0, (%rsp)		# clear 112 bytes of tmp
1514289848Sjkim	movdqa	%xmm0, 16(%rsp)
1515289848Sjkim	movdqa	%xmm0, 32(%rsp)
1516289848Sjkim	movdqa	%xmm0, 48(%rsp)
1517289848Sjkim	movdqa	%xmm0, 64(%rsp)
1518289848Sjkim	movdqa	%xmm0, 80(%rsp)
1519289848Sjkim	movdqa	%xmm0, 96(%rsp)
1520289848Sjkim___
1521289848Sjkim$code.=<<___ if ($addx);
1522289848Sjkim	andl	\$0x80100,%eax
1523289848Sjkim	cmpl	\$0x80100,%eax		# check for MULX and ADO/CX
1524289848Sjkim	je	.Lby_one_callx
1525289848Sjkim___
1526289848Sjkim$code.=<<___;
1527289848Sjkim	call	__rsaz_512_reduce
1528289848Sjkim___
1529289848Sjkim$code.=<<___ if ($addx);
1530289848Sjkim	jmp	.Lby_one_tail
1531289848Sjkim.align	32
1532289848Sjkim.Lby_one_callx:
1533289848Sjkim	movq	128(%rsp), %rdx		# pull $n0
1534289848Sjkim	call	__rsaz_512_reducex
1535289848Sjkim.Lby_one_tail:
1536289848Sjkim___
1537289848Sjkim$code.=<<___;
1538289848Sjkim	movq	%r8, ($out)
1539289848Sjkim	movq	%r9, 8($out)
1540289848Sjkim	movq	%r10, 16($out)
1541289848Sjkim	movq	%r11, 24($out)
1542289848Sjkim	movq	%r12, 32($out)
1543289848Sjkim	movq	%r13, 40($out)
1544289848Sjkim	movq	%r14, 48($out)
1545289848Sjkim	movq	%r15, 56($out)
1546289848Sjkim
1547289848Sjkim	leaq	128+24+48(%rsp), %rax
1548289848Sjkim	movq	-48(%rax), %r15
1549289848Sjkim	movq	-40(%rax), %r14
1550289848Sjkim	movq	-32(%rax), %r13
1551289848Sjkim	movq	-24(%rax), %r12
1552289848Sjkim	movq	-16(%rax), %rbp
1553289848Sjkim	movq	-8(%rax), %rbx
1554289848Sjkim	leaq	(%rax), %rsp
1555289848Sjkim.Lmul_by_one_epilogue:
1556289848Sjkim	ret
1557289848Sjkim.size	rsaz_512_mul_by_one,.-rsaz_512_mul_by_one
1558289848Sjkim___
1559289848Sjkim}
1560289848Sjkim{	# __rsaz_512_reduce
1561289848Sjkim	#
1562289848Sjkim	# input:	%r8-%r15, %rbp - mod, 128(%rsp) - n0
1563289848Sjkim	# output:	%r8-%r15
1564289848Sjkim	# clobbers:	everything except %rbp and %rdi
	#
	# Classic 8-iteration Montgomery reduction with mulq.  Each pass
	# computes rbx = limb * n0 (mod 2^64); then limb + rbx*mod[0] == 0
	# (mod 2^64), so the low product limb cancels and only the carry
	# propagates.  NOTE: callers store n0 at 128(%rsp); inside this
	# routine the extra +8 accounts for the return address pushed by
	# the call instruction.
1565289848Sjkim$code.=<<___;
1566289848Sjkim.type	__rsaz_512_reduce,\@abi-omnipotent
1567289848Sjkim.align	32
1568289848Sjkim__rsaz_512_reduce:
1569289848Sjkim	movq	%r8, %rbx
1570289848Sjkim	imulq	128+8(%rsp), %rbx	# rbx = r8 * n0 (+8 for ret addr)
1571289848Sjkim	movq	0(%rbp), %rax
1572289848Sjkim	movl	\$8, %ecx
1573289848Sjkim	jmp	.Lreduction_loop
1574289848Sjkim
1575289848Sjkim.align	32
1576289848Sjkim.Lreduction_loop:
1577289848Sjkim	mulq	%rbx
1578289848Sjkim	movq	8(%rbp), %rax
1579289848Sjkim	negq	%r8			# low limb cancels; capture its carry
1580289848Sjkim	movq	%rdx, %r8
1581289848Sjkim	adcq	\$0, %r8
1582289848Sjkim
1583289848Sjkim	mulq	%rbx
1584289848Sjkim	addq	%rax, %r9
1585289848Sjkim	movq	16(%rbp), %rax
1586289848Sjkim	adcq	\$0, %rdx
1587289848Sjkim	addq	%r9, %r8
1588289848Sjkim	movq	%rdx, %r9
1589289848Sjkim	adcq	\$0, %r9
1590289848Sjkim
1591289848Sjkim	mulq	%rbx
1592289848Sjkim	addq	%rax, %r10
1593289848Sjkim	movq	24(%rbp), %rax
1594289848Sjkim	adcq	\$0, %rdx
1595289848Sjkim	addq	%r10, %r9
1596289848Sjkim	movq	%rdx, %r10
1597289848Sjkim	adcq	\$0, %r10
1598289848Sjkim
1599289848Sjkim	mulq	%rbx
1600289848Sjkim	addq	%rax, %r11
1601289848Sjkim	movq	32(%rbp), %rax
1602289848Sjkim	adcq	\$0, %rdx
1603289848Sjkim	addq	%r11, %r10
1604289848Sjkim	 movq	128+8(%rsp), %rsi
1605289848Sjkim	#movq	%rdx, %r11
1606289848Sjkim	#adcq	\$0, %r11
1607289848Sjkim	adcq	\$0, %rdx
1608289848Sjkim	movq	%rdx, %r11
1609289848Sjkim
1610289848Sjkim	mulq	%rbx
1611289848Sjkim	addq	%rax, %r12
1612289848Sjkim	movq	40(%rbp), %rax
1613289848Sjkim	adcq	\$0, %rdx
1614289848Sjkim	 imulq	%r8, %rsi		# next multiplier = r8 * n0
1615289848Sjkim	addq	%r12, %r11
1616289848Sjkim	movq	%rdx, %r12
1617289848Sjkim	adcq	\$0, %r12
1618289848Sjkim
1619289848Sjkim	mulq	%rbx
1620289848Sjkim	addq	%rax, %r13
1621289848Sjkim	movq	48(%rbp), %rax
1622289848Sjkim	adcq	\$0, %rdx
1623289848Sjkim	addq	%r13, %r12
1624289848Sjkim	movq	%rdx, %r13
1625289848Sjkim	adcq	\$0, %r13
1626289848Sjkim
1627289848Sjkim	mulq	%rbx
1628289848Sjkim	addq	%rax, %r14
1629289848Sjkim	movq	56(%rbp), %rax
1630289848Sjkim	adcq	\$0, %rdx
1631289848Sjkim	addq	%r14, %r13
1632289848Sjkim	movq	%rdx, %r14
1633289848Sjkim	adcq	\$0, %r14
1634289848Sjkim
1635289848Sjkim	mulq	%rbx
1636289848Sjkim	 movq	%rsi, %rbx
1637289848Sjkim	addq	%rax, %r15
1638289848Sjkim	 movq	0(%rbp), %rax
1639289848Sjkim	adcq	\$0, %rdx
1640289848Sjkim	addq	%r15, %r14
1641289848Sjkim	movq	%rdx, %r15
1642289848Sjkim	adcq	\$0, %r15
1643289848Sjkim
1644289848Sjkim	decl	%ecx
1645289848Sjkim	jne	.Lreduction_loop
1646289848Sjkim
1647289848Sjkim	ret
1648289848Sjkim.size	__rsaz_512_reduce,.-__rsaz_512_reduce
1649289848Sjkim___
1650289848Sjkim}
1651289848Sjkimif ($addx) {
1652289848Sjkim	# __rsaz_512_reducex
1653289848Sjkim	#
1654289848Sjkim	# input:	%r8-%r15, %rbp - mod, 128(%rsp) - n0
1655289848Sjkim	# output:	%r8-%r15
1656289848Sjkim	# clobbers:	everything except %rbp and %rdi
	#
	# ADX flavour of the Montgomery reduction.  %rdx enters holding n0
	# and is immediately folded into the first multiplier (r8*n0).
	# Inside the loop %rdx is juggled between the current multiplier
	# and n0 (reloaded from 128+8(%rsp) - the +8 accounts for the
	# return address pushed by call) to derive the next iteration's
	# multiplier while the adcx/adox dual carry chains run.
1657289848Sjkim$code.=<<___;
1658289848Sjkim.type	__rsaz_512_reducex,\@abi-omnipotent
1659289848Sjkim.align	32
1660289848Sjkim__rsaz_512_reducex:
1661289848Sjkim	#movq	128+8(%rsp), %rdx		# pull $n0
1662289848Sjkim	imulq	%r8, %rdx
1663289848Sjkim	xorq	%rsi, %rsi			# cf=0,of=0
1664289848Sjkim	movl	\$8, %ecx
1665289848Sjkim	jmp	.Lreduction_loopx
1666289848Sjkim
1667289848Sjkim.align	32
1668289848Sjkim.Lreduction_loopx:
1669289848Sjkim	mov	%r8, %rbx
1670289848Sjkim	mulx	0(%rbp), %rax, %r8
1671289848Sjkim	adcx	%rbx, %rax
1672289848Sjkim	adox	%r9, %r8
1673289848Sjkim
1674289848Sjkim	mulx	8(%rbp), %rax, %r9
1675289848Sjkim	adcx	%rax, %r8
1676289848Sjkim	adox	%r10, %r9
1677289848Sjkim
1678289848Sjkim	mulx	16(%rbp), %rbx, %r10
1679289848Sjkim	adcx	%rbx, %r9
1680289848Sjkim	adox	%r11, %r10
1681289848Sjkim
1682289848Sjkim	mulx	24(%rbp), %rbx, %r11
1683289848Sjkim	adcx	%rbx, %r10
1684289848Sjkim	adox	%r12, %r11
1685289848Sjkim
1686289848Sjkim	.byte	0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00	# mulx	32(%rbp), %rbx, %r12
1687289848Sjkim	 mov	%rdx, %rax
1688289848Sjkim	 mov	%r8, %rdx
1689289848Sjkim	adcx	%rbx, %r11
1690289848Sjkim	adox	%r13, %r12
1691289848Sjkim
1692289848Sjkim	 mulx	128+8(%rsp), %rbx, %rdx		# rbx = next multiplier
1693289848Sjkim	 mov	%rax, %rdx
1694289848Sjkim
1695289848Sjkim	mulx	40(%rbp), %rax, %r13
1696289848Sjkim	adcx	%rax, %r12
1697289848Sjkim	adox	%r14, %r13
1698289848Sjkim
1699289848Sjkim	.byte	0xc4,0x62,0xfb,0xf6,0xb5,0x30,0x00,0x00,0x00	# mulx	48(%rbp), %rax, %r14
1700289848Sjkim	adcx	%rax, %r13
1701289848Sjkim	adox	%r15, %r14
1702289848Sjkim
1703289848Sjkim	mulx	56(%rbp), %rax, %r15
1704289848Sjkim	 mov	%rbx, %rdx
1705289848Sjkim	adcx	%rax, %r14
1706289848Sjkim	adox	%rsi, %r15			# %rsi is 0
1707289848Sjkim	adcx	%rsi, %r15			# cf=0
1708289848Sjkim
1709289848Sjkim	decl	%ecx				# of=0
1710289848Sjkim	jne	.Lreduction_loopx
1711289848Sjkim
1712289848Sjkim	ret
1713289848Sjkim.size	__rsaz_512_reducex,.-__rsaz_512_reducex
1714289848Sjkim___
1715289848Sjkim}
1716289848Sjkim{	# __rsaz_512_subtract
1717289848Sjkim	# input: %r8-%r15, %rdi - $out, %rbp - $mod, %rcx - mask
1718289848Sjkim	# output:
1719289848Sjkim	# clobbers: everything but %rdi, %rsi and %rbp
	#
	# Branch-free conditional subtraction: mask (%rcx) is 0 or -1.
	# Stores the unreduced result to out, builds (-mod & mask) in
	# %r8-%r15 (neg on the low limb plus not on the rest forms the
	# two's complement), then adds it to out - i.e. the modulus is
	# subtracted only when the mask is all-ones, with no
	# data-dependent branch.
1720289848Sjkim$code.=<<___;
1721289848Sjkim.type	__rsaz_512_subtract,\@abi-omnipotent
1722289848Sjkim.align	32
1723289848Sjkim__rsaz_512_subtract:
1724289848Sjkim	movq	%r8, ($out)
1725289848Sjkim	movq	%r9, 8($out)
1726289848Sjkim	movq	%r10, 16($out)
1727289848Sjkim	movq	%r11, 24($out)
1728289848Sjkim	movq	%r12, 32($out)
1729289848Sjkim	movq	%r13, 40($out)
1730289848Sjkim	movq	%r14, 48($out)
1731289848Sjkim	movq	%r15, 56($out)
1732289848Sjkim
1733289848Sjkim	movq	0($mod), %r8
1734289848Sjkim	movq	8($mod), %r9
1735289848Sjkim	negq	%r8
1736289848Sjkim	notq	%r9
1737289848Sjkim	andq	%rcx, %r8
1738289848Sjkim	movq	16($mod), %r10
1739289848Sjkim	andq	%rcx, %r9
1740289848Sjkim	notq	%r10
1741289848Sjkim	movq	24($mod), %r11
1742289848Sjkim	andq	%rcx, %r10
1743289848Sjkim	notq	%r11
1744289848Sjkim	movq	32($mod), %r12
1745289848Sjkim	andq	%rcx, %r11
1746289848Sjkim	notq	%r12
1747289848Sjkim	movq	40($mod), %r13
1748289848Sjkim	andq	%rcx, %r12
1749289848Sjkim	notq	%r13
1750289848Sjkim	movq	48($mod), %r14
1751289848Sjkim	andq	%rcx, %r13
1752289848Sjkim	notq	%r14
1753289848Sjkim	movq	56($mod), %r15
1754289848Sjkim	andq	%rcx, %r14
1755289848Sjkim	notq	%r15
1756289848Sjkim	andq	%rcx, %r15
1757289848Sjkim
1758289848Sjkim	addq	($out), %r8
1759289848Sjkim	adcq	8($out), %r9
1760289848Sjkim	adcq	16($out), %r10
1761289848Sjkim	adcq	24($out), %r11
1762289848Sjkim	adcq	32($out), %r12
1763289848Sjkim	adcq	40($out), %r13
1764289848Sjkim	adcq	48($out), %r14
1765289848Sjkim	adcq	56($out), %r15
1766289848Sjkim
1767289848Sjkim	movq	%r8, ($out)
1768289848Sjkim	movq	%r9, 8($out)
1769289848Sjkim	movq	%r10, 16($out)
1770289848Sjkim	movq	%r11, 24($out)
1771289848Sjkim	movq	%r12, 32($out)
1772289848Sjkim	movq	%r13, 40($out)
1773289848Sjkim	movq	%r14, 48($out)
1774289848Sjkim	movq	%r15, 56($out)
1775289848Sjkim
1776289848Sjkim	ret
1777289848Sjkim.size	__rsaz_512_subtract,.-__rsaz_512_subtract
1778289848Sjkim___
1779289848Sjkim}
1780289848Sjkim{	# __rsaz_512_mul
1781289848Sjkim	#
1782289848Sjkim	# input: %rsi - ap, %rbp - bp
1783289848Sjkim	# output:
1784289848Sjkim	# clobbers: everything
1785289848Sjkimmy ($ap,$bp) = ("%rsi","%rbp");
1786289848Sjkim$code.=<<___;
1787289848Sjkim.type	__rsaz_512_mul,\@abi-omnipotent
1788289848Sjkim.align	32
1789289848Sjkim__rsaz_512_mul:
1790289848Sjkim	leaq	8(%rsp), %rdi
1791289848Sjkim
1792289848Sjkim	movq	($ap), %rax
1793289848Sjkim	mulq	%rbx
1794289848Sjkim	movq	%rax, (%rdi)
1795289848Sjkim	movq	8($ap), %rax
1796289848Sjkim	movq	%rdx, %r8
1797289848Sjkim
1798289848Sjkim	mulq	%rbx
1799289848Sjkim	addq	%rax, %r8
1800289848Sjkim	movq	16($ap), %rax
1801289848Sjkim	movq	%rdx, %r9
1802289848Sjkim	adcq	\$0, %r9
1803289848Sjkim
1804289848Sjkim	mulq	%rbx
1805289848Sjkim	addq	%rax, %r9
1806289848Sjkim	movq	24($ap), %rax
1807289848Sjkim	movq	%rdx, %r10
1808289848Sjkim	adcq	\$0, %r10
1809289848Sjkim
1810289848Sjkim	mulq	%rbx
1811289848Sjkim	addq	%rax, %r10
1812289848Sjkim	movq	32($ap), %rax
1813289848Sjkim	movq	%rdx, %r11
1814289848Sjkim	adcq	\$0, %r11
1815289848Sjkim
1816289848Sjkim	mulq	%rbx
1817289848Sjkim	addq	%rax, %r11
1818289848Sjkim	movq	40($ap), %rax
1819289848Sjkim	movq	%rdx, %r12
1820289848Sjkim	adcq	\$0, %r12
1821289848Sjkim
1822289848Sjkim	mulq	%rbx
1823289848Sjkim	addq	%rax, %r12
1824289848Sjkim	movq	48($ap), %rax
1825289848Sjkim	movq	%rdx, %r13
1826289848Sjkim	adcq	\$0, %r13
1827289848Sjkim
1828289848Sjkim	mulq	%rbx
1829289848Sjkim	addq	%rax, %r13
1830289848Sjkim	movq	56($ap), %rax
1831289848Sjkim	movq	%rdx, %r14
1832289848Sjkim	adcq	\$0, %r14
1833289848Sjkim
1834289848Sjkim	mulq	%rbx
1835289848Sjkim	addq	%rax, %r14
1836289848Sjkim	 movq	($ap), %rax
1837289848Sjkim	movq	%rdx, %r15
1838289848Sjkim	adcq	\$0, %r15
1839289848Sjkim
1840289848Sjkim	leaq	8($bp), $bp
1841289848Sjkim	leaq	8(%rdi), %rdi
1842289848Sjkim
1843289848Sjkim	movl	\$7, %ecx
1844289848Sjkim	jmp	.Loop_mul
1845289848Sjkim
1846289848Sjkim.align	32
1847289848Sjkim.Loop_mul:
1848289848Sjkim	movq	($bp), %rbx
1849289848Sjkim	mulq	%rbx
1850289848Sjkim	addq	%rax, %r8
1851289848Sjkim	movq	8($ap), %rax
1852289848Sjkim	movq	%r8, (%rdi)
1853289848Sjkim	movq	%rdx, %r8
1854289848Sjkim	adcq	\$0, %r8
1855289848Sjkim
1856289848Sjkim	mulq	%rbx
1857289848Sjkim	addq	%rax, %r9
1858289848Sjkim	movq	16($ap), %rax
1859289848Sjkim	adcq	\$0, %rdx
1860289848Sjkim	addq	%r9, %r8
1861289848Sjkim	movq	%rdx, %r9
1862289848Sjkim	adcq	\$0, %r9
1863289848Sjkim
1864289848Sjkim	mulq	%rbx
1865289848Sjkim	addq	%rax, %r10
1866289848Sjkim	movq	24($ap), %rax
1867289848Sjkim	adcq	\$0, %rdx
1868289848Sjkim	addq	%r10, %r9
1869289848Sjkim	movq	%rdx, %r10
1870289848Sjkim	adcq	\$0, %r10
1871289848Sjkim
1872289848Sjkim	mulq	%rbx
1873289848Sjkim	addq	%rax, %r11
1874289848Sjkim	movq	32($ap), %rax
1875289848Sjkim	adcq	\$0, %rdx
1876289848Sjkim	addq	%r11, %r10
1877289848Sjkim	movq	%rdx, %r11
1878289848Sjkim	adcq	\$0, %r11
1879289848Sjkim
1880289848Sjkim	mulq	%rbx
1881289848Sjkim	addq	%rax, %r12
1882289848Sjkim	movq	40($ap), %rax
1883289848Sjkim	adcq	\$0, %rdx
1884289848Sjkim	addq	%r12, %r11
1885289848Sjkim	movq	%rdx, %r12
1886289848Sjkim	adcq	\$0, %r12
1887289848Sjkim
1888289848Sjkim	mulq	%rbx
1889289848Sjkim	addq	%rax, %r13
1890289848Sjkim	movq	48($ap), %rax
1891289848Sjkim	adcq	\$0, %rdx
1892289848Sjkim	addq	%r13, %r12
1893289848Sjkim	movq	%rdx, %r13
1894289848Sjkim	adcq	\$0, %r13
1895289848Sjkim
1896289848Sjkim	mulq	%rbx
1897289848Sjkim	addq	%rax, %r14
1898289848Sjkim	movq	56($ap), %rax
1899289848Sjkim	adcq	\$0, %rdx
1900289848Sjkim	addq	%r14, %r13
1901289848Sjkim	movq	%rdx, %r14
1902289848Sjkim	 leaq	8($bp), $bp
1903289848Sjkim	adcq	\$0, %r14
1904289848Sjkim
1905289848Sjkim	mulq	%rbx
1906289848Sjkim	addq	%rax, %r15
1907289848Sjkim	 movq	($ap), %rax
1908289848Sjkim	adcq	\$0, %rdx
1909289848Sjkim	addq	%r15, %r14
1910289848Sjkim	movq	%rdx, %r15
1911289848Sjkim	adcq	\$0, %r15
1912289848Sjkim
1913289848Sjkim	leaq	8(%rdi), %rdi
1914289848Sjkim
1915289848Sjkim	decl	%ecx
1916289848Sjkim	jnz	.Loop_mul
1917289848Sjkim
1918289848Sjkim	movq	%r8, (%rdi)
1919289848Sjkim	movq	%r9, 8(%rdi)
1920289848Sjkim	movq	%r10, 16(%rdi)
1921289848Sjkim	movq	%r11, 24(%rdi)
1922289848Sjkim	movq	%r12, 32(%rdi)
1923289848Sjkim	movq	%r13, 40(%rdi)
1924289848Sjkim	movq	%r14, 48(%rdi)
1925289848Sjkim	movq	%r15, 56(%rdi)
1926289848Sjkim
1927289848Sjkim	ret
1928289848Sjkim.size	__rsaz_512_mul,.-__rsaz_512_mul
1929289848Sjkim___
1930289848Sjkim}
1931289848Sjkimif ($addx) {
1932289848Sjkim	# __rsaz_512_mulx
1933289848Sjkim	#
1934289848Sjkim	# input: %rsi - ap, %rbp - bp
1935289848Sjkim	# output:
1936289848Sjkim	# clobbers: everything
	# 512x512 -> 1024-bit schoolbook multiply using BMI2/ADX
	# (mulx + dual adcx/adox carry chains); only emitted when the
	# assembler was detected as ADX-capable ($addx).  The caller
	# preloads %rdx with bp[0].  Product limbs are accumulated at
	# 8(%rsp)..8+120(%rsp); $zero (%rdi) is kept at 0 and used to
	# fold the final CF/OF into %r15.  %rcx counts -6..0 so that
	# 64($bp,%rcx,8) walks bp[2]..bp[7] inside .Loop_mulx, with the
	# bp[7] pass peeled off after the loop.  The hand-encoded .byte
	# sequences are mulx instructions (see trailing comments) kept
	# as raw bytes for assemblers that lack mulx / for alignment.
1937289848Sjkimmy ($ap,$bp,$zero) = ("%rsi","%rbp","%rdi");
1938289848Sjkim$code.=<<___;
1939289848Sjkim.type	__rsaz_512_mulx,\@abi-omnipotent
1940289848Sjkim.align	32
1941289848Sjkim__rsaz_512_mulx:
1942289848Sjkim	mulx	($ap), %rbx, %r8	# initial %rdx preloaded by caller
1943289848Sjkim	mov	\$-6, %rcx
1944289848Sjkim
1945289848Sjkim	mulx	8($ap), %rax, %r9
1946289848Sjkim	movq	%rbx, 8(%rsp)
1947289848Sjkim
1948289848Sjkim	mulx	16($ap), %rbx, %r10
1949289848Sjkim	adc	%rax, %r8
1950289848Sjkim
1951289848Sjkim	mulx	24($ap), %rax, %r11
1952289848Sjkim	adc	%rbx, %r9
1953289848Sjkim
1954289848Sjkim	mulx	32($ap), %rbx, %r12
1955289848Sjkim	adc	%rax, %r10
1956289848Sjkim
1957289848Sjkim	mulx	40($ap), %rax, %r13
1958289848Sjkim	adc	%rbx, %r11
1959289848Sjkim
1960289848Sjkim	mulx	48($ap), %rbx, %r14
1961289848Sjkim	adc	%rax, %r12
1962289848Sjkim
1963289848Sjkim	mulx	56($ap), %rax, %r15
1964289848Sjkim	 mov	8($bp), %rdx
1965289848Sjkim	adc	%rbx, %r13
1966289848Sjkim	adc	%rax, %r14
1967289848Sjkim	adc	\$0, %r15
1968289848Sjkim
1969289848Sjkim	xor	$zero, $zero		# cf=0,of=0
1970289848Sjkim	jmp	.Loop_mulx
1971289848Sjkim
1972289848Sjkim.align	32
1973289848Sjkim.Loop_mulx:
1974289848Sjkim	movq	%r8, %rbx
1975289848Sjkim	mulx	($ap), %rax, %r8
1976289848Sjkim	adcx	%rax, %rbx
1977289848Sjkim	adox	%r9, %r8
1978289848Sjkim
1979289848Sjkim	mulx	8($ap), %rax, %r9
1980289848Sjkim	adcx	%rax, %r8
1981289848Sjkim	adox	%r10, %r9
1982289848Sjkim
1983289848Sjkim	mulx	16($ap), %rax, %r10
1984289848Sjkim	adcx	%rax, %r9
1985289848Sjkim	adox	%r11, %r10
1986289848Sjkim
1987289848Sjkim	mulx	24($ap), %rax, %r11
1988289848Sjkim	adcx	%rax, %r10
1989289848Sjkim	adox	%r12, %r11
1990289848Sjkim
1991289848Sjkim	.byte	0x3e,0xc4,0x62,0xfb,0xf6,0xa6,0x20,0x00,0x00,0x00	# mulx	32($ap), %rax, %r12
1992289848Sjkim	adcx	%rax, %r11
1993289848Sjkim	adox	%r13, %r12
1994289848Sjkim
1995289848Sjkim	mulx	40($ap), %rax, %r13
1996289848Sjkim	adcx	%rax, %r12
1997289848Sjkim	adox	%r14, %r13
1998289848Sjkim
1999289848Sjkim	mulx	48($ap), %rax, %r14
2000289848Sjkim	adcx	%rax, %r13
2001289848Sjkim	adox	%r15, %r14
2002289848Sjkim
2003289848Sjkim	mulx	56($ap), %rax, %r15
2004289848Sjkim	 movq	64($bp,%rcx,8), %rdx
2005289848Sjkim	 movq	%rbx, 8+64-8(%rsp,%rcx,8)
2006289848Sjkim	adcx	%rax, %r14
2007289848Sjkim	adox	$zero, %r15
2008289848Sjkim	adcx	$zero, %r15		# cf=0
2009289848Sjkim
2010289848Sjkim	inc	%rcx			# of=0
2011289848Sjkim	jnz	.Loop_mulx
2012289848Sjkim
2013289848Sjkim	movq	%r8, %rbx
2014289848Sjkim	mulx	($ap), %rax, %r8
2015289848Sjkim	adcx	%rax, %rbx
2016289848Sjkim	adox	%r9, %r8
2017289848Sjkim
2018289848Sjkim	.byte	0xc4,0x62,0xfb,0xf6,0x8e,0x08,0x00,0x00,0x00	# mulx	8($ap), %rax, %r9
2019289848Sjkim	adcx	%rax, %r8
2020289848Sjkim	adox	%r10, %r9
2021289848Sjkim
2022289848Sjkim	.byte	0xc4,0x62,0xfb,0xf6,0x96,0x10,0x00,0x00,0x00	# mulx	16($ap), %rax, %r10
2023289848Sjkim	adcx	%rax, %r9
2024289848Sjkim	adox	%r11, %r10
2025289848Sjkim
2026289848Sjkim	mulx	24($ap), %rax, %r11
2027289848Sjkim	adcx	%rax, %r10
2028289848Sjkim	adox	%r12, %r11
2029289848Sjkim
2030289848Sjkim	mulx	32($ap), %rax, %r12
2031289848Sjkim	adcx	%rax, %r11
2032289848Sjkim	adox	%r13, %r12
2033289848Sjkim
2034289848Sjkim	mulx	40($ap), %rax, %r13
2035289848Sjkim	adcx	%rax, %r12
2036289848Sjkim	adox	%r14, %r13
2037289848Sjkim
2038289848Sjkim	.byte	0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00	# mulx	48($ap), %rax, %r14
2039289848Sjkim	adcx	%rax, %r13
2040289848Sjkim	adox	%r15, %r14
2041289848Sjkim
2042289848Sjkim	.byte	0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00	# mulx	56($ap), %rax, %r15
2043289848Sjkim	adcx	%rax, %r14
2044289848Sjkim	adox	$zero, %r15
2045289848Sjkim	adcx	$zero, %r15
2046289848Sjkim
2047289848Sjkim	mov	%rbx, 8+64-8(%rsp)
2048289848Sjkim	mov	%r8, 8+64(%rsp)
2049289848Sjkim	mov	%r9, 8+64+8(%rsp)
2050289848Sjkim	mov	%r10, 8+64+16(%rsp)
2051289848Sjkim	mov	%r11, 8+64+24(%rsp)
2052289848Sjkim	mov	%r12, 8+64+32(%rsp)
2053289848Sjkim	mov	%r13, 8+64+40(%rsp)
2054289848Sjkim	mov	%r14, 8+64+48(%rsp)
2055289848Sjkim	mov	%r15, 8+64+56(%rsp)
2056289848Sjkim
2057289848Sjkim	ret
2058289848Sjkim.size	__rsaz_512_mulx,.-__rsaz_512_mulx
2059289848Sjkim___
2060289848Sjkim}
2061289848Sjkim{
# rsaz_512_scatter4(tbl, val, power): store the 8 qwords at $inp into
# table column $power (base offset $power*8, row stride 128 bytes).
#
# rsaz_512_gather4(val, tbl, power): constant-time gather of column
# $power.  $power is broadcast (pshufd \$0) and compared (pcmpeqd)
# against lane indices 0..15 built up in xmm0..xmm7 from .Linc, giving
# all-ones masks only for the selected column; every loop iteration
# then reads a full 128-byte table row and pand/por-combines it under
# those masks, so the memory access pattern is independent of $power
# (cache-timing defence).  On Win64 the prologue/epilogue save and
# restore xmm6-15 (hand-encoded so .LSEH_begin covers exact bytes).
2062289848Sjkimmy ($out,$inp,$power)= $win64 ? ("%rcx","%rdx","%r8d") : ("%rdi","%rsi","%edx");
2063289848Sjkim$code.=<<___;
2064289848Sjkim.globl	rsaz_512_scatter4
2065289848Sjkim.type	rsaz_512_scatter4,\@abi-omnipotent
2066289848Sjkim.align	16
2067289848Sjkimrsaz_512_scatter4:
2068296279Sjkim	leaq	($out,$power,8), $out
2069289848Sjkim	movl	\$8, %r9d
2070289848Sjkim	jmp	.Loop_scatter
2071289848Sjkim.align	16
2072289848Sjkim.Loop_scatter:
2073289848Sjkim	movq	($inp), %rax
2074289848Sjkim	leaq	8($inp), $inp
2075296279Sjkim	movq	%rax, ($out)
2076289848Sjkim	leaq	128($out), $out
2077289848Sjkim	decl	%r9d
2078289848Sjkim	jnz	.Loop_scatter
2079289848Sjkim	ret
2080289848Sjkim.size	rsaz_512_scatter4,.-rsaz_512_scatter4
2081289848Sjkim
2082289848Sjkim.globl	rsaz_512_gather4
2083289848Sjkim.type	rsaz_512_gather4,\@abi-omnipotent
2084289848Sjkim.align	16
2085289848Sjkimrsaz_512_gather4:
2086296279Sjkim___
2087296279Sjkim$code.=<<___	if ($win64);
2088296279Sjkim.LSEH_begin_rsaz_512_gather4:
2089296279Sjkim	.byte	0x48,0x81,0xec,0xa8,0x00,0x00,0x00	# sub    $0xa8,%rsp
2090296279Sjkim	.byte	0x0f,0x29,0x34,0x24			# movaps %xmm6,(%rsp)
2091296279Sjkim	.byte	0x0f,0x29,0x7c,0x24,0x10		# movaps %xmm7,0x10(%rsp)
2092296279Sjkim	.byte	0x44,0x0f,0x29,0x44,0x24,0x20		# movaps %xmm8,0x20(%rsp)
2093296279Sjkim	.byte	0x44,0x0f,0x29,0x4c,0x24,0x30		# movaps %xmm9,0x30(%rsp)
2094296279Sjkim	.byte	0x44,0x0f,0x29,0x54,0x24,0x40		# movaps %xmm10,0x40(%rsp)
2095296279Sjkim	.byte	0x44,0x0f,0x29,0x5c,0x24,0x50		# movaps %xmm11,0x50(%rsp)
2096296279Sjkim	.byte	0x44,0x0f,0x29,0x64,0x24,0x60		# movaps %xmm12,0x60(%rsp)
2097296279Sjkim	.byte	0x44,0x0f,0x29,0x6c,0x24,0x70		# movaps %xmm13,0x70(%rsp)
2098296279Sjkim	.byte	0x44,0x0f,0x29,0xb4,0x24,0x80,0,0,0	# movaps %xmm14,0x80(%rsp)
2099296279Sjkim	.byte	0x44,0x0f,0x29,0xbc,0x24,0x90,0,0,0	# movaps %xmm15,0x90(%rsp)
2100296279Sjkim___
2101296279Sjkim$code.=<<___;
2102296279Sjkim	movd	$power,%xmm8
2103296279Sjkim	movdqa	.Linc+16(%rip),%xmm1	# 00000002000000020000000200000002
2104296279Sjkim	movdqa	.Linc(%rip),%xmm0	# 00000001000000010000000000000000
2105296279Sjkim
2106296279Sjkim	pshufd	\$0,%xmm8,%xmm8		# broadcast $power
2107296279Sjkim	movdqa	%xmm1,%xmm7
2108296279Sjkim	movdqa	%xmm1,%xmm2
2109296279Sjkim___
2110296279Sjkim########################################################################
2111296279Sjkim# calculate mask by comparing 0..15 to $power
2112296279Sjkim#
2113296279Sjkimfor($i=0;$i<4;$i++) {
2114296279Sjkim$code.=<<___;
2115296279Sjkim	paddd	%xmm`$i`,%xmm`$i+1`
2116296279Sjkim	pcmpeqd	%xmm8,%xmm`$i`
2117296279Sjkim	movdqa	%xmm7,%xmm`$i+3`
2118296279Sjkim___
2119296279Sjkim}
# remaining lanes: keep incrementing and comparing, no more copies of
# the +2 increment vector are needed past xmm7
2120296279Sjkimfor(;$i<7;$i++) {
2121296279Sjkim$code.=<<___;
2122296279Sjkim	paddd	%xmm`$i`,%xmm`$i+1`
2123296279Sjkim	pcmpeqd	%xmm8,%xmm`$i`
2124296279Sjkim___
2125296279Sjkim}
2126296279Sjkim$code.=<<___;
2127296279Sjkim	pcmpeqd	%xmm8,%xmm7
2128289848Sjkim	movl	\$8, %r9d
2129289848Sjkim	jmp	.Loop_gather
2130289848Sjkim.align	16
2131289848Sjkim.Loop_gather:
2132296279Sjkim	movdqa	16*0($inp),%xmm8
2133296279Sjkim	movdqa	16*1($inp),%xmm9
2134296279Sjkim	movdqa	16*2($inp),%xmm10
2135296279Sjkim	movdqa	16*3($inp),%xmm11
2136296279Sjkim	pand	%xmm0,%xmm8
2137296279Sjkim	movdqa	16*4($inp),%xmm12
2138296279Sjkim	pand	%xmm1,%xmm9
2139296279Sjkim	movdqa	16*5($inp),%xmm13
2140296279Sjkim	pand	%xmm2,%xmm10
2141296279Sjkim	movdqa	16*6($inp),%xmm14
2142296279Sjkim	pand	%xmm3,%xmm11
2143296279Sjkim	movdqa	16*7($inp),%xmm15
2144289848Sjkim	leaq	128($inp), $inp
2145296279Sjkim	pand	%xmm4,%xmm12
2146296279Sjkim	pand	%xmm5,%xmm13
2147296279Sjkim	pand	%xmm6,%xmm14
2148296279Sjkim	pand	%xmm7,%xmm15
2149296279Sjkim	por	%xmm10,%xmm8
2150296279Sjkim	por	%xmm11,%xmm9
2151296279Sjkim	por	%xmm12,%xmm8
2152296279Sjkim	por	%xmm13,%xmm9
2153296279Sjkim	por	%xmm14,%xmm8
2154296279Sjkim	por	%xmm15,%xmm9
2155296279Sjkim
2156296279Sjkim	por	%xmm9,%xmm8
2157296279Sjkim	pshufd	\$0x4e,%xmm8,%xmm9
2158296279Sjkim	por	%xmm9,%xmm8
2159296279Sjkim	movq	%xmm8,($out)
2160289848Sjkim	leaq	8($out), $out
2161289848Sjkim	decl	%r9d
2162289848Sjkim	jnz	.Loop_gather
2163296279Sjkim___
2164296279Sjkim$code.=<<___	if ($win64);
2165296279Sjkim	movaps	0x00(%rsp),%xmm6
2166296279Sjkim	movaps	0x10(%rsp),%xmm7
2167296279Sjkim	movaps	0x20(%rsp),%xmm8
2168296279Sjkim	movaps	0x30(%rsp),%xmm9
2169296279Sjkim	movaps	0x40(%rsp),%xmm10
2170296279Sjkim	movaps	0x50(%rsp),%xmm11
2171296279Sjkim	movaps	0x60(%rsp),%xmm12
2172296279Sjkim	movaps	0x70(%rsp),%xmm13
2173296279Sjkim	movaps	0x80(%rsp),%xmm14
2174296279Sjkim	movaps	0x90(%rsp),%xmm15
2175296279Sjkim	add	\$0xa8,%rsp
2176296279Sjkim___
2177296279Sjkim$code.=<<___;
2178289848Sjkim	ret
2179296279Sjkim.LSEH_end_rsaz_512_gather4:
2180289848Sjkim.size	rsaz_512_gather4,.-rsaz_512_gather4
2181296279Sjkim
2182296279Sjkim.align	64
2183296279Sjkim.Linc:
2184296279Sjkim	.long	0,0, 1,1
2185296279Sjkim	.long	2,2, 2,2
2186289848Sjkim___
2187289848Sjkim}
2188289848Sjkim
2189289848Sjkim# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
2190289848Sjkim#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
2191289848Sjkimif ($win64) {
2192289848Sjkim$rec="%rcx";
2193289848Sjkim$frame="%rdx";
2194289848Sjkim$context="%r8";
2195289848Sjkim$disp="%r9";
2196289848Sjkim
# se_handler: Win64 structured-exception handler shared by the
# rsaz_512_* entry points (language-specific handler, flags=9 in the
# .xdata records below).  HandlerData[0]/[1] are the image-relative
# body/epilogue labels; when Rip lies inside that window it recovers
# the saved non-volatile GPRs from the frame, and for the
# mul_gather4 frame additionally copies the ten saved xmm registers
# (20 qwords) back into the CONTEXT (presumably its Xmm6..Xmm15 area
# at offset 512 -- confirm against winnt.h), then resumes unwinding
# via RtlVirtualUnwind.  rsaz_512_gather4 instead uses the raw
# UNWIND_INFO opcodes at .LSEH_info_rsaz_512_gather4 (no handler).
2197289848Sjkim$code.=<<___;
2198289848Sjkim.extern	__imp_RtlVirtualUnwind
2199289848Sjkim.type	se_handler,\@abi-omnipotent
2200289848Sjkim.align	16
2201289848Sjkimse_handler:
2202289848Sjkim	push	%rsi
2203289848Sjkim	push	%rdi
2204289848Sjkim	push	%rbx
2205289848Sjkim	push	%rbp
2206289848Sjkim	push	%r12
2207289848Sjkim	push	%r13
2208289848Sjkim	push	%r14
2209289848Sjkim	push	%r15
2210289848Sjkim	pushfq
2211289848Sjkim	sub	\$64,%rsp
2212289848Sjkim
2213289848Sjkim	mov	120($context),%rax	# pull context->Rax
2214289848Sjkim	mov	248($context),%rbx	# pull context->Rip
2215289848Sjkim
2216289848Sjkim	mov	8($disp),%rsi		# disp->ImageBase
2217289848Sjkim	mov	56($disp),%r11		# disp->HandlerData
2218289848Sjkim
2219289848Sjkim	mov	0(%r11),%r10d		# HandlerData[0]
2220289848Sjkim	lea	(%rsi,%r10),%r10	# end of prologue label
2221289848Sjkim	cmp	%r10,%rbx		# context->Rip<end of prologue label
2222289848Sjkim	jb	.Lcommon_seh_tail
2223289848Sjkim
2224289848Sjkim	mov	152($context),%rax	# pull context->Rsp
2225289848Sjkim
2226289848Sjkim	mov	4(%r11),%r10d		# HandlerData[1]
2227289848Sjkim	lea	(%rsi,%r10),%r10	# epilogue label
2228289848Sjkim	cmp	%r10,%rbx		# context->Rip>=epilogue label
2229289848Sjkim	jae	.Lcommon_seh_tail
2230289848Sjkim
2231289848Sjkim	lea	128+24+48(%rax),%rax
2232289848Sjkim
2233296279Sjkim	lea	.Lmul_gather4_epilogue(%rip),%rbx
2234296279Sjkim	cmp	%r10,%rbx
2235296279Sjkim	jne	.Lse_not_in_mul_gather4
2236296279Sjkim
2237296279Sjkim	lea	0xb0(%rax),%rax
2238296279Sjkim
2239296279Sjkim	lea	-48-0xa8(%rax),%rsi
2240296279Sjkim	lea	512($context),%rdi
2241296279Sjkim	mov	\$20,%ecx
2242296279Sjkim	.long	0xa548f3fc		# cld; rep movsq
2243296279Sjkim
2244296279Sjkim.Lse_not_in_mul_gather4:
2245289848Sjkim	mov	-8(%rax),%rbx
2246289848Sjkim	mov	-16(%rax),%rbp
2247289848Sjkim	mov	-24(%rax),%r12
2248289848Sjkim	mov	-32(%rax),%r13
2249289848Sjkim	mov	-40(%rax),%r14
2250289848Sjkim	mov	-48(%rax),%r15
2251289848Sjkim	mov	%rbx,144($context)	# restore context->Rbx
2252289848Sjkim	mov	%rbp,160($context)	# restore context->Rbp
2253289848Sjkim	mov	%r12,216($context)	# restore context->R12
2254289848Sjkim	mov	%r13,224($context)	# restore context->R13
2255289848Sjkim	mov	%r14,232($context)	# restore context->R14
2256289848Sjkim	mov	%r15,240($context)	# restore context->R15
2257289848Sjkim
2258289848Sjkim.Lcommon_seh_tail:
2259289848Sjkim	mov	8(%rax),%rdi
2260289848Sjkim	mov	16(%rax),%rsi
2261289848Sjkim	mov	%rax,152($context)	# restore context->Rsp
2262289848Sjkim	mov	%rsi,168($context)	# restore context->Rsi
2263289848Sjkim	mov	%rdi,176($context)	# restore context->Rdi
2264289848Sjkim
2265289848Sjkim	mov	40($disp),%rdi		# disp->ContextRecord
2266289848Sjkim	mov	$context,%rsi		# context
2267289848Sjkim	mov	\$154,%ecx		# sizeof(CONTEXT)
2268289848Sjkim	.long	0xa548f3fc		# cld; rep movsq
2269289848Sjkim
2270289848Sjkim	mov	$disp,%rsi
2271289848Sjkim	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
2272289848Sjkim	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
2273289848Sjkim	mov	0(%rsi),%r8		# arg3, disp->ControlPc
2274289848Sjkim	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
2275289848Sjkim	mov	40(%rsi),%r10		# disp->ContextRecord
2276289848Sjkim	lea	56(%rsi),%r11		# &disp->HandlerData
2277289848Sjkim	lea	24(%rsi),%r12		# &disp->EstablisherFrame
2278289848Sjkim	mov	%r10,32(%rsp)		# arg5
2279289848Sjkim	mov	%r11,40(%rsp)		# arg6
2280289848Sjkim	mov	%r12,48(%rsp)		# arg7
2281289848Sjkim	mov	%rcx,56(%rsp)		# arg8, (NULL)
2282289848Sjkim	call	*__imp_RtlVirtualUnwind(%rip)
2283289848Sjkim
2284289848Sjkim	mov	\$1,%eax		# ExceptionContinueSearch
2285289848Sjkim	add	\$64,%rsp
2286289848Sjkim	popfq
2287289848Sjkim	pop	%r15
2288289848Sjkim	pop	%r14
2289289848Sjkim	pop	%r13
2290289848Sjkim	pop	%r12
2291289848Sjkim	pop	%rbp
2292289848Sjkim	pop	%rbx
2293289848Sjkim	pop	%rdi
2294289848Sjkim	pop	%rsi
2295289848Sjkim	ret
2296296279Sjkim.size	se_handler,.-se_handler
2297289848Sjkim
2298289848Sjkim.section	.pdata
2299289848Sjkim.align	4
2300289848Sjkim	.rva	.LSEH_begin_rsaz_512_sqr
2301289848Sjkim	.rva	.LSEH_end_rsaz_512_sqr
2302289848Sjkim	.rva	.LSEH_info_rsaz_512_sqr
2303289848Sjkim
2304289848Sjkim	.rva	.LSEH_begin_rsaz_512_mul
2305289848Sjkim	.rva	.LSEH_end_rsaz_512_mul
2306289848Sjkim	.rva	.LSEH_info_rsaz_512_mul
2307289848Sjkim
2308289848Sjkim	.rva	.LSEH_begin_rsaz_512_mul_gather4
2309289848Sjkim	.rva	.LSEH_end_rsaz_512_mul_gather4
2310289848Sjkim	.rva	.LSEH_info_rsaz_512_mul_gather4
2311289848Sjkim
2312289848Sjkim	.rva	.LSEH_begin_rsaz_512_mul_scatter4
2313289848Sjkim	.rva	.LSEH_end_rsaz_512_mul_scatter4
2314289848Sjkim	.rva	.LSEH_info_rsaz_512_mul_scatter4
2315289848Sjkim
2316289848Sjkim	.rva	.LSEH_begin_rsaz_512_mul_by_one
2317289848Sjkim	.rva	.LSEH_end_rsaz_512_mul_by_one
2318289848Sjkim	.rva	.LSEH_info_rsaz_512_mul_by_one
2319289848Sjkim
2320296279Sjkim	.rva	.LSEH_begin_rsaz_512_gather4
2321296279Sjkim	.rva	.LSEH_end_rsaz_512_gather4
2322296279Sjkim	.rva	.LSEH_info_rsaz_512_gather4
2323296279Sjkim
2324289848Sjkim.section	.xdata
2325289848Sjkim.align	8
2326289848Sjkim.LSEH_info_rsaz_512_sqr:
2327289848Sjkim	.byte	9,0,0,0
2328289848Sjkim	.rva	se_handler
2329289848Sjkim	.rva	.Lsqr_body,.Lsqr_epilogue			# HandlerData[]
2330289848Sjkim.LSEH_info_rsaz_512_mul:
2331289848Sjkim	.byte	9,0,0,0
2332289848Sjkim	.rva	se_handler
2333289848Sjkim	.rva	.Lmul_body,.Lmul_epilogue			# HandlerData[]
2334289848Sjkim.LSEH_info_rsaz_512_mul_gather4:
2335289848Sjkim	.byte	9,0,0,0
2336289848Sjkim	.rva	se_handler
2337289848Sjkim	.rva	.Lmul_gather4_body,.Lmul_gather4_epilogue	# HandlerData[]
2338289848Sjkim.LSEH_info_rsaz_512_mul_scatter4:
2339289848Sjkim	.byte	9,0,0,0
2340289848Sjkim	.rva	se_handler
2341289848Sjkim	.rva	.Lmul_scatter4_body,.Lmul_scatter4_epilogue	# HandlerData[]
2342289848Sjkim.LSEH_info_rsaz_512_mul_by_one:
2343289848Sjkim	.byte	9,0,0,0
2344289848Sjkim	.rva	se_handler
2345289848Sjkim	.rva	.Lmul_by_one_body,.Lmul_by_one_epilogue		# HandlerData[]
2346296279Sjkim.LSEH_info_rsaz_512_gather4:
2347296279Sjkim	.byte	0x01,0x46,0x16,0x00
2348296279Sjkim	.byte	0x46,0xf8,0x09,0x00	# vmovaps 0x90(rsp),xmm15
2349296279Sjkim	.byte	0x3d,0xe8,0x08,0x00	# vmovaps 0x80(rsp),xmm14
2350296279Sjkim	.byte	0x34,0xd8,0x07,0x00	# vmovaps 0x70(rsp),xmm13
2351296279Sjkim	.byte	0x2e,0xc8,0x06,0x00	# vmovaps 0x60(rsp),xmm12
2352296279Sjkim	.byte	0x28,0xb8,0x05,0x00	# vmovaps 0x50(rsp),xmm11
2353296279Sjkim	.byte	0x22,0xa8,0x04,0x00	# vmovaps 0x40(rsp),xmm10
2354296279Sjkim	.byte	0x1c,0x98,0x03,0x00	# vmovaps 0x30(rsp),xmm9
2355296279Sjkim	.byte	0x16,0x88,0x02,0x00	# vmovaps 0x20(rsp),xmm8
2356296279Sjkim	.byte	0x10,0x78,0x01,0x00	# vmovaps 0x10(rsp),xmm7
2357296279Sjkim	.byte	0x0b,0x68,0x00,0x00	# vmovaps 0x00(rsp),xmm6
2358296279Sjkim	.byte	0x07,0x01,0x15,0x00	# sub     rsp,0xa8
2359289848Sjkim___
2360289848Sjkim}
2361289848Sjkim
# Expand `...` snippets (e.g. register-number arithmetic) via eval,
# then emit the generated assembly on stdout.  Check the close so a
# write failure (full disk, closed pipe) cannot silently truncate the
# .s output.
2362289848Sjkim$code =~ s/\`([^\`]*)\`/eval $1/gem;
2363289848Sjkimprint $code;
2364289848Sjkimclose STDOUT or die "error closing STDOUT: $!";
2365