ecp_nistz256-x86_64.pl revision 306195
1#!/usr/bin/env perl
2
3##############################################################################
4#                                                                            #
5# Copyright 2014 Intel Corporation                                           #
6#                                                                            #
7# Licensed under the Apache License, Version 2.0 (the "License");            #
8# you may not use this file except in compliance with the License.           #
9# You may obtain a copy of the License at                                    #
10#                                                                            #
11#    http://www.apache.org/licenses/LICENSE-2.0                              #
12#                                                                            #
13# Unless required by applicable law or agreed to in writing, software        #
14# distributed under the License is distributed on an "AS IS" BASIS,          #
15# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.   #
16# See the License for the specific language governing permissions and        #
17# limitations under the License.                                             #
18#                                                                            #
19##############################################################################
20#                                                                            #
21#  Developers and authors:                                                   #
22#  Shay Gueron (1, 2), and Vlad Krasnov (1)                                  #
23#  (1) Intel Corporation, Israel Development Center                          #
24#  (2) University of Haifa                                                   #
25#  Reference:                                                                #
26#  S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with#
27#                           256 Bit Primes"                                  #
28#                                                                            #
29##############################################################################
30
31# Further optimization by <appro@openssl.org>:
32#
33#		this/original	with/without -DECP_NISTZ256_ASM(*)
34# Opteron	+12-49%		+110-150%
35# Bulldozer	+14-45%		+175-210%
36# P4		+18-46%		n/a :-(
37# Westmere	+12-34%		+80-87%
38# Sandy Bridge	+9-35%		+110-120%
39# Ivy Bridge	+9-35%		+110-125%
40# Haswell	+8-37%		+140-160%
41# Broadwell	+18-58%		+145-210%
42# Atom		+15-50%		+130-180%
43# VIA Nano	+43-160%	+300-480%
44#
45# (*)	"without -DECP_NISTZ256_ASM" refers to build with
46#	"enable-ec_nistp_64_gcc_128";
47#
48# Ranges denote minimum and maximum improvement coefficients depending
49# on the benchmark. Lower coefficients are for ECDSA sign, the relatively
50# fastest server-side operation. Keep in mind that +100% means 2x improvement.
51
52$flavour = shift;
53$output  = shift;
54if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
55
56$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
57
58$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
59( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
60( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
61die "can't locate x86_64-xlate.pl";
62
63open OUT,"| \"$^X\" $xlate $flavour $output";
64*STDOUT=*OUT;
65
66if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
67		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
68	$avx = ($1>=2.19) + ($1>=2.22);
69	$addx = ($1>=2.23);
70}
71
72if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
73	    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
74	$avx = ($1>=2.09) + ($1>=2.10);
75	$addx = ($1>=2.10);
76}
77
78if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
79	    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
80	$avx = ($1>=10) + ($1>=11);
81	$addx = ($1>=12);
82}
83
84if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9])\.([0-9]+)/) {
85	my $ver = $2 + $3/100.0;	# 3.1->3.01, 3.10->3.10
86	$avx = ($ver>=3.0) + ($ver>=3.01);
87	$addx = ($ver>=3.03);
88}
89
90$code.=<<___;
91.text
92.extern	OPENSSL_ia32cap_P
93
94# The NIST P-256 prime p256, least significant 64-bit limb first
95.align 64
96.Lpoly:
97.quad 0xffffffffffffffff, 0x00000000ffffffff, 0x0000000000000000, 0xffffffff00000001
98
99# 2^512 mod P, precomputed for the NIST P-256 prime
100.LRR:
101.quad 0x0000000000000003, 0xfffffffbffffffff, 0xfffffffffffffffe, 0x00000004fffffffd
102
103.LOne:
104.long 1,1,1,1,1,1,1,1
105.LTwo:
106.long 2,2,2,2,2,2,2,2
107.LThree:
108.long 3,3,3,3,3,3,3,3
109.LONE_mont:
110.quad 0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe
111___
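# For reference: .LRR is 2^512 mod p and .LONE_mont is 2^256 mod p (the
# Montgomery form of 1 for R = 2^256).  A minimal Perl sketch to re-derive
# them, not used by the build and shown only as a sanity check:
#
#	use Math::BigInt;
#	my $p  = Math::BigInt->from_hex(
#	    "ffffffff00000001000000000000000000000000ffffffffffffffffffffffff");
#	my $rr = Math::BigInt->new(1)->blsft(512)->bmod($p);	# .LRR
#	my $r1 = Math::BigInt->new(1)->blsft(256)->bmod($p);	# .LONE_mont
#	print $rr->as_hex(), "\n", $r1->as_hex(), "\n";
#
# Both printed values correspond to the .quad tables above read least
# significant limb first.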
112
113{
114################################################################################
115# void ecp_nistz256_mul_by_2(uint64_t res[4], uint64_t a[4]);
116
117my ($a0,$a1,$a2,$a3)=map("%r$_",(8..11));
118my ($t0,$t1,$t2,$t3,$t4)=("%rax","%rdx","%rcx","%r12","%r13");
119my ($r_ptr,$a_ptr,$b_ptr)=("%rdi","%rsi","%rdx");
120
121$code.=<<___;
122
123.globl	ecp_nistz256_mul_by_2
124.type	ecp_nistz256_mul_by_2,\@function,2
125.align	64
126ecp_nistz256_mul_by_2:
127	push	%r12
128	push	%r13
129
130	mov	8*0($a_ptr), $a0
131	xor	$t4,$t4
132	mov	8*1($a_ptr), $a1
133	add	$a0, $a0		# a0:a3+a0:a3
134	mov	8*2($a_ptr), $a2
135	adc	$a1, $a1
136	mov	8*3($a_ptr), $a3
137	lea	.Lpoly(%rip), $a_ptr
138	 mov	$a0, $t0
139	adc	$a2, $a2
140	adc	$a3, $a3
141	 mov	$a1, $t1
142	adc	\$0, $t4
143
144	sub	8*0($a_ptr), $a0
145	 mov	$a2, $t2
146	sbb	8*1($a_ptr), $a1
147	sbb	8*2($a_ptr), $a2
148	 mov	$a3, $t3
149	sbb	8*3($a_ptr), $a3
150	sbb	\$0, $t4
151
152	cmovc	$t0, $a0
153	cmovc	$t1, $a1
154	mov	$a0, 8*0($r_ptr)
155	cmovc	$t2, $a2
156	mov	$a1, 8*1($r_ptr)
157	cmovc	$t3, $a3
158	mov	$a2, 8*2($r_ptr)
159	mov	$a3, 8*3($r_ptr)
160
161	pop	%r13
162	pop	%r12
163	ret
164.size	ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2
165
166################################################################################
167# void ecp_nistz256_div_by_2(uint64_t res[4], uint64_t a[4]);
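# Modular halving: if a is even the result is a/2, if a is odd it is
# (a+p)/2, which is exact because p is odd.  The code below adds p
# unconditionally, keeps either a or a+p depending on the parity of a
# (selected with cmovz, so there is no data-dependent branch), and then
# shifts the five-word result, including the carry, right by one bit.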
168.globl	ecp_nistz256_div_by_2
169.type	ecp_nistz256_div_by_2,\@function,2
170.align	32
171ecp_nistz256_div_by_2:
172	push	%r12
173	push	%r13
174
175	mov	8*0($a_ptr), $a0
176	mov	8*1($a_ptr), $a1
177	mov	8*2($a_ptr), $a2
178	 mov	$a0, $t0
179	mov	8*3($a_ptr), $a3
180	lea	.Lpoly(%rip), $a_ptr
181
182	 mov	$a1, $t1
183	xor	$t4, $t4
184	add	8*0($a_ptr), $a0
185	 mov	$a2, $t2
186	adc	8*1($a_ptr), $a1
187	adc	8*2($a_ptr), $a2
188	 mov	$a3, $t3
189	adc	8*3($a_ptr), $a3
190	adc	\$0, $t4
191	xor	$a_ptr, $a_ptr		# borrow $a_ptr
192	test	\$1, $t0
193
194	cmovz	$t0, $a0
195	cmovz	$t1, $a1
196	cmovz	$t2, $a2
197	cmovz	$t3, $a3
198	cmovz	$a_ptr, $t4
199
200	mov	$a1, $t0		# a0:a3>>1
201	shr	\$1, $a0
202	shl	\$63, $t0
203	mov	$a2, $t1
204	shr	\$1, $a1
205	or	$t0, $a0
206	shl	\$63, $t1
207	mov	$a3, $t2
208	shr	\$1, $a2
209	or	$t1, $a1
210	shl	\$63, $t2
211	shr	\$1, $a3
212	shl	\$63, $t4
213	or	$t2, $a2
214	or	$t4, $a3
215
216	mov	$a0, 8*0($r_ptr)
217	mov	$a1, 8*1($r_ptr)
218	mov	$a2, 8*2($r_ptr)
219	mov	$a3, 8*3($r_ptr)
220
221	pop	%r13
222	pop	%r12
223	ret
224.size	ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2
225
226################################################################################
227# void ecp_nistz256_mul_by_3(uint64_t res[4], uint64_t a[4]);
228.globl	ecp_nistz256_mul_by_3
229.type	ecp_nistz256_mul_by_3,\@function,2
230.align	32
231ecp_nistz256_mul_by_3:
232	push	%r12
233	push	%r13
234
235	mov	8*0($a_ptr), $a0
236	xor	$t4, $t4
237	mov	8*1($a_ptr), $a1
238	add	$a0, $a0		# a0:a3+a0:a3
239	mov	8*2($a_ptr), $a2
240	adc	$a1, $a1
241	mov	8*3($a_ptr), $a3
242	 mov	$a0, $t0
243	adc	$a2, $a2
244	adc	$a3, $a3
245	 mov	$a1, $t1
246	adc	\$0, $t4
247
248	sub	\$-1, $a0
249	 mov	$a2, $t2
250	sbb	.Lpoly+8*1(%rip), $a1
251	sbb	\$0, $a2
252	 mov	$a3, $t3
253	sbb	.Lpoly+8*3(%rip), $a3
254	sbb	\$0, $t4
255
256	cmovc	$t0, $a0
257	cmovc	$t1, $a1
258	cmovc	$t2, $a2
259	cmovc	$t3, $a3
260
261	xor	$t4, $t4
262	add	8*0($a_ptr), $a0	# a0:a3+=a_ptr[0:3]
263	adc	8*1($a_ptr), $a1
264	 mov	$a0, $t0
265	adc	8*2($a_ptr), $a2
266	adc	8*3($a_ptr), $a3
267	 mov	$a1, $t1
268	adc	\$0, $t4
269
270	sub	\$-1, $a0
271	 mov	$a2, $t2
272	sbb	.Lpoly+8*1(%rip), $a1
273	sbb	\$0, $a2
274	 mov	$a3, $t3
275	sbb	.Lpoly+8*3(%rip), $a3
276	sbb	\$0, $t4
277
278	cmovc	$t0, $a0
279	cmovc	$t1, $a1
280	mov	$a0, 8*0($r_ptr)
281	cmovc	$t2, $a2
282	mov	$a1, 8*1($r_ptr)
283	cmovc	$t3, $a3
284	mov	$a2, 8*2($r_ptr)
285	mov	$a3, 8*3($r_ptr)
286
287	pop %r13
288	pop %r12
289	ret
290.size	ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3
291
292################################################################################
293# void ecp_nistz256_add(uint64_t res[4], uint64_t a[4], uint64_t b[4]);
294.globl	ecp_nistz256_add
295.type	ecp_nistz256_add,\@function,3
296.align	32
297ecp_nistz256_add:
298	push	%r12
299	push	%r13
300
301	mov	8*0($a_ptr), $a0
302	xor	$t4, $t4
303	mov	8*1($a_ptr), $a1
304	mov	8*2($a_ptr), $a2
305	mov	8*3($a_ptr), $a3
306	lea	.Lpoly(%rip), $a_ptr
307
308	add	8*0($b_ptr), $a0
309	adc	8*1($b_ptr), $a1
310	 mov	$a0, $t0
311	adc	8*2($b_ptr), $a2
312	adc	8*3($b_ptr), $a3
313	 mov	$a1, $t1
314	adc	\$0, $t4
315
316	sub	8*0($a_ptr), $a0
317	 mov	$a2, $t2
318	sbb	8*1($a_ptr), $a1
319	sbb	8*2($a_ptr), $a2
320	 mov	$a3, $t3
321	sbb	8*3($a_ptr), $a3
322	sbb	\$0, $t4
323
324	cmovc	$t0, $a0
325	cmovc	$t1, $a1
326	mov	$a0, 8*0($r_ptr)
327	cmovc	$t2, $a2
328	mov	$a1, 8*1($r_ptr)
329	cmovc	$t3, $a3
330	mov	$a2, 8*2($r_ptr)
331	mov	$a3, 8*3($r_ptr)
332
333	pop %r13
334	pop %r12
335	ret
336.size	ecp_nistz256_add,.-ecp_nistz256_add
337
338################################################################################
339# void ecp_nistz256_sub(uint64_t res[4], uint64_t a[4], uint64_t b[4]);
340.globl	ecp_nistz256_sub
341.type	ecp_nistz256_sub,\@function,3
342.align	32
343ecp_nistz256_sub:
344	push	%r12
345	push	%r13
346
347	mov	8*0($a_ptr), $a0
348	xor	$t4, $t4
349	mov	8*1($a_ptr), $a1
350	mov	8*2($a_ptr), $a2
351	mov	8*3($a_ptr), $a3
352	lea	.Lpoly(%rip), $a_ptr
353
354	sub	8*0($b_ptr), $a0
355	sbb	8*1($b_ptr), $a1
356	 mov	$a0, $t0
357	sbb	8*2($b_ptr), $a2
358	sbb	8*3($b_ptr), $a3
359	 mov	$a1, $t1
360	sbb	\$0, $t4
361
362	add	8*0($a_ptr), $a0
363	 mov	$a2, $t2
364	adc	8*1($a_ptr), $a1
365	adc	8*2($a_ptr), $a2
366	 mov	$a3, $t3
367	adc	8*3($a_ptr), $a3
368	test	$t4, $t4
369
370	cmovz	$t0, $a0
371	cmovz	$t1, $a1
372	mov	$a0, 8*0($r_ptr)
373	cmovz	$t2, $a2
374	mov	$a1, 8*1($r_ptr)
375	cmovz	$t3, $a3
376	mov	$a2, 8*2($r_ptr)
377	mov	$a3, 8*3($r_ptr)
378
379	pop %r13
380	pop %r12
381	ret
382.size	ecp_nistz256_sub,.-ecp_nistz256_sub
383
384################################################################################
385# void ecp_nistz256_neg(uint64_t res[4], uint64_t a[4]);
386.globl	ecp_nistz256_neg
387.type	ecp_nistz256_neg,\@function,2
388.align	32
389ecp_nistz256_neg:
390	push	%r12
391	push	%r13
392
393	xor	$a0, $a0
394	xor	$a1, $a1
395	xor	$a2, $a2
396	xor	$a3, $a3
397	xor	$t4, $t4
398
399	sub	8*0($a_ptr), $a0
400	sbb	8*1($a_ptr), $a1
401	sbb	8*2($a_ptr), $a2
402	 mov	$a0, $t0
403	sbb	8*3($a_ptr), $a3
404	lea	.Lpoly(%rip), $a_ptr
405	 mov	$a1, $t1
406	sbb	\$0, $t4
407
408	add	8*0($a_ptr), $a0
409	 mov	$a2, $t2
410	adc	8*1($a_ptr), $a1
411	adc	8*2($a_ptr), $a2
412	 mov	$a3, $t3
413	adc	8*3($a_ptr), $a3
414	test	$t4, $t4
415
416	cmovz	$t0, $a0
417	cmovz	$t1, $a1
418	mov	$a0, 8*0($r_ptr)
419	cmovz	$t2, $a2
420	mov	$a1, 8*1($r_ptr)
421	cmovz	$t3, $a3
422	mov	$a2, 8*2($r_ptr)
423	mov	$a3, 8*3($r_ptr)
424
425	pop %r13
426	pop %r12
427	ret
428.size	ecp_nistz256_neg,.-ecp_nistz256_neg
429___
430}
431{
432my ($r_ptr,$a_ptr,$b_org,$b_ptr)=("%rdi","%rsi","%rdx","%rbx");
433my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15));
434my ($t0,$t1,$t2,$t3,$t4)=("%rcx","%rbp","%rbx","%rdx","%rax");
435my ($poly1,$poly3)=($acc6,$acc7);
436
437$code.=<<___;
438################################################################################
439# void ecp_nistz256_to_mont(
440#   uint64_t res[4],
441#   uint64_t in[4]);
442.globl	ecp_nistz256_to_mont
443.type	ecp_nistz256_to_mont,\@function,2
444.align	32
445ecp_nistz256_to_mont:
446___
447$code.=<<___	if ($addx);
448	mov	\$0x80100, %ecx
449	and	OPENSSL_ia32cap_P+8(%rip), %ecx
450___
451$code.=<<___;
452	lea	.LRR(%rip), $b_org
453	jmp	.Lmul_mont
454.size	ecp_nistz256_to_mont,.-ecp_nistz256_to_mont
455
456################################################################################
457# void ecp_nistz256_mul_mont(
458#   uint64_t res[4],
459#   uint64_t a[4],
460#   uint64_t b[4]);
461
462.globl	ecp_nistz256_mul_mont
463.type	ecp_nistz256_mul_mont,\@function,3
464.align	32
465ecp_nistz256_mul_mont:
466___
467$code.=<<___	if ($addx);
468	mov	\$0x80100, %ecx
469	and	OPENSSL_ia32cap_P+8(%rip), %ecx
470___
471$code.=<<___;
472.Lmul_mont:
473	push	%rbp
474	push	%rbx
475	push	%r12
476	push	%r13
477	push	%r14
478	push	%r15
479___
480$code.=<<___	if ($addx);
481	cmp	\$0x80100, %ecx
482	je	.Lmul_montx
483___
484$code.=<<___;
485	mov	$b_org, $b_ptr
486	mov	8*0($b_org), %rax
487	mov	8*0($a_ptr), $acc1
488	mov	8*1($a_ptr), $acc2
489	mov	8*2($a_ptr), $acc3
490	mov	8*3($a_ptr), $acc4
491
492	call	__ecp_nistz256_mul_montq
493___
494$code.=<<___	if ($addx);
495	jmp	.Lmul_mont_done
496
497.align	32
498.Lmul_montx:
499	mov	$b_org, $b_ptr
500	mov	8*0($b_org), %rdx
501	mov	8*0($a_ptr), $acc1
502	mov	8*1($a_ptr), $acc2
503	mov	8*2($a_ptr), $acc3
504	mov	8*3($a_ptr), $acc4
505	lea	-128($a_ptr), $a_ptr	# control u-op density
506
507	call	__ecp_nistz256_mul_montx
508___
509$code.=<<___;
510.Lmul_mont_done:
511	pop	%r15
512	pop	%r14
513	pop	%r13
514	pop	%r12
515	pop	%rbx
516	pop	%rbp
517	ret
518.size	ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont
519
520.type	__ecp_nistz256_mul_montq,\@abi-omnipotent
521.align	32
522__ecp_nistz256_mul_montq:
523	########################################################################
524	# Multiply a by b[0]
525	mov	%rax, $t1
526	mulq	$acc1
527	mov	.Lpoly+8*1(%rip),$poly1
528	mov	%rax, $acc0
529	mov	$t1, %rax
530	mov	%rdx, $acc1
531
532	mulq	$acc2
533	mov	.Lpoly+8*3(%rip),$poly3
534	add	%rax, $acc1
535	mov	$t1, %rax
536	adc	\$0, %rdx
537	mov	%rdx, $acc2
538
539	mulq	$acc3
540	add	%rax, $acc2
541	mov	$t1, %rax
542	adc	\$0, %rdx
543	mov	%rdx, $acc3
544
545	mulq	$acc4
546	add	%rax, $acc3
547	 mov	$acc0, %rax
548	adc	\$0, %rdx
549	xor	$acc5, $acc5
550	mov	%rdx, $acc4
551
552	########################################################################
553	# First reduction step
554	# Basically now we want to multiply acc[0] by p256,
555	# and add the result to the acc.
556	# Due to the special form of p256 we do some optimizations
557	#
558	# acc[0] x p256[0..1] = acc[0] x 2^96 - acc[0]
559	# then we add acc[0] and get acc[0] x 2^96
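	# Note that p256 = -1 mod 2^64, so the Montgomery constant
	# n0 = -p256^-1 mod 2^64 is 1 and the multiple of p256 to add is
	# acc[0] itself.  acc[0]*(2^96-1) plus the acc[0] already held in
	# the accumulator is exactly acc[0]*2^96, added at limbs 1 and 2
	# below, while acc[0]*p256[3] is added at limbs 3 and 4; p256[2]
	# is zero and contributes nothing.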
560
561	mov	$acc0, $t1
562	shl	\$32, $acc0
563	mulq	$poly3
564	shr	\$32, $t1
565	add	$acc0, $acc1		# +=acc[0]<<96
566	adc	$t1, $acc2
567	adc	%rax, $acc3
568	 mov	8*1($b_ptr), %rax
569	adc	%rdx, $acc4
570	adc	\$0, $acc5
571	xor	$acc0, $acc0
572
573	########################################################################
574	# Multiply by b[1]
575	mov	%rax, $t1
576	mulq	8*0($a_ptr)
577	add	%rax, $acc1
578	mov	$t1, %rax
579	adc	\$0, %rdx
580	mov	%rdx, $t0
581
582	mulq	8*1($a_ptr)
583	add	$t0, $acc2
584	adc	\$0, %rdx
585	add	%rax, $acc2
586	mov	$t1, %rax
587	adc	\$0, %rdx
588	mov	%rdx, $t0
589
590	mulq	8*2($a_ptr)
591	add	$t0, $acc3
592	adc	\$0, %rdx
593	add	%rax, $acc3
594	mov	$t1, %rax
595	adc	\$0, %rdx
596	mov	%rdx, $t0
597
598	mulq	8*3($a_ptr)
599	add	$t0, $acc4
600	adc	\$0, %rdx
601	add	%rax, $acc4
602	 mov	$acc1, %rax
603	adc	%rdx, $acc5
604	adc	\$0, $acc0
605
606	########################################################################
607	# Second reduction step
608	mov	$acc1, $t1
609	shl	\$32, $acc1
610	mulq	$poly3
611	shr	\$32, $t1
612	add	$acc1, $acc2
613	adc	$t1, $acc3
614	adc	%rax, $acc4
615	 mov	8*2($b_ptr), %rax
616	adc	%rdx, $acc5
617	adc	\$0, $acc0
618	xor	$acc1, $acc1
619
620	########################################################################
621	# Multiply by b[2]
622	mov	%rax, $t1
623	mulq	8*0($a_ptr)
624	add	%rax, $acc2
625	mov	$t1, %rax
626	adc	\$0, %rdx
627	mov	%rdx, $t0
628
629	mulq	8*1($a_ptr)
630	add	$t0, $acc3
631	adc	\$0, %rdx
632	add	%rax, $acc3
633	mov	$t1, %rax
634	adc	\$0, %rdx
635	mov	%rdx, $t0
636
637	mulq	8*2($a_ptr)
638	add	$t0, $acc4
639	adc	\$0, %rdx
640	add	%rax, $acc4
641	mov	$t1, %rax
642	adc	\$0, %rdx
643	mov	%rdx, $t0
644
645	mulq	8*3($a_ptr)
646	add	$t0, $acc5
647	adc	\$0, %rdx
648	add	%rax, $acc5
649	 mov	$acc2, %rax
650	adc	%rdx, $acc0
651	adc	\$0, $acc1
652
653	########################################################################
654	# Third reduction step
655	mov	$acc2, $t1
656	shl	\$32, $acc2
657	mulq	$poly3
658	shr	\$32, $t1
659	add	$acc2, $acc3
660	adc	$t1, $acc4
661	adc	%rax, $acc5
662	 mov	8*3($b_ptr), %rax
663	adc	%rdx, $acc0
664	adc	\$0, $acc1
665	xor	$acc2, $acc2
666
667	########################################################################
668	# Multiply by b[3]
669	mov	%rax, $t1
670	mulq	8*0($a_ptr)
671	add	%rax, $acc3
672	mov	$t1, %rax
673	adc	\$0, %rdx
674	mov	%rdx, $t0
675
676	mulq	8*1($a_ptr)
677	add	$t0, $acc4
678	adc	\$0, %rdx
679	add	%rax, $acc4
680	mov	$t1, %rax
681	adc	\$0, %rdx
682	mov	%rdx, $t0
683
684	mulq	8*2($a_ptr)
685	add	$t0, $acc5
686	adc	\$0, %rdx
687	add	%rax, $acc5
688	mov	$t1, %rax
689	adc	\$0, %rdx
690	mov	%rdx, $t0
691
692	mulq	8*3($a_ptr)
693	add	$t0, $acc0
694	adc	\$0, %rdx
695	add	%rax, $acc0
696	 mov	$acc3, %rax
697	adc	%rdx, $acc1
698	adc	\$0, $acc2
699
700	########################################################################
701	# Final reduction step
702	mov	$acc3, $t1
703	shl	\$32, $acc3
704	mulq	$poly3
705	shr	\$32, $t1
706	add	$acc3, $acc4
707	adc	$t1, $acc5
708	 mov	$acc4, $t0
709	adc	%rax, $acc0
710	adc	%rdx, $acc1
711	 mov	$acc5, $t1
712	adc	\$0, $acc2
713
714	########################################################################
715	# Branch-less conditional subtraction of P
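	# For fully reduced inputs the accumulated value is below 2*p256
	# (the usual Montgomery bound), so one conditional subtraction
	# suffices.  It is done without branching: subtract p256
	# unconditionally and use cmovc to restore the saved words if the
	# subtraction borrowed.  .Lpoly[0] = 2^64-1, which is why it is
	# subtracted as "sub \$-1" (a sign-extended 8-bit immediate), and
	# .Lpoly[2] = 0 needs only "sbb \$0".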
716	sub	\$-1, $acc4		# .Lpoly[0]
717	 mov	$acc0, $t2
718	sbb	$poly1, $acc5		# .Lpoly[1]
719	sbb	\$0, $acc0		# .Lpoly[2]
720	 mov	$acc1, $t3
721	sbb	$poly3, $acc1		# .Lpoly[3]
722	sbb	\$0, $acc2
723
724	cmovc	$t0, $acc4
725	cmovc	$t1, $acc5
726	mov	$acc4, 8*0($r_ptr)
727	cmovc	$t2, $acc0
728	mov	$acc5, 8*1($r_ptr)
729	cmovc	$t3, $acc1
730	mov	$acc0, 8*2($r_ptr)
731	mov	$acc1, 8*3($r_ptr)
732
733	ret
734.size	__ecp_nistz256_mul_montq,.-__ecp_nistz256_mul_montq
735
736################################################################################
737# void ecp_nistz256_sqr_mont(
738#   uint64_t res[4],
739#   uint64_t a[4]);
740
741# We optimize the squaring according to S. Gueron and V. Krasnov,
742# "Speeding up Big-Number Squaring"
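# The off-diagonal products a[i]*a[j], i<j, are computed once and the
# whole partial sum is doubled with a single carry chain before the
# diagonal squares a[i]^2 are added in, i.e.
#
#   a^2 = sum_i a[i]^2 * 2^(128*i) + 2 * sum_{i<j} a[i]*a[j] * 2^(64*(i+j))
#
# The Montgomery reduction that follows is the same as in mul_mont.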
743.globl	ecp_nistz256_sqr_mont
744.type	ecp_nistz256_sqr_mont,\@function,2
745.align	32
746ecp_nistz256_sqr_mont:
747___
748$code.=<<___	if ($addx);
749	mov	\$0x80100, %ecx
750	and	OPENSSL_ia32cap_P+8(%rip), %ecx
751___
752$code.=<<___;
753	push	%rbp
754	push	%rbx
755	push	%r12
756	push	%r13
757	push	%r14
758	push	%r15
759___
760$code.=<<___	if ($addx);
761	cmp	\$0x80100, %ecx
762	je	.Lsqr_montx
763___
764$code.=<<___;
765	mov	8*0($a_ptr), %rax
766	mov	8*1($a_ptr), $acc6
767	mov	8*2($a_ptr), $acc7
768	mov	8*3($a_ptr), $acc0
769
770	call	__ecp_nistz256_sqr_montq
771___
772$code.=<<___	if ($addx);
773	jmp	.Lsqr_mont_done
774
775.align	32
776.Lsqr_montx:
777	mov	8*0($a_ptr), %rdx
778	mov	8*1($a_ptr), $acc6
779	mov	8*2($a_ptr), $acc7
780	mov	8*3($a_ptr), $acc0
781	lea	-128($a_ptr), $a_ptr	# control u-op density
782
783	call	__ecp_nistz256_sqr_montx
784___
785$code.=<<___;
786.Lsqr_mont_done:
787	pop	%r15
788	pop	%r14
789	pop	%r13
790	pop	%r12
791	pop	%rbx
792	pop	%rbp
793	ret
794.size	ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont
795
796.type	__ecp_nistz256_sqr_montq,\@abi-omnipotent
797.align	32
798__ecp_nistz256_sqr_montq:
799	mov	%rax, $acc5
800	mulq	$acc6			# a[1]*a[0]
801	mov	%rax, $acc1
802	mov	$acc7, %rax
803	mov	%rdx, $acc2
804
805	mulq	$acc5			# a[0]*a[2]
806	add	%rax, $acc2
807	mov	$acc0, %rax
808	adc	\$0, %rdx
809	mov	%rdx, $acc3
810
811	mulq	$acc5			# a[0]*a[3]
812	add	%rax, $acc3
813	 mov	$acc7, %rax
814	adc	\$0, %rdx
815	mov	%rdx, $acc4
816
817	#################################
818	mulq	$acc6			# a[1]*a[2]
819	add	%rax, $acc3
820	mov	$acc0, %rax
821	adc	\$0, %rdx
822	mov	%rdx, $t1
823
824	mulq	$acc6			# a[1]*a[3]
825	add	%rax, $acc4
826	 mov	$acc0, %rax
827	adc	\$0, %rdx
828	add	$t1, $acc4
829	mov	%rdx, $acc5
830	adc	\$0, $acc5
831
832	#################################
833	mulq	$acc7			# a[2]*a[3]
834	xor	$acc7, $acc7
835	add	%rax, $acc5
836	 mov	8*0($a_ptr), %rax
837	mov	%rdx, $acc6
838	adc	\$0, $acc6
839
840	add	$acc1, $acc1		# acc1:6<<1
841	adc	$acc2, $acc2
842	adc	$acc3, $acc3
843	adc	$acc4, $acc4
844	adc	$acc5, $acc5
845	adc	$acc6, $acc6
846	adc	\$0, $acc7
847
848	mulq	%rax
849	mov	%rax, $acc0
850	mov	8*1($a_ptr), %rax
851	mov	%rdx, $t0
852
853	mulq	%rax
854	add	$t0, $acc1
855	adc	%rax, $acc2
856	mov	8*2($a_ptr), %rax
857	adc	\$0, %rdx
858	mov	%rdx, $t0
859
860	mulq	%rax
861	add	$t0, $acc3
862	adc	%rax, $acc4
863	mov	8*3($a_ptr), %rax
864	adc	\$0, %rdx
865	mov	%rdx, $t0
866
867	mulq	%rax
868	add	$t0, $acc5
869	adc	%rax, $acc6
870	 mov	$acc0, %rax
871	adc	%rdx, $acc7
872
873	mov	.Lpoly+8*1(%rip), $a_ptr
874	mov	.Lpoly+8*3(%rip), $t1
875
876	##########################################
877	# Now the reduction
878	# First iteration
879	mov	$acc0, $t0
880	shl	\$32, $acc0
881	mulq	$t1
882	shr	\$32, $t0
883	add	$acc0, $acc1		# +=acc[0]<<96
884	adc	$t0, $acc2
885	adc	%rax, $acc3
886	 mov	$acc1, %rax
887	adc	\$0, %rdx
888
889	##########################################
890	# Second iteration
891	mov	$acc1, $t0
892	shl	\$32, $acc1
893	mov	%rdx, $acc0
894	mulq	$t1
895	shr	\$32, $t0
896	add	$acc1, $acc2
897	adc	$t0, $acc3
898	adc	%rax, $acc0
899	 mov	$acc2, %rax
900	adc	\$0, %rdx
901
902	##########################################
903	# Third iteration
904	mov	$acc2, $t0
905	shl	\$32, $acc2
906	mov	%rdx, $acc1
907	mulq	$t1
908	shr	\$32, $t0
909	add	$acc2, $acc3
910	adc	$t0, $acc0
911	adc	%rax, $acc1
912	 mov	$acc3, %rax
913	adc	\$0, %rdx
914
915	###########################################
916	# Last iteration
917	mov	$acc3, $t0
918	shl	\$32, $acc3
919	mov	%rdx, $acc2
920	mulq	$t1
921	shr	\$32, $t0
922	add	$acc3, $acc0
923	adc	$t0, $acc1
924	adc	%rax, $acc2
925	adc	\$0, %rdx
926	xor	$acc3, $acc3
927
928	############################################
929	# Add the rest of the acc
930	add	$acc0, $acc4
931	adc	$acc1, $acc5
932	 mov	$acc4, $acc0
933	adc	$acc2, $acc6
934	adc	%rdx, $acc7
935	 mov	$acc5, $acc1
936	adc	\$0, $acc3
937
938	sub	\$-1, $acc4		# .Lpoly[0]
939	 mov	$acc6, $acc2
940	sbb	$a_ptr, $acc5		# .Lpoly[1]
941	sbb	\$0, $acc6		# .Lpoly[2]
942	 mov	$acc7, $t0
943	sbb	$t1, $acc7		# .Lpoly[3]
944	sbb	\$0, $acc3
945
946	cmovc	$acc0, $acc4
947	cmovc	$acc1, $acc5
948	mov	$acc4, 8*0($r_ptr)
949	cmovc	$acc2, $acc6
950	mov	$acc5, 8*1($r_ptr)
951	cmovc	$t0, $acc7
952	mov	$acc6, 8*2($r_ptr)
953	mov	$acc7, 8*3($r_ptr)
954
955	ret
956.size	__ecp_nistz256_sqr_montq,.-__ecp_nistz256_sqr_montq
957___
958
959if ($addx) {
960$code.=<<___;
961.type	__ecp_nistz256_mul_montx,\@abi-omnipotent
962.align	32
963__ecp_nistz256_mul_montx:
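	# BMI2/ADX code path.  mulx yields a full 64x64->128-bit product
	# without touching the flags, while adcx and adox carry through CF
	# and OF respectively, i.e. two independent carry chains, so the
	# additions of consecutive partial products can be interleaved
	# without saving and restoring carry state.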
964	########################################################################
965	# Multiply by b[0]
966	mulx	$acc1, $acc0, $acc1
967	mulx	$acc2, $t0, $acc2
968	mov	\$32, $poly1
969	xor	$acc5, $acc5		# cf=0
970	mulx	$acc3, $t1, $acc3
971	mov	.Lpoly+8*3(%rip), $poly3
972	adc	$t0, $acc1
973	mulx	$acc4, $t0, $acc4
974	 mov	$acc0, %rdx
975	adc	$t1, $acc2
976	 shlx	$poly1,$acc0,$t1
977	adc	$t0, $acc3
978	 shrx	$poly1,$acc0,$t0
979	adc	\$0, $acc4
980
981	########################################################################
982	# First reduction step
983	add	$t1, $acc1
984	adc	$t0, $acc2
985
986	mulx	$poly3, $t0, $t1
987	 mov	8*1($b_ptr), %rdx
988	adc	$t0, $acc3
989	adc	$t1, $acc4
990	adc	\$0, $acc5
991	xor	$acc0, $acc0		# $acc0=0,cf=0,of=0
992
993	########################################################################
994	# Multiply by b[1]
995	mulx	8*0+128($a_ptr), $t0, $t1
996	adcx	$t0, $acc1
997	adox	$t1, $acc2
998
999	mulx	8*1+128($a_ptr), $t0, $t1
1000	adcx	$t0, $acc2
1001	adox	$t1, $acc3
1002
1003	mulx	8*2+128($a_ptr), $t0, $t1
1004	adcx	$t0, $acc3
1005	adox	$t1, $acc4
1006
1007	mulx	8*3+128($a_ptr), $t0, $t1
1008	 mov	$acc1, %rdx
1009	adcx	$t0, $acc4
1010	 shlx	$poly1, $acc1, $t0
1011	adox	$t1, $acc5
1012	 shrx	$poly1, $acc1, $t1
1013
1014	adcx	$acc0, $acc5
1015	adox	$acc0, $acc0
1016	adc	\$0, $acc0
1017
1018	########################################################################
1019	# Second reduction step
1020	add	$t0, $acc2
1021	adc	$t1, $acc3
1022
1023	mulx	$poly3, $t0, $t1
1024	 mov	8*2($b_ptr), %rdx
1025	adc	$t0, $acc4
1026	adc	$t1, $acc5
1027	adc	\$0, $acc0
1028	xor	$acc1 ,$acc1		# $acc1=0,cf=0,of=0
1029
1030	########################################################################
1031	# Multiply by b[2]
1032	mulx	8*0+128($a_ptr), $t0, $t1
1033	adcx	$t0, $acc2
1034	adox	$t1, $acc3
1035
1036	mulx	8*1+128($a_ptr), $t0, $t1
1037	adcx	$t0, $acc3
1038	adox	$t1, $acc4
1039
1040	mulx	8*2+128($a_ptr), $t0, $t1
1041	adcx	$t0, $acc4
1042	adox	$t1, $acc5
1043
1044	mulx	8*3+128($a_ptr), $t0, $t1
1045	 mov	$acc2, %rdx
1046	adcx	$t0, $acc5
1047	 shlx	$poly1, $acc2, $t0
1048	adox	$t1, $acc0
1049	 shrx	$poly1, $acc2, $t1
1050
1051	adcx	$acc1, $acc0
1052	adox	$acc1, $acc1
1053	adc	\$0, $acc1
1054
1055	########################################################################
1056	# Third reduction step
1057	add	$t0, $acc3
1058	adc	$t1, $acc4
1059
1060	mulx	$poly3, $t0, $t1
1061	 mov	8*3($b_ptr), %rdx
1062	adc	$t0, $acc5
1063	adc	$t1, $acc0
1064	adc	\$0, $acc1
1065	xor	$acc2, $acc2		# $acc2=0,cf=0,of=0
1066
1067	########################################################################
1068	# Multiply by b[3]
1069	mulx	8*0+128($a_ptr), $t0, $t1
1070	adcx	$t0, $acc3
1071	adox	$t1, $acc4
1072
1073	mulx	8*1+128($a_ptr), $t0, $t1
1074	adcx	$t0, $acc4
1075	adox	$t1, $acc5
1076
1077	mulx	8*2+128($a_ptr), $t0, $t1
1078	adcx	$t0, $acc5
1079	adox	$t1, $acc0
1080
1081	mulx	8*3+128($a_ptr), $t0, $t1
1082	 mov	$acc3, %rdx
1083	adcx	$t0, $acc0
1084	 shlx	$poly1, $acc3, $t0
1085	adox	$t1, $acc1
1086	 shrx	$poly1, $acc3, $t1
1087
1088	adcx	$acc2, $acc1
1089	adox	$acc2, $acc2
1090	adc	\$0, $acc2
1091
1092	########################################################################
1093	# Fourth reduction step
1094	add	$t0, $acc4
1095	adc	$t1, $acc5
1096
1097	mulx	$poly3, $t0, $t1
1098	 mov	$acc4, $t2
1099	mov	.Lpoly+8*1(%rip), $poly1
1100	adc	$t0, $acc0
1101	 mov	$acc5, $t3
1102	adc	$t1, $acc1
1103	adc	\$0, $acc2
1104
1105	########################################################################
1106	# Branch-less conditional subtraction of P
1107	xor	%eax, %eax
1108	 mov	$acc0, $t0
1109	sbb	\$-1, $acc4		# .Lpoly[0]
1110	sbb	$poly1, $acc5		# .Lpoly[1]
1111	sbb	\$0, $acc0		# .Lpoly[2]
1112	 mov	$acc1, $t1
1113	sbb	$poly3, $acc1		# .Lpoly[3]
1114	sbb	\$0, $acc2
1115
1116	cmovc	$t2, $acc4
1117	cmovc	$t3, $acc5
1118	mov	$acc4, 8*0($r_ptr)
1119	cmovc	$t0, $acc0
1120	mov	$acc5, 8*1($r_ptr)
1121	cmovc	$t1, $acc1
1122	mov	$acc0, 8*2($r_ptr)
1123	mov	$acc1, 8*3($r_ptr)
1124
1125	ret
1126.size	__ecp_nistz256_mul_montx,.-__ecp_nistz256_mul_montx
1127
1128.type	__ecp_nistz256_sqr_montx,\@abi-omnipotent
1129.align	32
1130__ecp_nistz256_sqr_montx:
1131	mulx	$acc6, $acc1, $acc2	# a[0]*a[1]
1132	mulx	$acc7, $t0, $acc3	# a[0]*a[2]
1133	xor	%eax, %eax
1134	adc	$t0, $acc2
1135	mulx	$acc0, $t1, $acc4	# a[0]*a[3]
1136	 mov	$acc6, %rdx
1137	adc	$t1, $acc3
1138	adc	\$0, $acc4
1139	xor	$acc5, $acc5		# $acc5=0,cf=0,of=0
1140
1141	#################################
1142	mulx	$acc7, $t0, $t1		# a[1]*a[2]
1143	adcx	$t0, $acc3
1144	adox	$t1, $acc4
1145
1146	mulx	$acc0, $t0, $t1		# a[1]*a[3]
1147	 mov	$acc7, %rdx
1148	adcx	$t0, $acc4
1149	adox	$t1, $acc5
1150	adc	\$0, $acc5
1151
1152	#################################
1153	mulx	$acc0, $t0, $acc6	# a[2]*a[3]
1154	 mov	8*0+128($a_ptr), %rdx
1155	xor	$acc7, $acc7		# $acc7=0,cf=0,of=0
1156	 adcx	$acc1, $acc1		# acc1:6<<1
1157	adox	$t0, $acc5
1158	 adcx	$acc2, $acc2
1159	adox	$acc7, $acc6		# of=0
1160
1161	mulx	%rdx, $acc0, $t1
1162	mov	8*1+128($a_ptr), %rdx
1163	 adcx	$acc3, $acc3
1164	adox	$t1, $acc1
1165	 adcx	$acc4, $acc4
1166	mulx	%rdx, $t0, $t4
1167	mov	8*2+128($a_ptr), %rdx
1168	 adcx	$acc5, $acc5
1169	adox	$t0, $acc2
1170	 adcx	$acc6, $acc6
1171	.byte	0x67
1172	mulx	%rdx, $t0, $t1
1173	mov	8*3+128($a_ptr), %rdx
1174	adox	$t4, $acc3
1175	 adcx	$acc7, $acc7
1176	adox	$t0, $acc4
1177	 mov	\$32, $a_ptr
1178	adox	$t1, $acc5
1179	.byte	0x67,0x67
1180	mulx	%rdx, $t0, $t4
1181	 mov	$acc0, %rdx
1182	adox	$t0, $acc6
1183	 shlx	$a_ptr, $acc0, $t0
1184	adox	$t4, $acc7
1185	 shrx	$a_ptr, $acc0, $t4
1186	 mov	.Lpoly+8*3(%rip), $t1
1187
1188	# reduction step 1
1189	add	$t0, $acc1
1190	adc	$t4, $acc2
1191
1192	mulx	$t1, $t0, $acc0
1193	 mov	$acc1, %rdx
1194	adc	$t0, $acc3
1195	 shlx	$a_ptr, $acc1, $t0
1196	adc	\$0, $acc0
1197	 shrx	$a_ptr, $acc1, $t4
1198
1199	# reduction step 2
1200	add	$t0, $acc2
1201	adc	$t4, $acc3
1202
1203	mulx	$t1, $t0, $acc1
1204	 mov	$acc2, %rdx
1205	adc	$t0, $acc0
1206	 shlx	$a_ptr, $acc2, $t0
1207	adc	\$0, $acc1
1208	 shrx	$a_ptr, $acc2, $t4
1209
1210	# reduction step 3
1211	add	$t0, $acc3
1212	adc	$t4, $acc0
1213
1214	mulx	$t1, $t0, $acc2
1215	 mov	$acc3, %rdx
1216	adc	$t0, $acc1
1217	 shlx	$a_ptr, $acc3, $t0
1218	adc	\$0, $acc2
1219	 shrx	$a_ptr, $acc3, $t4
1220
1221	# reduction step 4
1222	add	$t0, $acc0
1223	adc	$t4, $acc1
1224
1225	mulx	$t1, $t0, $acc3
1226	adc	$t0, $acc2
1227	adc	\$0, $acc3
1228
1229	xor	$t3, $t3		# cf=0
1230	adc	$acc0, $acc4		# accumulate upper half
1231	 mov	.Lpoly+8*1(%rip), $a_ptr
1232	adc	$acc1, $acc5
1233	 mov	$acc4, $acc0
1234	adc	$acc2, $acc6
1235	adc	$acc3, $acc7
1236	 mov	$acc5, $acc1
1237	adc	\$0, $t3
1238
1239	xor	%eax, %eax		# cf=0
1240	sbb	\$-1, $acc4		# .Lpoly[0]
1241	 mov	$acc6, $acc2
1242	sbb	$a_ptr, $acc5		# .Lpoly[1]
1243	sbb	\$0, $acc6		# .Lpoly[2]
1244	 mov	$acc7, $acc3
1245	sbb	$t1, $acc7		# .Lpoly[3]
1246	sbb	\$0, $t3
1247
1248	cmovc	$acc0, $acc4
1249	cmovc	$acc1, $acc5
1250	mov	$acc4, 8*0($r_ptr)
1251	cmovc	$acc2, $acc6
1252	mov	$acc5, 8*1($r_ptr)
1253	cmovc	$acc3, $acc7
1254	mov	$acc6, 8*2($r_ptr)
1255	mov	$acc7, 8*3($r_ptr)
1256
1257	ret
1258.size	__ecp_nistz256_sqr_montx,.-__ecp_nistz256_sqr_montx
1259___
1260}
1261}
1262{
1263my ($r_ptr,$in_ptr)=("%rdi","%rsi");
1264my ($acc0,$acc1,$acc2,$acc3)=map("%r$_",(8..11));
1265my ($t0,$t1,$t2)=("%rcx","%r12","%r13");
1266
1267$code.=<<___;
1268################################################################################
1269# void ecp_nistz256_from_mont(
1270#   uint64_t res[4],
1271#   uint64_t in[4]);
1272# This one performs Montgomery multiplication by 1, so we only need the reduction
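# i.e. res = in * 2^-256 mod P, converting out of the Montgomery domain,
# so only the four word-wise reduction iterations and the final
# conditional subtraction remain.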
1273
1274.globl	ecp_nistz256_from_mont
1275.type	ecp_nistz256_from_mont,\@function,2
1276.align	32
1277ecp_nistz256_from_mont:
1278	push	%r12
1279	push	%r13
1280
1281	mov	8*0($in_ptr), %rax
1282	mov	.Lpoly+8*3(%rip), $t2
1283	mov	8*1($in_ptr), $acc1
1284	mov	8*2($in_ptr), $acc2
1285	mov	8*3($in_ptr), $acc3
1286	mov	%rax, $acc0
1287	mov	.Lpoly+8*1(%rip), $t1
1288
1289	#########################################
1290	# First iteration
1291	mov	%rax, $t0
1292	shl	\$32, $acc0
1293	mulq	$t2
1294	shr	\$32, $t0
1295	add	$acc0, $acc1
1296	adc	$t0, $acc2
1297	adc	%rax, $acc3
1298	 mov	$acc1, %rax
1299	adc	\$0, %rdx
1300
1301	#########################################
1302	# Second iteration
1303	mov	$acc1, $t0
1304	shl	\$32, $acc1
1305	mov	%rdx, $acc0
1306	mulq	$t2
1307	shr	\$32, $t0
1308	add	$acc1, $acc2
1309	adc	$t0, $acc3
1310	adc	%rax, $acc0
1311	 mov	$acc2, %rax
1312	adc	\$0, %rdx
1313
1314	##########################################
1315	# Third iteration
1316	mov	$acc2, $t0
1317	shl	\$32, $acc2
1318	mov	%rdx, $acc1
1319	mulq	$t2
1320	shr	\$32, $t0
1321	add	$acc2, $acc3
1322	adc	$t0, $acc0
1323	adc	%rax, $acc1
1324	 mov	$acc3, %rax
1325	adc	\$0, %rdx
1326
1327	###########################################
1328	# Last iteration
1329	mov	$acc3, $t0
1330	shl	\$32, $acc3
1331	mov	%rdx, $acc2
1332	mulq	$t2
1333	shr	\$32, $t0
1334	add	$acc3, $acc0
1335	adc	$t0, $acc1
1336	 mov	$acc0, $t0
1337	adc	%rax, $acc2
1338	 mov	$acc1, $in_ptr
1339	adc	\$0, %rdx
1340
1341	###########################################
1342	# Branch-less conditional subtraction
1343	sub	\$-1, $acc0
1344	 mov	$acc2, %rax
1345	sbb	$t1, $acc1
1346	sbb	\$0, $acc2
1347	 mov	%rdx, $acc3
1348	sbb	$t2, %rdx
1349	sbb	$t2, $t2
1350
1351	cmovnz	$t0, $acc0
1352	cmovnz	$in_ptr, $acc1
1353	mov	$acc0, 8*0($r_ptr)
1354	cmovnz	%rax, $acc2
1355	mov	$acc1, 8*1($r_ptr)
1356	cmovz	%rdx, $acc3
1357	mov	$acc2, 8*2($r_ptr)
1358	mov	$acc3, 8*3($r_ptr)
1359
1360	pop	%r13
1361	pop	%r12
1362	ret
1363.size	ecp_nistz256_from_mont,.-ecp_nistz256_from_mont
1364___
1365}
1366{
1367my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx");
1368my ($ONE,$INDEX,$Ra,$Rb,$Rc,$Rd,$Re,$Rf)=map("%xmm$_",(0..7));
1369my ($M0,$T0a,$T0b,$T0c,$T0d,$T0e,$T0f,$TMP0)=map("%xmm$_",(8..15));
1370my ($M1,$T2a,$T2b,$TMP2,$M2,$T2a,$T2b,$TMP2)=map("%xmm$_",(8..15));
1371
1372$code.=<<___;
1373################################################################################
1374# void ecp_nistz256_select_w5(uint64_t *val, uint64_t *in_t, int index);
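# Constant-time lookup: all 16 table entries are read and combined with
# pcmpeqd/pand/por so that only the entry matching the requested index
# survives; the memory access pattern is therefore independent of the
# (secret) index, and index 0 selects nothing (all-zero output).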
1375.globl	ecp_nistz256_select_w5
1376.type	ecp_nistz256_select_w5,\@abi-omnipotent
1377.align	32
1378ecp_nistz256_select_w5:
1379___
1380$code.=<<___	if ($avx>1);
1381	mov	OPENSSL_ia32cap_P+8(%rip), %eax
1382	test	\$`1<<5`, %eax
1383	jnz	.Lavx2_select_w5
1384___
1385$code.=<<___	if ($win64);
1386	lea	-0x88(%rsp), %rax
1387.LSEH_begin_ecp_nistz256_select_w5:
1388	.byte	0x48,0x8d,0x60,0xe0		#lea	-0x20(%rax), %rsp
1389	.byte	0x0f,0x29,0x70,0xe0		#movaps	%xmm6, -0x20(%rax)
1390	.byte	0x0f,0x29,0x78,0xf0		#movaps	%xmm7, -0x10(%rax)
1391	.byte	0x44,0x0f,0x29,0x00		#movaps	%xmm8, 0(%rax)
1392	.byte	0x44,0x0f,0x29,0x48,0x10	#movaps	%xmm9, 0x10(%rax)
1393	.byte	0x44,0x0f,0x29,0x50,0x20	#movaps	%xmm10, 0x20(%rax)
1394	.byte	0x44,0x0f,0x29,0x58,0x30	#movaps	%xmm11, 0x30(%rax)
1395	.byte	0x44,0x0f,0x29,0x60,0x40	#movaps	%xmm12, 0x40(%rax)
1396	.byte	0x44,0x0f,0x29,0x68,0x50	#movaps	%xmm13, 0x50(%rax)
1397	.byte	0x44,0x0f,0x29,0x70,0x60	#movaps	%xmm14, 0x60(%rax)
1398	.byte	0x44,0x0f,0x29,0x78,0x70	#movaps	%xmm15, 0x70(%rax)
1399___
1400$code.=<<___;
1401	movdqa	.LOne(%rip), $ONE
1402	movd	$index, $INDEX
1403
1404	pxor	$Ra, $Ra
1405	pxor	$Rb, $Rb
1406	pxor	$Rc, $Rc
1407	pxor	$Rd, $Rd
1408	pxor	$Re, $Re
1409	pxor	$Rf, $Rf
1410
1411	movdqa	$ONE, $M0
1412	pshufd	\$0, $INDEX, $INDEX
1413
1414	mov	\$16, %rax
1415.Lselect_loop_sse_w5:
1416
1417	movdqa	$M0, $TMP0
1418	paddd	$ONE, $M0
1419	pcmpeqd $INDEX, $TMP0
1420
1421	movdqa	16*0($in_t), $T0a
1422	movdqa	16*1($in_t), $T0b
1423	movdqa	16*2($in_t), $T0c
1424	movdqa	16*3($in_t), $T0d
1425	movdqa	16*4($in_t), $T0e
1426	movdqa	16*5($in_t), $T0f
1427	lea 16*6($in_t), $in_t
1428
1429	pand	$TMP0, $T0a
1430	pand	$TMP0, $T0b
1431	por	$T0a, $Ra
1432	pand	$TMP0, $T0c
1433	por	$T0b, $Rb
1434	pand	$TMP0, $T0d
1435	por	$T0c, $Rc
1436	pand	$TMP0, $T0e
1437	por	$T0d, $Rd
1438	pand	$TMP0, $T0f
1439	por	$T0e, $Re
1440	por	$T0f, $Rf
1441
1442	dec	%rax
1443	jnz	.Lselect_loop_sse_w5
1444
1445	movdqu	$Ra, 16*0($val)
1446	movdqu	$Rb, 16*1($val)
1447	movdqu	$Rc, 16*2($val)
1448	movdqu	$Rd, 16*3($val)
1449	movdqu	$Re, 16*4($val)
1450	movdqu	$Rf, 16*5($val)
1451___
1452$code.=<<___	if ($win64);
1453	movaps	(%rsp), %xmm6
1454	movaps	0x10(%rsp), %xmm7
1455	movaps	0x20(%rsp), %xmm8
1456	movaps	0x30(%rsp), %xmm9
1457	movaps	0x40(%rsp), %xmm10
1458	movaps	0x50(%rsp), %xmm11
1459	movaps	0x60(%rsp), %xmm12
1460	movaps	0x70(%rsp), %xmm13
1461	movaps	0x80(%rsp), %xmm14
1462	movaps	0x90(%rsp), %xmm15
1463	lea	0xa8(%rsp), %rsp
1464.LSEH_end_ecp_nistz256_select_w5:
1465___
1466$code.=<<___;
1467	ret
1468.size	ecp_nistz256_select_w5,.-ecp_nistz256_select_w5
1469
1470################################################################################
1471# void ecp_nistz256_select_w7(uint64_t *val, uint64_t *in_t, int index);
1472.globl	ecp_nistz256_select_w7
1473.type	ecp_nistz256_select_w7,\@abi-omnipotent
1474.align	32
1475ecp_nistz256_select_w7:
1476___
1477$code.=<<___	if ($avx>1);
1478	mov	OPENSSL_ia32cap_P+8(%rip), %eax
1479	test	\$`1<<5`, %eax
1480	jnz	.Lavx2_select_w7
1481___
1482$code.=<<___	if ($win64);
1483	lea	-0x88(%rsp), %rax
1484.LSEH_begin_ecp_nistz256_select_w7:
1485	.byte	0x48,0x8d,0x60,0xe0		#lea	-0x20(%rax), %rsp
1486	.byte	0x0f,0x29,0x70,0xe0		#movaps	%xmm6, -0x20(%rax)
1487	.byte	0x0f,0x29,0x78,0xf0		#movaps	%xmm7, -0x10(%rax)
1488	.byte	0x44,0x0f,0x29,0x00		#movaps	%xmm8, 0(%rax)
1489	.byte	0x44,0x0f,0x29,0x48,0x10	#movaps	%xmm9, 0x10(%rax)
1490	.byte	0x44,0x0f,0x29,0x50,0x20	#movaps	%xmm10, 0x20(%rax)
1491	.byte	0x44,0x0f,0x29,0x58,0x30	#movaps	%xmm11, 0x30(%rax)
1492	.byte	0x44,0x0f,0x29,0x60,0x40	#movaps	%xmm12, 0x40(%rax)
1493	.byte	0x44,0x0f,0x29,0x68,0x50	#movaps	%xmm13, 0x50(%rax)
1494	.byte	0x44,0x0f,0x29,0x70,0x60	#movaps	%xmm14, 0x60(%rax)
1495	.byte	0x44,0x0f,0x29,0x78,0x70	#movaps	%xmm15, 0x70(%rax)
1496___
1497$code.=<<___;
1498	movdqa	.LOne(%rip), $M0
1499	movd	$index, $INDEX
1500
1501	pxor	$Ra, $Ra
1502	pxor	$Rb, $Rb
1503	pxor	$Rc, $Rc
1504	pxor	$Rd, $Rd
1505
1506	movdqa	$M0, $ONE
1507	pshufd	\$0, $INDEX, $INDEX
1508	mov	\$64, %rax
1509
1510.Lselect_loop_sse_w7:
1511	movdqa	$M0, $TMP0
1512	paddd	$ONE, $M0
1513	movdqa	16*0($in_t), $T0a
1514	movdqa	16*1($in_t), $T0b
1515	pcmpeqd	$INDEX, $TMP0
1516	movdqa	16*2($in_t), $T0c
1517	movdqa	16*3($in_t), $T0d
1518	lea	16*4($in_t), $in_t
1519
1520	pand	$TMP0, $T0a
1521	pand	$TMP0, $T0b
1522	por	$T0a, $Ra
1523	pand	$TMP0, $T0c
1524	por	$T0b, $Rb
1525	pand	$TMP0, $T0d
1526	por	$T0c, $Rc
1527	prefetcht0	255($in_t)
1528	por	$T0d, $Rd
1529
1530	dec	%rax
1531	jnz	.Lselect_loop_sse_w7
1532
1533	movdqu	$Ra, 16*0($val)
1534	movdqu	$Rb, 16*1($val)
1535	movdqu	$Rc, 16*2($val)
1536	movdqu	$Rd, 16*3($val)
1537___
1538$code.=<<___	if ($win64);
1539	movaps	(%rsp), %xmm6
1540	movaps	0x10(%rsp), %xmm7
1541	movaps	0x20(%rsp), %xmm8
1542	movaps	0x30(%rsp), %xmm9
1543	movaps	0x40(%rsp), %xmm10
1544	movaps	0x50(%rsp), %xmm11
1545	movaps	0x60(%rsp), %xmm12
1546	movaps	0x70(%rsp), %xmm13
1547	movaps	0x80(%rsp), %xmm14
1548	movaps	0x90(%rsp), %xmm15
1549	lea	0xa8(%rsp), %rsp
1550.LSEH_end_ecp_nistz256_select_w7:
1551___
1552$code.=<<___;
1553	ret
1554.size	ecp_nistz256_select_w7,.-ecp_nistz256_select_w7
1555___
1556}
1557if ($avx>1) {
1558my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx");
1559my ($TWO,$INDEX,$Ra,$Rb,$Rc)=map("%ymm$_",(0..4));
1560my ($M0,$T0a,$T0b,$T0c,$TMP0)=map("%ymm$_",(5..9));
1561my ($M1,$T1a,$T1b,$T1c,$TMP1)=map("%ymm$_",(10..14));
1562
1563$code.=<<___;
1564################################################################################
1565# void ecp_nistz256_avx2_select_w5(uint64_t *val, uint64_t *in_t, int index);
1566.type	ecp_nistz256_avx2_select_w5,\@abi-omnipotent
1567.align	32
1568ecp_nistz256_avx2_select_w5:
1569.Lavx2_select_w5:
1570	vzeroupper
1571___
1572$code.=<<___	if ($win64);
1573	lea	-0x88(%rsp), %rax
1574.LSEH_begin_ecp_nistz256_avx2_select_w5:
1575	.byte	0x48,0x8d,0x60,0xe0		#lea	-0x20(%rax), %rsp
1576	.byte	0xc5,0xf8,0x29,0x70,0xe0	#vmovaps %xmm6, -0x20(%rax)
1577	.byte	0xc5,0xf8,0x29,0x78,0xf0	#vmovaps %xmm7, -0x10(%rax)
1578	.byte	0xc5,0x78,0x29,0x40,0x00	#vmovaps %xmm8, 0(%rax)
1579	.byte	0xc5,0x78,0x29,0x48,0x10	#vmovaps %xmm9, 0x10(%rax)
1580	.byte	0xc5,0x78,0x29,0x50,0x20	#vmovaps %xmm10, 0x20(%rax)
1581	.byte	0xc5,0x78,0x29,0x58,0x30	#vmovaps %xmm11, 0x30(%rax)
1582	.byte	0xc5,0x78,0x29,0x60,0x40	#vmovaps %xmm12, 0x40(%rax)
1583	.byte	0xc5,0x78,0x29,0x68,0x50	#vmovaps %xmm13, 0x50(%rax)
1584	.byte	0xc5,0x78,0x29,0x70,0x60	#vmovaps %xmm14, 0x60(%rax)
1585	.byte	0xc5,0x78,0x29,0x78,0x70	#vmovaps %xmm15, 0x70(%rax)
1586___
1587$code.=<<___;
1588	vmovdqa	.LTwo(%rip), $TWO
1589
1590	vpxor	$Ra, $Ra, $Ra
1591	vpxor	$Rb, $Rb, $Rb
1592	vpxor	$Rc, $Rc, $Rc
1593
1594	vmovdqa .LOne(%rip), $M0
1595	vmovdqa .LTwo(%rip), $M1
1596
1597	vmovd	$index, %xmm1
1598	vpermd	$INDEX, $Ra, $INDEX
1599
1600	mov	\$8, %rax
1601.Lselect_loop_avx2_w5:
1602
1603	vmovdqa	32*0($in_t), $T0a
1604	vmovdqa	32*1($in_t), $T0b
1605	vmovdqa	32*2($in_t), $T0c
1606
1607	vmovdqa	32*3($in_t), $T1a
1608	vmovdqa	32*4($in_t), $T1b
1609	vmovdqa	32*5($in_t), $T1c
1610
1611	vpcmpeqd	$INDEX, $M0, $TMP0
1612	vpcmpeqd	$INDEX, $M1, $TMP1
1613
1614	vpaddd	$TWO, $M0, $M0
1615	vpaddd	$TWO, $M1, $M1
1616	lea	32*6($in_t), $in_t
1617
1618	vpand	$TMP0, $T0a, $T0a
1619	vpand	$TMP0, $T0b, $T0b
1620	vpand	$TMP0, $T0c, $T0c
1621	vpand	$TMP1, $T1a, $T1a
1622	vpand	$TMP1, $T1b, $T1b
1623	vpand	$TMP1, $T1c, $T1c
1624
1625	vpxor	$T0a, $Ra, $Ra
1626	vpxor	$T0b, $Rb, $Rb
1627	vpxor	$T0c, $Rc, $Rc
1628	vpxor	$T1a, $Ra, $Ra
1629	vpxor	$T1b, $Rb, $Rb
1630	vpxor	$T1c, $Rc, $Rc
1631
1632	dec %rax
1633	jnz .Lselect_loop_avx2_w5
1634
1635	vmovdqu $Ra, 32*0($val)
1636	vmovdqu $Rb, 32*1($val)
1637	vmovdqu $Rc, 32*2($val)
1638	vzeroupper
1639___
1640$code.=<<___	if ($win64);
1641	movaps	(%rsp), %xmm6
1642	movaps	0x10(%rsp), %xmm7
1643	movaps	0x20(%rsp), %xmm8
1644	movaps	0x30(%rsp), %xmm9
1645	movaps	0x40(%rsp), %xmm10
1646	movaps	0x50(%rsp), %xmm11
1647	movaps	0x60(%rsp), %xmm12
1648	movaps	0x70(%rsp), %xmm13
1649	movaps	0x80(%rsp), %xmm14
1650	movaps	0x90(%rsp), %xmm15
1651	lea	0xa8(%rsp), %rsp
1652.LSEH_end_ecp_nistz256_avx2_select_w5:
1653___
1654$code.=<<___;
1655	ret
1656.size	ecp_nistz256_avx2_select_w5,.-ecp_nistz256_avx2_select_w5
1657___
1658}
1659if ($avx>1) {
1660my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx");
1661my ($THREE,$INDEX,$Ra,$Rb)=map("%ymm$_",(0..3));
1662my ($M0,$T0a,$T0b,$TMP0)=map("%ymm$_",(4..7));
1663my ($M1,$T1a,$T1b,$TMP1)=map("%ymm$_",(8..11));
1664my ($M2,$T2a,$T2b,$TMP2)=map("%ymm$_",(12..15));
1665
1666$code.=<<___;
1667
1668################################################################################
1669# void ecp_nistz256_avx2_select_w7(uint64_t *val, uint64_t *in_t, int index);
1670.globl	ecp_nistz256_avx2_select_w7
1671.type	ecp_nistz256_avx2_select_w7,\@abi-omnipotent
1672.align	32
1673ecp_nistz256_avx2_select_w7:
1674.Lavx2_select_w7:
1675	vzeroupper
1676___
1677$code.=<<___	if ($win64);
1678	lea	-0x88(%rsp), %rax
1679.LSEH_begin_ecp_nistz256_avx2_select_w7:
1680	.byte	0x48,0x8d,0x60,0xe0		#lea	-0x20(%rax), %rsp
1681	.byte	0xc5,0xf8,0x29,0x70,0xe0	#vmovaps %xmm6, -0x20(%rax)
1682	.byte	0xc5,0xf8,0x29,0x78,0xf0	#vmovaps %xmm7, -0x10(%rax)
1683	.byte	0xc5,0x78,0x29,0x40,0x00	#vmovaps %xmm8, 0(%rax)
1684	.byte	0xc5,0x78,0x29,0x48,0x10	#vmovaps %xmm9, 0x10(%rax)
1685	.byte	0xc5,0x78,0x29,0x50,0x20	#vmovaps %xmm10, 0x20(%rax)
1686	.byte	0xc5,0x78,0x29,0x58,0x30	#vmovaps %xmm11, 0x30(%rax)
1687	.byte	0xc5,0x78,0x29,0x60,0x40	#vmovaps %xmm12, 0x40(%rax)
1688	.byte	0xc5,0x78,0x29,0x68,0x50	#vmovaps %xmm13, 0x50(%rax)
1689	.byte	0xc5,0x78,0x29,0x70,0x60	#vmovaps %xmm14, 0x60(%rax)
1690	.byte	0xc5,0x78,0x29,0x78,0x70	#vmovaps %xmm15, 0x70(%rax)
1691___
1692$code.=<<___;
1693	vmovdqa	.LThree(%rip), $THREE
1694
1695	vpxor	$Ra, $Ra, $Ra
1696	vpxor	$Rb, $Rb, $Rb
1697
1698	vmovdqa .LOne(%rip), $M0
1699	vmovdqa .LTwo(%rip), $M1
1700	vmovdqa .LThree(%rip), $M2
1701
1702	vmovd	$index, %xmm1
1703	vpermd	$INDEX, $Ra, $INDEX
1704	# Skip index = 0, because it is implicitly the point at infinity
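	# The window-7 table holds 64 points.  Each iteration tests three
	# candidate indices (the three match masks advance by 3), so 21
	# iterations cover entries 1..63 and the 64th entry is picked up
	# separately after the loop.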
1705
1706	mov	\$21, %rax
1707.Lselect_loop_avx2_w7:
1708
1709	vmovdqa	32*0($in_t), $T0a
1710	vmovdqa	32*1($in_t), $T0b
1711
1712	vmovdqa	32*2($in_t), $T1a
1713	vmovdqa	32*3($in_t), $T1b
1714
1715	vmovdqa	32*4($in_t), $T2a
1716	vmovdqa	32*5($in_t), $T2b
1717
1718	vpcmpeqd	$INDEX, $M0, $TMP0
1719	vpcmpeqd	$INDEX, $M1, $TMP1
1720	vpcmpeqd	$INDEX, $M2, $TMP2
1721
1722	vpaddd	$THREE, $M0, $M0
1723	vpaddd	$THREE, $M1, $M1
1724	vpaddd	$THREE, $M2, $M2
1725	lea	32*6($in_t), $in_t
1726
1727	vpand	$TMP0, $T0a, $T0a
1728	vpand	$TMP0, $T0b, $T0b
1729	vpand	$TMP1, $T1a, $T1a
1730	vpand	$TMP1, $T1b, $T1b
1731	vpand	$TMP2, $T2a, $T2a
1732	vpand	$TMP2, $T2b, $T2b
1733
1734	vpxor	$T0a, $Ra, $Ra
1735	vpxor	$T0b, $Rb, $Rb
1736	vpxor	$T1a, $Ra, $Ra
1737	vpxor	$T1b, $Rb, $Rb
1738	vpxor	$T2a, $Ra, $Ra
1739	vpxor	$T2b, $Rb, $Rb
1740
1741	dec %rax
1742	jnz .Lselect_loop_avx2_w7
1743
1744
1745	vmovdqa	32*0($in_t), $T0a
1746	vmovdqa	32*1($in_t), $T0b
1747
1748	vpcmpeqd	$INDEX, $M0, $TMP0
1749
1750	vpand	$TMP0, $T0a, $T0a
1751	vpand	$TMP0, $T0b, $T0b
1752
1753	vpxor	$T0a, $Ra, $Ra
1754	vpxor	$T0b, $Rb, $Rb
1755
1756	vmovdqu $Ra, 32*0($val)
1757	vmovdqu $Rb, 32*1($val)
1758	vzeroupper
1759___
1760$code.=<<___	if ($win64);
1761	movaps	(%rsp), %xmm6
1762	movaps	0x10(%rsp), %xmm7
1763	movaps	0x20(%rsp), %xmm8
1764	movaps	0x30(%rsp), %xmm9
1765	movaps	0x40(%rsp), %xmm10
1766	movaps	0x50(%rsp), %xmm11
1767	movaps	0x60(%rsp), %xmm12
1768	movaps	0x70(%rsp), %xmm13
1769	movaps	0x80(%rsp), %xmm14
1770	movaps	0x90(%rsp), %xmm15
1771	lea	0xa8(%rsp), %rsp
1772.LSEH_end_ecp_nistz256_avx2_select_w7:
1773___
1774$code.=<<___;
1775	ret
1776.size	ecp_nistz256_avx2_select_w7,.-ecp_nistz256_avx2_select_w7
1777___
1778} else {
1779$code.=<<___;
1780.globl	ecp_nistz256_avx2_select_w7
1781.type	ecp_nistz256_avx2_select_w7,\@function,3
1782.align	32
1783ecp_nistz256_avx2_select_w7:
1784	.byte	0x0f,0x0b	# ud2
1785	ret
1786.size	ecp_nistz256_avx2_select_w7,.-ecp_nistz256_avx2_select_w7
1787___
1788}
1789{{{
1790########################################################################
1791# This block implements the higher-level point_double, point_add and
1792# point_add_affine. The key to performance in this case is to allow the
1793# out-of-order execution logic to overlap computations from the next step
1794# with tail processing from the current step. By using a tailored calling
1795# sequence we minimize inter-step overhead and give the processor a better
1796# shot at overlapping operations...
1797#
1798# You will notice that input data is copied to the stack. The trouble is
1799# that there are no registers to spare for holding the original pointers,
1800# and reloading them would create undesired dependencies on the effective
1801# address calculation paths. In other words, it is all done to favour
1802# out-of-order execution logic.
1803#						<appro@openssl.org>
1804
1805my ($r_ptr,$a_ptr,$b_org,$b_ptr)=("%rdi","%rsi","%rdx","%rbx");
1806my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15));
1807my ($t0,$t1,$t2,$t3,$t4)=("%rax","%rbp","%rcx",$acc4,$acc4);
1808my ($poly1,$poly3)=($acc6,$acc7);
1809
1810sub load_for_mul () {
1811my ($a,$b,$src0) = @_;
1812my $bias = $src0 eq "%rax" ? 0 : -128;
1813
1814"	mov	$b, $src0
1815	lea	$b, $b_ptr
1816	mov	8*0+$a, $acc1
1817	mov	8*1+$a, $acc2
1818	lea	$bias+$a, $a_ptr
1819	mov	8*2+$a, $acc3
1820	mov	8*3+$a, $acc4"
1821}
1822
1823sub load_for_sqr () {
1824my ($a,$src0) = @_;
1825my $bias = $src0 eq "%rax" ? 0 : -128;
1826
1827"	mov	8*0+$a, $src0
1828	mov	8*1+$a, $acc6
1829	lea	$bias+$a, $a_ptr
1830	mov	8*2+$a, $acc7
1831	mov	8*3+$a, $acc0"
1832}
1833
1834									{
1835########################################################################
1836# operate in 4-5-0-1 "name space" that matches multiplication output
1837#
1838my ($a0,$a1,$a2,$a3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
1839
1840$code.=<<___;
1841.type	__ecp_nistz256_add_toq,\@abi-omnipotent
1842.align	32
1843__ecp_nistz256_add_toq:
1844	xor	$t4,$t4
1845	add	8*0($b_ptr), $a0
1846	adc	8*1($b_ptr), $a1
1847	 mov	$a0, $t0
1848	adc	8*2($b_ptr), $a2
1849	adc	8*3($b_ptr), $a3
1850	 mov	$a1, $t1
1851	adc	\$0, $t4
1852
1853	sub	\$-1, $a0
1854	 mov	$a2, $t2
1855	sbb	$poly1, $a1
1856	sbb	\$0, $a2
1857	 mov	$a3, $t3
1858	sbb	$poly3, $a3
1859	sbb	\$0, $t4
1860
1861	cmovc	$t0, $a0
1862	cmovc	$t1, $a1
1863	mov	$a0, 8*0($r_ptr)
1864	cmovc	$t2, $a2
1865	mov	$a1, 8*1($r_ptr)
1866	cmovc	$t3, $a3
1867	mov	$a2, 8*2($r_ptr)
1868	mov	$a3, 8*3($r_ptr)
1869
1870	ret
1871.size	__ecp_nistz256_add_toq,.-__ecp_nistz256_add_toq
1872
1873.type	__ecp_nistz256_sub_fromq,\@abi-omnipotent
1874.align	32
1875__ecp_nistz256_sub_fromq:
1876	sub	8*0($b_ptr), $a0
1877	sbb	8*1($b_ptr), $a1
1878	 mov	$a0, $t0
1879	sbb	8*2($b_ptr), $a2
1880	sbb	8*3($b_ptr), $a3
1881	 mov	$a1, $t1
1882	sbb	$t4, $t4
1883
1884	add	\$-1, $a0
1885	 mov	$a2, $t2
1886	adc	$poly1, $a1
1887	adc	\$0, $a2
1888	 mov	$a3, $t3
1889	adc	$poly3, $a3
1890	test	$t4, $t4
1891
1892	cmovz	$t0, $a0
1893	cmovz	$t1, $a1
1894	mov	$a0, 8*0($r_ptr)
1895	cmovz	$t2, $a2
1896	mov	$a1, 8*1($r_ptr)
1897	cmovz	$t3, $a3
1898	mov	$a2, 8*2($r_ptr)
1899	mov	$a3, 8*3($r_ptr)
1900
1901	ret
1902.size	__ecp_nistz256_sub_fromq,.-__ecp_nistz256_sub_fromq
1903
1904.type	__ecp_nistz256_subq,\@abi-omnipotent
1905.align	32
1906__ecp_nistz256_subq:
1907	sub	$a0, $t0
1908	sbb	$a1, $t1
1909	 mov	$t0, $a0
1910	sbb	$a2, $t2
1911	sbb	$a3, $t3
1912	 mov	$t1, $a1
1913	sbb	$t4, $t4
1914
1915	add	\$-1, $t0
1916	 mov	$t2, $a2
1917	adc	$poly1, $t1
1918	adc	\$0, $t2
1919	 mov	$t3, $a3
1920	adc	$poly3, $t3
1921	test	$t4, $t4
1922
1923	cmovnz	$t0, $a0
1924	cmovnz	$t1, $a1
1925	cmovnz	$t2, $a2
1926	cmovnz	$t3, $a3
1927
1928	ret
1929.size	__ecp_nistz256_subq,.-__ecp_nistz256_subq
1930
1931.type	__ecp_nistz256_mul_by_2q,\@abi-omnipotent
1932.align	32
1933__ecp_nistz256_mul_by_2q:
1934	xor	$t4, $t4
1935	add	$a0, $a0		# a0:a3+a0:a3
1936	adc	$a1, $a1
1937	 mov	$a0, $t0
1938	adc	$a2, $a2
1939	adc	$a3, $a3
1940	 mov	$a1, $t1
1941	adc	\$0, $t4
1942
1943	sub	\$-1, $a0
1944	 mov	$a2, $t2
1945	sbb	$poly1, $a1
1946	sbb	\$0, $a2
1947	 mov	$a3, $t3
1948	sbb	$poly3, $a3
1949	sbb	\$0, $t4
1950
1951	cmovc	$t0, $a0
1952	cmovc	$t1, $a1
1953	mov	$a0, 8*0($r_ptr)
1954	cmovc	$t2, $a2
1955	mov	$a1, 8*1($r_ptr)
1956	cmovc	$t3, $a3
1957	mov	$a2, 8*2($r_ptr)
1958	mov	$a3, 8*3($r_ptr)
1959
1960	ret
1961.size	__ecp_nistz256_mul_by_2q,.-__ecp_nistz256_mul_by_2q
1962___
1963									}
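########################################################################
# Point doubling in Jacobian coordinates.  With
#	S = 4*X1*Y1^2,	M = 3*(X1-Z1^2)*(X1+Z1^2)
# the sequence generated below computes (up to operation ordering)
#	X3 = M^2 - 2*S
#	Y3 = M*(S - X3) - 8*Y1^4
#	Z3 = 2*Y1*Z1
# i.e. the usual doubling formulas specialized for a = -3.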
1964sub gen_double () {
1965    my $x = shift;
1966    my ($src0,$sfx,$bias);
1967    my ($S,$M,$Zsqr,$in_x,$tmp0)=map(32*$_,(0..4));
1968
1969    if ($x ne "x") {
1970	$src0 = "%rax";
1971	$sfx  = "";
1972	$bias = 0;
1973
1974$code.=<<___;
1975.globl	ecp_nistz256_point_double
1976.type	ecp_nistz256_point_double,\@function,2
1977.align	32
1978ecp_nistz256_point_double:
1979___
1980$code.=<<___	if ($addx);
1981	mov	\$0x80100, %ecx
1982	and	OPENSSL_ia32cap_P+8(%rip), %ecx
1983	cmp	\$0x80100, %ecx
1984	je	.Lpoint_doublex
1985___
1986    } else {
1987	$src0 = "%rdx";
1988	$sfx  = "x";
1989	$bias = 128;
1990
1991$code.=<<___;
1992.type	ecp_nistz256_point_doublex,\@function,2
1993.align	32
1994ecp_nistz256_point_doublex:
1995.Lpoint_doublex:
1996___
1997    }
1998$code.=<<___;
1999	push	%rbp
2000	push	%rbx
2001	push	%r12
2002	push	%r13
2003	push	%r14
2004	push	%r15
2005	sub	\$32*5+8, %rsp
2006
2007.Lpoint_double_shortcut$x:
2008	movdqu	0x00($a_ptr), %xmm0		# copy	*(P256_POINT *)$a_ptr.x
2009	mov	$a_ptr, $b_ptr			# backup copy
2010	movdqu	0x10($a_ptr), %xmm1
2011	 mov	0x20+8*0($a_ptr), $acc4		# load in_y in "5-4-0-1" order
2012	 mov	0x20+8*1($a_ptr), $acc5
2013	 mov	0x20+8*2($a_ptr), $acc0
2014	 mov	0x20+8*3($a_ptr), $acc1
2015	 mov	.Lpoly+8*1(%rip), $poly1
2016	 mov	.Lpoly+8*3(%rip), $poly3
2017	movdqa	%xmm0, $in_x(%rsp)
2018	movdqa	%xmm1, $in_x+0x10(%rsp)
2019	lea	0x20($r_ptr), $acc2
2020	lea	0x40($r_ptr), $acc3
2021	movq	$r_ptr, %xmm0
2022	movq	$acc2, %xmm1
2023	movq	$acc3, %xmm2
2024
2025	lea	$S(%rsp), $r_ptr
2026	call	__ecp_nistz256_mul_by_2$x	# p256_mul_by_2(S, in_y);
2027
2028	mov	0x40+8*0($a_ptr), $src0
2029	mov	0x40+8*1($a_ptr), $acc6
2030	mov	0x40+8*2($a_ptr), $acc7
2031	mov	0x40+8*3($a_ptr), $acc0
2032	lea	0x40-$bias($a_ptr), $a_ptr
2033	lea	$Zsqr(%rsp), $r_ptr
2034	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Zsqr, in_z);
2035
2036	`&load_for_sqr("$S(%rsp)", "$src0")`
2037	lea	$S(%rsp), $r_ptr
2038	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(S, S);
2039
2040	mov	0x20($b_ptr), $src0		# $b_ptr is still valid
2041	mov	0x40+8*0($b_ptr), $acc1
2042	mov	0x40+8*1($b_ptr), $acc2
2043	mov	0x40+8*2($b_ptr), $acc3
2044	mov	0x40+8*3($b_ptr), $acc4
2045	lea	0x40-$bias($b_ptr), $a_ptr
2046	lea	0x20($b_ptr), $b_ptr
2047	movq	%xmm2, $r_ptr
2048	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(res_z, in_z, in_y);
2049	call	__ecp_nistz256_mul_by_2$x	# p256_mul_by_2(res_z, res_z);
2050
2051	mov	$in_x+8*0(%rsp), $acc4		# "5-4-0-1" order
2052	mov	$in_x+8*1(%rsp), $acc5
2053	lea	$Zsqr(%rsp), $b_ptr
2054	mov	$in_x+8*2(%rsp), $acc0
2055	mov	$in_x+8*3(%rsp), $acc1
2056	lea	$M(%rsp), $r_ptr
2057	call	__ecp_nistz256_add_to$x		# p256_add(M, in_x, Zsqr);
2058
2059	mov	$in_x+8*0(%rsp), $acc4		# "5-4-0-1" order
2060	mov	$in_x+8*1(%rsp), $acc5
2061	lea	$Zsqr(%rsp), $b_ptr
2062	mov	$in_x+8*2(%rsp), $acc0
2063	mov	$in_x+8*3(%rsp), $acc1
2064	lea	$Zsqr(%rsp), $r_ptr
2065	call	__ecp_nistz256_sub_from$x	# p256_sub(Zsqr, in_x, Zsqr);
2066
2067	`&load_for_sqr("$S(%rsp)", "$src0")`
2068	movq	%xmm1, $r_ptr
2069	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(res_y, S);
2070___
2071{
2072######## ecp_nistz256_div_by_2(res_y, res_y); ##########################
2073# operate in 4-5-6-7 "name space" that matches squaring output
2074#
2075my ($poly1,$poly3)=($a_ptr,$t1);
2076my ($a0,$a1,$a2,$a3,$t3,$t4,$t1)=($acc4,$acc5,$acc6,$acc7,$acc0,$acc1,$acc2);
2077
2078$code.=<<___;
2079	xor	$t4, $t4
2080	mov	$a0, $t0
2081	add	\$-1, $a0
2082	mov	$a1, $t1
2083	adc	$poly1, $a1
2084	mov	$a2, $t2
2085	adc	\$0, $a2
2086	mov	$a3, $t3
2087	adc	$poly3, $a3
2088	adc	\$0, $t4
2089	xor	$a_ptr, $a_ptr		# borrow $a_ptr
2090	test	\$1, $t0
2091
2092	cmovz	$t0, $a0
2093	cmovz	$t1, $a1
2094	cmovz	$t2, $a2
2095	cmovz	$t3, $a3
2096	cmovz	$a_ptr, $t4
2097
2098	mov	$a1, $t0		# a0:a3>>1
2099	shr	\$1, $a0
2100	shl	\$63, $t0
2101	mov	$a2, $t1
2102	shr	\$1, $a1
2103	or	$t0, $a0
2104	shl	\$63, $t1
2105	mov	$a3, $t2
2106	shr	\$1, $a2
2107	or	$t1, $a1
2108	shl	\$63, $t2
2109	mov	$a0, 8*0($r_ptr)
2110	shr	\$1, $a3
2111	mov	$a1, 8*1($r_ptr)
2112	shl	\$63, $t4
2113	or	$t2, $a2
2114	or	$t4, $a3
2115	mov	$a2, 8*2($r_ptr)
2116	mov	$a3, 8*3($r_ptr)
2117___
2118}
2119$code.=<<___;
2120	`&load_for_mul("$M(%rsp)", "$Zsqr(%rsp)", "$src0")`
2121	lea	$M(%rsp), $r_ptr
2122	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(M, M, Zsqr);
2123
2124	lea	$tmp0(%rsp), $r_ptr
2125	call	__ecp_nistz256_mul_by_2$x
2126
2127	lea	$M(%rsp), $b_ptr
2128	lea	$M(%rsp), $r_ptr
2129	call	__ecp_nistz256_add_to$x		# p256_mul_by_3(M, M);
2130
2131	`&load_for_mul("$S(%rsp)", "$in_x(%rsp)", "$src0")`
2132	lea	$S(%rsp), $r_ptr
2133	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S, S, in_x);
2134
2135	lea	$tmp0(%rsp), $r_ptr
2136	call	__ecp_nistz256_mul_by_2$x	# p256_mul_by_2(tmp0, S);
2137
2138	`&load_for_sqr("$M(%rsp)", "$src0")`
2139	movq	%xmm0, $r_ptr
2140	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(res_x, M);
2141
2142	lea	$tmp0(%rsp), $b_ptr
2143	mov	$acc6, $acc0			# harmonize sqr output and sub input
2144	mov	$acc7, $acc1
2145	mov	$a_ptr, $poly1
2146	mov	$t1, $poly3
2147	call	__ecp_nistz256_sub_from$x	# p256_sub(res_x, res_x, tmp0);
2148
2149	mov	$S+8*0(%rsp), $t0
2150	mov	$S+8*1(%rsp), $t1
2151	mov	$S+8*2(%rsp), $t2
2152	mov	$S+8*3(%rsp), $acc2		# "4-5-0-1" order
2153	lea	$S(%rsp), $r_ptr
2154	call	__ecp_nistz256_sub$x		# p256_sub(S, S, res_x);
2155
2156	mov	$M(%rsp), $src0
2157	lea	$M(%rsp), $b_ptr
2158	mov	$acc4, $acc6			# harmonize sub output and mul input
2159	xor	%ecx, %ecx
2160	mov	$acc4, $S+8*0(%rsp)		# have to save:-(
2161	mov	$acc5, $acc2
2162	mov	$acc5, $S+8*1(%rsp)
2163	cmovz	$acc0, $acc3
2164	mov	$acc0, $S+8*2(%rsp)
2165	lea	$S-$bias(%rsp), $a_ptr
2166	cmovz	$acc1, $acc4
2167	mov	$acc1, $S+8*3(%rsp)
2168	mov	$acc6, $acc1
2169	lea	$S(%rsp), $r_ptr
2170	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S, S, M);
2171
2172	movq	%xmm1, $b_ptr
2173	movq	%xmm1, $r_ptr
2174	call	__ecp_nistz256_sub_from$x	# p256_sub(res_y, S, res_y);
2175
2176	add	\$32*5+8, %rsp
2177	pop	%r15
2178	pop	%r14
2179	pop	%r13
2180	pop	%r12
2181	pop	%rbx
2182	pop	%rbp
2183	ret
2184.size	ecp_nistz256_point_double$sfx,.-ecp_nistz256_point_double$sfx
2185___
2186}
2187&gen_double("q");
2188
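########################################################################
# Full point addition in Jacobian coordinates.  With
#	U1 = X1*Z2^2,	S1 = Y1*Z2^3,	U2 = X2*Z1^2,	S2 = Y2*Z1^3,
#	H = U2 - U1,	R = S2 - S1,
# the sequence generated below computes
#	X3 = R^2 - H^3 - 2*U1*H^2
#	Y3 = R*(U1*H^2 - X3) - S1*H^3
#	Z3 = Z1*Z2*H
# Inputs at infinity are handled via the in1infty/in2infty masks, and
# equal inputs are routed through the point-doubling shortcut above.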
2189sub gen_add () {
2190    my $x = shift;
2191    my ($src0,$sfx,$bias);
2192    my ($H,$Hsqr,$R,$Rsqr,$Hcub,
2193	$U1,$U2,$S1,$S2,
2194	$res_x,$res_y,$res_z,
2195	$in1_x,$in1_y,$in1_z,
2196	$in2_x,$in2_y,$in2_z)=map(32*$_,(0..17));
2197    my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);
2198
2199    if ($x ne "x") {
2200	$src0 = "%rax";
2201	$sfx  = "";
2202	$bias = 0;
2203
2204$code.=<<___;
2205.globl	ecp_nistz256_point_add
2206.type	ecp_nistz256_point_add,\@function,3
2207.align	32
2208ecp_nistz256_point_add:
2209___
2210$code.=<<___	if ($addx);
2211	mov	\$0x80100, %ecx
2212	and	OPENSSL_ia32cap_P+8(%rip), %ecx
2213	cmp	\$0x80100, %ecx
2214	je	.Lpoint_addx
2215___
2216    } else {
2217	$src0 = "%rdx";
2218	$sfx  = "x";
2219	$bias = 128;
2220
2221$code.=<<___;
2222.type	ecp_nistz256_point_addx,\@function,3
2223.align	32
2224ecp_nistz256_point_addx:
2225.Lpoint_addx:
2226___
2227    }
2228$code.=<<___;
2229	push	%rbp
2230	push	%rbx
2231	push	%r12
2232	push	%r13
2233	push	%r14
2234	push	%r15
2235	sub	\$32*18+8, %rsp
2236
2237	movdqu	0x00($a_ptr), %xmm0		# copy	*(P256_POINT *)$a_ptr
2238	movdqu	0x10($a_ptr), %xmm1
2239	movdqu	0x20($a_ptr), %xmm2
2240	movdqu	0x30($a_ptr), %xmm3
2241	movdqu	0x40($a_ptr), %xmm4
2242	movdqu	0x50($a_ptr), %xmm5
2243	mov	$a_ptr, $b_ptr			# reassign
2244	mov	$b_org, $a_ptr			# reassign
2245	movdqa	%xmm0, $in1_x(%rsp)
2246	movdqa	%xmm1, $in1_x+0x10(%rsp)
2247	movdqa	%xmm2, $in1_y(%rsp)
2248	movdqa	%xmm3, $in1_y+0x10(%rsp)
2249	movdqa	%xmm4, $in1_z(%rsp)
2250	movdqa	%xmm5, $in1_z+0x10(%rsp)
2251	por	%xmm4, %xmm5
2252
2253	movdqu	0x00($a_ptr), %xmm0		# copy	*(P256_POINT *)$b_org
2254	 pshufd	\$0xb1, %xmm5, %xmm3
2255	movdqu	0x10($a_ptr), %xmm1
2256	movdqu	0x20($a_ptr), %xmm2
2257	 por	%xmm3, %xmm5
2258	movdqu	0x30($a_ptr), %xmm3
2259	 mov	0x40+8*0($a_ptr), $src0		# load original in2_z
2260	 mov	0x40+8*1($a_ptr), $acc6
2261	 mov	0x40+8*2($a_ptr), $acc7
2262	 mov	0x40+8*3($a_ptr), $acc0
2263	movdqa	%xmm0, $in2_x(%rsp)
2264	 pshufd	\$0x1e, %xmm5, %xmm4
2265	movdqa	%xmm1, $in2_x+0x10(%rsp)
2266	movdqu	0x40($a_ptr),%xmm0		# in2_z again
2267	movdqu	0x50($a_ptr),%xmm1
2268	movdqa	%xmm2, $in2_y(%rsp)
2269	movdqa	%xmm3, $in2_y+0x10(%rsp)
2270	 por	%xmm4, %xmm5
2271	 pxor	%xmm4, %xmm4
2272	por	%xmm0, %xmm1
2273	 movq	$r_ptr, %xmm0			# save $r_ptr
2274
2275	lea	0x40-$bias($a_ptr), $a_ptr	# $a_ptr is still valid
2276	 mov	$src0, $in2_z+8*0(%rsp)		# make in2_z copy
2277	 mov	$acc6, $in2_z+8*1(%rsp)
2278	 mov	$acc7, $in2_z+8*2(%rsp)
2279	 mov	$acc0, $in2_z+8*3(%rsp)
2280	lea	$Z2sqr(%rsp), $r_ptr		# Z2^2
2281	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Z2sqr, in2_z);
2282
2283	pcmpeqd	%xmm4, %xmm5
2284	pshufd	\$0xb1, %xmm1, %xmm4
2285	por	%xmm1, %xmm4
2286	pshufd	\$0, %xmm5, %xmm5		# in1infty
2287	pshufd	\$0x1e, %xmm4, %xmm3
2288	por	%xmm3, %xmm4
2289	pxor	%xmm3, %xmm3
2290	pcmpeqd	%xmm3, %xmm4
2291	pshufd	\$0, %xmm4, %xmm4		# in2infty
2292	 mov	0x40+8*0($b_ptr), $src0		# load original in1_z
2293	 mov	0x40+8*1($b_ptr), $acc6
2294	 mov	0x40+8*2($b_ptr), $acc7
2295	 mov	0x40+8*3($b_ptr), $acc0
2296	movq	$b_ptr, %xmm1
2297
2298	lea	0x40-$bias($b_ptr), $a_ptr
2299	lea	$Z1sqr(%rsp), $r_ptr		# Z1^2
2300	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Z1sqr, in1_z);
2301
2302	`&load_for_mul("$Z2sqr(%rsp)", "$in2_z(%rsp)", "$src0")`
2303	lea	$S1(%rsp), $r_ptr		# S1 = Z2^3
2304	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S1, Z2sqr, in2_z);
2305
2306	`&load_for_mul("$Z1sqr(%rsp)", "$in1_z(%rsp)", "$src0")`
2307	lea	$S2(%rsp), $r_ptr		# S2 = Z1^3
2308	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, Z1sqr, in1_z);
2309
2310	`&load_for_mul("$S1(%rsp)", "$in1_y(%rsp)", "$src0")`
2311	lea	$S1(%rsp), $r_ptr		# S1 = Y1*Z2^3
2312	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S1, S1, in1_y);
2313
2314	`&load_for_mul("$S2(%rsp)", "$in2_y(%rsp)", "$src0")`
2315	lea	$S2(%rsp), $r_ptr		# S2 = Y2*Z1^3
2316	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, S2, in2_y);
2317
2318	lea	$S1(%rsp), $b_ptr
2319	lea	$R(%rsp), $r_ptr		# R = S2 - S1
2320	call	__ecp_nistz256_sub_from$x	# p256_sub(R, S2, S1);
2321
2322	or	$acc5, $acc4			# see if result is zero
2323	movdqa	%xmm4, %xmm2
2324	or	$acc0, $acc4
2325	or	$acc1, $acc4
2326	por	%xmm5, %xmm2			# in1infty || in2infty
2327	movq	$acc4, %xmm3
2328
2329	`&load_for_mul("$Z2sqr(%rsp)", "$in1_x(%rsp)", "$src0")`
2330	lea	$U1(%rsp), $r_ptr		# U1 = X1*Z2^2
2331	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(U1, in1_x, Z2sqr);
2332
2333	`&load_for_mul("$Z1sqr(%rsp)", "$in2_x(%rsp)", "$src0")`
2334	lea	$U2(%rsp), $r_ptr		# U2 = X2*Z1^2
2335	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(U2, in2_x, Z1sqr);
2336
2337	lea	$U1(%rsp), $b_ptr
2338	lea	$H(%rsp), $r_ptr		# H = U2 - U1
2339	call	__ecp_nistz256_sub_from$x	# p256_sub(H, U2, U1);
2340
2341	or	$acc5, $acc4			# see if result is zero
2342	or	$acc0, $acc4
2343	or	$acc1, $acc4
2344
2345	.byte	0x3e				# predict taken
2346	jnz	.Ladd_proceed$x			# is_equal(U1,U2)?
2347	movq	%xmm2, $acc0
2348	movq	%xmm3, $acc1
2349	test	$acc0, $acc0
2350	jnz	.Ladd_proceed$x			# (in1infty || in2infty)?
2351	test	$acc1, $acc1
2352	jz	.Ladd_double$x			# is_equal(S1,S2)?
2353
2354	movq	%xmm0, $r_ptr			# restore $r_ptr
2355	pxor	%xmm0, %xmm0
2356	movdqu	%xmm0, 0x00($r_ptr)
2357	movdqu	%xmm0, 0x10($r_ptr)
2358	movdqu	%xmm0, 0x20($r_ptr)
2359	movdqu	%xmm0, 0x30($r_ptr)
2360	movdqu	%xmm0, 0x40($r_ptr)
2361	movdqu	%xmm0, 0x50($r_ptr)
2362	jmp	.Ladd_done$x
2363
2364.align	32
2365.Ladd_double$x:
2366	movq	%xmm1, $a_ptr			# restore $a_ptr
2367	movq	%xmm0, $r_ptr			# restore $r_ptr
2368	add	\$`32*(18-5)`, %rsp		# difference in frame sizes
2369	jmp	.Lpoint_double_shortcut$x
2370
2371.align	32
2372.Ladd_proceed$x:
2373	`&load_for_sqr("$R(%rsp)", "$src0")`
2374	lea	$Rsqr(%rsp), $r_ptr		# R^2
2375	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Rsqr, R);
2376
2377	`&load_for_mul("$H(%rsp)", "$in1_z(%rsp)", "$src0")`
2378	lea	$res_z(%rsp), $r_ptr		# Z3 = H*Z1*Z2
2379	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(res_z, H, in1_z);
2380
2381	`&load_for_sqr("$H(%rsp)", "$src0")`
2382	lea	$Hsqr(%rsp), $r_ptr		# H^2
2383	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Hsqr, H);
2384
2385	`&load_for_mul("$res_z(%rsp)", "$in2_z(%rsp)", "$src0")`
2386	lea	$res_z(%rsp), $r_ptr		# Z3 = H*Z1*Z2
2387	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(res_z, res_z, in2_z);
2388
2389	`&load_for_mul("$Hsqr(%rsp)", "$H(%rsp)", "$src0")`
2390	lea	$Hcub(%rsp), $r_ptr		# H^3
2391	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(Hcub, Hsqr, H);
2392
2393	`&load_for_mul("$Hsqr(%rsp)", "$U1(%rsp)", "$src0")`
2394	lea	$U2(%rsp), $r_ptr		# U1*H^2
2395	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(U2, U1, Hsqr);
2396___
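# Note: the aliases below lay ($acc0..$acc3,$t3,$t4) over ($acc4,$acc5,
# $acc0,$acc1,$acc2,$acc3) because the preceding __ecp_nistz256_mul_mont
# call leaves its result in $acc4,$acc5,$acc0,$acc1.  That lets the doubling
# of U2 (the commented-out __ecp_nistz256_mul_by_2 call) be performed in
# place on the multiplication output, without shuffling registers first.
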
2397{
2398#######################################################################
2399# operate in 4-5-0-1 "name space" that matches multiplication output
2400#
2401my ($acc0,$acc1,$acc2,$acc3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
2402my ($poly1, $poly3)=($acc6,$acc7);
2403
2404$code.=<<___;
2405	#lea	$U2(%rsp), $a_ptr
2406	#lea	$Hsqr(%rsp), $r_ptr	# 2*U1*H^2
2407	#call	__ecp_nistz256_mul_by_2	# ecp_nistz256_mul_by_2(Hsqr, U2);
2408
2409	xor	$t4, $t4
2410	add	$acc0, $acc0		# a0:a3+a0:a3
2411	lea	$Rsqr(%rsp), $a_ptr
2412	adc	$acc1, $acc1
2413	 mov	$acc0, $t0
2414	adc	$acc2, $acc2
2415	adc	$acc3, $acc3
2416	 mov	$acc1, $t1
2417	adc	\$0, $t4
2418
2419	sub	\$-1, $acc0
2420	 mov	$acc2, $t2
2421	sbb	$poly1, $acc1
2422	sbb	\$0, $acc2
2423	 mov	$acc3, $t3
2424	sbb	$poly3, $acc3
2425	sbb	\$0, $t4
2426
2427	cmovc	$t0, $acc0
2428	mov	8*0($a_ptr), $t0
2429	cmovc	$t1, $acc1
2430	mov	8*1($a_ptr), $t1
2431	cmovc	$t2, $acc2
2432	mov	8*2($a_ptr), $t2
2433	cmovc	$t3, $acc3
2434	mov	8*3($a_ptr), $t3
2435
2436	call	__ecp_nistz256_sub$x		# p256_sub(res_x, Rsqr, Hsqr);
2437
2438	lea	$Hcub(%rsp), $b_ptr
2439	lea	$res_x(%rsp), $r_ptr
2440	call	__ecp_nistz256_sub_from$x	# p256_sub(res_x, res_x, Hcub);
2441
2442	mov	$U2+8*0(%rsp), $t0
2443	mov	$U2+8*1(%rsp), $t1
2444	mov	$U2+8*2(%rsp), $t2
2445	mov	$U2+8*3(%rsp), $t3
2446	lea	$res_y(%rsp), $r_ptr
2447
2448	call	__ecp_nistz256_sub$x		# p256_sub(res_y, U2, res_x);
2449
2450	mov	$acc0, 8*0($r_ptr)		# save the result, as
2451	mov	$acc1, 8*1($r_ptr)		# __ecp_nistz256_sub doesn't store it
2452	mov	$acc2, 8*2($r_ptr)
2453	mov	$acc3, 8*3($r_ptr)
2454___
2455}
2456$code.=<<___;
2457	`&load_for_mul("$S1(%rsp)", "$Hcub(%rsp)", "$src0")`
2458	lea	$S2(%rsp), $r_ptr
2459	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, S1, Hcub);
2460
2461	`&load_for_mul("$R(%rsp)", "$res_y(%rsp)", "$src0")`
2462	lea	$res_y(%rsp), $r_ptr
2463	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(res_y, R, res_y);
2464
2465	lea	$S2(%rsp), $b_ptr
2466	lea	$res_y(%rsp), $r_ptr
2467	call	__ecp_nistz256_sub_from$x	# p256_sub(res_y, res_y, S2);
2468
2469	movq	%xmm0, $r_ptr		# restore $r_ptr
2470
2471	movdqa	%xmm5, %xmm0		# copy_conditional(res_z, in2_z, in1infty);
2472	movdqa	%xmm5, %xmm1
2473	pandn	$res_z(%rsp), %xmm0
2474	movdqa	%xmm5, %xmm2
2475	pandn	$res_z+0x10(%rsp), %xmm1
2476	movdqa	%xmm5, %xmm3
2477	pand	$in2_z(%rsp), %xmm2
2478	pand	$in2_z+0x10(%rsp), %xmm3
2479	por	%xmm0, %xmm2
2480	por	%xmm1, %xmm3
2481
2482	movdqa	%xmm4, %xmm0		# copy_conditional(res_z, in1_z, in2infty);
2483	movdqa	%xmm4, %xmm1
2484	pandn	%xmm2, %xmm0
2485	movdqa	%xmm4, %xmm2
2486	pandn	%xmm3, %xmm1
2487	movdqa	%xmm4, %xmm3
2488	pand	$in1_z(%rsp), %xmm2
2489	pand	$in1_z+0x10(%rsp), %xmm3
2490	por	%xmm0, %xmm2
2491	por	%xmm1, %xmm3
2492	movdqu	%xmm2, 0x40($r_ptr)
2493	movdqu	%xmm3, 0x50($r_ptr)
2494
2495	movdqa	%xmm5, %xmm0		# copy_conditional(res_x, in2_x, in1infty);
2496	movdqa	%xmm5, %xmm1
2497	pandn	$res_x(%rsp), %xmm0
2498	movdqa	%xmm5, %xmm2
2499	pandn	$res_x+0x10(%rsp), %xmm1
2500	movdqa	%xmm5, %xmm3
2501	pand	$in2_x(%rsp), %xmm2
2502	pand	$in2_x+0x10(%rsp), %xmm3
2503	por	%xmm0, %xmm2
2504	por	%xmm1, %xmm3
2505
2506	movdqa	%xmm4, %xmm0		# copy_conditional(res_x, in1_x, in2infty);
2507	movdqa	%xmm4, %xmm1
2508	pandn	%xmm2, %xmm0
2509	movdqa	%xmm4, %xmm2
2510	pandn	%xmm3, %xmm1
2511	movdqa	%xmm4, %xmm3
2512	pand	$in1_x(%rsp), %xmm2
2513	pand	$in1_x+0x10(%rsp), %xmm3
2514	por	%xmm0, %xmm2
2515	por	%xmm1, %xmm3
2516	movdqu	%xmm2, 0x00($r_ptr)
2517	movdqu	%xmm3, 0x10($r_ptr)
2518
2519	movdqa	%xmm5, %xmm0		# copy_conditional(res_y, in2_y, in1infty);
2520	movdqa	%xmm5, %xmm1
2521	pandn	$res_y(%rsp), %xmm0
2522	movdqa	%xmm5, %xmm2
2523	pandn	$res_y+0x10(%rsp), %xmm1
2524	movdqa	%xmm5, %xmm3
2525	pand	$in2_y(%rsp), %xmm2
2526	pand	$in2_y+0x10(%rsp), %xmm3
2527	por	%xmm0, %xmm2
2528	por	%xmm1, %xmm3
2529
2530	movdqa	%xmm4, %xmm0		# copy_conditional(res_y, in1_y, in2infty);
2531	movdqa	%xmm4, %xmm1
2532	pandn	%xmm2, %xmm0
2533	movdqa	%xmm4, %xmm2
2534	pandn	%xmm3, %xmm1
2535	movdqa	%xmm4, %xmm3
2536	pand	$in1_y(%rsp), %xmm2
2537	pand	$in1_y+0x10(%rsp), %xmm3
2538	por	%xmm0, %xmm2
2539	por	%xmm1, %xmm3
2540	movdqu	%xmm2, 0x20($r_ptr)
2541	movdqu	%xmm3, 0x30($r_ptr)
2542
2543.Ladd_done$x:
2544	add	\$32*18+8, %rsp
2545	pop	%r15
2546	pop	%r14
2547	pop	%r13
2548	pop	%r12
2549	pop	%rbx
2550	pop	%rbp
2551	ret
2552.size	ecp_nistz256_point_add$sfx,.-ecp_nistz256_point_add$sfx
2553___
2554}
2555&gen_add("q");
2556
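# For reference, a plain-Perl model of the full Jacobian addition emitted by
# gen_add(), following the comments above: U1 = X1*Z2^2, U2 = X2*Z1^2,
# S1 = Y1*Z2^3, S2 = Y2*Z1^3, H = U2-U1, R = S2-S1, then
# X3 = R^2 - H^3 - 2*U1*H^2, Y3 = R*(U1*H^2 - X3) - S1*H^3, Z3 = H*Z1*Z2.
# The assembly tests H and R with the .Ladd_proceed/.Ladd_double branches
# and folds the infinity cases in with masked copies at the end; the early
# returns below only model that behaviour.  This sub is never called by the
# generator; Math::BigInt and the __ref_* names are assumptions of this
# sketch.
sub __ref_point_add {
	require Math::BigInt;
	my $p = Math::BigInt->from_hex(
	    "ffffffff00000001000000000000000000000000ffffffffffffffffffffffff");
	my ($X1,$Y1,$Z1, $X2,$Y2,$Z2) = map { Math::BigInt->new($_) } @_;

	return ($X2, $Y2, $Z2) if $Z1 == 0;	# in1infty
	return ($X1, $Y1, $Z1) if $Z2 == 0;	# in2infty

	my $Z2sqr = ($Z2 * $Z2) % $p;		# p256_sqr_mont(Z2sqr, in2_z)
	my $Z1sqr = ($Z1 * $Z1) % $p;		# p256_sqr_mont(Z1sqr, in1_z)
	my $S1 = ($Y1 * $Z2sqr * $Z2) % $p;	# S1 = Y1*Z2^3
	my $S2 = ($Y2 * $Z1sqr * $Z1) % $p;	# S2 = Y2*Z1^3
	my $R  = ($S2 - $S1) % $p;		# p256_sub(R, S2, S1)
	my $U1 = ($X1 * $Z2sqr) % $p;		# U1 = X1*Z2^2
	my $U2 = ($X2 * $Z1sqr) % $p;		# U2 = X2*Z1^2
	my $H  = ($U2 - $U1) % $p;		# p256_sub(H, U2, U1)

	if ($H == 0) {				# same x coordinate
		return $R == 0 ? __ref_point_double($X1, $Y1, $Z1)  # P == Q
		               : (0, 0, 0);	# P == -Q: all-zero output
	}

	my $Hsqr  = ($H * $H) % $p;
	my $Hcub  = ($Hsqr * $H) % $p;
	my $res_z = ($H * $Z1 * $Z2) % $p;			# Z3 = H*Z1*Z2
	my $res_x = ($R * $R - $Hcub - 2 * $U1 * $Hsqr) % $p;
	my $res_y = ($R * ($U1 * $Hsqr - $res_x) - $S1 * $Hcub) % $p;
	return ($res_x, $res_y, $res_z);
}
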
2557sub gen_add_affine () {
2558    my $x = shift;
2559    my ($src0,$sfx,$bias);
2560    my ($U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr,
2561	$res_x,$res_y,$res_z,
2562	$in1_x,$in1_y,$in1_z,
2563	$in2_x,$in2_y)=map(32*$_,(0..14));
2564    my $Z1sqr = $S2;
2565
2566    if ($x ne "x") {
2567	$src0 = "%rax";
2568	$sfx  = "";
2569	$bias = 0;
2570
2571$code.=<<___;
2572.globl	ecp_nistz256_point_add_affine
2573.type	ecp_nistz256_point_add_affine,\@function,3
2574.align	32
2575ecp_nistz256_point_add_affine:
2576___
2577$code.=<<___	if ($addx);
2578	mov	\$0x80100, %ecx
2579	and	OPENSSL_ia32cap_P+8(%rip), %ecx
2580	cmp	\$0x80100, %ecx
2581	je	.Lpoint_add_affinex
2582___
2583    } else {
2584	$src0 = "%rdx";
2585	$sfx  = "x";
2586	$bias = 128;
2587
2588$code.=<<___;
2589.type	ecp_nistz256_point_add_affinex,\@function,3
2590.align	32
2591ecp_nistz256_point_add_affinex:
2592.Lpoint_add_affinex:
2593___
2594    }
2595$code.=<<___;
2596	push	%rbp
2597	push	%rbx
2598	push	%r12
2599	push	%r13
2600	push	%r14
2601	push	%r15
2602	sub	\$32*15+8, %rsp
2603
2604	movdqu	0x00($a_ptr), %xmm0	# copy	*(P256_POINT *)$a_ptr
2605	mov	$b_org, $b_ptr		# reassign
2606	movdqu	0x10($a_ptr), %xmm1
2607	movdqu	0x20($a_ptr), %xmm2
2608	movdqu	0x30($a_ptr), %xmm3
2609	movdqu	0x40($a_ptr), %xmm4
2610	movdqu	0x50($a_ptr), %xmm5
2611	 mov	0x40+8*0($a_ptr), $src0	# load original in1_z
2612	 mov	0x40+8*1($a_ptr), $acc6
2613	 mov	0x40+8*2($a_ptr), $acc7
2614	 mov	0x40+8*3($a_ptr), $acc0
2615	movdqa	%xmm0, $in1_x(%rsp)
2616	movdqa	%xmm1, $in1_x+0x10(%rsp)
2617	movdqa	%xmm2, $in1_y(%rsp)
2618	movdqa	%xmm3, $in1_y+0x10(%rsp)
2619	movdqa	%xmm4, $in1_z(%rsp)
2620	movdqa	%xmm5, $in1_z+0x10(%rsp)
2621	por	%xmm4, %xmm5
2622
2623	movdqu	0x00($b_ptr), %xmm0	# copy	*(P256_POINT_AFFINE *)$b_ptr
2624	 pshufd	\$0xb1, %xmm5, %xmm3
2625	movdqu	0x10($b_ptr), %xmm1
2626	movdqu	0x20($b_ptr), %xmm2
2627	 por	%xmm3, %xmm5
2628	movdqu	0x30($b_ptr), %xmm3
2629	movdqa	%xmm0, $in2_x(%rsp)
2630	 pshufd	\$0x1e, %xmm5, %xmm4
2631	movdqa	%xmm1, $in2_x+0x10(%rsp)
2632	por	%xmm0, %xmm1
2633	 movq	$r_ptr, %xmm0		# save $r_ptr
2634	movdqa	%xmm2, $in2_y(%rsp)
2635	movdqa	%xmm3, $in2_y+0x10(%rsp)
2636	por	%xmm2, %xmm3
2637	 por	%xmm4, %xmm5
2638	 pxor	%xmm4, %xmm4
2639	por	%xmm1, %xmm3
2640
2641	lea	0x40-$bias($a_ptr), $a_ptr	# $a_ptr is still valid
2642	lea	$Z1sqr(%rsp), $r_ptr		# Z1^2
2643	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Z1sqr, in1_z);
2644
2645	pcmpeqd	%xmm4, %xmm5
2646	pshufd	\$0xb1, %xmm3, %xmm4
2647	 mov	0x00($b_ptr), $src0		# $b_ptr is still valid
2648	 #lea	0x00($b_ptr), $b_ptr
2649	 mov	$acc4, $acc1			# harmonize sqr output and mul input
2650	por	%xmm3, %xmm4
2651	pshufd	\$0, %xmm5, %xmm5		# in1infty
2652	pshufd	\$0x1e, %xmm4, %xmm3
2653	 mov	$acc5, $acc2
2654	por	%xmm3, %xmm4
2655	pxor	%xmm3, %xmm3
2656	 mov	$acc6, $acc3
2657	pcmpeqd	%xmm3, %xmm4
2658	pshufd	\$0, %xmm4, %xmm4		# in2infty
2659
2660	lea	$Z1sqr-$bias(%rsp), $a_ptr
2661	mov	$acc7, $acc4
2662	lea	$U2(%rsp), $r_ptr		# U2 = X2*Z1^2
2663	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(U2, Z1sqr, in2_x);
2664
2665	lea	$in1_x(%rsp), $b_ptr
2666	lea	$H(%rsp), $r_ptr		# H = U2 - U1
2667	call	__ecp_nistz256_sub_from$x	# p256_sub(H, U2, in1_x);
2668
2669	`&load_for_mul("$Z1sqr(%rsp)", "$in1_z(%rsp)", "$src0")`
2670	lea	$S2(%rsp), $r_ptr		# S2 = Z1^3
2671	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, Z1sqr, in1_z);
2672
2673	`&load_for_mul("$H(%rsp)", "$in1_z(%rsp)", "$src0")`
2674	lea	$res_z(%rsp), $r_ptr		# Z3 = H*Z1*Z2
2675	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(res_z, H, in1_z);
2676
2677	`&load_for_mul("$S2(%rsp)", "$in2_y(%rsp)", "$src0")`
2678	lea	$S2(%rsp), $r_ptr		# S2 = Y2*Z1^3
2679	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, S2, in2_y);
2680
2681	lea	$in1_y(%rsp), $b_ptr
2682	lea	$R(%rsp), $r_ptr		# R = S2 - S1
2683	call	__ecp_nistz256_sub_from$x	# p256_sub(R, S2, in1_y);
2684
2685	`&load_for_sqr("$H(%rsp)", "$src0")`
2686	lea	$Hsqr(%rsp), $r_ptr		# H^2
2687	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Hsqr, H);
2688
2689	`&load_for_sqr("$R(%rsp)", "$src0")`
2690	lea	$Rsqr(%rsp), $r_ptr		# R^2
2691	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Rsqr, R);
2692
2693	`&load_for_mul("$H(%rsp)", "$Hsqr(%rsp)", "$src0")`
2694	lea	$Hcub(%rsp), $r_ptr		# H^3
2695	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(Hcub, Hsqr, H);
2696
2697	`&load_for_mul("$Hsqr(%rsp)", "$in1_x(%rsp)", "$src0")`
2698	lea	$U2(%rsp), $r_ptr		# U1*H^2
2699	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(U2, in1_x, Hsqr);
2700___
2701{
2702#######################################################################
2703# operate in 4-5-0-1 "name space" that matches multiplication output
2704#
2705my ($acc0,$acc1,$acc2,$acc3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
2706my ($poly1, $poly3)=($acc6,$acc7);
2707
2708$code.=<<___;
2709	#lea	$U2(%rsp), $a_ptr
2710	#lea	$Hsqr(%rsp), $r_ptr	# 2*U1*H^2
2711	#call	__ecp_nistz256_mul_by_2	# ecp_nistz256_mul_by_2(Hsqr, U2);
2712
2713	xor	$t4, $t4
2714	add	$acc0, $acc0		# a0:a3+a0:a3
2715	lea	$Rsqr(%rsp), $a_ptr
2716	adc	$acc1, $acc1
2717	 mov	$acc0, $t0
2718	adc	$acc2, $acc2
2719	adc	$acc3, $acc3
2720	 mov	$acc1, $t1
2721	adc	\$0, $t4
2722
2723	sub	\$-1, $acc0
2724	 mov	$acc2, $t2
2725	sbb	$poly1, $acc1
2726	sbb	\$0, $acc2
2727	 mov	$acc3, $t3
2728	sbb	$poly3, $acc3
2729	sbb	\$0, $t4
2730
2731	cmovc	$t0, $acc0
2732	mov	8*0($a_ptr), $t0
2733	cmovc	$t1, $acc1
2734	mov	8*1($a_ptr), $t1
2735	cmovc	$t2, $acc2
2736	mov	8*2($a_ptr), $t2
2737	cmovc	$t3, $acc3
2738	mov	8*3($a_ptr), $t3
2739
2740	call	__ecp_nistz256_sub$x		# p256_sub(res_x, Rsqr, Hsqr);
2741
2742	lea	$Hcub(%rsp), $b_ptr
2743	lea	$res_x(%rsp), $r_ptr
2744	call	__ecp_nistz256_sub_from$x	# p256_sub(res_x, res_x, Hcub);
2745
2746	mov	$U2+8*0(%rsp), $t0
2747	mov	$U2+8*1(%rsp), $t1
2748	mov	$U2+8*2(%rsp), $t2
2749	mov	$U2+8*3(%rsp), $t3
2750	lea	$H(%rsp), $r_ptr
2751
2752	call	__ecp_nistz256_sub$x		# p256_sub(H, U2, res_x);
2753
2754	mov	$acc0, 8*0($r_ptr)		# save the result, as
2755	mov	$acc1, 8*1($r_ptr)		# __ecp_nistz256_sub doesn't store it
2756	mov	$acc2, 8*2($r_ptr)
2757	mov	$acc3, 8*3($r_ptr)
2758___
2759}
2760$code.=<<___;
2761	`&load_for_mul("$Hcub(%rsp)", "$in1_y(%rsp)", "$src0")`
2762	lea	$S2(%rsp), $r_ptr
2763	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, Hcub, in1_y);
2764
2765	`&load_for_mul("$H(%rsp)", "$R(%rsp)", "$src0")`
2766	lea	$H(%rsp), $r_ptr
2767	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(H, H, R);
2768
2769	lea	$S2(%rsp), $b_ptr
2770	lea	$res_y(%rsp), $r_ptr
2771	call	__ecp_nistz256_sub_from$x	# p256_sub(res_y, H, S2);
2772
2773	movq	%xmm0, $r_ptr		# restore $r_ptr
2774
2775	movdqa	%xmm5, %xmm0		# copy_conditional(res_z, ONE, in1infty);
2776	movdqa	%xmm5, %xmm1
2777	pandn	$res_z(%rsp), %xmm0
2778	movdqa	%xmm5, %xmm2
2779	pandn	$res_z+0x10(%rsp), %xmm1
2780	movdqa	%xmm5, %xmm3
2781	pand	.LONE_mont(%rip), %xmm2
2782	pand	.LONE_mont+0x10(%rip), %xmm3
2783	por	%xmm0, %xmm2
2784	por	%xmm1, %xmm3
2785
2786	movdqa	%xmm4, %xmm0		# copy_conditional(res_z, in1_z, in2infty);
2787	movdqa	%xmm4, %xmm1
2788	pandn	%xmm2, %xmm0
2789	movdqa	%xmm4, %xmm2
2790	pandn	%xmm3, %xmm1
2791	movdqa	%xmm4, %xmm3
2792	pand	$in1_z(%rsp), %xmm2
2793	pand	$in1_z+0x10(%rsp), %xmm3
2794	por	%xmm0, %xmm2
2795	por	%xmm1, %xmm3
2796	movdqu	%xmm2, 0x40($r_ptr)
2797	movdqu	%xmm3, 0x50($r_ptr)
2798
2799	movdqa	%xmm5, %xmm0		# copy_conditional(res_x, in2_x, in1infty);
2800	movdqa	%xmm5, %xmm1
2801	pandn	$res_x(%rsp), %xmm0
2802	movdqa	%xmm5, %xmm2
2803	pandn	$res_x+0x10(%rsp), %xmm1
2804	movdqa	%xmm5, %xmm3
2805	pand	$in2_x(%rsp), %xmm2
2806	pand	$in2_x+0x10(%rsp), %xmm3
2807	por	%xmm0, %xmm2
2808	por	%xmm1, %xmm3
2809
2810	movdqa	%xmm4, %xmm0		# copy_conditional(res_x, in1_x, in2infty);
2811	movdqa	%xmm4, %xmm1
2812	pandn	%xmm2, %xmm0
2813	movdqa	%xmm4, %xmm2
2814	pandn	%xmm3, %xmm1
2815	movdqa	%xmm4, %xmm3
2816	pand	$in1_x(%rsp), %xmm2
2817	pand	$in1_x+0x10(%rsp), %xmm3
2818	por	%xmm0, %xmm2
2819	por	%xmm1, %xmm3
2820	movdqu	%xmm2, 0x00($r_ptr)
2821	movdqu	%xmm3, 0x10($r_ptr)
2822
2823	movdqa	%xmm5, %xmm0		# copy_conditional(res_y, in2_y, in1infty);
2824	movdqa	%xmm5, %xmm1
2825	pandn	$res_y(%rsp), %xmm0
2826	movdqa	%xmm5, %xmm2
2827	pandn	$res_y+0x10(%rsp), %xmm1
2828	movdqa	%xmm5, %xmm3
2829	pand	$in2_y(%rsp), %xmm2
2830	pand	$in2_y+0x10(%rsp), %xmm3
2831	por	%xmm0, %xmm2
2832	por	%xmm1, %xmm3
2833
2834	movdqa	%xmm4, %xmm0		# copy_conditional(res_y, in1_y, in2infty);
2835	movdqa	%xmm4, %xmm1
2836	pandn	%xmm2, %xmm0
2837	movdqa	%xmm4, %xmm2
2838	pandn	%xmm3, %xmm1
2839	movdqa	%xmm4, %xmm3
2840	pand	$in1_y(%rsp), %xmm2
2841	pand	$in1_y+0x10(%rsp), %xmm3
2842	por	%xmm0, %xmm2
2843	por	%xmm1, %xmm3
2844	movdqu	%xmm2, 0x20($r_ptr)
2845	movdqu	%xmm3, 0x30($r_ptr)
2846
2847	add	\$32*15+8, %rsp
2848	pop	%r15
2849	pop	%r14
2850	pop	%r13
2851	pop	%r12
2852	pop	%rbx
2853	pop	%rbp
2854	ret
2855.size	ecp_nistz256_point_add_affine$sfx,.-ecp_nistz256_point_add_affine$sfx
2856___
2857}
2858&gen_add_affine("q");
2859
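# gen_add_affine() is the mixed-coordinate variant of gen_add(): the second
# input is affine, i.e. Z2 == 1, so U1 = X1, S1 = Y1 and the Z2^2/Z2^3
# multiplications drop out.  Unlike gen_add() there is no doubling shortcut
# or zeroed-output branch; only the two infinity cases are handled, via the
# masked copies at the end (.LONE_mont supplies Z3 == 1 in Montgomery form
# when the first input is the point at infinity).  A plain-Perl model, never
# called by the generator, with Math::BigInt and the __ref_* name as
# assumptions of this sketch:
sub __ref_point_add_affine {
	require Math::BigInt;
	my $p = Math::BigInt->from_hex(
	    "ffffffff00000001000000000000000000000000ffffffffffffffffffffffff");
	my ($X1,$Y1,$Z1, $X2,$Y2) = map { Math::BigInt->new($_) } @_;

	return ($X2, $Y2, 1) if $Z1 == 0;		# in1infty
	return ($X1, $Y1, $Z1) if $X2 == 0 && $Y2 == 0;	# in2infty (all-zero X|Y)

	my $Z1sqr = ($Z1 * $Z1) % $p;		# p256_sqr_mont(Z1sqr, in1_z)
	my $U2 = ($X2 * $Z1sqr) % $p;		# U2 = X2*Z1^2
	my $H  = ($U2 - $X1) % $p;		# H = U2 - U1,  U1 = in1_x
	my $S2 = ($Y2 * $Z1sqr * $Z1) % $p;	# S2 = Y2*Z1^3
	my $R  = ($S2 - $Y1) % $p;		# R = S2 - S1,  S1 = in1_y

	my $Hsqr  = ($H * $H) % $p;
	my $Hcub  = ($Hsqr * $H) % $p;
	my $res_z = ($H * $Z1) % $p;		# Z3 = H*Z1 (Z2 == 1)
	my $res_x = ($R * $R - $Hcub - 2 * $X1 * $Hsqr) % $p;
	my $res_y = ($R * ($X1 * $Hsqr - $res_x) - $Y1 * $Hcub) % $p;
	return ($res_x, $res_y, $res_z);
}
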
2860########################################################################
2861# AD*X magic
2862#
2863if ($addx) {								{
2864########################################################################
2865# operate in 4-5-0-1 "name space" that matches multiplication output
2866#
2867my ($a0,$a1,$a2,$a3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
2868
2869$code.=<<___;
2870.type	__ecp_nistz256_add_tox,\@abi-omnipotent
2871.align	32
2872__ecp_nistz256_add_tox:
2873	xor	$t4, $t4
2874	adc	8*0($b_ptr), $a0
2875	adc	8*1($b_ptr), $a1
2876	 mov	$a0, $t0
2877	adc	8*2($b_ptr), $a2
2878	adc	8*3($b_ptr), $a3
2879	 mov	$a1, $t1
2880	adc	\$0, $t4
2881
2882	xor	$t3, $t3
2883	sbb	\$-1, $a0
2884	 mov	$a2, $t2
2885	sbb	$poly1, $a1
2886	sbb	\$0, $a2
2887	 mov	$a3, $t3
2888	sbb	$poly3, $a3
2889	sbb	\$0, $t4
2890
2891	cmovc	$t0, $a0
2892	cmovc	$t1, $a1
2893	mov	$a0, 8*0($r_ptr)
2894	cmovc	$t2, $a2
2895	mov	$a1, 8*1($r_ptr)
2896	cmovc	$t3, $a3
2897	mov	$a2, 8*2($r_ptr)
2898	mov	$a3, 8*3($r_ptr)
2899
2900	ret
2901.size	__ecp_nistz256_add_tox,.-__ecp_nistz256_add_tox
2902
2903.type	__ecp_nistz256_sub_fromx,\@abi-omnipotent
2904.align	32
2905__ecp_nistz256_sub_fromx:
2906	xor	$t4, $t4
2907	sbb	8*0($b_ptr), $a0
2908	sbb	8*1($b_ptr), $a1
2909	 mov	$a0, $t0
2910	sbb	8*2($b_ptr), $a2
2911	sbb	8*3($b_ptr), $a3
2912	 mov	$a1, $t1
2913	sbb	\$0, $t4
2914
2915	xor	$t3, $t3
2916	adc	\$-1, $a0
2917	 mov	$a2, $t2
2918	adc	$poly1, $a1
2919	adc	\$0, $a2
2920	 mov	$a3, $t3
2921	adc	$poly3, $a3
2922
2923	bt	\$0, $t4
2924	cmovnc	$t0, $a0
2925	cmovnc	$t1, $a1
2926	mov	$a0, 8*0($r_ptr)
2927	cmovnc	$t2, $a2
2928	mov	$a1, 8*1($r_ptr)
2929	cmovnc	$t3, $a3
2930	mov	$a2, 8*2($r_ptr)
2931	mov	$a3, 8*3($r_ptr)
2932
2933	ret
2934.size	__ecp_nistz256_sub_fromx,.-__ecp_nistz256_sub_fromx
2935
2936.type	__ecp_nistz256_subx,\@abi-omnipotent
2937.align	32
2938__ecp_nistz256_subx:
2939	xor	$t4, $t4
2940	sbb	$a0, $t0
2941	sbb	$a1, $t1
2942	 mov	$t0, $a0
2943	sbb	$a2, $t2
2944	sbb	$a3, $t3
2945	 mov	$t1, $a1
2946	sbb	\$0, $t4
2947
2948	xor	$a3, $a3
2949	adc	\$-1, $t0
2950	 mov	$t2, $a2
2951	adc	$poly1, $t1
2952	adc	\$0, $t2
2953	 mov	$t3, $a3
2954	adc	$poly3, $t3
2955
2956	bt	\$0, $t4
2957	cmovc	$t0, $a0
2958	cmovc	$t1, $a1
2959	cmovc	$t2, $a2
2960	cmovc	$t3, $a3
2961
2962	ret
2963.size	__ecp_nistz256_subx,.-__ecp_nistz256_subx
2964
2965.type	__ecp_nistz256_mul_by_2x,\@abi-omnipotent
2966.align	32
2967__ecp_nistz256_mul_by_2x:
2968	xor	$t4, $t4
2969	adc	$a0, $a0		# a0:a3+a0:a3
2970	adc	$a1, $a1
2971	 mov	$a0, $t0
2972	adc	$a2, $a2
2973	adc	$a3, $a3
2974	 mov	$a1, $t1
2975	adc	\$0, $t4
2976
2977	xor	$t3, $t3
2978	sbb	\$-1, $a0
2979	 mov	$a2, $t2
2980	sbb	$poly1, $a1
2981	sbb	\$0, $a2
2982	 mov	$a3, $t3
2983	sbb	$poly3, $a3
2984	sbb	\$0, $t4
2985
2986	cmovc	$t0, $a0
2987	cmovc	$t1, $a1
2988	mov	$a0, 8*0($r_ptr)
2989	cmovc	$t2, $a2
2990	mov	$a1, 8*1($r_ptr)
2991	cmovc	$t3, $a3
2992	mov	$a2, 8*2($r_ptr)
2993	mov	$a3, 8*3($r_ptr)
2994
2995	ret
2996.size	__ecp_nistz256_mul_by_2x,.-__ecp_nistz256_mul_by_2x
2997___
2998									}
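# Each __ecp_nistz256_*x helper above follows the same reduction pattern as
# its non-ADX counterpart: do the four-limb add/sub with a spill word in
# $t4, speculatively apply the reduction step, and let cmov pick the reduced
# or unreduced value from the final carry/borrow, keeping the selection
# branch-free.  A plain-Perl model of the addition case (sketch only, never
# called; Math::BigInt and the __ref_* name are assumptions of this sketch):
sub __ref_add_mod_p {
	require Math::BigInt;
	my $p = Math::BigInt->from_hex(
	    "ffffffff00000001000000000000000000000000ffffffffffffffffffffffff");
	my ($a, $b) = map { Math::BigInt->new($_) } @_;	# both in [0, p)

	my $sum = $a + $b;		# the asm keeps the fifth word in $t4
	my $red = $sum - $p;		# speculative subtraction of the modulus
	return $red < 0 ? $sum : $red;	# cmovc: keep the unreduced value on borrow
}
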
2999&gen_double("x");
3000&gen_add("x");
3001&gen_add_affine("x");
3002}
3003}}}
3004
3005$code =~ s/\`([^\`]*)\`/eval $1/gem;
3006print $code;
3007close STDOUT;
3008