ecp_nistz256-x86_64.pl revision 325337
1#!/usr/bin/env perl
2
3##############################################################################
4#                                                                            #
5# Copyright 2014 Intel Corporation                                           #
6#                                                                            #
7# Licensed under the Apache License, Version 2.0 (the "License");            #
8# you may not use this file except in compliance with the License.           #
9# You may obtain a copy of the License at                                    #
10#                                                                            #
11#    http://www.apache.org/licenses/LICENSE-2.0                              #
12#                                                                            #
13# Unless required by applicable law or agreed to in writing, software        #
14# distributed under the License is distributed on an "AS IS" BASIS,          #
15# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.   #
16# See the License for the specific language governing permissions and        #
17# limitations under the License.                                             #
18#                                                                            #
19##############################################################################
20#                                                                            #
21#  Developers and authors:                                                   #
22#  Shay Gueron (1, 2), and Vlad Krasnov (1)                                  #
23#  (1) Intel Corporation, Israel Development Center                          #
24#  (2) University of Haifa                                                   #
25#  Reference:                                                                #
26#  S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with#
27#                           256 Bit Primes"                                  #
28#                                                                            #
29##############################################################################
30
31# Further optimization by <appro@openssl.org>:
32#
33#		this/original	with/without -DECP_NISTZ256_ASM(*)
34# Opteron	+12-49%		+110-150%
35# Bulldozer	+14-45%		+175-210%
36# P4		+18-46%		n/a :-(
37# Westmere	+12-34%		+80-87%
38# Sandy Bridge	+9-35%		+110-120%
39# Ivy Bridge	+9-35%		+110-125%
40# Haswell	+8-37%		+140-160%
41# Broadwell	+18-58%		+145-210%
42# Atom		+15-50%		+130-180%
43# VIA Nano	+43-160%	+300-480%
44#
45# (*)	"without -DECP_NISTZ256_ASM" refers to a build with
46#	"enable-ec_nistp_64_gcc_128";
47#
48# Ranges denote minimum and maximum improvement coefficients depending on
49# the benchmark. Lower coefficients are for ECDSA sign, the relatively
50# fastest server-side operation. Keep in mind that +100% means a 2x improvement.
51
52$flavour = shift;
53$output  = shift;
54if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
55
56$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
57
58$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
59( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
60( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
61die "can't locate x86_64-xlate.pl";
62
63open OUT,"| \"$^X\" $xlate $flavour $output";
64*STDOUT=*OUT;
65
66if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
67		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
68	$avx = ($1>=2.19) + ($1>=2.22);
69	$addx = ($1>=2.23);
70}
71
72if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
73	    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
74	$avx = ($1>=2.09) + ($1>=2.10);
75	$addx = ($1>=2.10);
76}
77
78if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
79	    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
80	$avx = ($1>=10) + ($1>=11);
81	$addx = ($1>=12);
82}
83
84if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9])\.([0-9]+)/) {
85	my $ver = $2 + $3/100.0;	# 3.1->3.01, 3.10->3.10
86	$avx = ($ver>=3.0) + ($ver>=3.01);
87	$addx = ($ver>=3.03);
88}
89
90$code.=<<___;
91.text
92.extern	OPENSSL_ia32cap_P
93
94# The polynomial, i.e. the NIST P-256 prime p, as four little-endian 64-bit limbs
95.align 64
96.Lpoly:
97.quad 0xffffffffffffffff, 0x00000000ffffffff, 0x0000000000000000, 0xffffffff00000001
98
99# 2^512 mod P precomputed for NIST P256 polynomial
100.LRR:
101.quad 0x0000000000000003, 0xfffffffbffffffff, 0xfffffffffffffffe, 0x00000004fffffffd
102
103.LOne:
104.long 1,1,1,1,1,1,1,1
105.LTwo:
106.long 2,2,2,2,2,2,2,2
107.LThree:
108.long 3,3,3,3,3,3,3,3
109.LONE_mont:
110.quad 0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe
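# For reference: .LRR is 2^512 mod p, so one Montgomery multiplication by it
# converts a value into Montgomery form (x -> x*2^256 mod p), and .LONE_mont
# is 2^256 mod p, the Montgomery representation of 1.  .LOne/.LTwo/.LThree
# are 32-bit broadcast constants used by the constant-time table lookups
# further down.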
111___
112
113{
114################################################################################
115# void ecp_nistz256_mul_by_2(uint64_t res[4], uint64_t a[4]);
116
117my ($a0,$a1,$a2,$a3)=map("%r$_",(8..11));
118my ($t0,$t1,$t2,$t3,$t4)=("%rax","%rdx","%rcx","%r12","%r13");
119my ($r_ptr,$a_ptr,$b_ptr)=("%rdi","%rsi","%rdx");
120
121$code.=<<___;
122
123.globl	ecp_nistz256_mul_by_2
124.type	ecp_nistz256_mul_by_2,\@function,2
125.align	64
126ecp_nistz256_mul_by_2:
127	push	%r12
128	push	%r13
129
130	mov	8*0($a_ptr), $a0
131	xor	$t4,$t4
132	mov	8*1($a_ptr), $a1
133	add	$a0, $a0		# a0:a3+a0:a3
134	mov	8*2($a_ptr), $a2
135	adc	$a1, $a1
136	mov	8*3($a_ptr), $a3
137	lea	.Lpoly(%rip), $a_ptr
138	 mov	$a0, $t0
139	adc	$a2, $a2
140	adc	$a3, $a3
141	 mov	$a1, $t1
142	adc	\$0, $t4
143
144	sub	8*0($a_ptr), $a0
145	 mov	$a2, $t2
146	sbb	8*1($a_ptr), $a1
147	sbb	8*2($a_ptr), $a2
148	 mov	$a3, $t3
149	sbb	8*3($a_ptr), $a3
150	sbb	\$0, $t4
151
152	cmovc	$t0, $a0
153	cmovc	$t1, $a1
154	mov	$a0, 8*0($r_ptr)
155	cmovc	$t2, $a2
156	mov	$a1, 8*1($r_ptr)
157	cmovc	$t3, $a3
158	mov	$a2, 8*2($r_ptr)
159	mov	$a3, 8*3($r_ptr)
160
161	pop	%r13
162	pop	%r12
163	ret
164.size	ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2
165
166################################################################################
167# void ecp_nistz256_div_by_2(uint64_t res[4], uint64_t a[4]);
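# Computes res = a/2 mod p.  a+p is computed unconditionally; the original a
# is then restored via cmovz if a was even (bit 0 clear), and the 5-word
# result, carry included, is shifted right by one bit.  Since p is odd, a+p
# is even whenever a is odd, so the shift never loses a bit.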
168.globl	ecp_nistz256_div_by_2
169.type	ecp_nistz256_div_by_2,\@function,2
170.align	32
171ecp_nistz256_div_by_2:
172	push	%r12
173	push	%r13
174
175	mov	8*0($a_ptr), $a0
176	mov	8*1($a_ptr), $a1
177	mov	8*2($a_ptr), $a2
178	 mov	$a0, $t0
179	mov	8*3($a_ptr), $a3
180	lea	.Lpoly(%rip), $a_ptr
181
182	 mov	$a1, $t1
183	xor	$t4, $t4
184	add	8*0($a_ptr), $a0
185	 mov	$a2, $t2
186	adc	8*1($a_ptr), $a1
187	adc	8*2($a_ptr), $a2
188	 mov	$a3, $t3
189	adc	8*3($a_ptr), $a3
190	adc	\$0, $t4
191	xor	$a_ptr, $a_ptr		# borrow $a_ptr
192	test	\$1, $t0
193
194	cmovz	$t0, $a0
195	cmovz	$t1, $a1
196	cmovz	$t2, $a2
197	cmovz	$t3, $a3
198	cmovz	$a_ptr, $t4
199
200	mov	$a1, $t0		# a0:a3>>1
201	shr	\$1, $a0
202	shl	\$63, $t0
203	mov	$a2, $t1
204	shr	\$1, $a1
205	or	$t0, $a0
206	shl	\$63, $t1
207	mov	$a3, $t2
208	shr	\$1, $a2
209	or	$t1, $a1
210	shl	\$63, $t2
211	shr	\$1, $a3
212	shl	\$63, $t4
213	or	$t2, $a2
214	or	$t4, $a3
215
216	mov	$a0, 8*0($r_ptr)
217	mov	$a1, 8*1($r_ptr)
218	mov	$a2, 8*2($r_ptr)
219	mov	$a3, 8*3($r_ptr)
220
221	pop	%r13
222	pop	%r12
223	ret
224.size	ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2
225
226################################################################################
227# void ecp_nistz256_mul_by_3(uint64_t res[4], uint64_t a[4]);
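# Computes res = 3*a mod p as 2*a followed by +a, with a conditional
# subtraction of p after each of the two steps.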
228.globl	ecp_nistz256_mul_by_3
229.type	ecp_nistz256_mul_by_3,\@function,2
230.align	32
231ecp_nistz256_mul_by_3:
232	push	%r12
233	push	%r13
234
235	mov	8*0($a_ptr), $a0
236	xor	$t4, $t4
237	mov	8*1($a_ptr), $a1
238	add	$a0, $a0		# a0:a3+a0:a3
239	mov	8*2($a_ptr), $a2
240	adc	$a1, $a1
241	mov	8*3($a_ptr), $a3
242	 mov	$a0, $t0
243	adc	$a2, $a2
244	adc	$a3, $a3
245	 mov	$a1, $t1
246	adc	\$0, $t4
247
248	sub	\$-1, $a0
249	 mov	$a2, $t2
250	sbb	.Lpoly+8*1(%rip), $a1
251	sbb	\$0, $a2
252	 mov	$a3, $t3
253	sbb	.Lpoly+8*3(%rip), $a3
254	sbb	\$0, $t4
255
256	cmovc	$t0, $a0
257	cmovc	$t1, $a1
258	cmovc	$t2, $a2
259	cmovc	$t3, $a3
260
261	xor	$t4, $t4
262	add	8*0($a_ptr), $a0	# a0:a3+=a_ptr[0:3]
263	adc	8*1($a_ptr), $a1
264	 mov	$a0, $t0
265	adc	8*2($a_ptr), $a2
266	adc	8*3($a_ptr), $a3
267	 mov	$a1, $t1
268	adc	\$0, $t4
269
270	sub	\$-1, $a0
271	 mov	$a2, $t2
272	sbb	.Lpoly+8*1(%rip), $a1
273	sbb	\$0, $a2
274	 mov	$a3, $t3
275	sbb	.Lpoly+8*3(%rip), $a3
276	sbb	\$0, $t4
277
278	cmovc	$t0, $a0
279	cmovc	$t1, $a1
280	mov	$a0, 8*0($r_ptr)
281	cmovc	$t2, $a2
282	mov	$a1, 8*1($r_ptr)
283	cmovc	$t3, $a3
284	mov	$a2, 8*2($r_ptr)
285	mov	$a3, 8*3($r_ptr)
286
287	pop %r13
288	pop %r12
289	ret
290.size	ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3
291
292################################################################################
293# void ecp_nistz256_add(uint64_t res[4], uint64_t a[4], uint64_t b[4]);
294.globl	ecp_nistz256_add
295.type	ecp_nistz256_add,\@function,3
296.align	32
297ecp_nistz256_add:
298	push	%r12
299	push	%r13
300
301	mov	8*0($a_ptr), $a0
302	xor	$t4, $t4
303	mov	8*1($a_ptr), $a1
304	mov	8*2($a_ptr), $a2
305	mov	8*3($a_ptr), $a3
306	lea	.Lpoly(%rip), $a_ptr
307
308	add	8*0($b_ptr), $a0
309	adc	8*1($b_ptr), $a1
310	 mov	$a0, $t0
311	adc	8*2($b_ptr), $a2
312	adc	8*3($b_ptr), $a3
313	 mov	$a1, $t1
314	adc	\$0, $t4
315
316	sub	8*0($a_ptr), $a0
317	 mov	$a2, $t2
318	sbb	8*1($a_ptr), $a1
319	sbb	8*2($a_ptr), $a2
320	 mov	$a3, $t3
321	sbb	8*3($a_ptr), $a3
322	sbb	\$0, $t4
323
324	cmovc	$t0, $a0
325	cmovc	$t1, $a1
326	mov	$a0, 8*0($r_ptr)
327	cmovc	$t2, $a2
328	mov	$a1, 8*1($r_ptr)
329	cmovc	$t3, $a3
330	mov	$a2, 8*2($r_ptr)
331	mov	$a3, 8*3($r_ptr)
332
333	pop %r13
334	pop %r12
335	ret
336.size	ecp_nistz256_add,.-ecp_nistz256_add
337
338################################################################################
339# void ecp_nistz256_sub(uint64_t res[4], uint64_t a[4], uint64_t b[4]);
340.globl	ecp_nistz256_sub
341.type	ecp_nistz256_sub,\@function,3
342.align	32
343ecp_nistz256_sub:
344	push	%r12
345	push	%r13
346
347	mov	8*0($a_ptr), $a0
348	xor	$t4, $t4
349	mov	8*1($a_ptr), $a1
350	mov	8*2($a_ptr), $a2
351	mov	8*3($a_ptr), $a3
352	lea	.Lpoly(%rip), $a_ptr
353
354	sub	8*0($b_ptr), $a0
355	sbb	8*1($b_ptr), $a1
356	 mov	$a0, $t0
357	sbb	8*2($b_ptr), $a2
358	sbb	8*3($b_ptr), $a3
359	 mov	$a1, $t1
360	sbb	\$0, $t4
361
362	add	8*0($a_ptr), $a0
363	 mov	$a2, $t2
364	adc	8*1($a_ptr), $a1
365	adc	8*2($a_ptr), $a2
366	 mov	$a3, $t3
367	adc	8*3($a_ptr), $a3
368	test	$t4, $t4
369
370	cmovz	$t0, $a0
371	cmovz	$t1, $a1
372	mov	$a0, 8*0($r_ptr)
373	cmovz	$t2, $a2
374	mov	$a1, 8*1($r_ptr)
375	cmovz	$t3, $a3
376	mov	$a2, 8*2($r_ptr)
377	mov	$a3, 8*3($r_ptr)
378
379	pop %r13
380	pop %r12
381	ret
382.size	ecp_nistz256_sub,.-ecp_nistz256_sub
383
384################################################################################
385# void ecp_nistz256_neg(uint64_t res[4], uint64_t a[4]);
386.globl	ecp_nistz256_neg
387.type	ecp_nistz256_neg,\@function,2
388.align	32
389ecp_nistz256_neg:
390	push	%r12
391	push	%r13
392
393	xor	$a0, $a0
394	xor	$a1, $a1
395	xor	$a2, $a2
396	xor	$a3, $a3
397	xor	$t4, $t4
398
399	sub	8*0($a_ptr), $a0
400	sbb	8*1($a_ptr), $a1
401	sbb	8*2($a_ptr), $a2
402	 mov	$a0, $t0
403	sbb	8*3($a_ptr), $a3
404	lea	.Lpoly(%rip), $a_ptr
405	 mov	$a1, $t1
406	sbb	\$0, $t4
407
408	add	8*0($a_ptr), $a0
409	 mov	$a2, $t2
410	adc	8*1($a_ptr), $a1
411	adc	8*2($a_ptr), $a2
412	 mov	$a3, $t3
413	adc	8*3($a_ptr), $a3
414	test	$t4, $t4
415
416	cmovz	$t0, $a0
417	cmovz	$t1, $a1
418	mov	$a0, 8*0($r_ptr)
419	cmovz	$t2, $a2
420	mov	$a1, 8*1($r_ptr)
421	cmovz	$t3, $a3
422	mov	$a2, 8*2($r_ptr)
423	mov	$a3, 8*3($r_ptr)
424
425	pop %r13
426	pop %r12
427	ret
428.size	ecp_nistz256_neg,.-ecp_nistz256_neg
429___
430}
431{
432my ($r_ptr,$a_ptr,$b_org,$b_ptr)=("%rdi","%rsi","%rdx","%rbx");
433my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15));
434my ($t0,$t1,$t2,$t3,$t4)=("%rcx","%rbp","%rbx","%rdx","%rax");
435my ($poly1,$poly3)=($acc6,$acc7);
436
437$code.=<<___;
438################################################################################
439# void ecp_nistz256_to_mont(
440#   uint64_t res[4],
441#   uint64_t in[4]);
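# Conversion into Montgomery form: res = in*2^256 mod p, implemented as a
# Montgomery multiplication of the input by .LRR = 2^512 mod p.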
442.globl	ecp_nistz256_to_mont
443.type	ecp_nistz256_to_mont,\@function,2
444.align	32
445ecp_nistz256_to_mont:
446___
447$code.=<<___	if ($addx);
448	mov	\$0x80100, %ecx
449	and	OPENSSL_ia32cap_P+8(%rip), %ecx
450___
451$code.=<<___;
452	lea	.LRR(%rip), $b_org
453	jmp	.Lmul_mont
454.size	ecp_nistz256_to_mont,.-ecp_nistz256_to_mont
455
456################################################################################
457# void ecp_nistz256_mul_mont(
458#   uint64_t res[4],
459#   uint64_t a[4],
460#   uint64_t b[4]);
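# Montgomery multiplication: res = a*b*2^-256 mod p, with all values in
# Montgomery form (i.e. pre-multiplied by 2^256 mod p).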
461
462.globl	ecp_nistz256_mul_mont
463.type	ecp_nistz256_mul_mont,\@function,3
464.align	32
465ecp_nistz256_mul_mont:
466___
467$code.=<<___	if ($addx);
468	mov	\$0x80100, %ecx
469	and	OPENSSL_ia32cap_P+8(%rip), %ecx
470___
471$code.=<<___;
472.Lmul_mont:
473	push	%rbp
474	push	%rbx
475	push	%r12
476	push	%r13
477	push	%r14
478	push	%r15
479___
480$code.=<<___	if ($addx);
481	cmp	\$0x80100, %ecx
482	je	.Lmul_montx
483___
484$code.=<<___;
485	mov	$b_org, $b_ptr
486	mov	8*0($b_org), %rax
487	mov	8*0($a_ptr), $acc1
488	mov	8*1($a_ptr), $acc2
489	mov	8*2($a_ptr), $acc3
490	mov	8*3($a_ptr), $acc4
491
492	call	__ecp_nistz256_mul_montq
493___
494$code.=<<___	if ($addx);
495	jmp	.Lmul_mont_done
496
497.align	32
498.Lmul_montx:
499	mov	$b_org, $b_ptr
500	mov	8*0($b_org), %rdx
501	mov	8*0($a_ptr), $acc1
502	mov	8*1($a_ptr), $acc2
503	mov	8*2($a_ptr), $acc3
504	mov	8*3($a_ptr), $acc4
505	lea	-128($a_ptr), $a_ptr	# control u-op density
506
507	call	__ecp_nistz256_mul_montx
508___
509$code.=<<___;
510.Lmul_mont_done:
511	pop	%r15
512	pop	%r14
513	pop	%r13
514	pop	%r12
515	pop	%rbx
516	pop	%rbp
517	ret
518.size	ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont
519
520.type	__ecp_nistz256_mul_montq,\@abi-omnipotent
521.align	32
522__ecp_nistz256_mul_montq:
523	########################################################################
524	# Multiply a by b[0]
525	mov	%rax, $t1
526	mulq	$acc1
527	mov	.Lpoly+8*1(%rip),$poly1
528	mov	%rax, $acc0
529	mov	$t1, %rax
530	mov	%rdx, $acc1
531
532	mulq	$acc2
533	mov	.Lpoly+8*3(%rip),$poly3
534	add	%rax, $acc1
535	mov	$t1, %rax
536	adc	\$0, %rdx
537	mov	%rdx, $acc2
538
539	mulq	$acc3
540	add	%rax, $acc2
541	mov	$t1, %rax
542	adc	\$0, %rdx
543	mov	%rdx, $acc3
544
545	mulq	$acc4
546	add	%rax, $acc3
547	 mov	$acc0, %rax
548	adc	\$0, %rdx
549	xor	$acc5, $acc5
550	mov	%rdx, $acc4
551
552	########################################################################
553	# First reduction step
554	# Basically now we want to multiply acc[0] by p256,
555	# and add the result to the acc.
556	# Due to the special form of p256 we do some optimizations
557	#
558	# acc[0] x p256[0..1] = acc[0] x 2^96 - acc[0]
559	# then we add acc[0] and get acc[0] x 2^96
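	#
	# Spelled out: p[0] = 2^64-1, so -1/p mod 2^64 is 1 and the Montgomery
	# multiplier for this limb is acc[0] itself.  Because p[1]:p[0] = 2^96-1
	# and p[2] = 0, adding acc[0]*p amounts to dropping the retired limb,
	# adding acc[0]*2^96 (the shl/shr-by-32 pair below) and adding
	# acc[0]*p[3] into the two top limbs (the mulq below).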
560
561	mov	$acc0, $t1
562	shl	\$32, $acc0
563	mulq	$poly3
564	shr	\$32, $t1
565	add	$acc0, $acc1		# +=acc[0]<<96
566	adc	$t1, $acc2
567	adc	%rax, $acc3
568	 mov	8*1($b_ptr), %rax
569	adc	%rdx, $acc4
570	adc	\$0, $acc5
571	xor	$acc0, $acc0
572
573	########################################################################
574	# Multiply by b[1]
575	mov	%rax, $t1
576	mulq	8*0($a_ptr)
577	add	%rax, $acc1
578	mov	$t1, %rax
579	adc	\$0, %rdx
580	mov	%rdx, $t0
581
582	mulq	8*1($a_ptr)
583	add	$t0, $acc2
584	adc	\$0, %rdx
585	add	%rax, $acc2
586	mov	$t1, %rax
587	adc	\$0, %rdx
588	mov	%rdx, $t0
589
590	mulq	8*2($a_ptr)
591	add	$t0, $acc3
592	adc	\$0, %rdx
593	add	%rax, $acc3
594	mov	$t1, %rax
595	adc	\$0, %rdx
596	mov	%rdx, $t0
597
598	mulq	8*3($a_ptr)
599	add	$t0, $acc4
600	adc	\$0, %rdx
601	add	%rax, $acc4
602	 mov	$acc1, %rax
603	adc	%rdx, $acc5
604	adc	\$0, $acc0
605
606	########################################################################
607	# Second reduction step
608	mov	$acc1, $t1
609	shl	\$32, $acc1
610	mulq	$poly3
611	shr	\$32, $t1
612	add	$acc1, $acc2
613	adc	$t1, $acc3
614	adc	%rax, $acc4
615	 mov	8*2($b_ptr), %rax
616	adc	%rdx, $acc5
617	adc	\$0, $acc0
618	xor	$acc1, $acc1
619
620	########################################################################
621	# Multiply by b[2]
622	mov	%rax, $t1
623	mulq	8*0($a_ptr)
624	add	%rax, $acc2
625	mov	$t1, %rax
626	adc	\$0, %rdx
627	mov	%rdx, $t0
628
629	mulq	8*1($a_ptr)
630	add	$t0, $acc3
631	adc	\$0, %rdx
632	add	%rax, $acc3
633	mov	$t1, %rax
634	adc	\$0, %rdx
635	mov	%rdx, $t0
636
637	mulq	8*2($a_ptr)
638	add	$t0, $acc4
639	adc	\$0, %rdx
640	add	%rax, $acc4
641	mov	$t1, %rax
642	adc	\$0, %rdx
643	mov	%rdx, $t0
644
645	mulq	8*3($a_ptr)
646	add	$t0, $acc5
647	adc	\$0, %rdx
648	add	%rax, $acc5
649	 mov	$acc2, %rax
650	adc	%rdx, $acc0
651	adc	\$0, $acc1
652
653	########################################################################
654	# Third reduction step
655	mov	$acc2, $t1
656	shl	\$32, $acc2
657	mulq	$poly3
658	shr	\$32, $t1
659	add	$acc2, $acc3
660	adc	$t1, $acc4
661	adc	%rax, $acc5
662	 mov	8*3($b_ptr), %rax
663	adc	%rdx, $acc0
664	adc	\$0, $acc1
665	xor	$acc2, $acc2
666
667	########################################################################
668	# Multiply by b[3]
669	mov	%rax, $t1
670	mulq	8*0($a_ptr)
671	add	%rax, $acc3
672	mov	$t1, %rax
673	adc	\$0, %rdx
674	mov	%rdx, $t0
675
676	mulq	8*1($a_ptr)
677	add	$t0, $acc4
678	adc	\$0, %rdx
679	add	%rax, $acc4
680	mov	$t1, %rax
681	adc	\$0, %rdx
682	mov	%rdx, $t0
683
684	mulq	8*2($a_ptr)
685	add	$t0, $acc5
686	adc	\$0, %rdx
687	add	%rax, $acc5
688	mov	$t1, %rax
689	adc	\$0, %rdx
690	mov	%rdx, $t0
691
692	mulq	8*3($a_ptr)
693	add	$t0, $acc0
694	adc	\$0, %rdx
695	add	%rax, $acc0
696	 mov	$acc3, %rax
697	adc	%rdx, $acc1
698	adc	\$0, $acc2
699
700	########################################################################
701	# Final reduction step
702	mov	$acc3, $t1
703	shl	\$32, $acc3
704	mulq	$poly3
705	shr	\$32, $t1
706	add	$acc3, $acc4
707	adc	$t1, $acc5
708	 mov	$acc4, $t0
709	adc	%rax, $acc0
710	adc	%rdx, $acc1
711	 mov	$acc5, $t1
712	adc	\$0, $acc2
713
714	########################################################################
715	# Branch-less conditional subtraction of P
716	sub	\$-1, $acc4		# .Lpoly[0]
717	 mov	$acc0, $t2
718	sbb	$poly1, $acc5		# .Lpoly[1]
719	sbb	\$0, $acc0		# .Lpoly[2]
720	 mov	$acc1, $t3
721	sbb	$poly3, $acc1		# .Lpoly[3]
722	sbb	\$0, $acc2
723
724	cmovc	$t0, $acc4
725	cmovc	$t1, $acc5
726	mov	$acc4, 8*0($r_ptr)
727	cmovc	$t2, $acc0
728	mov	$acc5, 8*1($r_ptr)
729	cmovc	$t3, $acc1
730	mov	$acc0, 8*2($r_ptr)
731	mov	$acc1, 8*3($r_ptr)
732
733	ret
734.size	__ecp_nistz256_mul_montq,.-__ecp_nistz256_mul_montq
735
736################################################################################
737# void ecp_nistz256_sqr_mont(
738#   uint64_t res[4],
739#   uint64_t a[4]);
740
741# we optimize the squaring following S.Gueron and V.Krasnov,
742# "Speeding up Big-Number Squaring":
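# Only the distinct cross products a[i]*a[j], i<j, are computed; the 512-bit
# intermediate is then doubled with an adc chain and the diagonal squares
# a[i]^2 are added in, reducing the count of 64x64-bit multiplications from
# 16 to 10 relative to a generic mul_mont(a, a).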
743.globl	ecp_nistz256_sqr_mont
744.type	ecp_nistz256_sqr_mont,\@function,2
745.align	32
746ecp_nistz256_sqr_mont:
747___
748$code.=<<___	if ($addx);
749	mov	\$0x80100, %ecx
750	and	OPENSSL_ia32cap_P+8(%rip), %ecx
751___
752$code.=<<___;
753	push	%rbp
754	push	%rbx
755	push	%r12
756	push	%r13
757	push	%r14
758	push	%r15
759___
760$code.=<<___	if ($addx);
761	cmp	\$0x80100, %ecx
762	je	.Lsqr_montx
763___
764$code.=<<___;
765	mov	8*0($a_ptr), %rax
766	mov	8*1($a_ptr), $acc6
767	mov	8*2($a_ptr), $acc7
768	mov	8*3($a_ptr), $acc0
769
770	call	__ecp_nistz256_sqr_montq
771___
772$code.=<<___	if ($addx);
773	jmp	.Lsqr_mont_done
774
775.align	32
776.Lsqr_montx:
777	mov	8*0($a_ptr), %rdx
778	mov	8*1($a_ptr), $acc6
779	mov	8*2($a_ptr), $acc7
780	mov	8*3($a_ptr), $acc0
781	lea	-128($a_ptr), $a_ptr	# control u-op density
782
783	call	__ecp_nistz256_sqr_montx
784___
785$code.=<<___;
786.Lsqr_mont_done:
787	pop	%r15
788	pop	%r14
789	pop	%r13
790	pop	%r12
791	pop	%rbx
792	pop	%rbp
793	ret
794.size	ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont
795
796.type	__ecp_nistz256_sqr_montq,\@abi-omnipotent
797.align	32
798__ecp_nistz256_sqr_montq:
799	mov	%rax, $acc5
800	mulq	$acc6			# a[1]*a[0]
801	mov	%rax, $acc1
802	mov	$acc7, %rax
803	mov	%rdx, $acc2
804
805	mulq	$acc5			# a[0]*a[2]
806	add	%rax, $acc2
807	mov	$acc0, %rax
808	adc	\$0, %rdx
809	mov	%rdx, $acc3
810
811	mulq	$acc5			# a[0]*a[3]
812	add	%rax, $acc3
813	 mov	$acc7, %rax
814	adc	\$0, %rdx
815	mov	%rdx, $acc4
816
817	#################################
818	mulq	$acc6			# a[1]*a[2]
819	add	%rax, $acc3
820	mov	$acc0, %rax
821	adc	\$0, %rdx
822	mov	%rdx, $t1
823
824	mulq	$acc6			# a[1]*a[3]
825	add	%rax, $acc4
826	 mov	$acc0, %rax
827	adc	\$0, %rdx
828	add	$t1, $acc4
829	mov	%rdx, $acc5
830	adc	\$0, $acc5
831
832	#################################
833	mulq	$acc7			# a[2]*a[3]
834	xor	$acc7, $acc7
835	add	%rax, $acc5
836	 mov	8*0($a_ptr), %rax
837	mov	%rdx, $acc6
838	adc	\$0, $acc6
839
840	add	$acc1, $acc1		# acc1:6<<1
841	adc	$acc2, $acc2
842	adc	$acc3, $acc3
843	adc	$acc4, $acc4
844	adc	$acc5, $acc5
845	adc	$acc6, $acc6
846	adc	\$0, $acc7
847
848	mulq	%rax
849	mov	%rax, $acc0
850	mov	8*1($a_ptr), %rax
851	mov	%rdx, $t0
852
853	mulq	%rax
854	add	$t0, $acc1
855	adc	%rax, $acc2
856	mov	8*2($a_ptr), %rax
857	adc	\$0, %rdx
858	mov	%rdx, $t0
859
860	mulq	%rax
861	add	$t0, $acc3
862	adc	%rax, $acc4
863	mov	8*3($a_ptr), %rax
864	adc	\$0, %rdx
865	mov	%rdx, $t0
866
867	mulq	%rax
868	add	$t0, $acc5
869	adc	%rax, $acc6
870	 mov	$acc0, %rax
871	adc	%rdx, $acc7
872
873	mov	.Lpoly+8*1(%rip), $a_ptr
874	mov	.Lpoly+8*3(%rip), $t1
875
876	##########################################
877	# Now the reduction
878	# First iteration
879	mov	$acc0, $t0
880	shl	\$32, $acc0
881	mulq	$t1
882	shr	\$32, $t0
883	add	$acc0, $acc1		# +=acc[0]<<96
884	adc	$t0, $acc2
885	adc	%rax, $acc3
886	 mov	$acc1, %rax
887	adc	\$0, %rdx
888
889	##########################################
890	# Second iteration
891	mov	$acc1, $t0
892	shl	\$32, $acc1
893	mov	%rdx, $acc0
894	mulq	$t1
895	shr	\$32, $t0
896	add	$acc1, $acc2
897	adc	$t0, $acc3
898	adc	%rax, $acc0
899	 mov	$acc2, %rax
900	adc	\$0, %rdx
901
902	##########################################
903	# Third iteration
904	mov	$acc2, $t0
905	shl	\$32, $acc2
906	mov	%rdx, $acc1
907	mulq	$t1
908	shr	\$32, $t0
909	add	$acc2, $acc3
910	adc	$t0, $acc0
911	adc	%rax, $acc1
912	 mov	$acc3, %rax
913	adc	\$0, %rdx
914
915	###########################################
916	# Last iteration
917	mov	$acc3, $t0
918	shl	\$32, $acc3
919	mov	%rdx, $acc2
920	mulq	$t1
921	shr	\$32, $t0
922	add	$acc3, $acc0
923	adc	$t0, $acc1
924	adc	%rax, $acc2
925	adc	\$0, %rdx
926	xor	$acc3, $acc3
927
928	############################################
929	# Add the rest of the acc
930	add	$acc0, $acc4
931	adc	$acc1, $acc5
932	 mov	$acc4, $acc0
933	adc	$acc2, $acc6
934	adc	%rdx, $acc7
935	 mov	$acc5, $acc1
936	adc	\$0, $acc3
937
938	sub	\$-1, $acc4		# .Lpoly[0]
939	 mov	$acc6, $acc2
940	sbb	$a_ptr, $acc5		# .Lpoly[1]
941	sbb	\$0, $acc6		# .Lpoly[2]
942	 mov	$acc7, $t0
943	sbb	$t1, $acc7		# .Lpoly[3]
944	sbb	\$0, $acc3
945
946	cmovc	$acc0, $acc4
947	cmovc	$acc1, $acc5
948	mov	$acc4, 8*0($r_ptr)
949	cmovc	$acc2, $acc6
950	mov	$acc5, 8*1($r_ptr)
951	cmovc	$t0, $acc7
952	mov	$acc6, 8*2($r_ptr)
953	mov	$acc7, 8*3($r_ptr)
954
955	ret
956.size	__ecp_nistz256_sqr_montq,.-__ecp_nistz256_sqr_montq
957___
958
959if ($addx) {
960$code.=<<___;
961.type	__ecp_nistz256_mul_montx,\@abi-omnipotent
962.align	32
963__ecp_nistz256_mul_montx:
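	# Same algorithm as __ecp_nistz256_mul_montq, rewritten around mulx and
	# the two independent adcx/adox carry chains (BMI2/ADX).  The a_ptr
	# passed in is biased by -128, hence the +128 displacements below.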
964	########################################################################
965	# Multiply by b[0]
966	mulx	$acc1, $acc0, $acc1
967	mulx	$acc2, $t0, $acc2
968	mov	\$32, $poly1
969	xor	$acc5, $acc5		# cf=0
970	mulx	$acc3, $t1, $acc3
971	mov	.Lpoly+8*3(%rip), $poly3
972	adc	$t0, $acc1
973	mulx	$acc4, $t0, $acc4
974	 mov	$acc0, %rdx
975	adc	$t1, $acc2
976	 shlx	$poly1,$acc0,$t1
977	adc	$t0, $acc3
978	 shrx	$poly1,$acc0,$t0
979	adc	\$0, $acc4
980
981	########################################################################
982	# First reduction step
983	add	$t1, $acc1
984	adc	$t0, $acc2
985
986	mulx	$poly3, $t0, $t1
987	 mov	8*1($b_ptr), %rdx
988	adc	$t0, $acc3
989	adc	$t1, $acc4
990	adc	\$0, $acc5
991	xor	$acc0, $acc0		# $acc0=0,cf=0,of=0
992
993	########################################################################
994	# Multiply by b[1]
995	mulx	8*0+128($a_ptr), $t0, $t1
996	adcx	$t0, $acc1
997	adox	$t1, $acc2
998
999	mulx	8*1+128($a_ptr), $t0, $t1
1000	adcx	$t0, $acc2
1001	adox	$t1, $acc3
1002
1003	mulx	8*2+128($a_ptr), $t0, $t1
1004	adcx	$t0, $acc3
1005	adox	$t1, $acc4
1006
1007	mulx	8*3+128($a_ptr), $t0, $t1
1008	 mov	$acc1, %rdx
1009	adcx	$t0, $acc4
1010	 shlx	$poly1, $acc1, $t0
1011	adox	$t1, $acc5
1012	 shrx	$poly1, $acc1, $t1
1013
1014	adcx	$acc0, $acc5
1015	adox	$acc0, $acc0
1016	adc	\$0, $acc0
1017
1018	########################################################################
1019	# Second reduction step
1020	add	$t0, $acc2
1021	adc	$t1, $acc3
1022
1023	mulx	$poly3, $t0, $t1
1024	 mov	8*2($b_ptr), %rdx
1025	adc	$t0, $acc4
1026	adc	$t1, $acc5
1027	adc	\$0, $acc0
1028	xor	$acc1 ,$acc1		# $acc1=0,cf=0,of=0
1029
1030	########################################################################
1031	# Multiply by b[2]
1032	mulx	8*0+128($a_ptr), $t0, $t1
1033	adcx	$t0, $acc2
1034	adox	$t1, $acc3
1035
1036	mulx	8*1+128($a_ptr), $t0, $t1
1037	adcx	$t0, $acc3
1038	adox	$t1, $acc4
1039
1040	mulx	8*2+128($a_ptr), $t0, $t1
1041	adcx	$t0, $acc4
1042	adox	$t1, $acc5
1043
1044	mulx	8*3+128($a_ptr), $t0, $t1
1045	 mov	$acc2, %rdx
1046	adcx	$t0, $acc5
1047	 shlx	$poly1, $acc2, $t0
1048	adox	$t1, $acc0
1049	 shrx	$poly1, $acc2, $t1
1050
1051	adcx	$acc1, $acc0
1052	adox	$acc1, $acc1
1053	adc	\$0, $acc1
1054
1055	########################################################################
1056	# Third reduction step
1057	add	$t0, $acc3
1058	adc	$t1, $acc4
1059
1060	mulx	$poly3, $t0, $t1
1061	 mov	8*3($b_ptr), %rdx
1062	adc	$t0, $acc5
1063	adc	$t1, $acc0
1064	adc	\$0, $acc1
1065	xor	$acc2, $acc2		# $acc2=0,cf=0,of=0
1066
1067	########################################################################
1068	# Multiply by b[3]
1069	mulx	8*0+128($a_ptr), $t0, $t1
1070	adcx	$t0, $acc3
1071	adox	$t1, $acc4
1072
1073	mulx	8*1+128($a_ptr), $t0, $t1
1074	adcx	$t0, $acc4
1075	adox	$t1, $acc5
1076
1077	mulx	8*2+128($a_ptr), $t0, $t1
1078	adcx	$t0, $acc5
1079	adox	$t1, $acc0
1080
1081	mulx	8*3+128($a_ptr), $t0, $t1
1082	 mov	$acc3, %rdx
1083	adcx	$t0, $acc0
1084	 shlx	$poly1, $acc3, $t0
1085	adox	$t1, $acc1
1086	 shrx	$poly1, $acc3, $t1
1087
1088	adcx	$acc2, $acc1
1089	adox	$acc2, $acc2
1090	adc	\$0, $acc2
1091
1092	########################################################################
1093	# Fourth reduction step
1094	add	$t0, $acc4
1095	adc	$t1, $acc5
1096
1097	mulx	$poly3, $t0, $t1
1098	 mov	$acc4, $t2
1099	mov	.Lpoly+8*1(%rip), $poly1
1100	adc	$t0, $acc0
1101	 mov	$acc5, $t3
1102	adc	$t1, $acc1
1103	adc	\$0, $acc2
1104
1105	########################################################################
1106	# Branch-less conditional subtraction of P
1107	xor	%eax, %eax
1108	 mov	$acc0, $t0
1109	sbb	\$-1, $acc4		# .Lpoly[0]
1110	sbb	$poly1, $acc5		# .Lpoly[1]
1111	sbb	\$0, $acc0		# .Lpoly[2]
1112	 mov	$acc1, $t1
1113	sbb	$poly3, $acc1		# .Lpoly[3]
1114	sbb	\$0, $acc2
1115
1116	cmovc	$t2, $acc4
1117	cmovc	$t3, $acc5
1118	mov	$acc4, 8*0($r_ptr)
1119	cmovc	$t0, $acc0
1120	mov	$acc5, 8*1($r_ptr)
1121	cmovc	$t1, $acc1
1122	mov	$acc0, 8*2($r_ptr)
1123	mov	$acc1, 8*3($r_ptr)
1124
1125	ret
1126.size	__ecp_nistz256_mul_montx,.-__ecp_nistz256_mul_montx
1127
1128.type	__ecp_nistz256_sqr_montx,\@abi-omnipotent
1129.align	32
1130__ecp_nistz256_sqr_montx:
1131	mulx	$acc6, $acc1, $acc2	# a[0]*a[1]
1132	mulx	$acc7, $t0, $acc3	# a[0]*a[2]
1133	xor	%eax, %eax
1134	adc	$t0, $acc2
1135	mulx	$acc0, $t1, $acc4	# a[0]*a[3]
1136	 mov	$acc6, %rdx
1137	adc	$t1, $acc3
1138	adc	\$0, $acc4
1139	xor	$acc5, $acc5		# $acc5=0,cf=0,of=0
1140
1141	#################################
1142	mulx	$acc7, $t0, $t1		# a[1]*a[2]
1143	adcx	$t0, $acc3
1144	adox	$t1, $acc4
1145
1146	mulx	$acc0, $t0, $t1		# a[1]*a[3]
1147	 mov	$acc7, %rdx
1148	adcx	$t0, $acc4
1149	adox	$t1, $acc5
1150	adc	\$0, $acc5
1151
1152	#################################
1153	mulx	$acc0, $t0, $acc6	# a[2]*a[3]
1154	 mov	8*0+128($a_ptr), %rdx
1155	xor	$acc7, $acc7		# $acc7=0,cf=0,of=0
1156	 adcx	$acc1, $acc1		# acc1:6<<1
1157	adox	$t0, $acc5
1158	 adcx	$acc2, $acc2
1159	adox	$acc7, $acc6		# of=0
1160
1161	mulx	%rdx, $acc0, $t1
1162	mov	8*1+128($a_ptr), %rdx
1163	 adcx	$acc3, $acc3
1164	adox	$t1, $acc1
1165	 adcx	$acc4, $acc4
1166	mulx	%rdx, $t0, $t4
1167	mov	8*2+128($a_ptr), %rdx
1168	 adcx	$acc5, $acc5
1169	adox	$t0, $acc2
1170	 adcx	$acc6, $acc6
1171	.byte	0x67
1172	mulx	%rdx, $t0, $t1
1173	mov	8*3+128($a_ptr), %rdx
1174	adox	$t4, $acc3
1175	 adcx	$acc7, $acc7
1176	adox	$t0, $acc4
1177	 mov	\$32, $a_ptr
1178	adox	$t1, $acc5
1179	.byte	0x67,0x67
1180	mulx	%rdx, $t0, $t4
1181	 mov	.Lpoly+8*3(%rip), %rdx
1182	adox	$t0, $acc6
1183	 shlx	$a_ptr, $acc0, $t0
1184	adox	$t4, $acc7
1185	 shrx	$a_ptr, $acc0, $t4
1186	mov	%rdx,$t1
1187
1188	# reduction step 1
1189	add	$t0, $acc1
1190	adc	$t4, $acc2
1191
1192	mulx	$acc0, $t0, $acc0
1193	adc	$t0, $acc3
1194	 shlx	$a_ptr, $acc1, $t0
1195	adc	\$0, $acc0
1196	 shrx	$a_ptr, $acc1, $t4
1197
1198	# reduction step 2
1199	add	$t0, $acc2
1200	adc	$t4, $acc3
1201
1202	mulx	$acc1, $t0, $acc1
1203	adc	$t0, $acc0
1204	 shlx	$a_ptr, $acc2, $t0
1205	adc	\$0, $acc1
1206	 shrx	$a_ptr, $acc2, $t4
1207
1208	# reduction step 3
1209	add	$t0, $acc3
1210	adc	$t4, $acc0
1211
1212	mulx	$acc2, $t0, $acc2
1213	adc	$t0, $acc1
1214	 shlx	$a_ptr, $acc3, $t0
1215	adc	\$0, $acc2
1216	 shrx	$a_ptr, $acc3, $t4
1217
1218	# reduction step 4
1219	add	$t0, $acc0
1220	adc	$t4, $acc1
1221
1222	mulx	$acc3, $t0, $acc3
1223	adc	$t0, $acc2
1224	adc	\$0, $acc3
1225
1226	xor	$t3, $t3
1227	add	$acc0, $acc4		# accumulate upper half
1228	 mov	.Lpoly+8*1(%rip), $a_ptr
1229	adc	$acc1, $acc5
1230	 mov	$acc4, $acc0
1231	adc	$acc2, $acc6
1232	adc	$acc3, $acc7
1233	 mov	$acc5, $acc1
1234	adc	\$0, $t3
1235
1236	sub	\$-1, $acc4		# .Lpoly[0]
1237	 mov	$acc6, $acc2
1238	sbb	$a_ptr, $acc5		# .Lpoly[1]
1239	sbb	\$0, $acc6		# .Lpoly[2]
1240	 mov	$acc7, $acc3
1241	sbb	$t1, $acc7		# .Lpoly[3]
1242	sbb	\$0, $t3
1243
1244	cmovc	$acc0, $acc4
1245	cmovc	$acc1, $acc5
1246	mov	$acc4, 8*0($r_ptr)
1247	cmovc	$acc2, $acc6
1248	mov	$acc5, 8*1($r_ptr)
1249	cmovc	$acc3, $acc7
1250	mov	$acc6, 8*2($r_ptr)
1251	mov	$acc7, 8*3($r_ptr)
1252
1253	ret
1254.size	__ecp_nistz256_sqr_montx,.-__ecp_nistz256_sqr_montx
1255___
1256}
1257}
1258{
1259my ($r_ptr,$in_ptr)=("%rdi","%rsi");
1260my ($acc0,$acc1,$acc2,$acc3)=map("%r$_",(8..11));
1261my ($t0,$t1,$t2)=("%rcx","%r12","%r13");
1262
1263$code.=<<___;
1264################################################################################
1265# void ecp_nistz256_from_mont(
1266#   uint64_t res[4],
1267#   uint64_t in[4]);
1268# This one performs Montgomery multiplication by 1, so we only need the reduction
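# i.e. res = in*2^-256 mod p, undoing the to_mont conversion: with the second
# operand equal to 1 every multiplication step degenerates and only the four
# reduction iterations below are left.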
1269
1270.globl	ecp_nistz256_from_mont
1271.type	ecp_nistz256_from_mont,\@function,2
1272.align	32
1273ecp_nistz256_from_mont:
1274	push	%r12
1275	push	%r13
1276
1277	mov	8*0($in_ptr), %rax
1278	mov	.Lpoly+8*3(%rip), $t2
1279	mov	8*1($in_ptr), $acc1
1280	mov	8*2($in_ptr), $acc2
1281	mov	8*3($in_ptr), $acc3
1282	mov	%rax, $acc0
1283	mov	.Lpoly+8*1(%rip), $t1
1284
1285	#########################################
1286	# First iteration
1287	mov	%rax, $t0
1288	shl	\$32, $acc0
1289	mulq	$t2
1290	shr	\$32, $t0
1291	add	$acc0, $acc1
1292	adc	$t0, $acc2
1293	adc	%rax, $acc3
1294	 mov	$acc1, %rax
1295	adc	\$0, %rdx
1296
1297	#########################################
1298	# Second iteration
1299	mov	$acc1, $t0
1300	shl	\$32, $acc1
1301	mov	%rdx, $acc0
1302	mulq	$t2
1303	shr	\$32, $t0
1304	add	$acc1, $acc2
1305	adc	$t0, $acc3
1306	adc	%rax, $acc0
1307	 mov	$acc2, %rax
1308	adc	\$0, %rdx
1309
1310	##########################################
1311	# Third iteration
1312	mov	$acc2, $t0
1313	shl	\$32, $acc2
1314	mov	%rdx, $acc1
1315	mulq	$t2
1316	shr	\$32, $t0
1317	add	$acc2, $acc3
1318	adc	$t0, $acc0
1319	adc	%rax, $acc1
1320	 mov	$acc3, %rax
1321	adc	\$0, %rdx
1322
1323	###########################################
1324	# Last iteration
1325	mov	$acc3, $t0
1326	shl	\$32, $acc3
1327	mov	%rdx, $acc2
1328	mulq	$t2
1329	shr	\$32, $t0
1330	add	$acc3, $acc0
1331	adc	$t0, $acc1
1332	 mov	$acc0, $t0
1333	adc	%rax, $acc2
1334	 mov	$acc1, $in_ptr
1335	adc	\$0, %rdx
1336
1337	###########################################
1338	# Branch-less conditional subtraction
1339	sub	\$-1, $acc0
1340	 mov	$acc2, %rax
1341	sbb	$t1, $acc1
1342	sbb	\$0, $acc2
1343	 mov	%rdx, $acc3
1344	sbb	$t2, %rdx
1345	sbb	$t2, $t2
1346
1347	cmovnz	$t0, $acc0
1348	cmovnz	$in_ptr, $acc1
1349	mov	$acc0, 8*0($r_ptr)
1350	cmovnz	%rax, $acc2
1351	mov	$acc1, 8*1($r_ptr)
1352	cmovz	%rdx, $acc3
1353	mov	$acc2, 8*2($r_ptr)
1354	mov	$acc3, 8*3($r_ptr)
1355
1356	pop	%r13
1357	pop	%r12
1358	ret
1359.size	ecp_nistz256_from_mont,.-ecp_nistz256_from_mont
1360___
1361}
1362{
1363my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx");
1364my ($ONE,$INDEX,$Ra,$Rb,$Rc,$Rd,$Re,$Rf)=map("%xmm$_",(0..7));
1365my ($M0,$T0a,$T0b,$T0c,$T0d,$T0e,$T0f,$TMP0)=map("%xmm$_",(8..15));
1366my ($M1,$T2a,$T2b,$TMP2,$M2,$T2a,$T2b,$TMP2)=map("%xmm$_",(8..15));
1367
1368$code.=<<___;
1369################################################################################
1370# void ecp_nistz256_select_w5(uint64_t *val, uint64_t *in_t, int index);
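# Constant-time lookup: entry "index" (1..16) is picked from a table of 16
# Jacobian points of 96 bytes each.  Every entry is read and masked with a
# pcmpeqd comparison against the broadcast index, so the memory access
# pattern is independent of the secret index; index 0 yields all zeros.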
1371.globl	ecp_nistz256_select_w5
1372.type	ecp_nistz256_select_w5,\@abi-omnipotent
1373.align	32
1374ecp_nistz256_select_w5:
1375___
1376$code.=<<___	if ($avx>1);
1377	mov	OPENSSL_ia32cap_P+8(%rip), %eax
1378	test	\$`1<<5`, %eax
1379	jnz	.Lavx2_select_w5
1380___
1381$code.=<<___	if ($win64);
1382	lea	-0x88(%rsp), %rax
1383.LSEH_begin_ecp_nistz256_select_w5:
1384	.byte	0x48,0x8d,0x60,0xe0		#lea	-0x20(%rax), %rsp
1385	.byte	0x0f,0x29,0x70,0xe0		#movaps	%xmm6, -0x20(%rax)
1386	.byte	0x0f,0x29,0x78,0xf0		#movaps	%xmm7, -0x10(%rax)
1387	.byte	0x44,0x0f,0x29,0x00		#movaps	%xmm8, 0(%rax)
1388	.byte	0x44,0x0f,0x29,0x48,0x10	#movaps	%xmm9, 0x10(%rax)
1389	.byte	0x44,0x0f,0x29,0x50,0x20	#movaps	%xmm10, 0x20(%rax)
1390	.byte	0x44,0x0f,0x29,0x58,0x30	#movaps	%xmm11, 0x30(%rax)
1391	.byte	0x44,0x0f,0x29,0x60,0x40	#movaps	%xmm12, 0x40(%rax)
1392	.byte	0x44,0x0f,0x29,0x68,0x50	#movaps	%xmm13, 0x50(%rax)
1393	.byte	0x44,0x0f,0x29,0x70,0x60	#movaps	%xmm14, 0x60(%rax)
1394	.byte	0x44,0x0f,0x29,0x78,0x70	#movaps	%xmm15, 0x70(%rax)
1395___
1396$code.=<<___;
1397	movdqa	.LOne(%rip), $ONE
1398	movd	$index, $INDEX
1399
1400	pxor	$Ra, $Ra
1401	pxor	$Rb, $Rb
1402	pxor	$Rc, $Rc
1403	pxor	$Rd, $Rd
1404	pxor	$Re, $Re
1405	pxor	$Rf, $Rf
1406
1407	movdqa	$ONE, $M0
1408	pshufd	\$0, $INDEX, $INDEX
1409
1410	mov	\$16, %rax
1411.Lselect_loop_sse_w5:
1412
1413	movdqa	$M0, $TMP0
1414	paddd	$ONE, $M0
1415	pcmpeqd $INDEX, $TMP0
1416
1417	movdqa	16*0($in_t), $T0a
1418	movdqa	16*1($in_t), $T0b
1419	movdqa	16*2($in_t), $T0c
1420	movdqa	16*3($in_t), $T0d
1421	movdqa	16*4($in_t), $T0e
1422	movdqa	16*5($in_t), $T0f
1423	lea 16*6($in_t), $in_t
1424
1425	pand	$TMP0, $T0a
1426	pand	$TMP0, $T0b
1427	por	$T0a, $Ra
1428	pand	$TMP0, $T0c
1429	por	$T0b, $Rb
1430	pand	$TMP0, $T0d
1431	por	$T0c, $Rc
1432	pand	$TMP0, $T0e
1433	por	$T0d, $Rd
1434	pand	$TMP0, $T0f
1435	por	$T0e, $Re
1436	por	$T0f, $Rf
1437
1438	dec	%rax
1439	jnz	.Lselect_loop_sse_w5
1440
1441	movdqu	$Ra, 16*0($val)
1442	movdqu	$Rb, 16*1($val)
1443	movdqu	$Rc, 16*2($val)
1444	movdqu	$Rd, 16*3($val)
1445	movdqu	$Re, 16*4($val)
1446	movdqu	$Rf, 16*5($val)
1447___
1448$code.=<<___	if ($win64);
1449	movaps	(%rsp), %xmm6
1450	movaps	0x10(%rsp), %xmm7
1451	movaps	0x20(%rsp), %xmm8
1452	movaps	0x30(%rsp), %xmm9
1453	movaps	0x40(%rsp), %xmm10
1454	movaps	0x50(%rsp), %xmm11
1455	movaps	0x60(%rsp), %xmm12
1456	movaps	0x70(%rsp), %xmm13
1457	movaps	0x80(%rsp), %xmm14
1458	movaps	0x90(%rsp), %xmm15
1459	lea	0xa8(%rsp), %rsp
1460.LSEH_end_ecp_nistz256_select_w5:
1461___
1462$code.=<<___;
1463	ret
1464.size	ecp_nistz256_select_w5,.-ecp_nistz256_select_w5
1465
1466################################################################################
1467# void ecp_nistz256_select_w7(uint64_t *val, uint64_t *in_t, int index);
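# Same constant-time pattern for the w=7 table: 64 affine points of 64 bytes
# each are all read and combined under pcmpeqd masks, again keeping the
# access pattern independent of the secret index.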
1468.globl	ecp_nistz256_select_w7
1469.type	ecp_nistz256_select_w7,\@abi-omnipotent
1470.align	32
1471ecp_nistz256_select_w7:
1472___
1473$code.=<<___	if ($avx>1);
1474	mov	OPENSSL_ia32cap_P+8(%rip), %eax
1475	test	\$`1<<5`, %eax
1476	jnz	.Lavx2_select_w7
1477___
1478$code.=<<___	if ($win64);
1479	lea	-0x88(%rsp), %rax
1480.LSEH_begin_ecp_nistz256_select_w7:
1481	.byte	0x48,0x8d,0x60,0xe0		#lea	-0x20(%rax), %rsp
1482	.byte	0x0f,0x29,0x70,0xe0		#movaps	%xmm6, -0x20(%rax)
1483	.byte	0x0f,0x29,0x78,0xf0		#movaps	%xmm7, -0x10(%rax)
1484	.byte	0x44,0x0f,0x29,0x00		#movaps	%xmm8, 0(%rax)
1485	.byte	0x44,0x0f,0x29,0x48,0x10	#movaps	%xmm9, 0x10(%rax)
1486	.byte	0x44,0x0f,0x29,0x50,0x20	#movaps	%xmm10, 0x20(%rax)
1487	.byte	0x44,0x0f,0x29,0x58,0x30	#movaps	%xmm11, 0x30(%rax)
1488	.byte	0x44,0x0f,0x29,0x60,0x40	#movaps	%xmm12, 0x40(%rax)
1489	.byte	0x44,0x0f,0x29,0x68,0x50	#movaps	%xmm13, 0x50(%rax)
1490	.byte	0x44,0x0f,0x29,0x70,0x60	#movaps	%xmm14, 0x60(%rax)
1491	.byte	0x44,0x0f,0x29,0x78,0x70	#movaps	%xmm15, 0x70(%rax)
1492___
1493$code.=<<___;
1494	movdqa	.LOne(%rip), $M0
1495	movd	$index, $INDEX
1496
1497	pxor	$Ra, $Ra
1498	pxor	$Rb, $Rb
1499	pxor	$Rc, $Rc
1500	pxor	$Rd, $Rd
1501
1502	movdqa	$M0, $ONE
1503	pshufd	\$0, $INDEX, $INDEX
1504	mov	\$64, %rax
1505
1506.Lselect_loop_sse_w7:
1507	movdqa	$M0, $TMP0
1508	paddd	$ONE, $M0
1509	movdqa	16*0($in_t), $T0a
1510	movdqa	16*1($in_t), $T0b
1511	pcmpeqd	$INDEX, $TMP0
1512	movdqa	16*2($in_t), $T0c
1513	movdqa	16*3($in_t), $T0d
1514	lea	16*4($in_t), $in_t
1515
1516	pand	$TMP0, $T0a
1517	pand	$TMP0, $T0b
1518	por	$T0a, $Ra
1519	pand	$TMP0, $T0c
1520	por	$T0b, $Rb
1521	pand	$TMP0, $T0d
1522	por	$T0c, $Rc
1523	prefetcht0	255($in_t)
1524	por	$T0d, $Rd
1525
1526	dec	%rax
1527	jnz	.Lselect_loop_sse_w7
1528
1529	movdqu	$Ra, 16*0($val)
1530	movdqu	$Rb, 16*1($val)
1531	movdqu	$Rc, 16*2($val)
1532	movdqu	$Rd, 16*3($val)
1533___
1534$code.=<<___	if ($win64);
1535	movaps	(%rsp), %xmm6
1536	movaps	0x10(%rsp), %xmm7
1537	movaps	0x20(%rsp), %xmm8
1538	movaps	0x30(%rsp), %xmm9
1539	movaps	0x40(%rsp), %xmm10
1540	movaps	0x50(%rsp), %xmm11
1541	movaps	0x60(%rsp), %xmm12
1542	movaps	0x70(%rsp), %xmm13
1543	movaps	0x80(%rsp), %xmm14
1544	movaps	0x90(%rsp), %xmm15
1545	lea	0xa8(%rsp), %rsp
1546.LSEH_end_ecp_nistz256_select_w7:
1547___
1548$code.=<<___;
1549	ret
1550.size	ecp_nistz256_select_w7,.-ecp_nistz256_select_w7
1551___
1552}
1553if ($avx>1) {
1554my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx");
1555my ($TWO,$INDEX,$Ra,$Rb,$Rc)=map("%ymm$_",(0..4));
1556my ($M0,$T0a,$T0b,$T0c,$TMP0)=map("%ymm$_",(5..9));
1557my ($M1,$T1a,$T1b,$T1c,$TMP1)=map("%ymm$_",(10..14));
1558
1559$code.=<<___;
1560################################################################################
1561# void ecp_nistz256_avx2_select_w5(uint64_t *val, uint64_t *in_t, int index);
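# AVX2 variant of the w=5 lookup: two table entries are examined per
# iteration (masks in M0/M1, both advanced by .LTwo), so eight iterations
# cover all 16 points.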
1562.type	ecp_nistz256_avx2_select_w5,\@abi-omnipotent
1563.align	32
1564ecp_nistz256_avx2_select_w5:
1565.Lavx2_select_w5:
1566	vzeroupper
1567___
1568$code.=<<___	if ($win64);
1569	lea	-0x88(%rsp), %rax
1570.LSEH_begin_ecp_nistz256_avx2_select_w5:
1571	.byte	0x48,0x8d,0x60,0xe0		#lea	-0x20(%rax), %rsp
1572	.byte	0xc5,0xf8,0x29,0x70,0xe0	#vmovaps %xmm6, -0x20(%rax)
1573	.byte	0xc5,0xf8,0x29,0x78,0xf0	#vmovaps %xmm7, -0x10(%rax)
1574	.byte	0xc5,0x78,0x29,0x40,0x00	#vmovaps %xmm8, 0(%rax)
1575	.byte	0xc5,0x78,0x29,0x48,0x10	#vmovaps %xmm9, 0x10(%rax)
1576	.byte	0xc5,0x78,0x29,0x50,0x20	#vmovaps %xmm10, 0x20(%rax)
1577	.byte	0xc5,0x78,0x29,0x58,0x30	#vmovaps %xmm11, 0x30(%rax)
1578	.byte	0xc5,0x78,0x29,0x60,0x40	#vmovaps %xmm12, 0x40(%rax)
1579	.byte	0xc5,0x78,0x29,0x68,0x50	#vmovaps %xmm13, 0x50(%rax)
1580	.byte	0xc5,0x78,0x29,0x70,0x60	#vmovaps %xmm14, 0x60(%rax)
1581	.byte	0xc5,0x78,0x29,0x78,0x70	#vmovaps %xmm15, 0x70(%rax)
1582___
1583$code.=<<___;
1584	vmovdqa	.LTwo(%rip), $TWO
1585
1586	vpxor	$Ra, $Ra, $Ra
1587	vpxor	$Rb, $Rb, $Rb
1588	vpxor	$Rc, $Rc, $Rc
1589
1590	vmovdqa .LOne(%rip), $M0
1591	vmovdqa .LTwo(%rip), $M1
1592
1593	vmovd	$index, %xmm1
1594	vpermd	$INDEX, $Ra, $INDEX
1595
1596	mov	\$8, %rax
1597.Lselect_loop_avx2_w5:
1598
1599	vmovdqa	32*0($in_t), $T0a
1600	vmovdqa	32*1($in_t), $T0b
1601	vmovdqa	32*2($in_t), $T0c
1602
1603	vmovdqa	32*3($in_t), $T1a
1604	vmovdqa	32*4($in_t), $T1b
1605	vmovdqa	32*5($in_t), $T1c
1606
1607	vpcmpeqd	$INDEX, $M0, $TMP0
1608	vpcmpeqd	$INDEX, $M1, $TMP1
1609
1610	vpaddd	$TWO, $M0, $M0
1611	vpaddd	$TWO, $M1, $M1
1612	lea	32*6($in_t), $in_t
1613
1614	vpand	$TMP0, $T0a, $T0a
1615	vpand	$TMP0, $T0b, $T0b
1616	vpand	$TMP0, $T0c, $T0c
1617	vpand	$TMP1, $T1a, $T1a
1618	vpand	$TMP1, $T1b, $T1b
1619	vpand	$TMP1, $T1c, $T1c
1620
1621	vpxor	$T0a, $Ra, $Ra
1622	vpxor	$T0b, $Rb, $Rb
1623	vpxor	$T0c, $Rc, $Rc
1624	vpxor	$T1a, $Ra, $Ra
1625	vpxor	$T1b, $Rb, $Rb
1626	vpxor	$T1c, $Rc, $Rc
1627
1628	dec %rax
1629	jnz .Lselect_loop_avx2_w5
1630
1631	vmovdqu $Ra, 32*0($val)
1632	vmovdqu $Rb, 32*1($val)
1633	vmovdqu $Rc, 32*2($val)
1634	vzeroupper
1635___
1636$code.=<<___	if ($win64);
1637	movaps	(%rsp), %xmm6
1638	movaps	0x10(%rsp), %xmm7
1639	movaps	0x20(%rsp), %xmm8
1640	movaps	0x30(%rsp), %xmm9
1641	movaps	0x40(%rsp), %xmm10
1642	movaps	0x50(%rsp), %xmm11
1643	movaps	0x60(%rsp), %xmm12
1644	movaps	0x70(%rsp), %xmm13
1645	movaps	0x80(%rsp), %xmm14
1646	movaps	0x90(%rsp), %xmm15
1647	lea	0xa8(%rsp), %rsp
1648.LSEH_end_ecp_nistz256_avx2_select_w5:
1649___
1650$code.=<<___;
1651	ret
1652.size	ecp_nistz256_avx2_select_w5,.-ecp_nistz256_avx2_select_w5
1653___
1654}
1655if ($avx>1) {
1656my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx");
1657my ($THREE,$INDEX,$Ra,$Rb)=map("%ymm$_",(0..3));
1658my ($M0,$T0a,$T0b,$TMP0)=map("%ymm$_",(4..7));
1659my ($M1,$T1a,$T1b,$TMP1)=map("%ymm$_",(8..11));
1660my ($M2,$T2a,$T2b,$TMP2)=map("%ymm$_",(12..15));
1661
1662$code.=<<___;
1663
1664################################################################################
1665# void ecp_nistz256_avx2_select_w7(uint64_t *val, uint64_t *in_t, int index);
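# AVX2 variant of the w=7 lookup: three entries per iteration under the
# M0/M1/M2 masks (each advanced by .LThree), 21 iterations plus one extra
# entry handled after the loop, covering all 64 points.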
1666.globl	ecp_nistz256_avx2_select_w7
1667.type	ecp_nistz256_avx2_select_w7,\@abi-omnipotent
1668.align	32
1669ecp_nistz256_avx2_select_w7:
1670.Lavx2_select_w7:
1671	vzeroupper
1672___
1673$code.=<<___	if ($win64);
1674	lea	-0x88(%rsp), %rax
1675.LSEH_begin_ecp_nistz256_avx2_select_w7:
1676	.byte	0x48,0x8d,0x60,0xe0		#lea	-0x20(%rax), %rsp
1677	.byte	0xc5,0xf8,0x29,0x70,0xe0	#vmovaps %xmm6, -0x20(%rax)
1678	.byte	0xc5,0xf8,0x29,0x78,0xf0	#vmovaps %xmm7, -0x10(%rax)
1679	.byte	0xc5,0x78,0x29,0x40,0x00	#vmovaps %xmm8, 0(%rax)
1680	.byte	0xc5,0x78,0x29,0x48,0x10	#vmovaps %xmm9, 0x10(%rax)
1681	.byte	0xc5,0x78,0x29,0x50,0x20	#vmovaps %xmm10, 0x20(%rax)
1682	.byte	0xc5,0x78,0x29,0x58,0x30	#vmovaps %xmm11, 0x30(%rax)
1683	.byte	0xc5,0x78,0x29,0x60,0x40	#vmovaps %xmm12, 0x40(%rax)
1684	.byte	0xc5,0x78,0x29,0x68,0x50	#vmovaps %xmm13, 0x50(%rax)
1685	.byte	0xc5,0x78,0x29,0x70,0x60	#vmovaps %xmm14, 0x60(%rax)
1686	.byte	0xc5,0x78,0x29,0x78,0x70	#vmovaps %xmm15, 0x70(%rax)
1687___
1688$code.=<<___;
1689	vmovdqa	.LThree(%rip), $THREE
1690
1691	vpxor	$Ra, $Ra, $Ra
1692	vpxor	$Rb, $Rb, $Rb
1693
1694	vmovdqa .LOne(%rip), $M0
1695	vmovdqa .LTwo(%rip), $M1
1696	vmovdqa .LThree(%rip), $M2
1697
1698	vmovd	$index, %xmm1
1699	vpermd	$INDEX, $Ra, $INDEX
1700	# Skip index = 0, because it is implicitly the point at infinity
1701
1702	mov	\$21, %rax
1703.Lselect_loop_avx2_w7:
1704
1705	vmovdqa	32*0($in_t), $T0a
1706	vmovdqa	32*1($in_t), $T0b
1707
1708	vmovdqa	32*2($in_t), $T1a
1709	vmovdqa	32*3($in_t), $T1b
1710
1711	vmovdqa	32*4($in_t), $T2a
1712	vmovdqa	32*5($in_t), $T2b
1713
1714	vpcmpeqd	$INDEX, $M0, $TMP0
1715	vpcmpeqd	$INDEX, $M1, $TMP1
1716	vpcmpeqd	$INDEX, $M2, $TMP2
1717
1718	vpaddd	$THREE, $M0, $M0
1719	vpaddd	$THREE, $M1, $M1
1720	vpaddd	$THREE, $M2, $M2
1721	lea	32*6($in_t), $in_t
1722
1723	vpand	$TMP0, $T0a, $T0a
1724	vpand	$TMP0, $T0b, $T0b
1725	vpand	$TMP1, $T1a, $T1a
1726	vpand	$TMP1, $T1b, $T1b
1727	vpand	$TMP2, $T2a, $T2a
1728	vpand	$TMP2, $T2b, $T2b
1729
1730	vpxor	$T0a, $Ra, $Ra
1731	vpxor	$T0b, $Rb, $Rb
1732	vpxor	$T1a, $Ra, $Ra
1733	vpxor	$T1b, $Rb, $Rb
1734	vpxor	$T2a, $Ra, $Ra
1735	vpxor	$T2b, $Rb, $Rb
1736
1737	dec %rax
1738	jnz .Lselect_loop_avx2_w7
1739
1740
1741	vmovdqa	32*0($in_t), $T0a
1742	vmovdqa	32*1($in_t), $T0b
1743
1744	vpcmpeqd	$INDEX, $M0, $TMP0
1745
1746	vpand	$TMP0, $T0a, $T0a
1747	vpand	$TMP0, $T0b, $T0b
1748
1749	vpxor	$T0a, $Ra, $Ra
1750	vpxor	$T0b, $Rb, $Rb
1751
1752	vmovdqu $Ra, 32*0($val)
1753	vmovdqu $Rb, 32*1($val)
1754	vzeroupper
1755___
1756$code.=<<___	if ($win64);
1757	movaps	(%rsp), %xmm6
1758	movaps	0x10(%rsp), %xmm7
1759	movaps	0x20(%rsp), %xmm8
1760	movaps	0x30(%rsp), %xmm9
1761	movaps	0x40(%rsp), %xmm10
1762	movaps	0x50(%rsp), %xmm11
1763	movaps	0x60(%rsp), %xmm12
1764	movaps	0x70(%rsp), %xmm13
1765	movaps	0x80(%rsp), %xmm14
1766	movaps	0x90(%rsp), %xmm15
1767	lea	0xa8(%rsp), %rsp
1768.LSEH_end_ecp_nistz256_avx2_select_w7:
1769___
1770$code.=<<___;
1771	ret
1772.size	ecp_nistz256_avx2_select_w7,.-ecp_nistz256_avx2_select_w7
1773___
1774} else {
1775$code.=<<___;
1776.globl	ecp_nistz256_avx2_select_w7
1777.type	ecp_nistz256_avx2_select_w7,\@function,3
1778.align	32
1779ecp_nistz256_avx2_select_w7:
1780	.byte	0x0f,0x0b	# ud2
1781	ret
1782.size	ecp_nistz256_avx2_select_w7,.-ecp_nistz256_avx2_select_w7
1783___
1784}
1785{{{
1786########################################################################
1787# This block implements the higher-level point_double, point_add and
1788# point_add_affine operations. The key to performance in this case is to
1789# allow the out-of-order execution logic to overlap computations from the
1790# next step with tail processing from the current step. By using a tailored
1791# calling sequence we minimize inter-step overhead and give the processor a
1792# better shot at overlapping operations...
1793#
1794# You will notice that input data is copied to the stack. The trouble is
1795# that there are no registers to spare for holding the original pointers,
1796# and reloading those pointers would create undesired dependencies on the
1797# effective-address calculation paths. In other words, the copying is done
1798# to favour out-of-order execution logic.
1799#						<appro@openssl.org>
1800
1801my ($r_ptr,$a_ptr,$b_org,$b_ptr)=("%rdi","%rsi","%rdx","%rbx");
1802my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15));
1803my ($t0,$t1,$t2,$t3,$t4)=("%rax","%rbp","%rcx",$acc4,$acc4);
1804my ($poly1,$poly3)=($acc6,$acc7);
1805
1806sub load_for_mul () {
1807my ($a,$b,$src0) = @_;
1808my $bias = $src0 eq "%rax" ? 0 : -128;
1809
1810"	mov	$b, $src0
1811	lea	$b, $b_ptr
1812	mov	8*0+$a, $acc1
1813	mov	8*1+$a, $acc2
1814	lea	$bias+$a, $a_ptr
1815	mov	8*2+$a, $acc3
1816	mov	8*3+$a, $acc4"
1817}
1818
1819sub load_for_sqr () {
1820my ($a,$src0) = @_;
1821my $bias = $src0 eq "%rax" ? 0 : -128;
1822
1823"	mov	8*0+$a, $src0
1824	mov	8*1+$a, $acc6
1825	lea	$bias+$a, $a_ptr
1826	mov	8*2+$a, $acc7
1827	mov	8*3+$a, $acc0"
1828}
1829
1830									{
1831########################################################################
1832# operate in 4-5-0-1 "name space" that matches multiplication output
1833#
1834my ($a0,$a1,$a2,$a3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
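# That is, the helpers below take their operand in $acc4,$acc5,$acc0,$acc1 --
# exactly where __ecp_nistz256_mul_montq/__ecp_nistz256_sqr_montq leave their
# result -- so field operations can be chained without extra register moves.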
1835
1836$code.=<<___;
1837.type	__ecp_nistz256_add_toq,\@abi-omnipotent
1838.align	32
1839__ecp_nistz256_add_toq:
1840	xor	$t4,$t4
1841	add	8*0($b_ptr), $a0
1842	adc	8*1($b_ptr), $a1
1843	 mov	$a0, $t0
1844	adc	8*2($b_ptr), $a2
1845	adc	8*3($b_ptr), $a3
1846	 mov	$a1, $t1
1847	adc	\$0, $t4
1848
1849	sub	\$-1, $a0
1850	 mov	$a2, $t2
1851	sbb	$poly1, $a1
1852	sbb	\$0, $a2
1853	 mov	$a3, $t3
1854	sbb	$poly3, $a3
1855	sbb	\$0, $t4
1856
1857	cmovc	$t0, $a0
1858	cmovc	$t1, $a1
1859	mov	$a0, 8*0($r_ptr)
1860	cmovc	$t2, $a2
1861	mov	$a1, 8*1($r_ptr)
1862	cmovc	$t3, $a3
1863	mov	$a2, 8*2($r_ptr)
1864	mov	$a3, 8*3($r_ptr)
1865
1866	ret
1867.size	__ecp_nistz256_add_toq,.-__ecp_nistz256_add_toq
1868
1869.type	__ecp_nistz256_sub_fromq,\@abi-omnipotent
1870.align	32
1871__ecp_nistz256_sub_fromq:
1872	sub	8*0($b_ptr), $a0
1873	sbb	8*1($b_ptr), $a1
1874	 mov	$a0, $t0
1875	sbb	8*2($b_ptr), $a2
1876	sbb	8*3($b_ptr), $a3
1877	 mov	$a1, $t1
1878	sbb	$t4, $t4
1879
1880	add	\$-1, $a0
1881	 mov	$a2, $t2
1882	adc	$poly1, $a1
1883	adc	\$0, $a2
1884	 mov	$a3, $t3
1885	adc	$poly3, $a3
1886	test	$t4, $t4
1887
1888	cmovz	$t0, $a0
1889	cmovz	$t1, $a1
1890	mov	$a0, 8*0($r_ptr)
1891	cmovz	$t2, $a2
1892	mov	$a1, 8*1($r_ptr)
1893	cmovz	$t3, $a3
1894	mov	$a2, 8*2($r_ptr)
1895	mov	$a3, 8*3($r_ptr)
1896
1897	ret
1898.size	__ecp_nistz256_sub_fromq,.-__ecp_nistz256_sub_fromq
1899
1900.type	__ecp_nistz256_subq,\@abi-omnipotent
1901.align	32
1902__ecp_nistz256_subq:
1903	sub	$a0, $t0
1904	sbb	$a1, $t1
1905	 mov	$t0, $a0
1906	sbb	$a2, $t2
1907	sbb	$a3, $t3
1908	 mov	$t1, $a1
1909	sbb	$t4, $t4
1910
1911	add	\$-1, $t0
1912	 mov	$t2, $a2
1913	adc	$poly1, $t1
1914	adc	\$0, $t2
1915	 mov	$t3, $a3
1916	adc	$poly3, $t3
1917	test	$t4, $t4
1918
1919	cmovnz	$t0, $a0
1920	cmovnz	$t1, $a1
1921	cmovnz	$t2, $a2
1922	cmovnz	$t3, $a3
1923
1924	ret
1925.size	__ecp_nistz256_subq,.-__ecp_nistz256_subq
1926
1927.type	__ecp_nistz256_mul_by_2q,\@abi-omnipotent
1928.align	32
1929__ecp_nistz256_mul_by_2q:
1930	xor	$t4, $t4
1931	add	$a0, $a0		# a0:a3+a0:a3
1932	adc	$a1, $a1
1933	 mov	$a0, $t0
1934	adc	$a2, $a2
1935	adc	$a3, $a3
1936	 mov	$a1, $t1
1937	adc	\$0, $t4
1938
1939	sub	\$-1, $a0
1940	 mov	$a2, $t2
1941	sbb	$poly1, $a1
1942	sbb	\$0, $a2
1943	 mov	$a3, $t3
1944	sbb	$poly3, $a3
1945	sbb	\$0, $t4
1946
1947	cmovc	$t0, $a0
1948	cmovc	$t1, $a1
1949	mov	$a0, 8*0($r_ptr)
1950	cmovc	$t2, $a2
1951	mov	$a1, 8*1($r_ptr)
1952	cmovc	$t3, $a3
1953	mov	$a2, 8*2($r_ptr)
1954	mov	$a3, 8*3($r_ptr)
1955
1956	ret
1957.size	__ecp_nistz256_mul_by_2q,.-__ecp_nistz256_mul_by_2q
1958___
1959									}
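# gen_double() emits ecp_nistz256_point_double, or the mulx/adx twin
# ecp_nistz256_point_doublex when called with "x": Jacobian point doubling
# composed from the field helpers above, using five 32-byte stack
# temporaries (S, M, Zsqr, in_x, tmp0).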
1960sub gen_double () {
1961    my $x = shift;
1962    my ($src0,$sfx,$bias);
1963    my ($S,$M,$Zsqr,$in_x,$tmp0)=map(32*$_,(0..4));
1964
1965    if ($x ne "x") {
1966	$src0 = "%rax";
1967	$sfx  = "";
1968	$bias = 0;
1969
1970$code.=<<___;
1971.globl	ecp_nistz256_point_double
1972.type	ecp_nistz256_point_double,\@function,2
1973.align	32
1974ecp_nistz256_point_double:
1975___
1976$code.=<<___	if ($addx);
1977	mov	\$0x80100, %ecx
1978	and	OPENSSL_ia32cap_P+8(%rip), %ecx
1979	cmp	\$0x80100, %ecx
1980	je	.Lpoint_doublex
1981___
1982    } else {
1983	$src0 = "%rdx";
1984	$sfx  = "x";
1985	$bias = 128;
1986
1987$code.=<<___;
1988.type	ecp_nistz256_point_doublex,\@function,2
1989.align	32
1990ecp_nistz256_point_doublex:
1991.Lpoint_doublex:
1992___
1993    }
1994$code.=<<___;
1995	push	%rbp
1996	push	%rbx
1997	push	%r12
1998	push	%r13
1999	push	%r14
2000	push	%r15
2001	sub	\$32*5+8, %rsp
2002
2003.Lpoint_double_shortcut$x:
2004	movdqu	0x00($a_ptr), %xmm0		# copy	*(P256_POINT *)$a_ptr.x
2005	mov	$a_ptr, $b_ptr			# backup copy
2006	movdqu	0x10($a_ptr), %xmm1
2007	 mov	0x20+8*0($a_ptr), $acc4		# load in_y in "5-4-0-1" order
2008	 mov	0x20+8*1($a_ptr), $acc5
2009	 mov	0x20+8*2($a_ptr), $acc0
2010	 mov	0x20+8*3($a_ptr), $acc1
2011	 mov	.Lpoly+8*1(%rip), $poly1
2012	 mov	.Lpoly+8*3(%rip), $poly3
2013	movdqa	%xmm0, $in_x(%rsp)
2014	movdqa	%xmm1, $in_x+0x10(%rsp)
2015	lea	0x20($r_ptr), $acc2
2016	lea	0x40($r_ptr), $acc3
2017	movq	$r_ptr, %xmm0
2018	movq	$acc2, %xmm1
2019	movq	$acc3, %xmm2
2020
2021	lea	$S(%rsp), $r_ptr
2022	call	__ecp_nistz256_mul_by_2$x	# p256_mul_by_2(S, in_y);
2023
2024	mov	0x40+8*0($a_ptr), $src0
2025	mov	0x40+8*1($a_ptr), $acc6
2026	mov	0x40+8*2($a_ptr), $acc7
2027	mov	0x40+8*3($a_ptr), $acc0
2028	lea	0x40-$bias($a_ptr), $a_ptr
2029	lea	$Zsqr(%rsp), $r_ptr
2030	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Zsqr, in_z);
2031
2032	`&load_for_sqr("$S(%rsp)", "$src0")`
2033	lea	$S(%rsp), $r_ptr
2034	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(S, S);
2035
2036	mov	0x20($b_ptr), $src0		# $b_ptr is still valid
2037	mov	0x40+8*0($b_ptr), $acc1
2038	mov	0x40+8*1($b_ptr), $acc2
2039	mov	0x40+8*2($b_ptr), $acc3
2040	mov	0x40+8*3($b_ptr), $acc4
2041	lea	0x40-$bias($b_ptr), $a_ptr
2042	lea	0x20($b_ptr), $b_ptr
2043	movq	%xmm2, $r_ptr
2044	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(res_z, in_z, in_y);
2045	call	__ecp_nistz256_mul_by_2$x	# p256_mul_by_2(res_z, res_z);
2046
2047	mov	$in_x+8*0(%rsp), $acc4		# "5-4-0-1" order
2048	mov	$in_x+8*1(%rsp), $acc5
2049	lea	$Zsqr(%rsp), $b_ptr
2050	mov	$in_x+8*2(%rsp), $acc0
2051	mov	$in_x+8*3(%rsp), $acc1
2052	lea	$M(%rsp), $r_ptr
2053	call	__ecp_nistz256_add_to$x		# p256_add(M, in_x, Zsqr);
2054
2055	mov	$in_x+8*0(%rsp), $acc4		# "5-4-0-1" order
2056	mov	$in_x+8*1(%rsp), $acc5
2057	lea	$Zsqr(%rsp), $b_ptr
2058	mov	$in_x+8*2(%rsp), $acc0
2059	mov	$in_x+8*3(%rsp), $acc1
2060	lea	$Zsqr(%rsp), $r_ptr
2061	call	__ecp_nistz256_sub_from$x	# p256_sub(Zsqr, in_x, Zsqr);
2062
2063	`&load_for_sqr("$S(%rsp)", "$src0")`
2064	movq	%xmm1, $r_ptr
2065	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(res_y, S);
2066___
2067{
2068######## ecp_nistz256_div_by_2(res_y, res_y); ##########################
2069# operate in 4-5-6-7 "name space" that matches squaring output
2070#
2071my ($poly1,$poly3)=($a_ptr,$t1);
2072my ($a0,$a1,$a2,$a3,$t3,$t4,$t1)=($acc4,$acc5,$acc6,$acc7,$acc0,$acc1,$acc2);
2073
2074$code.=<<___;
2075	xor	$t4, $t4
2076	mov	$a0, $t0
2077	add	\$-1, $a0
2078	mov	$a1, $t1
2079	adc	$poly1, $a1
2080	mov	$a2, $t2
2081	adc	\$0, $a2
2082	mov	$a3, $t3
2083	adc	$poly3, $a3
2084	adc	\$0, $t4
2085	xor	$a_ptr, $a_ptr		# borrow $a_ptr
2086	test	\$1, $t0
2087
2088	cmovz	$t0, $a0
2089	cmovz	$t1, $a1
2090	cmovz	$t2, $a2
2091	cmovz	$t3, $a3
2092	cmovz	$a_ptr, $t4
2093
2094	mov	$a1, $t0		# a0:a3>>1
2095	shr	\$1, $a0
2096	shl	\$63, $t0
2097	mov	$a2, $t1
2098	shr	\$1, $a1
2099	or	$t0, $a0
2100	shl	\$63, $t1
2101	mov	$a3, $t2
2102	shr	\$1, $a2
2103	or	$t1, $a1
2104	shl	\$63, $t2
2105	mov	$a0, 8*0($r_ptr)
2106	shr	\$1, $a3
2107	mov	$a1, 8*1($r_ptr)
2108	shl	\$63, $t4
2109	or	$t2, $a2
2110	or	$t4, $a3
2111	mov	$a2, 8*2($r_ptr)
2112	mov	$a3, 8*3($r_ptr)
2113___
2114}
2115$code.=<<___;
2116	`&load_for_mul("$M(%rsp)", "$Zsqr(%rsp)", "$src0")`
2117	lea	$M(%rsp), $r_ptr
2118	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(M, M, Zsqr);
2119
2120	lea	$tmp0(%rsp), $r_ptr
2121	call	__ecp_nistz256_mul_by_2$x
2122
2123	lea	$M(%rsp), $b_ptr
2124	lea	$M(%rsp), $r_ptr
2125	call	__ecp_nistz256_add_to$x		# p256_mul_by_3(M, M);
2126
2127	`&load_for_mul("$S(%rsp)", "$in_x(%rsp)", "$src0")`
2128	lea	$S(%rsp), $r_ptr
2129	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S, S, in_x);
2130
2131	lea	$tmp0(%rsp), $r_ptr
2132	call	__ecp_nistz256_mul_by_2$x	# p256_mul_by_2(tmp0, S);
2133
2134	`&load_for_sqr("$M(%rsp)", "$src0")`
2135	movq	%xmm0, $r_ptr
2136	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(res_x, M);
2137
2138	lea	$tmp0(%rsp), $b_ptr
2139	mov	$acc6, $acc0			# harmonize sqr output and sub input
2140	mov	$acc7, $acc1
2141	mov	$a_ptr, $poly1
2142	mov	$t1, $poly3
2143	call	__ecp_nistz256_sub_from$x	# p256_sub(res_x, res_x, tmp0);
2144
2145	mov	$S+8*0(%rsp), $t0
2146	mov	$S+8*1(%rsp), $t1
2147	mov	$S+8*2(%rsp), $t2
2148	mov	$S+8*3(%rsp), $acc2		# "4-5-0-1" order
2149	lea	$S(%rsp), $r_ptr
2150	call	__ecp_nistz256_sub$x		# p256_sub(S, S, res_x);
2151
2152	mov	$M(%rsp), $src0
2153	lea	$M(%rsp), $b_ptr
2154	mov	$acc4, $acc6			# harmonize sub output and mul input
2155	xor	%ecx, %ecx
2156	mov	$acc4, $S+8*0(%rsp)		# have to save:-(
2157	mov	$acc5, $acc2
2158	mov	$acc5, $S+8*1(%rsp)
2159	cmovz	$acc0, $acc3
2160	mov	$acc0, $S+8*2(%rsp)
2161	lea	$S-$bias(%rsp), $a_ptr
2162	cmovz	$acc1, $acc4
2163	mov	$acc1, $S+8*3(%rsp)
2164	mov	$acc6, $acc1
2165	lea	$S(%rsp), $r_ptr
2166	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S, S, M);
2167
2168	movq	%xmm1, $b_ptr
2169	movq	%xmm1, $r_ptr
2170	call	__ecp_nistz256_sub_from$x	# p256_sub(res_y, S, res_y);
2171
2172	add	\$32*5+8, %rsp
2173	pop	%r15
2174	pop	%r14
2175	pop	%r13
2176	pop	%r12
2177	pop	%rbx
2178	pop	%rbp
2179	ret
2180.size	ecp_nistz256_point_double$sfx,.-ecp_nistz256_point_double$sfx
2181___
2182}
2183&gen_double("q");
2184
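# gen_add() likewise emits ecp_nistz256_point_add (or ecp_nistz256_point_addx
# for the mulx/adx path): full Jacobian point addition with the usual
# U1/U2/S1/S2/H/R intermediates held in eighteen 32-byte stack slots.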
2185sub gen_add () {
2186    my $x = shift;
2187    my ($src0,$sfx,$bias);
2188    my ($H,$Hsqr,$R,$Rsqr,$Hcub,
2189	$U1,$U2,$S1,$S2,
2190	$res_x,$res_y,$res_z,
2191	$in1_x,$in1_y,$in1_z,
2192	$in2_x,$in2_y,$in2_z)=map(32*$_,(0..17));
2193    my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);
2194
2195    if ($x ne "x") {
2196	$src0 = "%rax";
2197	$sfx  = "";
2198	$bias = 0;
2199
2200$code.=<<___;
2201.globl	ecp_nistz256_point_add
2202.type	ecp_nistz256_point_add,\@function,3
2203.align	32
2204ecp_nistz256_point_add:
2205___
2206$code.=<<___	if ($addx);
2207	mov	\$0x80100, %ecx
2208	and	OPENSSL_ia32cap_P+8(%rip), %ecx
2209	cmp	\$0x80100, %ecx
2210	je	.Lpoint_addx
2211___
2212    } else {
2213	$src0 = "%rdx";
2214	$sfx  = "x";
2215	$bias = 128;
2216
2217$code.=<<___;
2218.type	ecp_nistz256_point_addx,\@function,3
2219.align	32
2220ecp_nistz256_point_addx:
2221.Lpoint_addx:
2222___
2223    }
2224$code.=<<___;
2225	push	%rbp
2226	push	%rbx
2227	push	%r12
2228	push	%r13
2229	push	%r14
2230	push	%r15
2231	sub	\$32*18+8, %rsp
2232
2233	movdqu	0x00($a_ptr), %xmm0		# copy	*(P256_POINT *)$a_ptr
2234	movdqu	0x10($a_ptr), %xmm1
2235	movdqu	0x20($a_ptr), %xmm2
2236	movdqu	0x30($a_ptr), %xmm3
2237	movdqu	0x40($a_ptr), %xmm4
2238	movdqu	0x50($a_ptr), %xmm5
2239	mov	$a_ptr, $b_ptr			# reassign
2240	mov	$b_org, $a_ptr			# reassign
2241	movdqa	%xmm0, $in1_x(%rsp)
2242	movdqa	%xmm1, $in1_x+0x10(%rsp)
2243	movdqa	%xmm2, $in1_y(%rsp)
2244	movdqa	%xmm3, $in1_y+0x10(%rsp)
2245	movdqa	%xmm4, $in1_z(%rsp)
2246	movdqa	%xmm5, $in1_z+0x10(%rsp)
2247	por	%xmm4, %xmm5
2248
2249	movdqu	0x00($a_ptr), %xmm0		# copy	*(P256_POINT *)$b_org
2250	 pshufd	\$0xb1, %xmm5, %xmm3
2251	movdqu	0x10($a_ptr), %xmm1
2252	movdqu	0x20($a_ptr), %xmm2
2253	 por	%xmm3, %xmm5
2254	movdqu	0x30($a_ptr), %xmm3
2255	 mov	0x40+8*0($a_ptr), $src0		# load original in2_z
2256	 mov	0x40+8*1($a_ptr), $acc6
2257	 mov	0x40+8*2($a_ptr), $acc7
2258	 mov	0x40+8*3($a_ptr), $acc0
2259	movdqa	%xmm0, $in2_x(%rsp)
2260	 pshufd	\$0x1e, %xmm5, %xmm4
2261	movdqa	%xmm1, $in2_x+0x10(%rsp)
2262	movdqu	0x40($a_ptr),%xmm0		# in2_z again
2263	movdqu	0x50($a_ptr),%xmm1
2264	movdqa	%xmm2, $in2_y(%rsp)
2265	movdqa	%xmm3, $in2_y+0x10(%rsp)
2266	 por	%xmm4, %xmm5
2267	 pxor	%xmm4, %xmm4
2268	por	%xmm0, %xmm1
2269	 movq	$r_ptr, %xmm0			# save $r_ptr
2270
2271	lea	0x40-$bias($a_ptr), $a_ptr	# $a_ptr is still valid
2272	 mov	$src0, $in2_z+8*0(%rsp)		# make in2_z copy
2273	 mov	$acc6, $in2_z+8*1(%rsp)
2274	 mov	$acc7, $in2_z+8*2(%rsp)
2275	 mov	$acc0, $in2_z+8*3(%rsp)
2276	lea	$Z2sqr(%rsp), $r_ptr		# Z2^2
2277	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Z2sqr, in2_z);
2278
2279	pcmpeqd	%xmm4, %xmm5
2280	pshufd	\$0xb1, %xmm1, %xmm4
2281	por	%xmm1, %xmm4
2282	pshufd	\$0, %xmm5, %xmm5		# in1infty
2283	pshufd	\$0x1e, %xmm4, %xmm3
2284	por	%xmm3, %xmm4
2285	pxor	%xmm3, %xmm3
2286	pcmpeqd	%xmm3, %xmm4
2287	pshufd	\$0, %xmm4, %xmm4		# in2infty
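	# %xmm5 is now an all-ones mask iff in1_z == 0 (the first input is the
	# point at infinity) and %xmm4 is the same mask for the second input;
	# both are used near the end of the routine to return the other point
	# unchanged when one of the inputs is at infinity.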
2288	 mov	0x40+8*0($b_ptr), $src0		# load original in1_z
2289	 mov	0x40+8*1($b_ptr), $acc6
2290	 mov	0x40+8*2($b_ptr), $acc7
2291	 mov	0x40+8*3($b_ptr), $acc0
2292	movq	$b_ptr, %xmm1
2293
2294	lea	0x40-$bias($b_ptr), $a_ptr
2295	lea	$Z1sqr(%rsp), $r_ptr		# Z1^2
2296	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Z1sqr, in1_z);
2297
2298	`&load_for_mul("$Z2sqr(%rsp)", "$in2_z(%rsp)", "$src0")`
2299	lea	$S1(%rsp), $r_ptr		# S1 = Z2^3
2300	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S1, Z2sqr, in2_z);
2301
2302	`&load_for_mul("$Z1sqr(%rsp)", "$in1_z(%rsp)", "$src0")`
2303	lea	$S2(%rsp), $r_ptr		# S2 = Z1^3
2304	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, Z1sqr, in1_z);
2305
2306	`&load_for_mul("$S1(%rsp)", "$in1_y(%rsp)", "$src0")`
2307	lea	$S1(%rsp), $r_ptr		# S1 = Y1*Z2^3
2308	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S1, S1, in1_y);
2309
2310	`&load_for_mul("$S2(%rsp)", "$in2_y(%rsp)", "$src0")`
2311	lea	$S2(%rsp), $r_ptr		# S2 = Y2*Z1^3
2312	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, S2, in2_y);
2313
2314	lea	$S1(%rsp), $b_ptr
2315	lea	$R(%rsp), $r_ptr		# R = S2 - S1
2316	call	__ecp_nistz256_sub_from$x	# p256_sub(R, S2, S1);
2317
2318	or	$acc5, $acc4			# see if result is zero
2319	movdqa	%xmm4, %xmm2
2320	or	$acc0, $acc4
2321	or	$acc1, $acc4
2322	por	%xmm5, %xmm2			# in1infty || in2infty
2323	movq	$acc4, %xmm3
2324
2325	`&load_for_mul("$Z2sqr(%rsp)", "$in1_x(%rsp)", "$src0")`
2326	lea	$U1(%rsp), $r_ptr		# U1 = X1*Z2^2
2327	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(U1, in1_x, Z2sqr);
2328
2329	`&load_for_mul("$Z1sqr(%rsp)", "$in2_x(%rsp)", "$src0")`
2330	lea	$U2(%rsp), $r_ptr		# U2 = X2*Z1^2
2331	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(U2, in2_x, Z1sqr);
2332
2333	lea	$U1(%rsp), $b_ptr
2334	lea	$H(%rsp), $r_ptr		# H = U2 - U1
2335	call	__ecp_nistz256_sub_from$x	# p256_sub(H, U2, U1);
2336
2337	or	$acc5, $acc4			# see if result is zero
2338	or	$acc0, $acc4
2339	or	$acc1, $acc4
2340
2341	.byte	0x3e				# predict taken
2342	jnz	.Ladd_proceed$x			# is_equal(U1,U2)?
2343	movq	%xmm2, $acc0
2344	movq	%xmm3, $acc1
2345	test	$acc0, $acc0
2346	jnz	.Ladd_proceed$x			# (in1infty || in2infty)?
2347	test	$acc1, $acc1
2348	jz	.Ladd_double$x			# is_equal(S1,S2)?
2349
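	# Fall-through case: neither input is at infinity, U1 == U2 but
	# S1 != S2, i.e. the inputs were P and -P, so the result is the point
	# at infinity; store an all-zero point and return.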
2350	movq	%xmm0, $r_ptr			# restore $r_ptr
2351	pxor	%xmm0, %xmm0
2352	movdqu	%xmm0, 0x00($r_ptr)
2353	movdqu	%xmm0, 0x10($r_ptr)
2354	movdqu	%xmm0, 0x20($r_ptr)
2355	movdqu	%xmm0, 0x30($r_ptr)
2356	movdqu	%xmm0, 0x40($r_ptr)
2357	movdqu	%xmm0, 0x50($r_ptr)
2358	jmp	.Ladd_done$x
2359
2360.align	32
2361.Ladd_double$x:
2362	movq	%xmm1, $a_ptr			# restore $a_ptr
2363	movq	%xmm0, $r_ptr			# restore $r_ptr
2364	add	\$`32*(18-5)`, %rsp		# difference in frame sizes
2365	jmp	.Lpoint_double_shortcut$x
2366
2367.align	32
2368.Ladd_proceed$x:
2369	`&load_for_sqr("$R(%rsp)", "$src0")`
2370	lea	$Rsqr(%rsp), $r_ptr		# R^2
2371	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Rsqr, R);
2372
2373	`&load_for_mul("$H(%rsp)", "$in1_z(%rsp)", "$src0")`
2374	lea	$res_z(%rsp), $r_ptr		# Z3 = H*Z1*Z2
2375	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(res_z, H, in1_z);
2376
2377	`&load_for_sqr("$H(%rsp)", "$src0")`
2378	lea	$Hsqr(%rsp), $r_ptr		# H^2
2379	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Hsqr, H);
2380
2381	`&load_for_mul("$res_z(%rsp)", "$in2_z(%rsp)", "$src0")`
2382	lea	$res_z(%rsp), $r_ptr		# Z3 = H*Z1*Z2
2383	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(res_z, res_z, in2_z);
2384
2385	`&load_for_mul("$Hsqr(%rsp)", "$H(%rsp)", "$src0")`
2386	lea	$Hcub(%rsp), $r_ptr		# H^3
2387	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(Hcub, Hsqr, H);
2388
2389	`&load_for_mul("$Hsqr(%rsp)", "$U1(%rsp)", "$src0")`
2390	lea	$U2(%rsp), $r_ptr		# U1*H^2
2391	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(U2, U1, Hsqr);
2392___
2393{
2394#######################################################################
2395# operate in 4-5-0-1 "name space" that matches multiplication output
2396#
2397my ($acc0,$acc1,$acc2,$acc3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
2398my ($poly1, $poly3)=($acc6,$acc7);
2399
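# ($acc0..$acc3) now alias the registers __ecp_nistz256_mul_mont leaves its
# result in, so U2 = U1*H^2 can be doubled in place: the inline code below
# replaces the commented-out mul_by_2 call, adding the value to itself,
# subtracting p, and keeping the subtracted form unless that subtraction
# borrowed.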
2400$code.=<<___;
2401	#lea	$U2(%rsp), $a_ptr
2402	#lea	$Hsqr(%rsp), $r_ptr	# 2*U1*H^2
2403	#call	__ecp_nistz256_mul_by_2	# ecp_nistz256_mul_by_2(Hsqr, U2);
2404
2405	xor	$t4, $t4
2406	add	$acc0, $acc0		# a0:a3+a0:a3
2407	lea	$Rsqr(%rsp), $a_ptr
2408	adc	$acc1, $acc1
2409	 mov	$acc0, $t0
2410	adc	$acc2, $acc2
2411	adc	$acc3, $acc3
2412	 mov	$acc1, $t1
2413	adc	\$0, $t4
2414
2415	sub	\$-1, $acc0
2416	 mov	$acc2, $t2
2417	sbb	$poly1, $acc1
2418	sbb	\$0, $acc2
2419	 mov	$acc3, $t3
2420	sbb	$poly3, $acc3
2421	sbb	\$0, $t4
2422
2423	cmovc	$t0, $acc0
2424	mov	8*0($a_ptr), $t0
2425	cmovc	$t1, $acc1
2426	mov	8*1($a_ptr), $t1
2427	cmovc	$t2, $acc2
2428	mov	8*2($a_ptr), $t2
2429	cmovc	$t3, $acc3
2430	mov	8*3($a_ptr), $t3
2431
2432	call	__ecp_nistz256_sub$x		# p256_sub(res_x, Rsqr, Hsqr);
2433
2434	lea	$Hcub(%rsp), $b_ptr
2435	lea	$res_x(%rsp), $r_ptr
2436	call	__ecp_nistz256_sub_from$x	# p256_sub(res_x, res_x, Hcub);
2437
2438	mov	$U2+8*0(%rsp), $t0
2439	mov	$U2+8*1(%rsp), $t1
2440	mov	$U2+8*2(%rsp), $t2
2441	mov	$U2+8*3(%rsp), $t3
2442	lea	$res_y(%rsp), $r_ptr
2443
2444	call	__ecp_nistz256_sub$x		# p256_sub(res_y, U2, res_x);
2445
2446	mov	$acc0, 8*0($r_ptr)		# save the result, as
2447	mov	$acc1, 8*1($r_ptr)		# __ecp_nistz256_sub doesn't store it
2448	mov	$acc2, 8*2($r_ptr)
2449	mov	$acc3, 8*3($r_ptr)
2450___
2451}
2452$code.=<<___;
2453	`&load_for_mul("$S1(%rsp)", "$Hcub(%rsp)", "$src0")`
2454	lea	$S2(%rsp), $r_ptr
2455	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, S1, Hcub);
2456
2457	`&load_for_mul("$R(%rsp)", "$res_y(%rsp)", "$src0")`
2458	lea	$res_y(%rsp), $r_ptr
2459	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(res_y, R, res_y);
2460
2461	lea	$S2(%rsp), $b_ptr
2462	lea	$res_y(%rsp), $r_ptr
2463	call	__ecp_nistz256_sub_from$x	# p256_sub(res_y, res_y, S2);
2464
2465	movq	%xmm0, $r_ptr		# restore $r_ptr
2466
2467	movdqa	%xmm5, %xmm0		# copy_conditional(res_z, in2_z, in1infty);
2468	movdqa	%xmm5, %xmm1
2469	pandn	$res_z(%rsp), %xmm0
2470	movdqa	%xmm5, %xmm2
2471	pandn	$res_z+0x10(%rsp), %xmm1
2472	movdqa	%xmm5, %xmm3
2473	pand	$in2_z(%rsp), %xmm2
2474	pand	$in2_z+0x10(%rsp), %xmm3
2475	por	%xmm0, %xmm2
2476	por	%xmm1, %xmm3
2477
2478	movdqa	%xmm4, %xmm0		# copy_conditional(res_z, in1_z, in2infty);
2479	movdqa	%xmm4, %xmm1
2480	pandn	%xmm2, %xmm0
2481	movdqa	%xmm4, %xmm2
2482	pandn	%xmm3, %xmm1
2483	movdqa	%xmm4, %xmm3
2484	pand	$in1_z(%rsp), %xmm2
2485	pand	$in1_z+0x10(%rsp), %xmm3
2486	por	%xmm0, %xmm2
2487	por	%xmm1, %xmm3
2488	movdqu	%xmm2, 0x40($r_ptr)
2489	movdqu	%xmm3, 0x50($r_ptr)
2490
2491	movdqa	%xmm5, %xmm0		# copy_conditional(res_x, in2_x, in1infty);
2492	movdqa	%xmm5, %xmm1
2493	pandn	$res_x(%rsp), %xmm0
2494	movdqa	%xmm5, %xmm2
2495	pandn	$res_x+0x10(%rsp), %xmm1
2496	movdqa	%xmm5, %xmm3
2497	pand	$in2_x(%rsp), %xmm2
2498	pand	$in2_x+0x10(%rsp), %xmm3
2499	por	%xmm0, %xmm2
2500	por	%xmm1, %xmm3
2501
2502	movdqa	%xmm4, %xmm0		# copy_conditional(res_x, in1_x, in2infty);
2503	movdqa	%xmm4, %xmm1
2504	pandn	%xmm2, %xmm0
2505	movdqa	%xmm4, %xmm2
2506	pandn	%xmm3, %xmm1
2507	movdqa	%xmm4, %xmm3
2508	pand	$in1_x(%rsp), %xmm2
2509	pand	$in1_x+0x10(%rsp), %xmm3
2510	por	%xmm0, %xmm2
2511	por	%xmm1, %xmm3
2512	movdqu	%xmm2, 0x00($r_ptr)
2513	movdqu	%xmm3, 0x10($r_ptr)
2514
2515	movdqa	%xmm5, %xmm0		# copy_conditional(res_y, in2_y, in1infty);
2516	movdqa	%xmm5, %xmm1
2517	pandn	$res_y(%rsp), %xmm0
2518	movdqa	%xmm5, %xmm2
2519	pandn	$res_y+0x10(%rsp), %xmm1
2520	movdqa	%xmm5, %xmm3
2521	pand	$in2_y(%rsp), %xmm2
2522	pand	$in2_y+0x10(%rsp), %xmm3
2523	por	%xmm0, %xmm2
2524	por	%xmm1, %xmm3
2525
2526	movdqa	%xmm4, %xmm0		# copy_conditional(res_y, in1_y, in2infty);
2527	movdqa	%xmm4, %xmm1
2528	pandn	%xmm2, %xmm0
2529	movdqa	%xmm4, %xmm2
2530	pandn	%xmm3, %xmm1
2531	movdqa	%xmm4, %xmm3
2532	pand	$in1_y(%rsp), %xmm2
2533	pand	$in1_y+0x10(%rsp), %xmm3
2534	por	%xmm0, %xmm2
2535	por	%xmm1, %xmm3
2536	movdqu	%xmm2, 0x20($r_ptr)
2537	movdqu	%xmm3, 0x30($r_ptr)
2538
2539.Ladd_done$x:
2540	add	\$32*18+8, %rsp
2541	pop	%r15
2542	pop	%r14
2543	pop	%r13
2544	pop	%r12
2545	pop	%rbx
2546	pop	%rbp
2547	ret
2548.size	ecp_nistz256_point_add$sfx,.-ecp_nistz256_point_add$sfx
2549___
2550}
2551&gen_add("q");
2552
2553sub gen_add_affine () {
2554    my $x = shift;
2555    my ($src0,$sfx,$bias);
2556    my ($U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr,
2557	$res_x,$res_y,$res_z,
2558	$in1_x,$in1_y,$in1_z,
2559	$in2_x,$in2_y)=map(32*$_,(0..14));
2560    my $Z1sqr = $S2;
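#######################################################################
# Mixed addition: the second input is affine, i.e. Z2 == 1, so U1 = X1 and
# S1 = Y1 come for free and only the remaining steps are computed:
#	U2 = X2*Z1^2,	S2 = Y2*Z1^3
#	H  = U2 - X1,	R  = S2 - Y1
#	X3 = R^2 - H^3 - 2*X1*H^2
#	Y3 = R*(X1*H^2 - X3) - Y1*H^3
#	Z3 = H*Z1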
2561
2562    if ($x ne "x") {
2563	$src0 = "%rax";
2564	$sfx  = "";
2565	$bias = 0;
2566
2567$code.=<<___;
2568.globl	ecp_nistz256_point_add_affine
2569.type	ecp_nistz256_point_add_affine,\@function,3
2570.align	32
2571ecp_nistz256_point_add_affine:
2572___
2573$code.=<<___	if ($addx);
2574	mov	\$0x80100, %ecx
2575	and	OPENSSL_ia32cap_P+8(%rip), %ecx
2576	cmp	\$0x80100, %ecx
2577	je	.Lpoint_add_affinex
2578___
2579    } else {
2580	$src0 = "%rdx";
2581	$sfx  = "x";
2582	$bias = 128;
2583
2584$code.=<<___;
2585.type	ecp_nistz256_point_add_affinex,\@function,3
2586.align	32
2587ecp_nistz256_point_add_affinex:
2588.Lpoint_add_affinex:
2589___
2590    }
2591$code.=<<___;
2592	push	%rbp
2593	push	%rbx
2594	push	%r12
2595	push	%r13
2596	push	%r14
2597	push	%r15
2598	sub	\$32*15+8, %rsp
2599
2600	movdqu	0x00($a_ptr), %xmm0	# copy	*(P256_POINT *)$a_ptr
2601	mov	$b_org, $b_ptr		# reassign
2602	movdqu	0x10($a_ptr), %xmm1
2603	movdqu	0x20($a_ptr), %xmm2
2604	movdqu	0x30($a_ptr), %xmm3
2605	movdqu	0x40($a_ptr), %xmm4
2606	movdqu	0x50($a_ptr), %xmm5
2607	 mov	0x40+8*0($a_ptr), $src0	# load original in1_z
2608	 mov	0x40+8*1($a_ptr), $acc6
2609	 mov	0x40+8*2($a_ptr), $acc7
2610	 mov	0x40+8*3($a_ptr), $acc0
2611	movdqa	%xmm0, $in1_x(%rsp)
2612	movdqa	%xmm1, $in1_x+0x10(%rsp)
2613	movdqa	%xmm2, $in1_y(%rsp)
2614	movdqa	%xmm3, $in1_y+0x10(%rsp)
2615	movdqa	%xmm4, $in1_z(%rsp)
2616	movdqa	%xmm5, $in1_z+0x10(%rsp)
2617	por	%xmm4, %xmm5
2618
2619	movdqu	0x00($b_ptr), %xmm0	# copy	*(P256_POINT_AFFINE *)$b_ptr
2620	 pshufd	\$0xb1, %xmm5, %xmm3
2621	movdqu	0x10($b_ptr), %xmm1
2622	movdqu	0x20($b_ptr), %xmm2
2623	 por	%xmm3, %xmm5
2624	movdqu	0x30($b_ptr), %xmm3
2625	movdqa	%xmm0, $in2_x(%rsp)
2626	 pshufd	\$0x1e, %xmm5, %xmm4
2627	movdqa	%xmm1, $in2_x+0x10(%rsp)
2628	por	%xmm0, %xmm1
2629	 movq	$r_ptr, %xmm0		# save $r_ptr
2630	movdqa	%xmm2, $in2_y(%rsp)
2631	movdqa	%xmm3, $in2_y+0x10(%rsp)
2632	por	%xmm2, %xmm3
2633	 por	%xmm4, %xmm5
2634	 pxor	%xmm4, %xmm4
2635	por	%xmm1, %xmm3
2636
2637	lea	0x40-$bias($a_ptr), $a_ptr	# $a_ptr is still valid
2638	lea	$Z1sqr(%rsp), $r_ptr		# Z1^2
2639	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Z1sqr, in1_z);
2640
2641	pcmpeqd	%xmm4, %xmm5
2642	pshufd	\$0xb1, %xmm3, %xmm4
2643	 mov	0x00($b_ptr), $src0		# $b_ptr is still valid
2644	 #lea	0x00($b_ptr), $b_ptr
2645	 mov	$acc4, $acc1			# harmonize sqr output and mul input
2646	por	%xmm3, %xmm4
2647	pshufd	\$0, %xmm5, %xmm5		# in1infty
2648	pshufd	\$0x1e, %xmm4, %xmm3
2649	 mov	$acc5, $acc2
2650	por	%xmm3, %xmm4
2651	pxor	%xmm3, %xmm3
2652	 mov	$acc6, $acc3
2653	pcmpeqd	%xmm3, %xmm4
2654	pshufd	\$0, %xmm4, %xmm4		# in2infty
2655
2656	lea	$Z1sqr-$bias(%rsp), $a_ptr
2657	mov	$acc7, $acc4
2658	lea	$U2(%rsp), $r_ptr		# U2 = X2*Z1^2
2659	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(U2, Z1sqr, in2_x);
2660
2661	lea	$in1_x(%rsp), $b_ptr
2662	lea	$H(%rsp), $r_ptr		# H = U2 - U1
2663	call	__ecp_nistz256_sub_from$x	# p256_sub(H, U2, in1_x);
2664
2665	`&load_for_mul("$Z1sqr(%rsp)", "$in1_z(%rsp)", "$src0")`
2666	lea	$S2(%rsp), $r_ptr		# S2 = Z1^3
2667	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, Z1sqr, in1_z);
2668
2669	`&load_for_mul("$H(%rsp)", "$in1_z(%rsp)", "$src0")`
2670	lea	$res_z(%rsp), $r_ptr		# Z3 = H*Z1*Z2
2671	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(res_z, H, in1_z);
2672
2673	`&load_for_mul("$S2(%rsp)", "$in2_y(%rsp)", "$src0")`
2674	lea	$S2(%rsp), $r_ptr		# S2 = Y2*Z1^3
2675	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, S2, in2_y);
2676
2677	lea	$in1_y(%rsp), $b_ptr
2678	lea	$R(%rsp), $r_ptr		# R = S2 - S1
2679	call	__ecp_nistz256_sub_from$x	# p256_sub(R, S2, in1_y);
2680
2681	`&load_for_sqr("$H(%rsp)", "$src0")`
2682	lea	$Hsqr(%rsp), $r_ptr		# H^2
2683	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Hsqr, H);
2684
2685	`&load_for_sqr("$R(%rsp)", "$src0")`
2686	lea	$Rsqr(%rsp), $r_ptr		# R^2
2687	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Rsqr, R);
2688
2689	`&load_for_mul("$H(%rsp)", "$Hsqr(%rsp)", "$src0")`
2690	lea	$Hcub(%rsp), $r_ptr		# H^3
2691	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(Hcub, Hsqr, H);
2692
2693	`&load_for_mul("$Hsqr(%rsp)", "$in1_x(%rsp)", "$src0")`
2694	lea	$U2(%rsp), $r_ptr		# U1*H^2
2695	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(U2, in1_x, Hsqr);
2696___
2697{
2698#######################################################################
2699# operate in 4-5-0-1 "name space" that matches multiplication output
2700#
2701my ($acc0,$acc1,$acc2,$acc3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
2702my ($poly1, $poly3)=($acc6,$acc7);
2703
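# Same register aliasing and inline doubling as in gen_add above: U2, here
# equal to X1*H^2, is doubled in place with a speculative subtraction of p
# before being subtracted from R^2.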
2704$code.=<<___;
2705	#lea	$U2(%rsp), $a_ptr
2706	#lea	$Hsqr(%rsp), $r_ptr	# 2*U1*H^2
2707	#call	__ecp_nistz256_mul_by_2	# ecp_nistz256_mul_by_2(Hsqr, U2);
2708
2709	xor	$t4, $t4
2710	add	$acc0, $acc0		# a0:a3+a0:a3
2711	lea	$Rsqr(%rsp), $a_ptr
2712	adc	$acc1, $acc1
2713	 mov	$acc0, $t0
2714	adc	$acc2, $acc2
2715	adc	$acc3, $acc3
2716	 mov	$acc1, $t1
2717	adc	\$0, $t4
2718
2719	sub	\$-1, $acc0
2720	 mov	$acc2, $t2
2721	sbb	$poly1, $acc1
2722	sbb	\$0, $acc2
2723	 mov	$acc3, $t3
2724	sbb	$poly3, $acc3
2725	sbb	\$0, $t4
2726
2727	cmovc	$t0, $acc0
2728	mov	8*0($a_ptr), $t0
2729	cmovc	$t1, $acc1
2730	mov	8*1($a_ptr), $t1
2731	cmovc	$t2, $acc2
2732	mov	8*2($a_ptr), $t2
2733	cmovc	$t3, $acc3
2734	mov	8*3($a_ptr), $t3
2735
2736	call	__ecp_nistz256_sub$x		# p256_sub(res_x, Rsqr, Hsqr);
2737
2738	lea	$Hcub(%rsp), $b_ptr
2739	lea	$res_x(%rsp), $r_ptr
2740	call	__ecp_nistz256_sub_from$x	# p256_sub(res_x, res_x, Hcub);
2741
2742	mov	$U2+8*0(%rsp), $t0
2743	mov	$U2+8*1(%rsp), $t1
2744	mov	$U2+8*2(%rsp), $t2
2745	mov	$U2+8*3(%rsp), $t3
2746	lea	$H(%rsp), $r_ptr
2747
2748	call	__ecp_nistz256_sub$x		# p256_sub(H, U2, res_x);
2749
2750	mov	$acc0, 8*0($r_ptr)		# save the result, as
2751	mov	$acc1, 8*1($r_ptr)		# __ecp_nistz256_sub doesn't store it
2752	mov	$acc2, 8*2($r_ptr)
2753	mov	$acc3, 8*3($r_ptr)
2754___
2755}
2756$code.=<<___;
2757	`&load_for_mul("$Hcub(%rsp)", "$in1_y(%rsp)", "$src0")`
2758	lea	$S2(%rsp), $r_ptr
2759	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, Hcub, in1_y);
2760
2761	`&load_for_mul("$H(%rsp)", "$R(%rsp)", "$src0")`
2762	lea	$H(%rsp), $r_ptr
2763	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(H, H, R);
2764
2765	lea	$S2(%rsp), $b_ptr
2766	lea	$res_y(%rsp), $r_ptr
2767	call	__ecp_nistz256_sub_from$x	# p256_sub(res_y, H, S2);
2768
2769	movq	%xmm0, $r_ptr		# restore $r_ptr
2770
2771	movdqa	%xmm5, %xmm0		# copy_conditional(res_z, ONE, in1infty);
2772	movdqa	%xmm5, %xmm1
2773	pandn	$res_z(%rsp), %xmm0
2774	movdqa	%xmm5, %xmm2
2775	pandn	$res_z+0x10(%rsp), %xmm1
2776	movdqa	%xmm5, %xmm3
2777	pand	.LONE_mont(%rip), %xmm2
2778	pand	.LONE_mont+0x10(%rip), %xmm3
2779	por	%xmm0, %xmm2
2780	por	%xmm1, %xmm3
2781
2782	movdqa	%xmm4, %xmm0		# copy_conditional(res_z, in1_z, in2infty);
2783	movdqa	%xmm4, %xmm1
2784	pandn	%xmm2, %xmm0
2785	movdqa	%xmm4, %xmm2
2786	pandn	%xmm3, %xmm1
2787	movdqa	%xmm4, %xmm3
2788	pand	$in1_z(%rsp), %xmm2
2789	pand	$in1_z+0x10(%rsp), %xmm3
2790	por	%xmm0, %xmm2
2791	por	%xmm1, %xmm3
2792	movdqu	%xmm2, 0x40($r_ptr)
2793	movdqu	%xmm3, 0x50($r_ptr)
2794
2795	movdqa	%xmm5, %xmm0		# copy_conditional(res_x, in2_x, in1infty);
2796	movdqa	%xmm5, %xmm1
2797	pandn	$res_x(%rsp), %xmm0
2798	movdqa	%xmm5, %xmm2
2799	pandn	$res_x+0x10(%rsp), %xmm1
2800	movdqa	%xmm5, %xmm3
2801	pand	$in2_x(%rsp), %xmm2
2802	pand	$in2_x+0x10(%rsp), %xmm3
2803	por	%xmm0, %xmm2
2804	por	%xmm1, %xmm3
2805
2806	movdqa	%xmm4, %xmm0		# copy_conditional(res_x, in1_x, in2infty);
2807	movdqa	%xmm4, %xmm1
2808	pandn	%xmm2, %xmm0
2809	movdqa	%xmm4, %xmm2
2810	pandn	%xmm3, %xmm1
2811	movdqa	%xmm4, %xmm3
2812	pand	$in1_x(%rsp), %xmm2
2813	pand	$in1_x+0x10(%rsp), %xmm3
2814	por	%xmm0, %xmm2
2815	por	%xmm1, %xmm3
2816	movdqu	%xmm2, 0x00($r_ptr)
2817	movdqu	%xmm3, 0x10($r_ptr)
2818
2819	movdqa	%xmm5, %xmm0		# copy_conditional(res_y, in2_y, in1infty);
2820	movdqa	%xmm5, %xmm1
2821	pandn	$res_y(%rsp), %xmm0
2822	movdqa	%xmm5, %xmm2
2823	pandn	$res_y+0x10(%rsp), %xmm1
2824	movdqa	%xmm5, %xmm3
2825	pand	$in2_y(%rsp), %xmm2
2826	pand	$in2_y+0x10(%rsp), %xmm3
2827	por	%xmm0, %xmm2
2828	por	%xmm1, %xmm3
2829
2830	movdqa	%xmm4, %xmm0		# copy_conditional(res_y, in1_y, in2infty);
2831	movdqa	%xmm4, %xmm1
2832	pandn	%xmm2, %xmm0
2833	movdqa	%xmm4, %xmm2
2834	pandn	%xmm3, %xmm1
2835	movdqa	%xmm4, %xmm3
2836	pand	$in1_y(%rsp), %xmm2
2837	pand	$in1_y+0x10(%rsp), %xmm3
2838	por	%xmm0, %xmm2
2839	por	%xmm1, %xmm3
2840	movdqu	%xmm2, 0x20($r_ptr)
2841	movdqu	%xmm3, 0x30($r_ptr)
2842
2843	add	\$32*15+8, %rsp
2844	pop	%r15
2845	pop	%r14
2846	pop	%r13
2847	pop	%r12
2848	pop	%rbx
2849	pop	%rbp
2850	ret
2851.size	ecp_nistz256_point_add_affine$sfx,.-ecp_nistz256_point_add_affine$sfx
2852___
2853}
2854&gen_add_affine("q");
2855
2856########################################################################
2857# AD*X magic
2858#
2859if ($addx) {								{
2860########################################################################
2861# operate in 4-5-0-1 "name space" that matches multiplication output
2862#
2863my ($a0,$a1,$a2,$a3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
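# The "x"-suffixed helpers below serve the ADX/BMI2 code path selected at run
# time by the OPENSSL_ia32cap_P checks in the point routines above; like the
# inline blocks earlier, they keep their operands in the same 4-5-0-1 register
# order that the multiplication code produces, avoiding extra moves.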
2864
2865$code.=<<___;
2866.type	__ecp_nistz256_add_tox,\@abi-omnipotent
2867.align	32
2868__ecp_nistz256_add_tox:
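	# Add the four limbs at b_ptr to the value in the accumulator registers
	# and reduce: p is subtracted speculatively and the unreduced sum is
	# restored with cmovc if that subtraction borrowed; the result is
	# stored at r_ptr.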
2869	xor	$t4, $t4
2870	adc	8*0($b_ptr), $a0
2871	adc	8*1($b_ptr), $a1
2872	 mov	$a0, $t0
2873	adc	8*2($b_ptr), $a2
2874	adc	8*3($b_ptr), $a3
2875	 mov	$a1, $t1
2876	adc	\$0, $t4
2877
2878	xor	$t3, $t3
2879	sbb	\$-1, $a0
2880	 mov	$a2, $t2
2881	sbb	$poly1, $a1
2882	sbb	\$0, $a2
2883	 mov	$a3, $t3
2884	sbb	$poly3, $a3
2885	sbb	\$0, $t4
2886
2887	cmovc	$t0, $a0
2888	cmovc	$t1, $a1
2889	mov	$a0, 8*0($r_ptr)
2890	cmovc	$t2, $a2
2891	mov	$a1, 8*1($r_ptr)
2892	cmovc	$t3, $a3
2893	mov	$a2, 8*2($r_ptr)
2894	mov	$a3, 8*3($r_ptr)
2895
2896	ret
2897.size	__ecp_nistz256_add_tox,.-__ecp_nistz256_add_tox
2898
2899.type	__ecp_nistz256_sub_fromx,\@abi-omnipotent
2900.align	32
2901__ecp_nistz256_sub_fromx:
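	# Subtract the four limbs at b_ptr from the accumulator registers
	# modulo p: p is added back speculatively, the borrow is recovered from
	# the scratch register with bt, and cmovnc discards the add-back when
	# the subtraction did not borrow; the result is stored at r_ptr.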
2902	xor	$t4, $t4
2903	sbb	8*0($b_ptr), $a0
2904	sbb	8*1($b_ptr), $a1
2905	 mov	$a0, $t0
2906	sbb	8*2($b_ptr), $a2
2907	sbb	8*3($b_ptr), $a3
2908	 mov	$a1, $t1
2909	sbb	\$0, $t4
2910
2911	xor	$t3, $t3
2912	adc	\$-1, $a0
2913	 mov	$a2, $t2
2914	adc	$poly1, $a1
2915	adc	\$0, $a2
2916	 mov	$a3, $t3
2917	adc	$poly3, $a3
2918
2919	bt	\$0, $t4
2920	cmovnc	$t0, $a0
2921	cmovnc	$t1, $a1
2922	mov	$a0, 8*0($r_ptr)
2923	cmovnc	$t2, $a2
2924	mov	$a1, 8*1($r_ptr)
2925	cmovnc	$t3, $a3
2926	mov	$a2, 8*2($r_ptr)
2927	mov	$a3, 8*3($r_ptr)
2928
2929	ret
2930.size	__ecp_nistz256_sub_fromx,.-__ecp_nistz256_sub_fromx
2931
2932.type	__ecp_nistz256_subx,\@abi-omnipotent
2933.align	32
2934__ecp_nistz256_subx:
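	# Same as sub_fromx, but with both operands in registers (the value in
	# the t registers minus the value in the accumulators) and the result
	# left in registers only; callers store it themselves.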
2935	xor	$t4, $t4
2936	sbb	$a0, $t0
2937	sbb	$a1, $t1
2938	 mov	$t0, $a0
2939	sbb	$a2, $t2
2940	sbb	$a3, $t3
2941	 mov	$t1, $a1
2942	sbb	\$0, $t4
2943
2944	xor	$a3, $a3
2945	adc	\$-1, $t0
2946	 mov	$t2, $a2
2947	adc	$poly1, $t1
2948	adc	\$0, $t2
2949	 mov	$t3, $a3
2950	adc	$poly3, $t3
2951
2952	bt	\$0, $t4
2953	cmovc	$t0, $a0
2954	cmovc	$t1, $a1
2955	cmovc	$t2, $a2
2956	cmovc	$t3, $a3
2957
2958	ret
2959.size	__ecp_nistz256_subx,.-__ecp_nistz256_subx
2960
2961.type	__ecp_nistz256_mul_by_2x,\@abi-omnipotent
2962.align	32
2963__ecp_nistz256_mul_by_2x:
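	# Double the accumulator value modulo p, using the same speculative
	# subtraction of p as __ecp_nistz256_add_tox above; the result is
	# stored at r_ptr.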
2964	xor	$t4, $t4
2965	adc	$a0, $a0		# a0:a3+a0:a3
2966	adc	$a1, $a1
2967	 mov	$a0, $t0
2968	adc	$a2, $a2
2969	adc	$a3, $a3
2970	 mov	$a1, $t1
2971	adc	\$0, $t4
2972
2973	xor	$t3, $t3
2974	sbb	\$-1, $a0
2975	 mov	$a2, $t2
2976	sbb	$poly1, $a1
2977	sbb	\$0, $a2
2978	 mov	$a3, $t3
2979	sbb	$poly3, $a3
2980	sbb	\$0, $t4
2981
2982	cmovc	$t0, $a0
2983	cmovc	$t1, $a1
2984	mov	$a0, 8*0($r_ptr)
2985	cmovc	$t2, $a2
2986	mov	$a1, 8*1($r_ptr)
2987	cmovc	$t3, $a3
2988	mov	$a2, 8*2($r_ptr)
2989	mov	$a3, 8*3($r_ptr)
2990
2991	ret
2992.size	__ecp_nistz256_mul_by_2x,.-__ecp_nistz256_mul_by_2x
2993___
2994									}
2995&gen_double("x");
2996&gen_add("x");
2997&gen_add_affine("x");
2998}
2999}}}
3000
3001$code =~ s/\`([^\`]*)\`/eval $1/gem;
3002print $code;
3003close STDOUT;
3004