/* ecp_nistz256-x86_64.S revision 1.1 */
1#include <machine/asm.h>
2.text
3
4
5
.align	64
/* NIST P-256 prime p = 2^256 - 2^224 + 2^192 + 2^96 - 1, little-endian 64-bit limbs */
.Lpoly:
.quad	0xffffffffffffffff, 0x00000000ffffffff, 0x0000000000000000, 0xffffffff00000001


/* R^2 mod p with R = 2^256; Montgomery-multiplying by .LRR converts into Montgomery form */
.LRR:
.quad	0x0000000000000003, 0xfffffffbffffffff, 0xfffffffffffffffe, 0x00000004fffffffd

/* dword broadcast constants (8 x 1/2/3) for the SSE2 constant-time table lookups below */
.LOne:
.long	1,1,1,1,1,1,1,1
.LTwo:
.long	2,2,2,2,2,2,2,2
.LThree:
.long	3,3,3,3,3,3,3,3
/* the value 1 in Montgomery form, i.e. R mod p */
.LONE_mont:
.quad	0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe
22
.globl	ecp_nistz256_mul_by_2
.type	ecp_nistz256_mul_by_2,@function
.align	64
/*
 * void ecp_nistz256_mul_by_2(uint64_t res[4], const uint64_t a[4]);
 * res (%rdi) = 2*a mod p.  Constant time: double, trial-subtract p,
 * then select the right result with cmov (no data-dependent branches).
 */
ecp_nistz256_mul_by_2:
	pushq	%r12
	pushq	%r13

	movq	0(%rsi),%r8
	xorq	%r13,%r13		# r13 collects bit 256 of 2*a
	movq	8(%rsi),%r9
	addq	%r8,%r8			# a + a
	movq	16(%rsi),%r10
	adcq	%r9,%r9
	movq	24(%rsi),%r11
	leaq	.Lpoly(%rip),%rsi
	movq	%r8,%rax		# keep pre-reduction copy in rax/rdx/rcx/r12
	adcq	%r10,%r10
	adcq	%r11,%r11
	movq	%r9,%rdx
	adcq	$0,%r13

	subq	0(%rsi),%r8		# (2*a) - p
	movq	%r10,%rcx
	sbbq	8(%rsi),%r9
	sbbq	16(%rsi),%r10
	movq	%r11,%r12
	sbbq	24(%rsi),%r11
	sbbq	$0,%r13			# CF set iff 2*a < p

	cmovcq	%rax,%r8		# on borrow keep the unreduced value
	cmovcq	%rdx,%r9
	movq	%r8,0(%rdi)
	cmovcq	%rcx,%r10
	movq	%r9,8(%rdi)
	cmovcq	%r12,%r11
	movq	%r10,16(%rdi)
	movq	%r11,24(%rdi)

	popq	%r13
	popq	%r12
	.byte	0xf3,0xc3		# rep ret
.size	ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2
65
66
67
.globl	ecp_nistz256_div_by_2
.type	ecp_nistz256_div_by_2,@function
.align	32
/*
 * void ecp_nistz256_div_by_2(uint64_t res[4], const uint64_t a[4]);
 * res (%rdi) = a/2 mod p.  If a is odd, halve a+p instead (which is
 * even); the choice is made with cmov so timing is input-independent.
 */
ecp_nistz256_div_by_2:
	pushq	%r12
	pushq	%r13

	movq	0(%rsi),%r8
	movq	8(%rsi),%r9
	movq	16(%rsi),%r10
	movq	%r8,%rax		# save a's limbs for the "a even" case
	movq	24(%rsi),%r11
	leaq	.Lpoly(%rip),%rsi

	movq	%r9,%rdx
	xorq	%r13,%r13
	addq	0(%rsi),%r8		# a + p; r13 receives bit 256
	movq	%r10,%rcx
	adcq	8(%rsi),%r9
	adcq	16(%rsi),%r10
	movq	%r11,%r12
	adcq	24(%rsi),%r11
	adcq	$0,%r13
	xorq	%rsi,%rsi		# zero for clearing the carry when a is even
	testq	$1,%rax			# parity of a decides which value to halve

	cmovzq	%rax,%r8		# even: use a itself, carry = 0
	cmovzq	%rdx,%r9
	cmovzq	%rcx,%r10
	cmovzq	%r12,%r11
	cmovzq	%rsi,%r13

	movq	%r9,%rax		# shift the 257-bit value {r13,r11..r8} right one bit
	shrq	$1,%r8
	shlq	$63,%rax
	movq	%r10,%rdx
	shrq	$1,%r9
	orq	%rax,%r8
	shlq	$63,%rdx
	movq	%r11,%rcx
	shrq	$1,%r10
	orq	%rdx,%r9
	shlq	$63,%rcx
	shrq	$1,%r11
	shlq	$63,%r13
	orq	%rcx,%r10
	orq	%r13,%r11

	movq	%r8,0(%rdi)
	movq	%r9,8(%rdi)
	movq	%r10,16(%rdi)
	movq	%r11,24(%rdi)

	popq	%r13
	popq	%r12
	.byte	0xf3,0xc3		# rep ret
.size	ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2
125
126
127
.globl	ecp_nistz256_mul_by_3
.type	ecp_nistz256_mul_by_3,@function
.align	32
/*
 * void ecp_nistz256_mul_by_3(uint64_t res[4], const uint64_t a[4]);
 * res (%rdi) = 3*a mod p, computed as (2*a mod p) + a mod p.
 * p's limbs 0 and 2 are 2^64-1 and 0, hence the $-1 / $0 immediates
 * in the trial subtractions.
 */
ecp_nistz256_mul_by_3:
	pushq	%r12
	pushq	%r13

	movq	0(%rsi),%r8
	xorq	%r13,%r13
	movq	8(%rsi),%r9
	addq	%r8,%r8			# 2*a
	movq	16(%rsi),%r10
	adcq	%r9,%r9
	movq	24(%rsi),%r11
	movq	%r8,%rax
	adcq	%r10,%r10
	adcq	%r11,%r11
	movq	%r9,%rdx
	adcq	$0,%r13

	subq	$-1,%r8			# trial-subtract p (p[0] = 2^64-1)
	movq	%r10,%rcx
	sbbq	.Lpoly+8(%rip),%r9
	sbbq	$0,%r10			# p[2] = 0
	movq	%r11,%r12
	sbbq	.Lpoly+24(%rip),%r11
	sbbq	$0,%r13

	cmovcq	%rax,%r8		# keep unreduced value on borrow
	cmovcq	%rdx,%r9
	cmovcq	%rcx,%r10
	cmovcq	%r12,%r11

	xorq	%r13,%r13
	addq	0(%rsi),%r8		# + a
	adcq	8(%rsi),%r9
	movq	%r8,%rax
	adcq	16(%rsi),%r10
	adcq	24(%rsi),%r11
	movq	%r9,%rdx
	adcq	$0,%r13

	subq	$-1,%r8			# second conditional reduction
	movq	%r10,%rcx
	sbbq	.Lpoly+8(%rip),%r9
	sbbq	$0,%r10
	movq	%r11,%r12
	sbbq	.Lpoly+24(%rip),%r11
	sbbq	$0,%r13

	cmovcq	%rax,%r8
	cmovcq	%rdx,%r9
	movq	%r8,0(%rdi)
	cmovcq	%rcx,%r10
	movq	%r9,8(%rdi)
	cmovcq	%r12,%r11
	movq	%r10,16(%rdi)
	movq	%r11,24(%rdi)

	popq	%r13
	popq	%r12
	.byte	0xf3,0xc3		# rep ret
.size	ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3
191
192
193
.globl	ecp_nistz256_add
.type	ecp_nistz256_add,@function
.align	32
/*
 * void ecp_nistz256_add(uint64_t res[4], const uint64_t a[4], const uint64_t b[4]);
 * res (%rdi) = a+b mod p.  Add, trial-subtract p, select with cmov.
 */
ecp_nistz256_add:
	pushq	%r12
	pushq	%r13

	movq	0(%rsi),%r8
	xorq	%r13,%r13		# r13 = carry out of a+b
	movq	8(%rsi),%r9
	movq	16(%rsi),%r10
	movq	24(%rsi),%r11
	leaq	.Lpoly(%rip),%rsi

	addq	0(%rdx),%r8		# a + b
	adcq	8(%rdx),%r9
	movq	%r8,%rax		# save unreduced sum
	adcq	16(%rdx),%r10
	adcq	24(%rdx),%r11
	movq	%r9,%rdx
	adcq	$0,%r13

	subq	0(%rsi),%r8		# (a+b) - p
	movq	%r10,%rcx
	sbbq	8(%rsi),%r9
	sbbq	16(%rsi),%r10
	movq	%r11,%r12
	sbbq	24(%rsi),%r11
	sbbq	$0,%r13			# CF set iff a+b < p

	cmovcq	%rax,%r8
	cmovcq	%rdx,%r9
	movq	%r8,0(%rdi)
	cmovcq	%rcx,%r10
	movq	%r9,8(%rdi)
	cmovcq	%r12,%r11
	movq	%r10,16(%rdi)
	movq	%r11,24(%rdi)

	popq	%r13
	popq	%r12
	.byte	0xf3,0xc3		# rep ret
.size	ecp_nistz256_add,.-ecp_nistz256_add
237
238
239
.globl	ecp_nistz256_sub
.type	ecp_nistz256_sub,@function
.align	32
/*
 * void ecp_nistz256_sub(uint64_t res[4], const uint64_t a[4], const uint64_t b[4]);
 * res (%rdi) = a-b mod p.  Subtract, add p back, and keep the +p result
 * only if the subtraction borrowed (r13 != 0).
 */
ecp_nistz256_sub:
	pushq	%r12
	pushq	%r13

	movq	0(%rsi),%r8
	xorq	%r13,%r13		# r13 becomes -1 on borrow (sbb $0 below)
	movq	8(%rsi),%r9
	movq	16(%rsi),%r10
	movq	24(%rsi),%r11
	leaq	.Lpoly(%rip),%rsi

	subq	0(%rdx),%r8		# a - b
	sbbq	8(%rdx),%r9
	movq	%r8,%rax		# save the raw difference
	sbbq	16(%rdx),%r10
	sbbq	24(%rdx),%r11
	movq	%r9,%rdx
	sbbq	$0,%r13

	addq	0(%rsi),%r8		# difference + p
	movq	%r10,%rcx
	adcq	8(%rsi),%r9
	adcq	16(%rsi),%r10
	movq	%r11,%r12
	adcq	24(%rsi),%r11
	testq	%r13,%r13		# ZF set iff no borrow occurred

	cmovzq	%rax,%r8		# no borrow: the raw difference was already correct
	cmovzq	%rdx,%r9
	movq	%r8,0(%rdi)
	cmovzq	%rcx,%r10
	movq	%r9,8(%rdi)
	cmovzq	%r12,%r11
	movq	%r10,16(%rdi)
	movq	%r11,24(%rdi)

	popq	%r13
	popq	%r12
	.byte	0xf3,0xc3		# rep ret
.size	ecp_nistz256_sub,.-ecp_nistz256_sub
283
284
285
.globl	ecp_nistz256_neg
.type	ecp_nistz256_neg,@function
.align	32
/*
 * void ecp_nistz256_neg(uint64_t res[4], const uint64_t a[4]);
 * res (%rdi) = -a mod p, computed as 0 - a with the same
 * conditional +p correction as ecp_nistz256_sub.
 */
ecp_nistz256_neg:
	pushq	%r12
	pushq	%r13

	xorq	%r8,%r8			# accumulator = 0
	xorq	%r9,%r9
	xorq	%r10,%r10
	xorq	%r11,%r11
	xorq	%r13,%r13		# borrow flag

	subq	0(%rsi),%r8		# 0 - a
	sbbq	8(%rsi),%r9
	sbbq	16(%rsi),%r10
	movq	%r8,%rax		# save raw difference
	sbbq	24(%rsi),%r11
	leaq	.Lpoly(%rip),%rsi
	movq	%r9,%rdx
	sbbq	$0,%r13

	addq	0(%rsi),%r8		# + p
	movq	%r10,%rcx
	adcq	8(%rsi),%r9
	adcq	16(%rsi),%r10
	movq	%r11,%r12
	adcq	24(%rsi),%r11
	testq	%r13,%r13		# ZF iff no borrow (i.e. a == 0)

	cmovzq	%rax,%r8
	cmovzq	%rdx,%r9
	movq	%r8,0(%rdi)
	cmovzq	%rcx,%r10
	movq	%r9,8(%rdi)
	cmovzq	%r12,%r11
	movq	%r10,16(%rdi)
	movq	%r11,24(%rdi)

	popq	%r13
	popq	%r12
	.byte	0xf3,0xc3		# rep ret
.size	ecp_nistz256_neg,.-ecp_nistz256_neg
329
330
331
332
.globl	ecp_nistz256_to_mont
.type	ecp_nistz256_to_mont,@function
.align	32
/*
 * void ecp_nistz256_to_mont(uint64_t res[4], const uint64_t a[4]);
 * res = a*R mod p: tail-calls the Montgomery multiply with b = R^2 (.LRR).
 */
ecp_nistz256_to_mont:
	leaq	.LRR(%rip),%rdx
	jmp	.Lmul_mont
.size	ecp_nistz256_to_mont,.-ecp_nistz256_to_mont
340
341
342
343
344
345
346
.globl	ecp_nistz256_mul_mont
.type	ecp_nistz256_mul_mont,@function
.align	32
/*
 * void ecp_nistz256_mul_mont(uint64_t res[4], const uint64_t a[4], const uint64_t b[4]);
 * res (%rdi) = a*b*R^-1 mod p (Montgomery product).  Saves the
 * callee-saved registers, loads b[0] and a's limbs into the registers
 * __ecp_nistz256_mul_montq expects, then calls it.
 */
ecp_nistz256_mul_mont:
.Lmul_mont:
	pushq	%rbp
	pushq	%rbx
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
	movq	%rdx,%rbx		# rbx = b
	movq	0(%rdx),%rax		# rax = b[0]
	movq	0(%rsi),%r9		# r9..r12 = a[0..3]
	movq	8(%rsi),%r10
	movq	16(%rsi),%r11
	movq	24(%rsi),%r12

	call	__ecp_nistz256_mul_montq
.Lmul_mont_done:
	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbx
	popq	%rbp
	.byte	0xf3,0xc3		# rep ret
.size	ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont
375
.type	__ecp_nistz256_mul_montq,@function
.align	32
/*
 * Internal Montgomery multiply.
 * In:  rax = b[0], rbx = b, rsi = a, r9..r12 = a[0..3], rdi = res.
 * Out: res = a*b*R^-1 mod p, fully reduced; also left in r12,r13,r8,r9.
 * Interleaves four rounds of "multiply by b[i] / fold lowest limb".
 * Because p = 2^256 - 2^224 + 2^192 + 2^96 - 1 (and -p^-1 mod 2^64 = 1),
 * each reduction step needs only a shl/shr pair plus one mul by p[3].
 * Clobbers rax,rbp,rcx,rdx,rbx and r8-r15; no stack use beyond the call.
 */
__ecp_nistz256_mul_montq:


	/* round 0: acc = a * b[0] */
	movq	%rax,%rbp
	mulq	%r9
	movq	.Lpoly+8(%rip),%r14	# r14 = p[1], kept live throughout
	movq	%rax,%r8
	movq	%rbp,%rax
	movq	%rdx,%r9

	mulq	%r10
	movq	.Lpoly+24(%rip),%r15	# r15 = p[3], kept live throughout
	addq	%rax,%r9
	movq	%rbp,%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%r11
	addq	%rax,%r10
	movq	%rbp,%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%r12
	addq	%rax,%r11
	movq	%r8,%rax
	adcq	$0,%rdx
	xorq	%r13,%r13
	movq	%rdx,%r12

	/* fold acc[0] (r8) back in: acc += r8 * p, shifting r8 out */
	movq	%r8,%rbp
	shlq	$32,%r8
	mulq	%r15
	shrq	$32,%rbp
	addq	%r8,%r9
	adcq	%rbp,%r10
	adcq	%rax,%r11
	movq	8(%rbx),%rax		# preload b[1]
	adcq	%rdx,%r12
	adcq	$0,%r13
	xorq	%r8,%r8

	/* round 1: acc += a * b[1] */
	movq	%rax,%rbp
	mulq	0(%rsi)
	addq	%rax,%r9
	movq	%rbp,%rax
	adcq	$0,%rdx
	movq	%rdx,%rcx

	mulq	8(%rsi)
	addq	%rcx,%r10
	adcq	$0,%rdx
	addq	%rax,%r10
	movq	%rbp,%rax
	adcq	$0,%rdx
	movq	%rdx,%rcx

	mulq	16(%rsi)
	addq	%rcx,%r11
	adcq	$0,%rdx
	addq	%rax,%r11
	movq	%rbp,%rax
	adcq	$0,%rdx
	movq	%rdx,%rcx

	mulq	24(%rsi)
	addq	%rcx,%r12
	adcq	$0,%rdx
	addq	%rax,%r12
	movq	%r9,%rax
	adcq	%rdx,%r13
	adcq	$0,%r8

	/* fold acc[0] (r9) */
	movq	%r9,%rbp
	shlq	$32,%r9
	mulq	%r15
	shrq	$32,%rbp
	addq	%r9,%r10
	adcq	%rbp,%r11
	adcq	%rax,%r12
	movq	16(%rbx),%rax		# preload b[2]
	adcq	%rdx,%r13
	adcq	$0,%r8
	xorq	%r9,%r9

	/* round 2: acc += a * b[2] */
	movq	%rax,%rbp
	mulq	0(%rsi)
	addq	%rax,%r10
	movq	%rbp,%rax
	adcq	$0,%rdx
	movq	%rdx,%rcx

	mulq	8(%rsi)
	addq	%rcx,%r11
	adcq	$0,%rdx
	addq	%rax,%r11
	movq	%rbp,%rax
	adcq	$0,%rdx
	movq	%rdx,%rcx

	mulq	16(%rsi)
	addq	%rcx,%r12
	adcq	$0,%rdx
	addq	%rax,%r12
	movq	%rbp,%rax
	adcq	$0,%rdx
	movq	%rdx,%rcx

	mulq	24(%rsi)
	addq	%rcx,%r13
	adcq	$0,%rdx
	addq	%rax,%r13
	movq	%r10,%rax
	adcq	%rdx,%r8
	adcq	$0,%r9

	/* fold acc[0] (r10) */
	movq	%r10,%rbp
	shlq	$32,%r10
	mulq	%r15
	shrq	$32,%rbp
	addq	%r10,%r11
	adcq	%rbp,%r12
	adcq	%rax,%r13
	movq	24(%rbx),%rax		# preload b[3]
	adcq	%rdx,%r8
	adcq	$0,%r9
	xorq	%r10,%r10

	/* round 3: acc += a * b[3] */
	movq	%rax,%rbp
	mulq	0(%rsi)
	addq	%rax,%r11
	movq	%rbp,%rax
	adcq	$0,%rdx
	movq	%rdx,%rcx

	mulq	8(%rsi)
	addq	%rcx,%r12
	adcq	$0,%rdx
	addq	%rax,%r12
	movq	%rbp,%rax
	adcq	$0,%rdx
	movq	%rdx,%rcx

	mulq	16(%rsi)
	addq	%rcx,%r13
	adcq	$0,%rdx
	addq	%rax,%r13
	movq	%rbp,%rax
	adcq	$0,%rdx
	movq	%rdx,%rcx

	mulq	24(%rsi)
	addq	%rcx,%r8
	adcq	$0,%rdx
	addq	%rax,%r8
	movq	%r11,%rax
	adcq	%rdx,%r9
	adcq	$0,%r10

	/* final fold of acc[0] (r11); result limbs end up in r12,r13,r8,r9 */
	movq	%r11,%rbp
	shlq	$32,%r11
	mulq	%r15
	shrq	$32,%rbp
	addq	%r11,%r12
	adcq	%rbp,%r13
	movq	%r12,%rcx		# save unreduced limbs across the trial subtraction
	adcq	%rax,%r8
	adcq	%rdx,%r9
	movq	%r13,%rbp
	adcq	$0,%r10

	/* conditional final subtraction of p (p[0]=-1, p[2]=0) */
	subq	$-1,%r12
	movq	%r8,%rbx
	sbbq	%r14,%r13
	sbbq	$0,%r8
	movq	%r9,%rdx
	sbbq	%r15,%r9
	sbbq	$0,%r10			# CF iff acc < p

	cmovcq	%rcx,%r12
	cmovcq	%rbp,%r13
	movq	%r12,0(%rdi)
	cmovcq	%rbx,%r8
	movq	%r13,8(%rdi)
	cmovcq	%rdx,%r9
	movq	%r8,16(%rdi)
	movq	%r9,24(%rdi)

	.byte	0xf3,0xc3		# rep ret
.size	__ecp_nistz256_mul_montq,.-__ecp_nistz256_mul_montq
591
592
593
594
595
596
597
598
.globl	ecp_nistz256_sqr_mont
.type	ecp_nistz256_sqr_mont,@function
.align	32
/*
 * void ecp_nistz256_sqr_mont(uint64_t res[4], const uint64_t a[4]);
 * res (%rdi) = a^2*R^-1 mod p.  Wrapper that loads a's limbs into the
 * registers __ecp_nistz256_sqr_montq expects.
 */
ecp_nistz256_sqr_mont:
	pushq	%rbp
	pushq	%rbx
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
	movq	0(%rsi),%rax		# rax = a[0], r14/r15/r8 = a[1..3]
	movq	8(%rsi),%r14
	movq	16(%rsi),%r15
	movq	24(%rsi),%r8

	call	__ecp_nistz256_sqr_montq
.Lsqr_mont_done:
	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbx
	popq	%rbp
	.byte	0xf3,0xc3		# rep ret
.size	ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont
624
.type	__ecp_nistz256_sqr_montq,@function
.align	32
/*
 * Internal Montgomery square.
 * In:  rax = a[0], r14 = a[1], r15 = a[2], r8 = a[3], rsi = a, rdi = res.
 * Out: res = a^2*R^-1 mod p, fully reduced; also left in r12,r13,r14,r15.
 * Computes the cross products once and doubles them, adds the squares,
 * then performs four shift-based reduction steps (see mul_montq for why
 * p's shape makes this cheap).  NOTE: leaves rsi = p[1] and rbp = p[3]
 * on return; callers rely on that.
 */
__ecp_nistz256_sqr_montq:
	/* cross products a[i]*a[j], i<j */
	movq	%rax,%r13
	mulq	%r14			# a[0]*a[1]
	movq	%rax,%r9
	movq	%r15,%rax
	movq	%rdx,%r10

	mulq	%r13			# a[0]*a[2]
	addq	%rax,%r10
	movq	%r8,%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%r13			# a[0]*a[3]
	addq	%rax,%r11
	movq	%r15,%rax
	adcq	$0,%rdx
	movq	%rdx,%r12


	mulq	%r14			# a[1]*a[2]
	addq	%rax,%r11
	movq	%r8,%rax
	adcq	$0,%rdx
	movq	%rdx,%rbp

	mulq	%r14			# a[1]*a[3]
	addq	%rax,%r12
	movq	%r8,%rax
	adcq	$0,%rdx
	addq	%rbp,%r12
	movq	%rdx,%r13
	adcq	$0,%r13


	mulq	%r15			# a[2]*a[3]
	xorq	%r15,%r15
	addq	%rax,%r13
	movq	0(%rsi),%rax
	movq	%rdx,%r14
	adcq	$0,%r14

	/* double the cross products */
	addq	%r9,%r9
	adcq	%r10,%r10
	adcq	%r11,%r11
	adcq	%r12,%r12
	adcq	%r13,%r13
	adcq	%r14,%r14
	adcq	$0,%r15

	/* add the squares a[i]^2 on the even limbs */
	mulq	%rax
	movq	%rax,%r8
	movq	8(%rsi),%rax
	movq	%rdx,%rcx

	mulq	%rax
	addq	%rcx,%r9
	adcq	%rax,%r10
	movq	16(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%rcx

	mulq	%rax
	addq	%rcx,%r11
	adcq	%rax,%r12
	movq	24(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%rcx

	mulq	%rax
	addq	%rcx,%r13
	adcq	%rax,%r14
	movq	%r8,%rax
	adcq	%rdx,%r15

	movq	.Lpoly+8(%rip),%rsi	# rsi = p[1] (also an output, see header)
	movq	.Lpoly+24(%rip),%rbp	# rbp = p[3] (also an output)

	/* four reduction steps: fold the lowest limb each time */
	movq	%r8,%rcx
	shlq	$32,%r8
	mulq	%rbp
	shrq	$32,%rcx
	addq	%r8,%r9
	adcq	%rcx,%r10
	adcq	%rax,%r11
	movq	%r9,%rax
	adcq	$0,%rdx

	movq	%r9,%rcx
	shlq	$32,%r9
	movq	%rdx,%r8
	mulq	%rbp
	shrq	$32,%rcx
	addq	%r9,%r10
	adcq	%rcx,%r11
	adcq	%rax,%r8
	movq	%r10,%rax
	adcq	$0,%rdx

	movq	%r10,%rcx
	shlq	$32,%r10
	movq	%rdx,%r9
	mulq	%rbp
	shrq	$32,%rcx
	addq	%r10,%r11
	adcq	%rcx,%r8
	adcq	%rax,%r9
	movq	%r11,%rax
	adcq	$0,%rdx

	movq	%r11,%rcx
	shlq	$32,%r11
	movq	%rdx,%r10
	mulq	%rbp
	shrq	$32,%rcx
	addq	%r11,%r8
	adcq	%rcx,%r9
	adcq	%rax,%r10
	adcq	$0,%rdx
	xorq	%r11,%r11

	/* add the reduced low half onto the high half of the square */
	addq	%r8,%r12
	adcq	%r9,%r13
	movq	%r12,%r8		# save unreduced copies
	adcq	%r10,%r14
	adcq	%rdx,%r15
	movq	%r13,%r9
	adcq	$0,%r11

	/* conditional final subtraction of p */
	subq	$-1,%r12
	movq	%r14,%r10
	sbbq	%rsi,%r13
	sbbq	$0,%r14
	movq	%r15,%rcx
	sbbq	%rbp,%r15
	sbbq	$0,%r11			# CF iff value < p

	cmovcq	%r8,%r12
	cmovcq	%r9,%r13
	movq	%r12,0(%rdi)
	cmovcq	%r10,%r14
	movq	%r13,8(%rdi)
	cmovcq	%rcx,%r15
	movq	%r14,16(%rdi)
	movq	%r15,24(%rdi)

	.byte	0xf3,0xc3		# rep ret
.size	__ecp_nistz256_sqr_montq,.-__ecp_nistz256_sqr_montq
786
787
788
789
790
791
.globl	ecp_nistz256_from_mont
.type	ecp_nistz256_from_mont,@function
.align	32
/*
 * void ecp_nistz256_from_mont(uint64_t res[4], const uint64_t a[4]);
 * res (%rdi) = a*R^-1 mod p, i.e. convert out of Montgomery form.
 * Equivalent to a Montgomery multiply by 1: just four reduction steps
 * followed by one conditional subtraction of p.
 */
ecp_nistz256_from_mont:
	pushq	%r12
	pushq	%r13

	movq	0(%rsi),%rax
	movq	.Lpoly+24(%rip),%r13	# r13 = p[3]
	movq	8(%rsi),%r9
	movq	16(%rsi),%r10
	movq	24(%rsi),%r11
	movq	%rax,%r8
	movq	.Lpoly+8(%rip),%r12	# r12 = p[1]

	/* reduction step 1: fold limb 0 */
	movq	%rax,%rcx
	shlq	$32,%r8
	mulq	%r13
	shrq	$32,%rcx
	addq	%r8,%r9
	adcq	%rcx,%r10
	adcq	%rax,%r11
	movq	%r9,%rax
	adcq	$0,%rdx

	/* step 2 */
	movq	%r9,%rcx
	shlq	$32,%r9
	movq	%rdx,%r8
	mulq	%r13
	shrq	$32,%rcx
	addq	%r9,%r10
	adcq	%rcx,%r11
	adcq	%rax,%r8
	movq	%r10,%rax
	adcq	$0,%rdx

	/* step 3 */
	movq	%r10,%rcx
	shlq	$32,%r10
	movq	%rdx,%r9
	mulq	%r13
	shrq	$32,%rcx
	addq	%r10,%r11
	adcq	%rcx,%r8
	adcq	%rax,%r9
	movq	%r11,%rax
	adcq	$0,%rdx

	/* step 4; result limbs in r8,r9,r10,rdx */
	movq	%r11,%rcx
	shlq	$32,%r11
	movq	%rdx,%r10
	mulq	%r13
	shrq	$32,%rcx
	addq	%r11,%r8
	adcq	%rcx,%r9
	movq	%r8,%rcx		# save unreduced limbs 0/1
	adcq	%rax,%r10
	movq	%r9,%rsi
	adcq	$0,%rdx

	/* conditional subtraction of p; ZF after sbb r13,r13 means no borrow */
	subq	$-1,%r8
	movq	%r10,%rax		# save unreduced limb 2
	sbbq	%r12,%r9
	sbbq	$0,%r10
	movq	%rdx,%r11		# r11 = UNreduced top limb (note inverted select below)
	sbbq	%r13,%rdx
	sbbq	%r13,%r13

	cmovnzq	%rcx,%r8		# borrow: restore unreduced limbs
	cmovnzq	%rsi,%r9
	movq	%r8,0(%rdi)
	cmovnzq	%rax,%r10
	movq	%r9,8(%rdi)
	cmovzq	%rdx,%r11		# sense inverted: r11 already holds the unreduced limb
	movq	%r10,16(%rdi)
	movq	%r11,24(%rdi)

	popq	%r13
	popq	%r12
	.byte	0xf3,0xc3		# rep ret
.size	ecp_nistz256_from_mont,.-ecp_nistz256_from_mont
882
883
.globl	ecp_nistz256_select_w5
.type	ecp_nistz256_select_w5,@function
.align	32
/*
 * void ecp_nistz256_select_w5(P256_POINT *out, const void *table, int index);
 * Constant-time lookup: scans all 16 table entries of 96 bytes each,
 * masking in the one whose (1-based) position equals index.  index 0
 * yields an all-zero output.  rdi = out, rsi = table, edx = index.
 */
ecp_nistz256_select_w5:
	movdqa	.LOne(%rip),%xmm0	# xmm0 = {1,1,1,1} increment
	movd	%edx,%xmm1

	pxor	%xmm2,%xmm2		# xmm2..xmm7 accumulate the selected entry
	pxor	%xmm3,%xmm3
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
	pxor	%xmm6,%xmm6
	pxor	%xmm7,%xmm7

	movdqa	%xmm0,%xmm8		# xmm8 = running counter, starts at 1
	pshufd	$0,%xmm1,%xmm1		# broadcast index to all dwords

	movq	$16,%rax		# 16 entries, always all scanned
.Lselect_loop_sse_w5:

	movdqa	%xmm8,%xmm15
	paddd	%xmm0,%xmm8
	pcmpeqd	%xmm1,%xmm15		# all-ones mask iff counter == index

	movdqa	0(%rsi),%xmm9
	movdqa	16(%rsi),%xmm10
	movdqa	32(%rsi),%xmm11
	movdqa	48(%rsi),%xmm12
	movdqa	64(%rsi),%xmm13
	movdqa	80(%rsi),%xmm14
	leaq	96(%rsi),%rsi

	pand	%xmm15,%xmm9		# mask entry and OR into accumulators
	pand	%xmm15,%xmm10
	por	%xmm9,%xmm2
	pand	%xmm15,%xmm11
	por	%xmm10,%xmm3
	pand	%xmm15,%xmm12
	por	%xmm11,%xmm4
	pand	%xmm15,%xmm13
	por	%xmm12,%xmm5
	pand	%xmm15,%xmm14
	por	%xmm13,%xmm6
	por	%xmm14,%xmm7

	decq	%rax
	jnz	.Lselect_loop_sse_w5

	movdqu	%xmm2,0(%rdi)
	movdqu	%xmm3,16(%rdi)
	movdqu	%xmm4,32(%rdi)
	movdqu	%xmm5,48(%rdi)
	movdqu	%xmm6,64(%rdi)
	movdqu	%xmm7,80(%rdi)
	.byte	0xf3,0xc3		# rep ret
.size	ecp_nistz256_select_w5,.-ecp_nistz256_select_w5
940
941
942
.globl	ecp_nistz256_select_w7
.type	ecp_nistz256_select_w7,@function
.align	32
/*
 * void ecp_nistz256_select_w7(P256_POINT_AFFINE *out, const void *table, int index);
 * Constant-time lookup over 64 entries of 64 bytes each; same masking
 * scheme as select_w5 (1-based index, index 0 gives zeros).
 */
ecp_nistz256_select_w7:
	movdqa	.LOne(%rip),%xmm8	# xmm8 = running counter, starts at 1
	movd	%edx,%xmm1

	pxor	%xmm2,%xmm2		# xmm2..xmm5 accumulate the selected entry
	pxor	%xmm3,%xmm3
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5

	movdqa	%xmm8,%xmm0		# xmm0 = {1,1,1,1} increment
	pshufd	$0,%xmm1,%xmm1		# broadcast index
	movq	$64,%rax		# 64 entries, always all scanned

.Lselect_loop_sse_w7:
	movdqa	%xmm8,%xmm15
	paddd	%xmm0,%xmm8
	movdqa	0(%rsi),%xmm9
	movdqa	16(%rsi),%xmm10
	pcmpeqd	%xmm1,%xmm15		# all-ones mask iff counter == index
	movdqa	32(%rsi),%xmm11
	movdqa	48(%rsi),%xmm12
	leaq	64(%rsi),%rsi

	pand	%xmm15,%xmm9
	pand	%xmm15,%xmm10
	por	%xmm9,%xmm2
	pand	%xmm15,%xmm11
	por	%xmm10,%xmm3
	pand	%xmm15,%xmm12
	por	%xmm11,%xmm4
	prefetcht0	255(%rsi)	# prefetch upcoming table lines
	por	%xmm12,%xmm5

	decq	%rax
	jnz	.Lselect_loop_sse_w7

	movdqu	%xmm2,0(%rdi)
	movdqu	%xmm3,16(%rdi)
	movdqu	%xmm4,32(%rdi)
	movdqu	%xmm5,48(%rdi)
	.byte	0xf3,0xc3		# rep ret
.size	ecp_nistz256_select_w7,.-ecp_nistz256_select_w7
.globl	ecp_nistz256_avx2_select_w7
.type	ecp_nistz256_avx2_select_w7,@function
.align	32
/* AVX2 variant not built in this configuration: trap (ud2) if ever reached */
ecp_nistz256_avx2_select_w7:
.byte	0x0f,0x0b			# ud2
	.byte	0xf3,0xc3		# rep ret (unreachable)
.size	ecp_nistz256_avx2_select_w7,.-ecp_nistz256_avx2_select_w7
.type	__ecp_nistz256_add_toq,@function
.align	32
/*
 * Internal helper: {r12,r13,r8,r9} = ({r12,r13,r8,r9} + mem[rbx]) mod p,
 * stored to rdi.  Requires r14 = p[1], r15 = p[3].  Clobbers rax,rbp,
 * rcx,r10,r11.
 */
__ecp_nistz256_add_toq:
	xorq	%r11,%r11
	addq	0(%rbx),%r12
	adcq	8(%rbx),%r13
	movq	%r12,%rax		# save unreduced sum
	adcq	16(%rbx),%r8
	adcq	24(%rbx),%r9
	movq	%r13,%rbp
	adcq	$0,%r11

	subq	$-1,%r12		# trial-subtract p
	movq	%r8,%rcx
	sbbq	%r14,%r13
	sbbq	$0,%r8
	movq	%r9,%r10
	sbbq	%r15,%r9
	sbbq	$0,%r11			# CF iff sum < p

	cmovcq	%rax,%r12
	cmovcq	%rbp,%r13
	movq	%r12,0(%rdi)
	cmovcq	%rcx,%r8
	movq	%r13,8(%rdi)
	cmovcq	%r10,%r9
	movq	%r8,16(%rdi)
	movq	%r9,24(%rdi)

	.byte	0xf3,0xc3		# rep ret
.size	__ecp_nistz256_add_toq,.-__ecp_nistz256_add_toq
1026
.type	__ecp_nistz256_sub_fromq,@function
.align	32
/*
 * Internal helper: {r12,r13,r8,r9} = ({r12,r13,r8,r9} - mem[rbx]) mod p,
 * stored to rdi.  Requires r14 = p[1], r15 = p[3].  Adds p back and keeps
 * that only if the subtraction borrowed (r11 mask).
 */
__ecp_nistz256_sub_fromq:
	subq	0(%rbx),%r12
	sbbq	8(%rbx),%r13
	movq	%r12,%rax		# save raw difference
	sbbq	16(%rbx),%r8
	sbbq	24(%rbx),%r9
	movq	%r13,%rbp
	sbbq	%r11,%r11		# r11 = 0 or -1 borrow mask

	addq	$-1,%r12		# difference + p
	movq	%r8,%rcx
	adcq	%r14,%r13
	adcq	$0,%r8
	movq	%r9,%r10
	adcq	%r15,%r9
	testq	%r11,%r11		# ZF iff no borrow

	cmovzq	%rax,%r12		# no borrow: raw difference was correct
	cmovzq	%rbp,%r13
	movq	%r12,0(%rdi)
	cmovzq	%rcx,%r8
	movq	%r13,8(%rdi)
	cmovzq	%r10,%r9
	movq	%r8,16(%rdi)
	movq	%r9,24(%rdi)

	.byte	0xf3,0xc3		# rep ret
.size	__ecp_nistz256_sub_fromq,.-__ecp_nistz256_sub_fromq
1057
.type	__ecp_nistz256_subq,@function
.align	32
/*
 * Internal helper: {r12,r13,r8,r9} = ({rax,rbp,rcx,r10} - {r12,r13,r8,r9}) mod p.
 * Result is left in registers only (no store).  Requires r14 = p[1],
 * r15 = p[3].  Note the inverted cmov sense: on borrow (r11 != 0) the
 * +p-corrected limbs are selected.
 */
__ecp_nistz256_subq:
	subq	%r12,%rax
	sbbq	%r13,%rbp
	movq	%rax,%r12		# tentatively take the raw difference
	sbbq	%r8,%rcx
	sbbq	%r9,%r10
	movq	%rbp,%r13
	sbbq	%r11,%r11		# r11 = 0 or -1 borrow mask

	addq	$-1,%rax		# raw difference + p
	movq	%rcx,%r8
	adcq	%r14,%rbp
	adcq	$0,%rcx
	movq	%r10,%r9
	adcq	%r15,%r10
	testq	%r11,%r11		# ZF iff no borrow

	cmovnzq	%rax,%r12		# borrow: use the +p result instead
	cmovnzq	%rbp,%r13
	cmovnzq	%rcx,%r8
	cmovnzq	%r10,%r9

	.byte	0xf3,0xc3		# rep ret
.size	__ecp_nistz256_subq,.-__ecp_nistz256_subq
1084
.type	__ecp_nistz256_mul_by_2q,@function
.align	32
/*
 * Internal helper: {r12,r13,r8,r9} = 2*{r12,r13,r8,r9} mod p, stored to
 * rdi.  Requires r14 = p[1], r15 = p[3].  Clobbers rax,rbp,rcx,r10,r11.
 */
__ecp_nistz256_mul_by_2q:
	xorq	%r11,%r11
	addq	%r12,%r12		# double
	adcq	%r13,%r13
	movq	%r12,%rax		# save unreduced value
	adcq	%r8,%r8
	adcq	%r9,%r9
	movq	%r13,%rbp
	adcq	$0,%r11

	subq	$-1,%r12		# trial-subtract p
	movq	%r8,%rcx
	sbbq	%r14,%r13
	sbbq	$0,%r8
	movq	%r9,%r10
	sbbq	%r15,%r9
	sbbq	$0,%r11			# CF iff 2*x < p

	cmovcq	%rax,%r12
	cmovcq	%rbp,%r13
	movq	%r12,0(%rdi)
	cmovcq	%rcx,%r8
	movq	%r13,8(%rdi)
	cmovcq	%r10,%r9
	movq	%r8,16(%rdi)
	movq	%r9,24(%rdi)

	.byte	0xf3,0xc3		# rep ret
.size	__ecp_nistz256_mul_by_2q,.-__ecp_nistz256_mul_by_2q
.globl	ecp_nistz256_point_double
.type	ecp_nistz256_point_double,@function
.align	32
/*
 * void ecp_nistz256_point_double(P256_POINT *out, const P256_POINT *in);
 * Jacobian point doubling; in/out are (X,Y,Z) at offsets 0/32/64, all in
 * Montgomery form.  rdi = out, rsi = in.  168-byte frame of temporaries:
 *   0(%rsp)   = S / scratch       32(%rsp) = M and res_x scratch
 *   64(%rsp)  = Zsqr / scratch    96(%rsp) = saved in_x
 *   128(%rsp) = scratch (2*...)
 * xmm0/xmm1/xmm2 carry out_x/out_y/out_z pointers across the helper calls
 * (hand-encoded movq GPR<->XMM below).
 */
ecp_nistz256_point_double:
	pushq	%rbp
	pushq	%rbx
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
	subq	$160+8,%rsp

.Lpoint_double_shortcutq:		# entered from point_add when P == Q
	movdqu	0(%rsi),%xmm0		# save in_x
	movq	%rsi,%rbx
	movdqu	16(%rsi),%xmm1
	movq	32+0(%rsi),%r12		# load in_y
	movq	32+8(%rsi),%r13
	movq	32+16(%rsi),%r8
	movq	32+24(%rsi),%r9
	movq	.Lpoly+8(%rip),%r14	# helper calling convention: r14/r15 = p[1]/p[3]
	movq	.Lpoly+24(%rip),%r15
	movdqa	%xmm0,96(%rsp)
	movdqa	%xmm1,96+16(%rsp)
	leaq	32(%rdi),%r10
	leaq	64(%rdi),%r11
.byte	102,72,15,110,199		# movq %rdi,%xmm0  (out_x)
.byte	102,73,15,110,202		# movq %r10,%xmm1  (out_y)
.byte	102,73,15,110,211		# movq %r11,%xmm2  (out_z)

	leaq	0(%rsp),%rdi		# 0(%rsp) = 2*in_y
	call	__ecp_nistz256_mul_by_2q

	movq	64+0(%rsi),%rax		# 64(%rsp) = Zsqr = in_z^2
	movq	64+8(%rsi),%r14
	movq	64+16(%rsi),%r15
	movq	64+24(%rsi),%r8
	leaq	64-0(%rsi),%rsi
	leaq	64(%rsp),%rdi
	call	__ecp_nistz256_sqr_montq

	movq	0+0(%rsp),%rax		# 0(%rsp) = S = (2*in_y)^2
	movq	8+0(%rsp),%r14
	leaq	0+0(%rsp),%rsi
	movq	16+0(%rsp),%r15
	movq	24+0(%rsp),%r8
	leaq	0(%rsp),%rdi
	call	__ecp_nistz256_sqr_montq

	movq	32(%rbx),%rax		# out_z = 2 * in_y * in_z
	movq	64+0(%rbx),%r9
	movq	64+8(%rbx),%r10
	movq	64+16(%rbx),%r11
	movq	64+24(%rbx),%r12
	leaq	64-0(%rbx),%rsi
	leaq	32(%rbx),%rbx
.byte	102,72,15,126,215		# movq %xmm2,%rdi  (out_z)
	call	__ecp_nistz256_mul_montq
	call	__ecp_nistz256_mul_by_2q

	movq	96+0(%rsp),%r12		# 32(%rsp) = in_x + Zsqr
	movq	96+8(%rsp),%r13
	leaq	64(%rsp),%rbx
	movq	96+16(%rsp),%r8
	movq	96+24(%rsp),%r9
	leaq	32(%rsp),%rdi
	call	__ecp_nistz256_add_toq

	movq	96+0(%rsp),%r12		# 64(%rsp) = in_x - Zsqr
	movq	96+8(%rsp),%r13
	leaq	64(%rsp),%rbx
	movq	96+16(%rsp),%r8
	movq	96+24(%rsp),%r9
	leaq	64(%rsp),%rdi
	call	__ecp_nistz256_sub_fromq

	movq	0+0(%rsp),%rax		# out_y (for now) = S^2
	movq	8+0(%rsp),%r14
	leaq	0+0(%rsp),%rsi
	movq	16+0(%rsp),%r15
	movq	24+0(%rsp),%r8
.byte	102,72,15,126,207		# movq %xmm1,%rdi  (out_y)
	call	__ecp_nistz256_sqr_montq
	/* inline div_by_2 of S^2 (result of the square is in r12..r15;
	 * rsi/rbp still hold p[1]/p[3] from __ecp_nistz256_sqr_montq) */
	xorq	%r9,%r9
	movq	%r12,%rax
	addq	$-1,%r12		# + p (p[0] = -1, p[2] = 0)
	movq	%r13,%r10
	adcq	%rsi,%r13
	movq	%r14,%rcx
	adcq	$0,%r14
	movq	%r15,%r8
	adcq	%rbp,%r15
	adcq	$0,%r9
	xorq	%rsi,%rsi
	testq	$1,%rax			# even: keep the original value

	cmovzq	%rax,%r12
	cmovzq	%r10,%r13
	cmovzq	%rcx,%r14
	cmovzq	%r8,%r15
	cmovzq	%rsi,%r9

	movq	%r13,%rax		# shift the 257-bit value right by one
	shrq	$1,%r12
	shlq	$63,%rax
	movq	%r14,%r10
	shrq	$1,%r13
	orq	%rax,%r12
	shlq	$63,%r10
	movq	%r15,%rcx
	shrq	$1,%r14
	orq	%r10,%r13
	shlq	$63,%rcx
	movq	%r12,0(%rdi)		# out_y = S^2/2
	shrq	$1,%r15
	movq	%r13,8(%rdi)
	shlq	$63,%r9
	orq	%rcx,%r14
	orq	%r9,%r15
	movq	%r14,16(%rdi)
	movq	%r15,24(%rdi)
	movq	64(%rsp),%rax		# 32(%rsp) = M = (in_x+Zsqr)*(in_x-Zsqr)
	leaq	64(%rsp),%rbx
	movq	0+32(%rsp),%r9
	movq	8+32(%rsp),%r10
	leaq	0+32(%rsp),%rsi
	movq	16+32(%rsp),%r11
	movq	24+32(%rsp),%r12
	leaq	32(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	leaq	128(%rsp),%rdi		# 128(%rsp) = 2*M
	call	__ecp_nistz256_mul_by_2q

	leaq	32(%rsp),%rbx		# 32(%rsp) = 3*M
	leaq	32(%rsp),%rdi
	call	__ecp_nistz256_add_toq

	movq	96(%rsp),%rax		# 0(%rsp) = in_x * S
	leaq	96(%rsp),%rbx
	movq	0+0(%rsp),%r9
	movq	8+0(%rsp),%r10
	leaq	0+0(%rsp),%rsi
	movq	16+0(%rsp),%r11
	movq	24+0(%rsp),%r12
	leaq	0(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	leaq	128(%rsp),%rdi		# 128(%rsp) = 2 * in_x * S
	call	__ecp_nistz256_mul_by_2q

	movq	0+32(%rsp),%rax		# out_x = M^2 ...
	movq	8+32(%rsp),%r14
	leaq	0+32(%rsp),%rsi
	movq	16+32(%rsp),%r15
	movq	24+32(%rsp),%r8
.byte	102,72,15,126,199		# movq %xmm0,%rdi  (out_x)
	call	__ecp_nistz256_sqr_montq

	leaq	128(%rsp),%rbx		# ... - 2*in_x*S
	movq	%r14,%r8		# reshuffle square's outputs into helper layout;
	movq	%r15,%r9
	movq	%rsi,%r14		# rsi/rbp = p[1]/p[3], restore into r14/r15
	movq	%rbp,%r15
	call	__ecp_nistz256_sub_fromq

	movq	0+0(%rsp),%rax		# regs = in_x*S - out_x (via __ecp_nistz256_subq)
	movq	0+8(%rsp),%rbp
	movq	0+16(%rsp),%rcx
	movq	0+24(%rsp),%r10
	leaq	0(%rsp),%rdi
	call	__ecp_nistz256_subq

	movq	32(%rsp),%rax		# 0(%rsp) = 3*M * (in_x*S - out_x)
	leaq	32(%rsp),%rbx
	movq	%r12,%r14
	xorl	%ecx,%ecx		# ZF = 1 from here on, so the cmovz below always move
	movq	%r12,0+0(%rsp)
	movq	%r13,%r10
	movq	%r13,0+8(%rsp)
	cmovzq	%r8,%r11		# (always taken: ZF set by the xorl above)
	movq	%r8,0+16(%rsp)
	leaq	0-0(%rsp),%rsi
	cmovzq	%r9,%r12		# (always taken)
	movq	%r9,0+24(%rsp)
	movq	%r14,%r9
	leaq	0(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

.byte	102,72,15,126,203		# movq %xmm1,%rbx  (out_y)
.byte	102,72,15,126,207		# movq %xmm1,%rdi
	call	__ecp_nistz256_sub_fromq	# out_y = 3M*(in_x*S - out_x) - S^2/2

	addq	$160+8,%rsp
	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbx
	popq	%rbp
	.byte	0xf3,0xc3		# rep ret
.size	ecp_nistz256_point_double,.-ecp_nistz256_point_double
.globl	ecp_nistz256_point_add
.type	ecp_nistz256_point_add,@function
.align	32
/*
 * void ecp_nistz256_point_add(P256_POINT *r, const P256_POINT *a, const P256_POINT *b);
 * Full Jacobian point addition, constant time except for the branch to
 * the doubling/infinity special cases (which depends only on whether
 * the points coincide).  584-byte frame; input copies live at:
 *   in1 (a): x 384, y 416, z 448    in2 (b): x 480, y 512, z 544
 * Temporaries (standard U1/U2/S1/S2/H/R addition formula):
 *   32 = Z1^2   96 = Z2^2 then R^2   160 = U1   192 = U2 (then U1*H^2)
 *   224 = S1    256 = S2 (then S1*H^3)   0 = H    64 = R
 *   128 = H^3   288/320/352 = res_x/res_y/res_z before the final select
 * xmm5/xmm4 hold "in1 at infinity"/"in2 at infinity" masks; xmm0-xmm3
 * shuttle pointers/values (hand-encoded movq GPR<->XMM).
 */
ecp_nistz256_point_add:
	pushq	%rbp
	pushq	%rbx
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
	subq	$576+8,%rsp

	movdqu	0(%rsi),%xmm0		# copy in1 to the frame
	movdqu	16(%rsi),%xmm1
	movdqu	32(%rsi),%xmm2
	movdqu	48(%rsi),%xmm3
	movdqu	64(%rsi),%xmm4
	movdqu	80(%rsi),%xmm5
	movq	%rsi,%rbx		# rbx = in1, rsi = in2
	movq	%rdx,%rsi
	movdqa	%xmm0,384(%rsp)
	movdqa	%xmm1,384+16(%rsp)
	movdqa	%xmm2,416(%rsp)
	movdqa	%xmm3,416+16(%rsp)
	movdqa	%xmm4,448(%rsp)
	movdqa	%xmm5,448+16(%rsp)
	por	%xmm4,%xmm5		# xmm5 = in1_z halves ORed: zero iff z1 == 0

	movdqu	0(%rsi),%xmm0		# copy in2 to the frame
	pshufd	$0xb1,%xmm5,%xmm3
	movdqu	16(%rsi),%xmm1
	movdqu	32(%rsi),%xmm2
	por	%xmm3,%xmm5
	movdqu	48(%rsi),%xmm3
	movq	64+0(%rsi),%rax		# load in2_z for the first square
	movq	64+8(%rsi),%r14
	movq	64+16(%rsi),%r15
	movq	64+24(%rsi),%r8
	movdqa	%xmm0,480(%rsp)
	pshufd	$0x1e,%xmm5,%xmm4
	movdqa	%xmm1,480+16(%rsp)
	movdqu	64(%rsi),%xmm0
	movdqu	80(%rsi),%xmm1
	movdqa	%xmm2,512(%rsp)
	movdqa	%xmm3,512+16(%rsp)
	por	%xmm4,%xmm5		# xmm5: all dwords nonzero iff z1 != 0
	pxor	%xmm4,%xmm4
	por	%xmm0,%xmm1		# xmm1 = in2_z halves ORed
.byte	102,72,15,110,199		# movq %rdi,%xmm0  (save r pointer)

	leaq	64-0(%rsi),%rsi		# 96(%rsp) = Z2sqr = in2_z^2
	movq	%rax,544+0(%rsp)	# stash in2_z limbs
	movq	%r14,544+8(%rsp)
	movq	%r15,544+16(%rsp)
	movq	%r8,544+24(%rsp)
	leaq	96(%rsp),%rdi
	call	__ecp_nistz256_sqr_montq

	pcmpeqd	%xmm4,%xmm5		# xmm5 = ~0 iff in1 at infinity (z1 == 0)
	pshufd	$0xb1,%xmm1,%xmm4
	por	%xmm1,%xmm4
	pshufd	$0,%xmm5,%xmm5
	pshufd	$0x1e,%xmm4,%xmm3
	por	%xmm3,%xmm4
	pxor	%xmm3,%xmm3
	pcmpeqd	%xmm3,%xmm4		# xmm4 = ~0 iff in2 at infinity (z2 == 0)
	pshufd	$0,%xmm4,%xmm4
	movq	64+0(%rbx),%rax		# load in1_z
	movq	64+8(%rbx),%r14
	movq	64+16(%rbx),%r15
	movq	64+24(%rbx),%r8
.byte	102,72,15,110,203		# movq %r11,%xmm1

	leaq	64-0(%rbx),%rsi		# 32(%rsp) = Z1sqr = in1_z^2
	leaq	32(%rsp),%rdi
	call	__ecp_nistz256_sqr_montq

	movq	544(%rsp),%rax		# 224(%rsp) = in2_z^3
	leaq	544(%rsp),%rbx
	movq	0+96(%rsp),%r9
	movq	8+96(%rsp),%r10
	leaq	0+96(%rsp),%rsi
	movq	16+96(%rsp),%r11
	movq	24+96(%rsp),%r12
	leaq	224(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	movq	448(%rsp),%rax		# 256(%rsp) = in1_z^3
	leaq	448(%rsp),%rbx
	movq	0+32(%rsp),%r9
	movq	8+32(%rsp),%r10
	leaq	0+32(%rsp),%rsi
	movq	16+32(%rsp),%r11
	movq	24+32(%rsp),%r12
	leaq	256(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	movq	416(%rsp),%rax		# 224(%rsp) = S1 = in1_y * in2_z^3
	leaq	416(%rsp),%rbx
	movq	0+224(%rsp),%r9
	movq	8+224(%rsp),%r10
	leaq	0+224(%rsp),%rsi
	movq	16+224(%rsp),%r11
	movq	24+224(%rsp),%r12
	leaq	224(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	movq	512(%rsp),%rax		# 256(%rsp) = S2 = in2_y * in1_z^3
	leaq	512(%rsp),%rbx
	movq	0+256(%rsp),%r9
	movq	8+256(%rsp),%r10
	leaq	0+256(%rsp),%rsi
	movq	16+256(%rsp),%r11
	movq	24+256(%rsp),%r12
	leaq	256(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	leaq	224(%rsp),%rbx		# 64(%rsp) = R = S2 - S1
	leaq	64(%rsp),%rdi
	call	__ecp_nistz256_sub_fromq

	orq	%r13,%r12		# r12 = 0 iff R == 0 (y's agree)
	movdqa	%xmm4,%xmm2
	orq	%r8,%r12
	orq	%r9,%r12
	por	%xmm5,%xmm2		# xmm2 = either input at infinity
.byte	102,73,15,110,220		# movq %r12,%xmm3  (save R == 0 flag)

	movq	384(%rsp),%rax		# 160(%rsp) = U1 = in1_x * Z2sqr
	leaq	384(%rsp),%rbx
	movq	0+96(%rsp),%r9
	movq	8+96(%rsp),%r10
	leaq	0+96(%rsp),%rsi
	movq	16+96(%rsp),%r11
	movq	24+96(%rsp),%r12
	leaq	160(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	movq	480(%rsp),%rax		# 192(%rsp) = U2 = in2_x * Z1sqr
	leaq	480(%rsp),%rbx
	movq	0+32(%rsp),%r9
	movq	8+32(%rsp),%r10
	leaq	0+32(%rsp),%rsi
	movq	16+32(%rsp),%r11
	movq	24+32(%rsp),%r12
	leaq	192(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	leaq	160(%rsp),%rbx		# 0(%rsp) = H = U2 - U1
	leaq	0(%rsp),%rdi
	call	__ecp_nistz256_sub_fromq

	orq	%r13,%r12		# r12 = 0 iff H == 0 (x's agree)
	orq	%r8,%r12
	orq	%r9,%r12

.byte	0x3e				# DS prefix: branch-hint/padding, no semantic effect
	jnz	.Ladd_proceedq		# x's differ: generic addition
.byte	102,73,15,126,208		# movq %xmm2,%r8  (infinity flags)
.byte	102,73,15,126,217		# movq %xmm3,%r9  (R == 0 flag)
	testq	%r8,%r8			# some input at infinity: generic path handles it
	jnz	.Ladd_proceedq
	testq	%r9,%r9			# H == 0 and R == 0: P == Q, go double
	jz	.Ladd_doubleq

	/* H == 0, R != 0: P == -Q, the sum is the point at infinity */
.byte	102,72,15,126,199		# movq %xmm0,%rdi  (r pointer)
	pxor	%xmm0,%xmm0
	movdqu	%xmm0,0(%rdi)
	movdqu	%xmm0,16(%rdi)
	movdqu	%xmm0,32(%rdi)
	movdqu	%xmm0,48(%rdi)
	movdqu	%xmm0,64(%rdi)
	movdqu	%xmm0,80(%rdi)
	jmp	.Ladd_doneq

.align	32
.Ladd_doubleq:
.byte	102,72,15,126,206		# movq %xmm1,%rsi
.byte	102,72,15,126,199		# movq %xmm0,%rdi
	addq	$416,%rsp		# shrink frame to point_double's layout
	jmp	.Lpoint_double_shortcutq

.align	32
.Ladd_proceedq:
	movq	0+64(%rsp),%rax		# 96(%rsp) = R^2
	movq	8+64(%rsp),%r14
	leaq	0+64(%rsp),%rsi
	movq	16+64(%rsp),%r15
	movq	24+64(%rsp),%r8
	leaq	96(%rsp),%rdi
	call	__ecp_nistz256_sqr_montq

	movq	448(%rsp),%rax		# 352(%rsp) = in1_z * H
	leaq	448(%rsp),%rbx
	movq	0+0(%rsp),%r9
	movq	8+0(%rsp),%r10
	leaq	0+0(%rsp),%rsi
	movq	16+0(%rsp),%r11
	movq	24+0(%rsp),%r12
	leaq	352(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	movq	0+0(%rsp),%rax		# 32(%rsp) = H^2
	movq	8+0(%rsp),%r14
	leaq	0+0(%rsp),%rsi
	movq	16+0(%rsp),%r15
	movq	24+0(%rsp),%r8
	leaq	32(%rsp),%rdi
	call	__ecp_nistz256_sqr_montq

	movq	544(%rsp),%rax		# 352(%rsp) = res_z = in1_z * in2_z * H
	leaq	544(%rsp),%rbx
	movq	0+352(%rsp),%r9
	movq	8+352(%rsp),%r10
	leaq	0+352(%rsp),%rsi
	movq	16+352(%rsp),%r11
	movq	24+352(%rsp),%r12
	leaq	352(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	movq	0(%rsp),%rax		# 128(%rsp) = H^3
	leaq	0(%rsp),%rbx
	movq	0+32(%rsp),%r9
	movq	8+32(%rsp),%r10
	leaq	0+32(%rsp),%rsi
	movq	16+32(%rsp),%r11
	movq	24+32(%rsp),%r12
	leaq	128(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	movq	160(%rsp),%rax		# 192(%rsp) = U1 * H^2
	leaq	160(%rsp),%rbx
	movq	0+32(%rsp),%r9
	movq	8+32(%rsp),%r10
	leaq	0+32(%rsp),%rsi
	movq	16+32(%rsp),%r11
	movq	24+32(%rsp),%r12
	leaq	192(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	/* inline mul_by_2: regs = 2 * U1*H^2 mod p (same pattern as
	 * __ecp_nistz256_mul_by_2q, but without the store) */
	xorq	%r11,%r11
	addq	%r12,%r12
	leaq	96(%rsp),%rsi		# rsi = R^2, subtrahend source for subq below
	adcq	%r13,%r13
	movq	%r12,%rax
	adcq	%r8,%r8
	adcq	%r9,%r9
	movq	%r13,%rbp
	adcq	$0,%r11

	subq	$-1,%r12
	movq	%r8,%rcx
	sbbq	%r14,%r13
	sbbq	$0,%r8
	movq	%r9,%r10
	sbbq	%r15,%r9
	sbbq	$0,%r11

	cmovcq	%rax,%r12
	movq	0(%rsi),%rax		# load R^2 limbs for the subtraction
	cmovcq	%rbp,%r13
	movq	8(%rsi),%rbp
	cmovcq	%rcx,%r8
	movq	16(%rsi),%rcx
	cmovcq	%r10,%r9
	movq	24(%rsi),%r10

	call	__ecp_nistz256_subq	# regs = R^2 - 2*U1*H^2

	leaq	128(%rsp),%rbx		# 288(%rsp) = res_x = R^2 - 2*U1*H^2 - H^3
	leaq	288(%rsp),%rdi
	call	__ecp_nistz256_sub_fromq

	movq	192+0(%rsp),%rax	# regs = U1*H^2 - res_x
	movq	192+8(%rsp),%rbp
	movq	192+16(%rsp),%rcx
	movq	192+24(%rsp),%r10
	leaq	320(%rsp),%rdi

	call	__ecp_nistz256_subq

	movq	%r12,0(%rdi)		# store it (subq leaves result in regs only)
	movq	%r13,8(%rdi)
	movq	%r8,16(%rdi)
	movq	%r9,24(%rdi)
	movq	128(%rsp),%rax		# 256(%rsp) = S1 * H^3
	leaq	128(%rsp),%rbx
	movq	0+224(%rsp),%r9
	movq	8+224(%rsp),%r10
	leaq	0+224(%rsp),%rsi
	movq	16+224(%rsp),%r11
	movq	24+224(%rsp),%r12
	leaq	256(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	movq	320(%rsp),%rax		# 320(%rsp) = R * (U1*H^2 - res_x)
	leaq	320(%rsp),%rbx
	movq	0+64(%rsp),%r9
	movq	8+64(%rsp),%r10
	leaq	0+64(%rsp),%rsi
	movq	16+64(%rsp),%r11
	movq	24+64(%rsp),%r12
	leaq	320(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	leaq	256(%rsp),%rbx		# 320(%rsp) = res_y = R*(U1*H^2 - res_x) - S1*H^3
	leaq	320(%rsp),%rdi
	call	__ecp_nistz256_sub_fromq

.byte	102,72,15,126,199		# movq %xmm0,%rdi  (r pointer)

	/* constant-time output select per coordinate:
	 * in1 at infinity (xmm5) -> take in2; in2 at infinity (xmm4) ->
	 * take in1; otherwise take the computed result.  z first: */
	movdqa	%xmm5,%xmm0
	movdqa	%xmm5,%xmm1
	pandn	352(%rsp),%xmm0		# res_z if in1 finite
	movdqa	%xmm5,%xmm2
	pandn	352+16(%rsp),%xmm1
	movdqa	%xmm5,%xmm3
	pand	544(%rsp),%xmm2		# in2_z if in1 at infinity
	pand	544+16(%rsp),%xmm3
	por	%xmm0,%xmm2
	por	%xmm1,%xmm3

	movdqa	%xmm4,%xmm0
	movdqa	%xmm4,%xmm1
	pandn	%xmm2,%xmm0
	movdqa	%xmm4,%xmm2
	pandn	%xmm3,%xmm1
	movdqa	%xmm4,%xmm3
	pand	448(%rsp),%xmm2		# in1_z if in2 at infinity
	pand	448+16(%rsp),%xmm3
	por	%xmm0,%xmm2
	por	%xmm1,%xmm3
	movdqu	%xmm2,64(%rdi)
	movdqu	%xmm3,80(%rdi)

	/* x coordinate */
	movdqa	%xmm5,%xmm0
	movdqa	%xmm5,%xmm1
	pandn	288(%rsp),%xmm0
	movdqa	%xmm5,%xmm2
	pandn	288+16(%rsp),%xmm1
	movdqa	%xmm5,%xmm3
	pand	480(%rsp),%xmm2
	pand	480+16(%rsp),%xmm3
	por	%xmm0,%xmm2
	por	%xmm1,%xmm3

	movdqa	%xmm4,%xmm0
	movdqa	%xmm4,%xmm1
	pandn	%xmm2,%xmm0
	movdqa	%xmm4,%xmm2
	pandn	%xmm3,%xmm1
	movdqa	%xmm4,%xmm3
	pand	384(%rsp),%xmm2
	pand	384+16(%rsp),%xmm3
	por	%xmm0,%xmm2
	por	%xmm1,%xmm3
	movdqu	%xmm2,0(%rdi)
	movdqu	%xmm3,16(%rdi)

	/* y coordinate */
	movdqa	%xmm5,%xmm0
	movdqa	%xmm5,%xmm1
	pandn	320(%rsp),%xmm0
	movdqa	%xmm5,%xmm2
	pandn	320+16(%rsp),%xmm1
	movdqa	%xmm5,%xmm3
	pand	512(%rsp),%xmm2
	pand	512+16(%rsp),%xmm3
	por	%xmm0,%xmm2
	por	%xmm1,%xmm3

	movdqa	%xmm4,%xmm0
	movdqa	%xmm4,%xmm1
	pandn	%xmm2,%xmm0
	movdqa	%xmm4,%xmm2
	pandn	%xmm3,%xmm1
	movdqa	%xmm4,%xmm3
	pand	416(%rsp),%xmm2
	pand	416+16(%rsp),%xmm3
	por	%xmm0,%xmm2
	por	%xmm1,%xmm3
	movdqu	%xmm2,32(%rdi)
	movdqu	%xmm3,48(%rdi)

.Ladd_doneq:
	addq	$576+8,%rsp
	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbx
	popq	%rbp
	.byte	0xf3,0xc3		# rep ret
.size	ecp_nistz256_point_add,.-ecp_nistz256_point_add
/*
 * void ecp_nistz256_point_add_affine(P256_POINT *r, const P256_POINT *a,
 *                                    const P256_POINT_AFFINE *b);
 *
 * r = a + b on NIST P-256.  a is Jacobian (X:Y:Z, three 256-bit
 * Montgomery-form elements at 0/32/64(%rsi)); b is affine (x,y at
 * 0/32(%rdx)).  Branch-free: infinity cases are resolved with SSE2
 * mask selects, so timing does not depend on input values.
 *
 * NOTE(review): generated code; the prototype above is inferred from the
 * access pattern (96 bytes read via %rsi, 64 via %rdx, 96 written via
 * %rdi) -- confirm against the C caller.
 *
 * Scratch frame (480 bytes at %rsp):
 *   0   U2      32  Z1sqr/S2/tmp  64  H, later U2-X3
 *   96  R       128 Hsqr          160 Hcub
 *   192 Rsqr    224 res_x         256 res_y    288 res_z
 *   320 in1_x   352 in1_y         384 in1_z
 *   416 in2_x   448 in2_y
 *
 * Helpers (bodies not in this view -- conventions inferred from call sites):
 *   __ecp_nistz256_mul_montq: (%rdi) = (%rsi)*(%rbx) mod p, Montgomery;
 *       caller preloads %rax = b[0], %r9..%r12 = a[0..3]; result is also
 *       left in %r12,%r13,%r8,%r9.
 *   __ecp_nistz256_sqr_montq: (%rdi) = (%rsi)^2 mod p; caller preloads
 *       %rax,%r14,%r15,%r8 = a[0..3]; result left in %r12..%r15.
 *   __ecp_nistz256_subq:      %r12,%r13,%r8,%r9 =
 *       (%rax,%rbp,%rcx,%r10) - (%r12,%r13,%r8,%r9) mod p.
 *   __ecp_nistz256_sub_fromq: (%rdi) = regs - (%rbx) mod p.
 */
.globl	ecp_nistz256_point_add_affine
.type	ecp_nistz256_point_add_affine,@function
.align	32
ecp_nistz256_point_add_affine:
	pushq	%rbp
	pushq	%rbx
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
	subq	$480+8,%rsp		// 6 pushes + 488 keeps %rsp 16-aligned
					// (needed by the movdqa spills below)

	// Copy in1 (X1,Y1,Z1) into the frame; start OR-folding the Z1
	// words in %xmm5 for the "in1 at infinity?" (Z1 == 0) test.
	movdqu	0(%rsi),%xmm0
	movq	%rdx,%rbx		// %rbx = in2 (affine point)
	movdqu	16(%rsi),%xmm1
	movdqu	32(%rsi),%xmm2
	movdqu	48(%rsi),%xmm3
	movdqu	64(%rsi),%xmm4
	movdqu	80(%rsi),%xmm5
	movq	64+0(%rsi),%rax		// preload Z1 limbs for sqr_montq
	movq	64+8(%rsi),%r14
	movq	64+16(%rsi),%r15
	movq	64+24(%rsi),%r8
	movdqa	%xmm0,320(%rsp)		// in1_x
	movdqa	%xmm1,320+16(%rsp)
	movdqa	%xmm2,352(%rsp)		// in1_y
	movdqa	%xmm3,352+16(%rsp)
	movdqa	%xmm4,384(%rsp)		// in1_z
	movdqa	%xmm5,384+16(%rsp)
	por	%xmm4,%xmm5

	// Copy in2 (x2,y2); OR all of its words into %xmm3 for the
	// "in2 encoded as infinity?" (x2|y2 == 0) test.
	movdqu	0(%rbx),%xmm0
	pshufd	$0xb1,%xmm5,%xmm3
	movdqu	16(%rbx),%xmm1
	movdqu	32(%rbx),%xmm2
	por	%xmm3,%xmm5
	movdqu	48(%rbx),%xmm3
	movdqa	%xmm0,416(%rsp)		// in2_x
	pshufd	$0x1e,%xmm5,%xmm4
	movdqa	%xmm1,416+16(%rsp)
	por	%xmm0,%xmm1
.byte	102,72,15,110,199		// movq %rdi,%xmm0: stash out_ptr
					// (hand-encoded for old assemblers)
	movdqa	%xmm2,448(%rsp)		// in2_y
	movdqa	%xmm3,448+16(%rsp)
	por	%xmm2,%xmm3
	por	%xmm4,%xmm5		// %xmm5 = OR of all Z1 words
	pxor	%xmm4,%xmm4
	por	%xmm1,%xmm3		// %xmm3 = OR of all x2|y2 words

	leaq	64-0(%rsi),%rsi		// Z1sqr = Z1^2 -> 32(%rsp)
	leaq	32(%rsp),%rdi
	call	__ecp_nistz256_sqr_montq

	// Finish the infinity masks (interleaved with GPR setup):
	// %xmm5 = all-ones iff in1 is at infinity,
	// %xmm4 = all-ones iff in2 is the encoded infinity.
	pcmpeqd	%xmm4,%xmm5
	pshufd	$0xb1,%xmm3,%xmm4
	movq	0(%rbx),%rax		// b[0] = x2[0] for mul_montq

	movq	%r12,%r9		// a[0..3] = Z1sqr (sqr result regs)
	por	%xmm3,%xmm4
	pshufd	$0,%xmm5,%xmm5		// broadcast in1infty mask
	pshufd	$0x1e,%xmm4,%xmm3
	movq	%r13,%r10
	por	%xmm3,%xmm4
	pxor	%xmm3,%xmm3
	movq	%r14,%r11
	pcmpeqd	%xmm3,%xmm4
	pshufd	$0,%xmm4,%xmm4		// broadcast in2infty mask

	leaq	32-0(%rsp),%rsi		// U2 = x2*Z1sqr -> 0(%rsp)
	movq	%r15,%r12
	leaq	0(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	leaq	320(%rsp),%rbx		// H = U2 - X1 -> 64(%rsp)
	leaq	64(%rsp),%rdi
	call	__ecp_nistz256_sub_fromq

	movq	384(%rsp),%rax		// Z1cub = Z1*Z1sqr -> 32(%rsp)
	leaq	384(%rsp),%rbx
	movq	0+32(%rsp),%r9
	movq	8+32(%rsp),%r10
	leaq	0+32(%rsp),%rsi
	movq	16+32(%rsp),%r11
	movq	24+32(%rsp),%r12
	leaq	32(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	movq	384(%rsp),%rax		// res_z = Z1*H -> 288(%rsp)
	leaq	384(%rsp),%rbx
	movq	0+64(%rsp),%r9
	movq	8+64(%rsp),%r10
	leaq	0+64(%rsp),%rsi
	movq	16+64(%rsp),%r11
	movq	24+64(%rsp),%r12
	leaq	288(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	movq	448(%rsp),%rax		// S2 = y2*Z1cub -> 32(%rsp)
	leaq	448(%rsp),%rbx
	movq	0+32(%rsp),%r9
	movq	8+32(%rsp),%r10
	leaq	0+32(%rsp),%rsi
	movq	16+32(%rsp),%r11
	movq	24+32(%rsp),%r12
	leaq	32(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	leaq	352(%rsp),%rbx		// R = S2 - Y1 -> 96(%rsp)
	leaq	96(%rsp),%rdi
	call	__ecp_nistz256_sub_fromq

	movq	0+64(%rsp),%rax		// Hsqr = H^2 -> 128(%rsp)
	movq	8+64(%rsp),%r14
	leaq	0+64(%rsp),%rsi
	movq	16+64(%rsp),%r15
	movq	24+64(%rsp),%r8
	leaq	128(%rsp),%rdi
	call	__ecp_nistz256_sqr_montq

	movq	0+96(%rsp),%rax		// Rsqr = R^2 -> 192(%rsp)
	movq	8+96(%rsp),%r14
	leaq	0+96(%rsp),%rsi
	movq	16+96(%rsp),%r15
	movq	24+96(%rsp),%r8
	leaq	192(%rsp),%rdi
	call	__ecp_nistz256_sqr_montq

	movq	128(%rsp),%rax		// Hcub = Hsqr*H -> 160(%rsp)
	leaq	128(%rsp),%rbx
	movq	0+64(%rsp),%r9
	movq	8+64(%rsp),%r10
	leaq	0+64(%rsp),%rsi
	movq	16+64(%rsp),%r11
	movq	24+64(%rsp),%r12
	leaq	160(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	movq	320(%rsp),%rax		// U2 = X1*Hsqr -> 0(%rsp)
	leaq	320(%rsp),%rbx
	movq	0+128(%rsp),%r9
	movq	8+128(%rsp),%r10
	leaq	0+128(%rsp),%rsi
	movq	16+128(%rsp),%r11
	movq	24+128(%rsp),%r12
	leaq	0(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	// t = 2*U2 mod p, using the mul result still in %r12,%r13,%r8,%r9.
	// Double with carry-out in %r11, then conditionally subtract p:
	// p[0] = 2^64-1 (subq $-1), p[2] = 0; NOTE(review): relies on the
	// *_montq helpers leaving p[1] in %r14 and p[3] in %r15 -- confirm.
	xorq	%r11,%r11
	addq	%r12,%r12
	leaq	192(%rsp),%rsi		// %rsi = Rsqr for the subq below
	adcq	%r13,%r13
	movq	%r12,%rax
	adcq	%r8,%r8
	adcq	%r9,%r9
	movq	%r13,%rbp
	adcq	$0,%r11

	subq	$-1,%r12
	movq	%r8,%rcx
	sbbq	%r14,%r13
	sbbq	$0,%r8
	movq	%r9,%r10
	sbbq	%r15,%r9
	sbbq	$0,%r11

	cmovcq	%rax,%r12		// keep pre-subtraction value on borrow
	movq	0(%rsi),%rax		// load Rsqr limbs for subq
	cmovcq	%rbp,%r13
	movq	8(%rsi),%rbp
	cmovcq	%rcx,%r8
	movq	16(%rsi),%rcx
	cmovcq	%r10,%r9
	movq	24(%rsi),%r10

	call	__ecp_nistz256_subq	// regs = Rsqr - 2*U2

	leaq	160(%rsp),%rbx		// res_x = Rsqr - 2*U2 - Hcub
	leaq	224(%rsp),%rdi		//   -> 224(%rsp)
	call	__ecp_nistz256_sub_fromq

	movq	0+0(%rsp),%rax		// load U2 limbs
	movq	0+8(%rsp),%rbp
	movq	0+16(%rsp),%rcx
	movq	0+24(%rsp),%r10
	leaq	64(%rsp),%rdi

	call	__ecp_nistz256_subq	// regs = U2 - res_x

	movq	%r12,0(%rdi)		// store U2 - res_x -> 64(%rsp)
	movq	%r13,8(%rdi)
	movq	%r8,16(%rdi)
	movq	%r9,24(%rdi)
	movq	352(%rsp),%rax		// S2 = Y1*Hcub -> 32(%rsp)
	leaq	352(%rsp),%rbx
	movq	0+160(%rsp),%r9
	movq	8+160(%rsp),%r10
	leaq	0+160(%rsp),%rsi
	movq	16+160(%rsp),%r11
	movq	24+160(%rsp),%r12
	leaq	32(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	movq	96(%rsp),%rax		// R*(U2 - res_x) -> 64(%rsp)
	leaq	96(%rsp),%rbx
	movq	0+64(%rsp),%r9
	movq	8+64(%rsp),%r10
	leaq	0+64(%rsp),%rsi
	movq	16+64(%rsp),%r11
	movq	24+64(%rsp),%r12
	leaq	64(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	leaq	32(%rsp),%rbx		// res_y = R*(U2-res_x) - Y1*Hcub
	leaq	256(%rsp),%rdi		//   -> 256(%rsp)
	call	__ecp_nistz256_sub_fromq

.byte	102,72,15,126,199		// movq %xmm0,%rdi: restore out_ptr

	// Constant-time result selection, per coordinate:
	//   in2 infinite  -> copy in1 (mask %xmm4 wins, applied second);
	//   in1 infinite  -> copy in2, with Z = 1 in Montgomery form
	//                    (b is affine) via .LONE_mont;
	//   otherwise     -> computed res_x/res_y/res_z.
	movdqa	%xmm5,%xmm0		// Z coordinate
	movdqa	%xmm5,%xmm1
	pandn	288(%rsp),%xmm0		// res_z if !in1infty
	movdqa	%xmm5,%xmm2
	pandn	288+16(%rsp),%xmm1
	movdqa	%xmm5,%xmm3
	pand	.LONE_mont(%rip),%xmm2	// ONE_mont if in1infty
	pand	.LONE_mont+16(%rip),%xmm3
	por	%xmm0,%xmm2
	por	%xmm1,%xmm3

	movdqa	%xmm4,%xmm0
	movdqa	%xmm4,%xmm1
	pandn	%xmm2,%xmm0
	movdqa	%xmm4,%xmm2
	pandn	%xmm3,%xmm1
	movdqa	%xmm4,%xmm3
	pand	384(%rsp),%xmm2		// in1_z if in2infty
	pand	384+16(%rsp),%xmm3
	por	%xmm0,%xmm2
	por	%xmm1,%xmm3
	movdqu	%xmm2,64(%rdi)
	movdqu	%xmm3,80(%rdi)

	movdqa	%xmm5,%xmm0		// X coordinate
	movdqa	%xmm5,%xmm1
	pandn	224(%rsp),%xmm0		// res_x if !in1infty
	movdqa	%xmm5,%xmm2
	pandn	224+16(%rsp),%xmm1
	movdqa	%xmm5,%xmm3
	pand	416(%rsp),%xmm2		// in2_x if in1infty
	pand	416+16(%rsp),%xmm3
	por	%xmm0,%xmm2
	por	%xmm1,%xmm3

	movdqa	%xmm4,%xmm0
	movdqa	%xmm4,%xmm1
	pandn	%xmm2,%xmm0
	movdqa	%xmm4,%xmm2
	pandn	%xmm3,%xmm1
	movdqa	%xmm4,%xmm3
	pand	320(%rsp),%xmm2		// in1_x if in2infty
	pand	320+16(%rsp),%xmm3
	por	%xmm0,%xmm2
	por	%xmm1,%xmm3
	movdqu	%xmm2,0(%rdi)
	movdqu	%xmm3,16(%rdi)

	movdqa	%xmm5,%xmm0		// Y coordinate
	movdqa	%xmm5,%xmm1
	pandn	256(%rsp),%xmm0		// res_y if !in1infty
	movdqa	%xmm5,%xmm2
	pandn	256+16(%rsp),%xmm1
	movdqa	%xmm5,%xmm3
	pand	448(%rsp),%xmm2		// in2_y if in1infty
	pand	448+16(%rsp),%xmm3
	por	%xmm0,%xmm2
	por	%xmm1,%xmm3

	movdqa	%xmm4,%xmm0
	movdqa	%xmm4,%xmm1
	pandn	%xmm2,%xmm0
	movdqa	%xmm4,%xmm2
	pandn	%xmm3,%xmm1
	movdqa	%xmm4,%xmm3
	pand	352(%rsp),%xmm2		// in1_y if in2infty
	pand	352+16(%rsp),%xmm3
	por	%xmm0,%xmm2
	por	%xmm1,%xmm3
	movdqu	%xmm2,32(%rdi)
	movdqu	%xmm3,48(%rdi)

	addq	$480+8,%rsp		// epilogue: restore callee-saved regs
	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbx
	popq	%rbp
	.byte	0xf3,0xc3		// ret (encoded as repz ret)
.size	ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine
2016