1#!/usr/bin/env perl
2
3##############################################################################
4#                                                                            #
5# Copyright 2014 Intel Corporation                                           #
6#                                                                            #
7# Licensed under the Apache License, Version 2.0 (the "License");            #
8# you may not use this file except in compliance with the License.           #
9# You may obtain a copy of the License at                                    #
10#                                                                            #
11#    http://www.apache.org/licenses/LICENSE-2.0                              #
12#                                                                            #
13# Unless required by applicable law or agreed to in writing, software        #
14# distributed under the License is distributed on an "AS IS" BASIS,          #
15# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.   #
16# See the License for the specific language governing permissions and        #
17# limitations under the License.                                             #
18#                                                                            #
19##############################################################################
20#                                                                            #
21#  Developers and authors:                                                   #
22#  Shay Gueron (1, 2), and Vlad Krasnov (1)                                  #
23#  (1) Intel Corporation, Israel Development Center                          #
24#  (2) University of Haifa                                                   #
25#  Reference:                                                                #
26#  S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with#
27#                           256 Bit Primes"                                  #
28#                                                                            #
29##############################################################################
30
31$flavour = shift;
32$output  = shift;
33if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
34
35$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
36
37$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
38( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
39( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
40die "can't locate x86_64-xlate.pl";
41
42open OUT,"| \"$^X\" $xlate $flavour $output";
43*STDOUT=*OUT;
44
45if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
46		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
47	$avx = ($1>=2.19) + ($1>=2.22);
48	$addx = ($1>=2.23);
49}
50
51if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
52	    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
53	$avx = ($1>=2.09) + ($1>=2.10);
54	$addx = ($1>=2.10);
55}
56
57if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
58	    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
59	$avx = ($1>=10) + ($1>=11);
60	$addx = ($1>=12);
61}
62
63if (!$addx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9])\.([0-9]+)/) {
64	my $ver = $2 + $3/100.0;	# 3.1->3.01, 3.10->3.10
65	$avx = ($ver>=3.0) + ($ver>=3.01);
66	$addx = ($ver>=3.03);
67}
68
69if ($avx>=2) {{
70$digit_size = "\$29";
71$n_digits = "\$9";
72
73$code.=<<___;
74.text
75
76.align 64
77.LAVX2_AND_MASK:
78.LAVX2_POLY:
79.quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
80.quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
81.quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
82.quad 0x000001ff, 0x000001ff, 0x000001ff, 0x000001ff
83.quad 0x00000000, 0x00000000, 0x00000000, 0x00000000
84.quad 0x00000000, 0x00000000, 0x00000000, 0x00000000
85.quad 0x00040000, 0x00040000, 0x00040000, 0x00040000
86.quad 0x1fe00000, 0x1fe00000, 0x1fe00000, 0x1fe00000
87.quad 0x00ffffff, 0x00ffffff, 0x00ffffff, 0x00ffffff
88
89.LAVX2_POLY_x2:
90.quad 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC
91.quad 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC
92.quad 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC
93.quad 0x400007FC, 0x400007FC, 0x400007FC, 0x400007FC
94.quad 0x3FFFFFFE, 0x3FFFFFFE, 0x3FFFFFFE, 0x3FFFFFFE
95.quad 0x3FFFFFFE, 0x3FFFFFFE, 0x3FFFFFFE, 0x3FFFFFFE
96.quad 0x400FFFFE, 0x400FFFFE, 0x400FFFFE, 0x400FFFFE
97.quad 0x7F7FFFFE, 0x7F7FFFFE, 0x7F7FFFFE, 0x7F7FFFFE
98.quad 0x03FFFFFC, 0x03FFFFFC, 0x03FFFFFC, 0x03FFFFFC
99
100.LAVX2_POLY_x8:
101.quad 0xFFFFFFF8, 0xFFFFFFF8, 0xFFFFFFF8, 0xFFFFFFF8
102.quad 0xFFFFFFF8, 0xFFFFFFF8, 0xFFFFFFF8, 0xFFFFFFF8
103.quad 0xFFFFFFF8, 0xFFFFFFF8, 0xFFFFFFF8, 0xFFFFFFF8
104.quad 0x80000FF8, 0x80000FF8, 0x80000FF8, 0x80000FF8
105.quad 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC
106.quad 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC
107.quad 0x801FFFFC, 0x801FFFFC, 0x801FFFFC, 0x801FFFFC
108.quad 0xFEFFFFFC, 0xFEFFFFFC, 0xFEFFFFFC, 0xFEFFFFFC
109.quad 0x07FFFFF8, 0x07FFFFF8, 0x07FFFFF8, 0x07FFFFF8
110
111.LONE:
112.quad 0x00000020, 0x00000020, 0x00000020, 0x00000020
113.quad 0x00000000, 0x00000000, 0x00000000, 0x00000000
114.quad 0x00000000, 0x00000000, 0x00000000, 0x00000000
115.quad 0x1fffc000, 0x1fffc000, 0x1fffc000, 0x1fffc000
116.quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
117.quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
118.quad 0x1f7fffff, 0x1f7fffff, 0x1f7fffff, 0x1f7fffff
119.quad 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff
120.quad 0x00000000, 0x00000000, 0x00000000, 0x00000000
121
122# RR = 2^266 mod p in AVX2 format, to transform from the native OpenSSL
123# Montgomery form (*2^256) to our format (*2^261)
124
125.LTO_MONT_AVX2:
126.quad 0x00000400, 0x00000400, 0x00000400, 0x00000400
127.quad 0x00000000, 0x00000000, 0x00000000, 0x00000000
128.quad 0x00000000, 0x00000000, 0x00000000, 0x00000000
129.quad 0x1ff80000, 0x1ff80000, 0x1ff80000, 0x1ff80000
130.quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
131.quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
132.quad 0x0fffffff, 0x0fffffff, 0x0fffffff, 0x0fffffff
133.quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
134.quad 0x00000003, 0x00000003, 0x00000003, 0x00000003
135
136.LFROM_MONT_AVX2:
137.quad 0x00000001, 0x00000001, 0x00000001, 0x00000001
138.quad 0x00000000, 0x00000000, 0x00000000, 0x00000000
139.quad 0x00000000, 0x00000000, 0x00000000, 0x00000000
140.quad 0x1ffffe00, 0x1ffffe00, 0x1ffffe00, 0x1ffffe00
141.quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
142.quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
143.quad 0x1ffbffff, 0x1ffbffff, 0x1ffbffff, 0x1ffbffff
144.quad 0x001fffff, 0x001fffff, 0x001fffff, 0x001fffff
145.quad 0x00000000, 0x00000000, 0x00000000, 0x00000000
146
147.LIntOne:
148.long 1,1,1,1,1,1,1,1
149___
150
151{
152# This function recieves a pointer to an array of four affine points
153# (X, Y, <1>) and rearanges the data for AVX2 execution, while
154# converting it to 2^29 radix redundant form
155
156my ($X0,$X1,$X2,$X3, $Y0,$Y1,$Y2,$Y3,
157    $T0,$T1,$T2,$T3, $T4,$T5,$T6,$T7)=map("%ymm$_",(0..15));
158
159$code.=<<___;
160.globl	ecp_nistz256_avx2_transpose_convert
161.type	ecp_nistz256_avx2_transpose_convert,\@function,2
162.align 64
163ecp_nistz256_avx2_transpose_convert:
164	vzeroupper
165___
166$code.=<<___	if ($win64);
167	lea	-8-16*10(%rsp), %rsp
168	vmovaps	%xmm6, -8-16*10(%rax)
169	vmovaps	%xmm7, -8-16*9(%rax)
170	vmovaps	%xmm8, -8-16*8(%rax)
171	vmovaps	%xmm9, -8-16*7(%rax)
172	vmovaps	%xmm10, -8-16*6(%rax)
173	vmovaps	%xmm11, -8-16*5(%rax)
174	vmovaps	%xmm12, -8-16*4(%rax)
175	vmovaps	%xmm13, -8-16*3(%rax)
176	vmovaps	%xmm14, -8-16*2(%rax)
177	vmovaps	%xmm15, -8-16*1(%rax)
178___
179$code.=<<___;
180	# Load the data
181	vmovdqa		32*0(%rsi), $X0
182	lea		112(%rsi), %rax		# size optimization
183	vmovdqa		32*1(%rsi), $Y0
184	lea		.LAVX2_AND_MASK(%rip), %rdx
185	vmovdqa		32*2(%rsi), $X1
186	vmovdqa		32*3(%rsi), $Y1
187	vmovdqa		32*4-112(%rax), $X2
188	vmovdqa		32*5-112(%rax), $Y2
189	vmovdqa		32*6-112(%rax), $X3
190	vmovdqa		32*7-112(%rax), $Y3
191
192	# Transpose X and Y independently
193	vpunpcklqdq	$X1, $X0, $T0		# T0 = [B2 A2 B0 A0]
194	vpunpcklqdq	$X3, $X2, $T1		# T1 = [D2 C2 D0 C0]
195	vpunpckhqdq	$X1, $X0, $T2		# T2 = [B3 A3 B1 A1]
196	vpunpckhqdq	$X3, $X2, $T3		# T3 = [D3 C3 D1 C1]
197
198	vpunpcklqdq	$Y1, $Y0, $T4
199	vpunpcklqdq	$Y3, $Y2, $T5
200	vpunpckhqdq	$Y1, $Y0, $T6
201	vpunpckhqdq	$Y3, $Y2, $T7
202
203	vperm2i128	\$0x20, $T1, $T0, $X0	# X0 = [D0 C0 B0 A0]
204	vperm2i128	\$0x20, $T3, $T2, $X1	# X1 = [D1 C1 B1 A1]
205	vperm2i128	\$0x31, $T1, $T0, $X2	# X2 = [D2 C2 B2 A2]
206	vperm2i128	\$0x31, $T3, $T2, $X3	# X3 = [D3 C3 B3 A3]
207
208	vperm2i128	\$0x20, $T5, $T4, $Y0
209	vperm2i128	\$0x20, $T7, $T6, $Y1
210	vperm2i128	\$0x31, $T5, $T4, $Y2
211	vperm2i128	\$0x31, $T7, $T6, $Y3
212	vmovdqa		(%rdx), $T7
213
214	vpand		(%rdx), $X0, $T0	# out[0] = in[0] & mask;
215	vpsrlq		\$29, $X0, $X0
216	vpand		$T7, $X0, $T1		# out[1] = (in[0] >> shift) & mask;
217	vpsrlq		\$29, $X0, $X0
218	vpsllq		\$6, $X1, $T2
219	vpxor		$X0, $T2, $T2
220	vpand		$T7, $T2, $T2		# out[2] = ((in[0] >> (shift*2)) ^ (in[1] << (64-shift*2))) & mask;
221	vpsrlq		\$23, $X1, $X1
222	vpand		$T7, $X1, $T3		# out[3] = (in[1] >> ((shift*3)%64)) & mask;
223	vpsrlq		\$29, $X1, $X1
224	vpsllq		\$12, $X2, $T4
225	vpxor		$X1, $T4, $T4
226	vpand		$T7, $T4, $T4		# out[4] = ((in[1] >> ((shift*4)%64)) ^ (in[2] << (64*2-shift*4))) & mask;
227	vpsrlq		\$17, $X2, $X2
228	vpand		$T7, $X2, $T5		# out[5] = (in[2] >> ((shift*5)%64)) & mask;
229	vpsrlq		\$29, $X2, $X2
230	vpsllq		\$18, $X3, $T6
231	vpxor		$X2, $T6, $T6
232	vpand		$T7, $T6, $T6		# out[6] = ((in[2] >> ((shift*6)%64)) ^ (in[3] << (64*3-shift*6))) & mask;
233	vpsrlq		\$11, $X3, $X3
234	 vmovdqa	$T0, 32*0(%rdi)
235	 lea		112(%rdi), %rax		# size optimization
236	vpand		$T7, $X3, $T0		# out[7] = (in[3] >> ((shift*7)%64)) & mask;
237	vpsrlq		\$29, $X3, $X3		# out[8] = (in[3] >> ((shift*8)%64)) & mask;
238
239	vmovdqa		$T1, 32*1(%rdi)
240	vmovdqa		$T2, 32*2(%rdi)
241	vmovdqa		$T3, 32*3(%rdi)
242	vmovdqa		$T4, 32*4-112(%rax)
243	vmovdqa		$T5, 32*5-112(%rax)
244	vmovdqa		$T6, 32*6-112(%rax)
245	vmovdqa		$T0, 32*7-112(%rax)
246	vmovdqa		$X3, 32*8-112(%rax)
247	lea		448(%rdi), %rax		# size optimization
248
249	vpand		$T7, $Y0, $T0		# out[0] = in[0] & mask;
250	vpsrlq		\$29, $Y0, $Y0
251	vpand		$T7, $Y0, $T1		# out[1] = (in[0] >> shift) & mask;
252	vpsrlq		\$29, $Y0, $Y0
253	vpsllq		\$6, $Y1, $T2
254	vpxor		$Y0, $T2, $T2
255	vpand		$T7, $T2, $T2		# out[2] = ((in[0] >> (shift*2)) ^ (in[1] << (64-shift*2))) & mask;
256	vpsrlq		\$23, $Y1, $Y1
257	vpand		$T7, $Y1, $T3		# out[3] = (in[1] >> ((shift*3)%64)) & mask;
258	vpsrlq		\$29, $Y1, $Y1
259	vpsllq		\$12, $Y2, $T4
260	vpxor		$Y1, $T4, $T4
261	vpand		$T7, $T4, $T4		# out[4] = ((in[1] >> ((shift*4)%64)) ^ (in[2] << (64*2-shift*4))) & mask;
262	vpsrlq		\$17, $Y2, $Y2
263	vpand		$T7, $Y2, $T5		# out[5] = (in[2] >> ((shift*5)%64)) & mask;
264	vpsrlq		\$29, $Y2, $Y2
265	vpsllq		\$18, $Y3, $T6
266	vpxor		$Y2, $T6, $T6
267	vpand		$T7, $T6, $T6		# out[6] = ((in[2] >> ((shift*6)%64)) ^ (in[3] << (64*3-shift*6))) & mask;
268	vpsrlq		\$11, $Y3, $Y3
269	 vmovdqa	$T0, 32*9-448(%rax)
270	vpand		$T7, $Y3, $T0		# out[7] = (in[3] >> ((shift*7)%64)) & mask;
271	vpsrlq		\$29, $Y3, $Y3		# out[8] = (in[3] >> ((shift*8)%64)) & mask;
272
273	vmovdqa		$T1, 32*10-448(%rax)
274	vmovdqa		$T2, 32*11-448(%rax)
275	vmovdqa		$T3, 32*12-448(%rax)
276	vmovdqa		$T4, 32*13-448(%rax)
277	vmovdqa		$T5, 32*14-448(%rax)
278	vmovdqa		$T6, 32*15-448(%rax)
279	vmovdqa		$T0, 32*16-448(%rax)
280	vmovdqa		$Y3, 32*17-448(%rax)
281
282	vzeroupper
283___
284$code.=<<___	if ($win64);
285	movaps	16*0(%rsp), %xmm6
286	movaps	16*1(%rsp), %xmm7
287	movaps	16*2(%rsp), %xmm8
288	movaps	16*3(%rsp), %xmm9
289	movaps	16*4(%rsp), %xmm10
290	movaps	16*5(%rsp), %xmm11
291	movaps	16*6(%rsp), %xmm12
292	movaps	16*7(%rsp), %xmm13
293	movaps	16*8(%rsp), %xmm14
294	movaps	16*9(%rsp), %xmm15
295	lea	8+16*10(%rsp), %rsp
296___
297$code.=<<___;
298	ret
299.size	ecp_nistz256_avx2_transpose_convert,.-ecp_nistz256_avx2_transpose_convert
300___
301}
302{
303################################################################################
304# This function recieves a pointer to an array of four AVX2 formatted points
305# (X, Y, Z) convert the data to normal representation, and rearanges the data
306
307my ($D0,$D1,$D2,$D3, $D4,$D5,$D6,$D7, $D8)=map("%ymm$_",(0..8));
308my ($T0,$T1,$T2,$T3, $T4,$T5,$T6)=map("%ymm$_",(9..15));
309
310$code.=<<___;
311
312.globl	ecp_nistz256_avx2_convert_transpose_back
313.type	ecp_nistz256_avx2_convert_transpose_back,\@function,2
314.align	32
315ecp_nistz256_avx2_convert_transpose_back:
316	vzeroupper
317___
318$code.=<<___	if ($win64);
319	lea	-8-16*10(%rsp), %rsp
320	vmovaps	%xmm6, -8-16*10(%rax)
321	vmovaps	%xmm7, -8-16*9(%rax)
322	vmovaps	%xmm8, -8-16*8(%rax)
323	vmovaps	%xmm9, -8-16*7(%rax)
324	vmovaps	%xmm10, -8-16*6(%rax)
325	vmovaps	%xmm11, -8-16*5(%rax)
326	vmovaps	%xmm12, -8-16*4(%rax)
327	vmovaps	%xmm13, -8-16*3(%rax)
328	vmovaps	%xmm14, -8-16*2(%rax)
329	vmovaps	%xmm15, -8-16*1(%rax)
330___
331$code.=<<___;
332	mov	\$3, %ecx
333
334.Lconv_loop:
335	vmovdqa		32*0(%rsi), $D0
336	lea		160(%rsi), %rax		# size optimization
337	vmovdqa		32*1(%rsi), $D1
338	vmovdqa		32*2(%rsi), $D2
339	vmovdqa		32*3(%rsi), $D3
340	vmovdqa		32*4-160(%rax), $D4
341	vmovdqa		32*5-160(%rax), $D5
342	vmovdqa		32*6-160(%rax), $D6
343	vmovdqa		32*7-160(%rax), $D7
344	vmovdqa		32*8-160(%rax), $D8
345
346	vpsllq		\$29, $D1, $D1
347	vpsllq		\$58, $D2, $T0
348	vpaddq		$D1, $D0, $D0
349	vpaddq		$T0, $D0, $D0		# out[0] = (in[0]) ^ (in[1] << shift*1) ^ (in[2] << shift*2);
350
351	vpsrlq		\$6, $D2, $D2
352	vpsllq		\$23, $D3, $D3
353	vpsllq		\$52, $D4, $T1
354	vpaddq		$D2, $D3, $D3
355	vpaddq		$D3, $T1, $D1		# out[1] = (in[2] >> (64*1-shift*2)) ^ (in[3] << shift*3%64) ^ (in[4] << shift*4%64);
356
357	vpsrlq		\$12, $D4, $D4
358	vpsllq		\$17, $D5, $D5
359	vpsllq		\$46, $D6, $T2
360	vpaddq		$D4, $D5, $D5
361	vpaddq		$D5, $T2, $D2		# out[2] = (in[4] >> (64*2-shift*4)) ^ (in[5] << shift*5%64) ^ (in[6] << shift*6%64);
362
363	vpsrlq		\$18, $D6, $D6
364	vpsllq		\$11, $D7, $D7
365	vpsllq		\$40, $D8, $T3
366	vpaddq		$D6, $D7, $D7
367	vpaddq		$D7, $T3, $D3		# out[3] = (in[6] >> (64*3-shift*6)) ^ (in[7] << shift*7%64) ^ (in[8] << shift*8%64);
368
369	vpunpcklqdq	$D1, $D0, $T0		# T0 = [B2 A2 B0 A0]
370	vpunpcklqdq	$D3, $D2, $T1		# T1 = [D2 C2 D0 C0]
371	vpunpckhqdq	$D1, $D0, $T2		# T2 = [B3 A3 B1 A1]
372	vpunpckhqdq	$D3, $D2, $T3		# T3 = [D3 C3 D1 C1]
373
374	vperm2i128	\$0x20, $T1, $T0, $D0	# X0 = [D0 C0 B0 A0]
375	vperm2i128	\$0x20, $T3, $T2, $D1	# X1 = [D1 C1 B1 A1]
376	vperm2i128	\$0x31, $T1, $T0, $D2	# X2 = [D2 C2 B2 A2]
377	vperm2i128	\$0x31, $T3, $T2, $D3	# X3 = [D3 C3 B3 A3]
378
379	vmovdqa		$D0, 32*0(%rdi)
380	vmovdqa		$D1, 32*3(%rdi)
381	vmovdqa		$D2, 32*6(%rdi)
382	vmovdqa		$D3, 32*9(%rdi)
383
384	lea		32*9(%rsi), %rsi
385	lea		32*1(%rdi), %rdi
386
387	dec	%ecx
388	jnz	.Lconv_loop
389
390	vzeroupper
391___
392$code.=<<___	if ($win64);
393	movaps	16*0(%rsp), %xmm6
394	movaps	16*1(%rsp), %xmm7
395	movaps	16*2(%rsp), %xmm8
396	movaps	16*3(%rsp), %xmm9
397	movaps	16*4(%rsp), %xmm10
398	movaps	16*5(%rsp), %xmm11
399	movaps	16*6(%rsp), %xmm12
400	movaps	16*7(%rsp), %xmm13
401	movaps	16*8(%rsp), %xmm14
402	movaps	16*9(%rsp), %xmm15
403	lea	8+16*10(%rsp), %rsp
404___
405$code.=<<___;
406	ret
407.size	ecp_nistz256_avx2_convert_transpose_back,.-ecp_nistz256_avx2_convert_transpose_back
408___
409}
410{
411my ($r_ptr,$a_ptr,$b_ptr,$itr)=("%rdi","%rsi","%rdx","%ecx");
412my ($ACC0,$ACC1,$ACC2,$ACC3,$ACC4,$ACC5,$ACC6,$ACC7,$ACC8)=map("%ymm$_",(0..8));
413my ($B,$Y,$T0,$AND_MASK,$OVERFLOW)=map("%ymm$_",(9..13));
414
415sub NORMALIZE {
416my $ret=<<___;
417	vpsrlq		$digit_size, $ACC0, $T0
418	vpand		$AND_MASK, $ACC0, $ACC0
419	vpaddq		$T0, $ACC1, $ACC1
420
421	vpsrlq		$digit_size, $ACC1, $T0
422	vpand		$AND_MASK, $ACC1, $ACC1
423	vpaddq		$T0, $ACC2, $ACC2
424
425	vpsrlq		$digit_size, $ACC2, $T0
426	vpand		$AND_MASK, $ACC2, $ACC2
427	vpaddq		$T0, $ACC3, $ACC3
428
429	vpsrlq		$digit_size, $ACC3, $T0
430	vpand		$AND_MASK, $ACC3, $ACC3
431	vpaddq		$T0, $ACC4, $ACC4
432
433	vpsrlq		$digit_size, $ACC4, $T0
434	vpand		$AND_MASK, $ACC4, $ACC4
435	vpaddq		$T0, $ACC5, $ACC5
436
437	vpsrlq		$digit_size, $ACC5, $T0
438	vpand		$AND_MASK, $ACC5, $ACC5
439	vpaddq		$T0, $ACC6, $ACC6
440
441	vpsrlq		$digit_size, $ACC6, $T0
442	vpand		$AND_MASK, $ACC6, $ACC6
443	vpaddq		$T0, $ACC7, $ACC7
444
445	vpsrlq		$digit_size, $ACC7, $T0
446	vpand		$AND_MASK, $ACC7, $ACC7
447	vpaddq		$T0, $ACC8, $ACC8
448	#vpand		$AND_MASK, $ACC8, $ACC8
449___
450    $ret;
451}
452
453sub STORE {
454my $ret=<<___;
455	vmovdqa		$ACC0, 32*0(%rdi)
456	lea		160(%rdi), %rax		# size optimization
457	vmovdqa		$ACC1, 32*1(%rdi)
458	vmovdqa		$ACC2, 32*2(%rdi)
459	vmovdqa		$ACC3, 32*3(%rdi)
460	vmovdqa		$ACC4, 32*4-160(%rax)
461	vmovdqa		$ACC5, 32*5-160(%rax)
462	vmovdqa		$ACC6, 32*6-160(%rax)
463	vmovdqa		$ACC7, 32*7-160(%rax)
464	vmovdqa		$ACC8, 32*8-160(%rax)
465___
466    $ret;
467}
468
469$code.=<<___;
470.type	avx2_normalize,\@abi-omnipotent
471.align	32
472avx2_normalize:
473	vpsrlq		$digit_size, $ACC0, $T0
474	vpand		$AND_MASK, $ACC0, $ACC0
475	vpaddq		$T0, $ACC1, $ACC1
476
477	vpsrlq		$digit_size, $ACC1, $T0
478	vpand		$AND_MASK, $ACC1, $ACC1
479	vpaddq		$T0, $ACC2, $ACC2
480
481	vpsrlq		$digit_size, $ACC2, $T0
482	vpand		$AND_MASK, $ACC2, $ACC2
483	vpaddq		$T0, $ACC3, $ACC3
484
485	vpsrlq		$digit_size, $ACC3, $T0
486	vpand		$AND_MASK, $ACC3, $ACC3
487	vpaddq		$T0, $ACC4, $ACC4
488
489	vpsrlq		$digit_size, $ACC4, $T0
490	vpand		$AND_MASK, $ACC4, $ACC4
491	vpaddq		$T0, $ACC5, $ACC5
492
493	vpsrlq		$digit_size, $ACC5, $T0
494	vpand		$AND_MASK, $ACC5, $ACC5
495	vpaddq		$T0, $ACC6, $ACC6
496
497	vpsrlq		$digit_size, $ACC6, $T0
498	vpand		$AND_MASK, $ACC6, $ACC6
499	vpaddq		$T0, $ACC7, $ACC7
500
501	vpsrlq		$digit_size, $ACC7, $T0
502	vpand		$AND_MASK, $ACC7, $ACC7
503	vpaddq		$T0, $ACC8, $ACC8
504	#vpand		$AND_MASK, $ACC8, $ACC8
505
506	ret
507.size	avx2_normalize,.-avx2_normalize
508
509.type	avx2_normalize_n_store,\@abi-omnipotent
510.align	32
511avx2_normalize_n_store:
512	vpsrlq		$digit_size, $ACC0, $T0
513	vpand		$AND_MASK, $ACC0, $ACC0
514	vpaddq		$T0, $ACC1, $ACC1
515
516	vpsrlq		$digit_size, $ACC1, $T0
517	vpand		$AND_MASK, $ACC1, $ACC1
518	 vmovdqa	$ACC0, 32*0(%rdi)
519	 lea		160(%rdi), %rax		# size optimization
520	vpaddq		$T0, $ACC2, $ACC2
521
522	vpsrlq		$digit_size, $ACC2, $T0
523	vpand		$AND_MASK, $ACC2, $ACC2
524	 vmovdqa	$ACC1, 32*1(%rdi)
525	vpaddq		$T0, $ACC3, $ACC3
526
527	vpsrlq		$digit_size, $ACC3, $T0
528	vpand		$AND_MASK, $ACC3, $ACC3
529	 vmovdqa	$ACC2, 32*2(%rdi)
530	vpaddq		$T0, $ACC4, $ACC4
531
532	vpsrlq		$digit_size, $ACC4, $T0
533	vpand		$AND_MASK, $ACC4, $ACC4
534	 vmovdqa	$ACC3, 32*3(%rdi)
535	vpaddq		$T0, $ACC5, $ACC5
536
537	vpsrlq		$digit_size, $ACC5, $T0
538	vpand		$AND_MASK, $ACC5, $ACC5
539	 vmovdqa	$ACC4, 32*4-160(%rax)
540	vpaddq		$T0, $ACC6, $ACC6
541
542	vpsrlq		$digit_size, $ACC6, $T0
543	vpand		$AND_MASK, $ACC6, $ACC6
544	 vmovdqa	$ACC5, 32*5-160(%rax)
545	vpaddq		$T0, $ACC7, $ACC7
546
547	vpsrlq		$digit_size, $ACC7, $T0
548	vpand		$AND_MASK, $ACC7, $ACC7
549	 vmovdqa	$ACC6, 32*6-160(%rax)
550	vpaddq		$T0, $ACC8, $ACC8
551	#vpand		$AND_MASK, $ACC8, $ACC8
552	 vmovdqa	$ACC7, 32*7-160(%rax)
553	 vmovdqa	$ACC8, 32*8-160(%rax)
554
555	ret
556.size	avx2_normalize_n_store,.-avx2_normalize_n_store
557
558################################################################################
559# void avx2_mul_x4(void* RESULTx4, void *Ax4, void *Bx4);
560.type	avx2_mul_x4,\@abi-omnipotent
561.align	32
562avx2_mul_x4:
563	lea	.LAVX2_POLY(%rip), %rax
564
565	vpxor	$ACC0, $ACC0, $ACC0
566	vpxor	$ACC1, $ACC1, $ACC1
567	vpxor	$ACC2, $ACC2, $ACC2
568	vpxor	$ACC3, $ACC3, $ACC3
569	vpxor	$ACC4, $ACC4, $ACC4
570	vpxor	$ACC5, $ACC5, $ACC5
571	vpxor	$ACC6, $ACC6, $ACC6
572	vpxor	$ACC7, $ACC7, $ACC7
573
574	vmovdqa	32*7(%rax), %ymm14
575	vmovdqa	32*8(%rax), %ymm15
576
577	mov	$n_digits, $itr
578	lea	-512($a_ptr), $a_ptr	# strategic bias to control u-op density
579	jmp	.Lavx2_mul_x4_loop
580
581.align	32
582.Lavx2_mul_x4_loop:
583	vmovdqa		32*0($b_ptr), $B
584	lea		32*1($b_ptr), $b_ptr
585
586	vpmuludq	32*0+512($a_ptr), $B, $T0
587	vpmuludq	32*1+512($a_ptr), $B, $OVERFLOW	# borrow $OVERFLOW
588	vpaddq		$T0, $ACC0, $ACC0
589	vpmuludq	32*2+512($a_ptr), $B, $T0
590	vpaddq		$OVERFLOW, $ACC1, $ACC1
591	 vpand		$AND_MASK, $ACC0, $Y
592	vpmuludq	32*3+512($a_ptr), $B, $OVERFLOW
593	vpaddq		$T0, $ACC2, $ACC2
594	vpmuludq	32*4+512($a_ptr), $B, $T0
595	vpaddq		$OVERFLOW, $ACC3, $ACC3
596	vpmuludq	32*5+512($a_ptr), $B, $OVERFLOW
597	vpaddq		$T0, $ACC4, $ACC4
598	vpmuludq	32*6+512($a_ptr), $B, $T0
599	vpaddq		$OVERFLOW, $ACC5, $ACC5
600	vpmuludq	32*7+512($a_ptr), $B, $OVERFLOW
601	vpaddq		$T0, $ACC6, $ACC6
602
603	# Skip some multiplications, optimizing for the constant poly
604	vpmuludq	$AND_MASK, $Y, $T0
605	 vpaddq		$OVERFLOW, $ACC7, $ACC7
606	 vpmuludq	32*8+512($a_ptr), $B, $ACC8
607	vpaddq		$T0, $ACC0, $OVERFLOW
608	vpaddq		$T0, $ACC1, $ACC0
609	vpsrlq		$digit_size, $OVERFLOW, $OVERFLOW
610	vpaddq		$T0, $ACC2, $ACC1
611	vpmuludq	32*3(%rax), $Y, $T0
612	vpaddq		$OVERFLOW, $ACC0, $ACC0
613	vpaddq		$T0, $ACC3, $ACC2
614	.byte		0x67
615	vmovdqa		$ACC4, $ACC3
616	vpsllq		\$18, $Y, $OVERFLOW
617	.byte		0x67
618	vmovdqa		$ACC5, $ACC4
619	vpmuludq	%ymm14, $Y, $T0
620	vpaddq		$OVERFLOW, $ACC6, $ACC5
621	vpmuludq	%ymm15, $Y, $OVERFLOW
622	vpaddq		$T0, $ACC7, $ACC6
623	vpaddq		$OVERFLOW, $ACC8, $ACC7
624
625	dec	$itr
626	jnz	.Lavx2_mul_x4_loop
627
628	vpxor	$ACC8, $ACC8, $ACC8
629
630	ret
631.size	avx2_mul_x4,.-avx2_mul_x4
632
633# Function optimized for the constant 1
634################################################################################
635# void avx2_mul_by1_x4(void* RESULTx4, void *Ax4);
636.type	avx2_mul_by1_x4,\@abi-omnipotent
637.align	32
638avx2_mul_by1_x4:
639	lea	.LAVX2_POLY(%rip), %rax
640
641	vpxor	$ACC0, $ACC0, $ACC0
642	vpxor	$ACC1, $ACC1, $ACC1
643	vpxor	$ACC2, $ACC2, $ACC2
644	vpxor	$ACC3, $ACC3, $ACC3
645	vpxor	$ACC4, $ACC4, $ACC4
646	vpxor	$ACC5, $ACC5, $ACC5
647	vpxor	$ACC6, $ACC6, $ACC6
648	vpxor	$ACC7, $ACC7, $ACC7
649	vpxor	$ACC8, $ACC8, $ACC8
650
651	vmovdqa	32*3+.LONE(%rip), %ymm14
652	vmovdqa	32*7+.LONE(%rip), %ymm15
653
654	mov	$n_digits, $itr
655	jmp	.Lavx2_mul_by1_x4_loop
656
657.align	32
658.Lavx2_mul_by1_x4_loop:
659	vmovdqa		32*0($a_ptr), $B
660	.byte		0x48,0x8d,0xb6,0x20,0,0,0	# lea	32*1($a_ptr), $a_ptr
661
662	vpsllq		\$5, $B, $OVERFLOW
663	vpmuludq	%ymm14, $B, $T0
664	vpaddq		$OVERFLOW, $ACC0, $ACC0
665	vpaddq		$T0, $ACC3, $ACC3
666	.byte		0x67
667	vpmuludq	$AND_MASK, $B, $T0
668	vpand		$AND_MASK, $ACC0, $Y
669	vpaddq		$T0, $ACC4, $ACC4
670	vpaddq		$T0, $ACC5, $ACC5
671	vpaddq		$T0, $ACC6, $ACC6
672	vpsllq		\$23, $B, $T0
673
674	.byte		0x67,0x67
675	vpmuludq	%ymm15, $B, $OVERFLOW
676	vpsubq		$T0, $ACC6, $ACC6
677
678	vpmuludq	$AND_MASK, $Y, $T0
679	vpaddq		$OVERFLOW, $ACC7, $ACC7
680	vpaddq		$T0, $ACC0, $OVERFLOW
681	vpaddq		$T0, $ACC1, $ACC0
682	.byte		0x67,0x67
683	vpsrlq		$digit_size, $OVERFLOW, $OVERFLOW
684	vpaddq		$T0, $ACC2, $ACC1
685	vpmuludq	32*3(%rax), $Y, $T0
686	vpaddq		$OVERFLOW, $ACC0, $ACC0
687	vpaddq		$T0, $ACC3, $ACC2
688	vmovdqa		$ACC4, $ACC3
689	vpsllq		\$18, $Y, $OVERFLOW
690	vmovdqa		$ACC5, $ACC4
691	vpmuludq	32*7(%rax), $Y, $T0
692	vpaddq		$OVERFLOW, $ACC6, $ACC5
693	vpaddq		$T0, $ACC7, $ACC6
694	vpmuludq	32*8(%rax), $Y, $ACC7
695
696	dec	$itr
697	jnz	.Lavx2_mul_by1_x4_loop
698
699	ret
700.size	avx2_mul_by1_x4,.-avx2_mul_by1_x4
701
702################################################################################
703# void avx2_sqr_x4(void* RESULTx4, void *Ax4, void *Bx4);
704.type	avx2_sqr_x4,\@abi-omnipotent
705.align	32
706avx2_sqr_x4:
707	lea		.LAVX2_POLY(%rip), %rax
708
709	vmovdqa		32*7(%rax), %ymm14
710	vmovdqa		32*8(%rax), %ymm15
711
712	vmovdqa		32*0($a_ptr), $B
713	vmovdqa		32*1($a_ptr), $ACC1
714	vmovdqa		32*2($a_ptr), $ACC2
715	vmovdqa		32*3($a_ptr), $ACC3
716	vmovdqa		32*4($a_ptr), $ACC4
717	vmovdqa		32*5($a_ptr), $ACC5
718	vmovdqa		32*6($a_ptr), $ACC6
719	vmovdqa		32*7($a_ptr), $ACC7
720	vpaddq		$ACC1, $ACC1, $ACC1	# 2*$ACC0..7
721	vmovdqa		32*8($a_ptr), $ACC8
722	vpaddq		$ACC2, $ACC2, $ACC2
723	vmovdqa		$ACC1, 32*0(%rcx)
724	vpaddq		$ACC3, $ACC3, $ACC3
725	vmovdqa		$ACC2, 32*1(%rcx)
726	vpaddq		$ACC4, $ACC4, $ACC4
727	vmovdqa		$ACC3, 32*2(%rcx)
728	vpaddq		$ACC5, $ACC5, $ACC5
729	vmovdqa		$ACC4, 32*3(%rcx)
730	vpaddq		$ACC6, $ACC6, $ACC6
731	vmovdqa		$ACC5, 32*4(%rcx)
732	vpaddq		$ACC7, $ACC7, $ACC7
733	vmovdqa		$ACC6, 32*5(%rcx)
734	vpaddq		$ACC8, $ACC8, $ACC8
735	vmovdqa		$ACC7, 32*6(%rcx)
736	vmovdqa		$ACC8, 32*7(%rcx)
737
738	#itr		1
739	vpmuludq	$B, $B, $ACC0
740	vpmuludq	$B, $ACC1, $ACC1
741	 vpand		$AND_MASK, $ACC0, $Y
742	vpmuludq	$B, $ACC2, $ACC2
743	vpmuludq	$B, $ACC3, $ACC3
744	vpmuludq	$B, $ACC4, $ACC4
745	vpmuludq	$B, $ACC5, $ACC5
746	vpmuludq	$B, $ACC6, $ACC6
747	 vpmuludq	$AND_MASK, $Y, $T0
748	vpmuludq	$B, $ACC7, $ACC7
749	vpmuludq	$B, $ACC8, $ACC8
750	 vmovdqa	32*1($a_ptr), $B
751
752	vpaddq		$T0, $ACC0, $OVERFLOW
753	vpaddq		$T0, $ACC1, $ACC0
754	vpsrlq		$digit_size, $OVERFLOW, $OVERFLOW
755	vpaddq		$T0, $ACC2, $ACC1
756	vpmuludq	32*3(%rax), $Y, $T0
757	vpaddq		$OVERFLOW, $ACC0, $ACC0
758	vpaddq		$T0, $ACC3, $ACC2
759	vmovdqa		$ACC4, $ACC3
760	vpsllq		\$18, $Y, $T0
761	vmovdqa		$ACC5, $ACC4
762	vpmuludq	%ymm14, $Y, $OVERFLOW
763	vpaddq		$T0, $ACC6, $ACC5
764	vpmuludq	%ymm15, $Y, $T0
765	vpaddq		$OVERFLOW, $ACC7, $ACC6
766	vpaddq		$T0, $ACC8, $ACC7
767
768	#itr		2
769	vpmuludq	$B, $B, $OVERFLOW
770	 vpand		$AND_MASK, $ACC0, $Y
771	vpmuludq	32*1(%rcx), $B, $T0
772	vpaddq		$OVERFLOW, $ACC1, $ACC1
773	vpmuludq	32*2(%rcx), $B, $OVERFLOW
774	vpaddq		$T0, $ACC2, $ACC2
775	vpmuludq	32*3(%rcx), $B, $T0
776	vpaddq		$OVERFLOW, $ACC3, $ACC3
777	vpmuludq	32*4(%rcx), $B, $OVERFLOW
778	vpaddq		$T0, $ACC4, $ACC4
779	vpmuludq	32*5(%rcx), $B, $T0
780	vpaddq		$OVERFLOW, $ACC5, $ACC5
781	vpmuludq	32*6(%rcx), $B, $OVERFLOW
782	vpaddq		$T0, $ACC6, $ACC6
783
784	vpmuludq	$AND_MASK, $Y, $T0
785	 vpaddq		$OVERFLOW, $ACC7, $ACC7
786	 vpmuludq	32*7(%rcx), $B, $ACC8
787	 vmovdqa	32*2($a_ptr), $B
788	vpaddq		$T0, $ACC0, $OVERFLOW
789	vpaddq		$T0, $ACC1, $ACC0
790	vpsrlq		$digit_size, $OVERFLOW, $OVERFLOW
791	vpaddq		$T0, $ACC2, $ACC1
792	vpmuludq	32*3(%rax), $Y, $T0
793	vpaddq		$OVERFLOW, $ACC0, $ACC0
794	vpaddq		$T0, $ACC3, $ACC2
795	vmovdqa		$ACC4, $ACC3
796	vpsllq		\$18, $Y, $T0
797	vmovdqa		$ACC5, $ACC4
798	vpmuludq	%ymm14, $Y, $OVERFLOW
799	vpaddq		$T0, $ACC6, $ACC5
800	vpmuludq	%ymm15, $Y, $T0
801	vpaddq		$OVERFLOW, $ACC7, $ACC6
802	vpaddq		$T0, $ACC8, $ACC7
803
804	#itr		3
805	vpmuludq	$B, $B, $T0
806	 vpand		$AND_MASK, $ACC0, $Y
807	vpmuludq	32*2(%rcx), $B, $OVERFLOW
808	vpaddq		$T0, $ACC2, $ACC2
809	vpmuludq	32*3(%rcx), $B, $T0
810	vpaddq		$OVERFLOW, $ACC3, $ACC3
811	vpmuludq	32*4(%rcx), $B, $OVERFLOW
812	vpaddq		$T0, $ACC4, $ACC4
813	vpmuludq	32*5(%rcx), $B, $T0
814	vpaddq		$OVERFLOW, $ACC5, $ACC5
815	vpmuludq	32*6(%rcx), $B, $OVERFLOW
816	vpaddq		$T0, $ACC6, $ACC6
817
818	vpmuludq	$AND_MASK, $Y, $T0
819	 vpaddq		$OVERFLOW, $ACC7, $ACC7
820	 vpmuludq	32*7(%rcx), $B, $ACC8
821	 vmovdqa	32*3($a_ptr), $B
822	vpaddq		$T0, $ACC0, $OVERFLOW
823	vpaddq		$T0, $ACC1, $ACC0
824	vpsrlq		$digit_size, $OVERFLOW, $OVERFLOW
825	vpaddq		$T0, $ACC2, $ACC1
826	vpmuludq	32*3(%rax), $Y, $T0
827	vpaddq		$OVERFLOW, $ACC0, $ACC0
828	vpaddq		$T0, $ACC3, $ACC2
829	vmovdqa		$ACC4, $ACC3
830	vpsllq		\$18, $Y, $T0
831	vmovdqa		$ACC5, $ACC4
832	vpmuludq	%ymm14, $Y, $OVERFLOW
833	vpaddq		$T0, $ACC6, $ACC5
834	vpmuludq	%ymm15, $Y, $T0
835	 vpand		$AND_MASK, $ACC0, $Y
836	vpaddq		$OVERFLOW, $ACC7, $ACC6
837	vpaddq		$T0, $ACC8, $ACC7
838
839	#itr		4
840	vpmuludq	$B, $B, $OVERFLOW
841	vpmuludq	32*3(%rcx), $B, $T0
842	vpaddq		$OVERFLOW, $ACC3, $ACC3
843	vpmuludq	32*4(%rcx), $B, $OVERFLOW
844	vpaddq		$T0, $ACC4, $ACC4
845	vpmuludq	32*5(%rcx), $B, $T0
846	vpaddq		$OVERFLOW, $ACC5, $ACC5
847	vpmuludq	32*6(%rcx), $B, $OVERFLOW
848	vpaddq		$T0, $ACC6, $ACC6
849
850	vpmuludq	$AND_MASK, $Y, $T0
851	 vpaddq		$OVERFLOW, $ACC7, $ACC7
852	 vpmuludq	32*7(%rcx), $B, $ACC8
853	 vmovdqa	32*4($a_ptr), $B
854	vpaddq		$T0, $ACC0, $OVERFLOW
855	vpaddq		$T0, $ACC1, $ACC0
856	vpsrlq		$digit_size, $OVERFLOW, $OVERFLOW
857	vpaddq		$T0, $ACC2, $ACC1
858	vpmuludq	32*3(%rax), $Y, $T0
859	vpaddq		$OVERFLOW, $ACC0, $ACC0
860	vpaddq		$T0, $ACC3, $ACC2
861	vmovdqa		$ACC4, $ACC3
862	vpsllq		\$18, $Y, $T0
863	vmovdqa		$ACC5, $ACC4
864	vpmuludq	%ymm14, $Y, $OVERFLOW
865	vpaddq		$T0, $ACC6, $ACC5
866	vpmuludq	%ymm15, $Y, $T0
867	 vpand		$AND_MASK, $ACC0, $Y
868	vpaddq		$OVERFLOW, $ACC7, $ACC6
869	vpaddq		$T0, $ACC8, $ACC7
870
871	#itr		5
872	vpmuludq	$B, $B, $T0
873	vpmuludq	32*4(%rcx), $B, $OVERFLOW
874	vpaddq		$T0, $ACC4, $ACC4
875	vpmuludq	32*5(%rcx), $B, $T0
876	vpaddq		$OVERFLOW, $ACC5, $ACC5
877	vpmuludq	32*6(%rcx), $B, $OVERFLOW
878	vpaddq		$T0, $ACC6, $ACC6
879
880	vpmuludq	$AND_MASK, $Y, $T0
881	 vpaddq		$OVERFLOW, $ACC7, $ACC7
882	 vpmuludq	32*7(%rcx), $B, $ACC8
883	 vmovdqa	32*5($a_ptr), $B
884	vpaddq		$T0, $ACC0, $OVERFLOW
885	vpsrlq		$digit_size, $OVERFLOW, $OVERFLOW
886	vpaddq		$T0, $ACC1, $ACC0
887	vpaddq		$T0, $ACC2, $ACC1
888	vpmuludq	32*3+.LAVX2_POLY(%rip), $Y, $T0
889	vpaddq		$OVERFLOW, $ACC0, $ACC0
890	vpaddq		$T0, $ACC3, $ACC2
891	vmovdqa		$ACC4, $ACC3
892	vpsllq		\$18, $Y, $T0
893	vmovdqa		$ACC5, $ACC4
894	vpmuludq	%ymm14, $Y, $OVERFLOW
895	vpaddq		$T0, $ACC6, $ACC5
896	vpmuludq	%ymm15, $Y, $T0
897	 vpand		$AND_MASK, $ACC0, $Y
898	vpaddq		$OVERFLOW, $ACC7, $ACC6
899	vpaddq		$T0, $ACC8, $ACC7
900
901	#itr		6
902	vpmuludq	$B, $B, $OVERFLOW
903	vpmuludq	32*5(%rcx), $B, $T0
904	vpaddq		$OVERFLOW, $ACC5, $ACC5
905	vpmuludq	32*6(%rcx), $B, $OVERFLOW
906	vpaddq		$T0, $ACC6, $ACC6
907
908	vpmuludq	$AND_MASK, $Y, $T0
909	 vpaddq		$OVERFLOW, $ACC7, $ACC7
910	 vpmuludq	32*7(%rcx), $B, $ACC8
911	 vmovdqa	32*6($a_ptr), $B
912	vpaddq		$T0, $ACC0, $OVERFLOW
913	vpaddq		$T0, $ACC1, $ACC0
914	vpsrlq		$digit_size, $OVERFLOW, $OVERFLOW
915	vpaddq		$T0, $ACC2, $ACC1
916	vpmuludq	32*3(%rax), $Y, $T0
917	vpaddq		$OVERFLOW, $ACC0, $ACC0
918	vpaddq		$T0, $ACC3, $ACC2
919	vmovdqa		$ACC4, $ACC3
920	vpsllq		\$18, $Y, $T0
921	vmovdqa		$ACC5, $ACC4
922	vpmuludq	%ymm14, $Y, $OVERFLOW
923	vpaddq		$T0, $ACC6, $ACC5
924	vpmuludq	%ymm15, $Y, $T0
925	 vpand		$AND_MASK, $ACC0, $Y
926	vpaddq		$OVERFLOW, $ACC7, $ACC6
927	vpaddq		$T0, $ACC8, $ACC7
928
929	#itr		7
930	vpmuludq	$B, $B, $T0
931	vpmuludq	32*6(%rcx), $B, $OVERFLOW
932	vpaddq		$T0, $ACC6, $ACC6
933
934	vpmuludq	$AND_MASK, $Y, $T0
935	 vpaddq		$OVERFLOW, $ACC7, $ACC7
936	 vpmuludq	32*7(%rcx), $B, $ACC8
937	 vmovdqa	32*7($a_ptr), $B
938	vpaddq		$T0, $ACC0, $OVERFLOW
939	vpsrlq		$digit_size, $OVERFLOW, $OVERFLOW
940	vpaddq		$T0, $ACC1, $ACC0
941	vpaddq		$T0, $ACC2, $ACC1
942	vpmuludq	32*3(%rax), $Y, $T0
943	vpaddq		$OVERFLOW, $ACC0, $ACC0
944	vpaddq		$T0, $ACC3, $ACC2
945	vmovdqa		$ACC4, $ACC3
946	vpsllq		\$18, $Y, $T0
947	vmovdqa		$ACC5, $ACC4
948	vpmuludq	%ymm14, $Y, $OVERFLOW
949	vpaddq		$T0, $ACC6, $ACC5
950	vpmuludq	%ymm15, $Y, $T0
951	 vpand		$AND_MASK, $ACC0, $Y
952	vpaddq		$OVERFLOW, $ACC7, $ACC6
953	vpaddq		$T0, $ACC8, $ACC7
954
955	#itr		8
956	vpmuludq	$B, $B, $OVERFLOW
957
958	vpmuludq	$AND_MASK, $Y, $T0
959	 vpaddq		$OVERFLOW, $ACC7, $ACC7
960	 vpmuludq	32*7(%rcx), $B, $ACC8
961	 vmovdqa	32*8($a_ptr), $B
962	vpaddq		$T0, $ACC0, $OVERFLOW
963	vpsrlq		$digit_size, $OVERFLOW, $OVERFLOW
964	vpaddq		$T0, $ACC1, $ACC0
965	vpaddq		$T0, $ACC2, $ACC1
966	vpmuludq	32*3(%rax), $Y, $T0
967	vpaddq		$OVERFLOW, $ACC0, $ACC0
968	vpaddq		$T0, $ACC3, $ACC2
969	vmovdqa		$ACC4, $ACC3
970	vpsllq		\$18, $Y, $T0
971	vmovdqa		$ACC5, $ACC4
972	vpmuludq	%ymm14, $Y, $OVERFLOW
973	vpaddq		$T0, $ACC6, $ACC5
974	vpmuludq	%ymm15, $Y, $T0
975	 vpand		$AND_MASK, $ACC0, $Y
976	vpaddq		$OVERFLOW, $ACC7, $ACC6
977	vpaddq		$T0, $ACC8, $ACC7
978
979	#itr		9
980	vpmuludq	$B, $B, $ACC8
981
982	vpmuludq	$AND_MASK, $Y, $T0
983	vpaddq		$T0, $ACC0, $OVERFLOW
984	vpsrlq		$digit_size, $OVERFLOW, $OVERFLOW
985	vpaddq		$T0, $ACC1, $ACC0
986	vpaddq		$T0, $ACC2, $ACC1
987	vpmuludq	32*3(%rax), $Y, $T0
988	vpaddq		$OVERFLOW, $ACC0, $ACC0
989	vpaddq		$T0, $ACC3, $ACC2
990	vmovdqa		$ACC4, $ACC3
991	vpsllq		\$18, $Y, $T0
992	vmovdqa		$ACC5, $ACC4
993	vpmuludq	%ymm14, $Y, $OVERFLOW
994	vpaddq		$T0, $ACC6, $ACC5
995	vpmuludq	%ymm15, $Y, $T0
996	vpaddq		$OVERFLOW, $ACC7, $ACC6
997	vpaddq		$T0, $ACC8, $ACC7
998
999	vpxor		$ACC8, $ACC8, $ACC8
1000
1001	ret
1002.size	avx2_sqr_x4,.-avx2_sqr_x4
1003
1004################################################################################
1005# void avx2_sub_x4(void* RESULTx4, void *Ax4, void *Bx4);
1006.type	avx2_sub_x4,\@abi-omnipotent
1007.align	32
1008avx2_sub_x4:
1009	vmovdqa	32*0($a_ptr), $ACC0
1010	lea	160($a_ptr), $a_ptr
1011	lea	.LAVX2_POLY_x8+128(%rip), %rax
1012	lea	128($b_ptr), $b_ptr
1013	vmovdqa	32*1-160($a_ptr), $ACC1
1014	vmovdqa	32*2-160($a_ptr), $ACC2
1015	vmovdqa	32*3-160($a_ptr), $ACC3
1016	vmovdqa	32*4-160($a_ptr), $ACC4
1017	vmovdqa	32*5-160($a_ptr), $ACC5
1018	vmovdqa	32*6-160($a_ptr), $ACC6
1019	vmovdqa	32*7-160($a_ptr), $ACC7
1020	vmovdqa	32*8-160($a_ptr), $ACC8
1021
1022	vpaddq	32*0-128(%rax), $ACC0, $ACC0
1023	vpaddq	32*1-128(%rax), $ACC1, $ACC1
1024	vpaddq	32*2-128(%rax), $ACC2, $ACC2
1025	vpaddq	32*3-128(%rax), $ACC3, $ACC3
1026	vpaddq	32*4-128(%rax), $ACC4, $ACC4
1027	vpaddq	32*5-128(%rax), $ACC5, $ACC5
1028	vpaddq	32*6-128(%rax), $ACC6, $ACC6
1029	vpaddq	32*7-128(%rax), $ACC7, $ACC7
1030	vpaddq	32*8-128(%rax), $ACC8, $ACC8
1031
1032	vpsubq	32*0-128($b_ptr), $ACC0, $ACC0
1033	vpsubq	32*1-128($b_ptr), $ACC1, $ACC1
1034	vpsubq	32*2-128($b_ptr), $ACC2, $ACC2
1035	vpsubq	32*3-128($b_ptr), $ACC3, $ACC3
1036	vpsubq	32*4-128($b_ptr), $ACC4, $ACC4
1037	vpsubq	32*5-128($b_ptr), $ACC5, $ACC5
1038	vpsubq	32*6-128($b_ptr), $ACC6, $ACC6
1039	vpsubq	32*7-128($b_ptr), $ACC7, $ACC7
1040	vpsubq	32*8-128($b_ptr), $ACC8, $ACC8
1041
1042	ret
1043.size	avx2_sub_x4,.-avx2_sub_x4
1044
1045.type	avx2_select_n_store,\@abi-omnipotent
1046.align	32
1047avx2_select_n_store:
1048	vmovdqa	`8+32*9*8`(%rsp), $Y
1049	vpor	`8+32*9*8+32`(%rsp), $Y, $Y
1050
1051	vpandn	$ACC0, $Y, $ACC0
1052	vpandn	$ACC1, $Y, $ACC1
1053	vpandn	$ACC2, $Y, $ACC2
1054	vpandn	$ACC3, $Y, $ACC3
1055	vpandn	$ACC4, $Y, $ACC4
1056	vpandn	$ACC5, $Y, $ACC5
1057	vpandn	$ACC6, $Y, $ACC6
1058	vmovdqa	`8+32*9*8+32`(%rsp), $B
1059	vpandn	$ACC7, $Y, $ACC7
1060	vpandn	`8+32*9*8`(%rsp), $B, $B
1061	vpandn	$ACC8, $Y, $ACC8
1062
1063	vpand	32*0(%rsi), $B, $T0
1064	lea	160(%rsi), %rax
1065	vpand	32*1(%rsi), $B, $Y
1066	vpxor	$T0, $ACC0, $ACC0
1067	vpand	32*2(%rsi), $B, $T0
1068	vpxor	$Y, $ACC1, $ACC1
1069	vpand	32*3(%rsi), $B, $Y
1070	vpxor	$T0, $ACC2, $ACC2
1071	vpand	32*4-160(%rax), $B, $T0
1072	vpxor	$Y, $ACC3, $ACC3
1073	vpand	32*5-160(%rax), $B, $Y
1074	vpxor	$T0, $ACC4, $ACC4
1075	vpand	32*6-160(%rax), $B, $T0
1076	vpxor	$Y, $ACC5, $ACC5
1077	vpand	32*7-160(%rax), $B, $Y
1078	vpxor	$T0, $ACC6, $ACC6
1079	vpand	32*8-160(%rax), $B, $T0
1080	vmovdqa	`8+32*9*8+32`(%rsp), $B
1081	vpxor	$Y, $ACC7, $ACC7
1082
1083	vpand	32*0(%rdx), $B, $Y
1084	lea	160(%rdx), %rax
1085	vpxor	$T0, $ACC8, $ACC8
1086	vpand	32*1(%rdx), $B, $T0
1087	vpxor	$Y, $ACC0, $ACC0
1088	vpand	32*2(%rdx), $B, $Y
1089	vpxor	$T0, $ACC1, $ACC1
1090	vpand	32*3(%rdx), $B, $T0
1091	vpxor	$Y, $ACC2, $ACC2
1092	vpand	32*4-160(%rax), $B, $Y
1093	vpxor	$T0, $ACC3, $ACC3
1094	vpand	32*5-160(%rax), $B, $T0
1095	vpxor	$Y, $ACC4, $ACC4
1096	vpand	32*6-160(%rax), $B, $Y
1097	vpxor	$T0, $ACC5, $ACC5
1098	vpand	32*7-160(%rax), $B, $T0
1099	vpxor	$Y, $ACC6, $ACC6
1100	vpand	32*8-160(%rax), $B, $Y
1101	vpxor	$T0, $ACC7, $ACC7
1102	vpxor	$Y, $ACC8, $ACC8
1103	`&STORE`
1104
1105	ret
1106.size	avx2_select_n_store,.-avx2_select_n_store
1107___
1108$code.=<<___	if (0);				# inlined
1109################################################################################
1110# void avx2_mul_by2_x4(void* RESULTx4, void *Ax4);
1111.type	avx2_mul_by2_x4,\@abi-omnipotent
1112.align	32
1113avx2_mul_by2_x4:
1114	vmovdqa	32*0($a_ptr), $ACC0
1115	lea	160($a_ptr), %rax
1116	vmovdqa	32*1($a_ptr), $ACC1
1117	vmovdqa	32*2($a_ptr), $ACC2
1118	vmovdqa	32*3($a_ptr), $ACC3
1119	vmovdqa	32*4-160(%rax), $ACC4
1120	vmovdqa	32*5-160(%rax), $ACC5
1121	vmovdqa	32*6-160(%rax), $ACC6
1122	vmovdqa	32*7-160(%rax), $ACC7
1123	vmovdqa	32*8-160(%rax), $ACC8
1124
1125	vpaddq	$ACC0, $ACC0, $ACC0
1126	vpaddq	$ACC1, $ACC1, $ACC1
1127	vpaddq	$ACC2, $ACC2, $ACC2
1128	vpaddq	$ACC3, $ACC3, $ACC3
1129	vpaddq	$ACC4, $ACC4, $ACC4
1130	vpaddq	$ACC5, $ACC5, $ACC5
1131	vpaddq	$ACC6, $ACC6, $ACC6
1132	vpaddq	$ACC7, $ACC7, $ACC7
1133	vpaddq	$ACC8, $ACC8, $ACC8
1134
1135	ret
1136.size	avx2_mul_by2_x4,.-avx2_mul_by2_x4
1137___
1138my ($r_ptr_in,$a_ptr_in,$b_ptr_in)=("%rdi","%rsi","%rdx");
1139my ($r_ptr,$a_ptr,$b_ptr)=("%r8","%r9","%r10");
1140
1141$code.=<<___;
1142################################################################################
1143# void ecp_nistz256_avx2_point_add_affine_x4(void* RESULTx4, void *Ax4, void *Bx4);
1144.globl	ecp_nistz256_avx2_point_add_affine_x4
1145.type	ecp_nistz256_avx2_point_add_affine_x4,\@function,3
1146.align	32
1147ecp_nistz256_avx2_point_add_affine_x4:
1148	mov	%rsp, %rax
1149	push    %rbp
1150	vzeroupper
1151___
1152$code.=<<___	if ($win64);
1153	lea	-16*10(%rsp), %rsp
1154	vmovaps	%xmm6, -8-16*10(%rax)
1155	vmovaps	%xmm7, -8-16*9(%rax)
1156	vmovaps	%xmm8, -8-16*8(%rax)
1157	vmovaps	%xmm9, -8-16*7(%rax)
1158	vmovaps	%xmm10, -8-16*6(%rax)
1159	vmovaps	%xmm11, -8-16*5(%rax)
1160	vmovaps	%xmm12, -8-16*4(%rax)
1161	vmovaps	%xmm13, -8-16*3(%rax)
1162	vmovaps	%xmm14, -8-16*2(%rax)
1163	vmovaps	%xmm15, -8-16*1(%rax)
1164___
1165$code.=<<___;
1166	lea	-8(%rax), %rbp
1167
1168# Result + 32*0 = Result.X
1169# Result + 32*9 = Result.Y
1170# Result + 32*18 = Result.Z
1171
1172# A + 32*0 = A.X
1173# A + 32*9 = A.Y
1174# A + 32*18 = A.Z
1175
1176# B + 32*0 = B.X
1177# B + 32*9 = B.Y
1178
1179	sub	\$`32*9*8+32*2+32*8`, %rsp
1180	and	\$-64, %rsp
1181
1182	mov	$r_ptr_in, $r_ptr
1183	mov	$a_ptr_in, $a_ptr
1184	mov	$b_ptr_in, $b_ptr
1185
1186	vmovdqa	32*0($a_ptr_in), %ymm0
1187	vmovdqa	.LAVX2_AND_MASK(%rip), $AND_MASK
1188	vpxor	%ymm1, %ymm1, %ymm1
1189	lea	256($a_ptr_in), %rax		# size optimization
1190	vpor	32*1($a_ptr_in), %ymm0, %ymm0
1191	vpor	32*2($a_ptr_in), %ymm0, %ymm0
1192	vpor	32*3($a_ptr_in), %ymm0, %ymm0
1193	vpor	32*4-256(%rax), %ymm0, %ymm0
1194	lea	256(%rax), %rcx			# size optimization
1195	vpor	32*5-256(%rax), %ymm0, %ymm0
1196	vpor	32*6-256(%rax), %ymm0, %ymm0
1197	vpor	32*7-256(%rax), %ymm0, %ymm0
1198	vpor	32*8-256(%rax), %ymm0, %ymm0
1199	vpor	32*9-256(%rax), %ymm0, %ymm0
1200	vpor	32*10-256(%rax), %ymm0, %ymm0
1201	vpor	32*11-256(%rax), %ymm0, %ymm0
1202	vpor	32*12-512(%rcx), %ymm0, %ymm0
1203	vpor	32*13-512(%rcx), %ymm0, %ymm0
1204	vpor	32*14-512(%rcx), %ymm0, %ymm0
1205	vpor	32*15-512(%rcx), %ymm0, %ymm0
1206	vpor	32*16-512(%rcx), %ymm0, %ymm0
1207	vpor	32*17-512(%rcx), %ymm0, %ymm0
1208	vpcmpeqq %ymm1, %ymm0, %ymm0
1209	vmovdqa	%ymm0, `32*9*8`(%rsp)
1210
1211	vpxor	%ymm1, %ymm1, %ymm1
1212	vmovdqa	32*0($b_ptr), %ymm0
1213	lea	256($b_ptr), %rax		# size optimization
1214	vpor	32*1($b_ptr), %ymm0, %ymm0
1215	vpor	32*2($b_ptr), %ymm0, %ymm0
1216	vpor	32*3($b_ptr), %ymm0, %ymm0
1217	vpor	32*4-256(%rax), %ymm0, %ymm0
1218	lea	256(%rax), %rcx			# size optimization
1219	vpor	32*5-256(%rax), %ymm0, %ymm0
1220	vpor	32*6-256(%rax), %ymm0, %ymm0
1221	vpor	32*7-256(%rax), %ymm0, %ymm0
1222	vpor	32*8-256(%rax), %ymm0, %ymm0
1223	vpor	32*9-256(%rax), %ymm0, %ymm0
1224	vpor	32*10-256(%rax), %ymm0, %ymm0
1225	vpor	32*11-256(%rax), %ymm0, %ymm0
1226	vpor	32*12-512(%rcx), %ymm0, %ymm0
1227	vpor	32*13-512(%rcx), %ymm0, %ymm0
1228	vpor	32*14-512(%rcx), %ymm0, %ymm0
1229	vpor	32*15-512(%rcx), %ymm0, %ymm0
1230	vpor	32*16-512(%rcx), %ymm0, %ymm0
1231	vpor	32*17-512(%rcx), %ymm0, %ymm0
1232	vpcmpeqq %ymm1, %ymm0, %ymm0
1233	vmovdqa	%ymm0, `32*9*8+32`(%rsp)
1234
1235	#	Z1^2 = Z1*Z1
1236	lea	`32*9*2`($a_ptr), %rsi
1237	lea	`32*9*2`(%rsp), %rdi
1238	lea	`32*9*8+32*2`(%rsp), %rcx	# temporary vector
1239	call	avx2_sqr_x4
1240	call	avx2_normalize_n_store
1241
1242	#	U2 = X2*Z1^2
1243	lea	`32*9*0`($b_ptr), %rsi
1244	lea	`32*9*2`(%rsp), %rdx
1245	lea	`32*9*0`(%rsp), %rdi
1246	call	avx2_mul_x4
1247	#call	avx2_normalize
1248	`&STORE`
1249
1250	#	S2 = Z1*Z1^2 = Z1^3
1251	lea	`32*9*2`($a_ptr), %rsi
1252	lea	`32*9*2`(%rsp), %rdx
1253	lea	`32*9*1`(%rsp), %rdi
1254	call	avx2_mul_x4
1255	call	avx2_normalize_n_store
1256
1257	#	S2 = S2*Y2 = Y2*Z1^3
1258	lea	`32*9*1`($b_ptr), %rsi
1259	lea	`32*9*1`(%rsp), %rdx
1260	lea	`32*9*1`(%rsp), %rdi
1261	call	avx2_mul_x4
1262	call	avx2_normalize_n_store
1263
1264	#	H = U2 - U1 = U2 - X1
1265	lea	`32*9*0`(%rsp), %rsi
1266	lea	`32*9*0`($a_ptr), %rdx
1267	lea	`32*9*3`(%rsp), %rdi
1268	call	avx2_sub_x4
1269	call	avx2_normalize_n_store
1270
1271	#	R = S2 - S1 = S2 - Y1
1272	lea	`32*9*1`(%rsp), %rsi
1273	lea	`32*9*1`($a_ptr), %rdx
1274	lea	`32*9*4`(%rsp), %rdi
1275	call	avx2_sub_x4
1276	call	avx2_normalize_n_store
1277
1278	#	Z3 = H*Z1*Z2
1279	lea	`32*9*3`(%rsp), %rsi
1280	lea	`32*9*2`($a_ptr), %rdx
1281	lea	`32*9*2`($r_ptr), %rdi
1282	call	avx2_mul_x4
1283	call	avx2_normalize
1284
1285	lea	.LONE(%rip), %rsi
1286	lea	`32*9*2`($a_ptr), %rdx
1287	call	avx2_select_n_store
1288
1289	#	R^2 = R^2
1290	lea	`32*9*4`(%rsp), %rsi
1291	lea	`32*9*6`(%rsp), %rdi
1292	lea	`32*9*8+32*2`(%rsp), %rcx	# temporary vector
1293	call	avx2_sqr_x4
1294	call	avx2_normalize_n_store
1295
1296	#	H^2 = H^2
1297	lea	`32*9*3`(%rsp), %rsi
1298	lea	`32*9*5`(%rsp), %rdi
1299	call	avx2_sqr_x4
1300	call	avx2_normalize_n_store
1301
1302	#	H^3 = H^2*H
1303	lea	`32*9*3`(%rsp), %rsi
1304	lea	`32*9*5`(%rsp), %rdx
1305	lea	`32*9*7`(%rsp), %rdi
1306	call	avx2_mul_x4
1307	call	avx2_normalize_n_store
1308
1309	#	U2 = U1*H^2
1310	lea	`32*9*0`($a_ptr), %rsi
1311	lea	`32*9*5`(%rsp), %rdx
1312	lea	`32*9*0`(%rsp), %rdi
1313	call	avx2_mul_x4
1314	#call	avx2_normalize
1315	`&STORE`
1316
1317	#	Hsqr = U2*2
1318	#lea	32*9*0(%rsp), %rsi
1319	#lea	32*9*5(%rsp), %rdi
1320	#call	avx2_mul_by2_x4
1321
1322	vpaddq	$ACC0, $ACC0, $ACC0	# inlined avx2_mul_by2_x4
1323	lea	`32*9*5`(%rsp), %rdi
1324	vpaddq	$ACC1, $ACC1, $ACC1
1325	vpaddq	$ACC2, $ACC2, $ACC2
1326	vpaddq	$ACC3, $ACC3, $ACC3
1327	vpaddq	$ACC4, $ACC4, $ACC4
1328	vpaddq	$ACC5, $ACC5, $ACC5
1329	vpaddq	$ACC6, $ACC6, $ACC6
1330	vpaddq	$ACC7, $ACC7, $ACC7
1331	vpaddq	$ACC8, $ACC8, $ACC8
1332	call	avx2_normalize_n_store
1333
1334	#	X3 = R^2 - H^3
1335	#lea	32*9*6(%rsp), %rsi
1336	#lea	32*9*7(%rsp), %rdx
1337	#lea	32*9*5(%rsp), %rcx
1338	#lea	32*9*0($r_ptr), %rdi
1339	#call	avx2_sub_x4
1340	#NORMALIZE
1341	#STORE
1342
1343	#	X3 = X3 - U2*2
1344	#lea	32*9*0($r_ptr), %rsi
1345	#lea	32*9*0($r_ptr), %rdi
1346	#call	avx2_sub_x4
1347	#NORMALIZE
1348	#STORE
1349
1350	lea	`32*9*6+128`(%rsp), %rsi
1351	lea	.LAVX2_POLY_x2+128(%rip), %rax
1352	lea	`32*9*7+128`(%rsp), %rdx
1353	lea	`32*9*5+128`(%rsp), %rcx
1354	lea	`32*9*0`($r_ptr), %rdi
1355
1356	vmovdqa	32*0-128(%rsi), $ACC0
1357	vmovdqa	32*1-128(%rsi), $ACC1
1358	vmovdqa	32*2-128(%rsi), $ACC2
1359	vmovdqa	32*3-128(%rsi), $ACC3
1360	vmovdqa	32*4-128(%rsi), $ACC4
1361	vmovdqa	32*5-128(%rsi), $ACC5
1362	vmovdqa	32*6-128(%rsi), $ACC6
1363	vmovdqa	32*7-128(%rsi), $ACC7
1364	vmovdqa	32*8-128(%rsi), $ACC8
1365
1366	vpaddq	32*0-128(%rax), $ACC0, $ACC0
1367	vpaddq	32*1-128(%rax), $ACC1, $ACC1
1368	vpaddq	32*2-128(%rax), $ACC2, $ACC2
1369	vpaddq	32*3-128(%rax), $ACC3, $ACC3
1370	vpaddq	32*4-128(%rax), $ACC4, $ACC4
1371	vpaddq	32*5-128(%rax), $ACC5, $ACC5
1372	vpaddq	32*6-128(%rax), $ACC6, $ACC6
1373	vpaddq	32*7-128(%rax), $ACC7, $ACC7
1374	vpaddq	32*8-128(%rax), $ACC8, $ACC8
1375
1376	vpsubq	32*0-128(%rdx), $ACC0, $ACC0
1377	vpsubq	32*1-128(%rdx), $ACC1, $ACC1
1378	vpsubq	32*2-128(%rdx), $ACC2, $ACC2
1379	vpsubq	32*3-128(%rdx), $ACC3, $ACC3
1380	vpsubq	32*4-128(%rdx), $ACC4, $ACC4
1381	vpsubq	32*5-128(%rdx), $ACC5, $ACC5
1382	vpsubq	32*6-128(%rdx), $ACC6, $ACC6
1383	vpsubq	32*7-128(%rdx), $ACC7, $ACC7
1384	vpsubq	32*8-128(%rdx), $ACC8, $ACC8
1385
1386	vpsubq	32*0-128(%rcx), $ACC0, $ACC0
1387	vpsubq	32*1-128(%rcx), $ACC1, $ACC1
1388	vpsubq	32*2-128(%rcx), $ACC2, $ACC2
1389	vpsubq	32*3-128(%rcx), $ACC3, $ACC3
1390	vpsubq	32*4-128(%rcx), $ACC4, $ACC4
1391	vpsubq	32*5-128(%rcx), $ACC5, $ACC5
1392	vpsubq	32*6-128(%rcx), $ACC6, $ACC6
1393	vpsubq	32*7-128(%rcx), $ACC7, $ACC7
1394	vpsubq	32*8-128(%rcx), $ACC8, $ACC8
1395	call	avx2_normalize
1396
1397	lea	32*0($b_ptr), %rsi
1398	lea	32*0($a_ptr), %rdx
1399	call	avx2_select_n_store
1400
1401	#	H = U2 - X3
1402	lea	`32*9*0`(%rsp), %rsi
1403	lea	`32*9*0`($r_ptr), %rdx
1404	lea	`32*9*3`(%rsp), %rdi
1405	call	avx2_sub_x4
1406	call	avx2_normalize_n_store
1407
1408	#
1409	lea	`32*9*3`(%rsp), %rsi
1410	lea	`32*9*4`(%rsp), %rdx
1411	lea	`32*9*3`(%rsp), %rdi
1412	call	avx2_mul_x4
1413	call	avx2_normalize_n_store
1414
1415	#
1416	lea	`32*9*7`(%rsp), %rsi
1417	lea	`32*9*1`($a_ptr), %rdx
1418	lea	`32*9*1`(%rsp), %rdi
1419	call	avx2_mul_x4
1420	call	avx2_normalize_n_store
1421
1422	#
1423	lea	`32*9*3`(%rsp), %rsi
1424	lea	`32*9*1`(%rsp), %rdx
1425	lea	`32*9*1`($r_ptr), %rdi
1426	call	avx2_sub_x4
1427	call	avx2_normalize
1428
1429	lea	32*9($b_ptr), %rsi
1430	lea	32*9($a_ptr), %rdx
1431	call	avx2_select_n_store
1432
1433	#lea	32*9*0($r_ptr), %rsi
1434	#lea	32*9*0($r_ptr), %rdi
1435	#call	avx2_mul_by1_x4
1436	#NORMALIZE
1437	#STORE
1438
1439	lea	`32*9*1`($r_ptr), %rsi
1440	lea	`32*9*1`($r_ptr), %rdi
1441	call	avx2_mul_by1_x4
1442	call	avx2_normalize_n_store
1443
1444	vzeroupper
1445___
1446$code.=<<___	if ($win64);
1447	movaps	%xmm6, -16*10(%rbp)
1448	movaps	%xmm7, -16*9(%rbp)
1449	movaps	%xmm8, -16*8(%rbp)
1450	movaps	%xmm9, -16*7(%rbp)
1451	movaps	%xmm10, -16*6(%rbp)
1452	movaps	%xmm11, -16*5(%rbp)
1453	movaps	%xmm12, -16*4(%rbp)
1454	movaps	%xmm13, -16*3(%rbp)
1455	movaps	%xmm14, -16*2(%rbp)
1456	movaps	%xmm15, -16*1(%rbp)
1457___
1458$code.=<<___;
1459	mov	%rbp, %rsp
1460	pop	%rbp
1461	ret
1462.size	ecp_nistz256_avx2_point_add_affine_x4,.-ecp_nistz256_avx2_point_add_affine_x4
1463
1464################################################################################
1465# void ecp_nistz256_avx2_point_add_affines_x4(void* RESULTx4, void *Ax4, void *Bx4);
1466.globl	ecp_nistz256_avx2_point_add_affines_x4
1467.type	ecp_nistz256_avx2_point_add_affines_x4,\@function,3
1468.align	32
1469ecp_nistz256_avx2_point_add_affines_x4:
1470	mov	%rsp, %rax
1471	push    %rbp
1472	vzeroupper
1473___
1474$code.=<<___	if ($win64);
1475	lea	-16*10(%rsp), %rsp
1476	vmovaps	%xmm6, -8-16*10(%rax)
1477	vmovaps	%xmm7, -8-16*9(%rax)
1478	vmovaps	%xmm8, -8-16*8(%rax)
1479	vmovaps	%xmm9, -8-16*7(%rax)
1480	vmovaps	%xmm10, -8-16*6(%rax)
1481	vmovaps	%xmm11, -8-16*5(%rax)
1482	vmovaps	%xmm12, -8-16*4(%rax)
1483	vmovaps	%xmm13, -8-16*3(%rax)
1484	vmovaps	%xmm14, -8-16*2(%rax)
1485	vmovaps	%xmm15, -8-16*1(%rax)
1486___
1487$code.=<<___;
1488	lea	-8(%rax), %rbp
1489
1490# Result + 32*0 = Result.X
1491# Result + 32*9 = Result.Y
1492# Result + 32*18 = Result.Z
1493
1494# A + 32*0 = A.X
1495# A + 32*9 = A.Y
1496
1497# B + 32*0 = B.X
1498# B + 32*9 = B.Y
1499
1500	sub	\$`32*9*8+32*2+32*8`, %rsp
1501	and	\$-64, %rsp
1502
1503	mov	$r_ptr_in, $r_ptr
1504	mov	$a_ptr_in, $a_ptr
1505	mov	$b_ptr_in, $b_ptr
1506
1507	vmovdqa	32*0($a_ptr_in), %ymm0
1508	vmovdqa	.LAVX2_AND_MASK(%rip), $AND_MASK
1509	vpxor	%ymm1, %ymm1, %ymm1
1510	lea	256($a_ptr_in), %rax		# size optimization
1511	vpor	32*1($a_ptr_in), %ymm0, %ymm0
1512	vpor	32*2($a_ptr_in), %ymm0, %ymm0
1513	vpor	32*3($a_ptr_in), %ymm0, %ymm0
1514	vpor	32*4-256(%rax), %ymm0, %ymm0
1515	lea	256(%rax), %rcx			# size optimization
1516	vpor	32*5-256(%rax), %ymm0, %ymm0
1517	vpor	32*6-256(%rax), %ymm0, %ymm0
1518	vpor	32*7-256(%rax), %ymm0, %ymm0
1519	vpor	32*8-256(%rax), %ymm0, %ymm0
1520	vpor	32*9-256(%rax), %ymm0, %ymm0
1521	vpor	32*10-256(%rax), %ymm0, %ymm0
1522	vpor	32*11-256(%rax), %ymm0, %ymm0
1523	vpor	32*12-512(%rcx), %ymm0, %ymm0
1524	vpor	32*13-512(%rcx), %ymm0, %ymm0
1525	vpor	32*14-512(%rcx), %ymm0, %ymm0
1526	vpor	32*15-512(%rcx), %ymm0, %ymm0
1527	vpor	32*16-512(%rcx), %ymm0, %ymm0
1528	vpor	32*17-512(%rcx), %ymm0, %ymm0
1529	vpcmpeqq %ymm1, %ymm0, %ymm0
1530	vmovdqa	%ymm0, `32*9*8`(%rsp)
1531
1532	vpxor	%ymm1, %ymm1, %ymm1
1533	vmovdqa	32*0($b_ptr), %ymm0
1534	lea	256($b_ptr), %rax		# size optimization
1535	vpor	32*1($b_ptr), %ymm0, %ymm0
1536	vpor	32*2($b_ptr), %ymm0, %ymm0
1537	vpor	32*3($b_ptr), %ymm0, %ymm0
1538	vpor	32*4-256(%rax), %ymm0, %ymm0
1539	lea	256(%rax), %rcx			# size optimization
1540	vpor	32*5-256(%rax), %ymm0, %ymm0
1541	vpor	32*6-256(%rax), %ymm0, %ymm0
1542	vpor	32*7-256(%rax), %ymm0, %ymm0
1543	vpor	32*8-256(%rax), %ymm0, %ymm0
1544	vpor	32*9-256(%rax), %ymm0, %ymm0
1545	vpor	32*10-256(%rax), %ymm0, %ymm0
1546	vpor	32*11-256(%rax), %ymm0, %ymm0
1547	vpor	32*12-512(%rcx), %ymm0, %ymm0
1548	vpor	32*13-512(%rcx), %ymm0, %ymm0
1549	vpor	32*14-512(%rcx), %ymm0, %ymm0
1550	vpor	32*15-512(%rcx), %ymm0, %ymm0
1551	vpor	32*16-512(%rcx), %ymm0, %ymm0
1552	vpor	32*17-512(%rcx), %ymm0, %ymm0
1553	vpcmpeqq %ymm1, %ymm0, %ymm0
1554	vmovdqa	%ymm0, `32*9*8+32`(%rsp)
1555
1556	#	H = U2 - U1 = X2 - X1
1557	lea	`32*9*0`($b_ptr), %rsi
1558	lea	`32*9*0`($a_ptr), %rdx
1559	lea	`32*9*3`(%rsp), %rdi
1560	call	avx2_sub_x4
1561	call	avx2_normalize_n_store
1562
1563	#	R = S2 - S1 = Y2 - Y1
1564	lea	`32*9*1`($b_ptr), %rsi
1565	lea	`32*9*1`($a_ptr), %rdx
1566	lea	`32*9*4`(%rsp), %rdi
1567	call	avx2_sub_x4
1568	call	avx2_normalize_n_store
1569
1570	#	Z3 = H*Z1*Z2 = H
1571	lea	`32*9*3`(%rsp), %rsi
1572	lea	`32*9*2`($r_ptr), %rdi
1573	call	avx2_mul_by1_x4
1574	call	avx2_normalize
1575
1576	vmovdqa	`32*9*8`(%rsp), $B
1577	vpor	`32*9*8+32`(%rsp), $B, $B
1578
1579	vpandn	$ACC0, $B, $ACC0
1580	lea	.LONE+128(%rip), %rax
1581	vpandn	$ACC1, $B, $ACC1
1582	vpandn	$ACC2, $B, $ACC2
1583	vpandn	$ACC3, $B, $ACC3
1584	vpandn	$ACC4, $B, $ACC4
1585	vpandn	$ACC5, $B, $ACC5
1586	vpandn	$ACC6, $B, $ACC6
1587	vpandn	$ACC7, $B, $ACC7
1588
1589	vpand	32*0-128(%rax), $B, $T0
1590	 vpandn	$ACC8, $B, $ACC8
1591	vpand	32*1-128(%rax), $B, $Y
1592	vpxor	$T0, $ACC0, $ACC0
1593	vpand	32*2-128(%rax), $B, $T0
1594	vpxor	$Y, $ACC1, $ACC1
1595	vpand	32*3-128(%rax), $B, $Y
1596	vpxor	$T0, $ACC2, $ACC2
1597	vpand	32*4-128(%rax), $B, $T0
1598	vpxor	$Y, $ACC3, $ACC3
1599	vpand	32*5-128(%rax), $B, $Y
1600	vpxor	$T0, $ACC4, $ACC4
1601	vpand	32*6-128(%rax), $B, $T0
1602	vpxor	$Y, $ACC5, $ACC5
1603	vpand	32*7-128(%rax), $B, $Y
1604	vpxor	$T0, $ACC6, $ACC6
1605	vpand	32*8-128(%rax), $B, $T0
1606	vpxor	$Y, $ACC7, $ACC7
1607	vpxor	$T0, $ACC8, $ACC8
1608	`&STORE`
1609
1610	#	R^2 = R^2
1611	lea	`32*9*4`(%rsp), %rsi
1612	lea	`32*9*6`(%rsp), %rdi
1613	lea	`32*9*8+32*2`(%rsp), %rcx	# temporary vector
1614	call	avx2_sqr_x4
1615	call	avx2_normalize_n_store
1616
1617	#	H^2 = H^2
1618	lea	`32*9*3`(%rsp), %rsi
1619	lea	`32*9*5`(%rsp), %rdi
1620	call	avx2_sqr_x4
1621	call	avx2_normalize_n_store
1622
1623	#	H^3 = H^2*H
1624	lea	`32*9*3`(%rsp), %rsi
1625	lea	`32*9*5`(%rsp), %rdx
1626	lea	`32*9*7`(%rsp), %rdi
1627	call	avx2_mul_x4
1628	call	avx2_normalize_n_store
1629
1630	#	U2 = U1*H^2
1631	lea	`32*9*0`($a_ptr), %rsi
1632	lea	`32*9*5`(%rsp), %rdx
1633	lea	`32*9*0`(%rsp), %rdi
1634	call	avx2_mul_x4
1635	#call	avx2_normalize
1636	`&STORE`
1637
1638	#	Hsqr = U2*2
1639	#lea	32*9*0(%rsp), %rsi
1640	#lea	32*9*5(%rsp), %rdi
1641	#call	avx2_mul_by2_x4
1642
1643	vpaddq	$ACC0, $ACC0, $ACC0	# inlined avx2_mul_by2_x4
1644	lea	`32*9*5`(%rsp), %rdi
1645	vpaddq	$ACC1, $ACC1, $ACC1
1646	vpaddq	$ACC2, $ACC2, $ACC2
1647	vpaddq	$ACC3, $ACC3, $ACC3
1648	vpaddq	$ACC4, $ACC4, $ACC4
1649	vpaddq	$ACC5, $ACC5, $ACC5
1650	vpaddq	$ACC6, $ACC6, $ACC6
1651	vpaddq	$ACC7, $ACC7, $ACC7
1652	vpaddq	$ACC8, $ACC8, $ACC8
1653	call	avx2_normalize_n_store
1654
1655	#	X3 = R^2 - H^3
1656	#lea	32*9*6(%rsp), %rsi
1657	#lea	32*9*7(%rsp), %rdx
1658	#lea	32*9*5(%rsp), %rcx
1659	#lea	32*9*0($r_ptr), %rdi
1660	#call	avx2_sub_x4
1661	#NORMALIZE
1662	#STORE
1663
1664	#	X3 = X3 - U2*2
1665	#lea	32*9*0($r_ptr), %rsi
1666	#lea	32*9*0($r_ptr), %rdi
1667	#call	avx2_sub_x4
1668	#NORMALIZE
1669	#STORE
1670
1671	lea	`32*9*6+128`(%rsp), %rsi
1672	lea	.LAVX2_POLY_x2+128(%rip), %rax
1673	lea	`32*9*7+128`(%rsp), %rdx
1674	lea	`32*9*5+128`(%rsp), %rcx
1675	lea	`32*9*0`($r_ptr), %rdi
1676
1677	vmovdqa	32*0-128(%rsi), $ACC0
1678	vmovdqa	32*1-128(%rsi), $ACC1
1679	vmovdqa	32*2-128(%rsi), $ACC2
1680	vmovdqa	32*3-128(%rsi), $ACC3
1681	vmovdqa	32*4-128(%rsi), $ACC4
1682	vmovdqa	32*5-128(%rsi), $ACC5
1683	vmovdqa	32*6-128(%rsi), $ACC6
1684	vmovdqa	32*7-128(%rsi), $ACC7
1685	vmovdqa	32*8-128(%rsi), $ACC8
1686
1687	vpaddq	32*0-128(%rax), $ACC0, $ACC0
1688	vpaddq	32*1-128(%rax), $ACC1, $ACC1
1689	vpaddq	32*2-128(%rax), $ACC2, $ACC2
1690	vpaddq	32*3-128(%rax), $ACC3, $ACC3
1691	vpaddq	32*4-128(%rax), $ACC4, $ACC4
1692	vpaddq	32*5-128(%rax), $ACC5, $ACC5
1693	vpaddq	32*6-128(%rax), $ACC6, $ACC6
1694	vpaddq	32*7-128(%rax), $ACC7, $ACC7
1695	vpaddq	32*8-128(%rax), $ACC8, $ACC8
1696
1697	vpsubq	32*0-128(%rdx), $ACC0, $ACC0
1698	vpsubq	32*1-128(%rdx), $ACC1, $ACC1
1699	vpsubq	32*2-128(%rdx), $ACC2, $ACC2
1700	vpsubq	32*3-128(%rdx), $ACC3, $ACC3
1701	vpsubq	32*4-128(%rdx), $ACC4, $ACC4
1702	vpsubq	32*5-128(%rdx), $ACC5, $ACC5
1703	vpsubq	32*6-128(%rdx), $ACC6, $ACC6
1704	vpsubq	32*7-128(%rdx), $ACC7, $ACC7
1705	vpsubq	32*8-128(%rdx), $ACC8, $ACC8
1706
1707	vpsubq	32*0-128(%rcx), $ACC0, $ACC0
1708	vpsubq	32*1-128(%rcx), $ACC1, $ACC1
1709	vpsubq	32*2-128(%rcx), $ACC2, $ACC2
1710	vpsubq	32*3-128(%rcx), $ACC3, $ACC3
1711	vpsubq	32*4-128(%rcx), $ACC4, $ACC4
1712	vpsubq	32*5-128(%rcx), $ACC5, $ACC5
1713	vpsubq	32*6-128(%rcx), $ACC6, $ACC6
1714	vpsubq	32*7-128(%rcx), $ACC7, $ACC7
1715	vpsubq	32*8-128(%rcx), $ACC8, $ACC8
1716	call	avx2_normalize
1717
1718	lea	32*0($b_ptr), %rsi
1719	lea	32*0($a_ptr), %rdx
1720	call	avx2_select_n_store
1721
1722	#	H = U2 - X3
1723	lea	`32*9*0`(%rsp), %rsi
1724	lea	`32*9*0`($r_ptr), %rdx
1725	lea	`32*9*3`(%rsp), %rdi
1726	call	avx2_sub_x4
1727	call	avx2_normalize_n_store
1728
1729	#	H = H*R
1730	lea	`32*9*3`(%rsp), %rsi
1731	lea	`32*9*4`(%rsp), %rdx
1732	lea	`32*9*3`(%rsp), %rdi
1733	call	avx2_mul_x4
1734	call	avx2_normalize_n_store
1735
1736	#	S2 = S1 * H^3
1737	lea	`32*9*7`(%rsp), %rsi
1738	lea	`32*9*1`($a_ptr), %rdx
1739	lea	`32*9*1`(%rsp), %rdi
1740	call	avx2_mul_x4
1741	call	avx2_normalize_n_store
1742
1743	#
1744	lea	`32*9*3`(%rsp), %rsi
1745	lea	`32*9*1`(%rsp), %rdx
1746	lea	`32*9*1`($r_ptr), %rdi
1747	call	avx2_sub_x4
1748	call	avx2_normalize
1749
1750	lea	32*9($b_ptr), %rsi
1751	lea	32*9($a_ptr), %rdx
1752	call	avx2_select_n_store
1753
1754	#lea	32*9*0($r_ptr), %rsi
1755	#lea	32*9*0($r_ptr), %rdi
1756	#call	avx2_mul_by1_x4
1757	#NORMALIZE
1758	#STORE
1759
1760	lea	`32*9*1`($r_ptr), %rsi
1761	lea	`32*9*1`($r_ptr), %rdi
1762	call	avx2_mul_by1_x4
1763	call	avx2_normalize_n_store
1764
1765	vzeroupper
1766___
1767$code.=<<___	if ($win64);
1768	movaps	%xmm6, -16*10(%rbp)
1769	movaps	%xmm7, -16*9(%rbp)
1770	movaps	%xmm8, -16*8(%rbp)
1771	movaps	%xmm9, -16*7(%rbp)
1772	movaps	%xmm10, -16*6(%rbp)
1773	movaps	%xmm11, -16*5(%rbp)
1774	movaps	%xmm12, -16*4(%rbp)
1775	movaps	%xmm13, -16*3(%rbp)
1776	movaps	%xmm14, -16*2(%rbp)
1777	movaps	%xmm15, -16*1(%rbp)
1778___
1779$code.=<<___;
1780	mov	%rbp, %rsp
1781	pop	%rbp
1782	ret
1783.size	ecp_nistz256_avx2_point_add_affines_x4,.-ecp_nistz256_avx2_point_add_affines_x4
1784
1785################################################################################
1786# void ecp_nistz256_avx2_to_mont(void* RESULTx4, void *Ax4);
1787.globl	ecp_nistz256_avx2_to_mont
1788.type	ecp_nistz256_avx2_to_mont,\@function,2
1789.align	32
1790ecp_nistz256_avx2_to_mont:
1791	vzeroupper
1792___
1793$code.=<<___	if ($win64);
1794	lea	-8-16*10(%rsp), %rsp
1795	vmovaps	%xmm6, -8-16*10(%rax)
1796	vmovaps	%xmm7, -8-16*9(%rax)
1797	vmovaps	%xmm8, -8-16*8(%rax)
1798	vmovaps	%xmm9, -8-16*7(%rax)
1799	vmovaps	%xmm10, -8-16*6(%rax)
1800	vmovaps	%xmm11, -8-16*5(%rax)
1801	vmovaps	%xmm12, -8-16*4(%rax)
1802	vmovaps	%xmm13, -8-16*3(%rax)
1803	vmovaps	%xmm14, -8-16*2(%rax)
1804	vmovaps	%xmm15, -8-16*1(%rax)
1805___
1806$code.=<<___;
1807	vmovdqa	.LAVX2_AND_MASK(%rip), $AND_MASK
1808	lea	.LTO_MONT_AVX2(%rip), %rdx
1809	call	avx2_mul_x4
1810	call	avx2_normalize_n_store
1811
1812	vzeroupper
1813___
1814$code.=<<___	if ($win64);
1815	movaps	16*0(%rsp), %xmm6
1816	movaps	16*1(%rsp), %xmm7
1817	movaps	16*2(%rsp), %xmm8
1818	movaps	16*3(%rsp), %xmm9
1819	movaps	16*4(%rsp), %xmm10
1820	movaps	16*5(%rsp), %xmm11
1821	movaps	16*6(%rsp), %xmm12
1822	movaps	16*7(%rsp), %xmm13
1823	movaps	16*8(%rsp), %xmm14
1824	movaps	16*9(%rsp), %xmm15
1825	lea	8+16*10(%rsp), %rsp
1826___
1827$code.=<<___;
1828	ret
1829.size	ecp_nistz256_avx2_to_mont,.-ecp_nistz256_avx2_to_mont
1830
1831################################################################################
1832# void ecp_nistz256_avx2_from_mont(void* RESULTx4, void *Ax4);
1833.globl	ecp_nistz256_avx2_from_mont
1834.type	ecp_nistz256_avx2_from_mont,\@function,2
1835.align	32
1836ecp_nistz256_avx2_from_mont:
1837	vzeroupper
1838___
1839$code.=<<___	if ($win64);
1840	lea	-8-16*10(%rsp), %rsp
1841	vmovaps	%xmm6, -8-16*10(%rax)
1842	vmovaps	%xmm7, -8-16*9(%rax)
1843	vmovaps	%xmm8, -8-16*8(%rax)
1844	vmovaps	%xmm9, -8-16*7(%rax)
1845	vmovaps	%xmm10, -8-16*6(%rax)
1846	vmovaps	%xmm11, -8-16*5(%rax)
1847	vmovaps	%xmm12, -8-16*4(%rax)
1848	vmovaps	%xmm13, -8-16*3(%rax)
1849	vmovaps	%xmm14, -8-16*2(%rax)
1850	vmovaps	%xmm15, -8-16*1(%rax)
1851___
1852$code.=<<___;
1853	vmovdqa	.LAVX2_AND_MASK(%rip), $AND_MASK
1854	lea	.LFROM_MONT_AVX2(%rip), %rdx
1855	call	avx2_mul_x4
1856	call	avx2_normalize_n_store
1857
1858	vzeroupper
1859___
1860$code.=<<___	if ($win64);
1861	movaps	16*0(%rsp), %xmm6
1862	movaps	16*1(%rsp), %xmm7
1863	movaps	16*2(%rsp), %xmm8
1864	movaps	16*3(%rsp), %xmm9
1865	movaps	16*4(%rsp), %xmm10
1866	movaps	16*5(%rsp), %xmm11
1867	movaps	16*6(%rsp), %xmm12
1868	movaps	16*7(%rsp), %xmm13
1869	movaps	16*8(%rsp), %xmm14
1870	movaps	16*9(%rsp), %xmm15
1871	lea	8+16*10(%rsp), %rsp
1872___
1873$code.=<<___;
1874	ret
1875.size	ecp_nistz256_avx2_from_mont,.-ecp_nistz256_avx2_from_mont
1876
1877################################################################################
1878# void ecp_nistz256_avx2_set1(void* RESULTx4);
1879.globl	ecp_nistz256_avx2_set1
1880.type	ecp_nistz256_avx2_set1,\@function,1
1881.align	32
1882ecp_nistz256_avx2_set1:
1883	lea	.LONE+128(%rip), %rax
1884	lea	128(%rdi), %rdi
1885	vzeroupper
1886	vmovdqa	32*0-128(%rax), %ymm0
1887	vmovdqa	32*1-128(%rax), %ymm1
1888	vmovdqa	32*2-128(%rax), %ymm2
1889	vmovdqa	32*3-128(%rax), %ymm3
1890	vmovdqa	32*4-128(%rax), %ymm4
1891	vmovdqa	32*5-128(%rax), %ymm5
1892	vmovdqa	%ymm0, 32*0-128(%rdi)
1893	vmovdqa	32*6-128(%rax), %ymm0
1894	vmovdqa	%ymm1, 32*1-128(%rdi)
1895	vmovdqa	32*7-128(%rax), %ymm1
1896	vmovdqa	%ymm2, 32*2-128(%rdi)
1897	vmovdqa	32*8-128(%rax), %ymm2
1898	vmovdqa	%ymm3, 32*3-128(%rdi)
1899	vmovdqa	%ymm4, 32*4-128(%rdi)
1900	vmovdqa	%ymm5, 32*5-128(%rdi)
1901	vmovdqa	%ymm0, 32*6-128(%rdi)
1902	vmovdqa	%ymm1, 32*7-128(%rdi)
1903	vmovdqa	%ymm2, 32*8-128(%rdi)
1904
1905	vzeroupper
1906	ret
1907.size	ecp_nistz256_avx2_set1,.-ecp_nistz256_avx2_set1
1908___
1909}
1910{
1911################################################################################
1912# void ecp_nistz256_avx2_multi_select_w7(void* RESULT, void *in,
1913#			    int index0, int index1, int index2, int index3);
1914################################################################################
1915
1916my ($val,$in_t,$index0,$index1,$index2,$index3)=("%rdi","%rsi","%edx","%ecx","%r8d","%r9d");
1917my ($INDEX0,$INDEX1,$INDEX2,$INDEX3)=map("%ymm$_",(0..3));
1918my ($R0a,$R0b,$R1a,$R1b,$R2a,$R2b,$R3a,$R3b)=map("%ymm$_",(4..11));
1919my ($M0,$T0,$T1,$TMP0)=map("%ymm$_",(12..15));
1920
1921$code.=<<___;
1922.globl	ecp_nistz256_avx2_multi_select_w7
1923.type	ecp_nistz256_avx2_multi_select_w7,\@function,6
1924.align	32
1925ecp_nistz256_avx2_multi_select_w7:
1926	vzeroupper
1927___
1928$code.=<<___	if ($win64);
1929	lea	-8-16*10(%rsp), %rsp
1930	vmovaps	%xmm6, -8-16*10(%rax)
1931	vmovaps	%xmm7, -8-16*9(%rax)
1932	vmovaps	%xmm8, -8-16*8(%rax)
1933	vmovaps	%xmm9, -8-16*7(%rax)
1934	vmovaps	%xmm10, -8-16*6(%rax)
1935	vmovaps	%xmm11, -8-16*5(%rax)
1936	vmovaps	%xmm12, -8-16*4(%rax)
1937	vmovaps	%xmm13, -8-16*3(%rax)
1938	vmovaps	%xmm14, -8-16*2(%rax)
1939	vmovaps	%xmm15, -8-16*1(%rax)
1940___
1941$code.=<<___;
1942	lea	.LIntOne(%rip), %rax
1943
1944	vmovd	$index0, %xmm0
1945	vmovd	$index1, %xmm1
1946	vmovd	$index2, %xmm2
1947	vmovd	$index3, %xmm3
1948
1949	vpxor	$R0a, $R0a, $R0a
1950	vpxor	$R0b, $R0b, $R0b
1951	vpxor	$R1a, $R1a, $R1a
1952	vpxor	$R1b, $R1b, $R1b
1953	vpxor	$R2a, $R2a, $R2a
1954	vpxor	$R2b, $R2b, $R2b
1955	vpxor	$R3a, $R3a, $R3a
1956	vpxor	$R3b, $R3b, $R3b
1957	vmovdqa	(%rax), $M0
1958
1959	vpermd	$INDEX0, $R0a, $INDEX0
1960	vpermd	$INDEX1, $R0a, $INDEX1
1961	vpermd	$INDEX2, $R0a, $INDEX2
1962	vpermd	$INDEX3, $R0a, $INDEX3
1963
1964	mov	\$64, %ecx
1965	lea	112($val), $val		# size optimization
1966	jmp	.Lmulti_select_loop_avx2
1967
1968# INDEX=0, corresponds to the point at infty (0,0)
1969.align	32
1970.Lmulti_select_loop_avx2:
1971	vpcmpeqd	$INDEX0, $M0, $TMP0
1972
1973	vmovdqa		`32*0+32*64*2*0`($in_t), $T0
1974	vmovdqa		`32*1+32*64*2*0`($in_t), $T1
1975	vpand		$TMP0, $T0, $T0
1976	vpand		$TMP0, $T1, $T1
1977	vpxor		$T0, $R0a, $R0a
1978	vpxor		$T1, $R0b, $R0b
1979
1980	vpcmpeqd	$INDEX1, $M0, $TMP0
1981
1982	vmovdqa		`32*0+32*64*2*1`($in_t), $T0
1983	vmovdqa		`32*1+32*64*2*1`($in_t), $T1
1984	vpand		$TMP0, $T0, $T0
1985	vpand		$TMP0, $T1, $T1
1986	vpxor		$T0, $R1a, $R1a
1987	vpxor		$T1, $R1b, $R1b
1988
1989	vpcmpeqd	$INDEX2, $M0, $TMP0
1990
1991	vmovdqa		`32*0+32*64*2*2`($in_t), $T0
1992	vmovdqa		`32*1+32*64*2*2`($in_t), $T1
1993	vpand		$TMP0, $T0, $T0
1994	vpand		$TMP0, $T1, $T1
1995	vpxor		$T0, $R2a, $R2a
1996	vpxor		$T1, $R2b, $R2b
1997
1998	vpcmpeqd	$INDEX3, $M0, $TMP0
1999
2000	vmovdqa		`32*0+32*64*2*3`($in_t), $T0
2001	vmovdqa		`32*1+32*64*2*3`($in_t), $T1
2002	vpand		$TMP0, $T0, $T0
2003	vpand		$TMP0, $T1, $T1
2004	vpxor		$T0, $R3a, $R3a
2005	vpxor		$T1, $R3b, $R3b
2006
2007	vpaddd		(%rax), $M0, $M0	# increment
2008	lea		32*2($in_t), $in_t
2009
2010        dec	%ecx
2011	jnz	.Lmulti_select_loop_avx2
2012
2013	vmovdqu	$R0a, 32*0-112($val)
2014	vmovdqu	$R0b, 32*1-112($val)
2015	vmovdqu	$R1a, 32*2-112($val)
2016	vmovdqu	$R1b, 32*3-112($val)
2017	vmovdqu	$R2a, 32*4-112($val)
2018	vmovdqu	$R2b, 32*5-112($val)
2019	vmovdqu	$R3a, 32*6-112($val)
2020	vmovdqu	$R3b, 32*7-112($val)
2021
2022	vzeroupper
2023___
2024$code.=<<___	if ($win64);
2025	movaps	16*0(%rsp), %xmm6
2026	movaps	16*1(%rsp), %xmm7
2027	movaps	16*2(%rsp), %xmm8
2028	movaps	16*3(%rsp), %xmm9
2029	movaps	16*4(%rsp), %xmm10
2030	movaps	16*5(%rsp), %xmm11
2031	movaps	16*6(%rsp), %xmm12
2032	movaps	16*7(%rsp), %xmm13
2033	movaps	16*8(%rsp), %xmm14
2034	movaps	16*9(%rsp), %xmm15
2035	lea	8+16*10(%rsp), %rsp
2036___
2037$code.=<<___;
2038	ret
2039.size	ecp_nistz256_avx2_multi_select_w7,.-ecp_nistz256_avx2_multi_select_w7
2040
2041.extern	OPENSSL_ia32cap_P
2042.globl	ecp_nistz_avx2_eligible
2043.type	ecp_nistz_avx2_eligible,\@abi-omnipotent
2044.align	32
2045ecp_nistz_avx2_eligible:
2046	mov	OPENSSL_ia32cap_P+8(%rip),%eax
2047	shr	\$5,%eax
2048	and	\$1,%eax
2049	ret
2050.size	ecp_nistz_avx2_eligible,.-ecp_nistz_avx2_eligible
2051___
2052}
2053}} else {{	# assembler is too old
2054$code.=<<___;
2055.text
2056
2057.globl	ecp_nistz256_avx2_transpose_convert
2058.globl	ecp_nistz256_avx2_convert_transpose_back
2059.globl	ecp_nistz256_avx2_point_add_affine_x4
2060.globl	ecp_nistz256_avx2_point_add_affines_x4
2061.globl	ecp_nistz256_avx2_to_mont
2062.globl	ecp_nistz256_avx2_from_mont
2063.globl	ecp_nistz256_avx2_set1
2064.globl	ecp_nistz256_avx2_multi_select_w7
2065.type	ecp_nistz256_avx2_multi_select_w7,\@abi-omnipotent
2066ecp_nistz256_avx2_transpose_convert:
2067ecp_nistz256_avx2_convert_transpose_back:
2068ecp_nistz256_avx2_point_add_affine_x4:
2069ecp_nistz256_avx2_point_add_affines_x4:
2070ecp_nistz256_avx2_to_mont:
2071ecp_nistz256_avx2_from_mont:
2072ecp_nistz256_avx2_set1:
2073ecp_nistz256_avx2_multi_select_w7:
2074	.byte	0x0f,0x0b	# ud2
2075	ret
2076.size	ecp_nistz256_avx2_multi_select_w7,.-ecp_nistz256_avx2_multi_select_w7
2077
2078.globl	ecp_nistz_avx2_eligible
2079.type	ecp_nistz_avx2_eligible,\@abi-omnipotent
2080ecp_nistz_avx2_eligible:
2081	xor	%eax,%eax
2082	ret
2083.size	ecp_nistz_avx2_eligible,.-ecp_nistz_avx2_eligible
2084___
2085}}
2086
2087foreach (split("\n",$code)) {
2088	s/\`([^\`]*)\`/eval($1)/geo;
2089
2090	print $_,"\n";
2091}
2092
2093close STDOUT;
2094