#include "arm_asm.h"
#include "arm_arch.h"

.text

// forward "declarations" are required for Apple

.hidden	OPENSSL_armcap_P
.globl	poly1305_init
.hidden	poly1305_init
.globl	poly1305_blocks
.hidden	poly1305_blocks
.globl	poly1305_emit
.hidden	poly1305_emit
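
// Calling conventions, as wired up by the OpenSSL/CRYPTOGAMS C glue:
//
//	poly1305_init(void *ctx, const u8 key[32], void *func[2])
//	poly1305_blocks(void *ctx, const u8 *inp, size_t len, u32 padbit)
//	poly1305_emit(void *ctx, u8 mac[16], const u32 nonce[4])
//
// ctx holds the hash value h (3x64 bits), the is_base2_26 flag at
// offset 24, the clamped key r at offset 32 and, from offset 48 on,
// the r^n table used by the NEON code path.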

.type	poly1305_init,%function
.align	5
poly1305_init:
	cmp	x1,xzr
	stp	xzr,xzr,[x0]		// zero hash value
	stp	xzr,xzr,[x0,#16]	// [along with is_base2_26]

	csel	x0,xzr,x0,eq
	b.eq	.Lno_key

	adrp	x17,OPENSSL_armcap_P
	ldr	w17,[x17,#:lo12:OPENSSL_armcap_P]

	ldp	x7,x8,[x1]		// load key
	mov	x9,#0xfffffffc0fffffff
	movk	x9,#0x0fff,lsl#48
#ifdef	__AARCH64EB__
	rev	x7,x7			// flip bytes
	rev	x8,x8
#endif
	and	x7,x7,x9		// &=0ffffffc0fffffff
	and	x9,x9,#-4
	and	x8,x8,x9		// &=0ffffffc0ffffffc
	stp	x7,x8,[x0,#32]	// save key value

	tst	w17,#ARMV7_NEON

	adr	x12,.Lpoly1305_blocks
	adr	x7,.Lpoly1305_blocks_neon
	adr	x13,.Lpoly1305_emit
	adr	x8,.Lpoly1305_emit_neon

	csel	x12,x12,x7,eq
	csel	x13,x13,x8,eq
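	// tst left Z set when the ARMV7_NEON capability bit is clear, so
	// "eq" keeps the scalar entry points and NEON-capable cores get
	// the vectorized ones; both pointers are returned through func[2].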

#ifdef	__ILP32__
	stp	w12,w13,[x2]
#else
	stp	x12,x13,[x2]
#endif

	mov	x0,#1
.Lno_key:
	ret
.size	poly1305_init,.-poly1305_init

.type	poly1305_blocks,%function
.align	5
poly1305_blocks:
.Lpoly1305_blocks:
	ands	x2,x2,#-16
	b.eq	.Lno_data

	ldp	x4,x5,[x0]		// load hash value
	ldp	x7,x8,[x0,#32]	// load key value
	ldr	x6,[x0,#16]
	add	x9,x8,x8,lsr#2	// s1 = r1 + (r1 >> 2)
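	// 2^128 = 2^130/4 = 5/4 (mod 2^130-5), so the h1*r1 term folds
	// back in as h1*(5*r1/4); the division is exact because clamping
	// cleared the low two bits of r1, hence s1 = r1 + (r1>>2).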
	b	.Loop

.align	5
.Loop:
	ldp	x10,x11,[x1],#16	// load input
	sub	x2,x2,#16
#ifdef	__AARCH64EB__
	rev	x10,x10
	rev	x11,x11
#endif
	adds	x4,x4,x10		// accumulate input
	adcs	x5,x5,x11

	mul	x12,x4,x7		// h0*r0
	adc	x6,x6,x3
	umulh	x13,x4,x7

	mul	x10,x5,x9		// h1*5*r1
	umulh	x11,x5,x9

	adds	x12,x12,x10
	mul	x10,x4,x8		// h0*r1
	adc	x13,x13,x11
	umulh	x14,x4,x8

	adds	x13,x13,x10
	mul	x10,x5,x7		// h1*r0
	adc	x14,x14,xzr
	umulh	x11,x5,x7

	adds	x13,x13,x10
	mul	x10,x6,x9		// h2*5*r1
	adc	x14,x14,x11
	mul	x11,x6,x7		// h2*r0

	adds	x13,x13,x10
	adc	x14,x14,x11

	and	x10,x14,#-4		// final reduction
	and	x6,x14,#3
	add	x10,x10,x14,lsr#2
	adds	x4,x12,x10
	adcs	x5,x13,xzr
	adc	x6,x6,xzr
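	// x14 = (x14 & 3) + 4*(x14>>2) and 4*2^128 = 2^130 = 5 (mod p),
	// so the excess bits of h2 re-enter at h0 as
	// 5*(x14>>2) = (x14 & ~3) + (x14>>2), leaving h2 = x14 mod 4.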

	cbnz	x2,.Loop

	stp	x4,x5,[x0]		// store hash value
	str	x6,[x0,#16]

.Lno_data:
	ret
.size	poly1305_blocks,.-poly1305_blocks

.type	poly1305_emit,%function
.align	5
poly1305_emit:
.Lpoly1305_emit:
	ldp	x4,x5,[x0]		// load hash base 2^64
	ldr	x6,[x0,#16]
	ldp	x10,x11,[x2]	// load nonce

	adds	x12,x4,#5		// compare to modulus
	adcs	x13,x5,xzr
	adc	x14,x6,xzr

	tst	x14,#-4			// see if it's carried/borrowed

	csel	x4,x4,x12,eq
	csel	x5,x5,x13,eq

#ifdef	__AARCH64EB__
	ror	x10,x10,#32		// flip nonce words
	ror	x11,x11,#32
#endif
	adds	x4,x4,x10		// accumulate nonce
	adc	x5,x5,x11
#ifdef	__AARCH64EB__
	rev	x4,x4			// flip output bytes
	rev	x5,x5
#endif
	stp	x4,x5,[x1]		// write result

	ret
.size	poly1305_emit,.-poly1305_emit
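
// poly1305_mult computes h = h*r mod 2^130-5 for the code below, with
// the same register contract as the scalar loop body: hash in x4-x6
// (base 2^64, spare bits in x6), r0-r1 in x7-x8, s1 in x9; x10-x14
// are clobbered.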
.type	poly1305_mult,%function
.align	5
poly1305_mult:
	mul	x12,x4,x7		// h0*r0
	umulh	x13,x4,x7

	mul	x10,x5,x9		// h1*5*r1
	umulh	x11,x5,x9

	adds	x12,x12,x10
	mul	x10,x4,x8		// h0*r1
	adc	x13,x13,x11
	umulh	x14,x4,x8

	adds	x13,x13,x10
	mul	x10,x5,x7		// h1*r0
	adc	x14,x14,xzr
	umulh	x11,x5,x7

	adds	x13,x13,x10
	mul	x10,x6,x9		// h2*5*r1
	adc	x14,x14,x11
	mul	x11,x6,x7		// h2*r0

	adds	x13,x13,x10
	adc	x14,x14,x11

	and	x10,x14,#-4		// final reduction
	and	x6,x14,#3
	add	x10,x10,x14,lsr#2
	adds	x4,x12,x10
	adcs	x5,x13,xzr
	adc	x6,x6,xzr

	ret
.size	poly1305_mult,.-poly1305_mult

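// poly1305_splat converts the power of r in x4-x6 from base 2^64 to
// five base 2^26 limbs and stores them, along with the premultiplied
// 5*r_k values, as 32-bit words 16 bytes apart: one lane of the
// four-lane r^n table consumed by the NEON loop.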
.type	poly1305_splat,%function
.align	5
poly1305_splat:
	and	x12,x4,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x13,x4,#26,#26
	extr	x14,x5,x4,#52
	and	x14,x14,#0x03ffffff
	ubfx	x15,x5,#14,#26
	extr	x16,x6,x5,#40

	str	w12,[x0,#16*0]	// r0
	add	w12,w13,w13,lsl#2	// r1*5
	str	w13,[x0,#16*1]	// r1
	add	w13,w14,w14,lsl#2	// r2*5
	str	w12,[x0,#16*2]	// s1
	str	w14,[x0,#16*3]	// r2
	add	w14,w15,w15,lsl#2	// r3*5
	str	w13,[x0,#16*4]	// s2
	str	w15,[x0,#16*5]	// r3
	add	w15,w16,w16,lsl#2	// r4*5
	str	w14,[x0,#16*6]	// s3
	str	w16,[x0,#16*7]	// r4
	str	w15,[x0,#16*8]	// s4

	ret
.size	poly1305_splat,.-poly1305_splat

.type	poly1305_blocks_neon,%function
.align	5
poly1305_blocks_neon:
.Lpoly1305_blocks_neon:
	ldr	x17,[x0,#24]
	cmp	x2,#128
	b.hs	.Lblocks_neon
	cbz	x17,.Lpoly1305_blocks

.Lblocks_neon:
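	// Sign the return address (PAC); encoded as a raw .inst so that
	// pre-ARMv8.3 assemblers accept it, and it executes as a NOP on
	// cores without pointer authentication.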
.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-80]!
	add	x29,sp,#0

	ands	x2,x2,#-16
	b.eq	.Lno_data_neon

	cbz	x17,.Lbase2_64_neon

	ldp	w10,w11,[x0]		// load hash value base 2^26
	ldp	w12,w13,[x0,#8]
	ldr	w14,[x0,#16]

	tst	x2,#31
	b.eq	.Leven_neon

	ldp	x7,x8,[x0,#32]	// load key value

	add	x4,x10,x11,lsl#26	// base 2^26 -> base 2^64
	lsr	x5,x12,#12
	adds	x4,x4,x12,lsl#52
	add	x5,x5,x13,lsl#14
	adc	x5,x5,xzr
	lsr	x6,x14,#24
	adds	x5,x5,x14,lsl#40
	adc	x14,x6,xzr		// can be partially reduced...
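	// x4, x5 and x14 now hold h0 + h1*2^26 + h2*2^52 + h3*2^78 +
	// h4*2^104 repacked into base 2^64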

	ldp	x12,x13,[x1],#16	// load input
	sub	x2,x2,#16
	add	x9,x8,x8,lsr#2	// s1 = r1 + (r1 >> 2)

	and	x10,x14,#-4		// ... so reduce
	and	x6,x14,#3
	add	x10,x10,x14,lsr#2
	adds	x4,x4,x10
	adcs	x5,x5,xzr
	adc	x6,x6,xzr

#ifdef	__AARCH64EB__
	rev	x12,x12
	rev	x13,x13
#endif
	adds	x4,x4,x12		// accumulate input
	adcs	x5,x5,x13
	adc	x6,x6,x3

	bl	poly1305_mult
	ldr	x30,[sp,#8]

	cbz	x3,.Lstore_base2_64_neon

	and	x10,x4,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x11,x4,#26,#26
	extr	x12,x5,x4,#52
	and	x12,x12,#0x03ffffff
	ubfx	x13,x5,#14,#26
	extr	x14,x6,x5,#40

	cbnz	x2,.Leven_neon

	stp	w10,w11,[x0]		// store hash value base 2^26
	stp	w12,w13,[x0,#8]
	str	w14,[x0,#16]
	b	.Lno_data_neon

.align	4
.Lstore_base2_64_neon:
	stp	x4,x5,[x0]		// store hash value base 2^64
	stp	x6,xzr,[x0,#16]	// note that is_base2_26 is zeroed
	b	.Lno_data_neon

.align	4
.Lbase2_64_neon:
	ldp	x7,x8,[x0,#32]	// load key value

	ldp	x4,x5,[x0]		// load hash value base 2^64
	ldr	x6,[x0,#16]

	tst	x2,#31
	b.eq	.Linit_neon

	ldp	x12,x13,[x1],#16	// load input
	sub	x2,x2,#16
	add	x9,x8,x8,lsr#2	// s1 = r1 + (r1 >> 2)
#ifdef	__AARCH64EB__
	rev	x12,x12
	rev	x13,x13
#endif
	adds	x4,x4,x12		// accumulate input
	adcs	x5,x5,x13
	adc	x6,x6,x3

	bl	poly1305_mult

.Linit_neon:
	and	x10,x4,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x11,x4,#26,#26
	extr	x12,x5,x4,#52
	and	x12,x12,#0x03ffffff
	ubfx	x13,x5,#14,#26
	extr	x14,x6,x5,#40

	stp	d8,d9,[sp,#16]		// meet ABI requirements
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]

	fmov	d24,x10
	fmov	d25,x11
	fmov	d26,x12
	fmov	d27,x13
	fmov	d28,x14

	////////////////////////////////// initialize r^n table
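	// The table keeps r^4..r^1 in the four 32-bit lanes of each
	// 16-byte row; the powers are produced by repeated poly1305_mult
	// and written one lane at a time, which is why x0 steps back by 4
	// between the poly1305_splat calls.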
	mov	x4,x7			// r^1
	add	x9,x8,x8,lsr#2	// s1 = r1 + (r1 >> 2)
	mov	x5,x8
	mov	x6,xzr
	add	x0,x0,#48+12
	bl	poly1305_splat

	bl	poly1305_mult		// r^2
	sub	x0,x0,#4
	bl	poly1305_splat

	bl	poly1305_mult		// r^3
	sub	x0,x0,#4
	bl	poly1305_splat

	bl	poly1305_mult		// r^4
	sub	x0,x0,#4
	bl	poly1305_splat
	ldr	x30,[sp,#8]

	add	x16,x1,#32
	adr	x17,.Lzeros
	subs	x2,x2,#64
	csel	x16,x17,x16,lo

	mov	x4,#1
	stur	x4,[x0,#-24]		// set is_base2_26
	sub	x0,x0,#48		// restore original x0
	b	.Ldo_neon

.align	4
.Leven_neon:
	add	x16,x1,#32
	adr	x17,.Lzeros
	subs	x2,x2,#64
	csel	x16,x17,x16,lo

	stp	d8,d9,[sp,#16]		// meet ABI requirements
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]

	fmov	d24,x10
	fmov	d25,x11
	fmov	d26,x12
	fmov	d27,x13
	fmov	d28,x14

.Ldo_neon:
	ldp	x8,x12,[x16],#16	// inp[2:3] (or zero)
	ldp	x9,x13,[x16],#48

	lsl	x3,x3,#24
	add	x15,x0,#48

#ifdef	__AARCH64EB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	and	x5,x9,#0x03ffffff
	ubfx	x6,x8,#26,#26
	ubfx	x7,x9,#26,#26
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
	extr	x8,x12,x8,#52
	extr	x9,x13,x9,#52
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	fmov	d14,x4
	and	x8,x8,#0x03ffffff
	and	x9,x9,#0x03ffffff
	ubfx	x10,x12,#14,#26
	ubfx	x11,x13,#14,#26
	add	x12,x3,x12,lsr#40
	add	x13,x3,x13,lsr#40
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	fmov	d15,x6
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	fmov	d16,x8
	fmov	d17,x10
	fmov	d18,x12

	ldp	x8,x12,[x1],#16	// inp[0:1]
	ldp	x9,x13,[x1],#48

	ld1	{v0.4s,v1.4s,v2.4s,v3.4s},[x15],#64
	ld1	{v4.4s,v5.4s,v6.4s,v7.4s},[x15],#64
	ld1	{v8.4s},[x15]

#ifdef	__AARCH64EB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	and	x5,x9,#0x03ffffff
	ubfx	x6,x8,#26,#26
	ubfx	x7,x9,#26,#26
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
	extr	x8,x12,x8,#52
	extr	x9,x13,x9,#52
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	fmov	d9,x4
	and	x8,x8,#0x03ffffff
	and	x9,x9,#0x03ffffff
	ubfx	x10,x12,#14,#26
	ubfx	x11,x13,#14,#26
	add	x12,x3,x12,lsr#40
	add	x13,x3,x13,lsr#40
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	fmov	d10,x6
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	movi	v31.2d,#-1
	fmov	d11,x8
	fmov	d12,x10
	fmov	d13,x12
	ushr	v31.2d,v31.2d,#38

	b.ls	.Lskip_loop

.align	4
.Loop_neon:
	////////////////////////////////////////////////////////////////
	// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
	// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
	//   \___________________/
	// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
	// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
	//   \___________________/ \____________________/
	//
	// Note that we start with inp[2:3]*r^2. This is because it
	// doesn't depend on reduction in previous iteration.
	////////////////////////////////////////////////////////////////
	// d4 = h0*r4 + h1*r3   + h2*r2   + h3*r1   + h4*r0
	// d3 = h0*r3 + h1*r2   + h2*r1   + h3*r0   + h4*5*r4
	// d2 = h0*r2 + h1*r1   + h2*r0   + h3*5*r4 + h4*5*r3
	// d1 = h0*r1 + h1*r0   + h2*5*r4 + h3*5*r3 + h4*5*r2
	// d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1
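	//
	// The 5*r_k values are the s_k words precomputed by
	// poly1305_splat, so every step is a plain 32x32->64
	// multiply-accumulate; scalar conversion of the next four input
	// words to base 2^26 is interleaved with the vector arithmetic
	// to keep both pipelines busy.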

	subs	x2,x2,#64
	umull	v23.2d,v14.2s,v7.s[2]
	csel	x16,x17,x16,lo
	umull	v22.2d,v14.2s,v5.s[2]
	umull	v21.2d,v14.2s,v3.s[2]
	ldp	x8,x12,[x16],#16	// inp[2:3] (or zero)
	umull	v20.2d,v14.2s,v1.s[2]
	ldp	x9,x13,[x16],#48
	umull	v19.2d,v14.2s,v0.s[2]
#ifdef	__AARCH64EB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif

	umlal	v23.2d,v15.2s,v5.s[2]
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	umlal	v22.2d,v15.2s,v3.s[2]
	and	x5,x9,#0x03ffffff
	umlal	v21.2d,v15.2s,v1.s[2]
	ubfx	x6,x8,#26,#26
	umlal	v20.2d,v15.2s,v0.s[2]
	ubfx	x7,x9,#26,#26
	umlal	v19.2d,v15.2s,v8.s[2]
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32

	umlal	v23.2d,v16.2s,v3.s[2]
	extr	x8,x12,x8,#52
	umlal	v22.2d,v16.2s,v1.s[2]
	extr	x9,x13,x9,#52
	umlal	v21.2d,v16.2s,v0.s[2]
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	umlal	v20.2d,v16.2s,v8.s[2]
	fmov	d14,x4
	umlal	v19.2d,v16.2s,v6.s[2]
	and	x8,x8,#0x03ffffff

	umlal	v23.2d,v17.2s,v1.s[2]
	and	x9,x9,#0x03ffffff
	umlal	v22.2d,v17.2s,v0.s[2]
	ubfx	x10,x12,#14,#26
	umlal	v21.2d,v17.2s,v8.s[2]
	ubfx	x11,x13,#14,#26
	umlal	v20.2d,v17.2s,v6.s[2]
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	umlal	v19.2d,v17.2s,v4.s[2]
	fmov	d15,x6

	add	v11.2s,v11.2s,v26.2s
	add	x12,x3,x12,lsr#40
	umlal	v23.2d,v18.2s,v0.s[2]
	add	x13,x3,x13,lsr#40
	umlal	v22.2d,v18.2s,v8.s[2]
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	umlal	v21.2d,v18.2s,v6.s[2]
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	umlal	v20.2d,v18.2s,v4.s[2]
	fmov	d16,x8
	umlal	v19.2d,v18.2s,v2.s[2]
	fmov	d17,x10

	////////////////////////////////////////////////////////////////
	// (hash+inp[0:1])*r^4 and accumulate

	add	v9.2s,v9.2s,v24.2s
	fmov	d18,x12
	umlal	v22.2d,v11.2s,v1.s[0]
	ldp	x8,x12,[x1],#16	// inp[0:1]
	umlal	v19.2d,v11.2s,v6.s[0]
	ldp	x9,x13,[x1],#48
	umlal	v23.2d,v11.2s,v3.s[0]
	umlal	v20.2d,v11.2s,v8.s[0]
	umlal	v21.2d,v11.2s,v0.s[0]
#ifdef	__AARCH64EB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif

	add	v10.2s,v10.2s,v25.2s
	umlal	v22.2d,v9.2s,v5.s[0]
	umlal	v23.2d,v9.2s,v7.s[0]
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	umlal	v21.2d,v9.2s,v3.s[0]
	and	x5,x9,#0x03ffffff
	umlal	v19.2d,v9.2s,v0.s[0]
	ubfx	x6,x8,#26,#26
	umlal	v20.2d,v9.2s,v1.s[0]
	ubfx	x7,x9,#26,#26

	add	v12.2s,v12.2s,v27.2s
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
	umlal	v22.2d,v10.2s,v3.s[0]
	extr	x8,x12,x8,#52
	umlal	v23.2d,v10.2s,v5.s[0]
	extr	x9,x13,x9,#52
	umlal	v19.2d,v10.2s,v8.s[0]
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	umlal	v21.2d,v10.2s,v1.s[0]
	fmov	d9,x4
	umlal	v20.2d,v10.2s,v0.s[0]
	and	x8,x8,#0x03ffffff

	add	v13.2s,v13.2s,v28.2s
	and	x9,x9,#0x03ffffff
	umlal	v22.2d,v12.2s,v0.s[0]
	ubfx	x10,x12,#14,#26
	umlal	v19.2d,v12.2s,v4.s[0]
	ubfx	x11,x13,#14,#26
	umlal	v23.2d,v12.2s,v1.s[0]
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	umlal	v20.2d,v12.2s,v6.s[0]
	fmov	d10,x6
	umlal	v21.2d,v12.2s,v8.s[0]
	add	x12,x3,x12,lsr#40

	umlal	v22.2d,v13.2s,v8.s[0]
	add	x13,x3,x13,lsr#40
	umlal	v19.2d,v13.2s,v2.s[0]
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	umlal	v23.2d,v13.2s,v0.s[0]
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	umlal	v20.2d,v13.2s,v4.s[0]
	fmov	d11,x8
	umlal	v21.2d,v13.2s,v6.s[0]
	fmov	d12,x10
	fmov	d13,x12

	/////////////////////////////////////////////////////////////////
	// lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
	// and P. Schwabe
	//
	// [see discussion in poly1305-armv4 module]
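	//
	// One pass of carries suffices because each limb exceeds 26 bits
	// by only a few bits: h3->h4 and h0->h1 run first, then h1->h2,
	// then the h4 carry re-enters at h0 as 5*carry (the add plus the
	// shl#2/add pair, since 2^130 = 5 mod p), followed by h2->h3,
	// h0->h1 and h3->h4.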

	ushr	v29.2d,v22.2d,#26
	xtn	v27.2s,v22.2d
	ushr	v30.2d,v19.2d,#26
	and	v19.16b,v19.16b,v31.16b
	add	v23.2d,v23.2d,v29.2d	// h3 -> h4
	bic	v27.2s,#0xfc,lsl#24	// &=0x03ffffff
	add	v20.2d,v20.2d,v30.2d	// h0 -> h1

	ushr	v29.2d,v23.2d,#26
	xtn	v28.2s,v23.2d
	ushr	v30.2d,v20.2d,#26
	xtn	v25.2s,v20.2d
	bic	v28.2s,#0xfc,lsl#24
	add	v21.2d,v21.2d,v30.2d	// h1 -> h2

	add	v19.2d,v19.2d,v29.2d
	shl	v29.2d,v29.2d,#2
	shrn	v30.2s,v21.2d,#26
	xtn	v26.2s,v21.2d
	add	v19.2d,v19.2d,v29.2d	// h4 -> h0
	bic	v25.2s,#0xfc,lsl#24
	add	v27.2s,v27.2s,v30.2s		// h2 -> h3
	bic	v26.2s,#0xfc,lsl#24

	shrn	v29.2s,v19.2d,#26
	xtn	v24.2s,v19.2d
	ushr	v30.2s,v27.2s,#26
	bic	v27.2s,#0xfc,lsl#24
	bic	v24.2s,#0xfc,lsl#24
	add	v25.2s,v25.2s,v29.2s		// h0 -> h1
	add	v28.2s,v28.2s,v30.2s		// h3 -> h4

	b.hi	.Loop_neon

.Lskip_loop:
	dup	v16.2d,v16.d[0]
	add	v11.2s,v11.2s,v26.2s

	////////////////////////////////////////////////////////////////
	// multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1

	adds	x2,x2,#32
	b.ne	.Long_tail

	dup	v16.2d,v11.d[0]
	add	v14.2s,v9.2s,v24.2s
	add	v17.2s,v12.2s,v27.2s
	add	v15.2s,v10.2s,v25.2s
	add	v18.2s,v13.2s,v28.2s

.Long_tail:
	dup	v14.2d,v14.d[0]
	umull2	v19.2d,v16.4s,v6.4s
	umull2	v22.2d,v16.4s,v1.4s
	umull2	v23.2d,v16.4s,v3.4s
	umull2	v21.2d,v16.4s,v0.4s
	umull2	v20.2d,v16.4s,v8.4s

	dup	v15.2d,v15.d[0]
	umlal2	v19.2d,v14.4s,v0.4s
	umlal2	v21.2d,v14.4s,v3.4s
	umlal2	v22.2d,v14.4s,v5.4s
	umlal2	v23.2d,v14.4s,v7.4s
	umlal2	v20.2d,v14.4s,v1.4s

	dup	v17.2d,v17.d[0]
	umlal2	v19.2d,v15.4s,v8.4s
	umlal2	v22.2d,v15.4s,v3.4s
	umlal2	v21.2d,v15.4s,v1.4s
	umlal2	v23.2d,v15.4s,v5.4s
	umlal2	v20.2d,v15.4s,v0.4s

	dup	v18.2d,v18.d[0]
	umlal2	v22.2d,v17.4s,v0.4s
	umlal2	v23.2d,v17.4s,v1.4s
	umlal2	v19.2d,v17.4s,v4.4s
	umlal2	v20.2d,v17.4s,v6.4s
	umlal2	v21.2d,v17.4s,v8.4s

	umlal2	v22.2d,v18.4s,v8.4s
	umlal2	v19.2d,v18.4s,v2.4s
	umlal2	v23.2d,v18.4s,v0.4s
	umlal2	v20.2d,v18.4s,v4.4s
	umlal2	v21.2d,v18.4s,v6.4s

	b.eq	.Lshort_tail

	////////////////////////////////////////////////////////////////
	// (hash+inp[0:1])*r^4:r^3 and accumulate

	add	v9.2s,v9.2s,v24.2s
	umlal	v22.2d,v11.2s,v1.2s
	umlal	v19.2d,v11.2s,v6.2s
	umlal	v23.2d,v11.2s,v3.2s
	umlal	v20.2d,v11.2s,v8.2s
	umlal	v21.2d,v11.2s,v0.2s

	add	v10.2s,v10.2s,v25.2s
	umlal	v22.2d,v9.2s,v5.2s
	umlal	v19.2d,v9.2s,v0.2s
	umlal	v23.2d,v9.2s,v7.2s
	umlal	v20.2d,v9.2s,v1.2s
	umlal	v21.2d,v9.2s,v3.2s

	add	v12.2s,v12.2s,v27.2s
	umlal	v22.2d,v10.2s,v3.2s
	umlal	v19.2d,v10.2s,v8.2s
	umlal	v23.2d,v10.2s,v5.2s
	umlal	v20.2d,v10.2s,v0.2s
	umlal	v21.2d,v10.2s,v1.2s

	add	v13.2s,v13.2s,v28.2s
	umlal	v22.2d,v12.2s,v0.2s
	umlal	v19.2d,v12.2s,v4.2s
	umlal	v23.2d,v12.2s,v1.2s
	umlal	v20.2d,v12.2s,v6.2s
	umlal	v21.2d,v12.2s,v8.2s

	umlal	v22.2d,v13.2s,v8.2s
	umlal	v19.2d,v13.2s,v2.2s
	umlal	v23.2d,v13.2s,v0.2s
	umlal	v20.2d,v13.2s,v4.2s
	umlal	v21.2d,v13.2s,v6.2s

.Lshort_tail:
	////////////////////////////////////////////////////////////////
	// horizontal add

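	// Each accumulator holds two independent 64-bit sums (even- and
	// odd-indexed blocks); addp folds the pair into a single lane
	// while the callee-saved SIMD registers are reloaded in between.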
	addp	v22.2d,v22.2d,v22.2d
	ldp	d8,d9,[sp,#16]		// meet ABI requirements
	addp	v19.2d,v19.2d,v19.2d
	ldp	d10,d11,[sp,#32]
	addp	v23.2d,v23.2d,v23.2d
	ldp	d12,d13,[sp,#48]
	addp	v20.2d,v20.2d,v20.2d
	ldp	d14,d15,[sp,#64]
	addp	v21.2d,v21.2d,v21.2d

	////////////////////////////////////////////////////////////////
	// lazy reduction, but without narrowing

	ushr	v29.2d,v22.2d,#26
	and	v22.16b,v22.16b,v31.16b
	ushr	v30.2d,v19.2d,#26
	and	v19.16b,v19.16b,v31.16b

	add	v23.2d,v23.2d,v29.2d	// h3 -> h4
	add	v20.2d,v20.2d,v30.2d	// h0 -> h1

	ushr	v29.2d,v23.2d,#26
	and	v23.16b,v23.16b,v31.16b
	ushr	v30.2d,v20.2d,#26
	and	v20.16b,v20.16b,v31.16b
	add	v21.2d,v21.2d,v30.2d	// h1 -> h2

	add	v19.2d,v19.2d,v29.2d
	shl	v29.2d,v29.2d,#2
	ushr	v30.2d,v21.2d,#26
	and	v21.16b,v21.16b,v31.16b
	add	v19.2d,v19.2d,v29.2d	// h4 -> h0
	add	v22.2d,v22.2d,v30.2d	// h2 -> h3

	ushr	v29.2d,v19.2d,#26
	and	v19.16b,v19.16b,v31.16b
	ushr	v30.2d,v22.2d,#26
	and	v22.16b,v22.16b,v31.16b
	add	v20.2d,v20.2d,v29.2d	// h0 -> h1
	add	v23.2d,v23.2d,v30.2d	// h3 -> h4

	////////////////////////////////////////////////////////////////
	// write the result, can be partially reduced

	st4	{v19.s,v20.s,v21.s,v22.s}[0],[x0],#16
	st1	{v23.s}[0],[x0]

.Lno_data_neon:
	ldr	x29,[sp],#80
.inst	0xd50323bf		// autiasp
	ret
.size	poly1305_blocks_neon,.-poly1305_blocks_neon

.type	poly1305_emit_neon,%function
.align	5
poly1305_emit_neon:
.Lpoly1305_emit_neon:
	ldr	x17,[x0,#24]
	cbz	x17,poly1305_emit

	ldp	w10,w11,[x0]		// load hash value base 2^26
	ldp	w12,w13,[x0,#8]
	ldr	w14,[x0,#16]

	add	x4,x10,x11,lsl#26	// base 2^26 -> base 2^64
	lsr	x5,x12,#12
	adds	x4,x4,x12,lsl#52
	add	x5,x5,x13,lsl#14
	adc	x5,x5,xzr
	lsr	x6,x14,#24
	adds	x5,x5,x14,lsl#40
	adc	x6,x6,xzr		// can be partially reduced...

	ldp	x10,x11,[x2]	// load nonce

	and	x12,x6,#-4		// ... so reduce
	add	x12,x12,x6,lsr#2
	and	x6,x6,#3
	adds	x4,x4,x12
	adcs	x5,x5,xzr
	adc	x6,x6,xzr

	adds	x12,x4,#5		// compare to modulus
	adcs	x13,x5,xzr
	adc	x14,x6,xzr

	tst	x14,#-4			// see if it's carried/borrowed

	csel	x4,x4,x12,eq
	csel	x5,x5,x13,eq

#ifdef	__AARCH64EB__
	ror	x10,x10,#32		// flip nonce words
	ror	x11,x11,#32
#endif
	adds	x4,x4,x10		// accumulate nonce
	adc	x5,x5,x11
#ifdef	__AARCH64EB__
	rev	x4,x4			// flip output bytes
	rev	x5,x5
#endif
	stp	x4,x5,[x1]		// write result

	ret
.size	poly1305_emit_neon,.-poly1305_emit_neon

.align	5
.Lzeros:
.long	0,0,0,0,0,0,0,0
.byte	80,111,108,121,49,51,48,53,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	2