#include "arm_arch.h"

.text

// forward "declarations" are required for Apple

.hidden	OPENSSL_armcap_P
.globl	poly1305_init
.hidden	poly1305_init
.globl	poly1305_blocks
.hidden	poly1305_blocks
.globl	poly1305_emit
.hidden	poly1305_emit

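// Poly1305 state block layout, as used throughout this file:
//	[x0,#0..23]	h, the accumulator: either two 64-bit limbs plus a
//			2-bit top limb (base 2^64), or five 26-bit limbs in
//			32-bit words (base 2^26, NEON path)
//	[x0,#24]	is_base2_26 flag, set once the NEON path has
//			converted h to base 2^26
//	[x0,#32..47]	r, the clamped key
//	[x0,#48..]	r^1..r^4 power table in base 2^26 (NEON path only)
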
.type	poly1305_init,%function
.align	5
poly1305_init:
	cmp	x1,xzr
	stp	xzr,xzr,[x0]		// zero hash value
	stp	xzr,xzr,[x0,#16]	// [along with is_base2_26]

	csel	x0,xzr,x0,eq
	b.eq	.Lno_key

#ifdef	__ILP32__
	ldrsw	x11,.LOPENSSL_armcap_P
#else
	ldr	x11,.LOPENSSL_armcap_P
#endif
	adr	x10,.LOPENSSL_armcap_P

	ldp	x7,x8,[x1]		// load key
	mov	x9,#0xfffffffc0fffffff
	movk	x9,#0x0fff,lsl#48
	ldr	w17,[x10,x11]
#ifdef	__ARMEB__
	rev	x7,x7			// flip bytes
	rev	x8,x8
#endif
	and	x7,x7,x9		// &=0ffffffc0fffffff
	and	x9,x9,#-4
	and	x8,x8,x9		// &=0ffffffc0ffffffc
	stp	x7,x8,[x0,#32]	// save key value

	tst	w17,#ARMV7_NEON

	adr	x12,poly1305_blocks
	adr	x7,poly1305_blocks_neon
	adr	x13,poly1305_emit
	adr	x8,poly1305_emit_neon

	csel	x12,x12,x7,eq
	csel	x13,x13,x8,eq

#ifdef	__ILP32__
	stp	w12,w13,[x2]
#else
	stp	x12,x13,[x2]
#endif

	mov	x0,#1
.Lno_key:
	ret
.size	poly1305_init,.-poly1305_init
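
// Roughly, in C (a sketch only; names and the func-table type are
// illustrative, not part of this file):
//	int poly1305_init(void *ctx, const u8 key[32], void (*func[2])())
//	{
//		h = 0;
//		if (key == NULL) return 0;
//		r0 = le64(key+0) & 0x0ffffffc0fffffff;	// clamp r
//		r1 = le64(key+8) & 0x0ffffffc0ffffffc;
//		func[0] = NEON ? poly1305_blocks_neon : poly1305_blocks;
//		func[1] = NEON ? poly1305_emit_neon   : poly1305_emit;
//		return 1;
//	}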

.type	poly1305_blocks,%function
.align	5
poly1305_blocks:
	ands	x2,x2,#-16
	b.eq	.Lno_data

	ldp	x4,x5,[x0]		// load hash value
	ldp	x7,x8,[x0,#32]	// load key value
	ldr	x6,[x0,#16]
	add	x9,x8,x8,lsr#2	// s1 = r1 + (r1 >> 2)
	b	.Loop

.align	5
.Loop:
	ldp	x10,x11,[x1],#16	// load input
	sub	x2,x2,#16
#ifdef	__ARMEB__
	rev	x10,x10
	rev	x11,x11
#endif
	adds	x4,x4,x10		// accumulate input
	adcs	x5,x5,x11

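	// h *= r (mod 2^130-5), schoolbook style.  With
	// h = h0 + h1*2^64 + h2*2^128 and r = r0 + r1*2^64, everything at
	// 2^128 and above folds back via 2^130 == 5 (mod p): r1 is a
	// multiple of 4, so h1*r1*2^128 == h1*(r1+(r1>>2)) == h1*s1 (mod p).
	// Column sums computed below:
	//	d0 = h0*r0 + h1*s1
	//	d1 = h0*r1 + h1*r0 + h2*s1
	//	d2 = h2*r0 (+ carries), reduced once more afterwards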
	mul	x12,x4,x7		// h0*r0
	adc	x6,x6,x3
	umulh	x13,x4,x7

	mul	x10,x5,x9		// h1*5*r1
	umulh	x11,x5,x9

	adds	x12,x12,x10
	mul	x10,x4,x8		// h0*r1
	adc	x13,x13,x11
	umulh	x14,x4,x8

	adds	x13,x13,x10
	mul	x10,x5,x7		// h1*r0
	adc	x14,x14,xzr
	umulh	x11,x5,x7

	adds	x13,x13,x10
	mul	x10,x6,x9		// h2*5*r1
	adc	x14,x14,x11
	mul	x11,x6,x7		// h2*r0

	adds	x13,x13,x10
	adc	x14,x14,x11

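	// Fold the >=2^130 part back in: for t = d2,
	// t*2^128 == (t&3)*2^128 + 5*(t>>2)
	//	   == (t&3)*2^128 + (t&~3) + (t>>2)  (mod p),
	// which is exactly the and/and/add below; h2 keeps just two bits.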
	and	x10,x14,#-4		// final reduction
	and	x6,x14,#3
	add	x10,x10,x14,lsr#2
	adds	x4,x12,x10
	adcs	x5,x13,xzr
	adc	x6,x6,xzr

	cbnz	x2,.Loop

	stp	x4,x5,[x0]		// store hash value
	str	x6,[x0,#16]

.Lno_data:
	ret
.size	poly1305_blocks,.-poly1305_blocks

.type	poly1305_emit,%function
.align	5
poly1305_emit:
	ldp	x4,x5,[x0]		// load hash base 2^64
	ldr	x6,[x0,#16]
	ldp	x10,x11,[x2]	// load nonce

	adds	x12,x4,#5		// compare to modulus
	adcs	x13,x5,xzr
	adc	x14,x6,xzr

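	// h may be only partially reduced, so add 5 and test whether the
	// sum reached bit 130: if it did, h >= 2^130-5 and the canonical
	// residue is h+5 with bit 130 dropped -- only 128 bits are
	// emitted, so x12,x13 already hold it.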
	tst	x14,#-4			// see if it's carried/borrowed

	csel	x4,x4,x12,eq
	csel	x5,x5,x13,eq

#ifdef	__ARMEB__
	ror	x10,x10,#32		// flip nonce words
	ror	x11,x11,#32
#endif
	adds	x4,x4,x10		// accumulate nonce
	adc	x5,x5,x11
#ifdef	__ARMEB__
	rev	x4,x4			// flip output bytes
	rev	x5,x5
#endif
	stp	x4,x5,[x1]		// write result

	ret
.size	poly1305_emit,.-poly1305_emit
.type	poly1305_mult,%function
.align	5
poly1305_mult:
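	// Out-of-line copy of the h = (h*r) mod 2^130-5 core from
	// poly1305_blocks, factored out as a subroutine for the NEON
	// paths below: h in x4-x6, r in x7-x8, s1 = r1+(r1>>2) in x9,
	// x10-x14 clobbered.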
	mul	x12,x4,x7		// h0*r0
	umulh	x13,x4,x7

	mul	x10,x5,x9		// h1*5*r1
	umulh	x11,x5,x9

	adds	x12,x12,x10
	mul	x10,x4,x8		// h0*r1
	adc	x13,x13,x11
	umulh	x14,x4,x8

	adds	x13,x13,x10
	mul	x10,x5,x7		// h1*r0
	adc	x14,x14,xzr
	umulh	x11,x5,x7

	adds	x13,x13,x10
	mul	x10,x6,x9		// h2*5*r1
	adc	x14,x14,x11
	mul	x11,x6,x7		// h2*r0

	adds	x13,x13,x10
	adc	x14,x14,x11

	and	x10,x14,#-4		// final reduction
	and	x6,x14,#3
	add	x10,x10,x14,lsr#2
	adds	x4,x12,x10
	adcs	x5,x13,xzr
	adc	x6,x6,xzr

	ret
.size	poly1305_mult,.-poly1305_mult

.type	poly1305_splat,%function
.align	5
poly1305_splat:
	and	x12,x4,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x13,x4,#26,#26
	extr	x14,x5,x4,#52
	and	x14,x14,#0x03ffffff
	ubfx	x15,x5,#14,#26
	extr	x16,x6,x5,#40

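	// The five 26-bit limbs above satisfy
	//	r = r[0] + r[1]*2^26 + r[2]*2^52 + r[3]*2^78 + r[4]*2^104,
	// with r[2] straddling the 64-bit word boundary, hence the extr.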
	str	w12,[x0,#16*0]	// r0
	add	w12,w13,w13,lsl#2	// r1*5
	str	w13,[x0,#16*1]	// r1
	add	w13,w14,w14,lsl#2	// r2*5
	str	w12,[x0,#16*2]	// s1
	str	w14,[x0,#16*3]	// r2
	add	w14,w15,w15,lsl#2	// r3*5
	str	w13,[x0,#16*4]	// s2
	str	w15,[x0,#16*5]	// r3
	add	w15,w16,w16,lsl#2	// r4*5
	str	w14,[x0,#16*6]	// s3
	str	w16,[x0,#16*7]	// r4
	str	w15,[x0,#16*8]	// s4
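	// The 16-byte store stride above leaves room for one 32-bit lane
	// per power of r: the caller steps x0 back 4 bytes between calls,
	// so four calls fill each .4s vector as lanes [r^4|r^3|r^2|r^1].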

	ret
.size	poly1305_splat,.-poly1305_splat

.type	poly1305_blocks_neon,%function
.align	5
poly1305_blocks_neon:
	ldr	x17,[x0,#24]
	cmp	x2,#128
	b.hs	.Lblocks_neon
	cbz	x17,poly1305_blocks

.Lblocks_neon:
.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-80]!
	add	x29,sp,#0

	ands	x2,x2,#-16
	b.eq	.Lno_data_neon

	cbz	x17,.Lbase2_64_neon

	ldp	w10,w11,[x0]		// load hash value base 2^26
	ldp	w12,w13,[x0,#8]
	ldr	w14,[x0,#16]

	tst	x2,#31
	b.eq	.Leven_neon

	ldp	x7,x8,[x0,#32]	// load key value

	add	x4,x10,x11,lsl#26	// base 2^26 -> base 2^64
	lsr	x5,x12,#12
	adds	x4,x4,x12,lsl#52
	add	x5,x5,x13,lsl#14
	adc	x5,x5,xzr
	lsr	x6,x14,#24
	adds	x5,x5,x14,lsl#40
	adc	x14,x6,xzr		// can be partially reduced...
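	// The sequence above repacks h = h0 + h1*2^26 + h2*2^52 +
	// h3*2^78 + h4*2^104 into two 64-bit words plus a small top word;
	// h2 and h4 straddle word boundaries, hence the paired shifts.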

	ldp	x12,x13,[x1],#16	// load input
	sub	x2,x2,#16
	add	x9,x8,x8,lsr#2	// s1 = r1 + (r1 >> 2)

	and	x10,x14,#-4		// ... so reduce
	and	x6,x14,#3
	add	x10,x10,x14,lsr#2
	adds	x4,x4,x10
	adcs	x5,x5,xzr
	adc	x6,x6,xzr

#ifdef	__ARMEB__
	rev	x12,x12
	rev	x13,x13
#endif
	adds	x4,x4,x12		// accumulate input
	adcs	x5,x5,x13
	adc	x6,x6,x3

	bl	poly1305_mult
	ldr	x30,[sp,#8]

	cbz	x3,.Lstore_base2_64_neon

	and	x10,x4,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x11,x4,#26,#26
	extr	x12,x5,x4,#52
	and	x12,x12,#0x03ffffff
	ubfx	x13,x5,#14,#26
	extr	x14,x6,x5,#40

	cbnz	x2,.Leven_neon

	stp	w10,w11,[x0]		// store hash value base 2^26
	stp	w12,w13,[x0,#8]
	str	w14,[x0,#16]
	b	.Lno_data_neon

.align	4
.Lstore_base2_64_neon:
	stp	x4,x5,[x0]		// store hash value base 2^64
	stp	x6,xzr,[x0,#16]	// note that is_base2_26 is zeroed
	b	.Lno_data_neon

.align	4
.Lbase2_64_neon:
	ldp	x7,x8,[x0,#32]	// load key value

	ldp	x4,x5,[x0]		// load hash value base 2^64
	ldr	x6,[x0,#16]

	tst	x2,#31
	b.eq	.Linit_neon

	ldp	x12,x13,[x1],#16	// load input
	sub	x2,x2,#16
	add	x9,x8,x8,lsr#2	// s1 = r1 + (r1 >> 2)
#ifdef	__ARMEB__
	rev	x12,x12
	rev	x13,x13
#endif
	adds	x4,x4,x12		// accumulate input
	adcs	x5,x5,x13
	adc	x6,x6,x3

	bl	poly1305_mult

.Linit_neon:
	and	x10,x4,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x11,x4,#26,#26
	extr	x12,x5,x4,#52
	and	x12,x12,#0x03ffffff
	ubfx	x13,x5,#14,#26
	extr	x14,x6,x5,#40

	stp	d8,d9,[sp,#16]		// meet ABI requirements
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]

	fmov	d24,x10
	fmov	d25,x11
	fmov	d26,x12
	fmov	d27,x13
	fmov	d28,x14

	////////////////////////////////// initialize r^n table
	mov	x4,x7			// r^1
	add	x9,x8,x8,lsr#2	// s1 = r1 + (r1 >> 2)
	mov	x5,x8
	mov	x6,xzr
	add	x0,x0,#48+12
	bl	poly1305_splat

	bl	poly1305_mult		// r^2
	sub	x0,x0,#4
	bl	poly1305_splat

	bl	poly1305_mult		// r^3
	sub	x0,x0,#4
	bl	poly1305_splat

	bl	poly1305_mult		// r^4
	sub	x0,x0,#4
	bl	poly1305_splat
	ldr	x30,[sp,#8]
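	// x7,x8 kept r across the calls above, so each poly1305_mult
	// multiplied the running value by r, producing r^2, r^3 and r^4;
	// each splat stored its power one 32-bit lane below the previous.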

	add	x16,x1,#32
	adr	x17,.Lzeros
	subs	x2,x2,#64
	csel	x16,x17,x16,lo

	mov	x4,#1
	str	x4,[x0,#-24]		// set is_base2_26
	sub	x0,x0,#48		// restore original x0
	b	.Ldo_neon

.align	4
.Leven_neon:
	add	x16,x1,#32
	adr	x17,.Lzeros
	subs	x2,x2,#64
	csel	x16,x17,x16,lo

	stp	d8,d9,[sp,#16]		// meet ABI requirements
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]

	fmov	d24,x10
	fmov	d25,x11
	fmov	d26,x12
	fmov	d27,x13
	fmov	d28,x14

.Ldo_neon:
	ldp	x8,x12,[x16],#16	// inp[2:3] (or zero)
	ldp	x9,x13,[x16],#48

	lsl	x3,x3,#24
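	// x3 is the caller's padbit; moved to bit 24 it tops up the
	// 24-bit fifth limb below, contributing 2^128 per block.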
	add	x15,x0,#48

#ifdef	__ARMEB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	and	x5,x9,#0x03ffffff
	ubfx	x6,x8,#26,#26
	ubfx	x7,x9,#26,#26
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
	extr	x8,x12,x8,#52
	extr	x9,x13,x9,#52
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	fmov	d14,x4
	and	x8,x8,#0x03ffffff
	and	x9,x9,#0x03ffffff
	ubfx	x10,x12,#14,#26
	ubfx	x11,x13,#14,#26
	add	x12,x3,x12,lsr#40
	add	x13,x3,x13,lsr#40
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	fmov	d15,x6
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	fmov	d16,x8
	fmov	d17,x10
	fmov	d18,x12

	ldp	x8,x12,[x1],#16	// inp[0:1]
	ldp	x9,x13,[x1],#48

	ld1	{v0.4s,v1.4s,v2.4s,v3.4s},[x15],#64
	ld1	{v4.4s,v5.4s,v6.4s,v7.4s},[x15],#64
	ld1	{v8.4s},[x15]
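	// v0-v8 now hold the power table, one coefficient per vector,
	// lanes [r^4|r^3|r^2|r^1]: v0=r0, v1=r1, v2=s1, v3=r2, v4=s2,
	// v5=r3, v6=s3, v7=r4, v8=s4 (here s_i = 5*r_i in base 2^26).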

#ifdef	__ARMEB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	and	x5,x9,#0x03ffffff
	ubfx	x6,x8,#26,#26
	ubfx	x7,x9,#26,#26
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
	extr	x8,x12,x8,#52
	extr	x9,x13,x9,#52
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	fmov	d9,x4
	and	x8,x8,#0x03ffffff
	and	x9,x9,#0x03ffffff
	ubfx	x10,x12,#14,#26
	ubfx	x11,x13,#14,#26
	add	x12,x3,x12,lsr#40
	add	x13,x3,x13,lsr#40
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	fmov	d10,x6
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	movi	v31.2d,#-1
	fmov	d11,x8
	fmov	d12,x10
	fmov	d13,x12
	ushr	v31.2d,v31.2d,#38

	b.ls	.Lskip_loop

.align	4
.Loop_neon:
	////////////////////////////////////////////////////////////////
	// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
	// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
	//   ___________________/
	// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
	// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
	//   ___________________/ ____________________/
	//
	// Note that we start with inp[2:3]*r^2. This is because it
	// doesn't depend on reduction in previous iteration.
	////////////////////////////////////////////////////////////////
	// d4 = h0*r4 + h1*r3   + h2*r2   + h3*r1   + h4*r0
	// d3 = h0*r3 + h1*r2   + h2*r1   + h3*r0   + h4*5*r4
	// d2 = h0*r2 + h1*r1   + h2*r0   + h3*5*r4 + h4*5*r3
	// d1 = h0*r1 + h1*r0   + h2*5*r4 + h3*5*r3 + h4*5*r2
	// d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1

	subs	x2,x2,#64
	umull	v23.2d,v14.2s,v7.s[2]
	csel	x16,x17,x16,lo
	umull	v22.2d,v14.2s,v5.s[2]
	umull	v21.2d,v14.2s,v3.s[2]
	ldp	x8,x12,[x16],#16	// inp[2:3] (or zero)
	umull	v20.2d,v14.2s,v1.s[2]
	ldp	x9,x13,[x16],#48
	umull	v19.2d,v14.2s,v0.s[2]
#ifdef	__ARMEB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif

	umlal	v23.2d,v15.2s,v5.s[2]
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	umlal	v22.2d,v15.2s,v3.s[2]
	and	x5,x9,#0x03ffffff
	umlal	v21.2d,v15.2s,v1.s[2]
	ubfx	x6,x8,#26,#26
	umlal	v20.2d,v15.2s,v0.s[2]
	ubfx	x7,x9,#26,#26
	umlal	v19.2d,v15.2s,v8.s[2]
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32

	umlal	v23.2d,v16.2s,v3.s[2]
	extr	x8,x12,x8,#52
	umlal	v22.2d,v16.2s,v1.s[2]
	extr	x9,x13,x9,#52
	umlal	v21.2d,v16.2s,v0.s[2]
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	umlal	v20.2d,v16.2s,v8.s[2]
	fmov	d14,x4
	umlal	v19.2d,v16.2s,v6.s[2]
	and	x8,x8,#0x03ffffff

	umlal	v23.2d,v17.2s,v1.s[2]
	and	x9,x9,#0x03ffffff
	umlal	v22.2d,v17.2s,v0.s[2]
	ubfx	x10,x12,#14,#26
	umlal	v21.2d,v17.2s,v8.s[2]
	ubfx	x11,x13,#14,#26
	umlal	v20.2d,v17.2s,v6.s[2]
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	umlal	v19.2d,v17.2s,v4.s[2]
	fmov	d15,x6

	add	v11.2s,v11.2s,v26.2s
	add	x12,x3,x12,lsr#40
	umlal	v23.2d,v18.2s,v0.s[2]
	add	x13,x3,x13,lsr#40
	umlal	v22.2d,v18.2s,v8.s[2]
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	umlal	v21.2d,v18.2s,v6.s[2]
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	umlal	v20.2d,v18.2s,v4.s[2]
	fmov	d16,x8
	umlal	v19.2d,v18.2s,v2.s[2]
	fmov	d17,x10

	////////////////////////////////////////////////////////////////
	// (hash+inp[0:1])*r^4 and accumulate

	add	v9.2s,v9.2s,v24.2s
	fmov	d18,x12
	umlal	v22.2d,v11.2s,v1.s[0]
	ldp	x8,x12,[x1],#16	// inp[0:1]
	umlal	v19.2d,v11.2s,v6.s[0]
	ldp	x9,x13,[x1],#48
	umlal	v23.2d,v11.2s,v3.s[0]
	umlal	v20.2d,v11.2s,v8.s[0]
	umlal	v21.2d,v11.2s,v0.s[0]
#ifdef	__ARMEB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif

	add	v10.2s,v10.2s,v25.2s
	umlal	v22.2d,v9.2s,v5.s[0]
	umlal	v23.2d,v9.2s,v7.s[0]
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	umlal	v21.2d,v9.2s,v3.s[0]
	and	x5,x9,#0x03ffffff
	umlal	v19.2d,v9.2s,v0.s[0]
	ubfx	x6,x8,#26,#26
	umlal	v20.2d,v9.2s,v1.s[0]
	ubfx	x7,x9,#26,#26

	add	v12.2s,v12.2s,v27.2s
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
	umlal	v22.2d,v10.2s,v3.s[0]
	extr	x8,x12,x8,#52
	umlal	v23.2d,v10.2s,v5.s[0]
	extr	x9,x13,x9,#52
	umlal	v19.2d,v10.2s,v8.s[0]
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	umlal	v21.2d,v10.2s,v1.s[0]
	fmov	d9,x4
	umlal	v20.2d,v10.2s,v0.s[0]
	and	x8,x8,#0x03ffffff

	add	v13.2s,v13.2s,v28.2s
	and	x9,x9,#0x03ffffff
	umlal	v22.2d,v12.2s,v0.s[0]
	ubfx	x10,x12,#14,#26
	umlal	v19.2d,v12.2s,v4.s[0]
	ubfx	x11,x13,#14,#26
	umlal	v23.2d,v12.2s,v1.s[0]
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	umlal	v20.2d,v12.2s,v6.s[0]
	fmov	d10,x6
	umlal	v21.2d,v12.2s,v8.s[0]
	add	x12,x3,x12,lsr#40

	umlal	v22.2d,v13.2s,v8.s[0]
	add	x13,x3,x13,lsr#40
	umlal	v19.2d,v13.2s,v2.s[0]
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	umlal	v23.2d,v13.2s,v0.s[0]
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	umlal	v20.2d,v13.2s,v4.s[0]
	fmov	d11,x8
	umlal	v21.2d,v13.2s,v6.s[0]
	fmov	d12,x10
	fmov	d13,x12

	/////////////////////////////////////////////////////////////////
	// lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
	// and P. Schwabe
	//
	// [see discussion in poly1305-armv4 module]
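	//
	// In limb terms (limbs accumulated in 64-bit lanes, M = 0x3ffffff):
	//	c = d(i)>>26; h(i) = d(i)&M; d(i+1) += c;
	// except that the h4 carry wraps around to h0 multiplied by 5
	// (2^130 == 5 mod p) -- the shl #2 plus two adds compute c + 4*c.
	// Carries are propagated only partially; limbs stay small enough
	// for the next round of multiplications.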

	ushr	v29.2d,v22.2d,#26
	xtn	v27.2s,v22.2d
	ushr	v30.2d,v19.2d,#26
	and	v19.16b,v19.16b,v31.16b
	add	v23.2d,v23.2d,v29.2d	// h3 -> h4
	bic	v27.2s,#0xfc,lsl#24	// &=0x03ffffff
	add	v20.2d,v20.2d,v30.2d	// h0 -> h1

	ushr	v29.2d,v23.2d,#26
	xtn	v28.2s,v23.2d
	ushr	v30.2d,v20.2d,#26
	xtn	v25.2s,v20.2d
	bic	v28.2s,#0xfc,lsl#24
	add	v21.2d,v21.2d,v30.2d	// h1 -> h2

	add	v19.2d,v19.2d,v29.2d
	shl	v29.2d,v29.2d,#2
	shrn	v30.2s,v21.2d,#26
	xtn	v26.2s,v21.2d
	add	v19.2d,v19.2d,v29.2d	// h4 -> h0
	bic	v25.2s,#0xfc,lsl#24
	add	v27.2s,v27.2s,v30.2s		// h2 -> h3
	bic	v26.2s,#0xfc,lsl#24

	shrn	v29.2s,v19.2d,#26
	xtn	v24.2s,v19.2d
	ushr	v30.2s,v27.2s,#26
	bic	v27.2s,#0xfc,lsl#24
	bic	v24.2s,#0xfc,lsl#24
	add	v25.2s,v25.2s,v29.2s		// h0 -> h1
	add	v28.2s,v28.2s,v30.2s		// h3 -> h4

	b.hi	.Loop_neon

.Lskip_loop:
	dup	v16.2d,v16.d[0]
	add	v11.2s,v11.2s,v26.2s

	////////////////////////////////////////////////////////////////
	// multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1

	adds	x2,x2,#32
	b.ne	.Long_tail

	dup	v16.2d,v11.d[0]
	add	v14.2s,v9.2s,v24.2s
	add	v17.2s,v12.2s,v27.2s
	add	v15.2s,v10.2s,v25.2s
	add	v18.2s,v13.2s,v28.2s

.Long_tail:
	dup	v14.2d,v14.d[0]
	umull2	v19.2d,v16.4s,v6.4s
	umull2	v22.2d,v16.4s,v1.4s
	umull2	v23.2d,v16.4s,v3.4s
	umull2	v21.2d,v16.4s,v0.4s
	umull2	v20.2d,v16.4s,v8.4s

	dup	v15.2d,v15.d[0]
	umlal2	v19.2d,v14.4s,v0.4s
	umlal2	v21.2d,v14.4s,v3.4s
	umlal2	v22.2d,v14.4s,v5.4s
	umlal2	v23.2d,v14.4s,v7.4s
	umlal2	v20.2d,v14.4s,v1.4s

	dup	v17.2d,v17.d[0]
	umlal2	v19.2d,v15.4s,v8.4s
	umlal2	v22.2d,v15.4s,v3.4s
	umlal2	v21.2d,v15.4s,v1.4s
	umlal2	v23.2d,v15.4s,v5.4s
	umlal2	v20.2d,v15.4s,v0.4s

	dup	v18.2d,v18.d[0]
	umlal2	v22.2d,v17.4s,v0.4s
	umlal2	v23.2d,v17.4s,v1.4s
	umlal2	v19.2d,v17.4s,v4.4s
	umlal2	v20.2d,v17.4s,v6.4s
	umlal2	v21.2d,v17.4s,v8.4s

	umlal2	v22.2d,v18.4s,v8.4s
	umlal2	v19.2d,v18.4s,v2.4s
	umlal2	v23.2d,v18.4s,v0.4s
	umlal2	v20.2d,v18.4s,v4.4s
	umlal2	v21.2d,v18.4s,v6.4s

	b.eq	.Lshort_tail

	////////////////////////////////////////////////////////////////
	// (hash+inp[0:1])*r^4:r^3 and accumulate

	add	v9.2s,v9.2s,v24.2s
	umlal	v22.2d,v11.2s,v1.2s
	umlal	v19.2d,v11.2s,v6.2s
	umlal	v23.2d,v11.2s,v3.2s
	umlal	v20.2d,v11.2s,v8.2s
	umlal	v21.2d,v11.2s,v0.2s

	add	v10.2s,v10.2s,v25.2s
	umlal	v22.2d,v9.2s,v5.2s
	umlal	v19.2d,v9.2s,v0.2s
	umlal	v23.2d,v9.2s,v7.2s
	umlal	v20.2d,v9.2s,v1.2s
	umlal	v21.2d,v9.2s,v3.2s

	add	v12.2s,v12.2s,v27.2s
	umlal	v22.2d,v10.2s,v3.2s
	umlal	v19.2d,v10.2s,v8.2s
	umlal	v23.2d,v10.2s,v5.2s
	umlal	v20.2d,v10.2s,v0.2s
	umlal	v21.2d,v10.2s,v1.2s

	add	v13.2s,v13.2s,v28.2s
	umlal	v22.2d,v12.2s,v0.2s
	umlal	v19.2d,v12.2s,v4.2s
	umlal	v23.2d,v12.2s,v1.2s
	umlal	v20.2d,v12.2s,v6.2s
	umlal	v21.2d,v12.2s,v8.2s

	umlal	v22.2d,v13.2s,v8.2s
	umlal	v19.2d,v13.2s,v2.2s
	umlal	v23.2d,v13.2s,v0.2s
	umlal	v20.2d,v13.2s,v4.2s
	umlal	v21.2d,v13.2s,v6.2s

.Lshort_tail:
	////////////////////////////////////////////////////////////////
	// horizontal add

	addp	v22.2d,v22.2d,v22.2d
	ldp	d8,d9,[sp,#16]		// meet ABI requirements
	addp	v19.2d,v19.2d,v19.2d
	ldp	d10,d11,[sp,#32]
	addp	v23.2d,v23.2d,v23.2d
	ldp	d12,d13,[sp,#48]
	addp	v20.2d,v20.2d,v20.2d
	ldp	d14,d15,[sp,#64]
	addp	v21.2d,v21.2d,v21.2d
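	// Each .2d accumulator held two interleaved block streams, one
	// per 64-bit lane; addp folds the pair into a single sum per limb.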

	////////////////////////////////////////////////////////////////
	// lazy reduction, but without narrowing

	ushr	v29.2d,v22.2d,#26
	and	v22.16b,v22.16b,v31.16b
	ushr	v30.2d,v19.2d,#26
	and	v19.16b,v19.16b,v31.16b

	add	v23.2d,v23.2d,v29.2d	// h3 -> h4
	add	v20.2d,v20.2d,v30.2d	// h0 -> h1

	ushr	v29.2d,v23.2d,#26
	and	v23.16b,v23.16b,v31.16b
	ushr	v30.2d,v20.2d,#26
	and	v20.16b,v20.16b,v31.16b
	add	v21.2d,v21.2d,v30.2d	// h1 -> h2

	add	v19.2d,v19.2d,v29.2d
	shl	v29.2d,v29.2d,#2
	ushr	v30.2d,v21.2d,#26
	and	v21.16b,v21.16b,v31.16b
	add	v19.2d,v19.2d,v29.2d	// h4 -> h0
	add	v22.2d,v22.2d,v30.2d	// h2 -> h3

	ushr	v29.2d,v19.2d,#26
	and	v19.16b,v19.16b,v31.16b
	ushr	v30.2d,v22.2d,#26
	and	v22.16b,v22.16b,v31.16b
	add	v20.2d,v20.2d,v29.2d	// h0 -> h1
	add	v23.2d,v23.2d,v30.2d	// h3 -> h4

	////////////////////////////////////////////////////////////////
	// write the result, can be partially reduced

	st4	{v19.s,v20.s,v21.s,v22.s}[0],[x0],#16
	st1	{v23.s}[0],[x0]
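	// The hash is left as five 26-bit limbs (with small carries
	// possibly pending); is_base2_26 remains set, so the next call
	// knows which representation it is getting.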

.Lno_data_neon:
	ldr	x29,[sp],#80
.inst	0xd50323bf		// autiasp
	ret
.size	poly1305_blocks_neon,.-poly1305_blocks_neon

.type	poly1305_emit_neon,%function
.align	5
poly1305_emit_neon:
	ldr	x17,[x0,#24]
	cbz	x17,poly1305_emit

	ldp	w10,w11,[x0]		// load hash value base 2^26
	ldp	w12,w13,[x0,#8]
	ldr	w14,[x0,#16]

	add	x4,x10,x11,lsl#26	// base 2^26 -> base 2^64
	lsr	x5,x12,#12
	adds	x4,x4,x12,lsl#52
	add	x5,x5,x13,lsl#14
	adc	x5,x5,xzr
	lsr	x6,x14,#24
	adds	x5,x5,x14,lsl#40
	adc	x6,x6,xzr		// can be partially reduced...

	ldp	x10,x11,[x2]	// load nonce

	and	x12,x6,#-4		// ... so reduce
	add	x12,x12,x6,lsr#2
	and	x6,x6,#3
	adds	x4,x4,x12
	adcs	x5,x5,xzr
	adc	x6,x6,xzr

	adds	x12,x4,#5		// compare to modulus
	adcs	x13,x5,xzr
	adc	x14,x6,xzr

	tst	x14,#-4			// see if it's carried/borrowed

	csel	x4,x4,x12,eq
	csel	x5,x5,x13,eq

#ifdef	__ARMEB__
	ror	x10,x10,#32		// flip nonce words
	ror	x11,x11,#32
#endif
	adds	x4,x4,x10		// accumulate nonce
	adc	x5,x5,x11
#ifdef	__ARMEB__
	rev	x4,x4			// flip output bytes
	rev	x5,x5
#endif
	stp	x4,x5,[x1]		// write result

	ret
.size	poly1305_emit_neon,.-poly1305_emit_neon

.align	5
.Lzeros:
.long	0,0,0,0,0,0,0,0
.LOPENSSL_armcap_P:
#ifdef	__ILP32__
.long	OPENSSL_armcap_P-.
#else
.quad	OPENSSL_armcap_P-.
#endif
.byte	80,111,108,121,49,51,48,53,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0	// "Poly1305 for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
.align	2
.align	2