/* $FreeBSD: stable/11/secure/lib/libcrypto/arm/ghash-armv4.S 305153 2016-08-31 20:33:59Z jkim $ */
/* Do not modify. This file is auto-generated from ghash-armv4.pl. */
#include "arm_arch.h"

.text
.code	32

#ifdef __clang__
#define ldrplb	ldrbpl
#define ldrneb	ldrbne
#endif

.type	rem_4bit,%object
.align	5
rem_4bit:
.short	0x0000,0x1C20,0x3840,0x2460
.short	0x7080,0x6CA0,0x48C0,0x54E0
.short	0xE100,0xFD20,0xD940,0xC560
.short	0x9180,0x8DA0,0xA9C0,0xB5E0
.size	rem_4bit,.-rem_4bit
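
@ Editorial note (hedged): rem_4bit appears to hold the reduction constants for
@ the 4-bit table-driven GHASH method, one entry per value of the nibble shifted
@ off the low end of Xi; each entry is XORed, shifted left by 16, into the top
@ word of the accumulator (cf. the rem_4bit table in OpenSSL's gcm128.c).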

.type	rem_4bit_get,%function
rem_4bit_get:
	sub	r2,pc,#8
	sub	r2,r2,#32	@ &rem_4bit
	b	.Lrem_4bit_got
	nop
.size	rem_4bit_get,.-rem_4bit_get

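@ gcm_ghash_4bit processes one or more 16-byte blocks with the 4-bit tables.
@ Assumed C prototype, per the usual OpenSSL GCM glue (not stated in this
@ generated file):
@   void gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16],
@                       const u8 *inp, size_t len);
@ so, under the AAPCS, r0=Xi, r1=Htable, r2=inp, r3=len (turned into an end
@ pointer below).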
.global	gcm_ghash_4bit
.type	gcm_ghash_4bit,%function
gcm_ghash_4bit:
	sub	r12,pc,#8
	add	r3,r2,r3		@ r3 to point at the end
	stmdb	sp!,{r3-r11,lr}		@ save r3/end too
	sub	r12,r12,#48		@ &rem_4bit

	ldmia	r12,{r4-r11}		@ copy rem_4bit ...
	stmdb	sp!,{r4-r11}		@ ... to stack

	ldrb	r12,[r2,#15]
	ldrb	r14,[r0,#15]
.Louter:
	eor	r12,r12,r14
	and	r14,r12,#0xf0
	and	r12,r12,#0x0f
	mov	r3,#14

	add	r7,r1,r12,lsl#4
	ldmia	r7,{r4-r7}	@ load Htbl[nlo]
	add	r11,r1,r14
	ldrb	r12,[r2,#14]

	and	r14,r4,#0xf		@ rem
	ldmia	r11,{r8-r11}	@ load Htbl[nhi]
	add	r14,r14,r14
	eor	r4,r8,r4,lsr#4
	ldrh	r8,[sp,r14]		@ rem_4bit[rem]
	eor	r4,r4,r5,lsl#28
	ldrb	r14,[r0,#14]
	eor	r5,r9,r5,lsr#4
	eor	r5,r5,r6,lsl#28
	eor	r6,r10,r6,lsr#4
	eor	r6,r6,r7,lsl#28
	eor	r7,r11,r7,lsr#4
	eor	r12,r12,r14
	and	r14,r12,#0xf0
	and	r12,r12,#0x0f
	eor	r7,r7,r8,lsl#16

.Linner:
	add	r11,r1,r12,lsl#4
	and	r12,r4,#0xf		@ rem
	subs	r3,r3,#1
	add	r12,r12,r12
	ldmia	r11,{r8-r11}	@ load Htbl[nlo]
	eor	r4,r8,r4,lsr#4
	eor	r4,r4,r5,lsl#28
	eor	r5,r9,r5,lsr#4
	eor	r5,r5,r6,lsl#28
	ldrh	r8,[sp,r12]		@ rem_4bit[rem]
	eor	r6,r10,r6,lsr#4
	ldrplb	r12,[r2,r3]
	eor	r6,r6,r7,lsl#28
	eor	r7,r11,r7,lsr#4

	add	r11,r1,r14
	and	r14,r4,#0xf		@ rem
	eor	r7,r7,r8,lsl#16	@ ^= rem_4bit[rem]
	add	r14,r14,r14
	ldmia	r11,{r8-r11}	@ load Htbl[nhi]
	eor	r4,r8,r4,lsr#4
	ldrplb	r8,[r0,r3]
	eor	r4,r4,r5,lsl#28
	eor	r5,r9,r5,lsr#4
	ldrh	r9,[sp,r14]
	eor	r5,r5,r6,lsl#28
	eor	r6,r10,r6,lsr#4
	eor	r6,r6,r7,lsl#28
	eorpl	r12,r12,r8
	eor	r7,r11,r7,lsr#4
	andpl	r14,r12,#0xf0
	andpl	r12,r12,#0x0f
	eor	r7,r7,r9,lsl#16	@ ^= rem_4bit[rem]
	bpl	.Linner

	ldr	r3,[sp,#32]		@ re-load r3/end
	add	r2,r2,#16
	mov	r14,r4
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r4,r4
	str	r4,[r0,#12]
#elif defined(__ARMEB__)
	str	r4,[r0,#12]
#else
	mov	r9,r4,lsr#8
	strb	r4,[r0,#12+3]
	mov	r10,r4,lsr#16
	strb	r9,[r0,#12+2]
	mov	r11,r4,lsr#24
	strb	r10,[r0,#12+1]
	strb	r11,[r0,#12]
#endif
	cmp	r2,r3
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r5,r5
	str	r5,[r0,#8]
#elif defined(__ARMEB__)
	str	r5,[r0,#8]
#else
	mov	r9,r5,lsr#8
	strb	r5,[r0,#8+3]
	mov	r10,r5,lsr#16
	strb	r9,[r0,#8+2]
	mov	r11,r5,lsr#24
	strb	r10,[r0,#8+1]
	strb	r11,[r0,#8]
#endif
	ldrneb	r12,[r2,#15]
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r6,r6
	str	r6,[r0,#4]
#elif defined(__ARMEB__)
	str	r6,[r0,#4]
#else
	mov	r9,r6,lsr#8
	strb	r6,[r0,#4+3]
	mov	r10,r6,lsr#16
	strb	r9,[r0,#4+2]
	mov	r11,r6,lsr#24
	strb	r10,[r0,#4+1]
	strb	r11,[r0,#4]
#endif

#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r7,r7
	str	r7,[r0,#0]
#elif defined(__ARMEB__)
	str	r7,[r0,#0]
#else
	mov	r9,r7,lsr#8
	strb	r7,[r0,#0+3]
	mov	r10,r7,lsr#16
	strb	r9,[r0,#0+2]
	mov	r11,r7,lsr#24
	strb	r10,[r0,#0+1]
	strb	r11,[r0,#0]
#endif

	bne	.Louter

	add	sp,sp,#36
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r11,pc}
#else
	ldmia	sp!,{r4-r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
#endif
.size	gcm_ghash_4bit,.-gcm_ghash_4bit

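@ gcm_gmult_4bit multiplies Xi by H in place using the same 4-bit tables.
@ Assumed prototype (from the usual OpenSSL GCM glue, not this file):
@   void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16]);
@ i.e. r0=Xi, r1=Htable; r2 is set to &rem_4bit via rem_4bit_get.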
.global	gcm_gmult_4bit
.type	gcm_gmult_4bit,%function
gcm_gmult_4bit:
	stmdb	sp!,{r4-r11,lr}
	ldrb	r12,[r0,#15]
	b	rem_4bit_get
.Lrem_4bit_got:
	and	r14,r12,#0xf0
	and	r12,r12,#0x0f
	mov	r3,#14

	add	r7,r1,r12,lsl#4
	ldmia	r7,{r4-r7}	@ load Htbl[nlo]
	ldrb	r12,[r0,#14]

	add	r11,r1,r14
	and	r14,r4,#0xf		@ rem
	ldmia	r11,{r8-r11}	@ load Htbl[nhi]
	add	r14,r14,r14
	eor	r4,r8,r4,lsr#4
	ldrh	r8,[r2,r14]	@ rem_4bit[rem]
	eor	r4,r4,r5,lsl#28
	eor	r5,r9,r5,lsr#4
	eor	r5,r5,r6,lsl#28
	eor	r6,r10,r6,lsr#4
	eor	r6,r6,r7,lsl#28
	eor	r7,r11,r7,lsr#4
	and	r14,r12,#0xf0
	eor	r7,r7,r8,lsl#16
	and	r12,r12,#0x0f

.Loop:
	add	r11,r1,r12,lsl#4
	and	r12,r4,#0xf		@ rem
	subs	r3,r3,#1
	add	r12,r12,r12
	ldmia	r11,{r8-r11}	@ load Htbl[nlo]
	eor	r4,r8,r4,lsr#4
	eor	r4,r4,r5,lsl#28
	eor	r5,r9,r5,lsr#4
	eor	r5,r5,r6,lsl#28
	ldrh	r8,[r2,r12]	@ rem_4bit[rem]
	eor	r6,r10,r6,lsr#4
	ldrplb	r12,[r0,r3]
	eor	r6,r6,r7,lsl#28
	eor	r7,r11,r7,lsr#4

	add	r11,r1,r14
	and	r14,r4,#0xf		@ rem
	eor	r7,r7,r8,lsl#16	@ ^= rem_4bit[rem]
	add	r14,r14,r14
	ldmia	r11,{r8-r11}	@ load Htbl[nhi]
	eor	r4,r8,r4,lsr#4
	eor	r4,r4,r5,lsl#28
	eor	r5,r9,r5,lsr#4
	ldrh	r8,[r2,r14]	@ rem_4bit[rem]
	eor	r5,r5,r6,lsl#28
	eor	r6,r10,r6,lsr#4
	eor	r6,r6,r7,lsl#28
	eor	r7,r11,r7,lsr#4
	andpl	r14,r12,#0xf0
	andpl	r12,r12,#0x0f
	eor	r7,r7,r8,lsl#16	@ ^= rem_4bit[rem]
	bpl	.Loop
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r4,r4
	str	r4,[r0,#12]
#elif defined(__ARMEB__)
	str	r4,[r0,#12]
#else
	mov	r9,r4,lsr#8
	strb	r4,[r0,#12+3]
	mov	r10,r4,lsr#16
	strb	r9,[r0,#12+2]
	mov	r11,r4,lsr#24
	strb	r10,[r0,#12+1]
	strb	r11,[r0,#12]
#endif

#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r5,r5
	str	r5,[r0,#8]
#elif defined(__ARMEB__)
	str	r5,[r0,#8]
#else
	mov	r9,r5,lsr#8
	strb	r5,[r0,#8+3]
	mov	r10,r5,lsr#16
	strb	r9,[r0,#8+2]
	mov	r11,r5,lsr#24
	strb	r10,[r0,#8+1]
	strb	r11,[r0,#8]
#endif

#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r6,r6
	str	r6,[r0,#4]
#elif defined(__ARMEB__)
	str	r6,[r0,#4]
#else
	mov	r9,r6,lsr#8
	strb	r6,[r0,#4+3]
	mov	r10,r6,lsr#16
	strb	r9,[r0,#4+2]
	mov	r11,r6,lsr#24
	strb	r10,[r0,#4+1]
	strb	r11,[r0,#4]
#endif

#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r7,r7
	str	r7,[r0,#0]
#elif defined(__ARMEB__)
	str	r7,[r0,#0]
#else
	mov	r9,r7,lsr#8
	strb	r7,[r0,#0+3]
	mov	r10,r7,lsr#16
	strb	r9,[r0,#0+2]
	mov	r11,r7,lsr#24
	strb	r10,[r0,#0+1]
	strb	r11,[r0,#0]
#endif

#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r11,pc}
#else
	ldmia	sp!,{r4-r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
#endif
.size	gcm_gmult_4bit,.-gcm_gmult_4bit
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

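@ gcm_init_neon builds the "twisted" H consumed by the NEON gmult/ghash
@ routines below. Assumed prototype (from the OpenSSL glue, not this file):
@   void gcm_init_neon(u128 Htable[16], const u64 H[2]);
@ i.e. r0=Htable (only the first 16 bytes are written here), r1=H.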
.global	gcm_init_neon
.type	gcm_init_neon,%function
.align	4
gcm_init_neon:
	vld1.64		d7,[r1,:64]!	@ load H
	vmov.i8		q8,#0xe1
	vld1.64		d6,[r1,:64]
	vshl.i64	d17,#57
	vshr.u64	d16,#63		@ t0=0xc2....01
	vdup.8		q9,d7[7]
	vshr.u64	d26,d6,#63
	vshr.s8		q9,#7			@ broadcast carry bit
	vshl.i64	q3,q3,#1
	vand		q8,q8,q9
	vorr		d7,d26		@ H<<<=1
	veor		q3,q3,q8		@ twisted H
	vstmia		r0,{q3}

	bx	lr					@ bx lr
.size	gcm_init_neon,.-gcm_init_neon

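@ gcm_gmult_neon: single-block GHASH multiply using NEON vmull.p8 with a
@ Karatsuba split; it branches into the .Lgmult_neon body shared with
@ gcm_ghash_neon. Assumed prototype (OpenSSL glue):
@   void gcm_gmult_neon(u64 Xi[2], const u128 Htable[16]);
@ i.e. r0=Xi, r1=twisted H from gcm_init_neon.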
.global	gcm_gmult_neon
.type	gcm_gmult_neon,%function
.align	4
gcm_gmult_neon:
	vld1.64		d7,[r0,:64]!	@ load Xi
	vld1.64		d6,[r0,:64]!
	vmov.i64	d29,#0x0000ffffffffffff
	vldmia		r1,{d26-d27}	@ load twisted H
	vmov.i64	d30,#0x00000000ffffffff
#ifdef __ARMEL__
	vrev64.8	q3,q3
#endif
	vmov.i64	d31,#0x000000000000ffff
	veor		d28,d26,d27		@ Karatsuba pre-processing
	mov		r3,#16
	b		.Lgmult_neon
.size	gcm_gmult_neon,.-gcm_gmult_neon

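@ gcm_ghash_neon hashes len bytes of input (a multiple of 16) into Xi using
@ the NEON vmull.p8/Karatsuba multiply followed by the polynomial reduction.
@ Assumed prototype (OpenSSL glue):
@   void gcm_ghash_neon(u64 Xi[2], const u128 Htable[16],
@                       const u8 *inp, size_t len);
@ i.e. r0=Xi, r1=twisted H, r2=inp, r3=len.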
.global	gcm_ghash_neon
.type	gcm_ghash_neon,%function
.align	4
gcm_ghash_neon:
	vld1.64		d1,[r0,:64]!	@ load Xi
	vld1.64		d0,[r0,:64]!
	vmov.i64	d29,#0x0000ffffffffffff
	vldmia		r1,{d26-d27}	@ load twisted H
	vmov.i64	d30,#0x00000000ffffffff
#ifdef __ARMEL__
	vrev64.8	q0,q0
#endif
	vmov.i64	d31,#0x000000000000ffff
	veor		d28,d26,d27		@ Karatsuba pre-processing

.Loop_neon:
	vld1.64		d7,[r2]!		@ load inp
	vld1.64		d6,[r2]!
#ifdef __ARMEL__
	vrev64.8	q3,q3
#endif
	veor		q3,q0			@ inp^=Xi
.Lgmult_neon:
	vext.8		d16, d26, d26, #1	@ A1
	vmull.p8	q8, d16, d6		@ F = A1*B
	vext.8		d0, d6, d6, #1	@ B1
	vmull.p8	q0, d26, d0		@ E = A*B1
	vext.8		d18, d26, d26, #2	@ A2
	vmull.p8	q9, d18, d6		@ H = A2*B
	vext.8		d22, d6, d6, #2	@ B2
	vmull.p8	q11, d26, d22		@ G = A*B2
	vext.8		d20, d26, d26, #3	@ A3
	veor		q8, q8, q0		@ L = E + F
	vmull.p8	q10, d20, d6		@ J = A3*B
	vext.8		d0, d6, d6, #3	@ B3
	veor		q9, q9, q11		@ M = G + H
	vmull.p8	q0, d26, d0		@ I = A*B3
	veor		d16, d16, d17	@ t0 = (L) (P0 + P1) << 8
	vand		d17, d17, d29
	vext.8		d22, d6, d6, #4	@ B4
	veor		d18, d18, d19	@ t1 = (M) (P2 + P3) << 16
	vand		d19, d19, d30
	vmull.p8	q11, d26, d22		@ K = A*B4
	veor		q10, q10, q0		@ N = I + J
	veor		d16, d16, d17
	veor		d18, d18, d19
	veor		d20, d20, d21	@ t2 = (N) (P4 + P5) << 24
	vand		d21, d21, d31
	vext.8		q8, q8, q8, #15
	veor		d22, d22, d23	@ t3 = (K) (P6 + P7) << 32
	vmov.i64	d23, #0
	vext.8		q9, q9, q9, #14
	veor		d20, d20, d21
	vmull.p8	q0, d26, d6		@ D = A*B
	vext.8		q11, q11, q11, #12
	vext.8		q10, q10, q10, #13
	veor		q8, q8, q9
	veor		q10, q10, q11
	veor		q0, q0, q8
	veor		q0, q0, q10
	veor		d6,d6,d7	@ Karatsuba pre-processing
	vext.8		d16, d28, d28, #1	@ A1
	vmull.p8	q8, d16, d6		@ F = A1*B
	vext.8		d2, d6, d6, #1	@ B1
	vmull.p8	q1, d28, d2		@ E = A*B1
	vext.8		d18, d28, d28, #2	@ A2
	vmull.p8	q9, d18, d6		@ H = A2*B
	vext.8		d22, d6, d6, #2	@ B2
	vmull.p8	q11, d28, d22		@ G = A*B2
	vext.8		d20, d28, d28, #3	@ A3
	veor		q8, q8, q1		@ L = E + F
	vmull.p8	q10, d20, d6		@ J = A3*B
	vext.8		d2, d6, d6, #3	@ B3
	veor		q9, q9, q11		@ M = G + H
	vmull.p8	q1, d28, d2		@ I = A*B3
	veor		d16, d16, d17	@ t0 = (L) (P0 + P1) << 8
	vand		d17, d17, d29
	vext.8		d22, d6, d6, #4	@ B4
	veor		d18, d18, d19	@ t1 = (M) (P2 + P3) << 16
	vand		d19, d19, d30
	vmull.p8	q11, d28, d22		@ K = A*B4
	veor		q10, q10, q1		@ N = I + J
	veor		d16, d16, d17
	veor		d18, d18, d19
	veor		d20, d20, d21	@ t2 = (N) (P4 + P5) << 24
	vand		d21, d21, d31
	vext.8		q8, q8, q8, #15
	veor		d22, d22, d23	@ t3 = (K) (P6 + P7) << 32
	vmov.i64	d23, #0
	vext.8		q9, q9, q9, #14
	veor		d20, d20, d21
	vmull.p8	q1, d28, d6		@ D = A*B
	vext.8		q11, q11, q11, #12
	vext.8		q10, q10, q10, #13
	veor		q8, q8, q9
	veor		q10, q10, q11
	veor		q1, q1, q8
	veor		q1, q1, q10
	vext.8		d16, d27, d27, #1	@ A1
	vmull.p8	q8, d16, d7		@ F = A1*B
	vext.8		d4, d7, d7, #1	@ B1
	vmull.p8	q2, d27, d4		@ E = A*B1
	vext.8		d18, d27, d27, #2	@ A2
	vmull.p8	q9, d18, d7		@ H = A2*B
	vext.8		d22, d7, d7, #2	@ B2
	vmull.p8	q11, d27, d22		@ G = A*B2
	vext.8		d20, d27, d27, #3	@ A3
	veor		q8, q8, q2		@ L = E + F
	vmull.p8	q10, d20, d7		@ J = A3*B
	vext.8		d4, d7, d7, #3	@ B3
	veor		q9, q9, q11		@ M = G + H
	vmull.p8	q2, d27, d4		@ I = A*B3
	veor		d16, d16, d17	@ t0 = (L) (P0 + P1) << 8
	vand		d17, d17, d29
	vext.8		d22, d7, d7, #4	@ B4
	veor		d18, d18, d19	@ t1 = (M) (P2 + P3) << 16
	vand		d19, d19, d30
	vmull.p8	q11, d27, d22		@ K = A*B4
	veor		q10, q10, q2		@ N = I + J
	veor		d16, d16, d17
	veor		d18, d18, d19
	veor		d20, d20, d21	@ t2 = (N) (P4 + P5) << 24
	vand		d21, d21, d31
	vext.8		q8, q8, q8, #15
	veor		d22, d22, d23	@ t3 = (K) (P6 + P7) << 32
	vmov.i64	d23, #0
	vext.8		q9, q9, q9, #14
	veor		d20, d20, d21
	vmull.p8	q2, d27, d7		@ D = A*B
	vext.8		q11, q11, q11, #12
	vext.8		q10, q10, q10, #13
	veor		q8, q8, q9
	veor		q10, q10, q11
	veor		q2, q2, q8
	veor		q2, q2, q10
	veor		q1,q1,q0		@ Karatsuba post-processing
	veor		q1,q1,q2
	veor		d1,d1,d2
	veor		d4,d4,d3	@ Xh|Xl - 256-bit result

	@ equivalent of reduction_avx from ghash-x86_64.pl
	vshl.i64	q9,q0,#57		@ 1st phase
	vshl.i64	q10,q0,#62
	veor		q10,q10,q9		@
	vshl.i64	q9,q0,#63
	veor		q10, q10, q9		@
	veor		d1,d1,d20	@
	veor		d4,d4,d21

	vshr.u64	q10,q0,#1		@ 2nd phase
	veor		q2,q2,q0
	veor		q0,q0,q10		@
	vshr.u64	q10,q10,#6
	vshr.u64	q0,q0,#1		@
	veor		q0,q0,q2		@
	veor		q0,q0,q10		@

	subs		r3,#16
	bne		.Loop_neon

#ifdef __ARMEL__
	vrev64.8	q0,q0
#endif
	sub		r0,#16
	vst1.64		d1,[r0,:64]!	@ write out Xi
	vst1.64		d0,[r0,:64]

	bx	lr					@ bx lr
.size	gcm_ghash_neon,.-gcm_ghash_neon
#endif
.asciz	"GHASH for ARMv4/NEON, CRYPTOGAMS by <appro@openssl.org>"
.align	2