#include <arm/arch.h>

#define BASE 65521	    /* largest prime smaller than 65536 */
#define NMAX 5552		/* NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 */

// Note: buf should have been 16-byte aligned in the caller function.

// uLong adler32_vec(unsigned int adler, unsigned int sum2, const Bytef* buf, int len) {
//    unsigned n;
//    while (len >= NMAX) {
//        len -= NMAX;
//        n = NMAX / 16;          /* NMAX is divisible by 16 */
//        do {
//            DO16(buf);          /* 16 sums unrolled */
//            buf += 16;
//        } while (--n);
//        MOD(adler);
//        MOD(sum2);
//    }
//    if (len) {                  /* avoid modulos if none remaining */
//        while (len >= 16) {
//            len -= 16;
//            DO16(buf);
//            buf += 16;
//        }
//        while (len--) {
//            adler += *buf++;
//            sum2 += adler;
//        }
//        MOD(adler);
//        MOD(sum2);
//    }
//    return adler | (sum2 << 16); 		/* return recombined sums */
// }
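//
// The DO16 and MOD macros in the reference code above are the scalar helpers from
// zlib's adler32.c. A paraphrased sketch of what they expand to (for reference only,
// not the verbatim zlib source):
//
//    #define DO1(buf,i)  { adler += (buf)[i]; sum2 += adler; }
//    #define DO2(buf,i)  DO1(buf,i); DO1(buf,i+1);
//    #define DO4(buf,i)  DO2(buf,i); DO2(buf,i+2);
//    #define DO8(buf,i)  DO4(buf,i); DO4(buf,i+4);
//    #define DO16(buf)   DO8(buf,0); DO8(buf,8);
//    #define MOD(a)      a %= BASE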


/*
	DO16 vectorization:
	given initial unsigned int sum2 and adler, and a new set of 16 input bytes (x[0:15]), it can be shown that
	sum2  += (16*adler + 16*x[0] + 15*x[1] + ... + 1*x[15]);
	adler += (x[0] + x[1] + ... + x[15]);

	therefore, the computation can be vectorized as follows (a scalar C sketch of this identity follows this comment block):
	1. 16-byte aligned vector load into q2 (x[0:15])
	2. sum2 += (adler<<4);
	3. vmull.u8 (q9,q8),q2,d2 where d2 = (1,1,1,...,1); (q9,q8) : 16 16-bit elements x[0:15]
	4. vmull.u8 (q11,q10),q2,q0 where q0 = (16,15,...,1); (q11,q10) : 16 16-bit elements (16:1)*x[0:15]
	5. parallel add (with one widening step to 32-bit) (q9,q8) and (q11,q10) all the way down, accumulating into adler and sum2

	In this revision, whenever possible, 2 DO16 loops are combined into a DO32 loop:
	1. 32-byte aligned vector load into q2,q14 (x[0:31])
	2. sum2 += (adler<<5);
	3. vmull.u8 (4 q registers),(q2,q14),d2 where d2 = (1,1,1,...,1); (4 q registers) : 32 16-bit elements x[0:31]
	4. vmull.u8 (4 q registers),(q2,q14),(q15,q0) where (q15,q0) = (32,31,...,1); (4 q regs) : 32 16-bit elements (32:1)*x[0:31]
	5. parallel add (with one widening step to 32-bit) the pair of (4 q regs) all the way down, accumulating into adler and sum2

	This change improves performance by ~0.55 cycles per uncompressed byte on ARM Cortex-A8.

*/
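
/*
	A scalar C sketch of the DO16 identity above (illustration only; this helper is
	hypothetical and not part of the build):

	static void do16_scalar(unsigned *adler, unsigned *sum2, const unsigned char x[16])
	{
		unsigned i, plain = 0, weighted = 0;
		for (i = 0; i < 16; i++) {
			plain    += x[i];                // x[0] + x[1] + ... + x[15]
			weighted += (16 - i) * x[i];     // 16*x[0] + 15*x[1] + ... + 1*x[15]
		}
		*sum2  += 16 * (*adler) + weighted;  // sum2 += 16*adler + weighted sum
		*adler += plain;                     // adler += plain sum
	}

	The NEON code below computes "plain" and "weighted" with vmull.u8 against the
	all-ones and (16,...,1) coefficient vectors, then reduces with pairwise adds.
*/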

/*
	MOD implementation:
	adler%BASE = adler - floor(adler*(1/BASE))*BASE; where (1/BASE) = 0x80078071 in Q47 (i.e., ceil(2^47/BASE))
	1. vmull.u32   q2,(adler,sum2),(1/BASE)		// *(1/BASE) in Q47
	2. vshr.u64    q2,q2,#47					// floor function
	3. vpadd.u32   d4,d4,d5						// merge into a double word in d4
	4. vmls.u32    (adler,sum2),d4,d3[0]        // (adler,sum2) -= floor[(adler,sum2)/BASE]*BASE

*/
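
/*
	A scalar C sketch of the same reciprocal reduction (illustration only; the helper
	name is hypothetical):

	static unsigned mod_base(unsigned v)
	{
		unsigned q = (unsigned)(((unsigned long long)v * 0x80078071ULL) >> 47);  // floor(v/BASE) for 32-bit v
		return v - q * 65521u;                                                   // v mod BASE
	}

	Steps 1-4 above perform this for adler and sum2 in parallel, packed in one d register.
*/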

#if defined _ARM_ARCH_6			// this file would be used only for armv6 or above


	.text
	.align 2
	.globl _adler32_vec
_adler32_vec:

#if (!KERNEL_SUPPORT_NEON) || (!defined _ARM_ARCH_7)	// for armv6 or armv7 without neon support


	#define	adler			r0
	#define	sum2			r1
	#define	buf				r2
	#define	len				r3
	#define	one_by_base		r4
	#define	base			r5
	#define	nmax			r6
	#define	t				r12
	#define	vecs			lr
	#define	x0				r8
	#define	x1				r10
	#define	x2				r11
	#define	x3				r12
	#define	zero			r9

	// this macro performs adler/sum2 update for 4 input bytes

	.macro DO4
	add		sum2, adler, lsl #2				// sum2 += 4*adler;
	ldr		x0,[buf]						// 4 bytes in 1 32-bit word
	usada8	adler, x0, zero, adler			// adler += sum(x0:x3)
	ldrb	x0,[buf], #4					// x0
	ldrb	x2,[buf,#-2]					// x2
	ldrb	x1,[buf,#-3]					// x1
	ldrb	x3,[buf,#-1]					// x3
	add		sum2, x0, lsl #2				// sum2 += 4*x0
	add		x3, x3, x1, lsl #1				// x3+2*x1
	add		sum2, x2, lsl #1				// sum2 += 2*x2
	add		x3, x1							// x3+3*x1
	add		sum2, x3						// sum2 += x3+3*x1
	.endm
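
	// A scalar C sketch of what one DO4 computes for bytes x0..x3 (illustration only):
	//
	//     sum2  += 4*adler + 4*x0 + 3*x1 + 2*x2 + 1*x3;
	//     adler += x0 + x1 + x2 + x3;
	//
	// usada8 produces the plain byte sum in one instruction; the weighted sum for sum2
	// is assembled from the individually reloaded bytes.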

	// the following macro cascades 4 DO4 into an adler/sum2 update for 16 bytes
	.macro DO16
	DO4										// adler/sum2 update for 4 input bytes
	DO4										// adler/sum2 update for 4 input bytes
	DO4										// adler/sum2 update for 4 input bytes
	DO4										// adler/sum2 update for 4 input bytes
	.endm

	// the following macro reduces adler and sum2 modulo BASE
	.macro	modulo_base
	umull	x0,x1,adler,one_by_base			// x1 = high word of adler*(1/BASE) in Q47
	umull	x2,x3,sum2,one_by_base			// x3 = high word of sum2*(1/BASE) in Q47
	lsr		x1, #15							// x1 >>= 15 : total shift of 47 -> floor(adler/BASE)
	lsr		x3, #15							// x3 >>= 15 : total shift of 47 -> floor(sum2/BASE)
	mla		adler, x1, base, adler			// adler -= floor(adler/BASE)*BASE (base holds -BASE)
	mla		sum2, x3, base, sum2			// sum2  -= floor(sum2/BASE)*BASE  (base holds -BASE)
	.endm
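
	// A scalar sketch of one modulo_base pair of lines (illustration only): umull leaves
	// the upper 32 bits of the 64-bit product in x1, so the further right shift by 15
	// gives a total shift of 47:
	//
	//     q     = (uint32_t)(((uint64_t)adler * 0x80078071u) >> 47);  // floor(adler/BASE)
	//     adler = adler + q * (uint32_t)(-BASE);                      // adler -= q*BASE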

	adr		t, coeffs
	push	{r4-r6, r8-r11, lr}
	ldmia	t, {one_by_base, base, nmax}	// load up coefficients

	subs		len, nmax					// pre-subtract len by NMAX
	eor			zero, zero					// a dummy zero register to use usada8 instruction
	blt			len_lessthan_NMAX			// if (len < NMAX) skip the while loop

while_lengenmax_loop:						// do {
	lsr			vecs, nmax, #4				// vecs = NMAX/16;

len16_loop:									// do {

	DO16

	subs		vecs, #1					// vecs--;
	bgt			len16_loop					// } while (vecs > 0);

	modulo_base								// reduce adler and sum2 modulo BASE

	subs		len, nmax					// len -= NMAX
	bge			while_lengenmax_loop		// } while (len >= NMAX);

len_lessthan_NMAX:
	adds		len, nmax					// add NMAX back to len (undo the pre-subtract)

	subs		len, #16					// pre-decrement len by 16
	blt			len_lessthan_16

len16_loop2:

	DO16

	subs		len, #16
	bge			len16_loop2

len_lessthan_16:
	adds		len, #16					// add 16 back to len; len = number of remaining bytes
	beq			len_is_zero

remaining_buf:
	ldrb		x0, [buf], #1				// x0 = *buf++;
	subs		len, #1						// len--;
	add			adler, x0					// adler += x0;
	add			sum2, adler					// sum2 += adler;
	bgt			remaining_buf				// loop while len > 0

len_is_zero:

	modulo_base								// reduce adler and sum2 modulo BASE

	add		r0, adler, sum2, lsl #16		// to return adler | (sum2 << 16)

	pop		{r4-r6, r8-r11, pc}

	.align 2
coeffs:
	.long	-2146992015		// 0x80078071 = ceil(2^47/BASE), i.e. 1/BASE in Q47
	.long	-BASE			// negative BASE, so the mla in modulo_base subtracts floor*BASE
	.long	NMAX

#else	// KERNEL_SUPPORT_NEON



	#define	adler	r0
	#define	sum2	r1
	#define	buf		r2
	#define	len		r3
	#define	nmax	r4
	#define	vecs	lr				// vecs = NMAX/16
	#define	n		r5

	#define	t		r12

	#define	sum2_coeff		q0
	#define	sum2_coeff0		d0
	#define	sum2_coeff1		d1
	#define	adler_coeff		q1
	#define	ones			d2
	#define	x0_x15			q2
	#define	x0_x7			d4
	#define	x8_x15			d5
	#define	adlersum2		d6
	#define	adler16			d25


#if defined _ARM_ARCH_7

	adr			t, vec_table				// address of vec_table[]
	stmfd		sp!, {r4, r5, lr}

	vld1.32		{q0-q1},[t,:128]!			// load coefficients for adler/sum2 computation
	vld1.32		{q15},[t,:128]!				// extra coefficients for the DO32 sum2 computation
	ldr			nmax, [t]					// NMAX

	vmov		adlersum2, sum2, adler		// pack sum2 (low word) and adler (high word) into a double register

	cmp			len, nmax					// len vs NMAX
	lsr			vecs, nmax, #4				// vecs = NMAX/16;
	blt			len_lessthan_NMAX			// if (len < NMAX) skip the while loop

	sub			len, nmax					// pre-decrement len by NMAX

while_len_ge_NMAX_loop:						// while (len>=NMAX) {

	mov			n, vecs, lsr #1				// n = vecs/2; each do_loop iteration consumes 32 bytes, the leftover 16 bytes are handled after the loop

do_loop:									// do {

	vshll.u32	q12, adlersum2, #5			// d25 = (0, 32*adler) to be added into (adler,sum2)
	vld1.32		{x0_x15},[buf,:128]!		// 16-byte input x0:x15
	vmull.u8	q8, x0_x7, ones				// 16-bit x0-x7
	vld1.32		{q14}, [buf,:128]!			// x16:x31
	vmull.u8	q9, x8_x15, ones			// 16-bit x8-x15
	vadd.u32	adlersum2,adler16			// sum2 += 32*old adler;
	vmull.u8	q12, d28, ones				// 16-bit x16-x23
	vmull.u8	q13, d29, ones				// 16-bit x24-x31
	vmull.u8	q10, d28, sum2_coeff0		// 16-bit x16*16, x17*15, ..., x23*9
	vmull.u8	q11, d29, sum2_coeff1		// 16-bit x24*8, x25*7, ..., x31*1
	vadd.u16	q8, q8, q9					// q8 = (x0+x8):(x7+x15) 8 16-bit elements for adler
	vmull.u8	q9, x0_x7, d30				// 16-bit x0*32,...,x7*25
	vmull.u8	q14, x8_x15, d31			// 16-bit x8*24,...,x15*17
	vadd.u16	q12, q12, q13				// q12 = (x16+x24):(x23+x31) 8 16-bit elements for adler
	vadd.u16	q10, q11					// 8 16-bit elements for sum2
	vadd.u16	q8, q12						// 8 16-bit elements for adler
	vadd.u16	q9, q14						// 8 16-bit elements for sum2
	vadd.u16	q10, q9						// 8 16-bit elements for sum2
	vpaddl.u16	q8, q8						// 4 32-bit elements for adler
	vpaddl.u16	q10, q10					// 4 32-bit elements for sum2
	vpadd.u32	d16,d16,d17					// 2 32-bit elements for adler
	vpadd.u32	d17,d20,d21					// 2 32-bit elements for sum2
	subs		n, #1						// --n
	vpadd.u32	d4,d17,d16					// d4: s8 = 32-bit sum2 term, s9 = 32-bit adler term
	vadd.u32	adlersum2,d4				// update adler/sum2 with the new 32 bytes of input

	bgt			do_loop						// } while (--n);

	vshll.u32	q12, adlersum2, #4			// d25 = (0, 16*adler) to be added into (adler,sum2)

	vld1.32		{x0_x15},[buf,:128]!		// 16-byte input

	vmull.u8	q8, x0_x7, ones				// 16-bit x0-x7
	vmull.u8	q9, x8_x15, ones			// 16-bit x8-x15
	vmull.u8	q10, x0_x7, sum2_coeff0		// 16-bit x0*16, x1*15, ..., x7*9
	vmull.u8	q11, x8_x15, sum2_coeff1	// 16-bit x8*8, x9*7, ..., x15*1

	vadd.u16	q8, q8, q9					// 8 16-bit elements for adler
	vadd.u16	q10, q10, q11				// 8 16-bit elements for sum2
	vpaddl.u16	q8, q8						// 4 32-bit elements for adler
	vpaddl.u16	q10, q10					// 4 32-bit elements for sum2
	vpadd.u32	d16,d16,d17					// 2 32-bit elements for adler
	vpadd.u32	d17,d20,d21					// 2 32-bit elements for sum2
	vadd.u32	adlersum2,adler16			// sum2 += 16*old adler;
	vpadd.u32	d4,d17,d16					// d4: s8 = 32-bit sum2 term, s9 = 32-bit adler term
	vadd.u32	adlersum2,d4				// update adler/sum2 with the new 16 bytes of input

	// mod(adler,BASE); mod(sum2,BASE);
	vmull.u32	q2,adlersum2,d3[1]			// adler/BASE, sum2/BASE in Q47
	vshr.u64	q2,q2,#47					// take the integer part
	vpadd.u32	d4,d4,d5					// merge into a double word in d4
	vmls.u32	adlersum2,d4,d3[0]			// (adler,sum2) -= floor[(adler,sum2)/BASE]*BASE

	subs		len, nmax					// len -= NMAX;
	bge			while_len_ge_NMAX_loop		// repeat while len >= NMAX

	add			len, nmax					// add NMAX back to len (undo the pre-decrement)

len_lessthan_NMAX:

	cmp			len, #0
	beq			len_is_zero					// if len==0, branch to skip the following


	subs		len, #32					// pre-decrement len by 32
	blt			len_lessthan_32				// if len < 32, branch to len_lessthan_32

len32_loop:

	vshll.u32	q12, adlersum2, #5			// d25 = (0, 32*adler) to be added into (adler,sum2)
	vld1.32		{x0_x15},[buf,:128]!		// 16-byte input x0:x15
	vmull.u8	q8, x0_x7, ones				// 16-bit x0-x7
	vld1.32		{q14}, [buf,:128]!			// x16:x31
	vmull.u8	q9, x8_x15, ones			// 16-bit x8-x15
	vadd.u32	adlersum2,adler16			// sum2 += 32*old adler;
	vmull.u8	q12, d28, ones				// 16-bit x16-x23
	vmull.u8	q13, d29, ones				// 16-bit x24-x31
	vmull.u8	q10, d28, sum2_coeff0		// 16-bit x16*16, x17*15, ..., x23*9
	vmull.u8	q11, d29, sum2_coeff1		// 16-bit x24*8, x25*7, ..., x31*1
	vadd.u16	q8, q8, q9					// q8 = (x0+x8):(x7+x15) 8 16-bit elements for adler
	vmull.u8	q9, x0_x7, d30				// 16-bit x0*32,...,x7*25
	vmull.u8	q14, x8_x15, d31			// 16-bit x8*24,...,x15*17
	vadd.u16	q12, q12, q13				// q12 = (x16+x24):(x23+x31) 8 16-bit elements for adler
	vadd.u16	q10, q11					// 8 16-bit elements for sum2
	vadd.u16	q8, q12						// 8 16-bit elements for adler
	vadd.u16	q9, q14						// 8 16-bit elements for sum2
	vadd.u16	q10, q9						// 8 16-bit elements for sum2
	vpaddl.u16	q8, q8						// 4 32-bit elements for adler
	vpaddl.u16	q10, q10					// 4 32-bit elements for sum2
	vpadd.u32	d16,d16,d17					// 2 32-bit elements for adler
	vpadd.u32	d17,d20,d21					// 2 32-bit elements for sum2
	subs		len, #32					// len -= 32;
	vpadd.u32	d4,d17,d16					// d4: s8 = 32-bit sum2 term, s9 = 32-bit adler term
	vadd.u32	adlersum2,d4				// update adler/sum2 with the new 32 bytes of input

	bge			len32_loop

len_lessthan_32:

	adds		len, #(32-16)				// add 32 back to len, then pre-decrement by 16
	blt			len_lessthan_16				// if len < 16, branch to len_lessthan_16

	vshll.u32	q12, adlersum2, #4			// d25 = (0, 16*adler) to be added into (adler,sum2)

	vld1.32		{x0_x15},[buf,:128]!		// 16-byte input


	vmull.u8	q8, x0_x7, ones				// 16-bit x0-x7
	vmull.u8	q9, x8_x15, ones			// 16-bit x8-x15
	vmull.u8	q10, x0_x7, sum2_coeff0		// 16-bit x0*16, x1*15, ..., x7*9
	vmull.u8	q11, x8_x15, sum2_coeff1	// 16-bit x8*8, x9*7, ..., x15*1

	vadd.u16	q8, q8, q9					// 8 16-bit elements for adler
	vadd.u16	q10, q10, q11				// 8 16-bit elements for sum2
	vpaddl.u16	q8, q8						// 4 32-bit elements for adler
	vpaddl.u16	q10, q10					// 4 32-bit elements for sum2
	vpadd.u32	d16,d16,d17					// 2 32-bit elements for adler
	vpadd.u32	d17,d20,d21					// 2 32-bit elements for sum2
	subs		len, #16					// len -= 16 (balanced by the add at len_lessthan_16)
	vadd.u32	adlersum2,adler16			// sum2 += 16*old adler;
	vpadd.u32	d4,d17,d16					// d4: s8 = 32-bit sum2 term, s9 = 32-bit adler term
	vadd.u32	adlersum2,d4				// update adler/sum2 with the new 16 bytes of input

len_lessthan_16:
	adds		len, #16					// add 16 back to len; len = number of remaining bytes
	beq			len_is_zero_internal		// if len==0, branch to len_is_zero_internal

	// restore adler/sum2 into general registers for remaining (<16) bytes

	vmov		sum2, adler, adlersum2		// sum2 = low word, adler = high word of adlersum2
remaining_len_loop:
	ldrb		t, [buf], #1				// t = *buf++;
	subs		len, #1						// len--;
	add			adler,t						// adler += t;
	add			sum2,adler					// sum2 += adler;
	bgt			remaining_len_loop			// loop while len > 0

	vmov		adlersum2, sum2, adler		// move back to the double register for the modulo operation

len_is_zero_internal:

	// mod(adler,BASE); mod(sum2,BASE);

	vmull.u32	q2,adlersum2,d3[1]			// adler/BASE, sum2/BASE in Q47
	vshr.u64	q2,q2,#47					// take the integer part
	vpadd.u32	d4,d4,d5					// merge into a double word in d4
	vmls.u32	adlersum2,d4,d3[0]			// (adler,sum2) -= floor[(adler,sum2)/BASE]*BASE

len_is_zero:

	vmov		sum2, adler, adlersum2		// restore adler/sum2 from (s12=sum2, s13=adler)
	add			r0, adler, sum2, lsl #16	// to return adler | (sum2 << 16);
	ldmfd		sp!, {r4, r5, pc}			// restore registers and return


	// constants to be loaded into q registers
	.align	4		// 16-byte aligned

vec_table:

	// q0 = d0,d1 : byte coefficients (16,15,...,1) for computing sum2 (little-endian byte order)
	.long	0x0d0e0f10		// s0 : bytes 16,15,14,13
	.long	0x090a0b0c		// s1 : bytes 12,11,10,9
	.long	0x05060708		// s2 : bytes  8, 7, 6,5
	.long	0x01020304		// s3 : bytes  4, 3, 2,1

	// q1 = d2,d3 : all-ones bytes for computing adler, plus BASE and 1/BASE
	.long	0x01010101		// s4/d2
	.long	0x01010101		// s5

	.long	BASE			// s6 : BASE
	.long	0x80078071		// s7 : 1/BASE in Q47 (ceil(2^47/BASE))

	// q15 = d30,d31 : byte coefficients (32,31,...,17) applied to x[0:15] in the DO32 loops
	.long	0x1d1e1f20		// bytes 32,31,30,29
	.long	0x191a1b1c		// bytes 28,27,26,25
	.long	0x15161718		// bytes 24,23,22,21
	.long	0x11121314		// bytes 20,19,18,17

NMAX_loc:
	.long	NMAX			// NMAX

#endif		// _ARM_ARCH_7

#endif		// (!KERNEL_SUPPORT_NEON) || (!defined _ARM_ARCH_7)

#endif		// _ARM_ARCH_6
