1/*
 * Optimized version of the ip_fast_csum() function
3 * Used for calculating IP header checksum
4 *
5 * Return: 16bit checksum, complemented
6 *
7 * Inputs:
8 *      in0: address of buffer to checksum (char *)
 *      in1: length of the buffer in 32-bit words (int)
10 *
11 * Copyright (C) 2002 Intel Corp.
12 * Copyright (C) 2002 Ken Chen <kenneth.w.chen@intel.com>
13 */
14
15#include <asm/asmmacro.h>
16
/*
 * Since we know that most likely this function is called with buf aligned
 * on 4-byte boundary and 20 bytes in length, we can execute rather quickly
 * versus calling generic version of do_csum, which has lots of overhead in
 * handling various alignments and sizes.  However, due to lack of constraints
 * put on the function input argument, cases with alignment not on 4-byte or
 * size not equal to 20 bytes will be handled by the generic do_csum function.
 */
25
26#define in0	r32
27#define in1	r33
28#define ret0	r8
29
// ip_fast_csum(buf, len):
//   in0 = buf, pointer to the IP header
//   in1 = len, header length (compared against 5, i.e. 20 bytes, and
//         scaled by 4 before calling do_csum — so the unit is 32-bit words)
//   Returns the complemented 16-bit one's-complement checksum in ret0 (r8).
//
// Fast path: 4-byte-aligned, exactly 20-byte header — sum five 32-bit
// words with two interleaved load streams, fold carries, complement.
// Anything else branches to .generic, which calls do_csum.
GLOBAL_ENTRY(ip_fast_csum)
	.prologue
	.body
	cmp.ne	p6,p7=5,in1	// size other than 20 byte?
	and	r14=3,in0	// is it aligned on 4-byte?
	add	r15=4,in0	// second source pointer
	;;
	// Fold the alignment test into the same predicates: p6 ("slow")
	// becomes true if EITHER len != 5 words OR (buf & 3) != 0;
	// p7 ("fast") stays true only when both checks pass.
	cmp.ne.or.andcm p6,p7=r14,r0
	;;
	// Fast-path loads are predicated on p7, so they are issued in the
	// same cycle as the (rarely taken, .spnt) branch to the slow path.
	// in0 walks words 0,2,4 and r15 walks words 1,3, each striding 8.
(p7)	ld4	r20=[in0],8	// word 0
(p7)	ld4	r21=[r15],8	// word 1
(p6)	br.spnt	.generic	// unlikely: fall through to generic do_csum
	;;
	ld4	r22=[in0],8	// word 2
	ld4	r23=[r15],8	// word 3
	;;
	ld4	r24=[in0]	// word 4 (last of the 20-byte header)
	// ld4 zero-extends into 64-bit registers, so these 64-bit adds
	// accumulate all carries in the upper bits without losing any.
	add	r20=r20,r21
	add	r22=r22,r23
	;;
	add	r20=r20,r22
	;;
	add	r20=r20,r24	// r20 = 64-bit sum of the five words
	;;
	// Fold the sum down to 16 bits, one's-complement style: repeatedly
	// add the bits above 16 back into the low 16 until no carry remains.
	shr.u	ret0=r20,16	// now need to add the carry
	zxt2	r20=r20
	;;
	add	r20=ret0,r20
	;;
	shr.u	ret0=r20,16	// add carry again
	zxt2	r20=r20
	;;
	add	r20=ret0,r20
	;;
	shr.u	ret0=r20,16	// third fold: the previous add may carry once more
	zxt2	r20=r20
	;;
	add	r20=ret0,r20
	;;
	andcm	ret0=-1,r20	// ret0 = ~sum (complemented checksum)
	.restore sp		// reset frame state
	br.ret.sptk.many b0
	;;

	// Slow path: unaligned buffer or size != 20 bytes.  Set up a real
	// register-stack frame (the fast path needed none) and call do_csum.
.generic:
	.prologue
	.save ar.pfs, r35
	alloc	r35=ar.pfs,2,2,2,0	// 2 in, 2 local, 2 out
	.save rp, r34
	mov	r34=b0		// preserve return address across the call
	.body
	dep.z	out1=in1,2,30	// out1 = in1 << 2: convert words to bytes
	mov	out0=in0	// out0 = buffer address
	;;
	br.call.sptk.many b0=do_csum
	;;
	andcm	ret0=-1,ret0	// complement do_csum's 16-bit sum
	mov	ar.pfs=r35	// restore frame marker and return address
	mov	b0=r34
	br.ret.sptk.many b0
END(ip_fast_csum)
91