/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		IP/TCP/UDP checksumming routines
 *
 * Authors:	Jorge Cwik, <jorge@laser.satlink.net>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Tom May, <ftom@netcom.com>
 *		Pentium Pro/II routines:
 *		Alexander Kjeldaas <astor@guardian.no>
 *		Finn Arne Gangstad <finnag@guardian.no>
 *		Lots of code moved from tcp.c and ip.c; see those files
 *		for more names.
 *
 * Changes:	Ingo Molnar, converted csum_partial_copy() to 2.1 exception
 *			     handling.
 *		Andi Kleen,  add zeroing on error
 *			     converted to pure assembler
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * File conventions: AT&T/GAS syntax, 32-bit x86, cdecl (all args on the
 * stack, result in %eax).  The checksum is the 32-bit ones'-complement
 * partial sum accumulated with adcl chains; note that the code below
 * repeatedly relies on two ISA facts: TEST always clears CF, and DEC/LEA
 * leave CF untouched, so a carry can survive across loop control.
 */

#include <asm/errno.h>

/*
 * computes a partial checksum, e.g. for TCP/UDP fragments
 */

/*
unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum)
 */

.text
.align 4
.globl csum_partial

#ifndef CONFIG_X86_USE_PPRO_CHECKSUM

	/*
	 * Experiments with Ethernet and SLIP connections show that buff
	 * is aligned on either a 2-byte or 4-byte boundary.  We get at
	 * least a twofold speedup on 486 and Pentium if it is 4-byte aligned.
	 * Fortunately, it is easy to convert 2-byte alignment to 4-byte
	 * alignment for the unrolled loop.
	 */
csum_partial:
	pushl %esi
	pushl %ebx
	movl 20(%esp),%eax	# Function arg: unsigned int sum
	movl 16(%esp),%ecx	# Function arg: int len
	movl 12(%esp),%esi	# Function arg: unsigned char *buff
	testl $2, %esi		# Check alignment.
	jz 2f			# Jump if alignment is ok.
	subl $2, %ecx		# Alignment uses up two bytes.
	jae 1f			# Jump if we had at least two bytes.
	addl $2, %ecx		# ecx was < 2. Deal with it.
	jmp 4f
1:	movw (%esi), %bx	# fold in the leading 2 bytes to reach
	addl $2, %esi		# 4-byte alignment for the unrolled loop
	addw %bx, %ax
	adcl $0, %eax
2:
	movl %ecx, %edx		# edx = remaining len (tail handled at 2:/4:)
	shrl $5, %ecx		# ecx = number of 32-byte chunks
	jz 2f
	testl %esi, %esi	# clears CF before the adcl chain (TEST sets CF=0)
1:	movl (%esi), %ebx
	adcl %ebx, %eax
	movl 4(%esi), %ebx
	adcl %ebx, %eax
	movl 8(%esi), %ebx
	adcl %ebx, %eax
	movl 12(%esi), %ebx
	adcl %ebx, %eax
	movl 16(%esi), %ebx
	adcl %ebx, %eax
	movl 20(%esi), %ebx
	adcl %ebx, %eax
	movl 24(%esi), %ebx
	adcl %ebx, %eax
	movl 28(%esi), %ebx
	adcl %ebx, %eax
	lea 32(%esi), %esi	# lea/dec preserve CF, so the carry chains
	dec %ecx		# across iterations of the unrolled loop
	jne 1b
	adcl $0, %eax		# fold the final carry
2:	movl %edx, %ecx
	andl $0x1c, %edx	# edx = remaining whole dwords (in bytes)
	je 4f
	shrl $2, %edx		# This clears CF
3:	adcl (%esi), %eax
	lea 4(%esi), %esi
	dec %edx
	jne 3b
	adcl $0, %eax
4:	andl $3, %ecx		# 0-3 trailing bytes
	jz 7f
	cmpl $2, %ecx
	jb 5f			# exactly 1 byte left
	movw (%esi),%cx
	leal 2(%esi),%esi
	je 6f			# exactly 2 bytes left
	shll $16,%ecx		# 3 bytes: word goes in the high half
5:	movb (%esi),%cl
6:	addl %ecx,%eax
	adcl $0, %eax
7:
	popl %ebx
	popl %esi
	ret

#else

/* Version for PentiumII/PPro */

csum_partial:
	pushl %esi
	pushl %ebx
	movl 20(%esp),%eax	# Function arg: unsigned int sum
	movl 16(%esp),%ecx	# Function arg: int len
	movl 12(%esp),%esi	# Function arg: const unsigned char *buf

	testl $2, %esi		# 2-byte aligned src handled at 30:
	jnz 30f
10:
	movl %ecx, %edx		# edx = original len (for the 1-3 byte tail)
	movl %ecx, %ebx
	andl $0x7c, %ebx	# ebx = len % 128, rounded down to a dword
	shrl $7, %ecx		# ecx = number of 128-byte chunks
	addl %ebx,%esi		# pre-advance src past the partial chunk
	shrl $2, %ebx
	negl %ebx		# NOTE(review): each "adcl x(%esi),%eax" below
	lea 45f(%ebx,%ebx,2), %ebx # encodes to 3 bytes, hence the *3 scaling to
	testl %esi, %esi	# jump mid-way into the unrolled block; TEST
	jmp *%ebx		# clears CF first so the adcl chain starts clean

	# Handle 2-byte-aligned regions
20:	addw (%esi), %ax
	lea 2(%esi), %esi
	adcl $0, %eax
	jmp 10b

30:	subl $2, %ecx
	ja 20b			# >2 bytes: align, then restart at 10:
	je 32f
	movzbl (%esi),%ebx	# csumming 1 byte, 2-aligned
	addl %ebx, %eax
	adcl $0, %eax
	jmp 80f
32:
	addw (%esi), %ax	# csumming 2 bytes, 2-aligned
	adcl $0, %eax
	jmp 80f

40:
	addl -128(%esi), %eax	# esi points one full chunk past the data;
	adcl -124(%esi), %eax	# negative displacements walk the 128 bytes
	adcl -120(%esi), %eax
	adcl -116(%esi), %eax
	adcl -112(%esi), %eax
	adcl -108(%esi), %eax
	adcl -104(%esi), %eax
	adcl -100(%esi), %eax
	adcl -96(%esi), %eax
	adcl -92(%esi), %eax
	adcl -88(%esi), %eax
	adcl -84(%esi), %eax
	adcl -80(%esi), %eax
	adcl -76(%esi), %eax
	adcl -72(%esi), %eax
	adcl -68(%esi), %eax
	adcl -64(%esi), %eax
	adcl -60(%esi), %eax
	adcl -56(%esi), %eax
	adcl -52(%esi), %eax
	adcl -48(%esi), %eax
	adcl -44(%esi), %eax
	adcl -40(%esi), %eax
	adcl -36(%esi), %eax
	adcl -32(%esi), %eax
	adcl -28(%esi), %eax
	adcl -24(%esi), %eax
	adcl -20(%esi), %eax
	adcl -16(%esi), %eax
	adcl -12(%esi), %eax
	adcl -8(%esi), %eax
	adcl -4(%esi), %eax
45:
	lea 128(%esi), %esi
	adcl $0, %eax
	dec %ecx		# dec preserves CF set by the adcl chain
	jge 40b
	movl %edx, %ecx
50:	andl $3, %ecx
	jz 80f

	# Handle the last 1-3 bytes without jumping
	notl %ecx		# 1->2, 2->1, 3->0, higher bits are masked
	movl $0xffffff,%ebx	# by the shll and shrl instructions
	shll $3,%ecx
	shrl %cl,%ebx		# ebx = byte mask for the valid tail bytes
	andl -128(%esi),%ebx	# esi is 4-aligned so should be ok
	addl %ebx,%eax
	adcl $0,%eax
80:
	popl %ebx
	popl %esi
	ret

#endif

/*
unsigned int csum_partial_copy_generic (const char *src, char *dst,
				  int len, int sum, int *src_err_ptr, int *dst_err_ptr)
 */

/*
 * SRC()/DST() wrap a single faultable memory access: the 9999: label plus
 * an __ex_table entry send a fault on that instruction to fixup code at
 * 6001: (source fault) or 6002: (destination fault) respectively.
 */
#define SRC(y...)			\
	9999: y;			\
	.section __ex_table, "a";	\
	.long 9999b, 6001f	;	\
	.previous

#define DST(y...)			\
	9999: y;			\
	.section __ex_table, "a";	\
	.long 9999b, 6002f	;	\
	.previous

.align 4

#ifndef CONFIG_X86_USE_PPRO_CHECKSUM

#define ARGBASE 16
#define FP	12

csum_partial_copy_generic_i386:
	subl  $4,%esp		# one dword of scratch (FP(%esp), saved len)
	pushl %edi
	pushl %esi
	pushl %ebx
	movl ARGBASE+16(%esp),%eax	# sum
	movl ARGBASE+12(%esp),%ecx	# len
	movl ARGBASE+4(%esp),%esi	# src
	movl ARGBASE+8(%esp),%edi	# dst

	testl $2, %edi		# Check alignment.
	jz 2f			# Jump if alignment is ok.
	subl $2, %ecx		# Alignment uses up two bytes.
	jae 1f			# Jump if we had at least two bytes.
	addl $2, %ecx		# ecx was < 2. Deal with it.
	jmp 4f
SRC(1:	movw (%esi), %bx	)
	addl $2, %esi
DST(	movw %bx, (%edi)	)
	addl $2, %edi
	addw %bx, %ax
	adcl $0, %eax
2:
	movl %ecx, FP(%esp)	# stash len for the tail at 2:
	shrl $5, %ecx		# ecx = number of 32-byte chunks
	jz 2f
	testl %esi, %esi	# clears CF before the adcl chain
SRC(1:	movl (%esi), %ebx	)
SRC(	movl 4(%esi), %edx	)
	adcl %ebx, %eax
DST(	movl %ebx, (%edi)	)
	adcl %edx, %eax
DST(	movl %edx, 4(%edi)	)

SRC(	movl 8(%esi), %ebx	)
SRC(	movl 12(%esi), %edx	)
	adcl %ebx, %eax
DST(	movl %ebx, 8(%edi)	)
	adcl %edx, %eax
DST(	movl %edx, 12(%edi)	)

SRC(	movl 16(%esi), %ebx	)
SRC(	movl 20(%esi), %edx	)
	adcl %ebx, %eax
DST(	movl %ebx, 16(%edi)	)
	adcl %edx, %eax
DST(	movl %edx, 20(%edi)	)

SRC(	movl 24(%esi), %ebx	)
SRC(	movl 28(%esi), %edx	)
	adcl %ebx, %eax
DST(	movl %ebx, 24(%edi)	)
	adcl %edx, %eax
DST(	movl %edx, 28(%edi)	)

	lea 32(%esi), %esi	# lea/dec preserve CF across the loop edge
	dec %ecx
	jne 1b
	adcl $0, %eax
2:	movl FP(%esp), %edx
	movl %edx, %ecx
	andl $0x1c, %edx	# remaining whole dwords (in bytes)
	je 4f
	shrl $2, %edx		# This clears CF
SRC(3:	movl (%esi), %ebx	)
	adcl %ebx, %eax
DST(	movl %ebx, (%edi)	)
	lea 4(%esi), %esi
	lea 4(%edi), %edi
	dec %edx
	jne 3b
	adcl $0, %eax
4:	andl $3, %ecx		# 0-3 trailing bytes
	jz 7f
	cmpl $2, %ecx
	jb 5f
SRC(	movw (%esi), %cx	)
	leal 2(%esi), %esi
DST(	movw %cx, (%edi)	)
	leal 2(%edi), %edi
	je 6f
	shll $16,%ecx
SRC(5:	movb (%esi), %cl	)
DST(	movb %cl, (%edi)	)
6:	addl %ecx, %eax
	adcl $0, %eax
7:
5000:

# Exception handler:
.section .fixup, "ax"

6001:
	movl ARGBASE+20(%esp), %ebx	# src_err_ptr
	movl $-EFAULT, (%ebx)

	# zero the complete destination - computing the rest
	# is too much work
	movl ARGBASE+8(%esp), %edi	# dst
	movl ARGBASE+12(%esp), %ecx	# len
	xorl %eax,%eax
	rep ; stosb

	jmp 5000b

6002:
	movl ARGBASE+24(%esp), %ebx	# dst_err_ptr
	movl $-EFAULT,(%ebx)
	jmp 5000b

.previous

	popl %ebx
	popl %esi
	popl %edi
	popl %ecx			# equivalent to addl $4,%esp
	ret

#else

/* Version for PentiumII/PPro */

#define ROUND1(x) \
	SRC(movl x(%esi), %ebx	) ;	\
	addl %ebx, %eax	  ;		\
	DST(movl %ebx, x(%edi)	) ;

#define ROUND(x) \
	SRC(movl x(%esi), %ebx	) ;	\
	adcl %ebx, %eax	  ;		\
	DST(movl %ebx, x(%edi)	) ;

#define ARGBASE 12

csum_partial_copy_generic_i386:
	pushl %ebx
	pushl %edi
	pushl %esi
	movl ARGBASE+4(%esp),%esi	#src
	movl ARGBASE+8(%esp),%edi	#dst
	movl ARGBASE+12(%esp),%ecx	#len
	movl ARGBASE+16(%esp),%eax	#sum
#	movl %ecx, %edx
	movl %ecx, %ebx
	movl %esi, %edx
	shrl $6, %ecx		# ecx = number of 64-byte chunks
	andl $0x3c, %ebx	# ebx = partial-chunk bytes (dword granular)
	negl %ebx
	subl %ebx, %esi		# back src/dst up so the computed entry point
	subl %ebx, %edi		# below lands on the right ROUND
	lea  -1(%esi),%edx
	andl $-32,%edx		# edx = 32-byte-aligned read-ahead cursor
	lea 3f(%ebx,%ebx), %ebx	# NOTE(review): each ROUND expands to 8 bytes
	testl %esi, %esi	# of code (2 per data byte), hence the *2 scale;
	jmp *%ebx		# TEST clears CF before entering the adcl chain
1:	addl $64,%esi
	addl $64,%edi
	SRC(movb -32(%edx),%bl)	; SRC(movb (%edx),%bl)
				# NOTE(review): results discarded — presumably
				# a software touch/prefetch of the next source
				# cache lines, faulting via SRC() if bad
	ROUND1(-64) ROUND(-60) ROUND(-56) ROUND(-52)
	ROUND (-48) ROUND(-44) ROUND(-40) ROUND(-36)
	ROUND (-32) ROUND(-28) ROUND(-24) ROUND(-20)
	ROUND (-16) ROUND(-12) ROUND(-8)  ROUND(-4)
3:	adcl $0,%eax
	addl $64, %edx
	dec %ecx		# dec preserves CF
	jge 1b
4:	movl ARGBASE+12(%esp),%edx	#len
	andl $3, %edx		# 0-3 trailing bytes
	jz 7f
	cmpl $2, %edx
	jb 5f
SRC(	movw (%esi), %dx	)
	leal 2(%esi), %esi
DST(	movw %dx, (%edi)	)
	leal 2(%edi), %edi
	je 6f
	shll $16,%edx
5:
SRC(	movb (%esi), %dl	)
DST(	movb %dl, (%edi)	)
6:	addl %edx, %eax
	adcl $0, %eax
7:
.section .fixup, "ax"
6001:	movl ARGBASE+20(%esp), %ebx	# src_err_ptr
	movl $-EFAULT, (%ebx)
	# zero the complete destination (computing the rest is too much work)
	movl ARGBASE+8(%esp),%edi	# dst
	movl ARGBASE+12(%esp),%ecx	# len
	xorl %eax,%eax
	rep; stosb
	jmp 7b
6002:	movl ARGBASE+24(%esp), %ebx	# dst_err_ptr
	movl $-EFAULT, (%ebx)
	jmp 7b
.previous

	popl %esi
	popl %edi
	popl %ebx
	ret

#undef ROUND
#undef ROUND1

#endif