/*-
 * Copyright (c) 1990 The Regents of the University of California.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from tahoe:	in_cksum.c	1.2	86/01/05
 *	from:		@(#)in_cksum.c	1.3 (Berkeley) 1/19/91
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/11/sys/i386/i386/in_cksum.c 331722 2018-03-29 02:50:57Z eadler $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>

#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>

#include <machine/in_cksum.h>

/*
 * Checksum routine for Internet Protocol family headers.
 *
 * This routine is very heavily used in the network
 * code and should be modified for each CPU to be as fast as possible.
 *
 * This implementation is the i386 version.
 */

#undef ADDCARRY
#define ADDCARRY(x)	if ((x) > 0xffff) (x) -= 0xffff
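/*
 * Illustrative note (editorial addition): ADDCARRY performs the
 * ones-complement "end-around carry".  Subtracting 0xffff is equivalent
 * to dropping a carry out of bit 15 and adding 1 back in at bit 0:
 * e.g. 0xfffe + 0x0003 = 0x10001, and 0x10001 - 0xffff = 0x0002,
 * exactly the wrapped-around result.
 */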
/*
 * icc needs to be special cased here, as the asm code below results
 * in broken code if compiled with icc.
 */
#if !defined(__GNUCLIKE_ASM) || defined(__INTEL_COMPILER)
/* non gcc parts stolen from sys/alpha/alpha/in_cksum.c */
#define REDUCE32							\
	{								\
		q_util.q = sum;						\
		sum = q_util.s[0] + q_util.s[1] + q_util.s[2] + q_util.s[3]; \
	}
#define REDUCE16							\
	{								\
		q_util.q = sum;						\
		l_util.l = q_util.s[0] + q_util.s[1] + q_util.s[2] + q_util.s[3]; \
		sum = l_util.s[0] + l_util.s[1];			\
		ADDCARRY(sum);						\
	}
#endif
#define REDUCE	{sum = (sum & 0xffff) + (sum >> 16); ADDCARRY(sum);}

#if !defined(__GNUCLIKE_ASM) || defined(__INTEL_COMPILER)
static const u_int32_t in_masks[] = {
	/*0 bytes*/ /*1 byte*/  /*2 bytes*/ /*3 bytes*/
	0x00000000, 0x000000FF, 0x0000FFFF, 0x00FFFFFF,	/* offset 0 */
	0x00000000, 0x0000FF00, 0x00FFFF00, 0xFFFFFF00,	/* offset 1 */
	0x00000000, 0x00FF0000, 0xFFFF0000, 0xFFFF0000,	/* offset 2 */
	0x00000000, 0xFF000000, 0xFF000000, 0xFF000000,	/* offset 3 */
};

union l_util {
	u_int16_t s[2];
	u_int32_t l;
};
union q_util {
	u_int16_t s[4];
	u_int32_t l[2];
	u_int64_t q;
};

static u_int64_t
in_cksumdata(const u_int32_t *lw, int len)
{
	u_int64_t sum = 0;
	u_int64_t prefilled;
	int offset;
	union q_util q_util;

	/* Fast path: an aligned, standard 20-byte IP header. */
	if ((3 & (long) lw) == 0 && len == 20) {
		sum = (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3] + lw[4];
		REDUCE32;
		return sum;
	}

	/*
	 * Handle a misaligned start: back lw up to the previous longword
	 * boundary and mask off the bytes that precede the buffer.
	 */
	if ((offset = 3 & (long) lw) != 0) {
		const u_int32_t *masks = in_masks + (offset << 2);
		lw = (u_int32_t *) (((long) lw) - offset);
		sum = *lw++ & masks[len >= 3 ? 3 : len];
		len -= 4 - offset;
		if (len <= 0) {
			REDUCE32;
			return sum;
		}
	}
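	/*
	 * Worked example of the masking above (illustrative): with lw one
	 * byte past a longword boundary (offset 1) and len 2, masks points
	 * at the "offset 1" row and masks[2] = 0x00FFFF00 keeps exactly
	 * bytes 1 and 2 of the aligned longword on this little-endian
	 * machine.  len then becomes 2 - (4 - 1) = -1: the partial longword
	 * covered the whole buffer, so the sum is reduced and returned.
	 */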
#if 0
	/*
	 * Force to cache line boundary.
	 */
	offset = 32 - (0x1f & (long) lw);
	if (offset < 32 && len > offset) {
		len -= offset;
		if (4 & offset) {
			sum += (u_int64_t) lw[0];
			lw += 1;
		}
		if (8 & offset) {
			sum += (u_int64_t) lw[0] + lw[1];
			lw += 2;
		}
		if (16 & offset) {
			sum += (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3];
			lw += 4;
		}
	}
#endif
	/*
	 * Access "prefilled" to start the load of the next cache line,
	 * then add the current cache line, and save the prefill result
	 * for the next loop iteration.
	 */
	prefilled = lw[0];
	while ((len -= 32) >= 4) {
		u_int64_t prefilling = lw[8];
		sum += prefilled + lw[1] + lw[2] + lw[3]
			+ lw[4] + lw[5] + lw[6] + lw[7];
		lw += 8;
		prefilled = prefilling;
	}
	if (len >= 0) {
		sum += prefilled + lw[1] + lw[2] + lw[3]
			+ lw[4] + lw[5] + lw[6] + lw[7];
		lw += 8;
	} else {
		len += 32;
	}
	while ((len -= 16) >= 0) {
		sum += (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3];
		lw += 4;
	}
	len += 16;
	while ((len -= 4) >= 0) {
		sum += (u_int64_t) *lw++;
	}
	len += 4;
	/* Mask off any bytes past the end of the buffer (len is 1-3). */
	if (len > 0)
		sum += (u_int64_t) (in_masks[len] & *lw);
	REDUCE32;
	return sum;
}

u_short
in_addword(u_short a, u_short b)
{
	u_int64_t sum = a + b;

	ADDCARRY(sum);
	return (sum);
}

u_short
in_pseudo(u_int32_t a, u_int32_t b, u_int32_t c)
{
	u_int64_t sum;
	union q_util q_util;
	union l_util l_util;

	sum = (u_int64_t) a + b + c;
	REDUCE16;
	return (sum);
}

u_short
in_cksum_skip(struct mbuf *m, int len, int skip)
{
	u_int64_t sum = 0;
	int mlen = 0;
	int clen = 0;
	caddr_t addr;
	union q_util q_util;
	union l_util l_util;

	len -= skip;
	for (; skip && m; m = m->m_next) {
		if (m->m_len > skip) {
			mlen = m->m_len - skip;
			addr = mtod(m, caddr_t) + skip;
			goto skip_start;
		} else {
			skip -= m->m_len;
		}
	}

	for (; m && len; m = m->m_next) {
		if (m->m_len == 0)
			continue;
		mlen = m->m_len;
		addr = mtod(m, caddr_t);
skip_start:
		if (len < mlen)
			mlen = len;
		/*
		 * If the logical offset (clen) and the physical address
		 * differ in parity, this chunk's sum must be folded in
		 * byte-swapped; shifting left by 8 achieves that once
		 * the 64-bit sum is reduced.
		 */
		if ((clen ^ (long) addr) & 1)
			sum += in_cksumdata((const u_int32_t *)addr, mlen) << 8;
		else
			sum += in_cksumdata((const u_int32_t *)addr, mlen);

		clen += mlen;
		len -= mlen;
	}
	REDUCE16;
	return (~sum & 0xffff);
}

u_int
in_cksum_hdr(const struct ip *ip)
{
	u_int64_t sum = in_cksumdata((const u_int32_t *)ip, sizeof(struct ip));
	union q_util q_util;
	union l_util l_util;

	REDUCE16;
	return (~sum & 0xffff);
}
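/*
 * Usage sketch (illustrative, not code from this file): when emitting a
 * packet, the checksum field must be zeroed before recomputing, since
 * in_cksum_hdr() sums the entire header including ip_sum:
 *
 *	ip->ip_sum = 0;
 *	ip->ip_sum = in_cksum_hdr(ip);
 *
 * A receiver instead checks that in_cksum_hdr() over the header as
 * received (ip_sum left in place) yields 0.
 */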
#else

/*
 * These asm statements require __volatile because they pass information
 * via the condition codes.  GCC does not currently provide a way to specify
 * the condition codes as an input or output operand.
 *
 * The LOAD macro below is effectively a prefetch into cache.  GCC will
 * load the value into a register but will not use it.  Since modern CPUs
 * reorder operations, this will generally take place in parallel with
 * other calculations.
 */
u_short
in_cksum_skip(struct mbuf *m, int len, int skip)
{
	u_short *w;
	unsigned sum = 0;
	int mlen = 0;
	int byte_swapped = 0;
	union { char c[2]; u_short s; } su;

	len -= skip;
	for (; skip && m; m = m->m_next) {
		if (m->m_len > skip) {
			mlen = m->m_len - skip;
			w = (u_short *)(mtod(m, u_char *) + skip);
			goto skip_start;
		} else {
			skip -= m->m_len;
		}
	}

	for (; m && len; m = m->m_next) {
		if (m->m_len == 0)
			continue;
		w = mtod(m, u_short *);
		if (mlen == -1) {
			/*
			 * The first byte of this mbuf is the continuation
			 * of a word spanning between this mbuf and the
			 * last mbuf.
			 *
			 * su.c[0] was already saved when scanning the
			 * previous mbuf; sum was REDUCEd when we found
			 * mlen == -1.
			 */
			su.c[1] = *(u_char *)w;
			sum += su.s;
			w = (u_short *)((char *)w + 1);
			mlen = m->m_len - 1;
			len--;
		} else
			mlen = m->m_len;
skip_start:
		if (len < mlen)
			mlen = len;
		len -= mlen;
		/*
		 * Force to long boundary so we do longword-aligned
		 * memory operations.
		 */
		if (3 & (int) w) {
			REDUCE;
			if ((1 & (int) w) && (mlen > 0)) {
				sum <<= 8;
				su.c[0] = *(char *)w;
				w = (u_short *)((char *)w + 1);
				mlen--;
				byte_swapped = 1;
			}
			if ((2 & (int) w) && (mlen >= 2)) {
				sum += *w++;
				mlen -= 2;
			}
		}
		/*
		 * Advance to a 486 cache line boundary.
		 */
		if (4 & (int) w && mlen >= 4) {
			__asm __volatile (
				"addl %1, %0\n"
				"adcl $0, %0"
				: "+r" (sum)
				: "g" (((const u_int32_t *)w)[0])
			);
			w += 2;
			mlen -= 4;
		}
		if (8 & (int) w && mlen >= 8) {
			__asm __volatile (
				"addl %1, %0\n"
				"adcl %2, %0\n"
				"adcl $0, %0"
				: "+r" (sum)
				: "g" (((const u_int32_t *)w)[0]),
				  "g" (((const u_int32_t *)w)[1])
			);
			w += 4;
			mlen -= 8;
		}
		/*
		 * Do as much of the checksum as possible 32 bits at a time.
		 * In fact, this loop is unrolled to make overhead from
		 * branches &c small.
		 */
		mlen -= 1;
		while ((mlen -= 32) >= 0) {
			/*
			 * Add with carry 16 words and fold in the last
			 * carry by adding a 0 with carry.
			 *
			 * The early ADD(16) and the LOAD(32) are to load
			 * the next 2 cache lines in advance on 486's.  The
			 * 486 has a penalty of 2 clock cycles for loading
			 * a cache line, plus whatever time the external
			 * memory takes to load the first word(s) addressed.
			 * These penalties are unavoidable.  Subsequent
			 * accesses to a cache line being loaded (and to
			 * other external memory?) are delayed until the
			 * whole load finishes.  These penalties are mostly
			 * avoided by not accessing external memory for
			 * 8 cycles after the ADD(16) and 12 cycles after
			 * the LOAD(32).  The loop terminates when mlen
			 * is initially 33 (not 32) to guarantee that
			 * the LOAD(32) is within bounds.
			 */
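			/*
			 * Illustrative note on the add-with-carry chain
			 * below: each "adcl" adds its operand plus the
			 * carry flag left by the previous add, and the
			 * final "adcl $0, %0" folds the last carry back
			 * into the sum.  E.g. 0xffffffff + 0x00000002
			 * leaves 0x00000001 with CF set; adding 0 with
			 * carry then gives 0x00000002, the 32-bit
			 * end-around-carry result.
			 */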
			__asm __volatile (
				"addl %1, %0\n"
				"adcl %2, %0\n"
				"adcl %3, %0\n"
				"adcl %4, %0\n"
				"adcl %5, %0\n"
				"mov %6, %%eax\n"
				"adcl %7, %0\n"
				"adcl %8, %0\n"
				"adcl %9, %0\n"
				"adcl $0, %0"
				: "+r" (sum)
				: "g" (((const u_int32_t *)w)[4]),
				  "g" (((const u_int32_t *)w)[0]),
				  "g" (((const u_int32_t *)w)[1]),
				  "g" (((const u_int32_t *)w)[2]),
				  "g" (((const u_int32_t *)w)[3]),
				  "g" (((const u_int32_t *)w)[8]),
				  "g" (((const u_int32_t *)w)[5]),
				  "g" (((const u_int32_t *)w)[6]),
				  "g" (((const u_int32_t *)w)[7])
				: "eax"
			);
			w += 16;
		}
		mlen += 32 + 1;
		if (mlen >= 32) {
			__asm __volatile (
				"addl %1, %0\n"
				"adcl %2, %0\n"
				"adcl %3, %0\n"
				"adcl %4, %0\n"
				"adcl %5, %0\n"
				"adcl %6, %0\n"
				"adcl %7, %0\n"
				"adcl %8, %0\n"
				"adcl $0, %0"
				: "+r" (sum)
				: "g" (((const u_int32_t *)w)[4]),
				  "g" (((const u_int32_t *)w)[0]),
				  "g" (((const u_int32_t *)w)[1]),
				  "g" (((const u_int32_t *)w)[2]),
				  "g" (((const u_int32_t *)w)[3]),
				  "g" (((const u_int32_t *)w)[5]),
				  "g" (((const u_int32_t *)w)[6]),
				  "g" (((const u_int32_t *)w)[7])
			);
			w += 16;
			mlen -= 32;
		}
		if (mlen >= 16) {
			__asm __volatile (
				"addl %1, %0\n"
				"adcl %2, %0\n"
				"adcl %3, %0\n"
				"adcl %4, %0\n"
				"adcl $0, %0"
				: "+r" (sum)
				: "g" (((const u_int32_t *)w)[0]),
				  "g" (((const u_int32_t *)w)[1]),
				  "g" (((const u_int32_t *)w)[2]),
				  "g" (((const u_int32_t *)w)[3])
			);
			w += 8;
			mlen -= 16;
		}
		if (mlen >= 8) {
			__asm __volatile (
				"addl %1, %0\n"
				"adcl %2, %0\n"
				"adcl $0, %0"
				: "+r" (sum)
				: "g" (((const u_int32_t *)w)[0]),
				  "g" (((const u_int32_t *)w)[1])
			);
			w += 4;
			mlen -= 8;
		}
		if (mlen == 0 && byte_swapped == 0)
			continue;	/* worth 1% maybe ?? */
		REDUCE;
		while ((mlen -= 2) >= 0) {
			sum += *w++;
		}
		if (byte_swapped) {
			sum <<= 8;
			byte_swapped = 0;
			if (mlen == -1) {
				su.c[1] = *(char *)w;
				sum += su.s;
				mlen = 0;
			} else
				mlen = -1;
		} else if (mlen == -1)
			/*
			 * This mbuf has an odd number of bytes.
			 * There could be a word split between
			 * this mbuf and the next mbuf.
			 * Save the last byte (to prepend to the next mbuf).
			 */
			su.c[0] = *(char *)w;
	}

	if (len)
		printf("%s: out of data by %d\n", __func__, len);
	if (mlen == -1) {
		/*
		 * The last mbuf has an odd number of bytes.  Follow the
		 * standard (the odd byte is shifted left by 8 bits).
		 */
		su.c[1] = 0;
		sum += su.s;
	}
	REDUCE;
	return (~sum & 0xffff);
}
#endif
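
#if 0
/*
 * Illustrative reference implementation (editorial sketch, not part of
 * this file's build; the user-space headers and the function name are
 * assumptions): a naive RFC 1071 checksum over a flat buffer, useful
 * for sanity-checking in_cksum_skip() on single-mbuf data.  Note that
 * this sums words in big-endian order; the routines above accumulate in
 * host order, which on a little-endian i386 yields the byte-swapped
 * value of the same ones-complement sum.
 */
#include <stdint.h>
#include <stddef.h>

static uint16_t
rfc1071_cksum(const void *buf, size_t len)
{
	const uint8_t *p = buf;
	uint32_t sum = 0;

	/* Sum the data as 16-bit big-endian words. */
	while (len > 1) {
		sum += ((uint32_t)p[0] << 8) | p[1];
		p += 2;
		len -= 2;
	}
	/* An odd trailing byte is padded with a zero byte. */
	if (len == 1)
		sum += (uint32_t)p[0] << 8;
	/* Fold the carries back in (end-around carry), then complement. */
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return ((uint16_t)~sum);
}
#endif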