/*-
 * Copyright (c) 1990 The Regents of the University of California.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from tahoe:	in_cksum.c	1.2	86/01/05
 * from:	@(#)in_cksum.c	1.3 (Berkeley)	1/19/91
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/11/sys/i386/i386/in_cksum.c 331722 2018-03-29 02:50:57Z eadler $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>

#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>

#include <machine/in_cksum.h>

/*
 * Checksum routine for Internet Protocol family headers.
 *
 * This routine is very heavily used in the network
 * code and should be modified for each CPU to be as fast as possible.
 *
 * This implementation is 386 version.
 */

/*
 * Fold a single carry out of the low 16 bits back in (one's-complement
 * "end-around carry").  Callers only invoke this when at most one carry
 * can be pending, i.e. (x) < 2 * 0xffff.
 */
#undef	ADDCARRY
#define ADDCARRY(x)  if ((x) > 0xffff) (x) -= 0xffff
/*
 * icc needs to be special cased here, as the asm code below results
 * in broken code if compiled with icc.
 */
#if !defined(__GNUCLIKE_ASM) || defined(__INTEL_COMPILER)
/* non gcc parts stolen from sys/alpha/alpha/in_cksum.c */
/*
 * Fold the 64-bit accumulator down to at most 18 significant bits by
 * summing its four 16-bit halves; no end-around carry is applied yet.
 */
#define REDUCE32							  \
    {									  \
	q_util.q = sum;							  \
	sum = q_util.s[0] + q_util.s[1] + q_util.s[2] + q_util.s[3];	  \
    }
/*
 * Fold the 64-bit accumulator all the way down to a 16-bit one's-complement
 * sum, including the final end-around carry.
 */
#define REDUCE16							  \
    {									  \
	q_util.q = sum;							  \
	l_util.l = q_util.s[0] + q_util.s[1] + q_util.s[2] + q_util.s[3]; \
	sum = l_util.s[0] + l_util.s[1];				  \
	ADDCARRY(sum);							  \
    }
#endif
/* 32-bit variant of REDUCE16, used by the asm implementation below. */
#define REDUCE          {sum = (sum & 0xffff) + (sum >> 16); ADDCARRY(sum);}

#if !defined(__GNUCLIKE_ASM) || defined(__INTEL_COMPILER)
/*
 * Byte masks for partial 32-bit words, indexed by (offset << 2) + nbytes.
 * Row = byte offset of the data within the aligned word, column = number
 * of valid bytes.  Assumes little-endian byte order (i386).
 * NOTE(review): rows 2 and 3 repeat their last mask because no more than
 * (4 - offset) bytes can be selected from a single word at that offset.
 */
static const u_int32_t in_masks[] = {
	/*0 bytes*/ /*1 byte*/	/*2 bytes*/ /*3 bytes*/
	0x00000000, 0x000000FF, 0x0000FFFF, 0x00FFFFFF,	/* offset 0 */
	0x00000000, 0x0000FF00, 0x00FFFF00, 0xFFFFFF00,	/* offset 1 */
	0x00000000, 0x00FF0000, 0xFFFF0000, 0xFFFF0000,	/* offset 2 */
	0x00000000, 0xFF000000, 0xFF000000, 0xFF000000,	/* offset 3 */
};

/* Overlay a 32-bit value with its two 16-bit halves. */
union l_util {
	u_int16_t s[2];
	u_int32_t l;
};
/* Overlay a 64-bit accumulator with its 16- and 32-bit pieces. */
union q_util {
	u_int16_t s[4];
	u_int32_t l[2];
	u_int64_t q;
};

/*
 * Sum len bytes starting at lw as a sequence of 32-bit words into a 64-bit
 * accumulator and fold the result to (at most) 18 bits with REDUCE32.
 * Handles a pointer that is not 4-byte aligned (by masking the leading
 * partial word) and a trailing partial word (via in_masks).  The fast path
 * special-cases an aligned 20-byte buffer, i.e. an option-less IP header.
 */
static u_int64_t
in_cksumdata(const u_int32_t *lw, int len)
{
	u_int64_t sum = 0;
	u_int64_t prefilled;
	int offset;
	union q_util q_util;

	/* Fast path: aligned, option-less IP header (20 bytes = 5 words). */
	if ((3 & (long) lw) == 0 && len == 20) {
	     sum = (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3] + lw[4];
	     REDUCE32;
	     return sum;
	}

	/*
	 * Unaligned start: back the pointer up to the previous word
	 * boundary and mask off the bytes that precede the data.
	 */
	if ((offset = 3 & (long) lw) != 0) {
		const u_int32_t *masks = in_masks + (offset << 2);
		lw = (u_int32_t *) (((long) lw) - offset);
		sum = *lw++ & masks[len >= 3 ? 3 : len];
		len -= 4 - offset;
		if (len <= 0) {
			REDUCE32;
			return sum;
		}
	}
#if 0
	/*
	 * Force to cache line boundary.
	 */
	offset = 32 - (0x1f & (long) lw);
	if (offset < 32 && len > offset) {
		len -= offset;
		if (4 & offset) {
			sum += (u_int64_t) lw[0];
			lw += 1;
		}
		if (8 & offset) {
			sum += (u_int64_t) lw[0] + lw[1];
			lw += 2;
		}
		if (16 & offset) {
			sum += (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3];
			lw += 4;
		}
	}
#endif
	/*
	 * access prefilling to start load of next cache line.
	 * then add current cache line
	 * save result of prefilling for loop iteration.
	 */
	prefilled = lw[0];
	while ((len -= 32) >= 4) {
		u_int64_t prefilling = lw[8];
		sum += prefilled + lw[1] + lw[2] + lw[3]
			+ lw[4] + lw[5] + lw[6] + lw[7];
		lw += 8;
		prefilled = prefilling;
	}
	if (len >= 0) {
		/* Exactly one full 32-byte chunk remains (no prefetch). */
		sum += prefilled + lw[1] + lw[2] + lw[3]
			+ lw[4] + lw[5] + lw[6] + lw[7];
		lw += 8;
	} else {
		len += 32;
	}
	/* Mop up remaining 16-byte and 4-byte groups, then partial word. */
	while ((len -= 16) >= 0) {
		sum += (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3];
		lw += 4;
	}
	len += 16;
	while ((len -= 4) >= 0) {
		sum += (u_int64_t) *lw++;
	}
	len += 4;
	if (len > 0)
		sum += (u_int64_t) (in_masks[len] & *lw);
	REDUCE32;
	return sum;
}

/*
 * One's-complement addition of two 16-bit words, with end-around carry.
 */
u_short
in_addword(u_short a, u_short b)
{
	u_int64_t sum = a + b;

	ADDCARRY(sum);
	return (sum);
}

/*
 * Sum the three 32-bit pseudo-header fields (typically src/dst address and
 * length+protocol) and fold to a 16-bit one's-complement sum.
 */
u_short
in_pseudo(u_int32_t a, u_int32_t b, u_int32_t c)
{
	u_int64_t sum;
	union q_util q_util;
	union l_util l_util;

	sum = (u_int64_t) a + b + c;
	REDUCE16;
	return (sum);
}

/*
 * Compute the Internet checksum over len bytes of the mbuf chain m,
 * ignoring the first skip bytes.  Returns the complemented 16-bit sum.
 *
 * clen tracks the running byte count of data already summed; when an mbuf
 * starts at the opposite byte parity ((clen ^ addr) & 1), its partial sum
 * must be byte-swapped, which for a one's-complement sum is achieved by
 * shifting the 32-bit-folded partial left by 8 before accumulating.
 */
u_short
in_cksum_skip(struct mbuf *m, int len, int skip)
{
	u_int64_t sum = 0;
	int mlen = 0;
	int clen = 0;
	caddr_t addr;
	union q_util q_util;
	union l_util l_util;

	len -= skip;
	/* Walk past the first skip bytes of the chain. */
	for (; skip && m; m = m->m_next) {
		if (m->m_len > skip) {
			mlen = m->m_len - skip;
			addr = mtod(m, caddr_t) + skip;
			goto skip_start;
		} else {
			skip -= m->m_len;
		}
	}

	for (; m && len; m = m->m_next) {
		if (m->m_len == 0)
			continue;
		mlen = m->m_len;
		addr = mtod(m, caddr_t);
skip_start:
		if (len < mlen)
			mlen = len;
		if ((clen ^ (long) addr) & 1)
			sum += in_cksumdata((const u_int32_t *)addr, mlen) << 8;
		else
			sum += in_cksumdata((const u_int32_t *)addr, mlen);

		clen += mlen;
		len -= mlen;
	}
	REDUCE16;
	return (~sum & 0xffff);
}

/*
 * Checksum of a (possibly option-bearing is NOT handled here: exactly
 * sizeof(struct ip) bytes are summed) IP header.  Returns the complemented
 * 16-bit sum; zero means the header checksum verifies.
 */
u_int in_cksum_hdr(const struct ip *ip)
{
	u_int64_t sum = in_cksumdata((const u_int32_t *)ip, sizeof(struct ip));
	union q_util q_util;
	union l_util l_util;

	REDUCE16;
	return (~sum & 0xffff);
}
#else

/*
 * These asm statements require __volatile because they pass information
 * via the condition codes.  GCC does not currently provide a way to specify
 * the condition codes as an input or output operand.
 *
 * The LOAD macro below is effectively a prefetch into cache.  GCC will
 * load the value into a register but will not use it.  Since modern CPUs
 * reorder operations, this will generally take place in parallel with
 * other calculations.
 */
/*
 * Compute the Internet checksum over len bytes of the mbuf chain m,
 * ignoring the first skip bytes.  Returns the complemented 16-bit sum.
 *
 * This version accumulates 32 bits at a time with add-with-carry (adcl)
 * inline assembly.  su/byte_swapped/mlen == -1 implement the protocol for
 * a 16-bit word that spans two mbufs or starts on an odd address: the
 * dangling byte is saved in su.c[0] and completed when its partner byte
 * arrives, with the running sum byte-swapped (sum <<= 8, then REDUCEd)
 * while parity is off.
 */
u_short
in_cksum_skip(m, len, skip)
	struct mbuf *m;
	int len;
	int skip;
{
	u_short *w;
	unsigned sum = 0;
	int mlen = 0;
	int byte_swapped = 0;
	union { char	c[2]; u_short	s; } su;

	len -= skip;
	/* Walk past the first skip bytes of the chain. */
	for (; skip && m; m = m->m_next) {
		if (m->m_len > skip) {
			mlen = m->m_len - skip;
			w = (u_short *)(mtod(m, u_char *) + skip);
			goto skip_start;
		} else {
			skip -= m->m_len;
		}
	}

	for (; m && len; m = m->m_next) {
		if (m->m_len == 0)
			continue;
		w = mtod(m, u_short *);
		if (mlen == -1) {
			/*
			 * The first byte of this mbuf is the continuation
			 * of a word spanning between this mbuf and the
			 * last mbuf.
			 */

			/* su.c[0] is already saved when scanning previous
			 * mbuf.  sum was REDUCEd when we found mlen == -1
			 */
			su.c[1] = *(u_char *)w;
			sum += su.s;
			w = (u_short *)((char *)w + 1);
			mlen = m->m_len - 1;
			len--;
		} else
			mlen = m->m_len;
skip_start:
		if (len < mlen)
			mlen = len;
		len -= mlen;
		/*
		 * Force to long boundary so we do longword aligned
		 * memory operations
		 */
		if (3 & (int) w) {
			REDUCE;
			if ((1 & (int) w) && (mlen > 0)) {
				/* Odd address: byte-swap sum until parity
				 * is restored; stash the odd byte. */
				sum <<= 8;
				su.c[0] = *(char *)w;
				w = (u_short *)((char *)w + 1);
				mlen--;
				byte_swapped = 1;
			}
			if ((2 & (int) w) && (mlen >= 2)) {
				sum += *w++;
				mlen -= 2;
			}
		}
		/*
		 * Advance to a 486 cache line boundary.
		 */
		if (4 & (int) w && mlen >= 4) {
			__asm __volatile (
				"addl %1, %0\n"
				"adcl $0, %0"
				: "+r" (sum)
				: "g" (((const u_int32_t *)w)[0])
			);
			w += 2;
			mlen -= 4;
		}
		if (8 & (int) w && mlen >= 8) {
			__asm __volatile (
				"addl %1, %0\n"
				"adcl %2, %0\n"
				"adcl $0, %0"
				: "+r" (sum)
				: "g" (((const u_int32_t *)w)[0]),
				  "g" (((const u_int32_t *)w)[1])
			);
			w += 4;
			mlen -= 8;
		}
		/*
		 * Do as much of the checksum as possible 32 bits at a time.
		 * In fact, this loop is unrolled to make overhead from
		 * branches &c small.
		 */
		mlen -= 1;
		while ((mlen -= 32) >= 0) {
			/*
			 * Add with carry 16 words and fold in the last
			 * carry by adding a 0 with carry.
			 *
			 * The early ADD(16) and the LOAD(32) are to load
			 * the next 2 cache lines in advance on 486's.  The
			 * 486 has a penalty of 2 clock cycles for loading
			 * a cache line, plus whatever time the external
			 * memory takes to load the first word(s) addressed.
			 * These penalties are unavoidable.  Subsequent
			 * accesses to a cache line being loaded (and to
			 * other external memory?) are delayed until the
			 * whole load finishes.  These penalties are mostly
			 * avoided by not accessing external memory for
			 * 8 cycles after the ADD(16) and 12 cycles after
			 * the LOAD(32).  The loop terminates when mlen
			 * is initially 33 (not 32) to guarantee that
			 * the LOAD(32) is within bounds.
			 */
			__asm __volatile (
				"addl %1, %0\n"
				"adcl %2, %0\n"
				"adcl %3, %0\n"
				"adcl %4, %0\n"
				"adcl %5, %0\n"
				"mov  %6, %%eax\n"
				"adcl %7, %0\n"
				"adcl %8, %0\n"
				"adcl %9, %0\n"
				"adcl $0, %0"
				: "+r" (sum)
				: "g" (((const u_int32_t *)w)[4]),
				  "g" (((const u_int32_t *)w)[0]),
				  "g" (((const u_int32_t *)w)[1]),
				  "g" (((const u_int32_t *)w)[2]),
				  "g" (((const u_int32_t *)w)[3]),
				  "g" (((const u_int32_t *)w)[8]),
				  "g" (((const u_int32_t *)w)[5]),
				  "g" (((const u_int32_t *)w)[6]),
				  "g" (((const u_int32_t *)w)[7])
				: "eax"
			);
			w += 16;
		}
		mlen += 32 + 1;
		if (mlen >= 32) {
			__asm __volatile (
				"addl %1, %0\n"
				"adcl %2, %0\n"
				"adcl %3, %0\n"
				"adcl %4, %0\n"
				"adcl %5, %0\n"
				"adcl %6, %0\n"
				"adcl %7, %0\n"
				"adcl %8, %0\n"
				"adcl $0, %0"
				: "+r" (sum)
				: "g" (((const u_int32_t *)w)[4]),
				  "g" (((const u_int32_t *)w)[0]),
				  "g" (((const u_int32_t *)w)[1]),
				  "g" (((const u_int32_t *)w)[2]),
				  "g" (((const u_int32_t *)w)[3]),
				  "g" (((const u_int32_t *)w)[5]),
				  "g" (((const u_int32_t *)w)[6]),
				  "g" (((const u_int32_t *)w)[7])
			);
			w += 16;
			mlen -= 32;
		}
		if (mlen >= 16) {
			__asm __volatile (
				"addl %1, %0\n"
				"adcl %2, %0\n"
				"adcl %3, %0\n"
				"adcl %4, %0\n"
				"adcl $0, %0"
				: "+r" (sum)
				: "g" (((const u_int32_t *)w)[0]),
				  "g" (((const u_int32_t *)w)[1]),
				  "g" (((const u_int32_t *)w)[2]),
				  "g" (((const u_int32_t *)w)[3])
			);
			w += 8;
			mlen -= 16;
		}
		if (mlen >= 8) {
			__asm __volatile (
				"addl %1, %0\n"
				"adcl %2, %0\n"
				"adcl $0, %0"
				: "+r" (sum)
				: "g" (((const u_int32_t *)w)[0]),
				  "g" (((const u_int32_t *)w)[1])
			);
			w += 4;
			mlen -= 8;
		}
		if (mlen == 0 && byte_swapped == 0)
			continue;	/* worth 1% maybe ?? */
		REDUCE;
		while ((mlen -= 2) >= 0) {
			sum += *w++;
		}
		if (byte_swapped) {
			/* Restore normal byte order; complete the pending
			 * odd byte if this mbuf also ended on one. */
			sum <<= 8;
			byte_swapped = 0;
			if (mlen == -1) {
				su.c[1] = *(char *)w;
				sum += su.s;
				mlen = 0;
			} else
				mlen = -1;
		} else if (mlen == -1)
			/*
			 * This mbuf has odd number of bytes.
			 * There could be a word split between
			 * this mbuf and the next mbuf.
			 * Save the last byte (to prepend to next mbuf).
			 */
			su.c[0] = *(char *)w;
	}

	if (len)
		printf("%s: out of data by %d\n", __func__, len);
	if (mlen == -1) {
		/* The last mbuf has odd # of bytes.  Follow the
		   standard (the odd byte is shifted left by 8 bits) */
		su.c[1] = 0;
		sum += su.s;
	}
	REDUCE;
	return (~sum & 0xffff);
}
#endif