in_cksum.c revision 28270
14Srgrimes/*- 24Srgrimes * Copyright (c) 1990 The Regents of the University of California. 34Srgrimes * All rights reserved. 44Srgrimes * 54Srgrimes * Redistribution and use in source and binary forms, with or without 64Srgrimes * modification, are permitted provided that the following conditions 74Srgrimes * are met: 84Srgrimes * 1. Redistributions of source code must retain the above copyright 94Srgrimes * notice, this list of conditions and the following disclaimer. 104Srgrimes * 2. Redistributions in binary form must reproduce the above copyright 114Srgrimes * notice, this list of conditions and the following disclaimer in the 124Srgrimes * documentation and/or other materials provided with the distribution. 134Srgrimes * 3. All advertising materials mentioning features or use of this software 144Srgrimes * must display the following acknowledgement: 154Srgrimes * This product includes software developed by the University of 164Srgrimes * California, Berkeley and its contributors. 174Srgrimes * 4. Neither the name of the University nor the names of its contributors 184Srgrimes * may be used to endorse or promote products derived from this software 194Srgrimes * without specific prior written permission. 204Srgrimes * 214Srgrimes * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 224Srgrimes * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 234Srgrimes * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 244Srgrimes * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 254Srgrimes * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 264Srgrimes * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 274Srgrimes * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 284Srgrimes * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 294Srgrimes * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 304Srgrimes * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 314Srgrimes * SUCH DAMAGE. 324Srgrimes * 33620Srgrimes * from tahoe: in_cksum.c 1.2 86/01/05 34620Srgrimes * from: @(#)in_cksum.c 1.3 (Berkeley) 1/19/91 3528270Swollman * $Id: in_cksum.c,v 1.10 1997/02/22 09:32:20 peter Exp $ 364Srgrimes */ 374Srgrimes 382056Swollman#include <sys/param.h> 392056Swollman#include <sys/systm.h> 402056Swollman#include <sys/mbuf.h> 4128270Swollman#include <sys/socket.h> 424Srgrimes 4328270Swollman#include <netinet/in.h> 4428270Swollman#include <netinet/in_systm.h> 4528270Swollman#include <netinet/ip.h> 4612607Sbde 4728270Swollman#include <machine/in_cksum.h> 4828270Swollman 494Srgrimes/* 504Srgrimes * Checksum routine for Internet Protocol family headers. 514Srgrimes * 524Srgrimes * This routine is very heavily used in the network 534Srgrimes * code and should be modified for each CPU to be as fast as possible. 548876Srgrimes * 554Srgrimes * This implementation is 386 version. 564Srgrimes */ 574Srgrimes 584Srgrimes#undef ADDCARRY 594Srgrimes#define ADDCARRY(x) if ((x) > 0xffff) (x) -= 0xffff 604Srgrimes#define REDUCE {sum = (sum & 0xffff) + (sum >> 16); ADDCARRY(sum);} 614Srgrimes 624Srgrimes/* 634Srgrimes * Thanks to gcc we don't have to guess 644Srgrimes * which registers contain sum & w. 654Srgrimes */ 661247Sdg#define ADD(n) asm("addl " #n "(%2), %0" : "=r" (sum) : "0" (sum), "r" (w)) 671247Sdg#define ADDC(n) asm("adcl " #n "(%2), %0" : "=r" (sum) : "0" (sum), "r" (w)) 681247Sdg#define LOAD(n) asm volatile("movb " #n "(%1), %0" : "=r" (junk) : "r" (w)) 691247Sdg#define MOP asm("adcl $0, %0" : "=r" (sum) : "0" (sum)) 704Srgrimes 71798Swollmanint 724Srgrimesin_cksum(m, len) 734Srgrimes register struct mbuf *m; 744Srgrimes register int len; 754Srgrimes{ 764Srgrimes register u_short *w; 774Srgrimes register unsigned sum = 0; 784Srgrimes register int mlen = 0; 794Srgrimes int byte_swapped = 0; 804Srgrimes union { char c[2]; u_short s; } su; 814Srgrimes 824Srgrimes for (;m && len; m = m->m_next) { 834Srgrimes if (m->m_len == 0) 844Srgrimes continue; 854Srgrimes w = mtod(m, u_short *); 864Srgrimes if (mlen == -1) { 874Srgrimes /* 884Srgrimes * The first byte of this mbuf is the continuation 894Srgrimes * of a word spanning between this mbuf and the 904Srgrimes * last mbuf. 914Srgrimes */ 924Srgrimes 938876Srgrimes /* su.c[0] is already saved when scanning previous 944Srgrimes * mbuf. sum was REDUCEd when we found mlen == -1 954Srgrimes */ 964Srgrimes su.c[1] = *(u_char *)w; 974Srgrimes sum += su.s; 984Srgrimes w = (u_short *)((char *)w + 1); 994Srgrimes mlen = m->m_len - 1; 1004Srgrimes len--; 1014Srgrimes } else 1024Srgrimes mlen = m->m_len; 1034Srgrimes if (len < mlen) 1044Srgrimes mlen = len; 1054Srgrimes len -= mlen; 1064Srgrimes /* 1074Srgrimes * Force to long boundary so we do longword aligned 1084Srgrimes * memory operations 1094Srgrimes */ 1104Srgrimes if (3 & (int) w) { 1114Srgrimes REDUCE; 1124Srgrimes if ((1 & (int) w) && (mlen > 0)) { 1134Srgrimes sum <<= 8; 1144Srgrimes su.c[0] = *(char *)w; 1154Srgrimes w = (u_short *)((char *)w + 1); 1164Srgrimes mlen--; 1174Srgrimes byte_swapped = 1; 1184Srgrimes } 1194Srgrimes if ((2 & (int) w) && (mlen >= 2)) { 1204Srgrimes sum += *w++; 1214Srgrimes mlen -= 2; 1224Srgrimes } 1234Srgrimes } 1244Srgrimes /* 1251247Sdg * Advance to a 486 cache line boundary. 1261247Sdg */ 1271247Sdg if (4 & (int) w && mlen >= 4) { 1281247Sdg ADD(0); 1291247Sdg MOP; 1301247Sdg w += 2; 1311247Sdg mlen -= 4; 1321247Sdg } 1331247Sdg if (8 & (int) w && mlen >= 8) { 1341247Sdg ADD(0); 1351247Sdg ADDC(4); 1361247Sdg MOP; 1371247Sdg w += 4; 1381247Sdg mlen -= 8; 1391247Sdg } 1401247Sdg /* 1414Srgrimes * Do as much of the checksum as possible 32 bits at at time. 1424Srgrimes * In fact, this loop is unrolled to make overhead from 1434Srgrimes * branches &c small. 1444Srgrimes */ 1451247Sdg mlen -= 1; 1464Srgrimes while ((mlen -= 32) >= 0) { 1471247Sdg u_char junk; 1484Srgrimes /* 1491247Sdg * Add with carry 16 words and fold in the last 1501247Sdg * carry by adding a 0 with carry. 1511247Sdg * 1521247Sdg * The early ADD(16) and the LOAD(32) are to load 1531247Sdg * the next 2 cache lines in advance on 486's. The 1541247Sdg * 486 has a penalty of 2 clock cycles for loading 1551247Sdg * a cache line, plus whatever time the external 1561247Sdg * memory takes to load the first word(s) addressed. 1571247Sdg * These penalties are unavoidable. Subsequent 1581247Sdg * accesses to a cache line being loaded (and to 1598876Srgrimes * other external memory?) are delayed until the 1601247Sdg * whole load finishes. These penalties are mostly 1611247Sdg * avoided by not accessing external memory for 1621247Sdg * 8 cycles after the ADD(16) and 12 cycles after 1631247Sdg * the LOAD(32). The loop terminates when mlen 1641247Sdg * is initially 33 (not 32) to guaranteed that 1651247Sdg * the LOAD(32) is within bounds. 1664Srgrimes */ 1671247Sdg ADD(16); 1681247Sdg ADDC(0); 1691247Sdg ADDC(4); 1701247Sdg ADDC(8); 1711247Sdg ADDC(12); 1721247Sdg LOAD(32); 1731247Sdg ADDC(20); 1741247Sdg ADDC(24); 1751247Sdg ADDC(28); 1761247Sdg MOP; 1771247Sdg w += 16; 1784Srgrimes } 1791247Sdg mlen += 32 + 1; 1801247Sdg if (mlen >= 32) { 1811247Sdg ADD(16); 1821247Sdg ADDC(0); 1831247Sdg ADDC(4); 1841247Sdg ADDC(8); 1851247Sdg ADDC(12); 1861247Sdg ADDC(20); 1871247Sdg ADDC(24); 1881247Sdg ADDC(28); 1894Srgrimes MOP; 1901247Sdg w += 16; 1911247Sdg mlen -= 32; 1921247Sdg } 1931247Sdg if (mlen >= 16) { 1941247Sdg ADD(0); 1951247Sdg ADDC(4); 1961247Sdg ADDC(8); 1971247Sdg ADDC(12); 1981247Sdg MOP; 1991247Sdg w += 8; 2001247Sdg mlen -= 16; 2011247Sdg } 2021247Sdg if (mlen >= 8) { 2031247Sdg ADD(0); 2041247Sdg ADDC(4); 2051247Sdg MOP; 2064Srgrimes w += 4; 2071247Sdg mlen -= 8; 2084Srgrimes } 2094Srgrimes if (mlen == 0 && byte_swapped == 0) 2104Srgrimes continue; /* worth 1% maybe ?? */ 2114Srgrimes REDUCE; 2124Srgrimes while ((mlen -= 2) >= 0) { 2134Srgrimes sum += *w++; 2144Srgrimes } 2154Srgrimes if (byte_swapped) { 2164Srgrimes sum <<= 8; 2174Srgrimes byte_swapped = 0; 2184Srgrimes if (mlen == -1) { 2194Srgrimes su.c[1] = *(char *)w; 2204Srgrimes sum += su.s; 2214Srgrimes mlen = 0; 2224Srgrimes } else 2234Srgrimes mlen = -1; 2244Srgrimes } else if (mlen == -1) 2254Srgrimes /* 2264Srgrimes * This mbuf has odd number of bytes. 2274Srgrimes * There could be a word split betwen 2284Srgrimes * this mbuf and the next mbuf. 2294Srgrimes * Save the last byte (to prepend to next mbuf). 2304Srgrimes */ 2314Srgrimes su.c[0] = *(char *)w; 2324Srgrimes } 2334Srgrimes 2344Srgrimes if (len) 2354Srgrimes printf("cksum: out of data\n"); 2364Srgrimes if (mlen == -1) { 2374Srgrimes /* The last mbuf has odd # of bytes. Follow the 2384Srgrimes standard (the odd byte is shifted left by 8 bits) */ 2394Srgrimes su.c[1] = 0; 2404Srgrimes sum += su.s; 2414Srgrimes } 2424Srgrimes REDUCE; 2434Srgrimes return (~sum & 0xffff); 2444Srgrimes} 24528270Swollman 24628270Swollman/* 24728270Swollman * This is the exact same algorithm as above with a few exceptions: 24828270Swollman * (1) it is designed to operate on buffers, not mbufs 24928270Swollman * (2) it returns an intermediate form of the sum which has to be 25028270Swollman * explicitly finalized (but this can be delayed) 25128270Swollman * (3) it accepts an intermediate sum 25228270Swollman * 25328270Swollman * This is particularly useful when building packets quickly, 25428270Swollman * since one can compute the checksum of the pseudoheader ahead of 25528270Swollman * time and then use this function to complete the work. That way, 25628270Swollman * the pseudoheader never actually has to exist in the packet buffer, 25728270Swollman * which avoids needless duplication of work. 25828270Swollman */ 25928270Swollmanin_psum_t 26028270Swollmanin_cksum_partial(psum, w, len) 26128270Swollman in_psum_t psum; 26228270Swollman const u_short *w; 26328270Swollman int len; 26428270Swollman{ 26528270Swollman register in_psum_t sum = psum; 26628270Swollman int byte_swapped = 0; 26728270Swollman union { char c[2]; u_short s; } su; 26828270Swollman 26928270Swollman /* 27028270Swollman * Force to long boundary so we do longword aligned 27128270Swollman * memory operations 27228270Swollman */ 27328270Swollman if (3 & (int) w) { 27428270Swollman REDUCE; 27528270Swollman if ((1 & (int) w) && (len > 0)) { 27628270Swollman sum <<= 8; 27728270Swollman su.c[0] = *(char *)w; 27828270Swollman w = (u_short *)((char *)w + 1); 27928270Swollman len--; 28028270Swollman byte_swapped = 1; 28128270Swollman } 28228270Swollman if ((2 & (int) w) && (len >= 2)) { 28328270Swollman sum += *w++; 28428270Swollman len -= 2; 28528270Swollman } 28628270Swollman } 28728270Swollman /* 28828270Swollman * Advance to a 486 cache line boundary. 28928270Swollman */ 29028270Swollman if (4 & (int) w && len >= 4) { 29128270Swollman ADD(0); 29228270Swollman MOP; 29328270Swollman w += 2; 29428270Swollman len -= 4; 29528270Swollman } 29628270Swollman if (8 & (int) w && len >= 8) { 29728270Swollman ADD(0); 29828270Swollman ADDC(4); 29928270Swollman MOP; 30028270Swollman w += 4; 30128270Swollman len -= 8; 30228270Swollman } 30328270Swollman /* 30428270Swollman * Do as much of the checksum as possible 32 bits at at time. 30528270Swollman * In fact, this loop is unrolled to make overhead from 30628270Swollman * branches &c small. 30728270Swollman */ 30828270Swollman len -= 1; 30928270Swollman while ((len -= 32) >= 0) { 31028270Swollman u_char junk; 31128270Swollman /* 31228270Swollman * Add with carry 16 words and fold in the last 31328270Swollman * carry by adding a 0 with carry. 31428270Swollman * 31528270Swollman * The early ADD(16) and the LOAD(32) are to load 31628270Swollman * the next 2 cache lines in advance on 486's. The 31728270Swollman * 486 has a penalty of 2 clock cycles for loading 31828270Swollman * a cache line, plus whatever time the external 31928270Swollman * memory takes to load the first word(s) addressed. 32028270Swollman * These penalties are unavoidable. Subsequent 32128270Swollman * accesses to a cache line being loaded (and to 32228270Swollman * other external memory?) are delayed until the 32328270Swollman * whole load finishes. These penalties are mostly 32428270Swollman * avoided by not accessing external memory for 32528270Swollman * 8 cycles after the ADD(16) and 12 cycles after 32628270Swollman * the LOAD(32). The loop terminates when len 32728270Swollman * is initially 33 (not 32) to guaranteed that 32828270Swollman * the LOAD(32) is within bounds. 32928270Swollman */ 33028270Swollman ADD(16); 33128270Swollman ADDC(0); 33228270Swollman ADDC(4); 33328270Swollman ADDC(8); 33428270Swollman ADDC(12); 33528270Swollman LOAD(32); 33628270Swollman ADDC(20); 33728270Swollman ADDC(24); 33828270Swollman ADDC(28); 33928270Swollman MOP; 34028270Swollman w += 16; 34128270Swollman } 34228270Swollman len += 32 + 1; 34328270Swollman if (len >= 32) { 34428270Swollman ADD(16); 34528270Swollman ADDC(0); 34628270Swollman ADDC(4); 34728270Swollman ADDC(8); 34828270Swollman ADDC(12); 34928270Swollman ADDC(20); 35028270Swollman ADDC(24); 35128270Swollman ADDC(28); 35228270Swollman MOP; 35328270Swollman w += 16; 35428270Swollman len -= 32; 35528270Swollman } 35628270Swollman if (len >= 16) { 35728270Swollman ADD(0); 35828270Swollman ADDC(4); 35928270Swollman ADDC(8); 36028270Swollman ADDC(12); 36128270Swollman MOP; 36228270Swollman w += 8; 36328270Swollman len -= 16; 36428270Swollman } 36528270Swollman if (len >= 8) { 36628270Swollman ADD(0); 36728270Swollman ADDC(4); 36828270Swollman MOP; 36928270Swollman w += 4; 37028270Swollman len -= 8; 37128270Swollman } 37228270Swollman if (len == 0 && byte_swapped == 0) 37328270Swollman goto out; 37428270Swollman REDUCE; 37528270Swollman while ((len -= 2) >= 0) { 37628270Swollman sum += *w++; 37728270Swollman } 37828270Swollman if (byte_swapped) { 37928270Swollman sum <<= 8; 38028270Swollman byte_swapped = 0; 38128270Swollman if (len == -1) { 38228270Swollman su.c[1] = *(char *)w; 38328270Swollman sum += su.s; 38428270Swollman len = 0; 38528270Swollman } else 38628270Swollman len = -1; 38728270Swollman } else if (len == -1) { 38828270Swollman /* 38928270Swollman * This buffer has odd number of bytes. 39028270Swollman * There could be a word split betwen 39128270Swollman * this buffer and the next. 39228270Swollman */ 39328270Swollman su.c[0] = *(char *)w; 39428270Swollman } 39528270Swollmanout: 39628270Swollman if (len == -1) { 39728270Swollman /* The last buffer has odd # of bytes. Follow the 39828270Swollman standard (the odd byte is shifted left by 8 bits) */ 39928270Swollman su.c[1] = 0; 40028270Swollman sum += su.s; 40128270Swollman } 40228270Swollman return sum; 40328270Swollman} 40428270Swollman 40528270Swollmanint 40628270Swollmanin_cksum_finalize(psum) 40728270Swollman in_psum_t psum; 40828270Swollman{ 40928270Swollman in_psum_t sum = psum; 41028270Swollman REDUCE; 41128270Swollman return (sum & 0xffff); 41228270Swollman} 413