/* ip_output.c revision 242079 */
1/*- 2 * Copyright (c) 1982, 1986, 1988, 1990, 1993 3 * The Regents of the University of California. All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 4. Neither the name of the University nor the names of its contributors 14 * may be used to endorse or promote products derived from this software 15 * without specific prior written permission. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 
28 * 29 * @(#)ip_output.c 8.3 (Berkeley) 1/21/94 30 */ 31 32#include <sys/cdefs.h> 33__FBSDID("$FreeBSD: head/sys/netinet/ip_output.c 242079 2012-10-25 09:39:14Z ae $"); 34 35#include "opt_ipfw.h" 36#include "opt_ipsec.h" 37#include "opt_route.h" 38#include "opt_mbuf_stress_test.h" 39#include "opt_mpath.h" 40#include "opt_sctp.h" 41 42#include <sys/param.h> 43#include <sys/systm.h> 44#include <sys/kernel.h> 45#include <sys/malloc.h> 46#include <sys/mbuf.h> 47#include <sys/priv.h> 48#include <sys/proc.h> 49#include <sys/protosw.h> 50#include <sys/socket.h> 51#include <sys/socketvar.h> 52#include <sys/sysctl.h> 53#include <sys/ucred.h> 54 55#include <net/if.h> 56#include <net/if_llatbl.h> 57#include <net/netisr.h> 58#include <net/pfil.h> 59#include <net/route.h> 60#include <net/flowtable.h> 61#ifdef RADIX_MPATH 62#include <net/radix_mpath.h> 63#endif 64#include <net/vnet.h> 65 66#include <netinet/in.h> 67#include <netinet/in_systm.h> 68#include <netinet/ip.h> 69#include <netinet/in_pcb.h> 70#include <netinet/in_var.h> 71#include <netinet/ip_var.h> 72#include <netinet/ip_options.h> 73#ifdef SCTP 74#include <netinet/sctp.h> 75#include <netinet/sctp_crc32.h> 76#endif 77 78#ifdef IPSEC 79#include <netinet/ip_ipsec.h> 80#include <netipsec/ipsec.h> 81#endif /* IPSEC*/ 82 83#include <machine/in_cksum.h> 84 85#include <security/mac/mac_framework.h> 86 87VNET_DEFINE(u_short, ip_id); 88 89#ifdef MBUF_STRESS_TEST 90static int mbuf_frag_size = 0; 91SYSCTL_INT(_net_inet_ip, OID_AUTO, mbuf_frag_size, CTLFLAG_RW, 92 &mbuf_frag_size, 0, "Fragment outgoing mbufs to this size"); 93#endif 94 95static void ip_mloopback 96 (struct ifnet *, struct mbuf *, struct sockaddr_in *, int); 97 98 99extern int in_mcast_loop; 100extern struct protosw inetsw[]; 101 102/* 103 * IP output. The packet in mbuf chain m contains a skeletal IP 104 * header (with len, off, ttl, proto, tos, src, dst). 105 * The mbuf chain containing the packet will be freed. 106 * The mbuf opt, if present, will not be freed. 
107 * If route ro is present and has ro_rt initialized, route lookup would be 108 * skipped and ro->ro_rt would be used. If ro is present but ro->ro_rt is NULL, 109 * then result of route lookup is stored in ro->ro_rt. 110 * 111 * In the IP forwarding case, the packet will arrive with options already 112 * inserted, so must have a NULL opt pointer. 113 */ 114int 115ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags, 116 struct ip_moptions *imo, struct inpcb *inp) 117{ 118 struct ip *ip; 119 struct ifnet *ifp = NULL; /* keep compiler happy */ 120 struct mbuf *m0; 121 int hlen = sizeof (struct ip); 122 int mtu; 123 int n; /* scratchpad */ 124 int error = 0; 125 struct sockaddr_in *dst; 126 struct in_ifaddr *ia; 127 int isbroadcast; 128 uint16_t ip_len, ip_off, sw_csum; 129 struct route iproute; 130 struct rtentry *rte; /* cache for ro->ro_rt */ 131 struct in_addr odst; 132 struct m_tag *fwd_tag = NULL; 133#ifdef IPSEC 134 int no_route_but_check_spd = 0; 135#endif 136 M_ASSERTPKTHDR(m); 137 138 if (inp != NULL) { 139 INP_LOCK_ASSERT(inp); 140 M_SETFIB(m, inp->inp_inc.inc_fibnum); 141 if (inp->inp_flags & (INP_HW_FLOWID|INP_SW_FLOWID)) { 142 m->m_pkthdr.flowid = inp->inp_flowid; 143 m->m_flags |= M_FLOWID; 144 } 145 } 146 147 if (ro == NULL) { 148 ro = &iproute; 149 bzero(ro, sizeof (*ro)); 150 } 151 152#ifdef FLOWTABLE 153 if (ro->ro_rt == NULL) { 154 struct flentry *fle; 155 156 /* 157 * The flow table returns route entries valid for up to 30 158 * seconds; we rely on the remainder of ip_output() taking no 159 * longer than that long for the stability of ro_rt. The 160 * flow ID assignment must have happened before this point. 
161 */ 162 fle = flowtable_lookup_mbuf(V_ip_ft, m, AF_INET); 163 if (fle != NULL) 164 flow_to_route(fle, ro); 165 } 166#endif 167 168 if (opt) { 169 int len = 0; 170 m = ip_insertoptions(m, opt, &len); 171 if (len != 0) 172 hlen = len; /* ip->ip_hl is updated above */ 173 } 174 ip = mtod(m, struct ip *); 175 ip_len = ntohs(ip->ip_len); 176 ip_off = ntohs(ip->ip_off); 177 178 /* 179 * Fill in IP header. If we are not allowing fragmentation, 180 * then the ip_id field is meaningless, but we don't set it 181 * to zero. Doing so causes various problems when devices along 182 * the path (routers, load balancers, firewalls, etc.) illegally 183 * disable DF on our packet. Note that a 16-bit counter 184 * will wrap around in less than 10 seconds at 100 Mbit/s on a 185 * medium with MTU 1500. See Steven M. Bellovin, "A Technique 186 * for Counting NATted Hosts", Proc. IMW'02, available at 187 * <http://www.cs.columbia.edu/~smb/papers/fnat.pdf>. 188 */ 189 if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) { 190 ip->ip_v = IPVERSION; 191 ip->ip_hl = hlen >> 2; 192 ip->ip_id = ip_newid(); 193 IPSTAT_INC(ips_localout); 194 } else { 195 /* Header already set, fetch hlen from there */ 196 hlen = ip->ip_hl << 2; 197 } 198 199 dst = (struct sockaddr_in *)&ro->ro_dst; 200again: 201 ia = NULL; 202 /* 203 * If there is a cached route, 204 * check that it is to the same destination 205 * and is still up. If not, free it and try again. 206 * The address family should also be checked in case of sharing the 207 * cache with IPv6. 
208 */ 209 rte = ro->ro_rt; 210 if (rte && ((rte->rt_flags & RTF_UP) == 0 || 211 rte->rt_ifp == NULL || 212 !RT_LINK_IS_UP(rte->rt_ifp) || 213 dst->sin_family != AF_INET || 214 dst->sin_addr.s_addr != ip->ip_dst.s_addr)) { 215 RO_RTFREE(ro); 216 ro->ro_lle = NULL; 217 rte = NULL; 218 } 219 if (rte == NULL && fwd_tag == NULL) { 220 bzero(dst, sizeof(*dst)); 221 dst->sin_family = AF_INET; 222 dst->sin_len = sizeof(*dst); 223 dst->sin_addr = ip->ip_dst; 224 } 225 /* 226 * If routing to interface only, short circuit routing lookup. 227 * The use of an all-ones broadcast address implies this; an 228 * interface is specified by the broadcast address of an interface, 229 * or the destination address of a ptp interface. 230 */ 231 if (flags & IP_SENDONES) { 232 if ((ia = ifatoia(ifa_ifwithbroadaddr(sintosa(dst)))) == NULL && 233 (ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == NULL) { 234 IPSTAT_INC(ips_noroute); 235 error = ENETUNREACH; 236 goto bad; 237 } 238 ip->ip_dst.s_addr = INADDR_BROADCAST; 239 dst->sin_addr = ip->ip_dst; 240 ifp = ia->ia_ifp; 241 ip->ip_ttl = 1; 242 isbroadcast = 1; 243 } else if (flags & IP_ROUTETOIF) { 244 if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == NULL && 245 (ia = ifatoia(ifa_ifwithnet(sintosa(dst), 0))) == NULL) { 246 IPSTAT_INC(ips_noroute); 247 error = ENETUNREACH; 248 goto bad; 249 } 250 ifp = ia->ia_ifp; 251 ip->ip_ttl = 1; 252 isbroadcast = in_broadcast(dst->sin_addr, ifp); 253 } else if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) && 254 imo != NULL && imo->imo_multicast_ifp != NULL) { 255 /* 256 * Bypass the normal routing lookup for multicast 257 * packets if the interface is specified. 258 */ 259 ifp = imo->imo_multicast_ifp; 260 IFP_TO_IA(ifp, ia); 261 isbroadcast = 0; /* fool gcc */ 262 } else { 263 /* 264 * We want to do any cloning requested by the link layer, 265 * as this is probably required in all cases for correct 266 * operation (as it is for ARP). 
267 */ 268 if (rte == NULL) { 269#ifdef RADIX_MPATH 270 rtalloc_mpath_fib(ro, 271 ntohl(ip->ip_src.s_addr ^ ip->ip_dst.s_addr), 272 inp ? inp->inp_inc.inc_fibnum : M_GETFIB(m)); 273#else 274 in_rtalloc_ign(ro, 0, 275 inp ? inp->inp_inc.inc_fibnum : M_GETFIB(m)); 276#endif 277 rte = ro->ro_rt; 278 } 279 if (rte == NULL || 280 rte->rt_ifp == NULL || 281 !RT_LINK_IS_UP(rte->rt_ifp)) { 282#ifdef IPSEC 283 /* 284 * There is no route for this packet, but it is 285 * possible that a matching SPD entry exists. 286 */ 287 no_route_but_check_spd = 1; 288 mtu = 0; /* Silence GCC warning. */ 289 goto sendit; 290#endif 291 IPSTAT_INC(ips_noroute); 292 error = EHOSTUNREACH; 293 goto bad; 294 } 295 ia = ifatoia(rte->rt_ifa); 296 ifa_ref(&ia->ia_ifa); 297 ifp = rte->rt_ifp; 298 rte->rt_rmx.rmx_pksent++; 299 if (rte->rt_flags & RTF_GATEWAY) 300 dst = (struct sockaddr_in *)rte->rt_gateway; 301 if (rte->rt_flags & RTF_HOST) 302 isbroadcast = (rte->rt_flags & RTF_BROADCAST); 303 else 304 isbroadcast = in_broadcast(dst->sin_addr, ifp); 305 } 306 /* 307 * Calculate MTU. If we have a route that is up, use that, 308 * otherwise use the interface's MTU. 309 */ 310 if (rte != NULL && (rte->rt_flags & (RTF_UP|RTF_HOST))) { 311 /* 312 * This case can happen if the user changed the MTU 313 * of an interface after enabling IP on it. Because 314 * most netifs don't keep track of routes pointing to 315 * them, there is no way for one to update all its 316 * routes when the MTU is changed. 317 */ 318 if (rte->rt_rmx.rmx_mtu > ifp->if_mtu) 319 rte->rt_rmx.rmx_mtu = ifp->if_mtu; 320 mtu = rte->rt_rmx.rmx_mtu; 321 } else { 322 mtu = ifp->if_mtu; 323 } 324 /* Catch a possible divide by zero later. */ 325 KASSERT(mtu > 0, ("%s: mtu %d <= 0, rte=%p (rt_flags=0x%08x) ifp=%p", 326 __func__, mtu, rte, (rte != NULL) ? rte->rt_flags : 0, ifp)); 327 if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { 328 m->m_flags |= M_MCAST; 329 /* 330 * IP destination address is multicast. 
Make sure "dst" 331 * still points to the address in "ro". (It may have been 332 * changed to point to a gateway address, above.) 333 */ 334 dst = (struct sockaddr_in *)&ro->ro_dst; 335 /* 336 * See if the caller provided any multicast options 337 */ 338 if (imo != NULL) { 339 ip->ip_ttl = imo->imo_multicast_ttl; 340 if (imo->imo_multicast_vif != -1) 341 ip->ip_src.s_addr = 342 ip_mcast_src ? 343 ip_mcast_src(imo->imo_multicast_vif) : 344 INADDR_ANY; 345 } else 346 ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL; 347 /* 348 * Confirm that the outgoing interface supports multicast. 349 */ 350 if ((imo == NULL) || (imo->imo_multicast_vif == -1)) { 351 if ((ifp->if_flags & IFF_MULTICAST) == 0) { 352 IPSTAT_INC(ips_noroute); 353 error = ENETUNREACH; 354 goto bad; 355 } 356 } 357 /* 358 * If source address not specified yet, use address 359 * of outgoing interface. 360 */ 361 if (ip->ip_src.s_addr == INADDR_ANY) { 362 /* Interface may have no addresses. */ 363 if (ia != NULL) 364 ip->ip_src = IA_SIN(ia)->sin_addr; 365 } 366 367 if ((imo == NULL && in_mcast_loop) || 368 (imo && imo->imo_multicast_loop)) { 369 /* 370 * Loop back multicast datagram if not expressly 371 * forbidden to do so, even if we are not a member 372 * of the group; ip_input() will filter it later, 373 * thus deferring a hash lookup and mutex acquisition 374 * at the expense of a cheap copy using m_copym(). 375 */ 376 ip_mloopback(ifp, m, dst, hlen); 377 } else { 378 /* 379 * If we are acting as a multicast router, perform 380 * multicast forwarding as if the packet had just 381 * arrived on the interface to which we are about 382 * to send. The multicast forwarding function 383 * recursively calls this function, using the 384 * IP_FORWARDING flag to prevent infinite recursion. 385 * 386 * Multicasts that are looped back by ip_mloopback(), 387 * above, will be forwarded by the ip_input() routine, 388 * if necessary. 
389 */ 390 if (V_ip_mrouter && (flags & IP_FORWARDING) == 0) { 391 /* 392 * If rsvp daemon is not running, do not 393 * set ip_moptions. This ensures that the packet 394 * is multicast and not just sent down one link 395 * as prescribed by rsvpd. 396 */ 397 if (!V_rsvp_on) 398 imo = NULL; 399 if (ip_mforward && 400 ip_mforward(ip, ifp, m, imo) != 0) { 401 m_freem(m); 402 goto done; 403 } 404 } 405 } 406 407 /* 408 * Multicasts with a time-to-live of zero may be looped- 409 * back, above, but must not be transmitted on a network. 410 * Also, multicasts addressed to the loopback interface 411 * are not sent -- the above call to ip_mloopback() will 412 * loop back a copy. ip_input() will drop the copy if 413 * this host does not belong to the destination group on 414 * the loopback interface. 415 */ 416 if (ip->ip_ttl == 0 || ifp->if_flags & IFF_LOOPBACK) { 417 m_freem(m); 418 goto done; 419 } 420 421 goto sendit; 422 } 423 424 /* 425 * If the source address is not specified yet, use the address 426 * of the outoing interface. 427 */ 428 if (ip->ip_src.s_addr == INADDR_ANY) { 429 /* Interface may have no addresses. */ 430 if (ia != NULL) { 431 ip->ip_src = IA_SIN(ia)->sin_addr; 432 } 433 } 434 435 /* 436 * Verify that we have any chance at all of being able to queue the 437 * packet or packet fragments, unless ALTQ is enabled on the given 438 * interface in which case packetdrop should be done by queueing. 439 */ 440 n = ip_len / mtu + 1; /* how many fragments ? */ 441 if ( 442#ifdef ALTQ 443 (!ALTQ_IS_ENABLED(&ifp->if_snd)) && 444#endif /* ALTQ */ 445 (ifp->if_snd.ifq_len + n) >= ifp->if_snd.ifq_maxlen ) { 446 error = ENOBUFS; 447 IPSTAT_INC(ips_odropped); 448 ifp->if_snd.ifq_drops += n; 449 goto bad; 450 } 451 452 /* 453 * Look for broadcast address and 454 * verify user is allowed to send 455 * such a packet. 
456 */ 457 if (isbroadcast) { 458 if ((ifp->if_flags & IFF_BROADCAST) == 0) { 459 error = EADDRNOTAVAIL; 460 goto bad; 461 } 462 if ((flags & IP_ALLOWBROADCAST) == 0) { 463 error = EACCES; 464 goto bad; 465 } 466 /* don't allow broadcast messages to be fragmented */ 467 if (ip_len > mtu) { 468 error = EMSGSIZE; 469 goto bad; 470 } 471 m->m_flags |= M_BCAST; 472 } else { 473 m->m_flags &= ~M_BCAST; 474 } 475 476sendit: 477#ifdef IPSEC 478 switch(ip_ipsec_output(&m, inp, &flags, &error)) { 479 case 1: 480 goto bad; 481 case -1: 482 goto done; 483 case 0: 484 default: 485 break; /* Continue with packet processing. */ 486 } 487 /* 488 * Check if there was a route for this packet; return error if not. 489 */ 490 if (no_route_but_check_spd) { 491 IPSTAT_INC(ips_noroute); 492 error = EHOSTUNREACH; 493 goto bad; 494 } 495 /* Update variables that are affected by ipsec4_output(). */ 496 ip = mtod(m, struct ip *); 497 hlen = ip->ip_hl << 2; 498#endif /* IPSEC */ 499 500 /* Jump over all PFIL processing if hooks are not active. */ 501 if (!PFIL_HOOKED(&V_inet_pfil_hook)) 502 goto passout; 503 504 /* Run through list of hooks for output packets. */ 505 odst.s_addr = ip->ip_dst.s_addr; 506 error = pfil_run_hooks(&V_inet_pfil_hook, &m, ifp, PFIL_OUT, inp); 507 if (error != 0 || m == NULL) 508 goto done; 509 510 ip = mtod(m, struct ip *); 511 512 /* See if destination IP address was changed by packet filter. */ 513 if (odst.s_addr != ip->ip_dst.s_addr) { 514 m->m_flags |= M_SKIP_FIREWALL; 515 /* If destination is now ourself drop to ip_input(). 
*/ 516 if (in_localip(ip->ip_dst)) { 517 m->m_flags |= M_FASTFWD_OURS; 518 if (m->m_pkthdr.rcvif == NULL) 519 m->m_pkthdr.rcvif = V_loif; 520 if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { 521 m->m_pkthdr.csum_flags |= 522 CSUM_DATA_VALID | CSUM_PSEUDO_HDR; 523 m->m_pkthdr.csum_data = 0xffff; 524 } 525 m->m_pkthdr.csum_flags |= 526 CSUM_IP_CHECKED | CSUM_IP_VALID; 527#ifdef SCTP 528 if (m->m_pkthdr.csum_flags & CSUM_SCTP) 529 m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID; 530#endif 531 error = netisr_queue(NETISR_IP, m); 532 goto done; 533 } else { 534 if (ia != NULL) 535 ifa_free(&ia->ia_ifa); 536 goto again; /* Redo the routing table lookup. */ 537 } 538 } 539 540 if (V_pfilforward == 0) 541 goto passout; 542 543 /* See if local, if yes, send it to netisr with IP_FASTFWD_OURS. */ 544 if (m->m_flags & M_FASTFWD_OURS) { 545 if (m->m_pkthdr.rcvif == NULL) 546 m->m_pkthdr.rcvif = V_loif; 547 if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { 548 m->m_pkthdr.csum_flags |= 549 CSUM_DATA_VALID | CSUM_PSEUDO_HDR; 550 m->m_pkthdr.csum_data = 0xffff; 551 } 552#ifdef SCTP 553 if (m->m_pkthdr.csum_flags & CSUM_SCTP) 554 m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID; 555#endif 556 m->m_pkthdr.csum_flags |= 557 CSUM_IP_CHECKED | CSUM_IP_VALID; 558 559 error = netisr_queue(NETISR_IP, m); 560 goto done; 561 } 562 /* Or forward to some other address? */ 563 fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL); 564 if (fwd_tag) { 565 dst = (struct sockaddr_in *)&ro->ro_dst; 566 bcopy((fwd_tag+1), dst, sizeof(struct sockaddr_in)); 567 m->m_flags |= M_SKIP_FIREWALL; 568 m_tag_delete(m, fwd_tag); 569 if (ia != NULL) 570 ifa_free(&ia->ia_ifa); 571 goto again; 572 } 573 574passout: 575 /* 127/8 must not appear on wire - RFC1122. 
*/ 576 if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET || 577 (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) { 578 if ((ifp->if_flags & IFF_LOOPBACK) == 0) { 579 IPSTAT_INC(ips_badaddr); 580 error = EADDRNOTAVAIL; 581 goto bad; 582 } 583 } 584 585 m->m_pkthdr.csum_flags |= CSUM_IP; 586 sw_csum = m->m_pkthdr.csum_flags & ~ifp->if_hwassist; 587 if (sw_csum & CSUM_DELAY_DATA) { 588 in_delayed_cksum(m); 589 sw_csum &= ~CSUM_DELAY_DATA; 590 } 591#ifdef SCTP 592 if (sw_csum & CSUM_SCTP) { 593 sctp_delayed_cksum(m, (uint32_t)(ip->ip_hl << 2)); 594 sw_csum &= ~CSUM_SCTP; 595 } 596#endif 597 m->m_pkthdr.csum_flags &= ifp->if_hwassist; 598 599 /* 600 * If small enough for interface, or the interface will take 601 * care of the fragmentation for us, we can just send directly. 602 */ 603 if (ip_len <= mtu || 604 (m->m_pkthdr.csum_flags & ifp->if_hwassist & CSUM_TSO) != 0 || 605 ((ip_off & IP_DF) == 0 && (ifp->if_hwassist & CSUM_FRAGMENT))) { 606 ip->ip_sum = 0; 607 if (sw_csum & CSUM_DELAY_IP) 608 ip->ip_sum = in_cksum(m, hlen); 609 610 /* 611 * Record statistics for this interface address. 612 * With CSUM_TSO the byte/packet count will be slightly 613 * incorrect because we count the IP+TCP headers only 614 * once instead of for every generated packet. 615 */ 616 if (!(flags & IP_FORWARDING) && ia) { 617 if (m->m_pkthdr.csum_flags & CSUM_TSO) 618 ia->ia_ifa.if_opackets += 619 m->m_pkthdr.len / m->m_pkthdr.tso_segsz; 620 else 621 ia->ia_ifa.if_opackets++; 622 ia->ia_ifa.if_obytes += m->m_pkthdr.len; 623 } 624#ifdef MBUF_STRESS_TEST 625 if (mbuf_frag_size && m->m_pkthdr.len > mbuf_frag_size) 626 m = m_fragment(m, M_DONTWAIT, mbuf_frag_size); 627#endif 628 /* 629 * Reset layer specific mbuf flags 630 * to avoid confusing lower layers. 631 */ 632 m->m_flags &= ~(M_PROTOFLAGS); 633 error = (*ifp->if_output)(ifp, m, 634 (struct sockaddr *)dst, ro); 635 goto done; 636 } 637 638 /* Balk when DF bit is set or the interface didn't support TSO. 
*/ 639 if ((ip_off & IP_DF) || (m->m_pkthdr.csum_flags & CSUM_TSO)) { 640 error = EMSGSIZE; 641 IPSTAT_INC(ips_cantfrag); 642 goto bad; 643 } 644 645 /* 646 * Too large for interface; fragment if possible. If successful, 647 * on return, m will point to a list of packets to be sent. 648 */ 649 error = ip_fragment(ip, &m, mtu, ifp->if_hwassist, sw_csum); 650 if (error) 651 goto bad; 652 for (; m; m = m0) { 653 m0 = m->m_nextpkt; 654 m->m_nextpkt = 0; 655 if (error == 0) { 656 /* Record statistics for this interface address. */ 657 if (ia != NULL) { 658 ia->ia_ifa.if_opackets++; 659 ia->ia_ifa.if_obytes += m->m_pkthdr.len; 660 } 661 /* 662 * Reset layer specific mbuf flags 663 * to avoid confusing upper layers. 664 */ 665 m->m_flags &= ~(M_PROTOFLAGS); 666 667 error = (*ifp->if_output)(ifp, m, 668 (struct sockaddr *)dst, ro); 669 } else 670 m_freem(m); 671 } 672 673 if (error == 0) 674 IPSTAT_INC(ips_fragmented); 675 676done: 677 if (ro == &iproute) 678 RO_RTFREE(ro); 679 if (ia != NULL) 680 ifa_free(&ia->ia_ifa); 681 return (error); 682bad: 683 m_freem(m); 684 goto done; 685} 686 687/* 688 * Create a chain of fragments which fit the given mtu. m_frag points to the 689 * mbuf to be fragmented; on return it points to the chain with the fragments. 690 * Return 0 if no error. If error, m_frag may contain a partially built 691 * chain of fragments that should be freed by the caller. 692 * 693 * if_hwassist_flags is the hw offload capabilities (see if_data.ifi_hwassist) 694 * sw_csum contains the delayed checksums flags (e.g., CSUM_DELAY_IP). 
695 */ 696int 697ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu, 698 u_long if_hwassist_flags, int sw_csum) 699{ 700 int error = 0; 701 int hlen = ip->ip_hl << 2; 702 int len = (mtu - hlen) & ~7; /* size of payload in each fragment */ 703 int off; 704 struct mbuf *m0 = *m_frag; /* the original packet */ 705 int firstlen; 706 struct mbuf **mnext; 707 int nfrags; 708 uint16_t ip_len, ip_off; 709 710 ip_len = ntohs(ip->ip_len); 711 ip_off = ntohs(ip->ip_off); 712 713 if (ip_off & IP_DF) { /* Fragmentation not allowed */ 714 IPSTAT_INC(ips_cantfrag); 715 return EMSGSIZE; 716 } 717 718 /* 719 * Must be able to put at least 8 bytes per fragment. 720 */ 721 if (len < 8) 722 return EMSGSIZE; 723 724 /* 725 * If the interface will not calculate checksums on 726 * fragmented packets, then do it here. 727 */ 728 if (m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA && 729 (if_hwassist_flags & CSUM_IP_FRAGS) == 0) { 730 in_delayed_cksum(m0); 731 m0->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; 732 } 733#ifdef SCTP 734 if (m0->m_pkthdr.csum_flags & CSUM_SCTP && 735 (if_hwassist_flags & CSUM_IP_FRAGS) == 0) { 736 sctp_delayed_cksum(m0, hlen); 737 m0->m_pkthdr.csum_flags &= ~CSUM_SCTP; 738 } 739#endif 740 if (len > PAGE_SIZE) { 741 /* 742 * Fragment large datagrams such that each segment 743 * contains a multiple of PAGE_SIZE amount of data, 744 * plus headers. This enables a receiver to perform 745 * page-flipping zero-copy optimizations. 746 * 747 * XXX When does this help given that sender and receiver 748 * could have different page sizes, and also mtu could 749 * be less than the receiver's page size ? 
750 */ 751 int newlen; 752 struct mbuf *m; 753 754 for (m = m0, off = 0; m && (off+m->m_len) <= mtu; m = m->m_next) 755 off += m->m_len; 756 757 /* 758 * firstlen (off - hlen) must be aligned on an 759 * 8-byte boundary 760 */ 761 if (off < hlen) 762 goto smart_frag_failure; 763 off = ((off - hlen) & ~7) + hlen; 764 newlen = (~PAGE_MASK) & mtu; 765 if ((newlen + sizeof (struct ip)) > mtu) { 766 /* we failed, go back the default */ 767smart_frag_failure: 768 newlen = len; 769 off = hlen + len; 770 } 771 len = newlen; 772 773 } else { 774 off = hlen + len; 775 } 776 777 firstlen = off - hlen; 778 mnext = &m0->m_nextpkt; /* pointer to next packet */ 779 780 /* 781 * Loop through length of segment after first fragment, 782 * make new header and copy data of each part and link onto chain. 783 * Here, m0 is the original packet, m is the fragment being created. 784 * The fragments are linked off the m_nextpkt of the original 785 * packet, which after processing serves as the first fragment. 786 */ 787 for (nfrags = 1; off < ip_len; off += len, nfrags++) { 788 struct ip *mhip; /* ip header on the fragment */ 789 struct mbuf *m; 790 int mhlen = sizeof (struct ip); 791 792 MGETHDR(m, M_DONTWAIT, MT_DATA); 793 if (m == NULL) { 794 error = ENOBUFS; 795 IPSTAT_INC(ips_odropped); 796 goto done; 797 } 798 m->m_flags |= (m0->m_flags & M_MCAST) | M_FRAG; 799 /* 800 * In the first mbuf, leave room for the link header, then 801 * copy the original IP header including options. The payload 802 * goes into an additional mbuf chain returned by m_copym(). 803 */ 804 m->m_data += max_linkhdr; 805 mhip = mtod(m, struct ip *); 806 *mhip = *ip; 807 if (hlen > sizeof (struct ip)) { 808 mhlen = ip_optcopy(ip, mhip) + sizeof (struct ip); 809 mhip->ip_v = IPVERSION; 810 mhip->ip_hl = mhlen >> 2; 811 } 812 m->m_len = mhlen; 813 /* XXX do we need to add ip_off below ? 
*/ 814 mhip->ip_off = ((off - hlen) >> 3) + ip_off; 815 if (off + len >= ip_len) { /* last fragment */ 816 len = ip_len - off; 817 m->m_flags |= M_LASTFRAG; 818 } else 819 mhip->ip_off |= IP_MF; 820 mhip->ip_len = htons((u_short)(len + mhlen)); 821 m->m_next = m_copym(m0, off, len, M_DONTWAIT); 822 if (m->m_next == NULL) { /* copy failed */ 823 m_free(m); 824 error = ENOBUFS; /* ??? */ 825 IPSTAT_INC(ips_odropped); 826 goto done; 827 } 828 m->m_pkthdr.len = mhlen + len; 829 m->m_pkthdr.rcvif = NULL; 830#ifdef MAC 831 mac_netinet_fragment(m0, m); 832#endif 833 m->m_pkthdr.csum_flags = m0->m_pkthdr.csum_flags; 834 mhip->ip_off = htons(mhip->ip_off); 835 mhip->ip_sum = 0; 836 if (sw_csum & CSUM_DELAY_IP) 837 mhip->ip_sum = in_cksum(m, mhlen); 838 *mnext = m; 839 mnext = &m->m_nextpkt; 840 } 841 IPSTAT_ADD(ips_ofragments, nfrags); 842 843 /* set first marker for fragment chain */ 844 m0->m_flags |= M_FIRSTFRAG | M_FRAG; 845 m0->m_pkthdr.csum_data = nfrags; 846 847 /* 848 * Update first fragment by trimming what's been copied out 849 * and updating header. 
850 */ 851 m_adj(m0, hlen + firstlen - ip_len); 852 m0->m_pkthdr.len = hlen + firstlen; 853 ip->ip_len = htons((u_short)m0->m_pkthdr.len); 854 ip->ip_off = htons(ip_off | IP_MF); 855 ip->ip_sum = 0; 856 if (sw_csum & CSUM_DELAY_IP) 857 ip->ip_sum = in_cksum(m0, hlen); 858 859done: 860 *m_frag = m0; 861 return error; 862} 863 864void 865in_delayed_cksum(struct mbuf *m) 866{ 867 struct ip *ip; 868 uint16_t csum, offset, ip_len; 869 870 ip = mtod(m, struct ip *); 871 offset = ip->ip_hl << 2 ; 872 ip_len = ntohs(ip->ip_len); 873 csum = in_cksum_skip(m, ip_len, offset); 874 if (m->m_pkthdr.csum_flags & CSUM_UDP && csum == 0) 875 csum = 0xffff; 876 offset += m->m_pkthdr.csum_data; /* checksum offset */ 877 878 if (offset + sizeof(u_short) > m->m_len) { 879 printf("delayed m_pullup, m->len: %d off: %d p: %d\n", 880 m->m_len, offset, ip->ip_p); 881 /* 882 * XXX 883 * this shouldn't happen, but if it does, the 884 * correct behavior may be to insert the checksum 885 * in the appropriate next mbuf in the chain. 886 */ 887 return; 888 } 889 *(u_short *)(m->m_data + offset) = csum; 890} 891 892/* 893 * IP socket option processing. 
894 */ 895int 896ip_ctloutput(struct socket *so, struct sockopt *sopt) 897{ 898 struct inpcb *inp = sotoinpcb(so); 899 int error, optval; 900 901 error = optval = 0; 902 if (sopt->sopt_level != IPPROTO_IP) { 903 error = EINVAL; 904 905 if (sopt->sopt_level == SOL_SOCKET && 906 sopt->sopt_dir == SOPT_SET) { 907 switch (sopt->sopt_name) { 908 case SO_REUSEADDR: 909 INP_WLOCK(inp); 910 if (IN_MULTICAST(ntohl(inp->inp_laddr.s_addr))) { 911 if ((so->so_options & 912 (SO_REUSEADDR | SO_REUSEPORT)) != 0) 913 inp->inp_flags2 |= INP_REUSEPORT; 914 else 915 inp->inp_flags2 &= ~INP_REUSEPORT; 916 } 917 INP_WUNLOCK(inp); 918 error = 0; 919 break; 920 case SO_REUSEPORT: 921 INP_WLOCK(inp); 922 if ((so->so_options & SO_REUSEPORT) != 0) 923 inp->inp_flags2 |= INP_REUSEPORT; 924 else 925 inp->inp_flags2 &= ~INP_REUSEPORT; 926 INP_WUNLOCK(inp); 927 error = 0; 928 break; 929 case SO_SETFIB: 930 INP_WLOCK(inp); 931 inp->inp_inc.inc_fibnum = so->so_fibnum; 932 INP_WUNLOCK(inp); 933 error = 0; 934 break; 935 default: 936 break; 937 } 938 } 939 return (error); 940 } 941 942 switch (sopt->sopt_dir) { 943 case SOPT_SET: 944 switch (sopt->sopt_name) { 945 case IP_OPTIONS: 946#ifdef notyet 947 case IP_RETOPTS: 948#endif 949 { 950 struct mbuf *m; 951 if (sopt->sopt_valsize > MLEN) { 952 error = EMSGSIZE; 953 break; 954 } 955 MGET(m, sopt->sopt_td ? 
M_WAIT : M_DONTWAIT, MT_DATA); 956 if (m == NULL) { 957 error = ENOBUFS; 958 break; 959 } 960 m->m_len = sopt->sopt_valsize; 961 error = sooptcopyin(sopt, mtod(m, char *), m->m_len, 962 m->m_len); 963 if (error) { 964 m_free(m); 965 break; 966 } 967 INP_WLOCK(inp); 968 error = ip_pcbopts(inp, sopt->sopt_name, m); 969 INP_WUNLOCK(inp); 970 return (error); 971 } 972 973 case IP_BINDANY: 974 if (sopt->sopt_td != NULL) { 975 error = priv_check(sopt->sopt_td, 976 PRIV_NETINET_BINDANY); 977 if (error) 978 break; 979 } 980 /* FALLTHROUGH */ 981 case IP_TOS: 982 case IP_TTL: 983 case IP_MINTTL: 984 case IP_RECVOPTS: 985 case IP_RECVRETOPTS: 986 case IP_RECVDSTADDR: 987 case IP_RECVTTL: 988 case IP_RECVIF: 989 case IP_FAITH: 990 case IP_ONESBCAST: 991 case IP_DONTFRAG: 992 case IP_RECVTOS: 993 error = sooptcopyin(sopt, &optval, sizeof optval, 994 sizeof optval); 995 if (error) 996 break; 997 998 switch (sopt->sopt_name) { 999 case IP_TOS: 1000 inp->inp_ip_tos = optval; 1001 break; 1002 1003 case IP_TTL: 1004 inp->inp_ip_ttl = optval; 1005 break; 1006 1007 case IP_MINTTL: 1008 if (optval >= 0 && optval <= MAXTTL) 1009 inp->inp_ip_minttl = optval; 1010 else 1011 error = EINVAL; 1012 break; 1013 1014#define OPTSET(bit) do { \ 1015 INP_WLOCK(inp); \ 1016 if (optval) \ 1017 inp->inp_flags |= bit; \ 1018 else \ 1019 inp->inp_flags &= ~bit; \ 1020 INP_WUNLOCK(inp); \ 1021} while (0) 1022 1023 case IP_RECVOPTS: 1024 OPTSET(INP_RECVOPTS); 1025 break; 1026 1027 case IP_RECVRETOPTS: 1028 OPTSET(INP_RECVRETOPTS); 1029 break; 1030 1031 case IP_RECVDSTADDR: 1032 OPTSET(INP_RECVDSTADDR); 1033 break; 1034 1035 case IP_RECVTTL: 1036 OPTSET(INP_RECVTTL); 1037 break; 1038 1039 case IP_RECVIF: 1040 OPTSET(INP_RECVIF); 1041 break; 1042 1043 case IP_FAITH: 1044 OPTSET(INP_FAITH); 1045 break; 1046 1047 case IP_ONESBCAST: 1048 OPTSET(INP_ONESBCAST); 1049 break; 1050 case IP_DONTFRAG: 1051 OPTSET(INP_DONTFRAG); 1052 break; 1053 case IP_BINDANY: 1054 OPTSET(INP_BINDANY); 1055 break; 1056 case 
IP_RECVTOS: 1057 OPTSET(INP_RECVTOS); 1058 break; 1059 } 1060 break; 1061#undef OPTSET 1062 1063 /* 1064 * Multicast socket options are processed by the in_mcast 1065 * module. 1066 */ 1067 case IP_MULTICAST_IF: 1068 case IP_MULTICAST_VIF: 1069 case IP_MULTICAST_TTL: 1070 case IP_MULTICAST_LOOP: 1071 case IP_ADD_MEMBERSHIP: 1072 case IP_DROP_MEMBERSHIP: 1073 case IP_ADD_SOURCE_MEMBERSHIP: 1074 case IP_DROP_SOURCE_MEMBERSHIP: 1075 case IP_BLOCK_SOURCE: 1076 case IP_UNBLOCK_SOURCE: 1077 case IP_MSFILTER: 1078 case MCAST_JOIN_GROUP: 1079 case MCAST_LEAVE_GROUP: 1080 case MCAST_JOIN_SOURCE_GROUP: 1081 case MCAST_LEAVE_SOURCE_GROUP: 1082 case MCAST_BLOCK_SOURCE: 1083 case MCAST_UNBLOCK_SOURCE: 1084 error = inp_setmoptions(inp, sopt); 1085 break; 1086 1087 case IP_PORTRANGE: 1088 error = sooptcopyin(sopt, &optval, sizeof optval, 1089 sizeof optval); 1090 if (error) 1091 break; 1092 1093 INP_WLOCK(inp); 1094 switch (optval) { 1095 case IP_PORTRANGE_DEFAULT: 1096 inp->inp_flags &= ~(INP_LOWPORT); 1097 inp->inp_flags &= ~(INP_HIGHPORT); 1098 break; 1099 1100 case IP_PORTRANGE_HIGH: 1101 inp->inp_flags &= ~(INP_LOWPORT); 1102 inp->inp_flags |= INP_HIGHPORT; 1103 break; 1104 1105 case IP_PORTRANGE_LOW: 1106 inp->inp_flags &= ~(INP_HIGHPORT); 1107 inp->inp_flags |= INP_LOWPORT; 1108 break; 1109 1110 default: 1111 error = EINVAL; 1112 break; 1113 } 1114 INP_WUNLOCK(inp); 1115 break; 1116 1117#ifdef IPSEC 1118 case IP_IPSEC_POLICY: 1119 { 1120 caddr_t req; 1121 struct mbuf *m; 1122 1123 if ((error = soopt_getm(sopt, &m)) != 0) /* XXX */ 1124 break; 1125 if ((error = soopt_mcopyin(sopt, m)) != 0) /* XXX */ 1126 break; 1127 req = mtod(m, caddr_t); 1128 error = ipsec_set_policy(inp, sopt->sopt_name, req, 1129 m->m_len, (sopt->sopt_td != NULL) ? 
1130 sopt->sopt_td->td_ucred : NULL); 1131 m_freem(m); 1132 break; 1133 } 1134#endif /* IPSEC */ 1135 1136 default: 1137 error = ENOPROTOOPT; 1138 break; 1139 } 1140 break; 1141 1142 case SOPT_GET: 1143 switch (sopt->sopt_name) { 1144 case IP_OPTIONS: 1145 case IP_RETOPTS: 1146 if (inp->inp_options) 1147 error = sooptcopyout(sopt, 1148 mtod(inp->inp_options, 1149 char *), 1150 inp->inp_options->m_len); 1151 else 1152 sopt->sopt_valsize = 0; 1153 break; 1154 1155 case IP_TOS: 1156 case IP_TTL: 1157 case IP_MINTTL: 1158 case IP_RECVOPTS: 1159 case IP_RECVRETOPTS: 1160 case IP_RECVDSTADDR: 1161 case IP_RECVTTL: 1162 case IP_RECVIF: 1163 case IP_PORTRANGE: 1164 case IP_FAITH: 1165 case IP_ONESBCAST: 1166 case IP_DONTFRAG: 1167 case IP_BINDANY: 1168 case IP_RECVTOS: 1169 switch (sopt->sopt_name) { 1170 1171 case IP_TOS: 1172 optval = inp->inp_ip_tos; 1173 break; 1174 1175 case IP_TTL: 1176 optval = inp->inp_ip_ttl; 1177 break; 1178 1179 case IP_MINTTL: 1180 optval = inp->inp_ip_minttl; 1181 break; 1182 1183#define OPTBIT(bit) (inp->inp_flags & bit ? 
1 : 0) 1184 1185 case IP_RECVOPTS: 1186 optval = OPTBIT(INP_RECVOPTS); 1187 break; 1188 1189 case IP_RECVRETOPTS: 1190 optval = OPTBIT(INP_RECVRETOPTS); 1191 break; 1192 1193 case IP_RECVDSTADDR: 1194 optval = OPTBIT(INP_RECVDSTADDR); 1195 break; 1196 1197 case IP_RECVTTL: 1198 optval = OPTBIT(INP_RECVTTL); 1199 break; 1200 1201 case IP_RECVIF: 1202 optval = OPTBIT(INP_RECVIF); 1203 break; 1204 1205 case IP_PORTRANGE: 1206 if (inp->inp_flags & INP_HIGHPORT) 1207 optval = IP_PORTRANGE_HIGH; 1208 else if (inp->inp_flags & INP_LOWPORT) 1209 optval = IP_PORTRANGE_LOW; 1210 else 1211 optval = 0; 1212 break; 1213 1214 case IP_FAITH: 1215 optval = OPTBIT(INP_FAITH); 1216 break; 1217 1218 case IP_ONESBCAST: 1219 optval = OPTBIT(INP_ONESBCAST); 1220 break; 1221 case IP_DONTFRAG: 1222 optval = OPTBIT(INP_DONTFRAG); 1223 break; 1224 case IP_BINDANY: 1225 optval = OPTBIT(INP_BINDANY); 1226 break; 1227 case IP_RECVTOS: 1228 optval = OPTBIT(INP_RECVTOS); 1229 break; 1230 } 1231 error = sooptcopyout(sopt, &optval, sizeof optval); 1232 break; 1233 1234 /* 1235 * Multicast socket options are processed by the in_mcast 1236 * module. 
1237 */ 1238 case IP_MULTICAST_IF: 1239 case IP_MULTICAST_VIF: 1240 case IP_MULTICAST_TTL: 1241 case IP_MULTICAST_LOOP: 1242 case IP_MSFILTER: 1243 error = inp_getmoptions(inp, sopt); 1244 break; 1245 1246#ifdef IPSEC 1247 case IP_IPSEC_POLICY: 1248 { 1249 struct mbuf *m = NULL; 1250 caddr_t req = NULL; 1251 size_t len = 0; 1252 1253 if (m != 0) { 1254 req = mtod(m, caddr_t); 1255 len = m->m_len; 1256 } 1257 error = ipsec_get_policy(sotoinpcb(so), req, len, &m); 1258 if (error == 0) 1259 error = soopt_mcopyout(sopt, m); /* XXX */ 1260 if (error == 0) 1261 m_freem(m); 1262 break; 1263 } 1264#endif /* IPSEC */ 1265 1266 default: 1267 error = ENOPROTOOPT; 1268 break; 1269 } 1270 break; 1271 } 1272 return (error); 1273} 1274 1275/* 1276 * Routine called from ip_output() to loop back a copy of an IP multicast 1277 * packet to the input queue of a specified interface. Note that this 1278 * calls the output routine of the loopback "driver", but with an interface 1279 * pointer that might NOT be a loopback interface -- evil, but easier than 1280 * replicating that code here. 1281 */ 1282static void 1283ip_mloopback(struct ifnet *ifp, struct mbuf *m, struct sockaddr_in *dst, 1284 int hlen) 1285{ 1286 register struct ip *ip; 1287 struct mbuf *copym; 1288 1289 /* 1290 * Make a deep copy of the packet because we're going to 1291 * modify the pack in order to generate checksums. 1292 */ 1293 copym = m_dup(m, M_DONTWAIT); 1294 if (copym != NULL && (copym->m_flags & M_EXT || copym->m_len < hlen)) 1295 copym = m_pullup(copym, hlen); 1296 if (copym != NULL) { 1297 /* If needed, compute the checksum and mark it as valid. 
*/ 1298 if (copym->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { 1299 in_delayed_cksum(copym); 1300 copym->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; 1301 copym->m_pkthdr.csum_flags |= 1302 CSUM_DATA_VALID | CSUM_PSEUDO_HDR; 1303 copym->m_pkthdr.csum_data = 0xffff; 1304 } 1305 /* 1306 * We don't bother to fragment if the IP length is greater 1307 * than the interface's MTU. Can this possibly matter? 1308 */ 1309 ip = mtod(copym, struct ip *); 1310 ip->ip_sum = 0; 1311 ip->ip_sum = in_cksum(copym, hlen); 1312#if 1 /* XXX */ 1313 if (dst->sin_family != AF_INET) { 1314 printf("ip_mloopback: bad address family %d\n", 1315 dst->sin_family); 1316 dst->sin_family = AF_INET; 1317 } 1318#endif 1319 if_simloop(ifp, copym, dst->sin_family, 0); 1320 } 1321} 1322