ip_output.c revision 190880
1/*- 2 * Copyright (c) 1982, 1986, 1988, 1990, 1993 3 * The Regents of the University of California. All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 4. Neither the name of the University nor the names of its contributors 14 * may be used to endorse or promote products derived from this software 15 * without specific prior written permission. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 * 29 * @(#)ip_output.c 8.3 (Berkeley) 1/21/94 30 */ 31 32#include <sys/cdefs.h> 33__FBSDID("$FreeBSD: head/sys/netinet/ip_output.c 190880 2009-04-10 06:16:14Z kmacy $"); 34 35#include "opt_ipfw.h" 36#include "opt_inet.h" 37#include "opt_ipsec.h" 38#include "opt_route.h" 39#include "opt_mac.h" 40#include "opt_mbuf_stress_test.h" 41#include "opt_mpath.h" 42#include "opt_sctp.h" 43 44#include <sys/param.h> 45#include <sys/systm.h> 46#include <sys/kernel.h> 47#include <sys/malloc.h> 48#include <sys/mbuf.h> 49#include <sys/priv.h> 50#include <sys/proc.h> 51#include <sys/protosw.h> 52#include <sys/socket.h> 53#include <sys/socketvar.h> 54#include <sys/sysctl.h> 55#include <sys/ucred.h> 56#include <sys/vimage.h> 57 58#include <net/if.h> 59#include <net/netisr.h> 60#include <net/pfil.h> 61#include <net/route.h> 62#ifdef RADIX_MPATH 63#include <net/radix_mpath.h> 64#endif 65#include <net/vnet.h> 66 67#include <netinet/in.h> 68#include <netinet/in_systm.h> 69#include <netinet/ip.h> 70#include <netinet/in_pcb.h> 71#include <netinet/in_var.h> 72#include <netinet/ip_var.h> 73#include <netinet/ip_options.h> 74#include <netinet/vinet.h> 75#ifdef SCTP 76#include <netinet/sctp.h> 77#include <netinet/sctp_crc32.h> 78#endif 79 80#ifdef IPSEC 81#include <netinet/ip_ipsec.h> 82#include <netipsec/ipsec.h> 83#endif /* IPSEC*/ 84 85#include <machine/in_cksum.h> 86 87#include <security/mac/mac_framework.h> 88 89#define print_ip(x, a, y) printf("%s %d.%d.%d.%d%s",\ 90 x, (ntohl(a.s_addr)>>24)&0xFF,\ 91 (ntohl(a.s_addr)>>16)&0xFF,\ 92 (ntohl(a.s_addr)>>8)&0xFF,\ 93 (ntohl(a.s_addr))&0xFF, y); 94 95#ifdef VIMAGE_GLOBALS 96u_short ip_id; 97#endif 98 99#ifdef MBUF_STRESS_TEST 100int mbuf_frag_size = 0; 101SYSCTL_INT(_net_inet_ip, OID_AUTO, mbuf_frag_size, CTLFLAG_RW, 102 &mbuf_frag_size, 0, "Fragment outgoing mbufs to this size"); 103#endif 104 105#if defined(IP_NONLOCALBIND) 106static int ip_nonlocalok = 0; 107SYSCTL_INT(_net_inet_ip, OID_AUTO, nonlocalok, 108 CTLFLAG_RW|CTLFLAG_SECURE, &ip_nonlocalok, 0, ""); 109#endif 110 111static void ip_mloopback 112 (struct ifnet *, struct mbuf *, struct sockaddr_in *, int); 113 114 115extern int in_mcast_loop; 116extern struct protosw inetsw[]; 117 118/* 119 * IP output. The packet in mbuf chain m contains a skeletal IP 120 * header (with len, off, ttl, proto, tos, src, dst). 121 * The mbuf chain containing the packet will be freed. 122 * The mbuf opt, if present, will not be freed. 123 * In the IP forwarding case, the packet will arrive with options already 124 * inserted, so must have a NULL opt pointer. 125 */ 126int 127ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags, 128 struct ip_moptions *imo, struct inpcb *inp) 129{ 130 INIT_VNET_NET(curvnet); 131 INIT_VNET_INET(curvnet); 132 struct ip *ip; 133 struct ifnet *ifp = NULL; /* keep compiler happy */ 134 struct mbuf *m0; 135 int hlen = sizeof (struct ip); 136 int mtu; 137 int len, error = 0; 138 struct sockaddr_in *dst = NULL; /* keep compiler happy */ 139 struct in_ifaddr *ia = NULL; 140 int isbroadcast, sw_csum; 141 struct route iproute; 142 struct in_addr odst; 143#ifdef IPFIREWALL_FORWARD 144 struct m_tag *fwd_tag = NULL; 145#endif 146 M_ASSERTPKTHDR(m); 147 148 if (ro == NULL) { 149 ro = &iproute; 150 bzero(ro, sizeof (*ro)); 151 } 152 153 if (inp != NULL) { 154 M_SETFIB(m, inp->inp_inc.inc_fibnum); 155 INP_LOCK_ASSERT(inp); 156 if (inp->inp_flags & (INP_HW_FLOWID|INP_SW_FLOWID)) { 157 m->m_pkthdr.flowid = inp->inp_flowid; 158 m->m_flags |= M_FLOWID; 159 } 160 } 161 162 if (opt) { 163 len = 0; 164 m = ip_insertoptions(m, opt, &len); 165 if (len != 0) 166 hlen = len; 167 } 168 ip = mtod(m, struct ip *); 169 170 /* 171 * Fill in IP header. If we are not allowing fragmentation, 172 * then the ip_id field is meaningless, but we don't set it 173 * to zero. Doing so causes various problems when devices along 174 * the path (routers, load balancers, firewalls, etc.) illegally 175 * disable DF on our packet. Note that a 16-bit counter 176 * will wrap around in less than 10 seconds at 100 Mbit/s on a 177 * medium with MTU 1500. See Steven M. Bellovin, "A Technique 178 * for Counting NATted Hosts", Proc. IMW'02, available at 179 * <http://www.cs.columbia.edu/~smb/papers/fnat.pdf>. 180 */ 181 if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) { 182 ip->ip_v = IPVERSION; 183 ip->ip_hl = hlen >> 2; 184 ip->ip_id = ip_newid(); 185 V_ipstat.ips_localout++; 186 } else { 187 hlen = ip->ip_hl << 2; 188 } 189 190 dst = (struct sockaddr_in *)&ro->ro_dst; 191again: 192 /* 193 * If there is a cached route, 194 * check that it is to the same destination 195 * and is still up. If not, free it and try again. 196 * The address family should also be checked in case of sharing the 197 * cache with IPv6. 198 */ 199 if (ro->ro_rt && ((ro->ro_rt->rt_flags & RTF_UP) == 0 || 200 dst->sin_family != AF_INET || 201 dst->sin_addr.s_addr != ip->ip_dst.s_addr)) { 202 RTFREE(ro->ro_rt); 203 ro->ro_rt = (struct rtentry *)NULL; 204 } 205#ifdef IPFIREWALL_FORWARD 206 if (ro->ro_rt == NULL && fwd_tag == NULL) { 207#else 208 if (ro->ro_rt == NULL) { 209#endif 210 bzero(dst, sizeof(*dst)); 211 dst->sin_family = AF_INET; 212 dst->sin_len = sizeof(*dst); 213 dst->sin_addr = ip->ip_dst; 214 } 215 /* 216 * If routing to interface only, short circuit routing lookup. 217 * The use of an all-ones broadcast address implies this; an 218 * interface is specified by the broadcast address of an interface, 219 * or the destination address of a ptp interface. 220 */ 221 if (flags & IP_SENDONES) { 222 if ((ia = ifatoia(ifa_ifwithbroadaddr(sintosa(dst)))) == NULL && 223 (ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == NULL) { 224 V_ipstat.ips_noroute++; 225 error = ENETUNREACH; 226 goto bad; 227 } 228 ip->ip_dst.s_addr = INADDR_BROADCAST; 229 dst->sin_addr = ip->ip_dst; 230 ifp = ia->ia_ifp; 231 ip->ip_ttl = 1; 232 isbroadcast = 1; 233 } else if (flags & IP_ROUTETOIF) { 234 if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == NULL && 235 (ia = ifatoia(ifa_ifwithnet(sintosa(dst)))) == NULL) { 236 V_ipstat.ips_noroute++; 237 error = ENETUNREACH; 238 goto bad; 239 } 240 ifp = ia->ia_ifp; 241 ip->ip_ttl = 1; 242 isbroadcast = in_broadcast(dst->sin_addr, ifp); 243 } else if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) && 244 imo != NULL && imo->imo_multicast_ifp != NULL) { 245 /* 246 * Bypass the normal routing lookup for multicast 247 * packets if the interface is specified. 248 */ 249 ifp = imo->imo_multicast_ifp; 250 IFP_TO_IA(ifp, ia); 251 isbroadcast = 0; /* fool gcc */ 252 } else { 253 /* 254 * We want to do any cloning requested by the link layer, 255 * as this is probably required in all cases for correct 256 * operation (as it is for ARP). 257 */ 258 if (ro->ro_rt == NULL) 259#ifdef RADIX_MPATH 260 rtalloc_mpath_fib(ro, 261 ntohl(ip->ip_src.s_addr ^ ip->ip_dst.s_addr), 262 inp ? inp->inp_inc.inc_fibnum : M_GETFIB(m)); 263#else 264 in_rtalloc_ign(ro, 0, 265 inp ? inp->inp_inc.inc_fibnum : M_GETFIB(m)); 266#endif 267 if (ro->ro_rt == NULL) { 268 V_ipstat.ips_noroute++; 269 error = EHOSTUNREACH; 270 goto bad; 271 } 272 ia = ifatoia(ro->ro_rt->rt_ifa); 273 ifp = ro->ro_rt->rt_ifp; 274 ro->ro_rt->rt_rmx.rmx_pksent++; 275 if (ro->ro_rt->rt_flags & RTF_GATEWAY) 276 dst = (struct sockaddr_in *)ro->ro_rt->rt_gateway; 277 if (ro->ro_rt->rt_flags & RTF_HOST) 278 isbroadcast = (ro->ro_rt->rt_flags & RTF_BROADCAST); 279 else 280 isbroadcast = in_broadcast(dst->sin_addr, ifp); 281 } 282 /* 283 * Calculate MTU. If we have a route that is up, use that, 284 * otherwise use the interface's MTU. 285 */ 286 if (ro->ro_rt != NULL && (ro->ro_rt->rt_flags & (RTF_UP|RTF_HOST))) { 287 /* 288 * This case can happen if the user changed the MTU 289 * of an interface after enabling IP on it. Because 290 * most netifs don't keep track of routes pointing to 291 * them, there is no way for one to update all its 292 * routes when the MTU is changed. 293 */ 294 if (ro->ro_rt->rt_rmx.rmx_mtu > ifp->if_mtu) 295 ro->ro_rt->rt_rmx.rmx_mtu = ifp->if_mtu; 296 mtu = ro->ro_rt->rt_rmx.rmx_mtu; 297 } else { 298 mtu = ifp->if_mtu; 299 } 300 if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { 301 m->m_flags |= M_MCAST; 302 /* 303 * IP destination address is multicast. Make sure "dst" 304 * still points to the address in "ro". (It may have been 305 * changed to point to a gateway address, above.) 306 */ 307 dst = (struct sockaddr_in *)&ro->ro_dst; 308 /* 309 * See if the caller provided any multicast options 310 */ 311 if (imo != NULL) { 312 ip->ip_ttl = imo->imo_multicast_ttl; 313 if (imo->imo_multicast_vif != -1) 314 ip->ip_src.s_addr = 315 ip_mcast_src ? 316 ip_mcast_src(imo->imo_multicast_vif) : 317 INADDR_ANY; 318 } else 319 ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL; 320 /* 321 * Confirm that the outgoing interface supports multicast. 322 */ 323 if ((imo == NULL) || (imo->imo_multicast_vif == -1)) { 324 if ((ifp->if_flags & IFF_MULTICAST) == 0) { 325 V_ipstat.ips_noroute++; 326 error = ENETUNREACH; 327 goto bad; 328 } 329 } 330 /* 331 * If source address not specified yet, use address 332 * of outgoing interface. 333 */ 334 if (ip->ip_src.s_addr == INADDR_ANY) { 335 /* Interface may have no addresses. */ 336 if (ia != NULL) 337 ip->ip_src = IA_SIN(ia)->sin_addr; 338 } 339 340 if ((imo == NULL && in_mcast_loop) || 341 (imo && imo->imo_multicast_loop)) { 342 /* 343 * Loop back multicast datagram if not expressly 344 * forbidden to do so, even if we are not a member 345 * of the group; ip_input() will filter it later, 346 * thus deferring a hash lookup and mutex acquisition 347 * at the expense of a cheap copy using m_copym(). 348 */ 349 ip_mloopback(ifp, m, dst, hlen); 350 } else { 351 /* 352 * If we are acting as a multicast router, perform 353 * multicast forwarding as if the packet had just 354 * arrived on the interface to which we are about 355 * to send. The multicast forwarding function 356 * recursively calls this function, using the 357 * IP_FORWARDING flag to prevent infinite recursion. 358 * 359 * Multicasts that are looped back by ip_mloopback(), 360 * above, will be forwarded by the ip_input() routine, 361 * if necessary. 362 */ 363 if (V_ip_mrouter && (flags & IP_FORWARDING) == 0) { 364 /* 365 * If rsvp daemon is not running, do not 366 * set ip_moptions. This ensures that the packet 367 * is multicast and not just sent down one link 368 * as prescribed by rsvpd. 369 */ 370 if (!V_rsvp_on) 371 imo = NULL; 372 if (ip_mforward && 373 ip_mforward(ip, ifp, m, imo) != 0) { 374 m_freem(m); 375 goto done; 376 } 377 } 378 } 379 380 /* 381 * Multicasts with a time-to-live of zero may be looped- 382 * back, above, but must not be transmitted on a network. 383 * Also, multicasts addressed to the loopback interface 384 * are not sent -- the above call to ip_mloopback() will 385 * loop back a copy. ip_input() will drop the copy if 386 * this host does not belong to the destination group on 387 * the loopback interface. 388 */ 389 if (ip->ip_ttl == 0 || ifp->if_flags & IFF_LOOPBACK) { 390 m_freem(m); 391 goto done; 392 } 393 394 goto sendit; 395 } 396 397 /* 398 * If the source address is not specified yet, use the address 399 * of the outoing interface. 400 */ 401 if (ip->ip_src.s_addr == INADDR_ANY) { 402 /* Interface may have no addresses. */ 403 if (ia != NULL) { 404 ip->ip_src = IA_SIN(ia)->sin_addr; 405 } 406 } 407 408 /* 409 * Verify that we have any chance at all of being able to queue the 410 * packet or packet fragments, unless ALTQ is enabled on the given 411 * interface in which case packetdrop should be done by queueing. 412 */ 413#ifdef ALTQ 414 if ((!ALTQ_IS_ENABLED(&ifp->if_snd)) && 415 ((ifp->if_snd.ifq_len + ip->ip_len / mtu + 1) >= 416 ifp->if_snd.ifq_maxlen)) 417#else 418 if ((ifp->if_snd.ifq_len + ip->ip_len / mtu + 1) >= 419 ifp->if_snd.ifq_maxlen) 420#endif /* ALTQ */ 421 { 422 error = ENOBUFS; 423 V_ipstat.ips_odropped++; 424 ifp->if_snd.ifq_drops += (ip->ip_len / ifp->if_mtu + 1); 425 goto bad; 426 } 427 428 /* 429 * Look for broadcast address and 430 * verify user is allowed to send 431 * such a packet. 432 */ 433 if (isbroadcast) { 434 if ((ifp->if_flags & IFF_BROADCAST) == 0) { 435 error = EADDRNOTAVAIL; 436 goto bad; 437 } 438 if ((flags & IP_ALLOWBROADCAST) == 0) { 439 error = EACCES; 440 goto bad; 441 } 442 /* don't allow broadcast messages to be fragmented */ 443 if (ip->ip_len > mtu) { 444 error = EMSGSIZE; 445 goto bad; 446 } 447 m->m_flags |= M_BCAST; 448 } else { 449 m->m_flags &= ~M_BCAST; 450 } 451 452sendit: 453#ifdef IPSEC 454 switch(ip_ipsec_output(&m, inp, &flags, &error, &ro, &iproute, &dst, &ia, &ifp)) { 455 case 1: 456 goto bad; 457 case -1: 458 goto done; 459 case 0: 460 default: 461 break; /* Continue with packet processing. */ 462 } 463 /* Update variables that are affected by ipsec4_output(). */ 464 ip = mtod(m, struct ip *); 465 hlen = ip->ip_hl << 2; 466#endif /* IPSEC */ 467 468 /* Jump over all PFIL processing if hooks are not active. */ 469 if (!PFIL_HOOKED(&inet_pfil_hook)) 470 goto passout; 471 472 /* Run through list of hooks for output packets. */ 473 odst.s_addr = ip->ip_dst.s_addr; 474 error = pfil_run_hooks(&inet_pfil_hook, &m, ifp, PFIL_OUT, inp); 475 if (error != 0 || m == NULL) 476 goto done; 477 478 ip = mtod(m, struct ip *); 479 480 /* See if destination IP address was changed by packet filter. */ 481 if (odst.s_addr != ip->ip_dst.s_addr) { 482 m->m_flags |= M_SKIP_FIREWALL; 483 /* If destination is now ourself drop to ip_input(). */ 484 if (in_localip(ip->ip_dst)) { 485 m->m_flags |= M_FASTFWD_OURS; 486 if (m->m_pkthdr.rcvif == NULL) 487 m->m_pkthdr.rcvif = V_loif; 488 if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { 489 m->m_pkthdr.csum_flags |= 490 CSUM_DATA_VALID | CSUM_PSEUDO_HDR; 491 m->m_pkthdr.csum_data = 0xffff; 492 } 493 m->m_pkthdr.csum_flags |= 494 CSUM_IP_CHECKED | CSUM_IP_VALID; 495#ifdef SCTP 496 if (m->m_pkthdr.csum_flags & CSUM_SCTP) 497 m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID; 498#endif 499 error = netisr_queue(NETISR_IP, m); 500 goto done; 501 } else 502 goto again; /* Redo the routing table lookup. */ 503 } 504 505#ifdef IPFIREWALL_FORWARD 506 /* See if local, if yes, send it to netisr with IP_FASTFWD_OURS. */ 507 if (m->m_flags & M_FASTFWD_OURS) { 508 if (m->m_pkthdr.rcvif == NULL) 509 m->m_pkthdr.rcvif = V_loif; 510 if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { 511 m->m_pkthdr.csum_flags |= 512 CSUM_DATA_VALID | CSUM_PSEUDO_HDR; 513 m->m_pkthdr.csum_data = 0xffff; 514 } 515#ifdef SCTP 516 if (m->m_pkthdr.csum_flags & CSUM_SCTP) 517 m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID; 518#endif 519 m->m_pkthdr.csum_flags |= 520 CSUM_IP_CHECKED | CSUM_IP_VALID; 521 522 error = netisr_queue(NETISR_IP, m); 523 goto done; 524 } 525 /* Or forward to some other address? */ 526 fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL); 527 if (fwd_tag) { 528 dst = (struct sockaddr_in *)&ro->ro_dst; 529 bcopy((fwd_tag+1), dst, sizeof(struct sockaddr_in)); 530 m->m_flags |= M_SKIP_FIREWALL; 531 m_tag_delete(m, fwd_tag); 532 goto again; 533 } 534#endif /* IPFIREWALL_FORWARD */ 535 536passout: 537 /* 127/8 must not appear on wire - RFC1122. */ 538 if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET || 539 (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) { 540 if ((ifp->if_flags & IFF_LOOPBACK) == 0) { 541 V_ipstat.ips_badaddr++; 542 error = EADDRNOTAVAIL; 543 goto bad; 544 } 545 } 546 547 m->m_pkthdr.csum_flags |= CSUM_IP; 548 sw_csum = m->m_pkthdr.csum_flags & ~ifp->if_hwassist; 549 if (sw_csum & CSUM_DELAY_DATA) { 550 in_delayed_cksum(m); 551 sw_csum &= ~CSUM_DELAY_DATA; 552 } 553#ifdef SCTP 554 if (sw_csum & CSUM_SCTP) { 555 sctp_delayed_cksum(m); 556 sw_csum &= ~CSUM_SCTP; 557 } 558#endif 559 m->m_pkthdr.csum_flags &= ifp->if_hwassist; 560 561 /* 562 * If small enough for interface, or the interface will take 563 * care of the fragmentation for us, we can just send directly. 564 */ 565 if (ip->ip_len <= mtu || 566 (m->m_pkthdr.csum_flags & ifp->if_hwassist & CSUM_TSO) != 0 || 567 ((ip->ip_off & IP_DF) == 0 && (ifp->if_hwassist & CSUM_FRAGMENT))) { 568 ip->ip_len = htons(ip->ip_len); 569 ip->ip_off = htons(ip->ip_off); 570 ip->ip_sum = 0; 571 if (sw_csum & CSUM_DELAY_IP) 572 ip->ip_sum = in_cksum(m, hlen); 573 574 /* 575 * Record statistics for this interface address. 576 * With CSUM_TSO the byte/packet count will be slightly 577 * incorrect because we count the IP+TCP headers only 578 * once instead of for every generated packet. 579 */ 580 if (!(flags & IP_FORWARDING) && ia) { 581 if (m->m_pkthdr.csum_flags & CSUM_TSO) 582 ia->ia_ifa.if_opackets += 583 m->m_pkthdr.len / m->m_pkthdr.tso_segsz; 584 else 585 ia->ia_ifa.if_opackets++; 586 ia->ia_ifa.if_obytes += m->m_pkthdr.len; 587 } 588#ifdef MBUF_STRESS_TEST 589 if (mbuf_frag_size && m->m_pkthdr.len > mbuf_frag_size) 590 m = m_fragment(m, M_DONTWAIT, mbuf_frag_size); 591#endif 592 /* 593 * Reset layer specific mbuf flags 594 * to avoid confusing lower layers. 595 */ 596 m->m_flags &= ~(M_PROTOFLAGS); 597 error = (*ifp->if_output)(ifp, m, 598 (struct sockaddr *)dst, ro->ro_rt); 599 goto done; 600 } 601 602 /* Balk when DF bit is set or the interface didn't support TSO. */ 603 if ((ip->ip_off & IP_DF) || (m->m_pkthdr.csum_flags & CSUM_TSO)) { 604 error = EMSGSIZE; 605 V_ipstat.ips_cantfrag++; 606 goto bad; 607 } 608 609 /* 610 * Too large for interface; fragment if possible. If successful, 611 * on return, m will point to a list of packets to be sent. 612 */ 613 error = ip_fragment(ip, &m, mtu, ifp->if_hwassist, sw_csum); 614 if (error) 615 goto bad; 616 for (; m; m = m0) { 617 m0 = m->m_nextpkt; 618 m->m_nextpkt = 0; 619 if (error == 0) { 620 /* Record statistics for this interface address. */ 621 if (ia != NULL) { 622 ia->ia_ifa.if_opackets++; 623 ia->ia_ifa.if_obytes += m->m_pkthdr.len; 624 } 625 /* 626 * Reset layer specific mbuf flags 627 * to avoid confusing upper layers. 628 */ 629 m->m_flags &= ~(M_PROTOFLAGS); 630 631 error = (*ifp->if_output)(ifp, m, 632 (struct sockaddr *)dst, ro->ro_rt); 633 } else 634 m_freem(m); 635 } 636 637 if (error == 0) 638 V_ipstat.ips_fragmented++; 639 640done: 641 if (ro == &iproute && ro->ro_rt) { 642 RTFREE(ro->ro_rt); 643 } 644 return (error); 645bad: 646 m_freem(m); 647 goto done; 648} 649 650/* 651 * Create a chain of fragments which fit the given mtu. m_frag points to the 652 * mbuf to be fragmented; on return it points to the chain with the fragments. 653 * Return 0 if no error. If error, m_frag may contain a partially built 654 * chain of fragments that should be freed by the caller. 655 * 656 * if_hwassist_flags is the hw offload capabilities (see if_data.ifi_hwassist) 657 * sw_csum contains the delayed checksums flags (e.g., CSUM_DELAY_IP). 658 */ 659int 660ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu, 661 u_long if_hwassist_flags, int sw_csum) 662{ 663 INIT_VNET_INET(curvnet); 664 int error = 0; 665 int hlen = ip->ip_hl << 2; 666 int len = (mtu - hlen) & ~7; /* size of payload in each fragment */ 667 int off; 668 struct mbuf *m0 = *m_frag; /* the original packet */ 669 int firstlen; 670 struct mbuf **mnext; 671 int nfrags; 672 673 if (ip->ip_off & IP_DF) { /* Fragmentation not allowed */ 674 V_ipstat.ips_cantfrag++; 675 return EMSGSIZE; 676 } 677 678 /* 679 * Must be able to put at least 8 bytes per fragment. 680 */ 681 if (len < 8) 682 return EMSGSIZE; 683 684 /* 685 * If the interface will not calculate checksums on 686 * fragmented packets, then do it here. 687 */ 688 if (m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA && 689 (if_hwassist_flags & CSUM_IP_FRAGS) == 0) { 690 in_delayed_cksum(m0); 691 m0->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; 692 } 693#ifdef SCTP 694 if (m0->m_pkthdr.csum_flags & CSUM_SCTP && 695 (if_hwassist_flags & CSUM_IP_FRAGS) == 0) { 696 sctp_delayed_cksum(m0); 697 m0->m_pkthdr.csum_flags &= ~CSUM_SCTP; 698 } 699#endif 700 if (len > PAGE_SIZE) { 701 /* 702 * Fragment large datagrams such that each segment 703 * contains a multiple of PAGE_SIZE amount of data, 704 * plus headers. This enables a receiver to perform 705 * page-flipping zero-copy optimizations. 706 * 707 * XXX When does this help given that sender and receiver 708 * could have different page sizes, and also mtu could 709 * be less than the receiver's page size ? 710 */ 711 int newlen; 712 struct mbuf *m; 713 714 for (m = m0, off = 0; m && (off+m->m_len) <= mtu; m = m->m_next) 715 off += m->m_len; 716 717 /* 718 * firstlen (off - hlen) must be aligned on an 719 * 8-byte boundary 720 */ 721 if (off < hlen) 722 goto smart_frag_failure; 723 off = ((off - hlen) & ~7) + hlen; 724 newlen = (~PAGE_MASK) & mtu; 725 if ((newlen + sizeof (struct ip)) > mtu) { 726 /* we failed, go back the default */ 727smart_frag_failure: 728 newlen = len; 729 off = hlen + len; 730 } 731 len = newlen; 732 733 } else { 734 off = hlen + len; 735 } 736 737 firstlen = off - hlen; 738 mnext = &m0->m_nextpkt; /* pointer to next packet */ 739 740 /* 741 * Loop through length of segment after first fragment, 742 * make new header and copy data of each part and link onto chain. 743 * Here, m0 is the original packet, m is the fragment being created. 744 * The fragments are linked off the m_nextpkt of the original 745 * packet, which after processing serves as the first fragment. 746 */ 747 for (nfrags = 1; off < ip->ip_len; off += len, nfrags++) { 748 struct ip *mhip; /* ip header on the fragment */ 749 struct mbuf *m; 750 int mhlen = sizeof (struct ip); 751 752 MGETHDR(m, M_DONTWAIT, MT_DATA); 753 if (m == NULL) { 754 error = ENOBUFS; 755 V_ipstat.ips_odropped++; 756 goto done; 757 } 758 m->m_flags |= (m0->m_flags & M_MCAST) | M_FRAG; 759 /* 760 * In the first mbuf, leave room for the link header, then 761 * copy the original IP header including options. The payload 762 * goes into an additional mbuf chain returned by m_copym(). 763 */ 764 m->m_data += max_linkhdr; 765 mhip = mtod(m, struct ip *); 766 *mhip = *ip; 767 if (hlen > sizeof (struct ip)) { 768 mhlen = ip_optcopy(ip, mhip) + sizeof (struct ip); 769 mhip->ip_v = IPVERSION; 770 mhip->ip_hl = mhlen >> 2; 771 } 772 m->m_len = mhlen; 773 /* XXX do we need to add ip->ip_off below ? */ 774 mhip->ip_off = ((off - hlen) >> 3) + ip->ip_off; 775 if (off + len >= ip->ip_len) { /* last fragment */ 776 len = ip->ip_len - off; 777 m->m_flags |= M_LASTFRAG; 778 } else 779 mhip->ip_off |= IP_MF; 780 mhip->ip_len = htons((u_short)(len + mhlen)); 781 m->m_next = m_copym(m0, off, len, M_DONTWAIT); 782 if (m->m_next == NULL) { /* copy failed */ 783 m_free(m); 784 error = ENOBUFS; /* ??? */ 785 V_ipstat.ips_odropped++; 786 goto done; 787 } 788 m->m_pkthdr.len = mhlen + len; 789 m->m_pkthdr.rcvif = NULL; 790#ifdef MAC 791 mac_netinet_fragment(m0, m); 792#endif 793 m->m_pkthdr.csum_flags = m0->m_pkthdr.csum_flags; 794 mhip->ip_off = htons(mhip->ip_off); 795 mhip->ip_sum = 0; 796 if (sw_csum & CSUM_DELAY_IP) 797 mhip->ip_sum = in_cksum(m, mhlen); 798 *mnext = m; 799 mnext = &m->m_nextpkt; 800 } 801 V_ipstat.ips_ofragments += nfrags; 802 803 /* set first marker for fragment chain */ 804 m0->m_flags |= M_FIRSTFRAG | M_FRAG; 805 m0->m_pkthdr.csum_data = nfrags; 806 807 /* 808 * Update first fragment by trimming what's been copied out 809 * and updating header. 810 */ 811 m_adj(m0, hlen + firstlen - ip->ip_len); 812 m0->m_pkthdr.len = hlen + firstlen; 813 ip->ip_len = htons((u_short)m0->m_pkthdr.len); 814 ip->ip_off |= IP_MF; 815 ip->ip_off = htons(ip->ip_off); 816 ip->ip_sum = 0; 817 if (sw_csum & CSUM_DELAY_IP) 818 ip->ip_sum = in_cksum(m0, hlen); 819 820done: 821 *m_frag = m0; 822 return error; 823} 824 825void 826in_delayed_cksum(struct mbuf *m) 827{ 828 struct ip *ip; 829 u_short csum, offset; 830 831 ip = mtod(m, struct ip *); 832 offset = ip->ip_hl << 2 ; 833 csum = in_cksum_skip(m, ip->ip_len, offset); 834 if (m->m_pkthdr.csum_flags & CSUM_UDP && csum == 0) 835 csum = 0xffff; 836 offset += m->m_pkthdr.csum_data; /* checksum offset */ 837 838 if (offset + sizeof(u_short) > m->m_len) { 839 printf("delayed m_pullup, m->len: %d off: %d p: %d\n", 840 m->m_len, offset, ip->ip_p); 841 /* 842 * XXX 843 * this shouldn't happen, but if it does, the 844 * correct behavior may be to insert the checksum 845 * in the appropriate next mbuf in the chain. 846 */ 847 return; 848 } 849 *(u_short *)(m->m_data + offset) = csum; 850} 851 852/* 853 * IP socket option processing. 854 */ 855int 856ip_ctloutput(struct socket *so, struct sockopt *sopt) 857{ 858 struct inpcb *inp = sotoinpcb(so); 859 int error, optval; 860 861 error = optval = 0; 862 if (sopt->sopt_level != IPPROTO_IP) { 863 if ((sopt->sopt_level == SOL_SOCKET) && 864 (sopt->sopt_name == SO_SETFIB)) { 865 inp->inp_inc.inc_fibnum = so->so_fibnum; 866 return (0); 867 } 868 return (EINVAL); 869 } 870 871 switch (sopt->sopt_dir) { 872 case SOPT_SET: 873 switch (sopt->sopt_name) { 874 case IP_OPTIONS: 875#ifdef notyet 876 case IP_RETOPTS: 877#endif 878 { 879 struct mbuf *m; 880 if (sopt->sopt_valsize > MLEN) { 881 error = EMSGSIZE; 882 break; 883 } 884 MGET(m, sopt->sopt_td ? M_WAIT : M_DONTWAIT, MT_DATA); 885 if (m == NULL) { 886 error = ENOBUFS; 887 break; 888 } 889 m->m_len = sopt->sopt_valsize; 890 error = sooptcopyin(sopt, mtod(m, char *), m->m_len, 891 m->m_len); 892 if (error) { 893 m_free(m); 894 break; 895 } 896 INP_WLOCK(inp); 897 error = ip_pcbopts(inp, sopt->sopt_name, m); 898 INP_WUNLOCK(inp); 899 return (error); 900 } 901 902#if defined(IP_NONLOCALBIND) 903 case IP_NONLOCALOK: 904 if (! ip_nonlocalok) { 905 error = ENOPROTOOPT; 906 break; 907 } 908 /* FALLTHROUGH */ 909#endif 910 case IP_TOS: 911 case IP_TTL: 912 case IP_MINTTL: 913 case IP_RECVOPTS: 914 case IP_RECVRETOPTS: 915 case IP_RECVDSTADDR: 916 case IP_RECVTTL: 917 case IP_RECVIF: 918 case IP_FAITH: 919 case IP_ONESBCAST: 920 case IP_DONTFRAG: 921 error = sooptcopyin(sopt, &optval, sizeof optval, 922 sizeof optval); 923 if (error) 924 break; 925 926 switch (sopt->sopt_name) { 927 case IP_TOS: 928 inp->inp_ip_tos = optval; 929 break; 930 931 case IP_TTL: 932 inp->inp_ip_ttl = optval; 933 break; 934 935 case IP_MINTTL: 936 if (optval >= 0 && optval <= MAXTTL) 937 inp->inp_ip_minttl = optval; 938 else 939 error = EINVAL; 940 break; 941 942#define OPTSET(bit) do { \ 943 INP_WLOCK(inp); \ 944 if (optval) \ 945 inp->inp_flags |= bit; \ 946 else \ 947 inp->inp_flags &= ~bit; \ 948 INP_WUNLOCK(inp); \ 949} while (0) 950 951 case IP_RECVOPTS: 952 OPTSET(INP_RECVOPTS); 953 break; 954 955 case IP_RECVRETOPTS: 956 OPTSET(INP_RECVRETOPTS); 957 break; 958 959 case IP_RECVDSTADDR: 960 OPTSET(INP_RECVDSTADDR); 961 break; 962 963 case IP_RECVTTL: 964 OPTSET(INP_RECVTTL); 965 break; 966 967 case IP_RECVIF: 968 OPTSET(INP_RECVIF); 969 break; 970 971 case IP_FAITH: 972 OPTSET(INP_FAITH); 973 break; 974 975 case IP_ONESBCAST: 976 OPTSET(INP_ONESBCAST); 977 break; 978 case IP_DONTFRAG: 979 OPTSET(INP_DONTFRAG); 980 break; 981#if defined(IP_NONLOCALBIND) 982 case IP_NONLOCALOK: 983 OPTSET(INP_NONLOCALOK); 984 break; 985#endif 986 } 987 break; 988#undef OPTSET 989 990 /* 991 * Multicast socket options are processed by the in_mcast 992 * module. 993 */ 994 case IP_MULTICAST_IF: 995 case IP_MULTICAST_VIF: 996 case IP_MULTICAST_TTL: 997 case IP_MULTICAST_LOOP: 998 case IP_ADD_MEMBERSHIP: 999 case IP_DROP_MEMBERSHIP: 1000 case IP_ADD_SOURCE_MEMBERSHIP: 1001 case IP_DROP_SOURCE_MEMBERSHIP: 1002 case IP_BLOCK_SOURCE: 1003 case IP_UNBLOCK_SOURCE: 1004 case IP_MSFILTER: 1005 case MCAST_JOIN_GROUP: 1006 case MCAST_LEAVE_GROUP: 1007 case MCAST_JOIN_SOURCE_GROUP: 1008 case MCAST_LEAVE_SOURCE_GROUP: 1009 case MCAST_BLOCK_SOURCE: 1010 case MCAST_UNBLOCK_SOURCE: 1011 error = inp_setmoptions(inp, sopt); 1012 break; 1013 1014 case IP_PORTRANGE: 1015 error = sooptcopyin(sopt, &optval, sizeof optval, 1016 sizeof optval); 1017 if (error) 1018 break; 1019 1020 INP_WLOCK(inp); 1021 switch (optval) { 1022 case IP_PORTRANGE_DEFAULT: 1023 inp->inp_flags &= ~(INP_LOWPORT); 1024 inp->inp_flags &= ~(INP_HIGHPORT); 1025 break; 1026 1027 case IP_PORTRANGE_HIGH: 1028 inp->inp_flags &= ~(INP_LOWPORT); 1029 inp->inp_flags |= INP_HIGHPORT; 1030 break; 1031 1032 case IP_PORTRANGE_LOW: 1033 inp->inp_flags &= ~(INP_HIGHPORT); 1034 inp->inp_flags |= INP_LOWPORT; 1035 break; 1036 1037 default: 1038 error = EINVAL; 1039 break; 1040 } 1041 INP_WUNLOCK(inp); 1042 break; 1043 1044#ifdef IPSEC 1045 case IP_IPSEC_POLICY: 1046 { 1047 caddr_t req; 1048 struct mbuf *m; 1049 1050 if ((error = soopt_getm(sopt, &m)) != 0) /* XXX */ 1051 break; 1052 if ((error = soopt_mcopyin(sopt, m)) != 0) /* XXX */ 1053 break; 1054 req = mtod(m, caddr_t); 1055 error = ipsec_set_policy(inp, sopt->sopt_name, req, 1056 m->m_len, (sopt->sopt_td != NULL) ? 1057 sopt->sopt_td->td_ucred : NULL); 1058 m_freem(m); 1059 break; 1060 } 1061#endif /* IPSEC */ 1062 1063 default: 1064 error = ENOPROTOOPT; 1065 break; 1066 } 1067 break; 1068 1069 case SOPT_GET: 1070 switch (sopt->sopt_name) { 1071 case IP_OPTIONS: 1072 case IP_RETOPTS: 1073 if (inp->inp_options) 1074 error = sooptcopyout(sopt, 1075 mtod(inp->inp_options, 1076 char *), 1077 inp->inp_options->m_len); 1078 else 1079 sopt->sopt_valsize = 0; 1080 break; 1081 1082 case IP_TOS: 1083 case IP_TTL: 1084 case IP_MINTTL: 1085 case IP_RECVOPTS: 1086 case IP_RECVRETOPTS: 1087 case IP_RECVDSTADDR: 1088 case IP_RECVTTL: 1089 case IP_RECVIF: 1090 case IP_PORTRANGE: 1091 case IP_FAITH: 1092 case IP_ONESBCAST: 1093 case IP_DONTFRAG: 1094 switch (sopt->sopt_name) { 1095 1096 case IP_TOS: 1097 optval = inp->inp_ip_tos; 1098 break; 1099 1100 case IP_TTL: 1101 optval = inp->inp_ip_ttl; 1102 break; 1103 1104 case IP_MINTTL: 1105 optval = inp->inp_ip_minttl; 1106 break; 1107 1108#define OPTBIT(bit) (inp->inp_flags & bit ? 1 : 0) 1109 1110 case IP_RECVOPTS: 1111 optval = OPTBIT(INP_RECVOPTS); 1112 break; 1113 1114 case IP_RECVRETOPTS: 1115 optval = OPTBIT(INP_RECVRETOPTS); 1116 break; 1117 1118 case IP_RECVDSTADDR: 1119 optval = OPTBIT(INP_RECVDSTADDR); 1120 break; 1121 1122 case IP_RECVTTL: 1123 optval = OPTBIT(INP_RECVTTL); 1124 break; 1125 1126 case IP_RECVIF: 1127 optval = OPTBIT(INP_RECVIF); 1128 break; 1129 1130 case IP_PORTRANGE: 1131 if (inp->inp_flags & INP_HIGHPORT) 1132 optval = IP_PORTRANGE_HIGH; 1133 else if (inp->inp_flags & INP_LOWPORT) 1134 optval = IP_PORTRANGE_LOW; 1135 else 1136 optval = 0; 1137 break; 1138 1139 case IP_FAITH: 1140 optval = OPTBIT(INP_FAITH); 1141 break; 1142 1143 case IP_ONESBCAST: 1144 optval = OPTBIT(INP_ONESBCAST); 1145 break; 1146 case IP_DONTFRAG: 1147 optval = OPTBIT(INP_DONTFRAG); 1148 break; 1149 } 1150 error = sooptcopyout(sopt, &optval, sizeof optval); 1151 break; 1152 1153 /* 1154 * Multicast socket options are processed by the in_mcast 1155 * module. 1156 */ 1157 case IP_MULTICAST_IF: 1158 case IP_MULTICAST_VIF: 1159 case IP_MULTICAST_TTL: 1160 case IP_MULTICAST_LOOP: 1161 case IP_MSFILTER: 1162 error = inp_getmoptions(inp, sopt); 1163 break; 1164 1165#ifdef IPSEC 1166 case IP_IPSEC_POLICY: 1167 { 1168 struct mbuf *m = NULL; 1169 caddr_t req = NULL; 1170 size_t len = 0; 1171 1172 if (m != 0) { 1173 req = mtod(m, caddr_t); 1174 len = m->m_len; 1175 } 1176 error = ipsec_get_policy(sotoinpcb(so), req, len, &m); 1177 if (error == 0) 1178 error = soopt_mcopyout(sopt, m); /* XXX */ 1179 if (error == 0) 1180 m_freem(m); 1181 break; 1182 } 1183#endif /* IPSEC */ 1184 1185 default: 1186 error = ENOPROTOOPT; 1187 break; 1188 } 1189 break; 1190 } 1191 return (error); 1192} 1193 1194/* 1195 * Routine called from ip_output() to loop back a copy of an IP multicast 1196 * packet to the input queue of a specified interface. Note that this 1197 * calls the output routine of the loopback "driver", but with an interface 1198 * pointer that might NOT be a loopback interface -- evil, but easier than 1199 * replicating that code here. 1200 */ 1201static void 1202ip_mloopback(struct ifnet *ifp, struct mbuf *m, struct sockaddr_in *dst, 1203 int hlen) 1204{ 1205 register struct ip *ip; 1206 struct mbuf *copym; 1207 1208 /* 1209 * Make a deep copy of the packet because we're going to 1210 * modify the pack in order to generate checksums. 1211 */ 1212 copym = m_dup(m, M_DONTWAIT); 1213 if (copym != NULL && (copym->m_flags & M_EXT || copym->m_len < hlen)) 1214 copym = m_pullup(copym, hlen); 1215 if (copym != NULL) { 1216 /* If needed, compute the checksum and mark it as valid. */ 1217 if (copym->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { 1218 in_delayed_cksum(copym); 1219 copym->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; 1220 copym->m_pkthdr.csum_flags |= 1221 CSUM_DATA_VALID | CSUM_PSEUDO_HDR; 1222 copym->m_pkthdr.csum_data = 0xffff; 1223 } 1224 /* 1225 * We don't bother to fragment if the IP length is greater 1226 * than the interface's MTU. Can this possibly matter? 1227 */ 1228 ip = mtod(copym, struct ip *); 1229 ip->ip_len = htons(ip->ip_len); 1230 ip->ip_off = htons(ip->ip_off); 1231 ip->ip_sum = 0; 1232 ip->ip_sum = in_cksum(copym, hlen); 1233#if 1 /* XXX */ 1234 if (dst->sin_family != AF_INET) { 1235 printf("ip_mloopback: bad address family %d\n", 1236 dst->sin_family); 1237 dst->sin_family = AF_INET; 1238 } 1239#endif 1240 if_simloop(ifp, copym, dst->sin_family, 0); 1241 } 1242} 1243