ip_output.c revision 186717
1/*- 2 * Copyright (c) 1982, 1986, 1988, 1990, 1993 3 * The Regents of the University of California. All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 4. Neither the name of the University nor the names of its contributors 14 * may be used to endorse or promote products derived from this software 15 * without specific prior written permission. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 * 29 * @(#)ip_output.c 8.3 (Berkeley) 1/21/94 30 */ 31 32#include <sys/cdefs.h> 33__FBSDID("$FreeBSD: head/sys/netinet/ip_output.c 186717 2009-01-03 11:35:31Z rwatson $"); 34 35#include "opt_ipfw.h" 36#include "opt_ipsec.h" 37#include "opt_mac.h" 38#include "opt_mbuf_stress_test.h" 39#include "opt_mpath.h" 40 41#include <sys/param.h> 42#include <sys/systm.h> 43#include <sys/kernel.h> 44#include <sys/malloc.h> 45#include <sys/mbuf.h> 46#include <sys/priv.h> 47#include <sys/proc.h> 48#include <sys/protosw.h> 49#include <sys/socket.h> 50#include <sys/socketvar.h> 51#include <sys/sysctl.h> 52#include <sys/ucred.h> 53#include <sys/vimage.h> 54 55#include <net/if.h> 56#include <net/netisr.h> 57#include <net/pfil.h> 58#include <net/route.h> 59#ifdef RADIX_MPATH 60#include <net/radix_mpath.h> 61#endif 62#include <net/vnet.h> 63 64#include <netinet/in.h> 65#include <netinet/in_systm.h> 66#include <netinet/ip.h> 67#include <netinet/in_pcb.h> 68#include <netinet/in_var.h> 69#include <netinet/ip_var.h> 70#include <netinet/ip_options.h> 71#include <netinet/vinet.h> 72 73#ifdef IPSEC 74#include <netinet/ip_ipsec.h> 75#include <netipsec/ipsec.h> 76#endif /* IPSEC*/ 77 78#include <machine/in_cksum.h> 79 80#include <security/mac/mac_framework.h> 81 82#define print_ip(x, a, y) printf("%s %d.%d.%d.%d%s",\ 83 x, (ntohl(a.s_addr)>>24)&0xFF,\ 84 (ntohl(a.s_addr)>>16)&0xFF,\ 85 (ntohl(a.s_addr)>>8)&0xFF,\ 86 (ntohl(a.s_addr))&0xFF, y); 87 88#ifdef VIMAGE_GLOBALS 89u_short ip_id; 90#endif 91 92#ifdef MBUF_STRESS_TEST 93int mbuf_frag_size = 0; 94SYSCTL_INT(_net_inet_ip, OID_AUTO, mbuf_frag_size, CTLFLAG_RW, 95 &mbuf_frag_size, 0, "Fragment outgoing mbufs to this size"); 96#endif 97 98static void ip_mloopback 99 (struct ifnet *, struct mbuf *, struct sockaddr_in *, int); 100 101 102extern struct protosw inetsw[]; 103 104/* 105 * IP output. The packet in mbuf chain m contains a skeletal IP 106 * header (with len, off, ttl, proto, tos, src, dst). 107 * The mbuf chain containing the packet will be freed. 108 * The mbuf opt, if present, will not be freed. 109 * In the IP forwarding case, the packet will arrive with options already 110 * inserted, so must have a NULL opt pointer. 111 */ 112int 113ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags, 114 struct ip_moptions *imo, struct inpcb *inp) 115{ 116 INIT_VNET_NET(curvnet); 117 INIT_VNET_INET(curvnet); 118 struct ip *ip; 119 struct ifnet *ifp = NULL; /* keep compiler happy */ 120 struct mbuf *m0; 121 int hlen = sizeof (struct ip); 122 int mtu; 123 int len, error = 0; 124 struct sockaddr_in *dst = NULL; /* keep compiler happy */ 125 struct in_ifaddr *ia = NULL; 126 int isbroadcast, sw_csum; 127 struct route iproute; 128 struct in_addr odst; 129#ifdef IPFIREWALL_FORWARD 130 struct m_tag *fwd_tag = NULL; 131#endif 132 M_ASSERTPKTHDR(m); 133 134 if (ro == NULL) { 135 ro = &iproute; 136 bzero(ro, sizeof (*ro)); 137 } 138 139 if (inp != NULL) { 140 M_SETFIB(m, inp->inp_inc.inc_fibnum); 141 INP_LOCK_ASSERT(inp); 142 } 143 144 if (opt) { 145 len = 0; 146 m = ip_insertoptions(m, opt, &len); 147 if (len != 0) 148 hlen = len; 149 } 150 ip = mtod(m, struct ip *); 151 152 /* 153 * Fill in IP header. If we are not allowing fragmentation, 154 * then the ip_id field is meaningless, but we don't set it 155 * to zero. Doing so causes various problems when devices along 156 * the path (routers, load balancers, firewalls, etc.) illegally 157 * disable DF on our packet. Note that a 16-bit counter 158 * will wrap around in less than 10 seconds at 100 Mbit/s on a 159 * medium with MTU 1500. See Steven M. Bellovin, "A Technique 160 * for Counting NATted Hosts", Proc. IMW'02, available at 161 * <http://www.cs.columbia.edu/~smb/papers/fnat.pdf>. 162 */ 163 if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) { 164 ip->ip_v = IPVERSION; 165 ip->ip_hl = hlen >> 2; 166 ip->ip_id = ip_newid(); 167 V_ipstat.ips_localout++; 168 } else { 169 hlen = ip->ip_hl << 2; 170 } 171 172 dst = (struct sockaddr_in *)&ro->ro_dst; 173again: 174 /* 175 * If there is a cached route, 176 * check that it is to the same destination 177 * and is still up. If not, free it and try again. 178 * The address family should also be checked in case of sharing the 179 * cache with IPv6. 180 */ 181 if (ro->ro_rt && ((ro->ro_rt->rt_flags & RTF_UP) == 0 || 182 dst->sin_family != AF_INET || 183 dst->sin_addr.s_addr != ip->ip_dst.s_addr)) { 184 RTFREE(ro->ro_rt); 185 ro->ro_rt = (struct rtentry *)NULL; 186 } 187#ifdef IPFIREWALL_FORWARD 188 if (ro->ro_rt == NULL && fwd_tag == NULL) { 189#else 190 if (ro->ro_rt == NULL) { 191#endif 192 bzero(dst, sizeof(*dst)); 193 dst->sin_family = AF_INET; 194 dst->sin_len = sizeof(*dst); 195 dst->sin_addr = ip->ip_dst; 196 } 197 /* 198 * If routing to interface only, short circuit routing lookup. 199 * The use of an all-ones broadcast address implies this; an 200 * interface is specified by the broadcast address of an interface, 201 * or the destination address of a ptp interface. 202 */ 203 if (flags & IP_SENDONES) { 204 if ((ia = ifatoia(ifa_ifwithbroadaddr(sintosa(dst)))) == NULL && 205 (ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == NULL) { 206 V_ipstat.ips_noroute++; 207 error = ENETUNREACH; 208 goto bad; 209 } 210 ip->ip_dst.s_addr = INADDR_BROADCAST; 211 dst->sin_addr = ip->ip_dst; 212 ifp = ia->ia_ifp; 213 ip->ip_ttl = 1; 214 isbroadcast = 1; 215 } else if (flags & IP_ROUTETOIF) { 216 if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == NULL && 217 (ia = ifatoia(ifa_ifwithnet(sintosa(dst)))) == NULL) { 218 V_ipstat.ips_noroute++; 219 error = ENETUNREACH; 220 goto bad; 221 } 222 ifp = ia->ia_ifp; 223 ip->ip_ttl = 1; 224 isbroadcast = in_broadcast(dst->sin_addr, ifp); 225 } else if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) && 226 imo != NULL && imo->imo_multicast_ifp != NULL) { 227 /* 228 * Bypass the normal routing lookup for multicast 229 * packets if the interface is specified. 230 */ 231 ifp = imo->imo_multicast_ifp; 232 IFP_TO_IA(ifp, ia); 233 isbroadcast = 0; /* fool gcc */ 234 } else { 235 /* 236 * We want to do any cloning requested by the link layer, 237 * as this is probably required in all cases for correct 238 * operation (as it is for ARP). 239 */ 240 if (ro->ro_rt == NULL) 241#ifdef RADIX_MPATH 242 rtalloc_mpath_fib(ro, 243 ntohl(ip->ip_src.s_addr ^ ip->ip_dst.s_addr), 244 inp ? inp->inp_inc.inc_fibnum : M_GETFIB(m)); 245#else 246 in_rtalloc_ign(ro, 0, 247 inp ? inp->inp_inc.inc_fibnum : M_GETFIB(m)); 248#endif 249 if (ro->ro_rt == NULL) { 250 V_ipstat.ips_noroute++; 251 error = EHOSTUNREACH; 252 goto bad; 253 } 254 ia = ifatoia(ro->ro_rt->rt_ifa); 255 ifp = ro->ro_rt->rt_ifp; 256 ro->ro_rt->rt_rmx.rmx_pksent++; 257 if (ro->ro_rt->rt_flags & RTF_GATEWAY) 258 dst = (struct sockaddr_in *)ro->ro_rt->rt_gateway; 259 if (ro->ro_rt->rt_flags & RTF_HOST) 260 isbroadcast = (ro->ro_rt->rt_flags & RTF_BROADCAST); 261 else 262 isbroadcast = in_broadcast(dst->sin_addr, ifp); 263 } 264 /* 265 * Calculate MTU. If we have a route that is up, use that, 266 * otherwise use the interface's MTU. 267 */ 268 if (ro->ro_rt != NULL && (ro->ro_rt->rt_flags & (RTF_UP|RTF_HOST))) { 269 /* 270 * This case can happen if the user changed the MTU 271 * of an interface after enabling IP on it. Because 272 * most netifs don't keep track of routes pointing to 273 * them, there is no way for one to update all its 274 * routes when the MTU is changed. 275 */ 276 if (ro->ro_rt->rt_rmx.rmx_mtu > ifp->if_mtu) 277 ro->ro_rt->rt_rmx.rmx_mtu = ifp->if_mtu; 278 mtu = ro->ro_rt->rt_rmx.rmx_mtu; 279 } else { 280 mtu = ifp->if_mtu; 281 } 282 if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { 283 struct in_multi *inm; 284 285 m->m_flags |= M_MCAST; 286 /* 287 * IP destination address is multicast. Make sure "dst" 288 * still points to the address in "ro". (It may have been 289 * changed to point to a gateway address, above.) 290 */ 291 dst = (struct sockaddr_in *)&ro->ro_dst; 292 /* 293 * See if the caller provided any multicast options 294 */ 295 if (imo != NULL) { 296 ip->ip_ttl = imo->imo_multicast_ttl; 297 if (imo->imo_multicast_vif != -1) 298 ip->ip_src.s_addr = 299 ip_mcast_src ? 300 ip_mcast_src(imo->imo_multicast_vif) : 301 INADDR_ANY; 302 } else 303 ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL; 304 /* 305 * Confirm that the outgoing interface supports multicast. 306 */ 307 if ((imo == NULL) || (imo->imo_multicast_vif == -1)) { 308 if ((ifp->if_flags & IFF_MULTICAST) == 0) { 309 V_ipstat.ips_noroute++; 310 error = ENETUNREACH; 311 goto bad; 312 } 313 } 314 /* 315 * If source address not specified yet, use address 316 * of outgoing interface. 317 */ 318 if (ip->ip_src.s_addr == INADDR_ANY) { 319 /* Interface may have no addresses. */ 320 if (ia != NULL) 321 ip->ip_src = IA_SIN(ia)->sin_addr; 322 } 323 324 IN_MULTI_LOCK(); 325 IN_LOOKUP_MULTI(ip->ip_dst, ifp, inm); 326 if (inm != NULL && 327 (imo == NULL || imo->imo_multicast_loop)) { 328 IN_MULTI_UNLOCK(); 329 /* 330 * If we belong to the destination multicast group 331 * on the outgoing interface, and the caller did not 332 * forbid loopback, loop back a copy. 333 */ 334 ip_mloopback(ifp, m, dst, hlen); 335 } 336 else { 337 IN_MULTI_UNLOCK(); 338 /* 339 * If we are acting as a multicast router, perform 340 * multicast forwarding as if the packet had just 341 * arrived on the interface to which we are about 342 * to send. The multicast forwarding function 343 * recursively calls this function, using the 344 * IP_FORWARDING flag to prevent infinite recursion. 345 * 346 * Multicasts that are looped back by ip_mloopback(), 347 * above, will be forwarded by the ip_input() routine, 348 * if necessary. 349 */ 350 if (V_ip_mrouter && (flags & IP_FORWARDING) == 0) { 351 /* 352 * If rsvp daemon is not running, do not 353 * set ip_moptions. This ensures that the packet 354 * is multicast and not just sent down one link 355 * as prescribed by rsvpd. 356 */ 357 if (!V_rsvp_on) 358 imo = NULL; 359 if (ip_mforward && 360 ip_mforward(ip, ifp, m, imo) != 0) { 361 m_freem(m); 362 goto done; 363 } 364 } 365 } 366 367 /* 368 * Multicasts with a time-to-live of zero may be looped- 369 * back, above, but must not be transmitted on a network. 370 * Also, multicasts addressed to the loopback interface 371 * are not sent -- the above call to ip_mloopback() will 372 * loop back a copy if this host actually belongs to the 373 * destination group on the loopback interface. 374 */ 375 if (ip->ip_ttl == 0 || ifp->if_flags & IFF_LOOPBACK) { 376 m_freem(m); 377 goto done; 378 } 379 380 goto sendit; 381 } 382 383 /* 384 * If the source address is not specified yet, use the address 385 * of the outoing interface. 386 */ 387 if (ip->ip_src.s_addr == INADDR_ANY) { 388 /* Interface may have no addresses. */ 389 if (ia != NULL) { 390 ip->ip_src = IA_SIN(ia)->sin_addr; 391 } 392 } 393 394 /* 395 * Verify that we have any chance at all of being able to queue the 396 * packet or packet fragments, unless ALTQ is enabled on the given 397 * interface in which case packetdrop should be done by queueing. 398 */ 399#ifdef ALTQ 400 if ((!ALTQ_IS_ENABLED(&ifp->if_snd)) && 401 ((ifp->if_snd.ifq_len + ip->ip_len / mtu + 1) >= 402 ifp->if_snd.ifq_maxlen)) 403#else 404 if ((ifp->if_snd.ifq_len + ip->ip_len / mtu + 1) >= 405 ifp->if_snd.ifq_maxlen) 406#endif /* ALTQ */ 407 { 408 error = ENOBUFS; 409 V_ipstat.ips_odropped++; 410 ifp->if_snd.ifq_drops += (ip->ip_len / ifp->if_mtu + 1); 411 goto bad; 412 } 413 414 /* 415 * Look for broadcast address and 416 * verify user is allowed to send 417 * such a packet. 418 */ 419 if (isbroadcast) { 420 if ((ifp->if_flags & IFF_BROADCAST) == 0) { 421 error = EADDRNOTAVAIL; 422 goto bad; 423 } 424 if ((flags & IP_ALLOWBROADCAST) == 0) { 425 error = EACCES; 426 goto bad; 427 } 428 /* don't allow broadcast messages to be fragmented */ 429 if (ip->ip_len > mtu) { 430 error = EMSGSIZE; 431 goto bad; 432 } 433 m->m_flags |= M_BCAST; 434 } else { 435 m->m_flags &= ~M_BCAST; 436 } 437 438sendit: 439#ifdef IPSEC 440 switch(ip_ipsec_output(&m, inp, &flags, &error, &ro, &iproute, &dst, &ia, &ifp)) { 441 case 1: 442 goto bad; 443 case -1: 444 goto done; 445 case 0: 446 default: 447 break; /* Continue with packet processing. */ 448 } 449 /* Update variables that are affected by ipsec4_output(). */ 450 ip = mtod(m, struct ip *); 451 hlen = ip->ip_hl << 2; 452#endif /* IPSEC */ 453 454 /* Jump over all PFIL processing if hooks are not active. */ 455 if (!PFIL_HOOKED(&inet_pfil_hook)) 456 goto passout; 457 458 /* Run through list of hooks for output packets. */ 459 odst.s_addr = ip->ip_dst.s_addr; 460 error = pfil_run_hooks(&inet_pfil_hook, &m, ifp, PFIL_OUT, inp); 461 if (error != 0 || m == NULL) 462 goto done; 463 464 ip = mtod(m, struct ip *); 465 466 /* See if destination IP address was changed by packet filter. */ 467 if (odst.s_addr != ip->ip_dst.s_addr) { 468 m->m_flags |= M_SKIP_FIREWALL; 469 /* If destination is now ourself drop to ip_input(). */ 470 if (in_localip(ip->ip_dst)) { 471 m->m_flags |= M_FASTFWD_OURS; 472 if (m->m_pkthdr.rcvif == NULL) 473 m->m_pkthdr.rcvif = V_loif; 474 if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { 475 m->m_pkthdr.csum_flags |= 476 CSUM_DATA_VALID | CSUM_PSEUDO_HDR; 477 m->m_pkthdr.csum_data = 0xffff; 478 } 479 m->m_pkthdr.csum_flags |= 480 CSUM_IP_CHECKED | CSUM_IP_VALID; 481 482 error = netisr_queue(NETISR_IP, m); 483 goto done; 484 } else 485 goto again; /* Redo the routing table lookup. */ 486 } 487 488#ifdef IPFIREWALL_FORWARD 489 /* See if local, if yes, send it to netisr with IP_FASTFWD_OURS. */ 490 if (m->m_flags & M_FASTFWD_OURS) { 491 if (m->m_pkthdr.rcvif == NULL) 492 m->m_pkthdr.rcvif = V_loif; 493 if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { 494 m->m_pkthdr.csum_flags |= 495 CSUM_DATA_VALID | CSUM_PSEUDO_HDR; 496 m->m_pkthdr.csum_data = 0xffff; 497 } 498 m->m_pkthdr.csum_flags |= 499 CSUM_IP_CHECKED | CSUM_IP_VALID; 500 501 error = netisr_queue(NETISR_IP, m); 502 goto done; 503 } 504 /* Or forward to some other address? */ 505 fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL); 506 if (fwd_tag) { 507 dst = (struct sockaddr_in *)&ro->ro_dst; 508 bcopy((fwd_tag+1), dst, sizeof(struct sockaddr_in)); 509 m->m_flags |= M_SKIP_FIREWALL; 510 m_tag_delete(m, fwd_tag); 511 goto again; 512 } 513#endif /* IPFIREWALL_FORWARD */ 514 515passout: 516 /* 127/8 must not appear on wire - RFC1122. */ 517 if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET || 518 (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) { 519 if ((ifp->if_flags & IFF_LOOPBACK) == 0) { 520 V_ipstat.ips_badaddr++; 521 error = EADDRNOTAVAIL; 522 goto bad; 523 } 524 } 525 526 m->m_pkthdr.csum_flags |= CSUM_IP; 527 sw_csum = m->m_pkthdr.csum_flags & ~ifp->if_hwassist; 528 if (sw_csum & CSUM_DELAY_DATA) { 529 in_delayed_cksum(m); 530 sw_csum &= ~CSUM_DELAY_DATA; 531 } 532 m->m_pkthdr.csum_flags &= ifp->if_hwassist; 533 534 /* 535 * If small enough for interface, or the interface will take 536 * care of the fragmentation for us, we can just send directly. 537 */ 538 if (ip->ip_len <= mtu || 539 (m->m_pkthdr.csum_flags & ifp->if_hwassist & CSUM_TSO) != 0 || 540 ((ip->ip_off & IP_DF) == 0 && (ifp->if_hwassist & CSUM_FRAGMENT))) { 541 ip->ip_len = htons(ip->ip_len); 542 ip->ip_off = htons(ip->ip_off); 543 ip->ip_sum = 0; 544 if (sw_csum & CSUM_DELAY_IP) 545 ip->ip_sum = in_cksum(m, hlen); 546 547 /* 548 * Record statistics for this interface address. 549 * With CSUM_TSO the byte/packet count will be slightly 550 * incorrect because we count the IP+TCP headers only 551 * once instead of for every generated packet. 552 */ 553 if (!(flags & IP_FORWARDING) && ia) { 554 if (m->m_pkthdr.csum_flags & CSUM_TSO) 555 ia->ia_ifa.if_opackets += 556 m->m_pkthdr.len / m->m_pkthdr.tso_segsz; 557 else 558 ia->ia_ifa.if_opackets++; 559 ia->ia_ifa.if_obytes += m->m_pkthdr.len; 560 } 561#ifdef MBUF_STRESS_TEST 562 if (mbuf_frag_size && m->m_pkthdr.len > mbuf_frag_size) 563 m = m_fragment(m, M_DONTWAIT, mbuf_frag_size); 564#endif 565 /* 566 * Reset layer specific mbuf flags 567 * to avoid confusing lower layers. 568 */ 569 m->m_flags &= ~(M_PROTOFLAGS); 570 error = (*ifp->if_output)(ifp, m, 571 (struct sockaddr *)dst, ro->ro_rt); 572 goto done; 573 } 574 575 /* Balk when DF bit is set or the interface didn't support TSO. */ 576 if ((ip->ip_off & IP_DF) || (m->m_pkthdr.csum_flags & CSUM_TSO)) { 577 error = EMSGSIZE; 578 V_ipstat.ips_cantfrag++; 579 goto bad; 580 } 581 582 /* 583 * Too large for interface; fragment if possible. If successful, 584 * on return, m will point to a list of packets to be sent. 585 */ 586 error = ip_fragment(ip, &m, mtu, ifp->if_hwassist, sw_csum); 587 if (error) 588 goto bad; 589 for (; m; m = m0) { 590 m0 = m->m_nextpkt; 591 m->m_nextpkt = 0; 592 if (error == 0) { 593 /* Record statistics for this interface address. */ 594 if (ia != NULL) { 595 ia->ia_ifa.if_opackets++; 596 ia->ia_ifa.if_obytes += m->m_pkthdr.len; 597 } 598 /* 599 * Reset layer specific mbuf flags 600 * to avoid confusing upper layers. 601 */ 602 m->m_flags &= ~(M_PROTOFLAGS); 603 604 error = (*ifp->if_output)(ifp, m, 605 (struct sockaddr *)dst, ro->ro_rt); 606 } else 607 m_freem(m); 608 } 609 610 if (error == 0) 611 V_ipstat.ips_fragmented++; 612 613done: 614 if (ro == &iproute && ro->ro_rt) { 615 RTFREE(ro->ro_rt); 616 } 617 return (error); 618bad: 619 m_freem(m); 620 goto done; 621} 622 623/* 624 * Create a chain of fragments which fit the given mtu. m_frag points to the 625 * mbuf to be fragmented; on return it points to the chain with the fragments. 626 * Return 0 if no error. If error, m_frag may contain a partially built 627 * chain of fragments that should be freed by the caller. 628 * 629 * if_hwassist_flags is the hw offload capabilities (see if_data.ifi_hwassist) 630 * sw_csum contains the delayed checksums flags (e.g., CSUM_DELAY_IP). 631 */ 632int 633ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu, 634 u_long if_hwassist_flags, int sw_csum) 635{ 636 INIT_VNET_INET(curvnet); 637 int error = 0; 638 int hlen = ip->ip_hl << 2; 639 int len = (mtu - hlen) & ~7; /* size of payload in each fragment */ 640 int off; 641 struct mbuf *m0 = *m_frag; /* the original packet */ 642 int firstlen; 643 struct mbuf **mnext; 644 int nfrags; 645 646 if (ip->ip_off & IP_DF) { /* Fragmentation not allowed */ 647 V_ipstat.ips_cantfrag++; 648 return EMSGSIZE; 649 } 650 651 /* 652 * Must be able to put at least 8 bytes per fragment. 653 */ 654 if (len < 8) 655 return EMSGSIZE; 656 657 /* 658 * If the interface will not calculate checksums on 659 * fragmented packets, then do it here. 660 */ 661 if (m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA && 662 (if_hwassist_flags & CSUM_IP_FRAGS) == 0) { 663 in_delayed_cksum(m0); 664 m0->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; 665 } 666 667 if (len > PAGE_SIZE) { 668 /* 669 * Fragment large datagrams such that each segment 670 * contains a multiple of PAGE_SIZE amount of data, 671 * plus headers. This enables a receiver to perform 672 * page-flipping zero-copy optimizations. 673 * 674 * XXX When does this help given that sender and receiver 675 * could have different page sizes, and also mtu could 676 * be less than the receiver's page size ? 677 */ 678 int newlen; 679 struct mbuf *m; 680 681 for (m = m0, off = 0; m && (off+m->m_len) <= mtu; m = m->m_next) 682 off += m->m_len; 683 684 /* 685 * firstlen (off - hlen) must be aligned on an 686 * 8-byte boundary 687 */ 688 if (off < hlen) 689 goto smart_frag_failure; 690 off = ((off - hlen) & ~7) + hlen; 691 newlen = (~PAGE_MASK) & mtu; 692 if ((newlen + sizeof (struct ip)) > mtu) { 693 /* we failed, go back the default */ 694smart_frag_failure: 695 newlen = len; 696 off = hlen + len; 697 } 698 len = newlen; 699 700 } else { 701 off = hlen + len; 702 } 703 704 firstlen = off - hlen; 705 mnext = &m0->m_nextpkt; /* pointer to next packet */ 706 707 /* 708 * Loop through length of segment after first fragment, 709 * make new header and copy data of each part and link onto chain. 710 * Here, m0 is the original packet, m is the fragment being created. 711 * The fragments are linked off the m_nextpkt of the original 712 * packet, which after processing serves as the first fragment. 713 */ 714 for (nfrags = 1; off < ip->ip_len; off += len, nfrags++) { 715 struct ip *mhip; /* ip header on the fragment */ 716 struct mbuf *m; 717 int mhlen = sizeof (struct ip); 718 719 MGETHDR(m, M_DONTWAIT, MT_DATA); 720 if (m == NULL) { 721 error = ENOBUFS; 722 V_ipstat.ips_odropped++; 723 goto done; 724 } 725 m->m_flags |= (m0->m_flags & M_MCAST) | M_FRAG; 726 /* 727 * In the first mbuf, leave room for the link header, then 728 * copy the original IP header including options. The payload 729 * goes into an additional mbuf chain returned by m_copy(). 730 */ 731 m->m_data += max_linkhdr; 732 mhip = mtod(m, struct ip *); 733 *mhip = *ip; 734 if (hlen > sizeof (struct ip)) { 735 mhlen = ip_optcopy(ip, mhip) + sizeof (struct ip); 736 mhip->ip_v = IPVERSION; 737 mhip->ip_hl = mhlen >> 2; 738 } 739 m->m_len = mhlen; 740 /* XXX do we need to add ip->ip_off below ? */ 741 mhip->ip_off = ((off - hlen) >> 3) + ip->ip_off; 742 if (off + len >= ip->ip_len) { /* last fragment */ 743 len = ip->ip_len - off; 744 m->m_flags |= M_LASTFRAG; 745 } else 746 mhip->ip_off |= IP_MF; 747 mhip->ip_len = htons((u_short)(len + mhlen)); 748 m->m_next = m_copy(m0, off, len); 749 if (m->m_next == NULL) { /* copy failed */ 750 m_free(m); 751 error = ENOBUFS; /* ??? */ 752 V_ipstat.ips_odropped++; 753 goto done; 754 } 755 m->m_pkthdr.len = mhlen + len; 756 m->m_pkthdr.rcvif = NULL; 757#ifdef MAC 758 mac_netinet_fragment(m0, m); 759#endif 760 m->m_pkthdr.csum_flags = m0->m_pkthdr.csum_flags; 761 mhip->ip_off = htons(mhip->ip_off); 762 mhip->ip_sum = 0; 763 if (sw_csum & CSUM_DELAY_IP) 764 mhip->ip_sum = in_cksum(m, mhlen); 765 *mnext = m; 766 mnext = &m->m_nextpkt; 767 } 768 V_ipstat.ips_ofragments += nfrags; 769 770 /* set first marker for fragment chain */ 771 m0->m_flags |= M_FIRSTFRAG | M_FRAG; 772 m0->m_pkthdr.csum_data = nfrags; 773 774 /* 775 * Update first fragment by trimming what's been copied out 776 * and updating header. 777 */ 778 m_adj(m0, hlen + firstlen - ip->ip_len); 779 m0->m_pkthdr.len = hlen + firstlen; 780 ip->ip_len = htons((u_short)m0->m_pkthdr.len); 781 ip->ip_off |= IP_MF; 782 ip->ip_off = htons(ip->ip_off); 783 ip->ip_sum = 0; 784 if (sw_csum & CSUM_DELAY_IP) 785 ip->ip_sum = in_cksum(m0, hlen); 786 787done: 788 *m_frag = m0; 789 return error; 790} 791 792void 793in_delayed_cksum(struct mbuf *m) 794{ 795 struct ip *ip; 796 u_short csum, offset; 797 798 ip = mtod(m, struct ip *); 799 offset = ip->ip_hl << 2 ; 800 csum = in_cksum_skip(m, ip->ip_len, offset); 801 if (m->m_pkthdr.csum_flags & CSUM_UDP && csum == 0) 802 csum = 0xffff; 803 offset += m->m_pkthdr.csum_data; /* checksum offset */ 804 805 if (offset + sizeof(u_short) > m->m_len) { 806 printf("delayed m_pullup, m->len: %d off: %d p: %d\n", 807 m->m_len, offset, ip->ip_p); 808 /* 809 * XXX 810 * this shouldn't happen, but if it does, the 811 * correct behavior may be to insert the checksum 812 * in the appropriate next mbuf in the chain. 813 */ 814 return; 815 } 816 *(u_short *)(m->m_data + offset) = csum; 817} 818 819/* 820 * IP socket option processing. 821 */ 822int 823ip_ctloutput(struct socket *so, struct sockopt *sopt) 824{ 825 struct inpcb *inp = sotoinpcb(so); 826 int error, optval; 827 828 error = optval = 0; 829 if (sopt->sopt_level != IPPROTO_IP) { 830 if ((sopt->sopt_level == SOL_SOCKET) && 831 (sopt->sopt_name == SO_SETFIB)) { 832 inp->inp_inc.inc_fibnum = so->so_fibnum; 833 return (0); 834 } 835 return (EINVAL); 836 } 837 838 switch (sopt->sopt_dir) { 839 case SOPT_SET: 840 switch (sopt->sopt_name) { 841 case IP_OPTIONS: 842#ifdef notyet 843 case IP_RETOPTS: 844#endif 845 { 846 struct mbuf *m; 847 if (sopt->sopt_valsize > MLEN) { 848 error = EMSGSIZE; 849 break; 850 } 851 MGET(m, sopt->sopt_td ? M_WAIT : M_DONTWAIT, MT_DATA); 852 if (m == NULL) { 853 error = ENOBUFS; 854 break; 855 } 856 m->m_len = sopt->sopt_valsize; 857 error = sooptcopyin(sopt, mtod(m, char *), m->m_len, 858 m->m_len); 859 if (error) { 860 m_free(m); 861 break; 862 } 863 INP_WLOCK(inp); 864 error = ip_pcbopts(inp, sopt->sopt_name, m); 865 INP_WUNLOCK(inp); 866 return (error); 867 } 868 869 case IP_TOS: 870 case IP_TTL: 871 case IP_MINTTL: 872 case IP_RECVOPTS: 873 case IP_RECVRETOPTS: 874 case IP_RECVDSTADDR: 875 case IP_RECVTTL: 876 case IP_RECVIF: 877 case IP_FAITH: 878 case IP_ONESBCAST: 879 case IP_DONTFRAG: 880 error = sooptcopyin(sopt, &optval, sizeof optval, 881 sizeof optval); 882 if (error) 883 break; 884 885 switch (sopt->sopt_name) { 886 case IP_TOS: 887 inp->inp_ip_tos = optval; 888 break; 889 890 case IP_TTL: 891 inp->inp_ip_ttl = optval; 892 break; 893 894 case IP_MINTTL: 895 if (optval >= 0 && optval <= MAXTTL) 896 inp->inp_ip_minttl = optval; 897 else 898 error = EINVAL; 899 break; 900 901#define OPTSET(bit) do { \ 902 INP_WLOCK(inp); \ 903 if (optval) \ 904 inp->inp_flags |= bit; \ 905 else \ 906 inp->inp_flags &= ~bit; \ 907 INP_WUNLOCK(inp); \ 908} while (0) 909 910 case IP_RECVOPTS: 911 OPTSET(INP_RECVOPTS); 912 break; 913 914 case IP_RECVRETOPTS: 915 OPTSET(INP_RECVRETOPTS); 916 break; 917 918 case IP_RECVDSTADDR: 919 OPTSET(INP_RECVDSTADDR); 920 break; 921 922 case IP_RECVTTL: 923 OPTSET(INP_RECVTTL); 924 break; 925 926 case IP_RECVIF: 927 OPTSET(INP_RECVIF); 928 break; 929 930 case IP_FAITH: 931 OPTSET(INP_FAITH); 932 break; 933 934 case IP_ONESBCAST: 935 OPTSET(INP_ONESBCAST); 936 break; 937 case IP_DONTFRAG: 938 OPTSET(INP_DONTFRAG); 939 break; 940 } 941 break; 942#undef OPTSET 943 944 /* 945 * Multicast socket options are processed by the in_mcast 946 * module. 947 */ 948 case IP_MULTICAST_IF: 949 case IP_MULTICAST_VIF: 950 case IP_MULTICAST_TTL: 951 case IP_MULTICAST_LOOP: 952 case IP_ADD_MEMBERSHIP: 953 case IP_DROP_MEMBERSHIP: 954 case IP_ADD_SOURCE_MEMBERSHIP: 955 case IP_DROP_SOURCE_MEMBERSHIP: 956 case IP_BLOCK_SOURCE: 957 case IP_UNBLOCK_SOURCE: 958 case IP_MSFILTER: 959 case MCAST_JOIN_GROUP: 960 case MCAST_LEAVE_GROUP: 961 case MCAST_JOIN_SOURCE_GROUP: 962 case MCAST_LEAVE_SOURCE_GROUP: 963 case MCAST_BLOCK_SOURCE: 964 case MCAST_UNBLOCK_SOURCE: 965 error = inp_setmoptions(inp, sopt); 966 break; 967 968 case IP_PORTRANGE: 969 error = sooptcopyin(sopt, &optval, sizeof optval, 970 sizeof optval); 971 if (error) 972 break; 973 974 INP_WLOCK(inp); 975 switch (optval) { 976 case IP_PORTRANGE_DEFAULT: 977 inp->inp_flags &= ~(INP_LOWPORT); 978 inp->inp_flags &= ~(INP_HIGHPORT); 979 break; 980 981 case IP_PORTRANGE_HIGH: 982 inp->inp_flags &= ~(INP_LOWPORT); 983 inp->inp_flags |= INP_HIGHPORT; 984 break; 985 986 case IP_PORTRANGE_LOW: 987 inp->inp_flags &= ~(INP_HIGHPORT); 988 inp->inp_flags |= INP_LOWPORT; 989 break; 990 991 default: 992 error = EINVAL; 993 break; 994 } 995 INP_WUNLOCK(inp); 996 break; 997 998#ifdef IPSEC 999 case IP_IPSEC_POLICY: 1000 { 1001 caddr_t req; 1002 struct mbuf *m; 1003 1004 if ((error = soopt_getm(sopt, &m)) != 0) /* XXX */ 1005 break; 1006 if ((error = soopt_mcopyin(sopt, m)) != 0) /* XXX */ 1007 break; 1008 req = mtod(m, caddr_t); 1009 error = ipsec4_set_policy(inp, sopt->sopt_name, req, 1010 m->m_len, (sopt->sopt_td != NULL) ? 1011 sopt->sopt_td->td_ucred : NULL); 1012 m_freem(m); 1013 break; 1014 } 1015#endif /* IPSEC */ 1016 1017 default: 1018 error = ENOPROTOOPT; 1019 break; 1020 } 1021 break; 1022 1023 case SOPT_GET: 1024 switch (sopt->sopt_name) { 1025 case IP_OPTIONS: 1026 case IP_RETOPTS: 1027 if (inp->inp_options) 1028 error = sooptcopyout(sopt, 1029 mtod(inp->inp_options, 1030 char *), 1031 inp->inp_options->m_len); 1032 else 1033 sopt->sopt_valsize = 0; 1034 break; 1035 1036 case IP_TOS: 1037 case IP_TTL: 1038 case IP_MINTTL: 1039 case IP_RECVOPTS: 1040 case IP_RECVRETOPTS: 1041 case IP_RECVDSTADDR: 1042 case IP_RECVTTL: 1043 case IP_RECVIF: 1044 case IP_PORTRANGE: 1045 case IP_FAITH: 1046 case IP_ONESBCAST: 1047 case IP_DONTFRAG: 1048 switch (sopt->sopt_name) { 1049 1050 case IP_TOS: 1051 optval = inp->inp_ip_tos; 1052 break; 1053 1054 case IP_TTL: 1055 optval = inp->inp_ip_ttl; 1056 break; 1057 1058 case IP_MINTTL: 1059 optval = inp->inp_ip_minttl; 1060 break; 1061 1062#define OPTBIT(bit) (inp->inp_flags & bit ? 1 : 0) 1063 1064 case IP_RECVOPTS: 1065 optval = OPTBIT(INP_RECVOPTS); 1066 break; 1067 1068 case IP_RECVRETOPTS: 1069 optval = OPTBIT(INP_RECVRETOPTS); 1070 break; 1071 1072 case IP_RECVDSTADDR: 1073 optval = OPTBIT(INP_RECVDSTADDR); 1074 break; 1075 1076 case IP_RECVTTL: 1077 optval = OPTBIT(INP_RECVTTL); 1078 break; 1079 1080 case IP_RECVIF: 1081 optval = OPTBIT(INP_RECVIF); 1082 break; 1083 1084 case IP_PORTRANGE: 1085 if (inp->inp_flags & INP_HIGHPORT) 1086 optval = IP_PORTRANGE_HIGH; 1087 else if (inp->inp_flags & INP_LOWPORT) 1088 optval = IP_PORTRANGE_LOW; 1089 else 1090 optval = 0; 1091 break; 1092 1093 case IP_FAITH: 1094 optval = OPTBIT(INP_FAITH); 1095 break; 1096 1097 case IP_ONESBCAST: 1098 optval = OPTBIT(INP_ONESBCAST); 1099 break; 1100 case IP_DONTFRAG: 1101 optval = OPTBIT(INP_DONTFRAG); 1102 break; 1103 } 1104 error = sooptcopyout(sopt, &optval, sizeof optval); 1105 break; 1106 1107 /* 1108 * Multicast socket options are processed by the in_mcast 1109 * module. 1110 */ 1111 case IP_MULTICAST_IF: 1112 case IP_MULTICAST_VIF: 1113 case IP_MULTICAST_TTL: 1114 case IP_MULTICAST_LOOP: 1115 case IP_MSFILTER: 1116 error = inp_getmoptions(inp, sopt); 1117 break; 1118 1119#ifdef IPSEC 1120 case IP_IPSEC_POLICY: 1121 { 1122 struct mbuf *m = NULL; 1123 caddr_t req = NULL; 1124 size_t len = 0; 1125 1126 if (m != 0) { 1127 req = mtod(m, caddr_t); 1128 len = m->m_len; 1129 } 1130 error = ipsec4_get_policy(sotoinpcb(so), req, len, &m); 1131 if (error == 0) 1132 error = soopt_mcopyout(sopt, m); /* XXX */ 1133 if (error == 0) 1134 m_freem(m); 1135 break; 1136 } 1137#endif /* IPSEC */ 1138 1139 default: 1140 error = ENOPROTOOPT; 1141 break; 1142 } 1143 break; 1144 } 1145 return (error); 1146} 1147 1148/* 1149 * Routine called from ip_output() to loop back a copy of an IP multicast 1150 * packet to the input queue of a specified interface. Note that this 1151 * calls the output routine of the loopback "driver", but with an interface 1152 * pointer that might NOT be a loopback interface -- evil, but easier than 1153 * replicating that code here. 1154 */ 1155static void 1156ip_mloopback(struct ifnet *ifp, struct mbuf *m, struct sockaddr_in *dst, 1157 int hlen) 1158{ 1159 register struct ip *ip; 1160 struct mbuf *copym; 1161 1162 /* 1163 * Make a deep copy of the packet because we're going to 1164 * modify the pack in order to generate checksums. 1165 */ 1166 copym = m_dup(m, M_DONTWAIT); 1167 if (copym != NULL && (copym->m_flags & M_EXT || copym->m_len < hlen)) 1168 copym = m_pullup(copym, hlen); 1169 if (copym != NULL) { 1170 /* If needed, compute the checksum and mark it as valid. */ 1171 if (copym->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { 1172 in_delayed_cksum(copym); 1173 copym->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; 1174 copym->m_pkthdr.csum_flags |= 1175 CSUM_DATA_VALID | CSUM_PSEUDO_HDR; 1176 copym->m_pkthdr.csum_data = 0xffff; 1177 } 1178 /* 1179 * We don't bother to fragment if the IP length is greater 1180 * than the interface's MTU. Can this possibly matter? 1181 */ 1182 ip = mtod(copym, struct ip *); 1183 ip->ip_len = htons(ip->ip_len); 1184 ip->ip_off = htons(ip->ip_off); 1185 ip->ip_sum = 0; 1186 ip->ip_sum = in_cksum(copym, hlen); 1187#if 1 /* XXX */ 1188 if (dst->sin_family != AF_INET) { 1189 printf("ip_mloopback: bad address family %d\n", 1190 dst->sin_family); 1191 dst->sin_family = AF_INET; 1192 } 1193#endif 1194 if_simloop(ifp, copym, dst->sin_family, 0); 1195 } 1196} 1197