/* ip_output.c revision 185895 */
1/*- 2 * Copyright (c) 1982, 1986, 1988, 1990, 1993 3 * The Regents of the University of California. All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 4. Neither the name of the University nor the names of its contributors 14 * may be used to endorse or promote products derived from this software 15 * without specific prior written permission. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 
28 * 29 * @(#)ip_output.c 8.3 (Berkeley) 1/21/94 30 */ 31 32#include <sys/cdefs.h> 33__FBSDID("$FreeBSD: head/sys/netinet/ip_output.c 185895 2008-12-10 23:12:39Z zec $"); 34 35#include "opt_ipfw.h" 36#include "opt_ipsec.h" 37#include "opt_mac.h" 38#include "opt_mbuf_stress_test.h" 39#include "opt_mpath.h" 40 41#include <sys/param.h> 42#include <sys/systm.h> 43#include <sys/kernel.h> 44#include <sys/malloc.h> 45#include <sys/mbuf.h> 46#include <sys/priv.h> 47#include <sys/proc.h> 48#include <sys/protosw.h> 49#include <sys/socket.h> 50#include <sys/socketvar.h> 51#include <sys/sysctl.h> 52#include <sys/ucred.h> 53#include <sys/vimage.h> 54 55#include <net/if.h> 56#include <net/netisr.h> 57#include <net/pfil.h> 58#include <net/route.h> 59#ifdef RADIX_MPATH 60#include <net/radix_mpath.h> 61#endif 62#include <net/vnet.h> 63 64#include <netinet/in.h> 65#include <netinet/in_systm.h> 66#include <netinet/ip.h> 67#include <netinet/in_pcb.h> 68#include <netinet/in_var.h> 69#include <netinet/ip_var.h> 70#include <netinet/ip_options.h> 71#include <netinet/vinet.h> 72 73#ifdef IPSEC 74#include <netinet/ip_ipsec.h> 75#include <netipsec/ipsec.h> 76#endif /* IPSEC*/ 77 78#include <machine/in_cksum.h> 79 80#include <security/mac/mac_framework.h> 81 82#define print_ip(x, a, y) printf("%s %d.%d.%d.%d%s",\ 83 x, (ntohl(a.s_addr)>>24)&0xFF,\ 84 (ntohl(a.s_addr)>>16)&0xFF,\ 85 (ntohl(a.s_addr)>>8)&0xFF,\ 86 (ntohl(a.s_addr))&0xFF, y); 87 88#ifdef VIMAGE_GLOBALS 89u_short ip_id; 90#endif 91 92#ifdef MBUF_STRESS_TEST 93int mbuf_frag_size = 0; 94SYSCTL_INT(_net_inet_ip, OID_AUTO, mbuf_frag_size, CTLFLAG_RW, 95 &mbuf_frag_size, 0, "Fragment outgoing mbufs to this size"); 96#endif 97 98static void ip_mloopback 99 (struct ifnet *, struct mbuf *, struct sockaddr_in *, int); 100 101 102extern struct protosw inetsw[]; 103 104/* 105 * IP output. The packet in mbuf chain m contains a skeletal IP 106 * header (with len, off, ttl, proto, tos, src, dst). 
107 * The mbuf chain containing the packet will be freed. 108 * The mbuf opt, if present, will not be freed. 109 * In the IP forwarding case, the packet will arrive with options already 110 * inserted, so must have a NULL opt pointer. 111 */ 112int 113ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags, 114 struct ip_moptions *imo, struct inpcb *inp) 115{ 116 INIT_VNET_NET(curvnet); 117 INIT_VNET_INET(curvnet); 118 struct ip *ip; 119 struct ifnet *ifp = NULL; /* keep compiler happy */ 120 struct mbuf *m0; 121 int hlen = sizeof (struct ip); 122 int mtu; 123 int len, error = 0; 124 struct sockaddr_in *dst = NULL; /* keep compiler happy */ 125 struct in_ifaddr *ia = NULL; 126 int isbroadcast, sw_csum; 127 struct route iproute; 128 struct in_addr odst; 129#ifdef IPFIREWALL_FORWARD 130 struct m_tag *fwd_tag = NULL; 131#endif 132 M_ASSERTPKTHDR(m); 133 134 if (ro == NULL) { 135 ro = &iproute; 136 bzero(ro, sizeof (*ro)); 137 } 138 139 if (inp != NULL) { 140 M_SETFIB(m, inp->inp_inc.inc_fibnum); 141 INP_LOCK_ASSERT(inp); 142 } 143 144 if (opt) { 145 len = 0; 146 m = ip_insertoptions(m, opt, &len); 147 if (len != 0) 148 hlen = len; 149 } 150 ip = mtod(m, struct ip *); 151 152 /* 153 * Fill in IP header. If we are not allowing fragmentation, 154 * then the ip_id field is meaningless, but we don't set it 155 * to zero. Doing so causes various problems when devices along 156 * the path (routers, load balancers, firewalls, etc.) illegally 157 * disable DF on our packet. Note that a 16-bit counter 158 * will wrap around in less than 10 seconds at 100 Mbit/s on a 159 * medium with MTU 1500. See Steven M. Bellovin, "A Technique 160 * for Counting NATted Hosts", Proc. IMW'02, available at 161 * <http://www.cs.columbia.edu/~smb/papers/fnat.pdf>. 
162 */ 163 if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) { 164 ip->ip_v = IPVERSION; 165 ip->ip_hl = hlen >> 2; 166 ip->ip_id = ip_newid(); 167 V_ipstat.ips_localout++; 168 } else { 169 hlen = ip->ip_hl << 2; 170 } 171 172 dst = (struct sockaddr_in *)&ro->ro_dst; 173again: 174 /* 175 * If there is a cached route, 176 * check that it is to the same destination 177 * and is still up. If not, free it and try again. 178 * The address family should also be checked in case of sharing the 179 * cache with IPv6. 180 */ 181 if (ro->ro_rt && ((ro->ro_rt->rt_flags & RTF_UP) == 0 || 182 dst->sin_family != AF_INET || 183 dst->sin_addr.s_addr != ip->ip_dst.s_addr)) { 184 RTFREE(ro->ro_rt); 185 ro->ro_rt = (struct rtentry *)NULL; 186 } 187#ifdef IPFIREWALL_FORWARD 188 if (ro->ro_rt == NULL && fwd_tag == NULL) { 189#else 190 if (ro->ro_rt == NULL) { 191#endif 192 bzero(dst, sizeof(*dst)); 193 dst->sin_family = AF_INET; 194 dst->sin_len = sizeof(*dst); 195 dst->sin_addr = ip->ip_dst; 196 } 197 /* 198 * If routing to interface only, short circuit routing lookup. 199 * The use of an all-ones broadcast address implies this; an 200 * interface is specified by the broadcast address of an interface, 201 * or the destination address of a ptp interface. 
202 */ 203 if (flags & IP_SENDONES) { 204 if ((ia = ifatoia(ifa_ifwithbroadaddr(sintosa(dst)))) == NULL && 205 (ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == NULL) { 206 V_ipstat.ips_noroute++; 207 error = ENETUNREACH; 208 goto bad; 209 } 210 ip->ip_dst.s_addr = INADDR_BROADCAST; 211 dst->sin_addr = ip->ip_dst; 212 ifp = ia->ia_ifp; 213 ip->ip_ttl = 1; 214 isbroadcast = 1; 215 } else if (flags & IP_ROUTETOIF) { 216 if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == NULL && 217 (ia = ifatoia(ifa_ifwithnet(sintosa(dst)))) == NULL) { 218 V_ipstat.ips_noroute++; 219 error = ENETUNREACH; 220 goto bad; 221 } 222 ifp = ia->ia_ifp; 223 ip->ip_ttl = 1; 224 isbroadcast = in_broadcast(dst->sin_addr, ifp); 225 } else if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) && 226 imo != NULL && imo->imo_multicast_ifp != NULL) { 227 /* 228 * Bypass the normal routing lookup for multicast 229 * packets if the interface is specified. 230 */ 231 ifp = imo->imo_multicast_ifp; 232 IFP_TO_IA(ifp, ia); 233 isbroadcast = 0; /* fool gcc */ 234 } else { 235 /* 236 * We want to do any cloning requested by the link layer, 237 * as this is probably required in all cases for correct 238 * operation (as it is for ARP). 239 */ 240 if (ro->ro_rt == NULL) 241#ifdef RADIX_MPATH 242 rtalloc_mpath_fib(ro, 243 ntohl(ip->ip_src.s_addr ^ ip->ip_dst.s_addr), 244 inp ? inp->inp_inc.inc_fibnum : M_GETFIB(m)); 245#else 246 in_rtalloc_ign(ro, 0, 247 inp ? inp->inp_inc.inc_fibnum : M_GETFIB(m)); 248#endif 249 if (ro->ro_rt == NULL) { 250 V_ipstat.ips_noroute++; 251 error = EHOSTUNREACH; 252 goto bad; 253 } 254 ia = ifatoia(ro->ro_rt->rt_ifa); 255 ifp = ro->ro_rt->rt_ifp; 256 ro->ro_rt->rt_rmx.rmx_pksent++; 257 if (ro->ro_rt->rt_flags & RTF_GATEWAY) 258 dst = (struct sockaddr_in *)ro->ro_rt->rt_gateway; 259 if (ro->ro_rt->rt_flags & RTF_HOST) 260 isbroadcast = (ro->ro_rt->rt_flags & RTF_BROADCAST); 261 else 262 isbroadcast = in_broadcast(dst->sin_addr, ifp); 263 } 264 /* 265 * Calculate MTU. 
If we have a route that is up, use that, 266 * otherwise use the interface's MTU. 267 */ 268 if (ro->ro_rt != NULL && (ro->ro_rt->rt_flags & (RTF_UP|RTF_HOST))) { 269 /* 270 * This case can happen if the user changed the MTU 271 * of an interface after enabling IP on it. Because 272 * most netifs don't keep track of routes pointing to 273 * them, there is no way for one to update all its 274 * routes when the MTU is changed. 275 */ 276 if (ro->ro_rt->rt_rmx.rmx_mtu > ifp->if_mtu) 277 ro->ro_rt->rt_rmx.rmx_mtu = ifp->if_mtu; 278 mtu = ro->ro_rt->rt_rmx.rmx_mtu; 279 } else { 280 mtu = ifp->if_mtu; 281 } 282 if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { 283 struct in_multi *inm; 284 285 m->m_flags |= M_MCAST; 286 /* 287 * IP destination address is multicast. Make sure "dst" 288 * still points to the address in "ro". (It may have been 289 * changed to point to a gateway address, above.) 290 */ 291 dst = (struct sockaddr_in *)&ro->ro_dst; 292 /* 293 * See if the caller provided any multicast options 294 */ 295 if (imo != NULL) { 296 ip->ip_ttl = imo->imo_multicast_ttl; 297 if (imo->imo_multicast_vif != -1) 298 ip->ip_src.s_addr = 299 ip_mcast_src ? 300 ip_mcast_src(imo->imo_multicast_vif) : 301 INADDR_ANY; 302 } else 303 ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL; 304 /* 305 * Confirm that the outgoing interface supports multicast. 306 */ 307 if ((imo == NULL) || (imo->imo_multicast_vif == -1)) { 308 if ((ifp->if_flags & IFF_MULTICAST) == 0) { 309 V_ipstat.ips_noroute++; 310 error = ENETUNREACH; 311 goto bad; 312 } 313 } 314 /* 315 * If source address not specified yet, use address 316 * of outgoing interface. 317 */ 318 if (ip->ip_src.s_addr == INADDR_ANY) { 319 /* Interface may have no addresses. 
*/ 320 if (ia != NULL) 321 ip->ip_src = IA_SIN(ia)->sin_addr; 322 } 323 324 IN_MULTI_LOCK(); 325 IN_LOOKUP_MULTI(ip->ip_dst, ifp, inm); 326 if (inm != NULL && 327 (imo == NULL || imo->imo_multicast_loop)) { 328 IN_MULTI_UNLOCK(); 329 /* 330 * If we belong to the destination multicast group 331 * on the outgoing interface, and the caller did not 332 * forbid loopback, loop back a copy. 333 */ 334 ip_mloopback(ifp, m, dst, hlen); 335 } 336 else { 337 IN_MULTI_UNLOCK(); 338 /* 339 * If we are acting as a multicast router, perform 340 * multicast forwarding as if the packet had just 341 * arrived on the interface to which we are about 342 * to send. The multicast forwarding function 343 * recursively calls this function, using the 344 * IP_FORWARDING flag to prevent infinite recursion. 345 * 346 * Multicasts that are looped back by ip_mloopback(), 347 * above, will be forwarded by the ip_input() routine, 348 * if necessary. 349 */ 350 if (V_ip_mrouter && (flags & IP_FORWARDING) == 0) { 351 /* 352 * If rsvp daemon is not running, do not 353 * set ip_moptions. This ensures that the packet 354 * is multicast and not just sent down one link 355 * as prescribed by rsvpd. 356 */ 357 if (!V_rsvp_on) 358 imo = NULL; 359 if (ip_mforward && 360 ip_mforward(ip, ifp, m, imo) != 0) { 361 m_freem(m); 362 goto done; 363 } 364 } 365 } 366 367 /* 368 * Multicasts with a time-to-live of zero may be looped- 369 * back, above, but must not be transmitted on a network. 370 * Also, multicasts addressed to the loopback interface 371 * are not sent -- the above call to ip_mloopback() will 372 * loop back a copy if this host actually belongs to the 373 * destination group on the loopback interface. 374 */ 375 if (ip->ip_ttl == 0 || ifp->if_flags & IFF_LOOPBACK) { 376 m_freem(m); 377 goto done; 378 } 379 380 goto sendit; 381 } 382 383 /* 384 * If the source address is not specified yet, use the address 385 * of the outoing interface. 
386 */ 387 if (ip->ip_src.s_addr == INADDR_ANY) { 388 /* Interface may have no addresses. */ 389 if (ia != NULL) { 390 ip->ip_src = IA_SIN(ia)->sin_addr; 391 } 392 } 393 394 /* 395 * Verify that we have any chance at all of being able to queue the 396 * packet or packet fragments, unless ALTQ is enabled on the given 397 * interface in which case packetdrop should be done by queueing. 398 */ 399#ifdef ALTQ 400 if ((!ALTQ_IS_ENABLED(&ifp->if_snd)) && 401 ((ifp->if_snd.ifq_len + ip->ip_len / mtu + 1) >= 402 ifp->if_snd.ifq_maxlen)) 403#else 404 if ((ifp->if_snd.ifq_len + ip->ip_len / mtu + 1) >= 405 ifp->if_snd.ifq_maxlen) 406#endif /* ALTQ */ 407 { 408 error = ENOBUFS; 409 V_ipstat.ips_odropped++; 410 ifp->if_snd.ifq_drops += (ip->ip_len / ifp->if_mtu + 1); 411 goto bad; 412 } 413 414 /* 415 * Look for broadcast address and 416 * verify user is allowed to send 417 * such a packet. 418 */ 419 if (isbroadcast) { 420 if ((ifp->if_flags & IFF_BROADCAST) == 0) { 421 error = EADDRNOTAVAIL; 422 goto bad; 423 } 424 if ((flags & IP_ALLOWBROADCAST) == 0) { 425 error = EACCES; 426 goto bad; 427 } 428 /* don't allow broadcast messages to be fragmented */ 429 if (ip->ip_len > mtu) { 430 error = EMSGSIZE; 431 goto bad; 432 } 433 m->m_flags |= M_BCAST; 434 } else { 435 m->m_flags &= ~M_BCAST; 436 } 437 438sendit: 439#ifdef IPSEC 440 switch(ip_ipsec_output(&m, inp, &flags, &error, &ro, &iproute, &dst, &ia, &ifp)) { 441 case 1: 442 goto bad; 443 case -1: 444 goto done; 445 case 0: 446 default: 447 break; /* Continue with packet processing. */ 448 } 449 /* Update variables that are affected by ipsec4_output(). */ 450 ip = mtod(m, struct ip *); 451 hlen = ip->ip_hl << 2; 452#endif /* IPSEC */ 453 454 /* Jump over all PFIL processing if hooks are not active. */ 455 if (!PFIL_HOOKED(&inet_pfil_hook)) 456 goto passout; 457 458 /* Run through list of hooks for output packets. 
*/ 459 odst.s_addr = ip->ip_dst.s_addr; 460 error = pfil_run_hooks(&inet_pfil_hook, &m, ifp, PFIL_OUT, inp); 461 if (error != 0 || m == NULL) 462 goto done; 463 464 ip = mtod(m, struct ip *); 465 466 /* See if destination IP address was changed by packet filter. */ 467 if (odst.s_addr != ip->ip_dst.s_addr) { 468 m->m_flags |= M_SKIP_FIREWALL; 469 /* If destination is now ourself drop to ip_input(). */ 470 if (in_localip(ip->ip_dst)) { 471 m->m_flags |= M_FASTFWD_OURS; 472 if (m->m_pkthdr.rcvif == NULL) 473 m->m_pkthdr.rcvif = V_loif; 474 if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { 475 m->m_pkthdr.csum_flags |= 476 CSUM_DATA_VALID | CSUM_PSEUDO_HDR; 477 m->m_pkthdr.csum_data = 0xffff; 478 } 479 m->m_pkthdr.csum_flags |= 480 CSUM_IP_CHECKED | CSUM_IP_VALID; 481 482 error = netisr_queue(NETISR_IP, m); 483 goto done; 484 } else 485 goto again; /* Redo the routing table lookup. */ 486 } 487 488#ifdef IPFIREWALL_FORWARD 489 /* See if local, if yes, send it to netisr with IP_FASTFWD_OURS. */ 490 if (m->m_flags & M_FASTFWD_OURS) { 491 if (m->m_pkthdr.rcvif == NULL) 492 m->m_pkthdr.rcvif = V_loif; 493 if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { 494 m->m_pkthdr.csum_flags |= 495 CSUM_DATA_VALID | CSUM_PSEUDO_HDR; 496 m->m_pkthdr.csum_data = 0xffff; 497 } 498 m->m_pkthdr.csum_flags |= 499 CSUM_IP_CHECKED | CSUM_IP_VALID; 500 501 error = netisr_queue(NETISR_IP, m); 502 goto done; 503 } 504 /* Or forward to some other address? */ 505 fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL); 506 if (fwd_tag) { 507 dst = (struct sockaddr_in *)&ro->ro_dst; 508 bcopy((fwd_tag+1), dst, sizeof(struct sockaddr_in)); 509 m->m_flags |= M_SKIP_FIREWALL; 510 m_tag_delete(m, fwd_tag); 511 goto again; 512 } 513#endif /* IPFIREWALL_FORWARD */ 514 515passout: 516 /* 127/8 must not appear on wire - RFC1122. 
*/ 517 if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET || 518 (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) { 519 if ((ifp->if_flags & IFF_LOOPBACK) == 0) { 520 V_ipstat.ips_badaddr++; 521 error = EADDRNOTAVAIL; 522 goto bad; 523 } 524 } 525 526 m->m_pkthdr.csum_flags |= CSUM_IP; 527 sw_csum = m->m_pkthdr.csum_flags & ~ifp->if_hwassist; 528 if (sw_csum & CSUM_DELAY_DATA) { 529 in_delayed_cksum(m); 530 sw_csum &= ~CSUM_DELAY_DATA; 531 } 532 m->m_pkthdr.csum_flags &= ifp->if_hwassist; 533 534 /* 535 * If small enough for interface, or the interface will take 536 * care of the fragmentation for us, we can just send directly. 537 */ 538 if (ip->ip_len <= mtu || 539 (m->m_pkthdr.csum_flags & ifp->if_hwassist & CSUM_TSO) != 0 || 540 ((ip->ip_off & IP_DF) == 0 && (ifp->if_hwassist & CSUM_FRAGMENT))) { 541 ip->ip_len = htons(ip->ip_len); 542 ip->ip_off = htons(ip->ip_off); 543 ip->ip_sum = 0; 544 if (sw_csum & CSUM_DELAY_IP) 545 ip->ip_sum = in_cksum(m, hlen); 546 547 /* 548 * Record statistics for this interface address. 549 * With CSUM_TSO the byte/packet count will be slightly 550 * incorrect because we count the IP+TCP headers only 551 * once instead of for every generated packet. 552 */ 553 if (!(flags & IP_FORWARDING) && ia) { 554 if (m->m_pkthdr.csum_flags & CSUM_TSO) 555 ia->ia_ifa.if_opackets += 556 m->m_pkthdr.len / m->m_pkthdr.tso_segsz; 557 else 558 ia->ia_ifa.if_opackets++; 559 ia->ia_ifa.if_obytes += m->m_pkthdr.len; 560 } 561#ifdef MBUF_STRESS_TEST 562 if (mbuf_frag_size && m->m_pkthdr.len > mbuf_frag_size) 563 m = m_fragment(m, M_DONTWAIT, mbuf_frag_size); 564#endif 565 /* 566 * Reset layer specific mbuf flags 567 * to avoid confusing lower layers. 568 */ 569 m->m_flags &= ~(M_PROTOFLAGS); 570 571 error = (*ifp->if_output)(ifp, m, 572 (struct sockaddr *)dst, ro->ro_rt); 573 goto done; 574 } 575 576 /* Balk when DF bit is set or the interface didn't support TSO. 
*/ 577 if ((ip->ip_off & IP_DF) || (m->m_pkthdr.csum_flags & CSUM_TSO)) { 578 error = EMSGSIZE; 579 V_ipstat.ips_cantfrag++; 580 goto bad; 581 } 582 583 /* 584 * Too large for interface; fragment if possible. If successful, 585 * on return, m will point to a list of packets to be sent. 586 */ 587 error = ip_fragment(ip, &m, mtu, ifp->if_hwassist, sw_csum); 588 if (error) 589 goto bad; 590 for (; m; m = m0) { 591 m0 = m->m_nextpkt; 592 m->m_nextpkt = 0; 593 if (error == 0) { 594 /* Record statistics for this interface address. */ 595 if (ia != NULL) { 596 ia->ia_ifa.if_opackets++; 597 ia->ia_ifa.if_obytes += m->m_pkthdr.len; 598 } 599 /* 600 * Reset layer specific mbuf flags 601 * to avoid confusing upper layers. 602 */ 603 m->m_flags &= ~(M_PROTOFLAGS); 604 605 error = (*ifp->if_output)(ifp, m, 606 (struct sockaddr *)dst, ro->ro_rt); 607 } else 608 m_freem(m); 609 } 610 611 if (error == 0) 612 V_ipstat.ips_fragmented++; 613 614done: 615 if (ro == &iproute && ro->ro_rt) { 616 RTFREE(ro->ro_rt); 617 } 618 return (error); 619bad: 620 m_freem(m); 621 goto done; 622} 623 624/* 625 * Create a chain of fragments which fit the given mtu. m_frag points to the 626 * mbuf to be fragmented; on return it points to the chain with the fragments. 627 * Return 0 if no error. If error, m_frag may contain a partially built 628 * chain of fragments that should be freed by the caller. 629 * 630 * if_hwassist_flags is the hw offload capabilities (see if_data.ifi_hwassist) 631 * sw_csum contains the delayed checksums flags (e.g., CSUM_DELAY_IP). 
632 */ 633int 634ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu, 635 u_long if_hwassist_flags, int sw_csum) 636{ 637 INIT_VNET_INET(curvnet); 638 int error = 0; 639 int hlen = ip->ip_hl << 2; 640 int len = (mtu - hlen) & ~7; /* size of payload in each fragment */ 641 int off; 642 struct mbuf *m0 = *m_frag; /* the original packet */ 643 int firstlen; 644 struct mbuf **mnext; 645 int nfrags; 646 647 if (ip->ip_off & IP_DF) { /* Fragmentation not allowed */ 648 V_ipstat.ips_cantfrag++; 649 return EMSGSIZE; 650 } 651 652 /* 653 * Must be able to put at least 8 bytes per fragment. 654 */ 655 if (len < 8) 656 return EMSGSIZE; 657 658 /* 659 * If the interface will not calculate checksums on 660 * fragmented packets, then do it here. 661 */ 662 if (m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA && 663 (if_hwassist_flags & CSUM_IP_FRAGS) == 0) { 664 in_delayed_cksum(m0); 665 m0->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; 666 } 667 668 if (len > PAGE_SIZE) { 669 /* 670 * Fragment large datagrams such that each segment 671 * contains a multiple of PAGE_SIZE amount of data, 672 * plus headers. This enables a receiver to perform 673 * page-flipping zero-copy optimizations. 674 * 675 * XXX When does this help given that sender and receiver 676 * could have different page sizes, and also mtu could 677 * be less than the receiver's page size ? 
678 */ 679 int newlen; 680 struct mbuf *m; 681 682 for (m = m0, off = 0; m && (off+m->m_len) <= mtu; m = m->m_next) 683 off += m->m_len; 684 685 /* 686 * firstlen (off - hlen) must be aligned on an 687 * 8-byte boundary 688 */ 689 if (off < hlen) 690 goto smart_frag_failure; 691 off = ((off - hlen) & ~7) + hlen; 692 newlen = (~PAGE_MASK) & mtu; 693 if ((newlen + sizeof (struct ip)) > mtu) { 694 /* we failed, go back the default */ 695smart_frag_failure: 696 newlen = len; 697 off = hlen + len; 698 } 699 len = newlen; 700 701 } else { 702 off = hlen + len; 703 } 704 705 firstlen = off - hlen; 706 mnext = &m0->m_nextpkt; /* pointer to next packet */ 707 708 /* 709 * Loop through length of segment after first fragment, 710 * make new header and copy data of each part and link onto chain. 711 * Here, m0 is the original packet, m is the fragment being created. 712 * The fragments are linked off the m_nextpkt of the original 713 * packet, which after processing serves as the first fragment. 714 */ 715 for (nfrags = 1; off < ip->ip_len; off += len, nfrags++) { 716 struct ip *mhip; /* ip header on the fragment */ 717 struct mbuf *m; 718 int mhlen = sizeof (struct ip); 719 720 MGETHDR(m, M_DONTWAIT, MT_DATA); 721 if (m == NULL) { 722 error = ENOBUFS; 723 V_ipstat.ips_odropped++; 724 goto done; 725 } 726 m->m_flags |= (m0->m_flags & M_MCAST) | M_FRAG; 727 /* 728 * In the first mbuf, leave room for the link header, then 729 * copy the original IP header including options. The payload 730 * goes into an additional mbuf chain returned by m_copy(). 731 */ 732 m->m_data += max_linkhdr; 733 mhip = mtod(m, struct ip *); 734 *mhip = *ip; 735 if (hlen > sizeof (struct ip)) { 736 mhlen = ip_optcopy(ip, mhip) + sizeof (struct ip); 737 mhip->ip_v = IPVERSION; 738 mhip->ip_hl = mhlen >> 2; 739 } 740 m->m_len = mhlen; 741 /* XXX do we need to add ip->ip_off below ? 
*/ 742 mhip->ip_off = ((off - hlen) >> 3) + ip->ip_off; 743 if (off + len >= ip->ip_len) { /* last fragment */ 744 len = ip->ip_len - off; 745 m->m_flags |= M_LASTFRAG; 746 } else 747 mhip->ip_off |= IP_MF; 748 mhip->ip_len = htons((u_short)(len + mhlen)); 749 m->m_next = m_copy(m0, off, len); 750 if (m->m_next == NULL) { /* copy failed */ 751 m_free(m); 752 error = ENOBUFS; /* ??? */ 753 V_ipstat.ips_odropped++; 754 goto done; 755 } 756 m->m_pkthdr.len = mhlen + len; 757 m->m_pkthdr.rcvif = NULL; 758#ifdef MAC 759 mac_netinet_fragment(m0, m); 760#endif 761 m->m_pkthdr.csum_flags = m0->m_pkthdr.csum_flags; 762 mhip->ip_off = htons(mhip->ip_off); 763 mhip->ip_sum = 0; 764 if (sw_csum & CSUM_DELAY_IP) 765 mhip->ip_sum = in_cksum(m, mhlen); 766 *mnext = m; 767 mnext = &m->m_nextpkt; 768 } 769 V_ipstat.ips_ofragments += nfrags; 770 771 /* set first marker for fragment chain */ 772 m0->m_flags |= M_FIRSTFRAG | M_FRAG; 773 m0->m_pkthdr.csum_data = nfrags; 774 775 /* 776 * Update first fragment by trimming what's been copied out 777 * and updating header. 
778 */ 779 m_adj(m0, hlen + firstlen - ip->ip_len); 780 m0->m_pkthdr.len = hlen + firstlen; 781 ip->ip_len = htons((u_short)m0->m_pkthdr.len); 782 ip->ip_off |= IP_MF; 783 ip->ip_off = htons(ip->ip_off); 784 ip->ip_sum = 0; 785 if (sw_csum & CSUM_DELAY_IP) 786 ip->ip_sum = in_cksum(m0, hlen); 787 788done: 789 *m_frag = m0; 790 return error; 791} 792 793void 794in_delayed_cksum(struct mbuf *m) 795{ 796 struct ip *ip; 797 u_short csum, offset; 798 799 ip = mtod(m, struct ip *); 800 offset = ip->ip_hl << 2 ; 801 csum = in_cksum_skip(m, ip->ip_len, offset); 802 if (m->m_pkthdr.csum_flags & CSUM_UDP && csum == 0) 803 csum = 0xffff; 804 offset += m->m_pkthdr.csum_data; /* checksum offset */ 805 806 if (offset + sizeof(u_short) > m->m_len) { 807 printf("delayed m_pullup, m->len: %d off: %d p: %d\n", 808 m->m_len, offset, ip->ip_p); 809 /* 810 * XXX 811 * this shouldn't happen, but if it does, the 812 * correct behavior may be to insert the checksum 813 * in the appropriate next mbuf in the chain. 814 */ 815 return; 816 } 817 *(u_short *)(m->m_data + offset) = csum; 818} 819 820/* 821 * IP socket option processing. 822 */ 823int 824ip_ctloutput(struct socket *so, struct sockopt *sopt) 825{ 826 struct inpcb *inp = sotoinpcb(so); 827 int error, optval; 828 829 error = optval = 0; 830 if (sopt->sopt_level != IPPROTO_IP) { 831 if ((sopt->sopt_level == SOL_SOCKET) && 832 (sopt->sopt_name == SO_SETFIB)) { 833 inp->inp_inc.inc_fibnum = so->so_fibnum; 834 return (0); 835 } 836 return (EINVAL); 837 } 838 839 switch (sopt->sopt_dir) { 840 case SOPT_SET: 841 switch (sopt->sopt_name) { 842 case IP_OPTIONS: 843#ifdef notyet 844 case IP_RETOPTS: 845#endif 846 { 847 struct mbuf *m; 848 if (sopt->sopt_valsize > MLEN) { 849 error = EMSGSIZE; 850 break; 851 } 852 MGET(m, sopt->sopt_td ? 
M_WAIT : M_DONTWAIT, MT_DATA); 853 if (m == NULL) { 854 error = ENOBUFS; 855 break; 856 } 857 m->m_len = sopt->sopt_valsize; 858 error = sooptcopyin(sopt, mtod(m, char *), m->m_len, 859 m->m_len); 860 if (error) { 861 m_free(m); 862 break; 863 } 864 INP_WLOCK(inp); 865 error = ip_pcbopts(inp, sopt->sopt_name, m); 866 INP_WUNLOCK(inp); 867 return (error); 868 } 869 870 case IP_TOS: 871 case IP_TTL: 872 case IP_MINTTL: 873 case IP_RECVOPTS: 874 case IP_RECVRETOPTS: 875 case IP_RECVDSTADDR: 876 case IP_RECVTTL: 877 case IP_RECVIF: 878 case IP_FAITH: 879 case IP_ONESBCAST: 880 case IP_DONTFRAG: 881 error = sooptcopyin(sopt, &optval, sizeof optval, 882 sizeof optval); 883 if (error) 884 break; 885 886 switch (sopt->sopt_name) { 887 case IP_TOS: 888 inp->inp_ip_tos = optval; 889 break; 890 891 case IP_TTL: 892 inp->inp_ip_ttl = optval; 893 break; 894 895 case IP_MINTTL: 896 if (optval > 0 && optval <= MAXTTL) 897 inp->inp_ip_minttl = optval; 898 else 899 error = EINVAL; 900 break; 901 902#define OPTSET(bit) do { \ 903 INP_WLOCK(inp); \ 904 if (optval) \ 905 inp->inp_flags |= bit; \ 906 else \ 907 inp->inp_flags &= ~bit; \ 908 INP_WUNLOCK(inp); \ 909} while (0) 910 911 case IP_RECVOPTS: 912 OPTSET(INP_RECVOPTS); 913 break; 914 915 case IP_RECVRETOPTS: 916 OPTSET(INP_RECVRETOPTS); 917 break; 918 919 case IP_RECVDSTADDR: 920 OPTSET(INP_RECVDSTADDR); 921 break; 922 923 case IP_RECVTTL: 924 OPTSET(INP_RECVTTL); 925 break; 926 927 case IP_RECVIF: 928 OPTSET(INP_RECVIF); 929 break; 930 931 case IP_FAITH: 932 OPTSET(INP_FAITH); 933 break; 934 935 case IP_ONESBCAST: 936 OPTSET(INP_ONESBCAST); 937 break; 938 case IP_DONTFRAG: 939 OPTSET(INP_DONTFRAG); 940 break; 941 } 942 break; 943#undef OPTSET 944 945 /* 946 * Multicast socket options are processed by the in_mcast 947 * module. 
948 */ 949 case IP_MULTICAST_IF: 950 case IP_MULTICAST_VIF: 951 case IP_MULTICAST_TTL: 952 case IP_MULTICAST_LOOP: 953 case IP_ADD_MEMBERSHIP: 954 case IP_DROP_MEMBERSHIP: 955 case IP_ADD_SOURCE_MEMBERSHIP: 956 case IP_DROP_SOURCE_MEMBERSHIP: 957 case IP_BLOCK_SOURCE: 958 case IP_UNBLOCK_SOURCE: 959 case IP_MSFILTER: 960 case MCAST_JOIN_GROUP: 961 case MCAST_LEAVE_GROUP: 962 case MCAST_JOIN_SOURCE_GROUP: 963 case MCAST_LEAVE_SOURCE_GROUP: 964 case MCAST_BLOCK_SOURCE: 965 case MCAST_UNBLOCK_SOURCE: 966 error = inp_setmoptions(inp, sopt); 967 break; 968 969 case IP_PORTRANGE: 970 error = sooptcopyin(sopt, &optval, sizeof optval, 971 sizeof optval); 972 if (error) 973 break; 974 975 INP_WLOCK(inp); 976 switch (optval) { 977 case IP_PORTRANGE_DEFAULT: 978 inp->inp_flags &= ~(INP_LOWPORT); 979 inp->inp_flags &= ~(INP_HIGHPORT); 980 break; 981 982 case IP_PORTRANGE_HIGH: 983 inp->inp_flags &= ~(INP_LOWPORT); 984 inp->inp_flags |= INP_HIGHPORT; 985 break; 986 987 case IP_PORTRANGE_LOW: 988 inp->inp_flags &= ~(INP_HIGHPORT); 989 inp->inp_flags |= INP_LOWPORT; 990 break; 991 992 default: 993 error = EINVAL; 994 break; 995 } 996 INP_WUNLOCK(inp); 997 break; 998 999#ifdef IPSEC 1000 case IP_IPSEC_POLICY: 1001 { 1002 caddr_t req; 1003 struct mbuf *m; 1004 1005 if ((error = soopt_getm(sopt, &m)) != 0) /* XXX */ 1006 break; 1007 if ((error = soopt_mcopyin(sopt, m)) != 0) /* XXX */ 1008 break; 1009 req = mtod(m, caddr_t); 1010 error = ipsec4_set_policy(inp, sopt->sopt_name, req, 1011 m->m_len, (sopt->sopt_td != NULL) ? 
1012 sopt->sopt_td->td_ucred : NULL); 1013 m_freem(m); 1014 break; 1015 } 1016#endif /* IPSEC */ 1017 1018 default: 1019 error = ENOPROTOOPT; 1020 break; 1021 } 1022 break; 1023 1024 case SOPT_GET: 1025 switch (sopt->sopt_name) { 1026 case IP_OPTIONS: 1027 case IP_RETOPTS: 1028 if (inp->inp_options) 1029 error = sooptcopyout(sopt, 1030 mtod(inp->inp_options, 1031 char *), 1032 inp->inp_options->m_len); 1033 else 1034 sopt->sopt_valsize = 0; 1035 break; 1036 1037 case IP_TOS: 1038 case IP_TTL: 1039 case IP_MINTTL: 1040 case IP_RECVOPTS: 1041 case IP_RECVRETOPTS: 1042 case IP_RECVDSTADDR: 1043 case IP_RECVTTL: 1044 case IP_RECVIF: 1045 case IP_PORTRANGE: 1046 case IP_FAITH: 1047 case IP_ONESBCAST: 1048 case IP_DONTFRAG: 1049 switch (sopt->sopt_name) { 1050 1051 case IP_TOS: 1052 optval = inp->inp_ip_tos; 1053 break; 1054 1055 case IP_TTL: 1056 optval = inp->inp_ip_ttl; 1057 break; 1058 1059 case IP_MINTTL: 1060 optval = inp->inp_ip_minttl; 1061 break; 1062 1063#define OPTBIT(bit) (inp->inp_flags & bit ? 
1 : 0) 1064 1065 case IP_RECVOPTS: 1066 optval = OPTBIT(INP_RECVOPTS); 1067 break; 1068 1069 case IP_RECVRETOPTS: 1070 optval = OPTBIT(INP_RECVRETOPTS); 1071 break; 1072 1073 case IP_RECVDSTADDR: 1074 optval = OPTBIT(INP_RECVDSTADDR); 1075 break; 1076 1077 case IP_RECVTTL: 1078 optval = OPTBIT(INP_RECVTTL); 1079 break; 1080 1081 case IP_RECVIF: 1082 optval = OPTBIT(INP_RECVIF); 1083 break; 1084 1085 case IP_PORTRANGE: 1086 if (inp->inp_flags & INP_HIGHPORT) 1087 optval = IP_PORTRANGE_HIGH; 1088 else if (inp->inp_flags & INP_LOWPORT) 1089 optval = IP_PORTRANGE_LOW; 1090 else 1091 optval = 0; 1092 break; 1093 1094 case IP_FAITH: 1095 optval = OPTBIT(INP_FAITH); 1096 break; 1097 1098 case IP_ONESBCAST: 1099 optval = OPTBIT(INP_ONESBCAST); 1100 break; 1101 case IP_DONTFRAG: 1102 optval = OPTBIT(INP_DONTFRAG); 1103 break; 1104 } 1105 error = sooptcopyout(sopt, &optval, sizeof optval); 1106 break; 1107 1108 /* 1109 * Multicast socket options are processed by the in_mcast 1110 * module. 1111 */ 1112 case IP_MULTICAST_IF: 1113 case IP_MULTICAST_VIF: 1114 case IP_MULTICAST_TTL: 1115 case IP_MULTICAST_LOOP: 1116 case IP_MSFILTER: 1117 error = inp_getmoptions(inp, sopt); 1118 break; 1119 1120#ifdef IPSEC 1121 case IP_IPSEC_POLICY: 1122 { 1123 struct mbuf *m = NULL; 1124 caddr_t req = NULL; 1125 size_t len = 0; 1126 1127 if (m != 0) { 1128 req = mtod(m, caddr_t); 1129 len = m->m_len; 1130 } 1131 error = ipsec4_get_policy(sotoinpcb(so), req, len, &m); 1132 if (error == 0) 1133 error = soopt_mcopyout(sopt, m); /* XXX */ 1134 if (error == 0) 1135 m_freem(m); 1136 break; 1137 } 1138#endif /* IPSEC */ 1139 1140 default: 1141 error = ENOPROTOOPT; 1142 break; 1143 } 1144 break; 1145 } 1146 return (error); 1147} 1148 1149/* 1150 * Routine called from ip_output() to loop back a copy of an IP multicast 1151 * packet to the input queue of a specified interface. 
Note that this 1152 * calls the output routine of the loopback "driver", but with an interface 1153 * pointer that might NOT be a loopback interface -- evil, but easier than 1154 * replicating that code here. 1155 */ 1156static void 1157ip_mloopback(struct ifnet *ifp, struct mbuf *m, struct sockaddr_in *dst, 1158 int hlen) 1159{ 1160 register struct ip *ip; 1161 struct mbuf *copym; 1162 1163 /* 1164 * Make a deep copy of the packet because we're going to 1165 * modify the pack in order to generate checksums. 1166 */ 1167 copym = m_dup(m, M_DONTWAIT); 1168 if (copym != NULL && (copym->m_flags & M_EXT || copym->m_len < hlen)) 1169 copym = m_pullup(copym, hlen); 1170 if (copym != NULL) { 1171 /* If needed, compute the checksum and mark it as valid. */ 1172 if (copym->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { 1173 in_delayed_cksum(copym); 1174 copym->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; 1175 copym->m_pkthdr.csum_flags |= 1176 CSUM_DATA_VALID | CSUM_PSEUDO_HDR; 1177 copym->m_pkthdr.csum_data = 0xffff; 1178 } 1179 /* 1180 * We don't bother to fragment if the IP length is greater 1181 * than the interface's MTU. Can this possibly matter? 1182 */ 1183 ip = mtod(copym, struct ip *); 1184 ip->ip_len = htons(ip->ip_len); 1185 ip->ip_off = htons(ip->ip_off); 1186 ip->ip_sum = 0; 1187 ip->ip_sum = in_cksum(copym, hlen); 1188#if 1 /* XXX */ 1189 if (dst->sin_family != AF_INET) { 1190 printf("ip_mloopback: bad address family %d\n", 1191 dst->sin_family); 1192 dst->sin_family = AF_INET; 1193 } 1194#endif 1195 if_simloop(ifp, copym, dst->sin_family, 0); 1196 } 1197} 1198