ip_output.c revision 178888
1/*- 2 * Copyright (c) 1982, 1986, 1988, 1990, 1993 3 * The Regents of the University of California. All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 4. Neither the name of the University nor the names of its contributors 14 * may be used to endorse or promote products derived from this software 15 * without specific prior written permission. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 
 *
 *	@(#)ip_output.c	8.3 (Berkeley) 1/21/94
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/netinet/ip_output.c 178888 2008-05-09 23:03:00Z julian $");

#include "opt_ipfw.h"
#include "opt_ipsec.h"
#include "opt_mac.h"
#include "opt_mbuf_stress_test.h"
#include "opt_mpath.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/ucred.h>

#include <net/if.h>
#include <net/netisr.h>
#include <net/pfil.h>
#include <net/route.h>
#ifdef RADIX_MPATH
#include <net/radix_mpath.h>
#endif

#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/in_var.h>
#include <netinet/ip_var.h>
#include <netinet/ip_options.h>

#ifdef IPSEC
#include <netinet/ip_ipsec.h>
#include <netipsec/ipsec.h>
#endif /* IPSEC*/

#include <machine/in_cksum.h>

#include <security/mac/mac_framework.h>

/*
 * Debugging aid: print string x, then the address a (a struct in_addr in
 * network byte order) in dotted-quad form, then string y.
 *
 * NOTE(review): the expansion already ends in a semicolon and the
 * arguments are not parenthesized, so this is only safe when used as a
 * complete statement with simple arguments.
 */
#define print_ip(x, a, y)	 printf("%s %d.%d.%d.%d%s",\
				x, (ntohl(a.s_addr)>>24)&0xFF,\
				  (ntohl(a.s_addr)>>16)&0xFF,\
				  (ntohl(a.s_addr)>>8)&0xFF,\
				  (ntohl(a.s_addr))&0xFF, y);

/*
 * NOTE(review): not referenced by the functions below, which obtain
 * packet IDs from ip_newid(); presumably kept for other consumers --
 * confirm before removing.
 */
u_short ip_id;

#ifdef MBUF_STRESS_TEST
/*
 * When non-zero, ip_output() splits any unfragmented outgoing packet
 * longer than this many bytes with m_fragment() before handing it to
 * the interface.
 */
int mbuf_frag_size = 0;
SYSCTL_INT(_net_inet_ip, OID_AUTO, mbuf_frag_size, CTLFLAG_RW,
	&mbuf_frag_size, 0, "Fragment outgoing mbufs to this size");
#endif

/* Loop a copy of an outgoing multicast packet back to the local host. */
static void	ip_mloopback
	(struct ifnet *, struct mbuf *, struct sockaddr_in *, int);


extern	struct protosw inetsw[];

/*
 * IP output.  The packet in mbuf chain m contains a skeletal IP
 * header (with len, off, ttl, proto, tos, src, dst).
 * The mbuf chain containing the packet will be freed.
 * The mbuf opt, if present, will not be freed.
104 * In the IP forwarding case, the packet will arrive with options already 105 * inserted, so must have a NULL opt pointer. 106 */ 107int 108ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags, 109 struct ip_moptions *imo, struct inpcb *inp) 110{ 111 struct ip *ip; 112 struct ifnet *ifp = NULL; /* keep compiler happy */ 113 struct mbuf *m0; 114 int hlen = sizeof (struct ip); 115 int mtu; 116 int len, error = 0; 117 struct sockaddr_in *dst = NULL; /* keep compiler happy */ 118 struct in_ifaddr *ia = NULL; 119 int isbroadcast, sw_csum; 120 struct route iproute; 121 struct in_addr odst; 122#ifdef IPFIREWALL_FORWARD 123 struct m_tag *fwd_tag = NULL; 124#endif 125 M_ASSERTPKTHDR(m); 126 127 if (ro == NULL) { 128 ro = &iproute; 129 bzero(ro, sizeof (*ro)); 130 } 131 132 if (inp != NULL) 133 INP_LOCK_ASSERT(inp); 134 135 if (opt) { 136 len = 0; 137 m = ip_insertoptions(m, opt, &len); 138 if (len != 0) 139 hlen = len; 140 } 141 ip = mtod(m, struct ip *); 142 143 /* 144 * Fill in IP header. If we are not allowing fragmentation, 145 * then the ip_id field is meaningless, but we don't set it 146 * to zero. Doing so causes various problems when devices along 147 * the path (routers, load balancers, firewalls, etc.) illegally 148 * disable DF on our packet. Note that a 16-bit counter 149 * will wrap around in less than 10 seconds at 100 Mbit/s on a 150 * medium with MTU 1500. See Steven M. Bellovin, "A Technique 151 * for Counting NATted Hosts", Proc. IMW'02, available at 152 * <http://www.cs.columbia.edu/~smb/papers/fnat.pdf>. 153 */ 154 if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) { 155 ip->ip_v = IPVERSION; 156 ip->ip_hl = hlen >> 2; 157 ip->ip_id = ip_newid(); 158 ipstat.ips_localout++; 159 } else { 160 hlen = ip->ip_hl << 2; 161 } 162 163 dst = (struct sockaddr_in *)&ro->ro_dst; 164again: 165 /* 166 * If there is a cached route, 167 * check that it is to the same destination 168 * and is still up. If not, free it and try again. 
169 * The address family should also be checked in case of sharing the 170 * cache with IPv6. 171 */ 172 if (ro->ro_rt && ((ro->ro_rt->rt_flags & RTF_UP) == 0 || 173 dst->sin_family != AF_INET || 174 dst->sin_addr.s_addr != ip->ip_dst.s_addr)) { 175 RTFREE(ro->ro_rt); 176 ro->ro_rt = (struct rtentry *)NULL; 177 } 178#ifdef IPFIREWALL_FORWARD 179 if (ro->ro_rt == NULL && fwd_tag == NULL) { 180#else 181 if (ro->ro_rt == NULL) { 182#endif 183 bzero(dst, sizeof(*dst)); 184 dst->sin_family = AF_INET; 185 dst->sin_len = sizeof(*dst); 186 dst->sin_addr = ip->ip_dst; 187 } 188 /* 189 * If routing to interface only, short circuit routing lookup. 190 * The use of an all-ones broadcast address implies this; an 191 * interface is specified by the broadcast address of an interface, 192 * or the destination address of a ptp interface. 193 */ 194 if (flags & IP_SENDONES) { 195 if ((ia = ifatoia(ifa_ifwithbroadaddr(sintosa(dst)))) == NULL && 196 (ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == NULL) { 197 ipstat.ips_noroute++; 198 error = ENETUNREACH; 199 goto bad; 200 } 201 ip->ip_dst.s_addr = INADDR_BROADCAST; 202 dst->sin_addr = ip->ip_dst; 203 ifp = ia->ia_ifp; 204 ip->ip_ttl = 1; 205 isbroadcast = 1; 206 } else if (flags & IP_ROUTETOIF) { 207 if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == NULL && 208 (ia = ifatoia(ifa_ifwithnet(sintosa(dst)))) == NULL) { 209 ipstat.ips_noroute++; 210 error = ENETUNREACH; 211 goto bad; 212 } 213 ifp = ia->ia_ifp; 214 ip->ip_ttl = 1; 215 isbroadcast = in_broadcast(dst->sin_addr, ifp); 216 } else if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) && 217 imo != NULL && imo->imo_multicast_ifp != NULL) { 218 /* 219 * Bypass the normal routing lookup for multicast 220 * packets if the interface is specified. 
221 */ 222 ifp = imo->imo_multicast_ifp; 223 IFP_TO_IA(ifp, ia); 224 isbroadcast = 0; /* fool gcc */ 225 } else { 226 /* 227 * We want to do any cloning requested by the link layer, 228 * as this is probably required in all cases for correct 229 * operation (as it is for ARP). 230 */ 231 if (ro->ro_rt == NULL) 232#ifdef RADIX_MPATH 233 rtalloc_mpath_fib(ro, 234 ntohl(ip->ip_src.s_addr ^ ip->ip_dst.s_addr), 235 inp ? inp->inp_inc.inc_fibnum : M_GETFIB(m)); 236#else 237 in_rtalloc_ign(ro, 0, 238 inp ? inp->inp_inc.inc_fibnum : M_GETFIB(m)); 239#endif 240 if (ro->ro_rt == NULL) { 241 ipstat.ips_noroute++; 242 error = EHOSTUNREACH; 243 goto bad; 244 } 245 ia = ifatoia(ro->ro_rt->rt_ifa); 246 ifp = ro->ro_rt->rt_ifp; 247 ro->ro_rt->rt_rmx.rmx_pksent++; 248 if (ro->ro_rt->rt_flags & RTF_GATEWAY) 249 dst = (struct sockaddr_in *)ro->ro_rt->rt_gateway; 250 if (ro->ro_rt->rt_flags & RTF_HOST) 251 isbroadcast = (ro->ro_rt->rt_flags & RTF_BROADCAST); 252 else 253 isbroadcast = in_broadcast(dst->sin_addr, ifp); 254 } 255 /* 256 * Calculate MTU. If we have a route that is up, use that, 257 * otherwise use the interface's MTU. 258 */ 259 if (ro->ro_rt != NULL && (ro->ro_rt->rt_flags & (RTF_UP|RTF_HOST))) { 260 /* 261 * This case can happen if the user changed the MTU 262 * of an interface after enabling IP on it. Because 263 * most netifs don't keep track of routes pointing to 264 * them, there is no way for one to update all its 265 * routes when the MTU is changed. 266 */ 267 if (ro->ro_rt->rt_rmx.rmx_mtu > ifp->if_mtu) 268 ro->ro_rt->rt_rmx.rmx_mtu = ifp->if_mtu; 269 mtu = ro->ro_rt->rt_rmx.rmx_mtu; 270 } else { 271 mtu = ifp->if_mtu; 272 } 273 if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { 274 struct in_multi *inm; 275 276 m->m_flags |= M_MCAST; 277 /* 278 * IP destination address is multicast. Make sure "dst" 279 * still points to the address in "ro". (It may have been 280 * changed to point to a gateway address, above.) 
281 */ 282 dst = (struct sockaddr_in *)&ro->ro_dst; 283 /* 284 * See if the caller provided any multicast options 285 */ 286 if (imo != NULL) { 287 ip->ip_ttl = imo->imo_multicast_ttl; 288 if (imo->imo_multicast_vif != -1) 289 ip->ip_src.s_addr = 290 ip_mcast_src ? 291 ip_mcast_src(imo->imo_multicast_vif) : 292 INADDR_ANY; 293 } else 294 ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL; 295 /* 296 * Confirm that the outgoing interface supports multicast. 297 */ 298 if ((imo == NULL) || (imo->imo_multicast_vif == -1)) { 299 if ((ifp->if_flags & IFF_MULTICAST) == 0) { 300 ipstat.ips_noroute++; 301 error = ENETUNREACH; 302 goto bad; 303 } 304 } 305 /* 306 * If source address not specified yet, use address 307 * of outgoing interface. 308 */ 309 if (ip->ip_src.s_addr == INADDR_ANY) { 310 /* Interface may have no addresses. */ 311 if (ia != NULL) 312 ip->ip_src = IA_SIN(ia)->sin_addr; 313 } 314 315 IN_MULTI_LOCK(); 316 IN_LOOKUP_MULTI(ip->ip_dst, ifp, inm); 317 if (inm != NULL && 318 (imo == NULL || imo->imo_multicast_loop)) { 319 IN_MULTI_UNLOCK(); 320 /* 321 * If we belong to the destination multicast group 322 * on the outgoing interface, and the caller did not 323 * forbid loopback, loop back a copy. 324 */ 325 ip_mloopback(ifp, m, dst, hlen); 326 } 327 else { 328 IN_MULTI_UNLOCK(); 329 /* 330 * If we are acting as a multicast router, perform 331 * multicast forwarding as if the packet had just 332 * arrived on the interface to which we are about 333 * to send. The multicast forwarding function 334 * recursively calls this function, using the 335 * IP_FORWARDING flag to prevent infinite recursion. 336 * 337 * Multicasts that are looped back by ip_mloopback(), 338 * above, will be forwarded by the ip_input() routine, 339 * if necessary. 340 */ 341 if (ip_mrouter && (flags & IP_FORWARDING) == 0) { 342 /* 343 * If rsvp daemon is not running, do not 344 * set ip_moptions. This ensures that the packet 345 * is multicast and not just sent down one link 346 * as prescribed by rsvpd. 
347 */ 348 if (!rsvp_on) 349 imo = NULL; 350 if (ip_mforward && 351 ip_mforward(ip, ifp, m, imo) != 0) { 352 m_freem(m); 353 goto done; 354 } 355 } 356 } 357 358 /* 359 * Multicasts with a time-to-live of zero may be looped- 360 * back, above, but must not be transmitted on a network. 361 * Also, multicasts addressed to the loopback interface 362 * are not sent -- the above call to ip_mloopback() will 363 * loop back a copy if this host actually belongs to the 364 * destination group on the loopback interface. 365 */ 366 if (ip->ip_ttl == 0 || ifp->if_flags & IFF_LOOPBACK) { 367 m_freem(m); 368 goto done; 369 } 370 371 goto sendit; 372 } 373 374 /* 375 * If the source address is not specified yet, use the address 376 * of the outoing interface. 377 */ 378 if (ip->ip_src.s_addr == INADDR_ANY) { 379 /* Interface may have no addresses. */ 380 if (ia != NULL) { 381 ip->ip_src = IA_SIN(ia)->sin_addr; 382 } 383 } 384 385 /* 386 * Verify that we have any chance at all of being able to queue the 387 * packet or packet fragments, unless ALTQ is enabled on the given 388 * interface in which case packetdrop should be done by queueing. 389 */ 390#ifdef ALTQ 391 if ((!ALTQ_IS_ENABLED(&ifp->if_snd)) && 392 ((ifp->if_snd.ifq_len + ip->ip_len / mtu + 1) >= 393 ifp->if_snd.ifq_maxlen)) 394#else 395 if ((ifp->if_snd.ifq_len + ip->ip_len / mtu + 1) >= 396 ifp->if_snd.ifq_maxlen) 397#endif /* ALTQ */ 398 { 399 error = ENOBUFS; 400 ipstat.ips_odropped++; 401 ifp->if_snd.ifq_drops += (ip->ip_len / ifp->if_mtu + 1); 402 goto bad; 403 } 404 405 /* 406 * Look for broadcast address and 407 * verify user is allowed to send 408 * such a packet. 
409 */ 410 if (isbroadcast) { 411 if ((ifp->if_flags & IFF_BROADCAST) == 0) { 412 error = EADDRNOTAVAIL; 413 goto bad; 414 } 415 if ((flags & IP_ALLOWBROADCAST) == 0) { 416 error = EACCES; 417 goto bad; 418 } 419 /* don't allow broadcast messages to be fragmented */ 420 if (ip->ip_len > mtu) { 421 error = EMSGSIZE; 422 goto bad; 423 } 424 m->m_flags |= M_BCAST; 425 } else { 426 m->m_flags &= ~M_BCAST; 427 } 428 429sendit: 430#ifdef IPSEC 431 switch(ip_ipsec_output(&m, inp, &flags, &error, &ro, &iproute, &dst, &ia, &ifp)) { 432 case 1: 433 goto bad; 434 case -1: 435 goto done; 436 case 0: 437 default: 438 break; /* Continue with packet processing. */ 439 } 440 /* Update variables that are affected by ipsec4_output(). */ 441 ip = mtod(m, struct ip *); 442 hlen = ip->ip_hl << 2; 443#endif /* IPSEC */ 444 445 /* Jump over all PFIL processing if hooks are not active. */ 446 if (!PFIL_HOOKED(&inet_pfil_hook)) 447 goto passout; 448 449 /* Run through list of hooks for output packets. */ 450 odst.s_addr = ip->ip_dst.s_addr; 451 error = pfil_run_hooks(&inet_pfil_hook, &m, ifp, PFIL_OUT, inp); 452 if (error != 0 || m == NULL) 453 goto done; 454 455 ip = mtod(m, struct ip *); 456 457 /* See if destination IP address was changed by packet filter. */ 458 if (odst.s_addr != ip->ip_dst.s_addr) { 459 m->m_flags |= M_SKIP_FIREWALL; 460 /* If destination is now ourself drop to ip_input(). */ 461 if (in_localip(ip->ip_dst)) { 462 m->m_flags |= M_FASTFWD_OURS; 463 if (m->m_pkthdr.rcvif == NULL) 464 m->m_pkthdr.rcvif = loif; 465 if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { 466 m->m_pkthdr.csum_flags |= 467 CSUM_DATA_VALID | CSUM_PSEUDO_HDR; 468 m->m_pkthdr.csum_data = 0xffff; 469 } 470 m->m_pkthdr.csum_flags |= 471 CSUM_IP_CHECKED | CSUM_IP_VALID; 472 473 error = netisr_queue(NETISR_IP, m); 474 goto done; 475 } else 476 goto again; /* Redo the routing table lookup. */ 477 } 478 479#ifdef IPFIREWALL_FORWARD 480 /* See if local, if yes, send it to netisr with IP_FASTFWD_OURS. 
*/ 481 if (m->m_flags & M_FASTFWD_OURS) { 482 if (m->m_pkthdr.rcvif == NULL) 483 m->m_pkthdr.rcvif = loif; 484 if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { 485 m->m_pkthdr.csum_flags |= 486 CSUM_DATA_VALID | CSUM_PSEUDO_HDR; 487 m->m_pkthdr.csum_data = 0xffff; 488 } 489 m->m_pkthdr.csum_flags |= 490 CSUM_IP_CHECKED | CSUM_IP_VALID; 491 492 error = netisr_queue(NETISR_IP, m); 493 goto done; 494 } 495 /* Or forward to some other address? */ 496 fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL); 497 if (fwd_tag) { 498 dst = (struct sockaddr_in *)&ro->ro_dst; 499 bcopy((fwd_tag+1), dst, sizeof(struct sockaddr_in)); 500 m->m_flags |= M_SKIP_FIREWALL; 501 m_tag_delete(m, fwd_tag); 502 goto again; 503 } 504#endif /* IPFIREWALL_FORWARD */ 505 506passout: 507 /* 127/8 must not appear on wire - RFC1122. */ 508 if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET || 509 (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) { 510 if ((ifp->if_flags & IFF_LOOPBACK) == 0) { 511 ipstat.ips_badaddr++; 512 error = EADDRNOTAVAIL; 513 goto bad; 514 } 515 } 516 517 m->m_pkthdr.csum_flags |= CSUM_IP; 518 sw_csum = m->m_pkthdr.csum_flags & ~ifp->if_hwassist; 519 if (sw_csum & CSUM_DELAY_DATA) { 520 in_delayed_cksum(m); 521 sw_csum &= ~CSUM_DELAY_DATA; 522 } 523 m->m_pkthdr.csum_flags &= ifp->if_hwassist; 524 525 /* 526 * If small enough for interface, or the interface will take 527 * care of the fragmentation for us, we can just send directly. 528 */ 529 if (ip->ip_len <= mtu || 530 (m->m_pkthdr.csum_flags & ifp->if_hwassist & CSUM_TSO) != 0 || 531 ((ip->ip_off & IP_DF) == 0 && (ifp->if_hwassist & CSUM_FRAGMENT))) { 532 ip->ip_len = htons(ip->ip_len); 533 ip->ip_off = htons(ip->ip_off); 534 ip->ip_sum = 0; 535 if (sw_csum & CSUM_DELAY_IP) 536 ip->ip_sum = in_cksum(m, hlen); 537 538 /* 539 * Record statistics for this interface address. 
540 * With CSUM_TSO the byte/packet count will be slightly 541 * incorrect because we count the IP+TCP headers only 542 * once instead of for every generated packet. 543 */ 544 if (!(flags & IP_FORWARDING) && ia) { 545 if (m->m_pkthdr.csum_flags & CSUM_TSO) 546 ia->ia_ifa.if_opackets += 547 m->m_pkthdr.len / m->m_pkthdr.tso_segsz; 548 else 549 ia->ia_ifa.if_opackets++; 550 ia->ia_ifa.if_obytes += m->m_pkthdr.len; 551 } 552#ifdef MBUF_STRESS_TEST 553 if (mbuf_frag_size && m->m_pkthdr.len > mbuf_frag_size) 554 m = m_fragment(m, M_DONTWAIT, mbuf_frag_size); 555#endif 556 /* 557 * Reset layer specific mbuf flags 558 * to avoid confusing lower layers. 559 */ 560 m->m_flags &= ~(M_PROTOFLAGS); 561 562 error = (*ifp->if_output)(ifp, m, 563 (struct sockaddr *)dst, ro->ro_rt); 564 goto done; 565 } 566 567 /* Balk when DF bit is set or the interface didn't support TSO. */ 568 if ((ip->ip_off & IP_DF) || (m->m_pkthdr.csum_flags & CSUM_TSO)) { 569 error = EMSGSIZE; 570 ipstat.ips_cantfrag++; 571 goto bad; 572 } 573 574 /* 575 * Too large for interface; fragment if possible. If successful, 576 * on return, m will point to a list of packets to be sent. 577 */ 578 error = ip_fragment(ip, &m, mtu, ifp->if_hwassist, sw_csum); 579 if (error) 580 goto bad; 581 for (; m; m = m0) { 582 m0 = m->m_nextpkt; 583 m->m_nextpkt = 0; 584 if (error == 0) { 585 /* Record statistics for this interface address. */ 586 if (ia != NULL) { 587 ia->ia_ifa.if_opackets++; 588 ia->ia_ifa.if_obytes += m->m_pkthdr.len; 589 } 590 /* 591 * Reset layer specific mbuf flags 592 * to avoid confusing upper layers. 
593 */ 594 m->m_flags &= ~(M_PROTOFLAGS); 595 596 error = (*ifp->if_output)(ifp, m, 597 (struct sockaddr *)dst, ro->ro_rt); 598 } else 599 m_freem(m); 600 } 601 602 if (error == 0) 603 ipstat.ips_fragmented++; 604 605done: 606 if (ro == &iproute && ro->ro_rt) { 607 RTFREE(ro->ro_rt); 608 } 609 return (error); 610bad: 611 m_freem(m); 612 goto done; 613} 614 615/* 616 * Create a chain of fragments which fit the given mtu. m_frag points to the 617 * mbuf to be fragmented; on return it points to the chain with the fragments. 618 * Return 0 if no error. If error, m_frag may contain a partially built 619 * chain of fragments that should be freed by the caller. 620 * 621 * if_hwassist_flags is the hw offload capabilities (see if_data.ifi_hwassist) 622 * sw_csum contains the delayed checksums flags (e.g., CSUM_DELAY_IP). 623 */ 624int 625ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu, 626 u_long if_hwassist_flags, int sw_csum) 627{ 628 int error = 0; 629 int hlen = ip->ip_hl << 2; 630 int len = (mtu - hlen) & ~7; /* size of payload in each fragment */ 631 int off; 632 struct mbuf *m0 = *m_frag; /* the original packet */ 633 int firstlen; 634 struct mbuf **mnext; 635 int nfrags; 636 637 if (ip->ip_off & IP_DF) { /* Fragmentation not allowed */ 638 ipstat.ips_cantfrag++; 639 return EMSGSIZE; 640 } 641 642 /* 643 * Must be able to put at least 8 bytes per fragment. 644 */ 645 if (len < 8) 646 return EMSGSIZE; 647 648 /* 649 * If the interface will not calculate checksums on 650 * fragmented packets, then do it here. 651 */ 652 if (m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA && 653 (if_hwassist_flags & CSUM_IP_FRAGS) == 0) { 654 in_delayed_cksum(m0); 655 m0->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; 656 } 657 658 if (len > PAGE_SIZE) { 659 /* 660 * Fragment large datagrams such that each segment 661 * contains a multiple of PAGE_SIZE amount of data, 662 * plus headers. This enables a receiver to perform 663 * page-flipping zero-copy optimizations. 
664 * 665 * XXX When does this help given that sender and receiver 666 * could have different page sizes, and also mtu could 667 * be less than the receiver's page size ? 668 */ 669 int newlen; 670 struct mbuf *m; 671 672 for (m = m0, off = 0; m && (off+m->m_len) <= mtu; m = m->m_next) 673 off += m->m_len; 674 675 /* 676 * firstlen (off - hlen) must be aligned on an 677 * 8-byte boundary 678 */ 679 if (off < hlen) 680 goto smart_frag_failure; 681 off = ((off - hlen) & ~7) + hlen; 682 newlen = (~PAGE_MASK) & mtu; 683 if ((newlen + sizeof (struct ip)) > mtu) { 684 /* we failed, go back the default */ 685smart_frag_failure: 686 newlen = len; 687 off = hlen + len; 688 } 689 len = newlen; 690 691 } else { 692 off = hlen + len; 693 } 694 695 firstlen = off - hlen; 696 mnext = &m0->m_nextpkt; /* pointer to next packet */ 697 698 /* 699 * Loop through length of segment after first fragment, 700 * make new header and copy data of each part and link onto chain. 701 * Here, m0 is the original packet, m is the fragment being created. 702 * The fragments are linked off the m_nextpkt of the original 703 * packet, which after processing serves as the first fragment. 704 */ 705 for (nfrags = 1; off < ip->ip_len; off += len, nfrags++) { 706 struct ip *mhip; /* ip header on the fragment */ 707 struct mbuf *m; 708 int mhlen = sizeof (struct ip); 709 710 MGETHDR(m, M_DONTWAIT, MT_DATA); 711 if (m == NULL) { 712 error = ENOBUFS; 713 ipstat.ips_odropped++; 714 goto done; 715 } 716 m->m_flags |= (m0->m_flags & M_MCAST) | M_FRAG; 717 /* 718 * In the first mbuf, leave room for the link header, then 719 * copy the original IP header including options. The payload 720 * goes into an additional mbuf chain returned by m_copy(). 
721 */ 722 m->m_data += max_linkhdr; 723 mhip = mtod(m, struct ip *); 724 *mhip = *ip; 725 if (hlen > sizeof (struct ip)) { 726 mhlen = ip_optcopy(ip, mhip) + sizeof (struct ip); 727 mhip->ip_v = IPVERSION; 728 mhip->ip_hl = mhlen >> 2; 729 } 730 m->m_len = mhlen; 731 /* XXX do we need to add ip->ip_off below ? */ 732 mhip->ip_off = ((off - hlen) >> 3) + ip->ip_off; 733 if (off + len >= ip->ip_len) { /* last fragment */ 734 len = ip->ip_len - off; 735 m->m_flags |= M_LASTFRAG; 736 } else 737 mhip->ip_off |= IP_MF; 738 mhip->ip_len = htons((u_short)(len + mhlen)); 739 m->m_next = m_copy(m0, off, len); 740 if (m->m_next == NULL) { /* copy failed */ 741 m_free(m); 742 error = ENOBUFS; /* ??? */ 743 ipstat.ips_odropped++; 744 goto done; 745 } 746 m->m_pkthdr.len = mhlen + len; 747 m->m_pkthdr.rcvif = NULL; 748#ifdef MAC 749 mac_netinet_fragment(m0, m); 750#endif 751 m->m_pkthdr.csum_flags = m0->m_pkthdr.csum_flags; 752 mhip->ip_off = htons(mhip->ip_off); 753 mhip->ip_sum = 0; 754 if (sw_csum & CSUM_DELAY_IP) 755 mhip->ip_sum = in_cksum(m, mhlen); 756 *mnext = m; 757 mnext = &m->m_nextpkt; 758 } 759 ipstat.ips_ofragments += nfrags; 760 761 /* set first marker for fragment chain */ 762 m0->m_flags |= M_FIRSTFRAG | M_FRAG; 763 m0->m_pkthdr.csum_data = nfrags; 764 765 /* 766 * Update first fragment by trimming what's been copied out 767 * and updating header. 
768 */ 769 m_adj(m0, hlen + firstlen - ip->ip_len); 770 m0->m_pkthdr.len = hlen + firstlen; 771 ip->ip_len = htons((u_short)m0->m_pkthdr.len); 772 ip->ip_off |= IP_MF; 773 ip->ip_off = htons(ip->ip_off); 774 ip->ip_sum = 0; 775 if (sw_csum & CSUM_DELAY_IP) 776 ip->ip_sum = in_cksum(m0, hlen); 777 778done: 779 *m_frag = m0; 780 return error; 781} 782 783void 784in_delayed_cksum(struct mbuf *m) 785{ 786 struct ip *ip; 787 u_short csum, offset; 788 789 ip = mtod(m, struct ip *); 790 offset = ip->ip_hl << 2 ; 791 csum = in_cksum_skip(m, ip->ip_len, offset); 792 if (m->m_pkthdr.csum_flags & CSUM_UDP && csum == 0) 793 csum = 0xffff; 794 offset += m->m_pkthdr.csum_data; /* checksum offset */ 795 796 if (offset + sizeof(u_short) > m->m_len) { 797 printf("delayed m_pullup, m->len: %d off: %d p: %d\n", 798 m->m_len, offset, ip->ip_p); 799 /* 800 * XXX 801 * this shouldn't happen, but if it does, the 802 * correct behavior may be to insert the checksum 803 * in the appropriate next mbuf in the chain. 804 */ 805 return; 806 } 807 *(u_short *)(m->m_data + offset) = csum; 808} 809 810/* 811 * IP socket option processing. 812 */ 813int 814ip_ctloutput(struct socket *so, struct sockopt *sopt) 815{ 816 struct inpcb *inp = sotoinpcb(so); 817 int error, optval; 818 819 error = optval = 0; 820 if (sopt->sopt_level != IPPROTO_IP) { 821 return (EINVAL); 822 } 823 824 switch (sopt->sopt_dir) { 825 case SOPT_SET: 826 switch (sopt->sopt_name) { 827 case IP_OPTIONS: 828#ifdef notyet 829 case IP_RETOPTS: 830#endif 831 { 832 struct mbuf *m; 833 if (sopt->sopt_valsize > MLEN) { 834 error = EMSGSIZE; 835 break; 836 } 837 MGET(m, sopt->sopt_td ? 
M_WAIT : M_DONTWAIT, MT_DATA); 838 if (m == NULL) { 839 error = ENOBUFS; 840 break; 841 } 842 m->m_len = sopt->sopt_valsize; 843 error = sooptcopyin(sopt, mtod(m, char *), m->m_len, 844 m->m_len); 845 if (error) { 846 m_free(m); 847 break; 848 } 849 INP_WLOCK(inp); 850 error = ip_pcbopts(inp, sopt->sopt_name, m); 851 INP_WUNLOCK(inp); 852 return (error); 853 } 854 855 case IP_TOS: 856 case IP_TTL: 857 case IP_MINTTL: 858 case IP_RECVOPTS: 859 case IP_RECVRETOPTS: 860 case IP_RECVDSTADDR: 861 case IP_RECVTTL: 862 case IP_RECVIF: 863 case IP_FAITH: 864 case IP_ONESBCAST: 865 case IP_DONTFRAG: 866 error = sooptcopyin(sopt, &optval, sizeof optval, 867 sizeof optval); 868 if (error) 869 break; 870 871 switch (sopt->sopt_name) { 872 case IP_TOS: 873 inp->inp_ip_tos = optval; 874 break; 875 876 case IP_TTL: 877 inp->inp_ip_ttl = optval; 878 break; 879 880 case IP_MINTTL: 881 if (optval > 0 && optval <= MAXTTL) 882 inp->inp_ip_minttl = optval; 883 else 884 error = EINVAL; 885 break; 886 887#define OPTSET(bit) do { \ 888 INP_WLOCK(inp); \ 889 if (optval) \ 890 inp->inp_flags |= bit; \ 891 else \ 892 inp->inp_flags &= ~bit; \ 893 INP_WUNLOCK(inp); \ 894} while (0) 895 896 case IP_RECVOPTS: 897 OPTSET(INP_RECVOPTS); 898 break; 899 900 case IP_RECVRETOPTS: 901 OPTSET(INP_RECVRETOPTS); 902 break; 903 904 case IP_RECVDSTADDR: 905 OPTSET(INP_RECVDSTADDR); 906 break; 907 908 case IP_RECVTTL: 909 OPTSET(INP_RECVTTL); 910 break; 911 912 case IP_RECVIF: 913 OPTSET(INP_RECVIF); 914 break; 915 916 case IP_FAITH: 917 OPTSET(INP_FAITH); 918 break; 919 920 case IP_ONESBCAST: 921 OPTSET(INP_ONESBCAST); 922 break; 923 case IP_DONTFRAG: 924 OPTSET(INP_DONTFRAG); 925 break; 926 } 927 break; 928#undef OPTSET 929 930 /* 931 * Multicast socket options are processed by the in_mcast 932 * module. 
933 */ 934 case IP_MULTICAST_IF: 935 case IP_MULTICAST_VIF: 936 case IP_MULTICAST_TTL: 937 case IP_MULTICAST_LOOP: 938 case IP_ADD_MEMBERSHIP: 939 case IP_DROP_MEMBERSHIP: 940 case IP_ADD_SOURCE_MEMBERSHIP: 941 case IP_DROP_SOURCE_MEMBERSHIP: 942 case IP_BLOCK_SOURCE: 943 case IP_UNBLOCK_SOURCE: 944 case IP_MSFILTER: 945 case MCAST_JOIN_GROUP: 946 case MCAST_LEAVE_GROUP: 947 case MCAST_JOIN_SOURCE_GROUP: 948 case MCAST_LEAVE_SOURCE_GROUP: 949 case MCAST_BLOCK_SOURCE: 950 case MCAST_UNBLOCK_SOURCE: 951 error = inp_setmoptions(inp, sopt); 952 break; 953 954 case IP_PORTRANGE: 955 error = sooptcopyin(sopt, &optval, sizeof optval, 956 sizeof optval); 957 if (error) 958 break; 959 960 INP_WLOCK(inp); 961 switch (optval) { 962 case IP_PORTRANGE_DEFAULT: 963 inp->inp_flags &= ~(INP_LOWPORT); 964 inp->inp_flags &= ~(INP_HIGHPORT); 965 break; 966 967 case IP_PORTRANGE_HIGH: 968 inp->inp_flags &= ~(INP_LOWPORT); 969 inp->inp_flags |= INP_HIGHPORT; 970 break; 971 972 case IP_PORTRANGE_LOW: 973 inp->inp_flags &= ~(INP_HIGHPORT); 974 inp->inp_flags |= INP_LOWPORT; 975 break; 976 977 default: 978 error = EINVAL; 979 break; 980 } 981 INP_WUNLOCK(inp); 982 break; 983 984#ifdef IPSEC 985 case IP_IPSEC_POLICY: 986 { 987 caddr_t req; 988 struct mbuf *m; 989 990 if ((error = soopt_getm(sopt, &m)) != 0) /* XXX */ 991 break; 992 if ((error = soopt_mcopyin(sopt, m)) != 0) /* XXX */ 993 break; 994 req = mtod(m, caddr_t); 995 error = ipsec4_set_policy(inp, sopt->sopt_name, req, 996 m->m_len, (sopt->sopt_td != NULL) ? 
997 sopt->sopt_td->td_ucred : NULL); 998 m_freem(m); 999 break; 1000 } 1001#endif /* IPSEC */ 1002 1003 default: 1004 error = ENOPROTOOPT; 1005 break; 1006 } 1007 break; 1008 1009 case SOPT_GET: 1010 switch (sopt->sopt_name) { 1011 case IP_OPTIONS: 1012 case IP_RETOPTS: 1013 if (inp->inp_options) 1014 error = sooptcopyout(sopt, 1015 mtod(inp->inp_options, 1016 char *), 1017 inp->inp_options->m_len); 1018 else 1019 sopt->sopt_valsize = 0; 1020 break; 1021 1022 case IP_TOS: 1023 case IP_TTL: 1024 case IP_MINTTL: 1025 case IP_RECVOPTS: 1026 case IP_RECVRETOPTS: 1027 case IP_RECVDSTADDR: 1028 case IP_RECVTTL: 1029 case IP_RECVIF: 1030 case IP_PORTRANGE: 1031 case IP_FAITH: 1032 case IP_ONESBCAST: 1033 case IP_DONTFRAG: 1034 switch (sopt->sopt_name) { 1035 1036 case IP_TOS: 1037 optval = inp->inp_ip_tos; 1038 break; 1039 1040 case IP_TTL: 1041 optval = inp->inp_ip_ttl; 1042 break; 1043 1044 case IP_MINTTL: 1045 optval = inp->inp_ip_minttl; 1046 break; 1047 1048#define OPTBIT(bit) (inp->inp_flags & bit ? 
1 : 0) 1049 1050 case IP_RECVOPTS: 1051 optval = OPTBIT(INP_RECVOPTS); 1052 break; 1053 1054 case IP_RECVRETOPTS: 1055 optval = OPTBIT(INP_RECVRETOPTS); 1056 break; 1057 1058 case IP_RECVDSTADDR: 1059 optval = OPTBIT(INP_RECVDSTADDR); 1060 break; 1061 1062 case IP_RECVTTL: 1063 optval = OPTBIT(INP_RECVTTL); 1064 break; 1065 1066 case IP_RECVIF: 1067 optval = OPTBIT(INP_RECVIF); 1068 break; 1069 1070 case IP_PORTRANGE: 1071 if (inp->inp_flags & INP_HIGHPORT) 1072 optval = IP_PORTRANGE_HIGH; 1073 else if (inp->inp_flags & INP_LOWPORT) 1074 optval = IP_PORTRANGE_LOW; 1075 else 1076 optval = 0; 1077 break; 1078 1079 case IP_FAITH: 1080 optval = OPTBIT(INP_FAITH); 1081 break; 1082 1083 case IP_ONESBCAST: 1084 optval = OPTBIT(INP_ONESBCAST); 1085 break; 1086 case IP_DONTFRAG: 1087 optval = OPTBIT(INP_DONTFRAG); 1088 break; 1089 } 1090 error = sooptcopyout(sopt, &optval, sizeof optval); 1091 break; 1092 1093 /* 1094 * Multicast socket options are processed by the in_mcast 1095 * module. 1096 */ 1097 case IP_MULTICAST_IF: 1098 case IP_MULTICAST_VIF: 1099 case IP_MULTICAST_TTL: 1100 case IP_MULTICAST_LOOP: 1101 case IP_MSFILTER: 1102 error = inp_getmoptions(inp, sopt); 1103 break; 1104 1105#ifdef IPSEC 1106 case IP_IPSEC_POLICY: 1107 { 1108 struct mbuf *m = NULL; 1109 caddr_t req = NULL; 1110 size_t len = 0; 1111 1112 if (m != 0) { 1113 req = mtod(m, caddr_t); 1114 len = m->m_len; 1115 } 1116 error = ipsec4_get_policy(sotoinpcb(so), req, len, &m); 1117 if (error == 0) 1118 error = soopt_mcopyout(sopt, m); /* XXX */ 1119 if (error == 0) 1120 m_freem(m); 1121 break; 1122 } 1123#endif /* IPSEC */ 1124 1125 default: 1126 error = ENOPROTOOPT; 1127 break; 1128 } 1129 break; 1130 } 1131 return (error); 1132} 1133 1134/* 1135 * Routine called from ip_output() to loop back a copy of an IP multicast 1136 * packet to the input queue of a specified interface. 
Note that this 1137 * calls the output routine of the loopback "driver", but with an interface 1138 * pointer that might NOT be a loopback interface -- evil, but easier than 1139 * replicating that code here. 1140 */ 1141static void 1142ip_mloopback(struct ifnet *ifp, struct mbuf *m, struct sockaddr_in *dst, 1143 int hlen) 1144{ 1145 register struct ip *ip; 1146 struct mbuf *copym; 1147 1148 copym = m_copy(m, 0, M_COPYALL); 1149 if (copym != NULL && (copym->m_flags & M_EXT || copym->m_len < hlen)) 1150 copym = m_pullup(copym, hlen); 1151 if (copym != NULL) { 1152 /* If needed, compute the checksum and mark it as valid. */ 1153 if (copym->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { 1154 in_delayed_cksum(copym); 1155 copym->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; 1156 copym->m_pkthdr.csum_flags |= 1157 CSUM_DATA_VALID | CSUM_PSEUDO_HDR; 1158 copym->m_pkthdr.csum_data = 0xffff; 1159 } 1160 /* 1161 * We don't bother to fragment if the IP length is greater 1162 * than the interface's MTU. Can this possibly matter? 1163 */ 1164 ip = mtod(copym, struct ip *); 1165 ip->ip_len = htons(ip->ip_len); 1166 ip->ip_off = htons(ip->ip_off); 1167 ip->ip_sum = 0; 1168 ip->ip_sum = in_cksum(copym, hlen); 1169 /* 1170 * NB: 1171 * It's not clear whether there are any lingering 1172 * reentrancy problems in other areas which might 1173 * be exposed by using ip_input directly (in 1174 * particular, everything which modifies the packet 1175 * in-place). Yet another option is using the 1176 * protosw directly to deliver the looped back 1177 * packet. For the moment, we'll err on the side 1178 * of safety by using if_simloop(). 1179 */ 1180#if 1 /* XXX */ 1181 if (dst->sin_family != AF_INET) { 1182 printf("ip_mloopback: bad address family %d\n", 1183 dst->sin_family); 1184 dst->sin_family = AF_INET; 1185 } 1186#endif 1187 1188#ifdef notdef 1189 copym->m_pkthdr.rcvif = ifp; 1190 ip_input(copym); 1191#else 1192 if_simloop(ifp, copym, dst->sin_family, 0); 1193#endif 1194 } 1195} 1196