ip_output.c revision 163606
1/*- 2 * Copyright (c) 1982, 1986, 1988, 1990, 1993 3 * The Regents of the University of California. All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 4. Neither the name of the University nor the names of its contributors 14 * may be used to endorse or promote products derived from this software 15 * without specific prior written permission. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 * 29 * @(#)ip_output.c 8.3 (Berkeley) 1/21/94 30 * $FreeBSD: head/sys/netinet/ip_output.c 163606 2006-10-22 11:52:19Z rwatson $ 31 */ 32 33#include "opt_ipfw.h" 34#include "opt_ipsec.h" 35#include "opt_mac.h" 36#include "opt_mbuf_stress_test.h" 37 38#include <sys/param.h> 39#include <sys/systm.h> 40#include <sys/kernel.h> 41#include <sys/malloc.h> 42#include <sys/mbuf.h> 43#include <sys/protosw.h> 44#include <sys/socket.h> 45#include <sys/socketvar.h> 46#include <sys/sysctl.h> 47 48#include <net/if.h> 49#include <net/netisr.h> 50#include <net/pfil.h> 51#include <net/route.h> 52 53#include <netinet/in.h> 54#include <netinet/in_systm.h> 55#include <netinet/ip.h> 56#include <netinet/in_pcb.h> 57#include <netinet/in_var.h> 58#include <netinet/ip_var.h> 59#include <netinet/ip_options.h> 60 61#if defined(IPSEC) || defined(FAST_IPSEC) 62#include <netinet/ip_ipsec.h> 63#ifdef IPSEC 64#include <netinet6/ipsec.h> 65#endif 66#ifdef FAST_IPSEC 67#include <netipsec/ipsec.h> 68#endif 69#endif /*IPSEC*/ 70 71#include <machine/in_cksum.h> 72 73#include <security/mac/mac_framework.h> 74 75static MALLOC_DEFINE(M_IPMOPTS, "ip_moptions", "internet multicast options"); 76 77#define print_ip(x, a, y) printf("%s %d.%d.%d.%d%s",\ 78 x, (ntohl(a.s_addr)>>24)&0xFF,\ 79 (ntohl(a.s_addr)>>16)&0xFF,\ 80 (ntohl(a.s_addr)>>8)&0xFF,\ 81 (ntohl(a.s_addr))&0xFF, y); 82 83u_short ip_id; 84 85#ifdef MBUF_STRESS_TEST 86int mbuf_frag_size = 0; 87SYSCTL_INT(_net_inet_ip, OID_AUTO, mbuf_frag_size, CTLFLAG_RW, 88 &mbuf_frag_size, 0, "Fragment outgoing mbufs to this size"); 89#endif 90 91static struct ifnet *ip_multicast_if(struct in_addr *, int *); 92static void ip_mloopback 93 (struct ifnet *, struct mbuf *, struct sockaddr_in *, int); 94static int ip_getmoptions(struct inpcb *, struct sockopt *); 95static int ip_setmoptions(struct inpcb *, struct sockopt *); 96 97 98extern struct protosw inetsw[]; 99 100/* 101 * IP output. The packet in mbuf chain m contains a skeletal IP 102 * header (with len, off, ttl, proto, tos, src, dst). 103 * The mbuf chain containing the packet will be freed. 104 * The mbuf opt, if present, will not be freed. 105 * In the IP forwarding case, the packet will arrive with options already 106 * inserted, so must have a NULL opt pointer. 107 */ 108int 109ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, 110 int flags, struct ip_moptions *imo, struct inpcb *inp) 111{ 112 struct ip *ip; 113 struct ifnet *ifp = NULL; /* keep compiler happy */ 114 struct mbuf *m0; 115 int hlen = sizeof (struct ip); 116 int mtu; 117 int len, error = 0; 118 struct sockaddr_in *dst = NULL; /* keep compiler happy */ 119 struct in_ifaddr *ia = NULL; 120 struct in_ifaddr *sia = NULL; 121 int isbroadcast, sw_csum; 122 struct route iproute; 123 struct in_addr odst; 124#ifdef IPFIREWALL_FORWARD 125 struct m_tag *fwd_tag = NULL; 126#endif 127 M_ASSERTPKTHDR(m); 128 129 if (ro == NULL) { 130 ro = &iproute; 131 bzero(ro, sizeof (*ro)); 132 } 133 134 if (inp != NULL) 135 INP_LOCK_ASSERT(inp); 136 137 if (opt) { 138 len = 0; 139 m = ip_insertoptions(m, opt, &len); 140 if (len != 0) 141 hlen = len; 142 } 143 ip = mtod(m, struct ip *); 144 145 /* 146 * Fill in IP header. If we are not allowing fragmentation, 147 * then the ip_id field is meaningless, but we don't set it 148 * to zero. Doing so causes various problems when devices along 149 * the path (routers, load balancers, firewalls, etc.) illegally 150 * disable DF on our packet. Note that a 16-bit counter 151 * will wrap around in less than 10 seconds at 100 Mbit/s on a 152 * medium with MTU 1500. See Steven M. Bellovin, "A Technique 153 * for Counting NATted Hosts", Proc. IMW'02, available at 154 * <http://www.cs.columbia.edu/~smb/papers/fnat.pdf>. 155 */ 156 if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) { 157 ip->ip_v = IPVERSION; 158 ip->ip_hl = hlen >> 2; 159 ip->ip_id = ip_newid(); 160 ipstat.ips_localout++; 161 } else { 162 hlen = ip->ip_hl << 2; 163 } 164 165 dst = (struct sockaddr_in *)&ro->ro_dst; 166again: 167 /* 168 * If there is a cached route, 169 * check that it is to the same destination 170 * and is still up. If not, free it and try again. 171 * The address family should also be checked in case of sharing the 172 * cache with IPv6. 173 */ 174 if (ro->ro_rt && ((ro->ro_rt->rt_flags & RTF_UP) == 0 || 175 dst->sin_family != AF_INET || 176 dst->sin_addr.s_addr != ip->ip_dst.s_addr)) { 177 RTFREE(ro->ro_rt); 178 ro->ro_rt = (struct rtentry *)NULL; 179 } 180#ifdef IPFIREWALL_FORWARD 181 if (ro->ro_rt == NULL && fwd_tag == NULL) { 182#else 183 if (ro->ro_rt == NULL) { 184#endif 185 bzero(dst, sizeof(*dst)); 186 dst->sin_family = AF_INET; 187 dst->sin_len = sizeof(*dst); 188 dst->sin_addr = ip->ip_dst; 189 } 190 /* 191 * If routing to interface only, 192 * short circuit routing lookup. 193 */ 194 if (flags & IP_ROUTETOIF) { 195 if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == NULL && 196 (ia = ifatoia(ifa_ifwithnet(sintosa(dst)))) == NULL) { 197 ipstat.ips_noroute++; 198 error = ENETUNREACH; 199 goto bad; 200 } 201 ifp = ia->ia_ifp; 202 ip->ip_ttl = 1; 203 isbroadcast = in_broadcast(dst->sin_addr, ifp); 204 } else if (flags & IP_SENDONES) { 205 if ((ia = ifatoia(ifa_ifwithbroadaddr(sintosa(dst)))) == NULL) { 206 ipstat.ips_noroute++; 207 error = ENETUNREACH; 208 goto bad; 209 } 210 ifp = ia->ia_ifp; 211 ip->ip_dst.s_addr = INADDR_BROADCAST; 212 dst->sin_addr = ip->ip_dst; 213 ip->ip_ttl = 1; 214 isbroadcast = 1; 215 } else if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) && 216 imo != NULL && imo->imo_multicast_ifp != NULL) { 217 /* 218 * Bypass the normal routing lookup for multicast 219 * packets if the interface is specified. 220 */ 221 ifp = imo->imo_multicast_ifp; 222 IFP_TO_IA(ifp, ia); 223 isbroadcast = 0; /* fool gcc */ 224 } else { 225 /* 226 * We want to do any cloning requested by the link layer, 227 * as this is probably required in all cases for correct 228 * operation (as it is for ARP). 229 */ 230 if (ro->ro_rt == NULL) 231 rtalloc_ign(ro, 0); 232 if (ro->ro_rt == NULL) { 233 ipstat.ips_noroute++; 234 error = EHOSTUNREACH; 235 goto bad; 236 } 237 ia = ifatoia(ro->ro_rt->rt_ifa); 238 ifp = ro->ro_rt->rt_ifp; 239 ro->ro_rt->rt_rmx.rmx_pksent++; 240 if (ro->ro_rt->rt_flags & RTF_GATEWAY) 241 dst = (struct sockaddr_in *)ro->ro_rt->rt_gateway; 242 if (ro->ro_rt->rt_flags & RTF_HOST) 243 isbroadcast = (ro->ro_rt->rt_flags & RTF_BROADCAST); 244 else 245 isbroadcast = in_broadcast(dst->sin_addr, ifp); 246 } 247 /* 248 * Calculate MTU. If we have a route that is up, use that, 249 * otherwise use the interface's MTU. 250 */ 251 if (ro->ro_rt != NULL && (ro->ro_rt->rt_flags & (RTF_UP|RTF_HOST))) { 252 /* 253 * This case can happen if the user changed the MTU 254 * of an interface after enabling IP on it. Because 255 * most netifs don't keep track of routes pointing to 256 * them, there is no way for one to update all its 257 * routes when the MTU is changed. 258 */ 259 if (ro->ro_rt->rt_rmx.rmx_mtu > ifp->if_mtu) 260 ro->ro_rt->rt_rmx.rmx_mtu = ifp->if_mtu; 261 mtu = ro->ro_rt->rt_rmx.rmx_mtu; 262 } else { 263 mtu = ifp->if_mtu; 264 } 265 if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { 266 struct in_multi *inm; 267 268 m->m_flags |= M_MCAST; 269 /* 270 * IP destination address is multicast. Make sure "dst" 271 * still points to the address in "ro". (It may have been 272 * changed to point to a gateway address, above.) 273 */ 274 dst = (struct sockaddr_in *)&ro->ro_dst; 275 /* 276 * See if the caller provided any multicast options 277 */ 278 if (imo != NULL) { 279 ip->ip_ttl = imo->imo_multicast_ttl; 280 if (imo->imo_multicast_vif != -1) 281 ip->ip_src.s_addr = 282 ip_mcast_src ? 283 ip_mcast_src(imo->imo_multicast_vif) : 284 INADDR_ANY; 285 } else 286 ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL; 287 /* 288 * Confirm that the outgoing interface supports multicast. 289 */ 290 if ((imo == NULL) || (imo->imo_multicast_vif == -1)) { 291 if ((ifp->if_flags & IFF_MULTICAST) == 0) { 292 ipstat.ips_noroute++; 293 error = ENETUNREACH; 294 goto bad; 295 } 296 } 297 /* 298 * If source address not specified yet, use address 299 * of outgoing interface. 300 */ 301 if (ip->ip_src.s_addr == INADDR_ANY) { 302 /* Interface may have no addresses. */ 303 if (ia != NULL) 304 ip->ip_src = IA_SIN(ia)->sin_addr; 305 } 306 307 IN_MULTI_LOCK(); 308 IN_LOOKUP_MULTI(ip->ip_dst, ifp, inm); 309 if (inm != NULL && 310 (imo == NULL || imo->imo_multicast_loop)) { 311 IN_MULTI_UNLOCK(); 312 /* 313 * If we belong to the destination multicast group 314 * on the outgoing interface, and the caller did not 315 * forbid loopback, loop back a copy. 316 */ 317 ip_mloopback(ifp, m, dst, hlen); 318 } 319 else { 320 IN_MULTI_UNLOCK(); 321 /* 322 * If we are acting as a multicast router, perform 323 * multicast forwarding as if the packet had just 324 * arrived on the interface to which we are about 325 * to send. The multicast forwarding function 326 * recursively calls this function, using the 327 * IP_FORWARDING flag to prevent infinite recursion. 328 * 329 * Multicasts that are looped back by ip_mloopback(), 330 * above, will be forwarded by the ip_input() routine, 331 * if necessary. 332 */ 333 if (ip_mrouter && (flags & IP_FORWARDING) == 0) { 334 /* 335 * If rsvp daemon is not running, do not 336 * set ip_moptions. This ensures that the packet 337 * is multicast and not just sent down one link 338 * as prescribed by rsvpd. 339 */ 340 if (!rsvp_on) 341 imo = NULL; 342 if (ip_mforward && 343 ip_mforward(ip, ifp, m, imo) != 0) { 344 m_freem(m); 345 goto done; 346 } 347 } 348 } 349 350 /* 351 * Multicasts with a time-to-live of zero may be looped- 352 * back, above, but must not be transmitted on a network. 353 * Also, multicasts addressed to the loopback interface 354 * are not sent -- the above call to ip_mloopback() will 355 * loop back a copy if this host actually belongs to the 356 * destination group on the loopback interface. 357 */ 358 if (ip->ip_ttl == 0 || ifp->if_flags & IFF_LOOPBACK) { 359 m_freem(m); 360 goto done; 361 } 362 363 goto sendit; 364 } 365 366 /* 367 * If the source address is not specified yet, use the address 368 * of the outoing interface. 369 */ 370 if (ip->ip_src.s_addr == INADDR_ANY) { 371 /* Interface may have no addresses. */ 372 if (ia != NULL) { 373 ip->ip_src = IA_SIN(ia)->sin_addr; 374 } 375 } 376 377 /* 378 * Verify that we have any chance at all of being able to queue the 379 * packet or packet fragments, unless ALTQ is enabled on the given 380 * interface in which case packetdrop should be done by queueing. 381 */ 382#ifdef ALTQ 383 if ((!ALTQ_IS_ENABLED(&ifp->if_snd)) && 384 ((ifp->if_snd.ifq_len + ip->ip_len / mtu + 1) >= 385 ifp->if_snd.ifq_maxlen)) 386#else 387 if ((ifp->if_snd.ifq_len + ip->ip_len / mtu + 1) >= 388 ifp->if_snd.ifq_maxlen) 389#endif /* ALTQ */ 390 { 391 error = ENOBUFS; 392 ipstat.ips_odropped++; 393 ifp->if_snd.ifq_drops += (ip->ip_len / ifp->if_mtu + 1); 394 goto bad; 395 } 396 397 /* 398 * Look for broadcast address and 399 * verify user is allowed to send 400 * such a packet. 401 */ 402 if (isbroadcast) { 403 if ((ifp->if_flags & IFF_BROADCAST) == 0) { 404 error = EADDRNOTAVAIL; 405 goto bad; 406 } 407 if ((flags & IP_ALLOWBROADCAST) == 0) { 408 error = EACCES; 409 goto bad; 410 } 411 /* don't allow broadcast messages to be fragmented */ 412 if (ip->ip_len > mtu) { 413 error = EMSGSIZE; 414 goto bad; 415 } 416 m->m_flags |= M_BCAST; 417 } else { 418 m->m_flags &= ~M_BCAST; 419 } 420 421sendit: 422#if defined(IPSEC) || defined(FAST_IPSEC) 423 switch(ip_ipsec_output(&m, inp, &flags, &error, &ro, &iproute, &dst, &ia, &ifp)) { 424 case 1: 425 goto bad; 426 case -1: 427 goto done; 428 case 0: 429 default: 430 break; /* Continue with packet processing. */ 431 } 432 /* Update variables that are affected by ipsec4_output(). */ 433 ip = mtod(m, struct ip *); 434 hlen = ip->ip_hl << 2; 435#endif /* IPSEC */ 436 437 /* Jump over all PFIL processing if hooks are not active. */ 438 if (!PFIL_HOOKED(&inet_pfil_hook)) 439 goto passout; 440 441 /* Run through list of hooks for output packets. */ 442 odst.s_addr = ip->ip_dst.s_addr; 443 error = pfil_run_hooks(&inet_pfil_hook, &m, ifp, PFIL_OUT, inp); 444 if (error != 0 || m == NULL) 445 goto done; 446 447 ip = mtod(m, struct ip *); 448 449 /* See if destination IP address was changed by packet filter. */ 450 if (odst.s_addr != ip->ip_dst.s_addr) { 451 m->m_flags |= M_SKIP_FIREWALL; 452 /* If destination is now ourself drop to ip_input(). */ 453 if (in_localip(ip->ip_dst)) { 454 m->m_flags |= M_FASTFWD_OURS; 455 if (m->m_pkthdr.rcvif == NULL) 456 m->m_pkthdr.rcvif = loif; 457 if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { 458 m->m_pkthdr.csum_flags |= 459 CSUM_DATA_VALID | CSUM_PSEUDO_HDR; 460 m->m_pkthdr.csum_data = 0xffff; 461 } 462 m->m_pkthdr.csum_flags |= 463 CSUM_IP_CHECKED | CSUM_IP_VALID; 464 465 error = netisr_queue(NETISR_IP, m); 466 goto done; 467 } else 468 goto again; /* Redo the routing table lookup. */ 469 } 470 471#ifdef IPFIREWALL_FORWARD 472 /* See if local, if yes, send it to netisr with IP_FASTFWD_OURS. */ 473 if (m->m_flags & M_FASTFWD_OURS) { 474 if (m->m_pkthdr.rcvif == NULL) 475 m->m_pkthdr.rcvif = loif; 476 if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { 477 m->m_pkthdr.csum_flags |= 478 CSUM_DATA_VALID | CSUM_PSEUDO_HDR; 479 m->m_pkthdr.csum_data = 0xffff; 480 } 481 m->m_pkthdr.csum_flags |= 482 CSUM_IP_CHECKED | CSUM_IP_VALID; 483 484 error = netisr_queue(NETISR_IP, m); 485 goto done; 486 } 487 /* Or forward to some other address? */ 488 fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL); 489 if (fwd_tag) { 490 dst = (struct sockaddr_in *)&ro->ro_dst; 491 bcopy((fwd_tag+1), dst, sizeof(struct sockaddr_in)); 492 m->m_flags |= M_SKIP_FIREWALL; 493 m_tag_delete(m, fwd_tag); 494 goto again; 495 } 496#endif /* IPFIREWALL_FORWARD */ 497 498passout: 499 /* 127/8 must not appear on wire - RFC1122. */ 500 if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET || 501 (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) { 502 if ((ifp->if_flags & IFF_LOOPBACK) == 0) { 503 ipstat.ips_badaddr++; 504 error = EADDRNOTAVAIL; 505 goto bad; 506 } 507 } 508 509 m->m_pkthdr.csum_flags |= CSUM_IP; 510 sw_csum = m->m_pkthdr.csum_flags & ~ifp->if_hwassist; 511 if (sw_csum & CSUM_DELAY_DATA) { 512 in_delayed_cksum(m); 513 sw_csum &= ~CSUM_DELAY_DATA; 514 } 515 m->m_pkthdr.csum_flags &= ifp->if_hwassist; 516 517 /* 518 * If small enough for interface, or the interface will take 519 * care of the fragmentation for us, we can just send directly. 520 */ 521 if (ip->ip_len <= mtu || 522 (m->m_pkthdr.csum_flags & ifp->if_hwassist & CSUM_TSO) != 0 || 523 ((ip->ip_off & IP_DF) == 0 && (ifp->if_hwassist & CSUM_FRAGMENT))) { 524 ip->ip_len = htons(ip->ip_len); 525 ip->ip_off = htons(ip->ip_off); 526 ip->ip_sum = 0; 527 if (sw_csum & CSUM_DELAY_IP) 528 ip->ip_sum = in_cksum(m, hlen); 529 530 /* 531 * Record statistics for this interface address. 532 * With CSUM_TSO the byte/packet count will be slightly 533 * incorrect because we count the IP+TCP headers only 534 * once instead of for every generated packet. 535 */ 536 if (!(flags & IP_FORWARDING) && ia) { 537 INADDR_TO_IFADDR(ip->ip_src, sia); 538 if (sia == NULL) 539 sia = ia; 540 if (m->m_pkthdr.csum_flags & CSUM_TSO) 541 sia->ia_ifa.if_opackets += 542 m->m_pkthdr.len / m->m_pkthdr.tso_segsz; 543 else 544 sia->ia_ifa.if_opackets++; 545 sia->ia_ifa.if_obytes += m->m_pkthdr.len; 546 } 547#ifdef IPSEC 548 /* clean ipsec history once it goes out of the node */ 549 ipsec_delaux(m); 550#endif 551#ifdef MBUF_STRESS_TEST 552 if (mbuf_frag_size && m->m_pkthdr.len > mbuf_frag_size) 553 m = m_fragment(m, M_DONTWAIT, mbuf_frag_size); 554#endif 555 /* 556 * Reset layer specific mbuf flags 557 * to avoid confusing lower layers. 558 */ 559 m->m_flags &= ~(M_PROTOFLAGS); 560 561 error = (*ifp->if_output)(ifp, m, 562 (struct sockaddr *)dst, ro->ro_rt); 563 goto done; 564 } 565 566 /* Balk when DF bit is set or the interface didn't support TSO. */ 567 if ((ip->ip_off & IP_DF) || (m->m_pkthdr.csum_flags & CSUM_TSO)) { 568 error = EMSGSIZE; 569 ipstat.ips_cantfrag++; 570 goto bad; 571 } 572 573 /* 574 * Too large for interface; fragment if possible. If successful, 575 * on return, m will point to a list of packets to be sent. 576 */ 577 error = ip_fragment(ip, &m, mtu, ifp->if_hwassist, sw_csum); 578 if (error) 579 goto bad; 580 for (; m; m = m0) { 581 m0 = m->m_nextpkt; 582 m->m_nextpkt = 0; 583#ifdef IPSEC 584 /* clean ipsec history once it goes out of the node */ 585 ipsec_delaux(m); 586#endif 587 if (error == 0) { 588 /* Record statistics for this interface address. */ 589 if (ia != NULL) { 590 INADDR_TO_IFADDR(ip->ip_src, sia); 591 if (sia == NULL) 592 sia = ia; 593 sia->ia_ifa.if_opackets++; 594 sia->ia_ifa.if_obytes += m->m_pkthdr.len; 595 } 596 /* 597 * Reset layer specific mbuf flags 598 * to avoid confusing upper layers. 599 */ 600 m->m_flags &= ~(M_PROTOFLAGS); 601 602 error = (*ifp->if_output)(ifp, m, 603 (struct sockaddr *)dst, ro->ro_rt); 604 } else 605 m_freem(m); 606 } 607 608 if (error == 0) 609 ipstat.ips_fragmented++; 610 611done: 612 if (ro == &iproute && ro->ro_rt) { 613 RTFREE(ro->ro_rt); 614 } 615 return (error); 616bad: 617 m_freem(m); 618 goto done; 619} 620 621/* 622 * Create a chain of fragments which fit the given mtu. m_frag points to the 623 * mbuf to be fragmented; on return it points to the chain with the fragments. 624 * Return 0 if no error. If error, m_frag may contain a partially built 625 * chain of fragments that should be freed by the caller. 626 * 627 * if_hwassist_flags is the hw offload capabilities (see if_data.ifi_hwassist) 628 * sw_csum contains the delayed checksums flags (e.g., CSUM_DELAY_IP). 629 */ 630int 631ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu, 632 u_long if_hwassist_flags, int sw_csum) 633{ 634 int error = 0; 635 int hlen = ip->ip_hl << 2; 636 int len = (mtu - hlen) & ~7; /* size of payload in each fragment */ 637 int off; 638 struct mbuf *m0 = *m_frag; /* the original packet */ 639 int firstlen; 640 struct mbuf **mnext; 641 int nfrags; 642 643 if (ip->ip_off & IP_DF) { /* Fragmentation not allowed */ 644 ipstat.ips_cantfrag++; 645 return EMSGSIZE; 646 } 647 648 /* 649 * Must be able to put at least 8 bytes per fragment. 650 */ 651 if (len < 8) 652 return EMSGSIZE; 653 654 /* 655 * If the interface will not calculate checksums on 656 * fragmented packets, then do it here. 657 */ 658 if (m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA && 659 (if_hwassist_flags & CSUM_IP_FRAGS) == 0) { 660 in_delayed_cksum(m0); 661 m0->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; 662 } 663 664 if (len > PAGE_SIZE) { 665 /* 666 * Fragment large datagrams such that each segment 667 * contains a multiple of PAGE_SIZE amount of data, 668 * plus headers. This enables a receiver to perform 669 * page-flipping zero-copy optimizations. 670 * 671 * XXX When does this help given that sender and receiver 672 * could have different page sizes, and also mtu could 673 * be less than the receiver's page size ? 674 */ 675 int newlen; 676 struct mbuf *m; 677 678 for (m = m0, off = 0; m && (off+m->m_len) <= mtu; m = m->m_next) 679 off += m->m_len; 680 681 /* 682 * firstlen (off - hlen) must be aligned on an 683 * 8-byte boundary 684 */ 685 if (off < hlen) 686 goto smart_frag_failure; 687 off = ((off - hlen) & ~7) + hlen; 688 newlen = (~PAGE_MASK) & mtu; 689 if ((newlen + sizeof (struct ip)) > mtu) { 690 /* we failed, go back the default */ 691smart_frag_failure: 692 newlen = len; 693 off = hlen + len; 694 } 695 len = newlen; 696 697 } else { 698 off = hlen + len; 699 } 700 701 firstlen = off - hlen; 702 mnext = &m0->m_nextpkt; /* pointer to next packet */ 703 704 /* 705 * Loop through length of segment after first fragment, 706 * make new header and copy data of each part and link onto chain. 707 * Here, m0 is the original packet, m is the fragment being created. 708 * The fragments are linked off the m_nextpkt of the original 709 * packet, which after processing serves as the first fragment. 710 */ 711 for (nfrags = 1; off < ip->ip_len; off += len, nfrags++) { 712 struct ip *mhip; /* ip header on the fragment */ 713 struct mbuf *m; 714 int mhlen = sizeof (struct ip); 715 716 MGETHDR(m, M_DONTWAIT, MT_DATA); 717 if (m == NULL) { 718 error = ENOBUFS; 719 ipstat.ips_odropped++; 720 goto done; 721 } 722 m->m_flags |= (m0->m_flags & M_MCAST) | M_FRAG; 723 /* 724 * In the first mbuf, leave room for the link header, then 725 * copy the original IP header including options. The payload 726 * goes into an additional mbuf chain returned by m_copy(). 727 */ 728 m->m_data += max_linkhdr; 729 mhip = mtod(m, struct ip *); 730 *mhip = *ip; 731 if (hlen > sizeof (struct ip)) { 732 mhlen = ip_optcopy(ip, mhip) + sizeof (struct ip); 733 mhip->ip_v = IPVERSION; 734 mhip->ip_hl = mhlen >> 2; 735 } 736 m->m_len = mhlen; 737 /* XXX do we need to add ip->ip_off below ? */ 738 mhip->ip_off = ((off - hlen) >> 3) + ip->ip_off; 739 if (off + len >= ip->ip_len) { /* last fragment */ 740 len = ip->ip_len - off; 741 m->m_flags |= M_LASTFRAG; 742 } else 743 mhip->ip_off |= IP_MF; 744 mhip->ip_len = htons((u_short)(len + mhlen)); 745 m->m_next = m_copy(m0, off, len); 746 if (m->m_next == NULL) { /* copy failed */ 747 m_free(m); 748 error = ENOBUFS; /* ??? */ 749 ipstat.ips_odropped++; 750 goto done; 751 } 752 m->m_pkthdr.len = mhlen + len; 753 m->m_pkthdr.rcvif = NULL; 754#ifdef MAC 755 mac_create_fragment(m0, m); 756#endif 757 m->m_pkthdr.csum_flags = m0->m_pkthdr.csum_flags; 758 mhip->ip_off = htons(mhip->ip_off); 759 mhip->ip_sum = 0; 760 if (sw_csum & CSUM_DELAY_IP) 761 mhip->ip_sum = in_cksum(m, mhlen); 762 *mnext = m; 763 mnext = &m->m_nextpkt; 764 } 765 ipstat.ips_ofragments += nfrags; 766 767 /* set first marker for fragment chain */ 768 m0->m_flags |= M_FIRSTFRAG | M_FRAG; 769 m0->m_pkthdr.csum_data = nfrags; 770 771 /* 772 * Update first fragment by trimming what's been copied out 773 * and updating header. 774 */ 775 m_adj(m0, hlen + firstlen - ip->ip_len); 776 m0->m_pkthdr.len = hlen + firstlen; 777 ip->ip_len = htons((u_short)m0->m_pkthdr.len); 778 ip->ip_off |= IP_MF; 779 ip->ip_off = htons(ip->ip_off); 780 ip->ip_sum = 0; 781 if (sw_csum & CSUM_DELAY_IP) 782 ip->ip_sum = in_cksum(m0, hlen); 783 784done: 785 *m_frag = m0; 786 return error; 787} 788 789void 790in_delayed_cksum(struct mbuf *m) 791{ 792 struct ip *ip; 793 u_short csum, offset; 794 795 ip = mtod(m, struct ip *); 796 offset = ip->ip_hl << 2 ; 797 csum = in_cksum_skip(m, ip->ip_len, offset); 798 if (m->m_pkthdr.csum_flags & CSUM_UDP && csum == 0) 799 csum = 0xffff; 800 offset += m->m_pkthdr.csum_data; /* checksum offset */ 801 802 if (offset + sizeof(u_short) > m->m_len) { 803 printf("delayed m_pullup, m->len: %d off: %d p: %d\n", 804 m->m_len, offset, ip->ip_p); 805 /* 806 * XXX 807 * this shouldn't happen, but if it does, the 808 * correct behavior may be to insert the checksum 809 * in the appropriate next mbuf in the chain. 810 */ 811 return; 812 } 813 *(u_short *)(m->m_data + offset) = csum; 814} 815 816/* 817 * IP socket option processing. 818 */ 819int 820ip_ctloutput(so, sopt) 821 struct socket *so; 822 struct sockopt *sopt; 823{ 824 struct inpcb *inp = sotoinpcb(so); 825 int error, optval; 826 827 error = optval = 0; 828 if (sopt->sopt_level != IPPROTO_IP) { 829 return (EINVAL); 830 } 831 832 switch (sopt->sopt_dir) { 833 case SOPT_SET: 834 switch (sopt->sopt_name) { 835 case IP_OPTIONS: 836#ifdef notyet 837 case IP_RETOPTS: 838#endif 839 { 840 struct mbuf *m; 841 if (sopt->sopt_valsize > MLEN) { 842 error = EMSGSIZE; 843 break; 844 } 845 MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA); 846 if (m == NULL) { 847 error = ENOBUFS; 848 break; 849 } 850 m->m_len = sopt->sopt_valsize; 851 error = sooptcopyin(sopt, mtod(m, char *), m->m_len, 852 m->m_len); 853 if (error) { 854 m_free(m); 855 break; 856 } 857 INP_LOCK(inp); 858 error = ip_pcbopts(inp, sopt->sopt_name, m); 859 INP_UNLOCK(inp); 860 return (error); 861 } 862 863 case IP_TOS: 864 case IP_TTL: 865 case IP_MINTTL: 866 case IP_RECVOPTS: 867 case IP_RECVRETOPTS: 868 case IP_RECVDSTADDR: 869 case IP_RECVTTL: 870 case IP_RECVIF: 871 case IP_FAITH: 872 case IP_ONESBCAST: 873 case IP_DONTFRAG: 874 error = sooptcopyin(sopt, &optval, sizeof optval, 875 sizeof optval); 876 if (error) 877 break; 878 879 switch (sopt->sopt_name) { 880 case IP_TOS: 881 inp->inp_ip_tos = optval; 882 break; 883 884 case IP_TTL: 885 inp->inp_ip_ttl = optval; 886 break; 887 888 case IP_MINTTL: 889 if (optval > 0 && optval <= MAXTTL) 890 inp->inp_ip_minttl = optval; 891 else 892 error = EINVAL; 893 break; 894 895#define OPTSET(bit) do { \ 896 INP_LOCK(inp); \ 897 if (optval) \ 898 inp->inp_flags |= bit; \ 899 else \ 900 inp->inp_flags &= ~bit; \ 901 INP_UNLOCK(inp); \ 902} while (0) 903 904 case IP_RECVOPTS: 905 OPTSET(INP_RECVOPTS); 906 break; 907 908 case IP_RECVRETOPTS: 909 OPTSET(INP_RECVRETOPTS); 910 break; 911 912 case IP_RECVDSTADDR: 913 OPTSET(INP_RECVDSTADDR); 914 break; 915 916 case IP_RECVTTL: 917 OPTSET(INP_RECVTTL); 918 break; 919 920 case IP_RECVIF: 921 OPTSET(INP_RECVIF); 922 break; 923 924 case IP_FAITH: 925 OPTSET(INP_FAITH); 926 break; 927 928 case IP_ONESBCAST: 929 OPTSET(INP_ONESBCAST); 930 break; 931 case IP_DONTFRAG: 932 OPTSET(INP_DONTFRAG); 933 break; 934 } 935 break; 936#undef OPTSET 937 938 case IP_MULTICAST_IF: 939 case IP_MULTICAST_VIF: 940 case IP_MULTICAST_TTL: 941 case IP_MULTICAST_LOOP: 942 case IP_ADD_MEMBERSHIP: 943 case IP_DROP_MEMBERSHIP: 944 error = ip_setmoptions(inp, sopt); 945 break; 946 947 case IP_PORTRANGE: 948 error = sooptcopyin(sopt, &optval, sizeof optval, 949 sizeof optval); 950 if (error) 951 break; 952 953 INP_LOCK(inp); 954 switch (optval) { 955 case IP_PORTRANGE_DEFAULT: 956 inp->inp_flags &= ~(INP_LOWPORT); 957 inp->inp_flags &= ~(INP_HIGHPORT); 958 break; 959 960 case IP_PORTRANGE_HIGH: 961 inp->inp_flags &= ~(INP_LOWPORT); 962 inp->inp_flags |= INP_HIGHPORT; 963 break; 964 965 case IP_PORTRANGE_LOW: 966 inp->inp_flags &= ~(INP_HIGHPORT); 967 inp->inp_flags |= INP_LOWPORT; 968 break; 969 970 default: 971 error = EINVAL; 972 break; 973 } 974 INP_UNLOCK(inp); 975 break; 976 977#if defined(IPSEC) || defined(FAST_IPSEC) 978 case IP_IPSEC_POLICY: 979 { 980 caddr_t req; 981 size_t len = 0; 982 int priv; 983 struct mbuf *m; 984 int optname; 985 986 if ((error = soopt_getm(sopt, &m)) != 0) /* XXX */ 987 break; 988 if ((error = soopt_mcopyin(sopt, m)) != 0) /* XXX */ 989 break; 990 priv = (sopt->sopt_td != NULL && 991 suser(sopt->sopt_td) != 0) ? 0 : 1; 992 req = mtod(m, caddr_t); 993 len = m->m_len; 994 optname = sopt->sopt_name; 995 error = ipsec4_set_policy(inp, optname, req, len, priv); 996 m_freem(m); 997 break; 998 } 999#endif /*IPSEC*/ 1000 1001 default: 1002 error = ENOPROTOOPT; 1003 break; 1004 } 1005 break; 1006 1007 case SOPT_GET: 1008 switch (sopt->sopt_name) { 1009 case IP_OPTIONS: 1010 case IP_RETOPTS: 1011 if (inp->inp_options) 1012 error = sooptcopyout(sopt, 1013 mtod(inp->inp_options, 1014 char *), 1015 inp->inp_options->m_len); 1016 else 1017 sopt->sopt_valsize = 0; 1018 break; 1019 1020 case IP_TOS: 1021 case IP_TTL: 1022 case IP_MINTTL: 1023 case IP_RECVOPTS: 1024 case IP_RECVRETOPTS: 1025 case IP_RECVDSTADDR: 1026 case IP_RECVTTL: 1027 case IP_RECVIF: 1028 case IP_PORTRANGE: 1029 case IP_FAITH: 1030 case IP_ONESBCAST: 1031 case IP_DONTFRAG: 1032 switch (sopt->sopt_name) { 1033 1034 case IP_TOS: 1035 optval = inp->inp_ip_tos; 1036 break; 1037 1038 case IP_TTL: 1039 optval = inp->inp_ip_ttl; 1040 break; 1041 1042 case IP_MINTTL: 1043 optval = inp->inp_ip_minttl; 1044 break; 1045 1046#define OPTBIT(bit) (inp->inp_flags & bit ? 1 : 0) 1047 1048 case IP_RECVOPTS: 1049 optval = OPTBIT(INP_RECVOPTS); 1050 break; 1051 1052 case IP_RECVRETOPTS: 1053 optval = OPTBIT(INP_RECVRETOPTS); 1054 break; 1055 1056 case IP_RECVDSTADDR: 1057 optval = OPTBIT(INP_RECVDSTADDR); 1058 break; 1059 1060 case IP_RECVTTL: 1061 optval = OPTBIT(INP_RECVTTL); 1062 break; 1063 1064 case IP_RECVIF: 1065 optval = OPTBIT(INP_RECVIF); 1066 break; 1067 1068 case IP_PORTRANGE: 1069 if (inp->inp_flags & INP_HIGHPORT) 1070 optval = IP_PORTRANGE_HIGH; 1071 else if (inp->inp_flags & INP_LOWPORT) 1072 optval = IP_PORTRANGE_LOW; 1073 else 1074 optval = 0; 1075 break; 1076 1077 case IP_FAITH: 1078 optval = OPTBIT(INP_FAITH); 1079 break; 1080 1081 case IP_ONESBCAST: 1082 optval = OPTBIT(INP_ONESBCAST); 1083 break; 1084 case IP_DONTFRAG: 1085 optval = OPTBIT(INP_DONTFRAG); 1086 break; 1087 } 1088 error = sooptcopyout(sopt, &optval, sizeof optval); 1089 break; 1090 1091 case IP_MULTICAST_IF: 1092 case IP_MULTICAST_VIF: 1093 case IP_MULTICAST_TTL: 1094 case IP_MULTICAST_LOOP: 1095 case IP_ADD_MEMBERSHIP: 1096 case IP_DROP_MEMBERSHIP: 1097 error = ip_getmoptions(inp, sopt); 1098 break; 1099 1100#if defined(IPSEC) || defined(FAST_IPSEC) 1101 case IP_IPSEC_POLICY: 1102 { 1103 struct mbuf *m = NULL; 1104 caddr_t req = NULL; 1105 size_t len = 0; 1106 1107 if (m != 0) { 1108 req = mtod(m, caddr_t); 1109 len = m->m_len; 1110 } 1111 error = ipsec4_get_policy(sotoinpcb(so), req, len, &m); 1112 if (error == 0) 1113 error = soopt_mcopyout(sopt, m); /* XXX */ 1114 if (error == 0) 1115 m_freem(m); 1116 break; 1117 } 1118#endif /*IPSEC*/ 1119 1120 default: 1121 error = ENOPROTOOPT; 1122 break; 1123 } 1124 break; 1125 } 1126 return (error); 1127} 1128 1129/* 1130 * XXX 1131 * The whole multicast option thing needs to be re-thought. 1132 * Several of these options are equally applicable to non-multicast 1133 * transmission, and one (IP_MULTICAST_TTL) totally duplicates a 1134 * standard option (IP_TTL). 1135 */ 1136 1137/* 1138 * following RFC1724 section 3.3, 0.0.0.0/8 is interpreted as interface index. 1139 */ 1140static struct ifnet * 1141ip_multicast_if(a, ifindexp) 1142 struct in_addr *a; 1143 int *ifindexp; 1144{ 1145 int ifindex; 1146 struct ifnet *ifp; 1147 1148 if (ifindexp) 1149 *ifindexp = 0; 1150 if (ntohl(a->s_addr) >> 24 == 0) { 1151 ifindex = ntohl(a->s_addr) & 0xffffff; 1152 if (ifindex < 0 || if_index < ifindex) 1153 return NULL; 1154 ifp = ifnet_byindex(ifindex); 1155 if (ifindexp) 1156 *ifindexp = ifindex; 1157 } else { 1158 INADDR_TO_IFP(*a, ifp); 1159 } 1160 return ifp; 1161} 1162 1163/* 1164 * Given an inpcb, return its multicast options structure pointer. Accepts 1165 * an unlocked inpcb pointer, but will return it locked. May sleep. 1166 */ 1167static struct ip_moptions * 1168ip_findmoptions(struct inpcb *inp) 1169{ 1170 struct ip_moptions *imo; 1171 struct in_multi **immp; 1172 1173 INP_LOCK(inp); 1174 if (inp->inp_moptions != NULL) 1175 return (inp->inp_moptions); 1176 1177 INP_UNLOCK(inp); 1178 1179 imo = (struct ip_moptions*)malloc(sizeof(*imo), M_IPMOPTS, M_WAITOK); 1180 immp = (struct in_multi **)malloc((sizeof(*immp) * IP_MIN_MEMBERSHIPS), 1181 M_IPMOPTS, M_WAITOK); 1182 1183 imo->imo_multicast_ifp = NULL; 1184 imo->imo_multicast_addr.s_addr = INADDR_ANY; 1185 imo->imo_multicast_vif = -1; 1186 imo->imo_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; 1187 imo->imo_multicast_loop = IP_DEFAULT_MULTICAST_LOOP; 1188 imo->imo_num_memberships = 0; 1189 imo->imo_max_memberships = IP_MIN_MEMBERSHIPS; 1190 imo->imo_membership = immp; 1191 1192 INP_LOCK(inp); 1193 if (inp->inp_moptions != NULL) { 1194 free(immp, M_IPMOPTS); 1195 free(imo, M_IPMOPTS); 1196 return (inp->inp_moptions); 1197 } 1198 inp->inp_moptions = imo; 1199 return (imo); 1200} 1201 1202/* 1203 * Set the IP multicast options in response to user setsockopt(). 1204 */ 1205static int 1206ip_setmoptions(struct inpcb *inp, struct sockopt *sopt) 1207{ 1208 int error = 0; 1209 int i; 1210 struct in_addr addr; 1211 struct ip_mreq mreq; 1212 struct ifnet *ifp; 1213 struct ip_moptions *imo; 1214 struct route ro; 1215 struct sockaddr_in *dst; 1216 int ifindex; 1217 int s; 1218 1219 switch (sopt->sopt_name) { 1220 /* store an index number for the vif you wanna use in the send */ 1221 case IP_MULTICAST_VIF: 1222 if (legal_vif_num == 0) { 1223 error = EOPNOTSUPP; 1224 break; 1225 } 1226 error = sooptcopyin(sopt, &i, sizeof i, sizeof i); 1227 if (error) 1228 break; 1229 if (!legal_vif_num(i) && (i != -1)) { 1230 error = EINVAL; 1231 break; 1232 } 1233 imo = ip_findmoptions(inp); 1234 imo->imo_multicast_vif = i; 1235 INP_UNLOCK(inp); 1236 break; 1237 1238 case IP_MULTICAST_IF: 1239 /* 1240 * Select the interface for outgoing multicast packets. 1241 */ 1242 error = sooptcopyin(sopt, &addr, sizeof addr, sizeof addr); 1243 if (error) 1244 break; 1245 /* 1246 * INADDR_ANY is used to remove a previous selection. 1247 * When no interface is selected, a default one is 1248 * chosen every time a multicast packet is sent. 1249 */ 1250 imo = ip_findmoptions(inp); 1251 if (addr.s_addr == INADDR_ANY) { 1252 imo->imo_multicast_ifp = NULL; 1253 INP_UNLOCK(inp); 1254 break; 1255 } 1256 /* 1257 * The selected interface is identified by its local 1258 * IP address. Find the interface and confirm that 1259 * it supports multicasting. 1260 */ 1261 s = splimp(); 1262 ifp = ip_multicast_if(&addr, &ifindex); 1263 if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) { 1264 INP_UNLOCK(inp); 1265 splx(s); 1266 error = EADDRNOTAVAIL; 1267 break; 1268 } 1269 imo->imo_multicast_ifp = ifp; 1270 if (ifindex) 1271 imo->imo_multicast_addr = addr; 1272 else 1273 imo->imo_multicast_addr.s_addr = INADDR_ANY; 1274 INP_UNLOCK(inp); 1275 splx(s); 1276 break; 1277 1278 case IP_MULTICAST_TTL: 1279 /* 1280 * Set the IP time-to-live for outgoing multicast packets. 1281 * The original multicast API required a char argument, 1282 * which is inconsistent with the rest of the socket API. 1283 * We allow either a char or an int. 1284 */ 1285 if (sopt->sopt_valsize == 1) { 1286 u_char ttl; 1287 error = sooptcopyin(sopt, &ttl, 1, 1); 1288 if (error) 1289 break; 1290 imo = ip_findmoptions(inp); 1291 imo->imo_multicast_ttl = ttl; 1292 INP_UNLOCK(inp); 1293 } else { 1294 u_int ttl; 1295 error = sooptcopyin(sopt, &ttl, sizeof ttl, 1296 sizeof ttl); 1297 if (error) 1298 break; 1299 if (ttl > 255) 1300 error = EINVAL; 1301 else { 1302 imo = ip_findmoptions(inp); 1303 imo->imo_multicast_ttl = ttl; 1304 INP_UNLOCK(inp); 1305 } 1306 } 1307 break; 1308 1309 case IP_MULTICAST_LOOP: 1310 /* 1311 * Set the loopback flag for outgoing multicast packets. 1312 * Must be zero or one. The original multicast API required a 1313 * char argument, which is inconsistent with the rest 1314 * of the socket API. We allow either a char or an int. 1315 */ 1316 if (sopt->sopt_valsize == 1) { 1317 u_char loop; 1318 error = sooptcopyin(sopt, &loop, 1, 1); 1319 if (error) 1320 break; 1321 imo = ip_findmoptions(inp); 1322 imo->imo_multicast_loop = !!loop; 1323 INP_UNLOCK(inp); 1324 } else { 1325 u_int loop; 1326 error = sooptcopyin(sopt, &loop, sizeof loop, 1327 sizeof loop); 1328 if (error) 1329 break; 1330 imo = ip_findmoptions(inp); 1331 imo->imo_multicast_loop = !!loop; 1332 INP_UNLOCK(inp); 1333 } 1334 break; 1335 1336 case IP_ADD_MEMBERSHIP: 1337 /* 1338 * Add a multicast group membership. 1339 * Group must be a valid IP multicast address. 1340 */ 1341 error = sooptcopyin(sopt, &mreq, sizeof mreq, sizeof mreq); 1342 if (error) 1343 break; 1344 1345 if (!IN_MULTICAST(ntohl(mreq.imr_multiaddr.s_addr))) { 1346 error = EINVAL; 1347 break; 1348 } 1349 s = splimp(); 1350 /* 1351 * If no interface address was provided, use the interface of 1352 * the route to the given multicast address. 1353 */ 1354 if (mreq.imr_interface.s_addr == INADDR_ANY) { 1355 bzero((caddr_t)&ro, sizeof(ro)); 1356 dst = (struct sockaddr_in *)&ro.ro_dst; 1357 dst->sin_len = sizeof(*dst); 1358 dst->sin_family = AF_INET; 1359 dst->sin_addr = mreq.imr_multiaddr; 1360 rtalloc_ign(&ro, RTF_CLONING); 1361 if (ro.ro_rt == NULL) { 1362 error = EADDRNOTAVAIL; 1363 splx(s); 1364 break; 1365 } 1366 ifp = ro.ro_rt->rt_ifp; 1367 RTFREE(ro.ro_rt); 1368 } 1369 else { 1370 ifp = ip_multicast_if(&mreq.imr_interface, NULL); 1371 } 1372 1373 /* 1374 * See if we found an interface, and confirm that it 1375 * supports multicast. 1376 */ 1377 if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) { 1378 error = EADDRNOTAVAIL; 1379 splx(s); 1380 break; 1381 } 1382 /* 1383 * See if the membership already exists or if all the 1384 * membership slots are full. 1385 */ 1386 imo = ip_findmoptions(inp); 1387 for (i = 0; i < imo->imo_num_memberships; ++i) { 1388 if (imo->imo_membership[i]->inm_ifp == ifp && 1389 imo->imo_membership[i]->inm_addr.s_addr 1390 == mreq.imr_multiaddr.s_addr) 1391 break; 1392 } 1393 if (i < imo->imo_num_memberships) { 1394 INP_UNLOCK(inp); 1395 error = EADDRINUSE; 1396 splx(s); 1397 break; 1398 } 1399 if (imo->imo_num_memberships == imo->imo_max_memberships) { 1400 struct in_multi **nmships, **omships; 1401 size_t newmax; 1402 /* 1403 * Resize the vector to next power-of-two minus 1. If the 1404 * size would exceed the maximum then we know we've really 1405 * run out of entries. Otherwise, we realloc() the vector 1406 * with the INP lock held to avoid introducing a race. 1407 */ 1408 nmships = NULL; 1409 omships = imo->imo_membership; 1410 newmax = ((imo->imo_max_memberships + 1) * 2) - 1; 1411 if (newmax <= IP_MAX_MEMBERSHIPS) { 1412 nmships = (struct in_multi **)realloc(omships, 1413sizeof(*nmships) * newmax, M_IPMOPTS, M_NOWAIT); 1414 if (nmships != NULL) { 1415 imo->imo_membership = nmships; 1416 imo->imo_max_memberships = newmax; 1417 } 1418 } 1419 if (nmships == NULL) { 1420 INP_UNLOCK(inp); 1421 error = ETOOMANYREFS; 1422 splx(s); 1423 break; 1424 } 1425 } 1426 /* 1427 * Everything looks good; add a new record to the multicast 1428 * address list for the given interface. 1429 */ 1430 if ((imo->imo_membership[i] = 1431 in_addmulti(&mreq.imr_multiaddr, ifp)) == NULL) { 1432 INP_UNLOCK(inp); 1433 error = ENOBUFS; 1434 splx(s); 1435 break; 1436 } 1437 ++imo->imo_num_memberships; 1438 INP_UNLOCK(inp); 1439 splx(s); 1440 break; 1441 1442 case IP_DROP_MEMBERSHIP: 1443 /* 1444 * Drop a multicast group membership. 1445 * Group must be a valid IP multicast address. 1446 */ 1447 error = sooptcopyin(sopt, &mreq, sizeof mreq, sizeof mreq); 1448 if (error) 1449 break; 1450 1451 if (!IN_MULTICAST(ntohl(mreq.imr_multiaddr.s_addr))) { 1452 error = EINVAL; 1453 break; 1454 } 1455 1456 s = splimp(); 1457 /* 1458 * If an interface address was specified, get a pointer 1459 * to its ifnet structure. 1460 */ 1461 if (mreq.imr_interface.s_addr == INADDR_ANY) 1462 ifp = NULL; 1463 else { 1464 ifp = ip_multicast_if(&mreq.imr_interface, NULL); 1465 if (ifp == NULL) { 1466 error = EADDRNOTAVAIL; 1467 splx(s); 1468 break; 1469 } 1470 } 1471 /* 1472 * Find the membership in the membership array. 1473 */ 1474 imo = ip_findmoptions(inp); 1475 for (i = 0; i < imo->imo_num_memberships; ++i) { 1476 if ((ifp == NULL || 1477 imo->imo_membership[i]->inm_ifp == ifp) && 1478 imo->imo_membership[i]->inm_addr.s_addr == 1479 mreq.imr_multiaddr.s_addr) 1480 break; 1481 } 1482 if (i == imo->imo_num_memberships) { 1483 INP_UNLOCK(inp); 1484 error = EADDRNOTAVAIL; 1485 splx(s); 1486 break; 1487 } 1488 /* 1489 * Give up the multicast address record to which the 1490 * membership points. 1491 */ 1492 in_delmulti(imo->imo_membership[i]); 1493 /* 1494 * Remove the gap in the membership array. 1495 */ 1496 for (++i; i < imo->imo_num_memberships; ++i) 1497 imo->imo_membership[i-1] = imo->imo_membership[i]; 1498 --imo->imo_num_memberships; 1499 INP_UNLOCK(inp); 1500 splx(s); 1501 break; 1502 1503 default: 1504 error = EOPNOTSUPP; 1505 break; 1506 } 1507 1508 return (error); 1509} 1510 1511/* 1512 * Return the IP multicast options in response to user getsockopt(). 1513 */ 1514static int 1515ip_getmoptions(struct inpcb *inp, struct sockopt *sopt) 1516{ 1517 struct ip_moptions *imo; 1518 struct in_addr addr; 1519 struct in_ifaddr *ia; 1520 int error, optval; 1521 u_char coptval; 1522 1523 INP_LOCK(inp); 1524 imo = inp->inp_moptions; 1525 1526 error = 0; 1527 switch (sopt->sopt_name) { 1528 case IP_MULTICAST_VIF: 1529 if (imo != NULL) 1530 optval = imo->imo_multicast_vif; 1531 else 1532 optval = -1; 1533 INP_UNLOCK(inp); 1534 error = sooptcopyout(sopt, &optval, sizeof optval); 1535 break; 1536 1537 case IP_MULTICAST_IF: 1538 if (imo == NULL || imo->imo_multicast_ifp == NULL) 1539 addr.s_addr = INADDR_ANY; 1540 else if (imo->imo_multicast_addr.s_addr) { 1541 /* return the value user has set */ 1542 addr = imo->imo_multicast_addr; 1543 } else { 1544 IFP_TO_IA(imo->imo_multicast_ifp, ia); 1545 addr.s_addr = (ia == NULL) ? INADDR_ANY 1546 : IA_SIN(ia)->sin_addr.s_addr; 1547 } 1548 INP_UNLOCK(inp); 1549 error = sooptcopyout(sopt, &addr, sizeof addr); 1550 break; 1551 1552 case IP_MULTICAST_TTL: 1553 if (imo == 0) 1554 optval = coptval = IP_DEFAULT_MULTICAST_TTL; 1555 else 1556 optval = coptval = imo->imo_multicast_ttl; 1557 INP_UNLOCK(inp); 1558 if (sopt->sopt_valsize == 1) 1559 error = sooptcopyout(sopt, &coptval, 1); 1560 else 1561 error = sooptcopyout(sopt, &optval, sizeof optval); 1562 break; 1563 1564 case IP_MULTICAST_LOOP: 1565 if (imo == 0) 1566 optval = coptval = IP_DEFAULT_MULTICAST_LOOP; 1567 else 1568 optval = coptval = imo->imo_multicast_loop; 1569 INP_UNLOCK(inp); 1570 if (sopt->sopt_valsize == 1) 1571 error = sooptcopyout(sopt, &coptval, 1); 1572 else 1573 error = sooptcopyout(sopt, &optval, sizeof optval); 1574 break; 1575 1576 default: 1577 INP_UNLOCK(inp); 1578 error = ENOPROTOOPT; 1579 break; 1580 } 1581 INP_UNLOCK_ASSERT(inp); 1582 1583 return (error); 1584} 1585 1586/* 1587 * Discard the IP multicast options. 1588 */ 1589void 1590ip_freemoptions(imo) 1591 register struct ip_moptions *imo; 1592{ 1593 register int i; 1594 1595 if (imo != NULL) { 1596 for (i = 0; i < imo->imo_num_memberships; ++i) 1597 in_delmulti(imo->imo_membership[i]); 1598 free(imo->imo_membership, M_IPMOPTS); 1599 free(imo, M_IPMOPTS); 1600 } 1601} 1602 1603/* 1604 * Routine called from ip_output() to loop back a copy of an IP multicast 1605 * packet to the input queue of a specified interface. Note that this 1606 * calls the output routine of the loopback "driver", but with an interface 1607 * pointer that might NOT be a loopback interface -- evil, but easier than 1608 * replicating that code here. 1609 */ 1610static void 1611ip_mloopback(ifp, m, dst, hlen) 1612 struct ifnet *ifp; 1613 register struct mbuf *m; 1614 register struct sockaddr_in *dst; 1615 int hlen; 1616{ 1617 register struct ip *ip; 1618 struct mbuf *copym; 1619 1620 copym = m_copy(m, 0, M_COPYALL); 1621 if (copym != NULL && (copym->m_flags & M_EXT || copym->m_len < hlen)) 1622 copym = m_pullup(copym, hlen); 1623 if (copym != NULL) { 1624 /* If needed, compute the checksum and mark it as valid. */ 1625 if (copym->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { 1626 in_delayed_cksum(copym); 1627 copym->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; 1628 copym->m_pkthdr.csum_flags |= 1629 CSUM_DATA_VALID | CSUM_PSEUDO_HDR; 1630 copym->m_pkthdr.csum_data = 0xffff; 1631 } 1632 /* 1633 * We don't bother to fragment if the IP length is greater 1634 * than the interface's MTU. Can this possibly matter? 1635 */ 1636 ip = mtod(copym, struct ip *); 1637 ip->ip_len = htons(ip->ip_len); 1638 ip->ip_off = htons(ip->ip_off); 1639 ip->ip_sum = 0; 1640 ip->ip_sum = in_cksum(copym, hlen); 1641 /* 1642 * NB: 1643 * It's not clear whether there are any lingering 1644 * reentrancy problems in other areas which might 1645 * be exposed by using ip_input directly (in 1646 * particular, everything which modifies the packet 1647 * in-place). Yet another option is using the 1648 * protosw directly to deliver the looped back 1649 * packet. For the moment, we'll err on the side 1650 * of safety by using if_simloop(). 1651 */ 1652#if 1 /* XXX */ 1653 if (dst->sin_family != AF_INET) { 1654 printf("ip_mloopback: bad address family %d\n", 1655 dst->sin_family); 1656 dst->sin_family = AF_INET; 1657 } 1658#endif 1659 1660#ifdef notdef 1661 copym->m_pkthdr.rcvif = ifp; 1662 ip_input(copym); 1663#else 1664 if_simloop(ifp, copym, dst->sin_family, 0); 1665#endif 1666 } 1667} 1668