ip_output.c revision 164033
1/*- 2 * Copyright (c) 1982, 1986, 1988, 1990, 1993 3 * The Regents of the University of California. All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 4. Neither the name of the University nor the names of its contributors 14 * may be used to endorse or promote products derived from this software 15 * without specific prior written permission. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 * 29 * @(#)ip_output.c 8.3 (Berkeley) 1/21/94 30 * $FreeBSD: head/sys/netinet/ip_output.c 164033 2006-11-06 13:42:10Z rwatson $ 31 */ 32 33#include "opt_ipfw.h" 34#include "opt_ipsec.h" 35#include "opt_mac.h" 36#include "opt_mbuf_stress_test.h" 37 38#include <sys/param.h> 39#include <sys/systm.h> 40#include <sys/kernel.h> 41#include <sys/malloc.h> 42#include <sys/mbuf.h> 43#include <sys/priv.h> 44#include <sys/protosw.h> 45#include <sys/socket.h> 46#include <sys/socketvar.h> 47#include <sys/sysctl.h> 48 49#include <net/if.h> 50#include <net/netisr.h> 51#include <net/pfil.h> 52#include <net/route.h> 53 54#include <netinet/in.h> 55#include <netinet/in_systm.h> 56#include <netinet/ip.h> 57#include <netinet/in_pcb.h> 58#include <netinet/in_var.h> 59#include <netinet/ip_var.h> 60#include <netinet/ip_options.h> 61 62#if defined(IPSEC) || defined(FAST_IPSEC) 63#include <netinet/ip_ipsec.h> 64#ifdef IPSEC 65#include <netinet6/ipsec.h> 66#endif 67#ifdef FAST_IPSEC 68#include <netipsec/ipsec.h> 69#endif 70#endif /*IPSEC*/ 71 72#include <machine/in_cksum.h> 73 74#include <security/mac/mac_framework.h> 75 76static MALLOC_DEFINE(M_IPMOPTS, "ip_moptions", "internet multicast options"); 77 78#define print_ip(x, a, y) printf("%s %d.%d.%d.%d%s",\ 79 x, (ntohl(a.s_addr)>>24)&0xFF,\ 80 (ntohl(a.s_addr)>>16)&0xFF,\ 81 (ntohl(a.s_addr)>>8)&0xFF,\ 82 (ntohl(a.s_addr))&0xFF, y); 83 84u_short ip_id; 85 86#ifdef MBUF_STRESS_TEST 87int mbuf_frag_size = 0; 88SYSCTL_INT(_net_inet_ip, OID_AUTO, mbuf_frag_size, CTLFLAG_RW, 89 &mbuf_frag_size, 0, "Fragment outgoing mbufs to this size"); 90#endif 91 92static struct ifnet *ip_multicast_if(struct in_addr *, int *); 93static void ip_mloopback 94 (struct ifnet *, struct mbuf *, struct sockaddr_in *, int); 95static int ip_getmoptions(struct inpcb *, struct sockopt *); 96static int ip_setmoptions(struct inpcb *, struct sockopt *); 97 98 99extern struct protosw inetsw[]; 100 101/* 102 * IP output. The packet in mbuf chain m contains a skeletal IP 103 * header (with len, off, ttl, proto, tos, src, dst). 104 * The mbuf chain containing the packet will be freed. 105 * The mbuf opt, if present, will not be freed. 106 * In the IP forwarding case, the packet will arrive with options already 107 * inserted, so must have a NULL opt pointer. 108 */ 109int 110ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, 111 int flags, struct ip_moptions *imo, struct inpcb *inp) 112{ 113 struct ip *ip; 114 struct ifnet *ifp = NULL; /* keep compiler happy */ 115 struct mbuf *m0; 116 int hlen = sizeof (struct ip); 117 int mtu; 118 int len, error = 0; 119 struct sockaddr_in *dst = NULL; /* keep compiler happy */ 120 struct in_ifaddr *ia = NULL; 121 struct in_ifaddr *sia = NULL; 122 int isbroadcast, sw_csum; 123 struct route iproute; 124 struct in_addr odst; 125#ifdef IPFIREWALL_FORWARD 126 struct m_tag *fwd_tag = NULL; 127#endif 128 M_ASSERTPKTHDR(m); 129 130 if (ro == NULL) { 131 ro = &iproute; 132 bzero(ro, sizeof (*ro)); 133 } 134 135 if (inp != NULL) 136 INP_LOCK_ASSERT(inp); 137 138 if (opt) { 139 len = 0; 140 m = ip_insertoptions(m, opt, &len); 141 if (len != 0) 142 hlen = len; 143 } 144 ip = mtod(m, struct ip *); 145 146 /* 147 * Fill in IP header. If we are not allowing fragmentation, 148 * then the ip_id field is meaningless, but we don't set it 149 * to zero. Doing so causes various problems when devices along 150 * the path (routers, load balancers, firewalls, etc.) illegally 151 * disable DF on our packet. Note that a 16-bit counter 152 * will wrap around in less than 10 seconds at 100 Mbit/s on a 153 * medium with MTU 1500. See Steven M. Bellovin, "A Technique 154 * for Counting NATted Hosts", Proc. IMW'02, available at 155 * <http://www.cs.columbia.edu/~smb/papers/fnat.pdf>. 156 */ 157 if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) { 158 ip->ip_v = IPVERSION; 159 ip->ip_hl = hlen >> 2; 160 ip->ip_id = ip_newid(); 161 ipstat.ips_localout++; 162 } else { 163 hlen = ip->ip_hl << 2; 164 } 165 166 dst = (struct sockaddr_in *)&ro->ro_dst; 167again: 168 /* 169 * If there is a cached route, 170 * check that it is to the same destination 171 * and is still up. If not, free it and try again. 172 * The address family should also be checked in case of sharing the 173 * cache with IPv6. 174 */ 175 if (ro->ro_rt && ((ro->ro_rt->rt_flags & RTF_UP) == 0 || 176 dst->sin_family != AF_INET || 177 dst->sin_addr.s_addr != ip->ip_dst.s_addr)) { 178 RTFREE(ro->ro_rt); 179 ro->ro_rt = (struct rtentry *)NULL; 180 } 181#ifdef IPFIREWALL_FORWARD 182 if (ro->ro_rt == NULL && fwd_tag == NULL) { 183#else 184 if (ro->ro_rt == NULL) { 185#endif 186 bzero(dst, sizeof(*dst)); 187 dst->sin_family = AF_INET; 188 dst->sin_len = sizeof(*dst); 189 dst->sin_addr = ip->ip_dst; 190 } 191 /* 192 * If routing to interface only, 193 * short circuit routing lookup. 194 */ 195 if (flags & IP_ROUTETOIF) { 196 if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == NULL && 197 (ia = ifatoia(ifa_ifwithnet(sintosa(dst)))) == NULL) { 198 ipstat.ips_noroute++; 199 error = ENETUNREACH; 200 goto bad; 201 } 202 ifp = ia->ia_ifp; 203 ip->ip_ttl = 1; 204 isbroadcast = in_broadcast(dst->sin_addr, ifp); 205 } else if (flags & IP_SENDONES) { 206 if ((ia = ifatoia(ifa_ifwithbroadaddr(sintosa(dst)))) == NULL) { 207 ipstat.ips_noroute++; 208 error = ENETUNREACH; 209 goto bad; 210 } 211 ifp = ia->ia_ifp; 212 ip->ip_dst.s_addr = INADDR_BROADCAST; 213 dst->sin_addr = ip->ip_dst; 214 ip->ip_ttl = 1; 215 isbroadcast = 1; 216 } else if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) && 217 imo != NULL && imo->imo_multicast_ifp != NULL) { 218 /* 219 * Bypass the normal routing lookup for multicast 220 * packets if the interface is specified. 221 */ 222 ifp = imo->imo_multicast_ifp; 223 IFP_TO_IA(ifp, ia); 224 isbroadcast = 0; /* fool gcc */ 225 } else { 226 /* 227 * We want to do any cloning requested by the link layer, 228 * as this is probably required in all cases for correct 229 * operation (as it is for ARP). 230 */ 231 if (ro->ro_rt == NULL) 232 rtalloc_ign(ro, 0); 233 if (ro->ro_rt == NULL) { 234 ipstat.ips_noroute++; 235 error = EHOSTUNREACH; 236 goto bad; 237 } 238 ia = ifatoia(ro->ro_rt->rt_ifa); 239 ifp = ro->ro_rt->rt_ifp; 240 ro->ro_rt->rt_rmx.rmx_pksent++; 241 if (ro->ro_rt->rt_flags & RTF_GATEWAY) 242 dst = (struct sockaddr_in *)ro->ro_rt->rt_gateway; 243 if (ro->ro_rt->rt_flags & RTF_HOST) 244 isbroadcast = (ro->ro_rt->rt_flags & RTF_BROADCAST); 245 else 246 isbroadcast = in_broadcast(dst->sin_addr, ifp); 247 } 248 /* 249 * Calculate MTU. If we have a route that is up, use that, 250 * otherwise use the interface's MTU. 251 */ 252 if (ro->ro_rt != NULL && (ro->ro_rt->rt_flags & (RTF_UP|RTF_HOST))) { 253 /* 254 * This case can happen if the user changed the MTU 255 * of an interface after enabling IP on it. Because 256 * most netifs don't keep track of routes pointing to 257 * them, there is no way for one to update all its 258 * routes when the MTU is changed. 259 */ 260 if (ro->ro_rt->rt_rmx.rmx_mtu > ifp->if_mtu) 261 ro->ro_rt->rt_rmx.rmx_mtu = ifp->if_mtu; 262 mtu = ro->ro_rt->rt_rmx.rmx_mtu; 263 } else { 264 mtu = ifp->if_mtu; 265 } 266 if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { 267 struct in_multi *inm; 268 269 m->m_flags |= M_MCAST; 270 /* 271 * IP destination address is multicast. Make sure "dst" 272 * still points to the address in "ro". (It may have been 273 * changed to point to a gateway address, above.) 274 */ 275 dst = (struct sockaddr_in *)&ro->ro_dst; 276 /* 277 * See if the caller provided any multicast options 278 */ 279 if (imo != NULL) { 280 ip->ip_ttl = imo->imo_multicast_ttl; 281 if (imo->imo_multicast_vif != -1) 282 ip->ip_src.s_addr = 283 ip_mcast_src ? 284 ip_mcast_src(imo->imo_multicast_vif) : 285 INADDR_ANY; 286 } else 287 ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL; 288 /* 289 * Confirm that the outgoing interface supports multicast. 290 */ 291 if ((imo == NULL) || (imo->imo_multicast_vif == -1)) { 292 if ((ifp->if_flags & IFF_MULTICAST) == 0) { 293 ipstat.ips_noroute++; 294 error = ENETUNREACH; 295 goto bad; 296 } 297 } 298 /* 299 * If source address not specified yet, use address 300 * of outgoing interface. 301 */ 302 if (ip->ip_src.s_addr == INADDR_ANY) { 303 /* Interface may have no addresses. */ 304 if (ia != NULL) 305 ip->ip_src = IA_SIN(ia)->sin_addr; 306 } 307 308 IN_MULTI_LOCK(); 309 IN_LOOKUP_MULTI(ip->ip_dst, ifp, inm); 310 if (inm != NULL && 311 (imo == NULL || imo->imo_multicast_loop)) { 312 IN_MULTI_UNLOCK(); 313 /* 314 * If we belong to the destination multicast group 315 * on the outgoing interface, and the caller did not 316 * forbid loopback, loop back a copy. 317 */ 318 ip_mloopback(ifp, m, dst, hlen); 319 } 320 else { 321 IN_MULTI_UNLOCK(); 322 /* 323 * If we are acting as a multicast router, perform 324 * multicast forwarding as if the packet had just 325 * arrived on the interface to which we are about 326 * to send. The multicast forwarding function 327 * recursively calls this function, using the 328 * IP_FORWARDING flag to prevent infinite recursion. 329 * 330 * Multicasts that are looped back by ip_mloopback(), 331 * above, will be forwarded by the ip_input() routine, 332 * if necessary. 333 */ 334 if (ip_mrouter && (flags & IP_FORWARDING) == 0) { 335 /* 336 * If rsvp daemon is not running, do not 337 * set ip_moptions. This ensures that the packet 338 * is multicast and not just sent down one link 339 * as prescribed by rsvpd. 340 */ 341 if (!rsvp_on) 342 imo = NULL; 343 if (ip_mforward && 344 ip_mforward(ip, ifp, m, imo) != 0) { 345 m_freem(m); 346 goto done; 347 } 348 } 349 } 350 351 /* 352 * Multicasts with a time-to-live of zero may be looped- 353 * back, above, but must not be transmitted on a network. 354 * Also, multicasts addressed to the loopback interface 355 * are not sent -- the above call to ip_mloopback() will 356 * loop back a copy if this host actually belongs to the 357 * destination group on the loopback interface. 358 */ 359 if (ip->ip_ttl == 0 || ifp->if_flags & IFF_LOOPBACK) { 360 m_freem(m); 361 goto done; 362 } 363 364 goto sendit; 365 } 366 367 /* 368 * If the source address is not specified yet, use the address 369 * of the outoing interface. 370 */ 371 if (ip->ip_src.s_addr == INADDR_ANY) { 372 /* Interface may have no addresses. */ 373 if (ia != NULL) { 374 ip->ip_src = IA_SIN(ia)->sin_addr; 375 } 376 } 377 378 /* 379 * Verify that we have any chance at all of being able to queue the 380 * packet or packet fragments, unless ALTQ is enabled on the given 381 * interface in which case packetdrop should be done by queueing. 382 */ 383#ifdef ALTQ 384 if ((!ALTQ_IS_ENABLED(&ifp->if_snd)) && 385 ((ifp->if_snd.ifq_len + ip->ip_len / mtu + 1) >= 386 ifp->if_snd.ifq_maxlen)) 387#else 388 if ((ifp->if_snd.ifq_len + ip->ip_len / mtu + 1) >= 389 ifp->if_snd.ifq_maxlen) 390#endif /* ALTQ */ 391 { 392 error = ENOBUFS; 393 ipstat.ips_odropped++; 394 ifp->if_snd.ifq_drops += (ip->ip_len / ifp->if_mtu + 1); 395 goto bad; 396 } 397 398 /* 399 * Look for broadcast address and 400 * verify user is allowed to send 401 * such a packet. 402 */ 403 if (isbroadcast) { 404 if ((ifp->if_flags & IFF_BROADCAST) == 0) { 405 error = EADDRNOTAVAIL; 406 goto bad; 407 } 408 if ((flags & IP_ALLOWBROADCAST) == 0) { 409 error = EACCES; 410 goto bad; 411 } 412 /* don't allow broadcast messages to be fragmented */ 413 if (ip->ip_len > mtu) { 414 error = EMSGSIZE; 415 goto bad; 416 } 417 m->m_flags |= M_BCAST; 418 } else { 419 m->m_flags &= ~M_BCAST; 420 } 421 422sendit: 423#if defined(IPSEC) || defined(FAST_IPSEC) 424 switch(ip_ipsec_output(&m, inp, &flags, &error, &ro, &iproute, &dst, &ia, &ifp)) { 425 case 1: 426 goto bad; 427 case -1: 428 goto done; 429 case 0: 430 default: 431 break; /* Continue with packet processing. */ 432 } 433 /* Update variables that are affected by ipsec4_output(). */ 434 ip = mtod(m, struct ip *); 435 hlen = ip->ip_hl << 2; 436#endif /* IPSEC */ 437 438 /* Jump over all PFIL processing if hooks are not active. */ 439 if (!PFIL_HOOKED(&inet_pfil_hook)) 440 goto passout; 441 442 /* Run through list of hooks for output packets. */ 443 odst.s_addr = ip->ip_dst.s_addr; 444 error = pfil_run_hooks(&inet_pfil_hook, &m, ifp, PFIL_OUT, inp); 445 if (error != 0 || m == NULL) 446 goto done; 447 448 ip = mtod(m, struct ip *); 449 450 /* See if destination IP address was changed by packet filter. */ 451 if (odst.s_addr != ip->ip_dst.s_addr) { 452 m->m_flags |= M_SKIP_FIREWALL; 453 /* If destination is now ourself drop to ip_input(). */ 454 if (in_localip(ip->ip_dst)) { 455 m->m_flags |= M_FASTFWD_OURS; 456 if (m->m_pkthdr.rcvif == NULL) 457 m->m_pkthdr.rcvif = loif; 458 if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { 459 m->m_pkthdr.csum_flags |= 460 CSUM_DATA_VALID | CSUM_PSEUDO_HDR; 461 m->m_pkthdr.csum_data = 0xffff; 462 } 463 m->m_pkthdr.csum_flags |= 464 CSUM_IP_CHECKED | CSUM_IP_VALID; 465 466 error = netisr_queue(NETISR_IP, m); 467 goto done; 468 } else 469 goto again; /* Redo the routing table lookup. */ 470 } 471 472#ifdef IPFIREWALL_FORWARD 473 /* See if local, if yes, send it to netisr with IP_FASTFWD_OURS. */ 474 if (m->m_flags & M_FASTFWD_OURS) { 475 if (m->m_pkthdr.rcvif == NULL) 476 m->m_pkthdr.rcvif = loif; 477 if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { 478 m->m_pkthdr.csum_flags |= 479 CSUM_DATA_VALID | CSUM_PSEUDO_HDR; 480 m->m_pkthdr.csum_data = 0xffff; 481 } 482 m->m_pkthdr.csum_flags |= 483 CSUM_IP_CHECKED | CSUM_IP_VALID; 484 485 error = netisr_queue(NETISR_IP, m); 486 goto done; 487 } 488 /* Or forward to some other address? */ 489 fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL); 490 if (fwd_tag) { 491 dst = (struct sockaddr_in *)&ro->ro_dst; 492 bcopy((fwd_tag+1), dst, sizeof(struct sockaddr_in)); 493 m->m_flags |= M_SKIP_FIREWALL; 494 m_tag_delete(m, fwd_tag); 495 goto again; 496 } 497#endif /* IPFIREWALL_FORWARD */ 498 499passout: 500 /* 127/8 must not appear on wire - RFC1122. */ 501 if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET || 502 (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) { 503 if ((ifp->if_flags & IFF_LOOPBACK) == 0) { 504 ipstat.ips_badaddr++; 505 error = EADDRNOTAVAIL; 506 goto bad; 507 } 508 } 509 510 m->m_pkthdr.csum_flags |= CSUM_IP; 511 sw_csum = m->m_pkthdr.csum_flags & ~ifp->if_hwassist; 512 if (sw_csum & CSUM_DELAY_DATA) { 513 in_delayed_cksum(m); 514 sw_csum &= ~CSUM_DELAY_DATA; 515 } 516 m->m_pkthdr.csum_flags &= ifp->if_hwassist; 517 518 /* 519 * If small enough for interface, or the interface will take 520 * care of the fragmentation for us, we can just send directly. 521 */ 522 if (ip->ip_len <= mtu || 523 (m->m_pkthdr.csum_flags & ifp->if_hwassist & CSUM_TSO) != 0 || 524 ((ip->ip_off & IP_DF) == 0 && (ifp->if_hwassist & CSUM_FRAGMENT))) { 525 ip->ip_len = htons(ip->ip_len); 526 ip->ip_off = htons(ip->ip_off); 527 ip->ip_sum = 0; 528 if (sw_csum & CSUM_DELAY_IP) 529 ip->ip_sum = in_cksum(m, hlen); 530 531 /* 532 * Record statistics for this interface address. 533 * With CSUM_TSO the byte/packet count will be slightly 534 * incorrect because we count the IP+TCP headers only 535 * once instead of for every generated packet. 536 */ 537 if (!(flags & IP_FORWARDING) && ia) { 538 INADDR_TO_IFADDR(ip->ip_src, sia); 539 if (sia == NULL) 540 sia = ia; 541 if (m->m_pkthdr.csum_flags & CSUM_TSO) 542 sia->ia_ifa.if_opackets += 543 m->m_pkthdr.len / m->m_pkthdr.tso_segsz; 544 else 545 sia->ia_ifa.if_opackets++; 546 sia->ia_ifa.if_obytes += m->m_pkthdr.len; 547 } 548#ifdef IPSEC 549 /* clean ipsec history once it goes out of the node */ 550 ipsec_delaux(m); 551#endif 552#ifdef MBUF_STRESS_TEST 553 if (mbuf_frag_size && m->m_pkthdr.len > mbuf_frag_size) 554 m = m_fragment(m, M_DONTWAIT, mbuf_frag_size); 555#endif 556 /* 557 * Reset layer specific mbuf flags 558 * to avoid confusing lower layers. 559 */ 560 m->m_flags &= ~(M_PROTOFLAGS); 561 562 error = (*ifp->if_output)(ifp, m, 563 (struct sockaddr *)dst, ro->ro_rt); 564 goto done; 565 } 566 567 /* Balk when DF bit is set or the interface didn't support TSO. */ 568 if ((ip->ip_off & IP_DF) || (m->m_pkthdr.csum_flags & CSUM_TSO)) { 569 error = EMSGSIZE; 570 ipstat.ips_cantfrag++; 571 goto bad; 572 } 573 574 /* 575 * Too large for interface; fragment if possible. If successful, 576 * on return, m will point to a list of packets to be sent. 577 */ 578 error = ip_fragment(ip, &m, mtu, ifp->if_hwassist, sw_csum); 579 if (error) 580 goto bad; 581 for (; m; m = m0) { 582 m0 = m->m_nextpkt; 583 m->m_nextpkt = 0; 584#ifdef IPSEC 585 /* clean ipsec history once it goes out of the node */ 586 ipsec_delaux(m); 587#endif 588 if (error == 0) { 589 /* Record statistics for this interface address. */ 590 if (ia != NULL) { 591 INADDR_TO_IFADDR(ip->ip_src, sia); 592 if (sia == NULL) 593 sia = ia; 594 sia->ia_ifa.if_opackets++; 595 sia->ia_ifa.if_obytes += m->m_pkthdr.len; 596 } 597 /* 598 * Reset layer specific mbuf flags 599 * to avoid confusing upper layers. 600 */ 601 m->m_flags &= ~(M_PROTOFLAGS); 602 603 error = (*ifp->if_output)(ifp, m, 604 (struct sockaddr *)dst, ro->ro_rt); 605 } else 606 m_freem(m); 607 } 608 609 if (error == 0) 610 ipstat.ips_fragmented++; 611 612done: 613 if (ro == &iproute && ro->ro_rt) { 614 RTFREE(ro->ro_rt); 615 } 616 return (error); 617bad: 618 m_freem(m); 619 goto done; 620} 621 622/* 623 * Create a chain of fragments which fit the given mtu. m_frag points to the 624 * mbuf to be fragmented; on return it points to the chain with the fragments. 625 * Return 0 if no error. If error, m_frag may contain a partially built 626 * chain of fragments that should be freed by the caller. 627 * 628 * if_hwassist_flags is the hw offload capabilities (see if_data.ifi_hwassist) 629 * sw_csum contains the delayed checksums flags (e.g., CSUM_DELAY_IP). 630 */ 631int 632ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu, 633 u_long if_hwassist_flags, int sw_csum) 634{ 635 int error = 0; 636 int hlen = ip->ip_hl << 2; 637 int len = (mtu - hlen) & ~7; /* size of payload in each fragment */ 638 int off; 639 struct mbuf *m0 = *m_frag; /* the original packet */ 640 int firstlen; 641 struct mbuf **mnext; 642 int nfrags; 643 644 if (ip->ip_off & IP_DF) { /* Fragmentation not allowed */ 645 ipstat.ips_cantfrag++; 646 return EMSGSIZE; 647 } 648 649 /* 650 * Must be able to put at least 8 bytes per fragment. 651 */ 652 if (len < 8) 653 return EMSGSIZE; 654 655 /* 656 * If the interface will not calculate checksums on 657 * fragmented packets, then do it here. 658 */ 659 if (m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA && 660 (if_hwassist_flags & CSUM_IP_FRAGS) == 0) { 661 in_delayed_cksum(m0); 662 m0->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; 663 } 664 665 if (len > PAGE_SIZE) { 666 /* 667 * Fragment large datagrams such that each segment 668 * contains a multiple of PAGE_SIZE amount of data, 669 * plus headers. This enables a receiver to perform 670 * page-flipping zero-copy optimizations. 671 * 672 * XXX When does this help given that sender and receiver 673 * could have different page sizes, and also mtu could 674 * be less than the receiver's page size ? 675 */ 676 int newlen; 677 struct mbuf *m; 678 679 for (m = m0, off = 0; m && (off+m->m_len) <= mtu; m = m->m_next) 680 off += m->m_len; 681 682 /* 683 * firstlen (off - hlen) must be aligned on an 684 * 8-byte boundary 685 */ 686 if (off < hlen) 687 goto smart_frag_failure; 688 off = ((off - hlen) & ~7) + hlen; 689 newlen = (~PAGE_MASK) & mtu; 690 if ((newlen + sizeof (struct ip)) > mtu) { 691 /* we failed, go back the default */ 692smart_frag_failure: 693 newlen = len; 694 off = hlen + len; 695 } 696 len = newlen; 697 698 } else { 699 off = hlen + len; 700 } 701 702 firstlen = off - hlen; 703 mnext = &m0->m_nextpkt; /* pointer to next packet */ 704 705 /* 706 * Loop through length of segment after first fragment, 707 * make new header and copy data of each part and link onto chain. 708 * Here, m0 is the original packet, m is the fragment being created. 709 * The fragments are linked off the m_nextpkt of the original 710 * packet, which after processing serves as the first fragment. 711 */ 712 for (nfrags = 1; off < ip->ip_len; off += len, nfrags++) { 713 struct ip *mhip; /* ip header on the fragment */ 714 struct mbuf *m; 715 int mhlen = sizeof (struct ip); 716 717 MGETHDR(m, M_DONTWAIT, MT_DATA); 718 if (m == NULL) { 719 error = ENOBUFS; 720 ipstat.ips_odropped++; 721 goto done; 722 } 723 m->m_flags |= (m0->m_flags & M_MCAST) | M_FRAG; 724 /* 725 * In the first mbuf, leave room for the link header, then 726 * copy the original IP header including options. The payload 727 * goes into an additional mbuf chain returned by m_copy(). 728 */ 729 m->m_data += max_linkhdr; 730 mhip = mtod(m, struct ip *); 731 *mhip = *ip; 732 if (hlen > sizeof (struct ip)) { 733 mhlen = ip_optcopy(ip, mhip) + sizeof (struct ip); 734 mhip->ip_v = IPVERSION; 735 mhip->ip_hl = mhlen >> 2; 736 } 737 m->m_len = mhlen; 738 /* XXX do we need to add ip->ip_off below ? */ 739 mhip->ip_off = ((off - hlen) >> 3) + ip->ip_off; 740 if (off + len >= ip->ip_len) { /* last fragment */ 741 len = ip->ip_len - off; 742 m->m_flags |= M_LASTFRAG; 743 } else 744 mhip->ip_off |= IP_MF; 745 mhip->ip_len = htons((u_short)(len + mhlen)); 746 m->m_next = m_copy(m0, off, len); 747 if (m->m_next == NULL) { /* copy failed */ 748 m_free(m); 749 error = ENOBUFS; /* ??? */ 750 ipstat.ips_odropped++; 751 goto done; 752 } 753 m->m_pkthdr.len = mhlen + len; 754 m->m_pkthdr.rcvif = NULL; 755#ifdef MAC 756 mac_create_fragment(m0, m); 757#endif 758 m->m_pkthdr.csum_flags = m0->m_pkthdr.csum_flags; 759 mhip->ip_off = htons(mhip->ip_off); 760 mhip->ip_sum = 0; 761 if (sw_csum & CSUM_DELAY_IP) 762 mhip->ip_sum = in_cksum(m, mhlen); 763 *mnext = m; 764 mnext = &m->m_nextpkt; 765 } 766 ipstat.ips_ofragments += nfrags; 767 768 /* set first marker for fragment chain */ 769 m0->m_flags |= M_FIRSTFRAG | M_FRAG; 770 m0->m_pkthdr.csum_data = nfrags; 771 772 /* 773 * Update first fragment by trimming what's been copied out 774 * and updating header. 775 */ 776 m_adj(m0, hlen + firstlen - ip->ip_len); 777 m0->m_pkthdr.len = hlen + firstlen; 778 ip->ip_len = htons((u_short)m0->m_pkthdr.len); 779 ip->ip_off |= IP_MF; 780 ip->ip_off = htons(ip->ip_off); 781 ip->ip_sum = 0; 782 if (sw_csum & CSUM_DELAY_IP) 783 ip->ip_sum = in_cksum(m0, hlen); 784 785done: 786 *m_frag = m0; 787 return error; 788} 789 790void 791in_delayed_cksum(struct mbuf *m) 792{ 793 struct ip *ip; 794 u_short csum, offset; 795 796 ip = mtod(m, struct ip *); 797 offset = ip->ip_hl << 2 ; 798 csum = in_cksum_skip(m, ip->ip_len, offset); 799 if (m->m_pkthdr.csum_flags & CSUM_UDP && csum == 0) 800 csum = 0xffff; 801 offset += m->m_pkthdr.csum_data; /* checksum offset */ 802 803 if (offset + sizeof(u_short) > m->m_len) { 804 printf("delayed m_pullup, m->len: %d off: %d p: %d\n", 805 m->m_len, offset, ip->ip_p); 806 /* 807 * XXX 808 * this shouldn't happen, but if it does, the 809 * correct behavior may be to insert the checksum 810 * in the appropriate next mbuf in the chain. 811 */ 812 return; 813 } 814 *(u_short *)(m->m_data + offset) = csum; 815} 816 817/* 818 * IP socket option processing. 819 */ 820int 821ip_ctloutput(so, sopt) 822 struct socket *so; 823 struct sockopt *sopt; 824{ 825 struct inpcb *inp = sotoinpcb(so); 826 int error, optval; 827 828 error = optval = 0; 829 if (sopt->sopt_level != IPPROTO_IP) { 830 return (EINVAL); 831 } 832 833 switch (sopt->sopt_dir) { 834 case SOPT_SET: 835 switch (sopt->sopt_name) { 836 case IP_OPTIONS: 837#ifdef notyet 838 case IP_RETOPTS: 839#endif 840 { 841 struct mbuf *m; 842 if (sopt->sopt_valsize > MLEN) { 843 error = EMSGSIZE; 844 break; 845 } 846 MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA); 847 if (m == NULL) { 848 error = ENOBUFS; 849 break; 850 } 851 m->m_len = sopt->sopt_valsize; 852 error = sooptcopyin(sopt, mtod(m, char *), m->m_len, 853 m->m_len); 854 if (error) { 855 m_free(m); 856 break; 857 } 858 INP_LOCK(inp); 859 error = ip_pcbopts(inp, sopt->sopt_name, m); 860 INP_UNLOCK(inp); 861 return (error); 862 } 863 864 case IP_TOS: 865 case IP_TTL: 866 case IP_MINTTL: 867 case IP_RECVOPTS: 868 case IP_RECVRETOPTS: 869 case IP_RECVDSTADDR: 870 case IP_RECVTTL: 871 case IP_RECVIF: 872 case IP_FAITH: 873 case IP_ONESBCAST: 874 case IP_DONTFRAG: 875 error = sooptcopyin(sopt, &optval, sizeof optval, 876 sizeof optval); 877 if (error) 878 break; 879 880 switch (sopt->sopt_name) { 881 case IP_TOS: 882 inp->inp_ip_tos = optval; 883 break; 884 885 case IP_TTL: 886 inp->inp_ip_ttl = optval; 887 break; 888 889 case IP_MINTTL: 890 if (optval > 0 && optval <= MAXTTL) 891 inp->inp_ip_minttl = optval; 892 else 893 error = EINVAL; 894 break; 895 896#define OPTSET(bit) do { \ 897 INP_LOCK(inp); \ 898 if (optval) \ 899 inp->inp_flags |= bit; \ 900 else \ 901 inp->inp_flags &= ~bit; \ 902 INP_UNLOCK(inp); \ 903} while (0) 904 905 case IP_RECVOPTS: 906 OPTSET(INP_RECVOPTS); 907 break; 908 909 case IP_RECVRETOPTS: 910 OPTSET(INP_RECVRETOPTS); 911 break; 912 913 case IP_RECVDSTADDR: 914 OPTSET(INP_RECVDSTADDR); 915 break; 916 917 case IP_RECVTTL: 918 OPTSET(INP_RECVTTL); 919 break; 920 921 case IP_RECVIF: 922 OPTSET(INP_RECVIF); 923 break; 924 925 case IP_FAITH: 926 OPTSET(INP_FAITH); 927 break; 928 929 case IP_ONESBCAST: 930 OPTSET(INP_ONESBCAST); 931 break; 932 case IP_DONTFRAG: 933 OPTSET(INP_DONTFRAG); 934 break; 935 } 936 break; 937#undef OPTSET 938 939 case IP_MULTICAST_IF: 940 case IP_MULTICAST_VIF: 941 case IP_MULTICAST_TTL: 942 case IP_MULTICAST_LOOP: 943 case IP_ADD_MEMBERSHIP: 944 case IP_DROP_MEMBERSHIP: 945 error = ip_setmoptions(inp, sopt); 946 break; 947 948 case IP_PORTRANGE: 949 error = sooptcopyin(sopt, &optval, sizeof optval, 950 sizeof optval); 951 if (error) 952 break; 953 954 INP_LOCK(inp); 955 switch (optval) { 956 case IP_PORTRANGE_DEFAULT: 957 inp->inp_flags &= ~(INP_LOWPORT); 958 inp->inp_flags &= ~(INP_HIGHPORT); 959 break; 960 961 case IP_PORTRANGE_HIGH: 962 inp->inp_flags &= ~(INP_LOWPORT); 963 inp->inp_flags |= INP_HIGHPORT; 964 break; 965 966 case IP_PORTRANGE_LOW: 967 inp->inp_flags &= ~(INP_HIGHPORT); 968 inp->inp_flags |= INP_LOWPORT; 969 break; 970 971 default: 972 error = EINVAL; 973 break; 974 } 975 INP_UNLOCK(inp); 976 break; 977 978#if defined(IPSEC) || defined(FAST_IPSEC) 979 case IP_IPSEC_POLICY: 980 { 981 caddr_t req; 982 size_t len = 0; 983 int priv; 984 struct mbuf *m; 985 int optname; 986 987 if ((error = soopt_getm(sopt, &m)) != 0) /* XXX */ 988 break; 989 if ((error = soopt_mcopyin(sopt, m)) != 0) /* XXX */ 990 break; 991 if (sopt->sopt_td != NULL) { 992 /* 993 * XXXRW: Would be more desirable to do this 994 * one layer down so that we only exercise 995 * privilege if it is needed. 996 */ 997 error = priv_check(sopt->sopt_td, 998 PRIV_NETINET_IPSEC); 999 if (error) 1000 priv = 0; 1001 else 1002 priv = 1; 1003 } else 1004 priv = 1; 1005 req = mtod(m, caddr_t); 1006 len = m->m_len; 1007 optname = sopt->sopt_name; 1008 error = ipsec4_set_policy(inp, optname, req, len, priv); 1009 m_freem(m); 1010 break; 1011 } 1012#endif /*IPSEC*/ 1013 1014 default: 1015 error = ENOPROTOOPT; 1016 break; 1017 } 1018 break; 1019 1020 case SOPT_GET: 1021 switch (sopt->sopt_name) { 1022 case IP_OPTIONS: 1023 case IP_RETOPTS: 1024 if (inp->inp_options) 1025 error = sooptcopyout(sopt, 1026 mtod(inp->inp_options, 1027 char *), 1028 inp->inp_options->m_len); 1029 else 1030 sopt->sopt_valsize = 0; 1031 break; 1032 1033 case IP_TOS: 1034 case IP_TTL: 1035 case IP_MINTTL: 1036 case IP_RECVOPTS: 1037 case IP_RECVRETOPTS: 1038 case IP_RECVDSTADDR: 1039 case IP_RECVTTL: 1040 case IP_RECVIF: 1041 case IP_PORTRANGE: 1042 case IP_FAITH: 1043 case IP_ONESBCAST: 1044 case IP_DONTFRAG: 1045 switch (sopt->sopt_name) { 1046 1047 case IP_TOS: 1048 optval = inp->inp_ip_tos; 1049 break; 1050 1051 case IP_TTL: 1052 optval = inp->inp_ip_ttl; 1053 break; 1054 1055 case IP_MINTTL: 1056 optval = inp->inp_ip_minttl; 1057 break; 1058 1059#define OPTBIT(bit) (inp->inp_flags & bit ? 1 : 0) 1060 1061 case IP_RECVOPTS: 1062 optval = OPTBIT(INP_RECVOPTS); 1063 break; 1064 1065 case IP_RECVRETOPTS: 1066 optval = OPTBIT(INP_RECVRETOPTS); 1067 break; 1068 1069 case IP_RECVDSTADDR: 1070 optval = OPTBIT(INP_RECVDSTADDR); 1071 break; 1072 1073 case IP_RECVTTL: 1074 optval = OPTBIT(INP_RECVTTL); 1075 break; 1076 1077 case IP_RECVIF: 1078 optval = OPTBIT(INP_RECVIF); 1079 break; 1080 1081 case IP_PORTRANGE: 1082 if (inp->inp_flags & INP_HIGHPORT) 1083 optval = IP_PORTRANGE_HIGH; 1084 else if (inp->inp_flags & INP_LOWPORT) 1085 optval = IP_PORTRANGE_LOW; 1086 else 1087 optval = 0; 1088 break; 1089 1090 case IP_FAITH: 1091 optval = OPTBIT(INP_FAITH); 1092 break; 1093 1094 case IP_ONESBCAST: 1095 optval = OPTBIT(INP_ONESBCAST); 1096 break; 1097 case IP_DONTFRAG: 1098 optval = OPTBIT(INP_DONTFRAG); 1099 break; 1100 } 1101 error = sooptcopyout(sopt, &optval, sizeof optval); 1102 break; 1103 1104 case IP_MULTICAST_IF: 1105 case IP_MULTICAST_VIF: 1106 case IP_MULTICAST_TTL: 1107 case IP_MULTICAST_LOOP: 1108 case IP_ADD_MEMBERSHIP: 1109 case IP_DROP_MEMBERSHIP: 1110 error = ip_getmoptions(inp, sopt); 1111 break; 1112 1113#if defined(IPSEC) || defined(FAST_IPSEC) 1114 case IP_IPSEC_POLICY: 1115 { 1116 struct mbuf *m = NULL; 1117 caddr_t req = NULL; 1118 size_t len = 0; 1119 1120 if (m != 0) { 1121 req = mtod(m, caddr_t); 1122 len = m->m_len; 1123 } 1124 error = ipsec4_get_policy(sotoinpcb(so), req, len, &m); 1125 if (error == 0) 1126 error = soopt_mcopyout(sopt, m); /* XXX */ 1127 if (error == 0) 1128 m_freem(m); 1129 break; 1130 } 1131#endif /*IPSEC*/ 1132 1133 default: 1134 error = ENOPROTOOPT; 1135 break; 1136 } 1137 break; 1138 } 1139 return (error); 1140} 1141 1142/* 1143 * XXX 1144 * The whole multicast option thing needs to be re-thought. 1145 * Several of these options are equally applicable to non-multicast 1146 * transmission, and one (IP_MULTICAST_TTL) totally duplicates a 1147 * standard option (IP_TTL). 1148 */ 1149 1150/* 1151 * following RFC1724 section 3.3, 0.0.0.0/8 is interpreted as interface index. 1152 */ 1153static struct ifnet * 1154ip_multicast_if(a, ifindexp) 1155 struct in_addr *a; 1156 int *ifindexp; 1157{ 1158 int ifindex; 1159 struct ifnet *ifp; 1160 1161 if (ifindexp) 1162 *ifindexp = 0; 1163 if (ntohl(a->s_addr) >> 24 == 0) { 1164 ifindex = ntohl(a->s_addr) & 0xffffff; 1165 if (ifindex < 0 || if_index < ifindex) 1166 return NULL; 1167 ifp = ifnet_byindex(ifindex); 1168 if (ifindexp) 1169 *ifindexp = ifindex; 1170 } else { 1171 INADDR_TO_IFP(*a, ifp); 1172 } 1173 return ifp; 1174} 1175 1176/* 1177 * Given an inpcb, return its multicast options structure pointer. Accepts 1178 * an unlocked inpcb pointer, but will return it locked. May sleep. 1179 */ 1180static struct ip_moptions * 1181ip_findmoptions(struct inpcb *inp) 1182{ 1183 struct ip_moptions *imo; 1184 struct in_multi **immp; 1185 1186 INP_LOCK(inp); 1187 if (inp->inp_moptions != NULL) 1188 return (inp->inp_moptions); 1189 1190 INP_UNLOCK(inp); 1191 1192 imo = (struct ip_moptions*)malloc(sizeof(*imo), M_IPMOPTS, M_WAITOK); 1193 immp = (struct in_multi **)malloc((sizeof(*immp) * IP_MIN_MEMBERSHIPS), 1194 M_IPMOPTS, M_WAITOK); 1195 1196 imo->imo_multicast_ifp = NULL; 1197 imo->imo_multicast_addr.s_addr = INADDR_ANY; 1198 imo->imo_multicast_vif = -1; 1199 imo->imo_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; 1200 imo->imo_multicast_loop = IP_DEFAULT_MULTICAST_LOOP; 1201 imo->imo_num_memberships = 0; 1202 imo->imo_max_memberships = IP_MIN_MEMBERSHIPS; 1203 imo->imo_membership = immp; 1204 1205 INP_LOCK(inp); 1206 if (inp->inp_moptions != NULL) { 1207 free(immp, M_IPMOPTS); 1208 free(imo, M_IPMOPTS); 1209 return (inp->inp_moptions); 1210 } 1211 inp->inp_moptions = imo; 1212 return (imo); 1213} 1214 1215/* 1216 * Set the IP multicast options in response to user setsockopt(). 1217 */ 1218static int 1219ip_setmoptions(struct inpcb *inp, struct sockopt *sopt) 1220{ 1221 int error = 0; 1222 int i; 1223 struct in_addr addr; 1224 struct ip_mreq mreq; 1225 struct ifnet *ifp; 1226 struct ip_moptions *imo; 1227 struct route ro; 1228 struct sockaddr_in *dst; 1229 int ifindex; 1230 int s; 1231 1232 switch (sopt->sopt_name) { 1233 /* store an index number for the vif you wanna use in the send */ 1234 case IP_MULTICAST_VIF: 1235 if (legal_vif_num == 0) { 1236 error = EOPNOTSUPP; 1237 break; 1238 } 1239 error = sooptcopyin(sopt, &i, sizeof i, sizeof i); 1240 if (error) 1241 break; 1242 if (!legal_vif_num(i) && (i != -1)) { 1243 error = EINVAL; 1244 break; 1245 } 1246 imo = ip_findmoptions(inp); 1247 imo->imo_multicast_vif = i; 1248 INP_UNLOCK(inp); 1249 break; 1250 1251 case IP_MULTICAST_IF: 1252 /* 1253 * Select the interface for outgoing multicast packets. 1254 */ 1255 error = sooptcopyin(sopt, &addr, sizeof addr, sizeof addr); 1256 if (error) 1257 break; 1258 /* 1259 * INADDR_ANY is used to remove a previous selection. 1260 * When no interface is selected, a default one is 1261 * chosen every time a multicast packet is sent. 1262 */ 1263 imo = ip_findmoptions(inp); 1264 if (addr.s_addr == INADDR_ANY) { 1265 imo->imo_multicast_ifp = NULL; 1266 INP_UNLOCK(inp); 1267 break; 1268 } 1269 /* 1270 * The selected interface is identified by its local 1271 * IP address. Find the interface and confirm that 1272 * it supports multicasting. 1273 */ 1274 s = splimp(); 1275 ifp = ip_multicast_if(&addr, &ifindex); 1276 if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) { 1277 INP_UNLOCK(inp); 1278 splx(s); 1279 error = EADDRNOTAVAIL; 1280 break; 1281 } 1282 imo->imo_multicast_ifp = ifp; 1283 if (ifindex) 1284 imo->imo_multicast_addr = addr; 1285 else 1286 imo->imo_multicast_addr.s_addr = INADDR_ANY; 1287 INP_UNLOCK(inp); 1288 splx(s); 1289 break; 1290 1291 case IP_MULTICAST_TTL: 1292 /* 1293 * Set the IP time-to-live for outgoing multicast packets. 1294 * The original multicast API required a char argument, 1295 * which is inconsistent with the rest of the socket API. 1296 * We allow either a char or an int. 1297 */ 1298 if (sopt->sopt_valsize == 1) { 1299 u_char ttl; 1300 error = sooptcopyin(sopt, &ttl, 1, 1); 1301 if (error) 1302 break; 1303 imo = ip_findmoptions(inp); 1304 imo->imo_multicast_ttl = ttl; 1305 INP_UNLOCK(inp); 1306 } else { 1307 u_int ttl; 1308 error = sooptcopyin(sopt, &ttl, sizeof ttl, 1309 sizeof ttl); 1310 if (error) 1311 break; 1312 if (ttl > 255) 1313 error = EINVAL; 1314 else { 1315 imo = ip_findmoptions(inp); 1316 imo->imo_multicast_ttl = ttl; 1317 INP_UNLOCK(inp); 1318 } 1319 } 1320 break; 1321 1322 case IP_MULTICAST_LOOP: 1323 /* 1324 * Set the loopback flag for outgoing multicast packets. 1325 * Must be zero or one. The original multicast API required a 1326 * char argument, which is inconsistent with the rest 1327 * of the socket API. We allow either a char or an int. 1328 */ 1329 if (sopt->sopt_valsize == 1) { 1330 u_char loop; 1331 error = sooptcopyin(sopt, &loop, 1, 1); 1332 if (error) 1333 break; 1334 imo = ip_findmoptions(inp); 1335 imo->imo_multicast_loop = !!loop; 1336 INP_UNLOCK(inp); 1337 } else { 1338 u_int loop; 1339 error = sooptcopyin(sopt, &loop, sizeof loop, 1340 sizeof loop); 1341 if (error) 1342 break; 1343 imo = ip_findmoptions(inp); 1344 imo->imo_multicast_loop = !!loop; 1345 INP_UNLOCK(inp); 1346 } 1347 break; 1348 1349 case IP_ADD_MEMBERSHIP: 1350 /* 1351 * Add a multicast group membership. 1352 * Group must be a valid IP multicast address. 1353 */ 1354 error = sooptcopyin(sopt, &mreq, sizeof mreq, sizeof mreq); 1355 if (error) 1356 break; 1357 1358 if (!IN_MULTICAST(ntohl(mreq.imr_multiaddr.s_addr))) { 1359 error = EINVAL; 1360 break; 1361 } 1362 s = splimp(); 1363 /* 1364 * If no interface address was provided, use the interface of 1365 * the route to the given multicast address. 1366 */ 1367 if (mreq.imr_interface.s_addr == INADDR_ANY) { 1368 bzero((caddr_t)&ro, sizeof(ro)); 1369 dst = (struct sockaddr_in *)&ro.ro_dst; 1370 dst->sin_len = sizeof(*dst); 1371 dst->sin_family = AF_INET; 1372 dst->sin_addr = mreq.imr_multiaddr; 1373 rtalloc_ign(&ro, RTF_CLONING); 1374 if (ro.ro_rt == NULL) { 1375 error = EADDRNOTAVAIL; 1376 splx(s); 1377 break; 1378 } 1379 ifp = ro.ro_rt->rt_ifp; 1380 RTFREE(ro.ro_rt); 1381 } 1382 else { 1383 ifp = ip_multicast_if(&mreq.imr_interface, NULL); 1384 } 1385 1386 /* 1387 * See if we found an interface, and confirm that it 1388 * supports multicast. 1389 */ 1390 if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) { 1391 error = EADDRNOTAVAIL; 1392 splx(s); 1393 break; 1394 } 1395 /* 1396 * See if the membership already exists or if all the 1397 * membership slots are full. 1398 */ 1399 imo = ip_findmoptions(inp); 1400 for (i = 0; i < imo->imo_num_memberships; ++i) { 1401 if (imo->imo_membership[i]->inm_ifp == ifp && 1402 imo->imo_membership[i]->inm_addr.s_addr 1403 == mreq.imr_multiaddr.s_addr) 1404 break; 1405 } 1406 if (i < imo->imo_num_memberships) { 1407 INP_UNLOCK(inp); 1408 error = EADDRINUSE; 1409 splx(s); 1410 break; 1411 } 1412 if (imo->imo_num_memberships == imo->imo_max_memberships) { 1413 struct in_multi **nmships, **omships; 1414 size_t newmax; 1415 /* 1416 * Resize the vector to next power-of-two minus 1. If the 1417 * size would exceed the maximum then we know we've really 1418 * run out of entries. Otherwise, we realloc() the vector 1419 * with the INP lock held to avoid introducing a race. 1420 */ 1421 nmships = NULL; 1422 omships = imo->imo_membership; 1423 newmax = ((imo->imo_max_memberships + 1) * 2) - 1; 1424 if (newmax <= IP_MAX_MEMBERSHIPS) { 1425 nmships = (struct in_multi **)realloc(omships, 1426sizeof(*nmships) * newmax, M_IPMOPTS, M_NOWAIT); 1427 if (nmships != NULL) { 1428 imo->imo_membership = nmships; 1429 imo->imo_max_memberships = newmax; 1430 } 1431 } 1432 if (nmships == NULL) { 1433 INP_UNLOCK(inp); 1434 error = ETOOMANYREFS; 1435 splx(s); 1436 break; 1437 } 1438 } 1439 /* 1440 * Everything looks good; add a new record to the multicast 1441 * address list for the given interface. 1442 */ 1443 if ((imo->imo_membership[i] = 1444 in_addmulti(&mreq.imr_multiaddr, ifp)) == NULL) { 1445 INP_UNLOCK(inp); 1446 error = ENOBUFS; 1447 splx(s); 1448 break; 1449 } 1450 ++imo->imo_num_memberships; 1451 INP_UNLOCK(inp); 1452 splx(s); 1453 break; 1454 1455 case IP_DROP_MEMBERSHIP: 1456 /* 1457 * Drop a multicast group membership. 1458 * Group must be a valid IP multicast address. 1459 */ 1460 error = sooptcopyin(sopt, &mreq, sizeof mreq, sizeof mreq); 1461 if (error) 1462 break; 1463 1464 if (!IN_MULTICAST(ntohl(mreq.imr_multiaddr.s_addr))) { 1465 error = EINVAL; 1466 break; 1467 } 1468 1469 s = splimp(); 1470 /* 1471 * If an interface address was specified, get a pointer 1472 * to its ifnet structure. 1473 */ 1474 if (mreq.imr_interface.s_addr == INADDR_ANY) 1475 ifp = NULL; 1476 else { 1477 ifp = ip_multicast_if(&mreq.imr_interface, NULL); 1478 if (ifp == NULL) { 1479 error = EADDRNOTAVAIL; 1480 splx(s); 1481 break; 1482 } 1483 } 1484 /* 1485 * Find the membership in the membership array. 1486 */ 1487 imo = ip_findmoptions(inp); 1488 for (i = 0; i < imo->imo_num_memberships; ++i) { 1489 if ((ifp == NULL || 1490 imo->imo_membership[i]->inm_ifp == ifp) && 1491 imo->imo_membership[i]->inm_addr.s_addr == 1492 mreq.imr_multiaddr.s_addr) 1493 break; 1494 } 1495 if (i == imo->imo_num_memberships) { 1496 INP_UNLOCK(inp); 1497 error = EADDRNOTAVAIL; 1498 splx(s); 1499 break; 1500 } 1501 /* 1502 * Give up the multicast address record to which the 1503 * membership points. 1504 */ 1505 in_delmulti(imo->imo_membership[i]); 1506 /* 1507 * Remove the gap in the membership array. 1508 */ 1509 for (++i; i < imo->imo_num_memberships; ++i) 1510 imo->imo_membership[i-1] = imo->imo_membership[i]; 1511 --imo->imo_num_memberships; 1512 INP_UNLOCK(inp); 1513 splx(s); 1514 break; 1515 1516 default: 1517 error = EOPNOTSUPP; 1518 break; 1519 } 1520 1521 return (error); 1522} 1523 1524/* 1525 * Return the IP multicast options in response to user getsockopt(). 1526 */ 1527static int 1528ip_getmoptions(struct inpcb *inp, struct sockopt *sopt) 1529{ 1530 struct ip_moptions *imo; 1531 struct in_addr addr; 1532 struct in_ifaddr *ia; 1533 int error, optval; 1534 u_char coptval; 1535 1536 INP_LOCK(inp); 1537 imo = inp->inp_moptions; 1538 1539 error = 0; 1540 switch (sopt->sopt_name) { 1541 case IP_MULTICAST_VIF: 1542 if (imo != NULL) 1543 optval = imo->imo_multicast_vif; 1544 else 1545 optval = -1; 1546 INP_UNLOCK(inp); 1547 error = sooptcopyout(sopt, &optval, sizeof optval); 1548 break; 1549 1550 case IP_MULTICAST_IF: 1551 if (imo == NULL || imo->imo_multicast_ifp == NULL) 1552 addr.s_addr = INADDR_ANY; 1553 else if (imo->imo_multicast_addr.s_addr) { 1554 /* return the value user has set */ 1555 addr = imo->imo_multicast_addr; 1556 } else { 1557 IFP_TO_IA(imo->imo_multicast_ifp, ia); 1558 addr.s_addr = (ia == NULL) ? INADDR_ANY 1559 : IA_SIN(ia)->sin_addr.s_addr; 1560 } 1561 INP_UNLOCK(inp); 1562 error = sooptcopyout(sopt, &addr, sizeof addr); 1563 break; 1564 1565 case IP_MULTICAST_TTL: 1566 if (imo == 0) 1567 optval = coptval = IP_DEFAULT_MULTICAST_TTL; 1568 else 1569 optval = coptval = imo->imo_multicast_ttl; 1570 INP_UNLOCK(inp); 1571 if (sopt->sopt_valsize == 1) 1572 error = sooptcopyout(sopt, &coptval, 1); 1573 else 1574 error = sooptcopyout(sopt, &optval, sizeof optval); 1575 break; 1576 1577 case IP_MULTICAST_LOOP: 1578 if (imo == 0) 1579 optval = coptval = IP_DEFAULT_MULTICAST_LOOP; 1580 else 1581 optval = coptval = imo->imo_multicast_loop; 1582 INP_UNLOCK(inp); 1583 if (sopt->sopt_valsize == 1) 1584 error = sooptcopyout(sopt, &coptval, 1); 1585 else 1586 error = sooptcopyout(sopt, &optval, sizeof optval); 1587 break; 1588 1589 default: 1590 INP_UNLOCK(inp); 1591 error = ENOPROTOOPT; 1592 break; 1593 } 1594 INP_UNLOCK_ASSERT(inp); 1595 1596 return (error); 1597} 1598 1599/* 1600 * Discard the IP multicast options. 1601 */ 1602void 1603ip_freemoptions(imo) 1604 register struct ip_moptions *imo; 1605{ 1606 register int i; 1607 1608 if (imo != NULL) { 1609 for (i = 0; i < imo->imo_num_memberships; ++i) 1610 in_delmulti(imo->imo_membership[i]); 1611 free(imo->imo_membership, M_IPMOPTS); 1612 free(imo, M_IPMOPTS); 1613 } 1614} 1615 1616/* 1617 * Routine called from ip_output() to loop back a copy of an IP multicast 1618 * packet to the input queue of a specified interface. Note that this 1619 * calls the output routine of the loopback "driver", but with an interface 1620 * pointer that might NOT be a loopback interface -- evil, but easier than 1621 * replicating that code here. 1622 */ 1623static void 1624ip_mloopback(ifp, m, dst, hlen) 1625 struct ifnet *ifp; 1626 register struct mbuf *m; 1627 register struct sockaddr_in *dst; 1628 int hlen; 1629{ 1630 register struct ip *ip; 1631 struct mbuf *copym; 1632 1633 copym = m_copy(m, 0, M_COPYALL); 1634 if (copym != NULL && (copym->m_flags & M_EXT || copym->m_len < hlen)) 1635 copym = m_pullup(copym, hlen); 1636 if (copym != NULL) { 1637 /* If needed, compute the checksum and mark it as valid. */ 1638 if (copym->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { 1639 in_delayed_cksum(copym); 1640 copym->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; 1641 copym->m_pkthdr.csum_flags |= 1642 CSUM_DATA_VALID | CSUM_PSEUDO_HDR; 1643 copym->m_pkthdr.csum_data = 0xffff; 1644 } 1645 /* 1646 * We don't bother to fragment if the IP length is greater 1647 * than the interface's MTU. Can this possibly matter? 1648 */ 1649 ip = mtod(copym, struct ip *); 1650 ip->ip_len = htons(ip->ip_len); 1651 ip->ip_off = htons(ip->ip_off); 1652 ip->ip_sum = 0; 1653 ip->ip_sum = in_cksum(copym, hlen); 1654 /* 1655 * NB: 1656 * It's not clear whether there are any lingering 1657 * reentrancy problems in other areas which might 1658 * be exposed by using ip_input directly (in 1659 * particular, everything which modifies the packet 1660 * in-place). Yet another option is using the 1661 * protosw directly to deliver the looped back 1662 * packet. For the moment, we'll err on the side 1663 * of safety by using if_simloop(). 1664 */ 1665#if 1 /* XXX */ 1666 if (dst->sin_family != AF_INET) { 1667 printf("ip_mloopback: bad address family %d\n", 1668 dst->sin_family); 1669 dst->sin_family = AF_INET; 1670 } 1671#endif 1672 1673#ifdef notdef 1674 copym->m_pkthdr.rcvif = ifp; 1675 ip_input(copym); 1676#else 1677 if_simloop(ifp, copym, dst->sin_family, 0); 1678#endif 1679 } 1680} 1681