ip_output.c revision 154520
1/*- 2 * Copyright (c) 1982, 1986, 1988, 1990, 1993 3 * The Regents of the University of California. All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 4. Neither the name of the University nor the names of its contributors 14 * may be used to endorse or promote products derived from this software 15 * without specific prior written permission. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 * 29 * @(#)ip_output.c 8.3 (Berkeley) 1/21/94 30 * $FreeBSD: head/sys/netinet/ip_output.c 154520 2006-01-18 15:05:05Z andre $ 31 */ 32 33#include "opt_ipfw.h" 34#include "opt_ipsec.h" 35#include "opt_mac.h" 36#include "opt_mbuf_stress_test.h" 37 38#include <sys/param.h> 39#include <sys/systm.h> 40#include <sys/kernel.h> 41#include <sys/mac.h> 42#include <sys/malloc.h> 43#include <sys/mbuf.h> 44#include <sys/protosw.h> 45#include <sys/socket.h> 46#include <sys/socketvar.h> 47#include <sys/sysctl.h> 48 49#include <net/if.h> 50#include <net/netisr.h> 51#include <net/pfil.h> 52#include <net/route.h> 53 54#include <netinet/in.h> 55#include <netinet/in_systm.h> 56#include <netinet/ip.h> 57#include <netinet/in_pcb.h> 58#include <netinet/in_var.h> 59#include <netinet/ip_var.h> 60#include <netinet/ip_options.h> 61 62#include <machine/in_cksum.h> 63 64static MALLOC_DEFINE(M_IPMOPTS, "ip_moptions", "internet multicast options"); 65 66#ifdef IPSEC 67#include <netinet6/ipsec.h> 68#include <netkey/key.h> 69#ifdef IPSEC_DEBUG 70#include <netkey/key_debug.h> 71#else 72#define KEYDEBUG(lev,arg) 73#endif 74#endif /*IPSEC*/ 75 76#ifdef FAST_IPSEC 77#include <netipsec/ipsec.h> 78#include <netipsec/xform.h> 79#include <netipsec/key.h> 80#endif /*FAST_IPSEC*/ 81 82#define print_ip(x, a, y) printf("%s %d.%d.%d.%d%s",\ 83 x, (ntohl(a.s_addr)>>24)&0xFF,\ 84 (ntohl(a.s_addr)>>16)&0xFF,\ 85 (ntohl(a.s_addr)>>8)&0xFF,\ 86 (ntohl(a.s_addr))&0xFF, y); 87 88u_short ip_id; 89 90#ifdef MBUF_STRESS_TEST 91int mbuf_frag_size = 0; 92SYSCTL_INT(_net_inet_ip, OID_AUTO, mbuf_frag_size, CTLFLAG_RW, 93 &mbuf_frag_size, 0, "Fragment outgoing mbufs to this size"); 94#endif 95 96static struct ifnet *ip_multicast_if(struct in_addr *, int *); 97static void ip_mloopback 98 (struct ifnet *, struct mbuf *, struct sockaddr_in *, int); 99static int ip_getmoptions(struct inpcb *, struct sockopt *); 100static int ip_setmoptions(struct inpcb *, struct sockopt *); 101 102 103extern struct protosw inetsw[]; 104 105/* 106 * IP output. The packet in mbuf chain m contains a skeletal IP 107 * header (with len, off, ttl, proto, tos, src, dst). 108 * The mbuf chain containing the packet will be freed. 109 * The mbuf opt, if present, will not be freed. 110 * In the IP forwarding case, the packet will arrive with options already 111 * inserted, so must have a NULL opt pointer. 112 */ 113int 114ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, 115 int flags, struct ip_moptions *imo, struct inpcb *inp) 116{ 117 struct ip *ip; 118 struct ifnet *ifp = NULL; /* keep compiler happy */ 119 struct mbuf *m0; 120 int hlen = sizeof (struct ip); 121 int len, error = 0; 122 struct sockaddr_in *dst = NULL; /* keep compiler happy */ 123 struct in_ifaddr *ia = NULL; 124 int isbroadcast, sw_csum; 125 struct route iproute; 126 struct in_addr odst; 127#ifdef IPFIREWALL_FORWARD 128 struct m_tag *fwd_tag = NULL; 129#endif 130#ifdef IPSEC 131 struct secpolicy *sp = NULL; 132#endif 133#ifdef FAST_IPSEC 134 struct secpolicy *sp = NULL; 135 struct tdb_ident *tdbi; 136 struct m_tag *mtag; 137 int s; 138#endif /* FAST_IPSEC */ 139 140 M_ASSERTPKTHDR(m); 141 142 if (ro == NULL) { 143 ro = &iproute; 144 bzero(ro, sizeof (*ro)); 145 } 146 147 if (inp != NULL) 148 INP_LOCK_ASSERT(inp); 149 150 if (opt) { 151 len = 0; 152 m = ip_insertoptions(m, opt, &len); 153 if (len != 0) 154 hlen = len; 155 } 156 ip = mtod(m, struct ip *); 157 158 /* 159 * Fill in IP header. If we are not allowing fragmentation, 160 * then the ip_id field is meaningless, but we don't set it 161 * to zero. Doing so causes various problems when devices along 162 * the path (routers, load balancers, firewalls, etc.) illegally 163 * disable DF on our packet. Note that a 16-bit counter 164 * will wrap around in less than 10 seconds at 100 Mbit/s on a 165 * medium with MTU 1500. See Steven M. Bellovin, "A Technique 166 * for Counting NATted Hosts", Proc. IMW'02, available at 167 * <http://www.research.att.com/~smb/papers/fnat.pdf>. 168 */ 169 if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) { 170 ip->ip_v = IPVERSION; 171 ip->ip_hl = hlen >> 2; 172 ip->ip_id = ip_newid(); 173 ipstat.ips_localout++; 174 } else { 175 hlen = ip->ip_hl << 2; 176 } 177 178 dst = (struct sockaddr_in *)&ro->ro_dst; 179again: 180 /* 181 * If there is a cached route, 182 * check that it is to the same destination 183 * and is still up. If not, free it and try again. 184 * The address family should also be checked in case of sharing the 185 * cache with IPv6. 186 */ 187 if (ro->ro_rt && ((ro->ro_rt->rt_flags & RTF_UP) == 0 || 188 dst->sin_family != AF_INET || 189 dst->sin_addr.s_addr != ip->ip_dst.s_addr)) { 190 RTFREE(ro->ro_rt); 191 ro->ro_rt = (struct rtentry *)0; 192 } 193#ifdef IPFIREWALL_FORWARD 194 if (ro->ro_rt == NULL && fwd_tag == NULL) { 195#else 196 if (ro->ro_rt == NULL) { 197#endif 198 bzero(dst, sizeof(*dst)); 199 dst->sin_family = AF_INET; 200 dst->sin_len = sizeof(*dst); 201 dst->sin_addr = ip->ip_dst; 202 } 203 /* 204 * If routing to interface only, 205 * short circuit routing lookup. 206 */ 207 if (flags & IP_ROUTETOIF) { 208 if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == NULL && 209 (ia = ifatoia(ifa_ifwithnet(sintosa(dst)))) == NULL) { 210 ipstat.ips_noroute++; 211 error = ENETUNREACH; 212 goto bad; 213 } 214 ifp = ia->ia_ifp; 215 ip->ip_ttl = 1; 216 isbroadcast = in_broadcast(dst->sin_addr, ifp); 217 } else if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) && 218 imo != NULL && imo->imo_multicast_ifp != NULL) { 219 /* 220 * Bypass the normal routing lookup for multicast 221 * packets if the interface is specified. 222 */ 223 ifp = imo->imo_multicast_ifp; 224 IFP_TO_IA(ifp, ia); 225 isbroadcast = 0; /* fool gcc */ 226 } else { 227 /* 228 * We want to do any cloning requested by the link layer, 229 * as this is probably required in all cases for correct 230 * operation (as it is for ARP). 231 */ 232 if (ro->ro_rt == NULL) 233 rtalloc_ign(ro, 0); 234 if (ro->ro_rt == NULL) { 235 ipstat.ips_noroute++; 236 error = EHOSTUNREACH; 237 goto bad; 238 } 239 ia = ifatoia(ro->ro_rt->rt_ifa); 240 ifp = ro->ro_rt->rt_ifp; 241 ro->ro_rt->rt_rmx.rmx_pksent++; 242 if (ro->ro_rt->rt_flags & RTF_GATEWAY) 243 dst = (struct sockaddr_in *)ro->ro_rt->rt_gateway; 244 if (ro->ro_rt->rt_flags & RTF_HOST) 245 isbroadcast = (ro->ro_rt->rt_flags & RTF_BROADCAST); 246 else 247 isbroadcast = in_broadcast(dst->sin_addr, ifp); 248 } 249 if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { 250 struct in_multi *inm; 251 252 m->m_flags |= M_MCAST; 253 /* 254 * IP destination address is multicast. Make sure "dst" 255 * still points to the address in "ro". (It may have been 256 * changed to point to a gateway address, above.) 257 */ 258 dst = (struct sockaddr_in *)&ro->ro_dst; 259 /* 260 * See if the caller provided any multicast options 261 */ 262 if (imo != NULL) { 263 ip->ip_ttl = imo->imo_multicast_ttl; 264 if (imo->imo_multicast_vif != -1) 265 ip->ip_src.s_addr = 266 ip_mcast_src ? 267 ip_mcast_src(imo->imo_multicast_vif) : 268 INADDR_ANY; 269 } else 270 ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL; 271 /* 272 * Confirm that the outgoing interface supports multicast. 273 */ 274 if ((imo == NULL) || (imo->imo_multicast_vif == -1)) { 275 if ((ifp->if_flags & IFF_MULTICAST) == 0) { 276 ipstat.ips_noroute++; 277 error = ENETUNREACH; 278 goto bad; 279 } 280 } 281 /* 282 * If source address not specified yet, use address 283 * of outgoing interface. 284 */ 285 if (ip->ip_src.s_addr == INADDR_ANY) { 286 /* Interface may have no addresses. */ 287 if (ia != NULL) 288 ip->ip_src = IA_SIN(ia)->sin_addr; 289 } 290 291 IN_MULTI_LOCK(); 292 IN_LOOKUP_MULTI(ip->ip_dst, ifp, inm); 293 if (inm != NULL && 294 (imo == NULL || imo->imo_multicast_loop)) { 295 IN_MULTI_UNLOCK(); 296 /* 297 * If we belong to the destination multicast group 298 * on the outgoing interface, and the caller did not 299 * forbid loopback, loop back a copy. 300 */ 301 ip_mloopback(ifp, m, dst, hlen); 302 } 303 else { 304 IN_MULTI_UNLOCK(); 305 /* 306 * If we are acting as a multicast router, perform 307 * multicast forwarding as if the packet had just 308 * arrived on the interface to which we are about 309 * to send. The multicast forwarding function 310 * recursively calls this function, using the 311 * IP_FORWARDING flag to prevent infinite recursion. 312 * 313 * Multicasts that are looped back by ip_mloopback(), 314 * above, will be forwarded by the ip_input() routine, 315 * if necessary. 316 */ 317 if (ip_mrouter && (flags & IP_FORWARDING) == 0) { 318 /* 319 * If rsvp daemon is not running, do not 320 * set ip_moptions. This ensures that the packet 321 * is multicast and not just sent down one link 322 * as prescribed by rsvpd. 323 */ 324 if (!rsvp_on) 325 imo = NULL; 326 if (ip_mforward && 327 ip_mforward(ip, ifp, m, imo) != 0) { 328 m_freem(m); 329 goto done; 330 } 331 } 332 } 333 334 /* 335 * Multicasts with a time-to-live of zero may be looped- 336 * back, above, but must not be transmitted on a network. 337 * Also, multicasts addressed to the loopback interface 338 * are not sent -- the above call to ip_mloopback() will 339 * loop back a copy if this host actually belongs to the 340 * destination group on the loopback interface. 341 */ 342 if (ip->ip_ttl == 0 || ifp->if_flags & IFF_LOOPBACK) { 343 m_freem(m); 344 goto done; 345 } 346 347 goto sendit; 348 } 349#ifndef notdef 350 /* 351 * If the source address is not specified yet, use the address 352 * of the outoing interface. 353 */ 354 if (ip->ip_src.s_addr == INADDR_ANY) { 355 /* Interface may have no addresses. */ 356 if (ia != NULL) { 357 ip->ip_src = IA_SIN(ia)->sin_addr; 358 } 359 } 360#endif /* notdef */ 361 /* 362 * Verify that we have any chance at all of being able to queue the 363 * packet or packet fragments, unless ALTQ is enabled on the given 364 * interface in which case packetdrop should be done by queueing. 365 */ 366#ifdef ALTQ 367 if ((!ALTQ_IS_ENABLED(&ifp->if_snd)) && 368 ((ifp->if_snd.ifq_len + ip->ip_len / ifp->if_mtu + 1) >= 369 ifp->if_snd.ifq_maxlen)) 370#else 371 if ((ifp->if_snd.ifq_len + ip->ip_len / ifp->if_mtu + 1) >= 372 ifp->if_snd.ifq_maxlen) 373#endif /* ALTQ */ 374 { 375 error = ENOBUFS; 376 ipstat.ips_odropped++; 377 ifp->if_snd.ifq_drops += (ip->ip_len / ifp->if_mtu + 1); 378 goto bad; 379 } 380 381 /* 382 * Look for broadcast address and 383 * verify user is allowed to send 384 * such a packet. 385 */ 386 if (isbroadcast) { 387 if ((ifp->if_flags & IFF_BROADCAST) == 0) { 388 error = EADDRNOTAVAIL; 389 goto bad; 390 } 391 if ((flags & IP_ALLOWBROADCAST) == 0) { 392 error = EACCES; 393 goto bad; 394 } 395 /* don't allow broadcast messages to be fragmented */ 396 if (ip->ip_len > ifp->if_mtu) { 397 error = EMSGSIZE; 398 goto bad; 399 } 400 if (flags & IP_SENDONES) 401 ip->ip_dst.s_addr = INADDR_BROADCAST; 402 m->m_flags |= M_BCAST; 403 } else { 404 m->m_flags &= ~M_BCAST; 405 } 406 407sendit: 408#ifdef IPSEC 409 /* get SP for this packet */ 410 if (inp == NULL) 411 sp = ipsec4_getpolicybyaddr(m, IPSEC_DIR_OUTBOUND, 412 flags, &error); 413 else 414 sp = ipsec4_getpolicybypcb(m, IPSEC_DIR_OUTBOUND, inp, &error); 415 416 if (sp == NULL) { 417 ipsecstat.out_inval++; 418 goto bad; 419 } 420 421 error = 0; 422 423 /* check policy */ 424 switch (sp->policy) { 425 case IPSEC_POLICY_DISCARD: 426 /* 427 * This packet is just discarded. 428 */ 429 ipsecstat.out_polvio++; 430 goto bad; 431 432 case IPSEC_POLICY_BYPASS: 433 case IPSEC_POLICY_NONE: 434 case IPSEC_POLICY_TCP: 435 /* no need to do IPsec. */ 436 goto skip_ipsec; 437 438 case IPSEC_POLICY_IPSEC: 439 if (sp->req == NULL) { 440 /* acquire a policy */ 441 error = key_spdacquire(sp); 442 goto bad; 443 } 444 break; 445 446 case IPSEC_POLICY_ENTRUST: 447 default: 448 printf("ip_output: Invalid policy found. %d\n", sp->policy); 449 } 450 { 451 struct ipsec_output_state state; 452 bzero(&state, sizeof(state)); 453 state.m = m; 454 if (flags & IP_ROUTETOIF) { 455 state.ro = &iproute; 456 bzero(&iproute, sizeof(iproute)); 457 } else 458 state.ro = ro; 459 state.dst = (struct sockaddr *)dst; 460 461 ip->ip_sum = 0; 462 463 /* 464 * XXX 465 * delayed checksums are not currently compatible with IPsec 466 */ 467 if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { 468 in_delayed_cksum(m); 469 m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; 470 } 471 472 ip->ip_len = htons(ip->ip_len); 473 ip->ip_off = htons(ip->ip_off); 474 475 error = ipsec4_output(&state, sp, flags); 476 477 m = state.m; 478 if (flags & IP_ROUTETOIF) { 479 /* 480 * if we have tunnel mode SA, we may need to ignore 481 * IP_ROUTETOIF. 482 */ 483 if (state.ro != &iproute || state.ro->ro_rt != NULL) { 484 flags &= ~IP_ROUTETOIF; 485 ro = state.ro; 486 } 487 } else 488 ro = state.ro; 489 dst = (struct sockaddr_in *)state.dst; 490 if (error) { 491 /* mbuf is already reclaimed in ipsec4_output. */ 492 m = NULL; 493 switch (error) { 494 case EHOSTUNREACH: 495 case ENETUNREACH: 496 case EMSGSIZE: 497 case ENOBUFS: 498 case ENOMEM: 499 break; 500 default: 501 printf("ip4_output (ipsec): error code %d\n", error); 502 /*fall through*/ 503 case ENOENT: 504 /* don't show these error codes to the user */ 505 error = 0; 506 break; 507 } 508 goto bad; 509 } 510 511 /* be sure to update variables that are affected by ipsec4_output() */ 512 ip = mtod(m, struct ip *); 513 hlen = ip->ip_hl << 2; 514 if (ro->ro_rt == NULL) { 515 if ((flags & IP_ROUTETOIF) == 0) { 516 printf("ip_output: " 517 "can't update route after IPsec processing\n"); 518 error = EHOSTUNREACH; /*XXX*/ 519 goto bad; 520 } 521 } else { 522 if (state.encap) { 523 ia = ifatoia(ro->ro_rt->rt_ifa); 524 ifp = ro->ro_rt->rt_ifp; 525 } 526 } 527 } 528 529 /* make it flipped, again. */ 530 ip->ip_len = ntohs(ip->ip_len); 531 ip->ip_off = ntohs(ip->ip_off); 532skip_ipsec: 533#endif /*IPSEC*/ 534#ifdef FAST_IPSEC 535 /* 536 * Check the security policy (SP) for the packet and, if 537 * required, do IPsec-related processing. There are two 538 * cases here; the first time a packet is sent through 539 * it will be untagged and handled by ipsec4_checkpolicy. 540 * If the packet is resubmitted to ip_output (e.g. after 541 * AH, ESP, etc. processing), there will be a tag to bypass 542 * the lookup and related policy checking. 543 */ 544 mtag = m_tag_find(m, PACKET_TAG_IPSEC_PENDING_TDB, NULL); 545 s = splnet(); 546 if (mtag != NULL) { 547 tdbi = (struct tdb_ident *)(mtag + 1); 548 sp = ipsec_getpolicy(tdbi, IPSEC_DIR_OUTBOUND); 549 if (sp == NULL) 550 error = -EINVAL; /* force silent drop */ 551 m_tag_delete(m, mtag); 552 } else { 553 sp = ipsec4_checkpolicy(m, IPSEC_DIR_OUTBOUND, flags, 554 &error, inp); 555 } 556 /* 557 * There are four return cases: 558 * sp != NULL apply IPsec policy 559 * sp == NULL, error == 0 no IPsec handling needed 560 * sp == NULL, error == -EINVAL discard packet w/o error 561 * sp == NULL, error != 0 discard packet, report error 562 */ 563 if (sp != NULL) { 564 /* Loop detection, check if ipsec processing already done */ 565 KASSERT(sp->req != NULL, ("ip_output: no ipsec request")); 566 for (mtag = m_tag_first(m); mtag != NULL; 567 mtag = m_tag_next(m, mtag)) { 568 if (mtag->m_tag_cookie != MTAG_ABI_COMPAT) 569 continue; 570 if (mtag->m_tag_id != PACKET_TAG_IPSEC_OUT_DONE && 571 mtag->m_tag_id != PACKET_TAG_IPSEC_OUT_CRYPTO_NEEDED) 572 continue; 573 /* 574 * Check if policy has an SA associated with it. 575 * This can happen when an SP has yet to acquire 576 * an SA; e.g. on first reference. If it occurs, 577 * then we let ipsec4_process_packet do its thing. 578 */ 579 if (sp->req->sav == NULL) 580 break; 581 tdbi = (struct tdb_ident *)(mtag + 1); 582 if (tdbi->spi == sp->req->sav->spi && 583 tdbi->proto == sp->req->sav->sah->saidx.proto && 584 bcmp(&tdbi->dst, &sp->req->sav->sah->saidx.dst, 585 sizeof (union sockaddr_union)) == 0) { 586 /* 587 * No IPsec processing is needed, free 588 * reference to SP. 589 * 590 * NB: null pointer to avoid free at 591 * done: below. 592 */ 593 KEY_FREESP(&sp), sp = NULL; 594 splx(s); 595 goto spd_done; 596 } 597 } 598 599 /* 600 * Do delayed checksums now because we send before 601 * this is done in the normal processing path. 602 */ 603 if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { 604 in_delayed_cksum(m); 605 m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; 606 } 607 608 ip->ip_len = htons(ip->ip_len); 609 ip->ip_off = htons(ip->ip_off); 610 611 /* NB: callee frees mbuf */ 612 error = ipsec4_process_packet(m, sp->req, flags, 0); 613 /* 614 * Preserve KAME behaviour: ENOENT can be returned 615 * when an SA acquire is in progress. Don't propagate 616 * this to user-level; it confuses applications. 617 * 618 * XXX this will go away when the SADB is redone. 619 */ 620 if (error == ENOENT) 621 error = 0; 622 splx(s); 623 goto done; 624 } else { 625 splx(s); 626 627 if (error != 0) { 628 /* 629 * Hack: -EINVAL is used to signal that a packet 630 * should be silently discarded. This is typically 631 * because we asked key management for an SA and 632 * it was delayed (e.g. kicked up to IKE). 633 */ 634 if (error == -EINVAL) 635 error = 0; 636 goto bad; 637 } else { 638 /* No IPsec processing for this packet. */ 639 } 640#ifdef notyet 641 /* 642 * If deferred crypto processing is needed, check that 643 * the interface supports it. 644 */ 645 mtag = m_tag_find(m, PACKET_TAG_IPSEC_OUT_CRYPTO_NEEDED, NULL); 646 if (mtag != NULL && (ifp->if_capenable & IFCAP_IPSEC) == 0) { 647 /* notify IPsec to do its own crypto */ 648 ipsp_skipcrypto_unmark((struct tdb_ident *)(mtag + 1)); 649 error = EHOSTUNREACH; 650 goto bad; 651 } 652#endif 653 } 654spd_done: 655#endif /* FAST_IPSEC */ 656 657 /* Jump over all PFIL processing if hooks are not active. */ 658 if (inet_pfil_hook.ph_busy_count == -1) 659 goto passout; 660 661 /* Run through list of hooks for output packets. */ 662 odst.s_addr = ip->ip_dst.s_addr; 663 error = pfil_run_hooks(&inet_pfil_hook, &m, ifp, PFIL_OUT, inp); 664 if (error != 0 || m == NULL) 665 goto done; 666 667 ip = mtod(m, struct ip *); 668 669 /* See if destination IP address was changed by packet filter. */ 670 if (odst.s_addr != ip->ip_dst.s_addr) { 671 m->m_flags |= M_SKIP_FIREWALL; 672 /* If destination is now ourself drop to ip_input(). */ 673 if (in_localip(ip->ip_dst)) { 674 m->m_flags |= M_FASTFWD_OURS; 675 if (m->m_pkthdr.rcvif == NULL) 676 m->m_pkthdr.rcvif = loif; 677 if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { 678 m->m_pkthdr.csum_flags |= 679 CSUM_DATA_VALID | CSUM_PSEUDO_HDR; 680 m->m_pkthdr.csum_data = 0xffff; 681 } 682 m->m_pkthdr.csum_flags |= 683 CSUM_IP_CHECKED | CSUM_IP_VALID; 684 685 error = netisr_queue(NETISR_IP, m); 686 goto done; 687 } else 688 goto again; /* Redo the routing table lookup. */ 689 } 690 691#ifdef IPFIREWALL_FORWARD 692 /* See if local, if yes, send it to netisr with IP_FASTFWD_OURS. */ 693 if (m->m_flags & M_FASTFWD_OURS) { 694 if (m->m_pkthdr.rcvif == NULL) 695 m->m_pkthdr.rcvif = loif; 696 if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { 697 m->m_pkthdr.csum_flags |= 698 CSUM_DATA_VALID | CSUM_PSEUDO_HDR; 699 m->m_pkthdr.csum_data = 0xffff; 700 } 701 m->m_pkthdr.csum_flags |= 702 CSUM_IP_CHECKED | CSUM_IP_VALID; 703 704 error = netisr_queue(NETISR_IP, m); 705 goto done; 706 } 707 /* Or forward to some other address? */ 708 fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL); 709 if (fwd_tag) { 710#ifndef IPFIREWALL_FORWARD_EXTENDED 711 if (!in_localip(ip->ip_src) && !in_localaddr(ip->ip_dst)) { 712#endif 713 dst = (struct sockaddr_in *)&ro->ro_dst; 714 bcopy((fwd_tag+1), dst, sizeof(struct sockaddr_in)); 715 m->m_flags |= M_SKIP_FIREWALL; 716 m_tag_delete(m, fwd_tag); 717 goto again; 718#ifndef IPFIREWALL_FORWARD_EXTENDED 719 } else { 720 m_tag_delete(m, fwd_tag); 721 /* Continue. */ 722 } 723#endif 724 } 725#endif /* IPFIREWALL_FORWARD */ 726 727passout: 728 /* 127/8 must not appear on wire - RFC1122. */ 729 if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET || 730 (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) { 731 if ((ifp->if_flags & IFF_LOOPBACK) == 0) { 732 ipstat.ips_badaddr++; 733 error = EADDRNOTAVAIL; 734 goto bad; 735 } 736 } 737 738 m->m_pkthdr.csum_flags |= CSUM_IP; 739 sw_csum = m->m_pkthdr.csum_flags & ~ifp->if_hwassist; 740 if (sw_csum & CSUM_DELAY_DATA) { 741 in_delayed_cksum(m); 742 sw_csum &= ~CSUM_DELAY_DATA; 743 } 744 m->m_pkthdr.csum_flags &= ifp->if_hwassist; 745 746 /* 747 * If small enough for interface, or the interface will take 748 * care of the fragmentation for us, can just send directly. 749 */ 750 if (ip->ip_len <= ifp->if_mtu || (ifp->if_hwassist & CSUM_FRAGMENT && 751 ((ip->ip_off & IP_DF) == 0))) { 752 ip->ip_len = htons(ip->ip_len); 753 ip->ip_off = htons(ip->ip_off); 754 ip->ip_sum = 0; 755 if (sw_csum & CSUM_DELAY_IP) 756 ip->ip_sum = in_cksum(m, hlen); 757 758 /* Record statistics for this interface address. */ 759 if (!(flags & IP_FORWARDING) && ia) { 760 ia->ia_ifa.if_opackets++; 761 ia->ia_ifa.if_obytes += m->m_pkthdr.len; 762 } 763 764#ifdef IPSEC 765 /* clean ipsec history once it goes out of the node */ 766 ipsec_delaux(m); 767#endif 768 769#ifdef MBUF_STRESS_TEST 770 if (mbuf_frag_size && m->m_pkthdr.len > mbuf_frag_size) 771 m = m_fragment(m, M_DONTWAIT, mbuf_frag_size); 772#endif 773 /* 774 * Reset layer specific mbuf flags 775 * to avoid confusing lower layers. 776 */ 777 m->m_flags &= ~(M_PROTOFLAGS); 778 779 error = (*ifp->if_output)(ifp, m, 780 (struct sockaddr *)dst, ro->ro_rt); 781 goto done; 782 } 783 784 if (ip->ip_off & IP_DF) { 785 error = EMSGSIZE; 786 /* 787 * This case can happen if the user changed the MTU 788 * of an interface after enabling IP on it. Because 789 * most netifs don't keep track of routes pointing to 790 * them, there is no way for one to update all its 791 * routes when the MTU is changed. 792 */ 793 if (ro != NULL && 794 (ro->ro_rt->rt_flags & (RTF_UP | RTF_HOST)) && 795 (ro->ro_rt->rt_rmx.rmx_mtu > ifp->if_mtu)) { 796 ro->ro_rt->rt_rmx.rmx_mtu = ifp->if_mtu; 797 } 798 ipstat.ips_cantfrag++; 799 goto bad; 800 } 801 802 /* 803 * Too large for interface; fragment if possible. If successful, 804 * on return, m will point to a list of packets to be sent. 805 */ 806 error = ip_fragment(ip, &m, ifp->if_mtu, ifp->if_hwassist, sw_csum); 807 if (error) 808 goto bad; 809 for (; m; m = m0) { 810 m0 = m->m_nextpkt; 811 m->m_nextpkt = 0; 812#ifdef IPSEC 813 /* clean ipsec history once it goes out of the node */ 814 ipsec_delaux(m); 815#endif 816 if (error == 0) { 817 /* Record statistics for this interface address. */ 818 if (ia != NULL) { 819 ia->ia_ifa.if_opackets++; 820 ia->ia_ifa.if_obytes += m->m_pkthdr.len; 821 } 822 /* 823 * Reset layer specific mbuf flags 824 * to avoid confusing upper layers. 825 */ 826 m->m_flags &= ~(M_PROTOFLAGS); 827 828 error = (*ifp->if_output)(ifp, m, 829 (struct sockaddr *)dst, ro->ro_rt); 830 } else 831 m_freem(m); 832 } 833 834 if (error == 0) 835 ipstat.ips_fragmented++; 836 837done: 838 if (ro == &iproute && ro->ro_rt) { 839 RTFREE(ro->ro_rt); 840 } 841#ifdef IPSEC 842 if (sp != NULL) { 843 KEYDEBUG(KEYDEBUG_IPSEC_STAMP, 844 printf("DP ip_output call free SP:%p\n", sp)); 845 key_freesp(sp); 846 } 847#endif 848#ifdef FAST_IPSEC 849 if (sp != NULL) 850 KEY_FREESP(&sp); 851#endif 852 return (error); 853bad: 854 m_freem(m); 855 goto done; 856} 857 858/* 859 * Create a chain of fragments which fit the given mtu. m_frag points to the 860 * mbuf to be fragmented; on return it points to the chain with the fragments. 861 * Return 0 if no error. If error, m_frag may contain a partially built 862 * chain of fragments that should be freed by the caller. 863 * 864 * if_hwassist_flags is the hw offload capabilities (see if_data.ifi_hwassist) 865 * sw_csum contains the delayed checksums flags (e.g., CSUM_DELAY_IP). 866 */ 867int 868ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu, 869 u_long if_hwassist_flags, int sw_csum) 870{ 871 int error = 0; 872 int hlen = ip->ip_hl << 2; 873 int len = (mtu - hlen) & ~7; /* size of payload in each fragment */ 874 int off; 875 struct mbuf *m0 = *m_frag; /* the original packet */ 876 int firstlen; 877 struct mbuf **mnext; 878 int nfrags; 879 880 if (ip->ip_off & IP_DF) { /* Fragmentation not allowed */ 881 ipstat.ips_cantfrag++; 882 return EMSGSIZE; 883 } 884 885 /* 886 * Must be able to put at least 8 bytes per fragment. 887 */ 888 if (len < 8) 889 return EMSGSIZE; 890 891 /* 892 * If the interface will not calculate checksums on 893 * fragmented packets, then do it here. 894 */ 895 if (m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA && 896 (if_hwassist_flags & CSUM_IP_FRAGS) == 0) { 897 in_delayed_cksum(m0); 898 m0->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; 899 } 900 901 if (len > PAGE_SIZE) { 902 /* 903 * Fragment large datagrams such that each segment 904 * contains a multiple of PAGE_SIZE amount of data, 905 * plus headers. This enables a receiver to perform 906 * page-flipping zero-copy optimizations. 907 * 908 * XXX When does this help given that sender and receiver 909 * could have different page sizes, and also mtu could 910 * be less than the receiver's page size ? 911 */ 912 int newlen; 913 struct mbuf *m; 914 915 for (m = m0, off = 0; m && (off+m->m_len) <= mtu; m = m->m_next) 916 off += m->m_len; 917 918 /* 919 * firstlen (off - hlen) must be aligned on an 920 * 8-byte boundary 921 */ 922 if (off < hlen) 923 goto smart_frag_failure; 924 off = ((off - hlen) & ~7) + hlen; 925 newlen = (~PAGE_MASK) & mtu; 926 if ((newlen + sizeof (struct ip)) > mtu) { 927 /* we failed, go back the default */ 928smart_frag_failure: 929 newlen = len; 930 off = hlen + len; 931 } 932 len = newlen; 933 934 } else { 935 off = hlen + len; 936 } 937 938 firstlen = off - hlen; 939 mnext = &m0->m_nextpkt; /* pointer to next packet */ 940 941 /* 942 * Loop through length of segment after first fragment, 943 * make new header and copy data of each part and link onto chain. 944 * Here, m0 is the original packet, m is the fragment being created. 945 * The fragments are linked off the m_nextpkt of the original 946 * packet, which after processing serves as the first fragment. 947 */ 948 for (nfrags = 1; off < ip->ip_len; off += len, nfrags++) { 949 struct ip *mhip; /* ip header on the fragment */ 950 struct mbuf *m; 951 int mhlen = sizeof (struct ip); 952 953 MGETHDR(m, M_DONTWAIT, MT_DATA); 954 if (m == NULL) { 955 error = ENOBUFS; 956 ipstat.ips_odropped++; 957 goto done; 958 } 959 m->m_flags |= (m0->m_flags & M_MCAST) | M_FRAG; 960 /* 961 * In the first mbuf, leave room for the link header, then 962 * copy the original IP header including options. The payload 963 * goes into an additional mbuf chain returned by m_copy(). 964 */ 965 m->m_data += max_linkhdr; 966 mhip = mtod(m, struct ip *); 967 *mhip = *ip; 968 if (hlen > sizeof (struct ip)) { 969 mhlen = ip_optcopy(ip, mhip) + sizeof (struct ip); 970 mhip->ip_v = IPVERSION; 971 mhip->ip_hl = mhlen >> 2; 972 } 973 m->m_len = mhlen; 974 /* XXX do we need to add ip->ip_off below ? */ 975 mhip->ip_off = ((off - hlen) >> 3) + ip->ip_off; 976 if (off + len >= ip->ip_len) { /* last fragment */ 977 len = ip->ip_len - off; 978 m->m_flags |= M_LASTFRAG; 979 } else 980 mhip->ip_off |= IP_MF; 981 mhip->ip_len = htons((u_short)(len + mhlen)); 982 m->m_next = m_copy(m0, off, len); 983 if (m->m_next == NULL) { /* copy failed */ 984 m_free(m); 985 error = ENOBUFS; /* ??? */ 986 ipstat.ips_odropped++; 987 goto done; 988 } 989 m->m_pkthdr.len = mhlen + len; 990 m->m_pkthdr.rcvif = NULL; 991#ifdef MAC 992 mac_create_fragment(m0, m); 993#endif 994 m->m_pkthdr.csum_flags = m0->m_pkthdr.csum_flags; 995 mhip->ip_off = htons(mhip->ip_off); 996 mhip->ip_sum = 0; 997 if (sw_csum & CSUM_DELAY_IP) 998 mhip->ip_sum = in_cksum(m, mhlen); 999 *mnext = m; 1000 mnext = &m->m_nextpkt; 1001 } 1002 ipstat.ips_ofragments += nfrags; 1003 1004 /* set first marker for fragment chain */ 1005 m0->m_flags |= M_FIRSTFRAG | M_FRAG; 1006 m0->m_pkthdr.csum_data = nfrags; 1007 1008 /* 1009 * Update first fragment by trimming what's been copied out 1010 * and updating header. 1011 */ 1012 m_adj(m0, hlen + firstlen - ip->ip_len); 1013 m0->m_pkthdr.len = hlen + firstlen; 1014 ip->ip_len = htons((u_short)m0->m_pkthdr.len); 1015 ip->ip_off |= IP_MF; 1016 ip->ip_off = htons(ip->ip_off); 1017 ip->ip_sum = 0; 1018 if (sw_csum & CSUM_DELAY_IP) 1019 ip->ip_sum = in_cksum(m0, hlen); 1020 1021done: 1022 *m_frag = m0; 1023 return error; 1024} 1025 1026void 1027in_delayed_cksum(struct mbuf *m) 1028{ 1029 struct ip *ip; 1030 u_short csum, offset; 1031 1032 ip = mtod(m, struct ip *); 1033 offset = ip->ip_hl << 2 ; 1034 csum = in_cksum_skip(m, ip->ip_len, offset); 1035 if (m->m_pkthdr.csum_flags & CSUM_UDP && csum == 0) 1036 csum = 0xffff; 1037 offset += m->m_pkthdr.csum_data; /* checksum offset */ 1038 1039 if (offset + sizeof(u_short) > m->m_len) { 1040 printf("delayed m_pullup, m->len: %d off: %d p: %d\n", 1041 m->m_len, offset, ip->ip_p); 1042 /* 1043 * XXX 1044 * this shouldn't happen, but if it does, the 1045 * correct behavior may be to insert the checksum 1046 * in the existing chain instead of rearranging it. 1047 */ 1048 m = m_pullup(m, offset + sizeof(u_short)); 1049 } 1050 *(u_short *)(m->m_data + offset) = csum; 1051} 1052 1053/* 1054 * IP socket option processing. 1055 */ 1056int 1057ip_ctloutput(so, sopt) 1058 struct socket *so; 1059 struct sockopt *sopt; 1060{ 1061 struct inpcb *inp = sotoinpcb(so); 1062 int error, optval; 1063 1064 error = optval = 0; 1065 if (sopt->sopt_level != IPPROTO_IP) { 1066 return (EINVAL); 1067 } 1068 1069 switch (sopt->sopt_dir) { 1070 case SOPT_SET: 1071 switch (sopt->sopt_name) { 1072 case IP_OPTIONS: 1073#ifdef notyet 1074 case IP_RETOPTS: 1075#endif 1076 { 1077 struct mbuf *m; 1078 if (sopt->sopt_valsize > MLEN) { 1079 error = EMSGSIZE; 1080 break; 1081 } 1082 MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA); 1083 if (m == NULL) { 1084 error = ENOBUFS; 1085 break; 1086 } 1087 m->m_len = sopt->sopt_valsize; 1088 error = sooptcopyin(sopt, mtod(m, char *), m->m_len, 1089 m->m_len); 1090 INP_LOCK(inp); 1091 error = ip_pcbopts(inp, sopt->sopt_name, m); 1092 INP_UNLOCK(inp); 1093 return (error); 1094 } 1095 1096 case IP_TOS: 1097 case IP_TTL: 1098 case IP_MINTTL: 1099 case IP_RECVOPTS: 1100 case IP_RECVRETOPTS: 1101 case IP_RECVDSTADDR: 1102 case IP_RECVTTL: 1103 case IP_RECVIF: 1104 case IP_FAITH: 1105 case IP_ONESBCAST: 1106 case IP_DONTFRAG: 1107 error = sooptcopyin(sopt, &optval, sizeof optval, 1108 sizeof optval); 1109 if (error) 1110 break; 1111 1112 switch (sopt->sopt_name) { 1113 case IP_TOS: 1114 inp->inp_ip_tos = optval; 1115 break; 1116 1117 case IP_TTL: 1118 inp->inp_ip_ttl = optval; 1119 break; 1120 1121 case IP_MINTTL: 1122 if (optval > 0 && optval <= MAXTTL) 1123 inp->inp_ip_minttl = optval; 1124 else 1125 error = EINVAL; 1126 break; 1127 1128#define OPTSET(bit) do { \ 1129 INP_LOCK(inp); \ 1130 if (optval) \ 1131 inp->inp_flags |= bit; \ 1132 else \ 1133 inp->inp_flags &= ~bit; \ 1134 INP_UNLOCK(inp); \ 1135} while (0) 1136 1137 case IP_RECVOPTS: 1138 OPTSET(INP_RECVOPTS); 1139 break; 1140 1141 case IP_RECVRETOPTS: 1142 OPTSET(INP_RECVRETOPTS); 1143 break; 1144 1145 case IP_RECVDSTADDR: 1146 OPTSET(INP_RECVDSTADDR); 1147 break; 1148 1149 case IP_RECVTTL: 1150 OPTSET(INP_RECVTTL); 1151 break; 1152 1153 case IP_RECVIF: 1154 OPTSET(INP_RECVIF); 1155 break; 1156 1157 case IP_FAITH: 1158 OPTSET(INP_FAITH); 1159 break; 1160 1161 case IP_ONESBCAST: 1162 OPTSET(INP_ONESBCAST); 1163 break; 1164 case IP_DONTFRAG: 1165 OPTSET(INP_DONTFRAG); 1166 break; 1167 } 1168 break; 1169#undef OPTSET 1170 1171 case IP_MULTICAST_IF: 1172 case IP_MULTICAST_VIF: 1173 case IP_MULTICAST_TTL: 1174 case IP_MULTICAST_LOOP: 1175 case IP_ADD_MEMBERSHIP: 1176 case IP_DROP_MEMBERSHIP: 1177 error = ip_setmoptions(inp, sopt); 1178 break; 1179 1180 case IP_PORTRANGE: 1181 error = sooptcopyin(sopt, &optval, sizeof optval, 1182 sizeof optval); 1183 if (error) 1184 break; 1185 1186 INP_LOCK(inp); 1187 switch (optval) { 1188 case IP_PORTRANGE_DEFAULT: 1189 inp->inp_flags &= ~(INP_LOWPORT); 1190 inp->inp_flags &= ~(INP_HIGHPORT); 1191 break; 1192 1193 case IP_PORTRANGE_HIGH: 1194 inp->inp_flags &= ~(INP_LOWPORT); 1195 inp->inp_flags |= INP_HIGHPORT; 1196 break; 1197 1198 case IP_PORTRANGE_LOW: 1199 inp->inp_flags &= ~(INP_HIGHPORT); 1200 inp->inp_flags |= INP_LOWPORT; 1201 break; 1202 1203 default: 1204 error = EINVAL; 1205 break; 1206 } 1207 INP_UNLOCK(inp); 1208 break; 1209 1210#if defined(IPSEC) || defined(FAST_IPSEC) 1211 case IP_IPSEC_POLICY: 1212 { 1213 caddr_t req; 1214 size_t len = 0; 1215 int priv; 1216 struct mbuf *m; 1217 int optname; 1218 1219 if ((error = soopt_getm(sopt, &m)) != 0) /* XXX */ 1220 break; 1221 if ((error = soopt_mcopyin(sopt, m)) != 0) /* XXX */ 1222 break; 1223 priv = (sopt->sopt_td != NULL && 1224 suser(sopt->sopt_td) != 0) ? 0 : 1; 1225 req = mtod(m, caddr_t); 1226 len = m->m_len; 1227 optname = sopt->sopt_name; 1228 error = ipsec4_set_policy(inp, optname, req, len, priv); 1229 m_freem(m); 1230 break; 1231 } 1232#endif /*IPSEC*/ 1233 1234 default: 1235 error = ENOPROTOOPT; 1236 break; 1237 } 1238 break; 1239 1240 case SOPT_GET: 1241 switch (sopt->sopt_name) { 1242 case IP_OPTIONS: 1243 case IP_RETOPTS: 1244 if (inp->inp_options) 1245 error = sooptcopyout(sopt, 1246 mtod(inp->inp_options, 1247 char *), 1248 inp->inp_options->m_len); 1249 else 1250 sopt->sopt_valsize = 0; 1251 break; 1252 1253 case IP_TOS: 1254 case IP_TTL: 1255 case IP_MINTTL: 1256 case IP_RECVOPTS: 1257 case IP_RECVRETOPTS: 1258 case IP_RECVDSTADDR: 1259 case IP_RECVTTL: 1260 case IP_RECVIF: 1261 case IP_PORTRANGE: 1262 case IP_FAITH: 1263 case IP_ONESBCAST: 1264 case IP_DONTFRAG: 1265 switch (sopt->sopt_name) { 1266 1267 case IP_TOS: 1268 optval = inp->inp_ip_tos; 1269 break; 1270 1271 case IP_TTL: 1272 optval = inp->inp_ip_ttl; 1273 break; 1274 1275 case IP_MINTTL: 1276 optval = inp->inp_ip_minttl; 1277 break; 1278 1279#define OPTBIT(bit) (inp->inp_flags & bit ? 1 : 0) 1280 1281 case IP_RECVOPTS: 1282 optval = OPTBIT(INP_RECVOPTS); 1283 break; 1284 1285 case IP_RECVRETOPTS: 1286 optval = OPTBIT(INP_RECVRETOPTS); 1287 break; 1288 1289 case IP_RECVDSTADDR: 1290 optval = OPTBIT(INP_RECVDSTADDR); 1291 break; 1292 1293 case IP_RECVTTL: 1294 optval = OPTBIT(INP_RECVTTL); 1295 break; 1296 1297 case IP_RECVIF: 1298 optval = OPTBIT(INP_RECVIF); 1299 break; 1300 1301 case IP_PORTRANGE: 1302 if (inp->inp_flags & INP_HIGHPORT) 1303 optval = IP_PORTRANGE_HIGH; 1304 else if (inp->inp_flags & INP_LOWPORT) 1305 optval = IP_PORTRANGE_LOW; 1306 else 1307 optval = 0; 1308 break; 1309 1310 case IP_FAITH: 1311 optval = OPTBIT(INP_FAITH); 1312 break; 1313 1314 case IP_ONESBCAST: 1315 optval = OPTBIT(INP_ONESBCAST); 1316 break; 1317 case IP_DONTFRAG: 1318 optval = OPTBIT(INP_DONTFRAG); 1319 break; 1320 } 1321 error = sooptcopyout(sopt, &optval, sizeof optval); 1322 break; 1323 1324 case IP_MULTICAST_IF: 1325 case IP_MULTICAST_VIF: 1326 case IP_MULTICAST_TTL: 1327 case IP_MULTICAST_LOOP: 1328 case IP_ADD_MEMBERSHIP: 1329 case IP_DROP_MEMBERSHIP: 1330 error = ip_getmoptions(inp, sopt); 1331 break; 1332 1333#if defined(IPSEC) || defined(FAST_IPSEC) 1334 case IP_IPSEC_POLICY: 1335 { 1336 struct mbuf *m = NULL; 1337 caddr_t req = NULL; 1338 size_t len = 0; 1339 1340 if (m != 0) { 1341 req = mtod(m, caddr_t); 1342 len = m->m_len; 1343 } 1344 error = ipsec4_get_policy(sotoinpcb(so), req, len, &m); 1345 if (error == 0) 1346 error = soopt_mcopyout(sopt, m); /* XXX */ 1347 if (error == 0) 1348 m_freem(m); 1349 break; 1350 } 1351#endif /*IPSEC*/ 1352 1353 default: 1354 error = ENOPROTOOPT; 1355 break; 1356 } 1357 break; 1358 } 1359 return (error); 1360} 1361 1362/* 1363 * XXX 1364 * The whole multicast option thing needs to be re-thought. 1365 * Several of these options are equally applicable to non-multicast 1366 * transmission, and one (IP_MULTICAST_TTL) totally duplicates a 1367 * standard option (IP_TTL). 1368 */ 1369 1370/* 1371 * following RFC1724 section 3.3, 0.0.0.0/8 is interpreted as interface index. 1372 */ 1373static struct ifnet * 1374ip_multicast_if(a, ifindexp) 1375 struct in_addr *a; 1376 int *ifindexp; 1377{ 1378 int ifindex; 1379 struct ifnet *ifp; 1380 1381 if (ifindexp) 1382 *ifindexp = 0; 1383 if (ntohl(a->s_addr) >> 24 == 0) { 1384 ifindex = ntohl(a->s_addr) & 0xffffff; 1385 if (ifindex < 0 || if_index < ifindex) 1386 return NULL; 1387 ifp = ifnet_byindex(ifindex); 1388 if (ifindexp) 1389 *ifindexp = ifindex; 1390 } else { 1391 INADDR_TO_IFP(*a, ifp); 1392 } 1393 return ifp; 1394} 1395 1396/* 1397 * Given an inpcb, return its multicast options structure pointer. Accepts 1398 * an unlocked inpcb pointer, but will return it locked. May sleep. 1399 */ 1400static struct ip_moptions * 1401ip_findmoptions(struct inpcb *inp) 1402{ 1403 struct ip_moptions *imo; 1404 1405 INP_LOCK(inp); 1406 if (inp->inp_moptions != NULL) 1407 return (inp->inp_moptions); 1408 1409 INP_UNLOCK(inp); 1410 1411 imo = (struct ip_moptions*)malloc(sizeof(*imo), M_IPMOPTS, M_WAITOK); 1412 1413 imo->imo_multicast_ifp = NULL; 1414 imo->imo_multicast_addr.s_addr = INADDR_ANY; 1415 imo->imo_multicast_vif = -1; 1416 imo->imo_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; 1417 imo->imo_multicast_loop = IP_DEFAULT_MULTICAST_LOOP; 1418 imo->imo_num_memberships = 0; 1419 1420 INP_LOCK(inp); 1421 if (inp->inp_moptions != NULL) { 1422 free(imo, M_IPMOPTS); 1423 return (inp->inp_moptions); 1424 } 1425 inp->inp_moptions = imo; 1426 return (imo); 1427} 1428 1429/* 1430 * Set the IP multicast options in response to user setsockopt(). 1431 */ 1432static int 1433ip_setmoptions(struct inpcb *inp, struct sockopt *sopt) 1434{ 1435 int error = 0; 1436 int i; 1437 struct in_addr addr; 1438 struct ip_mreq mreq; 1439 struct ifnet *ifp; 1440 struct ip_moptions *imo; 1441 struct route ro; 1442 struct sockaddr_in *dst; 1443 int ifindex; 1444 int s; 1445 1446 switch (sopt->sopt_name) { 1447 /* store an index number for the vif you wanna use in the send */ 1448 case IP_MULTICAST_VIF: 1449 if (legal_vif_num == 0) { 1450 error = EOPNOTSUPP; 1451 break; 1452 } 1453 error = sooptcopyin(sopt, &i, sizeof i, sizeof i); 1454 if (error) 1455 break; 1456 if (!legal_vif_num(i) && (i != -1)) { 1457 error = EINVAL; 1458 break; 1459 } 1460 imo = ip_findmoptions(inp); 1461 imo->imo_multicast_vif = i; 1462 INP_UNLOCK(inp); 1463 break; 1464 1465 case IP_MULTICAST_IF: 1466 /* 1467 * Select the interface for outgoing multicast packets. 1468 */ 1469 error = sooptcopyin(sopt, &addr, sizeof addr, sizeof addr); 1470 if (error) 1471 break; 1472 /* 1473 * INADDR_ANY is used to remove a previous selection. 1474 * When no interface is selected, a default one is 1475 * chosen every time a multicast packet is sent. 1476 */ 1477 imo = ip_findmoptions(inp); 1478 if (addr.s_addr == INADDR_ANY) { 1479 imo->imo_multicast_ifp = NULL; 1480 INP_UNLOCK(inp); 1481 break; 1482 } 1483 /* 1484 * The selected interface is identified by its local 1485 * IP address. Find the interface and confirm that 1486 * it supports multicasting. 1487 */ 1488 s = splimp(); 1489 ifp = ip_multicast_if(&addr, &ifindex); 1490 if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) { 1491 INP_UNLOCK(inp); 1492 splx(s); 1493 error = EADDRNOTAVAIL; 1494 break; 1495 } 1496 imo->imo_multicast_ifp = ifp; 1497 if (ifindex) 1498 imo->imo_multicast_addr = addr; 1499 else 1500 imo->imo_multicast_addr.s_addr = INADDR_ANY; 1501 INP_UNLOCK(inp); 1502 splx(s); 1503 break; 1504 1505 case IP_MULTICAST_TTL: 1506 /* 1507 * Set the IP time-to-live for outgoing multicast packets. 1508 * The original multicast API required a char argument, 1509 * which is inconsistent with the rest of the socket API. 1510 * We allow either a char or an int. 1511 */ 1512 if (sopt->sopt_valsize == 1) { 1513 u_char ttl; 1514 error = sooptcopyin(sopt, &ttl, 1, 1); 1515 if (error) 1516 break; 1517 imo = ip_findmoptions(inp); 1518 imo->imo_multicast_ttl = ttl; 1519 INP_UNLOCK(inp); 1520 } else { 1521 u_int ttl; 1522 error = sooptcopyin(sopt, &ttl, sizeof ttl, 1523 sizeof ttl); 1524 if (error) 1525 break; 1526 if (ttl > 255) 1527 error = EINVAL; 1528 else { 1529 imo = ip_findmoptions(inp); 1530 imo->imo_multicast_ttl = ttl; 1531 INP_UNLOCK(inp); 1532 } 1533 } 1534 break; 1535 1536 case IP_MULTICAST_LOOP: 1537 /* 1538 * Set the loopback flag for outgoing multicast packets. 1539 * Must be zero or one. The original multicast API required a 1540 * char argument, which is inconsistent with the rest 1541 * of the socket API. We allow either a char or an int. 1542 */ 1543 if (sopt->sopt_valsize == 1) { 1544 u_char loop; 1545 error = sooptcopyin(sopt, &loop, 1, 1); 1546 if (error) 1547 break; 1548 imo = ip_findmoptions(inp); 1549 imo->imo_multicast_loop = !!loop; 1550 INP_UNLOCK(inp); 1551 } else { 1552 u_int loop; 1553 error = sooptcopyin(sopt, &loop, sizeof loop, 1554 sizeof loop); 1555 if (error) 1556 break; 1557 imo = ip_findmoptions(inp); 1558 imo->imo_multicast_loop = !!loop; 1559 INP_UNLOCK(inp); 1560 } 1561 break; 1562 1563 case IP_ADD_MEMBERSHIP: 1564 /* 1565 * Add a multicast group membership. 1566 * Group must be a valid IP multicast address. 1567 */ 1568 error = sooptcopyin(sopt, &mreq, sizeof mreq, sizeof mreq); 1569 if (error) 1570 break; 1571 1572 if (!IN_MULTICAST(ntohl(mreq.imr_multiaddr.s_addr))) { 1573 error = EINVAL; 1574 break; 1575 } 1576 s = splimp(); 1577 /* 1578 * If no interface address was provided, use the interface of 1579 * the route to the given multicast address. 1580 */ 1581 if (mreq.imr_interface.s_addr == INADDR_ANY) { 1582 bzero((caddr_t)&ro, sizeof(ro)); 1583 dst = (struct sockaddr_in *)&ro.ro_dst; 1584 dst->sin_len = sizeof(*dst); 1585 dst->sin_family = AF_INET; 1586 dst->sin_addr = mreq.imr_multiaddr; 1587 rtalloc_ign(&ro, RTF_CLONING); 1588 if (ro.ro_rt == NULL) { 1589 error = EADDRNOTAVAIL; 1590 splx(s); 1591 break; 1592 } 1593 ifp = ro.ro_rt->rt_ifp; 1594 RTFREE(ro.ro_rt); 1595 } 1596 else { 1597 ifp = ip_multicast_if(&mreq.imr_interface, NULL); 1598 } 1599 1600 /* 1601 * See if we found an interface, and confirm that it 1602 * supports multicast. 1603 */ 1604 if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) { 1605 error = EADDRNOTAVAIL; 1606 splx(s); 1607 break; 1608 } 1609 /* 1610 * See if the membership already exists or if all the 1611 * membership slots are full. 1612 */ 1613 imo = ip_findmoptions(inp); 1614 for (i = 0; i < imo->imo_num_memberships; ++i) { 1615 if (imo->imo_membership[i]->inm_ifp == ifp && 1616 imo->imo_membership[i]->inm_addr.s_addr 1617 == mreq.imr_multiaddr.s_addr) 1618 break; 1619 } 1620 if (i < imo->imo_num_memberships) { 1621 INP_UNLOCK(inp); 1622 error = EADDRINUSE; 1623 splx(s); 1624 break; 1625 } 1626 if (i == IP_MAX_MEMBERSHIPS) { 1627 INP_UNLOCK(inp); 1628 error = ETOOMANYREFS; 1629 splx(s); 1630 break; 1631 } 1632 /* 1633 * Everything looks good; add a new record to the multicast 1634 * address list for the given interface. 1635 */ 1636 if ((imo->imo_membership[i] = 1637 in_addmulti(&mreq.imr_multiaddr, ifp)) == NULL) { 1638 INP_UNLOCK(inp); 1639 error = ENOBUFS; 1640 splx(s); 1641 break; 1642 } 1643 ++imo->imo_num_memberships; 1644 INP_UNLOCK(inp); 1645 splx(s); 1646 break; 1647 1648 case IP_DROP_MEMBERSHIP: 1649 /* 1650 * Drop a multicast group membership. 1651 * Group must be a valid IP multicast address. 1652 */ 1653 error = sooptcopyin(sopt, &mreq, sizeof mreq, sizeof mreq); 1654 if (error) 1655 break; 1656 1657 if (!IN_MULTICAST(ntohl(mreq.imr_multiaddr.s_addr))) { 1658 error = EINVAL; 1659 break; 1660 } 1661 1662 s = splimp(); 1663 /* 1664 * If an interface address was specified, get a pointer 1665 * to its ifnet structure. 1666 */ 1667 if (mreq.imr_interface.s_addr == INADDR_ANY) 1668 ifp = NULL; 1669 else { 1670 ifp = ip_multicast_if(&mreq.imr_interface, NULL); 1671 if (ifp == NULL) { 1672 error = EADDRNOTAVAIL; 1673 splx(s); 1674 break; 1675 } 1676 } 1677 /* 1678 * Find the membership in the membership array. 1679 */ 1680 imo = ip_findmoptions(inp); 1681 for (i = 0; i < imo->imo_num_memberships; ++i) { 1682 if ((ifp == NULL || 1683 imo->imo_membership[i]->inm_ifp == ifp) && 1684 imo->imo_membership[i]->inm_addr.s_addr == 1685 mreq.imr_multiaddr.s_addr) 1686 break; 1687 } 1688 if (i == imo->imo_num_memberships) { 1689 INP_UNLOCK(inp); 1690 error = EADDRNOTAVAIL; 1691 splx(s); 1692 break; 1693 } 1694 /* 1695 * Give up the multicast address record to which the 1696 * membership points. 1697 */ 1698 in_delmulti(imo->imo_membership[i]); 1699 /* 1700 * Remove the gap in the membership array. 1701 */ 1702 for (++i; i < imo->imo_num_memberships; ++i) 1703 imo->imo_membership[i-1] = imo->imo_membership[i]; 1704 --imo->imo_num_memberships; 1705 INP_UNLOCK(inp); 1706 splx(s); 1707 break; 1708 1709 default: 1710 error = EOPNOTSUPP; 1711 break; 1712 } 1713 1714 return (error); 1715} 1716 1717/* 1718 * Return the IP multicast options in response to user getsockopt(). 1719 */ 1720static int 1721ip_getmoptions(struct inpcb *inp, struct sockopt *sopt) 1722{ 1723 struct ip_moptions *imo; 1724 struct in_addr addr; 1725 struct in_ifaddr *ia; 1726 int error, optval; 1727 u_char coptval; 1728 1729 INP_LOCK(inp); 1730 imo = inp->inp_moptions; 1731 1732 error = 0; 1733 switch (sopt->sopt_name) { 1734 case IP_MULTICAST_VIF: 1735 if (imo != NULL) 1736 optval = imo->imo_multicast_vif; 1737 else 1738 optval = -1; 1739 INP_UNLOCK(inp); 1740 error = sooptcopyout(sopt, &optval, sizeof optval); 1741 break; 1742 1743 case IP_MULTICAST_IF: 1744 if (imo == NULL || imo->imo_multicast_ifp == NULL) 1745 addr.s_addr = INADDR_ANY; 1746 else if (imo->imo_multicast_addr.s_addr) { 1747 /* return the value user has set */ 1748 addr = imo->imo_multicast_addr; 1749 } else { 1750 IFP_TO_IA(imo->imo_multicast_ifp, ia); 1751 addr.s_addr = (ia == NULL) ? INADDR_ANY 1752 : IA_SIN(ia)->sin_addr.s_addr; 1753 } 1754 INP_UNLOCK(inp); 1755 error = sooptcopyout(sopt, &addr, sizeof addr); 1756 break; 1757 1758 case IP_MULTICAST_TTL: 1759 if (imo == 0) 1760 optval = coptval = IP_DEFAULT_MULTICAST_TTL; 1761 else 1762 optval = coptval = imo->imo_multicast_ttl; 1763 INP_UNLOCK(inp); 1764 if (sopt->sopt_valsize == 1) 1765 error = sooptcopyout(sopt, &coptval, 1); 1766 else 1767 error = sooptcopyout(sopt, &optval, sizeof optval); 1768 break; 1769 1770 case IP_MULTICAST_LOOP: 1771 if (imo == 0) 1772 optval = coptval = IP_DEFAULT_MULTICAST_LOOP; 1773 else 1774 optval = coptval = imo->imo_multicast_loop; 1775 INP_UNLOCK(inp); 1776 if (sopt->sopt_valsize == 1) 1777 error = sooptcopyout(sopt, &coptval, 1); 1778 else 1779 error = sooptcopyout(sopt, &optval, sizeof optval); 1780 break; 1781 1782 default: 1783 INP_UNLOCK(inp); 1784 error = ENOPROTOOPT; 1785 break; 1786 } 1787 INP_UNLOCK_ASSERT(inp); 1788 1789 return (error); 1790} 1791 1792/* 1793 * Discard the IP multicast options. 1794 */ 1795void 1796ip_freemoptions(imo) 1797 register struct ip_moptions *imo; 1798{ 1799 register int i; 1800 1801 if (imo != NULL) { 1802 for (i = 0; i < imo->imo_num_memberships; ++i) 1803 in_delmulti(imo->imo_membership[i]); 1804 free(imo, M_IPMOPTS); 1805 } 1806} 1807 1808/* 1809 * Routine called from ip_output() to loop back a copy of an IP multicast 1810 * packet to the input queue of a specified interface. Note that this 1811 * calls the output routine of the loopback "driver", but with an interface 1812 * pointer that might NOT be a loopback interface -- evil, but easier than 1813 * replicating that code here. 1814 */ 1815static void 1816ip_mloopback(ifp, m, dst, hlen) 1817 struct ifnet *ifp; 1818 register struct mbuf *m; 1819 register struct sockaddr_in *dst; 1820 int hlen; 1821{ 1822 register struct ip *ip; 1823 struct mbuf *copym; 1824 1825 copym = m_copy(m, 0, M_COPYALL); 1826 if (copym != NULL && (copym->m_flags & M_EXT || copym->m_len < hlen)) 1827 copym = m_pullup(copym, hlen); 1828 if (copym != NULL) { 1829 /* If needed, compute the checksum and mark it as valid. */ 1830 if (copym->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { 1831 in_delayed_cksum(copym); 1832 copym->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; 1833 copym->m_pkthdr.csum_flags |= 1834 CSUM_DATA_VALID | CSUM_PSEUDO_HDR; 1835 copym->m_pkthdr.csum_data = 0xffff; 1836 } 1837 /* 1838 * We don't bother to fragment if the IP length is greater 1839 * than the interface's MTU. Can this possibly matter? 1840 */ 1841 ip = mtod(copym, struct ip *); 1842 ip->ip_len = htons(ip->ip_len); 1843 ip->ip_off = htons(ip->ip_off); 1844 ip->ip_sum = 0; 1845 ip->ip_sum = in_cksum(copym, hlen); 1846 /* 1847 * NB: 1848 * It's not clear whether there are any lingering 1849 * reentrancy problems in other areas which might 1850 * be exposed by using ip_input directly (in 1851 * particular, everything which modifies the packet 1852 * in-place). Yet another option is using the 1853 * protosw directly to deliver the looped back 1854 * packet. For the moment, we'll err on the side 1855 * of safety by using if_simloop(). 1856 */ 1857#if 1 /* XXX */ 1858 if (dst->sin_family != AF_INET) { 1859 printf("ip_mloopback: bad address family %d\n", 1860 dst->sin_family); 1861 dst->sin_family = AF_INET; 1862 } 1863#endif 1864 1865#ifdef notdef 1866 copym->m_pkthdr.rcvif = ifp; 1867 ip_input(copym); 1868#else 1869 if_simloop(ifp, copym, dst->sin_family, 0); 1870#endif 1871 } 1872} 1873