1/* $NetBSD: ip_output.c,v 1.212 2011/12/31 20:41:59 christos Exp $ */ 2 3/* 4 * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 3. Neither the name of the project nor the names of its contributors 16 * may be used to endorse or promote products derived from this software 17 * without specific prior written permission. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND 20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE 23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 29 * SUCH DAMAGE. 30 */ 31 32/*- 33 * Copyright (c) 1998 The NetBSD Foundation, Inc. 34 * All rights reserved. 35 * 36 * This code is derived from software contributed to The NetBSD Foundation 37 * by Public Access Networks Corporation ("Panix"). It was developed under 38 * contract to Panix by Eric Haszlakiewicz and Thor Lancelot Simon. 
39 * 40 * Redistribution and use in source and binary forms, with or without 41 * modification, are permitted provided that the following conditions 42 * are met: 43 * 1. Redistributions of source code must retain the above copyright 44 * notice, this list of conditions and the following disclaimer. 45 * 2. Redistributions in binary form must reproduce the above copyright 46 * notice, this list of conditions and the following disclaimer in the 47 * documentation and/or other materials provided with the distribution. 48 * 49 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 50 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 51 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 52 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 53 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 54 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 55 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 56 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 57 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 58 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 59 * POSSIBILITY OF SUCH DAMAGE. 60 */ 61 62/* 63 * Copyright (c) 1982, 1986, 1988, 1990, 1993 64 * The Regents of the University of California. All rights reserved. 65 * 66 * Redistribution and use in source and binary forms, with or without 67 * modification, are permitted provided that the following conditions 68 * are met: 69 * 1. Redistributions of source code must retain the above copyright 70 * notice, this list of conditions and the following disclaimer. 71 * 2. Redistributions in binary form must reproduce the above copyright 72 * notice, this list of conditions and the following disclaimer in the 73 * documentation and/or other materials provided with the distribution. 
74 * 3. Neither the name of the University nor the names of its contributors 75 * may be used to endorse or promote products derived from this software 76 * without specific prior written permission. 77 * 78 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 79 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 80 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 81 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 82 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 83 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 84 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 85 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 86 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 87 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 88 * SUCH DAMAGE. 89 * 90 * @(#)ip_output.c 8.3 (Berkeley) 1/21/94 91 */ 92 93#include <sys/cdefs.h> 94__KERNEL_RCSID(0, "$NetBSD: ip_output.c,v 1.212 2011/12/31 20:41:59 christos Exp $"); 95 96#include "opt_pfil_hooks.h" 97#include "opt_inet.h" 98#include "opt_ipsec.h" 99#include "opt_mrouting.h" 100 101#include <sys/param.h> 102#include <sys/malloc.h> 103#include <sys/mbuf.h> 104#include <sys/errno.h> 105#include <sys/protosw.h> 106#include <sys/socket.h> 107#include <sys/socketvar.h> 108#include <sys/kauth.h> 109#ifdef FAST_IPSEC 110#include <sys/domain.h> 111#endif 112#include <sys/systm.h> 113#include <sys/proc.h> 114 115#include <net/if.h> 116#include <net/route.h> 117#include <net/pfil.h> 118 119#include <netinet/in.h> 120#include <netinet/in_systm.h> 121#include <netinet/ip.h> 122#include <netinet/in_pcb.h> 123#include <netinet/in_var.h> 124#include <netinet/ip_var.h> 125#include <netinet/ip_private.h> 126#include <netinet/in_offload.h> 127 128#ifdef MROUTING 129#include <netinet/ip_mroute.h> 
#endif

#ifdef KAME_IPSEC
#include <netinet6/ipsec.h>
#include <netinet6/ipsec_private.h>
#include <netkey/key.h>
#include <netkey/key_debug.h>
#endif /*KAME_IPSEC*/

#ifdef FAST_IPSEC
#include <netipsec/ipsec.h>
#include <netipsec/key.h>
#include <netipsec/xform.h>
#endif	/* FAST_IPSEC*/

#ifdef IPSEC_NAT_T
#include <netinet/udp.h>
#endif

static struct mbuf *ip_insertoptions(struct mbuf *, struct mbuf *, int *);
static struct ifnet *ip_multicast_if(struct in_addr *, int *);
static void ip_mloopback(struct ifnet *, struct mbuf *,
    const struct sockaddr_in *);

#ifdef PFIL_HOOKS
extern struct pfil_head inet_pfil_hook;			/* XXX */
#endif

int	ip_do_loopback_cksum = 0;

/*
 * IP output.  The packet in mbuf chain m contains a skeletal IP
 * header (with len, off, ttl, proto, tos, src, dst).
 * The mbuf chain containing the packet will be freed.
 * The mbuf opt, if present, will not be freed.
 *
 * The variable arguments are read in this order:
 *	struct mbuf *opt	 IP options to insert, or NULL
 *	struct route *ro	 route cache to use; a local one is used
 *				 when NULL
 *	int flags		 IP_* output flags
 *	struct ip_moptions *imo	 multicast options, or NULL
 *	struct socket *so	 sending socket, or NULL
 *	int *mtu_p		 only read when IP_RETURNMTU is set in
 *				 flags; receives the MTU when a packet
 *				 with DF set does not fit
 *
 * Returns 0 or an errno value.
 */
int
ip_output(struct mbuf *m0, ...)
{
	struct rtentry *rt;
	struct ip *ip;
	struct ifnet *ifp;
	struct mbuf *m = m0;
	int hlen = sizeof (struct ip);
	int len, error = 0;
	struct route iproute;
	const struct sockaddr_in *dst;
	struct in_ifaddr *ia;
	struct ifaddr *xifa;
	struct mbuf *opt;
	struct route *ro;
	int flags, sw_csum;
	int *mtu_p;
	u_long mtu;
	struct ip_moptions *imo;
	struct socket *so;
	va_list ap;
#ifdef IPSEC_NAT_T
	int natt_frag = 0;
#endif
#ifdef KAME_IPSEC
	struct secpolicy *sp = NULL;
#endif /*KAME_IPSEC*/
#ifdef FAST_IPSEC
	struct inpcb *inp;
	struct secpolicy *sp = NULL;
	int s;
#endif
	u_int16_t ip_len;
	union {
		struct sockaddr		dst;
		struct sockaddr_in	dst4;
	} u;
	struct sockaddr *rdst = &u.dst;	/* real IP destination, as opposed
					 * to the nexthop
					 */

	len = 0;
	/* Pull the optional arguments; mtu_p exists only with IP_RETURNMTU. */
	va_start(ap, m0);
	opt = va_arg(ap, struct mbuf *);
	ro = va_arg(ap, struct route *);
	flags = va_arg(ap, int);
	imo = va_arg(ap, struct ip_moptions *);
	so = va_arg(ap, struct socket *);
	if (flags & IP_RETURNMTU)
		mtu_p = va_arg(ap, int *);
	else
		mtu_p = NULL;
	va_end(ap);

	MCLAIM(m, &ip_tx_mowner);
#ifdef FAST_IPSEC
	if (so != NULL && so->so_proto->pr_domain->dom_family == AF_INET)
		inp = (struct inpcb *)so->so_pcb;
	else
		inp = NULL;
#endif /* FAST_IPSEC */

#ifdef DIAGNOSTIC
	if ((m->m_flags & M_PKTHDR) == 0)
		panic("ip_output: no HDR");

	if ((m->m_pkthdr.csum_flags & (M_CSUM_TCPv6|M_CSUM_UDPv6)) != 0) {
		panic("ip_output: IPv6 checksum offload flags: %d",
			m->m_pkthdr.csum_flags);
	}

	if ((m->m_pkthdr.csum_flags & (M_CSUM_TCPv4|M_CSUM_UDPv4)) ==
	    (M_CSUM_TCPv4|M_CSUM_UDPv4)) {
		panic("ip_output: conflicting checksum offload flags: %d",
			m->m_pkthdr.csum_flags);
	}
#endif
	if (opt) {
		m = ip_insertoptions(m, opt, &len);
		/* len stays 0 when ip_insertoptions() inserted nothing. */
		if (len >= sizeof(struct ip))
			hlen = len;
	}
	ip = mtod(m, struct ip *);
	/*
	 * Fill in IP header.
	 */
	if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) {
		ip->ip_v = IPVERSION;
		ip->ip_off = htons(0);
		/* ip->ip_id filled in after we find out source ia */
		ip->ip_hl = hlen >> 2;
		IP_STATINC(IP_STAT_LOCALOUT);
	} else {
		hlen = ip->ip_hl << 2;
	}
	/*
	 * Route packet.
	 */
	memset(&iproute, 0, sizeof(iproute));
	if (ro == NULL)
		ro = &iproute;
	sockaddr_in_init(&u.dst4, &ip->ip_dst, 0);
	dst = satocsin(rtcache_getdst(ro));
	/*
	 * If there is a cached route,
	 * check that it is to the same destination
	 * and is still up.  If not, free it and try again.
	 * The address family should also be checked in case of sharing the
	 * cache with IPv6.
	 */
	if (dst == NULL)
		;
	else if (dst->sin_family != AF_INET ||
		 !in_hosteq(dst->sin_addr, ip->ip_dst))
		rtcache_free(ro);

	if ((rt = rtcache_validate(ro)) == NULL &&
	    (rt = rtcache_update(ro, 1)) == NULL) {
		dst = &u.dst4;
		rtcache_setdst(ro, &u.dst);
	}
	/*
	 * If routing to interface only,
	 * short circuit routing lookup.
	 */
	if (flags & IP_ROUTETOIF) {
		if ((ia = ifatoia(ifa_ifwithladdr(sintocsa(dst)))) == NULL) {
			IP_STATINC(IP_STAT_NOROUTE);
			error = ENETUNREACH;
			goto bad;
		}
		ifp = ia->ia_ifp;
		mtu = ifp->if_mtu;
		ip->ip_ttl = 1;
	} else if ((IN_MULTICAST(ip->ip_dst.s_addr) ||
	    ip->ip_dst.s_addr == INADDR_BROADCAST) &&
	    imo != NULL && imo->imo_multicast_ifp != NULL) {
		/* The caller chose the outgoing interface explicitly. */
		ifp = imo->imo_multicast_ifp;
		mtu = ifp->if_mtu;
		IFP_TO_IA(ifp, ia);
	} else {
		if (rt == NULL)
			rt = rtcache_init(ro);
		if (rt == NULL) {
			IP_STATINC(IP_STAT_NOROUTE);
			error = EHOSTUNREACH;
			goto bad;
		}
		ia = ifatoia(rt->rt_ifa);
		ifp = rt->rt_ifp;
		/* Use the route's MTU when set, else the interface's. */
		if ((mtu = rt->rt_rmx.rmx_mtu) == 0)
			mtu = ifp->if_mtu;
		rt->rt_use++;
		if (rt->rt_flags & RTF_GATEWAY)
			dst = satosin(rt->rt_gateway);
	}
	if (IN_MULTICAST(ip->ip_dst.s_addr) ||
	    (ip->ip_dst.s_addr == INADDR_BROADCAST)) {
		struct in_multi *inm;

		m->m_flags |= (ip->ip_dst.s_addr == INADDR_BROADCAST) ?
			M_BCAST : M_MCAST;
		/*
		 * See if the caller provided any multicast options
		 */
		if (imo != NULL)
			ip->ip_ttl = imo->imo_multicast_ttl;
		else
			ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL;

		/*
		 * if we don't know the outgoing ifp yet, we can't generate
		 * output
		 */
		if (!ifp) {
			IP_STATINC(IP_STAT_NOROUTE);
			error = ENETUNREACH;
			goto bad;
		}

		/*
		 * If the packet is multicast or broadcast, confirm that
		 * the outgoing interface can transmit it.
		 */
		if (((m->m_flags & M_MCAST) &&
		     (ifp->if_flags & IFF_MULTICAST) == 0) ||
		    ((m->m_flags & M_BCAST) &&
		     (ifp->if_flags & (IFF_BROADCAST|IFF_POINTOPOINT)) == 0))  {
			IP_STATINC(IP_STAT_NOROUTE);
			error = ENETUNREACH;
			goto bad;
		}
		/*
		 * If source address not specified yet, use an address
		 * of outgoing interface.
		 */
		if (in_nullhost(ip->ip_src)) {
			struct in_ifaddr *xia;

			IFP_TO_IA(ifp, xia);
			if (!xia) {
				error = EADDRNOTAVAIL;
				goto bad;
			}
			xifa = &xia->ia_ifa;
			if (xifa->ifa_getifa != NULL) {
				xia = ifatoia((*xifa->ifa_getifa)(xifa, rdst));
			}
			ip->ip_src = xia->ia_addr.sin_addr;
		}

		IN_LOOKUP_MULTI(ip->ip_dst, ifp, inm);
		if (inm != NULL &&
		   (imo == NULL || imo->imo_multicast_loop)) {
			/*
			 * If we belong to the destination multicast group
			 * on the outgoing interface, and the caller did not
			 * forbid loopback, loop back a copy.
			 */
			ip_mloopback(ifp, m, &u.dst4);
		}
#ifdef MROUTING
		else {
			/*
			 * If we are acting as a multicast router, perform
			 * multicast forwarding as if the packet had just
			 * arrived on the interface to which we are about
			 * to send.  The multicast forwarding function
			 * recursively calls this function, using the
			 * IP_FORWARDING flag to prevent infinite recursion.
			 *
			 * Multicasts that are looped back by ip_mloopback(),
			 * above, will be forwarded by the ip_input() routine,
			 * if necessary.
			 */
			extern struct socket *ip_mrouter;

			if (ip_mrouter && (flags & IP_FORWARDING) == 0) {
				if (ip_mforward(m, ifp) != 0) {
					m_freem(m);
					goto done;
				}
			}
		}
#endif
		/*
		 * Multicasts with a time-to-live of zero may be looped-
		 * back, above, but must not be transmitted on a network.
		 * Also, multicasts addressed to the loopback interface
		 * are not sent -- the above call to ip_mloopback() will
		 * loop back a copy if this host actually belongs to the
		 * destination group on the loopback interface.
		 */
		if (ip->ip_ttl == 0 || (ifp->if_flags & IFF_LOOPBACK) != 0) {
			m_freem(m);
			goto done;
		}

		goto sendit;
	}
	/*
	 * If source address not specified yet, use address
	 * of outgoing interface.
	 */
	if (in_nullhost(ip->ip_src)) {
		xifa = &ia->ia_ifa;
		if (xifa->ifa_getifa != NULL)
			ia = ifatoia((*xifa->ifa_getifa)(xifa, rdst));
		ip->ip_src = ia->ia_addr.sin_addr;
	}

	/*
	 * packets with Class-D address as source are not valid per
	 * RFC 1112
	 */
	if (IN_MULTICAST(ip->ip_src.s_addr)) {
		IP_STATINC(IP_STAT_ODROPPED);
		error = EADDRNOTAVAIL;
		goto bad;
	}

	/*
	 * Look for broadcast address
	 * and verify user is allowed to send
	 * such a packet.
	 */
	if (in_broadcast(dst->sin_addr, ifp)) {
		if ((ifp->if_flags & IFF_BROADCAST) == 0) {
			error = EADDRNOTAVAIL;
			goto bad;
		}
		if ((flags & IP_ALLOWBROADCAST) == 0) {
			error = EACCES;
			goto bad;
		}
		/* don't allow broadcast messages to be fragmented */
		if (ntohs(ip->ip_len) > ifp->if_mtu) {
			error = EMSGSIZE;
			goto bad;
		}
		m->m_flags |= M_BCAST;
	} else
		m->m_flags &= ~M_BCAST;

sendit:
	/*
	 * Assign the IP ID unless the packet is being forwarded or the
	 * caller asked for the existing ID to be kept (IP_NOIPNEWID).
	 */
	if ((flags & (IP_FORWARDING|IP_NOIPNEWID)) == 0) {
		if (m->m_pkthdr.len < IP_MINFRAGSIZE) {
			ip->ip_id = 0;
		} else if ((m->m_pkthdr.csum_flags & M_CSUM_TSOv4) == 0) {
			ip->ip_id = ip_newid(ia);
		} else {

			/*
			 * TSO capable interfaces (typically?) increment
			 * ip_id for each segment.
			 * "allocate" enough ids here to increase the chance
			 * for them to be unique.
			 *
			 * note that the following calculation is not
			 * needed to be precise.  wasting some ip_id is fine.
			 */

			unsigned int segsz = m->m_pkthdr.segsz;
			unsigned int datasz = ntohs(ip->ip_len) - hlen;
			unsigned int num = howmany(datasz, segsz);

			ip->ip_id = ip_newid_range(ia, num);
		}
	}
	/*
	 * If we're doing Path MTU Discovery, we need to set DF unless
	 * the route's MTU is locked.
	 */
	if ((flags & IP_MTUDISC) != 0 && rt != NULL &&
	    (rt->rt_rmx.rmx_locks & RTV_MTU) == 0)
		ip->ip_off |= htons(IP_DF);

	/* Remember the current ip_len */
	ip_len = ntohs(ip->ip_len);

#ifdef KAME_IPSEC
	/* get SP for this packet */
	if (so == NULL)
		sp = ipsec4_getpolicybyaddr(m, IPSEC_DIR_OUTBOUND,
		    flags, &error);
	else {
		if (IPSEC_PCB_SKIP_IPSEC(sotoinpcb_hdr(so)->inph_sp,
					 IPSEC_DIR_OUTBOUND))
			goto skip_ipsec;
		sp = ipsec4_getpolicybysock(m, IPSEC_DIR_OUTBOUND, so, &error);
	}

	if (sp == NULL) {
		/*
		 * NOTE(review): this is the outbound path but the counter
		 * bumped is IPSEC_STAT_IN_INVAL -- confirm this is intended.
		 */
		IPSEC_STATINC(IPSEC_STAT_IN_INVAL);
		goto bad;
	}

	error = 0;

	/* check policy */
	switch (sp->policy) {
	case IPSEC_POLICY_DISCARD:
		/*
		 * This packet is just discarded.
		 */
		IPSEC_STATINC(IPSEC_STAT_OUT_POLVIO);
		goto bad;

	case IPSEC_POLICY_BYPASS:
	case IPSEC_POLICY_NONE:
		/* no need to do IPsec. */
		goto skip_ipsec;

	case IPSEC_POLICY_IPSEC:
		if (sp->req == NULL) {
			/* XXX should be panic ? */
			printf("ip_output: No IPsec request specified.\n");
			error = EINVAL;
			goto bad;
		}
		break;

	case IPSEC_POLICY_ENTRUST:
	default:
		/* Only logged: control falls out of the switch below. */
		printf("ip_output: Invalid policy found. %d\n", sp->policy);
	}

#ifdef IPSEC_NAT_T
	/*
	 * NAT-T ESP fragmentation: don't do IPSec processing now,
	 * we'll do it on each fragmented packet.
	 */
	if (sp->req->sav &&
	    ((sp->req->sav->natt_type & UDP_ENCAP_ESPINUDP) ||
	     (sp->req->sav->natt_type & UDP_ENCAP_ESPINUDP_NON_IKE))) {
		if (ntohs(ip->ip_len) > sp->req->sav->esp_frag) {
			natt_frag = 1;
			mtu = sp->req->sav->esp_frag;
			goto skip_ipsec;
		}
	}
#endif /* IPSEC_NAT_T */

	/*
	 * ipsec4_output() expects ip_len and ip_off in network
	 * order.  They have been set to network order above.
	 */

    {
	struct ipsec_output_state state;
	memset(&state, 0, sizeof(state));
	state.m = m;
	if (flags & IP_ROUTETOIF) {
		state.ro = &iproute;
		memset(&iproute, 0, sizeof(iproute));
	} else
		state.ro = ro;
	state.dst = sintocsa(dst);

	/*
	 * We can't defer the checksum of payload data if
	 * we're about to encrypt/authenticate it.
	 *
	 * XXX When we support crypto offloading functions of
	 * XXX network interfaces, we need to reconsider this,
	 * XXX since it's likely that they'll support checksumming,
	 * XXX as well.
	 */
	if (m->m_pkthdr.csum_flags & (M_CSUM_TCPv4|M_CSUM_UDPv4)) {
		in_delayed_cksum(m);
		m->m_pkthdr.csum_flags &= ~(M_CSUM_TCPv4|M_CSUM_UDPv4);
	}

	error = ipsec4_output(&state, sp, flags);

	m = state.m;
	if (flags & IP_ROUTETOIF) {
		/*
		 * if we have tunnel mode SA, we may need to ignore
		 * IP_ROUTETOIF.
		 */
		if (state.ro != &iproute ||
		    rtcache_validate(state.ro) != NULL) {
			flags &= ~IP_ROUTETOIF;
			ro = state.ro;
		}
	} else
		ro = state.ro;
	dst = satocsin(state.dst);
	if (error) {
		/* mbuf is already reclaimed in ipsec4_output. */
		m0 = NULL;
		switch (error) {
		case EHOSTUNREACH:
		case ENETUNREACH:
		case EMSGSIZE:
		case ENOBUFS:
		case ENOMEM:
			break;
		default:
			printf("ip4_output (ipsec): error code %d\n", error);
			/*fall through*/
		case ENOENT:
			/* don't show these error codes to the user */
			error = 0;
			break;
		}
		goto bad;
	}

	/* be sure to update variables that are affected by ipsec4_output() */
	ip = mtod(m, struct ip *);
	hlen = ip->ip_hl << 2;
	ip_len = ntohs(ip->ip_len);

	if ((rt = rtcache_validate(ro)) == NULL) {
		if ((flags & IP_ROUTETOIF) == 0) {
			printf("ip_output: "
				"can't update route after IPsec processing\n");
			error = EHOSTUNREACH;	/*XXX*/
			goto bad;
		}
	} else {
		/* nobody uses ia beyond here */
		if (state.encap) {
			ifp = rt->rt_ifp;
			if ((mtu = rt->rt_rmx.rmx_mtu) == 0)
				mtu = ifp->if_mtu;
		}
	}
    }
skip_ipsec:
#endif /*KAME_IPSEC*/
#ifdef FAST_IPSEC
	/*
	 * Check the security policy (SP) for the packet and, if
	 * required, do IPsec-related processing.  There are two
	 * cases here; the first time a packet is sent through
	 * it will be untagged and handled by ipsec4_checkpolicy.
	 * If the packet is resubmitted to ip_output (e.g. after
	 * AH, ESP, etc. processing), there will be a tag to bypass
	 * the lookup and related policy checking.
	 */
	if (!ipsec_outdone(m)) {
		s = splsoftnet();
		if (inp != NULL &&
		    IPSEC_PCB_SKIP_IPSEC(inp->inp_sp, IPSEC_DIR_OUTBOUND)) {
			splx(s);
			goto spd_done;
		}
		sp = ipsec4_checkpolicy(m, IPSEC_DIR_OUTBOUND, flags,
				&error, inp);
		/*
		 * There are four return cases:
		 *    sp != NULL	 	    apply IPsec policy
		 *    sp == NULL, error == 0	    no IPsec handling needed
		 *    sp == NULL, error == -EINVAL  discard packet w/o error
		 *    sp == NULL, error != 0	    discard packet, report error
		 */
		if (sp != NULL) {
#ifdef IPSEC_NAT_T
			/*
			 * NAT-T ESP fragmentation: don't do IPSec processing now,
			 * we'll do it on each fragmented packet.
			 */
			if (sp->req->sav &&
			    ((sp->req->sav->natt_type & UDP_ENCAP_ESPINUDP) ||
			     (sp->req->sav->natt_type & UDP_ENCAP_ESPINUDP_NON_IKE))) {
				if (ntohs(ip->ip_len) > sp->req->sav->esp_frag) {
					natt_frag = 1;
					mtu = sp->req->sav->esp_frag;
					splx(s);
					goto spd_done;
				}
			}
#endif /* IPSEC_NAT_T */

			/*
			 * Do delayed checksums now because we send before
			 * this is done in the normal processing path.
			 */
			if (m->m_pkthdr.csum_flags & (M_CSUM_TCPv4|M_CSUM_UDPv4)) {
				in_delayed_cksum(m);
				m->m_pkthdr.csum_flags &= ~(M_CSUM_TCPv4|M_CSUM_UDPv4);
			}

#ifdef __FreeBSD__
			ip->ip_len = htons(ip->ip_len);
			ip->ip_off = htons(ip->ip_off);
#endif

			/* NB: callee frees mbuf */
			error = ipsec4_process_packet(m, sp->req, flags, 0);
			/*
			 * Preserve KAME behaviour: ENOENT can be returned
			 * when an SA acquire is in progress.  Don't propagate
			 * this to user-level; it confuses applications.
			 *
			 * XXX this will go away when the SADB is redone.
			 */
			if (error == ENOENT)
				error = 0;
			splx(s);
			goto done;
		} else {
			splx(s);

			if (error != 0) {
				/*
				 * Hack: -EINVAL is used to signal that a packet
				 * should be silently discarded.  This is typically
				 * because we asked key management for an SA and
				 * it was delayed (e.g. kicked up to IKE).
				 */
				if (error == -EINVAL)
					error = 0;
				goto bad;
			} else {
				/* No IPsec processing for this packet. */
			}
		}
	}
spd_done:
#endif /* FAST_IPSEC */

#ifdef PFIL_HOOKS
	/*
	 * Run through list of hooks for output packets.
	 */
	if ((error = pfil_run_hooks(&inet_pfil_hook, &m, ifp, PFIL_OUT)) != 0)
		goto done;
	if (m == NULL)
		goto done;

	/* The hooks may have replaced the mbuf; reload header state. */
	ip = mtod(m, struct ip *);
	hlen = ip->ip_hl << 2;
	ip_len = ntohs(ip->ip_len);
#endif /* PFIL_HOOKS */

	/* Record the IP header length in the upper 16 bits of csum_data. */
	m->m_pkthdr.csum_data |= hlen << 16;

#if IFA_STATS
	/*
	 * search for the source address structure to
	 * maintain output statistics.
	 */
	INADDR_TO_IA(ip->ip_src, ia);
#endif

	/* Maybe skip checksums on loopback interfaces. */
	if (IN_NEED_CHECKSUM(ifp, M_CSUM_IPv4)) {
		m->m_pkthdr.csum_flags |= M_CSUM_IPv4;
	}
	/* Checksums requested that the interface cannot offload. */
	sw_csum = m->m_pkthdr.csum_flags & ~ifp->if_csum_flags_tx;
	/*
	 * If small enough for mtu of path, or if using TCP segmentation
	 * offload, can just send directly.
	 */
	if (ip_len <= mtu ||
	    (m->m_pkthdr.csum_flags & M_CSUM_TSOv4) != 0) {
#if IFA_STATS
		if (ia)
			ia->ia_ifa.ifa_data.ifad_outbytes += ip_len;
#endif
		/*
		 * Always initialize the sum to 0!  Some HW assisted
		 * checksumming requires this.
		 */
		ip->ip_sum = 0;

		if ((m->m_pkthdr.csum_flags & M_CSUM_TSOv4) == 0) {
			/*
			 * Perform any checksums that the hardware can't do
			 * for us.
			 *
			 * XXX Does any hardware require the {th,uh}_sum
			 * XXX fields to be 0?
			 */
			if (sw_csum & M_CSUM_IPv4) {
				KASSERT(IN_NEED_CHECKSUM(ifp, M_CSUM_IPv4));
				ip->ip_sum = in_cksum(m, hlen);
				m->m_pkthdr.csum_flags &= ~M_CSUM_IPv4;
			}
			if (sw_csum & (M_CSUM_TCPv4|M_CSUM_UDPv4)) {
				if (IN_NEED_CHECKSUM(ifp,
				    sw_csum & (M_CSUM_TCPv4|M_CSUM_UDPv4))) {
					in_delayed_cksum(m);
				}
				m->m_pkthdr.csum_flags &=
				    ~(M_CSUM_TCPv4|M_CSUM_UDPv4);
			}
		}

#ifdef KAME_IPSEC
		/* clean ipsec history once it goes out of the node */
		ipsec_delaux(m);
#endif

		/*
		 * Hand the packet straight to the interface unless it
		 * needs TSO and the interface does not have it enabled,
		 * in which case ip_tso_output() is used instead.
		 */
		if (__predict_true(
		    (m->m_pkthdr.csum_flags & M_CSUM_TSOv4) == 0 ||
		    (ifp->if_capenable & IFCAP_TSOv4) != 0)) {
			KERNEL_LOCK(1, NULL);
			error =
			    (*ifp->if_output)(ifp, m,
				(m->m_flags & M_MCAST) ?
				    sintocsa(rdst) : sintocsa(dst),
				rt);
			KERNEL_UNLOCK_ONE(NULL);
		} else {
			error =
			    ip_tso_output(ifp, m,
				(m->m_flags & M_MCAST) ?
				    sintocsa(rdst) : sintocsa(dst),
				rt);
		}
		goto done;
	}

	/*
	 * We can't use HW checksumming if we're about
	 * to fragment the packet.
	 *
	 * XXX Some hardware can do this.
	 */
	if (m->m_pkthdr.csum_flags & (M_CSUM_TCPv4|M_CSUM_UDPv4)) {
		if (IN_NEED_CHECKSUM(ifp,
		    m->m_pkthdr.csum_flags & (M_CSUM_TCPv4|M_CSUM_UDPv4))) {
			in_delayed_cksum(m);
		}
		m->m_pkthdr.csum_flags &= ~(M_CSUM_TCPv4|M_CSUM_UDPv4);
	}

	/*
	 * Too large for interface; fragment if possible.
	 * Must be able to put at least 8 bytes per fragment.
	 */
	if (ntohs(ip->ip_off) & IP_DF) {
		if (flags & IP_RETURNMTU)
			*mtu_p = mtu;
		error = EMSGSIZE;
		IP_STATINC(IP_STAT_CANTFRAG);
		goto bad;
	}

	error = ip_fragment(m, ifp, mtu);
	if (error) {
		/* ip_fragment() has already freed the chain on failure. */
		m = NULL;
		goto bad;
	}

	/*
	 * Send the fragments in order.  After the first failure the
	 * remaining fragments are freed instead of being sent.
	 */
	for (; m; m = m0) {
		m0 = m->m_nextpkt;
		m->m_nextpkt = 0;
		if (error == 0) {
#if IFA_STATS
			if (ia)
				ia->ia_ifa.ifa_data.ifad_outbytes +=
				    ntohs(ip->ip_len);
#endif
#ifdef KAME_IPSEC
			/* clean ipsec history once it goes out of the node */
			ipsec_delaux(m);
#endif /* KAME_IPSEC */

#ifdef IPSEC_NAT_T
			/*
			 * If we get there, the packet has not been handled by
			 * IPSec whereas it should have. Now that it has been
			 * fragmented, re-inject it in ip_output so that IPsec
			 * processing can occur.
			 */
			if (natt_frag) {
				error = ip_output(m, opt,
				    ro, flags | IP_RAWOUTPUT | IP_NOIPNEWID, imo, so, mtu_p);
			} else
#endif /* IPSEC_NAT_T */
			{
				KASSERT((m->m_pkthdr.csum_flags &
				    (M_CSUM_UDPv4 | M_CSUM_TCPv4)) == 0);
				KERNEL_LOCK(1, NULL);
				error = (*ifp->if_output)(ifp, m,
				    (m->m_flags & M_MCAST) ?
					sintocsa(rdst) : sintocsa(dst),
				    rt);
				KERNEL_UNLOCK_ONE(NULL);
			}
		} else
			m_freem(m);
	}

	if (error == 0)
		IP_STATINC(IP_STAT_FRAGMENTED);
done:
	/* Release the local route cache and any IPsec policy reference. */
	rtcache_free(&iproute);

#ifdef KAME_IPSEC
	if (sp != NULL) {
		KEYDEBUG(KEYDEBUG_IPSEC_STAMP,
			printf("DP ip_output call free SP:%p\n", sp));
		key_freesp(sp);
	}
#endif /* KAME_IPSEC */
#ifdef FAST_IPSEC
	if (sp != NULL)
		KEY_FREESP(&sp);
#endif /* FAST_IPSEC */

	return (error);
bad:
	m_freem(m);
	goto done;
}

/*
 * Split the packet in m into fragments that fit in mtu.
 *
 * On success (return value 0) the first fragment stays in m and the
 * remaining fragments are linked to it through m_nextpkt.  On failure
 * every fragment, including m, is freed and an errno value is returned.
 *
 * When ifp is not NULL, checksum flags that the interface can offload
 * are left out of the software checksum work, and the fragments are
 * rejected with ENOBUFS if the interface send queue has no room for
 * all of them.
 */
int
ip_fragment(struct mbuf *m, struct ifnet *ifp, u_long mtu)
{
	struct ip *ip, *mhip;
	struct mbuf *m0;
	int len, hlen, off;
	int mhlen, firstlen;
	struct mbuf **mnext;
	int sw_csum = m->m_pkthdr.csum_flags;
	int fragments = 0;
	int s;
	int error = 0;

	ip = mtod(m, struct ip *);
	hlen = ip->ip_hl << 2;
	if (ifp != NULL)
		sw_csum &= ~ifp->if_csum_flags_tx;

	/* Payload bytes per fragment, rounded down to a multiple of 8. */
	len = (mtu - hlen) &~ 7;
	if (len < 8) {
		m_freem(m);
		return (EMSGSIZE);
	}

	firstlen = len;
	mnext = &m->m_nextpkt;

	/*
	 * Loop through length of segment after first fragment,
	 * make new header and copy data of each part and link onto chain.
	 */
	m0 = m;
	mhlen = sizeof (struct ip);
	for (off = hlen + len; off < ntohs(ip->ip_len); off += len) {
		MGETHDR(m, M_DONTWAIT, MT_HEADER);
		if (m == 0) {
			error = ENOBUFS;
			IP_STATINC(IP_STAT_ODROPPED);
			goto sendorfree;
		}
		MCLAIM(m, m0->m_owner);
		*mnext = m;
		mnext = &m->m_nextpkt;
		m->m_data += max_linkhdr;
		mhip = mtod(m, struct ip *);
		*mhip = *ip;
		/* we must inherit MCAST and BCAST flags */
		m->m_flags |= m0->m_flags & (M_MCAST|M_BCAST);
		if (hlen > sizeof (struct ip)) {
			mhlen = ip_optcopy(ip, mhip) + sizeof (struct ip);
			mhip->ip_hl = mhlen >> 2;
		}
		m->m_len = mhlen;
		/* Fragment offset is built in host order, swapped below. */
		mhip->ip_off = ((off - hlen) >> 3) +
		    (ntohs(ip->ip_off) & ~IP_MF);
		if (ip->ip_off & htons(IP_MF))
			mhip->ip_off |= IP_MF;
		if (off + len >= ntohs(ip->ip_len))
			len = ntohs(ip->ip_len) - off;
		else
			mhip->ip_off |= IP_MF;
		HTONS(mhip->ip_off);
		mhip->ip_len = htons((u_int16_t)(len + mhlen));
		m->m_next = m_copym(m0, off, len, M_DONTWAIT);
		if (m->m_next == 0) {
			error = ENOBUFS;	/* ??? */
			IP_STATINC(IP_STAT_ODROPPED);
			goto sendorfree;
		}
		m->m_pkthdr.len = mhlen + len;
		m->m_pkthdr.rcvif = NULL;
		mhip->ip_sum = 0;
		KASSERT((m->m_pkthdr.csum_flags & M_CSUM_IPv4) == 0);
		if (sw_csum & M_CSUM_IPv4) {
			mhip->ip_sum = in_cksum(m, mhlen);
		} else {
			/*
			 * checksum is hw-offloaded or not necessary.
			 */
			m->m_pkthdr.csum_flags |=
			    m0->m_pkthdr.csum_flags & M_CSUM_IPv4;
			m->m_pkthdr.csum_data |= mhlen << 16;
			KASSERT(!(ifp != NULL &&
			    IN_NEED_CHECKSUM(ifp, M_CSUM_IPv4))
			    || (m->m_pkthdr.csum_flags & M_CSUM_IPv4) != 0);
		}
		IP_STATINC(IP_STAT_OFRAGMENTS);
		fragments++;
	}
	/*
	 * Update first fragment by trimming what's been copied out
	 * and updating header, then send each fragment (in order).
	 */
	m = m0;
	m_adj(m, hlen + firstlen - ntohs(ip->ip_len));
	m->m_pkthdr.len = hlen + firstlen;
	ip->ip_len = htons((u_int16_t)m->m_pkthdr.len);
	ip->ip_off |= htons(IP_MF);
	ip->ip_sum = 0;
	if (sw_csum & M_CSUM_IPv4) {
		ip->ip_sum = in_cksum(m, hlen);
		m->m_pkthdr.csum_flags &= ~M_CSUM_IPv4;
	} else {
		/*
		 * checksum is hw-offloaded or not necessary.
		 */
		KASSERT(!(ifp != NULL && IN_NEED_CHECKSUM(ifp, M_CSUM_IPv4))
		   || (m->m_pkthdr.csum_flags & M_CSUM_IPv4) != 0);
		KASSERT(M_CSUM_DATA_IPv4_IPHL(m->m_pkthdr.csum_data) >=
			sizeof(struct ip));
	}
sendorfree:
	/*
	 * If there is no room for all the fragments, don't queue
	 * any of them.
	 */
	if (ifp != NULL) {
		s = splnet();
		if (ifp->if_snd.ifq_maxlen - ifp->if_snd.ifq_len < fragments &&
		    error == 0) {
			error = ENOBUFS;
			IP_STATINC(IP_STAT_ODROPPED);
			IFQ_INC_DROPS(&ifp->if_snd);
		}
		splx(s);
	}
	if (error) {
		/* Free the whole fragment chain, starting at the original. */
		for (m = m0; m; m = m0) {
			m0 = m->m_nextpkt;
			m->m_nextpkt = NULL;
			m_freem(m);
		}
	}
	return (error);
}

/*
 * Process a delayed payload checksum calculation.
1077 */ 1078void 1079in_delayed_cksum(struct mbuf *m) 1080{ 1081 struct ip *ip; 1082 u_int16_t csum, offset; 1083 1084 ip = mtod(m, struct ip *); 1085 offset = ip->ip_hl << 2; 1086 csum = in4_cksum(m, 0, offset, ntohs(ip->ip_len) - offset); 1087 if (csum == 0 && (m->m_pkthdr.csum_flags & M_CSUM_UDPv4) != 0) 1088 csum = 0xffff; 1089 1090 offset += M_CSUM_DATA_IPv4_OFFSET(m->m_pkthdr.csum_data); 1091 1092 if ((offset + sizeof(u_int16_t)) > m->m_len) { 1093 /* This happen when ip options were inserted 1094 printf("in_delayed_cksum: pullup len %d off %d proto %d\n", 1095 m->m_len, offset, ip->ip_p); 1096 */ 1097 m_copyback(m, offset, sizeof(csum), (void *) &csum); 1098 } else 1099 *(u_int16_t *)(mtod(m, char *) + offset) = csum; 1100} 1101 1102/* 1103 * Determine the maximum length of the options to be inserted; 1104 * we would far rather allocate too much space rather than too little. 1105 */ 1106 1107u_int 1108ip_optlen(struct inpcb *inp) 1109{ 1110 struct mbuf *m = inp->inp_options; 1111 1112 if (m && m->m_len > offsetof(struct ipoption, ipopt_dst)) 1113 return (m->m_len - offsetof(struct ipoption, ipopt_dst)); 1114 else 1115 return 0; 1116} 1117 1118 1119/* 1120 * Insert IP options into preformed packet. 1121 * Adjust IP destination as required for IP source routing, 1122 * as indicated by a non-zero in_addr at the start of the options. 
1123 */ 1124static struct mbuf * 1125ip_insertoptions(struct mbuf *m, struct mbuf *opt, int *phlen) 1126{ 1127 struct ipoption *p = mtod(opt, struct ipoption *); 1128 struct mbuf *n; 1129 struct ip *ip = mtod(m, struct ip *); 1130 unsigned optlen; 1131 1132 optlen = opt->m_len - sizeof(p->ipopt_dst); 1133 if (optlen + ntohs(ip->ip_len) > IP_MAXPACKET) 1134 return (m); /* XXX should fail */ 1135 if (!in_nullhost(p->ipopt_dst)) 1136 ip->ip_dst = p->ipopt_dst; 1137 if (M_READONLY(m) || M_LEADINGSPACE(m) < optlen) { 1138 MGETHDR(n, M_DONTWAIT, MT_HEADER); 1139 if (n == 0) 1140 return (m); 1141 MCLAIM(n, m->m_owner); 1142 M_MOVE_PKTHDR(n, m); 1143 m->m_len -= sizeof(struct ip); 1144 m->m_data += sizeof(struct ip); 1145 n->m_next = m; 1146 m = n; 1147 m->m_len = optlen + sizeof(struct ip); 1148 m->m_data += max_linkhdr; 1149 bcopy((void *)ip, mtod(m, void *), sizeof(struct ip)); 1150 } else { 1151 m->m_data -= optlen; 1152 m->m_len += optlen; 1153 memmove(mtod(m, void *), ip, sizeof(struct ip)); 1154 } 1155 m->m_pkthdr.len += optlen; 1156 ip = mtod(m, struct ip *); 1157 bcopy((void *)p->ipopt_list, (void *)(ip + 1), (unsigned)optlen); 1158 *phlen = sizeof(struct ip) + optlen; 1159 ip->ip_len = htons(ntohs(ip->ip_len) + optlen); 1160 return (m); 1161} 1162 1163/* 1164 * Copy options from ip to jp, 1165 * omitting those not copied during fragmentation. 1166 */ 1167int 1168ip_optcopy(struct ip *ip, struct ip *jp) 1169{ 1170 u_char *cp, *dp; 1171 int opt, optlen, cnt; 1172 1173 cp = (u_char *)(ip + 1); 1174 dp = (u_char *)(jp + 1); 1175 cnt = (ip->ip_hl << 2) - sizeof (struct ip); 1176 for (; cnt > 0; cnt -= optlen, cp += optlen) { 1177 opt = cp[0]; 1178 if (opt == IPOPT_EOL) 1179 break; 1180 if (opt == IPOPT_NOP) { 1181 /* Preserve for IP mcast tunnel's LSRR alignment. 
*/ 1182 *dp++ = IPOPT_NOP; 1183 optlen = 1; 1184 continue; 1185 } 1186#ifdef DIAGNOSTIC 1187 if (cnt < IPOPT_OLEN + sizeof(*cp)) 1188 panic("malformed IPv4 option passed to ip_optcopy"); 1189#endif 1190 optlen = cp[IPOPT_OLEN]; 1191#ifdef DIAGNOSTIC 1192 if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt) 1193 panic("malformed IPv4 option passed to ip_optcopy"); 1194#endif 1195 /* bogus lengths should have been caught by ip_dooptions */ 1196 if (optlen > cnt) 1197 optlen = cnt; 1198 if (IPOPT_COPIED(opt)) { 1199 bcopy((void *)cp, (void *)dp, (unsigned)optlen); 1200 dp += optlen; 1201 } 1202 } 1203 for (optlen = dp - (u_char *)(jp+1); optlen & 0x3; optlen++) 1204 *dp++ = IPOPT_EOL; 1205 return (optlen); 1206} 1207 1208/* 1209 * IP socket option processing. 1210 */ 1211int 1212ip_ctloutput(int op, struct socket *so, struct sockopt *sopt) 1213{ 1214 struct inpcb *inp = sotoinpcb(so); 1215 int optval = 0; 1216 int error = 0; 1217#if defined(KAME_IPSEC) || defined(FAST_IPSEC) 1218 struct lwp *l = curlwp; /*XXX*/ 1219#endif 1220 1221 if (sopt->sopt_level != IPPROTO_IP) { 1222 if (sopt->sopt_level == SOL_SOCKET && sopt->sopt_name == SO_NOHEADER) 1223 return 0; 1224 return ENOPROTOOPT; 1225 } 1226 1227 switch (op) { 1228 case PRCO_SETOPT: 1229 switch (sopt->sopt_name) { 1230 case IP_OPTIONS: 1231#ifdef notyet 1232 case IP_RETOPTS: 1233#endif 1234 error = ip_pcbopts(&inp->inp_options, sopt); 1235 break; 1236 1237 case IP_TOS: 1238 case IP_TTL: 1239 case IP_MINTTL: 1240 case IP_RECVOPTS: 1241 case IP_RECVRETOPTS: 1242 case IP_RECVDSTADDR: 1243 case IP_RECVIF: 1244 case IP_RECVTTL: 1245 error = sockopt_getint(sopt, &optval); 1246 if (error) 1247 break; 1248 1249 switch (sopt->sopt_name) { 1250 case IP_TOS: 1251 inp->inp_ip.ip_tos = optval; 1252 break; 1253 1254 case IP_TTL: 1255 inp->inp_ip.ip_ttl = optval; 1256 break; 1257 1258 case IP_MINTTL: 1259 if (optval > 0 && optval <= MAXTTL) 1260 inp->inp_ip_minttl = optval; 1261 else 1262 error = EINVAL; 1263 break; 1264#define 
OPTSET(bit) \ 1265 if (optval) \ 1266 inp->inp_flags |= bit; \ 1267 else \ 1268 inp->inp_flags &= ~bit; 1269 1270 case IP_RECVOPTS: 1271 OPTSET(INP_RECVOPTS); 1272 break; 1273 1274 case IP_RECVRETOPTS: 1275 OPTSET(INP_RECVRETOPTS); 1276 break; 1277 1278 case IP_RECVDSTADDR: 1279 OPTSET(INP_RECVDSTADDR); 1280 break; 1281 1282 case IP_RECVIF: 1283 OPTSET(INP_RECVIF); 1284 break; 1285 1286 case IP_RECVTTL: 1287 OPTSET(INP_RECVTTL); 1288 break; 1289 } 1290 break; 1291#undef OPTSET 1292 1293 case IP_MULTICAST_IF: 1294 case IP_MULTICAST_TTL: 1295 case IP_MULTICAST_LOOP: 1296 case IP_ADD_MEMBERSHIP: 1297 case IP_DROP_MEMBERSHIP: 1298 error = ip_setmoptions(&inp->inp_moptions, sopt); 1299 break; 1300 1301 case IP_PORTRANGE: 1302 error = sockopt_getint(sopt, &optval); 1303 if (error) 1304 break; 1305 1306 /* INP_LOCK(inp); */ 1307 switch (optval) { 1308 case IP_PORTRANGE_DEFAULT: 1309 case IP_PORTRANGE_HIGH: 1310 inp->inp_flags &= ~(INP_LOWPORT); 1311 break; 1312 1313 case IP_PORTRANGE_LOW: 1314 inp->inp_flags |= INP_LOWPORT; 1315 break; 1316 1317 default: 1318 error = EINVAL; 1319 break; 1320 } 1321 /* INP_UNLOCK(inp); */ 1322 break; 1323 1324#if defined(KAME_IPSEC) || defined(FAST_IPSEC) 1325 case IP_IPSEC_POLICY: 1326 { 1327 error = ipsec4_set_policy(inp, sopt->sopt_name, 1328 sopt->sopt_data, sopt->sopt_size, l->l_cred); 1329 break; 1330 } 1331#endif /*IPSEC*/ 1332 1333 default: 1334 error = ENOPROTOOPT; 1335 break; 1336 } 1337 break; 1338 1339 case PRCO_GETOPT: 1340 switch (sopt->sopt_name) { 1341 case IP_OPTIONS: 1342 case IP_RETOPTS: 1343 if (inp->inp_options) { 1344 struct mbuf *m; 1345 1346 m = m_copym(inp->inp_options, 0, M_COPYALL, 1347 M_DONTWAIT); 1348 if (m == NULL) { 1349 error = ENOBUFS; 1350 break; 1351 } 1352 1353 error = sockopt_setmbuf(sopt, m); 1354 } 1355 break; 1356 1357 case IP_TOS: 1358 case IP_TTL: 1359 case IP_MINTTL: 1360 case IP_RECVOPTS: 1361 case IP_RECVRETOPTS: 1362 case IP_RECVDSTADDR: 1363 case IP_RECVIF: 1364 case IP_RECVTTL: 1365 case 
IP_ERRORMTU: 1366 switch (sopt->sopt_name) { 1367 case IP_TOS: 1368 optval = inp->inp_ip.ip_tos; 1369 break; 1370 1371 case IP_TTL: 1372 optval = inp->inp_ip.ip_ttl; 1373 break; 1374 1375 case IP_MINTTL: 1376 optval = inp->inp_ip_minttl; 1377 break; 1378 1379 case IP_ERRORMTU: 1380 optval = inp->inp_errormtu; 1381 break; 1382 1383#define OPTBIT(bit) (inp->inp_flags & bit ? 1 : 0) 1384 1385 case IP_RECVOPTS: 1386 optval = OPTBIT(INP_RECVOPTS); 1387 break; 1388 1389 case IP_RECVRETOPTS: 1390 optval = OPTBIT(INP_RECVRETOPTS); 1391 break; 1392 1393 case IP_RECVDSTADDR: 1394 optval = OPTBIT(INP_RECVDSTADDR); 1395 break; 1396 1397 case IP_RECVIF: 1398 optval = OPTBIT(INP_RECVIF); 1399 break; 1400 1401 case IP_RECVTTL: 1402 optval = OPTBIT(INP_RECVTTL); 1403 break; 1404 } 1405 error = sockopt_setint(sopt, optval); 1406 break; 1407 1408#if 0 /* defined(KAME_IPSEC) || defined(FAST_IPSEC) */ 1409 case IP_IPSEC_POLICY: 1410 { 1411 struct mbuf *m = NULL; 1412 1413 /* XXX this will return EINVAL as sopt is empty */ 1414 error = ipsec4_get_policy(inp, sopt->sopt_data, 1415 sopt->sopt_size, &m); 1416 if (error == 0) 1417 error = sockopt_setmbuf(sopt, m); 1418 break; 1419 } 1420#endif /*IPSEC*/ 1421 1422 case IP_MULTICAST_IF: 1423 case IP_MULTICAST_TTL: 1424 case IP_MULTICAST_LOOP: 1425 case IP_ADD_MEMBERSHIP: 1426 case IP_DROP_MEMBERSHIP: 1427 error = ip_getmoptions(inp->inp_moptions, sopt); 1428 break; 1429 1430 case IP_PORTRANGE: 1431 if (inp->inp_flags & INP_LOWPORT) 1432 optval = IP_PORTRANGE_LOW; 1433 else 1434 optval = IP_PORTRANGE_DEFAULT; 1435 1436 error = sockopt_setint(sopt, optval); 1437 1438 break; 1439 1440 default: 1441 error = ENOPROTOOPT; 1442 break; 1443 } 1444 break; 1445 } 1446 return (error); 1447} 1448 1449/* 1450 * Set up IP options in pcb for insertion in output packets. 1451 * Store in mbuf with pointer in pcbopt, adding pseudo-option 1452 * with destination address if source routed. 
1453 */ 1454int 1455ip_pcbopts(struct mbuf **pcbopt, const struct sockopt *sopt) 1456{ 1457 struct mbuf *m; 1458 const u_char *cp; 1459 u_char *dp; 1460 int cnt; 1461 uint8_t optval, olen, offset; 1462 1463 /* turn off any old options */ 1464 if (*pcbopt) 1465 (void)m_free(*pcbopt); 1466 *pcbopt = NULL; 1467 1468 cp = sopt->sopt_data; 1469 cnt = sopt->sopt_size; 1470 1471 if (cnt == 0) 1472 return (0); /* Only turning off any previous options */ 1473 1474#ifndef __vax__ 1475 if (cnt % sizeof(int32_t)) 1476 return (EINVAL); 1477#endif 1478 1479 m = m_get(M_DONTWAIT, MT_SOOPTS); 1480 if (m == NULL) 1481 return (ENOBUFS); 1482 1483 dp = mtod(m, u_char *); 1484 memset(dp, 0, sizeof(struct in_addr)); 1485 dp += sizeof(struct in_addr); 1486 m->m_len = sizeof(struct in_addr); 1487 1488 /* 1489 * IP option list according to RFC791. Each option is of the form 1490 * 1491 * [optval] [olen] [(olen - 2) data bytes] 1492 * 1493 * we validate the list and copy options to an mbuf for prepending 1494 * to data packets. The IP first-hop destination address will be 1495 * stored before actual options and is zero if unset. 1496 */ 1497 while (cnt > 0) { 1498 optval = cp[IPOPT_OPTVAL]; 1499 1500 if (optval == IPOPT_EOL || optval == IPOPT_NOP) { 1501 olen = 1; 1502 } else { 1503 if (cnt < IPOPT_OLEN + 1) 1504 goto bad; 1505 1506 olen = cp[IPOPT_OLEN]; 1507 if (olen < IPOPT_OLEN + 1 || olen > cnt) 1508 goto bad; 1509 } 1510 1511 if (optval == IPOPT_LSRR || optval == IPOPT_SSRR) { 1512 /* 1513 * user process specifies route as: 1514 * ->A->B->C->D 1515 * D must be our final destination (but we can't 1516 * check that since we may not have connected yet). 1517 * A is first hop destination, which doesn't appear in 1518 * actual IP option, but is stored before the options. 
1519 */ 1520 if (olen < IPOPT_OFFSET + 1 + sizeof(struct in_addr)) 1521 goto bad; 1522 1523 offset = cp[IPOPT_OFFSET]; 1524 memcpy(mtod(m, u_char *), cp + IPOPT_OFFSET + 1, 1525 sizeof(struct in_addr)); 1526 1527 cp += sizeof(struct in_addr); 1528 cnt -= sizeof(struct in_addr); 1529 olen -= sizeof(struct in_addr); 1530 1531 if (m->m_len + olen > MAX_IPOPTLEN + sizeof(struct in_addr)) 1532 goto bad; 1533 1534 memcpy(dp, cp, olen); 1535 dp[IPOPT_OPTVAL] = optval; 1536 dp[IPOPT_OLEN] = olen; 1537 dp[IPOPT_OFFSET] = offset; 1538 break; 1539 } else { 1540 if (m->m_len + olen > MAX_IPOPTLEN + sizeof(struct in_addr)) 1541 goto bad; 1542 1543 memcpy(dp, cp, olen); 1544 break; 1545 } 1546 1547 dp += olen; 1548 m->m_len += olen; 1549 1550 if (optval == IPOPT_EOL) 1551 break; 1552 1553 cp += olen; 1554 cnt -= olen; 1555 } 1556 1557 *pcbopt = m; 1558 return (0); 1559 1560bad: 1561 (void)m_free(m); 1562 return (EINVAL); 1563} 1564 1565/* 1566 * following RFC1724 section 3.3, 0.0.0.0/8 is interpreted as interface index. 
1567 */ 1568static struct ifnet * 1569ip_multicast_if(struct in_addr *a, int *ifindexp) 1570{ 1571 int ifindex; 1572 struct ifnet *ifp = NULL; 1573 struct in_ifaddr *ia; 1574 1575 if (ifindexp) 1576 *ifindexp = 0; 1577 if (ntohl(a->s_addr) >> 24 == 0) { 1578 ifindex = ntohl(a->s_addr) & 0xffffff; 1579 if (ifindex < 0 || if_indexlim <= ifindex) 1580 return NULL; 1581 ifp = ifindex2ifnet[ifindex]; 1582 if (!ifp) 1583 return NULL; 1584 if (ifindexp) 1585 *ifindexp = ifindex; 1586 } else { 1587 LIST_FOREACH(ia, &IN_IFADDR_HASH(a->s_addr), ia_hash) { 1588 if (in_hosteq(ia->ia_addr.sin_addr, *a) && 1589 (ia->ia_ifp->if_flags & IFF_MULTICAST) != 0) { 1590 ifp = ia->ia_ifp; 1591 break; 1592 } 1593 } 1594 } 1595 return ifp; 1596} 1597 1598static int 1599ip_getoptval(const struct sockopt *sopt, u_int8_t *val, u_int maxval) 1600{ 1601 u_int tval; 1602 u_char cval; 1603 int error; 1604 1605 if (sopt == NULL) 1606 return EINVAL; 1607 1608 switch (sopt->sopt_size) { 1609 case sizeof(u_char): 1610 error = sockopt_get(sopt, &cval, sizeof(u_char)); 1611 tval = cval; 1612 break; 1613 1614 case sizeof(u_int): 1615 error = sockopt_get(sopt, &tval, sizeof(u_int)); 1616 break; 1617 1618 default: 1619 error = EINVAL; 1620 } 1621 1622 if (error) 1623 return error; 1624 1625 if (tval > maxval) 1626 return EINVAL; 1627 1628 *val = tval; 1629 return 0; 1630} 1631 1632/* 1633 * Set the IP multicast options in response to user setsockopt(). 1634 */ 1635int 1636ip_setmoptions(struct ip_moptions **imop, const struct sockopt *sopt) 1637{ 1638 int error = 0; 1639 int i; 1640 struct in_addr addr; 1641 struct ip_mreq lmreq, *mreq; 1642 struct ifnet *ifp; 1643 struct ip_moptions *imo = *imop; 1644 int ifindex; 1645 1646 if (imo == NULL) { 1647 /* 1648 * No multicast option buffer attached to the pcb; 1649 * allocate one and initialize to default values. 
1650 */ 1651 imo = malloc(sizeof(*imo), M_IPMOPTS, M_NOWAIT); 1652 if (imo == NULL) 1653 return (ENOBUFS); 1654 1655 *imop = imo; 1656 imo->imo_multicast_ifp = NULL; 1657 imo->imo_multicast_addr.s_addr = INADDR_ANY; 1658 imo->imo_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; 1659 imo->imo_multicast_loop = IP_DEFAULT_MULTICAST_LOOP; 1660 imo->imo_num_memberships = 0; 1661 } 1662 1663 switch (sopt->sopt_name) { 1664 case IP_MULTICAST_IF: 1665 /* 1666 * Select the interface for outgoing multicast packets. 1667 */ 1668 error = sockopt_get(sopt, &addr, sizeof(addr)); 1669 if (error) 1670 break; 1671 1672 /* 1673 * INADDR_ANY is used to remove a previous selection. 1674 * When no interface is selected, a default one is 1675 * chosen every time a multicast packet is sent. 1676 */ 1677 if (in_nullhost(addr)) { 1678 imo->imo_multicast_ifp = NULL; 1679 break; 1680 } 1681 /* 1682 * The selected interface is identified by its local 1683 * IP address. Find the interface and confirm that 1684 * it supports multicasting. 1685 */ 1686 ifp = ip_multicast_if(&addr, &ifindex); 1687 if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) { 1688 error = EADDRNOTAVAIL; 1689 break; 1690 } 1691 imo->imo_multicast_ifp = ifp; 1692 if (ifindex) 1693 imo->imo_multicast_addr = addr; 1694 else 1695 imo->imo_multicast_addr.s_addr = INADDR_ANY; 1696 break; 1697 1698 case IP_MULTICAST_TTL: 1699 /* 1700 * Set the IP time-to-live for outgoing multicast packets. 1701 */ 1702 error = ip_getoptval(sopt, &imo->imo_multicast_ttl, MAXTTL); 1703 break; 1704 1705 case IP_MULTICAST_LOOP: 1706 /* 1707 * Set the loopback flag for outgoing multicast packets. 1708 * Must be zero or one. 1709 */ 1710 error = ip_getoptval(sopt, &imo->imo_multicast_loop, 1); 1711 break; 1712 1713 case IP_ADD_MEMBERSHIP: 1714 /* 1715 * Add a multicast group membership. 1716 * Group must be a valid IP multicast address. 
1717 */ 1718 error = sockopt_get(sopt, &lmreq, sizeof(lmreq)); 1719 if (error) 1720 break; 1721 1722 mreq = &lmreq; 1723 1724 if (!IN_MULTICAST(mreq->imr_multiaddr.s_addr)) { 1725 error = EINVAL; 1726 break; 1727 } 1728 /* 1729 * If no interface address was provided, use the interface of 1730 * the route to the given multicast address. 1731 */ 1732 if (in_nullhost(mreq->imr_interface)) { 1733 struct rtentry *rt; 1734 union { 1735 struct sockaddr dst; 1736 struct sockaddr_in dst4; 1737 } u; 1738 struct route ro; 1739 1740 memset(&ro, 0, sizeof(ro)); 1741 1742 sockaddr_in_init(&u.dst4, &mreq->imr_multiaddr, 0); 1743 rtcache_setdst(&ro, &u.dst); 1744 ifp = (rt = rtcache_init(&ro)) != NULL ? rt->rt_ifp 1745 : NULL; 1746 rtcache_free(&ro); 1747 } else { 1748 ifp = ip_multicast_if(&mreq->imr_interface, NULL); 1749 } 1750 /* 1751 * See if we found an interface, and confirm that it 1752 * supports multicast. 1753 */ 1754 if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) { 1755 error = EADDRNOTAVAIL; 1756 break; 1757 } 1758 /* 1759 * See if the membership already exists or if all the 1760 * membership slots are full. 1761 */ 1762 for (i = 0; i < imo->imo_num_memberships; ++i) { 1763 if (imo->imo_membership[i]->inm_ifp == ifp && 1764 in_hosteq(imo->imo_membership[i]->inm_addr, 1765 mreq->imr_multiaddr)) 1766 break; 1767 } 1768 if (i < imo->imo_num_memberships) { 1769 error = EADDRINUSE; 1770 break; 1771 } 1772 if (i == IP_MAX_MEMBERSHIPS) { 1773 error = ETOOMANYREFS; 1774 break; 1775 } 1776 /* 1777 * Everything looks good; add a new record to the multicast 1778 * address list for the given interface. 1779 */ 1780 if ((imo->imo_membership[i] = 1781 in_addmulti(&mreq->imr_multiaddr, ifp)) == NULL) { 1782 error = ENOBUFS; 1783 break; 1784 } 1785 ++imo->imo_num_memberships; 1786 break; 1787 1788 case IP_DROP_MEMBERSHIP: 1789 /* 1790 * Drop a multicast group membership. 1791 * Group must be a valid IP multicast address. 
1792 */ 1793 error = sockopt_get(sopt, &lmreq, sizeof(lmreq)); 1794 if (error) 1795 break; 1796 1797 mreq = &lmreq; 1798 1799 if (!IN_MULTICAST(mreq->imr_multiaddr.s_addr)) { 1800 error = EINVAL; 1801 break; 1802 } 1803 /* 1804 * If an interface address was specified, get a pointer 1805 * to its ifnet structure. 1806 */ 1807 if (in_nullhost(mreq->imr_interface)) 1808 ifp = NULL; 1809 else { 1810 ifp = ip_multicast_if(&mreq->imr_interface, NULL); 1811 if (ifp == NULL) { 1812 error = EADDRNOTAVAIL; 1813 break; 1814 } 1815 } 1816 /* 1817 * Find the membership in the membership array. 1818 */ 1819 for (i = 0; i < imo->imo_num_memberships; ++i) { 1820 if ((ifp == NULL || 1821 imo->imo_membership[i]->inm_ifp == ifp) && 1822 in_hosteq(imo->imo_membership[i]->inm_addr, 1823 mreq->imr_multiaddr)) 1824 break; 1825 } 1826 if (i == imo->imo_num_memberships) { 1827 error = EADDRNOTAVAIL; 1828 break; 1829 } 1830 /* 1831 * Give up the multicast address record to which the 1832 * membership points. 1833 */ 1834 in_delmulti(imo->imo_membership[i]); 1835 /* 1836 * Remove the gap in the membership array. 1837 */ 1838 for (++i; i < imo->imo_num_memberships; ++i) 1839 imo->imo_membership[i-1] = imo->imo_membership[i]; 1840 --imo->imo_num_memberships; 1841 break; 1842 1843 default: 1844 error = EOPNOTSUPP; 1845 break; 1846 } 1847 1848 /* 1849 * If all options have default values, no need to keep the mbuf. 1850 */ 1851 if (imo->imo_multicast_ifp == NULL && 1852 imo->imo_multicast_ttl == IP_DEFAULT_MULTICAST_TTL && 1853 imo->imo_multicast_loop == IP_DEFAULT_MULTICAST_LOOP && 1854 imo->imo_num_memberships == 0) { 1855 free(*imop, M_IPMOPTS); 1856 *imop = NULL; 1857 } 1858 1859 return (error); 1860} 1861 1862/* 1863 * Return the IP multicast options in response to user getsockopt(). 
1864 */ 1865int 1866ip_getmoptions(struct ip_moptions *imo, struct sockopt *sopt) 1867{ 1868 struct in_addr addr; 1869 struct in_ifaddr *ia; 1870 int error; 1871 uint8_t optval; 1872 1873 error = 0; 1874 1875 switch (sopt->sopt_name) { 1876 case IP_MULTICAST_IF: 1877 if (imo == NULL || imo->imo_multicast_ifp == NULL) 1878 addr = zeroin_addr; 1879 else if (imo->imo_multicast_addr.s_addr) { 1880 /* return the value user has set */ 1881 addr = imo->imo_multicast_addr; 1882 } else { 1883 IFP_TO_IA(imo->imo_multicast_ifp, ia); 1884 addr = ia ? ia->ia_addr.sin_addr : zeroin_addr; 1885 } 1886 error = sockopt_set(sopt, &addr, sizeof(addr)); 1887 break; 1888 1889 case IP_MULTICAST_TTL: 1890 optval = imo ? imo->imo_multicast_ttl 1891 : IP_DEFAULT_MULTICAST_TTL; 1892 1893 error = sockopt_set(sopt, &optval, sizeof(optval)); 1894 break; 1895 1896 case IP_MULTICAST_LOOP: 1897 optval = imo ? imo->imo_multicast_loop 1898 : IP_DEFAULT_MULTICAST_LOOP; 1899 1900 error = sockopt_set(sopt, &optval, sizeof(optval)); 1901 break; 1902 1903 default: 1904 error = EOPNOTSUPP; 1905 } 1906 1907 return (error); 1908} 1909 1910/* 1911 * Discard the IP multicast options. 1912 */ 1913void 1914ip_freemoptions(struct ip_moptions *imo) 1915{ 1916 int i; 1917 1918 if (imo != NULL) { 1919 for (i = 0; i < imo->imo_num_memberships; ++i) 1920 in_delmulti(imo->imo_membership[i]); 1921 free(imo, M_IPMOPTS); 1922 } 1923} 1924 1925/* 1926 * Routine called from ip_output() to loop back a copy of an IP multicast 1927 * packet to the input queue of a specified interface. Note that this 1928 * calls the output routine of the loopback "driver", but with an interface 1929 * pointer that might NOT be lo0ifp -- easier than replicating that code here. 
1930 */ 1931static void 1932ip_mloopback(struct ifnet *ifp, struct mbuf *m, const struct sockaddr_in *dst) 1933{ 1934 struct ip *ip; 1935 struct mbuf *copym; 1936 1937 copym = m_copypacket(m, M_DONTWAIT); 1938 if (copym != NULL 1939 && (copym->m_flags & M_EXT || copym->m_len < sizeof(struct ip))) 1940 copym = m_pullup(copym, sizeof(struct ip)); 1941 if (copym == NULL) 1942 return; 1943 /* 1944 * We don't bother to fragment if the IP length is greater 1945 * than the interface's MTU. Can this possibly matter? 1946 */ 1947 ip = mtod(copym, struct ip *); 1948 1949 if (copym->m_pkthdr.csum_flags & (M_CSUM_TCPv4|M_CSUM_UDPv4)) { 1950 in_delayed_cksum(copym); 1951 copym->m_pkthdr.csum_flags &= 1952 ~(M_CSUM_TCPv4|M_CSUM_UDPv4); 1953 } 1954 1955 ip->ip_sum = 0; 1956 ip->ip_sum = in_cksum(copym, ip->ip_hl << 2); 1957 (void)looutput(ifp, copym, sintocsa(dst), NULL); 1958} 1959