1/*- 2 * Copyright (c) 1982, 1986, 1988, 1993 3 * The Regents of the University of California. 4 * All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 4. Neither the name of the University nor the names of its contributors 15 * may be used to endorse or promote products derived from this software 16 * without specific prior written permission. 17 * 18 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 21 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 28 * SUCH DAMAGE. 29 * 30 * @(#)raw_ip.c 8.7 (Berkeley) 5/15/95 31 */ 32 33#include <sys/cdefs.h> 34__FBSDID("$FreeBSD$"); 35 36#include "opt_inet.h" 37#include "opt_inet6.h" 38#include "opt_ipsec.h" 39 40#include <sys/param.h> 41#include <sys/jail.h> 42#include <sys/kernel.h> 43#include <sys/lock.h> 44#include <sys/malloc.h> 45#include <sys/mbuf.h> 46#include <sys/priv.h> 47#include <sys/proc.h> 48#include <sys/protosw.h> 49#include <sys/rwlock.h> 50#include <sys/signalvar.h> 51#include <sys/socket.h> 52#include <sys/socketvar.h> 53#include <sys/sx.h> 54#include <sys/sysctl.h> 55#include <sys/systm.h> 56 57#include <vm/uma.h> 58 59#include <net/if.h> 60#include <net/route.h> 61#include <net/vnet.h> 62 63#include <netinet/in.h> 64#include <netinet/in_systm.h> 65#include <netinet/in_pcb.h> 66#include <netinet/in_var.h> 67#include <netinet/if_ether.h> 68#include <netinet/ip.h> 69#include <netinet/ip_var.h> 70#include <netinet/ip_mroute.h> 71 72#ifdef IPSEC 73#include <netipsec/ipsec.h> 74#endif /*IPSEC*/ 75 76#include <security/mac/mac_framework.h> 77 78VNET_DEFINE(int, ip_defttl) = IPDEFTTL; 79SYSCTL_VNET_INT(_net_inet_ip, IPCTL_DEFTTL, ttl, CTLFLAG_RW, 80 &VNET_NAME(ip_defttl), 0, 81 "Maximum TTL on IP packets"); 82 83VNET_DEFINE(struct inpcbhead, ripcb); 84VNET_DEFINE(struct inpcbinfo, ripcbinfo); 85 86#define V_ripcb VNET(ripcb) 87#define V_ripcbinfo VNET(ripcbinfo) 88 89/* 90 * Control and data hooks for ipfw, dummynet, divert and so on. 91 * The data hooks are not used here but it is convenient 92 * to keep them all in one place. 93 */ 94VNET_DEFINE(ip_fw_chk_ptr_t, ip_fw_chk_ptr) = NULL; 95VNET_DEFINE(ip_fw_ctl_ptr_t, ip_fw_ctl_ptr) = NULL; 96 97int (*ip_dn_ctl_ptr)(struct sockopt *); 98int (*ip_dn_io_ptr)(struct mbuf **, int, struct ip_fw_args *); 99void (*ip_divert_ptr)(struct mbuf *, int); 100int (*ng_ipfw_input_p)(struct mbuf **, int, 101 struct ip_fw_args *, int); 102 103/* Hook for telling pf that the destination address changed */ 104void (*m_addr_chg_pf_p)(struct mbuf *m); 105 106#ifdef INET 107/* 108 * Hooks for multicast routing. They all default to NULL, so leave them not 109 * initialized and rely on BSS being set to 0. 110 */ 111 112/* 113 * The socket used to communicate with the multicast routing daemon. 114 */ 115VNET_DEFINE(struct socket *, ip_mrouter); 116 117/* 118 * The various mrouter and rsvp functions. 119 */ 120int (*ip_mrouter_set)(struct socket *, struct sockopt *); 121int (*ip_mrouter_get)(struct socket *, struct sockopt *); 122int (*ip_mrouter_done)(void); 123int (*ip_mforward)(struct ip *, struct ifnet *, struct mbuf *, 124 struct ip_moptions *); 125int (*mrt_ioctl)(u_long, caddr_t, int); 126int (*legal_vif_num)(int); 127u_long (*ip_mcast_src)(int); 128 129void (*rsvp_input_p)(struct mbuf *m, int off); 130int (*ip_rsvp_vif)(struct socket *, struct sockopt *); 131void (*ip_rsvp_force_done)(struct socket *); 132#endif /* INET */ 133 134u_long rip_sendspace = 9216; 135SYSCTL_ULONG(_net_inet_raw, OID_AUTO, maxdgram, CTLFLAG_RW, 136 &rip_sendspace, 0, "Maximum outgoing raw IP datagram size"); 137 138u_long rip_recvspace = 9216; 139SYSCTL_ULONG(_net_inet_raw, OID_AUTO, recvspace, CTLFLAG_RW, 140 &rip_recvspace, 0, "Maximum space for incoming raw IP datagrams"); 141 142/* 143 * Hash functions 144 */ 145 146#define INP_PCBHASH_RAW_SIZE 256 147#define INP_PCBHASH_RAW(proto, laddr, faddr, mask) \ 148 (((proto) + (laddr) + (faddr)) % (mask) + 1) 149 150#ifdef INET 151static void 152rip_inshash(struct inpcb *inp) 153{ 154 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; 155 struct inpcbhead *pcbhash; 156 int hash; 157 158 INP_INFO_WLOCK_ASSERT(pcbinfo); 159 INP_WLOCK_ASSERT(inp); 160 161 if (inp->inp_ip_p != 0 && 162 inp->inp_laddr.s_addr != INADDR_ANY && 163 inp->inp_faddr.s_addr != INADDR_ANY) { 164 hash = INP_PCBHASH_RAW(inp->inp_ip_p, inp->inp_laddr.s_addr, 165 inp->inp_faddr.s_addr, pcbinfo->ipi_hashmask); 166 } else 167 hash = 0; 168 pcbhash = &pcbinfo->ipi_hashbase[hash]; 169 LIST_INSERT_HEAD(pcbhash, inp, inp_hash); 170} 171 172static void 173rip_delhash(struct inpcb *inp) 174{ 175 176 INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo); 177 INP_WLOCK_ASSERT(inp); 178 179 LIST_REMOVE(inp, inp_hash); 180} 181#endif /* INET */ 182 183/* 184 * Raw interface to IP protocol. 185 */ 186 187/* 188 * Initialize raw connection block q. 189 */ 190static void 191rip_zone_change(void *tag) 192{ 193 194 uma_zone_set_max(V_ripcbinfo.ipi_zone, maxsockets); 195} 196 197static int 198rip_inpcb_init(void *mem, int size, int flags) 199{ 200 struct inpcb *inp = mem; 201 202 INP_LOCK_INIT(inp, "inp", "rawinp"); 203 return (0); 204} 205 206void 207rip_init(void) 208{ 209 210 in_pcbinfo_init(&V_ripcbinfo, "rip", &V_ripcb, INP_PCBHASH_RAW_SIZE, 211 1, "ripcb", rip_inpcb_init, NULL, UMA_ZONE_NOFREE, 212 IPI_HASHFIELDS_NONE); 213 EVENTHANDLER_REGISTER(maxsockets_change, rip_zone_change, NULL, 214 EVENTHANDLER_PRI_ANY); 215} 216 217#ifdef VIMAGE 218void 219rip_destroy(void) 220{ 221 222 in_pcbinfo_destroy(&V_ripcbinfo); 223} 224#endif 225 226#ifdef INET 227static int 228rip_append(struct inpcb *last, struct ip *ip, struct mbuf *n, 229 struct sockaddr_in *ripsrc) 230{ 231 int policyfail = 0; 232 233 INP_LOCK_ASSERT(last); 234 235#ifdef IPSEC 236 /* check AH/ESP integrity. */ 237 if (ipsec4_in_reject(n, last)) { 238 policyfail = 1; 239 } 240#endif /* IPSEC */ 241#ifdef MAC 242 if (!policyfail && mac_inpcb_check_deliver(last, n) != 0) 243 policyfail = 1; 244#endif 245 /* Check the minimum TTL for socket. */ 246 if (last->inp_ip_minttl && last->inp_ip_minttl > ip->ip_ttl) 247 policyfail = 1; 248 if (!policyfail) { 249 struct mbuf *opts = NULL; 250 struct socket *so; 251 252 so = last->inp_socket; 253 if ((last->inp_flags & INP_CONTROLOPTS) || 254 (so->so_options & (SO_TIMESTAMP | SO_BINTIME))) 255 ip_savecontrol(last, &opts, ip, n); 256 SOCKBUF_LOCK(&so->so_rcv); 257 if (sbappendaddr_locked(&so->so_rcv, 258 (struct sockaddr *)ripsrc, n, opts) == 0) { 259 /* should notify about lost packet */ 260 m_freem(n); 261 if (opts) 262 m_freem(opts); 263 SOCKBUF_UNLOCK(&so->so_rcv); 264 } else 265 sorwakeup_locked(so); 266 } else 267 m_freem(n); 268 return (policyfail); 269} 270 271/* 272 * Setup generic address and protocol structures for raw_input routine, then 273 * pass them along with mbuf chain. 274 */ 275void 276rip_input(struct mbuf *m, int off) 277{ 278 struct ifnet *ifp; 279 struct ip *ip = mtod(m, struct ip *); 280 int proto = ip->ip_p; 281 struct inpcb *inp, *last; 282 struct sockaddr_in ripsrc; 283 int hash; 284 285 bzero(&ripsrc, sizeof(ripsrc)); 286 ripsrc.sin_len = sizeof(ripsrc); 287 ripsrc.sin_family = AF_INET; 288 ripsrc.sin_addr = ip->ip_src; 289 last = NULL; 290 291 ifp = m->m_pkthdr.rcvif; 292 293 hash = INP_PCBHASH_RAW(proto, ip->ip_src.s_addr, 294 ip->ip_dst.s_addr, V_ripcbinfo.ipi_hashmask); 295 INP_INFO_RLOCK(&V_ripcbinfo); 296 LIST_FOREACH(inp, &V_ripcbinfo.ipi_hashbase[hash], inp_hash) { 297 if (inp->inp_ip_p != proto) 298 continue; 299#ifdef INET6 300 /* XXX inp locking */ 301 if ((inp->inp_vflag & INP_IPV4) == 0) 302 continue; 303#endif 304 if (inp->inp_laddr.s_addr != ip->ip_dst.s_addr) 305 continue; 306 if (inp->inp_faddr.s_addr != ip->ip_src.s_addr) 307 continue; 308 if (jailed_without_vnet(inp->inp_cred)) { 309 /* 310 * XXX: If faddr was bound to multicast group, 311 * jailed raw socket will drop datagram. 312 */ 313 if (prison_check_ip4(inp->inp_cred, &ip->ip_dst) != 0) 314 continue; 315 } 316 if (last != NULL) { 317 struct mbuf *n; 318 319 n = m_copy(m, 0, (int)M_COPYALL); 320 if (n != NULL) 321 (void) rip_append(last, ip, n, &ripsrc); 322 /* XXX count dropped packet */ 323 INP_RUNLOCK(last); 324 } 325 INP_RLOCK(inp); 326 last = inp; 327 } 328 LIST_FOREACH(inp, &V_ripcbinfo.ipi_hashbase[0], inp_hash) { 329 if (inp->inp_ip_p && inp->inp_ip_p != proto) 330 continue; 331#ifdef INET6 332 /* XXX inp locking */ 333 if ((inp->inp_vflag & INP_IPV4) == 0) 334 continue; 335#endif 336 if (!in_nullhost(inp->inp_laddr) && 337 !in_hosteq(inp->inp_laddr, ip->ip_dst)) 338 continue; 339 if (!in_nullhost(inp->inp_faddr) && 340 !in_hosteq(inp->inp_faddr, ip->ip_src)) 341 continue; 342 if (jailed_without_vnet(inp->inp_cred)) { 343 /* 344 * Allow raw socket in jail to receive multicast; 345 * assume process had PRIV_NETINET_RAW at attach, 346 * and fall through into normal filter path if so. 347 */ 348 if (!IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) && 349 prison_check_ip4(inp->inp_cred, &ip->ip_dst) != 0) 350 continue; 351 } 352 /* 353 * If this raw socket has multicast state, and we 354 * have received a multicast, check if this socket 355 * should receive it, as multicast filtering is now 356 * the responsibility of the transport layer. 357 */ 358 if (inp->inp_moptions != NULL && 359 IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { 360 /* 361 * If the incoming datagram is for IGMP, allow it 362 * through unconditionally to the raw socket. 363 * 364 * In the case of IGMPv2, we may not have explicitly 365 * joined the group, and may have set IFF_ALLMULTI 366 * on the interface. imo_multi_filter() may discard 367 * control traffic we actually need to see. 368 * 369 * Userland multicast routing daemons should continue 370 * filter the control traffic appropriately. 371 */ 372 int blocked; 373 374 blocked = MCAST_PASS; 375 if (proto != IPPROTO_IGMP) { 376 struct sockaddr_in group; 377 378 bzero(&group, sizeof(struct sockaddr_in)); 379 group.sin_len = sizeof(struct sockaddr_in); 380 group.sin_family = AF_INET; 381 group.sin_addr = ip->ip_dst; 382 383 blocked = imo_multi_filter(inp->inp_moptions, 384 ifp, 385 (struct sockaddr *)&group, 386 (struct sockaddr *)&ripsrc); 387 } 388 389 if (blocked != MCAST_PASS) { 390 IPSTAT_INC(ips_notmember); 391 continue; 392 } 393 } 394 if (last != NULL) { 395 struct mbuf *n; 396 397 n = m_copy(m, 0, (int)M_COPYALL); 398 if (n != NULL) 399 (void) rip_append(last, ip, n, &ripsrc); 400 /* XXX count dropped packet */ 401 INP_RUNLOCK(last); 402 } 403 INP_RLOCK(inp); 404 last = inp; 405 } 406 INP_INFO_RUNLOCK(&V_ripcbinfo); 407 if (last != NULL) { 408 if (rip_append(last, ip, m, &ripsrc) != 0) 409 IPSTAT_INC(ips_delivered); 410 INP_RUNLOCK(last); 411 } else { 412 m_freem(m); 413 IPSTAT_INC(ips_noproto); 414 IPSTAT_DEC(ips_delivered); 415 } 416} 417 418/* 419 * Generate IP header and pass packet to ip_output. Tack on options user may 420 * have setup with control call. 421 */ 422int 423rip_output(struct mbuf *m, struct socket *so, u_long dst) 424{ 425 struct ip *ip; 426 int error; 427 struct inpcb *inp = sotoinpcb(so); 428 int flags = ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0) | 429 IP_ALLOWBROADCAST; 430 431 /* 432 * If the user handed us a complete IP packet, use it. Otherwise, 433 * allocate an mbuf for a header and fill it in. 434 */ 435 if ((inp->inp_flags & INP_HDRINCL) == 0) { 436 if (m->m_pkthdr.len + sizeof(struct ip) > IP_MAXPACKET) { 437 m_freem(m); 438 return(EMSGSIZE); 439 } 440 M_PREPEND(m, sizeof(struct ip), M_DONTWAIT); 441 if (m == NULL) 442 return(ENOBUFS); 443 444 INP_RLOCK(inp); 445 ip = mtod(m, struct ip *); 446 ip->ip_tos = inp->inp_ip_tos; 447 if (inp->inp_flags & INP_DONTFRAG) 448 ip->ip_off = IP_DF; 449 else 450 ip->ip_off = 0; 451 ip->ip_p = inp->inp_ip_p; 452 ip->ip_len = m->m_pkthdr.len; 453 ip->ip_src = inp->inp_laddr; 454 if (jailed(inp->inp_cred)) { 455 /* 456 * prison_local_ip4() would be good enough but would 457 * let a source of INADDR_ANY pass, which we do not 458 * want to see from jails. We do not go through the 459 * pain of in_pcbladdr() for raw sockets. 460 */ 461 if (ip->ip_src.s_addr == INADDR_ANY) 462 error = prison_get_ip4(inp->inp_cred, 463 &ip->ip_src); 464 else 465 error = prison_local_ip4(inp->inp_cred, 466 &ip->ip_src); 467 if (error != 0) { 468 INP_RUNLOCK(inp); 469 m_freem(m); 470 return (error); 471 } 472 } 473 ip->ip_dst.s_addr = dst; 474 ip->ip_ttl = inp->inp_ip_ttl; 475 } else { 476 if (m->m_pkthdr.len > IP_MAXPACKET) { 477 m_freem(m); 478 return(EMSGSIZE); 479 } 480 INP_RLOCK(inp); 481 ip = mtod(m, struct ip *); 482 error = prison_check_ip4(inp->inp_cred, &ip->ip_src); 483 if (error != 0) { 484 INP_RUNLOCK(inp); 485 m_freem(m); 486 return (error); 487 } 488 489 /* 490 * Don't allow both user specified and setsockopt options, 491 * and don't allow packet length sizes that will crash. 492 */ 493 if (((ip->ip_hl != (sizeof (*ip) >> 2)) && inp->inp_options) 494 || (ip->ip_len > m->m_pkthdr.len) 495 || (ip->ip_len < (ip->ip_hl << 2))) { 496 INP_RUNLOCK(inp); 497 m_freem(m); 498 return (EINVAL); 499 } 500 if (ip->ip_id == 0) 501 ip->ip_id = ip_newid(); 502 503 /* 504 * XXX prevent ip_output from overwriting header fields. 505 */ 506 flags |= IP_RAWOUTPUT; 507 IPSTAT_INC(ips_rawout); 508 } 509 510 if (inp->inp_flags & INP_ONESBCAST) 511 flags |= IP_SENDONES; 512 513#ifdef MAC 514 mac_inpcb_create_mbuf(inp, m); 515#endif 516 517 error = ip_output(m, inp->inp_options, NULL, flags, 518 inp->inp_moptions, inp); 519 INP_RUNLOCK(inp); 520 return (error); 521} 522 523/* 524 * Raw IP socket option processing. 525 * 526 * IMPORTANT NOTE regarding access control: Traditionally, raw sockets could 527 * only be created by a privileged process, and as such, socket option 528 * operations to manage system properties on any raw socket were allowed to 529 * take place without explicit additional access control checks. However, 530 * raw sockets can now also be created in jail(), and therefore explicit 531 * checks are now required. Likewise, raw sockets can be used by a process 532 * after it gives up privilege, so some caution is required. For options 533 * passed down to the IP layer via ip_ctloutput(), checks are assumed to be 534 * performed in ip_ctloutput() and therefore no check occurs here. 535 * Unilaterally checking priv_check() here breaks normal IP socket option 536 * operations on raw sockets. 537 * 538 * When adding new socket options here, make sure to add access control 539 * checks here as necessary. 540 */ 541int 542rip_ctloutput(struct socket *so, struct sockopt *sopt) 543{ 544 struct inpcb *inp = sotoinpcb(so); 545 int error, optval; 546 547 if (sopt->sopt_level != IPPROTO_IP) { 548 if ((sopt->sopt_level == SOL_SOCKET) && 549 (sopt->sopt_name == SO_SETFIB)) { 550 inp->inp_inc.inc_fibnum = so->so_fibnum; 551 return (0); 552 } 553 return (EINVAL); 554 } 555 556 error = 0; 557 switch (sopt->sopt_dir) { 558 case SOPT_GET: 559 switch (sopt->sopt_name) { 560 case IP_HDRINCL: 561 optval = inp->inp_flags & INP_HDRINCL; 562 error = sooptcopyout(sopt, &optval, sizeof optval); 563 break; 564 565 case IP_FW3: /* generic ipfw v.3 functions */ 566 case IP_FW_ADD: /* ADD actually returns the body... */ 567 case IP_FW_GET: 568 case IP_FW_TABLE_GETSIZE: 569 case IP_FW_TABLE_LIST: 570 case IP_FW_NAT_GET_CONFIG: 571 case IP_FW_NAT_GET_LOG: 572 if (V_ip_fw_ctl_ptr != NULL) 573 error = V_ip_fw_ctl_ptr(sopt); 574 else 575 error = ENOPROTOOPT; 576 break; 577 578 case IP_DUMMYNET3: /* generic dummynet v.3 functions */ 579 case IP_DUMMYNET_GET: 580 if (ip_dn_ctl_ptr != NULL) 581 error = ip_dn_ctl_ptr(sopt); 582 else 583 error = ENOPROTOOPT; 584 break ; 585 586 case MRT_INIT: 587 case MRT_DONE: 588 case MRT_ADD_VIF: 589 case MRT_DEL_VIF: 590 case MRT_ADD_MFC: 591 case MRT_DEL_MFC: 592 case MRT_VERSION: 593 case MRT_ASSERT: 594 case MRT_API_SUPPORT: 595 case MRT_API_CONFIG: 596 case MRT_ADD_BW_UPCALL: 597 case MRT_DEL_BW_UPCALL: 598 error = priv_check(curthread, PRIV_NETINET_MROUTE); 599 if (error != 0) 600 return (error); 601 error = ip_mrouter_get ? ip_mrouter_get(so, sopt) : 602 EOPNOTSUPP; 603 break; 604 605 default: 606 error = ip_ctloutput(so, sopt); 607 break; 608 } 609 break; 610 611 case SOPT_SET: 612 switch (sopt->sopt_name) { 613 case IP_HDRINCL: 614 error = sooptcopyin(sopt, &optval, sizeof optval, 615 sizeof optval); 616 if (error) 617 break; 618 if (optval) 619 inp->inp_flags |= INP_HDRINCL; 620 else 621 inp->inp_flags &= ~INP_HDRINCL; 622 break; 623 624 case IP_FW3: /* generic ipfw v.3 functions */ 625 case IP_FW_ADD: 626 case IP_FW_DEL: 627 case IP_FW_FLUSH: 628 case IP_FW_ZERO: 629 case IP_FW_RESETLOG: 630 case IP_FW_TABLE_ADD: 631 case IP_FW_TABLE_DEL: 632 case IP_FW_TABLE_FLUSH: 633 case IP_FW_NAT_CFG: 634 case IP_FW_NAT_DEL: 635 if (V_ip_fw_ctl_ptr != NULL) 636 error = V_ip_fw_ctl_ptr(sopt); 637 else 638 error = ENOPROTOOPT; 639 break; 640 641 case IP_DUMMYNET3: /* generic dummynet v.3 functions */ 642 case IP_DUMMYNET_CONFIGURE: 643 case IP_DUMMYNET_DEL: 644 case IP_DUMMYNET_FLUSH: 645 if (ip_dn_ctl_ptr != NULL) 646 error = ip_dn_ctl_ptr(sopt); 647 else 648 error = ENOPROTOOPT ; 649 break ; 650 651 case IP_RSVP_ON: 652 error = priv_check(curthread, PRIV_NETINET_MROUTE); 653 if (error != 0) 654 return (error); 655 error = ip_rsvp_init(so); 656 break; 657 658 case IP_RSVP_OFF: 659 error = priv_check(curthread, PRIV_NETINET_MROUTE); 660 if (error != 0) 661 return (error); 662 error = ip_rsvp_done(); 663 break; 664 665 case IP_RSVP_VIF_ON: 666 case IP_RSVP_VIF_OFF: 667 error = priv_check(curthread, PRIV_NETINET_MROUTE); 668 if (error != 0) 669 return (error); 670 error = ip_rsvp_vif ? 671 ip_rsvp_vif(so, sopt) : EINVAL; 672 break; 673 674 case MRT_INIT: 675 case MRT_DONE: 676 case MRT_ADD_VIF: 677 case MRT_DEL_VIF: 678 case MRT_ADD_MFC: 679 case MRT_DEL_MFC: 680 case MRT_VERSION: 681 case MRT_ASSERT: 682 case MRT_API_SUPPORT: 683 case MRT_API_CONFIG: 684 case MRT_ADD_BW_UPCALL: 685 case MRT_DEL_BW_UPCALL: 686 error = priv_check(curthread, PRIV_NETINET_MROUTE); 687 if (error != 0) 688 return (error); 689 error = ip_mrouter_set ? ip_mrouter_set(so, sopt) : 690 EOPNOTSUPP; 691 break; 692 693 default: 694 error = ip_ctloutput(so, sopt); 695 break; 696 } 697 break; 698 } 699 700 return (error); 701} 702 703/* 704 * This function exists solely to receive the PRC_IFDOWN messages which are 705 * sent by if_down(). It looks for an ifaddr whose ifa_addr is sa, and calls 706 * in_ifadown() to remove all routes corresponding to that address. It also 707 * receives the PRC_IFUP messages from if_up() and reinstalls the interface 708 * routes. 709 */ 710void 711rip_ctlinput(int cmd, struct sockaddr *sa, void *vip) 712{ 713 struct in_ifaddr *ia; 714 struct ifnet *ifp; 715 int err; 716 int flags; 717 718 switch (cmd) { 719 case PRC_IFDOWN: 720 IN_IFADDR_RLOCK(); 721 TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { 722 if (ia->ia_ifa.ifa_addr == sa 723 && (ia->ia_flags & IFA_ROUTE)) { 724 ifa_ref(&ia->ia_ifa); 725 IN_IFADDR_RUNLOCK(); 726 /* 727 * in_ifscrub kills the interface route. 728 */ 729 in_ifscrub(ia->ia_ifp, ia, 0); 730 /* 731 * in_ifadown gets rid of all the rest of the 732 * routes. This is not quite the right thing 733 * to do, but at least if we are running a 734 * routing process they will come back. 735 */ 736 in_ifadown(&ia->ia_ifa, 0); 737 ifa_free(&ia->ia_ifa); 738 break; 739 } 740 } 741 if (ia == NULL) /* If ia matched, already unlocked. */ 742 IN_IFADDR_RUNLOCK(); 743 break; 744 745 case PRC_IFUP: 746 IN_IFADDR_RLOCK(); 747 TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { 748 if (ia->ia_ifa.ifa_addr == sa) 749 break; 750 } 751 if (ia == NULL || (ia->ia_flags & IFA_ROUTE)) { 752 IN_IFADDR_RUNLOCK(); 753 return; 754 } 755 ifa_ref(&ia->ia_ifa); 756 IN_IFADDR_RUNLOCK(); 757 flags = RTF_UP; 758 ifp = ia->ia_ifa.ifa_ifp; 759 760 if ((ifp->if_flags & IFF_LOOPBACK) 761 || (ifp->if_flags & IFF_POINTOPOINT)) 762 flags |= RTF_HOST; 763 764 err = ifa_del_loopback_route((struct ifaddr *)ia, sa); 765 if (err == 0) 766 ia->ia_flags &= ~IFA_RTSELF; 767 768 err = rtinit(&ia->ia_ifa, RTM_ADD, flags); 769 if (err == 0) 770 ia->ia_flags |= IFA_ROUTE; 771 772 err = ifa_add_loopback_route((struct ifaddr *)ia, sa); 773 if (err == 0) 774 ia->ia_flags |= IFA_RTSELF; 775 776 ifa_free(&ia->ia_ifa); 777 break; 778 } 779} 780 781static int 782rip_attach(struct socket *so, int proto, struct thread *td) 783{ 784 struct inpcb *inp; 785 int error; 786 787 inp = sotoinpcb(so); 788 KASSERT(inp == NULL, ("rip_attach: inp != NULL")); 789 790 error = priv_check(td, PRIV_NETINET_RAW); 791 if (error) 792 return (error); 793 if (proto >= IPPROTO_MAX || proto < 0) 794 return EPROTONOSUPPORT; 795 error = soreserve(so, rip_sendspace, rip_recvspace); 796 if (error) 797 return (error); 798 INP_INFO_WLOCK(&V_ripcbinfo); 799 error = in_pcballoc(so, &V_ripcbinfo); 800 if (error) { 801 INP_INFO_WUNLOCK(&V_ripcbinfo); 802 return (error); 803 } 804 inp = (struct inpcb *)so->so_pcb; 805 inp->inp_vflag |= INP_IPV4; 806 inp->inp_ip_p = proto; 807 inp->inp_ip_ttl = V_ip_defttl; 808 rip_inshash(inp); 809 INP_INFO_WUNLOCK(&V_ripcbinfo); 810 INP_WUNLOCK(inp); 811 return (0); 812} 813 814static void 815rip_detach(struct socket *so) 816{ 817 struct inpcb *inp; 818 819 inp = sotoinpcb(so); 820 KASSERT(inp != NULL, ("rip_detach: inp == NULL")); 821 KASSERT(inp->inp_faddr.s_addr == INADDR_ANY, 822 ("rip_detach: not closed")); 823 824 INP_INFO_WLOCK(&V_ripcbinfo); 825 INP_WLOCK(inp); 826 rip_delhash(inp); 827 if (so == V_ip_mrouter && ip_mrouter_done) 828 ip_mrouter_done(); 829 if (ip_rsvp_force_done) 830 ip_rsvp_force_done(so); 831 if (so == V_ip_rsvpd) 832 ip_rsvp_done(); 833 in_pcbdetach(inp); 834 in_pcbfree(inp); 835 INP_INFO_WUNLOCK(&V_ripcbinfo); 836} 837 838static void 839rip_dodisconnect(struct socket *so, struct inpcb *inp) 840{ 841 struct inpcbinfo *pcbinfo; 842 843 pcbinfo = inp->inp_pcbinfo; 844 INP_INFO_WLOCK(pcbinfo); 845 INP_WLOCK(inp); 846 rip_delhash(inp); 847 inp->inp_faddr.s_addr = INADDR_ANY; 848 rip_inshash(inp); 849 SOCK_LOCK(so); 850 so->so_state &= ~SS_ISCONNECTED; 851 SOCK_UNLOCK(so); 852 INP_WUNLOCK(inp); 853 INP_INFO_WUNLOCK(pcbinfo); 854} 855 856static void 857rip_abort(struct socket *so) 858{ 859 struct inpcb *inp; 860 861 inp = sotoinpcb(so); 862 KASSERT(inp != NULL, ("rip_abort: inp == NULL")); 863 864 rip_dodisconnect(so, inp); 865} 866 867static void 868rip_close(struct socket *so) 869{ 870 struct inpcb *inp; 871 872 inp = sotoinpcb(so); 873 KASSERT(inp != NULL, ("rip_close: inp == NULL")); 874 875 rip_dodisconnect(so, inp); 876} 877 878static int 879rip_disconnect(struct socket *so) 880{ 881 struct inpcb *inp; 882 883 if ((so->so_state & SS_ISCONNECTED) == 0) 884 return (ENOTCONN); 885 886 inp = sotoinpcb(so); 887 KASSERT(inp != NULL, ("rip_disconnect: inp == NULL")); 888 889 rip_dodisconnect(so, inp); 890 return (0); 891} 892 893static int 894rip_bind(struct socket *so, struct sockaddr *nam, struct thread *td) 895{ 896 struct sockaddr_in *addr = (struct sockaddr_in *)nam; 897 struct inpcb *inp; 898 int error; 899 900 if (nam->sa_len != sizeof(*addr)) 901 return (EINVAL); 902 903 error = prison_check_ip4(td->td_ucred, &addr->sin_addr); 904 if (error != 0) 905 return (error); 906 907 inp = sotoinpcb(so); 908 KASSERT(inp != NULL, ("rip_bind: inp == NULL")); 909 910 if (TAILQ_EMPTY(&V_ifnet) || 911 (addr->sin_family != AF_INET && addr->sin_family != AF_IMPLINK) || 912 (addr->sin_addr.s_addr && 913 (inp->inp_flags & INP_BINDANY) == 0 && 914 ifa_ifwithaddr_check((struct sockaddr *)addr) == 0)) 915 return (EADDRNOTAVAIL); 916 917 INP_INFO_WLOCK(&V_ripcbinfo); 918 INP_WLOCK(inp); 919 rip_delhash(inp); 920 inp->inp_laddr = addr->sin_addr; 921 rip_inshash(inp); 922 INP_WUNLOCK(inp); 923 INP_INFO_WUNLOCK(&V_ripcbinfo); 924 return (0); 925} 926 927static int 928rip_connect(struct socket *so, struct sockaddr *nam, struct thread *td) 929{ 930 struct sockaddr_in *addr = (struct sockaddr_in *)nam; 931 struct inpcb *inp; 932 933 if (nam->sa_len != sizeof(*addr)) 934 return (EINVAL); 935 if (TAILQ_EMPTY(&V_ifnet)) 936 return (EADDRNOTAVAIL); 937 if (addr->sin_family != AF_INET && addr->sin_family != AF_IMPLINK) 938 return (EAFNOSUPPORT); 939 940 inp = sotoinpcb(so); 941 KASSERT(inp != NULL, ("rip_connect: inp == NULL")); 942 943 INP_INFO_WLOCK(&V_ripcbinfo); 944 INP_WLOCK(inp); 945 rip_delhash(inp); 946 inp->inp_faddr = addr->sin_addr; 947 rip_inshash(inp); 948 soisconnected(so); 949 INP_WUNLOCK(inp); 950 INP_INFO_WUNLOCK(&V_ripcbinfo); 951 return (0); 952} 953 954static int 955rip_shutdown(struct socket *so) 956{ 957 struct inpcb *inp; 958 959 inp = sotoinpcb(so); 960 KASSERT(inp != NULL, ("rip_shutdown: inp == NULL")); 961 962 INP_WLOCK(inp); 963 socantsendmore(so); 964 INP_WUNLOCK(inp); 965 return (0); 966} 967 968static int 969rip_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, 970 struct mbuf *control, struct thread *td) 971{ 972 struct inpcb *inp; 973 u_long dst; 974 975 inp = sotoinpcb(so); 976 KASSERT(inp != NULL, ("rip_send: inp == NULL")); 977 978 /* 979 * Note: 'dst' reads below are unlocked. 980 */ 981 if (so->so_state & SS_ISCONNECTED) { 982 if (nam) { 983 m_freem(m); 984 return (EISCONN); 985 } 986 dst = inp->inp_faddr.s_addr; /* Unlocked read. */ 987 } else { 988 if (nam == NULL) { 989 m_freem(m); 990 return (ENOTCONN); 991 } 992 dst = ((struct sockaddr_in *)nam)->sin_addr.s_addr; 993 } 994 return (rip_output(m, so, dst)); 995} 996#endif /* INET */ 997 998static int 999rip_pcblist(SYSCTL_HANDLER_ARGS) 1000{ 1001 int error, i, n; 1002 struct inpcb *inp, **inp_list; 1003 inp_gen_t gencnt; 1004 struct xinpgen xig; 1005 1006 /* 1007 * The process of preparing the TCB list is too time-consuming and 1008 * resource-intensive to repeat twice on every request. 1009 */ 1010 if (req->oldptr == 0) { 1011 n = V_ripcbinfo.ipi_count; 1012 n += imax(n / 8, 10); 1013 req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xinpcb); 1014 return (0); 1015 } 1016 1017 if (req->newptr != 0) 1018 return (EPERM); 1019 1020 /* 1021 * OK, now we're committed to doing something. 1022 */ 1023 INP_INFO_RLOCK(&V_ripcbinfo); 1024 gencnt = V_ripcbinfo.ipi_gencnt; 1025 n = V_ripcbinfo.ipi_count; 1026 INP_INFO_RUNLOCK(&V_ripcbinfo); 1027 1028 xig.xig_len = sizeof xig; 1029 xig.xig_count = n; 1030 xig.xig_gen = gencnt; 1031 xig.xig_sogen = so_gencnt; 1032 error = SYSCTL_OUT(req, &xig, sizeof xig); 1033 if (error) 1034 return (error); 1035 1036 inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); 1037 if (inp_list == 0) 1038 return (ENOMEM); 1039 1040 INP_INFO_RLOCK(&V_ripcbinfo); 1041 for (inp = LIST_FIRST(V_ripcbinfo.ipi_listhead), i = 0; inp && i < n; 1042 inp = LIST_NEXT(inp, inp_list)) { 1043 INP_WLOCK(inp); 1044 if (inp->inp_gencnt <= gencnt && 1045 cr_canseeinpcb(req->td->td_ucred, inp) == 0) { 1046 in_pcbref(inp); 1047 inp_list[i++] = inp; 1048 } 1049 INP_WUNLOCK(inp); 1050 } 1051 INP_INFO_RUNLOCK(&V_ripcbinfo); 1052 n = i; 1053 1054 error = 0; 1055 for (i = 0; i < n; i++) { 1056 inp = inp_list[i]; 1057 INP_RLOCK(inp); 1058 if (inp->inp_gencnt <= gencnt) { 1059 struct xinpcb xi; 1060 1061 bzero(&xi, sizeof(xi)); 1062 xi.xi_len = sizeof xi; 1063 /* XXX should avoid extra copy */ 1064 bcopy(inp, &xi.xi_inp, sizeof *inp); 1065 if (inp->inp_socket) 1066 sotoxsocket(inp->inp_socket, &xi.xi_socket); 1067 INP_RUNLOCK(inp); 1068 error = SYSCTL_OUT(req, &xi, sizeof xi); 1069 } else 1070 INP_RUNLOCK(inp); 1071 } 1072 INP_INFO_WLOCK(&V_ripcbinfo); 1073 for (i = 0; i < n; i++) { 1074 inp = inp_list[i]; 1075 INP_RLOCK(inp); 1076 if (!in_pcbrele_rlocked(inp)) 1077 INP_RUNLOCK(inp); 1078 } 1079 INP_INFO_WUNLOCK(&V_ripcbinfo); 1080 1081 if (!error) { 1082 /* 1083 * Give the user an updated idea of our state. If the 1084 * generation differs from what we told her before, she knows 1085 * that something happened while we were processing this 1086 * request, and it might be necessary to retry. 1087 */ 1088 INP_INFO_RLOCK(&V_ripcbinfo); 1089 xig.xig_gen = V_ripcbinfo.ipi_gencnt; 1090 xig.xig_sogen = so_gencnt; 1091 xig.xig_count = V_ripcbinfo.ipi_count; 1092 INP_INFO_RUNLOCK(&V_ripcbinfo); 1093 error = SYSCTL_OUT(req, &xig, sizeof xig); 1094 } 1095 free(inp_list, M_TEMP); 1096 return (error); 1097} 1098 1099SYSCTL_PROC(_net_inet_raw, OID_AUTO/*XXX*/, pcblist, 1100 CTLTYPE_OPAQUE | CTLFLAG_RD, NULL, 0, 1101 rip_pcblist, "S,xinpcb", "List of active raw IP sockets"); 1102 1103#ifdef INET 1104struct pr_usrreqs rip_usrreqs = { 1105 .pru_abort = rip_abort, 1106 .pru_attach = rip_attach, 1107 .pru_bind = rip_bind, 1108 .pru_connect = rip_connect, 1109 .pru_control = in_control, 1110 .pru_detach = rip_detach, 1111 .pru_disconnect = rip_disconnect, 1112 .pru_peeraddr = in_getpeeraddr, 1113 .pru_send = rip_send, 1114 .pru_shutdown = rip_shutdown, 1115 .pru_sockaddr = in_getsockaddr, 1116 .pru_sosetlabel = in_pcbsosetlabel, 1117 .pru_close = rip_close, 1118}; 1119#endif /* INET */ 1120