raw_ip.c revision 220880
1/*- 2 * Copyright (c) 1982, 1986, 1988, 1993 3 * The Regents of the University of California. 4 * All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 4. Neither the name of the University nor the names of its contributors 15 * may be used to endorse or promote products derived from this software 16 * without specific prior written permission. 17 * 18 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 21 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 28 * SUCH DAMAGE. 29 * 30 * @(#)raw_ip.c 8.7 (Berkeley) 5/15/95 31 */ 32 33#include <sys/cdefs.h> 34__FBSDID("$FreeBSD: head/sys/netinet/raw_ip.c 220880 2011-04-20 08:03:22Z bz $"); 35 36#include "opt_inet.h" 37#include "opt_inet6.h" 38#include "opt_ipsec.h" 39 40#include <sys/param.h> 41#include <sys/jail.h> 42#include <sys/kernel.h> 43#include <sys/lock.h> 44#include <sys/malloc.h> 45#include <sys/mbuf.h> 46#include <sys/priv.h> 47#include <sys/proc.h> 48#include <sys/protosw.h> 49#include <sys/rwlock.h> 50#include <sys/signalvar.h> 51#include <sys/socket.h> 52#include <sys/socketvar.h> 53#include <sys/sx.h> 54#include <sys/sysctl.h> 55#include <sys/systm.h> 56 57#include <vm/uma.h> 58 59#include <net/if.h> 60#include <net/route.h> 61#include <net/vnet.h> 62 63#include <netinet/in.h> 64#include <netinet/in_systm.h> 65#include <netinet/in_pcb.h> 66#include <netinet/in_var.h> 67#include <netinet/ip.h> 68#include <netinet/ip_var.h> 69#include <netinet/ip_mroute.h> 70 71#ifdef IPSEC 72#include <netipsec/ipsec.h> 73#endif /*IPSEC*/ 74 75#include <security/mac/mac_framework.h> 76 77VNET_DEFINE(struct inpcbhead, ripcb); 78VNET_DEFINE(struct inpcbinfo, ripcbinfo); 79 80#define V_ripcb VNET(ripcb) 81#define V_ripcbinfo VNET(ripcbinfo) 82 83/* 84 * Control and data hooks for ipfw, dummynet, divert and so on. 85 * The data hooks are not used here but it is convenient 86 * to keep them all in one place. 87 */ 88VNET_DEFINE(ip_fw_chk_ptr_t, ip_fw_chk_ptr) = NULL; 89VNET_DEFINE(ip_fw_ctl_ptr_t, ip_fw_ctl_ptr) = NULL; 90 91int (*ip_dn_ctl_ptr)(struct sockopt *); 92int (*ip_dn_io_ptr)(struct mbuf **, int, struct ip_fw_args *); 93void (*ip_divert_ptr)(struct mbuf *, int); 94int (*ng_ipfw_input_p)(struct mbuf **, int, 95 struct ip_fw_args *, int); 96 97#ifdef INET 98/* 99 * Hooks for multicast routing. They all default to NULL, so leave them not 100 * initialized and rely on BSS being set to 0. 101 */ 102 103/* 104 * The socket used to communicate with the multicast routing daemon. 105 */ 106VNET_DEFINE(struct socket *, ip_mrouter); 107 108/* 109 * The various mrouter and rsvp functions. 110 */ 111int (*ip_mrouter_set)(struct socket *, struct sockopt *); 112int (*ip_mrouter_get)(struct socket *, struct sockopt *); 113int (*ip_mrouter_done)(void); 114int (*ip_mforward)(struct ip *, struct ifnet *, struct mbuf *, 115 struct ip_moptions *); 116int (*mrt_ioctl)(u_long, caddr_t, int); 117int (*legal_vif_num)(int); 118u_long (*ip_mcast_src)(int); 119 120void (*rsvp_input_p)(struct mbuf *m, int off); 121int (*ip_rsvp_vif)(struct socket *, struct sockopt *); 122void (*ip_rsvp_force_done)(struct socket *); 123#endif /* INET */ 124 125u_long rip_sendspace = 9216; 126SYSCTL_ULONG(_net_inet_raw, OID_AUTO, maxdgram, CTLFLAG_RW, 127 &rip_sendspace, 0, "Maximum outgoing raw IP datagram size"); 128 129u_long rip_recvspace = 9216; 130SYSCTL_ULONG(_net_inet_raw, OID_AUTO, recvspace, CTLFLAG_RW, 131 &rip_recvspace, 0, "Maximum space for incoming raw IP datagrams"); 132 133/* 134 * Hash functions 135 */ 136 137#define INP_PCBHASH_RAW_SIZE 256 138#define INP_PCBHASH_RAW(proto, laddr, faddr, mask) \ 139 (((proto) + (laddr) + (faddr)) % (mask) + 1) 140 141#ifdef INET 142static void 143rip_inshash(struct inpcb *inp) 144{ 145 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; 146 struct inpcbhead *pcbhash; 147 int hash; 148 149 INP_INFO_WLOCK_ASSERT(pcbinfo); 150 INP_WLOCK_ASSERT(inp); 151 152 if (inp->inp_ip_p != 0 && 153 inp->inp_laddr.s_addr != INADDR_ANY && 154 inp->inp_faddr.s_addr != INADDR_ANY) { 155 hash = INP_PCBHASH_RAW(inp->inp_ip_p, inp->inp_laddr.s_addr, 156 inp->inp_faddr.s_addr, pcbinfo->ipi_hashmask); 157 } else 158 hash = 0; 159 pcbhash = &pcbinfo->ipi_hashbase[hash]; 160 LIST_INSERT_HEAD(pcbhash, inp, inp_hash); 161} 162 163static void 164rip_delhash(struct inpcb *inp) 165{ 166 167 INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo); 168 INP_WLOCK_ASSERT(inp); 169 170 LIST_REMOVE(inp, inp_hash); 171} 172#endif /* INET */ 173 174/* 175 * Raw interface to IP protocol. 176 */ 177 178/* 179 * Initialize raw connection block q. 180 */ 181static void 182rip_zone_change(void *tag) 183{ 184 185 uma_zone_set_max(V_ripcbinfo.ipi_zone, maxsockets); 186} 187 188static int 189rip_inpcb_init(void *mem, int size, int flags) 190{ 191 struct inpcb *inp = mem; 192 193 INP_LOCK_INIT(inp, "inp", "rawinp"); 194 return (0); 195} 196 197void 198rip_init(void) 199{ 200 201 in_pcbinfo_init(&V_ripcbinfo, "rip", &V_ripcb, INP_PCBHASH_RAW_SIZE, 202 1, "ripcb", rip_inpcb_init, NULL, UMA_ZONE_NOFREE); 203 EVENTHANDLER_REGISTER(maxsockets_change, rip_zone_change, NULL, 204 EVENTHANDLER_PRI_ANY); 205} 206 207#ifdef VIMAGE 208void 209rip_destroy(void) 210{ 211 212 in_pcbinfo_destroy(&V_ripcbinfo); 213} 214#endif 215 216#ifdef INET 217static int 218rip_append(struct inpcb *last, struct ip *ip, struct mbuf *n, 219 struct sockaddr_in *ripsrc) 220{ 221 int policyfail = 0; 222 223 INP_RLOCK_ASSERT(last); 224 225#ifdef IPSEC 226 /* check AH/ESP integrity. */ 227 if (ipsec4_in_reject(n, last)) { 228 policyfail = 1; 229 } 230#endif /* IPSEC */ 231#ifdef MAC 232 if (!policyfail && mac_inpcb_check_deliver(last, n) != 0) 233 policyfail = 1; 234#endif 235 /* Check the minimum TTL for socket. */ 236 if (last->inp_ip_minttl && last->inp_ip_minttl > ip->ip_ttl) 237 policyfail = 1; 238 if (!policyfail) { 239 struct mbuf *opts = NULL; 240 struct socket *so; 241 242 so = last->inp_socket; 243 if ((last->inp_flags & INP_CONTROLOPTS) || 244 (so->so_options & (SO_TIMESTAMP | SO_BINTIME))) 245 ip_savecontrol(last, &opts, ip, n); 246 SOCKBUF_LOCK(&so->so_rcv); 247 if (sbappendaddr_locked(&so->so_rcv, 248 (struct sockaddr *)ripsrc, n, opts) == 0) { 249 /* should notify about lost packet */ 250 m_freem(n); 251 if (opts) 252 m_freem(opts); 253 SOCKBUF_UNLOCK(&so->so_rcv); 254 } else 255 sorwakeup_locked(so); 256 } else 257 m_freem(n); 258 return (policyfail); 259} 260 261/* 262 * Setup generic address and protocol structures for raw_input routine, then 263 * pass them along with mbuf chain. 264 */ 265void 266rip_input(struct mbuf *m, int off) 267{ 268 struct ifnet *ifp; 269 struct ip *ip = mtod(m, struct ip *); 270 int proto = ip->ip_p; 271 struct inpcb *inp, *last; 272 struct sockaddr_in ripsrc; 273 int hash; 274 275 bzero(&ripsrc, sizeof(ripsrc)); 276 ripsrc.sin_len = sizeof(ripsrc); 277 ripsrc.sin_family = AF_INET; 278 ripsrc.sin_addr = ip->ip_src; 279 last = NULL; 280 281 ifp = m->m_pkthdr.rcvif; 282 283 hash = INP_PCBHASH_RAW(proto, ip->ip_src.s_addr, 284 ip->ip_dst.s_addr, V_ripcbinfo.ipi_hashmask); 285 INP_INFO_RLOCK(&V_ripcbinfo); 286 LIST_FOREACH(inp, &V_ripcbinfo.ipi_hashbase[hash], inp_hash) { 287 if (inp->inp_ip_p != proto) 288 continue; 289#ifdef INET6 290 /* XXX inp locking */ 291 if ((inp->inp_vflag & INP_IPV4) == 0) 292 continue; 293#endif 294 if (inp->inp_laddr.s_addr != ip->ip_dst.s_addr) 295 continue; 296 if (inp->inp_faddr.s_addr != ip->ip_src.s_addr) 297 continue; 298 if (jailed_without_vnet(inp->inp_cred)) { 299 /* 300 * XXX: If faddr was bound to multicast group, 301 * jailed raw socket will drop datagram. 302 */ 303 if (prison_check_ip4(inp->inp_cred, &ip->ip_dst) != 0) 304 continue; 305 } 306 if (last != NULL) { 307 struct mbuf *n; 308 309 n = m_copy(m, 0, (int)M_COPYALL); 310 if (n != NULL) 311 (void) rip_append(last, ip, n, &ripsrc); 312 /* XXX count dropped packet */ 313 INP_RUNLOCK(last); 314 } 315 INP_RLOCK(inp); 316 last = inp; 317 } 318 LIST_FOREACH(inp, &V_ripcbinfo.ipi_hashbase[0], inp_hash) { 319 if (inp->inp_ip_p && inp->inp_ip_p != proto) 320 continue; 321#ifdef INET6 322 /* XXX inp locking */ 323 if ((inp->inp_vflag & INP_IPV4) == 0) 324 continue; 325#endif 326 if (!in_nullhost(inp->inp_laddr) && 327 !in_hosteq(inp->inp_laddr, ip->ip_dst)) 328 continue; 329 if (!in_nullhost(inp->inp_faddr) && 330 !in_hosteq(inp->inp_faddr, ip->ip_src)) 331 continue; 332 if (jailed_without_vnet(inp->inp_cred)) { 333 /* 334 * Allow raw socket in jail to receive multicast; 335 * assume process had PRIV_NETINET_RAW at attach, 336 * and fall through into normal filter path if so. 337 */ 338 if (!IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) && 339 prison_check_ip4(inp->inp_cred, &ip->ip_dst) != 0) 340 continue; 341 } 342 /* 343 * If this raw socket has multicast state, and we 344 * have received a multicast, check if this socket 345 * should receive it, as multicast filtering is now 346 * the responsibility of the transport layer. 347 */ 348 if (inp->inp_moptions != NULL && 349 IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { 350 /* 351 * If the incoming datagram is for IGMP, allow it 352 * through unconditionally to the raw socket. 353 * 354 * In the case of IGMPv2, we may not have explicitly 355 * joined the group, and may have set IFF_ALLMULTI 356 * on the interface. imo_multi_filter() may discard 357 * control traffic we actually need to see. 358 * 359 * Userland multicast routing daemons should continue 360 * filter the control traffic appropriately. 361 */ 362 int blocked; 363 364 blocked = MCAST_PASS; 365 if (proto != IPPROTO_IGMP) { 366 struct sockaddr_in group; 367 368 bzero(&group, sizeof(struct sockaddr_in)); 369 group.sin_len = sizeof(struct sockaddr_in); 370 group.sin_family = AF_INET; 371 group.sin_addr = ip->ip_dst; 372 373 blocked = imo_multi_filter(inp->inp_moptions, 374 ifp, 375 (struct sockaddr *)&group, 376 (struct sockaddr *)&ripsrc); 377 } 378 379 if (blocked != MCAST_PASS) { 380 IPSTAT_INC(ips_notmember); 381 continue; 382 } 383 } 384 if (last != NULL) { 385 struct mbuf *n; 386 387 n = m_copy(m, 0, (int)M_COPYALL); 388 if (n != NULL) 389 (void) rip_append(last, ip, n, &ripsrc); 390 /* XXX count dropped packet */ 391 INP_RUNLOCK(last); 392 } 393 INP_RLOCK(inp); 394 last = inp; 395 } 396 INP_INFO_RUNLOCK(&V_ripcbinfo); 397 if (last != NULL) { 398 if (rip_append(last, ip, m, &ripsrc) != 0) 399 IPSTAT_INC(ips_delivered); 400 INP_RUNLOCK(last); 401 } else { 402 m_freem(m); 403 IPSTAT_INC(ips_noproto); 404 IPSTAT_DEC(ips_delivered); 405 } 406} 407 408/* 409 * Generate IP header and pass packet to ip_output. Tack on options user may 410 * have setup with control call. 411 */ 412int 413rip_output(struct mbuf *m, struct socket *so, u_long dst) 414{ 415 struct ip *ip; 416 int error; 417 struct inpcb *inp = sotoinpcb(so); 418 int flags = ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0) | 419 IP_ALLOWBROADCAST; 420 421 /* 422 * If the user handed us a complete IP packet, use it. Otherwise, 423 * allocate an mbuf for a header and fill it in. 424 */ 425 if ((inp->inp_flags & INP_HDRINCL) == 0) { 426 if (m->m_pkthdr.len + sizeof(struct ip) > IP_MAXPACKET) { 427 m_freem(m); 428 return(EMSGSIZE); 429 } 430 M_PREPEND(m, sizeof(struct ip), M_DONTWAIT); 431 if (m == NULL) 432 return(ENOBUFS); 433 434 INP_RLOCK(inp); 435 ip = mtod(m, struct ip *); 436 ip->ip_tos = inp->inp_ip_tos; 437 if (inp->inp_flags & INP_DONTFRAG) 438 ip->ip_off = IP_DF; 439 else 440 ip->ip_off = 0; 441 ip->ip_p = inp->inp_ip_p; 442 ip->ip_len = m->m_pkthdr.len; 443 ip->ip_src = inp->inp_laddr; 444 if (jailed(inp->inp_cred)) { 445 /* 446 * prison_local_ip4() would be good enough but would 447 * let a source of INADDR_ANY pass, which we do not 448 * want to see from jails. We do not go through the 449 * pain of in_pcbladdr() for raw sockets. 450 */ 451 if (ip->ip_src.s_addr == INADDR_ANY) 452 error = prison_get_ip4(inp->inp_cred, 453 &ip->ip_src); 454 else 455 error = prison_local_ip4(inp->inp_cred, 456 &ip->ip_src); 457 if (error != 0) { 458 INP_RUNLOCK(inp); 459 m_freem(m); 460 return (error); 461 } 462 } 463 ip->ip_dst.s_addr = dst; 464 ip->ip_ttl = inp->inp_ip_ttl; 465 } else { 466 if (m->m_pkthdr.len > IP_MAXPACKET) { 467 m_freem(m); 468 return(EMSGSIZE); 469 } 470 INP_RLOCK(inp); 471 ip = mtod(m, struct ip *); 472 error = prison_check_ip4(inp->inp_cred, &ip->ip_src); 473 if (error != 0) { 474 INP_RUNLOCK(inp); 475 m_freem(m); 476 return (error); 477 } 478 479 /* 480 * Don't allow both user specified and setsockopt options, 481 * and don't allow packet length sizes that will crash. 482 */ 483 if (((ip->ip_hl != (sizeof (*ip) >> 2)) && inp->inp_options) 484 || (ip->ip_len > m->m_pkthdr.len) 485 || (ip->ip_len < (ip->ip_hl << 2))) { 486 INP_RUNLOCK(inp); 487 m_freem(m); 488 return (EINVAL); 489 } 490 if (ip->ip_id == 0) 491 ip->ip_id = ip_newid(); 492 493 /* 494 * XXX prevent ip_output from overwriting header fields. 495 */ 496 flags |= IP_RAWOUTPUT; 497 IPSTAT_INC(ips_rawout); 498 } 499 500 if (inp->inp_flags & INP_ONESBCAST) 501 flags |= IP_SENDONES; 502 503#ifdef MAC 504 mac_inpcb_create_mbuf(inp, m); 505#endif 506 507 error = ip_output(m, inp->inp_options, NULL, flags, 508 inp->inp_moptions, inp); 509 INP_RUNLOCK(inp); 510 return (error); 511} 512 513/* 514 * Raw IP socket option processing. 515 * 516 * IMPORTANT NOTE regarding access control: Traditionally, raw sockets could 517 * only be created by a privileged process, and as such, socket option 518 * operations to manage system properties on any raw socket were allowed to 519 * take place without explicit additional access control checks. However, 520 * raw sockets can now also be created in jail(), and therefore explicit 521 * checks are now required. Likewise, raw sockets can be used by a process 522 * after it gives up privilege, so some caution is required. For options 523 * passed down to the IP layer via ip_ctloutput(), checks are assumed to be 524 * performed in ip_ctloutput() and therefore no check occurs here. 525 * Unilaterally checking priv_check() here breaks normal IP socket option 526 * operations on raw sockets. 527 * 528 * When adding new socket options here, make sure to add access control 529 * checks here as necessary. 530 */ 531int 532rip_ctloutput(struct socket *so, struct sockopt *sopt) 533{ 534 struct inpcb *inp = sotoinpcb(so); 535 int error, optval; 536 537 if (sopt->sopt_level != IPPROTO_IP) { 538 if ((sopt->sopt_level == SOL_SOCKET) && 539 (sopt->sopt_name == SO_SETFIB)) { 540 inp->inp_inc.inc_fibnum = so->so_fibnum; 541 return (0); 542 } 543 return (EINVAL); 544 } 545 546 error = 0; 547 switch (sopt->sopt_dir) { 548 case SOPT_GET: 549 switch (sopt->sopt_name) { 550 case IP_HDRINCL: 551 optval = inp->inp_flags & INP_HDRINCL; 552 error = sooptcopyout(sopt, &optval, sizeof optval); 553 break; 554 555 case IP_FW3: /* generic ipfw v.3 functions */ 556 case IP_FW_ADD: /* ADD actually returns the body... */ 557 case IP_FW_GET: 558 case IP_FW_TABLE_GETSIZE: 559 case IP_FW_TABLE_LIST: 560 case IP_FW_NAT_GET_CONFIG: 561 case IP_FW_NAT_GET_LOG: 562 if (V_ip_fw_ctl_ptr != NULL) 563 error = V_ip_fw_ctl_ptr(sopt); 564 else 565 error = ENOPROTOOPT; 566 break; 567 568 case IP_DUMMYNET3: /* generic dummynet v.3 functions */ 569 case IP_DUMMYNET_GET: 570 if (ip_dn_ctl_ptr != NULL) 571 error = ip_dn_ctl_ptr(sopt); 572 else 573 error = ENOPROTOOPT; 574 break ; 575 576 case MRT_INIT: 577 case MRT_DONE: 578 case MRT_ADD_VIF: 579 case MRT_DEL_VIF: 580 case MRT_ADD_MFC: 581 case MRT_DEL_MFC: 582 case MRT_VERSION: 583 case MRT_ASSERT: 584 case MRT_API_SUPPORT: 585 case MRT_API_CONFIG: 586 case MRT_ADD_BW_UPCALL: 587 case MRT_DEL_BW_UPCALL: 588 error = priv_check(curthread, PRIV_NETINET_MROUTE); 589 if (error != 0) 590 return (error); 591 error = ip_mrouter_get ? ip_mrouter_get(so, sopt) : 592 EOPNOTSUPP; 593 break; 594 595 default: 596 error = ip_ctloutput(so, sopt); 597 break; 598 } 599 break; 600 601 case SOPT_SET: 602 switch (sopt->sopt_name) { 603 case IP_HDRINCL: 604 error = sooptcopyin(sopt, &optval, sizeof optval, 605 sizeof optval); 606 if (error) 607 break; 608 if (optval) 609 inp->inp_flags |= INP_HDRINCL; 610 else 611 inp->inp_flags &= ~INP_HDRINCL; 612 break; 613 614 case IP_FW3: /* generic ipfw v.3 functions */ 615 case IP_FW_ADD: 616 case IP_FW_DEL: 617 case IP_FW_FLUSH: 618 case IP_FW_ZERO: 619 case IP_FW_RESETLOG: 620 case IP_FW_TABLE_ADD: 621 case IP_FW_TABLE_DEL: 622 case IP_FW_TABLE_FLUSH: 623 case IP_FW_NAT_CFG: 624 case IP_FW_NAT_DEL: 625 if (V_ip_fw_ctl_ptr != NULL) 626 error = V_ip_fw_ctl_ptr(sopt); 627 else 628 error = ENOPROTOOPT; 629 break; 630 631 case IP_DUMMYNET3: /* generic dummynet v.3 functions */ 632 case IP_DUMMYNET_CONFIGURE: 633 case IP_DUMMYNET_DEL: 634 case IP_DUMMYNET_FLUSH: 635 if (ip_dn_ctl_ptr != NULL) 636 error = ip_dn_ctl_ptr(sopt); 637 else 638 error = ENOPROTOOPT ; 639 break ; 640 641 case IP_RSVP_ON: 642 error = priv_check(curthread, PRIV_NETINET_MROUTE); 643 if (error != 0) 644 return (error); 645 error = ip_rsvp_init(so); 646 break; 647 648 case IP_RSVP_OFF: 649 error = priv_check(curthread, PRIV_NETINET_MROUTE); 650 if (error != 0) 651 return (error); 652 error = ip_rsvp_done(); 653 break; 654 655 case IP_RSVP_VIF_ON: 656 case IP_RSVP_VIF_OFF: 657 error = priv_check(curthread, PRIV_NETINET_MROUTE); 658 if (error != 0) 659 return (error); 660 error = ip_rsvp_vif ? 661 ip_rsvp_vif(so, sopt) : EINVAL; 662 break; 663 664 case MRT_INIT: 665 case MRT_DONE: 666 case MRT_ADD_VIF: 667 case MRT_DEL_VIF: 668 case MRT_ADD_MFC: 669 case MRT_DEL_MFC: 670 case MRT_VERSION: 671 case MRT_ASSERT: 672 case MRT_API_SUPPORT: 673 case MRT_API_CONFIG: 674 case MRT_ADD_BW_UPCALL: 675 case MRT_DEL_BW_UPCALL: 676 error = priv_check(curthread, PRIV_NETINET_MROUTE); 677 if (error != 0) 678 return (error); 679 error = ip_mrouter_set ? ip_mrouter_set(so, sopt) : 680 EOPNOTSUPP; 681 break; 682 683 default: 684 error = ip_ctloutput(so, sopt); 685 break; 686 } 687 break; 688 } 689 690 return (error); 691} 692 693/* 694 * This function exists solely to receive the PRC_IFDOWN messages which are 695 * sent by if_down(). It looks for an ifaddr whose ifa_addr is sa, and calls 696 * in_ifadown() to remove all routes corresponding to that address. It also 697 * receives the PRC_IFUP messages from if_up() and reinstalls the interface 698 * routes. 699 */ 700void 701rip_ctlinput(int cmd, struct sockaddr *sa, void *vip) 702{ 703 struct in_ifaddr *ia; 704 struct ifnet *ifp; 705 int err; 706 int flags; 707 708 switch (cmd) { 709 case PRC_IFDOWN: 710 IN_IFADDR_RLOCK(); 711 TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { 712 if (ia->ia_ifa.ifa_addr == sa 713 && (ia->ia_flags & IFA_ROUTE)) { 714 ifa_ref(&ia->ia_ifa); 715 IN_IFADDR_RUNLOCK(); 716 /* 717 * in_ifscrub kills the interface route. 718 */ 719 in_ifscrub(ia->ia_ifp, ia); 720 /* 721 * in_ifadown gets rid of all the rest of the 722 * routes. This is not quite the right thing 723 * to do, but at least if we are running a 724 * routing process they will come back. 725 */ 726 in_ifadown(&ia->ia_ifa, 0); 727 ifa_free(&ia->ia_ifa); 728 break; 729 } 730 } 731 if (ia == NULL) /* If ia matched, already unlocked. */ 732 IN_IFADDR_RUNLOCK(); 733 break; 734 735 case PRC_IFUP: 736 IN_IFADDR_RLOCK(); 737 TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { 738 if (ia->ia_ifa.ifa_addr == sa) 739 break; 740 } 741 if (ia == NULL || (ia->ia_flags & IFA_ROUTE)) { 742 IN_IFADDR_RUNLOCK(); 743 return; 744 } 745 ifa_ref(&ia->ia_ifa); 746 IN_IFADDR_RUNLOCK(); 747 flags = RTF_UP; 748 ifp = ia->ia_ifa.ifa_ifp; 749 750 if ((ifp->if_flags & IFF_LOOPBACK) 751 || (ifp->if_flags & IFF_POINTOPOINT)) 752 flags |= RTF_HOST; 753 754 err = rtinit(&ia->ia_ifa, RTM_ADD, flags); 755 if (err == 0) 756 ia->ia_flags |= IFA_ROUTE; 757 err = ifa_add_loopback_route((struct ifaddr *)ia, sa); 758 if (err == 0) 759 ia->ia_flags |= IFA_RTSELF; 760 ifa_free(&ia->ia_ifa); 761 break; 762 } 763} 764 765static int 766rip_attach(struct socket *so, int proto, struct thread *td) 767{ 768 struct inpcb *inp; 769 int error; 770 771 inp = sotoinpcb(so); 772 KASSERT(inp == NULL, ("rip_attach: inp != NULL")); 773 774 error = priv_check(td, PRIV_NETINET_RAW); 775 if (error) 776 return (error); 777 if (proto >= IPPROTO_MAX || proto < 0) 778 return EPROTONOSUPPORT; 779 error = soreserve(so, rip_sendspace, rip_recvspace); 780 if (error) 781 return (error); 782 INP_INFO_WLOCK(&V_ripcbinfo); 783 error = in_pcballoc(so, &V_ripcbinfo); 784 if (error) { 785 INP_INFO_WUNLOCK(&V_ripcbinfo); 786 return (error); 787 } 788 inp = (struct inpcb *)so->so_pcb; 789 inp->inp_vflag |= INP_IPV4; 790 inp->inp_ip_p = proto; 791 inp->inp_ip_ttl = V_ip_defttl; 792 rip_inshash(inp); 793 INP_INFO_WUNLOCK(&V_ripcbinfo); 794 INP_WUNLOCK(inp); 795 return (0); 796} 797 798static void 799rip_detach(struct socket *so) 800{ 801 struct inpcb *inp; 802 803 inp = sotoinpcb(so); 804 KASSERT(inp != NULL, ("rip_detach: inp == NULL")); 805 KASSERT(inp->inp_faddr.s_addr == INADDR_ANY, 806 ("rip_detach: not closed")); 807 808 INP_INFO_WLOCK(&V_ripcbinfo); 809 INP_WLOCK(inp); 810 rip_delhash(inp); 811 if (so == V_ip_mrouter && ip_mrouter_done) 812 ip_mrouter_done(); 813 if (ip_rsvp_force_done) 814 ip_rsvp_force_done(so); 815 if (so == V_ip_rsvpd) 816 ip_rsvp_done(); 817 in_pcbdetach(inp); 818 in_pcbfree(inp); 819 INP_INFO_WUNLOCK(&V_ripcbinfo); 820} 821 822static void 823rip_dodisconnect(struct socket *so, struct inpcb *inp) 824{ 825 826 INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo); 827 INP_WLOCK_ASSERT(inp); 828 829 rip_delhash(inp); 830 inp->inp_faddr.s_addr = INADDR_ANY; 831 rip_inshash(inp); 832 SOCK_LOCK(so); 833 so->so_state &= ~SS_ISCONNECTED; 834 SOCK_UNLOCK(so); 835} 836 837static void 838rip_abort(struct socket *so) 839{ 840 struct inpcb *inp; 841 842 inp = sotoinpcb(so); 843 KASSERT(inp != NULL, ("rip_abort: inp == NULL")); 844 845 INP_INFO_WLOCK(&V_ripcbinfo); 846 INP_WLOCK(inp); 847 rip_dodisconnect(so, inp); 848 INP_WUNLOCK(inp); 849 INP_INFO_WUNLOCK(&V_ripcbinfo); 850} 851 852static void 853rip_close(struct socket *so) 854{ 855 struct inpcb *inp; 856 857 inp = sotoinpcb(so); 858 KASSERT(inp != NULL, ("rip_close: inp == NULL")); 859 860 INP_INFO_WLOCK(&V_ripcbinfo); 861 INP_WLOCK(inp); 862 rip_dodisconnect(so, inp); 863 INP_WUNLOCK(inp); 864 INP_INFO_WUNLOCK(&V_ripcbinfo); 865} 866 867static int 868rip_disconnect(struct socket *so) 869{ 870 struct inpcb *inp; 871 872 if ((so->so_state & SS_ISCONNECTED) == 0) 873 return (ENOTCONN); 874 875 inp = sotoinpcb(so); 876 KASSERT(inp != NULL, ("rip_disconnect: inp == NULL")); 877 878 INP_INFO_WLOCK(&V_ripcbinfo); 879 INP_WLOCK(inp); 880 rip_dodisconnect(so, inp); 881 INP_WUNLOCK(inp); 882 INP_INFO_WUNLOCK(&V_ripcbinfo); 883 return (0); 884} 885 886static int 887rip_bind(struct socket *so, struct sockaddr *nam, struct thread *td) 888{ 889 struct sockaddr_in *addr = (struct sockaddr_in *)nam; 890 struct inpcb *inp; 891 int error; 892 893 if (nam->sa_len != sizeof(*addr)) 894 return (EINVAL); 895 896 error = prison_check_ip4(td->td_ucred, &addr->sin_addr); 897 if (error != 0) 898 return (error); 899 900 inp = sotoinpcb(so); 901 KASSERT(inp != NULL, ("rip_bind: inp == NULL")); 902 903 if (TAILQ_EMPTY(&V_ifnet) || 904 (addr->sin_family != AF_INET && addr->sin_family != AF_IMPLINK) || 905 (addr->sin_addr.s_addr && 906 (inp->inp_flags & INP_BINDANY) == 0 && 907 ifa_ifwithaddr_check((struct sockaddr *)addr) == 0)) 908 return (EADDRNOTAVAIL); 909 910 INP_INFO_WLOCK(&V_ripcbinfo); 911 INP_WLOCK(inp); 912 rip_delhash(inp); 913 inp->inp_laddr = addr->sin_addr; 914 rip_inshash(inp); 915 INP_WUNLOCK(inp); 916 INP_INFO_WUNLOCK(&V_ripcbinfo); 917 return (0); 918} 919 920static int 921rip_connect(struct socket *so, struct sockaddr *nam, struct thread *td) 922{ 923 struct sockaddr_in *addr = (struct sockaddr_in *)nam; 924 struct inpcb *inp; 925 926 if (nam->sa_len != sizeof(*addr)) 927 return (EINVAL); 928 if (TAILQ_EMPTY(&V_ifnet)) 929 return (EADDRNOTAVAIL); 930 if (addr->sin_family != AF_INET && addr->sin_family != AF_IMPLINK) 931 return (EAFNOSUPPORT); 932 933 inp = sotoinpcb(so); 934 KASSERT(inp != NULL, ("rip_connect: inp == NULL")); 935 936 INP_INFO_WLOCK(&V_ripcbinfo); 937 INP_WLOCK(inp); 938 rip_delhash(inp); 939 inp->inp_faddr = addr->sin_addr; 940 rip_inshash(inp); 941 soisconnected(so); 942 INP_WUNLOCK(inp); 943 INP_INFO_WUNLOCK(&V_ripcbinfo); 944 return (0); 945} 946 947static int 948rip_shutdown(struct socket *so) 949{ 950 struct inpcb *inp; 951 952 inp = sotoinpcb(so); 953 KASSERT(inp != NULL, ("rip_shutdown: inp == NULL")); 954 955 INP_WLOCK(inp); 956 socantsendmore(so); 957 INP_WUNLOCK(inp); 958 return (0); 959} 960 961static int 962rip_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, 963 struct mbuf *control, struct thread *td) 964{ 965 struct inpcb *inp; 966 u_long dst; 967 968 inp = sotoinpcb(so); 969 KASSERT(inp != NULL, ("rip_send: inp == NULL")); 970 971 /* 972 * Note: 'dst' reads below are unlocked. 973 */ 974 if (so->so_state & SS_ISCONNECTED) { 975 if (nam) { 976 m_freem(m); 977 return (EISCONN); 978 } 979 dst = inp->inp_faddr.s_addr; /* Unlocked read. */ 980 } else { 981 if (nam == NULL) { 982 m_freem(m); 983 return (ENOTCONN); 984 } 985 dst = ((struct sockaddr_in *)nam)->sin_addr.s_addr; 986 } 987 return (rip_output(m, so, dst)); 988} 989#endif /* INET */ 990 991static int 992rip_pcblist(SYSCTL_HANDLER_ARGS) 993{ 994 int error, i, n; 995 struct inpcb *inp, **inp_list; 996 inp_gen_t gencnt; 997 struct xinpgen xig; 998 999 /* 1000 * The process of preparing the TCB list is too time-consuming and 1001 * resource-intensive to repeat twice on every request. 1002 */ 1003 if (req->oldptr == 0) { 1004 n = V_ripcbinfo.ipi_count; 1005 n += imax(n / 8, 10); 1006 req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xinpcb); 1007 return (0); 1008 } 1009 1010 if (req->newptr != 0) 1011 return (EPERM); 1012 1013 /* 1014 * OK, now we're committed to doing something. 1015 */ 1016 INP_INFO_RLOCK(&V_ripcbinfo); 1017 gencnt = V_ripcbinfo.ipi_gencnt; 1018 n = V_ripcbinfo.ipi_count; 1019 INP_INFO_RUNLOCK(&V_ripcbinfo); 1020 1021 xig.xig_len = sizeof xig; 1022 xig.xig_count = n; 1023 xig.xig_gen = gencnt; 1024 xig.xig_sogen = so_gencnt; 1025 error = SYSCTL_OUT(req, &xig, sizeof xig); 1026 if (error) 1027 return (error); 1028 1029 inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); 1030 if (inp_list == 0) 1031 return (ENOMEM); 1032 1033 INP_INFO_RLOCK(&V_ripcbinfo); 1034 for (inp = LIST_FIRST(V_ripcbinfo.ipi_listhead), i = 0; inp && i < n; 1035 inp = LIST_NEXT(inp, inp_list)) { 1036 INP_WLOCK(inp); 1037 if (inp->inp_gencnt <= gencnt && 1038 cr_canseeinpcb(req->td->td_ucred, inp) == 0) { 1039 in_pcbref(inp); 1040 inp_list[i++] = inp; 1041 } 1042 INP_WUNLOCK(inp); 1043 } 1044 INP_INFO_RUNLOCK(&V_ripcbinfo); 1045 n = i; 1046 1047 error = 0; 1048 for (i = 0; i < n; i++) { 1049 inp = inp_list[i]; 1050 INP_RLOCK(inp); 1051 if (inp->inp_gencnt <= gencnt) { 1052 struct xinpcb xi; 1053 1054 bzero(&xi, sizeof(xi)); 1055 xi.xi_len = sizeof xi; 1056 /* XXX should avoid extra copy */ 1057 bcopy(inp, &xi.xi_inp, sizeof *inp); 1058 if (inp->inp_socket) 1059 sotoxsocket(inp->inp_socket, &xi.xi_socket); 1060 INP_RUNLOCK(inp); 1061 error = SYSCTL_OUT(req, &xi, sizeof xi); 1062 } else 1063 INP_RUNLOCK(inp); 1064 } 1065 INP_INFO_WLOCK(&V_ripcbinfo); 1066 for (i = 0; i < n; i++) { 1067 inp = inp_list[i]; 1068 INP_WLOCK(inp); 1069 if (!in_pcbrele(inp)) 1070 INP_WUNLOCK(inp); 1071 } 1072 INP_INFO_WUNLOCK(&V_ripcbinfo); 1073 1074 if (!error) { 1075 /* 1076 * Give the user an updated idea of our state. If the 1077 * generation differs from what we told her before, she knows 1078 * that something happened while we were processing this 1079 * request, and it might be necessary to retry. 1080 */ 1081 INP_INFO_RLOCK(&V_ripcbinfo); 1082 xig.xig_gen = V_ripcbinfo.ipi_gencnt; 1083 xig.xig_sogen = so_gencnt; 1084 xig.xig_count = V_ripcbinfo.ipi_count; 1085 INP_INFO_RUNLOCK(&V_ripcbinfo); 1086 error = SYSCTL_OUT(req, &xig, sizeof xig); 1087 } 1088 free(inp_list, M_TEMP); 1089 return (error); 1090} 1091 1092SYSCTL_PROC(_net_inet_raw, OID_AUTO/*XXX*/, pcblist, 1093 CTLTYPE_OPAQUE | CTLFLAG_RD, NULL, 0, 1094 rip_pcblist, "S,xinpcb", "List of active raw IP sockets"); 1095 1096#ifdef INET 1097struct pr_usrreqs rip_usrreqs = { 1098 .pru_abort = rip_abort, 1099 .pru_attach = rip_attach, 1100 .pru_bind = rip_bind, 1101 .pru_connect = rip_connect, 1102 .pru_control = in_control, 1103 .pru_detach = rip_detach, 1104 .pru_disconnect = rip_disconnect, 1105 .pru_peeraddr = in_getpeeraddr, 1106 .pru_send = rip_send, 1107 .pru_shutdown = rip_shutdown, 1108 .pru_sockaddr = in_getsockaddr, 1109 .pru_sosetlabel = in_pcbsosetlabel, 1110 .pru_close = rip_close, 1111}; 1112#endif /* INET */ 1113