/* raw_ip.c revision 185571 */
1/*- 2 * Copyright (c) 1982, 1986, 1988, 1993 3 * The Regents of the University of California. 4 * All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 4. Neither the name of the University nor the names of its contributors 15 * may be used to endorse or promote products derived from this software 16 * without specific prior written permission. 17 * 18 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 21 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 28 * SUCH DAMAGE. 
29 * 30 * @(#)raw_ip.c 8.7 (Berkeley) 5/15/95 31 */ 32 33#include <sys/cdefs.h> 34__FBSDID("$FreeBSD: head/sys/netinet/raw_ip.c 185571 2008-12-02 21:37:28Z bz $"); 35 36#include "opt_inet6.h" 37#include "opt_ipsec.h" 38#include "opt_mac.h" 39 40#include <sys/param.h> 41#include <sys/jail.h> 42#include <sys/kernel.h> 43#include <sys/lock.h> 44#include <sys/malloc.h> 45#include <sys/mbuf.h> 46#include <sys/priv.h> 47#include <sys/proc.h> 48#include <sys/protosw.h> 49#include <sys/signalvar.h> 50#include <sys/socket.h> 51#include <sys/socketvar.h> 52#include <sys/sx.h> 53#include <sys/sysctl.h> 54#include <sys/systm.h> 55#include <sys/vimage.h> 56 57#include <vm/uma.h> 58 59#include <net/if.h> 60#include <net/route.h> 61#include <net/vnet.h> 62 63#include <netinet/in.h> 64#include <netinet/in_systm.h> 65#include <netinet/in_pcb.h> 66#include <netinet/in_var.h> 67#include <netinet/ip.h> 68#include <netinet/ip_var.h> 69#include <netinet/ip_mroute.h> 70 71#include <netinet/ip_fw.h> 72#include <netinet/ip_dummynet.h> 73#include <netinet/vinet.h> 74 75#ifdef IPSEC 76#include <netipsec/ipsec.h> 77#endif /*IPSEC*/ 78 79#include <security/mac/mac_framework.h> 80 81#ifdef VIMAGE_GLOBALS 82struct inpcbhead ripcb; 83struct inpcbinfo ripcbinfo; 84#endif 85 86/* control hooks for ipfw and dummynet */ 87ip_fw_ctl_t *ip_fw_ctl_ptr = NULL; 88ip_dn_ctl_t *ip_dn_ctl_ptr = NULL; 89 90/* 91 * Hooks for multicast routing. They all default to NULL, so leave them not 92 * initialized and rely on BSS being set to 0. 93 */ 94 95/* 96 * The socket used to communicate with the multicast routing daemon. 97 */ 98#ifdef VIMAGE_GLOBALS 99struct socket *ip_mrouter; 100#endif 101 102/* 103 * The various mrouter and rsvp functions. 
104 */ 105int (*ip_mrouter_set)(struct socket *, struct sockopt *); 106int (*ip_mrouter_get)(struct socket *, struct sockopt *); 107int (*ip_mrouter_done)(void); 108int (*ip_mforward)(struct ip *, struct ifnet *, struct mbuf *, 109 struct ip_moptions *); 110int (*mrt_ioctl)(int, caddr_t, int); 111int (*legal_vif_num)(int); 112u_long (*ip_mcast_src)(int); 113 114void (*rsvp_input_p)(struct mbuf *m, int off); 115int (*ip_rsvp_vif)(struct socket *, struct sockopt *); 116void (*ip_rsvp_force_done)(struct socket *); 117 118/* 119 * Hash functions 120 */ 121 122#define INP_PCBHASH_RAW_SIZE 256 123#define INP_PCBHASH_RAW(proto, laddr, faddr, mask) \ 124 (((proto) + (laddr) + (faddr)) % (mask) + 1) 125 126static void 127rip_inshash(struct inpcb *inp) 128{ 129 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; 130 struct inpcbhead *pcbhash; 131 int hash; 132 133 INP_INFO_WLOCK_ASSERT(pcbinfo); 134 INP_WLOCK_ASSERT(inp); 135 136 if (inp->inp_ip_p != 0 && 137 inp->inp_laddr.s_addr != INADDR_ANY && 138 inp->inp_faddr.s_addr != INADDR_ANY) { 139 hash = INP_PCBHASH_RAW(inp->inp_ip_p, inp->inp_laddr.s_addr, 140 inp->inp_faddr.s_addr, pcbinfo->ipi_hashmask); 141 } else 142 hash = 0; 143 pcbhash = &pcbinfo->ipi_hashbase[hash]; 144 LIST_INSERT_HEAD(pcbhash, inp, inp_hash); 145} 146 147static void 148rip_delhash(struct inpcb *inp) 149{ 150 151 INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo); 152 INP_WLOCK_ASSERT(inp); 153 154 LIST_REMOVE(inp, inp_hash); 155} 156 157/* 158 * Raw interface to IP protocol. 159 */ 160 161/* 162 * Initialize raw connection block q. 
163 */ 164static void 165rip_zone_change(void *tag) 166{ 167 INIT_VNET_INET(curvnet); 168 169 uma_zone_set_max(V_ripcbinfo.ipi_zone, maxsockets); 170} 171 172static int 173rip_inpcb_init(void *mem, int size, int flags) 174{ 175 struct inpcb *inp = mem; 176 177 INP_LOCK_INIT(inp, "inp", "rawinp"); 178 return (0); 179} 180 181void 182rip_init(void) 183{ 184 INIT_VNET_INET(curvnet); 185 186 INP_INFO_LOCK_INIT(&V_ripcbinfo, "rip"); 187 LIST_INIT(&V_ripcb); 188 V_ripcbinfo.ipi_listhead = &V_ripcb; 189 V_ripcbinfo.ipi_hashbase = 190 hashinit(INP_PCBHASH_RAW_SIZE, M_PCB, &V_ripcbinfo.ipi_hashmask); 191 V_ripcbinfo.ipi_porthashbase = 192 hashinit(1, M_PCB, &V_ripcbinfo.ipi_porthashmask); 193 V_ripcbinfo.ipi_zone = uma_zcreate("ripcb", sizeof(struct inpcb), 194 NULL, NULL, rip_inpcb_init, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); 195 uma_zone_set_max(V_ripcbinfo.ipi_zone, maxsockets); 196 EVENTHANDLER_REGISTER(maxsockets_change, rip_zone_change, NULL, 197 EVENTHANDLER_PRI_ANY); 198} 199 200static int 201rip_append(struct inpcb *last, struct ip *ip, struct mbuf *n, 202 struct sockaddr_in *ripsrc) 203{ 204 int policyfail = 0; 205 206 INP_RLOCK_ASSERT(last); 207 208#ifdef IPSEC 209 /* check AH/ESP integrity. */ 210 if (ipsec4_in_reject(n, last)) { 211 policyfail = 1; 212 } 213#endif /* IPSEC */ 214#ifdef MAC 215 if (!policyfail && mac_inpcb_check_deliver(last, n) != 0) 216 policyfail = 1; 217#endif 218 /* Check the minimum TTL for socket. 
*/ 219 if (last->inp_ip_minttl && last->inp_ip_minttl > ip->ip_ttl) 220 policyfail = 1; 221 if (!policyfail) { 222 struct mbuf *opts = NULL; 223 struct socket *so; 224 225 so = last->inp_socket; 226 if ((last->inp_flags & INP_CONTROLOPTS) || 227 (so->so_options & (SO_TIMESTAMP | SO_BINTIME))) 228 ip_savecontrol(last, &opts, ip, n); 229 SOCKBUF_LOCK(&so->so_rcv); 230 if (sbappendaddr_locked(&so->so_rcv, 231 (struct sockaddr *)ripsrc, n, opts) == 0) { 232 /* should notify about lost packet */ 233 m_freem(n); 234 if (opts) 235 m_freem(opts); 236 SOCKBUF_UNLOCK(&so->so_rcv); 237 } else 238 sorwakeup_locked(so); 239 } else 240 m_freem(n); 241 return (policyfail); 242} 243 244/* 245 * Setup generic address and protocol structures for raw_input routine, then 246 * pass them along with mbuf chain. 247 */ 248void 249rip_input(struct mbuf *m, int off) 250{ 251 INIT_VNET_INET(curvnet); 252 struct ip *ip = mtod(m, struct ip *); 253 int proto = ip->ip_p; 254 struct inpcb *inp, *last; 255 struct sockaddr_in ripsrc; 256 int hash; 257 258 bzero(&ripsrc, sizeof(ripsrc)); 259 ripsrc.sin_len = sizeof(ripsrc); 260 ripsrc.sin_family = AF_INET; 261 ripsrc.sin_addr = ip->ip_src; 262 last = NULL; 263 hash = INP_PCBHASH_RAW(proto, ip->ip_src.s_addr, 264 ip->ip_dst.s_addr, V_ripcbinfo.ipi_hashmask); 265 INP_INFO_RLOCK(&V_ripcbinfo); 266 LIST_FOREACH(inp, &V_ripcbinfo.ipi_hashbase[hash], inp_hash) { 267 if (inp->inp_ip_p != proto) 268 continue; 269#ifdef INET6 270 /* XXX inp locking */ 271 if ((inp->inp_vflag & INP_IPV4) == 0) 272 continue; 273#endif 274 if (inp->inp_laddr.s_addr != ip->ip_dst.s_addr) 275 continue; 276 if (inp->inp_faddr.s_addr != ip->ip_src.s_addr) 277 continue; 278 if (jailed(inp->inp_cred)) { 279 if (!prison_check_ip4(inp->inp_cred, &ip->ip_dst)) 280 continue; 281 } 282 if (last) { 283 struct mbuf *n; 284 285 n = m_copy(m, 0, (int)M_COPYALL); 286 if (n != NULL) 287 (void) rip_append(last, ip, n, &ripsrc); 288 /* XXX count dropped packet */ 289 INP_RUNLOCK(last); 290 } 291 
INP_RLOCK(inp); 292 last = inp; 293 } 294 LIST_FOREACH(inp, &V_ripcbinfo.ipi_hashbase[0], inp_hash) { 295 if (inp->inp_ip_p && inp->inp_ip_p != proto) 296 continue; 297#ifdef INET6 298 /* XXX inp locking */ 299 if ((inp->inp_vflag & INP_IPV4) == 0) 300 continue; 301#endif 302 if (inp->inp_laddr.s_addr && 303 inp->inp_laddr.s_addr != ip->ip_dst.s_addr) 304 continue; 305 if (inp->inp_faddr.s_addr && 306 inp->inp_faddr.s_addr != ip->ip_src.s_addr) 307 continue; 308 if (jailed(inp->inp_cred)) { 309 if (!prison_check_ip4(inp->inp_cred, &ip->ip_dst)) 310 continue; 311 } 312 if (last) { 313 struct mbuf *n; 314 315 n = m_copy(m, 0, (int)M_COPYALL); 316 if (n != NULL) 317 (void) rip_append(last, ip, n, &ripsrc); 318 /* XXX count dropped packet */ 319 INP_RUNLOCK(last); 320 } 321 INP_RLOCK(inp); 322 last = inp; 323 } 324 INP_INFO_RUNLOCK(&V_ripcbinfo); 325 if (last != NULL) { 326 if (rip_append(last, ip, m, &ripsrc) != 0) 327 V_ipstat.ips_delivered--; 328 INP_RUNLOCK(last); 329 } else { 330 m_freem(m); 331 V_ipstat.ips_noproto++; 332 V_ipstat.ips_delivered--; 333 } 334} 335 336/* 337 * Generate IP header and pass packet to ip_output. Tack on options user may 338 * have setup with control call. 339 */ 340int 341rip_output(struct mbuf *m, struct socket *so, u_long dst) 342{ 343 INIT_VNET_INET(so->so_vnet); 344 struct ip *ip; 345 int error; 346 struct inpcb *inp = sotoinpcb(so); 347 int flags = ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0) | 348 IP_ALLOWBROADCAST; 349 350 /* 351 * If the user handed us a complete IP packet, use it. Otherwise, 352 * allocate an mbuf for a header and fill it in. 
353 */ 354 if ((inp->inp_flags & INP_HDRINCL) == 0) { 355 if (m->m_pkthdr.len + sizeof(struct ip) > IP_MAXPACKET) { 356 m_freem(m); 357 return(EMSGSIZE); 358 } 359 M_PREPEND(m, sizeof(struct ip), M_DONTWAIT); 360 if (m == NULL) 361 return(ENOBUFS); 362 363 INP_RLOCK(inp); 364 ip = mtod(m, struct ip *); 365 ip->ip_tos = inp->inp_ip_tos; 366 if (inp->inp_flags & INP_DONTFRAG) 367 ip->ip_off = IP_DF; 368 else 369 ip->ip_off = 0; 370 ip->ip_p = inp->inp_ip_p; 371 ip->ip_len = m->m_pkthdr.len; 372 if (jailed(inp->inp_cred)) { 373 if (prison_getip4(inp->inp_cred, &ip->ip_src)) { 374 INP_RUNLOCK(inp); 375 m_freem(m); 376 return (EPERM); 377 } 378 } else { 379 ip->ip_src = inp->inp_laddr; 380 } 381 ip->ip_dst.s_addr = dst; 382 ip->ip_ttl = inp->inp_ip_ttl; 383 } else { 384 if (m->m_pkthdr.len > IP_MAXPACKET) { 385 m_freem(m); 386 return(EMSGSIZE); 387 } 388 INP_RLOCK(inp); 389 ip = mtod(m, struct ip *); 390 if (!prison_check_ip4(inp->inp_cred, &ip->ip_src)) { 391 INP_RUNLOCK(inp); 392 m_freem(m); 393 return (EPERM); 394 } 395 396 /* 397 * Don't allow both user specified and setsockopt options, 398 * and don't allow packet length sizes that will crash. 399 */ 400 if (((ip->ip_hl != (sizeof (*ip) >> 2)) && inp->inp_options) 401 || (ip->ip_len > m->m_pkthdr.len) 402 || (ip->ip_len < (ip->ip_hl << 2))) { 403 INP_RUNLOCK(inp); 404 m_freem(m); 405 return (EINVAL); 406 } 407 if (ip->ip_id == 0) 408 ip->ip_id = ip_newid(); 409 410 /* 411 * XXX prevent ip_output from overwriting header fields. 412 */ 413 flags |= IP_RAWOUTPUT; 414 V_ipstat.ips_rawout++; 415 } 416 417 if (inp->inp_flags & INP_ONESBCAST) 418 flags |= IP_SENDONES; 419 420#ifdef MAC 421 mac_inpcb_create_mbuf(inp, m); 422#endif 423 424 error = ip_output(m, inp->inp_options, NULL, flags, 425 inp->inp_moptions, inp); 426 INP_RUNLOCK(inp); 427 return (error); 428} 429 430/* 431 * Raw IP socket option processing. 
432 * 433 * IMPORTANT NOTE regarding access control: Traditionally, raw sockets could 434 * only be created by a privileged process, and as such, socket option 435 * operations to manage system properties on any raw socket were allowed to 436 * take place without explicit additional access control checks. However, 437 * raw sockets can now also be created in jail(), and therefore explicit 438 * checks are now required. Likewise, raw sockets can be used by a process 439 * after it gives up privilege, so some caution is required. For options 440 * passed down to the IP layer via ip_ctloutput(), checks are assumed to be 441 * performed in ip_ctloutput() and therefore no check occurs here. 442 * Unilaterally checking priv_check() here breaks normal IP socket option 443 * operations on raw sockets. 444 * 445 * When adding new socket options here, make sure to add access control 446 * checks here as necessary. 447 */ 448int 449rip_ctloutput(struct socket *so, struct sockopt *sopt) 450{ 451 struct inpcb *inp = sotoinpcb(so); 452 int error, optval; 453 454 if (sopt->sopt_level != IPPROTO_IP) { 455 if ((sopt->sopt_level == SOL_SOCKET) && 456 (sopt->sopt_name == SO_SETFIB)) { 457 inp->inp_inc.inc_fibnum = so->so_fibnum; 458 return (0); 459 } 460 return (EINVAL); 461 } 462 463 error = 0; 464 switch (sopt->sopt_dir) { 465 case SOPT_GET: 466 switch (sopt->sopt_name) { 467 case IP_HDRINCL: 468 optval = inp->inp_flags & INP_HDRINCL; 469 error = sooptcopyout(sopt, &optval, sizeof optval); 470 break; 471 472 case IP_FW_ADD: /* ADD actually returns the body... 
*/ 473 case IP_FW_GET: 474 case IP_FW_TABLE_GETSIZE: 475 case IP_FW_TABLE_LIST: 476 case IP_FW_NAT_GET_CONFIG: 477 case IP_FW_NAT_GET_LOG: 478 if (ip_fw_ctl_ptr != NULL) 479 error = ip_fw_ctl_ptr(sopt); 480 else 481 error = ENOPROTOOPT; 482 break; 483 484 case IP_DUMMYNET_GET: 485 if (ip_dn_ctl_ptr != NULL) 486 error = ip_dn_ctl_ptr(sopt); 487 else 488 error = ENOPROTOOPT; 489 break ; 490 491 case MRT_INIT: 492 case MRT_DONE: 493 case MRT_ADD_VIF: 494 case MRT_DEL_VIF: 495 case MRT_ADD_MFC: 496 case MRT_DEL_MFC: 497 case MRT_VERSION: 498 case MRT_ASSERT: 499 case MRT_API_SUPPORT: 500 case MRT_API_CONFIG: 501 case MRT_ADD_BW_UPCALL: 502 case MRT_DEL_BW_UPCALL: 503 error = priv_check(curthread, PRIV_NETINET_MROUTE); 504 if (error != 0) 505 return (error); 506 error = ip_mrouter_get ? ip_mrouter_get(so, sopt) : 507 EOPNOTSUPP; 508 break; 509 510 default: 511 error = ip_ctloutput(so, sopt); 512 break; 513 } 514 break; 515 516 case SOPT_SET: 517 switch (sopt->sopt_name) { 518 case IP_HDRINCL: 519 error = sooptcopyin(sopt, &optval, sizeof optval, 520 sizeof optval); 521 if (error) 522 break; 523 if (optval) 524 inp->inp_flags |= INP_HDRINCL; 525 else 526 inp->inp_flags &= ~INP_HDRINCL; 527 break; 528 529 case IP_FW_ADD: 530 case IP_FW_DEL: 531 case IP_FW_FLUSH: 532 case IP_FW_ZERO: 533 case IP_FW_RESETLOG: 534 case IP_FW_TABLE_ADD: 535 case IP_FW_TABLE_DEL: 536 case IP_FW_TABLE_FLUSH: 537 case IP_FW_NAT_CFG: 538 case IP_FW_NAT_DEL: 539 if (ip_fw_ctl_ptr != NULL) 540 error = ip_fw_ctl_ptr(sopt); 541 else 542 error = ENOPROTOOPT; 543 break; 544 545 case IP_DUMMYNET_CONFIGURE: 546 case IP_DUMMYNET_DEL: 547 case IP_DUMMYNET_FLUSH: 548 if (ip_dn_ctl_ptr != NULL) 549 error = ip_dn_ctl_ptr(sopt); 550 else 551 error = ENOPROTOOPT ; 552 break ; 553 554 case IP_RSVP_ON: 555 error = priv_check(curthread, PRIV_NETINET_MROUTE); 556 if (error != 0) 557 return (error); 558 error = ip_rsvp_init(so); 559 break; 560 561 case IP_RSVP_OFF: 562 error = priv_check(curthread, 
PRIV_NETINET_MROUTE); 563 if (error != 0) 564 return (error); 565 error = ip_rsvp_done(); 566 break; 567 568 case IP_RSVP_VIF_ON: 569 case IP_RSVP_VIF_OFF: 570 error = priv_check(curthread, PRIV_NETINET_MROUTE); 571 if (error != 0) 572 return (error); 573 error = ip_rsvp_vif ? 574 ip_rsvp_vif(so, sopt) : EINVAL; 575 break; 576 577 case MRT_INIT: 578 case MRT_DONE: 579 case MRT_ADD_VIF: 580 case MRT_DEL_VIF: 581 case MRT_ADD_MFC: 582 case MRT_DEL_MFC: 583 case MRT_VERSION: 584 case MRT_ASSERT: 585 case MRT_API_SUPPORT: 586 case MRT_API_CONFIG: 587 case MRT_ADD_BW_UPCALL: 588 case MRT_DEL_BW_UPCALL: 589 error = priv_check(curthread, PRIV_NETINET_MROUTE); 590 if (error != 0) 591 return (error); 592 error = ip_mrouter_set ? ip_mrouter_set(so, sopt) : 593 EOPNOTSUPP; 594 break; 595 596 default: 597 error = ip_ctloutput(so, sopt); 598 break; 599 } 600 break; 601 } 602 603 return (error); 604} 605 606/* 607 * This function exists solely to receive the PRC_IFDOWN messages which are 608 * sent by if_down(). It looks for an ifaddr whose ifa_addr is sa, and calls 609 * in_ifadown() to remove all routes corresponding to that address. It also 610 * receives the PRC_IFUP messages from if_up() and reinstalls the interface 611 * routes. 612 */ 613void 614rip_ctlinput(int cmd, struct sockaddr *sa, void *vip) 615{ 616 INIT_VNET_INET(curvnet); 617 struct in_ifaddr *ia; 618 struct ifnet *ifp; 619 int err; 620 int flags; 621 622 switch (cmd) { 623 case PRC_IFDOWN: 624 TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { 625 if (ia->ia_ifa.ifa_addr == sa 626 && (ia->ia_flags & IFA_ROUTE)) { 627 /* 628 * in_ifscrub kills the interface route. 629 */ 630 in_ifscrub(ia->ia_ifp, ia); 631 /* 632 * in_ifadown gets rid of all the rest of the 633 * routes. This is not quite the right thing 634 * to do, but at least if we are running a 635 * routing process they will come back. 
636 */ 637 in_ifadown(&ia->ia_ifa, 0); 638 break; 639 } 640 } 641 break; 642 643 case PRC_IFUP: 644 TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { 645 if (ia->ia_ifa.ifa_addr == sa) 646 break; 647 } 648 if (ia == 0 || (ia->ia_flags & IFA_ROUTE)) 649 return; 650 flags = RTF_UP; 651 ifp = ia->ia_ifa.ifa_ifp; 652 653 if ((ifp->if_flags & IFF_LOOPBACK) 654 || (ifp->if_flags & IFF_POINTOPOINT)) 655 flags |= RTF_HOST; 656 657 err = rtinit(&ia->ia_ifa, RTM_ADD, flags); 658 if (err == 0) 659 ia->ia_flags |= IFA_ROUTE; 660 break; 661 } 662} 663 664u_long rip_sendspace = 9216; 665u_long rip_recvspace = 9216; 666 667SYSCTL_ULONG(_net_inet_raw, OID_AUTO, maxdgram, CTLFLAG_RW, 668 &rip_sendspace, 0, "Maximum outgoing raw IP datagram size"); 669SYSCTL_ULONG(_net_inet_raw, OID_AUTO, recvspace, CTLFLAG_RW, 670 &rip_recvspace, 0, "Maximum space for incoming raw IP datagrams"); 671 672static int 673rip_attach(struct socket *so, int proto, struct thread *td) 674{ 675 INIT_VNET_INET(so->so_vnet); 676 struct inpcb *inp; 677 int error; 678 679 inp = sotoinpcb(so); 680 KASSERT(inp == NULL, ("rip_attach: inp != NULL")); 681 682 error = priv_check(td, PRIV_NETINET_RAW); 683 if (error) 684 return (error); 685 if (proto >= IPPROTO_MAX || proto < 0) 686 return EPROTONOSUPPORT; 687 error = soreserve(so, rip_sendspace, rip_recvspace); 688 if (error) 689 return (error); 690 INP_INFO_WLOCK(&V_ripcbinfo); 691 error = in_pcballoc(so, &V_ripcbinfo); 692 if (error) { 693 INP_INFO_WUNLOCK(&V_ripcbinfo); 694 return (error); 695 } 696 inp = (struct inpcb *)so->so_pcb; 697 inp->inp_vflag |= INP_IPV4; 698 inp->inp_ip_p = proto; 699 inp->inp_ip_ttl = V_ip_defttl; 700 rip_inshash(inp); 701 INP_INFO_WUNLOCK(&V_ripcbinfo); 702 INP_WUNLOCK(inp); 703 return (0); 704} 705 706static void 707rip_detach(struct socket *so) 708{ 709 INIT_VNET_INET(so->so_vnet); 710 struct inpcb *inp; 711 712 inp = sotoinpcb(so); 713 KASSERT(inp != NULL, ("rip_detach: inp == NULL")); 714 KASSERT(inp->inp_faddr.s_addr == INADDR_ANY, 
715 ("rip_detach: not closed")); 716 717 INP_INFO_WLOCK(&V_ripcbinfo); 718 INP_WLOCK(inp); 719 rip_delhash(inp); 720 if (so == V_ip_mrouter && ip_mrouter_done) 721 ip_mrouter_done(); 722 if (ip_rsvp_force_done) 723 ip_rsvp_force_done(so); 724 if (so == V_ip_rsvpd) 725 ip_rsvp_done(); 726 in_pcbdetach(inp); 727 in_pcbfree(inp); 728 INP_INFO_WUNLOCK(&V_ripcbinfo); 729} 730 731static void 732rip_dodisconnect(struct socket *so, struct inpcb *inp) 733{ 734 735 INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo); 736 INP_WLOCK_ASSERT(inp); 737 738 rip_delhash(inp); 739 inp->inp_faddr.s_addr = INADDR_ANY; 740 rip_inshash(inp); 741 SOCK_LOCK(so); 742 so->so_state &= ~SS_ISCONNECTED; 743 SOCK_UNLOCK(so); 744} 745 746static void 747rip_abort(struct socket *so) 748{ 749 INIT_VNET_INET(so->so_vnet); 750 struct inpcb *inp; 751 752 inp = sotoinpcb(so); 753 KASSERT(inp != NULL, ("rip_abort: inp == NULL")); 754 755 INP_INFO_WLOCK(&V_ripcbinfo); 756 INP_WLOCK(inp); 757 rip_dodisconnect(so, inp); 758 INP_WUNLOCK(inp); 759 INP_INFO_WUNLOCK(&V_ripcbinfo); 760} 761 762static void 763rip_close(struct socket *so) 764{ 765 INIT_VNET_INET(so->so_vnet); 766 struct inpcb *inp; 767 768 inp = sotoinpcb(so); 769 KASSERT(inp != NULL, ("rip_close: inp == NULL")); 770 771 INP_INFO_WLOCK(&V_ripcbinfo); 772 INP_WLOCK(inp); 773 rip_dodisconnect(so, inp); 774 INP_WUNLOCK(inp); 775 INP_INFO_WUNLOCK(&V_ripcbinfo); 776} 777 778static int 779rip_disconnect(struct socket *so) 780{ 781 INIT_VNET_INET(so->so_vnet); 782 struct inpcb *inp; 783 784 if ((so->so_state & SS_ISCONNECTED) == 0) 785 return (ENOTCONN); 786 787 inp = sotoinpcb(so); 788 KASSERT(inp != NULL, ("rip_disconnect: inp == NULL")); 789 790 INP_INFO_WLOCK(&V_ripcbinfo); 791 INP_WLOCK(inp); 792 rip_dodisconnect(so, inp); 793 INP_WUNLOCK(inp); 794 INP_INFO_WUNLOCK(&V_ripcbinfo); 795 return (0); 796} 797 798static int 799rip_bind(struct socket *so, struct sockaddr *nam, struct thread *td) 800{ 801 INIT_VNET_NET(so->so_vnet); 802 INIT_VNET_INET(so->so_vnet); 
803 struct sockaddr_in *addr = (struct sockaddr_in *)nam; 804 struct inpcb *inp; 805 806 if (nam->sa_len != sizeof(*addr)) 807 return (EINVAL); 808 809 if (!prison_check_ip4(td->td_ucred, &addr->sin_addr)) 810 return (EADDRNOTAVAIL); 811 812 if (TAILQ_EMPTY(&V_ifnet) || 813 (addr->sin_family != AF_INET && addr->sin_family != AF_IMPLINK) || 814 (addr->sin_addr.s_addr && 815 ifa_ifwithaddr((struct sockaddr *)addr) == 0)) 816 return (EADDRNOTAVAIL); 817 818 inp = sotoinpcb(so); 819 KASSERT(inp != NULL, ("rip_bind: inp == NULL")); 820 821 INP_INFO_WLOCK(&V_ripcbinfo); 822 INP_WLOCK(inp); 823 rip_delhash(inp); 824 inp->inp_laddr = addr->sin_addr; 825 rip_inshash(inp); 826 INP_WUNLOCK(inp); 827 INP_INFO_WUNLOCK(&V_ripcbinfo); 828 return (0); 829} 830 831static int 832rip_connect(struct socket *so, struct sockaddr *nam, struct thread *td) 833{ 834 INIT_VNET_NET(so->so_vnet); 835 INIT_VNET_INET(so->so_vnet); 836 struct sockaddr_in *addr = (struct sockaddr_in *)nam; 837 struct inpcb *inp; 838 839 if (nam->sa_len != sizeof(*addr)) 840 return (EINVAL); 841 if (TAILQ_EMPTY(&V_ifnet)) 842 return (EADDRNOTAVAIL); 843 if (addr->sin_family != AF_INET && addr->sin_family != AF_IMPLINK) 844 return (EAFNOSUPPORT); 845 846 inp = sotoinpcb(so); 847 KASSERT(inp != NULL, ("rip_connect: inp == NULL")); 848 849 INP_INFO_WLOCK(&V_ripcbinfo); 850 INP_WLOCK(inp); 851 rip_delhash(inp); 852 inp->inp_faddr = addr->sin_addr; 853 rip_inshash(inp); 854 soisconnected(so); 855 INP_WUNLOCK(inp); 856 INP_INFO_WUNLOCK(&V_ripcbinfo); 857 return (0); 858} 859 860static int 861rip_shutdown(struct socket *so) 862{ 863 struct inpcb *inp; 864 865 inp = sotoinpcb(so); 866 KASSERT(inp != NULL, ("rip_shutdown: inp == NULL")); 867 868 INP_WLOCK(inp); 869 socantsendmore(so); 870 INP_WUNLOCK(inp); 871 return (0); 872} 873 874static int 875rip_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, 876 struct mbuf *control, struct thread *td) 877{ 878 struct inpcb *inp; 879 u_long dst; 880 881 inp = 
sotoinpcb(so); 882 KASSERT(inp != NULL, ("rip_send: inp == NULL")); 883 884 /* 885 * Note: 'dst' reads below are unlocked. 886 */ 887 if (so->so_state & SS_ISCONNECTED) { 888 if (nam) { 889 m_freem(m); 890 return (EISCONN); 891 } 892 dst = inp->inp_faddr.s_addr; /* Unlocked read. */ 893 } else { 894 if (nam == NULL) { 895 m_freem(m); 896 return (ENOTCONN); 897 } 898 dst = ((struct sockaddr_in *)nam)->sin_addr.s_addr; 899 } 900 return (rip_output(m, so, dst)); 901} 902 903static int 904rip_pcblist(SYSCTL_HANDLER_ARGS) 905{ 906 INIT_VNET_INET(curvnet); 907 int error, i, n; 908 struct inpcb *inp, **inp_list; 909 inp_gen_t gencnt; 910 struct xinpgen xig; 911 912 /* 913 * The process of preparing the TCB list is too time-consuming and 914 * resource-intensive to repeat twice on every request. 915 */ 916 if (req->oldptr == 0) { 917 n = V_ripcbinfo.ipi_count; 918 req->oldidx = 2 * (sizeof xig) 919 + (n + n/8) * sizeof(struct xinpcb); 920 return (0); 921 } 922 923 if (req->newptr != 0) 924 return (EPERM); 925 926 /* 927 * OK, now we're committed to doing something. 928 */ 929 INP_INFO_RLOCK(&V_ripcbinfo); 930 gencnt = V_ripcbinfo.ipi_gencnt; 931 n = V_ripcbinfo.ipi_count; 932 INP_INFO_RUNLOCK(&V_ripcbinfo); 933 934 xig.xig_len = sizeof xig; 935 xig.xig_count = n; 936 xig.xig_gen = gencnt; 937 xig.xig_sogen = so_gencnt; 938 error = SYSCTL_OUT(req, &xig, sizeof xig); 939 if (error) 940 return (error); 941 942 inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); 943 if (inp_list == 0) 944 return (ENOMEM); 945 946 INP_INFO_RLOCK(&V_ripcbinfo); 947 for (inp = LIST_FIRST(V_ripcbinfo.ipi_listhead), i = 0; inp && i < n; 948 inp = LIST_NEXT(inp, inp_list)) { 949 INP_RLOCK(inp); 950 if (inp->inp_gencnt <= gencnt && 951 cr_canseeinpcb(req->td->td_ucred, inp) == 0) { 952 /* XXX held references? 
*/ 953 inp_list[i++] = inp; 954 } 955 INP_RUNLOCK(inp); 956 } 957 INP_INFO_RUNLOCK(&V_ripcbinfo); 958 n = i; 959 960 error = 0; 961 for (i = 0; i < n; i++) { 962 inp = inp_list[i]; 963 INP_RLOCK(inp); 964 if (inp->inp_gencnt <= gencnt) { 965 struct xinpcb xi; 966 bzero(&xi, sizeof(xi)); 967 xi.xi_len = sizeof xi; 968 /* XXX should avoid extra copy */ 969 bcopy(inp, &xi.xi_inp, sizeof *inp); 970 if (inp->inp_socket) 971 sotoxsocket(inp->inp_socket, &xi.xi_socket); 972 INP_RUNLOCK(inp); 973 error = SYSCTL_OUT(req, &xi, sizeof xi); 974 } else 975 INP_RUNLOCK(inp); 976 } 977 if (!error) { 978 /* 979 * Give the user an updated idea of our state. If the 980 * generation differs from what we told her before, she knows 981 * that something happened while we were processing this 982 * request, and it might be necessary to retry. 983 */ 984 INP_INFO_RLOCK(&V_ripcbinfo); 985 xig.xig_gen = V_ripcbinfo.ipi_gencnt; 986 xig.xig_sogen = so_gencnt; 987 xig.xig_count = V_ripcbinfo.ipi_count; 988 INP_INFO_RUNLOCK(&V_ripcbinfo); 989 error = SYSCTL_OUT(req, &xig, sizeof xig); 990 } 991 free(inp_list, M_TEMP); 992 return (error); 993} 994 995SYSCTL_PROC(_net_inet_raw, OID_AUTO/*XXX*/, pcblist, CTLFLAG_RD, 0, 0, 996 rip_pcblist, "S,xinpcb", "List of active raw IP sockets"); 997 998struct pr_usrreqs rip_usrreqs = { 999 .pru_abort = rip_abort, 1000 .pru_attach = rip_attach, 1001 .pru_bind = rip_bind, 1002 .pru_connect = rip_connect, 1003 .pru_control = in_control, 1004 .pru_detach = rip_detach, 1005 .pru_disconnect = rip_disconnect, 1006 .pru_peeraddr = in_getpeeraddr, 1007 .pru_send = rip_send, 1008 .pru_shutdown = rip_shutdown, 1009 .pru_sockaddr = in_getsockaddr, 1010 .pru_sosetlabel = in_pcbsosetlabel, 1011 .pru_close = rip_close, 1012}; 1013