/* raw_ip.c revision 181803 */
1/*- 2 * Copyright (c) 1982, 1986, 1988, 1993 3 * The Regents of the University of California. 4 * All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 4. Neither the name of the University nor the names of its contributors 15 * may be used to endorse or promote products derived from this software 16 * without specific prior written permission. 17 * 18 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 21 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 28 * SUCH DAMAGE. 
29 * 30 * @(#)raw_ip.c 8.7 (Berkeley) 5/15/95 31 */ 32 33#include <sys/cdefs.h> 34__FBSDID("$FreeBSD: head/sys/netinet/raw_ip.c 181803 2008-08-17 23:27:27Z bz $"); 35 36#include "opt_inet6.h" 37#include "opt_ipsec.h" 38#include "opt_mac.h" 39 40#include <sys/param.h> 41#include <sys/jail.h> 42#include <sys/kernel.h> 43#include <sys/lock.h> 44#include <sys/malloc.h> 45#include <sys/mbuf.h> 46#include <sys/priv.h> 47#include <sys/proc.h> 48#include <sys/protosw.h> 49#include <sys/signalvar.h> 50#include <sys/socket.h> 51#include <sys/socketvar.h> 52#include <sys/sx.h> 53#include <sys/sysctl.h> 54#include <sys/systm.h> 55#include <sys/vimage.h> 56 57#include <vm/uma.h> 58 59#include <net/if.h> 60#include <net/route.h> 61 62#include <netinet/in.h> 63#include <netinet/in_systm.h> 64#include <netinet/in_pcb.h> 65#include <netinet/in_var.h> 66#include <netinet/ip.h> 67#include <netinet/ip_var.h> 68#include <netinet/ip_mroute.h> 69 70#include <netinet/ip_fw.h> 71#include <netinet/ip_dummynet.h> 72 73#ifdef IPSEC 74#include <netipsec/ipsec.h> 75#endif /*IPSEC*/ 76 77#include <security/mac/mac_framework.h> 78 79struct inpcbhead ripcb; 80struct inpcbinfo ripcbinfo; 81 82/* control hooks for ipfw and dummynet */ 83ip_fw_ctl_t *ip_fw_ctl_ptr = NULL; 84ip_dn_ctl_t *ip_dn_ctl_ptr = NULL; 85 86/* 87 * Hooks for multicast routing. They all default to NULL, so leave them not 88 * initialized and rely on BSS being set to 0. 89 */ 90 91/* 92 * The socket used to communicate with the multicast routing daemon. 93 */ 94struct socket *ip_mrouter; 95 96/* 97 * The various mrouter and rsvp functions. 
98 */ 99int (*ip_mrouter_set)(struct socket *, struct sockopt *); 100int (*ip_mrouter_get)(struct socket *, struct sockopt *); 101int (*ip_mrouter_done)(void); 102int (*ip_mforward)(struct ip *, struct ifnet *, struct mbuf *, 103 struct ip_moptions *); 104int (*mrt_ioctl)(int, caddr_t, int); 105int (*legal_vif_num)(int); 106u_long (*ip_mcast_src)(int); 107 108void (*rsvp_input_p)(struct mbuf *m, int off); 109int (*ip_rsvp_vif)(struct socket *, struct sockopt *); 110void (*ip_rsvp_force_done)(struct socket *); 111 112/* 113 * Hash functions 114 */ 115 116#define INP_PCBHASH_RAW_SIZE 256 117#define INP_PCBHASH_RAW(proto, laddr, faddr, mask) \ 118 (((proto) + (laddr) + (faddr)) % (mask) + 1) 119 120static void 121rip_inshash(struct inpcb *inp) 122{ 123 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; 124 struct inpcbhead *pcbhash; 125 int hash; 126 127 INP_INFO_WLOCK_ASSERT(pcbinfo); 128 INP_WLOCK_ASSERT(inp); 129 130 if (inp->inp_ip_p != 0 && 131 inp->inp_laddr.s_addr != INADDR_ANY && 132 inp->inp_faddr.s_addr != INADDR_ANY) { 133 hash = INP_PCBHASH_RAW(inp->inp_ip_p, inp->inp_laddr.s_addr, 134 inp->inp_faddr.s_addr, pcbinfo->ipi_hashmask); 135 } else 136 hash = 0; 137 pcbhash = &pcbinfo->ipi_hashbase[hash]; 138 LIST_INSERT_HEAD(pcbhash, inp, inp_hash); 139} 140 141static void 142rip_delhash(struct inpcb *inp) 143{ 144 145 INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo); 146 INP_WLOCK_ASSERT(inp); 147 148 LIST_REMOVE(inp, inp_hash); 149} 150 151/* 152 * Raw interface to IP protocol. 153 */ 154 155/* 156 * Initialize raw connection block q. 
157 */ 158static void 159rip_zone_change(void *tag) 160{ 161 162 uma_zone_set_max(V_ripcbinfo.ipi_zone, maxsockets); 163} 164 165static int 166rip_inpcb_init(void *mem, int size, int flags) 167{ 168 struct inpcb *inp = mem; 169 170 INP_LOCK_INIT(inp, "inp", "rawinp"); 171 return (0); 172} 173 174void 175rip_init(void) 176{ 177 178 INP_INFO_LOCK_INIT(&V_ripcbinfo, "rip"); 179 LIST_INIT(&V_ripcb); 180 V_ripcbinfo.ipi_listhead = &V_ripcb; 181 V_ripcbinfo.ipi_hashbase = hashinit(INP_PCBHASH_RAW_SIZE, M_PCB, 182 &V_ripcbinfo.ipi_hashmask); 183 V_ripcbinfo.ipi_porthashbase = hashinit(1, M_PCB, 184 &V_ripcbinfo.ipi_porthashmask); 185 V_ripcbinfo.ipi_zone = uma_zcreate("ripcb", sizeof(struct inpcb), 186 NULL, NULL, rip_inpcb_init, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); 187 uma_zone_set_max(V_ripcbinfo.ipi_zone, maxsockets); 188 EVENTHANDLER_REGISTER(maxsockets_change, rip_zone_change, NULL, 189 EVENTHANDLER_PRI_ANY); 190} 191 192static int 193rip_append(struct inpcb *last, struct ip *ip, struct mbuf *n, 194 struct sockaddr_in *ripsrc) 195{ 196 int policyfail = 0; 197 198 INP_RLOCK_ASSERT(last); 199 200#ifdef IPSEC 201 /* check AH/ESP integrity. */ 202 if (ipsec4_in_reject(n, last)) { 203 policyfail = 1; 204 } 205#endif /* IPSEC */ 206#ifdef MAC 207 if (!policyfail && mac_inpcb_check_deliver(last, n) != 0) 208 policyfail = 1; 209#endif 210 /* Check the minimum TTL for socket. 
*/ 211 if (last->inp_ip_minttl && last->inp_ip_minttl > ip->ip_ttl) 212 policyfail = 1; 213 if (!policyfail) { 214 struct mbuf *opts = NULL; 215 struct socket *so; 216 217 so = last->inp_socket; 218 if ((last->inp_flags & INP_CONTROLOPTS) || 219 (so->so_options & (SO_TIMESTAMP | SO_BINTIME))) 220 ip_savecontrol(last, &opts, ip, n); 221 SOCKBUF_LOCK(&so->so_rcv); 222 if (sbappendaddr_locked(&so->so_rcv, 223 (struct sockaddr *)ripsrc, n, opts) == 0) { 224 /* should notify about lost packet */ 225 m_freem(n); 226 if (opts) 227 m_freem(opts); 228 SOCKBUF_UNLOCK(&so->so_rcv); 229 } else 230 sorwakeup_locked(so); 231 } else 232 m_freem(n); 233 return (policyfail); 234} 235 236/* 237 * Setup generic address and protocol structures for raw_input routine, then 238 * pass them along with mbuf chain. 239 */ 240void 241rip_input(struct mbuf *m, int off) 242{ 243 struct ip *ip = mtod(m, struct ip *); 244 int proto = ip->ip_p; 245 struct inpcb *inp, *last; 246 struct sockaddr_in ripsrc; 247 int hash; 248 249 bzero(&ripsrc, sizeof(ripsrc)); 250 ripsrc.sin_len = sizeof(ripsrc); 251 ripsrc.sin_family = AF_INET; 252 ripsrc.sin_addr = ip->ip_src; 253 last = NULL; 254 hash = INP_PCBHASH_RAW(proto, ip->ip_src.s_addr, 255 ip->ip_dst.s_addr, V_ripcbinfo.ipi_hashmask); 256 INP_INFO_RLOCK(&V_ripcbinfo); 257 LIST_FOREACH(inp, &V_ripcbinfo.ipi_hashbase[hash], inp_hash) { 258 if (inp->inp_ip_p != proto) 259 continue; 260#ifdef INET6 261 if ((inp->inp_vflag & INP_IPV4) == 0) 262 continue; 263#endif 264 if (inp->inp_laddr.s_addr != ip->ip_dst.s_addr) 265 continue; 266 if (inp->inp_faddr.s_addr != ip->ip_src.s_addr) 267 continue; 268 INP_RLOCK(inp); 269 if (jailed(inp->inp_socket->so_cred) && 270 (htonl(prison_getip(inp->inp_socket->so_cred)) != 271 ip->ip_dst.s_addr)) { 272 INP_RUNLOCK(inp); 273 continue; 274 } 275 if (last) { 276 struct mbuf *n; 277 278 n = m_copy(m, 0, (int)M_COPYALL); 279 if (n != NULL) 280 (void) rip_append(last, ip, n, &ripsrc); 281 /* XXX count dropped packet */ 282 
INP_RUNLOCK(last); 283 } 284 last = inp; 285 } 286 LIST_FOREACH(inp, &V_ripcbinfo.ipi_hashbase[0], inp_hash) { 287 if (inp->inp_ip_p && inp->inp_ip_p != proto) 288 continue; 289#ifdef INET6 290 if ((inp->inp_vflag & INP_IPV4) == 0) 291 continue; 292#endif 293 if (inp->inp_laddr.s_addr && 294 inp->inp_laddr.s_addr != ip->ip_dst.s_addr) 295 continue; 296 if (inp->inp_faddr.s_addr && 297 inp->inp_faddr.s_addr != ip->ip_src.s_addr) 298 continue; 299 INP_RLOCK(inp); 300 if (jailed(inp->inp_socket->so_cred) && 301 (htonl(prison_getip(inp->inp_socket->so_cred)) != 302 ip->ip_dst.s_addr)) { 303 INP_RUNLOCK(inp); 304 continue; 305 } 306 if (last) { 307 struct mbuf *n; 308 309 n = m_copy(m, 0, (int)M_COPYALL); 310 if (n != NULL) 311 (void) rip_append(last, ip, n, &ripsrc); 312 /* XXX count dropped packet */ 313 INP_RUNLOCK(last); 314 } 315 last = inp; 316 } 317 INP_INFO_RUNLOCK(&V_ripcbinfo); 318 if (last != NULL) { 319 if (rip_append(last, ip, m, &ripsrc) != 0) 320 V_ipstat.ips_delivered--; 321 INP_RUNLOCK(last); 322 } else { 323 m_freem(m); 324 V_ipstat.ips_noproto++; 325 V_ipstat.ips_delivered--; 326 } 327} 328 329/* 330 * Generate IP header and pass packet to ip_output. Tack on options user may 331 * have setup with control call. 332 */ 333int 334rip_output(struct mbuf *m, struct socket *so, u_long dst) 335{ 336 struct ip *ip; 337 int error; 338 struct inpcb *inp = sotoinpcb(so); 339 int flags = ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0) | 340 IP_ALLOWBROADCAST; 341 342 /* 343 * If the user handed us a complete IP packet, use it. Otherwise, 344 * allocate an mbuf for a header and fill it in. 
345 */ 346 if ((inp->inp_flags & INP_HDRINCL) == 0) { 347 if (m->m_pkthdr.len + sizeof(struct ip) > IP_MAXPACKET) { 348 m_freem(m); 349 return(EMSGSIZE); 350 } 351 M_PREPEND(m, sizeof(struct ip), M_DONTWAIT); 352 if (m == NULL) 353 return(ENOBUFS); 354 355 INP_RLOCK(inp); 356 ip = mtod(m, struct ip *); 357 ip->ip_tos = inp->inp_ip_tos; 358 if (inp->inp_flags & INP_DONTFRAG) 359 ip->ip_off = IP_DF; 360 else 361 ip->ip_off = 0; 362 ip->ip_p = inp->inp_ip_p; 363 ip->ip_len = m->m_pkthdr.len; 364 if (jailed(inp->inp_socket->so_cred)) 365 ip->ip_src.s_addr = 366 htonl(prison_getip(inp->inp_socket->so_cred)); 367 else 368 ip->ip_src = inp->inp_laddr; 369 ip->ip_dst.s_addr = dst; 370 ip->ip_ttl = inp->inp_ip_ttl; 371 } else { 372 if (m->m_pkthdr.len > IP_MAXPACKET) { 373 m_freem(m); 374 return(EMSGSIZE); 375 } 376 INP_RLOCK(inp); 377 ip = mtod(m, struct ip *); 378 if (jailed(inp->inp_socket->so_cred)) { 379 if (ip->ip_src.s_addr != 380 htonl(prison_getip(inp->inp_socket->so_cred))) { 381 INP_RUNLOCK(inp); 382 m_freem(m); 383 return (EPERM); 384 } 385 } 386 387 /* 388 * Don't allow both user specified and setsockopt options, 389 * and don't allow packet length sizes that will crash. 390 */ 391 if (((ip->ip_hl != (sizeof (*ip) >> 2)) && inp->inp_options) 392 || (ip->ip_len > m->m_pkthdr.len) 393 || (ip->ip_len < (ip->ip_hl << 2))) { 394 INP_RUNLOCK(inp); 395 m_freem(m); 396 return (EINVAL); 397 } 398 if (ip->ip_id == 0) 399 ip->ip_id = ip_newid(); 400 401 /* 402 * XXX prevent ip_output from overwriting header fields. 403 */ 404 flags |= IP_RAWOUTPUT; 405 V_ipstat.ips_rawout++; 406 } 407 408 if (inp->inp_flags & INP_ONESBCAST) 409 flags |= IP_SENDONES; 410 411#ifdef MAC 412 mac_inpcb_create_mbuf(inp, m); 413#endif 414 415 error = ip_output(m, inp->inp_options, NULL, flags, 416 inp->inp_moptions, inp); 417 INP_RUNLOCK(inp); 418 return (error); 419} 420 421/* 422 * Raw IP socket option processing. 
423 * 424 * IMPORTANT NOTE regarding access control: Traditionally, raw sockets could 425 * only be created by a privileged process, and as such, socket option 426 * operations to manage system properties on any raw socket were allowed to 427 * take place without explicit additional access control checks. However, 428 * raw sockets can now also be created in jail(), and therefore explicit 429 * checks are now required. Likewise, raw sockets can be used by a process 430 * after it gives up privilege, so some caution is required. For options 431 * passed down to the IP layer via ip_ctloutput(), checks are assumed to be 432 * performed in ip_ctloutput() and therefore no check occurs here. 433 * Unilaterally checking priv_check() here breaks normal IP socket option 434 * operations on raw sockets. 435 * 436 * When adding new socket options here, make sure to add access control 437 * checks here as necessary. 438 */ 439int 440rip_ctloutput(struct socket *so, struct sockopt *sopt) 441{ 442 struct inpcb *inp = sotoinpcb(so); 443 int error, optval; 444 445 if (sopt->sopt_level != IPPROTO_IP) 446 return (EINVAL); 447 448 error = 0; 449 switch (sopt->sopt_dir) { 450 case SOPT_GET: 451 switch (sopt->sopt_name) { 452 case IP_HDRINCL: 453 optval = inp->inp_flags & INP_HDRINCL; 454 error = sooptcopyout(sopt, &optval, sizeof optval); 455 break; 456 457 case IP_FW_ADD: /* ADD actually returns the body... 
*/ 458 case IP_FW_GET: 459 case IP_FW_TABLE_GETSIZE: 460 case IP_FW_TABLE_LIST: 461 case IP_FW_NAT_GET_CONFIG: 462 case IP_FW_NAT_GET_LOG: 463 if (ip_fw_ctl_ptr != NULL) 464 error = ip_fw_ctl_ptr(sopt); 465 else 466 error = ENOPROTOOPT; 467 break; 468 469 case IP_DUMMYNET_GET: 470 if (ip_dn_ctl_ptr != NULL) 471 error = ip_dn_ctl_ptr(sopt); 472 else 473 error = ENOPROTOOPT; 474 break ; 475 476 case MRT_INIT: 477 case MRT_DONE: 478 case MRT_ADD_VIF: 479 case MRT_DEL_VIF: 480 case MRT_ADD_MFC: 481 case MRT_DEL_MFC: 482 case MRT_VERSION: 483 case MRT_ASSERT: 484 case MRT_API_SUPPORT: 485 case MRT_API_CONFIG: 486 case MRT_ADD_BW_UPCALL: 487 case MRT_DEL_BW_UPCALL: 488 error = priv_check(curthread, PRIV_NETINET_MROUTE); 489 if (error != 0) 490 return (error); 491 error = ip_mrouter_get ? ip_mrouter_get(so, sopt) : 492 EOPNOTSUPP; 493 break; 494 495 default: 496 error = ip_ctloutput(so, sopt); 497 break; 498 } 499 break; 500 501 case SOPT_SET: 502 switch (sopt->sopt_name) { 503 case IP_HDRINCL: 504 error = sooptcopyin(sopt, &optval, sizeof optval, 505 sizeof optval); 506 if (error) 507 break; 508 if (optval) 509 inp->inp_flags |= INP_HDRINCL; 510 else 511 inp->inp_flags &= ~INP_HDRINCL; 512 break; 513 514 case IP_FW_ADD: 515 case IP_FW_DEL: 516 case IP_FW_FLUSH: 517 case IP_FW_ZERO: 518 case IP_FW_RESETLOG: 519 case IP_FW_TABLE_ADD: 520 case IP_FW_TABLE_DEL: 521 case IP_FW_TABLE_FLUSH: 522 case IP_FW_NAT_CFG: 523 case IP_FW_NAT_DEL: 524 if (ip_fw_ctl_ptr != NULL) 525 error = ip_fw_ctl_ptr(sopt); 526 else 527 error = ENOPROTOOPT; 528 break; 529 530 case IP_DUMMYNET_CONFIGURE: 531 case IP_DUMMYNET_DEL: 532 case IP_DUMMYNET_FLUSH: 533 if (ip_dn_ctl_ptr != NULL) 534 error = ip_dn_ctl_ptr(sopt); 535 else 536 error = ENOPROTOOPT ; 537 break ; 538 539 case IP_RSVP_ON: 540 error = priv_check(curthread, PRIV_NETINET_MROUTE); 541 if (error != 0) 542 return (error); 543 error = ip_rsvp_init(so); 544 break; 545 546 case IP_RSVP_OFF: 547 error = priv_check(curthread, 
PRIV_NETINET_MROUTE); 548 if (error != 0) 549 return (error); 550 error = ip_rsvp_done(); 551 break; 552 553 case IP_RSVP_VIF_ON: 554 case IP_RSVP_VIF_OFF: 555 error = priv_check(curthread, PRIV_NETINET_MROUTE); 556 if (error != 0) 557 return (error); 558 error = ip_rsvp_vif ? 559 ip_rsvp_vif(so, sopt) : EINVAL; 560 break; 561 562 case MRT_INIT: 563 case MRT_DONE: 564 case MRT_ADD_VIF: 565 case MRT_DEL_VIF: 566 case MRT_ADD_MFC: 567 case MRT_DEL_MFC: 568 case MRT_VERSION: 569 case MRT_ASSERT: 570 case MRT_API_SUPPORT: 571 case MRT_API_CONFIG: 572 case MRT_ADD_BW_UPCALL: 573 case MRT_DEL_BW_UPCALL: 574 error = priv_check(curthread, PRIV_NETINET_MROUTE); 575 if (error != 0) 576 return (error); 577 error = ip_mrouter_set ? ip_mrouter_set(so, sopt) : 578 EOPNOTSUPP; 579 break; 580 581 default: 582 error = ip_ctloutput(so, sopt); 583 break; 584 } 585 break; 586 } 587 588 return (error); 589} 590 591/* 592 * This function exists solely to receive the PRC_IFDOWN messages which are 593 * sent by if_down(). It looks for an ifaddr whose ifa_addr is sa, and calls 594 * in_ifadown() to remove all routes corresponding to that address. It also 595 * receives the PRC_IFUP messages from if_up() and reinstalls the interface 596 * routes. 597 */ 598void 599rip_ctlinput(int cmd, struct sockaddr *sa, void *vip) 600{ 601 struct in_ifaddr *ia; 602 struct ifnet *ifp; 603 int err; 604 int flags; 605 606 switch (cmd) { 607 case PRC_IFDOWN: 608 TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { 609 if (ia->ia_ifa.ifa_addr == sa 610 && (ia->ia_flags & IFA_ROUTE)) { 611 /* 612 * in_ifscrub kills the interface route. 613 */ 614 in_ifscrub(ia->ia_ifp, ia); 615 /* 616 * in_ifadown gets rid of all the rest of the 617 * routes. This is not quite the right thing 618 * to do, but at least if we are running a 619 * routing process they will come back. 
620 */ 621 in_ifadown(&ia->ia_ifa, 0); 622 break; 623 } 624 } 625 break; 626 627 case PRC_IFUP: 628 TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { 629 if (ia->ia_ifa.ifa_addr == sa) 630 break; 631 } 632 if (ia == 0 || (ia->ia_flags & IFA_ROUTE)) 633 return; 634 flags = RTF_UP; 635 ifp = ia->ia_ifa.ifa_ifp; 636 637 if ((ifp->if_flags & IFF_LOOPBACK) 638 || (ifp->if_flags & IFF_POINTOPOINT)) 639 flags |= RTF_HOST; 640 641 err = rtinit(&ia->ia_ifa, RTM_ADD, flags); 642 if (err == 0) 643 ia->ia_flags |= IFA_ROUTE; 644 break; 645 } 646} 647 648u_long rip_sendspace = 9216; 649u_long rip_recvspace = 9216; 650 651SYSCTL_ULONG(_net_inet_raw, OID_AUTO, maxdgram, CTLFLAG_RW, 652 &rip_sendspace, 0, "Maximum outgoing raw IP datagram size"); 653SYSCTL_ULONG(_net_inet_raw, OID_AUTO, recvspace, CTLFLAG_RW, 654 &rip_recvspace, 0, "Maximum space for incoming raw IP datagrams"); 655 656static int 657rip_attach(struct socket *so, int proto, struct thread *td) 658{ 659 struct inpcb *inp; 660 int error; 661 662 inp = sotoinpcb(so); 663 KASSERT(inp == NULL, ("rip_attach: inp != NULL")); 664 665 error = priv_check(td, PRIV_NETINET_RAW); 666 if (error) 667 return (error); 668 if (proto >= IPPROTO_MAX || proto < 0) 669 return EPROTONOSUPPORT; 670 error = soreserve(so, rip_sendspace, rip_recvspace); 671 if (error) 672 return (error); 673 INP_INFO_WLOCK(&V_ripcbinfo); 674 error = in_pcballoc(so, &V_ripcbinfo); 675 if (error) { 676 INP_INFO_WUNLOCK(&V_ripcbinfo); 677 return (error); 678 } 679 inp = (struct inpcb *)so->so_pcb; 680 inp->inp_vflag |= INP_IPV4; 681 inp->inp_ip_p = proto; 682 inp->inp_ip_ttl = V_ip_defttl; 683 rip_inshash(inp); 684 INP_INFO_WUNLOCK(&V_ripcbinfo); 685 INP_WUNLOCK(inp); 686 return (0); 687} 688 689static void 690rip_detach(struct socket *so) 691{ 692 struct inpcb *inp; 693 694 inp = sotoinpcb(so); 695 KASSERT(inp != NULL, ("rip_detach: inp == NULL")); 696 KASSERT(inp->inp_faddr.s_addr == INADDR_ANY, 697 ("rip_detach: not closed")); 698 699 
INP_INFO_WLOCK(&V_ripcbinfo); 700 INP_WLOCK(inp); 701 rip_delhash(inp); 702 if (so == V_ip_mrouter && ip_mrouter_done) 703 ip_mrouter_done(); 704 if (ip_rsvp_force_done) 705 ip_rsvp_force_done(so); 706 if (so == V_ip_rsvpd) 707 ip_rsvp_done(); 708 in_pcbdetach(inp); 709 in_pcbfree(inp); 710 INP_INFO_WUNLOCK(&V_ripcbinfo); 711} 712 713static void 714rip_dodisconnect(struct socket *so, struct inpcb *inp) 715{ 716 717 INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo); 718 INP_WLOCK_ASSERT(inp); 719 720 rip_delhash(inp); 721 inp->inp_faddr.s_addr = INADDR_ANY; 722 rip_inshash(inp); 723 SOCK_LOCK(so); 724 so->so_state &= ~SS_ISCONNECTED; 725 SOCK_UNLOCK(so); 726} 727 728static void 729rip_abort(struct socket *so) 730{ 731 struct inpcb *inp; 732 733 inp = sotoinpcb(so); 734 KASSERT(inp != NULL, ("rip_abort: inp == NULL")); 735 736 INP_INFO_WLOCK(&V_ripcbinfo); 737 INP_WLOCK(inp); 738 rip_dodisconnect(so, inp); 739 INP_WUNLOCK(inp); 740 INP_INFO_WUNLOCK(&V_ripcbinfo); 741} 742 743static void 744rip_close(struct socket *so) 745{ 746 struct inpcb *inp; 747 748 inp = sotoinpcb(so); 749 KASSERT(inp != NULL, ("rip_close: inp == NULL")); 750 751 INP_INFO_WLOCK(&V_ripcbinfo); 752 INP_WLOCK(inp); 753 rip_dodisconnect(so, inp); 754 INP_WUNLOCK(inp); 755 INP_INFO_WUNLOCK(&V_ripcbinfo); 756} 757 758static int 759rip_disconnect(struct socket *so) 760{ 761 struct inpcb *inp; 762 763 if ((so->so_state & SS_ISCONNECTED) == 0) 764 return (ENOTCONN); 765 766 inp = sotoinpcb(so); 767 KASSERT(inp != NULL, ("rip_disconnect: inp == NULL")); 768 769 INP_INFO_WLOCK(&V_ripcbinfo); 770 INP_WLOCK(inp); 771 rip_dodisconnect(so, inp); 772 INP_WUNLOCK(inp); 773 INP_INFO_WUNLOCK(&V_ripcbinfo); 774 return (0); 775} 776 777static int 778rip_bind(struct socket *so, struct sockaddr *nam, struct thread *td) 779{ 780 struct sockaddr_in *addr = (struct sockaddr_in *)nam; 781 struct inpcb *inp; 782 783 if (nam->sa_len != sizeof(*addr)) 784 return (EINVAL); 785 786 if (jailed(td->td_ucred)) { 787 if 
(addr->sin_addr.s_addr == INADDR_ANY) 788 addr->sin_addr.s_addr = 789 htonl(prison_getip(td->td_ucred)); 790 if (htonl(prison_getip(td->td_ucred)) != addr->sin_addr.s_addr) 791 return (EADDRNOTAVAIL); 792 } 793 794 if (TAILQ_EMPTY(&V_ifnet) || 795 (addr->sin_family != AF_INET && addr->sin_family != AF_IMPLINK) || 796 (addr->sin_addr.s_addr && 797 ifa_ifwithaddr((struct sockaddr *)addr) == 0)) 798 return (EADDRNOTAVAIL); 799 800 inp = sotoinpcb(so); 801 KASSERT(inp != NULL, ("rip_bind: inp == NULL")); 802 803 INP_INFO_WLOCK(&V_ripcbinfo); 804 INP_WLOCK(inp); 805 rip_delhash(inp); 806 inp->inp_laddr = addr->sin_addr; 807 rip_inshash(inp); 808 INP_WUNLOCK(inp); 809 INP_INFO_WUNLOCK(&V_ripcbinfo); 810 return (0); 811} 812 813static int 814rip_connect(struct socket *so, struct sockaddr *nam, struct thread *td) 815{ 816 struct sockaddr_in *addr = (struct sockaddr_in *)nam; 817 struct inpcb *inp; 818 819 if (nam->sa_len != sizeof(*addr)) 820 return (EINVAL); 821 if (TAILQ_EMPTY(&V_ifnet)) 822 return (EADDRNOTAVAIL); 823 if (addr->sin_family != AF_INET && addr->sin_family != AF_IMPLINK) 824 return (EAFNOSUPPORT); 825 826 inp = sotoinpcb(so); 827 KASSERT(inp != NULL, ("rip_connect: inp == NULL")); 828 829 INP_INFO_WLOCK(&V_ripcbinfo); 830 INP_WLOCK(inp); 831 rip_delhash(inp); 832 inp->inp_faddr = addr->sin_addr; 833 rip_inshash(inp); 834 soisconnected(so); 835 INP_WUNLOCK(inp); 836 INP_INFO_WUNLOCK(&V_ripcbinfo); 837 return (0); 838} 839 840static int 841rip_shutdown(struct socket *so) 842{ 843 struct inpcb *inp; 844 845 inp = sotoinpcb(so); 846 KASSERT(inp != NULL, ("rip_shutdown: inp == NULL")); 847 848 INP_WLOCK(inp); 849 socantsendmore(so); 850 INP_WUNLOCK(inp); 851 return (0); 852} 853 854static int 855rip_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, 856 struct mbuf *control, struct thread *td) 857{ 858 struct inpcb *inp; 859 u_long dst; 860 861 inp = sotoinpcb(so); 862 KASSERT(inp != NULL, ("rip_send: inp == NULL")); 863 864 /* 865 * Note: 
'dst' reads below are unlocked. 866 */ 867 if (so->so_state & SS_ISCONNECTED) { 868 if (nam) { 869 m_freem(m); 870 return (EISCONN); 871 } 872 dst = inp->inp_faddr.s_addr; /* Unlocked read. */ 873 } else { 874 if (nam == NULL) { 875 m_freem(m); 876 return (ENOTCONN); 877 } 878 dst = ((struct sockaddr_in *)nam)->sin_addr.s_addr; 879 } 880 return (rip_output(m, so, dst)); 881} 882 883static int 884rip_pcblist(SYSCTL_HANDLER_ARGS) 885{ 886 int error, i, n; 887 struct inpcb *inp, **inp_list; 888 inp_gen_t gencnt; 889 struct xinpgen xig; 890 891 /* 892 * The process of preparing the TCB list is too time-consuming and 893 * resource-intensive to repeat twice on every request. 894 */ 895 if (req->oldptr == 0) { 896 n = V_ripcbinfo.ipi_count; 897 req->oldidx = 2 * (sizeof xig) 898 + (n + n/8) * sizeof(struct xinpcb); 899 return (0); 900 } 901 902 if (req->newptr != 0) 903 return (EPERM); 904 905 /* 906 * OK, now we're committed to doing something. 907 */ 908 INP_INFO_RLOCK(&V_ripcbinfo); 909 gencnt = V_ripcbinfo.ipi_gencnt; 910 n = V_ripcbinfo.ipi_count; 911 INP_INFO_RUNLOCK(&V_ripcbinfo); 912 913 xig.xig_len = sizeof xig; 914 xig.xig_count = n; 915 xig.xig_gen = gencnt; 916 xig.xig_sogen = so_gencnt; 917 error = SYSCTL_OUT(req, &xig, sizeof xig); 918 if (error) 919 return (error); 920 921 inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); 922 if (inp_list == 0) 923 return (ENOMEM); 924 925 INP_INFO_RLOCK(&V_ripcbinfo); 926 for (inp = LIST_FIRST(V_ripcbinfo.ipi_listhead), i = 0; inp && i < n; 927 inp = LIST_NEXT(inp, inp_list)) { 928 INP_RLOCK(inp); 929 if (inp->inp_gencnt <= gencnt && 930 cr_canseesocket(req->td->td_ucred, inp->inp_socket) == 0) { 931 /* XXX held references? 
*/ 932 inp_list[i++] = inp; 933 } 934 INP_RUNLOCK(inp); 935 } 936 INP_INFO_RUNLOCK(&V_ripcbinfo); 937 n = i; 938 939 error = 0; 940 for (i = 0; i < n; i++) { 941 inp = inp_list[i]; 942 INP_RLOCK(inp); 943 if (inp->inp_gencnt <= gencnt) { 944 struct xinpcb xi; 945 bzero(&xi, sizeof(xi)); 946 xi.xi_len = sizeof xi; 947 /* XXX should avoid extra copy */ 948 bcopy(inp, &xi.xi_inp, sizeof *inp); 949 if (inp->inp_socket) 950 sotoxsocket(inp->inp_socket, &xi.xi_socket); 951 INP_RUNLOCK(inp); 952 error = SYSCTL_OUT(req, &xi, sizeof xi); 953 } else 954 INP_RUNLOCK(inp); 955 } 956 if (!error) { 957 /* 958 * Give the user an updated idea of our state. If the 959 * generation differs from what we told her before, she knows 960 * that something happened while we were processing this 961 * request, and it might be necessary to retry. 962 */ 963 INP_INFO_RLOCK(&V_ripcbinfo); 964 xig.xig_gen = V_ripcbinfo.ipi_gencnt; 965 xig.xig_sogen = so_gencnt; 966 xig.xig_count = V_ripcbinfo.ipi_count; 967 INP_INFO_RUNLOCK(&V_ripcbinfo); 968 error = SYSCTL_OUT(req, &xig, sizeof xig); 969 } 970 free(inp_list, M_TEMP); 971 return (error); 972} 973 974SYSCTL_PROC(_net_inet_raw, OID_AUTO/*XXX*/, pcblist, CTLFLAG_RD, 0, 0, 975 rip_pcblist, "S,xinpcb", "List of active raw IP sockets"); 976 977struct pr_usrreqs rip_usrreqs = { 978 .pru_abort = rip_abort, 979 .pru_attach = rip_attach, 980 .pru_bind = rip_bind, 981 .pru_connect = rip_connect, 982 .pru_control = in_control, 983 .pru_detach = rip_detach, 984 .pru_disconnect = rip_disconnect, 985 .pru_peeraddr = in_getpeeraddr, 986 .pru_send = rip_send, 987 .pru_shutdown = rip_shutdown, 988 .pru_sockaddr = in_getsockaddr, 989 .pru_sosetlabel = in_pcbsosetlabel, 990 .pru_close = rip_close, 991}; 992