raw_ip.c revision 180828
1/*- 2 * Copyright (c) 1982, 1986, 1988, 1993 3 * The Regents of the University of California. 4 * All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 4. Neither the name of the University nor the names of its contributors 15 * may be used to endorse or promote products derived from this software 16 * without specific prior written permission. 17 * 18 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 21 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 28 * SUCH DAMAGE. 29 * 30 * @(#)raw_ip.c 8.7 (Berkeley) 5/15/95 31 */ 32 33#include <sys/cdefs.h> 34__FBSDID("$FreeBSD: head/sys/netinet/raw_ip.c 180828 2008-07-26 17:32:15Z mav $"); 35 36#include "opt_inet6.h" 37#include "opt_ipsec.h" 38#include "opt_mac.h" 39 40#include <sys/param.h> 41#include <sys/jail.h> 42#include <sys/kernel.h> 43#include <sys/lock.h> 44#include <sys/malloc.h> 45#include <sys/mbuf.h> 46#include <sys/priv.h> 47#include <sys/proc.h> 48#include <sys/protosw.h> 49#include <sys/signalvar.h> 50#include <sys/socket.h> 51#include <sys/socketvar.h> 52#include <sys/sx.h> 53#include <sys/sysctl.h> 54#include <sys/systm.h> 55 56#include <vm/uma.h> 57 58#include <net/if.h> 59#include <net/route.h> 60 61#include <netinet/in.h> 62#include <netinet/in_systm.h> 63#include <netinet/in_pcb.h> 64#include <netinet/in_var.h> 65#include <netinet/ip.h> 66#include <netinet/ip_var.h> 67#include <netinet/ip_mroute.h> 68 69#include <netinet/ip_fw.h> 70#include <netinet/ip_dummynet.h> 71 72#ifdef IPSEC 73#include <netipsec/ipsec.h> 74#endif /*IPSEC*/ 75 76#include <security/mac/mac_framework.h> 77 78struct inpcbhead ripcb; 79struct inpcbinfo ripcbinfo; 80 81/* control hooks for ipfw and dummynet */ 82ip_fw_ctl_t *ip_fw_ctl_ptr = NULL; 83ip_dn_ctl_t *ip_dn_ctl_ptr = NULL; 84 85/* 86 * Hooks for multicast routing. They all default to NULL, so leave them not 87 * initialized and rely on BSS being set to 0. 88 */ 89 90/* 91 * The socket used to communicate with the multicast routing daemon. 92 */ 93struct socket *ip_mrouter; 94 95/* 96 * The various mrouter and rsvp functions. 97 */ 98int (*ip_mrouter_set)(struct socket *, struct sockopt *); 99int (*ip_mrouter_get)(struct socket *, struct sockopt *); 100int (*ip_mrouter_done)(void); 101int (*ip_mforward)(struct ip *, struct ifnet *, struct mbuf *, 102 struct ip_moptions *); 103int (*mrt_ioctl)(int, caddr_t, int); 104int (*legal_vif_num)(int); 105u_long (*ip_mcast_src)(int); 106 107void (*rsvp_input_p)(struct mbuf *m, int off); 108int (*ip_rsvp_vif)(struct socket *, struct sockopt *); 109void (*ip_rsvp_force_done)(struct socket *); 110 111/* 112 * Hash functions 113 */ 114 115#define INP_PCBHASH_RAW_SIZE 256 116#define INP_PCBHASH_RAW(proto, laddr, faddr, mask) \ 117 (((proto) + (laddr) + (faddr)) % (mask) + 1) 118 119static void 120rip_inshash(struct inpcb *inp) 121{ 122 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; 123 struct inpcbhead *pcbhash; 124 int hash; 125 126 INP_INFO_WLOCK_ASSERT(pcbinfo); 127 INP_WLOCK_ASSERT(inp); 128 129 if (inp->inp_ip_p && inp->inp_laddr.s_addr && inp->inp_faddr.s_addr) { 130 hash = INP_PCBHASH_RAW(inp->inp_ip_p, inp->inp_laddr.s_addr, 131 inp->inp_faddr.s_addr, pcbinfo->ipi_hashmask); 132 } else { 133 hash = 0; 134 } 135 pcbhash = &pcbinfo->ipi_hashbase[hash]; 136 LIST_INSERT_HEAD(pcbhash, inp, inp_hash); 137} 138 139static void 140rip_delhash(struct inpcb *inp) 141{ 142 INP_WLOCK_ASSERT(inp); 143 LIST_REMOVE(inp, inp_hash); 144} 145 146/* 147 * Raw interface to IP protocol. 148 */ 149 150/* 151 * Initialize raw connection block q. 152 */ 153static void 154rip_zone_change(void *tag) 155{ 156 157 uma_zone_set_max(ripcbinfo.ipi_zone, maxsockets); 158} 159 160static int 161rip_inpcb_init(void *mem, int size, int flags) 162{ 163 struct inpcb *inp = mem; 164 165 INP_LOCK_INIT(inp, "inp", "rawinp"); 166 return (0); 167} 168 169void 170rip_init(void) 171{ 172 173 INP_INFO_LOCK_INIT(&ripcbinfo, "rip"); 174 LIST_INIT(&ripcb); 175 ripcbinfo.ipi_listhead = &ripcb; 176 ripcbinfo.ipi_hashbase = hashinit(INP_PCBHASH_RAW_SIZE, M_PCB, 177 &ripcbinfo.ipi_hashmask); 178 ripcbinfo.ipi_porthashbase = hashinit(1, M_PCB, 179 &ripcbinfo.ipi_porthashmask); 180 ripcbinfo.ipi_zone = uma_zcreate("ripcb", sizeof(struct inpcb), 181 NULL, NULL, rip_inpcb_init, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); 182 uma_zone_set_max(ripcbinfo.ipi_zone, maxsockets); 183 EVENTHANDLER_REGISTER(maxsockets_change, rip_zone_change, NULL, 184 EVENTHANDLER_PRI_ANY); 185} 186 187static int 188rip_append(struct inpcb *last, struct ip *ip, struct mbuf *n, 189 struct sockaddr_in *ripsrc) 190{ 191 int policyfail = 0; 192 193 INP_RLOCK_ASSERT(last); 194 195#ifdef IPSEC 196 /* check AH/ESP integrity. */ 197 if (ipsec4_in_reject(n, last)) { 198 policyfail = 1; 199 } 200#endif /* IPSEC */ 201#ifdef MAC 202 if (!policyfail && mac_inpcb_check_deliver(last, n) != 0) 203 policyfail = 1; 204#endif 205 /* Check the minimum TTL for socket. */ 206 if (last->inp_ip_minttl && last->inp_ip_minttl > ip->ip_ttl) 207 policyfail = 1; 208 if (!policyfail) { 209 struct mbuf *opts = NULL; 210 struct socket *so; 211 212 so = last->inp_socket; 213 if ((last->inp_flags & INP_CONTROLOPTS) || 214 (so->so_options & (SO_TIMESTAMP | SO_BINTIME))) 215 ip_savecontrol(last, &opts, ip, n); 216 SOCKBUF_LOCK(&so->so_rcv); 217 if (sbappendaddr_locked(&so->so_rcv, 218 (struct sockaddr *)ripsrc, n, opts) == 0) { 219 /* should notify about lost packet */ 220 m_freem(n); 221 if (opts) 222 m_freem(opts); 223 SOCKBUF_UNLOCK(&so->so_rcv); 224 } else 225 sorwakeup_locked(so); 226 } else 227 m_freem(n); 228 return (policyfail); 229} 230 231/* 232 * Setup generic address and protocol structures for raw_input routine, then 233 * pass them along with mbuf chain. 234 */ 235void 236rip_input(struct mbuf *m, int off) 237{ 238 struct ip *ip = mtod(m, struct ip *); 239 int proto = ip->ip_p; 240 struct inpcb *inp, *last; 241 struct sockaddr_in ripsrc; 242 int hash; 243 244 bzero(&ripsrc, sizeof(ripsrc)); 245 ripsrc.sin_len = sizeof(ripsrc); 246 ripsrc.sin_family = AF_INET; 247 ripsrc.sin_addr = ip->ip_src; 248 last = NULL; 249 INP_INFO_RLOCK(&ripcbinfo); 250 hash = INP_PCBHASH_RAW(proto, ip->ip_src.s_addr, 251 ip->ip_dst.s_addr, ripcbinfo.ipi_hashmask); 252 LIST_FOREACH(inp, &ripcbinfo.ipi_hashbase[hash], inp_hash) { 253 INP_RLOCK(inp); 254 if (inp->inp_ip_p != proto) { 255 docontinue1: 256 INP_RUNLOCK(inp); 257 continue; 258 } 259#ifdef INET6 260 if ((inp->inp_vflag & INP_IPV4) == 0) 261 goto docontinue1; 262#endif 263 if (inp->inp_laddr.s_addr != ip->ip_dst.s_addr) 264 goto docontinue1; 265 if (inp->inp_faddr.s_addr != ip->ip_src.s_addr) 266 goto docontinue1; 267 if (jailed(inp->inp_socket->so_cred) && 268 (htonl(prison_getip(inp->inp_socket->so_cred)) != 269 ip->ip_dst.s_addr)) 270 goto docontinue1; 271 if (last) { 272 struct mbuf *n; 273 274 n = m_copy(m, 0, (int)M_COPYALL); 275 if (n != NULL) 276 (void) rip_append(last, ip, n, &ripsrc); 277 /* XXX count dropped packet */ 278 INP_RUNLOCK(last); 279 } 280 last = inp; 281 } 282 LIST_FOREACH(inp, &ripcbinfo.ipi_hashbase[0], inp_hash) { 283 INP_RLOCK(inp); 284 if (inp->inp_ip_p && inp->inp_ip_p != proto) { 285 docontinue: 286 INP_RUNLOCK(inp); 287 continue; 288 } 289#ifdef INET6 290 if ((inp->inp_vflag & INP_IPV4) == 0) 291 goto docontinue; 292#endif 293 if (inp->inp_laddr.s_addr && 294 inp->inp_laddr.s_addr != ip->ip_dst.s_addr) 295 goto docontinue; 296 if (inp->inp_faddr.s_addr && 297 inp->inp_faddr.s_addr != ip->ip_src.s_addr) 298 goto docontinue; 299 if (jailed(inp->inp_socket->so_cred)) 300 if (htonl(prison_getip(inp->inp_socket->so_cred)) != 301 ip->ip_dst.s_addr) 302 goto docontinue; 303 if (last) { 304 struct mbuf *n; 305 306 n = m_copy(m, 0, (int)M_COPYALL); 307 if (n != NULL) 308 (void) rip_append(last, ip, n, &ripsrc); 309 /* XXX count dropped packet */ 310 INP_RUNLOCK(last); 311 } 312 last = inp; 313 } 314 if (last != NULL) { 315 if (rip_append(last, ip, m, &ripsrc) != 0) 316 ipstat.ips_delivered--; 317 INP_RUNLOCK(last); 318 } else { 319 m_freem(m); 320 ipstat.ips_noproto++; 321 ipstat.ips_delivered--; 322 } 323 INP_INFO_RUNLOCK(&ripcbinfo); 324} 325 326/* 327 * Generate IP header and pass packet to ip_output. Tack on options user may 328 * have setup with control call. 329 */ 330int 331rip_output(struct mbuf *m, struct socket *so, u_long dst) 332{ 333 struct ip *ip; 334 int error; 335 struct inpcb *inp = sotoinpcb(so); 336 int flags = ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0) | 337 IP_ALLOWBROADCAST; 338 339 /* 340 * If the user handed us a complete IP packet, use it. Otherwise, 341 * allocate an mbuf for a header and fill it in. 342 */ 343 if ((inp->inp_flags & INP_HDRINCL) == 0) { 344 if (m->m_pkthdr.len + sizeof(struct ip) > IP_MAXPACKET) { 345 m_freem(m); 346 return(EMSGSIZE); 347 } 348 M_PREPEND(m, sizeof(struct ip), M_DONTWAIT); 349 if (m == NULL) 350 return(ENOBUFS); 351 352 INP_RLOCK(inp); 353 ip = mtod(m, struct ip *); 354 ip->ip_tos = inp->inp_ip_tos; 355 if (inp->inp_flags & INP_DONTFRAG) 356 ip->ip_off = IP_DF; 357 else 358 ip->ip_off = 0; 359 ip->ip_p = inp->inp_ip_p; 360 ip->ip_len = m->m_pkthdr.len; 361 if (jailed(inp->inp_socket->so_cred)) 362 ip->ip_src.s_addr = 363 htonl(prison_getip(inp->inp_socket->so_cred)); 364 else 365 ip->ip_src = inp->inp_laddr; 366 ip->ip_dst.s_addr = dst; 367 ip->ip_ttl = inp->inp_ip_ttl; 368 } else { 369 if (m->m_pkthdr.len > IP_MAXPACKET) { 370 m_freem(m); 371 return(EMSGSIZE); 372 } 373 INP_RLOCK(inp); 374 ip = mtod(m, struct ip *); 375 if (jailed(inp->inp_socket->so_cred)) { 376 if (ip->ip_src.s_addr != 377 htonl(prison_getip(inp->inp_socket->so_cred))) { 378 INP_RUNLOCK(inp); 379 m_freem(m); 380 return (EPERM); 381 } 382 } 383 384 /* 385 * Don't allow both user specified and setsockopt options, 386 * and don't allow packet length sizes that will crash. 387 */ 388 if (((ip->ip_hl != (sizeof (*ip) >> 2)) && inp->inp_options) 389 || (ip->ip_len > m->m_pkthdr.len) 390 || (ip->ip_len < (ip->ip_hl << 2))) { 391 INP_RUNLOCK(inp); 392 m_freem(m); 393 return (EINVAL); 394 } 395 if (ip->ip_id == 0) 396 ip->ip_id = ip_newid(); 397 398 /* 399 * XXX prevent ip_output from overwriting header fields. 400 */ 401 flags |= IP_RAWOUTPUT; 402 ipstat.ips_rawout++; 403 } 404 405 if (inp->inp_flags & INP_ONESBCAST) 406 flags |= IP_SENDONES; 407 408#ifdef MAC 409 mac_inpcb_create_mbuf(inp, m); 410#endif 411 412 error = ip_output(m, inp->inp_options, NULL, flags, 413 inp->inp_moptions, inp); 414 INP_RUNLOCK(inp); 415 return (error); 416} 417 418/* 419 * Raw IP socket option processing. 420 * 421 * IMPORTANT NOTE regarding access control: Traditionally, raw sockets could 422 * only be created by a privileged process, and as such, socket option 423 * operations to manage system properties on any raw socket were allowed to 424 * take place without explicit additional access control checks. However, 425 * raw sockets can now also be created in jail(), and therefore explicit 426 * checks are now required. Likewise, raw sockets can be used by a process 427 * after it gives up privilege, so some caution is required. For options 428 * passed down to the IP layer via ip_ctloutput(), checks are assumed to be 429 * performed in ip_ctloutput() and therefore no check occurs here. 430 * Unilaterally checking priv_check() here breaks normal IP socket option 431 * operations on raw sockets. 432 * 433 * When adding new socket options here, make sure to add access control 434 * checks here as necessary. 435 */ 436int 437rip_ctloutput(struct socket *so, struct sockopt *sopt) 438{ 439 struct inpcb *inp = sotoinpcb(so); 440 int error, optval; 441 442 if (sopt->sopt_level != IPPROTO_IP) 443 return (EINVAL); 444 445 error = 0; 446 switch (sopt->sopt_dir) { 447 case SOPT_GET: 448 switch (sopt->sopt_name) { 449 case IP_HDRINCL: 450 optval = inp->inp_flags & INP_HDRINCL; 451 error = sooptcopyout(sopt, &optval, sizeof optval); 452 break; 453 454 case IP_FW_ADD: /* ADD actually returns the body... */ 455 case IP_FW_GET: 456 case IP_FW_TABLE_GETSIZE: 457 case IP_FW_TABLE_LIST: 458 case IP_FW_NAT_GET_CONFIG: 459 case IP_FW_NAT_GET_LOG: 460 if (ip_fw_ctl_ptr != NULL) 461 error = ip_fw_ctl_ptr(sopt); 462 else 463 error = ENOPROTOOPT; 464 break; 465 466 case IP_DUMMYNET_GET: 467 if (ip_dn_ctl_ptr != NULL) 468 error = ip_dn_ctl_ptr(sopt); 469 else 470 error = ENOPROTOOPT; 471 break ; 472 473 case MRT_INIT: 474 case MRT_DONE: 475 case MRT_ADD_VIF: 476 case MRT_DEL_VIF: 477 case MRT_ADD_MFC: 478 case MRT_DEL_MFC: 479 case MRT_VERSION: 480 case MRT_ASSERT: 481 case MRT_API_SUPPORT: 482 case MRT_API_CONFIG: 483 case MRT_ADD_BW_UPCALL: 484 case MRT_DEL_BW_UPCALL: 485 error = priv_check(curthread, PRIV_NETINET_MROUTE); 486 if (error != 0) 487 return (error); 488 error = ip_mrouter_get ? ip_mrouter_get(so, sopt) : 489 EOPNOTSUPP; 490 break; 491 492 default: 493 error = ip_ctloutput(so, sopt); 494 break; 495 } 496 break; 497 498 case SOPT_SET: 499 switch (sopt->sopt_name) { 500 case IP_HDRINCL: 501 error = sooptcopyin(sopt, &optval, sizeof optval, 502 sizeof optval); 503 if (error) 504 break; 505 if (optval) 506 inp->inp_flags |= INP_HDRINCL; 507 else 508 inp->inp_flags &= ~INP_HDRINCL; 509 break; 510 511 case IP_FW_ADD: 512 case IP_FW_DEL: 513 case IP_FW_FLUSH: 514 case IP_FW_ZERO: 515 case IP_FW_RESETLOG: 516 case IP_FW_TABLE_ADD: 517 case IP_FW_TABLE_DEL: 518 case IP_FW_TABLE_FLUSH: 519 case IP_FW_NAT_CFG: 520 case IP_FW_NAT_DEL: 521 if (ip_fw_ctl_ptr != NULL) 522 error = ip_fw_ctl_ptr(sopt); 523 else 524 error = ENOPROTOOPT; 525 break; 526 527 case IP_DUMMYNET_CONFIGURE: 528 case IP_DUMMYNET_DEL: 529 case IP_DUMMYNET_FLUSH: 530 if (ip_dn_ctl_ptr != NULL) 531 error = ip_dn_ctl_ptr(sopt); 532 else 533 error = ENOPROTOOPT ; 534 break ; 535 536 case IP_RSVP_ON: 537 error = priv_check(curthread, PRIV_NETINET_MROUTE); 538 if (error != 0) 539 return (error); 540 error = ip_rsvp_init(so); 541 break; 542 543 case IP_RSVP_OFF: 544 error = priv_check(curthread, PRIV_NETINET_MROUTE); 545 if (error != 0) 546 return (error); 547 error = ip_rsvp_done(); 548 break; 549 550 case IP_RSVP_VIF_ON: 551 case IP_RSVP_VIF_OFF: 552 error = priv_check(curthread, PRIV_NETINET_MROUTE); 553 if (error != 0) 554 return (error); 555 error = ip_rsvp_vif ? 556 ip_rsvp_vif(so, sopt) : EINVAL; 557 break; 558 559 case MRT_INIT: 560 case MRT_DONE: 561 case MRT_ADD_VIF: 562 case MRT_DEL_VIF: 563 case MRT_ADD_MFC: 564 case MRT_DEL_MFC: 565 case MRT_VERSION: 566 case MRT_ASSERT: 567 case MRT_API_SUPPORT: 568 case MRT_API_CONFIG: 569 case MRT_ADD_BW_UPCALL: 570 case MRT_DEL_BW_UPCALL: 571 error = priv_check(curthread, PRIV_NETINET_MROUTE); 572 if (error != 0) 573 return (error); 574 error = ip_mrouter_set ? ip_mrouter_set(so, sopt) : 575 EOPNOTSUPP; 576 break; 577 578 default: 579 error = ip_ctloutput(so, sopt); 580 break; 581 } 582 break; 583 } 584 585 return (error); 586} 587 588/* 589 * This function exists solely to receive the PRC_IFDOWN messages which are 590 * sent by if_down(). It looks for an ifaddr whose ifa_addr is sa, and calls 591 * in_ifadown() to remove all routes corresponding to that address. It also 592 * receives the PRC_IFUP messages from if_up() and reinstalls the interface 593 * routes. 594 */ 595void 596rip_ctlinput(int cmd, struct sockaddr *sa, void *vip) 597{ 598 struct in_ifaddr *ia; 599 struct ifnet *ifp; 600 int err; 601 int flags; 602 603 switch (cmd) { 604 case PRC_IFDOWN: 605 TAILQ_FOREACH(ia, &in_ifaddrhead, ia_link) { 606 if (ia->ia_ifa.ifa_addr == sa 607 && (ia->ia_flags & IFA_ROUTE)) { 608 /* 609 * in_ifscrub kills the interface route. 610 */ 611 in_ifscrub(ia->ia_ifp, ia); 612 /* 613 * in_ifadown gets rid of all the rest of the 614 * routes. This is not quite the right thing 615 * to do, but at least if we are running a 616 * routing process they will come back. 617 */ 618 in_ifadown(&ia->ia_ifa, 0); 619 break; 620 } 621 } 622 break; 623 624 case PRC_IFUP: 625 TAILQ_FOREACH(ia, &in_ifaddrhead, ia_link) { 626 if (ia->ia_ifa.ifa_addr == sa) 627 break; 628 } 629 if (ia == 0 || (ia->ia_flags & IFA_ROUTE)) 630 return; 631 flags = RTF_UP; 632 ifp = ia->ia_ifa.ifa_ifp; 633 634 if ((ifp->if_flags & IFF_LOOPBACK) 635 || (ifp->if_flags & IFF_POINTOPOINT)) 636 flags |= RTF_HOST; 637 638 err = rtinit(&ia->ia_ifa, RTM_ADD, flags); 639 if (err == 0) 640 ia->ia_flags |= IFA_ROUTE; 641 break; 642 } 643} 644 645u_long rip_sendspace = 9216; 646u_long rip_recvspace = 9216; 647 648SYSCTL_ULONG(_net_inet_raw, OID_AUTO, maxdgram, CTLFLAG_RW, 649 &rip_sendspace, 0, "Maximum outgoing raw IP datagram size"); 650SYSCTL_ULONG(_net_inet_raw, OID_AUTO, recvspace, CTLFLAG_RW, 651 &rip_recvspace, 0, "Maximum space for incoming raw IP datagrams"); 652 653static int 654rip_attach(struct socket *so, int proto, struct thread *td) 655{ 656 struct inpcb *inp; 657 int error; 658 659 inp = sotoinpcb(so); 660 KASSERT(inp == NULL, ("rip_attach: inp != NULL")); 661 662 error = priv_check(td, PRIV_NETINET_RAW); 663 if (error) 664 return (error); 665 if (proto >= IPPROTO_MAX || proto < 0) 666 return EPROTONOSUPPORT; 667 error = soreserve(so, rip_sendspace, rip_recvspace); 668 if (error) 669 return (error); 670 INP_INFO_WLOCK(&ripcbinfo); 671 error = in_pcballoc(so, &ripcbinfo); 672 if (error) { 673 INP_INFO_WUNLOCK(&ripcbinfo); 674 return (error); 675 } 676 inp = (struct inpcb *)so->so_pcb; 677 inp->inp_vflag |= INP_IPV4; 678 inp->inp_ip_p = proto; 679 inp->inp_ip_ttl = ip_defttl; 680 rip_inshash(inp); 681 INP_INFO_WUNLOCK(&ripcbinfo); 682 INP_WUNLOCK(inp); 683 return (0); 684} 685 686static void 687rip_detach(struct socket *so) 688{ 689 struct inpcb *inp; 690 691 inp = sotoinpcb(so); 692 KASSERT(inp != NULL, ("rip_detach: inp == NULL")); 693 KASSERT(inp->inp_faddr.s_addr == INADDR_ANY, 694 ("rip_detach: not closed")); 695 696 INP_INFO_WLOCK(&ripcbinfo); 697 INP_WLOCK(inp); 698 rip_delhash(inp); 699 if (so == ip_mrouter && ip_mrouter_done) 700 ip_mrouter_done(); 701 if (ip_rsvp_force_done) 702 ip_rsvp_force_done(so); 703 if (so == ip_rsvpd) 704 ip_rsvp_done(); 705 in_pcbdetach(inp); 706 in_pcbfree(inp); 707 INP_INFO_WUNLOCK(&ripcbinfo); 708} 709 710static void 711rip_dodisconnect(struct socket *so, struct inpcb *inp) 712{ 713 INP_WLOCK_ASSERT(inp); 714 715 rip_delhash(inp); 716 inp->inp_faddr.s_addr = INADDR_ANY; 717 rip_inshash(inp); 718 SOCK_LOCK(so); 719 so->so_state &= ~SS_ISCONNECTED; 720 SOCK_UNLOCK(so); 721} 722 723static void 724rip_abort(struct socket *so) 725{ 726 struct inpcb *inp; 727 728 inp = sotoinpcb(so); 729 KASSERT(inp != NULL, ("rip_abort: inp == NULL")); 730 731 INP_INFO_WLOCK(&ripcbinfo); 732 INP_WLOCK(inp); 733 rip_dodisconnect(so, inp); 734 INP_WUNLOCK(inp); 735 INP_INFO_WUNLOCK(&ripcbinfo); 736} 737 738static void 739rip_close(struct socket *so) 740{ 741 struct inpcb *inp; 742 743 inp = sotoinpcb(so); 744 KASSERT(inp != NULL, ("rip_close: inp == NULL")); 745 746 INP_INFO_WLOCK(&ripcbinfo); 747 INP_WLOCK(inp); 748 rip_dodisconnect(so, inp); 749 INP_WUNLOCK(inp); 750 INP_INFO_WUNLOCK(&ripcbinfo); 751} 752 753static int 754rip_disconnect(struct socket *so) 755{ 756 struct inpcb *inp; 757 758 if ((so->so_state & SS_ISCONNECTED) == 0) 759 return (ENOTCONN); 760 761 inp = sotoinpcb(so); 762 KASSERT(inp != NULL, ("rip_disconnect: inp == NULL")); 763 764 INP_INFO_WLOCK(&ripcbinfo); 765 INP_WLOCK(inp); 766 rip_dodisconnect(so, inp); 767 INP_WUNLOCK(inp); 768 INP_INFO_WUNLOCK(&ripcbinfo); 769 return (0); 770} 771 772static int 773rip_bind(struct socket *so, struct sockaddr *nam, struct thread *td) 774{ 775 struct sockaddr_in *addr = (struct sockaddr_in *)nam; 776 struct inpcb *inp; 777 778 if (nam->sa_len != sizeof(*addr)) 779 return (EINVAL); 780 781 if (jailed(td->td_ucred)) { 782 if (addr->sin_addr.s_addr == INADDR_ANY) 783 addr->sin_addr.s_addr = 784 htonl(prison_getip(td->td_ucred)); 785 if (htonl(prison_getip(td->td_ucred)) != addr->sin_addr.s_addr) 786 return (EADDRNOTAVAIL); 787 } 788 789 if (TAILQ_EMPTY(&ifnet) || 790 (addr->sin_family != AF_INET && addr->sin_family != AF_IMPLINK) || 791 (addr->sin_addr.s_addr && 792 ifa_ifwithaddr((struct sockaddr *)addr) == 0)) 793 return (EADDRNOTAVAIL); 794 795 inp = sotoinpcb(so); 796 KASSERT(inp != NULL, ("rip_bind: inp == NULL")); 797 798 INP_INFO_WLOCK(&ripcbinfo); 799 INP_WLOCK(inp); 800 rip_delhash(inp); 801 inp->inp_laddr = addr->sin_addr; 802 rip_inshash(inp); 803 INP_WUNLOCK(inp); 804 INP_INFO_WUNLOCK(&ripcbinfo); 805 return (0); 806} 807 808static int 809rip_connect(struct socket *so, struct sockaddr *nam, struct thread *td) 810{ 811 struct sockaddr_in *addr = (struct sockaddr_in *)nam; 812 struct inpcb *inp; 813 814 if (nam->sa_len != sizeof(*addr)) 815 return (EINVAL); 816 if (TAILQ_EMPTY(&ifnet)) 817 return (EADDRNOTAVAIL); 818 if (addr->sin_family != AF_INET && addr->sin_family != AF_IMPLINK) 819 return (EAFNOSUPPORT); 820 821 inp = sotoinpcb(so); 822 KASSERT(inp != NULL, ("rip_connect: inp == NULL")); 823 824 INP_INFO_WLOCK(&ripcbinfo); 825 INP_WLOCK(inp); 826 rip_delhash(inp); 827 inp->inp_faddr = addr->sin_addr; 828 rip_inshash(inp); 829 soisconnected(so); 830 INP_WUNLOCK(inp); 831 INP_INFO_WUNLOCK(&ripcbinfo); 832 return (0); 833} 834 835static int 836rip_shutdown(struct socket *so) 837{ 838 struct inpcb *inp; 839 840 inp = sotoinpcb(so); 841 KASSERT(inp != NULL, ("rip_shutdown: inp == NULL")); 842 843 INP_WLOCK(inp); 844 socantsendmore(so); 845 INP_WUNLOCK(inp); 846 return (0); 847} 848 849static int 850rip_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, 851 struct mbuf *control, struct thread *td) 852{ 853 struct inpcb *inp; 854 u_long dst; 855 856 inp = sotoinpcb(so); 857 KASSERT(inp != NULL, ("rip_send: inp == NULL")); 858 859 /* 860 * Note: 'dst' reads below are unlocked. 861 */ 862 if (so->so_state & SS_ISCONNECTED) { 863 if (nam) { 864 m_freem(m); 865 return (EISCONN); 866 } 867 dst = inp->inp_faddr.s_addr; /* Unlocked read. */ 868 } else { 869 if (nam == NULL) { 870 m_freem(m); 871 return (ENOTCONN); 872 } 873 dst = ((struct sockaddr_in *)nam)->sin_addr.s_addr; 874 } 875 return (rip_output(m, so, dst)); 876} 877 878static int 879rip_pcblist(SYSCTL_HANDLER_ARGS) 880{ 881 int error, i, n; 882 struct inpcb *inp, **inp_list; 883 inp_gen_t gencnt; 884 struct xinpgen xig; 885 886 /* 887 * The process of preparing the TCB list is too time-consuming and 888 * resource-intensive to repeat twice on every request. 889 */ 890 if (req->oldptr == 0) { 891 n = ripcbinfo.ipi_count; 892 req->oldidx = 2 * (sizeof xig) 893 + (n + n/8) * sizeof(struct xinpcb); 894 return (0); 895 } 896 897 if (req->newptr != 0) 898 return (EPERM); 899 900 /* 901 * OK, now we're committed to doing something. 902 */ 903 INP_INFO_RLOCK(&ripcbinfo); 904 gencnt = ripcbinfo.ipi_gencnt; 905 n = ripcbinfo.ipi_count; 906 INP_INFO_RUNLOCK(&ripcbinfo); 907 908 xig.xig_len = sizeof xig; 909 xig.xig_count = n; 910 xig.xig_gen = gencnt; 911 xig.xig_sogen = so_gencnt; 912 error = SYSCTL_OUT(req, &xig, sizeof xig); 913 if (error) 914 return (error); 915 916 inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); 917 if (inp_list == 0) 918 return (ENOMEM); 919 920 INP_INFO_RLOCK(&ripcbinfo); 921 for (inp = LIST_FIRST(ripcbinfo.ipi_listhead), i = 0; inp && i < n; 922 inp = LIST_NEXT(inp, inp_list)) { 923 INP_RLOCK(inp); 924 if (inp->inp_gencnt <= gencnt && 925 cr_canseesocket(req->td->td_ucred, inp->inp_socket) == 0) { 926 /* XXX held references? */ 927 inp_list[i++] = inp; 928 } 929 INP_RUNLOCK(inp); 930 } 931 INP_INFO_RUNLOCK(&ripcbinfo); 932 n = i; 933 934 error = 0; 935 for (i = 0; i < n; i++) { 936 inp = inp_list[i]; 937 INP_RLOCK(inp); 938 if (inp->inp_gencnt <= gencnt) { 939 struct xinpcb xi; 940 bzero(&xi, sizeof(xi)); 941 xi.xi_len = sizeof xi; 942 /* XXX should avoid extra copy */ 943 bcopy(inp, &xi.xi_inp, sizeof *inp); 944 if (inp->inp_socket) 945 sotoxsocket(inp->inp_socket, &xi.xi_socket); 946 INP_RUNLOCK(inp); 947 error = SYSCTL_OUT(req, &xi, sizeof xi); 948 } else 949 INP_RUNLOCK(inp); 950 } 951 if (!error) { 952 /* 953 * Give the user an updated idea of our state. If the 954 * generation differs from what we told her before, she knows 955 * that something happened while we were processing this 956 * request, and it might be necessary to retry. 957 */ 958 INP_INFO_RLOCK(&ripcbinfo); 959 xig.xig_gen = ripcbinfo.ipi_gencnt; 960 xig.xig_sogen = so_gencnt; 961 xig.xig_count = ripcbinfo.ipi_count; 962 INP_INFO_RUNLOCK(&ripcbinfo); 963 error = SYSCTL_OUT(req, &xig, sizeof xig); 964 } 965 free(inp_list, M_TEMP); 966 return (error); 967} 968 969SYSCTL_PROC(_net_inet_raw, OID_AUTO/*XXX*/, pcblist, CTLFLAG_RD, 0, 0, 970 rip_pcblist, "S,xinpcb", "List of active raw IP sockets"); 971 972struct pr_usrreqs rip_usrreqs = { 973 .pru_abort = rip_abort, 974 .pru_attach = rip_attach, 975 .pru_bind = rip_bind, 976 .pru_connect = rip_connect, 977 .pru_control = in_control, 978 .pru_detach = rip_detach, 979 .pru_disconnect = rip_disconnect, 980 .pru_peeraddr = in_getpeeraddr, 981 .pru_send = rip_send, 982 .pru_shutdown = rip_shutdown, 983 .pru_sockaddr = in_getsockaddr, 984 .pru_sosetlabel = in_pcbsosetlabel, 985 .pru_close = rip_close, 986}; 987