uipc_usrreq.c revision 166883
1/*- 2 * Copyright (c) 1982, 1986, 1989, 1991, 1993 3 * The Regents of the University of California. 4 * Copyright (c) 2004-2007 Robert N. M. Watson 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 4. Neither the name of the University nor the names of its contributors 16 * may be used to endorse or promote products derived from this software 17 * without specific prior written permission. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 29 * SUCH DAMAGE. 30 * 31 * From: @(#)uipc_usrreq.c 8.3 (Berkeley) 1/4/94 32 */ 33 34/* 35 * UNIX Domain (Local) Sockets 36 * 37 * This is an implementation of UNIX (local) domain sockets. Each socket has 38 * an associated struct unpcb (UNIX protocol control block). Stream sockets 39 * may be connected to 0 or 1 other socket. Datagram sockets may be 40 * connected to 0, 1, or many other sockets. Sockets may be created and 41 * connected in pairs (socketpair(2)), or bound/connected to using the file 42 * system name space. For most purposes, only the receive socket buffer is 43 * used, as sending on one socket delivers directly to the receive socket 44 * buffer of a second socket. 45 * 46 * The implementation is substantially complicated by the fact that 47 * "ancillary data", such as file descriptors or credentials, may be passed 48 * across UNIX domain sockets. The potential for passing UNIX domain sockets 49 * over other UNIX domain sockets requires the implementation of a simple 50 * garbage collector to find and tear down cycles of disconnected sockets. 51 * 52 * TODO: 53 * SEQPACKET, RDM 54 * rethink name space problems 55 * need a proper out-of-band 56 * lock pushdown 57 */ 58 59#include <sys/cdefs.h> 60__FBSDID("$FreeBSD: head/sys/kern/uipc_usrreq.c 166883 2007-02-22 09:37:44Z rwatson $"); 61 62#include "opt_mac.h" 63 64#include <sys/param.h> 65#include <sys/domain.h> 66#include <sys/fcntl.h> 67#include <sys/malloc.h> /* XXX must be before <sys/file.h> */ 68#include <sys/eventhandler.h> 69#include <sys/file.h> 70#include <sys/filedesc.h> 71#include <sys/jail.h> 72#include <sys/kernel.h> 73#include <sys/lock.h> 74#include <sys/mbuf.h> 75#include <sys/mount.h> 76#include <sys/mutex.h> 77#include <sys/namei.h> 78#include <sys/proc.h> 79#include <sys/protosw.h> 80#include <sys/resourcevar.h> 81#include <sys/socket.h> 82#include <sys/socketvar.h> 83#include <sys/signalvar.h> 84#include <sys/stat.h> 85#include <sys/sx.h> 86#include <sys/sysctl.h> 87#include <sys/systm.h> 88#include <sys/taskqueue.h> 89#include <sys/un.h> 90#include <sys/unpcb.h> 91#include <sys/vnode.h> 92 93#include <security/mac/mac_framework.h> 94 95#include <vm/uma.h> 96 97static uma_zone_t unp_zone; 98static unp_gen_t unp_gencnt; 99static u_int unp_count; /* Count of local sockets. */ 100static ino_t unp_ino; /* Prototype for fake inode numbers. */ 101static int unp_rights; /* File descriptors in flight. */ 102static struct unp_head unp_shead; /* List of local stream sockets. */ 103static struct unp_head unp_dhead; /* List of local datagram sockets. */ 104 105static const struct sockaddr sun_noname = { sizeof(sun_noname), AF_LOCAL }; 106 107/* 108 * Garbage collection of cyclic file descriptor/socket references occurs 109 * asynchronously in a taskqueue context in order to avoid recursion and 110 * reentrance in the UNIX domain socket, file descriptor, and socket layer 111 * code. See unp_gc() for a full description. 112 */ 113static struct task unp_gc_task; 114 115/* 116 * Both send and receive buffers are allocated PIPSIZ bytes of buffering for 117 * stream sockets, although the total for sender and receiver is actually 118 * only PIPSIZ. 119 * 120 * Datagram sockets really use the sendspace as the maximum datagram size, 121 * and don't really want to reserve the sendspace. Their recvspace should be 122 * large enough for at least one max-size datagram plus address. 123 */ 124#ifndef PIPSIZ 125#define PIPSIZ 8192 126#endif 127static u_long unpst_sendspace = PIPSIZ; 128static u_long unpst_recvspace = PIPSIZ; 129static u_long unpdg_sendspace = 2*1024; /* really max datagram size */ 130static u_long unpdg_recvspace = 4*1024; 131 132SYSCTL_NODE(_net, PF_LOCAL, local, CTLFLAG_RW, 0, "Local domain"); 133SYSCTL_NODE(_net_local, SOCK_STREAM, stream, CTLFLAG_RW, 0, "SOCK_STREAM"); 134SYSCTL_NODE(_net_local, SOCK_DGRAM, dgram, CTLFLAG_RW, 0, "SOCK_DGRAM"); 135 136SYSCTL_ULONG(_net_local_stream, OID_AUTO, sendspace, CTLFLAG_RW, 137 &unpst_sendspace, 0, ""); 138SYSCTL_ULONG(_net_local_stream, OID_AUTO, recvspace, CTLFLAG_RW, 139 &unpst_recvspace, 0, ""); 140SYSCTL_ULONG(_net_local_dgram, OID_AUTO, maxdgram, CTLFLAG_RW, 141 &unpdg_sendspace, 0, ""); 142SYSCTL_ULONG(_net_local_dgram, OID_AUTO, recvspace, CTLFLAG_RW, 143 &unpdg_recvspace, 0, ""); 144SYSCTL_INT(_net_local, OID_AUTO, inflight, CTLFLAG_RD, &unp_rights, 0, ""); 145 146/* 147 * Currently, UNIX domain sockets are protected by a single subsystem lock, 148 * which covers global data structures and variables, the contents of each 149 * per-socket unpcb structure, and the so_pcb field in sockets attached to 150 * the UNIX domain. This provides for a moderate degree of paralellism, as 151 * receive operations on UNIX domain sockets do not need to acquire the 152 * subsystem lock. Finer grained locking to permit send() without acquiring 153 * a global lock would be a logical next step. 154 * 155 * The UNIX domain socket lock preceds all socket layer locks, including the 156 * socket lock and socket buffer lock, permitting UNIX domain socket code to 157 * call into socket support routines without releasing its locks. 158 * 159 * Some caution is required in areas where the UNIX domain socket code enters 160 * VFS in order to create or find rendezvous points. This results in 161 * dropping of the UNIX domain socket subsystem lock, acquisition of the 162 * Giant lock, and potential sleeping. This increases the chances of races, 163 * and exposes weaknesses in the socket->protocol API by offering poor 164 * failure modes. 165 */ 166static struct mtx unp_mtx; 167#define UNP_LOCK_INIT() \ 168 mtx_init(&unp_mtx, "unp", NULL, MTX_DEF | MTX_RECURSE) 169#define UNP_LOCK() mtx_lock(&unp_mtx) 170#define UNP_UNLOCK() mtx_unlock(&unp_mtx) 171#define UNP_LOCK_ASSERT() mtx_assert(&unp_mtx, MA_OWNED) 172#define UNP_UNLOCK_ASSERT() mtx_assert(&unp_mtx, MA_NOTOWNED) 173 174static int unp_connect(struct socket *, struct sockaddr *, 175 struct thread *); 176static int unp_connect2(struct socket *so, struct socket *so2, int); 177static void unp_disconnect(struct unpcb *); 178static void unp_shutdown(struct unpcb *); 179static void unp_drop(struct unpcb *, int); 180static void unp_gc(__unused void *, int); 181static void unp_scan(struct mbuf *, void (*)(struct file *)); 182static void unp_mark(struct file *); 183static void unp_discard(struct file *); 184static void unp_freerights(struct file **, int); 185static int unp_internalize(struct mbuf **, struct thread *); 186static int unp_listen(struct socket *, struct unpcb *, int, 187 struct thread *); 188static struct mbuf *unp_addsockcred(struct thread *, struct mbuf *); 189 190/* 191 * Definitions of protocols supported in the LOCAL domain. 192 */ 193static struct domain localdomain; 194static struct protosw localsw[] = { 195{ 196 .pr_type = SOCK_STREAM, 197 .pr_domain = &localdomain, 198 .pr_flags = PR_CONNREQUIRED|PR_WANTRCVD|PR_RIGHTS, 199 .pr_ctloutput = &uipc_ctloutput, 200 .pr_usrreqs = &uipc_usrreqs 201}, 202{ 203 .pr_type = SOCK_DGRAM, 204 .pr_domain = &localdomain, 205 .pr_flags = PR_ATOMIC|PR_ADDR|PR_RIGHTS, 206 .pr_usrreqs = &uipc_usrreqs 207}, 208}; 209 210static struct domain localdomain = { 211 .dom_family = AF_LOCAL, 212 .dom_name = "local", 213 .dom_init = unp_init, 214 .dom_externalize = unp_externalize, 215 .dom_dispose = unp_dispose, 216 .dom_protosw = localsw, 217 .dom_protoswNPROTOSW = &localsw[sizeof(localsw)/sizeof(localsw[0])] 218}; 219DOMAIN_SET(local); 220 221static void 222uipc_abort(struct socket *so) 223{ 224 struct unpcb *unp; 225 226 unp = sotounpcb(so); 227 KASSERT(unp != NULL, ("uipc_abort: unp == NULL")); 228 UNP_LOCK(); 229 unp_drop(unp, ECONNABORTED); 230 UNP_UNLOCK(); 231} 232 233static int 234uipc_accept(struct socket *so, struct sockaddr **nam) 235{ 236 struct unpcb *unp; 237 const struct sockaddr *sa; 238 239 /* 240 * Pass back name of connected socket, if it was bound and we are 241 * still connected (our peer may have closed already!). 242 */ 243 unp = sotounpcb(so); 244 KASSERT(unp != NULL, ("uipc_accept: unp == NULL")); 245 *nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK); 246 UNP_LOCK(); 247 if (unp->unp_conn != NULL && unp->unp_conn->unp_addr != NULL) 248 sa = (struct sockaddr *) unp->unp_conn->unp_addr; 249 else 250 sa = &sun_noname; 251 bcopy(sa, *nam, sa->sa_len); 252 UNP_UNLOCK(); 253 return (0); 254} 255 256static int 257uipc_attach(struct socket *so, int proto, struct thread *td) 258{ 259 struct unpcb *unp; 260 int error; 261 262 KASSERT(so->so_pcb == NULL, ("uipc_attach: so_pcb != NULL")); 263 if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) { 264 switch (so->so_type) { 265 case SOCK_STREAM: 266 error = soreserve(so, unpst_sendspace, unpst_recvspace); 267 break; 268 269 case SOCK_DGRAM: 270 error = soreserve(so, unpdg_sendspace, unpdg_recvspace); 271 break; 272 273 default: 274 panic("unp_attach"); 275 } 276 if (error) 277 return (error); 278 } 279 unp = uma_zalloc(unp_zone, M_NOWAIT | M_ZERO); 280 if (unp == NULL) 281 return (ENOBUFS); 282 LIST_INIT(&unp->unp_refs); 283 unp->unp_socket = so; 284 so->so_pcb = unp; 285 286 unp->unp_refcount = 1; 287 UNP_LOCK(); 288 unp->unp_gencnt = ++unp_gencnt; 289 unp_count++; 290 LIST_INSERT_HEAD(so->so_type == SOCK_DGRAM ? &unp_dhead : &unp_shead, 291 unp, unp_link); 292 UNP_UNLOCK(); 293 294 return (0); 295} 296 297static int 298uipc_bind(struct socket *so, struct sockaddr *nam, struct thread *td) 299{ 300 struct sockaddr_un *soun = (struct sockaddr_un *)nam; 301 struct vattr vattr; 302 int error, namelen; 303 struct nameidata nd; 304 struct unpcb *unp; 305 struct vnode *vp; 306 struct mount *mp; 307 char *buf; 308 309 unp = sotounpcb(so); 310 KASSERT(unp != NULL, ("uipc_bind: unp == NULL")); 311 312 namelen = soun->sun_len - offsetof(struct sockaddr_un, sun_path); 313 if (namelen <= 0) 314 return (EINVAL); 315 316 /* 317 * We don't allow simultaneous bind() calls on a single UNIX domain 318 * socket, so flag in-progress operations, and return an error if an 319 * operation is already in progress. 320 * 321 * Historically, we have not allowed a socket to be rebound, so this 322 * also returns an error. Not allowing re-binding certainly 323 * simplifies the implementation and avoids a great many possible 324 * failure modes. 325 */ 326 UNP_LOCK(); 327 if (unp->unp_vnode != NULL) { 328 UNP_UNLOCK(); 329 return (EINVAL); 330 } 331 if (unp->unp_flags & UNP_BINDING) { 332 UNP_UNLOCK(); 333 return (EALREADY); 334 } 335 unp->unp_flags |= UNP_BINDING; 336 UNP_UNLOCK(); 337 338 buf = malloc(namelen + 1, M_TEMP, M_WAITOK); 339 strlcpy(buf, soun->sun_path, namelen + 1); 340 341 mtx_lock(&Giant); 342restart: 343 mtx_assert(&Giant, MA_OWNED); 344 NDINIT(&nd, CREATE, NOFOLLOW | LOCKPARENT | SAVENAME, UIO_SYSSPACE, 345 buf, td); 346/* SHOULD BE ABLE TO ADOPT EXISTING AND wakeup() ALA FIFO's */ 347 error = namei(&nd); 348 if (error) 349 goto error; 350 vp = nd.ni_vp; 351 if (vp != NULL || vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { 352 NDFREE(&nd, NDF_ONLY_PNBUF); 353 if (nd.ni_dvp == vp) 354 vrele(nd.ni_dvp); 355 else 356 vput(nd.ni_dvp); 357 if (vp != NULL) { 358 vrele(vp); 359 error = EADDRINUSE; 360 goto error; 361 } 362 error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH); 363 if (error) 364 goto error; 365 goto restart; 366 } 367 VATTR_NULL(&vattr); 368 vattr.va_type = VSOCK; 369 vattr.va_mode = (ACCESSPERMS & ~td->td_proc->p_fd->fd_cmask); 370#ifdef MAC 371 error = mac_check_vnode_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd, 372 &vattr); 373#endif 374 if (error == 0) { 375 VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE); 376 error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr); 377 } 378 NDFREE(&nd, NDF_ONLY_PNBUF); 379 vput(nd.ni_dvp); 380 if (error) { 381 vn_finished_write(mp); 382 goto error; 383 } 384 vp = nd.ni_vp; 385 ASSERT_VOP_LOCKED(vp, "uipc_bind"); 386 soun = (struct sockaddr_un *)sodupsockaddr(nam, M_WAITOK); 387 UNP_LOCK(); 388 vp->v_socket = unp->unp_socket; 389 unp->unp_vnode = vp; 390 unp->unp_addr = soun; 391 unp->unp_flags &= ~UNP_BINDING; 392 UNP_UNLOCK(); 393 VOP_UNLOCK(vp, 0, td); 394 vn_finished_write(mp); 395 mtx_unlock(&Giant); 396 free(buf, M_TEMP); 397 return (0); 398error: 399 UNP_LOCK(); 400 unp->unp_flags &= ~UNP_BINDING; 401 UNP_UNLOCK(); 402 mtx_unlock(&Giant); 403 free(buf, M_TEMP); 404 return (error); 405} 406 407static int 408uipc_connect(struct socket *so, struct sockaddr *nam, struct thread *td) 409{ 410 int error; 411 412 KASSERT(td == curthread, ("uipc_connect: td != curthread")); 413 UNP_LOCK(); 414 error = unp_connect(so, nam, td); 415 UNP_UNLOCK(); 416 return (error); 417} 418 419/* 420 * XXXRW: Should also unbind? 421 */ 422static void 423uipc_close(struct socket *so) 424{ 425 struct unpcb *unp; 426 427 unp = sotounpcb(so); 428 KASSERT(unp != NULL, ("uipc_close: unp == NULL")); 429 UNP_LOCK(); 430 unp_disconnect(unp); 431 UNP_UNLOCK(); 432} 433 434int 435uipc_connect2(struct socket *so1, struct socket *so2) 436{ 437 struct unpcb *unp; 438 int error; 439 440 unp = sotounpcb(so1); 441 KASSERT(unp != NULL, ("uipc_connect2: unp == NULL")); 442 UNP_LOCK(); 443 error = unp_connect2(so1, so2, PRU_CONNECT2); 444 UNP_UNLOCK(); 445 return (error); 446} 447 448/* control is EOPNOTSUPP */ 449 450static void 451uipc_detach(struct socket *so) 452{ 453 struct sockaddr_un *saved_unp_addr; 454 struct unpcb *unp; 455 struct vnode *vp; 456 int freeunp, local_unp_rights; 457 458 unp = sotounpcb(so); 459 KASSERT(unp != NULL, ("uipc_detach: unp == NULL")); 460 UNP_LOCK(); 461 LIST_REMOVE(unp, unp_link); 462 unp->unp_gencnt = ++unp_gencnt; 463 --unp_count; 464 if ((vp = unp->unp_vnode) != NULL) { 465 unp->unp_vnode->v_socket = NULL; 466 unp->unp_vnode = NULL; 467 } 468 if (unp->unp_conn != NULL) 469 unp_disconnect(unp); 470 while (!LIST_EMPTY(&unp->unp_refs)) { 471 struct unpcb *ref = LIST_FIRST(&unp->unp_refs); 472 unp_drop(ref, ECONNRESET); 473 } 474 unp->unp_socket->so_pcb = NULL; 475 local_unp_rights = unp_rights; 476 saved_unp_addr = unp->unp_addr; 477 unp->unp_addr = NULL; 478 unp->unp_refcount--; 479 freeunp = (unp->unp_refcount == 0); 480 UNP_UNLOCK(); 481 if (saved_unp_addr != NULL) 482 FREE(saved_unp_addr, M_SONAME); 483 if (freeunp) 484 uma_zfree(unp_zone, unp); 485 if (vp) { 486 int vfslocked; 487 488 vfslocked = VFS_LOCK_GIANT(vp->v_mount); 489 vrele(vp); 490 VFS_UNLOCK_GIANT(vfslocked); 491 } 492 if (local_unp_rights) 493 taskqueue_enqueue(taskqueue_thread, &unp_gc_task); 494} 495 496static int 497uipc_disconnect(struct socket *so) 498{ 499 struct unpcb *unp; 500 501 unp = sotounpcb(so); 502 KASSERT(unp != NULL, ("uipc_disconnect: unp == NULL")); 503 UNP_LOCK(); 504 unp_disconnect(unp); 505 UNP_UNLOCK(); 506 return (0); 507} 508 509static int 510uipc_listen(struct socket *so, int backlog, struct thread *td) 511{ 512 struct unpcb *unp; 513 int error; 514 515 unp = sotounpcb(so); 516 KASSERT(unp != NULL, ("uipc_listen: unp == NULL")); 517 UNP_LOCK(); 518 if (unp->unp_vnode == NULL) { 519 UNP_UNLOCK(); 520 return (EINVAL); 521 } 522 error = unp_listen(so, unp, backlog, td); 523 UNP_UNLOCK(); 524 return (error); 525} 526 527static int 528uipc_peeraddr(struct socket *so, struct sockaddr **nam) 529{ 530 struct unpcb *unp; 531 const struct sockaddr *sa; 532 533 unp = sotounpcb(so); 534 KASSERT(unp != NULL, ("uipc_peeraddr: unp == NULL")); 535 *nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK); 536 UNP_LOCK(); 537 if (unp->unp_conn != NULL && unp->unp_conn->unp_addr!= NULL) 538 sa = (struct sockaddr *) unp->unp_conn->unp_addr; 539 else { 540 /* 541 * XXX: It seems that this test always fails even when 542 * connection is established. So, this else clause is 543 * added as workaround to return PF_LOCAL sockaddr. 544 */ 545 sa = &sun_noname; 546 } 547 bcopy(sa, *nam, sa->sa_len); 548 UNP_UNLOCK(); 549 return (0); 550} 551 552static int 553uipc_rcvd(struct socket *so, int flags) 554{ 555 struct unpcb *unp; 556 struct socket *so2; 557 u_int mbcnt, sbcc; 558 u_long newhiwat; 559 560 unp = sotounpcb(so); 561 KASSERT(unp != NULL, ("uipc_rcvd: unp == NULL")); 562 switch (so->so_type) { 563 case SOCK_DGRAM: 564 panic("uipc_rcvd DGRAM?"); 565 /*NOTREACHED*/ 566 567 case SOCK_STREAM: 568 /* 569 * Adjust backpressure on sender and wakeup any waiting to 570 * write. 571 */ 572 SOCKBUF_LOCK(&so->so_rcv); 573 mbcnt = so->so_rcv.sb_mbcnt; 574 sbcc = so->so_rcv.sb_cc; 575 SOCKBUF_UNLOCK(&so->so_rcv); 576 UNP_LOCK(); 577 if (unp->unp_conn == NULL) { 578 UNP_UNLOCK(); 579 break; 580 } 581 so2 = unp->unp_conn->unp_socket; 582 SOCKBUF_LOCK(&so2->so_snd); 583 so2->so_snd.sb_mbmax += unp->unp_mbcnt - mbcnt; 584 newhiwat = so2->so_snd.sb_hiwat + unp->unp_cc - sbcc; 585 (void)chgsbsize(so2->so_cred->cr_uidinfo, &so2->so_snd.sb_hiwat, 586 newhiwat, RLIM_INFINITY); 587 sowwakeup_locked(so2); 588 unp->unp_mbcnt = mbcnt; 589 unp->unp_cc = sbcc; 590 UNP_UNLOCK(); 591 break; 592 593 default: 594 panic("uipc_rcvd unknown socktype"); 595 } 596 return (0); 597} 598 599/* pru_rcvoob is EOPNOTSUPP */ 600 601static int 602uipc_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, 603 struct mbuf *control, struct thread *td) 604{ 605 struct unpcb *unp, *unp2; 606 struct socket *so2; 607 u_int mbcnt, sbcc; 608 u_long newhiwat; 609 int error = 0; 610 611 unp = sotounpcb(so); 612 KASSERT(unp != NULL, ("uipc_send: unp == NULL")); 613 if (flags & PRUS_OOB) { 614 error = EOPNOTSUPP; 615 goto release; 616 } 617 618 if (control != NULL && (error = unp_internalize(&control, td))) 619 goto release; 620 621 UNP_LOCK(); 622 switch (so->so_type) { 623 case SOCK_DGRAM: 624 { 625 const struct sockaddr *from; 626 627 if (nam != NULL) { 628 if (unp->unp_conn != NULL) { 629 error = EISCONN; 630 break; 631 } 632 error = unp_connect(so, nam, td); 633 if (error) 634 break; 635 } 636 /* 637 * Because connect() and send() are non-atomic in a sendto() 638 * with a target address, it's possible that the socket will 639 * have disconnected before the send() can run. In that case 640 * return the slightly counter-intuitive but otherwise 641 * correct error that the socket is not connected. 642 */ 643 unp2 = unp->unp_conn; 644 if (unp2 == NULL) { 645 error = ENOTCONN; 646 break; 647 } 648 so2 = unp2->unp_socket; 649 if (unp->unp_addr != NULL) 650 from = (struct sockaddr *)unp->unp_addr; 651 else 652 from = &sun_noname; 653 if (unp2->unp_flags & UNP_WANTCRED) 654 control = unp_addsockcred(td, control); 655 SOCKBUF_LOCK(&so2->so_rcv); 656 if (sbappendaddr_locked(&so2->so_rcv, from, m, control)) { 657 sorwakeup_locked(so2); 658 m = NULL; 659 control = NULL; 660 } else { 661 SOCKBUF_UNLOCK(&so2->so_rcv); 662 error = ENOBUFS; 663 } 664 if (nam != NULL) 665 unp_disconnect(unp); 666 break; 667 } 668 669 case SOCK_STREAM: 670 /* 671 * Connect if not connected yet. 672 * 673 * Note: A better implementation would complain if not equal 674 * to the peer's address. 675 */ 676 if ((so->so_state & SS_ISCONNECTED) == 0) { 677 if (nam != NULL) { 678 error = unp_connect(so, nam, td); 679 if (error) 680 break; /* XXX */ 681 } else { 682 error = ENOTCONN; 683 break; 684 } 685 } 686 687 /* Lockless read. */ 688 if (so->so_snd.sb_state & SBS_CANTSENDMORE) { 689 error = EPIPE; 690 break; 691 } 692 /* 693 * Because connect() and send() are non-atomic in a sendto() 694 * with a target address, it's possible that the socket will 695 * have disconnected before the send() can run. In that case 696 * return the slightly counter-intuitive but otherwise 697 * correct error that the socket is not connected. 698 */ 699 unp2 = unp->unp_conn; 700 if (unp2 == NULL) { 701 error = ENOTCONN; 702 break; 703 } 704 so2 = unp2->unp_socket; 705 SOCKBUF_LOCK(&so2->so_rcv); 706 if (unp2->unp_flags & UNP_WANTCRED) { 707 /* 708 * Credentials are passed only once on 709 * SOCK_STREAM. 710 */ 711 unp2->unp_flags &= ~UNP_WANTCRED; 712 control = unp_addsockcred(td, control); 713 } 714 /* 715 * Send to paired receive port, and then reduce send buffer 716 * hiwater marks to maintain backpressure. Wake up readers. 717 */ 718 if (control != NULL) { 719 if (sbappendcontrol_locked(&so2->so_rcv, m, control)) 720 control = NULL; 721 } else { 722 sbappend_locked(&so2->so_rcv, m); 723 } 724 mbcnt = so2->so_rcv.sb_mbcnt - unp2->unp_mbcnt; 725 unp2->unp_mbcnt = so2->so_rcv.sb_mbcnt; 726 sbcc = so2->so_rcv.sb_cc; 727 sorwakeup_locked(so2); 728 729 SOCKBUF_LOCK(&so->so_snd); 730 newhiwat = so->so_snd.sb_hiwat - (sbcc - unp2->unp_cc); 731 (void)chgsbsize(so->so_cred->cr_uidinfo, &so->so_snd.sb_hiwat, 732 newhiwat, RLIM_INFINITY); 733 so->so_snd.sb_mbmax -= mbcnt; 734 SOCKBUF_UNLOCK(&so->so_snd); 735 736 unp2->unp_cc = sbcc; 737 m = NULL; 738 break; 739 740 default: 741 panic("uipc_send unknown socktype"); 742 } 743 744 /* 745 * SEND_EOF is equivalent to a SEND followed by 746 * a SHUTDOWN. 747 */ 748 if (flags & PRUS_EOF) { 749 socantsendmore(so); 750 unp_shutdown(unp); 751 } 752 UNP_UNLOCK(); 753 754 if (control != NULL && error != 0) 755 unp_dispose(control); 756 757release: 758 if (control != NULL) 759 m_freem(control); 760 if (m != NULL) 761 m_freem(m); 762 return (error); 763} 764 765static int 766uipc_sense(struct socket *so, struct stat *sb) 767{ 768 struct unpcb *unp; 769 struct socket *so2; 770 771 unp = sotounpcb(so); 772 KASSERT(unp != NULL, ("uipc_sense: unp == NULL")); 773 UNP_LOCK(); 774 sb->st_blksize = so->so_snd.sb_hiwat; 775 if (so->so_type == SOCK_STREAM && unp->unp_conn != NULL) { 776 so2 = unp->unp_conn->unp_socket; 777 sb->st_blksize += so2->so_rcv.sb_cc; 778 } 779 sb->st_dev = NODEV; 780 if (unp->unp_ino == 0) 781 unp->unp_ino = (++unp_ino == 0) ? ++unp_ino : unp_ino; 782 sb->st_ino = unp->unp_ino; 783 UNP_UNLOCK(); 784 return (0); 785} 786 787static int 788uipc_shutdown(struct socket *so) 789{ 790 struct unpcb *unp; 791 792 unp = sotounpcb(so); 793 KASSERT(unp != NULL, ("uipc_shutdown: unp == NULL")); 794 UNP_LOCK(); 795 socantsendmore(so); 796 unp_shutdown(unp); 797 UNP_UNLOCK(); 798 return (0); 799} 800 801static int 802uipc_sockaddr(struct socket *so, struct sockaddr **nam) 803{ 804 struct unpcb *unp; 805 const struct sockaddr *sa; 806 807 unp = sotounpcb(so); 808 KASSERT(unp != NULL, ("uipc_sockaddr: unp == NULL")); 809 *nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK); 810 UNP_LOCK(); 811 if (unp->unp_addr != NULL) 812 sa = (struct sockaddr *) unp->unp_addr; 813 else 814 sa = &sun_noname; 815 bcopy(sa, *nam, sa->sa_len); 816 UNP_UNLOCK(); 817 return (0); 818} 819 820struct pr_usrreqs uipc_usrreqs = { 821 .pru_abort = uipc_abort, 822 .pru_accept = uipc_accept, 823 .pru_attach = uipc_attach, 824 .pru_bind = uipc_bind, 825 .pru_connect = uipc_connect, 826 .pru_connect2 = uipc_connect2, 827 .pru_detach = uipc_detach, 828 .pru_disconnect = uipc_disconnect, 829 .pru_listen = uipc_listen, 830 .pru_peeraddr = uipc_peeraddr, 831 .pru_rcvd = uipc_rcvd, 832 .pru_send = uipc_send, 833 .pru_sense = uipc_sense, 834 .pru_shutdown = uipc_shutdown, 835 .pru_sockaddr = uipc_sockaddr, 836 .pru_close = uipc_close, 837}; 838 839int 840uipc_ctloutput(struct socket *so, struct sockopt *sopt) 841{ 842 struct unpcb *unp; 843 struct xucred xu; 844 int error, optval; 845 846 if (sopt->sopt_level != 0) 847 return (EINVAL); 848 849 unp = sotounpcb(so); 850 KASSERT(unp != NULL, ("uipc_ctloutput: unp == NULL")); 851 error = 0; 852 switch (sopt->sopt_dir) { 853 case SOPT_GET: 854 switch (sopt->sopt_name) { 855 case LOCAL_PEERCRED: 856 UNP_LOCK(); 857 if (unp->unp_flags & UNP_HAVEPC) 858 xu = unp->unp_peercred; 859 else { 860 if (so->so_type == SOCK_STREAM) 861 error = ENOTCONN; 862 else 863 error = EINVAL; 864 } 865 UNP_UNLOCK(); 866 if (error == 0) 867 error = sooptcopyout(sopt, &xu, sizeof(xu)); 868 break; 869 case LOCAL_CREDS: 870 /* Unocked read. */ 871 optval = unp->unp_flags & UNP_WANTCRED ? 1 : 0; 872 error = sooptcopyout(sopt, &optval, sizeof(optval)); 873 break; 874 case LOCAL_CONNWAIT: 875 /* Unocked read. */ 876 optval = unp->unp_flags & UNP_CONNWAIT ? 1 : 0; 877 error = sooptcopyout(sopt, &optval, sizeof(optval)); 878 break; 879 default: 880 error = EOPNOTSUPP; 881 break; 882 } 883 break; 884 case SOPT_SET: 885 switch (sopt->sopt_name) { 886 case LOCAL_CREDS: 887 case LOCAL_CONNWAIT: 888 error = sooptcopyin(sopt, &optval, sizeof(optval), 889 sizeof(optval)); 890 if (error) 891 break; 892 893#define OPTSET(bit) \ 894 if (optval) \ 895 unp->unp_flags |= bit; \ 896 else \ 897 unp->unp_flags &= ~bit; 898 899 UNP_LOCK(); 900 switch (sopt->sopt_name) { 901 case LOCAL_CREDS: 902 OPTSET(UNP_WANTCRED); 903 break; 904 case LOCAL_CONNWAIT: 905 OPTSET(UNP_CONNWAIT); 906 break; 907 default: 908 break; 909 } 910 UNP_UNLOCK(); 911 break; 912#undef OPTSET 913 default: 914 error = ENOPROTOOPT; 915 break; 916 } 917 break; 918 default: 919 error = EOPNOTSUPP; 920 break; 921 } 922 return (error); 923} 924 925static int 926unp_connect(struct socket *so, struct sockaddr *nam, struct thread *td) 927{ 928 struct sockaddr_un *soun = (struct sockaddr_un *)nam; 929 struct vnode *vp; 930 struct socket *so2, *so3; 931 struct unpcb *unp, *unp2, *unp3; 932 int error, len; 933 struct nameidata nd; 934 char buf[SOCK_MAXADDRLEN]; 935 struct sockaddr *sa; 936 937 UNP_LOCK_ASSERT(); 938 939 unp = sotounpcb(so); 940 KASSERT(unp != NULL, ("unp_connect: unp == NULL")); 941 len = nam->sa_len - offsetof(struct sockaddr_un, sun_path); 942 if (len <= 0) 943 return (EINVAL); 944 strlcpy(buf, soun->sun_path, len + 1); 945 if (unp->unp_flags & UNP_CONNECTING) { 946 UNP_UNLOCK(); 947 return (EALREADY); 948 } 949 unp->unp_flags |= UNP_CONNECTING; 950 UNP_UNLOCK(); 951 sa = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK); 952 mtx_lock(&Giant); 953 NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, buf, td); 954 error = namei(&nd); 955 if (error) 956 vp = NULL; 957 else 958 vp = nd.ni_vp; 959 ASSERT_VOP_LOCKED(vp, "unp_connect"); 960 NDFREE(&nd, NDF_ONLY_PNBUF); 961 if (error) 962 goto bad; 963 964 if (vp->v_type != VSOCK) { 965 error = ENOTSOCK; 966 goto bad; 967 } 968#ifdef MAC 969 error = mac_check_vnode_open(td->td_ucred, vp, VWRITE | VREAD); 970 if (error) 971 goto bad; 972#endif 973 error = VOP_ACCESS(vp, VWRITE, td->td_ucred, td); 974 if (error) 975 goto bad; 976 mtx_unlock(&Giant); 977 UNP_LOCK(); 978 unp = sotounpcb(so); 979 KASSERT(unp != NULL, ("unp_connect: unp == NULL")); 980 so2 = vp->v_socket; 981 if (so2 == NULL) { 982 error = ECONNREFUSED; 983 goto bad2; 984 } 985 if (so->so_type != so2->so_type) { 986 error = EPROTOTYPE; 987 goto bad2; 988 } 989 if (so->so_proto->pr_flags & PR_CONNREQUIRED) { 990 if (so2->so_options & SO_ACCEPTCONN) 991 so3 = sonewconn(so2, 0); 992 else 993 so3 = NULL; 994 if (so3 == NULL) { 995 error = ECONNREFUSED; 996 goto bad2; 997 } 998 unp = sotounpcb(so); 999 unp2 = sotounpcb(so2); 1000 unp3 = sotounpcb(so3); 1001 if (unp2->unp_addr != NULL) { 1002 bcopy(unp2->unp_addr, sa, unp2->unp_addr->sun_len); 1003 unp3->unp_addr = (struct sockaddr_un *) sa; 1004 sa = NULL; 1005 } 1006 /* 1007 * unp_peercred management: 1008 * 1009 * The connecter's (client's) credentials are copied from its 1010 * process structure at the time of connect() (which is now). 1011 */ 1012 cru2x(td->td_ucred, &unp3->unp_peercred); 1013 unp3->unp_flags |= UNP_HAVEPC; 1014 /* 1015 * The receiver's (server's) credentials are copied from the 1016 * unp_peercred member of socket on which the former called 1017 * listen(); unp_listen() cached that process's credentials 1018 * at that time so we can use them now. 1019 */ 1020 KASSERT(unp2->unp_flags & UNP_HAVEPCCACHED, 1021 ("unp_connect: listener without cached peercred")); 1022 memcpy(&unp->unp_peercred, &unp2->unp_peercred, 1023 sizeof(unp->unp_peercred)); 1024 unp->unp_flags |= UNP_HAVEPC; 1025 if (unp2->unp_flags & UNP_WANTCRED) 1026 unp3->unp_flags |= UNP_WANTCRED; 1027#ifdef MAC 1028 SOCK_LOCK(so); 1029 mac_set_socket_peer_from_socket(so, so3); 1030 mac_set_socket_peer_from_socket(so3, so); 1031 SOCK_UNLOCK(so); 1032#endif 1033 1034 so2 = so3; 1035 } 1036 error = unp_connect2(so, so2, PRU_CONNECT); 1037bad2: 1038 UNP_UNLOCK(); 1039 mtx_lock(&Giant); 1040bad: 1041 mtx_assert(&Giant, MA_OWNED); 1042 if (vp != NULL) 1043 vput(vp); 1044 mtx_unlock(&Giant); 1045 free(sa, M_SONAME); 1046 UNP_LOCK(); 1047 unp->unp_flags &= ~UNP_CONNECTING; 1048 return (error); 1049} 1050 1051static int 1052unp_connect2(struct socket *so, struct socket *so2, int req) 1053{ 1054 struct unpcb *unp = sotounpcb(so); 1055 struct unpcb *unp2; 1056 1057 UNP_LOCK_ASSERT(); 1058 1059 if (so2->so_type != so->so_type) 1060 return (EPROTOTYPE); 1061 unp2 = sotounpcb(so2); 1062 KASSERT(unp2 != NULL, ("unp_connect2: unp2 == NULL")); 1063 unp->unp_conn = unp2; 1064 switch (so->so_type) { 1065 case SOCK_DGRAM: 1066 LIST_INSERT_HEAD(&unp2->unp_refs, unp, unp_reflink); 1067 soisconnected(so); 1068 break; 1069 1070 case SOCK_STREAM: 1071 unp2->unp_conn = unp; 1072 if (req == PRU_CONNECT && 1073 ((unp->unp_flags | unp2->unp_flags) & UNP_CONNWAIT)) 1074 soisconnecting(so); 1075 else 1076 soisconnected(so); 1077 soisconnected(so2); 1078 break; 1079 1080 default: 1081 panic("unp_connect2"); 1082 } 1083 return (0); 1084} 1085 1086static void 1087unp_disconnect(struct unpcb *unp) 1088{ 1089 struct unpcb *unp2 = unp->unp_conn; 1090 struct socket *so; 1091 1092 UNP_LOCK_ASSERT(); 1093 1094 if (unp2 == NULL) 1095 return; 1096 unp->unp_conn = NULL; 1097 switch (unp->unp_socket->so_type) { 1098 case SOCK_DGRAM: 1099 LIST_REMOVE(unp, unp_reflink); 1100 so = unp->unp_socket; 1101 SOCK_LOCK(so); 1102 so->so_state &= ~SS_ISCONNECTED; 1103 SOCK_UNLOCK(so); 1104 break; 1105 1106 case SOCK_STREAM: 1107 soisdisconnected(unp->unp_socket); 1108 unp2->unp_conn = NULL; 1109 soisdisconnected(unp2->unp_socket); 1110 break; 1111 } 1112} 1113 1114/* 1115 * unp_pcblist() assumes that UNIX domain socket memory is never reclaimed by 1116 * the zone (UMA_ZONE_NOFREE), and as such potentially stale pointers are 1117 * safe to reference. It first scans the list of struct unpcb's to generate 1118 * a pointer list, then it rescans its list one entry at a time to 1119 * externalize and copyout. It checks the generation number to see if a 1120 * struct unpcb has been reused, and will skip it if so. 1121 */ 1122static int 1123unp_pcblist(SYSCTL_HANDLER_ARGS) 1124{ 1125 int error, i, n; 1126 int freeunp; 1127 struct unpcb *unp, **unp_list; 1128 unp_gen_t gencnt; 1129 struct xunpgen *xug; 1130 struct unp_head *head; 1131 struct xunpcb *xu; 1132 1133 head = ((intptr_t)arg1 == SOCK_DGRAM ? &unp_dhead : &unp_shead); 1134 1135 /* 1136 * The process of preparing the PCB list is too time-consuming and 1137 * resource-intensive to repeat twice on every request. 1138 */ 1139 if (req->oldptr == NULL) { 1140 n = unp_count; 1141 req->oldidx = 2 * (sizeof *xug) 1142 + (n + n/8) * sizeof(struct xunpcb); 1143 return (0); 1144 } 1145 1146 if (req->newptr != NULL) 1147 return (EPERM); 1148 1149 /* 1150 * OK, now we're committed to doing something. 1151 */ 1152 xug = malloc(sizeof(*xug), M_TEMP, M_WAITOK); 1153 UNP_LOCK(); 1154 gencnt = unp_gencnt; 1155 n = unp_count; 1156 UNP_UNLOCK(); 1157 1158 xug->xug_len = sizeof *xug; 1159 xug->xug_count = n; 1160 xug->xug_gen = gencnt; 1161 xug->xug_sogen = so_gencnt; 1162 error = SYSCTL_OUT(req, xug, sizeof *xug); 1163 if (error) { 1164 free(xug, M_TEMP); 1165 return (error); 1166 } 1167 1168 unp_list = malloc(n * sizeof *unp_list, M_TEMP, M_WAITOK); 1169 1170 UNP_LOCK(); 1171 for (unp = LIST_FIRST(head), i = 0; unp && i < n; 1172 unp = LIST_NEXT(unp, unp_link)) { 1173 if (unp->unp_gencnt <= gencnt) { 1174 if (cr_cansee(req->td->td_ucred, 1175 unp->unp_socket->so_cred)) 1176 continue; 1177 unp_list[i++] = unp; 1178 unp->unp_refcount++; 1179 } 1180 } 1181 UNP_UNLOCK(); 1182 n = i; /* In case we lost some during malloc. */ 1183 1184 error = 0; 1185 xu = malloc(sizeof(*xu), M_TEMP, M_WAITOK | M_ZERO); 1186 for (i = 0; i < n; i++) { 1187 unp = unp_list[i]; 1188 UNP_LOCK(); 1189 unp->unp_refcount--; 1190 if (unp->unp_refcount != 0 && unp->unp_gencnt <= gencnt) { 1191 xu->xu_len = sizeof *xu; 1192 xu->xu_unpp = unp; 1193 /* 1194 * XXX - need more locking here to protect against 1195 * connect/disconnect races for SMP. 1196 */ 1197 if (unp->unp_addr != NULL) 1198 bcopy(unp->unp_addr, &xu->xu_addr, 1199 unp->unp_addr->sun_len); 1200 if (unp->unp_conn != NULL && 1201 unp->unp_conn->unp_addr != NULL) 1202 bcopy(unp->unp_conn->unp_addr, 1203 &xu->xu_caddr, 1204 unp->unp_conn->unp_addr->sun_len); 1205 bcopy(unp, &xu->xu_unp, sizeof *unp); 1206 sotoxsocket(unp->unp_socket, &xu->xu_socket); 1207 UNP_UNLOCK(); 1208 error = SYSCTL_OUT(req, xu, sizeof *xu); 1209 } else { 1210 freeunp = (unp->unp_refcount == 0); 1211 UNP_UNLOCK(); 1212 if (freeunp) 1213 uma_zfree(unp_zone, unp); 1214 } 1215 } 1216 free(xu, M_TEMP); 1217 if (!error) { 1218 /* 1219 * Give the user an updated idea of our state. If the 1220 * generation differs from what we told her before, she knows 1221 * that something happened while we were processing this 1222 * request, and it might be necessary to retry. 1223 */ 1224 xug->xug_gen = unp_gencnt; 1225 xug->xug_sogen = so_gencnt; 1226 xug->xug_count = unp_count; 1227 error = SYSCTL_OUT(req, xug, sizeof *xug); 1228 } 1229 free(unp_list, M_TEMP); 1230 free(xug, M_TEMP); 1231 return (error); 1232} 1233 1234SYSCTL_PROC(_net_local_dgram, OID_AUTO, pcblist, CTLFLAG_RD, 1235 (caddr_t)(long)SOCK_DGRAM, 0, unp_pcblist, "S,xunpcb", 1236 "List of active local datagram sockets"); 1237SYSCTL_PROC(_net_local_stream, OID_AUTO, pcblist, CTLFLAG_RD, 1238 (caddr_t)(long)SOCK_STREAM, 0, unp_pcblist, "S,xunpcb", 1239 "List of active local stream sockets"); 1240 1241static void 1242unp_shutdown(struct unpcb *unp) 1243{ 1244 struct socket *so; 1245 1246 UNP_LOCK_ASSERT(); 1247 1248 if (unp->unp_socket->so_type == SOCK_STREAM && unp->unp_conn && 1249 (so = unp->unp_conn->unp_socket)) 1250 socantrcvmore(so); 1251} 1252 1253static void 1254unp_drop(struct unpcb *unp, int errno) 1255{ 1256 struct socket *so = unp->unp_socket; 1257 1258 UNP_LOCK_ASSERT(); 1259 1260 so->so_error = errno; 1261 unp_disconnect(unp); 1262} 1263 1264static void 1265unp_freerights(struct file **rp, int fdcount) 1266{ 1267 int i; 1268 struct file *fp; 1269 1270 for (i = 0; i < fdcount; i++) { 1271 fp = *rp; 1272 /* 1273 * Zero the pointer before calling unp_discard since it may 1274 * end up in unp_gc().. 1275 * 1276 * XXXRW: This is less true than it used to be. 1277 */ 1278 *rp++ = 0; 1279 unp_discard(fp); 1280 } 1281} 1282 1283int 1284unp_externalize(struct mbuf *control, struct mbuf **controlp) 1285{ 1286 struct thread *td = curthread; /* XXX */ 1287 struct cmsghdr *cm = mtod(control, struct cmsghdr *); 1288 int i; 1289 int *fdp; 1290 struct file **rp; 1291 struct file *fp; 1292 void *data; 1293 socklen_t clen = control->m_len, datalen; 1294 int error, newfds; 1295 int f; 1296 u_int newlen; 1297 1298 UNP_UNLOCK_ASSERT(); 1299 1300 error = 0; 1301 if (controlp != NULL) /* controlp == NULL => free control messages */ 1302 *controlp = NULL; 1303 1304 while (cm != NULL) { 1305 if (sizeof(*cm) > clen || cm->cmsg_len > clen) { 1306 error = EINVAL; 1307 break; 1308 } 1309 1310 data = CMSG_DATA(cm); 1311 datalen = (caddr_t)cm + cm->cmsg_len - (caddr_t)data; 1312 1313 if (cm->cmsg_level == SOL_SOCKET 1314 && cm->cmsg_type == SCM_RIGHTS) { 1315 newfds = datalen / sizeof(struct file *); 1316 rp = data; 1317 1318 /* If we're not outputting the descriptors free them. */ 1319 if (error || controlp == NULL) { 1320 unp_freerights(rp, newfds); 1321 goto next; 1322 } 1323 FILEDESC_LOCK(td->td_proc->p_fd); 1324 /* if the new FD's will not fit free them. */ 1325 if (!fdavail(td, newfds)) { 1326 FILEDESC_UNLOCK(td->td_proc->p_fd); 1327 error = EMSGSIZE; 1328 unp_freerights(rp, newfds); 1329 goto next; 1330 } 1331 /* 1332 * Now change each pointer to an fd in the global 1333 * table to an integer that is the index to the local 1334 * fd table entry that we set up to point to the 1335 * global one we are transferring. 1336 */ 1337 newlen = newfds * sizeof(int); 1338 *controlp = sbcreatecontrol(NULL, newlen, 1339 SCM_RIGHTS, SOL_SOCKET); 1340 if (*controlp == NULL) { 1341 FILEDESC_UNLOCK(td->td_proc->p_fd); 1342 error = E2BIG; 1343 unp_freerights(rp, newfds); 1344 goto next; 1345 } 1346 1347 fdp = (int *) 1348 CMSG_DATA(mtod(*controlp, struct cmsghdr *)); 1349 for (i = 0; i < newfds; i++) { 1350 if (fdalloc(td, 0, &f)) 1351 panic("unp_externalize fdalloc failed"); 1352 fp = *rp++; 1353 td->td_proc->p_fd->fd_ofiles[f] = fp; 1354 FILE_LOCK(fp); 1355 fp->f_msgcount--; 1356 FILE_UNLOCK(fp); 1357 unp_rights--; 1358 *fdp++ = f; 1359 } 1360 FILEDESC_UNLOCK(td->td_proc->p_fd); 1361 } else { 1362 /* We can just copy anything else across. */ 1363 if (error || controlp == NULL) 1364 goto next; 1365 *controlp = sbcreatecontrol(NULL, datalen, 1366 cm->cmsg_type, cm->cmsg_level); 1367 if (*controlp == NULL) { 1368 error = ENOBUFS; 1369 goto next; 1370 } 1371 bcopy(data, 1372 CMSG_DATA(mtod(*controlp, struct cmsghdr *)), 1373 datalen); 1374 } 1375 1376 controlp = &(*controlp)->m_next; 1377 1378next: 1379 if (CMSG_SPACE(datalen) < clen) { 1380 clen -= CMSG_SPACE(datalen); 1381 cm = (struct cmsghdr *) 1382 ((caddr_t)cm + CMSG_SPACE(datalen)); 1383 } else { 1384 clen = 0; 1385 cm = NULL; 1386 } 1387 } 1388 1389 m_freem(control); 1390 1391 return (error); 1392} 1393 1394static void 1395unp_zone_change(void *tag) 1396{ 1397 1398 uma_zone_set_max(unp_zone, maxsockets); 1399} 1400 1401void 1402unp_init(void) 1403{ 1404 1405 unp_zone = uma_zcreate("unpcb", sizeof(struct unpcb), NULL, NULL, 1406 NULL, NULL, UMA_ALIGN_PTR, 0); 1407 if (unp_zone == NULL) 1408 panic("unp_init"); 1409 uma_zone_set_max(unp_zone, maxsockets); 1410 EVENTHANDLER_REGISTER(maxsockets_change, unp_zone_change, 1411 NULL, EVENTHANDLER_PRI_ANY); 1412 LIST_INIT(&unp_dhead); 1413 LIST_INIT(&unp_shead); 1414 TASK_INIT(&unp_gc_task, 0, unp_gc, NULL); 1415 UNP_LOCK_INIT(); 1416} 1417 1418static int 1419unp_internalize(struct mbuf **controlp, struct thread *td) 1420{ 1421 struct mbuf *control = *controlp; 1422 struct proc *p = td->td_proc; 1423 struct filedesc *fdescp = p->p_fd; 1424 struct cmsghdr *cm = mtod(control, struct cmsghdr *); 1425 struct cmsgcred *cmcred; 1426 struct file **rp; 1427 struct file *fp; 1428 struct timeval *tv; 1429 int i, fd, *fdp; 1430 void *data; 1431 socklen_t clen = control->m_len, datalen; 1432 int error, oldfds; 1433 u_int newlen; 1434 1435 UNP_UNLOCK_ASSERT(); 1436 1437 error = 0; 1438 *controlp = NULL; 1439 1440 while (cm != NULL) { 1441 if (sizeof(*cm) > clen || cm->cmsg_level != SOL_SOCKET 1442 || cm->cmsg_len > clen) { 1443 error = EINVAL; 1444 goto out; 1445 } 1446 1447 data = CMSG_DATA(cm); 1448 datalen = (caddr_t)cm + cm->cmsg_len - (caddr_t)data; 1449 1450 switch (cm->cmsg_type) { 1451 /* 1452 * Fill in credential information. 1453 */ 1454 case SCM_CREDS: 1455 *controlp = sbcreatecontrol(NULL, sizeof(*cmcred), 1456 SCM_CREDS, SOL_SOCKET); 1457 if (*controlp == NULL) { 1458 error = ENOBUFS; 1459 goto out; 1460 } 1461 1462 cmcred = (struct cmsgcred *) 1463 CMSG_DATA(mtod(*controlp, struct cmsghdr *)); 1464 cmcred->cmcred_pid = p->p_pid; 1465 cmcred->cmcred_uid = td->td_ucred->cr_ruid; 1466 cmcred->cmcred_gid = td->td_ucred->cr_rgid; 1467 cmcred->cmcred_euid = td->td_ucred->cr_uid; 1468 cmcred->cmcred_ngroups = MIN(td->td_ucred->cr_ngroups, 1469 CMGROUP_MAX); 1470 for (i = 0; i < cmcred->cmcred_ngroups; i++) 1471 cmcred->cmcred_groups[i] = 1472 td->td_ucred->cr_groups[i]; 1473 break; 1474 1475 case SCM_RIGHTS: 1476 oldfds = datalen / sizeof (int); 1477 /* 1478 * Check that all the FDs passed in refer to legal 1479 * files. If not, reject the entire operation. 1480 */ 1481 fdp = data; 1482 FILEDESC_LOCK(fdescp); 1483 for (i = 0; i < oldfds; i++) { 1484 fd = *fdp++; 1485 if ((unsigned)fd >= fdescp->fd_nfiles || 1486 fdescp->fd_ofiles[fd] == NULL) { 1487 FILEDESC_UNLOCK(fdescp); 1488 error = EBADF; 1489 goto out; 1490 } 1491 fp = fdescp->fd_ofiles[fd]; 1492 if (!(fp->f_ops->fo_flags & DFLAG_PASSABLE)) { 1493 FILEDESC_UNLOCK(fdescp); 1494 error = EOPNOTSUPP; 1495 goto out; 1496 } 1497 1498 } 1499 /* 1500 * Now replace the integer FDs with pointers to the 1501 * associated global file table entry.. 1502 */ 1503 newlen = oldfds * sizeof(struct file *); 1504 *controlp = sbcreatecontrol(NULL, newlen, 1505 SCM_RIGHTS, SOL_SOCKET); 1506 if (*controlp == NULL) { 1507 FILEDESC_UNLOCK(fdescp); 1508 error = E2BIG; 1509 goto out; 1510 } 1511 1512 fdp = data; 1513 rp = (struct file **) 1514 CMSG_DATA(mtod(*controlp, struct cmsghdr *)); 1515 for (i = 0; i < oldfds; i++) { 1516 fp = fdescp->fd_ofiles[*fdp++]; 1517 *rp++ = fp; 1518 FILE_LOCK(fp); 1519 fp->f_count++; 1520 fp->f_msgcount++; 1521 FILE_UNLOCK(fp); 1522 unp_rights++; 1523 } 1524 FILEDESC_UNLOCK(fdescp); 1525 break; 1526 1527 case SCM_TIMESTAMP: 1528 *controlp = sbcreatecontrol(NULL, sizeof(*tv), 1529 SCM_TIMESTAMP, SOL_SOCKET); 1530 if (*controlp == NULL) { 1531 error = ENOBUFS; 1532 goto out; 1533 } 1534 tv = (struct timeval *) 1535 CMSG_DATA(mtod(*controlp, struct cmsghdr *)); 1536 microtime(tv); 1537 break; 1538 1539 default: 1540 error = EINVAL; 1541 goto out; 1542 } 1543 1544 controlp = &(*controlp)->m_next; 1545 1546 if (CMSG_SPACE(datalen) < clen) { 1547 clen -= CMSG_SPACE(datalen); 1548 cm = (struct cmsghdr *) 1549 ((caddr_t)cm + CMSG_SPACE(datalen)); 1550 } else { 1551 clen = 0; 1552 cm = NULL; 1553 } 1554 } 1555 1556out: 1557 m_freem(control); 1558 1559 return (error); 1560} 1561 1562static struct mbuf * 1563unp_addsockcred(struct thread *td, struct mbuf *control) 1564{ 1565 struct mbuf *m, *n, *n_prev; 1566 struct sockcred *sc; 1567 const struct cmsghdr *cm; 1568 int ngroups; 1569 int i; 1570 1571 ngroups = MIN(td->td_ucred->cr_ngroups, CMGROUP_MAX); 1572 1573 m = sbcreatecontrol(NULL, SOCKCREDSIZE(ngroups), SCM_CREDS, SOL_SOCKET); 1574 if (m == NULL) 1575 return (control); 1576 1577 sc = (struct sockcred *) CMSG_DATA(mtod(m, struct cmsghdr *)); 1578 sc->sc_uid = td->td_ucred->cr_ruid; 1579 sc->sc_euid = td->td_ucred->cr_uid; 1580 sc->sc_gid = td->td_ucred->cr_rgid; 1581 sc->sc_egid = td->td_ucred->cr_gid; 1582 sc->sc_ngroups = ngroups; 1583 for (i = 0; i < sc->sc_ngroups; i++) 1584 sc->sc_groups[i] = td->td_ucred->cr_groups[i]; 1585 1586 /* 1587 * Unlink SCM_CREDS control messages (struct cmsgcred), since just 1588 * created SCM_CREDS control message (struct sockcred) has another 1589 * format. 1590 */ 1591 if (control != NULL) 1592 for (n = control, n_prev = NULL; n != NULL;) { 1593 cm = mtod(n, struct cmsghdr *); 1594 if (cm->cmsg_level == SOL_SOCKET && 1595 cm->cmsg_type == SCM_CREDS) { 1596 if (n_prev == NULL) 1597 control = n->m_next; 1598 else 1599 n_prev->m_next = n->m_next; 1600 n = m_free(n); 1601 } else { 1602 n_prev = n; 1603 n = n->m_next; 1604 } 1605 } 1606 1607 /* Prepend it to the head. */ 1608 m->m_next = control; 1609 1610 return (m); 1611} 1612 1613/* 1614 * unp_defer indicates whether additional work has been defered for a future 1615 * pass through unp_gc(). It is thread local and does not require explicit 1616 * synchronization. 1617 */ 1618static int unp_defer; 1619 1620static int unp_taskcount; 1621SYSCTL_INT(_net_local, OID_AUTO, taskcount, CTLFLAG_RD, &unp_taskcount, 0, ""); 1622 1623static int unp_recycled; 1624SYSCTL_INT(_net_local, OID_AUTO, recycled, CTLFLAG_RD, &unp_recycled, 0, ""); 1625 1626static void 1627unp_gc(__unused void *arg, int pending) 1628{ 1629 struct file *fp, *nextfp; 1630 struct socket *so; 1631 struct file **extra_ref, **fpp; 1632 int nunref, i; 1633 int nfiles_snap; 1634 int nfiles_slack = 20; 1635 1636 unp_taskcount++; 1637 unp_defer = 0; 1638 /* 1639 * Before going through all this, set all FDs to be NOT deferred and 1640 * NOT externally accessible. 1641 */ 1642 sx_slock(&filelist_lock); 1643 LIST_FOREACH(fp, &filehead, f_list) 1644 fp->f_gcflag &= ~(FMARK|FDEFER); 1645 do { 1646 KASSERT(unp_defer >= 0, ("unp_gc: unp_defer %d", unp_defer)); 1647 LIST_FOREACH(fp, &filehead, f_list) { 1648 FILE_LOCK(fp); 1649 /* 1650 * If the file is not open, skip it -- could be a 1651 * file in the process of being opened, or in the 1652 * process of being closed. If the file is 1653 * "closing", it may have been marked for deferred 1654 * consideration. Clear the flag now if so. 1655 */ 1656 if (fp->f_count == 0) { 1657 if (fp->f_gcflag & FDEFER) 1658 unp_defer--; 1659 fp->f_gcflag &= ~(FMARK|FDEFER); 1660 FILE_UNLOCK(fp); 1661 continue; 1662 } 1663 /* 1664 * If we already marked it as 'defer' in a 1665 * previous pass, then try to process it this 1666 * time and un-mark it. 1667 */ 1668 if (fp->f_gcflag & FDEFER) { 1669 fp->f_gcflag &= ~FDEFER; 1670 unp_defer--; 1671 } else { 1672 /* 1673 * if it's not deferred, then check if it's 1674 * already marked.. if so skip it 1675 */ 1676 if (fp->f_gcflag & FMARK) { 1677 FILE_UNLOCK(fp); 1678 continue; 1679 } 1680 /* 1681 * If all references are from messages in 1682 * transit, then skip it. it's not externally 1683 * accessible. 1684 */ 1685 if (fp->f_count == fp->f_msgcount) { 1686 FILE_UNLOCK(fp); 1687 continue; 1688 } 1689 /* 1690 * If it got this far then it must be 1691 * externally accessible. 1692 */ 1693 fp->f_gcflag |= FMARK; 1694 } 1695 /* 1696 * Either it was deferred, or it is externally 1697 * accessible and not already marked so. Now check 1698 * if it is possibly one of OUR sockets. 1699 */ 1700 if (fp->f_type != DTYPE_SOCKET || 1701 (so = fp->f_data) == NULL) { 1702 FILE_UNLOCK(fp); 1703 continue; 1704 } 1705 if (so->so_proto->pr_domain != &localdomain || 1706 (so->so_proto->pr_flags & PR_RIGHTS) == 0) { 1707 FILE_UNLOCK(fp); 1708 continue; 1709 } 1710 1711 /* 1712 * Tell any other threads that do a subsequent 1713 * fdrop() that we are scanning the message 1714 * buffers. 1715 */ 1716 fp->f_gcflag |= FWAIT; 1717 FILE_UNLOCK(fp); 1718 1719 /* 1720 * So, Ok, it's one of our sockets and it IS 1721 * externally accessible (or was deferred). Now we 1722 * look to see if we hold any file descriptors in its 1723 * message buffers. Follow those links and mark them 1724 * as accessible too. 1725 */ 1726 SOCKBUF_LOCK(&so->so_rcv); 1727 unp_scan(so->so_rcv.sb_mb, unp_mark); 1728 SOCKBUF_UNLOCK(&so->so_rcv); 1729 1730 /* 1731 * Wake up any threads waiting in fdrop(). 1732 */ 1733 FILE_LOCK(fp); 1734 fp->f_gcflag &= ~FWAIT; 1735 wakeup(&fp->f_gcflag); 1736 FILE_UNLOCK(fp); 1737 } 1738 } while (unp_defer); 1739 sx_sunlock(&filelist_lock); 1740 /* 1741 * XXXRW: The following comments need updating for a post-SMPng and 1742 * deferred unp_gc() world, but are still generally accurate. 1743 * 1744 * We grab an extra reference to each of the file table entries that 1745 * are not otherwise accessible and then free the rights that are 1746 * stored in messages on them. 1747 * 1748 * The bug in the orginal code is a little tricky, so I'll describe 1749 * what's wrong with it here. 1750 * 1751 * It is incorrect to simply unp_discard each entry for f_msgcount 1752 * times -- consider the case of sockets A and B that contain 1753 * references to each other. On a last close of some other socket, 1754 * we trigger a gc since the number of outstanding rights (unp_rights) 1755 * is non-zero. If during the sweep phase the gc code unp_discards, 1756 * we end up doing a (full) closef on the descriptor. A closef on A 1757 * results in the following chain. Closef calls soo_close, which 1758 * calls soclose. Soclose calls first (through the switch 1759 * uipc_usrreq) unp_detach, which re-invokes unp_gc. Unp_gc simply 1760 * returns because the previous instance had set unp_gcing, and we 1761 * return all the way back to soclose, which marks the socket with 1762 * SS_NOFDREF, and then calls sofree. Sofree calls sorflush to free 1763 * up the rights that are queued in messages on the socket A, i.e., 1764 * the reference on B. The sorflush calls via the dom_dispose switch 1765 * unp_dispose, which unp_scans with unp_discard. This second 1766 * instance of unp_discard just calls closef on B. 1767 * 1768 * Well, a similar chain occurs on B, resulting in a sorflush on B, 1769 * which results in another closef on A. Unfortunately, A is already 1770 * being closed, and the descriptor has already been marked with 1771 * SS_NOFDREF, and soclose panics at this point. 1772 * 1773 * Here, we first take an extra reference to each inaccessible 1774 * descriptor. Then, we call sorflush ourself, since we know it is a 1775 * Unix domain socket anyhow. After we destroy all the rights 1776 * carried in messages, we do a last closef to get rid of our extra 1777 * reference. This is the last close, and the unp_detach etc will 1778 * shut down the socket. 1779 * 1780 * 91/09/19, bsy@cs.cmu.edu 1781 */ 1782again: 1783 nfiles_snap = openfiles + nfiles_slack; /* some slack */ 1784 extra_ref = malloc(nfiles_snap * sizeof(struct file *), M_TEMP, 1785 M_WAITOK); 1786 sx_slock(&filelist_lock); 1787 if (nfiles_snap < openfiles) { 1788 sx_sunlock(&filelist_lock); 1789 free(extra_ref, M_TEMP); 1790 nfiles_slack += 20; 1791 goto again; 1792 } 1793 for (nunref = 0, fp = LIST_FIRST(&filehead), fpp = extra_ref; 1794 fp != NULL; fp = nextfp) { 1795 nextfp = LIST_NEXT(fp, f_list); 1796 FILE_LOCK(fp); 1797 /* 1798 * If it's not open, skip it 1799 */ 1800 if (fp->f_count == 0) { 1801 FILE_UNLOCK(fp); 1802 continue; 1803 } 1804 /* 1805 * If all refs are from msgs, and it's not marked accessible 1806 * then it must be referenced from some unreachable cycle of 1807 * (shut-down) FDs, so include it in our list of FDs to 1808 * remove. 1809 */ 1810 if (fp->f_count == fp->f_msgcount && !(fp->f_gcflag & FMARK)) { 1811 *fpp++ = fp; 1812 nunref++; 1813 fp->f_count++; 1814 } 1815 FILE_UNLOCK(fp); 1816 } 1817 sx_sunlock(&filelist_lock); 1818 /* 1819 * For each FD on our hit list, do the following two things: 1820 */ 1821 for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp) { 1822 struct file *tfp = *fpp; 1823 FILE_LOCK(tfp); 1824 if (tfp->f_type == DTYPE_SOCKET && 1825 tfp->f_data != NULL) { 1826 FILE_UNLOCK(tfp); 1827 sorflush(tfp->f_data); 1828 } else { 1829 FILE_UNLOCK(tfp); 1830 } 1831 } 1832 for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp) { 1833 closef(*fpp, (struct thread *) NULL); 1834 unp_recycled++; 1835 } 1836 free(extra_ref, M_TEMP); 1837} 1838 1839void 1840unp_dispose(struct mbuf *m) 1841{ 1842 1843 if (m) 1844 unp_scan(m, unp_discard); 1845} 1846 1847static int 1848unp_listen(struct socket *so, struct unpcb *unp, int backlog, 1849 struct thread *td) 1850{ 1851 int error; 1852 1853 UNP_LOCK_ASSERT(); 1854 1855 SOCK_LOCK(so); 1856 error = solisten_proto_check(so); 1857 if (error == 0) { 1858 cru2x(td->td_ucred, &unp->unp_peercred); 1859 unp->unp_flags |= UNP_HAVEPCCACHED; 1860 solisten_proto(so, backlog); 1861 } 1862 SOCK_UNLOCK(so); 1863 return (error); 1864} 1865 1866static void 1867unp_scan(struct mbuf *m0, void (*op)(struct file *)) 1868{ 1869 struct mbuf *m; 1870 struct file **rp; 1871 struct cmsghdr *cm; 1872 void *data; 1873 int i; 1874 socklen_t clen, datalen; 1875 int qfds; 1876 1877 while (m0 != NULL) { 1878 for (m = m0; m; m = m->m_next) { 1879 if (m->m_type != MT_CONTROL) 1880 continue; 1881 1882 cm = mtod(m, struct cmsghdr *); 1883 clen = m->m_len; 1884 1885 while (cm != NULL) { 1886 if (sizeof(*cm) > clen || cm->cmsg_len > clen) 1887 break; 1888 1889 data = CMSG_DATA(cm); 1890 datalen = (caddr_t)cm + cm->cmsg_len 1891 - (caddr_t)data; 1892 1893 if (cm->cmsg_level == SOL_SOCKET && 1894 cm->cmsg_type == SCM_RIGHTS) { 1895 qfds = datalen / sizeof (struct file *); 1896 rp = data; 1897 for (i = 0; i < qfds; i++) 1898 (*op)(*rp++); 1899 } 1900 1901 if (CMSG_SPACE(datalen) < clen) { 1902 clen -= CMSG_SPACE(datalen); 1903 cm = (struct cmsghdr *) 1904 ((caddr_t)cm + CMSG_SPACE(datalen)); 1905 } else { 1906 clen = 0; 1907 cm = NULL; 1908 } 1909 } 1910 } 1911 m0 = m0->m_act; 1912 } 1913} 1914 1915static void 1916unp_mark(struct file *fp) 1917{ 1918 if (fp->f_gcflag & FMARK) 1919 return; 1920 unp_defer++; 1921 fp->f_gcflag |= (FMARK|FDEFER); 1922} 1923 1924static void 1925unp_discard(struct file *fp) 1926{ 1927 UNP_LOCK(); 1928 FILE_LOCK(fp); 1929 fp->f_msgcount--; 1930 unp_rights--; 1931 FILE_UNLOCK(fp); 1932 UNP_UNLOCK(); 1933 (void) closef(fp, (struct thread *)NULL); 1934} 1935