uipc_usrreq.c revision 166844
1/*- 2 * Copyright (c) 1982, 1986, 1989, 1991, 1993 3 * The Regents of the University of California. 4 * Copyright (c) 2004-2007 Robert N. M. Watson 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 4. Neither the name of the University nor the names of its contributors 16 * may be used to endorse or promote products derived from this software 17 * without specific prior written permission. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 29 * SUCH DAMAGE. 30 * 31 * From: @(#)uipc_usrreq.c 8.3 (Berkeley) 1/4/94 32 */ 33 34/* 35 * UNIX Domain (Local) Sockets 36 * 37 * This is an implementation of UNIX (local) domain sockets. Each socket has 38 * an associated struct unpcb (UNIX protocol control block). 
Stream sockets 39 * may be connected to 0 or 1 other socket. Datagram sockets may be 40 * connected to 0, 1, or many other sockets. Sockets may be created and 41 * connected in pairs (socketpair(2)), or bound/connected to using the file 42 * system name space. For most purposes, only the receive socket buffer is 43 * used, as sending on one socket delivers directly to the receive socket 44 * buffer of a second socket. 45 * 46 * The implementation is substantially complicated by the fact that 47 * "ancillary data", such as file descriptors or credentials, may be passed 48 * across UNIX domain sockets. The potential for passing UNIX domain sockets 49 * over other UNIX domain sockets requires the implementation of a simple 50 * garbage collector to find and tear down cycles of disconnected sockets. 51 * 52 * TODO: 53 * SEQPACKET, RDM 54 * rethink name space problems 55 * need a proper out-of-band 56 * lock pushdown 57 */ 58 59#include <sys/cdefs.h> 60__FBSDID("$FreeBSD: head/sys/kern/uipc_usrreq.c 166844 2007-02-20 10:50:02Z rwatson $"); 61 62#include "opt_mac.h" 63 64#include <sys/param.h> 65#include <sys/domain.h> 66#include <sys/fcntl.h> 67#include <sys/malloc.h> /* XXX must be before <sys/file.h> */ 68#include <sys/eventhandler.h> 69#include <sys/file.h> 70#include <sys/filedesc.h> 71#include <sys/jail.h> 72#include <sys/kernel.h> 73#include <sys/lock.h> 74#include <sys/mbuf.h> 75#include <sys/mount.h> 76#include <sys/mutex.h> 77#include <sys/namei.h> 78#include <sys/proc.h> 79#include <sys/protosw.h> 80#include <sys/resourcevar.h> 81#include <sys/socket.h> 82#include <sys/socketvar.h> 83#include <sys/signalvar.h> 84#include <sys/stat.h> 85#include <sys/sx.h> 86#include <sys/sysctl.h> 87#include <sys/systm.h> 88#include <sys/taskqueue.h> 89#include <sys/un.h> 90#include <sys/unpcb.h> 91#include <sys/vnode.h> 92 93#include <security/mac/mac_framework.h> 94 95#include <vm/uma.h> 96 97static uma_zone_t unp_zone; 98static unp_gen_t unp_gencnt; 99static u_int 
unp_count; /* Count of local sockets. */ 100static ino_t unp_ino; /* Prototype for fake inode numbers. */ 101static int unp_rights; /* File descriptors in flight. */ 102static struct unp_head unp_shead; /* List of local stream sockets. */ 103static struct unp_head unp_dhead; /* List of local datagram sockets. */ 104 105static const struct sockaddr sun_noname = { sizeof(sun_noname), AF_LOCAL }; 106 107/* 108 * Garbage collection of cyclic file descriptor/socket references occurs 109 * asynchronously in a taskqueue context in order to avoid recursion and 110 * reentrance in the UNIX domain socket, file descriptor, and socket layer 111 * code. See unp_gc() for a full description. 112 */ 113static struct task unp_gc_task; 114 115/* 116 * Both send and receive buffers are allocated PIPSIZ bytes of buffering for 117 * stream sockets, although the total for sender and receiver is actually 118 * only PIPSIZ. 119 * 120 * Datagram sockets really use the sendspace as the maximum datagram size, 121 * and don't really want to reserve the sendspace. Their recvspace should be 122 * large enough for at least one max-size datagram plus address. 
123 */ 124#ifndef PIPSIZ 125#define PIPSIZ 8192 126#endif 127static u_long unpst_sendspace = PIPSIZ; 128static u_long unpst_recvspace = PIPSIZ; 129static u_long unpdg_sendspace = 2*1024; /* really max datagram size */ 130static u_long unpdg_recvspace = 4*1024; 131 132SYSCTL_NODE(_net, PF_LOCAL, local, CTLFLAG_RW, 0, "Local domain"); 133SYSCTL_NODE(_net_local, SOCK_STREAM, stream, CTLFLAG_RW, 0, "SOCK_STREAM"); 134SYSCTL_NODE(_net_local, SOCK_DGRAM, dgram, CTLFLAG_RW, 0, "SOCK_DGRAM"); 135 136SYSCTL_ULONG(_net_local_stream, OID_AUTO, sendspace, CTLFLAG_RW, 137 &unpst_sendspace, 0, ""); 138SYSCTL_ULONG(_net_local_stream, OID_AUTO, recvspace, CTLFLAG_RW, 139 &unpst_recvspace, 0, ""); 140SYSCTL_ULONG(_net_local_dgram, OID_AUTO, maxdgram, CTLFLAG_RW, 141 &unpdg_sendspace, 0, ""); 142SYSCTL_ULONG(_net_local_dgram, OID_AUTO, recvspace, CTLFLAG_RW, 143 &unpdg_recvspace, 0, ""); 144SYSCTL_INT(_net_local, OID_AUTO, inflight, CTLFLAG_RD, &unp_rights, 0, ""); 145 146/* 147 * Currently, UNIX domain sockets are protected by a single subsystem lock, 148 * which covers global data structures and variables, the contents of each 149 * per-socket unpcb structure, and the so_pcb field in sockets attached to 150 * the UNIX domain. This provides for a moderate degree of parallelism, as 151 * receive operations on UNIX domain sockets do not need to acquire the 152 * subsystem lock. Finer grained locking to permit send() without acquiring 153 * a global lock would be a logical next step. 154 * 155 * The UNIX domain socket lock precedes all socket layer locks, including the 156 * socket lock and socket buffer lock, permitting UNIX domain socket code to 157 * call into socket support routines without releasing its locks. 158 * 159 * Some caution is required in areas where the UNIX domain socket code enters 160 * VFS in order to create or find rendezvous points. 
This results in 161 * dropping of the UNIX domain socket subsystem lock, acquisition of the 162 * Giant lock, and potential sleeping. This increases the chances of races, 163 * and exposes weaknesses in the socket->protocol API by offering poor 164 * failure modes. 165 */ 166static struct mtx unp_mtx; 167#define UNP_LOCK_INIT() \ 168 mtx_init(&unp_mtx, "unp", NULL, MTX_DEF | MTX_RECURSE) 169#define UNP_LOCK() mtx_lock(&unp_mtx) 170#define UNP_UNLOCK() mtx_unlock(&unp_mtx) 171#define UNP_LOCK_ASSERT() mtx_assert(&unp_mtx, MA_OWNED) 172#define UNP_UNLOCK_ASSERT() mtx_assert(&unp_mtx, MA_NOTOWNED) 173 174static int unp_connect(struct socket *, struct sockaddr *, 175 struct thread *); 176static int unp_connect2(struct socket *so, struct socket *so2, int); 177static void unp_disconnect(struct unpcb *); 178static void unp_shutdown(struct unpcb *); 179static void unp_drop(struct unpcb *, int); 180static void unp_gc(__unused void *, int); 181static void unp_scan(struct mbuf *, void (*)(struct file *)); 182static void unp_mark(struct file *); 183static void unp_discard(struct file *); 184static void unp_freerights(struct file **, int); 185static int unp_internalize(struct mbuf **, struct thread *); 186static int unp_listen(struct socket *, struct unpcb *, int, 187 struct thread *); 188static struct mbuf *unp_addsockcred(struct thread *, struct mbuf *); 189 190/* 191 * Definitions of protocols supported in the LOCAL domain. 
192 */ 193static struct domain localdomain; 194static struct protosw localsw[] = { 195{ 196 .pr_type = SOCK_STREAM, 197 .pr_domain = &localdomain, 198 .pr_flags = PR_CONNREQUIRED|PR_WANTRCVD|PR_RIGHTS, 199 .pr_ctloutput = &uipc_ctloutput, 200 .pr_usrreqs = &uipc_usrreqs 201}, 202{ 203 .pr_type = SOCK_DGRAM, 204 .pr_domain = &localdomain, 205 .pr_flags = PR_ATOMIC|PR_ADDR|PR_RIGHTS, 206 .pr_usrreqs = &uipc_usrreqs 207}, 208}; 209 210static struct domain localdomain = { 211 .dom_family = AF_LOCAL, 212 .dom_name = "local", 213 .dom_init = unp_init, 214 .dom_externalize = unp_externalize, 215 .dom_dispose = unp_dispose, 216 .dom_protosw = localsw, 217 .dom_protoswNPROTOSW = &localsw[sizeof(localsw)/sizeof(localsw[0])] 218}; 219DOMAIN_SET(local); 220 221static void 222uipc_abort(struct socket *so) 223{ 224 struct unpcb *unp; 225 226 unp = sotounpcb(so); 227 KASSERT(unp != NULL, ("uipc_abort: unp == NULL")); 228 UNP_LOCK(); 229 unp_drop(unp, ECONNABORTED); 230 UNP_UNLOCK(); 231} 232 233static int 234uipc_accept(struct socket *so, struct sockaddr **nam) 235{ 236 struct unpcb *unp; 237 const struct sockaddr *sa; 238 239 /* 240 * Pass back name of connected socket, if it was bound and we are 241 * still connected (our peer may have closed already!). 
242 */ 243 unp = sotounpcb(so); 244 KASSERT(unp != NULL, ("uipc_accept: unp == NULL")); 245 *nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK); 246 UNP_LOCK(); 247 if (unp->unp_conn != NULL && unp->unp_conn->unp_addr != NULL) 248 sa = (struct sockaddr *) unp->unp_conn->unp_addr; 249 else 250 sa = &sun_noname; 251 bcopy(sa, *nam, sa->sa_len); 252 UNP_UNLOCK(); 253 return (0); 254} 255 256static int 257uipc_attach(struct socket *so, int proto, struct thread *td) 258{ 259 struct unpcb *unp; 260 int error; 261 262 KASSERT(so->so_pcb == NULL, ("uipc_attach: so_pcb != NULL")); 263 if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) { 264 switch (so->so_type) { 265 case SOCK_STREAM: 266 error = soreserve(so, unpst_sendspace, unpst_recvspace); 267 break; 268 269 case SOCK_DGRAM: 270 error = soreserve(so, unpdg_sendspace, unpdg_recvspace); 271 break; 272 273 default: 274 panic("unp_attach"); 275 } 276 if (error) 277 return (error); 278 } 279 unp = uma_zalloc(unp_zone, M_NOWAIT | M_ZERO); 280 if (unp == NULL) 281 return (ENOBUFS); 282 LIST_INIT(&unp->unp_refs); 283 unp->unp_socket = so; 284 so->so_pcb = unp; 285 286 unp->unp_refcount = 1; 287 UNP_LOCK(); 288 unp->unp_gencnt = ++unp_gencnt; 289 unp_count++; 290 LIST_INSERT_HEAD(so->so_type == SOCK_DGRAM ? 
&unp_dhead : &unp_shead, 291 unp, unp_link); 292 UNP_UNLOCK(); 293 294 return (0); 295} 296 297static int 298uipc_bind(struct socket *so, struct sockaddr *nam, struct thread *td) 299{ 300 struct sockaddr_un *soun = (struct sockaddr_un *)nam; 301 struct vattr vattr; 302 int error, namelen; 303 struct nameidata nd; 304 struct unpcb *unp; 305 struct vnode *vp; 306 struct mount *mp; 307 char *buf; 308 309 unp = sotounpcb(so); 310 KASSERT(unp != NULL, ("uipc_bind: unp == NULL")); 311 312 namelen = soun->sun_len - offsetof(struct sockaddr_un, sun_path); 313 if (namelen <= 0) 314 return (EINVAL); 315 316 /* 317 * We don't allow simultaneous bind() calls on a single UNIX domain 318 * socket, so flag in-progress operations, and return an error if an 319 * operation is already in progress. 320 * 321 * Historically, we have not allowed a socket to be rebound, so this 322 * also returns an error. Not allowing re-binding certainly 323 * simplifies the implementation and avoids a great many possible 324 * failure modes. 
325 */ 326 UNP_LOCK(); 327 if (unp->unp_vnode != NULL) { 328 UNP_UNLOCK(); 329 return (EINVAL); 330 } 331 if (unp->unp_flags & UNP_BINDING) { 332 UNP_UNLOCK(); 333 return (EALREADY); 334 } 335 unp->unp_flags |= UNP_BINDING; 336 UNP_UNLOCK(); 337 338 buf = malloc(namelen + 1, M_TEMP, M_WAITOK); 339 strlcpy(buf, soun->sun_path, namelen + 1); 340 341 mtx_lock(&Giant); 342restart: 343 mtx_assert(&Giant, MA_OWNED); 344 NDINIT(&nd, CREATE, NOFOLLOW | LOCKPARENT | SAVENAME, UIO_SYSSPACE, 345 buf, td); 346/* SHOULD BE ABLE TO ADOPT EXISTING AND wakeup() ALA FIFO's */ 347 error = namei(&nd); 348 if (error) 349 goto error; 350 vp = nd.ni_vp; 351 if (vp != NULL || vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { 352 NDFREE(&nd, NDF_ONLY_PNBUF); 353 if (nd.ni_dvp == vp) 354 vrele(nd.ni_dvp); 355 else 356 vput(nd.ni_dvp); 357 if (vp != NULL) { 358 vrele(vp); 359 error = EADDRINUSE; 360 goto error; 361 } 362 error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH); 363 if (error) 364 goto error; 365 goto restart; 366 } 367 VATTR_NULL(&vattr); 368 vattr.va_type = VSOCK; 369 vattr.va_mode = (ACCESSPERMS & ~td->td_proc->p_fd->fd_cmask); 370#ifdef MAC 371 error = mac_check_vnode_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd, 372 &vattr); 373#endif 374 if (error == 0) { 375 VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE); 376 error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr); 377 } 378 NDFREE(&nd, NDF_ONLY_PNBUF); 379 vput(nd.ni_dvp); 380 if (error) { 381 vn_finished_write(mp); 382 goto error; 383 } 384 vp = nd.ni_vp; 385 ASSERT_VOP_LOCKED(vp, "uipc_bind"); 386 soun = (struct sockaddr_un *)sodupsockaddr(nam, M_WAITOK); 387 UNP_LOCK(); 388 vp->v_socket = unp->unp_socket; 389 unp->unp_vnode = vp; 390 unp->unp_addr = soun; 391 unp->unp_flags &= ~UNP_BINDING; 392 UNP_UNLOCK(); 393 VOP_UNLOCK(vp, 0, td); 394 vn_finished_write(mp); 395 mtx_unlock(&Giant); 396 free(buf, M_TEMP); 397 return (0); 398error: 399 UNP_LOCK(); 400 unp->unp_flags &= ~UNP_BINDING; 401 UNP_UNLOCK(); 402 
mtx_unlock(&Giant); 403 free(buf, M_TEMP); 404 return (error); 405} 406 407static int 408uipc_connect(struct socket *so, struct sockaddr *nam, struct thread *td) 409{ 410 int error; 411 412 KASSERT(td == curthread, ("uipc_connect: td != curthread")); 413 UNP_LOCK(); 414 error = unp_connect(so, nam, td); 415 UNP_UNLOCK(); 416 return (error); 417} 418 419/* 420 * XXXRW: Should also unbind? 421 */ 422static void 423uipc_close(struct socket *so) 424{ 425 struct unpcb *unp; 426 427 unp = sotounpcb(so); 428 KASSERT(unp != NULL, ("uipc_close: unp == NULL")); 429 UNP_LOCK(); 430 unp_disconnect(unp); 431 UNP_UNLOCK(); 432} 433 434int 435uipc_connect2(struct socket *so1, struct socket *so2) 436{ 437 struct unpcb *unp; 438 int error; 439 440 unp = sotounpcb(so1); 441 KASSERT(unp != NULL, ("uipc_connect2: unp == NULL")); 442 UNP_LOCK(); 443 error = unp_connect2(so1, so2, PRU_CONNECT2); 444 UNP_UNLOCK(); 445 return (error); 446} 447 448/* control is EOPNOTSUPP */ 449 450static void 451uipc_detach(struct socket *so) 452{ 453 struct sockaddr_un *saved_unp_addr; 454 struct unpcb *unp; 455 struct vnode *vp; 456 int freeunp, local_unp_rights; 457 458 unp = sotounpcb(so); 459 KASSERT(unp != NULL, ("uipc_detach: unp == NULL")); 460 UNP_LOCK(); 461 LIST_REMOVE(unp, unp_link); 462 unp->unp_gencnt = ++unp_gencnt; 463 --unp_count; 464 if ((vp = unp->unp_vnode) != NULL) { 465 unp->unp_vnode->v_socket = NULL; 466 unp->unp_vnode = NULL; 467 } 468 if (unp->unp_conn != NULL) 469 unp_disconnect(unp); 470 while (!LIST_EMPTY(&unp->unp_refs)) { 471 struct unpcb *ref = LIST_FIRST(&unp->unp_refs); 472 unp_drop(ref, ECONNRESET); 473 } 474 unp->unp_socket->so_pcb = NULL; 475 local_unp_rights = unp_rights; 476 saved_unp_addr = unp->unp_addr; 477 unp->unp_addr = NULL; 478 unp->unp_refcount--; 479 freeunp = (unp->unp_refcount == 0); 480 UNP_UNLOCK(); 481 if (saved_unp_addr != NULL) 482 FREE(saved_unp_addr, M_SONAME); 483 if (freeunp) 484 uma_zfree(unp_zone, unp); 485 if (vp) { 486 int vfslocked; 487 488 
vfslocked = VFS_LOCK_GIANT(vp->v_mount); 489 vrele(vp); 490 VFS_UNLOCK_GIANT(vfslocked); 491 } 492 if (local_unp_rights) 493 taskqueue_enqueue(taskqueue_thread, &unp_gc_task); 494} 495 496static int 497uipc_disconnect(struct socket *so) 498{ 499 struct unpcb *unp; 500 501 unp = sotounpcb(so); 502 KASSERT(unp != NULL, ("uipc_disconnect: unp == NULL")); 503 UNP_LOCK(); 504 unp_disconnect(unp); 505 UNP_UNLOCK(); 506 return (0); 507} 508 509static int 510uipc_listen(struct socket *so, int backlog, struct thread *td) 511{ 512 struct unpcb *unp; 513 int error; 514 515 unp = sotounpcb(so); 516 KASSERT(unp != NULL, ("uipc_listen: unp == NULL")); 517 UNP_LOCK(); 518 if (unp->unp_vnode == NULL) { 519 UNP_UNLOCK(); 520 return (EINVAL); 521 } 522 error = unp_listen(so, unp, backlog, td); 523 UNP_UNLOCK(); 524 return (error); 525} 526 527static int 528uipc_peeraddr(struct socket *so, struct sockaddr **nam) 529{ 530 struct unpcb *unp; 531 const struct sockaddr *sa; 532 533 unp = sotounpcb(so); 534 KASSERT(unp != NULL, ("uipc_peeraddr: unp == NULL")); 535 *nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK); 536 UNP_LOCK(); 537 if (unp->unp_conn != NULL && unp->unp_conn->unp_addr!= NULL) 538 sa = (struct sockaddr *) unp->unp_conn->unp_addr; 539 else { 540 /* 541 * XXX: It seems that this test always fails even when 542 * connection is established. So, this else clause is 543 * added as workaround to return PF_LOCAL sockaddr. 544 */ 545 sa = &sun_noname; 546 } 547 bcopy(sa, *nam, sa->sa_len); 548 UNP_UNLOCK(); 549 return (0); 550} 551 552static int 553uipc_rcvd(struct socket *so, int flags) 554{ 555 struct unpcb *unp; 556 struct socket *so2; 557 u_int mbcnt, sbcc; 558 u_long newhiwat; 559 560 unp = sotounpcb(so); 561 KASSERT(unp != NULL, ("uipc_rcvd: unp == NULL")); 562 switch (so->so_type) { 563 case SOCK_DGRAM: 564 panic("uipc_rcvd DGRAM?"); 565 /*NOTREACHED*/ 566 567 case SOCK_STREAM: 568 /* 569 * Adjust backpressure on sender and wakeup any waiting to 570 * write. 
571 */ 572 SOCKBUF_LOCK(&so->so_rcv); 573 mbcnt = so->so_rcv.sb_mbcnt; 574 sbcc = so->so_rcv.sb_cc; 575 SOCKBUF_UNLOCK(&so->so_rcv); 576 UNP_LOCK(); 577 if (unp->unp_conn == NULL) { 578 UNP_UNLOCK(); 579 break; 580 } 581 so2 = unp->unp_conn->unp_socket; 582 SOCKBUF_LOCK(&so2->so_snd); 583 so2->so_snd.sb_mbmax += unp->unp_mbcnt - mbcnt; 584 newhiwat = so2->so_snd.sb_hiwat + unp->unp_cc - sbcc; 585 (void)chgsbsize(so2->so_cred->cr_uidinfo, &so2->so_snd.sb_hiwat, 586 newhiwat, RLIM_INFINITY); 587 sowwakeup_locked(so2); 588 unp->unp_mbcnt = mbcnt; 589 unp->unp_cc = sbcc; 590 UNP_UNLOCK(); 591 break; 592 593 default: 594 panic("uipc_rcvd unknown socktype"); 595 } 596 return (0); 597} 598 599/* pru_rcvoob is EOPNOTSUPP */ 600 601static int 602uipc_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, 603 struct mbuf *control, struct thread *td) 604{ 605 struct unpcb *unp, *unp2; 606 struct socket *so2; 607 u_int mbcnt, sbcc; 608 u_long newhiwat; 609 int error = 0; 610 611 unp = sotounpcb(so); 612 KASSERT(unp != NULL, ("uipc_send: unp == NULL")); 613 if (flags & PRUS_OOB) { 614 error = EOPNOTSUPP; 615 goto release; 616 } 617 618 if (control != NULL && (error = unp_internalize(&control, td))) 619 goto release; 620 621 UNP_LOCK(); 622 switch (so->so_type) { 623 case SOCK_DGRAM: 624 { 625 const struct sockaddr *from; 626 627 if (nam != NULL) { 628 if (unp->unp_conn != NULL) { 629 error = EISCONN; 630 break; 631 } 632 error = unp_connect(so, nam, td); 633 if (error) 634 break; 635 } 636 /* 637 * Because connect() and send() are non-atomic in a sendto() 638 * with a target address, it's possible that the socket will 639 * have disconnected before the send() can run. In that case 640 * return the slightly counter-intuitive but otherwise 641 * correct error that the socket is not connected. 
642 */ 643 unp2 = unp->unp_conn; 644 if (unp2 == NULL) { 645 error = ENOTCONN; 646 break; 647 } 648 so2 = unp2->unp_socket; 649 if (unp->unp_addr != NULL) 650 from = (struct sockaddr *)unp->unp_addr; 651 else 652 from = &sun_noname; 653 if (unp2->unp_flags & UNP_WANTCRED) 654 control = unp_addsockcred(td, control); 655 SOCKBUF_LOCK(&so2->so_rcv); 656 if (sbappendaddr_locked(&so2->so_rcv, from, m, control)) { 657 sorwakeup_locked(so2); 658 m = NULL; 659 control = NULL; 660 } else { 661 SOCKBUF_UNLOCK(&so2->so_rcv); 662 error = ENOBUFS; 663 } 664 if (nam != NULL) 665 unp_disconnect(unp); 666 break; 667 } 668 669 case SOCK_STREAM: 670 /* 671 * Connect if not connected yet. 672 * 673 * Note: A better implementation would complain if not equal 674 * to the peer's address. 675 */ 676 if ((so->so_state & SS_ISCONNECTED) == 0) { 677 if (nam != NULL) { 678 error = unp_connect(so, nam, td); 679 if (error) 680 break; /* XXX */ 681 } else { 682 error = ENOTCONN; 683 break; 684 } 685 } 686 687 /* Lockless read. */ 688 if (so->so_snd.sb_state & SBS_CANTSENDMORE) { 689 error = EPIPE; 690 break; 691 } 692 /* 693 * Because connect() and send() are non-atomic in a sendto() 694 * with a target address, it's possible that the socket will 695 * have disconnected before the send() can run. In that case 696 * return the slightly counter-intuitive but otherwise 697 * correct error that the socket is not connected. 698 */ 699 unp2 = unp->unp_conn; 700 if (unp2 == NULL) { 701 error = ENOTCONN; 702 break; 703 } 704 so2 = unp2->unp_socket; 705 SOCKBUF_LOCK(&so2->so_rcv); 706 if (unp2->unp_flags & UNP_WANTCRED) { 707 /* 708 * Credentials are passed only once on 709 * SOCK_STREAM. 710 */ 711 unp2->unp_flags &= ~UNP_WANTCRED; 712 control = unp_addsockcred(td, control); 713 } 714 /* 715 * Send to paired receive port, and then reduce send buffer 716 * hiwater marks to maintain backpressure. Wake up readers. 
717 */ 718 if (control != NULL) { 719 if (sbappendcontrol_locked(&so2->so_rcv, m, control)) 720 control = NULL; 721 } else { 722 sbappend_locked(&so2->so_rcv, m); 723 } 724 mbcnt = so2->so_rcv.sb_mbcnt - unp2->unp_mbcnt; 725 unp2->unp_mbcnt = so2->so_rcv.sb_mbcnt; 726 sbcc = so2->so_rcv.sb_cc; 727 sorwakeup_locked(so2); 728 729 SOCKBUF_LOCK(&so->so_snd); 730 newhiwat = so->so_snd.sb_hiwat - (sbcc - unp2->unp_cc); 731 (void)chgsbsize(so->so_cred->cr_uidinfo, &so->so_snd.sb_hiwat, 732 newhiwat, RLIM_INFINITY); 733 so->so_snd.sb_mbmax -= mbcnt; 734 SOCKBUF_UNLOCK(&so->so_snd); 735 736 unp2->unp_cc = sbcc; 737 m = NULL; 738 break; 739 740 default: 741 panic("uipc_send unknown socktype"); 742 } 743 744 /* 745 * SEND_EOF is equivalent to a SEND followed by 746 * a SHUTDOWN. 747 */ 748 if (flags & PRUS_EOF) { 749 socantsendmore(so); 750 unp_shutdown(unp); 751 } 752 UNP_UNLOCK(); 753 754 if (control != NULL && error != 0) 755 unp_dispose(control); 756 757release: 758 if (control != NULL) 759 m_freem(control); 760 if (m != NULL) 761 m_freem(m); 762 return (error); 763} 764 765static int 766uipc_sense(struct socket *so, struct stat *sb) 767{ 768 struct unpcb *unp; 769 struct socket *so2; 770 771 unp = sotounpcb(so); 772 KASSERT(unp != NULL, ("uipc_sense: unp == NULL")); 773 UNP_LOCK(); 774 sb->st_blksize = so->so_snd.sb_hiwat; 775 if (so->so_type == SOCK_STREAM && unp->unp_conn != NULL) { 776 so2 = unp->unp_conn->unp_socket; 777 sb->st_blksize += so2->so_rcv.sb_cc; 778 } 779 sb->st_dev = NODEV; 780 if (unp->unp_ino == 0) 781 unp->unp_ino = (++unp_ino == 0) ? 
++unp_ino : unp_ino; 782 sb->st_ino = unp->unp_ino; 783 UNP_UNLOCK(); 784 return (0); 785} 786 787static int 788uipc_shutdown(struct socket *so) 789{ 790 struct unpcb *unp; 791 792 unp = sotounpcb(so); 793 KASSERT(unp != NULL, ("uipc_shutdown: unp == NULL")); 794 UNP_LOCK(); 795 socantsendmore(so); 796 unp_shutdown(unp); 797 UNP_UNLOCK(); 798 return (0); 799} 800 801static int 802uipc_sockaddr(struct socket *so, struct sockaddr **nam) 803{ 804 struct unpcb *unp; 805 const struct sockaddr *sa; 806 807 unp = sotounpcb(so); 808 KASSERT(unp != NULL, ("uipc_sockaddr: unp == NULL")); 809 *nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK); 810 UNP_LOCK(); 811 if (unp->unp_addr != NULL) 812 sa = (struct sockaddr *) unp->unp_addr; 813 else 814 sa = &sun_noname; 815 bcopy(sa, *nam, sa->sa_len); 816 UNP_UNLOCK(); 817 return (0); 818} 819 820struct pr_usrreqs uipc_usrreqs = { 821 .pru_abort = uipc_abort, 822 .pru_accept = uipc_accept, 823 .pru_attach = uipc_attach, 824 .pru_bind = uipc_bind, 825 .pru_connect = uipc_connect, 826 .pru_connect2 = uipc_connect2, 827 .pru_detach = uipc_detach, 828 .pru_disconnect = uipc_disconnect, 829 .pru_listen = uipc_listen, 830 .pru_peeraddr = uipc_peeraddr, 831 .pru_rcvd = uipc_rcvd, 832 .pru_send = uipc_send, 833 .pru_sense = uipc_sense, 834 .pru_shutdown = uipc_shutdown, 835 .pru_sockaddr = uipc_sockaddr, 836 .pru_close = uipc_close, 837}; 838 839int 840uipc_ctloutput(struct socket *so, struct sockopt *sopt) 841{ 842 struct unpcb *unp; 843 struct xucred xu; 844 int error, optval; 845 846 if (sopt->sopt_level != 0) 847 return (EINVAL); 848 849 unp = sotounpcb(so); 850 KASSERT(unp != NULL, ("uipc_ctloutput: unp == NULL")); 851 error = 0; 852 switch (sopt->sopt_dir) { 853 case SOPT_GET: 854 switch (sopt->sopt_name) { 855 case LOCAL_PEERCRED: 856 UNP_LOCK(); 857 if (unp->unp_flags & UNP_HAVEPC) 858 xu = unp->unp_peercred; 859 else { 860 if (so->so_type == SOCK_STREAM) 861 error = ENOTCONN; 862 else 863 error = EINVAL; 864 } 865 
UNP_UNLOCK(); 866 if (error == 0) 867 error = sooptcopyout(sopt, &xu, sizeof(xu)); 868 break; 869 case LOCAL_CREDS: 870 /* Unocked read. */ 871 optval = unp->unp_flags & UNP_WANTCRED ? 1 : 0; 872 error = sooptcopyout(sopt, &optval, sizeof(optval)); 873 break; 874 case LOCAL_CONNWAIT: 875 /* Unocked read. */ 876 optval = unp->unp_flags & UNP_CONNWAIT ? 1 : 0; 877 error = sooptcopyout(sopt, &optval, sizeof(optval)); 878 break; 879 default: 880 error = EOPNOTSUPP; 881 break; 882 } 883 break; 884 case SOPT_SET: 885 switch (sopt->sopt_name) { 886 case LOCAL_CREDS: 887 case LOCAL_CONNWAIT: 888 error = sooptcopyin(sopt, &optval, sizeof(optval), 889 sizeof(optval)); 890 if (error) 891 break; 892 893#define OPTSET(bit) \ 894 if (optval) \ 895 unp->unp_flags |= bit; \ 896 else \ 897 unp->unp_flags &= ~bit; 898 899 UNP_LOCK(); 900 switch (sopt->sopt_name) { 901 case LOCAL_CREDS: 902 OPTSET(UNP_WANTCRED); 903 break; 904 case LOCAL_CONNWAIT: 905 OPTSET(UNP_CONNWAIT); 906 break; 907 default: 908 break; 909 } 910 UNP_UNLOCK(); 911 break; 912#undef OPTSET 913 default: 914 error = ENOPROTOOPT; 915 break; 916 } 917 break; 918 default: 919 error = EOPNOTSUPP; 920 break; 921 } 922 return (error); 923} 924 925static int 926unp_connect(struct socket *so, struct sockaddr *nam, struct thread *td) 927{ 928 struct sockaddr_un *soun = (struct sockaddr_un *)nam; 929 struct vnode *vp; 930 struct socket *so2, *so3; 931 struct unpcb *unp, *unp2, *unp3; 932 int error, len; 933 struct nameidata nd; 934 char buf[SOCK_MAXADDRLEN]; 935 struct sockaddr *sa; 936 937 UNP_LOCK_ASSERT(); 938 939 unp = sotounpcb(so); 940 KASSERT(unp != NULL, ("unp_connect: unp == NULL")); 941 len = nam->sa_len - offsetof(struct sockaddr_un, sun_path); 942 if (len <= 0) 943 return (EINVAL); 944 strlcpy(buf, soun->sun_path, len + 1); 945 if (unp->unp_flags & UNP_CONNECTING) { 946 UNP_UNLOCK(); 947 return (EALREADY); 948 } 949 unp->unp_flags |= UNP_CONNECTING; 950 UNP_UNLOCK(); 951 sa = malloc(sizeof(struct sockaddr_un), 
M_SONAME, M_WAITOK); 952 mtx_lock(&Giant); 953 NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, buf, td); 954 error = namei(&nd); 955 if (error) 956 vp = NULL; 957 else 958 vp = nd.ni_vp; 959 ASSERT_VOP_LOCKED(vp, "unp_connect"); 960 NDFREE(&nd, NDF_ONLY_PNBUF); 961 if (error) 962 goto bad; 963 964 if (vp->v_type != VSOCK) { 965 error = ENOTSOCK; 966 goto bad; 967 } 968 error = VOP_ACCESS(vp, VWRITE, td->td_ucred, td); 969 if (error) 970 goto bad; 971 mtx_unlock(&Giant); 972 UNP_LOCK(); 973 unp = sotounpcb(so); 974 KASSERT(unp != NULL, ("unp_connect: unp == NULL")); 975 so2 = vp->v_socket; 976 if (so2 == NULL) { 977 error = ECONNREFUSED; 978 goto bad2; 979 } 980 if (so->so_type != so2->so_type) { 981 error = EPROTOTYPE; 982 goto bad2; 983 } 984 if (so->so_proto->pr_flags & PR_CONNREQUIRED) { 985 if (so2->so_options & SO_ACCEPTCONN) 986 so3 = sonewconn(so2, 0); 987 else 988 so3 = NULL; 989 if (so3 == NULL) { 990 error = ECONNREFUSED; 991 goto bad2; 992 } 993 unp = sotounpcb(so); 994 unp2 = sotounpcb(so2); 995 unp3 = sotounpcb(so3); 996 if (unp2->unp_addr != NULL) { 997 bcopy(unp2->unp_addr, sa, unp2->unp_addr->sun_len); 998 unp3->unp_addr = (struct sockaddr_un *) sa; 999 sa = NULL; 1000 } 1001 /* 1002 * unp_peercred management: 1003 * 1004 * The connecter's (client's) credentials are copied from its 1005 * process structure at the time of connect() (which is now). 1006 */ 1007 cru2x(td->td_ucred, &unp3->unp_peercred); 1008 unp3->unp_flags |= UNP_HAVEPC; 1009 /* 1010 * The receiver's (server's) credentials are copied from the 1011 * unp_peercred member of socket on which the former called 1012 * listen(); unp_listen() cached that process's credentials 1013 * at that time so we can use them now. 
1014 */ 1015 KASSERT(unp2->unp_flags & UNP_HAVEPCCACHED, 1016 ("unp_connect: listener without cached peercred")); 1017 memcpy(&unp->unp_peercred, &unp2->unp_peercred, 1018 sizeof(unp->unp_peercred)); 1019 unp->unp_flags |= UNP_HAVEPC; 1020 if (unp2->unp_flags & UNP_WANTCRED) 1021 unp3->unp_flags |= UNP_WANTCRED; 1022#ifdef MAC 1023 SOCK_LOCK(so); 1024 mac_set_socket_peer_from_socket(so, so3); 1025 mac_set_socket_peer_from_socket(so3, so); 1026 SOCK_UNLOCK(so); 1027#endif 1028 1029 so2 = so3; 1030 } 1031 error = unp_connect2(so, so2, PRU_CONNECT); 1032bad2: 1033 UNP_UNLOCK(); 1034 mtx_lock(&Giant); 1035bad: 1036 mtx_assert(&Giant, MA_OWNED); 1037 if (vp != NULL) 1038 vput(vp); 1039 mtx_unlock(&Giant); 1040 free(sa, M_SONAME); 1041 UNP_LOCK(); 1042 unp->unp_flags &= ~UNP_CONNECTING; 1043 return (error); 1044} 1045 1046static int 1047unp_connect2(struct socket *so, struct socket *so2, int req) 1048{ 1049 struct unpcb *unp = sotounpcb(so); 1050 struct unpcb *unp2; 1051 1052 UNP_LOCK_ASSERT(); 1053 1054 if (so2->so_type != so->so_type) 1055 return (EPROTOTYPE); 1056 unp2 = sotounpcb(so2); 1057 KASSERT(unp2 != NULL, ("unp_connect2: unp2 == NULL")); 1058 unp->unp_conn = unp2; 1059 switch (so->so_type) { 1060 case SOCK_DGRAM: 1061 LIST_INSERT_HEAD(&unp2->unp_refs, unp, unp_reflink); 1062 soisconnected(so); 1063 break; 1064 1065 case SOCK_STREAM: 1066 unp2->unp_conn = unp; 1067 if (req == PRU_CONNECT && 1068 ((unp->unp_flags | unp2->unp_flags) & UNP_CONNWAIT)) 1069 soisconnecting(so); 1070 else 1071 soisconnected(so); 1072 soisconnected(so2); 1073 break; 1074 1075 default: 1076 panic("unp_connect2"); 1077 } 1078 return (0); 1079} 1080 1081static void 1082unp_disconnect(struct unpcb *unp) 1083{ 1084 struct unpcb *unp2 = unp->unp_conn; 1085 struct socket *so; 1086 1087 UNP_LOCK_ASSERT(); 1088 1089 if (unp2 == NULL) 1090 return; 1091 unp->unp_conn = NULL; 1092 switch (unp->unp_socket->so_type) { 1093 case SOCK_DGRAM: 1094 LIST_REMOVE(unp, unp_reflink); 1095 so = 
unp->unp_socket; 1096 SOCK_LOCK(so); 1097 so->so_state &= ~SS_ISCONNECTED; 1098 SOCK_UNLOCK(so); 1099 break; 1100 1101 case SOCK_STREAM: 1102 soisdisconnected(unp->unp_socket); 1103 unp2->unp_conn = NULL; 1104 soisdisconnected(unp2->unp_socket); 1105 break; 1106 } 1107} 1108 1109/* 1110 * unp_pcblist() assumes that UNIX domain socket memory is never reclaimed by 1111 * the zone (UMA_ZONE_NOFREE), and as such potentially stale pointers are 1112 * safe to reference. It first scans the list of struct unpcb's to generate 1113 * a pointer list, then it rescans its list one entry at a time to 1114 * externalize and copyout. It checks the generation number to see if a 1115 * struct unpcb has been reused, and will skip it if so. 1116 */ 1117static int 1118unp_pcblist(SYSCTL_HANDLER_ARGS) 1119{ 1120 int error, i, n; 1121 int freeunp; 1122 struct unpcb *unp, **unp_list; 1123 unp_gen_t gencnt; 1124 struct xunpgen *xug; 1125 struct unp_head *head; 1126 struct xunpcb *xu; 1127 1128 head = ((intptr_t)arg1 == SOCK_DGRAM ? &unp_dhead : &unp_shead); 1129 1130 /* 1131 * The process of preparing the PCB list is too time-consuming and 1132 * resource-intensive to repeat twice on every request. 1133 */ 1134 if (req->oldptr == NULL) { 1135 n = unp_count; 1136 req->oldidx = 2 * (sizeof *xug) 1137 + (n + n/8) * sizeof(struct xunpcb); 1138 return (0); 1139 } 1140 1141 if (req->newptr != NULL) 1142 return (EPERM); 1143 1144 /* 1145 * OK, now we're committed to doing something. 
1146 */ 1147 xug = malloc(sizeof(*xug), M_TEMP, M_WAITOK); 1148 UNP_LOCK(); 1149 gencnt = unp_gencnt; 1150 n = unp_count; 1151 UNP_UNLOCK(); 1152 1153 xug->xug_len = sizeof *xug; 1154 xug->xug_count = n; 1155 xug->xug_gen = gencnt; 1156 xug->xug_sogen = so_gencnt; 1157 error = SYSCTL_OUT(req, xug, sizeof *xug); 1158 if (error) { 1159 free(xug, M_TEMP); 1160 return (error); 1161 } 1162 1163 unp_list = malloc(n * sizeof *unp_list, M_TEMP, M_WAITOK); 1164 1165 UNP_LOCK(); 1166 for (unp = LIST_FIRST(head), i = 0; unp && i < n; 1167 unp = LIST_NEXT(unp, unp_link)) { 1168 if (unp->unp_gencnt <= gencnt) { 1169 if (cr_cansee(req->td->td_ucred, 1170 unp->unp_socket->so_cred)) 1171 continue; 1172 unp_list[i++] = unp; 1173 unp->unp_refcount++; 1174 } 1175 } 1176 UNP_UNLOCK(); 1177 n = i; /* In case we lost some during malloc. */ 1178 1179 error = 0; 1180 xu = malloc(sizeof(*xu), M_TEMP, M_WAITOK | M_ZERO); 1181 for (i = 0; i < n; i++) { 1182 unp = unp_list[i]; 1183 UNP_LOCK(); 1184 unp->unp_refcount--; 1185 if (unp->unp_refcount != 0 && unp->unp_gencnt <= gencnt) { 1186 xu->xu_len = sizeof *xu; 1187 xu->xu_unpp = unp; 1188 /* 1189 * XXX - need more locking here to protect against 1190 * connect/disconnect races for SMP. 1191 */ 1192 if (unp->unp_addr != NULL) 1193 bcopy(unp->unp_addr, &xu->xu_addr, 1194 unp->unp_addr->sun_len); 1195 if (unp->unp_conn != NULL && 1196 unp->unp_conn->unp_addr != NULL) 1197 bcopy(unp->unp_conn->unp_addr, 1198 &xu->xu_caddr, 1199 unp->unp_conn->unp_addr->sun_len); 1200 bcopy(unp, &xu->xu_unp, sizeof *unp); 1201 sotoxsocket(unp->unp_socket, &xu->xu_socket); 1202 UNP_UNLOCK(); 1203 error = SYSCTL_OUT(req, xu, sizeof *xu); 1204 } else { 1205 freeunp = (unp->unp_refcount == 0); 1206 UNP_UNLOCK(); 1207 if (freeunp) 1208 uma_zfree(unp_zone, unp); 1209 } 1210 } 1211 free(xu, M_TEMP); 1212 if (!error) { 1213 /* 1214 * Give the user an updated idea of our state. 
If the 1215 * generation differs from what we told her before, she knows 1216 * that something happened while we were processing this 1217 * request, and it might be necessary to retry. 1218 */ 1219 xug->xug_gen = unp_gencnt; 1220 xug->xug_sogen = so_gencnt; 1221 xug->xug_count = unp_count; 1222 error = SYSCTL_OUT(req, xug, sizeof *xug); 1223 } 1224 free(unp_list, M_TEMP); 1225 free(xug, M_TEMP); 1226 return (error); 1227} 1228 1229SYSCTL_PROC(_net_local_dgram, OID_AUTO, pcblist, CTLFLAG_RD, 1230 (caddr_t)(long)SOCK_DGRAM, 0, unp_pcblist, "S,xunpcb", 1231 "List of active local datagram sockets"); 1232SYSCTL_PROC(_net_local_stream, OID_AUTO, pcblist, CTLFLAG_RD, 1233 (caddr_t)(long)SOCK_STREAM, 0, unp_pcblist, "S,xunpcb", 1234 "List of active local stream sockets"); 1235 1236static void 1237unp_shutdown(struct unpcb *unp) 1238{ 1239 struct socket *so; 1240 1241 UNP_LOCK_ASSERT(); 1242 1243 if (unp->unp_socket->so_type == SOCK_STREAM && unp->unp_conn && 1244 (so = unp->unp_conn->unp_socket)) 1245 socantrcvmore(so); 1246} 1247 1248static void 1249unp_drop(struct unpcb *unp, int errno) 1250{ 1251 struct socket *so = unp->unp_socket; 1252 1253 UNP_LOCK_ASSERT(); 1254 1255 so->so_error = errno; 1256 unp_disconnect(unp); 1257} 1258 1259static void 1260unp_freerights(struct file **rp, int fdcount) 1261{ 1262 int i; 1263 struct file *fp; 1264 1265 for (i = 0; i < fdcount; i++) { 1266 fp = *rp; 1267 /* 1268 * Zero the pointer before calling unp_discard since it may 1269 * end up in unp_gc().. 1270 * 1271 * XXXRW: This is less true than it used to be. 
1272 */ 1273 *rp++ = 0; 1274 unp_discard(fp); 1275 } 1276} 1277 1278int 1279unp_externalize(struct mbuf *control, struct mbuf **controlp) 1280{ 1281 struct thread *td = curthread; /* XXX */ 1282 struct cmsghdr *cm = mtod(control, struct cmsghdr *); 1283 int i; 1284 int *fdp; 1285 struct file **rp; 1286 struct file *fp; 1287 void *data; 1288 socklen_t clen = control->m_len, datalen; 1289 int error, newfds; 1290 int f; 1291 u_int newlen; 1292 1293 UNP_UNLOCK_ASSERT(); 1294 1295 error = 0; 1296 if (controlp != NULL) /* controlp == NULL => free control messages */ 1297 *controlp = NULL; 1298 1299 while (cm != NULL) { 1300 if (sizeof(*cm) > clen || cm->cmsg_len > clen) { 1301 error = EINVAL; 1302 break; 1303 } 1304 1305 data = CMSG_DATA(cm); 1306 datalen = (caddr_t)cm + cm->cmsg_len - (caddr_t)data; 1307 1308 if (cm->cmsg_level == SOL_SOCKET 1309 && cm->cmsg_type == SCM_RIGHTS) { 1310 newfds = datalen / sizeof(struct file *); 1311 rp = data; 1312 1313 /* If we're not outputting the descriptors free them. */ 1314 if (error || controlp == NULL) { 1315 unp_freerights(rp, newfds); 1316 goto next; 1317 } 1318 FILEDESC_LOCK(td->td_proc->p_fd); 1319 /* if the new FD's will not fit free them. */ 1320 if (!fdavail(td, newfds)) { 1321 FILEDESC_UNLOCK(td->td_proc->p_fd); 1322 error = EMSGSIZE; 1323 unp_freerights(rp, newfds); 1324 goto next; 1325 } 1326 /* 1327 * Now change each pointer to an fd in the global 1328 * table to an integer that is the index to the local 1329 * fd table entry that we set up to point to the 1330 * global one we are transferring. 
1331 */ 1332 newlen = newfds * sizeof(int); 1333 *controlp = sbcreatecontrol(NULL, newlen, 1334 SCM_RIGHTS, SOL_SOCKET); 1335 if (*controlp == NULL) { 1336 FILEDESC_UNLOCK(td->td_proc->p_fd); 1337 error = E2BIG; 1338 unp_freerights(rp, newfds); 1339 goto next; 1340 } 1341 1342 fdp = (int *) 1343 CMSG_DATA(mtod(*controlp, struct cmsghdr *)); 1344 for (i = 0; i < newfds; i++) { 1345 if (fdalloc(td, 0, &f)) 1346 panic("unp_externalize fdalloc failed"); 1347 fp = *rp++; 1348 td->td_proc->p_fd->fd_ofiles[f] = fp; 1349 FILE_LOCK(fp); 1350 fp->f_msgcount--; 1351 FILE_UNLOCK(fp); 1352 unp_rights--; 1353 *fdp++ = f; 1354 } 1355 FILEDESC_UNLOCK(td->td_proc->p_fd); 1356 } else { 1357 /* We can just copy anything else across. */ 1358 if (error || controlp == NULL) 1359 goto next; 1360 *controlp = sbcreatecontrol(NULL, datalen, 1361 cm->cmsg_type, cm->cmsg_level); 1362 if (*controlp == NULL) { 1363 error = ENOBUFS; 1364 goto next; 1365 } 1366 bcopy(data, 1367 CMSG_DATA(mtod(*controlp, struct cmsghdr *)), 1368 datalen); 1369 } 1370 1371 controlp = &(*controlp)->m_next; 1372 1373next: 1374 if (CMSG_SPACE(datalen) < clen) { 1375 clen -= CMSG_SPACE(datalen); 1376 cm = (struct cmsghdr *) 1377 ((caddr_t)cm + CMSG_SPACE(datalen)); 1378 } else { 1379 clen = 0; 1380 cm = NULL; 1381 } 1382 } 1383 1384 m_freem(control); 1385 1386 return (error); 1387} 1388 1389static void 1390unp_zone_change(void *tag) 1391{ 1392 1393 uma_zone_set_max(unp_zone, maxsockets); 1394} 1395 1396void 1397unp_init(void) 1398{ 1399 1400 unp_zone = uma_zcreate("unpcb", sizeof(struct unpcb), NULL, NULL, 1401 NULL, NULL, UMA_ALIGN_PTR, 0); 1402 if (unp_zone == NULL) 1403 panic("unp_init"); 1404 uma_zone_set_max(unp_zone, maxsockets); 1405 EVENTHANDLER_REGISTER(maxsockets_change, unp_zone_change, 1406 NULL, EVENTHANDLER_PRI_ANY); 1407 LIST_INIT(&unp_dhead); 1408 LIST_INIT(&unp_shead); 1409 TASK_INIT(&unp_gc_task, 0, unp_gc, NULL); 1410 UNP_LOCK_INIT(); 1411} 1412 1413static int 1414unp_internalize(struct mbuf 
**controlp, struct thread *td) 1415{ 1416 struct mbuf *control = *controlp; 1417 struct proc *p = td->td_proc; 1418 struct filedesc *fdescp = p->p_fd; 1419 struct cmsghdr *cm = mtod(control, struct cmsghdr *); 1420 struct cmsgcred *cmcred; 1421 struct file **rp; 1422 struct file *fp; 1423 struct timeval *tv; 1424 int i, fd, *fdp; 1425 void *data; 1426 socklen_t clen = control->m_len, datalen; 1427 int error, oldfds; 1428 u_int newlen; 1429 1430 UNP_UNLOCK_ASSERT(); 1431 1432 error = 0; 1433 *controlp = NULL; 1434 1435 while (cm != NULL) { 1436 if (sizeof(*cm) > clen || cm->cmsg_level != SOL_SOCKET 1437 || cm->cmsg_len > clen) { 1438 error = EINVAL; 1439 goto out; 1440 } 1441 1442 data = CMSG_DATA(cm); 1443 datalen = (caddr_t)cm + cm->cmsg_len - (caddr_t)data; 1444 1445 switch (cm->cmsg_type) { 1446 /* 1447 * Fill in credential information. 1448 */ 1449 case SCM_CREDS: 1450 *controlp = sbcreatecontrol(NULL, sizeof(*cmcred), 1451 SCM_CREDS, SOL_SOCKET); 1452 if (*controlp == NULL) { 1453 error = ENOBUFS; 1454 goto out; 1455 } 1456 1457 cmcred = (struct cmsgcred *) 1458 CMSG_DATA(mtod(*controlp, struct cmsghdr *)); 1459 cmcred->cmcred_pid = p->p_pid; 1460 cmcred->cmcred_uid = td->td_ucred->cr_ruid; 1461 cmcred->cmcred_gid = td->td_ucred->cr_rgid; 1462 cmcred->cmcred_euid = td->td_ucred->cr_uid; 1463 cmcred->cmcred_ngroups = MIN(td->td_ucred->cr_ngroups, 1464 CMGROUP_MAX); 1465 for (i = 0; i < cmcred->cmcred_ngroups; i++) 1466 cmcred->cmcred_groups[i] = 1467 td->td_ucred->cr_groups[i]; 1468 break; 1469 1470 case SCM_RIGHTS: 1471 oldfds = datalen / sizeof (int); 1472 /* 1473 * Check that all the FDs passed in refer to legal 1474 * files. If not, reject the entire operation. 
1475 */ 1476 fdp = data; 1477 FILEDESC_LOCK(fdescp); 1478 for (i = 0; i < oldfds; i++) { 1479 fd = *fdp++; 1480 if ((unsigned)fd >= fdescp->fd_nfiles || 1481 fdescp->fd_ofiles[fd] == NULL) { 1482 FILEDESC_UNLOCK(fdescp); 1483 error = EBADF; 1484 goto out; 1485 } 1486 fp = fdescp->fd_ofiles[fd]; 1487 if (!(fp->f_ops->fo_flags & DFLAG_PASSABLE)) { 1488 FILEDESC_UNLOCK(fdescp); 1489 error = EOPNOTSUPP; 1490 goto out; 1491 } 1492 1493 } 1494 /* 1495 * Now replace the integer FDs with pointers to the 1496 * associated global file table entry.. 1497 */ 1498 newlen = oldfds * sizeof(struct file *); 1499 *controlp = sbcreatecontrol(NULL, newlen, 1500 SCM_RIGHTS, SOL_SOCKET); 1501 if (*controlp == NULL) { 1502 FILEDESC_UNLOCK(fdescp); 1503 error = E2BIG; 1504 goto out; 1505 } 1506 1507 fdp = data; 1508 rp = (struct file **) 1509 CMSG_DATA(mtod(*controlp, struct cmsghdr *)); 1510 for (i = 0; i < oldfds; i++) { 1511 fp = fdescp->fd_ofiles[*fdp++]; 1512 *rp++ = fp; 1513 FILE_LOCK(fp); 1514 fp->f_count++; 1515 fp->f_msgcount++; 1516 FILE_UNLOCK(fp); 1517 unp_rights++; 1518 } 1519 FILEDESC_UNLOCK(fdescp); 1520 break; 1521 1522 case SCM_TIMESTAMP: 1523 *controlp = sbcreatecontrol(NULL, sizeof(*tv), 1524 SCM_TIMESTAMP, SOL_SOCKET); 1525 if (*controlp == NULL) { 1526 error = ENOBUFS; 1527 goto out; 1528 } 1529 tv = (struct timeval *) 1530 CMSG_DATA(mtod(*controlp, struct cmsghdr *)); 1531 microtime(tv); 1532 break; 1533 1534 default: 1535 error = EINVAL; 1536 goto out; 1537 } 1538 1539 controlp = &(*controlp)->m_next; 1540 1541 if (CMSG_SPACE(datalen) < clen) { 1542 clen -= CMSG_SPACE(datalen); 1543 cm = (struct cmsghdr *) 1544 ((caddr_t)cm + CMSG_SPACE(datalen)); 1545 } else { 1546 clen = 0; 1547 cm = NULL; 1548 } 1549 } 1550 1551out: 1552 m_freem(control); 1553 1554 return (error); 1555} 1556 1557static struct mbuf * 1558unp_addsockcred(struct thread *td, struct mbuf *control) 1559{ 1560 struct mbuf *m, *n, *n_prev; 1561 struct sockcred *sc; 1562 const struct cmsghdr *cm; 1563 
int ngroups; 1564 int i; 1565 1566 ngroups = MIN(td->td_ucred->cr_ngroups, CMGROUP_MAX); 1567 1568 m = sbcreatecontrol(NULL, SOCKCREDSIZE(ngroups), SCM_CREDS, SOL_SOCKET); 1569 if (m == NULL) 1570 return (control); 1571 1572 sc = (struct sockcred *) CMSG_DATA(mtod(m, struct cmsghdr *)); 1573 sc->sc_uid = td->td_ucred->cr_ruid; 1574 sc->sc_euid = td->td_ucred->cr_uid; 1575 sc->sc_gid = td->td_ucred->cr_rgid; 1576 sc->sc_egid = td->td_ucred->cr_gid; 1577 sc->sc_ngroups = ngroups; 1578 for (i = 0; i < sc->sc_ngroups; i++) 1579 sc->sc_groups[i] = td->td_ucred->cr_groups[i]; 1580 1581 /* 1582 * Unlink SCM_CREDS control messages (struct cmsgcred), since just 1583 * created SCM_CREDS control message (struct sockcred) has another 1584 * format. 1585 */ 1586 if (control != NULL) 1587 for (n = control, n_prev = NULL; n != NULL;) { 1588 cm = mtod(n, struct cmsghdr *); 1589 if (cm->cmsg_level == SOL_SOCKET && 1590 cm->cmsg_type == SCM_CREDS) { 1591 if (n_prev == NULL) 1592 control = n->m_next; 1593 else 1594 n_prev->m_next = n->m_next; 1595 n = m_free(n); 1596 } else { 1597 n_prev = n; 1598 n = n->m_next; 1599 } 1600 } 1601 1602 /* Prepend it to the head. */ 1603 m->m_next = control; 1604 1605 return (m); 1606} 1607 1608/* 1609 * unp_defer indicates whether additional work has been defered for a future 1610 * pass through unp_gc(). It is thread local and does not require explicit 1611 * synchronization. 
1612 */ 1613static int unp_defer; 1614 1615static int unp_taskcount; 1616SYSCTL_INT(_net_local, OID_AUTO, taskcount, CTLFLAG_RD, &unp_taskcount, 0, ""); 1617 1618static int unp_recycled; 1619SYSCTL_INT(_net_local, OID_AUTO, recycled, CTLFLAG_RD, &unp_recycled, 0, ""); 1620 1621static void 1622unp_gc(__unused void *arg, int pending) 1623{ 1624 struct file *fp, *nextfp; 1625 struct socket *so; 1626 struct file **extra_ref, **fpp; 1627 int nunref, i; 1628 int nfiles_snap; 1629 int nfiles_slack = 20; 1630 1631 unp_taskcount++; 1632 unp_defer = 0; 1633 /* 1634 * Before going through all this, set all FDs to be NOT deferred and 1635 * NOT externally accessible. 1636 */ 1637 sx_slock(&filelist_lock); 1638 LIST_FOREACH(fp, &filehead, f_list) 1639 fp->f_gcflag &= ~(FMARK|FDEFER); 1640 do { 1641 KASSERT(unp_defer >= 0, ("unp_gc: unp_defer %d", unp_defer)); 1642 LIST_FOREACH(fp, &filehead, f_list) { 1643 FILE_LOCK(fp); 1644 /* 1645 * If the file is not open, skip it -- could be a 1646 * file in the process of being opened, or in the 1647 * process of being closed. If the file is 1648 * "closing", it may have been marked for deferred 1649 * consideration. Clear the flag now if so. 1650 */ 1651 if (fp->f_count == 0) { 1652 if (fp->f_gcflag & FDEFER) 1653 unp_defer--; 1654 fp->f_gcflag &= ~(FMARK|FDEFER); 1655 FILE_UNLOCK(fp); 1656 continue; 1657 } 1658 /* 1659 * If we already marked it as 'defer' in a 1660 * previous pass, then try to process it this 1661 * time and un-mark it. 1662 */ 1663 if (fp->f_gcflag & FDEFER) { 1664 fp->f_gcflag &= ~FDEFER; 1665 unp_defer--; 1666 } else { 1667 /* 1668 * if it's not deferred, then check if it's 1669 * already marked.. if so skip it 1670 */ 1671 if (fp->f_gcflag & FMARK) { 1672 FILE_UNLOCK(fp); 1673 continue; 1674 } 1675 /* 1676 * If all references are from messages in 1677 * transit, then skip it. it's not externally 1678 * accessible. 
1679 */ 1680 if (fp->f_count == fp->f_msgcount) { 1681 FILE_UNLOCK(fp); 1682 continue; 1683 } 1684 /* 1685 * If it got this far then it must be 1686 * externally accessible. 1687 */ 1688 fp->f_gcflag |= FMARK; 1689 } 1690 /* 1691 * Either it was deferred, or it is externally 1692 * accessible and not already marked so. Now check 1693 * if it is possibly one of OUR sockets. 1694 */ 1695 if (fp->f_type != DTYPE_SOCKET || 1696 (so = fp->f_data) == NULL) { 1697 FILE_UNLOCK(fp); 1698 continue; 1699 } 1700 if (so->so_proto->pr_domain != &localdomain || 1701 (so->so_proto->pr_flags & PR_RIGHTS) == 0) { 1702 FILE_UNLOCK(fp); 1703 continue; 1704 } 1705 1706 /* 1707 * Tell any other threads that do a subsequent 1708 * fdrop() that we are scanning the message 1709 * buffers. 1710 */ 1711 fp->f_gcflag |= FWAIT; 1712 FILE_UNLOCK(fp); 1713 1714 /* 1715 * So, Ok, it's one of our sockets and it IS 1716 * externally accessible (or was deferred). Now we 1717 * look to see if we hold any file descriptors in its 1718 * message buffers. Follow those links and mark them 1719 * as accessible too. 1720 */ 1721 SOCKBUF_LOCK(&so->so_rcv); 1722 unp_scan(so->so_rcv.sb_mb, unp_mark); 1723 SOCKBUF_UNLOCK(&so->so_rcv); 1724 1725 /* 1726 * Wake up any threads waiting in fdrop(). 1727 */ 1728 FILE_LOCK(fp); 1729 fp->f_gcflag &= ~FWAIT; 1730 wakeup(&fp->f_gcflag); 1731 FILE_UNLOCK(fp); 1732 } 1733 } while (unp_defer); 1734 sx_sunlock(&filelist_lock); 1735 /* 1736 * XXXRW: The following comments need updating for a post-SMPng and 1737 * deferred unp_gc() world, but are still generally accurate. 1738 * 1739 * We grab an extra reference to each of the file table entries that 1740 * are not otherwise accessible and then free the rights that are 1741 * stored in messages on them. 1742 * 1743 * The bug in the orginal code is a little tricky, so I'll describe 1744 * what's wrong with it here. 
1745 * 1746 * It is incorrect to simply unp_discard each entry for f_msgcount 1747 * times -- consider the case of sockets A and B that contain 1748 * references to each other. On a last close of some other socket, 1749 * we trigger a gc since the number of outstanding rights (unp_rights) 1750 * is non-zero. If during the sweep phase the gc code unp_discards, 1751 * we end up doing a (full) closef on the descriptor. A closef on A 1752 * results in the following chain. Closef calls soo_close, which 1753 * calls soclose. Soclose calls first (through the switch 1754 * uipc_usrreq) unp_detach, which re-invokes unp_gc. Unp_gc simply 1755 * returns because the previous instance had set unp_gcing, and we 1756 * return all the way back to soclose, which marks the socket with 1757 * SS_NOFDREF, and then calls sofree. Sofree calls sorflush to free 1758 * up the rights that are queued in messages on the socket A, i.e., 1759 * the reference on B. The sorflush calls via the dom_dispose switch 1760 * unp_dispose, which unp_scans with unp_discard. This second 1761 * instance of unp_discard just calls closef on B. 1762 * 1763 * Well, a similar chain occurs on B, resulting in a sorflush on B, 1764 * which results in another closef on A. Unfortunately, A is already 1765 * being closed, and the descriptor has already been marked with 1766 * SS_NOFDREF, and soclose panics at this point. 1767 * 1768 * Here, we first take an extra reference to each inaccessible 1769 * descriptor. Then, we call sorflush ourself, since we know it is a 1770 * Unix domain socket anyhow. After we destroy all the rights 1771 * carried in messages, we do a last closef to get rid of our extra 1772 * reference. This is the last close, and the unp_detach etc will 1773 * shut down the socket. 
1774 * 1775 * 91/09/19, bsy@cs.cmu.edu 1776 */ 1777again: 1778 nfiles_snap = openfiles + nfiles_slack; /* some slack */ 1779 extra_ref = malloc(nfiles_snap * sizeof(struct file *), M_TEMP, 1780 M_WAITOK); 1781 sx_slock(&filelist_lock); 1782 if (nfiles_snap < openfiles) { 1783 sx_sunlock(&filelist_lock); 1784 free(extra_ref, M_TEMP); 1785 nfiles_slack += 20; 1786 goto again; 1787 } 1788 for (nunref = 0, fp = LIST_FIRST(&filehead), fpp = extra_ref; 1789 fp != NULL; fp = nextfp) { 1790 nextfp = LIST_NEXT(fp, f_list); 1791 FILE_LOCK(fp); 1792 /* 1793 * If it's not open, skip it 1794 */ 1795 if (fp->f_count == 0) { 1796 FILE_UNLOCK(fp); 1797 continue; 1798 } 1799 /* 1800 * If all refs are from msgs, and it's not marked accessible 1801 * then it must be referenced from some unreachable cycle of 1802 * (shut-down) FDs, so include it in our list of FDs to 1803 * remove. 1804 */ 1805 if (fp->f_count == fp->f_msgcount && !(fp->f_gcflag & FMARK)) { 1806 *fpp++ = fp; 1807 nunref++; 1808 fp->f_count++; 1809 } 1810 FILE_UNLOCK(fp); 1811 } 1812 sx_sunlock(&filelist_lock); 1813 /* 1814 * For each FD on our hit list, do the following two things: 1815 */ 1816 for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp) { 1817 struct file *tfp = *fpp; 1818 FILE_LOCK(tfp); 1819 if (tfp->f_type == DTYPE_SOCKET && 1820 tfp->f_data != NULL) { 1821 FILE_UNLOCK(tfp); 1822 sorflush(tfp->f_data); 1823 } else { 1824 FILE_UNLOCK(tfp); 1825 } 1826 } 1827 for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp) { 1828 closef(*fpp, (struct thread *) NULL); 1829 unp_recycled++; 1830 } 1831 free(extra_ref, M_TEMP); 1832} 1833 1834void 1835unp_dispose(struct mbuf *m) 1836{ 1837 1838 if (m) 1839 unp_scan(m, unp_discard); 1840} 1841 1842static int 1843unp_listen(struct socket *so, struct unpcb *unp, int backlog, 1844 struct thread *td) 1845{ 1846 int error; 1847 1848 UNP_LOCK_ASSERT(); 1849 1850 SOCK_LOCK(so); 1851 error = solisten_proto_check(so); 1852 if (error == 0) { 1853 cru2x(td->td_ucred, &unp->unp_peercred); 
1854 unp->unp_flags |= UNP_HAVEPCCACHED; 1855 solisten_proto(so, backlog); 1856 } 1857 SOCK_UNLOCK(so); 1858 return (error); 1859} 1860 1861static void 1862unp_scan(struct mbuf *m0, void (*op)(struct file *)) 1863{ 1864 struct mbuf *m; 1865 struct file **rp; 1866 struct cmsghdr *cm; 1867 void *data; 1868 int i; 1869 socklen_t clen, datalen; 1870 int qfds; 1871 1872 while (m0 != NULL) { 1873 for (m = m0; m; m = m->m_next) { 1874 if (m->m_type != MT_CONTROL) 1875 continue; 1876 1877 cm = mtod(m, struct cmsghdr *); 1878 clen = m->m_len; 1879 1880 while (cm != NULL) { 1881 if (sizeof(*cm) > clen || cm->cmsg_len > clen) 1882 break; 1883 1884 data = CMSG_DATA(cm); 1885 datalen = (caddr_t)cm + cm->cmsg_len 1886 - (caddr_t)data; 1887 1888 if (cm->cmsg_level == SOL_SOCKET && 1889 cm->cmsg_type == SCM_RIGHTS) { 1890 qfds = datalen / sizeof (struct file *); 1891 rp = data; 1892 for (i = 0; i < qfds; i++) 1893 (*op)(*rp++); 1894 } 1895 1896 if (CMSG_SPACE(datalen) < clen) { 1897 clen -= CMSG_SPACE(datalen); 1898 cm = (struct cmsghdr *) 1899 ((caddr_t)cm + CMSG_SPACE(datalen)); 1900 } else { 1901 clen = 0; 1902 cm = NULL; 1903 } 1904 } 1905 } 1906 m0 = m0->m_act; 1907 } 1908} 1909 1910static void 1911unp_mark(struct file *fp) 1912{ 1913 if (fp->f_gcflag & FMARK) 1914 return; 1915 unp_defer++; 1916 fp->f_gcflag |= (FMARK|FDEFER); 1917} 1918 1919static void 1920unp_discard(struct file *fp) 1921{ 1922 UNP_LOCK(); 1923 FILE_LOCK(fp); 1924 fp->f_msgcount--; 1925 unp_rights--; 1926 FILE_UNLOCK(fp); 1927 UNP_UNLOCK(); 1928 (void) closef(fp, (struct thread *)NULL); 1929} 1930