uipc_usrreq.c revision 160583
1/*- 2 * Copyright (c) 1982, 1986, 1989, 1991, 1993 3 * The Regents of the University of California. 4 * Copyright 2004-2006 Robert N. M. Watson 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 4. Neither the name of the University nor the names of its contributors 16 * may be used to endorse or promote products derived from this software 17 * without specific prior written permission. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 29 * SUCH DAMAGE. 
30 * 31 * From: @(#)uipc_usrreq.c 8.3 (Berkeley) 1/4/94 32 */ 33 34#include <sys/cdefs.h> 35__FBSDID("$FreeBSD: head/sys/kern/uipc_usrreq.c 160583 2006-07-22 17:24:55Z rwatson $"); 36 37#include "opt_mac.h" 38 39#include <sys/param.h> 40#include <sys/domain.h> 41#include <sys/fcntl.h> 42#include <sys/malloc.h> /* XXX must be before <sys/file.h> */ 43#include <sys/eventhandler.h> 44#include <sys/file.h> 45#include <sys/filedesc.h> 46#include <sys/jail.h> 47#include <sys/kernel.h> 48#include <sys/lock.h> 49#include <sys/mac.h> 50#include <sys/mbuf.h> 51#include <sys/mount.h> 52#include <sys/mutex.h> 53#include <sys/namei.h> 54#include <sys/proc.h> 55#include <sys/protosw.h> 56#include <sys/resourcevar.h> 57#include <sys/socket.h> 58#include <sys/socketvar.h> 59#include <sys/signalvar.h> 60#include <sys/stat.h> 61#include <sys/sx.h> 62#include <sys/sysctl.h> 63#include <sys/systm.h> 64#include <sys/taskqueue.h> 65#include <sys/un.h> 66#include <sys/unpcb.h> 67#include <sys/vnode.h> 68 69#include <vm/uma.h> 70 71static uma_zone_t unp_zone; 72static unp_gen_t unp_gencnt; 73static u_int unp_count; 74 75static struct unp_head unp_shead, unp_dhead; 76 77/* 78 * Unix communications domain. 79 * 80 * TODO: 81 * SEQPACKET, RDM 82 * rethink name space problems 83 * need a proper out-of-band 84 * lock pushdown 85 */ 86static const struct sockaddr sun_noname = { sizeof(sun_noname), AF_LOCAL }; 87static ino_t unp_ino; /* prototype for fake inode numbers */ 88struct mbuf *unp_addsockcred(struct thread *, struct mbuf *); 89 90/* 91 * Currently, UNIX domain sockets are protected by a single subsystem lock, 92 * which covers global data structures and variables, the contents of each 93 * per-socket unpcb structure, and the so_pcb field in sockets attached to 94 * the UNIX domain. This provides for a moderate degree of paralellism, as 95 * receive operations on UNIX domain sockets do not need to acquire the 96 * subsystem lock. 
Finer grained locking to permit send() without acquiring
 * a global lock would be a logical next step.
 *
 * The UNIX domain socket lock precedes all socket layer locks, including the
 * socket lock and socket buffer lock, permitting UNIX domain socket code to
 * call into socket support routines without releasing its locks.
 *
 * Some caution is required in areas where the UNIX domain socket code enters
 * VFS in order to create or find rendezvous points.  This results in
 * dropping of the UNIX domain socket subsystem lock, acquisition of the
 * Giant lock, and potential sleeping.  This increases the chances of races,
 * and exposes weaknesses in the socket->protocol API by offering poor
 * failure modes.
 */
static struct mtx unp_mtx;
/* Initialize the subsystem mutex as a default (MTX_DEF) mutex named "unp". */
#define	UNP_LOCK_INIT() \
	mtx_init(&unp_mtx, "unp", NULL, MTX_DEF)
/* Acquire / release the single UNIX domain socket subsystem mutex. */
#define	UNP_LOCK()		mtx_lock(&unp_mtx)
#define	UNP_UNLOCK()		mtx_unlock(&unp_mtx)
/* Assert that the current thread does (or does not) own the mutex. */
#define	UNP_LOCK_ASSERT()	mtx_assert(&unp_mtx, MA_OWNED)
#define	UNP_UNLOCK_ASSERT()	mtx_assert(&unp_mtx, MA_NOTOWNED)

/*
 * Garbage collection of cyclic file descriptor/socket references occurs
 * asynchronously in a taskqueue context in order to avoid recursion and
 * reentrance in the UNIX domain socket, file descriptor, and socket layer
 * code.  See unp_gc() for a full description.
 */
static struct task	unp_gc_task;

/* Forward declarations of the file-local protocol helpers. */
static int     unp_attach(struct socket *);
static void    unp_detach(struct unpcb *);
static int     unp_bind(struct unpcb *,struct sockaddr *, struct thread *);
static int     unp_connect(struct socket *,struct sockaddr *, struct thread *);
static int     unp_connect2(struct socket *so, struct socket *so2, int);
static void    unp_disconnect(struct unpcb *);
static void    unp_shutdown(struct unpcb *);
static void    unp_drop(struct unpcb *, int);
static void    unp_gc(__unused void *, int);
static void    unp_scan(struct mbuf *, void (*)(struct file *));
static void    unp_mark(struct file *);
static void    unp_discard(struct file *);
static void    unp_freerights(struct file **, int);
static int     unp_internalize(struct mbuf **, struct thread *);
static int     unp_listen(struct socket *, struct unpcb *, int,
		   struct thread *);

/*
 * pru_abort: drop the socket's connection with ECONNABORTED while holding
 * the subsystem lock.
 */
static void
uipc_abort(struct socket *so)
{
	struct unpcb *unp;

	unp = sotounpcb(so);
	KASSERT(unp != NULL, ("uipc_abort: unp == NULL"));
	UNP_LOCK();
	unp_drop(unp, ECONNABORTED);
	UNP_UNLOCK();
}

/*
 * pru_accept: return in *nam a freshly allocated (M_SONAME) copy of the
 * peer's bound address, or of sun_noname when there is no peer or the peer
 * is unbound.  Always returns 0.
 */
static int
uipc_accept(struct socket *so, struct sockaddr **nam)
{
	struct unpcb *unp;
	const struct sockaddr *sa;

	/*
	 * Pass back name of connected socket, if it was bound and we are
	 * still connected (our peer may have closed already!).
	 */
	unp = sotounpcb(so);
	KASSERT(unp != NULL, ("uipc_accept: unp == NULL"));
	/* Allocate before taking the mutex: M_WAITOK may sleep. */
	*nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK);
	UNP_LOCK();
	if (unp->unp_conn != NULL && unp->unp_conn->unp_addr != NULL)
		sa = (struct sockaddr *) unp->unp_conn->unp_addr;
	else
		sa = &sun_noname;
	/*
	 * NOTE(review): copies sa->sa_len bytes into a buffer of
	 * sizeof(struct sockaddr_un); confirm the bind path bounds sun_len.
	 */
	bcopy(sa, *nam, sa->sa_len);
	UNP_UNLOCK();
	return (0);
}

/*
 * pru_attach: thin wrapper around unp_attach(); proto and td are unused.
 */
static int
uipc_attach(struct socket *so, int proto, struct thread *td)
{

	return (unp_attach(so));
}

/*
 * pru_bind: call unp_bind() with the subsystem lock held and return its
 * result.
 */
static int
uipc_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
{
	struct unpcb *unp;
	int error;

	unp = sotounpcb(so);
	KASSERT(unp != NULL, ("uipc_bind: unp == NULL"));
	UNP_LOCK();
	error = unp_bind(unp, nam, td);
	UNP_UNLOCK();
	return (error);
}

/*
 * pru_connect: call unp_connect() with the subsystem lock held and return
 * its result.  The caller must pass the current thread.
 */
static int
uipc_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
{
	int error;

	KASSERT(td == curthread, ("uipc_connect: td != curthread"));
	UNP_LOCK();
	error = unp_connect(so, nam, td);
	UNP_UNLOCK();
	return (error);
}

/*
 * XXXRW: Should also unbind?
213 */ 214static void 215uipc_close(struct socket *so) 216{ 217 struct unpcb *unp; 218 219 unp = sotounpcb(so); 220 KASSERT(unp != NULL, ("uipc_close: unp == NULL")); 221 UNP_LOCK(); 222 unp_disconnect(unp); 223 UNP_UNLOCK(); 224} 225 226int 227uipc_connect2(struct socket *so1, struct socket *so2) 228{ 229 struct unpcb *unp; 230 int error; 231 232 unp = sotounpcb(so1); 233 KASSERT(unp != NULL, ("uipc_connect2: unp == NULL")); 234 UNP_LOCK(); 235 error = unp_connect2(so1, so2, PRU_CONNECT2); 236 UNP_UNLOCK(); 237 return (error); 238} 239 240/* control is EOPNOTSUPP */ 241 242static void 243uipc_detach(struct socket *so) 244{ 245 struct unpcb *unp; 246 247 unp = sotounpcb(so); 248 KASSERT(unp != NULL, ("uipc_detach: unp == NULL")); 249 UNP_LOCK(); 250 unp_detach(unp); 251 UNP_UNLOCK_ASSERT(); 252} 253 254static int 255uipc_disconnect(struct socket *so) 256{ 257 struct unpcb *unp; 258 259 unp = sotounpcb(so); 260 KASSERT(unp != NULL, ("uipc_disconnect: unp == NULL")); 261 UNP_LOCK(); 262 unp_disconnect(unp); 263 UNP_UNLOCK(); 264 return (0); 265} 266 267static int 268uipc_listen(struct socket *so, int backlog, struct thread *td) 269{ 270 struct unpcb *unp; 271 int error; 272 273 unp = sotounpcb(so); 274 KASSERT(unp != NULL, ("uipc_listen: unp == NULL")); 275 UNP_LOCK(); 276 if (unp->unp_vnode == NULL) { 277 UNP_UNLOCK(); 278 return (EINVAL); 279 } 280 error = unp_listen(so, unp, backlog, td); 281 UNP_UNLOCK(); 282 return (error); 283} 284 285static int 286uipc_peeraddr(struct socket *so, struct sockaddr **nam) 287{ 288 struct unpcb *unp; 289 const struct sockaddr *sa; 290 291 unp = sotounpcb(so); 292 KASSERT(unp != NULL, ("uipc_peeraddr: unp == NULL")); 293 *nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK); 294 UNP_LOCK(); 295 if (unp->unp_conn != NULL && unp->unp_conn->unp_addr!= NULL) 296 sa = (struct sockaddr *) unp->unp_conn->unp_addr; 297 else { 298 /* 299 * XXX: It seems that this test always fails even when 300 * connection is established. 
So, this else clause is 301 * added as workaround to return PF_LOCAL sockaddr. 302 */ 303 sa = &sun_noname; 304 } 305 bcopy(sa, *nam, sa->sa_len); 306 UNP_UNLOCK(); 307 return (0); 308} 309 310static int 311uipc_rcvd(struct socket *so, int flags) 312{ 313 struct unpcb *unp; 314 struct socket *so2; 315 u_int mbcnt, sbcc; 316 u_long newhiwat; 317 318 unp = sotounpcb(so); 319 KASSERT(unp != NULL, ("uipc_rcvd: unp == NULL")); 320 switch (so->so_type) { 321 case SOCK_DGRAM: 322 panic("uipc_rcvd DGRAM?"); 323 /*NOTREACHED*/ 324 325 case SOCK_STREAM: 326 /* 327 * Adjust backpressure on sender and wakeup any waiting to 328 * write. 329 */ 330 SOCKBUF_LOCK(&so->so_rcv); 331 mbcnt = so->so_rcv.sb_mbcnt; 332 sbcc = so->so_rcv.sb_cc; 333 SOCKBUF_UNLOCK(&so->so_rcv); 334 UNP_LOCK(); 335 if (unp->unp_conn == NULL) { 336 UNP_UNLOCK(); 337 break; 338 } 339 so2 = unp->unp_conn->unp_socket; 340 SOCKBUF_LOCK(&so2->so_snd); 341 so2->so_snd.sb_mbmax += unp->unp_mbcnt - mbcnt; 342 newhiwat = so2->so_snd.sb_hiwat + unp->unp_cc - sbcc; 343 (void)chgsbsize(so2->so_cred->cr_uidinfo, &so2->so_snd.sb_hiwat, 344 newhiwat, RLIM_INFINITY); 345 sowwakeup_locked(so2); 346 unp->unp_mbcnt = mbcnt; 347 unp->unp_cc = sbcc; 348 UNP_UNLOCK(); 349 break; 350 351 default: 352 panic("uipc_rcvd unknown socktype"); 353 } 354 return (0); 355} 356 357/* pru_rcvoob is EOPNOTSUPP */ 358 359static int 360uipc_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, 361 struct mbuf *control, struct thread *td) 362{ 363 int error = 0; 364 struct unpcb *unp; 365 struct socket *so2; 366 u_int mbcnt, sbcc; 367 u_long newhiwat; 368 369 unp = sotounpcb(so); 370 KASSERT(unp != NULL, ("uipc_send: unp == NULL")); 371 if (flags & PRUS_OOB) { 372 error = EOPNOTSUPP; 373 goto release; 374 } 375 376 if (control != NULL && (error = unp_internalize(&control, td))) 377 goto release; 378 379 UNP_LOCK(); 380 switch (so->so_type) { 381 case SOCK_DGRAM: 382 { 383 const struct sockaddr *from; 384 385 if (nam != NULL) { 
386 if (unp->unp_conn != NULL) { 387 error = EISCONN; 388 break; 389 } 390 error = unp_connect(so, nam, td); 391 if (error) 392 break; 393 } else { 394 if (unp->unp_conn == NULL) { 395 error = ENOTCONN; 396 break; 397 } 398 } 399 so2 = unp->unp_conn->unp_socket; 400 if (unp->unp_addr != NULL) 401 from = (struct sockaddr *)unp->unp_addr; 402 else 403 from = &sun_noname; 404 if (unp->unp_conn->unp_flags & UNP_WANTCRED) 405 control = unp_addsockcred(td, control); 406 SOCKBUF_LOCK(&so2->so_rcv); 407 if (sbappendaddr_locked(&so2->so_rcv, from, m, control)) { 408 sorwakeup_locked(so2); 409 m = NULL; 410 control = NULL; 411 } else { 412 SOCKBUF_UNLOCK(&so2->so_rcv); 413 error = ENOBUFS; 414 } 415 if (nam != NULL) 416 unp_disconnect(unp); 417 break; 418 } 419 420 case SOCK_STREAM: 421 /* 422 * Connect if not connected yet. 423 * 424 * Note: A better implementation would complain if not equal 425 * to the peer's address. 426 */ 427 if ((so->so_state & SS_ISCONNECTED) == 0) { 428 if (nam != NULL) { 429 error = unp_connect(so, nam, td); 430 if (error) 431 break; /* XXX */ 432 } else { 433 error = ENOTCONN; 434 break; 435 } 436 } 437 438 /* Lockless read. */ 439 if (so->so_snd.sb_state & SBS_CANTSENDMORE) { 440 error = EPIPE; 441 break; 442 } 443 if (unp->unp_conn == NULL) 444 panic("uipc_send connected but no connection?"); 445 so2 = unp->unp_conn->unp_socket; 446 SOCKBUF_LOCK(&so2->so_rcv); 447 if (unp->unp_conn->unp_flags & UNP_WANTCRED) { 448 /* 449 * Credentials are passed only once on 450 * SOCK_STREAM. 451 */ 452 unp->unp_conn->unp_flags &= ~UNP_WANTCRED; 453 control = unp_addsockcred(td, control); 454 } 455 /* 456 * Send to paired receive port, and then reduce send buffer 457 * hiwater marks to maintain backpressure. Wake up readers. 
458 */ 459 if (control != NULL) { 460 if (sbappendcontrol_locked(&so2->so_rcv, m, control)) 461 control = NULL; 462 } else { 463 sbappend_locked(&so2->so_rcv, m); 464 } 465 mbcnt = so2->so_rcv.sb_mbcnt - unp->unp_conn->unp_mbcnt; 466 unp->unp_conn->unp_mbcnt = so2->so_rcv.sb_mbcnt; 467 sbcc = so2->so_rcv.sb_cc; 468 sorwakeup_locked(so2); 469 470 SOCKBUF_LOCK(&so->so_snd); 471 newhiwat = so->so_snd.sb_hiwat - 472 (sbcc - unp->unp_conn->unp_cc); 473 (void)chgsbsize(so->so_cred->cr_uidinfo, &so->so_snd.sb_hiwat, 474 newhiwat, RLIM_INFINITY); 475 so->so_snd.sb_mbmax -= mbcnt; 476 SOCKBUF_UNLOCK(&so->so_snd); 477 478 unp->unp_conn->unp_cc = sbcc; 479 m = NULL; 480 break; 481 482 default: 483 panic("uipc_send unknown socktype"); 484 } 485 486 /* 487 * SEND_EOF is equivalent to a SEND followed by 488 * a SHUTDOWN. 489 */ 490 if (flags & PRUS_EOF) { 491 socantsendmore(so); 492 unp_shutdown(unp); 493 } 494 UNP_UNLOCK(); 495 496 if (control != NULL && error != 0) 497 unp_dispose(control); 498 499release: 500 if (control != NULL) 501 m_freem(control); 502 if (m != NULL) 503 m_freem(m); 504 return (error); 505} 506 507static int 508uipc_sense(struct socket *so, struct stat *sb) 509{ 510 struct unpcb *unp; 511 struct socket *so2; 512 513 unp = sotounpcb(so); 514 KASSERT(unp != NULL, ("uipc_sense: unp == NULL")); 515 UNP_LOCK(); 516 sb->st_blksize = so->so_snd.sb_hiwat; 517 if (so->so_type == SOCK_STREAM && unp->unp_conn != NULL) { 518 so2 = unp->unp_conn->unp_socket; 519 sb->st_blksize += so2->so_rcv.sb_cc; 520 } 521 sb->st_dev = NODEV; 522 if (unp->unp_ino == 0) 523 unp->unp_ino = (++unp_ino == 0) ? 
++unp_ino : unp_ino; 524 sb->st_ino = unp->unp_ino; 525 UNP_UNLOCK(); 526 return (0); 527} 528 529static int 530uipc_shutdown(struct socket *so) 531{ 532 struct unpcb *unp; 533 534 unp = sotounpcb(so); 535 KASSERT(unp != NULL, ("uipc_shutdown: unp == NULL")); 536 UNP_LOCK(); 537 socantsendmore(so); 538 unp_shutdown(unp); 539 UNP_UNLOCK(); 540 return (0); 541} 542 543static int 544uipc_sockaddr(struct socket *so, struct sockaddr **nam) 545{ 546 struct unpcb *unp; 547 const struct sockaddr *sa; 548 549 unp = sotounpcb(so); 550 KASSERT(unp != NULL, ("uipc_sockaddr: unp == NULL")); 551 *nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK); 552 UNP_LOCK(); 553 if (unp->unp_addr != NULL) 554 sa = (struct sockaddr *) unp->unp_addr; 555 else 556 sa = &sun_noname; 557 bcopy(sa, *nam, sa->sa_len); 558 UNP_UNLOCK(); 559 return (0); 560} 561 562struct pr_usrreqs uipc_usrreqs = { 563 .pru_abort = uipc_abort, 564 .pru_accept = uipc_accept, 565 .pru_attach = uipc_attach, 566 .pru_bind = uipc_bind, 567 .pru_connect = uipc_connect, 568 .pru_connect2 = uipc_connect2, 569 .pru_detach = uipc_detach, 570 .pru_disconnect = uipc_disconnect, 571 .pru_listen = uipc_listen, 572 .pru_peeraddr = uipc_peeraddr, 573 .pru_rcvd = uipc_rcvd, 574 .pru_send = uipc_send, 575 .pru_sense = uipc_sense, 576 .pru_shutdown = uipc_shutdown, 577 .pru_sockaddr = uipc_sockaddr, 578 .pru_sosend = sosend, 579 .pru_soreceive = soreceive, 580 .pru_sopoll = sopoll, 581 .pru_close = uipc_close, 582}; 583 584int 585uipc_ctloutput(struct socket *so, struct sockopt *sopt) 586{ 587 struct unpcb *unp; 588 struct xucred xu; 589 int error, optval; 590 591 if (sopt->sopt_level != 0) 592 return (EINVAL); 593 594 unp = sotounpcb(so); 595 KASSERT(unp != NULL, ("uipc_ctloutput: unp == NULL")); 596 UNP_LOCK(); 597 error = 0; 598 switch (sopt->sopt_dir) { 599 case SOPT_GET: 600 switch (sopt->sopt_name) { 601 case LOCAL_PEERCRED: 602 if (unp->unp_flags & UNP_HAVEPC) 603 xu = unp->unp_peercred; 604 else { 605 if 
(so->so_type == SOCK_STREAM) 606 error = ENOTCONN; 607 else 608 error = EINVAL; 609 } 610 if (error == 0) 611 error = sooptcopyout(sopt, &xu, sizeof(xu)); 612 break; 613 case LOCAL_CREDS: 614 optval = unp->unp_flags & UNP_WANTCRED ? 1 : 0; 615 error = sooptcopyout(sopt, &optval, sizeof(optval)); 616 break; 617 case LOCAL_CONNWAIT: 618 optval = unp->unp_flags & UNP_CONNWAIT ? 1 : 0; 619 error = sooptcopyout(sopt, &optval, sizeof(optval)); 620 break; 621 default: 622 error = EOPNOTSUPP; 623 break; 624 } 625 break; 626 case SOPT_SET: 627 switch (sopt->sopt_name) { 628 case LOCAL_CREDS: 629 case LOCAL_CONNWAIT: 630 error = sooptcopyin(sopt, &optval, sizeof(optval), 631 sizeof(optval)); 632 if (error) 633 break; 634 635#define OPTSET(bit) \ 636 if (optval) \ 637 unp->unp_flags |= bit; \ 638 else \ 639 unp->unp_flags &= ~bit; 640 641 switch (sopt->sopt_name) { 642 case LOCAL_CREDS: 643 OPTSET(UNP_WANTCRED); 644 break; 645 case LOCAL_CONNWAIT: 646 OPTSET(UNP_CONNWAIT); 647 break; 648 default: 649 break; 650 } 651 break; 652#undef OPTSET 653 default: 654 error = ENOPROTOOPT; 655 break; 656 } 657 break; 658 default: 659 error = EOPNOTSUPP; 660 break; 661 } 662 UNP_UNLOCK(); 663 return (error); 664} 665 666/* 667 * Both send and receive buffers are allocated PIPSIZ bytes of buffering 668 * for stream sockets, although the total for sender and receiver is 669 * actually only PIPSIZ. 670 * 671 * Datagram sockets really use the sendspace as the maximum datagram size, 672 * and don't really want to reserve the sendspace. Their recvspace should 673 * be large enough for at least one max-size datagram plus address. 
674 */ 675#ifndef PIPSIZ 676#define PIPSIZ 8192 677#endif 678static u_long unpst_sendspace = PIPSIZ; 679static u_long unpst_recvspace = PIPSIZ; 680static u_long unpdg_sendspace = 2*1024; /* really max datagram size */ 681static u_long unpdg_recvspace = 4*1024; 682 683static int unp_rights; /* file descriptors in flight */ 684 685SYSCTL_DECL(_net_local_stream); 686SYSCTL_ULONG(_net_local_stream, OID_AUTO, sendspace, CTLFLAG_RW, 687 &unpst_sendspace, 0, ""); 688SYSCTL_ULONG(_net_local_stream, OID_AUTO, recvspace, CTLFLAG_RW, 689 &unpst_recvspace, 0, ""); 690SYSCTL_DECL(_net_local_dgram); 691SYSCTL_ULONG(_net_local_dgram, OID_AUTO, maxdgram, CTLFLAG_RW, 692 &unpdg_sendspace, 0, ""); 693SYSCTL_ULONG(_net_local_dgram, OID_AUTO, recvspace, CTLFLAG_RW, 694 &unpdg_recvspace, 0, ""); 695SYSCTL_DECL(_net_local); 696SYSCTL_INT(_net_local, OID_AUTO, inflight, CTLFLAG_RD, &unp_rights, 0, ""); 697 698static int 699unp_attach(struct socket *so) 700{ 701 struct unpcb *unp; 702 int error; 703 704 KASSERT(so->so_pcb == NULL, ("unp_attach: so_pcb != NULL")); 705 if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) { 706 switch (so->so_type) { 707 708 case SOCK_STREAM: 709 error = soreserve(so, unpst_sendspace, unpst_recvspace); 710 break; 711 712 case SOCK_DGRAM: 713 error = soreserve(so, unpdg_sendspace, unpdg_recvspace); 714 break; 715 716 default: 717 panic("unp_attach"); 718 } 719 if (error) 720 return (error); 721 } 722 unp = uma_zalloc(unp_zone, M_WAITOK | M_ZERO); 723 if (unp == NULL) 724 return (ENOBUFS); 725 LIST_INIT(&unp->unp_refs); 726 unp->unp_socket = so; 727 so->so_pcb = unp; 728 729 UNP_LOCK(); 730 unp->unp_gencnt = ++unp_gencnt; 731 unp_count++; 732 LIST_INSERT_HEAD(so->so_type == SOCK_DGRAM ? 
&unp_dhead 733 : &unp_shead, unp, unp_link); 734 UNP_UNLOCK(); 735 736 return (0); 737} 738 739static void 740unp_detach(struct unpcb *unp) 741{ 742 struct vnode *vp; 743 int local_unp_rights; 744 745 UNP_LOCK_ASSERT(); 746 747 LIST_REMOVE(unp, unp_link); 748 unp->unp_gencnt = ++unp_gencnt; 749 --unp_count; 750 if ((vp = unp->unp_vnode) != NULL) { 751 /* 752 * XXXRW: should v_socket be frobbed only while holding 753 * Giant? 754 */ 755 unp->unp_vnode->v_socket = NULL; 756 unp->unp_vnode = NULL; 757 } 758 if (unp->unp_conn != NULL) 759 unp_disconnect(unp); 760 while (!LIST_EMPTY(&unp->unp_refs)) { 761 struct unpcb *ref = LIST_FIRST(&unp->unp_refs); 762 unp_drop(ref, ECONNRESET); 763 } 764 soisdisconnected(unp->unp_socket); 765 unp->unp_socket->so_pcb = NULL; 766 local_unp_rights = unp_rights; 767 UNP_UNLOCK(); 768 if (unp->unp_addr != NULL) 769 FREE(unp->unp_addr, M_SONAME); 770 uma_zfree(unp_zone, unp); 771 if (vp) { 772 int vfslocked; 773 774 vfslocked = VFS_LOCK_GIANT(vp->v_mount); 775 vrele(vp); 776 VFS_UNLOCK_GIANT(vfslocked); 777 } 778 if (local_unp_rights) 779 taskqueue_enqueue(taskqueue_thread, &unp_gc_task); 780} 781 782static int 783unp_bind(struct unpcb *unp, struct sockaddr *nam, struct thread *td) 784{ 785 struct sockaddr_un *soun = (struct sockaddr_un *)nam; 786 struct vnode *vp; 787 struct mount *mp; 788 struct vattr vattr; 789 int error, namelen; 790 struct nameidata nd; 791 char *buf; 792 793 UNP_LOCK_ASSERT(); 794 795 /* 796 * XXXRW: This test-and-set of unp_vnode is non-atomic; the unlocked 797 * read here is fine, but the value of unp_vnode needs to be tested 798 * again after we do all the lookups to see if the pcb is still 799 * unbound? 
800 */ 801 if (unp->unp_vnode != NULL) 802 return (EINVAL); 803 804 namelen = soun->sun_len - offsetof(struct sockaddr_un, sun_path); 805 if (namelen <= 0) 806 return (EINVAL); 807 808 UNP_UNLOCK(); 809 810 buf = malloc(namelen + 1, M_TEMP, M_WAITOK); 811 strlcpy(buf, soun->sun_path, namelen + 1); 812 813 mtx_lock(&Giant); 814restart: 815 mtx_assert(&Giant, MA_OWNED); 816 NDINIT(&nd, CREATE, NOFOLLOW | LOCKPARENT | SAVENAME, UIO_SYSSPACE, 817 buf, td); 818/* SHOULD BE ABLE TO ADOPT EXISTING AND wakeup() ALA FIFO's */ 819 error = namei(&nd); 820 if (error) 821 goto done; 822 vp = nd.ni_vp; 823 if (vp != NULL || vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { 824 NDFREE(&nd, NDF_ONLY_PNBUF); 825 if (nd.ni_dvp == vp) 826 vrele(nd.ni_dvp); 827 else 828 vput(nd.ni_dvp); 829 if (vp != NULL) { 830 vrele(vp); 831 error = EADDRINUSE; 832 goto done; 833 } 834 error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH); 835 if (error) 836 goto done; 837 goto restart; 838 } 839 VATTR_NULL(&vattr); 840 vattr.va_type = VSOCK; 841 vattr.va_mode = (ACCESSPERMS & ~td->td_proc->p_fd->fd_cmask); 842#ifdef MAC 843 error = mac_check_vnode_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd, 844 &vattr); 845#endif 846 if (error == 0) { 847 VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE); 848 error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr); 849 } 850 NDFREE(&nd, NDF_ONLY_PNBUF); 851 vput(nd.ni_dvp); 852 if (error) { 853 vn_finished_write(mp); 854 goto done; 855 } 856 vp = nd.ni_vp; 857 ASSERT_VOP_LOCKED(vp, "unp_bind"); 858 soun = (struct sockaddr_un *)sodupsockaddr(nam, M_WAITOK); 859 UNP_LOCK(); 860 vp->v_socket = unp->unp_socket; 861 unp->unp_vnode = vp; 862 unp->unp_addr = soun; 863 UNP_UNLOCK(); 864 VOP_UNLOCK(vp, 0, td); 865 vn_finished_write(mp); 866done: 867 mtx_unlock(&Giant); 868 free(buf, M_TEMP); 869 UNP_LOCK(); 870 return (error); 871} 872 873static int 874unp_connect(struct socket *so, struct sockaddr *nam, struct thread *td) 875{ 876 struct sockaddr_un *soun = (struct 
sockaddr_un *)nam; 877 struct vnode *vp; 878 struct socket *so2, *so3; 879 struct unpcb *unp, *unp2, *unp3; 880 int error, len; 881 struct nameidata nd; 882 char buf[SOCK_MAXADDRLEN]; 883 struct sockaddr *sa; 884 885 UNP_LOCK_ASSERT(); 886 887 unp = sotounpcb(so); 888 KASSERT(unp != NULL, ("unp_connect: unp == NULL")); 889 len = nam->sa_len - offsetof(struct sockaddr_un, sun_path); 890 if (len <= 0) 891 return (EINVAL); 892 strlcpy(buf, soun->sun_path, len + 1); 893 UNP_UNLOCK(); 894 sa = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK); 895 mtx_lock(&Giant); 896 NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, buf, td); 897 error = namei(&nd); 898 if (error) 899 vp = NULL; 900 else 901 vp = nd.ni_vp; 902 ASSERT_VOP_LOCKED(vp, "unp_connect"); 903 NDFREE(&nd, NDF_ONLY_PNBUF); 904 if (error) 905 goto bad; 906 907 if (vp->v_type != VSOCK) { 908 error = ENOTSOCK; 909 goto bad; 910 } 911 error = VOP_ACCESS(vp, VWRITE, td->td_ucred, td); 912 if (error) 913 goto bad; 914 mtx_unlock(&Giant); 915 UNP_LOCK(); 916 unp = sotounpcb(so); 917 KASSERT(unp != NULL, ("unp_connect: unp == NULL")); 918 so2 = vp->v_socket; 919 if (so2 == NULL) { 920 error = ECONNREFUSED; 921 goto bad2; 922 } 923 if (so->so_type != so2->so_type) { 924 error = EPROTOTYPE; 925 goto bad2; 926 } 927 if (so->so_proto->pr_flags & PR_CONNREQUIRED) { 928 if (so2->so_options & SO_ACCEPTCONN) { 929 /* 930 * NB: drop locks here so unp_attach is entered w/o 931 * locks; this avoids a recursive lock of the head 932 * and holding sleep locks across a (potentially) 933 * blocking malloc. 
934 */ 935 UNP_UNLOCK(); 936 so3 = sonewconn(so2, 0); 937 UNP_LOCK(); 938 } else 939 so3 = NULL; 940 if (so3 == NULL) { 941 error = ECONNREFUSED; 942 goto bad2; 943 } 944 unp = sotounpcb(so); 945 unp2 = sotounpcb(so2); 946 unp3 = sotounpcb(so3); 947 if (unp2->unp_addr != NULL) { 948 bcopy(unp2->unp_addr, sa, unp2->unp_addr->sun_len); 949 unp3->unp_addr = (struct sockaddr_un *) sa; 950 sa = NULL; 951 } 952 /* 953 * unp_peercred management: 954 * 955 * The connecter's (client's) credentials are copied from its 956 * process structure at the time of connect() (which is now). 957 */ 958 cru2x(td->td_ucred, &unp3->unp_peercred); 959 unp3->unp_flags |= UNP_HAVEPC; 960 /* 961 * The receiver's (server's) credentials are copied from the 962 * unp_peercred member of socket on which the former called 963 * listen(); unp_listen() cached that process's credentials 964 * at that time so we can use them now. 965 */ 966 KASSERT(unp2->unp_flags & UNP_HAVEPCCACHED, 967 ("unp_connect: listener without cached peercred")); 968 memcpy(&unp->unp_peercred, &unp2->unp_peercred, 969 sizeof(unp->unp_peercred)); 970 unp->unp_flags |= UNP_HAVEPC; 971 if (unp2->unp_flags & UNP_WANTCRED) 972 unp3->unp_flags |= UNP_WANTCRED; 973#ifdef MAC 974 SOCK_LOCK(so); 975 mac_set_socket_peer_from_socket(so, so3); 976 mac_set_socket_peer_from_socket(so3, so); 977 SOCK_UNLOCK(so); 978#endif 979 980 so2 = so3; 981 } 982 error = unp_connect2(so, so2, PRU_CONNECT); 983bad2: 984 UNP_UNLOCK(); 985 mtx_lock(&Giant); 986bad: 987 mtx_assert(&Giant, MA_OWNED); 988 if (vp != NULL) 989 vput(vp); 990 mtx_unlock(&Giant); 991 free(sa, M_SONAME); 992 UNP_LOCK(); 993 return (error); 994} 995 996static int 997unp_connect2(struct socket *so, struct socket *so2, int req) 998{ 999 struct unpcb *unp = sotounpcb(so); 1000 struct unpcb *unp2; 1001 1002 UNP_LOCK_ASSERT(); 1003 1004 if (so2->so_type != so->so_type) 1005 return (EPROTOTYPE); 1006 unp2 = sotounpcb(so2); 1007 KASSERT(unp2 != NULL, ("unp_connect2: unp2 == NULL")); 1008 
unp->unp_conn = unp2; 1009 switch (so->so_type) { 1010 1011 case SOCK_DGRAM: 1012 LIST_INSERT_HEAD(&unp2->unp_refs, unp, unp_reflink); 1013 soisconnected(so); 1014 break; 1015 1016 case SOCK_STREAM: 1017 unp2->unp_conn = unp; 1018 if (req == PRU_CONNECT && 1019 ((unp->unp_flags | unp2->unp_flags) & UNP_CONNWAIT)) 1020 soisconnecting(so); 1021 else 1022 soisconnected(so); 1023 soisconnected(so2); 1024 break; 1025 1026 default: 1027 panic("unp_connect2"); 1028 } 1029 return (0); 1030} 1031 1032static void 1033unp_disconnect(struct unpcb *unp) 1034{ 1035 struct unpcb *unp2 = unp->unp_conn; 1036 struct socket *so; 1037 1038 UNP_LOCK_ASSERT(); 1039 1040 if (unp2 == NULL) 1041 return; 1042 unp->unp_conn = NULL; 1043 switch (unp->unp_socket->so_type) { 1044 case SOCK_DGRAM: 1045 LIST_REMOVE(unp, unp_reflink); 1046 so = unp->unp_socket; 1047 SOCK_LOCK(so); 1048 so->so_state &= ~SS_ISCONNECTED; 1049 SOCK_UNLOCK(so); 1050 break; 1051 1052 case SOCK_STREAM: 1053 soisdisconnected(unp->unp_socket); 1054 unp2->unp_conn = NULL; 1055 soisdisconnected(unp2->unp_socket); 1056 break; 1057 } 1058} 1059 1060/* 1061 * unp_pcblist() assumes that UNIX domain socket memory is never reclaimed by 1062 * the zone (UMA_ZONE_NOFREE), and as such potentially stale pointers are 1063 * safe to reference. It first scans the list of struct unpcb's to generate 1064 * a pointer list, then it rescans its list one entry at a time to 1065 * externalize and copyout. It checks the generation number to see if a 1066 * struct unpcb has been reused, and will skip it if so. 1067 */ 1068static int 1069unp_pcblist(SYSCTL_HANDLER_ARGS) 1070{ 1071 int error, i, n; 1072 struct unpcb *unp, **unp_list; 1073 unp_gen_t gencnt; 1074 struct xunpgen *xug; 1075 struct unp_head *head; 1076 struct xunpcb *xu; 1077 1078 head = ((intptr_t)arg1 == SOCK_DGRAM ? &unp_dhead : &unp_shead); 1079 1080 /* 1081 * The process of preparing the PCB list is too time-consuming and 1082 * resource-intensive to repeat twice on every request. 
1083 */ 1084 if (req->oldptr == NULL) { 1085 n = unp_count; 1086 req->oldidx = 2 * (sizeof *xug) 1087 + (n + n/8) * sizeof(struct xunpcb); 1088 return (0); 1089 } 1090 1091 if (req->newptr != NULL) 1092 return (EPERM); 1093 1094 /* 1095 * OK, now we're committed to doing something. 1096 */ 1097 xug = malloc(sizeof(*xug), M_TEMP, M_WAITOK); 1098 UNP_LOCK(); 1099 gencnt = unp_gencnt; 1100 n = unp_count; 1101 UNP_UNLOCK(); 1102 1103 xug->xug_len = sizeof *xug; 1104 xug->xug_count = n; 1105 xug->xug_gen = gencnt; 1106 xug->xug_sogen = so_gencnt; 1107 error = SYSCTL_OUT(req, xug, sizeof *xug); 1108 if (error) { 1109 free(xug, M_TEMP); 1110 return (error); 1111 } 1112 1113 unp_list = malloc(n * sizeof *unp_list, M_TEMP, M_WAITOK); 1114 1115 UNP_LOCK(); 1116 for (unp = LIST_FIRST(head), i = 0; unp && i < n; 1117 unp = LIST_NEXT(unp, unp_link)) { 1118 if (unp->unp_gencnt <= gencnt) { 1119 if (cr_cansee(req->td->td_ucred, 1120 unp->unp_socket->so_cred)) 1121 continue; 1122 unp_list[i++] = unp; 1123 } 1124 } 1125 UNP_UNLOCK(); 1126 n = i; /* In case we lost some during malloc. */ 1127 1128 error = 0; 1129 xu = malloc(sizeof(*xu), M_TEMP, M_WAITOK | M_ZERO); 1130 for (i = 0; i < n; i++) { 1131 unp = unp_list[i]; 1132 if (unp->unp_gencnt <= gencnt) { 1133 xu->xu_len = sizeof *xu; 1134 xu->xu_unpp = unp; 1135 /* 1136 * XXX - need more locking here to protect against 1137 * connect/disconnect races for SMP. 1138 */ 1139 if (unp->unp_addr != NULL) 1140 bcopy(unp->unp_addr, &xu->xu_addr, 1141 unp->unp_addr->sun_len); 1142 if (unp->unp_conn != NULL && 1143 unp->unp_conn->unp_addr != NULL) 1144 bcopy(unp->unp_conn->unp_addr, 1145 &xu->xu_caddr, 1146 unp->unp_conn->unp_addr->sun_len); 1147 bcopy(unp, &xu->xu_unp, sizeof *unp); 1148 sotoxsocket(unp->unp_socket, &xu->xu_socket); 1149 error = SYSCTL_OUT(req, xu, sizeof *xu); 1150 } 1151 } 1152 free(xu, M_TEMP); 1153 if (!error) { 1154 /* 1155 * Give the user an updated idea of our state. 
If the 1156 * generation differs from what we told her before, she knows 1157 * that something happened while we were processing this 1158 * request, and it might be necessary to retry. 1159 */ 1160 xug->xug_gen = unp_gencnt; 1161 xug->xug_sogen = so_gencnt; 1162 xug->xug_count = unp_count; 1163 error = SYSCTL_OUT(req, xug, sizeof *xug); 1164 } 1165 free(unp_list, M_TEMP); 1166 free(xug, M_TEMP); 1167 return (error); 1168} 1169 1170SYSCTL_PROC(_net_local_dgram, OID_AUTO, pcblist, CTLFLAG_RD, 1171 (caddr_t)(long)SOCK_DGRAM, 0, unp_pcblist, "S,xunpcb", 1172 "List of active local datagram sockets"); 1173SYSCTL_PROC(_net_local_stream, OID_AUTO, pcblist, CTLFLAG_RD, 1174 (caddr_t)(long)SOCK_STREAM, 0, unp_pcblist, "S,xunpcb", 1175 "List of active local stream sockets"); 1176 1177static void 1178unp_shutdown(struct unpcb *unp) 1179{ 1180 struct socket *so; 1181 1182 UNP_LOCK_ASSERT(); 1183 1184 if (unp->unp_socket->so_type == SOCK_STREAM && unp->unp_conn && 1185 (so = unp->unp_conn->unp_socket)) 1186 socantrcvmore(so); 1187} 1188 1189static void 1190unp_drop(struct unpcb *unp, int errno) 1191{ 1192 struct socket *so = unp->unp_socket; 1193 1194 UNP_LOCK_ASSERT(); 1195 1196 so->so_error = errno; 1197 unp_disconnect(unp); 1198} 1199 1200static void 1201unp_freerights(struct file **rp, int fdcount) 1202{ 1203 int i; 1204 struct file *fp; 1205 1206 for (i = 0; i < fdcount; i++) { 1207 fp = *rp; 1208 /* 1209 * Zero the pointer before calling unp_discard since it may 1210 * end up in unp_gc().. 1211 * 1212 * XXXRW: This is less true than it used to be. 
1213 */ 1214 *rp++ = 0; 1215 unp_discard(fp); 1216 } 1217} 1218 1219int 1220unp_externalize(struct mbuf *control, struct mbuf **controlp) 1221{ 1222 struct thread *td = curthread; /* XXX */ 1223 struct cmsghdr *cm = mtod(control, struct cmsghdr *); 1224 int i; 1225 int *fdp; 1226 struct file **rp; 1227 struct file *fp; 1228 void *data; 1229 socklen_t clen = control->m_len, datalen; 1230 int error, newfds; 1231 int f; 1232 u_int newlen; 1233 1234 UNP_UNLOCK_ASSERT(); 1235 1236 error = 0; 1237 if (controlp != NULL) /* controlp == NULL => free control messages */ 1238 *controlp = NULL; 1239 1240 while (cm != NULL) { 1241 if (sizeof(*cm) > clen || cm->cmsg_len > clen) { 1242 error = EINVAL; 1243 break; 1244 } 1245 1246 data = CMSG_DATA(cm); 1247 datalen = (caddr_t)cm + cm->cmsg_len - (caddr_t)data; 1248 1249 if (cm->cmsg_level == SOL_SOCKET 1250 && cm->cmsg_type == SCM_RIGHTS) { 1251 newfds = datalen / sizeof(struct file *); 1252 rp = data; 1253 1254 /* If we're not outputting the descriptors free them. */ 1255 if (error || controlp == NULL) { 1256 unp_freerights(rp, newfds); 1257 goto next; 1258 } 1259 FILEDESC_LOCK(td->td_proc->p_fd); 1260 /* if the new FD's will not fit free them. */ 1261 if (!fdavail(td, newfds)) { 1262 FILEDESC_UNLOCK(td->td_proc->p_fd); 1263 error = EMSGSIZE; 1264 unp_freerights(rp, newfds); 1265 goto next; 1266 } 1267 /* 1268 * Now change each pointer to an fd in the global 1269 * table to an integer that is the index to the local 1270 * fd table entry that we set up to point to the 1271 * global one we are transferring. 
1272 */ 1273 newlen = newfds * sizeof(int); 1274 *controlp = sbcreatecontrol(NULL, newlen, 1275 SCM_RIGHTS, SOL_SOCKET); 1276 if (*controlp == NULL) { 1277 FILEDESC_UNLOCK(td->td_proc->p_fd); 1278 error = E2BIG; 1279 unp_freerights(rp, newfds); 1280 goto next; 1281 } 1282 1283 fdp = (int *) 1284 CMSG_DATA(mtod(*controlp, struct cmsghdr *)); 1285 for (i = 0; i < newfds; i++) { 1286 if (fdalloc(td, 0, &f)) 1287 panic("unp_externalize fdalloc failed"); 1288 fp = *rp++; 1289 td->td_proc->p_fd->fd_ofiles[f] = fp; 1290 FILE_LOCK(fp); 1291 fp->f_msgcount--; 1292 FILE_UNLOCK(fp); 1293 unp_rights--; 1294 *fdp++ = f; 1295 } 1296 FILEDESC_UNLOCK(td->td_proc->p_fd); 1297 } else { 1298 /* We can just copy anything else across. */ 1299 if (error || controlp == NULL) 1300 goto next; 1301 *controlp = sbcreatecontrol(NULL, datalen, 1302 cm->cmsg_type, cm->cmsg_level); 1303 if (*controlp == NULL) { 1304 error = ENOBUFS; 1305 goto next; 1306 } 1307 bcopy(data, 1308 CMSG_DATA(mtod(*controlp, struct cmsghdr *)), 1309 datalen); 1310 } 1311 1312 controlp = &(*controlp)->m_next; 1313 1314next: 1315 if (CMSG_SPACE(datalen) < clen) { 1316 clen -= CMSG_SPACE(datalen); 1317 cm = (struct cmsghdr *) 1318 ((caddr_t)cm + CMSG_SPACE(datalen)); 1319 } else { 1320 clen = 0; 1321 cm = NULL; 1322 } 1323 } 1324 1325 m_freem(control); 1326 1327 return (error); 1328} 1329 1330static void 1331unp_zone_change(void *tag) 1332{ 1333 1334 uma_zone_set_max(unp_zone, maxsockets); 1335} 1336 1337void 1338unp_init(void) 1339{ 1340 1341 unp_zone = uma_zcreate("unpcb", sizeof(struct unpcb), NULL, NULL, 1342 NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); 1343 if (unp_zone == NULL) 1344 panic("unp_init"); 1345 uma_zone_set_max(unp_zone, maxsockets); 1346 EVENTHANDLER_REGISTER(maxsockets_change, unp_zone_change, 1347 NULL, EVENTHANDLER_PRI_ANY); 1348 LIST_INIT(&unp_dhead); 1349 LIST_INIT(&unp_shead); 1350 TASK_INIT(&unp_gc_task, 0, unp_gc, NULL); 1351 UNP_LOCK_INIT(); 1352} 1353 1354static int 1355unp_internalize(struct 
mbuf **controlp, struct thread *td) 1356{ 1357 struct mbuf *control = *controlp; 1358 struct proc *p = td->td_proc; 1359 struct filedesc *fdescp = p->p_fd; 1360 struct cmsghdr *cm = mtod(control, struct cmsghdr *); 1361 struct cmsgcred *cmcred; 1362 struct file **rp; 1363 struct file *fp; 1364 struct timeval *tv; 1365 int i, fd, *fdp; 1366 void *data; 1367 socklen_t clen = control->m_len, datalen; 1368 int error, oldfds; 1369 u_int newlen; 1370 1371 UNP_UNLOCK_ASSERT(); 1372 1373 error = 0; 1374 *controlp = NULL; 1375 1376 while (cm != NULL) { 1377 if (sizeof(*cm) > clen || cm->cmsg_level != SOL_SOCKET 1378 || cm->cmsg_len > clen) { 1379 error = EINVAL; 1380 goto out; 1381 } 1382 1383 data = CMSG_DATA(cm); 1384 datalen = (caddr_t)cm + cm->cmsg_len - (caddr_t)data; 1385 1386 switch (cm->cmsg_type) { 1387 /* 1388 * Fill in credential information. 1389 */ 1390 case SCM_CREDS: 1391 *controlp = sbcreatecontrol(NULL, sizeof(*cmcred), 1392 SCM_CREDS, SOL_SOCKET); 1393 if (*controlp == NULL) { 1394 error = ENOBUFS; 1395 goto out; 1396 } 1397 1398 cmcred = (struct cmsgcred *) 1399 CMSG_DATA(mtod(*controlp, struct cmsghdr *)); 1400 cmcred->cmcred_pid = p->p_pid; 1401 cmcred->cmcred_uid = td->td_ucred->cr_ruid; 1402 cmcred->cmcred_gid = td->td_ucred->cr_rgid; 1403 cmcred->cmcred_euid = td->td_ucred->cr_uid; 1404 cmcred->cmcred_ngroups = MIN(td->td_ucred->cr_ngroups, 1405 CMGROUP_MAX); 1406 for (i = 0; i < cmcred->cmcred_ngroups; i++) 1407 cmcred->cmcred_groups[i] = 1408 td->td_ucred->cr_groups[i]; 1409 break; 1410 1411 case SCM_RIGHTS: 1412 oldfds = datalen / sizeof (int); 1413 /* 1414 * Check that all the FDs passed in refer to legal 1415 * files. If not, reject the entire operation. 
1416 */ 1417 fdp = data; 1418 FILEDESC_LOCK(fdescp); 1419 for (i = 0; i < oldfds; i++) { 1420 fd = *fdp++; 1421 if ((unsigned)fd >= fdescp->fd_nfiles || 1422 fdescp->fd_ofiles[fd] == NULL) { 1423 FILEDESC_UNLOCK(fdescp); 1424 error = EBADF; 1425 goto out; 1426 } 1427 fp = fdescp->fd_ofiles[fd]; 1428 if (!(fp->f_ops->fo_flags & DFLAG_PASSABLE)) { 1429 FILEDESC_UNLOCK(fdescp); 1430 error = EOPNOTSUPP; 1431 goto out; 1432 } 1433 1434 } 1435 /* 1436 * Now replace the integer FDs with pointers to the 1437 * associated global file table entry.. 1438 */ 1439 newlen = oldfds * sizeof(struct file *); 1440 *controlp = sbcreatecontrol(NULL, newlen, 1441 SCM_RIGHTS, SOL_SOCKET); 1442 if (*controlp == NULL) { 1443 FILEDESC_UNLOCK(fdescp); 1444 error = E2BIG; 1445 goto out; 1446 } 1447 1448 fdp = data; 1449 rp = (struct file **) 1450 CMSG_DATA(mtod(*controlp, struct cmsghdr *)); 1451 for (i = 0; i < oldfds; i++) { 1452 fp = fdescp->fd_ofiles[*fdp++]; 1453 *rp++ = fp; 1454 FILE_LOCK(fp); 1455 fp->f_count++; 1456 fp->f_msgcount++; 1457 FILE_UNLOCK(fp); 1458 unp_rights++; 1459 } 1460 FILEDESC_UNLOCK(fdescp); 1461 break; 1462 1463 case SCM_TIMESTAMP: 1464 *controlp = sbcreatecontrol(NULL, sizeof(*tv), 1465 SCM_TIMESTAMP, SOL_SOCKET); 1466 if (*controlp == NULL) { 1467 error = ENOBUFS; 1468 goto out; 1469 } 1470 tv = (struct timeval *) 1471 CMSG_DATA(mtod(*controlp, struct cmsghdr *)); 1472 microtime(tv); 1473 break; 1474 1475 default: 1476 error = EINVAL; 1477 goto out; 1478 } 1479 1480 controlp = &(*controlp)->m_next; 1481 1482 if (CMSG_SPACE(datalen) < clen) { 1483 clen -= CMSG_SPACE(datalen); 1484 cm = (struct cmsghdr *) 1485 ((caddr_t)cm + CMSG_SPACE(datalen)); 1486 } else { 1487 clen = 0; 1488 cm = NULL; 1489 } 1490 } 1491 1492out: 1493 m_freem(control); 1494 1495 return (error); 1496} 1497 1498struct mbuf * 1499unp_addsockcred(struct thread *td, struct mbuf *control) 1500{ 1501 struct mbuf *m, *n, *n_prev; 1502 struct sockcred *sc; 1503 const struct cmsghdr *cm; 1504 int 
ngroups; 1505 int i; 1506 1507 ngroups = MIN(td->td_ucred->cr_ngroups, CMGROUP_MAX); 1508 1509 m = sbcreatecontrol(NULL, SOCKCREDSIZE(ngroups), SCM_CREDS, SOL_SOCKET); 1510 if (m == NULL) 1511 return (control); 1512 1513 sc = (struct sockcred *) CMSG_DATA(mtod(m, struct cmsghdr *)); 1514 sc->sc_uid = td->td_ucred->cr_ruid; 1515 sc->sc_euid = td->td_ucred->cr_uid; 1516 sc->sc_gid = td->td_ucred->cr_rgid; 1517 sc->sc_egid = td->td_ucred->cr_gid; 1518 sc->sc_ngroups = ngroups; 1519 for (i = 0; i < sc->sc_ngroups; i++) 1520 sc->sc_groups[i] = td->td_ucred->cr_groups[i]; 1521 1522 /* 1523 * Unlink SCM_CREDS control messages (struct cmsgcred), since just 1524 * created SCM_CREDS control message (struct sockcred) has another 1525 * format. 1526 */ 1527 if (control != NULL) 1528 for (n = control, n_prev = NULL; n != NULL;) { 1529 cm = mtod(n, struct cmsghdr *); 1530 if (cm->cmsg_level == SOL_SOCKET && 1531 cm->cmsg_type == SCM_CREDS) { 1532 if (n_prev == NULL) 1533 control = n->m_next; 1534 else 1535 n_prev->m_next = n->m_next; 1536 n = m_free(n); 1537 } else { 1538 n_prev = n; 1539 n = n->m_next; 1540 } 1541 } 1542 1543 /* Prepend it to the head. */ 1544 m->m_next = control; 1545 1546 return (m); 1547} 1548 1549/* 1550 * unp_defer indicates whether additional work has been defered for a future 1551 * pass through unp_gc(). It is thread local and does not require explicit 1552 * synchronization. 
1553 */ 1554static int unp_defer; 1555 1556static int unp_taskcount; 1557SYSCTL_INT(_net_local, OID_AUTO, taskcount, CTLFLAG_RD, &unp_taskcount, 0, ""); 1558 1559static int unp_recycled; 1560SYSCTL_INT(_net_local, OID_AUTO, recycled, CTLFLAG_RD, &unp_recycled, 0, ""); 1561 1562static void 1563unp_gc(__unused void *arg, int pending) 1564{ 1565 struct file *fp, *nextfp; 1566 struct socket *so; 1567 struct file **extra_ref, **fpp; 1568 int nunref, i; 1569 int nfiles_snap; 1570 int nfiles_slack = 20; 1571 1572 unp_taskcount++; 1573 unp_defer = 0; 1574 /* 1575 * Before going through all this, set all FDs to be NOT defered and 1576 * NOT externally accessible. 1577 */ 1578 sx_slock(&filelist_lock); 1579 LIST_FOREACH(fp, &filehead, f_list) 1580 fp->f_gcflag &= ~(FMARK|FDEFER); 1581 do { 1582 KASSERT(unp_defer >= 0, ("unp_gc: unp_defer %d", unp_defer)); 1583 LIST_FOREACH(fp, &filehead, f_list) { 1584 FILE_LOCK(fp); 1585 /* 1586 * If the file is not open, skip it -- could be a 1587 * file in the process of being opened, or in the 1588 * process of being closed. If the file is 1589 * "closing", it may have been marked for deferred 1590 * consideration. Clear the flag now if so. 1591 */ 1592 if (fp->f_count == 0) { 1593 if (fp->f_gcflag & FDEFER) 1594 unp_defer--; 1595 fp->f_gcflag &= ~(FMARK|FDEFER); 1596 FILE_UNLOCK(fp); 1597 continue; 1598 } 1599 /* 1600 * If we already marked it as 'defer' in a previous 1601 * pass, then try process it this time and un-mark 1602 * it. 1603 */ 1604 if (fp->f_gcflag & FDEFER) { 1605 fp->f_gcflag &= ~FDEFER; 1606 unp_defer--; 1607 } else { 1608 /* 1609 * if it's not defered, then check if it's 1610 * already marked.. if so skip it 1611 */ 1612 if (fp->f_gcflag & FMARK) { 1613 FILE_UNLOCK(fp); 1614 continue; 1615 } 1616 /* 1617 * If all references are from messages in 1618 * transit, then skip it. it's not externally 1619 * accessible. 
1620 */ 1621 if (fp->f_count == fp->f_msgcount) { 1622 FILE_UNLOCK(fp); 1623 continue; 1624 } 1625 /* 1626 * If it got this far then it must be 1627 * externally accessible. 1628 */ 1629 fp->f_gcflag |= FMARK; 1630 } 1631 /* 1632 * Either it was defered, or it is externally 1633 * accessible and not already marked so. Now check 1634 * if it is possibly one of OUR sockets. 1635 */ 1636 if (fp->f_type != DTYPE_SOCKET || 1637 (so = fp->f_data) == NULL) { 1638 FILE_UNLOCK(fp); 1639 continue; 1640 } 1641 FILE_UNLOCK(fp); 1642 if (so->so_proto->pr_domain != &localdomain || 1643 (so->so_proto->pr_flags&PR_RIGHTS) == 0) 1644 continue; 1645 /* 1646 * So, Ok, it's one of our sockets and it IS 1647 * externally accessible (or was defered). Now we 1648 * look to see if we hold any file descriptors in its 1649 * message buffers. Follow those links and mark them 1650 * as accessible too. 1651 */ 1652 SOCKBUF_LOCK(&so->so_rcv); 1653 unp_scan(so->so_rcv.sb_mb, unp_mark); 1654 SOCKBUF_UNLOCK(&so->so_rcv); 1655 } 1656 } while (unp_defer); 1657 sx_sunlock(&filelist_lock); 1658 /* 1659 * XXXRW: The following comments need updating for a post-SMPng and 1660 * deferred unp_gc() world, but are still generally accurate. 1661 * 1662 * We grab an extra reference to each of the file table entries that 1663 * are not otherwise accessible and then free the rights that are 1664 * stored in messages on them. 1665 * 1666 * The bug in the orginal code is a little tricky, so I'll describe 1667 * what's wrong with it here. 1668 * 1669 * It is incorrect to simply unp_discard each entry for f_msgcount 1670 * times -- consider the case of sockets A and B that contain 1671 * references to each other. On a last close of some other socket, 1672 * we trigger a gc since the number of outstanding rights (unp_rights) 1673 * is non-zero. If during the sweep phase the gc code unp_discards, 1674 * we end up doing a (full) closef on the descriptor. A closef on A 1675 * results in the following chain. 
Closef calls soo_close, which 1676 * calls soclose. Soclose calls first (through the switch 1677 * uipc_usrreq) unp_detach, which re-invokes unp_gc. Unp_gc simply 1678 * returns because the previous instance had set unp_gcing, and we 1679 * return all the way back to soclose, which marks the socket with 1680 * SS_NOFDREF, and then calls sofree. Sofree calls sorflush to free 1681 * up the rights that are queued in messages on the socket A, i.e., 1682 * the reference on B. The sorflush calls via the dom_dispose switch 1683 * unp_dispose, which unp_scans with unp_discard. This second 1684 * instance of unp_discard just calls closef on B. 1685 * 1686 * Well, a similar chain occurs on B, resulting in a sorflush on B, 1687 * which results in another closef on A. Unfortunately, A is already 1688 * being closed, and the descriptor has already been marked with 1689 * SS_NOFDREF, and soclose panics at this point. 1690 * 1691 * Here, we first take an extra reference to each inaccessible 1692 * descriptor. Then, we call sorflush ourself, since we know it is a 1693 * Unix domain socket anyhow. After we destroy all the rights 1694 * carried in messages, we do a last closef to get rid of our extra 1695 * reference. This is the last close, and the unp_detach etc will 1696 * shut down the socket. 
1697 * 1698 * 91/09/19, bsy@cs.cmu.edu 1699 */ 1700again: 1701 nfiles_snap = openfiles + nfiles_slack; /* some slack */ 1702 extra_ref = malloc(nfiles_snap * sizeof(struct file *), M_TEMP, 1703 M_WAITOK); 1704 sx_slock(&filelist_lock); 1705 if (nfiles_snap < openfiles) { 1706 sx_sunlock(&filelist_lock); 1707 free(extra_ref, M_TEMP); 1708 nfiles_slack += 20; 1709 goto again; 1710 } 1711 for (nunref = 0, fp = LIST_FIRST(&filehead), fpp = extra_ref; 1712 fp != NULL; fp = nextfp) { 1713 nextfp = LIST_NEXT(fp, f_list); 1714 FILE_LOCK(fp); 1715 /* 1716 * If it's not open, skip it 1717 */ 1718 if (fp->f_count == 0) { 1719 FILE_UNLOCK(fp); 1720 continue; 1721 } 1722 /* 1723 * If all refs are from msgs, and it's not marked accessible 1724 * then it must be referenced from some unreachable cycle of 1725 * (shut-down) FDs, so include it in our list of FDs to 1726 * remove. 1727 */ 1728 if (fp->f_count == fp->f_msgcount && !(fp->f_gcflag & FMARK)) { 1729 *fpp++ = fp; 1730 nunref++; 1731 fp->f_count++; 1732 } 1733 FILE_UNLOCK(fp); 1734 } 1735 sx_sunlock(&filelist_lock); 1736 /* 1737 * For each FD on our hit list, do the following two things: 1738 */ 1739 for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp) { 1740 struct file *tfp = *fpp; 1741 FILE_LOCK(tfp); 1742 if (tfp->f_type == DTYPE_SOCKET && 1743 tfp->f_data != NULL) { 1744 FILE_UNLOCK(tfp); 1745 sorflush(tfp->f_data); 1746 } else { 1747 FILE_UNLOCK(tfp); 1748 } 1749 } 1750 for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp) { 1751 closef(*fpp, (struct thread *) NULL); 1752 unp_recycled++; 1753 } 1754 free(extra_ref, M_TEMP); 1755} 1756 1757void 1758unp_dispose(struct mbuf *m) 1759{ 1760 1761 if (m) 1762 unp_scan(m, unp_discard); 1763} 1764 1765static int 1766unp_listen(struct socket *so, struct unpcb *unp, int backlog, 1767 struct thread *td) 1768{ 1769 int error; 1770 1771 UNP_LOCK_ASSERT(); 1772 1773 SOCK_LOCK(so); 1774 error = solisten_proto_check(so); 1775 if (error == 0) { 1776 cru2x(td->td_ucred, &unp->unp_peercred); 
1777 unp->unp_flags |= UNP_HAVEPCCACHED; 1778 solisten_proto(so, backlog); 1779 } 1780 SOCK_UNLOCK(so); 1781 return (error); 1782} 1783 1784static void 1785unp_scan(struct mbuf *m0, void (*op)(struct file *)) 1786{ 1787 struct mbuf *m; 1788 struct file **rp; 1789 struct cmsghdr *cm; 1790 void *data; 1791 int i; 1792 socklen_t clen, datalen; 1793 int qfds; 1794 1795 while (m0 != NULL) { 1796 for (m = m0; m; m = m->m_next) { 1797 if (m->m_type != MT_CONTROL) 1798 continue; 1799 1800 cm = mtod(m, struct cmsghdr *); 1801 clen = m->m_len; 1802 1803 while (cm != NULL) { 1804 if (sizeof(*cm) > clen || cm->cmsg_len > clen) 1805 break; 1806 1807 data = CMSG_DATA(cm); 1808 datalen = (caddr_t)cm + cm->cmsg_len 1809 - (caddr_t)data; 1810 1811 if (cm->cmsg_level == SOL_SOCKET && 1812 cm->cmsg_type == SCM_RIGHTS) { 1813 qfds = datalen / sizeof (struct file *); 1814 rp = data; 1815 for (i = 0; i < qfds; i++) 1816 (*op)(*rp++); 1817 } 1818 1819 if (CMSG_SPACE(datalen) < clen) { 1820 clen -= CMSG_SPACE(datalen); 1821 cm = (struct cmsghdr *) 1822 ((caddr_t)cm + CMSG_SPACE(datalen)); 1823 } else { 1824 clen = 0; 1825 cm = NULL; 1826 } 1827 } 1828 } 1829 m0 = m0->m_act; 1830 } 1831} 1832 1833static void 1834unp_mark(struct file *fp) 1835{ 1836 if (fp->f_gcflag & FMARK) 1837 return; 1838 unp_defer++; 1839 fp->f_gcflag |= (FMARK|FDEFER); 1840} 1841 1842static void 1843unp_discard(struct file *fp) 1844{ 1845 UNP_LOCK(); 1846 FILE_LOCK(fp); 1847 fp->f_msgcount--; 1848 unp_rights--; 1849 FILE_UNLOCK(fp); 1850 UNP_UNLOCK(); 1851 (void) closef(fp, (struct thread *)NULL); 1852} 1853