uipc_syscalls.c revision 321021
1/*- 2 * Copyright (c) 1982, 1986, 1989, 1990, 1993 3 * The Regents of the University of California. All rights reserved. 4 * 5 * sendfile(2) and related extensions: 6 * Copyright (c) 1998, David Greenman. All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 4. Neither the name of the University nor the names of its contributors 17 * may be used to endorse or promote products derived from this software 18 * without specific prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 * SUCH DAMAGE. 
31 * 32 * @(#)uipc_syscalls.c 8.4 (Berkeley) 2/21/94 33 */ 34 35#include <sys/cdefs.h> 36__FBSDID("$FreeBSD: stable/10/sys/kern/uipc_syscalls.c 321021 2017-07-15 17:28:03Z dchagin $"); 37 38#include "opt_capsicum.h" 39#include "opt_inet.h" 40#include "opt_inet6.h" 41#include "opt_compat.h" 42#include "opt_ktrace.h" 43 44#include <sys/param.h> 45#include <sys/systm.h> 46#include <sys/capsicum.h> 47#include <sys/condvar.h> 48#include <sys/kernel.h> 49#include <sys/lock.h> 50#include <sys/mutex.h> 51#include <sys/sysproto.h> 52#include <sys/malloc.h> 53#include <sys/filedesc.h> 54#include <sys/event.h> 55#include <sys/proc.h> 56#include <sys/fcntl.h> 57#include <sys/file.h> 58#include <sys/filio.h> 59#include <sys/jail.h> 60#include <sys/mman.h> 61#include <sys/mount.h> 62#include <sys/mbuf.h> 63#include <sys/protosw.h> 64#include <sys/rwlock.h> 65#include <sys/sf_buf.h> 66#include <sys/sysent.h> 67#include <sys/socket.h> 68#include <sys/socketvar.h> 69#include <sys/signalvar.h> 70#include <sys/syscallsubr.h> 71#include <sys/sysctl.h> 72#include <sys/uio.h> 73#include <sys/vnode.h> 74#ifdef KTRACE 75#include <sys/ktrace.h> 76#endif 77#ifdef COMPAT_FREEBSD32 78#include <compat/freebsd32/freebsd32_util.h> 79#endif 80 81#include <net/vnet.h> 82 83#include <security/audit/audit.h> 84#include <security/mac/mac_framework.h> 85 86#include <vm/vm.h> 87#include <vm/vm_param.h> 88#include <vm/vm_object.h> 89#include <vm/vm_page.h> 90#include <vm/vm_pager.h> 91#include <vm/vm_kern.h> 92#include <vm/vm_extern.h> 93 94/* 95 * Flags for accept1() and kern_accept4(), in addition to SOCK_CLOEXEC 96 * and SOCK_NONBLOCK. 
97 */ 98#define ACCEPT4_INHERIT 0x1 99#define ACCEPT4_COMPAT 0x2 100 101static int sendit(struct thread *td, int s, struct msghdr *mp, int flags); 102static int recvit(struct thread *td, int s, struct msghdr *mp, void *namelenp); 103 104static int accept1(struct thread *td, int s, struct sockaddr *uname, 105 socklen_t *anamelen, int flags); 106static int do_sendfile(struct thread *td, struct sendfile_args *uap, 107 int compat); 108static int getsockname1(struct thread *td, struct getsockname_args *uap, 109 int compat); 110static int getpeername1(struct thread *td, struct getpeername_args *uap, 111 int compat); 112 113counter_u64_t sfstat[sizeof(struct sfstat) / sizeof(uint64_t)]; 114 115/* 116 * sendfile(2)-related variables and associated sysctls 117 */ 118static SYSCTL_NODE(_kern_ipc, OID_AUTO, sendfile, CTLFLAG_RW, 0, 119 "sendfile(2) tunables"); 120static int sfreadahead = 1; 121SYSCTL_INT(_kern_ipc_sendfile, OID_AUTO, readahead, CTLFLAG_RW, 122 &sfreadahead, 0, "Number of sendfile(2) read-ahead MAXBSIZE blocks"); 123 124 125static void 126sfstat_init(const void *unused) 127{ 128 129 COUNTER_ARRAY_ALLOC(sfstat, sizeof(struct sfstat) / sizeof(uint64_t), 130 M_WAITOK); 131} 132SYSINIT(sfstat, SI_SUB_MBUF, SI_ORDER_FIRST, sfstat_init, NULL); 133 134static int 135sfstat_sysctl(SYSCTL_HANDLER_ARGS) 136{ 137 struct sfstat s; 138 139 COUNTER_ARRAY_COPY(sfstat, &s, sizeof(s) / sizeof(uint64_t)); 140 if (req->newptr) 141 COUNTER_ARRAY_ZERO(sfstat, sizeof(s) / sizeof(uint64_t)); 142 return (SYSCTL_OUT(req, &s, sizeof(s))); 143} 144SYSCTL_PROC(_kern_ipc, OID_AUTO, sfstat, CTLTYPE_OPAQUE | CTLFLAG_RW, 145 NULL, 0, sfstat_sysctl, "I", "sendfile statistics"); 146 147/* 148 * Convert a user file descriptor to a kernel file entry and check if required 149 * capability rights are present. 150 * A reference on the file entry is held upon returning. 
151 */ 152int 153getsock_cap(struct thread *td, int fd, cap_rights_t *rightsp, 154 struct file **fpp, u_int *fflagp) 155{ 156 struct file *fp; 157 int error; 158 159 error = fget_unlocked(td->td_proc->p_fd, fd, rightsp, 0, &fp, NULL); 160 if (error != 0) 161 return (error); 162 if (fp->f_type != DTYPE_SOCKET) { 163 fdrop(fp, td); 164 return (ENOTSOCK); 165 } 166 if (fflagp != NULL) 167 *fflagp = fp->f_flag; 168 *fpp = fp; 169 return (0); 170} 171 172/* 173 * System call interface to the socket abstraction. 174 */ 175#if defined(COMPAT_43) 176#define COMPAT_OLDSOCK 177#endif 178 179int 180sys_socket(td, uap) 181 struct thread *td; 182 struct socket_args /* { 183 int domain; 184 int type; 185 int protocol; 186 } */ *uap; 187{ 188 struct socket *so; 189 struct file *fp; 190 int fd, error, type, oflag, fflag; 191 192 AUDIT_ARG_SOCKET(uap->domain, uap->type, uap->protocol); 193 194 type = uap->type; 195 oflag = 0; 196 fflag = 0; 197 if ((type & SOCK_CLOEXEC) != 0) { 198 type &= ~SOCK_CLOEXEC; 199 oflag |= O_CLOEXEC; 200 } 201 if ((type & SOCK_NONBLOCK) != 0) { 202 type &= ~SOCK_NONBLOCK; 203 fflag |= FNONBLOCK; 204 } 205 206#ifdef MAC 207 error = mac_socket_check_create(td->td_ucred, uap->domain, type, 208 uap->protocol); 209 if (error != 0) 210 return (error); 211#endif 212 error = falloc(td, &fp, &fd, oflag); 213 if (error != 0) 214 return (error); 215 /* An extra reference on `fp' has been held for us by falloc(). 
*/ 216 error = socreate(uap->domain, &so, type, uap->protocol, 217 td->td_ucred, td); 218 if (error != 0) { 219 fdclose(td, fp, fd); 220 } else { 221 finit(fp, FREAD | FWRITE | fflag, DTYPE_SOCKET, so, &socketops); 222 if ((fflag & FNONBLOCK) != 0) 223 (void) fo_ioctl(fp, FIONBIO, &fflag, td->td_ucred, td); 224 td->td_retval[0] = fd; 225 } 226 fdrop(fp, td); 227 return (error); 228} 229 230/* ARGSUSED */ 231int 232sys_bind(td, uap) 233 struct thread *td; 234 struct bind_args /* { 235 int s; 236 caddr_t name; 237 int namelen; 238 } */ *uap; 239{ 240 struct sockaddr *sa; 241 int error; 242 243 error = getsockaddr(&sa, uap->name, uap->namelen); 244 if (error == 0) { 245 error = kern_bind(td, uap->s, sa); 246 free(sa, M_SONAME); 247 } 248 return (error); 249} 250 251static int 252kern_bindat(struct thread *td, int dirfd, int fd, struct sockaddr *sa) 253{ 254 struct socket *so; 255 struct file *fp; 256 cap_rights_t rights; 257 int error; 258 259 AUDIT_ARG_FD(fd); 260 AUDIT_ARG_SOCKADDR(td, dirfd, sa); 261 error = getsock_cap(td, fd, cap_rights_init(&rights, CAP_BIND), 262 &fp, NULL); 263 if (error != 0) 264 return (error); 265 so = fp->f_data; 266#ifdef KTRACE 267 if (KTRPOINT(td, KTR_STRUCT)) 268 ktrsockaddr(sa); 269#endif 270#ifdef MAC 271 error = mac_socket_check_bind(td->td_ucred, so, sa); 272 if (error == 0) { 273#endif 274 if (dirfd == AT_FDCWD) 275 error = sobind(so, sa, td); 276 else 277 error = sobindat(dirfd, so, sa, td); 278#ifdef MAC 279 } 280#endif 281 fdrop(fp, td); 282 return (error); 283} 284 285int 286kern_bind(struct thread *td, int fd, struct sockaddr *sa) 287{ 288 289 return (kern_bindat(td, AT_FDCWD, fd, sa)); 290} 291 292/* ARGSUSED */ 293int 294sys_bindat(td, uap) 295 struct thread *td; 296 struct bindat_args /* { 297 int fd; 298 int s; 299 caddr_t name; 300 int namelen; 301 } */ *uap; 302{ 303 struct sockaddr *sa; 304 int error; 305 306 error = getsockaddr(&sa, uap->name, uap->namelen); 307 if (error == 0) { 308 error = kern_bindat(td, uap->fd, 
uap->s, sa); 309 free(sa, M_SONAME); 310 } 311 return (error); 312} 313 314/* ARGSUSED */ 315int 316sys_listen(td, uap) 317 struct thread *td; 318 struct listen_args /* { 319 int s; 320 int backlog; 321 } */ *uap; 322{ 323 struct socket *so; 324 struct file *fp; 325 cap_rights_t rights; 326 int error; 327 328 AUDIT_ARG_FD(uap->s); 329 error = getsock_cap(td, uap->s, cap_rights_init(&rights, CAP_LISTEN), 330 &fp, NULL); 331 if (error == 0) { 332 so = fp->f_data; 333#ifdef MAC 334 error = mac_socket_check_listen(td->td_ucred, so); 335 if (error == 0) 336#endif 337 error = solisten(so, uap->backlog, td); 338 fdrop(fp, td); 339 } 340 return(error); 341} 342 343/* 344 * accept1() 345 */ 346static int 347accept1(td, s, uname, anamelen, flags) 348 struct thread *td; 349 int s; 350 struct sockaddr *uname; 351 socklen_t *anamelen; 352 int flags; 353{ 354 struct sockaddr *name; 355 socklen_t namelen; 356 struct file *fp; 357 int error; 358 359 if (uname == NULL) 360 return (kern_accept4(td, s, NULL, NULL, flags, NULL)); 361 362 error = copyin(anamelen, &namelen, sizeof (namelen)); 363 if (error != 0) 364 return (error); 365 366 error = kern_accept4(td, s, &name, &namelen, flags, &fp); 367 368 /* 369 * return a namelen of zero for older code which might 370 * ignore the return value from accept. 
371 */ 372 if (error != 0) { 373 (void) copyout(&namelen, anamelen, sizeof(*anamelen)); 374 return (error); 375 } 376 377 if (error == 0 && uname != NULL) { 378#ifdef COMPAT_OLDSOCK 379 if (flags & ACCEPT4_COMPAT) 380 ((struct osockaddr *)name)->sa_family = 381 name->sa_family; 382#endif 383 error = copyout(name, uname, namelen); 384 } 385 if (error == 0) 386 error = copyout(&namelen, anamelen, 387 sizeof(namelen)); 388 if (error != 0) 389 fdclose(td, fp, td->td_retval[0]); 390 fdrop(fp, td); 391 free(name, M_SONAME); 392 return (error); 393} 394 395int 396kern_accept(struct thread *td, int s, struct sockaddr **name, 397 socklen_t *namelen, struct file **fp) 398{ 399 return (kern_accept4(td, s, name, namelen, ACCEPT4_INHERIT, fp)); 400} 401 402int 403kern_accept4(struct thread *td, int s, struct sockaddr **name, 404 socklen_t *namelen, int flags, struct file **fp) 405{ 406 struct file *headfp, *nfp = NULL; 407 struct sockaddr *sa = NULL; 408 struct socket *head, *so; 409 cap_rights_t rights; 410 u_int fflag; 411 pid_t pgid; 412 int error, fd, tmp; 413 414 if (name != NULL) 415 *name = NULL; 416 417 AUDIT_ARG_FD(s); 418 error = getsock_cap(td, s, cap_rights_init(&rights, CAP_ACCEPT), 419 &headfp, &fflag); 420 if (error != 0) 421 return (error); 422 head = headfp->f_data; 423 if ((head->so_options & SO_ACCEPTCONN) == 0) { 424 error = EINVAL; 425 goto done; 426 } 427#ifdef MAC 428 error = mac_socket_check_accept(td->td_ucred, head); 429 if (error != 0) 430 goto done; 431#endif 432 error = falloc(td, &nfp, &fd, (flags & SOCK_CLOEXEC) ? 
O_CLOEXEC : 0); 433 if (error != 0) 434 goto done; 435 ACCEPT_LOCK(); 436 if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->so_comp)) { 437 ACCEPT_UNLOCK(); 438 error = EWOULDBLOCK; 439 goto noconnection; 440 } 441 while (TAILQ_EMPTY(&head->so_comp) && head->so_error == 0) { 442 if (head->so_rcv.sb_state & SBS_CANTRCVMORE) { 443 head->so_error = ECONNABORTED; 444 break; 445 } 446 error = msleep(&head->so_timeo, &accept_mtx, PSOCK | PCATCH, 447 "accept", 0); 448 if (error != 0) { 449 ACCEPT_UNLOCK(); 450 goto noconnection; 451 } 452 } 453 if (head->so_error) { 454 error = head->so_error; 455 head->so_error = 0; 456 ACCEPT_UNLOCK(); 457 goto noconnection; 458 } 459 so = TAILQ_FIRST(&head->so_comp); 460 KASSERT(!(so->so_qstate & SQ_INCOMP), ("accept1: so SQ_INCOMP")); 461 KASSERT(so->so_qstate & SQ_COMP, ("accept1: so not SQ_COMP")); 462 463 /* 464 * Before changing the flags on the socket, we have to bump the 465 * reference count. Otherwise, if the protocol calls sofree(), 466 * the socket will be released due to a zero refcount. 467 */ 468 SOCK_LOCK(so); /* soref() and so_state update */ 469 soref(so); /* file descriptor reference */ 470 471 TAILQ_REMOVE(&head->so_comp, so, so_list); 472 head->so_qlen--; 473 if (flags & ACCEPT4_INHERIT) 474 so->so_state |= (head->so_state & SS_NBIO); 475 else 476 so->so_state |= (flags & SOCK_NONBLOCK) ? SS_NBIO : 0; 477 so->so_qstate &= ~SQ_COMP; 478 so->so_head = NULL; 479 480 SOCK_UNLOCK(so); 481 ACCEPT_UNLOCK(); 482 483 /* An extra reference on `nfp' has been held for us by falloc(). 
*/ 484 td->td_retval[0] = fd; 485 486 /* connection has been removed from the listen queue */ 487 KNOTE_UNLOCKED(&head->so_rcv.sb_sel.si_note, 0); 488 489 if (flags & ACCEPT4_INHERIT) { 490 pgid = fgetown(&head->so_sigio); 491 if (pgid != 0) 492 fsetown(pgid, &so->so_sigio); 493 } else { 494 fflag &= ~(FNONBLOCK | FASYNC); 495 if (flags & SOCK_NONBLOCK) 496 fflag |= FNONBLOCK; 497 } 498 499 finit(nfp, fflag, DTYPE_SOCKET, so, &socketops); 500 /* Sync socket nonblocking/async state with file flags */ 501 tmp = fflag & FNONBLOCK; 502 (void) fo_ioctl(nfp, FIONBIO, &tmp, td->td_ucred, td); 503 tmp = fflag & FASYNC; 504 (void) fo_ioctl(nfp, FIOASYNC, &tmp, td->td_ucred, td); 505 sa = 0; 506 error = soaccept(so, &sa); 507 if (error != 0) { 508 /* 509 * return a namelen of zero for older code which might 510 * ignore the return value from accept. 511 */ 512 if (name) 513 *namelen = 0; 514 goto noconnection; 515 } 516 if (sa == NULL) { 517 if (name) 518 *namelen = 0; 519 goto done; 520 } 521 AUDIT_ARG_SOCKADDR(td, AT_FDCWD, sa); 522 if (name) { 523 /* check sa_len before it is destroyed */ 524 if (*namelen > sa->sa_len) 525 *namelen = sa->sa_len; 526#ifdef KTRACE 527 if (KTRPOINT(td, KTR_STRUCT)) 528 ktrsockaddr(sa); 529#endif 530 *name = sa; 531 sa = NULL; 532 } 533noconnection: 534 free(sa, M_SONAME); 535 536 /* 537 * close the new descriptor, assuming someone hasn't ripped it 538 * out from under us. 539 */ 540 if (error != 0) 541 fdclose(td, nfp, fd); 542 543 /* 544 * Release explicitly held references before returning. We return 545 * a reference on nfp to the caller on success if they request it. 
546 */ 547done: 548 if (fp != NULL) { 549 if (error == 0) { 550 *fp = nfp; 551 nfp = NULL; 552 } else 553 *fp = NULL; 554 } 555 if (nfp != NULL) 556 fdrop(nfp, td); 557 fdrop(headfp, td); 558 return (error); 559} 560 561int 562sys_accept(td, uap) 563 struct thread *td; 564 struct accept_args *uap; 565{ 566 567 return (accept1(td, uap->s, uap->name, uap->anamelen, ACCEPT4_INHERIT)); 568} 569 570int 571sys_accept4(td, uap) 572 struct thread *td; 573 struct accept4_args *uap; 574{ 575 576 if (uap->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) 577 return (EINVAL); 578 579 return (accept1(td, uap->s, uap->name, uap->anamelen, uap->flags)); 580} 581 582#ifdef COMPAT_OLDSOCK 583int 584oaccept(td, uap) 585 struct thread *td; 586 struct accept_args *uap; 587{ 588 589 return (accept1(td, uap->s, uap->name, uap->anamelen, 590 ACCEPT4_INHERIT | ACCEPT4_COMPAT)); 591} 592#endif /* COMPAT_OLDSOCK */ 593 594/* ARGSUSED */ 595int 596sys_connect(td, uap) 597 struct thread *td; 598 struct connect_args /* { 599 int s; 600 caddr_t name; 601 int namelen; 602 } */ *uap; 603{ 604 struct sockaddr *sa; 605 int error; 606 607 error = getsockaddr(&sa, uap->name, uap->namelen); 608 if (error == 0) { 609 error = kern_connect(td, uap->s, sa); 610 free(sa, M_SONAME); 611 } 612 return (error); 613} 614 615static int 616kern_connectat(struct thread *td, int dirfd, int fd, struct sockaddr *sa) 617{ 618 struct socket *so; 619 struct file *fp; 620 cap_rights_t rights; 621 int error, interrupted = 0; 622 623 AUDIT_ARG_FD(fd); 624 AUDIT_ARG_SOCKADDR(td, dirfd, sa); 625 error = getsock_cap(td, fd, cap_rights_init(&rights, CAP_CONNECT), 626 &fp, NULL); 627 if (error != 0) 628 return (error); 629 so = fp->f_data; 630 if (so->so_state & SS_ISCONNECTING) { 631 error = EALREADY; 632 goto done1; 633 } 634#ifdef KTRACE 635 if (KTRPOINT(td, KTR_STRUCT)) 636 ktrsockaddr(sa); 637#endif 638#ifdef MAC 639 error = mac_socket_check_connect(td->td_ucred, so, sa); 640 if (error != 0) 641 goto bad; 642#endif 643 if (dirfd == 
AT_FDCWD) 644 error = soconnect(so, sa, td); 645 else 646 error = soconnectat(dirfd, so, sa, td); 647 if (error != 0) 648 goto bad; 649 if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) { 650 error = EINPROGRESS; 651 goto done1; 652 } 653 SOCK_LOCK(so); 654 while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) { 655 error = msleep(&so->so_timeo, SOCK_MTX(so), PSOCK | PCATCH, 656 "connec", 0); 657 if (error != 0) { 658 if (error == EINTR || error == ERESTART) 659 interrupted = 1; 660 break; 661 } 662 } 663 if (error == 0) { 664 error = so->so_error; 665 so->so_error = 0; 666 } 667 SOCK_UNLOCK(so); 668bad: 669 if (!interrupted) 670 so->so_state &= ~SS_ISCONNECTING; 671 if (error == ERESTART) 672 error = EINTR; 673done1: 674 fdrop(fp, td); 675 return (error); 676} 677 678int 679kern_connect(struct thread *td, int fd, struct sockaddr *sa) 680{ 681 682 return (kern_connectat(td, AT_FDCWD, fd, sa)); 683} 684 685/* ARGSUSED */ 686int 687sys_connectat(td, uap) 688 struct thread *td; 689 struct connectat_args /* { 690 int fd; 691 int s; 692 caddr_t name; 693 int namelen; 694 } */ *uap; 695{ 696 struct sockaddr *sa; 697 int error; 698 699 error = getsockaddr(&sa, uap->name, uap->namelen); 700 if (error == 0) { 701 error = kern_connectat(td, uap->fd, uap->s, sa); 702 free(sa, M_SONAME); 703 } 704 return (error); 705} 706 707int 708kern_socketpair(struct thread *td, int domain, int type, int protocol, 709 int *rsv) 710{ 711 struct file *fp1, *fp2; 712 struct socket *so1, *so2; 713 int fd, error, oflag, fflag; 714 715 AUDIT_ARG_SOCKET(domain, type, protocol); 716 717 oflag = 0; 718 fflag = 0; 719 if ((type & SOCK_CLOEXEC) != 0) { 720 type &= ~SOCK_CLOEXEC; 721 oflag |= O_CLOEXEC; 722 } 723 if ((type & SOCK_NONBLOCK) != 0) { 724 type &= ~SOCK_NONBLOCK; 725 fflag |= FNONBLOCK; 726 } 727#ifdef MAC 728 /* We might want to have a separate check for socket pairs. 
*/ 729 error = mac_socket_check_create(td->td_ucred, domain, type, 730 protocol); 731 if (error != 0) 732 return (error); 733#endif 734 error = socreate(domain, &so1, type, protocol, td->td_ucred, td); 735 if (error != 0) 736 return (error); 737 error = socreate(domain, &so2, type, protocol, td->td_ucred, td); 738 if (error != 0) 739 goto free1; 740 /* On success extra reference to `fp1' and 'fp2' is set by falloc. */ 741 error = falloc(td, &fp1, &fd, oflag); 742 if (error != 0) 743 goto free2; 744 rsv[0] = fd; 745 fp1->f_data = so1; /* so1 already has ref count */ 746 error = falloc(td, &fp2, &fd, oflag); 747 if (error != 0) 748 goto free3; 749 fp2->f_data = so2; /* so2 already has ref count */ 750 rsv[1] = fd; 751 error = soconnect2(so1, so2); 752 if (error != 0) 753 goto free4; 754 if (type == SOCK_DGRAM) { 755 /* 756 * Datagram socket connection is asymmetric. 757 */ 758 error = soconnect2(so2, so1); 759 if (error != 0) 760 goto free4; 761 } 762 finit(fp1, FREAD | FWRITE | fflag, DTYPE_SOCKET, fp1->f_data, 763 &socketops); 764 finit(fp2, FREAD | FWRITE | fflag, DTYPE_SOCKET, fp2->f_data, 765 &socketops); 766 if ((fflag & FNONBLOCK) != 0) { 767 (void) fo_ioctl(fp1, FIONBIO, &fflag, td->td_ucred, td); 768 (void) fo_ioctl(fp2, FIONBIO, &fflag, td->td_ucred, td); 769 } 770 fdrop(fp1, td); 771 fdrop(fp2, td); 772 return (0); 773free4: 774 fdclose(td, fp2, rsv[1]); 775 fdrop(fp2, td); 776free3: 777 fdclose(td, fp1, rsv[0]); 778 fdrop(fp1, td); 779free2: 780 if (so2 != NULL) 781 (void)soclose(so2); 782free1: 783 if (so1 != NULL) 784 (void)soclose(so1); 785 return (error); 786} 787 788int 789sys_socketpair(struct thread *td, struct socketpair_args *uap) 790{ 791 int error, sv[2]; 792 793 error = kern_socketpair(td, uap->domain, uap->type, 794 uap->protocol, sv); 795 if (error != 0) 796 return (error); 797 error = copyout(sv, uap->rsv, 2 * sizeof(int)); 798 if (error != 0) { 799 (void)kern_close(td, sv[0]); 800 (void)kern_close(td, sv[1]); 801 } 802 return (error); 803} 
804 805static int 806sendit(td, s, mp, flags) 807 struct thread *td; 808 int s; 809 struct msghdr *mp; 810 int flags; 811{ 812 struct mbuf *control; 813 struct sockaddr *to; 814 int error; 815 816#ifdef CAPABILITY_MODE 817 if (IN_CAPABILITY_MODE(td) && (mp->msg_name != NULL)) 818 return (ECAPMODE); 819#endif 820 821 if (mp->msg_name != NULL) { 822 error = getsockaddr(&to, mp->msg_name, mp->msg_namelen); 823 if (error != 0) { 824 to = NULL; 825 goto bad; 826 } 827 mp->msg_name = to; 828 } else { 829 to = NULL; 830 } 831 832 if (mp->msg_control) { 833 if (mp->msg_controllen < sizeof(struct cmsghdr) 834#ifdef COMPAT_OLDSOCK 835 && mp->msg_flags != MSG_COMPAT 836#endif 837 ) { 838 error = EINVAL; 839 goto bad; 840 } 841 error = sockargs(&control, mp->msg_control, 842 mp->msg_controllen, MT_CONTROL); 843 if (error != 0) 844 goto bad; 845#ifdef COMPAT_OLDSOCK 846 if (mp->msg_flags == MSG_COMPAT) { 847 struct cmsghdr *cm; 848 849 M_PREPEND(control, sizeof(*cm), M_WAITOK); 850 cm = mtod(control, struct cmsghdr *); 851 cm->cmsg_len = control->m_len; 852 cm->cmsg_level = SOL_SOCKET; 853 cm->cmsg_type = SCM_RIGHTS; 854 } 855#endif 856 } else { 857 control = NULL; 858 } 859 860 error = kern_sendit(td, s, mp, flags, control, UIO_USERSPACE); 861 862bad: 863 free(to, M_SONAME); 864 return (error); 865} 866 867int 868kern_sendit(td, s, mp, flags, control, segflg) 869 struct thread *td; 870 int s; 871 struct msghdr *mp; 872 int flags; 873 struct mbuf *control; 874 enum uio_seg segflg; 875{ 876 struct file *fp; 877 struct uio auio; 878 struct iovec *iov; 879 struct socket *so; 880 cap_rights_t rights; 881#ifdef KTRACE 882 struct uio *ktruio = NULL; 883#endif 884 ssize_t len; 885 int i, error; 886 887 AUDIT_ARG_FD(s); 888 cap_rights_init(&rights, CAP_SEND); 889 if (mp->msg_name != NULL) { 890 AUDIT_ARG_SOCKADDR(td, AT_FDCWD, mp->msg_name); 891 cap_rights_set(&rights, CAP_CONNECT); 892 } 893 error = getsock_cap(td, s, &rights, &fp, NULL); 894 if (error != 0) 895 return (error); 896 so 
= (struct socket *)fp->f_data; 897 898#ifdef KTRACE 899 if (mp->msg_name != NULL && KTRPOINT(td, KTR_STRUCT)) 900 ktrsockaddr(mp->msg_name); 901#endif 902#ifdef MAC 903 if (mp->msg_name != NULL) { 904 error = mac_socket_check_connect(td->td_ucred, so, 905 mp->msg_name); 906 if (error != 0) 907 goto bad; 908 } 909 error = mac_socket_check_send(td->td_ucred, so); 910 if (error != 0) 911 goto bad; 912#endif 913 914 auio.uio_iov = mp->msg_iov; 915 auio.uio_iovcnt = mp->msg_iovlen; 916 auio.uio_segflg = segflg; 917 auio.uio_rw = UIO_WRITE; 918 auio.uio_td = td; 919 auio.uio_offset = 0; /* XXX */ 920 auio.uio_resid = 0; 921 iov = mp->msg_iov; 922 for (i = 0; i < mp->msg_iovlen; i++, iov++) { 923 if ((auio.uio_resid += iov->iov_len) < 0) { 924 error = EINVAL; 925 goto bad; 926 } 927 } 928#ifdef KTRACE 929 if (KTRPOINT(td, KTR_GENIO)) 930 ktruio = cloneuio(&auio); 931#endif 932 len = auio.uio_resid; 933 error = sosend(so, mp->msg_name, &auio, 0, control, flags, td); 934 if (error != 0) { 935 if (auio.uio_resid != len && (error == ERESTART || 936 error == EINTR || error == EWOULDBLOCK)) 937 error = 0; 938 /* Generation of SIGPIPE can be controlled per socket */ 939 if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) && 940 !(flags & MSG_NOSIGNAL)) { 941 PROC_LOCK(td->td_proc); 942 tdsignal(td, SIGPIPE); 943 PROC_UNLOCK(td->td_proc); 944 } 945 } 946 if (error == 0) 947 td->td_retval[0] = len - auio.uio_resid; 948#ifdef KTRACE 949 if (ktruio != NULL) { 950 ktruio->uio_resid = td->td_retval[0]; 951 ktrgenio(s, UIO_WRITE, ktruio, error); 952 } 953#endif 954bad: 955 fdrop(fp, td); 956 return (error); 957} 958 959int 960sys_sendto(td, uap) 961 struct thread *td; 962 struct sendto_args /* { 963 int s; 964 caddr_t buf; 965 size_t len; 966 int flags; 967 caddr_t to; 968 int tolen; 969 } */ *uap; 970{ 971 struct msghdr msg; 972 struct iovec aiov; 973 974 msg.msg_name = uap->to; 975 msg.msg_namelen = uap->tolen; 976 msg.msg_iov = &aiov; 977 msg.msg_iovlen = 1; 978 msg.msg_control = 
0; 979#ifdef COMPAT_OLDSOCK 980 msg.msg_flags = 0; 981#endif 982 aiov.iov_base = uap->buf; 983 aiov.iov_len = uap->len; 984 return (sendit(td, uap->s, &msg, uap->flags)); 985} 986 987#ifdef COMPAT_OLDSOCK 988int 989osend(td, uap) 990 struct thread *td; 991 struct osend_args /* { 992 int s; 993 caddr_t buf; 994 int len; 995 int flags; 996 } */ *uap; 997{ 998 struct msghdr msg; 999 struct iovec aiov; 1000 1001 msg.msg_name = 0; 1002 msg.msg_namelen = 0; 1003 msg.msg_iov = &aiov; 1004 msg.msg_iovlen = 1; 1005 aiov.iov_base = uap->buf; 1006 aiov.iov_len = uap->len; 1007 msg.msg_control = 0; 1008 msg.msg_flags = 0; 1009 return (sendit(td, uap->s, &msg, uap->flags)); 1010} 1011 1012int 1013osendmsg(td, uap) 1014 struct thread *td; 1015 struct osendmsg_args /* { 1016 int s; 1017 caddr_t msg; 1018 int flags; 1019 } */ *uap; 1020{ 1021 struct msghdr msg; 1022 struct iovec *iov; 1023 int error; 1024 1025 error = copyin(uap->msg, &msg, sizeof (struct omsghdr)); 1026 if (error != 0) 1027 return (error); 1028 error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE); 1029 if (error != 0) 1030 return (error); 1031 msg.msg_iov = iov; 1032 msg.msg_flags = MSG_COMPAT; 1033 error = sendit(td, uap->s, &msg, uap->flags); 1034 free(iov, M_IOV); 1035 return (error); 1036} 1037#endif 1038 1039int 1040sys_sendmsg(td, uap) 1041 struct thread *td; 1042 struct sendmsg_args /* { 1043 int s; 1044 caddr_t msg; 1045 int flags; 1046 } */ *uap; 1047{ 1048 struct msghdr msg; 1049 struct iovec *iov; 1050 int error; 1051 1052 error = copyin(uap->msg, &msg, sizeof (msg)); 1053 if (error != 0) 1054 return (error); 1055 error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE); 1056 if (error != 0) 1057 return (error); 1058 msg.msg_iov = iov; 1059#ifdef COMPAT_OLDSOCK 1060 msg.msg_flags = 0; 1061#endif 1062 error = sendit(td, uap->s, &msg, uap->flags); 1063 free(iov, M_IOV); 1064 return (error); 1065} 1066 1067int 1068kern_recvit(td, s, mp, fromseg, controlp) 1069 struct thread *td; 1070 int 
s; 1071 struct msghdr *mp; 1072 enum uio_seg fromseg; 1073 struct mbuf **controlp; 1074{ 1075 struct uio auio; 1076 struct iovec *iov; 1077 struct mbuf *m, *control = NULL; 1078 caddr_t ctlbuf; 1079 struct file *fp; 1080 struct socket *so; 1081 struct sockaddr *fromsa = NULL; 1082 cap_rights_t rights; 1083#ifdef KTRACE 1084 struct uio *ktruio = NULL; 1085#endif 1086 ssize_t len; 1087 int error, i; 1088 1089 if (controlp != NULL) 1090 *controlp = NULL; 1091 1092 AUDIT_ARG_FD(s); 1093 error = getsock_cap(td, s, cap_rights_init(&rights, CAP_RECV), 1094 &fp, NULL); 1095 if (error != 0) 1096 return (error); 1097 so = fp->f_data; 1098 1099#ifdef MAC 1100 error = mac_socket_check_receive(td->td_ucred, so); 1101 if (error != 0) { 1102 fdrop(fp, td); 1103 return (error); 1104 } 1105#endif 1106 1107 auio.uio_iov = mp->msg_iov; 1108 auio.uio_iovcnt = mp->msg_iovlen; 1109 auio.uio_segflg = UIO_USERSPACE; 1110 auio.uio_rw = UIO_READ; 1111 auio.uio_td = td; 1112 auio.uio_offset = 0; /* XXX */ 1113 auio.uio_resid = 0; 1114 iov = mp->msg_iov; 1115 for (i = 0; i < mp->msg_iovlen; i++, iov++) { 1116 if ((auio.uio_resid += iov->iov_len) < 0) { 1117 fdrop(fp, td); 1118 return (EINVAL); 1119 } 1120 } 1121#ifdef KTRACE 1122 if (KTRPOINT(td, KTR_GENIO)) 1123 ktruio = cloneuio(&auio); 1124#endif 1125 len = auio.uio_resid; 1126 error = soreceive(so, &fromsa, &auio, NULL, 1127 (mp->msg_control || controlp) ? 
&control : NULL, 1128 &mp->msg_flags); 1129 if (error != 0) { 1130 if (auio.uio_resid != len && (error == ERESTART || 1131 error == EINTR || error == EWOULDBLOCK)) 1132 error = 0; 1133 } 1134 if (fromsa != NULL) 1135 AUDIT_ARG_SOCKADDR(td, AT_FDCWD, fromsa); 1136#ifdef KTRACE 1137 if (ktruio != NULL) { 1138 ktruio->uio_resid = len - auio.uio_resid; 1139 ktrgenio(s, UIO_READ, ktruio, error); 1140 } 1141#endif 1142 if (error != 0) 1143 goto out; 1144 td->td_retval[0] = len - auio.uio_resid; 1145 if (mp->msg_name) { 1146 len = mp->msg_namelen; 1147 if (len <= 0 || fromsa == NULL) 1148 len = 0; 1149 else { 1150 /* save sa_len before it is destroyed by MSG_COMPAT */ 1151 len = MIN(len, fromsa->sa_len); 1152#ifdef COMPAT_OLDSOCK 1153 if (mp->msg_flags & MSG_COMPAT) 1154 ((struct osockaddr *)fromsa)->sa_family = 1155 fromsa->sa_family; 1156#endif 1157 if (fromseg == UIO_USERSPACE) { 1158 error = copyout(fromsa, mp->msg_name, 1159 (unsigned)len); 1160 if (error != 0) 1161 goto out; 1162 } else 1163 bcopy(fromsa, mp->msg_name, len); 1164 } 1165 mp->msg_namelen = len; 1166 } 1167 if (mp->msg_control && controlp == NULL) { 1168#ifdef COMPAT_OLDSOCK 1169 /* 1170 * We assume that old recvmsg calls won't receive access 1171 * rights and other control info, esp. as control info 1172 * is always optional and those options didn't exist in 4.3. 1173 * If we receive rights, trim the cmsghdr; anything else 1174 * is tossed. 
1175 */ 1176 if (control && mp->msg_flags & MSG_COMPAT) { 1177 if (mtod(control, struct cmsghdr *)->cmsg_level != 1178 SOL_SOCKET || 1179 mtod(control, struct cmsghdr *)->cmsg_type != 1180 SCM_RIGHTS) { 1181 mp->msg_controllen = 0; 1182 goto out; 1183 } 1184 control->m_len -= sizeof (struct cmsghdr); 1185 control->m_data += sizeof (struct cmsghdr); 1186 } 1187#endif 1188 len = mp->msg_controllen; 1189 m = control; 1190 mp->msg_controllen = 0; 1191 ctlbuf = mp->msg_control; 1192 1193 while (m && len > 0) { 1194 unsigned int tocopy; 1195 1196 if (len >= m->m_len) 1197 tocopy = m->m_len; 1198 else { 1199 mp->msg_flags |= MSG_CTRUNC; 1200 tocopy = len; 1201 } 1202 1203 if ((error = copyout(mtod(m, caddr_t), 1204 ctlbuf, tocopy)) != 0) 1205 goto out; 1206 1207 ctlbuf += tocopy; 1208 len -= tocopy; 1209 m = m->m_next; 1210 } 1211 mp->msg_controllen = ctlbuf - (caddr_t)mp->msg_control; 1212 } 1213out: 1214 fdrop(fp, td); 1215#ifdef KTRACE 1216 if (fromsa && KTRPOINT(td, KTR_STRUCT)) 1217 ktrsockaddr(fromsa); 1218#endif 1219 free(fromsa, M_SONAME); 1220 1221 if (error == 0 && controlp != NULL) 1222 *controlp = control; 1223 else if (control) 1224 m_freem(control); 1225 1226 return (error); 1227} 1228 1229static int 1230recvit(td, s, mp, namelenp) 1231 struct thread *td; 1232 int s; 1233 struct msghdr *mp; 1234 void *namelenp; 1235{ 1236 int error; 1237 1238 error = kern_recvit(td, s, mp, UIO_USERSPACE, NULL); 1239 if (error != 0) 1240 return (error); 1241 if (namelenp != NULL) { 1242 error = copyout(&mp->msg_namelen, namelenp, sizeof (socklen_t)); 1243#ifdef COMPAT_OLDSOCK 1244 if (mp->msg_flags & MSG_COMPAT) 1245 error = 0; /* old recvfrom didn't check */ 1246#endif 1247 } 1248 return (error); 1249} 1250 1251int 1252sys_recvfrom(td, uap) 1253 struct thread *td; 1254 struct recvfrom_args /* { 1255 int s; 1256 caddr_t buf; 1257 size_t len; 1258 int flags; 1259 struct sockaddr * __restrict from; 1260 socklen_t * __restrict fromlenaddr; 1261 } */ *uap; 1262{ 1263 struct 
msghdr msg; 1264 struct iovec aiov; 1265 int error; 1266 1267 if (uap->fromlenaddr) { 1268 error = copyin(uap->fromlenaddr, 1269 &msg.msg_namelen, sizeof (msg.msg_namelen)); 1270 if (error != 0) 1271 goto done2; 1272 } else { 1273 msg.msg_namelen = 0; 1274 } 1275 msg.msg_name = uap->from; 1276 msg.msg_iov = &aiov; 1277 msg.msg_iovlen = 1; 1278 aiov.iov_base = uap->buf; 1279 aiov.iov_len = uap->len; 1280 msg.msg_control = 0; 1281 msg.msg_flags = uap->flags; 1282 error = recvit(td, uap->s, &msg, uap->fromlenaddr); 1283done2: 1284 return (error); 1285} 1286 1287#ifdef COMPAT_OLDSOCK 1288int 1289orecvfrom(td, uap) 1290 struct thread *td; 1291 struct recvfrom_args *uap; 1292{ 1293 1294 uap->flags |= MSG_COMPAT; 1295 return (sys_recvfrom(td, uap)); 1296} 1297#endif 1298 1299#ifdef COMPAT_OLDSOCK 1300int 1301orecv(td, uap) 1302 struct thread *td; 1303 struct orecv_args /* { 1304 int s; 1305 caddr_t buf; 1306 int len; 1307 int flags; 1308 } */ *uap; 1309{ 1310 struct msghdr msg; 1311 struct iovec aiov; 1312 1313 msg.msg_name = 0; 1314 msg.msg_namelen = 0; 1315 msg.msg_iov = &aiov; 1316 msg.msg_iovlen = 1; 1317 aiov.iov_base = uap->buf; 1318 aiov.iov_len = uap->len; 1319 msg.msg_control = 0; 1320 msg.msg_flags = uap->flags; 1321 return (recvit(td, uap->s, &msg, NULL)); 1322} 1323 1324/* 1325 * Old recvmsg. This code takes advantage of the fact that the old msghdr 1326 * overlays the new one, missing only the flags, and with the (old) access 1327 * rights where the control fields are now. 
1328 */ 1329int 1330orecvmsg(td, uap) 1331 struct thread *td; 1332 struct orecvmsg_args /* { 1333 int s; 1334 struct omsghdr *msg; 1335 int flags; 1336 } */ *uap; 1337{ 1338 struct msghdr msg; 1339 struct iovec *iov; 1340 int error; 1341 1342 error = copyin(uap->msg, &msg, sizeof (struct omsghdr)); 1343 if (error != 0) 1344 return (error); 1345 error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE); 1346 if (error != 0) 1347 return (error); 1348 msg.msg_flags = uap->flags | MSG_COMPAT; 1349 msg.msg_iov = iov; 1350 error = recvit(td, uap->s, &msg, &uap->msg->msg_namelen); 1351 if (msg.msg_controllen && error == 0) 1352 error = copyout(&msg.msg_controllen, 1353 &uap->msg->msg_accrightslen, sizeof (int)); 1354 free(iov, M_IOV); 1355 return (error); 1356} 1357#endif 1358 1359int 1360sys_recvmsg(td, uap) 1361 struct thread *td; 1362 struct recvmsg_args /* { 1363 int s; 1364 struct msghdr *msg; 1365 int flags; 1366 } */ *uap; 1367{ 1368 struct msghdr msg; 1369 struct iovec *uiov, *iov; 1370 int error; 1371 1372 error = copyin(uap->msg, &msg, sizeof (msg)); 1373 if (error != 0) 1374 return (error); 1375 error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE); 1376 if (error != 0) 1377 return (error); 1378 msg.msg_flags = uap->flags; 1379#ifdef COMPAT_OLDSOCK 1380 msg.msg_flags &= ~MSG_COMPAT; 1381#endif 1382 uiov = msg.msg_iov; 1383 msg.msg_iov = iov; 1384 error = recvit(td, uap->s, &msg, NULL); 1385 if (error == 0) { 1386 msg.msg_iov = uiov; 1387 error = copyout(&msg, uap->msg, sizeof(msg)); 1388 } 1389 free(iov, M_IOV); 1390 return (error); 1391} 1392 1393/* ARGSUSED */ 1394int 1395sys_shutdown(td, uap) 1396 struct thread *td; 1397 struct shutdown_args /* { 1398 int s; 1399 int how; 1400 } */ *uap; 1401{ 1402 struct socket *so; 1403 struct file *fp; 1404 cap_rights_t rights; 1405 int error; 1406 1407 AUDIT_ARG_FD(uap->s); 1408 error = getsock_cap(td, uap->s, cap_rights_init(&rights, CAP_SHUTDOWN), 1409 &fp, NULL); 1410 if (error == 0) { 1411 so = 
fp->f_data; 1412 error = soshutdown(so, uap->how); 1413 fdrop(fp, td); 1414 } 1415 return (error); 1416} 1417 1418/* ARGSUSED */ 1419int 1420sys_setsockopt(td, uap) 1421 struct thread *td; 1422 struct setsockopt_args /* { 1423 int s; 1424 int level; 1425 int name; 1426 caddr_t val; 1427 int valsize; 1428 } */ *uap; 1429{ 1430 1431 return (kern_setsockopt(td, uap->s, uap->level, uap->name, 1432 uap->val, UIO_USERSPACE, uap->valsize)); 1433} 1434 1435int 1436kern_setsockopt(td, s, level, name, val, valseg, valsize) 1437 struct thread *td; 1438 int s; 1439 int level; 1440 int name; 1441 void *val; 1442 enum uio_seg valseg; 1443 socklen_t valsize; 1444{ 1445 struct socket *so; 1446 struct file *fp; 1447 struct sockopt sopt; 1448 cap_rights_t rights; 1449 int error; 1450 1451 if (val == NULL && valsize != 0) 1452 return (EFAULT); 1453 if ((int)valsize < 0) 1454 return (EINVAL); 1455 1456 sopt.sopt_dir = SOPT_SET; 1457 sopt.sopt_level = level; 1458 sopt.sopt_name = name; 1459 sopt.sopt_val = val; 1460 sopt.sopt_valsize = valsize; 1461 switch (valseg) { 1462 case UIO_USERSPACE: 1463 sopt.sopt_td = td; 1464 break; 1465 case UIO_SYSSPACE: 1466 sopt.sopt_td = NULL; 1467 break; 1468 default: 1469 panic("kern_setsockopt called with bad valseg"); 1470 } 1471 1472 AUDIT_ARG_FD(s); 1473 error = getsock_cap(td, s, cap_rights_init(&rights, CAP_SETSOCKOPT), 1474 &fp, NULL); 1475 if (error == 0) { 1476 so = fp->f_data; 1477 error = sosetopt(so, &sopt); 1478 fdrop(fp, td); 1479 } 1480 return(error); 1481} 1482 1483/* ARGSUSED */ 1484int 1485sys_getsockopt(td, uap) 1486 struct thread *td; 1487 struct getsockopt_args /* { 1488 int s; 1489 int level; 1490 int name; 1491 void * __restrict val; 1492 socklen_t * __restrict avalsize; 1493 } */ *uap; 1494{ 1495 socklen_t valsize; 1496 int error; 1497 1498 if (uap->val) { 1499 error = copyin(uap->avalsize, &valsize, sizeof (valsize)); 1500 if (error != 0) 1501 return (error); 1502 } 1503 1504 error = kern_getsockopt(td, uap->s, uap->level, 
uap->name, 1505 uap->val, UIO_USERSPACE, &valsize); 1506 1507 if (error == 0) 1508 error = copyout(&valsize, uap->avalsize, sizeof (valsize)); 1509 return (error); 1510} 1511 1512/* 1513 * Kernel version of getsockopt. 1514 * optval can be a userland or userspace. optlen is always a kernel pointer. 1515 */ 1516int 1517kern_getsockopt(td, s, level, name, val, valseg, valsize) 1518 struct thread *td; 1519 int s; 1520 int level; 1521 int name; 1522 void *val; 1523 enum uio_seg valseg; 1524 socklen_t *valsize; 1525{ 1526 struct socket *so; 1527 struct file *fp; 1528 struct sockopt sopt; 1529 cap_rights_t rights; 1530 int error; 1531 1532 if (val == NULL) 1533 *valsize = 0; 1534 if ((int)*valsize < 0) 1535 return (EINVAL); 1536 1537 sopt.sopt_dir = SOPT_GET; 1538 sopt.sopt_level = level; 1539 sopt.sopt_name = name; 1540 sopt.sopt_val = val; 1541 sopt.sopt_valsize = (size_t)*valsize; /* checked non-negative above */ 1542 switch (valseg) { 1543 case UIO_USERSPACE: 1544 sopt.sopt_td = td; 1545 break; 1546 case UIO_SYSSPACE: 1547 sopt.sopt_td = NULL; 1548 break; 1549 default: 1550 panic("kern_getsockopt called with bad valseg"); 1551 } 1552 1553 AUDIT_ARG_FD(s); 1554 error = getsock_cap(td, s, cap_rights_init(&rights, CAP_GETSOCKOPT), 1555 &fp, NULL); 1556 if (error == 0) { 1557 so = fp->f_data; 1558 error = sogetopt(so, &sopt); 1559 *valsize = sopt.sopt_valsize; 1560 fdrop(fp, td); 1561 } 1562 return (error); 1563} 1564 1565/* 1566 * getsockname1() - Get socket name. 
1567 */ 1568/* ARGSUSED */ 1569static int 1570getsockname1(td, uap, compat) 1571 struct thread *td; 1572 struct getsockname_args /* { 1573 int fdes; 1574 struct sockaddr * __restrict asa; 1575 socklen_t * __restrict alen; 1576 } */ *uap; 1577 int compat; 1578{ 1579 struct sockaddr *sa; 1580 socklen_t len; 1581 int error; 1582 1583 error = copyin(uap->alen, &len, sizeof(len)); 1584 if (error != 0) 1585 return (error); 1586 1587 error = kern_getsockname(td, uap->fdes, &sa, &len); 1588 if (error != 0) 1589 return (error); 1590 1591 if (len != 0) { 1592#ifdef COMPAT_OLDSOCK 1593 if (compat) 1594 ((struct osockaddr *)sa)->sa_family = sa->sa_family; 1595#endif 1596 error = copyout(sa, uap->asa, (u_int)len); 1597 } 1598 free(sa, M_SONAME); 1599 if (error == 0) 1600 error = copyout(&len, uap->alen, sizeof(len)); 1601 return (error); 1602} 1603 1604int 1605kern_getsockname(struct thread *td, int fd, struct sockaddr **sa, 1606 socklen_t *alen) 1607{ 1608 struct socket *so; 1609 struct file *fp; 1610 cap_rights_t rights; 1611 socklen_t len; 1612 int error; 1613 1614 AUDIT_ARG_FD(fd); 1615 error = getsock_cap(td, fd, cap_rights_init(&rights, CAP_GETSOCKNAME), 1616 &fp, NULL); 1617 if (error != 0) 1618 return (error); 1619 so = fp->f_data; 1620 *sa = NULL; 1621 CURVNET_SET(so->so_vnet); 1622 error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, sa); 1623 CURVNET_RESTORE(); 1624 if (error != 0) 1625 goto bad; 1626 if (*sa == NULL) 1627 len = 0; 1628 else 1629 len = MIN(*alen, (*sa)->sa_len); 1630 *alen = len; 1631#ifdef KTRACE 1632 if (KTRPOINT(td, KTR_STRUCT)) 1633 ktrsockaddr(*sa); 1634#endif 1635bad: 1636 fdrop(fp, td); 1637 if (error != 0 && *sa != NULL) { 1638 free(*sa, M_SONAME); 1639 *sa = NULL; 1640 } 1641 return (error); 1642} 1643 1644int 1645sys_getsockname(td, uap) 1646 struct thread *td; 1647 struct getsockname_args *uap; 1648{ 1649 1650 return (getsockname1(td, uap, 0)); 1651} 1652 1653#ifdef COMPAT_OLDSOCK 1654int 1655ogetsockname(td, uap) 1656 struct thread *td; 
1657 struct getsockname_args *uap; 1658{ 1659 1660 return (getsockname1(td, uap, 1)); 1661} 1662#endif /* COMPAT_OLDSOCK */ 1663 1664/* 1665 * getpeername1() - Get name of peer for connected socket. 1666 */ 1667/* ARGSUSED */ 1668static int 1669getpeername1(td, uap, compat) 1670 struct thread *td; 1671 struct getpeername_args /* { 1672 int fdes; 1673 struct sockaddr * __restrict asa; 1674 socklen_t * __restrict alen; 1675 } */ *uap; 1676 int compat; 1677{ 1678 struct sockaddr *sa; 1679 socklen_t len; 1680 int error; 1681 1682 error = copyin(uap->alen, &len, sizeof (len)); 1683 if (error != 0) 1684 return (error); 1685 1686 error = kern_getpeername(td, uap->fdes, &sa, &len); 1687 if (error != 0) 1688 return (error); 1689 1690 if (len != 0) { 1691#ifdef COMPAT_OLDSOCK 1692 if (compat) 1693 ((struct osockaddr *)sa)->sa_family = sa->sa_family; 1694#endif 1695 error = copyout(sa, uap->asa, (u_int)len); 1696 } 1697 free(sa, M_SONAME); 1698 if (error == 0) 1699 error = copyout(&len, uap->alen, sizeof(len)); 1700 return (error); 1701} 1702 1703int 1704kern_getpeername(struct thread *td, int fd, struct sockaddr **sa, 1705 socklen_t *alen) 1706{ 1707 struct socket *so; 1708 struct file *fp; 1709 cap_rights_t rights; 1710 socklen_t len; 1711 int error; 1712 1713 AUDIT_ARG_FD(fd); 1714 error = getsock_cap(td, fd, cap_rights_init(&rights, CAP_GETPEERNAME), 1715 &fp, NULL); 1716 if (error != 0) 1717 return (error); 1718 so = fp->f_data; 1719 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0) { 1720 error = ENOTCONN; 1721 goto done; 1722 } 1723 *sa = NULL; 1724 CURVNET_SET(so->so_vnet); 1725 error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, sa); 1726 CURVNET_RESTORE(); 1727 if (error != 0) 1728 goto bad; 1729 if (*sa == NULL) 1730 len = 0; 1731 else 1732 len = MIN(*alen, (*sa)->sa_len); 1733 *alen = len; 1734#ifdef KTRACE 1735 if (KTRPOINT(td, KTR_STRUCT)) 1736 ktrsockaddr(*sa); 1737#endif 1738bad: 1739 if (error != 0 && *sa != NULL) { 1740 free(*sa, M_SONAME); 1741 
*sa = NULL; 1742 } 1743done: 1744 fdrop(fp, td); 1745 return (error); 1746} 1747 1748int 1749sys_getpeername(td, uap) 1750 struct thread *td; 1751 struct getpeername_args *uap; 1752{ 1753 1754 return (getpeername1(td, uap, 0)); 1755} 1756 1757#ifdef COMPAT_OLDSOCK 1758int 1759ogetpeername(td, uap) 1760 struct thread *td; 1761 struct ogetpeername_args *uap; 1762{ 1763 1764 /* XXX uap should have type `getpeername_args *' to begin with. */ 1765 return (getpeername1(td, (struct getpeername_args *)uap, 1)); 1766} 1767#endif /* COMPAT_OLDSOCK */ 1768 1769int 1770sockargs(mp, buf, buflen, type) 1771 struct mbuf **mp; 1772 caddr_t buf; 1773 int buflen, type; 1774{ 1775 struct sockaddr *sa; 1776 struct mbuf *m; 1777 int error; 1778 1779 if (buflen < 0) 1780 return (EINVAL); 1781 1782 if (buflen > MLEN) { 1783#ifdef COMPAT_OLDSOCK 1784 if (type == MT_SONAME && buflen <= 112) 1785 buflen = MLEN; /* unix domain compat. hack */ 1786 else 1787#endif 1788 if (buflen > MCLBYTES) 1789 return (EINVAL); 1790 } 1791 m = m_get2(buflen, M_WAITOK, type, 0); 1792 m->m_len = buflen; 1793 error = copyin(buf, mtod(m, caddr_t), (u_int)buflen); 1794 if (error != 0) 1795 (void) m_free(m); 1796 else { 1797 *mp = m; 1798 if (type == MT_SONAME) { 1799 sa = mtod(m, struct sockaddr *); 1800 1801#if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN 1802 if (sa->sa_family == 0 && sa->sa_len < AF_MAX) 1803 sa->sa_family = sa->sa_len; 1804#endif 1805 sa->sa_len = buflen; 1806 } 1807 } 1808 return (error); 1809} 1810 1811int 1812getsockaddr(namp, uaddr, len) 1813 struct sockaddr **namp; 1814 caddr_t uaddr; 1815 size_t len; 1816{ 1817 struct sockaddr *sa; 1818 int error; 1819 1820 if (len > SOCK_MAXADDRLEN) 1821 return (ENAMETOOLONG); 1822 if (len < offsetof(struct sockaddr, sa_data[0])) 1823 return (EINVAL); 1824 sa = malloc(len, M_SONAME, M_WAITOK); 1825 error = copyin(uaddr, sa, len); 1826 if (error != 0) { 1827 free(sa, M_SONAME); 1828 } else { 1829#if defined(COMPAT_OLDSOCK) && BYTE_ORDER != 
BIG_ENDIAN 1830 if (sa->sa_family == 0 && sa->sa_len < AF_MAX) 1831 sa->sa_family = sa->sa_len; 1832#endif 1833 sa->sa_len = len; 1834 *namp = sa; 1835 } 1836 return (error); 1837} 1838 1839struct sendfile_sync { 1840 struct mtx mtx; 1841 struct cv cv; 1842 unsigned count; 1843}; 1844 1845/* 1846 * Detach mapped page and release resources back to the system. 1847 */ 1848int 1849sf_buf_mext(struct mbuf *mb, void *addr, void *args) 1850{ 1851 vm_page_t m; 1852 struct sendfile_sync *sfs; 1853 1854 m = sf_buf_page(args); 1855 sf_buf_free(args); 1856 vm_page_lock(m); 1857 vm_page_unwire(m, 0); 1858 /* 1859 * Check for the object going away on us. This can 1860 * happen since we don't hold a reference to it. 1861 * If so, we're responsible for freeing the page. 1862 */ 1863 if (m->wire_count == 0 && m->object == NULL) 1864 vm_page_free(m); 1865 vm_page_unlock(m); 1866 if (addr == NULL) 1867 return (EXT_FREE_OK); 1868 sfs = addr; 1869 mtx_lock(&sfs->mtx); 1870 KASSERT(sfs->count> 0, ("Sendfile sync botchup count == 0")); 1871 if (--sfs->count == 0) 1872 cv_signal(&sfs->cv); 1873 mtx_unlock(&sfs->mtx); 1874 return (EXT_FREE_OK); 1875} 1876 1877/* 1878 * sendfile(2) 1879 * 1880 * int sendfile(int fd, int s, off_t offset, size_t nbytes, 1881 * struct sf_hdtr *hdtr, off_t *sbytes, int flags) 1882 * 1883 * Send a file specified by 'fd' and starting at 'offset' to a socket 1884 * specified by 's'. Send only 'nbytes' of the file or until EOF if nbytes == 1885 * 0. Optionally add a header and/or trailer to the socket output. If 1886 * specified, write the total number of bytes sent into *sbytes. 
 */
int
sys_sendfile(struct thread *td, struct sendfile_args *uap)
{

	return (do_sendfile(td, uap, 0));
}

/*
 * Common body of sendfile(2) and its FreeBSD 4 compatibility variant.
 *
 * Copies in the optional header/trailer iovecs named by uap->hdtr, looks
 * up uap->fd with CAP_PREAD rights and hands the request to the file's
 * fo_sendfile method.  A non-zero compat passes SFK_COMPAT to that method.
 * The header and trailer uios are freed before returning.
 */
static int
do_sendfile(struct thread *td, struct sendfile_args *uap, int compat)
{
	struct sf_hdtr hdtr;
	struct uio *hdr_uio, *trl_uio;
	struct file *fp;
	cap_rights_t rights;
	int error;

	/*
	 * File offset must be non-negative.  If it goes beyond EOF
	 * we send only the header/trailer and no payload data.
	 */
	if (uap->offset < 0)
		return (EINVAL);

	hdr_uio = trl_uio = NULL;

	if (uap->hdtr != NULL) {
		error = copyin(uap->hdtr, &hdtr, sizeof(hdtr));
		if (error != 0)
			goto out;
		if (hdtr.headers != NULL) {
			error = copyinuio(hdtr.headers, hdtr.hdr_cnt, &hdr_uio);
			if (error != 0)
				goto out;
		}
		if (hdtr.trailers != NULL) {
			error = copyinuio(hdtr.trailers, hdtr.trl_cnt, &trl_uio);
			if (error != 0)
				goto out;

		}
	}

	AUDIT_ARG_FD(uap->fd);

	/*
	 * sendfile(2) can start at any offset within a file so we require
	 * CAP_READ+CAP_SEEK = CAP_PREAD.
	 */
	if ((error = fget_read(td, uap->fd,
	    cap_rights_init(&rights, CAP_PREAD), &fp)) != 0) {
		goto out;
	}

	error = fo_sendfile(fp, uap->s, hdr_uio, trl_uio, uap->offset,
	    uap->nbytes, uap->sbytes, uap->flags, compat ? SFK_COMPAT : 0, td);
	fdrop(fp, td);

out:
	/* Either uio may still be NULL here. */
	free(hdr_uio, M_IOV);
	free(trl_uio, M_IOV);
	return (error);
}

#ifdef COMPAT_FREEBSD4
/*
 * FreeBSD 4 sendfile: repack the arguments and run do_sendfile() in
 * compat mode.
 */
int
freebsd4_sendfile(struct thread *td, struct freebsd4_sendfile_args *uap)
{
	struct sendfile_args args;

	args.fd = uap->fd;
	args.s = uap->s;
	args.offset = uap->offset;
	args.nbytes = uap->nbytes;
	args.hdtr = uap->hdtr;
	args.sbytes = uap->sbytes;
	args.flags = uap->flags;

	return (do_sendfile(td, &args, 1));
}
#endif /* COMPAT_FREEBSD4 */

/*
 * Obtain a wired page of obj that is valid for xfsize bytes starting at
 * file offset off, and return it through *res.
 *
 * vp is the backing vnode, or NULL when the object is not vnode-backed.
 * A non-zero nd forbids I/O: if the page is not already valid, nd itself
 * is returned as the error.  bsize is only used to compute the sequential
 * read hint passed to vn_rdwr().  Returns 0 with *res set, or an error
 * with the page unwired (and freed when nobody else is using it).
 */
static int
sendfile_readpage(vm_object_t obj, struct vnode *vp, int nd,
    off_t off, int xfsize, int bsize, struct thread *td, vm_page_t *res)
{
	vm_page_t m;
	vm_pindex_t pindex;
	ssize_t resid;
	int error, readahead, rv;

	pindex = OFF_TO_IDX(off);
	VM_OBJECT_WLOCK(obj);
	/* For a vnode the page is grabbed wired but not busied. */
	m = vm_page_grab(obj, pindex, (vp != NULL ? VM_ALLOC_NOBUSY |
	    VM_ALLOC_IGN_SBUSY : 0) | VM_ALLOC_WIRED | VM_ALLOC_NORMAL);

	/*
	 * Check if page is valid for what we need, otherwise initiate I/O.
	 *
	 * A non-zero nd argument prevents disk I/O; instead we return
	 * to the caller the value it passed in nd.  In particular,
	 * if we already turned some pages into mbufs, nd == EAGAIN
	 * and the main function sends those pages before we come
	 * here again and block.
	 */
	if (m->valid != 0 && vm_page_is_valid(m, off & PAGE_MASK, xfsize)) {
		if (vp == NULL)
			vm_page_xunbusy(m);
		VM_OBJECT_WUNLOCK(obj);
		*res = m;
		return (0);
	} else if (nd != 0) {
		if (vp == NULL)
			vm_page_xunbusy(m);
		error = nd;
		goto free_page;
	}

	/*
	 * Get the page from backing store.
	 */
	error = 0;
	if (vp != NULL) {
		VM_OBJECT_WUNLOCK(obj);
		readahead = sfreadahead * MAXBSIZE;

		/*
		 * Use vn_rdwr() instead of the pager interface for
		 * the vnode, to allow the read-ahead.
		 *
		 * XXXMAC: Because we don't have fp->f_cred here, we
		 * pass in NOCRED.  This is probably wrong, but is
		 * consistent with our original implementation.
		 */
		error = vn_rdwr(UIO_READ, vp, NULL, readahead, trunc_page(off),
		    UIO_NOCOPY, IO_NODELOCKED | IO_VMIO | ((readahead /
		    bsize) << IO_SEQSHIFT), td->td_ucred, NOCRED, &resid, td);
		SFSTAT_INC(sf_iocnt);
		VM_OBJECT_WLOCK(obj);
	} else {
		if (vm_pager_has_page(obj, pindex, NULL, NULL)) {
			rv = vm_pager_get_pages(obj, &m, 1, 0);
			SFSTAT_INC(sf_iocnt);
			/* Look the page up again after the pager call. */
			m = vm_page_lookup(obj, pindex);
			if (m == NULL)
				error = EIO;
			else if (rv != VM_PAGER_OK) {
				vm_page_lock(m);
				vm_page_free(m);
				vm_page_unlock(m);
				m = NULL;
				error = EIO;
			}
		} else {
			/* The pager has no copy: supply a zero-filled page. */
			pmap_zero_page(m);
			m->valid = VM_PAGE_BITS_ALL;
			m->dirty = 0;
		}
		if (m != NULL)
			vm_page_xunbusy(m);
	}
	if (error == 0) {
		*res = m;
	} else if (m != NULL) {
free_page:
		/* Also entered from the "no I/O allowed" path above. */
		vm_page_lock(m);
		vm_page_unwire(m, 0);

		/*
		 * See if anyone else might know about this page.  If
		 * not and it is not valid, then free it.
		 */
		if (m->wire_count == 0 && m->valid == 0 && !vm_page_busied(m))
			vm_page_free(m);
		vm_page_unlock(m);
	}
	KASSERT(error != 0 || (m->wire_count > 0 &&
	    vm_page_is_valid(m, off & PAGE_MASK, xfsize)),
	    ("wrong page state m %p off %#jx xfsize %d", m, (uintmax_t)off,
	    xfsize));
	VM_OBJECT_WUNLOCK(obj);
	return (error);
}

/*
 * Resolve the file being sent into its backing VM object.
 *
 * For a DTYPE_VNODE file the vnode must be VREG and have a VM object;
 * *bsize is set to the mount's f_iosize and *obj_size to the file size
 * from VOP_GETATTR().  For a DTYPE_SHM file the object and size come from
 * the shmfd and *bsize stays 0.  Any other file type yields EINVAL; a
 * dead object yields EBADF.  On success the object is returned with an
 * extra reference that the caller must drop, and *vp_res / *shmfd_res
 * identify which kind of file it was (the other one is NULL).
 */
static int
sendfile_getobj(struct thread *td, struct file *fp, vm_object_t *obj_res,
    struct vnode **vp_res, struct shmfd **shmfd_res, off_t *obj_size,
    int *bsize)
{
	struct vattr va;
	vm_object_t obj;
	struct vnode *vp;
	struct shmfd *shmfd;
	int error;

	vp = *vp_res = NULL;
	obj = NULL;
	shmfd = *shmfd_res = NULL;
	*bsize = 0;

	/*
	 * The file descriptor must be a regular file and have a
	 * backing VM object.
	 */
	if (fp->f_type == DTYPE_VNODE) {
		vp = fp->f_vnode;
		vn_lock(vp, LK_SHARED | LK_RETRY);
		if (vp->v_type != VREG) {
			error = EINVAL;
			goto out;
		}
		*bsize = vp->v_mount->mnt_stat.f_iosize;
		error = VOP_GETATTR(vp, &va, td->td_ucred);
		if (error != 0)
			goto out;
		*obj_size = va.va_size;
		obj = vp->v_object;
		if (obj == NULL) {
			error = EINVAL;
			goto out;
		}
	} else if (fp->f_type == DTYPE_SHM) {
		error = 0;
		shmfd = fp->f_data;
		obj = shmfd->shm_object;
		*obj_size = shmfd->shm_size;
	} else {
		error = EINVAL;
		goto out;
	}

	VM_OBJECT_WLOCK(obj);
	if ((obj->flags & OBJ_DEAD) != 0) {
		VM_OBJECT_WUNLOCK(obj);
		error = EBADF;
		goto out;
	}

	/*
	 * Temporarily increase the backing VM object's reference
	 * count so that a forced reclamation of its vnode does not
	 * immediately destroy it.
	 */
	vm_object_reference_locked(obj);
	VM_OBJECT_WUNLOCK(obj);
	*obj_res = obj;
	*vp_res = vp;
	*shmfd_res = shmfd;

out:
	/* The vnode lock taken above is dropped on every path. */
	if (vp != NULL)
		VOP_UNLOCK(vp, 0);
	return (error);
}

/*
 * Look up the destination socket for sendfile with CAP_SEND rights.
 *
 * Returns EINVAL unless the socket is SOCK_STREAM and ENOTCONN unless it
 * is connected.  Note that in both of those cases *sock_fp and *so have
 * already been set and the file reference is NOT dropped here; the caller
 * releases it (vn_sendfile() does so whenever *so is non-NULL).
 */
static int
kern_sendfile_getsock(struct thread *td, int s, struct file **sock_fp,
    struct socket **so)
{
	cap_rights_t rights;
	int error;

	*sock_fp = NULL;
	*so = NULL;

	/*
	 * The socket must be a stream socket and connected.
	 */
	error = getsock_cap(td, s, cap_rights_init(&rights, CAP_SEND),
	    sock_fp, NULL);
	if (error != 0)
		return (error);
	*so = (*sock_fp)->f_data;
	if ((*so)->so_type != SOCK_STREAM)
		return (EINVAL);
	if (((*so)->so_state & SS_ISCONNECTED) == 0)
		return (ENOTCONN);
	return (0);
}

/*
 * Send the contents of file fp to socket descriptor sockfd.
 *
 * hdr_uio / trl_uio are optional header and trailer data.  offset and
 * nbytes select the file range (nbytes == 0 means "until end of object").
 * If sent is not NULL the total number of bytes queued, headers and
 * trailers included, is copied out to it whenever the "out" label is
 * reached; this does not happen on the early return taken when
 * sendfile_getobj() fails.  flags are the user's SF_* flags and kflags
 * carries SFK_COMPAT for the FreeBSD 4 semantics.  ERESTART is converted
 * to EINTR before returning.
 */
int
vn_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio,
    struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags,
    int kflags, struct thread *td)
{
	struct file *sock_fp;
	struct vnode *vp;
	struct vm_object *obj;
	struct socket *so;
	struct mbuf *m;
	struct sf_buf *sf;
	struct vm_page *pg;
	struct shmfd *shmfd;
	struct sendfile_sync *sfs;
	struct vattr va;
	off_t off, xfsize, fsbytes, sbytes, rem, obj_size;
	int error, bsize, nd, hdrlen, mnw;
	bool inflight_called;

	pg = NULL;
	obj = NULL;
	so = NULL;
	m = NULL;
	sfs = NULL;
	fsbytes = sbytes = 0;
	hdrlen = mnw = 0;
	rem = nbytes;
	obj_size = 0;
	inflight_called = false;	/* assigned but not read in this function */

	error = sendfile_getobj(td, fp, &obj, &vp, &shmfd, &obj_size, &bsize);
	if (error != 0)
		return (error);
	if (rem == 0)
		rem = obj_size;

	error = kern_sendfile_getsock(td, sockfd, &sock_fp, &so);
	if (error != 0)
		goto out;

	/*
	 * Do not wait on memory allocations but return ENOMEM for
	 * caller to retry later.
	 * XXX: Experimental.
	 * NOTE(review): the code below returns EAGAIN or ENOBUFS in the
	 * no-wait case, not ENOMEM - confirm which the text should name.
	 */
	if (flags & SF_MNOWAIT)
		mnw = 1;

	/* SF_SYNC: track outstanding sf_bufs so we can wait for them below. */
	if (flags & SF_SYNC) {
		sfs = malloc(sizeof *sfs, M_TEMP, M_WAITOK | M_ZERO);
		mtx_init(&sfs->mtx, "sendfile", NULL, MTX_DEF);
		cv_init(&sfs->cv, "sendfile");
	}

#ifdef MAC
	error = mac_socket_check_send(td->td_ucred, so);
	if (error != 0)
		goto out;
#endif

	/* If headers are specified copy them into mbufs. */
	if (hdr_uio != NULL) {
		hdr_uio->uio_td = td;
		hdr_uio->uio_rw = UIO_WRITE;
		if (hdr_uio->uio_resid > 0) {
			/*
			 * In FBSD < 5.0 the nbytes to send also included
			 * the header.  If compat is specified subtract the
			 * header size from nbytes.
			 */
			if (kflags & SFK_COMPAT) {
				if (nbytes > hdr_uio->uio_resid)
					nbytes -= hdr_uio->uio_resid;
				else
					nbytes = 0;
			}
			m = m_uiotombuf(hdr_uio, (mnw ? M_NOWAIT : M_WAITOK),
			    0, 0, 0);
			if (m == NULL) {
				error = mnw ? EAGAIN : ENOBUFS;
				goto out;
			}
			hdrlen = m_length(m, NULL);
		}
	}

	/*
	 * Protect against multiple writers to the socket.
	 *
	 * XXXRW: Historically this has assumed non-interruptibility, so now
	 * we implement that, but possibly shouldn't.
	 */
	(void)sblock(&so->so_snd, SBL_WAIT | SBL_NOINTR);

	/*
	 * Loop through the pages of the file, starting with the requested
	 * offset.  Get a file page (do I/O if necessary), map the file page
	 * into an sf_buf, attach an mbuf header to the sf_buf, and queue
	 * it on the socket.
	 * This is done in two loops.  The inner loop turns as many pages
	 * as it can, up to available socket buffer space, without blocking
	 * into mbufs to have it bulk delivered into the socket send buffer.
	 * The outer loop checks the state and available space of the socket
	 * and takes care of the overall progress.
	 */
	for (off = offset; ; ) {
		struct mbuf *mtail;
		int loopbytes;
		int space;
		int done;

		/* Stop once the requested amount of file data is queued. */
		if ((nbytes != 0 && nbytes == fsbytes) ||
		    (nbytes == 0 && obj_size == fsbytes))
			break;

		mtail = NULL;
		loopbytes = 0;
		space = 0;
		done = 0;

		/*
		 * Check the socket state for ongoing connection,
		 * no errors and space in socket buffer.
		 * If space is low allow for the remainder of the
		 * file to be processed if it fits the socket buffer.
		 * Otherwise block in waiting for sufficient space
		 * to proceed, or if the socket is nonblocking, return
		 * to userland with EAGAIN while reporting how far
		 * we've come.
		 * We wait until the socket buffer has significant free
		 * space to do bulk sends.  This makes good use of file
		 * system read ahead and allows packet segmentation
		 * offloading hardware to take over lots of work.  If
		 * we were not careful here we would send off only one
		 * sfbuf at a time.
		 */
		SOCKBUF_LOCK(&so->so_snd);
		if (so->so_snd.sb_lowat < so->so_snd.sb_hiwat / 2)
			so->so_snd.sb_lowat = so->so_snd.sb_hiwat / 2;
retry_space:
		if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
			error = EPIPE;
			SOCKBUF_UNLOCK(&so->so_snd);
			goto done;
		} else if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			SOCKBUF_UNLOCK(&so->so_snd);
			goto done;
		}
		space = sbspace(&so->so_snd);
		if (space < rem &&
		    (space <= 0 ||
		     space < so->so_snd.sb_lowat)) {
			if (so->so_state & SS_NBIO) {
				SOCKBUF_UNLOCK(&so->so_snd);
				error = EAGAIN;
				goto done;
			}
			/*
			 * sbwait drops the lock while sleeping.
			 * When we loop back to retry_space the
			 * state may have changed and we retest
			 * for it.
			 */
			error = sbwait(&so->so_snd);
			/*
			 * An error from sbwait usually indicates that we've
			 * been interrupted by a signal.  If we've sent anything
			 * then return bytes sent, otherwise return the error.
			 */
			if (error != 0) {
				SOCKBUF_UNLOCK(&so->so_snd);
				goto done;
			}
			goto retry_space;
		}
		SOCKBUF_UNLOCK(&so->so_snd);

		/*
		 * Reduce space in the socket buffer by the size of
		 * the header mbuf chain.
		 * hdrlen is set to 0 after the first loop.
		 */
		space -= hdrlen;

		/* Refresh the file size under the vnode lock on each pass. */
		if (vp != NULL) {
			error = vn_lock(vp, LK_SHARED);
			if (error != 0)
				goto done;
			error = VOP_GETATTR(vp, &va, td->td_ucred);
			if (error != 0 || off >= va.va_size) {
				VOP_UNLOCK(vp, 0);
				goto done;
			}
			obj_size = va.va_size;
		}

		/*
		 * Loop and construct maximum sized mbuf chain to be bulk
		 * dumped into socket buffer.
		 */
		while (space > loopbytes) {
			vm_offset_t pgoff;
			struct mbuf *m0;

			/*
			 * Calculate the amount to transfer.
			 * Not to exceed a page, the EOF,
			 * or the passed in nbytes.
			 */
			pgoff = (vm_offset_t)(off & PAGE_MASK);
			rem = obj_size - offset;
			if (nbytes != 0)
				rem = omin(rem, nbytes);
			rem -= fsbytes + loopbytes;
			xfsize = omin(PAGE_SIZE - pgoff, rem);
			xfsize = omin(space - loopbytes, xfsize);
			if (xfsize <= 0) {
				done = 1;		/* all data sent */
				break;
			}

			/*
			 * Attempt to look up the page.  Allocate
			 * if not found or wait and loop if busy.
			 */
			if (m != NULL)
				nd = EAGAIN; /* send what we already got */
			else if ((flags & SF_NODISKIO) != 0)
				nd = EBUSY;
			else
				nd = 0;
			error = sendfile_readpage(obj, vp, nd, off,
			    xfsize, bsize, td, &pg);
			if (error != 0) {
				if (error == EAGAIN)
					error = 0;	/* not a real error */
				break;
			}

			/*
			 * Get a sendfile buf.  When allocating the
			 * first buffer for mbuf chain, we usually
			 * wait as long as necessary, but this wait
			 * can be interrupted.  For subsequent
			 * buffers, do not sleep, since several
			 * threads might exhaust the buffers and then
			 * deadlock.
			 */
			sf = sf_buf_alloc(pg, (mnw || m != NULL) ? SFB_NOWAIT :
			    SFB_CATCH);
			if (sf == NULL) {
				SFSTAT_INC(sf_allocfail);
				vm_page_lock(pg);
				vm_page_unwire(pg, 0);
				KASSERT(pg->object != NULL,
				    ("%s: object disappeared", __func__));
				vm_page_unlock(pg);
				/* Only an error if nothing is pending to send. */
				if (m == NULL)
					error = (mnw ? EAGAIN : EINTR);
				break;
			}

			/*
			 * Get an mbuf and set it up as having
			 * external storage.
			 */
			m0 = m_get((mnw ? M_NOWAIT : M_WAITOK), MT_DATA);
			if (m0 == NULL) {
				error = (mnw ? EAGAIN : ENOBUFS);
				/* Release the sf_buf and unwire its page. */
				(void)sf_buf_mext(NULL, NULL, sf);
				break;
			}
			if (m_extadd(m0, (caddr_t )sf_buf_kva(sf), PAGE_SIZE,
			    sf_buf_mext, sfs, sf, M_RDONLY, EXT_SFBUF,
			    (mnw ? M_NOWAIT : M_WAITOK)) != 0) {
				error = (mnw ? EAGAIN : ENOBUFS);
				(void)sf_buf_mext(NULL, NULL, sf);
				m_freem(m0);
				break;
			}
			m0->m_data = (char *)sf_buf_kva(sf) + pgoff;
			m0->m_len = xfsize;

			/* Append to mbuf chain. */
			if (mtail != NULL)
				mtail->m_next = m0;
			else if (m != NULL)
				m_last(m)->m_next = m0;
			else
				m = m0;
			mtail = m0;

			/* Keep track of bits processed. */
			loopbytes += xfsize;
			off += xfsize;

			/* One more sf_buf for the SF_SYNC waiter to wait on. */
			if (sfs != NULL) {
				mtx_lock(&sfs->mtx);
				sfs->count++;
				mtx_unlock(&sfs->mtx);
			}
		}

		if (vp != NULL)
			VOP_UNLOCK(vp, 0);

		/* Add the buffer chain to the socket buffer. */
		if (m != NULL) {
			int mlen, err;

			mlen = m_length(m, NULL);
			SOCKBUF_LOCK(&so->so_snd);
			if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
				error = EPIPE;
				SOCKBUF_UNLOCK(&so->so_snd);
				goto done;
			}
			SOCKBUF_UNLOCK(&so->so_snd);
			CURVNET_SET(so->so_vnet);
			/* Avoid error aliasing. */
			err = (*so->so_proto->pr_usrreqs->pru_send)
			    (so, 0, m, NULL, NULL, td);
			CURVNET_RESTORE();
			if (err == 0) {
				/*
				 * We need two counters to get the
				 * file offset and nbytes to send
				 * right:
				 * - sbytes contains the total amount
				 *   of bytes sent, including headers.
				 * - fsbytes contains the total amount
				 *   of bytes sent from the file.
				 */
				sbytes += mlen;
				fsbytes += mlen;
				if (hdrlen) {
					fsbytes -= hdrlen;
					hdrlen = 0;
				}
			} else if (error == 0)
				error = err;
			m = NULL;	/* pru_send always consumes */
		}

		/* Quit outer loop on error or when we're done. */
		if (done)
			break;
		if (error != 0)
			goto done;
	}

	/*
	 * Send trailers.  Wimp out and use writev(2).
	 */
	if (trl_uio != NULL) {
		sbunlock(&so->so_snd);
		error = kern_writev(td, sockfd, trl_uio);
		if (error == 0)
			sbytes += td->td_retval[0];
		goto out;
	}

done:
	sbunlock(&so->so_snd);
out:
	/*
	 * If there was no error we have to clear td->td_retval[0]
	 * because it may have been set by writev.
	 */
	if (error == 0) {
		td->td_retval[0] = 0;
	}
	/* The byte count is reported even on error; a copyout failure is ignored. */
	if (sent != NULL) {
		copyout(&sbytes, sent, sizeof(off_t));
	}
	if (obj != NULL)
		vm_object_deallocate(obj);
	if (so)
		fdrop(sock_fp, td);
	if (m)
		m_freem(m);

	/* SF_SYNC: wait for sf_buf_mext() to release the last buffer. */
	if (sfs != NULL) {
		mtx_lock(&sfs->mtx);
		if (sfs->count != 0)
			cv_wait(&sfs->cv, &sfs->mtx);
		KASSERT(sfs->count == 0, ("sendfile sync still busy"));
		cv_destroy(&sfs->cv);
		mtx_destroy(&sfs->mtx);
		free(sfs, M_TEMP);
	}

	if (error == ERESTART)
		error = EINTR;

	return (error);
}
