kern_sendfile.c revision 41484
1/* 2 * Copyright (c) 1982, 1986, 1989, 1990, 1993 3 * The Regents of the University of California. All rights reserved. 4 * 5 * sendfile(2) and related extensions: 6 * Copyright (c) 1998, David Greenman. All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. All advertising materials mentioning features or use of this software 17 * must display the following acknowledgement: 18 * This product includes software developed by the University of 19 * California, Berkeley and its contributors. 20 * 4. Neither the name of the University nor the names of its contributors 21 * may be used to endorse or promote products derived from this software 22 * without specific prior written permission. 23 * 24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 34 * SUCH DAMAGE. 35 * 36 * @(#)uipc_syscalls.c 8.4 (Berkeley) 2/21/94 37 * $Id: uipc_syscalls.c,v 1.47 1998/11/23 00:45:39 truckman Exp $ 38 */ 39 40#include "opt_compat.h" 41#include "opt_ktrace.h" 42 43#include <sys/param.h> 44#include <sys/systm.h> 45#include <sys/kernel.h> 46#include <sys/sysproto.h> 47#include <sys/malloc.h> 48#include <sys/filedesc.h> 49#include <sys/proc.h> 50#include <sys/fcntl.h> 51#include <sys/file.h> 52#include <sys/mbuf.h> 53#include <sys/protosw.h> 54#include <sys/socket.h> 55#include <sys/socketvar.h> 56#include <sys/signalvar.h> 57#include <sys/uio.h> 58#include <sys/vnode.h> 59#include <sys/lock.h> 60#include <sys/mount.h> 61#ifdef KTRACE 62#include <sys/ktrace.h> 63#endif 64#include <vm/vm.h> 65#include <vm/vm_prot.h> 66#include <vm/vm_object.h> 67#include <vm/vm_page.h> 68#include <vm/vm_pager.h> 69#include <vm/vm_pageout.h> 70#include <vm/vm_kern.h> 71#include <vm/vm_extern.h> 72#include <machine/limits.h> 73 74static void sf_buf_init(void *arg); 75SYSINIT(sock_sf, SI_SUB_MBUF, SI_ORDER_ANY, sf_buf_init, NULL) 76static struct sf_buf *sf_buf_alloc(void); 77static void sf_buf_ref(caddr_t addr, u_int size); 78static void sf_buf_free(caddr_t addr, u_int size); 79 80static int sendit __P((struct proc *p, int s, struct msghdr *mp, int flags)); 81static int recvit __P((struct proc *p, int s, struct msghdr *mp, 82 caddr_t namelenp)); 83 84static int accept1 __P((struct proc *p, struct accept_args *uap, int compat)); 85static int getsockname1 __P((struct proc *p, struct getsockname_args *uap, 86 int compat)); 87static int getpeername1 __P((struct proc *p, struct getpeername_args *uap, 88 int compat)); 89 90static SLIST_HEAD(, sf_buf) sf_freelist; 91static vm_offset_t sf_base; 92static struct sf_buf *sf_bufs; 93static int sf_buf_alloc_want; 94 95/* 96 * System call interface to the socket abstraction. 97 */ 98#if defined(COMPAT_43) || defined(COMPAT_SUNOS) 99#define COMPAT_OLDSOCK 100#endif 101 102extern struct fileops socketops; 103 104int 105socket(p, uap) 106 struct proc *p; 107 register struct socket_args /* { 108 int domain; 109 int type; 110 int protocol; 111 } */ *uap; 112{ 113 struct filedesc *fdp = p->p_fd; 114 struct socket *so; 115 struct file *fp; 116 int fd, error; 117 118 error = falloc(p, &fp, &fd); 119 if (error) 120 return (error); 121 fp->f_flag = FREAD|FWRITE; 122 fp->f_type = DTYPE_SOCKET; 123 fp->f_ops = &socketops; 124 error = socreate(uap->domain, &so, uap->type, uap->protocol, p); 125 if (error) { 126 fdp->fd_ofiles[fd] = 0; 127 ffree(fp); 128 } else { 129 fp->f_data = (caddr_t)so; 130 p->p_retval[0] = fd; 131 } 132 return (error); 133} 134 135/* ARGSUSED */ 136int 137bind(p, uap) 138 struct proc *p; 139 register struct bind_args /* { 140 int s; 141 caddr_t name; 142 int namelen; 143 } */ *uap; 144{ 145 struct file *fp; 146 struct sockaddr *sa; 147 int error; 148 149 error = getsock(p->p_fd, uap->s, &fp); 150 if (error) 151 return (error); 152 error = getsockaddr(&sa, uap->name, uap->namelen); 153 if (error) 154 return (error); 155 error = sobind((struct socket *)fp->f_data, sa, p); 156 FREE(sa, M_SONAME); 157 return (error); 158} 159 160/* ARGSUSED */ 161int 162listen(p, uap) 163 struct proc *p; 164 register struct listen_args /* { 165 int s; 166 int backlog; 167 } */ *uap; 168{ 169 struct file *fp; 170 int error; 171 172 error = getsock(p->p_fd, uap->s, &fp); 173 if (error) 174 return (error); 175 return (solisten((struct socket *)fp->f_data, uap->backlog, p)); 176} 177 178static int 179accept1(p, uap, compat) 180 struct proc *p; 181 register struct accept_args /* { 182 int s; 183 caddr_t name; 184 int *anamelen; 185 } */ *uap; 186 int compat; 187{ 188 struct file *fp; 189 struct sockaddr *sa; 190 int namelen, error, s; 191 struct socket *head, *so; 192 int fd; 193 short fflag; /* type must match fp->f_flag */ 194 195 if (uap->name) { 196 error = copyin((caddr_t)uap->anamelen, (caddr_t)&namelen, 197 sizeof (namelen)); 198 if(error) 199 return (error); 200 } 201 error = getsock(p->p_fd, uap->s, &fp); 202 if (error) 203 return (error); 204 s = splnet(); 205 head = (struct socket *)fp->f_data; 206 if ((head->so_options & SO_ACCEPTCONN) == 0) { 207 splx(s); 208 return (EINVAL); 209 } 210 if ((head->so_state & SS_NBIO) && head->so_comp.tqh_first == NULL) { 211 splx(s); 212 return (EWOULDBLOCK); 213 } 214 while (head->so_comp.tqh_first == NULL && head->so_error == 0) { 215 if (head->so_state & SS_CANTRCVMORE) { 216 head->so_error = ECONNABORTED; 217 break; 218 } 219 error = tsleep((caddr_t)&head->so_timeo, PSOCK | PCATCH, 220 "accept", 0); 221 if (error) { 222 splx(s); 223 return (error); 224 } 225 } 226 if (head->so_error) { 227 error = head->so_error; 228 head->so_error = 0; 229 splx(s); 230 return (error); 231 } 232 233 /* 234 * At this point we know that there is at least one connection 235 * ready to be accepted. Remove it from the queue prior to 236 * allocating the file descriptor for it since falloc() may 237 * block allowing another process to accept the connection 238 * instead. 239 */ 240 so = head->so_comp.tqh_first; 241 TAILQ_REMOVE(&head->so_comp, so, so_list); 242 head->so_qlen--; 243 244 fflag = fp->f_flag; 245 error = falloc(p, &fp, &fd); 246 if (error) { 247 /* 248 * Probably ran out of file descriptors. Put the 249 * unaccepted connection back onto the queue and 250 * do another wakeup so some other process might 251 * have a chance at it. 252 */ 253 TAILQ_INSERT_HEAD(&head->so_comp, so, so_list); 254 head->so_qlen++; 255 wakeup_one(&head->so_timeo); 256 splx(s); 257 return (error); 258 } else 259 p->p_retval[0] = fd; 260 261 so->so_state &= ~SS_COMP; 262 so->so_head = NULL; 263 if (head->so_sigio != NULL) 264 fsetown(fgetown(head->so_sigio), &so->so_sigio); 265 266 fp->f_type = DTYPE_SOCKET; 267 fp->f_flag = fflag; 268 fp->f_ops = &socketops; 269 fp->f_data = (caddr_t)so; 270 sa = 0; 271 (void) soaccept(so, &sa); 272 if (sa == 0) { 273 namelen = 0; 274 if (uap->name) 275 goto gotnoname; 276 return 0; 277 } 278 if (uap->name) { 279 /* check sa_len before it is destroyed */ 280 if (namelen > sa->sa_len) 281 namelen = sa->sa_len; 282#ifdef COMPAT_OLDSOCK 283 if (compat) 284 ((struct osockaddr *)sa)->sa_family = 285 sa->sa_family; 286#endif 287 error = copyout(sa, (caddr_t)uap->name, (u_int)namelen); 288 if (!error) 289gotnoname: 290 error = copyout((caddr_t)&namelen, 291 (caddr_t)uap->anamelen, sizeof (*uap->anamelen)); 292 } 293 FREE(sa, M_SONAME); 294 splx(s); 295 return (error); 296} 297 298int 299accept(p, uap) 300 struct proc *p; 301 struct accept_args *uap; 302{ 303 304 return (accept1(p, uap, 0)); 305} 306 307#ifdef COMPAT_OLDSOCK 308int 309oaccept(p, uap) 310 struct proc *p; 311 struct accept_args *uap; 312{ 313 314 return (accept1(p, uap, 1)); 315} 316#endif /* COMPAT_OLDSOCK */ 317 318/* ARGSUSED */ 319int 320connect(p, uap) 321 struct proc *p; 322 register struct connect_args /* { 323 int s; 324 caddr_t name; 325 int namelen; 326 } */ *uap; 327{ 328 struct file *fp; 329 register struct socket *so; 330 struct sockaddr *sa; 331 int error, s; 332 333 error = getsock(p->p_fd, uap->s, &fp); 334 if (error) 335 return (error); 336 so = (struct socket *)fp->f_data; 337 if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) 338 return (EALREADY); 339 error = getsockaddr(&sa, uap->name, uap->namelen); 340 if (error) 341 return (error); 342 error = soconnect(so, sa, p); 343 if (error) 344 goto bad; 345 if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) { 346 FREE(sa, M_SONAME); 347 return (EINPROGRESS); 348 } 349 s = splnet(); 350 while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) { 351 error = tsleep((caddr_t)&so->so_timeo, PSOCK | PCATCH, 352 "connec", 0); 353 if (error) 354 break; 355 } 356 if (error == 0) { 357 error = so->so_error; 358 so->so_error = 0; 359 } 360 splx(s); 361bad: 362 so->so_state &= ~SS_ISCONNECTING; 363 FREE(sa, M_SONAME); 364 if (error == ERESTART) 365 error = EINTR; 366 return (error); 367} 368 369int 370socketpair(p, uap) 371 struct proc *p; 372 register struct socketpair_args /* { 373 int domain; 374 int type; 375 int protocol; 376 int *rsv; 377 } */ *uap; 378{ 379 register struct filedesc *fdp = p->p_fd; 380 struct file *fp1, *fp2; 381 struct socket *so1, *so2; 382 int fd, error, sv[2]; 383 384 error = socreate(uap->domain, &so1, uap->type, uap->protocol, p); 385 if (error) 386 return (error); 387 error = socreate(uap->domain, &so2, uap->type, uap->protocol, p); 388 if (error) 389 goto free1; 390 error = falloc(p, &fp1, &fd); 391 if (error) 392 goto free2; 393 sv[0] = fd; 394 fp1->f_flag = FREAD|FWRITE; 395 fp1->f_type = DTYPE_SOCKET; 396 fp1->f_ops = &socketops; 397 fp1->f_data = (caddr_t)so1; 398 error = falloc(p, &fp2, &fd); 399 if (error) 400 goto free3; 401 fp2->f_flag = FREAD|FWRITE; 402 fp2->f_type = DTYPE_SOCKET; 403 fp2->f_ops = &socketops; 404 fp2->f_data = (caddr_t)so2; 405 sv[1] = fd; 406 error = soconnect2(so1, so2); 407 if (error) 408 goto free4; 409 if (uap->type == SOCK_DGRAM) { 410 /* 411 * Datagram socket connection is asymmetric. 412 */ 413 error = soconnect2(so2, so1); 414 if (error) 415 goto free4; 416 } 417 error = copyout((caddr_t)sv, (caddr_t)uap->rsv, 2 * sizeof (int)); 418 return (error); 419free4: 420 ffree(fp2); 421 fdp->fd_ofiles[sv[1]] = 0; 422free3: 423 ffree(fp1); 424 fdp->fd_ofiles[sv[0]] = 0; 425free2: 426 (void)soclose(so2); 427free1: 428 (void)soclose(so1); 429 return (error); 430} 431 432static int 433sendit(p, s, mp, flags) 434 register struct proc *p; 435 int s; 436 register struct msghdr *mp; 437 int flags; 438{ 439 struct file *fp; 440 struct uio auio; 441 register struct iovec *iov; 442 register int i; 443 struct mbuf *control; 444 struct sockaddr *to; 445 int len, error; 446 struct socket *so; 447#ifdef KTRACE 448 struct iovec *ktriov = NULL; 449#endif 450 451 error = getsock(p->p_fd, s, &fp); 452 if (error) 453 return (error); 454 auio.uio_iov = mp->msg_iov; 455 auio.uio_iovcnt = mp->msg_iovlen; 456 auio.uio_segflg = UIO_USERSPACE; 457 auio.uio_rw = UIO_WRITE; 458 auio.uio_procp = p; 459 auio.uio_offset = 0; /* XXX */ 460 auio.uio_resid = 0; 461 iov = mp->msg_iov; 462 for (i = 0; i < mp->msg_iovlen; i++, iov++) { 463 if ((auio.uio_resid += iov->iov_len) < 0) 464 return (EINVAL); 465 } 466 if (mp->msg_name) { 467 error = getsockaddr(&to, mp->msg_name, mp->msg_namelen); 468 if (error) 469 return (error); 470 } else 471 to = 0; 472 if (mp->msg_control) { 473 if (mp->msg_controllen < sizeof(struct cmsghdr) 474#ifdef COMPAT_OLDSOCK 475 && mp->msg_flags != MSG_COMPAT 476#endif 477 ) { 478 error = EINVAL; 479 goto bad; 480 } 481 error = sockargs(&control, mp->msg_control, 482 mp->msg_controllen, MT_CONTROL); 483 if (error) 484 goto bad; 485#ifdef COMPAT_OLDSOCK 486 if (mp->msg_flags == MSG_COMPAT) { 487 register struct cmsghdr *cm; 488 489 M_PREPEND(control, sizeof(*cm), M_WAIT); 490 if (control == 0) { 491 error = ENOBUFS; 492 goto bad; 493 } else { 494 cm = mtod(control, struct cmsghdr *); 495 cm->cmsg_len = control->m_len; 496 cm->cmsg_level = SOL_SOCKET; 497 cm->cmsg_type = SCM_RIGHTS; 498 } 499 } 500#endif 501 } else 502 control = 0; 503#ifdef KTRACE 504 if (KTRPOINT(p, KTR_GENIO)) { 505 int iovlen = auio.uio_iovcnt * sizeof (struct iovec); 506 507 MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK); 508 bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen); 509 } 510#endif 511 len = auio.uio_resid; 512 so = (struct socket *)fp->f_data; 513 error = so->so_proto->pr_usrreqs->pru_sosend(so, to, &auio, 0, control, 514 flags, p); 515 if (error) { 516 if (auio.uio_resid != len && (error == ERESTART || 517 error == EINTR || error == EWOULDBLOCK)) 518 error = 0; 519 if (error == EPIPE) 520 psignal(p, SIGPIPE); 521 } 522 if (error == 0) 523 p->p_retval[0] = len - auio.uio_resid; 524#ifdef KTRACE 525 if (ktriov != NULL) { 526 if (error == 0) 527 ktrgenio(p->p_tracep, s, UIO_WRITE, 528 ktriov, p->p_retval[0], error); 529 FREE(ktriov, M_TEMP); 530 } 531#endif 532bad: 533 if (to) 534 FREE(to, M_SONAME); 535 return (error); 536} 537 538int 539sendto(p, uap) 540 struct proc *p; 541 register struct sendto_args /* { 542 int s; 543 caddr_t buf; 544 size_t len; 545 int flags; 546 caddr_t to; 547 int tolen; 548 } */ *uap; 549{ 550 struct msghdr msg; 551 struct iovec aiov; 552 553 msg.msg_name = uap->to; 554 msg.msg_namelen = uap->tolen; 555 msg.msg_iov = &aiov; 556 msg.msg_iovlen = 1; 557 msg.msg_control = 0; 558#ifdef COMPAT_OLDSOCK 559 msg.msg_flags = 0; 560#endif 561 aiov.iov_base = uap->buf; 562 aiov.iov_len = uap->len; 563 return (sendit(p, uap->s, &msg, uap->flags)); 564} 565 566#ifdef COMPAT_OLDSOCK 567int 568osend(p, uap) 569 struct proc *p; 570 register struct osend_args /* { 571 int s; 572 caddr_t buf; 573 int len; 574 int flags; 575 } */ *uap; 576{ 577 struct msghdr msg; 578 struct iovec aiov; 579 580 msg.msg_name = 0; 581 msg.msg_namelen = 0; 582 msg.msg_iov = &aiov; 583 msg.msg_iovlen = 1; 584 aiov.iov_base = uap->buf; 585 aiov.iov_len = uap->len; 586 msg.msg_control = 0; 587 msg.msg_flags = 0; 588 return (sendit(p, uap->s, &msg, uap->flags)); 589} 590 591int 592osendmsg(p, uap) 593 struct proc *p; 594 register struct osendmsg_args /* { 595 int s; 596 caddr_t msg; 597 int flags; 598 } */ *uap; 599{ 600 struct msghdr msg; 601 struct iovec aiov[UIO_SMALLIOV], *iov; 602 int error; 603 604 error = copyin(uap->msg, (caddr_t)&msg, sizeof (struct omsghdr)); 605 if (error) 606 return (error); 607 if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) { 608 if ((u_int)msg.msg_iovlen >= UIO_MAXIOV) 609 return (EMSGSIZE); 610 MALLOC(iov, struct iovec *, 611 sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV, 612 M_WAITOK); 613 } else 614 iov = aiov; 615 error = copyin((caddr_t)msg.msg_iov, (caddr_t)iov, 616 (unsigned)(msg.msg_iovlen * sizeof (struct iovec))); 617 if (error) 618 goto done; 619 msg.msg_flags = MSG_COMPAT; 620 msg.msg_iov = iov; 621 error = sendit(p, uap->s, &msg, uap->flags); 622done: 623 if (iov != aiov) 624 FREE(iov, M_IOV); 625 return (error); 626} 627#endif 628 629int 630sendmsg(p, uap) 631 struct proc *p; 632 register struct sendmsg_args /* { 633 int s; 634 caddr_t msg; 635 int flags; 636 } */ *uap; 637{ 638 struct msghdr msg; 639 struct iovec aiov[UIO_SMALLIOV], *iov; 640 int error; 641 642 error = copyin(uap->msg, (caddr_t)&msg, sizeof (msg)); 643 if (error) 644 return (error); 645 if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) { 646 if ((u_int)msg.msg_iovlen >= UIO_MAXIOV) 647 return (EMSGSIZE); 648 MALLOC(iov, struct iovec *, 649 sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV, 650 M_WAITOK); 651 } else 652 iov = aiov; 653 if (msg.msg_iovlen && 654 (error = copyin((caddr_t)msg.msg_iov, (caddr_t)iov, 655 (unsigned)(msg.msg_iovlen * sizeof (struct iovec))))) 656 goto done; 657 msg.msg_iov = iov; 658#ifdef COMPAT_OLDSOCK 659 msg.msg_flags = 0; 660#endif 661 error = sendit(p, uap->s, &msg, uap->flags); 662done: 663 if (iov != aiov) 664 FREE(iov, M_IOV); 665 return (error); 666} 667 668static int 669recvit(p, s, mp, namelenp) 670 register struct proc *p; 671 int s; 672 register struct msghdr *mp; 673 caddr_t namelenp; 674{ 675 struct file *fp; 676 struct uio auio; 677 register struct iovec *iov; 678 register int i; 679 int len, error; 680 struct mbuf *m, *control = 0; 681 caddr_t ctlbuf; 682 struct socket *so; 683 struct sockaddr *fromsa = 0; 684#ifdef KTRACE 685 struct iovec *ktriov = NULL; 686#endif 687 688 error = getsock(p->p_fd, s, &fp); 689 if (error) 690 return (error); 691 auio.uio_iov = mp->msg_iov; 692 auio.uio_iovcnt = mp->msg_iovlen; 693 auio.uio_segflg = UIO_USERSPACE; 694 auio.uio_rw = UIO_READ; 695 auio.uio_procp = p; 696 auio.uio_offset = 0; /* XXX */ 697 auio.uio_resid = 0; 698 iov = mp->msg_iov; 699 for (i = 0; i < mp->msg_iovlen; i++, iov++) { 700 if ((auio.uio_resid += iov->iov_len) < 0) 701 return (EINVAL); 702 } 703#ifdef KTRACE 704 if (KTRPOINT(p, KTR_GENIO)) { 705 int iovlen = auio.uio_iovcnt * sizeof (struct iovec); 706 707 MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK); 708 bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen); 709 } 710#endif 711 len = auio.uio_resid; 712 so = (struct socket *)fp->f_data; 713 error = so->so_proto->pr_usrreqs->pru_soreceive(so, &fromsa, &auio, 714 (struct mbuf **)0, mp->msg_control ? &control : (struct mbuf **)0, 715 &mp->msg_flags); 716 if (error) { 717 if (auio.uio_resid != len && (error == ERESTART || 718 error == EINTR || error == EWOULDBLOCK)) 719 error = 0; 720 } 721#ifdef KTRACE 722 if (ktriov != NULL) { 723 if (error == 0) 724 ktrgenio(p->p_tracep, s, UIO_READ, 725 ktriov, len - auio.uio_resid, error); 726 FREE(ktriov, M_TEMP); 727 } 728#endif 729 if (error) 730 goto out; 731 p->p_retval[0] = len - auio.uio_resid; 732 if (mp->msg_name) { 733 len = mp->msg_namelen; 734 if (len <= 0 || fromsa == 0) 735 len = 0; 736 else { 737#ifndef MIN 738#define MIN(a,b) ((a)>(b)?(b):(a)) 739#endif 740 /* save sa_len before it is destroyed by MSG_COMPAT */ 741 len = MIN(len, fromsa->sa_len); 742#ifdef COMPAT_OLDSOCK 743 if (mp->msg_flags & MSG_COMPAT) 744 ((struct osockaddr *)fromsa)->sa_family = 745 fromsa->sa_family; 746#endif 747 error = copyout(fromsa, 748 (caddr_t)mp->msg_name, (unsigned)len); 749 if (error) 750 goto out; 751 } 752 mp->msg_namelen = len; 753 if (namelenp && 754 (error = copyout((caddr_t)&len, namelenp, sizeof (int)))) { 755#ifdef COMPAT_OLDSOCK 756 if (mp->msg_flags & MSG_COMPAT) 757 error = 0; /* old recvfrom didn't check */ 758 else 759#endif 760 goto out; 761 } 762 } 763 if (mp->msg_control) { 764#ifdef COMPAT_OLDSOCK 765 /* 766 * We assume that old recvmsg calls won't receive access 767 * rights and other control info, esp. as control info 768 * is always optional and those options didn't exist in 4.3. 769 * If we receive rights, trim the cmsghdr; anything else 770 * is tossed. 771 */ 772 if (control && mp->msg_flags & MSG_COMPAT) { 773 if (mtod(control, struct cmsghdr *)->cmsg_level != 774 SOL_SOCKET || 775 mtod(control, struct cmsghdr *)->cmsg_type != 776 SCM_RIGHTS) { 777 mp->msg_controllen = 0; 778 goto out; 779 } 780 control->m_len -= sizeof (struct cmsghdr); 781 control->m_data += sizeof (struct cmsghdr); 782 } 783#endif 784 len = mp->msg_controllen; 785 m = control; 786 mp->msg_controllen = 0; 787 ctlbuf = (caddr_t) mp->msg_control; 788 789 while (m && len > 0) { 790 unsigned int tocopy; 791 792 if (len >= m->m_len) 793 tocopy = m->m_len; 794 else { 795 mp->msg_flags |= MSG_CTRUNC; 796 tocopy = len; 797 } 798 799 if (error = copyout((caddr_t)mtod(m, caddr_t), 800 ctlbuf, tocopy)) 801 goto out; 802 803 ctlbuf += tocopy; 804 len -= tocopy; 805 m = m->m_next; 806 } 807 mp->msg_controllen = ctlbuf - mp->msg_control; 808 } 809out: 810 if (fromsa) 811 FREE(fromsa, M_SONAME); 812 if (control) 813 m_freem(control); 814 return (error); 815} 816 817int 818recvfrom(p, uap) 819 struct proc *p; 820 register struct recvfrom_args /* { 821 int s; 822 caddr_t buf; 823 size_t len; 824 int flags; 825 caddr_t from; 826 int *fromlenaddr; 827 } */ *uap; 828{ 829 struct msghdr msg; 830 struct iovec aiov; 831 int error; 832 833 if (uap->fromlenaddr) { 834 error = copyin((caddr_t)uap->fromlenaddr, 835 (caddr_t)&msg.msg_namelen, sizeof (msg.msg_namelen)); 836 if (error) 837 return (error); 838 } else 839 msg.msg_namelen = 0; 840 msg.msg_name = uap->from; 841 msg.msg_iov = &aiov; 842 msg.msg_iovlen = 1; 843 aiov.iov_base = uap->buf; 844 aiov.iov_len = uap->len; 845 msg.msg_control = 0; 846 msg.msg_flags = uap->flags; 847 return (recvit(p, uap->s, &msg, (caddr_t)uap->fromlenaddr)); 848} 849 850#ifdef COMPAT_OLDSOCK 851int 852orecvfrom(p, uap) 853 struct proc *p; 854 struct recvfrom_args *uap; 855{ 856 857 uap->flags |= MSG_COMPAT; 858 return (recvfrom(p, uap)); 859} 860#endif 861 862 863#ifdef COMPAT_OLDSOCK 864int 865orecv(p, uap) 866 struct proc *p; 867 register struct orecv_args /* { 868 int s; 869 caddr_t buf; 870 int len; 871 int flags; 872 } */ *uap; 873{ 874 struct msghdr msg; 875 struct iovec aiov; 876 877 msg.msg_name = 0; 878 msg.msg_namelen = 0; 879 msg.msg_iov = &aiov; 880 msg.msg_iovlen = 1; 881 aiov.iov_base = uap->buf; 882 aiov.iov_len = uap->len; 883 msg.msg_control = 0; 884 msg.msg_flags = uap->flags; 885 return (recvit(p, uap->s, &msg, (caddr_t)0)); 886} 887 888/* 889 * Old recvmsg. This code takes advantage of the fact that the old msghdr 890 * overlays the new one, missing only the flags, and with the (old) access 891 * rights where the control fields are now. 892 */ 893int 894orecvmsg(p, uap) 895 struct proc *p; 896 register struct orecvmsg_args /* { 897 int s; 898 struct omsghdr *msg; 899 int flags; 900 } */ *uap; 901{ 902 struct msghdr msg; 903 struct iovec aiov[UIO_SMALLIOV], *iov; 904 int error; 905 906 error = copyin((caddr_t)uap->msg, (caddr_t)&msg, 907 sizeof (struct omsghdr)); 908 if (error) 909 return (error); 910 if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) { 911 if ((u_int)msg.msg_iovlen >= UIO_MAXIOV) 912 return (EMSGSIZE); 913 MALLOC(iov, struct iovec *, 914 sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV, 915 M_WAITOK); 916 } else 917 iov = aiov; 918 msg.msg_flags = uap->flags | MSG_COMPAT; 919 error = copyin((caddr_t)msg.msg_iov, (caddr_t)iov, 920 (unsigned)(msg.msg_iovlen * sizeof (struct iovec))); 921 if (error) 922 goto done; 923 msg.msg_iov = iov; 924 error = recvit(p, uap->s, &msg, (caddr_t)&uap->msg->msg_namelen); 925 926 if (msg.msg_controllen && error == 0) 927 error = copyout((caddr_t)&msg.msg_controllen, 928 (caddr_t)&uap->msg->msg_accrightslen, sizeof (int)); 929done: 930 if (iov != aiov) 931 FREE(iov, M_IOV); 932 return (error); 933} 934#endif 935 936int 937recvmsg(p, uap) 938 struct proc *p; 939 register struct recvmsg_args /* { 940 int s; 941 struct msghdr *msg; 942 int flags; 943 } */ *uap; 944{ 945 struct msghdr msg; 946 struct iovec aiov[UIO_SMALLIOV], *uiov, *iov; 947 register int error; 948 949 error = copyin((caddr_t)uap->msg, (caddr_t)&msg, sizeof (msg)); 950 if (error) 951 return (error); 952 if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) { 953 if ((u_int)msg.msg_iovlen >= UIO_MAXIOV) 954 return (EMSGSIZE); 955 MALLOC(iov, struct iovec *, 956 sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV, 957 M_WAITOK); 958 } else 959 iov = aiov; 960#ifdef COMPAT_OLDSOCK 961 msg.msg_flags = uap->flags &~ MSG_COMPAT; 962#else 963 msg.msg_flags = uap->flags; 964#endif 965 uiov = msg.msg_iov; 966 msg.msg_iov = iov; 967 error = copyin((caddr_t)uiov, (caddr_t)iov, 968 (unsigned)(msg.msg_iovlen * sizeof (struct iovec))); 969 if (error) 970 goto done; 971 error = recvit(p, uap->s, &msg, (caddr_t)0); 972 if (!error) { 973 msg.msg_iov = uiov; 974 error = copyout((caddr_t)&msg, (caddr_t)uap->msg, sizeof(msg)); 975 } 976done: 977 if (iov != aiov) 978 FREE(iov, M_IOV); 979 return (error); 980} 981 982/* ARGSUSED */ 983int 984shutdown(p, uap) 985 struct proc *p; 986 register struct shutdown_args /* { 987 int s; 988 int how; 989 } */ *uap; 990{ 991 struct file *fp; 992 int error; 993 994 error = getsock(p->p_fd, uap->s, &fp); 995 if (error) 996 return (error); 997 return (soshutdown((struct socket *)fp->f_data, uap->how)); 998} 999 1000/* ARGSUSED */ 1001int 1002setsockopt(p, uap) 1003 struct proc *p; 1004 register struct setsockopt_args /* { 1005 int s; 1006 int level; 1007 int name; 1008 caddr_t val; 1009 int valsize; 1010 } */ *uap; 1011{ 1012 struct file *fp; 1013 struct sockopt sopt; 1014 int error; 1015 1016 if (uap->val == 0 && uap->valsize != 0) 1017 return (EFAULT); 1018 if (uap->valsize < 0) 1019 return (EINVAL); 1020 1021 error = getsock(p->p_fd, uap->s, &fp); 1022 if (error) 1023 return (error); 1024 1025 sopt.sopt_dir = SOPT_SET; 1026 sopt.sopt_level = uap->level; 1027 sopt.sopt_name = uap->name; 1028 sopt.sopt_val = uap->val; 1029 sopt.sopt_valsize = uap->valsize; 1030 sopt.sopt_p = p; 1031 1032 return (sosetopt((struct socket *)fp->f_data, &sopt)); 1033} 1034 1035/* ARGSUSED */ 1036int 1037getsockopt(p, uap) 1038 struct proc *p; 1039 register struct getsockopt_args /* { 1040 int s; 1041 int level; 1042 int name; 1043 caddr_t val; 1044 int *avalsize; 1045 } */ *uap; 1046{ 1047 int valsize, error; 1048 struct file *fp; 1049 struct sockopt sopt; 1050 1051 error = getsock(p->p_fd, uap->s, &fp); 1052 if (error) 1053 return (error); 1054 if (uap->val) { 1055 error = copyin((caddr_t)uap->avalsize, (caddr_t)&valsize, 1056 sizeof (valsize)); 1057 if (error) 1058 return (error); 1059 if (valsize < 0) 1060 return (EINVAL); 1061 } else 1062 valsize = 0; 1063 1064 sopt.sopt_dir = SOPT_GET; 1065 sopt.sopt_level = uap->level; 1066 sopt.sopt_name = uap->name; 1067 sopt.sopt_val = uap->val; 1068 sopt.sopt_valsize = (size_t)valsize; /* checked non-negative above */ 1069 sopt.sopt_p = p; 1070 1071 error = sogetopt((struct socket *)fp->f_data, &sopt); 1072 if (error == 0) { 1073 valsize = sopt.sopt_valsize; 1074 error = copyout((caddr_t)&valsize, 1075 (caddr_t)uap->avalsize, sizeof (valsize)); 1076 } 1077 return (error); 1078} 1079 1080/* 1081 * Get socket name. 1082 */ 1083/* ARGSUSED */ 1084static int 1085getsockname1(p, uap, compat) 1086 struct proc *p; 1087 register struct getsockname_args /* { 1088 int fdes; 1089 caddr_t asa; 1090 int *alen; 1091 } */ *uap; 1092 int compat; 1093{ 1094 struct file *fp; 1095 register struct socket *so; 1096 struct sockaddr *sa; 1097 int len, error; 1098 1099 error = getsock(p->p_fd, uap->fdes, &fp); 1100 if (error) 1101 return (error); 1102 error = copyin((caddr_t)uap->alen, (caddr_t)&len, sizeof (len)); 1103 if (error) 1104 return (error); 1105 so = (struct socket *)fp->f_data; 1106 sa = 0; 1107 error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &sa); 1108 if (error) 1109 goto bad; 1110 if (sa == 0) { 1111 len = 0; 1112 goto gotnothing; 1113 } 1114 1115 len = MIN(len, sa->sa_len); 1116#ifdef COMPAT_OLDSOCK 1117 if (compat) 1118 ((struct osockaddr *)sa)->sa_family = sa->sa_family; 1119#endif 1120 error = copyout(sa, (caddr_t)uap->asa, (u_int)len); 1121 if (error == 0) 1122gotnothing: 1123 error = copyout((caddr_t)&len, (caddr_t)uap->alen, 1124 sizeof (len)); 1125bad: 1126 if (sa) 1127 FREE(sa, M_SONAME); 1128 return (error); 1129} 1130 1131int 1132getsockname(p, uap) 1133 struct proc *p; 1134 struct getsockname_args *uap; 1135{ 1136 1137 return (getsockname1(p, uap, 0)); 1138} 1139 1140#ifdef COMPAT_OLDSOCK 1141int 1142ogetsockname(p, uap) 1143 struct proc *p; 1144 struct getsockname_args *uap; 1145{ 1146 1147 return (getsockname1(p, uap, 1)); 1148} 1149#endif /* COMPAT_OLDSOCK */ 1150 1151/* 1152 * Get name of peer for connected socket. 1153 */ 1154/* ARGSUSED */ 1155static int 1156getpeername1(p, uap, compat) 1157 struct proc *p; 1158 register struct getpeername_args /* { 1159 int fdes; 1160 caddr_t asa; 1161 int *alen; 1162 } */ *uap; 1163 int compat; 1164{ 1165 struct file *fp; 1166 register struct socket *so; 1167 struct sockaddr *sa; 1168 int len, error; 1169 1170 error = getsock(p->p_fd, uap->fdes, &fp); 1171 if (error) 1172 return (error); 1173 so = (struct socket *)fp->f_data; 1174 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0) 1175 return (ENOTCONN); 1176 error = copyin((caddr_t)uap->alen, (caddr_t)&len, sizeof (len)); 1177 if (error) 1178 return (error); 1179 sa = 0; 1180 error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, &sa); 1181 if (error) 1182 goto bad; 1183 if (sa == 0) { 1184 len = 0; 1185 goto gotnothing; 1186 } 1187 len = MIN(len, sa->sa_len); 1188#ifdef COMPAT_OLDSOCK 1189 if (compat) 1190 ((struct osockaddr *)sa)->sa_family = 1191 sa->sa_family; 1192#endif 1193 error = copyout(sa, (caddr_t)uap->asa, (u_int)len); 1194 if (error) 1195 goto bad; 1196gotnothing: 1197 error = copyout((caddr_t)&len, (caddr_t)uap->alen, sizeof (len)); 1198bad: 1199 if (sa) FREE(sa, M_SONAME); 1200 return (error); 1201} 1202 1203int 1204getpeername(p, uap) 1205 struct proc *p; 1206 struct getpeername_args *uap; 1207{ 1208 1209 return (getpeername1(p, uap, 0)); 1210} 1211 1212#ifdef COMPAT_OLDSOCK 1213int 1214ogetpeername(p, uap) 1215 struct proc *p; 1216 struct ogetpeername_args *uap; 1217{ 1218 1219 /* XXX uap should have type `getpeername_args *' to begin with. */ 1220 return (getpeername1(p, (struct getpeername_args *)uap, 1)); 1221} 1222#endif /* COMPAT_OLDSOCK */ 1223 1224int 1225sockargs(mp, buf, buflen, type) 1226 struct mbuf **mp; 1227 caddr_t buf; 1228 int buflen, type; 1229{ 1230 register struct sockaddr *sa; 1231 register struct mbuf *m; 1232 int error; 1233 1234 if ((u_int)buflen > MLEN) { 1235#ifdef COMPAT_OLDSOCK 1236 if (type == MT_SONAME && (u_int)buflen <= 112) 1237 buflen = MLEN; /* unix domain compat. hack */ 1238 else 1239#endif 1240 return (EINVAL); 1241 } 1242 m = m_get(M_WAIT, type); 1243 if (m == NULL) 1244 return (ENOBUFS); 1245 m->m_len = buflen; 1246 error = copyin(buf, mtod(m, caddr_t), (u_int)buflen); 1247 if (error) 1248 (void) m_free(m); 1249 else { 1250 *mp = m; 1251 if (type == MT_SONAME) { 1252 sa = mtod(m, struct sockaddr *); 1253 1254#if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN 1255 if (sa->sa_family == 0 && sa->sa_len < AF_MAX) 1256 sa->sa_family = sa->sa_len; 1257#endif 1258 sa->sa_len = buflen; 1259 } 1260 } 1261 return (error); 1262} 1263 1264int 1265getsockaddr(namp, uaddr, len) 1266 struct sockaddr **namp; 1267 caddr_t uaddr; 1268 size_t len; 1269{ 1270 struct sockaddr *sa; 1271 int error; 1272 1273 if (len > SOCK_MAXADDRLEN) 1274 return ENAMETOOLONG; 1275 MALLOC(sa, struct sockaddr *, len, M_SONAME, M_WAITOK); 1276 error = copyin(uaddr, sa, len); 1277 if (error) { 1278 FREE(sa, M_SONAME); 1279 } else { 1280#if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN 1281 if (sa->sa_family == 0 && sa->sa_len < AF_MAX) 1282 sa->sa_family = sa->sa_len; 1283#endif 1284 sa->sa_len = len; 1285 *namp = sa; 1286 } 1287 return error; 1288} 1289 1290int 1291getsock(fdp, fdes, fpp) 1292 struct filedesc *fdp; 1293 int fdes; 1294 struct file **fpp; 1295{ 1296 register struct file *fp; 1297 1298 if ((unsigned)fdes >= fdp->fd_nfiles || 1299 (fp = fdp->fd_ofiles[fdes]) == NULL) 1300 return (EBADF); 1301 if (fp->f_type != DTYPE_SOCKET) 1302 return (ENOTSOCK); 1303 *fpp = fp; 1304 return (0); 1305} 1306 1307/* 1308 * Allocate a pool of sf_bufs (sendfile(2) or "super-fast" if you prefer. :-)) 1309 * XXX - The sf_buf functions are currently private to sendfile(2), so have 1310 * been made static, but may be useful in the future for doing zero-copy in 1311 * other parts of the networking code. 1312 */ 1313static void 1314sf_buf_init(void *arg) 1315{ 1316 int i; 1317 1318 SLIST_INIT(&sf_freelist); 1319 sf_base = kmem_alloc_pageable(kernel_map, nsfbufs * PAGE_SIZE); 1320 sf_bufs = malloc(nsfbufs * sizeof(struct sf_buf), M_TEMP, M_NOWAIT); 1321 bzero(sf_bufs, nsfbufs * sizeof(struct sf_buf)); 1322 for (i = 0; i < nsfbufs; i++) { 1323 sf_bufs[i].kva = sf_base + i * PAGE_SIZE; 1324 SLIST_INSERT_HEAD(&sf_freelist, &sf_bufs[i], free_list); 1325 } 1326} 1327 1328/* 1329 * Get an sf_buf from the freelist. Will block if none are available. 1330 */ 1331static struct sf_buf * 1332sf_buf_alloc() 1333{ 1334 struct sf_buf *sf; 1335 int s; 1336 1337 s = splimp(); 1338 while ((sf = SLIST_FIRST(&sf_freelist)) == NULL) { 1339 sf_buf_alloc_want = 1; 1340 tsleep(&sf_freelist, PVM, "sfbufa", 0); 1341 } 1342 SLIST_REMOVE_HEAD(&sf_freelist, free_list); 1343 splx(s); 1344 sf->refcnt = 1; 1345 return (sf); 1346} 1347 1348#define dtosf(x) (&sf_bufs[((uintptr_t)(x) - (uintptr_t)sf_base) >> PAGE_SHIFT]) 1349static void 1350sf_buf_ref(caddr_t addr, u_int size) 1351{ 1352 struct sf_buf *sf; 1353 1354 sf = dtosf(addr); 1355 if (sf->refcnt == 0) 1356 panic("sf_buf_ref: referencing a free sf_buf"); 1357 sf->refcnt++; 1358} 1359 1360/* 1361 * Lose a reference to an sf_buf. When none left, detach mapped page 1362 * and release resources back to the system. 1363 * 1364 * Must be called at splimp. 1365 */ 1366static void 1367sf_buf_free(caddr_t addr, u_int size) 1368{ 1369 struct sf_buf *sf; 1370 struct vm_page *m; 1371 int s; 1372 1373 sf = dtosf(addr); 1374 if (sf->refcnt == 0) 1375 panic("sf_buf_free: freeing free sf_buf"); 1376 sf->refcnt--; 1377 if (sf->refcnt == 0) { 1378 pmap_qremove((vm_offset_t)addr, 1); 1379 m = sf->m; 1380 s = splvm(); 1381 vm_page_unwire(m, 0); 1382 /* 1383 * Check for the object going away on us. This can 1384 * happen since we don't hold a reference to it. 1385 * If so, we're responsible for freeing the page. 1386 */ 1387 if (m->wire_count == 0 && m->object == NULL) 1388 vm_page_free(m); 1389 splx(s); 1390 sf->m = NULL; 1391 SLIST_INSERT_HEAD(&sf_freelist, sf, free_list); 1392 if (sf_buf_alloc_want) { 1393 sf_buf_alloc_want = 0; 1394 wakeup(&sf_freelist); 1395 } 1396 } 1397} 1398 1399/* 1400 * sendfile(2). 1401 * int sendfile(int fd, int s, off_t offset, size_t nbytes, 1402 * struct sf_hdtr *hdtr, off_t *sbytes, int flags) 1403 * 1404 * Send a file specified by 'fd' and starting at 'offset' to a socket 1405 * specified by 's'. Send only 'nbytes' of the file or until EOF if 1406 * nbytes == 0. Optionally add a header and/or trailer to the socket 1407 * output. If specified, write the total number of bytes sent into *sbytes. 1408 */ 1409int 1410sendfile(struct proc *p, struct sendfile_args *uap) 1411{ 1412 struct file *fp; 1413 struct filedesc *fdp = p->p_fd; 1414 struct vnode *vp; 1415 struct vm_object *obj; 1416 struct socket *so; 1417 struct mbuf *m; 1418 struct sf_buf *sf; 1419 struct vm_page *pg; 1420 struct writev_args nuap; 1421 struct sf_hdtr hdtr; 1422 off_t off, xfsize, sbytes = 0; 1423 int error = 0, i, s; 1424 1425 /* 1426 * Do argument checking. Must be a regular file in, stream 1427 * type and connected socket out, positive offset. 1428 */ 1429 if (((u_int)uap->fd) >= fdp->fd_nfiles || 1430 (fp = fdp->fd_ofiles[uap->fd]) == NULL || 1431 (fp->f_flag & FREAD) == 0) { 1432 error = EBADF; 1433 goto done; 1434 } 1435 if (fp->f_type != DTYPE_VNODE) { 1436 error = EINVAL; 1437 goto done; 1438 } 1439 vp = (struct vnode *)fp->f_data; 1440 obj = vp->v_object; 1441 if (vp->v_type != VREG || obj == NULL) { 1442 error = EINVAL; 1443 goto done; 1444 } 1445 error = getsock(p->p_fd, uap->s, &fp); 1446 if (error) 1447 goto done; 1448 so = (struct socket *)fp->f_data; 1449 if (so->so_type != SOCK_STREAM) { 1450 error = EINVAL; 1451 goto done; 1452 } 1453 if ((so->so_state & SS_ISCONNECTED) == 0) { 1454 error = ENOTCONN; 1455 goto done; 1456 } 1457 if (uap->offset < 0) { 1458 error = EINVAL; 1459 goto done; 1460 } 1461 1462 /* 1463 * If specified, get the pointer to the sf_hdtr struct for 1464 * any headers/trailers. 1465 */ 1466 if (uap->hdtr != NULL) { 1467 error = copyin(uap->hdtr, &hdtr, sizeof(hdtr)); 1468 if (error) 1469 goto done; 1470 /* 1471 * Send any headers. Wimp out and use writev(2). 1472 */ 1473 if (hdtr.headers != NULL) { 1474 nuap.fd = uap->s; 1475 nuap.iovp = hdtr.headers; 1476 nuap.iovcnt = hdtr.hdr_cnt; 1477 error = writev(p, &nuap); 1478 if (error) 1479 goto done; 1480 sbytes += p->p_retval[0]; 1481 } 1482 } 1483 1484 /* 1485 * Protect against multiple writers to the socket. 1486 */ 1487 (void) sblock(&so->so_snd, M_WAITOK); 1488 1489 /* 1490 * Loop through the pages in the file, starting with the requested 1491 * offset. Get a file page (do I/O if necessary), map the file page 1492 * into an sf_buf, attach an mbuf header to the sf_buf, and queue 1493 * it on the socket. 1494 */ 1495 for (off = uap->offset; ; off += xfsize, sbytes += xfsize) { 1496 vm_pindex_t pindex; 1497 vm_offset_t pgoff; 1498 1499 pindex = OFF_TO_IDX(off); 1500retry_lookup: 1501 /* 1502 * Calculate the amount to transfer. Not to exceed a page, 1503 * the EOF, or the passed in nbytes. 1504 */ 1505 xfsize = obj->un_pager.vnp.vnp_size - off; 1506 if (xfsize > PAGE_SIZE) 1507 xfsize = PAGE_SIZE; 1508 pgoff = (vm_offset_t)(off & PAGE_MASK); 1509 if (PAGE_SIZE - pgoff < xfsize) 1510 xfsize = PAGE_SIZE - pgoff; 1511 if (uap->nbytes && xfsize > (uap->nbytes - sbytes)) 1512 xfsize = uap->nbytes - sbytes; 1513 if (xfsize <= 0) 1514 break; 1515 /* 1516 * Optimize the non-blocking case by looking at the socket space 1517 * before going to the extra work of constituting the sf_buf. 1518 */ 1519 if ((so->so_state & SS_NBIO) && sbspace(&so->so_snd) <= 0) { 1520 if (so->so_state & SS_CANTSENDMORE) 1521 error = EPIPE; 1522 else 1523 error = EAGAIN; 1524 sbunlock(&so->so_snd); 1525 goto done; 1526 } 1527 /* 1528 * Attempt to look up the page. If the page doesn't exist or the 1529 * part we're interested in isn't valid, then read it from disk. 1530 * If some other part of the kernel has this page (i.e. it's busy), 1531 * then disk I/O may be occuring on it, so wait and retry. 1532 */ 1533 pg = vm_page_lookup(obj, pindex); 1534 if (pg == NULL || (!(pg->flags & PG_BUSY) && !pg->busy && 1535 !vm_page_is_valid(pg, pgoff, xfsize))) { 1536 struct uio auio; 1537 struct iovec aiov; 1538 int bsize; 1539 1540 if (pg == NULL) { 1541 pg = vm_page_alloc(obj, pindex, VM_ALLOC_NORMAL); 1542 if (pg == NULL) { 1543 VM_WAIT; 1544 goto retry_lookup; 1545 } 1546 vm_page_flag_clear(pg, PG_BUSY); 1547 } 1548 /* 1549 * Ensure that our page is still around when the I/O completes. 1550 */ 1551 vm_page_io_start(pg); 1552 vm_page_wire(pg); 1553 /* 1554 * Get the page from backing store. 1555 */ 1556 bsize = vp->v_mount->mnt_stat.f_iosize; 1557 auio.uio_iov = &aiov; 1558 auio.uio_iovcnt = 1; 1559 aiov.iov_base = 0; 1560 aiov.iov_len = MAXBSIZE; 1561 auio.uio_resid = MAXBSIZE; 1562 auio.uio_offset = trunc_page(off); 1563 auio.uio_segflg = UIO_NOCOPY; 1564 auio.uio_rw = UIO_READ; 1565 auio.uio_procp = p; 1566 vn_lock(vp, LK_SHARED | LK_NOPAUSE | LK_RETRY, p); 1567 error = VOP_READ(vp, &auio, IO_VMIO | ((MAXBSIZE / bsize) << 16), 1568 p->p_ucred); 1569 VOP_UNLOCK(vp, 0, p); 1570 vm_page_flag_clear(pg, PG_ZERO); 1571 vm_page_io_finish(pg); 1572 if (error) { 1573 vm_page_unwire(pg, 0); 1574 /* 1575 * See if anyone else might know about this page. 1576 * If not and it is not valid, then free it. 1577 */ 1578 if (pg->wire_count == 0 && pg->valid == 0 && 1579 pg->busy == 0 && !(pg->flags & PG_BUSY) && 1580 pg->hold_count == 0) 1581 vm_page_free(pg); 1582 sbunlock(&so->so_snd); 1583 goto done; 1584 } 1585 } else { 1586 if ((pg->flags & PG_BUSY) || pg->busy) { 1587 s = splvm(); 1588 if ((pg->flags & PG_BUSY) || pg->busy) { 1589 /* 1590 * Page is busy. Wait and retry. 1591 */ 1592 vm_page_flag_set(pg, PG_WANTED); 1593 tsleep(pg, PVM, "sfpbsy", 0); 1594 splx(s); 1595 goto retry_lookup; 1596 } 1597 splx(s); 1598 } 1599 /* 1600 * Protect from having the page ripped out from beneath us. 1601 */ 1602 vm_page_wire(pg); 1603 } 1604 /* 1605 * Allocate a kernel virtual page and insert the physical page 1606 * into it. 1607 */ 1608 sf = sf_buf_alloc(); 1609 sf->m = pg; 1610 pmap_qenter(sf->kva, &pg, 1); 1611 /* 1612 * Get an mbuf header and set it up as having external storage. 1613 */ 1614 MGETHDR(m, M_WAIT, MT_DATA); 1615 m->m_ext.ext_free = sf_buf_free; 1616 m->m_ext.ext_ref = sf_buf_ref; 1617 m->m_ext.ext_buf = (void *)sf->kva; 1618 m->m_ext.ext_size = PAGE_SIZE; 1619 m->m_data = (char *) sf->kva + pgoff; 1620 m->m_flags |= M_EXT; 1621 m->m_pkthdr.len = m->m_len = xfsize; 1622 /* 1623 * Add the buffer to the socket buffer chain. 1624 */ 1625 s = splnet(); 1626retry_space: 1627 /* 1628 * Make sure that the socket is still able to take more data. 1629 * CANTSENDMORE being true usually means that the connection 1630 * was closed. so_error is true when an error was sensed after 1631 * a previous send. 1632 * The state is checked after the page mapping and buffer 1633 * allocation above since those operations may block and make 1634 * any socket checks stale. From this point forward, nothing 1635 * blocks before the pru_send (or more accurately, any blocking 1636 * results in a loop back to here to re-check). 1637 */ 1638 if ((so->so_state & SS_CANTSENDMORE) || so->so_error) { 1639 if (so->so_state & SS_CANTSENDMORE) { 1640 error = EPIPE; 1641 } else { 1642 error = so->so_error; 1643 so->so_error = 0; 1644 } 1645 m_freem(m); 1646 sbunlock(&so->so_snd); 1647 splx(s); 1648 goto done; 1649 } 1650 /* 1651 * Wait for socket space to become available. We do this just 1652 * after checking the connection state above in order to avoid 1653 * a race condition with sbwait(). 1654 */ 1655 if (sbspace(&so->so_snd) < so->so_snd.sb_lowat) { 1656 if (so->so_state & SS_NBIO) { 1657 m_freem(m); 1658 sbunlock(&so->so_snd); 1659 splx(s); 1660 error = EAGAIN; 1661 goto done; 1662 } 1663 error = sbwait(&so->so_snd); 1664 /* 1665 * An error from sbwait usually indicates that we've 1666 * been interrupted by a signal. If we've sent anything 1667 * then return bytes sent, otherwise return the error. 1668 */ 1669 if (error) { 1670 m_freem(m); 1671 sbunlock(&so->so_snd); 1672 splx(s); 1673 goto done; 1674 } 1675 goto retry_space; 1676 } 1677 error = (*so->so_proto->pr_usrreqs->pru_send)(so, 0, m, 0, 0, p); 1678 splx(s); 1679 if (error) { 1680 sbunlock(&so->so_snd); 1681 goto done; 1682 } 1683 } 1684 sbunlock(&so->so_snd); 1685 1686 /* 1687 * Send trailers. Wimp out and use writev(2). 1688 */ 1689 if (uap->hdtr != NULL && hdtr.trailers != NULL) { 1690 nuap.fd = uap->s; 1691 nuap.iovp = hdtr.trailers; 1692 nuap.iovcnt = hdtr.trl_cnt; 1693 error = writev(p, &nuap); 1694 if (error) 1695 goto done; 1696 sbytes += p->p_retval[0]; 1697 } 1698 1699done: 1700 if (uap->sbytes != NULL) { 1701 copyout(&sbytes, uap->sbytes, sizeof(off_t)); 1702 } 1703 return (error); 1704} 1705