uipc_socket.c (167799) uipc_socket.c (167895)
1/*-
2 * Copyright (c) 1982, 1986, 1988, 1990, 1993
3 * The Regents of the University of California.
4 * Copyright (c) 2004 The FreeBSD Foundation
5 * Copyright (c) 2004-2006 Robert N. M. Watson
6 * All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 * 4. Neither the name of the University nor the names of its contributors
17 * may be used to endorse or promote products derived from this software
18 * without specific prior written permission.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30 * SUCH DAMAGE.
31 *
32 * @(#)uipc_socket.c 8.3 (Berkeley) 4/15/94
33 */
34
35/*
36 * Comments on the socket life cycle:
37 *
38 * soalloc() sets up socket layer state for a socket, called only by
39 * socreate() and sonewconn(). Socket layer private.
40 *
41 * sodealloc() tears down socket layer state for a socket, called only by
42 * sofree(), socreate(), and sonewconn(). Socket layer private.
43 *
44 * pru_attach() associates protocol layer state with an allocated socket;
45 * called only once, may fail, aborting socket allocation. This is called
46 * from socreate() and sonewconn(). Socket layer private.
47 *
48 * pru_detach() disassociates protocol layer state from an attached socket,
49 * and will be called exactly once for sockets in which pru_attach() has
50 * been successfully called. If pru_attach() returned an error,
51 * pru_detach() will not be called. Socket layer private.
52 *
53 * pru_abort() and pru_close() notify the protocol layer that the last
54 * consumer of a socket is starting to tear down the socket, and that the
55 * protocol should terminate the connection. Historically, pru_abort() also
56 * detached protocol state from the socket state, but this is no longer the
57 * case.
58 *
59 * socreate() creates a socket and attaches protocol state. This is a public
60 * interface that may be used by socket layer consumers to create new
61 * sockets.
62 *
63 * sonewconn() creates a socket and attaches protocol state. This is a
64 * public interface that may be used by protocols to create new sockets when
65 * a new connection is received and will be available for accept() on a
66 * listen socket.
67 *
68 * soclose() destroys a socket after possibly waiting for it to disconnect.
69 * This is a public interface that socket consumers should use to close and
70 * release a socket when done with it.
71 *
72 * soabort() destroys a socket without waiting for it to disconnect (used
73 * only for incoming connections that are already partially or fully
74 * connected). This is used internally by the socket layer when clearing
75 * listen socket queues (due to overflow or close on the listen socket), but
76 * is also a public interface protocols may use to abort connections in
77 * their incomplete listen queues should they no longer be required. Sockets
78 * placed in completed connection listen queues should not be aborted for
79 * reasons described in the comment above the soclose() implementation. This
80 * is not a general purpose close routine, and except in the specific
81 * circumstances described here, should not be used.
82 *
83 * sofree() will free a socket and its protocol state if all references on
84 * the socket have been released, and is the interface used to attempt to
85 * free a socket when a reference is removed. It is a socket layer private
86 * interface.
87 *
88 * NOTE: In addition to socreate() and soclose(), which provide a single
89 * socket reference to the consumer to be managed as required, there are two
90 * calls to explicitly manage socket references, soref(), and sorele().
91 * Currently, these are generally required only when transitioning a socket
92 * from a listen queue to a file descriptor, in order to prevent garbage
93 * collection of the socket at an untimely moment. For a number of reasons,
94 * these interfaces are not preferred, and should be avoided.
95 */
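/*
 * A minimal usage sketch of the life cycle described above, assuming an
 * in-kernel consumer with a valid thread pointer; the function name and
 * error handling are hypothetical and the block is not compiled (#if 0).
 */
#if 0
static int
example_socket_consumer(struct thread *td)
{
	struct socket *so;
	int error;

	/* socreate() returns a socket holding a single reference. */
	error = socreate(PF_INET, &so, SOCK_DGRAM, 0, td->td_ucred, td);
	if (error != 0)
		return (error);

	/* ... exchange data via sosend()/soreceive() ... */

	/* soclose() drops that reference and tears the socket down. */
	return (soclose(so));
}
#endif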
96
97#include <sys/cdefs.h>
98__FBSDID("$FreeBSD: head/sys/kern/uipc_socket.c 167799 2007-03-22 13:21:24Z glebius $");
98__FBSDID("$FreeBSD: head/sys/kern/uipc_socket.c 167895 2007-03-26 08:59:03Z rwatson $");
99
100#include "opt_inet.h"
101#include "opt_mac.h"
102#include "opt_zero.h"
103#include "opt_compat.h"
104
105#include <sys/param.h>
106#include <sys/systm.h>
107#include <sys/fcntl.h>
108#include <sys/limits.h>
109#include <sys/lock.h>
110#include <sys/mac.h>
111#include <sys/malloc.h>
112#include <sys/mbuf.h>
113#include <sys/mutex.h>
114#include <sys/domain.h>
115#include <sys/file.h> /* for struct knote */
116#include <sys/kernel.h>
117#include <sys/event.h>
118#include <sys/eventhandler.h>
119#include <sys/poll.h>
120#include <sys/proc.h>
121#include <sys/protosw.h>
122#include <sys/socket.h>
123#include <sys/socketvar.h>
124#include <sys/resourcevar.h>
125#include <sys/signalvar.h>
126#include <sys/stat.h>
126#include <sys/sysctl.h>
127#include <sys/uio.h>
128#include <sys/jail.h>
129
130#include <security/mac/mac_framework.h>
131
132#include <vm/uma.h>
133
134#ifdef COMPAT_IA32
135#include <sys/mount.h>
136#include <compat/freebsd32/freebsd32.h>
137
138extern struct sysentvec ia32_freebsd_sysvec;
139#endif
140
141static int soreceive_rcvoob(struct socket *so, struct uio *uio,
142 int flags);
143
144static void filt_sordetach(struct knote *kn);
145static int filt_soread(struct knote *kn, long hint);
146static void filt_sowdetach(struct knote *kn);
147static int filt_sowrite(struct knote *kn, long hint);
148static int filt_solisten(struct knote *kn, long hint);
149
150static struct filterops solisten_filtops =
151 { 1, NULL, filt_sordetach, filt_solisten };
152static struct filterops soread_filtops =
153 { 1, NULL, filt_sordetach, filt_soread };
154static struct filterops sowrite_filtops =
155 { 1, NULL, filt_sowdetach, filt_sowrite };
156
157uma_zone_t socket_zone;
158so_gen_t so_gencnt; /* generation count for sockets */
159
160int maxsockets;
161
162MALLOC_DEFINE(M_SONAME, "soname", "socket name");
163MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
164
165static int somaxconn = SOMAXCONN;
166static int sysctl_somaxconn(SYSCTL_HANDLER_ARGS);
167/* XXX: we don't have SYSCTL_USHORT */
168SYSCTL_PROC(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLTYPE_UINT | CTLFLAG_RW,
169 0, sizeof(int), sysctl_somaxconn, "I", "Maximum pending socket connection "
170 "queue size");
171static int numopensockets;
172SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD,
173 &numopensockets, 0, "Number of open sockets");
174#ifdef ZERO_COPY_SOCKETS
175/* These aren't static because they're used in other files. */
176int so_zero_copy_send = 1;
177int so_zero_copy_receive = 1;
178SYSCTL_NODE(_kern_ipc, OID_AUTO, zero_copy, CTLFLAG_RD, 0,
179 "Zero copy controls");
180SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, receive, CTLFLAG_RW,
181 &so_zero_copy_receive, 0, "Enable zero copy receive");
182SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, send, CTLFLAG_RW,
183 &so_zero_copy_send, 0, "Enable zero copy send");
184#endif /* ZERO_COPY_SOCKETS */
185
186/*
187 * accept_mtx locks down per-socket fields relating to accept queues. See
188 * socketvar.h for an annotation of the protected fields of struct socket.
189 */
190struct mtx accept_mtx;
191MTX_SYSINIT(accept_mtx, &accept_mtx, "accept", MTX_DEF);
192
193/*
194 * so_global_mtx protects so_gencnt, numopensockets, and the per-socket
195 * so_gencnt field.
196 */
197static struct mtx so_global_mtx;
198MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_glabel", MTX_DEF);
199
200/*
201 * General IPC sysctl name space, used by sockets and a variety of other IPC
202 * types.
203 */
204SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW, 0, "IPC");
205
206/*
207 * Sysctl to get and set the maximum global sockets limit. Notify protocols
208 * of the change so that they can update their dependent limits as required.
209 */
210static int
211sysctl_maxsockets(SYSCTL_HANDLER_ARGS)
212{
213 int error, newmaxsockets;
214
215 newmaxsockets = maxsockets;
216 error = sysctl_handle_int(oidp, &newmaxsockets, sizeof(int), req);
217 if (error == 0 && req->newptr) {
218 if (newmaxsockets > maxsockets) {
219 maxsockets = newmaxsockets;
220 if (maxsockets > ((maxfiles / 4) * 3)) {
221 maxfiles = (maxsockets * 5) / 4;
222 maxfilesperproc = (maxfiles * 9) / 10;
223 }
224 EVENTHANDLER_INVOKE(maxsockets_change);
225 } else
226 error = EINVAL;
227 }
228 return (error);
229}
230
231SYSCTL_PROC(_kern_ipc, OID_AUTO, maxsockets, CTLTYPE_INT|CTLFLAG_RW,
232 &maxsockets, 0, sysctl_maxsockets, "IU",
233     "Maximum number of sockets available");
234
235/*
236 * Initialise maxsockets.
237 */
238static void init_maxsockets(void *ignored)
239{
240 TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets);
241 maxsockets = imax(maxsockets, imax(maxfiles, nmbclusters));
242}
243SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_maxsockets, NULL);
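/*
 * Sketch of how a protocol might consume the maxsockets_change event
 * invoked by sysctl_maxsockets() above; the example_* names and the pcb
 * zone are hypothetical, and the block is not compiled (#if 0).
 */
#if 0
static uma_zone_t example_pcb_zone;

static void
example_zone_change(void *tag)
{

	/* Track the new global socket limit in a protocol-private zone. */
	uma_zone_set_max(example_pcb_zone, maxsockets);
}

static void
example_proto_init(void)
{

	EVENTHANDLER_REGISTER(maxsockets_change, example_zone_change, NULL,
	    EVENTHANDLER_PRI_ANY);
}
#endif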
244
245/*
246 * Socket operation routines. These routines are called by the routines in
247 * sys_socket.c or from a system process, and implement the semantics of
248 * socket operations by switching out to the protocol specific routines.
249 */
250
251/*
252 * Get a socket structure from our zone, and initialize it. Note that it
253 * would probably be better to allocate socket and PCB at the same time, but
254 * I'm not convinced that all the protocols can be easily modified to do
255 * this.
256 *
257 * soalloc() returns a socket with a ref count of 0.
258 */
259static struct socket *
260soalloc(void)
261{
262 struct socket *so;
263
264 so = uma_zalloc(socket_zone, M_NOWAIT | M_ZERO);
265 if (so == NULL)
266 return (NULL);
267#ifdef MAC
268 if (mac_init_socket(so, M_NOWAIT) != 0) {
269 uma_zfree(socket_zone, so);
270 return (NULL);
271 }
272#endif
273 SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd");
274 SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv");
275 TAILQ_INIT(&so->so_aiojobq);
276 mtx_lock(&so_global_mtx);
277 so->so_gencnt = ++so_gencnt;
278 ++numopensockets;
279 mtx_unlock(&so_global_mtx);
280 return (so);
281}
282
283/*
284 * Free the storage associated with a socket at the socket layer, tear down
285 * locks, labels, etc. All protocol state is assumed already to have been
286 * torn down (and possibly never set up) by the caller.
287 */
288static void
289sodealloc(struct socket *so)
290{
291
292 KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count));
293 KASSERT(so->so_pcb == NULL, ("sodealloc(): so_pcb != NULL"));
294
295 mtx_lock(&so_global_mtx);
296 so->so_gencnt = ++so_gencnt;
297 --numopensockets; /* Could be below, but faster here. */
298 mtx_unlock(&so_global_mtx);
299 if (so->so_rcv.sb_hiwat)
300 (void)chgsbsize(so->so_cred->cr_uidinfo,
301 &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY);
302 if (so->so_snd.sb_hiwat)
303 (void)chgsbsize(so->so_cred->cr_uidinfo,
304 &so->so_snd.sb_hiwat, 0, RLIM_INFINITY);
305#ifdef INET
306	/* remove accept filter if one is present. */
307 if (so->so_accf != NULL)
308 do_setopt_accept_filter(so, NULL);
309#endif
310#ifdef MAC
311 mac_destroy_socket(so);
312#endif
313 crfree(so->so_cred);
314 SOCKBUF_LOCK_DESTROY(&so->so_snd);
315 SOCKBUF_LOCK_DESTROY(&so->so_rcv);
316 uma_zfree(socket_zone, so);
317}
318
319/*
320 * socreate returns a socket with a ref count of 1. The socket should be
321 * closed with soclose().
322 */
323int
324socreate(dom, aso, type, proto, cred, td)
325 int dom;
326 struct socket **aso;
327 int type;
328 int proto;
329 struct ucred *cred;
330 struct thread *td;
331{
332 struct protosw *prp;
333 struct socket *so;
334 int error;
335
336 if (proto)
337 prp = pffindproto(dom, proto, type);
338 else
339 prp = pffindtype(dom, type);
340
341 if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL ||
342 prp->pr_usrreqs->pru_attach == pru_attach_notsupp)
343 return (EPROTONOSUPPORT);
344
345 if (jailed(cred) && jail_socket_unixiproute_only &&
346 prp->pr_domain->dom_family != PF_LOCAL &&
347 prp->pr_domain->dom_family != PF_INET &&
348 prp->pr_domain->dom_family != PF_ROUTE) {
349 return (EPROTONOSUPPORT);
350 }
351
352 if (prp->pr_type != type)
353 return (EPROTOTYPE);
354 so = soalloc();
355 if (so == NULL)
356 return (ENOBUFS);
357
358 TAILQ_INIT(&so->so_incomp);
359 TAILQ_INIT(&so->so_comp);
360 so->so_type = type;
361 so->so_cred = crhold(cred);
362 so->so_proto = prp;
363#ifdef MAC
364 mac_create_socket(cred, so);
365#endif
366 knlist_init(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv),
367 NULL, NULL, NULL);
368 knlist_init(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd),
369 NULL, NULL, NULL);
370 so->so_count = 1;
371 /*
372 * Auto-sizing of socket buffers is managed by the protocols and
373 * the appropriate flags must be set in the pru_attach function.
374 */
375 error = (*prp->pr_usrreqs->pru_attach)(so, proto, td);
376 if (error) {
377 KASSERT(so->so_count == 1, ("socreate: so_count %d",
378 so->so_count));
379 so->so_count = 0;
380 sodealloc(so);
381 return (error);
382 }
383 *aso = so;
384 return (0);
385}
386
387#ifdef REGRESSION
388static int regression_sonewconn_earlytest = 1;
389SYSCTL_INT(_regression, OID_AUTO, sonewconn_earlytest, CTLFLAG_RW,
390 &regression_sonewconn_earlytest, 0, "Perform early sonewconn limit test");
391#endif
392
393/*
394 * When an attempt at a new connection is noted on a socket which accepts
395 * connections, sonewconn is called. If the connection is possible (subject
396 * to space constraints, etc.) then we allocate a new structure, properly
397 * linked into the data structure of the original socket, and return this.
398 * Connstatus may be 0, or SS_ISCONFIRMING, or SS_ISCONNECTED.
399 *
400 * Note: the ref count on the socket is 0 on return.
401 */
402struct socket *
403sonewconn(head, connstatus)
404 register struct socket *head;
405 int connstatus;
406{
407 register struct socket *so;
408 int over;
409
410 ACCEPT_LOCK();
411 over = (head->so_qlen > 3 * head->so_qlimit / 2);
412 ACCEPT_UNLOCK();
413#ifdef REGRESSION
414 if (regression_sonewconn_earlytest && over)
415#else
416 if (over)
417#endif
418 return (NULL);
419 so = soalloc();
420 if (so == NULL)
421 return (NULL);
422 if ((head->so_options & SO_ACCEPTFILTER) != 0)
423 connstatus = 0;
424 so->so_head = head;
425 so->so_type = head->so_type;
426 so->so_options = head->so_options &~ SO_ACCEPTCONN;
427 so->so_linger = head->so_linger;
428 so->so_state = head->so_state | SS_NOFDREF;
429 so->so_proto = head->so_proto;
430 so->so_cred = crhold(head->so_cred);
431#ifdef MAC
432 SOCK_LOCK(head);
433 mac_create_socket_from_socket(head, so);
434 SOCK_UNLOCK(head);
435#endif
436 knlist_init(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv),
437 NULL, NULL, NULL);
438 knlist_init(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd),
439 NULL, NULL, NULL);
440 if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat) ||
441 (*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) {
442 sodealloc(so);
443 return (NULL);
444 }
445 so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
446 so->so_snd.sb_lowat = head->so_snd.sb_lowat;
447 so->so_rcv.sb_timeo = head->so_rcv.sb_timeo;
448 so->so_snd.sb_timeo = head->so_snd.sb_timeo;
449 so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE;
450 so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE;
451 so->so_state |= connstatus;
452 ACCEPT_LOCK();
453 if (connstatus) {
454 TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
455 so->so_qstate |= SQ_COMP;
456 head->so_qlen++;
457 } else {
458 /*
459 * Keep removing sockets from the head until there's room for
460 * us to insert on the tail. In pre-locking revisions, this
461 * was a simple if(), but as we could be racing with other
462 * threads and soabort() requires dropping locks, we must
463 * loop waiting for the condition to be true.
464 */
465 while (head->so_incqlen > head->so_qlimit) {
466 struct socket *sp;
467 sp = TAILQ_FIRST(&head->so_incomp);
468 TAILQ_REMOVE(&head->so_incomp, sp, so_list);
469 head->so_incqlen--;
470 sp->so_qstate &= ~SQ_INCOMP;
471 sp->so_head = NULL;
472 ACCEPT_UNLOCK();
473 soabort(sp);
474 ACCEPT_LOCK();
475 }
476 TAILQ_INSERT_TAIL(&head->so_incomp, so, so_list);
477 so->so_qstate |= SQ_INCOMP;
478 head->so_incqlen++;
479 }
480 ACCEPT_UNLOCK();
481 if (connstatus) {
482 sorwakeup(head);
483 wakeup_one(&head->so_timeo);
484 }
485 return (so);
486}
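/*
 * Sketch of the protocol-side call into sonewconn(), assuming a listening
 * socket 'head' and a connection that has fully completed; the surrounding
 * protocol logic is omitted, the function name is hypothetical, and the
 * block is not compiled (#if 0).
 */
#if 0
static void
example_connection_completed(struct socket *head)
{
	struct socket *so;

	so = sonewconn(head, SS_ISCONNECTED);
	if (so == NULL) {
		/* Listen queue limit reached: drop the new connection. */
		return;
	}
	/* 'so' now sits on head's completed queue awaiting accept(). */
}
#endif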
487
488int
489sobind(so, nam, td)
490 struct socket *so;
491 struct sockaddr *nam;
492 struct thread *td;
493{
494
495 return ((*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td));
496}
497
498/*
499 * solisten() transitions a socket from a non-listening state to a listening
500 * state, but can also be used to update the listen queue depth on an
501 * existing listen socket. The protocol will call back into the sockets
502 * layer using solisten_proto_check() and solisten_proto() to check and set
503 * socket-layer listen state. Call backs are used so that the protocol can
504 * acquire both protocol and socket layer locks in whatever order is required
505 * by the protocol.
506 *
507 * Protocol implementors are advised to hold the socket lock across the
508 * socket-layer test and set to avoid races at the socket layer.
509 */
510int
511solisten(so, backlog, td)
512 struct socket *so;
513 int backlog;
514 struct thread *td;
515{
516
517 return ((*so->so_proto->pr_usrreqs->pru_listen)(so, backlog, td));
518}
519
520int
521solisten_proto_check(so)
522 struct socket *so;
523{
524
525 SOCK_LOCK_ASSERT(so);
526
527 if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING |
528 SS_ISDISCONNECTING))
529 return (EINVAL);
530 return (0);
531}
532
533void
534solisten_proto(so, backlog)
535 struct socket *so;
536 int backlog;
537{
538
539 SOCK_LOCK_ASSERT(so);
540
541 if (backlog < 0 || backlog > somaxconn)
542 backlog = somaxconn;
543 so->so_qlimit = backlog;
544 so->so_options |= SO_ACCEPTCONN;
545}
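/*
 * Sketch of the callback pattern described in the solisten() comment, as a
 * protocol's pru_listen method might implement it; the protocol-level
 * locking is elided, the function name is hypothetical, and the block is
 * not compiled (#if 0).
 */
#if 0
static int
example_pru_listen(struct socket *so, int backlog, struct thread *td)
{
	int error;

	/* Acquire protocol locks here, then the socket lock. */
	SOCK_LOCK(so);
	error = solisten_proto_check(so);
	if (error == 0)
		solisten_proto(so, backlog);
	SOCK_UNLOCK(so);
	/* Release protocol locks here. */
	return (error);
}
#endif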
546
547/*
548 * Attempt to free a socket. This should really be sotryfree().
549 *
550 * sofree() will succeed if:
551 *
552 * - There are no outstanding file descriptor references or related consumers
553 * (so_count == 0).
554 *
555 * - The socket has been closed by user space, if ever open (SS_NOFDREF).
556 *
557 * - The protocol does not have an outstanding strong reference on the socket
558 * (SS_PROTOREF).
559 *
560 * - The socket is not in a completed connection queue, where a process may
561 * already have been notified of it. If it were removed, the user process
562 * could block in accept() despite select() saying the socket was ready.
563 *
564 * Otherwise, it will quietly abort so that a future call to sofree(), when
565 * conditions are right, can succeed.
566 */
567void
568sofree(so)
569 struct socket *so;
570{
571 struct protosw *pr = so->so_proto;
572 struct socket *head;
573
574 ACCEPT_LOCK_ASSERT();
575 SOCK_LOCK_ASSERT(so);
576
577 if ((so->so_state & SS_NOFDREF) == 0 || so->so_count != 0 ||
578 (so->so_state & SS_PROTOREF) || (so->so_qstate & SQ_COMP)) {
579 SOCK_UNLOCK(so);
580 ACCEPT_UNLOCK();
581 return;
582 }
583
584 head = so->so_head;
585 if (head != NULL) {
586 KASSERT((so->so_qstate & SQ_COMP) != 0 ||
587 (so->so_qstate & SQ_INCOMP) != 0,
588 ("sofree: so_head != NULL, but neither SQ_COMP nor "
589 "SQ_INCOMP"));
590 KASSERT((so->so_qstate & SQ_COMP) == 0 ||
591 (so->so_qstate & SQ_INCOMP) == 0,
592 ("sofree: so->so_qstate is SQ_COMP and also SQ_INCOMP"));
593 TAILQ_REMOVE(&head->so_incomp, so, so_list);
594 head->so_incqlen--;
595 so->so_qstate &= ~SQ_INCOMP;
596 so->so_head = NULL;
597 }
598 KASSERT((so->so_qstate & SQ_COMP) == 0 &&
599 (so->so_qstate & SQ_INCOMP) == 0,
600 ("sofree: so_head == NULL, but still SQ_COMP(%d) or SQ_INCOMP(%d)",
601 so->so_qstate & SQ_COMP, so->so_qstate & SQ_INCOMP));
602 if (so->so_options & SO_ACCEPTCONN) {
603 KASSERT((TAILQ_EMPTY(&so->so_comp)), ("sofree: so_comp populated"));
604	KASSERT((TAILQ_EMPTY(&so->so_incomp)), ("sofree: so_incomp populated"));
605 }
606 SOCK_UNLOCK(so);
607 ACCEPT_UNLOCK();
608
609 if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
610 (*pr->pr_domain->dom_dispose)(so->so_rcv.sb_mb);
611 if (pr->pr_usrreqs->pru_detach != NULL)
612 (*pr->pr_usrreqs->pru_detach)(so);
613
614 /*
615 * From this point on, we assume that no other references to this
616 * socket exist anywhere else in the stack. Therefore, no locks need
617 * to be acquired or held.
618 *
619 * We used to do a lot of socket buffer and socket locking here, as
620 * well as invoke sorflush() and perform wakeups. The direct call to
621 * dom_dispose() and sbrelease_internal() are an inlining of what was
622 * necessary from sorflush().
623 *
624 * Notice that the socket buffer and kqueue state are torn down
625 * before calling pru_detach. This means that protocols should not
626 * assume they can perform socket wakeups, etc, in their detach
627 * code.
628 */
629 KASSERT((so->so_snd.sb_flags & SB_LOCK) == 0, ("sofree: snd sblock"));
630 KASSERT((so->so_rcv.sb_flags & SB_LOCK) == 0, ("sofree: rcv sblock"));
631 sbdestroy(&so->so_snd, so);
632 sbdestroy(&so->so_rcv, so);
633 knlist_destroy(&so->so_rcv.sb_sel.si_note);
634 knlist_destroy(&so->so_snd.sb_sel.si_note);
635 sodealloc(so);
636}
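/*
 * Sketch of the soref()/sorele() pairing mentioned at the top of this file,
 * as used when moving a socket from a completed listen queue to a file
 * descriptor; the accept()-side details and error handling are omitted, the
 * function name is hypothetical, and the block is not compiled (#if 0).
 */
#if 0
static void
example_accept_handoff(struct socket *head)
{
	struct socket *so;

	ACCEPT_LOCK();
	so = TAILQ_FIRST(&head->so_comp);
	SOCK_LOCK(so);
	soref(so);		/* Hold the socket across the handoff. */
	TAILQ_REMOVE(&head->so_comp, so, so_list);
	head->so_qlen--;
	so->so_qstate &= ~SQ_COMP;
	so->so_head = NULL;
	SOCK_UNLOCK(so);
	ACCEPT_UNLOCK();
	/* ... install the socket in a file descriptor; if that fails ... */
	ACCEPT_LOCK();
	SOCK_LOCK(so);
	sorele(so);		/* Drops both locks; may call sofree(). */
}
#endif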
637
638/*
639 * Close a socket on last file table reference removal. Initiate disconnect
640 * if connected. Free socket when disconnect complete.
641 *
642 * This function will sorele() the socket. Note that soclose() may be called
643 * prior to the ref count reaching zero. The actual socket structure will
644 * not be freed until the ref count reaches zero.
645 */
646int
647soclose(so)
648 struct socket *so;
649{
650 int error = 0;
651
652 KASSERT(!(so->so_state & SS_NOFDREF), ("soclose: SS_NOFDREF on enter"));
653
654 funsetown(&so->so_sigio);
655 if (so->so_state & SS_ISCONNECTED) {
656 if ((so->so_state & SS_ISDISCONNECTING) == 0) {
657 error = sodisconnect(so);
658 if (error)
659 goto drop;
660 }
661 if (so->so_options & SO_LINGER) {
662 if ((so->so_state & SS_ISDISCONNECTING) &&
663 (so->so_state & SS_NBIO))
664 goto drop;
665 while (so->so_state & SS_ISCONNECTED) {
666 error = tsleep(&so->so_timeo,
667 PSOCK | PCATCH, "soclos", so->so_linger * hz);
668 if (error)
669 break;
670 }
671 }
672 }
673
674drop:
675 if (so->so_proto->pr_usrreqs->pru_close != NULL)
676 (*so->so_proto->pr_usrreqs->pru_close)(so);
677 if (so->so_options & SO_ACCEPTCONN) {
678 struct socket *sp;
679 ACCEPT_LOCK();
680 while ((sp = TAILQ_FIRST(&so->so_incomp)) != NULL) {
681 TAILQ_REMOVE(&so->so_incomp, sp, so_list);
682 so->so_incqlen--;
683 sp->so_qstate &= ~SQ_INCOMP;
684 sp->so_head = NULL;
685 ACCEPT_UNLOCK();
686 soabort(sp);
687 ACCEPT_LOCK();
688 }
689 while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
690 TAILQ_REMOVE(&so->so_comp, sp, so_list);
691 so->so_qlen--;
692 sp->so_qstate &= ~SQ_COMP;
693 sp->so_head = NULL;
694 ACCEPT_UNLOCK();
695 soabort(sp);
696 ACCEPT_LOCK();
697 }
698 ACCEPT_UNLOCK();
699 }
700 ACCEPT_LOCK();
701 SOCK_LOCK(so);
702 KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF"));
703 so->so_state |= SS_NOFDREF;
704 sorele(so);
705 return (error);
706}
707
708/*
709 * soabort() is used to abruptly tear down a connection, such as when a
710 * resource limit is reached (listen queue depth exceeded), or if a listen
711 * socket is closed while there are sockets waiting to be accepted.
712 *
713 * This interface is tricky, because it is called on an unreferenced socket,
714 * and must be called only by a thread that has actually removed the socket
715 * from the listen queue it was on, or races with other threads are risked.
716 *
717 * This interface will call into the protocol code, so must not be called
718 * with any socket locks held. Protocols do call it while holding their own
719 * recursible protocol mutexes, but this is something that should be subject
720 * to review in the future.
721 */
722void
723soabort(so)
724 struct socket *so;
725{
726
727 /*
728 * To the extent possible, assert that no references to this
729 * socket are held. This is not quite the same as asserting that the
730 * current thread is responsible for arranging for no references, but
731 * is as close as we can get for now.
732 */
733 KASSERT(so->so_count == 0, ("soabort: so_count"));
734 KASSERT((so->so_state & SS_PROTOREF) == 0, ("soabort: SS_PROTOREF"));
735 KASSERT(so->so_state & SS_NOFDREF, ("soabort: !SS_NOFDREF"));
736	KASSERT((so->so_qstate & SQ_COMP) == 0, ("soabort: SQ_COMP"));
737	KASSERT((so->so_qstate & SQ_INCOMP) == 0, ("soabort: SQ_INCOMP"));
738
739 if (so->so_proto->pr_usrreqs->pru_abort != NULL)
740 (*so->so_proto->pr_usrreqs->pru_abort)(so);
741 ACCEPT_LOCK();
742 SOCK_LOCK(so);
743 sofree(so);
744}
745
746int
747soaccept(so, nam)
748 struct socket *so;
749 struct sockaddr **nam;
750{
751 int error;
752
753 SOCK_LOCK(so);
754 KASSERT((so->so_state & SS_NOFDREF) != 0, ("soaccept: !NOFDREF"));
755 so->so_state &= ~SS_NOFDREF;
756 SOCK_UNLOCK(so);
757 error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
758 return (error);
759}
760
761int
762soconnect(so, nam, td)
763 struct socket *so;
764 struct sockaddr *nam;
765 struct thread *td;
766{
767 int error;
768
769 if (so->so_options & SO_ACCEPTCONN)
770 return (EOPNOTSUPP);
771 /*
772 * If protocol is connection-based, can only connect once.
773 * Otherwise, if connected, try to disconnect first. This allows
774 * user to disconnect by connecting to, e.g., a null address.
775 */
776 if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
777 ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
778 (error = sodisconnect(so)))) {
779 error = EISCONN;
780 } else {
781 /*
782 * Prevent accumulated error from previous connection from
783 * biting us.
784 */
785 so->so_error = 0;
786 error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, td);
787 }
788
789 return (error);
790}
791
792int
793soconnect2(so1, so2)
794 struct socket *so1;
795 struct socket *so2;
796{
797
798 return ((*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2));
799}
800
801int
802sodisconnect(so)
803 struct socket *so;
804{
805 int error;
806
807 if ((so->so_state & SS_ISCONNECTED) == 0)
808 return (ENOTCONN);
809 if (so->so_state & SS_ISDISCONNECTING)
810 return (EALREADY);
811 error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
812 return (error);
813}
814
815#ifdef ZERO_COPY_SOCKETS
816struct so_zerocopy_stats{
817 int size_ok;
818 int align_ok;
819 int found_ifp;
820};
821struct so_zerocopy_stats so_zerocp_stats = {0,0,0};
822#include <netinet/in.h>
823#include <net/route.h>
824#include <netinet/in_pcb.h>
825#include <vm/vm.h>
826#include <vm/vm_page.h>
827#include <vm/vm_object.h>
828
829/*
830 * sosend_copyin() is only used if zero copy sockets are enabled. Otherwise
831 * sosend_dgram() and sosend_generic() use m_uiotombuf().
832 *
833 * sosend_copyin() accepts a uio and prepares an mbuf chain holding part or
834 * all of the data referenced by the uio. If desired, it uses zero-copy.
835 * *space will be updated to reflect data copied in.
836 *
837 * NB: If atomic I/O is requested, the caller must already have checked that
838 * space can hold resid bytes.
839 *
840 * NB: In the event of an error, the caller may need to free the partial
841 * chain pointed to by *mpp. The contents of both *uio and *space may be
842 * modified even in the case of an error.
843 */
844static int
845sosend_copyin(struct uio *uio, struct mbuf **retmp, int atomic, long *space,
846 int flags)
847{
848 struct mbuf *m, **mp, *top;
849 long len, resid;
850 int error;
851#ifdef ZERO_COPY_SOCKETS
852 int cow_send;
853#endif
854
855 *retmp = top = NULL;
856 mp = &top;
857 len = 0;
858 resid = uio->uio_resid;
859 error = 0;
860 do {
861#ifdef ZERO_COPY_SOCKETS
862 cow_send = 0;
863#endif /* ZERO_COPY_SOCKETS */
864 if (resid >= MINCLSIZE) {
865#ifdef ZERO_COPY_SOCKETS
866 if (top == NULL) {
867 m = m_gethdr(M_WAITOK, MT_DATA);
868 m->m_pkthdr.len = 0;
869 m->m_pkthdr.rcvif = NULL;
870 } else
871 m = m_get(M_WAITOK, MT_DATA);
872 if (so_zero_copy_send &&
873 resid>=PAGE_SIZE &&
874 *space>=PAGE_SIZE &&
875 uio->uio_iov->iov_len>=PAGE_SIZE) {
876 so_zerocp_stats.size_ok++;
877 so_zerocp_stats.align_ok++;
878 cow_send = socow_setup(m, uio);
879 len = cow_send;
880 }
881 if (!cow_send) {
882 m_clget(m, M_WAITOK);
883 len = min(min(MCLBYTES, resid), *space);
884 }
885#else /* ZERO_COPY_SOCKETS */
886 if (top == NULL) {
887 m = m_getcl(M_TRYWAIT, MT_DATA, M_PKTHDR);
888 m->m_pkthdr.len = 0;
889 m->m_pkthdr.rcvif = NULL;
890 } else
891 m = m_getcl(M_TRYWAIT, MT_DATA, 0);
892 len = min(min(MCLBYTES, resid), *space);
893#endif /* ZERO_COPY_SOCKETS */
894 } else {
895 if (top == NULL) {
896 m = m_gethdr(M_TRYWAIT, MT_DATA);
897 m->m_pkthdr.len = 0;
898 m->m_pkthdr.rcvif = NULL;
899
900 len = min(min(MHLEN, resid), *space);
901 /*
902 * For datagram protocols, leave room
903 * for protocol headers in first mbuf.
904 */
905 if (atomic && m && len < MHLEN)
906 MH_ALIGN(m, len);
907 } else {
908 m = m_get(M_TRYWAIT, MT_DATA);
909 len = min(min(MLEN, resid), *space);
910 }
911 }
912 if (m == NULL) {
913 error = ENOBUFS;
914 goto out;
915 }
916
917 *space -= len;
918#ifdef ZERO_COPY_SOCKETS
919 if (cow_send)
920 error = 0;
921 else
922#endif /* ZERO_COPY_SOCKETS */
923 error = uiomove(mtod(m, void *), (int)len, uio);
924 resid = uio->uio_resid;
925 m->m_len = len;
926 *mp = m;
927 top->m_pkthdr.len += len;
928 if (error)
929 goto out;
930 mp = &m->m_next;
931 if (resid <= 0) {
932 if (flags & MSG_EOR)
933 top->m_flags |= M_EOR;
934 break;
935 }
936 } while (*space > 0 && atomic);
937out:
938 *retmp = top;
939 return (error);
940}
941#endif /*ZERO_COPY_SOCKETS*/
942
943#define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
944
945int
946sosend_dgram(so, addr, uio, top, control, flags, td)
947 struct socket *so;
948 struct sockaddr *addr;
949 struct uio *uio;
950 struct mbuf *top;
951 struct mbuf *control;
952 int flags;
953 struct thread *td;
954{
955 long space, resid;
956 int clen = 0, error, dontroute;
957#ifdef ZERO_COPY_SOCKETS
958 int atomic = sosendallatonce(so) || top;
959#endif
960
961	KASSERT(so->so_type == SOCK_DGRAM, ("sosend_dgram: !SOCK_DGRAM"));
962	KASSERT(so->so_proto->pr_flags & PR_ATOMIC,
963	    ("sosend_dgram: !PR_ATOMIC"));
964
965 if (uio != NULL)
966 resid = uio->uio_resid;
967 else
968 resid = top->m_pkthdr.len;
969 /*
970 * In theory resid should be unsigned. However, space must be
971 * signed, as it might be less than 0 if we over-committed, and we
972 * must use a signed comparison of space and resid. On the other
973 * hand, a negative resid causes us to loop sending 0-length
974 * segments to the protocol.
975 *
976 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
977 * type sockets since that's an error.
978 */
979 if (resid < 0) {
980 error = EINVAL;
981 goto out;
982 }
983
984 dontroute =
985 (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0;
986 if (td != NULL)
987 td->td_proc->p_stats->p_ru.ru_msgsnd++;
988 if (control != NULL)
989 clen = control->m_len;
990
991 SOCKBUF_LOCK(&so->so_snd);
992 if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
993 SOCKBUF_UNLOCK(&so->so_snd);
994 error = EPIPE;
995 goto out;
996 }
997 if (so->so_error) {
998 error = so->so_error;
999 so->so_error = 0;
1000 SOCKBUF_UNLOCK(&so->so_snd);
1001 goto out;
1002 }
1003 if ((so->so_state & SS_ISCONNECTED) == 0) {
1004 /*
1005 * `sendto' and `sendmsg' are allowed on a connection-based
1006 * socket if it supports implied connect. Return ENOTCONN if
1007 * not connected and no address is supplied.
1008 */
1009 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
1010 (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
1011 if ((so->so_state & SS_ISCONFIRMING) == 0 &&
1012 !(resid == 0 && clen != 0)) {
1013 SOCKBUF_UNLOCK(&so->so_snd);
1014 error = ENOTCONN;
1015 goto out;
1016 }
1017 } else if (addr == NULL) {
1018 if (so->so_proto->pr_flags & PR_CONNREQUIRED)
1019 error = ENOTCONN;
1020 else
1021 error = EDESTADDRREQ;
1022 SOCKBUF_UNLOCK(&so->so_snd);
1023 goto out;
1024 }
1025 }
1026
1027 /*
1028 * Do we need MSG_OOB support in SOCK_DGRAM? Signs here may be a
1029 * problem and need fixing.
1030 */
1031 space = sbspace(&so->so_snd);
1032 if (flags & MSG_OOB)
1033 space += 1024;
1034 space -= clen;
1035 SOCKBUF_UNLOCK(&so->so_snd);
1036 if (resid > space) {
1037 error = EMSGSIZE;
1038 goto out;
1039 }
1040 if (uio == NULL) {
1041 resid = 0;
1042 if (flags & MSG_EOR)
1043 top->m_flags |= M_EOR;
1044 } else {
1045#ifdef ZERO_COPY_SOCKETS
1046 error = sosend_copyin(uio, &top, atomic, &space, flags);
1047 if (error)
1048 goto out;
1049#else
1050 /*
1051 * Copy the data from userland into a mbuf chain.
1052 * If no data is to be copied in, a single empty mbuf
1053 * is returned.
1054 */
1055 top = m_uiotombuf(uio, M_WAITOK, space, max_hdr,
1056 (M_PKTHDR | ((flags & MSG_EOR) ? M_EOR : 0)));
1057 if (top == NULL) {
1058 error = EFAULT; /* only possible error */
1059 goto out;
1060 }
1061 space -= resid - uio->uio_resid;
1062#endif
1063 resid = uio->uio_resid;
1064 }
1065 KASSERT(resid == 0, ("sosend_dgram: resid != 0"));
1066 /*
1067 * XXXRW: Frobbing SO_DONTROUTE here is even worse without sblock
1068 * than with.
1069 */
1070 if (dontroute) {
1071 SOCK_LOCK(so);
1072 so->so_options |= SO_DONTROUTE;
1073 SOCK_UNLOCK(so);
1074 }
1075 /*
1076 * XXX all the SBS_CANTSENDMORE checks previously done could be out
1077 * of date. We could have received a reset packet in an interrupt or
1078 * maybe we slept while doing page faults in uiomove() etc. We could
1079 * probably recheck again inside the locking protection here, but
1080 * there are probably other places that this also happens. We must
1081 * rethink this.
1082 */
1083 error = (*so->so_proto->pr_usrreqs->pru_send)(so,
1084 (flags & MSG_OOB) ? PRUS_OOB :
1085 /*
1086 * If the user set MSG_EOF, the protocol understands this flag, and
1087 * there is nothing left to send, then use PRU_SEND_EOF instead of PRU_SEND.
1088 */
1089 ((flags & MSG_EOF) &&
1090 (so->so_proto->pr_flags & PR_IMPLOPCL) &&
1091 (resid <= 0)) ?
1092 PRUS_EOF :
1093 /* If there is more to send set PRUS_MORETOCOME */
1094 (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
1095 top, addr, control, td);
1096 if (dontroute) {
1097 SOCK_LOCK(so);
1098 so->so_options &= ~SO_DONTROUTE;
1099 SOCK_UNLOCK(so);
1100 }
1101 clen = 0;
1102 control = NULL;
1103 top = NULL;
1104out:
1105 if (top != NULL)
1106 m_freem(top);
1107 if (control != NULL)
1108 m_freem(control);
1109 return (error);
1110}
1111
1112/*
1113 * Send on a socket. If send must go all at once and message is larger than
1114 * send buffering, then hard error. Lock against other senders. If must go
1115 * all at once and not enough room now, then inform user that this would
1116 * block and do nothing. Otherwise, if nonblocking, send as much as
1117 * possible. The data to be sent is described by "uio" if nonzero, otherwise
1118 * by the mbuf chain "top" (which must be null if uio is not). Data provided
1119 * in mbuf chain must be small enough to send all at once.
1120 *
1121 * Returns nonzero on error, timeout or signal; callers must check for short
1122 * counts if EINTR/ERESTART are returned. Data and control buffers are freed
1123 * on return.
1124 */
1125#define snderr(errno) { error = (errno); goto release; }
1126int
1127sosend_generic(so, addr, uio, top, control, flags, td)
1128 struct socket *so;
1129 struct sockaddr *addr;
1130 struct uio *uio;
1131 struct mbuf *top;
1132 struct mbuf *control;
1133 int flags;
1134 struct thread *td;
1135{
1136 long space, resid;
1137 int clen = 0, error, dontroute;
1138 int atomic = sosendallatonce(so) || top;
1139
1140 if (uio != NULL)
1141 resid = uio->uio_resid;
1142 else
1143 resid = top->m_pkthdr.len;
1144 /*
1145 * In theory resid should be unsigned. However, space must be
1146 * signed, as it might be less than 0 if we over-committed, and we
1147 * must use a signed comparison of space and resid. On the other
1148 * hand, a negative resid causes us to loop sending 0-length
1149 * segments to the protocol.
1150 *
1151 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
1152 * type sockets since that's an error.
1153 */
1154 if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
1155 error = EINVAL;
1156 goto out;
1157 }
1158
1159 dontroute =
1160 (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
1161 (so->so_proto->pr_flags & PR_ATOMIC);
1162 if (td != NULL)
1163 td->td_proc->p_stats->p_ru.ru_msgsnd++;
1164 if (control != NULL)
1165 clen = control->m_len;
1166
1167 SOCKBUF_LOCK(&so->so_snd);
1168restart:
1169 SOCKBUF_LOCK_ASSERT(&so->so_snd);
1170 error = sblock(&so->so_snd, SBLOCKWAIT(flags));
1171 if (error)
1172 goto out_locked;
1173 do {
1174 SOCKBUF_LOCK_ASSERT(&so->so_snd);
1175 if (so->so_snd.sb_state & SBS_CANTSENDMORE)
1176 snderr(EPIPE);
1177 if (so->so_error) {
1178 error = so->so_error;
1179 so->so_error = 0;
1180 goto release;
1181 }
1182 if ((so->so_state & SS_ISCONNECTED) == 0) {
1183 /*
1184 * `sendto' and `sendmsg' are allowed on a connection-
1185 * based socket if it supports implied connect.
1186 * Return ENOTCONN if not connected and no address is
1187 * supplied.
1188 */
1189 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
1190 (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
1191 if ((so->so_state & SS_ISCONFIRMING) == 0 &&
1192 !(resid == 0 && clen != 0))
1193 snderr(ENOTCONN);
1194 } else if (addr == NULL)
1195 snderr(so->so_proto->pr_flags & PR_CONNREQUIRED ?
1196 ENOTCONN : EDESTADDRREQ);
1197 }
1198 space = sbspace(&so->so_snd);
1199 if (flags & MSG_OOB)
1200 space += 1024;
1201 if ((atomic && resid > so->so_snd.sb_hiwat) ||
1202 clen > so->so_snd.sb_hiwat)
1203 snderr(EMSGSIZE);
1204 if (space < resid + clen &&
1205 (atomic || space < so->so_snd.sb_lowat || space < clen)) {
1206 if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO))
1207 snderr(EWOULDBLOCK);
1208 sbunlock(&so->so_snd);
1209 error = sbwait(&so->so_snd);
1210 if (error)
1211 goto out_locked;
1212 goto restart;
1213 }
1214 SOCKBUF_UNLOCK(&so->so_snd);
1215 space -= clen;
1216 do {
1217 if (uio == NULL) {
1218 resid = 0;
1219 if (flags & MSG_EOR)
1220 top->m_flags |= M_EOR;
1221 } else {
1222#ifdef ZERO_COPY_SOCKETS
1223 error = sosend_copyin(uio, &top, atomic,
1224 &space, flags);
1225 if (error != 0) {
1226 SOCKBUF_LOCK(&so->so_snd);
1227 goto release;
1228 }
1229#else
1230 /*
1231 * Copy the data from userland into a mbuf
1232 * chain. If no data is to be copied in,
1233 * a single empty mbuf is returned.
1234 */
1235 top = m_uiotombuf(uio, M_WAITOK, space,
1236 (atomic ? max_hdr : 0),
1237 (atomic ? M_PKTHDR : 0) |
1238 ((flags & MSG_EOR) ? M_EOR : 0));
1239 if (top == NULL) {
1240 SOCKBUF_LOCK(&so->so_snd);
1241 error = EFAULT; /* only possible error */
1242 goto release;
1243 }
1244 space -= resid - uio->uio_resid;
1245#endif
1246 resid = uio->uio_resid;
1247 }
1248 if (dontroute) {
1249 SOCK_LOCK(so);
1250 so->so_options |= SO_DONTROUTE;
1251 SOCK_UNLOCK(so);
1252 }
1253 /*
1254 * XXX all the SBS_CANTSENDMORE checks previously
1255 * done could be out of date. We could have received
1256 * a reset packet in an interrupt or maybe we slept
1257 * while doing page faults in uiomove() etc. We
1258 * could probably recheck again inside the locking
1259 * protection here, but there are probably other
1260 * places that this also happens. We must rethink
1261 * this.
1262 */
1263 error = (*so->so_proto->pr_usrreqs->pru_send)(so,
1264 (flags & MSG_OOB) ? PRUS_OOB :
1265 /*
1266 * If the user set MSG_EOF, the protocol understands
1267 * this flag, and there is nothing left to send, then
1268 * use PRU_SEND_EOF instead of PRU_SEND.
1269 */
1270 ((flags & MSG_EOF) &&
1271 (so->so_proto->pr_flags & PR_IMPLOPCL) &&
1272 (resid <= 0)) ?
1273 PRUS_EOF :
1274 /* If there is more to send set PRUS_MORETOCOME. */
1275 (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
1276 top, addr, control, td);
1277 if (dontroute) {
1278 SOCK_LOCK(so);
1279 so->so_options &= ~SO_DONTROUTE;
1280 SOCK_UNLOCK(so);
1281 }
1282 clen = 0;
1283 control = NULL;
1284 top = NULL;
1285 if (error) {
1286 SOCKBUF_LOCK(&so->so_snd);
1287 goto release;
1288 }
1289 } while (resid && space > 0);
1290 SOCKBUF_LOCK(&so->so_snd);
1291 } while (resid);
1292
1293release:
1294 SOCKBUF_LOCK_ASSERT(&so->so_snd);
1295 sbunlock(&so->so_snd);
1296out_locked:
1297 SOCKBUF_LOCK_ASSERT(&so->so_snd);
1298 SOCKBUF_UNLOCK(&so->so_snd);
1299out:
1300 if (top != NULL)
1301 m_freem(top);
1302 if (control != NULL)
1303 m_freem(control);
1304 return (error);
1305}
1306#undef snderr
1307
1308int
1309sosend(so, addr, uio, top, control, flags, td)
1310 struct socket *so;
1311 struct sockaddr *addr;
1312 struct uio *uio;
1313 struct mbuf *top;
1314 struct mbuf *control;
1315 int flags;
1316 struct thread *td;
1317{
1318
1319 /* XXXRW: Temporary debugging. */
1320 KASSERT(so->so_proto->pr_usrreqs->pru_sosend != sosend,
1321 ("sosend: protocol calls sosend"));
1322
1323 return (so->so_proto->pr_usrreqs->pru_sosend(so, addr, uio, top,
1324 control, flags, td));
1325}
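/*
 * Sketch of an in-kernel sosend() caller, assuming a connected socket and a
 * kernel-space buffer; the function name and parameters are hypothetical,
 * and the block is not compiled (#if 0). Compare the uio handling with
 * sosend_generic() above.
 */
#if 0
static int
example_kernel_send(struct socket *so, void *buf, int len, struct thread *td)
{
	struct uio auio;
	struct iovec aiov;

	aiov.iov_base = buf;
	aiov.iov_len = len;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = 0;
	auio.uio_resid = len;
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_rw = UIO_WRITE;
	auio.uio_td = td;
	return (sosend(so, NULL, &auio, NULL, NULL, 0, td));
}
#endif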
1326
1327/*
1328 * The part of soreceive() that implements reading non-inline out-of-band
1329 * data from a socket. For more complete comments, see soreceive(), from
1330 * which this code originated.
1331 *
1332 * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is
1333 * unable to return an mbuf chain to the caller.
1334 */
1335static int
1336soreceive_rcvoob(so, uio, flags)
1337 struct socket *so;
1338 struct uio *uio;
1339 int flags;
1340{
1341 struct protosw *pr = so->so_proto;
1342 struct mbuf *m;
1343 int error;
1344
1345 KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0"));
1346
1347 m = m_get(M_TRYWAIT, MT_DATA);
1348 if (m == NULL)
1349 return (ENOBUFS);
1350 error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
1351 if (error)
1352 goto bad;
1353 do {
1354#ifdef ZERO_COPY_SOCKETS
1355 if (so_zero_copy_receive) {
1356 int disposable;
1357
1358 if ((m->m_flags & M_EXT)
1359 && (m->m_ext.ext_type == EXT_DISPOSABLE))
1360 disposable = 1;
1361 else
1362 disposable = 0;
1363
1364 error = uiomoveco(mtod(m, void *),
1365 min(uio->uio_resid, m->m_len),
1366 uio, disposable);
1367 } else
1368#endif /* ZERO_COPY_SOCKETS */
1369 error = uiomove(mtod(m, void *),
1370 (int) min(uio->uio_resid, m->m_len), uio);
1371 m = m_free(m);
1372 } while (uio->uio_resid && error == 0 && m);
1373bad:
1374 if (m != NULL)
1375 m_freem(m);
1376 return (error);
1377}
1378
1379/*
1380 * Following replacement or removal of the first mbuf on the first mbuf chain
1381 * of a socket buffer, push necessary state changes back into the socket
1382 * buffer so that other consumers see the values consistently. 'nextrecord'
1383 * is the callers locally stored value of the original value of
1384 * sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes.
1385 * NOTE: 'nextrecord' may be NULL.
1386 */
1387static __inline void
1388sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord)
1389{
1390
1391 SOCKBUF_LOCK_ASSERT(sb);
1392 /*
1393 * First, update for the new value of nextrecord. If necessary, make
1394 * it the first record.
1395 */
1396 if (sb->sb_mb != NULL)
1397 sb->sb_mb->m_nextpkt = nextrecord;
1398 else
1399 sb->sb_mb = nextrecord;
1400
1401 /*
1402 * Now update any dependent socket buffer fields to reflect the new
1403 * state. This is an expanded inline of SB_EMPTY_FIXUP(), with the
1404 * addition of a second clause that takes care of the case where
1405 * sb_mb has been updated, but remains the last record.
1406 */
1407 if (sb->sb_mb == NULL) {
1408 sb->sb_mbtail = NULL;
1409 sb->sb_lastrecord = NULL;
1410 } else if (sb->sb_mb->m_nextpkt == NULL)
1411 sb->sb_lastrecord = sb->sb_mb;
1412}
1413
1414
1415/*
1416 * Implement receive operations on a socket. We depend on the way that
1417 * records are added to the sockbuf by sbappend. In particular, each record
1418 * (mbufs linked through m_next) must begin with an address if the protocol
1419 * so specifies, followed by an optional mbuf or mbufs containing ancillary
1420 * data, and then zero or more mbufs of data. In order to allow parallelism
1421 * between network receive and copying to user space, as well as avoid
1422 * sleeping with a mutex held, we release the socket buffer mutex during the
1423 * user space copy. Although the sockbuf is locked, new data may still be
1424 * appended, and thus we must maintain consistency of the sockbuf during that
1425 * time.
1426 *
1427 * The caller may receive the data as a single mbuf chain by supplying an
1428 * mbuf **mp0 for use in returning the chain. The uio is then used only for
1429 * the count in uio_resid.
1430 */
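/*
 * Illustrative layout of one socket buffer record as described above (a
 * sketch, not structures from this file): mbufs within a record are linked
 * through m_next, and records are linked through m_nextpkt on the first
 * mbuf of each record.
 *
 *   sb_mb -> [MT_SONAME] -> [MT_CONTROL] ... -> [MT_DATA] -> [MT_DATA] ...
 *                |
 *                m_nextpkt
 *                |
 *                v
 *              next record (the last one is tracked by sb_lastrecord)
 */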
1431int
1432soreceive_generic(so, psa, uio, mp0, controlp, flagsp)
1433 struct socket *so;
1434 struct sockaddr **psa;
1435 struct uio *uio;
1436 struct mbuf **mp0;
1437 struct mbuf **controlp;
1438 int *flagsp;
1439{
1440 struct mbuf *m, **mp;
1441 int flags, len, error, offset;
1442 struct protosw *pr = so->so_proto;
1443 struct mbuf *nextrecord;
1444 int moff, type = 0;
1445 int orig_resid = uio->uio_resid;
1446
1447 mp = mp0;
1448 if (psa != NULL)
1449 *psa = NULL;
1450 if (controlp != NULL)
1451 *controlp = NULL;
1452 if (flagsp != NULL)
1453 flags = *flagsp &~ MSG_EOR;
1454 else
1455 flags = 0;
1456 if (flags & MSG_OOB)
1457 return (soreceive_rcvoob(so, uio, flags));
1458 if (mp != NULL)
1459 *mp = NULL;
1460 if ((pr->pr_flags & PR_WANTRCVD) && (so->so_state & SS_ISCONFIRMING)
1461 && uio->uio_resid)
1462 (*pr->pr_usrreqs->pru_rcvd)(so, 0);
1463
1464 SOCKBUF_LOCK(&so->so_rcv);
1465restart:
1466 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1467 error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
1468 if (error)
1469 goto out;
1470
1471 m = so->so_rcv.sb_mb;
1472 /*
1473 * If we have less data than requested, block awaiting more (subject
1474 * to any timeout) if:
1475 * 1. the current count is less than the low water mark, or
1476 * 2. MSG_WAITALL is set, and it is possible to do the entire
1477 * receive operation at once if we block (resid <= hiwat).
1478 * 3. MSG_DONTWAIT is not set
1479 * If MSG_WAITALL is set but resid is larger than the receive buffer,
1480 * we have to do the receive in sections, and thus risk returning a
1481 * short count if a timeout or signal occurs after we start.
1482 */
1483 if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
1484 so->so_rcv.sb_cc < uio->uio_resid) &&
1485 (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
1486 ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
1487 m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
1488 KASSERT(m != NULL || !so->so_rcv.sb_cc,
1489 ("receive: m == %p so->so_rcv.sb_cc == %u",
1490 m, so->so_rcv.sb_cc));
1491 if (so->so_error) {
1492 if (m != NULL)
1493 goto dontblock;
1494 error = so->so_error;
1495 if ((flags & MSG_PEEK) == 0)
1496 so->so_error = 0;
1497 goto release;
1498 }
1499 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1500 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
1501 if (m)
1502 goto dontblock;
1503 else
1504 goto release;
1505 }
1506 for (; m != NULL; m = m->m_next)
1507 if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
1508 m = so->so_rcv.sb_mb;
1509 goto dontblock;
1510 }
1511 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
1512 (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
1513 error = ENOTCONN;
1514 goto release;
1515 }
1516 if (uio->uio_resid == 0)
1517 goto release;
1518 if ((so->so_state & SS_NBIO) ||
1519 (flags & (MSG_DONTWAIT|MSG_NBIO))) {
1520 error = EWOULDBLOCK;
1521 goto release;
1522 }
1523 SBLASTRECORDCHK(&so->so_rcv);
1524 SBLASTMBUFCHK(&so->so_rcv);
1525 sbunlock(&so->so_rcv);
1526 error = sbwait(&so->so_rcv);
1527 if (error)
1528 goto out;
1529 goto restart;
1530 }
1531dontblock:
1532 /*
1533 * From this point onward, we maintain 'nextrecord' as a cache of the
1534 * pointer to the next record in the socket buffer. We must keep the
1535 * various socket buffer pointers and local stack versions of the
1536 * pointers in sync, pushing out modifications before dropping the
1537 * socket buffer mutex, and re-reading them when picking it up.
1538 *
1539 * Otherwise, we will race with the network stack appending new data
1540 * or records onto the socket buffer by using inconsistent/stale
1541 * versions of the field, possibly resulting in socket buffer
1542 * corruption.
1543 *
1544 * By holding the high-level sblock(), we prevent simultaneous
1545 * readers from pulling off the front of the socket buffer.
1546 */
1547 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1548 if (uio->uio_td)
1549 uio->uio_td->td_proc->p_stats->p_ru.ru_msgrcv++;
1550 KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb"));
1551 SBLASTRECORDCHK(&so->so_rcv);
1552 SBLASTMBUFCHK(&so->so_rcv);
1553 nextrecord = m->m_nextpkt;
1554 if (pr->pr_flags & PR_ADDR) {
1555 KASSERT(m->m_type == MT_SONAME,
1556 ("m->m_type == %d", m->m_type));
1557 orig_resid = 0;
1558 if (psa != NULL)
1559 *psa = sodupsockaddr(mtod(m, struct sockaddr *),
1560 M_NOWAIT);
1561 if (flags & MSG_PEEK) {
1562 m = m->m_next;
1563 } else {
1564 sbfree(&so->so_rcv, m);
1565 so->so_rcv.sb_mb = m_free(m);
1566 m = so->so_rcv.sb_mb;
1567 sockbuf_pushsync(&so->so_rcv, nextrecord);
1568 }
1569 }
1570
1571 /*
1572 * Process one or more MT_CONTROL mbufs present before any data mbufs
1573 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
1574 * just copy the data; if !MSG_PEEK, we call into the protocol to
1575 * perform externalization (or freeing if controlp == NULL).
1576 */
1577 if (m != NULL && m->m_type == MT_CONTROL) {
1578 struct mbuf *cm = NULL, *cmn;
1579 struct mbuf **cme = &cm;
1580
1581 do {
1582 if (flags & MSG_PEEK) {
1583 if (controlp != NULL) {
1584 *controlp = m_copy(m, 0, m->m_len);
1585 controlp = &(*controlp)->m_next;
1586 }
1587 m = m->m_next;
1588 } else {
1589 sbfree(&so->so_rcv, m);
1590 so->so_rcv.sb_mb = m->m_next;
1591 m->m_next = NULL;
1592 *cme = m;
1593 cme = &(*cme)->m_next;
1594 m = so->so_rcv.sb_mb;
1595 }
1596 } while (m != NULL && m->m_type == MT_CONTROL);
1597 if ((flags & MSG_PEEK) == 0)
1598 sockbuf_pushsync(&so->so_rcv, nextrecord);
1599 while (cm != NULL) {
1600 cmn = cm->m_next;
1601 cm->m_next = NULL;
1602 if (pr->pr_domain->dom_externalize != NULL) {
1603 SOCKBUF_UNLOCK(&so->so_rcv);
1604 error = (*pr->pr_domain->dom_externalize)
1605 (cm, controlp);
1606 SOCKBUF_LOCK(&so->so_rcv);
1607 } else if (controlp != NULL)
1608 *controlp = cm;
1609 else
1610 m_freem(cm);
1611 if (controlp != NULL) {
1612 orig_resid = 0;
1613 while (*controlp != NULL)
1614 controlp = &(*controlp)->m_next;
1615 }
1616 cm = cmn;
1617 }
1618 if (m != NULL)
1619 nextrecord = so->so_rcv.sb_mb->m_nextpkt;
1620 else
1621 nextrecord = so->so_rcv.sb_mb;
1622 orig_resid = 0;
1623 }
1624 if (m != NULL) {
1625 if ((flags & MSG_PEEK) == 0) {
1626 KASSERT(m->m_nextpkt == nextrecord,
1627 ("soreceive: post-control, nextrecord !sync"));
1628 if (nextrecord == NULL) {
1629 KASSERT(so->so_rcv.sb_mb == m,
1630 ("soreceive: post-control, sb_mb!=m"));
1631 KASSERT(so->so_rcv.sb_lastrecord == m,
1632 ("soreceive: post-control, lastrecord!=m"));
1633 }
1634 }
1635 type = m->m_type;
1636 if (type == MT_OOBDATA)
1637 flags |= MSG_OOB;
1638 } else {
1639 if ((flags & MSG_PEEK) == 0) {
1640 KASSERT(so->so_rcv.sb_mb == nextrecord,
1641 ("soreceive: sb_mb != nextrecord"));
1642 if (so->so_rcv.sb_mb == NULL) {
1643 KASSERT(so->so_rcv.sb_lastrecord == NULL,
1644 ("soreceive: sb_lastercord != NULL"));
1645 }
1646 }
1647 }
1648 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1649 SBLASTRECORDCHK(&so->so_rcv);
1650 SBLASTMBUFCHK(&so->so_rcv);
1651
1652 /*
1653 * Now continue to read any data mbufs off of the head of the socket
1654 * buffer until the read request is satisfied. Note that 'type' is
1655 * used to store the type of any mbuf reads that have happened so far
1656 * such that soreceive() can stop reading if the type changes, which
1657 * causes soreceive() to return only one of regular data and inline
1658 * out-of-band data in a single socket receive operation.
1659 */
1660 moff = 0;
1661 offset = 0;
1662 while (m != NULL && uio->uio_resid > 0 && error == 0) {
1663 /*
1664 * If the type of mbuf has changed since the last mbuf
1665 * examined ('type'), end the receive operation.
1666 */
1667 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1668 if (m->m_type == MT_OOBDATA) {
1669 if (type != MT_OOBDATA)
1670 break;
1671 } else if (type == MT_OOBDATA)
1672 break;
1673 else
1674 KASSERT(m->m_type == MT_DATA,
1675 ("m->m_type == %d", m->m_type));
1676 so->so_rcv.sb_state &= ~SBS_RCVATMARK;
1677 len = uio->uio_resid;
1678 if (so->so_oobmark && len > so->so_oobmark - offset)
1679 len = so->so_oobmark - offset;
1680 if (len > m->m_len - moff)
1681 len = m->m_len - moff;
1682 /*
1683 * If mp is set, just pass back the mbufs. Otherwise copy
1684 * them out via the uio, then free. The socket buffer must be
1685 * consistent here (sb_mb points to the current mbuf, and its
1686 * m_nextpkt to the next record) when we drop the sockbuf lock;
1687 * we must note any additions to the sockbuf when we reacquire it.
1688 */
1689 if (mp == NULL) {
1690 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1691 SBLASTRECORDCHK(&so->so_rcv);
1692 SBLASTMBUFCHK(&so->so_rcv);
1693 SOCKBUF_UNLOCK(&so->so_rcv);
1694#ifdef ZERO_COPY_SOCKETS
1695 if (so_zero_copy_receive) {
1696 int disposable;
1697
1698 if ((m->m_flags & M_EXT)
1699 && (m->m_ext.ext_type == EXT_DISPOSABLE))
1700 disposable = 1;
1701 else
1702 disposable = 0;
1703
1704 error = uiomoveco(mtod(m, char *) + moff,
1705 (int)len, uio,
1706 disposable);
1707 } else
1708#endif /* ZERO_COPY_SOCKETS */
1709 error = uiomove(mtod(m, char *) + moff, (int)len, uio);
1710 SOCKBUF_LOCK(&so->so_rcv);
1711 if (error) {
1712 /*
1713 * The MT_SONAME mbuf has already been removed
1714 * from the record, so it is necessary to
1715 * remove the data mbufs, if any, to preserve
1716 * the invariant in the case of PR_ADDR that
1717 * requires MT_SONAME mbufs at the head of
1718 * each record.
1719 */
1720 if (m && pr->pr_flags & PR_ATOMIC &&
1721 ((flags & MSG_PEEK) == 0))
1722 (void)sbdroprecord_locked(&so->so_rcv);
1723 goto release;
1724 }
1725 } else
1726 uio->uio_resid -= len;
1727 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1728 if (len == m->m_len - moff) {
1729 if (m->m_flags & M_EOR)
1730 flags |= MSG_EOR;
1731 if (flags & MSG_PEEK) {
1732 m = m->m_next;
1733 moff = 0;
1734 } else {
1735 nextrecord = m->m_nextpkt;
1736 sbfree(&so->so_rcv, m);
1737 if (mp != NULL) {
1738 *mp = m;
1739 mp = &m->m_next;
1740 so->so_rcv.sb_mb = m = m->m_next;
1741 *mp = NULL;
1742 } else {
1743 so->so_rcv.sb_mb = m_free(m);
1744 m = so->so_rcv.sb_mb;
1745 }
1746 sockbuf_pushsync(&so->so_rcv, nextrecord);
1747 SBLASTRECORDCHK(&so->so_rcv);
1748 SBLASTMBUFCHK(&so->so_rcv);
1749 }
1750 } else {
1751 if (flags & MSG_PEEK)
1752 moff += len;
1753 else {
1754 if (mp != NULL) {
1755 int copy_flag;
1756
1757 if (flags & MSG_DONTWAIT)
1758 copy_flag = M_DONTWAIT;
1759 else
1760 copy_flag = M_TRYWAIT;
1761 if (copy_flag == M_TRYWAIT)
1762 SOCKBUF_UNLOCK(&so->so_rcv);
1763 *mp = m_copym(m, 0, len, copy_flag);
1764 if (copy_flag == M_TRYWAIT)
1765 SOCKBUF_LOCK(&so->so_rcv);
1766 if (*mp == NULL) {
1767 /*
1768 * m_copym() couldn't
1769 * allocate an mbuf. Adjust
1770 * uio_resid back (it was
1771 * adjusted down by len
1772 * bytes, which we didn't end
1773 * up "copying" over).
1774 */
1775 uio->uio_resid += len;
1776 break;
1777 }
1778 }
1779 m->m_data += len;
1780 m->m_len -= len;
1781 so->so_rcv.sb_cc -= len;
1782 }
1783 }
1784 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1785 if (so->so_oobmark) {
1786 if ((flags & MSG_PEEK) == 0) {
1787 so->so_oobmark -= len;
1788 if (so->so_oobmark == 0) {
1789 so->so_rcv.sb_state |= SBS_RCVATMARK;
1790 break;
1791 }
1792 } else {
1793 offset += len;
1794 if (offset == so->so_oobmark)
1795 break;
1796 }
1797 }
1798 if (flags & MSG_EOR)
1799 break;
1800 /*
1801 * If the MSG_WAITALL flag is set (for non-atomic socket), we
1802 * must not quit until "uio->uio_resid == 0" or an error
1803 * termination. If a signal/timeout occurs, return with a
1804 * short count but without error. Keep sockbuf locked
1805 * against other readers.
1806 */
1807 while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
1808 !sosendallatonce(so) && nextrecord == NULL) {
1809 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1810 if (so->so_error || so->so_rcv.sb_state & SBS_CANTRCVMORE)
1811 break;
1812 /*
1813 * Notify the protocol that some data has been
1814 * drained before blocking.
1815 */
1816 if (pr->pr_flags & PR_WANTRCVD) {
1817 SOCKBUF_UNLOCK(&so->so_rcv);
1818 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
1819 SOCKBUF_LOCK(&so->so_rcv);
1820 }
1821 SBLASTRECORDCHK(&so->so_rcv);
1822 SBLASTMBUFCHK(&so->so_rcv);
1823 error = sbwait(&so->so_rcv);
1824 if (error)
1825 goto release;
1826 m = so->so_rcv.sb_mb;
1827 if (m != NULL)
1828 nextrecord = m->m_nextpkt;
1829 }
1830 }
1831
1832 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1833 if (m != NULL && pr->pr_flags & PR_ATOMIC) {
1834 flags |= MSG_TRUNC;
1835 if ((flags & MSG_PEEK) == 0)
1836 (void) sbdroprecord_locked(&so->so_rcv);
1837 }
1838 if ((flags & MSG_PEEK) == 0) {
1839 if (m == NULL) {
1840 /*
1841 * First part is an inline SB_EMPTY_FIXUP(). Second
1842 * part makes sure sb_lastrecord is up-to-date if
1843 * there is still data in the socket buffer.
1844 */
1845 so->so_rcv.sb_mb = nextrecord;
1846 if (so->so_rcv.sb_mb == NULL) {
1847 so->so_rcv.sb_mbtail = NULL;
1848 so->so_rcv.sb_lastrecord = NULL;
1849 } else if (nextrecord->m_nextpkt == NULL)
1850 so->so_rcv.sb_lastrecord = nextrecord;
1851 }
1852 SBLASTRECORDCHK(&so->so_rcv);
1853 SBLASTMBUFCHK(&so->so_rcv);
1854 /*
1855 * If soreceive() is being done from the socket callback,
1856 * then don't need to generate ACK to peer to update window,
1857 * since ACK will be generated on return to TCP.
1858 */
1859 if (!(flags & MSG_SOCALLBCK) &&
1860 (pr->pr_flags & PR_WANTRCVD)) {
1861 SOCKBUF_UNLOCK(&so->so_rcv);
1862 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
1863 SOCKBUF_LOCK(&so->so_rcv);
1864 }
1865 }
1866 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1867 if (orig_resid == uio->uio_resid && orig_resid &&
1868 (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) {
1869 sbunlock(&so->so_rcv);
1870 goto restart;
1871 }
1872
1873 if (flagsp != NULL)
1874 *flagsp |= flags;
1875release:
1876 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1877 sbunlock(&so->so_rcv);
1878out:
1879 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1880 SOCKBUF_UNLOCK(&so->so_rcv);
1881 return (error);
1882}
1883
1884int
1885soreceive(so, psa, uio, mp0, controlp, flagsp)
1886 struct socket *so;
1887 struct sockaddr **psa;
1888 struct uio *uio;
1889 struct mbuf **mp0;
1890 struct mbuf **controlp;
1891 int *flagsp;
1892{
1893
1894 /* XXXRW: Temporary debugging. */
1895 KASSERT(so->so_proto->pr_usrreqs->pru_soreceive != soreceive,
1896 ("soreceive: protocol calls soreceive"));
1897
1898 return (so->so_proto->pr_usrreqs->pru_soreceive(so, psa, uio, mp0,
1899 controlp, flagsp));
1900}
1901
1902int
1903soshutdown(so, how)
1904 struct socket *so;
1905 int how;
1906{
1907 struct protosw *pr = so->so_proto;
1908
1909 if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
1910 return (EINVAL);
1911
1912 if (how != SHUT_WR)
1913 sorflush(so);
1914 if (how != SHUT_RD)
1915 return ((*pr->pr_usrreqs->pru_shutdown)(so));
1916 return (0);
1917}
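/*
 * Illustrative sketch (hypothetical helper, added for exposition): how an
 * in-kernel consumer might half-close a connection with soshutdown()
 * above.  SHUT_WR reaches the protocol via pru_shutdown(); SHUT_RD only
 * flushes the receive side via sorflush().
 */
static int
example_halfclose(struct socket *so)
{

	/* Stop sending; the peer sees EOF, but we may continue receiving. */
	return (soshutdown(so, SHUT_WR));
}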
1918
1919void
1920sorflush(so)
1921 struct socket *so;
1922{
1923 struct sockbuf *sb = &so->so_rcv;
1924 struct protosw *pr = so->so_proto;
1925 struct sockbuf asb;
1926
1927 /*
1928 * XXXRW: This is quite ugly. Previously, this code made a copy of
1929 * the socket buffer, then zero'd the original to clear the buffer
1930 * fields. However, with mutexes in the socket buffer, this causes
1931 * problems. We only clear the zeroable bits of the original;
1932 * however, we have to initialize and destroy the mutex in the copy
1933 * so that dom_dispose() and sbrelease() can lock it as needed.
1934 */
1935 SOCKBUF_LOCK(sb);
1936 sb->sb_flags |= SB_NOINTR;
1937 (void) sblock(sb, M_WAITOK);
1938 /*
1939 * socantrcvmore_locked() drops the socket buffer mutex so that it
1940 * can safely perform wakeups. Re-acquire the mutex before
1941 * continuing.
1942 */
1943 socantrcvmore_locked(so);
1944 SOCKBUF_LOCK(sb);
1945 sbunlock(sb);
1946 /*
1947 * Invalidate/clear most of the sockbuf structure, but leave selinfo
1948 * and mutex data unchanged.
1949 */
1950 bzero(&asb, offsetof(struct sockbuf, sb_startzero));
1951 bcopy(&sb->sb_startzero, &asb.sb_startzero,
1952 sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
1953 bzero(&sb->sb_startzero,
1954 sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
1955 SOCKBUF_UNLOCK(sb);
1956
1957 SOCKBUF_LOCK_INIT(&asb, "so_rcv");
1958 if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
1959 (*pr->pr_domain->dom_dispose)(asb.sb_mb);
1960 sbrelease(&asb, so);
1961 SOCKBUF_LOCK_DESTROY(&asb);
1962}
1963
1964/*
1965 * Perhaps this routine, and sooptcopyout(), below, ought to come in an
1966 * additional variant to handle the case where the option value needs to be
1967 * some kind of integer, but not a specific size. In addition to their use
1968 * here, these functions are also called by the protocol-level pr_ctloutput()
1969 * routines.
1970 */
1971int
1972sooptcopyin(sopt, buf, len, minlen)
1973 struct sockopt *sopt;
1974 void *buf;
1975 size_t len;
1976 size_t minlen;
1977{
1978 size_t valsize;
1979
1980 /*
1981 * If the user gives us more than we wanted, we ignore it, but if we
1982 * don't get the minimum length the caller wants, we return EINVAL.
1983 * On success, sopt->sopt_valsize is set to however much we actually
1984 * retrieved.
1985 */
1986 if ((valsize = sopt->sopt_valsize) < minlen)
1987 return EINVAL;
1988 if (valsize > len)
1989 sopt->sopt_valsize = valsize = len;
1990
1991 if (sopt->sopt_td != NULL)
1992 return (copyin(sopt->sopt_val, buf, valsize));
1993
1994 bcopy(sopt->sopt_val, buf, valsize);
1995 return (0);
1996}
1997
1998/*
1999 * Kernel version of setsockopt(2).
2000 *
2001 * XXX: optlen is size_t, not socklen_t
2002 */
2003int
2004so_setsockopt(struct socket *so, int level, int optname, void *optval,
2005 size_t optlen)
2006{
2007 struct sockopt sopt;
2008
2009 sopt.sopt_level = level;
2010 sopt.sopt_name = optname;
2011 sopt.sopt_dir = SOPT_SET;
2012 sopt.sopt_val = optval;
2013 sopt.sopt_valsize = optlen;
2014 sopt.sopt_td = NULL;
2015 return (sosetopt(so, &sopt));
2016}
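/*
 * Illustrative sketch (hypothetical helper, added for exposition): setting
 * a boolean socket-level option from kernel code with so_setsockopt()
 * above, avoiding a hand-built struct sockopt.  "so" is assumed to come
 * from a prior socreate().
 */
static int
example_set_reuseaddr(struct socket *so)
{
	int on = 1;

	return (so_setsockopt(so, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)));
}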
2017
2018int
2019sosetopt(so, sopt)
2020 struct socket *so;
2021 struct sockopt *sopt;
2022{
2023 int error, optval;
2024 struct linger l;
2025 struct timeval tv;
2026 u_long val;
2027#ifdef MAC
2028 struct mac extmac;
2029#endif
2030
2031 error = 0;
2032 if (sopt->sopt_level != SOL_SOCKET) {
2033 if (so->so_proto && so->so_proto->pr_ctloutput)
2034 return ((*so->so_proto->pr_ctloutput)
2035 (so, sopt));
2036 error = ENOPROTOOPT;
2037 } else {
2038 switch (sopt->sopt_name) {
2039#ifdef INET
2040 case SO_ACCEPTFILTER:
2041 error = do_setopt_accept_filter(so, sopt);
2042 if (error)
2043 goto bad;
2044 break;
2045#endif
2046 case SO_LINGER:
2047 error = sooptcopyin(sopt, &l, sizeof l, sizeof l);
2048 if (error)
2049 goto bad;
2050
2051 SOCK_LOCK(so);
2052 so->so_linger = l.l_linger;
2053 if (l.l_onoff)
2054 so->so_options |= SO_LINGER;
2055 else
2056 so->so_options &= ~SO_LINGER;
2057 SOCK_UNLOCK(so);
2058 break;
2059
2060 case SO_DEBUG:
2061 case SO_KEEPALIVE:
2062 case SO_DONTROUTE:
2063 case SO_USELOOPBACK:
2064 case SO_BROADCAST:
2065 case SO_REUSEADDR:
2066 case SO_REUSEPORT:
2067 case SO_OOBINLINE:
2068 case SO_TIMESTAMP:
2069 case SO_BINTIME:
2070 case SO_NOSIGPIPE:
2071 error = sooptcopyin(sopt, &optval, sizeof optval,
2072 sizeof optval);
2073 if (error)
2074 goto bad;
2075 SOCK_LOCK(so);
2076 if (optval)
2077 so->so_options |= sopt->sopt_name;
2078 else
2079 so->so_options &= ~sopt->sopt_name;
2080 SOCK_UNLOCK(so);
2081 break;
2082
2083 case SO_SNDBUF:
2084 case SO_RCVBUF:
2085 case SO_SNDLOWAT:
2086 case SO_RCVLOWAT:
2087 error = sooptcopyin(sopt, &optval, sizeof optval,
2088 sizeof optval);
2089 if (error)
2090 goto bad;
2091
2092 /*
2093 * Values < 1 make no sense for any of these options,
2094 * so disallow them.
2095 */
2096 if (optval < 1) {
2097 error = EINVAL;
2098 goto bad;
2099 }
2100
2101 switch (sopt->sopt_name) {
2102 case SO_SNDBUF:
2103 case SO_RCVBUF:
2104 if (sbreserve(sopt->sopt_name == SO_SNDBUF ?
2105 &so->so_snd : &so->so_rcv, (u_long)optval,
2106 so, curthread) == 0) {
2107 error = ENOBUFS;
2108 goto bad;
2109 }
2110 (sopt->sopt_name == SO_SNDBUF ? &so->so_snd :
2111 &so->so_rcv)->sb_flags &= ~SB_AUTOSIZE;
2112 break;
2113
2114 /*
2115 * Make sure the low-water is never greater than the
2116 * high-water.
2117 */
2118 case SO_SNDLOWAT:
2119 SOCKBUF_LOCK(&so->so_snd);
2120 so->so_snd.sb_lowat =
2121 (optval > so->so_snd.sb_hiwat) ?
2122 so->so_snd.sb_hiwat : optval;
2123 SOCKBUF_UNLOCK(&so->so_snd);
2124 break;
2125 case SO_RCVLOWAT:
2126 SOCKBUF_LOCK(&so->so_rcv);
2127 so->so_rcv.sb_lowat =
2128 (optval > so->so_rcv.sb_hiwat) ?
2129 so->so_rcv.sb_hiwat : optval;
2130 SOCKBUF_UNLOCK(&so->so_rcv);
2131 break;
2132 }
2133 break;
2134
2135 case SO_SNDTIMEO:
2136 case SO_RCVTIMEO:
2137#ifdef COMPAT_IA32
2138 if (curthread->td_proc->p_sysent == &ia32_freebsd_sysvec) {
2139 struct timeval32 tv32;
2140
2141 error = sooptcopyin(sopt, &tv32, sizeof tv32,
2142 sizeof tv32);
2143 CP(tv32, tv, tv_sec);
2144 CP(tv32, tv, tv_usec);
2145 } else
2146#endif
2147 error = sooptcopyin(sopt, &tv, sizeof tv,
2148 sizeof tv);
2149 if (error)
2150 goto bad;
2151
2152 /* assert(hz > 0); */
2153 if (tv.tv_sec < 0 || tv.tv_sec > INT_MAX / hz ||
2154 tv.tv_usec < 0 || tv.tv_usec >= 1000000) {
2155 error = EDOM;
2156 goto bad;
2157 }
2158 /* assert(tick > 0); */
2159 /* assert(ULONG_MAX - INT_MAX >= 1000000); */
2160 val = (u_long)(tv.tv_sec * hz) + tv.tv_usec / tick;
2161 if (val > INT_MAX) {
2162 error = EDOM;
2163 goto bad;
2164 }
2165 if (val == 0 && tv.tv_usec != 0)
2166 val = 1;
2167
2168 switch (sopt->sopt_name) {
2169 case SO_SNDTIMEO:
2170 so->so_snd.sb_timeo = val;
2171 break;
2172 case SO_RCVTIMEO:
2173 so->so_rcv.sb_timeo = val;
2174 break;
2175 }
2176 break;
2177
2178 case SO_LABEL:
2179#ifdef MAC
2180 error = sooptcopyin(sopt, &extmac, sizeof extmac,
2181 sizeof extmac);
2182 if (error)
2183 goto bad;
2184 error = mac_setsockopt_label(sopt->sopt_td->td_ucred,
2185 so, &extmac);
2186#else
2187 error = EOPNOTSUPP;
2188#endif
2189 break;
2190
2191 default:
2192 error = ENOPROTOOPT;
2193 break;
2194 }
2195 if (error == 0 && so->so_proto != NULL &&
2196 so->so_proto->pr_ctloutput != NULL) {
2197 (void) ((*so->so_proto->pr_ctloutput)
2198 (so, sopt));
2199 }
2200 }
2201bad:
2202 return (error);
2203}
2204
2205/*
2206 * Helper routine for getsockopt.
2207 */
2208int
2209sooptcopyout(struct sockopt *sopt, const void *buf, size_t len)
2210{
2211 int error;
2212 size_t valsize;
2213
2214 error = 0;
2215
2216 /*
2217 * Documented get behavior is that we always return a value, possibly
2218 * truncated to fit in the user's buffer. Traditional behavior is
2219 * that we always tell the user precisely how much we copied, rather
2220 * than something useful like the total amount we had available for
2221 * her. Note that this interface is not idempotent; the entire
2222 * answer must be generated ahead of time.
2223 */
2224 valsize = min(len, sopt->sopt_valsize);
2225 sopt->sopt_valsize = valsize;
2226 if (sopt->sopt_val != NULL) {
2227 if (sopt->sopt_td != NULL)
2228 error = copyout(buf, sopt->sopt_val, valsize);
2229 else
2230 bcopy(buf, sopt->sopt_val, valsize);
2231 }
2232 return (error);
2233}
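/*
 * Illustrative sketch (hypothetical protocol code, added for exposition):
 * the usual shape of a pr_ctloutput() routine built on sooptcopyin() and
 * sooptcopyout() above.  EXAMPLE_OPT and the omitted per-protocol state
 * are placeholders; real protocols also apply their own pcb locking here.
 */
#define	EXAMPLE_OPT	1		/* hypothetical option name */

static int
example_ctloutput(struct socket *so, struct sockopt *sopt)
{
	int error, optval;

	error = 0;
	switch (sopt->sopt_dir) {
	case SOPT_SET:
		switch (sopt->sopt_name) {
		case EXAMPLE_OPT:
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
			if (error == 0 && optval < 0)
				error = EINVAL;
			/* ... otherwise store optval in protocol state ... */
			break;
		default:
			error = ENOPROTOOPT;
			break;
		}
		break;
	case SOPT_GET:
		switch (sopt->sopt_name) {
		case EXAMPLE_OPT:
			optval = 0;	/* ... fetch from protocol state ... */
			error = sooptcopyout(sopt, &optval, sizeof(optval));
			break;
		default:
			error = ENOPROTOOPT;
			break;
		}
		break;
	default:
		error = EINVAL;
		break;
	}
	return (error);
}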
2234
2235int
2236sogetopt(so, sopt)
2237 struct socket *so;
2238 struct sockopt *sopt;
2239{
2240 int error, optval;
2241 struct linger l;
2242 struct timeval tv;
2243#ifdef MAC
2244 struct mac extmac;
2245#endif
2246
2247 error = 0;
2248 if (sopt->sopt_level != SOL_SOCKET) {
2249 if (so->so_proto && so->so_proto->pr_ctloutput) {
2250 return ((*so->so_proto->pr_ctloutput)
2251 (so, sopt));
2252 } else
2253 return (ENOPROTOOPT);
2254 } else {
2255 switch (sopt->sopt_name) {
2256#ifdef INET
2257 case SO_ACCEPTFILTER:
2258 error = do_getopt_accept_filter(so, sopt);
2259 break;
2260#endif
2261 case SO_LINGER:
2262 SOCK_LOCK(so);
2263 l.l_onoff = so->so_options & SO_LINGER;
2264 l.l_linger = so->so_linger;
2265 SOCK_UNLOCK(so);
2266 error = sooptcopyout(sopt, &l, sizeof l);
2267 break;
2268
2269 case SO_USELOOPBACK:
2270 case SO_DONTROUTE:
2271 case SO_DEBUG:
2272 case SO_KEEPALIVE:
2273 case SO_REUSEADDR:
2274 case SO_REUSEPORT:
2275 case SO_BROADCAST:
2276 case SO_OOBINLINE:
2277 case SO_ACCEPTCONN:
2278 case SO_TIMESTAMP:
2279 case SO_BINTIME:
2280 case SO_NOSIGPIPE:
2281 optval = so->so_options & sopt->sopt_name;
2282integer:
2283 error = sooptcopyout(sopt, &optval, sizeof optval);
2284 break;
2285
2286 case SO_TYPE:
2287 optval = so->so_type;
2288 goto integer;
2289
2290 case SO_ERROR:
2291 SOCK_LOCK(so);
2292 optval = so->so_error;
2293 so->so_error = 0;
2294 SOCK_UNLOCK(so);
2295 goto integer;
2296
2297 case SO_SNDBUF:
2298 optval = so->so_snd.sb_hiwat;
2299 goto integer;
2300
2301 case SO_RCVBUF:
2302 optval = so->so_rcv.sb_hiwat;
2303 goto integer;
2304
2305 case SO_SNDLOWAT:
2306 optval = so->so_snd.sb_lowat;
2307 goto integer;
2308
2309 case SO_RCVLOWAT:
2310 optval = so->so_rcv.sb_lowat;
2311 goto integer;
2312
2313 case SO_SNDTIMEO:
2314 case SO_RCVTIMEO:
2315 optval = (sopt->sopt_name == SO_SNDTIMEO ?
2316 so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
2317
2318 tv.tv_sec = optval / hz;
2319 tv.tv_usec = (optval % hz) * tick;
2320#ifdef COMPAT_IA32
2321 if (curthread->td_proc->p_sysent == &ia32_freebsd_sysvec) {
2322 struct timeval32 tv32;
2323
2324 CP(tv, tv32, tv_sec);
2325 CP(tv, tv32, tv_usec);
2326 error = sooptcopyout(sopt, &tv32, sizeof tv32);
2327 } else
2328#endif
2329 error = sooptcopyout(sopt, &tv, sizeof tv);
2330 break;
2331
2332 case SO_LABEL:
2333#ifdef MAC
2334 error = sooptcopyin(sopt, &extmac, sizeof(extmac),
2335 sizeof(extmac));
2336 if (error)
2337 return (error);
2338 error = mac_getsockopt_label(sopt->sopt_td->td_ucred,
2339 so, &extmac);
2340 if (error)
2341 return (error);
2342 error = sooptcopyout(sopt, &extmac, sizeof extmac);
2343#else
2344 error = EOPNOTSUPP;
2345#endif
2346 break;
2347
2348 case SO_PEERLABEL:
2349#ifdef MAC
2350 error = sooptcopyin(sopt, &extmac, sizeof(extmac),
2351 sizeof(extmac));
2352 if (error)
2353 return (error);
2354 error = mac_getsockopt_peerlabel(
2355 sopt->sopt_td->td_ucred, so, &extmac);
2356 if (error)
2357 return (error);
2358 error = sooptcopyout(sopt, &extmac, sizeof extmac);
2359#else
2360 error = EOPNOTSUPP;
2361#endif
2362 break;
2363
2364 case SO_LISTENQLIMIT:
2365 optval = so->so_qlimit;
2366 goto integer;
2367
2368 case SO_LISTENQLEN:
2369 optval = so->so_qlen;
2370 goto integer;
2371
2372 case SO_LISTENINCQLEN:
2373 optval = so->so_incqlen;
2374 goto integer;
2375
2376 default:
2377 error = ENOPROTOOPT;
2378 break;
2379 }
2380 return (error);
2381 }
2382}
2383
2384/* XXX; prepare mbuf for (__FreeBSD__ < 3) routines. */
2385int
2386soopt_getm(struct sockopt *sopt, struct mbuf **mp)
2387{
2388 struct mbuf *m, *m_prev;
2389 int sopt_size = sopt->sopt_valsize;
2390
2391 MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA);
2392 if (m == NULL)
2393 return ENOBUFS;
2394 if (sopt_size > MLEN) {
2395 MCLGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT);
2396 if ((m->m_flags & M_EXT) == 0) {
2397 m_free(m);
2398 return ENOBUFS;
2399 }
2400 m->m_len = min(MCLBYTES, sopt_size);
2401 } else {
2402 m->m_len = min(MLEN, sopt_size);
2403 }
2404 sopt_size -= m->m_len;
2405 *mp = m;
2406 m_prev = m;
2407
2408 while (sopt_size) {
2409 MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA);
2410 if (m == NULL) {
2411 m_freem(*mp);
2412 return ENOBUFS;
2413 }
2414 if (sopt_size > MLEN) {
2415 MCLGET(m, sopt->sopt_td != NULL ? M_TRYWAIT :
2416 M_DONTWAIT);
2417 if ((m->m_flags & M_EXT) == 0) {
2418 m_freem(m);
2419 m_freem(*mp);
2420 return ENOBUFS;
2421 }
2422 m->m_len = min(MCLBYTES, sopt_size);
2423 } else {
2424 m->m_len = min(MLEN, sopt_size);
2425 }
2426 sopt_size -= m->m_len;
2427 m_prev->m_next = m;
2428 m_prev = m;
2429 }
2430 return (0);
2431}
2432
2433/* XXX; copyin sopt data into mbuf chain for (__FreeBSD__ < 3) routines. */
2434int
2435soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
2436{
2437 struct mbuf *m0 = m;
2438
2439 if (sopt->sopt_val == NULL)
2440 return (0);
2441 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
2442 if (sopt->sopt_td != NULL) {
2443 int error;
2444
2445 error = copyin(sopt->sopt_val, mtod(m, char *),
2446 m->m_len);
2447 if (error != 0) {
2448 m_freem(m0);
2449 return(error);
2450 }
2451 } else
2452 bcopy(sopt->sopt_val, mtod(m, char *), m->m_len);
2453 sopt->sopt_valsize -= m->m_len;
2454 sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
2455 m = m->m_next;
2456 }
2457 if (m != NULL) /* should have been allocated large enough by soopt_getm() */
2458 panic("ip6_sooptmcopyin");
2459 return (0);
2460}
2461
2462/* XXX; copyout mbuf chain data into soopt for (__FreeBSD__ < 3) routines. */
2463int
2464soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
2465{
2466 struct mbuf *m0 = m;
2467 size_t valsize = 0;
2468
2469 if (sopt->sopt_val == NULL)
2470 return (0);
2471 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
2472 if (sopt->sopt_td != NULL) {
2473 int error;
2474
2475 error = copyout(mtod(m, char *), sopt->sopt_val,
2476 m->m_len);
2477 if (error != 0) {
2478 m_freem(m0);
2479 return(error);
2480 }
2481 } else
2482 bcopy(mtod(m, char *), sopt->sopt_val, m->m_len);
2483 sopt->sopt_valsize -= m->m_len;
2484 sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
2485 valsize += m->m_len;
2486 m = m->m_next;
2487 }
2488 if (m != NULL) {
2489 /* enough soopt buffer should be given from user-land */
2490 m_freem(m0);
2491 return(EINVAL);
2492 }
2493 sopt->sopt_valsize = valsize;
2494 return (0);
2495}
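/*
 * Illustrative sketch (hypothetical helper, added for exposition): the
 * SOPT_SET path a protocol might take with the mbuf-based helpers above.
 * soopt_getm() sizes an mbuf chain to sopt_valsize, soopt_mcopyin() fills
 * it (and frees the chain itself on failure); soopt_mcopyout() is the
 * matching SOPT_GET direction for a chain holding the reply.
 */
static int
example_sockopt_set_via_mbufs(struct sockopt *sopt)
{
	struct mbuf *m;
	int error;

	error = soopt_getm(sopt, &m);
	if (error != 0)
		return (error);
	error = soopt_mcopyin(sopt, m);
	if (error != 0)
		return (error);		/* chain already freed */
	/* ... hand the chain to the protocol's option parser ... */
	m_freem(m);
	return (0);
}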
2496
2497/*
2498 * sohasoutofband(): protocol notifies socket layer of the arrival of new
2499 * out-of-band data, which will then notify socket consumers.
2500 */
2501void
2502sohasoutofband(so)
2503 struct socket *so;
2504{
2505 if (so->so_sigio != NULL)
2506 pgsigio(&so->so_sigio, SIGURG, 0);
2507 selwakeuppri(&so->so_rcv.sb_sel, PSOCK);
2508}
2509
2510int
2511sopoll(struct socket *so, int events, struct ucred *active_cred,
2512 struct thread *td)
2513{
2514
2515 /* XXXRW: Temporary debugging. */
2516 KASSERT(so->so_proto->pr_usrreqs->pru_sopoll != sopoll,
2517 ("sopoll: protocol calls sopoll"));
2518
2519 return (so->so_proto->pr_usrreqs->pru_sopoll(so, events, active_cred,
2520 td));
2521}
2522
2523int
2524sopoll_generic(struct socket *so, int events, struct ucred *active_cred,
2525 struct thread *td)
2526{
2527 int revents = 0;
2528
2529 SOCKBUF_LOCK(&so->so_snd);
2530 SOCKBUF_LOCK(&so->so_rcv);
2531 if (events & (POLLIN | POLLRDNORM))
2532 if (soreadable(so))
2533 revents |= events & (POLLIN | POLLRDNORM);
2534
2535 if (events & POLLINIGNEOF)
2536 if (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat ||
2537 !TAILQ_EMPTY(&so->so_comp) || so->so_error)
2538 revents |= POLLINIGNEOF;
2539
2540 if (events & (POLLOUT | POLLWRNORM))
2541 if (sowriteable(so))
2542 revents |= events & (POLLOUT | POLLWRNORM);
2543
2544 if (events & (POLLPRI | POLLRDBAND))
2545 if (so->so_oobmark || (so->so_rcv.sb_state & SBS_RCVATMARK))
2546 revents |= events & (POLLPRI | POLLRDBAND);
2547
2548 if (revents == 0) {
2549 if (events &
2550 (POLLIN | POLLINIGNEOF | POLLPRI | POLLRDNORM |
2551 POLLRDBAND)) {
2552 selrecord(td, &so->so_rcv.sb_sel);
2553 so->so_rcv.sb_flags |= SB_SEL;
2554 }
2555
2556 if (events & (POLLOUT | POLLWRNORM)) {
2557 selrecord(td, &so->so_snd.sb_sel);
2558 so->so_snd.sb_flags |= SB_SEL;
2559 }
2560 }
2561
2562 SOCKBUF_UNLOCK(&so->so_rcv);
2563 SOCKBUF_UNLOCK(&so->so_snd);
2564 return (revents);
2565}
2566
2567int
2568soo_kqfilter(struct file *fp, struct knote *kn)
2569{
2570 struct socket *so = kn->kn_fp->f_data;
2571 struct sockbuf *sb;
2572
2573 switch (kn->kn_filter) {
2574 case EVFILT_READ:
2575 if (so->so_options & SO_ACCEPTCONN)
2576 kn->kn_fop = &solisten_filtops;
2577 else
2578 kn->kn_fop = &soread_filtops;
2579 sb = &so->so_rcv;
2580 break;
2581 case EVFILT_WRITE:
2582 kn->kn_fop = &sowrite_filtops;
2583 sb = &so->so_snd;
2584 break;
2585 default:
2586 return (EINVAL);
2587 }
2588
2589 SOCKBUF_LOCK(sb);
2590 knlist_add(&sb->sb_sel.si_note, kn, 1);
2591 sb->sb_flags |= SB_KNOTE;
2592 SOCKBUF_UNLOCK(sb);
2593 return (0);
2594}
2595
127#include <sys/sysctl.h>
128#include <sys/uio.h>
129#include <sys/jail.h>
130
131#include <security/mac/mac_framework.h>
132
133#include <vm/uma.h>
134
135#ifdef COMPAT_IA32
136#include <sys/mount.h>
137#include <compat/freebsd32/freebsd32.h>
138
139extern struct sysentvec ia32_freebsd_sysvec;
140#endif
141
142static int soreceive_rcvoob(struct socket *so, struct uio *uio,
143 int flags);
144
145static void filt_sordetach(struct knote *kn);
146static int filt_soread(struct knote *kn, long hint);
147static void filt_sowdetach(struct knote *kn);
148static int filt_sowrite(struct knote *kn, long hint);
149static int filt_solisten(struct knote *kn, long hint);
150
151static struct filterops solisten_filtops =
152 { 1, NULL, filt_sordetach, filt_solisten };
153static struct filterops soread_filtops =
154 { 1, NULL, filt_sordetach, filt_soread };
155static struct filterops sowrite_filtops =
156 { 1, NULL, filt_sowdetach, filt_sowrite };
157
158uma_zone_t socket_zone;
159so_gen_t so_gencnt; /* generation count for sockets */
160
161int maxsockets;
162
163MALLOC_DEFINE(M_SONAME, "soname", "socket name");
164MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
165
166static int somaxconn = SOMAXCONN;
167static int sysctl_somaxconn(SYSCTL_HANDLER_ARGS);
168/* XXX: we don't have SYSCTL_USHORT */
169SYSCTL_PROC(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLTYPE_UINT | CTLFLAG_RW,
170 0, sizeof(int), sysctl_somaxconn, "I", "Maximum pending socket connection "
171 "queue size");
172static int numopensockets;
173SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD,
174 &numopensockets, 0, "Number of open sockets");
175#ifdef ZERO_COPY_SOCKETS
176/* These aren't static because they're used in other files. */
177int so_zero_copy_send = 1;
178int so_zero_copy_receive = 1;
179SYSCTL_NODE(_kern_ipc, OID_AUTO, zero_copy, CTLFLAG_RD, 0,
180 "Zero copy controls");
181SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, receive, CTLFLAG_RW,
182 &so_zero_copy_receive, 0, "Enable zero copy receive");
183SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, send, CTLFLAG_RW,
184 &so_zero_copy_send, 0, "Enable zero copy send");
185#endif /* ZERO_COPY_SOCKETS */
186
187/*
188 * accept_mtx locks down per-socket fields relating to accept queues. See
189 * socketvar.h for an annotation of the protected fields of struct socket.
190 */
191struct mtx accept_mtx;
192MTX_SYSINIT(accept_mtx, &accept_mtx, "accept", MTX_DEF);
193
194/*
195 * so_global_mtx protects so_gencnt, numopensockets, and the per-socket
196 * so_gencnt field.
197 */
198static struct mtx so_global_mtx;
199MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_glabel", MTX_DEF);
200
201/*
202 * General IPC sysctl name space, used by sockets and a variety of other IPC
203 * types.
204 */
205SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW, 0, "IPC");
206
207/*
208 * Sysctl to get and set the maximum global sockets limit. Notify protocols
209 * of the change so that they can update their dependent limits as required.
210 */
211static int
212sysctl_maxsockets(SYSCTL_HANDLER_ARGS)
213{
214 int error, newmaxsockets;
215
216 newmaxsockets = maxsockets;
217 error = sysctl_handle_int(oidp, &newmaxsockets, sizeof(int), req);
218 if (error == 0 && req->newptr) {
219 if (newmaxsockets > maxsockets) {
220 maxsockets = newmaxsockets;
221 if (maxsockets > ((maxfiles / 4) * 3)) {
222 maxfiles = (maxsockets * 5) / 4;
223 maxfilesperproc = (maxfiles * 9) / 10;
224 }
225 EVENTHANDLER_INVOKE(maxsockets_change);
226 } else
227 error = EINVAL;
228 }
229 return (error);
230}
231
232SYSCTL_PROC(_kern_ipc, OID_AUTO, maxsockets, CTLTYPE_INT|CTLFLAG_RW,
233 &maxsockets, 0, sysctl_maxsockets, "IU",
234 "Maximum number of sockets avaliable");
235
236/*
237 * Initialise maxsockets.
238 */
239static void init_maxsockets(void *ignored)
240{
241 TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets);
242 maxsockets = imax(maxsockets, imax(maxfiles, nmbclusters));
243}
244SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_maxsockets, NULL);
245
246/*
247 * Socket operation routines. These routines are called by the routines in
248 * sys_socket.c or from a system process, and implement the semantics of
249 * socket operations by switching out to the protocol specific routines.
250 */
251
252/*
253 * Get a socket structure from our zone, and initialize it. Note that it
254 * would probably be better to allocate socket and PCB at the same time, but
255 * I'm not convinced that all the protocols can be easily modified to do
256 * this.
257 *
258 * soalloc() returns a socket with a ref count of 0.
259 */
260static struct socket *
261soalloc(void)
262{
263 struct socket *so;
264
265 so = uma_zalloc(socket_zone, M_NOWAIT | M_ZERO);
266 if (so == NULL)
267 return (NULL);
268#ifdef MAC
269 if (mac_init_socket(so, M_NOWAIT) != 0) {
270 uma_zfree(socket_zone, so);
271 return (NULL);
272 }
273#endif
274 SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd");
275 SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv");
276 TAILQ_INIT(&so->so_aiojobq);
277 mtx_lock(&so_global_mtx);
278 so->so_gencnt = ++so_gencnt;
279 ++numopensockets;
280 mtx_unlock(&so_global_mtx);
281 return (so);
282}
283
284/*
285 * Free the storage associated with a socket at the socket layer, tear down
286 * locks, labels, etc. All protocol state is assumed already to have been
287 * torn down (and possibly never set up) by the caller.
288 */
289static void
290sodealloc(struct socket *so)
291{
292
293 KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count));
294 KASSERT(so->so_pcb == NULL, ("sodealloc(): so_pcb != NULL"));
295
296 mtx_lock(&so_global_mtx);
297 so->so_gencnt = ++so_gencnt;
298 --numopensockets; /* Could be below, but faster here. */
299 mtx_unlock(&so_global_mtx);
300 if (so->so_rcv.sb_hiwat)
301 (void)chgsbsize(so->so_cred->cr_uidinfo,
302 &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY);
303 if (so->so_snd.sb_hiwat)
304 (void)chgsbsize(so->so_cred->cr_uidinfo,
305 &so->so_snd.sb_hiwat, 0, RLIM_INFINITY);
306#ifdef INET
307 /* remove accept filter if one is present. */
308 if (so->so_accf != NULL)
309 do_setopt_accept_filter(so, NULL);
310#endif
311#ifdef MAC
312 mac_destroy_socket(so);
313#endif
314 crfree(so->so_cred);
315 SOCKBUF_LOCK_DESTROY(&so->so_snd);
316 SOCKBUF_LOCK_DESTROY(&so->so_rcv);
317 uma_zfree(socket_zone, so);
318}
319
320/*
321 * socreate returns a socket with a ref count of 1. The socket should be
322 * closed with soclose().
323 */
324int
325socreate(dom, aso, type, proto, cred, td)
326 int dom;
327 struct socket **aso;
328 int type;
329 int proto;
330 struct ucred *cred;
331 struct thread *td;
332{
333 struct protosw *prp;
334 struct socket *so;
335 int error;
336
337 if (proto)
338 prp = pffindproto(dom, proto, type);
339 else
340 prp = pffindtype(dom, type);
341
342 if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL ||
343 prp->pr_usrreqs->pru_attach == pru_attach_notsupp)
344 return (EPROTONOSUPPORT);
345
346 if (jailed(cred) && jail_socket_unixiproute_only &&
347 prp->pr_domain->dom_family != PF_LOCAL &&
348 prp->pr_domain->dom_family != PF_INET &&
349 prp->pr_domain->dom_family != PF_ROUTE) {
350 return (EPROTONOSUPPORT);
351 }
352
353 if (prp->pr_type != type)
354 return (EPROTOTYPE);
355 so = soalloc();
356 if (so == NULL)
357 return (ENOBUFS);
358
359 TAILQ_INIT(&so->so_incomp);
360 TAILQ_INIT(&so->so_comp);
361 so->so_type = type;
362 so->so_cred = crhold(cred);
363 so->so_proto = prp;
364#ifdef MAC
365 mac_create_socket(cred, so);
366#endif
367 knlist_init(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv),
368 NULL, NULL, NULL);
369 knlist_init(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd),
370 NULL, NULL, NULL);
371 so->so_count = 1;
372 /*
373 * Auto-sizing of socket buffers is managed by the protocols and
374 * the appropriate flags must be set in the pru_attach function.
375 */
376 error = (*prp->pr_usrreqs->pru_attach)(so, proto, td);
377 if (error) {
378 KASSERT(so->so_count == 1, ("socreate: so_count %d",
379 so->so_count));
380 so->so_count = 0;
381 sodealloc(so);
382 return (error);
383 }
384 *aso = so;
385 return (0);
386}
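/*
 * Illustrative sketch (hypothetical helper, added for exposition): minimal
 * in-kernel use of socreate()/sobind()/soclose() to open and bind a UDP
 * socket.  "laddr" is assumed to point to a valid sockaddr_in and the
 * usual network headers (e.g. netinet/in.h) to be available.
 */
static int
example_kernel_udp_socket(struct thread *td, struct sockaddr *laddr,
    struct socket **sop)
{
	struct socket *so;
	int error;

	error = socreate(AF_INET, &so, SOCK_DGRAM, IPPROTO_UDP,
	    td->td_ucred, td);
	if (error != 0)
		return (error);
	error = sobind(so, laddr, td);
	if (error != 0) {
		soclose(so);
		return (error);
	}
	*sop = so;
	return (0);
}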
387
388#ifdef REGRESSION
389static int regression_sonewconn_earlytest = 1;
390SYSCTL_INT(_regression, OID_AUTO, sonewconn_earlytest, CTLFLAG_RW,
391 &regression_sonewconn_earlytest, 0, "Perform early sonewconn limit test");
392#endif
393
394/*
395 * When an attempt at a new connection is noted on a socket which accepts
396 * connections, sonewconn is called. If the connection is possible (subject
397 * to space constraints, etc.) then we allocate a new structure, properly
398 * linked into the data structure of the original socket, and return this.
399 * Connstatus may be 0, or SS_ISCONFIRMING, or SS_ISCONNECTED.
400 *
401 * Note: the ref count on the socket is 0 on return.
402 */
403struct socket *
404sonewconn(head, connstatus)
405 register struct socket *head;
406 int connstatus;
407{
408 register struct socket *so;
409 int over;
410
411 ACCEPT_LOCK();
412 over = (head->so_qlen > 3 * head->so_qlimit / 2);
413 ACCEPT_UNLOCK();
414#ifdef REGRESSION
415 if (regression_sonewconn_earlytest && over)
416#else
417 if (over)
418#endif
419 return (NULL);
420 so = soalloc();
421 if (so == NULL)
422 return (NULL);
423 if ((head->so_options & SO_ACCEPTFILTER) != 0)
424 connstatus = 0;
425 so->so_head = head;
426 so->so_type = head->so_type;
427 so->so_options = head->so_options &~ SO_ACCEPTCONN;
428 so->so_linger = head->so_linger;
429 so->so_state = head->so_state | SS_NOFDREF;
430 so->so_proto = head->so_proto;
431 so->so_cred = crhold(head->so_cred);
432#ifdef MAC
433 SOCK_LOCK(head);
434 mac_create_socket_from_socket(head, so);
435 SOCK_UNLOCK(head);
436#endif
437 knlist_init(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv),
438 NULL, NULL, NULL);
439 knlist_init(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd),
440 NULL, NULL, NULL);
441 if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat) ||
442 (*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) {
443 sodealloc(so);
444 return (NULL);
445 }
446 so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
447 so->so_snd.sb_lowat = head->so_snd.sb_lowat;
448 so->so_rcv.sb_timeo = head->so_rcv.sb_timeo;
449 so->so_snd.sb_timeo = head->so_snd.sb_timeo;
450 so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE;
451 so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE;
452 so->so_state |= connstatus;
453 ACCEPT_LOCK();
454 if (connstatus) {
455 TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
456 so->so_qstate |= SQ_COMP;
457 head->so_qlen++;
458 } else {
459 /*
460 * Keep removing sockets from the head until there's room for
461 * us to insert on the tail. In pre-locking revisions, this
462 * was a simple if(), but as we could be racing with other
463 * threads and soabort() requires dropping locks, we must
464 * loop waiting for the condition to be true.
465 */
466 while (head->so_incqlen > head->so_qlimit) {
467 struct socket *sp;
468 sp = TAILQ_FIRST(&head->so_incomp);
469 TAILQ_REMOVE(&head->so_incomp, sp, so_list);
470 head->so_incqlen--;
471 sp->so_qstate &= ~SQ_INCOMP;
472 sp->so_head = NULL;
473 ACCEPT_UNLOCK();
474 soabort(sp);
475 ACCEPT_LOCK();
476 }
477 TAILQ_INSERT_TAIL(&head->so_incomp, so, so_list);
478 so->so_qstate |= SQ_INCOMP;
479 head->so_incqlen++;
480 }
481 ACCEPT_UNLOCK();
482 if (connstatus) {
483 sorwakeup(head);
484 wakeup_one(&head->so_timeo);
485 }
486 return (so);
487}
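/*
 * Illustrative sketch (hypothetical protocol code, added for exposition):
 * roughly how a protocol uses sonewconn() when it notes an incoming
 * connection request on a listening socket.  Passing connstatus 0 places
 * the new socket on the incomplete queue; soisconnected() later moves it
 * to the complete queue so that accept() can return it.
 */
static struct socket *
example_incoming_connection(struct socket *head)
{
	struct socket *so;

	so = sonewconn(head, 0);
	if (so == NULL)
		return (NULL);	/* queue full; caller drops the request */
	/* ... attach protocol state to 'so' and complete the handshake ... */
	soisconnected(so);
	return (so);
}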
488
489int
490sobind(so, nam, td)
491 struct socket *so;
492 struct sockaddr *nam;
493 struct thread *td;
494{
495
496 return ((*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td));
497}
498
499/*
500 * solisten() transitions a socket from a non-listening state to a listening
501 * state, but can also be used to update the listen queue depth on an
502 * existing listen socket. The protocol will call back into the sockets
503 * layer using solisten_proto_check() and solisten_proto() to check and set
504 * socket-layer listen state. Call backs are used so that the protocol can
505 * acquire both protocol and socket layer locks in whatever order is required
506 * by the protocol.
507 *
508 * Protocol implementors are advised to hold the socket lock across the
509 * socket-layer test and set to avoid races at the socket layer.
510 */
511int
512solisten(so, backlog, td)
513 struct socket *so;
514 int backlog;
515 struct thread *td;
516{
517
518 return ((*so->so_proto->pr_usrreqs->pru_listen)(so, backlog, td));
519}
520
521int
522solisten_proto_check(so)
523 struct socket *so;
524{
525
526 SOCK_LOCK_ASSERT(so);
527
528 if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING |
529 SS_ISDISCONNECTING))
530 return (EINVAL);
531 return (0);
532}
533
534void
535solisten_proto(so, backlog)
536 struct socket *so;
537 int backlog;
538{
539
540 SOCK_LOCK_ASSERT(so);
541
542 if (backlog < 0 || backlog > somaxconn)
543 backlog = somaxconn;
544 so->so_qlimit = backlog;
545 so->so_options |= SO_ACCEPTCONN;
546}
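/*
 * Illustrative sketch (hypothetical protocol code, added for exposition):
 * the call-back pattern described above as seen from a protocol's
 * pru_listen method.  Real protocols take their own pcb locks around this
 * as well; the socket lock is held across the check and the set.
 */
static int
example_pru_listen(struct socket *so, int backlog, struct thread *td)
{
	int error;

	SOCK_LOCK(so);
	error = solisten_proto_check(so);
	if (error == 0)
		solisten_proto(so, backlog);
	SOCK_UNLOCK(so);
	return (error);
}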
547
548/*
549 * Attempt to free a socket. This should really be sotryfree().
550 *
551 * sofree() will succeed if:
552 *
553 * - There are no outstanding file descriptor references or related consumers
554 * (so_count == 0).
555 *
556 * - The socket has been closed by user space, if ever open (SS_NOFDREF).
557 *
558 * - The protocol does not have an outstanding strong reference on the socket
559 * (SS_PROTOREF).
560 *
561 * - The socket is not in a completed connection queue, where a process may
562 * already have been notified that it is present; if it were removed, the
563 * user process could block in accept() despite select() saying it was ready.
564 *
565 * Otherwise, it will quietly abort so that a future call to sofree(), when
566 * conditions are right, can succeed.
567 */
568void
569sofree(so)
570 struct socket *so;
571{
572 struct protosw *pr = so->so_proto;
573 struct socket *head;
574
575 ACCEPT_LOCK_ASSERT();
576 SOCK_LOCK_ASSERT(so);
577
578 if ((so->so_state & SS_NOFDREF) == 0 || so->so_count != 0 ||
579 (so->so_state & SS_PROTOREF) || (so->so_qstate & SQ_COMP)) {
580 SOCK_UNLOCK(so);
581 ACCEPT_UNLOCK();
582 return;
583 }
584
585 head = so->so_head;
586 if (head != NULL) {
587 KASSERT((so->so_qstate & SQ_COMP) != 0 ||
588 (so->so_qstate & SQ_INCOMP) != 0,
589 ("sofree: so_head != NULL, but neither SQ_COMP nor "
590 "SQ_INCOMP"));
591 KASSERT((so->so_qstate & SQ_COMP) == 0 ||
592 (so->so_qstate & SQ_INCOMP) == 0,
593 ("sofree: so->so_qstate is SQ_COMP and also SQ_INCOMP"));
594 TAILQ_REMOVE(&head->so_incomp, so, so_list);
595 head->so_incqlen--;
596 so->so_qstate &= ~SQ_INCOMP;
597 so->so_head = NULL;
598 }
599 KASSERT((so->so_qstate & SQ_COMP) == 0 &&
600 (so->so_qstate & SQ_INCOMP) == 0,
601 ("sofree: so_head == NULL, but still SQ_COMP(%d) or SQ_INCOMP(%d)",
602 so->so_qstate & SQ_COMP, so->so_qstate & SQ_INCOMP));
603 if (so->so_options & SO_ACCEPTCONN) {
604 KASSERT((TAILQ_EMPTY(&so->so_comp)), ("sofree: so_comp populated"));
605 KASSERT((TAILQ_EMPTY(&so->so_incomp)), ("sofree: so_comp populated"));
606 }
607 SOCK_UNLOCK(so);
608 ACCEPT_UNLOCK();
609
610 if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
611 (*pr->pr_domain->dom_dispose)(so->so_rcv.sb_mb);
612 if (pr->pr_usrreqs->pru_detach != NULL)
613 (*pr->pr_usrreqs->pru_detach)(so);
614
615 /*
616 * From this point on, we assume that no other references to this
617 * socket exist anywhere else in the stack. Therefore, no locks need
618 * to be acquired or held.
619 *
620 * We used to do a lot of socket buffer and socket locking here, as
621 * well as invoke sorflush() and perform wakeups. The direct calls to
622 * dom_dispose() and sbrelease_internal() are an inlining of what was
623 * necessary from sorflush().
624 *
625 * Notice that the socket buffer and kqueue state are torn down
626 * before calling pru_detach. This means that protocols should not
627 * assume they can perform socket wakeups, etc., in their detach
628 * code.
629 */
630 KASSERT((so->so_snd.sb_flags & SB_LOCK) == 0, ("sofree: snd sblock"));
631 KASSERT((so->so_rcv.sb_flags & SB_LOCK) == 0, ("sofree: rcv sblock"));
632 sbdestroy(&so->so_snd, so);
633 sbdestroy(&so->so_rcv, so);
634 knlist_destroy(&so->so_rcv.sb_sel.si_note);
635 knlist_destroy(&so->so_snd.sb_sel.si_note);
636 sodealloc(so);
637}
638
639/*
640 * Close a socket on last file table reference removal. Initiate disconnect
641 * if connected. Free socket when disconnect complete.
642 *
643 * This function will sorele() the socket. Note that soclose() may be called
644 * prior to the ref count reaching zero. The actual socket structure will
645 * not be freed until the ref count reaches zero.
646 */
647int
648soclose(so)
649 struct socket *so;
650{
651 int error = 0;
652
653 KASSERT(!(so->so_state & SS_NOFDREF), ("soclose: SS_NOFDREF on enter"));
654
655 funsetown(&so->so_sigio);
656 if (so->so_state & SS_ISCONNECTED) {
657 if ((so->so_state & SS_ISDISCONNECTING) == 0) {
658 error = sodisconnect(so);
659 if (error)
660 goto drop;
661 }
662 if (so->so_options & SO_LINGER) {
663 if ((so->so_state & SS_ISDISCONNECTING) &&
664 (so->so_state & SS_NBIO))
665 goto drop;
666 while (so->so_state & SS_ISCONNECTED) {
667 error = tsleep(&so->so_timeo,
668 PSOCK | PCATCH, "soclos", so->so_linger * hz);
669 if (error)
670 break;
671 }
672 }
673 }
674
675drop:
676 if (so->so_proto->pr_usrreqs->pru_close != NULL)
677 (*so->so_proto->pr_usrreqs->pru_close)(so);
678 if (so->so_options & SO_ACCEPTCONN) {
679 struct socket *sp;
680 ACCEPT_LOCK();
681 while ((sp = TAILQ_FIRST(&so->so_incomp)) != NULL) {
682 TAILQ_REMOVE(&so->so_incomp, sp, so_list);
683 so->so_incqlen--;
684 sp->so_qstate &= ~SQ_INCOMP;
685 sp->so_head = NULL;
686 ACCEPT_UNLOCK();
687 soabort(sp);
688 ACCEPT_LOCK();
689 }
690 while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
691 TAILQ_REMOVE(&so->so_comp, sp, so_list);
692 so->so_qlen--;
693 sp->so_qstate &= ~SQ_COMP;
694 sp->so_head = NULL;
695 ACCEPT_UNLOCK();
696 soabort(sp);
697 ACCEPT_LOCK();
698 }
699 ACCEPT_UNLOCK();
700 }
701 ACCEPT_LOCK();
702 SOCK_LOCK(so);
703 KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF"));
704 so->so_state |= SS_NOFDREF;
705 sorele(so);
706 return (error);
707}
708
709/*
710 * soabort() is used to abruptly tear down a connection, such as when a
711 * resource limit is reached (listen queue depth exceeded), or if a listen
712 * socket is closed while there are sockets waiting to be accepted.
713 *
714 * This interface is tricky, because it is called on an unreferenced socket,
715 * and must be called only by a thread that has actually removed the socket
716 * from the listen queue it was on, or races with other threads are risked.
717 *
718 * This interface will call into the protocol code, so must not be called
719 * with any socket locks held. Protocols do call it while holding their own
720 * recursible protocol mutexes, but this is something that should be subject
721 * to review in the future.
722 */
723void
724soabort(so)
725 struct socket *so;
726{
727
728 /*
729 * In as much as is possible, assert that no references to this
730 * socket are held. This is not quite the same as asserting that the
731 * current thread is responsible for arranging for no references, but
732 * is as close as we can get for now.
733 */
734 KASSERT(so->so_count == 0, ("soabort: so_count"));
735 KASSERT((so->so_state & SS_PROTOREF) == 0, ("soabort: SS_PROTOREF"));
736 KASSERT(so->so_state & SS_NOFDREF, ("soabort: !SS_NOFDREF"));
737 KASSERT((so->so_state & SQ_COMP) == 0, ("soabort: SQ_COMP"));
738 KASSERT((so->so_state & SQ_INCOMP) == 0, ("soabort: SQ_INCOMP"));
739
740 if (so->so_proto->pr_usrreqs->pru_abort != NULL)
741 (*so->so_proto->pr_usrreqs->pru_abort)(so);
742 ACCEPT_LOCK();
743 SOCK_LOCK(so);
744 sofree(so);
745}
746
747int
748soaccept(so, nam)
749 struct socket *so;
750 struct sockaddr **nam;
751{
752 int error;
753
754 SOCK_LOCK(so);
755 KASSERT((so->so_state & SS_NOFDREF) != 0, ("soaccept: !NOFDREF"));
756 so->so_state &= ~SS_NOFDREF;
757 SOCK_UNLOCK(so);
758 error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
759 return (error);
760}
761
762int
763soconnect(so, nam, td)
764 struct socket *so;
765 struct sockaddr *nam;
766 struct thread *td;
767{
768 int error;
769
770 if (so->so_options & SO_ACCEPTCONN)
771 return (EOPNOTSUPP);
772 /*
773 * If protocol is connection-based, can only connect once.
774 * Otherwise, if connected, try to disconnect first. This allows
775 * user to disconnect by connecting to, e.g., a null address.
776 */
777 if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
778 ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
779 (error = sodisconnect(so)))) {
780 error = EISCONN;
781 } else {
782 /*
783 * Prevent accumulated error from previous connection from
784 * biting us.
785 */
786 so->so_error = 0;
787 error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, td);
788 }
789
790 return (error);
791}
792
793int
794soconnect2(so1, so2)
795 struct socket *so1;
796 struct socket *so2;
797{
798
799 return ((*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2));
800}
801
802int
803sodisconnect(so)
804 struct socket *so;
805{
806 int error;
807
808 if ((so->so_state & SS_ISCONNECTED) == 0)
809 return (ENOTCONN);
810 if (so->so_state & SS_ISDISCONNECTING)
811 return (EALREADY);
812 error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
813 return (error);
814}
815
816#ifdef ZERO_COPY_SOCKETS
817struct so_zerocopy_stats{
818 int size_ok;
819 int align_ok;
820 int found_ifp;
821};
822struct so_zerocopy_stats so_zerocp_stats = {0,0,0};
823#include <netinet/in.h>
824#include <net/route.h>
825#include <netinet/in_pcb.h>
826#include <vm/vm.h>
827#include <vm/vm_page.h>
828#include <vm/vm_object.h>
829
830/*
831 * sosend_copyin() is only used if zero copy sockets are enabled. Otherwise
832 * sosend_dgram() and sosend_generic() use m_uiotombuf().
833 *
834 * sosend_copyin() accepts a uio and prepares an mbuf chain holding part or
835 * all of the data referenced by the uio. If desired, it uses zero-copy.
836 * *space will be updated to reflect data copied in.
837 *
838 * NB: If atomic I/O is requested, the caller must already have checked that
839 * space can hold resid bytes.
840 *
841 * NB: In the event of an error, the caller may need to free the partial
842 * chain pointed to by *mpp. The contents of both *uio and *space may be
843 * modified even in the case of an error.
844 */
845static int
846sosend_copyin(struct uio *uio, struct mbuf **retmp, int atomic, long *space,
847 int flags)
848{
849 struct mbuf *m, **mp, *top;
850 long len, resid;
851 int error;
852#ifdef ZERO_COPY_SOCKETS
853 int cow_send;
854#endif
855
856 *retmp = top = NULL;
857 mp = &top;
858 len = 0;
859 resid = uio->uio_resid;
860 error = 0;
861 do {
862#ifdef ZERO_COPY_SOCKETS
863 cow_send = 0;
864#endif /* ZERO_COPY_SOCKETS */
865 if (resid >= MINCLSIZE) {
866#ifdef ZERO_COPY_SOCKETS
867 if (top == NULL) {
868 m = m_gethdr(M_WAITOK, MT_DATA);
869 m->m_pkthdr.len = 0;
870 m->m_pkthdr.rcvif = NULL;
871 } else
872 m = m_get(M_WAITOK, MT_DATA);
873 if (so_zero_copy_send &&
874 resid>=PAGE_SIZE &&
875 *space>=PAGE_SIZE &&
876 uio->uio_iov->iov_len>=PAGE_SIZE) {
877 so_zerocp_stats.size_ok++;
878 so_zerocp_stats.align_ok++;
879 cow_send = socow_setup(m, uio);
880 len = cow_send;
881 }
882 if (!cow_send) {
883 m_clget(m, M_WAITOK);
884 len = min(min(MCLBYTES, resid), *space);
885 }
886#else /* ZERO_COPY_SOCKETS */
887 if (top == NULL) {
888 m = m_getcl(M_TRYWAIT, MT_DATA, M_PKTHDR);
889 m->m_pkthdr.len = 0;
890 m->m_pkthdr.rcvif = NULL;
891 } else
892 m = m_getcl(M_TRYWAIT, MT_DATA, 0);
893 len = min(min(MCLBYTES, resid), *space);
894#endif /* ZERO_COPY_SOCKETS */
895 } else {
896 if (top == NULL) {
897 m = m_gethdr(M_TRYWAIT, MT_DATA);
898 m->m_pkthdr.len = 0;
899 m->m_pkthdr.rcvif = NULL;
900
901 len = min(min(MHLEN, resid), *space);
902 /*
903 * For datagram protocols, leave room
904 * for protocol headers in first mbuf.
905 */
906 if (atomic && m && len < MHLEN)
907 MH_ALIGN(m, len);
908 } else {
909 m = m_get(M_TRYWAIT, MT_DATA);
910 len = min(min(MLEN, resid), *space);
911 }
912 }
913 if (m == NULL) {
914 error = ENOBUFS;
915 goto out;
916 }
917
918 *space -= len;
919#ifdef ZERO_COPY_SOCKETS
920 if (cow_send)
921 error = 0;
922 else
923#endif /* ZERO_COPY_SOCKETS */
924 error = uiomove(mtod(m, void *), (int)len, uio);
925 resid = uio->uio_resid;
926 m->m_len = len;
927 *mp = m;
928 top->m_pkthdr.len += len;
929 if (error)
930 goto out;
931 mp = &m->m_next;
932 if (resid <= 0) {
933 if (flags & MSG_EOR)
934 top->m_flags |= M_EOR;
935 break;
936 }
937 } while (*space > 0 && atomic);
938out:
939 *retmp = top;
940 return (error);
941}
942#endif /*ZERO_COPY_SOCKETS*/
943
944#define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
945
946int
947sosend_dgram(so, addr, uio, top, control, flags, td)
948 struct socket *so;
949 struct sockaddr *addr;
950 struct uio *uio;
951 struct mbuf *top;
952 struct mbuf *control;
953 int flags;
954 struct thread *td;
955{
956 long space, resid;
957 int clen = 0, error, dontroute;
958#ifdef ZERO_COPY_SOCKETS
959 int atomic = sosendallatonce(so) || top;
960#endif
961
962 KASSERT(so->so_type == SOCK_DGRAM, ("sodgram_send: !SOCK_DGRAM"));
963 KASSERT(so->so_proto->pr_flags & PR_ATOMIC,
964 ("sodgram_send: !PR_ATOMIC"));
965
966 if (uio != NULL)
967 resid = uio->uio_resid;
968 else
969 resid = top->m_pkthdr.len;
970 /*
971 * In theory resid should be unsigned. However, space must be
972 * signed, as it might be less than 0 if we over-committed, and we
973 * must use a signed comparison of space and resid. On the other
974 * hand, a negative resid causes us to loop sending 0-length
975 * segments to the protocol.
976 *
977 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
978 * type sockets since that's an error.
979 */
980 if (resid < 0) {
981 error = EINVAL;
982 goto out;
983 }
984
985 dontroute =
986 (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0;
987 if (td != NULL)
988 td->td_proc->p_stats->p_ru.ru_msgsnd++;
989 if (control != NULL)
990 clen = control->m_len;
991
992 SOCKBUF_LOCK(&so->so_snd);
993 if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
994 SOCKBUF_UNLOCK(&so->so_snd);
995 error = EPIPE;
996 goto out;
997 }
998 if (so->so_error) {
999 error = so->so_error;
1000 so->so_error = 0;
1001 SOCKBUF_UNLOCK(&so->so_snd);
1002 goto out;
1003 }
1004 if ((so->so_state & SS_ISCONNECTED) == 0) {
1005 /*
1006 * `sendto' and `sendmsg' are allowed on a connection-based
1007 * socket if it supports implied connect. Return ENOTCONN if
1008 * not connected and no address is supplied.
1009 */
1010 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
1011 (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
1012 if ((so->so_state & SS_ISCONFIRMING) == 0 &&
1013 !(resid == 0 && clen != 0)) {
1014 SOCKBUF_UNLOCK(&so->so_snd);
1015 error = ENOTCONN;
1016 goto out;
1017 }
1018 } else if (addr == NULL) {
1019 if (so->so_proto->pr_flags & PR_CONNREQUIRED)
1020 error = ENOTCONN;
1021 else
1022 error = EDESTADDRREQ;
1023 SOCKBUF_UNLOCK(&so->so_snd);
1024 goto out;
1025 }
1026 }
1027
1028 /*
1029 * Do we need MSG_OOB support in SOCK_DGRAM? Signs here may be a
1030 * problem and need fixing.
1031 */
1032 space = sbspace(&so->so_snd);
1033 if (flags & MSG_OOB)
1034 space += 1024;
1035 space -= clen;
1036 SOCKBUF_UNLOCK(&so->so_snd);
1037 if (resid > space) {
1038 error = EMSGSIZE;
1039 goto out;
1040 }
1041 if (uio == NULL) {
1042 resid = 0;
1043 if (flags & MSG_EOR)
1044 top->m_flags |= M_EOR;
1045 } else {
1046#ifdef ZERO_COPY_SOCKETS
1047 error = sosend_copyin(uio, &top, atomic, &space, flags);
1048 if (error)
1049 goto out;
1050#else
1051 /*
1052 * Copy the data from userland into a mbuf chain.
1053 * If no data is to be copied in, a single empty mbuf
1054 * is returned.
1055 */
1056 top = m_uiotombuf(uio, M_WAITOK, space, max_hdr,
1057 (M_PKTHDR | ((flags & MSG_EOR) ? M_EOR : 0)));
1058 if (top == NULL) {
1059 error = EFAULT; /* only possible error */
1060 goto out;
1061 }
1062 space -= resid - uio->uio_resid;
1063#endif
1064 resid = uio->uio_resid;
1065 }
1066 KASSERT(resid == 0, ("sosend_dgram: resid != 0"));
1067 /*
1068 * XXXRW: Frobbing SO_DONTROUTE here is even worse without sblock
1069 * than with.
1070 */
1071 if (dontroute) {
1072 SOCK_LOCK(so);
1073 so->so_options |= SO_DONTROUTE;
1074 SOCK_UNLOCK(so);
1075 }
1076 /*
1077 * XXX all the SBS_CANTSENDMORE checks previously done could be out
1078 * of date. We could have received a reset packet in an interrupt or
1079 * maybe we slept while doing page faults in uiomove() etc. We could
1080 * probably recheck again inside the locking protection here, but
1081 * there are probably other places that this also happens. We must
1082 * rethink this.
1083 */
1084 error = (*so->so_proto->pr_usrreqs->pru_send)(so,
1085 (flags & MSG_OOB) ? PRUS_OOB :
1086 /*
1087 * If the user set MSG_EOF, the protocol understands this flag, and
1088 * there is nothing left to send, then use PRU_SEND_EOF instead of PRU_SEND.
1089 */
1090 ((flags & MSG_EOF) &&
1091 (so->so_proto->pr_flags & PR_IMPLOPCL) &&
1092 (resid <= 0)) ?
1093 PRUS_EOF :
1094 /* If there is more to send set PRUS_MORETOCOME */
1095 (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
1096 top, addr, control, td);
1097 if (dontroute) {
1098 SOCK_LOCK(so);
1099 so->so_options &= ~SO_DONTROUTE;
1100 SOCK_UNLOCK(so);
1101 }
1102 clen = 0;
1103 control = NULL;
1104 top = NULL;
1105out:
1106 if (top != NULL)
1107 m_freem(top);
1108 if (control != NULL)
1109 m_freem(control);
1110 return (error);
1111}
1112
1113/*
1114 * Send on a socket. If send must go all at once and message is larger than
1115 * send buffering, then hard error. Lock against other senders. If must go
1116 * all at once and not enough room now, then inform user that this would
1117 * block and do nothing. Otherwise, if nonblocking, send as much as
1118 * possible. The data to be sent is described by "uio" if nonzero, otherwise
1119 * by the mbuf chain "top" (which must be null if uio is not). Data provided
1120 * in mbuf chain must be small enough to send all at once.
1121 *
1122 * Returns nonzero on error, timeout or signal; callers must check for short
1123 * counts if EINTR/ERESTART are returned. Data and control buffers are freed
1124 * on return.
1125 */
1126#define snderr(errno) { error = (errno); goto release; }
1127int
1128sosend_generic(so, addr, uio, top, control, flags, td)
1129 struct socket *so;
1130 struct sockaddr *addr;
1131 struct uio *uio;
1132 struct mbuf *top;
1133 struct mbuf *control;
1134 int flags;
1135 struct thread *td;
1136{
1137 long space, resid;
1138 int clen = 0, error, dontroute;
1139 int atomic = sosendallatonce(so) || top;
1140
1141 if (uio != NULL)
1142 resid = uio->uio_resid;
1143 else
1144 resid = top->m_pkthdr.len;
1145 /*
1146 * In theory resid should be unsigned. However, space must be
1147 * signed, as it might be less than 0 if we over-committed, and we
1148 * must use a signed comparison of space and resid. On the other
1149 * hand, a negative resid causes us to loop sending 0-length
1150 * segments to the protocol.
1151 *
1152 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
1153 * type sockets since that's an error.
1154 */
1155 if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
1156 error = EINVAL;
1157 goto out;
1158 }
1159
1160 dontroute =
1161 (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
1162 (so->so_proto->pr_flags & PR_ATOMIC);
1163 if (td != NULL)
1164 td->td_proc->p_stats->p_ru.ru_msgsnd++;
1165 if (control != NULL)
1166 clen = control->m_len;
1167
1168 SOCKBUF_LOCK(&so->so_snd);
1169restart:
1170 SOCKBUF_LOCK_ASSERT(&so->so_snd);
1171 error = sblock(&so->so_snd, SBLOCKWAIT(flags));
1172 if (error)
1173 goto out_locked;
1174 do {
1175 SOCKBUF_LOCK_ASSERT(&so->so_snd);
1176 if (so->so_snd.sb_state & SBS_CANTSENDMORE)
1177 snderr(EPIPE);
1178 if (so->so_error) {
1179 error = so->so_error;
1180 so->so_error = 0;
1181 goto release;
1182 }
1183 if ((so->so_state & SS_ISCONNECTED) == 0) {
1184 /*
1185 * `sendto' and `sendmsg' is allowed on a connection-
1186 * based socket if it supports implied connect.
1187 * Return ENOTCONN if not connected and no address is
1188 * supplied.
1189 */
1190 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
1191 (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
1192 if ((so->so_state & SS_ISCONFIRMING) == 0 &&
1193 !(resid == 0 && clen != 0))
1194 snderr(ENOTCONN);
1195 } else if (addr == NULL)
1196 snderr(so->so_proto->pr_flags & PR_CONNREQUIRED ?
1197 ENOTCONN : EDESTADDRREQ);
1198 }
1199 space = sbspace(&so->so_snd);
1200 if (flags & MSG_OOB)
1201 space += 1024;
1202 if ((atomic && resid > so->so_snd.sb_hiwat) ||
1203 clen > so->so_snd.sb_hiwat)
1204 snderr(EMSGSIZE);
1205 if (space < resid + clen &&
1206 (atomic || space < so->so_snd.sb_lowat || space < clen)) {
1207 if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO))
1208 snderr(EWOULDBLOCK);
1209 sbunlock(&so->so_snd);
1210 error = sbwait(&so->so_snd);
1211 if (error)
1212 goto out_locked;
1213 goto restart;
1214 }
1215 SOCKBUF_UNLOCK(&so->so_snd);
1216 space -= clen;
1217 do {
1218 if (uio == NULL) {
1219 resid = 0;
1220 if (flags & MSG_EOR)
1221 top->m_flags |= M_EOR;
1222 } else {
1223#ifdef ZERO_COPY_SOCKETS
1224 error = sosend_copyin(uio, &top, atomic,
1225 &space, flags);
1226 if (error != 0) {
1227 SOCKBUF_LOCK(&so->so_snd);
1228 goto release;
1229 }
1230#else
1231 /*
1232 * Copy the data from userland into a mbuf
1233 * chain. If no data is to be copied in,
1234 * a single empty mbuf is returned.
1235 */
1236 top = m_uiotombuf(uio, M_WAITOK, space,
1237 (atomic ? max_hdr : 0),
1238 (atomic ? M_PKTHDR : 0) |
1239 ((flags & MSG_EOR) ? M_EOR : 0));
1240 if (top == NULL) {
1241 SOCKBUF_LOCK(&so->so_snd);
1242 error = EFAULT; /* only possible error */
1243 goto release;
1244 }
1245 space -= resid - uio->uio_resid;
1246#endif
1247 resid = uio->uio_resid;
1248 }
1249 if (dontroute) {
1250 SOCK_LOCK(so);
1251 so->so_options |= SO_DONTROUTE;
1252 SOCK_UNLOCK(so);
1253 }
1254 /*
1255 * XXX all the SBS_CANTSENDMORE checks previously
1256		 * done could be out of date.  We could have received
1257 * a reset packet in an interrupt or maybe we slept
1258 * while doing page faults in uiomove() etc. We
1259 * could probably recheck again inside the locking
1260 * protection here, but there are probably other
1261 * places that this also happens. We must rethink
1262 * this.
1263 */
1264 error = (*so->so_proto->pr_usrreqs->pru_send)(so,
1265 (flags & MSG_OOB) ? PRUS_OOB :
1266 /*
1267 * If the user set MSG_EOF, the protocol understands
1268			 * this flag, and there is nothing left to send, then use
1269 * PRU_SEND_EOF instead of PRU_SEND.
1270 */
1271 ((flags & MSG_EOF) &&
1272 (so->so_proto->pr_flags & PR_IMPLOPCL) &&
1273 (resid <= 0)) ?
1274 PRUS_EOF :
1275 /* If there is more to send set PRUS_MORETOCOME. */
1276 (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
1277 top, addr, control, td);
1278 if (dontroute) {
1279 SOCK_LOCK(so);
1280 so->so_options &= ~SO_DONTROUTE;
1281 SOCK_UNLOCK(so);
1282 }
1283 clen = 0;
1284 control = NULL;
1285 top = NULL;
1286 if (error) {
1287 SOCKBUF_LOCK(&so->so_snd);
1288 goto release;
1289 }
1290 } while (resid && space > 0);
1291 SOCKBUF_LOCK(&so->so_snd);
1292 } while (resid);
1293
1294release:
1295 SOCKBUF_LOCK_ASSERT(&so->so_snd);
1296 sbunlock(&so->so_snd);
1297out_locked:
1298 SOCKBUF_LOCK_ASSERT(&so->so_snd);
1299 SOCKBUF_UNLOCK(&so->so_snd);
1300out:
1301 if (top != NULL)
1302 m_freem(top);
1303 if (control != NULL)
1304 m_freem(control);
1305 return (error);
1306}
1307#undef snderr
1308
1309int
1310sosend(so, addr, uio, top, control, flags, td)
1311 struct socket *so;
1312 struct sockaddr *addr;
1313 struct uio *uio;
1314 struct mbuf *top;
1315 struct mbuf *control;
1316 int flags;
1317 struct thread *td;
1318{
1319
1320 /* XXXRW: Temporary debugging. */
1321 KASSERT(so->so_proto->pr_usrreqs->pru_sosend != sosend,
1322 ("sosend: protocol calls sosend"));
1323
1324 return (so->so_proto->pr_usrreqs->pru_sosend(so, addr, uio, top,
1325 control, flags, td));
1326}
1327
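/*
 * Illustrative sketch, not part of this file: how a kernel consumer might
 * drive sosend() on an already-connected socket, describing the data with a
 * uio as discussed in the sosend_generic() comment above.  The function name
 * and its callers are hypothetical.
 */
static int
example_kernel_send(struct socket *so, void *data, size_t len,
    struct thread *td)
{
	struct uio auio;
	struct iovec aiov;

	aiov.iov_base = data;
	aiov.iov_len = len;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = 0;
	auio.uio_resid = len;
	auio.uio_segflg = UIO_SYSSPACE;	/* data lives in kernel memory */
	auio.uio_rw = UIO_WRITE;
	auio.uio_td = td;

	/* addr, top and control are NULL: connected socket, data via uio. */
	return (sosend(so, NULL, &auio, NULL, NULL, 0, td));
}
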
1328/*
1329 * The part of soreceive() that implements reading non-inline out-of-band
1330 * data from a socket. For more complete comments, see soreceive(), from
1331 * which this code originated.
1332 *
1333 * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is
1334 * unable to return an mbuf chain to the caller.
1335 */
1336static int
1337soreceive_rcvoob(so, uio, flags)
1338 struct socket *so;
1339 struct uio *uio;
1340 int flags;
1341{
1342 struct protosw *pr = so->so_proto;
1343 struct mbuf *m;
1344 int error;
1345
1346 KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0"));
1347
1348 m = m_get(M_TRYWAIT, MT_DATA);
1349 if (m == NULL)
1350 return (ENOBUFS);
1351 error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
1352 if (error)
1353 goto bad;
1354 do {
1355#ifdef ZERO_COPY_SOCKETS
1356 if (so_zero_copy_receive) {
1357 int disposable;
1358
1359 if ((m->m_flags & M_EXT)
1360 && (m->m_ext.ext_type == EXT_DISPOSABLE))
1361 disposable = 1;
1362 else
1363 disposable = 0;
1364
1365 error = uiomoveco(mtod(m, void *),
1366 min(uio->uio_resid, m->m_len),
1367 uio, disposable);
1368 } else
1369#endif /* ZERO_COPY_SOCKETS */
1370 error = uiomove(mtod(m, void *),
1371 (int) min(uio->uio_resid, m->m_len), uio);
1372 m = m_free(m);
1373 } while (uio->uio_resid && error == 0 && m);
1374bad:
1375 if (m != NULL)
1376 m_freem(m);
1377 return (error);
1378}
1379
1380/*
1381 * Following replacement or removal of the first mbuf on the first mbuf chain
1382 * of a socket buffer, push necessary state changes back into the socket
1383 * buffer so that other consumers see the values consistently. 'nextrecord'
1384 * is the caller's locally stored value of the original value of
1385 * sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes.
1386 * NOTE: 'nextrecord' may be NULL.
1387 */
1388static __inline void
1389sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord)
1390{
1391
1392 SOCKBUF_LOCK_ASSERT(sb);
1393 /*
1394 * First, update for the new value of nextrecord. If necessary, make
1395 * it the first record.
1396 */
1397 if (sb->sb_mb != NULL)
1398 sb->sb_mb->m_nextpkt = nextrecord;
1399 else
1400 sb->sb_mb = nextrecord;
1401
1402 /*
1403 * Now update any dependent socket buffer fields to reflect the new
1404 * state. This is an expanded inline of SB_EMPTY_FIXUP(), with the
1405 * addition of a second clause that takes care of the case where
1406 * sb_mb has been updated, but remains the last record.
1407 */
1408 if (sb->sb_mb == NULL) {
1409 sb->sb_mbtail = NULL;
1410 sb->sb_lastrecord = NULL;
1411 } else if (sb->sb_mb->m_nextpkt == NULL)
1412 sb->sb_lastrecord = sb->sb_mb;
1413}
1414
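/*
 * Illustrative sketch, not part of this file: the caller pattern that
 * sockbuf_pushsync() expects, mirroring the PR_ADDR handling in
 * soreceive_generic() below.  The function name is hypothetical.
 */
static void
example_drop_lead_mbuf(struct socket *so)
{
	struct mbuf *m, *nextrecord;

	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	m = so->so_rcv.sb_mb;
	if (m == NULL)
		return;
	nextrecord = m->m_nextpkt;	/* cache before sb_mb changes */
	sbfree(&so->so_rcv, m);		/* fix byte/mbuf accounting */
	so->so_rcv.sb_mb = m_free(m);	/* unlink and free the lead mbuf */
	sockbuf_pushsync(&so->so_rcv, nextrecord);
}
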
1415
1416/*
1417 * Implement receive operations on a socket. We depend on the way that
1418 * records are added to the sockbuf by sbappend. In particular, each record
1419 * (mbufs linked through m_next) must begin with an address if the protocol
1420 * so specifies, followed by an optional mbuf or mbufs containing ancillary
1421 * data, and then zero or more mbufs of data. In order to allow parallelism
1422 * between network receive and copying to user space, as well as avoid
1423 * sleeping with a mutex held, we release the socket buffer mutex during the
1424 * user space copy. Although the sockbuf is locked, new data may still be
1425 * appended, and thus we must maintain consistency of the sockbuf during that
1426 * time.
1427 *
1428 * The caller may receive the data as a single mbuf chain by supplying an
1429 * mbuf **mp0 for use in returning the chain. The uio is then used only for
1430 * the count in uio_resid.
1431 */
1432int
1433soreceive_generic(so, psa, uio, mp0, controlp, flagsp)
1434 struct socket *so;
1435 struct sockaddr **psa;
1436 struct uio *uio;
1437 struct mbuf **mp0;
1438 struct mbuf **controlp;
1439 int *flagsp;
1440{
1441 struct mbuf *m, **mp;
1442 int flags, len, error, offset;
1443 struct protosw *pr = so->so_proto;
1444 struct mbuf *nextrecord;
1445 int moff, type = 0;
1446 int orig_resid = uio->uio_resid;
1447
1448 mp = mp0;
1449 if (psa != NULL)
1450 *psa = NULL;
1451 if (controlp != NULL)
1452 *controlp = NULL;
1453 if (flagsp != NULL)
1454 flags = *flagsp &~ MSG_EOR;
1455 else
1456 flags = 0;
1457 if (flags & MSG_OOB)
1458 return (soreceive_rcvoob(so, uio, flags));
1459 if (mp != NULL)
1460 *mp = NULL;
1461 if ((pr->pr_flags & PR_WANTRCVD) && (so->so_state & SS_ISCONFIRMING)
1462 && uio->uio_resid)
1463 (*pr->pr_usrreqs->pru_rcvd)(so, 0);
1464
1465 SOCKBUF_LOCK(&so->so_rcv);
1466restart:
1467 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1468 error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
1469 if (error)
1470 goto out;
1471
1472 m = so->so_rcv.sb_mb;
1473 /*
1474 * If we have less data than requested, block awaiting more (subject
1475 * to any timeout) if:
1476 * 1. the current count is less than the low water mark, or
1477 * 2. MSG_WAITALL is set, and it is possible to do the entire
1478	 *	receive operation at once if we block (resid <= hiwat), and
1479	 *   3. MSG_DONTWAIT is not set.
1480 * If MSG_WAITALL is set but resid is larger than the receive buffer,
1481 * we have to do the receive in sections, and thus risk returning a
1482 * short count if a timeout or signal occurs after we start.
1483 */
1484 if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
1485 so->so_rcv.sb_cc < uio->uio_resid) &&
1486 (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
1487 ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
1488 m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
1489 KASSERT(m != NULL || !so->so_rcv.sb_cc,
1490 ("receive: m == %p so->so_rcv.sb_cc == %u",
1491 m, so->so_rcv.sb_cc));
1492 if (so->so_error) {
1493 if (m != NULL)
1494 goto dontblock;
1495 error = so->so_error;
1496 if ((flags & MSG_PEEK) == 0)
1497 so->so_error = 0;
1498 goto release;
1499 }
1500 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1501 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
1502 if (m)
1503 goto dontblock;
1504 else
1505 goto release;
1506 }
1507 for (; m != NULL; m = m->m_next)
1508 if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
1509 m = so->so_rcv.sb_mb;
1510 goto dontblock;
1511 }
1512 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
1513 (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
1514 error = ENOTCONN;
1515 goto release;
1516 }
1517 if (uio->uio_resid == 0)
1518 goto release;
1519 if ((so->so_state & SS_NBIO) ||
1520 (flags & (MSG_DONTWAIT|MSG_NBIO))) {
1521 error = EWOULDBLOCK;
1522 goto release;
1523 }
1524 SBLASTRECORDCHK(&so->so_rcv);
1525 SBLASTMBUFCHK(&so->so_rcv);
1526 sbunlock(&so->so_rcv);
1527 error = sbwait(&so->so_rcv);
1528 if (error)
1529 goto out;
1530 goto restart;
1531 }
1532dontblock:
1533 /*
1534 * From this point onward, we maintain 'nextrecord' as a cache of the
1535 * pointer to the next record in the socket buffer. We must keep the
1536 * various socket buffer pointers and local stack versions of the
1537 * pointers in sync, pushing out modifications before dropping the
1538 * socket buffer mutex, and re-reading them when picking it up.
1539 *
1540 * Otherwise, we will race with the network stack appending new data
1541 * or records onto the socket buffer by using inconsistent/stale
1542 * versions of the field, possibly resulting in socket buffer
1543 * corruption.
1544 *
1545 * By holding the high-level sblock(), we prevent simultaneous
1546 * readers from pulling off the front of the socket buffer.
1547 */
1548 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1549 if (uio->uio_td)
1550 uio->uio_td->td_proc->p_stats->p_ru.ru_msgrcv++;
1551 KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb"));
1552 SBLASTRECORDCHK(&so->so_rcv);
1553 SBLASTMBUFCHK(&so->so_rcv);
1554 nextrecord = m->m_nextpkt;
1555 if (pr->pr_flags & PR_ADDR) {
1556 KASSERT(m->m_type == MT_SONAME,
1557 ("m->m_type == %d", m->m_type));
1558 orig_resid = 0;
1559 if (psa != NULL)
1560 *psa = sodupsockaddr(mtod(m, struct sockaddr *),
1561 M_NOWAIT);
1562 if (flags & MSG_PEEK) {
1563 m = m->m_next;
1564 } else {
1565 sbfree(&so->so_rcv, m);
1566 so->so_rcv.sb_mb = m_free(m);
1567 m = so->so_rcv.sb_mb;
1568 sockbuf_pushsync(&so->so_rcv, nextrecord);
1569 }
1570 }
1571
1572 /*
1573 * Process one or more MT_CONTROL mbufs present before any data mbufs
1574 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
1575 * just copy the data; if !MSG_PEEK, we call into the protocol to
1576 * perform externalization (or freeing if controlp == NULL).
1577 */
1578 if (m != NULL && m->m_type == MT_CONTROL) {
1579 struct mbuf *cm = NULL, *cmn;
1580 struct mbuf **cme = &cm;
1581
1582 do {
1583 if (flags & MSG_PEEK) {
1584 if (controlp != NULL) {
1585 *controlp = m_copy(m, 0, m->m_len);
1586 controlp = &(*controlp)->m_next;
1587 }
1588 m = m->m_next;
1589 } else {
1590 sbfree(&so->so_rcv, m);
1591 so->so_rcv.sb_mb = m->m_next;
1592 m->m_next = NULL;
1593 *cme = m;
1594 cme = &(*cme)->m_next;
1595 m = so->so_rcv.sb_mb;
1596 }
1597 } while (m != NULL && m->m_type == MT_CONTROL);
1598 if ((flags & MSG_PEEK) == 0)
1599 sockbuf_pushsync(&so->so_rcv, nextrecord);
1600 while (cm != NULL) {
1601 cmn = cm->m_next;
1602 cm->m_next = NULL;
1603 if (pr->pr_domain->dom_externalize != NULL) {
1604 SOCKBUF_UNLOCK(&so->so_rcv);
1605 error = (*pr->pr_domain->dom_externalize)
1606 (cm, controlp);
1607 SOCKBUF_LOCK(&so->so_rcv);
1608 } else if (controlp != NULL)
1609 *controlp = cm;
1610 else
1611 m_freem(cm);
1612 if (controlp != NULL) {
1613 orig_resid = 0;
1614 while (*controlp != NULL)
1615 controlp = &(*controlp)->m_next;
1616 }
1617 cm = cmn;
1618 }
1619 if (m != NULL)
1620 nextrecord = so->so_rcv.sb_mb->m_nextpkt;
1621 else
1622 nextrecord = so->so_rcv.sb_mb;
1623 orig_resid = 0;
1624 }
1625 if (m != NULL) {
1626 if ((flags & MSG_PEEK) == 0) {
1627 KASSERT(m->m_nextpkt == nextrecord,
1628 ("soreceive: post-control, nextrecord !sync"));
1629 if (nextrecord == NULL) {
1630 KASSERT(so->so_rcv.sb_mb == m,
1631 ("soreceive: post-control, sb_mb!=m"));
1632 KASSERT(so->so_rcv.sb_lastrecord == m,
1633 ("soreceive: post-control, lastrecord!=m"));
1634 }
1635 }
1636 type = m->m_type;
1637 if (type == MT_OOBDATA)
1638 flags |= MSG_OOB;
1639 } else {
1640 if ((flags & MSG_PEEK) == 0) {
1641 KASSERT(so->so_rcv.sb_mb == nextrecord,
1642 ("soreceive: sb_mb != nextrecord"));
1643 if (so->so_rcv.sb_mb == NULL) {
1644 KASSERT(so->so_rcv.sb_lastrecord == NULL,
1645				    ("soreceive: sb_lastrecord != NULL"));
1646 }
1647 }
1648 }
1649 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1650 SBLASTRECORDCHK(&so->so_rcv);
1651 SBLASTMBUFCHK(&so->so_rcv);
1652
1653 /*
1654 * Now continue to read any data mbufs off of the head of the socket
1655 * buffer until the read request is satisfied. Note that 'type' is
1656 * used to store the type of any mbuf reads that have happened so far
1657 * such that soreceive() can stop reading if the type changes, which
1658 * causes soreceive() to return only one of regular data and inline
1659 * out-of-band data in a single socket receive operation.
1660 */
1661 moff = 0;
1662 offset = 0;
1663 while (m != NULL && uio->uio_resid > 0 && error == 0) {
1664 /*
1665 * If the type of mbuf has changed since the last mbuf
1666 * examined ('type'), end the receive operation.
1667 */
1668 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1669 if (m->m_type == MT_OOBDATA) {
1670 if (type != MT_OOBDATA)
1671 break;
1672 } else if (type == MT_OOBDATA)
1673 break;
1674 else
1675 KASSERT(m->m_type == MT_DATA,
1676 ("m->m_type == %d", m->m_type));
1677 so->so_rcv.sb_state &= ~SBS_RCVATMARK;
1678 len = uio->uio_resid;
1679 if (so->so_oobmark && len > so->so_oobmark - offset)
1680 len = so->so_oobmark - offset;
1681 if (len > m->m_len - moff)
1682 len = m->m_len - moff;
1683 /*
1684 * If mp is set, just pass back the mbufs. Otherwise copy
1685 * them out via the uio, then free. Sockbuf must be
1686 * consistent here (points to current mbuf, it points to next
1687		 * consistent here (it points to the current mbuf, which points to
1688		 * the next record) when we drop priority; we must note any additions
1689 */
1690 if (mp == NULL) {
1691 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1692 SBLASTRECORDCHK(&so->so_rcv);
1693 SBLASTMBUFCHK(&so->so_rcv);
1694 SOCKBUF_UNLOCK(&so->so_rcv);
1695#ifdef ZERO_COPY_SOCKETS
1696 if (so_zero_copy_receive) {
1697 int disposable;
1698
1699 if ((m->m_flags & M_EXT)
1700 && (m->m_ext.ext_type == EXT_DISPOSABLE))
1701 disposable = 1;
1702 else
1703 disposable = 0;
1704
1705 error = uiomoveco(mtod(m, char *) + moff,
1706 (int)len, uio,
1707 disposable);
1708 } else
1709#endif /* ZERO_COPY_SOCKETS */
1710 error = uiomove(mtod(m, char *) + moff, (int)len, uio);
1711 SOCKBUF_LOCK(&so->so_rcv);
1712 if (error) {
1713 /*
1714 * The MT_SONAME mbuf has already been removed
1715 * from the record, so it is necessary to
1716 * remove the data mbufs, if any, to preserve
1717 * the invariant in the case of PR_ADDR that
1718 * requires MT_SONAME mbufs at the head of
1719 * each record.
1720 */
1721 if (m && pr->pr_flags & PR_ATOMIC &&
1722 ((flags & MSG_PEEK) == 0))
1723 (void)sbdroprecord_locked(&so->so_rcv);
1724 goto release;
1725 }
1726 } else
1727 uio->uio_resid -= len;
1728 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1729 if (len == m->m_len - moff) {
1730 if (m->m_flags & M_EOR)
1731 flags |= MSG_EOR;
1732 if (flags & MSG_PEEK) {
1733 m = m->m_next;
1734 moff = 0;
1735 } else {
1736 nextrecord = m->m_nextpkt;
1737 sbfree(&so->so_rcv, m);
1738 if (mp != NULL) {
1739 *mp = m;
1740 mp = &m->m_next;
1741 so->so_rcv.sb_mb = m = m->m_next;
1742 *mp = NULL;
1743 } else {
1744 so->so_rcv.sb_mb = m_free(m);
1745 m = so->so_rcv.sb_mb;
1746 }
1747 sockbuf_pushsync(&so->so_rcv, nextrecord);
1748 SBLASTRECORDCHK(&so->so_rcv);
1749 SBLASTMBUFCHK(&so->so_rcv);
1750 }
1751 } else {
1752 if (flags & MSG_PEEK)
1753 moff += len;
1754 else {
1755 if (mp != NULL) {
1756 int copy_flag;
1757
1758 if (flags & MSG_DONTWAIT)
1759 copy_flag = M_DONTWAIT;
1760 else
1761 copy_flag = M_TRYWAIT;
1762 if (copy_flag == M_TRYWAIT)
1763 SOCKBUF_UNLOCK(&so->so_rcv);
1764 *mp = m_copym(m, 0, len, copy_flag);
1765 if (copy_flag == M_TRYWAIT)
1766 SOCKBUF_LOCK(&so->so_rcv);
1767 if (*mp == NULL) {
1768 /*
1769 * m_copym() couldn't
1770 * allocate an mbuf. Adjust
1771 * uio_resid back (it was
1772 * adjusted down by len
1773 * bytes, which we didn't end
1774 * up "copying" over).
1775 */
1776 uio->uio_resid += len;
1777 break;
1778 }
1779 }
1780 m->m_data += len;
1781 m->m_len -= len;
1782 so->so_rcv.sb_cc -= len;
1783 }
1784 }
1785 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1786 if (so->so_oobmark) {
1787 if ((flags & MSG_PEEK) == 0) {
1788 so->so_oobmark -= len;
1789 if (so->so_oobmark == 0) {
1790 so->so_rcv.sb_state |= SBS_RCVATMARK;
1791 break;
1792 }
1793 } else {
1794 offset += len;
1795 if (offset == so->so_oobmark)
1796 break;
1797 }
1798 }
1799 if (flags & MSG_EOR)
1800 break;
1801 /*
1802 * If the MSG_WAITALL flag is set (for non-atomic socket), we
1803 * must not quit until "uio->uio_resid == 0" or an error
1804 * termination. If a signal/timeout occurs, return with a
1805 * short count but without error. Keep sockbuf locked
1806 * against other readers.
1807 */
1808 while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
1809 !sosendallatonce(so) && nextrecord == NULL) {
1810 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1811 if (so->so_error || so->so_rcv.sb_state & SBS_CANTRCVMORE)
1812 break;
1813 /*
1814 * Notify the protocol that some data has been
1815 * drained before blocking.
1816 */
1817 if (pr->pr_flags & PR_WANTRCVD) {
1818 SOCKBUF_UNLOCK(&so->so_rcv);
1819 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
1820 SOCKBUF_LOCK(&so->so_rcv);
1821 }
1822 SBLASTRECORDCHK(&so->so_rcv);
1823 SBLASTMBUFCHK(&so->so_rcv);
1824 error = sbwait(&so->so_rcv);
1825 if (error)
1826 goto release;
1827 m = so->so_rcv.sb_mb;
1828 if (m != NULL)
1829 nextrecord = m->m_nextpkt;
1830 }
1831 }
1832
1833 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1834 if (m != NULL && pr->pr_flags & PR_ATOMIC) {
1835 flags |= MSG_TRUNC;
1836 if ((flags & MSG_PEEK) == 0)
1837 (void) sbdroprecord_locked(&so->so_rcv);
1838 }
1839 if ((flags & MSG_PEEK) == 0) {
1840 if (m == NULL) {
1841 /*
1842 * First part is an inline SB_EMPTY_FIXUP(). Second
1843 * part makes sure sb_lastrecord is up-to-date if
1844 * there is still data in the socket buffer.
1845 */
1846 so->so_rcv.sb_mb = nextrecord;
1847 if (so->so_rcv.sb_mb == NULL) {
1848 so->so_rcv.sb_mbtail = NULL;
1849 so->so_rcv.sb_lastrecord = NULL;
1850 } else if (nextrecord->m_nextpkt == NULL)
1851 so->so_rcv.sb_lastrecord = nextrecord;
1852 }
1853 SBLASTRECORDCHK(&so->so_rcv);
1854 SBLASTMBUFCHK(&so->so_rcv);
1855 /*
1856 * If soreceive() is being done from the socket callback,
1857		 * then we need not generate an ACK to the peer to update the window,
1858		 * since the ACK will be generated on return to TCP.
1859 */
1860 if (!(flags & MSG_SOCALLBCK) &&
1861 (pr->pr_flags & PR_WANTRCVD)) {
1862 SOCKBUF_UNLOCK(&so->so_rcv);
1863 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
1864 SOCKBUF_LOCK(&so->so_rcv);
1865 }
1866 }
1867 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1868 if (orig_resid == uio->uio_resid && orig_resid &&
1869 (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) {
1870 sbunlock(&so->so_rcv);
1871 goto restart;
1872 }
1873
1874 if (flagsp != NULL)
1875 *flagsp |= flags;
1876release:
1877 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1878 sbunlock(&so->so_rcv);
1879out:
1880 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1881 SOCKBUF_UNLOCK(&so->so_rcv);
1882 return (error);
1883}
1884
1885int
1886soreceive(so, psa, uio, mp0, controlp, flagsp)
1887 struct socket *so;
1888 struct sockaddr **psa;
1889 struct uio *uio;
1890 struct mbuf **mp0;
1891 struct mbuf **controlp;
1892 int *flagsp;
1893{
1894
1895 /* XXXRW: Temporary debugging. */
1896 KASSERT(so->so_proto->pr_usrreqs->pru_soreceive != soreceive,
1897 ("soreceive: protocol calls soreceive"));
1898
1899 return (so->so_proto->pr_usrreqs->pru_soreceive(so, psa, uio, mp0,
1900 controlp, flagsp));
1901}
1902
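/*
 * Illustrative sketch, not part of this file: a kernel consumer receiving
 * data through soreceive() into a kernel buffer.  psa, mp0 and controlp are
 * NULL, so only the data portion of each record is copied out.  The function
 * name is hypothetical.
 */
static int
example_kernel_recv(struct socket *so, void *buf, size_t len,
    struct thread *td)
{
	struct uio auio;
	struct iovec aiov;
	int flags;

	aiov.iov_base = buf;
	aiov.iov_len = len;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = 0;
	auio.uio_resid = len;
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_rw = UIO_READ;
	auio.uio_td = td;

	flags = MSG_DONTWAIT;	/* fail with EWOULDBLOCK rather than sleep */
	return (soreceive(so, NULL, &auio, NULL, NULL, &flags));
}
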
1903int
1904soshutdown(so, how)
1905 struct socket *so;
1906 int how;
1907{
1908 struct protosw *pr = so->so_proto;
1909
1910 if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
1911 return (EINVAL);
1912
1913 if (how != SHUT_WR)
1914 sorflush(so);
1915 if (how != SHUT_RD)
1916 return ((*pr->pr_usrreqs->pru_shutdown)(so));
1917 return (0);
1918}
1919
1920void
1921sorflush(so)
1922 struct socket *so;
1923{
1924 struct sockbuf *sb = &so->so_rcv;
1925 struct protosw *pr = so->so_proto;
1926 struct sockbuf asb;
1927
1928 /*
1929 * XXXRW: This is quite ugly. Previously, this code made a copy of
1930 * the socket buffer, then zero'd the original to clear the buffer
1931 * fields. However, with mutexes in the socket buffer, this causes
1932 * problems. We only clear the zeroable bits of the original;
1933 * however, we have to initialize and destroy the mutex in the copy
1934	 * so that dom_dispose() and sbrelease() can lock it as needed.
1935 */
1936 SOCKBUF_LOCK(sb);
1937 sb->sb_flags |= SB_NOINTR;
1938 (void) sblock(sb, M_WAITOK);
1939 /*
1940 * socantrcvmore_locked() drops the socket buffer mutex so that it
1941 * can safely perform wakeups. Re-acquire the mutex before
1942 * continuing.
1943 */
1944 socantrcvmore_locked(so);
1945 SOCKBUF_LOCK(sb);
1946 sbunlock(sb);
1947 /*
1948 * Invalidate/clear most of the sockbuf structure, but leave selinfo
1949 * and mutex data unchanged.
1950 */
1951 bzero(&asb, offsetof(struct sockbuf, sb_startzero));
1952 bcopy(&sb->sb_startzero, &asb.sb_startzero,
1953 sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
1954 bzero(&sb->sb_startzero,
1955 sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
1956 SOCKBUF_UNLOCK(sb);
1957
1958 SOCKBUF_LOCK_INIT(&asb, "so_rcv");
1959 if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
1960 (*pr->pr_domain->dom_dispose)(asb.sb_mb);
1961 sbrelease(&asb, so);
1962 SOCKBUF_LOCK_DESTROY(&asb);
1963}
1964
1965/*
1966 * Perhaps this routine, and sooptcopyout(), below, ought to come in an
1967 * additional variant to handle the case where the option value needs to be
1968 * some kind of integer, but not a specific size. In addition to their use
1969 * here, these functions are also called by the protocol-level pr_ctloutput()
1970 * routines.
1971 */
1972int
1973sooptcopyin(sopt, buf, len, minlen)
1974 struct sockopt *sopt;
1975 void *buf;
1976 size_t len;
1977 size_t minlen;
1978{
1979 size_t valsize;
1980
1981 /*
1982 * If the user gives us more than we wanted, we ignore it, but if we
1983 * don't get the minimum length the caller wants, we return EINVAL.
1984 * On success, sopt->sopt_valsize is set to however much we actually
1985 * retrieved.
1986 */
1987 if ((valsize = sopt->sopt_valsize) < minlen)
1988 return EINVAL;
1989 if (valsize > len)
1990 sopt->sopt_valsize = valsize = len;
1991
1992 if (sopt->sopt_td != NULL)
1993 return (copyin(sopt->sopt_val, buf, valsize));
1994
1995 bcopy(sopt->sopt_val, buf, valsize);
1996 return (0);
1997}
1998
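/*
 * Illustrative sketch, not part of this file: a protocol pr_ctloutput()
 * routine using sooptcopyin() to fetch a fixed-size integer option on the
 * set side.  The option number and the protocol state update are
 * hypothetical.
 */
static int
example_ctloutput_set(struct socket *so, struct sockopt *sopt)
{
	int error, optval;

	if (sopt->sopt_dir != SOPT_SET)
		return (ENOPROTOOPT);
	switch (sopt->sopt_name) {
	case 1:			/* hypothetical protocol-level option */
		error = sooptcopyin(sopt, &optval, sizeof optval,
		    sizeof optval);
		if (error)
			return (error);
		/* ... apply optval to the protocol control block ... */
		return (0);
	default:
		return (ENOPROTOOPT);
	}
}
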
1999/*
2000 * Kernel version of setsockopt(2).
2001 *
2002 * XXX: optlen is size_t, not socklen_t
2003 */
2004int
2005so_setsockopt(struct socket *so, int level, int optname, void *optval,
2006 size_t optlen)
2007{
2008 struct sockopt sopt;
2009
2010 sopt.sopt_level = level;
2011 sopt.sopt_name = optname;
2012 sopt.sopt_dir = SOPT_SET;
2013 sopt.sopt_val = optval;
2014 sopt.sopt_valsize = optlen;
2015 sopt.sopt_td = NULL;
2016 return (sosetopt(so, &sopt));
2017}
2018
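/*
 * Illustrative sketch, not part of this file: a kernel caller enabling
 * SO_KEEPALIVE on a socket it owns via so_setsockopt(), the in-kernel
 * equivalent of setsockopt(2).  The function name is hypothetical.
 */
static int
example_enable_keepalive(struct socket *so)
{
	int one = 1;

	return (so_setsockopt(so, SOL_SOCKET, SO_KEEPALIVE, &one,
	    sizeof(one)));
}
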
2019int
2020sosetopt(so, sopt)
2021 struct socket *so;
2022 struct sockopt *sopt;
2023{
2024 int error, optval;
2025 struct linger l;
2026 struct timeval tv;
2027 u_long val;
2028#ifdef MAC
2029 struct mac extmac;
2030#endif
2031
2032 error = 0;
2033 if (sopt->sopt_level != SOL_SOCKET) {
2034 if (so->so_proto && so->so_proto->pr_ctloutput)
2035 return ((*so->so_proto->pr_ctloutput)
2036 (so, sopt));
2037 error = ENOPROTOOPT;
2038 } else {
2039 switch (sopt->sopt_name) {
2040#ifdef INET
2041 case SO_ACCEPTFILTER:
2042 error = do_setopt_accept_filter(so, sopt);
2043 if (error)
2044 goto bad;
2045 break;
2046#endif
2047 case SO_LINGER:
2048 error = sooptcopyin(sopt, &l, sizeof l, sizeof l);
2049 if (error)
2050 goto bad;
2051
2052 SOCK_LOCK(so);
2053 so->so_linger = l.l_linger;
2054 if (l.l_onoff)
2055 so->so_options |= SO_LINGER;
2056 else
2057 so->so_options &= ~SO_LINGER;
2058 SOCK_UNLOCK(so);
2059 break;
2060
2061 case SO_DEBUG:
2062 case SO_KEEPALIVE:
2063 case SO_DONTROUTE:
2064 case SO_USELOOPBACK:
2065 case SO_BROADCAST:
2066 case SO_REUSEADDR:
2067 case SO_REUSEPORT:
2068 case SO_OOBINLINE:
2069 case SO_TIMESTAMP:
2070 case SO_BINTIME:
2071 case SO_NOSIGPIPE:
2072 error = sooptcopyin(sopt, &optval, sizeof optval,
2073 sizeof optval);
2074 if (error)
2075 goto bad;
2076 SOCK_LOCK(so);
2077 if (optval)
2078 so->so_options |= sopt->sopt_name;
2079 else
2080 so->so_options &= ~sopt->sopt_name;
2081 SOCK_UNLOCK(so);
2082 break;
2083
2084 case SO_SNDBUF:
2085 case SO_RCVBUF:
2086 case SO_SNDLOWAT:
2087 case SO_RCVLOWAT:
2088 error = sooptcopyin(sopt, &optval, sizeof optval,
2089 sizeof optval);
2090 if (error)
2091 goto bad;
2092
2093 /*
2094 * Values < 1 make no sense for any of these options,
2095 * so disallow them.
2096 */
2097 if (optval < 1) {
2098 error = EINVAL;
2099 goto bad;
2100 }
2101
2102 switch (sopt->sopt_name) {
2103 case SO_SNDBUF:
2104 case SO_RCVBUF:
2105 if (sbreserve(sopt->sopt_name == SO_SNDBUF ?
2106 &so->so_snd : &so->so_rcv, (u_long)optval,
2107 so, curthread) == 0) {
2108 error = ENOBUFS;
2109 goto bad;
2110 }
2111 (sopt->sopt_name == SO_SNDBUF ? &so->so_snd :
2112 &so->so_rcv)->sb_flags &= ~SB_AUTOSIZE;
2113 break;
2114
2115 /*
2116 * Make sure the low-water is never greater than the
2117 * high-water.
2118 */
2119 case SO_SNDLOWAT:
2120 SOCKBUF_LOCK(&so->so_snd);
2121 so->so_snd.sb_lowat =
2122 (optval > so->so_snd.sb_hiwat) ?
2123 so->so_snd.sb_hiwat : optval;
2124 SOCKBUF_UNLOCK(&so->so_snd);
2125 break;
2126 case SO_RCVLOWAT:
2127 SOCKBUF_LOCK(&so->so_rcv);
2128 so->so_rcv.sb_lowat =
2129 (optval > so->so_rcv.sb_hiwat) ?
2130 so->so_rcv.sb_hiwat : optval;
2131 SOCKBUF_UNLOCK(&so->so_rcv);
2132 break;
2133 }
2134 break;
2135
2136 case SO_SNDTIMEO:
2137 case SO_RCVTIMEO:
2138#ifdef COMPAT_IA32
2139 if (curthread->td_proc->p_sysent == &ia32_freebsd_sysvec) {
2140 struct timeval32 tv32;
2141
2142 error = sooptcopyin(sopt, &tv32, sizeof tv32,
2143 sizeof tv32);
2144 CP(tv32, tv, tv_sec);
2145 CP(tv32, tv, tv_usec);
2146 } else
2147#endif
2148 error = sooptcopyin(sopt, &tv, sizeof tv,
2149 sizeof tv);
2150 if (error)
2151 goto bad;
2152
2153 /* assert(hz > 0); */
2154 if (tv.tv_sec < 0 || tv.tv_sec > INT_MAX / hz ||
2155 tv.tv_usec < 0 || tv.tv_usec >= 1000000) {
2156 error = EDOM;
2157 goto bad;
2158 }
2159 /* assert(tick > 0); */
2160 /* assert(ULONG_MAX - INT_MAX >= 1000000); */
2161 val = (u_long)(tv.tv_sec * hz) + tv.tv_usec / tick;
2162 if (val > INT_MAX) {
2163 error = EDOM;
2164 goto bad;
2165 }
2166 if (val == 0 && tv.tv_usec != 0)
2167 val = 1;
2168
2169 switch (sopt->sopt_name) {
2170 case SO_SNDTIMEO:
2171 so->so_snd.sb_timeo = val;
2172 break;
2173 case SO_RCVTIMEO:
2174 so->so_rcv.sb_timeo = val;
2175 break;
2176 }
2177 break;
2178
2179 case SO_LABEL:
2180#ifdef MAC
2181 error = sooptcopyin(sopt, &extmac, sizeof extmac,
2182 sizeof extmac);
2183 if (error)
2184 goto bad;
2185 error = mac_setsockopt_label(sopt->sopt_td->td_ucred,
2186 so, &extmac);
2187#else
2188 error = EOPNOTSUPP;
2189#endif
2190 break;
2191
2192 default:
2193 error = ENOPROTOOPT;
2194 break;
2195 }
2196 if (error == 0 && so->so_proto != NULL &&
2197 so->so_proto->pr_ctloutput != NULL) {
2198 (void) ((*so->so_proto->pr_ctloutput)
2199 (so, sopt));
2200 }
2201 }
2202bad:
2203 return (error);
2204}
2205
2206/*
2207 * Helper routine for getsockopt.
2208 */
2209int
2210sooptcopyout(struct sockopt *sopt, const void *buf, size_t len)
2211{
2212 int error;
2213 size_t valsize;
2214
2215 error = 0;
2216
2217 /*
2218 * Documented get behavior is that we always return a value, possibly
2219 * truncated to fit in the user's buffer. Traditional behavior is
2220 * that we always tell the user precisely how much we copied, rather
2221 * than something useful like the total amount we had available for
2222 * her. Note that this interface is not idempotent; the entire
2223	 * answer must be generated ahead of time.
2224 */
2225 valsize = min(len, sopt->sopt_valsize);
2226 sopt->sopt_valsize = valsize;
2227 if (sopt->sopt_val != NULL) {
2228 if (sopt->sopt_td != NULL)
2229 error = copyout(buf, sopt->sopt_val, valsize);
2230 else
2231 bcopy(buf, sopt->sopt_val, valsize);
2232 }
2233 return (error);
2234}
2235
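/*
 * Illustrative sketch, not part of this file: the get side of a protocol
 * pr_ctloutput() routine returning an integer option with sooptcopyout(),
 * which truncates to the caller's buffer as described above.  The option
 * number and value are hypothetical.
 */
static int
example_ctloutput_get(struct socket *so, struct sockopt *sopt)
{
	int optval;

	if (sopt->sopt_dir != SOPT_GET || sopt->sopt_name != 1)
		return (ENOPROTOOPT);
	optval = 0;	/* ... read from the protocol control block ... */
	return (sooptcopyout(sopt, &optval, sizeof optval));
}
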
2236int
2237sogetopt(so, sopt)
2238 struct socket *so;
2239 struct sockopt *sopt;
2240{
2241 int error, optval;
2242 struct linger l;
2243 struct timeval tv;
2244#ifdef MAC
2245 struct mac extmac;
2246#endif
2247
2248 error = 0;
2249 if (sopt->sopt_level != SOL_SOCKET) {
2250 if (so->so_proto && so->so_proto->pr_ctloutput) {
2251 return ((*so->so_proto->pr_ctloutput)
2252 (so, sopt));
2253 } else
2254 return (ENOPROTOOPT);
2255 } else {
2256 switch (sopt->sopt_name) {
2257#ifdef INET
2258 case SO_ACCEPTFILTER:
2259 error = do_getopt_accept_filter(so, sopt);
2260 break;
2261#endif
2262 case SO_LINGER:
2263 SOCK_LOCK(so);
2264 l.l_onoff = so->so_options & SO_LINGER;
2265 l.l_linger = so->so_linger;
2266 SOCK_UNLOCK(so);
2267 error = sooptcopyout(sopt, &l, sizeof l);
2268 break;
2269
2270 case SO_USELOOPBACK:
2271 case SO_DONTROUTE:
2272 case SO_DEBUG:
2273 case SO_KEEPALIVE:
2274 case SO_REUSEADDR:
2275 case SO_REUSEPORT:
2276 case SO_BROADCAST:
2277 case SO_OOBINLINE:
2278 case SO_ACCEPTCONN:
2279 case SO_TIMESTAMP:
2280 case SO_BINTIME:
2281 case SO_NOSIGPIPE:
2282 optval = so->so_options & sopt->sopt_name;
2283integer:
2284 error = sooptcopyout(sopt, &optval, sizeof optval);
2285 break;
2286
2287 case SO_TYPE:
2288 optval = so->so_type;
2289 goto integer;
2290
2291 case SO_ERROR:
2292 SOCK_LOCK(so);
2293 optval = so->so_error;
2294 so->so_error = 0;
2295 SOCK_UNLOCK(so);
2296 goto integer;
2297
2298 case SO_SNDBUF:
2299 optval = so->so_snd.sb_hiwat;
2300 goto integer;
2301
2302 case SO_RCVBUF:
2303 optval = so->so_rcv.sb_hiwat;
2304 goto integer;
2305
2306 case SO_SNDLOWAT:
2307 optval = so->so_snd.sb_lowat;
2308 goto integer;
2309
2310 case SO_RCVLOWAT:
2311 optval = so->so_rcv.sb_lowat;
2312 goto integer;
2313
2314 case SO_SNDTIMEO:
2315 case SO_RCVTIMEO:
2316 optval = (sopt->sopt_name == SO_SNDTIMEO ?
2317 so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
2318
2319 tv.tv_sec = optval / hz;
2320 tv.tv_usec = (optval % hz) * tick;
2321#ifdef COMPAT_IA32
2322 if (curthread->td_proc->p_sysent == &ia32_freebsd_sysvec) {
2323 struct timeval32 tv32;
2324
2325 CP(tv, tv32, tv_sec);
2326 CP(tv, tv32, tv_usec);
2327 error = sooptcopyout(sopt, &tv32, sizeof tv32);
2328 } else
2329#endif
2330 error = sooptcopyout(sopt, &tv, sizeof tv);
2331 break;
2332
2333 case SO_LABEL:
2334#ifdef MAC
2335 error = sooptcopyin(sopt, &extmac, sizeof(extmac),
2336 sizeof(extmac));
2337 if (error)
2338 return (error);
2339 error = mac_getsockopt_label(sopt->sopt_td->td_ucred,
2340 so, &extmac);
2341 if (error)
2342 return (error);
2343 error = sooptcopyout(sopt, &extmac, sizeof extmac);
2344#else
2345 error = EOPNOTSUPP;
2346#endif
2347 break;
2348
2349 case SO_PEERLABEL:
2350#ifdef MAC
2351 error = sooptcopyin(sopt, &extmac, sizeof(extmac),
2352 sizeof(extmac));
2353 if (error)
2354 return (error);
2355 error = mac_getsockopt_peerlabel(
2356 sopt->sopt_td->td_ucred, so, &extmac);
2357 if (error)
2358 return (error);
2359 error = sooptcopyout(sopt, &extmac, sizeof extmac);
2360#else
2361 error = EOPNOTSUPP;
2362#endif
2363 break;
2364
2365 case SO_LISTENQLIMIT:
2366 optval = so->so_qlimit;
2367 goto integer;
2368
2369 case SO_LISTENQLEN:
2370 optval = so->so_qlen;
2371 goto integer;
2372
2373 case SO_LISTENINCQLEN:
2374 optval = so->so_incqlen;
2375 goto integer;
2376
2377 default:
2378 error = ENOPROTOOPT;
2379 break;
2380 }
2381 return (error);
2382 }
2383}
2384
2385/* XXX; prepare mbuf for (__FreeBSD__ < 3) routines. */
2386int
2387soopt_getm(struct sockopt *sopt, struct mbuf **mp)
2388{
2389 struct mbuf *m, *m_prev;
2390 int sopt_size = sopt->sopt_valsize;
2391
2392 MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA);
2393 if (m == NULL)
2394 return ENOBUFS;
2395 if (sopt_size > MLEN) {
2396 MCLGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT);
2397 if ((m->m_flags & M_EXT) == 0) {
2398 m_free(m);
2399 return ENOBUFS;
2400 }
2401 m->m_len = min(MCLBYTES, sopt_size);
2402 } else {
2403 m->m_len = min(MLEN, sopt_size);
2404 }
2405 sopt_size -= m->m_len;
2406 *mp = m;
2407 m_prev = m;
2408
2409 while (sopt_size) {
2410 MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA);
2411 if (m == NULL) {
2412 m_freem(*mp);
2413 return ENOBUFS;
2414 }
2415 if (sopt_size > MLEN) {
2416 MCLGET(m, sopt->sopt_td != NULL ? M_TRYWAIT :
2417 M_DONTWAIT);
2418 if ((m->m_flags & M_EXT) == 0) {
2419 m_freem(m);
2420 m_freem(*mp);
2421 return ENOBUFS;
2422 }
2423 m->m_len = min(MCLBYTES, sopt_size);
2424 } else {
2425 m->m_len = min(MLEN, sopt_size);
2426 }
2427 sopt_size -= m->m_len;
2428 m_prev->m_next = m;
2429 m_prev = m;
2430 }
2431 return (0);
2432}
2433
2434/* XXX; copyin sopt data into mbuf chain for (__FreeBSD__ < 3) routines. */
2435int
2436soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
2437{
2438 struct mbuf *m0 = m;
2439
2440 if (sopt->sopt_val == NULL)
2441 return (0);
2442 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
2443 if (sopt->sopt_td != NULL) {
2444 int error;
2445
2446 error = copyin(sopt->sopt_val, mtod(m, char *),
2447 m->m_len);
2448 if (error != 0) {
2449 m_freem(m0);
2450 return(error);
2451 }
2452 } else
2453 bcopy(sopt->sopt_val, mtod(m, char *), m->m_len);
2454 sopt->sopt_valsize -= m->m_len;
2455 sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
2456 m = m->m_next;
2457 }
2458	if (m != NULL) /* should have been allocated large enough by ip6_sooptmcopyin() */
2459 panic("ip6_sooptmcopyin");
2460 return (0);
2461}
2462
2463/* XXX; copyout mbuf chain data into soopt for (__FreeBSD__ < 3) routines. */
2464int
2465soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
2466{
2467 struct mbuf *m0 = m;
2468 size_t valsize = 0;
2469
2470 if (sopt->sopt_val == NULL)
2471 return (0);
2472 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
2473 if (sopt->sopt_td != NULL) {
2474 int error;
2475
2476 error = copyout(mtod(m, char *), sopt->sopt_val,
2477 m->m_len);
2478 if (error != 0) {
2479 m_freem(m0);
2480 return(error);
2481 }
2482 } else
2483 bcopy(mtod(m, char *), sopt->sopt_val, m->m_len);
2484 sopt->sopt_valsize -= m->m_len;
2485 sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
2486 valsize += m->m_len;
2487 m = m->m_next;
2488 }
2489 if (m != NULL) {
2490		/* a large enough soopt buffer should be given from user-land */
2491 m_freem(m0);
2492 return(EINVAL);
2493 }
2494 sopt->sopt_valsize = valsize;
2495 return (0);
2496}
2497
2498/*
2499 * sohasoutofband(): protocol notifies socket layer of the arrival of new
2500 * out-of-band data, which will then notify socket consumers.
2501 */
2502void
2503sohasoutofband(so)
2504 struct socket *so;
2505{
2506 if (so->so_sigio != NULL)
2507 pgsigio(&so->so_sigio, SIGURG, 0);
2508 selwakeuppri(&so->so_rcv.sb_sel, PSOCK);
2509}
2510
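/*
 * Illustrative sketch, not part of this file: a protocol input path, after
 * queueing urgent data and recording so_oobmark, would call
 * sohasoutofband() to post SIGURG and wake select()/poll() waiters.  The
 * function name is hypothetical.
 */
static void
example_urgent_input(struct socket *so)
{
	/* ... protocol queues the OOB byte and sets so->so_oobmark ... */
	sohasoutofband(so);
}
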
2511int
2512sopoll(struct socket *so, int events, struct ucred *active_cred,
2513 struct thread *td)
2514{
2515
2516 /* XXXRW: Temporary debugging. */
2517 KASSERT(so->so_proto->pr_usrreqs->pru_sopoll != sopoll,
2518 ("sopoll: protocol calls sopoll"));
2519
2520 return (so->so_proto->pr_usrreqs->pru_sopoll(so, events, active_cred,
2521 td));
2522}
2523
2524int
2525sopoll_generic(struct socket *so, int events, struct ucred *active_cred,
2526 struct thread *td)
2527{
2528 int revents = 0;
2529
2530 SOCKBUF_LOCK(&so->so_snd);
2531 SOCKBUF_LOCK(&so->so_rcv);
2532 if (events & (POLLIN | POLLRDNORM))
2533 if (soreadable(so))
2534 revents |= events & (POLLIN | POLLRDNORM);
2535
2536 if (events & POLLINIGNEOF)
2537 if (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat ||
2538 !TAILQ_EMPTY(&so->so_comp) || so->so_error)
2539 revents |= POLLINIGNEOF;
2540
2541 if (events & (POLLOUT | POLLWRNORM))
2542 if (sowriteable(so))
2543 revents |= events & (POLLOUT | POLLWRNORM);
2544
2545 if (events & (POLLPRI | POLLRDBAND))
2546 if (so->so_oobmark || (so->so_rcv.sb_state & SBS_RCVATMARK))
2547 revents |= events & (POLLPRI | POLLRDBAND);
2548
2549 if (revents == 0) {
2550 if (events &
2551 (POLLIN | POLLINIGNEOF | POLLPRI | POLLRDNORM |
2552 POLLRDBAND)) {
2553 selrecord(td, &so->so_rcv.sb_sel);
2554 so->so_rcv.sb_flags |= SB_SEL;
2555 }
2556
2557 if (events & (POLLOUT | POLLWRNORM)) {
2558 selrecord(td, &so->so_snd.sb_sel);
2559 so->so_snd.sb_flags |= SB_SEL;
2560 }
2561 }
2562
2563 SOCKBUF_UNLOCK(&so->so_rcv);
2564 SOCKBUF_UNLOCK(&so->so_snd);
2565 return (revents);
2566}
2567
2568int
2569soo_kqfilter(struct file *fp, struct knote *kn)
2570{
2571 struct socket *so = kn->kn_fp->f_data;
2572 struct sockbuf *sb;
2573
2574 switch (kn->kn_filter) {
2575 case EVFILT_READ:
2576 if (so->so_options & SO_ACCEPTCONN)
2577 kn->kn_fop = &solisten_filtops;
2578 else
2579 kn->kn_fop = &soread_filtops;
2580 sb = &so->so_rcv;
2581 break;
2582 case EVFILT_WRITE:
2583 kn->kn_fop = &sowrite_filtops;
2584 sb = &so->so_snd;
2585 break;
2586 default:
2587 return (EINVAL);
2588 }
2589
2590 SOCKBUF_LOCK(sb);
2591 knlist_add(&sb->sb_sel.si_note, kn, 1);
2592 sb->sb_flags |= SB_KNOTE;
2593 SOCKBUF_UNLOCK(sb);
2594 return (0);
2595}
2596
2597/*
2598 * Some routines that return EOPNOTSUPP for entry points that are not
2599 * supported by a protocol. Fill in as needed.
2600 */
2601int
2602pru_accept_notsupp(struct socket *so, struct sockaddr **nam)
2603{
2604 return EOPNOTSUPP;
2605}
2606
2607int
2608pru_attach_notsupp(struct socket *so, int proto, struct thread *td)
2609{
2610 return EOPNOTSUPP;
2611}
2612
2613int
2614pru_bind_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td)
2615{
2616 return EOPNOTSUPP;
2617}
2618
2619int
2620pru_connect_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td)
2621{
2622 return EOPNOTSUPP;
2623}
2624
2625int
2626pru_connect2_notsupp(struct socket *so1, struct socket *so2)
2627{
2628 return EOPNOTSUPP;
2629}
2630
2631int
2632pru_control_notsupp(struct socket *so, u_long cmd, caddr_t data,
2633 struct ifnet *ifp, struct thread *td)
2634{
2635 return EOPNOTSUPP;
2636}
2637
2638int
2639pru_disconnect_notsupp(struct socket *so)
2640{
2641 return EOPNOTSUPP;
2642}
2643
2644int
2645pru_listen_notsupp(struct socket *so, int backlog, struct thread *td)
2646{
2647 return EOPNOTSUPP;
2648}
2649
2650int
2651pru_peeraddr_notsupp(struct socket *so, struct sockaddr **nam)
2652{
2653 return EOPNOTSUPP;
2654}
2655
2656int
2657pru_rcvd_notsupp(struct socket *so, int flags)
2658{
2659 return EOPNOTSUPP;
2660}
2661
2662int
2663pru_rcvoob_notsupp(struct socket *so, struct mbuf *m, int flags)
2664{
2665 return EOPNOTSUPP;
2666}
2667
2668int
2669pru_send_notsupp(struct socket *so, int flags, struct mbuf *m,
2670 struct sockaddr *addr, struct mbuf *control, struct thread *td)
2671{
2672 return EOPNOTSUPP;
2673}
2674
2675/*
2676 * This isn't really a ``null'' operation, but it's the default one
2677 * and doesn't do anything destructive.
2678 */
2679int
2680pru_sense_null(struct socket *so, struct stat *sb)
2681{
2682 sb->st_blksize = so->so_snd.sb_hiwat;
2683 return 0;
2684}
2685
2686int
2687pru_shutdown_notsupp(struct socket *so)
2688{
2689 return EOPNOTSUPP;
2690}
2691
2692int
2693pru_sockaddr_notsupp(struct socket *so, struct sockaddr **nam)
2694{
2695 return EOPNOTSUPP;
2696}
2697
2698int
2699pru_sosend_notsupp(struct socket *so, struct sockaddr *addr, struct uio *uio,
2700 struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
2701{
2702 return EOPNOTSUPP;
2703}
2704
2705int
2706pru_soreceive_notsupp(struct socket *so, struct sockaddr **paddr,
2707 struct uio *uio, struct mbuf **mp0, struct mbuf **controlp,
2708 int *flagsp)
2709{
2710 return EOPNOTSUPP;
2711}
2712
2713int
2714pru_sopoll_notsupp(struct socket *so, int events, struct ucred *cred,
2715 struct thread *td)
2716{
2717 return EOPNOTSUPP;
2718}
2719
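/*
 * Illustrative sketch, not part of this file: a minimal protocol can wire
 * the operations it does not implement to the *_notsupp stubs above and
 * reuse the generic send/receive/poll paths.  The attach/detach/send
 * entries are protocol-specific and omitted here.
 */
static struct pr_usrreqs example_usrreqs = {
	.pru_accept =		pru_accept_notsupp,
	.pru_connect =		pru_connect_notsupp,
	.pru_connect2 =		pru_connect2_notsupp,
	.pru_listen =		pru_listen_notsupp,
	.pru_rcvd =		pru_rcvd_notsupp,
	.pru_rcvoob =		pru_rcvoob_notsupp,
	.pru_sense =		pru_sense_null,
	.pru_sosend =		sosend_generic,
	.pru_soreceive =	soreceive_generic,
	.pru_sopoll =		sopoll_generic,
	/* .pru_attach, .pru_detach, .pru_send, ...: protocol-specific. */
};
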
2720static void
2721filt_sordetach(struct knote *kn)
2722{
2723 struct socket *so = kn->kn_fp->f_data;
2724
2725 SOCKBUF_LOCK(&so->so_rcv);
2726 knlist_remove(&so->so_rcv.sb_sel.si_note, kn, 1);
2727 if (knlist_empty(&so->so_rcv.sb_sel.si_note))
2728 so->so_rcv.sb_flags &= ~SB_KNOTE;
2729 SOCKBUF_UNLOCK(&so->so_rcv);
2730}
2731
2732/*ARGSUSED*/
2733static int
2734filt_soread(struct knote *kn, long hint)
2735{
2736 struct socket *so;
2737
2738 so = kn->kn_fp->f_data;
2739 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2740
2741 kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
2742 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
2743 kn->kn_flags |= EV_EOF;
2744 kn->kn_fflags = so->so_error;
2745 return (1);
2746 } else if (so->so_error) /* temporary udp error */
2747 return (1);
2748 else if (kn->kn_sfflags & NOTE_LOWAT)
2749 return (kn->kn_data >= kn->kn_sdata);
2750 else
2751 return (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat);
2752}
2753
2754static void
2755filt_sowdetach(struct knote *kn)
2756{
2757 struct socket *so = kn->kn_fp->f_data;
2758
2759 SOCKBUF_LOCK(&so->so_snd);
2760 knlist_remove(&so->so_snd.sb_sel.si_note, kn, 1);
2761 if (knlist_empty(&so->so_snd.sb_sel.si_note))
2762 so->so_snd.sb_flags &= ~SB_KNOTE;
2763 SOCKBUF_UNLOCK(&so->so_snd);
2764}
2765
2766/*ARGSUSED*/
2767static int
2768filt_sowrite(struct knote *kn, long hint)
2769{
2770 struct socket *so;
2771
2772 so = kn->kn_fp->f_data;
2773 SOCKBUF_LOCK_ASSERT(&so->so_snd);
2774 kn->kn_data = sbspace(&so->so_snd);
2775 if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
2776 kn->kn_flags |= EV_EOF;
2777 kn->kn_fflags = so->so_error;
2778 return (1);
2779 } else if (so->so_error) /* temporary udp error */
2780 return (1);
2781 else if (((so->so_state & SS_ISCONNECTED) == 0) &&
2782 (so->so_proto->pr_flags & PR_CONNREQUIRED))
2783 return (0);
2784 else if (kn->kn_sfflags & NOTE_LOWAT)
2785 return (kn->kn_data >= kn->kn_sdata);
2786 else
2787 return (kn->kn_data >= so->so_snd.sb_lowat);
2788}
2789
2790/*ARGSUSED*/
2791static int
2792filt_solisten(struct knote *kn, long hint)
2793{
2794 struct socket *so = kn->kn_fp->f_data;
2795
2796 kn->kn_data = so->so_qlen;
2797 return (! TAILQ_EMPTY(&so->so_comp));
2798}
2799
2800int
2801socheckuid(struct socket *so, uid_t uid)
2802{
2803
2804 if (so == NULL)
2805 return (EPERM);
2806 if (so->so_cred->cr_uid != uid)
2807 return (EPERM);
2808 return (0);
2809}
2810
2811static int
2812sysctl_somaxconn(SYSCTL_HANDLER_ARGS)
2813{
2814 int error;
2815 int val;
2816
2817 val = somaxconn;
2818 error = sysctl_handle_int(oidp, &val, sizeof(int), req);
2819 if (error || !req->newptr )
2820 return (error);
2821
2822 if (val < 1 || val > USHRT_MAX)
2823 return (EINVAL);
2824
2825 somaxconn = val;
2826 return (0);
2827}
2828
2829/*
2830 * Primitive routines for operating on sockets.
2831 */
2832
2833/*
2834 * Procedures to manipulate state flags of socket
2835 * and do appropriate wakeups. Normal sequence from the
2836 * active (originating) side is that soisconnecting() is
2837 * called during processing of connect() call,
2838 * resulting in an eventual call to soisconnected() if/when the
2839 * connection is established. When the connection is torn down
2840 * soisdisconnecting() is called during processing of disconnect() call,
2841 * and soisdisconnected() is called when the connection to the peer
2842 * is totally severed. The semantics of these routines are such that
2843 * connectionless protocols can call soisconnected() and soisdisconnected()
2844 * only, bypassing the in-progress calls when setting up a ``connection''
2845 * takes no time.
2846 *
2847 * From the passive side, a socket is created with
2848 * two queues of sockets: so_incomp for connections in progress
2849 * and so_comp for connections already made and awaiting user acceptance.
2850 * As a protocol is preparing incoming connections, it creates a socket
2851 * structure queued on so_incomp by calling sonewconn(). When the connection
2852 * is established, soisconnected() is called, and transfers the
2853 * socket structure to so_comp, making it available to accept().
2854 *
2855 * If a socket is closed with sockets on either
2856 * so_incomp or so_comp, these sockets are dropped.
2857 *
2858 * If higher level protocols are implemented in
2859 * the kernel, the wakeups done here will sometimes
2860 * cause software-interrupt process scheduling.
2861 */
2862
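/*
 * Illustrative sketch, not part of this file: the active-side sequence
 * described above, collapsed into one place.  A real protocol makes these
 * transitions from its connect and input paths, not from a single
 * function; the helper name is hypothetical.
 */
static void
example_active_lifecycle(struct socket *so)
{
	soisconnecting(so);	/* connect() issued, handshake in progress */
	/* ... protocol completes its handshake ... */
	soisconnected(so);	/* wakes accept()/connect() waiters */
	/* ... much later, teardown begins ... */
	soisdisconnecting(so);	/* no more data may be sent or received */
	/* ... peer acknowledges the teardown ... */
	soisdisconnected(so);	/* connection fully severed */
}
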
2863void
2864soisconnecting(so)
2865 register struct socket *so;
2866{
2867
2868 SOCK_LOCK(so);
2869 so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
2870 so->so_state |= SS_ISCONNECTING;
2871 SOCK_UNLOCK(so);
2872}
2873
2874void
2875soisconnected(so)
2876 struct socket *so;
2877{
2878 struct socket *head;
2879
2880 ACCEPT_LOCK();
2881 SOCK_LOCK(so);
2882 so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING);
2883 so->so_state |= SS_ISCONNECTED;
2884 head = so->so_head;
2885 if (head != NULL && (so->so_qstate & SQ_INCOMP)) {
2886 if ((so->so_options & SO_ACCEPTFILTER) == 0) {
2887 SOCK_UNLOCK(so);
2888 TAILQ_REMOVE(&head->so_incomp, so, so_list);
2889 head->so_incqlen--;
2890 so->so_qstate &= ~SQ_INCOMP;
2891 TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
2892 head->so_qlen++;
2893 so->so_qstate |= SQ_COMP;
2894 ACCEPT_UNLOCK();
2895 sorwakeup(head);
2896 wakeup_one(&head->so_timeo);
2897 } else {
2898 ACCEPT_UNLOCK();
2899 so->so_upcall =
2900 head->so_accf->so_accept_filter->accf_callback;
2901 so->so_upcallarg = head->so_accf->so_accept_filter_arg;
2902 so->so_rcv.sb_flags |= SB_UPCALL;
2903 so->so_options &= ~SO_ACCEPTFILTER;
2904 SOCK_UNLOCK(so);
2905 so->so_upcall(so, so->so_upcallarg, M_DONTWAIT);
2906 }
2907 return;
2908 }
2909 SOCK_UNLOCK(so);
2910 ACCEPT_UNLOCK();
2911 wakeup(&so->so_timeo);
2912 sorwakeup(so);
2913 sowwakeup(so);
2914}
2915
2916void
2917soisdisconnecting(so)
2918 register struct socket *so;
2919{
2920
2921 /*
2922 * Note: This code assumes that SOCK_LOCK(so) and
2923 * SOCKBUF_LOCK(&so->so_rcv) are the same.
2924 */
2925 SOCKBUF_LOCK(&so->so_rcv);
2926 so->so_state &= ~SS_ISCONNECTING;
2927 so->so_state |= SS_ISDISCONNECTING;
2928 so->so_rcv.sb_state |= SBS_CANTRCVMORE;
2929 sorwakeup_locked(so);
2930 SOCKBUF_LOCK(&so->so_snd);
2931 so->so_snd.sb_state |= SBS_CANTSENDMORE;
2932 sowwakeup_locked(so);
2933 wakeup(&so->so_timeo);
2934}
2935
2936void
2937soisdisconnected(so)
2938 register struct socket *so;
2939{
2940
2941 /*
2942 * Note: This code assumes that SOCK_LOCK(so) and
2943 * SOCKBUF_LOCK(&so->so_rcv) are the same.
2944 */
2945 SOCKBUF_LOCK(&so->so_rcv);
2946 so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
2947 so->so_state |= SS_ISDISCONNECTED;
2948 so->so_rcv.sb_state |= SBS_CANTRCVMORE;
2949 sorwakeup_locked(so);
2950 SOCKBUF_LOCK(&so->so_snd);
2951 so->so_snd.sb_state |= SBS_CANTSENDMORE;
2952 sbdrop_locked(&so->so_snd, so->so_snd.sb_cc);
2953 sowwakeup_locked(so);
2954 wakeup(&so->so_timeo);
2955}
2956
2957/*
2958 * Make a copy of a sockaddr in a malloced buffer of type M_SONAME.
2959 */
2960struct sockaddr *
2961sodupsockaddr(const struct sockaddr *sa, int mflags)
2962{
2963 struct sockaddr *sa2;
2964
2965 sa2 = malloc(sa->sa_len, M_SONAME, mflags);
2966 if (sa2)
2967 bcopy(sa, sa2, sa->sa_len);
2968 return sa2;
2969}
2970
2971/*
2972 * Create an external-format (``xsocket'') structure using the information
2973 * in the kernel-format socket structure pointed to by so. This is done
2974 * to reduce the spew of irrelevant information over this interface,
2975 * to isolate user code from changes in the kernel structure, and
2976 * potentially to provide information-hiding if we decide that
2977 * some of this information should be hidden from users.
2978 */
2979void
2980sotoxsocket(struct socket *so, struct xsocket *xso)
2981{
2982 xso->xso_len = sizeof *xso;
2983 xso->xso_so = so;
2984 xso->so_type = so->so_type;
2985 xso->so_options = so->so_options;
2986 xso->so_linger = so->so_linger;
2987 xso->so_state = so->so_state;
2988 xso->so_pcb = so->so_pcb;
2989 xso->xso_protocol = so->so_proto->pr_protocol;
2990 xso->xso_family = so->so_proto->pr_domain->dom_family;
2991 xso->so_qlen = so->so_qlen;
2992 xso->so_incqlen = so->so_incqlen;
2993 xso->so_qlimit = so->so_qlimit;
2994 xso->so_timeo = so->so_timeo;
2995 xso->so_error = so->so_error;
2996 xso->so_pgid = so->so_sigio ? so->so_sigio->sio_pgid : 0;
2997 xso->so_oobmark = so->so_oobmark;
2998 sbtoxsockbuf(&so->so_snd, &xso->so_snd);
2999 sbtoxsockbuf(&so->so_rcv, &xso->so_rcv);
3000 xso->so_uid = so->so_cred->cr_uid;
3001}
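
/*
 * Illustrative sketch, not part of this file: an exporter (for example a
 * sysctl handler, hypothetical here) fills an xsocket from a live socket
 * with sotoxsocket() and copies that out instead of exposing struct
 * socket directly.
 */
static int
example_export_socket(struct socket *so, struct sysctl_req *req)
{
	struct xsocket xso;

	sotoxsocket(so, &xso);
	return (SYSCTL_OUT(req, &xso, sizeof(xso)));
}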