uipc_socket.c (167799) uipc_socket.c (167895)
1/*-
2 * Copyright (c) 1982, 1986, 1988, 1990, 1993
3 * The Regents of the University of California.
4 * Copyright (c) 2004 The FreeBSD Foundation
5 * Copyright (c) 2004-2006 Robert N. M. Watson
6 * All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 * 4. Neither the name of the University nor the names of its contributors
17 * may be used to endorse or promote products derived from this software
18 * without specific prior written permission.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30 * SUCH DAMAGE.
31 *
32 * @(#)uipc_socket.c 8.3 (Berkeley) 4/15/94
33 */
34
35/*
36 * Comments on the socket life cycle:
37 *
38 * soalloc() sets up socket layer state for a socket, called only by
39 * socreate() and sonewconn(). Socket layer private.
40 *
41 * sodealloc() tears down socket layer state for a socket, called only by
42 * sofree(), socreate(), and sonewconn(). Socket layer private.
43 *
44 * pru_attach() associates protocol layer state with an allocated socket;
45 * called only once, may fail, aborting socket allocation. This is called
46 * from socreate() and sonewconn(). Socket layer private.
47 *
48 * pru_detach() disassociates protocol layer state from an attached socket,
49 * and will be called exactly once for sockets in which pru_attach() has
50 * been successfully called. If pru_attach() returned an error,
51 * pru_detach() will not be called. Socket layer private.
52 *
53 * pru_abort() and pru_close() notify the protocol layer that the last
54 * consumer of a socket is starting to tear down the socket, and that the
55 * protocol should terminate the connection. Historically, pru_abort() also
56 * detached protocol state from the socket state, but this is no longer the
57 * case.
58 *
59 * socreate() creates a socket and attaches protocol state. This is a public
60 * interface that may be used by socket layer consumers to create new
61 * sockets.
62 *
63 * sonewconn() creates a socket and attaches protocol state. This is a
64 * public interface that may be used by protocols to create new sockets when
65 * a new connection is received and will be available for accept() on a
66 * listen socket.
67 *
68 * soclose() destroys a socket after possibly waiting for it to disconnect.
69 * This is a public interface that socket consumers should use to close and
70 * release a socket when done with it.
71 *
72 * soabort() destroys a socket without waiting for it to disconnect (used
73 * only for incoming connections that are already partially or fully
74 * connected). This is used internally by the socket layer when clearing
75 * listen socket queues (due to overflow or close on the listen socket), but
76 * is also a public interface protocols may use to abort connections in
77 * their incomplete listen queues should they no longer be required. Sockets
78 * placed in completed connection listen queues should not be aborted for
79 * reasons described in the comment above the soclose() implementation. This
80 * is not a general purpose close routine, and except in the specific
81 * circumstances described here, should not be used.
82 *
83 * sofree() will free a socket and its protocol state if all references on
84 * the socket have been released, and is the interface used to attempt to
85 * free a socket when a reference is removed. It is a socket layer private
86 * interface.
87 *
88 * NOTE: In addition to socreate() and soclose(), which provide a single
89 * socket reference to the consumer to be managed as required, there are two
90 * calls to explicitly manage socket references, soref(), and sorele().
91 * Currently, these are generally required only when transitioning a socket
92 * from a listen queue to a file descriptor, in order to prevent garbage
93 * collection of the socket at an untimely moment. For a number of reasons,
94 * these interfaces are not preferred, and should be avoided.
95 */
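/*
 * A minimal usage sketch of the life cycle described above, assuming an
 * in-kernel consumer with a valid thread pointer; the function name and
 * error handling are hypothetical and the block is not compiled (#if 0).
 */
#if 0
static int
example_socket_consumer(struct thread *td)
{
	struct socket *so;
	int error;

	/* socreate() returns a socket holding a single reference. */
	error = socreate(PF_INET, &so, SOCK_DGRAM, 0, td->td_ucred, td);
	if (error != 0)
		return (error);

	/* ... exchange data via sosend()/soreceive() ... */

	/* soclose() drops that reference and tears the socket down. */
	return (soclose(so));
}
#endif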
96
97#include <sys/cdefs.h>
98__FBSDID("$FreeBSD: head/sys/kern/uipc_socket.c 167799 2007-03-22 13:21:24Z glebius $");
98__FBSDID("$FreeBSD: head/sys/kern/uipc_socket.c 167895 2007-03-26 08:59:03Z rwatson $");
99
100#include "opt_inet.h"
101#include "opt_mac.h"
102#include "opt_zero.h"
103#include "opt_compat.h"
104
105#include <sys/param.h>
106#include <sys/systm.h>
107#include <sys/fcntl.h>
108#include <sys/limits.h>
109#include <sys/lock.h>
110#include <sys/mac.h>
111#include <sys/malloc.h>
112#include <sys/mbuf.h>
113#include <sys/mutex.h>
114#include <sys/domain.h>
115#include <sys/file.h> /* for struct knote */
116#include <sys/kernel.h>
117#include <sys/event.h>
118#include <sys/eventhandler.h>
119#include <sys/poll.h>
120#include <sys/proc.h>
121#include <sys/protosw.h>
122#include <sys/socket.h>
123#include <sys/socketvar.h>
124#include <sys/resourcevar.h>
125#include <sys/signalvar.h>
126#include <sys/stat.h>
126#include <sys/sysctl.h>
127#include <sys/uio.h>
128#include <sys/jail.h>
129
130#include <security/mac/mac_framework.h>
131
132#include <vm/uma.h>
133
134#ifdef COMPAT_IA32
135#include <sys/mount.h>
136#include <compat/freebsd32/freebsd32.h>
137
138extern struct sysentvec ia32_freebsd_sysvec;
139#endif
140
141static int soreceive_rcvoob(struct socket *so, struct uio *uio,
142 int flags);
143
144static void filt_sordetach(struct knote *kn);
145static int filt_soread(struct knote *kn, long hint);
146static void filt_sowdetach(struct knote *kn);
147static int filt_sowrite(struct knote *kn, long hint);
148static int filt_solisten(struct knote *kn, long hint);
149
150static struct filterops solisten_filtops =
151 { 1, NULL, filt_sordetach, filt_solisten };
152static struct filterops soread_filtops =
153 { 1, NULL, filt_sordetach, filt_soread };
154static struct filterops sowrite_filtops =
155 { 1, NULL, filt_sowdetach, filt_sowrite };
156
157uma_zone_t socket_zone;
158so_gen_t so_gencnt; /* generation count for sockets */
159
160int maxsockets;
161
162MALLOC_DEFINE(M_SONAME, "soname", "socket name");
163MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
164
165static int somaxconn = SOMAXCONN;
166static int sysctl_somaxconn(SYSCTL_HANDLER_ARGS);
167/* XXX: we don't have SYSCTL_USHORT */
168SYSCTL_PROC(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLTYPE_UINT | CTLFLAG_RW,
169 0, sizeof(int), sysctl_somaxconn, "I", "Maximum pending socket connection "
170 "queue size");
171static int numopensockets;
172SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD,
173 &numopensockets, 0, "Number of open sockets");
174#ifdef ZERO_COPY_SOCKETS
175/* These aren't static because they're used in other files. */
176int so_zero_copy_send = 1;
177int so_zero_copy_receive = 1;
178SYSCTL_NODE(_kern_ipc, OID_AUTO, zero_copy, CTLFLAG_RD, 0,
179 "Zero copy controls");
180SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, receive, CTLFLAG_RW,
181 &so_zero_copy_receive, 0, "Enable zero copy receive");
182SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, send, CTLFLAG_RW,
183 &so_zero_copy_send, 0, "Enable zero copy send");
184#endif /* ZERO_COPY_SOCKETS */
185
186/*
187 * accept_mtx locks down per-socket fields relating to accept queues. See
188 * socketvar.h for an annotation of the protected fields of struct socket.
189 */
190struct mtx accept_mtx;
191MTX_SYSINIT(accept_mtx, &accept_mtx, "accept", MTX_DEF);
192
193/*
194 * so_global_mtx protects so_gencnt, numopensockets, and the per-socket
195 * so_gencnt field.
196 */
197static struct mtx so_global_mtx;
198MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_glabel", MTX_DEF);
199
200/*
201 * General IPC sysctl name space, used by sockets and a variety of other IPC
202 * types.
203 */
204SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW, 0, "IPC");
205
206/*
207 * Sysctl to get and set the maximum global sockets limit. Notify protocols
208 * of the change so that they can update their dependent limits as required.
209 */
210static int
211sysctl_maxsockets(SYSCTL_HANDLER_ARGS)
212{
213 int error, newmaxsockets;
214
215 newmaxsockets = maxsockets;
216 error = sysctl_handle_int(oidp, &newmaxsockets, sizeof(int), req);
217 if (error == 0 && req->newptr) {
218 if (newmaxsockets > maxsockets) {
219 maxsockets = newmaxsockets;
220 if (maxsockets > ((maxfiles / 4) * 3)) {
221 maxfiles = (maxsockets * 5) / 4;
222 maxfilesperproc = (maxfiles * 9) / 10;
223 }
224 EVENTHANDLER_INVOKE(maxsockets_change);
225 } else
226 error = EINVAL;
227 }
228 return (error);
229}
230
231SYSCTL_PROC(_kern_ipc, OID_AUTO, maxsockets, CTLTYPE_INT|CTLFLAG_RW,
232 &maxsockets, 0, sysctl_maxsockets, "IU",
233     "Maximum number of sockets available");
234
235/*
236 * Initialise maxsockets.
237 */
238static void init_maxsockets(void *ignored)
239{
240 TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets);
241 maxsockets = imax(maxsockets, imax(maxfiles, nmbclusters));
242}
243SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_maxsockets, NULL);
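/*
 * Sketch of how a protocol might consume the maxsockets_change event
 * invoked by sysctl_maxsockets() above; the example_* names and the pcb
 * zone are hypothetical, and the block is not compiled (#if 0).
 */
#if 0
static uma_zone_t example_pcb_zone;

static void
example_zone_change(void *tag)
{

	/* Track the new global socket limit in a protocol-private zone. */
	uma_zone_set_max(example_pcb_zone, maxsockets);
}

static void
example_proto_init(void)
{

	EVENTHANDLER_REGISTER(maxsockets_change, example_zone_change, NULL,
	    EVENTHANDLER_PRI_ANY);
}
#endif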
244
245/*
246 * Socket operation routines. These routines are called by the routines in
247 * sys_socket.c or from a system process, and implement the semantics of
248 * socket operations by switching out to the protocol specific routines.
249 */
250
251/*
252 * Get a socket structure from our zone, and initialize it. Note that it
253 * would probably be better to allocate socket and PCB at the same time, but
254 * I'm not convinced that all the protocols can be easily modified to do
255 * this.
256 *
257 * soalloc() returns a socket with a ref count of 0.
258 */
259static struct socket *
260soalloc(void)
261{
262 struct socket *so;
263
264 so = uma_zalloc(socket_zone, M_NOWAIT | M_ZERO);
265 if (so == NULL)
266 return (NULL);
267#ifdef MAC
268 if (mac_init_socket(so, M_NOWAIT) != 0) {
269 uma_zfree(socket_zone, so);
270 return (NULL);
271 }
272#endif
273 SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd");
274 SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv");
275 TAILQ_INIT(&so->so_aiojobq);
276 mtx_lock(&so_global_mtx);
277 so->so_gencnt = ++so_gencnt;
278 ++numopensockets;
279 mtx_unlock(&so_global_mtx);
280 return (so);
281}
282
283/*
284 * Free the storage associated with a socket at the socket layer, tear down
285 * locks, labels, etc. All protocol state is assumed already to have been
286 * torn down (and possibly never set up) by the caller.
287 */
288static void
289sodealloc(struct socket *so)
290{
291
292 KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count));
293 KASSERT(so->so_pcb == NULL, ("sodealloc(): so_pcb != NULL"));
294
295 mtx_lock(&so_global_mtx);
296 so->so_gencnt = ++so_gencnt;
297 --numopensockets; /* Could be below, but faster here. */
298 mtx_unlock(&so_global_mtx);
299 if (so->so_rcv.sb_hiwat)
300 (void)chgsbsize(so->so_cred->cr_uidinfo,
301 &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY);
302 if (so->so_snd.sb_hiwat)
303 (void)chgsbsize(so->so_cred->cr_uidinfo,
304 &so->so_snd.sb_hiwat, 0, RLIM_INFINITY);
305#ifdef INET
306	/* remove accept filter if one is present. */
307 if (so->so_accf != NULL)
308 do_setopt_accept_filter(so, NULL);
309#endif
310#ifdef MAC
311 mac_destroy_socket(so);
312#endif
313 crfree(so->so_cred);
314 SOCKBUF_LOCK_DESTROY(&so->so_snd);
315 SOCKBUF_LOCK_DESTROY(&so->so_rcv);
316 uma_zfree(socket_zone, so);
317}
318
319/*
320 * socreate returns a socket with a ref count of 1. The socket should be
321 * closed with soclose().
322 */
323int
324socreate(dom, aso, type, proto, cred, td)
325 int dom;
326 struct socket **aso;
327 int type;
328 int proto;
329 struct ucred *cred;
330 struct thread *td;
331{
332 struct protosw *prp;
333 struct socket *so;
334 int error;
335
336 if (proto)
337 prp = pffindproto(dom, proto, type);
338 else
339 prp = pffindtype(dom, type);
340
341 if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL ||
342 prp->pr_usrreqs->pru_attach == pru_attach_notsupp)
343 return (EPROTONOSUPPORT);
344
345 if (jailed(cred) && jail_socket_unixiproute_only &&
346 prp->pr_domain->dom_family != PF_LOCAL &&
347 prp->pr_domain->dom_family != PF_INET &&
348 prp->pr_domain->dom_family != PF_ROUTE) {
349 return (EPROTONOSUPPORT);
350 }
351
352 if (prp->pr_type != type)
353 return (EPROTOTYPE);
354 so = soalloc();
355 if (so == NULL)
356 return (ENOBUFS);
357
358 TAILQ_INIT(&so->so_incomp);
359 TAILQ_INIT(&so->so_comp);
360 so->so_type = type;
361 so->so_cred = crhold(cred);
362 so->so_proto = prp;
363#ifdef MAC
364 mac_create_socket(cred, so);
365#endif
366 knlist_init(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv),
367 NULL, NULL, NULL);
368 knlist_init(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd),
369 NULL, NULL, NULL);
370 so->so_count = 1;
371 /*
372 * Auto-sizing of socket buffers is managed by the protocols and
373 * the appropriate flags must be set in the pru_attach function.
374 */
375 error = (*prp->pr_usrreqs->pru_attach)(so, proto, td);
376 if (error) {
377 KASSERT(so->so_count == 1, ("socreate: so_count %d",
378 so->so_count));
379 so->so_count = 0;
380 sodealloc(so);
381 return (error);
382 }
383 *aso = so;
384 return (0);
385}
386
387#ifdef REGRESSION
388static int regression_sonewconn_earlytest = 1;
389SYSCTL_INT(_regression, OID_AUTO, sonewconn_earlytest, CTLFLAG_RW,
390 &regression_sonewconn_earlytest, 0, "Perform early sonewconn limit test");
391#endif
392
393/*
394 * When an attempt at a new connection is noted on a socket which accepts
395 * connections, sonewconn is called. If the connection is possible (subject
396 * to space constraints, etc.) then we allocate a new structure, properly
397 * linked into the data structure of the original socket, and return this.
398 * Connstatus may be 0, or SS_ISCONFIRMING, or SS_ISCONNECTED.
399 *
400 * Note: the ref count on the socket is 0 on return.
401 */
402struct socket *
403sonewconn(head, connstatus)
404 register struct socket *head;
405 int connstatus;
406{
407 register struct socket *so;
408 int over;
409
410 ACCEPT_LOCK();
411 over = (head->so_qlen > 3 * head->so_qlimit / 2);
412 ACCEPT_UNLOCK();
413#ifdef REGRESSION
414 if (regression_sonewconn_earlytest && over)
415#else
416 if (over)
417#endif
418 return (NULL);
419 so = soalloc();
420 if (so == NULL)
421 return (NULL);
422 if ((head->so_options & SO_ACCEPTFILTER) != 0)
423 connstatus = 0;
424 so->so_head = head;
425 so->so_type = head->so_type;
426 so->so_options = head->so_options &~ SO_ACCEPTCONN;
427 so->so_linger = head->so_linger;
428 so->so_state = head->so_state | SS_NOFDREF;
429 so->so_proto = head->so_proto;
430 so->so_cred = crhold(head->so_cred);
431#ifdef MAC
432 SOCK_LOCK(head);
433 mac_create_socket_from_socket(head, so);
434 SOCK_UNLOCK(head);
435#endif
436 knlist_init(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv),
437 NULL, NULL, NULL);
438 knlist_init(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd),
439 NULL, NULL, NULL);
440 if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat) ||
441 (*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) {
442 sodealloc(so);
443 return (NULL);
444 }
445 so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
446 so->so_snd.sb_lowat = head->so_snd.sb_lowat;
447 so->so_rcv.sb_timeo = head->so_rcv.sb_timeo;
448 so->so_snd.sb_timeo = head->so_snd.sb_timeo;
449 so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE;
450 so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE;
451 so->so_state |= connstatus;
452 ACCEPT_LOCK();
453 if (connstatus) {
454 TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
455 so->so_qstate |= SQ_COMP;
456 head->so_qlen++;
457 } else {
458 /*
459 * Keep removing sockets from the head until there's room for
460 * us to insert on the tail. In pre-locking revisions, this
461 * was a simple if(), but as we could be racing with other
462 * threads and soabort() requires dropping locks, we must
463 * loop waiting for the condition to be true.
464 */
465 while (head->so_incqlen > head->so_qlimit) {
466 struct socket *sp;
467 sp = TAILQ_FIRST(&head->so_incomp);
468 TAILQ_REMOVE(&head->so_incomp, sp, so_list);
469 head->so_incqlen--;
470 sp->so_qstate &= ~SQ_INCOMP;
471 sp->so_head = NULL;
472 ACCEPT_UNLOCK();
473 soabort(sp);
474 ACCEPT_LOCK();
475 }
476 TAILQ_INSERT_TAIL(&head->so_incomp, so, so_list);
477 so->so_qstate |= SQ_INCOMP;
478 head->so_incqlen++;
479 }
480 ACCEPT_UNLOCK();
481 if (connstatus) {
482 sorwakeup(head);
483 wakeup_one(&head->so_timeo);
484 }
485 return (so);
486}
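/*
 * Sketch of the protocol-side call into sonewconn(), assuming a listening
 * socket 'head' and a connection that has fully completed; the surrounding
 * protocol logic is omitted, the function name is hypothetical, and the
 * block is not compiled (#if 0).
 */
#if 0
static void
example_connection_completed(struct socket *head)
{
	struct socket *so;

	so = sonewconn(head, SS_ISCONNECTED);
	if (so == NULL) {
		/* Listen queue limit reached: drop the new connection. */
		return;
	}
	/* 'so' now sits on head's completed queue awaiting accept(). */
}
#endif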
487
488int
489sobind(so, nam, td)
490 struct socket *so;
491 struct sockaddr *nam;
492 struct thread *td;
493{
494
495 return ((*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td));
496}
497
498/*
499 * solisten() transitions a socket from a non-listening state to a listening
500 * state, but can also be used to update the listen queue depth on an
501 * existing listen socket. The protocol will call back into the sockets
502 * layer using solisten_proto_check() and solisten_proto() to check and set
503 * socket-layer listen state. Call backs are used so that the protocol can
504 * acquire both protocol and socket layer locks in whatever order is required
505 * by the protocol.
506 *
507 * Protocol implementors are advised to hold the socket lock across the
508 * socket-layer test and set to avoid races at the socket layer.
509 */
510int
511solisten(so, backlog, td)
512 struct socket *so;
513 int backlog;
514 struct thread *td;
515{
516
517 return ((*so->so_proto->pr_usrreqs->pru_listen)(so, backlog, td));
518}
519
520int
521solisten_proto_check(so)
522 struct socket *so;
523{
524
525 SOCK_LOCK_ASSERT(so);
526
527 if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING |
528 SS_ISDISCONNECTING))
529 return (EINVAL);
530 return (0);
531}
532
533void
534solisten_proto(so, backlog)
535 struct socket *so;
536 int backlog;
537{
538
539 SOCK_LOCK_ASSERT(so);
540
541 if (backlog < 0 || backlog > somaxconn)
542 backlog = somaxconn;
543 so->so_qlimit = backlog;
544 so->so_options |= SO_ACCEPTCONN;
545}
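/*
 * Sketch of the callback pattern described in the solisten() comment, as a
 * protocol's pru_listen method might implement it; the protocol-level
 * locking is elided, the function name is hypothetical, and the block is
 * not compiled (#if 0).
 */
#if 0
static int
example_pru_listen(struct socket *so, int backlog, struct thread *td)
{
	int error;

	/* Acquire protocol locks here, then the socket lock. */
	SOCK_LOCK(so);
	error = solisten_proto_check(so);
	if (error == 0)
		solisten_proto(so, backlog);
	SOCK_UNLOCK(so);
	/* Release protocol locks here. */
	return (error);
}
#endif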
546
547/*
548 * Attempt to free a socket. This should really be sotryfree().
549 *
550 * sofree() will succeed if:
551 *
552 * - There are no outstanding file descriptor references or related consumers
553 * (so_count == 0).
554 *
555 * - The socket has been closed by user space, if ever open (SS_NOFDREF).
556 *
557 * - The protocol does not have an outstanding strong reference on the socket
558 * (SS_PROTOREF).
559 *
560 * - The socket is not in a completed connection queue, where a process may
561 * already have been notified of it. If it were removed, the user process
562 * could block in accept() despite select() saying the socket was ready.
563 *
564 * Otherwise, it will quietly abort so that a future call to sofree(), when
565 * conditions are right, can succeed.
566 */
567void
568sofree(so)
569 struct socket *so;
570{
571 struct protosw *pr = so->so_proto;
572 struct socket *head;
573
574 ACCEPT_LOCK_ASSERT();
575 SOCK_LOCK_ASSERT(so);
576
577 if ((so->so_state & SS_NOFDREF) == 0 || so->so_count != 0 ||
578 (so->so_state & SS_PROTOREF) || (so->so_qstate & SQ_COMP)) {
579 SOCK_UNLOCK(so);
580 ACCEPT_UNLOCK();
581 return;
582 }
583
584 head = so->so_head;
585 if (head != NULL) {
586 KASSERT((so->so_qstate & SQ_COMP) != 0 ||
587 (so->so_qstate & SQ_INCOMP) != 0,
588 ("sofree: so_head != NULL, but neither SQ_COMP nor "
589 "SQ_INCOMP"));
590 KASSERT((so->so_qstate & SQ_COMP) == 0 ||
591 (so->so_qstate & SQ_INCOMP) == 0,
592 ("sofree: so->so_qstate is SQ_COMP and also SQ_INCOMP"));
593 TAILQ_REMOVE(&head->so_incomp, so, so_list);
594 head->so_incqlen--;
595 so->so_qstate &= ~SQ_INCOMP;
596 so->so_head = NULL;
597 }
598 KASSERT((so->so_qstate & SQ_COMP) == 0 &&
599 (so->so_qstate & SQ_INCOMP) == 0,
600 ("sofree: so_head == NULL, but still SQ_COMP(%d) or SQ_INCOMP(%d)",
601 so->so_qstate & SQ_COMP, so->so_qstate & SQ_INCOMP));
602 if (so->so_options & SO_ACCEPTCONN) {
603 KASSERT((TAILQ_EMPTY(&so->so_comp)), ("sofree: so_comp populated"));
604	KASSERT((TAILQ_EMPTY(&so->so_incomp)), ("sofree: so_incomp populated"));
605 }
606 SOCK_UNLOCK(so);
607 ACCEPT_UNLOCK();
608
609 if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
610 (*pr->pr_domain->dom_dispose)(so->so_rcv.sb_mb);
611 if (pr->pr_usrreqs->pru_detach != NULL)
612 (*pr->pr_usrreqs->pru_detach)(so);
613
614 /*
615 * From this point on, we assume that no other references to this
616 * socket exist anywhere else in the stack. Therefore, no locks need
617 * to be acquired or held.
618 *
619 * We used to do a lot of socket buffer and socket locking here, as
620 * well as invoke sorflush() and perform wakeups. The direct call to
621 * dom_dispose() and sbrelease_internal() are an inlining of what was
622 * necessary from sorflush().
623 *
624 * Notice that the socket buffer and kqueue state are torn down
625 * before calling pru_detach. This means that protocols should not
626 * assume they can perform socket wakeups, etc, in their detach
627 * code.
628 */
629 KASSERT((so->so_snd.sb_flags & SB_LOCK) == 0, ("sofree: snd sblock"));
630 KASSERT((so->so_rcv.sb_flags & SB_LOCK) == 0, ("sofree: rcv sblock"));
631 sbdestroy(&so->so_snd, so);
632 sbdestroy(&so->so_rcv, so);
633 knlist_destroy(&so->so_rcv.sb_sel.si_note);
634 knlist_destroy(&so->so_snd.sb_sel.si_note);
635 sodealloc(so);
636}
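/*
 * Sketch of the soref()/sorele() pairing mentioned at the top of this file,
 * as used when moving a socket from a completed listen queue to a file
 * descriptor; the accept()-side details and error handling are omitted, the
 * function name is hypothetical, and the block is not compiled (#if 0).
 */
#if 0
static void
example_accept_handoff(struct socket *head)
{
	struct socket *so;

	ACCEPT_LOCK();
	so = TAILQ_FIRST(&head->so_comp);
	SOCK_LOCK(so);
	soref(so);		/* Hold the socket across the handoff. */
	TAILQ_REMOVE(&head->so_comp, so, so_list);
	head->so_qlen--;
	so->so_qstate &= ~SQ_COMP;
	so->so_head = NULL;
	SOCK_UNLOCK(so);
	ACCEPT_UNLOCK();
	/* ... install the socket in a file descriptor; if that fails ... */
	ACCEPT_LOCK();
	SOCK_LOCK(so);
	sorele(so);		/* Drops both locks; may call sofree(). */
}
#endif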
637
638/*
639 * Close a socket on last file table reference removal. Initiate disconnect
640 * if connected. Free socket when disconnect complete.
641 *
642 * This function will sorele() the socket. Note that soclose() may be called
643 * prior to the ref count reaching zero. The actual socket structure will
644 * not be freed until the ref count reaches zero.
645 */
646int
647soclose(so)
648 struct socket *so;
649{
650 int error = 0;
651
652 KASSERT(!(so->so_state & SS_NOFDREF), ("soclose: SS_NOFDREF on enter"));
653
654 funsetown(&so->so_sigio);
655 if (so->so_state & SS_ISCONNECTED) {
656 if ((so->so_state & SS_ISDISCONNECTING) == 0) {
657 error = sodisconnect(so);
658 if (error)
659 goto drop;
660 }
661 if (so->so_options & SO_LINGER) {
662 if ((so->so_state & SS_ISDISCONNECTING) &&
663 (so->so_state & SS_NBIO))
664 goto drop;
665 while (so->so_state & SS_ISCONNECTED) {
666 error = tsleep(&so->so_timeo,
667 PSOCK | PCATCH, "soclos", so->so_linger * hz);
668 if (error)
669 break;
670 }
671 }
672 }
673
674drop:
675 if (so->so_proto->pr_usrreqs->pru_close != NULL)
676 (*so->so_proto->pr_usrreqs->pru_close)(so);
677 if (so->so_options & SO_ACCEPTCONN) {
678 struct socket *sp;
679 ACCEPT_LOCK();
680 while ((sp = TAILQ_FIRST(&so->so_incomp)) != NULL) {
681 TAILQ_REMOVE(&so->so_incomp, sp, so_list);
682 so->so_incqlen--;
683 sp->so_qstate &= ~SQ_INCOMP;
684 sp->so_head = NULL;
685 ACCEPT_UNLOCK();
686 soabort(sp);
687 ACCEPT_LOCK();
688 }
689 while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
690 TAILQ_REMOVE(&so->so_comp, sp, so_list);
691 so->so_qlen--;
692 sp->so_qstate &= ~SQ_COMP;
693 sp->so_head = NULL;
694 ACCEPT_UNLOCK();
695 soabort(sp);
696 ACCEPT_LOCK();
697 }
698 ACCEPT_UNLOCK();
699 }
700 ACCEPT_LOCK();
701 SOCK_LOCK(so);
702 KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF"));
703 so->so_state |= SS_NOFDREF;
704 sorele(so);
705 return (error);
706}
707
708/*
709 * soabort() is used to abruptly tear down a connection, such as when a
710 * resource limit is reached (listen queue depth exceeded), or if a listen
711 * socket is closed while there are sockets waiting to be accepted.
712 *
713 * This interface is tricky, because it is called on an unreferenced socket,
714 * and must be called only by a thread that has actually removed the socket
715 * from the listen queue it was on, or races with other threads are risked.
716 *
717 * This interface will call into the protocol code, so must not be called
718 * with any socket locks held. Protocols do call it while holding their own
719 * recursible protocol mutexes, but this is something that should be subject
720 * to review in the future.
721 */
722void
723soabort(so)
724 struct socket *so;
725{
726
727 /*
728 * To the extent possible, assert that no references to this
729 * socket are held. This is not quite the same as asserting that the
730 * current thread is responsible for arranging for no references, but
731 * is as close as we can get for now.
732 */
733 KASSERT(so->so_count == 0, ("soabort: so_count"));
734 KASSERT((so->so_state & SS_PROTOREF) == 0, ("soabort: SS_PROTOREF"));
735 KASSERT(so->so_state & SS_NOFDREF, ("soabort: !SS_NOFDREF"));
736	KASSERT((so->so_qstate & SQ_COMP) == 0, ("soabort: SQ_COMP"));
737	KASSERT((so->so_qstate & SQ_INCOMP) == 0, ("soabort: SQ_INCOMP"));
738
739 if (so->so_proto->pr_usrreqs->pru_abort != NULL)
740 (*so->so_proto->pr_usrreqs->pru_abort)(so);
741 ACCEPT_LOCK();
742 SOCK_LOCK(so);
743 sofree(so);
744}
745
746int
747soaccept(so, nam)
748 struct socket *so;
749 struct sockaddr **nam;
750{
751 int error;
752
753 SOCK_LOCK(so);
754 KASSERT((so->so_state & SS_NOFDREF) != 0, ("soaccept: !NOFDREF"));
755 so->so_state &= ~SS_NOFDREF;
756 SOCK_UNLOCK(so);
757 error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
758 return (error);
759}
760
761int
762soconnect(so, nam, td)
763 struct socket *so;
764 struct sockaddr *nam;
765 struct thread *td;
766{
767 int error;
768
769 if (so->so_options & SO_ACCEPTCONN)
770 return (EOPNOTSUPP);
771 /*
772 * If protocol is connection-based, can only connect once.
773 * Otherwise, if connected, try to disconnect first. This allows
774 * user to disconnect by connecting to, e.g., a null address.
775 */
776 if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
777 ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
778 (error = sodisconnect(so)))) {
779 error = EISCONN;
780 } else {
781 /*
782 * Prevent accumulated error from previous connection from
783 * biting us.
784 */
785 so->so_error = 0;
786 error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, td);
787 }
788
789 return (error);
790}
791
792int
793soconnect2(so1, so2)
794 struct socket *so1;
795 struct socket *so2;
796{
797
798 return ((*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2));
799}
800
801int
802sodisconnect(so)
803 struct socket *so;
804{
805 int error;
806
807 if ((so->so_state & SS_ISCONNECTED) == 0)
808 return (ENOTCONN);
809 if (so->so_state & SS_ISDISCONNECTING)
810 return (EALREADY);
811 error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
812 return (error);
813}
814
815#ifdef ZERO_COPY_SOCKETS
816struct so_zerocopy_stats{
817 int size_ok;
818 int align_ok;
819 int found_ifp;
820};
821struct so_zerocopy_stats so_zerocp_stats = {0,0,0};
822#include <netinet/in.h>
823#include <net/route.h>
824#include <netinet/in_pcb.h>
825#include <vm/vm.h>
826#include <vm/vm_page.h>
827#include <vm/vm_object.h>
828
829/*
830 * sosend_copyin() is only used if zero copy sockets are enabled. Otherwise
831 * sosend_dgram() and sosend_generic() use m_uiotombuf().
832 *
833 * sosend_copyin() accepts a uio and prepares an mbuf chain holding part or
834 * all of the data referenced by the uio. If desired, it uses zero-copy.
835 * *space will be updated to reflect data copied in.
836 *
837 * NB: If atomic I/O is requested, the caller must already have checked that
838 * space can hold resid bytes.
839 *
840 * NB: In the event of an error, the caller may need to free the partial
841 * chain pointed to by *mpp. The contents of both *uio and *space may be
842 * modified even in the case of an error.
843 */
844static int
845sosend_copyin(struct uio *uio, struct mbuf **retmp, int atomic, long *space,
846 int flags)
847{
848 struct mbuf *m, **mp, *top;
849 long len, resid;
850 int error;
851#ifdef ZERO_COPY_SOCKETS
852 int cow_send;
853#endif
854
855 *retmp = top = NULL;
856 mp = &top;
857 len = 0;
858 resid = uio->uio_resid;
859 error = 0;
860 do {
861#ifdef ZERO_COPY_SOCKETS
862 cow_send = 0;
863#endif /* ZERO_COPY_SOCKETS */
864 if (resid >= MINCLSIZE) {
865#ifdef ZERO_COPY_SOCKETS
866 if (top == NULL) {
867 m = m_gethdr(M_WAITOK, MT_DATA);
868 m->m_pkthdr.len = 0;
869 m->m_pkthdr.rcvif = NULL;
870 } else
871 m = m_get(M_WAITOK, MT_DATA);
872 if (so_zero_copy_send &&
873 resid>=PAGE_SIZE &&
874 *space>=PAGE_SIZE &&
875 uio->uio_iov->iov_len>=PAGE_SIZE) {
876 so_zerocp_stats.size_ok++;
877 so_zerocp_stats.align_ok++;
878 cow_send = socow_setup(m, uio);
879 len = cow_send;
880 }
881 if (!cow_send) {
882 m_clget(m, M_WAITOK);
883 len = min(min(MCLBYTES, resid), *space);
884 }
885#else /* ZERO_COPY_SOCKETS */
886 if (top == NULL) {
887 m = m_getcl(M_TRYWAIT, MT_DATA, M_PKTHDR);
888 m->m_pkthdr.len = 0;
889 m->m_pkthdr.rcvif = NULL;
890 } else
891 m = m_getcl(M_TRYWAIT, MT_DATA, 0);
892 len = min(min(MCLBYTES, resid), *space);
893#endif /* ZERO_COPY_SOCKETS */
894 } else {
895 if (top == NULL) {
896 m = m_gethdr(M_TRYWAIT, MT_DATA);
897 m->m_pkthdr.len = 0;
898 m->m_pkthdr.rcvif = NULL;
899
900 len = min(min(MHLEN, resid), *space);
901 /*
902 * For datagram protocols, leave room
903 * for protocol headers in first mbuf.
904 */
905 if (atomic && m && len < MHLEN)
906 MH_ALIGN(m, len);
907 } else {
908 m = m_get(M_TRYWAIT, MT_DATA);
909 len = min(min(MLEN, resid), *space);
910 }
911 }
912 if (m == NULL) {
913 error = ENOBUFS;
914 goto out;
915 }
916
917 *space -= len;
918#ifdef ZERO_COPY_SOCKETS
919 if (cow_send)
920 error = 0;
921 else
922#endif /* ZERO_COPY_SOCKETS */
923 error = uiomove(mtod(m, void *), (int)len, uio);
924 resid = uio->uio_resid;
925 m->m_len = len;
926 *mp = m;
927 top->m_pkthdr.len += len;
928 if (error)
929 goto out;
930 mp = &m->m_next;
931 if (resid <= 0) {
932 if (flags & MSG_EOR)
933 top->m_flags |= M_EOR;
934 break;
935 }
936 } while (*space > 0 && atomic);
937out:
938 *retmp = top;
939 return (error);
940}
941#endif /*ZERO_COPY_SOCKETS*/
942
943#define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
944
945int
946sosend_dgram(so, addr, uio, top, control, flags, td)
947 struct socket *so;
948 struct sockaddr *addr;
949 struct uio *uio;
950 struct mbuf *top;
951 struct mbuf *control;
952 int flags;
953 struct thread *td;
954{
955 long space, resid;
956 int clen = 0, error, dontroute;
957#ifdef ZERO_COPY_SOCKETS
958 int atomic = sosendallatonce(so) || top;
959#endif
960
961	KASSERT(so->so_type == SOCK_DGRAM, ("sosend_dgram: !SOCK_DGRAM"));
962	KASSERT(so->so_proto->pr_flags & PR_ATOMIC,
963	    ("sosend_dgram: !PR_ATOMIC"));
964
965 if (uio != NULL)
966 resid = uio->uio_resid;
967 else
968 resid = top->m_pkthdr.len;
969 /*
970 * In theory resid should be unsigned. However, space must be
971 * signed, as it might be less than 0 if we over-committed, and we
972 * must use a signed comparison of space and resid. On the other
973 * hand, a negative resid causes us to loop sending 0-length
974 * segments to the protocol.
975 *
976 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
977 * type sockets since that's an error.
978 */
979 if (resid < 0) {
980 error = EINVAL;
981 goto out;
982 }
983
984 dontroute =
985 (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0;
986 if (td != NULL)
987 td->td_proc->p_stats->p_ru.ru_msgsnd++;
988 if (control != NULL)
989 clen = control->m_len;
990
991 SOCKBUF_LOCK(&so->so_snd);
992 if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
993 SOCKBUF_UNLOCK(&so->so_snd);
994 error = EPIPE;
995 goto out;
996 }
997 if (so->so_error) {
998 error = so->so_error;
999 so->so_error = 0;
1000 SOCKBUF_UNLOCK(&so->so_snd);
1001 goto out;
1002 }
1003 if ((so->so_state & SS_ISCONNECTED) == 0) {
1004 /*
1005 * `sendto' and `sendmsg' are allowed on a connection-based
1006 * socket if it supports implied connect. Return ENOTCONN if
1007 * not connected and no address is supplied.
1008 */
1009 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
1010 (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
1011 if ((so->so_state & SS_ISCONFIRMING) == 0 &&
1012 !(resid == 0 && clen != 0)) {
1013 SOCKBUF_UNLOCK(&so->so_snd);
1014 error = ENOTCONN;
1015 goto out;
1016 }
1017 } else if (addr == NULL) {
1018 if (so->so_proto->pr_flags & PR_CONNREQUIRED)
1019 error = ENOTCONN;
1020 else
1021 error = EDESTADDRREQ;
1022 SOCKBUF_UNLOCK(&so->so_snd);
1023 goto out;
1024 }
1025 }
1026
1027 /*
1028 * Do we need MSG_OOB support in SOCK_DGRAM? Signs here may be a
1029 * problem and need fixing.
1030 */
1031 space = sbspace(&so->so_snd);
1032 if (flags & MSG_OOB)
1033 space += 1024;
1034 space -= clen;
1035 SOCKBUF_UNLOCK(&so->so_snd);
1036 if (resid > space) {
1037 error = EMSGSIZE;
1038 goto out;
1039 }
1040 if (uio == NULL) {
1041 resid = 0;
1042 if (flags & MSG_EOR)
1043 top->m_flags |= M_EOR;
1044 } else {
1045#ifdef ZERO_COPY_SOCKETS
1046 error = sosend_copyin(uio, &top, atomic, &space, flags);
1047 if (error)
1048 goto out;
1049#else
1050 /*
1051 * Copy the data from userland into a mbuf chain.
1052 * If no data is to be copied in, a single empty mbuf
1053 * is returned.
1054 */
1055 top = m_uiotombuf(uio, M_WAITOK, space, max_hdr,
1056 (M_PKTHDR | ((flags & MSG_EOR) ? M_EOR : 0)));
1057 if (top == NULL) {
1058 error = EFAULT; /* only possible error */
1059 goto out;
1060 }
1061 space -= resid - uio->uio_resid;
1062#endif
1063 resid = uio->uio_resid;
1064 }
1065 KASSERT(resid == 0, ("sosend_dgram: resid != 0"));
1066 /*
1067 * XXXRW: Frobbing SO_DONTROUTE here is even worse without sblock
1068 * than with.
1069 */
1070 if (dontroute) {
1071 SOCK_LOCK(so);
1072 so->so_options |= SO_DONTROUTE;
1073 SOCK_UNLOCK(so);
1074 }
1075 /*
1076 * XXX all the SBS_CANTSENDMORE checks previously done could be out
1077 * of date. We could have received a reset packet in an interrupt or
1078 * maybe we slept while doing page faults in uiomove() etc. We could
1079 * probably recheck again inside the locking protection here, but
1080 * there are probably other places that this also happens. We must
1081 * rethink this.
1082 */
1083 error = (*so->so_proto->pr_usrreqs->pru_send)(so,
1084 (flags & MSG_OOB) ? PRUS_OOB :
1085 /*
1086 * If the user set MSG_EOF, the protocol understands this flag, and
1087 * there is nothing left to send, then use PRU_SEND_EOF instead of PRU_SEND.
1088 */
1089 ((flags & MSG_EOF) &&
1090 (so->so_proto->pr_flags & PR_IMPLOPCL) &&
1091 (resid <= 0)) ?
1092 PRUS_EOF :
1093 /* If there is more to send set PRUS_MORETOCOME */
1094 (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
1095 top, addr, control, td);
1096 if (dontroute) {
1097 SOCK_LOCK(so);
1098 so->so_options &= ~SO_DONTROUTE;
1099 SOCK_UNLOCK(so);
1100 }
1101 clen = 0;
1102 control = NULL;
1103 top = NULL;
1104out:
1105 if (top != NULL)
1106 m_freem(top);
1107 if (control != NULL)
1108 m_freem(control);
1109 return (error);
1110}
1111
1112/*
1113 * Send on a socket. If send must go all at once and message is larger than
1114 * send buffering, then hard error. Lock against other senders. If must go
1115 * all at once and not enough room now, then inform user that this would
1116 * block and do nothing. Otherwise, if nonblocking, send as much as
1117 * possible. The data to be sent is described by "uio" if nonzero, otherwise
1118 * by the mbuf chain "top" (which must be null if uio is not). Data provided
1119 * in mbuf chain must be small enough to send all at once.
1120 *
1121 * Returns nonzero on error, timeout or signal; callers must check for short
1122 * counts if EINTR/ERESTART are returned. Data and control buffers are freed
1123 * on return.
1124 */
1125#define snderr(errno) { error = (errno); goto release; }
1126int
1127sosend_generic(so, addr, uio, top, control, flags, td)
1128 struct socket *so;
1129 struct sockaddr *addr;
1130 struct uio *uio;
1131 struct mbuf *top;
1132 struct mbuf *control;
1133 int flags;
1134 struct thread *td;
1135{
1136 long space, resid;
1137 int clen = 0, error, dontroute;
1138 int atomic = sosendallatonce(so) || top;
1139
1140 if (uio != NULL)
1141 resid = uio->uio_resid;
1142 else
1143 resid = top->m_pkthdr.len;
1144 /*
1145 * In theory resid should be unsigned. However, space must be
1146 * signed, as it might be less than 0 if we over-committed, and we
1147 * must use a signed comparison of space and resid. On the other
1148 * hand, a negative resid causes us to loop sending 0-length
1149 * segments to the protocol.
1150 *
1151 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
1152 * type sockets since that's an error.
1153 */
1154 if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
1155 error = EINVAL;
1156 goto out;
1157 }
1158
1159 dontroute =
1160 (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
1161 (so->so_proto->pr_flags & PR_ATOMIC);
1162 if (td != NULL)
1163 td->td_proc->p_stats->p_ru.ru_msgsnd++;
1164 if (control != NULL)
1165 clen = control->m_len;
1166
1167 SOCKBUF_LOCK(&so->so_snd);
1168restart:
1169 SOCKBUF_LOCK_ASSERT(&so->so_snd);
1170 error = sblock(&so->so_snd, SBLOCKWAIT(flags));
1171 if (error)
1172 goto out_locked;
1173 do {
1174 SOCKBUF_LOCK_ASSERT(&so->so_snd);
1175 if (so->so_snd.sb_state & SBS_CANTSENDMORE)
1176 snderr(EPIPE);
1177 if (so->so_error) {
1178 error = so->so_error;
1179 so->so_error = 0;
1180 goto release;
1181 }
1182 if ((so->so_state & SS_ISCONNECTED) == 0) {
1183 /*
1184 * `sendto' and `sendmsg' are allowed on a connection-
1185 * based socket if it supports implied connect.
1186 * Return ENOTCONN if not connected and no address is
1187 * supplied.
1188 */
1189 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
1190 (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
1191 if ((so->so_state & SS_ISCONFIRMING) == 0 &&
1192 !(resid == 0 && clen != 0))
1193 snderr(ENOTCONN);
1194 } else if (addr == NULL)
1195 snderr(so->so_proto->pr_flags & PR_CONNREQUIRED ?
1196 ENOTCONN : EDESTADDRREQ);
1197 }
1198 space = sbspace(&so->so_snd);
1199 if (flags & MSG_OOB)
1200 space += 1024;
1201 if ((atomic && resid > so->so_snd.sb_hiwat) ||
1202 clen > so->so_snd.sb_hiwat)
1203 snderr(EMSGSIZE);
1204 if (space < resid + clen &&
1205 (atomic || space < so->so_snd.sb_lowat || space < clen)) {
1206 if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO))
1207 snderr(EWOULDBLOCK);
1208 sbunlock(&so->so_snd);
1209 error = sbwait(&so->so_snd);
1210 if (error)
1211 goto out_locked;
1212 goto restart;
1213 }
1214 SOCKBUF_UNLOCK(&so->so_snd);
1215 space -= clen;
1216 do {
1217 if (uio == NULL) {
1218 resid = 0;
1219 if (flags & MSG_EOR)
1220 top->m_flags |= M_EOR;
1221 } else {
1222#ifdef ZERO_COPY_SOCKETS
1223 error = sosend_copyin(uio, &top, atomic,
1224 &space, flags);
1225 if (error != 0) {
1226 SOCKBUF_LOCK(&so->so_snd);
1227 goto release;
1228 }
1229#else
1230 /*
1231 * Copy the data from userland into a mbuf
1232 * chain. If no data is to be copied in,
1233 * a single empty mbuf is returned.
1234 */
1235 top = m_uiotombuf(uio, M_WAITOK, space,
1236 (atomic ? max_hdr : 0),
1237 (atomic ? M_PKTHDR : 0) |
1238 ((flags & MSG_EOR) ? M_EOR : 0));
1239 if (top == NULL) {
1240 SOCKBUF_LOCK(&so->so_snd);
1241 error = EFAULT; /* only possible error */
1242 goto release;
1243 }
1244 space -= resid - uio->uio_resid;
1245#endif
1246 resid = uio->uio_resid;
1247 }
1248 if (dontroute) {
1249 SOCK_LOCK(so);
1250 so->so_options |= SO_DONTROUTE;
1251 SOCK_UNLOCK(so);
1252 }
1253 /*
1254 * XXX all the SBS_CANTSENDMORE checks previously
1255 * done could be out of date. We could have received
1256 * a reset packet in an interrupt or maybe we slept
1257 * while doing page faults in uiomove() etc. We
1258 * could probably recheck again inside the locking
1259 * protection here, but there are probably other
1260 * places that this also happens. We must rethink
1261 * this.
1262 */
1263 error = (*so->so_proto->pr_usrreqs->pru_send)(so,
1264 (flags & MSG_OOB) ? PRUS_OOB :
1265 /*
1266 * If the user set MSG_EOF, the protocol understands
1267 * this flag, and there is nothing left to send, then
1268 * use PRU_SEND_EOF instead of PRU_SEND.
1269 */
1270 ((flags & MSG_EOF) &&
1271 (so->so_proto->pr_flags & PR_IMPLOPCL) &&
1272 (resid <= 0)) ?
1273 PRUS_EOF :
1274 /* If there is more to send set PRUS_MORETOCOME. */
1275 (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
1276 top, addr, control, td);
1277 if (dontroute) {
1278 SOCK_LOCK(so);
1279 so->so_options &= ~SO_DONTROUTE;
1280 SOCK_UNLOCK(so);
1281 }
1282 clen = 0;
1283 control = NULL;
1284 top = NULL;
1285 if (error) {
1286 SOCKBUF_LOCK(&so->so_snd);
1287 goto release;
1288 }
1289 } while (resid && space > 0);
1290 SOCKBUF_LOCK(&so->so_snd);
1291 } while (resid);
1292
1293release:
1294 SOCKBUF_LOCK_ASSERT(&so->so_snd);
1295 sbunlock(&so->so_snd);
1296out_locked:
1297 SOCKBUF_LOCK_ASSERT(&so->so_snd);
1298 SOCKBUF_UNLOCK(&so->so_snd);
1299out:
1300 if (top != NULL)
1301 m_freem(top);
1302 if (control != NULL)
1303 m_freem(control);
1304 return (error);
1305}
1306#undef snderr
1307
1308int
1309sosend(so, addr, uio, top, control, flags, td)
1310 struct socket *so;
1311 struct sockaddr *addr;
1312 struct uio *uio;
1313 struct mbuf *top;
1314 struct mbuf *control;
1315 int flags;
1316 struct thread *td;
1317{
1318
1319 /* XXXRW: Temporary debugging. */
1320 KASSERT(so->so_proto->pr_usrreqs->pru_sosend != sosend,
1321 ("sosend: protocol calls sosend"));
1322
1323 return (so->so_proto->pr_usrreqs->pru_sosend(so, addr, uio, top,
1324 control, flags, td));
1325}
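/*
 * Sketch of an in-kernel sosend() caller, assuming a connected socket and a
 * kernel-space buffer; the function name and parameters are hypothetical,
 * and the block is not compiled (#if 0). Compare the uio handling with
 * sosend_generic() above.
 */
#if 0
static int
example_kernel_send(struct socket *so, void *buf, int len, struct thread *td)
{
	struct uio auio;
	struct iovec aiov;

	aiov.iov_base = buf;
	aiov.iov_len = len;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = 0;
	auio.uio_resid = len;
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_rw = UIO_WRITE;
	auio.uio_td = td;
	return (sosend(so, NULL, &auio, NULL, NULL, 0, td));
}
#endif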
1326
1327/*
1328 * The part of soreceive() that implements reading non-inline out-of-band
1329 * data from a socket. For more complete comments, see soreceive(), from
1330 * which this code originated.
1331 *
1332 * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is
1333 * unable to return an mbuf chain to the caller.
1334 */
1335static int
1336soreceive_rcvoob(so, uio, flags)
1337 struct socket *so;
1338 struct uio *uio;
1339 int flags;
1340{
1341 struct protosw *pr = so->so_proto;
1342 struct mbuf *m;
1343 int error;
1344
1345 KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0"));
1346
1347 m = m_get(M_TRYWAIT, MT_DATA);
1348 if (m == NULL)
1349 return (ENOBUFS);
1350 error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
1351 if (error)
1352 goto bad;
1353 do {
1354#ifdef ZERO_COPY_SOCKETS
1355 if (so_zero_copy_receive) {
1356 int disposable;
1357
1358 if ((m->m_flags & M_EXT)
1359 && (m->m_ext.ext_type == EXT_DISPOSABLE))
1360 disposable = 1;
1361 else
1362 disposable = 0;
1363
1364 error = uiomoveco(mtod(m, void *),
1365 min(uio->uio_resid, m->m_len),
1366 uio, disposable);
1367 } else
1368#endif /* ZERO_COPY_SOCKETS */
1369 error = uiomove(mtod(m, void *),
1370 (int) min(uio->uio_resid, m->m_len), uio);
1371 m = m_free(m);
1372 } while (uio->uio_resid && error == 0 && m);
1373bad:
1374 if (m != NULL)
1375 m_freem(m);
1376 return (error);
1377}
1378
1379/*
1380 * Following replacement or removal of the first mbuf on the first mbuf chain
1381 * of a socket buffer, push necessary state changes back into the socket
1382 * buffer so that other consumers see the values consistently. 'nextrecord'
1383 * is the callers locally stored value of the original value of
1384 * sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes.
1385 * NOTE: 'nextrecord' may be NULL.
1386 */
1387static __inline void
1388sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord)
1389{
1390
1391 SOCKBUF_LOCK_ASSERT(sb);
1392 /*
1393 * First, update for the new value of nextrecord. If necessary, make
1394 * it the first record.
1395 */
1396 if (sb->sb_mb != NULL)
1397 sb->sb_mb->m_nextpkt = nextrecord;
1398 else
1399 sb->sb_mb = nextrecord;
1400
1401 /*
1402 * Now update any dependent socket buffer fields to reflect the new
1403 * state. This is an expanded inline of SB_EMPTY_FIXUP(), with the
1404 * addition of a second clause that takes care of the case where
1405 * sb_mb has been updated, but remains the last record.
1406 */
1407 if (sb->sb_mb == NULL) {
1408 sb->sb_mbtail = NULL;
1409 sb->sb_lastrecord = NULL;
1410 } else if (sb->sb_mb->m_nextpkt == NULL)
1411 sb->sb_lastrecord = sb->sb_mb;
1412}
1413
1414
1415/*
1416 * Implement receive operations on a socket. We depend on the way that
1417 * records are added to the sockbuf by sbappend. In particular, each record
1418 * (mbufs linked through m_next) must begin with an address if the protocol
1419 * so specifies, followed by an optional mbuf or mbufs containing ancillary
1420 * data, and then zero or more mbufs of data. In order to allow parallelism
1421 * between network receive and copying to user space, as well as avoid
1422 * sleeping with a mutex held, we release the socket buffer mutex during the
1423 * user space copy. Although the sockbuf is locked, new data may still be
1424 * appended, and thus we must maintain consistency of the sockbuf during that
1425 * time.
1426 *
1427 * The caller may receive the data as a single mbuf chain by supplying an
1428 * mbuf **mp0 for use in returning the chain. The uio is then used only for
1429 * the count in uio_resid.
1430 */
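/*
 * Illustrative layout of one socket buffer record as described above (a
 * sketch, not structures from this file): mbufs within a record are linked
 * through m_next, and records are linked through m_nextpkt on the first
 * mbuf of each record.
 *
 *   sb_mb -> [MT_SONAME] -> [MT_CONTROL] ... -> [MT_DATA] -> [MT_DATA] ...
 *                |
 *                m_nextpkt
 *                |
 *                v
 *              next record (the last one is tracked by sb_lastrecord)
 */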
1431int
1432soreceive_generic(so, psa, uio, mp0, controlp, flagsp)
1433 struct socket *so;
1434 struct sockaddr **psa;
1435 struct uio *uio;
1436 struct mbuf **mp0;
1437 struct mbuf **controlp;
1438 int *flagsp;
1439{
1440 struct mbuf *m, **mp;
1441 int flags, len, error, offset;
1442 struct protosw *pr = so->so_proto;
1443 struct mbuf *nextrecord;
1444 int moff, type = 0;
1445 int orig_resid = uio->uio_resid;
1446
1447 mp = mp0;
1448 if (psa != NULL)
1449 *psa = NULL;
1450 if (controlp != NULL)
1451 *controlp = NULL;
1452 if (flagsp != NULL)
1453 flags = *flagsp &~ MSG_EOR;
1454 else
1455 flags = 0;
1456 if (flags & MSG_OOB)
1457 return (soreceive_rcvoob(so, uio, flags));
1458 if (mp != NULL)
1459 *mp = NULL;
1460 if ((pr->pr_flags & PR_WANTRCVD) && (so->so_state & SS_ISCONFIRMING)
1461 && uio->uio_resid)
1462 (*pr->pr_usrreqs->pru_rcvd)(so, 0);
1463
1464 SOCKBUF_LOCK(&so->so_rcv);
1465restart:
1466 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1467 error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
1468 if (error)
1469 goto out;
1470
1471 m = so->so_rcv.sb_mb;
1472 /*
1473 * If we have less data than requested, block awaiting more (subject
1474 * to any timeout) if:
1475 * 1. the current count is less than the low water mark, or
1476 * 2. MSG_WAITALL is set, and it is possible to do the entire
1477 * receive operation at once if we block (resid <= hiwat).
1478 * 3. MSG_DONTWAIT is not set
1479 * If MSG_WAITALL is set but resid is larger than the receive buffer,
1480 * we have to do the receive in sections, and thus risk returning a
1481 * short count if a timeout or signal occurs after we start.
1482 */
1483 if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
1484 so->so_rcv.sb_cc < uio->uio_resid) &&
1485 (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
1486 ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
1487 m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
1488 KASSERT(m != NULL || !so->so_rcv.sb_cc,
1489 ("receive: m == %p so->so_rcv.sb_cc == %u",
1490 m, so->so_rcv.sb_cc));
1491 if (so->so_error) {
1492 if (m != NULL)
1493 goto dontblock;
1494 error = so->so_error;
1495 if ((flags & MSG_PEEK) == 0)
1496 so->so_error = 0;
1497 goto release;
1498 }
1499 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1500 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
1501 if (m)
1502 goto dontblock;
1503 else
1504 goto release;
1505 }
1506 for (; m != NULL; m = m->m_next)
1507 if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
1508 m = so->so_rcv.sb_mb;
1509 goto dontblock;
1510 }
1511 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
1512 (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
1513 error = ENOTCONN;
1514 goto release;
1515 }
1516 if (uio->uio_resid == 0)
1517 goto release;
1518 if ((so->so_state & SS_NBIO) ||
1519 (flags & (MSG_DONTWAIT|MSG_NBIO))) {
1520 error = EWOULDBLOCK;
1521 goto release;
1522 }
1523 SBLASTRECORDCHK(&so->so_rcv);
1524 SBLASTMBUFCHK(&so->so_rcv);
1525 sbunlock(&so->so_rcv);
1526 error = sbwait(&so->so_rcv);
1527 if (error)
1528 goto out;
1529 goto restart;
1530 }
1531dontblock:
1532 /*
1533 * From this point onward, we maintain 'nextrecord' as a cache of the
1534 * pointer to the next record in the socket buffer. We must keep the
1535 * various socket buffer pointers and local stack versions of the
1536 * pointers in sync, pushing out modifications before dropping the
1537 * socket buffer mutex, and re-reading them when picking it up.
1538 *
1539 * Otherwise, we will race with the network stack appending new data
1540 * or records onto the socket buffer by using inconsistent/stale
1541 * versions of the field, possibly resulting in socket buffer
1542 * corruption.
1543 *
1544 * By holding the high-level sblock(), we prevent simultaneous
1545 * readers from pulling off the front of the socket buffer.
1546 */
1547 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1548 if (uio->uio_td)
1549 uio->uio_td->td_proc->p_stats->p_ru.ru_msgrcv++;
1550 KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb"));
1551 SBLASTRECORDCHK(&so->so_rcv);
1552 SBLASTMBUFCHK(&so->so_rcv);
1553 nextrecord = m->m_nextpkt;
1554 if (pr->pr_flags & PR_ADDR) {
1555 KASSERT(m->m_type == MT_SONAME,
1556 ("m->m_type == %d", m->m_type));
1557 orig_resid = 0;
1558 if (psa != NULL)
1559 *psa = sodupsockaddr(mtod(m, struct sockaddr *),
1560 M_NOWAIT);
1561 if (flags & MSG_PEEK) {
1562 m = m->m_next;
1563 } else {
1564 sbfree(&so->so_rcv, m);
1565 so->so_rcv.sb_mb = m_free(m);
1566 m = so->so_rcv.sb_mb;
1567 sockbuf_pushsync(&so->so_rcv, nextrecord);
1568 }
1569 }
1570
1571 /*
1572 * Process one or more MT_CONTROL mbufs present before any data mbufs
1573 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
1574 * just copy the data; if !MSG_PEEK, we call into the protocol to
1575 * perform externalization (or freeing if controlp == NULL).
1576 */
1577 if (m != NULL && m->m_type == MT_CONTROL) {
1578 struct mbuf *cm = NULL, *cmn;
1579 struct mbuf **cme = &cm;
1580
1581 do {
1582 if (flags & MSG_PEEK) {
1583 if (controlp != NULL) {
1584 *controlp = m_copy(m, 0, m->m_len);
1585 controlp = &(*controlp)->m_next;
1586 }
1587 m = m->m_next;
1588 } else {
1589 sbfree(&so->so_rcv, m);
1590 so->so_rcv.sb_mb = m->m_next;
1591 m->m_next = NULL;
1592 *cme = m;
1593 cme = &(*cme)->m_next;
1594 m = so->so_rcv.sb_mb;
1595 }
1596 } while (m != NULL && m->m_type == MT_CONTROL);
1597 if ((flags & MSG_PEEK) == 0)
1598 sockbuf_pushsync(&so->so_rcv, nextrecord);
1599 while (cm != NULL) {
1600 cmn = cm->m_next;
1601 cm->m_next = NULL;
1602 if (pr->pr_domain->dom_externalize != NULL) {
1603 SOCKBUF_UNLOCK(&so->so_rcv);
1604 error = (*pr->pr_domain->dom_externalize)
1605 (cm, controlp);
1606 SOCKBUF_LOCK(&so->so_rcv);
1607 } else if (controlp != NULL)
1608 *controlp = cm;
1609 else
1610 m_freem(cm);
1611 if (controlp != NULL) {
1612 orig_resid = 0;
1613 while (*controlp != NULL)
1614 controlp = &(*controlp)->m_next;
1615 }
1616 cm = cmn;
1617 }
1618 if (m != NULL)
1619 nextrecord = so->so_rcv.sb_mb->m_nextpkt;
1620 else
1621 nextrecord = so->so_rcv.sb_mb;
1622 orig_resid = 0;
1623 }
1624 if (m != NULL) {
1625 if ((flags & MSG_PEEK) == 0) {
1626 KASSERT(m->m_nextpkt == nextrecord,
1627 ("soreceive: post-control, nextrecord !sync"));
1628 if (nextrecord == NULL) {
1629 KASSERT(so->so_rcv.sb_mb == m,
1630 ("soreceive: post-control, sb_mb!=m"));
1631 KASSERT(so->so_rcv.sb_lastrecord == m,
1632 ("soreceive: post-control, lastrecord!=m"));
1633 }
1634 }
1635 type = m->m_type;
1636 if (type == MT_OOBDATA)
1637 flags |= MSG_OOB;
1638 } else {
1639 if ((flags & MSG_PEEK) == 0) {
1640 KASSERT(so->so_rcv.sb_mb == nextrecord,
1641 ("soreceive: sb_mb != nextrecord"));
1642 if (so->so_rcv.sb_mb == NULL) {
1643 KASSERT(so->so_rcv.sb_lastrecord == NULL,
1644 ("soreceive: sb_lastercord != NULL"));
1645 }
1646 }
1647 }
1648 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1649 SBLASTRECORDCHK(&so->so_rcv);
1650 SBLASTMBUFCHK(&so->so_rcv);
1651
1652 /*
1653 * Now continue to read any data mbufs off of the head of the socket
1654 * buffer until the read request is satisfied. Note that 'type' is
1655 * used to store the type of any mbuf reads that have happened so far
1656 * such that soreceive() can stop reading if the type changes, which
1657 * causes soreceive() to return only one of regular data and inline
1658 * out-of-band data in a single socket receive operation.
1659 */
1660 moff = 0;
1661 offset = 0;
1662 while (m != NULL && uio->uio_resid > 0 && error == 0) {
1663 /*
1664 * If the type of mbuf has changed since the last mbuf
1665 * examined ('type'), end the receive operation.
1666 */
1667 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1668 if (m->m_type == MT_OOBDATA) {
1669 if (type != MT_OOBDATA)
1670 break;
1671 } else if (type == MT_OOBDATA)
1672 break;
1673 else
1674 KASSERT(m->m_type == MT_DATA,
1675 ("m->m_type == %d", m->m_type));
1676 so->so_rcv.sb_state &= ~SBS_RCVATMARK;
1677 len = uio->uio_resid;
1678 if (so->so_oobmark && len > so->so_oobmark - offset)
1679 len = so->so_oobmark - offset;
1680 if (len > m->m_len - moff)
1681 len = m->m_len - moff;
1682 /*
1683 * If mp is set, just pass back the mbufs. Otherwise copy
1684 * them out via the uio, then free. The socket buffer must be
1685 * consistent here (sb_mb points to the current mbuf, and its
1686 * m_nextpkt to the next record) when we drop the sockbuf lock;
1687 * we must note any additions to the sockbuf when we reacquire it.
1688 */
1689 if (mp == NULL) {
1690 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1691 SBLASTRECORDCHK(&so->so_rcv);
1692 SBLASTMBUFCHK(&so->so_rcv);
1693 SOCKBUF_UNLOCK(&so->so_rcv);
1694#ifdef ZERO_COPY_SOCKETS
1695 if (so_zero_copy_receive) {
1696 int disposable;
1697
1698 if ((m->m_flags & M_EXT)
1699 && (m->m_ext.ext_type == EXT_DISPOSABLE))
1700 disposable = 1;
1701 else
1702 disposable = 0;
1703
1704 error = uiomoveco(mtod(m, char *) + moff,
1705 (int)len, uio,
1706 disposable);
1707 } else
1708#endif /* ZERO_COPY_SOCKETS */
1709 error = uiomove(mtod(m, char *) + moff, (int)len, uio);
1710 SOCKBUF_LOCK(&so->so_rcv);
1711 if (error) {
1712 /*
1713 * The MT_SONAME mbuf has already been removed
1714 * from the record, so it is necessary to
1715 * remove the data mbufs, if any, to preserve
1716 * the invariant in the case of PR_ADDR that
1717 * requires MT_SONAME mbufs at the head of
1718 * each record.
1719 */
1720 if (m && pr->pr_flags & PR_ATOMIC &&
1721 ((flags & MSG_PEEK) == 0))
1722 (void)sbdroprecord_locked(&so->so_rcv);
1723 goto release;
1724 }
1725 } else
1726 uio->uio_resid -= len;
1727 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1728 if (len == m->m_len - moff) {
1729 if (m->m_flags & M_EOR)
1730 flags |= MSG_EOR;
1731 if (flags & MSG_PEEK) {
1732 m = m->m_next;
1733 moff = 0;
1734 } else {
1735 nextrecord = m->m_nextpkt;
1736 sbfree(&so->so_rcv, m);
1737 if (mp != NULL) {
1738 *mp = m;
1739 mp = &m->m_next;
1740 so->so_rcv.sb_mb = m = m->m_next;
1741 *mp = NULL;
1742 } else {
1743 so->so_rcv.sb_mb = m_free(m);
1744 m = so->so_rcv.sb_mb;
1745 }
1746 sockbuf_pushsync(&so->so_rcv, nextrecord);
1747 SBLASTRECORDCHK(&so->so_rcv);
1748 SBLASTMBUFCHK(&so->so_rcv);
1749 }
1750 } else {
1751 if (flags & MSG_PEEK)
1752 moff += len;
1753 else {
1754 if (mp != NULL) {
1755 int copy_flag;
1756
1757 if (flags & MSG_DONTWAIT)
1758 copy_flag = M_DONTWAIT;
1759 else
1760 copy_flag = M_TRYWAIT;
1761 if (copy_flag == M_TRYWAIT)
1762 SOCKBUF_UNLOCK(&so->so_rcv);
1763 *mp = m_copym(m, 0, len, copy_flag);
1764 if (copy_flag == M_TRYWAIT)
1765 SOCKBUF_LOCK(&so->so_rcv);
1766 if (*mp == NULL) {
1767 /*
1768 * m_copym() couldn't
1769 * allocate an mbuf. Adjust
1770 * uio_resid back (it was
1771 * adjusted down by len
1772 * bytes, which we didn't end
1773 * up "copying" over).
1774 */
1775 uio->uio_resid += len;
1776 break;
1777 }
1778 }
1779 m->m_data += len;
1780 m->m_len -= len;
1781 so->so_rcv.sb_cc -= len;
1782 }
1783 }
1784 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1785 if (so->so_oobmark) {
1786 if ((flags & MSG_PEEK) == 0) {
1787 so->so_oobmark -= len;
1788 if (so->so_oobmark == 0) {
1789 so->so_rcv.sb_state |= SBS_RCVATMARK;
1790 break;
1791 }
1792 } else {
1793 offset += len;
1794 if (offset == so->so_oobmark)
1795 break;
1796 }
1797 }
1798 if (flags & MSG_EOR)
1799 break;
1800 /*
1801 * If the MSG_WAITALL flag is set (for non-atomic socket), we
1802 * must not quit until "uio->uio_resid == 0" or an error
1803 * termination. If a signal/timeout occurs, return with a
1804 * short count but without error. Keep sockbuf locked
1805 * against other readers.
1806 */
1807 while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
1808 !sosendallatonce(so) && nextrecord == NULL) {
1809 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1810 if (so->so_error || so->so_rcv.sb_state & SBS_CANTRCVMORE)
1811 break;
1812 /*
1813 * Notify the protocol that some data has been
1814 * drained before blocking.
1815 */
1816 if (pr->pr_flags & PR_WANTRCVD) {
1817 SOCKBUF_UNLOCK(&so->so_rcv);
1818 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
1819 SOCKBUF_LOCK(&so->so_rcv);
1820 }
1821 SBLASTRECORDCHK(&so->so_rcv);
1822 SBLASTMBUFCHK(&so->so_rcv);
1823 error = sbwait(&so->so_rcv);
1824 if (error)
1825 goto release;
1826 m = so->so_rcv.sb_mb;
1827 if (m != NULL)
1828 nextrecord = m->m_nextpkt;
1829 }
1830 }
1831
1832 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1833 if (m != NULL && pr->pr_flags & PR_ATOMIC) {
1834 flags |= MSG_TRUNC;
1835 if ((flags & MSG_PEEK) == 0)
1836 (void) sbdroprecord_locked(&so->so_rcv);
1837 }
1838 if ((flags & MSG_PEEK) == 0) {
1839 if (m == NULL) {
1840 /*
1841 * First part is an inline SB_EMPTY_FIXUP(). Second
1842 * part makes sure sb_lastrecord is up-to-date if
1843 * there is still data in the socket buffer.
1844 */
1845 so->so_rcv.sb_mb = nextrecord;
1846 if (so->so_rcv.sb_mb == NULL) {
1847 so->so_rcv.sb_mbtail = NULL;
1848 so->so_rcv.sb_lastrecord = NULL;
1849 } else if (nextrecord->m_nextpkt == NULL)
1850 so->so_rcv.sb_lastrecord = nextrecord;
1851 }
1852 SBLASTRECORDCHK(&so->so_rcv);
1853 SBLASTMBUFCHK(&so->so_rcv);
1854 /*
1855 * If soreceive() is being done from the socket callback,
1856 * then don't need to generate ACK to peer to update window,
1857 * since ACK will be generated on return to TCP.
1858 */
1859 if (!(flags & MSG_SOCALLBCK) &&
1860 (pr->pr_flags & PR_WANTRCVD)) {
1861 SOCKBUF_UNLOCK(&so->so_rcv);
1862 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
1863 SOCKBUF_LOCK(&so->so_rcv);
1864 }
1865 }
1866 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1867 if (orig_resid == uio->uio_resid && orig_resid &&
1868 (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) {
1869 sbunlock(&so->so_rcv);
1870 goto restart;
1871 }
1872
1873 if (flagsp != NULL)
1874 *flagsp |= flags;
1875release:
1876 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1877 sbunlock(&so->so_rcv);
1878out:
1879 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1880 SOCKBUF_UNLOCK(&so->so_rcv);
1881 return (error);
1882}
1883
1884int
1885soreceive(so, psa, uio, mp0, controlp, flagsp)
1886 struct socket *so;
1887 struct sockaddr **psa;
1888 struct uio *uio;
1889 struct mbuf **mp0;
1890 struct mbuf **controlp;
1891 int *flagsp;
1892{
1893
1894 /* XXXRW: Temporary debugging. */
1895 KASSERT(so->so_proto->pr_usrreqs->pru_soreceive != soreceive,
1896 ("soreceive: protocol calls soreceive"));
1897
1898 return (so->so_proto->pr_usrreqs->pru_soreceive(so, psa, uio, mp0,
1899 controlp, flagsp));
1900}
1901
1902int
1903soshutdown(so, how)
1904 struct socket *so;
1905 int how;
1906{
1907 struct protosw *pr = so->so_proto;
1908
1909 if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
1910 return (EINVAL);
1911
1912 if (how != SHUT_WR)
1913 sorflush(so);
1914 if (how != SHUT_RD)
1915 return ((*pr->pr_usrreqs->pru_shutdown)(so));
1916 return (0);
1917}
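/*
 * Illustrative sketch (hypothetical helper, added for exposition): how an
 * in-kernel consumer might half-close a connection with soshutdown()
 * above.  SHUT_WR reaches the protocol via pru_shutdown(); SHUT_RD only
 * flushes the receive side via sorflush().
 */
static int
example_halfclose(struct socket *so)
{

	/* Stop sending; the peer sees EOF, but we may continue receiving. */
	return (soshutdown(so, SHUT_WR));
}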
1918
1919void
1920sorflush(so)
1921 struct socket *so;
1922{
1923 struct sockbuf *sb = &so->so_rcv;
1924 struct protosw *pr = so->so_proto;
1925 struct sockbuf asb;
1926
1927 /*
1928 * XXXRW: This is quite ugly. Previously, this code made a copy of
1929 * the socket buffer, then zero'd the original to clear the buffer
1930 * fields. However, with mutexes in the socket buffer, this causes
1931 * problems. We only clear the zeroable bits of the original;
1932 * however, we have to initialize and destroy the mutex in the copy
1933 * so that dom_dispose() and sbrelease() can lock it as needed.
1934 */
1935 SOCKBUF_LOCK(sb);
1936 sb->sb_flags |= SB_NOINTR;
1937 (void) sblock(sb, M_WAITOK);
1938 /*
1939 * socantrcvmore_locked() drops the socket buffer mutex so that it
1940 * can safely perform wakeups. Re-acquire the mutex before
1941 * continuing.
1942 */
1943 socantrcvmore_locked(so);
1944 SOCKBUF_LOCK(sb);
1945 sbunlock(sb);
1946 /*
1947 * Invalidate/clear most of the sockbuf structure, but leave selinfo
1948 * and mutex data unchanged.
1949 */
1950 bzero(&asb, offsetof(struct sockbuf, sb_startzero));
1951 bcopy(&sb->sb_startzero, &asb.sb_startzero,
1952 sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
1953 bzero(&sb->sb_startzero,
1954 sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
1955 SOCKBUF_UNLOCK(sb);
1956
1957 SOCKBUF_LOCK_INIT(&asb, "so_rcv");
1958 if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
1959 (*pr->pr_domain->dom_dispose)(asb.sb_mb);
1960 sbrelease(&asb, so);
1961 SOCKBUF_LOCK_DESTROY(&asb);
1962}
1963
1964/*
1965 * Perhaps this routine, and sooptcopyout(), below, ought to come in an
1966 * additional variant to handle the case where the option value needs to be
1967 * some kind of integer, but not a specific size. In addition to their use
1968 * here, these functions are also called by the protocol-level pr_ctloutput()
1969 * routines.
1970 */
1971int
1972sooptcopyin(sopt, buf, len, minlen)
1973 struct sockopt *sopt;
1974 void *buf;
1975 size_t len;
1976 size_t minlen;
1977{
1978 size_t valsize;
1979
1980 /*
1981 * If the user gives us more than we wanted, we ignore it, but if we
1982 * don't get the minimum length the caller wants, we return EINVAL.
1983 * On success, sopt->sopt_valsize is set to however much we actually
1984 * retrieved.
1985 */
1986 if ((valsize = sopt->sopt_valsize) < minlen)
1987 return EINVAL;
1988 if (valsize > len)
1989 sopt->sopt_valsize = valsize = len;
1990
1991 if (sopt->sopt_td != NULL)
1992 return (copyin(sopt->sopt_val, buf, valsize));
1993
1994 bcopy(sopt->sopt_val, buf, valsize);
1995 return (0);
1996}
1997
1998/*
1999 * Kernel version of setsockopt(2).
2000 *
2001 * XXX: optlen is size_t, not socklen_t
2002 */
2003int
2004so_setsockopt(struct socket *so, int level, int optname, void *optval,
2005 size_t optlen)
2006{
2007 struct sockopt sopt;
2008
2009 sopt.sopt_level = level;
2010 sopt.sopt_name = optname;
2011 sopt.sopt_dir = SOPT_SET;
2012 sopt.sopt_val = optval;
2013 sopt.sopt_valsize = optlen;
2014 sopt.sopt_td = NULL;
2015 return (sosetopt(so, &sopt));
2016}
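/*
 * Illustrative sketch (hypothetical helper, added for exposition): setting
 * a boolean socket-level option from kernel code with so_setsockopt()
 * above, avoiding a hand-built struct sockopt.  "so" is assumed to come
 * from a prior socreate().
 */
static int
example_set_reuseaddr(struct socket *so)
{
	int on = 1;

	return (so_setsockopt(so, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)));
}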
2017
2018int
2019sosetopt(so, sopt)
2020 struct socket *so;
2021 struct sockopt *sopt;
2022{
2023 int error, optval;
2024 struct linger l;
2025 struct timeval tv;
2026 u_long val;
2027#ifdef MAC
2028 struct mac extmac;
2029#endif
2030
2031 error = 0;
2032 if (sopt->sopt_level != SOL_SOCKET) {
2033 if (so->so_proto && so->so_proto->pr_ctloutput)
2034 return ((*so->so_proto->pr_ctloutput)
2035 (so, sopt));
2036 error = ENOPROTOOPT;
2037 } else {
2038 switch (sopt->sopt_name) {
2039#ifdef INET
2040 case SO_ACCEPTFILTER:
2041 error = do_setopt_accept_filter(so, sopt);
2042 if (error)
2043 goto bad;
2044 break;
2045#endif
2046 case SO_LINGER:
2047 error = sooptcopyin(sopt, &l, sizeof l, sizeof l);
2048 if (error)
2049 goto bad;
2050
2051 SOCK_LOCK(so);
2052 so->so_linger = l.l_linger;
2053 if (l.l_onoff)
2054 so->so_options |= SO_LINGER;
2055 else
2056 so->so_options &= ~SO_LINGER;
2057 SOCK_UNLOCK(so);
2058 break;
2059
2060 case SO_DEBUG:
2061 case SO_KEEPALIVE:
2062 case SO_DONTROUTE:
2063 case SO_USELOOPBACK:
2064 case SO_BROADCAST:
2065 case SO_REUSEADDR:
2066 case SO_REUSEPORT:
2067 case SO_OOBINLINE:
2068 case SO_TIMESTAMP:
2069 case SO_BINTIME:
2070 case SO_NOSIGPIPE:
2071 error = sooptcopyin(sopt, &optval, sizeof optval,
2072 sizeof optval);
2073 if (error)
2074 goto bad;
2075 SOCK_LOCK(so);
2076 if (optval)
2077 so->so_options |= sopt->sopt_name;
2078 else
2079 so->so_options &= ~sopt->sopt_name;
2080 SOCK_UNLOCK(so);
2081 break;
2082
2083 case SO_SNDBUF:
2084 case SO_RCVBUF:
2085 case SO_SNDLOWAT:
2086 case SO_RCVLOWAT:
2087 error = sooptcopyin(sopt, &optval, sizeof optval,
2088 sizeof optval);
2089 if (error)
2090 goto bad;
2091
2092 /*
2093 * Values < 1 make no sense for any of these options,
2094 * so disallow them.
2095 */
2096 if (optval < 1) {
2097 error = EINVAL;
2098 goto bad;
2099 }
2100
2101 switch (sopt->sopt_name) {
2102 case SO_SNDBUF:
2103 case SO_RCVBUF:
2104 if (sbreserve(sopt->sopt_name == SO_SNDBUF ?
2105 &so->so_snd : &so->so_rcv, (u_long)optval,
2106 so, curthread) == 0) {
2107 error = ENOBUFS;
2108 goto bad;
2109 }
2110 (sopt->sopt_name == SO_SNDBUF ? &so->so_snd :
2111 &so->so_rcv)->sb_flags &= ~SB_AUTOSIZE;
2112 break;
2113
2114 /*
2115 * Make sure the low-water is never greater than the
2116 * high-water.
2117 */
2118 case SO_SNDLOWAT:
2119 SOCKBUF_LOCK(&so->so_snd);
2120 so->so_snd.sb_lowat =
2121 (optval > so->so_snd.sb_hiwat) ?
2122 so->so_snd.sb_hiwat : optval;
2123 SOCKBUF_UNLOCK(&so->so_snd);
2124 break;
2125 case SO_RCVLOWAT:
2126 SOCKBUF_LOCK(&so->so_rcv);
2127 so->so_rcv.sb_lowat =
2128 (optval > so->so_rcv.sb_hiwat) ?
2129 so->so_rcv.sb_hiwat : optval;
2130 SOCKBUF_UNLOCK(&so->so_rcv);
2131 break;
2132 }
2133 break;
2134
2135 case SO_SNDTIMEO:
2136 case SO_RCVTIMEO:
2137#ifdef COMPAT_IA32
2138 if (curthread->td_proc->p_sysent == &ia32_freebsd_sysvec) {
2139 struct timeval32 tv32;
2140
2141 error = sooptcopyin(sopt, &tv32, sizeof tv32,
2142 sizeof tv32);
2143 CP(tv32, tv, tv_sec);
2144 CP(tv32, tv, tv_usec);
2145 } else
2146#endif
2147 error = sooptcopyin(sopt, &tv, sizeof tv,
2148 sizeof tv);
2149 if (error)
2150 goto bad;
2151
2152 /* assert(hz > 0); */
2153 if (tv.tv_sec < 0 || tv.tv_sec > INT_MAX / hz ||
2154 tv.tv_usec < 0 || tv.tv_usec >= 1000000) {
2155 error = EDOM;
2156 goto bad;
2157 }
2158 /* assert(tick > 0); */
2159 /* assert(ULONG_MAX - INT_MAX >= 1000000); */
2160 val = (u_long)(tv.tv_sec * hz) + tv.tv_usec / tick;
2161 if (val > INT_MAX) {
2162 error = EDOM;
2163 goto bad;
2164 }
2165 if (val == 0 && tv.tv_usec != 0)
2166 val = 1;
2167
2168 switch (sopt->sopt_name) {
2169 case SO_SNDTIMEO:
2170 so->so_snd.sb_timeo = val;
2171 break;
2172 case SO_RCVTIMEO:
2173 so->so_rcv.sb_timeo = val;
2174 break;
2175 }
2176 break;
2177
2178 case SO_LABEL:
2179#ifdef MAC
2180 error = sooptcopyin(sopt, &extmac, sizeof extmac,
2181 sizeof extmac);
2182 if (error)
2183 goto bad;
2184 error = mac_setsockopt_label(sopt->sopt_td->td_ucred,
2185 so, &extmac);
2186#else
2187 error = EOPNOTSUPP;
2188#endif
2189 break;
2190
2191 default:
2192 error = ENOPROTOOPT;
2193 break;
2194 }
2195 if (error == 0 && so->so_proto != NULL &&
2196 so->so_proto->pr_ctloutput != NULL) {
2197 (void) ((*so->so_proto->pr_ctloutput)
2198 (so, sopt));
2199 }
2200 }
2201bad:
2202 return (error);
2203}
2204
2205/*
2206 * Helper routine for getsockopt.
2207 */
2208int
2209sooptcopyout(struct sockopt *sopt, const void *buf, size_t len)
2210{
2211 int error;
2212 size_t valsize;
2213
2214 error = 0;
2215
2216 /*
2217 * Documented get behavior is that we always return a value, possibly
2218 * truncated to fit in the user's buffer. Traditional behavior is
2219 * that we always tell the user precisely how much we copied, rather
2220 * than something useful like the total amount we had available for
2221 * her. Note that this interface is not idempotent; the entire
2222 * answer must be generated ahead of time.
2223 */
2224 valsize = min(len, sopt->sopt_valsize);
2225 sopt->sopt_valsize = valsize;
2226 if (sopt->sopt_val != NULL) {
2227 if (sopt->sopt_td != NULL)
2228 error = copyout(buf, sopt->sopt_val, valsize);
2229 else
2230 bcopy(buf, sopt->sopt_val, valsize);
2231 }
2232 return (error);
2233}
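/*
 * Illustrative sketch (hypothetical protocol code, added for exposition):
 * the usual shape of a pr_ctloutput() routine built on sooptcopyin() and
 * sooptcopyout() above.  EXAMPLE_OPT and the omitted per-protocol state
 * are placeholders; real protocols also apply their own pcb locking here.
 */
#define	EXAMPLE_OPT	1		/* hypothetical option name */

static int
example_ctloutput(struct socket *so, struct sockopt *sopt)
{
	int error, optval;

	error = 0;
	switch (sopt->sopt_dir) {
	case SOPT_SET:
		switch (sopt->sopt_name) {
		case EXAMPLE_OPT:
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
			if (error == 0 && optval < 0)
				error = EINVAL;
			/* ... otherwise store optval in protocol state ... */
			break;
		default:
			error = ENOPROTOOPT;
			break;
		}
		break;
	case SOPT_GET:
		switch (sopt->sopt_name) {
		case EXAMPLE_OPT:
			optval = 0;	/* ... fetch from protocol state ... */
			error = sooptcopyout(sopt, &optval, sizeof(optval));
			break;
		default:
			error = ENOPROTOOPT;
			break;
		}
		break;
	default:
		error = EINVAL;
		break;
	}
	return (error);
}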
2234
2235int
2236sogetopt(so, sopt)
2237 struct socket *so;
2238 struct sockopt *sopt;
2239{
2240 int error, optval;
2241 struct linger l;
2242 struct timeval tv;
2243#ifdef MAC
2244 struct mac extmac;
2245#endif
2246
2247 error = 0;
2248 if (sopt->sopt_level != SOL_SOCKET) {
2249 if (so->so_proto && so->so_proto->pr_ctloutput) {
2250 return ((*so->so_proto->pr_ctloutput)
2251 (so, sopt));
2252 } else
2253 return (ENOPROTOOPT);
2254 } else {
2255 switch (sopt->sopt_name) {
2256#ifdef INET
2257 case SO_ACCEPTFILTER:
2258 error = do_getopt_accept_filter(so, sopt);
2259 break;
2260#endif
2261 case SO_LINGER:
2262 SOCK_LOCK(so);
2263 l.l_onoff = so->so_options & SO_LINGER;
2264 l.l_linger = so->so_linger;
2265 SOCK_UNLOCK(so);
2266 error = sooptcopyout(sopt, &l, sizeof l);
2267 break;
2268
2269 case SO_USELOOPBACK:
2270 case SO_DONTROUTE:
2271 case SO_DEBUG:
2272 case SO_KEEPALIVE:
2273 case SO_REUSEADDR:
2274 case SO_REUSEPORT:
2275 case SO_BROADCAST:
2276 case SO_OOBINLINE:
2277 case SO_ACCEPTCONN:
2278 case SO_TIMESTAMP:
2279 case SO_BINTIME:
2280 case SO_NOSIGPIPE:
2281 optval = so->so_options & sopt->sopt_name;
2282integer:
2283 error = sooptcopyout(sopt, &optval, sizeof optval);
2284 break;
2285
2286 case SO_TYPE:
2287 optval = so->so_type;
2288 goto integer;
2289
2290 case SO_ERROR:
2291 SOCK_LOCK(so);
2292 optval = so->so_error;
2293 so->so_error = 0;
2294 SOCK_UNLOCK(so);
2295 goto integer;
2296
2297 case SO_SNDBUF:
2298 optval = so->so_snd.sb_hiwat;
2299 goto integer;
2300
2301 case SO_RCVBUF:
2302 optval = so->so_rcv.sb_hiwat;
2303 goto integer;
2304
2305 case SO_SNDLOWAT:
2306 optval = so->so_snd.sb_lowat;
2307 goto integer;
2308
2309 case SO_RCVLOWAT:
2310 optval = so->so_rcv.sb_lowat;
2311 goto integer;
2312
2313 case SO_SNDTIMEO:
2314 case SO_RCVTIMEO:
2315 optval = (sopt->sopt_name == SO_SNDTIMEO ?
2316 so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
2317
2318 tv.tv_sec = optval / hz;
2319 tv.tv_usec = (optval % hz) * tick;
2320#ifdef COMPAT_IA32
2321 if (curthread->td_proc->p_sysent == &ia32_freebsd_sysvec) {
2322 struct timeval32 tv32;
2323
2324 CP(tv, tv32, tv_sec);
2325 CP(tv, tv32, tv_usec);
2326 error = sooptcopyout(sopt, &tv32, sizeof tv32);
2327 } else
2328#endif
2329 error = sooptcopyout(sopt, &tv, sizeof tv);
2330 break;
2331
2332 case SO_LABEL:
2333#ifdef MAC
2334 error = sooptcopyin(sopt, &extmac, sizeof(extmac),
2335 sizeof(extmac));
2336 if (error)
2337 return (error);
2338 error = mac_getsockopt_label(sopt->sopt_td->td_ucred,
2339 so, &extmac);
2340 if (error)
2341 return (error);
2342 error = sooptcopyout(sopt, &extmac, sizeof extmac);
2343#else
2344 error = EOPNOTSUPP;
2345#endif
2346 break;
2347
2348 case SO_PEERLABEL:
2349#ifdef MAC
2350 error = sooptcopyin(sopt, &extmac, sizeof(extmac),
2351 sizeof(extmac));
2352 if (error)
2353 return (error);
2354 error = mac_getsockopt_peerlabel(
2355 sopt->sopt_td->td_ucred, so, &extmac);
2356 if (error)
2357 return (error);
2358 error = sooptcopyout(sopt, &extmac, sizeof extmac);
2359#else
2360 error = EOPNOTSUPP;
2361#endif
2362 break;
2363
2364 case SO_LISTENQLIMIT:
2365 optval = so->so_qlimit;
2366 goto integer;
2367
2368 case SO_LISTENQLEN:
2369 optval = so->so_qlen;
2370 goto integer;
2371
2372 case SO_LISTENINCQLEN:
2373 optval = so->so_incqlen;
2374 goto integer;
2375
2376 default:
2377 error = ENOPROTOOPT;
2378 break;
2379 }
2380 return (error);
2381 }
2382}
2383
2384/* XXX; prepare mbuf for (__FreeBSD__ < 3) routines. */
2385int
2386soopt_getm(struct sockopt *sopt, struct mbuf **mp)
2387{
2388 struct mbuf *m, *m_prev;
2389 int sopt_size = sopt->sopt_valsize;
2390
2391 MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA);
2392 if (m == NULL)
2393 return ENOBUFS;
2394 if (sopt_size > MLEN) {
2395 MCLGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT);
2396 if ((m->m_flags & M_EXT) == 0) {
2397 m_free(m);
2398 return ENOBUFS;
2399 }
2400 m->m_len = min(MCLBYTES, sopt_size);
2401 } else {
2402 m->m_len = min(MLEN, sopt_size);
2403 }
2404 sopt_size -= m->m_len;
2405 *mp = m;
2406 m_prev = m;
2407
2408 while (sopt_size) {
2409 MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA);
2410 if (m == NULL) {
2411 m_freem(*mp);
2412 return ENOBUFS;
2413 }
2414 if (sopt_size > MLEN) {
2415 MCLGET(m, sopt->sopt_td != NULL ? M_TRYWAIT :
2416 M_DONTWAIT);
2417 if ((m->m_flags & M_EXT) == 0) {
2418 m_freem(m);
2419 m_freem(*mp);
2420 return ENOBUFS;
2421 }
2422 m->m_len = min(MCLBYTES, sopt_size);
2423 } else {
2424 m->m_len = min(MLEN, sopt_size);
2425 }
2426 sopt_size -= m->m_len;
2427 m_prev->m_next = m;
2428 m_prev = m;
2429 }
2430 return (0);
2431}
2432
2433/* XXX; copyin sopt data into mbuf chain for (__FreeBSD__ < 3) routines. */
2434int
2435soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
2436{
2437 struct mbuf *m0 = m;
2438
2439 if (sopt->sopt_val == NULL)
2440 return (0);
2441 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
2442 if (sopt->sopt_td != NULL) {
2443 int error;
2444
2445 error = copyin(sopt->sopt_val, mtod(m, char *),
2446 m->m_len);
2447 if (error != 0) {
2448 m_freem(m0);
2449 return(error);
2450 }
2451 } else
2452 bcopy(sopt->sopt_val, mtod(m, char *), m->m_len);
2453 sopt->sopt_valsize -= m->m_len;
2454 sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
2455 m = m->m_next;
2456 }
2457 if (m != NULL) /* should have been allocated large enough by soopt_getm() */
2458 panic("ip6_sooptmcopyin");
2459 return (0);
2460}
2461
2462/* XXX; copyout mbuf chain data into soopt for (__FreeBSD__ < 3) routines. */
2463int
2464soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
2465{
2466 struct mbuf *m0 = m;
2467 size_t valsize = 0;
2468
2469 if (sopt->sopt_val == NULL)
2470 return (0);
2471 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
2472 if (sopt->sopt_td != NULL) {
2473 int error;
2474
2475 error = copyout(mtod(m, char *), sopt->sopt_val,
2476 m->m_len);
2477 if (error != 0) {
2478 m_freem(m0);
2479 return(error);
2480 }
2481 } else
2482 bcopy(mtod(m, char *), sopt->sopt_val, m->m_len);
2483 sopt->sopt_valsize -= m->m_len;
2484 sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
2485 valsize += m->m_len;
2486 m = m->m_next;
2487 }
2488 if (m != NULL) {
2489 /* enough soopt buffer should be given from user-land */
2490 m_freem(m0);
2491 return(EINVAL);
2492 }
2493 sopt->sopt_valsize = valsize;
2494 return (0);
2495}
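/*
 * Illustrative sketch (hypothetical helper, added for exposition): the
 * SOPT_SET path a protocol might take with the mbuf-based helpers above.
 * soopt_getm() sizes an mbuf chain to sopt_valsize, soopt_mcopyin() fills
 * it (and frees the chain itself on failure); soopt_mcopyout() is the
 * matching SOPT_GET direction for a chain holding the reply.
 */
static int
example_sockopt_set_via_mbufs(struct sockopt *sopt)
{
	struct mbuf *m;
	int error;

	error = soopt_getm(sopt, &m);
	if (error != 0)
		return (error);
	error = soopt_mcopyin(sopt, m);
	if (error != 0)
		return (error);		/* chain already freed */
	/* ... hand the chain to the protocol's option parser ... */
	m_freem(m);
	return (0);
}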
2496
2497/*
2498 * sohasoutofband(): protocol notifies socket layer of the arrival of new
2499 * out-of-band data, which will then notify socket consumers.
2500 */
2501void
2502sohasoutofband(so)
2503 struct socket *so;
2504{
2505 if (so->so_sigio != NULL)
2506 pgsigio(&so->so_sigio, SIGURG, 0);
2507 selwakeuppri(&so->so_rcv.sb_sel, PSOCK);
2508}
2509
2510int
2511sopoll(struct socket *so, int events, struct ucred *active_cred,
2512 struct thread *td)
2513{
2514
2515 /* XXXRW: Temporary debugging. */
2516 KASSERT(so->so_proto->pr_usrreqs->pru_sopoll != sopoll,
2517 ("sopoll: protocol calls sopoll"));
2518
2519 return (so->so_proto->pr_usrreqs->pru_sopoll(so, events, active_cred,
2520 td));
2521}
2522
2523int
2524sopoll_generic(struct socket *so, int events, struct ucred *active_cred,
2525 struct thread *td)
2526{
2527 int revents = 0;
2528
2529 SOCKBUF_LOCK(&so->so_snd);
2530 SOCKBUF_LOCK(&so->so_rcv);
2531 if (events & (POLLIN | POLLRDNORM))
2532 if (soreadable(so))
2533 revents |= events & (POLLIN | POLLRDNORM);
2534
2535 if (events & POLLINIGNEOF)
2536 if (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat ||
2537 !TAILQ_EMPTY(&so->so_comp) || so->so_error)
2538 revents |= POLLINIGNEOF;
2539
2540 if (events & (POLLOUT | POLLWRNORM))
2541 if (sowriteable(so))
2542 revents |= events & (POLLOUT | POLLWRNORM);
2543
2544 if (events & (POLLPRI | POLLRDBAND))
2545 if (so->so_oobmark || (so->so_rcv.sb_state & SBS_RCVATMARK))
2546 revents |= events & (POLLPRI | POLLRDBAND);
2547
2548 if (revents == 0) {
2549 if (events &
2550 (POLLIN | POLLINIGNEOF | POLLPRI | POLLRDNORM |
2551 POLLRDBAND)) {
2552 selrecord(td, &so->so_rcv.sb_sel);
2553 so->so_rcv.sb_flags |= SB_SEL;
2554 }
2555
2556 if (events & (POLLOUT | POLLWRNORM)) {
2557 selrecord(td, &so->so_snd.sb_sel);
2558 so->so_snd.sb_flags |= SB_SEL;
2559 }
2560 }
2561
2562 SOCKBUF_UNLOCK(&so->so_rcv);
2563 SOCKBUF_UNLOCK(&so->so_snd);
2564 return (revents);
2565}
2566
2567int
2568soo_kqfilter(struct file *fp, struct knote *kn)
2569{
2570 struct socket *so = kn->kn_fp->f_data;
2571 struct sockbuf *sb;
2572
2573 switch (kn->kn_filter) {
2574 case EVFILT_READ:
2575 if (so->so_options & SO_ACCEPTCONN)
2576 kn->kn_fop = &solisten_filtops;
2577 else
2578 kn->kn_fop = &soread_filtops;
2579 sb = &so->so_rcv;
2580 break;
2581 case EVFILT_WRITE:
2582 kn->kn_fop = &sowrite_filtops;
2583 sb = &so->so_snd;
2584 break;
2585 default:
2586 return (EINVAL);
2587 }
2588
2589 SOCKBUF_LOCK(sb);
2590 knlist_add(&sb->sb_sel.si_note, kn, 1);
2591 sb->sb_flags |= SB_KNOTE;
2592 SOCKBUF_UNLOCK(sb);
2593 return (0);
2594}
2595
127#include <sys/sysctl.h>
128#include <sys/uio.h>
129#include <sys/jail.h>
130
131#include <security/mac/mac_framework.h>
132
133#include <vm/uma.h>
134
135#ifdef COMPAT_IA32
136#include <sys/mount.h>
137#include <compat/freebsd32/freebsd32.h>
138
139extern struct sysentvec ia32_freebsd_sysvec;
140#endif
141
142static int soreceive_rcvoob(struct socket *so, struct uio *uio,
143 int flags);
144
145static void filt_sordetach(struct knote *kn);
146static int filt_soread(struct knote *kn, long hint);
147static void filt_sowdetach(struct knote *kn);
148static int filt_sowrite(struct knote *kn, long hint);
149static int filt_solisten(struct knote *kn, long hint);
150
151static struct filterops solisten_filtops =
152 { 1, NULL, filt_sordetach, filt_solisten };
153static struct filterops soread_filtops =
154 { 1, NULL, filt_sordetach, filt_soread };
155static struct filterops sowrite_filtops =
156 { 1, NULL, filt_sowdetach, filt_sowrite };
157
158uma_zone_t socket_zone;
159so_gen_t so_gencnt; /* generation count for sockets */
160
161int maxsockets;
162
163MALLOC_DEFINE(M_SONAME, "soname", "socket name");
164MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
165
166static int somaxconn = SOMAXCONN;
167static int sysctl_somaxconn(SYSCTL_HANDLER_ARGS);
168/* XXX: we don't have SYSCTL_USHORT */
169SYSCTL_PROC(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLTYPE_UINT | CTLFLAG_RW,
170 0, sizeof(int), sysctl_somaxconn, "I", "Maximum pending socket connection "
171 "queue size");
172static int numopensockets;
173SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD,
174 &numopensockets, 0, "Number of open sockets");
175#ifdef ZERO_COPY_SOCKETS
176/* These aren't static because they're used in other files. */
177int so_zero_copy_send = 1;
178int so_zero_copy_receive = 1;
179SYSCTL_NODE(_kern_ipc, OID_AUTO, zero_copy, CTLFLAG_RD, 0,
180 "Zero copy controls");
181SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, receive, CTLFLAG_RW,
182 &so_zero_copy_receive, 0, "Enable zero copy receive");
183SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, send, CTLFLAG_RW,
184 &so_zero_copy_send, 0, "Enable zero copy send");
185#endif /* ZERO_COPY_SOCKETS */
186
187/*
188 * accept_mtx locks down per-socket fields relating to accept queues. See
189 * socketvar.h for an annotation of the protected fields of struct socket.
190 */
191struct mtx accept_mtx;
192MTX_SYSINIT(accept_mtx, &accept_mtx, "accept", MTX_DEF);
193
194/*
195 * so_global_mtx protects so_gencnt, numopensockets, and the per-socket
196 * so_gencnt field.
197 */
198static struct mtx so_global_mtx;
199MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_glabel", MTX_DEF);
200
201/*
202 * General IPC sysctl name space, used by sockets and a variety of other IPC
203 * types.
204 */
205SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW, 0, "IPC");
206
207/*
208 * Sysctl to get and set the maximum global sockets limit. Notify protocols
209 * of the change so that they can update their dependent limits as required.
210 */
211static int
212sysctl_maxsockets(SYSCTL_HANDLER_ARGS)
213{
214 int error, newmaxsockets;
215
216 newmaxsockets = maxsockets;
217 error = sysctl_handle_int(oidp, &newmaxsockets, sizeof(int), req);
218 if (error == 0 && req->newptr) {
219 if (newmaxsockets > maxsockets) {
220 maxsockets = newmaxsockets;
221 if (maxsockets > ((maxfiles / 4) * 3)) {
222 maxfiles = (maxsockets * 5) / 4;
223 maxfilesperproc = (maxfiles * 9) / 10;
224 }
225 EVENTHANDLER_INVOKE(maxsockets_change);
226 } else
227 error = EINVAL;
228 }
229 return (error);
230}
231
232SYSCTL_PROC(_kern_ipc, OID_AUTO, maxsockets, CTLTYPE_INT|CTLFLAG_RW,
233 &maxsockets, 0, sysctl_maxsockets, "IU",
234 "Maximum number of sockets avaliable");
235
236/*
237 * Initialise maxsockets.
238 */
239static void init_maxsockets(void *ignored)
240{
241 TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets);
242 maxsockets = imax(maxsockets, imax(maxfiles, nmbclusters));
243}
244SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_maxsockets, NULL);
245
246/*
247 * Socket operation routines. These routines are called by the routines in
248 * sys_socket.c or from a system process, and implement the semantics of
249 * socket operations by switching out to the protocol specific routines.
250 */
251
252/*
253 * Get a socket structure from our zone, and initialize it. Note that it
254 * would probably be better to allocate socket and PCB at the same time, but
255 * I'm not convinced that all the protocols can be easily modified to do
256 * this.
257 *
258 * soalloc() returns a socket with a ref count of 0.
259 */
260static struct socket *
261soalloc(void)
262{
263 struct socket *so;
264
265 so = uma_zalloc(socket_zone, M_NOWAIT | M_ZERO);
266 if (so == NULL)
267 return (NULL);
268#ifdef MAC
269 if (mac_init_socket(so, M_NOWAIT) != 0) {
270 uma_zfree(socket_zone, so);
271 return (NULL);
272 }
273#endif
274 SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd");
275 SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv");
276 TAILQ_INIT(&so->so_aiojobq);
277 mtx_lock(&so_global_mtx);
278 so->so_gencnt = ++so_gencnt;
279 ++numopensockets;
280 mtx_unlock(&so_global_mtx);
281 return (so);
282}
283
284/*
285 * Free the storage associated with a socket at the socket layer, tear down
286 * locks, labels, etc. All protocol state is assumed already to have been
287 * torn down (and possibly never set up) by the caller.
288 */
289static void
290sodealloc(struct socket *so)
291{
292
293 KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count));
294 KASSERT(so->so_pcb == NULL, ("sodealloc(): so_pcb != NULL"));
295
296 mtx_lock(&so_global_mtx);
297 so->so_gencnt = ++so_gencnt;
298 --numopensockets; /* Could be below, but faster here. */
299 mtx_unlock(&so_global_mtx);
300 if (so->so_rcv.sb_hiwat)
301 (void)chgsbsize(so->so_cred->cr_uidinfo,
302 &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY);
303 if (so->so_snd.sb_hiwat)
304 (void)chgsbsize(so->so_cred->cr_uidinfo,
305 &so->so_snd.sb_hiwat, 0, RLIM_INFINITY);
306#ifdef INET
307 /* remove accept filter if one is present. */
308 if (so->so_accf != NULL)
309 do_setopt_accept_filter(so, NULL);
310#endif
311#ifdef MAC
312 mac_destroy_socket(so);
313#endif
314 crfree(so->so_cred);
315 SOCKBUF_LOCK_DESTROY(&so->so_snd);
316 SOCKBUF_LOCK_DESTROY(&so->so_rcv);
317 uma_zfree(socket_zone, so);
318}
319
320/*
321 * socreate returns a socket with a ref count of 1. The socket should be
322 * closed with soclose().
323 */
324int
325socreate(dom, aso, type, proto, cred, td)
326 int dom;
327 struct socket **aso;
328 int type;
329 int proto;
330 struct ucred *cred;
331 struct thread *td;
332{
333 struct protosw *prp;
334 struct socket *so;
335 int error;
336
337 if (proto)
338 prp = pffindproto(dom, proto, type);
339 else
340 prp = pffindtype(dom, type);
341
342 if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL ||
343 prp->pr_usrreqs->pru_attach == pru_attach_notsupp)
344 return (EPROTONOSUPPORT);
345
346 if (jailed(cred) && jail_socket_unixiproute_only &&
347 prp->pr_domain->dom_family != PF_LOCAL &&
348 prp->pr_domain->dom_family != PF_INET &&
349 prp->pr_domain->dom_family != PF_ROUTE) {
350 return (EPROTONOSUPPORT);
351 }
352
353 if (prp->pr_type != type)
354 return (EPROTOTYPE);
355 so = soalloc();
356 if (so == NULL)
357 return (ENOBUFS);
358
359 TAILQ_INIT(&so->so_incomp);
360 TAILQ_INIT(&so->so_comp);
361 so->so_type = type;
362 so->so_cred = crhold(cred);
363 so->so_proto = prp;
364#ifdef MAC
365 mac_create_socket(cred, so);
366#endif
367 knlist_init(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv),
368 NULL, NULL, NULL);
369 knlist_init(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd),
370 NULL, NULL, NULL);
371 so->so_count = 1;
372 /*
373 * Auto-sizing of socket buffers is managed by the protocols and
374 * the appropriate flags must be set in the pru_attach function.
375 */
376 error = (*prp->pr_usrreqs->pru_attach)(so, proto, td);
377 if (error) {
378 KASSERT(so->so_count == 1, ("socreate: so_count %d",
379 so->so_count));
380 so->so_count = 0;
381 sodealloc(so);
382 return (error);
383 }
384 *aso = so;
385 return (0);
386}
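/*
 * Illustrative sketch (hypothetical helper, added for exposition): minimal
 * in-kernel use of socreate()/sobind()/soclose() to open and bind a UDP
 * socket.  "laddr" is assumed to point to a valid sockaddr_in and the
 * usual network headers (e.g. netinet/in.h) to be available.
 */
static int
example_kernel_udp_socket(struct thread *td, struct sockaddr *laddr,
    struct socket **sop)
{
	struct socket *so;
	int error;

	error = socreate(AF_INET, &so, SOCK_DGRAM, IPPROTO_UDP,
	    td->td_ucred, td);
	if (error != 0)
		return (error);
	error = sobind(so, laddr, td);
	if (error != 0) {
		soclose(so);
		return (error);
	}
	*sop = so;
	return (0);
}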
387
388#ifdef REGRESSION
389static int regression_sonewconn_earlytest = 1;
390SYSCTL_INT(_regression, OID_AUTO, sonewconn_earlytest, CTLFLAG_RW,
391 &regression_sonewconn_earlytest, 0, "Perform early sonewconn limit test");
392#endif
393
394/*
395 * When an attempt at a new connection is noted on a socket which accepts
396 * connections, sonewconn is called. If the connection is possible (subject
397 * to space constraints, etc.) then we allocate a new structure, properly
398 * linked into the data structure of the original socket, and return this.
399 * Connstatus may be 0, or SS_ISCONFIRMING, or SS_ISCONNECTED.
400 *
401 * Note: the ref count on the socket is 0 on return.
402 */
403struct socket *
404sonewconn(head, connstatus)
405 register struct socket *head;
406 int connstatus;
407{
408 register struct socket *so;
409 int over;
410
411 ACCEPT_LOCK();
412 over = (head->so_qlen > 3 * head->so_qlimit / 2);
413 ACCEPT_UNLOCK();
414#ifdef REGRESSION
415 if (regression_sonewconn_earlytest && over)
416#else
417 if (over)
418#endif
419 return (NULL);
420 so = soalloc();
421 if (so == NULL)
422 return (NULL);
423 if ((head->so_options & SO_ACCEPTFILTER) != 0)
424 connstatus = 0;
425 so->so_head = head;
426 so->so_type = head->so_type;
427 so->so_options = head->so_options &~ SO_ACCEPTCONN;
428 so->so_linger = head->so_linger;
429 so->so_state = head->so_state | SS_NOFDREF;
430 so->so_proto = head->so_proto;
431 so->so_cred = crhold(head->so_cred);
432#ifdef MAC
433 SOCK_LOCK(head);
434 mac_create_socket_from_socket(head, so);
435 SOCK_UNLOCK(head);
436#endif
437 knlist_init(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv),
438 NULL, NULL, NULL);
439 knlist_init(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd),
440 NULL, NULL, NULL);
441 if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat) ||
442 (*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) {
443 sodealloc(so);
444 return (NULL);
445 }
446 so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
447 so->so_snd.sb_lowat = head->so_snd.sb_lowat;
448 so->so_rcv.sb_timeo = head->so_rcv.sb_timeo;
449 so->so_snd.sb_timeo = head->so_snd.sb_timeo;
450 so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE;
451 so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE;
452 so->so_state |= connstatus;
453 ACCEPT_LOCK();
454 if (connstatus) {
455 TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
456 so->so_qstate |= SQ_COMP;
457 head->so_qlen++;
458 } else {
459 /*
460 * Keep removing sockets from the head until there's room for
461 * us to insert on the tail. In pre-locking revisions, this
462 * was a simple if(), but as we could be racing with other
463 * threads and soabort() requires dropping locks, we must
464 * loop waiting for the condition to be true.
465 */
466 while (head->so_incqlen > head->so_qlimit) {
467 struct socket *sp;
468 sp = TAILQ_FIRST(&head->so_incomp);
469 TAILQ_REMOVE(&head->so_incomp, sp, so_list);
470 head->so_incqlen--;
471 sp->so_qstate &= ~SQ_INCOMP;
472 sp->so_head = NULL;
473 ACCEPT_UNLOCK();
474 soabort(sp);
475 ACCEPT_LOCK();
476 }
477 TAILQ_INSERT_TAIL(&head->so_incomp, so, so_list);
478 so->so_qstate |= SQ_INCOMP;
479 head->so_incqlen++;
480 }
481 ACCEPT_UNLOCK();
482 if (connstatus) {
483 sorwakeup(head);
484 wakeup_one(&head->so_timeo);
485 }
486 return (so);
487}
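/*
 * Illustrative sketch (hypothetical protocol code, added for exposition):
 * roughly how a protocol uses sonewconn() when it notes an incoming
 * connection request on a listening socket.  Passing connstatus 0 places
 * the new socket on the incomplete queue; soisconnected() later moves it
 * to the complete queue so that accept() can return it.
 */
static struct socket *
example_incoming_connection(struct socket *head)
{
	struct socket *so;

	so = sonewconn(head, 0);
	if (so == NULL)
		return (NULL);	/* queue full; caller drops the request */
	/* ... attach protocol state to 'so' and complete the handshake ... */
	soisconnected(so);
	return (so);
}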
488
489int
490sobind(so, nam, td)
491 struct socket *so;
492 struct sockaddr *nam;
493 struct thread *td;
494{
495
496 return ((*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td));
497}
498
499/*
500 * solisten() transitions a socket from a non-listening state to a listening
501 * state, but can also be used to update the listen queue depth on an
502 * existing listen socket. The protocol will call back into the sockets
503 * layer using solisten_proto_check() and solisten_proto() to check and set
504 * socket-layer listen state. Call backs are used so that the protocol can
505 * acquire both protocol and socket layer locks in whatever order is required
506 * by the protocol.
507 *
508 * Protocol implementors are advised to hold the socket lock across the
509 * socket-layer test and set to avoid races at the socket layer.
510 */
511int
512solisten(so, backlog, td)
513 struct socket *so;
514 int backlog;
515 struct thread *td;
516{
517
518 return ((*so->so_proto->pr_usrreqs->pru_listen)(so, backlog, td));
519}
520
521int
522solisten_proto_check(so)
523 struct socket *so;
524{
525
526 SOCK_LOCK_ASSERT(so);
527
528 if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING |
529 SS_ISDISCONNECTING))
530 return (EINVAL);
531 return (0);
532}
533
534void
535solisten_proto(so, backlog)
536 struct socket *so;
537 int backlog;
538{
539
540 SOCK_LOCK_ASSERT(so);
541
542 if (backlog < 0 || backlog > somaxconn)
543 backlog = somaxconn;
544 so->so_qlimit = backlog;
545 so->so_options |= SO_ACCEPTCONN;
546}
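/*
 * Illustrative sketch (hypothetical protocol code, added for exposition):
 * the call-back pattern described above as seen from a protocol's
 * pru_listen method.  Real protocols take their own pcb locks around this
 * as well; the socket lock is held across the check and the set.
 */
static int
example_pru_listen(struct socket *so, int backlog, struct thread *td)
{
	int error;

	SOCK_LOCK(so);
	error = solisten_proto_check(so);
	if (error == 0)
		solisten_proto(so, backlog);
	SOCK_UNLOCK(so);
	return (error);
}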
547
548/*
549 * Attempt to free a socket. This should really be sotryfree().
550 *
551 * sofree() will succeed if:
552 *
553 * - There are no outstanding file descriptor references or related consumers
554 * (so_count == 0).
555 *
556 * - The socket has been closed by user space, if ever open (SS_NOFDREF).
557 *
558 * - The protocol does not have an outstanding strong reference on the socket
559 * (SS_PROTOREF).
560 *
561 * - The socket is not in a completed connection queue, where a process may
562 * already have been notified that it is present; if it were removed, the
563 * user process could block in accept() despite select() saying it was ready.
564 *
565 * Otherwise, it will quietly abort so that a future call to sofree(), when
566 * conditions are right, can succeed.
567 */
568void
569sofree(so)
570 struct socket *so;
571{
572 struct protosw *pr = so->so_proto;
573 struct socket *head;
574
575 ACCEPT_LOCK_ASSERT();
576 SOCK_LOCK_ASSERT(so);
577
578 if ((so->so_state & SS_NOFDREF) == 0 || so->so_count != 0 ||
579 (so->so_state & SS_PROTOREF) || (so->so_qstate & SQ_COMP)) {
580 SOCK_UNLOCK(so);
581 ACCEPT_UNLOCK();
582 return;
583 }
584
585 head = so->so_head;
586 if (head != NULL) {
587 KASSERT((so->so_qstate & SQ_COMP) != 0 ||
588 (so->so_qstate & SQ_INCOMP) != 0,
589 ("sofree: so_head != NULL, but neither SQ_COMP nor "
590 "SQ_INCOMP"));
591 KASSERT((so->so_qstate & SQ_COMP) == 0 ||
592 (so->so_qstate & SQ_INCOMP) == 0,
593 ("sofree: so->so_qstate is SQ_COMP and also SQ_INCOMP"));
594 TAILQ_REMOVE(&head->so_incomp, so, so_list);
595 head->so_incqlen--;
596 so->so_qstate &= ~SQ_INCOMP;
597 so->so_head = NULL;
598 }
599 KASSERT((so->so_qstate & SQ_COMP) == 0 &&
600 (so->so_qstate & SQ_INCOMP) == 0,
601 ("sofree: so_head == NULL, but still SQ_COMP(%d) or SQ_INCOMP(%d)",
602 so->so_qstate & SQ_COMP, so->so_qstate & SQ_INCOMP));
603 if (so->so_options & SO_ACCEPTCONN) {
604 KASSERT((TAILQ_EMPTY(&so->so_comp)), ("sofree: so_comp populated"));
605 KASSERT((TAILQ_EMPTY(&so->so_incomp)), ("sofree: so_comp populated"));
606 }
607 SOCK_UNLOCK(so);
608 ACCEPT_UNLOCK();
609
610 if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
611 (*pr->pr_domain->dom_dispose)(so->so_rcv.sb_mb);
612 if (pr->pr_usrreqs->pru_detach != NULL)
613 (*pr->pr_usrreqs->pru_detach)(so);
614
615 /*
616 * From this point on, we assume that no other references to this
617 * socket exist anywhere else in the stack. Therefore, no locks need
618 * to be acquired or held.
619 *
620 * We used to do a lot of socket buffer and socket locking here, as
621 * well as invoke sorflush() and perform wakeups. The direct calls to
622 * dom_dispose() and sbrelease_internal() are an inlining of what was
623 * necessary from sorflush().
624 *
625 * Notice that the socket buffer and kqueue state are torn down
626 * before calling pru_detach. This means that protocols should not
627 * assume they can perform socket wakeups, etc., in their detach
628 * code.
629 */
630 KASSERT((so->so_snd.sb_flags & SB_LOCK) == 0, ("sofree: snd sblock"));
631 KASSERT((so->so_rcv.sb_flags & SB_LOCK) == 0, ("sofree: rcv sblock"));
632 sbdestroy(&so->so_snd, so);
633 sbdestroy(&so->so_rcv, so);
634 knlist_destroy(&so->so_rcv.sb_sel.si_note);
635 knlist_destroy(&so->so_snd.sb_sel.si_note);
636 sodealloc(so);
637}
638
639/*
640 * Close a socket on last file table reference removal. Initiate disconnect
641 * if connected. Free socket when disconnect complete.
642 *
643 * This function will sorele() the socket. Note that soclose() may be called
644 * prior to the ref count reaching zero. The actual socket structure will
645 * not be freed until the ref count reaches zero.
646 */
647int
648soclose(so)
649 struct socket *so;
650{
651 int error = 0;
652
653 KASSERT(!(so->so_state & SS_NOFDREF), ("soclose: SS_NOFDREF on enter"));
654
655 funsetown(&so->so_sigio);
656 if (so->so_state & SS_ISCONNECTED) {
657 if ((so->so_state & SS_ISDISCONNECTING) == 0) {
658 error = sodisconnect(so);
659 if (error)
660 goto drop;
661 }
662 if (so->so_options & SO_LINGER) {
663 if ((so->so_state & SS_ISDISCONNECTING) &&
664 (so->so_state & SS_NBIO))
665 goto drop;
666 while (so->so_state & SS_ISCONNECTED) {
667 error = tsleep(&so->so_timeo,
668 PSOCK | PCATCH, "soclos", so->so_linger * hz);
669 if (error)
670 break;
671 }
672 }
673 }
674
675drop:
676 if (so->so_proto->pr_usrreqs->pru_close != NULL)
677 (*so->so_proto->pr_usrreqs->pru_close)(so);
678 if (so->so_options & SO_ACCEPTCONN) {
679 struct socket *sp;
680 ACCEPT_LOCK();
681 while ((sp = TAILQ_FIRST(&so->so_incomp)) != NULL) {
682 TAILQ_REMOVE(&so->so_incomp, sp, so_list);
683 so->so_incqlen--;
684 sp->so_qstate &= ~SQ_INCOMP;
685 sp->so_head = NULL;
686 ACCEPT_UNLOCK();
687 soabort(sp);
688 ACCEPT_LOCK();
689 }
690 while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
691 TAILQ_REMOVE(&so->so_comp, sp, so_list);
692 so->so_qlen--;
693 sp->so_qstate &= ~SQ_COMP;
694 sp->so_head = NULL;
695 ACCEPT_UNLOCK();
696 soabort(sp);
697 ACCEPT_LOCK();
698 }
699 ACCEPT_UNLOCK();
700 }
701 ACCEPT_LOCK();
702 SOCK_LOCK(so);
703 KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF"));
704 so->so_state |= SS_NOFDREF;
705 sorele(so);
706 return (error);
707}
708
709/*
710 * soabort() is used to abruptly tear down a connection, such as when a
711 * resource limit is reached (listen queue depth exceeded), or if a listen
712 * socket is closed while there are sockets waiting to be accepted.
713 *
714 * This interface is tricky, because it is called on an unreferenced socket,
715 * and must be called only by a thread that has actually removed the socket
716 * from the listen queue it was on, or races with other threads are risked.
717 *
718 * This interface will call into the protocol code, so must not be called
719 * with any socket locks held. Protocols do call it while holding their own
720 * recursible protocol mutexes, but this is something that should be subject
721 * to review in the future.
722 */
723void
724soabort(so)
725 struct socket *so;
726{
727
728 /*
729 * In as much as is possible, assert that no references to this
730 * socket are held. This is not quite the same as asserting that the
731 * current thread is responsible for arranging for no references, but
732 * is as close as we can get for now.
733 */
734 KASSERT(so->so_count == 0, ("soabort: so_count"));
735 KASSERT((so->so_state & SS_PROTOREF) == 0, ("soabort: SS_PROTOREF"));
736 KASSERT(so->so_state & SS_NOFDREF, ("soabort: !SS_NOFDREF"));
737 KASSERT((so->so_state & SQ_COMP) == 0, ("soabort: SQ_COMP"));
738 KASSERT((so->so_state & SQ_INCOMP) == 0, ("soabort: SQ_INCOMP"));
739
740 if (so->so_proto->pr_usrreqs->pru_abort != NULL)
741 (*so->so_proto->pr_usrreqs->pru_abort)(so);
742 ACCEPT_LOCK();
743 SOCK_LOCK(so);
744 sofree(so);
745}
746
747int
748soaccept(so, nam)
749 struct socket *so;
750 struct sockaddr **nam;
751{
752 int error;
753
754 SOCK_LOCK(so);
755 KASSERT((so->so_state & SS_NOFDREF) != 0, ("soaccept: !NOFDREF"));
756 so->so_state &= ~SS_NOFDREF;
757 SOCK_UNLOCK(so);
758 error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
759 return (error);
760}
761
762int
763soconnect(so, nam, td)
764 struct socket *so;
765 struct sockaddr *nam;
766 struct thread *td;
767{
768 int error;
769
770 if (so->so_options & SO_ACCEPTCONN)
771 return (EOPNOTSUPP);
772 /*
773 * If protocol is connection-based, can only connect once.
774 * Otherwise, if connected, try to disconnect first. This allows
775 * user to disconnect by connecting to, e.g., a null address.
776 */
777 if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
778 ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
779 (error = sodisconnect(so)))) {
780 error = EISCONN;
781 } else {
782 /*
783 * Prevent accumulated error from previous connection from
784 * biting us.
785 */
786 so->so_error = 0;
787 error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, td);
788 }
789
790 return (error);
791}
792
793int
794soconnect2(so1, so2)
795 struct socket *so1;
796 struct socket *so2;
797{
798
799 return ((*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2));
800}
801
802int
803sodisconnect(so)
804 struct socket *so;
805{
806 int error;
807
808 if ((so->so_state & SS_ISCONNECTED) == 0)
809 return (ENOTCONN);
810 if (so->so_state & SS_ISDISCONNECTING)
811 return (EALREADY);
812 error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
813 return (error);
814}
815
816#ifdef ZERO_COPY_SOCKETS
817struct so_zerocopy_stats{
818 int size_ok;
819 int align_ok;
820 int found_ifp;
821};
822struct so_zerocopy_stats so_zerocp_stats = {0,0,0};
823#include <netinet/in.h>
824#include <net/route.h>
825#include <netinet/in_pcb.h>
826#include <vm/vm.h>
827#include <vm/vm_page.h>
828#include <vm/vm_object.h>
829
830/*
831 * sosend_copyin() is only used if zero copy sockets are enabled. Otherwise
832 * sosend_dgram() and sosend_generic() use m_uiotombuf().
833 *
834 * sosend_copyin() accepts a uio and prepares an mbuf chain holding part or
835 * all of the data referenced by the uio. If desired, it uses zero-copy.
836 * *space will be updated to reflect data copied in.
837 *
838 * NB: If atomic I/O is requested, the caller must already have checked that
839 * space can hold resid bytes.
840 *
841 * NB: In the event of an error, the caller may need to free the partial
842 * chain pointed to by *mpp. The contents of both *uio and *space may be
843 * modified even in the case of an error.
844 */
845static int
846sosend_copyin(struct uio *uio, struct mbuf **retmp, int atomic, long *space,
847 int flags)
848{
849 struct mbuf *m, **mp, *top;
850 long len, resid;
851 int error;
852#ifdef ZERO_COPY_SOCKETS
853 int cow_send;
854#endif
855
856 *retmp = top = NULL;
857 mp = &top;
858 len = 0;
859 resid = uio->uio_resid;
860 error = 0;
861 do {
862#ifdef ZERO_COPY_SOCKETS
863 cow_send = 0;
864#endif /* ZERO_COPY_SOCKETS */
865 if (resid >= MINCLSIZE) {
866#ifdef ZERO_COPY_SOCKETS
867 if (top == NULL) {
868 m = m_gethdr(M_WAITOK, MT_DATA);
869 m->m_pkthdr.len = 0;
870 m->m_pkthdr.rcvif = NULL;
871 } else
872 m = m_get(M_WAITOK, MT_DATA);
873 if (so_zero_copy_send &&
874 resid>=PAGE_SIZE &&
875 *space>=PAGE_SIZE &&
876 uio->uio_iov->iov_len>=PAGE_SIZE) {
877 so_zerocp_stats.size_ok++;
878 so_zerocp_stats.align_ok++;
879 cow_send = socow_setup(m, uio);
880 len = cow_send;
881 }
882 if (!cow_send) {
883 m_clget(m, M_WAITOK);
884 len = min(min(MCLBYTES, resid), *space);
885 }
886#else /* ZERO_COPY_SOCKETS */
887 if (top == NULL) {
888 m = m_getcl(M_TRYWAIT, MT_DATA, M_PKTHDR);
889 m->m_pkthdr.len = 0;
890 m->m_pkthdr.rcvif = NULL;
891 } else
892 m = m_getcl(M_TRYWAIT, MT_DATA, 0);
893 len = min(min(MCLBYTES, resid), *space);
894#endif /* ZERO_COPY_SOCKETS */
895 } else {
896 if (top == NULL) {
897 m = m_gethdr(M_TRYWAIT, MT_DATA);
898 m->m_pkthdr.len = 0;
899 m->m_pkthdr.rcvif = NULL;
900
901 len = min(min(MHLEN, resid), *space);
902 /*
903 * For datagram protocols, leave room
904 * for protocol headers in first mbuf.
905 */
906 if (atomic && m && len < MHLEN)
907 MH_ALIGN(m, len);
908 } else {
909 m = m_get(M_TRYWAIT, MT_DATA);
910 len = min(min(MLEN, resid), *space);
911 }
912 }
913 if (m == NULL) {
914 error = ENOBUFS;
915 goto out;
916 }
917
918 *space -= len;
919#ifdef ZERO_COPY_SOCKETS
920 if (cow_send)
921 error = 0;
922 else
923#endif /* ZERO_COPY_SOCKETS */
924 error = uiomove(mtod(m, void *), (int)len, uio);
925 resid = uio->uio_resid;
926 m->m_len = len;
927 *mp = m;
928 top->m_pkthdr.len += len;
929 if (error)
930 goto out;
931 mp = &m->m_next;
932 if (resid <= 0) {
933 if (flags & MSG_EOR)
934 top->m_flags |= M_EOR;
935 break;
936 }
937 } while (*space > 0 && atomic);
938out:
939 *retmp = top;
940 return (error);
941}
942#endif /*ZERO_COPY_SOCKETS*/
943
944#define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
945
946int
947sosend_dgram(so, addr, uio, top, control, flags, td)
948 struct socket *so;
949 struct sockaddr *addr;
950 struct uio *uio;
951 struct mbuf *top;
952 struct mbuf *control;
953 int flags;
954 struct thread *td;
955{
956 long space, resid;
957 int clen = 0, error, dontroute;
958#ifdef ZERO_COPY_SOCKETS
959 int atomic = sosendallatonce(so) || top;
960#endif
961
962 KASSERT(so->so_type == SOCK_DGRAM, ("sodgram_send: !SOCK_DGRAM"));
963 KASSERT(so->so_proto->pr_flags & PR_ATOMIC,
964 ("sodgram_send: !PR_ATOMIC"));
965
966 if (uio != NULL)
967 resid = uio->uio_resid;
968 else
969 resid = top->m_pkthdr.len;
970 /*
971 * In theory resid should be unsigned. However, space must be
972 * signed, as it might be less than 0 if we over-committed, and we
973 * must use a signed comparison of space and resid. On the other
974 * hand, a negative resid causes us to loop sending 0-length
975 * segments to the protocol.
976 *
977 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
978 * type sockets since that's an error.
979 */
980 if (resid < 0) {
981 error = EINVAL;
982 goto out;
983 }
984
985 dontroute =
986 (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0;
987 if (td != NULL)
988 td->td_proc->p_stats->p_ru.ru_msgsnd++;
989 if (control != NULL)
990 clen = control->m_len;
991
992 SOCKBUF_LOCK(&so->so_snd);
993 if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
994 SOCKBUF_UNLOCK(&so->so_snd);
995 error = EPIPE;
996 goto out;
997 }
998 if (so->so_error) {
999 error = so->so_error;
1000 so->so_error = 0;
1001 SOCKBUF_UNLOCK(&so->so_snd);
1002 goto out;
1003 }
1004 if ((so->so_state & SS_ISCONNECTED) == 0) {
1005 /*
1006 * `sendto' and `sendmsg' are allowed on a connection-based
1007 * socket if it supports implied connect. Return ENOTCONN if
1008 * not connected and no address is supplied.
1009 */
1010 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
1011 (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
1012 if ((so->so_state & SS_ISCONFIRMING) == 0 &&
1013 !(resid == 0 && clen != 0)) {
1014 SOCKBUF_UNLOCK(&so->so_snd);
1015 error = ENOTCONN;
1016 goto out;
1017 }
1018 } else if (addr == NULL) {
1019 if (so->so_proto->pr_flags & PR_CONNREQUIRED)
1020 error = ENOTCONN;
1021 else
1022 error = EDESTADDRREQ;
1023 SOCKBUF_UNLOCK(&so->so_snd);
1024 goto out;
1025 }
1026 }
1027
1028 /*
1029 * Do we need MSG_OOB support in SOCK_DGRAM? Signs here may be a
1030 * problem and need fixing.
1031 */
1032 space = sbspace(&so->so_snd);
1033 if (flags & MSG_OOB)
1034 space += 1024;
1035 space -= clen;
1036 SOCKBUF_UNLOCK(&so->so_snd);
1037 if (resid > space) {
1038 error = EMSGSIZE;
1039 goto out;
1040 }
1041 if (uio == NULL) {
1042 resid = 0;
1043 if (flags & MSG_EOR)
1044 top->m_flags |= M_EOR;
1045 } else {
1046#ifdef ZERO_COPY_SOCKETS
1047 error = sosend_copyin(uio, &top, atomic, &space, flags);
1048 if (error)
1049 goto out;
1050#else
1051 /*
1052 * Copy the data from userland into a mbuf chain.
1053 * If no data is to be copied in, a single empty mbuf
1054 * is returned.
1055 */
1056 top = m_uiotombuf(uio, M_WAITOK, space, max_hdr,
1057 (M_PKTHDR | ((flags & MSG_EOR) ? M_EOR : 0)));
1058 if (top == NULL) {
1059 error = EFAULT; /* only possible error */
1060 goto out;
1061 }
1062 space -= resid - uio->uio_resid;
1063#endif
1064 resid = uio->uio_resid;
1065 }
1066 KASSERT(resid == 0, ("sosend_dgram: resid != 0"));
1067 /*
1068 * XXXRW: Frobbing SO_DONTROUTE here is even worse without sblock
1069 * than with.
1070 */
1071 if (dontroute) {
1072 SOCK_LOCK(so);
1073 so->so_options |= SO_DONTROUTE;
1074 SOCK_UNLOCK(so);
1075 }
1076 /*
1077 * XXX all the SBS_CANTSENDMORE checks previously done could be out
1078 * of date. We could have received a reset packet in an interrupt or
1079 * maybe we slept while doing page faults in uiomove() etc. We could
1080 * probably recheck again inside the locking protection here, but
1081 * there are probably other places that this also happens. We must
1082 * rethink this.
1083 */
1084 error = (*so->so_proto->pr_usrreqs->pru_send)(so,
1085 (flags & MSG_OOB) ? PRUS_OOB :
1086 /*
1087 * If the user set MSG_EOF, the protocol understands this flag, and
1088 * there is nothing left to send, then use PRU_SEND_EOF instead of PRU_SEND.
1089 */
1090 ((flags & MSG_EOF) &&
1091 (so->so_proto->pr_flags & PR_IMPLOPCL) &&
1092 (resid <= 0)) ?
1093 PRUS_EOF :
1094 /* If there is more to send set PRUS_MORETOCOME */
1095 (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
1096 top, addr, control, td);
1097 if (dontroute) {
1098 SOCK_LOCK(so);
1099 so->so_options &= ~SO_DONTROUTE;
1100 SOCK_UNLOCK(so);
1101 }
1102 clen = 0;
1103 control = NULL;
1104 top = NULL;
1105out:
1106 if (top != NULL)
1107 m_freem(top);
1108 if (control != NULL)
1109 m_freem(control);
1110 return (error);
1111}
1112
1113/*
1114 * Send on a socket. If send must go all at once and message is larger than
1115 * send buffering, then hard error. Lock against other senders. If must go
1116 * all at once and not enough room now, then inform user that this would
1117 * block and do nothing. Otherwise, if nonblocking, send as much as
1118 * possible. The data to be sent is described by "uio" if nonzero, otherwise
1119 * by the mbuf chain "top" (which must be null if uio is not). Data provided
1120 * in mbuf chain must be small enough to send all at once.
1121 *
1122 * Returns nonzero on error, timeout or signal; callers must check for short
1123 * counts if EINTR/ERESTART are returned. Data and control buffers are freed
1124 * on return.
1125 */
1126#define snderr(errno) { error = (errno); goto release; }
1127int
1128sosend_generic(so, addr, uio, top, control, flags, td)
1129 struct socket *so;
1130 struct sockaddr *addr;
1131 struct uio *uio;
1132 struct mbuf *top;
1133 struct mbuf *control;
1134 int flags;
1135 struct thread *td;
1136{
1137 long space, resid;
1138 int clen = 0, error, dontroute;
1139 int atomic = sosendallatonce(so) || top;
1140
1141 if (uio != NULL)
1142 resid = uio->uio_resid;
1143 else
1144 resid = top->m_pkthdr.len;
1145 /*
1146 * In theory resid should be unsigned. However, space must be
1147 * signed, as it might be less than 0 if we over-committed, and we
1148 * must use a signed comparison of space and resid. On the other
1149 * hand, a negative resid causes us to loop sending 0-length
1150 * segments to the protocol.
1151 *
1152 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
1153 * type sockets since that's an error.
1154 */
1155 if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
1156 error = EINVAL;
1157 goto out;
1158 }
1159
1160 dontroute =
1161 (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
1162 (so->so_proto->pr_flags & PR_ATOMIC);
1163 if (td != NULL)
1164 td->td_proc->p_stats->p_ru.ru_msgsnd++;
1165 if (control != NULL)
1166 clen = control->m_len;
1167
1168 SOCKBUF_LOCK(&so->so_snd);
1169restart:
1170 SOCKBUF_LOCK_ASSERT(&so->so_snd);
1171 error = sblock(&so->so_snd, SBLOCKWAIT(flags));
1172 if (error)
1173 goto out_locked;
1174 do {
1175 SOCKBUF_LOCK_ASSERT(&so->so_snd);
1176 if (so->so_snd.sb_state & SBS_CANTSENDMORE)
1177 snderr(EPIPE);
1178 if (so->so_error) {
1179 error = so->so_error;
1180 so->so_error = 0;
1181 goto release;
1182 }
1183 if ((so->so_state & SS_ISCONNECTED) == 0) {
1184 /*
1185 * `sendto' and `sendmsg' is allowed on a connection-
1186 * based socket if it supports implied connect.
1187 * Return ENOTCONN if not connected and no address is
1188 * supplied.
1189 */
1190 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
1191 (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
1192 if ((so->so_state & SS_ISCONFIRMING) == 0 &&
1193 !(resid == 0 && clen != 0))
1194 snderr(ENOTCONN);
1195 } else if (addr == NULL)
1196 snderr(so->so_proto->pr_flags & PR_CONNREQUIRED ?
1197 ENOTCONN : EDESTADDRREQ);
1198 }
1199 space = sbspace(&so->so_snd);
1200 if (flags & MSG_OOB)
1201 space += 1024;
1202 if ((atomic && resid > so->so_snd.sb_hiwat) ||
1203 clen > so->so_snd.sb_hiwat)
1204 snderr(EMSGSIZE);
1205 if (space < resid + clen &&
1206 (atomic || space < so->so_snd.sb_lowat || space < clen)) {
1207 if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO))
1208 snderr(EWOULDBLOCK);
1209 sbunlock(&so->so_snd);
1210 error = sbwait(&so->so_snd);
1211 if (error)
1212 goto out_locked;
1213 goto restart;
1214 }
1215 SOCKBUF_UNLOCK(&so->so_snd);
1216 space -= clen;
1217 do {
1218 if (uio == NULL) {
1219 resid = 0;
1220 if (flags & MSG_EOR)
1221 top->m_flags |= M_EOR;
1222 } else {
1223#ifdef ZERO_COPY_SOCKETS
1224 error = sosend_copyin(uio, &top, atomic,
1225 &space, flags);
1226 if (error != 0) {
1227 SOCKBUF_LOCK(&so->so_snd);
1228 goto release;
1229 }
1230#else
1231 /*
1232 * Copy the data from userland into a mbuf
1233 * chain. If no data is to be copied in,
1234 * a single empty mbuf is returned.
1235 */
1236 top = m_uiotombuf(uio, M_WAITOK, space,
1237 (atomic ? max_hdr : 0),
1238 (atomic ? M_PKTHDR : 0) |
1239 ((flags & MSG_EOR) ? M_EOR : 0));
1240 if (top == NULL) {
1241 SOCKBUF_LOCK(&so->so_snd);
1242 error = EFAULT; /* only possible error */
1243 goto release;
1244 }
1245 space -= resid - uio->uio_resid;
1246#endif
1247 resid = uio->uio_resid;
1248 }
1249 if (dontroute) {
1250 SOCK_LOCK(so);
1251 so->so_options |= SO_DONTROUTE;
1252 SOCK_UNLOCK(so);
1253 }
1254 /*
1255 * XXX all the SBS_CANTSENDMORE checks previously
1256		 * done could be out of date.  We could have received
1257 * a reset packet in an interrupt or maybe we slept
1258 * while doing page faults in uiomove() etc. We
1259 * could probably recheck again inside the locking
1260 * protection here, but there are probably other
1261 * places that this also happens. We must rethink
1262 * this.
1263 */
1264 error = (*so->so_proto->pr_usrreqs->pru_send)(so,
1265 (flags & MSG_OOB) ? PRUS_OOB :
1266 /*
1267 * If the user set MSG_EOF, the protocol understands
1268			 * this flag, and there is nothing left to send, then use
1269 * PRU_SEND_EOF instead of PRU_SEND.
1270 */
1271 ((flags & MSG_EOF) &&
1272 (so->so_proto->pr_flags & PR_IMPLOPCL) &&
1273 (resid <= 0)) ?
1274 PRUS_EOF :
1275 /* If there is more to send set PRUS_MORETOCOME. */
1276 (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
1277 top, addr, control, td);
1278 if (dontroute) {
1279 SOCK_LOCK(so);
1280 so->so_options &= ~SO_DONTROUTE;
1281 SOCK_UNLOCK(so);
1282 }
1283 clen = 0;
1284 control = NULL;
1285 top = NULL;
1286 if (error) {
1287 SOCKBUF_LOCK(&so->so_snd);
1288 goto release;
1289 }
1290 } while (resid && space > 0);
1291 SOCKBUF_LOCK(&so->so_snd);
1292 } while (resid);
1293
1294release:
1295 SOCKBUF_LOCK_ASSERT(&so->so_snd);
1296 sbunlock(&so->so_snd);
1297out_locked:
1298 SOCKBUF_LOCK_ASSERT(&so->so_snd);
1299 SOCKBUF_UNLOCK(&so->so_snd);
1300out:
1301 if (top != NULL)
1302 m_freem(top);
1303 if (control != NULL)
1304 m_freem(control);
1305 return (error);
1306}
1307#undef snderr
1308
1309int
1310sosend(so, addr, uio, top, control, flags, td)
1311 struct socket *so;
1312 struct sockaddr *addr;
1313 struct uio *uio;
1314 struct mbuf *top;
1315 struct mbuf *control;
1316 int flags;
1317 struct thread *td;
1318{
1319
1320 /* XXXRW: Temporary debugging. */
1321 KASSERT(so->so_proto->pr_usrreqs->pru_sosend != sosend,
1322 ("sosend: protocol calls sosend"));
1323
1324 return (so->so_proto->pr_usrreqs->pru_sosend(so, addr, uio, top,
1325 control, flags, td));
1326}
1327
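/*
 * Illustrative sketch, not part of this file: how a kernel consumer might
 * drive sosend() on an already-connected socket, describing the data with a
 * uio as discussed in the sosend_generic() comment above.  The function name
 * and its callers are hypothetical.
 */
static int
example_kernel_send(struct socket *so, void *data, size_t len,
    struct thread *td)
{
	struct uio auio;
	struct iovec aiov;

	aiov.iov_base = data;
	aiov.iov_len = len;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = 0;
	auio.uio_resid = len;
	auio.uio_segflg = UIO_SYSSPACE;	/* data lives in kernel memory */
	auio.uio_rw = UIO_WRITE;
	auio.uio_td = td;

	/* addr, top and control are NULL: connected socket, data via uio. */
	return (sosend(so, NULL, &auio, NULL, NULL, 0, td));
}
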
1328/*
1329 * The part of soreceive() that implements reading non-inline out-of-band
1330 * data from a socket. For more complete comments, see soreceive(), from
1331 * which this code originated.
1332 *
1333 * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is
1334 * unable to return an mbuf chain to the caller.
1335 */
1336static int
1337soreceive_rcvoob(so, uio, flags)
1338 struct socket *so;
1339 struct uio *uio;
1340 int flags;
1341{
1342 struct protosw *pr = so->so_proto;
1343 struct mbuf *m;
1344 int error;
1345
1346 KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0"));
1347
1348 m = m_get(M_TRYWAIT, MT_DATA);
1349 if (m == NULL)
1350 return (ENOBUFS);
1351 error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
1352 if (error)
1353 goto bad;
1354 do {
1355#ifdef ZERO_COPY_SOCKETS
1356 if (so_zero_copy_receive) {
1357 int disposable;
1358
1359 if ((m->m_flags & M_EXT)
1360 && (m->m_ext.ext_type == EXT_DISPOSABLE))
1361 disposable = 1;
1362 else
1363 disposable = 0;
1364
1365 error = uiomoveco(mtod(m, void *),
1366 min(uio->uio_resid, m->m_len),
1367 uio, disposable);
1368 } else
1369#endif /* ZERO_COPY_SOCKETS */
1370 error = uiomove(mtod(m, void *),
1371 (int) min(uio->uio_resid, m->m_len), uio);
1372 m = m_free(m);
1373 } while (uio->uio_resid && error == 0 && m);
1374bad:
1375 if (m != NULL)
1376 m_freem(m);
1377 return (error);
1378}
1379
1380/*
1381 * Following replacement or removal of the first mbuf on the first mbuf chain
1382 * of a socket buffer, push necessary state changes back into the socket
1383 * buffer so that other consumers see the values consistently. 'nextrecord'
1384 * is the caller's locally stored value of the original value of
1385 * sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes.
1386 * NOTE: 'nextrecord' may be NULL.
1387 */
1388static __inline void
1389sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord)
1390{
1391
1392 SOCKBUF_LOCK_ASSERT(sb);
1393 /*
1394 * First, update for the new value of nextrecord. If necessary, make
1395 * it the first record.
1396 */
1397 if (sb->sb_mb != NULL)
1398 sb->sb_mb->m_nextpkt = nextrecord;
1399 else
1400 sb->sb_mb = nextrecord;
1401
1402 /*
1403 * Now update any dependent socket buffer fields to reflect the new
1404 * state. This is an expanded inline of SB_EMPTY_FIXUP(), with the
1405 * addition of a second clause that takes care of the case where
1406 * sb_mb has been updated, but remains the last record.
1407 */
1408 if (sb->sb_mb == NULL) {
1409 sb->sb_mbtail = NULL;
1410 sb->sb_lastrecord = NULL;
1411 } else if (sb->sb_mb->m_nextpkt == NULL)
1412 sb->sb_lastrecord = sb->sb_mb;
1413}
1414
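/*
 * Illustrative sketch, not part of this file: the caller pattern that
 * sockbuf_pushsync() expects, mirroring the PR_ADDR handling in
 * soreceive_generic() below.  The function name is hypothetical.
 */
static void
example_drop_lead_mbuf(struct socket *so)
{
	struct mbuf *m, *nextrecord;

	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	m = so->so_rcv.sb_mb;
	if (m == NULL)
		return;
	nextrecord = m->m_nextpkt;	/* cache before sb_mb changes */
	sbfree(&so->so_rcv, m);		/* fix byte/mbuf accounting */
	so->so_rcv.sb_mb = m_free(m);	/* unlink and free the lead mbuf */
	sockbuf_pushsync(&so->so_rcv, nextrecord);
}
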
1415
1416/*
1417 * Implement receive operations on a socket. We depend on the way that
1418 * records are added to the sockbuf by sbappend. In particular, each record
1419 * (mbufs linked through m_next) must begin with an address if the protocol
1420 * so specifies, followed by an optional mbuf or mbufs containing ancillary
1421 * data, and then zero or more mbufs of data. In order to allow parallelism
1422 * between network receive and copying to user space, as well as avoid
1423 * sleeping with a mutex held, we release the socket buffer mutex during the
1424 * user space copy. Although the sockbuf is locked, new data may still be
1425 * appended, and thus we must maintain consistency of the sockbuf during that
1426 * time.
1427 *
1428 * The caller may receive the data as a single mbuf chain by supplying an
1429 * mbuf **mp0 for use in returning the chain. The uio is then used only for
1430 * the count in uio_resid.
1431 */
1432int
1433soreceive_generic(so, psa, uio, mp0, controlp, flagsp)
1434 struct socket *so;
1435 struct sockaddr **psa;
1436 struct uio *uio;
1437 struct mbuf **mp0;
1438 struct mbuf **controlp;
1439 int *flagsp;
1440{
1441 struct mbuf *m, **mp;
1442 int flags, len, error, offset;
1443 struct protosw *pr = so->so_proto;
1444 struct mbuf *nextrecord;
1445 int moff, type = 0;
1446 int orig_resid = uio->uio_resid;
1447
1448 mp = mp0;
1449 if (psa != NULL)
1450 *psa = NULL;
1451 if (controlp != NULL)
1452 *controlp = NULL;
1453 if (flagsp != NULL)
1454 flags = *flagsp &~ MSG_EOR;
1455 else
1456 flags = 0;
1457 if (flags & MSG_OOB)
1458 return (soreceive_rcvoob(so, uio, flags));
1459 if (mp != NULL)
1460 *mp = NULL;
1461 if ((pr->pr_flags & PR_WANTRCVD) && (so->so_state & SS_ISCONFIRMING)
1462 && uio->uio_resid)
1463 (*pr->pr_usrreqs->pru_rcvd)(so, 0);
1464
1465 SOCKBUF_LOCK(&so->so_rcv);
1466restart:
1467 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1468 error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
1469 if (error)
1470 goto out;
1471
1472 m = so->so_rcv.sb_mb;
1473 /*
1474 * If we have less data than requested, block awaiting more (subject
1475 * to any timeout) if:
1476 * 1. the current count is less than the low water mark, or
1477 * 2. MSG_WAITALL is set, and it is possible to do the entire
1478	 *	receive operation at once if we block (resid <= hiwat), and
1479	 *   3. MSG_DONTWAIT is not set.
1480 * If MSG_WAITALL is set but resid is larger than the receive buffer,
1481 * we have to do the receive in sections, and thus risk returning a
1482 * short count if a timeout or signal occurs after we start.
1483 */
1484 if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
1485 so->so_rcv.sb_cc < uio->uio_resid) &&
1486 (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
1487 ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
1488 m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
1489 KASSERT(m != NULL || !so->so_rcv.sb_cc,
1490 ("receive: m == %p so->so_rcv.sb_cc == %u",
1491 m, so->so_rcv.sb_cc));
1492 if (so->so_error) {
1493 if (m != NULL)
1494 goto dontblock;
1495 error = so->so_error;
1496 if ((flags & MSG_PEEK) == 0)
1497 so->so_error = 0;
1498 goto release;
1499 }
1500 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1501 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
1502 if (m)
1503 goto dontblock;
1504 else
1505 goto release;
1506 }
1507 for (; m != NULL; m = m->m_next)
1508 if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
1509 m = so->so_rcv.sb_mb;
1510 goto dontblock;
1511 }
1512 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
1513 (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
1514 error = ENOTCONN;
1515 goto release;
1516 }
1517 if (uio->uio_resid == 0)
1518 goto release;
1519 if ((so->so_state & SS_NBIO) ||
1520 (flags & (MSG_DONTWAIT|MSG_NBIO))) {
1521 error = EWOULDBLOCK;
1522 goto release;
1523 }
1524 SBLASTRECORDCHK(&so->so_rcv);
1525 SBLASTMBUFCHK(&so->so_rcv);
1526 sbunlock(&so->so_rcv);
1527 error = sbwait(&so->so_rcv);
1528 if (error)
1529 goto out;
1530 goto restart;
1531 }
1532dontblock:
1533 /*
1534 * From this point onward, we maintain 'nextrecord' as a cache of the
1535 * pointer to the next record in the socket buffer. We must keep the
1536 * various socket buffer pointers and local stack versions of the
1537 * pointers in sync, pushing out modifications before dropping the
1538 * socket buffer mutex, and re-reading them when picking it up.
1539 *
1540 * Otherwise, we will race with the network stack appending new data
1541 * or records onto the socket buffer by using inconsistent/stale
1542 * versions of the field, possibly resulting in socket buffer
1543 * corruption.
1544 *
1545 * By holding the high-level sblock(), we prevent simultaneous
1546 * readers from pulling off the front of the socket buffer.
1547 */
1548 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1549 if (uio->uio_td)
1550 uio->uio_td->td_proc->p_stats->p_ru.ru_msgrcv++;
1551 KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb"));
1552 SBLASTRECORDCHK(&so->so_rcv);
1553 SBLASTMBUFCHK(&so->so_rcv);
1554 nextrecord = m->m_nextpkt;
1555 if (pr->pr_flags & PR_ADDR) {
1556 KASSERT(m->m_type == MT_SONAME,
1557 ("m->m_type == %d", m->m_type));
1558 orig_resid = 0;
1559 if (psa != NULL)
1560 *psa = sodupsockaddr(mtod(m, struct sockaddr *),
1561 M_NOWAIT);
1562 if (flags & MSG_PEEK) {
1563 m = m->m_next;
1564 } else {
1565 sbfree(&so->so_rcv, m);
1566 so->so_rcv.sb_mb = m_free(m);
1567 m = so->so_rcv.sb_mb;
1568 sockbuf_pushsync(&so->so_rcv, nextrecord);
1569 }
1570 }
1571
1572 /*
1573 * Process one or more MT_CONTROL mbufs present before any data mbufs
1574 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
1575 * just copy the data; if !MSG_PEEK, we call into the protocol to
1576 * perform externalization (or freeing if controlp == NULL).
1577 */
1578 if (m != NULL && m->m_type == MT_CONTROL) {
1579 struct mbuf *cm = NULL, *cmn;
1580 struct mbuf **cme = &cm;
1581
1582 do {
1583 if (flags & MSG_PEEK) {
1584 if (controlp != NULL) {
1585 *controlp = m_copy(m, 0, m->m_len);
1586 controlp = &(*controlp)->m_next;
1587 }
1588 m = m->m_next;
1589 } else {
1590 sbfree(&so->so_rcv, m);
1591 so->so_rcv.sb_mb = m->m_next;
1592 m->m_next = NULL;
1593 *cme = m;
1594 cme = &(*cme)->m_next;
1595 m = so->so_rcv.sb_mb;
1596 }
1597 } while (m != NULL && m->m_type == MT_CONTROL);
1598 if ((flags & MSG_PEEK) == 0)
1599 sockbuf_pushsync(&so->so_rcv, nextrecord);
1600 while (cm != NULL) {
1601 cmn = cm->m_next;
1602 cm->m_next = NULL;
1603 if (pr->pr_domain->dom_externalize != NULL) {
1604 SOCKBUF_UNLOCK(&so->so_rcv);
1605 error = (*pr->pr_domain->dom_externalize)
1606 (cm, controlp);
1607 SOCKBUF_LOCK(&so->so_rcv);
1608 } else if (controlp != NULL)
1609 *controlp = cm;
1610 else
1611 m_freem(cm);
1612 if (controlp != NULL) {
1613 orig_resid = 0;
1614 while (*controlp != NULL)
1615 controlp = &(*controlp)->m_next;
1616 }
1617 cm = cmn;
1618 }
1619 if (m != NULL)
1620 nextrecord = so->so_rcv.sb_mb->m_nextpkt;
1621 else
1622 nextrecord = so->so_rcv.sb_mb;
1623 orig_resid = 0;
1624 }
1625 if (m != NULL) {
1626 if ((flags & MSG_PEEK) == 0) {
1627 KASSERT(m->m_nextpkt == nextrecord,
1628 ("soreceive: post-control, nextrecord !sync"));
1629 if (nextrecord == NULL) {
1630 KASSERT(so->so_rcv.sb_mb == m,
1631 ("soreceive: post-control, sb_mb!=m"));
1632 KASSERT(so->so_rcv.sb_lastrecord == m,
1633 ("soreceive: post-control, lastrecord!=m"));
1634 }
1635 }
1636 type = m->m_type;
1637 if (type == MT_OOBDATA)
1638 flags |= MSG_OOB;
1639 } else {
1640 if ((flags & MSG_PEEK) == 0) {
1641 KASSERT(so->so_rcv.sb_mb == nextrecord,
1642 ("soreceive: sb_mb != nextrecord"));
1643 if (so->so_rcv.sb_mb == NULL) {
1644 KASSERT(so->so_rcv.sb_lastrecord == NULL,
1645				    ("soreceive: sb_lastrecord != NULL"));
1646 }
1647 }
1648 }
1649 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1650 SBLASTRECORDCHK(&so->so_rcv);
1651 SBLASTMBUFCHK(&so->so_rcv);
1652
1653 /*
1654 * Now continue to read any data mbufs off of the head of the socket
1655 * buffer until the read request is satisfied. Note that 'type' is
1656 * used to store the type of any mbuf reads that have happened so far
1657 * such that soreceive() can stop reading if the type changes, which
1658 * causes soreceive() to return only one of regular data and inline
1659 * out-of-band data in a single socket receive operation.
1660 */
1661 moff = 0;
1662 offset = 0;
1663 while (m != NULL && uio->uio_resid > 0 && error == 0) {
1664 /*
1665 * If the type of mbuf has changed since the last mbuf
1666 * examined ('type'), end the receive operation.
1667 */
1668 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1669 if (m->m_type == MT_OOBDATA) {
1670 if (type != MT_OOBDATA)
1671 break;
1672 } else if (type == MT_OOBDATA)
1673 break;
1674 else
1675 KASSERT(m->m_type == MT_DATA,
1676 ("m->m_type == %d", m->m_type));
1677 so->so_rcv.sb_state &= ~SBS_RCVATMARK;
1678 len = uio->uio_resid;
1679 if (so->so_oobmark && len > so->so_oobmark - offset)
1680 len = so->so_oobmark - offset;
1681 if (len > m->m_len - moff)
1682 len = m->m_len - moff;
1683 /*
1684 * If mp is set, just pass back the mbufs. Otherwise copy
1685 * them out via the uio, then free. Sockbuf must be
1686 * consistent here (points to current mbuf, it points to next
1687		 * consistent here (it points to the current mbuf, which points to
1688		 * the next record) when we drop priority; we must note any additions
1689 */
1690 if (mp == NULL) {
1691 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1692 SBLASTRECORDCHK(&so->so_rcv);
1693 SBLASTMBUFCHK(&so->so_rcv);
1694 SOCKBUF_UNLOCK(&so->so_rcv);
1695#ifdef ZERO_COPY_SOCKETS
1696 if (so_zero_copy_receive) {
1697 int disposable;
1698
1699 if ((m->m_flags & M_EXT)
1700 && (m->m_ext.ext_type == EXT_DISPOSABLE))
1701 disposable = 1;
1702 else
1703 disposable = 0;
1704
1705 error = uiomoveco(mtod(m, char *) + moff,
1706 (int)len, uio,
1707 disposable);
1708 } else
1709#endif /* ZERO_COPY_SOCKETS */
1710 error = uiomove(mtod(m, char *) + moff, (int)len, uio);
1711 SOCKBUF_LOCK(&so->so_rcv);
1712 if (error) {
1713 /*
1714 * The MT_SONAME mbuf has already been removed
1715 * from the record, so it is necessary to
1716 * remove the data mbufs, if any, to preserve
1717 * the invariant in the case of PR_ADDR that
1718 * requires MT_SONAME mbufs at the head of
1719 * each record.
1720 */
1721 if (m && pr->pr_flags & PR_ATOMIC &&
1722 ((flags & MSG_PEEK) == 0))
1723 (void)sbdroprecord_locked(&so->so_rcv);
1724 goto release;
1725 }
1726 } else
1727 uio->uio_resid -= len;
1728 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1729 if (len == m->m_len - moff) {
1730 if (m->m_flags & M_EOR)
1731 flags |= MSG_EOR;
1732 if (flags & MSG_PEEK) {
1733 m = m->m_next;
1734 moff = 0;
1735 } else {
1736 nextrecord = m->m_nextpkt;
1737 sbfree(&so->so_rcv, m);
1738 if (mp != NULL) {
1739 *mp = m;
1740 mp = &m->m_next;
1741 so->so_rcv.sb_mb = m = m->m_next;
1742 *mp = NULL;
1743 } else {
1744 so->so_rcv.sb_mb = m_free(m);
1745 m = so->so_rcv.sb_mb;
1746 }
1747 sockbuf_pushsync(&so->so_rcv, nextrecord);
1748 SBLASTRECORDCHK(&so->so_rcv);
1749 SBLASTMBUFCHK(&so->so_rcv);
1750 }
1751 } else {
1752 if (flags & MSG_PEEK)
1753 moff += len;
1754 else {
1755 if (mp != NULL) {
1756 int copy_flag;
1757
1758 if (flags & MSG_DONTWAIT)
1759 copy_flag = M_DONTWAIT;
1760 else
1761 copy_flag = M_TRYWAIT;
1762 if (copy_flag == M_TRYWAIT)
1763 SOCKBUF_UNLOCK(&so->so_rcv);
1764 *mp = m_copym(m, 0, len, copy_flag);
1765 if (copy_flag == M_TRYWAIT)
1766 SOCKBUF_LOCK(&so->so_rcv);
1767 if (*mp == NULL) {
1768 /*
1769 * m_copym() couldn't
1770 * allocate an mbuf. Adjust
1771 * uio_resid back (it was
1772 * adjusted down by len
1773 * bytes, which we didn't end
1774 * up "copying" over).
1775 */
1776 uio->uio_resid += len;
1777 break;
1778 }
1779 }
1780 m->m_data += len;
1781 m->m_len -= len;
1782 so->so_rcv.sb_cc -= len;
1783 }
1784 }
1785 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1786 if (so->so_oobmark) {
1787 if ((flags & MSG_PEEK) == 0) {
1788 so->so_oobmark -= len;
1789 if (so->so_oobmark == 0) {
1790 so->so_rcv.sb_state |= SBS_RCVATMARK;
1791 break;
1792 }
1793 } else {
1794 offset += len;
1795 if (offset == so->so_oobmark)
1796 break;
1797 }
1798 }
1799 if (flags & MSG_EOR)
1800 break;
1801 /*
1802 * If the MSG_WAITALL flag is set (for non-atomic socket), we
1803 * must not quit until "uio->uio_resid == 0" or an error
1804 * termination. If a signal/timeout occurs, return with a
1805 * short count but without error. Keep sockbuf locked
1806 * against other readers.
1807 */
1808 while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
1809 !sosendallatonce(so) && nextrecord == NULL) {
1810 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1811 if (so->so_error || so->so_rcv.sb_state & SBS_CANTRCVMORE)
1812 break;
1813 /*
1814 * Notify the protocol that some data has been
1815 * drained before blocking.
1816 */
1817 if (pr->pr_flags & PR_WANTRCVD) {
1818 SOCKBUF_UNLOCK(&so->so_rcv);
1819 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
1820 SOCKBUF_LOCK(&so->so_rcv);
1821 }
1822 SBLASTRECORDCHK(&so->so_rcv);
1823 SBLASTMBUFCHK(&so->so_rcv);
1824 error = sbwait(&so->so_rcv);
1825 if (error)
1826 goto release;
1827 m = so->so_rcv.sb_mb;
1828 if (m != NULL)
1829 nextrecord = m->m_nextpkt;
1830 }
1831 }
1832
1833 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1834 if (m != NULL && pr->pr_flags & PR_ATOMIC) {
1835 flags |= MSG_TRUNC;
1836 if ((flags & MSG_PEEK) == 0)
1837 (void) sbdroprecord_locked(&so->so_rcv);
1838 }
1839 if ((flags & MSG_PEEK) == 0) {
1840 if (m == NULL) {
1841 /*
1842 * First part is an inline SB_EMPTY_FIXUP(). Second
1843 * part makes sure sb_lastrecord is up-to-date if
1844 * there is still data in the socket buffer.
1845 */
1846 so->so_rcv.sb_mb = nextrecord;
1847 if (so->so_rcv.sb_mb == NULL) {
1848 so->so_rcv.sb_mbtail = NULL;
1849 so->so_rcv.sb_lastrecord = NULL;
1850 } else if (nextrecord->m_nextpkt == NULL)
1851 so->so_rcv.sb_lastrecord = nextrecord;
1852 }
1853 SBLASTRECORDCHK(&so->so_rcv);
1854 SBLASTMBUFCHK(&so->so_rcv);
1855 /*
1856 * If soreceive() is being done from the socket callback,
1857		 * then we need not generate an ACK to the peer to update the window,
1858		 * since the ACK will be generated on return to TCP.
1859 */
1860 if (!(flags & MSG_SOCALLBCK) &&
1861 (pr->pr_flags & PR_WANTRCVD)) {
1862 SOCKBUF_UNLOCK(&so->so_rcv);
1863 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
1864 SOCKBUF_LOCK(&so->so_rcv);
1865 }
1866 }
1867 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1868 if (orig_resid == uio->uio_resid && orig_resid &&
1869 (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) {
1870 sbunlock(&so->so_rcv);
1871 goto restart;
1872 }
1873
1874 if (flagsp != NULL)
1875 *flagsp |= flags;
1876release:
1877 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1878 sbunlock(&so->so_rcv);
1879out:
1880 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1881 SOCKBUF_UNLOCK(&so->so_rcv);
1882 return (error);
1883}
1884
1885int
1886soreceive(so, psa, uio, mp0, controlp, flagsp)
1887 struct socket *so;
1888 struct sockaddr **psa;
1889 struct uio *uio;
1890 struct mbuf **mp0;
1891 struct mbuf **controlp;
1892 int *flagsp;
1893{
1894
1895 /* XXXRW: Temporary debugging. */
1896 KASSERT(so->so_proto->pr_usrreqs->pru_soreceive != soreceive,
1897 ("soreceive: protocol calls soreceive"));
1898
1899 return (so->so_proto->pr_usrreqs->pru_soreceive(so, psa, uio, mp0,
1900 controlp, flagsp));
1901}
1902
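/*
 * Illustrative sketch, not part of this file: a kernel consumer receiving
 * data through soreceive() into a kernel buffer.  psa, mp0 and controlp are
 * NULL, so only the data portion of each record is copied out.  The function
 * name is hypothetical.
 */
static int
example_kernel_recv(struct socket *so, void *buf, size_t len,
    struct thread *td)
{
	struct uio auio;
	struct iovec aiov;
	int flags;

	aiov.iov_base = buf;
	aiov.iov_len = len;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = 0;
	auio.uio_resid = len;
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_rw = UIO_READ;
	auio.uio_td = td;

	flags = MSG_DONTWAIT;	/* fail with EWOULDBLOCK rather than sleep */
	return (soreceive(so, NULL, &auio, NULL, NULL, &flags));
}
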
1903int
1904soshutdown(so, how)
1905 struct socket *so;
1906 int how;
1907{
1908 struct protosw *pr = so->so_proto;
1909
1910 if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
1911 return (EINVAL);
1912
1913 if (how != SHUT_WR)
1914 sorflush(so);
1915 if (how != SHUT_RD)
1916 return ((*pr->pr_usrreqs->pru_shutdown)(so));
1917 return (0);
1918}
1919
1920void
1921sorflush(so)
1922 struct socket *so;
1923{
1924 struct sockbuf *sb = &so->so_rcv;
1925 struct protosw *pr = so->so_proto;
1926 struct sockbuf asb;
1927
1928 /*
1929 * XXXRW: This is quite ugly. Previously, this code made a copy of
1930 * the socket buffer, then zero'd the original to clear the buffer
1931 * fields. However, with mutexes in the socket buffer, this causes
1932 * problems. We only clear the zeroable bits of the original;
1933 * however, we have to initialize and destroy the mutex in the copy
1934	 * so that dom_dispose() and sbrelease() can lock it as needed.
1935 */
1936 SOCKBUF_LOCK(sb);
1937 sb->sb_flags |= SB_NOINTR;
1938 (void) sblock(sb, M_WAITOK);
1939 /*
1940 * socantrcvmore_locked() drops the socket buffer mutex so that it
1941 * can safely perform wakeups. Re-acquire the mutex before
1942 * continuing.
1943 */
1944 socantrcvmore_locked(so);
1945 SOCKBUF_LOCK(sb);
1946 sbunlock(sb);
1947 /*
1948 * Invalidate/clear most of the sockbuf structure, but leave selinfo
1949 * and mutex data unchanged.
1950 */
1951 bzero(&asb, offsetof(struct sockbuf, sb_startzero));
1952 bcopy(&sb->sb_startzero, &asb.sb_startzero,
1953 sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
1954 bzero(&sb->sb_startzero,
1955 sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
1956 SOCKBUF_UNLOCK(sb);
1957
1958 SOCKBUF_LOCK_INIT(&asb, "so_rcv");
1959 if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
1960 (*pr->pr_domain->dom_dispose)(asb.sb_mb);
1961 sbrelease(&asb, so);
1962 SOCKBUF_LOCK_DESTROY(&asb);
1963}
1964
1965/*
1966 * Perhaps this routine, and sooptcopyout(), below, ought to come in an
1967 * additional variant to handle the case where the option value needs to be
1968 * some kind of integer, but not a specific size. In addition to their use
1969 * here, these functions are also called by the protocol-level pr_ctloutput()
1970 * routines.
1971 */
1972int
1973sooptcopyin(sopt, buf, len, minlen)
1974 struct sockopt *sopt;
1975 void *buf;
1976 size_t len;
1977 size_t minlen;
1978{
1979 size_t valsize;
1980
1981 /*
1982 * If the user gives us more than we wanted, we ignore it, but if we
1983 * don't get the minimum length the caller wants, we return EINVAL.
1984 * On success, sopt->sopt_valsize is set to however much we actually
1985 * retrieved.
1986 */
1987 if ((valsize = sopt->sopt_valsize) < minlen)
1988 return EINVAL;
1989 if (valsize > len)
1990 sopt->sopt_valsize = valsize = len;
1991
1992 if (sopt->sopt_td != NULL)
1993 return (copyin(sopt->sopt_val, buf, valsize));
1994
1995 bcopy(sopt->sopt_val, buf, valsize);
1996 return (0);
1997}
1998
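/*
 * Illustrative sketch, not part of this file: a protocol pr_ctloutput()
 * routine using sooptcopyin() to fetch a fixed-size integer option on the
 * set side.  The option number and the protocol state update are
 * hypothetical.
 */
static int
example_ctloutput_set(struct socket *so, struct sockopt *sopt)
{
	int error, optval;

	if (sopt->sopt_dir != SOPT_SET)
		return (ENOPROTOOPT);
	switch (sopt->sopt_name) {
	case 1:			/* hypothetical protocol-level option */
		error = sooptcopyin(sopt, &optval, sizeof optval,
		    sizeof optval);
		if (error)
			return (error);
		/* ... apply optval to the protocol control block ... */
		return (0);
	default:
		return (ENOPROTOOPT);
	}
}
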
1999/*
2000 * Kernel version of setsockopt(2).
2001 *
2002 * XXX: optlen is size_t, not socklen_t
2003 */
2004int
2005so_setsockopt(struct socket *so, int level, int optname, void *optval,
2006 size_t optlen)
2007{
2008 struct sockopt sopt;
2009
2010 sopt.sopt_level = level;
2011 sopt.sopt_name = optname;
2012 sopt.sopt_dir = SOPT_SET;
2013 sopt.sopt_val = optval;
2014 sopt.sopt_valsize = optlen;
2015 sopt.sopt_td = NULL;
2016 return (sosetopt(so, &sopt));
2017}
2018
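/*
 * Illustrative sketch, not part of this file: a kernel caller enabling
 * SO_KEEPALIVE on a socket it owns via so_setsockopt(), the in-kernel
 * equivalent of setsockopt(2).  The function name is hypothetical.
 */
static int
example_enable_keepalive(struct socket *so)
{
	int one = 1;

	return (so_setsockopt(so, SOL_SOCKET, SO_KEEPALIVE, &one,
	    sizeof(one)));
}
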
2019int
2020sosetopt(so, sopt)
2021 struct socket *so;
2022 struct sockopt *sopt;
2023{
2024 int error, optval;
2025 struct linger l;
2026 struct timeval tv;
2027 u_long val;
2028#ifdef MAC
2029 struct mac extmac;
2030#endif
2031
2032 error = 0;
2033 if (sopt->sopt_level != SOL_SOCKET) {
2034 if (so->so_proto && so->so_proto->pr_ctloutput)
2035 return ((*so->so_proto->pr_ctloutput)
2036 (so, sopt));
2037 error = ENOPROTOOPT;
2038 } else {
2039 switch (sopt->sopt_name) {
2040#ifdef INET
2041 case SO_ACCEPTFILTER:
2042 error = do_setopt_accept_filter(so, sopt);
2043 if (error)
2044 goto bad;
2045 break;
2046#endif
2047 case SO_LINGER:
2048 error = sooptcopyin(sopt, &l, sizeof l, sizeof l);
2049 if (error)
2050 goto bad;
2051
2052 SOCK_LOCK(so);
2053 so->so_linger = l.l_linger;
2054 if (l.l_onoff)
2055 so->so_options |= SO_LINGER;
2056 else
2057 so->so_options &= ~SO_LINGER;
2058 SOCK_UNLOCK(so);
2059 break;
2060
2061 case SO_DEBUG:
2062 case SO_KEEPALIVE:
2063 case SO_DONTROUTE:
2064 case SO_USELOOPBACK:
2065 case SO_BROADCAST:
2066 case SO_REUSEADDR:
2067 case SO_REUSEPORT:
2068 case SO_OOBINLINE:
2069 case SO_TIMESTAMP:
2070 case SO_BINTIME:
2071 case SO_NOSIGPIPE:
2072 error = sooptcopyin(sopt, &optval, sizeof optval,
2073 sizeof optval);
2074 if (error)
2075 goto bad;
2076 SOCK_LOCK(so);
2077 if (optval)
2078 so->so_options |= sopt->sopt_name;
2079 else
2080 so->so_options &= ~sopt->sopt_name;
2081 SOCK_UNLOCK(so);
2082 break;
2083
2084 case SO_SNDBUF:
2085 case SO_RCVBUF:
2086 case SO_SNDLOWAT:
2087 case SO_RCVLOWAT:
2088 error = sooptcopyin(sopt, &optval, sizeof optval,
2089 sizeof optval);
2090 if (error)
2091 goto bad;
2092
2093 /*
2094 * Values < 1 make no sense for any of these options,
2095 * so disallow them.
2096 */
2097 if (optval < 1) {
2098 error = EINVAL;
2099 goto bad;
2100 }
2101
2102 switch (sopt->sopt_name) {
2103 case SO_SNDBUF:
2104 case SO_RCVBUF:
2105 if (sbreserve(sopt->sopt_name == SO_SNDBUF ?
2106 &so->so_snd : &so->so_rcv, (u_long)optval,
2107 so, curthread) == 0) {
2108 error = ENOBUFS;
2109 goto bad;
2110 }
2111 (sopt->sopt_name == SO_SNDBUF ? &so->so_snd :
2112 &so->so_rcv)->sb_flags &= ~SB_AUTOSIZE;
2113 break;
2114
2115 /*
2116 * Make sure the low-water is never greater than the
2117 * high-water.
2118 */
2119 case SO_SNDLOWAT:
2120 SOCKBUF_LOCK(&so->so_snd);
2121 so->so_snd.sb_lowat =
2122 (optval > so->so_snd.sb_hiwat) ?
2123 so->so_snd.sb_hiwat : optval;
2124 SOCKBUF_UNLOCK(&so->so_snd);
2125 break;
2126 case SO_RCVLOWAT:
2127 SOCKBUF_LOCK(&so->so_rcv);
2128 so->so_rcv.sb_lowat =
2129 (optval > so->so_rcv.sb_hiwat) ?
2130 so->so_rcv.sb_hiwat : optval;
2131 SOCKBUF_UNLOCK(&so->so_rcv);
2132 break;
2133 }
2134 break;
2135
2136 case SO_SNDTIMEO:
2137 case SO_RCVTIMEO:
2138#ifdef COMPAT_IA32
2139 if (curthread->td_proc->p_sysent == &ia32_freebsd_sysvec) {
2140 struct timeval32 tv32;
2141
2142 error = sooptcopyin(sopt, &tv32, sizeof tv32,
2143 sizeof tv32);
2144 CP(tv32, tv, tv_sec);
2145 CP(tv32, tv, tv_usec);
2146 } else
2147#endif
2148 error = sooptcopyin(sopt, &tv, sizeof tv,
2149 sizeof tv);
2150 if (error)
2151 goto bad;
2152
2153 /* assert(hz > 0); */
2154 if (tv.tv_sec < 0 || tv.tv_sec > INT_MAX / hz ||
2155 tv.tv_usec < 0 || tv.tv_usec >= 1000000) {
2156 error = EDOM;
2157 goto bad;
2158 }
2159 /* assert(tick > 0); */
2160 /* assert(ULONG_MAX - INT_MAX >= 1000000); */
2161 val = (u_long)(tv.tv_sec * hz) + tv.tv_usec / tick;
2162 if (val > INT_MAX) {
2163 error = EDOM;
2164 goto bad;
2165 }
2166 if (val == 0 && tv.tv_usec != 0)
2167 val = 1;
2168
2169 switch (sopt->sopt_name) {
2170 case SO_SNDTIMEO:
2171 so->so_snd.sb_timeo = val;
2172 break;
2173 case SO_RCVTIMEO:
2174 so->so_rcv.sb_timeo = val;
2175 break;
2176 }
2177 break;
2178
2179 case SO_LABEL:
2180#ifdef MAC
2181 error = sooptcopyin(sopt, &extmac, sizeof extmac,
2182 sizeof extmac);
2183 if (error)
2184 goto bad;
2185 error = mac_setsockopt_label(sopt->sopt_td->td_ucred,
2186 so, &extmac);
2187#else
2188 error = EOPNOTSUPP;
2189#endif
2190 break;
2191
2192 default:
2193 error = ENOPROTOOPT;
2194 break;
2195 }
2196 if (error == 0 && so->so_proto != NULL &&
2197 so->so_proto->pr_ctloutput != NULL) {
2198 (void) ((*so->so_proto->pr_ctloutput)
2199 (so, sopt));
2200 }
2201 }
2202bad:
2203 return (error);
2204}
2205
2206/*
2207 * Helper routine for getsockopt.
2208 */
2209int
2210sooptcopyout(struct sockopt *sopt, const void *buf, size_t len)
2211{
2212 int error;
2213 size_t valsize;
2214
2215 error = 0;
2216
2217 /*
2218 * Documented get behavior is that we always return a value, possibly
2219 * truncated to fit in the user's buffer. Traditional behavior is
2220 * that we always tell the user precisely how much we copied, rather
2221 * than something useful like the total amount we had available for
2222 * her. Note that this interface is not idempotent; the entire
2223	 * answer must be generated ahead of time.
2224 */
2225 valsize = min(len, sopt->sopt_valsize);
2226 sopt->sopt_valsize = valsize;
2227 if (sopt->sopt_val != NULL) {
2228 if (sopt->sopt_td != NULL)
2229 error = copyout(buf, sopt->sopt_val, valsize);
2230 else
2231 bcopy(buf, sopt->sopt_val, valsize);
2232 }
2233 return (error);
2234}
2235
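/*
 * Illustrative sketch, not part of this file: the get side of a protocol
 * pr_ctloutput() routine returning an integer option with sooptcopyout(),
 * which truncates to the caller's buffer as described above.  The option
 * number and value are hypothetical.
 */
static int
example_ctloutput_get(struct socket *so, struct sockopt *sopt)
{
	int optval;

	if (sopt->sopt_dir != SOPT_GET || sopt->sopt_name != 1)
		return (ENOPROTOOPT);
	optval = 0;	/* ... read from the protocol control block ... */
	return (sooptcopyout(sopt, &optval, sizeof optval));
}
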
2236int
2237sogetopt(so, sopt)
2238 struct socket *so;
2239 struct sockopt *sopt;
2240{
2241 int error, optval;
2242 struct linger l;
2243 struct timeval tv;
2244#ifdef MAC
2245 struct mac extmac;
2246#endif
2247
2248 error = 0;
2249 if (sopt->sopt_level != SOL_SOCKET) {
2250 if (so->so_proto && so->so_proto->pr_ctloutput) {
2251 return ((*so->so_proto->pr_ctloutput)
2252 (so, sopt));
2253 } else
2254 return (ENOPROTOOPT);
2255 } else {
2256 switch (sopt->sopt_name) {
2257#ifdef INET
2258 case SO_ACCEPTFILTER:
2259 error = do_getopt_accept_filter(so, sopt);
2260 break;
2261#endif
2262 case SO_LINGER:
2263 SOCK_LOCK(so);
2264 l.l_onoff = so->so_options & SO_LINGER;
2265 l.l_linger = so->so_linger;
2266 SOCK_UNLOCK(so);
2267 error = sooptcopyout(sopt, &l, sizeof l);
2268 break;
2269
2270 case SO_USELOOPBACK:
2271 case SO_DONTROUTE:
2272 case SO_DEBUG:
2273 case SO_KEEPALIVE:
2274 case SO_REUSEADDR:
2275 case SO_REUSEPORT:
2276 case SO_BROADCAST:
2277 case SO_OOBINLINE:
2278 case SO_ACCEPTCONN:
2279 case SO_TIMESTAMP:
2280 case SO_BINTIME:
2281 case SO_NOSIGPIPE:
2282 optval = so->so_options & sopt->sopt_name;
2283integer:
2284 error = sooptcopyout(sopt, &optval, sizeof optval);
2285 break;
2286
2287 case SO_TYPE:
2288 optval = so->so_type;
2289 goto integer;
2290
2291 case SO_ERROR:
2292 SOCK_LOCK(so);
2293 optval = so->so_error;
2294 so->so_error = 0;
2295 SOCK_UNLOCK(so);
2296 goto integer;
2297
2298 case SO_SNDBUF:
2299 optval = so->so_snd.sb_hiwat;
2300 goto integer;
2301
2302 case SO_RCVBUF:
2303 optval = so->so_rcv.sb_hiwat;
2304 goto integer;
2305
2306 case SO_SNDLOWAT:
2307 optval = so->so_snd.sb_lowat;
2308 goto integer;
2309
2310 case SO_RCVLOWAT:
2311 optval = so->so_rcv.sb_lowat;
2312 goto integer;
2313
2314 case SO_SNDTIMEO:
2315 case SO_RCVTIMEO:
2316 optval = (sopt->sopt_name == SO_SNDTIMEO ?
2317 so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
2318
2319 tv.tv_sec = optval / hz;
2320 tv.tv_usec = (optval % hz) * tick;
2321#ifdef COMPAT_IA32
2322 if (curthread->td_proc->p_sysent == &ia32_freebsd_sysvec) {
2323 struct timeval32 tv32;
2324
2325 CP(tv, tv32, tv_sec);
2326 CP(tv, tv32, tv_usec);
2327 error = sooptcopyout(sopt, &tv32, sizeof tv32);
2328 } else
2329#endif
2330 error = sooptcopyout(sopt, &tv, sizeof tv);
2331 break;
2332
2333 case SO_LABEL:
2334#ifdef MAC
2335 error = sooptcopyin(sopt, &extmac, sizeof(extmac),
2336 sizeof(extmac));
2337 if (error)
2338 return (error);
2339 error = mac_getsockopt_label(sopt->sopt_td->td_ucred,
2340 so, &extmac);
2341 if (error)
2342 return (error);
2343 error = sooptcopyout(sopt, &extmac, sizeof extmac);
2344#else
2345 error = EOPNOTSUPP;
2346#endif
2347 break;
2348
2349 case SO_PEERLABEL:
2350#ifdef MAC
2351 error = sooptcopyin(sopt, &extmac, sizeof(extmac),
2352 sizeof(extmac));
2353 if (error)
2354 return (error);
2355 error = mac_getsockopt_peerlabel(
2356 sopt->sopt_td->td_ucred, so, &extmac);
2357 if (error)
2358 return (error);
2359 error = sooptcopyout(sopt, &extmac, sizeof extmac);
2360#else
2361 error = EOPNOTSUPP;
2362#endif
2363 break;
2364
2365 case SO_LISTENQLIMIT:
2366 optval = so->so_qlimit;
2367 goto integer;
2368
2369 case SO_LISTENQLEN:
2370 optval = so->so_qlen;
2371 goto integer;
2372
2373 case SO_LISTENINCQLEN:
2374 optval = so->so_incqlen;
2375 goto integer;
2376
2377 default:
2378 error = ENOPROTOOPT;
2379 break;
2380 }
2381 return (error);
2382 }
2383}
2384
2385/* XXX; prepare mbuf for (__FreeBSD__ < 3) routines. */
2386int
2387soopt_getm(struct sockopt *sopt, struct mbuf **mp)
2388{
2389 struct mbuf *m, *m_prev;
2390 int sopt_size = sopt->sopt_valsize;
2391
2392 MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA);
2393 if (m == NULL)
2394 return ENOBUFS;
2395 if (sopt_size > MLEN) {
2396 MCLGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT);
2397 if ((m->m_flags & M_EXT) == 0) {
2398 m_free(m);
2399 return ENOBUFS;
2400 }
2401 m->m_len = min(MCLBYTES, sopt_size);
2402 } else {
2403 m->m_len = min(MLEN, sopt_size);
2404 }
2405 sopt_size -= m->m_len;
2406 *mp = m;
2407 m_prev = m;
2408
2409 while (sopt_size) {
2410 MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA);
2411 if (m == NULL) {
2412 m_freem(*mp);
2413 return ENOBUFS;
2414 }
2415 if (sopt_size > MLEN) {
2416 MCLGET(m, sopt->sopt_td != NULL ? M_TRYWAIT :
2417 M_DONTWAIT);
2418 if ((m->m_flags & M_EXT) == 0) {
2419 m_freem(m);
2420 m_freem(*mp);
2421 return ENOBUFS;
2422 }
2423 m->m_len = min(MCLBYTES, sopt_size);
2424 } else {
2425 m->m_len = min(MLEN, sopt_size);
2426 }
2427 sopt_size -= m->m_len;
2428 m_prev->m_next = m;
2429 m_prev = m;
2430 }
2431 return (0);
2432}
2433
2434/* XXX; copyin sopt data into mbuf chain for (__FreeBSD__ < 3) routines. */
2435int
2436soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
2437{
2438 struct mbuf *m0 = m;
2439
2440 if (sopt->sopt_val == NULL)
2441 return (0);
2442 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
2443 if (sopt->sopt_td != NULL) {
2444 int error;
2445
2446 error = copyin(sopt->sopt_val, mtod(m, char *),
2447 m->m_len);
2448 if (error != 0) {
2449 m_freem(m0);
2450 return(error);
2451 }
2452 } else
2453 bcopy(sopt->sopt_val, mtod(m, char *), m->m_len);
2454 sopt->sopt_valsize -= m->m_len;
2455 sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
2456 m = m->m_next;
2457 }
2458	if (m != NULL) /* should have been allocated large enough by ip6_sooptmcopyin() */
2459 panic("ip6_sooptmcopyin");
2460 return (0);
2461}
2462
2463/* XXX; copyout mbuf chain data into soopt for (__FreeBSD__ < 3) routines. */
2464int
2465soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
2466{
2467 struct mbuf *m0 = m;
2468 size_t valsize = 0;
2469
2470 if (sopt->sopt_val == NULL)
2471 return (0);
2472 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
2473 if (sopt->sopt_td != NULL) {
2474 int error;
2475
2476 error = copyout(mtod(m, char *), sopt->sopt_val,
2477 m->m_len);
2478 if (error != 0) {
2479 m_freem(m0);
2480 return(error);
2481 }
2482 } else
2483 bcopy(mtod(m, char *), sopt->sopt_val, m->m_len);
2484 sopt->sopt_valsize -= m->m_len;
2485 sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
2486 valsize += m->m_len;
2487 m = m->m_next;
2488 }
2489 if (m != NULL) {
2490		/* a large enough soopt buffer should be given from user-land */
2491 m_freem(m0);
2492 return(EINVAL);
2493 }
2494 sopt->sopt_valsize = valsize;
2495 return (0);
2496}
2497
2498/*
2499 * sohasoutofband(): protocol notifies socket layer of the arrival of new
2500 * out-of-band data, which will then notify socket consumers.
2501 */
2502void
2503sohasoutofband(so)
2504 struct socket *so;
2505{
2506 if (so->so_sigio != NULL)
2507 pgsigio(&so->so_sigio, SIGURG, 0);
2508 selwakeuppri(&so->so_rcv.sb_sel, PSOCK);
2509}
2510
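/*
 * Illustrative sketch, not part of this file: a protocol input path, after
 * queueing urgent data and recording so_oobmark, would call
 * sohasoutofband() to post SIGURG and wake select()/poll() waiters.  The
 * function name is hypothetical.
 */
static void
example_urgent_input(struct socket *so)
{
	/* ... protocol queues the OOB byte and sets so->so_oobmark ... */
	sohasoutofband(so);
}
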
2511int
2512sopoll(struct socket *so, int events, struct ucred *active_cred,
2513 struct thread *td)
2514{
2515
2516 /* XXXRW: Temporary debugging. */
2517 KASSERT(so->so_proto->pr_usrreqs->pru_sopoll != sopoll,
2518 ("sopoll: protocol calls sopoll"));
2519
2520 return (so->so_proto->pr_usrreqs->pru_sopoll(so, events, active_cred,
2521 td));
2522}
2523
2524int
2525sopoll_generic(struct socket *so, int events, struct ucred *active_cred,
2526 struct thread *td)
2527{
2528 int revents = 0;
2529
2530 SOCKBUF_LOCK(&so->so_snd);
2531 SOCKBUF_LOCK(&so->so_rcv);
2532 if (events & (POLLIN | POLLRDNORM))
2533 if (soreadable(so))
2534 revents |= events & (POLLIN | POLLRDNORM);
2535
2536 if (events & POLLINIGNEOF)
2537 if (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat ||
2538 !TAILQ_EMPTY(&so->so_comp) || so->so_error)
2539 revents |= POLLINIGNEOF;
2540
2541 if (events & (POLLOUT | POLLWRNORM))
2542 if (sowriteable(so))
2543 revents |= events & (POLLOUT | POLLWRNORM);
2544
2545 if (events & (POLLPRI | POLLRDBAND))
2546 if (so->so_oobmark || (so->so_rcv.sb_state & SBS_RCVATMARK))
2547 revents |= events & (POLLPRI | POLLRDBAND);
2548
2549 if (revents == 0) {
2550 if (events &
2551 (POLLIN | POLLINIGNEOF | POLLPRI | POLLRDNORM |
2552 POLLRDBAND)) {
2553 selrecord(td, &so->so_rcv.sb_sel);
2554 so->so_rcv.sb_flags |= SB_SEL;
2555 }
2556
2557 if (events & (POLLOUT | POLLWRNORM)) {
2558 selrecord(td, &so->so_snd.sb_sel);
2559 so->so_snd.sb_flags |= SB_SEL;
2560 }
2561 }
2562
2563 SOCKBUF_UNLOCK(&so->so_rcv);
2564 SOCKBUF_UNLOCK(&so->so_snd);
2565 return (revents);
2566}
2567
2568int
2569soo_kqfilter(struct file *fp, struct knote *kn)
2570{
2571 struct socket *so = kn->kn_fp->f_data;
2572 struct sockbuf *sb;
2573
2574 switch (kn->kn_filter) {
2575 case EVFILT_READ:
2576 if (so->so_options & SO_ACCEPTCONN)
2577 kn->kn_fop = &solisten_filtops;
2578 else
2579 kn->kn_fop = &soread_filtops;
2580 sb = &so->so_rcv;
2581 break;
2582 case EVFILT_WRITE:
2583 kn->kn_fop = &sowrite_filtops;
2584 sb = &so->so_snd;
2585 break;
2586 default:
2587 return (EINVAL);
2588 }
2589
2590 SOCKBUF_LOCK(sb);
2591 knlist_add(&sb->sb_sel.si_note, kn, 1);
2592 sb->sb_flags |= SB_KNOTE;
2593 SOCKBUF_UNLOCK(sb);
2594 return (0);
2595}
2596
2597/*
2598 * Some routines that return EOPNOTSUPP for entry points that are not
2599 * supported by a protocol. Fill in as needed.
2600 */
2601int
2602pru_accept_notsupp(struct socket *so, struct sockaddr **nam)
2603{
2604 return EOPNOTSUPP;
2605}
2606
2607int
2608pru_attach_notsupp(struct socket *so, int proto, struct thread *td)
2609{
2610 return EOPNOTSUPP;
2611}
2612
2613int
2614pru_bind_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td)
2615{
2616 return EOPNOTSUPP;
2617}
2618
2619int
2620pru_connect_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td)
2621{
2622 return EOPNOTSUPP;
2623}
2624
2625int
2626pru_connect2_notsupp(struct socket *so1, struct socket *so2)
2627{
2628 return EOPNOTSUPP;
2629}
2630
2631int
2632pru_control_notsupp(struct socket *so, u_long cmd, caddr_t data,
2633 struct ifnet *ifp, struct thread *td)
2634{
2635 return EOPNOTSUPP;
2636}
2637
2638int
2639pru_disconnect_notsupp(struct socket *so)
2640{
2641 return EOPNOTSUPP;
2642}
2643
2644int
2645pru_listen_notsupp(struct socket *so, int backlog, struct thread *td)
2646{
2647 return EOPNOTSUPP;
2648}
2649
2650int
2651pru_peeraddr_notsupp(struct socket *so, struct sockaddr **nam)
2652{
2653 return EOPNOTSUPP;
2654}
2655
2656int
2657pru_rcvd_notsupp(struct socket *so, int flags)
2658{
2659 return EOPNOTSUPP;
2660}
2661
2662int
2663pru_rcvoob_notsupp(struct socket *so, struct mbuf *m, int flags)
2664{
2665 return EOPNOTSUPP;
2666}
2667
2668int
2669pru_send_notsupp(struct socket *so, int flags, struct mbuf *m,
2670 struct sockaddr *addr, struct mbuf *control, struct thread *td)
2671{
2672 return EOPNOTSUPP;
2673}
2674
2675/*
2676 * This isn't really a ``null'' operation, but it's the default one
2677 * and doesn't do anything destructive.
2678 */
2679int
2680pru_sense_null(struct socket *so, struct stat *sb)
2681{
2682 sb->st_blksize = so->so_snd.sb_hiwat;
2683 return 0;
2684}
2685
2686int
2687pru_shutdown_notsupp(struct socket *so)
2688{
2689 return EOPNOTSUPP;
2690}
2691
2692int
2693pru_sockaddr_notsupp(struct socket *so, struct sockaddr **nam)
2694{
2695 return EOPNOTSUPP;
2696}
2697
2698int
2699pru_sosend_notsupp(struct socket *so, struct sockaddr *addr, struct uio *uio,
2700 struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
2701{
2702 return EOPNOTSUPP;
2703}
2704
2705int
2706pru_soreceive_notsupp(struct socket *so, struct sockaddr **paddr,
2707 struct uio *uio, struct mbuf **mp0, struct mbuf **controlp,
2708 int *flagsp)
2709{
2710 return EOPNOTSUPP;
2711}
2712
2713int
2714pru_sopoll_notsupp(struct socket *so, int events, struct ucred *cred,
2715 struct thread *td)
2716{
2717 return EOPNOTSUPP;
2718}
2719
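/*
 * Illustrative sketch, not part of this file: a minimal protocol can wire
 * the operations it does not implement to the *_notsupp stubs above and
 * reuse the generic send/receive/poll paths.  The attach/detach/send
 * entries are protocol-specific and omitted here.
 */
static struct pr_usrreqs example_usrreqs = {
	.pru_accept =		pru_accept_notsupp,
	.pru_connect =		pru_connect_notsupp,
	.pru_connect2 =		pru_connect2_notsupp,
	.pru_listen =		pru_listen_notsupp,
	.pru_rcvd =		pru_rcvd_notsupp,
	.pru_rcvoob =		pru_rcvoob_notsupp,
	.pru_sense =		pru_sense_null,
	.pru_sosend =		sosend_generic,
	.pru_soreceive =	soreceive_generic,
	.pru_sopoll =		sopoll_generic,
	/* .pru_attach, .pru_detach, .pru_send, ...: protocol-specific. */
};
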
2720static void
2721filt_sordetach(struct knote *kn)
2722{
2723 struct socket *so = kn->kn_fp->f_data;
2724
2725 SOCKBUF_LOCK(&so->so_rcv);
2726 knlist_remove(&so->so_rcv.sb_sel.si_note, kn, 1);
2727 if (knlist_empty(&so->so_rcv.sb_sel.si_note))
2728 so->so_rcv.sb_flags &= ~SB_KNOTE;
2729 SOCKBUF_UNLOCK(&so->so_rcv);
2730}
2731
2732/*ARGSUSED*/
2733static int
2734filt_soread(struct knote *kn, long hint)
2735{
2736 struct socket *so;
2737
2738 so = kn->kn_fp->f_data;
2739 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2740
2741 kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
2742 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
2743 kn->kn_flags |= EV_EOF;
2744 kn->kn_fflags = so->so_error;
2745 return (1);
2746 } else if (so->so_error) /* temporary udp error */
2747 return (1);
2748 else if (kn->kn_sfflags & NOTE_LOWAT)
2749 return (kn->kn_data >= kn->kn_sdata);
2750 else
2751 return (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat);
2752}
2753
2754static void
2755filt_sowdetach(struct knote *kn)
2756{
2757 struct socket *so = kn->kn_fp->f_data;
2758
2759 SOCKBUF_LOCK(&so->so_snd);
2760 knlist_remove(&so->so_snd.sb_sel.si_note, kn, 1);
2761 if (knlist_empty(&so->so_snd.sb_sel.si_note))
2762 so->so_snd.sb_flags &= ~SB_KNOTE;
2763 SOCKBUF_UNLOCK(&so->so_snd);
2764}
2765
2766/*ARGSUSED*/
2767static int
2768filt_sowrite(struct knote *kn, long hint)
2769{
2770 struct socket *so;
2771
2772 so = kn->kn_fp->f_data;
2773 SOCKBUF_LOCK_ASSERT(&so->so_snd);
2774 kn->kn_data = sbspace(&so->so_snd);
2775 if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
2776 kn->kn_flags |= EV_EOF;
2777 kn->kn_fflags = so->so_error;
2778 return (1);
2779 } else if (so->so_error) /* temporary udp error */
2780 return (1);
2781 else if (((so->so_state & SS_ISCONNECTED) == 0) &&
2782 (so->so_proto->pr_flags & PR_CONNREQUIRED))
2783 return (0);
2784 else if (kn->kn_sfflags & NOTE_LOWAT)
2785 return (kn->kn_data >= kn->kn_sdata);
2786 else
2787 return (kn->kn_data >= so->so_snd.sb_lowat);
2788}
2789
2790/*ARGSUSED*/
2791static int
2792filt_solisten(struct knote *kn, long hint)
2793{
2794 struct socket *so = kn->kn_fp->f_data;
2795
2796 kn->kn_data = so->so_qlen;
2797 return (! TAILQ_EMPTY(&so->so_comp));
2798}
2799
2800int
2801socheckuid(struct socket *so, uid_t uid)
2802{
2803
2804 if (so == NULL)
2805 return (EPERM);
2806 if (so->so_cred->cr_uid != uid)
2807 return (EPERM);
2808 return (0);
2809}
2810
2811static int
2812sysctl_somaxconn(SYSCTL_HANDLER_ARGS)
2813{
2814 int error;
2815 int val;
2816
2817 val = somaxconn;
2818 error = sysctl_handle_int(oidp, &val, sizeof(int), req);
2819 if (error || !req->newptr )
2820 return (error);
2821
2822 if (val < 1 || val > USHRT_MAX)
2823 return (EINVAL);
2824
2825 somaxconn = val;
2826 return (0);
2827}
2828
2829/*
2830 * Primitive routines for operating on sockets.
2831 */
2832
2833/*
2834 * Procedures to manipulate state flags of socket
2835 * and do appropriate wakeups. Normal sequence from the
2836 * active (originating) side is that soisconnecting() is
2837 * called during processing of connect() call,
2838 * resulting in an eventual call to soisconnected() if/when the
2839 * connection is established. When the connection is torn down
2840 * soisdisconnecting() is called during processing of disconnect() call,
2841 * and soisdisconnected() is called when the connection to the peer
2842 * is totally severed. The semantics of these routines are such that
2843 * connectionless protocols can call soisconnected() and soisdisconnected()
2844 * only, bypassing the in-progress calls when setting up a ``connection''
2845 * takes no time.
2846 *
2847 * From the passive side, a socket is created with
2848 * two queues of sockets: so_incomp for connections in progress
2849 * and so_comp for connections already made and awaiting user acceptance.
2850 * As a protocol is preparing incoming connections, it creates a socket
2851 * structure queued on so_incomp by calling sonewconn(). When the connection
2852 * is established, soisconnected() is called, and transfers the
2853 * socket structure to so_comp, making it available to accept().
2854 *
2855 * If a socket is closed with sockets on either
2856 * so_incomp or so_comp, these sockets are dropped.
2857 *
2858 * If higher level protocols are implemented in
2859 * the kernel, the wakeups done here will sometimes
2860 * cause software-interrupt process scheduling.
2861 */
2862
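/*
 * Illustrative sketch, not part of this file: the active-side sequence
 * described above, collapsed into one place.  A real protocol makes these
 * transitions from its connect and input paths, not from a single
 * function; the helper name is hypothetical.
 */
static void
example_active_lifecycle(struct socket *so)
{
	soisconnecting(so);	/* connect() issued, handshake in progress */
	/* ... protocol completes its handshake ... */
	soisconnected(so);	/* wakes accept()/connect() waiters */
	/* ... much later, teardown begins ... */
	soisdisconnecting(so);	/* no more data may be sent or received */
	/* ... peer acknowledges the teardown ... */
	soisdisconnected(so);	/* connection fully severed */
}
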
2863void
2864soisconnecting(so)
2865 register struct socket *so;
2866{
2867
2868 SOCK_LOCK(so);
2869 so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
2870 so->so_state |= SS_ISCONNECTING;
2871 SOCK_UNLOCK(so);
2872}
2873
2874void
2875soisconnected(so)
2876 struct socket *so;
2877{
2878 struct socket *head;
2879
2880 ACCEPT_LOCK();
2881 SOCK_LOCK(so);
2882 so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING);
2883 so->so_state |= SS_ISCONNECTED;
2884 head = so->so_head;
2885 if (head != NULL && (so->so_qstate & SQ_INCOMP)) {
2886 if ((so->so_options & SO_ACCEPTFILTER) == 0) {
2887 SOCK_UNLOCK(so);
2888 TAILQ_REMOVE(&head->so_incomp, so, so_list);
2889 head->so_incqlen--;
2890 so->so_qstate &= ~SQ_INCOMP;
2891 TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
2892 head->so_qlen++;
2893 so->so_qstate |= SQ_COMP;
2894 ACCEPT_UNLOCK();
2895 sorwakeup(head);
2896 wakeup_one(&head->so_timeo);
2897 } else {
2898 ACCEPT_UNLOCK();
2899 so->so_upcall =
2900 head->so_accf->so_accept_filter->accf_callback;
2901 so->so_upcallarg = head->so_accf->so_accept_filter_arg;
2902 so->so_rcv.sb_flags |= SB_UPCALL;
2903 so->so_options &= ~SO_ACCEPTFILTER;
2904 SOCK_UNLOCK(so);
2905 so->so_upcall(so, so->so_upcallarg, M_DONTWAIT);
2906 }
2907 return;
2908 }
2909 SOCK_UNLOCK(so);
2910 ACCEPT_UNLOCK();
2911 wakeup(&so->so_timeo);
2912 sorwakeup(so);
2913 sowwakeup(so);
2914}
2915
2916void
2917soisdisconnecting(so)
2918 register struct socket *so;
2919{
2920
2921 /*
2922 * Note: This code assumes that SOCK_LOCK(so) and
2923 * SOCKBUF_LOCK(&so->so_rcv) are the same.
2924 */
2925 SOCKBUF_LOCK(&so->so_rcv);
2926 so->so_state &= ~SS_ISCONNECTING;
2927 so->so_state |= SS_ISDISCONNECTING;
2928 so->so_rcv.sb_state |= SBS_CANTRCVMORE;
2929 sorwakeup_locked(so);
2930 SOCKBUF_LOCK(&so->so_snd);
2931 so->so_snd.sb_state |= SBS_CANTSENDMORE;
2932 sowwakeup_locked(so);
2933 wakeup(&so->so_timeo);
2934}
2935
2936void
2937soisdisconnected(so)
2938 register struct socket *so;
2939{
2940
2941 /*
2942 * Note: This code assumes that SOCK_LOCK(so) and
2943 * SOCKBUF_LOCK(&so->so_rcv) are the same.
2944 */
2945 SOCKBUF_LOCK(&so->so_rcv);
2946 so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
2947 so->so_state |= SS_ISDISCONNECTED;
2948 so->so_rcv.sb_state |= SBS_CANTRCVMORE;
2949 sorwakeup_locked(so);
2950 SOCKBUF_LOCK(&so->so_snd);
2951 so->so_snd.sb_state |= SBS_CANTSENDMORE;
2952 sbdrop_locked(&so->so_snd, so->so_snd.sb_cc);
2953 sowwakeup_locked(so);
2954 wakeup(&so->so_timeo);
2955}
2956
2957/*
2958 * Make a copy of a sockaddr in a malloced buffer of type M_SONAME.
2959 */
2960struct sockaddr *
2961sodupsockaddr(const struct sockaddr *sa, int mflags)
2962{
2963 struct sockaddr *sa2;
2964
2965 sa2 = malloc(sa->sa_len, M_SONAME, mflags);
2966 if (sa2)
2967 bcopy(sa, sa2, sa->sa_len);
2968 return sa2;
2969}
2970
2971/*
2972 * Create an external-format (``xsocket'') structure using the information
2973 * in the kernel-format socket structure pointed to by so. This is done
2974 * to reduce the spew of irrelevant information over this interface,
2975 * to isolate user code from changes in the kernel structure, and
2976 * potentially to provide information-hiding if we decide that
2977 * some of this information should be hidden from users.
2978 */
2979void
2980sotoxsocket(struct socket *so, struct xsocket *xso)
2981{
2982 xso->xso_len = sizeof *xso;
2983 xso->xso_so = so;
2984 xso->so_type = so->so_type;
2985 xso->so_options = so->so_options;
2986 xso->so_linger = so->so_linger;
2987 xso->so_state = so->so_state;
2988 xso->so_pcb = so->so_pcb;
2989 xso->xso_protocol = so->so_proto->pr_protocol;
2990 xso->xso_family = so->so_proto->pr_domain->dom_family;
2991 xso->so_qlen = so->so_qlen;
2992 xso->so_incqlen = so->so_incqlen;
2993 xso->so_qlimit = so->so_qlimit;
2994 xso->so_timeo = so->so_timeo;
2995 xso->so_error = so->so_error;
2996 xso->so_pgid = so->so_sigio ? so->so_sigio->sio_pgid : 0;
2997 xso->so_oobmark = so->so_oobmark;
2998 sbtoxsockbuf(&so->so_snd, &xso->so_snd);
2999 sbtoxsockbuf(&so->so_rcv, &xso->so_rcv);
3000 xso->so_uid = so->so_cred->cr_uid;
3001}
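
/*
 * Illustrative sketch, not part of this file: an exporter (for example a
 * sysctl handler, hypothetical here) fills an xsocket from a live socket
 * with sotoxsocket() and copies that out instead of exposing struct
 * socket directly.
 */
static int
example_export_socket(struct socket *so, struct sysctl_req *req)
{
	struct xsocket xso;

	sotoxsocket(so, &xso);
	return (SYSCTL_OUT(req, &xso, sizeof(xso)));
}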