uipc_socket.c (185892) uipc_socket.c (185893)
1/*-
2 * Copyright (c) 1982, 1986, 1988, 1990, 1993
3 * The Regents of the University of California.
4 * Copyright (c) 2004 The FreeBSD Foundation
5 * Copyright (c) 2004-2008 Robert N. M. Watson
6 * All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 * 4. Neither the name of the University nor the names of its contributors
17 * may be used to endorse or promote products derived from this software
18 * without specific prior written permission.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30 * SUCH DAMAGE.
31 *
32 * @(#)uipc_socket.c 8.3 (Berkeley) 4/15/94
33 */
34
35/*
36 * Comments on the socket life cycle:
37 *
38 * soalloc() sets up socket layer state for a socket, called only by
39 * socreate() and sonewconn(). Socket layer private.
40 *
41 * sodealloc() tears down socket layer state for a socket, called only by
42 * sofree() and sonewconn(). Socket layer private.
43 *
44 * pru_attach() associates protocol layer state with an allocated socket;
45 * called only once, may fail, aborting socket allocation. This is called
46 * from socreate() and sonewconn(). Socket layer private.
47 *
48 * pru_detach() disassociates protocol layer state from an attached socket,
49 * and will be called exactly once for sockets for which pru_attach() has
50 * been successfully called. If pru_attach() returned an error,
51 * pru_detach() will not be called. Socket layer private.
52 *
53 * pru_abort() and pru_close() notify the protocol layer that the last
54 * consumer of a socket is starting to tear down the socket, and that the
55 * protocol should terminate the connection. Historically, pru_abort() also
56 * detached protocol state from the socket state, but this is no longer the
57 * case.
58 *
59 * socreate() creates a socket and attaches protocol state. This is a public
60 * interface that may be used by socket layer consumers to create new
61 * sockets.
62 *
63 * sonewconn() creates a socket and attaches protocol state. This is a
64 * public interface that may be used by protocols to create new sockets when
65 * a new connection is received and will be available for accept() on a
66 * listen socket.
67 *
68 * soclose() destroys a socket after possibly waiting for it to disconnect.
69 * This is a public interface that socket consumers should use to close and
70 * release a socket when done with it.
71 *
72 * soabort() destroys a socket without waiting for it to disconnect (used
73 * only for incoming connections that are already partially or fully
74 * connected). This is used internally by the socket layer when clearing
75 * listen socket queues (due to overflow or close on the listen socket), but
76 * is also a public interface protocols may use to abort connections in
77 * their incomplete listen queues should they no longer be required. Sockets
78 * placed in completed connection listen queues should not be aborted for
79 * reasons described in the comment above the soclose() implementation. This
80 * is not a general purpose close routine, and except in the specific
81 * circumstances described here, should not be used.
82 *
83 * sofree() will free a socket and its protocol state if all references on
84 * the socket have been released, and is the public interface to attempt to
85 * free a socket when a reference is removed. This is a socket layer private
86 * interface.
87 *
88 * NOTE: In addition to socreate() and soclose(), which provide a single
89 * socket reference to the consumer to be managed as required, there are two
90 * calls to explicitly manage socket references, soref(), and sorele().
91 * Currently, these are generally required only when transitioning a socket
92 * from a listen queue to a file descriptor, in order to prevent garbage
93 * collection of the socket at an untimely moment. For a number of reasons,
94 * these interfaces are not preferred, and should be avoided.
95 */
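/*
 * Editor's illustrative sketch (not part of the original file): roughly how
 * an in-kernel consumer would use the public life-cycle interfaces above,
 * assuming a hypothetical caller that already holds 'cred' and 'td'.
 * socreate() hands back the single reference that soclose() later releases;
 * sofree() runs once the last reference has been dropped.
 */
#if 0	/* example only */
static int
example_socket_consumer(struct ucred *cred, struct thread *td)
{
	struct socket *so;
	int error;

	error = socreate(PF_INET, &so, SOCK_STREAM, IPPROTO_TCP, cred, td);
	if (error != 0)
		return (error);
	/* ... sobind()/soconnect()/sosend()/soreceive() as required ... */
	return (soclose(so));	/* drops the reference from socreate() */
}
#endif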
96
97#include <sys/cdefs.h>
98__FBSDID("$FreeBSD: head/sys/kern/uipc_socket.c 185892 2008-12-10 22:10:37Z bz $");
98__FBSDID("$FreeBSD: head/sys/kern/uipc_socket.c 185893 2008-12-10 22:17:09Z bz $");
99
100#include "opt_inet.h"
101#include "opt_inet6.h"
102#include "opt_mac.h"
103#include "opt_zero.h"
104#include "opt_compat.h"
105
106#include <sys/param.h>
107#include <sys/systm.h>
108#include <sys/fcntl.h>
109#include <sys/limits.h>
110#include <sys/lock.h>
111#include <sys/mac.h>
112#include <sys/malloc.h>
113#include <sys/mbuf.h>
114#include <sys/mutex.h>
115#include <sys/domain.h>
116#include <sys/file.h> /* for struct knote */
117#include <sys/kernel.h>
118#include <sys/event.h>
119#include <sys/eventhandler.h>
120#include <sys/poll.h>
121#include <sys/proc.h>
122#include <sys/protosw.h>
123#include <sys/socket.h>
124#include <sys/socketvar.h>
125#include <sys/resourcevar.h>
126#include <net/route.h>
127#include <sys/signalvar.h>
128#include <sys/stat.h>
129#include <sys/sx.h>
130#include <sys/sysctl.h>
131#include <sys/uio.h>
132#include <sys/jail.h>
133
134#include <security/mac/mac_framework.h>
135
136#include <vm/uma.h>
137
138#ifdef COMPAT_IA32
139#include <sys/mount.h>
140#include <sys/sysent.h>
141#include <compat/freebsd32/freebsd32.h>
142#endif
143
144static int soreceive_rcvoob(struct socket *so, struct uio *uio,
145 int flags);
146
147static void filt_sordetach(struct knote *kn);
148static int filt_soread(struct knote *kn, long hint);
149static void filt_sowdetach(struct knote *kn);
150static int filt_sowrite(struct knote *kn, long hint);
151static int filt_solisten(struct knote *kn, long hint);
152
153static struct filterops solisten_filtops =
154 { 1, NULL, filt_sordetach, filt_solisten };
155static struct filterops soread_filtops =
156 { 1, NULL, filt_sordetach, filt_soread };
157static struct filterops sowrite_filtops =
158 { 1, NULL, filt_sowdetach, filt_sowrite };
159
160uma_zone_t socket_zone;
161so_gen_t so_gencnt; /* generation count for sockets */
162
163int maxsockets;
164
165MALLOC_DEFINE(M_SONAME, "soname", "socket name");
166MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
167
168static int somaxconn = SOMAXCONN;
169static int sysctl_somaxconn(SYSCTL_HANDLER_ARGS);
170/* XXX: we don't have SYSCTL_USHORT */
171SYSCTL_PROC(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLTYPE_UINT | CTLFLAG_RW,
172 0, sizeof(int), sysctl_somaxconn, "I", "Maximum pending socket connection "
173 "queue size");
174static int numopensockets;
175SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD,
176 &numopensockets, 0, "Number of open sockets");
177#ifdef ZERO_COPY_SOCKETS
178/* These aren't static because they're used in other files. */
179int so_zero_copy_send = 1;
180int so_zero_copy_receive = 1;
181SYSCTL_NODE(_kern_ipc, OID_AUTO, zero_copy, CTLFLAG_RD, 0,
182 "Zero copy controls");
183SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, receive, CTLFLAG_RW,
184 &so_zero_copy_receive, 0, "Enable zero copy receive");
185SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, send, CTLFLAG_RW,
186 &so_zero_copy_send, 0, "Enable zero copy send");
187#endif /* ZERO_COPY_SOCKETS */
188
189/*
190 * accept_mtx locks down per-socket fields relating to accept queues. See
191 * socketvar.h for an annotation of the protected fields of struct socket.
192 */
193struct mtx accept_mtx;
194MTX_SYSINIT(accept_mtx, &accept_mtx, "accept", MTX_DEF);
195
196/*
197 * so_global_mtx protects so_gencnt, numopensockets, and the per-socket
198 * so_gencnt field.
199 */
200static struct mtx so_global_mtx;
201MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_glabel", MTX_DEF);
202
203/*
204 * General IPC sysctl name space, used by sockets and a variety of other IPC
205 * types.
206 */
207SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW, 0, "IPC");
208
209/*
210 * Sysctl to get and set the maximum global sockets limit. Notify protocols
211 * of the change so that they can update their dependent limits as required.
212 */
213static int
214sysctl_maxsockets(SYSCTL_HANDLER_ARGS)
215{
216 int error, newmaxsockets;
217
218 newmaxsockets = maxsockets;
219 error = sysctl_handle_int(oidp, &newmaxsockets, 0, req);
220 if (error == 0 && req->newptr) {
221 if (newmaxsockets > maxsockets) {
222 maxsockets = newmaxsockets;
223 if (maxsockets > ((maxfiles / 4) * 3)) {
224 maxfiles = (maxsockets * 5) / 4;
225 maxfilesperproc = (maxfiles * 9) / 10;
226 }
227 EVENTHANDLER_INVOKE(maxsockets_change);
228 } else
229 error = EINVAL;
230 }
231 return (error);
232}
233
234SYSCTL_PROC(_kern_ipc, OID_AUTO, maxsockets, CTLTYPE_INT|CTLFLAG_RW,
235 &maxsockets, 0, sysctl_maxsockets, "IU",
236 "Maximum number of sockets available");
237
238/*
239 * Initialise maxsockets.
239 * Initialise maxsockets. This SYSINIT must be run after
240 * tunable_mbinit().
240 */
241static void
242init_maxsockets(void *ignored)
243{
244
245 TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets);
246 maxsockets = imax(maxsockets, imax(maxfiles, nmbclusters));
247}
248SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_maxsockets, NULL);
249
250/*
251 * Socket operation routines. These routines are called by the routines in
252 * sys_socket.c or from a system process, and implement the semantics of
253 * socket operations by switching out to the protocol specific routines.
254 */
255
256/*
257 * Get a socket structure from our zone, and initialize it. Note that it
258 * would probably be better to allocate socket and PCB at the same time, but
259 * I'm not convinced that all the protocols can be easily modified to do
260 * this.
261 *
262 * soalloc() returns a socket with a ref count of 0.
263 */
264static struct socket *
265soalloc(void)
266{
267 struct socket *so;
268
269 so = uma_zalloc(socket_zone, M_NOWAIT | M_ZERO);
270 if (so == NULL)
271 return (NULL);
272#ifdef MAC
273 if (mac_socket_init(so, M_NOWAIT) != 0) {
274 uma_zfree(socket_zone, so);
275 return (NULL);
276 }
277#endif
278 SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd");
279 SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv");
280 sx_init(&so->so_snd.sb_sx, "so_snd_sx");
281 sx_init(&so->so_rcv.sb_sx, "so_rcv_sx");
282 TAILQ_INIT(&so->so_aiojobq);
283 mtx_lock(&so_global_mtx);
284 so->so_gencnt = ++so_gencnt;
285 ++numopensockets;
286 mtx_unlock(&so_global_mtx);
287 return (so);
288}
289
290/*
291 * Free the storage associated with a socket at the socket layer, tear down
292 * locks, labels, etc. All protocol state is assumed already to have been
293 * torn down (and possibly never set up) by the caller.
294 */
295static void
296sodealloc(struct socket *so)
297{
298
299 KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count));
300 KASSERT(so->so_pcb == NULL, ("sodealloc(): so_pcb != NULL"));
301
302 mtx_lock(&so_global_mtx);
303 so->so_gencnt = ++so_gencnt;
304 --numopensockets; /* Could be below, but faster here. */
305 mtx_unlock(&so_global_mtx);
306 if (so->so_rcv.sb_hiwat)
307 (void)chgsbsize(so->so_cred->cr_uidinfo,
308 &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY);
309 if (so->so_snd.sb_hiwat)
310 (void)chgsbsize(so->so_cred->cr_uidinfo,
311 &so->so_snd.sb_hiwat, 0, RLIM_INFINITY);
312#ifdef INET
313 /* remove accept filter if one is present. */
314 if (so->so_accf != NULL)
315 do_setopt_accept_filter(so, NULL);
316#endif
317#ifdef MAC
318 mac_socket_destroy(so);
319#endif
320 crfree(so->so_cred);
321 sx_destroy(&so->so_snd.sb_sx);
322 sx_destroy(&so->so_rcv.sb_sx);
323 SOCKBUF_LOCK_DESTROY(&so->so_snd);
324 SOCKBUF_LOCK_DESTROY(&so->so_rcv);
325 uma_zfree(socket_zone, so);
326}
327
328/*
329 * socreate returns a socket with a ref count of 1. The socket should be
330 * closed with soclose().
331 */
332int
333socreate(int dom, struct socket **aso, int type, int proto,
334 struct ucred *cred, struct thread *td)
335{
336 struct protosw *prp;
337 struct socket *so;
338 int error;
339
340 if (proto)
341 prp = pffindproto(dom, proto, type);
342 else
343 prp = pffindtype(dom, type);
344
345 if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL ||
346 prp->pr_usrreqs->pru_attach == pru_attach_notsupp)
347 return (EPROTONOSUPPORT);
348
349 if (jailed(cred) && jail_socket_unixiproute_only &&
350 prp->pr_domain->dom_family != PF_LOCAL &&
351 prp->pr_domain->dom_family != PF_INET &&
352#ifdef INET6
353 prp->pr_domain->dom_family != PF_INET6 &&
354#endif
355 prp->pr_domain->dom_family != PF_ROUTE) {
356 return (EPROTONOSUPPORT);
357 }
358
359 if (prp->pr_type != type)
360 return (EPROTOTYPE);
361 so = soalloc();
362 if (so == NULL)
363 return (ENOBUFS);
364
365 TAILQ_INIT(&so->so_incomp);
366 TAILQ_INIT(&so->so_comp);
367 so->so_type = type;
368 so->so_cred = crhold(cred);
369 if ((prp->pr_domain->dom_family == PF_INET) ||
370 (prp->pr_domain->dom_family == PF_ROUTE))
371 so->so_fibnum = td->td_proc->p_fibnum;
372 else
373 so->so_fibnum = 0;
374 so->so_proto = prp;
375#ifdef MAC
376 mac_socket_create(cred, so);
377#endif
378 knlist_init(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv),
379 NULL, NULL, NULL);
380 knlist_init(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd),
381 NULL, NULL, NULL);
382 so->so_count = 1;
383 /*
384 * Auto-sizing of socket buffers is managed by the protocols and
385 * the appropriate flags must be set in the pru_attach function.
386 */
387 error = (*prp->pr_usrreqs->pru_attach)(so, proto, td);
388 if (error) {
389 KASSERT(so->so_count == 1, ("socreate: so_count %d",
390 so->so_count));
391 so->so_count = 0;
392 sodealloc(so);
393 return (error);
394 }
395 *aso = so;
396 return (0);
397}
398
399#ifdef REGRESSION
400static int regression_sonewconn_earlytest = 1;
401SYSCTL_INT(_regression, OID_AUTO, sonewconn_earlytest, CTLFLAG_RW,
402 &regression_sonewconn_earlytest, 0, "Perform early sonewconn limit test");
403#endif
404
405/*
406 * When an attempt at a new connection is noted on a socket which accepts
407 * connections, sonewconn is called. If the connection is possible (subject
408 * to space constraints, etc.) then we allocate a new structure, properly
409 * linked into the data structure of the original socket, and return this.
410 * Connstatus may be 0, or SO_ISCONFIRMING, or SO_ISCONNECTED.
411 *
412 * Note: the ref count on the socket is 0 on return.
413 */
414struct socket *
415sonewconn(struct socket *head, int connstatus)
416{
417 struct socket *so;
418 int over;
419
420 ACCEPT_LOCK();
421 over = (head->so_qlen > 3 * head->so_qlimit / 2);
422 ACCEPT_UNLOCK();
423#ifdef REGRESSION
424 if (regression_sonewconn_earlytest && over)
425#else
426 if (over)
427#endif
428 return (NULL);
429 so = soalloc();
430 if (so == NULL)
431 return (NULL);
432 if ((head->so_options & SO_ACCEPTFILTER) != 0)
433 connstatus = 0;
434 so->so_head = head;
435 so->so_type = head->so_type;
436 so->so_options = head->so_options &~ SO_ACCEPTCONN;
437 so->so_linger = head->so_linger;
438 so->so_state = head->so_state | SS_NOFDREF;
439 so->so_proto = head->so_proto;
440 so->so_cred = crhold(head->so_cred);
441#ifdef MAC
442 SOCK_LOCK(head);
443 mac_socket_newconn(head, so);
444 SOCK_UNLOCK(head);
445#endif
446 knlist_init(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv),
447 NULL, NULL, NULL);
448 knlist_init(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd),
449 NULL, NULL, NULL);
450 if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat) ||
451 (*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) {
452 sodealloc(so);
453 return (NULL);
454 }
455 so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
456 so->so_snd.sb_lowat = head->so_snd.sb_lowat;
457 so->so_rcv.sb_timeo = head->so_rcv.sb_timeo;
458 so->so_snd.sb_timeo = head->so_snd.sb_timeo;
459 so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE;
460 so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE;
461 so->so_state |= connstatus;
462 ACCEPT_LOCK();
463 if (connstatus) {
464 TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
465 so->so_qstate |= SQ_COMP;
466 head->so_qlen++;
467 } else {
468 /*
469 * Keep removing sockets from the head until there's room for
470 * us to insert on the tail. In pre-locking revisions, this
471 * was a simple if(), but as we could be racing with other
472 * threads and soabort() requires dropping locks, we must
473 * loop waiting for the condition to be true.
474 */
475 while (head->so_incqlen > head->so_qlimit) {
476 struct socket *sp;
477 sp = TAILQ_FIRST(&head->so_incomp);
478 TAILQ_REMOVE(&head->so_incomp, sp, so_list);
479 head->so_incqlen--;
480 sp->so_qstate &= ~SQ_INCOMP;
481 sp->so_head = NULL;
482 ACCEPT_UNLOCK();
483 soabort(sp);
484 ACCEPT_LOCK();
485 }
486 TAILQ_INSERT_TAIL(&head->so_incomp, so, so_list);
487 so->so_qstate |= SQ_INCOMP;
488 head->so_incqlen++;
489 }
490 ACCEPT_UNLOCK();
491 if (connstatus) {
492 sorwakeup(head);
493 wakeup_one(&head->so_timeo);
494 }
495 return (so);
496}
497
498int
499sobind(struct socket *so, struct sockaddr *nam, struct thread *td)
500{
501
502 return ((*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td));
503}
504
505/*
506 * solisten() transitions a socket from a non-listening state to a listening
507 * state, but can also be used to update the listen queue depth on an
508 * existing listen socket. The protocol will call back into the sockets
509 * layer using solisten_proto_check() and solisten_proto() to check and set
510 * socket-layer listen state. Call backs are used so that the protocol can
511 * acquire both protocol and socket layer locks in whatever order is required
512 * by the protocol.
513 *
514 * Protocol implementors are advised to hold the socket lock across the
515 * socket-layer test and set to avoid races at the socket layer.
516 */
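/*
 * Editor's sketch of the call-back pattern described above (a hypothetical
 * pru_listen implementation, not taken from this file): the protocol
 * acquires its own lock(s) first, then checks and sets the socket-layer
 * listen state while holding the socket lock, as advised.
 */
#if 0	/* example only */
static int
example_pru_listen(struct socket *so, int backlog, struct thread *td)
{
	int error;

	/* Acquire protocol lock(s) here in whatever order the protocol needs. */
	SOCK_LOCK(so);
	error = solisten_proto_check(so);
	if (error == 0) {
		/* Protocol-specific setup, e.g. binding a local port. */
		solisten_proto(so, backlog);
	}
	SOCK_UNLOCK(so);
	/* Release protocol lock(s) here. */
	return (error);
}
#endif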
517int
518solisten(struct socket *so, int backlog, struct thread *td)
519{
520
521 return ((*so->so_proto->pr_usrreqs->pru_listen)(so, backlog, td));
522}
523
524int
525solisten_proto_check(struct socket *so)
526{
527
528 SOCK_LOCK_ASSERT(so);
529
530 if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING |
531 SS_ISDISCONNECTING))
532 return (EINVAL);
533 return (0);
534}
535
536void
537solisten_proto(struct socket *so, int backlog)
538{
539
540 SOCK_LOCK_ASSERT(so);
541
542 if (backlog < 0 || backlog > somaxconn)
543 backlog = somaxconn;
544 so->so_qlimit = backlog;
545 so->so_options |= SO_ACCEPTCONN;
546}
547
548/*
549 * Attempt to free a socket. This should really be sotryfree().
550 *
551 * sofree() will succeed if:
552 *
553 * - There are no outstanding file descriptor references or related consumers
554 * (so_count == 0).
555 *
556 * - The socket has been closed by user space, if ever open (SS_NOFDREF).
557 *
558 * - The protocol does not have an outstanding strong reference on the socket
559 * (SS_PROTOREF).
560 *
561 * - The socket is not in a completed connection queue, so a process has been
562 * notified that it is present. If it is removed, the user process may
563 * block in accept() despite select() saying the socket was ready.
564 *
565 * Otherwise, it will quietly abort so that a future call to sofree(), when
566 * conditions are right, can succeed.
567 */
568void
569sofree(struct socket *so)
570{
571 struct protosw *pr = so->so_proto;
572 struct socket *head;
573
574 ACCEPT_LOCK_ASSERT();
575 SOCK_LOCK_ASSERT(so);
576
577 if ((so->so_state & SS_NOFDREF) == 0 || so->so_count != 0 ||
578 (so->so_state & SS_PROTOREF) || (so->so_qstate & SQ_COMP)) {
579 SOCK_UNLOCK(so);
580 ACCEPT_UNLOCK();
581 return;
582 }
583
584 head = so->so_head;
585 if (head != NULL) {
586 KASSERT((so->so_qstate & SQ_COMP) != 0 ||
587 (so->so_qstate & SQ_INCOMP) != 0,
588 ("sofree: so_head != NULL, but neither SQ_COMP nor "
589 "SQ_INCOMP"));
590 KASSERT((so->so_qstate & SQ_COMP) == 0 ||
591 (so->so_qstate & SQ_INCOMP) == 0,
592 ("sofree: so->so_qstate is SQ_COMP and also SQ_INCOMP"));
593 TAILQ_REMOVE(&head->so_incomp, so, so_list);
594 head->so_incqlen--;
595 so->so_qstate &= ~SQ_INCOMP;
596 so->so_head = NULL;
597 }
598 KASSERT((so->so_qstate & SQ_COMP) == 0 &&
599 (so->so_qstate & SQ_INCOMP) == 0,
600 ("sofree: so_head == NULL, but still SQ_COMP(%d) or SQ_INCOMP(%d)",
601 so->so_qstate & SQ_COMP, so->so_qstate & SQ_INCOMP));
602 if (so->so_options & SO_ACCEPTCONN) {
603 KASSERT((TAILQ_EMPTY(&so->so_comp)), ("sofree: so_comp populated"));
604 KASSERT((TAILQ_EMPTY(&so->so_incomp)), ("sofree: so_comp populated"));
605 }
606 SOCK_UNLOCK(so);
607 ACCEPT_UNLOCK();
608
609 if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
610 (*pr->pr_domain->dom_dispose)(so->so_rcv.sb_mb);
611 if (pr->pr_usrreqs->pru_detach != NULL)
612 (*pr->pr_usrreqs->pru_detach)(so);
613
614 /*
615 * From this point on, we assume that no other references to this
616 * socket exist anywhere else in the stack. Therefore, no locks need
617 * to be acquired or held.
618 *
619 * We used to do a lot of socket buffer and socket locking here, as
620 * well as invoke sorflush() and perform wakeups. The direct call to
621 * dom_dispose() and sbrelease_internal() are an inlining of what was
622 * necessary from sorflush().
623 *
624 * Notice that the socket buffer and kqueue state are torn down
625 * before calling pru_detach. This means that protocols should not
626 * assume they can perform socket wakeups, etc., in their detach code.
627 */
628 sbdestroy(&so->so_snd, so);
629 sbdestroy(&so->so_rcv, so);
630 knlist_destroy(&so->so_rcv.sb_sel.si_note);
631 knlist_destroy(&so->so_snd.sb_sel.si_note);
632 sodealloc(so);
633}
634
635/*
636 * Close a socket on last file table reference removal. Initiate disconnect
637 * if connected. Free socket when disconnect complete.
638 *
639 * This function will sorele() the socket. Note that soclose() may be called
640 * prior to the ref count reaching zero. The actual socket structure will
641 * not be freed until the ref count reaches zero.
642 */
643int
644soclose(struct socket *so)
645{
646 int error = 0;
647
648 KASSERT(!(so->so_state & SS_NOFDREF), ("soclose: SS_NOFDREF on enter"));
649
650 funsetown(&so->so_sigio);
651 if (so->so_state & SS_ISCONNECTED) {
652 if ((so->so_state & SS_ISDISCONNECTING) == 0) {
653 error = sodisconnect(so);
654 if (error)
655 goto drop;
656 }
657 if (so->so_options & SO_LINGER) {
658 if ((so->so_state & SS_ISDISCONNECTING) &&
659 (so->so_state & SS_NBIO))
660 goto drop;
661 while (so->so_state & SS_ISCONNECTED) {
662 error = tsleep(&so->so_timeo,
663 PSOCK | PCATCH, "soclos", so->so_linger * hz);
664 if (error)
665 break;
666 }
667 }
668 }
669
670drop:
671 if (so->so_proto->pr_usrreqs->pru_close != NULL)
672 (*so->so_proto->pr_usrreqs->pru_close)(so);
673 if (so->so_options & SO_ACCEPTCONN) {
674 struct socket *sp;
675 ACCEPT_LOCK();
676 while ((sp = TAILQ_FIRST(&so->so_incomp)) != NULL) {
677 TAILQ_REMOVE(&so->so_incomp, sp, so_list);
678 so->so_incqlen--;
679 sp->so_qstate &= ~SQ_INCOMP;
680 sp->so_head = NULL;
681 ACCEPT_UNLOCK();
682 soabort(sp);
683 ACCEPT_LOCK();
684 }
685 while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
686 TAILQ_REMOVE(&so->so_comp, sp, so_list);
687 so->so_qlen--;
688 sp->so_qstate &= ~SQ_COMP;
689 sp->so_head = NULL;
690 ACCEPT_UNLOCK();
691 soabort(sp);
692 ACCEPT_LOCK();
693 }
694 ACCEPT_UNLOCK();
695 }
696 ACCEPT_LOCK();
697 SOCK_LOCK(so);
698 KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF"));
699 so->so_state |= SS_NOFDREF;
700 sorele(so);
701 return (error);
702}
703
704/*
705 * soabort() is used to abruptly tear down a connection, such as when a
706 * resource limit is reached (listen queue depth exceeded), or if a listen
707 * socket is closed while there are sockets waiting to be accepted.
708 *
709 * This interface is tricky, because it is called on an unreferenced socket,
710 * and must be called only by a thread that has actually removed the socket
711 * from the listen queue it was on, or races with other threads are risked.
712 *
713 * This interface will call into the protocol code, so must not be called
714 * with any socket locks held. Protocols do call it while holding their own
715 * recursible protocol mutexes, but this is something that should be subject
716 * to review in the future.
717 */
718void
719soabort(struct socket *so)
720{
721
722 /*
723 * In as much as is possible, assert that no references to this
724 * socket are held. This is not quite the same as asserting that the
725 * current thread is responsible for arranging for no references, but
726 * is as close as we can get for now.
727 */
728 KASSERT(so->so_count == 0, ("soabort: so_count"));
729 KASSERT((so->so_state & SS_PROTOREF) == 0, ("soabort: SS_PROTOREF"));
730 KASSERT(so->so_state & SS_NOFDREF, ("soabort: !SS_NOFDREF"));
731 KASSERT((so->so_state & SQ_COMP) == 0, ("soabort: SQ_COMP"));
732 KASSERT((so->so_state & SQ_INCOMP) == 0, ("soabort: SQ_INCOMP"));
733
734 if (so->so_proto->pr_usrreqs->pru_abort != NULL)
735 (*so->so_proto->pr_usrreqs->pru_abort)(so);
736 ACCEPT_LOCK();
737 SOCK_LOCK(so);
738 sofree(so);
739}
740
741int
742soaccept(struct socket *so, struct sockaddr **nam)
743{
744 int error;
745
746 SOCK_LOCK(so);
747 KASSERT((so->so_state & SS_NOFDREF) != 0, ("soaccept: !NOFDREF"));
748 so->so_state &= ~SS_NOFDREF;
749 SOCK_UNLOCK(so);
750 error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
751 return (error);
752}
753
754int
755soconnect(struct socket *so, struct sockaddr *nam, struct thread *td)
756{
757 int error;
758
759 if (so->so_options & SO_ACCEPTCONN)
760 return (EOPNOTSUPP);
761 /*
762 * If protocol is connection-based, can only connect once.
763 * Otherwise, if connected, try to disconnect first. This allows
764 * user to disconnect by connecting to, e.g., a null address.
765 */
766 if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
767 ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
768 (error = sodisconnect(so)))) {
769 error = EISCONN;
770 } else {
771 /*
772 * Prevent accumulated error from previous connection from
773 * biting us.
774 */
775 so->so_error = 0;
776 error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, td);
777 }
778
779 return (error);
780}
781
782int
783soconnect2(struct socket *so1, struct socket *so2)
784{
785
786 return ((*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2));
787}
788
789int
790sodisconnect(struct socket *so)
791{
792 int error;
793
794 if ((so->so_state & SS_ISCONNECTED) == 0)
795 return (ENOTCONN);
796 if (so->so_state & SS_ISDISCONNECTING)
797 return (EALREADY);
798 error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
799 return (error);
800}
801
802#ifdef ZERO_COPY_SOCKETS
803struct so_zerocopy_stats{
804 int size_ok;
805 int align_ok;
806 int found_ifp;
807};
808struct so_zerocopy_stats so_zerocp_stats = {0,0,0};
809#include <netinet/in.h>
810#include <net/route.h>
811#include <netinet/in_pcb.h>
812#include <vm/vm.h>
813#include <vm/vm_page.h>
814#include <vm/vm_object.h>
815
816/*
817 * sosend_copyin() is only used if zero copy sockets are enabled. Otherwise
818 * sosend_dgram() and sosend_generic() use m_uiotombuf().
819 *
820 * sosend_copyin() accepts a uio and prepares an mbuf chain holding part or
821 * all of the data referenced by the uio. If desired, it uses zero-copy.
822 * *space will be updated to reflect data copied in.
823 *
824 * NB: If atomic I/O is requested, the caller must already have checked that
825 * space can hold resid bytes.
826 *
827 * NB: In the event of an error, the caller may need to free the partial
828 * chain pointed to by *mpp. The contents of both *uio and *space may be
829 * modified even in the case of an error.
830 */
831static int
832sosend_copyin(struct uio *uio, struct mbuf **retmp, int atomic, long *space,
833 int flags)
834{
835 struct mbuf *m, **mp, *top;
836 long len, resid;
837 int error;
838#ifdef ZERO_COPY_SOCKETS
839 int cow_send;
840#endif
841
842 *retmp = top = NULL;
843 mp = &top;
844 len = 0;
845 resid = uio->uio_resid;
846 error = 0;
847 do {
848#ifdef ZERO_COPY_SOCKETS
849 cow_send = 0;
850#endif /* ZERO_COPY_SOCKETS */
851 if (resid >= MINCLSIZE) {
852#ifdef ZERO_COPY_SOCKETS
853 if (top == NULL) {
854 m = m_gethdr(M_WAITOK, MT_DATA);
855 m->m_pkthdr.len = 0;
856 m->m_pkthdr.rcvif = NULL;
857 } else
858 m = m_get(M_WAITOK, MT_DATA);
859 if (so_zero_copy_send &&
860 resid>=PAGE_SIZE &&
861 *space>=PAGE_SIZE &&
862 uio->uio_iov->iov_len>=PAGE_SIZE) {
863 so_zerocp_stats.size_ok++;
864 so_zerocp_stats.align_ok++;
865 cow_send = socow_setup(m, uio);
866 len = cow_send;
867 }
868 if (!cow_send) {
869 m_clget(m, M_WAITOK);
870 len = min(min(MCLBYTES, resid), *space);
871 }
872#else /* ZERO_COPY_SOCKETS */
873 if (top == NULL) {
874 m = m_getcl(M_WAIT, MT_DATA, M_PKTHDR);
875 m->m_pkthdr.len = 0;
876 m->m_pkthdr.rcvif = NULL;
877 } else
878 m = m_getcl(M_WAIT, MT_DATA, 0);
879 len = min(min(MCLBYTES, resid), *space);
880#endif /* ZERO_COPY_SOCKETS */
881 } else {
882 if (top == NULL) {
883 m = m_gethdr(M_WAIT, MT_DATA);
884 m->m_pkthdr.len = 0;
885 m->m_pkthdr.rcvif = NULL;
886
887 len = min(min(MHLEN, resid), *space);
888 /*
889 * For datagram protocols, leave room
890 * for protocol headers in first mbuf.
891 */
892 if (atomic && m && len < MHLEN)
893 MH_ALIGN(m, len);
894 } else {
895 m = m_get(M_WAIT, MT_DATA);
896 len = min(min(MLEN, resid), *space);
897 }
898 }
899 if (m == NULL) {
900 error = ENOBUFS;
901 goto out;
902 }
903
904 *space -= len;
905#ifdef ZERO_COPY_SOCKETS
906 if (cow_send)
907 error = 0;
908 else
909#endif /* ZERO_COPY_SOCKETS */
910 error = uiomove(mtod(m, void *), (int)len, uio);
911 resid = uio->uio_resid;
912 m->m_len = len;
913 *mp = m;
914 top->m_pkthdr.len += len;
915 if (error)
916 goto out;
917 mp = &m->m_next;
918 if (resid <= 0) {
919 if (flags & MSG_EOR)
920 top->m_flags |= M_EOR;
921 break;
922 }
923 } while (*space > 0 && atomic);
924out:
925 *retmp = top;
926 return (error);
927}
928#endif /*ZERO_COPY_SOCKETS*/
929
930#define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
931
932int
933sosend_dgram(struct socket *so, struct sockaddr *addr, struct uio *uio,
934 struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
935{
936 long space, resid;
937 int clen = 0, error, dontroute;
938#ifdef ZERO_COPY_SOCKETS
939 int atomic = sosendallatonce(so) || top;
940#endif
941
942 KASSERT(so->so_type == SOCK_DGRAM, ("sodgram_send: !SOCK_DGRAM"));
943 KASSERT(so->so_proto->pr_flags & PR_ATOMIC,
944 ("sodgram_send: !PR_ATOMIC"));
945
946 if (uio != NULL)
947 resid = uio->uio_resid;
948 else
949 resid = top->m_pkthdr.len;
950 /*
951 * In theory resid should be unsigned. However, space must be
952 * signed, as it might be less than 0 if we over-committed, and we
953 * must use a signed comparison of space and resid. On the other
954 * hand, a negative resid causes us to loop sending 0-length
955 * segments to the protocol.
956 *
957 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
958 * type sockets since that's an error.
959 */
960 if (resid < 0) {
961 error = EINVAL;
962 goto out;
963 }
964
965 dontroute =
966 (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0;
967 if (td != NULL)
968 td->td_ru.ru_msgsnd++;
969 if (control != NULL)
970 clen = control->m_len;
971
972 SOCKBUF_LOCK(&so->so_snd);
973 if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
974 SOCKBUF_UNLOCK(&so->so_snd);
975 error = EPIPE;
976 goto out;
977 }
978 if (so->so_error) {
979 error = so->so_error;
980 so->so_error = 0;
981 SOCKBUF_UNLOCK(&so->so_snd);
982 goto out;
983 }
984 if ((so->so_state & SS_ISCONNECTED) == 0) {
985 /*
986 * `sendto' and `sendmsg' is allowed on a connection-based
987 * socket if it supports implied connect. Return ENOTCONN if
988 * not connected and no address is supplied.
989 */
990 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
991 (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
992 if ((so->so_state & SS_ISCONFIRMING) == 0 &&
993 !(resid == 0 && clen != 0)) {
994 SOCKBUF_UNLOCK(&so->so_snd);
995 error = ENOTCONN;
996 goto out;
997 }
998 } else if (addr == NULL) {
999 if (so->so_proto->pr_flags & PR_CONNREQUIRED)
1000 error = ENOTCONN;
1001 else
1002 error = EDESTADDRREQ;
1003 SOCKBUF_UNLOCK(&so->so_snd);
1004 goto out;
1005 }
1006 }
1007
1008 /*
1009 * Do we need MSG_OOB support in SOCK_DGRAM? Signs here may be a
1010 * problem and need fixing.
1011 */
1012 space = sbspace(&so->so_snd);
1013 if (flags & MSG_OOB)
1014 space += 1024;
1015 space -= clen;
1016 SOCKBUF_UNLOCK(&so->so_snd);
1017 if (resid > space) {
1018 error = EMSGSIZE;
1019 goto out;
1020 }
1021 if (uio == NULL) {
1022 resid = 0;
1023 if (flags & MSG_EOR)
1024 top->m_flags |= M_EOR;
1025 } else {
1026#ifdef ZERO_COPY_SOCKETS
1027 error = sosend_copyin(uio, &top, atomic, &space, flags);
1028 if (error)
1029 goto out;
1030#else
1031 /*
1032 * Copy the data from userland into a mbuf chain.
1033 * If no data is to be copied in, a single empty mbuf
1034 * is returned.
1035 */
1036 top = m_uiotombuf(uio, M_WAITOK, space, max_hdr,
1037 (M_PKTHDR | ((flags & MSG_EOR) ? M_EOR : 0)));
1038 if (top == NULL) {
1039 error = EFAULT; /* only possible error */
1040 goto out;
1041 }
1042 space -= resid - uio->uio_resid;
1043#endif
1044 resid = uio->uio_resid;
1045 }
1046 KASSERT(resid == 0, ("sosend_dgram: resid != 0"));
1047 /*
1048 * XXXRW: Frobbing SO_DONTROUTE here is even worse without sblock
1049 * than with.
1050 */
1051 if (dontroute) {
1052 SOCK_LOCK(so);
1053 so->so_options |= SO_DONTROUTE;
1054 SOCK_UNLOCK(so);
1055 }
1056 /*
1057 * XXX all the SBS_CANTSENDMORE checks previously done could be out
1058 * of date. We could have recieved a reset packet in an interrupt or
1059 * maybe we slept while doing page faults in uiomove() etc. We could
1060 * probably recheck again inside the locking protection here, but
1061 * there are probably other places that this also happens. We must
1062 * rethink this.
1063 */
1064 error = (*so->so_proto->pr_usrreqs->pru_send)(so,
1065 (flags & MSG_OOB) ? PRUS_OOB :
1066 /*
1067 * If the user set MSG_EOF, the protocol understands this flag and
1068 * nothing left to send then use PRU_SEND_EOF instead of PRU_SEND.
1069 */
1070 ((flags & MSG_EOF) &&
1071 (so->so_proto->pr_flags & PR_IMPLOPCL) &&
1072 (resid <= 0)) ?
1073 PRUS_EOF :
1074 /* If there is more to send set PRUS_MORETOCOME */
1075 (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
1076 top, addr, control, td);
1077 if (dontroute) {
1078 SOCK_LOCK(so);
1079 so->so_options &= ~SO_DONTROUTE;
1080 SOCK_UNLOCK(so);
1081 }
1082 clen = 0;
1083 control = NULL;
1084 top = NULL;
1085out:
1086 if (top != NULL)
1087 m_freem(top);
1088 if (control != NULL)
1089 m_freem(control);
1090 return (error);
1091}
1092
1093/*
1094 * Send on a socket. If send must go all at once and message is larger than
1095 * send buffering, then hard error. Lock against other senders. If must go
1096 * all at once and not enough room now, then inform user that this would
1097 * block and do nothing. Otherwise, if nonblocking, send as much as
1098 * possible. The data to be sent is described by "uio" if nonzero, otherwise
1099 * by the mbuf chain "top" (which must be null if uio is not). Data provided
1100 * in mbuf chain must be small enough to send all at once.
1101 *
1102 * Returns nonzero on error, timeout or signal; callers must check for short
1103 * counts if EINTR/ERESTART are returned. Data and control buffers are freed
1104 * on return.
1105 */
1106int
1107sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio,
1108 struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
1109{
1110 long space, resid;
1111 int clen = 0, error, dontroute;
1112 int atomic = sosendallatonce(so) || top;
1113
1114 if (uio != NULL)
1115 resid = uio->uio_resid;
1116 else
1117 resid = top->m_pkthdr.len;
1118 /*
1119 * In theory resid should be unsigned. However, space must be
1120 * signed, as it might be less than 0 if we over-committed, and we
1121 * must use a signed comparison of space and resid. On the other
1122 * hand, a negative resid causes us to loop sending 0-length
1123 * segments to the protocol.
1124 *
1125 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
1126 * type sockets since that's an error.
1127 */
1128 if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
1129 error = EINVAL;
1130 goto out;
1131 }
1132
1133 dontroute =
1134 (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
1135 (so->so_proto->pr_flags & PR_ATOMIC);
1136 if (td != NULL)
1137 td->td_ru.ru_msgsnd++;
1138 if (control != NULL)
1139 clen = control->m_len;
1140
1141 error = sblock(&so->so_snd, SBLOCKWAIT(flags));
1142 if (error)
1143 goto out;
1144
1145restart:
1146 do {
1147 SOCKBUF_LOCK(&so->so_snd);
1148 if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
1149 SOCKBUF_UNLOCK(&so->so_snd);
1150 error = EPIPE;
1151 goto release;
1152 }
1153 if (so->so_error) {
1154 error = so->so_error;
1155 so->so_error = 0;
1156 SOCKBUF_UNLOCK(&so->so_snd);
1157 goto release;
1158 }
1159 if ((so->so_state & SS_ISCONNECTED) == 0) {
1160 /*
1161 * `sendto' and `sendmsg' are allowed on a connection-
1162 * based socket if it supports implied connect.
1163 * Return ENOTCONN if not connected and no address is
1164 * supplied.
1165 */
1166 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
1167 (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
1168 if ((so->so_state & SS_ISCONFIRMING) == 0 &&
1169 !(resid == 0 && clen != 0)) {
1170 SOCKBUF_UNLOCK(&so->so_snd);
1171 error = ENOTCONN;
1172 goto release;
1173 }
1174 } else if (addr == NULL) {
1175 SOCKBUF_UNLOCK(&so->so_snd);
1176 if (so->so_proto->pr_flags & PR_CONNREQUIRED)
1177 error = ENOTCONN;
1178 else
1179 error = EDESTADDRREQ;
1180 goto release;
1181 }
1182 }
1183 space = sbspace(&so->so_snd);
1184 if (flags & MSG_OOB)
1185 space += 1024;
1186 if ((atomic && resid > so->so_snd.sb_hiwat) ||
1187 clen > so->so_snd.sb_hiwat) {
1188 SOCKBUF_UNLOCK(&so->so_snd);
1189 error = EMSGSIZE;
1190 goto release;
1191 }
1192 if (space < resid + clen &&
1193 (atomic || space < so->so_snd.sb_lowat || space < clen)) {
1194 if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO)) {
1195 SOCKBUF_UNLOCK(&so->so_snd);
1196 error = EWOULDBLOCK;
1197 goto release;
1198 }
1199 error = sbwait(&so->so_snd);
1200 SOCKBUF_UNLOCK(&so->so_snd);
1201 if (error)
1202 goto release;
1203 goto restart;
1204 }
1205 SOCKBUF_UNLOCK(&so->so_snd);
1206 space -= clen;
1207 do {
1208 if (uio == NULL) {
1209 resid = 0;
1210 if (flags & MSG_EOR)
1211 top->m_flags |= M_EOR;
1212 } else {
1213#ifdef ZERO_COPY_SOCKETS
1214 error = sosend_copyin(uio, &top, atomic,
1215 &space, flags);
1216 if (error != 0)
1217 goto release;
1218#else
1219 /*
1220 * Copy the data from userland into a mbuf
1221 * chain. If no data is to be copied in,
1222 * a single empty mbuf is returned.
1223 */
1224 top = m_uiotombuf(uio, M_WAITOK, space,
1225 (atomic ? max_hdr : 0),
1226 (atomic ? M_PKTHDR : 0) |
1227 ((flags & MSG_EOR) ? M_EOR : 0));
1228 if (top == NULL) {
1229 error = EFAULT; /* only possible error */
1230 goto release;
1231 }
1232 space -= resid - uio->uio_resid;
1233#endif
1234 resid = uio->uio_resid;
1235 }
1236 if (dontroute) {
1237 SOCK_LOCK(so);
1238 so->so_options |= SO_DONTROUTE;
1239 SOCK_UNLOCK(so);
1240 }
1241 /*
1242 * XXX all the SBS_CANTSENDMORE checks previously
1243 * done could be out of date. We could have received
1244 * a reset packet in an interrupt or maybe we slept
1245 * while doing page faults in uiomove() etc. We
1246 * could probably recheck again inside the locking
1247 * protection here, but there are probably other
1248 * places that this also happens. We must rethink
1249 * this.
1250 */
1251 error = (*so->so_proto->pr_usrreqs->pru_send)(so,
1252 (flags & MSG_OOB) ? PRUS_OOB :
1253 /*
1254 * If the user set MSG_EOF, the protocol understands
1255 * this flag and nothing left to send then use
1256 * PRU_SEND_EOF instead of PRU_SEND.
1257 */
1258 ((flags & MSG_EOF) &&
1259 (so->so_proto->pr_flags & PR_IMPLOPCL) &&
1260 (resid <= 0)) ?
1261 PRUS_EOF :
1262 /* If there is more to send set PRUS_MORETOCOME. */
1263 (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
1264 top, addr, control, td);
1265 if (dontroute) {
1266 SOCK_LOCK(so);
1267 so->so_options &= ~SO_DONTROUTE;
1268 SOCK_UNLOCK(so);
1269 }
1270 clen = 0;
1271 control = NULL;
1272 top = NULL;
1273 if (error)
1274 goto release;
1275 } while (resid && space > 0);
1276 } while (resid);
1277
1278release:
1279 sbunlock(&so->so_snd);
1280out:
1281 if (top != NULL)
1282 m_freem(top);
1283 if (control != NULL)
1284 m_freem(control);
1285 return (error);
1286}
1287
1288int
1289sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
1290 struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
1291{
1292
1293 return (so->so_proto->pr_usrreqs->pru_sosend(so, addr, uio, top,
1294 control, flags, td));
1295}
1296
1297/*
1298 * The part of soreceive() that implements reading non-inline out-of-band
1299 * data from a socket. For more complete comments, see soreceive(), from
1300 * which this code originated.
1301 *
1302 * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is
1303 * unable to return an mbuf chain to the caller.
1304 */
1305static int
1306soreceive_rcvoob(struct socket *so, struct uio *uio, int flags)
1307{
1308 struct protosw *pr = so->so_proto;
1309 struct mbuf *m;
1310 int error;
1311
1312 KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0"));
1313
1314 m = m_get(M_WAIT, MT_DATA);
1315 error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
1316 if (error)
1317 goto bad;
1318 do {
1319#ifdef ZERO_COPY_SOCKETS
1320 if (so_zero_copy_receive) {
1321 int disposable;
1322
1323 if ((m->m_flags & M_EXT)
1324 && (m->m_ext.ext_type == EXT_DISPOSABLE))
1325 disposable = 1;
1326 else
1327 disposable = 0;
1328
1329 error = uiomoveco(mtod(m, void *),
1330 min(uio->uio_resid, m->m_len),
1331 uio, disposable);
1332 } else
1333#endif /* ZERO_COPY_SOCKETS */
1334 error = uiomove(mtod(m, void *),
1335 (int) min(uio->uio_resid, m->m_len), uio);
1336 m = m_free(m);
1337 } while (uio->uio_resid && error == 0 && m);
1338bad:
1339 if (m != NULL)
1340 m_freem(m);
1341 return (error);
1342}
1343
1344/*
1345 * Following replacement or removal of the first mbuf on the first mbuf chain
1346 * of a socket buffer, push necessary state changes back into the socket
1347 * buffer so that other consumers see the values consistently. 'nextrecord'
1348 * is the callers locally stored value of the original value of
1349 * sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes.
1350 * NOTE: 'nextrecord' may be NULL.
1351 */
1352static __inline void
1353sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord)
1354{
1355
1356 SOCKBUF_LOCK_ASSERT(sb);
1357 /*
1358 * First, update for the new value of nextrecord. If necessary, make
1359 * it the first record.
1360 */
1361 if (sb->sb_mb != NULL)
1362 sb->sb_mb->m_nextpkt = nextrecord;
1363 else
1364 sb->sb_mb = nextrecord;
1365
1366 /*
1367 * Now update any dependent socket buffer fields to reflect the new
1368 * state. This is an expanded inline of SB_EMPTY_FIXUP(), with the
1369 * addition of a second clause that takes care of the case where
1370 * sb_mb has been updated, but remains the last record.
1371 */
1372 if (sb->sb_mb == NULL) {
1373 sb->sb_mbtail = NULL;
1374 sb->sb_lastrecord = NULL;
1375 } else if (sb->sb_mb->m_nextpkt == NULL)
1376 sb->sb_lastrecord = sb->sb_mb;
1377}
1378
1379
1380/*
1381 * Implement receive operations on a socket. We depend on the way that
1382 * records are added to the sockbuf by sbappend. In particular, each record
1383 * (mbufs linked through m_next) must begin with an address if the protocol
1384 * so specifies, followed by an optional mbuf or mbufs containing ancillary
1385 * data, and then zero or more mbufs of data. In order to allow parallelism
1386 * between network receive and copying to user space, as well as avoid
1387 * sleeping with a mutex held, we release the socket buffer mutex during the
1388 * user space copy. Although the sockbuf is locked, new data may still be
1389 * appended, and thus we must maintain consistency of the sockbuf during that
1390 * time.
1391 *
1392 * The caller may receive the data as a single mbuf chain by supplying an
1393 * mbuf **mp0 for use in returning the chain. The uio is then used only for
1394 * the count in uio_resid.
1395 */
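/*
 * Editor's sketch (hypothetical caller, not taken from this file): receiving
 * data as a single mbuf chain by passing a non-NULL mp0, with the uio used
 * only to carry the requested byte count in uio_resid, as described above.
 */
#if 0	/* example only */
static int
example_receive_chain(struct socket *so, struct mbuf **mp)
{
	struct uio auio;
	int flags, error;

	bzero(&auio, sizeof(auio));
	auio.uio_resid = 1024 * 1024;	/* upper bound on bytes to receive */
	flags = MSG_DONTWAIT;
	error = soreceive_generic(so, NULL, &auio, mp, NULL, &flags);
	/* On success, '*mp' holds the received chain; caller must m_freem() it. */
	return (error);
}
#endif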
1396int
1397soreceive_generic(struct socket *so, struct sockaddr **psa, struct uio *uio,
1398 struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
1399{
1400 struct mbuf *m, **mp;
1401 int flags, len, error, offset;
1402 struct protosw *pr = so->so_proto;
1403 struct mbuf *nextrecord;
1404 int moff, type = 0;
1405 int orig_resid = uio->uio_resid;
1406
1407 mp = mp0;
1408 if (psa != NULL)
1409 *psa = NULL;
1410 if (controlp != NULL)
1411 *controlp = NULL;
1412 if (flagsp != NULL)
1413 flags = *flagsp &~ MSG_EOR;
1414 else
1415 flags = 0;
1416 if (flags & MSG_OOB)
1417 return (soreceive_rcvoob(so, uio, flags));
1418 if (mp != NULL)
1419 *mp = NULL;
1420 if ((pr->pr_flags & PR_WANTRCVD) && (so->so_state & SS_ISCONFIRMING)
1421 && uio->uio_resid)
1422 (*pr->pr_usrreqs->pru_rcvd)(so, 0);
1423
1424 error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
1425 if (error)
1426 return (error);
1427
1428restart:
1429 SOCKBUF_LOCK(&so->so_rcv);
1430 m = so->so_rcv.sb_mb;
1431 /*
1432 * If we have less data than requested, block awaiting more (subject
1433 * to any timeout) if:
1434 * 1. the current count is less than the low water mark, or
1435 * 2. MSG_WAITALL is set, and it is possible to do the entire
1436 * receive operation at once if we block (resid <= hiwat).
1437 * 3. MSG_DONTWAIT is not set
1438 * If MSG_WAITALL is set but resid is larger than the receive buffer,
1439 * we have to do the receive in sections, and thus risk returning a
1440 * short count if a timeout or signal occurs after we start.
1441 */
1442 if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
1443 so->so_rcv.sb_cc < uio->uio_resid) &&
1444 (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
1445 ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
1446 m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
1447 KASSERT(m != NULL || !so->so_rcv.sb_cc,
1448 ("receive: m == %p so->so_rcv.sb_cc == %u",
1449 m, so->so_rcv.sb_cc));
1450 if (so->so_error) {
1451 if (m != NULL)
1452 goto dontblock;
1453 error = so->so_error;
1454 if ((flags & MSG_PEEK) == 0)
1455 so->so_error = 0;
1456 SOCKBUF_UNLOCK(&so->so_rcv);
1457 goto release;
1458 }
1459 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1460 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
1461 if (m == NULL) {
1462 SOCKBUF_UNLOCK(&so->so_rcv);
1463 goto release;
1464 } else
1465 goto dontblock;
1466 }
1467 for (; m != NULL; m = m->m_next)
1468 if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
1469 m = so->so_rcv.sb_mb;
1470 goto dontblock;
1471 }
1472 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
1473 (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
1474 SOCKBUF_UNLOCK(&so->so_rcv);
1475 error = ENOTCONN;
1476 goto release;
1477 }
1478 if (uio->uio_resid == 0) {
1479 SOCKBUF_UNLOCK(&so->so_rcv);
1480 goto release;
1481 }
1482 if ((so->so_state & SS_NBIO) ||
1483 (flags & (MSG_DONTWAIT|MSG_NBIO))) {
1484 SOCKBUF_UNLOCK(&so->so_rcv);
1485 error = EWOULDBLOCK;
1486 goto release;
1487 }
1488 SBLASTRECORDCHK(&so->so_rcv);
1489 SBLASTMBUFCHK(&so->so_rcv);
1490 error = sbwait(&so->so_rcv);
1491 SOCKBUF_UNLOCK(&so->so_rcv);
1492 if (error)
1493 goto release;
1494 goto restart;
1495 }
1496dontblock:
1497 /*
1498 * From this point onward, we maintain 'nextrecord' as a cache of the
1499 * pointer to the next record in the socket buffer. We must keep the
1500 * various socket buffer pointers and local stack versions of the
1501 * pointers in sync, pushing out modifications before dropping the
1502 * socket buffer mutex, and re-reading them when picking it up.
1503 *
1504 * Otherwise, we will race with the network stack appending new data
1505 * or records onto the socket buffer by using inconsistent/stale
1506 * versions of the fields, possibly resulting in socket buffer
1507 * corruption.
1508 *
1509 * By holding the high-level sblock(), we prevent simultaneous
1510 * readers from pulling off the front of the socket buffer.
1511 */
1512 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1513 if (uio->uio_td)
1514 uio->uio_td->td_ru.ru_msgrcv++;
1515 KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb"));
1516 SBLASTRECORDCHK(&so->so_rcv);
1517 SBLASTMBUFCHK(&so->so_rcv);
1518 nextrecord = m->m_nextpkt;
1519 if (pr->pr_flags & PR_ADDR) {
1520 KASSERT(m->m_type == MT_SONAME,
1521 ("m->m_type == %d", m->m_type));
1522 orig_resid = 0;
1523 if (psa != NULL)
1524 *psa = sodupsockaddr(mtod(m, struct sockaddr *),
1525 M_NOWAIT);
1526 if (flags & MSG_PEEK) {
1527 m = m->m_next;
1528 } else {
1529 sbfree(&so->so_rcv, m);
1530 so->so_rcv.sb_mb = m_free(m);
1531 m = so->so_rcv.sb_mb;
1532 sockbuf_pushsync(&so->so_rcv, nextrecord);
1533 }
1534 }
1535
1536 /*
1537 * Process one or more MT_CONTROL mbufs present before any data mbufs
1538 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
1539 * just copy the data; if !MSG_PEEK, we call into the protocol to
1540 * perform externalization (or freeing if controlp == NULL).
1541 */
1542 if (m != NULL && m->m_type == MT_CONTROL) {
1543 struct mbuf *cm = NULL, *cmn;
1544 struct mbuf **cme = &cm;
1545
1546 do {
1547 if (flags & MSG_PEEK) {
1548 if (controlp != NULL) {
1549 *controlp = m_copy(m, 0, m->m_len);
1550 controlp = &(*controlp)->m_next;
1551 }
1552 m = m->m_next;
1553 } else {
1554 sbfree(&so->so_rcv, m);
1555 so->so_rcv.sb_mb = m->m_next;
1556 m->m_next = NULL;
1557 *cme = m;
1558 cme = &(*cme)->m_next;
1559 m = so->so_rcv.sb_mb;
1560 }
1561 } while (m != NULL && m->m_type == MT_CONTROL);
1562 if ((flags & MSG_PEEK) == 0)
1563 sockbuf_pushsync(&so->so_rcv, nextrecord);
1564 while (cm != NULL) {
1565 cmn = cm->m_next;
1566 cm->m_next = NULL;
1567 if (pr->pr_domain->dom_externalize != NULL) {
1568 SOCKBUF_UNLOCK(&so->so_rcv);
1569 error = (*pr->pr_domain->dom_externalize)
1570 (cm, controlp);
1571 SOCKBUF_LOCK(&so->so_rcv);
1572 } else if (controlp != NULL)
1573 *controlp = cm;
1574 else
1575 m_freem(cm);
1576 if (controlp != NULL) {
1577 orig_resid = 0;
1578 while (*controlp != NULL)
1579 controlp = &(*controlp)->m_next;
1580 }
1581 cm = cmn;
1582 }
1583 if (m != NULL)
1584 nextrecord = so->so_rcv.sb_mb->m_nextpkt;
1585 else
1586 nextrecord = so->so_rcv.sb_mb;
1587 orig_resid = 0;
1588 }
1589 if (m != NULL) {
1590 if ((flags & MSG_PEEK) == 0) {
1591 KASSERT(m->m_nextpkt == nextrecord,
1592 ("soreceive: post-control, nextrecord !sync"));
1593 if (nextrecord == NULL) {
1594 KASSERT(so->so_rcv.sb_mb == m,
1595 ("soreceive: post-control, sb_mb!=m"));
1596 KASSERT(so->so_rcv.sb_lastrecord == m,
1597 ("soreceive: post-control, lastrecord!=m"));
1598 }
1599 }
1600 type = m->m_type;
1601 if (type == MT_OOBDATA)
1602 flags |= MSG_OOB;
1603 } else {
1604 if ((flags & MSG_PEEK) == 0) {
1605 KASSERT(so->so_rcv.sb_mb == nextrecord,
1606 ("soreceive: sb_mb != nextrecord"));
1607 if (so->so_rcv.sb_mb == NULL) {
1608 KASSERT(so->so_rcv.sb_lastrecord == NULL,
1609 ("soreceive: sb_lastrecord != NULL"));
1610 }
1611 }
1612 }
1613 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1614 SBLASTRECORDCHK(&so->so_rcv);
1615 SBLASTMBUFCHK(&so->so_rcv);
1616
1617 /*
1618 * Now continue to read any data mbufs off of the head of the socket
1619 * buffer until the read request is satisfied. Note that 'type' is
1620 * used to store the type of any mbuf reads that have happened so far
1621 * such that soreceive() can stop reading if the type changes, which
1622 * causes soreceive() to return either regular data or inline
1623 * out-of-band data, but not both, in a single socket receive operation.
1624 */
1625 moff = 0;
1626 offset = 0;
1627 while (m != NULL && uio->uio_resid > 0 && error == 0) {
1628 /*
1629 * If the type of mbuf has changed since the last mbuf
1630 * examined ('type'), end the receive operation.
1631 */
1632 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1633 if (m->m_type == MT_OOBDATA) {
1634 if (type != MT_OOBDATA)
1635 break;
1636 } else if (type == MT_OOBDATA)
1637 break;
1638 else
1639 KASSERT(m->m_type == MT_DATA,
1640 ("m->m_type == %d", m->m_type));
1641 so->so_rcv.sb_state &= ~SBS_RCVATMARK;
1642 len = uio->uio_resid;
1643 if (so->so_oobmark && len > so->so_oobmark - offset)
1644 len = so->so_oobmark - offset;
1645 if (len > m->m_len - moff)
1646 len = m->m_len - moff;
1647 /*
1648 * If mp is set, just pass back the mbufs. Otherwise copy
1649 * them out via the uio, then free. Sockbuf must be
1650 * consistent here (sb_mb points to the current mbuf, whose
1651 * m_nextpkt points to the next record) when we drop the lock;
1652 * we must note any additions to the sockbuf when we reacquire it.
1653 */
1654 if (mp == NULL) {
1655 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1656 SBLASTRECORDCHK(&so->so_rcv);
1657 SBLASTMBUFCHK(&so->so_rcv);
1658 SOCKBUF_UNLOCK(&so->so_rcv);
1659#ifdef ZERO_COPY_SOCKETS
1660 if (so_zero_copy_receive) {
1661 int disposable;
1662
1663 if ((m->m_flags & M_EXT)
1664 && (m->m_ext.ext_type == EXT_DISPOSABLE))
1665 disposable = 1;
1666 else
1667 disposable = 0;
1668
1669 error = uiomoveco(mtod(m, char *) + moff,
1670 (int)len, uio,
1671 disposable);
1672 } else
1673#endif /* ZERO_COPY_SOCKETS */
1674 error = uiomove(mtod(m, char *) + moff, (int)len, uio);
1675 SOCKBUF_LOCK(&so->so_rcv);
1676 if (error) {
1677 /*
1678 * The MT_SONAME mbuf has already been removed
1679 * from the record, so it is necessary to
1680 * remove the data mbufs, if any, to preserve
1681 * the invariant in the case of PR_ADDR that
1682 * requires MT_SONAME mbufs at the head of
1683 * each record.
1684 */
1685 if (m && pr->pr_flags & PR_ATOMIC &&
1686 ((flags & MSG_PEEK) == 0))
1687 (void)sbdroprecord_locked(&so->so_rcv);
1688 SOCKBUF_UNLOCK(&so->so_rcv);
1689 goto release;
1690 }
1691 } else
1692 uio->uio_resid -= len;
1693 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1694 if (len == m->m_len - moff) {
1695 if (m->m_flags & M_EOR)
1696 flags |= MSG_EOR;
1697 if (flags & MSG_PEEK) {
1698 m = m->m_next;
1699 moff = 0;
1700 } else {
1701 nextrecord = m->m_nextpkt;
1702 sbfree(&so->so_rcv, m);
1703 if (mp != NULL) {
1704 *mp = m;
1705 mp = &m->m_next;
1706 so->so_rcv.sb_mb = m = m->m_next;
1707 *mp = NULL;
1708 } else {
1709 so->so_rcv.sb_mb = m_free(m);
1710 m = so->so_rcv.sb_mb;
1711 }
1712 sockbuf_pushsync(&so->so_rcv, nextrecord);
1713 SBLASTRECORDCHK(&so->so_rcv);
1714 SBLASTMBUFCHK(&so->so_rcv);
1715 }
1716 } else {
1717 if (flags & MSG_PEEK)
1718 moff += len;
1719 else {
1720 if (mp != NULL) {
1721 int copy_flag;
1722
1723 if (flags & MSG_DONTWAIT)
1724 copy_flag = M_DONTWAIT;
1725 else
1726 copy_flag = M_WAIT;
1727 if (copy_flag == M_WAIT)
1728 SOCKBUF_UNLOCK(&so->so_rcv);
1729 *mp = m_copym(m, 0, len, copy_flag);
1730 if (copy_flag == M_WAIT)
1731 SOCKBUF_LOCK(&so->so_rcv);
1732 if (*mp == NULL) {
1733 /*
1734 * m_copym() couldn't
1735 * allocate an mbuf. Adjust
1736 * uio_resid back (it was
1737 * adjusted down by len
1738 * bytes, which we didn't end
1739 * up "copying" over).
1740 */
1741 uio->uio_resid += len;
1742 break;
1743 }
1744 }
1745 m->m_data += len;
1746 m->m_len -= len;
1747 so->so_rcv.sb_cc -= len;
1748 }
1749 }
1750 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1751 if (so->so_oobmark) {
1752 if ((flags & MSG_PEEK) == 0) {
1753 so->so_oobmark -= len;
1754 if (so->so_oobmark == 0) {
1755 so->so_rcv.sb_state |= SBS_RCVATMARK;
1756 break;
1757 }
1758 } else {
1759 offset += len;
1760 if (offset == so->so_oobmark)
1761 break;
1762 }
1763 }
1764 if (flags & MSG_EOR)
1765 break;
1766 /*
1767 * If the MSG_WAITALL flag is set (for a non-atomic socket), we
1768 * must not quit until "uio->uio_resid == 0" or an error
1769 * termination. If a signal/timeout occurs, return with a
1770 * short count but without error. Keep sockbuf locked
1771 * against other readers.
1772 */
1773 while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
1774 !sosendallatonce(so) && nextrecord == NULL) {
1775 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1776 if (so->so_error || so->so_rcv.sb_state & SBS_CANTRCVMORE)
1777 break;
1778 /*
1779 * Notify the protocol that some data has been
1780 * drained before blocking.
1781 */
1782 if (pr->pr_flags & PR_WANTRCVD) {
1783 SOCKBUF_UNLOCK(&so->so_rcv);
1784 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
1785 SOCKBUF_LOCK(&so->so_rcv);
1786 }
1787 SBLASTRECORDCHK(&so->so_rcv);
1788 SBLASTMBUFCHK(&so->so_rcv);
1789 error = sbwait(&so->so_rcv);
1790 if (error) {
1791 SOCKBUF_UNLOCK(&so->so_rcv);
1792 goto release;
1793 }
1794 m = so->so_rcv.sb_mb;
1795 if (m != NULL)
1796 nextrecord = m->m_nextpkt;
1797 }
1798 }
1799
1800 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1801 if (m != NULL && pr->pr_flags & PR_ATOMIC) {
1802 flags |= MSG_TRUNC;
1803 if ((flags & MSG_PEEK) == 0)
1804 (void) sbdroprecord_locked(&so->so_rcv);
1805 }
1806 if ((flags & MSG_PEEK) == 0) {
1807 if (m == NULL) {
1808 /*
1809 * First part is an inline SB_EMPTY_FIXUP(). Second
1810 * part makes sure sb_lastrecord is up-to-date if
1811 * there is still data in the socket buffer.
1812 */
1813 so->so_rcv.sb_mb = nextrecord;
1814 if (so->so_rcv.sb_mb == NULL) {
1815 so->so_rcv.sb_mbtail = NULL;
1816 so->so_rcv.sb_lastrecord = NULL;
1817 } else if (nextrecord->m_nextpkt == NULL)
1818 so->so_rcv.sb_lastrecord = nextrecord;
1819 }
1820 SBLASTRECORDCHK(&so->so_rcv);
1821 SBLASTMBUFCHK(&so->so_rcv);
1822 /*
1823 * If soreceive() is being done from the socket callback,
1824 * then we don't need to generate an ACK to the peer to update the
1825 * window, since an ACK will be generated on return to TCP.
1826 */
1827 if (!(flags & MSG_SOCALLBCK) &&
1828 (pr->pr_flags & PR_WANTRCVD)) {
1829 SOCKBUF_UNLOCK(&so->so_rcv);
1830 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
1831 SOCKBUF_LOCK(&so->so_rcv);
1832 }
1833 }
1834 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1835 if (orig_resid == uio->uio_resid && orig_resid &&
1836 (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) {
1837 SOCKBUF_UNLOCK(&so->so_rcv);
1838 goto restart;
1839 }
1840 SOCKBUF_UNLOCK(&so->so_rcv);
1841
1842 if (flagsp != NULL)
1843 *flagsp |= flags;
1844release:
1845 sbunlock(&so->so_rcv);
1846 return (error);
1847}
1848
1849/*
1850 * Optimized version of soreceive() for simple datagram cases from userspace.
1851 * Unlike in the stream case, we're able to drop a datagram if copyout()
1852 * fails, and because we handle datagrams atomically, we don't need to use a
1853 * sleep lock to prevent I/O interlacing.
1854 */
1855int
1856soreceive_dgram(struct socket *so, struct sockaddr **psa, struct uio *uio,
1857 struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
1858{
1859 struct mbuf *m, *m2;
1860 int flags, len, error, offset;
1861 struct protosw *pr = so->so_proto;
1862 struct mbuf *nextrecord;
1863
1864 if (psa != NULL)
1865 *psa = NULL;
1866 if (controlp != NULL)
1867 *controlp = NULL;
1868 if (flagsp != NULL)
1869 flags = *flagsp &~ MSG_EOR;
1870 else
1871 flags = 0;
1872
1873 /*
1874 * For any complicated cases, fall back to the full
1875 * soreceive_generic().
1876 */
1877 if (mp0 != NULL || (flags & MSG_PEEK) || (flags & MSG_OOB))
1878 return (soreceive_generic(so, psa, uio, mp0, controlp,
1879 flagsp));
1880
1881 /*
1882 * Enforce restrictions on use.
1883 */
1884 KASSERT((pr->pr_flags & PR_WANTRCVD) == 0,
1885 ("soreceive_dgram: wantrcvd"));
1886 KASSERT(pr->pr_flags & PR_ATOMIC, ("soreceive_dgram: !atomic"));
1887 KASSERT((so->so_rcv.sb_state & SBS_RCVATMARK) == 0,
1888 ("soreceive_dgram: SBS_RCVATMARK"));
1889 KASSERT((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0,
1890 ("soreceive_dgram: PR_CONNREQUIRED"));
1891
1892 /*
1893 * Loop blocking while waiting for a datagram.
1894 */
1895 SOCKBUF_LOCK(&so->so_rcv);
1896 while ((m = so->so_rcv.sb_mb) == NULL) {
1897 KASSERT(so->so_rcv.sb_cc == 0,
1898 ("soreceive_dgram: sb_mb NULL but sb_cc %u",
1899 so->so_rcv.sb_cc));
1900 if (so->so_error) {
1901 error = so->so_error;
1902 so->so_error = 0;
1903 SOCKBUF_UNLOCK(&so->so_rcv);
1904 return (error);
1905 }
1906 if (so->so_rcv.sb_state & SBS_CANTRCVMORE ||
1907 uio->uio_resid == 0) {
1908 SOCKBUF_UNLOCK(&so->so_rcv);
1909 return (0);
1910 }
1911 if ((so->so_state & SS_NBIO) ||
1912 (flags & (MSG_DONTWAIT|MSG_NBIO))) {
1913 SOCKBUF_UNLOCK(&so->so_rcv);
1914 return (EWOULDBLOCK);
1915 }
1916 SBLASTRECORDCHK(&so->so_rcv);
1917 SBLASTMBUFCHK(&so->so_rcv);
1918 error = sbwait(&so->so_rcv);
1919 if (error) {
1920 SOCKBUF_UNLOCK(&so->so_rcv);
1921 return (error);
1922 }
1923 }
1924 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1925
1926 if (uio->uio_td)
1927 uio->uio_td->td_ru.ru_msgrcv++;
1928 SBLASTRECORDCHK(&so->so_rcv);
1929 SBLASTMBUFCHK(&so->so_rcv);
1930 nextrecord = m->m_nextpkt;
1931 if (nextrecord == NULL) {
1932 KASSERT(so->so_rcv.sb_lastrecord == m,
1933 ("soreceive_dgram: lastrecord != m"));
1934 }
1935
1936 KASSERT(so->so_rcv.sb_mb->m_nextpkt == nextrecord,
1937 ("soreceive_dgram: m_nextpkt != nextrecord"));
1938
1939 /*
1940 * Pull 'm' and its chain off the front of the packet queue.
1941 */
1942 so->so_rcv.sb_mb = NULL;
1943 sockbuf_pushsync(&so->so_rcv, nextrecord);
1944
1945 /*
1946 * Walk 'm's chain and free that many bytes from the socket buffer.
1947 */
1948 for (m2 = m; m2 != NULL; m2 = m2->m_next)
1949 sbfree(&so->so_rcv, m2);
1950
1951 /*
1952 * Do a few last checks before we let go of the lock.
1953 */
1954 SBLASTRECORDCHK(&so->so_rcv);
1955 SBLASTMBUFCHK(&so->so_rcv);
1956 SOCKBUF_UNLOCK(&so->so_rcv);
1957
1958 if (pr->pr_flags & PR_ADDR) {
1959 KASSERT(m->m_type == MT_SONAME,
1960 ("m->m_type == %d", m->m_type));
1961 if (psa != NULL)
1962 *psa = sodupsockaddr(mtod(m, struct sockaddr *),
1963 M_NOWAIT);
1964 m = m_free(m);
1965 }
1966 if (m == NULL) {
1967 /* XXXRW: Can this happen? */
1968 return (0);
1969 }
1970
1971 /*
1972 * Packet to copyout() is now in 'm' and it is disconnected from the
1973 * queue.
1974 *
1975 * Process one or more MT_CONTROL mbufs present before any data mbufs
1976 * in the first mbuf chain on the socket buffer. We call into the
1977 * protocol to perform externalization (or freeing if controlp ==
1978 * NULL).
1979 */
1980 if (m->m_type == MT_CONTROL) {
1981 struct mbuf *cm = NULL, *cmn;
1982 struct mbuf **cme = &cm;
1983
1984 do {
1985 m2 = m->m_next;
1986 m->m_next = NULL;
1987 *cme = m;
1988 cme = &(*cme)->m_next;
1989 m = m2;
1990 } while (m != NULL && m->m_type == MT_CONTROL);
1991 while (cm != NULL) {
1992 cmn = cm->m_next;
1993 cm->m_next = NULL;
1994 if (pr->pr_domain->dom_externalize != NULL) {
1995 error = (*pr->pr_domain->dom_externalize)
1996 (cm, controlp);
1997 } else if (controlp != NULL)
1998 *controlp = cm;
1999 else
2000 m_freem(cm);
2001 if (controlp != NULL) {
2002 while (*controlp != NULL)
2003 controlp = &(*controlp)->m_next;
2004 }
2005 cm = cmn;
2006 }
2007 }
2008 KASSERT(m->m_type == MT_DATA, ("soreceive_dgram: !data"));
2009
2010 offset = 0;
2011 while (m != NULL && uio->uio_resid > 0) {
2012 len = uio->uio_resid;
2013 if (len > m->m_len)
2014 len = m->m_len;
2015 error = uiomove(mtod(m, char *), (int)len, uio);
2016 if (error) {
2017 m_freem(m);
2018 return (error);
2019 }
2020 m = m_free(m);
2021 }
2022 if (m != NULL)
2023 flags |= MSG_TRUNC;
2024 m_freem(m);
2025 if (flagsp != NULL)
2026 *flagsp |= flags;
2027 return (0);
2028}
2029
2030int
2031soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
2032 struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
2033{
2034
2035 return (so->so_proto->pr_usrreqs->pru_soreceive(so, psa, uio, mp0,
2036 controlp, flagsp));
2037}
2038
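/*
 * Example (sketch): a minimal in-kernel read of up to 'buflen' bytes from an
 * already-connected socket 'so' into a kernel buffer, via the soreceive()
 * entry point above.  The helper name and parameters are illustrative only;
 * 'td' is the calling thread.
 */
static int
example_soreceive_kernel(struct socket *so, void *buf, int buflen,
    struct thread *td)
{
	struct uio auio;
	struct iovec aiov;
	int error, flags;

	aiov.iov_base = buf;
	aiov.iov_len = buflen;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = 0;
	auio.uio_resid = buflen;
	auio.uio_segflg = UIO_SYSSPACE;	/* destination is a kernel buffer */
	auio.uio_rw = UIO_READ;
	auio.uio_td = td;
	flags = 0;
	error = soreceive(so, NULL, &auio, NULL, NULL, &flags);
	return (error);
}
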
2039int
2040soshutdown(struct socket *so, int how)
2041{
2042 struct protosw *pr = so->so_proto;
2043
2044 if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
2045 return (EINVAL);
2046 if (pr->pr_usrreqs->pru_flush != NULL) {
2047 (*pr->pr_usrreqs->pru_flush)(so, how);
2048 }
2049 if (how != SHUT_WR)
2050 sorflush(so);
2051 if (how != SHUT_RD)
2052 return ((*pr->pr_usrreqs->pru_shutdown)(so));
2053 return (0);
2054}
2055
2056void
2057sorflush(struct socket *so)
2058{
2059 struct sockbuf *sb = &so->so_rcv;
2060 struct protosw *pr = so->so_proto;
2061 struct sockbuf asb;
2062
2063 /*
2064 * In order to avoid calling dom_dispose with the socket buffer mutex
2065 * held, and in order to generally avoid holding the lock for a long
2066 * time, we make a copy of the socket buffer and clear the original
2067 * (except locks, state). The new socket buffer copy won't have
2068 * initialized locks so we can only call routines that won't use or
2069 * assert those locks.
2070 *
2071 * Dislodge threads currently blocked in receive and wait to acquire
2072 * a lock against other simultaneous readers before clearing the
2073 * socket buffer. Don't let our acquire be interrupted by a signal
2074 * despite any existing socket disposition on interruptible waiting.
2075 */
2076 socantrcvmore(so);
2077 (void) sblock(sb, SBL_WAIT | SBL_NOINTR);
2078
2079 /*
2080 * Invalidate/clear most of the sockbuf structure, but leave selinfo
2081 * and mutex data unchanged.
2082 */
2083 SOCKBUF_LOCK(sb);
2084 bzero(&asb, offsetof(struct sockbuf, sb_startzero));
2085 bcopy(&sb->sb_startzero, &asb.sb_startzero,
2086 sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
2087 bzero(&sb->sb_startzero,
2088 sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
2089 SOCKBUF_UNLOCK(sb);
2090 sbunlock(sb);
2091
2092 /*
2093 * Dispose of special rights and flush the socket buffer. Don't call
2094 * any unsafe routines (that rely on locks being initialized) on asb.
2095 */
2096 if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
2097 (*pr->pr_domain->dom_dispose)(asb.sb_mb);
2098 sbrelease_internal(&asb, so);
2099}
2100
2101/*
2102 * Perhaps this routine, and sooptcopyout(), below, ought to come in an
2103 * additional variant to handle the case where the option value needs to be
2104 * some kind of integer, but not a specific size. In addition to their use
2105 * here, these functions are also called by the protocol-level pr_ctloutput()
2106 * routines.
2107 */
2108int
2109sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
2110{
2111 size_t valsize;
2112
2113 /*
2114 * If the user gives us more than we wanted, we ignore it, but if we
2115 * don't get the minimum length the caller wants, we return EINVAL.
2116 * On success, sopt->sopt_valsize is set to however much we actually
2117 * retrieved.
2118 */
2119 if ((valsize = sopt->sopt_valsize) < minlen)
2120 return EINVAL;
2121 if (valsize > len)
2122 sopt->sopt_valsize = valsize = len;
2123
2124 if (sopt->sopt_td != NULL)
2125 return (copyin(sopt->sopt_val, buf, valsize));
2126
2127 bcopy(sopt->sopt_val, buf, valsize);
2128 return (0);
2129}
2130
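/*
 * Example (sketch): how a protocol-level ctloutput routine might use
 * sooptcopyin() to fetch an integer-sized option value.  The handler and
 * option are hypothetical; a real protocol would dispatch on sopt_name.
 */
static int
example_ctloutput_set(struct socket *so, struct sockopt *sopt)
{
	int error, optval;

	if (sopt->sopt_dir != SOPT_SET)
		return (EINVAL);
	error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval));
	if (error)
		return (error);
	/* ... apply 'optval' to protocol state here ... */
	return (0);
}
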
2131/*
2132 * Kernel version of setsockopt(2).
2133 *
2134 * XXX: optlen is size_t, not socklen_t
2135 */
2136int
2137so_setsockopt(struct socket *so, int level, int optname, void *optval,
2138 size_t optlen)
2139{
2140 struct sockopt sopt;
2141
2142 sopt.sopt_level = level;
2143 sopt.sopt_name = optname;
2144 sopt.sopt_dir = SOPT_SET;
2145 sopt.sopt_val = optval;
2146 sopt.sopt_valsize = optlen;
2147 sopt.sopt_td = NULL;
2148 return (sosetopt(so, &sopt));
2149}
2150
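/*
 * Example (sketch): enabling SO_REUSEADDR from kernel code using the
 * in-kernel setsockopt wrapper above; 'so' is any valid socket and the
 * helper name is illustrative.
 */
static int
example_set_reuseaddr(struct socket *so)
{
	int on = 1;

	return (so_setsockopt(so, SOL_SOCKET, SO_REUSEADDR, &on,
	    sizeof(on)));
}
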
2151int
2152sosetopt(struct socket *so, struct sockopt *sopt)
2153{
2154 int error, optval;
2155 struct linger l;
2156 struct timeval tv;
2157 u_long val;
2158#ifdef MAC
2159 struct mac extmac;
2160#endif
2161
2162 error = 0;
2163 if (sopt->sopt_level != SOL_SOCKET) {
2164 if (so->so_proto && so->so_proto->pr_ctloutput)
2165 return ((*so->so_proto->pr_ctloutput)
2166 (so, sopt));
2167 error = ENOPROTOOPT;
2168 } else {
2169 switch (sopt->sopt_name) {
2170#ifdef INET
2171 case SO_ACCEPTFILTER:
2172 error = do_setopt_accept_filter(so, sopt);
2173 if (error)
2174 goto bad;
2175 break;
2176#endif
2177 case SO_LINGER:
2178 error = sooptcopyin(sopt, &l, sizeof l, sizeof l);
2179 if (error)
2180 goto bad;
2181
2182 SOCK_LOCK(so);
2183 so->so_linger = l.l_linger;
2184 if (l.l_onoff)
2185 so->so_options |= SO_LINGER;
2186 else
2187 so->so_options &= ~SO_LINGER;
2188 SOCK_UNLOCK(so);
2189 break;
2190
2191 case SO_DEBUG:
2192 case SO_KEEPALIVE:
2193 case SO_DONTROUTE:
2194 case SO_USELOOPBACK:
2195 case SO_BROADCAST:
2196 case SO_REUSEADDR:
2197 case SO_REUSEPORT:
2198 case SO_OOBINLINE:
2199 case SO_TIMESTAMP:
2200 case SO_BINTIME:
2201 case SO_NOSIGPIPE:
2202 case SO_NO_DDP:
2203 case SO_NO_OFFLOAD:
2204 error = sooptcopyin(sopt, &optval, sizeof optval,
2205 sizeof optval);
2206 if (error)
2207 goto bad;
2208 SOCK_LOCK(so);
2209 if (optval)
2210 so->so_options |= sopt->sopt_name;
2211 else
2212 so->so_options &= ~sopt->sopt_name;
2213 SOCK_UNLOCK(so);
2214 break;
2215
2216 case SO_SETFIB:
2217 error = sooptcopyin(sopt, &optval, sizeof optval,
2218 sizeof optval);
2219 if (optval < 1 || optval > rt_numfibs) {
2220 error = EINVAL;
2221 goto bad;
2222 }
2223 if ((so->so_proto->pr_domain->dom_family == PF_INET) ||
2224 (so->so_proto->pr_domain->dom_family == PF_ROUTE)) {
2225 so->so_fibnum = optval;
2226 /* Note: ignore error */
2227 if (so->so_proto && so->so_proto->pr_ctloutput)
2228 (*so->so_proto->pr_ctloutput)(so, sopt);
2229 } else {
2230 so->so_fibnum = 0;
2231 }
2232 break;
2233 case SO_SNDBUF:
2234 case SO_RCVBUF:
2235 case SO_SNDLOWAT:
2236 case SO_RCVLOWAT:
2237 error = sooptcopyin(sopt, &optval, sizeof optval,
2238 sizeof optval);
2239 if (error)
2240 goto bad;
2241
2242 /*
2243 * Values < 1 make no sense for any of these options,
2244 * so disallow them.
2245 */
2246 if (optval < 1) {
2247 error = EINVAL;
2248 goto bad;
2249 }
2250
2251 switch (sopt->sopt_name) {
2252 case SO_SNDBUF:
2253 case SO_RCVBUF:
2254 if (sbreserve(sopt->sopt_name == SO_SNDBUF ?
2255 &so->so_snd : &so->so_rcv, (u_long)optval,
2256 so, curthread) == 0) {
2257 error = ENOBUFS;
2258 goto bad;
2259 }
2260 (sopt->sopt_name == SO_SNDBUF ? &so->so_snd :
2261 &so->so_rcv)->sb_flags &= ~SB_AUTOSIZE;
2262 break;
2263
2264 /*
2265 * Make sure the low-water is never greater than the
2266 * high-water.
2267 */
2268 case SO_SNDLOWAT:
2269 SOCKBUF_LOCK(&so->so_snd);
2270 so->so_snd.sb_lowat =
2271 (optval > so->so_snd.sb_hiwat) ?
2272 so->so_snd.sb_hiwat : optval;
2273 SOCKBUF_UNLOCK(&so->so_snd);
2274 break;
2275 case SO_RCVLOWAT:
2276 SOCKBUF_LOCK(&so->so_rcv);
2277 so->so_rcv.sb_lowat =
2278 (optval > so->so_rcv.sb_hiwat) ?
2279 so->so_rcv.sb_hiwat : optval;
2280 SOCKBUF_UNLOCK(&so->so_rcv);
2281 break;
2282 }
2283 break;
2284
2285 case SO_SNDTIMEO:
2286 case SO_RCVTIMEO:
2287#ifdef COMPAT_IA32
2288 if (SV_CURPROC_FLAG(SV_ILP32)) {
2289 struct timeval32 tv32;
2290
2291 error = sooptcopyin(sopt, &tv32, sizeof tv32,
2292 sizeof tv32);
2293 CP(tv32, tv, tv_sec);
2294 CP(tv32, tv, tv_usec);
2295 } else
2296#endif
2297 error = sooptcopyin(sopt, &tv, sizeof tv,
2298 sizeof tv);
2299 if (error)
2300 goto bad;
2301
2302 /* assert(hz > 0); */
2303 if (tv.tv_sec < 0 || tv.tv_sec > INT_MAX / hz ||
2304 tv.tv_usec < 0 || tv.tv_usec >= 1000000) {
2305 error = EDOM;
2306 goto bad;
2307 }
2308 /* assert(tick > 0); */
2309 /* assert(ULONG_MAX - INT_MAX >= 1000000); */
2310 val = (u_long)(tv.tv_sec * hz) + tv.tv_usec / tick;
2311 if (val > INT_MAX) {
2312 error = EDOM;
2313 goto bad;
2314 }
2315 if (val == 0 && tv.tv_usec != 0)
2316 val = 1;
2317
2318 switch (sopt->sopt_name) {
2319 case SO_SNDTIMEO:
2320 so->so_snd.sb_timeo = val;
2321 break;
2322 case SO_RCVTIMEO:
2323 so->so_rcv.sb_timeo = val;
2324 break;
2325 }
2326 break;
2327
2328 case SO_LABEL:
2329#ifdef MAC
2330 error = sooptcopyin(sopt, &extmac, sizeof extmac,
2331 sizeof extmac);
2332 if (error)
2333 goto bad;
2334 error = mac_setsockopt_label(sopt->sopt_td->td_ucred,
2335 so, &extmac);
2336#else
2337 error = EOPNOTSUPP;
2338#endif
2339 break;
2340
2341 default:
2342 error = ENOPROTOOPT;
2343 break;
2344 }
2345 if (error == 0 && so->so_proto != NULL &&
2346 so->so_proto->pr_ctloutput != NULL) {
2347 (void) ((*so->so_proto->pr_ctloutput)
2348 (so, sopt));
2349 }
2350 }
2351bad:
2352 return (error);
2353}
2354
2355/*
2356 * Helper routine for getsockopt.
2357 */
2358int
2359sooptcopyout(struct sockopt *sopt, const void *buf, size_t len)
2360{
2361 int error;
2362 size_t valsize;
2363
2364 error = 0;
2365
2366 /*
2367 * Documented get behavior is that we always return a value, possibly
2368 * truncated to fit in the user's buffer. Traditional behavior is
2369 * that we always tell the user precisely how much we copied, rather
2370 * than something useful like the total amount we had available for
2371 * her. Note that this interface is not idempotent; the entire
2372 * answer must be generated ahead of time.
2373 */
2374 valsize = min(len, sopt->sopt_valsize);
2375 sopt->sopt_valsize = valsize;
2376 if (sopt->sopt_val != NULL) {
2377 if (sopt->sopt_td != NULL)
2378 error = copyout(buf, sopt->sopt_val, valsize);
2379 else
2380 bcopy(buf, sopt->sopt_val, valsize);
2381 }
2382 return (error);
2383}
2384
2385int
2386sogetopt(struct socket *so, struct sockopt *sopt)
2387{
2388 int error, optval;
2389 struct linger l;
2390 struct timeval tv;
2391#ifdef MAC
2392 struct mac extmac;
2393#endif
2394
2395 error = 0;
2396 if (sopt->sopt_level != SOL_SOCKET) {
2397 if (so->so_proto && so->so_proto->pr_ctloutput) {
2398 return ((*so->so_proto->pr_ctloutput)
2399 (so, sopt));
2400 } else
2401 return (ENOPROTOOPT);
2402 } else {
2403 switch (sopt->sopt_name) {
2404#ifdef INET
2405 case SO_ACCEPTFILTER:
2406 error = do_getopt_accept_filter(so, sopt);
2407 break;
2408#endif
2409 case SO_LINGER:
2410 SOCK_LOCK(so);
2411 l.l_onoff = so->so_options & SO_LINGER;
2412 l.l_linger = so->so_linger;
2413 SOCK_UNLOCK(so);
2414 error = sooptcopyout(sopt, &l, sizeof l);
2415 break;
2416
2417 case SO_USELOOPBACK:
2418 case SO_DONTROUTE:
2419 case SO_DEBUG:
2420 case SO_KEEPALIVE:
2421 case SO_REUSEADDR:
2422 case SO_REUSEPORT:
2423 case SO_BROADCAST:
2424 case SO_OOBINLINE:
2425 case SO_ACCEPTCONN:
2426 case SO_TIMESTAMP:
2427 case SO_BINTIME:
2428 case SO_NOSIGPIPE:
2429 optval = so->so_options & sopt->sopt_name;
2430integer:
2431 error = sooptcopyout(sopt, &optval, sizeof optval);
2432 break;
2433
2434 case SO_TYPE:
2435 optval = so->so_type;
2436 goto integer;
2437
2438 case SO_ERROR:
2439 SOCK_LOCK(so);
2440 optval = so->so_error;
2441 so->so_error = 0;
2442 SOCK_UNLOCK(so);
2443 goto integer;
2444
2445 case SO_SNDBUF:
2446 optval = so->so_snd.sb_hiwat;
2447 goto integer;
2448
2449 case SO_RCVBUF:
2450 optval = so->so_rcv.sb_hiwat;
2451 goto integer;
2452
2453 case SO_SNDLOWAT:
2454 optval = so->so_snd.sb_lowat;
2455 goto integer;
2456
2457 case SO_RCVLOWAT:
2458 optval = so->so_rcv.sb_lowat;
2459 goto integer;
2460
2461 case SO_SNDTIMEO:
2462 case SO_RCVTIMEO:
2463 optval = (sopt->sopt_name == SO_SNDTIMEO ?
2464 so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
2465
2466 tv.tv_sec = optval / hz;
2467 tv.tv_usec = (optval % hz) * tick;
2468#ifdef COMPAT_IA32
2469 if (SV_CURPROC_FLAG(SV_ILP32)) {
2470 struct timeval32 tv32;
2471
2472 CP(tv, tv32, tv_sec);
2473 CP(tv, tv32, tv_usec);
2474 error = sooptcopyout(sopt, &tv32, sizeof tv32);
2475 } else
2476#endif
2477 error = sooptcopyout(sopt, &tv, sizeof tv);
2478 break;
2479
2480 case SO_LABEL:
2481#ifdef MAC
2482 error = sooptcopyin(sopt, &extmac, sizeof(extmac),
2483 sizeof(extmac));
2484 if (error)
2485 return (error);
2486 error = mac_getsockopt_label(sopt->sopt_td->td_ucred,
2487 so, &extmac);
2488 if (error)
2489 return (error);
2490 error = sooptcopyout(sopt, &extmac, sizeof extmac);
2491#else
2492 error = EOPNOTSUPP;
2493#endif
2494 break;
2495
2496 case SO_PEERLABEL:
2497#ifdef MAC
2498 error = sooptcopyin(sopt, &extmac, sizeof(extmac),
2499 sizeof(extmac));
2500 if (error)
2501 return (error);
2502 error = mac_getsockopt_peerlabel(
2503 sopt->sopt_td->td_ucred, so, &extmac);
2504 if (error)
2505 return (error);
2506 error = sooptcopyout(sopt, &extmac, sizeof extmac);
2507#else
2508 error = EOPNOTSUPP;
2509#endif
2510 break;
2511
2512 case SO_LISTENQLIMIT:
2513 optval = so->so_qlimit;
2514 goto integer;
2515
2516 case SO_LISTENQLEN:
2517 optval = so->so_qlen;
2518 goto integer;
2519
2520 case SO_LISTENINCQLEN:
2521 optval = so->so_incqlen;
2522 goto integer;
2523
2524 default:
2525 error = ENOPROTOOPT;
2526 break;
2527 }
2528 return (error);
2529 }
2530}
2531
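/*
 * Example (sketch): a kernel-side get of SO_ERROR by building a struct
 * sockopt by hand and calling sogetopt(), mirroring so_setsockopt() above.
 * The helper name is illustrative; sopt_td == NULL means the value buffer
 * is in kernel space.
 */
static int
example_get_so_error(struct socket *so, int *errp)
{
	struct sockopt sopt;

	sopt.sopt_level = SOL_SOCKET;
	sopt.sopt_name = SO_ERROR;
	sopt.sopt_dir = SOPT_GET;
	sopt.sopt_val = errp;
	sopt.sopt_valsize = sizeof(*errp);
	sopt.sopt_td = NULL;	/* kernel address space */
	return (sogetopt(so, &sopt));
}
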
2532/* XXX; prepare mbuf for (__FreeBSD__ < 3) routines. */
2533int
2534soopt_getm(struct sockopt *sopt, struct mbuf **mp)
2535{
2536 struct mbuf *m, *m_prev;
2537 int sopt_size = sopt->sopt_valsize;
2538
2539 MGET(m, sopt->sopt_td ? M_WAIT : M_DONTWAIT, MT_DATA);
2540 if (m == NULL)
2541 return ENOBUFS;
2542 if (sopt_size > MLEN) {
2543 MCLGET(m, sopt->sopt_td ? M_WAIT : M_DONTWAIT);
2544 if ((m->m_flags & M_EXT) == 0) {
2545 m_free(m);
2546 return ENOBUFS;
2547 }
2548 m->m_len = min(MCLBYTES, sopt_size);
2549 } else {
2550 m->m_len = min(MLEN, sopt_size);
2551 }
2552 sopt_size -= m->m_len;
2553 *mp = m;
2554 m_prev = m;
2555
2556 while (sopt_size) {
2557 MGET(m, sopt->sopt_td ? M_WAIT : M_DONTWAIT, MT_DATA);
2558 if (m == NULL) {
2559 m_freem(*mp);
2560 return ENOBUFS;
2561 }
2562 if (sopt_size > MLEN) {
2563 MCLGET(m, sopt->sopt_td != NULL ? M_WAIT :
2564 M_DONTWAIT);
2565 if ((m->m_flags & M_EXT) == 0) {
2566 m_freem(m);
2567 m_freem(*mp);
2568 return ENOBUFS;
2569 }
2570 m->m_len = min(MCLBYTES, sopt_size);
2571 } else {
2572 m->m_len = min(MLEN, sopt_size);
2573 }
2574 sopt_size -= m->m_len;
2575 m_prev->m_next = m;
2576 m_prev = m;
2577 }
2578 return (0);
2579}
2580
2581/* XXX; copyin sopt data into mbuf chain for (__FreeBSD__ < 3) routines. */
2582int
2583soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
2584{
2585 struct mbuf *m0 = m;
2586
2587 if (sopt->sopt_val == NULL)
2588 return (0);
2589 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
2590 if (sopt->sopt_td != NULL) {
2591 int error;
2592
2593 error = copyin(sopt->sopt_val, mtod(m, char *),
2594 m->m_len);
2595 if (error != 0) {
2596 m_freem(m0);
2597 return(error);
2598 }
2599 } else
2600 bcopy(sopt->sopt_val, mtod(m, char *), m->m_len);
2601 sopt->sopt_valsize -= m->m_len;
2602 sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
2603 m = m->m_next;
2604 }
2605 if (m != NULL) /* enough mbufs should have been allocated at ip6_sooptmcopyin() */
2606 panic("ip6_sooptmcopyin");
2607 return (0);
2608}
2609
2610/* XXX; copyout mbuf chain data into soopt for (__FreeBSD__ < 3) routines. */
2611int
2612soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
2613{
2614 struct mbuf *m0 = m;
2615 size_t valsize = 0;
2616
2617 if (sopt->sopt_val == NULL)
2618 return (0);
2619 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
2620 if (sopt->sopt_td != NULL) {
2621 int error;
2622
2623 error = copyout(mtod(m, char *), sopt->sopt_val,
2624 m->m_len);
2625 if (error != 0) {
2626 m_freem(m0);
2627 return(error);
2628 }
2629 } else
2630 bcopy(mtod(m, char *), sopt->sopt_val, m->m_len);
2631 sopt->sopt_valsize -= m->m_len;
2632 sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
2633 valsize += m->m_len;
2634 m = m->m_next;
2635 }
2636 if (m != NULL) {
2637 /* the user-supplied sockopt buffer should have been large enough */
2638 m_freem(m0);
2639 return(EINVAL);
2640 }
2641 sopt->sopt_valsize = valsize;
2642 return (0);
2643}
2644
2645/*
2646 * sohasoutofband(): protocol notifies socket layer of the arrival of new
2647 * out-of-band data, which will then notify socket consumers.
2648 */
2649void
2650sohasoutofband(struct socket *so)
2651{
2652
2653 if (so->so_sigio != NULL)
2654 pgsigio(&so->so_sigio, SIGURG, 0);
2655 selwakeuppri(&so->so_rcv.sb_sel, PSOCK);
2656}
2657
2658int
2659sopoll(struct socket *so, int events, struct ucred *active_cred,
2660 struct thread *td)
2661{
2662
2663 return (so->so_proto->pr_usrreqs->pru_sopoll(so, events, active_cred,
2664 td));
2665}
2666
2667int
2668sopoll_generic(struct socket *so, int events, struct ucred *active_cred,
2669 struct thread *td)
2670{
2671 int revents = 0;
2672
2673 SOCKBUF_LOCK(&so->so_snd);
2674 SOCKBUF_LOCK(&so->so_rcv);
2675 if (events & (POLLIN | POLLRDNORM))
2676 if (soreadable(so))
2677 revents |= events & (POLLIN | POLLRDNORM);
2678
2679 if (events & POLLINIGNEOF)
2680 if (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat ||
2681 !TAILQ_EMPTY(&so->so_comp) || so->so_error)
2682 revents |= POLLINIGNEOF;
2683
2684 if (events & (POLLOUT | POLLWRNORM))
2685 if (sowriteable(so))
2686 revents |= events & (POLLOUT | POLLWRNORM);
2687
2688 if (events & (POLLPRI | POLLRDBAND))
2689 if (so->so_oobmark || (so->so_rcv.sb_state & SBS_RCVATMARK))
2690 revents |= events & (POLLPRI | POLLRDBAND);
2691
2692 if (revents == 0) {
2693 if (events &
2694 (POLLIN | POLLINIGNEOF | POLLPRI | POLLRDNORM |
2695 POLLRDBAND)) {
2696 selrecord(td, &so->so_rcv.sb_sel);
2697 so->so_rcv.sb_flags |= SB_SEL;
2698 }
2699
2700 if (events & (POLLOUT | POLLWRNORM)) {
2701 selrecord(td, &so->so_snd.sb_sel);
2702 so->so_snd.sb_flags |= SB_SEL;
2703 }
2704 }
2705
2706 SOCKBUF_UNLOCK(&so->so_rcv);
2707 SOCKBUF_UNLOCK(&so->so_snd);
2708 return (revents);
2709}
2710
2711int
2712soo_kqfilter(struct file *fp, struct knote *kn)
2713{
2714 struct socket *so = kn->kn_fp->f_data;
2715 struct sockbuf *sb;
2716
2717 switch (kn->kn_filter) {
2718 case EVFILT_READ:
2719 if (so->so_options & SO_ACCEPTCONN)
2720 kn->kn_fop = &solisten_filtops;
2721 else
2722 kn->kn_fop = &soread_filtops;
2723 sb = &so->so_rcv;
2724 break;
2725 case EVFILT_WRITE:
2726 kn->kn_fop = &sowrite_filtops;
2727 sb = &so->so_snd;
2728 break;
2729 default:
2730 return (EINVAL);
2731 }
2732
2733 SOCKBUF_LOCK(sb);
2734 knlist_add(&sb->sb_sel.si_note, kn, 1);
2735 sb->sb_flags |= SB_KNOTE;
2736 SOCKBUF_UNLOCK(sb);
2737 return (0);
2738}
2739
2740/*
2741 * Some routines that return EOPNOTSUPP for entry points that are not
2742 * supported by a protocol. Fill in as needed.
2743 */
2744int
2745pru_accept_notsupp(struct socket *so, struct sockaddr **nam)
2746{
2747
2748 return EOPNOTSUPP;
2749}
2750
2751int
2752pru_attach_notsupp(struct socket *so, int proto, struct thread *td)
2753{
2754
2755 return EOPNOTSUPP;
2756}
2757
2758int
2759pru_bind_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td)
2760{
2761
2762 return EOPNOTSUPP;
2763}
2764
2765int
2766pru_connect_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td)
2767{
2768
2769 return EOPNOTSUPP;
2770}
2771
2772int
2773pru_connect2_notsupp(struct socket *so1, struct socket *so2)
2774{
2775
2776 return EOPNOTSUPP;
2777}
2778
2779int
2780pru_control_notsupp(struct socket *so, u_long cmd, caddr_t data,
2781 struct ifnet *ifp, struct thread *td)
2782{
2783
2784 return EOPNOTSUPP;
2785}
2786
2787int
2788pru_disconnect_notsupp(struct socket *so)
2789{
2790
2791 return EOPNOTSUPP;
2792}
2793
2794int
2795pru_listen_notsupp(struct socket *so, int backlog, struct thread *td)
2796{
2797
2798 return EOPNOTSUPP;
2799}
2800
2801int
2802pru_peeraddr_notsupp(struct socket *so, struct sockaddr **nam)
2803{
2804
2805 return EOPNOTSUPP;
2806}
2807
2808int
2809pru_rcvd_notsupp(struct socket *so, int flags)
2810{
2811
2812 return EOPNOTSUPP;
2813}
2814
2815int
2816pru_rcvoob_notsupp(struct socket *so, struct mbuf *m, int flags)
2817{
2818
2819 return EOPNOTSUPP;
2820}
2821
2822int
2823pru_send_notsupp(struct socket *so, int flags, struct mbuf *m,
2824 struct sockaddr *addr, struct mbuf *control, struct thread *td)
2825{
2826
2827 return EOPNOTSUPP;
2828}
2829
2830/*
2831 * This isn't really a ``null'' operation, but it's the default one and
2832 * doesn't do anything destructive.
2833 */
2834int
2835pru_sense_null(struct socket *so, struct stat *sb)
2836{
2837
2838 sb->st_blksize = so->so_snd.sb_hiwat;
2839 return 0;
2840}
2841
2842int
2843pru_shutdown_notsupp(struct socket *so)
2844{
2845
2846 return EOPNOTSUPP;
2847}
2848
2849int
2850pru_sockaddr_notsupp(struct socket *so, struct sockaddr **nam)
2851{
2852
2853 return EOPNOTSUPP;
2854}
2855
2856int
2857pru_sosend_notsupp(struct socket *so, struct sockaddr *addr, struct uio *uio,
2858 struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
2859{
2860
2861 return EOPNOTSUPP;
2862}
2863
2864int
2865pru_soreceive_notsupp(struct socket *so, struct sockaddr **paddr,
2866 struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
2867{
2868
2869 return EOPNOTSUPP;
2870}
2871
2872int
2873pru_sopoll_notsupp(struct socket *so, int events, struct ucred *cred,
2874 struct thread *td)
2875{
2876
2877 return EOPNOTSUPP;
2878}
2879
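/*
 * Example (sketch): how a hypothetical protocol might fill the entry points
 * it does not implement with the *_notsupp stubs above, while reusing the
 * generic receive and poll routines.  Only a handful of fields are shown;
 * a real protocol supplies its own attach/detach/send handlers as well.
 */
static struct pr_usrreqs example_usrreqs = {
	.pru_accept =		pru_accept_notsupp,
	.pru_connect2 =		pru_connect2_notsupp,
	.pru_listen =		pru_listen_notsupp,
	.pru_rcvoob =		pru_rcvoob_notsupp,
	.pru_sense =		pru_sense_null,
	.pru_soreceive =	soreceive_generic,
	.pru_sopoll =		sopoll_generic,
};
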
2880static void
2881filt_sordetach(struct knote *kn)
2882{
2883 struct socket *so = kn->kn_fp->f_data;
2884
2885 SOCKBUF_LOCK(&so->so_rcv);
2886 knlist_remove(&so->so_rcv.sb_sel.si_note, kn, 1);
2887 if (knlist_empty(&so->so_rcv.sb_sel.si_note))
2888 so->so_rcv.sb_flags &= ~SB_KNOTE;
2889 SOCKBUF_UNLOCK(&so->so_rcv);
2890}
2891
2892/*ARGSUSED*/
2893static int
2894filt_soread(struct knote *kn, long hint)
2895{
2896 struct socket *so;
2897
2898 so = kn->kn_fp->f_data;
2899 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2900
2901 kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
2902 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
2903 kn->kn_flags |= EV_EOF;
2904 kn->kn_fflags = so->so_error;
2905 return (1);
2906 } else if (so->so_error) /* temporary udp error */
2907 return (1);
2908 else if (kn->kn_sfflags & NOTE_LOWAT)
2909 return (kn->kn_data >= kn->kn_sdata);
2910 else
2911 return (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat);
2912}
2913
2914static void
2915filt_sowdetach(struct knote *kn)
2916{
2917 struct socket *so = kn->kn_fp->f_data;
2918
2919 SOCKBUF_LOCK(&so->so_snd);
2920 knlist_remove(&so->so_snd.sb_sel.si_note, kn, 1);
2921 if (knlist_empty(&so->so_snd.sb_sel.si_note))
2922 so->so_snd.sb_flags &= ~SB_KNOTE;
2923 SOCKBUF_UNLOCK(&so->so_snd);
2924}
2925
2926/*ARGSUSED*/
2927static int
2928filt_sowrite(struct knote *kn, long hint)
2929{
2930 struct socket *so;
2931
2932 so = kn->kn_fp->f_data;
2933 SOCKBUF_LOCK_ASSERT(&so->so_snd);
2934 kn->kn_data = sbspace(&so->so_snd);
2935 if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
2936 kn->kn_flags |= EV_EOF;
2937 kn->kn_fflags = so->so_error;
2938 return (1);
2939 } else if (so->so_error) /* temporary udp error */
2940 return (1);
2941 else if (((so->so_state & SS_ISCONNECTED) == 0) &&
2942 (so->so_proto->pr_flags & PR_CONNREQUIRED))
2943 return (0);
2944 else if (kn->kn_sfflags & NOTE_LOWAT)
2945 return (kn->kn_data >= kn->kn_sdata);
2946 else
2947 return (kn->kn_data >= so->so_snd.sb_lowat);
2948}
2949
2950/*ARGSUSED*/
2951static int
2952filt_solisten(struct knote *kn, long hint)
2953{
2954 struct socket *so = kn->kn_fp->f_data;
2955
2956 kn->kn_data = so->so_qlen;
2957 return (! TAILQ_EMPTY(&so->so_comp));
2958}
2959
2960int
2961socheckuid(struct socket *so, uid_t uid)
2962{
2963
2964 if (so == NULL)
2965 return (EPERM);
2966 if (so->so_cred->cr_uid != uid)
2967 return (EPERM);
2968 return (0);
2969}
2970
2971static int
2972sysctl_somaxconn(SYSCTL_HANDLER_ARGS)
2973{
2974 int error;
2975 int val;
2976
2977 val = somaxconn;
2978 error = sysctl_handle_int(oidp, &val, 0, req);
2979 if (error || !req->newptr )
2980 return (error);
2981
2982 if (val < 1 || val > USHRT_MAX)
2983 return (EINVAL);
2984
2985 somaxconn = val;
2986 return (0);
2987}
2988
2989/*
2990 * These functions are used by protocols to notify the socket layer (and its
2991 * consumers) of state changes in the sockets driven by protocol-side events.
2992 */
2993
2994/*
2995 * Procedures to manipulate state flags of socket and do appropriate wakeups.
2996 *
2997 * Normal sequence from the active (originating) side is that
2998 * soisconnecting() is called during processing of connect() call, resulting
2999 * in an eventual call to soisconnected() if/when the connection is
3000 * established. When the connection is torn down soisdisconnecting() is
3001 * called during processing of disconnect() call, and soisdisconnected() is
3002 * called when the connection to the peer is totally severed. The semantics
3003 * of these routines are such that connectionless protocols can call
3004 * soisconnected() and soisdisconnected() only, bypassing the in-progress
3005 * calls when setting up a ``connection'' takes no time.
3006 *
3007 * From the passive side, a socket is created with two queues of sockets:
3008 * so_incomp for connections in progress and so_comp for connections already
3009 * made and awaiting user acceptance. As a protocol is preparing incoming
3010 * connections, it creates a socket structure queued on so_incomp by calling
3011 * sonewconn(). When the connection is established, soisconnected() is
3012 * called, and transfers the socket structure to so_comp, making it available
3013 * to accept().
3014 *
3015 * If a socket is closed with sockets on either so_incomp or so_comp, these
3016 * sockets are dropped.
3017 *
3018 * If higher-level protocols are implemented in the kernel, the wakeups done
3019 * here will sometimes cause software-interrupt process scheduling.
3020 */
3021void
3022soisconnecting(struct socket *so)
3023{
3024
3025 SOCK_LOCK(so);
3026 so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
3027 so->so_state |= SS_ISCONNECTING;
3028 SOCK_UNLOCK(so);
3029}
3030
3031void
3032soisconnected(struct socket *so)
3033{
3034 struct socket *head;
3035
3036 ACCEPT_LOCK();
3037 SOCK_LOCK(so);
3038 so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING);
3039 so->so_state |= SS_ISCONNECTED;
3040 head = so->so_head;
3041 if (head != NULL && (so->so_qstate & SQ_INCOMP)) {
3042 if ((so->so_options & SO_ACCEPTFILTER) == 0) {
3043 SOCK_UNLOCK(so);
3044 TAILQ_REMOVE(&head->so_incomp, so, so_list);
3045 head->so_incqlen--;
3046 so->so_qstate &= ~SQ_INCOMP;
3047 TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
3048 head->so_qlen++;
3049 so->so_qstate |= SQ_COMP;
3050 ACCEPT_UNLOCK();
3051 sorwakeup(head);
3052 wakeup_one(&head->so_timeo);
3053 } else {
3054 ACCEPT_UNLOCK();
3055 so->so_upcall =
3056 head->so_accf->so_accept_filter->accf_callback;
3057 so->so_upcallarg = head->so_accf->so_accept_filter_arg;
3058 so->so_rcv.sb_flags |= SB_UPCALL;
3059 so->so_options &= ~SO_ACCEPTFILTER;
3060 SOCK_UNLOCK(so);
3061 so->so_upcall(so, so->so_upcallarg, M_DONTWAIT);
3062 }
3063 return;
3064 }
3065 SOCK_UNLOCK(so);
3066 ACCEPT_UNLOCK();
3067 wakeup(&so->so_timeo);
3068 sorwakeup(so);
3069 sowwakeup(so);
3070}
3071
3072void
3073soisdisconnecting(struct socket *so)
3074{
3075
3076 /*
3077 * Note: This code assumes that SOCK_LOCK(so) and
3078 * SOCKBUF_LOCK(&so->so_rcv) are the same.
3079 */
3080 SOCKBUF_LOCK(&so->so_rcv);
3081 so->so_state &= ~SS_ISCONNECTING;
3082 so->so_state |= SS_ISDISCONNECTING;
3083 so->so_rcv.sb_state |= SBS_CANTRCVMORE;
3084 sorwakeup_locked(so);
3085 SOCKBUF_LOCK(&so->so_snd);
3086 so->so_snd.sb_state |= SBS_CANTSENDMORE;
3087 sowwakeup_locked(so);
3088 wakeup(&so->so_timeo);
3089}
3090
3091void
3092soisdisconnected(struct socket *so)
3093{
3094
3095 /*
3096 * Note: This code assumes that SOCK_LOCK(so) and
3097 * SOCKBUF_LOCK(&so->so_rcv) are the same.
3098 */
3099 SOCKBUF_LOCK(&so->so_rcv);
3100 so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
3101 so->so_state |= SS_ISDISCONNECTED;
3102 so->so_rcv.sb_state |= SBS_CANTRCVMORE;
3103 sorwakeup_locked(so);
3104 SOCKBUF_LOCK(&so->so_snd);
3105 so->so_snd.sb_state |= SBS_CANTSENDMORE;
3106 sbdrop_locked(&so->so_snd, so->so_snd.sb_cc);
3107 sowwakeup_locked(so);
3108 wakeup(&so->so_timeo);
3109}
3110
3111/*
3112 * Make a copy of a sockaddr in a malloced buffer of type M_SONAME.
3113 */
3114struct sockaddr *
3115sodupsockaddr(const struct sockaddr *sa, int mflags)
3116{
3117 struct sockaddr *sa2;
3118
3119 sa2 = malloc(sa->sa_len, M_SONAME, mflags);
3120 if (sa2)
3121 bcopy(sa, sa2, sa->sa_len);
3122 return sa2;
3123}
3124
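/*
 * Example (sketch): duplicating a caller-provided sockaddr without sleeping
 * and handling allocation failure; 'sa' is any sockaddr with a valid sa_len,
 * and the helper name is illustrative.
 */
static int
example_dup_sockaddr(const struct sockaddr *sa, struct sockaddr **out)
{
	*out = sodupsockaddr(sa, M_NOWAIT);
	if (*out == NULL)
		return (ENOMEM);
	return (0);
}
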
3125/*
3126 * Create an external-format (``xsocket'') structure using the information in
3127 * the kernel-format socket structure pointed to by so. This is done to
3128 * reduce the spew of irrelevant information over this interface, to isolate
3129 * user code from changes in the kernel structure, and potentially to provide
3130 * information-hiding if we decide that some of this information should be
3131 * hidden from users.
3132 */
3133void
3134sotoxsocket(struct socket *so, struct xsocket *xso)
3135{
3136
3137 xso->xso_len = sizeof *xso;
3138 xso->xso_so = so;
3139 xso->so_type = so->so_type;
3140 xso->so_options = so->so_options;
3141 xso->so_linger = so->so_linger;
3142 xso->so_state = so->so_state;
3143 xso->so_pcb = so->so_pcb;
3144 xso->xso_protocol = so->so_proto->pr_protocol;
3145 xso->xso_family = so->so_proto->pr_domain->dom_family;
3146 xso->so_qlen = so->so_qlen;
3147 xso->so_incqlen = so->so_incqlen;
3148 xso->so_qlimit = so->so_qlimit;
3149 xso->so_timeo = so->so_timeo;
3150 xso->so_error = so->so_error;
3151 xso->so_pgid = so->so_sigio ? so->so_sigio->sio_pgid : 0;
3152 xso->so_oobmark = so->so_oobmark;
3153 sbtoxsockbuf(&so->so_snd, &xso->so_snd);
3154 sbtoxsockbuf(&so->so_rcv, &xso->so_rcv);
3155 xso->so_uid = so->so_cred->cr_uid;
3156}
3157
3158
3159/*
3160 * Socket accessor functions to provide external consumers with
3161 * a safe interface to socket state
3162 *
3163 */
3164
3165void
3166so_listeners_apply_all(struct socket *so, void (*func)(struct socket *, void *), void *arg)
3167{
3168
3169 TAILQ_FOREACH(so, &so->so_comp, so_list)
3170 func(so, arg);
3171}
3172
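/*
 * Example (sketch): a callback suitable for so_listeners_apply_all(),
 * counting the sockets on a listen socket's completed connection queue.
 * The names are illustrative; usage would be:
 *
 *	int n = 0;
 *	so_listeners_apply_all(head, example_count_listener, &n);
 */
static void
example_count_listener(struct socket *so, void *arg)
{
	int *countp = arg;

	(*countp)++;
}
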
3173struct sockbuf *
3174so_sockbuf_rcv(struct socket *so)
3175{
3176
3177 return (&so->so_rcv);
3178}
3179
3180struct sockbuf *
3181so_sockbuf_snd(struct socket *so)
3182{
3183
3184 return (&so->so_snd);
3185}
3186
3187int
3188so_state_get(const struct socket *so)
3189{
3190
3191 return (so->so_state);
3192}
3193
3194void
3195so_state_set(struct socket *so, int val)
3196{
3197
3198 so->so_state = val;
3199}
3200
3201int
3202so_options_get(const struct socket *so)
3203{
3204
3205 return (so->so_options);
3206}
3207
3208void
3209so_options_set(struct socket *so, int val)
3210{
3211
3212 so->so_options = val;
3213}
3214
3215int
3216so_error_get(const struct socket *so)
3217{
3218
3219 return (so->so_error);
3220}
3221
3222void
3223so_error_set(struct socket *so, int val)
3224{
3225
3226 so->so_error = val;
3227}
3228
3229int
3230so_linger_get(const struct socket *so)
3231{
3232
3233 return (so->so_linger);
3234}
3235
3236void
3237so_linger_set(struct socket *so, int val)
3238{
3239
3240 so->so_linger = val;
3241}
3242
3243struct protosw *
3244so_protosw_get(const struct socket *so)
3245{
3246
3247 return (so->so_proto);
3248}
3249
3250void
3251so_protosw_set(struct socket *so, struct protosw *val)
3252{
3253
3254 so->so_proto = val;
3255}
3256
3257void
3258so_sorwakeup(struct socket *so)
3259{
3260
3261 sorwakeup(so);
3262}
3263
3264void
3265so_sowwakeup(struct socket *so)
3266{
3267
3268 sowwakeup(so);
3269}
3270
3271void
3272so_sorwakeup_locked(struct socket *so)
3273{
3274
3275 sorwakeup_locked(so);
3276}
3277
3278void
3279so_sowwakeup_locked(struct socket *so)
3280{
3281
3282 sowwakeup_locked(so);
3283}
3284
3285void
3286so_lock(struct socket *so)
3287{
3288 SOCK_LOCK(so);
3289}
3290
3291void
3292so_unlock(struct socket *so)
3293{
3294 SOCK_UNLOCK(so);
3295}
241 */
242static void
243init_maxsockets(void *ignored)
244{
245
246 TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets);
247 maxsockets = imax(maxsockets, imax(maxfiles, nmbclusters));
248}
249SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_maxsockets, NULL);
250
251/*
252 * Socket operation routines. These routines are called by the routines in
253 * sys_socket.c or from a system process, and implement the semantics of
254 * socket operations by switching out to the protocol specific routines.
255 */
256
257/*
258 * Get a socket structure from our zone, and initialize it. Note that it
259 * would probably be better to allocate socket and PCB at the same time, but
260 * I'm not convinced that all the protocols can be easily modified to do
261 * this.
262 *
263 * soalloc() returns a socket with a ref count of 0.
264 */
265static struct socket *
266soalloc(void)
267{
268 struct socket *so;
269
270 so = uma_zalloc(socket_zone, M_NOWAIT | M_ZERO);
271 if (so == NULL)
272 return (NULL);
273#ifdef MAC
274 if (mac_socket_init(so, M_NOWAIT) != 0) {
275 uma_zfree(socket_zone, so);
276 return (NULL);
277 }
278#endif
279 SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd");
280 SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv");
281 sx_init(&so->so_snd.sb_sx, "so_snd_sx");
282 sx_init(&so->so_rcv.sb_sx, "so_rcv_sx");
283 TAILQ_INIT(&so->so_aiojobq);
284 mtx_lock(&so_global_mtx);
285 so->so_gencnt = ++so_gencnt;
286 ++numopensockets;
287 mtx_unlock(&so_global_mtx);
288 return (so);
289}
290
291/*
292 * Free the storage associated with a socket at the socket layer, tear down
293 * locks, labels, etc. All protocol state is assumed already to have been
294 * torn down (and possibly never set up) by the caller.
295 */
296static void
297sodealloc(struct socket *so)
298{
299
300 KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count));
301 KASSERT(so->so_pcb == NULL, ("sodealloc(): so_pcb != NULL"));
302
303 mtx_lock(&so_global_mtx);
304 so->so_gencnt = ++so_gencnt;
305 --numopensockets; /* Could be below, but faster here. */
306 mtx_unlock(&so_global_mtx);
307 if (so->so_rcv.sb_hiwat)
308 (void)chgsbsize(so->so_cred->cr_uidinfo,
309 &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY);
310 if (so->so_snd.sb_hiwat)
311 (void)chgsbsize(so->so_cred->cr_uidinfo,
312 &so->so_snd.sb_hiwat, 0, RLIM_INFINITY);
313#ifdef INET
314 /* remove accept filter if one is present. */
315 if (so->so_accf != NULL)
316 do_setopt_accept_filter(so, NULL);
317#endif
318#ifdef MAC
319 mac_socket_destroy(so);
320#endif
321 crfree(so->so_cred);
322 sx_destroy(&so->so_snd.sb_sx);
323 sx_destroy(&so->so_rcv.sb_sx);
324 SOCKBUF_LOCK_DESTROY(&so->so_snd);
325 SOCKBUF_LOCK_DESTROY(&so->so_rcv);
326 uma_zfree(socket_zone, so);
327}
328
329/*
330 * socreate returns a socket with a ref count of 1. The socket should be
331 * closed with soclose().
332 */
333int
334socreate(int dom, struct socket **aso, int type, int proto,
335 struct ucred *cred, struct thread *td)
336{
337 struct protosw *prp;
338 struct socket *so;
339 int error;
340
341 if (proto)
342 prp = pffindproto(dom, proto, type);
343 else
344 prp = pffindtype(dom, type);
345
346 if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL ||
347 prp->pr_usrreqs->pru_attach == pru_attach_notsupp)
348 return (EPROTONOSUPPORT);
349
350 if (jailed(cred) && jail_socket_unixiproute_only &&
351 prp->pr_domain->dom_family != PF_LOCAL &&
352 prp->pr_domain->dom_family != PF_INET &&
353#ifdef INET6
354 prp->pr_domain->dom_family != PF_INET6 &&
355#endif
356 prp->pr_domain->dom_family != PF_ROUTE) {
357 return (EPROTONOSUPPORT);
358 }
359
360 if (prp->pr_type != type)
361 return (EPROTOTYPE);
362 so = soalloc();
363 if (so == NULL)
364 return (ENOBUFS);
365
366 TAILQ_INIT(&so->so_incomp);
367 TAILQ_INIT(&so->so_comp);
368 so->so_type = type;
369 so->so_cred = crhold(cred);
370 if ((prp->pr_domain->dom_family == PF_INET) ||
371 (prp->pr_domain->dom_family == PF_ROUTE))
372 so->so_fibnum = td->td_proc->p_fibnum;
373 else
374 so->so_fibnum = 0;
375 so->so_proto = prp;
376#ifdef MAC
377 mac_socket_create(cred, so);
378#endif
379 knlist_init(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv),
380 NULL, NULL, NULL);
381 knlist_init(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd),
382 NULL, NULL, NULL);
383 so->so_count = 1;
384 /*
385 * Auto-sizing of socket buffers is managed by the protocols and
386 * the appropriate flags must be set in the pru_attach function.
387 */
388 error = (*prp->pr_usrreqs->pru_attach)(so, proto, td);
389 if (error) {
390 KASSERT(so->so_count == 1, ("socreate: so_count %d",
391 so->so_count));
392 so->so_count = 0;
393 sodealloc(so);
394 return (error);
395 }
396 *aso = so;
397 return (0);
398}
399
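/*
 * Example (sketch): creating a kernel-owned TCP socket with socreate().
 * The helper name is illustrative; 'td' is the calling thread, and the
 * caller is responsible for eventually calling soclose() on the result.
 */
static int
example_create_tcp_socket(struct thread *td, struct socket **sop)
{
	int error;

	error = socreate(AF_INET, sop, SOCK_STREAM, IPPROTO_TCP,
	    td->td_ucred, td);
	if (error)
		return (error);
	/* ... use the socket; when finished: soclose(*sop); ... */
	return (0);
}
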
400#ifdef REGRESSION
401static int regression_sonewconn_earlytest = 1;
402SYSCTL_INT(_regression, OID_AUTO, sonewconn_earlytest, CTLFLAG_RW,
403 &regression_sonewconn_earlytest, 0, "Perform early sonewconn limit test");
404#endif
405
406/*
407 * When an attempt at a new connection is noted on a socket which accepts
408 * connections, sonewconn is called. If the connection is possible (subject
409 * to space constraints, etc.), then we allocate a new structure, properly
410 * linked into the data structure of the original socket, and return this.
411 * Connstatus may be 0, SS_ISCONFIRMING, or SS_ISCONNECTED.
412 *
413 * Note: the ref count on the socket is 0 on return.
414 */
415struct socket *
416sonewconn(struct socket *head, int connstatus)
417{
418 struct socket *so;
419 int over;
420
421 ACCEPT_LOCK();
422 over = (head->so_qlen > 3 * head->so_qlimit / 2);
423 ACCEPT_UNLOCK();
424#ifdef REGRESSION
425 if (regression_sonewconn_earlytest && over)
426#else
427 if (over)
428#endif
429 return (NULL);
430 so = soalloc();
431 if (so == NULL)
432 return (NULL);
433 if ((head->so_options & SO_ACCEPTFILTER) != 0)
434 connstatus = 0;
435 so->so_head = head;
436 so->so_type = head->so_type;
437 so->so_options = head->so_options &~ SO_ACCEPTCONN;
438 so->so_linger = head->so_linger;
439 so->so_state = head->so_state | SS_NOFDREF;
440 so->so_proto = head->so_proto;
441 so->so_cred = crhold(head->so_cred);
442#ifdef MAC
443 SOCK_LOCK(head);
444 mac_socket_newconn(head, so);
445 SOCK_UNLOCK(head);
446#endif
447 knlist_init(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv),
448 NULL, NULL, NULL);
449 knlist_init(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd),
450 NULL, NULL, NULL);
451 if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat) ||
452 (*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) {
453 sodealloc(so);
454 return (NULL);
455 }
456 so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
457 so->so_snd.sb_lowat = head->so_snd.sb_lowat;
458 so->so_rcv.sb_timeo = head->so_rcv.sb_timeo;
459 so->so_snd.sb_timeo = head->so_snd.sb_timeo;
460 so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE;
461 so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE;
462 so->so_state |= connstatus;
463 ACCEPT_LOCK();
464 if (connstatus) {
465 TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
466 so->so_qstate |= SQ_COMP;
467 head->so_qlen++;
468 } else {
469 /*
470 * Keep removing sockets from the head until there's room for
471 * us to insert on the tail. In pre-locking revisions, this
472 * was a simple if(), but as we could be racing with other
473 * threads and soabort() requires dropping locks, we must
474 * loop waiting for the condition to be true.
475 */
476 while (head->so_incqlen > head->so_qlimit) {
477 struct socket *sp;
478 sp = TAILQ_FIRST(&head->so_incomp);
479 TAILQ_REMOVE(&head->so_incomp, sp, so_list);
480 head->so_incqlen--;
481 sp->so_qstate &= ~SQ_INCOMP;
482 sp->so_head = NULL;
483 ACCEPT_UNLOCK();
484 soabort(sp);
485 ACCEPT_LOCK();
486 }
487 TAILQ_INSERT_TAIL(&head->so_incomp, so, so_list);
488 so->so_qstate |= SQ_INCOMP;
489 head->so_incqlen++;
490 }
491 ACCEPT_UNLOCK();
492 if (connstatus) {
493 sorwakeup(head);
494 wakeup_one(&head->so_timeo);
495 }
496 return (so);
497}
498
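/*
 * Example (sketch): how a protocol's input path might use sonewconn() on a
 * listening socket 'head' when a new connection request arrives.  Protocol
 * state setup is omitted and the function name is illustrative.
 */
static void
example_protocol_input(struct socket *head)
{
	struct socket *so;

	so = sonewconn(head, 0);
	if (so == NULL)
		return;		/* listen queue full; drop the request */
	/* ... set up protocol state for 'so'; once established: */
	soisconnected(so);
}
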
499int
500sobind(struct socket *so, struct sockaddr *nam, struct thread *td)
501{
502
503 return ((*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td));
504}
505
506/*
507 * solisten() transitions a socket from a non-listening state to a listening
508 * state, but can also be used to update the listen queue depth on an
509 * existing listen socket. The protocol will call back into the sockets
510 * layer using solisten_proto_check() and solisten_proto() to check and set
511 * socket-layer listen state. Call backs are used so that the protocol can
512 * acquire both protocol and socket layer locks in whatever order is required
513 * by the protocol.
514 *
515 * Protocol implementors are advised to hold the socket lock across the
516 * socket-layer test and set to avoid races at the socket layer.
517 */
518int
519solisten(struct socket *so, int backlog, struct thread *td)
520{
521
522 return ((*so->so_proto->pr_usrreqs->pru_listen)(so, backlog, td));
523}
524
525int
526solisten_proto_check(struct socket *so)
527{
528
529 SOCK_LOCK_ASSERT(so);
530
531 if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING |
532 SS_ISDISCONNECTING))
533 return (EINVAL);
534 return (0);
535}
536
537void
538solisten_proto(struct socket *so, int backlog)
539{
540
541 SOCK_LOCK_ASSERT(so);
542
543 if (backlog < 0 || backlog > somaxconn)
544 backlog = somaxconn;
545 so->so_qlimit = backlog;
546 so->so_options |= SO_ACCEPTCONN;
547}
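
/*
 * Minimal sketch of the callback pattern described above, as a hypothetical
 * protocol's pru_listen implementation might use it.  The identifiers
 * xp_listen, struct xpcb, sotoxpcb(), XPCB_LOCK() and XPCB_UNLOCK() are
 * invented for illustration; only solisten_proto_check(), solisten_proto()
 * and the socket lock macros come from this file.  The protocol takes its
 * own lock first and holds the socket lock across the check and the set, as
 * the comment above recommends.
 */
static int
xp_listen(struct socket *so, int backlog, struct thread *td)
{
	struct xpcb *xp = sotoxpcb(so);		/* hypothetical protocol state */
	int error;

	XPCB_LOCK(xp);				/* protocol lock first ... */
	SOCK_LOCK(so);				/* ... then the socket lock */
	error = solisten_proto_check(so);
	if (error == 0) {
		/* Protocol-specific preconditions could be checked here. */
		solisten_proto(so, backlog);
	}
	SOCK_UNLOCK(so);
	XPCB_UNLOCK(xp);
	return (error);
}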
548
549/*
550 * Attempt to free a socket. This should really be sotryfree().
551 *
552 * sofree() will succeed if:
553 *
554 * - There are no outstanding file descriptor references or related consumers
555 * (so_count == 0).
556 *
557 * - The socket has been closed by user space, if ever open (SS_NOFDREF).
558 *
559 * - The protocol does not have an outstanding strong reference on the socket
560 * (SS_PROTOREF).
561 *
562 * - The socket is not in a completed connection queue, where a process has
563 *   been notified that it is present.  If it were removed, the user process
564 *   might block in accept() despite select() saying the socket was ready.
565 *
566 * Otherwise, it will quietly abort so that a future call to sofree(), when
567 * conditions are right, can succeed.
568 */
569void
570sofree(struct socket *so)
571{
572 struct protosw *pr = so->so_proto;
573 struct socket *head;
574
575 ACCEPT_LOCK_ASSERT();
576 SOCK_LOCK_ASSERT(so);
577
578 if ((so->so_state & SS_NOFDREF) == 0 || so->so_count != 0 ||
579 (so->so_state & SS_PROTOREF) || (so->so_qstate & SQ_COMP)) {
580 SOCK_UNLOCK(so);
581 ACCEPT_UNLOCK();
582 return;
583 }
584
585 head = so->so_head;
586 if (head != NULL) {
587 KASSERT((so->so_qstate & SQ_COMP) != 0 ||
588 (so->so_qstate & SQ_INCOMP) != 0,
589 ("sofree: so_head != NULL, but neither SQ_COMP nor "
590 "SQ_INCOMP"));
591 KASSERT((so->so_qstate & SQ_COMP) == 0 ||
592 (so->so_qstate & SQ_INCOMP) == 0,
593 ("sofree: so->so_qstate is SQ_COMP and also SQ_INCOMP"));
594 TAILQ_REMOVE(&head->so_incomp, so, so_list);
595 head->so_incqlen--;
596 so->so_qstate &= ~SQ_INCOMP;
597 so->so_head = NULL;
598 }
599 KASSERT((so->so_qstate & SQ_COMP) == 0 &&
600 (so->so_qstate & SQ_INCOMP) == 0,
601 ("sofree: so_head == NULL, but still SQ_COMP(%d) or SQ_INCOMP(%d)",
602 so->so_qstate & SQ_COMP, so->so_qstate & SQ_INCOMP));
603 if (so->so_options & SO_ACCEPTCONN) {
604 KASSERT((TAILQ_EMPTY(&so->so_comp)), ("sofree: so_comp populated"));
605		KASSERT((TAILQ_EMPTY(&so->so_incomp)), ("sofree: so_incomp populated"));
606 }
607 SOCK_UNLOCK(so);
608 ACCEPT_UNLOCK();
609
610 if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
611 (*pr->pr_domain->dom_dispose)(so->so_rcv.sb_mb);
612 if (pr->pr_usrreqs->pru_detach != NULL)
613 (*pr->pr_usrreqs->pru_detach)(so);
614
615 /*
616 * From this point on, we assume that no other references to this
617 * socket exist anywhere else in the stack. Therefore, no locks need
618 * to be acquired or held.
619 *
620 * We used to do a lot of socket buffer and socket locking here, as
621 * well as invoke sorflush() and perform wakeups. The direct call to
622 * dom_dispose() and sbrelease_internal() are an inlining of what was
623 * necessary from sorflush().
624 *
625 * Notice that the socket buffer and kqueue state are torn down
626	 * before calling pru_detach.  This means that protocols should not
627 * assume they can perform socket wakeups, etc, in their detach code.
628 */
629 sbdestroy(&so->so_snd, so);
630 sbdestroy(&so->so_rcv, so);
631 knlist_destroy(&so->so_rcv.sb_sel.si_note);
632 knlist_destroy(&so->so_snd.sb_sel.si_note);
633 sodealloc(so);
634}
635
636/*
637 * Close a socket on last file table reference removal. Initiate disconnect
638 * if connected. Free socket when disconnect complete.
639 *
640 * This function will sorele() the socket. Note that soclose() may be called
641 * prior to the ref count reaching zero. The actual socket structure will
642 * not be freed until the ref count reaches zero.
643 */
644int
645soclose(struct socket *so)
646{
647 int error = 0;
648
649 KASSERT(!(so->so_state & SS_NOFDREF), ("soclose: SS_NOFDREF on enter"));
650
651 funsetown(&so->so_sigio);
652 if (so->so_state & SS_ISCONNECTED) {
653 if ((so->so_state & SS_ISDISCONNECTING) == 0) {
654 error = sodisconnect(so);
655 if (error)
656 goto drop;
657 }
658 if (so->so_options & SO_LINGER) {
659 if ((so->so_state & SS_ISDISCONNECTING) &&
660 (so->so_state & SS_NBIO))
661 goto drop;
662 while (so->so_state & SS_ISCONNECTED) {
663 error = tsleep(&so->so_timeo,
664 PSOCK | PCATCH, "soclos", so->so_linger * hz);
665 if (error)
666 break;
667 }
668 }
669 }
670
671drop:
672 if (so->so_proto->pr_usrreqs->pru_close != NULL)
673 (*so->so_proto->pr_usrreqs->pru_close)(so);
674 if (so->so_options & SO_ACCEPTCONN) {
675 struct socket *sp;
676 ACCEPT_LOCK();
677 while ((sp = TAILQ_FIRST(&so->so_incomp)) != NULL) {
678 TAILQ_REMOVE(&so->so_incomp, sp, so_list);
679 so->so_incqlen--;
680 sp->so_qstate &= ~SQ_INCOMP;
681 sp->so_head = NULL;
682 ACCEPT_UNLOCK();
683 soabort(sp);
684 ACCEPT_LOCK();
685 }
686 while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
687 TAILQ_REMOVE(&so->so_comp, sp, so_list);
688 so->so_qlen--;
689 sp->so_qstate &= ~SQ_COMP;
690 sp->so_head = NULL;
691 ACCEPT_UNLOCK();
692 soabort(sp);
693 ACCEPT_LOCK();
694 }
695 ACCEPT_UNLOCK();
696 }
697 ACCEPT_LOCK();
698 SOCK_LOCK(so);
699 KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF"));
700 so->so_state |= SS_NOFDREF;
701 sorele(so);
702 return (error);
703}
704
705/*
706 * soabort() is used to abruptly tear down a connection, such as when a
707 * resource limit is reached (listen queue depth exceeded), or if a listen
708 * socket is closed while there are sockets waiting to be accepted.
709 *
710 * This interface is tricky, because it is called on an unreferenced socket,
711 * and must be called only by a thread that has actually removed the socket
712 * from the listen queue it was on, or races with other threads are risked.
713 *
714 * This interface will call into the protocol code, so must not be called
715 * with any socket locks held. Protocols do call it while holding their own
716 * recursible protocol mutexes, but this is something that should be subject
717 * to review in the future.
718 */
719void
720soabort(struct socket *so)
721{
722
723 /*
724	 * As far as possible, assert that no references to this
725 * socket are held. This is not quite the same as asserting that the
726 * current thread is responsible for arranging for no references, but
727 * is as close as we can get for now.
728 */
729 KASSERT(so->so_count == 0, ("soabort: so_count"));
730 KASSERT((so->so_state & SS_PROTOREF) == 0, ("soabort: SS_PROTOREF"));
731 KASSERT(so->so_state & SS_NOFDREF, ("soabort: !SS_NOFDREF"));
732	KASSERT((so->so_qstate & SQ_COMP) == 0, ("soabort: SQ_COMP"));
733	KASSERT((so->so_qstate & SQ_INCOMP) == 0, ("soabort: SQ_INCOMP"));
734
735 if (so->so_proto->pr_usrreqs->pru_abort != NULL)
736 (*so->so_proto->pr_usrreqs->pru_abort)(so);
737 ACCEPT_LOCK();
738 SOCK_LOCK(so);
739 sofree(so);
740}
741
742int
743soaccept(struct socket *so, struct sockaddr **nam)
744{
745 int error;
746
747 SOCK_LOCK(so);
748 KASSERT((so->so_state & SS_NOFDREF) != 0, ("soaccept: !NOFDREF"));
749 so->so_state &= ~SS_NOFDREF;
750 SOCK_UNLOCK(so);
751 error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
752 return (error);
753}
754
755int
756soconnect(struct socket *so, struct sockaddr *nam, struct thread *td)
757{
758 int error;
759
760 if (so->so_options & SO_ACCEPTCONN)
761 return (EOPNOTSUPP);
762 /*
763 * If protocol is connection-based, can only connect once.
764 * Otherwise, if connected, try to disconnect first. This allows
765 * user to disconnect by connecting to, e.g., a null address.
766 */
767 if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
768 ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
769 (error = sodisconnect(so)))) {
770 error = EISCONN;
771 } else {
772 /*
773 * Prevent accumulated error from previous connection from
774 * biting us.
775 */
776 so->so_error = 0;
777 error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, td);
778 }
779
780 return (error);
781}
782
783int
784soconnect2(struct socket *so1, struct socket *so2)
785{
786
787 return ((*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2));
788}
789
790int
791sodisconnect(struct socket *so)
792{
793 int error;
794
795 if ((so->so_state & SS_ISCONNECTED) == 0)
796 return (ENOTCONN);
797 if (so->so_state & SS_ISDISCONNECTING)
798 return (EALREADY);
799 error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
800 return (error);
801}
802
803#ifdef ZERO_COPY_SOCKETS
804struct so_zerocopy_stats{
805 int size_ok;
806 int align_ok;
807 int found_ifp;
808};
809struct so_zerocopy_stats so_zerocp_stats = {0,0,0};
810#include <netinet/in.h>
811#include <net/route.h>
812#include <netinet/in_pcb.h>
813#include <vm/vm.h>
814#include <vm/vm_page.h>
815#include <vm/vm_object.h>
816
817/*
818 * sosend_copyin() is only used if zero copy sockets are enabled. Otherwise
819 * sosend_dgram() and sosend_generic() use m_uiotombuf().
820 *
821 * sosend_copyin() accepts a uio and prepares an mbuf chain holding part or
822 * all of the data referenced by the uio. If desired, it uses zero-copy.
823 * *space will be updated to reflect data copied in.
824 *
825 * NB: If atomic I/O is requested, the caller must already have checked that
826 * space can hold resid bytes.
827 *
828 * NB: In the event of an error, the caller may need to free the partial
829 * chain pointed to by *retmp.  The contents of both *uio and *space may be
830 * modified even in the case of an error.
831 */
832static int
833sosend_copyin(struct uio *uio, struct mbuf **retmp, int atomic, long *space,
834 int flags)
835{
836 struct mbuf *m, **mp, *top;
837 long len, resid;
838 int error;
839#ifdef ZERO_COPY_SOCKETS
840 int cow_send;
841#endif
842
843 *retmp = top = NULL;
844 mp = &top;
845 len = 0;
846 resid = uio->uio_resid;
847 error = 0;
848 do {
849#ifdef ZERO_COPY_SOCKETS
850 cow_send = 0;
851#endif /* ZERO_COPY_SOCKETS */
852 if (resid >= MINCLSIZE) {
853#ifdef ZERO_COPY_SOCKETS
854 if (top == NULL) {
855 m = m_gethdr(M_WAITOK, MT_DATA);
856 m->m_pkthdr.len = 0;
857 m->m_pkthdr.rcvif = NULL;
858 } else
859 m = m_get(M_WAITOK, MT_DATA);
860 if (so_zero_copy_send &&
861 resid>=PAGE_SIZE &&
862 *space>=PAGE_SIZE &&
863 uio->uio_iov->iov_len>=PAGE_SIZE) {
864 so_zerocp_stats.size_ok++;
865 so_zerocp_stats.align_ok++;
866 cow_send = socow_setup(m, uio);
867 len = cow_send;
868 }
869 if (!cow_send) {
870 m_clget(m, M_WAITOK);
871 len = min(min(MCLBYTES, resid), *space);
872 }
873#else /* ZERO_COPY_SOCKETS */
874 if (top == NULL) {
875 m = m_getcl(M_WAIT, MT_DATA, M_PKTHDR);
876 m->m_pkthdr.len = 0;
877 m->m_pkthdr.rcvif = NULL;
878 } else
879 m = m_getcl(M_WAIT, MT_DATA, 0);
880 len = min(min(MCLBYTES, resid), *space);
881#endif /* ZERO_COPY_SOCKETS */
882 } else {
883 if (top == NULL) {
884 m = m_gethdr(M_WAIT, MT_DATA);
885 m->m_pkthdr.len = 0;
886 m->m_pkthdr.rcvif = NULL;
887
888 len = min(min(MHLEN, resid), *space);
889 /*
890 * For datagram protocols, leave room
891 * for protocol headers in first mbuf.
892 */
893 if (atomic && m && len < MHLEN)
894 MH_ALIGN(m, len);
895 } else {
896 m = m_get(M_WAIT, MT_DATA);
897 len = min(min(MLEN, resid), *space);
898 }
899 }
900 if (m == NULL) {
901 error = ENOBUFS;
902 goto out;
903 }
904
905 *space -= len;
906#ifdef ZERO_COPY_SOCKETS
907 if (cow_send)
908 error = 0;
909 else
910#endif /* ZERO_COPY_SOCKETS */
911 error = uiomove(mtod(m, void *), (int)len, uio);
912 resid = uio->uio_resid;
913 m->m_len = len;
914 *mp = m;
915 top->m_pkthdr.len += len;
916 if (error)
917 goto out;
918 mp = &m->m_next;
919 if (resid <= 0) {
920 if (flags & MSG_EOR)
921 top->m_flags |= M_EOR;
922 break;
923 }
924 } while (*space > 0 && atomic);
925out:
926 *retmp = top;
927 return (error);
928}
929#endif /*ZERO_COPY_SOCKETS*/
930
931#define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
932
933int
934sosend_dgram(struct socket *so, struct sockaddr *addr, struct uio *uio,
935 struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
936{
937 long space, resid;
938 int clen = 0, error, dontroute;
939#ifdef ZERO_COPY_SOCKETS
940 int atomic = sosendallatonce(so) || top;
941#endif
942
943	KASSERT(so->so_type == SOCK_DGRAM, ("sosend_dgram: !SOCK_DGRAM"));
944 KASSERT(so->so_proto->pr_flags & PR_ATOMIC,
945	    ("sosend_dgram: !PR_ATOMIC"));
946
947 if (uio != NULL)
948 resid = uio->uio_resid;
949 else
950 resid = top->m_pkthdr.len;
951 /*
952 * In theory resid should be unsigned. However, space must be
953 * signed, as it might be less than 0 if we over-committed, and we
954 * must use a signed comparison of space and resid. On the other
955 * hand, a negative resid causes us to loop sending 0-length
956 * segments to the protocol.
957 *
958 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
959 * type sockets since that's an error.
960 */
961 if (resid < 0) {
962 error = EINVAL;
963 goto out;
964 }
965
966 dontroute =
967 (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0;
968 if (td != NULL)
969 td->td_ru.ru_msgsnd++;
970 if (control != NULL)
971 clen = control->m_len;
972
973 SOCKBUF_LOCK(&so->so_snd);
974 if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
975 SOCKBUF_UNLOCK(&so->so_snd);
976 error = EPIPE;
977 goto out;
978 }
979 if (so->so_error) {
980 error = so->so_error;
981 so->so_error = 0;
982 SOCKBUF_UNLOCK(&so->so_snd);
983 goto out;
984 }
985 if ((so->so_state & SS_ISCONNECTED) == 0) {
986 /*
987		 * `sendto' and `sendmsg' are allowed on a connection-based
988 * socket if it supports implied connect. Return ENOTCONN if
989 * not connected and no address is supplied.
990 */
991 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
992 (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
993 if ((so->so_state & SS_ISCONFIRMING) == 0 &&
994 !(resid == 0 && clen != 0)) {
995 SOCKBUF_UNLOCK(&so->so_snd);
996 error = ENOTCONN;
997 goto out;
998 }
999 } else if (addr == NULL) {
1000 if (so->so_proto->pr_flags & PR_CONNREQUIRED)
1001 error = ENOTCONN;
1002 else
1003 error = EDESTADDRREQ;
1004 SOCKBUF_UNLOCK(&so->so_snd);
1005 goto out;
1006 }
1007 }
1008
1009 /*
1010 * Do we need MSG_OOB support in SOCK_DGRAM? Signs here may be a
1011 * problem and need fixing.
1012 */
1013 space = sbspace(&so->so_snd);
1014 if (flags & MSG_OOB)
1015 space += 1024;
1016 space -= clen;
1017 SOCKBUF_UNLOCK(&so->so_snd);
1018 if (resid > space) {
1019 error = EMSGSIZE;
1020 goto out;
1021 }
1022 if (uio == NULL) {
1023 resid = 0;
1024 if (flags & MSG_EOR)
1025 top->m_flags |= M_EOR;
1026 } else {
1027#ifdef ZERO_COPY_SOCKETS
1028 error = sosend_copyin(uio, &top, atomic, &space, flags);
1029 if (error)
1030 goto out;
1031#else
1032 /*
1033 * Copy the data from userland into a mbuf chain.
1034 * If no data is to be copied in, a single empty mbuf
1035 * is returned.
1036 */
1037 top = m_uiotombuf(uio, M_WAITOK, space, max_hdr,
1038 (M_PKTHDR | ((flags & MSG_EOR) ? M_EOR : 0)));
1039 if (top == NULL) {
1040 error = EFAULT; /* only possible error */
1041 goto out;
1042 }
1043 space -= resid - uio->uio_resid;
1044#endif
1045 resid = uio->uio_resid;
1046 }
1047 KASSERT(resid == 0, ("sosend_dgram: resid != 0"));
1048 /*
1049 * XXXRW: Frobbing SO_DONTROUTE here is even worse without sblock
1050 * than with.
1051 */
1052 if (dontroute) {
1053 SOCK_LOCK(so);
1054 so->so_options |= SO_DONTROUTE;
1055 SOCK_UNLOCK(so);
1056 }
1057 /*
1058 * XXX all the SBS_CANTSENDMORE checks previously done could be out
1059	 * of date.  We could have received a reset packet in an interrupt or
1060 * maybe we slept while doing page faults in uiomove() etc. We could
1061 * probably recheck again inside the locking protection here, but
1062 * there are probably other places that this also happens. We must
1063 * rethink this.
1064 */
1065 error = (*so->so_proto->pr_usrreqs->pru_send)(so,
1066 (flags & MSG_OOB) ? PRUS_OOB :
1067 /*
1068	 * If the user set MSG_EOF, the protocol understands this flag, and there
1069	 * is nothing left to send, then use PRU_SEND_EOF instead of PRU_SEND.
1070 */
1071 ((flags & MSG_EOF) &&
1072 (so->so_proto->pr_flags & PR_IMPLOPCL) &&
1073 (resid <= 0)) ?
1074 PRUS_EOF :
1075 /* If there is more to send set PRUS_MORETOCOME */
1076 (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
1077 top, addr, control, td);
1078 if (dontroute) {
1079 SOCK_LOCK(so);
1080 so->so_options &= ~SO_DONTROUTE;
1081 SOCK_UNLOCK(so);
1082 }
1083 clen = 0;
1084 control = NULL;
1085 top = NULL;
1086out:
1087 if (top != NULL)
1088 m_freem(top);
1089 if (control != NULL)
1090 m_freem(control);
1091 return (error);
1092}
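
/*
 * Userland view of the EMSGSIZE check above (resid > space): a datagram
 * larger than the socket's send buffer is rejected outright rather than
 * sent in pieces.  A minimal, self-contained sketch:
 */
#include <sys/socket.h>
#include <netinet/in.h>
#include <err.h>
#include <errno.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
	struct sockaddr_in sin;
	static char big[128 * 1024];	/* larger than the default UDP sndbuf */
	int s;

	if ((s = socket(AF_INET, SOCK_DGRAM, 0)) == -1)
		err(1, "socket");
	memset(&sin, 0, sizeof(sin));
	sin.sin_len = sizeof(sin);
	sin.sin_family = AF_INET;
	sin.sin_port = htons(9);	/* discard service */
	sin.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
	memset(big, 'x', sizeof(big));
	if (sendto(s, big, sizeof(big), 0, (struct sockaddr *)&sin,
	    sizeof(sin)) == -1)
		printf("sendto: %s\n", strerror(errno));	/* expect EMSGSIZE */
	return (0);
}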
1093
1094/*
1095 * Send on a socket. If send must go all at once and message is larger than
1096 * send buffering, then hard error. Lock against other senders. If must go
1097 * all at once and not enough room now, then inform user that this would
1098 * block and do nothing. Otherwise, if nonblocking, send as much as
1099 * possible. The data to be sent is described by "uio" if nonzero, otherwise
1100 * by the mbuf chain "top" (which must be null if uio is not). Data provided
1101 * in mbuf chain must be small enough to send all at once.
1102 *
1103 * Returns nonzero on error, timeout or signal; callers must check for short
1104 * counts if EINTR/ERESTART are returned. Data and control buffers are freed
1105 * on return.
1106 */
1107int
1108sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio,
1109 struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
1110{
1111 long space, resid;
1112 int clen = 0, error, dontroute;
1113 int atomic = sosendallatonce(so) || top;
1114
1115 if (uio != NULL)
1116 resid = uio->uio_resid;
1117 else
1118 resid = top->m_pkthdr.len;
1119 /*
1120 * In theory resid should be unsigned. However, space must be
1121 * signed, as it might be less than 0 if we over-committed, and we
1122 * must use a signed comparison of space and resid. On the other
1123 * hand, a negative resid causes us to loop sending 0-length
1124 * segments to the protocol.
1125 *
1126 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
1127 * type sockets since that's an error.
1128 */
1129 if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
1130 error = EINVAL;
1131 goto out;
1132 }
1133
1134 dontroute =
1135 (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
1136 (so->so_proto->pr_flags & PR_ATOMIC);
1137 if (td != NULL)
1138 td->td_ru.ru_msgsnd++;
1139 if (control != NULL)
1140 clen = control->m_len;
1141
1142 error = sblock(&so->so_snd, SBLOCKWAIT(flags));
1143 if (error)
1144 goto out;
1145
1146restart:
1147 do {
1148 SOCKBUF_LOCK(&so->so_snd);
1149 if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
1150 SOCKBUF_UNLOCK(&so->so_snd);
1151 error = EPIPE;
1152 goto release;
1153 }
1154 if (so->so_error) {
1155 error = so->so_error;
1156 so->so_error = 0;
1157 SOCKBUF_UNLOCK(&so->so_snd);
1158 goto release;
1159 }
1160 if ((so->so_state & SS_ISCONNECTED) == 0) {
1161 /*
1162			 * `sendto' and `sendmsg' are allowed on a connection-
1163 * based socket if it supports implied connect.
1164 * Return ENOTCONN if not connected and no address is
1165 * supplied.
1166 */
1167 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
1168 (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
1169 if ((so->so_state & SS_ISCONFIRMING) == 0 &&
1170 !(resid == 0 && clen != 0)) {
1171 SOCKBUF_UNLOCK(&so->so_snd);
1172 error = ENOTCONN;
1173 goto release;
1174 }
1175 } else if (addr == NULL) {
1176 SOCKBUF_UNLOCK(&so->so_snd);
1177 if (so->so_proto->pr_flags & PR_CONNREQUIRED)
1178 error = ENOTCONN;
1179 else
1180 error = EDESTADDRREQ;
1181 goto release;
1182 }
1183 }
1184 space = sbspace(&so->so_snd);
1185 if (flags & MSG_OOB)
1186 space += 1024;
1187 if ((atomic && resid > so->so_snd.sb_hiwat) ||
1188 clen > so->so_snd.sb_hiwat) {
1189 SOCKBUF_UNLOCK(&so->so_snd);
1190 error = EMSGSIZE;
1191 goto release;
1192 }
1193 if (space < resid + clen &&
1194 (atomic || space < so->so_snd.sb_lowat || space < clen)) {
1195 if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO)) {
1196 SOCKBUF_UNLOCK(&so->so_snd);
1197 error = EWOULDBLOCK;
1198 goto release;
1199 }
1200 error = sbwait(&so->so_snd);
1201 SOCKBUF_UNLOCK(&so->so_snd);
1202 if (error)
1203 goto release;
1204 goto restart;
1205 }
1206 SOCKBUF_UNLOCK(&so->so_snd);
1207 space -= clen;
1208 do {
1209 if (uio == NULL) {
1210 resid = 0;
1211 if (flags & MSG_EOR)
1212 top->m_flags |= M_EOR;
1213 } else {
1214#ifdef ZERO_COPY_SOCKETS
1215 error = sosend_copyin(uio, &top, atomic,
1216 &space, flags);
1217 if (error != 0)
1218 goto release;
1219#else
1220 /*
1221 * Copy the data from userland into a mbuf
1222 * chain. If no data is to be copied in,
1223 * a single empty mbuf is returned.
1224 */
1225 top = m_uiotombuf(uio, M_WAITOK, space,
1226 (atomic ? max_hdr : 0),
1227 (atomic ? M_PKTHDR : 0) |
1228 ((flags & MSG_EOR) ? M_EOR : 0));
1229 if (top == NULL) {
1230 error = EFAULT; /* only possible error */
1231 goto release;
1232 }
1233 space -= resid - uio->uio_resid;
1234#endif
1235 resid = uio->uio_resid;
1236 }
1237 if (dontroute) {
1238 SOCK_LOCK(so);
1239 so->so_options |= SO_DONTROUTE;
1240 SOCK_UNLOCK(so);
1241 }
1242 /*
1243 * XXX all the SBS_CANTSENDMORE checks previously
1244			 * done could be out of date.  We could have received
1245 * a reset packet in an interrupt or maybe we slept
1246 * while doing page faults in uiomove() etc. We
1247 * could probably recheck again inside the locking
1248 * protection here, but there are probably other
1249 * places that this also happens. We must rethink
1250 * this.
1251 */
1252 error = (*so->so_proto->pr_usrreqs->pru_send)(so,
1253 (flags & MSG_OOB) ? PRUS_OOB :
1254 /*
1255 * If the user set MSG_EOF, the protocol understands
1256			 * this flag, and there is nothing left to send, then use
1257 * PRU_SEND_EOF instead of PRU_SEND.
1258 */
1259 ((flags & MSG_EOF) &&
1260 (so->so_proto->pr_flags & PR_IMPLOPCL) &&
1261 (resid <= 0)) ?
1262 PRUS_EOF :
1263 /* If there is more to send set PRUS_MORETOCOME. */
1264 (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
1265 top, addr, control, td);
1266 if (dontroute) {
1267 SOCK_LOCK(so);
1268 so->so_options &= ~SO_DONTROUTE;
1269 SOCK_UNLOCK(so);
1270 }
1271 clen = 0;
1272 control = NULL;
1273 top = NULL;
1274 if (error)
1275 goto release;
1276 } while (resid && space > 0);
1277 } while (resid);
1278
1279release:
1280 sbunlock(&so->so_snd);
1281out:
1282 if (top != NULL)
1283 m_freem(top);
1284 if (control != NULL)
1285 m_freem(control);
1286 return (error);
1287}
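
/*
 * Userland analogue of the short-count caveat in the sosend_generic() header
 * comment: when a send is interrupted, the caller may see a partial write
 * and must resume where it left off.  A typical helper:
 */
#include <sys/types.h>
#include <errno.h>
#include <unistd.h>

static int
writeall(int s, const char *buf, size_t len)
{
	ssize_t n;

	while (len > 0) {
		if ((n = write(s, buf, len)) == -1) {
			if (errno == EINTR)
				continue;	/* interrupted; retry */
			return (-1);		/* real error */
		}
		buf += n;
		len -= n;
	}
	return (0);
}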
1288
1289int
1290sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
1291 struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
1292{
1293
1294 return (so->so_proto->pr_usrreqs->pru_sosend(so, addr, uio, top,
1295 control, flags, td));
1296}
1297
1298/*
1299 * The part of soreceive() that implements reading non-inline out-of-band
1300 * data from a socket. For more complete comments, see soreceive(), from
1301 * which this code originated.
1302 *
1303 * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is
1304 * unable to return an mbuf chain to the caller.
1305 */
1306static int
1307soreceive_rcvoob(struct socket *so, struct uio *uio, int flags)
1308{
1309 struct protosw *pr = so->so_proto;
1310 struct mbuf *m;
1311 int error;
1312
1313 KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0"));
1314
1315 m = m_get(M_WAIT, MT_DATA);
1316 error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
1317 if (error)
1318 goto bad;
1319 do {
1320#ifdef ZERO_COPY_SOCKETS
1321 if (so_zero_copy_receive) {
1322 int disposable;
1323
1324 if ((m->m_flags & M_EXT)
1325 && (m->m_ext.ext_type == EXT_DISPOSABLE))
1326 disposable = 1;
1327 else
1328 disposable = 0;
1329
1330 error = uiomoveco(mtod(m, void *),
1331 min(uio->uio_resid, m->m_len),
1332 uio, disposable);
1333 } else
1334#endif /* ZERO_COPY_SOCKETS */
1335 error = uiomove(mtod(m, void *),
1336 (int) min(uio->uio_resid, m->m_len), uio);
1337 m = m_free(m);
1338 } while (uio->uio_resid && error == 0 && m);
1339bad:
1340 if (m != NULL)
1341 m_freem(m);
1342 return (error);
1343}
1344
1345/*
1346 * Following replacement or removal of the first mbuf on the first mbuf chain
1347 * of a socket buffer, push necessary state changes back into the socket
1348 * buffer so that other consumers see the values consistently. 'nextrecord'
1349 * is the caller's locally stored value of the original value of
1350 * sb->sb_mb->m_nextpkt, which must be restored when the lead mbuf changes.
1351 * NOTE: 'nextrecord' may be NULL.
1352 */
1353static __inline void
1354sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord)
1355{
1356
1357 SOCKBUF_LOCK_ASSERT(sb);
1358 /*
1359 * First, update for the new value of nextrecord. If necessary, make
1360 * it the first record.
1361 */
1362 if (sb->sb_mb != NULL)
1363 sb->sb_mb->m_nextpkt = nextrecord;
1364 else
1365 sb->sb_mb = nextrecord;
1366
1367 /*
1368 * Now update any dependent socket buffer fields to reflect the new
1369 * state. This is an expanded inline of SB_EMPTY_FIXUP(), with the
1370 * addition of a second clause that takes care of the case where
1371 * sb_mb has been updated, but remains the last record.
1372 */
1373 if (sb->sb_mb == NULL) {
1374 sb->sb_mbtail = NULL;
1375 sb->sb_lastrecord = NULL;
1376 } else if (sb->sb_mb->m_nextpkt == NULL)
1377 sb->sb_lastrecord = sb->sb_mb;
1378}
1379
1380
1381/*
1382 * Implement receive operations on a socket. We depend on the way that
1383 * records are added to the sockbuf by sbappend. In particular, each record
1384 * (mbufs linked through m_next) must begin with an address if the protocol
1385 * so specifies, followed by an optional mbuf or mbufs containing ancillary
1386 * data, and then zero or more mbufs of data. In order to allow parallelism
1387 * between network receive and copying to user space, as well as avoid
1388 * sleeping with a mutex held, we release the socket buffer mutex during the
1389 * user space copy. Although the sockbuf is locked, new data may still be
1390 * appended, and thus we must maintain consistency of the sockbuf during that
1391 * time.
1392 *
1393 * The caller may receive the data as a single mbuf chain by supplying an
1394 * mbuf **mp0 for use in returning the chain. The uio is then used only for
1395 * the count in uio_resid.
1396 */
1397int
1398soreceive_generic(struct socket *so, struct sockaddr **psa, struct uio *uio,
1399 struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
1400{
1401 struct mbuf *m, **mp;
1402 int flags, len, error, offset;
1403 struct protosw *pr = so->so_proto;
1404 struct mbuf *nextrecord;
1405 int moff, type = 0;
1406 int orig_resid = uio->uio_resid;
1407
1408 mp = mp0;
1409 if (psa != NULL)
1410 *psa = NULL;
1411 if (controlp != NULL)
1412 *controlp = NULL;
1413 if (flagsp != NULL)
1414 flags = *flagsp &~ MSG_EOR;
1415 else
1416 flags = 0;
1417 if (flags & MSG_OOB)
1418 return (soreceive_rcvoob(so, uio, flags));
1419 if (mp != NULL)
1420 *mp = NULL;
1421 if ((pr->pr_flags & PR_WANTRCVD) && (so->so_state & SS_ISCONFIRMING)
1422 && uio->uio_resid)
1423 (*pr->pr_usrreqs->pru_rcvd)(so, 0);
1424
1425 error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
1426 if (error)
1427 return (error);
1428
1429restart:
1430 SOCKBUF_LOCK(&so->so_rcv);
1431 m = so->so_rcv.sb_mb;
1432 /*
1433 * If we have less data than requested, block awaiting more (subject
1434 * to any timeout) if:
1435 * 1. the current count is less than the low water mark, or
1436 * 2. MSG_WAITALL is set, and it is possible to do the entire
1437 * receive operation at once if we block (resid <= hiwat).
1438 * 3. MSG_DONTWAIT is not set
1439 * If MSG_WAITALL is set but resid is larger than the receive buffer,
1440 * we have to do the receive in sections, and thus risk returning a
1441 * short count if a timeout or signal occurs after we start.
1442 */
1443 if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
1444 so->so_rcv.sb_cc < uio->uio_resid) &&
1445 (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
1446 ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
1447 m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
1448 KASSERT(m != NULL || !so->so_rcv.sb_cc,
1449 ("receive: m == %p so->so_rcv.sb_cc == %u",
1450 m, so->so_rcv.sb_cc));
1451 if (so->so_error) {
1452 if (m != NULL)
1453 goto dontblock;
1454 error = so->so_error;
1455 if ((flags & MSG_PEEK) == 0)
1456 so->so_error = 0;
1457 SOCKBUF_UNLOCK(&so->so_rcv);
1458 goto release;
1459 }
1460 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1461 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
1462 if (m == NULL) {
1463 SOCKBUF_UNLOCK(&so->so_rcv);
1464 goto release;
1465 } else
1466 goto dontblock;
1467 }
1468 for (; m != NULL; m = m->m_next)
1469 if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
1470 m = so->so_rcv.sb_mb;
1471 goto dontblock;
1472 }
1473 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
1474 (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
1475 SOCKBUF_UNLOCK(&so->so_rcv);
1476 error = ENOTCONN;
1477 goto release;
1478 }
1479 if (uio->uio_resid == 0) {
1480 SOCKBUF_UNLOCK(&so->so_rcv);
1481 goto release;
1482 }
1483 if ((so->so_state & SS_NBIO) ||
1484 (flags & (MSG_DONTWAIT|MSG_NBIO))) {
1485 SOCKBUF_UNLOCK(&so->so_rcv);
1486 error = EWOULDBLOCK;
1487 goto release;
1488 }
1489 SBLASTRECORDCHK(&so->so_rcv);
1490 SBLASTMBUFCHK(&so->so_rcv);
1491 error = sbwait(&so->so_rcv);
1492 SOCKBUF_UNLOCK(&so->so_rcv);
1493 if (error)
1494 goto release;
1495 goto restart;
1496 }
1497dontblock:
1498 /*
1499 * From this point onward, we maintain 'nextrecord' as a cache of the
1500 * pointer to the next record in the socket buffer. We must keep the
1501 * various socket buffer pointers and local stack versions of the
1502 * pointers in sync, pushing out modifications before dropping the
1503 * socket buffer mutex, and re-reading them when picking it up.
1504 *
1505 * Otherwise, we will race with the network stack appending new data
1506 * or records onto the socket buffer by using inconsistent/stale
1507 * versions of the field, possibly resulting in socket buffer
1508 * corruption.
1509 *
1510 * By holding the high-level sblock(), we prevent simultaneous
1511 * readers from pulling off the front of the socket buffer.
1512 */
1513 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1514 if (uio->uio_td)
1515 uio->uio_td->td_ru.ru_msgrcv++;
1516 KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb"));
1517 SBLASTRECORDCHK(&so->so_rcv);
1518 SBLASTMBUFCHK(&so->so_rcv);
1519 nextrecord = m->m_nextpkt;
1520 if (pr->pr_flags & PR_ADDR) {
1521 KASSERT(m->m_type == MT_SONAME,
1522 ("m->m_type == %d", m->m_type));
1523 orig_resid = 0;
1524 if (psa != NULL)
1525 *psa = sodupsockaddr(mtod(m, struct sockaddr *),
1526 M_NOWAIT);
1527 if (flags & MSG_PEEK) {
1528 m = m->m_next;
1529 } else {
1530 sbfree(&so->so_rcv, m);
1531 so->so_rcv.sb_mb = m_free(m);
1532 m = so->so_rcv.sb_mb;
1533 sockbuf_pushsync(&so->so_rcv, nextrecord);
1534 }
1535 }
1536
1537 /*
1538 * Process one or more MT_CONTROL mbufs present before any data mbufs
1539 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
1540 * just copy the data; if !MSG_PEEK, we call into the protocol to
1541 * perform externalization (or freeing if controlp == NULL).
1542 */
1543 if (m != NULL && m->m_type == MT_CONTROL) {
1544 struct mbuf *cm = NULL, *cmn;
1545 struct mbuf **cme = &cm;
1546
1547 do {
1548 if (flags & MSG_PEEK) {
1549 if (controlp != NULL) {
1550 *controlp = m_copy(m, 0, m->m_len);
1551 controlp = &(*controlp)->m_next;
1552 }
1553 m = m->m_next;
1554 } else {
1555 sbfree(&so->so_rcv, m);
1556 so->so_rcv.sb_mb = m->m_next;
1557 m->m_next = NULL;
1558 *cme = m;
1559 cme = &(*cme)->m_next;
1560 m = so->so_rcv.sb_mb;
1561 }
1562 } while (m != NULL && m->m_type == MT_CONTROL);
1563 if ((flags & MSG_PEEK) == 0)
1564 sockbuf_pushsync(&so->so_rcv, nextrecord);
1565 while (cm != NULL) {
1566 cmn = cm->m_next;
1567 cm->m_next = NULL;
1568 if (pr->pr_domain->dom_externalize != NULL) {
1569 SOCKBUF_UNLOCK(&so->so_rcv);
1570 error = (*pr->pr_domain->dom_externalize)
1571 (cm, controlp);
1572 SOCKBUF_LOCK(&so->so_rcv);
1573 } else if (controlp != NULL)
1574 *controlp = cm;
1575 else
1576 m_freem(cm);
1577 if (controlp != NULL) {
1578 orig_resid = 0;
1579 while (*controlp != NULL)
1580 controlp = &(*controlp)->m_next;
1581 }
1582 cm = cmn;
1583 }
1584 if (m != NULL)
1585 nextrecord = so->so_rcv.sb_mb->m_nextpkt;
1586 else
1587 nextrecord = so->so_rcv.sb_mb;
1588 orig_resid = 0;
1589 }
1590 if (m != NULL) {
1591 if ((flags & MSG_PEEK) == 0) {
1592 KASSERT(m->m_nextpkt == nextrecord,
1593 ("soreceive: post-control, nextrecord !sync"));
1594 if (nextrecord == NULL) {
1595 KASSERT(so->so_rcv.sb_mb == m,
1596 ("soreceive: post-control, sb_mb!=m"));
1597 KASSERT(so->so_rcv.sb_lastrecord == m,
1598 ("soreceive: post-control, lastrecord!=m"));
1599 }
1600 }
1601 type = m->m_type;
1602 if (type == MT_OOBDATA)
1603 flags |= MSG_OOB;
1604 } else {
1605 if ((flags & MSG_PEEK) == 0) {
1606 KASSERT(so->so_rcv.sb_mb == nextrecord,
1607 ("soreceive: sb_mb != nextrecord"));
1608 if (so->so_rcv.sb_mb == NULL) {
1609 KASSERT(so->so_rcv.sb_lastrecord == NULL,
1610				    ("soreceive: sb_lastrecord != NULL"));
1611 }
1612 }
1613 }
1614 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1615 SBLASTRECORDCHK(&so->so_rcv);
1616 SBLASTMBUFCHK(&so->so_rcv);
1617
1618 /*
1619 * Now continue to read any data mbufs off of the head of the socket
1620 * buffer until the read request is satisfied. Note that 'type' is
1621 * used to store the type of any mbuf reads that have happened so far
1622 * such that soreceive() can stop reading if the type changes, which
1623 * causes soreceive() to return only one of regular data and inline
1624 * out-of-band data in a single socket receive operation.
1625 */
1626 moff = 0;
1627 offset = 0;
1628 while (m != NULL && uio->uio_resid > 0 && error == 0) {
1629 /*
1630 * If the type of mbuf has changed since the last mbuf
1631 * examined ('type'), end the receive operation.
1632 */
1633 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1634 if (m->m_type == MT_OOBDATA) {
1635 if (type != MT_OOBDATA)
1636 break;
1637 } else if (type == MT_OOBDATA)
1638 break;
1639 else
1640 KASSERT(m->m_type == MT_DATA,
1641 ("m->m_type == %d", m->m_type));
1642 so->so_rcv.sb_state &= ~SBS_RCVATMARK;
1643 len = uio->uio_resid;
1644 if (so->so_oobmark && len > so->so_oobmark - offset)
1645 len = so->so_oobmark - offset;
1646 if (len > m->m_len - moff)
1647 len = m->m_len - moff;
1648 /*
1649 * If mp is set, just pass back the mbufs. Otherwise copy
1650 * them out via the uio, then free. Sockbuf must be
1651 * consistent here (points to current mbuf, it points to next
1652 * record) when we drop priority; we must note any additions
1653 * to the sockbuf when we block interrupts again.
1654 */
1655 if (mp == NULL) {
1656 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1657 SBLASTRECORDCHK(&so->so_rcv);
1658 SBLASTMBUFCHK(&so->so_rcv);
1659 SOCKBUF_UNLOCK(&so->so_rcv);
1660#ifdef ZERO_COPY_SOCKETS
1661 if (so_zero_copy_receive) {
1662 int disposable;
1663
1664 if ((m->m_flags & M_EXT)
1665 && (m->m_ext.ext_type == EXT_DISPOSABLE))
1666 disposable = 1;
1667 else
1668 disposable = 0;
1669
1670 error = uiomoveco(mtod(m, char *) + moff,
1671 (int)len, uio,
1672 disposable);
1673 } else
1674#endif /* ZERO_COPY_SOCKETS */
1675 error = uiomove(mtod(m, char *) + moff, (int)len, uio);
1676 SOCKBUF_LOCK(&so->so_rcv);
1677 if (error) {
1678 /*
1679 * The MT_SONAME mbuf has already been removed
1680 * from the record, so it is necessary to
1681 * remove the data mbufs, if any, to preserve
1682 * the invariant in the case of PR_ADDR that
1683 * requires MT_SONAME mbufs at the head of
1684 * each record.
1685 */
1686 if (m && pr->pr_flags & PR_ATOMIC &&
1687 ((flags & MSG_PEEK) == 0))
1688 (void)sbdroprecord_locked(&so->so_rcv);
1689 SOCKBUF_UNLOCK(&so->so_rcv);
1690 goto release;
1691 }
1692 } else
1693 uio->uio_resid -= len;
1694 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1695 if (len == m->m_len - moff) {
1696 if (m->m_flags & M_EOR)
1697 flags |= MSG_EOR;
1698 if (flags & MSG_PEEK) {
1699 m = m->m_next;
1700 moff = 0;
1701 } else {
1702 nextrecord = m->m_nextpkt;
1703 sbfree(&so->so_rcv, m);
1704 if (mp != NULL) {
1705 *mp = m;
1706 mp = &m->m_next;
1707 so->so_rcv.sb_mb = m = m->m_next;
1708 *mp = NULL;
1709 } else {
1710 so->so_rcv.sb_mb = m_free(m);
1711 m = so->so_rcv.sb_mb;
1712 }
1713 sockbuf_pushsync(&so->so_rcv, nextrecord);
1714 SBLASTRECORDCHK(&so->so_rcv);
1715 SBLASTMBUFCHK(&so->so_rcv);
1716 }
1717 } else {
1718 if (flags & MSG_PEEK)
1719 moff += len;
1720 else {
1721 if (mp != NULL) {
1722 int copy_flag;
1723
1724 if (flags & MSG_DONTWAIT)
1725 copy_flag = M_DONTWAIT;
1726 else
1727 copy_flag = M_WAIT;
1728 if (copy_flag == M_WAIT)
1729 SOCKBUF_UNLOCK(&so->so_rcv);
1730 *mp = m_copym(m, 0, len, copy_flag);
1731 if (copy_flag == M_WAIT)
1732 SOCKBUF_LOCK(&so->so_rcv);
1733 if (*mp == NULL) {
1734 /*
1735 * m_copym() couldn't
1736 * allocate an mbuf. Adjust
1737 * uio_resid back (it was
1738 * adjusted down by len
1739 * bytes, which we didn't end
1740 * up "copying" over).
1741 */
1742 uio->uio_resid += len;
1743 break;
1744 }
1745 }
1746 m->m_data += len;
1747 m->m_len -= len;
1748 so->so_rcv.sb_cc -= len;
1749 }
1750 }
1751 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1752 if (so->so_oobmark) {
1753 if ((flags & MSG_PEEK) == 0) {
1754 so->so_oobmark -= len;
1755 if (so->so_oobmark == 0) {
1756 so->so_rcv.sb_state |= SBS_RCVATMARK;
1757 break;
1758 }
1759 } else {
1760 offset += len;
1761 if (offset == so->so_oobmark)
1762 break;
1763 }
1764 }
1765 if (flags & MSG_EOR)
1766 break;
1767 /*
1768 * If the MSG_WAITALL flag is set (for non-atomic socket), we
1769 * must not quit until "uio->uio_resid == 0" or an error
1770 * termination. If a signal/timeout occurs, return with a
1771 * short count but without error. Keep sockbuf locked
1772 * against other readers.
1773 */
1774 while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
1775 !sosendallatonce(so) && nextrecord == NULL) {
1776 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1777 if (so->so_error || so->so_rcv.sb_state & SBS_CANTRCVMORE)
1778 break;
1779 /*
1780 * Notify the protocol that some data has been
1781 * drained before blocking.
1782 */
1783 if (pr->pr_flags & PR_WANTRCVD) {
1784 SOCKBUF_UNLOCK(&so->so_rcv);
1785 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
1786 SOCKBUF_LOCK(&so->so_rcv);
1787 }
1788 SBLASTRECORDCHK(&so->so_rcv);
1789 SBLASTMBUFCHK(&so->so_rcv);
1790 error = sbwait(&so->so_rcv);
1791 if (error) {
1792 SOCKBUF_UNLOCK(&so->so_rcv);
1793 goto release;
1794 }
1795 m = so->so_rcv.sb_mb;
1796 if (m != NULL)
1797 nextrecord = m->m_nextpkt;
1798 }
1799 }
1800
1801 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1802 if (m != NULL && pr->pr_flags & PR_ATOMIC) {
1803 flags |= MSG_TRUNC;
1804 if ((flags & MSG_PEEK) == 0)
1805 (void) sbdroprecord_locked(&so->so_rcv);
1806 }
1807 if ((flags & MSG_PEEK) == 0) {
1808 if (m == NULL) {
1809 /*
1810 * First part is an inline SB_EMPTY_FIXUP(). Second
1811 * part makes sure sb_lastrecord is up-to-date if
1812 * there is still data in the socket buffer.
1813 */
1814 so->so_rcv.sb_mb = nextrecord;
1815 if (so->so_rcv.sb_mb == NULL) {
1816 so->so_rcv.sb_mbtail = NULL;
1817 so->so_rcv.sb_lastrecord = NULL;
1818 } else if (nextrecord->m_nextpkt == NULL)
1819 so->so_rcv.sb_lastrecord = nextrecord;
1820 }
1821 SBLASTRECORDCHK(&so->so_rcv);
1822 SBLASTMBUFCHK(&so->so_rcv);
1823 /*
1824 * If soreceive() is being done from the socket callback,
1825		 * then we don't need to generate an ACK to the peer to update
1826		 * the window, since an ACK will be generated on return to TCP.
1827 */
1828 if (!(flags & MSG_SOCALLBCK) &&
1829 (pr->pr_flags & PR_WANTRCVD)) {
1830 SOCKBUF_UNLOCK(&so->so_rcv);
1831 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
1832 SOCKBUF_LOCK(&so->so_rcv);
1833 }
1834 }
1835 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1836 if (orig_resid == uio->uio_resid && orig_resid &&
1837 (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) {
1838 SOCKBUF_UNLOCK(&so->so_rcv);
1839 goto restart;
1840 }
1841 SOCKBUF_UNLOCK(&so->so_rcv);
1842
1843 if (flagsp != NULL)
1844 *flagsp |= flags;
1845release:
1846 sbunlock(&so->so_rcv);
1847 return (error);
1848}
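
/*
 * Userland view of the MSG_WAITALL handling above: a stream receiver can ask
 * to block until the full request is available, but must still tolerate a
 * short count if EOF, a pending error, or a caught signal ends the wait
 * early.  A minimal sketch:
 */
#include <sys/types.h>
#include <sys/socket.h>

static ssize_t
recv_exact(int s, void *buf, size_t len)
{
	ssize_t n;

	n = recv(s, buf, len, MSG_WAITALL);
	/*
	 * n == len on success; 0 <= n < len means EOF or an interrupted wait
	 * delivered only part of the data; -1 means an error.
	 */
	return (n);
}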
1849
1850/*
1851 * Optimized version of soreceive() for simple datagram cases from userspace.
1852 * Unlike in the stream case, we're able to drop a datagram if copyout()
1853 * fails, and because we handle datagrams atomically, we don't need to use a
1854 * sleep lock to prevent I/O interlacing.
1855 */
1856int
1857soreceive_dgram(struct socket *so, struct sockaddr **psa, struct uio *uio,
1858 struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
1859{
1860 struct mbuf *m, *m2;
1861 int flags, len, error, offset;
1862 struct protosw *pr = so->so_proto;
1863 struct mbuf *nextrecord;
1864
1865 if (psa != NULL)
1866 *psa = NULL;
1867 if (controlp != NULL)
1868 *controlp = NULL;
1869 if (flagsp != NULL)
1870 flags = *flagsp &~ MSG_EOR;
1871 else
1872 flags = 0;
1873
1874 /*
1875 * For any complicated cases, fall back to the full
1876 * soreceive_generic().
1877 */
1878 if (mp0 != NULL || (flags & MSG_PEEK) || (flags & MSG_OOB))
1879 return (soreceive_generic(so, psa, uio, mp0, controlp,
1880 flagsp));
1881
1882 /*
1883 * Enforce restrictions on use.
1884 */
1885 KASSERT((pr->pr_flags & PR_WANTRCVD) == 0,
1886 ("soreceive_dgram: wantrcvd"));
1887 KASSERT(pr->pr_flags & PR_ATOMIC, ("soreceive_dgram: !atomic"));
1888 KASSERT((so->so_rcv.sb_state & SBS_RCVATMARK) == 0,
1889 ("soreceive_dgram: SBS_RCVATMARK"));
1890 KASSERT((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0,
1891 ("soreceive_dgram: P_CONNREQUIRED"));
1892
1893 /*
1894 * Loop blocking while waiting for a datagram.
1895 */
1896 SOCKBUF_LOCK(&so->so_rcv);
1897 while ((m = so->so_rcv.sb_mb) == NULL) {
1898 KASSERT(so->so_rcv.sb_cc == 0,
1899 ("soreceive_dgram: sb_mb NULL but sb_cc %u",
1900 so->so_rcv.sb_cc));
1901 if (so->so_error) {
1902 error = so->so_error;
1903 so->so_error = 0;
1904 SOCKBUF_UNLOCK(&so->so_rcv);
1905 return (error);
1906 }
1907 if (so->so_rcv.sb_state & SBS_CANTRCVMORE ||
1908 uio->uio_resid == 0) {
1909 SOCKBUF_UNLOCK(&so->so_rcv);
1910 return (0);
1911 }
1912 if ((so->so_state & SS_NBIO) ||
1913 (flags & (MSG_DONTWAIT|MSG_NBIO))) {
1914 SOCKBUF_UNLOCK(&so->so_rcv);
1915 return (EWOULDBLOCK);
1916 }
1917 SBLASTRECORDCHK(&so->so_rcv);
1918 SBLASTMBUFCHK(&so->so_rcv);
1919 error = sbwait(&so->so_rcv);
1920 if (error) {
1921 SOCKBUF_UNLOCK(&so->so_rcv);
1922 return (error);
1923 }
1924 }
1925 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1926
1927 if (uio->uio_td)
1928 uio->uio_td->td_ru.ru_msgrcv++;
1929 SBLASTRECORDCHK(&so->so_rcv);
1930 SBLASTMBUFCHK(&so->so_rcv);
1931 nextrecord = m->m_nextpkt;
1932 if (nextrecord == NULL) {
1933 KASSERT(so->so_rcv.sb_lastrecord == m,
1934 ("soreceive_dgram: lastrecord != m"));
1935 }
1936
1937 KASSERT(so->so_rcv.sb_mb->m_nextpkt == nextrecord,
1938 ("soreceive_dgram: m_nextpkt != nextrecord"));
1939
1940 /*
1941 * Pull 'm' and its chain off the front of the packet queue.
1942 */
1943 so->so_rcv.sb_mb = NULL;
1944 sockbuf_pushsync(&so->so_rcv, nextrecord);
1945
1946 /*
1947 * Walk 'm's chain and free that many bytes from the socket buffer.
1948 */
1949 for (m2 = m; m2 != NULL; m2 = m2->m_next)
1950 sbfree(&so->so_rcv, m2);
1951
1952 /*
1953 * Do a few last checks before we let go of the lock.
1954 */
1955 SBLASTRECORDCHK(&so->so_rcv);
1956 SBLASTMBUFCHK(&so->so_rcv);
1957 SOCKBUF_UNLOCK(&so->so_rcv);
1958
1959 if (pr->pr_flags & PR_ADDR) {
1960 KASSERT(m->m_type == MT_SONAME,
1961 ("m->m_type == %d", m->m_type));
1962 if (psa != NULL)
1963 *psa = sodupsockaddr(mtod(m, struct sockaddr *),
1964 M_NOWAIT);
1965 m = m_free(m);
1966 }
1967 if (m == NULL) {
1968 /* XXXRW: Can this happen? */
1969 return (0);
1970 }
1971
1972 /*
1973 * Packet to copyout() is now in 'm' and it is disconnected from the
1974 * queue.
1975 *
1976 * Process one or more MT_CONTROL mbufs present before any data mbufs
1977 * in the first mbuf chain on the socket buffer. We call into the
1978 * protocol to perform externalization (or freeing if controlp ==
1979 * NULL).
1980 */
1981 if (m->m_type == MT_CONTROL) {
1982 struct mbuf *cm = NULL, *cmn;
1983 struct mbuf **cme = &cm;
1984
1985 do {
1986 m2 = m->m_next;
1987 m->m_next = NULL;
1988 *cme = m;
1989 cme = &(*cme)->m_next;
1990 m = m2;
1991 } while (m != NULL && m->m_type == MT_CONTROL);
1992 while (cm != NULL) {
1993 cmn = cm->m_next;
1994 cm->m_next = NULL;
1995 if (pr->pr_domain->dom_externalize != NULL) {
1996 error = (*pr->pr_domain->dom_externalize)
1997 (cm, controlp);
1998 } else if (controlp != NULL)
1999 *controlp = cm;
2000 else
2001 m_freem(cm);
2002 if (controlp != NULL) {
2003 while (*controlp != NULL)
2004 controlp = &(*controlp)->m_next;
2005 }
2006 cm = cmn;
2007 }
2008 }
2009 KASSERT(m->m_type == MT_DATA, ("soreceive_dgram: !data"));
2010
2011 offset = 0;
2012 while (m != NULL && uio->uio_resid > 0) {
2013 len = uio->uio_resid;
2014 if (len > m->m_len)
2015 len = m->m_len;
2016 error = uiomove(mtod(m, char *), (int)len, uio);
2017 if (error) {
2018 m_freem(m);
2019 return (error);
2020 }
2021 m = m_free(m);
2022 }
2023 if (m != NULL)
2024 flags |= MSG_TRUNC;
2025 m_freem(m);
2026 if (flagsp != NULL)
2027 *flagsp |= flags;
2028 return (0);
2029}
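
/*
 * Userland view of the MSG_TRUNC path above: if a datagram does not fit in
 * the supplied buffer, the excess is discarded and MSG_TRUNC is reported in
 * msg_flags.  A minimal sketch using recvmsg():
 */
#include <sys/types.h>
#include <sys/uio.h>
#include <sys/socket.h>
#include <stdio.h>
#include <string.h>

static void
recv_one_dgram(int s)
{
	char buf[512];
	struct iovec iov;
	struct msghdr msg;
	ssize_t n;

	iov.iov_base = buf;
	iov.iov_len = sizeof(buf);
	memset(&msg, 0, sizeof(msg));
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;
	if ((n = recvmsg(s, &msg, 0)) >= 0 &&
	    (msg.msg_flags & MSG_TRUNC) != 0)
		printf("datagram truncated to %zd bytes\n", n);
}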
2030
2031int
2032soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
2033 struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
2034{
2035
2036 return (so->so_proto->pr_usrreqs->pru_soreceive(so, psa, uio, mp0,
2037 controlp, flagsp));
2038}
2039
2040int
2041soshutdown(struct socket *so, int how)
2042{
2043 struct protosw *pr = so->so_proto;
2044
2045 if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
2046 return (EINVAL);
2047 if (pr->pr_usrreqs->pru_flush != NULL) {
2048 (*pr->pr_usrreqs->pru_flush)(so, how);
2049 }
2050 if (how != SHUT_WR)
2051 sorflush(so);
2052 if (how != SHUT_RD)
2053 return ((*pr->pr_usrreqs->pru_shutdown)(so));
2054 return (0);
2055}
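
/*
 * Userland view of soshutdown(): shutdown(2) with SHUT_WR performs a
 * write-side half close, after which the peer sees EOF while this side can
 * keep reading until the peer closes in turn.  A minimal sketch:
 */
#include <sys/types.h>
#include <sys/socket.h>
#include <unistd.h>

static void
half_close_and_drain(int s)
{
	char buf[4096];

	(void)shutdown(s, SHUT_WR);	/* no more sends; receives still work */
	while (read(s, buf, sizeof(buf)) > 0)
		;			/* drain whatever the peer still sends */
}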
2056
2057void
2058sorflush(struct socket *so)
2059{
2060 struct sockbuf *sb = &so->so_rcv;
2061 struct protosw *pr = so->so_proto;
2062 struct sockbuf asb;
2063
2064 /*
2065 * In order to avoid calling dom_dispose with the socket buffer mutex
2066 * held, and in order to generally avoid holding the lock for a long
2067 * time, we make a copy of the socket buffer and clear the original
2068 * (except locks, state). The new socket buffer copy won't have
2069 * initialized locks so we can only call routines that won't use or
2070 * assert those locks.
2071 *
2072 * Dislodge threads currently blocked in receive and wait to acquire
2073 * a lock against other simultaneous readers before clearing the
2074 * socket buffer. Don't let our acquire be interrupted by a signal
2075	 * despite any existing socket disposition on interruptible waiting.
2076 */
2077 socantrcvmore(so);
2078 (void) sblock(sb, SBL_WAIT | SBL_NOINTR);
2079
2080 /*
2081 * Invalidate/clear most of the sockbuf structure, but leave selinfo
2082 * and mutex data unchanged.
2083 */
2084 SOCKBUF_LOCK(sb);
2085 bzero(&asb, offsetof(struct sockbuf, sb_startzero));
2086 bcopy(&sb->sb_startzero, &asb.sb_startzero,
2087 sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
2088 bzero(&sb->sb_startzero,
2089 sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
2090 SOCKBUF_UNLOCK(sb);
2091 sbunlock(sb);
2092
2093 /*
2094 * Dispose of special rights and flush the socket buffer. Don't call
2095 * any unsafe routines (that rely on locks being initialized) on asb.
2096 */
2097 if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
2098 (*pr->pr_domain->dom_dispose)(asb.sb_mb);
2099 sbrelease_internal(&asb, so);
2100}
2101
2102/*
2103 * Perhaps this routine, and sooptcopyout(), below, ought to come in an
2104 * additional variant to handle the case where the option value needs to be
2105 * some kind of integer, but not a specific size. In addition to their use
2106 * here, these functions are also called by the protocol-level pr_ctloutput()
2107 * routines.
2108 */
2109int
2110sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
2111{
2112 size_t valsize;
2113
2114 /*
2115 * If the user gives us more than we wanted, we ignore it, but if we
2116 * don't get the minimum length the caller wants, we return EINVAL.
2117 * On success, sopt->sopt_valsize is set to however much we actually
2118 * retrieved.
2119 */
2120 if ((valsize = sopt->sopt_valsize) < minlen)
2121 return EINVAL;
2122 if (valsize > len)
2123 sopt->sopt_valsize = valsize = len;
2124
2125 if (sopt->sopt_td != NULL)
2126 return (copyin(sopt->sopt_val, buf, valsize));
2127
2128 bcopy(sopt->sopt_val, buf, valsize);
2129 return (0);
2130}
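
/*
 * Sketch of how a protocol-level pr_ctloutput() handler typically uses
 * sooptcopyin() for an integer-valued option, as the comment above alludes
 * to.  xp_ctloutput_set() and its validation rule are invented for
 * illustration.
 */
static int
xp_ctloutput_set(struct socket *so, struct sockopt *sopt)
{
	int error, optval;

	error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval));
	if (error)
		return (error);
	if (optval < 0)
		return (EINVAL);
	/* ... store optval in the protocol control block ... */
	return (0);
}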
2131
2132/*
2133 * Kernel version of setsockopt(2).
2134 *
2135 * XXX: optlen is size_t, not socklen_t
2136 */
2137int
2138so_setsockopt(struct socket *so, int level, int optname, void *optval,
2139 size_t optlen)
2140{
2141 struct sockopt sopt;
2142
2143 sopt.sopt_level = level;
2144 sopt.sopt_name = optname;
2145 sopt.sopt_dir = SOPT_SET;
2146 sopt.sopt_val = optval;
2147 sopt.sopt_valsize = optlen;
2148 sopt.sopt_td = NULL;
2149 return (sosetopt(so, &sopt));
2150}
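
/*
 * Sketch of an in-kernel consumer using the wrapper above to set a boolean
 * socket option without building a struct sockopt by hand; the function
 * name is invented for illustration.
 */
static int
xp_set_reuseaddr(struct socket *so)
{
	int one = 1;

	return (so_setsockopt(so, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one)));
}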
2151
2152int
2153sosetopt(struct socket *so, struct sockopt *sopt)
2154{
2155 int error, optval;
2156 struct linger l;
2157 struct timeval tv;
2158 u_long val;
2159#ifdef MAC
2160 struct mac extmac;
2161#endif
2162
2163 error = 0;
2164 if (sopt->sopt_level != SOL_SOCKET) {
2165 if (so->so_proto && so->so_proto->pr_ctloutput)
2166 return ((*so->so_proto->pr_ctloutput)
2167 (so, sopt));
2168 error = ENOPROTOOPT;
2169 } else {
2170 switch (sopt->sopt_name) {
2171#ifdef INET
2172 case SO_ACCEPTFILTER:
2173 error = do_setopt_accept_filter(so, sopt);
2174 if (error)
2175 goto bad;
2176 break;
2177#endif
2178 case SO_LINGER:
2179 error = sooptcopyin(sopt, &l, sizeof l, sizeof l);
2180 if (error)
2181 goto bad;
2182
2183 SOCK_LOCK(so);
2184 so->so_linger = l.l_linger;
2185 if (l.l_onoff)
2186 so->so_options |= SO_LINGER;
2187 else
2188 so->so_options &= ~SO_LINGER;
2189 SOCK_UNLOCK(so);
2190 break;
2191
2192 case SO_DEBUG:
2193 case SO_KEEPALIVE:
2194 case SO_DONTROUTE:
2195 case SO_USELOOPBACK:
2196 case SO_BROADCAST:
2197 case SO_REUSEADDR:
2198 case SO_REUSEPORT:
2199 case SO_OOBINLINE:
2200 case SO_TIMESTAMP:
2201 case SO_BINTIME:
2202 case SO_NOSIGPIPE:
2203 case SO_NO_DDP:
2204 case SO_NO_OFFLOAD:
2205 error = sooptcopyin(sopt, &optval, sizeof optval,
2206 sizeof optval);
2207 if (error)
2208 goto bad;
2209 SOCK_LOCK(so);
2210 if (optval)
2211 so->so_options |= sopt->sopt_name;
2212 else
2213 so->so_options &= ~sopt->sopt_name;
2214 SOCK_UNLOCK(so);
2215 break;
2216
2217 case SO_SETFIB:
2218 error = sooptcopyin(sopt, &optval, sizeof optval,
2219 sizeof optval);
2220 if (optval < 1 || optval > rt_numfibs) {
2221 error = EINVAL;
2222 goto bad;
2223 }
2224 if ((so->so_proto->pr_domain->dom_family == PF_INET) ||
2225 (so->so_proto->pr_domain->dom_family == PF_ROUTE)) {
2226 so->so_fibnum = optval;
2227 /* Note: ignore error */
2228 if (so->so_proto && so->so_proto->pr_ctloutput)
2229 (*so->so_proto->pr_ctloutput)(so, sopt);
2230 } else {
2231 so->so_fibnum = 0;
2232 }
2233 break;
2234 case SO_SNDBUF:
2235 case SO_RCVBUF:
2236 case SO_SNDLOWAT:
2237 case SO_RCVLOWAT:
2238 error = sooptcopyin(sopt, &optval, sizeof optval,
2239 sizeof optval);
2240 if (error)
2241 goto bad;
2242
2243 /*
2244 * Values < 1 make no sense for any of these options,
2245 * so disallow them.
2246 */
2247 if (optval < 1) {
2248 error = EINVAL;
2249 goto bad;
2250 }
2251
2252 switch (sopt->sopt_name) {
2253 case SO_SNDBUF:
2254 case SO_RCVBUF:
2255 if (sbreserve(sopt->sopt_name == SO_SNDBUF ?
2256 &so->so_snd : &so->so_rcv, (u_long)optval,
2257 so, curthread) == 0) {
2258 error = ENOBUFS;
2259 goto bad;
2260 }
2261 (sopt->sopt_name == SO_SNDBUF ? &so->so_snd :
2262 &so->so_rcv)->sb_flags &= ~SB_AUTOSIZE;
2263 break;
2264
2265 /*
2266 * Make sure the low-water is never greater than the
2267 * high-water.
2268 */
2269 case SO_SNDLOWAT:
2270 SOCKBUF_LOCK(&so->so_snd);
2271 so->so_snd.sb_lowat =
2272 (optval > so->so_snd.sb_hiwat) ?
2273 so->so_snd.sb_hiwat : optval;
2274 SOCKBUF_UNLOCK(&so->so_snd);
2275 break;
2276 case SO_RCVLOWAT:
2277 SOCKBUF_LOCK(&so->so_rcv);
2278 so->so_rcv.sb_lowat =
2279 (optval > so->so_rcv.sb_hiwat) ?
2280 so->so_rcv.sb_hiwat : optval;
2281 SOCKBUF_UNLOCK(&so->so_rcv);
2282 break;
2283 }
2284 break;
2285
2286 case SO_SNDTIMEO:
2287 case SO_RCVTIMEO:
2288#ifdef COMPAT_IA32
2289 if (SV_CURPROC_FLAG(SV_ILP32)) {
2290 struct timeval32 tv32;
2291
2292 error = sooptcopyin(sopt, &tv32, sizeof tv32,
2293 sizeof tv32);
2294 CP(tv32, tv, tv_sec);
2295 CP(tv32, tv, tv_usec);
2296 } else
2297#endif
2298 error = sooptcopyin(sopt, &tv, sizeof tv,
2299 sizeof tv);
2300 if (error)
2301 goto bad;
2302
2303 /* assert(hz > 0); */
2304 if (tv.tv_sec < 0 || tv.tv_sec > INT_MAX / hz ||
2305 tv.tv_usec < 0 || tv.tv_usec >= 1000000) {
2306 error = EDOM;
2307 goto bad;
2308 }
2309 /* assert(tick > 0); */
2310 /* assert(ULONG_MAX - INT_MAX >= 1000000); */
2311 val = (u_long)(tv.tv_sec * hz) + tv.tv_usec / tick;
2312 if (val > INT_MAX) {
2313 error = EDOM;
2314 goto bad;
2315 }
2316 if (val == 0 && tv.tv_usec != 0)
2317 val = 1;
2318
2319 switch (sopt->sopt_name) {
2320 case SO_SNDTIMEO:
2321 so->so_snd.sb_timeo = val;
2322 break;
2323 case SO_RCVTIMEO:
2324 so->so_rcv.sb_timeo = val;
2325 break;
2326 }
2327 break;
2328
2329 case SO_LABEL:
2330#ifdef MAC
2331 error = sooptcopyin(sopt, &extmac, sizeof extmac,
2332 sizeof extmac);
2333 if (error)
2334 goto bad;
2335 error = mac_setsockopt_label(sopt->sopt_td->td_ucred,
2336 so, &extmac);
2337#else
2338 error = EOPNOTSUPP;
2339#endif
2340 break;
2341
2342 default:
2343 error = ENOPROTOOPT;
2344 break;
2345 }
2346 if (error == 0 && so->so_proto != NULL &&
2347 so->so_proto->pr_ctloutput != NULL) {
2348 (void) ((*so->so_proto->pr_ctloutput)
2349 (so, sopt));
2350 }
2351 }
2352bad:
2353 return (error);
2354}
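
/*
 * Userland view of the SO_SNDTIMEO/SO_RCVTIMEO handling above: the supplied
 * timeval is converted to scheduler ticks (tv_sec * hz + tv_usec / tick) and
 * rejected with EDOM if it does not fit.  Setting a 2.5 second receive
 * timeout from user space looks like this:
 */
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/time.h>

static int
set_rcv_timeout(int s)
{
	struct timeval tv;

	tv.tv_sec = 2;
	tv.tv_usec = 500000;	/* tv_usec must be in [0, 1000000) */
	return (setsockopt(s, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)));
}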
2355
2356/*
2357 * Helper routine for getsockopt.
2358 */
2359int
2360sooptcopyout(struct sockopt *sopt, const void *buf, size_t len)
2361{
2362 int error;
2363 size_t valsize;
2364
2365 error = 0;
2366
2367 /*
2368 * Documented get behavior is that we always return a value, possibly
2369 * truncated to fit in the user's buffer. Traditional behavior is
2370 * that we always tell the user precisely how much we copied, rather
2371 * than something useful like the total amount we had available for
2372 * her. Note that this interface is not idempotent; the entire
2373	 * answer must be generated ahead of time.
2374 */
2375 valsize = min(len, sopt->sopt_valsize);
2376 sopt->sopt_valsize = valsize;
2377 if (sopt->sopt_val != NULL) {
2378 if (sopt->sopt_td != NULL)
2379 error = copyout(buf, sopt->sopt_val, valsize);
2380 else
2381 bcopy(buf, sopt->sopt_val, valsize);
2382 }
2383 return (error);
2384}
2385
2386int
2387sogetopt(struct socket *so, struct sockopt *sopt)
2388{
2389 int error, optval;
2390 struct linger l;
2391 struct timeval tv;
2392#ifdef MAC
2393 struct mac extmac;
2394#endif
2395
2396 error = 0;
2397 if (sopt->sopt_level != SOL_SOCKET) {
2398 if (so->so_proto && so->so_proto->pr_ctloutput) {
2399 return ((*so->so_proto->pr_ctloutput)
2400 (so, sopt));
2401 } else
2402 return (ENOPROTOOPT);
2403 } else {
2404 switch (sopt->sopt_name) {
2405#ifdef INET
2406 case SO_ACCEPTFILTER:
2407 error = do_getopt_accept_filter(so, sopt);
2408 break;
2409#endif
2410 case SO_LINGER:
2411 SOCK_LOCK(so);
2412 l.l_onoff = so->so_options & SO_LINGER;
2413 l.l_linger = so->so_linger;
2414 SOCK_UNLOCK(so);
2415 error = sooptcopyout(sopt, &l, sizeof l);
2416 break;
2417
2418 case SO_USELOOPBACK:
2419 case SO_DONTROUTE:
2420 case SO_DEBUG:
2421 case SO_KEEPALIVE:
2422 case SO_REUSEADDR:
2423 case SO_REUSEPORT:
2424 case SO_BROADCAST:
2425 case SO_OOBINLINE:
2426 case SO_ACCEPTCONN:
2427 case SO_TIMESTAMP:
2428 case SO_BINTIME:
2429 case SO_NOSIGPIPE:
2430 optval = so->so_options & sopt->sopt_name;
2431integer:
2432 error = sooptcopyout(sopt, &optval, sizeof optval);
2433 break;
2434
2435 case SO_TYPE:
2436 optval = so->so_type;
2437 goto integer;
2438
2439 case SO_ERROR:
2440 SOCK_LOCK(so);
2441 optval = so->so_error;
2442 so->so_error = 0;
2443 SOCK_UNLOCK(so);
2444 goto integer;
2445
2446 case SO_SNDBUF:
2447 optval = so->so_snd.sb_hiwat;
2448 goto integer;
2449
2450 case SO_RCVBUF:
2451 optval = so->so_rcv.sb_hiwat;
2452 goto integer;
2453
2454 case SO_SNDLOWAT:
2455 optval = so->so_snd.sb_lowat;
2456 goto integer;
2457
2458 case SO_RCVLOWAT:
2459 optval = so->so_rcv.sb_lowat;
2460 goto integer;
2461
2462 case SO_SNDTIMEO:
2463 case SO_RCVTIMEO:
2464 optval = (sopt->sopt_name == SO_SNDTIMEO ?
2465 so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
2466
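			/* Convert the stored tick count back to a timeval. */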
2467 tv.tv_sec = optval / hz;
2468 tv.tv_usec = (optval % hz) * tick;
2469#ifdef COMPAT_IA32
2470 if (SV_CURPROC_FLAG(SV_ILP32)) {
2471 struct timeval32 tv32;
2472
2473 CP(tv, tv32, tv_sec);
2474 CP(tv, tv32, tv_usec);
2475 error = sooptcopyout(sopt, &tv32, sizeof tv32);
2476 } else
2477#endif
2478 error = sooptcopyout(sopt, &tv, sizeof tv);
2479 break;
2480
2481 case SO_LABEL:
2482#ifdef MAC
2483 error = sooptcopyin(sopt, &extmac, sizeof(extmac),
2484 sizeof(extmac));
2485 if (error)
2486 return (error);
2487 error = mac_getsockopt_label(sopt->sopt_td->td_ucred,
2488 so, &extmac);
2489 if (error)
2490 return (error);
2491 error = sooptcopyout(sopt, &extmac, sizeof extmac);
2492#else
2493 error = EOPNOTSUPP;
2494#endif
2495 break;
2496
2497 case SO_PEERLABEL:
2498#ifdef MAC
2499 error = sooptcopyin(sopt, &extmac, sizeof(extmac),
2500 sizeof(extmac));
2501 if (error)
2502 return (error);
2503 error = mac_getsockopt_peerlabel(
2504 sopt->sopt_td->td_ucred, so, &extmac);
2505 if (error)
2506 return (error);
2507 error = sooptcopyout(sopt, &extmac, sizeof extmac);
2508#else
2509 error = EOPNOTSUPP;
2510#endif
2511 break;
2512
2513 case SO_LISTENQLIMIT:
2514 optval = so->so_qlimit;
2515 goto integer;
2516
2517 case SO_LISTENQLEN:
2518 optval = so->so_qlen;
2519 goto integer;
2520
2521 case SO_LISTENINCQLEN:
2522 optval = so->so_incqlen;
2523 goto integer;
2524
2525 default:
2526 error = ENOPROTOOPT;
2527 break;
2528 }
2529 return (error);
2530 }
2531}
2532
2533/* XXX: prepare mbuf for (__FreeBSD__ < 3) routines. */
2534int
2535soopt_getm(struct sockopt *sopt, struct mbuf **mp)
2536{
2537 struct mbuf *m, *m_prev;
2538 int sopt_size = sopt->sopt_valsize;
2539
2540 MGET(m, sopt->sopt_td ? M_WAIT : M_DONTWAIT, MT_DATA);
2541 if (m == NULL)
2542 return ENOBUFS;
2543 if (sopt_size > MLEN) {
2544 MCLGET(m, sopt->sopt_td ? M_WAIT : M_DONTWAIT);
2545 if ((m->m_flags & M_EXT) == 0) {
2546 m_free(m);
2547 return ENOBUFS;
2548 }
2549 m->m_len = min(MCLBYTES, sopt_size);
2550 } else {
2551 m->m_len = min(MLEN, sopt_size);
2552 }
2553 sopt_size -= m->m_len;
2554 *mp = m;
2555 m_prev = m;
2556
2557 while (sopt_size) {
2558 MGET(m, sopt->sopt_td ? M_WAIT : M_DONTWAIT, MT_DATA);
2559 if (m == NULL) {
2560 m_freem(*mp);
2561 return ENOBUFS;
2562 }
2563 if (sopt_size > MLEN) {
2564 MCLGET(m, sopt->sopt_td != NULL ? M_WAIT :
2565 M_DONTWAIT);
2566 if ((m->m_flags & M_EXT) == 0) {
2567 m_freem(m);
2568 m_freem(*mp);
2569 return ENOBUFS;
2570 }
2571 m->m_len = min(MCLBYTES, sopt_size);
2572 } else {
2573 m->m_len = min(MLEN, sopt_size);
2574 }
2575 sopt_size -= m->m_len;
2576 m_prev->m_next = m;
2577 m_prev = m;
2578 }
2579 return (0);
2580}
2581
2582/* XXX: copyin sopt data into mbuf chain for (__FreeBSD__ < 3) routines. */
2583int
2584soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
2585{
2586 struct mbuf *m0 = m;
2587
2588 if (sopt->sopt_val == NULL)
2589 return (0);
2590 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
2591 if (sopt->sopt_td != NULL) {
2592 int error;
2593
2594 error = copyin(sopt->sopt_val, mtod(m, char *),
2595 m->m_len);
2596 if (error != 0) {
2597 m_freem(m0);
2598 return(error);
2599 }
2600 } else
2601 bcopy(sopt->sopt_val, mtod(m, char *), m->m_len);
2602 sopt->sopt_valsize -= m->m_len;
2603 sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
2604 m = m->m_next;
2605 }
2606	if (m != NULL)	/* enough should have been allocated at ip6_sooptmcopyin() */
2607 panic("ip6_sooptmcopyin");
2608 return (0);
2609}
2610
2611/* XXX: copyout mbuf chain data into soopt for (__FreeBSD__ < 3) routines. */
2612int
2613soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
2614{
2615 struct mbuf *m0 = m;
2616 size_t valsize = 0;
2617
2618 if (sopt->sopt_val == NULL)
2619 return (0);
2620 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
2621 if (sopt->sopt_td != NULL) {
2622 int error;
2623
2624 error = copyout(mtod(m, char *), sopt->sopt_val,
2625 m->m_len);
2626 if (error != 0) {
2627 m_freem(m0);
2628 return(error);
2629 }
2630 } else
2631 bcopy(mtod(m, char *), sopt->sopt_val, m->m_len);
2632 sopt->sopt_valsize -= m->m_len;
2633 sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
2634 valsize += m->m_len;
2635 m = m->m_next;
2636 }
2637 if (m != NULL) {
2638		/* a large enough soopt buffer should have been supplied from user-land */
2639 m_freem(m0);
2640 return(EINVAL);
2641 }
2642 sopt->sopt_valsize = valsize;
2643 return (0);
2644}
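
/*
 * Illustrative sketch only (not part of this file): the three routines
 * above are used together by legacy, mbuf-based option code (notably in
 * the IPv6 stack).  Note that soopt_mcopyin() and soopt_mcopyout()
 * advance sopt_val and decrement sopt_valsize as they copy; 'reply_m'
 * below is a hypothetical protocol-built chain.
 */
#if 0
	/* SOPT_SET direction: pull the user's option data into mbufs. */
	struct mbuf *m = NULL;
	int error;

	error = soopt_getm(sopt, &m);		/* chain sized to sopt_valsize */
	if (error == 0)
		error = soopt_mcopyin(sopt, m);	/* user buffer -> mbuf chain */
	/* ... hand 'm' to the protocol's option parser ... */

	/* SOPT_GET direction: copy a protocol-built chain back out. */
	error = soopt_mcopyout(sopt, reply_m);	/* mbuf chain -> user buffer */
#endif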
2645
2646/*
2647 * sohasoutofband(): protocol notifies socket layer of the arrival of new
2648 * out-of-band data, which will then notify socket consumers.
2649 */
2650void
2651sohasoutofband(struct socket *so)
2652{
2653
2654 if (so->so_sigio != NULL)
2655 pgsigio(&so->so_sigio, SIGURG, 0);
2656 selwakeuppri(&so->so_rcv.sb_sel, PSOCK);
2657}
2658
2659int
2660sopoll(struct socket *so, int events, struct ucred *active_cred,
2661 struct thread *td)
2662{
2663
2664 return (so->so_proto->pr_usrreqs->pru_sopoll(so, events, active_cred,
2665 td));
2666}
2667
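/*
 * Default poll implementation: both socket buffer locks are held (send
 * buffer first, then receive buffer) so that the readable/writable tests
 * and any selrecord() registration happen atomically with respect to
 * protocol-side wakeups.
 */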
2668int
2669sopoll_generic(struct socket *so, int events, struct ucred *active_cred,
2670 struct thread *td)
2671{
2672 int revents = 0;
2673
2674 SOCKBUF_LOCK(&so->so_snd);
2675 SOCKBUF_LOCK(&so->so_rcv);
2676 if (events & (POLLIN | POLLRDNORM))
2677 if (soreadable(so))
2678 revents |= events & (POLLIN | POLLRDNORM);
2679
2680 if (events & POLLINIGNEOF)
2681 if (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat ||
2682 !TAILQ_EMPTY(&so->so_comp) || so->so_error)
2683 revents |= POLLINIGNEOF;
2684
2685 if (events & (POLLOUT | POLLWRNORM))
2686 if (sowriteable(so))
2687 revents |= events & (POLLOUT | POLLWRNORM);
2688
2689 if (events & (POLLPRI | POLLRDBAND))
2690 if (so->so_oobmark || (so->so_rcv.sb_state & SBS_RCVATMARK))
2691 revents |= events & (POLLPRI | POLLRDBAND);
2692
2693 if (revents == 0) {
2694 if (events &
2695 (POLLIN | POLLINIGNEOF | POLLPRI | POLLRDNORM |
2696 POLLRDBAND)) {
2697 selrecord(td, &so->so_rcv.sb_sel);
2698 so->so_rcv.sb_flags |= SB_SEL;
2699 }
2700
2701 if (events & (POLLOUT | POLLWRNORM)) {
2702 selrecord(td, &so->so_snd.sb_sel);
2703 so->so_snd.sb_flags |= SB_SEL;
2704 }
2705 }
2706
2707 SOCKBUF_UNLOCK(&so->so_rcv);
2708 SOCKBUF_UNLOCK(&so->so_snd);
2709 return (revents);
2710}
2711
2712int
2713soo_kqfilter(struct file *fp, struct knote *kn)
2714{
2715 struct socket *so = kn->kn_fp->f_data;
2716 struct sockbuf *sb;
2717
2718 switch (kn->kn_filter) {
2719 case EVFILT_READ:
2720 if (so->so_options & SO_ACCEPTCONN)
2721 kn->kn_fop = &solisten_filtops;
2722 else
2723 kn->kn_fop = &soread_filtops;
2724 sb = &so->so_rcv;
2725 break;
2726 case EVFILT_WRITE:
2727 kn->kn_fop = &sowrite_filtops;
2728 sb = &so->so_snd;
2729 break;
2730 default:
2731 return (EINVAL);
2732 }
2733
2734 SOCKBUF_LOCK(sb);
2735 knlist_add(&sb->sb_sel.si_note, kn, 1);
2736 sb->sb_flags |= SB_KNOTE;
2737 SOCKBUF_UNLOCK(sb);
2738 return (0);
2739}
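
/*
 * Illustrative user-level usage (not part of this file): a registration
 * such as
 *
 *	EV_SET(&kev, s, EVFILT_READ, EV_ADD, 0, 0, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);
 *
 * reaches soo_kqfilter() above, which installs soread_filtops (or
 * solisten_filtops for a listening socket) on the receive buffer's knote
 * list; 's' and 'kq' are assumed to be an open socket and kqueue
 * descriptor.
 */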
2740
2741/*
2742 * Some routines that return EOPNOTSUPP for entry points that are not
2743 * supported by a protocol. Fill in as needed.
2744 */
2745int
2746pru_accept_notsupp(struct socket *so, struct sockaddr **nam)
2747{
2748
2749 return EOPNOTSUPP;
2750}
2751
2752int
2753pru_attach_notsupp(struct socket *so, int proto, struct thread *td)
2754{
2755
2756 return EOPNOTSUPP;
2757}
2758
2759int
2760pru_bind_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td)
2761{
2762
2763 return EOPNOTSUPP;
2764}
2765
2766int
2767pru_connect_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td)
2768{
2769
2770 return EOPNOTSUPP;
2771}
2772
2773int
2774pru_connect2_notsupp(struct socket *so1, struct socket *so2)
2775{
2776
2777 return EOPNOTSUPP;
2778}
2779
2780int
2781pru_control_notsupp(struct socket *so, u_long cmd, caddr_t data,
2782 struct ifnet *ifp, struct thread *td)
2783{
2784
2785 return EOPNOTSUPP;
2786}
2787
2788int
2789pru_disconnect_notsupp(struct socket *so)
2790{
2791
2792 return EOPNOTSUPP;
2793}
2794
2795int
2796pru_listen_notsupp(struct socket *so, int backlog, struct thread *td)
2797{
2798
2799 return EOPNOTSUPP;
2800}
2801
2802int
2803pru_peeraddr_notsupp(struct socket *so, struct sockaddr **nam)
2804{
2805
2806 return EOPNOTSUPP;
2807}
2808
2809int
2810pru_rcvd_notsupp(struct socket *so, int flags)
2811{
2812
2813 return EOPNOTSUPP;
2814}
2815
2816int
2817pru_rcvoob_notsupp(struct socket *so, struct mbuf *m, int flags)
2818{
2819
2820 return EOPNOTSUPP;
2821}
2822
2823int
2824pru_send_notsupp(struct socket *so, int flags, struct mbuf *m,
2825 struct sockaddr *addr, struct mbuf *control, struct thread *td)
2826{
2827
2828 return EOPNOTSUPP;
2829}
2830
2831/*
2832 * This isn't really a ``null'' operation, but it's the default one and
2833 * doesn't do anything destructive.
2834 */
2835int
2836pru_sense_null(struct socket *so, struct stat *sb)
2837{
2838
2839 sb->st_blksize = so->so_snd.sb_hiwat;
2840 return 0;
2841}
2842
2843int
2844pru_shutdown_notsupp(struct socket *so)
2845{
2846
2847 return EOPNOTSUPP;
2848}
2849
2850int
2851pru_sockaddr_notsupp(struct socket *so, struct sockaddr **nam)
2852{
2853
2854 return EOPNOTSUPP;
2855}
2856
2857int
2858pru_sosend_notsupp(struct socket *so, struct sockaddr *addr, struct uio *uio,
2859 struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
2860{
2861
2862 return EOPNOTSUPP;
2863}
2864
2865int
2866pru_soreceive_notsupp(struct socket *so, struct sockaddr **paddr,
2867 struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
2868{
2869
2870 return EOPNOTSUPP;
2871}
2872
2873int
2874pru_sopoll_notsupp(struct socket *so, int events, struct ucred *cred,
2875 struct thread *td)
2876{
2877
2878 return EOPNOTSUPP;
2879}
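
/*
 * Illustrative sketch only (not part of this file): a protocol that
 * implements only a subset of the user-request operations can point the
 * remaining pr_usrreqs entries at the stubs above; the structure name
 * below is hypothetical.
 */
#if 0
static struct pr_usrreqs example_usrreqs = {
	.pru_accept =		pru_accept_notsupp,
	.pru_connect2 =		pru_connect2_notsupp,
	.pru_rcvoob =		pru_rcvoob_notsupp,
	.pru_sense =		pru_sense_null,
	/* ... entries the protocol actually implements ... */
};
#endif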
2880
2881static void
2882filt_sordetach(struct knote *kn)
2883{
2884 struct socket *so = kn->kn_fp->f_data;
2885
2886 SOCKBUF_LOCK(&so->so_rcv);
2887 knlist_remove(&so->so_rcv.sb_sel.si_note, kn, 1);
2888 if (knlist_empty(&so->so_rcv.sb_sel.si_note))
2889 so->so_rcv.sb_flags &= ~SB_KNOTE;
2890 SOCKBUF_UNLOCK(&so->so_rcv);
2891}
2892
2893/*ARGSUSED*/
2894static int
2895filt_soread(struct knote *kn, long hint)
2896{
2897 struct socket *so;
2898
2899 so = kn->kn_fp->f_data;
2900 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2901
2902 kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
2903 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
2904 kn->kn_flags |= EV_EOF;
2905 kn->kn_fflags = so->so_error;
2906 return (1);
2907 } else if (so->so_error) /* temporary udp error */
2908 return (1);
2909 else if (kn->kn_sfflags & NOTE_LOWAT)
2910 return (kn->kn_data >= kn->kn_sdata);
2911 else
2912 return (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat);
2913}
2914
2915static void
2916filt_sowdetach(struct knote *kn)
2917{
2918 struct socket *so = kn->kn_fp->f_data;
2919
2920 SOCKBUF_LOCK(&so->so_snd);
2921 knlist_remove(&so->so_snd.sb_sel.si_note, kn, 1);
2922 if (knlist_empty(&so->so_snd.sb_sel.si_note))
2923 so->so_snd.sb_flags &= ~SB_KNOTE;
2924 SOCKBUF_UNLOCK(&so->so_snd);
2925}
2926
2927/*ARGSUSED*/
2928static int
2929filt_sowrite(struct knote *kn, long hint)
2930{
2931 struct socket *so;
2932
2933 so = kn->kn_fp->f_data;
2934 SOCKBUF_LOCK_ASSERT(&so->so_snd);
2935 kn->kn_data = sbspace(&so->so_snd);
2936 if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
2937 kn->kn_flags |= EV_EOF;
2938 kn->kn_fflags = so->so_error;
2939 return (1);
2940 } else if (so->so_error) /* temporary udp error */
2941 return (1);
2942 else if (((so->so_state & SS_ISCONNECTED) == 0) &&
2943 (so->so_proto->pr_flags & PR_CONNREQUIRED))
2944 return (0);
2945 else if (kn->kn_sfflags & NOTE_LOWAT)
2946 return (kn->kn_data >= kn->kn_sdata);
2947 else
2948 return (kn->kn_data >= so->so_snd.sb_lowat);
2949}
2950
2951/*ARGSUSED*/
2952static int
2953filt_solisten(struct knote *kn, long hint)
2954{
2955 struct socket *so = kn->kn_fp->f_data;
2956
2957 kn->kn_data = so->so_qlen;
2958 return (! TAILQ_EMPTY(&so->so_comp));
2959}
2960
2961int
2962socheckuid(struct socket *so, uid_t uid)
2963{
2964
2965 if (so == NULL)
2966 return (EPERM);
2967 if (so->so_cred->cr_uid != uid)
2968 return (EPERM);
2969 return (0);
2970}
2971
2972static int
2973sysctl_somaxconn(SYSCTL_HANDLER_ARGS)
2974{
2975 int error;
2976 int val;
2977
2978 val = somaxconn;
2979 error = sysctl_handle_int(oidp, &val, 0, req);
2980	if (error || !req->newptr)
2981 return (error);
2982
2983 if (val < 1 || val > USHRT_MAX)
2984 return (EINVAL);
2985
2986 somaxconn = val;
2987 return (0);
2988}
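
/*
 * The handler above backs the somaxconn sysctl (presumably declared
 * elsewhere in this file); for example, "sysctl kern.ipc.somaxconn=1024"
 * from user-land, assuming that OID name.  Values outside
 * [1, USHRT_MAX] are rejected with EINVAL.
 */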
2989
2990/*
2991 * These functions are used by protocols to notify the socket layer (and its
2992 * consumers) of state changes in the sockets driven by protocol-side events.
2993 */
2994
2995/*
2996 * Procedures to manipulate state flags of socket and do appropriate wakeups.
2997 *
2998 * Normal sequence from the active (originating) side is that
2999 * soisconnecting() is called during processing of connect() call, resulting
3000 * in an eventual call to soisconnected() if/when the connection is
3001 * established. When the connection is torn down soisdisconnecting() is
3002 * called during processing of disconnect() call, and soisdisconnected() is
3003 * called when the connection to the peer is totally severed. The semantics
3004 * of these routines are such that connectionless protocols can call
3005 * soisconnected() and soisdisconnected() only, bypassing the in-progress
3006 * calls when setting up a ``connection'' takes no time.
3007 *
3008 * From the passive side, a socket is created with two queues of sockets:
3009 * so_incomp for connections in progress and so_comp for connections already
3010 * made and awaiting user acceptance. As a protocol is preparing incoming
3011 * connections, it creates a socket structure queued on so_incomp by calling
3012 * sonewconn(). When the connection is established, soisconnected() is
3013 * called, and transfers the socket structure to so_comp, making it available
3014 * to accept().
3015 *
3016 * If a socket is closed with sockets on either so_incomp or so_comp, these
3017 * sockets are dropped.
3018 *
3019 * If higher-level protocols are implemented in the kernel, the wakeups done
3020 * here will sometimes cause software-interrupt process scheduling.
3021 */
3022void
3023soisconnecting(struct socket *so)
3024{
3025
3026 SOCK_LOCK(so);
3027 so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
3028 so->so_state |= SS_ISCONNECTING;
3029 SOCK_UNLOCK(so);
3030}
3031
3032void
3033soisconnected(struct socket *so)
3034{
3035 struct socket *head;
3036
3037 ACCEPT_LOCK();
3038 SOCK_LOCK(so);
3039 so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING);
3040 so->so_state |= SS_ISCONNECTED;
3041 head = so->so_head;
3042 if (head != NULL && (so->so_qstate & SQ_INCOMP)) {
3043 if ((so->so_options & SO_ACCEPTFILTER) == 0) {
3044 SOCK_UNLOCK(so);
3045 TAILQ_REMOVE(&head->so_incomp, so, so_list);
3046 head->so_incqlen--;
3047 so->so_qstate &= ~SQ_INCOMP;
3048 TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
3049 head->so_qlen++;
3050 so->so_qstate |= SQ_COMP;
3051 ACCEPT_UNLOCK();
3052 sorwakeup(head);
3053 wakeup_one(&head->so_timeo);
3054 } else {
3055 ACCEPT_UNLOCK();
3056 so->so_upcall =
3057 head->so_accf->so_accept_filter->accf_callback;
3058 so->so_upcallarg = head->so_accf->so_accept_filter_arg;
3059 so->so_rcv.sb_flags |= SB_UPCALL;
3060 so->so_options &= ~SO_ACCEPTFILTER;
3061 SOCK_UNLOCK(so);
3062 so->so_upcall(so, so->so_upcallarg, M_DONTWAIT);
3063 }
3064 return;
3065 }
3066 SOCK_UNLOCK(so);
3067 ACCEPT_UNLOCK();
3068 wakeup(&so->so_timeo);
3069 sorwakeup(so);
3070 sowwakeup(so);
3071}
3072
3073void
3074soisdisconnecting(struct socket *so)
3075{
3076
3077 /*
3078 * Note: This code assumes that SOCK_LOCK(so) and
3079 * SOCKBUF_LOCK(&so->so_rcv) are the same.
3080 */
3081 SOCKBUF_LOCK(&so->so_rcv);
3082 so->so_state &= ~SS_ISCONNECTING;
3083 so->so_state |= SS_ISDISCONNECTING;
3084 so->so_rcv.sb_state |= SBS_CANTRCVMORE;
3085 sorwakeup_locked(so);
3086 SOCKBUF_LOCK(&so->so_snd);
3087 so->so_snd.sb_state |= SBS_CANTSENDMORE;
3088 sowwakeup_locked(so);
3089 wakeup(&so->so_timeo);
3090}
3091
3092void
3093soisdisconnected(struct socket *so)
3094{
3095
3096 /*
3097 * Note: This code assumes that SOCK_LOCK(so) and
3098 * SOCKBUF_LOCK(&so->so_rcv) are the same.
3099 */
3100 SOCKBUF_LOCK(&so->so_rcv);
3101 so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
3102 so->so_state |= SS_ISDISCONNECTED;
3103 so->so_rcv.sb_state |= SBS_CANTRCVMORE;
3104 sorwakeup_locked(so);
3105 SOCKBUF_LOCK(&so->so_snd);
3106 so->so_snd.sb_state |= SBS_CANTSENDMORE;
3107 sbdrop_locked(&so->so_snd, so->so_snd.sb_cc);
3108 sowwakeup_locked(so);
3109 wakeup(&so->so_timeo);
3110}
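
/*
 * Illustrative call sequence (hypothetical connection-oriented protocol,
 * not part of this file) showing how the four routines above are driven
 * from the protocol side.
 */
#if 0
	/* Active side, in pru_connect() after sending the request: */
	soisconnecting(so);

	/* From protocol input processing once the peer has answered: */
	soisconnected(so);

	/* Teardown: local close in progress, then fully severed: */
	soisdisconnecting(so);
	soisdisconnected(so);
#endif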
3111
3112/*
3113 * Make a copy of a sockaddr in a malloced buffer of type M_SONAME.
3114 */
3115struct sockaddr *
3116sodupsockaddr(const struct sockaddr *sa, int mflags)
3117{
3118 struct sockaddr *sa2;
3119
3120 sa2 = malloc(sa->sa_len, M_SONAME, mflags);
3121 if (sa2)
3122 bcopy(sa, sa2, sa->sa_len);
3123 return sa2;
3124}
3125
3126/*
3127 * Create an external-format (``xsocket'') structure using the information in
3128 * the kernel-format socket structure pointed to by so. This is done to
3129 * reduce the spew of irrelevant information over this interface, to isolate
3130 * user code from changes in the kernel structure, and potentially to provide
3131 * information-hiding if we decide that some of this information should be
3132 * hidden from users.
3133 */
3134void
3135sotoxsocket(struct socket *so, struct xsocket *xso)
3136{
3137
3138 xso->xso_len = sizeof *xso;
3139 xso->xso_so = so;
3140 xso->so_type = so->so_type;
3141 xso->so_options = so->so_options;
3142 xso->so_linger = so->so_linger;
3143 xso->so_state = so->so_state;
3144 xso->so_pcb = so->so_pcb;
3145 xso->xso_protocol = so->so_proto->pr_protocol;
3146 xso->xso_family = so->so_proto->pr_domain->dom_family;
3147 xso->so_qlen = so->so_qlen;
3148 xso->so_incqlen = so->so_incqlen;
3149 xso->so_qlimit = so->so_qlimit;
3150 xso->so_timeo = so->so_timeo;
3151 xso->so_error = so->so_error;
3152 xso->so_pgid = so->so_sigio ? so->so_sigio->sio_pgid : 0;
3153 xso->so_oobmark = so->so_oobmark;
3154 sbtoxsockbuf(&so->so_snd, &xso->so_snd);
3155 sbtoxsockbuf(&so->so_rcv, &xso->so_rcv);
3156 xso->so_uid = so->so_cred->cr_uid;
3157}
3158
3159
3160/*
3161 * Socket accessor functions to provide external consumers with
3162 * a safe interface to socket state.
3163 */
3165
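/*
 * Apply 'func' to every socket on a listening socket's completed
 * connection queue (so_comp).  Note that the loop below reuses the 'so'
 * parameter as the iterator, so 'func' sees the queued sockets, not the
 * listening socket itself.
 */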
3166void
3167so_listeners_apply_all(struct socket *so, void (*func)(struct socket *, void *), void *arg)
3168{
3169
3170 TAILQ_FOREACH(so, &so->so_comp, so_list)
3171 func(so, arg);
3172}
3173
3174struct sockbuf *
3175so_sockbuf_rcv(struct socket *so)
3176{
3177
3178 return (&so->so_rcv);
3179}
3180
3181struct sockbuf *
3182so_sockbuf_snd(struct socket *so)
3183{
3184
3185 return (&so->so_snd);
3186}
3187
3188int
3189so_state_get(const struct socket *so)
3190{
3191
3192 return (so->so_state);
3193}
3194
3195void
3196so_state_set(struct socket *so, int val)
3197{
3198
3199 so->so_state = val;
3200}
3201
3202int
3203so_options_get(const struct socket *so)
3204{
3205
3206 return (so->so_options);
3207}
3208
3209void
3210so_options_set(struct socket *so, int val)
3211{
3212
3213 so->so_options = val;
3214}
3215
3216int
3217so_error_get(const struct socket *so)
3218{
3219
3220 return (so->so_error);
3221}
3222
3223void
3224so_error_set(struct socket *so, int val)
3225{
3226
3227 so->so_error = val;
3228}
3229
3230int
3231so_linger_get(const struct socket *so)
3232{
3233
3234 return (so->so_linger);
3235}
3236
3237void
3238so_linger_set(struct socket *so, int val)
3239{
3240
3241 so->so_linger = val;
3242}
3243
3244struct protosw *
3245so_protosw_get(const struct socket *so)
3246{
3247
3248 return (so->so_proto);
3249}
3250
3251void
3252so_protosw_set(struct socket *so, struct protosw *val)
3253{
3254
3255 so->so_proto = val;
3256}
3257
3258void
3259so_sorwakeup(struct socket *so)
3260{
3261
3262 sorwakeup(so);
3263}
3264
3265void
3266so_sowwakeup(struct socket *so)
3267{
3268
3269 sowwakeup(so);
3270}
3271
3272void
3273so_sorwakeup_locked(struct socket *so)
3274{
3275
3276 sorwakeup_locked(so);
3277}
3278
3279void
3280so_sowwakeup_locked(struct socket *so)
3281{
3282
3283 sowwakeup_locked(so);
3284}
3285
3286void
3287so_lock(struct socket *so)
3288{
3289 SOCK_LOCK(so);
3290}
3291
3292void
3293so_unlock(struct socket *so)
3294{
3295 SOCK_UNLOCK(so);
3296}