uipc_socket.c (169375) vs. uipc_socket.c (169624)
1/*-
2 * Copyright (c) 1982, 1986, 1988, 1990, 1993
3 * The Regents of the University of California.
4 * Copyright (c) 2004 The FreeBSD Foundation
5 * Copyright (c) 2004-2007 Robert N. M. Watson
6 * All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 * 4. Neither the name of the University nor the names of its contributors
17 * may be used to endorse or promote products derived from this software
18 * without specific prior written permission.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30 * SUCH DAMAGE.
31 *
32 * @(#)uipc_socket.c 8.3 (Berkeley) 4/15/94
33 */
34
35/*
36 * Comments on the socket life cycle:
37 *
38 * soalloc() sets up socket layer state for a socket, called only by
39 * socreate() and sonewconn(). Socket layer private.
40 *
41 * sodealloc() tears down socket layer state for a socket, called only by
42 * sofree() and sonewconn(). Socket layer private.
43 *
44 * pru_attach() associates protocol layer state with an allocated socket;
45 * called only once, may fail, aborting socket allocation. This is called
46 * from socreate() and sonewconn(). Socket layer private.
47 *
48 * pru_detach() disassociates protocol layer state from an attached socket,
49 * and will be called exactly once for sockets in which pru_attach() has
50 * been successfully called. If pru_attach() returned an error,
51 * pru_detach() will not be called. Socket layer private.
52 *
53 * pru_abort() and pru_close() notify the protocol layer that the last
54 * consumer of a socket is starting to tear down the socket, and that the
55 * protocol should terminate the connection. Historically, pru_abort() also
56 * detached protocol state from the socket state, but this is no longer the
57 * case.
58 *
59 * socreate() creates a socket and attaches protocol state. This is a public
60 * interface that may be used by socket layer consumers to create new
61 * sockets.
62 *
63 * sonewconn() creates a socket and attaches protocol state. This is a
64 * public interface that may be used by protocols to create new sockets when
65 * a new connection is received and will be available for accept() on a
66 * listen socket.
67 *
68 * soclose() destroys a socket after possibly waiting for it to disconnect.
69 * This is a public interface that socket consumers should use to close and
70 * release a socket when done with it.
71 *
72 * soabort() destroys a socket without waiting for it to disconnect (used
73 * only for incoming connections that are already partially or fully
74 * connected). This is used internally by the socket layer when clearing
75 * listen socket queues (due to overflow or close on the listen socket), but
76 * is also a public interface protocols may use to abort connections in
77 * their incomplete listen queues should they no longer be required. Sockets
78 * placed in completed connection listen queues should not be aborted for
79 * reasons described in the comment above the soclose() implementation. This
80 * is not a general purpose close routine, and except in the specific
81 * circumstances described here, should not be used.
82 *
83 * sofree() will free a socket and its protocol state if all references on
84 * the socket have been released, and is the public interface to attempt to
85 * free a socket when a reference is removed. This is a socket layer private
86 * interface.
87 *
88 * NOTE: In addition to socreate() and soclose(), which provide a single
89 * socket reference to the consumer to be managed as required, there are two
90 * calls to explicitly manage socket references, soref(), and sorele().
91 * Currently, these are generally required only when transitioning a socket
92 * from a listen queue to a file descriptor, in order to prevent garbage
93 * collection of the socket at an untimely moment. For a number of reasons,
94 * these interfaces are not preferred, and should be avoided.
95 */
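A minimal sketch of the public create/close cycle described above, from a
kernel consumer's point of view (an illustration, not part of this file;
assumes a valid thread context td):

	struct socket *so;
	int error;

	/* Illustrative only: create a TCP socket carrying one reference. */
	error = socreate(AF_INET, &so, SOCK_STREAM, IPPROTO_TCP,
	    td->td_ucred, td);
	if (error)
		return (error);
	/* ... use the socket via sobind(), soconnect(), etc. ... */
	/* Drop the single reference; the socket is freed once it hits 0. */
	error = soclose(so);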
96
97#include <sys/cdefs.h>
98__FBSDID("$FreeBSD: head/sys/kern/uipc_socket.c 169375 2007-05-08 12:34:14Z yongari $");
98__FBSDID("$FreeBSD: head/sys/kern/uipc_socket.c 169624 2007-05-16 20:41:08Z rwatson $");
99
100#include "opt_inet.h"
101#include "opt_mac.h"
102#include "opt_zero.h"
103#include "opt_compat.h"
104
105#include <sys/param.h>
106#include <sys/systm.h>
107#include <sys/fcntl.h>
108#include <sys/limits.h>
109#include <sys/lock.h>
110#include <sys/mac.h>
111#include <sys/malloc.h>
112#include <sys/mbuf.h>
113#include <sys/mutex.h>
114#include <sys/domain.h>
115#include <sys/file.h> /* for struct knote */
116#include <sys/kernel.h>
117#include <sys/event.h>
118#include <sys/eventhandler.h>
119#include <sys/poll.h>
120#include <sys/proc.h>
121#include <sys/protosw.h>
122#include <sys/socket.h>
123#include <sys/socketvar.h>
124#include <sys/resourcevar.h>
125#include <sys/signalvar.h>
126#include <sys/stat.h>
127#include <sys/sx.h>
128#include <sys/sysctl.h>
129#include <sys/uio.h>
130#include <sys/jail.h>
131
132#include <security/mac/mac_framework.h>
133
134#include <vm/uma.h>
135
136#ifdef COMPAT_IA32
137#include <sys/mount.h>
138#include <compat/freebsd32/freebsd32.h>
139
140extern struct sysentvec ia32_freebsd_sysvec;
141#endif
142
143static int soreceive_rcvoob(struct socket *so, struct uio *uio,
144 int flags);
145
146static void filt_sordetach(struct knote *kn);
147static int filt_soread(struct knote *kn, long hint);
148static void filt_sowdetach(struct knote *kn);
149static int filt_sowrite(struct knote *kn, long hint);
150static int filt_solisten(struct knote *kn, long hint);
151
152static struct filterops solisten_filtops =
153 { 1, NULL, filt_sordetach, filt_solisten };
154static struct filterops soread_filtops =
155 { 1, NULL, filt_sordetach, filt_soread };
156static struct filterops sowrite_filtops =
157 { 1, NULL, filt_sowdetach, filt_sowrite };
158
159uma_zone_t socket_zone;
160so_gen_t so_gencnt; /* generation count for sockets */
161
162int maxsockets;
163
164MALLOC_DEFINE(M_SONAME, "soname", "socket name");
165MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
166
167static int somaxconn = SOMAXCONN;
168static int sysctl_somaxconn(SYSCTL_HANDLER_ARGS);
169/* XXX: we don't have SYSCTL_USHORT */
170SYSCTL_PROC(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLTYPE_UINT | CTLFLAG_RW,
171 0, sizeof(int), sysctl_somaxconn, "I", "Maximum pending socket connection "
172 "queue size");
173static int numopensockets;
174SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD,
175 &numopensockets, 0, "Number of open sockets");
176#ifdef ZERO_COPY_SOCKETS
177/* These aren't static because they're used in other files. */
178int so_zero_copy_send = 1;
179int so_zero_copy_receive = 1;
180SYSCTL_NODE(_kern_ipc, OID_AUTO, zero_copy, CTLFLAG_RD, 0,
181 "Zero copy controls");
182SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, receive, CTLFLAG_RW,
183 &so_zero_copy_receive, 0, "Enable zero copy receive");
184SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, send, CTLFLAG_RW,
185 &so_zero_copy_send, 0, "Enable zero copy send");
186#endif /* ZERO_COPY_SOCKETS */
187
188/*
189 * accept_mtx locks down per-socket fields relating to accept queues. See
190 * socketvar.h for an annotation of the protected fields of struct socket.
191 */
192struct mtx accept_mtx;
193MTX_SYSINIT(accept_mtx, &accept_mtx, "accept", MTX_DEF);
194
195/*
196 * so_global_mtx protects so_gencnt, numopensockets, and the per-socket
197 * so_gencnt field.
198 */
199static struct mtx so_global_mtx;
200MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_glabel", MTX_DEF);
201
202/*
203 * General IPC sysctl name space, used by sockets and a variety of other IPC
204 * types.
205 */
206SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW, 0, "IPC");
207
208/*
209 * Sysctl to get and set the maximum global sockets limit. Notify protocols
210 * of the change so that they can update their dependent limits as required.
211 */
212static int
213sysctl_maxsockets(SYSCTL_HANDLER_ARGS)
214{
215 int error, newmaxsockets;
216
217 newmaxsockets = maxsockets;
218 error = sysctl_handle_int(oidp, &newmaxsockets, sizeof(int), req);
219 if (error == 0 && req->newptr) {
220 if (newmaxsockets > maxsockets) {
221 maxsockets = newmaxsockets;
222 if (maxsockets > ((maxfiles / 4) * 3)) {
223 maxfiles = (maxsockets * 5) / 4;
224 maxfilesperproc = (maxfiles * 9) / 10;
225 }
226 EVENTHANDLER_INVOKE(maxsockets_change);
227 } else
228 error = EINVAL;
229 }
230 return (error);
231}
232
233SYSCTL_PROC(_kern_ipc, OID_AUTO, maxsockets, CTLTYPE_INT|CTLFLAG_RW,
234 &maxsockets, 0, sysctl_maxsockets, "IU",
235 "Maximum number of sockets avaliable");
236
237/*
238 * Initialise maxsockets.
239 */
240static void init_maxsockets(void *ignored)
241{
242 TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets);
243 maxsockets = imax(maxsockets, imax(maxfiles, nmbclusters));
244}
245SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_maxsockets, NULL);
246
247/*
248 * Socket operation routines. These routines are called by the routines in
249 * sys_socket.c or from a system process, and implement the semantics of
250 * socket operations by switching out to the protocol specific routines.
251 */
252
253/*
254 * Get a socket structure from our zone, and initialize it. Note that it
255 * would probably be better to allocate socket and PCB at the same time, but
256 * I'm not convinced that all the protocols can be easily modified to do
257 * this.
258 *
259 * soalloc() returns a socket with a ref count of 0.
260 */
261static struct socket *
262soalloc(void)
263{
264 struct socket *so;
265
266 so = uma_zalloc(socket_zone, M_NOWAIT | M_ZERO);
267 if (so == NULL)
268 return (NULL);
269#ifdef MAC
270 if (mac_init_socket(so, M_NOWAIT) != 0) {
271 uma_zfree(socket_zone, so);
272 return (NULL);
273 }
274#endif
275 SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd");
276 SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv");
277 sx_init(&so->so_snd.sb_sx, "so_snd_sx");
278 sx_init(&so->so_rcv.sb_sx, "so_rcv_sx");
279 TAILQ_INIT(&so->so_aiojobq);
280 mtx_lock(&so_global_mtx);
281 so->so_gencnt = ++so_gencnt;
282 ++numopensockets;
283 mtx_unlock(&so_global_mtx);
284 return (so);
285}
286
287/*
288 * Free the storage associated with a socket at the socket layer, tear down
289 * locks, labels, etc. All protocol state is assumed already to have been
290 * torn down (and possibly never set up) by the caller.
291 */
292static void
293sodealloc(struct socket *so)
294{
295
296 KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count));
297 KASSERT(so->so_pcb == NULL, ("sodealloc(): so_pcb != NULL"));
298
299 mtx_lock(&so_global_mtx);
300 so->so_gencnt = ++so_gencnt;
301 --numopensockets; /* Could be below, but faster here. */
302 mtx_unlock(&so_global_mtx);
303 if (so->so_rcv.sb_hiwat)
304 (void)chgsbsize(so->so_cred->cr_uidinfo,
305 &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY);
306 if (so->so_snd.sb_hiwat)
307 (void)chgsbsize(so->so_cred->cr_uidinfo,
308 &so->so_snd.sb_hiwat, 0, RLIM_INFINITY);
309#ifdef INET
310	/* remove accept filter if one is present. */
311 if (so->so_accf != NULL)
312 do_setopt_accept_filter(so, NULL);
313#endif
314#ifdef MAC
315 mac_destroy_socket(so);
316#endif
317 crfree(so->so_cred);
318 sx_destroy(&so->so_snd.sb_sx);
319 sx_destroy(&so->so_rcv.sb_sx);
320 SOCKBUF_LOCK_DESTROY(&so->so_snd);
321 SOCKBUF_LOCK_DESTROY(&so->so_rcv);
322 uma_zfree(socket_zone, so);
323}
324
325/*
326 * socreate returns a socket with a ref count of 1. The socket should be
327 * closed with soclose().
328 */
329int
330socreate(dom, aso, type, proto, cred, td)
331 int dom;
332 struct socket **aso;
333 int type;
334 int proto;
335 struct ucred *cred;
336 struct thread *td;
330socreate(int dom, struct socket **aso, int type, int proto,
331 struct ucred *cred, struct thread *td)
337{
338 struct protosw *prp;
339 struct socket *so;
340 int error;
341
342 if (proto)
343 prp = pffindproto(dom, proto, type);
344 else
345 prp = pffindtype(dom, type);
346
347 if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL ||
348 prp->pr_usrreqs->pru_attach == pru_attach_notsupp)
349 return (EPROTONOSUPPORT);
350
351 if (jailed(cred) && jail_socket_unixiproute_only &&
352 prp->pr_domain->dom_family != PF_LOCAL &&
353 prp->pr_domain->dom_family != PF_INET &&
354 prp->pr_domain->dom_family != PF_ROUTE) {
355 return (EPROTONOSUPPORT);
356 }
357
358 if (prp->pr_type != type)
359 return (EPROTOTYPE);
360 so = soalloc();
361 if (so == NULL)
362 return (ENOBUFS);
363
364 TAILQ_INIT(&so->so_incomp);
365 TAILQ_INIT(&so->so_comp);
366 so->so_type = type;
367 so->so_cred = crhold(cred);
368 so->so_proto = prp;
369#ifdef MAC
370 mac_create_socket(cred, so);
371#endif
372 knlist_init(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv),
373 NULL, NULL, NULL);
374 knlist_init(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd),
375 NULL, NULL, NULL);
376 so->so_count = 1;
377 /*
378 * Auto-sizing of socket buffers is managed by the protocols and
379 * the appropriate flags must be set in the pru_attach function.
380 */
381 error = (*prp->pr_usrreqs->pru_attach)(so, proto, td);
382 if (error) {
383 KASSERT(so->so_count == 1, ("socreate: so_count %d",
384 so->so_count));
385 so->so_count = 0;
386 sodealloc(so);
387 return (error);
388 }
389 *aso = so;
390 return (0);
391}
392
393#ifdef REGRESSION
394static int regression_sonewconn_earlytest = 1;
395SYSCTL_INT(_regression, OID_AUTO, sonewconn_earlytest, CTLFLAG_RW,
396 &regression_sonewconn_earlytest, 0, "Perform early sonewconn limit test");
397#endif
398
399/*
400 * When an attempt at a new connection is noted on a socket which accepts
401 * connections, sonewconn is called. If the connection is possible (subject
402 * to space constraints, etc.) then we allocate a new structure, properly
403 * linked into the data structure of the original socket, and return this.
404 * Connstatus may be 0, or SO_ISCONFIRMING, or SO_ISCONNECTED.
405 *
406 * Note: the ref count on the socket is 0 on return.
407 */
408struct socket *
409sonewconn(head, connstatus)
410	register struct socket *head;
411	int connstatus;
412{
413	register struct socket *so;
404sonewconn(struct socket *head, int connstatus)
405{
406	struct socket *so;
414 int over;
415
416 ACCEPT_LOCK();
417 over = (head->so_qlen > 3 * head->so_qlimit / 2);
418 ACCEPT_UNLOCK();
419#ifdef REGRESSION
420 if (regression_sonewconn_earlytest && over)
421#else
422 if (over)
423#endif
424 return (NULL);
425 so = soalloc();
426 if (so == NULL)
427 return (NULL);
428 if ((head->so_options & SO_ACCEPTFILTER) != 0)
429 connstatus = 0;
430 so->so_head = head;
431 so->so_type = head->so_type;
432 so->so_options = head->so_options &~ SO_ACCEPTCONN;
433 so->so_linger = head->so_linger;
434 so->so_state = head->so_state | SS_NOFDREF;
435 so->so_proto = head->so_proto;
436 so->so_cred = crhold(head->so_cred);
437#ifdef MAC
438 SOCK_LOCK(head);
439 mac_create_socket_from_socket(head, so);
440 SOCK_UNLOCK(head);
441#endif
442 knlist_init(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv),
443 NULL, NULL, NULL);
444 knlist_init(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd),
445 NULL, NULL, NULL);
446 if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat) ||
447 (*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) {
448 sodealloc(so);
449 return (NULL);
450 }
451 so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
452 so->so_snd.sb_lowat = head->so_snd.sb_lowat;
453 so->so_rcv.sb_timeo = head->so_rcv.sb_timeo;
454 so->so_snd.sb_timeo = head->so_snd.sb_timeo;
455 so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE;
456 so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE;
457 so->so_state |= connstatus;
458 ACCEPT_LOCK();
459 if (connstatus) {
460 TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
461 so->so_qstate |= SQ_COMP;
462 head->so_qlen++;
463 } else {
464 /*
465 * Keep removing sockets from the head until there's room for
466 * us to insert on the tail. In pre-locking revisions, this
467 * was a simple if(), but as we could be racing with other
468 * threads and soabort() requires dropping locks, we must
469 * loop waiting for the condition to be true.
470 */
471 while (head->so_incqlen > head->so_qlimit) {
472 struct socket *sp;
473 sp = TAILQ_FIRST(&head->so_incomp);
474 TAILQ_REMOVE(&head->so_incomp, sp, so_list);
475 head->so_incqlen--;
476 sp->so_qstate &= ~SQ_INCOMP;
477 sp->so_head = NULL;
478 ACCEPT_UNLOCK();
479 soabort(sp);
480 ACCEPT_LOCK();
481 }
482 TAILQ_INSERT_TAIL(&head->so_incomp, so, so_list);
483 so->so_qstate |= SQ_INCOMP;
484 head->so_incqlen++;
485 }
486 ACCEPT_UNLOCK();
487 if (connstatus) {
488 sorwakeup(head);
489 wakeup_one(&head->so_timeo);
490 }
491 return (so);
492}
493
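Protocols invoke sonewconn() from their input paths when a new connection
arrives on a listening socket; a hedged sketch of the usual pattern, where
head stands in for the listening socket and the drop path is protocol
detail omitted here:

	struct socket *so;

	/* Illustrative only: materialize a completed connection. */
	so = sonewconn(head, SS_ISCONNECTED);
	if (so == NULL) {
		/* Listen queue over limit: drop the connection attempt. */
		return;
	}
	/* so now sits on head's completed queue, awaiting accept(). */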
494int
495sobind(so, nam, td)
496 struct socket *so;
497 struct sockaddr *nam;
498 struct thread *td;
488sobind(struct socket *so, struct sockaddr *nam, struct thread *td)
499{
500
501 return ((*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td));
502}
503
504/*
505 * solisten() transitions a socket from a non-listening state to a listening
506 * state, but can also be used to update the listen queue depth on an
507 * existing listen socket. The protocol will call back into the sockets
508 * layer using solisten_proto_check() and solisten_proto() to check and set
509 * socket-layer listen state. Call backs are used so that the protocol can
510 * acquire both protocol and socket layer locks in whatever order is required
511 * by the protocol.
512 *
513 * Protocol implementors are advised to hold the socket lock across the
514 * socket-layer test and set to avoid races at the socket layer.
515 */
516int
517solisten(so, backlog, td)
518 struct socket *so;
519 int backlog;
520 struct thread *td;
507solisten(struct socket *so, int backlog, struct thread *td)
521{
522
523 return ((*so->so_proto->pr_usrreqs->pru_listen)(so, backlog, td));
524}
525
526int
527solisten_proto_check(so)
528 struct socket *so;
514solisten_proto_check(struct socket *so)
529{
530
531 SOCK_LOCK_ASSERT(so);
532
533 if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING |
534 SS_ISDISCONNECTING))
535 return (EINVAL);
536 return (0);
537}
538
539void
540solisten_proto(so, backlog)
541 struct socket *so;
542 int backlog;
526solisten_proto(struct socket *so, int backlog)
543{
544
545 SOCK_LOCK_ASSERT(so);
546
547 if (backlog < 0 || backlog > somaxconn)
548 backlog = somaxconn;
549 so->so_qlimit = backlog;
550 so->so_options |= SO_ACCEPTCONN;
551}
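The check/set split above lets the protocol acquire its locks in whatever
order it requires. A sketch of how a protocol's pru_listen method might use
the two callbacks while holding the socket lock across the test and set, as
the comment advises (foo_usr_listen is hypothetical, and protocol-side
locking is elided):

	static int
	foo_usr_listen(struct socket *so, int backlog, struct thread *td)
	{
		int error;

		SOCK_LOCK(so);
		error = solisten_proto_check(so);
		if (error == 0) {
			/* Protocol-specific listen-state setup goes here. */
			solisten_proto(so, backlog);
		}
		SOCK_UNLOCK(so);
		return (error);
	}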
552
553/*
554 * Attempt to free a socket. This should really be sotryfree().
555 *
556 * sofree() will succeed if:
557 *
558 * - There are no outstanding file descriptor references or related consumers
559 * (so_count == 0).
560 *
561 * - The socket has been closed by user space, if ever open (SS_NOFDREF).
562 *
563 * - The protocol does not have an outstanding strong reference on the socket
564 * (SS_PROTOREF).
565 *
566 * - The socket is not in a completed connection queue, so a process has been
567 * notified that it is present. If it is removed, the user process may
568 * block in accept() despite select() saying the socket was ready.
569 *
570 * Otherwise, it will quietly abort so that a future call to sofree(), when
571 * conditions are right, can succeed.
572 */
573void
574sofree(so)
575 struct socket *so;
558sofree(struct socket *so)
576{
577 struct protosw *pr = so->so_proto;
578 struct socket *head;
579
580 ACCEPT_LOCK_ASSERT();
581 SOCK_LOCK_ASSERT(so);
582
583 if ((so->so_state & SS_NOFDREF) == 0 || so->so_count != 0 ||
584 (so->so_state & SS_PROTOREF) || (so->so_qstate & SQ_COMP)) {
585 SOCK_UNLOCK(so);
586 ACCEPT_UNLOCK();
587 return;
588 }
589
590 head = so->so_head;
591 if (head != NULL) {
592 KASSERT((so->so_qstate & SQ_COMP) != 0 ||
593 (so->so_qstate & SQ_INCOMP) != 0,
594 ("sofree: so_head != NULL, but neither SQ_COMP nor "
595 "SQ_INCOMP"));
596 KASSERT((so->so_qstate & SQ_COMP) == 0 ||
597 (so->so_qstate & SQ_INCOMP) == 0,
598 ("sofree: so->so_qstate is SQ_COMP and also SQ_INCOMP"));
599 TAILQ_REMOVE(&head->so_incomp, so, so_list);
600 head->so_incqlen--;
601 so->so_qstate &= ~SQ_INCOMP;
602 so->so_head = NULL;
603 }
604 KASSERT((so->so_qstate & SQ_COMP) == 0 &&
605 (so->so_qstate & SQ_INCOMP) == 0,
606 ("sofree: so_head == NULL, but still SQ_COMP(%d) or SQ_INCOMP(%d)",
607 so->so_qstate & SQ_COMP, so->so_qstate & SQ_INCOMP));
608 if (so->so_options & SO_ACCEPTCONN) {
609 KASSERT((TAILQ_EMPTY(&so->so_comp)), ("sofree: so_comp populated"));
610	KASSERT((TAILQ_EMPTY(&so->so_incomp)), ("sofree: so_incomp populated"));
611 }
612 SOCK_UNLOCK(so);
613 ACCEPT_UNLOCK();
614
615 if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
616 (*pr->pr_domain->dom_dispose)(so->so_rcv.sb_mb);
617 if (pr->pr_usrreqs->pru_detach != NULL)
618 (*pr->pr_usrreqs->pru_detach)(so);
619
620 /*
621 * From this point on, we assume that no other references to this
622 * socket exist anywhere else in the stack. Therefore, no locks need
623 * to be acquired or held.
624 *
625 * We used to do a lot of socket buffer and socket locking here, as
626 * well as invoke sorflush() and perform wakeups. The direct calls to
627 * dom_dispose() and sbrelease_internal() are an inlining of what was
628 * necessary from sorflush().
629 *
630 * Notice that the socket buffer and kqueue state are torn down
631 * before calling pru_detach. This means that protocols should not
632 * assume they can perform socket wakeups, etc, in their detach code.
633 */
634 sbdestroy(&so->so_snd, so);
635 sbdestroy(&so->so_rcv, so);
636 knlist_destroy(&so->so_rcv.sb_sel.si_note);
637 knlist_destroy(&so->so_snd.sb_sel.si_note);
638 sodealloc(so);
639}
640
641/*
642 * Close a socket on last file table reference removal. Initiate disconnect
643 * if connected. Free socket when disconnect complete.
644 *
645 * This function will sorele() the socket. Note that soclose() may be called
646 * prior to the ref count reaching zero. The actual socket structure will
647 * not be freed until the ref count reaches zero.
648 */
649int
650soclose(so)
651 struct socket *so;
633soclose(struct socket *so)
652{
653 int error = 0;
654
655 KASSERT(!(so->so_state & SS_NOFDREF), ("soclose: SS_NOFDREF on enter"));
656
657 funsetown(&so->so_sigio);
658 if (so->so_state & SS_ISCONNECTED) {
659 if ((so->so_state & SS_ISDISCONNECTING) == 0) {
660 error = sodisconnect(so);
661 if (error)
662 goto drop;
663 }
664 if (so->so_options & SO_LINGER) {
665 if ((so->so_state & SS_ISDISCONNECTING) &&
666 (so->so_state & SS_NBIO))
667 goto drop;
668 while (so->so_state & SS_ISCONNECTED) {
669 error = tsleep(&so->so_timeo,
670 PSOCK | PCATCH, "soclos", so->so_linger * hz);
671 if (error)
672 break;
673 }
674 }
675 }
676
677drop:
678 if (so->so_proto->pr_usrreqs->pru_close != NULL)
679 (*so->so_proto->pr_usrreqs->pru_close)(so);
680 if (so->so_options & SO_ACCEPTCONN) {
681 struct socket *sp;
682 ACCEPT_LOCK();
683 while ((sp = TAILQ_FIRST(&so->so_incomp)) != NULL) {
684 TAILQ_REMOVE(&so->so_incomp, sp, so_list);
685 so->so_incqlen--;
686 sp->so_qstate &= ~SQ_INCOMP;
687 sp->so_head = NULL;
688 ACCEPT_UNLOCK();
689 soabort(sp);
690 ACCEPT_LOCK();
691 }
692 while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
693 TAILQ_REMOVE(&so->so_comp, sp, so_list);
694 so->so_qlen--;
695 sp->so_qstate &= ~SQ_COMP;
696 sp->so_head = NULL;
697 ACCEPT_UNLOCK();
698 soabort(sp);
699 ACCEPT_LOCK();
700 }
701 ACCEPT_UNLOCK();
702 }
703 ACCEPT_LOCK();
704 SOCK_LOCK(so);
705 KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF"));
706 so->so_state |= SS_NOFDREF;
707 sorele(so);
708 return (error);
709}
710
711/*
712 * soabort() is used to abruptly tear down a connection, such as when a
713 * resource limit is reached (listen queue depth exceeded), or if a listen
714 * socket is closed while there are sockets waiting to be accepted.
715 *
716 * This interface is tricky, because it is called on an unreferenced socket,
717 * and must be called only by a thread that has actually removed the socket
718 * from the listen queue it was on, or races with other threads are risked.
719 *
720 * This interface will call into the protocol code, so must not be called
721 * with any socket locks held. Protocols do call it while holding their own
722 * recursible protocol mutexes, but this is something that should be subject
723 * to review in the future.
724 */
725void
726soabort(so)
727 struct socket *so;
708soabort(struct socket *so)
728{
729
730 /*
731 * In as much as is possible, assert that no references to this
732 * socket are held. This is not quite the same as asserting that the
733 * current thread is responsible for arranging for no references, but
734 * is as close as we can get for now.
735 */
736 KASSERT(so->so_count == 0, ("soabort: so_count"));
737 KASSERT((so->so_state & SS_PROTOREF) == 0, ("soabort: SS_PROTOREF"));
738 KASSERT(so->so_state & SS_NOFDREF, ("soabort: !SS_NOFDREF"));
739 KASSERT((so->so_state & SQ_COMP) == 0, ("soabort: SQ_COMP"));
740 KASSERT((so->so_state & SQ_INCOMP) == 0, ("soabort: SQ_INCOMP"));
741
742 if (so->so_proto->pr_usrreqs->pru_abort != NULL)
743 (*so->so_proto->pr_usrreqs->pru_abort)(so);
744 ACCEPT_LOCK();
745 SOCK_LOCK(so);
746 sofree(so);
747}
748
749int
750soaccept(so, nam)
751 struct socket *so;
752 struct sockaddr **nam;
731soaccept(struct socket *so, struct sockaddr **nam)
753{
754 int error;
755
756 SOCK_LOCK(so);
757 KASSERT((so->so_state & SS_NOFDREF) != 0, ("soaccept: !NOFDREF"));
758 so->so_state &= ~SS_NOFDREF;
759 SOCK_UNLOCK(so);
760 error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
761 return (error);
762}
763
764int
765soconnect(so, nam, td)
766 struct socket *so;
767 struct sockaddr *nam;
768 struct thread *td;
744soconnect(struct socket *so, struct sockaddr *nam, struct thread *td)
769{
770 int error;
771
772 if (so->so_options & SO_ACCEPTCONN)
773 return (EOPNOTSUPP);
774 /*
775 * If protocol is connection-based, can only connect once.
776 * Otherwise, if connected, try to disconnect first. This allows
777 * user to disconnect by connecting to, e.g., a null address.
778 */
779 if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
780 ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
781 (error = sodisconnect(so)))) {
782 error = EISCONN;
783 } else {
784 /*
785 * Prevent accumulated error from previous connection from
786 * biting us.
787 */
788 so->so_error = 0;
789 error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, td);
790 }
791
792 return (error);
793}
794
795int
796soconnect2(so1, so2)
797 struct socket *so1;
798 struct socket *so2;
772soconnect2(struct socket *so1, struct socket *so2)
799{
800
801 return ((*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2));
802}
803
804int
805sodisconnect(so)
806 struct socket *so;
779sodisconnect(struct socket *so)
807{
808 int error;
809
810 if ((so->so_state & SS_ISCONNECTED) == 0)
811 return (ENOTCONN);
812 if (so->so_state & SS_ISDISCONNECTING)
813 return (EALREADY);
814 error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
815 return (error);
816}
817
818#ifdef ZERO_COPY_SOCKETS
819struct so_zerocopy_stats{
820 int size_ok;
821 int align_ok;
822 int found_ifp;
823};
824struct so_zerocopy_stats so_zerocp_stats = {0,0,0};
825#include <netinet/in.h>
826#include <net/route.h>
827#include <netinet/in_pcb.h>
828#include <vm/vm.h>
829#include <vm/vm_page.h>
830#include <vm/vm_object.h>
831
832/*
833 * sosend_copyin() is only used if zero copy sockets are enabled. Otherwise
834 * sosend_dgram() and sosend_generic() use m_uiotombuf().
835 *
836 * sosend_copyin() accepts a uio and prepares an mbuf chain holding part or
837 * all of the data referenced by the uio. If desired, it uses zero-copy.
838 * *space will be updated to reflect data copied in.
839 *
840 * NB: If atomic I/O is requested, the caller must already have checked that
841 * space can hold resid bytes.
842 *
843 * NB: In the event of an error, the caller may need to free the partial
844 * chain pointed to by *mpp. The contents of both *uio and *space may be
845 * modified even in the case of an error.
846 */
847static int
848sosend_copyin(struct uio *uio, struct mbuf **retmp, int atomic, long *space,
849 int flags)
850{
851 struct mbuf *m, **mp, *top;
852 long len, resid;
853 int error;
854#ifdef ZERO_COPY_SOCKETS
855 int cow_send;
856#endif
857
858 *retmp = top = NULL;
859 mp = &top;
860 len = 0;
861 resid = uio->uio_resid;
862 error = 0;
863 do {
864#ifdef ZERO_COPY_SOCKETS
865 cow_send = 0;
866#endif /* ZERO_COPY_SOCKETS */
867 if (resid >= MINCLSIZE) {
868#ifdef ZERO_COPY_SOCKETS
869 if (top == NULL) {
870 m = m_gethdr(M_WAITOK, MT_DATA);
871 m->m_pkthdr.len = 0;
872 m->m_pkthdr.rcvif = NULL;
873 } else
874 m = m_get(M_WAITOK, MT_DATA);
875 if (so_zero_copy_send &&
876 resid>=PAGE_SIZE &&
877 *space>=PAGE_SIZE &&
878 uio->uio_iov->iov_len>=PAGE_SIZE) {
879 so_zerocp_stats.size_ok++;
880 so_zerocp_stats.align_ok++;
881 cow_send = socow_setup(m, uio);
882 len = cow_send;
883 }
884 if (!cow_send) {
885 m_clget(m, M_WAITOK);
886 len = min(min(MCLBYTES, resid), *space);
887 }
888#else /* ZERO_COPY_SOCKETS */
889 if (top == NULL) {
890 m = m_getcl(M_TRYWAIT, MT_DATA, M_PKTHDR);
891 m->m_pkthdr.len = 0;
892 m->m_pkthdr.rcvif = NULL;
893 } else
894 m = m_getcl(M_TRYWAIT, MT_DATA, 0);
895 len = min(min(MCLBYTES, resid), *space);
896#endif /* ZERO_COPY_SOCKETS */
897 } else {
898 if (top == NULL) {
899 m = m_gethdr(M_TRYWAIT, MT_DATA);
900 m->m_pkthdr.len = 0;
901 m->m_pkthdr.rcvif = NULL;
902
903 len = min(min(MHLEN, resid), *space);
904 /*
905 * For datagram protocols, leave room
906 * for protocol headers in first mbuf.
907 */
908 if (atomic && m && len < MHLEN)
909 MH_ALIGN(m, len);
910 } else {
911 m = m_get(M_TRYWAIT, MT_DATA);
912 len = min(min(MLEN, resid), *space);
913 }
914 }
915 if (m == NULL) {
916 error = ENOBUFS;
917 goto out;
918 }
919
920 *space -= len;
921#ifdef ZERO_COPY_SOCKETS
922 if (cow_send)
923 error = 0;
924 else
925#endif /* ZERO_COPY_SOCKETS */
926 error = uiomove(mtod(m, void *), (int)len, uio);
927 resid = uio->uio_resid;
928 m->m_len = len;
929 *mp = m;
930 top->m_pkthdr.len += len;
931 if (error)
932 goto out;
933 mp = &m->m_next;
934 if (resid <= 0) {
935 if (flags & MSG_EOR)
936 top->m_flags |= M_EOR;
937 break;
938 }
939 } while (*space > 0 && atomic);
940out:
941 *retmp = top;
942 return (error);
943}
944#endif /*ZERO_COPY_SOCKETS*/
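/*
 * Illustrative sketch (not part of either revision, not compiled): the
 * calling convention documented above, with the error path freeing a
 * partially built chain per the NB.  Names mirror the real callers below.
 */
#if 0
	struct mbuf *top = NULL;
	long space = sbspace(&so->so_snd);
	int error;

	error = sosend_copyin(uio, &top, atomic, &space, flags);
	if (error != 0 && top != NULL) {
		m_freem(top);		/* free the partially built chain */
		top = NULL;
	}
#endif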
945
946#define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
947
948int
949sosend_dgram(so, addr, uio, top, control, flags, td)
950 struct socket *so;
951 struct sockaddr *addr;
952 struct uio *uio;
953 struct mbuf *top;
954 struct mbuf *control;
955 int flags;
956 struct thread *td;
922sosend_dgram(struct socket *so, struct sockaddr *addr, struct uio *uio,
923 struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
957{
958 long space, resid;
959 int clen = 0, error, dontroute;
960#ifdef ZERO_COPY_SOCKETS
961 int atomic = sosendallatonce(so) || top;
962#endif
963
964	KASSERT(so->so_type == SOCK_DGRAM, ("sosend_dgram: !SOCK_DGRAM"));
965	KASSERT(so->so_proto->pr_flags & PR_ATOMIC,
966	    ("sosend_dgram: !PR_ATOMIC"));
967
968 if (uio != NULL)
969 resid = uio->uio_resid;
970 else
971 resid = top->m_pkthdr.len;
972 /*
973 * In theory resid should be unsigned. However, space must be
974 * signed, as it might be less than 0 if we over-committed, and we
975 * must use a signed comparison of space and resid. On the other
976 * hand, a negative resid causes us to loop sending 0-length
977 * segments to the protocol.
978 *
979	 * The MSG_EOR-on-SOCK_STREAM check made in sosend_generic() is not
980	 * needed here, since this path handles only SOCK_DGRAM sockets.
981 */
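	/*
	 * Example with illustrative numbers: if sbspace() returns 512 and
	 * a 1024-byte control chain is passed, space becomes
	 * 512 - 1024 = -512 below; with unsigned arithmetic that would
	 * wrap to a huge positive value and the (resid > space) check
	 * would wrongly pass.
	 */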
982 if (resid < 0) {
983 error = EINVAL;
984 goto out;
985 }
986
987 dontroute =
988 (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0;
989 if (td != NULL)
990 td->td_proc->p_stats->p_ru.ru_msgsnd++;
991 if (control != NULL)
992 clen = control->m_len;
993
994 SOCKBUF_LOCK(&so->so_snd);
995 if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
996 SOCKBUF_UNLOCK(&so->so_snd);
997 error = EPIPE;
998 goto out;
999 }
1000 if (so->so_error) {
1001 error = so->so_error;
1002 so->so_error = 0;
1003 SOCKBUF_UNLOCK(&so->so_snd);
1004 goto out;
1005 }
1006 if ((so->so_state & SS_ISCONNECTED) == 0) {
1007 /*
1008		 * `sendto' and `sendmsg' are allowed on a connection-based
1009 * socket if it supports implied connect. Return ENOTCONN if
1010 * not connected and no address is supplied.
1011 */
1012 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
1013 (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
1014 if ((so->so_state & SS_ISCONFIRMING) == 0 &&
1015 !(resid == 0 && clen != 0)) {
1016 SOCKBUF_UNLOCK(&so->so_snd);
1017 error = ENOTCONN;
1018 goto out;
1019 }
1020 } else if (addr == NULL) {
1021 if (so->so_proto->pr_flags & PR_CONNREQUIRED)
1022 error = ENOTCONN;
1023 else
1024 error = EDESTADDRREQ;
1025 SOCKBUF_UNLOCK(&so->so_snd);
1026 goto out;
1027 }
1028 }
1029
1030 /*
1031	 * Do we need MSG_OOB support in SOCK_DGRAM?  The signedness games
1032	 * played with 'space' here may be a problem and need fixing.
1033 */
1034 space = sbspace(&so->so_snd);
1035 if (flags & MSG_OOB)
1036 space += 1024;
1037 space -= clen;
1038 SOCKBUF_UNLOCK(&so->so_snd);
1039 if (resid > space) {
1040 error = EMSGSIZE;
1041 goto out;
1042 }
1043 if (uio == NULL) {
1044 resid = 0;
1045 if (flags & MSG_EOR)
1046 top->m_flags |= M_EOR;
1047 } else {
1048#ifdef ZERO_COPY_SOCKETS
1049 error = sosend_copyin(uio, &top, atomic, &space, flags);
1050 if (error)
1051 goto out;
1052#else
1053 /*
1054		 * Copy the data from userland into an mbuf chain.
1055 * If no data is to be copied in, a single empty mbuf
1056 * is returned.
1057 */
1058 top = m_uiotombuf(uio, M_WAITOK, space, max_hdr,
1059 (M_PKTHDR | ((flags & MSG_EOR) ? M_EOR : 0)));
1060 if (top == NULL) {
1061 error = EFAULT; /* only possible error */
1062 goto out;
1063 }
1064 space -= resid - uio->uio_resid;
1065#endif
1066 resid = uio->uio_resid;
1067 }
1068 KASSERT(resid == 0, ("sosend_dgram: resid != 0"));
1069 /*
1070 * XXXRW: Frobbing SO_DONTROUTE here is even worse without sblock
1071 * than with.
1072 */
1073 if (dontroute) {
1074 SOCK_LOCK(so);
1075 so->so_options |= SO_DONTROUTE;
1076 SOCK_UNLOCK(so);
1077 }
1078 /*
1079 * XXX all the SBS_CANTSENDMORE checks previously done could be out
1080	 * of date.  We could have received a reset packet in an interrupt or
1081 * maybe we slept while doing page faults in uiomove() etc. We could
1082 * probably recheck again inside the locking protection here, but
1083 * there are probably other places that this also happens. We must
1084 * rethink this.
1085 */
1086 error = (*so->so_proto->pr_usrreqs->pru_send)(so,
1087 (flags & MSG_OOB) ? PRUS_OOB :
1088 /*
1089	 * If the user set MSG_EOF, the protocol understands this flag, and
1090	 * there is nothing left to send, then use PRU_SEND_EOF instead of PRU_SEND.
1091 */
1092 ((flags & MSG_EOF) &&
1093 (so->so_proto->pr_flags & PR_IMPLOPCL) &&
1094 (resid <= 0)) ?
1095 PRUS_EOF :
1096 /* If there is more to send set PRUS_MORETOCOME */
1097 (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
1098 top, addr, control, td);
1099 if (dontroute) {
1100 SOCK_LOCK(so);
1101 so->so_options &= ~SO_DONTROUTE;
1102 SOCK_UNLOCK(so);
1103 }
1104 clen = 0;
1105 control = NULL;
1106 top = NULL;
1107out:
1108 if (top != NULL)
1109 m_freem(top);
1110 if (control != NULL)
1111 m_freem(control);
1112 return (error);
1113}
1114
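/*
 * The nested conditional expression handed to pru_send() above (and
 * repeated in sosend_generic() below) is equivalent to this flattened
 * sketch, shown for readability only:
 */
#if 0
	int sendflags;

	if (flags & MSG_OOB)
		sendflags = PRUS_OOB;
	else if ((flags & MSG_EOF) &&
	    (so->so_proto->pr_flags & PR_IMPLOPCL) && resid <= 0)
		sendflags = PRUS_EOF;		/* implied close after data */
	else if (resid > 0 && space > 0)
		sendflags = PRUS_MORETOCOME;	/* more data will follow */
	else
		sendflags = 0;
	error = (*so->so_proto->pr_usrreqs->pru_send)(so, sendflags,
	    top, addr, control, td);
#endif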
1115/*
1116 * Send on a socket.  If the send must go all at once and the message
1117 * is larger than the send buffering, then fail with a hard error.
1118 * Lock against other senders.  If the send must go all at once and
1119 * there is not enough room now, inform the user that this would block
1120 * and do nothing.  Otherwise, if nonblocking, send as much as
1121 * possible.  The data to be sent is described by "uio" if non-NULL,
1122 * otherwise by the mbuf chain "top" (which must be NULL if uio is not).  Data provided
1123 * in the mbuf chain must be small enough to send all at once.
1124 * Returns nonzero on error, timeout or signal; callers must check for short
1125 * counts if EINTR/ERESTART are returned. Data and control buffers are freed
1126 * on return.
1127 */
1128int
1129sosend_generic(so, addr, uio, top, control, flags, td)
1130 struct socket *so;
1131 struct sockaddr *addr;
1132 struct uio *uio;
1133 struct mbuf *top;
1134 struct mbuf *control;
1135 int flags;
1136 struct thread *td;
1096sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio,
1097 struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
1137{
1138 long space, resid;
1139 int clen = 0, error, dontroute;
1140 int atomic = sosendallatonce(so) || top;
1141
1142 if (uio != NULL)
1143 resid = uio->uio_resid;
1144 else
1145 resid = top->m_pkthdr.len;
1146 /*
1147 * In theory resid should be unsigned. However, space must be
1148 * signed, as it might be less than 0 if we over-committed, and we
1149 * must use a signed comparison of space and resid. On the other
1150 * hand, a negative resid causes us to loop sending 0-length
1151 * segments to the protocol.
1152 *
1153 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
1154 * type sockets since that's an error.
1155 */
1156 if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
1157 error = EINVAL;
1158 goto out;
1159 }
1160
1161 dontroute =
1162 (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
1163 (so->so_proto->pr_flags & PR_ATOMIC);
1164 if (td != NULL)
1165 td->td_proc->p_stats->p_ru.ru_msgsnd++;
1166 if (control != NULL)
1167 clen = control->m_len;
1168
1169 error = sblock(&so->so_snd, SBLOCKWAIT(flags));
1170 if (error)
1171 goto out;
1172
1173restart:
1174 do {
1175 SOCKBUF_LOCK(&so->so_snd);
1176 if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
1177 SOCKBUF_UNLOCK(&so->so_snd);
1178 error = EPIPE;
1179 goto release;
1180 }
1181 if (so->so_error) {
1182 error = so->so_error;
1183 so->so_error = 0;
1184 SOCKBUF_UNLOCK(&so->so_snd);
1185 goto release;
1186 }
1187 if ((so->so_state & SS_ISCONNECTED) == 0) {
1188 /*
1189			 * `sendto' and `sendmsg' are allowed on a connection-
1190 * based socket if it supports implied connect.
1191 * Return ENOTCONN if not connected and no address is
1192 * supplied.
1193 */
1194 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
1195 (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
1196 if ((so->so_state & SS_ISCONFIRMING) == 0 &&
1197 !(resid == 0 && clen != 0)) {
1198 SOCKBUF_UNLOCK(&so->so_snd);
1199 error = ENOTCONN;
1200 goto release;
1201 }
1202 } else if (addr == NULL) {
1203 SOCKBUF_UNLOCK(&so->so_snd);
1204 if (so->so_proto->pr_flags & PR_CONNREQUIRED)
1205 error = ENOTCONN;
1206 else
1207 error = EDESTADDRREQ;
1208 goto release;
1209 }
1210 }
1211 space = sbspace(&so->so_snd);
1212 if (flags & MSG_OOB)
1213 space += 1024;
1214 if ((atomic && resid > so->so_snd.sb_hiwat) ||
1215 clen > so->so_snd.sb_hiwat) {
1216 SOCKBUF_UNLOCK(&so->so_snd);
1217 error = EMSGSIZE;
1218 goto release;
1219 }
1220 if (space < resid + clen &&
1221 (atomic || space < so->so_snd.sb_lowat || space < clen)) {
1222 if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO)) {
1223 SOCKBUF_UNLOCK(&so->so_snd);
1224 error = EWOULDBLOCK;
1225 goto release;
1226 }
1227 error = sbwait(&so->so_snd);
1228 SOCKBUF_UNLOCK(&so->so_snd);
1229 if (error)
1230 goto release;
1231 goto restart;
1232 }
1233 SOCKBUF_UNLOCK(&so->so_snd);
1234 space -= clen;
1235 do {
1236 if (uio == NULL) {
1237 resid = 0;
1238 if (flags & MSG_EOR)
1239 top->m_flags |= M_EOR;
1240 } else {
1241#ifdef ZERO_COPY_SOCKETS
1242 error = sosend_copyin(uio, &top, atomic,
1243 &space, flags);
1244 if (error != 0)
1245 goto release;
1246#else
1247 /*
1248				 * Copy the data from userland into an mbuf
1249 * chain. If no data is to be copied in,
1250 * a single empty mbuf is returned.
1251 */
1252 top = m_uiotombuf(uio, M_WAITOK, space,
1253 (atomic ? max_hdr : 0),
1254 (atomic ? M_PKTHDR : 0) |
1255 ((flags & MSG_EOR) ? M_EOR : 0));
1256 if (top == NULL) {
1257 error = EFAULT; /* only possible error */
1258 goto release;
1259 }
1260 space -= resid - uio->uio_resid;
1261#endif
1262 resid = uio->uio_resid;
1263 }
1264 if (dontroute) {
1265 SOCK_LOCK(so);
1266 so->so_options |= SO_DONTROUTE;
1267 SOCK_UNLOCK(so);
1268 }
1269 /*
1270 * XXX all the SBS_CANTSENDMORE checks previously
1271			 * done could be out of date.  We could have received
1272 * a reset packet in an interrupt or maybe we slept
1273 * while doing page faults in uiomove() etc. We
1274 * could probably recheck again inside the locking
1275 * protection here, but there are probably other
1276 * places that this also happens. We must rethink
1277 * this.
1278 */
1279 error = (*so->so_proto->pr_usrreqs->pru_send)(so,
1280 (flags & MSG_OOB) ? PRUS_OOB :
1281 /*
1282			 * If the user set MSG_EOF, the protocol understands
1283			 * this flag, and there is nothing left to send, then
1284			 * use PRU_SEND_EOF instead of PRU_SEND.
1285 */
1286 ((flags & MSG_EOF) &&
1287 (so->so_proto->pr_flags & PR_IMPLOPCL) &&
1288 (resid <= 0)) ?
1289 PRUS_EOF :
1290 /* If there is more to send set PRUS_MORETOCOME. */
1291 (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
1292 top, addr, control, td);
1293 if (dontroute) {
1294 SOCK_LOCK(so);
1295 so->so_options &= ~SO_DONTROUTE;
1296 SOCK_UNLOCK(so);
1297 }
1298 clen = 0;
1299 control = NULL;
1300 top = NULL;
1301 if (error)
1302 goto release;
1303 } while (resid && space > 0);
1304 } while (resid);
1305
1306release:
1307 sbunlock(&so->so_snd);
1308out:
1309 if (top != NULL)
1310 m_freem(top);
1311 if (control != NULL)
1312 m_freem(control);
1313 return (error);
1314}
1315
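/*
 * Condensed control flow of sosend_generic() above, as a sketch for
 * orientation only (the real checks live in the function body):
 */
#if 0
	sblock(&so->so_snd, SBLOCKWAIT(flags));
	do {
		SOCKBUF_LOCK(&so->so_snd);
		/* Fail on EPIPE, so_error or ENOTCONN; if there is not
		   enough space, sbwait() and retry from the top. */
		SOCKBUF_UNLOCK(&so->so_snd);
		do {
			/* Copy user data into mbufs with no lock held,
			   then hand the chain to pru_send(). */
		} while (resid && space > 0);
	} while (resid);
	sbunlock(&so->so_snd);
#endif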
1316int
1317sosend(so, addr, uio, top, control, flags, td)
1318 struct socket *so;
1319 struct sockaddr *addr;
1320 struct uio *uio;
1321 struct mbuf *top;
1322 struct mbuf *control;
1323 int flags;
1324 struct thread *td;
1278sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
1279 struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
1325{
1326
1327 /* XXXRW: Temporary debugging. */
1328 KASSERT(so->so_proto->pr_usrreqs->pru_sosend != sosend,
1329 ("sosend: protocol calls sosend"));
1330
1331 return (so->so_proto->pr_usrreqs->pru_sosend(so, addr, uio, top,
1332 control, flags, td));
1333}
1334
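/*
 * How the indirection above is typically wired up, as a sketch (the
 * actual tables live in the per-protocol source files): a protocol's
 * pr_usrreqs points pru_sosend at one of the implementations in this
 * file, e.g. sosend_dgram() for a datagram protocol or sosend_generic()
 * for a stream protocol.  The table name below is hypothetical.
 */
#if 0
	struct pr_usrreqs example_usrreqs = {
		/* ... */
		.pru_sosend =	sosend_dgram,
		/* ... */
	};
#endif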
1335/*
1336 * The part of soreceive() that implements reading non-inline out-of-band
1337 * data from a socket. For more complete comments, see soreceive(), from
1338 * which this code originated.
1339 *
1340 * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is
1341 * unable to return an mbuf chain to the caller.
1342 */
1343static int
1344soreceive_rcvoob(so, uio, flags)
1345 struct socket *so;
1346 struct uio *uio;
1347 int flags;
1299soreceive_rcvoob(struct socket *so, struct uio *uio, int flags)
1348{
1349 struct protosw *pr = so->so_proto;
1350 struct mbuf *m;
1351 int error;
1352
1353 KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0"));
1354
1355 m = m_get(M_TRYWAIT, MT_DATA);
1356 if (m == NULL)
1357 return (ENOBUFS);
1358 error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
1359 if (error)
1360 goto bad;
1361 do {
1362#ifdef ZERO_COPY_SOCKETS
1363 if (so_zero_copy_receive) {
1364 int disposable;
1365
1366 if ((m->m_flags & M_EXT)
1367 && (m->m_ext.ext_type == EXT_DISPOSABLE))
1368 disposable = 1;
1369 else
1370 disposable = 0;
1371
1372 error = uiomoveco(mtod(m, void *),
1373 min(uio->uio_resid, m->m_len),
1374 uio, disposable);
1375 } else
1376#endif /* ZERO_COPY_SOCKETS */
1377 error = uiomove(mtod(m, void *),
1378 (int) min(uio->uio_resid, m->m_len), uio);
1379 m = m_free(m);
1380 } while (uio->uio_resid && error == 0 && m);
1381bad:
1382 if (m != NULL)
1383 m_freem(m);
1384 return (error);
1385}
1386
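/*
 * Userland view of the function above (a sketch; not part of this
 * file): with out-of-band data pending and SO_OOBINLINE clear, passing
 * MSG_OOB to recv(2) ends up here via soreceive().  The handler name
 * is hypothetical.
 */
#if 0
	char c;

	if (recv(s, &c, 1, MSG_OOB) == 1)
		handle_oob_byte(c);	/* hypothetical consumer */
#endif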
1387/*
1388 * Following replacement or removal of the first mbuf on the first mbuf chain
1389 * of a socket buffer, push necessary state changes back into the socket
1390 * buffer so that other consumers see the values consistently. 'nextrecord'
1391 * is the caller's locally stored value of the original value of
1392 * sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes.
1393 * NOTE: 'nextrecord' may be NULL.
1394 */
1395static __inline void
1396sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord)
1397{
1398
1399 SOCKBUF_LOCK_ASSERT(sb);
1400 /*
1401 * First, update for the new value of nextrecord. If necessary, make
1402 * it the first record.
1403 */
1404 if (sb->sb_mb != NULL)
1405 sb->sb_mb->m_nextpkt = nextrecord;
1406 else
1407 sb->sb_mb = nextrecord;
1408
1409 /*
1410 * Now update any dependent socket buffer fields to reflect the new
1411 * state. This is an expanded inline of SB_EMPTY_FIXUP(), with the
1412 * addition of a second clause that takes care of the case where
1413 * sb_mb has been updated, but remains the last record.
1414 */
1415 if (sb->sb_mb == NULL) {
1416 sb->sb_mbtail = NULL;
1417 sb->sb_lastrecord = NULL;
1418 } else if (sb->sb_mb->m_nextpkt == NULL)
1419 sb->sb_lastrecord = sb->sb_mb;
1420}
1421
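/*
 * Illustration of the fixup performed by sockbuf_pushsync() (sketch):
 *
 *	before:	sb_mb -> m0 -> m1 -> ...   with the caller holding
 *		nextrecord == m0->m_nextpkt (the second record)
 *	after m0 is freed:
 *		sb_mb -> m1 -> ...         and m1->m_nextpkt is set
 *		back to nextrecord, keeping record linkage intact;
 *		sb_mbtail/sb_lastrecord are cleared or advanced as in
 *		the code above.
 */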
1422
1423/*
1424 * Implement receive operations on a socket. We depend on the way that
1425 * records are added to the sockbuf by sbappend. In particular, each record
1426 * (mbufs linked through m_next) must begin with an address if the protocol
1427 * so specifies, followed by an optional mbuf or mbufs containing ancillary
1428 * data, and then zero or more mbufs of data. In order to allow parallelism
1429 * between network receive and copying to user space, as well as avoid
1430 * sleeping with a mutex held, we release the socket buffer mutex during the
1431 * user space copy. Although the sockbuf is locked, new data may still be
1432 * appended, and thus we must maintain consistency of the sockbuf during that
1433 * time.
1434 *
1435 * The caller may receive the data as a single mbuf chain by supplying an
1436 * mbuf **mp0 for use in returning the chain. The uio is then used only for
1437 * the count in uio_resid.
1438 */
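/*
 * Sketch of the record layout described above, as the code below
 * expects to find it in so->so_rcv:
 *
 *	sb_mb -> [MT_SONAME] -> [MT_CONTROL]... -> [MT_DATA]...  (m_next)
 *	    |
 *	    m_nextpkt
 *	    |
 *	    v
 *	next record -> ...
 *
 * The address mbuf is present only for PR_ADDR protocols, and the
 * control mbufs are optional.
 */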
1439int
1440soreceive_generic(so, psa, uio, mp0, controlp, flagsp)
1441 struct socket *so;
1442 struct sockaddr **psa;
1443 struct uio *uio;
1444 struct mbuf **mp0;
1445 struct mbuf **controlp;
1446 int *flagsp;
1392soreceive_generic(struct socket *so, struct sockaddr **psa, struct uio *uio,
1393 struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
1447{
1448 struct mbuf *m, **mp;
1449 int flags, len, error, offset;
1450 struct protosw *pr = so->so_proto;
1451 struct mbuf *nextrecord;
1452 int moff, type = 0;
1453 int orig_resid = uio->uio_resid;
1454
1455 mp = mp0;
1456 if (psa != NULL)
1457 *psa = NULL;
1458 if (controlp != NULL)
1459 *controlp = NULL;
1460 if (flagsp != NULL)
1461 flags = *flagsp &~ MSG_EOR;
1462 else
1463 flags = 0;
1464 if (flags & MSG_OOB)
1465 return (soreceive_rcvoob(so, uio, flags));
1466 if (mp != NULL)
1467 *mp = NULL;
1468 if ((pr->pr_flags & PR_WANTRCVD) && (so->so_state & SS_ISCONFIRMING)
1469 && uio->uio_resid)
1470 (*pr->pr_usrreqs->pru_rcvd)(so, 0);
1471
1472 error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
1473 if (error)
1474 return (error);
1475
1476restart:
1477 SOCKBUF_LOCK(&so->so_rcv);
1478 m = so->so_rcv.sb_mb;
1479 /*
1480 * If we have less data than requested, block awaiting more (subject
1481 * to any timeout) if:
1482 * 1. the current count is less than the low water mark, or
1483 * 2. MSG_WAITALL is set, and it is possible to do the entire
1484	 *	receive operation at once if we block (resid <= hiwat), and
1485	 *   3. MSG_DONTWAIT is not set.
1486 * If MSG_WAITALL is set but resid is larger than the receive buffer,
1487 * we have to do the receive in sections, and thus risk returning a
1488 * short count if a timeout or signal occurs after we start.
1489 */
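	/*
	 * For example (assuming the usual receive-buffer default of
	 * sb_lowat == 1): a plain blocking recv(2) returns as soon as any
	 * data is queued, while the same call with MSG_WAITALL and
	 * resid <= sb_hiwat blocks until the full amount can be returned,
	 * unless an error or EOF intervenes.
	 */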
1490 if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
1491 so->so_rcv.sb_cc < uio->uio_resid) &&
1492 (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
1493 ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
1494 m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
1495 KASSERT(m != NULL || !so->so_rcv.sb_cc,
1496 ("receive: m == %p so->so_rcv.sb_cc == %u",
1497 m, so->so_rcv.sb_cc));
1498 if (so->so_error) {
1499 if (m != NULL)
1500 goto dontblock;
1501 error = so->so_error;
1502 if ((flags & MSG_PEEK) == 0)
1503 so->so_error = 0;
1504 SOCKBUF_UNLOCK(&so->so_rcv);
1505 goto release;
1506 }
1507 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1508 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
1509 if (m == NULL) {
1510 SOCKBUF_UNLOCK(&so->so_rcv);
1511 goto release;
1512 } else
1513 goto dontblock;
1514 }
1515 for (; m != NULL; m = m->m_next)
1516 if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
1517 m = so->so_rcv.sb_mb;
1518 goto dontblock;
1519 }
1520 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
1521 (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
1522 SOCKBUF_UNLOCK(&so->so_rcv);
1523 error = ENOTCONN;
1524 goto release;
1525 }
1526 if (uio->uio_resid == 0) {
1527 SOCKBUF_UNLOCK(&so->so_rcv);
1528 goto release;
1529 }
1530 if ((so->so_state & SS_NBIO) ||
1531 (flags & (MSG_DONTWAIT|MSG_NBIO))) {
1532 SOCKBUF_UNLOCK(&so->so_rcv);
1533 error = EWOULDBLOCK;
1534 goto release;
1535 }
1536 SBLASTRECORDCHK(&so->so_rcv);
1537 SBLASTMBUFCHK(&so->so_rcv);
1538 error = sbwait(&so->so_rcv);
1539 SOCKBUF_UNLOCK(&so->so_rcv);
1540 if (error)
1541 goto release;
1542 goto restart;
1543 }
1544dontblock:
1545 /*
1546 * From this point onward, we maintain 'nextrecord' as a cache of the
1547 * pointer to the next record in the socket buffer. We must keep the
1548 * various socket buffer pointers and local stack versions of the
1549 * pointers in sync, pushing out modifications before dropping the
1550 * socket buffer mutex, and re-reading them when picking it up.
1551 *
1552 * Otherwise, we will race with the network stack appending new data
1553 * or records onto the socket buffer by using inconsistent/stale
1554 * versions of the field, possibly resulting in socket buffer
1555 * corruption.
1556 *
1557 * By holding the high-level sblock(), we prevent simultaneous
1558 * readers from pulling off the front of the socket buffer.
1559 */
1560 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1561 if (uio->uio_td)
1562 uio->uio_td->td_proc->p_stats->p_ru.ru_msgrcv++;
1563 KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb"));
1564 SBLASTRECORDCHK(&so->so_rcv);
1565 SBLASTMBUFCHK(&so->so_rcv);
1566 nextrecord = m->m_nextpkt;
1567 if (pr->pr_flags & PR_ADDR) {
1568 KASSERT(m->m_type == MT_SONAME,
1569 ("m->m_type == %d", m->m_type));
1570 orig_resid = 0;
1571 if (psa != NULL)
1572 *psa = sodupsockaddr(mtod(m, struct sockaddr *),
1573 M_NOWAIT);
1574 if (flags & MSG_PEEK) {
1575 m = m->m_next;
1576 } else {
1577 sbfree(&so->so_rcv, m);
1578 so->so_rcv.sb_mb = m_free(m);
1579 m = so->so_rcv.sb_mb;
1580 sockbuf_pushsync(&so->so_rcv, nextrecord);
1581 }
1582 }
1583
1584 /*
1585 * Process one or more MT_CONTROL mbufs present before any data mbufs
1586 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
1587 * just copy the data; if !MSG_PEEK, we call into the protocol to
1588 * perform externalization (or freeing if controlp == NULL).
1589 */
1590 if (m != NULL && m->m_type == MT_CONTROL) {
1591 struct mbuf *cm = NULL, *cmn;
1592 struct mbuf **cme = &cm;
1593
1594 do {
1595 if (flags & MSG_PEEK) {
1596 if (controlp != NULL) {
1597 *controlp = m_copy(m, 0, m->m_len);
1598 controlp = &(*controlp)->m_next;
1599 }
1600 m = m->m_next;
1601 } else {
1602 sbfree(&so->so_rcv, m);
1603 so->so_rcv.sb_mb = m->m_next;
1604 m->m_next = NULL;
1605 *cme = m;
1606 cme = &(*cme)->m_next;
1607 m = so->so_rcv.sb_mb;
1608 }
1609 } while (m != NULL && m->m_type == MT_CONTROL);
1610 if ((flags & MSG_PEEK) == 0)
1611 sockbuf_pushsync(&so->so_rcv, nextrecord);
1612 while (cm != NULL) {
1613 cmn = cm->m_next;
1614 cm->m_next = NULL;
1615 if (pr->pr_domain->dom_externalize != NULL) {
1616 SOCKBUF_UNLOCK(&so->so_rcv);
1617 error = (*pr->pr_domain->dom_externalize)
1618 (cm, controlp);
1619 SOCKBUF_LOCK(&so->so_rcv);
1620 } else if (controlp != NULL)
1621 *controlp = cm;
1622 else
1623 m_freem(cm);
1624 if (controlp != NULL) {
1625 orig_resid = 0;
1626 while (*controlp != NULL)
1627 controlp = &(*controlp)->m_next;
1628 }
1629 cm = cmn;
1630 }
1631 if (m != NULL)
1632 nextrecord = so->so_rcv.sb_mb->m_nextpkt;
1633 else
1634 nextrecord = so->so_rcv.sb_mb;
1635 orig_resid = 0;
1636 }
1637 if (m != NULL) {
1638 if ((flags & MSG_PEEK) == 0) {
1639 KASSERT(m->m_nextpkt == nextrecord,
1640 ("soreceive: post-control, nextrecord !sync"));
1641 if (nextrecord == NULL) {
1642 KASSERT(so->so_rcv.sb_mb == m,
1643 ("soreceive: post-control, sb_mb!=m"));
1644 KASSERT(so->so_rcv.sb_lastrecord == m,
1645 ("soreceive: post-control, lastrecord!=m"));
1646 }
1647 }
1648 type = m->m_type;
1649 if (type == MT_OOBDATA)
1650 flags |= MSG_OOB;
1651 } else {
1652 if ((flags & MSG_PEEK) == 0) {
1653 KASSERT(so->so_rcv.sb_mb == nextrecord,
1654 ("soreceive: sb_mb != nextrecord"));
1655 if (so->so_rcv.sb_mb == NULL) {
1656 KASSERT(so->so_rcv.sb_lastrecord == NULL,
1657			    ("soreceive: sb_lastrecord != NULL"));
1658 }
1659 }
1660 }
1661 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1662 SBLASTRECORDCHK(&so->so_rcv);
1663 SBLASTMBUFCHK(&so->so_rcv);
1664
1665 /*
1666 * Now continue to read any data mbufs off of the head of the socket
1667 * buffer until the read request is satisfied. Note that 'type' is
1668 * used to store the type of any mbuf reads that have happened so far
1669 * such that soreceive() can stop reading if the type changes, which
1670 * causes soreceive() to return only one of regular data and inline
1671 * out-of-band data in a single socket receive operation.
1672 */
1673 moff = 0;
1674 offset = 0;
1675 while (m != NULL && uio->uio_resid > 0 && error == 0) {
1676 /*
1677 * If the type of mbuf has changed since the last mbuf
1678 * examined ('type'), end the receive operation.
1679 */
1680 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1681 if (m->m_type == MT_OOBDATA) {
1682 if (type != MT_OOBDATA)
1683 break;
1684 } else if (type == MT_OOBDATA)
1685 break;
1686 else
1687 KASSERT(m->m_type == MT_DATA,
1688 ("m->m_type == %d", m->m_type));
1689 so->so_rcv.sb_state &= ~SBS_RCVATMARK;
1690 len = uio->uio_resid;
1691 if (so->so_oobmark && len > so->so_oobmark - offset)
1692 len = so->so_oobmark - offset;
1693 if (len > m->m_len - moff)
1694 len = m->m_len - moff;
1695 /*
1696 * If mp is set, just pass back the mbufs. Otherwise copy
1697		 * them out via the uio, then free.  The sockbuf must be
1698		 * consistent here (sb_mb points to the current mbuf, which
1699		 * in turn points to the next record) when we drop the lock;
1700		 * we must note any additions made while it was dropped.
1701 */
1702 if (mp == NULL) {
1703 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1704 SBLASTRECORDCHK(&so->so_rcv);
1705 SBLASTMBUFCHK(&so->so_rcv);
1706 SOCKBUF_UNLOCK(&so->so_rcv);
1707#ifdef ZERO_COPY_SOCKETS
1708 if (so_zero_copy_receive) {
1709 int disposable;
1710
1711 if ((m->m_flags & M_EXT)
1712 && (m->m_ext.ext_type == EXT_DISPOSABLE))
1713 disposable = 1;
1714 else
1715 disposable = 0;
1716
1717 error = uiomoveco(mtod(m, char *) + moff,
1718 (int)len, uio,
1719 disposable);
1720 } else
1721#endif /* ZERO_COPY_SOCKETS */
1722 error = uiomove(mtod(m, char *) + moff, (int)len, uio);
1723 SOCKBUF_LOCK(&so->so_rcv);
1724 if (error) {
1725 /*
1726 * The MT_SONAME mbuf has already been removed
1727 * from the record, so it is necessary to
1728 * remove the data mbufs, if any, to preserve
1729 * the invariant in the case of PR_ADDR that
1730 * requires MT_SONAME mbufs at the head of
1731 * each record.
1732 */
1733 if (m && pr->pr_flags & PR_ATOMIC &&
1734 ((flags & MSG_PEEK) == 0))
1735 (void)sbdroprecord_locked(&so->so_rcv);
1736 SOCKBUF_UNLOCK(&so->so_rcv);
1737 goto release;
1738 }
1739 } else
1740 uio->uio_resid -= len;
1741 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1742 if (len == m->m_len - moff) {
1743 if (m->m_flags & M_EOR)
1744 flags |= MSG_EOR;
1745 if (flags & MSG_PEEK) {
1746 m = m->m_next;
1747 moff = 0;
1748 } else {
1749 nextrecord = m->m_nextpkt;
1750 sbfree(&so->so_rcv, m);
1751 if (mp != NULL) {
1752 *mp = m;
1753 mp = &m->m_next;
1754 so->so_rcv.sb_mb = m = m->m_next;
1755 *mp = NULL;
1756 } else {
1757 so->so_rcv.sb_mb = m_free(m);
1758 m = so->so_rcv.sb_mb;
1759 }
1760 sockbuf_pushsync(&so->so_rcv, nextrecord);
1761 SBLASTRECORDCHK(&so->so_rcv);
1762 SBLASTMBUFCHK(&so->so_rcv);
1763 }
1764 } else {
1765 if (flags & MSG_PEEK)
1766 moff += len;
1767 else {
1768 if (mp != NULL) {
1769 int copy_flag;
1770
1771 if (flags & MSG_DONTWAIT)
1772 copy_flag = M_DONTWAIT;
1773 else
1774 copy_flag = M_TRYWAIT;
1775 if (copy_flag == M_TRYWAIT)
1776 SOCKBUF_UNLOCK(&so->so_rcv);
1777 *mp = m_copym(m, 0, len, copy_flag);
1778 if (copy_flag == M_TRYWAIT)
1779 SOCKBUF_LOCK(&so->so_rcv);
1780 if (*mp == NULL) {
1781 /*
1782 * m_copym() couldn't
1783 * allocate an mbuf. Adjust
1784 * uio_resid back (it was
1785 * adjusted down by len
1786 * bytes, which we didn't end
1787 * up "copying" over).
1788 */
1789 uio->uio_resid += len;
1790 break;
1791 }
1792 }
1793 m->m_data += len;
1794 m->m_len -= len;
1795 so->so_rcv.sb_cc -= len;
1796 }
1797 }
1798 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1799 if (so->so_oobmark) {
1800 if ((flags & MSG_PEEK) == 0) {
1801 so->so_oobmark -= len;
1802 if (so->so_oobmark == 0) {
1803 so->so_rcv.sb_state |= SBS_RCVATMARK;
1804 break;
1805 }
1806 } else {
1807 offset += len;
1808 if (offset == so->so_oobmark)
1809 break;
1810 }
1811 }
1812 if (flags & MSG_EOR)
1813 break;
1814 /*
1815		 * If the MSG_WAITALL flag is set (for a non-atomic socket), we
1816 * must not quit until "uio->uio_resid == 0" or an error
1817 * termination. If a signal/timeout occurs, return with a
1818 * short count but without error. Keep sockbuf locked
1819 * against other readers.
1820 */
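		/*
		 * For instance, if a signal interrupts sbwait() below after
		 * some data has already been copied to the uio, we return
		 * EINTR/ERESTART with uio_resid partially consumed; the
		 * syscall layer (e.g. recvit()) is then expected to fold
		 * that into a short count, per the contract stated above.
		 */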
1821 while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
1822 !sosendallatonce(so) && nextrecord == NULL) {
1823 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1824 if (so->so_error || so->so_rcv.sb_state & SBS_CANTRCVMORE)
1825 break;
1826 /*
1827 * Notify the protocol that some data has been
1828 * drained before blocking.
1829 */
1830 if (pr->pr_flags & PR_WANTRCVD) {
1831 SOCKBUF_UNLOCK(&so->so_rcv);
1832 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
1833 SOCKBUF_LOCK(&so->so_rcv);
1834 }
1835 SBLASTRECORDCHK(&so->so_rcv);
1836 SBLASTMBUFCHK(&so->so_rcv);
1837 error = sbwait(&so->so_rcv);
1838 if (error) {
1839 SOCKBUF_UNLOCK(&so->so_rcv);
1840 goto release;
1841 }
1842 m = so->so_rcv.sb_mb;
1843 if (m != NULL)
1844 nextrecord = m->m_nextpkt;
1845 }
1846 }
1847
1848 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1849 if (m != NULL && pr->pr_flags & PR_ATOMIC) {
1850 flags |= MSG_TRUNC;
1851 if ((flags & MSG_PEEK) == 0)
1852 (void) sbdroprecord_locked(&so->so_rcv);
1853 }
1854 if ((flags & MSG_PEEK) == 0) {
1855 if (m == NULL) {
1856 /*
1857 * First part is an inline SB_EMPTY_FIXUP(). Second
1858 * part makes sure sb_lastrecord is up-to-date if
1859 * there is still data in the socket buffer.
1860 */
1861 so->so_rcv.sb_mb = nextrecord;
1862 if (so->so_rcv.sb_mb == NULL) {
1863 so->so_rcv.sb_mbtail = NULL;
1864 so->so_rcv.sb_lastrecord = NULL;
1865 } else if (nextrecord->m_nextpkt == NULL)
1866 so->so_rcv.sb_lastrecord = nextrecord;
1867 }
1868 SBLASTRECORDCHK(&so->so_rcv);
1869 SBLASTMBUFCHK(&so->so_rcv);
1870 /*
1871 * If soreceive() is being done from the socket callback,
1872		 * then we need not generate an ACK to the peer to update the
1873		 * window, since the ACK will be generated on return to TCP.
1874 */
1875 if (!(flags & MSG_SOCALLBCK) &&
1876 (pr->pr_flags & PR_WANTRCVD)) {
1877 SOCKBUF_UNLOCK(&so->so_rcv);
1878 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
1879 SOCKBUF_LOCK(&so->so_rcv);
1880 }
1881 }
1882 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1883 if (orig_resid == uio->uio_resid && orig_resid &&
1884 (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) {
1885 SOCKBUF_UNLOCK(&so->so_rcv);
1886 goto restart;
1887 }
1888 SOCKBUF_UNLOCK(&so->so_rcv);
1889
1890 if (flagsp != NULL)
1891 *flagsp |= flags;
1892release:
1893 sbunlock(&so->so_rcv);
1894 return (error);
1895}
1896
1897int
1394{
1395 struct mbuf *m, **mp;
1396 int flags, len, error, offset;
1397 struct protosw *pr = so->so_proto;
1398 struct mbuf *nextrecord;
1399 int moff, type = 0;
1400 int orig_resid = uio->uio_resid;
1401
1402 mp = mp0;
1403 if (psa != NULL)
1404 *psa = NULL;
1405 if (controlp != NULL)
1406 *controlp = NULL;
1407 if (flagsp != NULL)
1408 flags = *flagsp &~ MSG_EOR;
1409 else
1410 flags = 0;
1411 if (flags & MSG_OOB)
1412 return (soreceive_rcvoob(so, uio, flags));
1413 if (mp != NULL)
1414 *mp = NULL;
1415 if ((pr->pr_flags & PR_WANTRCVD) && (so->so_state & SS_ISCONFIRMING)
1416 && uio->uio_resid)
1417 (*pr->pr_usrreqs->pru_rcvd)(so, 0);
1418
1419 error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
1420 if (error)
1421 return (error);
1422
1423restart:
1424 SOCKBUF_LOCK(&so->so_rcv);
1425 m = so->so_rcv.sb_mb;
1426 /*
1427 * If we have less data than requested, block awaiting more (subject
1428 * to any timeout) if:
1429 * 1. the current count is less than the low water mark, or
1430 * 2. MSG_WAITALL is set, and it is possible to do the entire
1431 * receive operation at once if we block (resid <= hiwat).
1432 * 3. MSG_DONTWAIT is not set
1433 * If MSG_WAITALL is set but resid is larger than the receive buffer,
1434 * we have to do the receive in sections, and thus risk returning a
1435 * short count if a timeout or signal occurs after we start.
1436 */
1437 if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
1438 so->so_rcv.sb_cc < uio->uio_resid) &&
1439 (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
1440 ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
1441 m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
1442 KASSERT(m != NULL || !so->so_rcv.sb_cc,
1443 ("receive: m == %p so->so_rcv.sb_cc == %u",
1444 m, so->so_rcv.sb_cc));
1445 if (so->so_error) {
1446 if (m != NULL)
1447 goto dontblock;
1448 error = so->so_error;
1449 if ((flags & MSG_PEEK) == 0)
1450 so->so_error = 0;
1451 SOCKBUF_UNLOCK(&so->so_rcv);
1452 goto release;
1453 }
1454 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1455 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
1456 if (m == NULL) {
1457 SOCKBUF_UNLOCK(&so->so_rcv);
1458 goto release;
1459 } else
1460 goto dontblock;
1461 }
1462 for (; m != NULL; m = m->m_next)
1463 if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
1464 m = so->so_rcv.sb_mb;
1465 goto dontblock;
1466 }
1467 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
1468 (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
1469 SOCKBUF_UNLOCK(&so->so_rcv);
1470 error = ENOTCONN;
1471 goto release;
1472 }
1473 if (uio->uio_resid == 0) {
1474 SOCKBUF_UNLOCK(&so->so_rcv);
1475 goto release;
1476 }
1477 if ((so->so_state & SS_NBIO) ||
1478 (flags & (MSG_DONTWAIT|MSG_NBIO))) {
1479 SOCKBUF_UNLOCK(&so->so_rcv);
1480 error = EWOULDBLOCK;
1481 goto release;
1482 }
1483 SBLASTRECORDCHK(&so->so_rcv);
1484 SBLASTMBUFCHK(&so->so_rcv);
1485 error = sbwait(&so->so_rcv);
1486 SOCKBUF_UNLOCK(&so->so_rcv);
1487 if (error)
1488 goto release;
1489 goto restart;
1490 }
1491dontblock:
1492 /*
1493 * From this point onward, we maintain 'nextrecord' as a cache of the
1494 * pointer to the next record in the socket buffer. We must keep the
1495 * various socket buffer pointers and local stack versions of the
1496 * pointers in sync, pushing out modifications before dropping the
1497 * socket buffer mutex, and re-reading them when picking it up.
1498 *
1499 * Otherwise, we will race with the network stack appending new data
1500 * or records onto the socket buffer by using inconsistent/stale
1501 * versions of the field, possibly resulting in socket buffer
1502 * corruption.
1503 *
1504 * By holding the high-level sblock(), we prevent simultaneous
1505 * readers from pulling off the front of the socket buffer.
1506 */
1507 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1508 if (uio->uio_td)
1509 uio->uio_td->td_proc->p_stats->p_ru.ru_msgrcv++;
1510 KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb"));
1511 SBLASTRECORDCHK(&so->so_rcv);
1512 SBLASTMBUFCHK(&so->so_rcv);
1513 nextrecord = m->m_nextpkt;
1514 if (pr->pr_flags & PR_ADDR) {
1515 KASSERT(m->m_type == MT_SONAME,
1516 ("m->m_type == %d", m->m_type));
1517 orig_resid = 0;
1518 if (psa != NULL)
1519 *psa = sodupsockaddr(mtod(m, struct sockaddr *),
1520 M_NOWAIT);
1521 if (flags & MSG_PEEK) {
1522 m = m->m_next;
1523 } else {
1524 sbfree(&so->so_rcv, m);
1525 so->so_rcv.sb_mb = m_free(m);
1526 m = so->so_rcv.sb_mb;
1527 sockbuf_pushsync(&so->so_rcv, nextrecord);
1528 }
1529 }
1530
1531 /*
1532 * Process one or more MT_CONTROL mbufs present before any data mbufs
1533 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
1534 * just copy the data; if !MSG_PEEK, we call into the protocol to
1535 * perform externalization (or freeing if controlp == NULL).
1536 */
1537 if (m != NULL && m->m_type == MT_CONTROL) {
1538 struct mbuf *cm = NULL, *cmn;
1539 struct mbuf **cme = &cm;
1540
1541 do {
1542 if (flags & MSG_PEEK) {
1543 if (controlp != NULL) {
1544 *controlp = m_copy(m, 0, m->m_len);
1545 controlp = &(*controlp)->m_next;
1546 }
1547 m = m->m_next;
1548 } else {
1549 sbfree(&so->so_rcv, m);
1550 so->so_rcv.sb_mb = m->m_next;
1551 m->m_next = NULL;
1552 *cme = m;
1553 cme = &(*cme)->m_next;
1554 m = so->so_rcv.sb_mb;
1555 }
1556 } while (m != NULL && m->m_type == MT_CONTROL);
1557 if ((flags & MSG_PEEK) == 0)
1558 sockbuf_pushsync(&so->so_rcv, nextrecord);
1559 while (cm != NULL) {
1560 cmn = cm->m_next;
1561 cm->m_next = NULL;
1562 if (pr->pr_domain->dom_externalize != NULL) {
1563 SOCKBUF_UNLOCK(&so->so_rcv);
1564 error = (*pr->pr_domain->dom_externalize)
1565 (cm, controlp);
1566 SOCKBUF_LOCK(&so->so_rcv);
1567 } else if (controlp != NULL)
1568 *controlp = cm;
1569 else
1570 m_freem(cm);
1571 if (controlp != NULL) {
1572 orig_resid = 0;
1573 while (*controlp != NULL)
1574 controlp = &(*controlp)->m_next;
1575 }
1576 cm = cmn;
1577 }
1578 if (m != NULL)
1579 nextrecord = so->so_rcv.sb_mb->m_nextpkt;
1580 else
1581 nextrecord = so->so_rcv.sb_mb;
1582 orig_resid = 0;
1583 }
1584 if (m != NULL) {
1585 if ((flags & MSG_PEEK) == 0) {
1586 KASSERT(m->m_nextpkt == nextrecord,
1587 ("soreceive: post-control, nextrecord !sync"));
1588 if (nextrecord == NULL) {
1589 KASSERT(so->so_rcv.sb_mb == m,
1590 ("soreceive: post-control, sb_mb!=m"));
1591 KASSERT(so->so_rcv.sb_lastrecord == m,
1592 ("soreceive: post-control, lastrecord!=m"));
1593 }
1594 }
1595 type = m->m_type;
1596 if (type == MT_OOBDATA)
1597 flags |= MSG_OOB;
1598 } else {
1599 if ((flags & MSG_PEEK) == 0) {
1600 KASSERT(so->so_rcv.sb_mb == nextrecord,
1601 ("soreceive: sb_mb != nextrecord"));
1602 if (so->so_rcv.sb_mb == NULL) {
1603 KASSERT(so->so_rcv.sb_lastrecord == NULL,
1604 ("soreceive: sb_lastercord != NULL"));
1605 }
1606 }
1607 }
1608 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1609 SBLASTRECORDCHK(&so->so_rcv);
1610 SBLASTMBUFCHK(&so->so_rcv);
1611
1612 /*
1613 * Now continue to read any data mbufs off of the head of the socket
1614 * buffer until the read request is satisfied. Note that 'type' is
1615 * used to store the type of any mbuf reads that have happened so far
1616 * such that soreceive() can stop reading if the type changes, which
1617 * causes soreceive() to return only one of regular data and inline
1618 * out-of-band data in a single socket receive operation.
1619 */
1620 moff = 0;
1621 offset = 0;
1622 while (m != NULL && uio->uio_resid > 0 && error == 0) {
1623 /*
1624 * If the type of mbuf has changed since the last mbuf
1625 * examined ('type'), end the receive operation.
1626 */
1627 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1628 if (m->m_type == MT_OOBDATA) {
1629 if (type != MT_OOBDATA)
1630 break;
1631 } else if (type == MT_OOBDATA)
1632 break;
1633 else
1634 KASSERT(m->m_type == MT_DATA,
1635 ("m->m_type == %d", m->m_type));
1636 so->so_rcv.sb_state &= ~SBS_RCVATMARK;
1637 len = uio->uio_resid;
1638 if (so->so_oobmark && len > so->so_oobmark - offset)
1639 len = so->so_oobmark - offset;
1640 if (len > m->m_len - moff)
1641 len = m->m_len - moff;
1642 /*
1643 * If mp is set, just pass back the mbufs. Otherwise copy
1644 * them out via the uio, then free. Sockbuf must be
1645 * consistent here (points to current mbuf, it points to next
1646 * record) when we drop priority; we must note any additions
1647 * to the sockbuf when we block interrupts again.
1648 */
1649 if (mp == NULL) {
1650 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1651 SBLASTRECORDCHK(&so->so_rcv);
1652 SBLASTMBUFCHK(&so->so_rcv);
1653 SOCKBUF_UNLOCK(&so->so_rcv);
1654#ifdef ZERO_COPY_SOCKETS
1655 if (so_zero_copy_receive) {
1656 int disposable;
1657
1658 if ((m->m_flags & M_EXT)
1659 && (m->m_ext.ext_type == EXT_DISPOSABLE))
1660 disposable = 1;
1661 else
1662 disposable = 0;
1663
1664 error = uiomoveco(mtod(m, char *) + moff,
1665 (int)len, uio,
1666 disposable);
1667 } else
1668#endif /* ZERO_COPY_SOCKETS */
1669 error = uiomove(mtod(m, char *) + moff, (int)len, uio);
1670 SOCKBUF_LOCK(&so->so_rcv);
1671 if (error) {
1672 /*
1673 * The MT_SONAME mbuf has already been removed
1674 * from the record, so it is necessary to
1675 * remove the data mbufs, if any, to preserve
1676 * the invariant in the case of PR_ADDR that
1677 * requires MT_SONAME mbufs at the head of
1678 * each record.
1679 */
1680 if (m && pr->pr_flags & PR_ATOMIC &&
1681 ((flags & MSG_PEEK) == 0))
1682 (void)sbdroprecord_locked(&so->so_rcv);
1683 SOCKBUF_UNLOCK(&so->so_rcv);
1684 goto release;
1685 }
1686 } else
1687 uio->uio_resid -= len;
1688 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1689 if (len == m->m_len - moff) {
1690 if (m->m_flags & M_EOR)
1691 flags |= MSG_EOR;
1692 if (flags & MSG_PEEK) {
1693 m = m->m_next;
1694 moff = 0;
1695 } else {
1696 nextrecord = m->m_nextpkt;
1697 sbfree(&so->so_rcv, m);
1698 if (mp != NULL) {
1699 *mp = m;
1700 mp = &m->m_next;
1701 so->so_rcv.sb_mb = m = m->m_next;
1702 *mp = NULL;
1703 } else {
1704 so->so_rcv.sb_mb = m_free(m);
1705 m = so->so_rcv.sb_mb;
1706 }
1707 sockbuf_pushsync(&so->so_rcv, nextrecord);
1708 SBLASTRECORDCHK(&so->so_rcv);
1709 SBLASTMBUFCHK(&so->so_rcv);
1710 }
1711 } else {
1712 if (flags & MSG_PEEK)
1713 moff += len;
1714 else {
1715 if (mp != NULL) {
1716 int copy_flag;
1717
1718 if (flags & MSG_DONTWAIT)
1719 copy_flag = M_DONTWAIT;
1720 else
1721 copy_flag = M_TRYWAIT;
1722 if (copy_flag == M_TRYWAIT)
1723 SOCKBUF_UNLOCK(&so->so_rcv);
1724 *mp = m_copym(m, 0, len, copy_flag);
1725 if (copy_flag == M_TRYWAIT)
1726 SOCKBUF_LOCK(&so->so_rcv);
1727 if (*mp == NULL) {
1728 /*
1729 * m_copym() couldn't
1730 * allocate an mbuf. Adjust
1731 * uio_resid back (it was
1732 * adjusted down by len
1733 * bytes, which we didn't end
1734 * up "copying" over).
1735 */
1736 uio->uio_resid += len;
1737 break;
1738 }
1739 }
1740 m->m_data += len;
1741 m->m_len -= len;
1742 so->so_rcv.sb_cc -= len;
1743 }
1744 }
1745 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1746 if (so->so_oobmark) {
1747 if ((flags & MSG_PEEK) == 0) {
1748 so->so_oobmark -= len;
1749 if (so->so_oobmark == 0) {
1750 so->so_rcv.sb_state |= SBS_RCVATMARK;
1751 break;
1752 }
1753 } else {
1754 offset += len;
1755 if (offset == so->so_oobmark)
1756 break;
1757 }
1758 }
1759 if (flags & MSG_EOR)
1760 break;
1761 /*
1762		 * If the MSG_WAITALL flag is set (and the socket is non-atomic),
1763		 * we must not quit until uio->uio_resid == 0 or the transfer
1764		 * terminates with an error.  If a signal/timeout occurs, return with a
1765 * short count but without error. Keep sockbuf locked
1766 * against other readers.
1767 */
1768 while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
1769 !sosendallatonce(so) && nextrecord == NULL) {
1770 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1771 if (so->so_error || so->so_rcv.sb_state & SBS_CANTRCVMORE)
1772 break;
1773 /*
1774 * Notify the protocol that some data has been
1775 * drained before blocking.
1776 */
1777 if (pr->pr_flags & PR_WANTRCVD) {
1778 SOCKBUF_UNLOCK(&so->so_rcv);
1779 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
1780 SOCKBUF_LOCK(&so->so_rcv);
1781 }
1782 SBLASTRECORDCHK(&so->so_rcv);
1783 SBLASTMBUFCHK(&so->so_rcv);
1784 error = sbwait(&so->so_rcv);
1785 if (error) {
1786 SOCKBUF_UNLOCK(&so->so_rcv);
1787 goto release;
1788 }
1789 m = so->so_rcv.sb_mb;
1790 if (m != NULL)
1791 nextrecord = m->m_nextpkt;
1792 }
1793 }
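	/*
	 * Editorial sketch (not part of the original source): the MSG_WAITALL
	 * handling above lets a stream consumer treat a short count as EOF,
	 * an error, or an interrupted wait rather than partial availability;
	 * `struct hdr' below is a hypothetical fixed-size record header:
	 *
	 *	struct hdr h;
	 *	ssize_t n;
	 *
	 *	n = recv(s, &h, sizeof(h), MSG_WAITALL);
	 *	if (n != sizeof(h))
	 *		handle EOF, error, or interruption
	 */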
1794
1795 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1796 if (m != NULL && pr->pr_flags & PR_ATOMIC) {
1797 flags |= MSG_TRUNC;
1798 if ((flags & MSG_PEEK) == 0)
1799 (void) sbdroprecord_locked(&so->so_rcv);
1800 }
1801 if ((flags & MSG_PEEK) == 0) {
1802 if (m == NULL) {
1803 /*
1804 * First part is an inline SB_EMPTY_FIXUP(). Second
1805 * part makes sure sb_lastrecord is up-to-date if
1806 * there is still data in the socket buffer.
1807 */
1808 so->so_rcv.sb_mb = nextrecord;
1809 if (so->so_rcv.sb_mb == NULL) {
1810 so->so_rcv.sb_mbtail = NULL;
1811 so->so_rcv.sb_lastrecord = NULL;
1812 } else if (nextrecord->m_nextpkt == NULL)
1813 so->so_rcv.sb_lastrecord = nextrecord;
1814 }
1815 SBLASTRECORDCHK(&so->so_rcv);
1816 SBLASTMBUFCHK(&so->so_rcv);
1817 /*
1818 * If soreceive() is being done from the socket callback,
1819	 * then we need not generate an ACK to the peer to update the
1820	 * window, since an ACK will be generated on return to TCP.
1821 */
1822 if (!(flags & MSG_SOCALLBCK) &&
1823 (pr->pr_flags & PR_WANTRCVD)) {
1824 SOCKBUF_UNLOCK(&so->so_rcv);
1825 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
1826 SOCKBUF_LOCK(&so->so_rcv);
1827 }
1828 }
1829 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1830 if (orig_resid == uio->uio_resid && orig_resid &&
1831 (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) {
1832 SOCKBUF_UNLOCK(&so->so_rcv);
1833 goto restart;
1834 }
1835 SOCKBUF_UNLOCK(&so->so_rcv);
1836
1837 if (flagsp != NULL)
1838 *flagsp |= flags;
1839release:
1840 sbunlock(&so->so_rcv);
1841 return (error);
1842}
1843
1844int
1845soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
1846 struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
1905{
1906
1907 /* XXXRW: Temporary debugging. */
1908 KASSERT(so->so_proto->pr_usrreqs->pru_soreceive != soreceive,
1909 ("soreceive: protocol calls soreceive"));
1910
1911 return (so->so_proto->pr_usrreqs->pru_soreceive(so, psa, uio, mp0,
1912 controlp, flagsp));
1913}
1914
1915int
1847{
1848
1849 /* XXXRW: Temporary debugging. */
1850 KASSERT(so->so_proto->pr_usrreqs->pru_soreceive != soreceive,
1851 ("soreceive: protocol calls soreceive"));
1852
1853 return (so->so_proto->pr_usrreqs->pru_soreceive(so, psa, uio, mp0,
1854 controlp, flagsp));
1855}
1856
1857int
1858soshutdown(struct socket *so, int how)
1919{
1920 struct protosw *pr = so->so_proto;
1921
1922 if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
1923 return (EINVAL);
1924
1925 if (how != SHUT_WR)
1926 sorflush(so);
1927 if (how != SHUT_RD)
1928 return ((*pr->pr_usrreqs->pru_shutdown)(so));
1929 return (0);
1930}
1931
1932void
1859{
1860 struct protosw *pr = so->so_proto;
1861
1862 if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
1863 return (EINVAL);
1864
1865 if (how != SHUT_WR)
1866 sorflush(so);
1867 if (how != SHUT_RD)
1868 return ((*pr->pr_usrreqs->pru_shutdown)(so));
1869 return (0);
1870}
1871
1872void
1873sorflush(struct socket *so)
1935{
1936 struct sockbuf *sb = &so->so_rcv;
1937 struct protosw *pr = so->so_proto;
1938 struct sockbuf asb;
1939
1940 /*
1941 * XXXRW: This is quite ugly. Previously, this code made a copy of
1942 * the socket buffer, then zero'd the original to clear the buffer
1943 * fields. However, with mutexes in the socket buffer, this causes
1944 * problems. We only clear the zeroable bits of the original;
1945 * however, we have to initialize and destroy the mutex in the copy
1946 * so that dom_dispose() and sbrelease() can lock t as needed.
1947 */
1948 (void) sblock(sb, M_WAITOK);
1949 SOCKBUF_LOCK(sb);
1950 sb->sb_flags |= SB_NOINTR;
1951 socantrcvmore_locked(so);
1952 /*
1953 * Invalidate/clear most of the sockbuf structure, but leave selinfo
1954 * and mutex data unchanged.
1955 */
1956 SOCKBUF_LOCK(sb);
1957 bzero(&asb, offsetof(struct sockbuf, sb_startzero));
1958 bcopy(&sb->sb_startzero, &asb.sb_startzero,
1959 sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
1960 bzero(&sb->sb_startzero,
1961 sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
1962 SOCKBUF_UNLOCK(sb);
1963 sbunlock(sb);
1964
1965 SOCKBUF_LOCK_INIT(&asb, "so_rcv");
1966 if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
1967 (*pr->pr_domain->dom_dispose)(asb.sb_mb);
1968 sbrelease(&asb, so);
1969 SOCKBUF_LOCK_DESTROY(&asb);
1970}
1971
1972/*
1973 * Perhaps this routine, and sooptcopyout(), below, ought to come in an
1974 * additional variant to handle the case where the option value needs to be
1975 * some kind of integer, but not a specific size. In addition to their use
1976 * here, these functions are also called by the protocol-level pr_ctloutput()
1977 * routines.
1978 */
1979int
1874{
1875 struct sockbuf *sb = &so->so_rcv;
1876 struct protosw *pr = so->so_proto;
1877 struct sockbuf asb;
1878
1879 /*
1880 * XXXRW: This is quite ugly. Previously, this code made a copy of
1881 * the socket buffer, then zero'd the original to clear the buffer
1882 * fields. However, with mutexes in the socket buffer, this causes
1883 * problems. We only clear the zeroable bits of the original;
1884 * however, we have to initialize and destroy the mutex in the copy
1885	 * so that dom_dispose() and sbrelease() can lock it as needed.
1886 */
1887 (void) sblock(sb, M_WAITOK);
1888 SOCKBUF_LOCK(sb);
1889 sb->sb_flags |= SB_NOINTR;
1890 socantrcvmore_locked(so);
1891 /*
1892 * Invalidate/clear most of the sockbuf structure, but leave selinfo
1893 * and mutex data unchanged.
1894 */
1895 SOCKBUF_LOCK(sb);
1896 bzero(&asb, offsetof(struct sockbuf, sb_startzero));
1897 bcopy(&sb->sb_startzero, &asb.sb_startzero,
1898 sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
1899 bzero(&sb->sb_startzero,
1900 sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
1901 SOCKBUF_UNLOCK(sb);
1902 sbunlock(sb);
1903
1904 SOCKBUF_LOCK_INIT(&asb, "so_rcv");
1905 if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
1906 (*pr->pr_domain->dom_dispose)(asb.sb_mb);
1907 sbrelease(&asb, so);
1908 SOCKBUF_LOCK_DESTROY(&asb);
1909}
1910
1911/*
1912 * Perhaps this routine, and sooptcopyout(), below, ought to come in an
1913 * additional variant to handle the case where the option value needs to be
1914 * some kind of integer, but not a specific size. In addition to their use
1915 * here, these functions are also called by the protocol-level pr_ctloutput()
1916 * routines.
1917 */
1918int
1919sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
1985{
1986 size_t valsize;
1987
1988 /*
1989 * If the user gives us more than we wanted, we ignore it, but if we
1990 * don't get the minimum length the caller wants, we return EINVAL.
1991 * On success, sopt->sopt_valsize is set to however much we actually
1992 * retrieved.
1993 */
1994 if ((valsize = sopt->sopt_valsize) < minlen)
1995 return EINVAL;
1996 if (valsize > len)
1997 sopt->sopt_valsize = valsize = len;
1998
1999 if (sopt->sopt_td != NULL)
2000 return (copyin(sopt->sopt_val, buf, valsize));
2001
2002 bcopy(sopt->sopt_val, buf, valsize);
2003 return (0);
2004}
2005
2006/*
2007 * Kernel version of setsockopt(2).
2008 *
2009 * XXX: optlen is size_t, not socklen_t
2010 */
2011int
2012so_setsockopt(struct socket *so, int level, int optname, void *optval,
2013 size_t optlen)
2014{
2015 struct sockopt sopt;
2016
2017 sopt.sopt_level = level;
2018 sopt.sopt_name = optname;
2019 sopt.sopt_dir = SOPT_SET;
2020 sopt.sopt_val = optval;
2021 sopt.sopt_valsize = optlen;
2022 sopt.sopt_td = NULL;
2023 return (sosetopt(so, &sopt));
2024}
2025
2026int
1920{
1921 size_t valsize;
1922
1923 /*
1924 * If the user gives us more than we wanted, we ignore it, but if we
1925 * don't get the minimum length the caller wants, we return EINVAL.
1926 * On success, sopt->sopt_valsize is set to however much we actually
1927 * retrieved.
1928 */
1929 if ((valsize = sopt->sopt_valsize) < minlen)
1930 return EINVAL;
1931 if (valsize > len)
1932 sopt->sopt_valsize = valsize = len;
1933
1934 if (sopt->sopt_td != NULL)
1935 return (copyin(sopt->sopt_val, buf, valsize));
1936
1937 bcopy(sopt->sopt_val, buf, valsize);
1938 return (0);
1939}
1940
1941/*
1942 * Kernel version of setsockopt(2).
1943 *
1944 * XXX: optlen is size_t, not socklen_t
1945 */
1946int
1947so_setsockopt(struct socket *so, int level, int optname, void *optval,
1948 size_t optlen)
1949{
1950 struct sockopt sopt;
1951
1952 sopt.sopt_level = level;
1953 sopt.sopt_name = optname;
1954 sopt.sopt_dir = SOPT_SET;
1955 sopt.sopt_val = optval;
1956 sopt.sopt_valsize = optlen;
1957 sopt.sopt_td = NULL;
1958 return (sosetopt(so, &sopt));
1959}
1960
1961int
1962sosetopt(struct socket *so, struct sockopt *sopt)
2030{
2031 int error, optval;
2032 struct linger l;
2033 struct timeval tv;
2034 u_long val;
2035#ifdef MAC
2036 struct mac extmac;
2037#endif
2038
2039 error = 0;
2040 if (sopt->sopt_level != SOL_SOCKET) {
2041 if (so->so_proto && so->so_proto->pr_ctloutput)
2042 return ((*so->so_proto->pr_ctloutput)
2043 (so, sopt));
2044 error = ENOPROTOOPT;
2045 } else {
2046 switch (sopt->sopt_name) {
2047#ifdef INET
2048 case SO_ACCEPTFILTER:
2049 error = do_setopt_accept_filter(so, sopt);
2050 if (error)
2051 goto bad;
2052 break;
2053#endif
2054 case SO_LINGER:
2055 error = sooptcopyin(sopt, &l, sizeof l, sizeof l);
2056 if (error)
2057 goto bad;
2058
2059 SOCK_LOCK(so);
2060 so->so_linger = l.l_linger;
2061 if (l.l_onoff)
2062 so->so_options |= SO_LINGER;
2063 else
2064 so->so_options &= ~SO_LINGER;
2065 SOCK_UNLOCK(so);
2066 break;
2067
2068 case SO_DEBUG:
2069 case SO_KEEPALIVE:
2070 case SO_DONTROUTE:
2071 case SO_USELOOPBACK:
2072 case SO_BROADCAST:
2073 case SO_REUSEADDR:
2074 case SO_REUSEPORT:
2075 case SO_OOBINLINE:
2076 case SO_TIMESTAMP:
2077 case SO_BINTIME:
2078 case SO_NOSIGPIPE:
2079 error = sooptcopyin(sopt, &optval, sizeof optval,
2080 sizeof optval);
2081 if (error)
2082 goto bad;
2083 SOCK_LOCK(so);
2084 if (optval)
2085 so->so_options |= sopt->sopt_name;
2086 else
2087 so->so_options &= ~sopt->sopt_name;
2088 SOCK_UNLOCK(so);
2089 break;
2090
2091 case SO_SNDBUF:
2092 case SO_RCVBUF:
2093 case SO_SNDLOWAT:
2094 case SO_RCVLOWAT:
2095 error = sooptcopyin(sopt, &optval, sizeof optval,
2096 sizeof optval);
2097 if (error)
2098 goto bad;
2099
2100 /*
2101 * Values < 1 make no sense for any of these options,
2102 * so disallow them.
2103 */
2104 if (optval < 1) {
2105 error = EINVAL;
2106 goto bad;
2107 }
2108
2109 switch (sopt->sopt_name) {
2110 case SO_SNDBUF:
2111 case SO_RCVBUF:
2112 if (sbreserve(sopt->sopt_name == SO_SNDBUF ?
2113 &so->so_snd : &so->so_rcv, (u_long)optval,
2114 so, curthread) == 0) {
2115 error = ENOBUFS;
2116 goto bad;
2117 }
2118 (sopt->sopt_name == SO_SNDBUF ? &so->so_snd :
2119 &so->so_rcv)->sb_flags &= ~SB_AUTOSIZE;
2120 break;
2121
2122 /*
2123 * Make sure the low-water is never greater than the
2124 * high-water.
2125 */
2126 case SO_SNDLOWAT:
2127 SOCKBUF_LOCK(&so->so_snd);
2128 so->so_snd.sb_lowat =
2129 (optval > so->so_snd.sb_hiwat) ?
2130 so->so_snd.sb_hiwat : optval;
2131 SOCKBUF_UNLOCK(&so->so_snd);
2132 break;
2133 case SO_RCVLOWAT:
2134 SOCKBUF_LOCK(&so->so_rcv);
2135 so->so_rcv.sb_lowat =
2136 (optval > so->so_rcv.sb_hiwat) ?
2137 so->so_rcv.sb_hiwat : optval;
2138 SOCKBUF_UNLOCK(&so->so_rcv);
2139 break;
2140 }
2141 break;
2142
2143 case SO_SNDTIMEO:
2144 case SO_RCVTIMEO:
2145#ifdef COMPAT_IA32
2146 if (curthread->td_proc->p_sysent == &ia32_freebsd_sysvec) {
2147 struct timeval32 tv32;
2148
2149 error = sooptcopyin(sopt, &tv32, sizeof tv32,
2150 sizeof tv32);
2151 CP(tv32, tv, tv_sec);
2152 CP(tv32, tv, tv_usec);
2153 } else
2154#endif
2155 error = sooptcopyin(sopt, &tv, sizeof tv,
2156 sizeof tv);
2157 if (error)
2158 goto bad;
2159
2160 /* assert(hz > 0); */
2161 if (tv.tv_sec < 0 || tv.tv_sec > INT_MAX / hz ||
2162 tv.tv_usec < 0 || tv.tv_usec >= 1000000) {
2163 error = EDOM;
2164 goto bad;
2165 }
2166 /* assert(tick > 0); */
2167 /* assert(ULONG_MAX - INT_MAX >= 1000000); */
2168 val = (u_long)(tv.tv_sec * hz) + tv.tv_usec / tick;
2169 if (val > INT_MAX) {
2170 error = EDOM;
2171 goto bad;
2172 }
2173 if (val == 0 && tv.tv_usec != 0)
2174 val = 1;
2175
2176 switch (sopt->sopt_name) {
2177 case SO_SNDTIMEO:
2178 so->so_snd.sb_timeo = val;
2179 break;
2180 case SO_RCVTIMEO:
2181 so->so_rcv.sb_timeo = val;
2182 break;
2183 }
2184 break;
2185
2186 case SO_LABEL:
2187#ifdef MAC
2188 error = sooptcopyin(sopt, &extmac, sizeof extmac,
2189 sizeof extmac);
2190 if (error)
2191 goto bad;
2192 error = mac_setsockopt_label(sopt->sopt_td->td_ucred,
2193 so, &extmac);
2194#else
2195 error = EOPNOTSUPP;
2196#endif
2197 break;
2198
2199 default:
2200 error = ENOPROTOOPT;
2201 break;
2202 }
2203 if (error == 0 && so->so_proto != NULL &&
2204 so->so_proto->pr_ctloutput != NULL) {
2205 (void) ((*so->so_proto->pr_ctloutput)
2206 (so, sopt));
2207 }
2208 }
2209bad:
2210 return (error);
2211}
2212
2213/*
2214 * Helper routine for getsockopt.
2215 */
2216int
2217sooptcopyout(struct sockopt *sopt, const void *buf, size_t len)
2218{
2219 int error;
2220 size_t valsize;
2221
2222 error = 0;
2223
2224 /*
2225 * Documented get behavior is that we always return a value, possibly
2226 * truncated to fit in the user's buffer. Traditional behavior is
2227 * that we always tell the user precisely how much we copied, rather
2228 * than something useful like the total amount we had available for
2229 * her. Note that this interface is not idempotent; the entire
2230 * answer must generated ahead of time.
2231 */
2232 valsize = min(len, sopt->sopt_valsize);
2233 sopt->sopt_valsize = valsize;
2234 if (sopt->sopt_val != NULL) {
2235 if (sopt->sopt_td != NULL)
2236 error = copyout(buf, sopt->sopt_val, valsize);
2237 else
2238 bcopy(buf, sopt->sopt_val, valsize);
2239 }
2240 return (error);
2241}
2242
2243int
1963{
1964 int error, optval;
1965 struct linger l;
1966 struct timeval tv;
1967 u_long val;
1968#ifdef MAC
1969 struct mac extmac;
1970#endif
1971
1972 error = 0;
1973 if (sopt->sopt_level != SOL_SOCKET) {
1974 if (so->so_proto && so->so_proto->pr_ctloutput)
1975 return ((*so->so_proto->pr_ctloutput)
1976 (so, sopt));
1977 error = ENOPROTOOPT;
1978 } else {
1979 switch (sopt->sopt_name) {
1980#ifdef INET
1981 case SO_ACCEPTFILTER:
1982 error = do_setopt_accept_filter(so, sopt);
1983 if (error)
1984 goto bad;
1985 break;
1986#endif
1987 case SO_LINGER:
1988 error = sooptcopyin(sopt, &l, sizeof l, sizeof l);
1989 if (error)
1990 goto bad;
1991
1992 SOCK_LOCK(so);
1993 so->so_linger = l.l_linger;
1994 if (l.l_onoff)
1995 so->so_options |= SO_LINGER;
1996 else
1997 so->so_options &= ~SO_LINGER;
1998 SOCK_UNLOCK(so);
1999 break;
2000
2001 case SO_DEBUG:
2002 case SO_KEEPALIVE:
2003 case SO_DONTROUTE:
2004 case SO_USELOOPBACK:
2005 case SO_BROADCAST:
2006 case SO_REUSEADDR:
2007 case SO_REUSEPORT:
2008 case SO_OOBINLINE:
2009 case SO_TIMESTAMP:
2010 case SO_BINTIME:
2011 case SO_NOSIGPIPE:
2012 error = sooptcopyin(sopt, &optval, sizeof optval,
2013 sizeof optval);
2014 if (error)
2015 goto bad;
2016 SOCK_LOCK(so);
2017 if (optval)
2018 so->so_options |= sopt->sopt_name;
2019 else
2020 so->so_options &= ~sopt->sopt_name;
2021 SOCK_UNLOCK(so);
2022 break;
2023
2024 case SO_SNDBUF:
2025 case SO_RCVBUF:
2026 case SO_SNDLOWAT:
2027 case SO_RCVLOWAT:
2028 error = sooptcopyin(sopt, &optval, sizeof optval,
2029 sizeof optval);
2030 if (error)
2031 goto bad;
2032
2033 /*
2034 * Values < 1 make no sense for any of these options,
2035 * so disallow them.
2036 */
2037 if (optval < 1) {
2038 error = EINVAL;
2039 goto bad;
2040 }
2041
2042 switch (sopt->sopt_name) {
2043 case SO_SNDBUF:
2044 case SO_RCVBUF:
2045 if (sbreserve(sopt->sopt_name == SO_SNDBUF ?
2046 &so->so_snd : &so->so_rcv, (u_long)optval,
2047 so, curthread) == 0) {
2048 error = ENOBUFS;
2049 goto bad;
2050 }
2051 (sopt->sopt_name == SO_SNDBUF ? &so->so_snd :
2052 &so->so_rcv)->sb_flags &= ~SB_AUTOSIZE;
2053 break;
2054
2055 /*
2056 * Make sure the low-water is never greater than the
2057 * high-water.
2058 */
2059 case SO_SNDLOWAT:
2060 SOCKBUF_LOCK(&so->so_snd);
2061 so->so_snd.sb_lowat =
2062 (optval > so->so_snd.sb_hiwat) ?
2063 so->so_snd.sb_hiwat : optval;
2064 SOCKBUF_UNLOCK(&so->so_snd);
2065 break;
2066 case SO_RCVLOWAT:
2067 SOCKBUF_LOCK(&so->so_rcv);
2068 so->so_rcv.sb_lowat =
2069 (optval > so->so_rcv.sb_hiwat) ?
2070 so->so_rcv.sb_hiwat : optval;
2071 SOCKBUF_UNLOCK(&so->so_rcv);
2072 break;
2073 }
2074 break;
2075
2076 case SO_SNDTIMEO:
2077 case SO_RCVTIMEO:
2078#ifdef COMPAT_IA32
2079 if (curthread->td_proc->p_sysent == &ia32_freebsd_sysvec) {
2080 struct timeval32 tv32;
2081
2082 error = sooptcopyin(sopt, &tv32, sizeof tv32,
2083 sizeof tv32);
2084 CP(tv32, tv, tv_sec);
2085 CP(tv32, tv, tv_usec);
2086 } else
2087#endif
2088 error = sooptcopyin(sopt, &tv, sizeof tv,
2089 sizeof tv);
2090 if (error)
2091 goto bad;
2092
2093 /* assert(hz > 0); */
2094 if (tv.tv_sec < 0 || tv.tv_sec > INT_MAX / hz ||
2095 tv.tv_usec < 0 || tv.tv_usec >= 1000000) {
2096 error = EDOM;
2097 goto bad;
2098 }
2099 /* assert(tick > 0); */
2100 /* assert(ULONG_MAX - INT_MAX >= 1000000); */
2101 val = (u_long)(tv.tv_sec * hz) + tv.tv_usec / tick;
2102 if (val > INT_MAX) {
2103 error = EDOM;
2104 goto bad;
2105 }
2106 if (val == 0 && tv.tv_usec != 0)
2107 val = 1;
2108
2109 switch (sopt->sopt_name) {
2110 case SO_SNDTIMEO:
2111 so->so_snd.sb_timeo = val;
2112 break;
2113 case SO_RCVTIMEO:
2114 so->so_rcv.sb_timeo = val;
2115 break;
2116 }
2117 break;
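			/*
			 * Editorial worked example (not part of the original
			 * source): with hz = 1000 (so tick = 1000
			 * microseconds), a timeout of { tv_sec = 2,
			 * tv_usec = 500000 } converts above to
			 * val = 2 * 1000 + 500000 / 1000 = 2500 ticks, and a
			 * nonzero request smaller than one tick (say
			 * tv_usec = 100) is rounded up to 1 so that it does
			 * not silently become "no timeout".
			 */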
2118
2119 case SO_LABEL:
2120#ifdef MAC
2121 error = sooptcopyin(sopt, &extmac, sizeof extmac,
2122 sizeof extmac);
2123 if (error)
2124 goto bad;
2125 error = mac_setsockopt_label(sopt->sopt_td->td_ucred,
2126 so, &extmac);
2127#else
2128 error = EOPNOTSUPP;
2129#endif
2130 break;
2131
2132 default:
2133 error = ENOPROTOOPT;
2134 break;
2135 }
2136 if (error == 0 && so->so_proto != NULL &&
2137 so->so_proto->pr_ctloutput != NULL) {
2138 (void) ((*so->so_proto->pr_ctloutput)
2139 (so, sopt));
2140 }
2141 }
2142bad:
2143 return (error);
2144}
2145
2146/*
2147 * Helper routine for getsockopt.
2148 */
2149int
2150sooptcopyout(struct sockopt *sopt, const void *buf, size_t len)
2151{
2152 int error;
2153 size_t valsize;
2154
2155 error = 0;
2156
2157 /*
2158 * Documented get behavior is that we always return a value, possibly
2159 * truncated to fit in the user's buffer. Traditional behavior is
2160 * that we always tell the user precisely how much we copied, rather
2161 * than something useful like the total amount we had available for
2162 * her. Note that this interface is not idempotent; the entire
2163	 * answer must be generated ahead of time.
2164 */
2165 valsize = min(len, sopt->sopt_valsize);
2166 sopt->sopt_valsize = valsize;
2167 if (sopt->sopt_val != NULL) {
2168 if (sopt->sopt_td != NULL)
2169 error = copyout(buf, sopt->sopt_val, valsize);
2170 else
2171 bcopy(buf, sopt->sopt_val, valsize);
2172 }
2173 return (error);
2174}
2175
2176int
2177sogetopt(struct socket *so, struct sockopt *sopt)
2247{
2248 int error, optval;
2249 struct linger l;
2250 struct timeval tv;
2251#ifdef MAC
2252 struct mac extmac;
2253#endif
2254
2255 error = 0;
2256 if (sopt->sopt_level != SOL_SOCKET) {
2257 if (so->so_proto && so->so_proto->pr_ctloutput) {
2258 return ((*so->so_proto->pr_ctloutput)
2259 (so, sopt));
2260 } else
2261 return (ENOPROTOOPT);
2262 } else {
2263 switch (sopt->sopt_name) {
2264#ifdef INET
2265 case SO_ACCEPTFILTER:
2266 error = do_getopt_accept_filter(so, sopt);
2267 break;
2268#endif
2269 case SO_LINGER:
2270 SOCK_LOCK(so);
2271 l.l_onoff = so->so_options & SO_LINGER;
2272 l.l_linger = so->so_linger;
2273 SOCK_UNLOCK(so);
2274 error = sooptcopyout(sopt, &l, sizeof l);
2275 break;
2276
2277 case SO_USELOOPBACK:
2278 case SO_DONTROUTE:
2279 case SO_DEBUG:
2280 case SO_KEEPALIVE:
2281 case SO_REUSEADDR:
2282 case SO_REUSEPORT:
2283 case SO_BROADCAST:
2284 case SO_OOBINLINE:
2285 case SO_ACCEPTCONN:
2286 case SO_TIMESTAMP:
2287 case SO_BINTIME:
2288 case SO_NOSIGPIPE:
2289 optval = so->so_options & sopt->sopt_name;
2290integer:
2291 error = sooptcopyout(sopt, &optval, sizeof optval);
2292 break;
2293
2294 case SO_TYPE:
2295 optval = so->so_type;
2296 goto integer;
2297
2298 case SO_ERROR:
2299 SOCK_LOCK(so);
2300 optval = so->so_error;
2301 so->so_error = 0;
2302 SOCK_UNLOCK(so);
2303 goto integer;
2304
2305 case SO_SNDBUF:
2306 optval = so->so_snd.sb_hiwat;
2307 goto integer;
2308
2309 case SO_RCVBUF:
2310 optval = so->so_rcv.sb_hiwat;
2311 goto integer;
2312
2313 case SO_SNDLOWAT:
2314 optval = so->so_snd.sb_lowat;
2315 goto integer;
2316
2317 case SO_RCVLOWAT:
2318 optval = so->so_rcv.sb_lowat;
2319 goto integer;
2320
2321 case SO_SNDTIMEO:
2322 case SO_RCVTIMEO:
2323 optval = (sopt->sopt_name == SO_SNDTIMEO ?
2324 so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
2325
2326 tv.tv_sec = optval / hz;
2327 tv.tv_usec = (optval % hz) * tick;
2328#ifdef COMPAT_IA32
2329 if (curthread->td_proc->p_sysent == &ia32_freebsd_sysvec) {
2330 struct timeval32 tv32;
2331
2332 CP(tv, tv32, tv_sec);
2333 CP(tv, tv32, tv_usec);
2334 error = sooptcopyout(sopt, &tv32, sizeof tv32);
2335 } else
2336#endif
2337 error = sooptcopyout(sopt, &tv, sizeof tv);
2338 break;
2339
2340 case SO_LABEL:
2341#ifdef MAC
2342 error = sooptcopyin(sopt, &extmac, sizeof(extmac),
2343 sizeof(extmac));
2344 if (error)
2345 return (error);
2346 error = mac_getsockopt_label(sopt->sopt_td->td_ucred,
2347 so, &extmac);
2348 if (error)
2349 return (error);
2350 error = sooptcopyout(sopt, &extmac, sizeof extmac);
2351#else
2352 error = EOPNOTSUPP;
2353#endif
2354 break;
2355
2356 case SO_PEERLABEL:
2357#ifdef MAC
2358 error = sooptcopyin(sopt, &extmac, sizeof(extmac),
2359 sizeof(extmac));
2360 if (error)
2361 return (error);
2362 error = mac_getsockopt_peerlabel(
2363 sopt->sopt_td->td_ucred, so, &extmac);
2364 if (error)
2365 return (error);
2366 error = sooptcopyout(sopt, &extmac, sizeof extmac);
2367#else
2368 error = EOPNOTSUPP;
2369#endif
2370 break;
2371
2372 case SO_LISTENQLIMIT:
2373 optval = so->so_qlimit;
2374 goto integer;
2375
2376 case SO_LISTENQLEN:
2377 optval = so->so_qlen;
2378 goto integer;
2379
2380 case SO_LISTENINCQLEN:
2381 optval = so->so_incqlen;
2382 goto integer;
2383
2384 default:
2385 error = ENOPROTOOPT;
2386 break;
2387 }
2388 return (error);
2389 }
2390}
2391
2392/* XXX; prepare mbuf for (__FreeBSD__ < 3) routines. */
2393int
2394soopt_getm(struct sockopt *sopt, struct mbuf **mp)
2395{
2396 struct mbuf *m, *m_prev;
2397 int sopt_size = sopt->sopt_valsize;
2398
2399 MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA);
2400 if (m == NULL)
2401 return ENOBUFS;
2402 if (sopt_size > MLEN) {
2403 MCLGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT);
2404 if ((m->m_flags & M_EXT) == 0) {
2405 m_free(m);
2406 return ENOBUFS;
2407 }
2408 m->m_len = min(MCLBYTES, sopt_size);
2409 } else {
2410 m->m_len = min(MLEN, sopt_size);
2411 }
2412 sopt_size -= m->m_len;
2413 *mp = m;
2414 m_prev = m;
2415
2416 while (sopt_size) {
2417 MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA);
2418 if (m == NULL) {
2419 m_freem(*mp);
2420 return ENOBUFS;
2421 }
2422 if (sopt_size > MLEN) {
2423 MCLGET(m, sopt->sopt_td != NULL ? M_TRYWAIT :
2424 M_DONTWAIT);
2425 if ((m->m_flags & M_EXT) == 0) {
2426 m_freem(m);
2427 m_freem(*mp);
2428 return ENOBUFS;
2429 }
2430 m->m_len = min(MCLBYTES, sopt_size);
2431 } else {
2432 m->m_len = min(MLEN, sopt_size);
2433 }
2434 sopt_size -= m->m_len;
2435 m_prev->m_next = m;
2436 m_prev = m;
2437 }
2438 return (0);
2439}
2440
2441/* XXX; copyin sopt data into mbuf chain for (__FreeBSD__ < 3) routines. */
2442int
2443soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
2444{
2445 struct mbuf *m0 = m;
2446
2447 if (sopt->sopt_val == NULL)
2448 return (0);
2449 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
2450 if (sopt->sopt_td != NULL) {
2451 int error;
2452
2453 error = copyin(sopt->sopt_val, mtod(m, char *),
2454 m->m_len);
2455 if (error != 0) {
2456 m_freem(m0);
2457 return(error);
2458 }
2459 } else
2460 bcopy(sopt->sopt_val, mtod(m, char *), m->m_len);
2461 sopt->sopt_valsize -= m->m_len;
2462 sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
2463 m = m->m_next;
2464 }
2465 if (m != NULL) /* should be allocated enoughly at ip6_sooptmcopyin() */
2466 panic("ip6_sooptmcopyin");
2467 return (0);
2468}
2469
2470/* XXX; copyout mbuf chain data into soopt for (__FreeBSD__ < 3) routines. */
2471int
2472soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
2473{
2474 struct mbuf *m0 = m;
2475 size_t valsize = 0;
2476
2477 if (sopt->sopt_val == NULL)
2478 return (0);
2479 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
2480 if (sopt->sopt_td != NULL) {
2481 int error;
2482
2483 error = copyout(mtod(m, char *), sopt->sopt_val,
2484 m->m_len);
2485 if (error != 0) {
2486 m_freem(m0);
2487 return(error);
2488 }
2489 } else
2490 bcopy(mtod(m, char *), sopt->sopt_val, m->m_len);
2491 sopt->sopt_valsize -= m->m_len;
2492 sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
2493 valsize += m->m_len;
2494 m = m->m_next;
2495 }
2496 if (m != NULL) {
2497 /* enough soopt buffer should be given from user-land */
2498 m_freem(m0);
2499 return(EINVAL);
2500 }
2501 sopt->sopt_valsize = valsize;
2502 return (0);
2503}
2504
2505/*
2506 * sohasoutofband(): protocol notifies socket layer of the arrival of new
2507 * out-of-band data, which will then notify socket consumers.
2508 */
2509void
2178{
2179 int error, optval;
2180 struct linger l;
2181 struct timeval tv;
2182#ifdef MAC
2183 struct mac extmac;
2184#endif
2185
2186 error = 0;
2187 if (sopt->sopt_level != SOL_SOCKET) {
2188 if (so->so_proto && so->so_proto->pr_ctloutput) {
2189 return ((*so->so_proto->pr_ctloutput)
2190 (so, sopt));
2191 } else
2192 return (ENOPROTOOPT);
2193 } else {
2194 switch (sopt->sopt_name) {
2195#ifdef INET
2196 case SO_ACCEPTFILTER:
2197 error = do_getopt_accept_filter(so, sopt);
2198 break;
2199#endif
2200 case SO_LINGER:
2201 SOCK_LOCK(so);
2202 l.l_onoff = so->so_options & SO_LINGER;
2203 l.l_linger = so->so_linger;
2204 SOCK_UNLOCK(so);
2205 error = sooptcopyout(sopt, &l, sizeof l);
2206 break;
2207
2208 case SO_USELOOPBACK:
2209 case SO_DONTROUTE:
2210 case SO_DEBUG:
2211 case SO_KEEPALIVE:
2212 case SO_REUSEADDR:
2213 case SO_REUSEPORT:
2214 case SO_BROADCAST:
2215 case SO_OOBINLINE:
2216 case SO_ACCEPTCONN:
2217 case SO_TIMESTAMP:
2218 case SO_BINTIME:
2219 case SO_NOSIGPIPE:
2220 optval = so->so_options & sopt->sopt_name;
2221integer:
2222 error = sooptcopyout(sopt, &optval, sizeof optval);
2223 break;
2224
2225 case SO_TYPE:
2226 optval = so->so_type;
2227 goto integer;
2228
2229 case SO_ERROR:
2230 SOCK_LOCK(so);
2231 optval = so->so_error;
2232 so->so_error = 0;
2233 SOCK_UNLOCK(so);
2234 goto integer;
2235
2236 case SO_SNDBUF:
2237 optval = so->so_snd.sb_hiwat;
2238 goto integer;
2239
2240 case SO_RCVBUF:
2241 optval = so->so_rcv.sb_hiwat;
2242 goto integer;
2243
2244 case SO_SNDLOWAT:
2245 optval = so->so_snd.sb_lowat;
2246 goto integer;
2247
2248 case SO_RCVLOWAT:
2249 optval = so->so_rcv.sb_lowat;
2250 goto integer;
2251
2252 case SO_SNDTIMEO:
2253 case SO_RCVTIMEO:
2254 optval = (sopt->sopt_name == SO_SNDTIMEO ?
2255 so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
2256
2257 tv.tv_sec = optval / hz;
2258 tv.tv_usec = (optval % hz) * tick;
2259#ifdef COMPAT_IA32
2260 if (curthread->td_proc->p_sysent == &ia32_freebsd_sysvec) {
2261 struct timeval32 tv32;
2262
2263 CP(tv, tv32, tv_sec);
2264 CP(tv, tv32, tv_usec);
2265 error = sooptcopyout(sopt, &tv32, sizeof tv32);
2266 } else
2267#endif
2268 error = sooptcopyout(sopt, &tv, sizeof tv);
2269 break;
2270
2271 case SO_LABEL:
2272#ifdef MAC
2273 error = sooptcopyin(sopt, &extmac, sizeof(extmac),
2274 sizeof(extmac));
2275 if (error)
2276 return (error);
2277 error = mac_getsockopt_label(sopt->sopt_td->td_ucred,
2278 so, &extmac);
2279 if (error)
2280 return (error);
2281 error = sooptcopyout(sopt, &extmac, sizeof extmac);
2282#else
2283 error = EOPNOTSUPP;
2284#endif
2285 break;
2286
2287 case SO_PEERLABEL:
2288#ifdef MAC
2289 error = sooptcopyin(sopt, &extmac, sizeof(extmac),
2290 sizeof(extmac));
2291 if (error)
2292 return (error);
2293 error = mac_getsockopt_peerlabel(
2294 sopt->sopt_td->td_ucred, so, &extmac);
2295 if (error)
2296 return (error);
2297 error = sooptcopyout(sopt, &extmac, sizeof extmac);
2298#else
2299 error = EOPNOTSUPP;
2300#endif
2301 break;
2302
2303 case SO_LISTENQLIMIT:
2304 optval = so->so_qlimit;
2305 goto integer;
2306
2307 case SO_LISTENQLEN:
2308 optval = so->so_qlen;
2309 goto integer;
2310
2311 case SO_LISTENINCQLEN:
2312 optval = so->so_incqlen;
2313 goto integer;
2314
2315 default:
2316 error = ENOPROTOOPT;
2317 break;
2318 }
2319 return (error);
2320 }
2321}
2322
2323/* XXX: prepare mbuf for (__FreeBSD__ < 3) routines. */
2324int
2325soopt_getm(struct sockopt *sopt, struct mbuf **mp)
2326{
2327 struct mbuf *m, *m_prev;
2328 int sopt_size = sopt->sopt_valsize;
2329
2330 MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA);
2331 if (m == NULL)
2332 return ENOBUFS;
2333 if (sopt_size > MLEN) {
2334 MCLGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT);
2335 if ((m->m_flags & M_EXT) == 0) {
2336 m_free(m);
2337 return ENOBUFS;
2338 }
2339 m->m_len = min(MCLBYTES, sopt_size);
2340 } else {
2341 m->m_len = min(MLEN, sopt_size);
2342 }
2343 sopt_size -= m->m_len;
2344 *mp = m;
2345 m_prev = m;
2346
2347 while (sopt_size) {
2348 MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA);
2349 if (m == NULL) {
2350 m_freem(*mp);
2351 return ENOBUFS;
2352 }
2353 if (sopt_size > MLEN) {
2354 MCLGET(m, sopt->sopt_td != NULL ? M_TRYWAIT :
2355 M_DONTWAIT);
2356 if ((m->m_flags & M_EXT) == 0) {
2357 m_freem(m);
2358 m_freem(*mp);
2359 return ENOBUFS;
2360 }
2361 m->m_len = min(MCLBYTES, sopt_size);
2362 } else {
2363 m->m_len = min(MLEN, sopt_size);
2364 }
2365 sopt_size -= m->m_len;
2366 m_prev->m_next = m;
2367 m_prev = m;
2368 }
2369 return (0);
2370}
2371
2372/* XXX: copyin sopt data into mbuf chain for (__FreeBSD__ < 3) routines. */
2373int
2374soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
2375{
2376 struct mbuf *m0 = m;
2377
2378 if (sopt->sopt_val == NULL)
2379 return (0);
2380 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
2381 if (sopt->sopt_td != NULL) {
2382 int error;
2383
2384 error = copyin(sopt->sopt_val, mtod(m, char *),
2385 m->m_len);
2386 if (error != 0) {
2387 m_freem(m0);
2388 return(error);
2389 }
2390 } else
2391 bcopy(sopt->sopt_val, mtod(m, char *), m->m_len);
2392 sopt->sopt_valsize -= m->m_len;
2393 sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
2394 m = m->m_next;
2395 }
2396	if (m != NULL) /* should have been allocated large enough by ip6_sooptmcopyin() */
2397 panic("ip6_sooptmcopyin");
2398 return (0);
2399}
2400
2401/* XXX: copyout mbuf chain data into soopt for (__FreeBSD__ < 3) routines. */
2402int
2403soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
2404{
2405 struct mbuf *m0 = m;
2406 size_t valsize = 0;
2407
2408 if (sopt->sopt_val == NULL)
2409 return (0);
2410 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
2411 if (sopt->sopt_td != NULL) {
2412 int error;
2413
2414 error = copyout(mtod(m, char *), sopt->sopt_val,
2415 m->m_len);
2416 if (error != 0) {
2417 m_freem(m0);
2418 return(error);
2419 }
2420 } else
2421 bcopy(mtod(m, char *), sopt->sopt_val, m->m_len);
2422 sopt->sopt_valsize -= m->m_len;
2423 sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
2424 valsize += m->m_len;
2425 m = m->m_next;
2426 }
2427 if (m != NULL) {
2428		/* The caller should have supplied a large enough soopt buffer. */
2429 m_freem(m0);
2430 return(EINVAL);
2431 }
2432 sopt->sopt_valsize = valsize;
2433 return (0);
2434}
2435
2436/*
2437 * sohasoutofband(): protocol notifies socket layer of the arrival of new
2438 * out-of-band data, which will then notify socket consumers.
2439 */
2440void
2441sohasoutofband(struct socket *so)
2512{
2442{
2443
2444 if (so->so_sigio != NULL)
2445 pgsigio(&so->so_sigio, SIGURG, 0);
2446 selwakeuppri(&so->so_rcv.sb_sel, PSOCK);
2447}
2448
2449int
2450sopoll(struct socket *so, int events, struct ucred *active_cred,
2451 struct thread *td)
2452{
2453
2454 /* XXXRW: Temporary debugging. */
2455 KASSERT(so->so_proto->pr_usrreqs->pru_sopoll != sopoll,
2456 ("sopoll: protocol calls sopoll"));
2457
2458 return (so->so_proto->pr_usrreqs->pru_sopoll(so, events, active_cred,
2459 td));
2460}
2461
2462int
2463sopoll_generic(struct socket *so, int events, struct ucred *active_cred,
2464 struct thread *td)
2465{
2466 int revents = 0;
2467
2468 SOCKBUF_LOCK(&so->so_snd);
2469 SOCKBUF_LOCK(&so->so_rcv);
2470 if (events & (POLLIN | POLLRDNORM))
2471 if (soreadable(so))
2472 revents |= events & (POLLIN | POLLRDNORM);
2473
2474 if (events & POLLINIGNEOF)
2475 if (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat ||
2476 !TAILQ_EMPTY(&so->so_comp) || so->so_error)
2477 revents |= POLLINIGNEOF;
2478
2479 if (events & (POLLOUT | POLLWRNORM))
2480 if (sowriteable(so))
2481 revents |= events & (POLLOUT | POLLWRNORM);
2482
2483 if (events & (POLLPRI | POLLRDBAND))
2484 if (so->so_oobmark || (so->so_rcv.sb_state & SBS_RCVATMARK))
2485 revents |= events & (POLLPRI | POLLRDBAND);
2486
2487 if (revents == 0) {
2488 if (events &
2489 (POLLIN | POLLINIGNEOF | POLLPRI | POLLRDNORM |
2490 POLLRDBAND)) {
2491 selrecord(td, &so->so_rcv.sb_sel);
2492 so->so_rcv.sb_flags |= SB_SEL;
2493 }
2494
2495 if (events & (POLLOUT | POLLWRNORM)) {
2496 selrecord(td, &so->so_snd.sb_sel);
2497 so->so_snd.sb_flags |= SB_SEL;
2498 }
2499 }
2500
2501 SOCKBUF_UNLOCK(&so->so_rcv);
2502 SOCKBUF_UNLOCK(&so->so_snd);
2503 return (revents);
2504}
2505
2506int
2507soo_kqfilter(struct file *fp, struct knote *kn)
2508{
2509 struct socket *so = kn->kn_fp->f_data;
2510 struct sockbuf *sb;
2511
2512 switch (kn->kn_filter) {
2513 case EVFILT_READ:
2514 if (so->so_options & SO_ACCEPTCONN)
2515 kn->kn_fop = &solisten_filtops;
2516 else
2517 kn->kn_fop = &soread_filtops;
2518 sb = &so->so_rcv;
2519 break;
2520 case EVFILT_WRITE:
2521 kn->kn_fop = &sowrite_filtops;
2522 sb = &so->so_snd;
2523 break;
2524 default:
2525 return (EINVAL);
2526 }
2527
2528 SOCKBUF_LOCK(sb);
2529 knlist_add(&sb->sb_sel.si_note, kn, 1);
2530 sb->sb_flags |= SB_KNOTE;
2531 SOCKBUF_UNLOCK(sb);
2532 return (0);
2533}
2534
2535/*
2536 * Some routines that return EOPNOTSUPP for entry points that are not
2537 * supported by a protocol. Fill in as needed.
2538 */
2539int
2540pru_accept_notsupp(struct socket *so, struct sockaddr **nam)
2541{
2542
2543 return EOPNOTSUPP;
2544}
2545
2546int
2547pru_attach_notsupp(struct socket *so, int proto, struct thread *td)
2548{
2549
2550 return EOPNOTSUPP;
2551}
2552
2553int
2554pru_bind_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td)
2555{
2556
2557 return EOPNOTSUPP;
2558}
2559
2560int
2561pru_connect_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td)
2562{
2563
2564 return EOPNOTSUPP;
2565}
2566
2567int
2568pru_connect2_notsupp(struct socket *so1, struct socket *so2)
2569{
2570
2571 return EOPNOTSUPP;
2572}
2573
2574int
2575pru_control_notsupp(struct socket *so, u_long cmd, caddr_t data,
2576 struct ifnet *ifp, struct thread *td)
2641{
2577{
2578
2579 return EOPNOTSUPP;
2580}
2581
2582int
2583pru_disconnect_notsupp(struct socket *so)
2584{
2585
2586 return EOPNOTSUPP;
2587}
2588
2589int
2590pru_listen_notsupp(struct socket *so, int backlog, struct thread *td)
2591{
2592
2593 return EOPNOTSUPP;
2594}
2595
2596int
2597pru_peeraddr_notsupp(struct socket *so, struct sockaddr **nam)
2598{
2599
2600 return EOPNOTSUPP;
2601}
2602
2603int
2604pru_rcvd_notsupp(struct socket *so, int flags)
2605{
2606
2607 return EOPNOTSUPP;
2608}
2609
2610int
2611pru_rcvoob_notsupp(struct socket *so, struct mbuf *m, int flags)
2612{
2613
2614 return EOPNOTSUPP;
2615}
2616
2617int
2618pru_send_notsupp(struct socket *so, int flags, struct mbuf *m,
2619 struct sockaddr *addr, struct mbuf *control, struct thread *td)
2678{
2620{
2621
2622 return EOPNOTSUPP;
2623}
2624
2625/*
2626 * This isn't really a ``null'' operation, but it's the default one and
2627 * doesn't do anything destructive.
2628 */
2629int
2630pru_sense_null(struct socket *so, struct stat *sb)
2631{
2632
2633 sb->st_blksize = so->so_snd.sb_hiwat;
2634 return 0;
2635}
2636
2637int
2638pru_shutdown_notsupp(struct socket *so)
2639{
2640
2641 return EOPNOTSUPP;
2642}
2643
2644int
2645pru_sockaddr_notsupp(struct socket *so, struct sockaddr **nam)
2646{
2647
2648 return EOPNOTSUPP;
2649}
2650
2651int
2652pru_sosend_notsupp(struct socket *so, struct sockaddr *addr, struct uio *uio,
2653 struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
2708{
2654{
2655
2656 return EOPNOTSUPP;
2657}
2658
2659int
2660pru_soreceive_notsupp(struct socket *so, struct sockaddr **paddr,
2661 struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
2716{
2662{
2663
2664 return EOPNOTSUPP;
2665}
2666
2667int
2668pru_sopoll_notsupp(struct socket *so, int events, struct ucred *cred,
2669 struct thread *td)
2723{
2670{
2671
2672 return EOPNOTSUPP;
2673}
2674
2675static void
2676filt_sordetach(struct knote *kn)
2677{
2678 struct socket *so = kn->kn_fp->f_data;
2679
2680 SOCKBUF_LOCK(&so->so_rcv);
2681 knlist_remove(&so->so_rcv.sb_sel.si_note, kn, 1);
2682 if (knlist_empty(&so->so_rcv.sb_sel.si_note))
2683 so->so_rcv.sb_flags &= ~SB_KNOTE;
2684 SOCKBUF_UNLOCK(&so->so_rcv);
2685}
2686
2687/*ARGSUSED*/
2688static int
2689filt_soread(struct knote *kn, long hint)
2690{
2691 struct socket *so;
2692
2693 so = kn->kn_fp->f_data;
2694 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2695
2696 kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
2697 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
2698 kn->kn_flags |= EV_EOF;
2699 kn->kn_fflags = so->so_error;
2700 return (1);
2701 } else if (so->so_error) /* temporary udp error */
2702 return (1);
2703 else if (kn->kn_sfflags & NOTE_LOWAT)
2704 return (kn->kn_data >= kn->kn_sdata);
2705 else
2706 return (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat);
2707}
2708
2709static void
2710filt_sowdetach(struct knote *kn)
2711{
2712 struct socket *so = kn->kn_fp->f_data;
2713
2714 SOCKBUF_LOCK(&so->so_snd);
2715 knlist_remove(&so->so_snd.sb_sel.si_note, kn, 1);
2716 if (knlist_empty(&so->so_snd.sb_sel.si_note))
2717 so->so_snd.sb_flags &= ~SB_KNOTE;
2718 SOCKBUF_UNLOCK(&so->so_snd);
2719}
2720
2721/*ARGSUSED*/
2722static int
2723filt_sowrite(struct knote *kn, long hint)
2724{
2725 struct socket *so;
2726
2727 so = kn->kn_fp->f_data;
2728 SOCKBUF_LOCK_ASSERT(&so->so_snd);
2729 kn->kn_data = sbspace(&so->so_snd);
2730 if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
2731 kn->kn_flags |= EV_EOF;
2732 kn->kn_fflags = so->so_error;
2733 return (1);
2734 } else if (so->so_error) /* temporary udp error */
2735 return (1);
2736 else if (((so->so_state & SS_ISCONNECTED) == 0) &&
2737 (so->so_proto->pr_flags & PR_CONNREQUIRED))
2738 return (0);
2739 else if (kn->kn_sfflags & NOTE_LOWAT)
2740 return (kn->kn_data >= kn->kn_sdata);
2741 else
2742 return (kn->kn_data >= so->so_snd.sb_lowat);
2743}
2744
2745/*ARGSUSED*/
2746static int
2747filt_solisten(struct knote *kn, long hint)
2748{
2749 struct socket *so = kn->kn_fp->f_data;
2750
2751 kn->kn_data = so->so_qlen;
2752 return (! TAILQ_EMPTY(&so->so_comp));
2753}
2754
2755int
2756socheckuid(struct socket *so, uid_t uid)
2757{
2758
2759 if (so == NULL)
2760 return (EPERM);
2761 if (so->so_cred->cr_uid != uid)
2762 return (EPERM);
2763 return (0);
2764}
2765
2766static int
2767sysctl_somaxconn(SYSCTL_HANDLER_ARGS)
2768{
2769 int error;
2770 int val;
2771
2772 val = somaxconn;
2773 error = sysctl_handle_int(oidp, &val, sizeof(int), req);
2774	if (error || !req->newptr)
2775 return (error);
2776
2777 if (val < 1 || val > USHRT_MAX)
2778 return (EINVAL);
2779
2780 somaxconn = val;
2781 return (0);
2782}
2783
2784/*
2785 * These functions are used by protocols to notify the socket layer (and its
2786 * consumers) of state changes in the sockets driven by protocol-side events.
2787 */
2788
2789/*
2790 * Procedures to manipulate state flags of socket and do appropriate wakeups.
2791 *
2792 * Normal sequence from the active (originating) side is that
2793 * soisconnecting() is called during processing of connect() call, resulting
2794 * in an eventual call to soisconnected() if/when the connection is
2795 * established. When the connection is torn down soisdisconnecting() is
2796 * called during processing of disconnect() call, and soisdisconnected() is
2797 * called when the connection to the peer is totally severed. The semantics
2798 * of these routines are such that connectionless protocols can call
2799 * soisconnected() and soisdisconnected() only, bypassing the in-progress
2800 * calls when setting up a ``connection'' takes no time.
2801 *
2802 * From the passive side, a socket is created with two queues of sockets:
2803 * so_incomp for connections in progress and so_comp for connections already
2804 * made and awaiting user acceptance. As a protocol is preparing incoming
2805 * connections, it creates a socket structure queued on so_incomp by calling
2806 * sonewconn(). When the connection is established, soisconnected() is
2807 * called, and transfers the socket structure to so_comp, making it available
2808 * to accept().
2809 *
2810 * If a socket is closed with sockets on either so_incomp or so_comp, these
2811 * sockets are dropped.
2812 *
2813 * If higher-level protocols are implemented in the kernel, the wakeups done
2814 * here will sometimes cause software-interrupt process scheduling.
2815 */
2816void
2817soisconnecting(struct socket *so)
2871{
2872
2873 SOCK_LOCK(so);
2874 so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
2875 so->so_state |= SS_ISCONNECTING;
2876 SOCK_UNLOCK(so);
2877}
2878
2879void
2818{
2819
2820 SOCK_LOCK(so);
2821 so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
2822 so->so_state |= SS_ISCONNECTING;
2823 SOCK_UNLOCK(so);
2824}
2825
2826void
2827soisconnected(struct socket *so)
2882{
2883 struct socket *head;
2884
2885 ACCEPT_LOCK();
2886 SOCK_LOCK(so);
2887 so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING);
2888 so->so_state |= SS_ISCONNECTED;
2889 head = so->so_head;
2890 if (head != NULL && (so->so_qstate & SQ_INCOMP)) {
2891 if ((so->so_options & SO_ACCEPTFILTER) == 0) {
2892 SOCK_UNLOCK(so);
2893 TAILQ_REMOVE(&head->so_incomp, so, so_list);
2894 head->so_incqlen--;
2895 so->so_qstate &= ~SQ_INCOMP;
2896 TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
2897 head->so_qlen++;
2898 so->so_qstate |= SQ_COMP;
2899 ACCEPT_UNLOCK();
2900 sorwakeup(head);
2901 wakeup_one(&head->so_timeo);
2902 } else {
2903 ACCEPT_UNLOCK();
2904 so->so_upcall =
2905 head->so_accf->so_accept_filter->accf_callback;
2906 so->so_upcallarg = head->so_accf->so_accept_filter_arg;
2907 so->so_rcv.sb_flags |= SB_UPCALL;
2908 so->so_options &= ~SO_ACCEPTFILTER;
2909 SOCK_UNLOCK(so);
2910 so->so_upcall(so, so->so_upcallarg, M_DONTWAIT);
2911 }
2912 return;
2913 }
2914 SOCK_UNLOCK(so);
2915 ACCEPT_UNLOCK();
2916 wakeup(&so->so_timeo);
2917 sorwakeup(so);
2918 sowwakeup(so);
2919}
2920
2921void
2828{
2829 struct socket *head;
2830
2831 ACCEPT_LOCK();
2832 SOCK_LOCK(so);
2833 so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING);
2834 so->so_state |= SS_ISCONNECTED;
2835 head = so->so_head;
2836 if (head != NULL && (so->so_qstate & SQ_INCOMP)) {
2837 if ((so->so_options & SO_ACCEPTFILTER) == 0) {
2838 SOCK_UNLOCK(so);
2839 TAILQ_REMOVE(&head->so_incomp, so, so_list);
2840 head->so_incqlen--;
2841 so->so_qstate &= ~SQ_INCOMP;
2842 TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
2843 head->so_qlen++;
2844 so->so_qstate |= SQ_COMP;
2845 ACCEPT_UNLOCK();
2846 sorwakeup(head);
2847 wakeup_one(&head->so_timeo);
2848 } else {
2849 ACCEPT_UNLOCK();
2850 so->so_upcall =
2851 head->so_accf->so_accept_filter->accf_callback;
2852 so->so_upcallarg = head->so_accf->so_accept_filter_arg;
2853 so->so_rcv.sb_flags |= SB_UPCALL;
2854 so->so_options &= ~SO_ACCEPTFILTER;
2855 SOCK_UNLOCK(so);
2856 so->so_upcall(so, so->so_upcallarg, M_DONTWAIT);
2857 }
2858 return;
2859 }
2860 SOCK_UNLOCK(so);
2861 ACCEPT_UNLOCK();
2862 wakeup(&so->so_timeo);
2863 sorwakeup(so);
2864 sowwakeup(so);
2865}
2866
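/*
 * The SO_ACCEPTFILTER branch above defers the so_incomp -> so_comp move:
 * the new socket instead gets the filter's callback installed as a
 * receive upcall and stays on so_incomp until the filter is satisfied.
 * For reference, a listen socket opts in from userland roughly as in the
 * sketch below (see accept_filter(9); the function name is hypothetical
 * and error handling is omitted):
 */
#if 0
static int
example_set_accept_filter(int s)
{
	struct accept_filter_arg afa;

	bzero(&afa, sizeof(afa));
	strcpy(afa.af_name, "httpready");	/* provided by accf_http(9) */
	return (setsockopt(s, SOL_SOCKET, SO_ACCEPTFILTER, &afa,
	    sizeof(afa)));
}
#endif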
void
soisdisconnecting(struct socket *so)
{

	/*
	 * Note: This code assumes that SOCK_LOCK(so) and
	 * SOCKBUF_LOCK(&so->so_rcv) are the same.
	 */
	SOCKBUF_LOCK(&so->so_rcv);
	so->so_state &= ~SS_ISCONNECTING;
	so->so_state |= SS_ISDISCONNECTING;
	so->so_rcv.sb_state |= SBS_CANTRCVMORE;
	sorwakeup_locked(so);
	SOCKBUF_LOCK(&so->so_snd);
	so->so_snd.sb_state |= SBS_CANTSENDMORE;
	sowwakeup_locked(so);
	wakeup(&so->so_timeo);
}

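/*
 * soisdisconnected(), below, goes further than soisdisconnecting(),
 * above: it replaces all of the connection-state bits with
 * SS_ISDISCONNECTED and, in addition to marking both buffers unusable,
 * discards whatever is still queued in the send buffer via
 * sbdrop_locked(), since that data can no longer be delivered once the
 * connection to the peer is fully severed.
 */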
void
soisdisconnected(struct socket *so)
{

	/*
	 * Note: This code assumes that SOCK_LOCK(so) and
	 * SOCKBUF_LOCK(&so->so_rcv) are the same.
	 */
	SOCKBUF_LOCK(&so->so_rcv);
	so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
	so->so_state |= SS_ISDISCONNECTED;
	so->so_rcv.sb_state |= SBS_CANTRCVMORE;
	sorwakeup_locked(so);
	SOCKBUF_LOCK(&so->so_snd);
	so->so_snd.sb_state |= SBS_CANTSENDMORE;
	sbdrop_locked(&so->so_snd, so->so_snd.sb_cc);
	sowwakeup_locked(so);
	wakeup(&so->so_timeo);
}

/*
 * Make a copy of a sockaddr in a malloced buffer of type M_SONAME.
 */
struct sockaddr *
sodupsockaddr(const struct sockaddr *sa, int mflags)
{
	struct sockaddr *sa2;

	sa2 = malloc(sa->sa_len, M_SONAME, mflags);
	if (sa2)
		bcopy(sa, sa2, sa->sa_len);
	return sa2;
}

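/*
 * Illustrative use of sodupsockaddr() (not part of this file): callers
 * typically snapshot an address and later release it with the same
 * M_SONAME malloc type.  The function name is hypothetical and error
 * handling beyond the NULL check is omitted.
 */
#if 0
static int
example_copy_peer_addr(const struct sockaddr *sa)
{
	struct sockaddr *copy;

	copy = sodupsockaddr(sa, M_NOWAIT);
	if (copy == NULL)
		return (ENOMEM);
	/* ... hand "copy" off or inspect it ... */
	free(copy, M_SONAME);
	return (0);
}
#endif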
/*
 * Create an external-format (``xsocket'') structure using the information
 * in the kernel-format socket structure pointed to by so.  This is done to
 * reduce the spew of irrelevant information over this interface, to
 * isolate user code from changes in the kernel structure, and potentially
 * to provide information-hiding if we decide that some of this information
 * should be hidden from users.
 */
void
sotoxsocket(struct socket *so, struct xsocket *xso)
{

	xso->xso_len = sizeof *xso;
	xso->xso_so = so;
	xso->so_type = so->so_type;
	xso->so_options = so->so_options;
	xso->so_linger = so->so_linger;
	xso->so_state = so->so_state;
	xso->so_pcb = so->so_pcb;
	xso->xso_protocol = so->so_proto->pr_protocol;
	xso->xso_family = so->so_proto->pr_domain->dom_family;
	xso->so_qlen = so->so_qlen;
	xso->so_incqlen = so->so_incqlen;
	xso->so_qlimit = so->so_qlimit;
	xso->so_timeo = so->so_timeo;
	xso->so_error = so->so_error;
	xso->so_pgid = so->so_sigio ? so->so_sigio->sio_pgid : 0;
	xso->so_oobmark = so->so_oobmark;
	sbtoxsockbuf(&so->so_snd, &xso->so_snd);
	sbtoxsockbuf(&so->so_rcv, &xso->so_rcv);
	xso->so_uid = so->so_cred->cr_uid;
}
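
/*
 * A sketch of how a monitoring sysctl handler might export a socket
 * using sotoxsocket(): fill a stack xsocket and copy it out to the
 * request.  The surrounding handler, locking, and function name are
 * hypothetical; SYSCTL_OUT() is the standard copy-out helper.
 */
#if 0
static int
example_export_socket(struct socket *so, struct sysctl_req *req)
{
	struct xsocket xso;

	sotoxsocket(so, &xso);
	return (SYSCTL_OUT(req, &xso, sizeof(xso)));
}
#endif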