Deleted Added
full compact
uipc_socket.c (128052) uipc_socket.c (129906)
1/*
2 * Copyright (c) 1982, 1986, 1988, 1990, 1993
3 * The Regents of the University of California. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * 4. Neither the name of the University nor the names of its contributors
14 * may be used to endorse or promote products derived from this software
15 * without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 *
29 * @(#)uipc_socket.c 8.3 (Berkeley) 4/15/94
30 */
31
32#include <sys/cdefs.h>
1/*
2 * Copyright (c) 1982, 1986, 1988, 1990, 1993
3 * The Regents of the University of California. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * 4. Neither the name of the University nor the names of its contributors
14 * may be used to endorse or promote products derived from this software
15 * without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 *
29 * @(#)uipc_socket.c 8.3 (Berkeley) 4/15/94
30 */
31
32#include <sys/cdefs.h>
33__FBSDID("$FreeBSD: head/sys/kern/uipc_socket.c 128052 2004-04-09 13:23:51Z rwatson $");
33__FBSDID("$FreeBSD: head/sys/kern/uipc_socket.c 129906 2004-05-31 21:46:06Z bmilekic $");
34
35#include "opt_inet.h"
36#include "opt_mac.h"
37#include "opt_zero.h"
38
39#include <sys/param.h>
40#include <sys/systm.h>
41#include <sys/fcntl.h>
42#include <sys/limits.h>
43#include <sys/lock.h>
44#include <sys/mac.h>
45#include <sys/malloc.h>
46#include <sys/mbuf.h>
47#include <sys/mutex.h>
48#include <sys/domain.h>
49#include <sys/file.h> /* for struct knote */
50#include <sys/kernel.h>
51#include <sys/event.h>
52#include <sys/poll.h>
53#include <sys/proc.h>
54#include <sys/protosw.h>
55#include <sys/socket.h>
56#include <sys/socketvar.h>
57#include <sys/resourcevar.h>
58#include <sys/signalvar.h>
59#include <sys/sysctl.h>
60#include <sys/uio.h>
61#include <sys/jail.h>
62
63#include <vm/uma.h>
64
65
66#ifdef INET
67static int do_setopt_accept_filter(struct socket *so, struct sockopt *sopt);
68#endif
69
70static void filt_sordetach(struct knote *kn);
71static int filt_soread(struct knote *kn, long hint);
72static void filt_sowdetach(struct knote *kn);
73static int filt_sowrite(struct knote *kn, long hint);
74static int filt_solisten(struct knote *kn, long hint);
75
76static struct filterops solisten_filtops =
77 { 1, NULL, filt_sordetach, filt_solisten };
78static struct filterops soread_filtops =
79 { 1, NULL, filt_sordetach, filt_soread };
80static struct filterops sowrite_filtops =
81 { 1, NULL, filt_sowdetach, filt_sowrite };
82
83uma_zone_t socket_zone;
84so_gen_t so_gencnt; /* generation count for sockets */
85
86MALLOC_DEFINE(M_SONAME, "soname", "socket name");
87MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
88
89SYSCTL_DECL(_kern_ipc);
90
91static int somaxconn = SOMAXCONN;
92SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLFLAG_RW,
93 &somaxconn, 0, "Maximum pending socket connection queue size");
94static int numopensockets;
95SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD,
96 &numopensockets, 0, "Number of open sockets");
97#ifdef ZERO_COPY_SOCKETS
98/* These aren't static because they're used in other files. */
99int so_zero_copy_send = 1;
100int so_zero_copy_receive = 1;
101SYSCTL_NODE(_kern_ipc, OID_AUTO, zero_copy, CTLFLAG_RD, 0,
102 "Zero copy controls");
103SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, receive, CTLFLAG_RW,
104 &so_zero_copy_receive, 0, "Enable zero copy receive");
105SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, send, CTLFLAG_RW,
106 &so_zero_copy_send, 0, "Enable zero copy send");
107#endif /* ZERO_COPY_SOCKETS */
108
109
110/*
111 * Socket operation routines.
112 * These routines are called by the routines in
113 * sys_socket.c or from a system process, and
114 * implement the semantics of socket operations by
115 * switching out to the protocol specific routines.
116 */
117
118/*
119 * Get a socket structure from our zone, and initialize it.
120 * Note that it would probably be better to allocate socket
121 * and PCB at the same time, but I'm not convinced that all
122 * the protocols can be easily modified to do this.
123 *
124 * soalloc() returns a socket with a ref count of 0.
 *
 * mflags is OR-ed with M_ZERO and handed to uma_zalloc(); when MAC is
 * compiled in it is also passed to mac_init_socket().  The result is NULL
 * if the zone allocation fails or if mac_init_socket() reports an error.
 * On success the global so_gencnt is bumped and copied into the socket,
 * the so_aiojobq list is initialized and numopensockets is incremented.
125 */
126struct socket *
127soalloc(int mflags)
128{
129 struct socket *so;
130#ifdef MAC
131 int error;
132#endif
133
134 so = uma_zalloc(socket_zone, mflags | M_ZERO);
135 if (so != NULL) {
136#ifdef MAC
137 error = mac_init_socket(so, mflags);
138 if (error != 0) {
 /* MAC setup failed: hand the structure back to the zone. */
139 uma_zfree(socket_zone, so);
140 so = NULL;
141 return so;
142 }
143#endif
144 /* XXX race condition for reentrant kernel */
145 so->so_gencnt = ++so_gencnt;
146 /* sx_init(&so->so_sxlock, "socket sxlock"); */
147 TAILQ_INIT(&so->so_aiojobq);
148 ++numopensockets;
149 }
150 return so;
151}
152
153/*
154 * socreate returns a socket with a ref count of 1. The socket should be
155 * closed with soclose().
 *
 * The protocol switch entry is looked up with pffindproto() when proto is
 * nonzero and with pffindtype() otherwise.  On success the new socket is
 * stored through aso and 0 is returned.  Error returns visible here:
 *   EPROTONOSUPPORT - no entry was found, the entry has no pru_attach
 *                     handler, or cred is jailed,
 *                     jail_socket_unixiproute_only is set and the family
 *                     is not PF_LOCAL, PF_INET or PF_ROUTE;
 *   EPROTOTYPE      - the entry type differs from the requested type;
 *   ENOBUFS         - soalloc() returned NULL;
 *   otherwise       - whatever the pru_attach handler returned.
156 */
157int
158socreate(dom, aso, type, proto, cred, td)
159 int dom;
160 struct socket **aso;
161 int type;
162 int proto;
163 struct ucred *cred;
164 struct thread *td;
165{
166 struct protosw *prp;
167 struct socket *so;
168 int error;
169
170 if (proto)
171 prp = pffindproto(dom, proto, type);
172 else
173 prp = pffindtype(dom, type);
174
175 if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL)
176 return (EPROTONOSUPPORT);
177
178 if (jailed(cred) && jail_socket_unixiproute_only &&
179 prp->pr_domain->dom_family != PF_LOCAL &&
180 prp->pr_domain->dom_family != PF_INET &&
181 prp->pr_domain->dom_family != PF_ROUTE) {
182 return (EPROTONOSUPPORT);
183 }
184
185 if (prp->pr_type != type)
186 return (EPROTOTYPE);
187 so = soalloc(M_WAITOK);
188 if (so == NULL)
189 return (ENOBUFS);
190
 /* Both accept queues start out empty. */
191 TAILQ_INIT(&so->so_incomp);
192 TAILQ_INIT(&so->so_comp);
193 so->so_type = type;
194 so->so_cred = crhold(cred);
195 so->so_proto = prp;
196#ifdef MAC
197 mac_create_socket(cred, so);
198#endif
199 soref(so);
200 error = (*prp->pr_usrreqs->pru_attach)(so, proto, td);
201 if (error) {
 /*
  * Attach failed.  SS_NOFDREF is set before sorele() because
  * sofree() in this file returns early while that flag is clear.
  */
202 so->so_state |= SS_NOFDREF;
203 sorele(so);
204 return (error);
205 }
206 *aso = so;
207 return (0);
208}
209
/*
 * Bind a socket: call the pru_bind handler of the socket's protocol with
 * nam and td, bracketed by splnet()/splx(), and return the handler result.
 */
210int
211sobind(so, nam, td)
212 struct socket *so;
213 struct sockaddr *nam;
214 struct thread *td;
215{
216 int s = splnet();
217 int error;
218
219 error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td);
220 splx(s);
221 return (error);
222}
223
/*
 * Tear down a socket structure and return it to socket_zone.
 * The caller must pass a socket whose so_count is 0 (asserted below).
 * The global so_gencnt is bumped and numopensockets is decremented.
 */
224void
225sodealloc(struct socket *so)
226{
227
228 KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count));
229 so->so_gencnt = ++so_gencnt;
 /*
  * For each buffer with a nonzero high-water mark, call chgsbsize()
  * with a new value of 0.  NOTE(review): presumably this returns the
  * buffer space charged to the owning uid - confirm in chgsbsize().
  */
230 if (so->so_rcv.sb_hiwat)
231 (void)chgsbsize(so->so_cred->cr_uidinfo,
232 &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY);
233 if (so->so_snd.sb_hiwat)
234 (void)chgsbsize(so->so_cred->cr_uidinfo,
235 &so->so_snd.sb_hiwat, 0, RLIM_INFINITY);
236#ifdef INET
237 /* remove accept filter if one is present. */
238 if (so->so_accf != NULL)
239 do_setopt_accept_filter(so, NULL);
240#endif
241#ifdef MAC
242 mac_destroy_socket(so);
243#endif
 /* Drop the credential reference taken with crhold() in socreate(). */
244 crfree(so->so_cred);
245 /* sx_destroy(&so->so_sxlock); */
246 uma_zfree(socket_zone, so);
247 --numopensockets;
248}
249
/*
 * Put a socket into the listening state.
 * Returns EINVAL if the socket is connected, connecting or disconnecting,
 * the pru_listen handler error if that handler fails, and 0 otherwise.
 * A backlog that is negative or larger than somaxconn is replaced by
 * somaxconn before being stored in so_qlimit.
 */
250int
251solisten(so, backlog, td)
252 struct socket *so;
253 int backlog;
254 struct thread *td;
255{
256 int s, error;
257
258 s = splnet();
259 if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING |
260 SS_ISDISCONNECTING)) {
261 splx(s);
262 return (EINVAL);
263 }
264 error = (*so->so_proto->pr_usrreqs->pru_listen)(so, td);
265 if (error) {
266 splx(s);
267 return (error);
268 }
 /* SO_ACCEPTCONN is only set while the completed queue is empty. */
269 if (TAILQ_EMPTY(&so->so_comp))
270 so->so_options |= SO_ACCEPTCONN;
271 if (backlog < 0 || backlog > somaxconn)
272 backlog = somaxconn;
273 so->so_qlimit = backlog;
274 splx(s);
275 return (0);
276}
277
/*
 * Free a socket whose so_count is 0 (asserted below), if it is eligible.
 * Nothing is done while the socket still has a protocol control block
 * (so_pcb != NULL) or while SS_NOFDREF is clear.  A socket on the
 * incomplete queue of a listening socket is unlinked from it first; a
 * socket on the completed queue is left alone; a socket with so_head set
 * but neither SS_INCOMP nor SS_COMP causes a panic.
 */
278void
279sofree(so)
280 struct socket *so;
281{
282 struct socket *head;
283 int s;
284
285 KASSERT(so->so_count == 0, ("socket %p so_count not 0", so));
286
287 if (so->so_pcb != NULL || (so->so_state & SS_NOFDREF) == 0)
288 return;
289 if (so->so_head != NULL) {
290 head = so->so_head;
291 if (so->so_state & SS_INCOMP) {
292 TAILQ_REMOVE(&head->so_incomp, so, so_list);
293 head->so_incqlen--;
294 } else if (so->so_state & SS_COMP) {
295 /*
296 * We must not decommission a socket that's
297 * on the accept(2) queue. If we do, then
298 * accept(2) may hang after select(2) indicated
299 * that the listening socket was ready.
300 */
301 return;
302 } else {
303 panic("sofree: not queued");
304 }
305 so->so_state &= ~SS_INCOMP;
306 so->so_head = NULL;
307 }
 /*
  * Shut down the send side under the send buffer lock, release the
  * send buffer, flush the receive side and finally deallocate.
  */
308 so->so_snd.sb_flags |= SB_NOINTR;
309 (void)sblock(&so->so_snd, M_WAITOK);
310 s = splimp();
311 socantsendmore(so);
312 splx(s);
313 sbunlock(&so->so_snd);
314 sbrelease(&so->so_snd, so);
315 sorflush(so);
316 sodealloc(so);
317}
318
319/*
320 * Close a socket on last file table reference removal.
321 * Initiate disconnect if connected.
322 * Free socket when disconnect complete.
323 *
324 * This function will sorele() the socket. Note that soclose() may be
325 * called prior to the ref count reaching zero. The actual socket
326 * structure will not be freed until the ref count reaches zero.
 *
 * Returns 0, or the first error produced by sodisconnect(), tsleep() or
 * the pru_detach handler.  Panics if SS_NOFDREF is already set.
327 */
328int
329soclose(so)
330 struct socket *so;
331{
332 int s = splnet(); /* conservative */
333 int error = 0;
334
335 funsetown(&so->so_sigio);
 /* Listening socket: abort every connection still queued on it. */
336 if (so->so_options & SO_ACCEPTCONN) {
337 struct socket *sp, *sonext;
338
339 sp = TAILQ_FIRST(&so->so_incomp);
340 for (; sp != NULL; sp = sonext) {
 /* Fetch the successor first; sp is not used after soabort(). */
341 sonext = TAILQ_NEXT(sp, so_list);
342 (void) soabort(sp);
343 }
344 for (sp = TAILQ_FIRST(&so->so_comp); sp != NULL; sp = sonext) {
345 sonext = TAILQ_NEXT(sp, so_list);
346 /* Dequeue from so_comp since sofree() won't do it */
347 TAILQ_REMOVE(&so->so_comp, sp, so_list);
348 so->so_qlen--;
349 sp->so_state &= ~SS_COMP;
350 sp->so_head = NULL;
351 (void) soabort(sp);
352 }
353 }
 /* No protocol control block: nothing to disconnect or detach. */
354 if (so->so_pcb == NULL)
355 goto discard;
356 if (so->so_state & SS_ISCONNECTED) {
357 if ((so->so_state & SS_ISDISCONNECTING) == 0) {
358 error = sodisconnect(so);
359 if (error)
360 goto drop;
361 }
 /*
  * With SO_LINGER set, sleep on so_timeo until SS_ISCONNECTED
  * clears, unless the socket is non-blocking and already
  * disconnecting.  A nonzero tsleep() result ends the wait.
  */
362 if (so->so_options & SO_LINGER) {
363 if ((so->so_state & SS_ISDISCONNECTING) &&
364 (so->so_state & SS_NBIO))
365 goto drop;
366 while (so->so_state & SS_ISCONNECTED) {
367 error = tsleep(&so->so_timeo,
368 PSOCK | PCATCH, "soclos", so->so_linger * hz);
369 if (error)
370 break;
371 }
372 }
373 }
374drop:
375 if (so->so_pcb != NULL) {
376 int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
 /* The first error wins; detach status is kept only if none. */
377 if (error == 0)
378 error = error2;
379 }
380discard:
381 if (so->so_state & SS_NOFDREF)
382 panic("soclose: NOFDREF");
383 so->so_state |= SS_NOFDREF;
384 sorele(so);
385 splx(s);
386 return (error);
387}
388
389/*
390 * Must be called at splnet...
 *
 * Calls the pru_abort handler of the socket's protocol.  If the handler
 * fails, sotryfree() is called on the socket and the handler error is
 * returned; otherwise 0 is returned.
391 */
392int
393soabort(so)
394 struct socket *so;
395{
396 int error;
397
398 error = (*so->so_proto->pr_usrreqs->pru_abort)(so);
399 if (error) {
400 sotryfree(so); /* note: does not decrement the ref count */
401 return error;
402 }
403 return (0);
404}
405
/*
 * Accept a connection on so.  The socket must arrive with SS_NOFDREF set
 * (otherwise this panics); the flag is cleared here before the pru_accept
 * handler is called with nam.  Returns the handler result.
 */
406int
407soaccept(so, nam)
408 struct socket *so;
409 struct sockaddr **nam;
410{
411 int s = splnet();
412 int error;
413
414 if ((so->so_state & SS_NOFDREF) == 0)
415 panic("soaccept: !NOFDREF");
416 so->so_state &= ~SS_NOFDREF;
417 error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
418 splx(s);
419 return (error);
420}
421
/*
 * Connect so to the address nam.
 * Returns EOPNOTSUPP for a socket with SO_ACCEPTCONN set, EISCONN when an
 * already connected or connecting socket cannot be reconnected (see the
 * comment in the body), and otherwise the pru_connect handler result.
 */
422int
423soconnect(so, nam, td)
424 struct socket *so;
425 struct sockaddr *nam;
426 struct thread *td;
427{
428 int s;
429 int error;
430
431 if (so->so_options & SO_ACCEPTCONN)
432 return (EOPNOTSUPP);
433 s = splnet();
434 /*
435 * If protocol is connection-based, can only connect once.
436 * Otherwise, if connected, try to disconnect first.
437 * This allows user to disconnect by connecting to, e.g.,
438 * a null address.
439 */
 /*
  * The sodisconnect() status assigned inside the condition is not
  * reported: a nonzero value is replaced by EISCONN, and a zero value
  * falls through to pru_connect, which overwrites error.
  */
440 if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
441 ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
442 (error = sodisconnect(so))))
443 error = EISCONN;
444 else
445 error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, td);
446 splx(s);
447 return (error);
448}
449
/*
 * Connect two sockets to each other: call the pru_connect2 handler of the
 * protocol of so1 with (so1, so2), bracketed by splnet()/splx(), and
 * return the handler result.
 */
450int
451soconnect2(so1, so2)
452 struct socket *so1;
453 struct socket *so2;
454{
455 int s = splnet();
456 int error;
457
458 error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);
459 splx(s);
460 return (error);
461}
462
/*
 * Start disconnecting a socket.
 * Returns ENOTCONN if SS_ISCONNECTED is clear, EALREADY if
 * SS_ISDISCONNECTING is already set, and otherwise the result of the
 * pru_disconnect handler.  The "bad" label is the common exit that
 * restores the interrupt priority level saved in s.
 */
463int
464sodisconnect(so)
465 struct socket *so;
466{
467 int s = splnet();
468 int error;
469
470 if ((so->so_state & SS_ISCONNECTED) == 0) {
471 error = ENOTCONN;
472 goto bad;
473 }
474 if (so->so_state & SS_ISDISCONNECTING) {
475 error = EALREADY;
476 goto bad;
477 }
478 error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
479bad:
480 splx(s);
481 return (error);
482}
483
484#define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
485/*
486 * Send on a socket.
487 * If send must go all at once and message is larger than
488 * send buffering, then hard error.
489 * Lock against other senders.
490 * If must go all at once and not enough room now, then
491 * inform user that this would block and do nothing.
492 * Otherwise, if nonblocking, send as much as possible.
493 * The data to be sent is described by "uio" if nonzero,
494 * otherwise by the mbuf chain "top" (which must be null
495 * if uio is not). Data provided in mbuf chain must be small
496 * enough to send all at once.
497 *
498 * Returns nonzero on error, timeout or signal; callers
499 * must check for short counts if EINTR/ERESTART are returned.
500 * Data and control buffers are freed on return.
501 */
502
503#ifdef ZERO_COPY_SOCKETS
504struct so_zerocopy_stats{
505 int size_ok;
506 int align_ok;
507 int found_ifp;
508};
509struct so_zerocopy_stats so_zerocp_stats = {0,0,0};
510#include <netinet/in.h>
511#include <net/route.h>
512#include <netinet/in_pcb.h>
513#include <vm/vm.h>
514#include <vm/vm_page.h>
515#include <vm/vm_object.h>
516#endif /*ZERO_COPY_SOCKETS*/
517
518int
519sosend(so, addr, uio, top, control, flags, td)
520 struct socket *so;
521 struct sockaddr *addr;
522 struct uio *uio;
523 struct mbuf *top;
524 struct mbuf *control;
525 int flags;
526 struct thread *td;
527{
528 struct mbuf **mp;
529 struct mbuf *m;
34
35#include "opt_inet.h"
36#include "opt_mac.h"
37#include "opt_zero.h"
38
39#include <sys/param.h>
40#include <sys/systm.h>
41#include <sys/fcntl.h>
42#include <sys/limits.h>
43#include <sys/lock.h>
44#include <sys/mac.h>
45#include <sys/malloc.h>
46#include <sys/mbuf.h>
47#include <sys/mutex.h>
48#include <sys/domain.h>
49#include <sys/file.h> /* for struct knote */
50#include <sys/kernel.h>
51#include <sys/event.h>
52#include <sys/poll.h>
53#include <sys/proc.h>
54#include <sys/protosw.h>
55#include <sys/socket.h>
56#include <sys/socketvar.h>
57#include <sys/resourcevar.h>
58#include <sys/signalvar.h>
59#include <sys/sysctl.h>
60#include <sys/uio.h>
61#include <sys/jail.h>
62
63#include <vm/uma.h>
64
65
66#ifdef INET
67static int do_setopt_accept_filter(struct socket *so, struct sockopt *sopt);
68#endif
69
70static void filt_sordetach(struct knote *kn);
71static int filt_soread(struct knote *kn, long hint);
72static void filt_sowdetach(struct knote *kn);
73static int filt_sowrite(struct knote *kn, long hint);
74static int filt_solisten(struct knote *kn, long hint);
75
76static struct filterops solisten_filtops =
77 { 1, NULL, filt_sordetach, filt_solisten };
78static struct filterops soread_filtops =
79 { 1, NULL, filt_sordetach, filt_soread };
80static struct filterops sowrite_filtops =
81 { 1, NULL, filt_sowdetach, filt_sowrite };
82
83uma_zone_t socket_zone;
84so_gen_t so_gencnt; /* generation count for sockets */
85
86MALLOC_DEFINE(M_SONAME, "soname", "socket name");
87MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
88
89SYSCTL_DECL(_kern_ipc);
90
91static int somaxconn = SOMAXCONN;
92SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLFLAG_RW,
93 &somaxconn, 0, "Maximum pending socket connection queue size");
94static int numopensockets;
95SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD,
96 &numopensockets, 0, "Number of open sockets");
97#ifdef ZERO_COPY_SOCKETS
98/* These aren't static because they're used in other files. */
99int so_zero_copy_send = 1;
100int so_zero_copy_receive = 1;
101SYSCTL_NODE(_kern_ipc, OID_AUTO, zero_copy, CTLFLAG_RD, 0,
102 "Zero copy controls");
103SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, receive, CTLFLAG_RW,
104 &so_zero_copy_receive, 0, "Enable zero copy receive");
105SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, send, CTLFLAG_RW,
106 &so_zero_copy_send, 0, "Enable zero copy send");
107#endif /* ZERO_COPY_SOCKETS */
108
109
110/*
111 * Socket operation routines.
112 * These routines are called by the routines in
113 * sys_socket.c or from a system process, and
114 * implement the semantics of socket operations by
115 * switching out to the protocol specific routines.
116 */
117
118/*
119 * Get a socket structure from our zone, and initialize it.
120 * Note that it would probably be better to allocate socket
121 * and PCB at the same time, but I'm not convinced that all
122 * the protocols can be easily modified to do this.
123 *
124 * soalloc() returns a socket with a ref count of 0.
 *
 * mflags is OR-ed with M_ZERO and handed to uma_zalloc(); when MAC is
 * compiled in it is also passed to mac_init_socket().  The result is NULL
 * if the zone allocation fails or if mac_init_socket() reports an error.
 * On success the global so_gencnt is bumped and copied into the socket,
 * the so_aiojobq list is initialized and numopensockets is incremented.
125 */
126struct socket *
127soalloc(int mflags)
128{
129 struct socket *so;
130#ifdef MAC
131 int error;
132#endif
133
134 so = uma_zalloc(socket_zone, mflags | M_ZERO);
135 if (so != NULL) {
136#ifdef MAC
137 error = mac_init_socket(so, mflags);
138 if (error != 0) {
 /* MAC setup failed: hand the structure back to the zone. */
139 uma_zfree(socket_zone, so);
140 so = NULL;
141 return so;
142 }
143#endif
144 /* XXX race condition for reentrant kernel */
145 so->so_gencnt = ++so_gencnt;
146 /* sx_init(&so->so_sxlock, "socket sxlock"); */
147 TAILQ_INIT(&so->so_aiojobq);
148 ++numopensockets;
149 }
150 return so;
151}
152
153/*
154 * socreate returns a socket with a ref count of 1. The socket should be
155 * closed with soclose().
 *
 * The protocol switch entry is looked up with pffindproto() when proto is
 * nonzero and with pffindtype() otherwise.  On success the new socket is
 * stored through aso and 0 is returned.  Error returns visible here:
 *   EPROTONOSUPPORT - no entry was found, the entry has no pru_attach
 *                     handler, or cred is jailed,
 *                     jail_socket_unixiproute_only is set and the family
 *                     is not PF_LOCAL, PF_INET or PF_ROUTE;
 *   EPROTOTYPE      - the entry type differs from the requested type;
 *   ENOBUFS         - soalloc() returned NULL;
 *   otherwise       - whatever the pru_attach handler returned.
156 */
157int
158socreate(dom, aso, type, proto, cred, td)
159 int dom;
160 struct socket **aso;
161 int type;
162 int proto;
163 struct ucred *cred;
164 struct thread *td;
165{
166 struct protosw *prp;
167 struct socket *so;
168 int error;
169
170 if (proto)
171 prp = pffindproto(dom, proto, type);
172 else
173 prp = pffindtype(dom, type);
174
175 if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL)
176 return (EPROTONOSUPPORT);
177
178 if (jailed(cred) && jail_socket_unixiproute_only &&
179 prp->pr_domain->dom_family != PF_LOCAL &&
180 prp->pr_domain->dom_family != PF_INET &&
181 prp->pr_domain->dom_family != PF_ROUTE) {
182 return (EPROTONOSUPPORT);
183 }
184
185 if (prp->pr_type != type)
186 return (EPROTOTYPE);
187 so = soalloc(M_WAITOK);
188 if (so == NULL)
189 return (ENOBUFS);
190
 /* Both accept queues start out empty. */
191 TAILQ_INIT(&so->so_incomp);
192 TAILQ_INIT(&so->so_comp);
193 so->so_type = type;
194 so->so_cred = crhold(cred);
195 so->so_proto = prp;
196#ifdef MAC
197 mac_create_socket(cred, so);
198#endif
199 soref(so);
200 error = (*prp->pr_usrreqs->pru_attach)(so, proto, td);
201 if (error) {
 /*
  * Attach failed.  SS_NOFDREF is set before sorele() because
  * sofree() in this file returns early while that flag is clear.
  */
202 so->so_state |= SS_NOFDREF;
203 sorele(so);
204 return (error);
205 }
206 *aso = so;
207 return (0);
208}
209
/*
 * Bind a socket: call the pru_bind handler of the socket's protocol with
 * nam and td, bracketed by splnet()/splx(), and return the handler result.
 */
210int
211sobind(so, nam, td)
212 struct socket *so;
213 struct sockaddr *nam;
214 struct thread *td;
215{
216 int s = splnet();
217 int error;
218
219 error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td);
220 splx(s);
221 return (error);
222}
223
/*
 * Tear down a socket structure and return it to socket_zone.
 * The caller must pass a socket whose so_count is 0 (asserted below).
 * The global so_gencnt is bumped and numopensockets is decremented.
 */
224void
225sodealloc(struct socket *so)
226{
227
228 KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count));
229 so->so_gencnt = ++so_gencnt;
 /*
  * For each buffer with a nonzero high-water mark, call chgsbsize()
  * with a new value of 0.  NOTE(review): presumably this returns the
  * buffer space charged to the owning uid - confirm in chgsbsize().
  */
230 if (so->so_rcv.sb_hiwat)
231 (void)chgsbsize(so->so_cred->cr_uidinfo,
232 &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY);
233 if (so->so_snd.sb_hiwat)
234 (void)chgsbsize(so->so_cred->cr_uidinfo,
235 &so->so_snd.sb_hiwat, 0, RLIM_INFINITY);
236#ifdef INET
237 /* remove accept filter if one is present. */
238 if (so->so_accf != NULL)
239 do_setopt_accept_filter(so, NULL);
240#endif
241#ifdef MAC
242 mac_destroy_socket(so);
243#endif
 /* Drop the credential reference taken with crhold() in socreate(). */
244 crfree(so->so_cred);
245 /* sx_destroy(&so->so_sxlock); */
246 uma_zfree(socket_zone, so);
247 --numopensockets;
248}
249
/*
 * Put a socket into the listening state.
 * Returns EINVAL if the socket is connected, connecting or disconnecting,
 * the pru_listen handler error if that handler fails, and 0 otherwise.
 * A backlog that is negative or larger than somaxconn is replaced by
 * somaxconn before being stored in so_qlimit.
 */
250int
251solisten(so, backlog, td)
252 struct socket *so;
253 int backlog;
254 struct thread *td;
255{
256 int s, error;
257
258 s = splnet();
259 if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING |
260 SS_ISDISCONNECTING)) {
261 splx(s);
262 return (EINVAL);
263 }
264 error = (*so->so_proto->pr_usrreqs->pru_listen)(so, td);
265 if (error) {
266 splx(s);
267 return (error);
268 }
 /* SO_ACCEPTCONN is only set while the completed queue is empty. */
269 if (TAILQ_EMPTY(&so->so_comp))
270 so->so_options |= SO_ACCEPTCONN;
271 if (backlog < 0 || backlog > somaxconn)
272 backlog = somaxconn;
273 so->so_qlimit = backlog;
274 splx(s);
275 return (0);
276}
277
/*
 * Free a socket whose so_count is 0 (asserted below), if it is eligible.
 * Nothing is done while the socket still has a protocol control block
 * (so_pcb != NULL) or while SS_NOFDREF is clear.  A socket on the
 * incomplete queue of a listening socket is unlinked from it first; a
 * socket on the completed queue is left alone; a socket with so_head set
 * but neither SS_INCOMP nor SS_COMP causes a panic.
 */
278void
279sofree(so)
280 struct socket *so;
281{
282 struct socket *head;
283 int s;
284
285 KASSERT(so->so_count == 0, ("socket %p so_count not 0", so));
286
287 if (so->so_pcb != NULL || (so->so_state & SS_NOFDREF) == 0)
288 return;
289 if (so->so_head != NULL) {
290 head = so->so_head;
291 if (so->so_state & SS_INCOMP) {
292 TAILQ_REMOVE(&head->so_incomp, so, so_list);
293 head->so_incqlen--;
294 } else if (so->so_state & SS_COMP) {
295 /*
296 * We must not decommission a socket that's
297 * on the accept(2) queue. If we do, then
298 * accept(2) may hang after select(2) indicated
299 * that the listening socket was ready.
300 */
301 return;
302 } else {
303 panic("sofree: not queued");
304 }
305 so->so_state &= ~SS_INCOMP;
306 so->so_head = NULL;
307 }
 /*
  * Shut down the send side under the send buffer lock, release the
  * send buffer, flush the receive side and finally deallocate.
  */
308 so->so_snd.sb_flags |= SB_NOINTR;
309 (void)sblock(&so->so_snd, M_WAITOK);
310 s = splimp();
311 socantsendmore(so);
312 splx(s);
313 sbunlock(&so->so_snd);
314 sbrelease(&so->so_snd, so);
315 sorflush(so);
316 sodealloc(so);
317}
318
319/*
320 * Close a socket on last file table reference removal.
321 * Initiate disconnect if connected.
322 * Free socket when disconnect complete.
323 *
324 * This function will sorele() the socket. Note that soclose() may be
325 * called prior to the ref count reaching zero. The actual socket
326 * structure will not be freed until the ref count reaches zero.
 *
 * Returns 0, or the first error produced by sodisconnect(), tsleep() or
 * the pru_detach handler.  Panics if SS_NOFDREF is already set.
327 */
328int
329soclose(so)
330 struct socket *so;
331{
332 int s = splnet(); /* conservative */
333 int error = 0;
334
335 funsetown(&so->so_sigio);
 /* Listening socket: abort every connection still queued on it. */
336 if (so->so_options & SO_ACCEPTCONN) {
337 struct socket *sp, *sonext;
338
339 sp = TAILQ_FIRST(&so->so_incomp);
340 for (; sp != NULL; sp = sonext) {
 /* Fetch the successor first; sp is not used after soabort(). */
341 sonext = TAILQ_NEXT(sp, so_list);
342 (void) soabort(sp);
343 }
344 for (sp = TAILQ_FIRST(&so->so_comp); sp != NULL; sp = sonext) {
345 sonext = TAILQ_NEXT(sp, so_list);
346 /* Dequeue from so_comp since sofree() won't do it */
347 TAILQ_REMOVE(&so->so_comp, sp, so_list);
348 so->so_qlen--;
349 sp->so_state &= ~SS_COMP;
350 sp->so_head = NULL;
351 (void) soabort(sp);
352 }
353 }
 /* No protocol control block: nothing to disconnect or detach. */
354 if (so->so_pcb == NULL)
355 goto discard;
356 if (so->so_state & SS_ISCONNECTED) {
357 if ((so->so_state & SS_ISDISCONNECTING) == 0) {
358 error = sodisconnect(so);
359 if (error)
360 goto drop;
361 }
 /*
  * With SO_LINGER set, sleep on so_timeo until SS_ISCONNECTED
  * clears, unless the socket is non-blocking and already
  * disconnecting.  A nonzero tsleep() result ends the wait.
  */
362 if (so->so_options & SO_LINGER) {
363 if ((so->so_state & SS_ISDISCONNECTING) &&
364 (so->so_state & SS_NBIO))
365 goto drop;
366 while (so->so_state & SS_ISCONNECTED) {
367 error = tsleep(&so->so_timeo,
368 PSOCK | PCATCH, "soclos", so->so_linger * hz);
369 if (error)
370 break;
371 }
372 }
373 }
374drop:
375 if (so->so_pcb != NULL) {
376 int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
 /* The first error wins; detach status is kept only if none. */
377 if (error == 0)
378 error = error2;
379 }
380discard:
381 if (so->so_state & SS_NOFDREF)
382 panic("soclose: NOFDREF");
383 so->so_state |= SS_NOFDREF;
384 sorele(so);
385 splx(s);
386 return (error);
387}
388
389/*
390 * Must be called at splnet...
 *
 * Calls the pru_abort handler of the socket's protocol.  If the handler
 * fails, sotryfree() is called on the socket and the handler error is
 * returned; otherwise 0 is returned.
391 */
392int
393soabort(so)
394 struct socket *so;
395{
396 int error;
397
398 error = (*so->so_proto->pr_usrreqs->pru_abort)(so);
399 if (error) {
400 sotryfree(so); /* note: does not decrement the ref count */
401 return error;
402 }
403 return (0);
404}
405
/*
 * Accept a connection on so.  The socket must arrive with SS_NOFDREF set
 * (otherwise this panics); the flag is cleared here before the pru_accept
 * handler is called with nam.  Returns the handler result.
 */
406int
407soaccept(so, nam)
408 struct socket *so;
409 struct sockaddr **nam;
410{
411 int s = splnet();
412 int error;
413
414 if ((so->so_state & SS_NOFDREF) == 0)
415 panic("soaccept: !NOFDREF");
416 so->so_state &= ~SS_NOFDREF;
417 error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
418 splx(s);
419 return (error);
420}
421
/*
 * Connect so to the address nam.
 * Returns EOPNOTSUPP for a socket with SO_ACCEPTCONN set, EISCONN when an
 * already connected or connecting socket cannot be reconnected (see the
 * comment in the body), and otherwise the pru_connect handler result.
 */
422int
423soconnect(so, nam, td)
424 struct socket *so;
425 struct sockaddr *nam;
426 struct thread *td;
427{
428 int s;
429 int error;
430
431 if (so->so_options & SO_ACCEPTCONN)
432 return (EOPNOTSUPP);
433 s = splnet();
434 /*
435 * If protocol is connection-based, can only connect once.
436 * Otherwise, if connected, try to disconnect first.
437 * This allows user to disconnect by connecting to, e.g.,
438 * a null address.
439 */
 /*
  * The sodisconnect() status assigned inside the condition is not
  * reported: a nonzero value is replaced by EISCONN, and a zero value
  * falls through to pru_connect, which overwrites error.
  */
440 if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
441 ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
442 (error = sodisconnect(so))))
443 error = EISCONN;
444 else
445 error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, td);
446 splx(s);
447 return (error);
448}
449
/*
 * Connect two sockets to each other: call the pru_connect2 handler of the
 * protocol of so1 with (so1, so2), bracketed by splnet()/splx(), and
 * return the handler result.
 */
450int
451soconnect2(so1, so2)
452 struct socket *so1;
453 struct socket *so2;
454{
455 int s = splnet();
456 int error;
457
458 error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);
459 splx(s);
460 return (error);
461}
462
/*
 * Start disconnecting a socket.
 * Returns ENOTCONN if SS_ISCONNECTED is clear, EALREADY if
 * SS_ISDISCONNECTING is already set, and otherwise the result of the
 * pru_disconnect handler.  The "bad" label is the common exit that
 * restores the interrupt priority level saved in s.
 */
463int
464sodisconnect(so)
465 struct socket *so;
466{
467 int s = splnet();
468 int error;
469
470 if ((so->so_state & SS_ISCONNECTED) == 0) {
471 error = ENOTCONN;
472 goto bad;
473 }
474 if (so->so_state & SS_ISDISCONNECTING) {
475 error = EALREADY;
476 goto bad;
477 }
478 error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
479bad:
480 splx(s);
481 return (error);
482}
483
484#define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
485/*
486 * Send on a socket.
487 * If send must go all at once and message is larger than
488 * send buffering, then hard error.
489 * Lock against other senders.
490 * If must go all at once and not enough room now, then
491 * inform user that this would block and do nothing.
492 * Otherwise, if nonblocking, send as much as possible.
493 * The data to be sent is described by "uio" if nonzero,
494 * otherwise by the mbuf chain "top" (which must be null
495 * if uio is not). Data provided in mbuf chain must be small
496 * enough to send all at once.
497 *
498 * Returns nonzero on error, timeout or signal; callers
499 * must check for short counts if EINTR/ERESTART are returned.
500 * Data and control buffers are freed on return.
501 */
502
503#ifdef ZERO_COPY_SOCKETS
504struct so_zerocopy_stats{
505 int size_ok;
506 int align_ok;
507 int found_ifp;
508};
509struct so_zerocopy_stats so_zerocp_stats = {0,0,0};
510#include <netinet/in.h>
511#include <net/route.h>
512#include <netinet/in_pcb.h>
513#include <vm/vm.h>
514#include <vm/vm_page.h>
515#include <vm/vm_object.h>
516#endif /*ZERO_COPY_SOCKETS*/
517
518int
519sosend(so, addr, uio, top, control, flags, td)
520 struct socket *so;
521 struct sockaddr *addr;
522 struct uio *uio;
523 struct mbuf *top;
524 struct mbuf *control;
525 int flags;
526 struct thread *td;
527{
528 struct mbuf **mp;
529 struct mbuf *m;
530 long space, len, resid;
531 int clen = 0, error, s, dontroute, mlen;
530 long space, len = 0, resid;
531 int clen = 0, error, s, dontroute;
532 int atomic = sosendallatonce(so) || top;
533#ifdef ZERO_COPY_SOCKETS
534 int cow_send;
535#endif /* ZERO_COPY_SOCKETS */
536
537 if (uio != NULL)
538 resid = uio->uio_resid;
539 else
540 resid = top->m_pkthdr.len;
541 /*
542 * In theory resid should be unsigned.
543 * However, space must be signed, as it might be less than 0
544 * if we over-committed, and we must use a signed comparison
545 * of space and resid. On the other hand, a negative resid
546 * causes us to loop sending 0-length segments to the protocol.
547 *
548 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
549 * type sockets since that's an error.
550 */
551 if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
552 error = EINVAL;
553 goto out;
554 }
555
556 dontroute =
557 (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
558 (so->so_proto->pr_flags & PR_ATOMIC);
559 if (td != NULL)
560 td->td_proc->p_stats->p_ru.ru_msgsnd++;
561 if (control != NULL)
562 clen = control->m_len;
563#define snderr(errno) { error = (errno); splx(s); goto release; }
564
565restart:
566 error = sblock(&so->so_snd, SBLOCKWAIT(flags));
567 if (error)
568 goto out;
569 do {
570 s = splnet();
571 if (so->so_state & SS_CANTSENDMORE)
572 snderr(EPIPE);
573 if (so->so_error) {
574 error = so->so_error;
575 so->so_error = 0;
576 splx(s);
577 goto release;
578 }
579 if ((so->so_state & SS_ISCONNECTED) == 0) {
580 /*
581 * `sendto' and `sendmsg' is allowed on a connection-
582 * based socket if it supports implied connect.
583 * Return ENOTCONN if not connected and no address is
584 * supplied.
585 */
586 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
587 (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
588 if ((so->so_state & SS_ISCONFIRMING) == 0 &&
589 !(resid == 0 && clen != 0))
590 snderr(ENOTCONN);
591 } else if (addr == NULL)
592 snderr(so->so_proto->pr_flags & PR_CONNREQUIRED ?
593 ENOTCONN : EDESTADDRREQ);
594 }
595 space = sbspace(&so->so_snd);
596 if (flags & MSG_OOB)
597 space += 1024;
598 if ((atomic && resid > so->so_snd.sb_hiwat) ||
599 clen > so->so_snd.sb_hiwat)
600 snderr(EMSGSIZE);
601 if (space < resid + clen &&
602 (atomic || space < so->so_snd.sb_lowat || space < clen)) {
603 if (so->so_state & SS_NBIO)
604 snderr(EWOULDBLOCK);
605 sbunlock(&so->so_snd);
606 error = sbwait(&so->so_snd);
607 splx(s);
608 if (error)
609 goto out;
610 goto restart;
611 }
612 splx(s);
613 mp = &top;
614 space -= clen;
615 do {
616 if (uio == NULL) {
617 /*
618 * Data is prepackaged in "top".
619 */
620 resid = 0;
621 if (flags & MSG_EOR)
622 top->m_flags |= M_EOR;
623 } else do {
624#ifdef ZERO_COPY_SOCKETS
625 cow_send = 0;
626#endif /* ZERO_COPY_SOCKETS */
532 int atomic = sosendallatonce(so) || top;
533#ifdef ZERO_COPY_SOCKETS
534 int cow_send;
535#endif /* ZERO_COPY_SOCKETS */
536
537 if (uio != NULL)
538 resid = uio->uio_resid;
539 else
540 resid = top->m_pkthdr.len;
541 /*
542 * In theory resid should be unsigned.
543 * However, space must be signed, as it might be less than 0
544 * if we over-committed, and we must use a signed comparison
545 * of space and resid. On the other hand, a negative resid
546 * causes us to loop sending 0-length segments to the protocol.
547 *
548 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
549 * type sockets since that's an error.
550 */
551 if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
552 error = EINVAL;
553 goto out;
554 }
555
556 dontroute =
557 (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
558 (so->so_proto->pr_flags & PR_ATOMIC);
559 if (td != NULL)
560 td->td_proc->p_stats->p_ru.ru_msgsnd++;
561 if (control != NULL)
562 clen = control->m_len;
563#define snderr(errno) { error = (errno); splx(s); goto release; }
564
565restart:
566 error = sblock(&so->so_snd, SBLOCKWAIT(flags));
567 if (error)
568 goto out;
569 do {
570 s = splnet();
571 if (so->so_state & SS_CANTSENDMORE)
572 snderr(EPIPE);
573 if (so->so_error) {
574 error = so->so_error;
575 so->so_error = 0;
576 splx(s);
577 goto release;
578 }
579 if ((so->so_state & SS_ISCONNECTED) == 0) {
580 /*
581 * `sendto' and `sendmsg' is allowed on a connection-
582 * based socket if it supports implied connect.
583 * Return ENOTCONN if not connected and no address is
584 * supplied.
585 */
586 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
587 (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
588 if ((so->so_state & SS_ISCONFIRMING) == 0 &&
589 !(resid == 0 && clen != 0))
590 snderr(ENOTCONN);
591 } else if (addr == NULL)
592 snderr(so->so_proto->pr_flags & PR_CONNREQUIRED ?
593 ENOTCONN : EDESTADDRREQ);
594 }
595 space = sbspace(&so->so_snd);
596 if (flags & MSG_OOB)
597 space += 1024;
598 if ((atomic && resid > so->so_snd.sb_hiwat) ||
599 clen > so->so_snd.sb_hiwat)
600 snderr(EMSGSIZE);
601 if (space < resid + clen &&
602 (atomic || space < so->so_snd.sb_lowat || space < clen)) {
603 if (so->so_state & SS_NBIO)
604 snderr(EWOULDBLOCK);
605 sbunlock(&so->so_snd);
606 error = sbwait(&so->so_snd);
607 splx(s);
608 if (error)
609 goto out;
610 goto restart;
611 }
612 splx(s);
613 mp = &top;
614 space -= clen;
615 do {
616 if (uio == NULL) {
617 /*
618 * Data is prepackaged in "top".
619 */
620 resid = 0;
621 if (flags & MSG_EOR)
622 top->m_flags |= M_EOR;
623 } else do {
624#ifdef ZERO_COPY_SOCKETS
625 cow_send = 0;
626#endif /* ZERO_COPY_SOCKETS */
627 if (top == 0) {
628 MGETHDR(m, M_TRYWAIT, MT_DATA);
629 if (m == NULL) {
630 error = ENOBUFS;
631 goto release;
632 }
633 mlen = MHLEN;
634 m->m_pkthdr.len = 0;
635 m->m_pkthdr.rcvif = (struct ifnet *)0;
636 } else {
637 MGET(m, M_TRYWAIT, MT_DATA);
638 if (m == NULL) {
639 error = ENOBUFS;
640 goto release;
641 }
642 mlen = MLEN;
643 }
644 if (resid >= MINCLSIZE) {
645#ifdef ZERO_COPY_SOCKETS
627 if (resid >= MINCLSIZE) {
628#ifdef ZERO_COPY_SOCKETS
629 if (top == NULL) {
630 MGETHDR(m, M_TRYWAIT, MT_DATA);
631 if (m == NULL) {
632 error = ENOBUFS;
633 goto release;
634 }
635 m->m_pkthdr.len = 0;
636 m->m_pkthdr.rcvif = (struct ifnet *)0;
637 } else {
638 MGET(m, M_TRYWAIT, MT_DATA);
639 if (m == NULL) {
640 error = ENOBUFS;
641 goto release;
642 }
643 }
646 if (so_zero_copy_send &&
647 resid>=PAGE_SIZE &&
648 space>=PAGE_SIZE &&
649 uio->uio_iov->iov_len>=PAGE_SIZE) {
650 so_zerocp_stats.size_ok++;
651 if (!((vm_offset_t)
652 uio->uio_iov->iov_base & PAGE_MASK)){
653 so_zerocp_stats.align_ok++;
654 cow_send = socow_setup(m, uio);
655 }
656 }
644 if (so_zero_copy_send &&
645 resid>=PAGE_SIZE &&
646 space>=PAGE_SIZE &&
647 uio->uio_iov->iov_len>=PAGE_SIZE) {
648 so_zerocp_stats.size_ok++;
649 if (!((vm_offset_t)
650 uio->uio_iov->iov_base & PAGE_MASK)){
651 so_zerocp_stats.align_ok++;
652 cow_send = socow_setup(m, uio);
653 }
654 }
657 if (!cow_send){
655 if (!cow_send) {
656 MCLGET(m, M_TRYWAIT);
657 if ((m->m_flags & M_EXT) == 0) {
658 m_free(m);
659 m = NULL;
660 } else {
661 len = min(min(MCLBYTES, resid), space);
662 }
663 } else
664 len = PAGE_SIZE;
665#else /* ZERO_COPY_SOCKETS */
666 if (top == NULL) {
667 m = m_getcl(M_TRYWAIT, MT_DATA, M_PKTHDR);
668 m->m_pkthdr.len = 0;
669 m->m_pkthdr.rcvif = (struct ifnet *)0;
670 } else
671 m = m_getcl(M_TRYWAIT, MT_DATA, 0);
672 len = min(min(MCLBYTES, resid), space);
658#endif /* ZERO_COPY_SOCKETS */
673#endif /* ZERO_COPY_SOCKETS */
659 MCLGET(m, M_TRYWAIT);
660 if ((m->m_flags & M_EXT) == 0)
661 goto nopages;
662 mlen = MCLBYTES;
663 len = min(min(mlen, resid), space);
664 } else {
674 } else {
665#ifdef ZERO_COPY_SOCKETS
666 len = PAGE_SIZE;
667 }
675 if (top == NULL) {
676 m = m_gethdr(M_TRYWAIT, MT_DATA);
677 m->m_pkthdr.len = 0;
678 m->m_pkthdr.rcvif = (struct ifnet *)0;
668
679
669 } else {
670#endif /* ZERO_COPY_SOCKETS */
671nopages:
672 len = min(min(mlen, resid), space);
673 /*
674 * For datagram protocols, leave room
675 * for protocol headers in first mbuf.
676 */
677 if (atomic && top == 0 && len < mlen)
678 MH_ALIGN(m, len);
680 len = min(min(MHLEN, resid), space);
681 /*
682 * For datagram protocols, leave room
683 * for protocol headers in first mbuf.
684 */
685 if (atomic && m && len < MHLEN)
686 MH_ALIGN(m, len);
687 } else {
688 m = m_get(M_TRYWAIT, MT_DATA);
689 len = min(min(MLEN, resid), space);
690 }
679 }
691 }
692 if (m == NULL) {
693 error = ENOBUFS;
694 goto release;
695 }
696
680 space -= len;
681#ifdef ZERO_COPY_SOCKETS
682 if (cow_send)
683 error = 0;
684 else
685#endif /* ZERO_COPY_SOCKETS */
686 error = uiomove(mtod(m, void *), (int)len, uio);
687 resid = uio->uio_resid;
688 m->m_len = len;
689 *mp = m;
690 top->m_pkthdr.len += len;
691 if (error)
692 goto release;
693 mp = &m->m_next;
694 if (resid <= 0) {
695 if (flags & MSG_EOR)
696 top->m_flags |= M_EOR;
697 break;
698 }
699 } while (space > 0 && atomic);
700 if (dontroute)
701 so->so_options |= SO_DONTROUTE;
702 s = splnet(); /* XXX */
703 /*
704 * XXX all the SS_CANTSENDMORE checks previously
705 * done could be out of date. We could have received
706 * a reset packet in an interrupt or maybe we slept
707 * while doing page faults in uiomove() etc. We could
708 * probably recheck again inside the splnet() protection
709 * here, but there are probably other places that this
710 * also happens. We must rethink this.
711 */
712 error = (*so->so_proto->pr_usrreqs->pru_send)(so,
713 (flags & MSG_OOB) ? PRUS_OOB :
714 /*
715 * If the user set MSG_EOF, the protocol
716 * understands this flag and nothing left to
717 * send then use PRU_SEND_EOF instead of PRU_SEND.
718 */
719 ((flags & MSG_EOF) &&
720 (so->so_proto->pr_flags & PR_IMPLOPCL) &&
721 (resid <= 0)) ?
722 PRUS_EOF :
723 /* If there is more to send set PRUS_MORETOCOME */
724 (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
725 top, addr, control, td);
726 splx(s);
727 if (dontroute)
728 so->so_options &= ~SO_DONTROUTE;
729 clen = 0;
730 control = NULL;
731 top = NULL;
732 mp = &top;
733 if (error)
734 goto release;
735 } while (resid && space > 0);
736 } while (resid);
737
738release:
739 sbunlock(&so->so_snd);
740out:
741 if (top != NULL)
742 m_freem(top);
743 if (control != NULL)
744 m_freem(control);
745 return (error);
746}
747
748/*
749 * Implement receive operations on a socket.
750 * We depend on the way that records are added to the sockbuf
751 * by sbappend*. In particular, each record (mbufs linked through m_next)
752 * must begin with an address if the protocol so specifies,
753 * followed by an optional mbuf or mbufs containing ancillary data,
754 * and then zero or more mbufs of data.
755 * In order to avoid blocking network interrupts for the entire time here,
756 * we splx() while doing the actual copy to user space.
757 * Although the sockbuf is locked, new data may still be appended,
758 * and thus we must maintain consistency of the sockbuf during that time.
759 *
760 * The caller may receive the data as a single mbuf chain by supplying
761 * an mbuf **mp0 for use in returning the chain. The uio is then used
762 * only for the count in uio_resid.
763 */
/*
 * Arguments, as used by the code below:
 *   so       - socket whose receive buffer (so_rcv) is read.
 *   psa      - if non-NULL, cleared on entry; for PR_ADDR protocols it
 *              receives a sodupsockaddr() copy of the record's address.
 *   uio      - destination of the data; uio_resid is the requested count.
 *   mp0      - if non-NULL, data mbufs are handed back through it instead
 *              of being copied out through the uio.
 *   controlp - if non-NULL, cleared on entry; receives control mbufs.
 *   flagsp   - if non-NULL, supplies the MSG_* input flags and has the
 *              resulting flags OR'd into it on normal completion.
 * Returns 0 or an errno value.
 */
764int
765soreceive(so, psa, uio, mp0, controlp, flagsp)
766 struct socket *so;
767 struct sockaddr **psa;
768 struct uio *uio;
769 struct mbuf **mp0;
770 struct mbuf **controlp;
771 int *flagsp;
772{
773 struct mbuf *m, **mp;
774 int flags, len, error, s, offset;
775 struct protosw *pr = so->so_proto;
776 struct mbuf *nextrecord;
777 int moff, type = 0;
	/*
	 * orig_resid is compared with uio_resid near the end to detect
	 * "nothing was transferred"; it is zeroed below once an address or
	 * control data has been returned to the caller.
	 */
778 int orig_resid = uio->uio_resid;
779
780 mp = mp0;
	/* Clear the optional output slots before any return path. */
781 if (psa != NULL)
782 *psa = 0;
783 if (controlp != NULL)
784 *controlp = 0;
	/*
	 * MSG_EOR is masked off the input flags; it is set again below only
	 * when an mbuf carrying M_EOR is consumed.
	 */
785 if (flagsp != NULL)
786 flags = *flagsp &~ MSG_EOR;
787 else
788 flags = 0;
	/*
	 * Out-of-band request: the protocol fills a fresh mbuf through
	 * pru_rcvoob, the data is copied to the uio, and the function returns
	 * without locking or touching so_rcv.
	 */
789 if (flags & MSG_OOB) {
790 m = m_get(M_TRYWAIT, MT_DATA);
791 if (m == NULL)
792 return (ENOBUFS);
793 error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
794 if (error)
795 goto bad;
796 do {
797#ifdef ZERO_COPY_SOCKETS
798 if (so_zero_copy_receive) {
799 vm_page_t pg;
800 int disposable;
801
802 if ((m->m_flags & M_EXT)
803 && (m->m_ext.ext_type == EXT_DISPOSABLE))
804 disposable = 1;
805 else
806 disposable = 0;
807
808 pg = PHYS_TO_VM_PAGE(vtophys(mtod(m, caddr_t)));
809 if (uio->uio_offset == -1)
810 uio->uio_offset =IDX_TO_OFF(pg->pindex);
811
812 error = uiomoveco(mtod(m, void *),
813 min(uio->uio_resid, m->m_len),
814 uio, pg->object,
815 disposable);
816 } else
817#endif /* ZERO_COPY_SOCKETS */
818 error = uiomove(mtod(m, void *),
819 (int) min(uio->uio_resid, m->m_len), uio);
	/* m_free() returns the next mbuf in the chain, advancing the loop. */
820 m = m_free(m);
821 } while (uio->uio_resid && error == 0 && m);
822bad:
	/* Release whatever is left of the OOB chain (error or short read). */
823 if (m != NULL)
824 m_freem(m);
825 return (error);
826 }
827 if (mp != NULL)
828 *mp = NULL;
829 if (so->so_state & SS_ISCONFIRMING && uio->uio_resid)
830 (*pr->pr_usrreqs->pru_rcvd)(so, 0);
831
	/*
	 * Re-entered after every sleep in sbwait() and when a pass
	 * transferred nothing: take the sockbuf lock and re-evaluate.
	 */
832restart:
833 error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
834 if (error)
835 return (error);
836 s = splnet();
837
838 m = so->so_rcv.sb_mb;
839 /*
840 * If we have less data than requested, block awaiting more
841 * (subject to any timeout) if:
842 * 1. the current count is less than the low water mark, or
843 * 2. MSG_WAITALL is set, and it is possible to do the entire
844 * receive operation at once if we block (resid <= hiwat).
845 * 3. MSG_DONTWAIT is not set
846 * If MSG_WAITALL is set but resid is larger than the receive buffer,
847 * we have to do the receive in sections, and thus risk returning
848 * a short count if a timeout or signal occurs after we start.
849 */
850 if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
851 so->so_rcv.sb_cc < uio->uio_resid) &&
852 (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
853 ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
854 m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
855 KASSERT(m != NULL || !so->so_rcv.sb_cc,
856 ("receive: m == %p so->so_rcv.sb_cc == %u",
857 m, so->so_rcv.sb_cc));
	/*
	 * Queued data is delivered before a pending socket error is
	 * reported; with MSG_PEEK the error is returned but left pending.
	 */
858 if (so->so_error) {
859 if (m != NULL)
860 goto dontblock;
861 error = so->so_error;
862 if ((flags & MSG_PEEK) == 0)
863 so->so_error = 0;
864 goto release;
865 }
	/*
	 * Receive side is shut down: deliver what is queued, otherwise
	 * return with error still 0 and nothing transferred.
	 */
866 if (so->so_state & SS_CANTRCVMORE) {
867 if (m)
868 goto dontblock;
869 else
870 goto release;
871 }
	/*
	 * Do not sleep if the queued record already contains OOB data or an
	 * end-of-record mark; rewind m to the head and deliver it.
	 */
872 for (; m != NULL; m = m->m_next)
873 if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
874 m = so->so_rcv.sb_mb;
875 goto dontblock;
876 }
877 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
878 (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
879 error = ENOTCONN;
880 goto release;
881 }
	/* A zero-length request has nothing to wait for. */
882 if (uio->uio_resid == 0)
883 goto release;
884 if ((so->so_state & SS_NBIO) || (flags & MSG_DONTWAIT)) {
885 error = EWOULDBLOCK;
886 goto release;
887 }
888 SBLASTRECORDCHK(&so->so_rcv);
889 SBLASTMBUFCHK(&so->so_rcv);
	/* Drop the sockbuf lock, sleep in sbwait(), then start over. */
890 sbunlock(&so->so_rcv);
891 error = sbwait(&so->so_rcv);
892 splx(s);
893 if (error)
894 return (error);
895 goto restart;
896 }
897dontblock:
	/* Account the receive in the calling thread's resource usage. */
898 if (uio->uio_td)
899 uio->uio_td->td_proc->p_stats->p_ru.ru_msgrcv++;
900 SBLASTRECORDCHK(&so->so_rcv);
901 SBLASTMBUFCHK(&so->so_rcv);
902 nextrecord = m->m_nextpkt;
	/*
	 * PR_ADDR protocols: the record begins with an MT_SONAME mbuf.
	 * Copy the address out if wanted, then step past it (MSG_PEEK) or
	 * unlink and free it.
	 * NOTE(review): the copy uses M_WAITOK only when mp0 == NULL;
	 * presumably mbuf-chain callers must not sleep here - confirm.
	 */
903 if (pr->pr_flags & PR_ADDR) {
904 KASSERT(m->m_type == MT_SONAME,
905 ("m->m_type == %d", m->m_type));
906 orig_resid = 0;
907 if (psa != NULL)
908 *psa = sodupsockaddr(mtod(m, struct sockaddr *),
909 mp0 == NULL ? M_WAITOK : M_NOWAIT);
910 if (flags & MSG_PEEK) {
911 m = m->m_next;
912 } else {
913 sbfree(&so->so_rcv, m);
914 so->so_rcv.sb_mb = m_free(m);
915 m = so->so_rcv.sb_mb;
916 }
917 }
	/*
	 * Consume the leading MT_CONTROL mbufs. When peeking, the caller
	 * gets a copy; otherwise each mbuf is unlinked from the sockbuf and
	 * passed to the domain's dom_externalize hook if there is one, else
	 * handed to the caller or freed.
	 */
918 while (m != NULL && m->m_type == MT_CONTROL && error == 0) {
919 if (flags & MSG_PEEK) {
920 if (controlp != NULL)
921 *controlp = m_copy(m, 0, m->m_len);
922 m = m->m_next;
923 } else {
924 sbfree(&so->so_rcv, m);
925 so->so_rcv.sb_mb = m->m_next;
926 m->m_next = NULL;
927 if (pr->pr_domain->dom_externalize)
928 error =
929 (*pr->pr_domain->dom_externalize)(m, controlp);
930 else if (controlp != NULL)
931 *controlp = m;
932 else
933 m_freem(m);
934 m = so->so_rcv.sb_mb;
935 }
	/* Advance controlp to the tail so the next mbuf is appended. */
936 if (controlp != NULL) {
937 orig_resid = 0;
938 while (*controlp != NULL)
939 controlp = &(*controlp)->m_next;
940 }
941 }
	/*
	 * Re-attach the record link to the new head mbuf (or, if the record
	 * is now empty, make the next record the head of the sockbuf) and
	 * note the type of the first data mbuf.
	 */
942 if (m != NULL) {
943 if ((flags & MSG_PEEK) == 0) {
944 m->m_nextpkt = nextrecord;
945 /*
946 * If nextrecord == NULL (this is a single chain),
947 * then sb_lastrecord may not be valid here if m
948 * was changed earlier.
949 */
950 if (nextrecord == NULL) {
951 KASSERT(so->so_rcv.sb_mb == m,
952 ("receive tailq 1"));
953 so->so_rcv.sb_lastrecord = m;
954 }
955 }
956 type = m->m_type;
957 if (type == MT_OOBDATA)
958 flags |= MSG_OOB;
959 } else {
960 if ((flags & MSG_PEEK) == 0) {
961 KASSERT(so->so_rcv.sb_mb == m,("receive tailq 2"));
962 so->so_rcv.sb_mb = nextrecord;
963 SB_EMPTY_FIXUP(&so->so_rcv);
964 }
965 }
966 SBLASTRECORDCHK(&so->so_rcv);
967 SBLASTMBUFCHK(&so->so_rcv);
968
	/*
	 * moff: offset into the current mbuf while peeking.
	 * offset: bytes peeked so far, compared against so_oobmark.
	 */
969 moff = 0;
970 offset = 0;
971 while (m != NULL && uio->uio_resid > 0 && error == 0) {
	/* Stop where OOB and non-OOB mbufs meet; never mix the two. */
972 if (m->m_type == MT_OOBDATA) {
973 if (type != MT_OOBDATA)
974 break;
975 } else if (type == MT_OOBDATA)
976 break;
977 else
978 KASSERT(m->m_type == MT_DATA || m->m_type == MT_HEADER,
979 ("m->m_type == %d", m->m_type));
980 so->so_state &= ~SS_RCVATMARK;
	/*
	 * len = bytes to move this pass: limited by the request, by the
	 * distance to the OOB mark, and by what remains in this mbuf.
	 */
981 len = uio->uio_resid;
982 if (so->so_oobmark && len > so->so_oobmark - offset)
983 len = so->so_oobmark - offset;
984 if (len > m->m_len - moff)
985 len = m->m_len - moff;
986 /*
987 * If mp is set, just pass back the mbufs.
988 * Otherwise copy them out via the uio, then free.
989 * Sockbuf must be consistent here (points to current mbuf,
990 * it points to next record) when we drop priority;
991 * we must note any additions to the sockbuf when we
992 * block interrupts again.
993 */
994 if (mp == NULL) {
995 SBLASTRECORDCHK(&so->so_rcv);
996 SBLASTMBUFCHK(&so->so_rcv);
997 splx(s);
998#ifdef ZERO_COPY_SOCKETS
999 if (so_zero_copy_receive) {
1000 vm_page_t pg;
1001 int disposable;
1002
1003 if ((m->m_flags & M_EXT)
1004 && (m->m_ext.ext_type == EXT_DISPOSABLE))
1005 disposable = 1;
1006 else
1007 disposable = 0;
1008
1009 pg = PHYS_TO_VM_PAGE(vtophys(mtod(m, caddr_t) +
1010 moff));
1011
1012 if (uio->uio_offset == -1)
1013 uio->uio_offset =IDX_TO_OFF(pg->pindex);
1014
1015 error = uiomoveco(mtod(m, char *) + moff,
1016 (int)len, uio,pg->object,
1017 disposable);
1018 } else
1019#endif /* ZERO_COPY_SOCKETS */
1020 error = uiomove(mtod(m, char *) + moff, (int)len, uio);
1021 s = splnet();
1022 if (error)
1023 goto release;
1024 } else
	/* mbuf hand-back mode: nothing is copied, only accounted for. */
1025 uio->uio_resid -= len;
	/*
	 * The whole remainder of this mbuf was consumed: step past it
	 * (MSG_PEEK) or unlink it from the sockbuf and either pass it back
	 * through mp or free it.
	 */
1026 if (len == m->m_len - moff) {
1027 if (m->m_flags & M_EOR)
1028 flags |= MSG_EOR;
1029 if (flags & MSG_PEEK) {
1030 m = m->m_next;
1031 moff = 0;
1032 } else {
1033 nextrecord = m->m_nextpkt;
1034 sbfree(&so->so_rcv, m);
1035 if (mp != NULL) {
1036 *mp = m;
1037 mp = &m->m_next;
1038 so->so_rcv.sb_mb = m = m->m_next;
1039 *mp = NULL;
1040 } else {
1041 so->so_rcv.sb_mb = m_free(m);
1042 m = so->so_rcv.sb_mb;
1043 }
1044 if (m != NULL) {
1045 m->m_nextpkt = nextrecord;
1046 if (nextrecord == NULL)
1047 so->so_rcv.sb_lastrecord = m;
1048 } else {
1049 so->so_rcv.sb_mb = nextrecord;
1050 SB_EMPTY_FIXUP(&so->so_rcv);
1051 }
1052 SBLASTRECORDCHK(&so->so_rcv);
1053 SBLASTMBUFCHK(&so->so_rcv);
1054 }
1055 } else {
	/*
	 * Only part of the mbuf was consumed: remember the offset when
	 * peeking; otherwise hand back a copy of the consumed bytes (mp
	 * mode) and trim them from the front of the mbuf in place.
	 */
1056 if (flags & MSG_PEEK)
1057 moff += len;
1058 else {
1059 if (mp != NULL)
1060 *mp = m_copym(m, 0, len, M_TRYWAIT);
1061 m->m_data += len;
1062 m->m_len -= len;
1063 so->so_rcv.sb_cc -= len;
1064 }
1065 }
	/*
	 * Track progress toward the OOB mark and stop once it is reached,
	 * so the loop does not continue past the mark.
	 */
1066 if (so->so_oobmark) {
1067 if ((flags & MSG_PEEK) == 0) {
1068 so->so_oobmark -= len;
1069 if (so->so_oobmark == 0) {
1070 so->so_state |= SS_RCVATMARK;
1071 break;
1072 }
1073 } else {
1074 offset += len;
1075 if (offset == so->so_oobmark)
1076 break;
1077 }
1078 }
1079 if (flags & MSG_EOR)
1080 break;
1081 /*
1082 * If the MSG_WAITALL flag is set (for non-atomic socket),
1083 * we must not quit until "uio->uio_resid == 0" or an error
1084 * termination. If a signal/timeout occurs, return
1085 * with a short count but without error.
1086 * Keep sockbuf locked against other readers.
1087 */
1088 while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
1089 !sosendallatonce(so) && nextrecord == NULL) {
1090 if (so->so_error || so->so_state & SS_CANTRCVMORE)
1091 break;
1092 /*
1093 * Notify the protocol that some data has been
1094 * drained before blocking.
1095 */
1096 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb != NULL)
1097 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
1098 SBLASTRECORDCHK(&so->so_rcv);
1099 SBLASTMBUFCHK(&so->so_rcv);
1100 error = sbwait(&so->so_rcv);
	/*
	 * A failed sbwait() ends the call with return value 0; the short
	 * count is visible to the caller through uio_resid.
	 * NOTE(review): *flagsp is not updated on this return path.
	 */
1101 if (error) {
1102 sbunlock(&so->so_rcv);
1103 splx(s);
1104 return (0);
1105 }
1106 m = so->so_rcv.sb_mb;
1107 if (m != NULL)
1108 nextrecord = m->m_nextpkt;
1109 }
1110 }
1111
	/*
	 * PR_ATOMIC protocol with data still left in the record: report
	 * truncation and, unless peeking, drop the rest of the record.
	 */
1112 if (m != NULL && pr->pr_flags & PR_ATOMIC) {
1113 flags |= MSG_TRUNC;
1114 if ((flags & MSG_PEEK) == 0)
1115 (void) sbdroprecord(&so->so_rcv);
1116 }
1117 if ((flags & MSG_PEEK) == 0) {
1118 if (m == NULL) {
1119 /*
1120 * First part is an inline SB_EMPTY_FIXUP(). Second
1121 * part makes sure sb_lastrecord is up-to-date if
1122 * there is still data in the socket buffer.
1123 */
1124 so->so_rcv.sb_mb = nextrecord;
1125 if (so->so_rcv.sb_mb == NULL) {
1126 so->so_rcv.sb_mbtail = NULL;
1127 so->so_rcv.sb_lastrecord = NULL;
1128 } else if (nextrecord->m_nextpkt == NULL)
1129 so->so_rcv.sb_lastrecord = nextrecord;
1130 }
1131 SBLASTRECORDCHK(&so->so_rcv);
1132 SBLASTMBUFCHK(&so->so_rcv);
	/* Let a PR_WANTRCVD protocol know that data was consumed. */
1133 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
1134 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
1135 }
	/*
	 * Nothing at all was transferred (no data, and orig_resid was not
	 * zeroed by an address or control mbuf), no end-of-record was seen
	 * and more data may still arrive: unlock and try again.
	 */
1136 if (orig_resid == uio->uio_resid && orig_resid &&
1137 (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
1138 sbunlock(&so->so_rcv);
1139 splx(s);
1140 goto restart;
1141 }
1142
1143 if (flagsp != NULL)
1144 *flagsp |= flags;
	/* Common exit: drop the sockbuf lock and restore the spl level. */
1145release:
1146 sbunlock(&so->so_rcv);
1147 splx(s);
1148 return (error);
1149}
1150
1151int
1152soshutdown(so, how)
1153 struct socket *so;
1154 int how;
1155{
1156 struct protosw *pr = so->so_proto;
1157
1158 if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
1159 return (EINVAL);
1160
1161 if (how != SHUT_WR)
1162 sorflush(so);
1163 if (how != SHUT_RD)
1164 return ((*pr->pr_usrreqs->pru_shutdown)(so));
1165 return (0);
1166}
1167
1168void
1169sorflush(so)
1170 struct socket *so;
1171{
1172 struct sockbuf *sb = &so->so_rcv;
1173 struct protosw *pr = so->so_proto;
1174 int s;
1175 struct sockbuf asb;
1176
1177 sb->sb_flags |= SB_NOINTR;
1178 (void) sblock(sb, M_WAITOK);
1179 s = splimp();
1180 socantrcvmore(so);
1181 sbunlock(sb);
1182 asb = *sb;
1183 /*
1184 * Invalidate/clear most of the sockbuf structure, but keep
1185 * its selinfo structure valid.
1186 */
1187 bzero(&sb->sb_startzero,
1188 sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
1189 splx(s);
1190
1191 if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
1192 (*pr->pr_domain->dom_dispose)(asb.sb_mb);
1193 sbrelease(&asb, so);
1194}
1195
1196#ifdef INET
1197static int
1198do_setopt_accept_filter(so, sopt)
1199 struct socket *so;
1200 struct sockopt *sopt;
1201{
1202 struct accept_filter_arg *afap = NULL;
1203 struct accept_filter *afp;
1204 struct so_accf *af = so->so_accf;
1205 int error = 0;
1206
1207 /* do not set/remove accept filters on non listen sockets */
1208 if ((so->so_options & SO_ACCEPTCONN) == 0) {
1209 error = EINVAL;
1210 goto out;
1211 }
1212
1213 /* removing the filter */
1214 if (sopt == NULL) {
1215 if (af != NULL) {
1216 if (af->so_accept_filter != NULL &&
1217 af->so_accept_filter->accf_destroy != NULL) {
1218 af->so_accept_filter->accf_destroy(so);
1219 }
1220 if (af->so_accept_filter_str != NULL) {
1221 FREE(af->so_accept_filter_str, M_ACCF);
1222 }
1223 FREE(af, M_ACCF);
1224 so->so_accf = NULL;
1225 }
1226 so->so_options &= ~SO_ACCEPTFILTER;
1227 return (0);
1228 }
1229 /* adding a filter */
1230 /* must remove previous filter first */
1231 if (af != NULL) {
1232 error = EINVAL;
1233 goto out;
1234 }
1235 /* don't put large objects on the kernel stack */
1236 MALLOC(afap, struct accept_filter_arg *, sizeof(*afap), M_TEMP, M_WAITOK);
1237 error = sooptcopyin(sopt, afap, sizeof *afap, sizeof *afap);
1238 afap->af_name[sizeof(afap->af_name)-1] = '\0';
1239 afap->af_arg[sizeof(afap->af_arg)-1] = '\0';
1240 if (error)
1241 goto out;
1242 afp = accept_filt_get(afap->af_name);
1243 if (afp == NULL) {
1244 error = ENOENT;
1245 goto out;
1246 }
1247 MALLOC(af, struct so_accf *, sizeof(*af), M_ACCF, M_WAITOK | M_ZERO);
1248 if (afp->accf_create != NULL) {
1249 if (afap->af_name[0] != '\0') {
1250 int len = strlen(afap->af_name) + 1;
1251
1252 MALLOC(af->so_accept_filter_str, char *, len, M_ACCF, M_WAITOK);
1253 strcpy(af->so_accept_filter_str, afap->af_name);
1254 }
1255 af->so_accept_filter_arg = afp->accf_create(so, afap->af_arg);
1256 if (af->so_accept_filter_arg == NULL) {
1257 FREE(af->so_accept_filter_str, M_ACCF);
1258 FREE(af, M_ACCF);
1259 so->so_accf = NULL;
1260 error = EINVAL;
1261 goto out;
1262 }
1263 }
1264 af->so_accept_filter = afp;
1265 so->so_accf = af;
1266 so->so_options |= SO_ACCEPTFILTER;
1267out:
1268 if (afap != NULL)
1269 FREE(afap, M_TEMP);
1270 return (error);
1271}
1272#endif /* INET */
1273
1274/*
1275 * Perhaps this routine, and sooptcopyout(), below, ought to come in
1276 * an additional variant to handle the case where the option value needs
1277 * to be some kind of integer, but not a specific size.
1278 * In addition to their use here, these functions are also called by the
1279 * protocol-level pr_ctloutput() routines.
1280 */
1281int
1282sooptcopyin(sopt, buf, len, minlen)
1283 struct sockopt *sopt;
1284 void *buf;
1285 size_t len;
1286 size_t minlen;
1287{
1288 size_t valsize;
1289
1290 /*
1291 * If the user gives us more than we wanted, we ignore it,
1292 * but if we don't get the minimum length the caller
1293 * wants, we return EINVAL. On success, sopt->sopt_valsize
1294 * is set to however much we actually retrieved.
1295 */
1296 if ((valsize = sopt->sopt_valsize) < minlen)
1297 return EINVAL;
1298 if (valsize > len)
1299 sopt->sopt_valsize = valsize = len;
1300
1301 if (sopt->sopt_td != NULL)
1302 return (copyin(sopt->sopt_val, buf, valsize));
1303
1304 bcopy(sopt->sopt_val, buf, valsize);
1305 return 0;
1306}
1307
1308int
1309sosetopt(so, sopt)
1310 struct socket *so;
1311 struct sockopt *sopt;
1312{
1313 int error, optval;
1314 struct linger l;
1315 struct timeval tv;
1316 u_long val;
1317#ifdef MAC
1318 struct mac extmac;
1319#endif
1320
1321 error = 0;
1322 if (sopt->sopt_level != SOL_SOCKET) {
1323 if (so->so_proto && so->so_proto->pr_ctloutput)
1324 return ((*so->so_proto->pr_ctloutput)
1325 (so, sopt));
1326 error = ENOPROTOOPT;
1327 } else {
1328 switch (sopt->sopt_name) {
1329#ifdef INET
1330 case SO_ACCEPTFILTER:
1331 error = do_setopt_accept_filter(so, sopt);
1332 if (error)
1333 goto bad;
1334 break;
1335#endif
1336 case SO_LINGER:
1337 error = sooptcopyin(sopt, &l, sizeof l, sizeof l);
1338 if (error)
1339 goto bad;
1340
1341 so->so_linger = l.l_linger;
1342 if (l.l_onoff)
1343 so->so_options |= SO_LINGER;
1344 else
1345 so->so_options &= ~SO_LINGER;
1346 break;
1347
1348 case SO_DEBUG:
1349 case SO_KEEPALIVE:
1350 case SO_DONTROUTE:
1351 case SO_USELOOPBACK:
1352 case SO_BROADCAST:
1353 case SO_REUSEADDR:
1354 case SO_REUSEPORT:
1355 case SO_OOBINLINE:
1356 case SO_TIMESTAMP:
1357 case SO_BINTIME:
1358 case SO_NOSIGPIPE:
1359 error = sooptcopyin(sopt, &optval, sizeof optval,
1360 sizeof optval);
1361 if (error)
1362 goto bad;
1363 if (optval)
1364 so->so_options |= sopt->sopt_name;
1365 else
1366 so->so_options &= ~sopt->sopt_name;
1367 break;
1368
1369 case SO_SNDBUF:
1370 case SO_RCVBUF:
1371 case SO_SNDLOWAT:
1372 case SO_RCVLOWAT:
1373 error = sooptcopyin(sopt, &optval, sizeof optval,
1374 sizeof optval);
1375 if (error)
1376 goto bad;
1377
1378 /*
1379 * Values < 1 make no sense for any of these
1380 * options, so disallow them.
1381 */
1382 if (optval < 1) {
1383 error = EINVAL;
1384 goto bad;
1385 }
1386
1387 switch (sopt->sopt_name) {
1388 case SO_SNDBUF:
1389 case SO_RCVBUF:
1390 if (sbreserve(sopt->sopt_name == SO_SNDBUF ?
1391 &so->so_snd : &so->so_rcv, (u_long)optval,
1392 so, curthread) == 0) {
1393 error = ENOBUFS;
1394 goto bad;
1395 }
1396 break;
1397
1398 /*
1399 * Make sure the low-water is never greater than
1400 * the high-water.
1401 */
1402 case SO_SNDLOWAT:
1403 so->so_snd.sb_lowat =
1404 (optval > so->so_snd.sb_hiwat) ?
1405 so->so_snd.sb_hiwat : optval;
1406 break;
1407 case SO_RCVLOWAT:
1408 so->so_rcv.sb_lowat =
1409 (optval > so->so_rcv.sb_hiwat) ?
1410 so->so_rcv.sb_hiwat : optval;
1411 break;
1412 }
1413 break;
1414
1415 case SO_SNDTIMEO:
1416 case SO_RCVTIMEO:
1417 error = sooptcopyin(sopt, &tv, sizeof tv,
1418 sizeof tv);
1419 if (error)
1420 goto bad;
1421
1422 /* assert(hz > 0); */
1423 if (tv.tv_sec < 0 || tv.tv_sec > SHRT_MAX / hz ||
1424 tv.tv_usec < 0 || tv.tv_usec >= 1000000) {
1425 error = EDOM;
1426 goto bad;
1427 }
1428 /* assert(tick > 0); */
1429 /* assert(ULONG_MAX - SHRT_MAX >= 1000000); */
1430 val = (u_long)(tv.tv_sec * hz) + tv.tv_usec / tick;
1431 if (val > SHRT_MAX) {
1432 error = EDOM;
1433 goto bad;
1434 }
1435 if (val == 0 && tv.tv_usec != 0)
1436 val = 1;
1437
1438 switch (sopt->sopt_name) {
1439 case SO_SNDTIMEO:
1440 so->so_snd.sb_timeo = val;
1441 break;
1442 case SO_RCVTIMEO:
1443 so->so_rcv.sb_timeo = val;
1444 break;
1445 }
1446 break;
1447 case SO_LABEL:
1448#ifdef MAC
1449 error = sooptcopyin(sopt, &extmac, sizeof extmac,
1450 sizeof extmac);
1451 if (error)
1452 goto bad;
1453 error = mac_setsockopt_label(sopt->sopt_td->td_ucred,
1454 so, &extmac);
1455#else
1456 error = EOPNOTSUPP;
1457#endif
1458 break;
1459 default:
1460 error = ENOPROTOOPT;
1461 break;
1462 }
1463 if (error == 0 && so->so_proto != NULL &&
1464 so->so_proto->pr_ctloutput != NULL) {
1465 (void) ((*so->so_proto->pr_ctloutput)
1466 (so, sopt));
1467 }
1468 }
1469bad:
1470 return (error);
1471}
1472
1473/* Helper routine for getsockopt */
1474int
1475sooptcopyout(struct sockopt *sopt, const void *buf, size_t len)
1476{
1477 int error;
1478 size_t valsize;
1479
1480 error = 0;
1481
1482 /*
1483 * Documented get behavior is that we always return a value,
1484 * possibly truncated to fit in the user's buffer.
1485 * Traditional behavior is that we always tell the user
1486 * precisely how much we copied, rather than something useful
1487 * like the total amount we had available for her.
1488 * Note that this interface is not idempotent; the entire answer must
1489 * generated ahead of time.
1490 */
1491 valsize = min(len, sopt->sopt_valsize);
1492 sopt->sopt_valsize = valsize;
1493 if (sopt->sopt_val != NULL) {
1494 if (sopt->sopt_td != NULL)
1495 error = copyout(buf, sopt->sopt_val, valsize);
1496 else
1497 bcopy(buf, sopt->sopt_val, valsize);
1498 }
1499 return error;
1500}
1501
1502int
1503sogetopt(so, sopt)
1504 struct socket *so;
1505 struct sockopt *sopt;
1506{
1507 int error, optval;
1508 struct linger l;
1509 struct timeval tv;
1510#ifdef INET
1511 struct accept_filter_arg *afap;
1512#endif
1513#ifdef MAC
1514 struct mac extmac;
1515#endif
1516
1517 error = 0;
1518 if (sopt->sopt_level != SOL_SOCKET) {
1519 if (so->so_proto && so->so_proto->pr_ctloutput) {
1520 return ((*so->so_proto->pr_ctloutput)
1521 (so, sopt));
1522 } else
1523 return (ENOPROTOOPT);
1524 } else {
1525 switch (sopt->sopt_name) {
1526#ifdef INET
1527 case SO_ACCEPTFILTER:
1528 if ((so->so_options & SO_ACCEPTCONN) == 0)
1529 return (EINVAL);
1530 MALLOC(afap, struct accept_filter_arg *, sizeof(*afap),
1531 M_TEMP, M_WAITOK | M_ZERO);
1532 if ((so->so_options & SO_ACCEPTFILTER) != 0) {
1533 strcpy(afap->af_name, so->so_accf->so_accept_filter->accf_name);
1534 if (so->so_accf->so_accept_filter_str != NULL)
1535 strcpy(afap->af_arg, so->so_accf->so_accept_filter_str);
1536 }
1537 error = sooptcopyout(sopt, afap, sizeof(*afap));
1538 FREE(afap, M_TEMP);
1539 break;
1540#endif
1541
1542 case SO_LINGER:
1543 l.l_onoff = so->so_options & SO_LINGER;
1544 l.l_linger = so->so_linger;
1545 error = sooptcopyout(sopt, &l, sizeof l);
1546 break;
1547
1548 case SO_USELOOPBACK:
1549 case SO_DONTROUTE:
1550 case SO_DEBUG:
1551 case SO_KEEPALIVE:
1552 case SO_REUSEADDR:
1553 case SO_REUSEPORT:
1554 case SO_BROADCAST:
1555 case SO_OOBINLINE:
1556 case SO_TIMESTAMP:
1557 case SO_BINTIME:
1558 case SO_NOSIGPIPE:
1559 optval = so->so_options & sopt->sopt_name;
1560integer:
1561 error = sooptcopyout(sopt, &optval, sizeof optval);
1562 break;
1563
1564 case SO_TYPE:
1565 optval = so->so_type;
1566 goto integer;
1567
1568 case SO_ERROR:
1569 optval = so->so_error;
1570 so->so_error = 0;
1571 goto integer;
1572
1573 case SO_SNDBUF:
1574 optval = so->so_snd.sb_hiwat;
1575 goto integer;
1576
1577 case SO_RCVBUF:
1578 optval = so->so_rcv.sb_hiwat;
1579 goto integer;
1580
1581 case SO_SNDLOWAT:
1582 optval = so->so_snd.sb_lowat;
1583 goto integer;
1584
1585 case SO_RCVLOWAT:
1586 optval = so->so_rcv.sb_lowat;
1587 goto integer;
1588
1589 case SO_SNDTIMEO:
1590 case SO_RCVTIMEO:
1591 optval = (sopt->sopt_name == SO_SNDTIMEO ?
1592 so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
1593
1594 tv.tv_sec = optval / hz;
1595 tv.tv_usec = (optval % hz) * tick;
1596 error = sooptcopyout(sopt, &tv, sizeof tv);
1597 break;
1598 case SO_LABEL:
1599#ifdef MAC
1600 error = sooptcopyin(sopt, &extmac, sizeof(extmac),
1601 sizeof(extmac));
1602 if (error)
1603 return (error);
1604 error = mac_getsockopt_label(sopt->sopt_td->td_ucred,
1605 so, &extmac);
1606 if (error)
1607 return (error);
1608 error = sooptcopyout(sopt, &extmac, sizeof extmac);
1609#else
1610 error = EOPNOTSUPP;
1611#endif
1612 break;
1613 case SO_PEERLABEL:
1614#ifdef MAC
1615 error = sooptcopyin(sopt, &extmac, sizeof(extmac),
1616 sizeof(extmac));
1617 if (error)
1618 return (error);
1619 error = mac_getsockopt_peerlabel(
1620 sopt->sopt_td->td_ucred, so, &extmac);
1621 if (error)
1622 return (error);
1623 error = sooptcopyout(sopt, &extmac, sizeof extmac);
1624#else
1625 error = EOPNOTSUPP;
1626#endif
1627 break;
1628 default:
1629 error = ENOPROTOOPT;
1630 break;
1631 }
1632 return (error);
1633 }
1634}
1635
1636/* XXX; prepare mbuf for (__FreeBSD__ < 3) routines. */
1637int
1638soopt_getm(struct sockopt *sopt, struct mbuf **mp)
1639{
1640 struct mbuf *m, *m_prev;
1641 int sopt_size = sopt->sopt_valsize;
1642
1643 MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA);
1644 if (m == NULL)
1645 return ENOBUFS;
1646 if (sopt_size > MLEN) {
1647 MCLGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT);
1648 if ((m->m_flags & M_EXT) == 0) {
1649 m_free(m);
1650 return ENOBUFS;
1651 }
1652 m->m_len = min(MCLBYTES, sopt_size);
1653 } else {
1654 m->m_len = min(MLEN, sopt_size);
1655 }
1656 sopt_size -= m->m_len;
1657 *mp = m;
1658 m_prev = m;
1659
1660 while (sopt_size) {
1661 MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA);
1662 if (m == NULL) {
1663 m_freem(*mp);
1664 return ENOBUFS;
1665 }
1666 if (sopt_size > MLEN) {
1667 MCLGET(m, sopt->sopt_td != NULL ? M_TRYWAIT :
1668 M_DONTWAIT);
1669 if ((m->m_flags & M_EXT) == 0) {
1670 m_freem(m);
1671 m_freem(*mp);
1672 return ENOBUFS;
1673 }
1674 m->m_len = min(MCLBYTES, sopt_size);
1675 } else {
1676 m->m_len = min(MLEN, sopt_size);
1677 }
1678 sopt_size -= m->m_len;
1679 m_prev->m_next = m;
1680 m_prev = m;
1681 }
1682 return 0;
1683}
1684
1685/* XXX; copyin sopt data into mbuf chain for (__FreeBSD__ < 3) routines. */
1686int
1687soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
1688{
1689 struct mbuf *m0 = m;
1690
1691 if (sopt->sopt_val == NULL)
1692 return 0;
1693 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
1694 if (sopt->sopt_td != NULL) {
1695 int error;
1696
1697 error = copyin(sopt->sopt_val, mtod(m, char *),
1698 m->m_len);
1699 if (error != 0) {
1700 m_freem(m0);
1701 return(error);
1702 }
1703 } else
1704 bcopy(sopt->sopt_val, mtod(m, char *), m->m_len);
1705 sopt->sopt_valsize -= m->m_len;
1706 (caddr_t)sopt->sopt_val += m->m_len;
1707 m = m->m_next;
1708 }
1709 if (m != NULL) /* should be allocated enoughly at ip6_sooptmcopyin() */
1710 panic("ip6_sooptmcopyin");
1711 return 0;
1712}
1713
1714/* XXX; copyout mbuf chain data into soopt for (__FreeBSD__ < 3) routines. */
1715int
1716soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
1717{
1718 struct mbuf *m0 = m;
1719 size_t valsize = 0;
1720
1721 if (sopt->sopt_val == NULL)
1722 return 0;
1723 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
1724 if (sopt->sopt_td != NULL) {
1725 int error;
1726
1727 error = copyout(mtod(m, char *), sopt->sopt_val,
1728 m->m_len);
1729 if (error != 0) {
1730 m_freem(m0);
1731 return(error);
1732 }
1733 } else
1734 bcopy(mtod(m, char *), sopt->sopt_val, m->m_len);
1735 sopt->sopt_valsize -= m->m_len;
1736 (caddr_t)sopt->sopt_val += m->m_len;
1737 valsize += m->m_len;
1738 m = m->m_next;
1739 }
1740 if (m != NULL) {
1741 /* enough soopt buffer should be given from user-land */
1742 m_freem(m0);
1743 return(EINVAL);
1744 }
1745 sopt->sopt_valsize = valsize;
1746 return 0;
1747}
1748
1749void
1750sohasoutofband(so)
1751 struct socket *so;
1752{
1753 if (so->so_sigio != NULL)
1754 pgsigio(&so->so_sigio, SIGURG, 0);
1755 selwakeuppri(&so->so_rcv.sb_sel, PSOCK);
1756}
1757
1758int
1759sopoll(struct socket *so, int events, struct ucred *active_cred,
1760 struct thread *td)
1761{
1762 int revents = 0;
1763 int s = splnet();
1764
1765 if (events & (POLLIN | POLLRDNORM))
1766 if (soreadable(so))
1767 revents |= events & (POLLIN | POLLRDNORM);
1768
1769 if (events & POLLINIGNEOF)
1770 if (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat ||
1771 !TAILQ_EMPTY(&so->so_comp) || so->so_error)
1772 revents |= POLLINIGNEOF;
1773
1774 if (events & (POLLOUT | POLLWRNORM))
1775 if (sowriteable(so))
1776 revents |= events & (POLLOUT | POLLWRNORM);
1777
1778 if (events & (POLLPRI | POLLRDBAND))
1779 if (so->so_oobmark || (so->so_state & SS_RCVATMARK))
1780 revents |= events & (POLLPRI | POLLRDBAND);
1781
1782 if (revents == 0) {
1783 if (events &
1784 (POLLIN | POLLINIGNEOF | POLLPRI | POLLRDNORM |
1785 POLLRDBAND)) {
1786 selrecord(td, &so->so_rcv.sb_sel);
1787 so->so_rcv.sb_flags |= SB_SEL;
1788 }
1789
1790 if (events & (POLLOUT | POLLWRNORM)) {
1791 selrecord(td, &so->so_snd.sb_sel);
1792 so->so_snd.sb_flags |= SB_SEL;
1793 }
1794 }
1795
1796 splx(s);
1797 return (revents);
1798}
1799
1800int
1801soo_kqfilter(struct file *fp, struct knote *kn)
1802{
1803 struct socket *so = kn->kn_fp->f_data;
1804 struct sockbuf *sb;
1805 int s;
1806
1807 switch (kn->kn_filter) {
1808 case EVFILT_READ:
1809 if (so->so_options & SO_ACCEPTCONN)
1810 kn->kn_fop = &solisten_filtops;
1811 else
1812 kn->kn_fop = &soread_filtops;
1813 sb = &so->so_rcv;
1814 break;
1815 case EVFILT_WRITE:
1816 kn->kn_fop = &sowrite_filtops;
1817 sb = &so->so_snd;
1818 break;
1819 default:
1820 return (1);
1821 }
1822
1823 s = splnet();
1824 SLIST_INSERT_HEAD(&sb->sb_sel.si_note, kn, kn_selnext);
1825 sb->sb_flags |= SB_KNOTE;
1826 splx(s);
1827 return (0);
1828}
1829
1830static void
1831filt_sordetach(struct knote *kn)
1832{
1833 struct socket *so = kn->kn_fp->f_data;
1834 int s = splnet();
1835
1836 SLIST_REMOVE(&so->so_rcv.sb_sel.si_note, kn, knote, kn_selnext);
1837 if (SLIST_EMPTY(&so->so_rcv.sb_sel.si_note))
1838 so->so_rcv.sb_flags &= ~SB_KNOTE;
1839 splx(s);
1840}
1841
1842/*ARGSUSED*/
1843static int
1844filt_soread(struct knote *kn, long hint)
1845{
1846 struct socket *so = kn->kn_fp->f_data;
1847 int result;
1848
1849 kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
1850 if (so->so_state & SS_CANTRCVMORE) {
1851 kn->kn_flags |= EV_EOF;
1852 kn->kn_fflags = so->so_error;
1853 result = 1;
1854 } else if (so->so_error) /* temporary udp error */
1855 result = 1;
1856 else if (kn->kn_sfflags & NOTE_LOWAT)
1857 result = (kn->kn_data >= kn->kn_sdata);
1858 else
1859 result = (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat);
1860 return (result);
1861}
1862
1863static void
1864filt_sowdetach(struct knote *kn)
1865{
1866 struct socket *so = kn->kn_fp->f_data;
1867 int s = splnet();
1868
1869 SLIST_REMOVE(&so->so_snd.sb_sel.si_note, kn, knote, kn_selnext);
1870 if (SLIST_EMPTY(&so->so_snd.sb_sel.si_note))
1871 so->so_snd.sb_flags &= ~SB_KNOTE;
1872 splx(s);
1873}
1874
1875/*ARGSUSED*/
1876static int
1877filt_sowrite(struct knote *kn, long hint)
1878{
1879 struct socket *so = kn->kn_fp->f_data;
1880 int result;
1881
1882 kn->kn_data = sbspace(&so->so_snd);
1883 if (so->so_state & SS_CANTSENDMORE) {
1884 kn->kn_flags |= EV_EOF;
1885 kn->kn_fflags = so->so_error;
1886 result = 1;
1887 } else if (so->so_error) /* temporary udp error */
1888 result = 1;
1889 else if (((so->so_state & SS_ISCONNECTED) == 0) &&
1890 (so->so_proto->pr_flags & PR_CONNREQUIRED))
1891 result = 0;
1892 else if (kn->kn_sfflags & NOTE_LOWAT)
1893 result = (kn->kn_data >= kn->kn_sdata);
1894 else
1895 result = (kn->kn_data >= so->so_snd.sb_lowat);
1896 return (result);
1897}
1898
1899/*ARGSUSED*/
1900static int
1901filt_solisten(struct knote *kn, long hint)
1902{
1903 struct socket *so = kn->kn_fp->f_data;
1904
1905 kn->kn_data = so->so_qlen;
1906 return (! TAILQ_EMPTY(&so->so_comp));
1907}
1908
1909int
1910socheckuid(struct socket *so, uid_t uid)
1911{
1912
1913 if (so == NULL)
1914 return (EPERM);
1915 if (so->so_cred->cr_uid == uid)
1916 return (0);
1917 return (EPERM);
1918}
697 space -= len;
698#ifdef ZERO_COPY_SOCKETS
699 if (cow_send)
700 error = 0;
701 else
702#endif /* ZERO_COPY_SOCKETS */
703 error = uiomove(mtod(m, void *), (int)len, uio);
704 resid = uio->uio_resid;
705 m->m_len = len;
706 *mp = m;
707 top->m_pkthdr.len += len;
708 if (error)
709 goto release;
710 mp = &m->m_next;
711 if (resid <= 0) {
712 if (flags & MSG_EOR)
713 top->m_flags |= M_EOR;
714 break;
715 }
716 } while (space > 0 && atomic);
717 if (dontroute)
718 so->so_options |= SO_DONTROUTE;
719 s = splnet(); /* XXX */
720 /*
721 * XXX all the SS_CANTSENDMORE checks previously
722 * done could be out of date. We could have received
723 * a reset packet in an interrupt or maybe we slept
724 * while doing page faults in uiomove() etc. We could
725 * probably recheck again inside the splnet() protection
726 * here, but there are probably other places that this
727 * also happens. We must rethink this.
728 */
729 error = (*so->so_proto->pr_usrreqs->pru_send)(so,
730 (flags & MSG_OOB) ? PRUS_OOB :
731 /*
732 * If the user set MSG_EOF, the protocol
733 * understands this flag and nothing left to
734 * send then use PRU_SEND_EOF instead of PRU_SEND.
735 */
736 ((flags & MSG_EOF) &&
737 (so->so_proto->pr_flags & PR_IMPLOPCL) &&
738 (resid <= 0)) ?
739 PRUS_EOF :
740 /* If there is more to send set PRUS_MORETOCOME */
741 (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
742 top, addr, control, td);
743 splx(s);
744 if (dontroute)
745 so->so_options &= ~SO_DONTROUTE;
746 clen = 0;
747 control = NULL;
748 top = NULL;
749 mp = &top;
750 if (error)
751 goto release;
752 } while (resid && space > 0);
753 } while (resid);
754
755release:
756 sbunlock(&so->so_snd);
757out:
758 if (top != NULL)
759 m_freem(top);
760 if (control != NULL)
761 m_freem(control);
762 return (error);
763}
764
765/*
766 * Implement receive operations on a socket.
767 * We depend on the way that records are added to the sockbuf
768 * by sbappend*. In particular, each record (mbufs linked through m_next)
769 * must begin with an address if the protocol so specifies,
770 * followed by an optional mbuf or mbufs containing ancillary data,
771 * and then zero or more mbufs of data.
772 * In order to avoid blocking network interrupts for the entire time here,
773 * we splx() while doing the actual copy to user space.
774 * Although the sockbuf is locked, new data may still be appended,
775 * and thus we must maintain consistency of the sockbuf during that time.
776 *
777 * The caller may receive the data as a single mbuf chain by supplying
778 * an mbuf **mp0 for use in returning the chain. The uio is then used
779 * only for the count in uio_resid.
780 */
/*
 * Parameters, as used by the code below:
 *   so       - socket to receive from.
 *   psa      - if non-NULL, cleared on entry; for PR_ADDR protocols it
 *              receives a sodupsockaddr() copy of the record's address.
 *   uio      - destination of the data; uio_resid bounds how much is taken.
 *   mp0      - if non-NULL, data mbufs are handed back through it instead
 *              of being copied out through uio.
 *   controlp - if non-NULL, cleared on entry and receives control mbufs.
 *   flagsp   - if non-NULL, supplies the MSG_* input flags (MSG_EOR is
 *              masked off) and has the resulting flags OR'ed in when the
 *              normal completion path is reached.
 * Returns 0 or an errno value.
 */
781int
782soreceive(so, psa, uio, mp0, controlp, flagsp)
 783 struct socket *so;
 784 struct sockaddr **psa;
 785 struct uio *uio;
 786 struct mbuf **mp0;
 787 struct mbuf **controlp;
 788 int *flagsp;
789{
 790 struct mbuf *m, **mp;
 791 int flags, len, error, s, offset;
 792 struct protosw *pr = so->so_proto;
 793 struct mbuf *nextrecord;
 794 int moff, type = 0;
 795 int orig_resid = uio->uio_resid;
796
 797 mp = mp0;
 798 if (psa != NULL)
 799 *psa = 0;
 800 if (controlp != NULL)
 801 *controlp = 0;
 802 if (flagsp != NULL)
 803 flags = *flagsp &~ MSG_EOR;
 804 else
 805 flags = 0;
 /*
  * Out-of-band request: fetch the data with pru_rcvoob into a fresh
  * mbuf and copy it out; the receive sockbuf is not touched here.
  */
 806 if (flags & MSG_OOB) {
 807 m = m_get(M_TRYWAIT, MT_DATA);
 808 if (m == NULL)
 809 return (ENOBUFS);
 810 error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
 811 if (error)
 812 goto bad;
 813 do {
814#ifdef ZERO_COPY_SOCKETS
 815 if (so_zero_copy_receive) {
 816 vm_page_t pg;
 817 int disposable;
818
 819 if ((m->m_flags & M_EXT)
 820 && (m->m_ext.ext_type == EXT_DISPOSABLE))
 821 disposable = 1;
 822 else
 823 disposable = 0;
824
 825 pg = PHYS_TO_VM_PAGE(vtophys(mtod(m, caddr_t)));
 826 if (uio->uio_offset == -1)
 827 uio->uio_offset =IDX_TO_OFF(pg->pindex);
828
 829 error = uiomoveco(mtod(m, void *),
 830 min(uio->uio_resid, m->m_len),
 831 uio, pg->object,
 832 disposable);
 833 } else
834#endif /* ZERO_COPY_SOCKETS */
 835 error = uiomove(mtod(m, void *),
 836 (int) min(uio->uio_resid, m->m_len), uio);
 837 m = m_free(m);
 838 } while (uio->uio_resid && error == 0 && m);
839bad:
 840 if (m != NULL)
 841 m_freem(m);
 842 return (error);
 843 }
 844 if (mp != NULL)
 845 *mp = NULL;
 846 if (so->so_state & SS_ISCONFIRMING && uio->uio_resid)
 847 (*pr->pr_usrreqs->pru_rcvd)(so, 0);
848
 /*
  * Lock the receive buffer and raise to splnet(); control returns here
  * after every sleep and whenever a pass transferred nothing.
  */
849restart:
 850 error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
 851 if (error)
 852 return (error);
 853 s = splnet();
854
 855 m = so->so_rcv.sb_mb;
 856 /*
 857 * If we have less data than requested, block awaiting more
 858 * (subject to any timeout) if:
 859 * 1. the current count is less than the low water mark, or
 860 * 2. MSG_WAITALL is set, and it is possible to do the entire
 861 * receive operation at once if we block (resid <= hiwat).
 862 * 3. MSG_DONTWAIT is not set
 863 * If MSG_WAITALL is set but resid is larger than the receive buffer,
 864 * we have to do the receive in sections, and thus risk returning
 865 * a short count if a timeout or signal occurs after we start.
 866 */
 867 if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
 868 so->so_rcv.sb_cc < uio->uio_resid) &&
 869 (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
 870 ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
 871 m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
 872 KASSERT(m != NULL || !so->so_rcv.sb_cc,
 873 ("receive: m == %p so->so_rcv.sb_cc == %u",
 874 m, so->so_rcv.sb_cc));
 /* A pending error is reported only once queued data is gone. */
 875 if (so->so_error) {
 876 if (m != NULL)
 877 goto dontblock;
 878 error = so->so_error;
 879 if ((flags & MSG_PEEK) == 0)
 880 so->so_error = 0;
 881 goto release;
 882 }
 883 if (so->so_state & SS_CANTRCVMORE) {
 884 if (m)
 885 goto dontblock;
 886 else
 887 goto release;
 888 }
 /* OOB data or an end-of-record mark is delivered without waiting. */
 889 for (; m != NULL; m = m->m_next)
 890 if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
 891 m = so->so_rcv.sb_mb;
 892 goto dontblock;
 893 }
 894 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
 895 (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
 896 error = ENOTCONN;
 897 goto release;
 898 }
 899 if (uio->uio_resid == 0)
 900 goto release;
 901 if ((so->so_state & SS_NBIO) || (flags & MSG_DONTWAIT)) {
 902 error = EWOULDBLOCK;
 903 goto release;
 904 }
 905 SBLASTRECORDCHK(&so->so_rcv);
 906 SBLASTMBUFCHK(&so->so_rcv);
 /* Drop the sockbuf lock, sleep in sbwait(), then start over. */
 907 sbunlock(&so->so_rcv);
 908 error = sbwait(&so->so_rcv);
 909 splx(s);
 910 if (error)
 911 return (error);
 912 goto restart;
 913 }
 /*
  * Something is queued; m is the first mbuf of the current record and
  * nextrecord remembers the record that follows it.
  */
914dontblock:
 915 if (uio->uio_td)
 916 uio->uio_td->td_proc->p_stats->p_ru.ru_msgrcv++;
 917 SBLASTRECORDCHK(&so->so_rcv);
 918 SBLASTMBUFCHK(&so->so_rcv);
 919 nextrecord = m->m_nextpkt;
 /*
  * Address mbuf: optionally duplicate it for the caller, then step
  * past it (MSG_PEEK) or remove it from the sockbuf.
  */
 920 if (pr->pr_flags & PR_ADDR) {
 921 KASSERT(m->m_type == MT_SONAME,
 922 ("m->m_type == %d", m->m_type));
 923 orig_resid = 0;
 924 if (psa != NULL)
 925 *psa = sodupsockaddr(mtod(m, struct sockaddr *),
 926 mp0 == NULL ? M_WAITOK : M_NOWAIT);
 927 if (flags & MSG_PEEK) {
 928 m = m->m_next;
 929 } else {
 930 sbfree(&so->so_rcv, m);
 931 so->so_rcv.sb_mb = m_free(m);
 932 m = so->so_rcv.sb_mb;
 933 }
 934 }
 /*
  * Control mbufs: copied for the caller under MSG_PEEK; otherwise each
  * one is unlinked and passed to dom_externalize, handed to the caller,
  * or freed.
  */
 935 while (m != NULL && m->m_type == MT_CONTROL && error == 0) {
 936 if (flags & MSG_PEEK) {
 937 if (controlp != NULL)
 938 *controlp = m_copy(m, 0, m->m_len);
 939 m = m->m_next;
 940 } else {
 941 sbfree(&so->so_rcv, m);
 942 so->so_rcv.sb_mb = m->m_next;
 943 m->m_next = NULL;
 944 if (pr->pr_domain->dom_externalize)
 945 error =
 946 (*pr->pr_domain->dom_externalize)(m, controlp);
 947 else if (controlp != NULL)
 948 *controlp = m;
 949 else
 950 m_freem(m);
 951 m = so->so_rcv.sb_mb;
 952 }
 953 if (controlp != NULL) {
 954 orig_resid = 0;
 955 while (*controlp != NULL)
 956 controlp = &(*controlp)->m_next;
 957 }
 958 }
 /*
  * Re-attach the record linkage to the new first mbuf, or advance
  * sb_mb to the next record when this one has been used up.
  */
 959 if (m != NULL) {
 960 if ((flags & MSG_PEEK) == 0) {
 961 m->m_nextpkt = nextrecord;
 962 /*
 963 * If nextrecord == NULL (this is a single chain),
 964 * then sb_lastrecord may not be valid here if m
 965 * was changed earlier.
 966 */
 967 if (nextrecord == NULL) {
 968 KASSERT(so->so_rcv.sb_mb == m,
 969 ("receive tailq 1"));
 970 so->so_rcv.sb_lastrecord = m;
 971 }
 972 }
 973 type = m->m_type;
 974 if (type == MT_OOBDATA)
 975 flags |= MSG_OOB;
 976 } else {
 977 if ((flags & MSG_PEEK) == 0) {
 978 KASSERT(so->so_rcv.sb_mb == m,("receive tailq 2"));
 979 so->so_rcv.sb_mb = nextrecord;
 980 SB_EMPTY_FIXUP(&so->so_rcv);
 981 }
 982 }
 983 SBLASTRECORDCHK(&so->so_rcv);
 984 SBLASTMBUFCHK(&so->so_rcv);
985
 /*
  * Main data loop, one pass per (partial) mbuf.  moff is the offset
  * into the current mbuf and offset the running total; both only
  * advance under MSG_PEEK, where nothing is removed from the sockbuf.
  */
 986 moff = 0;
 987 offset = 0;
 988 while (m != NULL && uio->uio_resid > 0 && error == 0) {
 /* Do not mix out-of-band and normal data in one call. */
 989 if (m->m_type == MT_OOBDATA) {
 990 if (type != MT_OOBDATA)
 991 break;
 992 } else if (type == MT_OOBDATA)
 993 break;
 994 else
 995 KASSERT(m->m_type == MT_DATA || m->m_type == MT_HEADER,
 996 ("m->m_type == %d", m->m_type));
 997 so->so_state &= ~SS_RCVATMARK;
 /* Clamp len to the request, the OOB mark, and this mbuf. */
 998 len = uio->uio_resid;
 999 if (so->so_oobmark && len > so->so_oobmark - offset)
 1000 len = so->so_oobmark - offset;
 1001 if (len > m->m_len - moff)
 1002 len = m->m_len - moff;
 1003 /*
 1004 * If mp is set, just pass back the mbufs.
 1005 * Otherwise copy them out via the uio, then free.
 1006 * Sockbuf must be consistent here (points to current mbuf,
 1007 * it points to next record) when we drop priority;
 1008 * we must note any additions to the sockbuf when we
 1009 * block interrupts again.
 1010 */
 1011 if (mp == NULL) {
 1012 SBLASTRECORDCHK(&so->so_rcv);
 1013 SBLASTMBUFCHK(&so->so_rcv);
 1014 splx(s);
1015#ifdef ZERO_COPY_SOCKETS
 1016 if (so_zero_copy_receive) {
 1017 vm_page_t pg;
 1018 int disposable;
1019
 1020 if ((m->m_flags & M_EXT)
 1021 && (m->m_ext.ext_type == EXT_DISPOSABLE))
 1022 disposable = 1;
 1023 else
 1024 disposable = 0;
1025
 1026 pg = PHYS_TO_VM_PAGE(vtophys(mtod(m, caddr_t) +
 1027 moff));
1028
 1029 if (uio->uio_offset == -1)
 1030 uio->uio_offset =IDX_TO_OFF(pg->pindex);
1031
 1032 error = uiomoveco(mtod(m, char *) + moff,
 1033 (int)len, uio,pg->object,
 1034 disposable);
 1035 } else
1036#endif /* ZERO_COPY_SOCKETS */
 1037 error = uiomove(mtod(m, char *) + moff, (int)len, uio);
 1038 s = splnet();
 1039 if (error)
 1040 goto release;
 1041 } else
 1042 uio->uio_resid -= len;
 /* Rest of this mbuf consumed: move on to the next one. */
 1043 if (len == m->m_len - moff) {
 1044 if (m->m_flags & M_EOR)
 1045 flags |= MSG_EOR;
 1046 if (flags & MSG_PEEK) {
 1047 m = m->m_next;
 1048 moff = 0;
 1049 } else {
 1050 nextrecord = m->m_nextpkt;
 1051 sbfree(&so->so_rcv, m);
 1052 if (mp != NULL) {
 1053 *mp = m;
 1054 mp = &m->m_next;
 1055 so->so_rcv.sb_mb = m = m->m_next;
 1056 *mp = NULL;
 1057 } else {
 1058 so->so_rcv.sb_mb = m_free(m);
 1059 m = so->so_rcv.sb_mb;
 1060 }
 1061 if (m != NULL) {
 1062 m->m_nextpkt = nextrecord;
 1063 if (nextrecord == NULL)
 1064 so->so_rcv.sb_lastrecord = m;
 1065 } else {
 1066 so->so_rcv.sb_mb = nextrecord;
 1067 SB_EMPTY_FIXUP(&so->so_rcv);
 1068 }
 1069 SBLASTRECORDCHK(&so->so_rcv);
 1070 SBLASTMBUFCHK(&so->so_rcv);
 1071 }
 1072 } else {
 /* Partial mbuf: remember the offset (peek) or trim the front. */
 1073 if (flags & MSG_PEEK)
 1074 moff += len;
 1075 else {
 1076 if (mp != NULL)
 1077 *mp = m_copym(m, 0, len, M_TRYWAIT);
 1078 m->m_data += len;
 1079 m->m_len -= len;
 1080 so->so_rcv.sb_cc -= len;
 1081 }
 1082 }
 /* Track progress toward the OOB mark and stop once it is reached. */
 1083 if (so->so_oobmark) {
 1084 if ((flags & MSG_PEEK) == 0) {
 1085 so->so_oobmark -= len;
 1086 if (so->so_oobmark == 0) {
 1087 so->so_state |= SS_RCVATMARK;
 1088 break;
 1089 }
 1090 } else {
 1091 offset += len;
 1092 if (offset == so->so_oobmark)
 1093 break;
 1094 }
 1095 }
 1096 if (flags & MSG_EOR)
 1097 break;
 1098 /*
 1099 * If the MSG_WAITALL flag is set (for non-atomic socket),
 1100 * we must not quit until "uio->uio_resid == 0" or an error
 1101 * termination. If a signal/timeout occurs, return
 1102 * with a short count but without error.
 1103 * Keep sockbuf locked against other readers.
 1104 */
 1105 while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
 1106 !sosendallatonce(so) && nextrecord == NULL) {
 1107 if (so->so_error || so->so_state & SS_CANTRCVMORE)
 1108 break;
 1109 /*
 1110 * Notify the protocol that some data has been
 1111 * drained before blocking.
 1112 */
 1113 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb != NULL)
 1114 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
 1115 SBLASTRECORDCHK(&so->so_rcv);
 1116 SBLASTMBUFCHK(&so->so_rcv);
 1117 error = sbwait(&so->so_rcv);
 1118 if (error) {
 1119 sbunlock(&so->so_rcv);
 1120 splx(s);
 1121 return (0);
 1122 }
 1123 m = so->so_rcv.sb_mb;
 1124 if (m != NULL)
 1125 nextrecord = m->m_nextpkt;
 1126 }
 1127 }
1128
 /*
  * Atomic protocol with data left in the record: flag truncation and,
  * unless peeking, drop the remainder of the record.
  */
 1129 if (m != NULL && pr->pr_flags & PR_ATOMIC) {
 1130 flags |= MSG_TRUNC;
 1131 if ((flags & MSG_PEEK) == 0)
 1132 (void) sbdroprecord(&so->so_rcv);
 1133 }
 1134 if ((flags & MSG_PEEK) == 0) {
 1135 if (m == NULL) {
 1136 /*
 1137 * First part is an inline SB_EMPTY_FIXUP(). Second
 1138 * part makes sure sb_lastrecord is up-to-date if
 1139 * there is still data in the socket buffer.
 1140 */
 1141 so->so_rcv.sb_mb = nextrecord;
 1142 if (so->so_rcv.sb_mb == NULL) {
 1143 so->so_rcv.sb_mbtail = NULL;
 1144 so->so_rcv.sb_lastrecord = NULL;
 1145 } else if (nextrecord->m_nextpkt == NULL)
 1146 so->so_rcv.sb_lastrecord = nextrecord;
 1147 }
 1148 SBLASTRECORDCHK(&so->so_rcv);
 1149 SBLASTMBUFCHK(&so->so_rcv);
 1150 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
 1151 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
 1152 }
 /*
  * Nothing was transferred although something was requested, with no
  * end-of-record and no EOF to explain it: unlock and try again.
  */
 1153 if (orig_resid == uio->uio_resid && orig_resid &&
 1154 (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
 1155 sbunlock(&so->so_rcv);
 1156 splx(s);
 1157 goto restart;
 1158 }
1159
 1160 if (flagsp != NULL)
 1161 *flagsp |= flags;
 /* Common exit: release the sockbuf lock and restore the priority level. */
1162release:
 1163 sbunlock(&so->so_rcv);
 1164 splx(s);
 1165 return (error);
1166}
1167
1168int
1169soshutdown(so, how)
1170 struct socket *so;
1171 int how;
1172{
1173 struct protosw *pr = so->so_proto;
1174
1175 if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
1176 return (EINVAL);
1177
1178 if (how != SHUT_WR)
1179 sorflush(so);
1180 if (how != SHUT_RD)
1181 return ((*pr->pr_usrreqs->pru_shutdown)(so));
1182 return (0);
1183}
1184
1185void
1186sorflush(so)
1187 struct socket *so;
1188{
1189 struct sockbuf *sb = &so->so_rcv;
1190 struct protosw *pr = so->so_proto;
1191 int s;
1192 struct sockbuf asb;
1193
1194 sb->sb_flags |= SB_NOINTR;
1195 (void) sblock(sb, M_WAITOK);
1196 s = splimp();
1197 socantrcvmore(so);
1198 sbunlock(sb);
1199 asb = *sb;
1200 /*
1201 * Invalidate/clear most of the sockbuf structure, but keep
1202 * its selinfo structure valid.
1203 */
1204 bzero(&sb->sb_startzero,
1205 sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
1206 splx(s);
1207
1208 if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
1209 (*pr->pr_domain->dom_dispose)(asb.sb_mb);
1210 sbrelease(&asb, so);
1211}
1212
#ifdef INET
/*
 * Install or remove the accept filter of a listening socket.
 *
 * so   - socket; must have SO_ACCEPTCONN set, otherwise EINVAL.
 * sopt - NULL to remove the current filter (always returns 0), otherwise
 *        the option carrying a struct accept_filter_arg to install.
 *
 * Installing fails with EINVAL if a filter is already attached or the
 * filter's accf_create hook returns NULL, with ENOENT if
 * accept_filt_get() does not know the requested name, and with whatever
 * sooptcopyin() reports if the argument cannot be fetched.  On success
 * so->so_accf is set and SO_ACCEPTFILTER is turned on in so_options.
 */
static int
do_setopt_accept_filter(struct socket *so, struct sockopt *sopt)
{
	struct accept_filter_arg *afap = NULL;
	struct accept_filter *afp;
	struct so_accf *af = so->so_accf;
	int error = 0;

	/* do not set/remove accept filters on non listen sockets */
	if ((so->so_options & SO_ACCEPTCONN) == 0) {
		error = EINVAL;
		goto out;
	}

	/* removing the filter */
	if (sopt == NULL) {
		if (af != NULL) {
			if (af->so_accept_filter != NULL &&
			    af->so_accept_filter->accf_destroy != NULL) {
				af->so_accept_filter->accf_destroy(so);
			}
			if (af->so_accept_filter_str != NULL) {
				FREE(af->so_accept_filter_str, M_ACCF);
			}
			FREE(af, M_ACCF);
			so->so_accf = NULL;
		}
		so->so_options &= ~SO_ACCEPTFILTER;
		return (0);
	}
	/* adding a filter */
	/* must remove previous filter first */
	if (af != NULL) {
		error = EINVAL;
		goto out;
	}
	/* don't put large objects on the kernel stack */
	MALLOC(afap, struct accept_filter_arg *, sizeof(*afap), M_TEMP, M_WAITOK);
	error = sooptcopyin(sopt, afap, sizeof *afap, sizeof *afap);
	/* Only look at the buffer once the copy is known to have worked. */
	if (error)
		goto out;
	/* Force termination; the strings came from the caller. */
	afap->af_name[sizeof(afap->af_name)-1] = '\0';
	afap->af_arg[sizeof(afap->af_arg)-1] = '\0';
	afp = accept_filt_get(afap->af_name);
	if (afp == NULL) {
		error = ENOENT;
		goto out;
	}
	MALLOC(af, struct so_accf *, sizeof(*af), M_ACCF, M_WAITOK | M_ZERO);
	if (afp->accf_create != NULL) {
		if (afap->af_name[0] != '\0') {
			int len = strlen(afap->af_name) + 1;

			MALLOC(af->so_accept_filter_str, char *, len, M_ACCF, M_WAITOK);
			strcpy(af->so_accept_filter_str, afap->af_name);
		}
		af->so_accept_filter_arg = afp->accf_create(so, afap->af_arg);
		if (af->so_accept_filter_arg == NULL) {
			/* Same NULL guard as the removal path above. */
			if (af->so_accept_filter_str != NULL) {
				FREE(af->so_accept_filter_str, M_ACCF);
			}
			FREE(af, M_ACCF);
			so->so_accf = NULL;
			error = EINVAL;
			goto out;
		}
	}
	af->so_accept_filter = afp;
	so->so_accf = af;
	so->so_options |= SO_ACCEPTFILTER;
out:
	if (afap != NULL)
		FREE(afap, M_TEMP);
	return (error);
}
#endif /* INET */
1290
1291/*
1292 * Perhaps this routine, and sooptcopyout(), below, ought to come in
1293 * an additional variant to handle the case where the option value needs
1294 * to be some kind of integer, but not a specific size.
1295 * In addition to their use here, these functions are also called by the
1296 * protocol-level pr_ctloutput() routines.
1297 */
1298int
1299sooptcopyin(sopt, buf, len, minlen)
1300 struct sockopt *sopt;
1301 void *buf;
1302 size_t len;
1303 size_t minlen;
1304{
1305 size_t valsize;
1306
1307 /*
1308 * If the user gives us more than we wanted, we ignore it,
1309 * but if we don't get the minimum length the caller
1310 * wants, we return EINVAL. On success, sopt->sopt_valsize
1311 * is set to however much we actually retrieved.
1312 */
1313 if ((valsize = sopt->sopt_valsize) < minlen)
1314 return EINVAL;
1315 if (valsize > len)
1316 sopt->sopt_valsize = valsize = len;
1317
1318 if (sopt->sopt_td != NULL)
1319 return (copyin(sopt->sopt_val, buf, valsize));
1320
1321 bcopy(sopt->sopt_val, buf, valsize);
1322 return 0;
1323}
1324
1325int
1326sosetopt(so, sopt)
1327 struct socket *so;
1328 struct sockopt *sopt;
1329{
1330 int error, optval;
1331 struct linger l;
1332 struct timeval tv;
1333 u_long val;
1334#ifdef MAC
1335 struct mac extmac;
1336#endif
1337
1338 error = 0;
1339 if (sopt->sopt_level != SOL_SOCKET) {
1340 if (so->so_proto && so->so_proto->pr_ctloutput)
1341 return ((*so->so_proto->pr_ctloutput)
1342 (so, sopt));
1343 error = ENOPROTOOPT;
1344 } else {
1345 switch (sopt->sopt_name) {
1346#ifdef INET
1347 case SO_ACCEPTFILTER:
1348 error = do_setopt_accept_filter(so, sopt);
1349 if (error)
1350 goto bad;
1351 break;
1352#endif
1353 case SO_LINGER:
1354 error = sooptcopyin(sopt, &l, sizeof l, sizeof l);
1355 if (error)
1356 goto bad;
1357
1358 so->so_linger = l.l_linger;
1359 if (l.l_onoff)
1360 so->so_options |= SO_LINGER;
1361 else
1362 so->so_options &= ~SO_LINGER;
1363 break;
1364
1365 case SO_DEBUG:
1366 case SO_KEEPALIVE:
1367 case SO_DONTROUTE:
1368 case SO_USELOOPBACK:
1369 case SO_BROADCAST:
1370 case SO_REUSEADDR:
1371 case SO_REUSEPORT:
1372 case SO_OOBINLINE:
1373 case SO_TIMESTAMP:
1374 case SO_BINTIME:
1375 case SO_NOSIGPIPE:
1376 error = sooptcopyin(sopt, &optval, sizeof optval,
1377 sizeof optval);
1378 if (error)
1379 goto bad;
1380 if (optval)
1381 so->so_options |= sopt->sopt_name;
1382 else
1383 so->so_options &= ~sopt->sopt_name;
1384 break;
1385
1386 case SO_SNDBUF:
1387 case SO_RCVBUF:
1388 case SO_SNDLOWAT:
1389 case SO_RCVLOWAT:
1390 error = sooptcopyin(sopt, &optval, sizeof optval,
1391 sizeof optval);
1392 if (error)
1393 goto bad;
1394
1395 /*
1396 * Values < 1 make no sense for any of these
1397 * options, so disallow them.
1398 */
1399 if (optval < 1) {
1400 error = EINVAL;
1401 goto bad;
1402 }
1403
1404 switch (sopt->sopt_name) {
1405 case SO_SNDBUF:
1406 case SO_RCVBUF:
1407 if (sbreserve(sopt->sopt_name == SO_SNDBUF ?
1408 &so->so_snd : &so->so_rcv, (u_long)optval,
1409 so, curthread) == 0) {
1410 error = ENOBUFS;
1411 goto bad;
1412 }
1413 break;
1414
1415 /*
1416 * Make sure the low-water is never greater than
1417 * the high-water.
1418 */
1419 case SO_SNDLOWAT:
1420 so->so_snd.sb_lowat =
1421 (optval > so->so_snd.sb_hiwat) ?
1422 so->so_snd.sb_hiwat : optval;
1423 break;
1424 case SO_RCVLOWAT:
1425 so->so_rcv.sb_lowat =
1426 (optval > so->so_rcv.sb_hiwat) ?
1427 so->so_rcv.sb_hiwat : optval;
1428 break;
1429 }
1430 break;
1431
1432 case SO_SNDTIMEO:
1433 case SO_RCVTIMEO:
1434 error = sooptcopyin(sopt, &tv, sizeof tv,
1435 sizeof tv);
1436 if (error)
1437 goto bad;
1438
1439 /* assert(hz > 0); */
1440 if (tv.tv_sec < 0 || tv.tv_sec > SHRT_MAX / hz ||
1441 tv.tv_usec < 0 || tv.tv_usec >= 1000000) {
1442 error = EDOM;
1443 goto bad;
1444 }
1445 /* assert(tick > 0); */
1446 /* assert(ULONG_MAX - SHRT_MAX >= 1000000); */
1447 val = (u_long)(tv.tv_sec * hz) + tv.tv_usec / tick;
1448 if (val > SHRT_MAX) {
1449 error = EDOM;
1450 goto bad;
1451 }
1452 if (val == 0 && tv.tv_usec != 0)
1453 val = 1;
1454
1455 switch (sopt->sopt_name) {
1456 case SO_SNDTIMEO:
1457 so->so_snd.sb_timeo = val;
1458 break;
1459 case SO_RCVTIMEO:
1460 so->so_rcv.sb_timeo = val;
1461 break;
1462 }
1463 break;
1464 case SO_LABEL:
1465#ifdef MAC
1466 error = sooptcopyin(sopt, &extmac, sizeof extmac,
1467 sizeof extmac);
1468 if (error)
1469 goto bad;
1470 error = mac_setsockopt_label(sopt->sopt_td->td_ucred,
1471 so, &extmac);
1472#else
1473 error = EOPNOTSUPP;
1474#endif
1475 break;
1476 default:
1477 error = ENOPROTOOPT;
1478 break;
1479 }
1480 if (error == 0 && so->so_proto != NULL &&
1481 so->so_proto->pr_ctloutput != NULL) {
1482 (void) ((*so->so_proto->pr_ctloutput)
1483 (so, sopt));
1484 }
1485 }
1486bad:
1487 return (error);
1488}
1489
1490/* Helper routine for getsockopt */
1491int
1492sooptcopyout(struct sockopt *sopt, const void *buf, size_t len)
1493{
1494 int error;
1495 size_t valsize;
1496
1497 error = 0;
1498
1499 /*
1500 * Documented get behavior is that we always return a value,
1501 * possibly truncated to fit in the user's buffer.
1502 * Traditional behavior is that we always tell the user
1503 * precisely how much we copied, rather than something useful
1504 * like the total amount we had available for her.
1505 * Note that this interface is not idempotent; the entire answer must
1506 * generated ahead of time.
1507 */
1508 valsize = min(len, sopt->sopt_valsize);
1509 sopt->sopt_valsize = valsize;
1510 if (sopt->sopt_val != NULL) {
1511 if (sopt->sopt_td != NULL)
1512 error = copyout(buf, sopt->sopt_val, valsize);
1513 else
1514 bcopy(buf, sopt->sopt_val, valsize);
1515 }
1516 return error;
1517}
1518
1519int
1520sogetopt(so, sopt)
1521 struct socket *so;
1522 struct sockopt *sopt;
1523{
1524 int error, optval;
1525 struct linger l;
1526 struct timeval tv;
1527#ifdef INET
1528 struct accept_filter_arg *afap;
1529#endif
1530#ifdef MAC
1531 struct mac extmac;
1532#endif
1533
1534 error = 0;
1535 if (sopt->sopt_level != SOL_SOCKET) {
1536 if (so->so_proto && so->so_proto->pr_ctloutput) {
1537 return ((*so->so_proto->pr_ctloutput)
1538 (so, sopt));
1539 } else
1540 return (ENOPROTOOPT);
1541 } else {
1542 switch (sopt->sopt_name) {
1543#ifdef INET
1544 case SO_ACCEPTFILTER:
1545 if ((so->so_options & SO_ACCEPTCONN) == 0)
1546 return (EINVAL);
1547 MALLOC(afap, struct accept_filter_arg *, sizeof(*afap),
1548 M_TEMP, M_WAITOK | M_ZERO);
1549 if ((so->so_options & SO_ACCEPTFILTER) != 0) {
1550 strcpy(afap->af_name, so->so_accf->so_accept_filter->accf_name);
1551 if (so->so_accf->so_accept_filter_str != NULL)
1552 strcpy(afap->af_arg, so->so_accf->so_accept_filter_str);
1553 }
1554 error = sooptcopyout(sopt, afap, sizeof(*afap));
1555 FREE(afap, M_TEMP);
1556 break;
1557#endif
1558
1559 case SO_LINGER:
1560 l.l_onoff = so->so_options & SO_LINGER;
1561 l.l_linger = so->so_linger;
1562 error = sooptcopyout(sopt, &l, sizeof l);
1563 break;
1564
1565 case SO_USELOOPBACK:
1566 case SO_DONTROUTE:
1567 case SO_DEBUG:
1568 case SO_KEEPALIVE:
1569 case SO_REUSEADDR:
1570 case SO_REUSEPORT:
1571 case SO_BROADCAST:
1572 case SO_OOBINLINE:
1573 case SO_TIMESTAMP:
1574 case SO_BINTIME:
1575 case SO_NOSIGPIPE:
1576 optval = so->so_options & sopt->sopt_name;
1577integer:
1578 error = sooptcopyout(sopt, &optval, sizeof optval);
1579 break;
1580
1581 case SO_TYPE:
1582 optval = so->so_type;
1583 goto integer;
1584
1585 case SO_ERROR:
1586 optval = so->so_error;
1587 so->so_error = 0;
1588 goto integer;
1589
1590 case SO_SNDBUF:
1591 optval = so->so_snd.sb_hiwat;
1592 goto integer;
1593
1594 case SO_RCVBUF:
1595 optval = so->so_rcv.sb_hiwat;
1596 goto integer;
1597
1598 case SO_SNDLOWAT:
1599 optval = so->so_snd.sb_lowat;
1600 goto integer;
1601
1602 case SO_RCVLOWAT:
1603 optval = so->so_rcv.sb_lowat;
1604 goto integer;
1605
1606 case SO_SNDTIMEO:
1607 case SO_RCVTIMEO:
1608 optval = (sopt->sopt_name == SO_SNDTIMEO ?
1609 so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
1610
1611 tv.tv_sec = optval / hz;
1612 tv.tv_usec = (optval % hz) * tick;
1613 error = sooptcopyout(sopt, &tv, sizeof tv);
1614 break;
1615 case SO_LABEL:
1616#ifdef MAC
1617 error = sooptcopyin(sopt, &extmac, sizeof(extmac),
1618 sizeof(extmac));
1619 if (error)
1620 return (error);
1621 error = mac_getsockopt_label(sopt->sopt_td->td_ucred,
1622 so, &extmac);
1623 if (error)
1624 return (error);
1625 error = sooptcopyout(sopt, &extmac, sizeof extmac);
1626#else
1627 error = EOPNOTSUPP;
1628#endif
1629 break;
1630 case SO_PEERLABEL:
1631#ifdef MAC
1632 error = sooptcopyin(sopt, &extmac, sizeof(extmac),
1633 sizeof(extmac));
1634 if (error)
1635 return (error);
1636 error = mac_getsockopt_peerlabel(
1637 sopt->sopt_td->td_ucred, so, &extmac);
1638 if (error)
1639 return (error);
1640 error = sooptcopyout(sopt, &extmac, sizeof extmac);
1641#else
1642 error = EOPNOTSUPP;
1643#endif
1644 break;
1645 default:
1646 error = ENOPROTOOPT;
1647 break;
1648 }
1649 return (error);
1650 }
1651}
1652
1653/* XXX; prepare mbuf for (__FreeBSD__ < 3) routines. */
1654int
1655soopt_getm(struct sockopt *sopt, struct mbuf **mp)
1656{
1657 struct mbuf *m, *m_prev;
1658 int sopt_size = sopt->sopt_valsize;
1659
1660 MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA);
1661 if (m == NULL)
1662 return ENOBUFS;
1663 if (sopt_size > MLEN) {
1664 MCLGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT);
1665 if ((m->m_flags & M_EXT) == 0) {
1666 m_free(m);
1667 return ENOBUFS;
1668 }
1669 m->m_len = min(MCLBYTES, sopt_size);
1670 } else {
1671 m->m_len = min(MLEN, sopt_size);
1672 }
1673 sopt_size -= m->m_len;
1674 *mp = m;
1675 m_prev = m;
1676
1677 while (sopt_size) {
1678 MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA);
1679 if (m == NULL) {
1680 m_freem(*mp);
1681 return ENOBUFS;
1682 }
1683 if (sopt_size > MLEN) {
1684 MCLGET(m, sopt->sopt_td != NULL ? M_TRYWAIT :
1685 M_DONTWAIT);
1686 if ((m->m_flags & M_EXT) == 0) {
1687 m_freem(m);
1688 m_freem(*mp);
1689 return ENOBUFS;
1690 }
1691 m->m_len = min(MCLBYTES, sopt_size);
1692 } else {
1693 m->m_len = min(MLEN, sopt_size);
1694 }
1695 sopt_size -= m->m_len;
1696 m_prev->m_next = m;
1697 m_prev = m;
1698 }
1699 return 0;
1700}
1701
1702/* XXX; copyin sopt data into mbuf chain for (__FreeBSD__ < 3) routines. */
1703int
1704soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
1705{
1706 struct mbuf *m0 = m;
1707
1708 if (sopt->sopt_val == NULL)
1709 return 0;
1710 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
1711 if (sopt->sopt_td != NULL) {
1712 int error;
1713
1714 error = copyin(sopt->sopt_val, mtod(m, char *),
1715 m->m_len);
1716 if (error != 0) {
1717 m_freem(m0);
1718 return(error);
1719 }
1720 } else
1721 bcopy(sopt->sopt_val, mtod(m, char *), m->m_len);
1722 sopt->sopt_valsize -= m->m_len;
1723 (caddr_t)sopt->sopt_val += m->m_len;
1724 m = m->m_next;
1725 }
1726 if (m != NULL) /* should be allocated enoughly at ip6_sooptmcopyin() */
1727 panic("ip6_sooptmcopyin");
1728 return 0;
1729}
1730
1731/* XXX; copyout mbuf chain data into soopt for (__FreeBSD__ < 3) routines. */
1732int
1733soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
1734{
1735 struct mbuf *m0 = m;
1736 size_t valsize = 0;
1737
1738 if (sopt->sopt_val == NULL)
1739 return 0;
1740 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
1741 if (sopt->sopt_td != NULL) {
1742 int error;
1743
1744 error = copyout(mtod(m, char *), sopt->sopt_val,
1745 m->m_len);
1746 if (error != 0) {
1747 m_freem(m0);
1748 return(error);
1749 }
1750 } else
1751 bcopy(mtod(m, char *), sopt->sopt_val, m->m_len);
1752 sopt->sopt_valsize -= m->m_len;
1753 (caddr_t)sopt->sopt_val += m->m_len;
1754 valsize += m->m_len;
1755 m = m->m_next;
1756 }
1757 if (m != NULL) {
1758 /* enough soopt buffer should be given from user-land */
1759 m_freem(m0);
1760 return(EINVAL);
1761 }
1762 sopt->sopt_valsize = valsize;
1763 return 0;
1764}
1765
1766void
1767sohasoutofband(so)
1768 struct socket *so;
1769{
1770 if (so->so_sigio != NULL)
1771 pgsigio(&so->so_sigio, SIGURG, 0);
1772 selwakeuppri(&so->so_rcv.sb_sel, PSOCK);
1773}
1774
1775int
1776sopoll(struct socket *so, int events, struct ucred *active_cred,
1777 struct thread *td)
1778{
1779 int revents = 0;
1780 int s = splnet();
1781
1782 if (events & (POLLIN | POLLRDNORM))
1783 if (soreadable(so))
1784 revents |= events & (POLLIN | POLLRDNORM);
1785
1786 if (events & POLLINIGNEOF)
1787 if (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat ||
1788 !TAILQ_EMPTY(&so->so_comp) || so->so_error)
1789 revents |= POLLINIGNEOF;
1790
1791 if (events & (POLLOUT | POLLWRNORM))
1792 if (sowriteable(so))
1793 revents |= events & (POLLOUT | POLLWRNORM);
1794
1795 if (events & (POLLPRI | POLLRDBAND))
1796 if (so->so_oobmark || (so->so_state & SS_RCVATMARK))
1797 revents |= events & (POLLPRI | POLLRDBAND);
1798
1799 if (revents == 0) {
1800 if (events &
1801 (POLLIN | POLLINIGNEOF | POLLPRI | POLLRDNORM |
1802 POLLRDBAND)) {
1803 selrecord(td, &so->so_rcv.sb_sel);
1804 so->so_rcv.sb_flags |= SB_SEL;
1805 }
1806
1807 if (events & (POLLOUT | POLLWRNORM)) {
1808 selrecord(td, &so->so_snd.sb_sel);
1809 so->so_snd.sb_flags |= SB_SEL;
1810 }
1811 }
1812
1813 splx(s);
1814 return (revents);
1815}
1816
1817int
1818soo_kqfilter(struct file *fp, struct knote *kn)
1819{
1820 struct socket *so = kn->kn_fp->f_data;
1821 struct sockbuf *sb;
1822 int s;
1823
1824 switch (kn->kn_filter) {
1825 case EVFILT_READ:
1826 if (so->so_options & SO_ACCEPTCONN)
1827 kn->kn_fop = &solisten_filtops;
1828 else
1829 kn->kn_fop = &soread_filtops;
1830 sb = &so->so_rcv;
1831 break;
1832 case EVFILT_WRITE:
1833 kn->kn_fop = &sowrite_filtops;
1834 sb = &so->so_snd;
1835 break;
1836 default:
1837 return (1);
1838 }
1839
1840 s = splnet();
1841 SLIST_INSERT_HEAD(&sb->sb_sel.si_note, kn, kn_selnext);
1842 sb->sb_flags |= SB_KNOTE;
1843 splx(s);
1844 return (0);
1845}
1846
1847static void
1848filt_sordetach(struct knote *kn)
1849{
1850 struct socket *so = kn->kn_fp->f_data;
1851 int s = splnet();
1852
1853 SLIST_REMOVE(&so->so_rcv.sb_sel.si_note, kn, knote, kn_selnext);
1854 if (SLIST_EMPTY(&so->so_rcv.sb_sel.si_note))
1855 so->so_rcv.sb_flags &= ~SB_KNOTE;
1856 splx(s);
1857}
1858
1859/*ARGSUSED*/
1860static int
1861filt_soread(struct knote *kn, long hint)
1862{
1863 struct socket *so = kn->kn_fp->f_data;
1864 int result;
1865
1866 kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
1867 if (so->so_state & SS_CANTRCVMORE) {
1868 kn->kn_flags |= EV_EOF;
1869 kn->kn_fflags = so->so_error;
1870 result = 1;
1871 } else if (so->so_error) /* temporary udp error */
1872 result = 1;
1873 else if (kn->kn_sfflags & NOTE_LOWAT)
1874 result = (kn->kn_data >= kn->kn_sdata);
1875 else
1876 result = (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat);
1877 return (result);
1878}
1879
1880static void
1881filt_sowdetach(struct knote *kn)
1882{
1883 struct socket *so = kn->kn_fp->f_data;
1884 int s = splnet();
1885
1886 SLIST_REMOVE(&so->so_snd.sb_sel.si_note, kn, knote, kn_selnext);
1887 if (SLIST_EMPTY(&so->so_snd.sb_sel.si_note))
1888 so->so_snd.sb_flags &= ~SB_KNOTE;
1889 splx(s);
1890}
1891
1892/*ARGSUSED*/
1893static int
1894filt_sowrite(struct knote *kn, long hint)
1895{
1896 struct socket *so = kn->kn_fp->f_data;
1897 int result;
1898
1899 kn->kn_data = sbspace(&so->so_snd);
1900 if (so->so_state & SS_CANTSENDMORE) {
1901 kn->kn_flags |= EV_EOF;
1902 kn->kn_fflags = so->so_error;
1903 result = 1;
1904 } else if (so->so_error) /* temporary udp error */
1905 result = 1;
1906 else if (((so->so_state & SS_ISCONNECTED) == 0) &&
1907 (so->so_proto->pr_flags & PR_CONNREQUIRED))
1908 result = 0;
1909 else if (kn->kn_sfflags & NOTE_LOWAT)
1910 result = (kn->kn_data >= kn->kn_sdata);
1911 else
1912 result = (kn->kn_data >= so->so_snd.sb_lowat);
1913 return (result);
1914}
1915
1916/*ARGSUSED*/
1917static int
1918filt_solisten(struct knote *kn, long hint)
1919{
1920 struct socket *so = kn->kn_fp->f_data;
1921
1922 kn->kn_data = so->so_qlen;
1923 return (! TAILQ_EMPTY(&so->so_comp));
1924}
1925
1926int
1927socheckuid(struct socket *so, uid_t uid)
1928{
1929
1930 if (so == NULL)
1931 return (EPERM);
1932 if (so->so_cred->cr_uid == uid)
1933 return (0);
1934 return (EPERM);
1935}