1/*-
2 * Copyright (c) 1982, 1986, 1988, 1990, 1993
3 * The Regents of the University of California. All rights reserved.
4 * Copyright (c) 2004 The FreeBSD Foundation
5 * Copyright (c) 2004-2006 Robert N. M. Watson
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 * 4. Neither the name of the University nor the names of its contributors
16 * may be used to endorse or promote products derived from this software
17 * without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29 * SUCH DAMAGE.
30 *
31 * @(#)uipc_socket.c 8.3 (Berkeley) 4/15/94
32 */
33
34/*
35 * Comments on the socket life cycle:
36 *
37 * soalloc() sets up socket layer state for a socket, called only by
38 * socreate() and sonewconn(). Socket layer private.
39 *
40 * sodealloc() tears down socket layer state for a socket, called only by
41 * sofree() and sonewconn(). Socket layer private.
42 *
43 * pru_attach() associates protocol layer state with an allocated socket;
44 * called only once, may fail, aborting socket allocation. This is called
45 * from socreate() and sonewconn(). Socket layer private.
46 *
47 * pru_detach() disassociates protocol layer state from an attached socket,
48 * and will be called exactly once for sockets in which pru_attach() has
49 * been successfully called. If pru_attach() returned an error,
50 * pru_detach() will not be called. Socket layer private.
51 *
52 * socreate() creates a socket and attaches protocol state. This is a public
53 * interface that may be used by socket layer consumers to create new
54 * sockets.
55 *
56 * sonewconn() creates a socket and attaches protocol state. This is a
57 * public interface that may be used by protocols to create a new socket
58 * when a new connection is received; the new socket will then be made
59 * available for accept() on the listen socket.
60 *
61 * soclose() destroys a socket after possibly waiting for it to disconnect.
62 * This is a public interface that socket consumers should use to close and
63 * release a socket when done with it.
64 *
65 * soabort() destroys a socket without waiting for it to disconnect (used
66 * only for incoming connections that are already partially or fully
67 * connected). This is used internally by the socket layer when clearing
68 * listen socket queues (due to overflow or close on the listen socket), but
69 * is also a public interface protocols may use to abort connections in
70 * their incomplete listen queues should they no longer be required. Sockets
71 * placed in completed connection listen queues should not be aborted.
72 *
73 * sofree() will free a socket and its protocol state if all references on
74 * the socket have been released, and is the interface used to attempt to
75 * free a socket when a reference is removed. This is a socket layer
76 * private interface.
77 *
78 * NOTE: In addition to socreate() and soclose(), which provide a single
79 * socket reference to the consumer to be managed as required, there are two
80 * calls to explicitly manage socket references: soref() and sorele().
81 * Currently, these are generally required only when transitioning a socket
82 * from a listen queue to a file descriptor, in order to prevent garbage
83 * collection of the socket at an untimely moment. For a number of reasons,
84 * these interfaces are not preferred, and should be avoided.
85 *
86 * XXXRW: The behavior of sockets after soclose() but before the last
87 * sorele() is poorly defined. We can probably entirely eliminate them with
88 * a little work, since consumers are managing references anyway.
89 */
90
91#include <sys/cdefs.h>
92__FBSDID("$FreeBSD: head/sys/kern/uipc_socket.c 157366 2006-04-01 15:15:05Z rwatson $");
93
94#include "opt_inet.h"
95#include "opt_mac.h"
96#include "opt_zero.h"
97#include "opt_compat.h"
98
99#include <sys/param.h>
100#include <sys/systm.h>
101#include <sys/fcntl.h>
102#include <sys/limits.h>
103#include <sys/lock.h>
104#include <sys/mac.h>
105#include <sys/malloc.h>
106#include <sys/mbuf.h>
107#include <sys/mutex.h>
108#include <sys/domain.h>
109#include <sys/file.h> /* for struct knote */
110#include <sys/kernel.h>
111#include <sys/event.h>
112#include <sys/poll.h>
113#include <sys/proc.h>
114#include <sys/protosw.h>
115#include <sys/socket.h>
116#include <sys/socketvar.h>
117#include <sys/resourcevar.h>
118#include <sys/signalvar.h>
119#include <sys/sysctl.h>
120#include <sys/uio.h>
121#include <sys/jail.h>
122
123#include <vm/uma.h>
124
125#ifdef COMPAT_IA32
126#include <sys/mount.h>
127#include <compat/freebsd32/freebsd32.h>
128
129extern struct sysentvec ia32_freebsd_sysvec;
130#endif
131
132static int soreceive_rcvoob(struct socket *so, struct uio *uio,
133 int flags);
134
135static void filt_sordetach(struct knote *kn);
136static int filt_soread(struct knote *kn, long hint);
137static void filt_sowdetach(struct knote *kn);
138static int filt_sowrite(struct knote *kn, long hint);
139static int filt_solisten(struct knote *kn, long hint);
140
141static struct filterops solisten_filtops =
142 { 1, NULL, filt_sordetach, filt_solisten };
143static struct filterops soread_filtops =
144 { 1, NULL, filt_sordetach, filt_soread };
145static struct filterops sowrite_filtops =
146 { 1, NULL, filt_sowdetach, filt_sowrite };
147
148uma_zone_t socket_zone;
149so_gen_t so_gencnt; /* generation count for sockets */
150
151MALLOC_DEFINE(M_SONAME, "soname", "socket name");
152MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
153
154SYSCTL_DECL(_kern_ipc);
155
156static int somaxconn = SOMAXCONN;
157static int somaxconn_sysctl(SYSCTL_HANDLER_ARGS);
158/* XXX: we don't have SYSCTL_USHORT */
159SYSCTL_PROC(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLTYPE_UINT | CTLFLAG_RW,
160 0, sizeof(int), somaxconn_sysctl, "I", "Maximum pending socket connection "
161 "queue size");
162static int numopensockets;
163SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD,
164 &numopensockets, 0, "Number of open sockets");
165#ifdef ZERO_COPY_SOCKETS
166/* These aren't static because they're used in other files. */
167int so_zero_copy_send = 1;
168int so_zero_copy_receive = 1;
169SYSCTL_NODE(_kern_ipc, OID_AUTO, zero_copy, CTLFLAG_RD, 0,
170 "Zero copy controls");
171SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, receive, CTLFLAG_RW,
172 &so_zero_copy_receive, 0, "Enable zero copy receive");
173SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, send, CTLFLAG_RW,
174 &so_zero_copy_send, 0, "Enable zero copy send");
175#endif /* ZERO_COPY_SOCKETS */
176
177/*
178 * accept_mtx locks down per-socket fields relating to accept queues. See
179 * socketvar.h for an annotation of the protected fields of struct socket.
180 */
181struct mtx accept_mtx;
182MTX_SYSINIT(accept_mtx, &accept_mtx, "accept", MTX_DEF);
183
184/*
185 * so_global_mtx protects so_gencnt, numopensockets, and the per-socket
186 * so_gencnt field.
187 */
188static struct mtx so_global_mtx;
189MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_global", MTX_DEF);
190
191/*
192 * Socket operation routines.
193 * These routines are called by the routines in
194 * sys_socket.c or from a system process, and
195 * implement the semantics of socket operations by
196 * switching out to the protocol specific routines.
197 */
198
199/*
200 * Get a socket structure from our zone, and initialize it.
201 * Note that it would probably be better to allocate socket
202 * and PCB at the same time, but I'm not convinced that all
203 * the protocols can be easily modified to do this.
204 *
205 * soalloc() returns a socket with a ref count of 0.
206 */
207struct socket *
208soalloc(int mflags)
209{
210 struct socket *so;
211
212 so = uma_zalloc(socket_zone, mflags | M_ZERO);
213 if (so != NULL) {
214#ifdef MAC
215 if (mac_init_socket(so, mflags) != 0) {
216 uma_zfree(socket_zone, so);
217 return (NULL);
218 }
219#endif
220 SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd");
221 SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv");
222 TAILQ_INIT(&so->so_aiojobq);
223 mtx_lock(&so_global_mtx);
224 so->so_gencnt = ++so_gencnt;
225 ++numopensockets;
226 mtx_unlock(&so_global_mtx);
227 }
228 return (so);
229}
230
231/*
232 * socreate returns a socket with a ref count of 1. The socket should be
233 * closed with soclose().
234 */
235int
236socreate(dom, aso, type, proto, cred, td)
237 int dom;
238 struct socket **aso;
239 int type;
240 int proto;
241 struct ucred *cred;
242 struct thread *td;
243{
244 struct protosw *prp;
245 struct socket *so;
246 int error;
247
248 if (proto)
249 prp = pffindproto(dom, proto, type);
250 else
251 prp = pffindtype(dom, type);
252
253 if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL ||
254 prp->pr_usrreqs->pru_attach == pru_attach_notsupp)
255 return (EPROTONOSUPPORT);
256
257 if (jailed(cred) && jail_socket_unixiproute_only &&
258 prp->pr_domain->dom_family != PF_LOCAL &&
259 prp->pr_domain->dom_family != PF_INET &&
260 prp->pr_domain->dom_family != PF_ROUTE) {
261 return (EPROTONOSUPPORT);
262 }
263
264 if (prp->pr_type != type)
265 return (EPROTOTYPE);
266 so = soalloc(M_WAITOK);
267 if (so == NULL)
268 return (ENOBUFS);
269
270 TAILQ_INIT(&so->so_incomp);
271 TAILQ_INIT(&so->so_comp);
272 so->so_type = type;
273 so->so_cred = crhold(cred);
274 so->so_proto = prp;
275#ifdef MAC
276 mac_create_socket(cred, so);
277#endif
278 knlist_init(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv),
279 NULL, NULL, NULL);
280 knlist_init(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd),
281 NULL, NULL, NULL);
282 so->so_count = 1;
283 error = (*prp->pr_usrreqs->pru_attach)(so, proto, td);
284 if (error) {
285 ACCEPT_LOCK();
286 SOCK_LOCK(so);
287 so->so_state |= SS_NOFDREF;
288 sorele(so);
289 return (error);
290 }
291 *aso = so;
292 return (0);
293}
294
295int
296sobind(so, nam, td)
297 struct socket *so;
298 struct sockaddr *nam;
299 struct thread *td;
300{
301
302 return ((*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td));
303}
304
305void
306sodealloc(struct socket *so)
307{
308
309 KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count));
310 KASSERT(so->so_pcb == NULL, ("sodealloc(): so_pcb != NULL"));
311
312 mtx_lock(&so_global_mtx);
313 so->so_gencnt = ++so_gencnt;
314 mtx_unlock(&so_global_mtx);
315 if (so->so_rcv.sb_hiwat)
316 (void)chgsbsize(so->so_cred->cr_uidinfo,
317 &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY);
318 if (so->so_snd.sb_hiwat)
319 (void)chgsbsize(so->so_cred->cr_uidinfo,
320 &so->so_snd.sb_hiwat, 0, RLIM_INFINITY);
321#ifdef INET
322	/* remove accept filter if one is present. */
323 if (so->so_accf != NULL)
324 do_setopt_accept_filter(so, NULL);
325#endif
326#ifdef MAC
327 mac_destroy_socket(so);
328#endif
329 crfree(so->so_cred);
330 SOCKBUF_LOCK_DESTROY(&so->so_snd);
331 SOCKBUF_LOCK_DESTROY(&so->so_rcv);
332 uma_zfree(socket_zone, so);
333 mtx_lock(&so_global_mtx);
334 --numopensockets;
335 mtx_unlock(&so_global_mtx);
336}
337
338/*
339 * solisten() transitions a socket from a non-listening state to a listening
340 * state, but can also be used to update the listen queue depth on an
341 * existing listen socket. The protocol will call back into the sockets
342 * layer using solisten_proto_check() and solisten_proto() to check and set
343 * socket-layer listen state. Callbacks are used so that the protocol can
344 * acquire both protocol and socket layer locks in whatever order is required
345 * by the protocol.
346 *
347 * Protocol implementors are advised to hold the socket lock across the
348 * socket-layer test and set to avoid races at the socket layer.
349 */
350int
351solisten(so, backlog, td)
352 struct socket *so;
353 int backlog;
354 struct thread *td;
355{
356
357 return ((*so->so_proto->pr_usrreqs->pru_listen)(so, backlog, td));
358}
359
360int
361solisten_proto_check(so)
362 struct socket *so;
363{
364
365 SOCK_LOCK_ASSERT(so);
366
367 if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING |
368 SS_ISDISCONNECTING))
369 return (EINVAL);
370 return (0);
371}
372
373void
374solisten_proto(so, backlog)
375 struct socket *so;
376 int backlog;
377{
378
379 SOCK_LOCK_ASSERT(so);
380
381 if (backlog < 0 || backlog > somaxconn)
382 backlog = somaxconn;
383 so->so_qlimit = backlog;
384 so->so_options |= SO_ACCEPTCONN;
385}
386
387/*
388 * Attempt to free a socket. This should really be sotryfree().
389 *
390 * We free the socket if the protocol is no longer interested in the socket,
391 * there's no file descriptor reference, and the refcount is 0. While the
392 * calling macro sotryfree() tests the refcount, sofree() has to test it
393 * again as it's possible to race with an accept()ing thread if the socket is
394 * in a listen queue of a listen socket, as being in the listen queue
395 * doesn't elevate the reference count. sofree() acquires the accept mutex
396 * early for this test in order to avoid that race.
397 */
398void
399sofree(so)
400 struct socket *so;
401{
402 struct socket *head;
403
404 ACCEPT_LOCK_ASSERT();
405 SOCK_LOCK_ASSERT(so);
406
407 if (so->so_pcb != NULL || (so->so_state & SS_NOFDREF) == 0 ||
408 so->so_count != 0 || (so->so_state & SS_PROTOREF)) {
409 SOCK_UNLOCK(so);
410 ACCEPT_UNLOCK();
411 return;
412 }
413
414 head = so->so_head;
415 if (head != NULL) {
416 KASSERT((so->so_qstate & SQ_COMP) != 0 ||
417 (so->so_qstate & SQ_INCOMP) != 0,
418 ("sofree: so_head != NULL, but neither SQ_COMP nor "
419 "SQ_INCOMP"));
420 KASSERT((so->so_qstate & SQ_COMP) == 0 ||
421 (so->so_qstate & SQ_INCOMP) == 0,
422 ("sofree: so->so_qstate is SQ_COMP and also SQ_INCOMP"));
423 /*
424		 * accept(2) is responsible for draining the completed
425		 * connection queue and freeing those sockets, so
426		 * we just return here if this socket is currently
427		 * on the completed connection queue. Otherwise,
428		 * accept(2) may hang after select(2) has indicated
429 * that a listening socket was ready. If it's an
430 * incomplete connection, we remove it from the queue
431 * and free it; otherwise, it won't be released until
432 * the listening socket is closed.
433 */
434 if ((so->so_qstate & SQ_COMP) != 0) {
435 SOCK_UNLOCK(so);
436 ACCEPT_UNLOCK();
437 return;
438 }
439 TAILQ_REMOVE(&head->so_incomp, so, so_list);
440 head->so_incqlen--;
441 so->so_qstate &= ~SQ_INCOMP;
442 so->so_head = NULL;
443 }
444 KASSERT((so->so_qstate & SQ_COMP) == 0 &&
445 (so->so_qstate & SQ_INCOMP) == 0,
446 ("sofree: so_head == NULL, but still SQ_COMP(%d) or SQ_INCOMP(%d)",
447 so->so_qstate & SQ_COMP, so->so_qstate & SQ_INCOMP));
448 SOCK_UNLOCK(so);
449 ACCEPT_UNLOCK();
450 SOCKBUF_LOCK(&so->so_snd);
451 so->so_snd.sb_flags |= SB_NOINTR;
452 (void)sblock(&so->so_snd, M_WAITOK);
453 /*
454 * socantsendmore_locked() drops the socket buffer mutex so that it
455 * can safely perform wakeups. Re-acquire the mutex before
456 * continuing.
457 */
458 socantsendmore_locked(so);
459 SOCKBUF_LOCK(&so->so_snd);
460 sbunlock(&so->so_snd);
461 sbrelease_locked(&so->so_snd, so);
462 SOCKBUF_UNLOCK(&so->so_snd);
463 sorflush(so);
464 knlist_destroy(&so->so_rcv.sb_sel.si_note);
465 knlist_destroy(&so->so_snd.sb_sel.si_note);
466 sodealloc(so);
467}
468
469/*
470 * Close a socket on last file table reference removal.
471 * Initiate disconnect if connected.
472 * Free socket when disconnect complete.
473 *
474 * This function will sorele() the socket. Note that soclose() may be
475 * called prior to the ref count reaching zero. The actual socket
476 * structure will not be freed until the ref count reaches zero.
477 */
478int
479soclose(so)
480 struct socket *so;
481{
482 int error = 0;
483
484 KASSERT(!(so->so_state & SS_NOFDREF), ("soclose: SS_NOFDREF on enter"));
485
486 funsetown(&so->so_sigio);
487 if (so->so_options & SO_ACCEPTCONN) {
488 struct socket *sp;
489 ACCEPT_LOCK();
490 while ((sp = TAILQ_FIRST(&so->so_incomp)) != NULL) {
491 TAILQ_REMOVE(&so->so_incomp, sp, so_list);
492 so->so_incqlen--;
493 sp->so_qstate &= ~SQ_INCOMP;
494 sp->so_head = NULL;
495 ACCEPT_UNLOCK();
496 soabort(sp);
497 ACCEPT_LOCK();
498 }
499 while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
500 TAILQ_REMOVE(&so->so_comp, sp, so_list);
501 so->so_qlen--;
502 sp->so_qstate &= ~SQ_COMP;
503 sp->so_head = NULL;
504 ACCEPT_UNLOCK();
505 soabort(sp);
506 ACCEPT_LOCK();
507 }
508 ACCEPT_UNLOCK();
509 }
510 if (so->so_pcb == NULL)
511 goto discard;
512 if (so->so_state & SS_ISCONNECTED) {
513 if ((so->so_state & SS_ISDISCONNECTING) == 0) {
514 error = sodisconnect(so);
515 if (error)
516 goto drop;
517 }
518 if (so->so_options & SO_LINGER) {
519 if ((so->so_state & SS_ISDISCONNECTING) &&
520 (so->so_state & SS_NBIO))
521 goto drop;
522 while (so->so_state & SS_ISCONNECTED) {
523 error = tsleep(&so->so_timeo,
524 PSOCK | PCATCH, "soclos", so->so_linger * hz);
525 if (error)
526 break;
527 }
528 }
529 }
530drop:
531 if (so->so_pcb != NULL) {
532 int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
533 if (error == 0)
534 error = error2;
535 }
536discard:
537 ACCEPT_LOCK();
538 SOCK_LOCK(so);
539 KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF"));
540 so->so_state |= SS_NOFDREF;
541 sorele(so);
542 return (error);
543}
544
545/*
546 * soabort() allows the socket code or protocol code to detach a socket that
547 * has been in an incomplete or completed listen queue, but has not yet been
548 * accepted.
549 *
550 * This interface is tricky, because it is called on an unreferenced socket,
551 * and must be called only by a thread that has actually removed the socket
552 * from the listen queue it was on, or races with other threads are risked.
553 *
554 * This interface will call into the protocol code, so must not be called
555 * with any socket locks held. Protocols do call it while holding their own
556 * recursible protocol mutexes, but this is something that should be subject
557 * to review in the future.
558 *
559 * XXXRW: Why do we maintain a distinction between pru_abort() and
560 * pru_detach()?
561 */
562void
563soabort(so)
564 struct socket *so;
565{
566
567 /*
568 * In as much as is possible, assert that no references to this
569 * socket are held. This is not quite the same as asserting that the
570 * current thread is responsible for arranging for no references, but
571 * is as close as we can get for now.
572 */
573 KASSERT(so->so_count == 0, ("soabort: so_count"));
574 KASSERT(!(so->so_state & SS_PROTOREF), ("soabort: SS_PROTOREF"));
575 KASSERT(so->so_state & SS_NOFDREF, ("soabort: !SS_NOFDREF"));
576
577 (*so->so_proto->pr_usrreqs->pru_abort)(so);
578 ACCEPT_LOCK();
579 SOCK_LOCK(so);
580 sofree(so);
581}
582
583int
584soaccept(so, nam)
585 struct socket *so;
586 struct sockaddr **nam;
587{
588 int error;
589
590 SOCK_LOCK(so);
591 KASSERT((so->so_state & SS_NOFDREF) != 0, ("soaccept: !NOFDREF"));
592 so->so_state &= ~SS_NOFDREF;
593 SOCK_UNLOCK(so);
594 error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
595 return (error);
596}
597
598int
599soconnect(so, nam, td)
600 struct socket *so;
601 struct sockaddr *nam;
602 struct thread *td;
603{
604 int error;
605
606 if (so->so_options & SO_ACCEPTCONN)
607 return (EOPNOTSUPP);
608 /*
609 * If protocol is connection-based, can only connect once.
610 * Otherwise, if connected, try to disconnect first.
611 * This allows user to disconnect by connecting to, e.g.,
612 * a null address.
613 */
614 if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
615 ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
616 (error = sodisconnect(so)))) {
617 error = EISCONN;
618 } else {
619 /*
620 * Prevent accumulated error from previous connection
621 * from biting us.
622 */
623 so->so_error = 0;
624 error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, td);
625 }
626
627 return (error);
628}
629
630int
631soconnect2(so1, so2)
632 struct socket *so1;
633 struct socket *so2;
634{
635
636 return ((*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2));
637}
638
639int
640sodisconnect(so)
641 struct socket *so;
642{
643 int error;
644
645 if ((so->so_state & SS_ISCONNECTED) == 0)
646 return (ENOTCONN);
647 if (so->so_state & SS_ISDISCONNECTING)
648 return (EALREADY);
649 error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
650 return (error);
651}
652
653#ifdef ZERO_COPY_SOCKETS
654struct so_zerocopy_stats{
655 int size_ok;
656 int align_ok;
657 int found_ifp;
658};
659struct so_zerocopy_stats so_zerocp_stats = {0,0,0};
660#include <netinet/in.h>
661#include <net/route.h>
662#include <netinet/in_pcb.h>
663#include <vm/vm.h>
664#include <vm/vm_page.h>
665#include <vm/vm_object.h>
666#endif /*ZERO_COPY_SOCKETS*/
667
668/*
669 * sosend_copyin() accepts a uio and prepares an mbuf chain holding part or
670 * all of the data referenced by the uio. If desired, it uses zero-copy.
671 * *space will be updated to reflect data copied in.
672 *
673 * NB: If atomic I/O is requested, the caller must already have checked that
674 * space can hold resid bytes.
675 *
676 * NB: In the event of an error, the caller may need to free the partial
677 * chain pointed to by *mpp. The contents of both *uio and *space may be
678 * modified even in the case of an error.
679 */
680static int
681sosend_copyin(struct uio *uio, struct mbuf **retmp, int atomic, long *space,
682 int flags)
683{
684 struct mbuf *m, **mp, *top;
685 long len, resid;
686 int error;
687#ifdef ZERO_COPY_SOCKETS
688 int cow_send;
689#endif
690
691 *retmp = top = NULL;
692 mp = &top;
693 len = 0;
694 resid = uio->uio_resid;
695 error = 0;
696 do {
697#ifdef ZERO_COPY_SOCKETS
698 cow_send = 0;
699#endif /* ZERO_COPY_SOCKETS */
700 if (resid >= MINCLSIZE) {
701#ifdef ZERO_COPY_SOCKETS
702 if (top == NULL) {
703 MGETHDR(m, M_TRYWAIT, MT_DATA);
704 if (m == NULL) {
705 error = ENOBUFS;
706 goto out;
707 }
708 m->m_pkthdr.len = 0;
709 m->m_pkthdr.rcvif = NULL;
710 } else {
711 MGET(m, M_TRYWAIT, MT_DATA);
712 if (m == NULL) {
713 error = ENOBUFS;
714 goto out;
715 }
716 }
717 if (so_zero_copy_send &&
718 resid>=PAGE_SIZE &&
719 *space>=PAGE_SIZE &&
720 uio->uio_iov->iov_len>=PAGE_SIZE) {
721 so_zerocp_stats.size_ok++;
722 so_zerocp_stats.align_ok++;
723 cow_send = socow_setup(m, uio);
724 len = cow_send;
725 }
726 if (!cow_send) {
727 MCLGET(m, M_TRYWAIT);
728 if ((m->m_flags & M_EXT) == 0) {
729 m_free(m);
730 m = NULL;
731 } else {
732 len = min(min(MCLBYTES, resid),
733 *space);
734 }
735 }
736#else /* ZERO_COPY_SOCKETS */
737 if (top == NULL) {
738 m = m_getcl(M_TRYWAIT, MT_DATA, M_PKTHDR);
739 m->m_pkthdr.len = 0;
740 m->m_pkthdr.rcvif = NULL;
741 } else
742 m = m_getcl(M_TRYWAIT, MT_DATA, 0);
743 len = min(min(MCLBYTES, resid), *space);
744#endif /* ZERO_COPY_SOCKETS */
745 } else {
746 if (top == NULL) {
747 m = m_gethdr(M_TRYWAIT, MT_DATA);
748 m->m_pkthdr.len = 0;
749 m->m_pkthdr.rcvif = NULL;
750
751 len = min(min(MHLEN, resid), *space);
752 /*
753 * For datagram protocols, leave room
754 * for protocol headers in first mbuf.
755 */
756 if (atomic && m && len < MHLEN)
757 MH_ALIGN(m, len);
758 } else {
759 m = m_get(M_TRYWAIT, MT_DATA);
760 len = min(min(MLEN, resid), *space);
761 }
762 }
763 if (m == NULL) {
764 error = ENOBUFS;
765 goto out;
766 }
767
768 *space -= len;
769#ifdef ZERO_COPY_SOCKETS
770 if (cow_send)
771 error = 0;
772 else
773#endif /* ZERO_COPY_SOCKETS */
774 error = uiomove(mtod(m, void *), (int)len, uio);
775 resid = uio->uio_resid;
776 m->m_len = len;
777 *mp = m;
778 top->m_pkthdr.len += len;
779 if (error)
780 goto out;
781 mp = &m->m_next;
782 if (resid <= 0) {
783 if (flags & MSG_EOR)
784 top->m_flags |= M_EOR;
785 break;
786 }
787 } while (*space > 0 && atomic);
788out:
789 *retmp = top;
790 return (error);
791}
792
793#define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
794
795int
796sosend_dgram(so, addr, uio, top, control, flags, td)
797 struct socket *so;
798 struct sockaddr *addr;
799 struct uio *uio;
800 struct mbuf *top;
801 struct mbuf *control;
802 int flags;
803 struct thread *td;
804{
805 long space, resid;
806 int clen = 0, error, dontroute;
807 int atomic = sosendallatonce(so) || top;
808
809	KASSERT(so->so_type == SOCK_DGRAM, ("sosend_dgram: !SOCK_DGRAM"));
810	KASSERT(so->so_proto->pr_flags & PR_ATOMIC,
811	    ("sosend_dgram: !PR_ATOMIC"));
812
813 if (uio != NULL)
814 resid = uio->uio_resid;
815 else
816 resid = top->m_pkthdr.len;
817 /*
818 * In theory resid should be unsigned.
819 * However, space must be signed, as it might be less than 0
820 * if we over-committed, and we must use a signed comparison
821 * of space and resid. On the other hand, a negative resid
822 * causes us to loop sending 0-length segments to the protocol.
826 */
827 if (resid < 0) {
828 error = EINVAL;
829 goto out;
830 }
831
832 dontroute =
833 (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0;
834 if (td != NULL)
835 td->td_proc->p_stats->p_ru.ru_msgsnd++;
836 if (control != NULL)
837 clen = control->m_len;
838
839 SOCKBUF_LOCK(&so->so_snd);
840 if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
841 SOCKBUF_UNLOCK(&so->so_snd);
842 error = EPIPE;
843 goto out;
844 }
845 if (so->so_error) {
846 error = so->so_error;
847 so->so_error = 0;
848 SOCKBUF_UNLOCK(&so->so_snd);
849 goto out;
850 }
851 if ((so->so_state & SS_ISCONNECTED) == 0) {
852 /*
853		 * `sendto' and `sendmsg' are allowed on a connection-
854 * based socket if it supports implied connect.
855 * Return ENOTCONN if not connected and no address is
856 * supplied.
857 */
858 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
859 (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
860 if ((so->so_state & SS_ISCONFIRMING) == 0 &&
861 !(resid == 0 && clen != 0)) {
862 SOCKBUF_UNLOCK(&so->so_snd);
863 error = ENOTCONN;
864 goto out;
865 }
866 } else if (addr == NULL) {
867 if (so->so_proto->pr_flags & PR_CONNREQUIRED)
868 error = ENOTCONN;
869 else
870 error = EDESTADDRREQ;
871 SOCKBUF_UNLOCK(&so->so_snd);
872 goto out;
873 }
874 }
875
876 /*
877 * Do we need MSG_OOB support in SOCK_DGRAM? Signs here may be a
878 * problem and need fixing.
879 */
880 space = sbspace(&so->so_snd);
881 if (flags & MSG_OOB)
882 space += 1024;
883 space -= clen;
884	SOCKBUF_UNLOCK(&so->so_snd);
885	if (resid > space) {
886		error = EMSGSIZE;
887		goto out;
888	}
889 if (uio == NULL) {
890 resid = 0;
891 if (flags & MSG_EOR)
892 top->m_flags |= M_EOR;
893 } else {
894 error = sosend_copyin(uio, &top, atomic, &space, flags);
895 if (error)
896 goto out;
897 resid = uio->uio_resid;
898 }
899 KASSERT(resid == 0, ("sosend_dgram: resid != 0"));
900 /*
901 * XXXRW: Frobbing SO_DONTROUTE here is even worse without sblock
902 * than with.
903 */
904 if (dontroute) {
905 SOCK_LOCK(so);
906 so->so_options |= SO_DONTROUTE;
907 SOCK_UNLOCK(so);
908 }
909 /*
910 * XXX all the SBS_CANTSENDMORE checks previously
911	 * done could be out of date. We could have received
912 * a reset packet in an interrupt or maybe we slept
913 * while doing page faults in uiomove() etc. We could
914 * probably recheck again inside the locking protection
915 * here, but there are probably other places that this
916 * also happens. We must rethink this.
917 */
918 error = (*so->so_proto->pr_usrreqs->pru_send)(so,
919 (flags & MSG_OOB) ? PRUS_OOB :
920 /*
921	     * If the user set MSG_EOF, the protocol
922	     * understands this flag, and there is nothing left
923	     * to send, then use PRU_SEND_EOF instead of PRU_SEND.
924 */
925 ((flags & MSG_EOF) &&
926 (so->so_proto->pr_flags & PR_IMPLOPCL) &&
927 (resid <= 0)) ?
928 PRUS_EOF :
929 /* If there is more to send set PRUS_MORETOCOME */
930 (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
931 top, addr, control, td);
932 if (dontroute) {
933 SOCK_LOCK(so);
934 so->so_options &= ~SO_DONTROUTE;
935 SOCK_UNLOCK(so);
936 }
937 clen = 0;
938 control = NULL;
939 top = NULL;
940out:
941 if (top != NULL)
942 m_freem(top);
943 if (control != NULL)
944 m_freem(control);
945 return (error);
946}
947
948/*
949 * Send on a socket.
950 * If send must go all at once and message is larger than
951 * send buffering, then hard error.
952 * Lock against other senders.
953 * If must go all at once and not enough room now, then
954 * inform user that this would block and do nothing.
955 * Otherwise, if nonblocking, send as much as possible.
956 * The data to be sent is described by "uio" if nonzero,
957 * otherwise by the mbuf chain "top" (which must be null
958 * if uio is not). Data provided in mbuf chain must be small
959 * enough to send all at once.
960 *
961 * Returns nonzero on error, timeout or signal; callers
962 * must check for short counts if EINTR/ERESTART are returned.
963 * Data and control buffers are freed on return.
964 */
965#define snderr(errno) { error = (errno); goto release; }
966int
967sosend(so, addr, uio, top, control, flags, td)
968 struct socket *so;
969 struct sockaddr *addr;
970 struct uio *uio;
971 struct mbuf *top;
972 struct mbuf *control;
973 int flags;
974 struct thread *td;
975{
976 long space, resid;
977 int clen = 0, error, dontroute;
978 int atomic = sosendallatonce(so) || top;
979
980 if (uio != NULL)
981 resid = uio->uio_resid;
982 else
983 resid = top->m_pkthdr.len;
984 /*
985 * In theory resid should be unsigned.
986 * However, space must be signed, as it might be less than 0
987 * if we over-committed, and we must use a signed comparison
988 * of space and resid. On the other hand, a negative resid
989 * causes us to loop sending 0-length segments to the protocol.
990 *
991 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
992 * type sockets since that's an error.
993 */
994 if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
995 error = EINVAL;
996 goto out;
997 }
998
999 dontroute =
1000 (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
1001 (so->so_proto->pr_flags & PR_ATOMIC);
1002 if (td != NULL)
1003 td->td_proc->p_stats->p_ru.ru_msgsnd++;
1004 if (control != NULL)
1005 clen = control->m_len;
1006
1007 SOCKBUF_LOCK(&so->so_snd);
1008restart:
1009 SOCKBUF_LOCK_ASSERT(&so->so_snd);
1010 error = sblock(&so->so_snd, SBLOCKWAIT(flags));
1011 if (error)
1012 goto out_locked;
1013 do {
1014 SOCKBUF_LOCK_ASSERT(&so->so_snd);
1015 if (so->so_snd.sb_state & SBS_CANTSENDMORE)
1016 snderr(EPIPE);
1017 if (so->so_error) {
1018 error = so->so_error;
1019 so->so_error = 0;
1020 goto release;
1021 }
1022 if ((so->so_state & SS_ISCONNECTED) == 0) {
1023 /*
1024			 * `sendto' and `sendmsg' are allowed on a connection-
1025 * based socket if it supports implied connect.
1026 * Return ENOTCONN if not connected and no address is
1027 * supplied.
1028 */
1029 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
1030 (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
1031 if ((so->so_state & SS_ISCONFIRMING) == 0 &&
1032 !(resid == 0 && clen != 0))
1033 snderr(ENOTCONN);
1034 } else if (addr == NULL)
1035 snderr(so->so_proto->pr_flags & PR_CONNREQUIRED ?
1036 ENOTCONN : EDESTADDRREQ);
1037 }
1038 space = sbspace(&so->so_snd);
1039 if (flags & MSG_OOB)
1040 space += 1024;
1041 if ((atomic && resid > so->so_snd.sb_hiwat) ||
1042 clen > so->so_snd.sb_hiwat)
1043 snderr(EMSGSIZE);
1044 if (space < resid + clen &&
1045 (atomic || space < so->so_snd.sb_lowat || space < clen)) {
1046 if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO))
1047 snderr(EWOULDBLOCK);
1048 sbunlock(&so->so_snd);
1049 error = sbwait(&so->so_snd);
1050 if (error)
1051 goto out_locked;
1052 goto restart;
1053 }
1054 SOCKBUF_UNLOCK(&so->so_snd);
1055 space -= clen;
1056 do {
1057 if (uio == NULL) {
1058 resid = 0;
1059 if (flags & MSG_EOR)
1060 top->m_flags |= M_EOR;
1061 } else {
1062 error = sosend_copyin(uio, &top, atomic,
1063 &space, flags);
1064 if (error != 0) {
1065 SOCKBUF_LOCK(&so->so_snd);
1066 goto release;
1067 }
1068 resid = uio->uio_resid;
1069 }
1070 if (dontroute) {
1071 SOCK_LOCK(so);
1072 so->so_options |= SO_DONTROUTE;
1073 SOCK_UNLOCK(so);
1074 }
1075 /*
1076 * XXX all the SBS_CANTSENDMORE checks previously
1077		 * done could be out of date. We could have received
1078 * a reset packet in an interrupt or maybe we slept
1079 * while doing page faults in uiomove() etc. We could
1080 * probably recheck again inside the locking protection
1081 * here, but there are probably other places that this
1082 * also happens. We must rethink this.
1083 */
1084 error = (*so->so_proto->pr_usrreqs->pru_send)(so,
1085 (flags & MSG_OOB) ? PRUS_OOB :
1086 /*
1087		     * If the user set MSG_EOF, the protocol
1088		     * understands this flag, and there is nothing left
1089		     * to send, then use PRU_SEND_EOF instead of PRU_SEND.
1090 */
1091 ((flags & MSG_EOF) &&
1092 (so->so_proto->pr_flags & PR_IMPLOPCL) &&
1093 (resid <= 0)) ?
1094 PRUS_EOF :
1095 /* If there is more to send set PRUS_MORETOCOME */
1096 (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
1097 top, addr, control, td);
1098 if (dontroute) {
1099 SOCK_LOCK(so);
1100 so->so_options &= ~SO_DONTROUTE;
1101 SOCK_UNLOCK(so);
1102 }
1103 clen = 0;
1104 control = NULL;
1105 top = NULL;
1106 if (error) {
1107 SOCKBUF_LOCK(&so->so_snd);
1108 goto release;
1109 }
1110 } while (resid && space > 0);
1111 SOCKBUF_LOCK(&so->so_snd);
1112 } while (resid);
1113
1114release:
1115 SOCKBUF_LOCK_ASSERT(&so->so_snd);
1116 sbunlock(&so->so_snd);
1117out_locked:
1118 SOCKBUF_LOCK_ASSERT(&so->so_snd);
1119 SOCKBUF_UNLOCK(&so->so_snd);
1120out:
1121 if (top != NULL)
1122 m_freem(top);
1123 if (control != NULL)
1124 m_freem(control);
1125 return (error);
1126}
1127#undef snderr
1128
1129/*
1130 * The part of soreceive() that implements reading non-inline out-of-band
1131 * data from a socket. For more complete comments, see soreceive(), from
1132 * which this code originated.
1133 *
1134 * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is
1135 * unable to return an mbuf chain to the caller.
1136 */
1137static int
1138soreceive_rcvoob(so, uio, flags)
1139 struct socket *so;
1140 struct uio *uio;
1141 int flags;
1142{
1143 struct protosw *pr = so->so_proto;
1144 struct mbuf *m;
1145 int error;
1146
1147 KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0"));
1148
1149 m = m_get(M_TRYWAIT, MT_DATA);
1150 if (m == NULL)
1151 return (ENOBUFS);
1152 error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
1153 if (error)
1154 goto bad;
1155 do {
1156#ifdef ZERO_COPY_SOCKETS
1157 if (so_zero_copy_receive) {
1158 int disposable;
1159
1160 if ((m->m_flags & M_EXT)
1161 && (m->m_ext.ext_type == EXT_DISPOSABLE))
1162 disposable = 1;
1163 else
1164 disposable = 0;
1165
1166 error = uiomoveco(mtod(m, void *),
1167 min(uio->uio_resid, m->m_len),
1168 uio, disposable);
1169 } else
1170#endif /* ZERO_COPY_SOCKETS */
1171 error = uiomove(mtod(m, void *),
1172 (int) min(uio->uio_resid, m->m_len), uio);
1173 m = m_free(m);
1174 } while (uio->uio_resid && error == 0 && m);
1175bad:
1176 if (m != NULL)
1177 m_freem(m);
1178 return (error);
1179}
1180
1181/*
1182 * Following replacement or removal of the first mbuf on the first mbuf chain
1183 * of a socket buffer, push necessary state changes back into the socket
1184 * buffer so that other consumers see the values consistently. 'nextrecord'
1185 * is the caller's locally stored copy of the original value of
1186 * sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes.
1187 * NOTE: 'nextrecord' may be NULL.
1188 */
1189static __inline void
1190sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord)
1191{
1192
1193 SOCKBUF_LOCK_ASSERT(sb);
1194 /*
1195 * First, update for the new value of nextrecord. If necessary, make
1196 * it the first record.
1197 */
1198 if (sb->sb_mb != NULL)
1199 sb->sb_mb->m_nextpkt = nextrecord;
1200 else
1201 sb->sb_mb = nextrecord;
1202
1203 /*
1204 * Now update any dependent socket buffer fields to reflect the new
1205 * state. This is an expanded inline of SB_EMPTY_FIXUP(), with the
1206 * addition of a second clause that takes care of the case where
1207 * sb_mb has been updated, but remains the last record.
1208 */
1209 if (sb->sb_mb == NULL) {
1210 sb->sb_mbtail = NULL;
1211 sb->sb_lastrecord = NULL;
1212 } else if (sb->sb_mb->m_nextpkt == NULL)
1213 sb->sb_lastrecord = sb->sb_mb;
1214}
1215
1216
1217/*
1218 * Implement receive operations on a socket.
1219 * We depend on the way that records are added to the sockbuf
1220 * by sbappend*. In particular, each record (mbufs linked through m_next)
1221 * must begin with an address if the protocol so specifies,
1222 * followed by an optional mbuf or mbufs containing ancillary data,
1223 * and then zero or more mbufs of data.
1224 * In order to avoid blocking the network stack for the entire time here,
1225 * we release the socket buffer lock while doing the actual copy to user space.
1226 * Although the sockbuf is locked, new data may still be appended,
1227 * and thus we must maintain consistency of the sockbuf during that time.
1228 *
1229 * The caller may receive the data as a single mbuf chain by supplying
1230 * an mbuf **mp0 for use in returning the chain. The uio is then used
1231 * only for the count in uio_resid.
1232 */
1233int
1234soreceive(so, psa, uio, mp0, controlp, flagsp)
1235 struct socket *so;
1236 struct sockaddr **psa;
1237 struct uio *uio;
1238 struct mbuf **mp0;
1239 struct mbuf **controlp;
1240 int *flagsp;
1241{
1242 struct mbuf *m, **mp;
1243 int flags, len, error, offset;
1244 struct protosw *pr = so->so_proto;
1245 struct mbuf *nextrecord;
1246 int moff, type = 0;
1247 int orig_resid = uio->uio_resid;
1248
1249 mp = mp0;
1250 if (psa != NULL)
1251 *psa = NULL;
1252 if (controlp != NULL)
1253 *controlp = NULL;
1254 if (flagsp != NULL)
1255 flags = *flagsp &~ MSG_EOR;
1256 else
1257 flags = 0;
1258 if (flags & MSG_OOB)
1259 return (soreceive_rcvoob(so, uio, flags));
1260 if (mp != NULL)
1261 *mp = NULL;
1262 if ((pr->pr_flags & PR_WANTRCVD) && (so->so_state & SS_ISCONFIRMING)
1263 && uio->uio_resid)
1264 (*pr->pr_usrreqs->pru_rcvd)(so, 0);
1265
1266 SOCKBUF_LOCK(&so->so_rcv);
1267restart:
1268 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1269 error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
1270 if (error)
1271 goto out;
1272
1273 m = so->so_rcv.sb_mb;
1274 /*
1275 * If we have less data than requested, block awaiting more
1276 * (subject to any timeout) if:
1277 * 1. the current count is less than the low water mark, or
1278 * 2. MSG_WAITALL is set, and it is possible to do the entire
1279	 *	receive operation at once if we block (resid <= hiwat), and
1280	 *   3. MSG_DONTWAIT is not set.
1281 * If MSG_WAITALL is set but resid is larger than the receive buffer,
1282 * we have to do the receive in sections, and thus risk returning
1283 * a short count if a timeout or signal occurs after we start.
1284 */
1285 if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
1286 so->so_rcv.sb_cc < uio->uio_resid) &&
1287 (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
1288 ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
1289 m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
1290 KASSERT(m != NULL || !so->so_rcv.sb_cc,
1291 ("receive: m == %p so->so_rcv.sb_cc == %u",
1292 m, so->so_rcv.sb_cc));
1293 if (so->so_error) {
1294 if (m != NULL)
1295 goto dontblock;
1296 error = so->so_error;
1297 if ((flags & MSG_PEEK) == 0)
1298 so->so_error = 0;
1299 goto release;
1300 }
1301 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1302 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
1303 if (m)
1304 goto dontblock;
1305 else
1306 goto release;
1307 }
1308 for (; m != NULL; m = m->m_next)
1309 if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
1310 m = so->so_rcv.sb_mb;
1311 goto dontblock;
1312 }
1313 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
1314 (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
1315 error = ENOTCONN;
1316 goto release;
1317 }
1318 if (uio->uio_resid == 0)
1319 goto release;
1320 if ((so->so_state & SS_NBIO) ||
1321 (flags & (MSG_DONTWAIT|MSG_NBIO))) {
1322 error = EWOULDBLOCK;
1323 goto release;
1324 }
1325 SBLASTRECORDCHK(&so->so_rcv);
1326 SBLASTMBUFCHK(&so->so_rcv);
1327 sbunlock(&so->so_rcv);
1328 error = sbwait(&so->so_rcv);
1329 if (error)
1330 goto out;
1331 goto restart;
1332 }
1333dontblock:
1334 /*
1335 * From this point onward, we maintain 'nextrecord' as a cache of the
1336 * pointer to the next record in the socket buffer. We must keep the
1337 * various socket buffer pointers and local stack versions of the
1338 * pointers in sync, pushing out modifications before dropping the
1339 * socket buffer mutex, and re-reading them when picking it up.
1340 *
1341 * Otherwise, we will race with the network stack appending new data
1342 * or records onto the socket buffer by using inconsistent/stale
1343 * versions of the field, possibly resulting in socket buffer
1344 * corruption.
1345 *
1346 * By holding the high-level sblock(), we prevent simultaneous
1347 * readers from pulling off the front of the socket buffer.
1348 */
1349 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1350 if (uio->uio_td)
1351 uio->uio_td->td_proc->p_stats->p_ru.ru_msgrcv++;
1352 KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb"));
1353 SBLASTRECORDCHK(&so->so_rcv);
1354 SBLASTMBUFCHK(&so->so_rcv);
1355 nextrecord = m->m_nextpkt;
1356 if (pr->pr_flags & PR_ADDR) {
1357 KASSERT(m->m_type == MT_SONAME,
1358 ("m->m_type == %d", m->m_type));
1359 orig_resid = 0;
1360 if (psa != NULL)
1361 *psa = sodupsockaddr(mtod(m, struct sockaddr *),
1362 M_NOWAIT);
1363 if (flags & MSG_PEEK) {
1364 m = m->m_next;
1365 } else {
1366 sbfree(&so->so_rcv, m);
1367 so->so_rcv.sb_mb = m_free(m);
1368 m = so->so_rcv.sb_mb;
1369 sockbuf_pushsync(&so->so_rcv, nextrecord);
1370 }
1371 }
1372
1373 /*
1374 * Process one or more MT_CONTROL mbufs present before any data mbufs
1375 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
1376 * just copy the data; if !MSG_PEEK, we call into the protocol to
1377 * perform externalization (or freeing if controlp == NULL).
1378 */
1379 if (m != NULL && m->m_type == MT_CONTROL) {
1380 struct mbuf *cm = NULL, *cmn;
1381 struct mbuf **cme = &cm;
1382
1383 do {
1384 if (flags & MSG_PEEK) {
1385 if (controlp != NULL) {
1386 *controlp = m_copy(m, 0, m->m_len);
1387 controlp = &(*controlp)->m_next;
1388 }
1389 m = m->m_next;
1390 } else {
1391 sbfree(&so->so_rcv, m);
1392 so->so_rcv.sb_mb = m->m_next;
1393 m->m_next = NULL;
1394 *cme = m;
1395 cme = &(*cme)->m_next;
1396 m = so->so_rcv.sb_mb;
1397 }
1398 } while (m != NULL && m->m_type == MT_CONTROL);
1399 if ((flags & MSG_PEEK) == 0)
1400 sockbuf_pushsync(&so->so_rcv, nextrecord);
1401 while (cm != NULL) {
1402 cmn = cm->m_next;
1403 cm->m_next = NULL;
1404 if (pr->pr_domain->dom_externalize != NULL) {
1405 SOCKBUF_UNLOCK(&so->so_rcv);
1406 error = (*pr->pr_domain->dom_externalize)
1407 (cm, controlp);
1408 SOCKBUF_LOCK(&so->so_rcv);
1409 } else if (controlp != NULL)
1410 *controlp = cm;
1411 else
1412 m_freem(cm);
1413 if (controlp != NULL) {
1414 orig_resid = 0;
1415 while (*controlp != NULL)
1416 controlp = &(*controlp)->m_next;
1417 }
1418 cm = cmn;
1419 }
1420 if (so->so_rcv.sb_mb)
1421 nextrecord = so->so_rcv.sb_mb->m_nextpkt;
1422 else
1423 nextrecord = NULL;
1424 orig_resid = 0;
1425 }
1426 if (m != NULL) {
1427 if ((flags & MSG_PEEK) == 0) {
1428 KASSERT(m->m_nextpkt == nextrecord,
1429 ("soreceive: post-control, nextrecord !sync"));
1430 if (nextrecord == NULL) {
1431 KASSERT(so->so_rcv.sb_mb == m,
1432 ("soreceive: post-control, sb_mb!=m"));
1433 KASSERT(so->so_rcv.sb_lastrecord == m,
1434 ("soreceive: post-control, lastrecord!=m"));
1435 }
1436 }
1437 type = m->m_type;
1438 if (type == MT_OOBDATA)
1439 flags |= MSG_OOB;
1440 } else {
1441 if ((flags & MSG_PEEK) == 0) {
1442 KASSERT(so->so_rcv.sb_mb == nextrecord,
1443 ("soreceive: sb_mb != nextrecord"));
1444 if (so->so_rcv.sb_mb == NULL) {
1445 KASSERT(so->so_rcv.sb_lastrecord == NULL,
1446				    ("soreceive: sb_lastrecord != NULL"));
1447 }
1448 }
1449 }
1450 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1451 SBLASTRECORDCHK(&so->so_rcv);
1452 SBLASTMBUFCHK(&so->so_rcv);
1453
1454 /*
1455 * Now continue to read any data mbufs off of the head of the socket
1456 * buffer until the read request is satisfied. Note that 'type' is
1457 * used to store the type of any mbuf reads that have happened so far
1458 * such that soreceive() can stop reading if the type changes, which
1459 * causes soreceive() to return only one of regular data and inline
1460 * out-of-band data in a single socket receive operation.
1461 */
1462 moff = 0;
1463 offset = 0;
1464 while (m != NULL && uio->uio_resid > 0 && error == 0) {
1465 /*
1466 * If the type of mbuf has changed since the last mbuf
1467 * examined ('type'), end the receive operation.
1468 */
1469 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1470 if (m->m_type == MT_OOBDATA) {
1471 if (type != MT_OOBDATA)
1472 break;
1473 } else if (type == MT_OOBDATA)
1474 break;
1475 else
1476 KASSERT(m->m_type == MT_DATA,
1477 ("m->m_type == %d", m->m_type));
1478 so->so_rcv.sb_state &= ~SBS_RCVATMARK;
1479 len = uio->uio_resid;
1480 if (so->so_oobmark && len > so->so_oobmark - offset)
1481 len = so->so_oobmark - offset;
1482 if (len > m->m_len - moff)
1483 len = m->m_len - moff;
1484 /*
1485 * If mp is set, just pass back the mbufs.
1486 * Otherwise copy them out via the uio, then free.
1487 * Sockbuf must be consistent here (points to current mbuf,
1488		 * it points to next record) when we drop the lock;
1489		 * we must note any additions to the sockbuf when we
1490		 * reacquire it.
1491 */
1492 if (mp == NULL) {
1493 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1494 SBLASTRECORDCHK(&so->so_rcv);
1495 SBLASTMBUFCHK(&so->so_rcv);
1496 SOCKBUF_UNLOCK(&so->so_rcv);
1497#ifdef ZERO_COPY_SOCKETS
1498 if (so_zero_copy_receive) {
1499 int disposable;
1500
1501 if ((m->m_flags & M_EXT)
1502 && (m->m_ext.ext_type == EXT_DISPOSABLE))
1503 disposable = 1;
1504 else
1505 disposable = 0;
1506
1507 error = uiomoveco(mtod(m, char *) + moff,
1508 (int)len, uio,
1509 disposable);
1510 } else
1511#endif /* ZERO_COPY_SOCKETS */
1512 error = uiomove(mtod(m, char *) + moff, (int)len, uio);
1513 SOCKBUF_LOCK(&so->so_rcv);
1514 if (error)
1515 goto release;
1516 } else
1517 uio->uio_resid -= len;
1518 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1519 if (len == m->m_len - moff) {
1520 if (m->m_flags & M_EOR)
1521 flags |= MSG_EOR;
1522 if (flags & MSG_PEEK) {
1523 m = m->m_next;
1524 moff = 0;
1525 } else {
1526 nextrecord = m->m_nextpkt;
1527 sbfree(&so->so_rcv, m);
1528 if (mp != NULL) {
1529 *mp = m;
1530 mp = &m->m_next;
1531 so->so_rcv.sb_mb = m = m->m_next;
1532 *mp = NULL;
1533 } else {
1534 so->so_rcv.sb_mb = m_free(m);
1535 m = so->so_rcv.sb_mb;
1536 }
1537 sockbuf_pushsync(&so->so_rcv, nextrecord);
1538 SBLASTRECORDCHK(&so->so_rcv);
1539 SBLASTMBUFCHK(&so->so_rcv);
1540 }
1541 } else {
1542 if (flags & MSG_PEEK)
1543 moff += len;
1544 else {
1545 if (mp != NULL) {
1546 int copy_flag;
1547
1548 if (flags & MSG_DONTWAIT)
1549 copy_flag = M_DONTWAIT;
1550 else
1551 copy_flag = M_TRYWAIT;
1552 if (copy_flag == M_TRYWAIT)
1553 SOCKBUF_UNLOCK(&so->so_rcv);
1554 *mp = m_copym(m, 0, len, copy_flag);
1555 if (copy_flag == M_TRYWAIT)
1556 SOCKBUF_LOCK(&so->so_rcv);
1557 if (*mp == NULL) {
1558 /*
1559 * m_copym() couldn't allocate an mbuf.
1560 * Adjust uio_resid back (it was adjusted
1561 * down by len bytes, which we didn't end
1562 * up "copying" over).
1563 */
1564 uio->uio_resid += len;
1565 break;
1566 }
1567 }
1568 m->m_data += len;
1569 m->m_len -= len;
1570 so->so_rcv.sb_cc -= len;
1571 }
1572 }
1573 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1574 if (so->so_oobmark) {
1575 if ((flags & MSG_PEEK) == 0) {
1576 so->so_oobmark -= len;
1577 if (so->so_oobmark == 0) {
1578 so->so_rcv.sb_state |= SBS_RCVATMARK;
1579 break;
1580 }
1581 } else {
1582 offset += len;
1583 if (offset == so->so_oobmark)
1584 break;
1585 }
1586 }
1587 if (flags & MSG_EOR)
1588 break;
1589 /*
1590 * If the MSG_WAITALL flag is set (for non-atomic socket),
1591 * we must not quit until "uio->uio_resid == 0" or an error
1592 * termination. If a signal/timeout occurs, return
1593 * with a short count but without error.
1594 * Keep sockbuf locked against other readers.
1595 */
1596 while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
1597 !sosendallatonce(so) && nextrecord == NULL) {
1598 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1599 if (so->so_error || so->so_rcv.sb_state & SBS_CANTRCVMORE)
1600 break;
1601 /*
1602 * Notify the protocol that some data has been
1603 * drained before blocking.
1604 */
1605 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb != NULL) {
1606 SOCKBUF_UNLOCK(&so->so_rcv);
1607 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
1608 SOCKBUF_LOCK(&so->so_rcv);
1609 }
1610 SBLASTRECORDCHK(&so->so_rcv);
1611 SBLASTMBUFCHK(&so->so_rcv);
1612 error = sbwait(&so->so_rcv);
1613 if (error)
1614 goto release;
1615 m = so->so_rcv.sb_mb;
1616 if (m != NULL)
1617 nextrecord = m->m_nextpkt;
1618 }
1619 }
1620
1621 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1622 if (m != NULL && pr->pr_flags & PR_ATOMIC) {
1623 flags |= MSG_TRUNC;
1624 if ((flags & MSG_PEEK) == 0)
1625 (void) sbdroprecord_locked(&so->so_rcv);
1626 }
1627 if ((flags & MSG_PEEK) == 0) {
1628 if (m == NULL) {
1629 /*
1630 * First part is an inline SB_EMPTY_FIXUP(). Second
1631 * part makes sure sb_lastrecord is up-to-date if
1632 * there is still data in the socket buffer.
1633 */
1634 so->so_rcv.sb_mb = nextrecord;
1635 if (so->so_rcv.sb_mb == NULL) {
1636 so->so_rcv.sb_mbtail = NULL;
1637 so->so_rcv.sb_lastrecord = NULL;
1638 } else if (nextrecord->m_nextpkt == NULL)
1639 so->so_rcv.sb_lastrecord = nextrecord;
1640 }
1641 SBLASTRECORDCHK(&so->so_rcv);
1642 SBLASTMBUFCHK(&so->so_rcv);
1643 /*
1644		 * If soreceive() is being done from the socket callback, then we
1645		 * don't need to generate an ACK to the peer to update the window,
1646		 * since an ACK will be generated on return to TCP.
1647 */
1648 if (!(flags & MSG_SOCALLBCK) &&
1649 (pr->pr_flags & PR_WANTRCVD) && so->so_pcb) {
1650 SOCKBUF_UNLOCK(&so->so_rcv);
1651 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
1652 SOCKBUF_LOCK(&so->so_rcv);
1653 }
1654 }
1655 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1656 if (orig_resid == uio->uio_resid && orig_resid &&
1657 (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) {
1658 sbunlock(&so->so_rcv);
1659 goto restart;
1660 }
1661
1662 if (flagsp != NULL)
1663 *flagsp |= flags;
1664release:
1665 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1666 sbunlock(&so->so_rcv);
1667out:
1668 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1669 SOCKBUF_UNLOCK(&so->so_rcv);
1670 return (error);
1671}
1672
1673int
1674soshutdown(so, how)
1675 struct socket *so;
1676 int how;
1677{
1678 struct protosw *pr = so->so_proto;
1679
1680 if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
1681 return (EINVAL);
1682
1683 if (how != SHUT_WR)
1684 sorflush(so);
1685 if (how != SHUT_RD)
1686 return ((*pr->pr_usrreqs->pru_shutdown)(so));
1687 return (0);
1688}
1689
1690void
1691sorflush(so)
1692 struct socket *so;
1693{
1694 struct sockbuf *sb = &so->so_rcv;
1695 struct protosw *pr = so->so_proto;
1696 struct sockbuf asb;
1697
1698 /*
1699 * XXXRW: This is quite ugly. Previously, this code made a copy of
1700 * the socket buffer, then zero'd the original to clear the buffer
1701 * fields. However, with mutexes in the socket buffer, this causes
1702 * problems. We only clear the zeroable bits of the original;
1703 * however, we have to initialize and destroy the mutex in the copy
1704	 * so that dom_dispose() and sbrelease() can lock it as needed.
1705 */
1706 SOCKBUF_LOCK(sb);
1707 sb->sb_flags |= SB_NOINTR;
1708 (void) sblock(sb, M_WAITOK);
1709 /*
1710 * socantrcvmore_locked() drops the socket buffer mutex so that it
1711 * can safely perform wakeups. Re-acquire the mutex before
1712 * continuing.
1713 */
1714 socantrcvmore_locked(so);
1715 SOCKBUF_LOCK(sb);
1716 sbunlock(sb);
1717 /*
1718 * Invalidate/clear most of the sockbuf structure, but leave
1719 * selinfo and mutex data unchanged.
1720 */
1721 bzero(&asb, offsetof(struct sockbuf, sb_startzero));
1722 bcopy(&sb->sb_startzero, &asb.sb_startzero,
1723 sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
1724 bzero(&sb->sb_startzero,
1725 sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
1726 SOCKBUF_UNLOCK(sb);
1727
1728 SOCKBUF_LOCK_INIT(&asb, "so_rcv");
1729 if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
1730 (*pr->pr_domain->dom_dispose)(asb.sb_mb);
1731 sbrelease(&asb, so);
1732 SOCKBUF_LOCK_DESTROY(&asb);
1733}
1734
1735/*
1736 * Perhaps this routine, and sooptcopyout(), below, ought to come in
1737 * an additional variant to handle the case where the option value needs
1738 * to be some kind of integer, but not a specific size.
1739 * In addition to their use here, these functions are also called by the
1740 * protocol-level pr_ctloutput() routines.
1741 */
1742int
1743sooptcopyin(sopt, buf, len, minlen)
1744 struct sockopt *sopt;
1745 void *buf;
1746 size_t len;
1747 size_t minlen;
1748{
1749 size_t valsize;
1750
1751 /*
1752 * If the user gives us more than we wanted, we ignore it,
1753 * but if we don't get the minimum length the caller
1754 * wants, we return EINVAL. On success, sopt->sopt_valsize
1755 * is set to however much we actually retrieved.
1756 */
1757 if ((valsize = sopt->sopt_valsize) < minlen)
1758 return EINVAL;
1759 if (valsize > len)
1760 sopt->sopt_valsize = valsize = len;
1761
1762 if (sopt->sopt_td != NULL)
1763 return (copyin(sopt->sopt_val, buf, valsize));
1764
1765 bcopy(sopt->sopt_val, buf, valsize);
1766 return (0);
1767}
1768
1769/*
1770 * Kernel version of setsockopt(2).
1771 * XXX: optlen is size_t, not socklen_t
1772 */
1773int
1774so_setsockopt(struct socket *so, int level, int optname, void *optval,
1775 size_t optlen)
1776{
1777 struct sockopt sopt;
1778
1779 sopt.sopt_level = level;
1780 sopt.sopt_name = optname;
1781 sopt.sopt_dir = SOPT_SET;
1782 sopt.sopt_val = optval;
1783 sopt.sopt_valsize = optlen;
1784 sopt.sopt_td = NULL;
1785 return (sosetopt(so, &sopt));
1786}

int
sosetopt(struct socket *so, struct sockopt *sopt)
{
	int error, optval;
	struct linger l;
	struct timeval tv;
	u_long val;
#ifdef MAC
	struct mac extmac;
#endif

	error = 0;
	if (sopt->sopt_level != SOL_SOCKET) {
		if (so->so_proto && so->so_proto->pr_ctloutput)
			return ((*so->so_proto->pr_ctloutput)(so, sopt));
		error = ENOPROTOOPT;
	} else {
		switch (sopt->sopt_name) {
#ifdef INET
		case SO_ACCEPTFILTER:
			error = do_setopt_accept_filter(so, sopt);
			if (error)
				goto bad;
			break;
#endif
		case SO_LINGER:
			error = sooptcopyin(sopt, &l, sizeof l, sizeof l);
			if (error)
				goto bad;

			SOCK_LOCK(so);
			so->so_linger = l.l_linger;
			if (l.l_onoff)
				so->so_options |= SO_LINGER;
			else
				so->so_options &= ~SO_LINGER;
			SOCK_UNLOCK(so);
			break;

		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_DONTROUTE:
		case SO_USELOOPBACK:
		case SO_BROADCAST:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
		case SO_BINTIME:
		case SO_NOSIGPIPE:
			error = sooptcopyin(sopt, &optval, sizeof optval,
			    sizeof optval);
			if (error)
				goto bad;
			SOCK_LOCK(so);
			if (optval)
				so->so_options |= sopt->sopt_name;
			else
				so->so_options &= ~sopt->sopt_name;
			SOCK_UNLOCK(so);
			break;

		case SO_SNDBUF:
		case SO_RCVBUF:
		case SO_SNDLOWAT:
		case SO_RCVLOWAT:
			error = sooptcopyin(sopt, &optval, sizeof optval,
			    sizeof optval);
			if (error)
				goto bad;

			/*
			 * Values < 1 make no sense for any of these
			 * options, so disallow them.
			 */
			if (optval < 1) {
				error = EINVAL;
				goto bad;
			}

			switch (sopt->sopt_name) {
			case SO_SNDBUF:
			case SO_RCVBUF:
				if (sbreserve(sopt->sopt_name == SO_SNDBUF ?
				    &so->so_snd : &so->so_rcv, (u_long)optval,
				    so, curthread) == 0) {
					error = ENOBUFS;
					goto bad;
				}
				break;

			/*
			 * Make sure the low-water mark is never greater
			 * than the high-water mark.
			 */
			case SO_SNDLOWAT:
				SOCKBUF_LOCK(&so->so_snd);
				so->so_snd.sb_lowat =
				    (optval > so->so_snd.sb_hiwat) ?
				    so->so_snd.sb_hiwat : optval;
				SOCKBUF_UNLOCK(&so->so_snd);
				break;
			case SO_RCVLOWAT:
				SOCKBUF_LOCK(&so->so_rcv);
				so->so_rcv.sb_lowat =
				    (optval > so->so_rcv.sb_hiwat) ?
				    so->so_rcv.sb_hiwat : optval;
				SOCKBUF_UNLOCK(&so->so_rcv);
				break;
			}
			break;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
#ifdef COMPAT_IA32
			if (curthread->td_proc->p_sysent == &ia32_freebsd_sysvec) {
				struct timeval32 tv32;

				error = sooptcopyin(sopt, &tv32, sizeof tv32,
				    sizeof tv32);
				CP(tv32, tv, tv_sec);
				CP(tv32, tv, tv_usec);
			} else
#endif
			error = sooptcopyin(sopt, &tv, sizeof tv,
			    sizeof tv);
			if (error)
				goto bad;

			/* assert(hz > 0); */
			if (tv.tv_sec < 0 || tv.tv_sec > INT_MAX / hz ||
			    tv.tv_usec < 0 || tv.tv_usec >= 1000000) {
				error = EDOM;
				goto bad;
			}
			/* assert(tick > 0); */
			/* assert(ULONG_MAX - INT_MAX >= 1000000); */
			val = (u_long)(tv.tv_sec * hz) + tv.tv_usec / tick;
			if (val > INT_MAX) {
				error = EDOM;
				goto bad;
			}
			if (val == 0 && tv.tv_usec != 0)
				val = 1;

			switch (sopt->sopt_name) {
			case SO_SNDTIMEO:
				so->so_snd.sb_timeo = val;
				break;
			case SO_RCVTIMEO:
				so->so_rcv.sb_timeo = val;
				break;
			}
			break;

		case SO_LABEL:
#ifdef MAC
			error = sooptcopyin(sopt, &extmac, sizeof extmac,
			    sizeof extmac);
			if (error)
				goto bad;
			error = mac_setsockopt_label(sopt->sopt_td->td_ucred,
			    so, &extmac);
#else
			error = EOPNOTSUPP;
#endif
			break;

		default:
			error = ENOPROTOOPT;
			break;
		}
		if (error == 0 && so->so_proto != NULL &&
		    so->so_proto->pr_ctloutput != NULL) {
			(void)((*so->so_proto->pr_ctloutput)(so, sopt));
		}
	}
bad:
	return (error);
}
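
/*
 * Illustrative restatement (compiled out) of the SO_SNDTIMEO/SO_RCVTIMEO
 * conversion above: turn a struct timeval into scheduler ticks, rejecting
 * values whose tick count would not fit in an int.  example_tv_to_ticks()
 * does not exist in the kernel; it assumes hz and tick are positive, as
 * the asserts above note.
 */
#if 0
static int
example_tv_to_ticks(const struct timeval *tv, int *ticksp)
{
	u_long val;

	if (tv->tv_sec < 0 || tv->tv_sec > INT_MAX / hz ||
	    tv->tv_usec < 0 || tv->tv_usec >= 1000000)
		return (EDOM);
	val = (u_long)(tv->tv_sec * hz) + tv->tv_usec / tick;
	if (val > INT_MAX)
		return (EDOM);
	/* Round a nonzero sub-tick timeout up to one tick. */
	if (val == 0 && tv->tv_usec != 0)
		val = 1;
	*ticksp = (int)val;
	return (0);
}
#endif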

/* Helper routine for getsockopt. */
int
sooptcopyout(struct sockopt *sopt, const void *buf, size_t len)
{
	int	error;
	size_t	valsize;

	error = 0;

	/*
	 * Documented get behavior is that we always return a value,
	 * possibly truncated to fit in the user's buffer.
	 * Traditional behavior is that we always tell the user
	 * precisely how much we copied, rather than something useful
	 * like the total amount we had available for her.
	 * Note that this interface is not idempotent; the entire answer
	 * must be generated ahead of time.
	 */
	valsize = min(len, sopt->sopt_valsize);
	sopt->sopt_valsize = valsize;
	if (sopt->sopt_val != NULL) {
		if (sopt->sopt_td != NULL)
			error = copyout(buf, sopt->sopt_val, valsize);
		else
			bcopy(buf, sopt->sopt_val, valsize);
	}
	return (error);
}
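
/*
 * Illustrative sketch (compiled out): the GET side of the hypothetical
 * pr_ctloutput() handler shown after sooptcopyin() above.  EX_OPTION,
 * sotoexpcb() and ex_value remain invented names.
 */
#if 0
static int
ex_ctloutput_get(struct socket *so, struct sockopt *sopt)
{
	int optval;

	switch (sopt->sopt_name) {
	case EX_OPTION:
		optval = sotoexpcb(so)->ex_value;
		/* Truncates to the user's buffer if it is too small. */
		return (sooptcopyout(sopt, &optval, sizeof optval));
	default:
		return (ENOPROTOOPT);
	}
}
#endif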

int
sogetopt(struct socket *so, struct sockopt *sopt)
{
	int error, optval;
	struct linger l;
	struct timeval tv;
#ifdef MAC
	struct mac extmac;
#endif

	error = 0;
	if (sopt->sopt_level != SOL_SOCKET) {
		if (so->so_proto && so->so_proto->pr_ctloutput) {
			return ((*so->so_proto->pr_ctloutput)(so, sopt));
		} else
			return (ENOPROTOOPT);
	} else {
		switch (sopt->sopt_name) {
#ifdef INET
		case SO_ACCEPTFILTER:
			error = do_getopt_accept_filter(so, sopt);
			break;
#endif
		case SO_LINGER:
			SOCK_LOCK(so);
			l.l_onoff = so->so_options & SO_LINGER;
			l.l_linger = so->so_linger;
			SOCK_UNLOCK(so);
			error = sooptcopyout(sopt, &l, sizeof l);
			break;

		case SO_USELOOPBACK:
		case SO_DONTROUTE:
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_BROADCAST:
		case SO_OOBINLINE:
		case SO_ACCEPTCONN:
		case SO_TIMESTAMP:
		case SO_BINTIME:
		case SO_NOSIGPIPE:
			optval = so->so_options & sopt->sopt_name;
integer:
			error = sooptcopyout(sopt, &optval, sizeof optval);
			break;

		case SO_TYPE:
			optval = so->so_type;
			goto integer;

		case SO_ERROR:
			optval = so->so_error;
			so->so_error = 0;
			goto integer;

		case SO_SNDBUF:
			optval = so->so_snd.sb_hiwat;
			goto integer;

		case SO_RCVBUF:
			optval = so->so_rcv.sb_hiwat;
			goto integer;

		case SO_SNDLOWAT:
			optval = so->so_snd.sb_lowat;
			goto integer;

		case SO_RCVLOWAT:
			optval = so->so_rcv.sb_lowat;
			goto integer;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
			optval = (sopt->sopt_name == SO_SNDTIMEO ?
			    so->so_snd.sb_timeo : so->so_rcv.sb_timeo);

			tv.tv_sec = optval / hz;
			tv.tv_usec = (optval % hz) * tick;
#ifdef COMPAT_IA32
			if (curthread->td_proc->p_sysent == &ia32_freebsd_sysvec) {
				struct timeval32 tv32;

				CP(tv, tv32, tv_sec);
				CP(tv, tv32, tv_usec);
				error = sooptcopyout(sopt, &tv32, sizeof tv32);
			} else
#endif
			error = sooptcopyout(sopt, &tv, sizeof tv);
			break;

		case SO_LABEL:
#ifdef MAC
			error = sooptcopyin(sopt, &extmac, sizeof(extmac),
			    sizeof(extmac));
			if (error)
				return (error);
			error = mac_getsockopt_label(sopt->sopt_td->td_ucred,
			    so, &extmac);
			if (error)
				return (error);
			error = sooptcopyout(sopt, &extmac, sizeof extmac);
#else
			error = EOPNOTSUPP;
#endif
			break;

		case SO_PEERLABEL:
#ifdef MAC
			error = sooptcopyin(sopt, &extmac, sizeof(extmac),
			    sizeof(extmac));
			if (error)
				return (error);
			error = mac_getsockopt_peerlabel(
			    sopt->sopt_td->td_ucred, so, &extmac);
			if (error)
				return (error);
			error = sooptcopyout(sopt, &extmac, sizeof extmac);
#else
			error = EOPNOTSUPP;
#endif
			break;

		case SO_LISTENQLIMIT:
			optval = so->so_qlimit;
			goto integer;

		case SO_LISTENQLEN:
			optval = so->so_qlen;
			goto integer;

		case SO_LISTENINCQLEN:
			optval = so->so_incqlen;
			goto integer;

		default:
			error = ENOPROTOOPT;
			break;
		}
		return (error);
	}
}
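
/*
 * Sketch (compiled out): a kernel-side counterpart to so_setsockopt()
 * above.  No such wrapper exists in this file; it merely shows how to
 * drive sogetopt() with a kernel (sopt_td == NULL) sockopt.
 */
#if 0
static int
example_so_getsockopt(struct socket *so, int level, int optname,
    void *optval, size_t *optlen)
{
	struct sockopt sopt;
	int error;

	sopt.sopt_level = level;
	sopt.sopt_name = optname;
	sopt.sopt_dir = SOPT_GET;
	sopt.sopt_val = optval;
	sopt.sopt_valsize = *optlen;
	sopt.sopt_td = NULL;	/* kernel buffer: bcopy, not copyout */
	error = sogetopt(so, &sopt);
	if (error == 0)
		*optlen = sopt.sopt_valsize;
	return (error);
}
#endif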

/* XXX: prepare an mbuf chain for (__FreeBSD__ < 3) style routines. */
int
soopt_getm(struct sockopt *sopt, struct mbuf **mp)
{
	struct mbuf *m, *m_prev;
	int sopt_size = sopt->sopt_valsize;

	MGET(m, sopt->sopt_td != NULL ? M_TRYWAIT : M_DONTWAIT, MT_DATA);
	if (m == NULL)
		return (ENOBUFS);
	if (sopt_size > MLEN) {
		MCLGET(m, sopt->sopt_td != NULL ? M_TRYWAIT : M_DONTWAIT);
		if ((m->m_flags & M_EXT) == 0) {
			m_free(m);
			return (ENOBUFS);
		}
		m->m_len = min(MCLBYTES, sopt_size);
	} else {
		m->m_len = min(MLEN, sopt_size);
	}
	sopt_size -= m->m_len;
	*mp = m;
	m_prev = m;

	while (sopt_size) {
		MGET(m, sopt->sopt_td != NULL ? M_TRYWAIT : M_DONTWAIT,
		    MT_DATA);
		if (m == NULL) {
			m_freem(*mp);
			return (ENOBUFS);
		}
		if (sopt_size > MLEN) {
			MCLGET(m, sopt->sopt_td != NULL ? M_TRYWAIT :
			    M_DONTWAIT);
			if ((m->m_flags & M_EXT) == 0) {
				m_freem(m);
				m_freem(*mp);
				return (ENOBUFS);
			}
			m->m_len = min(MCLBYTES, sopt_size);
		} else {
			m->m_len = min(MLEN, sopt_size);
		}
		sopt_size -= m->m_len;
		m_prev->m_next = m;
		m_prev = m;
	}
	return (0);
}
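
/*
 * Usage sketch (compiled out): a (__FreeBSD__ < 3) style consumer would
 * pair soopt_getm() with soopt_mcopyin() below to land option data in an
 * mbuf chain.  example_sopt_to_mbuf() is an invented name.
 */
#if 0
static int
example_sopt_to_mbuf(struct sockopt *sopt, struct mbuf **mp)
{
	int error;

	error = soopt_getm(sopt, mp);		/* size chain to the value */
	if (error != 0)
		return (error);
	return (soopt_mcopyin(sopt, *mp));	/* frees the chain on error */
}
#endif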

/* XXX: copyin sopt data into an mbuf chain for (__FreeBSD__ < 3) routines. */
int
soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
{
	struct mbuf *m0 = m;

	if (sopt->sopt_val == NULL)
		return (0);
	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
		if (sopt->sopt_td != NULL) {
			int error;

			error = copyin(sopt->sopt_val, mtod(m, char *),
			    m->m_len);
			if (error != 0) {
				m_freem(m0);
				return (error);
			}
		} else
			bcopy(sopt->sopt_val, mtod(m, char *), m->m_len);
		sopt->sopt_valsize -= m->m_len;
		sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
		m = m->m_next;
	}
	if (m != NULL)	/* soopt_getm() should have allocated enough mbufs */
		panic("ip6_sooptmcopyin");
	return (0);
}

/* XXX: copyout mbuf chain data into soopt for (__FreeBSD__ < 3) routines. */
int
soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
{
	struct mbuf *m0 = m;
	size_t valsize = 0;

	if (sopt->sopt_val == NULL)
		return (0);
	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
		if (sopt->sopt_td != NULL) {
			int error;

			error = copyout(mtod(m, char *), sopt->sopt_val,
			    m->m_len);
			if (error != 0) {
				m_freem(m0);
				return (error);
			}
		} else
			bcopy(mtod(m, char *), sopt->sopt_val, m->m_len);
		sopt->sopt_valsize -= m->m_len;
		sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
		valsize += m->m_len;
		m = m->m_next;
	}
	if (m != NULL) {
		/* The user-supplied buffer was not large enough. */
		m_freem(m0);
		return (EINVAL);
	}
	sopt->sopt_valsize = valsize;
	return (0);
}

void
sohasoutofband(struct socket *so)
{

	if (so->so_sigio != NULL)
		pgsigio(&so->so_sigio, SIGURG, 0);
	selwakeuppri(&so->so_rcv.sb_sel, PSOCK);
}
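
/*
 * Userland-side illustration (compiled out; not kernel code): for the
 * SIGURG sent by sohasoutofband() to arrive, a process must claim
 * ownership of the socket with fcntl(F_SETOWN).  Error handling elided.
 */
#if 0
#include <sys/types.h>
#include <sys/socket.h>
#include <fcntl.h>
#include <signal.h>
#include <unistd.h>

static void
on_urg(int sig)
{
	/* Out-of-band data pending; fetch it with recv(..., MSG_OOB). */
}

static void
example_claim_sigurg(int s)
{
	signal(SIGURG, on_urg);
	fcntl(s, F_SETOWN, getpid());
}
#endif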

int
sopoll(struct socket *so, int events, struct ucred *active_cred,
    struct thread *td)
{
	int revents = 0;

	SOCKBUF_LOCK(&so->so_snd);
	SOCKBUF_LOCK(&so->so_rcv);
	if (events & (POLLIN | POLLRDNORM))
		if (soreadable(so))
			revents |= events & (POLLIN | POLLRDNORM);

	if (events & POLLINIGNEOF)
		if (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat ||
		    !TAILQ_EMPTY(&so->so_comp) || so->so_error)
			revents |= POLLINIGNEOF;

	if (events & (POLLOUT | POLLWRNORM))
		if (sowriteable(so))
			revents |= events & (POLLOUT | POLLWRNORM);

	if (events & (POLLPRI | POLLRDBAND))
		if (so->so_oobmark || (so->so_rcv.sb_state & SBS_RCVATMARK))
			revents |= events & (POLLPRI | POLLRDBAND);

	if (revents == 0) {
		if (events &
		    (POLLIN | POLLINIGNEOF | POLLPRI | POLLRDNORM |
		     POLLRDBAND)) {
			selrecord(td, &so->so_rcv.sb_sel);
			so->so_rcv.sb_flags |= SB_SEL;
		}

		if (events & (POLLOUT | POLLWRNORM)) {
			selrecord(td, &so->so_snd.sb_sel);
			so->so_snd.sb_flags |= SB_SEL;
		}
	}

	SOCKBUF_UNLOCK(&so->so_rcv);
	SOCKBUF_UNLOCK(&so->so_snd);
	return (revents);
}
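
/*
 * Userland-side illustration (compiled out): the POLLPRI | POLLRDBAND
 * case above is what lets poll(2) report pending out-of-band data on a
 * connected socket s.
 */
#if 0
#include <poll.h>

static int
example_wait_for_oob(int s, int timeout_ms)
{
	struct pollfd pfd;

	pfd.fd = s;
	pfd.events = POLLPRI | POLLRDBAND;
	return (poll(&pfd, 1, timeout_ms));	/* > 0: OOB data or error */
}
#endif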

int
soo_kqfilter(struct file *fp, struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;
	struct sockbuf *sb;

	switch (kn->kn_filter) {
	case EVFILT_READ:
		if (so->so_options & SO_ACCEPTCONN)
			kn->kn_fop = &solisten_filtops;
		else
			kn->kn_fop = &soread_filtops;
		sb = &so->so_rcv;
		break;
	case EVFILT_WRITE:
		kn->kn_fop = &sowrite_filtops;
		sb = &so->so_snd;
		break;
	default:
		return (EINVAL);
	}

	SOCKBUF_LOCK(sb);
	knlist_add(&sb->sb_sel.si_note, kn, 1);
	sb->sb_flags |= SB_KNOTE;
	SOCKBUF_UNLOCK(sb);
	return (0);
}

static void
filt_sordetach(struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;

	SOCKBUF_LOCK(&so->so_rcv);
	knlist_remove(&so->so_rcv.sb_sel.si_note, kn, 1);
	if (knlist_empty(&so->so_rcv.sb_sel.si_note))
		so->so_rcv.sb_flags &= ~SB_KNOTE;
	SOCKBUF_UNLOCK(&so->so_rcv);
}

/*ARGSUSED*/
static int
filt_soread(struct knote *kn, long hint)
{
	struct socket *so;

	so = kn->kn_fp->f_data;
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);

	kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
	if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
		kn->kn_flags |= EV_EOF;
		kn->kn_fflags = so->so_error;
		return (1);
	} else if (so->so_error)	/* temporary udp error */
		return (1);
	else if (kn->kn_sfflags & NOTE_LOWAT)
		return (kn->kn_data >= kn->kn_sdata);
	else
		return (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat);
}
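
/*
 * Userland-side illustration (compiled out) of the NOTE_LOWAT branch
 * above: ask kqueue to fire the read filter only once at least 128
 * bytes are buffered.  Error handling elided.
 */
#if 0
#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>

static int
example_kqueue_lowat(int kq, int s)
{
	struct kevent kev;

	EV_SET(&kev, s, EVFILT_READ, EV_ADD, NOTE_LOWAT, 128, NULL);
	return (kevent(kq, &kev, 1, NULL, 0, NULL));
}
#endif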

static void
filt_sowdetach(struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;

	SOCKBUF_LOCK(&so->so_snd);
	knlist_remove(&so->so_snd.sb_sel.si_note, kn, 1);
	if (knlist_empty(&so->so_snd.sb_sel.si_note))
		so->so_snd.sb_flags &= ~SB_KNOTE;
	SOCKBUF_UNLOCK(&so->so_snd);
}

/*ARGSUSED*/
static int
filt_sowrite(struct knote *kn, long hint)
{
	struct socket *so;

	so = kn->kn_fp->f_data;
	SOCKBUF_LOCK_ASSERT(&so->so_snd);
	kn->kn_data = sbspace(&so->so_snd);
	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
		kn->kn_flags |= EV_EOF;
		kn->kn_fflags = so->so_error;
		return (1);
	} else if (so->so_error)	/* temporary udp error */
		return (1);
	else if (((so->so_state & SS_ISCONNECTED) == 0) &&
	    (so->so_proto->pr_flags & PR_CONNREQUIRED))
		return (0);
	else if (kn->kn_sfflags & NOTE_LOWAT)
		return (kn->kn_data >= kn->kn_sdata);
	else
		return (kn->kn_data >= so->so_snd.sb_lowat);
}

/*ARGSUSED*/
static int
filt_solisten(struct knote *kn, long hint)
{
	struct socket *so = kn->kn_fp->f_data;

	kn->kn_data = so->so_qlen;
	return (!TAILQ_EMPTY(&so->so_comp));
}

int
socheckuid(struct socket *so, uid_t uid)
{

	if (so == NULL)
		return (EPERM);
	if (so->so_cred->cr_uid != uid)
		return (EPERM);
	return (0);
}
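
/*
 * Usage sketch (compiled out), in the style of socheckuid()'s real
 * consumers (e.g. firewall uid-matching rules): test whether a socket
 * is owned by root.  example_socket_owned_by_root() is an invented name.
 */
#if 0
static int
example_socket_owned_by_root(struct socket *so)
{

	return (socheckuid(so, 0) == 0);
}
#endif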

static int
somaxconn_sysctl(SYSCTL_HANDLER_ARGS)
{
	int error;
	int val;

	val = somaxconn;
	error = sysctl_handle_int(oidp, &val, sizeof(int), req);
	if (error || !req->newptr)
		return (error);

	if (val < 1 || val > USHRT_MAX)
		return (EINVAL);

	somaxconn = val;
	return (0);
}
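
/*
 * Sketch (compiled out) of how a handler such as somaxconn_sysctl() is
 * typically registered; the kernel's actual declaration for this oid
 * may differ in name, parent and flags.
 */
#if 0
SYSCTL_PROC(_kern_ipc, OID_AUTO, somaxconn_example,
    CTLTYPE_INT | CTLFLAG_RW, 0, sizeof(int), somaxconn_sysctl, "I",
    "Maximum listen socket pending connection accept queue size");
#endif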