sdp_main.c: diff of r274421 against r275329
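The two revisions are identical except for a single line in sdp_append() (file line 892): r275329 appears to be part of the tree-wide sweep updating callers after the m_demote(9) KPI gained a third flags argument, which names extra m_flags bits to keep on the demoted mbufs; callers that want the old behavior pass 0. A minimal before/after sketch, assuming the stock sys/sys/mbuf.h declarations of each era:

	/* r274421 era: demote the chain starting at m0 (or at m0->m_next
	 * if all == 0), stripping packet headers and tags. */
	void	m_demote(struct mbuf *m0, int all);

	/* r275329 era: flags selects additional m_flags bits to preserve
	 * on the demoted mbufs beyond the few m_demote() always keeps;
	 * 0 keeps none extra. */
	void	m_demote(struct mbuf *m0, int all, int flags);

	/* Caller update, as in the hunk at file line 892 below: */
	m_demote(mb, 1);	/* old */
	m_demote(mb, 1, 0);	/* new */

The full r275329 listing follows, with the file's own line numbers; the changed line is shown in both versions at line 892.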
1
2/*-
3 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
4 * The Regents of the University of California. All rights reserved.
5 * Copyright (c) 2004 The FreeBSD Foundation. All rights reserved.
6 * Copyright (c) 2004-2008 Robert N. M. Watson. All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 * 4. Neither the name of the University nor the names of its contributors
17 * may be used to endorse or promote products derived from this software
18 * without specific prior written permission.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30 * SUCH DAMAGE.
31 *
32 * Excerpts taken from tcp_subr.c, tcp_usrreq.c, uipc_socket.c
33 */
34
35/*
36 *
37 * Copyright (c) 2010 Isilon Systems, Inc.
38 * Copyright (c) 2010 iX Systems, Inc.
39 * Copyright (c) 2010 Panasas, Inc.
40 * All rights reserved.
41 *
42 * Redistribution and use in source and binary forms, with or without
43 * modification, are permitted provided that the following conditions
44 * are met:
45 * 1. Redistributions of source code must retain the above copyright
46 * notice unmodified, this list of conditions, and the following
47 * disclaimer.
48 * 2. Redistributions in binary form must reproduce the above copyright
49 * notice, this list of conditions and the following disclaimer in the
50 * documentation and/or other materials provided with the distribution.
51 *
52 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
53 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
54 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
55 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
56 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
57 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
58 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
59 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
60 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
61 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
62 *
63 */
64#include <sys/cdefs.h>
65__FBSDID("$FreeBSD$");
66
67#include "sdp.h"
68
69#include <net/if.h>
70#include <net/route.h>
71#include <net/vnet.h>
72#include <sys/sysctl.h>
73
74uma_zone_t sdp_zone;
75struct rwlock sdp_lock;
76LIST_HEAD(, sdp_sock) sdp_list;
77
78struct workqueue_struct *rx_comp_wq;
79
80RW_SYSINIT(sdplockinit, &sdp_lock, "SDP lock");
81#define SDP_LIST_WLOCK() rw_wlock(&sdp_lock)
82#define SDP_LIST_RLOCK() rw_rlock(&sdp_lock)
83#define SDP_LIST_WUNLOCK() rw_wunlock(&sdp_lock)
84#define SDP_LIST_RUNLOCK() rw_runlock(&sdp_lock)
85#define SDP_LIST_WLOCK_ASSERT() rw_assert(&sdp_lock, RW_WLOCKED)
86#define SDP_LIST_RLOCK_ASSERT() rw_assert(&sdp_lock, RW_RLOCKED)
87#define SDP_LIST_LOCK_ASSERT() rw_assert(&sdp_lock, RW_LOCKED)
88
89static MALLOC_DEFINE(M_SDP, "sdp", "Socket Direct Protocol");
90
91static void sdp_stop_keepalive_timer(struct socket *so);
92
93/*
94 * SDP protocol interface to socket abstraction.
95 */
96/*
97 * sdp_sendspace and sdp_recvspace are the default send and receive window
98 * sizes, respectively.
99 */
100u_long sdp_sendspace = 1024*32;
101u_long sdp_recvspace = 1024*64;
102
103static int sdp_count;
104
105/*
106 * Disable async. CMA events for sockets which are being torn down.
107 */
108static void
109sdp_destroy_cma(struct sdp_sock *ssk)
110{
111
112 if (ssk->id == NULL)
113 return;
114 rdma_destroy_id(ssk->id);
115 ssk->id = NULL;
116}
117
118static int
119sdp_pcbbind(struct sdp_sock *ssk, struct sockaddr *nam, struct ucred *cred)
120{
121 struct sockaddr_in *sin;
122 struct sockaddr_in null;
123 int error;
124
125 SDP_WLOCK_ASSERT(ssk);
126
127 if (ssk->lport != 0 || ssk->laddr != INADDR_ANY)
128 return (EINVAL);
129 /* rdma_bind_addr handles bind races. */
130 SDP_WUNLOCK(ssk);
131 if (ssk->id == NULL)
132 ssk->id = rdma_create_id(sdp_cma_handler, ssk, RDMA_PS_SDP);
133 if (ssk->id == NULL) {
134 SDP_WLOCK(ssk);
135 return (ENOMEM);
136 }
137 if (nam == NULL) {
138 null.sin_family = AF_INET;
139 null.sin_len = sizeof(null);
140 null.sin_addr.s_addr = INADDR_ANY;
141 null.sin_port = 0;
142 bzero(&null.sin_zero, sizeof(null.sin_zero));
143 nam = (struct sockaddr *)&null;
144 }
145 error = -rdma_bind_addr(ssk->id, nam);
146 SDP_WLOCK(ssk);
147 if (error == 0) {
148 sin = (struct sockaddr_in *)&ssk->id->route.addr.src_addr;
149 ssk->laddr = sin->sin_addr.s_addr;
150 ssk->lport = sin->sin_port;
151 } else
152 sdp_destroy_cma(ssk);
153 return (error);
154}
155
156static void
157sdp_pcbfree(struct sdp_sock *ssk)
158{
159 KASSERT(ssk->socket == NULL, ("ssk %p socket still attached", ssk));
160
161 sdp_dbg(ssk->socket, "Freeing pcb");
162 SDP_WLOCK_ASSERT(ssk);
163 ssk->flags |= SDP_DESTROY;
164 SDP_WUNLOCK(ssk);
165 SDP_LIST_WLOCK();
166 sdp_count--;
167 LIST_REMOVE(ssk, list);
168 SDP_LIST_WUNLOCK();
169 crfree(ssk->cred);
170 sdp_destroy_cma(ssk);
171 ssk->qp_active = 0;
172 if (ssk->qp) {
173 ib_destroy_qp(ssk->qp);
174 ssk->qp = NULL;
175 }
176 sdp_tx_ring_destroy(ssk);
177 sdp_rx_ring_destroy(ssk);
178 rw_destroy(&ssk->rx_ring.destroyed_lock);
179 uma_zfree(sdp_zone, ssk);
180 rw_destroy(&ssk->lock);
181}
182
183/*
184 * Common routines to return a socket address.
185 */
186static struct sockaddr *
187sdp_sockaddr(in_port_t port, struct in_addr *addr_p)
188{
189 struct sockaddr_in *sin;
190
191 sin = malloc(sizeof *sin, M_SONAME,
192 M_WAITOK | M_ZERO);
193 sin->sin_family = AF_INET;
194 sin->sin_len = sizeof(*sin);
195 sin->sin_addr = *addr_p;
196 sin->sin_port = port;
197
198 return (struct sockaddr *)sin;
199}
200
201static int
202sdp_getsockaddr(struct socket *so, struct sockaddr **nam)
203{
204 struct sdp_sock *ssk;
205 struct in_addr addr;
206 in_port_t port;
207
208 ssk = sdp_sk(so);
209 SDP_RLOCK(ssk);
210 port = ssk->lport;
211 addr.s_addr = ssk->laddr;
212 SDP_RUNLOCK(ssk);
213
214 *nam = sdp_sockaddr(port, &addr);
215 return 0;
216}
217
218static int
219sdp_getpeeraddr(struct socket *so, struct sockaddr **nam)
220{
221 struct sdp_sock *ssk;
222 struct in_addr addr;
223 in_port_t port;
224
225 ssk = sdp_sk(so);
226 SDP_RLOCK(ssk);
227 port = ssk->fport;
228 addr.s_addr = ssk->faddr;
229 SDP_RUNLOCK(ssk);
230
231 *nam = sdp_sockaddr(port, &addr);
232 return 0;
233}
234
235static void
236sdp_pcbnotifyall(struct in_addr faddr, int errno,
237 struct sdp_sock *(*notify)(struct sdp_sock *, int))
238{
239 struct sdp_sock *ssk, *ssk_temp;
240
241 SDP_LIST_WLOCK();
242 LIST_FOREACH_SAFE(ssk, &sdp_list, list, ssk_temp) {
243 SDP_WLOCK(ssk);
244 if (ssk->faddr != faddr.s_addr || ssk->socket == NULL) {
245 SDP_WUNLOCK(ssk);
246 continue;
247 }
248 if ((ssk->flags & SDP_DESTROY) == 0)
249 if ((*notify)(ssk, errno))
250 SDP_WUNLOCK(ssk);
251 }
252 SDP_LIST_WUNLOCK();
253}
254
255#if 0
256static void
257sdp_apply_all(void (*func)(struct sdp_sock *, void *), void *arg)
258{
259 struct sdp_sock *ssk;
260
261 SDP_LIST_RLOCK();
262 LIST_FOREACH(ssk, &sdp_list, list) {
263 SDP_WLOCK(ssk);
264 func(ssk, arg);
265 SDP_WUNLOCK(ssk);
266 }
267 SDP_LIST_RUNLOCK();
268}
269#endif
270
271static void
272sdp_output_reset(struct sdp_sock *ssk)
273{
274 struct rdma_cm_id *id;
275
276 SDP_WLOCK_ASSERT(ssk);
277 if (ssk->id) {
278 id = ssk->id;
279 ssk->qp_active = 0;
280 SDP_WUNLOCK(ssk);
281 rdma_disconnect(id);
282 SDP_WLOCK(ssk);
283 }
284 ssk->state = TCPS_CLOSED;
285}
286
287/*
288 * Attempt to close an SDP socket, marking it as dropped, and freeing
289 * the socket if we hold the only reference.
290 */
291static struct sdp_sock *
292sdp_closed(struct sdp_sock *ssk)
293{
294 struct socket *so;
295
296 SDP_WLOCK_ASSERT(ssk);
297
298 ssk->flags |= SDP_DROPPED;
299 so = ssk->socket;
300 soisdisconnected(so);
301 if (ssk->flags & SDP_SOCKREF) {
302 KASSERT(so->so_state & SS_PROTOREF,
303 ("sdp_closed: !SS_PROTOREF"));
304 ssk->flags &= ~SDP_SOCKREF;
305 SDP_WUNLOCK(ssk);
306 ACCEPT_LOCK();
307 SOCK_LOCK(so);
308 so->so_state &= ~SS_PROTOREF;
309 sofree(so);
310 return (NULL);
311 }
312 return (ssk);
313}
314
315/*
316 * Perform timer-based shutdowns which cannot operate in
317 * callout context.
318 */
319static void
320sdp_shutdown_task(void *data, int pending)
321{
322 struct sdp_sock *ssk;
323
324 ssk = data;
325 SDP_WLOCK(ssk);
326 /*
327 * I don't think this can race with another call to pcbfree()
328 * because SDP_TIMEWAIT protects it. SDP_DESTROY may be redundant.
329 */
330 if (ssk->flags & SDP_DESTROY)
331 panic("sdp_shutdown_task: Racing with pcbfree for ssk %p",
332 ssk);
333 if (ssk->flags & SDP_DISCON)
334 sdp_output_reset(ssk);
335 /* We have to clear this so sdp_detach() will call pcbfree(). */
336 ssk->flags &= ~(SDP_TIMEWAIT | SDP_DREQWAIT);
337 if ((ssk->flags & SDP_DROPPED) == 0 &&
338 sdp_closed(ssk) == NULL)
339 return;
340 if (ssk->socket == NULL) {
341 sdp_pcbfree(ssk);
342 return;
343 }
344 SDP_WUNLOCK(ssk);
345}
346
347/*
348 * 2msl has expired, schedule the shutdown task.
349 */
350static void
351sdp_2msl_timeout(void *data)
352{
353 struct sdp_sock *ssk;
354
355 ssk = data;
356 /* Callout canceled. */
357 if (!callout_active(&ssk->keep2msl))
358 goto out;
359 callout_deactivate(&ssk->keep2msl);
360 /* Should be impossible, defensive programming. */
361 if ((ssk->flags & SDP_TIMEWAIT) == 0)
362 goto out;
363 taskqueue_enqueue(taskqueue_thread, &ssk->shutdown_task);
364out:
365 SDP_WUNLOCK(ssk);
366 return;
367}
368
369/*
370 * Schedule the 2msl wait timer.
371 */
372static void
373sdp_2msl_wait(struct sdp_sock *ssk)
374{
375
376 SDP_WLOCK_ASSERT(ssk);
377 ssk->flags |= SDP_TIMEWAIT;
378 ssk->state = TCPS_TIME_WAIT;
379 soisdisconnected(ssk->socket);
380 callout_reset(&ssk->keep2msl, TCPTV_MSL, sdp_2msl_timeout, ssk);
381}
382
383/*
384 * Timed out waiting for the final fin/ack from rdma_disconnect().
385 */
386static void
387sdp_dreq_timeout(void *data)
388{
389 struct sdp_sock *ssk;
390
391 ssk = data;
392 /* Callout canceled. */
393 if (!callout_active(&ssk->keep2msl))
394 goto out;
395 /* Callout rescheduled, probably as a different timer. */
396 if (callout_pending(&ssk->keep2msl))
397 goto out;
398 callout_deactivate(&ssk->keep2msl);
399 if (ssk->state != TCPS_FIN_WAIT_1 && ssk->state != TCPS_LAST_ACK)
400 goto out;
401 if ((ssk->flags & SDP_DREQWAIT) == 0)
402 goto out;
403 ssk->flags &= ~SDP_DREQWAIT;
404 ssk->flags |= SDP_DISCON;
405 sdp_2msl_wait(ssk);
406 ssk->qp_active = 0;
407out:
408 SDP_WUNLOCK(ssk);
409}
410
411/*
412 * Received the final fin/ack. Cancel the 2msl.
413 */
414void
415sdp_cancel_dreq_wait_timeout(struct sdp_sock *ssk)
416{
417 sdp_dbg(ssk->socket, "cancelling dreq wait timeout\n");
418 ssk->flags &= ~SDP_DREQWAIT;
419 sdp_2msl_wait(ssk);
420}
421
422static int
423sdp_init_sock(struct socket *sk)
424{
425 struct sdp_sock *ssk = sdp_sk(sk);
426
427 sdp_dbg(sk, "%s\n", __func__);
428
429 callout_init_rw(&ssk->keep2msl, &ssk->lock, CALLOUT_RETURNUNLOCKED);
430 TASK_INIT(&ssk->shutdown_task, 0, sdp_shutdown_task, ssk);
431#ifdef SDP_ZCOPY
432 INIT_DELAYED_WORK(&ssk->srcavail_cancel_work, srcavail_cancel_timeout);
433 ssk->zcopy_thresh = -1; /* use global sdp_zcopy_thresh */
434 ssk->tx_ring.rdma_inflight = NULL;
435#endif
436 atomic_set(&ssk->mseq_ack, 0);
437 sdp_rx_ring_init(ssk);
438 ssk->tx_ring.buffer = NULL;
439
440 return 0;
441}
442
443/*
444 * Allocate an sdp_sock for the socket and reserve socket buffer space.
445 */
446static int
447sdp_attach(struct socket *so, int proto, struct thread *td)
448{
449 struct sdp_sock *ssk;
450 int error;
451
452 ssk = sdp_sk(so);
453 KASSERT(ssk == NULL, ("sdp_attach: ssk already set on so %p", so));
454 if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
455 error = soreserve(so, sdp_sendspace, sdp_recvspace);
456 if (error)
457 return (error);
458 }
459 so->so_rcv.sb_flags |= SB_AUTOSIZE;
460 so->so_snd.sb_flags |= SB_AUTOSIZE;
461 ssk = uma_zalloc(sdp_zone, M_NOWAIT | M_ZERO);
462 if (ssk == NULL)
463 return (ENOBUFS);
464 rw_init(&ssk->lock, "sdpsock");
465 ssk->socket = so;
466 ssk->cred = crhold(so->so_cred);
467 so->so_pcb = (caddr_t)ssk;
468 sdp_init_sock(so);
469 ssk->flags = 0;
470 ssk->qp_active = 0;
471 ssk->state = TCPS_CLOSED;
472 SDP_LIST_WLOCK();
473 LIST_INSERT_HEAD(&sdp_list, ssk, list);
474 sdp_count++;
475 SDP_LIST_WUNLOCK();
476 if ((so->so_options & SO_LINGER) && so->so_linger == 0)
477 so->so_linger = TCP_LINGERTIME;
478
479 return (0);
480}
481
482/*
483 * Detach SDP from the socket, potentially leaving it around for the
484 * timewait to expire.
485 */
486static void
487sdp_detach(struct socket *so)
488{
489 struct sdp_sock *ssk;
490
491 ssk = sdp_sk(so);
492 SDP_WLOCK(ssk);
493 KASSERT(ssk->socket != NULL, ("sdp_detach: socket is NULL"));
494 ssk->socket->so_pcb = NULL;
495 ssk->socket = NULL;
496 if (ssk->flags & (SDP_TIMEWAIT | SDP_DREQWAIT))
497 SDP_WUNLOCK(ssk);
498 else if (ssk->flags & SDP_DROPPED || ssk->state < TCPS_SYN_SENT)
499 sdp_pcbfree(ssk);
500 else
501 panic("sdp_detach: Unexpected state, ssk %p.\n", ssk);
502}
503
504/*
505 * Allocate a local address for the socket.
506 */
507static int
508sdp_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
509{
510 int error = 0;
511 struct sdp_sock *ssk;
512 struct sockaddr_in *sin;
513
514 sin = (struct sockaddr_in *)nam;
515 if (nam->sa_len != sizeof (*sin))
516 return (EINVAL);
517 if (sin->sin_family != AF_INET)
518 return (EINVAL);
519 if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr)))
520 return (EAFNOSUPPORT);
521
522 ssk = sdp_sk(so);
523 SDP_WLOCK(ssk);
524 if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
525 error = EINVAL;
526 goto out;
527 }
528 error = sdp_pcbbind(ssk, nam, td->td_ucred);
529out:
530 SDP_WUNLOCK(ssk);
531
532 return (error);
533}
534
535/*
536 * Prepare to accept connections.
537 */
538static int
539sdp_listen(struct socket *so, int backlog, struct thread *td)
540{
541 int error = 0;
542 struct sdp_sock *ssk;
543
544 ssk = sdp_sk(so);
545 SDP_WLOCK(ssk);
546 if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
547 error = EINVAL;
548 goto out;
549 }
550 if (error == 0 && ssk->lport == 0)
551 error = sdp_pcbbind(ssk, (struct sockaddr *)0, td->td_ucred);
552 SOCK_LOCK(so);
553 if (error == 0)
554 error = solisten_proto_check(so);
555 if (error == 0) {
556 solisten_proto(so, backlog);
557 ssk->state = TCPS_LISTEN;
558 }
559 SOCK_UNLOCK(so);
560
561out:
562 SDP_WUNLOCK(ssk);
563 if (error == 0)
564 error = -rdma_listen(ssk->id, backlog);
565 return (error);
566}
567
568/*
569 * Initiate an SDP connection to nam.
570 */
571static int
572sdp_start_connect(struct sdp_sock *ssk, struct sockaddr *nam, struct thread *td)
573{
574 struct sockaddr_in src;
575 struct socket *so;
576 int error;
577
578 so = ssk->socket;
579
580 SDP_WLOCK_ASSERT(ssk);
581 if (ssk->lport == 0) {
582 error = sdp_pcbbind(ssk, (struct sockaddr *)0, td->td_ucred);
583 if (error)
584 return error;
585 }
586 src.sin_family = AF_INET;
587 src.sin_len = sizeof(src);
588 bzero(&src.sin_zero, sizeof(src.sin_zero));
589 src.sin_port = ssk->lport;
590 src.sin_addr.s_addr = ssk->laddr;
591 soisconnecting(so);
592 SDP_WUNLOCK(ssk);
593 error = -rdma_resolve_addr(ssk->id, (struct sockaddr *)&src, nam,
594 SDP_RESOLVE_TIMEOUT);
595 SDP_WLOCK(ssk);
596 if (error == 0)
597 ssk->state = TCPS_SYN_SENT;
598
599 return 0;
600}
601
602/*
603 * Initiate an SDP connection.
604 */
605static int
606sdp_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
607{
608 int error = 0;
609 struct sdp_sock *ssk;
610 struct sockaddr_in *sin;
611
612 sin = (struct sockaddr_in *)nam;
613 if (nam->sa_len != sizeof (*sin))
614 return (EINVAL);
615 if (sin->sin_family != AF_INET)
616 return (EINVAL);
617 if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr)))
618 return (EAFNOSUPPORT);
619 if ((error = prison_remote_ip4(td->td_ucred, &sin->sin_addr)) != 0)
620 return (error);
621 ssk = sdp_sk(so);
622 SDP_WLOCK(ssk);
623 if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED))
624 error = EINVAL;
625 else
626 error = sdp_start_connect(ssk, nam, td);
627 SDP_WUNLOCK(ssk);
628 return (error);
629}
630
631/*
632 * Drop an SDP socket, reporting
633 * the specified error. If connection is synchronized,
634 * then send a RST to peer.
635 */
636static struct sdp_sock *
637sdp_drop(struct sdp_sock *ssk, int errno)
638{
639 struct socket *so;
640
641 SDP_WLOCK_ASSERT(ssk);
642 so = ssk->socket;
643 if (TCPS_HAVERCVDSYN(ssk->state))
644 sdp_output_reset(ssk);
645 if (errno == ETIMEDOUT && ssk->softerror)
646 errno = ssk->softerror;
647 so->so_error = errno;
648 return (sdp_closed(ssk));
649}
650
651/*
652 * User issued close, and wish to trail through shutdown states:
653 * if never received SYN, just forget it. If got a SYN from peer,
654 * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN.
655 * If already got a FIN from peer, then almost done; go to LAST_ACK
656 * state. In all other cases, have already sent FIN to peer (e.g.
657 * after PRU_SHUTDOWN), and just have to play tedious game waiting
658 * for peer to send FIN or not respond to keep-alives, etc.
659 * We can let the user exit from the close as soon as the FIN is acked.
660 */
661static void
662sdp_usrclosed(struct sdp_sock *ssk)
663{
664
665 SDP_WLOCK_ASSERT(ssk);
666
667 switch (ssk->state) {
668 case TCPS_LISTEN:
669 ssk->state = TCPS_CLOSED;
670 SDP_WUNLOCK(ssk);
671 sdp_destroy_cma(ssk);
672 SDP_WLOCK(ssk);
673 /* FALLTHROUGH */
674 case TCPS_CLOSED:
675 ssk = sdp_closed(ssk);
676 /*
677 * sdp_closed() should never return NULL here as the socket is
678 * still open.
679 */
680 KASSERT(ssk != NULL,
681 ("sdp_usrclosed: sdp_closed() returned NULL"));
682 break;
683
684 case TCPS_SYN_SENT:
685 /* FALLTHROUGH */
686 case TCPS_SYN_RECEIVED:
687 ssk->flags |= SDP_NEEDFIN;
688 break;
689
690 case TCPS_ESTABLISHED:
691 ssk->flags |= SDP_NEEDFIN;
692 ssk->state = TCPS_FIN_WAIT_1;
693 break;
694
695 case TCPS_CLOSE_WAIT:
696 ssk->state = TCPS_LAST_ACK;
697 break;
698 }
699 if (ssk->state >= TCPS_FIN_WAIT_2) {
700 /* Prevent the connection hanging in FIN_WAIT_2 forever. */
701 if (ssk->state == TCPS_FIN_WAIT_2)
702 sdp_2msl_wait(ssk);
703 else
704 soisdisconnected(ssk->socket);
705 }
706}
707
708static void
709sdp_output_disconnect(struct sdp_sock *ssk)
710{
711
712 SDP_WLOCK_ASSERT(ssk);
713 callout_reset(&ssk->keep2msl, SDP_FIN_WAIT_TIMEOUT,
714 sdp_dreq_timeout, ssk);
715 ssk->flags |= SDP_NEEDFIN | SDP_DREQWAIT;
716 sdp_post_sends(ssk, M_NOWAIT);
717}
718
719/*
720 * Initiate or continue a disconnect.
721 * If embryonic state, just send reset (once).
722 * If in ``let data drain'' option and linger null, just drop.
723 * Otherwise (hard), mark socket disconnecting and drop
724 * current input data; switch states based on user close, and
725 * send segment to peer (with FIN).
726 */
727static void
728sdp_start_disconnect(struct sdp_sock *ssk)
729{
730 struct socket *so;
731 int unread;
732
733 so = ssk->socket;
734 SDP_WLOCK_ASSERT(ssk);
735 sdp_stop_keepalive_timer(so);
736 /*
737 * Neither sdp_closed() nor sdp_drop() should return NULL, as the
738 * socket is still open.
739 */
740 if (ssk->state < TCPS_ESTABLISHED) {
741 ssk = sdp_closed(ssk);
742 KASSERT(ssk != NULL,
743 ("sdp_start_disconnect: sdp_close() returned NULL"));
744 } else if ((so->so_options & SO_LINGER) && so->so_linger == 0) {
745 ssk = sdp_drop(ssk, 0);
746 KASSERT(ssk != NULL,
747 ("sdp_start_disconnect: sdp_drop() returned NULL"));
748 } else {
749 soisdisconnecting(so);
750 unread = sbused(&so->so_rcv);
751 sbflush(&so->so_rcv);
752 sdp_usrclosed(ssk);
753 if (!(ssk->flags & SDP_DROPPED)) {
754 if (unread)
755 sdp_output_reset(ssk);
756 else
757 sdp_output_disconnect(ssk);
758 }
759 }
760}
761
762/*
763 * User initiated disconnect.
764 */
765static int
766sdp_disconnect(struct socket *so)
767{
768 struct sdp_sock *ssk;
769 int error = 0;
770
771 ssk = sdp_sk(so);
772 SDP_WLOCK(ssk);
773 if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
774 error = ECONNRESET;
775 goto out;
776 }
777 sdp_start_disconnect(ssk);
778out:
779 SDP_WUNLOCK(ssk);
780 return (error);
781}
782
783/*
784 * Accept a connection. Essentially all the work is done at higher levels;
785 * just return the address of the peer, storing through addr.
786 *
787 *
788 * XXX This is broken XXX
789 *
790 * The rationale for acquiring the sdp lock here is somewhat complicated,
791 * and is described in detail in the commit log entry for r175612. Acquiring
792 * it delays an accept(2) racing with sonewconn(), which inserts the socket
793 * before the address/port fields are initialized. A better fix would
794 * prevent the socket from being placed in the listen queue until all fields
795 * are fully initialized.
796 */
797static int
798sdp_accept(struct socket *so, struct sockaddr **nam)
799{
800 struct sdp_sock *ssk = NULL;
801 struct in_addr addr;
802 in_port_t port;
803 int error;
804
805 if (so->so_state & SS_ISDISCONNECTED)
806 return (ECONNABORTED);
807
808 port = 0;
809 addr.s_addr = 0;
810 error = 0;
811 ssk = sdp_sk(so);
812 SDP_WLOCK(ssk);
813 if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
814 error = ECONNABORTED;
815 goto out;
816 }
817 port = ssk->fport;
818 addr.s_addr = ssk->faddr;
819out:
820 SDP_WUNLOCK(ssk);
821 if (error == 0)
822 *nam = sdp_sockaddr(port, &addr);
823 return error;
824}
825
826/*
827 * Mark the connection as being incapable of further output.
828 */
829static int
830sdp_shutdown(struct socket *so)
831{
832 int error = 0;
833 struct sdp_sock *ssk;
834
835 ssk = sdp_sk(so);
836 SDP_WLOCK(ssk);
837 if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
838 error = ECONNRESET;
839 goto out;
840 }
841 socantsendmore(so);
842 sdp_usrclosed(ssk);
843 if (!(ssk->flags & SDP_DROPPED))
844 sdp_output_disconnect(ssk);
845
846out:
847 SDP_WUNLOCK(ssk);
848
849 return (error);
850}
851
852static void
853sdp_append(struct sdp_sock *ssk, struct sockbuf *sb, struct mbuf *mb, int cnt)
854{
855 struct mbuf *n;
856 int ncnt;
857
858 SOCKBUF_LOCK_ASSERT(sb);
859 SBLASTRECORDCHK(sb);
860 KASSERT(mb->m_flags & M_PKTHDR,
861 ("sdp_append: %p Missing packet header.\n", mb));
862 n = sb->sb_lastrecord;
863 /*
864 * If the queue is empty just set all pointers and proceed.
865 */
866 if (n == NULL) {
867 sb->sb_lastrecord = sb->sb_mb = sb->sb_sndptr = mb;
868 for (; mb; mb = mb->m_next) {
869 sb->sb_mbtail = mb;
870 sballoc(sb, mb);
871 }
872 return;
873 }
874 /*
875 * Count the number of mbufs in the current tail.
876 */
877 for (ncnt = 0; n->m_next; n = n->m_next)
878 ncnt++;
879 n = sb->sb_lastrecord;
880 /*
881 * If the two chains can fit in a single sdp packet and
882 * the last record has not been sent yet (WRITABLE) coalesce
883 * them. The lastrecord remains the same but we must strip the
884 * packet header and then let sbcompress do the hard part.
885 */
886 if (M_WRITABLE(n) && ncnt + cnt < SDP_MAX_SEND_SGES &&
887 n->m_pkthdr.len + mb->m_pkthdr.len - SDP_HEAD_SIZE <
888 ssk->xmit_size_goal) {
889 m_adj(mb, SDP_HEAD_SIZE);
890 n->m_pkthdr.len += mb->m_pkthdr.len;
891 n->m_flags |= mb->m_flags & (M_PUSH | M_URG);
892 -	m_demote(mb, 1);
892 +	m_demote(mb, 1, 0);
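With flags == 0, m_demote() keeps no additional m_flags bits on the demoted mbufs, so the coalescing path in sdp_append() should behave exactly as it did in r274421; everything else in the file is unchanged between the two revisions.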
893 sbcompress(sb, mb, sb->sb_mbtail);
894 return;
895 }
896 /*
897 * Not compressible, just append to the end and adjust counters.
898 */
899 sb->sb_lastrecord->m_flags |= M_PUSH;
900 sb->sb_lastrecord->m_nextpkt = mb;
901 sb->sb_lastrecord = mb;
902 if (sb->sb_sndptr == NULL)
903 sb->sb_sndptr = mb;
904 for (; mb; mb = mb->m_next) {
905 sb->sb_mbtail = mb;
906 sballoc(sb, mb);
907 }
908}
909
910/*
911 * Do a send by putting data in output queue and updating urgent
912 * marker if URG set. Possibly send more data. Unlike the other
913 * pru_*() routines, the mbuf chains are our responsibility. We
914 * must either enqueue them or free them. The other pru_* routines
915 * generally are caller-frees.
916 *
917 * This comes from sendfile, normal sends will come from sdp_sosend().
918 */
919static int
920sdp_send(struct socket *so, int flags, struct mbuf *m,
921 struct sockaddr *nam, struct mbuf *control, struct thread *td)
922{
923 struct sdp_sock *ssk;
924 struct mbuf *n;
925 int error;
926 int cnt;
927
928 error = 0;
929 ssk = sdp_sk(so);
930 KASSERT(m->m_flags & M_PKTHDR,
931 ("sdp_send: %p no packet header", m));
932 M_PREPEND(m, SDP_HEAD_SIZE, M_WAITOK);
933 mtod(m, struct sdp_bsdh *)->mid = SDP_MID_DATA;
934 for (n = m, cnt = 0; n->m_next; n = n->m_next)
935 cnt++;
936 if (cnt > SDP_MAX_SEND_SGES) {
937 n = m_collapse(m, M_WAITOK, SDP_MAX_SEND_SGES);
938 if (n == NULL) {
939 m_freem(m);
940 return (EMSGSIZE);
941 }
942 m = n;
943 for (cnt = 0; n->m_next; n = n->m_next)
944 cnt++;
945 }
946 SDP_WLOCK(ssk);
947 if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
948 if (control)
949 m_freem(control);
950 if (m)
951 m_freem(m);
952 error = ECONNRESET;
953 goto out;
954 }
955 if (control) {
956 /* SDP doesn't support control messages. */
957 if (control->m_len) {
958 m_freem(control);
959 if (m)
960 m_freem(m);
961 error = EINVAL;
962 goto out;
963 }
964 m_freem(control); /* empty control, just free it */
965 }
966 if (!(flags & PRUS_OOB)) {
967 SOCKBUF_LOCK(&so->so_snd);
968 sdp_append(ssk, &so->so_snd, m, cnt);
969 SOCKBUF_UNLOCK(&so->so_snd);
970 if (nam && ssk->state < TCPS_SYN_SENT) {
971 /*
972 * Do implied connect if not yet connected.
973 */
974 error = sdp_start_connect(ssk, nam, td);
975 if (error)
976 goto out;
977 }
978 if (flags & PRUS_EOF) {
979 /*
980 * Close the send side of the connection after
981 * the data is sent.
982 */
983 socantsendmore(so);
984 sdp_usrclosed(ssk);
985 if (!(ssk->flags & SDP_DROPPED))
986 sdp_output_disconnect(ssk);
987 } else if (!(ssk->flags & SDP_DROPPED) &&
988 !(flags & PRUS_MORETOCOME))
989 sdp_post_sends(ssk, M_NOWAIT);
990 SDP_WUNLOCK(ssk);
991 return (0);
992 } else {
993 SOCKBUF_LOCK(&so->so_snd);
994 if (sbspace(&so->so_snd) < -512) {
995 SOCKBUF_UNLOCK(&so->so_snd);
996 m_freem(m);
997 error = ENOBUFS;
998 goto out;
999 }
1000 /*
1001 * According to RFC961 (Assigned Protocols),
1002 * the urgent pointer points to the last octet
1003 * of urgent data. We continue, however,
1004 * to consider it to indicate the first octet
1005 * of data past the urgent section.
1006 * Otherwise, snd_up should be one lower.
1007 */
1008 m->m_flags |= M_URG | M_PUSH;
1009 sdp_append(ssk, &so->so_snd, m, cnt);
1010 SOCKBUF_UNLOCK(&so->so_snd);
1011 if (nam && ssk->state < TCPS_SYN_SENT) {
1012 /*
1013 * Do implied connect if not yet connected.
1014 */
1015 error = sdp_start_connect(ssk, nam, td);
1016 if (error)
1017 goto out;
1018 }
1019 sdp_post_sends(ssk, M_NOWAIT);
1020 SDP_WUNLOCK(ssk);
1021 return (0);
1022 }
1023out:
1024 SDP_WUNLOCK(ssk);
1025 return (error);
1026}
1027
1028#define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
1029
1030/*
1031 * Send on a socket. If send must go all at once and message is larger than
1032 * send buffering, then hard error. Lock against other senders. If must go
1033 * all at once and not enough room now, then inform user that this would
1034 * block and do nothing. Otherwise, if nonblocking, send as much as
1035 * possible. The data to be sent is described by "uio" if nonzero, otherwise
1036 * by the mbuf chain "top" (which must be null if uio is not). Data provided
1037 * in mbuf chain must be small enough to send all at once.
1038 *
1039 * Returns nonzero on error, timeout or signal; callers must check for short
1040 * counts if EINTR/ERESTART are returned. Data and control buffers are freed
1041 * on return.
1042 */
1043static int
1044sdp_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
1045 struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
1046{
1047 struct sdp_sock *ssk;
1048 long space, resid;
1049 int atomic;
1050 int error;
1051 int copy;
1052
1053 if (uio != NULL)
1054 resid = uio->uio_resid;
1055 else
1056 resid = top->m_pkthdr.len;
1057 atomic = top != NULL;
1058 if (control != NULL) {
1059 if (control->m_len) {
1060 m_freem(control);
1061 if (top)
1062 m_freem(top);
1063 return (EINVAL);
1064 }
1065 m_freem(control);
1066 control = NULL;
1067 }
1068 /*
1069 * In theory resid should be unsigned. However, space must be
1070 * signed, as it might be less than 0 if we over-committed, and we
1071 * must use a signed comparison of space and resid. On the other
1072 * hand, a negative resid causes us to loop sending 0-length
1073 * segments to the protocol.
1074 *
1075 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
1076 * type sockets since that's an error.
1077 */
1078 if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
1079 error = EINVAL;
1080 goto out;
1081 }
1082 if (td != NULL)
1083 td->td_ru.ru_msgsnd++;
1084
1085 ssk = sdp_sk(so);
1086 error = sblock(&so->so_snd, SBLOCKWAIT(flags));
1087 if (error)
1088 goto out;
1089
1090restart:
1091 do {
1092 SOCKBUF_LOCK(&so->so_snd);
1093 if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
1094 SOCKBUF_UNLOCK(&so->so_snd);
1095 error = EPIPE;
1096 goto release;
1097 }
1098 if (so->so_error) {
1099 error = so->so_error;
1100 so->so_error = 0;
1101 SOCKBUF_UNLOCK(&so->so_snd);
1102 goto release;
1103 }
1104 if ((so->so_state & SS_ISCONNECTED) == 0 && addr == NULL) {
1105 SOCKBUF_UNLOCK(&so->so_snd);
1106 error = ENOTCONN;
1107 goto release;
1108 }
1109 space = sbspace(&so->so_snd);
1110 if (flags & MSG_OOB)
1111 space += 1024;
1112 if (atomic && resid > ssk->xmit_size_goal - SDP_HEAD_SIZE) {
1113 SOCKBUF_UNLOCK(&so->so_snd);
1114 error = EMSGSIZE;
1115 goto release;
1116 }
1117 if (space < resid &&
1118 (atomic || space < so->so_snd.sb_lowat)) {
1119 if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO)) {
1120 SOCKBUF_UNLOCK(&so->so_snd);
1121 error = EWOULDBLOCK;
1122 goto release;
1123 }
1124 error = sbwait(&so->so_snd);
1125 SOCKBUF_UNLOCK(&so->so_snd);
1126 if (error)
1127 goto release;
1128 goto restart;
1129 }
1130 SOCKBUF_UNLOCK(&so->so_snd);
1131 do {
1132 if (uio == NULL) {
1133 resid = 0;
1134 if (flags & MSG_EOR)
1135 top->m_flags |= M_EOR;
1136 } else {
1137 /*
1138 * Copy the data from userland into a mbuf
1139 * chain. If no data is to be copied in,
1140 * a single empty mbuf is returned.
1141 */
1142 copy = min(space,
1143 ssk->xmit_size_goal - SDP_HEAD_SIZE);
1144 top = m_uiotombuf(uio, M_WAITOK, copy,
1145 0, M_PKTHDR |
1146 ((flags & MSG_EOR) ? M_EOR : 0));
1147 if (top == NULL) {
1148 /* only possible error */
1149 error = EFAULT;
1150 goto release;
1151 }
1152 space -= resid - uio->uio_resid;
1153 resid = uio->uio_resid;
1154 }
1155 /*
1156 * XXX all the SBS_CANTSENDMORE checks previously
1157 * done could be out of date after dropping the
1158 * socket lock.
1159 */
1160 error = sdp_send(so, (flags & MSG_OOB) ? PRUS_OOB :
1161 /*
1162 * Set EOF on the last send if the user specified
1163 * MSG_EOF.
1164 */
1165 ((flags & MSG_EOF) && (resid <= 0)) ? PRUS_EOF :
1166 /* If there is more to send set PRUS_MORETOCOME. */
1167 (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
1168 top, addr, NULL, td);
1169 top = NULL;
1170 if (error)
1171 goto release;
1172 } while (resid && space > 0);
1173 } while (resid);
1174
1175release:
1176 sbunlock(&so->so_snd);
1177out:
1178 if (top != NULL)
1179 m_freem(top);
1180 return (error);
1181}
1182
1183/*
1184 * The part of soreceive() that implements reading non-inline out-of-band
1185 * data from a socket. For more complete comments, see soreceive(), from
1186 * which this code originated.
1187 *
1188 * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is
1189 * unable to return an mbuf chain to the caller.
1190 */
1191static int
1192soreceive_rcvoob(struct socket *so, struct uio *uio, int flags)
1193{
1194 struct protosw *pr = so->so_proto;
1195 struct mbuf *m;
1196 int error;
1197
1198 KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0"));
1199
1200 m = m_get(M_WAITOK, MT_DATA);
1201 error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
1202 if (error)
1203 goto bad;
1204 do {
1205 error = uiomove(mtod(m, void *),
1206 (int) min(uio->uio_resid, m->m_len), uio);
1207 m = m_free(m);
1208 } while (uio->uio_resid && error == 0 && m);
1209bad:
1210 if (m != NULL)
1211 m_freem(m);
1212 return (error);
1213}
1214
1215/*
1216 * Optimized version of soreceive() for stream (SDP) sockets.
1217 */
1218static int
1219sdp_sorecv(struct socket *so, struct sockaddr **psa, struct uio *uio,
1220 struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
1221{
1222 int len = 0, error = 0, flags, oresid;
1223 struct sockbuf *sb;
1224 struct mbuf *m, *n = NULL;
1225 struct sdp_sock *ssk;
1226
1227 /* We only do stream sockets. */
1228 if (so->so_type != SOCK_STREAM)
1229 return (EINVAL);
1230 if (psa != NULL)
1231 *psa = NULL;
1232 if (controlp != NULL)
1233 return (EINVAL);
1234 if (flagsp != NULL)
1235 flags = *flagsp &~ MSG_EOR;
1236 else
1237 flags = 0;
1238 if (flags & MSG_OOB)
1239 return (soreceive_rcvoob(so, uio, flags));
1240 if (mp0 != NULL)
1241 *mp0 = NULL;
1242
1243 sb = &so->so_rcv;
1244 ssk = sdp_sk(so);
1245
1246 /* Prevent other readers from entering the socket. */
1247 error = sblock(sb, SBLOCKWAIT(flags));
1248 if (error)
1249 goto out;
1250 SOCKBUF_LOCK(sb);
1251
1252 /* Easy one, no space to copyout anything. */
1253 if (uio->uio_resid == 0) {
1254 error = EINVAL;
1255 goto out;
1256 }
1257 oresid = uio->uio_resid;
1258
1259 /* We will never ever get anything unless we are connected. */
1260 if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) {
1261		/* When disconnecting there may still be some data left. */
1262 if (sbavail(sb))
1263 goto deliver;
1264 if (!(so->so_state & SS_ISDISCONNECTED))
1265 error = ENOTCONN;
1266 goto out;
1267 }
1268
1269 /* Socket buffer is empty and we shall not block. */
1270 if (sbavail(sb) == 0 &&
1271 ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) {
1272 error = EAGAIN;
1273 goto out;
1274 }
1275
1276restart:
1277 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1278
1279 /* Abort if socket has reported problems. */
1280 if (so->so_error) {
1281 if (sbavail(sb))
1282 goto deliver;
1283 if (oresid > uio->uio_resid)
1284 goto out;
1285 error = so->so_error;
1286 if (!(flags & MSG_PEEK))
1287 so->so_error = 0;
1288 goto out;
1289 }
1290
1291 /* Door is closed. Deliver what is left, if any. */
1292 if (sb->sb_state & SBS_CANTRCVMORE) {
1293 if (sbavail(sb))
1294 goto deliver;
1295 else
1296 goto out;
1297 }
1298
1299 /* Socket buffer got some data that we shall deliver now. */
1300 if (sbavail(sb) && !(flags & MSG_WAITALL) &&
1301 ((so->so_state & SS_NBIO) ||
1302 (flags & (MSG_DONTWAIT|MSG_NBIO)) ||
1303 sbavail(sb) >= sb->sb_lowat ||
1304 sbavail(sb) >= uio->uio_resid ||
1305 sbavail(sb) >= sb->sb_hiwat) ) {
1306 goto deliver;
1307 }
1308
1309	/* On MSG_WAITALL we must wait until all data or an error arrives. */
1310 if ((flags & MSG_WAITALL) &&
1311 (sbavail(sb) >= uio->uio_resid || sbavail(sb) >= sb->sb_lowat))
1312 goto deliver;
1313
1314 /*
1315 * Wait and block until (more) data comes in.
1316 * NB: Drops the sockbuf lock during wait.
1317 */
1318 error = sbwait(sb);
1319 if (error)
1320 goto out;
1321 goto restart;
1322
1323deliver:
1324 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1325 KASSERT(sbavail(sb), ("%s: sockbuf empty", __func__));
1326 KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__));
1327
1328 /* Statistics. */
1329 if (uio->uio_td)
1330 uio->uio_td->td_ru.ru_msgrcv++;
1331
1332 /* Fill uio until full or current end of socket buffer is reached. */
1333 len = min(uio->uio_resid, sbavail(sb));
1334 if (mp0 != NULL) {
1335 /* Dequeue as many mbufs as possible. */
1336 if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) {
1337 for (*mp0 = m = sb->sb_mb;
1338 m != NULL && m->m_len <= len;
1339 m = m->m_next) {
1340 len -= m->m_len;
1341 uio->uio_resid -= m->m_len;
1342 sbfree(sb, m);
1343 n = m;
1344 }
1345 sb->sb_mb = m;
1346 if (sb->sb_mb == NULL)
1347 SB_EMPTY_FIXUP(sb);
1348 n->m_next = NULL;
1349 }
1350 /* Copy the remainder. */
1351 if (len > 0) {
1352 KASSERT(sb->sb_mb != NULL,
1353 ("%s: len > 0 && sb->sb_mb empty", __func__));
1354
1355 m = m_copym(sb->sb_mb, 0, len, M_NOWAIT);
1356 if (m == NULL)
1357 len = 0; /* Don't flush data from sockbuf. */
1358 else
1359 uio->uio_resid -= m->m_len;
1360 if (*mp0 != NULL)
1361 n->m_next = m;
1362 else
1363 *mp0 = m;
1364 if (*mp0 == NULL) {
1365 error = ENOBUFS;
1366 goto out;
1367 }
1368 }
1369 } else {
1370 /* NB: Must unlock socket buffer as uiomove may sleep. */
1371 SOCKBUF_UNLOCK(sb);
1372 error = m_mbuftouio(uio, sb->sb_mb, len);
1373 SOCKBUF_LOCK(sb);
1374 if (error)
1375 goto out;
1376 }
1377 SBLASTRECORDCHK(sb);
1378 SBLASTMBUFCHK(sb);
1379
1380 /*
1381 * Remove the delivered data from the socket buffer unless we
1382 * were only peeking.
1383 */
1384 if (!(flags & MSG_PEEK)) {
1385 if (len > 0)
1386 sbdrop_locked(sb, len);
1387
1388 /* Notify protocol that we drained some data. */
1389 SOCKBUF_UNLOCK(sb);
1390 SDP_WLOCK(ssk);
1391 sdp_do_posts(ssk);
1392 SDP_WUNLOCK(ssk);
1393 SOCKBUF_LOCK(sb);
1394 }
1395
1396 /*
1397 * For MSG_WAITALL we may have to loop again and wait for
1398 * more data to come in.
1399 */
1400 if ((flags & MSG_WAITALL) && uio->uio_resid > 0)
1401 goto restart;
1402out:
1403 SOCKBUF_LOCK_ASSERT(sb);
1404 SBLASTRECORDCHK(sb);
1405 SBLASTMBUFCHK(sb);
1406 SOCKBUF_UNLOCK(sb);
1407 sbunlock(sb);
1408 return (error);
1409}
1410
1411/*
1412 * Abort is used to tear down a connection, typically while it is
1413 * sitting in the accept queue.
1414 */
1415void
1416sdp_abort(struct socket *so)
1417{
1418 struct sdp_sock *ssk;
1419
1420 ssk = sdp_sk(so);
1421 SDP_WLOCK(ssk);
1422 /*
1423 * If we have not yet dropped, do it now.
1424 */
1425 if (!(ssk->flags & SDP_TIMEWAIT) &&
1426 !(ssk->flags & SDP_DROPPED))
1427 sdp_drop(ssk, ECONNABORTED);
1428 KASSERT(ssk->flags & SDP_DROPPED, ("sdp_abort: %p not dropped 0x%X",
1429 ssk, ssk->flags));
1430 SDP_WUNLOCK(ssk);
1431}
1432
1433/*
1434 * Close an SDP socket and initiate a friendly disconnect.
1435 */
1436static void
1437sdp_close(struct socket *so)
1438{
1439 struct sdp_sock *ssk;
1440
1441 ssk = sdp_sk(so);
1442 SDP_WLOCK(ssk);
1443 /*
1444 * If we have not yet dropped, do it now.
1445 */
1446 if (!(ssk->flags & SDP_TIMEWAIT) &&
1447 !(ssk->flags & SDP_DROPPED))
1448 sdp_start_disconnect(ssk);
1449
1450 /*
1451 * If we've still not dropped let the socket layer know we're
1452 * holding on to the socket and pcb for a while.
1453 */
1454 if (!(ssk->flags & SDP_DROPPED)) {
1455 SOCK_LOCK(so);
1456 so->so_state |= SS_PROTOREF;
1457 SOCK_UNLOCK(so);
1458 ssk->flags |= SDP_SOCKREF;
1459 }
1460 SDP_WUNLOCK(ssk);
1461}
1462
1463/*
1464 * User requests out-of-band data.
1465 */
1466static int
1467sdp_rcvoob(struct socket *so, struct mbuf *m, int flags)
1468{
1469 int error = 0;
1470 struct sdp_sock *ssk;
1471
1472 ssk = sdp_sk(so);
1473 SDP_WLOCK(ssk);
1474 if (!rx_ring_trylock(&ssk->rx_ring)) {
1475 SDP_WUNLOCK(ssk);
1476 return (ECONNRESET);
1477 }
1478 if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
1479 error = ECONNRESET;
1480 goto out;
1481 }
1482 if ((so->so_oobmark == 0 &&
1483 (so->so_rcv.sb_state & SBS_RCVATMARK) == 0) ||
1484 so->so_options & SO_OOBINLINE ||
1485 ssk->oobflags & SDP_HADOOB) {
1486 error = EINVAL;
1487 goto out;
1488 }
1489 if ((ssk->oobflags & SDP_HAVEOOB) == 0) {
1490 error = EWOULDBLOCK;
1491 goto out;
1492 }
1493 m->m_len = 1;
1494 *mtod(m, caddr_t) = ssk->iobc;
1495 if ((flags & MSG_PEEK) == 0)
1496 ssk->oobflags ^= (SDP_HAVEOOB | SDP_HADOOB);
1497out:
1498 rx_ring_unlock(&ssk->rx_ring);
1499 SDP_WUNLOCK(ssk);
1500 return (error);
1501}
1502
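/*
 * Handle an incoming urgent-data message: record the out-of-band mark
 * and, unless SO_OOBINLINE is set, pull the final byte of the segment
 * out of the chain into ssk->iobc for later retrieval by sdp_rcvoob().
 */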
1503void
1504sdp_urg(struct sdp_sock *ssk, struct mbuf *mb)
1505{
1506 struct mbuf *m;
1507 struct socket *so;
1508
1509 so = ssk->socket;
1510 if (so == NULL)
1511 return;
1512
1513 so->so_oobmark = sbused(&so->so_rcv) + mb->m_pkthdr.len - 1;
1514 sohasoutofband(so);
1515 ssk->oobflags &= ~(SDP_HAVEOOB | SDP_HADOOB);
1516 if (!(so->so_options & SO_OOBINLINE)) {
1517 for (m = mb; m->m_next != NULL; m = m->m_next);
1518 ssk->iobc = *(mtod(m, char *) + m->m_len - 1);
1519 ssk->oobflags |= SDP_HAVEOOB;
1520 m->m_len--;
1521 mb->m_pkthdr.len--;
1522 }
1523}
1524
1525/*
1526 * Notify an SDP socket of an asynchronous error.
1527 *
1528 * Do not wake up the user, since there is currently no mechanism for
1529 * reporting soft errors (yet; a kqueue filter may be added).
1530 */
1531struct sdp_sock *
1532sdp_notify(struct sdp_sock *ssk, int error)
1533{
1534
1535 SDP_WLOCK_ASSERT(ssk);
1536
1537 if ((ssk->flags & SDP_TIMEWAIT) ||
1538 (ssk->flags & SDP_DROPPED))
1539 return (ssk);
1540
1541 /*
1542 * Ignore some errors if we are hooked up.
1543 */
1544 if (ssk->state == TCPS_ESTABLISHED &&
1545 (error == EHOSTUNREACH || error == ENETUNREACH ||
1546 error == EHOSTDOWN))
1547 return (ssk);
1548 ssk->softerror = error;
1549	return (sdp_drop(ssk, error));
1550}
1551
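/*
 * Protocol control input.  Map an ICMP-style command onto an errno via
 * inetctlerrmap and notify every SDP socket connected to the affected
 * foreign address.  Only IPv4 destinations are handled.
 */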
1552static void
1553sdp_ctlinput(int cmd, struct sockaddr *sa, void *vip)
1554{
1555 struct in_addr faddr;
1556
1557 faddr = ((struct sockaddr_in *)sa)->sin_addr;
1558 if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY)
1559 return;
1560
1561 sdp_pcbnotifyall(faddr, inetctlerrmap[cmd], sdp_notify);
1562}
1563
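/*
 * No SDP-specific ioctls are supported.
 */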
1564static int
1565sdp_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp,
1566 struct thread *td)
1567{
1568 return (EOPNOTSUPP);
1569}
1570
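/*
 * Keepalive callout handler, entered with the SDP pcb lock held (the
 * callout is presumably initialized with that lock).  While
 * SO_KEEPALIVE remains set on a live socket, post a keepalive message
 * and rearm the timer.
 */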
1571static void
1572sdp_keepalive_timeout(void *data)
1573{
1574 struct sdp_sock *ssk;
1575
1576 ssk = data;
1577 /* Callout canceled. */
1578 if (!callout_active(&ssk->keep2msl))
1579 return;
1580 /* Callout rescheduled as a different kind of timer. */
1581 if (callout_pending(&ssk->keep2msl))
1582 goto out;
1583 callout_deactivate(&ssk->keep2msl);
1584 if (ssk->flags & SDP_DROPPED ||
1585 (ssk->socket->so_options & SO_KEEPALIVE) == 0)
1586 goto out;
1587 sdp_post_keepalive(ssk);
1588 callout_reset(&ssk->keep2msl, SDP_KEEPALIVE_TIME,
1589 sdp_keepalive_timeout, ssk);
1590out:
1591 SDP_WUNLOCK(ssk);
1592}
1593
1594
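/*
 * Arm the keepalive timer unless it is already pending.
 */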
1595void
1596sdp_start_keepalive_timer(struct socket *so)
1597{
1598 struct sdp_sock *ssk;
1599
1600 ssk = sdp_sk(so);
1601 if (!callout_pending(&ssk->keep2msl))
1602 callout_reset(&ssk->keep2msl, SDP_KEEPALIVE_TIME,
1603 sdp_keepalive_timeout, ssk);
1604}
1605
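/*
 * Cancel any pending keepalive timer.
 */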
1606static void
1607sdp_stop_keepalive_timer(struct socket *so)
1608{
1609 struct sdp_sock *ssk;
1610
1611 ssk = sdp_sk(so);
1612 callout_stop(&ssk->keep2msl);
1613}
1614
1615/*
1616 * sdp_ctloutput() must drop the SDP pcb lock before performing copyin
1617 * on socket option arguments.  When it re-acquires the lock after the
1618 * copy, it has to revalidate that the connection is still valid for
1619 * the socket option.
1620 */
1621#define SDP_WLOCK_RECHECK(ssk) do {					\
1622	SDP_WLOCK(ssk);							\
1623	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {		\
1624		SDP_WUNLOCK(ssk);					\
1625		return (ECONNRESET);					\
1626	}								\
1627} while (0)
1628
1629static int
1630sdp_ctloutput(struct socket *so, struct sockopt *sopt)
1631{
1632 int error, opt, optval;
1633 struct sdp_sock *ssk;
1634
1635 error = 0;
1636 ssk = sdp_sk(so);
1637 if (sopt->sopt_level == SOL_SOCKET && sopt->sopt_name == SO_KEEPALIVE) {
1638 SDP_WLOCK(ssk);
1639 if (so->so_options & SO_KEEPALIVE)
1640 sdp_start_keepalive_timer(so);
1641 else
1642 sdp_stop_keepalive_timer(so);
1643 SDP_WUNLOCK(ssk);
1644 }
1645 if (sopt->sopt_level != IPPROTO_TCP)
1646 return (error);
1647
1648 SDP_WLOCK(ssk);
1649 if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
1650 SDP_WUNLOCK(ssk);
1651 return (ECONNRESET);
1652 }
1653
1654 switch (sopt->sopt_dir) {
1655 case SOPT_SET:
1656 switch (sopt->sopt_name) {
1657 case TCP_NODELAY:
1658 SDP_WUNLOCK(ssk);
1659 error = sooptcopyin(sopt, &optval, sizeof optval,
1660 sizeof optval);
1661 if (error)
1662 return (error);
1663
1664 SDP_WLOCK_RECHECK(ssk);
1665 opt = SDP_NODELAY;
1666 if (optval)
1667 ssk->flags |= opt;
1668 else
1669 ssk->flags &= ~opt;
1670 sdp_do_posts(ssk);
1671 SDP_WUNLOCK(ssk);
1672 break;
1673
1674 default:
1675 SDP_WUNLOCK(ssk);
1676 error = ENOPROTOOPT;
1677 break;
1678 }
1679 break;
1680
1681 case SOPT_GET:
1682 switch (sopt->sopt_name) {
1683 case TCP_NODELAY:
1684 optval = ssk->flags & SDP_NODELAY;
1685 SDP_WUNLOCK(ssk);
1686 error = sooptcopyout(sopt, &optval, sizeof optval);
1687 break;
1688 default:
1689 SDP_WUNLOCK(ssk);
1690 error = ENOPROTOOPT;
1691 break;
1692 }
1693 break;
1694 }
1695 return (error);
1696}
1697#undef SDP_WLOCK_RECHECK
1698
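/*
 * Usage sketch (editorial, not from the original source): an SDP socket
 * is created in the AF_INET_SDP domain and then driven like any TCP
 * stream socket, so the TCP_NODELAY handling above is reached through
 * an ordinary setsockopt(2), e.g.:
 *
 *	int s = socket(AF_INET_SDP, SOCK_STREAM, 0);
 *	int one = 1;
 *
 *	setsockopt(s, IPPROTO_TCP, TCP_NODELAY, &one, sizeof(one));
 *
 * sdp_ctloutput() maps this onto the SDP_NODELAY flag.
 */
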
1699int sdp_mod_count = 0;
1700int sdp_mod_usec = 0;
1701
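/*
 * Apply the global interrupt-moderation knobs (sdp_mod_count events,
 * sdp_mod_usec microseconds) to this socket's receive completion queue.
 * If either knob is non-positive the CQ is left untouched.
 */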
1702void
1703sdp_set_default_moderation(struct sdp_sock *ssk)
1704{
1705 if (sdp_mod_count <= 0 || sdp_mod_usec <= 0)
1706 return;
1707 ib_modify_cq(ssk->rx_ring.cq, sdp_mod_count, sdp_mod_usec);
1708}
1709
1710
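/*
 * IB client "add" callback.  Allocate the per-device state: a
 * protection domain, a DMA memory region, and an FMR pool for
 * registering data buffers.  On failure the partial state is freed and
 * no client data is set, which is how sdp_dev_rem() and other
 * consumers recognize an unusable device.
 */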
1711static void
1712sdp_dev_add(struct ib_device *device)
1713{
1714 struct ib_fmr_pool_param param;
1715 struct sdp_device *sdp_dev;
1716
1717 sdp_dev = malloc(sizeof(*sdp_dev), M_SDP, M_WAITOK | M_ZERO);
1718 sdp_dev->pd = ib_alloc_pd(device);
1719 if (IS_ERR(sdp_dev->pd))
1720 goto out_pd;
1721 sdp_dev->mr = ib_get_dma_mr(sdp_dev->pd, IB_ACCESS_LOCAL_WRITE);
1722 if (IS_ERR(sdp_dev->mr))
1723 goto out_mr;
1724 memset(&param, 0, sizeof param);
1725 param.max_pages_per_fmr = SDP_FMR_SIZE;
1726 param.page_shift = PAGE_SHIFT;
1727 param.access = (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ);
1728 param.pool_size = SDP_FMR_POOL_SIZE;
1729 param.dirty_watermark = SDP_FMR_DIRTY_SIZE;
1730 param.cache = 1;
1731 sdp_dev->fmr_pool = ib_create_fmr_pool(sdp_dev->pd, &param);
1732 if (IS_ERR(sdp_dev->fmr_pool))
1733 goto out_fmr;
1734 ib_set_client_data(device, &sdp_client, sdp_dev);
1735 return;
1736
1737out_fmr:
1738 ib_dereg_mr(sdp_dev->mr);
1739out_mr:
1740 ib_dealloc_pd(sdp_dev->pd);
1741out_pd:
1742 free(sdp_dev, M_SDP);
1743}
1744
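/*
 * IB client "remove" callback.  Reset every socket still bound to the
 * departing device, then flush and destroy the FMR pool and release
 * the memory region and protection domain allocated in sdp_dev_add().
 */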
1745static void
1746sdp_dev_rem(struct ib_device *device)
1747{
1748 struct sdp_device *sdp_dev;
1749 struct sdp_sock *ssk;
1750
1751 SDP_LIST_WLOCK();
1752 LIST_FOREACH(ssk, &sdp_list, list) {
1753 if (ssk->ib_device != device)
1754 continue;
1755 SDP_WLOCK(ssk);
1756 if ((ssk->flags & SDP_DESTROY) == 0)
1757 ssk = sdp_notify(ssk, ECONNRESET);
1758 if (ssk)
1759 SDP_WUNLOCK(ssk);
1760 }
1761 SDP_LIST_WUNLOCK();
1762 /*
1763 * XXX Do I need to wait between these two?
1764 */
1765 sdp_dev = ib_get_client_data(device, &sdp_client);
1766 if (!sdp_dev)
1767 return;
1768 ib_flush_fmr_pool(sdp_dev->fmr_pool);
1769 ib_destroy_fmr_pool(sdp_dev->fmr_pool);
1770 ib_dereg_mr(sdp_dev->mr);
1771 ib_dealloc_pd(sdp_dev->pd);
1772 free(sdp_dev, M_SDP);
1773}
1774
1775struct ib_client sdp_client =
1776 { .name = "sdp", .add = sdp_dev_add, .remove = sdp_dev_rem };
1777
1778
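/*
 * Sysctl handler exporting the connection list in struct xtcpcb format
 * so that existing tools such as netstat(1) can display SDP
 * connections as if they were TCP.
 */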
1779static int
1780sdp_pcblist(SYSCTL_HANDLER_ARGS)
1781{
1782 int error, n, i;
1783 struct sdp_sock *ssk;
1784 struct xinpgen xig;
1785
1786 /*
1787 * The process of preparing the TCB list is too time-consuming and
1788 * resource-intensive to repeat twice on every request.
1789 */
1790 if (req->oldptr == NULL) {
1791 n = sdp_count;
1792 n += imax(n / 8, 10);
1793 req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xtcpcb);
1794 return (0);
1795 }
1796
1797 if (req->newptr != NULL)
1798 return (EPERM);
1799
1800 /*
1801 * OK, now we're committed to doing something.
1802 */
1803 SDP_LIST_RLOCK();
1804 n = sdp_count;
1805 SDP_LIST_RUNLOCK();
1806
1807 error = sysctl_wire_old_buffer(req, 2 * (sizeof xig)
1808 + n * sizeof(struct xtcpcb));
1809 if (error != 0)
1810 return (error);
1811
1812 xig.xig_len = sizeof xig;
1813 xig.xig_count = n;
1814 xig.xig_gen = 0;
1815 xig.xig_sogen = so_gencnt;
1816 error = SYSCTL_OUT(req, &xig, sizeof xig);
1817 if (error)
1818 return (error);
1819
1820 SDP_LIST_RLOCK();
1821 for (ssk = LIST_FIRST(&sdp_list), i = 0;
1822 ssk != NULL && i < n; ssk = LIST_NEXT(ssk, list)) {
1823 struct xtcpcb xt;
1824
1825 SDP_RLOCK(ssk);
1826 if (ssk->flags & SDP_TIMEWAIT) {
1827 if (ssk->cred != NULL)
1828 error = cr_cansee(req->td->td_ucred,
1829 ssk->cred);
1830 else
1831 error = EINVAL; /* Skip this inp. */
1832 } else if (ssk->socket)
1833 error = cr_canseesocket(req->td->td_ucred,
1834 ssk->socket);
1835 else
1836 error = EINVAL;
1837 if (error) {
1838 error = 0;
1839 goto next;
1840 }
1841
1842 bzero(&xt, sizeof(xt));
1843 xt.xt_len = sizeof xt;
1844 xt.xt_inp.inp_gencnt = 0;
1845 xt.xt_inp.inp_vflag = INP_IPV4;
1846 memcpy(&xt.xt_inp.inp_laddr, &ssk->laddr, sizeof(ssk->laddr));
1847 xt.xt_inp.inp_lport = ssk->lport;
1848 memcpy(&xt.xt_inp.inp_faddr, &ssk->faddr, sizeof(ssk->faddr));
1849 xt.xt_inp.inp_fport = ssk->fport;
1850 xt.xt_tp.t_state = ssk->state;
1851 if (ssk->socket != NULL)
1852 sotoxsocket(ssk->socket, &xt.xt_socket);
1853 else
1854 bzero(&xt.xt_socket, sizeof xt.xt_socket);
1855 xt.xt_socket.xso_protocol = IPPROTO_TCP;
1856 SDP_RUNLOCK(ssk);
1857 error = SYSCTL_OUT(req, &xt, sizeof xt);
1858 if (error)
1859 break;
1860 i++;
1861 continue;
1862next:
1863 SDP_RUNLOCK(ssk);
1864 }
1865 if (!error) {
1866 /*
1867 * Give the user an updated idea of our state.
1868 * If the generation differs from what we told
1869 * her before, she knows that something happened
1870 * while we were processing this request, and it
1871 * might be necessary to retry.
1872 */
1873 xig.xig_gen = 0;
1874 xig.xig_sogen = so_gencnt;
1875 xig.xig_count = sdp_count;
1876 error = SYSCTL_OUT(req, &xig, sizeof xig);
1877 }
1878 SDP_LIST_RUNLOCK();
1879 return (error);
1880}
1881
1882static SYSCTL_NODE(_net_inet, -1, sdp, CTLFLAG_RW, 0, "SDP");
1883
1884SYSCTL_PROC(_net_inet_sdp, TCPCTL_PCBLIST, pcblist,
1885 CTLFLAG_RD | CTLTYPE_STRUCT, 0, 0, sdp_pcblist, "S,xtcpcb",
1886 "List of active SDP connections");
1887
1888static void
1889sdp_zone_change(void *tag)
1890{
1891
1892 uma_zone_set_max(sdp_zone, maxsockets);
1893}
1894
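/*
 * Domain initialization: set up the global socket list, create the UMA
 * zone (capped at maxsockets and tracking changes to it), start the
 * receive-completion workqueue, and register with the IB core so that
 * sdp_dev_add() runs for each RDMA device.
 */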
1895static void
1896sdp_init(void)
1897{
1898
1899 LIST_INIT(&sdp_list);
1900 sdp_zone = uma_zcreate("sdp_sock", sizeof(struct sdp_sock),
1901 NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
1902 uma_zone_set_max(sdp_zone, maxsockets);
1903 EVENTHANDLER_REGISTER(maxsockets_change, sdp_zone_change, NULL,
1904 EVENTHANDLER_PRI_ANY);
1905 rx_comp_wq = create_singlethread_workqueue("rx_comp_wq");
1906 ib_register_client(&sdp_client);
1907}
1908
1909extern struct domain sdpdomain;
1910
1911struct pr_usrreqs sdp_usrreqs = {
1912 .pru_abort = sdp_abort,
1913 .pru_accept = sdp_accept,
1914 .pru_attach = sdp_attach,
1915 .pru_bind = sdp_bind,
1916 .pru_connect = sdp_connect,
1917 .pru_control = sdp_control,
1918 .pru_detach = sdp_detach,
1919 .pru_disconnect = sdp_disconnect,
1920 .pru_listen = sdp_listen,
1921 .pru_peeraddr = sdp_getpeeraddr,
1922 .pru_rcvoob = sdp_rcvoob,
1923 .pru_send = sdp_send,
1924 .pru_sosend = sdp_sosend,
1925 .pru_soreceive = sdp_sorecv,
1926 .pru_shutdown = sdp_shutdown,
1927 .pru_sockaddr = sdp_getsockaddr,
1928 .pru_close = sdp_close,
1929};
1930
1931struct protosw sdpsw[] = {
1932{
1933 .pr_type = SOCK_STREAM,
1934 .pr_domain = &sdpdomain,
1935 .pr_protocol = IPPROTO_IP,
1936 .pr_flags = PR_CONNREQUIRED|PR_IMPLOPCL|PR_WANTRCVD,
1937 .pr_ctlinput = sdp_ctlinput,
1938 .pr_ctloutput = sdp_ctloutput,
1939 .pr_usrreqs = &sdp_usrreqs
1940},
1941{
1942 .pr_type = SOCK_STREAM,
1943 .pr_domain = &sdpdomain,
1944 .pr_protocol = IPPROTO_TCP,
1945 .pr_flags = PR_CONNREQUIRED|PR_IMPLOPCL|PR_WANTRCVD,
1946 .pr_ctlinput = sdp_ctlinput,
1947 .pr_ctloutput = sdp_ctloutput,
1948 .pr_usrreqs = &sdp_usrreqs
1949},
1950};
1951
1952struct domain sdpdomain = {
1953 .dom_family = AF_INET_SDP,
1954 .dom_name = "SDP",
1955 .dom_init = sdp_init,
1956 .dom_protosw = sdpsw,
1957 .dom_protoswNPROTOSW = &sdpsw[sizeof(sdpsw)/sizeof(sdpsw[0])],
1958};
1959
1960DOMAIN_SET(sdp);
1961
1962int sdp_debug_level = 1;
1963int sdp_data_debug_level = 0;