1219820Sjeff
2219820Sjeff/*-
3219820Sjeff * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
4219820Sjeff *      The Regents of the University of California.  All rights reserved.
5219820Sjeff * Copyright (c) 2004 The FreeBSD Foundation.  All rights reserved.
6219820Sjeff * Copyright (c) 2004-2008 Robert N. M. Watson.  All rights reserved.
7219820Sjeff *
8219820Sjeff * Redistribution and use in source and binary forms, with or without
9219820Sjeff * modification, are permitted provided that the following conditions
10219820Sjeff * are met:
11219820Sjeff * 1. Redistributions of source code must retain the above copyright
12219820Sjeff *    notice, this list of conditions and the following disclaimer.
13219820Sjeff * 2. Redistributions in binary form must reproduce the above copyright
14219820Sjeff *    notice, this list of conditions and the following disclaimer in the
15219820Sjeff *    documentation and/or other materials provided with the distribution.
16219820Sjeff * 4. Neither the name of the University nor the names of its contributors
17219820Sjeff *    may be used to endorse or promote products derived from this software
18219820Sjeff *    without specific prior written permission.
19219820Sjeff *
20219820Sjeff * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21219820Sjeff * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22219820Sjeff * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23219820Sjeff * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24219820Sjeff * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25219820Sjeff * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26219820Sjeff * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27219820Sjeff * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28219820Sjeff * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29219820Sjeff * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30219820Sjeff * SUCH DAMAGE.
31219820Sjeff *
32219820Sjeff * Excerpts taken from tcp_subr.c, tcp_usrreq.c, uipc_socket.c
33219820Sjeff */
34219820Sjeff
35219820Sjeff/*
36219820Sjeff *
37219820Sjeff * Copyright (c) 2010 Isilon Systems, Inc.
38219820Sjeff * Copyright (c) 2010 iX Systems, Inc.
39219820Sjeff * Copyright (c) 2010 Panasas, Inc.
40219820Sjeff * All rights reserved.
41219820Sjeff *
42219820Sjeff * Redistribution and use in source and binary forms, with or without
43219820Sjeff * modification, are permitted provided that the following conditions
44219820Sjeff * are met:
45219820Sjeff * 1. Redistributions of source code must retain the above copyright
46219820Sjeff *    notice unmodified, this list of conditions, and the following
47219820Sjeff *    disclaimer.
48219820Sjeff * 2. Redistributions in binary form must reproduce the above copyright
49219820Sjeff *    notice, this list of conditions and the following disclaimer in the
50219820Sjeff *    documentation and/or other materials provided with the distribution.
51219820Sjeff *
52219820Sjeff * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
53219820Sjeff * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
54219820Sjeff * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
55219820Sjeff * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
56219820Sjeff * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
57219820Sjeff * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
58219820Sjeff * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
59219820Sjeff * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
60219820Sjeff * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
61219820Sjeff * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
62219820Sjeff *
63219820Sjeff */
64219820Sjeff#include <sys/cdefs.h>
65219820Sjeff__FBSDID("$FreeBSD$");
66219820Sjeff
67219820Sjeff#include "sdp.h"
68219820Sjeff
69219820Sjeff#include <net/if.h>
70219820Sjeff#include <net/route.h>
71219820Sjeff#include <net/vnet.h>
72268005Shselasky#include <sys/sysctl.h>
73219820Sjeff
74219820Sjeffuma_zone_t	sdp_zone;
75219820Sjeffstruct rwlock	sdp_lock;
76219820SjeffLIST_HEAD(, sdp_sock) sdp_list;
77219820Sjeff
78219820Sjeffstruct workqueue_struct *rx_comp_wq;
79219820Sjeff
80219820SjeffRW_SYSINIT(sdplockinit, &sdp_lock, "SDP lock");
81219820Sjeff#define	SDP_LIST_WLOCK()	rw_wlock(&sdp_lock)
82219820Sjeff#define	SDP_LIST_RLOCK()	rw_rlock(&sdp_lock)
83219820Sjeff#define	SDP_LIST_WUNLOCK()	rw_wunlock(&sdp_lock)
84219820Sjeff#define	SDP_LIST_RUNLOCK()	rw_runlock(&sdp_lock)
85219820Sjeff#define	SDP_LIST_WLOCK_ASSERT()	rw_assert(&sdp_lock, RW_WLOCKED)
86219820Sjeff#define	SDP_LIST_RLOCK_ASSERT()	rw_assert(&sdp_lock, RW_RLOCKED)
87219820Sjeff#define	SDP_LIST_LOCK_ASSERT()	rw_assert(&sdp_lock, RW_LOCKED)
88219820Sjeff
89227293Sedstatic MALLOC_DEFINE(M_SDP, "sdp", "Socket Direct Protocol");
90219820Sjeff
91219820Sjeffstatic void sdp_stop_keepalive_timer(struct socket *so);
92219820Sjeff
93219820Sjeff/*
94219820Sjeff * SDP protocol interface to socket abstraction.
95219820Sjeff */
96219820Sjeff/*
97219820Sjeff * sdp_sendspace and sdp_recvspace are the default send and receive window
98219820Sjeff * sizes, respectively.
99219820Sjeff */
100219820Sjeffu_long	sdp_sendspace = 1024*32;
101219820Sjeffu_long	sdp_recvspace = 1024*64;
102219820Sjeff
103219820Sjeffstatic int sdp_count;
104219820Sjeff
105219820Sjeff/*
106219820Sjeff * Disable async. CMA events for sockets which are being torn down.
107219820Sjeff */
108219820Sjeffstatic void
109219820Sjeffsdp_destroy_cma(struct sdp_sock *ssk)
110219820Sjeff{
111219820Sjeff
112219820Sjeff	if (ssk->id == NULL)
113219820Sjeff		return;
114219820Sjeff	rdma_destroy_id(ssk->id);
115219820Sjeff	ssk->id = NULL;
116219820Sjeff}
117219820Sjeff
118219820Sjeffstatic int
119219820Sjeffsdp_pcbbind(struct sdp_sock *ssk, struct sockaddr *nam, struct ucred *cred)
120219820Sjeff{
121219820Sjeff	struct sockaddr_in *sin;
122219820Sjeff	struct sockaddr_in null;
123219820Sjeff	int error;
124219820Sjeff
125219820Sjeff	SDP_WLOCK_ASSERT(ssk);
126219820Sjeff
127219820Sjeff	if (ssk->lport != 0 || ssk->laddr != INADDR_ANY)
128219820Sjeff		return (EINVAL);
129219820Sjeff	/* rdma_bind_addr handles bind races.  */
130219820Sjeff	SDP_WUNLOCK(ssk);
131219820Sjeff	if (ssk->id == NULL)
132278894Shselasky		ssk->id = rdma_create_id(sdp_cma_handler, ssk, RDMA_PS_SDP, IB_QPT_RC);
133219820Sjeff	if (ssk->id == NULL) {
134219820Sjeff		SDP_WLOCK(ssk);
135219820Sjeff		return (ENOMEM);
136219820Sjeff	}
137219820Sjeff	if (nam == NULL) {
138219820Sjeff		null.sin_family = AF_INET;
139219820Sjeff		null.sin_len = sizeof(null);
140219820Sjeff		null.sin_addr.s_addr = INADDR_ANY;
141219820Sjeff		null.sin_port = 0;
142219820Sjeff		bzero(&null.sin_zero, sizeof(null.sin_zero));
143219820Sjeff		nam = (struct sockaddr *)&null;
144219820Sjeff	}
145219820Sjeff	error = -rdma_bind_addr(ssk->id, nam);
146219820Sjeff	SDP_WLOCK(ssk);
147219820Sjeff	if (error == 0) {
148219820Sjeff		sin = (struct sockaddr_in *)&ssk->id->route.addr.src_addr;
149219820Sjeff		ssk->laddr = sin->sin_addr.s_addr;
150219820Sjeff		ssk->lport = sin->sin_port;
151219820Sjeff	} else
152219820Sjeff		sdp_destroy_cma(ssk);
153219820Sjeff	return (error);
154219820Sjeff}
155219820Sjeff
156219820Sjeffstatic void
157219820Sjeffsdp_pcbfree(struct sdp_sock *ssk)
158219820Sjeff{
159219820Sjeff	KASSERT(ssk->socket == NULL, ("ssk %p socket still attached", ssk));
160219820Sjeff
161219820Sjeff	sdp_dbg(ssk->socket, "Freeing pcb");
162219820Sjeff	SDP_WLOCK_ASSERT(ssk);
163219820Sjeff	ssk->flags |= SDP_DESTROY;
164219820Sjeff	SDP_WUNLOCK(ssk);
165219820Sjeff	SDP_LIST_WLOCK();
166219820Sjeff	sdp_count--;
167219820Sjeff	LIST_REMOVE(ssk, list);
168219820Sjeff	SDP_LIST_WUNLOCK();
169219820Sjeff	crfree(ssk->cred);
170219820Sjeff	sdp_destroy_cma(ssk);
171219820Sjeff	ssk->qp_active = 0;
172219820Sjeff	if (ssk->qp) {
173219820Sjeff		ib_destroy_qp(ssk->qp);
174219820Sjeff		ssk->qp = NULL;
175219820Sjeff	}
176219820Sjeff	sdp_tx_ring_destroy(ssk);
177219820Sjeff	sdp_rx_ring_destroy(ssk);
178219820Sjeff	rw_destroy(&ssk->rx_ring.destroyed_lock);
179219820Sjeff	uma_zfree(sdp_zone, ssk);
180219820Sjeff	rw_destroy(&ssk->lock);
181219820Sjeff}
182219820Sjeff
183219820Sjeff/*
184219820Sjeff * Common routines to return a socket address.
185219820Sjeff */
186219820Sjeffstatic struct sockaddr *
187219820Sjeffsdp_sockaddr(in_port_t port, struct in_addr *addr_p)
188219820Sjeff{
189219820Sjeff	struct sockaddr_in *sin;
190219820Sjeff
191219820Sjeff	sin = malloc(sizeof *sin, M_SONAME,
192219820Sjeff		M_WAITOK | M_ZERO);
193219820Sjeff	sin->sin_family = AF_INET;
194219820Sjeff	sin->sin_len = sizeof(*sin);
195219820Sjeff	sin->sin_addr = *addr_p;
196219820Sjeff	sin->sin_port = port;
197219820Sjeff
198219820Sjeff	return (struct sockaddr *)sin;
199219820Sjeff}
200219820Sjeff
201219820Sjeffstatic int
202219820Sjeffsdp_getsockaddr(struct socket *so, struct sockaddr **nam)
203219820Sjeff{
204219820Sjeff	struct sdp_sock *ssk;
205219820Sjeff	struct in_addr addr;
206219820Sjeff	in_port_t port;
207219820Sjeff
208219820Sjeff	ssk = sdp_sk(so);
209219820Sjeff	SDP_RLOCK(ssk);
210219820Sjeff	port = ssk->lport;
211219820Sjeff	addr.s_addr = ssk->laddr;
212219820Sjeff	SDP_RUNLOCK(ssk);
213219820Sjeff
214219820Sjeff	*nam = sdp_sockaddr(port, &addr);
215219820Sjeff	return 0;
216219820Sjeff}
217219820Sjeff
218219820Sjeffstatic int
219219820Sjeffsdp_getpeeraddr(struct socket *so, struct sockaddr **nam)
220219820Sjeff{
221219820Sjeff	struct sdp_sock *ssk;
222219820Sjeff	struct in_addr addr;
223219820Sjeff	in_port_t port;
224219820Sjeff
225219820Sjeff	ssk = sdp_sk(so);
226219820Sjeff	SDP_RLOCK(ssk);
227219820Sjeff	port = ssk->fport;
228219820Sjeff	addr.s_addr = ssk->faddr;
229219820Sjeff	SDP_RUNLOCK(ssk);
230219820Sjeff
231219820Sjeff	*nam = sdp_sockaddr(port, &addr);
232219820Sjeff	return 0;
233219820Sjeff}
234219820Sjeff
235219820Sjeffstatic void
236219820Sjeffsdp_pcbnotifyall(struct in_addr faddr, int errno,
237219820Sjeff    struct sdp_sock *(*notify)(struct sdp_sock *, int))
238219820Sjeff{
239219820Sjeff	struct sdp_sock *ssk, *ssk_temp;
240219820Sjeff
241219820Sjeff	SDP_LIST_WLOCK();
242219820Sjeff	LIST_FOREACH_SAFE(ssk, &sdp_list, list, ssk_temp) {
243219820Sjeff		SDP_WLOCK(ssk);
244219820Sjeff		if (ssk->faddr != faddr.s_addr || ssk->socket == NULL) {
245219820Sjeff			SDP_WUNLOCK(ssk);
246219820Sjeff			continue;
247219820Sjeff		}
248219820Sjeff		if ((ssk->flags & SDP_DESTROY) == 0)
249219820Sjeff			if ((*notify)(ssk, errno))
250219820Sjeff				SDP_WUNLOCK(ssk);
251219820Sjeff	}
252219820Sjeff	SDP_LIST_WUNLOCK();
253219820Sjeff}
254219820Sjeff
#if 0
/*
 * Call func(ssk, arg) on every pcb in the global list.  The list is
 * read-locked for the walk and each pcb is write-locked around its call.
 * Currently compiled out.
 */
static void
sdp_apply_all(void (*func)(struct sdp_sock *, void *), void *arg)
{
	struct sdp_sock *cur;

	SDP_LIST_RLOCK();
	for (cur = LIST_FIRST(&sdp_list); cur != NULL;
	    cur = LIST_NEXT(cur, list)) {
		SDP_WLOCK(cur);
		func(cur, arg);
		SDP_WUNLOCK(cur);
	}
	SDP_LIST_RUNLOCK();
}
#endif
270219820Sjeff
271219820Sjeffstatic void
272219820Sjeffsdp_output_reset(struct sdp_sock *ssk)
273219820Sjeff{
274219820Sjeff	struct rdma_cm_id *id;
275219820Sjeff
276219820Sjeff	SDP_WLOCK_ASSERT(ssk);
277219820Sjeff	if (ssk->id) {
278219820Sjeff		id = ssk->id;
279219820Sjeff		ssk->qp_active = 0;
280219820Sjeff		SDP_WUNLOCK(ssk);
281219820Sjeff		rdma_disconnect(id);
282219820Sjeff		SDP_WLOCK(ssk);
283219820Sjeff	}
284219820Sjeff	ssk->state = TCPS_CLOSED;
285219820Sjeff}
286219820Sjeff
287219820Sjeff/*
288219820Sjeff * Attempt to close a SDP socket, marking it as dropped, and freeing
289219820Sjeff * the socket if we hold the only reference.
290219820Sjeff */
291219820Sjeffstatic struct sdp_sock *
292219820Sjeffsdp_closed(struct sdp_sock *ssk)
293219820Sjeff{
294219820Sjeff	struct socket *so;
295219820Sjeff
296219820Sjeff	SDP_WLOCK_ASSERT(ssk);
297219820Sjeff
298219820Sjeff	ssk->flags |= SDP_DROPPED;
299219820Sjeff	so = ssk->socket;
300219820Sjeff	soisdisconnected(so);
301219820Sjeff	if (ssk->flags & SDP_SOCKREF) {
302219820Sjeff		KASSERT(so->so_state & SS_PROTOREF,
303219820Sjeff		    ("sdp_closed: !SS_PROTOREF"));
304219820Sjeff		ssk->flags &= ~SDP_SOCKREF;
305219820Sjeff		SDP_WUNLOCK(ssk);
306219820Sjeff		ACCEPT_LOCK();
307219820Sjeff		SOCK_LOCK(so);
308219820Sjeff		so->so_state &= ~SS_PROTOREF;
309219820Sjeff		sofree(so);
310219820Sjeff		return (NULL);
311219820Sjeff	}
312219820Sjeff	return (ssk);
313219820Sjeff}
314219820Sjeff
315219820Sjeff/*
316219820Sjeff * Perform timer based shutdowns which can not operate in
317219820Sjeff * callout context.
318219820Sjeff */
319219820Sjeffstatic void
320219820Sjeffsdp_shutdown_task(void *data, int pending)
321219820Sjeff{
322219820Sjeff	struct sdp_sock *ssk;
323219820Sjeff
324219820Sjeff	ssk = data;
325219820Sjeff	SDP_WLOCK(ssk);
326219820Sjeff	/*
327219820Sjeff	 * I don't think this can race with another call to pcbfree()
328219820Sjeff	 * because SDP_TIMEWAIT protects it.  SDP_DESTROY may be redundant.
329219820Sjeff	 */
330219820Sjeff	if (ssk->flags & SDP_DESTROY)
331219820Sjeff		panic("sdp_shutdown_task: Racing with pcbfree for ssk %p",
332219820Sjeff		    ssk);
333219820Sjeff	if (ssk->flags & SDP_DISCON)
334219820Sjeff		sdp_output_reset(ssk);
335219820Sjeff	/* We have to clear this so sdp_detach() will call pcbfree(). */
336219820Sjeff	ssk->flags &= ~(SDP_TIMEWAIT | SDP_DREQWAIT);
337219820Sjeff	if ((ssk->flags & SDP_DROPPED) == 0 &&
338219820Sjeff	    sdp_closed(ssk) == NULL)
339219820Sjeff		return;
340219820Sjeff	if (ssk->socket == NULL) {
341219820Sjeff		sdp_pcbfree(ssk);
342219820Sjeff		return;
343219820Sjeff	}
344219820Sjeff	SDP_WUNLOCK(ssk);
345219820Sjeff}
346219820Sjeff
347219820Sjeff/*
348219820Sjeff * 2msl has expired, schedule the shutdown task.
349219820Sjeff */
350219820Sjeffstatic void
351219820Sjeffsdp_2msl_timeout(void *data)
352219820Sjeff{
353219820Sjeff	struct sdp_sock *ssk;
354219820Sjeff
355219820Sjeff	ssk = data;
356219820Sjeff	/* Callout canceled. */
357219820Sjeff        if (!callout_active(&ssk->keep2msl))
358219820Sjeff		goto out;
359219820Sjeff        callout_deactivate(&ssk->keep2msl);
360219820Sjeff	/* Should be impossible, defensive programming. */
361219820Sjeff	if ((ssk->flags & SDP_TIMEWAIT) == 0)
362219820Sjeff		goto out;
363219820Sjeff	taskqueue_enqueue(taskqueue_thread, &ssk->shutdown_task);
364219820Sjeffout:
365219820Sjeff	SDP_WUNLOCK(ssk);
366219820Sjeff	return;
367219820Sjeff}
368219820Sjeff
369219820Sjeff/*
370219820Sjeff * Schedule the 2msl wait timer.
371219820Sjeff */
372219820Sjeffstatic void
373219820Sjeffsdp_2msl_wait(struct sdp_sock *ssk)
374219820Sjeff{
375219820Sjeff
376219820Sjeff	SDP_WLOCK_ASSERT(ssk);
377219820Sjeff	ssk->flags |= SDP_TIMEWAIT;
378219820Sjeff	ssk->state = TCPS_TIME_WAIT;
379219820Sjeff	soisdisconnected(ssk->socket);
380219820Sjeff	callout_reset(&ssk->keep2msl, TCPTV_MSL, sdp_2msl_timeout, ssk);
381219820Sjeff}
382219820Sjeff
383219820Sjeff/*
384219820Sjeff * Timed out waiting for the final fin/ack from rdma_disconnect().
385219820Sjeff */
386219820Sjeffstatic void
387219820Sjeffsdp_dreq_timeout(void *data)
388219820Sjeff{
389219820Sjeff	struct sdp_sock *ssk;
390219820Sjeff
391219820Sjeff	ssk = data;
392219820Sjeff	/* Callout canceled. */
393219820Sjeff        if (!callout_active(&ssk->keep2msl))
394219820Sjeff		goto out;
395219820Sjeff	/* Callout rescheduled, probably as a different timer. */
396219820Sjeff	if (callout_pending(&ssk->keep2msl))
397219820Sjeff		goto out;
398219820Sjeff        callout_deactivate(&ssk->keep2msl);
399219820Sjeff	if (ssk->state != TCPS_FIN_WAIT_1 && ssk->state != TCPS_LAST_ACK)
400219820Sjeff		goto out;
401219820Sjeff	if ((ssk->flags & SDP_DREQWAIT) == 0)
402219820Sjeff		goto out;
403219820Sjeff	ssk->flags &= ~SDP_DREQWAIT;
404219820Sjeff	ssk->flags |= SDP_DISCON;
405219820Sjeff	sdp_2msl_wait(ssk);
406219820Sjeff	ssk->qp_active = 0;
407219820Sjeffout:
408219820Sjeff	SDP_WUNLOCK(ssk);
409219820Sjeff}
410219820Sjeff
411219820Sjeff/*
412219820Sjeff * Received the final fin/ack.  Cancel the 2msl.
413219820Sjeff */
414219820Sjeffvoid
415219820Sjeffsdp_cancel_dreq_wait_timeout(struct sdp_sock *ssk)
416219820Sjeff{
417219820Sjeff	sdp_dbg(ssk->socket, "cancelling dreq wait timeout\n");
418219820Sjeff	ssk->flags &= ~SDP_DREQWAIT;
419219820Sjeff	sdp_2msl_wait(ssk);
420219820Sjeff}
421219820Sjeff
422219820Sjeffstatic int
423219820Sjeffsdp_init_sock(struct socket *sk)
424219820Sjeff{
425219820Sjeff	struct sdp_sock *ssk = sdp_sk(sk);
426219820Sjeff
427219820Sjeff	sdp_dbg(sk, "%s\n", __func__);
428219820Sjeff
429219820Sjeff	callout_init_rw(&ssk->keep2msl, &ssk->lock, CALLOUT_RETURNUNLOCKED);
430219820Sjeff	TASK_INIT(&ssk->shutdown_task, 0, sdp_shutdown_task, ssk);
431219820Sjeff#ifdef SDP_ZCOPY
432219820Sjeff	INIT_DELAYED_WORK(&ssk->srcavail_cancel_work, srcavail_cancel_timeout);
433219820Sjeff	ssk->zcopy_thresh = -1; /* use global sdp_zcopy_thresh */
434219820Sjeff	ssk->tx_ring.rdma_inflight = NULL;
435219820Sjeff#endif
436219820Sjeff	atomic_set(&ssk->mseq_ack, 0);
437219820Sjeff	sdp_rx_ring_init(ssk);
438219820Sjeff	ssk->tx_ring.buffer = NULL;
439219820Sjeff
440219820Sjeff	return 0;
441219820Sjeff}
442219820Sjeff
443219820Sjeff/*
444219820Sjeff * Allocate an sdp_sock for the socket and reserve socket buffer space.
445219820Sjeff */
446219820Sjeffstatic int
447219820Sjeffsdp_attach(struct socket *so, int proto, struct thread *td)
448219820Sjeff{
449219820Sjeff	struct sdp_sock *ssk;
450219820Sjeff	int error;
451219820Sjeff
452219820Sjeff	ssk = sdp_sk(so);
453219820Sjeff	KASSERT(ssk == NULL, ("sdp_attach: ssk already set on so %p", so));
454219820Sjeff	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
455219820Sjeff		error = soreserve(so, sdp_sendspace, sdp_recvspace);
456219820Sjeff		if (error)
457219820Sjeff			return (error);
458219820Sjeff	}
459219820Sjeff	so->so_rcv.sb_flags |= SB_AUTOSIZE;
460219820Sjeff	so->so_snd.sb_flags |= SB_AUTOSIZE;
461219820Sjeff	ssk = uma_zalloc(sdp_zone, M_NOWAIT | M_ZERO);
462219820Sjeff	if (ssk == NULL)
463219820Sjeff		return (ENOBUFS);
464219820Sjeff	rw_init(&ssk->lock, "sdpsock");
465219820Sjeff	ssk->socket = so;
466219820Sjeff	ssk->cred = crhold(so->so_cred);
467219820Sjeff	so->so_pcb = (caddr_t)ssk;
468219820Sjeff	sdp_init_sock(so);
469219820Sjeff	ssk->flags = 0;
470219820Sjeff	ssk->qp_active = 0;
471219820Sjeff	ssk->state = TCPS_CLOSED;
472219820Sjeff	SDP_LIST_WLOCK();
473219820Sjeff	LIST_INSERT_HEAD(&sdp_list, ssk, list);
474219820Sjeff	sdp_count++;
475219820Sjeff	SDP_LIST_WUNLOCK();
476219820Sjeff	if ((so->so_options & SO_LINGER) && so->so_linger == 0)
477219820Sjeff		so->so_linger = TCP_LINGERTIME;
478219820Sjeff
479219820Sjeff	return (0);
480219820Sjeff}
481219820Sjeff
482219820Sjeff/*
483219820Sjeff * Detach SDP from the socket, potentially leaving it around for the
484219820Sjeff * timewait to expire.
485219820Sjeff */
486219820Sjeffstatic void
487219820Sjeffsdp_detach(struct socket *so)
488219820Sjeff{
489219820Sjeff	struct sdp_sock *ssk;
490219820Sjeff
491219820Sjeff	ssk = sdp_sk(so);
492219820Sjeff	SDP_WLOCK(ssk);
493219820Sjeff	KASSERT(ssk->socket != NULL, ("sdp_detach: socket is NULL"));
494219820Sjeff	ssk->socket->so_pcb = NULL;
495219820Sjeff	ssk->socket = NULL;
496219820Sjeff	if (ssk->flags & (SDP_TIMEWAIT | SDP_DREQWAIT))
497219820Sjeff		SDP_WUNLOCK(ssk);
498219820Sjeff	else if (ssk->flags & SDP_DROPPED || ssk->state < TCPS_SYN_SENT)
499219820Sjeff		sdp_pcbfree(ssk);
500219820Sjeff	else
501219820Sjeff		panic("sdp_detach: Unexpected state, ssk %p.\n", ssk);
502219820Sjeff}
503219820Sjeff
504219820Sjeff/*
505219820Sjeff * Allocate a local address for the socket.
506219820Sjeff */
507219820Sjeffstatic int
508219820Sjeffsdp_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
509219820Sjeff{
510219820Sjeff	int error = 0;
511219820Sjeff	struct sdp_sock *ssk;
512219820Sjeff	struct sockaddr_in *sin;
513219820Sjeff
514219820Sjeff	sin = (struct sockaddr_in *)nam;
515219820Sjeff	if (nam->sa_len != sizeof (*sin))
516219820Sjeff		return (EINVAL);
517219820Sjeff	if (sin->sin_family != AF_INET)
518219820Sjeff		return (EINVAL);
519219820Sjeff	if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr)))
520219820Sjeff		return (EAFNOSUPPORT);
521219820Sjeff
522219820Sjeff	ssk = sdp_sk(so);
523219820Sjeff	SDP_WLOCK(ssk);
524219820Sjeff	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
525219820Sjeff		error = EINVAL;
526219820Sjeff		goto out;
527219820Sjeff	}
528219820Sjeff	error = sdp_pcbbind(ssk, nam, td->td_ucred);
529219820Sjeffout:
530219820Sjeff	SDP_WUNLOCK(ssk);
531219820Sjeff
532219820Sjeff	return (error);
533219820Sjeff}
534219820Sjeff
535219820Sjeff/*
536219820Sjeff * Prepare to accept connections.
537219820Sjeff */
538219820Sjeffstatic int
539219820Sjeffsdp_listen(struct socket *so, int backlog, struct thread *td)
540219820Sjeff{
541219820Sjeff	int error = 0;
542219820Sjeff	struct sdp_sock *ssk;
543219820Sjeff
544219820Sjeff	ssk = sdp_sk(so);
545219820Sjeff	SDP_WLOCK(ssk);
546219820Sjeff	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
547219820Sjeff		error = EINVAL;
548219820Sjeff		goto out;
549219820Sjeff	}
550219820Sjeff	if (error == 0 && ssk->lport == 0)
551219820Sjeff		error = sdp_pcbbind(ssk, (struct sockaddr *)0, td->td_ucred);
552219820Sjeff	SOCK_LOCK(so);
553219820Sjeff	if (error == 0)
554219820Sjeff		error = solisten_proto_check(so);
555219820Sjeff	if (error == 0) {
556219820Sjeff		solisten_proto(so, backlog);
557219820Sjeff		ssk->state = TCPS_LISTEN;
558219820Sjeff	}
559219820Sjeff	SOCK_UNLOCK(so);
560219820Sjeff
561219820Sjeffout:
562219820Sjeff	SDP_WUNLOCK(ssk);
563219820Sjeff	if (error == 0)
564219820Sjeff		error = -rdma_listen(ssk->id, backlog);
565219820Sjeff	return (error);
566219820Sjeff}
567219820Sjeff
568219820Sjeff/*
569219820Sjeff * Initiate a SDP connection to nam.
570219820Sjeff */
571219820Sjeffstatic int
572219820Sjeffsdp_start_connect(struct sdp_sock *ssk, struct sockaddr *nam, struct thread *td)
573219820Sjeff{
574219820Sjeff	struct sockaddr_in src;
575219820Sjeff	struct socket *so;
576219820Sjeff	int error;
577219820Sjeff
578219820Sjeff	so = ssk->socket;
579219820Sjeff
580219820Sjeff	SDP_WLOCK_ASSERT(ssk);
581219820Sjeff	if (ssk->lport == 0) {
582219820Sjeff		error = sdp_pcbbind(ssk, (struct sockaddr *)0, td->td_ucred);
583219820Sjeff		if (error)
584219820Sjeff			return error;
585219820Sjeff	}
586219820Sjeff	src.sin_family = AF_INET;
587219820Sjeff	src.sin_len = sizeof(src);
588219820Sjeff	bzero(&src.sin_zero, sizeof(src.sin_zero));
589219820Sjeff	src.sin_port = ssk->lport;
590219820Sjeff	src.sin_addr.s_addr = ssk->laddr;
591219820Sjeff	soisconnecting(so);
592219820Sjeff	SDP_WUNLOCK(ssk);
593219820Sjeff	error = -rdma_resolve_addr(ssk->id, (struct sockaddr *)&src, nam,
594219820Sjeff	    SDP_RESOLVE_TIMEOUT);
595219820Sjeff	SDP_WLOCK(ssk);
596219820Sjeff	if (error == 0)
597219820Sjeff		ssk->state = TCPS_SYN_SENT;
598219820Sjeff
599219820Sjeff	return 0;
600219820Sjeff}
601219820Sjeff
602219820Sjeff/*
603219820Sjeff * Initiate SDP connection.
604219820Sjeff */
605219820Sjeffstatic int
606219820Sjeffsdp_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
607219820Sjeff{
608219820Sjeff	int error = 0;
609219820Sjeff	struct sdp_sock *ssk;
610219820Sjeff	struct sockaddr_in *sin;
611219820Sjeff
612219820Sjeff	sin = (struct sockaddr_in *)nam;
613219820Sjeff	if (nam->sa_len != sizeof (*sin))
614219820Sjeff		return (EINVAL);
615219820Sjeff	if (sin->sin_family != AF_INET)
616219820Sjeff		return (EINVAL);
617219820Sjeff	if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr)))
618219820Sjeff		return (EAFNOSUPPORT);
619219820Sjeff	if ((error = prison_remote_ip4(td->td_ucred, &sin->sin_addr)) != 0)
620219820Sjeff		return (error);
621219820Sjeff	ssk = sdp_sk(so);
622219820Sjeff	SDP_WLOCK(ssk);
623219820Sjeff	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED))
624219820Sjeff		error = EINVAL;
625219820Sjeff	else
626219820Sjeff		error = sdp_start_connect(ssk, nam, td);
627219820Sjeff	SDP_WUNLOCK(ssk);
628219820Sjeff	return (error);
629219820Sjeff}
630219820Sjeff
631219820Sjeff/*
632219820Sjeff * Drop a SDP socket, reporting
633219820Sjeff * the specified error.  If connection is synchronized,
634219820Sjeff * then send a RST to peer.
635219820Sjeff */
636219820Sjeffstatic struct sdp_sock *
637219820Sjeffsdp_drop(struct sdp_sock *ssk, int errno)
638219820Sjeff{
639219820Sjeff	struct socket *so;
640219820Sjeff
641219820Sjeff	SDP_WLOCK_ASSERT(ssk);
642219820Sjeff	so = ssk->socket;
643219820Sjeff	if (TCPS_HAVERCVDSYN(ssk->state))
644219820Sjeff		sdp_output_reset(ssk);
645219820Sjeff	if (errno == ETIMEDOUT && ssk->softerror)
646219820Sjeff		errno = ssk->softerror;
647219820Sjeff	so->so_error = errno;
648219820Sjeff	return (sdp_closed(ssk));
649219820Sjeff}
650219820Sjeff
651219820Sjeff/*
652219820Sjeff * User issued close, and wish to trail through shutdown states:
653219820Sjeff * if never received SYN, just forget it.  If got a SYN from peer,
654219820Sjeff * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN.
655219820Sjeff * If already got a FIN from peer, then almost done; go to LAST_ACK
656219820Sjeff * state.  In all other cases, have already sent FIN to peer (e.g.
657219820Sjeff * after PRU_SHUTDOWN), and just have to play tedious game waiting
658219820Sjeff * for peer to send FIN or not respond to keep-alives, etc.
659219820Sjeff * We can let the user exit from the close as soon as the FIN is acked.
660219820Sjeff */
661219820Sjeffstatic void
662219820Sjeffsdp_usrclosed(struct sdp_sock *ssk)
663219820Sjeff{
664219820Sjeff
665219820Sjeff	SDP_WLOCK_ASSERT(ssk);
666219820Sjeff
667219820Sjeff	switch (ssk->state) {
668219820Sjeff	case TCPS_LISTEN:
669219820Sjeff		ssk->state = TCPS_CLOSED;
670219820Sjeff		SDP_WUNLOCK(ssk);
671219820Sjeff		sdp_destroy_cma(ssk);
672219820Sjeff		SDP_WLOCK(ssk);
673219820Sjeff		/* FALLTHROUGH */
674219820Sjeff	case TCPS_CLOSED:
675219820Sjeff		ssk = sdp_closed(ssk);
676219820Sjeff		/*
677219820Sjeff		 * sdp_closed() should never return NULL here as the socket is
678219820Sjeff		 * still open.
679219820Sjeff		 */
680219820Sjeff		KASSERT(ssk != NULL,
681219820Sjeff		    ("sdp_usrclosed: sdp_closed() returned NULL"));
682219820Sjeff		break;
683219820Sjeff
684219820Sjeff	case TCPS_SYN_SENT:
685219820Sjeff		/* FALLTHROUGH */
686219820Sjeff	case TCPS_SYN_RECEIVED:
687219820Sjeff		ssk->flags |= SDP_NEEDFIN;
688219820Sjeff		break;
689219820Sjeff
690219820Sjeff	case TCPS_ESTABLISHED:
691219820Sjeff		ssk->flags |= SDP_NEEDFIN;
692219820Sjeff		ssk->state = TCPS_FIN_WAIT_1;
693219820Sjeff		break;
694219820Sjeff
695219820Sjeff	case TCPS_CLOSE_WAIT:
696219820Sjeff		ssk->state = TCPS_LAST_ACK;
697219820Sjeff		break;
698219820Sjeff	}
699219820Sjeff	if (ssk->state >= TCPS_FIN_WAIT_2) {
700219820Sjeff		/* Prevent the connection hanging in FIN_WAIT_2 forever. */
701219820Sjeff		if (ssk->state == TCPS_FIN_WAIT_2)
702219820Sjeff			sdp_2msl_wait(ssk);
703219820Sjeff		else
704219820Sjeff			soisdisconnected(ssk->socket);
705219820Sjeff	}
706219820Sjeff}
707219820Sjeff
708219820Sjeffstatic void
709219820Sjeffsdp_output_disconnect(struct sdp_sock *ssk)
710219820Sjeff{
711219820Sjeff
712219820Sjeff	SDP_WLOCK_ASSERT(ssk);
713219820Sjeff	callout_reset(&ssk->keep2msl, SDP_FIN_WAIT_TIMEOUT,
714219820Sjeff	    sdp_dreq_timeout, ssk);
715219820Sjeff	ssk->flags |= SDP_NEEDFIN | SDP_DREQWAIT;
716219820Sjeff	sdp_post_sends(ssk, M_NOWAIT);
717219820Sjeff}
718219820Sjeff
719219820Sjeff/*
720219820Sjeff * Initiate or continue a disconnect.
721219820Sjeff * If embryonic state, just send reset (once).
722219820Sjeff * If in ``let data drain'' option and linger null, just drop.
723219820Sjeff * Otherwise (hard), mark socket disconnecting and drop
724219820Sjeff * current input data; switch states based on user close, and
725219820Sjeff * send segment to peer (with FIN).
726219820Sjeff */
727219820Sjeffstatic void
728219820Sjeffsdp_start_disconnect(struct sdp_sock *ssk)
729219820Sjeff{
730219820Sjeff	struct socket *so;
731219820Sjeff	int unread;
732219820Sjeff
733219820Sjeff	so = ssk->socket;
734219820Sjeff	SDP_WLOCK_ASSERT(ssk);
735219820Sjeff	sdp_stop_keepalive_timer(so);
736219820Sjeff	/*
737219820Sjeff	 * Neither sdp_closed() nor sdp_drop() should return NULL, as the
738219820Sjeff	 * socket is still open.
739219820Sjeff	 */
740219820Sjeff	if (ssk->state < TCPS_ESTABLISHED) {
741219820Sjeff		ssk = sdp_closed(ssk);
742219820Sjeff		KASSERT(ssk != NULL,
743219820Sjeff		    ("sdp_start_disconnect: sdp_close() returned NULL"));
744219820Sjeff	} else if ((so->so_options & SO_LINGER) && so->so_linger == 0) {
745219820Sjeff		ssk = sdp_drop(ssk, 0);
746219820Sjeff		KASSERT(ssk != NULL,
747219820Sjeff		    ("sdp_start_disconnect: sdp_drop() returned NULL"));
748219820Sjeff	} else {
749219820Sjeff		soisdisconnecting(so);
750274421Sglebius		unread = sbused(&so->so_rcv);
751219820Sjeff		sbflush(&so->so_rcv);
752219820Sjeff		sdp_usrclosed(ssk);
753219820Sjeff		if (!(ssk->flags & SDP_DROPPED)) {
754219820Sjeff			if (unread)
755219820Sjeff				sdp_output_reset(ssk);
756219820Sjeff			else
757219820Sjeff				sdp_output_disconnect(ssk);
758219820Sjeff		}
759219820Sjeff	}
760219820Sjeff}
761219820Sjeff
762219820Sjeff/*
763219820Sjeff * User initiated disconnect.
764219820Sjeff */
765219820Sjeffstatic int
766219820Sjeffsdp_disconnect(struct socket *so)
767219820Sjeff{
768219820Sjeff	struct sdp_sock *ssk;
769219820Sjeff	int error = 0;
770219820Sjeff
771219820Sjeff	ssk = sdp_sk(so);
772219820Sjeff	SDP_WLOCK(ssk);
773219820Sjeff	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
774219820Sjeff		error = ECONNRESET;
775219820Sjeff		goto out;
776219820Sjeff	}
777219820Sjeff	sdp_start_disconnect(ssk);
778219820Sjeffout:
779219820Sjeff	SDP_WUNLOCK(ssk);
780219820Sjeff	return (error);
781219820Sjeff}
782219820Sjeff
783219820Sjeff/*
784219820Sjeff * Accept a connection.  Essentially all the work is done at higher levels;
785219820Sjeff * just return the address of the peer, storing through addr.
786219820Sjeff *
787219820Sjeff *
788219820Sjeff * XXX This is broken XXX
789219820Sjeff *
790219820Sjeff * The rationale for acquiring the sdp lock here is somewhat complicated,
791219820Sjeff * and is described in detail in the commit log entry for r175612.  Acquiring
792219820Sjeff * it delays an accept(2) racing with sonewconn(), which inserts the socket
793219820Sjeff * before the address/port fields are initialized.  A better fix would
794219820Sjeff * prevent the socket from being placed in the listen queue until all fields
795219820Sjeff * are fully initialized.
796219820Sjeff */
797219820Sjeffstatic int
798219820Sjeffsdp_accept(struct socket *so, struct sockaddr **nam)
799219820Sjeff{
800219820Sjeff	struct sdp_sock *ssk = NULL;
801219820Sjeff	struct in_addr addr;
802219820Sjeff	in_port_t port;
803219820Sjeff	int error;
804219820Sjeff
805219820Sjeff	if (so->so_state & SS_ISDISCONNECTED)
806219820Sjeff		return (ECONNABORTED);
807219820Sjeff
808219820Sjeff	port = 0;
809219820Sjeff	addr.s_addr = 0;
810219820Sjeff	error = 0;
811219820Sjeff	ssk = sdp_sk(so);
812219820Sjeff	SDP_WLOCK(ssk);
813219820Sjeff	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
814219820Sjeff		error = ECONNABORTED;
815219820Sjeff		goto out;
816219820Sjeff	}
817219820Sjeff	port = ssk->fport;
818219820Sjeff	addr.s_addr = ssk->faddr;
819219820Sjeffout:
820219820Sjeff	SDP_WUNLOCK(ssk);
821219820Sjeff	if (error == 0)
822219820Sjeff		*nam = sdp_sockaddr(port, &addr);
823219820Sjeff	return error;
824219820Sjeff}
825219820Sjeff
826219820Sjeff/*
827219820Sjeff * Mark the connection as being incapable of further output.
828219820Sjeff */
829219820Sjeffstatic int
830219820Sjeffsdp_shutdown(struct socket *so)
831219820Sjeff{
832219820Sjeff	int error = 0;
833219820Sjeff	struct sdp_sock *ssk;
834219820Sjeff
835219820Sjeff	ssk = sdp_sk(so);
836219820Sjeff	SDP_WLOCK(ssk);
837219820Sjeff	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
838219820Sjeff		error = ECONNRESET;
839219820Sjeff		goto out;
840219820Sjeff	}
841219820Sjeff	socantsendmore(so);
842219820Sjeff	sdp_usrclosed(ssk);
843219820Sjeff	if (!(ssk->flags & SDP_DROPPED))
844219820Sjeff		sdp_output_disconnect(ssk);
845219820Sjeff
846219820Sjeffout:
847219820Sjeff	SDP_WUNLOCK(ssk);
848219820Sjeff
849219820Sjeff	return (error);
850219820Sjeff}
851219820Sjeff
852219820Sjeffstatic void
853219820Sjeffsdp_append(struct sdp_sock *ssk, struct sockbuf *sb, struct mbuf *mb, int cnt)
854219820Sjeff{
855219820Sjeff	struct mbuf *n;
856219820Sjeff	int ncnt;
857219820Sjeff
858219820Sjeff	SOCKBUF_LOCK_ASSERT(sb);
859233198Sjhb	SBLASTRECORDCHK(sb);
860219820Sjeff	KASSERT(mb->m_flags & M_PKTHDR,
861219820Sjeff		("sdp_append: %p Missing packet header.\n", mb));
862219820Sjeff	n = sb->sb_lastrecord;
863219820Sjeff	/*
864219820Sjeff	 * If the queue is empty just set all pointers and proceed.
865219820Sjeff	 */
866219820Sjeff	if (n == NULL) {
867219820Sjeff		sb->sb_lastrecord = sb->sb_mb = sb->sb_sndptr = mb;
868219820Sjeff		for (; mb; mb = mb->m_next) {
869219820Sjeff	                sb->sb_mbtail = mb;
870219820Sjeff			sballoc(sb, mb);
871219820Sjeff		}
872219820Sjeff		return;
873219820Sjeff	}
874219820Sjeff	/*
875219820Sjeff	 * Count the number of mbufs in the current tail.
876219820Sjeff	 */
877219820Sjeff	for (ncnt = 0; n->m_next; n = n->m_next)
878219820Sjeff		ncnt++;
879219820Sjeff	n = sb->sb_lastrecord;
880219820Sjeff	/*
881219820Sjeff	 * If the two chains can fit in a single sdp packet and
882219820Sjeff	 * the last record has not been sent yet (WRITABLE) coalesce
883219820Sjeff	 * them.  The lastrecord remains the same but we must strip the
884219820Sjeff	 * packet header and then let sbcompress do the hard part.
885219820Sjeff	 */
886219820Sjeff	if (M_WRITABLE(n) && ncnt + cnt < SDP_MAX_SEND_SGES &&
887219820Sjeff	    n->m_pkthdr.len + mb->m_pkthdr.len - SDP_HEAD_SIZE <
888219820Sjeff	    ssk->xmit_size_goal) {
889219820Sjeff		m_adj(mb, SDP_HEAD_SIZE);
890219820Sjeff		n->m_pkthdr.len += mb->m_pkthdr.len;
891219820Sjeff		n->m_flags |= mb->m_flags & (M_PUSH | M_URG);
892275329Sglebius		m_demote(mb, 1, 0);
893219820Sjeff		sbcompress(sb, mb, sb->sb_mbtail);
894219820Sjeff		return;
895219820Sjeff	}
896219820Sjeff	/*
897219820Sjeff	 * Not compressible, just append to the end and adjust counters.
898219820Sjeff	 */
899219820Sjeff	sb->sb_lastrecord->m_flags |= M_PUSH;
900219820Sjeff	sb->sb_lastrecord->m_nextpkt = mb;
901219820Sjeff	sb->sb_lastrecord = mb;
902219820Sjeff	if (sb->sb_sndptr == NULL)
903219820Sjeff		sb->sb_sndptr = mb;
904219820Sjeff	for (; mb; mb = mb->m_next) {
905219820Sjeff		sb->sb_mbtail = mb;
906219820Sjeff		sballoc(sb, mb);
907219820Sjeff	}
908219820Sjeff}
909219820Sjeff
910219820Sjeff/*
911219820Sjeff * Do a send by putting data in output queue and updating urgent
912219820Sjeff * marker if URG set.  Possibly send more data.  Unlike the other
913219820Sjeff * pru_*() routines, the mbuf chains are our responsibility.  We
914219820Sjeff * must either enqueue them or free them.  The other pru_* routines
915219820Sjeff * generally are caller-frees.
916219820Sjeff *
917219820Sjeff * This comes from sendfile, normal sends will come from sdp_sosend().
918219820Sjeff */
919219820Sjeffstatic int
920219820Sjeffsdp_send(struct socket *so, int flags, struct mbuf *m,
921219820Sjeff    struct sockaddr *nam, struct mbuf *control, struct thread *td)
922219820Sjeff{
923219820Sjeff	struct sdp_sock *ssk;
924219820Sjeff	struct mbuf *n;
925219820Sjeff	int error;
926219820Sjeff	int cnt;
927219820Sjeff
928219820Sjeff	error = 0;
929219820Sjeff	ssk = sdp_sk(so);
930219820Sjeff	KASSERT(m->m_flags & M_PKTHDR,
931219820Sjeff	    ("sdp_send: %p no packet header", m));
932243882Sglebius	M_PREPEND(m, SDP_HEAD_SIZE, M_WAITOK);
933219820Sjeff	mtod(m, struct sdp_bsdh *)->mid = SDP_MID_DATA;
934219820Sjeff	for (n = m, cnt = 0; n->m_next; n = n->m_next)
935219820Sjeff		cnt++;
936219820Sjeff	if (cnt > SDP_MAX_SEND_SGES) {
937243882Sglebius		n = m_collapse(m, M_WAITOK, SDP_MAX_SEND_SGES);
938219820Sjeff		if (n == NULL) {
939219820Sjeff			m_freem(m);
940219820Sjeff			return (EMSGSIZE);
941219820Sjeff		}
942219820Sjeff		m = n;
943219820Sjeff		for (cnt = 0; n->m_next; n = n->m_next)
944219820Sjeff			cnt++;
945219820Sjeff	}
946219820Sjeff	SDP_WLOCK(ssk);
947219820Sjeff	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
948219820Sjeff		if (control)
949219820Sjeff			m_freem(control);
950219820Sjeff		if (m)
951219820Sjeff			m_freem(m);
952219820Sjeff		error = ECONNRESET;
953219820Sjeff		goto out;
954219820Sjeff	}
955219820Sjeff	if (control) {
956219820Sjeff		/* SDP doesn't support control messages. */
957219820Sjeff		if (control->m_len) {
958219820Sjeff			m_freem(control);
959219820Sjeff			if (m)
960219820Sjeff				m_freem(m);
961219820Sjeff			error = EINVAL;
962219820Sjeff			goto out;
963219820Sjeff		}
964219820Sjeff		m_freem(control);	/* empty control, just free it */
965219820Sjeff	}
966219820Sjeff	if (!(flags & PRUS_OOB)) {
967219820Sjeff		SOCKBUF_LOCK(&so->so_snd);
968219820Sjeff		sdp_append(ssk, &so->so_snd, m, cnt);
969219820Sjeff		SOCKBUF_UNLOCK(&so->so_snd);
970219820Sjeff		if (nam && ssk->state < TCPS_SYN_SENT) {
971219820Sjeff			/*
972219820Sjeff			 * Do implied connect if not yet connected.
973219820Sjeff			 */
974219820Sjeff			error = sdp_start_connect(ssk, nam, td);
975219820Sjeff			if (error)
976219820Sjeff				goto out;
977219820Sjeff		}
978219820Sjeff		if (flags & PRUS_EOF) {
979219820Sjeff			/*
980219820Sjeff			 * Close the send side of the connection after
981219820Sjeff			 * the data is sent.
982219820Sjeff			 */
983219820Sjeff			socantsendmore(so);
984219820Sjeff			sdp_usrclosed(ssk);
985219820Sjeff			if (!(ssk->flags & SDP_DROPPED))
986219820Sjeff				sdp_output_disconnect(ssk);
987219820Sjeff		} else if (!(ssk->flags & SDP_DROPPED) &&
988219820Sjeff		    !(flags & PRUS_MORETOCOME))
989219820Sjeff			sdp_post_sends(ssk, M_NOWAIT);
990219820Sjeff		SDP_WUNLOCK(ssk);
991219820Sjeff		return (0);
992219820Sjeff	} else {
993219820Sjeff		SOCKBUF_LOCK(&so->so_snd);
994219820Sjeff		if (sbspace(&so->so_snd) < -512) {
995219820Sjeff			SOCKBUF_UNLOCK(&so->so_snd);
996219820Sjeff			m_freem(m);
997219820Sjeff			error = ENOBUFS;
998219820Sjeff			goto out;
999219820Sjeff		}
1000219820Sjeff		/*
1001219820Sjeff		 * According to RFC961 (Assigned Protocols),
1002219820Sjeff		 * the urgent pointer points to the last octet
1003219820Sjeff		 * of urgent data.  We continue, however,
1004219820Sjeff		 * to consider it to indicate the first octet
1005219820Sjeff		 * of data past the urgent section.
1006219820Sjeff		 * Otherwise, snd_up should be one lower.
1007219820Sjeff		 */
1008219820Sjeff		m->m_flags |= M_URG | M_PUSH;
1009219820Sjeff		sdp_append(ssk, &so->so_snd, m, cnt);
1010219820Sjeff		SOCKBUF_UNLOCK(&so->so_snd);
1011219820Sjeff		if (nam && ssk->state < TCPS_SYN_SENT) {
1012219820Sjeff			/*
1013219820Sjeff			 * Do implied connect if not yet connected.
1014219820Sjeff			 */
1015219820Sjeff			error = sdp_start_connect(ssk, nam, td);
1016219820Sjeff			if (error)
1017219820Sjeff				goto out;
1018219820Sjeff		}
1019219820Sjeff		sdp_post_sends(ssk, M_NOWAIT);
1020219820Sjeff		SDP_WUNLOCK(ssk);
1021219820Sjeff		return (0);
1022219820Sjeff	}
1023219820Sjeffout:
1024219820Sjeff	SDP_WUNLOCK(ssk);
1025219820Sjeff	return (error);
1026219820Sjeff}
1027219820Sjeff
1028219820Sjeff#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
1029219820Sjeff
1030219820Sjeff/*
1031219820Sjeff * Send on a socket.  If send must go all at once and message is larger than
1032219820Sjeff * send buffering, then hard error.  Lock against other senders.  If must go
1033219820Sjeff * all at once and not enough room now, then inform user that this would
1034219820Sjeff * block and do nothing.  Otherwise, if nonblocking, send as much as
1035219820Sjeff * possible.  The data to be sent is described by "uio" if nonzero, otherwise
1036219820Sjeff * by the mbuf chain "top" (which must be null if uio is not).  Data provided
1037219820Sjeff * in mbuf chain must be small enough to send all at once.
1038219820Sjeff *
1039219820Sjeff * Returns nonzero on error, timeout or signal; callers must check for short
1040219820Sjeff * counts if EINTR/ERESTART are returned.  Data and control buffers are freed
1041219820Sjeff * on return.
1042219820Sjeff */
1043219820Sjeffstatic int
1044219820Sjeffsdp_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
1045219820Sjeff    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
1046219820Sjeff{
1047219820Sjeff	struct sdp_sock *ssk;
1048219820Sjeff	long space, resid;
1049219820Sjeff	int atomic;
1050219820Sjeff	int error;
1051219820Sjeff	int copy;
1052219820Sjeff
1053219820Sjeff	if (uio != NULL)
1054219820Sjeff		resid = uio->uio_resid;
1055219820Sjeff	else
1056219820Sjeff		resid = top->m_pkthdr.len;
1057219820Sjeff	atomic = top != NULL;
1058219820Sjeff	if (control != NULL) {
1059219820Sjeff		if (control->m_len) {
1060219820Sjeff			m_freem(control);
1061219820Sjeff			if (top)
1062219820Sjeff				m_freem(top);
1063219820Sjeff			return (EINVAL);
1064219820Sjeff		}
1065219820Sjeff		m_freem(control);
1066219820Sjeff		control = NULL;
1067219820Sjeff	}
1068219820Sjeff	/*
1069219820Sjeff	 * In theory resid should be unsigned.  However, space must be
1070219820Sjeff	 * signed, as it might be less than 0 if we over-committed, and we
1071219820Sjeff	 * must use a signed comparison of space and resid.  On the other
1072219820Sjeff	 * hand, a negative resid causes us to loop sending 0-length
1073219820Sjeff	 * segments to the protocol.
1074219820Sjeff	 *
1075219820Sjeff	 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
1076219820Sjeff	 * type sockets since that's an error.
1077219820Sjeff	 */
1078219820Sjeff	if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
1079219820Sjeff		error = EINVAL;
1080219820Sjeff		goto out;
1081219820Sjeff	}
1082219820Sjeff	if (td != NULL)
1083219820Sjeff		td->td_ru.ru_msgsnd++;
1084219820Sjeff
1085219820Sjeff	ssk = sdp_sk(so);
1086219820Sjeff	error = sblock(&so->so_snd, SBLOCKWAIT(flags));
1087219820Sjeff	if (error)
1088219820Sjeff		goto out;
1089219820Sjeff
1090219820Sjeffrestart:
1091219820Sjeff	do {
1092219820Sjeff		SOCKBUF_LOCK(&so->so_snd);
1093219820Sjeff		if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
1094219820Sjeff			SOCKBUF_UNLOCK(&so->so_snd);
1095219820Sjeff			error = EPIPE;
1096219820Sjeff			goto release;
1097219820Sjeff		}
1098219820Sjeff		if (so->so_error) {
1099219820Sjeff			error = so->so_error;
1100219820Sjeff			so->so_error = 0;
1101219820Sjeff			SOCKBUF_UNLOCK(&so->so_snd);
1102219820Sjeff			goto release;
1103219820Sjeff		}
1104219820Sjeff		if ((so->so_state & SS_ISCONNECTED) == 0 && addr == NULL) {
1105219820Sjeff			SOCKBUF_UNLOCK(&so->so_snd);
1106219820Sjeff			error = ENOTCONN;
1107219820Sjeff			goto release;
1108219820Sjeff		}
1109219820Sjeff		space = sbspace(&so->so_snd);
1110219820Sjeff		if (flags & MSG_OOB)
1111219820Sjeff			space += 1024;
1112219820Sjeff		if (atomic && resid > ssk->xmit_size_goal - SDP_HEAD_SIZE) {
1113219820Sjeff			SOCKBUF_UNLOCK(&so->so_snd);
1114219820Sjeff			error = EMSGSIZE;
1115219820Sjeff			goto release;
1116219820Sjeff		}
1117219820Sjeff		if (space < resid &&
1118219820Sjeff		    (atomic || space < so->so_snd.sb_lowat)) {
1119219820Sjeff			if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO)) {
1120219820Sjeff				SOCKBUF_UNLOCK(&so->so_snd);
1121219820Sjeff				error = EWOULDBLOCK;
1122219820Sjeff				goto release;
1123219820Sjeff			}
1124219820Sjeff			error = sbwait(&so->so_snd);
1125219820Sjeff			SOCKBUF_UNLOCK(&so->so_snd);
1126219820Sjeff			if (error)
1127219820Sjeff				goto release;
1128219820Sjeff			goto restart;
1129219820Sjeff		}
1130219820Sjeff		SOCKBUF_UNLOCK(&so->so_snd);
1131219820Sjeff		do {
1132219820Sjeff			if (uio == NULL) {
1133219820Sjeff				resid = 0;
1134219820Sjeff				if (flags & MSG_EOR)
1135219820Sjeff					top->m_flags |= M_EOR;
1136219820Sjeff			} else {
1137219820Sjeff				/*
1138219820Sjeff				 * Copy the data from userland into a mbuf
1139219820Sjeff				 * chain.  If no data is to be copied in,
1140219820Sjeff				 * a single empty mbuf is returned.
1141219820Sjeff				 */
1142219820Sjeff				copy = min(space,
1143219820Sjeff				    ssk->xmit_size_goal - SDP_HEAD_SIZE);
1144219820Sjeff				top = m_uiotombuf(uio, M_WAITOK, copy,
1145219820Sjeff				    0, M_PKTHDR |
1146219820Sjeff				    ((flags & MSG_EOR) ? M_EOR : 0));
1147219820Sjeff				if (top == NULL) {
1148219820Sjeff					/* only possible error */
1149219820Sjeff					error = EFAULT;
1150219820Sjeff					goto release;
1151219820Sjeff				}
1152219820Sjeff				space -= resid - uio->uio_resid;
1153219820Sjeff				resid = uio->uio_resid;
1154219820Sjeff			}
1155219820Sjeff			/*
1156219820Sjeff			 * XXX all the SBS_CANTSENDMORE checks previously
1157219820Sjeff			 * done could be out of date after dropping the
1158219820Sjeff			 * socket lock.
1159219820Sjeff			 */
1160219820Sjeff			error = sdp_send(so, (flags & MSG_OOB) ? PRUS_OOB :
1161219820Sjeff			/*
1162219820Sjeff			 * Set EOF on the last send if the user specified
1163219820Sjeff			 * MSG_EOF.
1164219820Sjeff			 */
1165219820Sjeff			    ((flags & MSG_EOF) && (resid <= 0)) ? PRUS_EOF :
1166219820Sjeff			/* If there is more to send set PRUS_MORETOCOME. */
1167219820Sjeff			    (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
1168219820Sjeff			    top, addr, NULL, td);
1169219820Sjeff			top = NULL;
1170219820Sjeff			if (error)
1171219820Sjeff				goto release;
1172219820Sjeff		} while (resid && space > 0);
1173219820Sjeff	} while (resid);
1174219820Sjeff
1175219820Sjeffrelease:
1176219820Sjeff	sbunlock(&so->so_snd);
1177219820Sjeffout:
1178219820Sjeff	if (top != NULL)
1179219820Sjeff		m_freem(top);
1180219820Sjeff	return (error);
1181219820Sjeff}
1182219820Sjeff
1183219820Sjeff/*
1184219820Sjeff * The part of soreceive() that implements reading non-inline out-of-band
1185219820Sjeff * data from a socket.  For more complete comments, see soreceive(), from
1186219820Sjeff * which this code originated.
1187219820Sjeff *
1188219820Sjeff * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is
1189219820Sjeff * unable to return an mbuf chain to the caller.
1190219820Sjeff */
1191219820Sjeffstatic int
1192219820Sjeffsoreceive_rcvoob(struct socket *so, struct uio *uio, int flags)
1193219820Sjeff{
1194219820Sjeff	struct protosw *pr = so->so_proto;
1195219820Sjeff	struct mbuf *m;
1196219820Sjeff	int error;
1197219820Sjeff
1198219820Sjeff	KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0"));
1199219820Sjeff
1200243882Sglebius	m = m_get(M_WAITOK, MT_DATA);
1201219820Sjeff	error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
1202219820Sjeff	if (error)
1203219820Sjeff		goto bad;
1204219820Sjeff	do {
1205219820Sjeff		error = uiomove(mtod(m, void *),
1206219820Sjeff		    (int) min(uio->uio_resid, m->m_len), uio);
1207219820Sjeff		m = m_free(m);
1208219820Sjeff	} while (uio->uio_resid && error == 0 && m);
1209219820Sjeffbad:
1210219820Sjeff	if (m != NULL)
1211219820Sjeff		m_freem(m);
1212219820Sjeff	return (error);
1213219820Sjeff}
1214219820Sjeff
1215219820Sjeff/*
1216219820Sjeff * Optimized version of soreceive() for stream (TCP) sockets.
1217219820Sjeff */
1218219820Sjeffstatic int
1219219820Sjeffsdp_sorecv(struct socket *so, struct sockaddr **psa, struct uio *uio,
1220219820Sjeff    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
1221219820Sjeff{
1222219820Sjeff	int len = 0, error = 0, flags, oresid;
1223219820Sjeff	struct sockbuf *sb;
1224219820Sjeff	struct mbuf *m, *n = NULL;
1225219820Sjeff	struct sdp_sock *ssk;
1226219820Sjeff
1227219820Sjeff	/* We only do stream sockets. */
1228219820Sjeff	if (so->so_type != SOCK_STREAM)
1229219820Sjeff		return (EINVAL);
1230219820Sjeff	if (psa != NULL)
1231219820Sjeff		*psa = NULL;
1232219820Sjeff	if (controlp != NULL)
1233219820Sjeff		return (EINVAL);
1234219820Sjeff	if (flagsp != NULL)
1235219820Sjeff		flags = *flagsp &~ MSG_EOR;
1236219820Sjeff	else
1237219820Sjeff		flags = 0;
1238219820Sjeff	if (flags & MSG_OOB)
1239219820Sjeff		return (soreceive_rcvoob(so, uio, flags));
1240219820Sjeff	if (mp0 != NULL)
1241219820Sjeff		*mp0 = NULL;
1242219820Sjeff
1243219820Sjeff	sb = &so->so_rcv;
1244219820Sjeff	ssk = sdp_sk(so);
1245219820Sjeff
1246219820Sjeff	/* Prevent other readers from entering the socket. */
1247219820Sjeff	error = sblock(sb, SBLOCKWAIT(flags));
1248219820Sjeff	if (error)
1249219820Sjeff		goto out;
1250219820Sjeff	SOCKBUF_LOCK(sb);
1251219820Sjeff
1252219820Sjeff	/* Easy one, no space to copyout anything. */
1253219820Sjeff	if (uio->uio_resid == 0) {
1254219820Sjeff		error = EINVAL;
1255219820Sjeff		goto out;
1256219820Sjeff	}
1257219820Sjeff	oresid = uio->uio_resid;
1258219820Sjeff
1259219820Sjeff	/* We will never ever get anything unless we are connected. */
1260219820Sjeff	if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) {
1261219820Sjeff		/* When disconnecting there may be still some data left. */
1262274421Sglebius		if (sbavail(sb))
1263219820Sjeff			goto deliver;
1264219820Sjeff		if (!(so->so_state & SS_ISDISCONNECTED))
1265219820Sjeff			error = ENOTCONN;
1266219820Sjeff		goto out;
1267219820Sjeff	}
1268219820Sjeff
1269219820Sjeff	/* Socket buffer is empty and we shall not block. */
1270274421Sglebius	if (sbavail(sb) == 0 &&
1271249066Sjhb	    ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) {
1272219820Sjeff		error = EAGAIN;
1273219820Sjeff		goto out;
1274219820Sjeff	}
1275219820Sjeff
1276219820Sjeffrestart:
1277219820Sjeff	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1278219820Sjeff
1279219820Sjeff	/* Abort if socket has reported problems. */
1280219820Sjeff	if (so->so_error) {
1281274421Sglebius		if (sbavail(sb))
1282219820Sjeff			goto deliver;
1283219820Sjeff		if (oresid > uio->uio_resid)
1284219820Sjeff			goto out;
1285219820Sjeff		error = so->so_error;
1286219820Sjeff		if (!(flags & MSG_PEEK))
1287219820Sjeff			so->so_error = 0;
1288219820Sjeff		goto out;
1289219820Sjeff	}
1290219820Sjeff
1291219820Sjeff	/* Door is closed.  Deliver what is left, if any. */
1292219820Sjeff	if (sb->sb_state & SBS_CANTRCVMORE) {
1293274421Sglebius		if (sbavail(sb))
1294219820Sjeff			goto deliver;
1295219820Sjeff		else
1296219820Sjeff			goto out;
1297219820Sjeff	}
1298219820Sjeff
1299219820Sjeff	/* Socket buffer got some data that we shall deliver now. */
1300274421Sglebius	if (sbavail(sb) && !(flags & MSG_WAITALL) &&
1301249066Sjhb	    ((so->so_state & SS_NBIO) ||
1302219820Sjeff	     (flags & (MSG_DONTWAIT|MSG_NBIO)) ||
1303274421Sglebius	     sbavail(sb) >= sb->sb_lowat ||
1304274421Sglebius	     sbavail(sb) >= uio->uio_resid ||
1305274421Sglebius	     sbavail(sb) >= sb->sb_hiwat) ) {
1306219820Sjeff		goto deliver;
1307219820Sjeff	}
1308219820Sjeff
1309219820Sjeff	/* On MSG_WAITALL we must wait until all data or error arrives. */
1310219820Sjeff	if ((flags & MSG_WAITALL) &&
1311274421Sglebius	    (sbavail(sb) >= uio->uio_resid || sbavail(sb) >= sb->sb_lowat))
1312219820Sjeff		goto deliver;
1313219820Sjeff
1314219820Sjeff	/*
1315219820Sjeff	 * Wait and block until (more) data comes in.
1316219820Sjeff	 * NB: Drops the sockbuf lock during wait.
1317219820Sjeff	 */
1318219820Sjeff	error = sbwait(sb);
1319219820Sjeff	if (error)
1320219820Sjeff		goto out;
1321219820Sjeff	goto restart;
1322219820Sjeff
1323219820Sjeffdeliver:
1324219820Sjeff	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1325274421Sglebius	KASSERT(sbavail(sb), ("%s: sockbuf empty", __func__));
1326219820Sjeff	KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__));
1327219820Sjeff
1328219820Sjeff	/* Statistics. */
1329219820Sjeff	if (uio->uio_td)
1330219820Sjeff		uio->uio_td->td_ru.ru_msgrcv++;
1331219820Sjeff
1332219820Sjeff	/* Fill uio until full or current end of socket buffer is reached. */
1333274421Sglebius	len = min(uio->uio_resid, sbavail(sb));
1334219820Sjeff	if (mp0 != NULL) {
1335219820Sjeff		/* Dequeue as many mbufs as possible. */
1336219820Sjeff		if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) {
1337219820Sjeff			for (*mp0 = m = sb->sb_mb;
1338219820Sjeff			     m != NULL && m->m_len <= len;
1339219820Sjeff			     m = m->m_next) {
1340219820Sjeff				len -= m->m_len;
1341219820Sjeff				uio->uio_resid -= m->m_len;
1342219820Sjeff				sbfree(sb, m);
1343219820Sjeff				n = m;
1344219820Sjeff			}
1345219820Sjeff			sb->sb_mb = m;
1346219820Sjeff			if (sb->sb_mb == NULL)
1347219820Sjeff				SB_EMPTY_FIXUP(sb);
1348219820Sjeff			n->m_next = NULL;
1349219820Sjeff		}
1350219820Sjeff		/* Copy the remainder. */
1351219820Sjeff		if (len > 0) {
1352219820Sjeff			KASSERT(sb->sb_mb != NULL,
1353219820Sjeff			    ("%s: len > 0 && sb->sb_mb empty", __func__));
1354219820Sjeff
1355243882Sglebius			m = m_copym(sb->sb_mb, 0, len, M_NOWAIT);
1356219820Sjeff			if (m == NULL)
1357219820Sjeff				len = 0;	/* Don't flush data from sockbuf. */
1358219820Sjeff			else
1359219820Sjeff				uio->uio_resid -= m->m_len;
1360219820Sjeff			if (*mp0 != NULL)
1361219820Sjeff				n->m_next = m;
1362219820Sjeff			else
1363219820Sjeff				*mp0 = m;
1364219820Sjeff			if (*mp0 == NULL) {
1365219820Sjeff				error = ENOBUFS;
1366219820Sjeff				goto out;
1367219820Sjeff			}
1368219820Sjeff		}
1369219820Sjeff	} else {
1370219820Sjeff		/* NB: Must unlock socket buffer as uiomove may sleep. */
1371219820Sjeff		SOCKBUF_UNLOCK(sb);
1372219820Sjeff		error = m_mbuftouio(uio, sb->sb_mb, len);
1373219820Sjeff		SOCKBUF_LOCK(sb);
1374219820Sjeff		if (error)
1375219820Sjeff			goto out;
1376219820Sjeff	}
1377219820Sjeff	SBLASTRECORDCHK(sb);
1378219820Sjeff	SBLASTMBUFCHK(sb);
1379219820Sjeff
1380219820Sjeff	/*
1381219820Sjeff	 * Remove the delivered data from the socket buffer unless we
1382219820Sjeff	 * were only peeking.
1383219820Sjeff	 */
1384219820Sjeff	if (!(flags & MSG_PEEK)) {
1385219820Sjeff		if (len > 0)
1386219820Sjeff			sbdrop_locked(sb, len);
1387219820Sjeff
1388219820Sjeff		/* Notify protocol that we drained some data. */
1389219820Sjeff		SOCKBUF_UNLOCK(sb);
1390219820Sjeff		SDP_WLOCK(ssk);
1391219820Sjeff		sdp_do_posts(ssk);
1392219820Sjeff		SDP_WUNLOCK(ssk);
1393219820Sjeff		SOCKBUF_LOCK(sb);
1394219820Sjeff	}
1395219820Sjeff
1396219820Sjeff	/*
1397219820Sjeff	 * For MSG_WAITALL we may have to loop again and wait for
1398219820Sjeff	 * more data to come in.
1399219820Sjeff	 */
1400219820Sjeff	if ((flags & MSG_WAITALL) && uio->uio_resid > 0)
1401219820Sjeff		goto restart;
1402219820Sjeffout:
1403219820Sjeff	SOCKBUF_LOCK_ASSERT(sb);
1404219820Sjeff	SBLASTRECORDCHK(sb);
1405219820Sjeff	SBLASTMBUFCHK(sb);
1406219820Sjeff	SOCKBUF_UNLOCK(sb);
1407219820Sjeff	sbunlock(sb);
1408219820Sjeff	return (error);
1409219820Sjeff}
1410219820Sjeff
1411219820Sjeff/*
1412219820Sjeff * Abort is used to teardown a connection typically while sitting in
1413219820Sjeff * the accept queue.
1414219820Sjeff */
1415219820Sjeffvoid
1416219820Sjeffsdp_abort(struct socket *so)
1417219820Sjeff{
1418219820Sjeff	struct sdp_sock *ssk;
1419219820Sjeff
1420219820Sjeff	ssk = sdp_sk(so);
1421219820Sjeff	SDP_WLOCK(ssk);
1422219820Sjeff	/*
1423219820Sjeff	 * If we have not yet dropped, do it now.
1424219820Sjeff	 */
1425219820Sjeff	if (!(ssk->flags & SDP_TIMEWAIT) &&
1426219820Sjeff	    !(ssk->flags & SDP_DROPPED))
1427219820Sjeff		sdp_drop(ssk, ECONNABORTED);
1428219820Sjeff	KASSERT(ssk->flags & SDP_DROPPED, ("sdp_abort: %p not dropped 0x%X",
1429219820Sjeff	    ssk, ssk->flags));
1430219820Sjeff	SDP_WUNLOCK(ssk);
1431219820Sjeff}
1432219820Sjeff
1433219820Sjeff/*
1434219820Sjeff * Close a SDP socket and initiate a friendly disconnect.
1435219820Sjeff */
1436219820Sjeffstatic void
1437219820Sjeffsdp_close(struct socket *so)
1438219820Sjeff{
1439219820Sjeff	struct sdp_sock *ssk;
1440219820Sjeff
1441219820Sjeff	ssk = sdp_sk(so);
1442219820Sjeff	SDP_WLOCK(ssk);
1443219820Sjeff	/*
1444219820Sjeff	 * If we have not yet dropped, do it now.
1445219820Sjeff	 */
1446219820Sjeff	if (!(ssk->flags & SDP_TIMEWAIT) &&
1447219820Sjeff	    !(ssk->flags & SDP_DROPPED))
1448219820Sjeff		sdp_start_disconnect(ssk);
1449219820Sjeff
1450219820Sjeff	/*
1451219820Sjeff	 * If we've still not dropped let the socket layer know we're
1452219820Sjeff	 * holding on to the socket and pcb for a while.
1453219820Sjeff	 */
1454219820Sjeff	if (!(ssk->flags & SDP_DROPPED)) {
1455219820Sjeff		SOCK_LOCK(so);
1456219820Sjeff		so->so_state |= SS_PROTOREF;
1457219820Sjeff		SOCK_UNLOCK(so);
1458219820Sjeff		ssk->flags |= SDP_SOCKREF;
1459219820Sjeff	}
1460219820Sjeff	SDP_WUNLOCK(ssk);
1461219820Sjeff}
1462219820Sjeff
1463219820Sjeff/*
1464219820Sjeff * User requests out-of-band data.
1465219820Sjeff */
1466219820Sjeffstatic int
1467219820Sjeffsdp_rcvoob(struct socket *so, struct mbuf *m, int flags)
1468219820Sjeff{
1469219820Sjeff	int error = 0;
1470219820Sjeff	struct sdp_sock *ssk;
1471219820Sjeff
1472219820Sjeff	ssk = sdp_sk(so);
1473219820Sjeff	SDP_WLOCK(ssk);
1474219820Sjeff	if (!rx_ring_trylock(&ssk->rx_ring)) {
1475219820Sjeff		SDP_WUNLOCK(ssk);
1476219820Sjeff		return (ECONNRESET);
1477219820Sjeff	}
1478219820Sjeff	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
1479219820Sjeff		error = ECONNRESET;
1480219820Sjeff		goto out;
1481219820Sjeff	}
1482219820Sjeff	if ((so->so_oobmark == 0 &&
1483219820Sjeff	     (so->so_rcv.sb_state & SBS_RCVATMARK) == 0) ||
1484219820Sjeff	    so->so_options & SO_OOBINLINE ||
1485219820Sjeff	    ssk->oobflags & SDP_HADOOB) {
1486219820Sjeff		error = EINVAL;
1487219820Sjeff		goto out;
1488219820Sjeff	}
1489219820Sjeff	if ((ssk->oobflags & SDP_HAVEOOB) == 0) {
1490219820Sjeff		error = EWOULDBLOCK;
1491219820Sjeff		goto out;
1492219820Sjeff	}
1493219820Sjeff	m->m_len = 1;
1494219820Sjeff	*mtod(m, caddr_t) = ssk->iobc;
1495219820Sjeff	if ((flags & MSG_PEEK) == 0)
1496219820Sjeff		ssk->oobflags ^= (SDP_HAVEOOB | SDP_HADOOB);
1497219820Sjeffout:
1498219820Sjeff	rx_ring_unlock(&ssk->rx_ring);
1499219820Sjeff	SDP_WUNLOCK(ssk);
1500219820Sjeff	return (error);
1501219820Sjeff}
1502219820Sjeff
1503219820Sjeffvoid
1504219820Sjeffsdp_urg(struct sdp_sock *ssk, struct mbuf *mb)
1505219820Sjeff{
1506219820Sjeff	struct mbuf *m;
1507219820Sjeff	struct socket *so;
1508219820Sjeff
1509219820Sjeff	so = ssk->socket;
1510219820Sjeff	if (so == NULL)
1511219820Sjeff		return;
1512219820Sjeff
1513274421Sglebius	so->so_oobmark = sbused(&so->so_rcv) + mb->m_pkthdr.len - 1;
1514219820Sjeff	sohasoutofband(so);
1515219820Sjeff	ssk->oobflags &= ~(SDP_HAVEOOB | SDP_HADOOB);
1516219820Sjeff	if (!(so->so_options & SO_OOBINLINE)) {
1517219820Sjeff		for (m = mb; m->m_next != NULL; m = m->m_next);
1518219820Sjeff		ssk->iobc = *(mtod(m, char *) + m->m_len - 1);
1519219820Sjeff		ssk->oobflags |= SDP_HAVEOOB;
1520219820Sjeff		m->m_len--;
1521219820Sjeff		mb->m_pkthdr.len--;
1522219820Sjeff	}
1523219820Sjeff}
1524219820Sjeff
1525219820Sjeff/*
1526219820Sjeff * Notify a sdp socket of an asynchronous error.
1527219820Sjeff *
1528219820Sjeff * Do not wake up user since there currently is no mechanism for
1529219820Sjeff * reporting soft errors (yet - a kqueue filter may be added).
1530219820Sjeff */
1531219820Sjeffstruct sdp_sock *
1532219820Sjeffsdp_notify(struct sdp_sock *ssk, int error)
1533219820Sjeff{
1534219820Sjeff
1535219820Sjeff	SDP_WLOCK_ASSERT(ssk);
1536219820Sjeff
1537219820Sjeff	if ((ssk->flags & SDP_TIMEWAIT) ||
1538219820Sjeff	    (ssk->flags & SDP_DROPPED))
1539219820Sjeff		return (ssk);
1540219820Sjeff
1541219820Sjeff	/*
1542219820Sjeff	 * Ignore some errors if we are hooked up.
1543219820Sjeff	 */
1544219820Sjeff	if (ssk->state == TCPS_ESTABLISHED &&
1545219820Sjeff	    (error == EHOSTUNREACH || error == ENETUNREACH ||
1546219820Sjeff	     error == EHOSTDOWN))
1547219820Sjeff		return (ssk);
1548219820Sjeff	ssk->softerror = error;
1549219820Sjeff	return sdp_drop(ssk, error);
1550219820Sjeff}
1551219820Sjeff
1552219820Sjeffstatic void
1553219820Sjeffsdp_ctlinput(int cmd, struct sockaddr *sa, void *vip)
1554219820Sjeff{
1555219820Sjeff	struct in_addr faddr;
1556219820Sjeff
1557219820Sjeff	faddr = ((struct sockaddr_in *)sa)->sin_addr;
1558219820Sjeff	if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY)
1559219820Sjeff		return;
1560219820Sjeff
1561219820Sjeff	sdp_pcbnotifyall(faddr, inetctlerrmap[cmd], sdp_notify);
1562219820Sjeff}
1563219820Sjeff
1564219820Sjeffstatic int
1565219820Sjeffsdp_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp,
1566219820Sjeff    struct thread *td)
1567219820Sjeff{
1568219820Sjeff	return (EOPNOTSUPP);
1569219820Sjeff}
1570219820Sjeff
1571219820Sjeffstatic void
1572219820Sjeffsdp_keepalive_timeout(void *data)
1573219820Sjeff{
1574219820Sjeff	struct sdp_sock *ssk;
1575219820Sjeff
1576219820Sjeff	ssk = data;
1577219820Sjeff	/* Callout canceled. */
1578219820Sjeff        if (!callout_active(&ssk->keep2msl))
1579219820Sjeff                return;
1580219820Sjeff	/* Callout rescheduled as a different kind of timer. */
1581219820Sjeff	if (callout_pending(&ssk->keep2msl))
1582219820Sjeff		goto out;
1583219820Sjeff        callout_deactivate(&ssk->keep2msl);
1584219820Sjeff	if (ssk->flags & SDP_DROPPED ||
1585219820Sjeff	    (ssk->socket->so_options & SO_KEEPALIVE) == 0)
1586219820Sjeff		goto out;
1587219820Sjeff	sdp_post_keepalive(ssk);
1588219820Sjeff	callout_reset(&ssk->keep2msl, SDP_KEEPALIVE_TIME,
1589219820Sjeff	    sdp_keepalive_timeout, ssk);
1590219820Sjeffout:
1591219820Sjeff	SDP_WUNLOCK(ssk);
1592219820Sjeff}
1593219820Sjeff
1594219820Sjeff
1595219820Sjeffvoid
1596219820Sjeffsdp_start_keepalive_timer(struct socket *so)
1597219820Sjeff{
1598219820Sjeff	struct sdp_sock *ssk;
1599219820Sjeff
1600219820Sjeff	ssk = sdp_sk(so);
1601219820Sjeff	if (!callout_pending(&ssk->keep2msl))
1602219820Sjeff                callout_reset(&ssk->keep2msl, SDP_KEEPALIVE_TIME,
1603219820Sjeff                    sdp_keepalive_timeout, ssk);
1604219820Sjeff}
1605219820Sjeff
1606219820Sjeffstatic void
1607219820Sjeffsdp_stop_keepalive_timer(struct socket *so)
1608219820Sjeff{
1609219820Sjeff	struct sdp_sock *ssk;
1610219820Sjeff
1611219820Sjeff	ssk = sdp_sk(so);
1612219820Sjeff	callout_stop(&ssk->keep2msl);
1613219820Sjeff}
1614219820Sjeff
1615219820Sjeff/*
1616219820Sjeff * sdp_ctloutput() must drop the inpcb lock before performing copyin on
1617219820Sjeff * socket option arguments.  When it re-acquires the lock after the copy, it
1618219820Sjeff * has to revalidate that the connection is still valid for the socket
1619219820Sjeff * option.
1620219820Sjeff */
1621219820Sjeff#define SDP_WLOCK_RECHECK(inp) do {					\
1622219820Sjeff	SDP_WLOCK(ssk);							\
1623219820Sjeff	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {		\
1624219820Sjeff		SDP_WUNLOCK(ssk);					\
1625219820Sjeff		return (ECONNRESET);					\
1626219820Sjeff	}								\
1627219820Sjeff} while(0)
1628219820Sjeff
1629219820Sjeffstatic int
1630219820Sjeffsdp_ctloutput(struct socket *so, struct sockopt *sopt)
1631219820Sjeff{
1632219820Sjeff	int	error, opt, optval;
1633219820Sjeff	struct sdp_sock *ssk;
1634219820Sjeff
1635219820Sjeff	error = 0;
1636219820Sjeff	ssk = sdp_sk(so);
1637219820Sjeff	if (sopt->sopt_level == SOL_SOCKET && sopt->sopt_name == SO_KEEPALIVE) {
1638219820Sjeff		SDP_WLOCK(ssk);
1639219820Sjeff		if (so->so_options & SO_KEEPALIVE)
1640219820Sjeff			sdp_start_keepalive_timer(so);
1641219820Sjeff		else
1642219820Sjeff			sdp_stop_keepalive_timer(so);
1643219820Sjeff		SDP_WUNLOCK(ssk);
1644219820Sjeff	}
1645219820Sjeff	if (sopt->sopt_level != IPPROTO_TCP)
1646219820Sjeff		return (error);
1647219820Sjeff
1648219820Sjeff	SDP_WLOCK(ssk);
1649219820Sjeff	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
1650219820Sjeff		SDP_WUNLOCK(ssk);
1651219820Sjeff		return (ECONNRESET);
1652219820Sjeff	}
1653219820Sjeff
1654219820Sjeff	switch (sopt->sopt_dir) {
1655219820Sjeff	case SOPT_SET:
1656219820Sjeff		switch (sopt->sopt_name) {
1657219820Sjeff		case TCP_NODELAY:
1658219820Sjeff			SDP_WUNLOCK(ssk);
1659219820Sjeff			error = sooptcopyin(sopt, &optval, sizeof optval,
1660219820Sjeff			    sizeof optval);
1661219820Sjeff			if (error)
1662219820Sjeff				return (error);
1663219820Sjeff
1664219820Sjeff			SDP_WLOCK_RECHECK(ssk);
1665219820Sjeff			opt = SDP_NODELAY;
1666219820Sjeff			if (optval)
1667219820Sjeff				ssk->flags |= opt;
1668219820Sjeff			else
1669219820Sjeff				ssk->flags &= ~opt;
1670219820Sjeff			sdp_do_posts(ssk);
1671219820Sjeff			SDP_WUNLOCK(ssk);
1672219820Sjeff			break;
1673219820Sjeff
1674219820Sjeff		default:
1675219820Sjeff			SDP_WUNLOCK(ssk);
1676219820Sjeff			error = ENOPROTOOPT;
1677219820Sjeff			break;
1678219820Sjeff		}
1679219820Sjeff		break;
1680219820Sjeff
1681219820Sjeff	case SOPT_GET:
1682219820Sjeff		switch (sopt->sopt_name) {
1683219820Sjeff		case TCP_NODELAY:
1684219820Sjeff			optval = ssk->flags & SDP_NODELAY;
1685219820Sjeff			SDP_WUNLOCK(ssk);
1686219820Sjeff			error = sooptcopyout(sopt, &optval, sizeof optval);
1687219820Sjeff			break;
1688219820Sjeff		default:
1689219820Sjeff			SDP_WUNLOCK(ssk);
1690219820Sjeff			error = ENOPROTOOPT;
1691219820Sjeff			break;
1692219820Sjeff		}
1693219820Sjeff		break;
1694219820Sjeff	}
1695219820Sjeff	return (error);
1696219820Sjeff}
1697219820Sjeff#undef SDP_WLOCK_RECHECK
1698219820Sjeff
1699219820Sjeffint sdp_mod_count = 0;
1700219820Sjeffint sdp_mod_usec = 0;
1701219820Sjeff
1702219820Sjeffvoid
1703219820Sjeffsdp_set_default_moderation(struct sdp_sock *ssk)
1704219820Sjeff{
1705278894Shselasky	struct ib_cq_attr attr;
1706219820Sjeff	if (sdp_mod_count <= 0 || sdp_mod_usec <= 0)
1707219820Sjeff		return;
1708278894Shselasky	memset(&attr, 0, sizeof(attr));
1709278894Shselasky	attr.moderation.cq_count = sdp_mod_count;
1710278894Shselasky	attr.moderation.cq_period = sdp_mod_usec;
1711278894Shselasky
1712278894Shselasky	ib_modify_cq(ssk->rx_ring.cq, &attr, IB_CQ_MODERATION);
1713219820Sjeff}
1714219820Sjeff
1715219820Sjeffstatic void
1716219820Sjeffsdp_dev_add(struct ib_device *device)
1717219820Sjeff{
1718219820Sjeff	struct ib_fmr_pool_param param;
1719219820Sjeff	struct sdp_device *sdp_dev;
1720219820Sjeff
1721219820Sjeff	sdp_dev = malloc(sizeof(*sdp_dev), M_SDP, M_WAITOK | M_ZERO);
1722219820Sjeff	sdp_dev->pd = ib_alloc_pd(device);
1723219820Sjeff	if (IS_ERR(sdp_dev->pd))
1724219820Sjeff		goto out_pd;
1725219820Sjeff        sdp_dev->mr = ib_get_dma_mr(sdp_dev->pd, IB_ACCESS_LOCAL_WRITE);
1726219820Sjeff        if (IS_ERR(sdp_dev->mr))
1727219820Sjeff		goto out_mr;
1728219820Sjeff	memset(&param, 0, sizeof param);
1729219820Sjeff	param.max_pages_per_fmr = SDP_FMR_SIZE;
1730219820Sjeff	param.page_shift = PAGE_SHIFT;
1731219820Sjeff	param.access = (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ);
1732219820Sjeff	param.pool_size = SDP_FMR_POOL_SIZE;
1733219820Sjeff	param.dirty_watermark = SDP_FMR_DIRTY_SIZE;
1734219820Sjeff	param.cache = 1;
1735219820Sjeff	sdp_dev->fmr_pool = ib_create_fmr_pool(sdp_dev->pd, &param);
1736219820Sjeff	if (IS_ERR(sdp_dev->fmr_pool))
1737219820Sjeff		goto out_fmr;
1738219820Sjeff	ib_set_client_data(device, &sdp_client, sdp_dev);
1739219820Sjeff	return;
1740219820Sjeff
1741219820Sjeffout_fmr:
1742219820Sjeff	ib_dereg_mr(sdp_dev->mr);
1743219820Sjeffout_mr:
1744219820Sjeff	ib_dealloc_pd(sdp_dev->pd);
1745219820Sjeffout_pd:
1746219820Sjeff	free(sdp_dev, M_SDP);
1747219820Sjeff}
1748219820Sjeff
1749219820Sjeffstatic void
1750219820Sjeffsdp_dev_rem(struct ib_device *device)
1751219820Sjeff{
1752219820Sjeff	struct sdp_device *sdp_dev;
1753219820Sjeff	struct sdp_sock *ssk;
1754219820Sjeff
1755219820Sjeff	SDP_LIST_WLOCK();
1756219820Sjeff	LIST_FOREACH(ssk, &sdp_list, list) {
1757219820Sjeff		if (ssk->ib_device != device)
1758219820Sjeff			continue;
1759219820Sjeff		SDP_WLOCK(ssk);
1760219820Sjeff		if ((ssk->flags & SDP_DESTROY) == 0)
1761219820Sjeff			ssk = sdp_notify(ssk, ECONNRESET);
1762219820Sjeff		if (ssk)
1763219820Sjeff			SDP_WUNLOCK(ssk);
1764219820Sjeff	}
1765219820Sjeff	SDP_LIST_WUNLOCK();
1766219820Sjeff	/*
1767219820Sjeff	 * XXX Do I need to wait between these two?
1768219820Sjeff	 */
1769219820Sjeff	sdp_dev = ib_get_client_data(device, &sdp_client);
1770219820Sjeff	if (!sdp_dev)
1771219820Sjeff		return;
1772219820Sjeff	ib_flush_fmr_pool(sdp_dev->fmr_pool);
1773219820Sjeff	ib_destroy_fmr_pool(sdp_dev->fmr_pool);
1774219820Sjeff	ib_dereg_mr(sdp_dev->mr);
1775219820Sjeff	ib_dealloc_pd(sdp_dev->pd);
1776219820Sjeff	free(sdp_dev, M_SDP);
1777219820Sjeff}
1778219820Sjeff
1779219820Sjeffstruct ib_client sdp_client =
1780219820Sjeff    { .name = "sdp", .add = sdp_dev_add, .remove = sdp_dev_rem };
1781219820Sjeff
1782219820Sjeff
1783219820Sjeffstatic int
1784219820Sjeffsdp_pcblist(SYSCTL_HANDLER_ARGS)
1785219820Sjeff{
1786219820Sjeff	int error, n, i;
1787219820Sjeff	struct sdp_sock *ssk;
1788219820Sjeff	struct xinpgen xig;
1789219820Sjeff
1790219820Sjeff	/*
1791219820Sjeff	 * The process of preparing the TCB list is too time-consuming and
1792219820Sjeff	 * resource-intensive to repeat twice on every request.
1793219820Sjeff	 */
1794219820Sjeff	if (req->oldptr == NULL) {
1795219820Sjeff		n = sdp_count;
1796219820Sjeff		n += imax(n / 8, 10);
1797219820Sjeff		req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xtcpcb);
1798219820Sjeff		return (0);
1799219820Sjeff	}
1800219820Sjeff
1801219820Sjeff	if (req->newptr != NULL)
1802219820Sjeff		return (EPERM);
1803219820Sjeff
1804219820Sjeff	/*
1805219820Sjeff	 * OK, now we're committed to doing something.
1806219820Sjeff	 */
1807219820Sjeff	SDP_LIST_RLOCK();
1808219820Sjeff	n = sdp_count;
1809219820Sjeff	SDP_LIST_RUNLOCK();
1810219820Sjeff
1811219820Sjeff	error = sysctl_wire_old_buffer(req, 2 * (sizeof xig)
1812219820Sjeff		+ n * sizeof(struct xtcpcb));
1813219820Sjeff	if (error != 0)
1814219820Sjeff		return (error);
1815219820Sjeff
1816219820Sjeff	xig.xig_len = sizeof xig;
1817219820Sjeff	xig.xig_count = n;
1818219820Sjeff	xig.xig_gen = 0;
1819219820Sjeff	xig.xig_sogen = so_gencnt;
1820219820Sjeff	error = SYSCTL_OUT(req, &xig, sizeof xig);
1821219820Sjeff	if (error)
1822219820Sjeff		return (error);
1823219820Sjeff
1824219820Sjeff	SDP_LIST_RLOCK();
1825219820Sjeff	for (ssk = LIST_FIRST(&sdp_list), i = 0;
1826219820Sjeff	    ssk != NULL && i < n; ssk = LIST_NEXT(ssk, list)) {
1827219820Sjeff		struct xtcpcb xt;
1828219820Sjeff
1829219820Sjeff		SDP_RLOCK(ssk);
1830219820Sjeff		if (ssk->flags & SDP_TIMEWAIT) {
1831219820Sjeff			if (ssk->cred != NULL)
1832219820Sjeff				error = cr_cansee(req->td->td_ucred,
1833219820Sjeff				    ssk->cred);
1834219820Sjeff			else
1835219820Sjeff				error = EINVAL;	/* Skip this inp. */
1836219820Sjeff		} else if (ssk->socket)
1837219820Sjeff			error = cr_canseesocket(req->td->td_ucred,
1838219820Sjeff			    ssk->socket);
1839219820Sjeff		else
1840219820Sjeff			error = EINVAL;
1841219820Sjeff		if (error) {
1842219820Sjeff			error = 0;
1843219820Sjeff			goto next;
1844219820Sjeff		}
1845219820Sjeff
1846219820Sjeff		bzero(&xt, sizeof(xt));
1847219820Sjeff		xt.xt_len = sizeof xt;
1848219820Sjeff		xt.xt_inp.inp_gencnt = 0;
1849219820Sjeff		xt.xt_inp.inp_vflag = INP_IPV4;
1850219820Sjeff		memcpy(&xt.xt_inp.inp_laddr, &ssk->laddr, sizeof(ssk->laddr));
1851219820Sjeff		xt.xt_inp.inp_lport = ssk->lport;
1852219820Sjeff		memcpy(&xt.xt_inp.inp_faddr, &ssk->faddr, sizeof(ssk->faddr));
1853219820Sjeff		xt.xt_inp.inp_fport = ssk->fport;
1854219820Sjeff		xt.xt_tp.t_state = ssk->state;
1855219820Sjeff		if (ssk->socket != NULL)
1856219820Sjeff			sotoxsocket(ssk->socket, &xt.xt_socket);
1857219820Sjeff		else
1858219820Sjeff			bzero(&xt.xt_socket, sizeof xt.xt_socket);
1859219820Sjeff		xt.xt_socket.xso_protocol = IPPROTO_TCP;
1860219820Sjeff		SDP_RUNLOCK(ssk);
1861219820Sjeff		error = SYSCTL_OUT(req, &xt, sizeof xt);
1862219820Sjeff		if (error)
1863219820Sjeff			break;
1864219820Sjeff		i++;
1865219820Sjeff		continue;
1866219820Sjeffnext:
1867219820Sjeff		SDP_RUNLOCK(ssk);
1868219820Sjeff	}
1869219820Sjeff	if (!error) {
1870219820Sjeff		/*
1871219820Sjeff		 * Give the user an updated idea of our state.
1872219820Sjeff		 * If the generation differs from what we told
1873219820Sjeff		 * her before, she knows that something happened
1874219820Sjeff		 * while we were processing this request, and it
1875219820Sjeff		 * might be necessary to retry.
1876219820Sjeff		 */
1877219820Sjeff		xig.xig_gen = 0;
1878219820Sjeff		xig.xig_sogen = so_gencnt;
1879219820Sjeff		xig.xig_count = sdp_count;
1880219820Sjeff		error = SYSCTL_OUT(req, &xig, sizeof xig);
1881219820Sjeff	}
1882219820Sjeff	SDP_LIST_RUNLOCK();
1883219820Sjeff	return (error);
1884219820Sjeff}
1885219820Sjeff
1886227309Sedstatic SYSCTL_NODE(_net_inet, -1,  sdp,    CTLFLAG_RW, 0,  "SDP");
1887219820Sjeff
1888219820SjeffSYSCTL_PROC(_net_inet_sdp, TCPCTL_PCBLIST, pcblist,
1889219820Sjeff    CTLFLAG_RD | CTLTYPE_STRUCT, 0, 0, sdp_pcblist, "S,xtcpcb",
1890219820Sjeff    "List of active SDP connections");
1891219820Sjeff
1892219820Sjeffstatic void
1893219820Sjeffsdp_zone_change(void *tag)
1894219820Sjeff{
1895219820Sjeff
1896219820Sjeff	uma_zone_set_max(sdp_zone, maxsockets);
1897219820Sjeff}
1898219820Sjeff
1899219820Sjeffstatic void
1900219820Sjeffsdp_init(void)
1901219820Sjeff{
1902219820Sjeff
1903219820Sjeff	LIST_INIT(&sdp_list);
1904219820Sjeff	sdp_zone = uma_zcreate("sdp_sock", sizeof(struct sdp_sock),
1905219820Sjeff	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
1906219820Sjeff	uma_zone_set_max(sdp_zone, maxsockets);
1907219820Sjeff	EVENTHANDLER_REGISTER(maxsockets_change, sdp_zone_change, NULL,
1908219820Sjeff		EVENTHANDLER_PRI_ANY);
1909219820Sjeff	rx_comp_wq = create_singlethread_workqueue("rx_comp_wq");
1910219820Sjeff	ib_register_client(&sdp_client);
1911219820Sjeff}
1912219820Sjeff
1913219820Sjeffextern struct domain sdpdomain;
1914219820Sjeff
1915219820Sjeffstruct pr_usrreqs sdp_usrreqs = {
1916219820Sjeff	.pru_abort =		sdp_abort,
1917219820Sjeff	.pru_accept =		sdp_accept,
1918219820Sjeff	.pru_attach =		sdp_attach,
1919219820Sjeff	.pru_bind =		sdp_bind,
1920219820Sjeff	.pru_connect =		sdp_connect,
1921219820Sjeff	.pru_control =		sdp_control,
1922219820Sjeff	.pru_detach =		sdp_detach,
1923219820Sjeff	.pru_disconnect =	sdp_disconnect,
1924219820Sjeff	.pru_listen =		sdp_listen,
1925219820Sjeff	.pru_peeraddr =		sdp_getpeeraddr,
1926219820Sjeff	.pru_rcvoob =		sdp_rcvoob,
1927219820Sjeff	.pru_send =		sdp_send,
1928219820Sjeff	.pru_sosend =		sdp_sosend,
1929219820Sjeff	.pru_soreceive =	sdp_sorecv,
1930219820Sjeff	.pru_shutdown =		sdp_shutdown,
1931219820Sjeff	.pru_sockaddr =		sdp_getsockaddr,
1932219820Sjeff	.pru_close =		sdp_close,
1933219820Sjeff};
1934219820Sjeff
1935219820Sjeffstruct protosw sdpsw[] = {
1936219820Sjeff{
1937219820Sjeff	.pr_type =		SOCK_STREAM,
1938219820Sjeff	.pr_domain =		&sdpdomain,
1939219820Sjeff	.pr_protocol =		IPPROTO_IP,
1940219820Sjeff	.pr_flags =		PR_CONNREQUIRED|PR_IMPLOPCL|PR_WANTRCVD,
1941219820Sjeff	.pr_ctlinput =		sdp_ctlinput,
1942219820Sjeff	.pr_ctloutput =		sdp_ctloutput,
1943219820Sjeff	.pr_usrreqs =		&sdp_usrreqs
1944219820Sjeff},
1945219820Sjeff{
1946219820Sjeff	.pr_type =		SOCK_STREAM,
1947219820Sjeff	.pr_domain =		&sdpdomain,
1948219820Sjeff	.pr_protocol =		IPPROTO_TCP,
1949219820Sjeff	.pr_flags =		PR_CONNREQUIRED|PR_IMPLOPCL|PR_WANTRCVD,
1950219820Sjeff	.pr_ctlinput =		sdp_ctlinput,
1951219820Sjeff	.pr_ctloutput =		sdp_ctloutput,
1952219820Sjeff	.pr_usrreqs =		&sdp_usrreqs
1953219820Sjeff},
1954219820Sjeff};
1955219820Sjeff
1956219820Sjeffstruct domain sdpdomain = {
1957219820Sjeff	.dom_family =		AF_INET_SDP,
1958219820Sjeff	.dom_name =		"SDP",
1959219820Sjeff	.dom_init =		sdp_init,
1960219820Sjeff	.dom_protosw =		sdpsw,
1961219820Sjeff	.dom_protoswNPROTOSW =	&sdpsw[sizeof(sdpsw)/sizeof(sdpsw[0])],
1962219820Sjeff};
1963219820Sjeff
1964219820SjeffDOMAIN_SET(sdp);
1965219820Sjeff
1966219820Sjeffint sdp_debug_level = 1;
1967219820Sjeffint sdp_data_debug_level = 0;
1968