tcp_usrreq.c revision 127862
1/*
2 * Copyright (c) 1982, 1986, 1988, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 * 3. All advertising materials mentioning features or use of this software
14 *    must display the following acknowledgement:
15 *	This product includes software developed by the University of
16 *	California, Berkeley and its contributors.
17 * 4. Neither the name of the University nor the names of its contributors
18 *    may be used to endorse or promote products derived from this software
19 *    without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31 * SUCH DAMAGE.
32 *
33 *	From: @(#)tcp_usrreq.c	8.2 (Berkeley) 1/3/94
34 * $FreeBSD: head/sys/netinet/tcp_usrreq.c 127862 2004-04-04 20:14:55Z pjd $
35 */
36
37#include "opt_ipsec.h"
38#include "opt_inet.h"
39#include "opt_inet6.h"
40#include "opt_tcpdebug.h"
41
42#include <sys/param.h>
43#include <sys/systm.h>
44#include <sys/malloc.h>
45#include <sys/kernel.h>
46#include <sys/sysctl.h>
47#include <sys/mbuf.h>
48#ifdef INET6
49#include <sys/domain.h>
50#endif /* INET6 */
51#include <sys/socket.h>
52#include <sys/socketvar.h>
53#include <sys/protosw.h>
54#include <sys/proc.h>
55#include <sys/jail.h>
56
57#include <net/if.h>
58#include <net/route.h>
59
60#include <netinet/in.h>
61#include <netinet/in_systm.h>
62#ifdef INET6
63#include <netinet/ip6.h>
64#endif
65#include <netinet/in_pcb.h>
66#ifdef INET6
67#include <netinet6/in6_pcb.h>
68#endif
69#include <netinet/in_var.h>
70#include <netinet/ip_var.h>
71#ifdef INET6
72#include <netinet6/ip6_var.h>
73#endif
74#include <netinet/tcp.h>
75#include <netinet/tcp_fsm.h>
76#include <netinet/tcp_seq.h>
77#include <netinet/tcp_timer.h>
78#include <netinet/tcp_var.h>
79#include <netinet/tcpip.h>
80#ifdef TCPDEBUG
81#include <netinet/tcp_debug.h>
82#endif
83
84#ifdef IPSEC
85#include <netinet6/ipsec.h>
86#endif /*IPSEC*/
87
/*
 * TCP protocol interface to the socket abstraction.
 *
 * Forward declarations for the file-local helpers that the pru_*()
 * entry points below rely on.
 */
extern char	*tcpstates[];	/* XXX ??? */

static int		 tcp_attach(struct socket *so);
static int		 tcp_connect(struct tcpcb *tp, struct sockaddr *nam,
			    struct thread *td);
#ifdef INET6
static int		 tcp6_connect(struct tcpcb *tp, struct sockaddr *nam,
			    struct thread *td);
#endif /* INET6 */
static struct tcpcb	*tcp_disconnect(struct tcpcb *tp);
static struct tcpcb	*tcp_usrclosed(struct tcpcb *tp);
104
/*
 * Tracing hooks used by the user-request entry points.
 *
 * TCPDEBUG0     declares the local `ostate' used by the other two.
 * TCPDEBUG1()   records the current state of `tp' (0 if tp is NULL).
 * TCPDEBUG2(req) calls tcp_trace() for request `req' when `tp' is set
 *               and the socket `so' has SO_DEBUG enabled.
 *
 * The macros reference the caller's `tp' and `so' variables by name.
 * TCPDEBUG2 is wrapped in do { } while (0) so that it behaves as a
 * single statement and cannot capture a following `else'.
 * Without TCPDEBUG all three expand to nothing.
 */
#ifdef TCPDEBUG
#define	TCPDEBUG0	int ostate = 0
#define	TCPDEBUG1()	(ostate = tp ? tp->t_state : 0)
#define	TCPDEBUG2(req)	do {						\
	if (tp && (so->so_options & SO_DEBUG))				\
		tcp_trace(TA_USER, ostate, tp, 0, 0, req);		\
} while (0)
#else
#define	TCPDEBUG0
#define	TCPDEBUG1()
#define	TCPDEBUG2(req)
#endif
115
116/*
117 * TCP attaches to socket via pru_attach(), reserving space,
118 * and an internet control block.
119 */
120static int
121tcp_usr_attach(struct socket *so, int proto, struct thread *td)
122{
123	int s = splnet();
124	int error;
125	struct inpcb *inp;
126	struct tcpcb *tp = 0;
127	TCPDEBUG0;
128
129	INP_INFO_WLOCK(&tcbinfo);
130	TCPDEBUG1();
131	inp = sotoinpcb(so);
132	if (inp) {
133		error = EISCONN;
134		goto out;
135	}
136
137	error = tcp_attach(so);
138	if (error)
139		goto out;
140
141	if ((so->so_options & SO_LINGER) && so->so_linger == 0)
142		so->so_linger = TCP_LINGERTIME;
143
144	inp = sotoinpcb(so);
145	tp = intotcpcb(inp);
146out:
147	TCPDEBUG2(PRU_ATTACH);
148	INP_INFO_WUNLOCK(&tcbinfo);
149	splx(s);
150	return error;
151}
152
153/*
154 * pru_detach() detaches the TCP protocol from the socket.
155 * If the protocol state is non-embryonic, then can't
156 * do this directly: have to initiate a pru_disconnect(),
157 * which may finish later; embryonic TCB's can just
158 * be discarded here.
159 */
160static int
161tcp_usr_detach(struct socket *so)
162{
163	int s = splnet();
164	int error = 0;
165	struct inpcb *inp;
166	struct tcpcb *tp;
167	TCPDEBUG0;
168
169	INP_INFO_WLOCK(&tcbinfo);
170	inp = sotoinpcb(so);
171	if (inp == 0) {
172		INP_INFO_WUNLOCK(&tcbinfo);
173		splx(s);
174		return EINVAL;	/* XXX */
175	}
176	INP_LOCK(inp);
177	tp = intotcpcb(inp);
178	TCPDEBUG1();
179	tp = tcp_disconnect(tp);
180
181	TCPDEBUG2(PRU_DETACH);
182	if (tp)
183		INP_UNLOCK(inp);
184	INP_INFO_WUNLOCK(&tcbinfo);
185	splx(s);
186	return error;
187}
188
/*
 * Shared prologue and epilogue for the pru_*() handlers below.
 *
 * The enclosing function must provide the locals `s' (saved spl level),
 * `error', `inp', `tp', the socket `so', and a constant `inirw' that
 * selects how tcbinfo is locked:
 *
 *	INI_NOLOCK	tcbinfo is not locked at all;
 *	INI_READ	read lock, released once the inpcb lock is held;
 *	INI_WRITE	write lock, held until COMMON_END().
 */
#define	INI_NOLOCK	0
#define	INI_READ	1
#define	INI_WRITE	2

/*
 * Look up and lock the inpcb, then load `tp'.  If the socket has no
 * inpcb, undo the tcbinfo lock and the spl level and return EINVAL
 * from the enclosing function.
 */
#define	COMMON_START()							\
	TCPDEBUG0;							\
	do {								\
		switch (inirw) {					\
		case INI_READ:						\
			INP_INFO_RLOCK(&tcbinfo);			\
			break;						\
		case INI_WRITE:						\
			INP_INFO_WLOCK(&tcbinfo);			\
			break;						\
		default:						\
			break;						\
		}							\
		inp = sotoinpcb(so);					\
		if (inp == NULL) {					\
			switch (inirw) {				\
			case INI_READ:					\
				INP_INFO_RUNLOCK(&tcbinfo);		\
				break;					\
			case INI_WRITE:					\
				INP_INFO_WUNLOCK(&tcbinfo);		\
				break;					\
			default:					\
				break;					\
			}						\
			splx(s);					\
			return EINVAL;					\
		}							\
		INP_LOCK(inp);						\
		if (inirw == INI_READ)					\
			INP_INFO_RUNLOCK(&tcbinfo);			\
		tp = intotcpcb(inp);					\
		TCPDEBUG1();						\
	} while (0)

/*
 * Define the `out' label, trace the request, drop the locks taken by
 * COMMON_START() and return `error'.  The inpcb is only unlocked while
 * `tp' is still non-NULL.  The trailing `goto out' is unreachable; it
 * presumably exists so the label is referenced even in handlers that
 * never jump to it.
 */
#define	COMMON_END(req)							\
out:	TCPDEBUG2(req);							\
	do {								\
		if (tp != NULL)						\
			INP_UNLOCK(inp);				\
		if (inirw == INI_WRITE)					\
			INP_INFO_WUNLOCK(&tcbinfo);			\
		splx(s);						\
		return error;						\
		goto out;						\
	} while (0)
227
228/*
229 * Give the socket an address.
230 */
231static int
232tcp_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
233{
234	int s = splnet();
235	int error = 0;
236	struct inpcb *inp;
237	struct tcpcb *tp;
238	struct sockaddr_in *sinp;
239	const int inirw = INI_WRITE;
240
241	sinp = (struct sockaddr_in *)nam;
242	if (nam->sa_len != sizeof (*sinp))
243		return (EINVAL);
244	/*
245	 * Must check for multicast addresses and disallow binding
246	 * to them.
247	 */
248	if (sinp->sin_family == AF_INET &&
249	    IN_MULTICAST(ntohl(sinp->sin_addr.s_addr)))
250		return (EAFNOSUPPORT);
251
252	COMMON_START();
253	error = in_pcbbind(inp, nam, td->td_ucred);
254	if (error)
255		goto out;
256	COMMON_END(PRU_BIND);
257}
258
#ifdef INET6
/*
 * Give the socket an address (IPv6).
 *
 * Returns EINVAL if `nam' is not the size of a sockaddr_in6 and
 * EAFNOSUPPORT if it names an IPv6 multicast address.  Unless the
 * socket is marked IN6P_IPV6_V6ONLY, an unspecified address also
 * enables IPv4 on the pcb, and a v4-mapped address is converted and
 * bound through in_pcbbind() as a pure IPv4 address.
 *
 * The argument checks run before splnet() is raised, so the early
 * returns cannot leave the spl level unrestored.
 */
static int
tcp6_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
{
	int s;
	int error = 0;
	struct inpcb *inp;
	struct tcpcb *tp;
	struct sockaddr_in6 *sin6p;
	const int inirw = INI_WRITE;

	sin6p = (struct sockaddr_in6 *)nam;
	if (nam->sa_len != sizeof (*sin6p))
		return (EINVAL);
	/*
	 * Must check for multicast addresses and disallow binding
	 * to them.
	 */
	if (sin6p->sin6_family == AF_INET6 &&
	    IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr))
		return (EAFNOSUPPORT);

	s = splnet();
	COMMON_START();
	/* Start out as IPv6 only; IPv4 is re-enabled below if allowed. */
	inp->inp_vflag &= ~INP_IPV4;
	inp->inp_vflag |= INP_IPV6;
	if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) {
		if (IN6_IS_ADDR_UNSPECIFIED(&sin6p->sin6_addr))
			inp->inp_vflag |= INP_IPV4;
		else if (IN6_IS_ADDR_V4MAPPED(&sin6p->sin6_addr)) {
			struct sockaddr_in sin;

			in6_sin6_2_sin(&sin, sin6p);
			inp->inp_vflag |= INP_IPV4;
			inp->inp_vflag &= ~INP_IPV6;
			error = in_pcbbind(inp, (struct sockaddr *)&sin,
			    td->td_ucred);
			goto out;
		}
	}
	error = in6_pcbbind(inp, nam, td->td_ucred);
	COMMON_END(PRU_BIND);
}
#endif /* INET6 */
304
305/*
306 * Prepare to accept connections.
307 */
308static int
309tcp_usr_listen(struct socket *so, struct thread *td)
310{
311	int s = splnet();
312	int error = 0;
313	struct inpcb *inp;
314	struct tcpcb *tp;
315	const int inirw = INI_WRITE;
316
317	COMMON_START();
318	if (inp->inp_lport == 0)
319		error = in_pcbbind(inp, (struct sockaddr *)0, td->td_ucred);
320	if (error == 0)
321		tp->t_state = TCPS_LISTEN;
322	COMMON_END(PRU_LISTEN);
323}
324
#ifdef INET6
/*
 * Prepare to accept connections (IPv6).
 *
 * A socket without a local port is bound first through in6_pcbbind()
 * with no address; before that, INP_IPV4 is set on the pcb exactly
 * when the socket is not marked IN6P_IPV6_V6ONLY.  The tcpcb enters
 * TCPS_LISTEN only if no error occurred.
 */
static int
tcp6_usr_listen(struct socket *so, struct thread *td)
{
	const int inirw = INI_WRITE;
	struct tcpcb *tp;
	struct inpcb *inp;
	int error = 0;
	int s = splnet();

	COMMON_START();
	if (inp->inp_lport == 0) {
		if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0)
			inp->inp_vflag |= INP_IPV4;
		else
			inp->inp_vflag &= ~INP_IPV4;
		error = in6_pcbbind(inp, (struct sockaddr *)0, td->td_ucred);
	}
	if (error == 0)
		tp->t_state = TCPS_LISTEN;
	COMMON_END(PRU_LISTEN);
}
#endif /* INET6 */
347
348/*
349 * Initiate connection to peer.
350 * Create a template for use in transmissions on this connection.
351 * Enter SYN_SENT state, and mark socket as connecting.
352 * Start keep-alive timer, and seed output sequence space.
353 * Send initial segment on connection.
354 */
355static int
356tcp_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
357{
358	int s = splnet();
359	int error = 0;
360	struct inpcb *inp;
361	struct tcpcb *tp;
362	struct sockaddr_in *sinp;
363	const int inirw = INI_WRITE;
364
365	sinp = (struct sockaddr_in *)nam;
366	if (nam->sa_len != sizeof (*sinp))
367		return (EINVAL);
368	/*
369	 * Must disallow TCP ``connections'' to multicast addresses.
370	 */
371	if (sinp->sin_family == AF_INET
372	    && IN_MULTICAST(ntohl(sinp->sin_addr.s_addr)))
373		return (EAFNOSUPPORT);
374	if (td && jailed(td->td_ucred))
375		prison_remote_ip(td->td_ucred, 0, &sinp->sin_addr.s_addr);
376
377	COMMON_START();
378	if ((error = tcp_connect(tp, nam, td)) != 0)
379		goto out;
380	error = tcp_output(tp);
381	COMMON_END(PRU_CONNECT);
382}
383
#ifdef INET6
/*
 * Initiate connection to peer (IPv6).
 *
 * Returns EINVAL if `nam' is not the size of a sockaddr_in6 and
 * EAFNOSUPPORT for an IPv6 multicast destination.  A v4-mapped
 * destination is rejected with EINVAL on an IN6P_IPV6_V6ONLY socket;
 * otherwise it is converted and connected through tcp_connect() as an
 * IPv4 connection.  All other destinations go through tcp6_connect().
 * A successful connect is followed by tcp_output().
 *
 * The argument checks run before splnet() is raised, so the early
 * returns cannot leave the spl level unrestored.
 */
static int
tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
{
	int s;
	int error = 0;
	struct inpcb *inp;
	struct tcpcb *tp;
	struct sockaddr_in6 *sin6p;
	const int inirw = INI_WRITE;

	sin6p = (struct sockaddr_in6 *)nam;
	if (nam->sa_len != sizeof (*sin6p))
		return (EINVAL);
	/*
	 * Must disallow TCP ``connections'' to multicast addresses.
	 */
	if (sin6p->sin6_family == AF_INET6
	    && IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr))
		return (EAFNOSUPPORT);

	s = splnet();
	COMMON_START();
	if (IN6_IS_ADDR_V4MAPPED(&sin6p->sin6_addr)) {
		struct sockaddr_in sin;

		if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0) {
			error = EINVAL;
			goto out;
		}

		in6_sin6_2_sin(&sin, sin6p);
		inp->inp_vflag |= INP_IPV4;
		inp->inp_vflag &= ~INP_IPV6;
		if ((error = tcp_connect(tp, (struct sockaddr *)&sin, td)) != 0)
			goto out;
		error = tcp_output(tp);
		goto out;
	}
	inp->inp_vflag &= ~INP_IPV4;
	inp->inp_vflag |= INP_IPV6;
	inp->inp_inc.inc_isipv6 = 1;
	if ((error = tcp6_connect(tp, nam, td)) != 0)
		goto out;
	error = tcp_output(tp);
	COMMON_END(PRU_CONNECT);
}
#endif /* INET6 */
431
432/*
433 * Initiate disconnect from peer.
434 * If connection never passed embryonic stage, just drop;
435 * else if don't need to let data drain, then can just drop anyways,
436 * else have to begin TCP shutdown process: mark socket disconnecting,
437 * drain unread data, state switch to reflect user close, and
438 * send segment (e.g. FIN) to peer.  Socket will be really disconnected
439 * when peer sends FIN and acks ours.
440 *
441 * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB.
442 */
443static int
444tcp_usr_disconnect(struct socket *so)
445{
446	int s = splnet();
447	int error = 0;
448	struct inpcb *inp;
449	struct tcpcb *tp;
450	const int inirw = INI_WRITE;
451
452	COMMON_START();
453	tp = tcp_disconnect(tp);
454	COMMON_END(PRU_DISCONNECT);
455}
456
457/*
458 * Accept a connection.  Essentially all the work is
459 * done at higher levels; just return the address
460 * of the peer, storing through addr.
461 */
462static int
463tcp_usr_accept(struct socket *so, struct sockaddr **nam)
464{
465	int s;
466	int error = 0;
467	struct inpcb *inp = NULL;
468	struct tcpcb *tp = NULL;
469	struct in_addr addr;
470	in_port_t port = 0;
471	TCPDEBUG0;
472
473	if (so->so_state & SS_ISDISCONNECTED) {
474		error = ECONNABORTED;
475		goto out;
476	}
477
478	s = splnet();
479	INP_INFO_RLOCK(&tcbinfo);
480	inp = sotoinpcb(so);
481	if (!inp) {
482		INP_INFO_RUNLOCK(&tcbinfo);
483		splx(s);
484		return (EINVAL);
485	}
486	INP_LOCK(inp);
487	INP_INFO_RUNLOCK(&tcbinfo);
488	tp = intotcpcb(inp);
489	TCPDEBUG1();
490
491	/*
492	 * We inline in_setpeeraddr and COMMON_END here, so that we can
493	 * copy the data of interest and defer the malloc until after we
494	 * release the lock.
495	 */
496	port = inp->inp_fport;
497	addr = inp->inp_faddr;
498
499out:	TCPDEBUG2(PRU_ACCEPT);
500	if (tp)
501		INP_UNLOCK(inp);
502	splx(s);
503	if (error == 0)
504		*nam = in_sockaddr(port, &addr);
505	return error;
506}
507
#ifdef INET6
/*
 * Accept a connection (IPv6 socket); return the peer address through
 * nam.
 *
 * Returns ECONNABORTED if the socket is already disconnected and
 * EINVAL if it has no inpcb.  If the pcb has INP_IPV4 set, the peer is
 * reported as a v4-mapped IPv6 address, otherwise as a native IPv6
 * address.
 */
static int
tcp6_usr_accept(struct socket *so, struct sockaddr **nam)
{
	/*
	 * Raise spl before the first `goto out' so that the splx() at
	 * the common exit never sees an uninitialized level.
	 */
	int s = splnet();
	struct inpcb *inp = NULL;
	int error = 0;
	struct tcpcb *tp = NULL;
	struct in_addr addr;
	struct in6_addr addr6;
	in_port_t port = 0;
	int v4 = 0;
	TCPDEBUG0;

	if (so->so_state & SS_ISDISCONNECTED) {
		error = ECONNABORTED;
		goto out;
	}

	INP_INFO_RLOCK(&tcbinfo);
	inp = sotoinpcb(so);
	if (inp == 0) {
		INP_INFO_RUNLOCK(&tcbinfo);
		splx(s);
		return (EINVAL);
	}
	INP_LOCK(inp);
	INP_INFO_RUNLOCK(&tcbinfo);
	tp = intotcpcb(inp);
	TCPDEBUG1();
	/*
	 * We inline in6_mapped_peeraddr and COMMON_END here, so that we can
	 * copy the data of interest and defer the malloc until after we
	 * release the lock.
	 */
	if (inp->inp_vflag & INP_IPV4) {
		v4 = 1;
		port = inp->inp_fport;
		addr = inp->inp_faddr;
	} else {
		port = inp->inp_fport;
		addr6 = inp->in6p_faddr;
	}

out:	TCPDEBUG2(PRU_ACCEPT);
	if (tp)
		INP_UNLOCK(inp);
	splx(s);
	if (error == 0) {
		if (v4)
			*nam = in6_v4mapsin6_sockaddr(port, &addr);
		else
			*nam = in6_sockaddr(port, &addr6);
	}
	return error;
}
#endif /* INET6 */
566
567/*
568 * This is the wrapper function for in_setsockaddr. We just pass down
569 * the pcbinfo for in_setsockaddr to lock. We don't want to do the locking
570 * here because in_setsockaddr will call malloc and can block.
571 */
572static int
573tcp_sockaddr(struct socket *so, struct sockaddr **nam)
574{
575	return (in_setsockaddr(so, nam, &tcbinfo));
576}
577
578/*
579 * This is the wrapper function for in_setpeeraddr. We just pass down
580 * the pcbinfo for in_setpeeraddr to lock.
581 */
582static int
583tcp_peeraddr(struct socket *so, struct sockaddr **nam)
584{
585	return (in_setpeeraddr(so, nam, &tcbinfo));
586}
587
588/*
589 * Mark the connection as being incapable of further output.
590 */
591static int
592tcp_usr_shutdown(struct socket *so)
593{
594	int s = splnet();
595	int error = 0;
596	struct inpcb *inp;
597	struct tcpcb *tp;
598	const int inirw = INI_WRITE;
599
600	COMMON_START();
601	socantsendmore(so);
602	tp = tcp_usrclosed(tp);
603	if (tp)
604		error = tcp_output(tp);
605	COMMON_END(PRU_SHUTDOWN);
606}
607
608/*
609 * After a receive, possibly send window update to peer.
610 */
611static int
612tcp_usr_rcvd(struct socket *so, int flags)
613{
614	int s = splnet();
615	int error = 0;
616	struct inpcb *inp;
617	struct tcpcb *tp;
618	const int inirw = INI_READ;
619
620	COMMON_START();
621	tcp_output(tp);
622	COMMON_END(PRU_RCVD);
623}
624
625/*
626 * Do a send by putting data in output queue and updating urgent
627 * marker if URG set.  Possibly send more data.  Unlike the other
628 * pru_*() routines, the mbuf chains are our responsibility.  We
629 * must either enqueue them or free them.  The other pru_* routines
630 * generally are caller-frees.
631 */
632static int
633tcp_usr_send(struct socket *so, int flags, struct mbuf *m,
634	     struct sockaddr *nam, struct mbuf *control, struct thread *td)
635{
636	int s = splnet();
637	int error = 0;
638	struct inpcb *inp;
639	struct tcpcb *tp;
640	const int inirw = INI_WRITE;
641#ifdef INET6
642	int isipv6;
643#endif
644	TCPDEBUG0;
645
646	/*
647	 * Need write lock here because this function might call
648	 * tcp_connect or tcp_usrclosed.
649	 * We really want to have to this function upgrade from read lock
650	 * to write lock.  XXX
651	 */
652	INP_INFO_WLOCK(&tcbinfo);
653	inp = sotoinpcb(so);
654	if (inp == NULL) {
655		/*
656		 * OOPS! we lost a race, the TCP session got reset after
657		 * we checked SS_CANTSENDMORE, eg: while doing uiomove or a
658		 * network interrupt in the non-splnet() section of sosend().
659		 */
660		if (m)
661			m_freem(m);
662		if (control)
663			m_freem(control);
664		error = ECONNRESET;	/* XXX EPIPE? */
665		tp = NULL;
666		TCPDEBUG1();
667		goto out;
668	}
669	INP_LOCK(inp);
670#ifdef INET6
671	isipv6 = nam && nam->sa_family == AF_INET6;
672#endif /* INET6 */
673	tp = intotcpcb(inp);
674	TCPDEBUG1();
675	if (control) {
676		/* TCP doesn't do control messages (rights, creds, etc) */
677		if (control->m_len) {
678			m_freem(control);
679			if (m)
680				m_freem(m);
681			error = EINVAL;
682			goto out;
683		}
684		m_freem(control);	/* empty control, just free it */
685	}
686	if (!(flags & PRUS_OOB)) {
687		sbappendstream(&so->so_snd, m);
688		if (nam && tp->t_state < TCPS_SYN_SENT) {
689			/*
690			 * Do implied connect if not yet connected,
691			 * initialize window to default value, and
692			 * initialize maxseg/maxopd using peer's cached
693			 * MSS.
694			 */
695#ifdef INET6
696			if (isipv6)
697				error = tcp6_connect(tp, nam, td);
698			else
699#endif /* INET6 */
700			error = tcp_connect(tp, nam, td);
701			if (error)
702				goto out;
703			tp->snd_wnd = TTCP_CLIENT_SND_WND;
704			tcp_mss(tp, -1);
705		}
706
707		if (flags & PRUS_EOF) {
708			/*
709			 * Close the send side of the connection after
710			 * the data is sent.
711			 */
712			socantsendmore(so);
713			tp = tcp_usrclosed(tp);
714		}
715		if (tp != NULL) {
716			if (flags & PRUS_MORETOCOME)
717				tp->t_flags |= TF_MORETOCOME;
718			error = tcp_output(tp);
719			if (flags & PRUS_MORETOCOME)
720				tp->t_flags &= ~TF_MORETOCOME;
721		}
722	} else {
723		if (sbspace(&so->so_snd) < -512) {
724			m_freem(m);
725			error = ENOBUFS;
726			goto out;
727		}
728		/*
729		 * According to RFC961 (Assigned Protocols),
730		 * the urgent pointer points to the last octet
731		 * of urgent data.  We continue, however,
732		 * to consider it to indicate the first octet
733		 * of data past the urgent section.
734		 * Otherwise, snd_up should be one lower.
735		 */
736		sbappendstream(&so->so_snd, m);
737		if (nam && tp->t_state < TCPS_SYN_SENT) {
738			/*
739			 * Do implied connect if not yet connected,
740			 * initialize window to default value, and
741			 * initialize maxseg/maxopd using peer's cached
742			 * MSS.
743			 */
744#ifdef INET6
745			if (isipv6)
746				error = tcp6_connect(tp, nam, td);
747			else
748#endif /* INET6 */
749			error = tcp_connect(tp, nam, td);
750			if (error)
751				goto out;
752			tp->snd_wnd = TTCP_CLIENT_SND_WND;
753			tcp_mss(tp, -1);
754		}
755		tp->snd_up = tp->snd_una + so->so_snd.sb_cc;
756		tp->t_force = 1;
757		error = tcp_output(tp);
758		tp->t_force = 0;
759	}
760	COMMON_END((flags & PRUS_OOB) ? PRU_SENDOOB :
761		   ((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND));
762}
763
764/*
765 * Abort the TCP.
766 */
767static int
768tcp_usr_abort(struct socket *so)
769{
770	int s = splnet();
771	int error = 0;
772	struct inpcb *inp;
773	struct tcpcb *tp;
774	const int inirw = INI_WRITE;
775
776	COMMON_START();
777	tp = tcp_drop(tp, ECONNABORTED);
778	COMMON_END(PRU_ABORT);
779}
780
781/*
782 * Receive out-of-band data.
783 */
784static int
785tcp_usr_rcvoob(struct socket *so, struct mbuf *m, int flags)
786{
787	int s = splnet();
788	int error = 0;
789	struct inpcb *inp;
790	struct tcpcb *tp;
791	const int inirw = INI_READ;
792
793	COMMON_START();
794	if ((so->so_oobmark == 0 &&
795	     (so->so_state & SS_RCVATMARK) == 0) ||
796	    so->so_options & SO_OOBINLINE ||
797	    tp->t_oobflags & TCPOOB_HADDATA) {
798		error = EINVAL;
799		goto out;
800	}
801	if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) {
802		error = EWOULDBLOCK;
803		goto out;
804	}
805	m->m_len = 1;
806	*mtod(m, caddr_t) = tp->t_iobc;
807	if ((flags & MSG_PEEK) == 0)
808		tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA);
809	COMMON_END(PRU_RCVOOB);
810}
811
812/* xxx - should be const */
813struct pr_usrreqs tcp_usrreqs = {
814	tcp_usr_abort, tcp_usr_accept, tcp_usr_attach, tcp_usr_bind,
815	tcp_usr_connect, pru_connect2_notsupp, in_control, tcp_usr_detach,
816	tcp_usr_disconnect, tcp_usr_listen, tcp_peeraddr, tcp_usr_rcvd,
817	tcp_usr_rcvoob, tcp_usr_send, pru_sense_null, tcp_usr_shutdown,
818	tcp_sockaddr, sosend, soreceive, sopoll, in_pcbsosetlabel
819};
820
#ifdef INET6
/*
 * User-request switch for TCP over IPv6.
 *
 * The initializer is positional, so the order of the entries must
 * match the member order of struct pr_usrreqs (declared elsewhere).
 * Handlers without an IPv6-specific variant are shared with the IPv4
 * table.
 */
struct pr_usrreqs tcp6_usrreqs = {
	tcp_usr_abort,
	tcp6_usr_accept,
	tcp_usr_attach,
	tcp6_usr_bind,
	tcp6_usr_connect,
	pru_connect2_notsupp,
	in6_control,
	tcp_usr_detach,
	tcp_usr_disconnect,
	tcp6_usr_listen,
	in6_mapped_peeraddr,
	tcp_usr_rcvd,
	tcp_usr_rcvoob,
	tcp_usr_send,
	pru_sense_null,
	tcp_usr_shutdown,
	in6_mapped_sockaddr,
	sosend,
	soreceive,
	sopoll,
	in_pcbsosetlabel
};
#endif /* INET6 */
830
831/*
832 * Common subroutine to open a TCP connection to remote host specified
833 * by struct sockaddr_in in mbuf *nam.  Call in_pcbbind to assign a local
834 * port number if needed.  Call in_pcbconnect_setup to do the routing and
835 * to choose a local host address (interface).  If there is an existing
836 * incarnation of the same connection in TIME-WAIT state and if the remote
837 * host was sending CC options and if the connection duration was < MSL, then
838 * truncate the previous TIME-WAIT state and proceed.
839 * Initialize connection parameters and enter SYN-SENT state.
840 */
841static int
842tcp_connect(tp, nam, td)
843	register struct tcpcb *tp;
844	struct sockaddr *nam;
845	struct thread *td;
846{
847	struct inpcb *inp = tp->t_inpcb, *oinp;
848	struct socket *so = inp->inp_socket;
849	struct tcptw *otw;
850	struct rmxp_tao tao;
851	struct in_addr laddr;
852	u_short lport;
853	int error;
854
855	bzero(&tao, sizeof(tao));
856
857	if (inp->inp_lport == 0) {
858		error = in_pcbbind(inp, (struct sockaddr *)0, td->td_ucred);
859		if (error)
860			return error;
861	}
862
863	/*
864	 * Cannot simply call in_pcbconnect, because there might be an
865	 * earlier incarnation of this same connection still in
866	 * TIME_WAIT state, creating an ADDRINUSE error.
867	 */
868	laddr = inp->inp_laddr;
869	lport = inp->inp_lport;
870	error = in_pcbconnect_setup(inp, nam, &laddr.s_addr, &lport,
871	    &inp->inp_faddr.s_addr, &inp->inp_fport, &oinp, td->td_ucred);
872	if (error && oinp == NULL)
873		return error;
874	if (oinp) {
875		if (oinp != inp &&
876		    (oinp->inp_vflag & INP_TIMEWAIT) &&
877		    (ticks - (otw = intotw(oinp))->t_starttime) < tcp_msl &&
878		    otw->cc_recv != 0) {
879			inp->inp_faddr = oinp->inp_faddr;
880			inp->inp_fport = oinp->inp_fport;
881			(void) tcp_twclose(otw, 0);
882		} else
883			return EADDRINUSE;
884	}
885	inp->inp_laddr = laddr;
886	in_pcbrehash(inp);
887
888	/* Compute window scaling to request.  */
889	while (tp->request_r_scale < TCP_MAX_WINSHIFT &&
890	    (TCP_MAXWIN << tp->request_r_scale) < so->so_rcv.sb_hiwat)
891		tp->request_r_scale++;
892
893	soisconnecting(so);
894	tcpstat.tcps_connattempt++;
895	tp->t_state = TCPS_SYN_SENT;
896	callout_reset(tp->tt_keep, tcp_keepinit, tcp_timer_keep, tp);
897	tp->iss = tcp_new_isn(tp);
898	tp->t_bw_rtseq = tp->iss;
899	tcp_sendseqinit(tp);
900
901	/*
902	 * Generate a CC value for this connection and
903	 * check whether CC or CCnew should be used.
904	 */
905	if (tcp_do_rfc1644)
906		tcp_hc_gettao(&inp->inp_inc, &tao);
907
908	tp->cc_send = CC_INC(tcp_ccgen);
909	if (tao.tao_ccsent != 0 &&
910	    CC_GEQ(tp->cc_send, tao.tao_ccsent)) {
911		tao.tao_ccsent = tp->cc_send;
912	} else {
913		tao.tao_ccsent = 0;
914		tp->t_flags |= TF_SENDCCNEW;
915	}
916
917	if (tcp_do_rfc1644)
918		tcp_hc_updatetao(&inp->inp_inc, TCP_HC_TAO_CCSENT,
919				 tao.tao_ccsent, 0);
920
921	return 0;
922}
923
#ifdef INET6
/*
 * IPv6 counterpart of tcp_connect(): open a TCP connection to the
 * remote host named by the sockaddr_in6 in `nam'.
 *
 * Binds a local port with in6_pcbbind() if needed, obtains a local
 * address from in6_pcbladdr(), and looks for an existing pcb with the
 * same address/port tuple.  Such a pcb may only be displaced if it is
 * a TIME_WAIT entry younger than MSL whose peer was sending CC
 * options; otherwise EADDRINUSE is returned.  On success the pcb is
 * filled in and rehashed and the tcpcb enters SYN-SENT.
 */
static int
tcp6_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td)
{
	struct inpcb *inp = tp->t_inpcb, *oinp;
	struct socket *so = inp->inp_socket;
	struct tcptw *otw;
	struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)nam;
	struct in6_addr *addr6;
	struct in6_addr *lookup_laddr;
	struct rmxp_tao tao;
	int error;

	bzero(&tao, sizeof(tao));

	if (inp->inp_lport == 0) {
		error = in6_pcbbind(inp, (struct sockaddr *)0, td->td_ucred);
		if (error != 0)
			return (error);
	}

	/*
	 * Cannot simply call in_pcbconnect, because there might be an
	 * earlier incarnation of this same connection still in
	 * TIME_WAIT state, creating an ADDRINUSE error.
	 */
	error = in6_pcbladdr(inp, nam, &addr6);
	if (error != 0)
		return (error);
	/* Use the chosen address only if the pcb has no local one yet. */
	if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr))
		lookup_laddr = addr6;
	else
		lookup_laddr = &inp->in6p_laddr;
	oinp = in6_pcblookup_hash(inp->inp_pcbinfo,
				  &sin6->sin6_addr, sin6->sin6_port,
				  lookup_laddr,
				  inp->inp_lport,  0, NULL);
	if (oinp != NULL) {
		/* Only a recyclable TIME_WAIT pcb may be displaced. */
		if (oinp == inp || (oinp->inp_vflag & INP_TIMEWAIT) == 0)
			return (EADDRINUSE);
		otw = intotw(oinp);
		if ((ticks - otw->t_starttime) >= tcp_msl ||
		    otw->cc_recv == 0)
			return (EADDRINUSE);
		inp->inp_faddr = oinp->inp_faddr;
		inp->inp_fport = oinp->inp_fport;
		(void) tcp_twclose(otw, 0);
	}
	if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr))
		inp->in6p_laddr = *addr6;
	inp->in6p_faddr = sin6->sin6_addr;
	inp->inp_fport = sin6->sin6_port;
	if ((sin6->sin6_flowinfo & IPV6_FLOWINFO_MASK) != 0)
		inp->in6p_flowinfo = sin6->sin6_flowinfo;
	in_pcbrehash(inp);

	/* Compute window scaling to request.  */
	for (; tp->request_r_scale < TCP_MAX_WINSHIFT &&
	    (TCP_MAXWIN << tp->request_r_scale) < so->so_rcv.sb_hiwat;
	    tp->request_r_scale++)
		continue;

	soisconnecting(so);
	tcpstat.tcps_connattempt++;
	tp->t_state = TCPS_SYN_SENT;
	callout_reset(tp->tt_keep, tcp_keepinit, tcp_timer_keep, tp);
	tp->iss = tcp_new_isn(tp);
	tp->t_bw_rtseq = tp->iss;
	tcp_sendseqinit(tp);

	/*
	 * Generate a CC value for this connection and
	 * check whether CC or CCnew should be used.
	 */
	if (tcp_do_rfc1644)
		tcp_hc_gettao(&inp->inp_inc, &tao);

	tp->cc_send = CC_INC(tcp_ccgen);
	if (tao.tao_ccsent == 0 ||
	    !CC_GEQ(tp->cc_send, tao.tao_ccsent)) {
		tao.tao_ccsent = 0;
		tp->t_flags |= TF_SENDCCNEW;
	} else {
		tao.tao_ccsent = tp->cc_send;
	}
	if (tcp_do_rfc1644)
		tcp_hc_updatetao(&inp->inp_inc, TCP_HC_TAO_CCSENT,
				 tao.tao_ccsent, 0);

	return (0);
}
#endif /* INET6 */
1015
1016/*
1017 * The new sockopt interface makes it possible for us to block in the
1018 * copyin/out step (if we take a page fault).  Taking a page fault at
1019 * splnet() is probably a Bad Thing.  (Since sockets and pcbs both now
1020 * use TSM, there probably isn't any need for this function to run at
1021 * splnet() any more.  This needs more examination.)
1022 */
1023int
1024tcp_ctloutput(so, sopt)
1025	struct socket *so;
1026	struct sockopt *sopt;
1027{
1028	int	error, opt, optval, s;
1029	struct	inpcb *inp;
1030	struct	tcpcb *tp;
1031
1032	error = 0;
1033	s = splnet();		/* XXX */
1034	INP_INFO_RLOCK(&tcbinfo);
1035	inp = sotoinpcb(so);
1036	if (inp == NULL) {
1037		INP_INFO_RUNLOCK(&tcbinfo);
1038		splx(s);
1039		return (ECONNRESET);
1040	}
1041	INP_LOCK(inp);
1042	INP_INFO_RUNLOCK(&tcbinfo);
1043	if (sopt->sopt_level != IPPROTO_TCP) {
1044#ifdef INET6
1045		if (INP_CHECK_SOCKAF(so, AF_INET6))
1046			error = ip6_ctloutput(so, sopt);
1047		else
1048#endif /* INET6 */
1049		error = ip_ctloutput(so, sopt);
1050		INP_UNLOCK(inp);
1051		splx(s);
1052		return (error);
1053	}
1054	tp = intotcpcb(inp);
1055
1056	switch (sopt->sopt_dir) {
1057	case SOPT_SET:
1058		switch (sopt->sopt_name) {
1059#ifdef TCP_SIGNATURE
1060		case TCP_MD5SIG:
1061			error = sooptcopyin(sopt, &optval, sizeof optval,
1062					    sizeof optval);
1063			if (error)
1064				break;
1065
1066			if (optval > 0)
1067				tp->t_flags |= TF_SIGNATURE;
1068			else
1069				tp->t_flags &= ~TF_SIGNATURE;
1070			break;
1071#endif /* TCP_SIGNATURE */
1072		case TCP_NODELAY:
1073		case TCP_NOOPT:
1074			error = sooptcopyin(sopt, &optval, sizeof optval,
1075					    sizeof optval);
1076			if (error)
1077				break;
1078
1079			switch (sopt->sopt_name) {
1080			case TCP_NODELAY:
1081				opt = TF_NODELAY;
1082				break;
1083			case TCP_NOOPT:
1084				opt = TF_NOOPT;
1085				break;
1086			default:
1087				opt = 0; /* dead code to fool gcc */
1088				break;
1089			}
1090
1091			if (optval)
1092				tp->t_flags |= opt;
1093			else
1094				tp->t_flags &= ~opt;
1095			break;
1096
1097		case TCP_NOPUSH:
1098			error = sooptcopyin(sopt, &optval, sizeof optval,
1099					    sizeof optval);
1100			if (error)
1101				break;
1102
1103			if (optval)
1104				tp->t_flags |= TF_NOPUSH;
1105			else {
1106				tp->t_flags &= ~TF_NOPUSH;
1107				error = tcp_output(tp);
1108			}
1109			break;
1110
1111		case TCP_MAXSEG:
1112			error = sooptcopyin(sopt, &optval, sizeof optval,
1113					    sizeof optval);
1114			if (error)
1115				break;
1116
1117			if (optval > 0 && optval <= tp->t_maxseg &&
1118			    optval + 40 >= tcp_minmss)
1119				tp->t_maxseg = optval;
1120			else
1121				error = EINVAL;
1122			break;
1123
1124		default:
1125			error = ENOPROTOOPT;
1126			break;
1127		}
1128		break;
1129
1130	case SOPT_GET:
1131		switch (sopt->sopt_name) {
1132#ifdef TCP_SIGNATURE
1133		case TCP_MD5SIG:
1134			optval = (tp->t_flags & TF_SIGNATURE) ? 1 : 0;
1135			break;
1136#endif
1137		case TCP_NODELAY:
1138			optval = tp->t_flags & TF_NODELAY;
1139			break;
1140		case TCP_MAXSEG:
1141			optval = tp->t_maxseg;
1142			break;
1143		case TCP_NOOPT:
1144			optval = tp->t_flags & TF_NOOPT;
1145			break;
1146		case TCP_NOPUSH:
1147			optval = tp->t_flags & TF_NOPUSH;
1148			break;
1149		default:
1150			error = ENOPROTOOPT;
1151			break;
1152		}
1153		if (error == 0)
1154			error = sooptcopyout(sopt, &optval, sizeof optval);
1155		break;
1156	}
1157	INP_UNLOCK(inp);
1158	splx(s);
1159	return (error);
1160}
1161
1162/*
1163 * tcp_sendspace and tcp_recvspace are the default send and receive window
1164 * sizes, respectively.  These are obsolescent (this information should
1165 * be set by the route).
1166 */
1167u_long	tcp_sendspace = 1024*32;
1168SYSCTL_INT(_net_inet_tcp, TCPCTL_SENDSPACE, sendspace, CTLFLAG_RW,
1169    &tcp_sendspace , 0, "Maximum outgoing TCP datagram size");
1170u_long	tcp_recvspace = 1024*64;
1171SYSCTL_INT(_net_inet_tcp, TCPCTL_RECVSPACE, recvspace, CTLFLAG_RW,
1172    &tcp_recvspace , 0, "Maximum incoming TCP datagram size");
1173
1174/*
1175 * Attach TCP protocol to socket, allocating
1176 * internet protocol control block, tcp control block,
1177 * bufer space, and entering LISTEN state if to accept connections.
1178 */
1179static int
1180tcp_attach(so)
1181	struct socket *so;
1182{
1183	register struct tcpcb *tp;
1184	struct inpcb *inp;
1185	int error;
1186#ifdef INET6
1187	int isipv6 = INP_CHECK_SOCKAF(so, AF_INET6) != 0;
1188#endif
1189
1190	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
1191		error = soreserve(so, tcp_sendspace, tcp_recvspace);
1192		if (error)
1193			return (error);
1194	}
1195	error = in_pcballoc(so, &tcbinfo, "tcpinp");
1196	if (error)
1197		return (error);
1198	inp = sotoinpcb(so);
1199#ifdef INET6
1200	if (isipv6) {
1201		inp->inp_vflag |= INP_IPV6;
1202		inp->in6p_hops = -1;	/* use kernel default */
1203	}
1204	else
1205#endif
1206	inp->inp_vflag |= INP_IPV4;
1207	tp = tcp_newtcpcb(inp);
1208	if (tp == 0) {
1209		int nofd = so->so_state & SS_NOFDREF;	/* XXX */
1210
1211		so->so_state &= ~SS_NOFDREF;	/* don't free the socket yet */
1212#ifdef INET6
1213		if (isipv6)
1214			in6_pcbdetach(inp);
1215		else
1216#endif
1217		in_pcbdetach(inp);
1218		so->so_state |= nofd;
1219		return (ENOBUFS);
1220	}
1221	tp->t_state = TCPS_CLOSED;
1222	return (0);
1223}
1224
1225/*
1226 * Initiate (or continue) disconnect.
1227 * If embryonic state, just send reset (once).
1228 * If in ``let data drain'' option and linger null, just drop.
1229 * Otherwise (hard), mark socket disconnecting and drop
1230 * current input data; switch states based on user close, and
1231 * send segment to peer (with FIN).
1232 */
1233static struct tcpcb *
1234tcp_disconnect(tp)
1235	register struct tcpcb *tp;
1236{
1237	struct socket *so = tp->t_inpcb->inp_socket;
1238
1239	if (tp->t_state < TCPS_ESTABLISHED)
1240		tp = tcp_close(tp);
1241	else if ((so->so_options & SO_LINGER) && so->so_linger == 0)
1242		tp = tcp_drop(tp, 0);
1243	else {
1244		soisdisconnecting(so);
1245		sbflush(&so->so_rcv);
1246		tp = tcp_usrclosed(tp);
1247		if (tp)
1248			(void) tcp_output(tp);
1249	}
1250	return (tp);
1251}
1252
1253/*
1254 * User issued close, and wish to trail through shutdown states:
1255 * if never received SYN, just forget it.  If got a SYN from peer,
1256 * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN.
1257 * If already got a FIN from peer, then almost done; go to LAST_ACK
1258 * state.  In all other cases, have already sent FIN to peer (e.g.
1259 * after PRU_SHUTDOWN), and just have to play tedious game waiting
1260 * for peer to send FIN or not respond to keep-alives, etc.
1261 * We can let the user exit from the close as soon as the FIN is acked.
1262 */
1263static struct tcpcb *
1264tcp_usrclosed(tp)
1265	register struct tcpcb *tp;
1266{
1267
1268	switch (tp->t_state) {
1269
1270	case TCPS_CLOSED:
1271	case TCPS_LISTEN:
1272		tp->t_state = TCPS_CLOSED;
1273		tp = tcp_close(tp);
1274		break;
1275
1276	case TCPS_SYN_SENT:
1277	case TCPS_SYN_RECEIVED:
1278		tp->t_flags |= TF_NEEDFIN;
1279		break;
1280
1281	case TCPS_ESTABLISHED:
1282		tp->t_state = TCPS_FIN_WAIT_1;
1283		break;
1284
1285	case TCPS_CLOSE_WAIT:
1286		tp->t_state = TCPS_LAST_ACK;
1287		break;
1288	}
1289	if (tp && tp->t_state >= TCPS_FIN_WAIT_2) {
1290		soisdisconnected(tp->t_inpcb->inp_socket);
1291		/* To prevent the connection hanging in FIN_WAIT_2 forever. */
1292		if (tp->t_state == TCPS_FIN_WAIT_2)
1293			callout_reset(tp->tt_2msl, tcp_maxidle,
1294				      tcp_timer_2msl, tp);
1295	}
1296	return (tp);
1297}
1298
1299