tcp_usrreq.c revision 17096
1/*
2 * Copyright (c) 1982, 1986, 1988, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 * 3. All advertising materials mentioning features or use of this software
14 *    must display the following acknowledgement:
15 *	This product includes software developed by the University of
16 *	California, Berkeley and its contributors.
17 * 4. Neither the name of the University nor the names of its contributors
18 *    may be used to endorse or promote products derived from this software
19 *    without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31 * SUCH DAMAGE.
32 *
33 *	From: @(#)tcp_usrreq.c	8.2 (Berkeley) 1/3/94
34 *	$Id: tcp_usrreq.c,v 1.22 1996/03/11 15:13:37 davidg Exp $
35 */
36
37#include <sys/param.h>
38#include <sys/queue.h>
39#include <sys/systm.h>
40#include <sys/kernel.h>
41#include <sys/sysctl.h>
42#include <sys/malloc.h>
43#include <sys/mbuf.h>
44#include <sys/socket.h>
45#include <sys/socketvar.h>
46#include <sys/protosw.h>
47#include <sys/errno.h>
48#include <sys/stat.h>
49
50#include <net/if.h>
51#include <net/route.h>
52
53#include <netinet/in.h>
54#include <netinet/in_systm.h>
55#include <netinet/ip.h>
56#include <netinet/in_pcb.h>
57#include <netinet/in_var.h>
58#include <netinet/ip_var.h>
59#include <netinet/tcp.h>
60#include <netinet/tcp_fsm.h>
61#include <netinet/tcp_seq.h>
62#include <netinet/tcp_timer.h>
63#include <netinet/tcp_var.h>
64#include <netinet/tcpip.h>
65#ifdef TCPDEBUG
66#include <netinet/tcp_debug.h>
67#endif
68
69/*
70 * TCP protocol interface to socket abstraction.
71 */
72extern	char *tcpstates[];
73
74static int	tcp_attach __P((struct socket *));
75static int	tcp_connect __P((struct tcpcb *, struct mbuf *));
76static struct tcpcb *
77		tcp_disconnect __P((struct tcpcb *));
78static struct tcpcb *
79		tcp_usrclosed __P((struct tcpcb *));
80
#ifdef notdef
/*
 * NOTE: this whole function is compiled only when "notdef" is defined.
 * It is the single-entry request dispatcher; the same operations are
 * also provided one-per-function by the tcp_usr_*() handlers in this file.
 *
 * Process a TCP user request for TCP tb.  If this is a send request
 * then m is the mbuf chain of send data.  If this is a timer expiration
 * (called from the software clock routine), then timertype tells which timer.
 *
 * Returns 0 or an errno value.  For PRU_CONTROL, PRU_RCVOOB and
 * PRU_SLOWTIMO the m/nam/control arguments are not mbufs at all; they
 * are cast to the types the individual request needs (see each case).
 */
/*ARGSUSED*/
int
tcp_usrreq(so, req, m, nam, control)
	struct socket *so;
	int req;
	struct mbuf *m, *nam, *control;
{
	register struct inpcb *inp;
	register struct tcpcb *tp = 0;
	struct sockaddr_in *sinp;
	int s;
	int error = 0;
#ifdef TCPDEBUG
	int ostate;
#endif

	/* PRU_CONTROL is passed straight through; its arguments are not mbufs. */
	if (req == PRU_CONTROL)
		return (in_control(so, (u_long)m, (caddr_t)nam,
			(struct ifnet *)control));
	/* Control data is not accepted: discard it and the data chain. */
	if (control && control->m_len) {
		m_freem(control);
		if (m)
			m_freem(m);
		return (EINVAL);
	}

	s = splnet();
	inp = sotoinpcb(so);
	/*
	 * When a TCP is attached to a socket, then there will be
	 * a (struct inpcb) pointed at by the socket, and this
	 * structure will point at a subsidiary (struct tcpcb).
	 */
	if (inp == 0 && req != PRU_ATTACH) {
		splx(s);
#if 0
		/*
		 * The following corrects an mbuf leak under rare
		 * circumstances, but has not been fully tested.
		 */
		if (m && req != PRU_SENSE)
			m_freem(m);
#else
		/* safer version of fix for mbuf leak */
		if (m && (req == PRU_SEND || req == PRU_SENDOOB))
			m_freem(m);
#endif
		return (EINVAL);		/* XXX */
	}
	/*
	 * Note that the closing brace of this "if" is supplied by one of
	 * the two preprocessor branches below.
	 */
	if (inp) {
		tp = intotcpcb(inp);
		/* WHAT IF TP IS 0? */
#ifdef KPROF
		tcp_acounts[tp->t_state][req]++;
#endif
#ifdef TCPDEBUG
		ostate = tp->t_state;
	} else
		ostate = 0;
#else /* TCPDEBUG */
	}
#endif /* TCPDEBUG */

	switch (req) {

	/*
	 * TCP attaches to socket via PRU_ATTACH, reserving space,
	 * and an internet control block.
	 */
	case PRU_ATTACH:
		if (inp) {
			error = EISCONN;
			break;
		}
		error = tcp_attach(so);
		if (error)
			break;
		if ((so->so_options & SO_LINGER) && so->so_linger == 0)
			so->so_linger = TCP_LINGERTIME * hz;
		tp = sototcpcb(so);
		break;

	/*
	 * PRU_DETACH detaches the TCP protocol from the socket.
	 * If the protocol state is non-embryonic, then can't
	 * do this directly: have to initiate a PRU_DISCONNECT,
	 * which may finish later; embryonic TCB's can just
	 * be discarded here.
	 */
	case PRU_DETACH:
		if (tp->t_state > TCPS_LISTEN)
			tp = tcp_disconnect(tp);
		else
			tp = tcp_close(tp);
		break;

	/*
	 * Give the socket an address.
	 */
	case PRU_BIND:
		/*
		 * Must check for multicast addresses and disallow binding
		 * to them.
		 */
		sinp = mtod(nam, struct sockaddr_in *);
		if (sinp->sin_family == AF_INET &&
		    IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) {
			error = EAFNOSUPPORT;
			break;
		}
		error = in_pcbbind(inp, nam);
		if (error)
			break;
		break;

	/*
	 * Prepare to accept connections.
	 */
	case PRU_LISTEN:
		/* Bind to a port first if the socket has none yet. */
		if (inp->inp_lport == 0)
			error = in_pcbbind(inp, NULL);
		if (error == 0)
			tp->t_state = TCPS_LISTEN;
		break;

	/*
	 * Initiate connection to peer.
	 * Create a template for use in transmissions on this connection.
	 * Enter SYN_SENT state, and mark socket as connecting.
	 * Start keep-alive timer, and seed output sequence space.
	 * Send initial segment on connection.
	 */
	case PRU_CONNECT:
		/*
		 * Must disallow TCP ``connections'' to multicast addresses.
		 */
		sinp = mtod(nam, struct sockaddr_in *);
		if (sinp->sin_family == AF_INET
		    && IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) {
			error = EAFNOSUPPORT;
			break;
		}

		if ((error = tcp_connect(tp, nam)) != 0)
			break;
		error = tcp_output(tp);
		break;

	/*
	 * Create a TCP connection between two sockets.
	 */
	case PRU_CONNECT2:
		error = EOPNOTSUPP;
		break;

	/*
	 * Initiate disconnect from peer.
	 * If connection never passed embryonic stage, just drop;
	 * else if don't need to let data drain, then can just drop anyways,
	 * else have to begin TCP shutdown process: mark socket disconnecting,
	 * drain unread data, state switch to reflect user close, and
	 * send segment (e.g. FIN) to peer.  Socket will be really disconnected
	 * when peer sends FIN and acks ours.
	 *
	 * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB.
	 */
	case PRU_DISCONNECT:
		tp = tcp_disconnect(tp);
		break;

	/*
	 * Accept a connection.  Essentially all the work is
	 * done at higher levels; just return the address
	 * of the peer, storing through addr.
	 */
	case PRU_ACCEPT:
		in_setpeeraddr(inp, nam);
		break;

	/*
	 * Mark the connection as being incapable of further output.
	 */
	case PRU_SHUTDOWN:
		socantsendmore(so);
		tp = tcp_usrclosed(tp);
		/* tcp_usrclosed() may hand back a null tcpcb; only send if not. */
		if (tp)
			error = tcp_output(tp);
		break;

	/*
	 * After a receive, possibly send window update to peer.
	 */
	case PRU_RCVD:
		(void) tcp_output(tp);
		break;

	/*
	 * Do a send by putting data in output queue and updating urgent
	 * marker if URG set.  Possibly send more data.
	 */
	case PRU_SEND_EOF:
	case PRU_SEND:
		sbappend(&so->so_snd, m);
		if (nam && tp->t_state < TCPS_SYN_SENT) {
			/*
			 * Do implied connect if not yet connected,
			 * initialize window to default value, and
			 * initialize maxseg/maxopd using peer's cached
			 * MSS.
			 */
			error = tcp_connect(tp, nam);
			if (error)
				break;
			tp->snd_wnd = TTCP_CLIENT_SND_WND;
			tcp_mss(tp, -1);
		}

		if (req == PRU_SEND_EOF) {
			/*
			 * Close the send side of the connection after
			 * the data is sent.
			 */
			socantsendmore(so);
			tp = tcp_usrclosed(tp);
		}
		if (tp != NULL)
			error = tcp_output(tp);
		break;

	/*
	 * Abort the TCP.
	 */
	case PRU_ABORT:
		tp = tcp_drop(tp, ECONNABORTED);
		break;

	/* Here m is really a struct stat pointer; returns without tracing. */
	case PRU_SENSE:
		((struct stat *) m)->st_blksize = so->so_snd.sb_hiwat;
		(void) splx(s);
		return (0);

	/* Here nam carries the receive flags (tested for MSG_PEEK). */
	case PRU_RCVOOB:
		if ((so->so_oobmark == 0 &&
		    (so->so_state & SS_RCVATMARK) == 0) ||
		    so->so_options & SO_OOBINLINE ||
		    tp->t_oobflags & TCPOOB_HADDATA) {
			error = EINVAL;
			break;
		}
		if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) {
			error = EWOULDBLOCK;
			break;
		}
		m->m_len = 1;
		*mtod(m, caddr_t) = tp->t_iobc;
		/* Unless peeking, flip HAVEDATA off and HADDATA on. */
		if (((int)nam & MSG_PEEK) == 0)
			tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA);
		break;

	case PRU_SENDOOB:
		if (sbspace(&so->so_snd) < -512) {
			m_freem(m);
			error = ENOBUFS;
			break;
		}
		/*
		 * According to RFC961 (Assigned Protocols),
		 * the urgent pointer points to the last octet
		 * of urgent data.  We continue, however,
		 * to consider it to indicate the first octet
		 * of data past the urgent section.
		 * Otherwise, snd_up should be one lower.
		 */
		sbappend(&so->so_snd, m);
		tp->snd_up = tp->snd_una + so->so_snd.sb_cc;
		/* t_force is set only for the duration of this output call. */
		tp->t_force = 1;
		error = tcp_output(tp);
		tp->t_force = 0;
		break;

	case PRU_SOCKADDR:
		in_setsockaddr(inp, nam);
		break;

	case PRU_PEERADDR:
		in_setpeeraddr(inp, nam);
		break;

	/*
	 * TCP slow timer went off; going through this
	 * routine for tracing's sake.
	 * Here nam carries the timer index rather than an mbuf.
	 */
	case PRU_SLOWTIMO:
		tp = tcp_timers(tp, (int)nam);
#ifdef TCPDEBUG
		req |= (int)nam << 8;		/* for debug's sake */
#endif
		break;

	default:
		panic("tcp_usrreq");
	}
#ifdef TCPDEBUG
	/* Trace only if a tcpcb survived the request and SO_DEBUG is set. */
	if (tp && (so->so_options & SO_DEBUG))
		tcp_trace(TA_USER, ostate, tp, (struct tcpiphdr *)0, req);
#endif
	splx(s);
	return (error);
}
#endif
397
/*
 * User-request tracing hooks.  They expand to nothing unless TCPDEBUG
 * is defined, and rely on names in the enclosing function:
 *
 *	TCPDEBUG0	declares "ostate"; must sit among the declarations.
 *	TCPDEBUG1()	records the tcpcb state before the request runs
 *			(0 when "tp" is null).
 *	TCPDEBUG2(req)	after the request, calls tcp_trace() if "tp" is
 *			still non-null and the socket "so" has SO_DEBUG set.
 *
 * TCPDEBUG2 is wrapped in do { } while (0) so it is a single statement
 * and cannot capture a following "else".
 */
#ifdef TCPDEBUG
#define	TCPDEBUG0	int ostate
#define	TCPDEBUG1()	ostate = tp ? tp->t_state : 0
#define	TCPDEBUG2(req)	do { \
				if (tp && (so->so_options & SO_DEBUG)) \
					tcp_trace(TA_USER, ostate, tp, 0, req); \
			} while (0)
#else
#define	TCPDEBUG0
#define	TCPDEBUG1()
#define	TCPDEBUG2(req)
#endif
408
409/*
410 * TCP attaches to socket via pru_attach(), reserving space,
411 * and an internet control block.
412 */
413static int
414tcp_usr_attach(struct socket *so, int proto)
415{
416	int s = splnet();
417	int error;
418	struct inpcb *inp = sotoinpcb(so);
419	struct tcpcb *tp = 0;
420	TCPDEBUG0;
421
422	TCPDEBUG1();
423	if (inp) {
424		error = EISCONN;
425		goto out;
426	}
427
428	error = tcp_attach(so);
429	if (error)
430		goto out;
431
432	if ((so->so_options & SO_LINGER) && so->so_linger == 0)
433		so->so_linger = TCP_LINGERTIME * hz;
434	tp = sototcpcb(so);
435out:
436	TCPDEBUG2(PRU_ATTACH);
437	splx(s);
438	return error;
439}
440
441/*
442 * pru_detach() detaches the TCP protocol from the socket.
443 * If the protocol state is non-embryonic, then can't
444 * do this directly: have to initiate a pru_disconnect(),
445 * which may finish later; embryonic TCB's can just
446 * be discarded here.
447 */
448static int
449tcp_usr_detach(struct socket *so)
450{
451	int s = splnet();
452	int error = 0;
453	struct inpcb *inp = sotoinpcb(so);
454	struct tcpcb *tp;
455	TCPDEBUG0;
456
457	if (inp == 0) {
458		splx(s);
459		return EINVAL;	/* XXX */
460	}
461	tp = intotcpcb(inp);
462	TCPDEBUG1();
463	if (tp->t_state > TCPS_LISTEN)
464		tp = tcp_disconnect(tp);
465	else
466		tp = tcp_close(tp);
467
468	TCPDEBUG2(PRU_DETACH);
469	splx(s);
470	return error;
471}
472
/*
 * Shared prologue/epilogue for the pr_usrreqs handlers that use them.
 *
 * COMMON_START() expects the enclosing function to have declared
 * "s" (the saved level from splnet()), "inp" and "tp".  It begins with
 * TCPDEBUG0, which may expand to a declaration, so it must be used
 * directly after the local declarations.  If the socket has no inpcb it
 * restores the level and returns EINVAL from the enclosing function;
 * otherwise it loads "tp" and records the pre-request state for tracing.
 *
 * COMMON_END(req) supplies the "out:" label that error paths jump to,
 * emits the trace record, restores the level and returns "error".
 * The trailing "goto out" can never execute; presumably it exists only
 * so that handlers with no explicit "goto out" still reference the label.
 */
#define	COMMON_START()	TCPDEBUG0; \
			do { \
				     if (inp == 0) { \
					     splx(s); \
					     return EINVAL; \
				     } \
				     tp = intotcpcb(inp); \
				     TCPDEBUG1(); \
		     } while(0)

#define COMMON_END(req)	out: TCPDEBUG2(req); splx(s); return error; goto out
484
485
486/*
487 * Give the socket an address.
488 */
489static int
490tcp_usr_bind(struct socket *so, struct mbuf *nam)
491{
492	int s = splnet();
493	int error = 0;
494	struct inpcb *inp = sotoinpcb(so);
495	struct tcpcb *tp;
496	struct sockaddr_in *sinp;
497
498	COMMON_START();
499
500	/*
501	 * Must check for multicast addresses and disallow binding
502	 * to them.
503	 */
504	sinp = mtod(nam, struct sockaddr_in *);
505	if (sinp->sin_family == AF_INET &&
506	    IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) {
507		error = EAFNOSUPPORT;
508		goto out;
509	}
510	error = in_pcbbind(inp, nam);
511	if (error)
512		goto out;
513	COMMON_END(PRU_BIND);
514
515}
516
517/*
518 * Prepare to accept connections.
519 */
520static int
521tcp_usr_listen(struct socket *so)
522{
523	int s = splnet();
524	int error = 0;
525	struct inpcb *inp = sotoinpcb(so);
526	struct tcpcb *tp;
527
528	COMMON_START();
529	if (inp->inp_lport == 0)
530		error = in_pcbbind(inp, NULL);
531	if (error == 0)
532		tp->t_state = TCPS_LISTEN;
533	COMMON_END(PRU_LISTEN);
534}
535
536/*
537 * Initiate connection to peer.
538 * Create a template for use in transmissions on this connection.
539 * Enter SYN_SENT state, and mark socket as connecting.
540 * Start keep-alive timer, and seed output sequence space.
541 * Send initial segment on connection.
542 */
543static int
544tcp_usr_connect(struct socket *so, struct mbuf *nam)
545{
546	int s = splnet();
547	int error = 0;
548	struct inpcb *inp = sotoinpcb(so);
549	struct tcpcb *tp;
550	struct sockaddr_in *sinp;
551
552	COMMON_START();
553
554	/*
555	 * Must disallow TCP ``connections'' to multicast addresses.
556	 */
557	sinp = mtod(nam, struct sockaddr_in *);
558	if (sinp->sin_family == AF_INET
559	    && IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) {
560		error = EAFNOSUPPORT;
561		goto out;
562	}
563
564	if ((error = tcp_connect(tp, nam)) != 0)
565		goto out;
566	error = tcp_output(tp);
567	COMMON_END(PRU_CONNECT);
568}
569
570/*
571 * Initiate disconnect from peer.
572 * If connection never passed embryonic stage, just drop;
573 * else if don't need to let data drain, then can just drop anyways,
574 * else have to begin TCP shutdown process: mark socket disconnecting,
575 * drain unread data, state switch to reflect user close, and
576 * send segment (e.g. FIN) to peer.  Socket will be really disconnected
577 * when peer sends FIN and acks ours.
578 *
579 * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB.
580 */
581static int
582tcp_usr_disconnect(struct socket *so)
583{
584	int s = splnet();
585	int error = 0;
586	struct inpcb *inp = sotoinpcb(so);
587	struct tcpcb *tp;
588
589	COMMON_START();
590	tp = tcp_disconnect(tp);
591	COMMON_END(PRU_DISCONNECT);
592}
593
594/*
595 * Accept a connection.  Essentially all the work is
596 * done at higher levels; just return the address
597 * of the peer, storing through addr.
598 */
599static int
600tcp_usr_accept(struct socket *so, struct mbuf *nam)
601{
602	int s = splnet();
603	int error = 0;
604	struct inpcb *inp = sotoinpcb(so);
605	struct tcpcb *tp;
606
607	COMMON_START();
608	in_setpeeraddr(inp, nam);
609	COMMON_END(PRU_ACCEPT);
610}
611
612/*
613 * Mark the connection as being incapable of further output.
614 */
615static int
616tcp_usr_shutdown(struct socket *so)
617{
618	int s = splnet();
619	int error = 0;
620	struct inpcb *inp = sotoinpcb(so);
621	struct tcpcb *tp;
622
623	COMMON_START();
624	socantsendmore(so);
625	tp = tcp_usrclosed(tp);
626	if (tp)
627		error = tcp_output(tp);
628	COMMON_END(PRU_SHUTDOWN);
629}
630
631/*
632 * After a receive, possibly send window update to peer.
633 */
634static int
635tcp_usr_rcvd(struct socket *so, int flags)
636{
637	int s = splnet();
638	int error = 0;
639	struct inpcb *inp = sotoinpcb(so);
640	struct tcpcb *tp;
641
642	COMMON_START();
643	tcp_output(tp);
644	COMMON_END(PRU_RCVD);
645}
646
647/*
648 * Do a send by putting data in output queue and updating urgent
649 * marker if URG set.  Possibly send more data.
650 */
651static int
652tcp_usr_send(struct socket *so, int flags, struct mbuf *m, struct mbuf *nam,
653	     struct mbuf *control)
654{
655	int s = splnet();
656	int error = 0;
657	struct inpcb *inp = sotoinpcb(so);
658	struct tcpcb *tp;
659
660	COMMON_START();
661	if (control && control->m_len) {
662		m_freem(control); /* XXX shouldn't caller do this??? */
663		if (m)
664			m_freem(m);
665		return EINVAL;
666	}
667
668	if(!(flags & PRUS_OOB)) {
669		sbappend(&so->so_snd, m);
670		if (nam && tp->t_state < TCPS_SYN_SENT) {
671			/*
672			 * Do implied connect if not yet connected,
673			 * initialize window to default value, and
674			 * initialize maxseg/maxopd using peer's cached
675			 * MSS.
676			 */
677			error = tcp_connect(tp, nam);
678			if (error)
679				goto out;
680			tp->snd_wnd = TTCP_CLIENT_SND_WND;
681			tcp_mss(tp, -1);
682		}
683
684		if (flags & PRUS_EOF) {
685			/*
686			 * Close the send side of the connection after
687			 * the data is sent.
688			 */
689			socantsendmore(so);
690			tp = tcp_usrclosed(tp);
691		}
692		if (tp != NULL)
693			error = tcp_output(tp);
694	} else {
695		if (sbspace(&so->so_snd) < -512) {
696			m_freem(m);
697			error = ENOBUFS;
698			goto out;
699		}
700		/*
701		 * According to RFC961 (Assigned Protocols),
702		 * the urgent pointer points to the last octet
703		 * of urgent data.  We continue, however,
704		 * to consider it to indicate the first octet
705		 * of data past the urgent section.
706		 * Otherwise, snd_up should be one lower.
707		 */
708		sbappend(&so->so_snd, m);
709		tp->snd_up = tp->snd_una + so->so_snd.sb_cc;
710		tp->t_force = 1;
711		error = tcp_output(tp);
712		tp->t_force = 0;
713	}
714	COMMON_END((flags & PRUS_OOB) ? PRU_SENDOOB :
715		   ((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND));
716}
717
718/*
719 * Abort the TCP.
720 */
721static int
722tcp_usr_abort(struct socket *so)
723{
724	int s = splnet();
725	int error = 0;
726	struct inpcb *inp = sotoinpcb(so);
727	struct tcpcb *tp;
728
729	COMMON_START();
730	tp = tcp_drop(tp, ECONNABORTED);
731	COMMON_END(PRU_ABORT);
732}
733
734/*
735 * Fill in st_bklsize for fstat() operations on a socket.
736 */
737static int
738tcp_usr_sense(struct socket *so, struct stat *sb)
739{
740	int s = splnet();
741
742	sb->st_blksize = so->so_snd.sb_hiwat;
743	splx(s);
744	return 0;
745}
746
747/*
748 * Receive out-of-band data.
749 */
750static int
751tcp_usr_rcvoob(struct socket *so, struct mbuf *m, int flags)
752{
753	int s = splnet();
754	int error = 0;
755	struct inpcb *inp = sotoinpcb(so);
756	struct tcpcb *tp;
757
758	COMMON_START();
759	if ((so->so_oobmark == 0 &&
760	     (so->so_state & SS_RCVATMARK) == 0) ||
761	    so->so_options & SO_OOBINLINE ||
762	    tp->t_oobflags & TCPOOB_HADDATA) {
763		error = EINVAL;
764		goto out;
765	}
766	if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) {
767		error = EWOULDBLOCK;
768		goto out;
769	}
770	m->m_len = 1;
771	*mtod(m, caddr_t) = tp->t_iobc;
772	if ((flags & MSG_PEEK) == 0)
773		tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA);
774	COMMON_END(PRU_RCVOOB);
775}
776
777static int
778tcp_usr_sockaddr(struct socket *so, struct mbuf *nam)
779{
780	int s = splnet();
781	int error = 0;
782	struct inpcb *inp = sotoinpcb(so);
783	struct tcpcb *tp;
784
785	COMMON_START();
786	in_setsockaddr(inp, nam);
787	COMMON_END(PRU_SOCKADDR);
788}
789
790static int
791tcp_usr_peeraddr(struct socket *so, struct mbuf *nam)
792{
793	int s = splnet();
794	int error = 0;
795	struct inpcb *inp = sotoinpcb(so);
796	struct tcpcb *tp;
797
798	COMMON_START();
799	in_setpeeraddr(inp, nam);
800	COMMON_END(PRU_PEERADDR);
801}
802
803/*
804 * XXX - this should just be a call to in_control, but we need to get
805 * the types worked out.
806 */
807static int
808tcp_usr_control(struct socket *so, int cmd, caddr_t arg, struct ifnet *ifp)
809{
810	return in_control(so, cmd, arg, ifp);
811}
812
/*
 * TCP's table of user-request handlers.
 *
 * The initializer is positional, so the entries presumably have to
 * follow the member order of struct pr_usrreqs -- verify against its
 * declaration before adding or reordering anything.  connect2 is the
 * only request TCP does not implement itself; it uses the generic
 * pru_connect2_notsupp.
 */
/* xxx - should be const */
struct pr_usrreqs tcp_usrreqs = {
	tcp_usr_abort, tcp_usr_accept, tcp_usr_attach, tcp_usr_bind,
	tcp_usr_connect, pru_connect2_notsupp, tcp_usr_control, tcp_usr_detach,
	tcp_usr_disconnect, tcp_usr_listen, tcp_usr_peeraddr, tcp_usr_rcvd,
	tcp_usr_rcvoob, tcp_usr_send, tcp_usr_sense, tcp_usr_shutdown,
	tcp_usr_sockaddr
};
821
822/*
823 * Common subroutine to open a TCP connection to remote host specified
824 * by struct sockaddr_in in mbuf *nam.  Call in_pcbbind to assign a local
825 * port number if needed.  Call in_pcbladdr to do the routing and to choose
826 * a local host address (interface).  If there is an existing incarnation
827 * of the same connection in TIME-WAIT state and if the remote host was
828 * sending CC options and if the connection duration was < MSL, then
829 * truncate the previous TIME-WAIT state and proceed.
830 * Initialize connection parameters and enter SYN-SENT state.
831 */
832static int
833tcp_connect(tp, nam)
834	register struct tcpcb *tp;
835	struct mbuf *nam;
836{
837	struct inpcb *inp = tp->t_inpcb, *oinp;
838	struct socket *so = inp->inp_socket;
839	struct tcpcb *otp;
840	struct sockaddr_in *sin = mtod(nam, struct sockaddr_in *);
841	struct sockaddr_in *ifaddr;
842	int error;
843	struct rmxp_tao *taop;
844	struct rmxp_tao tao_noncached;
845
846	if (inp->inp_lport == 0) {
847		error = in_pcbbind(inp, NULL);
848		if (error)
849			return error;
850	}
851
852	/*
853	 * Cannot simply call in_pcbconnect, because there might be an
854	 * earlier incarnation of this same connection still in
855	 * TIME_WAIT state, creating an ADDRINUSE error.
856	 */
857	error = in_pcbladdr(inp, nam, &ifaddr);
858	if (error)
859		return error;
860	oinp = in_pcblookup(inp->inp_pcbinfo->listhead,
861	    sin->sin_addr, sin->sin_port,
862	    inp->inp_laddr.s_addr != INADDR_ANY ? inp->inp_laddr
863						: ifaddr->sin_addr,
864	    inp->inp_lport,  0);
865	if (oinp) {
866		if (oinp != inp && (otp = intotcpcb(oinp)) != NULL &&
867		otp->t_state == TCPS_TIME_WAIT &&
868		    otp->t_duration < TCPTV_MSL &&
869		    (otp->t_flags & TF_RCVD_CC))
870			otp = tcp_close(otp);
871		else
872			return EADDRINUSE;
873	}
874	if (inp->inp_laddr.s_addr == INADDR_ANY)
875		inp->inp_laddr = ifaddr->sin_addr;
876	inp->inp_faddr = sin->sin_addr;
877	inp->inp_fport = sin->sin_port;
878	in_pcbrehash(inp);
879
880	tp->t_template = tcp_template(tp);
881	if (tp->t_template == 0) {
882		in_pcbdisconnect(inp);
883		return ENOBUFS;
884	}
885
886	/* Compute window scaling to request.  */
887	while (tp->request_r_scale < TCP_MAX_WINSHIFT &&
888	    (TCP_MAXWIN << tp->request_r_scale) < so->so_rcv.sb_hiwat)
889		tp->request_r_scale++;
890
891	soisconnecting(so);
892	tcpstat.tcps_connattempt++;
893	tp->t_state = TCPS_SYN_SENT;
894	tp->t_timer[TCPT_KEEP] = TCPTV_KEEP_INIT;
895	tp->iss = tcp_iss; tcp_iss += TCP_ISSINCR/2;
896	tcp_sendseqinit(tp);
897
898	/*
899	 * Generate a CC value for this connection and
900	 * check whether CC or CCnew should be used.
901	 */
902	if ((taop = tcp_gettaocache(tp->t_inpcb)) == NULL) {
903		taop = &tao_noncached;
904		bzero(taop, sizeof(*taop));
905	}
906
907	tp->cc_send = CC_INC(tcp_ccgen);
908	if (taop->tao_ccsent != 0 &&
909	    CC_GEQ(tp->cc_send, taop->tao_ccsent)) {
910		taop->tao_ccsent = tp->cc_send;
911	} else {
912		taop->tao_ccsent = 0;
913		tp->t_flags |= TF_SENDCCNEW;
914	}
915
916	return 0;
917}
918
919int
920tcp_ctloutput(op, so, level, optname, mp)
921	int op;
922	struct socket *so;
923	int level, optname;
924	struct mbuf **mp;
925{
926	int error = 0, s;
927	struct inpcb *inp;
928	register struct tcpcb *tp;
929	register struct mbuf *m;
930	register int i;
931
932	s = splnet();
933	inp = sotoinpcb(so);
934	if (inp == NULL) {
935		splx(s);
936		if (op == PRCO_SETOPT && *mp)
937			(void) m_free(*mp);
938		return (ECONNRESET);
939	}
940	if (level != IPPROTO_TCP) {
941		error = ip_ctloutput(op, so, level, optname, mp);
942		splx(s);
943		return (error);
944	}
945	tp = intotcpcb(inp);
946
947	switch (op) {
948
949	case PRCO_SETOPT:
950		m = *mp;
951		switch (optname) {
952
953		case TCP_NODELAY:
954			if (m == NULL || m->m_len < sizeof (int))
955				error = EINVAL;
956			else if (*mtod(m, int *))
957				tp->t_flags |= TF_NODELAY;
958			else
959				tp->t_flags &= ~TF_NODELAY;
960			break;
961
962		case TCP_MAXSEG:
963			if (m && (i = *mtod(m, int *)) > 0 && i <= tp->t_maxseg)
964				tp->t_maxseg = i;
965			else
966				error = EINVAL;
967			break;
968
969		case TCP_NOOPT:
970			if (m == NULL || m->m_len < sizeof (int))
971				error = EINVAL;
972			else if (*mtod(m, int *))
973				tp->t_flags |= TF_NOOPT;
974			else
975				tp->t_flags &= ~TF_NOOPT;
976			break;
977
978		case TCP_NOPUSH:
979			if (m == NULL || m->m_len < sizeof (int))
980				error = EINVAL;
981			else if (*mtod(m, int *))
982				tp->t_flags |= TF_NOPUSH;
983			else
984				tp->t_flags &= ~TF_NOPUSH;
985			break;
986
987		default:
988			error = ENOPROTOOPT;
989			break;
990		}
991		if (m)
992			(void) m_free(m);
993		break;
994
995	case PRCO_GETOPT:
996		*mp = m = m_get(M_WAIT, MT_SOOPTS);
997		m->m_len = sizeof(int);
998
999		switch (optname) {
1000		case TCP_NODELAY:
1001			*mtod(m, int *) = tp->t_flags & TF_NODELAY;
1002			break;
1003		case TCP_MAXSEG:
1004			*mtod(m, int *) = tp->t_maxseg;
1005			break;
1006		case TCP_NOOPT:
1007			*mtod(m, int *) = tp->t_flags & TF_NOOPT;
1008			break;
1009		case TCP_NOPUSH:
1010			*mtod(m, int *) = tp->t_flags & TF_NOPUSH;
1011			break;
1012		default:
1013			error = ENOPROTOOPT;
1014			break;
1015		}
1016		break;
1017	}
1018	splx(s);
1019	return (error);
1020}
1021
/*
 * tcp_sendspace and tcp_recvspace are the default send and receive window
 * sizes, respectively.  These are obsolescent (this information should
 * be set by the route).
 *
 * Both start at 16 KB and are registered as read-write sysctl variables
 * (sendspace and recvspace under _net_inet_tcp).  tcp_attach() passes
 * them to soreserve() when a socket has no buffer space reserved.
 *
 * NOTE(review): the variables are u_long but are registered with
 * SYSCTL_INT; the two widths presumably match only where long and int
 * are the same size -- confirm before relying on this on other targets.
 */
u_long	tcp_sendspace = 1024*16;
SYSCTL_INT(_net_inet_tcp, TCPCTL_SENDSPACE, sendspace,
	CTLFLAG_RW, &tcp_sendspace , 0, "");
u_long	tcp_recvspace = 1024*16;
SYSCTL_INT(_net_inet_tcp, TCPCTL_RECVSPACE, recvspace,
	CTLFLAG_RW, &tcp_recvspace , 0, "");
1033
1034/*
1035 * Attach TCP protocol to socket, allocating
1036 * internet protocol control block, tcp control block,
1037 * bufer space, and entering LISTEN state if to accept connections.
1038 */
1039static int
1040tcp_attach(so)
1041	struct socket *so;
1042{
1043	register struct tcpcb *tp;
1044	struct inpcb *inp;
1045	int error;
1046
1047	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
1048		error = soreserve(so, tcp_sendspace, tcp_recvspace);
1049		if (error)
1050			return (error);
1051	}
1052	error = in_pcballoc(so, &tcbinfo);
1053	if (error)
1054		return (error);
1055	inp = sotoinpcb(so);
1056	tp = tcp_newtcpcb(inp);
1057	if (tp == 0) {
1058		int nofd = so->so_state & SS_NOFDREF;	/* XXX */
1059
1060		so->so_state &= ~SS_NOFDREF;	/* don't free the socket yet */
1061		in_pcbdetach(inp);
1062		so->so_state |= nofd;
1063		return (ENOBUFS);
1064	}
1065	tp->t_state = TCPS_CLOSED;
1066	return (0);
1067}
1068
1069/*
1070 * Initiate (or continue) disconnect.
1071 * If embryonic state, just send reset (once).
1072 * If in ``let data drain'' option and linger null, just drop.
1073 * Otherwise (hard), mark socket disconnecting and drop
1074 * current input data; switch states based on user close, and
1075 * send segment to peer (with FIN).
1076 */
1077static struct tcpcb *
1078tcp_disconnect(tp)
1079	register struct tcpcb *tp;
1080{
1081	struct socket *so = tp->t_inpcb->inp_socket;
1082
1083	if (tp->t_state < TCPS_ESTABLISHED)
1084		tp = tcp_close(tp);
1085	else if ((so->so_options & SO_LINGER) && so->so_linger == 0)
1086		tp = tcp_drop(tp, 0);
1087	else {
1088		soisdisconnecting(so);
1089		sbflush(&so->so_rcv);
1090		tp = tcp_usrclosed(tp);
1091		if (tp)
1092			(void) tcp_output(tp);
1093	}
1094	return (tp);
1095}
1096
1097/*
1098 * User issued close, and wish to trail through shutdown states:
1099 * if never received SYN, just forget it.  If got a SYN from peer,
1100 * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN.
1101 * If already got a FIN from peer, then almost done; go to LAST_ACK
1102 * state.  In all other cases, have already sent FIN to peer (e.g.
1103 * after PRU_SHUTDOWN), and just have to play tedious game waiting
1104 * for peer to send FIN or not respond to keep-alives, etc.
1105 * We can let the user exit from the close as soon as the FIN is acked.
1106 */
1107static struct tcpcb *
1108tcp_usrclosed(tp)
1109	register struct tcpcb *tp;
1110{
1111
1112	switch (tp->t_state) {
1113
1114	case TCPS_CLOSED:
1115	case TCPS_LISTEN:
1116		tp->t_state = TCPS_CLOSED;
1117		tp = tcp_close(tp);
1118		break;
1119
1120	case TCPS_SYN_SENT:
1121	case TCPS_SYN_RECEIVED:
1122		tp->t_flags |= TF_NEEDFIN;
1123		break;
1124
1125	case TCPS_ESTABLISHED:
1126		tp->t_state = TCPS_FIN_WAIT_1;
1127		break;
1128
1129	case TCPS_CLOSE_WAIT:
1130		tp->t_state = TCPS_LAST_ACK;
1131		break;
1132	}
1133	if (tp && tp->t_state >= TCPS_FIN_WAIT_2) {
1134		soisdisconnected(tp->t_inpcb->inp_socket);
1135		/* To prevent the connection hanging in FIN_WAIT_2 forever. */
1136		if (tp->t_state == TCPS_FIN_WAIT_2)
1137			tp->t_timer[TCPT_2MSL] = tcp_maxidle;
1138	}
1139	return (tp);
1140}
1141
1142