tcp_usrreq.c revision 14546
1/*
2 * Copyright (c) 1982, 1986, 1988, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 * 3. All advertising materials mentioning features or use of this software
14 *    must display the following acknowledgement:
15 *	This product includes software developed by the University of
16 *	California, Berkeley and its contributors.
17 * 4. Neither the name of the University nor the names of its contributors
18 *    may be used to endorse or promote products derived from this software
19 *    without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31 * SUCH DAMAGE.
32 *
33 *	From: @(#)tcp_usrreq.c	8.2 (Berkeley) 1/3/94
34 *	$Id: tcp_usrreq.c,v 1.21 1995/12/06 23:37:42 bde Exp $
35 */
36
37#include <sys/param.h>
38#include <sys/queue.h>
39#include <sys/systm.h>
40#include <sys/kernel.h>
41#include <sys/sysctl.h>
42#include <sys/malloc.h>
43#include <sys/mbuf.h>
44#include <sys/socket.h>
45#include <sys/socketvar.h>
46#include <sys/protosw.h>
47#include <sys/errno.h>
48#include <sys/stat.h>
49
50#include <net/if.h>
51#include <net/route.h>
52
53#include <netinet/in.h>
54#include <netinet/in_systm.h>
55#include <netinet/ip.h>
56#include <netinet/in_pcb.h>
57#include <netinet/in_var.h>
58#include <netinet/ip_var.h>
59#include <netinet/tcp.h>
60#include <netinet/tcp_fsm.h>
61#include <netinet/tcp_seq.h>
62#include <netinet/tcp_timer.h>
63#include <netinet/tcp_var.h>
64#include <netinet/tcpip.h>
65#ifdef TCPDEBUG
66#include <netinet/tcp_debug.h>
67#endif
68
69/*
70 * TCP protocol interface to socket abstraction.
71 */
72extern	char *tcpstates[];
73
74static int	tcp_attach __P((struct socket *));
75static int	tcp_connect __P((struct tcpcb *, struct mbuf *));
76static struct tcpcb *
77		tcp_disconnect __P((struct tcpcb *));
78static struct tcpcb *
79		tcp_usrclosed __P((struct tcpcb *));
80/*
81 * Process a TCP user request for TCP tb.  If this is a send request
82 * then m is the mbuf chain of send data.  If this is a timer expiration
83 * (called from the software clock routine), then timertype tells which timer.
84 */
85/*ARGSUSED*/
86int
87tcp_usrreq(so, req, m, nam, control)
88	struct socket *so;
89	int req;
90	struct mbuf *m, *nam, *control;
91{
92	register struct inpcb *inp;
93	register struct tcpcb *tp = 0;
94	struct sockaddr_in *sinp;
95	int s;
96	int error = 0;
97#ifdef TCPDEBUG
98	int ostate;
99#endif
100
101	if (req == PRU_CONTROL)
102		return (in_control(so, (u_long)m, (caddr_t)nam,
103			(struct ifnet *)control));
104	if (control && control->m_len) {
105		m_freem(control);
106		if (m)
107			m_freem(m);
108		return (EINVAL);
109	}
110
111	s = splnet();
112	inp = sotoinpcb(so);
113	/*
114	 * When a TCP is attached to a socket, then there will be
115	 * a (struct inpcb) pointed at by the socket, and this
116	 * structure will point at a subsidary (struct tcpcb).
117	 */
118	if (inp == 0 && req != PRU_ATTACH) {
119		splx(s);
120#if 0
121		/*
122		 * The following corrects an mbuf leak under rare
123		 * circumstances, but has not been fully tested.
124		 */
125		if (m && req != PRU_SENSE)
126			m_freem(m);
127#else
128		/* safer version of fix for mbuf leak */
129		if (m && (req == PRU_SEND || req == PRU_SENDOOB))
130			m_freem(m);
131#endif
132		return (EINVAL);		/* XXX */
133	}
134	if (inp) {
135		tp = intotcpcb(inp);
136		/* WHAT IF TP IS 0? */
137#ifdef KPROF
138		tcp_acounts[tp->t_state][req]++;
139#endif
140#ifdef TCPDEBUG
141		ostate = tp->t_state;
142	} else
143		ostate = 0;
144#else /* TCPDEBUG */
145	}
146#endif /* TCPDEBUG */
147
148	switch (req) {
149
150	/*
151	 * TCP attaches to socket via PRU_ATTACH, reserving space,
152	 * and an internet control block.
153	 */
154	case PRU_ATTACH:
155		if (inp) {
156			error = EISCONN;
157			break;
158		}
159		error = tcp_attach(so);
160		if (error)
161			break;
162		if ((so->so_options & SO_LINGER) && so->so_linger == 0)
163			so->so_linger = TCP_LINGERTIME * hz;
164		tp = sototcpcb(so);
165		break;
166
167	/*
168	 * PRU_DETACH detaches the TCP protocol from the socket.
169	 * If the protocol state is non-embryonic, then can't
170	 * do this directly: have to initiate a PRU_DISCONNECT,
171	 * which may finish later; embryonic TCB's can just
172	 * be discarded here.
173	 */
174	case PRU_DETACH:
175		if (tp->t_state > TCPS_LISTEN)
176			tp = tcp_disconnect(tp);
177		else
178			tp = tcp_close(tp);
179		break;
180
181	/*
182	 * Give the socket an address.
183	 */
184	case PRU_BIND:
185		/*
186		 * Must check for multicast addresses and disallow binding
187		 * to them.
188		 */
189		sinp = mtod(nam, struct sockaddr_in *);
190		if (sinp->sin_family == AF_INET &&
191		    IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) {
192			error = EAFNOSUPPORT;
193			break;
194		}
195		error = in_pcbbind(inp, nam);
196		if (error)
197			break;
198		break;
199
200	/*
201	 * Prepare to accept connections.
202	 */
203	case PRU_LISTEN:
204		if (inp->inp_lport == 0)
205			error = in_pcbbind(inp, NULL);
206		if (error == 0)
207			tp->t_state = TCPS_LISTEN;
208		break;
209
210	/*
211	 * Initiate connection to peer.
212	 * Create a template for use in transmissions on this connection.
213	 * Enter SYN_SENT state, and mark socket as connecting.
214	 * Start keep-alive timer, and seed output sequence space.
215	 * Send initial segment on connection.
216	 */
217	case PRU_CONNECT:
218		/*
219		 * Must disallow TCP ``connections'' to multicast addresses.
220		 */
221		sinp = mtod(nam, struct sockaddr_in *);
222		if (sinp->sin_family == AF_INET
223		    && IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) {
224			error = EAFNOSUPPORT;
225			break;
226		}
227
228		if ((error = tcp_connect(tp, nam)) != 0)
229			break;
230		error = tcp_output(tp);
231		break;
232
233	/*
234	 * Create a TCP connection between two sockets.
235	 */
236	case PRU_CONNECT2:
237		error = EOPNOTSUPP;
238		break;
239
240	/*
241	 * Initiate disconnect from peer.
242	 * If connection never passed embryonic stage, just drop;
243	 * else if don't need to let data drain, then can just drop anyways,
244	 * else have to begin TCP shutdown process: mark socket disconnecting,
245	 * drain unread data, state switch to reflect user close, and
246	 * send segment (e.g. FIN) to peer.  Socket will be really disconnected
247	 * when peer sends FIN and acks ours.
248	 *
249	 * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB.
250	 */
251	case PRU_DISCONNECT:
252		tp = tcp_disconnect(tp);
253		break;
254
255	/*
256	 * Accept a connection.  Essentially all the work is
257	 * done at higher levels; just return the address
258	 * of the peer, storing through addr.
259	 */
260	case PRU_ACCEPT:
261		in_setpeeraddr(inp, nam);
262		break;
263
264	/*
265	 * Mark the connection as being incapable of further output.
266	 */
267	case PRU_SHUTDOWN:
268		socantsendmore(so);
269		tp = tcp_usrclosed(tp);
270		if (tp)
271			error = tcp_output(tp);
272		break;
273
274	/*
275	 * After a receive, possibly send window update to peer.
276	 */
277	case PRU_RCVD:
278		(void) tcp_output(tp);
279		break;
280
281	/*
282	 * Do a send by putting data in output queue and updating urgent
283	 * marker if URG set.  Possibly send more data.
284	 */
285	case PRU_SEND_EOF:
286	case PRU_SEND:
287		sbappend(&so->so_snd, m);
288		if (nam && tp->t_state < TCPS_SYN_SENT) {
289			/*
290			 * Do implied connect if not yet connected,
291			 * initialize window to default value, and
292			 * initialize maxseg/maxopd using peer's cached
293			 * MSS.
294			 */
295			error = tcp_connect(tp, nam);
296			if (error)
297				break;
298			tp->snd_wnd = TTCP_CLIENT_SND_WND;
299			tcp_mss(tp, -1);
300		}
301
302		if (req == PRU_SEND_EOF) {
303			/*
304			 * Close the send side of the connection after
305			 * the data is sent.
306			 */
307			socantsendmore(so);
308			tp = tcp_usrclosed(tp);
309		}
310		if (tp != NULL)
311			error = tcp_output(tp);
312		break;
313
314	/*
315	 * Abort the TCP.
316	 */
317	case PRU_ABORT:
318		tp = tcp_drop(tp, ECONNABORTED);
319		break;
320
321	case PRU_SENSE:
322		((struct stat *) m)->st_blksize = so->so_snd.sb_hiwat;
323		(void) splx(s);
324		return (0);
325
326	case PRU_RCVOOB:
327		if ((so->so_oobmark == 0 &&
328		    (so->so_state & SS_RCVATMARK) == 0) ||
329		    so->so_options & SO_OOBINLINE ||
330		    tp->t_oobflags & TCPOOB_HADDATA) {
331			error = EINVAL;
332			break;
333		}
334		if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) {
335			error = EWOULDBLOCK;
336			break;
337		}
338		m->m_len = 1;
339		*mtod(m, caddr_t) = tp->t_iobc;
340		if (((int)nam & MSG_PEEK) == 0)
341			tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA);
342		break;
343
344	case PRU_SENDOOB:
345		if (sbspace(&so->so_snd) < -512) {
346			m_freem(m);
347			error = ENOBUFS;
348			break;
349		}
350		/*
351		 * According to RFC961 (Assigned Protocols),
352		 * the urgent pointer points to the last octet
353		 * of urgent data.  We continue, however,
354		 * to consider it to indicate the first octet
355		 * of data past the urgent section.
356		 * Otherwise, snd_up should be one lower.
357		 */
358		sbappend(&so->so_snd, m);
359		tp->snd_up = tp->snd_una + so->so_snd.sb_cc;
360		tp->t_force = 1;
361		error = tcp_output(tp);
362		tp->t_force = 0;
363		break;
364
365	case PRU_SOCKADDR:
366		in_setsockaddr(inp, nam);
367		break;
368
369	case PRU_PEERADDR:
370		in_setpeeraddr(inp, nam);
371		break;
372
373	/*
374	 * TCP slow timer went off; going through this
375	 * routine for tracing's sake.
376	 */
377	case PRU_SLOWTIMO:
378		tp = tcp_timers(tp, (int)nam);
379#ifdef TCPDEBUG
380		req |= (int)nam << 8;		/* for debug's sake */
381#endif
382		break;
383
384	default:
385		panic("tcp_usrreq");
386	}
387#ifdef TCPDEBUG
388	if (tp && (so->so_options & SO_DEBUG))
389		tcp_trace(TA_USER, ostate, tp, (struct tcpiphdr *)0, req);
390#endif
391	splx(s);
392	return (error);
393}
394
395/*
396 * Common subroutine to open a TCP connection to remote host specified
397 * by struct sockaddr_in in mbuf *nam.  Call in_pcbbind to assign a local
398 * port number if needed.  Call in_pcbladdr to do the routing and to choose
399 * a local host address (interface).  If there is an existing incarnation
400 * of the same connection in TIME-WAIT state and if the remote host was
401 * sending CC options and if the connection duration was < MSL, then
402 * truncate the previous TIME-WAIT state and proceed.
403 * Initialize connection parameters and enter SYN-SENT state.
404 */
405static int
406tcp_connect(tp, nam)
407	register struct tcpcb *tp;
408	struct mbuf *nam;
409{
410	struct inpcb *inp = tp->t_inpcb, *oinp;
411	struct socket *so = inp->inp_socket;
412	struct tcpcb *otp;
413	struct sockaddr_in *sin = mtod(nam, struct sockaddr_in *);
414	struct sockaddr_in *ifaddr;
415	int error;
416	struct rmxp_tao *taop;
417	struct rmxp_tao tao_noncached;
418
419	if (inp->inp_lport == 0) {
420		error = in_pcbbind(inp, NULL);
421		if (error)
422			return error;
423	}
424
425	/*
426	 * Cannot simply call in_pcbconnect, because there might be an
427	 * earlier incarnation of this same connection still in
428	 * TIME_WAIT state, creating an ADDRINUSE error.
429	 */
430	error = in_pcbladdr(inp, nam, &ifaddr);
431	if (error)
432		return error;
433	oinp = in_pcblookup(inp->inp_pcbinfo->listhead,
434	    sin->sin_addr, sin->sin_port,
435	    inp->inp_laddr.s_addr != INADDR_ANY ? inp->inp_laddr
436						: ifaddr->sin_addr,
437	    inp->inp_lport,  0);
438	if (oinp) {
439		if (oinp != inp && (otp = intotcpcb(oinp)) != NULL &&
440		otp->t_state == TCPS_TIME_WAIT &&
441		    otp->t_duration < TCPTV_MSL &&
442		    (otp->t_flags & TF_RCVD_CC))
443			otp = tcp_close(otp);
444		else
445			return EADDRINUSE;
446	}
447	if (inp->inp_laddr.s_addr == INADDR_ANY)
448		inp->inp_laddr = ifaddr->sin_addr;
449	inp->inp_faddr = sin->sin_addr;
450	inp->inp_fport = sin->sin_port;
451	in_pcbrehash(inp);
452
453	tp->t_template = tcp_template(tp);
454	if (tp->t_template == 0) {
455		in_pcbdisconnect(inp);
456		return ENOBUFS;
457	}
458
459	/* Compute window scaling to request.  */
460	while (tp->request_r_scale < TCP_MAX_WINSHIFT &&
461	    (TCP_MAXWIN << tp->request_r_scale) < so->so_rcv.sb_hiwat)
462		tp->request_r_scale++;
463
464	soisconnecting(so);
465	tcpstat.tcps_connattempt++;
466	tp->t_state = TCPS_SYN_SENT;
467	tp->t_timer[TCPT_KEEP] = TCPTV_KEEP_INIT;
468	tp->iss = tcp_iss; tcp_iss += TCP_ISSINCR/2;
469	tcp_sendseqinit(tp);
470
471	/*
472	 * Generate a CC value for this connection and
473	 * check whether CC or CCnew should be used.
474	 */
475	if ((taop = tcp_gettaocache(tp->t_inpcb)) == NULL) {
476		taop = &tao_noncached;
477		bzero(taop, sizeof(*taop));
478	}
479
480	tp->cc_send = CC_INC(tcp_ccgen);
481	if (taop->tao_ccsent != 0 &&
482	    CC_GEQ(tp->cc_send, taop->tao_ccsent)) {
483		taop->tao_ccsent = tp->cc_send;
484	} else {
485		taop->tao_ccsent = 0;
486		tp->t_flags |= TF_SENDCCNEW;
487	}
488
489	return 0;
490}
491
492int
493tcp_ctloutput(op, so, level, optname, mp)
494	int op;
495	struct socket *so;
496	int level, optname;
497	struct mbuf **mp;
498{
499	int error = 0, s;
500	struct inpcb *inp;
501	register struct tcpcb *tp;
502	register struct mbuf *m;
503	register int i;
504
505	s = splnet();
506	inp = sotoinpcb(so);
507	if (inp == NULL) {
508		splx(s);
509		if (op == PRCO_SETOPT && *mp)
510			(void) m_free(*mp);
511		return (ECONNRESET);
512	}
513	if (level != IPPROTO_TCP) {
514		error = ip_ctloutput(op, so, level, optname, mp);
515		splx(s);
516		return (error);
517	}
518	tp = intotcpcb(inp);
519
520	switch (op) {
521
522	case PRCO_SETOPT:
523		m = *mp;
524		switch (optname) {
525
526		case TCP_NODELAY:
527			if (m == NULL || m->m_len < sizeof (int))
528				error = EINVAL;
529			else if (*mtod(m, int *))
530				tp->t_flags |= TF_NODELAY;
531			else
532				tp->t_flags &= ~TF_NODELAY;
533			break;
534
535		case TCP_MAXSEG:
536			if (m && (i = *mtod(m, int *)) > 0 && i <= tp->t_maxseg)
537				tp->t_maxseg = i;
538			else
539				error = EINVAL;
540			break;
541
542		case TCP_NOOPT:
543			if (m == NULL || m->m_len < sizeof (int))
544				error = EINVAL;
545			else if (*mtod(m, int *))
546				tp->t_flags |= TF_NOOPT;
547			else
548				tp->t_flags &= ~TF_NOOPT;
549			break;
550
551		case TCP_NOPUSH:
552			if (m == NULL || m->m_len < sizeof (int))
553				error = EINVAL;
554			else if (*mtod(m, int *))
555				tp->t_flags |= TF_NOPUSH;
556			else
557				tp->t_flags &= ~TF_NOPUSH;
558			break;
559
560		default:
561			error = ENOPROTOOPT;
562			break;
563		}
564		if (m)
565			(void) m_free(m);
566		break;
567
568	case PRCO_GETOPT:
569		*mp = m = m_get(M_WAIT, MT_SOOPTS);
570		m->m_len = sizeof(int);
571
572		switch (optname) {
573		case TCP_NODELAY:
574			*mtod(m, int *) = tp->t_flags & TF_NODELAY;
575			break;
576		case TCP_MAXSEG:
577			*mtod(m, int *) = tp->t_maxseg;
578			break;
579		case TCP_NOOPT:
580			*mtod(m, int *) = tp->t_flags & TF_NOOPT;
581			break;
582		case TCP_NOPUSH:
583			*mtod(m, int *) = tp->t_flags & TF_NOPUSH;
584			break;
585		default:
586			error = ENOPROTOOPT;
587			break;
588		}
589		break;
590	}
591	splx(s);
592	return (error);
593}
594
595/*
596 * tcp_sendspace and tcp_recvspace are the default send and receive window
597 * sizes, respectively.  These are obsolescent (this information should
598 * be set by the route).
599 */
600u_long	tcp_sendspace = 1024*16;
601SYSCTL_INT(_net_inet_tcp, TCPCTL_SENDSPACE, sendspace,
602	CTLFLAG_RW, &tcp_sendspace , 0, "");
603u_long	tcp_recvspace = 1024*16;
604SYSCTL_INT(_net_inet_tcp, TCPCTL_RECVSPACE, recvspace,
605	CTLFLAG_RW, &tcp_recvspace , 0, "");
606
607/*
608 * Attach TCP protocol to socket, allocating
609 * internet protocol control block, tcp control block,
610 * bufer space, and entering LISTEN state if to accept connections.
611 */
612static int
613tcp_attach(so)
614	struct socket *so;
615{
616	register struct tcpcb *tp;
617	struct inpcb *inp;
618	int error;
619
620	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
621		error = soreserve(so, tcp_sendspace, tcp_recvspace);
622		if (error)
623			return (error);
624	}
625	error = in_pcballoc(so, &tcbinfo);
626	if (error)
627		return (error);
628	inp = sotoinpcb(so);
629	tp = tcp_newtcpcb(inp);
630	if (tp == 0) {
631		int nofd = so->so_state & SS_NOFDREF;	/* XXX */
632
633		so->so_state &= ~SS_NOFDREF;	/* don't free the socket yet */
634		in_pcbdetach(inp);
635		so->so_state |= nofd;
636		return (ENOBUFS);
637	}
638	tp->t_state = TCPS_CLOSED;
639	return (0);
640}
641
642/*
643 * Initiate (or continue) disconnect.
644 * If embryonic state, just send reset (once).
645 * If in ``let data drain'' option and linger null, just drop.
646 * Otherwise (hard), mark socket disconnecting and drop
647 * current input data; switch states based on user close, and
648 * send segment to peer (with FIN).
649 */
650static struct tcpcb *
651tcp_disconnect(tp)
652	register struct tcpcb *tp;
653{
654	struct socket *so = tp->t_inpcb->inp_socket;
655
656	if (tp->t_state < TCPS_ESTABLISHED)
657		tp = tcp_close(tp);
658	else if ((so->so_options & SO_LINGER) && so->so_linger == 0)
659		tp = tcp_drop(tp, 0);
660	else {
661		soisdisconnecting(so);
662		sbflush(&so->so_rcv);
663		tp = tcp_usrclosed(tp);
664		if (tp)
665			(void) tcp_output(tp);
666	}
667	return (tp);
668}
669
670/*
671 * User issued close, and wish to trail through shutdown states:
672 * if never received SYN, just forget it.  If got a SYN from peer,
673 * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN.
674 * If already got a FIN from peer, then almost done; go to LAST_ACK
675 * state.  In all other cases, have already sent FIN to peer (e.g.
676 * after PRU_SHUTDOWN), and just have to play tedious game waiting
677 * for peer to send FIN or not respond to keep-alives, etc.
678 * We can let the user exit from the close as soon as the FIN is acked.
679 */
680static struct tcpcb *
681tcp_usrclosed(tp)
682	register struct tcpcb *tp;
683{
684
685	switch (tp->t_state) {
686
687	case TCPS_CLOSED:
688	case TCPS_LISTEN:
689		tp->t_state = TCPS_CLOSED;
690		tp = tcp_close(tp);
691		break;
692
693	case TCPS_SYN_SENT:
694	case TCPS_SYN_RECEIVED:
695		tp->t_flags |= TF_NEEDFIN;
696		break;
697
698	case TCPS_ESTABLISHED:
699		tp->t_state = TCPS_FIN_WAIT_1;
700		break;
701
702	case TCPS_CLOSE_WAIT:
703		tp->t_state = TCPS_LAST_ACK;
704		break;
705	}
706	if (tp && tp->t_state >= TCPS_FIN_WAIT_2) {
707		soisdisconnected(tp->t_inpcb->inp_socket);
708		/* To prevent the connection hanging in FIN_WAIT_2 forever. */
709		if (tp->t_state == TCPS_FIN_WAIT_2)
710			tp->t_timer[TCPT_2MSL] = tcp_maxidle;
711	}
712	return (tp);
713}
714
715