tcp_usrreq.c revision 12657
1/*
2 * Copyright (c) 1982, 1986, 1988, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 * 3. All advertising materials mentioning features or use of this software
14 *    must display the following acknowledgement:
15 *	This product includes software developed by the University of
16 *	California, Berkeley and its contributors.
17 * 4. Neither the name of the University nor the names of its contributors
18 *    may be used to endorse or promote products derived from this software
19 *    without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31 * SUCH DAMAGE.
32 *
33 *	From: @(#)tcp_usrreq.c	8.2 (Berkeley) 1/3/94
34 *	$Id: tcp_usrreq.c,v 1.20 1995/11/14 20:34:47 phk Exp $
35 */
36
37#include <sys/param.h>
38#include <sys/systm.h>
39#include <sys/kernel.h>
40#include <sys/sysctl.h>
41#include <sys/malloc.h>
42#include <sys/mbuf.h>
43#include <sys/socket.h>
44#include <sys/socketvar.h>
45#include <sys/protosw.h>
46#include <sys/errno.h>
47#include <sys/stat.h>
48
49#include <net/if.h>
50#include <net/route.h>
51
52#include <netinet/in.h>
53#include <netinet/in_systm.h>
54#include <netinet/ip.h>
55#include <netinet/in_pcb.h>
56#include <netinet/in_var.h>
57#include <netinet/ip_var.h>
58#include <netinet/tcp.h>
59#include <netinet/tcp_fsm.h>
60#include <netinet/tcp_seq.h>
61#include <netinet/tcp_timer.h>
62#include <netinet/tcp_var.h>
63#include <netinet/tcpip.h>
64#ifdef TCPDEBUG
65#include <netinet/tcp_debug.h>
66#endif
67
68/*
69 * TCP protocol interface to socket abstraction.
70 */
71extern	char *tcpstates[];
72
73static int	tcp_attach __P((struct socket *));
74static int	tcp_connect __P((struct tcpcb *, struct mbuf *));
75static struct tcpcb *
76		tcp_disconnect __P((struct tcpcb *));
77static struct tcpcb *
78		tcp_usrclosed __P((struct tcpcb *));
79/*
80 * Process a TCP user request for TCP tb.  If this is a send request
81 * then m is the mbuf chain of send data.  If this is a timer expiration
82 * (called from the software clock routine), then timertype tells which timer.
83 */
84/*ARGSUSED*/
85int
86tcp_usrreq(so, req, m, nam, control)
87	struct socket *so;
88	int req;
89	struct mbuf *m, *nam, *control;
90{
91	register struct inpcb *inp;
92	register struct tcpcb *tp = 0;
93	struct sockaddr_in *sinp;
94	int s;
95	int error = 0;
96#ifdef TCPDEBUG
97	int ostate;
98#endif
99
100	if (req == PRU_CONTROL)
101		return (in_control(so, (u_long)m, (caddr_t)nam,
102			(struct ifnet *)control));
103	if (control && control->m_len) {
104		m_freem(control);
105		if (m)
106			m_freem(m);
107		return (EINVAL);
108	}
109
110	s = splnet();
111	inp = sotoinpcb(so);
112	/*
113	 * When a TCP is attached to a socket, then there will be
114	 * a (struct inpcb) pointed at by the socket, and this
115	 * structure will point at a subsidary (struct tcpcb).
116	 */
117	if (inp == 0 && req != PRU_ATTACH) {
118		splx(s);
119#if 0
120		/*
121		 * The following corrects an mbuf leak under rare
122		 * circumstances, but has not been fully tested.
123		 */
124		if (m && req != PRU_SENSE)
125			m_freem(m);
126#else
127		/* safer version of fix for mbuf leak */
128		if (m && (req == PRU_SEND || req == PRU_SENDOOB))
129			m_freem(m);
130#endif
131		return (EINVAL);		/* XXX */
132	}
133	if (inp) {
134		tp = intotcpcb(inp);
135		/* WHAT IF TP IS 0? */
136#ifdef KPROF
137		tcp_acounts[tp->t_state][req]++;
138#endif
139#ifdef TCPDEBUG
140		ostate = tp->t_state;
141	} else
142		ostate = 0;
143#else /* TCPDEBUG */
144	}
145#endif /* TCPDEBUG */
146
147	switch (req) {
148
149	/*
150	 * TCP attaches to socket via PRU_ATTACH, reserving space,
151	 * and an internet control block.
152	 */
153	case PRU_ATTACH:
154		if (inp) {
155			error = EISCONN;
156			break;
157		}
158		error = tcp_attach(so);
159		if (error)
160			break;
161		if ((so->so_options & SO_LINGER) && so->so_linger == 0)
162			so->so_linger = TCP_LINGERTIME * hz;
163		tp = sototcpcb(so);
164		break;
165
166	/*
167	 * PRU_DETACH detaches the TCP protocol from the socket.
168	 * If the protocol state is non-embryonic, then can't
169	 * do this directly: have to initiate a PRU_DISCONNECT,
170	 * which may finish later; embryonic TCB's can just
171	 * be discarded here.
172	 */
173	case PRU_DETACH:
174		if (tp->t_state > TCPS_LISTEN)
175			tp = tcp_disconnect(tp);
176		else
177			tp = tcp_close(tp);
178		break;
179
180	/*
181	 * Give the socket an address.
182	 */
183	case PRU_BIND:
184		/*
185		 * Must check for multicast addresses and disallow binding
186		 * to them.
187		 */
188		sinp = mtod(nam, struct sockaddr_in *);
189		if (sinp->sin_family == AF_INET &&
190		    IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) {
191			error = EAFNOSUPPORT;
192			break;
193		}
194		error = in_pcbbind(inp, nam);
195		if (error)
196			break;
197		break;
198
199	/*
200	 * Prepare to accept connections.
201	 */
202	case PRU_LISTEN:
203		if (inp->inp_lport == 0)
204			error = in_pcbbind(inp, NULL);
205		if (error == 0)
206			tp->t_state = TCPS_LISTEN;
207		break;
208
209	/*
210	 * Initiate connection to peer.
211	 * Create a template for use in transmissions on this connection.
212	 * Enter SYN_SENT state, and mark socket as connecting.
213	 * Start keep-alive timer, and seed output sequence space.
214	 * Send initial segment on connection.
215	 */
216	case PRU_CONNECT:
217		/*
218		 * Must disallow TCP ``connections'' to multicast addresses.
219		 */
220		sinp = mtod(nam, struct sockaddr_in *);
221		if (sinp->sin_family == AF_INET
222		    && IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) {
223			error = EAFNOSUPPORT;
224			break;
225		}
226
227		if ((error = tcp_connect(tp, nam)) != 0)
228			break;
229		error = tcp_output(tp);
230		break;
231
232	/*
233	 * Create a TCP connection between two sockets.
234	 */
235	case PRU_CONNECT2:
236		error = EOPNOTSUPP;
237		break;
238
239	/*
240	 * Initiate disconnect from peer.
241	 * If connection never passed embryonic stage, just drop;
242	 * else if don't need to let data drain, then can just drop anyways,
243	 * else have to begin TCP shutdown process: mark socket disconnecting,
244	 * drain unread data, state switch to reflect user close, and
245	 * send segment (e.g. FIN) to peer.  Socket will be really disconnected
246	 * when peer sends FIN and acks ours.
247	 *
248	 * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB.
249	 */
250	case PRU_DISCONNECT:
251		tp = tcp_disconnect(tp);
252		break;
253
254	/*
255	 * Accept a connection.  Essentially all the work is
256	 * done at higher levels; just return the address
257	 * of the peer, storing through addr.
258	 */
259	case PRU_ACCEPT:
260		in_setpeeraddr(inp, nam);
261		break;
262
263	/*
264	 * Mark the connection as being incapable of further output.
265	 */
266	case PRU_SHUTDOWN:
267		socantsendmore(so);
268		tp = tcp_usrclosed(tp);
269		if (tp)
270			error = tcp_output(tp);
271		break;
272
273	/*
274	 * After a receive, possibly send window update to peer.
275	 */
276	case PRU_RCVD:
277		(void) tcp_output(tp);
278		break;
279
280	/*
281	 * Do a send by putting data in output queue and updating urgent
282	 * marker if URG set.  Possibly send more data.
283	 */
284	case PRU_SEND_EOF:
285	case PRU_SEND:
286		sbappend(&so->so_snd, m);
287		if (nam && tp->t_state < TCPS_SYN_SENT) {
288			/*
289			 * Do implied connect if not yet connected,
290			 * initialize window to default value, and
291			 * initialize maxseg/maxopd using peer's cached
292			 * MSS.
293			 */
294			error = tcp_connect(tp, nam);
295			if (error)
296				break;
297			tp->snd_wnd = TTCP_CLIENT_SND_WND;
298			tcp_mss(tp, -1);
299		}
300
301		if (req == PRU_SEND_EOF) {
302			/*
303			 * Close the send side of the connection after
304			 * the data is sent.
305			 */
306			socantsendmore(so);
307			tp = tcp_usrclosed(tp);
308		}
309		if (tp != NULL)
310			error = tcp_output(tp);
311		break;
312
313	/*
314	 * Abort the TCP.
315	 */
316	case PRU_ABORT:
317		tp = tcp_drop(tp, ECONNABORTED);
318		break;
319
320	case PRU_SENSE:
321		((struct stat *) m)->st_blksize = so->so_snd.sb_hiwat;
322		(void) splx(s);
323		return (0);
324
325	case PRU_RCVOOB:
326		if ((so->so_oobmark == 0 &&
327		    (so->so_state & SS_RCVATMARK) == 0) ||
328		    so->so_options & SO_OOBINLINE ||
329		    tp->t_oobflags & TCPOOB_HADDATA) {
330			error = EINVAL;
331			break;
332		}
333		if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) {
334			error = EWOULDBLOCK;
335			break;
336		}
337		m->m_len = 1;
338		*mtod(m, caddr_t) = tp->t_iobc;
339		if (((int)nam & MSG_PEEK) == 0)
340			tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA);
341		break;
342
343	case PRU_SENDOOB:
344		if (sbspace(&so->so_snd) < -512) {
345			m_freem(m);
346			error = ENOBUFS;
347			break;
348		}
349		/*
350		 * According to RFC961 (Assigned Protocols),
351		 * the urgent pointer points to the last octet
352		 * of urgent data.  We continue, however,
353		 * to consider it to indicate the first octet
354		 * of data past the urgent section.
355		 * Otherwise, snd_up should be one lower.
356		 */
357		sbappend(&so->so_snd, m);
358		tp->snd_up = tp->snd_una + so->so_snd.sb_cc;
359		tp->t_force = 1;
360		error = tcp_output(tp);
361		tp->t_force = 0;
362		break;
363
364	case PRU_SOCKADDR:
365		in_setsockaddr(inp, nam);
366		break;
367
368	case PRU_PEERADDR:
369		in_setpeeraddr(inp, nam);
370		break;
371
372	/*
373	 * TCP slow timer went off; going through this
374	 * routine for tracing's sake.
375	 */
376	case PRU_SLOWTIMO:
377		tp = tcp_timers(tp, (int)nam);
378#ifdef TCPDEBUG
379		req |= (int)nam << 8;		/* for debug's sake */
380#endif
381		break;
382
383	default:
384		panic("tcp_usrreq");
385	}
386#ifdef TCPDEBUG
387	if (tp && (so->so_options & SO_DEBUG))
388		tcp_trace(TA_USER, ostate, tp, (struct tcpiphdr *)0, req);
389#endif
390	splx(s);
391	return (error);
392}
393
394/*
395 * Common subroutine to open a TCP connection to remote host specified
396 * by struct sockaddr_in in mbuf *nam.  Call in_pcbbind to assign a local
397 * port number if needed.  Call in_pcbladdr to do the routing and to choose
398 * a local host address (interface).  If there is an existing incarnation
399 * of the same connection in TIME-WAIT state and if the remote host was
400 * sending CC options and if the connection duration was < MSL, then
401 * truncate the previous TIME-WAIT state and proceed.
402 * Initialize connection parameters and enter SYN-SENT state.
403 */
404static int
405tcp_connect(tp, nam)
406	register struct tcpcb *tp;
407	struct mbuf *nam;
408{
409	struct inpcb *inp = tp->t_inpcb, *oinp;
410	struct socket *so = inp->inp_socket;
411	struct tcpcb *otp;
412	struct sockaddr_in *sin = mtod(nam, struct sockaddr_in *);
413	struct sockaddr_in *ifaddr;
414	int error;
415	struct rmxp_tao *taop;
416	struct rmxp_tao tao_noncached;
417
418	if (inp->inp_lport == 0) {
419		error = in_pcbbind(inp, NULL);
420		if (error)
421			return error;
422	}
423
424	/*
425	 * Cannot simply call in_pcbconnect, because there might be an
426	 * earlier incarnation of this same connection still in
427	 * TIME_WAIT state, creating an ADDRINUSE error.
428	 */
429	error = in_pcbladdr(inp, nam, &ifaddr);
430	if (error)
431		return error;
432	oinp = in_pcblookup(inp->inp_pcbinfo->listhead,
433	    sin->sin_addr, sin->sin_port,
434	    inp->inp_laddr.s_addr != INADDR_ANY ? inp->inp_laddr
435						: ifaddr->sin_addr,
436	    inp->inp_lport,  0);
437	if (oinp) {
438		if (oinp != inp && (otp = intotcpcb(oinp)) != NULL &&
439		otp->t_state == TCPS_TIME_WAIT &&
440		    otp->t_duration < TCPTV_MSL &&
441		    (otp->t_flags & TF_RCVD_CC))
442			otp = tcp_close(otp);
443		else
444			return EADDRINUSE;
445	}
446	if (inp->inp_laddr.s_addr == INADDR_ANY)
447		inp->inp_laddr = ifaddr->sin_addr;
448	inp->inp_faddr = sin->sin_addr;
449	inp->inp_fport = sin->sin_port;
450	in_pcbrehash(inp);
451
452	tp->t_template = tcp_template(tp);
453	if (tp->t_template == 0) {
454		in_pcbdisconnect(inp);
455		return ENOBUFS;
456	}
457
458	/* Compute window scaling to request.  */
459	while (tp->request_r_scale < TCP_MAX_WINSHIFT &&
460	    (TCP_MAXWIN << tp->request_r_scale) < so->so_rcv.sb_hiwat)
461		tp->request_r_scale++;
462
463	soisconnecting(so);
464	tcpstat.tcps_connattempt++;
465	tp->t_state = TCPS_SYN_SENT;
466	tp->t_timer[TCPT_KEEP] = TCPTV_KEEP_INIT;
467	tp->iss = tcp_iss; tcp_iss += TCP_ISSINCR/2;
468	tcp_sendseqinit(tp);
469
470	/*
471	 * Generate a CC value for this connection and
472	 * check whether CC or CCnew should be used.
473	 */
474	if ((taop = tcp_gettaocache(tp->t_inpcb)) == NULL) {
475		taop = &tao_noncached;
476		bzero(taop, sizeof(*taop));
477	}
478
479	tp->cc_send = CC_INC(tcp_ccgen);
480	if (taop->tao_ccsent != 0 &&
481	    CC_GEQ(tp->cc_send, taop->tao_ccsent)) {
482		taop->tao_ccsent = tp->cc_send;
483	} else {
484		taop->tao_ccsent = 0;
485		tp->t_flags |= TF_SENDCCNEW;
486	}
487
488	return 0;
489}
490
491int
492tcp_ctloutput(op, so, level, optname, mp)
493	int op;
494	struct socket *so;
495	int level, optname;
496	struct mbuf **mp;
497{
498	int error = 0, s;
499	struct inpcb *inp;
500	register struct tcpcb *tp;
501	register struct mbuf *m;
502	register int i;
503
504	s = splnet();
505	inp = sotoinpcb(so);
506	if (inp == NULL) {
507		splx(s);
508		if (op == PRCO_SETOPT && *mp)
509			(void) m_free(*mp);
510		return (ECONNRESET);
511	}
512	if (level != IPPROTO_TCP) {
513		error = ip_ctloutput(op, so, level, optname, mp);
514		splx(s);
515		return (error);
516	}
517	tp = intotcpcb(inp);
518
519	switch (op) {
520
521	case PRCO_SETOPT:
522		m = *mp;
523		switch (optname) {
524
525		case TCP_NODELAY:
526			if (m == NULL || m->m_len < sizeof (int))
527				error = EINVAL;
528			else if (*mtod(m, int *))
529				tp->t_flags |= TF_NODELAY;
530			else
531				tp->t_flags &= ~TF_NODELAY;
532			break;
533
534		case TCP_MAXSEG:
535			if (m && (i = *mtod(m, int *)) > 0 && i <= tp->t_maxseg)
536				tp->t_maxseg = i;
537			else
538				error = EINVAL;
539			break;
540
541		case TCP_NOOPT:
542			if (m == NULL || m->m_len < sizeof (int))
543				error = EINVAL;
544			else if (*mtod(m, int *))
545				tp->t_flags |= TF_NOOPT;
546			else
547				tp->t_flags &= ~TF_NOOPT;
548			break;
549
550		case TCP_NOPUSH:
551			if (m == NULL || m->m_len < sizeof (int))
552				error = EINVAL;
553			else if (*mtod(m, int *))
554				tp->t_flags |= TF_NOPUSH;
555			else
556				tp->t_flags &= ~TF_NOPUSH;
557			break;
558
559		default:
560			error = ENOPROTOOPT;
561			break;
562		}
563		if (m)
564			(void) m_free(m);
565		break;
566
567	case PRCO_GETOPT:
568		*mp = m = m_get(M_WAIT, MT_SOOPTS);
569		m->m_len = sizeof(int);
570
571		switch (optname) {
572		case TCP_NODELAY:
573			*mtod(m, int *) = tp->t_flags & TF_NODELAY;
574			break;
575		case TCP_MAXSEG:
576			*mtod(m, int *) = tp->t_maxseg;
577			break;
578		case TCP_NOOPT:
579			*mtod(m, int *) = tp->t_flags & TF_NOOPT;
580			break;
581		case TCP_NOPUSH:
582			*mtod(m, int *) = tp->t_flags & TF_NOPUSH;
583			break;
584		default:
585			error = ENOPROTOOPT;
586			break;
587		}
588		break;
589	}
590	splx(s);
591	return (error);
592}
593
594/*
595 * tcp_sendspace and tcp_recvspace are the default send and receive window
596 * sizes, respectively.  These are obsolescent (this information should
597 * be set by the route).
598 */
599u_long	tcp_sendspace = 1024*16;
600SYSCTL_INT(_net_inet_tcp, TCPCTL_SENDSPACE, sendspace,
601	CTLFLAG_RW, &tcp_sendspace , 0, "");
602u_long	tcp_recvspace = 1024*16;
603SYSCTL_INT(_net_inet_tcp, TCPCTL_RECVSPACE, recvspace,
604	CTLFLAG_RW, &tcp_recvspace , 0, "");
605
606/*
607 * Attach TCP protocol to socket, allocating
608 * internet protocol control block, tcp control block,
609 * bufer space, and entering LISTEN state if to accept connections.
610 */
611static int
612tcp_attach(so)
613	struct socket *so;
614{
615	register struct tcpcb *tp;
616	struct inpcb *inp;
617	int error;
618
619	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
620		error = soreserve(so, tcp_sendspace, tcp_recvspace);
621		if (error)
622			return (error);
623	}
624	error = in_pcballoc(so, &tcbinfo);
625	if (error)
626		return (error);
627	inp = sotoinpcb(so);
628	tp = tcp_newtcpcb(inp);
629	if (tp == 0) {
630		int nofd = so->so_state & SS_NOFDREF;	/* XXX */
631
632		so->so_state &= ~SS_NOFDREF;	/* don't free the socket yet */
633		in_pcbdetach(inp);
634		so->so_state |= nofd;
635		return (ENOBUFS);
636	}
637	tp->t_state = TCPS_CLOSED;
638	return (0);
639}
640
641/*
642 * Initiate (or continue) disconnect.
643 * If embryonic state, just send reset (once).
644 * If in ``let data drain'' option and linger null, just drop.
645 * Otherwise (hard), mark socket disconnecting and drop
646 * current input data; switch states based on user close, and
647 * send segment to peer (with FIN).
648 */
649static struct tcpcb *
650tcp_disconnect(tp)
651	register struct tcpcb *tp;
652{
653	struct socket *so = tp->t_inpcb->inp_socket;
654
655	if (tp->t_state < TCPS_ESTABLISHED)
656		tp = tcp_close(tp);
657	else if ((so->so_options & SO_LINGER) && so->so_linger == 0)
658		tp = tcp_drop(tp, 0);
659	else {
660		soisdisconnecting(so);
661		sbflush(&so->so_rcv);
662		tp = tcp_usrclosed(tp);
663		if (tp)
664			(void) tcp_output(tp);
665	}
666	return (tp);
667}
668
669/*
670 * User issued close, and wish to trail through shutdown states:
671 * if never received SYN, just forget it.  If got a SYN from peer,
672 * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN.
673 * If already got a FIN from peer, then almost done; go to LAST_ACK
674 * state.  In all other cases, have already sent FIN to peer (e.g.
675 * after PRU_SHUTDOWN), and just have to play tedious game waiting
676 * for peer to send FIN or not respond to keep-alives, etc.
677 * We can let the user exit from the close as soon as the FIN is acked.
678 */
679static struct tcpcb *
680tcp_usrclosed(tp)
681	register struct tcpcb *tp;
682{
683
684	switch (tp->t_state) {
685
686	case TCPS_CLOSED:
687	case TCPS_LISTEN:
688		tp->t_state = TCPS_CLOSED;
689		tp = tcp_close(tp);
690		break;
691
692	case TCPS_SYN_SENT:
693	case TCPS_SYN_RECEIVED:
694		tp->t_flags |= TF_NEEDFIN;
695		break;
696
697	case TCPS_ESTABLISHED:
698		tp->t_state = TCPS_FIN_WAIT_1;
699		break;
700
701	case TCPS_CLOSE_WAIT:
702		tp->t_state = TCPS_LAST_ACK;
703		break;
704	}
705	if (tp && tp->t_state >= TCPS_FIN_WAIT_2) {
706		soisdisconnected(tp->t_inpcb->inp_socket);
707		/* To prevent the connection hanging in FIN_WAIT_2 forever. */
708		if (tp->t_state == TCPS_FIN_WAIT_2)
709			tp->t_timer[TCPT_2MSL] = tcp_maxidle;
710	}
711	return (tp);
712}
713
714