tcp_usrreq.c revision 8876
1/*
2 * Copyright (c) 1982, 1986, 1988, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 * 3. All advertising materials mentioning features or use of this software
14 *    must display the following acknowledgement:
15 *	This product includes software developed by the University of
16 *	California, Berkeley and its contributors.
17 * 4. Neither the name of the University nor the names of its contributors
18 *    may be used to endorse or promote products derived from this software
19 *    without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31 * SUCH DAMAGE.
32 *
33 *	From: @(#)tcp_usrreq.c	8.2 (Berkeley) 1/3/94
34 *	$Id: tcp_usrreq.c,v 1.13 1995/04/09 01:29:28 davidg Exp $
35 */
36
37#include <sys/param.h>
38#include <sys/systm.h>
39#include <sys/kernel.h>
40#include <sys/malloc.h>
41#include <sys/mbuf.h>
42#include <sys/socket.h>
43#include <sys/socketvar.h>
44#include <sys/protosw.h>
45#include <sys/errno.h>
46#include <sys/stat.h>
47#include <vm/vm.h>
48#include <sys/sysctl.h>
49
50#include <net/if.h>
51#include <net/route.h>
52
53#include <netinet/in.h>
54#include <netinet/in_systm.h>
55#include <netinet/ip.h>
56#include <netinet/in_pcb.h>
57#include <netinet/in_var.h>
58#include <netinet/ip_var.h>
59#include <netinet/tcp.h>
60#include <netinet/tcp_fsm.h>
61#include <netinet/tcp_seq.h>
62#include <netinet/tcp_timer.h>
63#include <netinet/tcp_var.h>
64#include <netinet/tcpip.h>
65#ifdef TCPDEBUG
66#include <netinet/tcp_debug.h>
67#endif
68
69/*
70 * TCP protocol interface to socket abstraction.
71 */
72extern	char *tcpstates[];
73
74/*
75 * Process a TCP user request for TCP tb.  If this is a send request
76 * then m is the mbuf chain of send data.  If this is a timer expiration
77 * (called from the software clock routine), then timertype tells which timer.
78 */
79/*ARGSUSED*/
80int
81tcp_usrreq(so, req, m, nam, control)
82	struct socket *so;
83	int req;
84	struct mbuf *m, *nam, *control;
85{
86	register struct inpcb *inp;
87	register struct tcpcb *tp = 0;
88	struct sockaddr_in *sinp;
89	int s;
90	int error = 0;
91#ifdef TCPDEBUG
92	int ostate;
93#endif
94
95	if (req == PRU_CONTROL)
96		return (in_control(so, (int)m, (caddr_t)nam,
97			(struct ifnet *)control));
98	if (control && control->m_len) {
99		m_freem(control);
100		if (m)
101			m_freem(m);
102		return (EINVAL);
103	}
104
105	s = splnet();
106	inp = sotoinpcb(so);
107	/*
108	 * When a TCP is attached to a socket, then there will be
109	 * a (struct inpcb) pointed at by the socket, and this
110	 * structure will point at a subsidary (struct tcpcb).
111	 */
112	if (inp == 0 && req != PRU_ATTACH) {
113		splx(s);
114		return (EINVAL);		/* XXX */
115	}
116	if (inp) {
117		tp = intotcpcb(inp);
118		/* WHAT IF TP IS 0? */
119#ifdef KPROF
120		tcp_acounts[tp->t_state][req]++;
121#endif
122#ifdef TCPDEBUG
123		ostate = tp->t_state;
124	} else
125		ostate = 0;
126#else /* TCPDEBUG */
127	}
128#endif /* TCPDEBUG */
129
130	switch (req) {
131
132	/*
133	 * TCP attaches to socket via PRU_ATTACH, reserving space,
134	 * and an internet control block.
135	 */
136	case PRU_ATTACH:
137		if (inp) {
138			error = EISCONN;
139			break;
140		}
141		error = tcp_attach(so);
142		if (error)
143			break;
144		if ((so->so_options & SO_LINGER) && so->so_linger == 0)
145			so->so_linger = TCP_LINGERTIME * hz;
146		tp = sototcpcb(so);
147		break;
148
149	/*
150	 * PRU_DETACH detaches the TCP protocol from the socket.
151	 * If the protocol state is non-embryonic, then can't
152	 * do this directly: have to initiate a PRU_DISCONNECT,
153	 * which may finish later; embryonic TCB's can just
154	 * be discarded here.
155	 */
156	case PRU_DETACH:
157		if (tp->t_state > TCPS_LISTEN)
158			tp = tcp_disconnect(tp);
159		else
160			tp = tcp_close(tp);
161		break;
162
163	/*
164	 * Give the socket an address.
165	 */
166	case PRU_BIND:
167		/*
168		 * Must check for multicast addresses and disallow binding
169		 * to them.
170		 */
171		sinp = mtod(nam, struct sockaddr_in *);
172		if (sinp->sin_family == AF_INET &&
173		    IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) {
174			error = EAFNOSUPPORT;
175			break;
176		}
177		error = in_pcbbind(inp, nam);
178		if (error)
179			break;
180		break;
181
182	/*
183	 * Prepare to accept connections.
184	 */
185	case PRU_LISTEN:
186		if (inp->inp_lport == 0)
187			error = in_pcbbind(inp, NULL);
188		if (error == 0)
189			tp->t_state = TCPS_LISTEN;
190		break;
191
192	/*
193	 * Initiate connection to peer.
194	 * Create a template for use in transmissions on this connection.
195	 * Enter SYN_SENT state, and mark socket as connecting.
196	 * Start keep-alive timer, and seed output sequence space.
197	 * Send initial segment on connection.
198	 */
199	case PRU_CONNECT:
200		/*
201		 * Must disallow TCP ``connections'' to multicast addresses.
202		 */
203		sinp = mtod(nam, struct sockaddr_in *);
204		if (sinp->sin_family == AF_INET
205		    && IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) {
206			error = EAFNOSUPPORT;
207			break;
208		}
209
210		if ((error = tcp_connect(tp, nam)) != 0)
211			break;
212		error = tcp_output(tp);
213		break;
214
215	/*
216	 * Create a TCP connection between two sockets.
217	 */
218	case PRU_CONNECT2:
219		error = EOPNOTSUPP;
220		break;
221
222	/*
223	 * Initiate disconnect from peer.
224	 * If connection never passed embryonic stage, just drop;
225	 * else if don't need to let data drain, then can just drop anyways,
226	 * else have to begin TCP shutdown process: mark socket disconnecting,
227	 * drain unread data, state switch to reflect user close, and
228	 * send segment (e.g. FIN) to peer.  Socket will be really disconnected
229	 * when peer sends FIN and acks ours.
230	 *
231	 * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB.
232	 */
233	case PRU_DISCONNECT:
234		tp = tcp_disconnect(tp);
235		break;
236
237	/*
238	 * Accept a connection.  Essentially all the work is
239	 * done at higher levels; just return the address
240	 * of the peer, storing through addr.
241	 */
242	case PRU_ACCEPT:
243		in_setpeeraddr(inp, nam);
244		break;
245
246	/*
247	 * Mark the connection as being incapable of further output.
248	 */
249	case PRU_SHUTDOWN:
250		socantsendmore(so);
251		tp = tcp_usrclosed(tp);
252		if (tp)
253			error = tcp_output(tp);
254		break;
255
256	/*
257	 * After a receive, possibly send window update to peer.
258	 */
259	case PRU_RCVD:
260		(void) tcp_output(tp);
261		break;
262
263	/*
264	 * Do a send by putting data in output queue and updating urgent
265	 * marker if URG set.  Possibly send more data.
266	 */
267	case PRU_SEND_EOF:
268	case PRU_SEND:
269		sbappend(&so->so_snd, m);
270		if (nam && tp->t_state < TCPS_SYN_SENT) {
271			/*
272			 * Do implied connect if not yet connected,
273			 * initialize window to default value, and
274			 * initialize maxseg/maxopd using peer's cached
275			 * MSS.
276			 */
277			error = tcp_connect(tp, nam);
278			if (error)
279				break;
280			tp->snd_wnd = TTCP_CLIENT_SND_WND;
281			tcp_mss(tp, -1);
282		}
283
284		if (req == PRU_SEND_EOF) {
285			/*
286			 * Close the send side of the connection after
287			 * the data is sent.
288			 */
289			socantsendmore(so);
290			tp = tcp_usrclosed(tp);
291		}
292		if (tp != NULL)
293			error = tcp_output(tp);
294		break;
295
296	/*
297	 * Abort the TCP.
298	 */
299	case PRU_ABORT:
300		tp = tcp_drop(tp, ECONNABORTED);
301		break;
302
303	case PRU_SENSE:
304		((struct stat *) m)->st_blksize = so->so_snd.sb_hiwat;
305		(void) splx(s);
306		return (0);
307
308	case PRU_RCVOOB:
309		if ((so->so_oobmark == 0 &&
310		    (so->so_state & SS_RCVATMARK) == 0) ||
311		    so->so_options & SO_OOBINLINE ||
312		    tp->t_oobflags & TCPOOB_HADDATA) {
313			error = EINVAL;
314			break;
315		}
316		if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) {
317			error = EWOULDBLOCK;
318			break;
319		}
320		m->m_len = 1;
321		*mtod(m, caddr_t) = tp->t_iobc;
322		if (((int)nam & MSG_PEEK) == 0)
323			tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA);
324		break;
325
326	case PRU_SENDOOB:
327		if (sbspace(&so->so_snd) < -512) {
328			m_freem(m);
329			error = ENOBUFS;
330			break;
331		}
332		/*
333		 * According to RFC961 (Assigned Protocols),
334		 * the urgent pointer points to the last octet
335		 * of urgent data.  We continue, however,
336		 * to consider it to indicate the first octet
337		 * of data past the urgent section.
338		 * Otherwise, snd_up should be one lower.
339		 */
340		sbappend(&so->so_snd, m);
341		tp->snd_up = tp->snd_una + so->so_snd.sb_cc;
342		tp->t_force = 1;
343		error = tcp_output(tp);
344		tp->t_force = 0;
345		break;
346
347	case PRU_SOCKADDR:
348		in_setsockaddr(inp, nam);
349		break;
350
351	case PRU_PEERADDR:
352		in_setpeeraddr(inp, nam);
353		break;
354
355	/*
356	 * TCP slow timer went off; going through this
357	 * routine for tracing's sake.
358	 */
359	case PRU_SLOWTIMO:
360		tp = tcp_timers(tp, (int)nam);
361#ifdef TCPDEBUG
362		req |= (int)nam << 8;		/* for debug's sake */
363#endif
364		break;
365
366	default:
367		panic("tcp_usrreq");
368	}
369#ifdef TCPDEBUG
370	if (tp && (so->so_options & SO_DEBUG))
371		tcp_trace(TA_USER, ostate, tp, (struct tcpiphdr *)0, req);
372#endif
373	splx(s);
374	return (error);
375}
376
377/*
378 * Common subroutine to open a TCP connection to remote host specified
379 * by struct sockaddr_in in mbuf *nam.  Call in_pcbbind to assign a local
380 * port number if needed.  Call in_pcbladdr to do the routing and to choose
381 * a local host address (interface).  If there is an existing incarnation
382 * of the same connection in TIME-WAIT state and if the remote host was
383 * sending CC options and if the connection duration was < MSL, then
384 * truncate the previous TIME-WAIT state and proceed.
385 * Initialize connection parameters and enter SYN-SENT state.
386 */
387int
388tcp_connect(tp, nam)
389	register struct tcpcb *tp;
390	struct mbuf *nam;
391{
392	struct inpcb *inp = tp->t_inpcb, *oinp;
393	struct socket *so = inp->inp_socket;
394	struct tcpcb *otp;
395	struct sockaddr_in *sin = mtod(nam, struct sockaddr_in *);
396	struct sockaddr_in *ifaddr;
397	int error;
398
399	if (inp->inp_lport == 0) {
400		error = in_pcbbind(inp, NULL);
401		if (error)
402			return error;
403	}
404
405	/*
406	 * Cannot simply call in_pcbconnect, because there might be an
407	 * earlier incarnation of this same connection still in
408	 * TIME_WAIT state, creating an ADDRINUSE error.
409	 */
410	error = in_pcbladdr(inp, nam, &ifaddr);
411	oinp = in_pcblookup(inp->inp_pcbinfo->listhead,
412	    sin->sin_addr, sin->sin_port,
413	    inp->inp_laddr.s_addr != INADDR_ANY ? inp->inp_laddr
414						: ifaddr->sin_addr,
415	    inp->inp_lport,  0);
416	if (oinp) {
417		if (oinp != inp && (otp = intotcpcb(oinp)) != NULL &&
418		otp->t_state == TCPS_TIME_WAIT &&
419		    otp->t_duration < TCPTV_MSL &&
420		    (otp->t_flags & TF_RCVD_CC))
421			otp = tcp_close(otp);
422		else
423			return EADDRINUSE;
424	}
425	if (inp->inp_laddr.s_addr == INADDR_ANY)
426		inp->inp_laddr = ifaddr->sin_addr;
427	inp->inp_faddr = sin->sin_addr;
428	inp->inp_fport = sin->sin_port;
429	in_pcbrehash(inp);
430
431	tp->t_template = tcp_template(tp);
432	if (tp->t_template == 0) {
433		in_pcbdisconnect(inp);
434		return ENOBUFS;
435	}
436
437	/* Compute window scaling to request.  */
438	while (tp->request_r_scale < TCP_MAX_WINSHIFT &&
439	    (TCP_MAXWIN << tp->request_r_scale) < so->so_rcv.sb_hiwat)
440		tp->request_r_scale++;
441
442	soisconnecting(so);
443	tcpstat.tcps_connattempt++;
444	tp->t_state = TCPS_SYN_SENT;
445	tp->t_timer[TCPT_KEEP] = TCPTV_KEEP_INIT;
446	tp->iss = tcp_iss; tcp_iss += TCP_ISSINCR/2;
447	tcp_sendseqinit(tp);
448	tp->cc_send = CC_INC(tcp_ccgen);
449
450	return 0;
451}
452
453int
454tcp_ctloutput(op, so, level, optname, mp)
455	int op;
456	struct socket *so;
457	int level, optname;
458	struct mbuf **mp;
459{
460	int error = 0, s;
461	struct inpcb *inp;
462	register struct tcpcb *tp;
463	register struct mbuf *m;
464	register int i;
465
466	s = splnet();
467	inp = sotoinpcb(so);
468	if (inp == NULL) {
469		splx(s);
470		if (op == PRCO_SETOPT && *mp)
471			(void) m_free(*mp);
472		return (ECONNRESET);
473	}
474	if (level != IPPROTO_TCP) {
475		error = ip_ctloutput(op, so, level, optname, mp);
476		splx(s);
477		return (error);
478	}
479	tp = intotcpcb(inp);
480
481	switch (op) {
482
483	case PRCO_SETOPT:
484		m = *mp;
485		switch (optname) {
486
487		case TCP_NODELAY:
488			if (m == NULL || m->m_len < sizeof (int))
489				error = EINVAL;
490			else if (*mtod(m, int *))
491				tp->t_flags |= TF_NODELAY;
492			else
493				tp->t_flags &= ~TF_NODELAY;
494			break;
495
496		case TCP_MAXSEG:
497			if (m && (i = *mtod(m, int *)) > 0 && i <= tp->t_maxseg)
498				tp->t_maxseg = i;
499			else
500				error = EINVAL;
501			break;
502
503		case TCP_NOOPT:
504			if (m == NULL || m->m_len < sizeof (int))
505				error = EINVAL;
506			else if (*mtod(m, int *))
507				tp->t_flags |= TF_NOOPT;
508			else
509				tp->t_flags &= ~TF_NOOPT;
510			break;
511
512		case TCP_NOPUSH:
513			if (m == NULL || m->m_len < sizeof (int))
514				error = EINVAL;
515			else if (*mtod(m, int *))
516				tp->t_flags |= TF_NOPUSH;
517			else
518				tp->t_flags &= ~TF_NOPUSH;
519			break;
520
521		default:
522			error = ENOPROTOOPT;
523			break;
524		}
525		if (m)
526			(void) m_free(m);
527		break;
528
529	case PRCO_GETOPT:
530		*mp = m = m_get(M_WAIT, MT_SOOPTS);
531		m->m_len = sizeof(int);
532
533		switch (optname) {
534		case TCP_NODELAY:
535			*mtod(m, int *) = tp->t_flags & TF_NODELAY;
536			break;
537		case TCP_MAXSEG:
538			*mtod(m, int *) = tp->t_maxseg;
539			break;
540		case TCP_NOOPT:
541			*mtod(m, int *) = tp->t_flags & TF_NOOPT;
542			break;
543		case TCP_NOPUSH:
544			*mtod(m, int *) = tp->t_flags & TF_NOPUSH;
545			break;
546		default:
547			error = ENOPROTOOPT;
548			break;
549		}
550		break;
551	}
552	splx(s);
553	return (error);
554}
555
556/*
557 * tcp_sendspace and tcp_recvspace are the default send and receive window
558 * sizes, respectively.  These are obsolescent (this information should
559 * be set by the route).
560 */
561u_long	tcp_sendspace = 1024*16;
562u_long	tcp_recvspace = 1024*16;
563
564/*
565 * Attach TCP protocol to socket, allocating
566 * internet protocol control block, tcp control block,
567 * bufer space, and entering LISTEN state if to accept connections.
568 */
569int
570tcp_attach(so)
571	struct socket *so;
572{
573	register struct tcpcb *tp;
574	struct inpcb *inp;
575	int error;
576
577	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
578		error = soreserve(so, tcp_sendspace, tcp_recvspace);
579		if (error)
580			return (error);
581	}
582	error = in_pcballoc(so, &tcbinfo);
583	if (error)
584		return (error);
585	inp = sotoinpcb(so);
586	tp = tcp_newtcpcb(inp);
587	if (tp == 0) {
588		int nofd = so->so_state & SS_NOFDREF;	/* XXX */
589
590		so->so_state &= ~SS_NOFDREF;	/* don't free the socket yet */
591		in_pcbdetach(inp);
592		so->so_state |= nofd;
593		return (ENOBUFS);
594	}
595	tp->t_state = TCPS_CLOSED;
596	return (0);
597}
598
599/*
600 * Initiate (or continue) disconnect.
601 * If embryonic state, just send reset (once).
602 * If in ``let data drain'' option and linger null, just drop.
603 * Otherwise (hard), mark socket disconnecting and drop
604 * current input data; switch states based on user close, and
605 * send segment to peer (with FIN).
606 */
607struct tcpcb *
608tcp_disconnect(tp)
609	register struct tcpcb *tp;
610{
611	struct socket *so = tp->t_inpcb->inp_socket;
612
613	if (tp->t_state < TCPS_ESTABLISHED)
614		tp = tcp_close(tp);
615	else if ((so->so_options & SO_LINGER) && so->so_linger == 0)
616		tp = tcp_drop(tp, 0);
617	else {
618		soisdisconnecting(so);
619		sbflush(&so->so_rcv);
620		tp = tcp_usrclosed(tp);
621		if (tp)
622			(void) tcp_output(tp);
623	}
624	return (tp);
625}
626
627/*
628 * User issued close, and wish to trail through shutdown states:
629 * if never received SYN, just forget it.  If got a SYN from peer,
630 * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN.
631 * If already got a FIN from peer, then almost done; go to LAST_ACK
632 * state.  In all other cases, have already sent FIN to peer (e.g.
633 * after PRU_SHUTDOWN), and just have to play tedious game waiting
634 * for peer to send FIN or not respond to keep-alives, etc.
635 * We can let the user exit from the close as soon as the FIN is acked.
636 */
637struct tcpcb *
638tcp_usrclosed(tp)
639	register struct tcpcb *tp;
640{
641
642	switch (tp->t_state) {
643
644	case TCPS_CLOSED:
645	case TCPS_LISTEN:
646		tp->t_state = TCPS_CLOSED;
647		tp = tcp_close(tp);
648		break;
649
650	case TCPS_SYN_SENT:
651	case TCPS_SYN_RECEIVED:
652		tp->t_flags |= TF_NEEDFIN;
653		break;
654
655	case TCPS_ESTABLISHED:
656		tp->t_state = TCPS_FIN_WAIT_1;
657		break;
658
659	case TCPS_CLOSE_WAIT:
660		tp->t_state = TCPS_LAST_ACK;
661		break;
662	}
663	if (tp && tp->t_state >= TCPS_FIN_WAIT_2)
664		soisdisconnected(tp->t_inpcb->inp_socket);
665	return (tp);
666}
667
668/*
669 * Sysctl for tcp variables.
670 */
671int
672tcp_sysctl(name, namelen, oldp, oldlenp, newp, newlen)
673	int *name;
674	u_int namelen;
675	void *oldp;
676	size_t *oldlenp;
677	void *newp;
678	size_t newlen;
679{
680	/* All sysctl names at this level are terminal. */
681	if (namelen != 1)
682		return (ENOTDIR);
683
684	switch (name[0]) {
685	case TCPCTL_DO_RFC1323:
686		return (sysctl_int(oldp, oldlenp, newp, newlen,
687		    &tcp_do_rfc1323));
688	case TCPCTL_DO_RFC1644:
689		return (sysctl_int(oldp, oldlenp, newp, newlen,
690		    &tcp_do_rfc1644));
691	case TCPCTL_MSSDFLT:
692		return (sysctl_int(oldp, oldlenp, newp, newlen,
693		    &tcp_mssdflt));
694	case TCPCTL_STATS:
695		return (sysctl_rdstruct(oldp, oldlenp, newp, &tcpstat,
696					sizeof tcpstat));
697	case TCPCTL_RTTDFLT:
698		return (sysctl_int(oldp, oldlenp, newp, newlen, &tcp_rttdflt));
699	case TCPCTL_KEEPIDLE:
700		return (sysctl_int(oldp, oldlenp, newp, newlen,
701				   &tcp_keepidle));
702	case TCPCTL_KEEPINTVL:
703		return (sysctl_int(oldp, oldlenp, newp, newlen,
704				   &tcp_keepintvl));
705	case TCPCTL_SENDSPACE:
706		return (sysctl_int(oldp, oldlenp, newp, newlen,
707				   (int *)&tcp_sendspace)); /* XXX */
708	case TCPCTL_RECVSPACE:
709		return (sysctl_int(oldp, oldlenp, newp, newlen,
710				   (int *)&tcp_recvspace)); /* XXX */
711	default:
712		return (ENOPROTOOPT);
713	}
714	/* NOTREACHED */
715}
716