tcp_usrreq.c revision 7090
1/*
2 * Copyright (c) 1982, 1986, 1988, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 * 3. All advertising materials mentioning features or use of this software
14 *    must display the following acknowledgement:
15 *	This product includes software developed by the University of
16 *	California, Berkeley and its contributors.
17 * 4. Neither the name of the University nor the names of its contributors
18 *    may be used to endorse or promote products derived from this software
19 *    without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31 * SUCH DAMAGE.
32 *
33 *	From: @(#)tcp_usrreq.c	8.2 (Berkeley) 1/3/94
34 *	$Id: tcp_usrreq.c,v 1.11 1995/02/17 00:29:42 wollman Exp $
35 */
36
37#include <sys/param.h>
38#include <sys/systm.h>
39#include <sys/kernel.h>
40#include <sys/malloc.h>
41#include <sys/mbuf.h>
42#include <sys/socket.h>
43#include <sys/socketvar.h>
44#include <sys/protosw.h>
45#include <sys/errno.h>
46#include <sys/stat.h>
47#include <vm/vm.h>
48#include <sys/sysctl.h>
49
50#include <net/if.h>
51#include <net/route.h>
52
53#include <netinet/in.h>
54#include <netinet/in_systm.h>
55#include <netinet/ip.h>
56#include <netinet/in_pcb.h>
57#include <netinet/in_var.h>
58#include <netinet/ip_var.h>
59#include <netinet/tcp.h>
60#include <netinet/tcp_fsm.h>
61#include <netinet/tcp_seq.h>
62#include <netinet/tcp_timer.h>
63#include <netinet/tcp_var.h>
64#include <netinet/tcpip.h>
65#ifdef TCPDEBUG
66#include <netinet/tcp_debug.h>
67#endif
68
69/*
70 * TCP protocol interface to socket abstraction.
71 */
72extern	char *tcpstates[];
73
74/*
75 * Process a TCP user request for TCP tb.  If this is a send request
76 * then m is the mbuf chain of send data.  If this is a timer expiration
77 * (called from the software clock routine), then timertype tells which timer.
78 */
79/*ARGSUSED*/
80int
81tcp_usrreq(so, req, m, nam, control)
82	struct socket *so;
83	int req;
84	struct mbuf *m, *nam, *control;
85{
86	register struct inpcb *inp;
87	register struct tcpcb *tp = 0;
88	struct sockaddr_in *sinp;
89	int s;
90	int error = 0;
91#ifdef TCPDEBUG
92	int ostate;
93#endif
94
95	if (req == PRU_CONTROL)
96		return (in_control(so, (int)m, (caddr_t)nam,
97			(struct ifnet *)control));
98	if (control && control->m_len) {
99		m_freem(control);
100		if (m)
101			m_freem(m);
102		return (EINVAL);
103	}
104
105	s = splnet();
106	inp = sotoinpcb(so);
107	/*
108	 * When a TCP is attached to a socket, then there will be
109	 * a (struct inpcb) pointed at by the socket, and this
110	 * structure will point at a subsidary (struct tcpcb).
111	 */
112	if (inp == 0 && req != PRU_ATTACH) {
113		splx(s);
114		return (EINVAL);		/* XXX */
115	}
116	if (inp) {
117		tp = intotcpcb(inp);
118		/* WHAT IF TP IS 0? */
119#ifdef KPROF
120		tcp_acounts[tp->t_state][req]++;
121#endif
122#ifdef TCPDEBUG
123		ostate = tp->t_state;
124	} else
125		ostate = 0;
126#else /* TCPDEBUG */
127	}
128#endif /* TCPDEBUG */
129
130	switch (req) {
131
132	/*
133	 * TCP attaches to socket via PRU_ATTACH, reserving space,
134	 * and an internet control block.
135	 */
136	case PRU_ATTACH:
137		if (inp) {
138			error = EISCONN;
139			break;
140		}
141		error = tcp_attach(so);
142		if (error)
143			break;
144		if ((so->so_options & SO_LINGER) && so->so_linger == 0)
145			so->so_linger = TCP_LINGERTIME * hz;
146		tp = sototcpcb(so);
147		break;
148
149	/*
150	 * PRU_DETACH detaches the TCP protocol from the socket.
151	 * If the protocol state is non-embryonic, then can't
152	 * do this directly: have to initiate a PRU_DISCONNECT,
153	 * which may finish later; embryonic TCB's can just
154	 * be discarded here.
155	 */
156	case PRU_DETACH:
157		if (tp->t_state > TCPS_LISTEN)
158			tp = tcp_disconnect(tp);
159		else
160			tp = tcp_close(tp);
161		break;
162
163	/*
164	 * Give the socket an address.
165	 */
166	case PRU_BIND:
167		/*
168		 * Must check for multicast addresses and disallow binding
169		 * to them.
170		 */
171		sinp = mtod(nam, struct sockaddr_in *);
172		if (sinp->sin_family == AF_INET &&
173		    IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) {
174			error = EAFNOSUPPORT;
175			break;
176		}
177		error = in_pcbbind(inp, nam);
178		if (error)
179			break;
180		break;
181
182	/*
183	 * Prepare to accept connections.
184	 */
185	case PRU_LISTEN:
186		if (inp->inp_lport == 0)
187			error = in_pcbbind(inp, (struct mbuf *)0);
188		if (error == 0)
189			tp->t_state = TCPS_LISTEN;
190		break;
191
192	/*
193	 * Initiate connection to peer.
194	 * Create a template for use in transmissions on this connection.
195	 * Enter SYN_SENT state, and mark socket as connecting.
196	 * Start keep-alive timer, and seed output sequence space.
197	 * Send initial segment on connection.
198	 */
199	case PRU_CONNECT:
200		/*
201		 * Must disallow TCP ``connections'' to multicast addresses.
202		 */
203		sinp = mtod(nam, struct sockaddr_in *);
204		if (sinp->sin_family == AF_INET
205		    && IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) {
206			error = EAFNOSUPPORT;
207			break;
208		}
209
210		if ((error = tcp_connect(tp, nam)) != 0)
211			break;
212		error = tcp_output(tp);
213		break;
214
215	/*
216	 * Create a TCP connection between two sockets.
217	 */
218	case PRU_CONNECT2:
219		error = EOPNOTSUPP;
220		break;
221
222	/*
223	 * Initiate disconnect from peer.
224	 * If connection never passed embryonic stage, just drop;
225	 * else if don't need to let data drain, then can just drop anyways,
226	 * else have to begin TCP shutdown process: mark socket disconnecting,
227	 * drain unread data, state switch to reflect user close, and
228	 * send segment (e.g. FIN) to peer.  Socket will be really disconnected
229	 * when peer sends FIN and acks ours.
230	 *
231	 * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB.
232	 */
233	case PRU_DISCONNECT:
234		tp = tcp_disconnect(tp);
235		break;
236
237	/*
238	 * Accept a connection.  Essentially all the work is
239	 * done at higher levels; just return the address
240	 * of the peer, storing through addr.
241	 */
242	case PRU_ACCEPT:
243		in_setpeeraddr(inp, nam);
244		break;
245
246	/*
247	 * Mark the connection as being incapable of further output.
248	 */
249	case PRU_SHUTDOWN:
250		socantsendmore(so);
251		tp = tcp_usrclosed(tp);
252		if (tp)
253			error = tcp_output(tp);
254		break;
255
256	/*
257	 * After a receive, possibly send window update to peer.
258	 */
259	case PRU_RCVD:
260		(void) tcp_output(tp);
261		break;
262
263	/*
264	 * Do a send by putting data in output queue and updating urgent
265	 * marker if URG set.  Possibly send more data.
266	 */
267	case PRU_SEND_EOF:
268	case PRU_SEND:
269		sbappend(&so->so_snd, m);
270		if (nam && tp->t_state < TCPS_SYN_SENT) {
271			/*
272			 * Do implied connect if not yet connected,
273			 * initialize window to default value, and
274			 * initialize maxseg/maxopd using peer's cached
275			 * MSS.
276			 */
277			error = tcp_connect(tp, nam);
278			if (error)
279				break;
280			tp->snd_wnd = TTCP_CLIENT_SND_WND;
281			tcp_mss(tp, -1);
282		}
283
284		if (req == PRU_SEND_EOF) {
285			/*
286			 * Close the send side of the connection after
287			 * the data is sent.
288			 */
289			socantsendmore(so);
290			tp = tcp_usrclosed(tp);
291		}
292		if (tp != NULL)
293			error = tcp_output(tp);
294		break;
295
296	/*
297	 * Abort the TCP.
298	 */
299	case PRU_ABORT:
300		tp = tcp_drop(tp, ECONNABORTED);
301		break;
302
303	case PRU_SENSE:
304		((struct stat *) m)->st_blksize = so->so_snd.sb_hiwat;
305		(void) splx(s);
306		return (0);
307
308	case PRU_RCVOOB:
309		if ((so->so_oobmark == 0 &&
310		    (so->so_state & SS_RCVATMARK) == 0) ||
311		    so->so_options & SO_OOBINLINE ||
312		    tp->t_oobflags & TCPOOB_HADDATA) {
313			error = EINVAL;
314			break;
315		}
316		if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) {
317			error = EWOULDBLOCK;
318			break;
319		}
320		m->m_len = 1;
321		*mtod(m, caddr_t) = tp->t_iobc;
322		if (((int)nam & MSG_PEEK) == 0)
323			tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA);
324		break;
325
326	case PRU_SENDOOB:
327		if (sbspace(&so->so_snd) < -512) {
328			m_freem(m);
329			error = ENOBUFS;
330			break;
331		}
332		/*
333		 * According to RFC961 (Assigned Protocols),
334		 * the urgent pointer points to the last octet
335		 * of urgent data.  We continue, however,
336		 * to consider it to indicate the first octet
337		 * of data past the urgent section.
338		 * Otherwise, snd_up should be one lower.
339		 */
340		sbappend(&so->so_snd, m);
341		tp->snd_up = tp->snd_una + so->so_snd.sb_cc;
342		tp->t_force = 1;
343		error = tcp_output(tp);
344		tp->t_force = 0;
345		break;
346
347	case PRU_SOCKADDR:
348		in_setsockaddr(inp, nam);
349		break;
350
351	case PRU_PEERADDR:
352		in_setpeeraddr(inp, nam);
353		break;
354
355	/*
356	 * TCP slow timer went off; going through this
357	 * routine for tracing's sake.
358	 */
359	case PRU_SLOWTIMO:
360		tp = tcp_timers(tp, (int)nam);
361#ifdef TCPDEBUG
362		req |= (int)nam << 8;		/* for debug's sake */
363#endif
364		break;
365
366	default:
367		panic("tcp_usrreq");
368	}
369#ifdef TCPDEBUG
370	if (tp && (so->so_options & SO_DEBUG))
371		tcp_trace(TA_USER, ostate, tp, (struct tcpiphdr *)0, req);
372#endif
373	splx(s);
374	return (error);
375}
376
377/*
378 * Common subroutine to open a TCP connection to remote host specified
379 * by struct sockaddr_in in mbuf *nam.  Call in_pcbbind to assign a local
380 * port number if needed.  Call in_pcbladdr to do the routing and to choose
381 * a local host address (interface).  If there is an existing incarnation
382 * of the same connection in TIME-WAIT state and if the remote host was
383 * sending CC options and if the connection duration was < MSL, then
384 * truncate the previous TIME-WAIT state and proceed.
385 * Initialize connection parameters and enter SYN-SENT state.
386 */
387int
388tcp_connect(tp, nam)
389	register struct tcpcb *tp;
390	struct mbuf *nam;
391{
392	struct inpcb *inp = tp->t_inpcb, *oinp;
393	struct socket *so = inp->inp_socket;
394	struct tcpcb *otp;
395	struct sockaddr_in *sin = mtod(nam, struct sockaddr_in *);
396	struct sockaddr_in *ifaddr;
397	int error;
398
399	if (inp->inp_lport == 0) {
400		error = in_pcbbind(inp, NULL);
401		if (error)
402			return error;
403	}
404
405	/*
406	 * Cannot simply call in_pcbconnect, because there might be an
407	 * earlier incarnation of this same connection still in
408	 * TIME_WAIT state, creating an ADDRINUSE error.
409	 */
410	error = in_pcbladdr(inp, nam, &ifaddr);
411	oinp = in_pcblookup(inp->inp_head,
412	    sin->sin_addr, sin->sin_port,
413	    inp->inp_laddr.s_addr != INADDR_ANY ? inp->inp_laddr
414						: ifaddr->sin_addr,
415	    inp->inp_lport,  0);
416	if (oinp) {
417		if (oinp != inp && (otp = intotcpcb(oinp)) != NULL &&
418		otp->t_state == TCPS_TIME_WAIT &&
419		    otp->t_duration < TCPTV_MSL &&
420		    (otp->t_flags & TF_RCVD_CC))
421			otp = tcp_close(otp);
422		else
423			return EADDRINUSE;
424	}
425	if (inp->inp_laddr.s_addr == INADDR_ANY)
426		inp->inp_laddr = ifaddr->sin_addr;
427	inp->inp_faddr = sin->sin_addr;
428	inp->inp_fport = sin->sin_port;
429
430	tp->t_template = tcp_template(tp);
431	if (tp->t_template == 0) {
432		in_pcbdisconnect(inp);
433		return ENOBUFS;
434	}
435
436	/* Compute window scaling to request.  */
437	while (tp->request_r_scale < TCP_MAX_WINSHIFT &&
438	    (TCP_MAXWIN << tp->request_r_scale) < so->so_rcv.sb_hiwat)
439		tp->request_r_scale++;
440
441	soisconnecting(so);
442	tcpstat.tcps_connattempt++;
443	tp->t_state = TCPS_SYN_SENT;
444	tp->t_timer[TCPT_KEEP] = TCPTV_KEEP_INIT;
445	tp->iss = tcp_iss; tcp_iss += TCP_ISSINCR/2;
446	tcp_sendseqinit(tp);
447	tp->cc_send = CC_INC(tcp_ccgen);
448
449	return 0;
450}
451
452int
453tcp_ctloutput(op, so, level, optname, mp)
454	int op;
455	struct socket *so;
456	int level, optname;
457	struct mbuf **mp;
458{
459	int error = 0, s;
460	struct inpcb *inp;
461	register struct tcpcb *tp;
462	register struct mbuf *m;
463	register int i;
464
465	s = splnet();
466	inp = sotoinpcb(so);
467	if (inp == NULL) {
468		splx(s);
469		if (op == PRCO_SETOPT && *mp)
470			(void) m_free(*mp);
471		return (ECONNRESET);
472	}
473	if (level != IPPROTO_TCP) {
474		error = ip_ctloutput(op, so, level, optname, mp);
475		splx(s);
476		return (error);
477	}
478	tp = intotcpcb(inp);
479
480	switch (op) {
481
482	case PRCO_SETOPT:
483		m = *mp;
484		switch (optname) {
485
486		case TCP_NODELAY:
487			if (m == NULL || m->m_len < sizeof (int))
488				error = EINVAL;
489			else if (*mtod(m, int *))
490				tp->t_flags |= TF_NODELAY;
491			else
492				tp->t_flags &= ~TF_NODELAY;
493			break;
494
495		case TCP_MAXSEG:
496			if (m && (i = *mtod(m, int *)) > 0 && i <= tp->t_maxseg)
497				tp->t_maxseg = i;
498			else
499				error = EINVAL;
500			break;
501
502		case TCP_NOOPT:
503			if (m == NULL || m->m_len < sizeof (int))
504				error = EINVAL;
505			else if (*mtod(m, int *))
506				tp->t_flags |= TF_NOOPT;
507			else
508				tp->t_flags &= ~TF_NOOPT;
509			break;
510
511		case TCP_NOPUSH:
512			if (m == NULL || m->m_len < sizeof (int))
513				error = EINVAL;
514			else if (*mtod(m, int *))
515				tp->t_flags |= TF_NOPUSH;
516			else
517				tp->t_flags &= ~TF_NOPUSH;
518			break;
519
520		default:
521			error = ENOPROTOOPT;
522			break;
523		}
524		if (m)
525			(void) m_free(m);
526		break;
527
528	case PRCO_GETOPT:
529		*mp = m = m_get(M_WAIT, MT_SOOPTS);
530		m->m_len = sizeof(int);
531
532		switch (optname) {
533		case TCP_NODELAY:
534			*mtod(m, int *) = tp->t_flags & TF_NODELAY;
535			break;
536		case TCP_MAXSEG:
537			*mtod(m, int *) = tp->t_maxseg;
538			break;
539		case TCP_NOOPT:
540			*mtod(m, int *) = tp->t_flags & TF_NOOPT;
541			break;
542		case TCP_NOPUSH:
543			*mtod(m, int *) = tp->t_flags & TF_NOPUSH;
544			break;
545		default:
546			error = ENOPROTOOPT;
547			break;
548		}
549		break;
550	}
551	splx(s);
552	return (error);
553}
554
555/*
556 * tcp_sendspace and tcp_recvspace are the default send and receive window
557 * sizes, respectively.  These are obsolescent (this information should
558 * be set by the route).
559 */
560u_long	tcp_sendspace = 1024*16;
561u_long	tcp_recvspace = 1024*16;
562
563/*
564 * Attach TCP protocol to socket, allocating
565 * internet protocol control block, tcp control block,
566 * bufer space, and entering LISTEN state if to accept connections.
567 */
568int
569tcp_attach(so)
570	struct socket *so;
571{
572	register struct tcpcb *tp;
573	struct inpcb *inp;
574	int error;
575
576	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
577		error = soreserve(so, tcp_sendspace, tcp_recvspace);
578		if (error)
579			return (error);
580	}
581	error = in_pcballoc(so, &tcb);
582	if (error)
583		return (error);
584	inp = sotoinpcb(so);
585	tp = tcp_newtcpcb(inp);
586	if (tp == 0) {
587		int nofd = so->so_state & SS_NOFDREF;	/* XXX */
588
589		so->so_state &= ~SS_NOFDREF;	/* don't free the socket yet */
590		in_pcbdetach(inp);
591		so->so_state |= nofd;
592		return (ENOBUFS);
593	}
594	tp->t_state = TCPS_CLOSED;
595	return (0);
596}
597
598/*
599 * Initiate (or continue) disconnect.
600 * If embryonic state, just send reset (once).
601 * If in ``let data drain'' option and linger null, just drop.
602 * Otherwise (hard), mark socket disconnecting and drop
603 * current input data; switch states based on user close, and
604 * send segment to peer (with FIN).
605 */
606struct tcpcb *
607tcp_disconnect(tp)
608	register struct tcpcb *tp;
609{
610	struct socket *so = tp->t_inpcb->inp_socket;
611
612	if (tp->t_state < TCPS_ESTABLISHED)
613		tp = tcp_close(tp);
614	else if ((so->so_options & SO_LINGER) && so->so_linger == 0)
615		tp = tcp_drop(tp, 0);
616	else {
617		soisdisconnecting(so);
618		sbflush(&so->so_rcv);
619		tp = tcp_usrclosed(tp);
620		if (tp)
621			(void) tcp_output(tp);
622	}
623	return (tp);
624}
625
626/*
627 * User issued close, and wish to trail through shutdown states:
628 * if never received SYN, just forget it.  If got a SYN from peer,
629 * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN.
630 * If already got a FIN from peer, then almost done; go to LAST_ACK
631 * state.  In all other cases, have already sent FIN to peer (e.g.
632 * after PRU_SHUTDOWN), and just have to play tedious game waiting
633 * for peer to send FIN or not respond to keep-alives, etc.
634 * We can let the user exit from the close as soon as the FIN is acked.
635 */
636struct tcpcb *
637tcp_usrclosed(tp)
638	register struct tcpcb *tp;
639{
640
641	switch (tp->t_state) {
642
643	case TCPS_CLOSED:
644	case TCPS_LISTEN:
645		tp->t_state = TCPS_CLOSED;
646		tp = tcp_close(tp);
647		break;
648
649	case TCPS_SYN_SENT:
650	case TCPS_SYN_RECEIVED:
651		tp->t_flags |= TF_NEEDFIN;
652		break;
653
654	case TCPS_ESTABLISHED:
655		tp->t_state = TCPS_FIN_WAIT_1;
656		break;
657
658	case TCPS_CLOSE_WAIT:
659		tp->t_state = TCPS_LAST_ACK;
660		break;
661	}
662	if (tp && tp->t_state >= TCPS_FIN_WAIT_2)
663		soisdisconnected(tp->t_inpcb->inp_socket);
664	return (tp);
665}
666
667/*
668 * Sysctl for tcp variables.
669 */
670int
671tcp_sysctl(name, namelen, oldp, oldlenp, newp, newlen)
672	int *name;
673	u_int namelen;
674	void *oldp;
675	size_t *oldlenp;
676	void *newp;
677	size_t newlen;
678{
679	/* All sysctl names at this level are terminal. */
680	if (namelen != 1)
681		return (ENOTDIR);
682
683	switch (name[0]) {
684	case TCPCTL_DO_RFC1323:
685		return (sysctl_int(oldp, oldlenp, newp, newlen,
686		    &tcp_do_rfc1323));
687	case TCPCTL_DO_RFC1644:
688		return (sysctl_int(oldp, oldlenp, newp, newlen,
689		    &tcp_do_rfc1644));
690	case TCPCTL_MSSDFLT:
691		return (sysctl_int(oldp, oldlenp, newp, newlen,
692		    &tcp_mssdflt));
693	case TCPCTL_STATS:
694		return (sysctl_rdstruct(oldp, oldlenp, newp, &tcpstat,
695					sizeof tcpstat));
696	case TCPCTL_RTTDFLT:
697		return (sysctl_int(oldp, oldlenp, newp, newlen, &tcp_rttdflt));
698	case TCPCTL_KEEPIDLE:
699		return (sysctl_int(oldp, oldlenp, newp, newlen,
700				   &tcp_keepidle));
701	case TCPCTL_KEEPINTVL:
702		return (sysctl_int(oldp, oldlenp, newp, newlen,
703				   &tcp_keepintvl));
704	case TCPCTL_SENDSPACE:
705		return (sysctl_int(oldp, oldlenp, newp, newlen,
706				   (int *)&tcp_sendspace)); /* XXX */
707	case TCPCTL_RECVSPACE:
708		return (sysctl_int(oldp, oldlenp, newp, newlen,
709				   (int *)&tcp_recvspace)); /* XXX */
710	default:
711		return (ENOPROTOOPT);
712	}
713	/* NOTREACHED */
714}
715