tcp_usrreq.c revision 6475
1/*
2 * Copyright (c) 1982, 1986, 1988, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 * 3. All advertising materials mentioning features or use of this software
14 *    must display the following acknowledgement:
15 *	This product includes software developed by the University of
16 *	California, Berkeley and its contributors.
17 * 4. Neither the name of the University nor the names of its contributors
18 *    may be used to endorse or promote products derived from this software
19 *    without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31 * SUCH DAMAGE.
32 *
33 *	@(#)tcp_usrreq.c	8.2 (Berkeley) 1/3/94
34 * $Id: tcp_usrreq.c,v 1.7 1995/02/09 23:13:27 wollman Exp $
35 */
36
37#include <sys/param.h>
38#include <sys/systm.h>
39#include <sys/malloc.h>
40#include <sys/mbuf.h>
41#include <sys/socket.h>
42#include <sys/socketvar.h>
43#include <sys/protosw.h>
44#include <sys/errno.h>
45#include <sys/stat.h>
46
47#include <net/if.h>
48#include <net/route.h>
49
50#include <netinet/in.h>
51#include <netinet/in_systm.h>
52#include <netinet/ip.h>
53#include <netinet/in_pcb.h>
54#include <netinet/ip_var.h>
55#include <netinet/tcp.h>
56#include <netinet/tcp_fsm.h>
57#include <netinet/tcp_seq.h>
58#include <netinet/tcp_timer.h>
59#include <netinet/tcp_var.h>
60#include <netinet/tcpip.h>
61#ifdef TCPDEBUG
62#include <netinet/tcp_debug.h>
63#endif
64
65/*
66 * TCP protocol interface to socket abstraction.
67 */
68extern	char *tcpstates[];
69
70/*
71 * Process a TCP user request for TCP tb.  If this is a send request
72 * then m is the mbuf chain of send data.  If this is a timer expiration
73 * (called from the software clock routine), then timertype tells which timer.
74 */
75/*ARGSUSED*/
76int
77tcp_usrreq(so, req, m, nam, control)
78	struct socket *so;
79	int req;
80	struct mbuf *m, *nam, *control;
81{
82	register struct inpcb *inp;
83	register struct tcpcb *tp = 0;
84	struct sockaddr_in *sinp;
85	int s;
86	int error = 0;
87#ifdef TCPDEBUG
88	int ostate;
89#endif
90
91	if (req == PRU_CONTROL)
92		return (in_control(so, (int)m, (caddr_t)nam,
93			(struct ifnet *)control));
94	if (control && control->m_len) {
95		m_freem(control);
96		if (m)
97			m_freem(m);
98		return (EINVAL);
99	}
100
101	s = splnet();
102	inp = sotoinpcb(so);
103	/*
104	 * When a TCP is attached to a socket, then there will be
105	 * a (struct inpcb) pointed at by the socket, and this
106	 * structure will point at a subsidary (struct tcpcb).
107	 */
108	if (inp == 0 && req != PRU_ATTACH) {
109		splx(s);
110		return (EINVAL);		/* XXX */
111	}
112	if (inp) {
113		tp = intotcpcb(inp);
114		/* WHAT IF TP IS 0? */
115#ifdef KPROF
116		tcp_acounts[tp->t_state][req]++;
117#endif
118#ifdef TCPDEBUG
119		ostate = tp->t_state;
120	} else
121		ostate = 0;
122#else /* TCPDEBUG */
123	}
124#endif /* TCPDEBUG */
125
126	switch (req) {
127
128	/*
129	 * TCP attaches to socket via PRU_ATTACH, reserving space,
130	 * and an internet control block.
131	 */
132	case PRU_ATTACH:
133		if (inp) {
134			error = EISCONN;
135			break;
136		}
137		error = tcp_attach(so);
138		if (error)
139			break;
140		if ((so->so_options & SO_LINGER) && so->so_linger == 0)
141			so->so_linger = TCP_LINGERTIME;
142		tp = sototcpcb(so);
143		break;
144
145	/*
146	 * PRU_DETACH detaches the TCP protocol from the socket.
147	 * If the protocol state is non-embryonic, then can't
148	 * do this directly: have to initiate a PRU_DISCONNECT,
149	 * which may finish later; embryonic TCB's can just
150	 * be discarded here.
151	 */
152	case PRU_DETACH:
153		if (tp->t_state > TCPS_LISTEN)
154			tp = tcp_disconnect(tp);
155		else
156			tp = tcp_close(tp);
157		break;
158
159	/*
160	 * Give the socket an address.
161	 */
162	case PRU_BIND:
163		/*
164		 * Must check for multicast addresses and disallow binding
165		 * to them.
166		 */
167		sinp = mtod(nam, struct sockaddr_in *);
168		if (sinp->sin_family == AF_INET &&
169		    IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) {
170			error = EAFNOSUPPORT;
171			break;
172		}
173		error = in_pcbbind(inp, nam);
174		if (error)
175			break;
176		break;
177
178	/*
179	 * Prepare to accept connections.
180	 */
181	case PRU_LISTEN:
182		if (inp->inp_lport == 0)
183			error = in_pcbbind(inp, (struct mbuf *)0);
184		if (error == 0)
185			tp->t_state = TCPS_LISTEN;
186		break;
187
188	/*
189	 * Initiate connection to peer.
190	 * Create a template for use in transmissions on this connection.
191	 * Enter SYN_SENT state, and mark socket as connecting.
192	 * Start keep-alive timer, and seed output sequence space.
193	 * Send initial segment on connection.
194	 */
195	case PRU_CONNECT:
196		/*
197		 * Must disallow TCP ``connections'' to multicast addresses.
198		 */
199		sinp = mtod(nam, struct sockaddr_in *);
200		if (sinp->sin_family == AF_INET
201		    && IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) {
202			error = EAFNOSUPPORT;
203			break;
204		}
205
206		if ((error = tcp_connect(tp, nam)) != 0)
207			break;
208		error = tcp_output(tp);
209		break;
210
211	/*
212	 * Create a TCP connection between two sockets.
213	 */
214	case PRU_CONNECT2:
215		error = EOPNOTSUPP;
216		break;
217
218	/*
219	 * Initiate disconnect from peer.
220	 * If connection never passed embryonic stage, just drop;
221	 * else if don't need to let data drain, then can just drop anyways,
222	 * else have to begin TCP shutdown process: mark socket disconnecting,
223	 * drain unread data, state switch to reflect user close, and
224	 * send segment (e.g. FIN) to peer.  Socket will be really disconnected
225	 * when peer sends FIN and acks ours.
226	 *
227	 * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB.
228	 */
229	case PRU_DISCONNECT:
230		tp = tcp_disconnect(tp);
231		break;
232
233	/*
234	 * Accept a connection.  Essentially all the work is
235	 * done at higher levels; just return the address
236	 * of the peer, storing through addr.
237	 */
238	case PRU_ACCEPT:
239		in_setpeeraddr(inp, nam);
240		break;
241
242	/*
243	 * Mark the connection as being incapable of further output.
244	 */
245	case PRU_SHUTDOWN:
246		socantsendmore(so);
247		tp = tcp_usrclosed(tp);
248		if (tp)
249			error = tcp_output(tp);
250		break;
251
252	/*
253	 * After a receive, possibly send window update to peer.
254	 */
255	case PRU_RCVD:
256		(void) tcp_output(tp);
257		break;
258
259	/*
260	 * Do a send by putting data in output queue and updating urgent
261	 * marker if URG set.  Possibly send more data.
262	 */
263	case PRU_SEND_EOF:
264	case PRU_SEND:
265		sbappend(&so->so_snd, m);
266		if (nam && tp->t_state < TCPS_SYN_SENT) {
267			/*
268			 * Do implied connect if not yet connected,
269			 * initialize window to default value, and
270			 * initialize maxseg/maxopd using peer's cached
271			 * MSS.
272			 */
273			error = tcp_connect(tp, nam);
274			if (error)
275				break;
276			tp->snd_wnd = TTCP_CLIENT_SND_WND;
277			tcp_mss(tp, -1);
278		}
279
280		if (req == PRU_SEND_EOF) {
281			/*
282			 * Close the send side of the connection after
283			 * the data is sent.
284			 */
285			socantsendmore(so);
286			tp = tcp_usrclosed(tp);
287		}
288		if (tp != NULL)
289			error = tcp_output(tp);
290		break;
291
292	/*
293	 * Abort the TCP.
294	 */
295	case PRU_ABORT:
296		tp = tcp_drop(tp, ECONNABORTED);
297		break;
298
299	case PRU_SENSE:
300		((struct stat *) m)->st_blksize = so->so_snd.sb_hiwat;
301		(void) splx(s);
302		return (0);
303
304	case PRU_RCVOOB:
305		if ((so->so_oobmark == 0 &&
306		    (so->so_state & SS_RCVATMARK) == 0) ||
307		    so->so_options & SO_OOBINLINE ||
308		    tp->t_oobflags & TCPOOB_HADDATA) {
309			error = EINVAL;
310			break;
311		}
312		if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) {
313			error = EWOULDBLOCK;
314			break;
315		}
316		m->m_len = 1;
317		*mtod(m, caddr_t) = tp->t_iobc;
318		if (((int)nam & MSG_PEEK) == 0)
319			tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA);
320		break;
321
322	case PRU_SENDOOB:
323		if (sbspace(&so->so_snd) < -512) {
324			m_freem(m);
325			error = ENOBUFS;
326			break;
327		}
328		/*
329		 * According to RFC961 (Assigned Protocols),
330		 * the urgent pointer points to the last octet
331		 * of urgent data.  We continue, however,
332		 * to consider it to indicate the first octet
333		 * of data past the urgent section.
334		 * Otherwise, snd_up should be one lower.
335		 */
336		sbappend(&so->so_snd, m);
337		tp->snd_up = tp->snd_una + so->so_snd.sb_cc;
338		tp->t_force = 1;
339		error = tcp_output(tp);
340		tp->t_force = 0;
341		break;
342
343	case PRU_SOCKADDR:
344		in_setsockaddr(inp, nam);
345		break;
346
347	case PRU_PEERADDR:
348		in_setpeeraddr(inp, nam);
349		break;
350
351	/*
352	 * TCP slow timer went off; going through this
353	 * routine for tracing's sake.
354	 */
355	case PRU_SLOWTIMO:
356		tp = tcp_timers(tp, (int)nam);
357#ifdef TCPDEBUG
358		req |= (int)nam << 8;		/* for debug's sake */
359#endif
360		break;
361
362	default:
363		panic("tcp_usrreq");
364	}
365#ifdef TCPDEBUG
366	if (tp && (so->so_options & SO_DEBUG))
367		tcp_trace(TA_USER, ostate, tp, (struct tcpiphdr *)0, req);
368#endif
369	splx(s);
370	return (error);
371}
372
373/*
374 * Common subroutine to open a TCP connection to remote host specified
375 * by struct sockaddr_in in mbuf *nam.  Call in_pcbbind to assign a local
376 * port number if needed.  Call in_pcbladdr to do the routing and to choose
377 * a local host address (interface).  If there is an existing incarnation
378 * of the same connection in TIME-WAIT state and if the remote host was
379 * sending CC options and if the connection duration was < MSL, then
380 * truncate the previous TIME-WAIT state and proceed.
381 * Initialize connection parameters and enter SYN-SENT state.
382 */
383int
384tcp_connect(tp, nam)
385	register struct tcpcb *tp;
386	struct mbuf *nam;
387{
388	struct inpcb *inp = tp->t_inpcb, *oinp;
389	struct socket *so = inp->inp_socket;
390	struct tcpcb *otp;
391	struct sockaddr_in *sin = mtod(nam, struct sockaddr_in *);
392	struct sockaddr_in *ifaddr;
393	int error;
394
395	if (inp->inp_lport == 0) {
396		error = in_pcbbind(inp, NULL);
397		if (error)
398			return error;
399	}
400
401	/*
402	 * Cannot simply call in_pcbconnect, because there might be an
403	 * earlier incarnation of this same connection still in
404	 * TIME_WAIT state, creating an ADDRINUSE error.
405	 */
406	error = in_pcbladdr(inp, nam, &ifaddr);
407	oinp = in_pcblookup(inp->inp_head,
408	    sin->sin_addr, sin->sin_port,
409	    inp->inp_laddr.s_addr != INADDR_ANY ? inp->inp_laddr
410						: ifaddr->sin_addr,
411	    inp->inp_lport,  0);
412	if (oinp) {
413		if (oinp != inp && (otp = intotcpcb(oinp)) != NULL &&
414		otp->t_state == TCPS_TIME_WAIT &&
415		    otp->t_duration < TCPTV_MSL &&
416		    (otp->t_flags & TF_RCVD_CC))
417			otp = tcp_close(otp);
418		else
419			return EADDRINUSE;
420	}
421	if (inp->inp_laddr.s_addr == INADDR_ANY)
422		inp->inp_laddr = ifaddr->sin_addr;
423	inp->inp_faddr = sin->sin_addr;
424	inp->inp_fport = sin->sin_port;
425
426	tp->t_template = tcp_template(tp);
427	if (tp->t_template == 0) {
428		in_pcbdisconnect(inp);
429		return ENOBUFS;
430	}
431
432	/* Compute window scaling to request.  */
433	while (tp->request_r_scale < TCP_MAX_WINSHIFT &&
434	    (TCP_MAXWIN << tp->request_r_scale) < so->so_rcv.sb_hiwat)
435		tp->request_r_scale++;
436
437	soisconnecting(so);
438	tcpstat.tcps_connattempt++;
439	tp->t_state = TCPS_SYN_SENT;
440	tp->t_timer[TCPT_KEEP] = TCPTV_KEEP_INIT;
441	tp->iss = tcp_iss; tcp_iss += TCP_ISSINCR/2;
442	tcp_sendseqinit(tp);
443	tp->cc_send = CC_INC(tcp_ccgen);
444
445	return 0;
446}
447
448int
449tcp_ctloutput(op, so, level, optname, mp)
450	int op;
451	struct socket *so;
452	int level, optname;
453	struct mbuf **mp;
454{
455	int error = 0, s;
456	struct inpcb *inp;
457	register struct tcpcb *tp;
458	register struct mbuf *m;
459	register int i;
460
461	s = splnet();
462	inp = sotoinpcb(so);
463	if (inp == NULL) {
464		splx(s);
465		if (op == PRCO_SETOPT && *mp)
466			(void) m_free(*mp);
467		return (ECONNRESET);
468	}
469	if (level != IPPROTO_TCP) {
470		error = ip_ctloutput(op, so, level, optname, mp);
471		splx(s);
472		return (error);
473	}
474	tp = intotcpcb(inp);
475
476	switch (op) {
477
478	case PRCO_SETOPT:
479		m = *mp;
480		switch (optname) {
481
482		case TCP_NODELAY:
483			if (m == NULL || m->m_len < sizeof (int))
484				error = EINVAL;
485			else if (*mtod(m, int *))
486				tp->t_flags |= TF_NODELAY;
487			else
488				tp->t_flags &= ~TF_NODELAY;
489			break;
490
491		case TCP_MAXSEG:
492			if (m && (i = *mtod(m, int *)) > 0 && i <= tp->t_maxseg)
493				tp->t_maxseg = i;
494			else
495				error = EINVAL;
496			break;
497
498		case TCP_NOOPT:
499			if (m == NULL || m->m_len < sizeof (int))
500				error = EINVAL;
501			else if (*mtod(m, int *))
502				tp->t_flags |= TF_NOOPT;
503			else
504				tp->t_flags &= ~TF_NOOPT;
505			break;
506
507		case TCP_NOPUSH:
508			if (m == NULL || m->m_len < sizeof (int))
509				error = EINVAL;
510			else if (*mtod(m, int *))
511				tp->t_flags |= TF_NOPUSH;
512			else
513				tp->t_flags &= ~TF_NOPUSH;
514			break;
515
516		default:
517			error = ENOPROTOOPT;
518			break;
519		}
520		if (m)
521			(void) m_free(m);
522		break;
523
524	case PRCO_GETOPT:
525		*mp = m = m_get(M_WAIT, MT_SOOPTS);
526		m->m_len = sizeof(int);
527
528		switch (optname) {
529		case TCP_NODELAY:
530			*mtod(m, int *) = tp->t_flags & TF_NODELAY;
531			break;
532		case TCP_MAXSEG:
533			*mtod(m, int *) = tp->t_maxseg;
534			break;
535		case TCP_NOOPT:
536			*mtod(m, int *) = tp->t_flags & TF_NOOPT;
537			break;
538		case TCP_NOPUSH:
539			*mtod(m, int *) = tp->t_flags & TF_NOPUSH;
540			break;
541		default:
542			error = ENOPROTOOPT;
543			break;
544		}
545		break;
546	}
547	splx(s);
548	return (error);
549}
550
551/*
552 * tcp_sendspace and tcp_recvspace are the default send and receive window
553 * sizes, respectively.  These are obsolescent (this information should
554 * be set by the route).
555 */
556u_long	tcp_sendspace = 1024*16;
557u_long	tcp_recvspace = 1024*16;
558
559/*
560 * Attach TCP protocol to socket, allocating
561 * internet protocol control block, tcp control block,
562 * bufer space, and entering LISTEN state if to accept connections.
563 */
564int
565tcp_attach(so)
566	struct socket *so;
567{
568	register struct tcpcb *tp;
569	struct inpcb *inp;
570	int error;
571
572	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
573		error = soreserve(so, tcp_sendspace, tcp_recvspace);
574		if (error)
575			return (error);
576	}
577	error = in_pcballoc(so, &tcb);
578	if (error)
579		return (error);
580	inp = sotoinpcb(so);
581	tp = tcp_newtcpcb(inp);
582	if (tp == 0) {
583		int nofd = so->so_state & SS_NOFDREF;	/* XXX */
584
585		so->so_state &= ~SS_NOFDREF;	/* don't free the socket yet */
586		in_pcbdetach(inp);
587		so->so_state |= nofd;
588		return (ENOBUFS);
589	}
590	tp->t_state = TCPS_CLOSED;
591	return (0);
592}
593
594/*
595 * Initiate (or continue) disconnect.
596 * If embryonic state, just send reset (once).
597 * If in ``let data drain'' option and linger null, just drop.
598 * Otherwise (hard), mark socket disconnecting and drop
599 * current input data; switch states based on user close, and
600 * send segment to peer (with FIN).
601 */
602struct tcpcb *
603tcp_disconnect(tp)
604	register struct tcpcb *tp;
605{
606	struct socket *so = tp->t_inpcb->inp_socket;
607
608	if (tp->t_state < TCPS_ESTABLISHED)
609		tp = tcp_close(tp);
610	else if ((so->so_options & SO_LINGER) && so->so_linger == 0)
611		tp = tcp_drop(tp, 0);
612	else {
613		soisdisconnecting(so);
614		sbflush(&so->so_rcv);
615		tp = tcp_usrclosed(tp);
616		if (tp)
617			(void) tcp_output(tp);
618	}
619	return (tp);
620}
621
622/*
623 * User issued close, and wish to trail through shutdown states:
624 * if never received SYN, just forget it.  If got a SYN from peer,
625 * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN.
626 * If already got a FIN from peer, then almost done; go to LAST_ACK
627 * state.  In all other cases, have already sent FIN to peer (e.g.
628 * after PRU_SHUTDOWN), and just have to play tedious game waiting
629 * for peer to send FIN or not respond to keep-alives, etc.
630 * We can let the user exit from the close as soon as the FIN is acked.
631 */
632struct tcpcb *
633tcp_usrclosed(tp)
634	register struct tcpcb *tp;
635{
636
637	switch (tp->t_state) {
638
639	case TCPS_CLOSED:
640	case TCPS_LISTEN:
641		tp->t_state = TCPS_CLOSED;
642		tp = tcp_close(tp);
643		break;
644
645	case TCPS_SYN_SENT:
646	case TCPS_SYN_RECEIVED:
647		tp->t_flags |= TF_NEEDFIN;
648		break;
649
650	case TCPS_ESTABLISHED:
651		tp->t_state = TCPS_FIN_WAIT_1;
652		break;
653
654	case TCPS_CLOSE_WAIT:
655		tp->t_state = TCPS_LAST_ACK;
656		break;
657	}
658	if (tp && tp->t_state >= TCPS_FIN_WAIT_2)
659		soisdisconnected(tp->t_inpcb->inp_socket);
660	return (tp);
661}
662
663/*
664 * Sysctl for tcp variables.
665 */
666int
667tcp_sysctl(name, namelen, oldp, oldlenp, newp, newlen)
668	int *name;
669	u_int namelen;
670	void *oldp;
671	size_t *oldlenp;
672	void *newp;
673	size_t newlen;
674{
675	extern	int tcp_do_rfc1323; /* XXX */
676	extern	int tcp_do_rfc1644; /* XXX */
677	extern	int tcp_mssdflt; /* XXX */
678	extern	int tcp_rttdflt; /* XXX */
679
680	/* All sysctl names at this level are terminal. */
681	if (namelen != 1)
682		return (ENOTDIR);
683
684	switch (name[0]) {
685	case TCPCTL_DO_RFC1323:
686		return (sysctl_int(oldp, oldlenp, newp, newlen,
687		    &tcp_do_rfc1323));
688	case TCPCTL_DO_RFC1644:
689		return (sysctl_int(oldp, oldlenp, newp, newlen,
690		    &tcp_do_rfc1644));
691	case TCPCTL_MSSDFLT:
692		return (sysctl_int(oldp, oldlenp, newp, newlen,
693		    &tcp_mssdflt));
694	case TCPCTL_STATS:
695		return (sysctl_rdstruct(oldp, oldlenp, newp, &tcpstat,
696					sizeof tcpstat));
697	case TCPCTL_RTTDFLT:
698		return (sysctl_int(oldp, oldlenp, newp, newlen, &tcp_rttdflt));
699	case TCPCTL_KEEPIDLE:
700		return (sysctl_int(oldp, oldlenp, newp, newlen,
701				   &tcp_keepidle));
702	case TCPCTL_KEEPINTVL:
703		return (sysctl_int(oldp, oldlenp, newp, newlen,
704				   &tcp_keepintvl));
705	case TCPCTL_SENDSPACE:
706		return (sysctl_int(oldp, oldlenp, newp, newlen,
707				   (int *)&tcp_sendspace)); /* XXX */
708	case TCPCTL_RECVSPACE:
709		return (sysctl_int(oldp, oldlenp, newp, newlen,
710				   (int *)&tcp_recvspace)); /* XXX */
711	default:
712		return (ENOPROTOOPT);
713	}
714	/* NOTREACHED */
715}
716