tcp_usrreq.c revision 5112
1/*
2 * Copyright (c) 1982, 1986, 1988, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 * 3. All advertising materials mentioning features or use of this software
14 *    must display the following acknowledgement:
15 *	This product includes software developed by the University of
16 *	California, Berkeley and its contributors.
17 * 4. Neither the name of the University nor the names of its contributors
18 *    may be used to endorse or promote products derived from this software
19 *    without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31 * SUCH DAMAGE.
32 *
33 *	@(#)tcp_usrreq.c	8.2 (Berkeley) 1/3/94
34 * $Id: tcp_usrreq.c,v 1.5 1994/09/15 10:36:56 davidg Exp $
35 */
36
37#include <sys/param.h>
38#include <sys/systm.h>
39#include <sys/malloc.h>
40#include <sys/mbuf.h>
41#include <sys/socket.h>
42#include <sys/socketvar.h>
43#include <sys/protosw.h>
44#include <sys/errno.h>
45#include <sys/stat.h>
46
47#include <net/if.h>
48#include <net/route.h>
49
50#include <netinet/in.h>
51#include <netinet/in_systm.h>
52#include <netinet/ip.h>
53#include <netinet/in_pcb.h>
54#include <netinet/ip_var.h>
55#include <netinet/tcp.h>
56#include <netinet/tcp_fsm.h>
57#include <netinet/tcp_seq.h>
58#include <netinet/tcp_timer.h>
59#include <netinet/tcp_var.h>
60#include <netinet/tcpip.h>
61#ifdef TCPDEBUG
62#include <netinet/tcp_debug.h>
63#endif
64
65/*
66 * TCP protocol interface to socket abstraction.
67 */
68extern	char *tcpstates[];
69
70/*
71 * Process a TCP user request for TCP tb.  If this is a send request
72 * then m is the mbuf chain of send data.  If this is a timer expiration
73 * (called from the software clock routine), then timertype tells which timer.
74 */
75/*ARGSUSED*/
76int
77tcp_usrreq(so, req, m, nam, control)
78	struct socket *so;
79	int req;
80	struct mbuf *m, *nam, *control;
81{
82	register struct inpcb *inp;
83	register struct tcpcb *tp = 0;
84	struct sockaddr_in *sinp;
85	int s;
86	int error = 0;
87	int ostate;
88
89	if (req == PRU_CONTROL)
90		return (in_control(so, (int)m, (caddr_t)nam,
91			(struct ifnet *)control));
92	if (control && control->m_len) {
93		m_freem(control);
94		if (m)
95			m_freem(m);
96		return (EINVAL);
97	}
98
99	s = splnet();
100	inp = sotoinpcb(so);
101	/*
102	 * When a TCP is attached to a socket, then there will be
103	 * a (struct inpcb) pointed at by the socket, and this
104	 * structure will point at a subsidary (struct tcpcb).
105	 */
106	if (inp == 0 && req != PRU_ATTACH) {
107		splx(s);
108		return (EINVAL);		/* XXX */
109	}
110	if (inp) {
111		tp = intotcpcb(inp);
112		/* WHAT IF TP IS 0? */
113#ifdef KPROF
114		tcp_acounts[tp->t_state][req]++;
115#endif
116		ostate = tp->t_state;
117	} else
118		ostate = 0;
119	switch (req) {
120
121	/*
122	 * TCP attaches to socket via PRU_ATTACH, reserving space,
123	 * and an internet control block.
124	 */
125	case PRU_ATTACH:
126		if (inp) {
127			error = EISCONN;
128			break;
129		}
130		error = tcp_attach(so);
131		if (error)
132			break;
133		if ((so->so_options & SO_LINGER) && so->so_linger == 0)
134			so->so_linger = TCP_LINGERTIME;
135		tp = sototcpcb(so);
136		break;
137
138	/*
139	 * PRU_DETACH detaches the TCP protocol from the socket.
140	 * If the protocol state is non-embryonic, then can't
141	 * do this directly: have to initiate a PRU_DISCONNECT,
142	 * which may finish later; embryonic TCB's can just
143	 * be discarded here.
144	 */
145	case PRU_DETACH:
146		if (tp->t_state > TCPS_LISTEN)
147			tp = tcp_disconnect(tp);
148		else
149			tp = tcp_close(tp);
150		break;
151
152	/*
153	 * Give the socket an address.
154	 */
155	case PRU_BIND:
156		/*
157		 * Must check for multicast addresses and disallow binding
158		 * to them.
159		 */
160		sinp = mtod(nam, struct sockaddr_in *);
161		if (sinp->sin_family == AF_INET &&
162		    IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) {
163			error = EAFNOSUPPORT;
164			break;
165		}
166		error = in_pcbbind(inp, nam);
167		if (error)
168			break;
169		break;
170
171	/*
172	 * Prepare to accept connections.
173	 */
174	case PRU_LISTEN:
175		if (inp->inp_lport == 0)
176			error = in_pcbbind(inp, (struct mbuf *)0);
177		if (error == 0)
178			tp->t_state = TCPS_LISTEN;
179		break;
180
181	/*
182	 * Initiate connection to peer.
183	 * Create a template for use in transmissions on this connection.
184	 * Enter SYN_SENT state, and mark socket as connecting.
185	 * Start keep-alive timer, and seed output sequence space.
186	 * Send initial segment on connection.
187	 */
188	case PRU_CONNECT:
189		/*
190		 * Must disallow TCP ``connections'' to multicast addresses.
191		 */
192		sinp = mtod(nam, struct sockaddr_in *);
193		if (sinp->sin_family == AF_INET
194		    && IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) {
195			error = EAFNOSUPPORT;
196			break;
197		}
198
199		if (inp->inp_lport == 0) {
200			error = in_pcbbind(inp, (struct mbuf *)0);
201			if (error)
202				break;
203		}
204		error = in_pcbconnect(inp, nam);
205		if (error)
206			break;
207		tp->t_template = tcp_template(tp);
208		if (tp->t_template == 0) {
209			in_pcbdisconnect(inp);
210			error = ENOBUFS;
211			break;
212		}
213		/* Compute window scaling to request.  */
214		while (tp->request_r_scale < TCP_MAX_WINSHIFT &&
215		    (TCP_MAXWIN << tp->request_r_scale) < so->so_rcv.sb_hiwat)
216			tp->request_r_scale++;
217		soisconnecting(so);
218		tcpstat.tcps_connattempt++;
219		tp->t_state = TCPS_SYN_SENT;
220		tp->t_timer[TCPT_KEEP] = TCPTV_KEEP_INIT;
221		tp->iss = tcp_iss; tcp_iss += TCP_ISSINCR/2;
222		tcp_sendseqinit(tp);
223		error = tcp_output(tp);
224		break;
225
226	/*
227	 * Create a TCP connection between two sockets.
228	 */
229	case PRU_CONNECT2:
230		error = EOPNOTSUPP;
231		break;
232
233	/*
234	 * Initiate disconnect from peer.
235	 * If connection never passed embryonic stage, just drop;
236	 * else if don't need to let data drain, then can just drop anyways,
237	 * else have to begin TCP shutdown process: mark socket disconnecting,
238	 * drain unread data, state switch to reflect user close, and
239	 * send segment (e.g. FIN) to peer.  Socket will be really disconnected
240	 * when peer sends FIN and acks ours.
241	 *
242	 * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB.
243	 */
244	case PRU_DISCONNECT:
245		tp = tcp_disconnect(tp);
246		break;
247
248	/*
249	 * Accept a connection.  Essentially all the work is
250	 * done at higher levels; just return the address
251	 * of the peer, storing through addr.
252	 */
253	case PRU_ACCEPT:
254		in_setpeeraddr(inp, nam);
255		break;
256
257	/*
258	 * Mark the connection as being incapable of further output.
259	 */
260	case PRU_SHUTDOWN:
261		socantsendmore(so);
262		tp = tcp_usrclosed(tp);
263		if (tp)
264			error = tcp_output(tp);
265		break;
266
267	/*
268	 * After a receive, possibly send window update to peer.
269	 */
270	case PRU_RCVD:
271		(void) tcp_output(tp);
272		break;
273
274	/*
275	 * Do a send by putting data in output queue and updating urgent
276	 * marker if URG set.  Possibly send more data.
277	 */
278	case PRU_SEND:
279		sbappend(&so->so_snd, m);
280		error = tcp_output(tp);
281		break;
282
283	/*
284	 * Abort the TCP.
285	 */
286	case PRU_ABORT:
287		tp = tcp_drop(tp, ECONNABORTED);
288		break;
289
290	case PRU_SENSE:
291		((struct stat *) m)->st_blksize = so->so_snd.sb_hiwat;
292		(void) splx(s);
293		return (0);
294
295	case PRU_RCVOOB:
296		if ((so->so_oobmark == 0 &&
297		    (so->so_state & SS_RCVATMARK) == 0) ||
298		    so->so_options & SO_OOBINLINE ||
299		    tp->t_oobflags & TCPOOB_HADDATA) {
300			error = EINVAL;
301			break;
302		}
303		if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) {
304			error = EWOULDBLOCK;
305			break;
306		}
307		m->m_len = 1;
308		*mtod(m, caddr_t) = tp->t_iobc;
309		if (((int)nam & MSG_PEEK) == 0)
310			tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA);
311		break;
312
313	case PRU_SENDOOB:
314		if (sbspace(&so->so_snd) < -512) {
315			m_freem(m);
316			error = ENOBUFS;
317			break;
318		}
319		/*
320		 * According to RFC961 (Assigned Protocols),
321		 * the urgent pointer points to the last octet
322		 * of urgent data.  We continue, however,
323		 * to consider it to indicate the first octet
324		 * of data past the urgent section.
325		 * Otherwise, snd_up should be one lower.
326		 */
327		sbappend(&so->so_snd, m);
328		tp->snd_up = tp->snd_una + so->so_snd.sb_cc;
329		tp->t_force = 1;
330		error = tcp_output(tp);
331		tp->t_force = 0;
332		break;
333
334	case PRU_SOCKADDR:
335		in_setsockaddr(inp, nam);
336		break;
337
338	case PRU_PEERADDR:
339		in_setpeeraddr(inp, nam);
340		break;
341
342	/*
343	 * TCP slow timer went off; going through this
344	 * routine for tracing's sake.
345	 */
346	case PRU_SLOWTIMO:
347		tp = tcp_timers(tp, (int)nam);
348		req |= (int)nam << 8;		/* for debug's sake */
349		break;
350
351	default:
352		panic("tcp_usrreq");
353	}
354#ifdef TCPDEBUG
355	if (tp && (so->so_options & SO_DEBUG))
356		tcp_trace(TA_USER, ostate, tp, (struct tcpiphdr *)0, req);
357#endif
358	splx(s);
359	return (error);
360}
361
362int
363tcp_ctloutput(op, so, level, optname, mp)
364	int op;
365	struct socket *so;
366	int level, optname;
367	struct mbuf **mp;
368{
369	int error = 0, s;
370	struct inpcb *inp;
371	register struct tcpcb *tp;
372	register struct mbuf *m;
373	register int i;
374
375	s = splnet();
376	inp = sotoinpcb(so);
377	if (inp == NULL) {
378		splx(s);
379		if (op == PRCO_SETOPT && *mp)
380			(void) m_free(*mp);
381		return (ECONNRESET);
382	}
383	if (level != IPPROTO_TCP) {
384		error = ip_ctloutput(op, so, level, optname, mp);
385		splx(s);
386		return (error);
387	}
388	tp = intotcpcb(inp);
389
390	switch (op) {
391
392	case PRCO_SETOPT:
393		m = *mp;
394		switch (optname) {
395
396		case TCP_NODELAY:
397			if (m == NULL || m->m_len < sizeof (int))
398				error = EINVAL;
399			else if (*mtod(m, int *))
400				tp->t_flags |= TF_NODELAY;
401			else
402				tp->t_flags &= ~TF_NODELAY;
403			break;
404
405		case TCP_MAXSEG:
406			if (m && (i = *mtod(m, int *)) > 0 && i <= tp->t_maxseg)
407				tp->t_maxseg = i;
408			else
409				error = EINVAL;
410			break;
411
412		default:
413			error = ENOPROTOOPT;
414			break;
415		}
416		if (m)
417			(void) m_free(m);
418		break;
419
420	case PRCO_GETOPT:
421		*mp = m = m_get(M_WAIT, MT_SOOPTS);
422		m->m_len = sizeof(int);
423
424		switch (optname) {
425		case TCP_NODELAY:
426			*mtod(m, int *) = tp->t_flags & TF_NODELAY;
427			break;
428		case TCP_MAXSEG:
429			*mtod(m, int *) = tp->t_maxseg;
430			break;
431		default:
432			error = ENOPROTOOPT;
433			break;
434		}
435		break;
436	}
437	splx(s);
438	return (error);
439}
440
/*
 * Default send (tcp_sendspace) and receive (tcp_recvspace) window sizes,
 * in bytes: 4K each when TCP_SMALLSPACE is defined, 16K each otherwise.
 * These are obsolescent (this information should be set by the route).
 */
#ifdef TCP_SMALLSPACE
u_long	tcp_sendspace = 4 * 1024;
u_long	tcp_recvspace = 4 * 1024;
#else
u_long	tcp_sendspace = 16 * 1024;
u_long	tcp_recvspace = 16 * 1024;
#endif
453
454/*
455 * Attach TCP protocol to socket, allocating
456 * internet protocol control block, tcp control block,
457 * bufer space, and entering LISTEN state if to accept connections.
458 */
459int
460tcp_attach(so)
461	struct socket *so;
462{
463	register struct tcpcb *tp;
464	struct inpcb *inp;
465	int error;
466
467	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
468		error = soreserve(so, tcp_sendspace, tcp_recvspace);
469		if (error)
470			return (error);
471	}
472	error = in_pcballoc(so, &tcb);
473	if (error)
474		return (error);
475	inp = sotoinpcb(so);
476	tp = tcp_newtcpcb(inp);
477	if (tp == 0) {
478		int nofd = so->so_state & SS_NOFDREF;	/* XXX */
479
480		so->so_state &= ~SS_NOFDREF;	/* don't free the socket yet */
481		in_pcbdetach(inp);
482		so->so_state |= nofd;
483		return (ENOBUFS);
484	}
485	tp->t_state = TCPS_CLOSED;
486	return (0);
487}
488
489/*
490 * Initiate (or continue) disconnect.
491 * If embryonic state, just send reset (once).
492 * If in ``let data drain'' option and linger null, just drop.
493 * Otherwise (hard), mark socket disconnecting and drop
494 * current input data; switch states based on user close, and
495 * send segment to peer (with FIN).
496 */
497struct tcpcb *
498tcp_disconnect(tp)
499	register struct tcpcb *tp;
500{
501	struct socket *so = tp->t_inpcb->inp_socket;
502
503	if (tp->t_state < TCPS_ESTABLISHED)
504		tp = tcp_close(tp);
505	else if ((so->so_options & SO_LINGER) && so->so_linger == 0)
506		tp = tcp_drop(tp, 0);
507	else {
508		soisdisconnecting(so);
509		sbflush(&so->so_rcv);
510		tp = tcp_usrclosed(tp);
511		if (tp)
512			(void) tcp_output(tp);
513	}
514	return (tp);
515}
516
517/*
518 * User issued close, and wish to trail through shutdown states:
519 * if never received SYN, just forget it.  If got a SYN from peer,
520 * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN.
521 * If already got a FIN from peer, then almost done; go to LAST_ACK
522 * state.  In all other cases, have already sent FIN to peer (e.g.
523 * after PRU_SHUTDOWN), and just have to play tedious game waiting
524 * for peer to send FIN or not respond to keep-alives, etc.
525 * We can let the user exit from the close as soon as the FIN is acked.
526 */
527struct tcpcb *
528tcp_usrclosed(tp)
529	register struct tcpcb *tp;
530{
531
532	switch (tp->t_state) {
533
534	case TCPS_CLOSED:
535	case TCPS_LISTEN:
536	case TCPS_SYN_SENT:
537		tp->t_state = TCPS_CLOSED;
538		tp = tcp_close(tp);
539		break;
540
541	case TCPS_SYN_RECEIVED:
542	case TCPS_ESTABLISHED:
543		tp->t_state = TCPS_FIN_WAIT_1;
544		break;
545
546	case TCPS_CLOSE_WAIT:
547		tp->t_state = TCPS_LAST_ACK;
548		break;
549	}
550	if (tp && tp->t_state >= TCPS_FIN_WAIT_2)
551		soisdisconnected(tp->t_inpcb->inp_socket);
552	return (tp);
553}
554