raw_ip.c revision 41793
1/*
2 * Copyright (c) 1982, 1986, 1988, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 * 3. All advertising materials mentioning features or use of this software
14 *    must display the following acknowledgement:
15 *	This product includes software developed by the University of
16 *	California, Berkeley and its contributors.
17 * 4. Neither the name of the University nor the names of its contributors
18 *    may be used to endorse or promote products derived from this software
19 *    without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31 * SUCH DAMAGE.
32 *
33 *	@(#)raw_ip.c	8.7 (Berkeley) 5/15/95
34 *	$Id: raw_ip.c,v 1.55 1998/08/23 03:07:14 wollman Exp $
35 */
36
37#include <sys/param.h>
38#include <sys/systm.h>
39#include <sys/kernel.h>
40#include <sys/malloc.h>
41#include <sys/mbuf.h>
42#include <sys/proc.h>
43#include <sys/protosw.h>
44#include <sys/socket.h>
45#include <sys/socketvar.h>
46#include <sys/sysctl.h>
47
48#include <vm/vm_zone.h>
49
50#include <net/if.h>
51#include <net/route.h>
52
53#define _IP_VHL
54#include <netinet/in.h>
55#include <netinet/in_systm.h>
56#include <netinet/ip.h>
57#include <netinet/in_pcb.h>
58#include <netinet/in_var.h>
59#include <netinet/ip_var.h>
60#include <netinet/ip_mroute.h>
61
62#include <netinet/ip_fw.h>
63
64#include "opt_ipdn.h"
65#ifdef DUMMYNET
66#include <netinet/ip_dummynet.h>
67#endif
68#if !defined(COMPAT_IPFW) || COMPAT_IPFW == 1
69#undef COMPAT_IPFW
70#define COMPAT_IPFW 1
71#else
72#undef COMPAT_IPFW
73#endif
74
75static struct inpcbhead ripcb;
76static struct inpcbinfo ripcbinfo;
77
78/*
79 * Nominal space allocated to a raw ip socket.
80 */
81#define	RIPSNDQ		8192
82#define	RIPRCVQ		8192
83
84/*
85 * Raw interface to IP protocol.
86 */
87
88/*
89 * Initialize raw connection block q.
90 */
91void
92rip_init()
93{
94	LIST_INIT(&ripcb);
95	ripcbinfo.listhead = &ripcb;
96	/*
97	 * XXX We don't use the hash list for raw IP, but it's easier
98	 * to allocate a one entry hash list than it is to check all
99	 * over the place for hashbase == NULL.
100	 */
101	ripcbinfo.hashbase = hashinit(1, M_PCB, &ripcbinfo.hashmask);
102	ripcbinfo.porthashbase = hashinit(1, M_PCB, &ripcbinfo.porthashmask);
103	ripcbinfo.ipi_zone = zinit("ripcb", sizeof(struct inpcb),
104				   maxsockets, ZONE_INTERRUPT, 0);
105}
106
107static struct	sockaddr_in ripsrc = { sizeof(ripsrc), AF_INET };
108/*
109 * Setup generic address and protocol structures
110 * for raw_input routine, then pass them along with
111 * mbuf chain.
112 */
113void
114rip_input(m, iphlen)
115	struct mbuf *m;
116	int iphlen;
117{
118	register struct ip *ip = mtod(m, struct ip *);
119	register struct inpcb *inp;
120	struct inpcb *last = 0;
121	struct mbuf *opts = 0;
122
123	ripsrc.sin_addr = ip->ip_src;
124	for (inp = ripcb.lh_first; inp != NULL; inp = inp->inp_list.le_next) {
125		if (inp->inp_ip_p && inp->inp_ip_p != ip->ip_p)
126			continue;
127		if (inp->inp_laddr.s_addr &&
128                  inp->inp_laddr.s_addr != ip->ip_dst.s_addr)
129			continue;
130		if (inp->inp_faddr.s_addr &&
131                  inp->inp_faddr.s_addr != ip->ip_src.s_addr)
132			continue;
133		if (last) {
134			struct mbuf *n = m_copy(m, 0, (int)M_COPYALL);
135			if (n) {
136				if (last->inp_flags & INP_CONTROLOPTS ||
137				    last->inp_socket->so_options & SO_TIMESTAMP)
138				    ip_savecontrol(last, &opts, ip, n);
139				if (sbappendaddr(&last->inp_socket->so_rcv,
140				    (struct sockaddr *)&ripsrc, n,
141				    opts) == 0) {
142					/* should notify about lost packet */
143					m_freem(n);
144					if (opts)
145					    m_freem(opts);
146				} else
147					sorwakeup(last->inp_socket);
148				opts = 0;
149			}
150		}
151		last = inp;
152	}
153	if (last) {
154		if (last->inp_flags & INP_CONTROLOPTS ||
155		    last->inp_socket->so_options & SO_TIMESTAMP)
156			ip_savecontrol(last, &opts, ip, m);
157		if (sbappendaddr(&last->inp_socket->so_rcv,
158		    (struct sockaddr *)&ripsrc, m, opts) == 0) {
159			m_freem(m);
160			if (opts)
161			    m_freem(opts);
162		} else
163			sorwakeup(last->inp_socket);
164	} else {
165		m_freem(m);
166              ipstat.ips_noproto++;
167              ipstat.ips_delivered--;
168      }
169}
170
171/*
172 * Generate IP header and pass packet to ip_output.
173 * Tack on options user may have setup with control call.
174 */
175int
176rip_output(m, so, dst)
177	register struct mbuf *m;
178	struct socket *so;
179	u_long dst;
180{
181	register struct ip *ip;
182	register struct inpcb *inp = sotoinpcb(so);
183	int flags = (so->so_options & SO_DONTROUTE) | IP_ALLOWBROADCAST;
184
185	/*
186	 * If the user handed us a complete IP packet, use it.
187	 * Otherwise, allocate an mbuf for a header and fill it in.
188	 */
189	if ((inp->inp_flags & INP_HDRINCL) == 0) {
190		if (m->m_pkthdr.len + sizeof(struct ip) > IP_MAXPACKET) {
191			m_freem(m);
192			return(EMSGSIZE);
193		}
194		M_PREPEND(m, sizeof(struct ip), M_WAIT);
195		ip = mtod(m, struct ip *);
196		ip->ip_tos = 0;
197		ip->ip_off = 0;
198		ip->ip_p = inp->inp_ip_p;
199		ip->ip_len = m->m_pkthdr.len;
200		ip->ip_src = inp->inp_laddr;
201		ip->ip_dst.s_addr = dst;
202		ip->ip_ttl = MAXTTL;
203	} else {
204		if (m->m_pkthdr.len > IP_MAXPACKET) {
205			m_freem(m);
206			return(EMSGSIZE);
207		}
208		ip = mtod(m, struct ip *);
209		/* don't allow both user specified and setsockopt options,
210		   and don't allow packet length sizes that will crash */
211		if (((IP_VHL_HL(ip->ip_vhl) != (sizeof (*ip) >> 2))
212		     && inp->inp_options)
213		    || (ip->ip_len > m->m_pkthdr.len)
214		    || (ip->ip_len < (IP_VHL_HL(ip->ip_vhl) << 2))) {
215			m_freem(m);
216			return EINVAL;
217		}
218		if (ip->ip_id == 0)
219			ip->ip_id = htons(ip_id++);
220		/* XXX prevent ip_output from overwriting header fields */
221		flags |= IP_RAWOUTPUT;
222		ipstat.ips_rawout++;
223	}
224	return (ip_output(m, inp->inp_options, &inp->inp_route, flags,
225			  inp->inp_moptions));
226}
227
228/*
229 * Raw IP socket option processing.
230 */
231int
232rip_ctloutput(so, sopt)
233	struct socket *so;
234	struct sockopt *sopt;
235{
236	struct	inpcb *inp = sotoinpcb(so);
237	int	error, optval;
238
239	if (sopt->sopt_level != IPPROTO_IP)
240		return (EINVAL);
241
242	error = 0;
243
244	switch (sopt->sopt_dir) {
245	case SOPT_GET:
246		switch (sopt->sopt_name) {
247		case IP_HDRINCL:
248			optval = inp->inp_flags & INP_HDRINCL;
249			error = sooptcopyout(sopt, &optval, sizeof optval);
250			break;
251
252#ifdef COMPAT_IPFW
253		case IP_FW_GET:
254			if (ip_fw_ctl_ptr == 0)
255				error = ENOPROTOOPT;
256			else
257				error = ip_fw_ctl_ptr(sopt);
258			break;
259
260		case IP_NAT:
261			if (ip_nat_ctl_ptr == 0)
262				error = ENOPROTOOPT;
263			else
264				error = ip_nat_ctl_ptr(sopt);
265			break;
266#ifdef DUMMYNET
267		case IP_DUMMYNET_GET:
268			if (ip_dn_ctl_ptr == NULL)
269				error = ENOPROTOOPT ;
270			else
271				error = ip_dn_ctl_ptr(sopt);
272			break ;
273#endif /* DUMMYNET */
274#endif /* COMPAT_IPFW */
275
276		case MRT_INIT:
277		case MRT_DONE:
278		case MRT_ADD_VIF:
279		case MRT_DEL_VIF:
280		case MRT_ADD_MFC:
281		case MRT_DEL_MFC:
282		case MRT_VERSION:
283		case MRT_ASSERT:
284			error = ip_mrouter_get(so, sopt);
285			break;
286
287		default:
288			error = ip_ctloutput(so, sopt);
289			break;
290		}
291		break;
292
293	case SOPT_SET:
294		switch (sopt->sopt_name) {
295		case IP_HDRINCL:
296			error = sooptcopyin(sopt, &optval, sizeof optval,
297					    sizeof optval);
298			if (error)
299				break;
300			if (optval)
301				inp->inp_flags |= INP_HDRINCL;
302			else
303				inp->inp_flags &= ~INP_HDRINCL;
304			break;
305
306#ifdef COMPAT_IPFW
307		case IP_FW_ADD:
308		case IP_FW_DEL:
309		case IP_FW_FLUSH:
310		case IP_FW_ZERO:
311			if (ip_fw_ctl_ptr == 0)
312				error = ENOPROTOOPT;
313			else
314				error = ip_fw_ctl_ptr(sopt);
315			break;
316
317		case IP_NAT:
318			if (ip_nat_ctl_ptr == 0)
319				error = ENOPROTOOPT;
320			else
321				error = ip_nat_ctl_ptr(sopt);
322			break;
323#ifdef DUMMYNET
324		case IP_DUMMYNET_CONFIGURE:
325		case IP_DUMMYNET_DEL:
326		case IP_DUMMYNET_FLUSH:
327			if (ip_dn_ctl_ptr == NULL)
328				error = ENOPROTOOPT ;
329			else
330				error = ip_dn_ctl_ptr(sopt);
331			break ;
332#endif
333#endif /* COMPAT_IPFW */
334
335		case IP_RSVP_ON:
336			error = ip_rsvp_init(so);
337			break;
338
339		case IP_RSVP_OFF:
340			error = ip_rsvp_done();
341			break;
342
343			/* XXX - should be combined */
344		case IP_RSVP_VIF_ON:
345			error = ip_rsvp_vif_init(so, sopt);
346			break;
347
348		case IP_RSVP_VIF_OFF:
349			error = ip_rsvp_vif_done(so, sopt);
350			break;
351
352		case MRT_INIT:
353		case MRT_DONE:
354		case MRT_ADD_VIF:
355		case MRT_DEL_VIF:
356		case MRT_ADD_MFC:
357		case MRT_DEL_MFC:
358		case MRT_VERSION:
359		case MRT_ASSERT:
360			error = ip_mrouter_set(so, sopt);
361			break;
362
363		default:
364			error = ip_ctloutput(so, sopt);
365			break;
366		}
367		break;
368	}
369
370	return (error);
371}
372
373/*
374 * This function exists solely to receive the PRC_IFDOWN messages which
375 * are sent by if_down().  It looks for an ifaddr whose ifa_addr is sa,
376 * and calls in_ifadown() to remove all routes corresponding to that address.
377 * It also receives the PRC_IFUP messages from if_up() and reinstalls the
378 * interface routes.
379 */
380void
381rip_ctlinput(cmd, sa, vip)
382	int cmd;
383	struct sockaddr *sa;
384	void *vip;
385{
386	struct in_ifaddr *ia;
387	struct ifnet *ifp;
388	int err;
389	int flags;
390
391	switch (cmd) {
392	case PRC_IFDOWN:
393		for (ia = in_ifaddrhead.tqh_first; ia;
394		     ia = ia->ia_link.tqe_next) {
395			if (ia->ia_ifa.ifa_addr == sa
396			    && (ia->ia_flags & IFA_ROUTE)) {
397				/*
398				 * in_ifscrub kills the interface route.
399				 */
400				in_ifscrub(ia->ia_ifp, ia);
401				/*
402				 * in_ifadown gets rid of all the rest of
403				 * the routes.  This is not quite the right
404				 * thing to do, but at least if we are running
405				 * a routing process they will come back.
406				 */
407				in_ifadown(&ia->ia_ifa);
408				break;
409			}
410		}
411		break;
412
413	case PRC_IFUP:
414		for (ia = in_ifaddrhead.tqh_first; ia;
415		     ia = ia->ia_link.tqe_next) {
416			if (ia->ia_ifa.ifa_addr == sa)
417				break;
418		}
419		if (ia == 0 || (ia->ia_flags & IFA_ROUTE))
420			return;
421		flags = RTF_UP;
422		ifp = ia->ia_ifa.ifa_ifp;
423
424		if ((ifp->if_flags & IFF_LOOPBACK)
425		    || (ifp->if_flags & IFF_POINTOPOINT))
426			flags |= RTF_HOST;
427
428		err = rtinit(&ia->ia_ifa, RTM_ADD, flags);
429		if (err == 0)
430			ia->ia_flags |= IFA_ROUTE;
431		break;
432	}
433}
434
435static u_long	rip_sendspace = RIPSNDQ;
436static u_long	rip_recvspace = RIPRCVQ;
437
438SYSCTL_INT(_net_inet_raw, OID_AUTO, maxdgram, CTLFLAG_RW, &rip_sendspace,
439	   0, "");
440SYSCTL_INT(_net_inet_raw, OID_AUTO, recvspace, CTLFLAG_RW, &rip_recvspace,
441	   0, "");
442
443static int
444rip_attach(struct socket *so, int proto, struct proc *p)
445{
446	struct inpcb *inp;
447	int error, s;
448
449	inp = sotoinpcb(so);
450	if (inp)
451		panic("rip_attach");
452	if (p && (error = suser(p->p_ucred, &p->p_acflag)) != 0)
453		return error;
454
455	s = splnet();
456	error = in_pcballoc(so, &ripcbinfo, p);
457	splx(s);
458	if (error)
459		return error;
460	error = soreserve(so, rip_sendspace, rip_recvspace);
461	if (error)
462		return error;
463	inp = (struct inpcb *)so->so_pcb;
464	inp->inp_ip_p = proto;
465	return 0;
466}
467
468static int
469rip_detach(struct socket *so)
470{
471	struct inpcb *inp;
472
473	inp = sotoinpcb(so);
474	if (inp == 0)
475		panic("rip_detach");
476	if (so == ip_mrouter)
477		ip_mrouter_done();
478	ip_rsvp_force_done(so);
479	if (so == ip_rsvpd)
480		ip_rsvp_done();
481	in_pcbdetach(inp);
482	return 0;
483}
484
/*
 * Abort a raw IP socket: mark it disconnected, then detach its PCB.
 * Returns the result of rip_detach().
 */
static int
rip_abort(struct socket *so)
{
	int rv;

	soisdisconnected(so);
	rv = rip_detach(so);
	return (rv);
}
491
492static int
493rip_disconnect(struct socket *so)
494{
495	if ((so->so_state & SS_ISCONNECTED) == 0)
496		return ENOTCONN;
497	return rip_abort(so);
498}
499
500static int
501rip_bind(struct socket *so, struct sockaddr *nam, struct proc *p)
502{
503	struct inpcb *inp = sotoinpcb(so);
504	struct sockaddr_in *addr = (struct sockaddr_in *)nam;
505
506	if (nam->sa_len != sizeof(*addr))
507		return EINVAL;
508
509	if (TAILQ_EMPTY(&ifnet) || ((addr->sin_family != AF_INET) &&
510				    (addr->sin_family != AF_IMPLINK)) ||
511	    (addr->sin_addr.s_addr &&
512	     ifa_ifwithaddr((struct sockaddr *)addr) == 0))
513		return EADDRNOTAVAIL;
514	inp->inp_laddr = addr->sin_addr;
515	return 0;
516}
517
518static int
519rip_connect(struct socket *so, struct sockaddr *nam, struct proc *p)
520{
521	struct inpcb *inp = sotoinpcb(so);
522	struct sockaddr_in *addr = (struct sockaddr_in *)nam;
523
524	if (nam->sa_len != sizeof(*addr))
525		return EINVAL;
526	if (TAILQ_EMPTY(&ifnet))
527		return EADDRNOTAVAIL;
528	if ((addr->sin_family != AF_INET) &&
529	    (addr->sin_family != AF_IMPLINK))
530		return EAFNOSUPPORT;
531	inp->inp_faddr = addr->sin_addr;
532	soisconnected(so);
533	return 0;
534}
535
/*
 * Shut down the sending side of a raw IP socket by calling
 * socantsendmore().  Always returns 0.
 */
static int
rip_shutdown(struct socket *so)
{

	socantsendmore(so);
	return (0);
}
542
543static int
544rip_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
545	 struct mbuf *control, struct proc *p)
546{
547	struct inpcb *inp = sotoinpcb(so);
548	register u_long dst;
549
550	if (so->so_state & SS_ISCONNECTED) {
551		if (nam) {
552			m_freem(m);
553			return EISCONN;
554		}
555		dst = inp->inp_faddr.s_addr;
556	} else {
557		if (nam == NULL) {
558			m_freem(m);
559			return ENOTCONN;
560		}
561		dst = ((struct sockaddr_in *)nam)->sin_addr.s_addr;
562	}
563	return rip_output(m, so, dst);
564}
565
566static int
567rip_pcblist SYSCTL_HANDLER_ARGS
568{
569	int error, i, n, s;
570	struct inpcb *inp, **inp_list;
571	inp_gen_t gencnt;
572	struct xinpgen xig;
573
574	/*
575	 * The process of preparing the TCB list is too time-consuming and
576	 * resource-intensive to repeat twice on every request.
577	 */
578	if (req->oldptr == 0) {
579		n = ripcbinfo.ipi_count;
580		req->oldidx = 2 * (sizeof xig)
581			+ (n + n/8) * sizeof(struct xinpcb);
582		return 0;
583	}
584
585	if (req->newptr != 0)
586		return EPERM;
587
588	/*
589	 * OK, now we're committed to doing something.
590	 */
591	s = splnet();
592	gencnt = ripcbinfo.ipi_gencnt;
593	n = ripcbinfo.ipi_count;
594	splx(s);
595
596	xig.xig_len = sizeof xig;
597	xig.xig_count = n;
598	xig.xig_gen = gencnt;
599	xig.xig_sogen = so_gencnt;
600	error = SYSCTL_OUT(req, &xig, sizeof xig);
601	if (error)
602		return error;
603
604	inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK);
605	if (inp_list == 0)
606		return ENOMEM;
607
608	s = splnet();
609	for (inp = ripcbinfo.listhead->lh_first, i = 0; inp && i < n;
610	     inp = inp->inp_list.le_next) {
611		if (inp->inp_gencnt <= gencnt)
612			inp_list[i++] = inp;
613	}
614	splx(s);
615	n = i;
616
617	error = 0;
618	for (i = 0; i < n; i++) {
619		inp = inp_list[i];
620		if (inp->inp_gencnt <= gencnt) {
621			struct xinpcb xi;
622			xi.xi_len = sizeof xi;
623			/* XXX should avoid extra copy */
624			bcopy(inp, &xi.xi_inp, sizeof *inp);
625			if (inp->inp_socket)
626				sotoxsocket(inp->inp_socket, &xi.xi_socket);
627			error = SYSCTL_OUT(req, &xi, sizeof xi);
628		}
629	}
630	if (!error) {
631		/*
632		 * Give the user an updated idea of our state.
633		 * If the generation differs from what we told
634		 * her before, she knows that something happened
635		 * while we were processing this request, and it
636		 * might be necessary to retry.
637		 */
638		s = splnet();
639		xig.xig_gen = ripcbinfo.ipi_gencnt;
640		xig.xig_sogen = so_gencnt;
641		xig.xig_count = ripcbinfo.ipi_count;
642		splx(s);
643		error = SYSCTL_OUT(req, &xig, sizeof xig);
644	}
645	free(inp_list, M_TEMP);
646	return error;
647}
648
649SYSCTL_PROC(_net_inet_raw, OID_AUTO/*XXX*/, pcblist, CTLFLAG_RD, 0, 0,
650	    rip_pcblist, "S,xinpcb", "List of active raw IP sockets");
651
652struct pr_usrreqs rip_usrreqs = {
653	rip_abort, pru_accept_notsupp, rip_attach, rip_bind, rip_connect,
654	pru_connect2_notsupp, in_control, rip_detach, rip_disconnect,
655	pru_listen_notsupp, in_setpeeraddr, pru_rcvd_notsupp,
656	pru_rcvoob_notsupp, rip_send, pru_sense_null, rip_shutdown,
657	in_setsockaddr, sosend, soreceive, sopoll
658};
659