raw_ip.c revision 92976
1/*
2 * Copyright (c) 1982, 1986, 1988, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 * 3. All advertising materials mentioning features or use of this software
14 *    must display the following acknowledgement:
15 *	This product includes software developed by the University of
16 *	California, Berkeley and its contributors.
17 * 4. Neither the name of the University nor the names of its contributors
18 *    may be used to endorse or promote products derived from this software
19 *    without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31 * SUCH DAMAGE.
32 *
33 *	@(#)raw_ip.c	8.7 (Berkeley) 5/15/95
34 * $FreeBSD: head/sys/netinet/raw_ip.c 92976 2002-03-22 19:57:41Z rwatson $
35 */
36
37#include "opt_inet6.h"
38#include "opt_ipsec.h"
39#include "opt_random_ip_id.h"
40
41#include <sys/param.h>
42#include <sys/systm.h>
43#include <sys/kernel.h>
44#include <sys/malloc.h>
45#include <sys/mbuf.h>
46#include <sys/proc.h>
47#include <sys/protosw.h>
48#include <sys/socket.h>
49#include <sys/socketvar.h>
50#include <sys/sysctl.h>
51
52#include <vm/uma.h>
53
54#include <net/if.h>
55#include <net/route.h>
56
57#define _IP_VHL
58#include <netinet/in.h>
59#include <netinet/in_systm.h>
60#include <netinet/ip.h>
61#include <netinet/in_pcb.h>
62#include <netinet/in_var.h>
63#include <netinet/ip_var.h>
64#include <netinet/ip_mroute.h>
65
66#include <netinet/ip_fw.h>
67#include <netinet/ip_dummynet.h>
68
69#ifdef IPSEC
70#include <netinet6/ipsec.h>
71#endif /*IPSEC*/
72
/*
 * Raw IP protocol control block state.  ripcb is the list that
 * rip_input() walks to find receiving sockets; ripcbinfo is the
 * bookkeeping structure initialised by rip_init() and handed to
 * in_pcballoc() when a raw socket is attached.
 */
struct	inpcbhead ripcb;
struct	inpcbinfo ripcbinfo;

/*
 * control hooks for ipfw and dummynet
 * rip_ctloutput() calls through these only when IPFW_LOADED /
 * DUMMYNET_LOADED says the corresponding subsystem is present.
 */
ip_fw_ctl_t *ip_fw_ctl_ptr;
ip_dn_ctl_t *ip_dn_ctl_ptr;

/*
 * Nominal space allocated to a raw ip socket.
 * These are the initial values of rip_sendspace and rip_recvspace,
 * which rip_attach() passes to soreserve().
 */
#define	RIPSNDQ		8192
#define	RIPRCVQ		8192
85
86/*
87 * Raw interface to IP protocol.
88 */
89
90/*
91 * Initialize raw connection block q.
92 */
93void
94rip_init()
95{
96	LIST_INIT(&ripcb);
97	ripcbinfo.listhead = &ripcb;
98	/*
99	 * XXX We don't use the hash list for raw IP, but it's easier
100	 * to allocate a one entry hash list than it is to check all
101	 * over the place for hashbase == NULL.
102	 */
103	ripcbinfo.hashbase = hashinit(1, M_PCB, &ripcbinfo.hashmask);
104	ripcbinfo.porthashbase = hashinit(1, M_PCB, &ripcbinfo.porthashmask);
105	ripcbinfo.ipi_zone = uma_zcreate("ripcb", sizeof(struct inpcb),
106	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
107	uma_zone_set_max(ripcbinfo.ipi_zone, maxsockets);
108}
109
110static struct	sockaddr_in ripsrc = { sizeof(ripsrc), AF_INET };
111/*
112 * Setup generic address and protocol structures
113 * for raw_input routine, then pass them along with
114 * mbuf chain.
115 */
116void
117rip_input(m, off)
118	struct mbuf *m;
119	int off;
120{
121	register struct ip *ip = mtod(m, struct ip *);
122	register struct inpcb *inp;
123	struct inpcb *last = 0;
124	struct mbuf *opts = 0;
125	int proto = ip->ip_p;
126
127	ripsrc.sin_addr = ip->ip_src;
128	LIST_FOREACH(inp, &ripcb, inp_list) {
129#ifdef INET6
130		if ((inp->inp_vflag & INP_IPV4) == 0)
131			continue;
132#endif
133		if (inp->inp_ip_p && inp->inp_ip_p != proto)
134			continue;
135		if (inp->inp_laddr.s_addr &&
136                  inp->inp_laddr.s_addr != ip->ip_dst.s_addr)
137			continue;
138		if (inp->inp_faddr.s_addr &&
139                  inp->inp_faddr.s_addr != ip->ip_src.s_addr)
140			continue;
141		if (last) {
142			struct mbuf *n = m_copy(m, 0, (int)M_COPYALL);
143
144#ifdef IPSEC
145			/* check AH/ESP integrity. */
146			if (n && ipsec4_in_reject_so(n, last->inp_socket)) {
147				m_freem(n);
148				ipsecstat.in_polvio++;
149				/* do not inject data to pcb */
150			} else
151#endif /*IPSEC*/
152			if (n) {
153				if (last->inp_flags & INP_CONTROLOPTS ||
154				    last->inp_socket->so_options & SO_TIMESTAMP)
155				    ip_savecontrol(last, &opts, ip, n);
156				if (sbappendaddr(&last->inp_socket->so_rcv,
157				    (struct sockaddr *)&ripsrc, n,
158				    opts) == 0) {
159					/* should notify about lost packet */
160					m_freem(n);
161					if (opts)
162					    m_freem(opts);
163				} else
164					sorwakeup(last->inp_socket);
165				opts = 0;
166			}
167		}
168		last = inp;
169	}
170#ifdef IPSEC
171	/* check AH/ESP integrity. */
172	if (last && ipsec4_in_reject_so(m, last->inp_socket)) {
173		m_freem(m);
174		ipsecstat.in_polvio++;
175		ipstat.ips_delivered--;
176		/* do not inject data to pcb */
177	} else
178#endif /*IPSEC*/
179	if (last) {
180		if (last->inp_flags & INP_CONTROLOPTS ||
181		    last->inp_socket->so_options & SO_TIMESTAMP)
182			ip_savecontrol(last, &opts, ip, m);
183		if (sbappendaddr(&last->inp_socket->so_rcv,
184		    (struct sockaddr *)&ripsrc, m, opts) == 0) {
185			m_freem(m);
186			if (opts)
187			    m_freem(opts);
188		} else
189			sorwakeup(last->inp_socket);
190	} else {
191		m_freem(m);
192		ipstat.ips_noproto++;
193		ipstat.ips_delivered--;
194	}
195}
196
197/*
198 * Generate IP header and pass packet to ip_output.
199 * Tack on options user may have setup with control call.
200 */
201int
202rip_output(m, so, dst)
203	struct mbuf *m;
204	struct socket *so;
205	u_long dst;
206{
207	register struct ip *ip;
208	register struct inpcb *inp = sotoinpcb(so);
209	int flags = (so->so_options & SO_DONTROUTE) | IP_ALLOWBROADCAST;
210
211	/*
212	 * If the user handed us a complete IP packet, use it.
213	 * Otherwise, allocate an mbuf for a header and fill it in.
214	 */
215	if ((inp->inp_flags & INP_HDRINCL) == 0) {
216		if (m->m_pkthdr.len + sizeof(struct ip) > IP_MAXPACKET) {
217			m_freem(m);
218			return(EMSGSIZE);
219		}
220		M_PREPEND(m, sizeof(struct ip), M_TRYWAIT);
221		ip = mtod(m, struct ip *);
222		ip->ip_tos = inp->inp_ip_tos;
223		ip->ip_off = 0;
224		ip->ip_p = inp->inp_ip_p;
225		ip->ip_len = m->m_pkthdr.len;
226		ip->ip_src = inp->inp_laddr;
227		ip->ip_dst.s_addr = dst;
228		ip->ip_ttl = inp->inp_ip_ttl;
229	} else {
230		if (m->m_pkthdr.len > IP_MAXPACKET) {
231			m_freem(m);
232			return(EMSGSIZE);
233		}
234		ip = mtod(m, struct ip *);
235		/* don't allow both user specified and setsockopt options,
236		   and don't allow packet length sizes that will crash */
237		if (((IP_VHL_HL(ip->ip_vhl) != (sizeof (*ip) >> 2))
238		     && inp->inp_options)
239		    || (ip->ip_len > m->m_pkthdr.len)
240		    || (ip->ip_len < (IP_VHL_HL(ip->ip_vhl) << 2))) {
241			m_freem(m);
242			return EINVAL;
243		}
244		if (ip->ip_id == 0)
245#ifdef RANDOM_IP_ID
246			ip->ip_id = ip_randomid();
247#else
248			ip->ip_id = htons(ip_id++);
249#endif
250		/* XXX prevent ip_output from overwriting header fields */
251		flags |= IP_RAWOUTPUT;
252		ipstat.ips_rawout++;
253	}
254
255#ifdef IPSEC
256	if (ipsec_setsocket(m, so) != 0) {
257		m_freem(m);
258		return ENOBUFS;
259	}
260#endif /*IPSEC*/
261
262	return (ip_output(m, inp->inp_options, &inp->inp_route, flags,
263			  inp->inp_moptions));
264}
265
266/*
267 * Raw IP socket option processing.
268 */
269int
270rip_ctloutput(so, sopt)
271	struct socket *so;
272	struct sockopt *sopt;
273{
274	struct	inpcb *inp = sotoinpcb(so);
275	int	error, optval;
276
277	if (sopt->sopt_level != IPPROTO_IP)
278		return (EINVAL);
279
280	error = 0;
281
282	switch (sopt->sopt_dir) {
283	case SOPT_GET:
284		switch (sopt->sopt_name) {
285		case IP_HDRINCL:
286			optval = inp->inp_flags & INP_HDRINCL;
287			error = sooptcopyout(sopt, &optval, sizeof optval);
288			break;
289
290		case IP_FW_ADD:	/* ADD actually returns the body... */
291		case IP_FW_GET:
292			if (IPFW_LOADED)
293				error = ip_fw_ctl_ptr(sopt);
294			else
295				error = ENOPROTOOPT;
296			break;
297
298		case IP_DUMMYNET_GET:
299			if (DUMMYNET_LOADED)
300				error = ip_dn_ctl_ptr(sopt);
301			else
302				error = ENOPROTOOPT;
303			break ;
304
305		case MRT_INIT:
306		case MRT_DONE:
307		case MRT_ADD_VIF:
308		case MRT_DEL_VIF:
309		case MRT_ADD_MFC:
310		case MRT_DEL_MFC:
311		case MRT_VERSION:
312		case MRT_ASSERT:
313			error = ip_mrouter_get(so, sopt);
314			break;
315
316		default:
317			error = ip_ctloutput(so, sopt);
318			break;
319		}
320		break;
321
322	case SOPT_SET:
323		switch (sopt->sopt_name) {
324		case IP_HDRINCL:
325			error = sooptcopyin(sopt, &optval, sizeof optval,
326					    sizeof optval);
327			if (error)
328				break;
329			if (optval)
330				inp->inp_flags |= INP_HDRINCL;
331			else
332				inp->inp_flags &= ~INP_HDRINCL;
333			break;
334
335		case IP_FW_ADD:
336		case IP_FW_DEL:
337		case IP_FW_FLUSH:
338		case IP_FW_ZERO:
339		case IP_FW_RESETLOG:
340			if (IPFW_LOADED)
341				error = ip_fw_ctl_ptr(sopt);
342			else
343				error = ENOPROTOOPT;
344			break;
345
346		case IP_DUMMYNET_CONFIGURE:
347		case IP_DUMMYNET_DEL:
348		case IP_DUMMYNET_FLUSH:
349			if (DUMMYNET_LOADED)
350				error = ip_dn_ctl_ptr(sopt);
351			else
352				error = ENOPROTOOPT ;
353			break ;
354
355		case IP_RSVP_ON:
356			error = ip_rsvp_init(so);
357			break;
358
359		case IP_RSVP_OFF:
360			error = ip_rsvp_done();
361			break;
362
363			/* XXX - should be combined */
364		case IP_RSVP_VIF_ON:
365			error = ip_rsvp_vif_init(so, sopt);
366			break;
367
368		case IP_RSVP_VIF_OFF:
369			error = ip_rsvp_vif_done(so, sopt);
370			break;
371
372		case MRT_INIT:
373		case MRT_DONE:
374		case MRT_ADD_VIF:
375		case MRT_DEL_VIF:
376		case MRT_ADD_MFC:
377		case MRT_DEL_MFC:
378		case MRT_VERSION:
379		case MRT_ASSERT:
380			error = ip_mrouter_set(so, sopt);
381			break;
382
383		default:
384			error = ip_ctloutput(so, sopt);
385			break;
386		}
387		break;
388	}
389
390	return (error);
391}
392
393/*
394 * This function exists solely to receive the PRC_IFDOWN messages which
395 * are sent by if_down().  It looks for an ifaddr whose ifa_addr is sa,
396 * and calls in_ifadown() to remove all routes corresponding to that address.
397 * It also receives the PRC_IFUP messages from if_up() and reinstalls the
398 * interface routes.
399 */
400void
401rip_ctlinput(cmd, sa, vip)
402	int cmd;
403	struct sockaddr *sa;
404	void *vip;
405{
406	struct in_ifaddr *ia;
407	struct ifnet *ifp;
408	int err;
409	int flags;
410
411	switch (cmd) {
412	case PRC_IFDOWN:
413		TAILQ_FOREACH(ia, &in_ifaddrhead, ia_link) {
414			if (ia->ia_ifa.ifa_addr == sa
415			    && (ia->ia_flags & IFA_ROUTE)) {
416				/*
417				 * in_ifscrub kills the interface route.
418				 */
419				in_ifscrub(ia->ia_ifp, ia);
420				/*
421				 * in_ifadown gets rid of all the rest of
422				 * the routes.  This is not quite the right
423				 * thing to do, but at least if we are running
424				 * a routing process they will come back.
425				 */
426				in_ifadown(&ia->ia_ifa, 0);
427				break;
428			}
429		}
430		break;
431
432	case PRC_IFUP:
433		TAILQ_FOREACH(ia, &in_ifaddrhead, ia_link) {
434			if (ia->ia_ifa.ifa_addr == sa)
435				break;
436		}
437		if (ia == 0 || (ia->ia_flags & IFA_ROUTE))
438			return;
439		flags = RTF_UP;
440		ifp = ia->ia_ifa.ifa_ifp;
441
442		if ((ifp->if_flags & IFF_LOOPBACK)
443		    || (ifp->if_flags & IFF_POINTOPOINT))
444			flags |= RTF_HOST;
445
446		err = rtinit(&ia->ia_ifa, RTM_ADD, flags);
447		if (err == 0)
448			ia->ia_flags |= IFA_ROUTE;
449		break;
450	}
451}
452
453u_long	rip_sendspace = RIPSNDQ;
454u_long	rip_recvspace = RIPRCVQ;
455
456SYSCTL_INT(_net_inet_raw, OID_AUTO, maxdgram, CTLFLAG_RW,
457    &rip_sendspace, 0, "Maximum outgoing raw IP datagram size");
458SYSCTL_INT(_net_inet_raw, OID_AUTO, recvspace, CTLFLAG_RW,
459    &rip_recvspace, 0, "Maximum incoming raw IP datagram size");
460
461static int
462rip_attach(struct socket *so, int proto, struct thread *td)
463{
464	struct inpcb *inp;
465	int error, s;
466
467	inp = sotoinpcb(so);
468	if (inp)
469		panic("rip_attach");
470	if (td && (error = suser_td(td)) != 0)
471		return error;
472
473	error = soreserve(so, rip_sendspace, rip_recvspace);
474	if (error)
475		return error;
476	s = splnet();
477	error = in_pcballoc(so, &ripcbinfo, td);
478	splx(s);
479	if (error)
480		return error;
481	inp = (struct inpcb *)so->so_pcb;
482	inp->inp_vflag |= INP_IPV4;
483	inp->inp_ip_p = proto;
484	inp->inp_ip_ttl = ip_defttl;
485	return 0;
486}
487
488static int
489rip_detach(struct socket *so)
490{
491	struct inpcb *inp;
492
493	inp = sotoinpcb(so);
494	if (inp == 0)
495		panic("rip_detach");
496	if (so == ip_mrouter)
497		ip_mrouter_done();
498	ip_rsvp_force_done(so);
499	if (so == ip_rsvpd)
500		ip_rsvp_done();
501	in_pcbdetach(inp);
502	return 0;
503}
504
/*
 * Abort a raw IP socket: mark it disconnected, then detach its PCB.
 * Returns whatever rip_detach() returns.
 */
static int
rip_abort(struct socket *so)
{
	int error;

	soisdisconnected(so);
	error = rip_detach(so);
	return (error);
}
511
512static int
513rip_disconnect(struct socket *so)
514{
515	if ((so->so_state & SS_ISCONNECTED) == 0)
516		return ENOTCONN;
517	return rip_abort(so);
518}
519
520static int
521rip_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
522{
523	struct inpcb *inp = sotoinpcb(so);
524	struct sockaddr_in *addr = (struct sockaddr_in *)nam;
525
526	if (nam->sa_len != sizeof(*addr))
527		return EINVAL;
528
529	if (TAILQ_EMPTY(&ifnet) || ((addr->sin_family != AF_INET) &&
530				    (addr->sin_family != AF_IMPLINK)) ||
531	    (addr->sin_addr.s_addr &&
532	     ifa_ifwithaddr((struct sockaddr *)addr) == 0))
533		return EADDRNOTAVAIL;
534	inp->inp_laddr = addr->sin_addr;
535	return 0;
536}
537
538static int
539rip_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
540{
541	struct inpcb *inp = sotoinpcb(so);
542	struct sockaddr_in *addr = (struct sockaddr_in *)nam;
543
544	if (nam->sa_len != sizeof(*addr))
545		return EINVAL;
546	if (TAILQ_EMPTY(&ifnet))
547		return EADDRNOTAVAIL;
548	if ((addr->sin_family != AF_INET) &&
549	    (addr->sin_family != AF_IMPLINK))
550		return EAFNOSUPPORT;
551	inp->inp_faddr = addr->sin_addr;
552	soisconnected(so);
553	return 0;
554}
555
/*
 * Shut down the sending side of a raw IP socket by calling
 * socantsendmore().  Always returns 0.
 */
static int
rip_shutdown(struct socket *so)
{

	socantsendmore(so);
	return (0);
}
562
563static int
564rip_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
565	 struct mbuf *control, struct thread *td)
566{
567	struct inpcb *inp = sotoinpcb(so);
568	register u_long dst;
569
570	if (so->so_state & SS_ISCONNECTED) {
571		if (nam) {
572			m_freem(m);
573			return EISCONN;
574		}
575		dst = inp->inp_faddr.s_addr;
576	} else {
577		if (nam == NULL) {
578			m_freem(m);
579			return ENOTCONN;
580		}
581		dst = ((struct sockaddr_in *)nam)->sin_addr.s_addr;
582	}
583	return rip_output(m, so, dst);
584}
585
586static int
587rip_pcblist(SYSCTL_HANDLER_ARGS)
588{
589	int error, i, n, s;
590	struct inpcb *inp, **inp_list;
591	inp_gen_t gencnt;
592	struct xinpgen xig;
593
594	/*
595	 * The process of preparing the TCB list is too time-consuming and
596	 * resource-intensive to repeat twice on every request.
597	 */
598	if (req->oldptr == 0) {
599		n = ripcbinfo.ipi_count;
600		req->oldidx = 2 * (sizeof xig)
601			+ (n + n/8) * sizeof(struct xinpcb);
602		return 0;
603	}
604
605	if (req->newptr != 0)
606		return EPERM;
607
608	/*
609	 * OK, now we're committed to doing something.
610	 */
611	s = splnet();
612	gencnt = ripcbinfo.ipi_gencnt;
613	n = ripcbinfo.ipi_count;
614	splx(s);
615
616	xig.xig_len = sizeof xig;
617	xig.xig_count = n;
618	xig.xig_gen = gencnt;
619	xig.xig_sogen = so_gencnt;
620	error = SYSCTL_OUT(req, &xig, sizeof xig);
621	if (error)
622		return error;
623
624	inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK);
625	if (inp_list == 0)
626		return ENOMEM;
627
628	s = splnet();
629	for (inp = LIST_FIRST(ripcbinfo.listhead), i = 0; inp && i < n;
630	     inp = LIST_NEXT(inp, inp_list)) {
631		if (inp->inp_gencnt <= gencnt) {
632			if (cr_canseesocket(req->td->td_ucred,
633			    inp->inp_socket))
634				continue;
635			inp_list[i++] = inp;
636		}
637	}
638	splx(s);
639	n = i;
640
641	error = 0;
642	for (i = 0; i < n; i++) {
643		inp = inp_list[i];
644		if (inp->inp_gencnt <= gencnt) {
645			struct xinpcb xi;
646			xi.xi_len = sizeof xi;
647			/* XXX should avoid extra copy */
648			bcopy(inp, &xi.xi_inp, sizeof *inp);
649			if (inp->inp_socket)
650				sotoxsocket(inp->inp_socket, &xi.xi_socket);
651			error = SYSCTL_OUT(req, &xi, sizeof xi);
652		}
653	}
654	if (!error) {
655		/*
656		 * Give the user an updated idea of our state.
657		 * If the generation differs from what we told
658		 * her before, she knows that something happened
659		 * while we were processing this request, and it
660		 * might be necessary to retry.
661		 */
662		s = splnet();
663		xig.xig_gen = ripcbinfo.ipi_gencnt;
664		xig.xig_sogen = so_gencnt;
665		xig.xig_count = ripcbinfo.ipi_count;
666		splx(s);
667		error = SYSCTL_OUT(req, &xig, sizeof xig);
668	}
669	free(inp_list, M_TEMP);
670	return error;
671}
672
673SYSCTL_PROC(_net_inet_raw, OID_AUTO/*XXX*/, pcblist, CTLFLAG_RD, 0, 0,
674	    rip_pcblist, "S,xinpcb", "List of active raw IP sockets");
675
/*
 * User-request dispatch table for raw IP sockets.
 *
 * This is a positional initializer, so the entries must stay in the
 * member order of struct pr_usrreqs (declared outside this file).
 * The rip_* entries are the handlers defined above; the remaining
 * entries come from elsewhere.
 * NOTE(review): the pru_*_notsupp entries presumably reject requests
 * that make no sense for a raw socket - confirm against their
 * definitions.
 */
struct pr_usrreqs rip_usrreqs = {
	rip_abort, pru_accept_notsupp, rip_attach, rip_bind, rip_connect,
	pru_connect2_notsupp, in_control, rip_detach, rip_disconnect,
	pru_listen_notsupp, in_setpeeraddr, pru_rcvd_notsupp,
	pru_rcvoob_notsupp, rip_send, pru_sense_null, rip_shutdown,
	in_setsockaddr, sosend, soreceive, sopoll
};
683