raw_ip.c revision 221131
1/*-
2 * Copyright (c) 1982, 1986, 1988, 1993
3 *	The Regents of the University of California.
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 *    notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 * 4. Neither the name of the University nor the names of its contributors
15 *    may be used to endorse or promote products derived from this software
16 *    without specific prior written permission.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28 * SUCH DAMAGE.
29 *
30 *	@(#)raw_ip.c	8.7 (Berkeley) 5/15/95
31 */
32
33#include <sys/cdefs.h>
34__FBSDID("$FreeBSD: head/sys/netinet/raw_ip.c 221131 2011-04-27 19:32:27Z bz $");
35
36#include "opt_inet.h"
37#include "opt_inet6.h"
38#include "opt_ipsec.h"
39
40#include <sys/param.h>
41#include <sys/jail.h>
42#include <sys/kernel.h>
43#include <sys/lock.h>
44#include <sys/malloc.h>
45#include <sys/mbuf.h>
46#include <sys/priv.h>
47#include <sys/proc.h>
48#include <sys/protosw.h>
49#include <sys/rwlock.h>
50#include <sys/signalvar.h>
51#include <sys/socket.h>
52#include <sys/socketvar.h>
53#include <sys/sx.h>
54#include <sys/sysctl.h>
55#include <sys/systm.h>
56
57#include <vm/uma.h>
58
59#include <net/if.h>
60#include <net/route.h>
61#include <net/vnet.h>
62
63#include <netinet/in.h>
64#include <netinet/in_systm.h>
65#include <netinet/in_pcb.h>
66#include <netinet/in_var.h>
67#include <netinet/ip.h>
68#include <netinet/ip_var.h>
69#include <netinet/ip_mroute.h>
70
71#ifdef IPSEC
72#include <netipsec/ipsec.h>
73#endif /*IPSEC*/
74
75#include <security/mac/mac_framework.h>
76
/*
 * Per-VNET default TTL, initialised to IPDEFTTL and exported read-write
 * through the "ttl" node under _net_inet_ip.  rip_attach() copies
 * V_ip_defttl into every newly created raw PCB.
 */
VNET_DEFINE(int, ip_defttl) = IPDEFTTL;
SYSCTL_VNET_INT(_net_inet_ip, IPCTL_DEFTTL, ttl, CTLFLAG_RW,
    &VNET_NAME(ip_defttl), 0,
    "Maximum TTL on IP packets");

/*
 * Per-VNET raw PCB list head and PCB bookkeeping structure; both are handed
 * to in_pcbinfo_init() by rip_init().
 */
VNET_DEFINE(struct inpcbhead, ripcb);
VNET_DEFINE(struct inpcbinfo, ripcbinfo);

/* Shorthand accessors for the current VNET's instances. */
#define	V_ripcb			VNET(ripcb)
#define	V_ripcbinfo		VNET(ripcbinfo)
87
/*
 * Control and data hooks for ipfw, dummynet, divert and so on.
 * The data hooks are not used here but it is convenient
 * to keep them all in one place.
 *
 * Within this file only the two control hooks are called, both from
 * rip_ctloutput(): V_ip_fw_ctl_ptr and ip_dn_ctl_ptr.  Each is checked
 * against NULL before use.
 */
/* Per-VNET ipfw hooks, explicitly initialised to NULL. */
VNET_DEFINE(ip_fw_chk_ptr_t, ip_fw_chk_ptr) = NULL;
VNET_DEFINE(ip_fw_ctl_ptr_t, ip_fw_ctl_ptr) = NULL;

/* Global (not per-VNET) hooks with no explicit initialiser. */
int	(*ip_dn_ctl_ptr)(struct sockopt *);
int	(*ip_dn_io_ptr)(struct mbuf **, int, struct ip_fw_args *);
void	(*ip_divert_ptr)(struct mbuf *, int);
int	(*ng_ipfw_input_p)(struct mbuf **, int,
			struct ip_fw_args *, int);
101
#ifdef INET
/*
 * Hooks for multicast routing. They all default to NULL, so leave them not
 * initialized and rely on BSS being set to 0.
 */

/*
 * The socket used to communicate with the multicast routing daemon.
 * rip_detach() compares the closing socket against this (via V_ip_mrouter)
 * to decide whether to call ip_mrouter_done().
 */
VNET_DEFINE(struct socket *, ip_mrouter);

/*
 * The various mrouter and rsvp functions.
 *
 * Used in this file: ip_mrouter_set / ip_mrouter_get (rip_ctloutput),
 * ip_mrouter_done (rip_detach).  The remaining pointers are only defined
 * here.
 */
int (*ip_mrouter_set)(struct socket *, struct sockopt *);
int (*ip_mrouter_get)(struct socket *, struct sockopt *);
int (*ip_mrouter_done)(void);
int (*ip_mforward)(struct ip *, struct ifnet *, struct mbuf *,
		   struct ip_moptions *);
int (*mrt_ioctl)(u_long, caddr_t, int);
int (*legal_vif_num)(int);
u_long (*ip_mcast_src)(int);

/*
 * RSVP hooks.  ip_rsvp_vif is called from rip_ctloutput() and
 * ip_rsvp_force_done from rip_detach(), each only when non-NULL.
 */
void (*rsvp_input_p)(struct mbuf *m, int off);
int (*ip_rsvp_vif)(struct socket *, struct sockopt *);
void (*ip_rsvp_force_done)(struct socket *);
#endif /* INET */
129
/*
 * Send and receive buffer sizes passed to soreserve() by rip_attach().
 * Both default to 9216 and are writable through the "maxdgram" and
 * "recvspace" nodes under _net_inet_raw.
 */
u_long	rip_sendspace = 9216;
SYSCTL_ULONG(_net_inet_raw, OID_AUTO, maxdgram, CTLFLAG_RW,
    &rip_sendspace, 0, "Maximum outgoing raw IP datagram size");

u_long	rip_recvspace = 9216;
SYSCTL_ULONG(_net_inet_raw, OID_AUTO, recvspace, CTLFLAG_RW,
    &rip_recvspace, 0, "Maximum space for incoming raw IP datagrams");
137
/*
 * Hash functions
 *
 * rip_inshash() places any PCB with an unspecified protocol, local address
 * or foreign address in bucket 0.  INP_PCBHASH_RAW() adds 1 after the modulo
 * so that, for a non-negative sum, a fully specified PCB lands in one of
 * buckets 1..mask and never shares bucket 0 with the wildcard entries.
 * rip_input() relies on this: it scans the computed bucket for exact
 * matches and bucket 0 for wildcard matches.
 *
 * The sum is symmetric in laddr/faddr, so the lookup side may pass the
 * packet's source and destination in either order.
 */

#define INP_PCBHASH_RAW_SIZE	256
#define INP_PCBHASH_RAW(proto, laddr, faddr, mask) \
        (((proto) + (laddr) + (faddr)) % (mask) + 1)
145
146#ifdef INET
147static void
148rip_inshash(struct inpcb *inp)
149{
150	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
151	struct inpcbhead *pcbhash;
152	int hash;
153
154	INP_INFO_WLOCK_ASSERT(pcbinfo);
155	INP_WLOCK_ASSERT(inp);
156
157	if (inp->inp_ip_p != 0 &&
158	    inp->inp_laddr.s_addr != INADDR_ANY &&
159	    inp->inp_faddr.s_addr != INADDR_ANY) {
160		hash = INP_PCBHASH_RAW(inp->inp_ip_p, inp->inp_laddr.s_addr,
161		    inp->inp_faddr.s_addr, pcbinfo->ipi_hashmask);
162	} else
163		hash = 0;
164	pcbhash = &pcbinfo->ipi_hashbase[hash];
165	LIST_INSERT_HEAD(pcbhash, inp, inp_hash);
166}
167
/*
 * Unlink a raw PCB from whichever hash chain it is currently on.
 *
 * The caller must hold both the pcbinfo write lock and the PCB write lock.
 * Callers in this file re-insert with rip_inshash() after changing the
 * PCB's addresses, or free the PCB (rip_detach()).
 */
static void
rip_delhash(struct inpcb *inp)
{

	INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo);
	INP_WLOCK_ASSERT(inp);

	LIST_REMOVE(inp, inp_hash);
}
177#endif /* INET */
178
179/*
180 * Raw interface to IP protocol.
181 */
182
/*
 * Raw PCB zone management and initialization: the maxsockets resize hook,
 * the per-PCB zone init routine, and rip_init() itself.
 */
186static void
187rip_zone_change(void *tag)
188{
189
190	uma_zone_set_max(V_ripcbinfo.ipi_zone, maxsockets);
191}
192
/*
 * Zone init callback passed to in_pcbinfo_init() by rip_init(): sets up the
 * lock embedded in a raw PCB.  The size and flags arguments are not used.
 * Always returns 0.
 */
static int
rip_inpcb_init(void *mem, int size, int flags)
{
	struct inpcb *inp;

	inp = (struct inpcb *)mem;
	INP_LOCK_INIT(inp, "inp", "rawinp");

	return (0);
}
201
202void
203rip_init(void)
204{
205
206	in_pcbinfo_init(&V_ripcbinfo, "rip", &V_ripcb, INP_PCBHASH_RAW_SIZE,
207	    1, "ripcb", rip_inpcb_init, NULL, UMA_ZONE_NOFREE);
208	EVENTHANDLER_REGISTER(maxsockets_change, rip_zone_change, NULL,
209	    EVENTHANDLER_PRI_ANY);
210}
211
212#ifdef VIMAGE
213void
214rip_destroy(void)
215{
216
217	in_pcbinfo_destroy(&V_ripcbinfo);
218}
219#endif
220
221#ifdef INET
222static int
223rip_append(struct inpcb *last, struct ip *ip, struct mbuf *n,
224    struct sockaddr_in *ripsrc)
225{
226	int policyfail = 0;
227
228	INP_RLOCK_ASSERT(last);
229
230#ifdef IPSEC
231	/* check AH/ESP integrity. */
232	if (ipsec4_in_reject(n, last)) {
233		policyfail = 1;
234	}
235#endif /* IPSEC */
236#ifdef MAC
237	if (!policyfail && mac_inpcb_check_deliver(last, n) != 0)
238		policyfail = 1;
239#endif
240	/* Check the minimum TTL for socket. */
241	if (last->inp_ip_minttl && last->inp_ip_minttl > ip->ip_ttl)
242		policyfail = 1;
243	if (!policyfail) {
244		struct mbuf *opts = NULL;
245		struct socket *so;
246
247		so = last->inp_socket;
248		if ((last->inp_flags & INP_CONTROLOPTS) ||
249		    (so->so_options & (SO_TIMESTAMP | SO_BINTIME)))
250			ip_savecontrol(last, &opts, ip, n);
251		SOCKBUF_LOCK(&so->so_rcv);
252		if (sbappendaddr_locked(&so->so_rcv,
253		    (struct sockaddr *)ripsrc, n, opts) == 0) {
254			/* should notify about lost packet */
255			m_freem(n);
256			if (opts)
257				m_freem(opts);
258			SOCKBUF_UNLOCK(&so->so_rcv);
259		} else
260			sorwakeup_locked(so);
261	} else
262		m_freem(n);
263	return (policyfail);
264}
265
266/*
267 * Setup generic address and protocol structures for raw_input routine, then
268 * pass them along with mbuf chain.
269 */
270void
271rip_input(struct mbuf *m, int off)
272{
273	struct ifnet *ifp;
274	struct ip *ip = mtod(m, struct ip *);
275	int proto = ip->ip_p;
276	struct inpcb *inp, *last;
277	struct sockaddr_in ripsrc;
278	int hash;
279
280	bzero(&ripsrc, sizeof(ripsrc));
281	ripsrc.sin_len = sizeof(ripsrc);
282	ripsrc.sin_family = AF_INET;
283	ripsrc.sin_addr = ip->ip_src;
284	last = NULL;
285
286	ifp = m->m_pkthdr.rcvif;
287
288	hash = INP_PCBHASH_RAW(proto, ip->ip_src.s_addr,
289	    ip->ip_dst.s_addr, V_ripcbinfo.ipi_hashmask);
290	INP_INFO_RLOCK(&V_ripcbinfo);
291	LIST_FOREACH(inp, &V_ripcbinfo.ipi_hashbase[hash], inp_hash) {
292		if (inp->inp_ip_p != proto)
293			continue;
294#ifdef INET6
295		/* XXX inp locking */
296		if ((inp->inp_vflag & INP_IPV4) == 0)
297			continue;
298#endif
299		if (inp->inp_laddr.s_addr != ip->ip_dst.s_addr)
300			continue;
301		if (inp->inp_faddr.s_addr != ip->ip_src.s_addr)
302			continue;
303		if (jailed_without_vnet(inp->inp_cred)) {
304			/*
305			 * XXX: If faddr was bound to multicast group,
306			 * jailed raw socket will drop datagram.
307			 */
308			if (prison_check_ip4(inp->inp_cred, &ip->ip_dst) != 0)
309				continue;
310		}
311		if (last != NULL) {
312			struct mbuf *n;
313
314			n = m_copy(m, 0, (int)M_COPYALL);
315			if (n != NULL)
316		    	    (void) rip_append(last, ip, n, &ripsrc);
317			/* XXX count dropped packet */
318			INP_RUNLOCK(last);
319		}
320		INP_RLOCK(inp);
321		last = inp;
322	}
323	LIST_FOREACH(inp, &V_ripcbinfo.ipi_hashbase[0], inp_hash) {
324		if (inp->inp_ip_p && inp->inp_ip_p != proto)
325			continue;
326#ifdef INET6
327		/* XXX inp locking */
328		if ((inp->inp_vflag & INP_IPV4) == 0)
329			continue;
330#endif
331		if (!in_nullhost(inp->inp_laddr) &&
332		    !in_hosteq(inp->inp_laddr, ip->ip_dst))
333			continue;
334		if (!in_nullhost(inp->inp_faddr) &&
335		    !in_hosteq(inp->inp_faddr, ip->ip_src))
336			continue;
337		if (jailed_without_vnet(inp->inp_cred)) {
338			/*
339			 * Allow raw socket in jail to receive multicast;
340			 * assume process had PRIV_NETINET_RAW at attach,
341			 * and fall through into normal filter path if so.
342			 */
343			if (!IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) &&
344			    prison_check_ip4(inp->inp_cred, &ip->ip_dst) != 0)
345				continue;
346		}
347		/*
348		 * If this raw socket has multicast state, and we
349		 * have received a multicast, check if this socket
350		 * should receive it, as multicast filtering is now
351		 * the responsibility of the transport layer.
352		 */
353		if (inp->inp_moptions != NULL &&
354		    IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) {
355			/*
356			 * If the incoming datagram is for IGMP, allow it
357			 * through unconditionally to the raw socket.
358			 *
359			 * In the case of IGMPv2, we may not have explicitly
360			 * joined the group, and may have set IFF_ALLMULTI
361			 * on the interface. imo_multi_filter() may discard
362			 * control traffic we actually need to see.
363			 *
364			 * Userland multicast routing daemons should continue
365			 * filter the control traffic appropriately.
366			 */
367			int blocked;
368
369			blocked = MCAST_PASS;
370			if (proto != IPPROTO_IGMP) {
371				struct sockaddr_in group;
372
373				bzero(&group, sizeof(struct sockaddr_in));
374				group.sin_len = sizeof(struct sockaddr_in);
375				group.sin_family = AF_INET;
376				group.sin_addr = ip->ip_dst;
377
378				blocked = imo_multi_filter(inp->inp_moptions,
379				    ifp,
380				    (struct sockaddr *)&group,
381				    (struct sockaddr *)&ripsrc);
382			}
383
384			if (blocked != MCAST_PASS) {
385				IPSTAT_INC(ips_notmember);
386				continue;
387			}
388		}
389		if (last != NULL) {
390			struct mbuf *n;
391
392			n = m_copy(m, 0, (int)M_COPYALL);
393			if (n != NULL)
394				(void) rip_append(last, ip, n, &ripsrc);
395			/* XXX count dropped packet */
396			INP_RUNLOCK(last);
397		}
398		INP_RLOCK(inp);
399		last = inp;
400	}
401	INP_INFO_RUNLOCK(&V_ripcbinfo);
402	if (last != NULL) {
403		if (rip_append(last, ip, m, &ripsrc) != 0)
404			IPSTAT_INC(ips_delivered);
405		INP_RUNLOCK(last);
406	} else {
407		m_freem(m);
408		IPSTAT_INC(ips_noproto);
409		IPSTAT_DEC(ips_delivered);
410	}
411}
412
413/*
414 * Generate IP header and pass packet to ip_output.  Tack on options user may
415 * have setup with control call.
416 */
417int
418rip_output(struct mbuf *m, struct socket *so, u_long dst)
419{
420	struct ip *ip;
421	int error;
422	struct inpcb *inp = sotoinpcb(so);
423	int flags = ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0) |
424	    IP_ALLOWBROADCAST;
425
426	/*
427	 * If the user handed us a complete IP packet, use it.  Otherwise,
428	 * allocate an mbuf for a header and fill it in.
429	 */
430	if ((inp->inp_flags & INP_HDRINCL) == 0) {
431		if (m->m_pkthdr.len + sizeof(struct ip) > IP_MAXPACKET) {
432			m_freem(m);
433			return(EMSGSIZE);
434		}
435		M_PREPEND(m, sizeof(struct ip), M_DONTWAIT);
436		if (m == NULL)
437			return(ENOBUFS);
438
439		INP_RLOCK(inp);
440		ip = mtod(m, struct ip *);
441		ip->ip_tos = inp->inp_ip_tos;
442		if (inp->inp_flags & INP_DONTFRAG)
443			ip->ip_off = IP_DF;
444		else
445			ip->ip_off = 0;
446		ip->ip_p = inp->inp_ip_p;
447		ip->ip_len = m->m_pkthdr.len;
448		ip->ip_src = inp->inp_laddr;
449		if (jailed(inp->inp_cred)) {
450			/*
451			 * prison_local_ip4() would be good enough but would
452			 * let a source of INADDR_ANY pass, which we do not
453			 * want to see from jails. We do not go through the
454			 * pain of in_pcbladdr() for raw sockets.
455			 */
456			if (ip->ip_src.s_addr == INADDR_ANY)
457				error = prison_get_ip4(inp->inp_cred,
458				    &ip->ip_src);
459			else
460				error = prison_local_ip4(inp->inp_cred,
461				    &ip->ip_src);
462			if (error != 0) {
463				INP_RUNLOCK(inp);
464				m_freem(m);
465				return (error);
466			}
467		}
468		ip->ip_dst.s_addr = dst;
469		ip->ip_ttl = inp->inp_ip_ttl;
470	} else {
471		if (m->m_pkthdr.len > IP_MAXPACKET) {
472			m_freem(m);
473			return(EMSGSIZE);
474		}
475		INP_RLOCK(inp);
476		ip = mtod(m, struct ip *);
477		error = prison_check_ip4(inp->inp_cred, &ip->ip_src);
478		if (error != 0) {
479			INP_RUNLOCK(inp);
480			m_freem(m);
481			return (error);
482		}
483
484		/*
485		 * Don't allow both user specified and setsockopt options,
486		 * and don't allow packet length sizes that will crash.
487		 */
488		if (((ip->ip_hl != (sizeof (*ip) >> 2)) && inp->inp_options)
489		    || (ip->ip_len > m->m_pkthdr.len)
490		    || (ip->ip_len < (ip->ip_hl << 2))) {
491			INP_RUNLOCK(inp);
492			m_freem(m);
493			return (EINVAL);
494		}
495		if (ip->ip_id == 0)
496			ip->ip_id = ip_newid();
497
498		/*
499		 * XXX prevent ip_output from overwriting header fields.
500		 */
501		flags |= IP_RAWOUTPUT;
502		IPSTAT_INC(ips_rawout);
503	}
504
505	if (inp->inp_flags & INP_ONESBCAST)
506		flags |= IP_SENDONES;
507
508#ifdef MAC
509	mac_inpcb_create_mbuf(inp, m);
510#endif
511
512	error = ip_output(m, inp->inp_options, NULL, flags,
513	    inp->inp_moptions, inp);
514	INP_RUNLOCK(inp);
515	return (error);
516}
517
518/*
519 * Raw IP socket option processing.
520 *
521 * IMPORTANT NOTE regarding access control: Traditionally, raw sockets could
522 * only be created by a privileged process, and as such, socket option
523 * operations to manage system properties on any raw socket were allowed to
524 * take place without explicit additional access control checks.  However,
525 * raw sockets can now also be created in jail(), and therefore explicit
526 * checks are now required.  Likewise, raw sockets can be used by a process
527 * after it gives up privilege, so some caution is required.  For options
528 * passed down to the IP layer via ip_ctloutput(), checks are assumed to be
529 * performed in ip_ctloutput() and therefore no check occurs here.
530 * Unilaterally checking priv_check() here breaks normal IP socket option
531 * operations on raw sockets.
532 *
533 * When adding new socket options here, make sure to add access control
534 * checks here as necessary.
535 */
/*
 * so   - the raw socket the option applies to.
 * sopt - option descriptor; sopt_level, sopt_dir and sopt_name select the
 *        handling below.
 *
 * Returns 0 or an errno value.  Options not listed here fall through to
 * ip_ctloutput().
 */
int
rip_ctloutput(struct socket *so, struct sockopt *sopt)
{
	struct	inpcb *inp = sotoinpcb(so);
	int	error, optval;

	/*
	 * Only IPPROTO_IP options are handled, with one exception:
	 * SOL_SOCKET/SO_SETFIB copies the socket's FIB number into the PCB.
	 */
	if (sopt->sopt_level != IPPROTO_IP) {
		if ((sopt->sopt_level == SOL_SOCKET) &&
		    (sopt->sopt_name == SO_SETFIB)) {
			inp->inp_inc.inc_fibnum = so->so_fibnum;
			return (0);
		}
		return (EINVAL);
	}

	error = 0;
	switch (sopt->sopt_dir) {
	case SOPT_GET:
		switch (sopt->sopt_name) {
		case IP_HDRINCL:
			/* Reports the raw flag bit, not a normalised 0/1. */
			optval = inp->inp_flags & INP_HDRINCL;
			error = sooptcopyout(sopt, &optval, sizeof optval);
			break;

		case IP_FW3:	/* generic ipfw v.3 functions */
		case IP_FW_ADD:	/* ADD actually returns the body... */
		case IP_FW_GET:
		case IP_FW_TABLE_GETSIZE:
		case IP_FW_TABLE_LIST:
		case IP_FW_NAT_GET_CONFIG:
		case IP_FW_NAT_GET_LOG:
			/* Forward to the ipfw control hook when one is set. */
			if (V_ip_fw_ctl_ptr != NULL)
				error = V_ip_fw_ctl_ptr(sopt);
			else
				error = ENOPROTOOPT;
			break;

		case IP_DUMMYNET3:	/* generic dummynet v.3 functions */
		case IP_DUMMYNET_GET:
			/* Forward to the dummynet control hook when set. */
			if (ip_dn_ctl_ptr != NULL)
				error = ip_dn_ctl_ptr(sopt);
			else
				error = ENOPROTOOPT;
			break ;

		case MRT_INIT:
		case MRT_DONE:
		case MRT_ADD_VIF:
		case MRT_DEL_VIF:
		case MRT_ADD_MFC:
		case MRT_DEL_MFC:
		case MRT_VERSION:
		case MRT_ASSERT:
		case MRT_API_SUPPORT:
		case MRT_API_CONFIG:
		case MRT_ADD_BW_UPCALL:
		case MRT_DEL_BW_UPCALL:
			/* Multicast routing options need PRIV_NETINET_MROUTE. */
			error = priv_check(curthread, PRIV_NETINET_MROUTE);
			if (error != 0)
				return (error);
			error = ip_mrouter_get ? ip_mrouter_get(so, sopt) :
				EOPNOTSUPP;
			break;

		default:
			error = ip_ctloutput(so, sopt);
			break;
		}
		break;

	case SOPT_SET:
		switch (sopt->sopt_name) {
		case IP_HDRINCL:
			error = sooptcopyin(sopt, &optval, sizeof optval,
					    sizeof optval);
			if (error)
				break;
			/* Any non-zero value turns header-included mode on. */
			if (optval)
				inp->inp_flags |= INP_HDRINCL;
			else
				inp->inp_flags &= ~INP_HDRINCL;
			break;

		case IP_FW3:	/* generic ipfw v.3 functions */
		case IP_FW_ADD:
		case IP_FW_DEL:
		case IP_FW_FLUSH:
		case IP_FW_ZERO:
		case IP_FW_RESETLOG:
		case IP_FW_TABLE_ADD:
		case IP_FW_TABLE_DEL:
		case IP_FW_TABLE_FLUSH:
		case IP_FW_NAT_CFG:
		case IP_FW_NAT_DEL:
			/* Forward to the ipfw control hook when one is set. */
			if (V_ip_fw_ctl_ptr != NULL)
				error = V_ip_fw_ctl_ptr(sopt);
			else
				error = ENOPROTOOPT;
			break;

		case IP_DUMMYNET3:	/* generic dummynet v.3 functions */
		case IP_DUMMYNET_CONFIGURE:
		case IP_DUMMYNET_DEL:
		case IP_DUMMYNET_FLUSH:
			/* Forward to the dummynet control hook when set. */
			if (ip_dn_ctl_ptr != NULL)
				error = ip_dn_ctl_ptr(sopt);
			else
				error = ENOPROTOOPT ;
			break ;

		/* The RSVP options below all require PRIV_NETINET_MROUTE. */
		case IP_RSVP_ON:
			error = priv_check(curthread, PRIV_NETINET_MROUTE);
			if (error != 0)
				return (error);
			error = ip_rsvp_init(so);
			break;

		case IP_RSVP_OFF:
			error = priv_check(curthread, PRIV_NETINET_MROUTE);
			if (error != 0)
				return (error);
			error = ip_rsvp_done();
			break;

		case IP_RSVP_VIF_ON:
		case IP_RSVP_VIF_OFF:
			error = priv_check(curthread, PRIV_NETINET_MROUTE);
			if (error != 0)
				return (error);
			/* Note: a missing hook yields EINVAL here. */
			error = ip_rsvp_vif ?
				ip_rsvp_vif(so, sopt) : EINVAL;
			break;

		case MRT_INIT:
		case MRT_DONE:
		case MRT_ADD_VIF:
		case MRT_DEL_VIF:
		case MRT_ADD_MFC:
		case MRT_DEL_MFC:
		case MRT_VERSION:
		case MRT_ASSERT:
		case MRT_API_SUPPORT:
		case MRT_API_CONFIG:
		case MRT_ADD_BW_UPCALL:
		case MRT_DEL_BW_UPCALL:
			/* Multicast routing options need PRIV_NETINET_MROUTE. */
			error = priv_check(curthread, PRIV_NETINET_MROUTE);
			if (error != 0)
				return (error);
			error = ip_mrouter_set ? ip_mrouter_set(so, sopt) :
					EOPNOTSUPP;
			break;

		default:
			error = ip_ctloutput(so, sopt);
			break;
		}
		break;
	}

	return (error);
}
697
698/*
699 * This function exists solely to receive the PRC_IFDOWN messages which are
700 * sent by if_down().  It looks for an ifaddr whose ifa_addr is sa, and calls
701 * in_ifadown() to remove all routes corresponding to that address.  It also
702 * receives the PRC_IFUP messages from if_up() and reinstalls the interface
703 * routes.
704 */
/*
 * cmd - PRC_IFDOWN or PRC_IFUP; any other value is ignored.
 * sa  - compared by pointer identity against each interface address's
 *       ifa_addr to find the affected address.
 * vip - not used by this routine.
 */
void
rip_ctlinput(int cmd, struct sockaddr *sa, void *vip)
{
	struct in_ifaddr *ia;
	struct ifnet *ifp;
	int err;
	int flags;

	switch (cmd) {
	case PRC_IFDOWN:
		IN_IFADDR_RLOCK();
		TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
			if (ia->ia_ifa.ifa_addr == sa
			    && (ia->ia_flags & IFA_ROUTE)) {
				/*
				 * Take a reference so the address stays
				 * valid once the list lock is dropped.
				 */
				ifa_ref(&ia->ia_ifa);
				IN_IFADDR_RUNLOCK();
				/*
				 * in_ifscrub kills the interface route.
				 */
				in_ifscrub(ia->ia_ifp, ia);
				/*
				 * in_ifadown gets rid of all the rest of the
				 * routes.  This is not quite the right thing
				 * to do, but at least if we are running a
				 * routing process they will come back.
				 */
				in_ifadown(&ia->ia_ifa, 0);
				ifa_free(&ia->ia_ifa);
				break;
			}
		}
		/*
		 * ia is NULL only when the loop ran to completion without a
		 * match, in which case the list lock is still held.
		 */
		if (ia == NULL)		/* If ia matched, already unlocked. */
			IN_IFADDR_RUNLOCK();
		break;

	case PRC_IFUP:
		IN_IFADDR_RLOCK();
		TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
			if (ia->ia_ifa.ifa_addr == sa)
				break;
		}
		/* Nothing to do if not found or the route already exists. */
		if (ia == NULL || (ia->ia_flags & IFA_ROUTE)) {
			IN_IFADDR_RUNLOCK();
			return;
		}
		ifa_ref(&ia->ia_ifa);
		IN_IFADDR_RUNLOCK();
		flags = RTF_UP;
		ifp = ia->ia_ifa.ifa_ifp;

		/* Loopback and point-to-point interfaces get a host route. */
		if ((ifp->if_flags & IFF_LOOPBACK)
		    || (ifp->if_flags & IFF_POINTOPOINT))
			flags |= RTF_HOST;

		/*
		 * The two route installs are independent: each sets its own
		 * flag on success, and a failure of either is not reported.
		 */
		err = rtinit(&ia->ia_ifa, RTM_ADD, flags);
		if (err == 0)
			ia->ia_flags |= IFA_ROUTE;
		err = ifa_add_loopback_route((struct ifaddr *)ia, sa);
		if (err == 0)
			ia->ia_flags |= IFA_RTSELF;
		ifa_free(&ia->ia_ifa);
		break;
	}
}
769
770static int
771rip_attach(struct socket *so, int proto, struct thread *td)
772{
773	struct inpcb *inp;
774	int error;
775
776	inp = sotoinpcb(so);
777	KASSERT(inp == NULL, ("rip_attach: inp != NULL"));
778
779	error = priv_check(td, PRIV_NETINET_RAW);
780	if (error)
781		return (error);
782	if (proto >= IPPROTO_MAX || proto < 0)
783		return EPROTONOSUPPORT;
784	error = soreserve(so, rip_sendspace, rip_recvspace);
785	if (error)
786		return (error);
787	INP_INFO_WLOCK(&V_ripcbinfo);
788	error = in_pcballoc(so, &V_ripcbinfo);
789	if (error) {
790		INP_INFO_WUNLOCK(&V_ripcbinfo);
791		return (error);
792	}
793	inp = (struct inpcb *)so->so_pcb;
794	inp->inp_vflag |= INP_IPV4;
795	inp->inp_ip_p = proto;
796	inp->inp_ip_ttl = V_ip_defttl;
797	rip_inshash(inp);
798	INP_INFO_WUNLOCK(&V_ripcbinfo);
799	INP_WUNLOCK(inp);
800	return (0);
801}
802
803static void
804rip_detach(struct socket *so)
805{
806	struct inpcb *inp;
807
808	inp = sotoinpcb(so);
809	KASSERT(inp != NULL, ("rip_detach: inp == NULL"));
810	KASSERT(inp->inp_faddr.s_addr == INADDR_ANY,
811	    ("rip_detach: not closed"));
812
813	INP_INFO_WLOCK(&V_ripcbinfo);
814	INP_WLOCK(inp);
815	rip_delhash(inp);
816	if (so == V_ip_mrouter && ip_mrouter_done)
817		ip_mrouter_done();
818	if (ip_rsvp_force_done)
819		ip_rsvp_force_done(so);
820	if (so == V_ip_rsvpd)
821		ip_rsvp_done();
822	in_pcbdetach(inp);
823	in_pcbfree(inp);
824	INP_INFO_WUNLOCK(&V_ripcbinfo);
825}
826
/*
 * Common disconnect work shared by rip_abort(), rip_close() and
 * rip_disconnect(): clear the PCB's foreign address, move the PCB to the
 * hash chain that matches its new (wildcard) binding, and clear
 * SS_ISCONNECTED on the socket.
 *
 * The caller must hold the pcbinfo write lock and the PCB write lock.
 */
static void
rip_dodisconnect(struct socket *so, struct inpcb *inp)
{

	INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo);
	INP_WLOCK_ASSERT(inp);

	/* The bucket depends on faddr, so unhash before changing it. */
	rip_delhash(inp);
	inp->inp_faddr.s_addr = INADDR_ANY;
	rip_inshash(inp);
	SOCK_LOCK(so);
	so->so_state &= ~SS_ISCONNECTED;
	SOCK_UNLOCK(so);
}
841
842static void
843rip_abort(struct socket *so)
844{
845	struct inpcb *inp;
846
847	inp = sotoinpcb(so);
848	KASSERT(inp != NULL, ("rip_abort: inp == NULL"));
849
850	INP_INFO_WLOCK(&V_ripcbinfo);
851	INP_WLOCK(inp);
852	rip_dodisconnect(so, inp);
853	INP_WUNLOCK(inp);
854	INP_INFO_WUNLOCK(&V_ripcbinfo);
855}
856
857static void
858rip_close(struct socket *so)
859{
860	struct inpcb *inp;
861
862	inp = sotoinpcb(so);
863	KASSERT(inp != NULL, ("rip_close: inp == NULL"));
864
865	INP_INFO_WLOCK(&V_ripcbinfo);
866	INP_WLOCK(inp);
867	rip_dodisconnect(so, inp);
868	INP_WUNLOCK(inp);
869	INP_INFO_WUNLOCK(&V_ripcbinfo);
870}
871
872static int
873rip_disconnect(struct socket *so)
874{
875	struct inpcb *inp;
876
877	if ((so->so_state & SS_ISCONNECTED) == 0)
878		return (ENOTCONN);
879
880	inp = sotoinpcb(so);
881	KASSERT(inp != NULL, ("rip_disconnect: inp == NULL"));
882
883	INP_INFO_WLOCK(&V_ripcbinfo);
884	INP_WLOCK(inp);
885	rip_dodisconnect(so, inp);
886	INP_WUNLOCK(inp);
887	INP_INFO_WUNLOCK(&V_ripcbinfo);
888	return (0);
889}
890
891static int
892rip_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
893{
894	struct sockaddr_in *addr = (struct sockaddr_in *)nam;
895	struct inpcb *inp;
896	int error;
897
898	if (nam->sa_len != sizeof(*addr))
899		return (EINVAL);
900
901	error = prison_check_ip4(td->td_ucred, &addr->sin_addr);
902	if (error != 0)
903		return (error);
904
905	inp = sotoinpcb(so);
906	KASSERT(inp != NULL, ("rip_bind: inp == NULL"));
907
908	if (TAILQ_EMPTY(&V_ifnet) ||
909	    (addr->sin_family != AF_INET && addr->sin_family != AF_IMPLINK) ||
910	    (addr->sin_addr.s_addr &&
911	     (inp->inp_flags & INP_BINDANY) == 0 &&
912	     ifa_ifwithaddr_check((struct sockaddr *)addr) == 0))
913		return (EADDRNOTAVAIL);
914
915	INP_INFO_WLOCK(&V_ripcbinfo);
916	INP_WLOCK(inp);
917	rip_delhash(inp);
918	inp->inp_laddr = addr->sin_addr;
919	rip_inshash(inp);
920	INP_WUNLOCK(inp);
921	INP_INFO_WUNLOCK(&V_ripcbinfo);
922	return (0);
923}
924
925static int
926rip_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
927{
928	struct sockaddr_in *addr = (struct sockaddr_in *)nam;
929	struct inpcb *inp;
930
931	if (nam->sa_len != sizeof(*addr))
932		return (EINVAL);
933	if (TAILQ_EMPTY(&V_ifnet))
934		return (EADDRNOTAVAIL);
935	if (addr->sin_family != AF_INET && addr->sin_family != AF_IMPLINK)
936		return (EAFNOSUPPORT);
937
938	inp = sotoinpcb(so);
939	KASSERT(inp != NULL, ("rip_connect: inp == NULL"));
940
941	INP_INFO_WLOCK(&V_ripcbinfo);
942	INP_WLOCK(inp);
943	rip_delhash(inp);
944	inp->inp_faddr = addr->sin_addr;
945	rip_inshash(inp);
946	soisconnected(so);
947	INP_WUNLOCK(inp);
948	INP_INFO_WUNLOCK(&V_ripcbinfo);
949	return (0);
950}
951
952static int
953rip_shutdown(struct socket *so)
954{
955	struct inpcb *inp;
956
957	inp = sotoinpcb(so);
958	KASSERT(inp != NULL, ("rip_shutdown: inp == NULL"));
959
960	INP_WLOCK(inp);
961	socantsendmore(so);
962	INP_WUNLOCK(inp);
963	return (0);
964}
965
966static int
967rip_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
968    struct mbuf *control, struct thread *td)
969{
970	struct inpcb *inp;
971	u_long dst;
972
973	inp = sotoinpcb(so);
974	KASSERT(inp != NULL, ("rip_send: inp == NULL"));
975
976	/*
977	 * Note: 'dst' reads below are unlocked.
978	 */
979	if (so->so_state & SS_ISCONNECTED) {
980		if (nam) {
981			m_freem(m);
982			return (EISCONN);
983		}
984		dst = inp->inp_faddr.s_addr;	/* Unlocked read. */
985	} else {
986		if (nam == NULL) {
987			m_freem(m);
988			return (ENOTCONN);
989		}
990		dst = ((struct sockaddr_in *)nam)->sin_addr.s_addr;
991	}
992	return (rip_output(m, so, dst));
993}
994#endif /* INET */
995
996static int
997rip_pcblist(SYSCTL_HANDLER_ARGS)
998{
999	int error, i, n;
1000	struct inpcb *inp, **inp_list;
1001	inp_gen_t gencnt;
1002	struct xinpgen xig;
1003
1004	/*
1005	 * The process of preparing the TCB list is too time-consuming and
1006	 * resource-intensive to repeat twice on every request.
1007	 */
1008	if (req->oldptr == 0) {
1009		n = V_ripcbinfo.ipi_count;
1010		n += imax(n / 8, 10);
1011		req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xinpcb);
1012		return (0);
1013	}
1014
1015	if (req->newptr != 0)
1016		return (EPERM);
1017
1018	/*
1019	 * OK, now we're committed to doing something.
1020	 */
1021	INP_INFO_RLOCK(&V_ripcbinfo);
1022	gencnt = V_ripcbinfo.ipi_gencnt;
1023	n = V_ripcbinfo.ipi_count;
1024	INP_INFO_RUNLOCK(&V_ripcbinfo);
1025
1026	xig.xig_len = sizeof xig;
1027	xig.xig_count = n;
1028	xig.xig_gen = gencnt;
1029	xig.xig_sogen = so_gencnt;
1030	error = SYSCTL_OUT(req, &xig, sizeof xig);
1031	if (error)
1032		return (error);
1033
1034	inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK);
1035	if (inp_list == 0)
1036		return (ENOMEM);
1037
1038	INP_INFO_RLOCK(&V_ripcbinfo);
1039	for (inp = LIST_FIRST(V_ripcbinfo.ipi_listhead), i = 0; inp && i < n;
1040	     inp = LIST_NEXT(inp, inp_list)) {
1041		INP_WLOCK(inp);
1042		if (inp->inp_gencnt <= gencnt &&
1043		    cr_canseeinpcb(req->td->td_ucred, inp) == 0) {
1044			in_pcbref(inp);
1045			inp_list[i++] = inp;
1046		}
1047		INP_WUNLOCK(inp);
1048	}
1049	INP_INFO_RUNLOCK(&V_ripcbinfo);
1050	n = i;
1051
1052	error = 0;
1053	for (i = 0; i < n; i++) {
1054		inp = inp_list[i];
1055		INP_RLOCK(inp);
1056		if (inp->inp_gencnt <= gencnt) {
1057			struct xinpcb xi;
1058
1059			bzero(&xi, sizeof(xi));
1060			xi.xi_len = sizeof xi;
1061			/* XXX should avoid extra copy */
1062			bcopy(inp, &xi.xi_inp, sizeof *inp);
1063			if (inp->inp_socket)
1064				sotoxsocket(inp->inp_socket, &xi.xi_socket);
1065			INP_RUNLOCK(inp);
1066			error = SYSCTL_OUT(req, &xi, sizeof xi);
1067		} else
1068			INP_RUNLOCK(inp);
1069	}
1070	INP_INFO_WLOCK(&V_ripcbinfo);
1071	for (i = 0; i < n; i++) {
1072		inp = inp_list[i];
1073		INP_WLOCK(inp);
1074		if (!in_pcbrele(inp))
1075			INP_WUNLOCK(inp);
1076	}
1077	INP_INFO_WUNLOCK(&V_ripcbinfo);
1078
1079	if (!error) {
1080		/*
1081		 * Give the user an updated idea of our state.  If the
1082		 * generation differs from what we told her before, she knows
1083		 * that something happened while we were processing this
1084		 * request, and it might be necessary to retry.
1085		 */
1086		INP_INFO_RLOCK(&V_ripcbinfo);
1087		xig.xig_gen = V_ripcbinfo.ipi_gencnt;
1088		xig.xig_sogen = so_gencnt;
1089		xig.xig_count = V_ripcbinfo.ipi_count;
1090		INP_INFO_RUNLOCK(&V_ripcbinfo);
1091		error = SYSCTL_OUT(req, &xig, sizeof xig);
1092	}
1093	free(inp_list, M_TEMP);
1094	return (error);
1095}
1096
/*
 * Read-only opaque "pcblist" node under _net_inet_raw, served by
 * rip_pcblist().
 */
SYSCTL_PROC(_net_inet_raw, OID_AUTO/*XXX*/, pcblist,
    CTLTYPE_OPAQUE | CTLFLAG_RD, NULL, 0,
    rip_pcblist, "S,xinpcb", "List of active raw IP sockets");
1100
#ifdef INET
/*
 * User-request dispatch table for raw IPv4 sockets.  Entries are listed
 * alphabetically by member name; handlers prefixed rip_ are defined in this
 * file, the in_* ones are not.
 */
struct pr_usrreqs rip_usrreqs = {
	.pru_abort =		rip_abort,
	.pru_attach =		rip_attach,
	.pru_bind =		rip_bind,
	.pru_close =		rip_close,
	.pru_connect =		rip_connect,
	.pru_control =		in_control,
	.pru_detach =		rip_detach,
	.pru_disconnect =	rip_disconnect,
	.pru_peeraddr =		in_getpeeraddr,
	.pru_send =		rip_send,
	.pru_shutdown =		rip_shutdown,
	.pru_sockaddr =		in_getsockaddr,
	.pru_sosetlabel =	in_pcbsosetlabel,
};
#endif /* INET */
1118