1/*-
2 * SPDX-License-Identifier: BSD-3-Clause
3 *
4 * Copyright (c) 1982, 1986, 1988, 1993
5 *	The Regents of the University of California.
6 * All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 *    notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 * 3. Neither the name of the University nor the names of its contributors
17 *    may be used to endorse or promote products derived from this software
18 *    without specific prior written permission.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30 * SUCH DAMAGE.
31 *
32 *	@(#)raw_ip.c	8.7 (Berkeley) 5/15/95
33 */
34
35#include <sys/cdefs.h>
36__FBSDID("$FreeBSD$");
37
38#include "opt_inet.h"
39#include "opt_inet6.h"
40#include "opt_ipsec.h"
41#include "opt_route.h"
42
43#include <sys/param.h>
44#include <sys/jail.h>
45#include <sys/kernel.h>
46#include <sys/eventhandler.h>
47#include <sys/lock.h>
48#include <sys/malloc.h>
49#include <sys/mbuf.h>
50#include <sys/priv.h>
51#include <sys/proc.h>
52#include <sys/protosw.h>
53#include <sys/rmlock.h>
54#include <sys/rwlock.h>
55#include <sys/signalvar.h>
56#include <sys/socket.h>
57#include <sys/socketvar.h>
58#include <sys/sx.h>
59#include <sys/sysctl.h>
60#include <sys/systm.h>
61
62#include <vm/uma.h>
63
64#include <net/if.h>
65#include <net/if_var.h>
66#include <net/route.h>
67#include <net/route/route_ctl.h>
68#include <net/vnet.h>
69
70#include <netinet/in.h>
71#include <netinet/in_systm.h>
72#include <netinet/in_fib.h>
73#include <netinet/in_pcb.h>
74#include <netinet/in_var.h>
75#include <netinet/if_ether.h>
76#include <netinet/ip.h>
77#include <netinet/ip_var.h>
78#include <netinet/ip_mroute.h>
79#include <netinet/ip_icmp.h>
80
81#include <netipsec/ipsec_support.h>
82
83#include <machine/stdarg.h>
84#include <security/mac/mac_framework.h>
85
86VNET_DEFINE(int, ip_defttl) = IPDEFTTL;
87SYSCTL_INT(_net_inet_ip, IPCTL_DEFTTL, ttl, CTLFLAG_VNET | CTLFLAG_RW,
88    &VNET_NAME(ip_defttl), 0,
89    "Maximum TTL on IP packets");
90
91VNET_DEFINE(struct inpcbhead, ripcb);
92VNET_DEFINE(struct inpcbinfo, ripcbinfo);
93
94#define	V_ripcb			VNET(ripcb)
95#define	V_ripcbinfo		VNET(ripcbinfo)
96
97/*
98 * Control and data hooks for ipfw, dummynet, divert and so on.
99 * The data hooks are not used here but it is convenient
100 * to keep them all in one place.
101 */
102VNET_DEFINE(ip_fw_chk_ptr_t, ip_fw_chk_ptr) = NULL;
103VNET_DEFINE(ip_fw_ctl_ptr_t, ip_fw_ctl_ptr) = NULL;
104
105int	(*ip_dn_ctl_ptr)(struct sockopt *);
106int	(*ip_dn_io_ptr)(struct mbuf **, struct ip_fw_args *);
107void	(*ip_divert_ptr)(struct mbuf *, bool);
108int	(*ng_ipfw_input_p)(struct mbuf **, struct ip_fw_args *, bool);
109
#ifdef INET
/*
 * Multicast routing and RSVP hooks.  None of them has an initializer: they
 * all default to NULL because uninitialized globals live in zeroed BSS.
 */

/* Socket used to talk to the multicast routing daemon (per vnet). */
VNET_DEFINE(struct socket *, ip_mrouter);

/* Multicast router entry points. */
int	(*ip_mrouter_set)(struct socket *, struct sockopt *);
int	(*ip_mrouter_get)(struct socket *, struct sockopt *);
int	(*ip_mrouter_done)(void);
int	(*ip_mforward)(struct ip *, struct ifnet *, struct mbuf *,
	    struct ip_moptions *);
int	(*mrt_ioctl)(u_long, caddr_t, int);
int	(*legal_vif_num)(int);
u_long	(*ip_mcast_src)(int);

/* RSVP entry points. */
int	(*rsvp_input_p)(struct mbuf **, int *, int);
int	(*ip_rsvp_vif)(struct socket *, struct sockopt *);
void	(*ip_rsvp_force_done)(struct socket *);
#endif /* INET */
137
138extern	struct protosw inetsw[];
139
140u_long	rip_sendspace = 9216;
141SYSCTL_ULONG(_net_inet_raw, OID_AUTO, maxdgram, CTLFLAG_RW,
142    &rip_sendspace, 0, "Maximum outgoing raw IP datagram size");
143
144u_long	rip_recvspace = 9216;
145SYSCTL_ULONG(_net_inet_raw, OID_AUTO, recvspace, CTLFLAG_RW,
146    &rip_recvspace, 0, "Maximum space for incoming raw IP datagrams");
147
/*
 * Raw pcb hashing.
 *
 * INP_PCBHASH_RAW() never evaluates to 0: the "+ 1" keeps index 0 free.
 * rip_inshash() files every pcb that is not fully specified under index 0,
 * and rip_input() walks that bucket separately.
 */
#define	INP_PCBHASH_RAW_SIZE	256
#define	INP_PCBHASH_RAW(proto, laddr, faddr, mask)			\
	((((proto) + (laddr) + (faddr)) % (mask)) + 1)
155
156#ifdef INET
157static void
158rip_inshash(struct inpcb *inp)
159{
160	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
161	struct inpcbhead *pcbhash;
162	int hash;
163
164	INP_INFO_WLOCK_ASSERT(pcbinfo);
165	INP_WLOCK_ASSERT(inp);
166
167	if (inp->inp_ip_p != 0 &&
168	    inp->inp_laddr.s_addr != INADDR_ANY &&
169	    inp->inp_faddr.s_addr != INADDR_ANY) {
170		hash = INP_PCBHASH_RAW(inp->inp_ip_p, inp->inp_laddr.s_addr,
171		    inp->inp_faddr.s_addr, pcbinfo->ipi_hashmask);
172	} else
173		hash = 0;
174	pcbhash = &pcbinfo->ipi_hashbase[hash];
175	CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash);
176}
177
178static void
179rip_delhash(struct inpcb *inp)
180{
181
182	INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo);
183	INP_WLOCK_ASSERT(inp);
184
185	CK_LIST_REMOVE(inp, inp_hash);
186}
187#endif /* INET */
188
189/*
190 * Raw interface to IP protocol.
191 */
192
193/*
194 * Initialize raw connection block q.
195 */
196static void
197rip_zone_change(void *tag)
198{
199
200	uma_zone_set_max(V_ripcbinfo.ipi_zone, maxsockets);
201}
202
203static int
204rip_inpcb_init(void *mem, int size, int flags)
205{
206	struct inpcb *inp = mem;
207
208	INP_LOCK_INIT(inp, "inp", "rawinp");
209	return (0);
210}
211
212void
213rip_init(void)
214{
215
216	in_pcbinfo_init(&V_ripcbinfo, "rip", &V_ripcb, INP_PCBHASH_RAW_SIZE,
217	    1, "ripcb", rip_inpcb_init, IPI_HASHFIELDS_NONE);
218	EVENTHANDLER_REGISTER(maxsockets_change, rip_zone_change, NULL,
219	    EVENTHANDLER_PRI_ANY);
220}
221
222#ifdef VIMAGE
223static void
224rip_destroy(void *unused __unused)
225{
226
227	in_pcbinfo_destroy(&V_ripcbinfo);
228}
229VNET_SYSUNINIT(raw_ip, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH, rip_destroy, NULL);
230#endif
231
232#ifdef INET
233static int
234rip_append(struct inpcb *last, struct ip *ip, struct mbuf *n,
235    struct sockaddr_in *ripsrc)
236{
237	int policyfail = 0;
238
239	INP_LOCK_ASSERT(last);
240
241#if defined(IPSEC) || defined(IPSEC_SUPPORT)
242	/* check AH/ESP integrity. */
243	if (IPSEC_ENABLED(ipv4)) {
244		if (IPSEC_CHECK_POLICY(ipv4, n, last) != 0)
245			policyfail = 1;
246	}
247#endif /* IPSEC */
248#ifdef MAC
249	if (!policyfail && mac_inpcb_check_deliver(last, n) != 0)
250		policyfail = 1;
251#endif
252	/* Check the minimum TTL for socket. */
253	if (last->inp_ip_minttl && last->inp_ip_minttl > ip->ip_ttl)
254		policyfail = 1;
255	if (!policyfail) {
256		struct mbuf *opts = NULL;
257		struct socket *so;
258
259		so = last->inp_socket;
260		if ((last->inp_flags & INP_CONTROLOPTS) ||
261		    (so->so_options & (SO_TIMESTAMP | SO_BINTIME)))
262			ip_savecontrol(last, &opts, ip, n);
263		SOCKBUF_LOCK(&so->so_rcv);
264		if (sbappendaddr_locked(&so->so_rcv,
265		    (struct sockaddr *)ripsrc, n, opts) == 0) {
266			/* should notify about lost packet */
267			m_freem(n);
268			if (opts)
269				m_freem(opts);
270			SOCKBUF_UNLOCK(&so->so_rcv);
271		} else
272			sorwakeup_locked(so);
273	} else
274		m_freem(n);
275	return (policyfail);
276}
277
278/*
279 * Setup generic address and protocol structures for raw_input routine, then
280 * pass them along with mbuf chain.
281 */
282int
283rip_input(struct mbuf **mp, int *offp, int proto)
284{
285	struct ifnet *ifp;
286	struct mbuf *m = *mp;
287	struct ip *ip = mtod(m, struct ip *);
288	struct inpcb *inp, *last;
289	struct sockaddr_in ripsrc;
290	int hash;
291
292	NET_EPOCH_ASSERT();
293
294	*mp = NULL;
295
296	bzero(&ripsrc, sizeof(ripsrc));
297	ripsrc.sin_len = sizeof(ripsrc);
298	ripsrc.sin_family = AF_INET;
299	ripsrc.sin_addr = ip->ip_src;
300	last = NULL;
301
302	ifp = m->m_pkthdr.rcvif;
303
304	hash = INP_PCBHASH_RAW(proto, ip->ip_src.s_addr,
305	    ip->ip_dst.s_addr, V_ripcbinfo.ipi_hashmask);
306	CK_LIST_FOREACH(inp, &V_ripcbinfo.ipi_hashbase[hash], inp_hash) {
307		if (inp->inp_ip_p != proto)
308			continue;
309#ifdef INET6
310		/* XXX inp locking */
311		if ((inp->inp_vflag & INP_IPV4) == 0)
312			continue;
313#endif
314		if (inp->inp_laddr.s_addr != ip->ip_dst.s_addr)
315			continue;
316		if (inp->inp_faddr.s_addr != ip->ip_src.s_addr)
317			continue;
318		if (last != NULL) {
319			struct mbuf *n;
320
321			n = m_copym(m, 0, M_COPYALL, M_NOWAIT);
322			if (n != NULL)
323			    (void) rip_append(last, ip, n, &ripsrc);
324			/* XXX count dropped packet */
325			INP_RUNLOCK(last);
326			last = NULL;
327		}
328		INP_RLOCK(inp);
329		if (__predict_false(inp->inp_flags2 & INP_FREED))
330			goto skip_1;
331		if (jailed_without_vnet(inp->inp_cred)) {
332			/*
333			 * XXX: If faddr was bound to multicast group,
334			 * jailed raw socket will drop datagram.
335			 */
336			if (prison_check_ip4(inp->inp_cred, &ip->ip_dst) != 0)
337				goto skip_1;
338		}
339		last = inp;
340		continue;
341	skip_1:
342		INP_RUNLOCK(inp);
343	}
344	CK_LIST_FOREACH(inp, &V_ripcbinfo.ipi_hashbase[0], inp_hash) {
345		if (inp->inp_ip_p && inp->inp_ip_p != proto)
346			continue;
347#ifdef INET6
348		/* XXX inp locking */
349		if ((inp->inp_vflag & INP_IPV4) == 0)
350			continue;
351#endif
352		if (!in_nullhost(inp->inp_laddr) &&
353		    !in_hosteq(inp->inp_laddr, ip->ip_dst))
354			continue;
355		if (!in_nullhost(inp->inp_faddr) &&
356		    !in_hosteq(inp->inp_faddr, ip->ip_src))
357			continue;
358		if (last != NULL) {
359			struct mbuf *n;
360
361			n = m_copym(m, 0, M_COPYALL, M_NOWAIT);
362			if (n != NULL)
363				(void) rip_append(last, ip, n, &ripsrc);
364			/* XXX count dropped packet */
365			INP_RUNLOCK(last);
366			last = NULL;
367		}
368		INP_RLOCK(inp);
369		if (__predict_false(inp->inp_flags2 & INP_FREED))
370			goto skip_2;
371		if (jailed_without_vnet(inp->inp_cred)) {
372			/*
373			 * Allow raw socket in jail to receive multicast;
374			 * assume process had PRIV_NETINET_RAW at attach,
375			 * and fall through into normal filter path if so.
376			 */
377			if (!IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) &&
378			    prison_check_ip4(inp->inp_cred, &ip->ip_dst) != 0)
379				goto skip_2;
380		}
381		/*
382		 * If this raw socket has multicast state, and we
383		 * have received a multicast, check if this socket
384		 * should receive it, as multicast filtering is now
385		 * the responsibility of the transport layer.
386		 */
387		if (inp->inp_moptions != NULL &&
388		    IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) {
389			/*
390			 * If the incoming datagram is for IGMP, allow it
391			 * through unconditionally to the raw socket.
392			 *
393			 * In the case of IGMPv2, we may not have explicitly
394			 * joined the group, and may have set IFF_ALLMULTI
395			 * on the interface. imo_multi_filter() may discard
396			 * control traffic we actually need to see.
397			 *
398			 * Userland multicast routing daemons should continue
399			 * filter the control traffic appropriately.
400			 */
401			int blocked;
402
403			blocked = MCAST_PASS;
404			if (proto != IPPROTO_IGMP) {
405				struct sockaddr_in group;
406
407				bzero(&group, sizeof(struct sockaddr_in));
408				group.sin_len = sizeof(struct sockaddr_in);
409				group.sin_family = AF_INET;
410				group.sin_addr = ip->ip_dst;
411
412				blocked = imo_multi_filter(inp->inp_moptions,
413				    ifp,
414				    (struct sockaddr *)&group,
415				    (struct sockaddr *)&ripsrc);
416			}
417
418			if (blocked != MCAST_PASS) {
419				IPSTAT_INC(ips_notmember);
420				goto skip_2;
421			}
422		}
423		last = inp;
424		continue;
425	skip_2:
426		INP_RUNLOCK(inp);
427	}
428	if (last != NULL) {
429		if (rip_append(last, ip, m, &ripsrc) != 0)
430			IPSTAT_INC(ips_delivered);
431		INP_RUNLOCK(last);
432	} else {
433		if (inetsw[ip_protox[ip->ip_p]].pr_input == rip_input) {
434			IPSTAT_INC(ips_noproto);
435			IPSTAT_DEC(ips_delivered);
436			icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PROTOCOL, 0, 0);
437		} else {
438			m_freem(m);
439		}
440	}
441	return (IPPROTO_DONE);
442}
443
444/*
445 * Generate IP header and pass packet to ip_output.  Tack on options user may
446 * have setup with control call.
447 */
448int
449rip_output(struct mbuf *m, struct socket *so, ...)
450{
451	struct epoch_tracker et;
452	struct ip *ip;
453	int error;
454	struct inpcb *inp = sotoinpcb(so);
455	va_list ap;
456	u_long dst;
457	int flags = ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0) |
458	    IP_ALLOWBROADCAST;
459	int cnt, hlen;
460	u_char opttype, optlen, *cp;
461
462	va_start(ap, so);
463	dst = va_arg(ap, u_long);
464	va_end(ap);
465
466	/*
467	 * If the user handed us a complete IP packet, use it.  Otherwise,
468	 * allocate an mbuf for a header and fill it in.
469	 */
470	if ((inp->inp_flags & INP_HDRINCL) == 0) {
471		if (m->m_pkthdr.len + sizeof(struct ip) > IP_MAXPACKET) {
472			m_freem(m);
473			return(EMSGSIZE);
474		}
475		M_PREPEND(m, sizeof(struct ip), M_NOWAIT);
476		if (m == NULL)
477			return(ENOBUFS);
478
479		INP_RLOCK(inp);
480		ip = mtod(m, struct ip *);
481		ip->ip_tos = inp->inp_ip_tos;
482		if (inp->inp_flags & INP_DONTFRAG)
483			ip->ip_off = htons(IP_DF);
484		else
485			ip->ip_off = htons(0);
486		ip->ip_p = inp->inp_ip_p;
487		ip->ip_len = htons(m->m_pkthdr.len);
488		ip->ip_src = inp->inp_laddr;
489		ip->ip_dst.s_addr = dst;
490#ifdef ROUTE_MPATH
491		if (CALC_FLOWID_OUTBOUND) {
492			uint32_t hash_type, hash_val;
493
494			hash_val = fib4_calc_software_hash(ip->ip_src,
495			    ip->ip_dst, 0, 0, ip->ip_p, &hash_type);
496			m->m_pkthdr.flowid = hash_val;
497			M_HASHTYPE_SET(m, hash_type);
498			flags |= IP_NODEFAULTFLOWID;
499		}
500#endif
501		if (jailed(inp->inp_cred)) {
502			/*
503			 * prison_local_ip4() would be good enough but would
504			 * let a source of INADDR_ANY pass, which we do not
505			 * want to see from jails.
506			 */
507			if (ip->ip_src.s_addr == INADDR_ANY) {
508				NET_EPOCH_ENTER(et);
509				error = in_pcbladdr(inp, &ip->ip_dst,
510				    &ip->ip_src, inp->inp_cred);
511				NET_EPOCH_EXIT(et);
512			} else {
513				error = prison_local_ip4(inp->inp_cred,
514				    &ip->ip_src);
515			}
516			if (error != 0) {
517				INP_RUNLOCK(inp);
518				m_freem(m);
519				return (error);
520			}
521		}
522		ip->ip_ttl = inp->inp_ip_ttl;
523	} else {
524		if (m->m_pkthdr.len > IP_MAXPACKET) {
525			m_freem(m);
526			return(EMSGSIZE);
527		}
528		ip = mtod(m, struct ip *);
529		hlen = ip->ip_hl << 2;
530		if (m->m_len < hlen) {
531			m = m_pullup(m, hlen);
532			if (m == NULL)
533				return (EINVAL);
534			ip = mtod(m, struct ip *);
535		}
536#ifdef ROUTE_MPATH
537		if (CALC_FLOWID_OUTBOUND) {
538			uint32_t hash_type, hash_val;
539
540			hash_val = fib4_calc_software_hash(ip->ip_dst,
541			    ip->ip_src, 0, 0, ip->ip_p, &hash_type);
542			m->m_pkthdr.flowid = hash_val;
543			M_HASHTYPE_SET(m, hash_type);
544			flags |= IP_NODEFAULTFLOWID;
545		}
546#endif
547		INP_RLOCK(inp);
548		/*
549		 * Don't allow both user specified and setsockopt options,
550		 * and don't allow packet length sizes that will crash.
551		 */
552		if ((hlen < sizeof (*ip))
553		    || ((hlen > sizeof (*ip)) && inp->inp_options)
554		    || (ntohs(ip->ip_len) != m->m_pkthdr.len)) {
555			INP_RUNLOCK(inp);
556			m_freem(m);
557			return (EINVAL);
558		}
559		error = prison_check_ip4(inp->inp_cred, &ip->ip_src);
560		if (error != 0) {
561			INP_RUNLOCK(inp);
562			m_freem(m);
563			return (error);
564		}
565		/*
566		 * Don't allow IP options which do not have the required
567		 * structure as specified in section 3.1 of RFC 791 on
568		 * pages 15-23.
569		 */
570		cp = (u_char *)(ip + 1);
571		cnt = hlen - sizeof (struct ip);
572		for (; cnt > 0; cnt -= optlen, cp += optlen) {
573			opttype = cp[IPOPT_OPTVAL];
574			if (opttype == IPOPT_EOL)
575				break;
576			if (opttype == IPOPT_NOP) {
577				optlen = 1;
578				continue;
579			}
580			if (cnt < IPOPT_OLEN + sizeof(u_char)) {
581				INP_RUNLOCK(inp);
582				m_freem(m);
583				return (EINVAL);
584			}
585			optlen = cp[IPOPT_OLEN];
586			if (optlen < IPOPT_OLEN + sizeof(u_char) ||
587			    optlen > cnt) {
588				INP_RUNLOCK(inp);
589				m_freem(m);
590				return (EINVAL);
591			}
592		}
593		/*
594		 * This doesn't allow application to specify ID of zero,
595		 * but we got this limitation from the beginning of history.
596		 */
597		if (ip->ip_id == 0)
598			ip_fillid(ip);
599
600		/*
601		 * XXX prevent ip_output from overwriting header fields.
602		 */
603		flags |= IP_RAWOUTPUT;
604		IPSTAT_INC(ips_rawout);
605	}
606
607	if (inp->inp_flags & INP_ONESBCAST)
608		flags |= IP_SENDONES;
609
610#ifdef MAC
611	mac_inpcb_create_mbuf(inp, m);
612#endif
613
614	NET_EPOCH_ENTER(et);
615	error = ip_output(m, inp->inp_options, NULL, flags,
616	    inp->inp_moptions, inp);
617	NET_EPOCH_EXIT(et);
618	INP_RUNLOCK(inp);
619	return (error);
620}
621
622/*
623 * Raw IP socket option processing.
624 *
625 * IMPORTANT NOTE regarding access control: Traditionally, raw sockets could
626 * only be created by a privileged process, and as such, socket option
627 * operations to manage system properties on any raw socket were allowed to
628 * take place without explicit additional access control checks.  However,
629 * raw sockets can now also be created in jail(), and therefore explicit
630 * checks are now required.  Likewise, raw sockets can be used by a process
631 * after it gives up privilege, so some caution is required.  For options
632 * passed down to the IP layer via ip_ctloutput(), checks are assumed to be
633 * performed in ip_ctloutput() and therefore no check occurs here.
634 * Unilaterally checking priv_check() here breaks normal IP socket option
635 * operations on raw sockets.
636 *
637 * When adding new socket options here, make sure to add access control
638 * checks here as necessary.
639 *
640 * XXX-BZ inp locking?
641 */
642int
643rip_ctloutput(struct socket *so, struct sockopt *sopt)
644{
645	struct	inpcb *inp = sotoinpcb(so);
646	int	error, optval;
647
648	if (sopt->sopt_level != IPPROTO_IP) {
649		if ((sopt->sopt_level == SOL_SOCKET) &&
650		    (sopt->sopt_name == SO_SETFIB)) {
651			inp->inp_inc.inc_fibnum = so->so_fibnum;
652			return (0);
653		}
654		return (EINVAL);
655	}
656
657	error = 0;
658	switch (sopt->sopt_dir) {
659	case SOPT_GET:
660		switch (sopt->sopt_name) {
661		case IP_HDRINCL:
662			optval = inp->inp_flags & INP_HDRINCL;
663			error = sooptcopyout(sopt, &optval, sizeof optval);
664			break;
665
666		case IP_FW3:	/* generic ipfw v.3 functions */
667		case IP_FW_ADD:	/* ADD actually returns the body... */
668		case IP_FW_GET:
669		case IP_FW_TABLE_GETSIZE:
670		case IP_FW_TABLE_LIST:
671		case IP_FW_NAT_GET_CONFIG:
672		case IP_FW_NAT_GET_LOG:
673			if (V_ip_fw_ctl_ptr != NULL)
674				error = V_ip_fw_ctl_ptr(sopt);
675			else
676				error = ENOPROTOOPT;
677			break;
678
679		case IP_DUMMYNET3:	/* generic dummynet v.3 functions */
680		case IP_DUMMYNET_GET:
681			if (ip_dn_ctl_ptr != NULL)
682				error = ip_dn_ctl_ptr(sopt);
683			else
684				error = ENOPROTOOPT;
685			break ;
686
687		case MRT_INIT:
688		case MRT_DONE:
689		case MRT_ADD_VIF:
690		case MRT_DEL_VIF:
691		case MRT_ADD_MFC:
692		case MRT_DEL_MFC:
693		case MRT_VERSION:
694		case MRT_ASSERT:
695		case MRT_API_SUPPORT:
696		case MRT_API_CONFIG:
697		case MRT_ADD_BW_UPCALL:
698		case MRT_DEL_BW_UPCALL:
699			error = priv_check(curthread, PRIV_NETINET_MROUTE);
700			if (error != 0)
701				return (error);
702			error = ip_mrouter_get ? ip_mrouter_get(so, sopt) :
703				EOPNOTSUPP;
704			break;
705
706		default:
707			error = ip_ctloutput(so, sopt);
708			break;
709		}
710		break;
711
712	case SOPT_SET:
713		switch (sopt->sopt_name) {
714		case IP_HDRINCL:
715			error = sooptcopyin(sopt, &optval, sizeof optval,
716					    sizeof optval);
717			if (error)
718				break;
719			if (optval)
720				inp->inp_flags |= INP_HDRINCL;
721			else
722				inp->inp_flags &= ~INP_HDRINCL;
723			break;
724
725		case IP_FW3:	/* generic ipfw v.3 functions */
726		case IP_FW_ADD:
727		case IP_FW_DEL:
728		case IP_FW_FLUSH:
729		case IP_FW_ZERO:
730		case IP_FW_RESETLOG:
731		case IP_FW_TABLE_ADD:
732		case IP_FW_TABLE_DEL:
733		case IP_FW_TABLE_FLUSH:
734		case IP_FW_NAT_CFG:
735		case IP_FW_NAT_DEL:
736			if (V_ip_fw_ctl_ptr != NULL)
737				error = V_ip_fw_ctl_ptr(sopt);
738			else
739				error = ENOPROTOOPT;
740			break;
741
742		case IP_DUMMYNET3:	/* generic dummynet v.3 functions */
743		case IP_DUMMYNET_CONFIGURE:
744		case IP_DUMMYNET_DEL:
745		case IP_DUMMYNET_FLUSH:
746			if (ip_dn_ctl_ptr != NULL)
747				error = ip_dn_ctl_ptr(sopt);
748			else
749				error = ENOPROTOOPT ;
750			break ;
751
752		case IP_RSVP_ON:
753			error = priv_check(curthread, PRIV_NETINET_MROUTE);
754			if (error != 0)
755				return (error);
756			error = ip_rsvp_init(so);
757			break;
758
759		case IP_RSVP_OFF:
760			error = priv_check(curthread, PRIV_NETINET_MROUTE);
761			if (error != 0)
762				return (error);
763			error = ip_rsvp_done();
764			break;
765
766		case IP_RSVP_VIF_ON:
767		case IP_RSVP_VIF_OFF:
768			error = priv_check(curthread, PRIV_NETINET_MROUTE);
769			if (error != 0)
770				return (error);
771			error = ip_rsvp_vif ?
772				ip_rsvp_vif(so, sopt) : EINVAL;
773			break;
774
775		case MRT_INIT:
776		case MRT_DONE:
777		case MRT_ADD_VIF:
778		case MRT_DEL_VIF:
779		case MRT_ADD_MFC:
780		case MRT_DEL_MFC:
781		case MRT_VERSION:
782		case MRT_ASSERT:
783		case MRT_API_SUPPORT:
784		case MRT_API_CONFIG:
785		case MRT_ADD_BW_UPCALL:
786		case MRT_DEL_BW_UPCALL:
787			error = priv_check(curthread, PRIV_NETINET_MROUTE);
788			if (error != 0)
789				return (error);
790			error = ip_mrouter_set ? ip_mrouter_set(so, sopt) :
791					EOPNOTSUPP;
792			break;
793
794		default:
795			error = ip_ctloutput(so, sopt);
796			break;
797		}
798		break;
799	}
800
801	return (error);
802}
803
804/*
805 * This function exists solely to receive the PRC_IFDOWN messages which are
806 * sent by if_down().  It looks for an ifaddr whose ifa_addr is sa, and calls
807 * in_ifadown() to remove all routes corresponding to that address.  It also
808 * receives the PRC_IFUP messages from if_up() and reinstalls the interface
809 * routes.
810 */
811void
812rip_ctlinput(int cmd, struct sockaddr *sa, void *vip)
813{
814	struct rm_priotracker in_ifa_tracker;
815	struct in_ifaddr *ia;
816	struct ifnet *ifp;
817	int err;
818	int flags;
819
820	switch (cmd) {
821	case PRC_IFDOWN:
822		IN_IFADDR_RLOCK(&in_ifa_tracker);
823		CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
824			if (ia->ia_ifa.ifa_addr == sa
825			    && (ia->ia_flags & IFA_ROUTE)) {
826				ifa_ref(&ia->ia_ifa);
827				IN_IFADDR_RUNLOCK(&in_ifa_tracker);
828				/*
829				 * in_scrubprefix() kills the interface route.
830				 */
831				in_scrubprefix(ia, 0);
832				/*
833				 * in_ifadown gets rid of all the rest of the
834				 * routes.  This is not quite the right thing
835				 * to do, but at least if we are running a
836				 * routing process they will come back.
837				 */
838				in_ifadown(&ia->ia_ifa, 0);
839				ifa_free(&ia->ia_ifa);
840				break;
841			}
842		}
843		if (ia == NULL)		/* If ia matched, already unlocked. */
844			IN_IFADDR_RUNLOCK(&in_ifa_tracker);
845		break;
846
847	case PRC_IFUP:
848		IN_IFADDR_RLOCK(&in_ifa_tracker);
849		CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
850			if (ia->ia_ifa.ifa_addr == sa)
851				break;
852		}
853		if (ia == NULL || (ia->ia_flags & IFA_ROUTE)) {
854			IN_IFADDR_RUNLOCK(&in_ifa_tracker);
855			return;
856		}
857		ifa_ref(&ia->ia_ifa);
858		IN_IFADDR_RUNLOCK(&in_ifa_tracker);
859		flags = RTF_UP;
860		ifp = ia->ia_ifa.ifa_ifp;
861
862		if ((ifp->if_flags & IFF_LOOPBACK)
863		    || (ifp->if_flags & IFF_POINTOPOINT))
864			flags |= RTF_HOST;
865
866		err = ifa_del_loopback_route((struct ifaddr *)ia, sa);
867
868		rt_addrmsg(RTM_ADD, &ia->ia_ifa, ia->ia_ifp->if_fib);
869		err = in_handle_ifaddr_route(RTM_ADD, ia);
870		if (err == 0)
871			ia->ia_flags |= IFA_ROUTE;
872
873		err = ifa_add_loopback_route((struct ifaddr *)ia, sa);
874
875		ifa_free(&ia->ia_ifa);
876		break;
877	}
878}
879
880static int
881rip_attach(struct socket *so, int proto, struct thread *td)
882{
883	struct inpcb *inp;
884	int error;
885
886	inp = sotoinpcb(so);
887	KASSERT(inp == NULL, ("rip_attach: inp != NULL"));
888
889	error = priv_check(td, PRIV_NETINET_RAW);
890	if (error)
891		return (error);
892	if (proto >= IPPROTO_MAX || proto < 0)
893		return EPROTONOSUPPORT;
894	error = soreserve(so, rip_sendspace, rip_recvspace);
895	if (error)
896		return (error);
897	INP_INFO_WLOCK(&V_ripcbinfo);
898	error = in_pcballoc(so, &V_ripcbinfo);
899	if (error) {
900		INP_INFO_WUNLOCK(&V_ripcbinfo);
901		return (error);
902	}
903	inp = (struct inpcb *)so->so_pcb;
904	inp->inp_vflag |= INP_IPV4;
905	inp->inp_ip_p = proto;
906	inp->inp_ip_ttl = V_ip_defttl;
907	rip_inshash(inp);
908	INP_INFO_WUNLOCK(&V_ripcbinfo);
909	INP_WUNLOCK(inp);
910	return (0);
911}
912
913static void
914rip_detach(struct socket *so)
915{
916	struct inpcb *inp;
917
918	inp = sotoinpcb(so);
919	KASSERT(inp != NULL, ("rip_detach: inp == NULL"));
920	KASSERT(inp->inp_faddr.s_addr == INADDR_ANY,
921	    ("rip_detach: not closed"));
922
923	INP_INFO_WLOCK(&V_ripcbinfo);
924	INP_WLOCK(inp);
925	rip_delhash(inp);
926	if (so == V_ip_mrouter && ip_mrouter_done)
927		ip_mrouter_done();
928	if (ip_rsvp_force_done)
929		ip_rsvp_force_done(so);
930	if (so == V_ip_rsvpd)
931		ip_rsvp_done();
932	in_pcbdetach(inp);
933	in_pcbfree(inp);
934	INP_INFO_WUNLOCK(&V_ripcbinfo);
935}
936
937static void
938rip_dodisconnect(struct socket *so, struct inpcb *inp)
939{
940	struct inpcbinfo *pcbinfo;
941
942	pcbinfo = inp->inp_pcbinfo;
943	INP_INFO_WLOCK(pcbinfo);
944	INP_WLOCK(inp);
945	rip_delhash(inp);
946	inp->inp_faddr.s_addr = INADDR_ANY;
947	rip_inshash(inp);
948	SOCK_LOCK(so);
949	so->so_state &= ~SS_ISCONNECTED;
950	SOCK_UNLOCK(so);
951	INP_WUNLOCK(inp);
952	INP_INFO_WUNLOCK(pcbinfo);
953}
954
955static void
956rip_abort(struct socket *so)
957{
958	struct inpcb *inp;
959
960	inp = sotoinpcb(so);
961	KASSERT(inp != NULL, ("rip_abort: inp == NULL"));
962
963	rip_dodisconnect(so, inp);
964}
965
966static void
967rip_close(struct socket *so)
968{
969	struct inpcb *inp;
970
971	inp = sotoinpcb(so);
972	KASSERT(inp != NULL, ("rip_close: inp == NULL"));
973
974	rip_dodisconnect(so, inp);
975}
976
977static int
978rip_disconnect(struct socket *so)
979{
980	struct inpcb *inp;
981
982	if ((so->so_state & SS_ISCONNECTED) == 0)
983		return (ENOTCONN);
984
985	inp = sotoinpcb(so);
986	KASSERT(inp != NULL, ("rip_disconnect: inp == NULL"));
987
988	rip_dodisconnect(so, inp);
989	return (0);
990}
991
992static int
993rip_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
994{
995	struct sockaddr_in *addr = (struct sockaddr_in *)nam;
996	struct inpcb *inp;
997	int error;
998
999	if (nam->sa_family != AF_INET)
1000		return (EAFNOSUPPORT);
1001	if (nam->sa_len != sizeof(*addr))
1002		return (EINVAL);
1003
1004	error = prison_check_ip4(td->td_ucred, &addr->sin_addr);
1005	if (error != 0)
1006		return (error);
1007
1008	inp = sotoinpcb(so);
1009	KASSERT(inp != NULL, ("rip_bind: inp == NULL"));
1010
1011	if (CK_STAILQ_EMPTY(&V_ifnet) ||
1012	    (addr->sin_family != AF_INET && addr->sin_family != AF_IMPLINK) ||
1013	    (addr->sin_addr.s_addr &&
1014	     (inp->inp_flags & INP_BINDANY) == 0 &&
1015	     ifa_ifwithaddr_check((struct sockaddr *)addr) == 0))
1016		return (EADDRNOTAVAIL);
1017
1018	INP_INFO_WLOCK(&V_ripcbinfo);
1019	INP_WLOCK(inp);
1020	rip_delhash(inp);
1021	inp->inp_laddr = addr->sin_addr;
1022	rip_inshash(inp);
1023	INP_WUNLOCK(inp);
1024	INP_INFO_WUNLOCK(&V_ripcbinfo);
1025	return (0);
1026}
1027
1028static int
1029rip_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
1030{
1031	struct sockaddr_in *addr = (struct sockaddr_in *)nam;
1032	struct inpcb *inp;
1033
1034	if (nam->sa_len != sizeof(*addr))
1035		return (EINVAL);
1036	if (CK_STAILQ_EMPTY(&V_ifnet))
1037		return (EADDRNOTAVAIL);
1038	if (addr->sin_family != AF_INET && addr->sin_family != AF_IMPLINK)
1039		return (EAFNOSUPPORT);
1040
1041	inp = sotoinpcb(so);
1042	KASSERT(inp != NULL, ("rip_connect: inp == NULL"));
1043
1044	INP_INFO_WLOCK(&V_ripcbinfo);
1045	INP_WLOCK(inp);
1046	rip_delhash(inp);
1047	inp->inp_faddr = addr->sin_addr;
1048	rip_inshash(inp);
1049	soisconnected(so);
1050	INP_WUNLOCK(inp);
1051	INP_INFO_WUNLOCK(&V_ripcbinfo);
1052	return (0);
1053}
1054
1055static int
1056rip_shutdown(struct socket *so)
1057{
1058	struct inpcb *inp;
1059
1060	inp = sotoinpcb(so);
1061	KASSERT(inp != NULL, ("rip_shutdown: inp == NULL"));
1062
1063	INP_WLOCK(inp);
1064	socantsendmore(so);
1065	INP_WUNLOCK(inp);
1066	return (0);
1067}
1068
1069static int
1070rip_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
1071    struct mbuf *control, struct thread *td)
1072{
1073	struct inpcb *inp;
1074	u_long dst;
1075	int error;
1076
1077	inp = sotoinpcb(so);
1078	KASSERT(inp != NULL, ("rip_send: inp == NULL"));
1079
1080	if (control != NULL) {
1081		m_freem(control);
1082		control = NULL;
1083	}
1084
1085	/*
1086	 * Note: 'dst' reads below are unlocked.
1087	 */
1088	if (so->so_state & SS_ISCONNECTED) {
1089		if (nam) {
1090			error = EISCONN;
1091			goto release;
1092		}
1093		dst = inp->inp_faddr.s_addr;	/* Unlocked read. */
1094	} else {
1095		error = 0;
1096		if (nam == NULL)
1097			error = ENOTCONN;
1098		else if (nam->sa_family != AF_INET)
1099			error = EAFNOSUPPORT;
1100		else if (nam->sa_len != sizeof(struct sockaddr_in))
1101			error = EINVAL;
1102		if (error != 0)
1103			goto release;
1104		dst = ((struct sockaddr_in *)nam)->sin_addr.s_addr;
1105	}
1106	return (rip_output(m, so, dst));
1107
1108release:
1109	m_freem(m);
1110	return (error);
1111}
1112#endif /* INET */
1113
1114static int
1115rip_pcblist(SYSCTL_HANDLER_ARGS)
1116{
1117	struct xinpgen xig;
1118	struct epoch_tracker et;
1119	struct inpcb *inp;
1120	int error;
1121
1122	if (req->newptr != 0)
1123		return (EPERM);
1124
1125	if (req->oldptr == 0) {
1126		int n;
1127
1128		n = V_ripcbinfo.ipi_count;
1129		n += imax(n / 8, 10);
1130		req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xinpcb);
1131		return (0);
1132	}
1133
1134	if ((error = sysctl_wire_old_buffer(req, 0)) != 0)
1135		return (error);
1136
1137	bzero(&xig, sizeof(xig));
1138	xig.xig_len = sizeof xig;
1139	xig.xig_count = V_ripcbinfo.ipi_count;
1140	xig.xig_gen = V_ripcbinfo.ipi_gencnt;
1141	xig.xig_sogen = so_gencnt;
1142	error = SYSCTL_OUT(req, &xig, sizeof xig);
1143	if (error)
1144		return (error);
1145
1146	NET_EPOCH_ENTER(et);
1147	for (inp = CK_LIST_FIRST(V_ripcbinfo.ipi_listhead);
1148	    inp != NULL;
1149	    inp = CK_LIST_NEXT(inp, inp_list)) {
1150		INP_RLOCK(inp);
1151		if (inp->inp_gencnt <= xig.xig_gen &&
1152		    cr_canseeinpcb(req->td->td_ucred, inp) == 0) {
1153			struct xinpcb xi;
1154
1155			in_pcbtoxinpcb(inp, &xi);
1156			INP_RUNLOCK(inp);
1157			error = SYSCTL_OUT(req, &xi, sizeof xi);
1158			if (error)
1159				break;
1160		} else
1161			INP_RUNLOCK(inp);
1162	}
1163	NET_EPOCH_EXIT(et);
1164
1165	if (!error) {
1166		/*
1167		 * Give the user an updated idea of our state.  If the
1168		 * generation differs from what we told her before, she knows
1169		 * that something happened while we were processing this
1170		 * request, and it might be necessary to retry.
1171		 */
1172		xig.xig_gen = V_ripcbinfo.ipi_gencnt;
1173		xig.xig_sogen = so_gencnt;
1174		xig.xig_count = V_ripcbinfo.ipi_count;
1175		error = SYSCTL_OUT(req, &xig, sizeof xig);
1176	}
1177
1178	return (error);
1179}
1180
1181SYSCTL_PROC(_net_inet_raw, OID_AUTO/*XXX*/, pcblist,
1182    CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
1183    rip_pcblist, "S,xinpcb",
1184    "List of active raw IP sockets");
1185
1186#ifdef INET
1187struct pr_usrreqs rip_usrreqs = {
1188	.pru_abort =		rip_abort,
1189	.pru_attach =		rip_attach,
1190	.pru_bind =		rip_bind,
1191	.pru_connect =		rip_connect,
1192	.pru_control =		in_control,
1193	.pru_detach =		rip_detach,
1194	.pru_disconnect =	rip_disconnect,
1195	.pru_peeraddr =		in_getpeeraddr,
1196	.pru_send =		rip_send,
1197	.pru_shutdown =		rip_shutdown,
1198	.pru_sockaddr =		in_getsockaddr,
1199	.pru_sosetlabel =	in_pcbsosetlabel,
1200	.pru_close =		rip_close,
1201};
1202#endif /* INET */
1203