raw_ip.c revision 193731
1/*-
2 * Copyright (c) 1982, 1986, 1988, 1993
3 *	The Regents of the University of California.
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 *    notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 * 4. Neither the name of the University nor the names of its contributors
15 *    may be used to endorse or promote products derived from this software
16 *    without specific prior written permission.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28 * SUCH DAMAGE.
29 *
30 *	@(#)raw_ip.c	8.7 (Berkeley) 5/15/95
31 */
32
33#include <sys/cdefs.h>
34__FBSDID("$FreeBSD: head/sys/netinet/raw_ip.c 193731 2009-06-08 17:15:40Z zec $");
35
36#include "opt_inet6.h"
37#include "opt_ipsec.h"
38#include "opt_route.h"
39
40#include <sys/param.h>
41#include <sys/jail.h>
42#include <sys/kernel.h>
43#include <sys/lock.h>
44#include <sys/malloc.h>
45#include <sys/mbuf.h>
46#include <sys/priv.h>
47#include <sys/proc.h>
48#include <sys/protosw.h>
49#include <sys/rwlock.h>
50#include <sys/signalvar.h>
51#include <sys/socket.h>
52#include <sys/socketvar.h>
53#include <sys/sx.h>
54#include <sys/sysctl.h>
55#include <sys/systm.h>
56#include <sys/vimage.h>
57
58#include <vm/uma.h>
59
60#include <net/if.h>
61#include <net/route.h>
62#include <net/vnet.h>
63
64#include <netinet/in.h>
65#include <netinet/in_systm.h>
66#include <netinet/in_pcb.h>
67#include <netinet/in_var.h>
68#include <netinet/ip.h>
69#include <netinet/ip_var.h>
70#include <netinet/ip_mroute.h>
71
72#include <netinet/vinet.h>
73
74#ifdef IPSEC
75#include <netipsec/ipsec.h>
76#endif /*IPSEC*/
77
78#include <security/mac/mac_framework.h>
79
/*
 * Raw IP protocol control block bookkeeping, defined here only when the
 * kernel is built with VIMAGE_GLOBALS.
 * NOTE(review): presumably these are the objects the V_ripcb and
 * V_ripcbinfo names used throughout this file resolve to -- confirm
 * against the vinet headers.
 */
80#ifdef VIMAGE_GLOBALS
81struct	inpcbhead ripcb;	/* list head of raw inpcbs */
82struct	inpcbinfo ripcbinfo;	/* lock, hash tables and zone for raw inpcbs */
83#endif
84
85/*
86 * Control and data hooks for ipfw and dummynet.
87 * The data hooks are not used here but it is convenient
88 * to keep them all in one place.
89 */
/* ipfw socket-option handler; rip_ctloutput() returns ENOPROTOOPT for IP_FW_* while this is NULL. */
90int (*ip_fw_ctl_ptr)(struct sockopt *) = NULL;
/* dummynet socket-option handler; rip_ctloutput() returns ENOPROTOOPT for IP_DUMMYNET_* while this is NULL. */
91int (*ip_dn_ctl_ptr)(struct sockopt *) = NULL;
/* Data-path hooks: defined here for convenience, not called anywhere in this file. */
92int (*ip_fw_chk_ptr)(struct ip_fw_args *args) = NULL;
93int (*ip_dn_io_ptr)(struct mbuf **m, int dir, struct ip_fw_args *fwa) = NULL;
94
95/*
96 * Hooks for multicast routing. They all default to NULL, so leave them not
97 * initialized and rely on BSS being set to 0.
98 */
99
100/*
101 * The socket used to communicate with the multicast routing daemon.
102 */
/*
 * Defined here only for VIMAGE_GLOBALS builds.
 * NOTE(review): presumably this is what V_ip_mrouter refers to;
 * rip_detach() compares the socket being torn down against V_ip_mrouter
 * and calls ip_mrouter_done() on a match -- confirm the mapping.
 */
103#ifdef VIMAGE_GLOBALS
104struct socket  *ip_mrouter;
105#endif
106
107/*
108 * The various mrouter and rsvp functions.
109 */
/*
 * None of these pointers is given an initializer.  Within this file:
 *  - ip_mrouter_set / ip_mrouter_get are called by rip_ctloutput() for the
 *    MRT_* options; EOPNOTSUPP is returned while they are NULL.
 *  - ip_mrouter_done is called by rip_detach() when the multicast routing
 *    socket goes away.
 *  - ip_rsvp_vif is called by rip_ctloutput() for IP_RSVP_VIF_ON/OFF;
 *    EINVAL is returned while it is NULL.
 *  - ip_rsvp_force_done is called by rip_detach() when non-NULL.
 * The remaining pointers are only defined here and are not called in this
 * file.
 */
110int (*ip_mrouter_set)(struct socket *, struct sockopt *);
111int (*ip_mrouter_get)(struct socket *, struct sockopt *);
112int (*ip_mrouter_done)(void);
113int (*ip_mforward)(struct ip *, struct ifnet *, struct mbuf *,
114		   struct ip_moptions *);
115int (*mrt_ioctl)(int, caddr_t, int);
116int (*legal_vif_num)(int);
117u_long (*ip_mcast_src)(int);
118
119void (*rsvp_input_p)(struct mbuf *m, int off);
120int (*ip_rsvp_vif)(struct socket *, struct sockopt *);
121void (*ip_rsvp_force_done)(struct socket *);
122
123/*
124 * Hash functions
125 */
126
/* Bucket count handed to hashinit() for the raw-socket hash table. */
#define INP_PCBHASH_RAW_SIZE	256

/*
 * Fold (protocol, local address, foreign address) into a bucket index.
 * The result is always at least 1, which leaves bucket 0 free for the
 * sockets that rip_inshash() files there explicitly.
 */
#define INP_PCBHASH_RAW(ipproto, local, foreign, hashmask) \
	(1 + ((ipproto) + (local) + (foreign)) % (hashmask))
130
131static void
132rip_inshash(struct inpcb *inp)
133{
134	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
135	struct inpcbhead *pcbhash;
136	int hash;
137
138	INP_INFO_WLOCK_ASSERT(pcbinfo);
139	INP_WLOCK_ASSERT(inp);
140
141	if (inp->inp_ip_p != 0 &&
142	    inp->inp_laddr.s_addr != INADDR_ANY &&
143	    inp->inp_faddr.s_addr != INADDR_ANY) {
144		hash = INP_PCBHASH_RAW(inp->inp_ip_p, inp->inp_laddr.s_addr,
145		    inp->inp_faddr.s_addr, pcbinfo->ipi_hashmask);
146	} else
147		hash = 0;
148	pcbhash = &pcbinfo->ipi_hashbase[hash];
149	LIST_INSERT_HEAD(pcbhash, inp, inp_hash);
150}
151
152static void
153rip_delhash(struct inpcb *inp)
154{
155
156	INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo);
157	INP_WLOCK_ASSERT(inp);
158
159	LIST_REMOVE(inp, inp_hash);
160}
161
162/*
163 * Raw interface to IP protocol.
164 */
165
166/*
167 * Initialize raw connection block q.
168 */
169static void
170rip_zone_change(void *tag)
171{
172	INIT_VNET_INET(curvnet);
173
174	uma_zone_set_max(V_ripcbinfo.ipi_zone, maxsockets);
175}
176
/*
 * Zone init callback passed to uma_zcreate() by rip_init(): initialize
 * the lock of the inpcb stored at 'mem'.  'size' and 'flags' are not
 * used.  Always returns 0.
 */
static int
rip_inpcb_init(void *mem, int size, int flags)
{
	struct inpcb *pcb;

	pcb = mem;
	INP_LOCK_INIT(pcb, "inp", "rawinp");
	return (0);
}
185
186void
187rip_init(void)
188{
189	INIT_VNET_INET(curvnet);
190
191	INP_INFO_LOCK_INIT(&V_ripcbinfo, "rip");
192	LIST_INIT(&V_ripcb);
193#ifdef VIMAGE
194	V_ripcbinfo.ipi_vnet = curvnet;
195#endif
196	V_ripcbinfo.ipi_listhead = &V_ripcb;
197	V_ripcbinfo.ipi_hashbase =
198	    hashinit(INP_PCBHASH_RAW_SIZE, M_PCB, &V_ripcbinfo.ipi_hashmask);
199	V_ripcbinfo.ipi_porthashbase =
200	    hashinit(1, M_PCB, &V_ripcbinfo.ipi_porthashmask);
201	V_ripcbinfo.ipi_zone = uma_zcreate("ripcb", sizeof(struct inpcb),
202	    NULL, NULL, rip_inpcb_init, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
203	uma_zone_set_max(V_ripcbinfo.ipi_zone, maxsockets);
204	EVENTHANDLER_REGISTER(maxsockets_change, rip_zone_change, NULL,
205	    EVENTHANDLER_PRI_ANY);
206}
207
#ifdef VIMAGE
/*
 * Release the two hash tables that rip_init() allocated for the current
 * vnet.  Nothing else set up by rip_init() is torn down here.
 */
void
rip_destroy(void)
{
	INIT_VNET_INET(curvnet);
	struct inpcbinfo *info;

	info = &V_ripcbinfo;
	hashdestroy(info->ipi_hashbase, M_PCB, info->ipi_hashmask);
	hashdestroy(info->ipi_porthashbase, M_PCB, info->ipi_porthashmask);
}
#endif
220
221static int
222rip_append(struct inpcb *last, struct ip *ip, struct mbuf *n,
223    struct sockaddr_in *ripsrc)
224{
225	int policyfail = 0;
226
227	INP_RLOCK_ASSERT(last);
228
229#ifdef IPSEC
230	/* check AH/ESP integrity. */
231	if (ipsec4_in_reject(n, last)) {
232		policyfail = 1;
233	}
234#endif /* IPSEC */
235#ifdef MAC
236	if (!policyfail && mac_inpcb_check_deliver(last, n) != 0)
237		policyfail = 1;
238#endif
239	/* Check the minimum TTL for socket. */
240	if (last->inp_ip_minttl && last->inp_ip_minttl > ip->ip_ttl)
241		policyfail = 1;
242	if (!policyfail) {
243		struct mbuf *opts = NULL;
244		struct socket *so;
245
246		so = last->inp_socket;
247		if ((last->inp_flags & INP_CONTROLOPTS) ||
248		    (so->so_options & (SO_TIMESTAMP | SO_BINTIME)))
249			ip_savecontrol(last, &opts, ip, n);
250		SOCKBUF_LOCK(&so->so_rcv);
251		if (sbappendaddr_locked(&so->so_rcv,
252		    (struct sockaddr *)ripsrc, n, opts) == 0) {
253			/* should notify about lost packet */
254			m_freem(n);
255			if (opts)
256				m_freem(opts);
257			SOCKBUF_UNLOCK(&so->so_rcv);
258		} else
259			sorwakeup_locked(so);
260	} else
261		m_freem(n);
262	return (policyfail);
263}
264
265/*
266 * Setup generic address and protocol structures for raw_input routine, then
267 * pass them along with mbuf chain.
268 */
269void
270rip_input(struct mbuf *m, int off)
271{
272	INIT_VNET_INET(curvnet);
273	struct ifnet *ifp;
274	struct ip *ip = mtod(m, struct ip *);
275	int proto = ip->ip_p;
276	struct inpcb *inp, *last;
277	struct sockaddr_in ripsrc;
278	int hash;
279
280	bzero(&ripsrc, sizeof(ripsrc));
281	ripsrc.sin_len = sizeof(ripsrc);
282	ripsrc.sin_family = AF_INET;
283	ripsrc.sin_addr = ip->ip_src;
284	last = NULL;
285
286	ifp = m->m_pkthdr.rcvif;
287
288	hash = INP_PCBHASH_RAW(proto, ip->ip_src.s_addr,
289	    ip->ip_dst.s_addr, V_ripcbinfo.ipi_hashmask);
290	INP_INFO_RLOCK(&V_ripcbinfo);
291	LIST_FOREACH(inp, &V_ripcbinfo.ipi_hashbase[hash], inp_hash) {
292		if (inp->inp_ip_p != proto)
293			continue;
294#ifdef INET6
295		/* XXX inp locking */
296		if ((inp->inp_vflag & INP_IPV4) == 0)
297			continue;
298#endif
299		if (inp->inp_laddr.s_addr != ip->ip_dst.s_addr)
300			continue;
301		if (inp->inp_faddr.s_addr != ip->ip_src.s_addr)
302			continue;
303		if (jailed(inp->inp_cred)) {
304			/*
305			 * XXX: If faddr was bound to multicast group,
306			 * jailed raw socket will drop datagram.
307			 */
308			if (prison_check_ip4(inp->inp_cred, &ip->ip_dst) != 0)
309				continue;
310		}
311		if (last != NULL) {
312			struct mbuf *n;
313
314			n = m_copy(m, 0, (int)M_COPYALL);
315			if (n != NULL)
316		    	    (void) rip_append(last, ip, n, &ripsrc);
317			/* XXX count dropped packet */
318			INP_RUNLOCK(last);
319		}
320		INP_RLOCK(inp);
321		last = inp;
322	}
323	LIST_FOREACH(inp, &V_ripcbinfo.ipi_hashbase[0], inp_hash) {
324		if (inp->inp_ip_p && inp->inp_ip_p != proto)
325			continue;
326#ifdef INET6
327		/* XXX inp locking */
328		if ((inp->inp_vflag & INP_IPV4) == 0)
329			continue;
330#endif
331		if (!in_nullhost(inp->inp_laddr) &&
332		    !in_hosteq(inp->inp_laddr, ip->ip_dst))
333			continue;
334		if (!in_nullhost(inp->inp_faddr) &&
335		    !in_hosteq(inp->inp_faddr, ip->ip_src))
336			continue;
337		if (jailed(inp->inp_cred)) {
338			/*
339			 * Allow raw socket in jail to receive multicast;
340			 * assume process had PRIV_NETINET_RAW at attach,
341			 * and fall through into normal filter path if so.
342			 */
343			if (!IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) &&
344			    prison_check_ip4(inp->inp_cred, &ip->ip_dst) != 0)
345				continue;
346		}
347		/*
348		 * If this raw socket has multicast state, and we
349		 * have received a multicast, check if this socket
350		 * should receive it, as multicast filtering is now
351		 * the responsibility of the transport layer.
352		 */
353		if (inp->inp_moptions != NULL &&
354		    IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) {
355			struct sockaddr_in group;
356			int blocked;
357
358			bzero(&group, sizeof(struct sockaddr_in));
359			group.sin_len = sizeof(struct sockaddr_in);
360			group.sin_family = AF_INET;
361			group.sin_addr = ip->ip_dst;
362
363			blocked = imo_multi_filter(inp->inp_moptions, ifp,
364			    (struct sockaddr *)&group,
365			    (struct sockaddr *)&ripsrc);
366			if (blocked != MCAST_PASS) {
367				IPSTAT_INC(ips_notmember);
368				continue;
369			}
370		}
371		if (last != NULL) {
372			struct mbuf *n;
373
374			n = m_copy(m, 0, (int)M_COPYALL);
375			if (n != NULL)
376				(void) rip_append(last, ip, n, &ripsrc);
377			/* XXX count dropped packet */
378			INP_RUNLOCK(last);
379		}
380		INP_RLOCK(inp);
381		last = inp;
382	}
383	INP_INFO_RUNLOCK(&V_ripcbinfo);
384	if (last != NULL) {
385		if (rip_append(last, ip, m, &ripsrc) != 0)
386			IPSTAT_INC(ips_delivered);
387		INP_RUNLOCK(last);
388	} else {
389		m_freem(m);
390		IPSTAT_INC(ips_noproto);
391		IPSTAT_DEC(ips_delivered);
392	}
393}
394
395/*
396 * Generate IP header and pass packet to ip_output.  Tack on options user may
397 * have setup with control call.
398 */
399int
400rip_output(struct mbuf *m, struct socket *so, u_long dst)
401{
402	INIT_VNET_INET(so->so_vnet);
403	struct ip *ip;
404	int error;
405	struct inpcb *inp = sotoinpcb(so);
406	int flags = ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0) |
407	    IP_ALLOWBROADCAST;
408
409	/*
410	 * If the user handed us a complete IP packet, use it.  Otherwise,
411	 * allocate an mbuf for a header and fill it in.
412	 */
413	if ((inp->inp_flags & INP_HDRINCL) == 0) {
414		if (m->m_pkthdr.len + sizeof(struct ip) > IP_MAXPACKET) {
415			m_freem(m);
416			return(EMSGSIZE);
417		}
418		M_PREPEND(m, sizeof(struct ip), M_DONTWAIT);
419		if (m == NULL)
420			return(ENOBUFS);
421
422		INP_RLOCK(inp);
423		ip = mtod(m, struct ip *);
424		ip->ip_tos = inp->inp_ip_tos;
425		if (inp->inp_flags & INP_DONTFRAG)
426			ip->ip_off = IP_DF;
427		else
428			ip->ip_off = 0;
429		ip->ip_p = inp->inp_ip_p;
430		ip->ip_len = m->m_pkthdr.len;
431		ip->ip_src = inp->inp_laddr;
432		error = prison_get_ip4(inp->inp_cred, &ip->ip_src);
433		if (error != 0) {
434			INP_RUNLOCK(inp);
435			m_freem(m);
436			return (error);
437		}
438		ip->ip_dst.s_addr = dst;
439		ip->ip_ttl = inp->inp_ip_ttl;
440	} else {
441		if (m->m_pkthdr.len > IP_MAXPACKET) {
442			m_freem(m);
443			return(EMSGSIZE);
444		}
445		INP_RLOCK(inp);
446		ip = mtod(m, struct ip *);
447		error = prison_check_ip4(inp->inp_cred, &ip->ip_src);
448		if (error != 0) {
449			INP_RUNLOCK(inp);
450			m_freem(m);
451			return (error);
452		}
453
454		/*
455		 * Don't allow both user specified and setsockopt options,
456		 * and don't allow packet length sizes that will crash.
457		 */
458		if (((ip->ip_hl != (sizeof (*ip) >> 2)) && inp->inp_options)
459		    || (ip->ip_len > m->m_pkthdr.len)
460		    || (ip->ip_len < (ip->ip_hl << 2))) {
461			INP_RUNLOCK(inp);
462			m_freem(m);
463			return (EINVAL);
464		}
465		if (ip->ip_id == 0)
466			ip->ip_id = ip_newid();
467
468		/*
469		 * XXX prevent ip_output from overwriting header fields.
470		 */
471		flags |= IP_RAWOUTPUT;
472		IPSTAT_INC(ips_rawout);
473	}
474
475	if (inp->inp_flags & INP_ONESBCAST)
476		flags |= IP_SENDONES;
477
478#ifdef MAC
479	mac_inpcb_create_mbuf(inp, m);
480#endif
481
482	error = ip_output(m, inp->inp_options, NULL, flags,
483	    inp->inp_moptions, inp);
484	INP_RUNLOCK(inp);
485	return (error);
486}
487
488/*
489 * Raw IP socket option processing.
490 *
491 * IMPORTANT NOTE regarding access control: Traditionally, raw sockets could
492 * only be created by a privileged process, and as such, socket option
493 * operations to manage system properties on any raw socket were allowed to
494 * take place without explicit additional access control checks.  However,
495 * raw sockets can now also be created in jail(), and therefore explicit
496 * checks are now required.  Likewise, raw sockets can be used by a process
497 * after it gives up privilege, so some caution is required.  For options
498 * passed down to the IP layer via ip_ctloutput(), checks are assumed to be
499 * performed in ip_ctloutput() and therefore no check occurs here.
500 * Unilaterally checking priv_check() here breaks normal IP socket option
501 * operations on raw sockets.
502 *
503 * When adding new socket options here, make sure to add access control
504 * checks here as necessary.
505 */
506int
507rip_ctloutput(struct socket *so, struct sockopt *sopt)
508{
509	struct	inpcb *inp = sotoinpcb(so);
510	int	error, optval;
511
512	if (sopt->sopt_level != IPPROTO_IP) {
513		if ((sopt->sopt_level == SOL_SOCKET) &&
514		    (sopt->sopt_name == SO_SETFIB)) {
515			inp->inp_inc.inc_fibnum = so->so_fibnum;
516			return (0);
517		}
518		return (EINVAL);
519	}
520
521	error = 0;
522	switch (sopt->sopt_dir) {
523	case SOPT_GET:
524		switch (sopt->sopt_name) {
525		case IP_HDRINCL:
526			optval = inp->inp_flags & INP_HDRINCL;
527			error = sooptcopyout(sopt, &optval, sizeof optval);
528			break;
529
530		case IP_FW_ADD:	/* ADD actually returns the body... */
531		case IP_FW_GET:
532		case IP_FW_TABLE_GETSIZE:
533		case IP_FW_TABLE_LIST:
534		case IP_FW_NAT_GET_CONFIG:
535		case IP_FW_NAT_GET_LOG:
536			if (ip_fw_ctl_ptr != NULL)
537				error = ip_fw_ctl_ptr(sopt);
538			else
539				error = ENOPROTOOPT;
540			break;
541
542		case IP_DUMMYNET_GET:
543			if (ip_dn_ctl_ptr != NULL)
544				error = ip_dn_ctl_ptr(sopt);
545			else
546				error = ENOPROTOOPT;
547			break ;
548
549		case MRT_INIT:
550		case MRT_DONE:
551		case MRT_ADD_VIF:
552		case MRT_DEL_VIF:
553		case MRT_ADD_MFC:
554		case MRT_DEL_MFC:
555		case MRT_VERSION:
556		case MRT_ASSERT:
557		case MRT_API_SUPPORT:
558		case MRT_API_CONFIG:
559		case MRT_ADD_BW_UPCALL:
560		case MRT_DEL_BW_UPCALL:
561			error = priv_check(curthread, PRIV_NETINET_MROUTE);
562			if (error != 0)
563				return (error);
564			error = ip_mrouter_get ? ip_mrouter_get(so, sopt) :
565				EOPNOTSUPP;
566			break;
567
568		default:
569			error = ip_ctloutput(so, sopt);
570			break;
571		}
572		break;
573
574	case SOPT_SET:
575		switch (sopt->sopt_name) {
576		case IP_HDRINCL:
577			error = sooptcopyin(sopt, &optval, sizeof optval,
578					    sizeof optval);
579			if (error)
580				break;
581			if (optval)
582				inp->inp_flags |= INP_HDRINCL;
583			else
584				inp->inp_flags &= ~INP_HDRINCL;
585			break;
586
587		case IP_FW_ADD:
588		case IP_FW_DEL:
589		case IP_FW_FLUSH:
590		case IP_FW_ZERO:
591		case IP_FW_RESETLOG:
592		case IP_FW_TABLE_ADD:
593		case IP_FW_TABLE_DEL:
594		case IP_FW_TABLE_FLUSH:
595		case IP_FW_NAT_CFG:
596		case IP_FW_NAT_DEL:
597			if (ip_fw_ctl_ptr != NULL)
598				error = ip_fw_ctl_ptr(sopt);
599			else
600				error = ENOPROTOOPT;
601			break;
602
603		case IP_DUMMYNET_CONFIGURE:
604		case IP_DUMMYNET_DEL:
605		case IP_DUMMYNET_FLUSH:
606			if (ip_dn_ctl_ptr != NULL)
607				error = ip_dn_ctl_ptr(sopt);
608			else
609				error = ENOPROTOOPT ;
610			break ;
611
612		case IP_RSVP_ON:
613			error = priv_check(curthread, PRIV_NETINET_MROUTE);
614			if (error != 0)
615				return (error);
616			error = ip_rsvp_init(so);
617			break;
618
619		case IP_RSVP_OFF:
620			error = priv_check(curthread, PRIV_NETINET_MROUTE);
621			if (error != 0)
622				return (error);
623			error = ip_rsvp_done();
624			break;
625
626		case IP_RSVP_VIF_ON:
627		case IP_RSVP_VIF_OFF:
628			error = priv_check(curthread, PRIV_NETINET_MROUTE);
629			if (error != 0)
630				return (error);
631			error = ip_rsvp_vif ?
632				ip_rsvp_vif(so, sopt) : EINVAL;
633			break;
634
635		case MRT_INIT:
636		case MRT_DONE:
637		case MRT_ADD_VIF:
638		case MRT_DEL_VIF:
639		case MRT_ADD_MFC:
640		case MRT_DEL_MFC:
641		case MRT_VERSION:
642		case MRT_ASSERT:
643		case MRT_API_SUPPORT:
644		case MRT_API_CONFIG:
645		case MRT_ADD_BW_UPCALL:
646		case MRT_DEL_BW_UPCALL:
647			error = priv_check(curthread, PRIV_NETINET_MROUTE);
648			if (error != 0)
649				return (error);
650			error = ip_mrouter_set ? ip_mrouter_set(so, sopt) :
651					EOPNOTSUPP;
652			break;
653
654		default:
655			error = ip_ctloutput(so, sopt);
656			break;
657		}
658		break;
659	}
660
661	return (error);
662}
663
664/*
665 * This function exists solely to receive the PRC_IFDOWN messages which are
666 * sent by if_down().  It looks for an ifaddr whose ifa_addr is sa, and calls
667 * in_ifadown() to remove all routes corresponding to that address.  It also
668 * receives the PRC_IFUP messages from if_up() and reinstalls the interface
669 * routes.
670 */
671void
672rip_ctlinput(int cmd, struct sockaddr *sa, void *vip)
673{
674	INIT_VNET_INET(curvnet);
675	struct in_ifaddr *ia;
676	struct ifnet *ifp;
677	int err;
678	int flags;
679
680	switch (cmd) {
681	case PRC_IFDOWN:
682		TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
683			if (ia->ia_ifa.ifa_addr == sa
684			    && (ia->ia_flags & IFA_ROUTE)) {
685				/*
686				 * in_ifscrub kills the interface route.
687				 */
688				in_ifscrub(ia->ia_ifp, ia);
689				/*
690				 * in_ifadown gets rid of all the rest of the
691				 * routes.  This is not quite the right thing
692				 * to do, but at least if we are running a
693				 * routing process they will come back.
694				 */
695				in_ifadown(&ia->ia_ifa, 0);
696				break;
697			}
698		}
699		break;
700
701	case PRC_IFUP:
702		TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
703			if (ia->ia_ifa.ifa_addr == sa)
704				break;
705		}
706		if (ia == 0 || (ia->ia_flags & IFA_ROUTE))
707			return;
708		flags = RTF_UP;
709		ifp = ia->ia_ifa.ifa_ifp;
710
711		if ((ifp->if_flags & IFF_LOOPBACK)
712		    || (ifp->if_flags & IFF_POINTOPOINT))
713			flags |= RTF_HOST;
714
715		err = rtinit(&ia->ia_ifa, RTM_ADD, flags);
716		if (err == 0)
717			ia->ia_flags |= IFA_ROUTE;
718		break;
719	}
720}
721
/*
 * Send and receive buffer sizes that rip_attach() passes to soreserve()
 * for every new raw socket.  Both are exported read-write through sysctl
 * below.
 */
722u_long	rip_sendspace = 9216;
723u_long	rip_recvspace = 9216;
724
725SYSCTL_ULONG(_net_inet_raw, OID_AUTO, maxdgram, CTLFLAG_RW,
726    &rip_sendspace, 0, "Maximum outgoing raw IP datagram size");
727SYSCTL_ULONG(_net_inet_raw, OID_AUTO, recvspace, CTLFLAG_RW,
728    &rip_recvspace, 0, "Maximum space for incoming raw IP datagrams");
729
730static int
731rip_attach(struct socket *so, int proto, struct thread *td)
732{
733	INIT_VNET_INET(so->so_vnet);
734	struct inpcb *inp;
735	int error;
736
737	inp = sotoinpcb(so);
738	KASSERT(inp == NULL, ("rip_attach: inp != NULL"));
739
740	error = priv_check(td, PRIV_NETINET_RAW);
741	if (error)
742		return (error);
743	if (proto >= IPPROTO_MAX || proto < 0)
744		return EPROTONOSUPPORT;
745	error = soreserve(so, rip_sendspace, rip_recvspace);
746	if (error)
747		return (error);
748	INP_INFO_WLOCK(&V_ripcbinfo);
749	error = in_pcballoc(so, &V_ripcbinfo);
750	if (error) {
751		INP_INFO_WUNLOCK(&V_ripcbinfo);
752		return (error);
753	}
754	inp = (struct inpcb *)so->so_pcb;
755	inp->inp_vflag |= INP_IPV4;
756	inp->inp_ip_p = proto;
757	inp->inp_ip_ttl = V_ip_defttl;
758	rip_inshash(inp);
759	INP_INFO_WUNLOCK(&V_ripcbinfo);
760	INP_WUNLOCK(inp);
761	return (0);
762}
763
764static void
765rip_detach(struct socket *so)
766{
767	INIT_VNET_INET(so->so_vnet);
768	struct inpcb *inp;
769
770	inp = sotoinpcb(so);
771	KASSERT(inp != NULL, ("rip_detach: inp == NULL"));
772	KASSERT(inp->inp_faddr.s_addr == INADDR_ANY,
773	    ("rip_detach: not closed"));
774
775	INP_INFO_WLOCK(&V_ripcbinfo);
776	INP_WLOCK(inp);
777	rip_delhash(inp);
778	if (so == V_ip_mrouter && ip_mrouter_done)
779		ip_mrouter_done();
780	if (ip_rsvp_force_done)
781		ip_rsvp_force_done(so);
782	if (so == V_ip_rsvpd)
783		ip_rsvp_done();
784	in_pcbdetach(inp);
785	in_pcbfree(inp);
786	INP_INFO_WUNLOCK(&V_ripcbinfo);
787}
788
789static void
790rip_dodisconnect(struct socket *so, struct inpcb *inp)
791{
792
793	INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo);
794	INP_WLOCK_ASSERT(inp);
795
796	rip_delhash(inp);
797	inp->inp_faddr.s_addr = INADDR_ANY;
798	rip_inshash(inp);
799	SOCK_LOCK(so);
800	so->so_state &= ~SS_ISCONNECTED;
801	SOCK_UNLOCK(so);
802}
803
804static void
805rip_abort(struct socket *so)
806{
807	INIT_VNET_INET(so->so_vnet);
808	struct inpcb *inp;
809
810	inp = sotoinpcb(so);
811	KASSERT(inp != NULL, ("rip_abort: inp == NULL"));
812
813	INP_INFO_WLOCK(&V_ripcbinfo);
814	INP_WLOCK(inp);
815	rip_dodisconnect(so, inp);
816	INP_WUNLOCK(inp);
817	INP_INFO_WUNLOCK(&V_ripcbinfo);
818}
819
820static void
821rip_close(struct socket *so)
822{
823	INIT_VNET_INET(so->so_vnet);
824	struct inpcb *inp;
825
826	inp = sotoinpcb(so);
827	KASSERT(inp != NULL, ("rip_close: inp == NULL"));
828
829	INP_INFO_WLOCK(&V_ripcbinfo);
830	INP_WLOCK(inp);
831	rip_dodisconnect(so, inp);
832	INP_WUNLOCK(inp);
833	INP_INFO_WUNLOCK(&V_ripcbinfo);
834}
835
836static int
837rip_disconnect(struct socket *so)
838{
839	INIT_VNET_INET(so->so_vnet);
840	struct inpcb *inp;
841
842	if ((so->so_state & SS_ISCONNECTED) == 0)
843		return (ENOTCONN);
844
845	inp = sotoinpcb(so);
846	KASSERT(inp != NULL, ("rip_disconnect: inp == NULL"));
847
848	INP_INFO_WLOCK(&V_ripcbinfo);
849	INP_WLOCK(inp);
850	rip_dodisconnect(so, inp);
851	INP_WUNLOCK(inp);
852	INP_INFO_WUNLOCK(&V_ripcbinfo);
853	return (0);
854}
855
856static int
857rip_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
858{
859	INIT_VNET_NET(so->so_vnet);
860	INIT_VNET_INET(so->so_vnet);
861	struct sockaddr_in *addr = (struct sockaddr_in *)nam;
862	struct inpcb *inp;
863	int error;
864
865	if (nam->sa_len != sizeof(*addr))
866		return (EINVAL);
867
868	error = prison_check_ip4(td->td_ucred, &addr->sin_addr);
869	if (error != 0)
870		return (error);
871
872	inp = sotoinpcb(so);
873	KASSERT(inp != NULL, ("rip_bind: inp == NULL"));
874
875	if (TAILQ_EMPTY(&V_ifnet) ||
876	    (addr->sin_family != AF_INET && addr->sin_family != AF_IMPLINK) ||
877	    (addr->sin_addr.s_addr &&
878	     (inp->inp_flags & INP_BINDANY) == 0 &&
879	     ifa_ifwithaddr((struct sockaddr *)addr) == NULL))
880		return (EADDRNOTAVAIL);
881
882	INP_INFO_WLOCK(&V_ripcbinfo);
883	INP_WLOCK(inp);
884	rip_delhash(inp);
885	inp->inp_laddr = addr->sin_addr;
886	rip_inshash(inp);
887	INP_WUNLOCK(inp);
888	INP_INFO_WUNLOCK(&V_ripcbinfo);
889	return (0);
890}
891
892static int
893rip_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
894{
895	INIT_VNET_NET(so->so_vnet);
896	INIT_VNET_INET(so->so_vnet);
897	struct sockaddr_in *addr = (struct sockaddr_in *)nam;
898	struct inpcb *inp;
899
900	if (nam->sa_len != sizeof(*addr))
901		return (EINVAL);
902	if (TAILQ_EMPTY(&V_ifnet))
903		return (EADDRNOTAVAIL);
904	if (addr->sin_family != AF_INET && addr->sin_family != AF_IMPLINK)
905		return (EAFNOSUPPORT);
906
907	inp = sotoinpcb(so);
908	KASSERT(inp != NULL, ("rip_connect: inp == NULL"));
909
910	INP_INFO_WLOCK(&V_ripcbinfo);
911	INP_WLOCK(inp);
912	rip_delhash(inp);
913	inp->inp_faddr = addr->sin_addr;
914	rip_inshash(inp);
915	soisconnected(so);
916	INP_WUNLOCK(inp);
917	INP_INFO_WUNLOCK(&V_ripcbinfo);
918	return (0);
919}
920
921static int
922rip_shutdown(struct socket *so)
923{
924	struct inpcb *inp;
925
926	inp = sotoinpcb(so);
927	KASSERT(inp != NULL, ("rip_shutdown: inp == NULL"));
928
929	INP_WLOCK(inp);
930	socantsendmore(so);
931	INP_WUNLOCK(inp);
932	return (0);
933}
934
935static int
936rip_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
937    struct mbuf *control, struct thread *td)
938{
939	struct inpcb *inp;
940	u_long dst;
941
942	inp = sotoinpcb(so);
943	KASSERT(inp != NULL, ("rip_send: inp == NULL"));
944
945	/*
946	 * Note: 'dst' reads below are unlocked.
947	 */
948	if (so->so_state & SS_ISCONNECTED) {
949		if (nam) {
950			m_freem(m);
951			return (EISCONN);
952		}
953		dst = inp->inp_faddr.s_addr;	/* Unlocked read. */
954	} else {
955		if (nam == NULL) {
956			m_freem(m);
957			return (ENOTCONN);
958		}
959		dst = ((struct sockaddr_in *)nam)->sin_addr.s_addr;
960	}
961	return (rip_output(m, so, dst));
962}
963
964static int
965rip_pcblist(SYSCTL_HANDLER_ARGS)
966{
967	INIT_VNET_INET(curvnet);
968	int error, i, n;
969	struct inpcb *inp, **inp_list;
970	inp_gen_t gencnt;
971	struct xinpgen xig;
972
973	/*
974	 * The process of preparing the TCB list is too time-consuming and
975	 * resource-intensive to repeat twice on every request.
976	 */
977	if (req->oldptr == 0) {
978		n = V_ripcbinfo.ipi_count;
979		req->oldidx = 2 * (sizeof xig)
980		    + (n + n/8) * sizeof(struct xinpcb);
981		return (0);
982	}
983
984	if (req->newptr != 0)
985		return (EPERM);
986
987	/*
988	 * OK, now we're committed to doing something.
989	 */
990	INP_INFO_RLOCK(&V_ripcbinfo);
991	gencnt = V_ripcbinfo.ipi_gencnt;
992	n = V_ripcbinfo.ipi_count;
993	INP_INFO_RUNLOCK(&V_ripcbinfo);
994
995	xig.xig_len = sizeof xig;
996	xig.xig_count = n;
997	xig.xig_gen = gencnt;
998	xig.xig_sogen = so_gencnt;
999	error = SYSCTL_OUT(req, &xig, sizeof xig);
1000	if (error)
1001		return (error);
1002
1003	inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK);
1004	if (inp_list == 0)
1005		return (ENOMEM);
1006
1007	INP_INFO_RLOCK(&V_ripcbinfo);
1008	for (inp = LIST_FIRST(V_ripcbinfo.ipi_listhead), i = 0; inp && i < n;
1009	     inp = LIST_NEXT(inp, inp_list)) {
1010		INP_RLOCK(inp);
1011		if (inp->inp_gencnt <= gencnt &&
1012		    cr_canseeinpcb(req->td->td_ucred, inp) == 0) {
1013			/* XXX held references? */
1014			inp_list[i++] = inp;
1015		}
1016		INP_RUNLOCK(inp);
1017	}
1018	INP_INFO_RUNLOCK(&V_ripcbinfo);
1019	n = i;
1020
1021	error = 0;
1022	for (i = 0; i < n; i++) {
1023		inp = inp_list[i];
1024		INP_RLOCK(inp);
1025		if (inp->inp_gencnt <= gencnt) {
1026			struct xinpcb xi;
1027
1028			bzero(&xi, sizeof(xi));
1029			xi.xi_len = sizeof xi;
1030			/* XXX should avoid extra copy */
1031			bcopy(inp, &xi.xi_inp, sizeof *inp);
1032			if (inp->inp_socket)
1033				sotoxsocket(inp->inp_socket, &xi.xi_socket);
1034			INP_RUNLOCK(inp);
1035			error = SYSCTL_OUT(req, &xi, sizeof xi);
1036		} else
1037			INP_RUNLOCK(inp);
1038	}
1039	if (!error) {
1040		/*
1041		 * Give the user an updated idea of our state.  If the
1042		 * generation differs from what we told her before, she knows
1043		 * that something happened while we were processing this
1044		 * request, and it might be necessary to retry.
1045		 */
1046		INP_INFO_RLOCK(&V_ripcbinfo);
1047		xig.xig_gen = V_ripcbinfo.ipi_gencnt;
1048		xig.xig_sogen = so_gencnt;
1049		xig.xig_count = V_ripcbinfo.ipi_count;
1050		INP_INFO_RUNLOCK(&V_ripcbinfo);
1051		error = SYSCTL_OUT(req, &xig, sizeof xig);
1052	}
1053	free(inp_list, M_TEMP);
1054	return (error);
1055}
1056
/* Read-only sysctl node "pcblist" under _net_inet_raw, served by rip_pcblist(). */
1057SYSCTL_PROC(_net_inet_raw, OID_AUTO/*XXX*/, pcblist, CTLFLAG_RD, 0, 0,
1058    rip_pcblist, "S,xinpcb", "List of active raw IP sockets");
1059
1060struct pr_usrreqs rip_usrreqs = {
1061	.pru_abort =		rip_abort,
1062	.pru_attach =		rip_attach,
1063	.pru_bind =		rip_bind,
1064	.pru_connect =		rip_connect,
1065	.pru_control =		in_control,
1066	.pru_detach =		rip_detach,
1067	.pru_disconnect =	rip_disconnect,
1068	.pru_peeraddr =		in_getpeeraddr,
1069	.pru_send =		rip_send,
1070	.pru_shutdown =		rip_shutdown,
1071	.pru_sockaddr =		in_getsockaddr,
1072	.pru_sosetlabel =	in_pcbsosetlabel,
1073	.pru_close =		rip_close,
1074};
1075