raw_ip.c revision 186164
117683Spst/*-
239291Sfenner * Copyright (c) 1982, 1986, 1988, 1993
317683Spst *	The Regents of the University of California.
417683Spst * All rights reserved.
517683Spst *
617683Spst * Redistribution and use in source and binary forms, with or without
717683Spst * modification, are permitted provided that the following conditions
817683Spst * are met:
917683Spst * 1. Redistributions of source code must retain the above copyright
1017683Spst *    notice, this list of conditions and the following disclaimer.
1117683Spst * 2. Redistributions in binary form must reproduce the above copyright
1217683Spst *    notice, this list of conditions and the following disclaimer in the
1317683Spst *    documentation and/or other materials provided with the distribution.
1417683Spst * 4. Neither the name of the University nor the names of its contributors
1517683Spst *    may be used to endorse or promote products derived from this software
1617683Spst *    without specific prior written permission.
1717683Spst *
1817683Spst * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
1917683Spst * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20162020Ssam * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21162020Ssam * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
2217683Spst * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
2317683Spst * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24127664Sbms * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25214518Srpaulo * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
2617683Spst * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
2717683Spst * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
2875107Sfenner * SUCH DAMAGE.
2975107Sfenner *
3075107Sfenner *	@(#)raw_ip.c	8.7 (Berkeley) 5/15/95
3175107Sfenner */
3217683Spst
33190225Srpaulo#include <sys/cdefs.h>
34183102Scsjp__FBSDID("$FreeBSD: head/sys/netinet/raw_ip.c 186164 2008-12-16 03:18:59Z kmacy $");
35190225Srpaulo
3617683Spst#include "opt_inet6.h"
3717683Spst#include "opt_ipsec.h"
38214518Srpaulo#include "opt_mac.h"
39214518Srpaulo
40214518Srpaulo#include <sys/param.h>
41214518Srpaulo#include <sys/jail.h>
42214518Srpaulo#include <sys/kernel.h>
43214518Srpaulo#include <sys/lock.h>
44214518Srpaulo#include <sys/malloc.h>
45214518Srpaulo#include <sys/mbuf.h>
46214518Srpaulo#include <sys/priv.h>
47214518Srpaulo#include <sys/proc.h>
48214518Srpaulo#include <sys/protosw.h>
49214518Srpaulo#include <sys/rwlock.h>
50214518Srpaulo#include <sys/signalvar.h>
5117683Spst#include <sys/socket.h>
52214518Srpaulo#include <sys/socketvar.h>
53214518Srpaulo#include <sys/sx.h>
54214518Srpaulo#include <sys/sysctl.h>
55127664Sbms#include <sys/systm.h>
5617683Spst#include <sys/vimage.h>
57190225Srpaulo
58190225Srpaulo#include <vm/uma.h>
59190225Srpaulo
60190225Srpaulo#include <net/if.h>
6117683Spst#include <net/route.h>
62127664Sbms#include <net/vnet.h>
6398530Sfenner
64127664Sbms#include <netinet/in.h>
6598530Sfenner#include <netinet/in_systm.h>
66190225Srpaulo#include <netinet/in_pcb.h>
67127664Sbms#include <netinet/in_var.h>
6898530Sfenner#include <netinet/ip.h>
69127664Sbms#include <netinet/ip_var.h>
70127664Sbms#include <netinet/ip_mroute.h>
71127664Sbms
72127664Sbms#include <netinet/ip_fw.h>
73127664Sbms#include <netinet/ip_dummynet.h>
74127664Sbms#include <netinet/vinet.h>
75127664Sbms
76127664Sbms#ifdef IPSEC
77127664Sbms#include <netipsec/ipsec.h>
78127664Sbms#endif /*IPSEC*/
79127664Sbms
80127664Sbms#include <security/mac/mac_framework.h>
81127664Sbms
8298530Sfenner#ifdef VIMAGE_GLOBALS
83127664Sbmsstruct	inpcbhead ripcb;
84127664Sbmsstruct	inpcbinfo ripcbinfo;
85147894Ssam#endif
86127664Sbms
8717683Spst/* control hooks for ipfw and dummynet */
88127664Sbmsip_fw_ctl_t *ip_fw_ctl_ptr = NULL;
89127664Sbmsip_dn_ctl_t *ip_dn_ctl_ptr = NULL;
90127664Sbms
91127664Sbms/*
92127664Sbms * Hooks for multicast routing. They all default to NULL, so leave them not
93127664Sbms * initialized and rely on BSS being set to 0.
94127664Sbms */
95127664Sbms
96127664Sbms/*
97127664Sbms * The socket used to communicate with the multicast routing daemon.
98127664Sbms */
99127664Sbms#ifdef VIMAGE_GLOBALS
100127664Sbmsstruct socket  *ip_mrouter;
101127664Sbms#endif
102127664Sbms
103127664Sbms/*
104214518Srpaulo * The various mrouter and rsvp functions.
105214518Srpaulo */
106127664Sbmsint (*ip_mrouter_set)(struct socket *, struct sockopt *);
107127664Sbmsint (*ip_mrouter_get)(struct socket *, struct sockopt *);
108127664Sbmsint (*ip_mrouter_done)(void);
109127664Sbmsint (*ip_mforward)(struct ip *, struct ifnet *, struct mbuf *,
110127664Sbms		   struct ip_moptions *);
111127664Sbmsint (*mrt_ioctl)(int, caddr_t, int);
11217683Spstint (*legal_vif_num)(int);
113214518Srpaulou_long (*ip_mcast_src)(int);
11417683Spst
11517683Spstvoid (*rsvp_input_p)(struct mbuf *m, int off);
11617683Spstint (*ip_rsvp_vif)(struct socket *, struct sockopt *);
11717683Spstvoid (*ip_rsvp_force_done)(struct socket *);
11817683Spst
/*
 * Hash functions
 *
 * INP_PCBHASH_RAW_SIZE is the element count handed to hashinit() by
 * rip_init().  INP_PCBHASH_RAW() adds protocol, local address and foreign
 * address, reduces the sum modulo `mask' and adds one, so the result is
 * never 0; rip_inshash() keeps bucket 0 for PCBs that are not fully
 * specified.
 */

#define	INP_PCBHASH_RAW_SIZE	256
#define	INP_PCBHASH_RAW(proto, laddr, faddr, mask)			\
	((((proto) + (laddr) + (faddr)) % (mask)) + 1)
12617683Spst
127127664Sbmsstatic void
128127664Sbmsrip_inshash(struct inpcb *inp)
129127664Sbms{
130127664Sbms	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
131214518Srpaulo	struct inpcbhead *pcbhash;
132214518Srpaulo	int hash;
133214518Srpaulo
134214518Srpaulo	INP_INFO_WLOCK_ASSERT(pcbinfo);
13517683Spst	INP_WLOCK_ASSERT(inp);
13617683Spst
13717683Spst	if (inp->inp_ip_p != 0 &&
13817683Spst	    inp->inp_laddr.s_addr != INADDR_ANY &&
139190225Srpaulo	    inp->inp_faddr.s_addr != INADDR_ANY) {
140190225Srpaulo		hash = INP_PCBHASH_RAW(inp->inp_ip_p, inp->inp_laddr.s_addr,
141190225Srpaulo		    inp->inp_faddr.s_addr, pcbinfo->ipi_hashmask);
142190225Srpaulo	} else
14356889Sfenner		hash = 0;
144190225Srpaulo	pcbhash = &pcbinfo->ipi_hashbase[hash];
145190225Srpaulo	LIST_INSERT_HEAD(pcbhash, inp, inp_hash);
146190225Srpaulo}
147190225Srpaulo
148190225Srpaulostatic void
149190225Srpaulorip_delhash(struct inpcb *inp)
150190225Srpaulo{
151190225Srpaulo
152190225Srpaulo	INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo);
153190225Srpaulo	INP_WLOCK_ASSERT(inp);
154190225Srpaulo
155190225Srpaulo	LIST_REMOVE(inp, inp_hash);
156190225Srpaulo}
157190225Srpaulo
158190225Srpaulo/*
159190225Srpaulo * Raw interface to IP protocol.
160190225Srpaulo */
161190225Srpaulo
/*
 * Initialization of the raw connection block: the zone-limit event handler,
 * the per-PCB zone initializer and rip_init() itself.
 */
165190225Srpaulostatic void
166190225Srpaulorip_zone_change(void *tag)
167190225Srpaulo{
168190225Srpaulo	INIT_VNET_INET(curvnet);
169190225Srpaulo
170190225Srpaulo	uma_zone_set_max(V_ripcbinfo.ipi_zone, maxsockets);
171190225Srpaulo}
172190225Srpaulo
/*
 * Item initializer passed to uma_zcreate() for the raw PCB zone: sets up
 * the lock embedded in the inpcb.  `size' and `flags' are not used.
 * Always returns 0.
 */
static int
rip_inpcb_init(void *mem, int size, int flags)
{
	struct inpcb *inp;

	inp = (struct inpcb *)mem;
	INP_LOCK_INIT(inp, "inp", "rawinp");
	return (0);
}
181190225Srpaulo
182190225Srpaulovoid
183190225Srpaulorip_init(void)
184190225Srpaulo{
185190225Srpaulo	INIT_VNET_INET(curvnet);
186190225Srpaulo
187127664Sbms	INP_INFO_LOCK_INIT(&V_ripcbinfo, "rip");
188162012Ssam	LIST_INIT(&V_ripcb);
189127664Sbms	V_ripcbinfo.ipi_listhead = &V_ripcb;
190127664Sbms	V_ripcbinfo.ipi_hashbase =
191190225Srpaulo	    hashinit(INP_PCBHASH_RAW_SIZE, M_PCB, &V_ripcbinfo.ipi_hashmask);
192190225Srpaulo	V_ripcbinfo.ipi_porthashbase =
193190225Srpaulo	    hashinit(1, M_PCB, &V_ripcbinfo.ipi_porthashmask);
194190225Srpaulo	V_ripcbinfo.ipi_zone = uma_zcreate("ripcb", sizeof(struct inpcb),
195190225Srpaulo	    NULL, NULL, rip_inpcb_init, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
196190225Srpaulo	uma_zone_set_max(V_ripcbinfo.ipi_zone, maxsockets);
197190225Srpaulo	EVENTHANDLER_REGISTER(maxsockets_change, rip_zone_change, NULL,
198190225Srpaulo	    EVENTHANDLER_PRI_ANY);
199127664Sbms}
200190225Srpaulo
201190225Srpaulostatic int
202190225Srpaulorip_append(struct inpcb *last, struct ip *ip, struct mbuf *n,
203190225Srpaulo    struct sockaddr_in *ripsrc)
204190225Srpaulo{
205190225Srpaulo	int policyfail = 0;
206190225Srpaulo
207190225Srpaulo	INP_RLOCK_ASSERT(last);
208190225Srpaulo
209190225Srpaulo#ifdef IPSEC
210190225Srpaulo	/* check AH/ESP integrity. */
211190225Srpaulo	if (ipsec4_in_reject(n, last)) {
212190225Srpaulo		policyfail = 1;
213190225Srpaulo	}
214190225Srpaulo#endif /* IPSEC */
215190225Srpaulo#ifdef MAC
216190225Srpaulo	if (!policyfail && mac_inpcb_check_deliver(last, n) != 0)
217190225Srpaulo		policyfail = 1;
218214518Srpaulo#endif
219214518Srpaulo	/* Check the minimum TTL for socket. */
220214518Srpaulo	if (last->inp_ip_minttl && last->inp_ip_minttl > ip->ip_ttl)
221214518Srpaulo		policyfail = 1;
222214518Srpaulo	if (!policyfail) {
223214518Srpaulo		struct mbuf *opts = NULL;
224214518Srpaulo		struct socket *so;
225190225Srpaulo
226214518Srpaulo		so = last->inp_socket;
227214518Srpaulo		if ((last->inp_flags & INP_CONTROLOPTS) ||
228214518Srpaulo		    (so->so_options & (SO_TIMESTAMP | SO_BINTIME)))
229214518Srpaulo			ip_savecontrol(last, &opts, ip, n);
230214518Srpaulo		SOCKBUF_LOCK(&so->so_rcv);
231214518Srpaulo		if (sbappendaddr_locked(&so->so_rcv,
232214518Srpaulo		    (struct sockaddr *)ripsrc, n, opts) == 0) {
233214518Srpaulo			/* should notify about lost packet */
234214518Srpaulo			m_freem(n);
235190225Srpaulo			if (opts)
236214518Srpaulo				m_freem(opts);
237214518Srpaulo			SOCKBUF_UNLOCK(&so->so_rcv);
238190225Srpaulo		} else
239190225Srpaulo			sorwakeup_locked(so);
240190225Srpaulo	} else
241190225Srpaulo		m_freem(n);
242190225Srpaulo	return (policyfail);
243190225Srpaulo}
244190225Srpaulo
245190225Srpaulo/*
246190225Srpaulo * Setup generic address and protocol structures for raw_input routine, then
247190225Srpaulo * pass them along with mbuf chain.
248190225Srpaulo */
249190225Srpaulovoid
250190225Srpaulorip_input(struct mbuf *m, int off)
251190225Srpaulo{
252190225Srpaulo	INIT_VNET_INET(curvnet);
253190225Srpaulo	struct ip *ip = mtod(m, struct ip *);
254190225Srpaulo	int proto = ip->ip_p;
255190225Srpaulo	struct inpcb *inp, *last;
256190225Srpaulo	struct sockaddr_in ripsrc;
257190225Srpaulo	int hash;
258190225Srpaulo
259190225Srpaulo	bzero(&ripsrc, sizeof(ripsrc));
260190225Srpaulo	ripsrc.sin_len = sizeof(ripsrc);
261190225Srpaulo	ripsrc.sin_family = AF_INET;
262190225Srpaulo	ripsrc.sin_addr = ip->ip_src;
263190225Srpaulo	last = NULL;
264190225Srpaulo	hash = INP_PCBHASH_RAW(proto, ip->ip_src.s_addr,
265190225Srpaulo	    ip->ip_dst.s_addr, V_ripcbinfo.ipi_hashmask);
266190225Srpaulo	INP_INFO_RLOCK(&V_ripcbinfo);
267190225Srpaulo	LIST_FOREACH(inp, &V_ripcbinfo.ipi_hashbase[hash], inp_hash) {
268190225Srpaulo		if (inp->inp_ip_p != proto)
269190225Srpaulo			continue;
270190225Srpaulo#ifdef INET6
271190225Srpaulo		/* XXX inp locking */
272190225Srpaulo		if ((inp->inp_vflag & INP_IPV4) == 0)
273190225Srpaulo			continue;
274190225Srpaulo#endif
275190225Srpaulo		if (inp->inp_laddr.s_addr != ip->ip_dst.s_addr)
276190225Srpaulo			continue;
277190225Srpaulo		if (inp->inp_faddr.s_addr != ip->ip_src.s_addr)
278190225Srpaulo			continue;
279190225Srpaulo		if (jailed(inp->inp_cred)) {
280190225Srpaulo			if (!prison_check_ip4(inp->inp_cred, &ip->ip_dst))
281190225Srpaulo				continue;
282190225Srpaulo		}
283190225Srpaulo		if (last != NULL) {
284190225Srpaulo			struct mbuf *n;
285190225Srpaulo
286190225Srpaulo			n = m_copy(m, 0, (int)M_COPYALL);
287190225Srpaulo			if (n != NULL)
288190225Srpaulo		    	    (void) rip_append(last, ip, n, &ripsrc);
289190225Srpaulo			/* XXX count dropped packet */
290190225Srpaulo			INP_RUNLOCK(last);
291190225Srpaulo		}
292190225Srpaulo		INP_RLOCK(inp);
293190225Srpaulo		last = inp;
294190225Srpaulo	}
295190225Srpaulo	LIST_FOREACH(inp, &V_ripcbinfo.ipi_hashbase[0], inp_hash) {
296190225Srpaulo		if (inp->inp_ip_p && inp->inp_ip_p != proto)
297190225Srpaulo			continue;
298190225Srpaulo#ifdef INET6
299190225Srpaulo		/* XXX inp locking */
300190225Srpaulo		if ((inp->inp_vflag & INP_IPV4) == 0)
301190225Srpaulo			continue;
302190225Srpaulo#endif
303190225Srpaulo		if (inp->inp_laddr.s_addr &&
304190225Srpaulo		    inp->inp_laddr.s_addr != ip->ip_dst.s_addr)
305190225Srpaulo			continue;
306190225Srpaulo		if (inp->inp_faddr.s_addr &&
307190225Srpaulo		    inp->inp_faddr.s_addr != ip->ip_src.s_addr)
308190225Srpaulo			continue;
309190225Srpaulo		if (jailed(inp->inp_cred)) {
310190225Srpaulo			if (!prison_check_ip4(inp->inp_cred, &ip->ip_dst))
311190225Srpaulo				continue;
312190225Srpaulo		}
313190225Srpaulo		if (last != NULL) {
314190225Srpaulo			struct mbuf *n;
315190225Srpaulo
316190225Srpaulo			n = m_copy(m, 0, (int)M_COPYALL);
317190225Srpaulo			if (n != NULL)
318190225Srpaulo				(void) rip_append(last, ip, n, &ripsrc);
319190225Srpaulo			/* XXX count dropped packet */
320190225Srpaulo			INP_RUNLOCK(last);
321190225Srpaulo		}
322190225Srpaulo		INP_RLOCK(inp);
323190225Srpaulo		last = inp;
324190225Srpaulo	}
325190225Srpaulo	INP_INFO_RUNLOCK(&V_ripcbinfo);
326190225Srpaulo	if (last != NULL) {
327190225Srpaulo		if (rip_append(last, ip, m, &ripsrc) != 0)
328190225Srpaulo			V_ipstat.ips_delivered--;
329190225Srpaulo		INP_RUNLOCK(last);
330190225Srpaulo	} else {
331190225Srpaulo		m_freem(m);
332190225Srpaulo		V_ipstat.ips_noproto++;
333190225Srpaulo		V_ipstat.ips_delivered--;
334190225Srpaulo	}
335190225Srpaulo}
336190225Srpaulo
337190225Srpaulo/*
338190225Srpaulo * Generate IP header and pass packet to ip_output.  Tack on options user may
339190225Srpaulo * have setup with control call.
340190225Srpaulo */
341190225Srpauloint
342190225Srpaulorip_output(struct mbuf *m, struct socket *so, u_long dst)
343190225Srpaulo{
344190225Srpaulo	INIT_VNET_INET(so->so_vnet);
345190225Srpaulo	struct ip *ip;
346190225Srpaulo	int error;
347190225Srpaulo	struct inpcb *inp = sotoinpcb(so);
348190225Srpaulo	int flags = ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0) |
349190225Srpaulo	    IP_ALLOWBROADCAST;
350190225Srpaulo
351190225Srpaulo	/*
352190225Srpaulo	 * If the user handed us a complete IP packet, use it.  Otherwise,
353190225Srpaulo	 * allocate an mbuf for a header and fill it in.
354190225Srpaulo	 */
355190225Srpaulo	if ((inp->inp_flags & INP_HDRINCL) == 0) {
356190225Srpaulo		if (m->m_pkthdr.len + sizeof(struct ip) > IP_MAXPACKET) {
357190225Srpaulo			m_freem(m);
358190225Srpaulo			return(EMSGSIZE);
359190225Srpaulo		}
360190225Srpaulo		M_PREPEND(m, sizeof(struct ip), M_DONTWAIT);
361190225Srpaulo		if (m == NULL)
362190225Srpaulo			return(ENOBUFS);
363190225Srpaulo
364190225Srpaulo		INP_RLOCK(inp);
365190225Srpaulo		ip = mtod(m, struct ip *);
366190225Srpaulo		ip->ip_tos = inp->inp_ip_tos;
367190225Srpaulo		if (inp->inp_flags & INP_DONTFRAG)
368190225Srpaulo			ip->ip_off = IP_DF;
369190225Srpaulo		else
370190225Srpaulo			ip->ip_off = 0;
371190225Srpaulo		ip->ip_p = inp->inp_ip_p;
372190225Srpaulo		ip->ip_len = m->m_pkthdr.len;
373190225Srpaulo		if (jailed(inp->inp_cred)) {
374190225Srpaulo			if (prison_getip4(inp->inp_cred, &ip->ip_src)) {
375190225Srpaulo				INP_RUNLOCK(inp);
376190225Srpaulo				m_freem(m);
377190225Srpaulo				return (EPERM);
378190225Srpaulo			}
379190225Srpaulo		} else {
380190225Srpaulo			ip->ip_src = inp->inp_laddr;
381190225Srpaulo		}
382190225Srpaulo		ip->ip_dst.s_addr = dst;
383190225Srpaulo		ip->ip_ttl = inp->inp_ip_ttl;
384190225Srpaulo	} else {
385190225Srpaulo		if (m->m_pkthdr.len > IP_MAXPACKET) {
386190225Srpaulo			m_freem(m);
387190225Srpaulo			return(EMSGSIZE);
388190225Srpaulo		}
389190225Srpaulo		INP_RLOCK(inp);
390190225Srpaulo		ip = mtod(m, struct ip *);
391190225Srpaulo		if (!prison_check_ip4(inp->inp_cred, &ip->ip_src)) {
392190225Srpaulo			INP_RUNLOCK(inp);
393190225Srpaulo			m_freem(m);
394190225Srpaulo			return (EPERM);
395190225Srpaulo		}
396190225Srpaulo
397190225Srpaulo		/*
398190225Srpaulo		 * Don't allow both user specified and setsockopt options,
399190225Srpaulo		 * and don't allow packet length sizes that will crash.
400190225Srpaulo		 */
401190225Srpaulo		if (((ip->ip_hl != (sizeof (*ip) >> 2)) && inp->inp_options)
402190225Srpaulo		    || (ip->ip_len > m->m_pkthdr.len)
403190225Srpaulo		    || (ip->ip_len < (ip->ip_hl << 2))) {
404190225Srpaulo			INP_RUNLOCK(inp);
405190225Srpaulo			m_freem(m);
406190225Srpaulo			return (EINVAL);
407190225Srpaulo		}
408190225Srpaulo		if (ip->ip_id == 0)
409190225Srpaulo			ip->ip_id = ip_newid();
410190225Srpaulo
411190225Srpaulo		/*
412190225Srpaulo		 * XXX prevent ip_output from overwriting header fields.
413190225Srpaulo		 */
414190225Srpaulo		flags |= IP_RAWOUTPUT;
415190225Srpaulo		V_ipstat.ips_rawout++;
416190225Srpaulo	}
417190225Srpaulo
418190225Srpaulo	if (inp->inp_flags & INP_ONESBCAST)
419190225Srpaulo		flags |= IP_SENDONES;
420190225Srpaulo
421190225Srpaulo#ifdef MAC
422190225Srpaulo	mac_inpcb_create_mbuf(inp, m);
423190225Srpaulo#endif
424190225Srpaulo
425190225Srpaulo	error = ip_output(m, inp->inp_options, NULL, flags,
426214518Srpaulo	    inp->inp_moptions, inp);
427214518Srpaulo	INP_RUNLOCK(inp);
428214518Srpaulo	return (error);
429214518Srpaulo}
430190225Srpaulo
431190225Srpaulo/*
432190225Srpaulo * Raw IP socket option processing.
433190225Srpaulo *
434190225Srpaulo * IMPORTANT NOTE regarding access control: Traditionally, raw sockets could
435190225Srpaulo * only be created by a privileged process, and as such, socket option
436190225Srpaulo * operations to manage system properties on any raw socket were allowed to
437190225Srpaulo * take place without explicit additional access control checks.  However,
438190225Srpaulo * raw sockets can now also be created in jail(), and therefore explicit
439190225Srpaulo * checks are now required.  Likewise, raw sockets can be used by a process
440190225Srpaulo * after it gives up privilege, so some caution is required.  For options
441190225Srpaulo * passed down to the IP layer via ip_ctloutput(), checks are assumed to be
442190225Srpaulo * performed in ip_ctloutput() and therefore no check occurs here.
443190225Srpaulo * Unilaterally checking priv_check() here breaks normal IP socket option
444190225Srpaulo * operations on raw sockets.
445190225Srpaulo *
446190225Srpaulo * When adding new socket options here, make sure to add access control
447190225Srpaulo * checks here as necessary.
448190225Srpaulo */
449190225Srpauloint
450190225Srpaulorip_ctloutput(struct socket *so, struct sockopt *sopt)
451190225Srpaulo{
452190225Srpaulo	struct	inpcb *inp = sotoinpcb(so);
453190225Srpaulo	int	error, optval;
454190225Srpaulo
455190225Srpaulo	if (sopt->sopt_level != IPPROTO_IP) {
456190225Srpaulo		if ((sopt->sopt_level == SOL_SOCKET) &&
457190225Srpaulo		    (sopt->sopt_name == SO_SETFIB)) {
458190225Srpaulo			inp->inp_inc.inc_fibnum = so->so_fibnum;
459190225Srpaulo			return (0);
460190225Srpaulo		}
461190225Srpaulo		return (EINVAL);
462190225Srpaulo	}
463190225Srpaulo
464190225Srpaulo	error = 0;
465190225Srpaulo	switch (sopt->sopt_dir) {
466190225Srpaulo	case SOPT_GET:
467190225Srpaulo		switch (sopt->sopt_name) {
468190225Srpaulo		case IP_HDRINCL:
469190225Srpaulo			optval = inp->inp_flags & INP_HDRINCL;
470190225Srpaulo			error = sooptcopyout(sopt, &optval, sizeof optval);
471190225Srpaulo			break;
472190225Srpaulo
473190225Srpaulo		case IP_FW_ADD:	/* ADD actually returns the body... */
474190225Srpaulo		case IP_FW_GET:
475190225Srpaulo		case IP_FW_TABLE_GETSIZE:
476190225Srpaulo		case IP_FW_TABLE_LIST:
477190225Srpaulo		case IP_FW_NAT_GET_CONFIG:
478190225Srpaulo		case IP_FW_NAT_GET_LOG:
479190225Srpaulo			if (ip_fw_ctl_ptr != NULL)
480190225Srpaulo				error = ip_fw_ctl_ptr(sopt);
481190225Srpaulo			else
482190225Srpaulo				error = ENOPROTOOPT;
483190225Srpaulo			break;
484190225Srpaulo
485190225Srpaulo		case IP_DUMMYNET_GET:
486190225Srpaulo			if (ip_dn_ctl_ptr != NULL)
487190225Srpaulo				error = ip_dn_ctl_ptr(sopt);
488190225Srpaulo			else
489190225Srpaulo				error = ENOPROTOOPT;
490190225Srpaulo			break ;
491190225Srpaulo
492190225Srpaulo		case MRT_INIT:
493190225Srpaulo		case MRT_DONE:
494190225Srpaulo		case MRT_ADD_VIF:
495190225Srpaulo		case MRT_DEL_VIF:
496190225Srpaulo		case MRT_ADD_MFC:
497190225Srpaulo		case MRT_DEL_MFC:
498190225Srpaulo		case MRT_VERSION:
499190225Srpaulo		case MRT_ASSERT:
500190225Srpaulo		case MRT_API_SUPPORT:
501190225Srpaulo		case MRT_API_CONFIG:
502190225Srpaulo		case MRT_ADD_BW_UPCALL:
503190225Srpaulo		case MRT_DEL_BW_UPCALL:
504190225Srpaulo			error = priv_check(curthread, PRIV_NETINET_MROUTE);
505190225Srpaulo			if (error != 0)
506190225Srpaulo				return (error);
507190225Srpaulo			error = ip_mrouter_get ? ip_mrouter_get(so, sopt) :
508190225Srpaulo				EOPNOTSUPP;
509190225Srpaulo			break;
510190225Srpaulo
511190225Srpaulo		default:
512190225Srpaulo			error = ip_ctloutput(so, sopt);
513190225Srpaulo			break;
514190225Srpaulo		}
515190225Srpaulo		break;
516190225Srpaulo
517190225Srpaulo	case SOPT_SET:
518190225Srpaulo		switch (sopt->sopt_name) {
519190225Srpaulo		case IP_HDRINCL:
520190225Srpaulo			error = sooptcopyin(sopt, &optval, sizeof optval,
521190225Srpaulo					    sizeof optval);
522190225Srpaulo			if (error)
523190225Srpaulo				break;
524190225Srpaulo			if (optval)
525190225Srpaulo				inp->inp_flags |= INP_HDRINCL;
526190225Srpaulo			else
527190225Srpaulo				inp->inp_flags &= ~INP_HDRINCL;
528190225Srpaulo			break;
529190225Srpaulo
530190225Srpaulo		case IP_FW_ADD:
531190225Srpaulo		case IP_FW_DEL:
532190225Srpaulo		case IP_FW_FLUSH:
533190225Srpaulo		case IP_FW_ZERO:
534190225Srpaulo		case IP_FW_RESETLOG:
535190225Srpaulo		case IP_FW_TABLE_ADD:
536190225Srpaulo		case IP_FW_TABLE_DEL:
537190225Srpaulo		case IP_FW_TABLE_FLUSH:
538190225Srpaulo		case IP_FW_NAT_CFG:
539190225Srpaulo		case IP_FW_NAT_DEL:
540190225Srpaulo			if (ip_fw_ctl_ptr != NULL)
541190225Srpaulo				error = ip_fw_ctl_ptr(sopt);
542190225Srpaulo			else
543190225Srpaulo				error = ENOPROTOOPT;
544190225Srpaulo			break;
545190225Srpaulo
546190225Srpaulo		case IP_DUMMYNET_CONFIGURE:
547190225Srpaulo		case IP_DUMMYNET_DEL:
548190225Srpaulo		case IP_DUMMYNET_FLUSH:
549190225Srpaulo			if (ip_dn_ctl_ptr != NULL)
550190225Srpaulo				error = ip_dn_ctl_ptr(sopt);
551214518Srpaulo			else
552214518Srpaulo				error = ENOPROTOOPT ;
553214518Srpaulo			break ;
554214518Srpaulo
555214518Srpaulo		case IP_RSVP_ON:
556190225Srpaulo			error = priv_check(curthread, PRIV_NETINET_MROUTE);
557190225Srpaulo			if (error != 0)
558190225Srpaulo				return (error);
559190225Srpaulo			error = ip_rsvp_init(so);
560214518Srpaulo			break;
561214518Srpaulo
562214518Srpaulo		case IP_RSVP_OFF:
563214518Srpaulo			error = priv_check(curthread, PRIV_NETINET_MROUTE);
564214518Srpaulo			if (error != 0)
565190225Srpaulo				return (error);
566190225Srpaulo			error = ip_rsvp_done();
567190225Srpaulo			break;
568190225Srpaulo
569190225Srpaulo		case IP_RSVP_VIF_ON:
570190225Srpaulo		case IP_RSVP_VIF_OFF:
571190225Srpaulo			error = priv_check(curthread, PRIV_NETINET_MROUTE);
572190225Srpaulo			if (error != 0)
573190225Srpaulo				return (error);
574190225Srpaulo			error = ip_rsvp_vif ?
575190225Srpaulo				ip_rsvp_vif(so, sopt) : EINVAL;
576190225Srpaulo			break;
577190225Srpaulo
578190225Srpaulo		case MRT_INIT:
579190225Srpaulo		case MRT_DONE:
580190225Srpaulo		case MRT_ADD_VIF:
581190225Srpaulo		case MRT_DEL_VIF:
582190225Srpaulo		case MRT_ADD_MFC:
583190225Srpaulo		case MRT_DEL_MFC:
584190225Srpaulo		case MRT_VERSION:
585190225Srpaulo		case MRT_ASSERT:
586190225Srpaulo		case MRT_API_SUPPORT:
587190225Srpaulo		case MRT_API_CONFIG:
588190225Srpaulo		case MRT_ADD_BW_UPCALL:
589190225Srpaulo		case MRT_DEL_BW_UPCALL:
590190225Srpaulo			error = priv_check(curthread, PRIV_NETINET_MROUTE);
591190225Srpaulo			if (error != 0)
592190225Srpaulo				return (error);
593190225Srpaulo			error = ip_mrouter_set ? ip_mrouter_set(so, sopt) :
594190225Srpaulo					EOPNOTSUPP;
595190225Srpaulo			break;
596190225Srpaulo
597190225Srpaulo		default:
598190225Srpaulo			error = ip_ctloutput(so, sopt);
599190225Srpaulo			break;
600190225Srpaulo		}
601190225Srpaulo		break;
602190225Srpaulo	}
603190225Srpaulo
604190225Srpaulo	return (error);
605190225Srpaulo}
606190225Srpaulo
607190225Srpaulo/*
608190225Srpaulo * This function exists solely to receive the PRC_IFDOWN messages which are
609190225Srpaulo * sent by if_down().  It looks for an ifaddr whose ifa_addr is sa, and calls
610190225Srpaulo * in_ifadown() to remove all routes corresponding to that address.  It also
611190225Srpaulo * receives the PRC_IFUP messages from if_up() and reinstalls the interface
612190225Srpaulo * routes.
613190225Srpaulo */
614190225Srpaulovoid
615190225Srpaulorip_ctlinput(int cmd, struct sockaddr *sa, void *vip)
616190225Srpaulo{
617190225Srpaulo	INIT_VNET_INET(curvnet);
618190225Srpaulo	struct in_ifaddr *ia;
619190225Srpaulo	struct ifnet *ifp;
620190225Srpaulo	int err;
621190225Srpaulo	int flags;
622190225Srpaulo
623190225Srpaulo	switch (cmd) {
624190225Srpaulo	case PRC_IFDOWN:
625190225Srpaulo		TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
626190225Srpaulo			if (ia->ia_ifa.ifa_addr == sa
627190225Srpaulo			    && (ia->ia_flags & IFA_ROUTE)) {
628190225Srpaulo				/*
629190225Srpaulo				 * in_ifscrub kills the interface route.
630190225Srpaulo				 */
631190225Srpaulo				in_ifscrub(ia->ia_ifp, ia);
632190225Srpaulo				/*
633190225Srpaulo				 * in_ifadown gets rid of all the rest of the
634190225Srpaulo				 * routes.  This is not quite the right thing
635190225Srpaulo				 * to do, but at least if we are running a
636190225Srpaulo				 * routing process they will come back.
637190225Srpaulo				 */
638190225Srpaulo				in_ifadown(&ia->ia_ifa, 0);
639190225Srpaulo				break;
640190225Srpaulo			}
641190225Srpaulo		}
642190225Srpaulo		break;
643190225Srpaulo
644190225Srpaulo	case PRC_IFUP:
645190225Srpaulo		TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
646190225Srpaulo			if (ia->ia_ifa.ifa_addr == sa)
647190225Srpaulo				break;
648190225Srpaulo		}
649190225Srpaulo		if (ia == 0 || (ia->ia_flags & IFA_ROUTE))
650190225Srpaulo			return;
651190225Srpaulo		flags = RTF_UP;
652190225Srpaulo		ifp = ia->ia_ifa.ifa_ifp;
653190225Srpaulo
654190225Srpaulo		if ((ifp->if_flags & IFF_LOOPBACK)
655190225Srpaulo		    || (ifp->if_flags & IFF_POINTOPOINT))
656190225Srpaulo			flags |= RTF_HOST;
657190225Srpaulo
658190225Srpaulo		err = rtinit(&ia->ia_ifa, RTM_ADD, flags);
659190225Srpaulo		if (err == 0)
660190225Srpaulo			ia->ia_flags |= IFA_ROUTE;
661190225Srpaulo		break;
662190225Srpaulo	}
663190225Srpaulo}
664190225Srpaulo
665190225Srpaulou_long	rip_sendspace = 9216;
666190225Srpaulou_long	rip_recvspace = 9216;
667190225Srpaulo
668190225SrpauloSYSCTL_ULONG(_net_inet_raw, OID_AUTO, maxdgram, CTLFLAG_RW,
669190225Srpaulo    &rip_sendspace, 0, "Maximum outgoing raw IP datagram size");
670190225SrpauloSYSCTL_ULONG(_net_inet_raw, OID_AUTO, recvspace, CTLFLAG_RW,
671190225Srpaulo    &rip_recvspace, 0, "Maximum space for incoming raw IP datagrams");
672190225Srpaulo
673190225Srpaulostatic int
674190225Srpaulorip_attach(struct socket *so, int proto, struct thread *td)
675190225Srpaulo{
676190225Srpaulo	INIT_VNET_INET(so->so_vnet);
677190225Srpaulo	struct inpcb *inp;
678190225Srpaulo	int error;
679190225Srpaulo
680190225Srpaulo	inp = sotoinpcb(so);
681190225Srpaulo	KASSERT(inp == NULL, ("rip_attach: inp != NULL"));
682190225Srpaulo
683190225Srpaulo	error = priv_check(td, PRIV_NETINET_RAW);
684190225Srpaulo	if (error)
685190225Srpaulo		return (error);
686190225Srpaulo	if (proto >= IPPROTO_MAX || proto < 0)
687190225Srpaulo		return EPROTONOSUPPORT;
688190225Srpaulo	error = soreserve(so, rip_sendspace, rip_recvspace);
689190225Srpaulo	if (error)
690190225Srpaulo		return (error);
691190225Srpaulo	INP_INFO_WLOCK(&V_ripcbinfo);
692190225Srpaulo	error = in_pcballoc(so, &V_ripcbinfo);
693190225Srpaulo	if (error) {
694190225Srpaulo		INP_INFO_WUNLOCK(&V_ripcbinfo);
695190225Srpaulo		return (error);
696190225Srpaulo	}
697190225Srpaulo	inp = (struct inpcb *)so->so_pcb;
698190225Srpaulo	inp->inp_vflag |= INP_IPV4;
699190225Srpaulo	inp->inp_ip_p = proto;
700190225Srpaulo	inp->inp_ip_ttl = V_ip_defttl;
701190225Srpaulo	rip_inshash(inp);
702190225Srpaulo	INP_INFO_WUNLOCK(&V_ripcbinfo);
703190225Srpaulo	INP_WUNLOCK(inp);
704190225Srpaulo	return (0);
705190225Srpaulo}
706190225Srpaulo
707190225Srpaulostatic void
708190225Srpaulorip_detach(struct socket *so)
709190225Srpaulo{
710190225Srpaulo	INIT_VNET_INET(so->so_vnet);
711190225Srpaulo	struct inpcb *inp;
712190225Srpaulo
713190225Srpaulo	inp = sotoinpcb(so);
714190225Srpaulo	KASSERT(inp != NULL, ("rip_detach: inp == NULL"));
715190225Srpaulo	KASSERT(inp->inp_faddr.s_addr == INADDR_ANY,
716190225Srpaulo	    ("rip_detach: not closed"));
717190225Srpaulo
718190225Srpaulo	INP_INFO_WLOCK(&V_ripcbinfo);
719190225Srpaulo	INP_WLOCK(inp);
720190225Srpaulo	rip_delhash(inp);
721190225Srpaulo	if (so == V_ip_mrouter && ip_mrouter_done)
722190225Srpaulo		ip_mrouter_done();
723190225Srpaulo	if (ip_rsvp_force_done)
724190225Srpaulo		ip_rsvp_force_done(so);
725190225Srpaulo	if (so == V_ip_rsvpd)
726190225Srpaulo		ip_rsvp_done();
727190225Srpaulo	in_pcbdetach(inp);
728190225Srpaulo	in_pcbfree(inp);
729190225Srpaulo	INP_INFO_WUNLOCK(&V_ripcbinfo);
730190225Srpaulo}
731190225Srpaulo
732190225Srpaulostatic void
733190225Srpaulorip_dodisconnect(struct socket *so, struct inpcb *inp)
734190225Srpaulo{
735190225Srpaulo
736190225Srpaulo	INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo);
737190225Srpaulo	INP_WLOCK_ASSERT(inp);
738190225Srpaulo
739190225Srpaulo	rip_delhash(inp);
740190225Srpaulo	inp->inp_faddr.s_addr = INADDR_ANY;
741127664Sbms	rip_inshash(inp);
74217683Spst	SOCK_LOCK(so);
74317683Spst	so->so_state &= ~SS_ISCONNECTED;
74417683Spst	SOCK_UNLOCK(so);
74598530Sfenner}
74698530Sfenner
74798530Sfennerstatic void
74898530Sfennerrip_abort(struct socket *so)
74998530Sfenner{
75098530Sfenner	INIT_VNET_INET(so->so_vnet);
75198530Sfenner	struct inpcb *inp;
75298530Sfenner
75398530Sfenner	inp = sotoinpcb(so);
75498530Sfenner	KASSERT(inp != NULL, ("rip_abort: inp == NULL"));
75598530Sfenner
75698530Sfenner	INP_INFO_WLOCK(&V_ripcbinfo);
75798530Sfenner	INP_WLOCK(inp);
75817683Spst	rip_dodisconnect(so, inp);
75975107Sfenner	INP_WUNLOCK(inp);
76075107Sfenner	INP_INFO_WUNLOCK(&V_ripcbinfo);
761190225Srpaulo}
76217683Spst
76317683Spststatic void
76417683Spstrip_close(struct socket *so)
76517683Spst{
766214518Srpaulo	INIT_VNET_INET(so->so_vnet);
76717683Spst	struct inpcb *inp;
76817683Spst
76917683Spst	inp = sotoinpcb(so);
770127664Sbms	KASSERT(inp != NULL, ("rip_close: inp == NULL"));
771127664Sbms
77217683Spst	INP_INFO_WLOCK(&V_ripcbinfo);
77317683Spst	INP_WLOCK(inp);
77417683Spst	rip_dodisconnect(so, inp);
77517683Spst	INP_WUNLOCK(inp);
776146768Ssam	INP_INFO_WUNLOCK(&V_ripcbinfo);
777146768Ssam}
778146768Ssam
779146768Ssamstatic int
780190225Srpaulorip_disconnect(struct socket *so)
781190225Srpaulo{
782190225Srpaulo	INIT_VNET_INET(so->so_vnet);
78317683Spst	struct inpcb *inp;
78417683Spst
785127664Sbms	if ((so->so_state & SS_ISCONNECTED) == 0)
786127664Sbms		return (ENOTCONN);
787127664Sbms
788127664Sbms	inp = sotoinpcb(so);
789127664Sbms	KASSERT(inp != NULL, ("rip_disconnect: inp == NULL"));
790127664Sbms
791190225Srpaulo	INP_INFO_WLOCK(&V_ripcbinfo);
792190225Srpaulo	INP_WLOCK(inp);
793127664Sbms	rip_dodisconnect(so, inp);
794127664Sbms	INP_WUNLOCK(inp);
795190225Srpaulo	INP_INFO_WUNLOCK(&V_ripcbinfo);
796127664Sbms	return (0);
79717683Spst}
79817683Spst
799183102Scsjpstatic int
800183102Scsjprip_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
801183102Scsjp{
802183102Scsjp	INIT_VNET_NET(so->so_vnet);
803183102Scsjp	INIT_VNET_INET(so->so_vnet);
804183102Scsjp	struct sockaddr_in *addr = (struct sockaddr_in *)nam;
805183102Scsjp	struct inpcb *inp;
806183102Scsjp
807190225Srpaulo	if (nam->sa_len != sizeof(*addr))
808190225Srpaulo		return (EINVAL);
809183102Scsjp
810183102Scsjp	if (!prison_check_ip4(td->td_ucred, &addr->sin_addr))
811183102Scsjp		return (EADDRNOTAVAIL);
812183102Scsjp
813183102Scsjp	if (TAILQ_EMPTY(&V_ifnet) ||
814183102Scsjp	    (addr->sin_family != AF_INET && addr->sin_family != AF_IMPLINK) ||
815190225Srpaulo	    (addr->sin_addr.s_addr &&
816183102Scsjp	     ifa_ifwithaddr((struct sockaddr *)addr) == 0))
817183102Scsjp		return (EADDRNOTAVAIL);
818190225Srpaulo
819183102Scsjp	inp = sotoinpcb(so);
820190225Srpaulo	KASSERT(inp != NULL, ("rip_bind: inp == NULL"));
82117683Spst
82217683Spst	INP_INFO_WLOCK(&V_ripcbinfo);
82317683Spst	INP_WLOCK(inp);
82417683Spst	rip_delhash(inp);
82517683Spst	inp->inp_laddr = addr->sin_addr;
82617683Spst	rip_inshash(inp);
82717683Spst	INP_WUNLOCK(inp);
828127664Sbms	INP_INFO_WUNLOCK(&V_ripcbinfo);
829127664Sbms	return (0);
830127664Sbms}
831127664Sbms
832127664Sbmsstatic int
833127664Sbmsrip_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
834127664Sbms{
835190225Srpaulo	INIT_VNET_NET(so->so_vnet);
836127664Sbms	INIT_VNET_INET(so->so_vnet);
837127664Sbms	struct sockaddr_in *addr = (struct sockaddr_in *)nam;
838190225Srpaulo	struct inpcb *inp;
839190225Srpaulo
840190225Srpaulo	if (nam->sa_len != sizeof(*addr))
841190225Srpaulo		return (EINVAL);
842127664Sbms	if (TAILQ_EMPTY(&V_ifnet))
843127664Sbms		return (EADDRNOTAVAIL);
844190225Srpaulo	if (addr->sin_family != AF_INET && addr->sin_family != AF_IMPLINK)
845127664Sbms		return (EAFNOSUPPORT);
846127664Sbms
847127664Sbms	inp = sotoinpcb(so);
848127664Sbms	KASSERT(inp != NULL, ("rip_connect: inp == NULL"));
849127664Sbms
850127664Sbms	INP_INFO_WLOCK(&V_ripcbinfo);
851127664Sbms	INP_WLOCK(inp);
852190225Srpaulo	rip_delhash(inp);
853190225Srpaulo	inp->inp_faddr = addr->sin_addr;
85417683Spst	rip_inshash(inp);
85517683Spst	soisconnected(so);
856214518Srpaulo	INP_WUNLOCK(inp);
857214518Srpaulo	INP_INFO_WUNLOCK(&V_ripcbinfo);
858214518Srpaulo	return (0);
859214518Srpaulo}
860214518Srpaulo
861214518Srpaulostatic int
862214518Srpaulorip_shutdown(struct socket *so)
863214518Srpaulo{
864214518Srpaulo	struct inpcb *inp;
865214518Srpaulo
866214518Srpaulo	inp = sotoinpcb(so);
867214518Srpaulo	KASSERT(inp != NULL, ("rip_shutdown: inp == NULL"));
868214518Srpaulo
869214518Srpaulo	INP_WLOCK(inp);
870214518Srpaulo	socantsendmore(so);
871214518Srpaulo	INP_WUNLOCK(inp);
87217683Spst	return (0);
87317683Spst}
87417683Spst
87517683Spststatic int
87617683Spstrip_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
87717683Spst    struct mbuf *control, struct thread *td)
87817683Spst{
87917683Spst	struct inpcb *inp;
88017683Spst	u_long dst;
88117683Spst
88217683Spst	inp = sotoinpcb(so);
88317683Spst	KASSERT(inp != NULL, ("rip_send: inp == NULL"));
88417683Spst
88517683Spst	/*
88675107Sfenner	 * Note: 'dst' reads below are unlocked.
88775107Sfenner	 */
888190225Srpaulo	if (so->so_state & SS_ISCONNECTED) {
88917683Spst		if (nam) {
89017683Spst			m_freem(m);
89117683Spst			return (EISCONN);
89217683Spst		}
89317683Spst		dst = inp->inp_faddr.s_addr;	/* Unlocked read. */
89417683Spst	} else {
89517683Spst		if (nam == NULL) {
89617683Spst			m_freem(m);
89717683Spst			return (ENOTCONN);
89817683Spst		}
899146768Ssam		dst = ((struct sockaddr_in *)nam)->sin_addr.s_addr;
900146768Ssam	}
901146768Ssam	return (rip_output(m, so, dst));
90217683Spst}
90317683Spst
904127664Sbmsstatic int
905127664Sbmsrip_pcblist(SYSCTL_HANDLER_ARGS)
906127664Sbms{
907127664Sbms	INIT_VNET_INET(curvnet);
908190225Srpaulo	int error, i, n;
909190225Srpaulo	struct inpcb *inp, **inp_list;
910190225Srpaulo	inp_gen_t gencnt;
911190225Srpaulo	struct xinpgen xig;
912190225Srpaulo
913190225Srpaulo	/*
914127664Sbms	 * The process of preparing the TCB list is too time-consuming and
915127664Sbms	 * resource-intensive to repeat twice on every request.
916127664Sbms	 */
917127664Sbms	if (req->oldptr == 0) {
918190225Srpaulo		n = V_ripcbinfo.ipi_count;
919127664Sbms		req->oldidx = 2 * (sizeof xig)
920127664Sbms		    + (n + n/8) * sizeof(struct xinpcb);
921127664Sbms		return (0);
922127664Sbms	}
923127664Sbms
924127664Sbms	if (req->newptr != 0)
925127664Sbms		return (EPERM);
92617683Spst
92717683Spst	/*
928146768Ssam	 * OK, now we're committed to doing something.
92917683Spst	 */
930127664Sbms	INP_INFO_RLOCK(&V_ripcbinfo);
931190225Srpaulo	gencnt = V_ripcbinfo.ipi_gencnt;
932190225Srpaulo	n = V_ripcbinfo.ipi_count;
933146768Ssam	INP_INFO_RUNLOCK(&V_ripcbinfo);
934146768Ssam
935146768Ssam	xig.xig_len = sizeof xig;
936146768Ssam	xig.xig_count = n;
937146768Ssam	xig.xig_gen = gencnt;
938146768Ssam	xig.xig_sogen = so_gencnt;
939146768Ssam	error = SYSCTL_OUT(req, &xig, sizeof xig);
940146768Ssam	if (error)
94117683Spst		return (error);
942190225Srpaulo
943190225Srpaulo	inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK);
944146768Ssam	if (inp_list == 0)
945146768Ssam		return (ENOMEM);
946146768Ssam
94798530Sfenner	INP_INFO_RLOCK(&V_ripcbinfo);
948127664Sbms	for (inp = LIST_FIRST(V_ripcbinfo.ipi_listhead), i = 0; inp && i < n;
949127664Sbms	     inp = LIST_NEXT(inp, inp_list)) {
950127664Sbms		INP_RLOCK(inp);
951127664Sbms		if (inp->inp_gencnt <= gencnt &&
952146768Ssam		    cr_canseeinpcb(req->td->td_ucred, inp) == 0) {
953146768Ssam			/* XXX held references? */
954146768Ssam			inp_list[i++] = inp;
95598530Sfenner		}
956146768Ssam		INP_RUNLOCK(inp);
957146768Ssam	}
958146768Ssam	INP_INFO_RUNLOCK(&V_ripcbinfo);
959146768Ssam	n = i;
960146768Ssam
961146768Ssam	error = 0;
962146768Ssam	for (i = 0; i < n; i++) {
963146768Ssam		inp = inp_list[i];
964146768Ssam		INP_RLOCK(inp);
965146768Ssam		if (inp->inp_gencnt <= gencnt) {
966146768Ssam			struct xinpcb xi;
967146768Ssam
968146768Ssam			bzero(&xi, sizeof(xi));
969146768Ssam			xi.xi_len = sizeof xi;
970146768Ssam			/* XXX should avoid extra copy */
971127664Sbms			bcopy(inp, &xi.xi_inp, sizeof *inp);
972127664Sbms			if (inp->inp_socket)
973127664Sbms				sotoxsocket(inp->inp_socket, &xi.xi_socket);
974127664Sbms			INP_RUNLOCK(inp);
975127664Sbms			error = SYSCTL_OUT(req, &xi, sizeof xi);
976127664Sbms		} else
977127664Sbms			INP_RUNLOCK(inp);
978127664Sbms	}
979127664Sbms	if (!error) {
980127664Sbms		/*
981127664Sbms		 * Give the user an updated idea of our state.  If the
98217683Spst		 * generation differs from what we told her before, she knows
98317683Spst		 * that something happened while we were processing this
98417683Spst		 * request, and it might be necessary to retry.
98517683Spst		 */
98617683Spst		INP_INFO_RLOCK(&V_ripcbinfo);
98717683Spst		xig.xig_gen = V_ripcbinfo.ipi_gencnt;
98817683Spst		xig.xig_sogen = so_gencnt;
989146768Ssam		xig.xig_count = V_ripcbinfo.ipi_count;
990146768Ssam		INP_INFO_RUNLOCK(&V_ripcbinfo);
991146768Ssam		error = SYSCTL_OUT(req, &xig, sizeof xig);
992146768Ssam	}
993146768Ssam	free(inp_list, M_TEMP);
994146768Ssam	return (error);
995146768Ssam}
996146768Ssam
997146768SsamSYSCTL_PROC(_net_inet_raw, OID_AUTO/*XXX*/, pcblist, CTLFLAG_RD, 0, 0,
998146768Ssam    rip_pcblist, "S,xinpcb", "List of active raw IP sockets");
999146768Ssam
1000146768Ssamstruct pr_usrreqs rip_usrreqs = {
1001146768Ssam	.pru_abort =		rip_abort,
1002146768Ssam	.pru_attach =		rip_attach,
1003146768Ssam	.pru_bind =		rip_bind,
1004146768Ssam	.pru_connect =		rip_connect,
1005146768Ssam	.pru_control =		in_control,
1006146768Ssam	.pru_detach =		rip_detach,
1007146768Ssam	.pru_disconnect =	rip_disconnect,
1008146768Ssam	.pru_peeraddr =		in_getpeeraddr,
1009146768Ssam	.pru_send =		rip_send,
1010146768Ssam	.pru_shutdown =		rip_shutdown,
1011146768Ssam	.pru_sockaddr =		in_getsockaddr,
1012146768Ssam	.pru_sosetlabel =	in_pcbsosetlabel,
1013146768Ssam	.pru_close =		rip_close,
1014146768Ssam};
1015146768Ssam