raw_ip.c revision 181803
/*-
 * Copyright (c) 1982, 1986, 1988, 1993
 *	The Regents of the University of California.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)raw_ip.c	8.7 (Berkeley) 5/15/95
 */
32
33#include <sys/cdefs.h>
34__FBSDID("$FreeBSD: head/sys/netinet/raw_ip.c 181803 2008-08-17 23:27:27Z bz $");
35
36#include "opt_inet6.h"
37#include "opt_ipsec.h"
38#include "opt_mac.h"
39
40#include <sys/param.h>
41#include <sys/jail.h>
42#include <sys/kernel.h>
43#include <sys/lock.h>
44#include <sys/malloc.h>
45#include <sys/mbuf.h>
46#include <sys/priv.h>
47#include <sys/proc.h>
48#include <sys/protosw.h>
49#include <sys/signalvar.h>
50#include <sys/socket.h>
51#include <sys/socketvar.h>
52#include <sys/sx.h>
53#include <sys/sysctl.h>
54#include <sys/systm.h>
55#include <sys/vimage.h>
56
57#include <vm/uma.h>
58
59#include <net/if.h>
60#include <net/route.h>
61
62#include <netinet/in.h>
63#include <netinet/in_systm.h>
64#include <netinet/in_pcb.h>
65#include <netinet/in_var.h>
66#include <netinet/ip.h>
67#include <netinet/ip_var.h>
68#include <netinet/ip_mroute.h>
69
70#include <netinet/ip_fw.h>
71#include <netinet/ip_dummynet.h>
72
73#ifdef IPSEC
74#include <netipsec/ipsec.h>
75#endif /*IPSEC*/
76
77#include <security/mac/mac_framework.h>
78
79struct	inpcbhead ripcb;
80struct	inpcbinfo ripcbinfo;
81
82/* control hooks for ipfw and dummynet */
83ip_fw_ctl_t *ip_fw_ctl_ptr = NULL;
84ip_dn_ctl_t *ip_dn_ctl_ptr = NULL;
85
86/*
87 * Hooks for multicast routing. They all default to NULL, so leave them not
88 * initialized and rely on BSS being set to 0.
89 */
90
91/*
92 * The socket used to communicate with the multicast routing daemon.
93 */
94struct socket  *ip_mrouter;
95
96/*
97 * The various mrouter and rsvp functions.
98 */
99int (*ip_mrouter_set)(struct socket *, struct sockopt *);
100int (*ip_mrouter_get)(struct socket *, struct sockopt *);
101int (*ip_mrouter_done)(void);
102int (*ip_mforward)(struct ip *, struct ifnet *, struct mbuf *,
103		   struct ip_moptions *);
104int (*mrt_ioctl)(int, caddr_t, int);
105int (*legal_vif_num)(int);
106u_long (*ip_mcast_src)(int);
107
108void (*rsvp_input_p)(struct mbuf *m, int off);
109int (*ip_rsvp_vif)(struct socket *, struct sockopt *);
110void (*ip_rsvp_force_done)(struct socket *);
111
/*
 * Hash functions
 *
 * INP_PCBHASH_RAW() folds protocol, local address and foreign address
 * into a bucket index that is always >= 1.  Bucket 0 is never produced
 * here; rip_inshash() uses it for PCBs that lack one of the three keys.
 */
#define	INP_PCBHASH_RAW_SIZE	256
#define	INP_PCBHASH_RAW(p, la, fa, m)					\
	(((((p) + (la) + (fa))) % (m)) + 1)
119
120static void
121rip_inshash(struct inpcb *inp)
122{
123	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
124	struct inpcbhead *pcbhash;
125	int hash;
126
127	INP_INFO_WLOCK_ASSERT(pcbinfo);
128	INP_WLOCK_ASSERT(inp);
129
130	if (inp->inp_ip_p != 0 &&
131	    inp->inp_laddr.s_addr != INADDR_ANY &&
132	    inp->inp_faddr.s_addr != INADDR_ANY) {
133		hash = INP_PCBHASH_RAW(inp->inp_ip_p, inp->inp_laddr.s_addr,
134		    inp->inp_faddr.s_addr, pcbinfo->ipi_hashmask);
135	} else
136		hash = 0;
137	pcbhash = &pcbinfo->ipi_hashbase[hash];
138	LIST_INSERT_HEAD(pcbhash, inp, inp_hash);
139}
140
141static void
142rip_delhash(struct inpcb *inp)
143{
144
145	INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo);
146	INP_WLOCK_ASSERT(inp);
147
148	LIST_REMOVE(inp, inp_hash);
149}
150
/*
 * Raw interface to IP protocol.
 */

/*
 * Initialize raw connection block q.
 */
158static void
159rip_zone_change(void *tag)
160{
161
162	uma_zone_set_max(V_ripcbinfo.ipi_zone, maxsockets);
163}
164
/*
 * UMA init hook for the raw PCB zone: set up the per-PCB lock of the
 * item at 'mem'.  'size' and 'flags' are unused.  Always returns 0.
 */
static int
rip_inpcb_init(void *mem, int size, int flags)
{
	struct inpcb *pcb;

	pcb = (struct inpcb *)mem;
	INP_LOCK_INIT(pcb, "inp", "rawinp");
	return (0);
}
173
174void
175rip_init(void)
176{
177
178	INP_INFO_LOCK_INIT(&V_ripcbinfo, "rip");
179	LIST_INIT(&V_ripcb);
180	V_ripcbinfo.ipi_listhead = &V_ripcb;
181	V_ripcbinfo.ipi_hashbase = hashinit(INP_PCBHASH_RAW_SIZE, M_PCB,
182	    &V_ripcbinfo.ipi_hashmask);
183	V_ripcbinfo.ipi_porthashbase = hashinit(1, M_PCB,
184	    &V_ripcbinfo.ipi_porthashmask);
185	V_ripcbinfo.ipi_zone = uma_zcreate("ripcb", sizeof(struct inpcb),
186	    NULL, NULL, rip_inpcb_init, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
187	uma_zone_set_max(V_ripcbinfo.ipi_zone, maxsockets);
188	EVENTHANDLER_REGISTER(maxsockets_change, rip_zone_change, NULL,
189	    EVENTHANDLER_PRI_ANY);
190}
191
192static int
193rip_append(struct inpcb *last, struct ip *ip, struct mbuf *n,
194    struct sockaddr_in *ripsrc)
195{
196	int policyfail = 0;
197
198	INP_RLOCK_ASSERT(last);
199
200#ifdef IPSEC
201	/* check AH/ESP integrity. */
202	if (ipsec4_in_reject(n, last)) {
203		policyfail = 1;
204	}
205#endif /* IPSEC */
206#ifdef MAC
207	if (!policyfail && mac_inpcb_check_deliver(last, n) != 0)
208		policyfail = 1;
209#endif
210	/* Check the minimum TTL for socket. */
211	if (last->inp_ip_minttl && last->inp_ip_minttl > ip->ip_ttl)
212		policyfail = 1;
213	if (!policyfail) {
214		struct mbuf *opts = NULL;
215		struct socket *so;
216
217		so = last->inp_socket;
218		if ((last->inp_flags & INP_CONTROLOPTS) ||
219		    (so->so_options & (SO_TIMESTAMP | SO_BINTIME)))
220			ip_savecontrol(last, &opts, ip, n);
221		SOCKBUF_LOCK(&so->so_rcv);
222		if (sbappendaddr_locked(&so->so_rcv,
223		    (struct sockaddr *)ripsrc, n, opts) == 0) {
224			/* should notify about lost packet */
225			m_freem(n);
226			if (opts)
227				m_freem(opts);
228			SOCKBUF_UNLOCK(&so->so_rcv);
229		} else
230			sorwakeup_locked(so);
231	} else
232		m_freem(n);
233	return (policyfail);
234}
235
236/*
237 * Setup generic address and protocol structures for raw_input routine, then
238 * pass them along with mbuf chain.
239 */
240void
241rip_input(struct mbuf *m, int off)
242{
243	struct ip *ip = mtod(m, struct ip *);
244	int proto = ip->ip_p;
245	struct inpcb *inp, *last;
246	struct sockaddr_in ripsrc;
247	int hash;
248
249	bzero(&ripsrc, sizeof(ripsrc));
250	ripsrc.sin_len = sizeof(ripsrc);
251	ripsrc.sin_family = AF_INET;
252	ripsrc.sin_addr = ip->ip_src;
253	last = NULL;
254	hash = INP_PCBHASH_RAW(proto, ip->ip_src.s_addr,
255	    ip->ip_dst.s_addr, V_ripcbinfo.ipi_hashmask);
256	INP_INFO_RLOCK(&V_ripcbinfo);
257	LIST_FOREACH(inp, &V_ripcbinfo.ipi_hashbase[hash], inp_hash) {
258		if (inp->inp_ip_p != proto)
259			continue;
260#ifdef INET6
261		if ((inp->inp_vflag & INP_IPV4) == 0)
262			continue;
263#endif
264		if (inp->inp_laddr.s_addr != ip->ip_dst.s_addr)
265			continue;
266		if (inp->inp_faddr.s_addr != ip->ip_src.s_addr)
267			continue;
268		INP_RLOCK(inp);
269		if (jailed(inp->inp_socket->so_cred) &&
270		    (htonl(prison_getip(inp->inp_socket->so_cred)) !=
271		    ip->ip_dst.s_addr)) {
272			INP_RUNLOCK(inp);
273			continue;
274		}
275		if (last) {
276			struct mbuf *n;
277
278			n = m_copy(m, 0, (int)M_COPYALL);
279			if (n != NULL)
280		    	    (void) rip_append(last, ip, n, &ripsrc);
281			/* XXX count dropped packet */
282			INP_RUNLOCK(last);
283		}
284		last = inp;
285	}
286	LIST_FOREACH(inp, &V_ripcbinfo.ipi_hashbase[0], inp_hash) {
287		if (inp->inp_ip_p && inp->inp_ip_p != proto)
288			continue;
289#ifdef INET6
290		if ((inp->inp_vflag & INP_IPV4) == 0)
291			continue;
292#endif
293		if (inp->inp_laddr.s_addr &&
294		    inp->inp_laddr.s_addr != ip->ip_dst.s_addr)
295			continue;
296		if (inp->inp_faddr.s_addr &&
297		    inp->inp_faddr.s_addr != ip->ip_src.s_addr)
298			continue;
299		INP_RLOCK(inp);
300		if (jailed(inp->inp_socket->so_cred) &&
301		    (htonl(prison_getip(inp->inp_socket->so_cred)) !=
302		    ip->ip_dst.s_addr)) {
303			INP_RUNLOCK(inp);
304			continue;
305		}
306		if (last) {
307			struct mbuf *n;
308
309			n = m_copy(m, 0, (int)M_COPYALL);
310			if (n != NULL)
311				(void) rip_append(last, ip, n, &ripsrc);
312			/* XXX count dropped packet */
313			INP_RUNLOCK(last);
314		}
315		last = inp;
316	}
317	INP_INFO_RUNLOCK(&V_ripcbinfo);
318	if (last != NULL) {
319		if (rip_append(last, ip, m, &ripsrc) != 0)
320			V_ipstat.ips_delivered--;
321		INP_RUNLOCK(last);
322	} else {
323		m_freem(m);
324		V_ipstat.ips_noproto++;
325		V_ipstat.ips_delivered--;
326	}
327}
328
329/*
330 * Generate IP header and pass packet to ip_output.  Tack on options user may
331 * have setup with control call.
332 */
333int
334rip_output(struct mbuf *m, struct socket *so, u_long dst)
335{
336	struct ip *ip;
337	int error;
338	struct inpcb *inp = sotoinpcb(so);
339	int flags = ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0) |
340	    IP_ALLOWBROADCAST;
341
342	/*
343	 * If the user handed us a complete IP packet, use it.  Otherwise,
344	 * allocate an mbuf for a header and fill it in.
345	 */
346	if ((inp->inp_flags & INP_HDRINCL) == 0) {
347		if (m->m_pkthdr.len + sizeof(struct ip) > IP_MAXPACKET) {
348			m_freem(m);
349			return(EMSGSIZE);
350		}
351		M_PREPEND(m, sizeof(struct ip), M_DONTWAIT);
352		if (m == NULL)
353			return(ENOBUFS);
354
355		INP_RLOCK(inp);
356		ip = mtod(m, struct ip *);
357		ip->ip_tos = inp->inp_ip_tos;
358		if (inp->inp_flags & INP_DONTFRAG)
359			ip->ip_off = IP_DF;
360		else
361			ip->ip_off = 0;
362		ip->ip_p = inp->inp_ip_p;
363		ip->ip_len = m->m_pkthdr.len;
364		if (jailed(inp->inp_socket->so_cred))
365			ip->ip_src.s_addr =
366			    htonl(prison_getip(inp->inp_socket->so_cred));
367		else
368			ip->ip_src = inp->inp_laddr;
369		ip->ip_dst.s_addr = dst;
370		ip->ip_ttl = inp->inp_ip_ttl;
371	} else {
372		if (m->m_pkthdr.len > IP_MAXPACKET) {
373			m_freem(m);
374			return(EMSGSIZE);
375		}
376		INP_RLOCK(inp);
377		ip = mtod(m, struct ip *);
378		if (jailed(inp->inp_socket->so_cred)) {
379			if (ip->ip_src.s_addr !=
380			    htonl(prison_getip(inp->inp_socket->so_cred))) {
381				INP_RUNLOCK(inp);
382				m_freem(m);
383				return (EPERM);
384			}
385		}
386
387		/*
388		 * Don't allow both user specified and setsockopt options,
389		 * and don't allow packet length sizes that will crash.
390		 */
391		if (((ip->ip_hl != (sizeof (*ip) >> 2)) && inp->inp_options)
392		    || (ip->ip_len > m->m_pkthdr.len)
393		    || (ip->ip_len < (ip->ip_hl << 2))) {
394			INP_RUNLOCK(inp);
395			m_freem(m);
396			return (EINVAL);
397		}
398		if (ip->ip_id == 0)
399			ip->ip_id = ip_newid();
400
401		/*
402		 * XXX prevent ip_output from overwriting header fields.
403		 */
404		flags |= IP_RAWOUTPUT;
405		V_ipstat.ips_rawout++;
406	}
407
408	if (inp->inp_flags & INP_ONESBCAST)
409		flags |= IP_SENDONES;
410
411#ifdef MAC
412	mac_inpcb_create_mbuf(inp, m);
413#endif
414
415	error = ip_output(m, inp->inp_options, NULL, flags,
416	    inp->inp_moptions, inp);
417	INP_RUNLOCK(inp);
418	return (error);
419}
420
421/*
422 * Raw IP socket option processing.
423 *
424 * IMPORTANT NOTE regarding access control: Traditionally, raw sockets could
425 * only be created by a privileged process, and as such, socket option
426 * operations to manage system properties on any raw socket were allowed to
427 * take place without explicit additional access control checks.  However,
428 * raw sockets can now also be created in jail(), and therefore explicit
429 * checks are now required.  Likewise, raw sockets can be used by a process
430 * after it gives up privilege, so some caution is required.  For options
431 * passed down to the IP layer via ip_ctloutput(), checks are assumed to be
432 * performed in ip_ctloutput() and therefore no check occurs here.
433 * Unilaterally checking priv_check() here breaks normal IP socket option
434 * operations on raw sockets.
435 *
436 * When adding new socket options here, make sure to add access control
437 * checks here as necessary.
438 */
439int
440rip_ctloutput(struct socket *so, struct sockopt *sopt)
441{
442	struct	inpcb *inp = sotoinpcb(so);
443	int	error, optval;
444
445	if (sopt->sopt_level != IPPROTO_IP)
446		return (EINVAL);
447
448	error = 0;
449	switch (sopt->sopt_dir) {
450	case SOPT_GET:
451		switch (sopt->sopt_name) {
452		case IP_HDRINCL:
453			optval = inp->inp_flags & INP_HDRINCL;
454			error = sooptcopyout(sopt, &optval, sizeof optval);
455			break;
456
457		case IP_FW_ADD:	/* ADD actually returns the body... */
458		case IP_FW_GET:
459		case IP_FW_TABLE_GETSIZE:
460		case IP_FW_TABLE_LIST:
461		case IP_FW_NAT_GET_CONFIG:
462		case IP_FW_NAT_GET_LOG:
463			if (ip_fw_ctl_ptr != NULL)
464				error = ip_fw_ctl_ptr(sopt);
465			else
466				error = ENOPROTOOPT;
467			break;
468
469		case IP_DUMMYNET_GET:
470			if (ip_dn_ctl_ptr != NULL)
471				error = ip_dn_ctl_ptr(sopt);
472			else
473				error = ENOPROTOOPT;
474			break ;
475
476		case MRT_INIT:
477		case MRT_DONE:
478		case MRT_ADD_VIF:
479		case MRT_DEL_VIF:
480		case MRT_ADD_MFC:
481		case MRT_DEL_MFC:
482		case MRT_VERSION:
483		case MRT_ASSERT:
484		case MRT_API_SUPPORT:
485		case MRT_API_CONFIG:
486		case MRT_ADD_BW_UPCALL:
487		case MRT_DEL_BW_UPCALL:
488			error = priv_check(curthread, PRIV_NETINET_MROUTE);
489			if (error != 0)
490				return (error);
491			error = ip_mrouter_get ? ip_mrouter_get(so, sopt) :
492				EOPNOTSUPP;
493			break;
494
495		default:
496			error = ip_ctloutput(so, sopt);
497			break;
498		}
499		break;
500
501	case SOPT_SET:
502		switch (sopt->sopt_name) {
503		case IP_HDRINCL:
504			error = sooptcopyin(sopt, &optval, sizeof optval,
505					    sizeof optval);
506			if (error)
507				break;
508			if (optval)
509				inp->inp_flags |= INP_HDRINCL;
510			else
511				inp->inp_flags &= ~INP_HDRINCL;
512			break;
513
514		case IP_FW_ADD:
515		case IP_FW_DEL:
516		case IP_FW_FLUSH:
517		case IP_FW_ZERO:
518		case IP_FW_RESETLOG:
519		case IP_FW_TABLE_ADD:
520		case IP_FW_TABLE_DEL:
521		case IP_FW_TABLE_FLUSH:
522		case IP_FW_NAT_CFG:
523		case IP_FW_NAT_DEL:
524			if (ip_fw_ctl_ptr != NULL)
525				error = ip_fw_ctl_ptr(sopt);
526			else
527				error = ENOPROTOOPT;
528			break;
529
530		case IP_DUMMYNET_CONFIGURE:
531		case IP_DUMMYNET_DEL:
532		case IP_DUMMYNET_FLUSH:
533			if (ip_dn_ctl_ptr != NULL)
534				error = ip_dn_ctl_ptr(sopt);
535			else
536				error = ENOPROTOOPT ;
537			break ;
538
539		case IP_RSVP_ON:
540			error = priv_check(curthread, PRIV_NETINET_MROUTE);
541			if (error != 0)
542				return (error);
543			error = ip_rsvp_init(so);
544			break;
545
546		case IP_RSVP_OFF:
547			error = priv_check(curthread, PRIV_NETINET_MROUTE);
548			if (error != 0)
549				return (error);
550			error = ip_rsvp_done();
551			break;
552
553		case IP_RSVP_VIF_ON:
554		case IP_RSVP_VIF_OFF:
555			error = priv_check(curthread, PRIV_NETINET_MROUTE);
556			if (error != 0)
557				return (error);
558			error = ip_rsvp_vif ?
559				ip_rsvp_vif(so, sopt) : EINVAL;
560			break;
561
562		case MRT_INIT:
563		case MRT_DONE:
564		case MRT_ADD_VIF:
565		case MRT_DEL_VIF:
566		case MRT_ADD_MFC:
567		case MRT_DEL_MFC:
568		case MRT_VERSION:
569		case MRT_ASSERT:
570		case MRT_API_SUPPORT:
571		case MRT_API_CONFIG:
572		case MRT_ADD_BW_UPCALL:
573		case MRT_DEL_BW_UPCALL:
574			error = priv_check(curthread, PRIV_NETINET_MROUTE);
575			if (error != 0)
576				return (error);
577			error = ip_mrouter_set ? ip_mrouter_set(so, sopt) :
578					EOPNOTSUPP;
579			break;
580
581		default:
582			error = ip_ctloutput(so, sopt);
583			break;
584		}
585		break;
586	}
587
588	return (error);
589}
590
591/*
592 * This function exists solely to receive the PRC_IFDOWN messages which are
593 * sent by if_down().  It looks for an ifaddr whose ifa_addr is sa, and calls
594 * in_ifadown() to remove all routes corresponding to that address.  It also
595 * receives the PRC_IFUP messages from if_up() and reinstalls the interface
596 * routes.
597 */
598void
599rip_ctlinput(int cmd, struct sockaddr *sa, void *vip)
600{
601	struct in_ifaddr *ia;
602	struct ifnet *ifp;
603	int err;
604	int flags;
605
606	switch (cmd) {
607	case PRC_IFDOWN:
608		TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
609			if (ia->ia_ifa.ifa_addr == sa
610			    && (ia->ia_flags & IFA_ROUTE)) {
611				/*
612				 * in_ifscrub kills the interface route.
613				 */
614				in_ifscrub(ia->ia_ifp, ia);
615				/*
616				 * in_ifadown gets rid of all the rest of the
617				 * routes.  This is not quite the right thing
618				 * to do, but at least if we are running a
619				 * routing process they will come back.
620				 */
621				in_ifadown(&ia->ia_ifa, 0);
622				break;
623			}
624		}
625		break;
626
627	case PRC_IFUP:
628		TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
629			if (ia->ia_ifa.ifa_addr == sa)
630				break;
631		}
632		if (ia == 0 || (ia->ia_flags & IFA_ROUTE))
633			return;
634		flags = RTF_UP;
635		ifp = ia->ia_ifa.ifa_ifp;
636
637		if ((ifp->if_flags & IFF_LOOPBACK)
638		    || (ifp->if_flags & IFF_POINTOPOINT))
639			flags |= RTF_HOST;
640
641		err = rtinit(&ia->ia_ifa, RTM_ADD, flags);
642		if (err == 0)
643			ia->ia_flags |= IFA_ROUTE;
644		break;
645	}
646}
647
648u_long	rip_sendspace = 9216;
649u_long	rip_recvspace = 9216;
650
651SYSCTL_ULONG(_net_inet_raw, OID_AUTO, maxdgram, CTLFLAG_RW,
652    &rip_sendspace, 0, "Maximum outgoing raw IP datagram size");
653SYSCTL_ULONG(_net_inet_raw, OID_AUTO, recvspace, CTLFLAG_RW,
654    &rip_recvspace, 0, "Maximum space for incoming raw IP datagrams");
655
656static int
657rip_attach(struct socket *so, int proto, struct thread *td)
658{
659	struct inpcb *inp;
660	int error;
661
662	inp = sotoinpcb(so);
663	KASSERT(inp == NULL, ("rip_attach: inp != NULL"));
664
665	error = priv_check(td, PRIV_NETINET_RAW);
666	if (error)
667		return (error);
668	if (proto >= IPPROTO_MAX || proto < 0)
669		return EPROTONOSUPPORT;
670	error = soreserve(so, rip_sendspace, rip_recvspace);
671	if (error)
672		return (error);
673	INP_INFO_WLOCK(&V_ripcbinfo);
674	error = in_pcballoc(so, &V_ripcbinfo);
675	if (error) {
676		INP_INFO_WUNLOCK(&V_ripcbinfo);
677		return (error);
678	}
679	inp = (struct inpcb *)so->so_pcb;
680	inp->inp_vflag |= INP_IPV4;
681	inp->inp_ip_p = proto;
682	inp->inp_ip_ttl = V_ip_defttl;
683	rip_inshash(inp);
684	INP_INFO_WUNLOCK(&V_ripcbinfo);
685	INP_WUNLOCK(inp);
686	return (0);
687}
688
689static void
690rip_detach(struct socket *so)
691{
692	struct inpcb *inp;
693
694	inp = sotoinpcb(so);
695	KASSERT(inp != NULL, ("rip_detach: inp == NULL"));
696	KASSERT(inp->inp_faddr.s_addr == INADDR_ANY,
697	    ("rip_detach: not closed"));
698
699	INP_INFO_WLOCK(&V_ripcbinfo);
700	INP_WLOCK(inp);
701	rip_delhash(inp);
702	if (so == V_ip_mrouter && ip_mrouter_done)
703		ip_mrouter_done();
704	if (ip_rsvp_force_done)
705		ip_rsvp_force_done(so);
706	if (so == V_ip_rsvpd)
707		ip_rsvp_done();
708	in_pcbdetach(inp);
709	in_pcbfree(inp);
710	INP_INFO_WUNLOCK(&V_ripcbinfo);
711}
712
713static void
714rip_dodisconnect(struct socket *so, struct inpcb *inp)
715{
716
717	INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo);
718	INP_WLOCK_ASSERT(inp);
719
720	rip_delhash(inp);
721	inp->inp_faddr.s_addr = INADDR_ANY;
722	rip_inshash(inp);
723	SOCK_LOCK(so);
724	so->so_state &= ~SS_ISCONNECTED;
725	SOCK_UNLOCK(so);
726}
727
728static void
729rip_abort(struct socket *so)
730{
731	struct inpcb *inp;
732
733	inp = sotoinpcb(so);
734	KASSERT(inp != NULL, ("rip_abort: inp == NULL"));
735
736	INP_INFO_WLOCK(&V_ripcbinfo);
737	INP_WLOCK(inp);
738	rip_dodisconnect(so, inp);
739	INP_WUNLOCK(inp);
740	INP_INFO_WUNLOCK(&V_ripcbinfo);
741}
742
743static void
744rip_close(struct socket *so)
745{
746	struct inpcb *inp;
747
748	inp = sotoinpcb(so);
749	KASSERT(inp != NULL, ("rip_close: inp == NULL"));
750
751	INP_INFO_WLOCK(&V_ripcbinfo);
752	INP_WLOCK(inp);
753	rip_dodisconnect(so, inp);
754	INP_WUNLOCK(inp);
755	INP_INFO_WUNLOCK(&V_ripcbinfo);
756}
757
758static int
759rip_disconnect(struct socket *so)
760{
761	struct inpcb *inp;
762
763	if ((so->so_state & SS_ISCONNECTED) == 0)
764		return (ENOTCONN);
765
766	inp = sotoinpcb(so);
767	KASSERT(inp != NULL, ("rip_disconnect: inp == NULL"));
768
769	INP_INFO_WLOCK(&V_ripcbinfo);
770	INP_WLOCK(inp);
771	rip_dodisconnect(so, inp);
772	INP_WUNLOCK(inp);
773	INP_INFO_WUNLOCK(&V_ripcbinfo);
774	return (0);
775}
776
777static int
778rip_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
779{
780	struct sockaddr_in *addr = (struct sockaddr_in *)nam;
781	struct inpcb *inp;
782
783	if (nam->sa_len != sizeof(*addr))
784		return (EINVAL);
785
786	if (jailed(td->td_ucred)) {
787		if (addr->sin_addr.s_addr == INADDR_ANY)
788			addr->sin_addr.s_addr =
789			    htonl(prison_getip(td->td_ucred));
790		if (htonl(prison_getip(td->td_ucred)) != addr->sin_addr.s_addr)
791			return (EADDRNOTAVAIL);
792	}
793
794	if (TAILQ_EMPTY(&V_ifnet) ||
795	    (addr->sin_family != AF_INET && addr->sin_family != AF_IMPLINK) ||
796	    (addr->sin_addr.s_addr &&
797	     ifa_ifwithaddr((struct sockaddr *)addr) == 0))
798		return (EADDRNOTAVAIL);
799
800	inp = sotoinpcb(so);
801	KASSERT(inp != NULL, ("rip_bind: inp == NULL"));
802
803	INP_INFO_WLOCK(&V_ripcbinfo);
804	INP_WLOCK(inp);
805	rip_delhash(inp);
806	inp->inp_laddr = addr->sin_addr;
807	rip_inshash(inp);
808	INP_WUNLOCK(inp);
809	INP_INFO_WUNLOCK(&V_ripcbinfo);
810	return (0);
811}
812
813static int
814rip_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
815{
816	struct sockaddr_in *addr = (struct sockaddr_in *)nam;
817	struct inpcb *inp;
818
819	if (nam->sa_len != sizeof(*addr))
820		return (EINVAL);
821	if (TAILQ_EMPTY(&V_ifnet))
822		return (EADDRNOTAVAIL);
823	if (addr->sin_family != AF_INET && addr->sin_family != AF_IMPLINK)
824		return (EAFNOSUPPORT);
825
826	inp = sotoinpcb(so);
827	KASSERT(inp != NULL, ("rip_connect: inp == NULL"));
828
829	INP_INFO_WLOCK(&V_ripcbinfo);
830	INP_WLOCK(inp);
831	rip_delhash(inp);
832	inp->inp_faddr = addr->sin_addr;
833	rip_inshash(inp);
834	soisconnected(so);
835	INP_WUNLOCK(inp);
836	INP_INFO_WUNLOCK(&V_ripcbinfo);
837	return (0);
838}
839
840static int
841rip_shutdown(struct socket *so)
842{
843	struct inpcb *inp;
844
845	inp = sotoinpcb(so);
846	KASSERT(inp != NULL, ("rip_shutdown: inp == NULL"));
847
848	INP_WLOCK(inp);
849	socantsendmore(so);
850	INP_WUNLOCK(inp);
851	return (0);
852}
853
854static int
855rip_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
856    struct mbuf *control, struct thread *td)
857{
858	struct inpcb *inp;
859	u_long dst;
860
861	inp = sotoinpcb(so);
862	KASSERT(inp != NULL, ("rip_send: inp == NULL"));
863
864	/*
865	 * Note: 'dst' reads below are unlocked.
866	 */
867	if (so->so_state & SS_ISCONNECTED) {
868		if (nam) {
869			m_freem(m);
870			return (EISCONN);
871		}
872		dst = inp->inp_faddr.s_addr;	/* Unlocked read. */
873	} else {
874		if (nam == NULL) {
875			m_freem(m);
876			return (ENOTCONN);
877		}
878		dst = ((struct sockaddr_in *)nam)->sin_addr.s_addr;
879	}
880	return (rip_output(m, so, dst));
881}
882
883static int
884rip_pcblist(SYSCTL_HANDLER_ARGS)
885{
886	int error, i, n;
887	struct inpcb *inp, **inp_list;
888	inp_gen_t gencnt;
889	struct xinpgen xig;
890
891	/*
892	 * The process of preparing the TCB list is too time-consuming and
893	 * resource-intensive to repeat twice on every request.
894	 */
895	if (req->oldptr == 0) {
896		n = V_ripcbinfo.ipi_count;
897		req->oldidx = 2 * (sizeof xig)
898		    + (n + n/8) * sizeof(struct xinpcb);
899		return (0);
900	}
901
902	if (req->newptr != 0)
903		return (EPERM);
904
905	/*
906	 * OK, now we're committed to doing something.
907	 */
908	INP_INFO_RLOCK(&V_ripcbinfo);
909	gencnt = V_ripcbinfo.ipi_gencnt;
910	n = V_ripcbinfo.ipi_count;
911	INP_INFO_RUNLOCK(&V_ripcbinfo);
912
913	xig.xig_len = sizeof xig;
914	xig.xig_count = n;
915	xig.xig_gen = gencnt;
916	xig.xig_sogen = so_gencnt;
917	error = SYSCTL_OUT(req, &xig, sizeof xig);
918	if (error)
919		return (error);
920
921	inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK);
922	if (inp_list == 0)
923		return (ENOMEM);
924
925	INP_INFO_RLOCK(&V_ripcbinfo);
926	for (inp = LIST_FIRST(V_ripcbinfo.ipi_listhead), i = 0; inp && i < n;
927	     inp = LIST_NEXT(inp, inp_list)) {
928		INP_RLOCK(inp);
929		if (inp->inp_gencnt <= gencnt &&
930		    cr_canseesocket(req->td->td_ucred, inp->inp_socket) == 0) {
931			/* XXX held references? */
932			inp_list[i++] = inp;
933		}
934		INP_RUNLOCK(inp);
935	}
936	INP_INFO_RUNLOCK(&V_ripcbinfo);
937	n = i;
938
939	error = 0;
940	for (i = 0; i < n; i++) {
941		inp = inp_list[i];
942		INP_RLOCK(inp);
943		if (inp->inp_gencnt <= gencnt) {
944			struct xinpcb xi;
945			bzero(&xi, sizeof(xi));
946			xi.xi_len = sizeof xi;
947			/* XXX should avoid extra copy */
948			bcopy(inp, &xi.xi_inp, sizeof *inp);
949			if (inp->inp_socket)
950				sotoxsocket(inp->inp_socket, &xi.xi_socket);
951			INP_RUNLOCK(inp);
952			error = SYSCTL_OUT(req, &xi, sizeof xi);
953		} else
954			INP_RUNLOCK(inp);
955	}
956	if (!error) {
957		/*
958		 * Give the user an updated idea of our state.  If the
959		 * generation differs from what we told her before, she knows
960		 * that something happened while we were processing this
961		 * request, and it might be necessary to retry.
962		 */
963		INP_INFO_RLOCK(&V_ripcbinfo);
964		xig.xig_gen = V_ripcbinfo.ipi_gencnt;
965		xig.xig_sogen = so_gencnt;
966		xig.xig_count = V_ripcbinfo.ipi_count;
967		INP_INFO_RUNLOCK(&V_ripcbinfo);
968		error = SYSCTL_OUT(req, &xig, sizeof xig);
969	}
970	free(inp_list, M_TEMP);
971	return (error);
972}
973
974SYSCTL_PROC(_net_inet_raw, OID_AUTO/*XXX*/, pcblist, CTLFLAG_RD, 0, 0,
975    rip_pcblist, "S,xinpcb", "List of active raw IP sockets");
976
977struct pr_usrreqs rip_usrreqs = {
978	.pru_abort =		rip_abort,
979	.pru_attach =		rip_attach,
980	.pru_bind =		rip_bind,
981	.pru_connect =		rip_connect,
982	.pru_control =		in_control,
983	.pru_detach =		rip_detach,
984	.pru_disconnect =	rip_disconnect,
985	.pru_peeraddr =		in_getpeeraddr,
986	.pru_send =		rip_send,
987	.pru_shutdown =		rip_shutdown,
988	.pru_sockaddr =		in_getsockaddr,
989	.pru_sosetlabel =	in_pcbsosetlabel,
990	.pru_close =		rip_close,
991};
992