raw_ip.c revision 185571
1/*-
2 * Copyright (c) 1982, 1986, 1988, 1993
3 *	The Regents of the University of California.
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 *    notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 * 4. Neither the name of the University nor the names of its contributors
15 *    may be used to endorse or promote products derived from this software
16 *    without specific prior written permission.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28 * SUCH DAMAGE.
29 *
30 *	@(#)raw_ip.c	8.7 (Berkeley) 5/15/95
31 */
32
33#include <sys/cdefs.h>
34__FBSDID("$FreeBSD: head/sys/netinet/raw_ip.c 185571 2008-12-02 21:37:28Z bz $");
35
36#include "opt_inet6.h"
37#include "opt_ipsec.h"
38#include "opt_mac.h"
39
40#include <sys/param.h>
41#include <sys/jail.h>
42#include <sys/kernel.h>
43#include <sys/lock.h>
44#include <sys/malloc.h>
45#include <sys/mbuf.h>
46#include <sys/priv.h>
47#include <sys/proc.h>
48#include <sys/protosw.h>
49#include <sys/signalvar.h>
50#include <sys/socket.h>
51#include <sys/socketvar.h>
52#include <sys/sx.h>
53#include <sys/sysctl.h>
54#include <sys/systm.h>
55#include <sys/vimage.h>
56
57#include <vm/uma.h>
58
59#include <net/if.h>
60#include <net/route.h>
61#include <net/vnet.h>
62
63#include <netinet/in.h>
64#include <netinet/in_systm.h>
65#include <netinet/in_pcb.h>
66#include <netinet/in_var.h>
67#include <netinet/ip.h>
68#include <netinet/ip_var.h>
69#include <netinet/ip_mroute.h>
70
71#include <netinet/ip_fw.h>
72#include <netinet/ip_dummynet.h>
73#include <netinet/vinet.h>
74
75#ifdef IPSEC
76#include <netipsec/ipsec.h>
77#endif /*IPSEC*/
78
79#include <security/mac/mac_framework.h>
80
81#ifdef VIMAGE_GLOBALS
82struct	inpcbhead ripcb;
83struct	inpcbinfo ripcbinfo;
84#endif
85
86/* control hooks for ipfw and dummynet */
87ip_fw_ctl_t *ip_fw_ctl_ptr = NULL;
88ip_dn_ctl_t *ip_dn_ctl_ptr = NULL;
89
90/*
91 * Hooks for multicast routing. They all default to NULL, so leave them not
92 * initialized and rely on BSS being set to 0.
93 */
94
95/*
96 * The socket used to communicate with the multicast routing daemon.
97 */
98#ifdef VIMAGE_GLOBALS
99struct socket  *ip_mrouter;
100#endif
101
102/*
103 * The various mrouter and rsvp functions.
104 */
105int (*ip_mrouter_set)(struct socket *, struct sockopt *);
106int (*ip_mrouter_get)(struct socket *, struct sockopt *);
107int (*ip_mrouter_done)(void);
108int (*ip_mforward)(struct ip *, struct ifnet *, struct mbuf *,
109		   struct ip_moptions *);
110int (*mrt_ioctl)(int, caddr_t, int);
111int (*legal_vif_num)(int);
112u_long (*ip_mcast_src)(int);
113
114void (*rsvp_input_p)(struct mbuf *m, int off);
115int (*ip_rsvp_vif)(struct socket *, struct sockopt *);
116void (*ip_rsvp_force_done)(struct socket *);
117
/*
 * Raw PCB hash.
 *
 * INP_PCBHASH_RAW_SIZE is the element count handed to hashinit().
 * INP_PCBHASH_RAW() sums the protocol and both addresses, reduces the sum
 * modulo 'mask' and adds one.  For non-negative operands the result lies
 * in 1..mask and is never 0; rip_inshash() places every PCB that is not
 * fully specified on chain 0 instead.
 */
#define	INP_PCBHASH_RAW_SIZE	256
#define	INP_PCBHASH_RAW(proto, laddr, faddr, mask)			\
	((((proto) + (laddr) + (faddr)) % (mask)) + 1)
125
126static void
127rip_inshash(struct inpcb *inp)
128{
129	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
130	struct inpcbhead *pcbhash;
131	int hash;
132
133	INP_INFO_WLOCK_ASSERT(pcbinfo);
134	INP_WLOCK_ASSERT(inp);
135
136	if (inp->inp_ip_p != 0 &&
137	    inp->inp_laddr.s_addr != INADDR_ANY &&
138	    inp->inp_faddr.s_addr != INADDR_ANY) {
139		hash = INP_PCBHASH_RAW(inp->inp_ip_p, inp->inp_laddr.s_addr,
140		    inp->inp_faddr.s_addr, pcbinfo->ipi_hashmask);
141	} else
142		hash = 0;
143	pcbhash = &pcbinfo->ipi_hashbase[hash];
144	LIST_INSERT_HEAD(pcbhash, inp, inp_hash);
145}
146
147static void
148rip_delhash(struct inpcb *inp)
149{
150
151	INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo);
152	INP_WLOCK_ASSERT(inp);
153
154	LIST_REMOVE(inp, inp_hash);
155}
156
157/*
158 * Raw interface to IP protocol.
159 */
160
161/*
162 * Initialize raw connection block q.
163 */
164static void
165rip_zone_change(void *tag)
166{
167	INIT_VNET_INET(curvnet);
168
169	uma_zone_set_max(V_ripcbinfo.ipi_zone, maxsockets);
170}
171
/*
 * Callback handed to uma_zcreate() by rip_init(): initializes the lock of
 * the inpcb stored at 'mem'.  The size and flags arguments are not
 * consulted.  Always returns 0.
 */
static int
rip_inpcb_init(void *mem, int size, int flags)
{
	struct inpcb *inp;

	inp = mem;
	INP_LOCK_INIT(inp, "inp", "rawinp");
	return (0);
}
180
181void
182rip_init(void)
183{
184	INIT_VNET_INET(curvnet);
185
186	INP_INFO_LOCK_INIT(&V_ripcbinfo, "rip");
187	LIST_INIT(&V_ripcb);
188	V_ripcbinfo.ipi_listhead = &V_ripcb;
189	V_ripcbinfo.ipi_hashbase =
190	    hashinit(INP_PCBHASH_RAW_SIZE, M_PCB, &V_ripcbinfo.ipi_hashmask);
191	V_ripcbinfo.ipi_porthashbase =
192	    hashinit(1, M_PCB, &V_ripcbinfo.ipi_porthashmask);
193	V_ripcbinfo.ipi_zone = uma_zcreate("ripcb", sizeof(struct inpcb),
194	    NULL, NULL, rip_inpcb_init, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
195	uma_zone_set_max(V_ripcbinfo.ipi_zone, maxsockets);
196	EVENTHANDLER_REGISTER(maxsockets_change, rip_zone_change, NULL,
197	    EVENTHANDLER_PRI_ANY);
198}
199
200static int
201rip_append(struct inpcb *last, struct ip *ip, struct mbuf *n,
202    struct sockaddr_in *ripsrc)
203{
204	int policyfail = 0;
205
206	INP_RLOCK_ASSERT(last);
207
208#ifdef IPSEC
209	/* check AH/ESP integrity. */
210	if (ipsec4_in_reject(n, last)) {
211		policyfail = 1;
212	}
213#endif /* IPSEC */
214#ifdef MAC
215	if (!policyfail && mac_inpcb_check_deliver(last, n) != 0)
216		policyfail = 1;
217#endif
218	/* Check the minimum TTL for socket. */
219	if (last->inp_ip_minttl && last->inp_ip_minttl > ip->ip_ttl)
220		policyfail = 1;
221	if (!policyfail) {
222		struct mbuf *opts = NULL;
223		struct socket *so;
224
225		so = last->inp_socket;
226		if ((last->inp_flags & INP_CONTROLOPTS) ||
227		    (so->so_options & (SO_TIMESTAMP | SO_BINTIME)))
228			ip_savecontrol(last, &opts, ip, n);
229		SOCKBUF_LOCK(&so->so_rcv);
230		if (sbappendaddr_locked(&so->so_rcv,
231		    (struct sockaddr *)ripsrc, n, opts) == 0) {
232			/* should notify about lost packet */
233			m_freem(n);
234			if (opts)
235				m_freem(opts);
236			SOCKBUF_UNLOCK(&so->so_rcv);
237		} else
238			sorwakeup_locked(so);
239	} else
240		m_freem(n);
241	return (policyfail);
242}
243
244/*
245 * Setup generic address and protocol structures for raw_input routine, then
246 * pass them along with mbuf chain.
247 */
248void
249rip_input(struct mbuf *m, int off)
250{
251	INIT_VNET_INET(curvnet);
252	struct ip *ip = mtod(m, struct ip *);
253	int proto = ip->ip_p;
254	struct inpcb *inp, *last;
255	struct sockaddr_in ripsrc;
256	int hash;
257
258	bzero(&ripsrc, sizeof(ripsrc));
259	ripsrc.sin_len = sizeof(ripsrc);
260	ripsrc.sin_family = AF_INET;
261	ripsrc.sin_addr = ip->ip_src;
262	last = NULL;
263	hash = INP_PCBHASH_RAW(proto, ip->ip_src.s_addr,
264	    ip->ip_dst.s_addr, V_ripcbinfo.ipi_hashmask);
265	INP_INFO_RLOCK(&V_ripcbinfo);
266	LIST_FOREACH(inp, &V_ripcbinfo.ipi_hashbase[hash], inp_hash) {
267		if (inp->inp_ip_p != proto)
268			continue;
269#ifdef INET6
270		/* XXX inp locking */
271		if ((inp->inp_vflag & INP_IPV4) == 0)
272			continue;
273#endif
274		if (inp->inp_laddr.s_addr != ip->ip_dst.s_addr)
275			continue;
276		if (inp->inp_faddr.s_addr != ip->ip_src.s_addr)
277			continue;
278		if (jailed(inp->inp_cred)) {
279			if (!prison_check_ip4(inp->inp_cred, &ip->ip_dst))
280				continue;
281		}
282		if (last) {
283			struct mbuf *n;
284
285			n = m_copy(m, 0, (int)M_COPYALL);
286			if (n != NULL)
287		    	    (void) rip_append(last, ip, n, &ripsrc);
288			/* XXX count dropped packet */
289			INP_RUNLOCK(last);
290		}
291		INP_RLOCK(inp);
292		last = inp;
293	}
294	LIST_FOREACH(inp, &V_ripcbinfo.ipi_hashbase[0], inp_hash) {
295		if (inp->inp_ip_p && inp->inp_ip_p != proto)
296			continue;
297#ifdef INET6
298		/* XXX inp locking */
299		if ((inp->inp_vflag & INP_IPV4) == 0)
300			continue;
301#endif
302		if (inp->inp_laddr.s_addr &&
303		    inp->inp_laddr.s_addr != ip->ip_dst.s_addr)
304			continue;
305		if (inp->inp_faddr.s_addr &&
306		    inp->inp_faddr.s_addr != ip->ip_src.s_addr)
307			continue;
308		if (jailed(inp->inp_cred)) {
309			if (!prison_check_ip4(inp->inp_cred, &ip->ip_dst))
310				continue;
311		}
312		if (last) {
313			struct mbuf *n;
314
315			n = m_copy(m, 0, (int)M_COPYALL);
316			if (n != NULL)
317				(void) rip_append(last, ip, n, &ripsrc);
318			/* XXX count dropped packet */
319			INP_RUNLOCK(last);
320		}
321		INP_RLOCK(inp);
322		last = inp;
323	}
324	INP_INFO_RUNLOCK(&V_ripcbinfo);
325	if (last != NULL) {
326		if (rip_append(last, ip, m, &ripsrc) != 0)
327			V_ipstat.ips_delivered--;
328		INP_RUNLOCK(last);
329	} else {
330		m_freem(m);
331		V_ipstat.ips_noproto++;
332		V_ipstat.ips_delivered--;
333	}
334}
335
336/*
337 * Generate IP header and pass packet to ip_output.  Tack on options user may
338 * have setup with control call.
339 */
340int
341rip_output(struct mbuf *m, struct socket *so, u_long dst)
342{
343	INIT_VNET_INET(so->so_vnet);
344	struct ip *ip;
345	int error;
346	struct inpcb *inp = sotoinpcb(so);
347	int flags = ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0) |
348	    IP_ALLOWBROADCAST;
349
350	/*
351	 * If the user handed us a complete IP packet, use it.  Otherwise,
352	 * allocate an mbuf for a header and fill it in.
353	 */
354	if ((inp->inp_flags & INP_HDRINCL) == 0) {
355		if (m->m_pkthdr.len + sizeof(struct ip) > IP_MAXPACKET) {
356			m_freem(m);
357			return(EMSGSIZE);
358		}
359		M_PREPEND(m, sizeof(struct ip), M_DONTWAIT);
360		if (m == NULL)
361			return(ENOBUFS);
362
363		INP_RLOCK(inp);
364		ip = mtod(m, struct ip *);
365		ip->ip_tos = inp->inp_ip_tos;
366		if (inp->inp_flags & INP_DONTFRAG)
367			ip->ip_off = IP_DF;
368		else
369			ip->ip_off = 0;
370		ip->ip_p = inp->inp_ip_p;
371		ip->ip_len = m->m_pkthdr.len;
372		if (jailed(inp->inp_cred)) {
373			if (prison_getip4(inp->inp_cred, &ip->ip_src)) {
374				INP_RUNLOCK(inp);
375				m_freem(m);
376				return (EPERM);
377			}
378		} else {
379			ip->ip_src = inp->inp_laddr;
380		}
381		ip->ip_dst.s_addr = dst;
382		ip->ip_ttl = inp->inp_ip_ttl;
383	} else {
384		if (m->m_pkthdr.len > IP_MAXPACKET) {
385			m_freem(m);
386			return(EMSGSIZE);
387		}
388		INP_RLOCK(inp);
389		ip = mtod(m, struct ip *);
390		if (!prison_check_ip4(inp->inp_cred, &ip->ip_src)) {
391			INP_RUNLOCK(inp);
392			m_freem(m);
393			return (EPERM);
394		}
395
396		/*
397		 * Don't allow both user specified and setsockopt options,
398		 * and don't allow packet length sizes that will crash.
399		 */
400		if (((ip->ip_hl != (sizeof (*ip) >> 2)) && inp->inp_options)
401		    || (ip->ip_len > m->m_pkthdr.len)
402		    || (ip->ip_len < (ip->ip_hl << 2))) {
403			INP_RUNLOCK(inp);
404			m_freem(m);
405			return (EINVAL);
406		}
407		if (ip->ip_id == 0)
408			ip->ip_id = ip_newid();
409
410		/*
411		 * XXX prevent ip_output from overwriting header fields.
412		 */
413		flags |= IP_RAWOUTPUT;
414		V_ipstat.ips_rawout++;
415	}
416
417	if (inp->inp_flags & INP_ONESBCAST)
418		flags |= IP_SENDONES;
419
420#ifdef MAC
421	mac_inpcb_create_mbuf(inp, m);
422#endif
423
424	error = ip_output(m, inp->inp_options, NULL, flags,
425	    inp->inp_moptions, inp);
426	INP_RUNLOCK(inp);
427	return (error);
428}
429
430/*
431 * Raw IP socket option processing.
432 *
433 * IMPORTANT NOTE regarding access control: Traditionally, raw sockets could
434 * only be created by a privileged process, and as such, socket option
435 * operations to manage system properties on any raw socket were allowed to
436 * take place without explicit additional access control checks.  However,
437 * raw sockets can now also be created in jail(), and therefore explicit
438 * checks are now required.  Likewise, raw sockets can be used by a process
439 * after it gives up privilege, so some caution is required.  For options
440 * passed down to the IP layer via ip_ctloutput(), checks are assumed to be
441 * performed in ip_ctloutput() and therefore no check occurs here.
442 * Unilaterally checking priv_check() here breaks normal IP socket option
443 * operations on raw sockets.
444 *
445 * When adding new socket options here, make sure to add access control
446 * checks here as necessary.
447 */
448int
449rip_ctloutput(struct socket *so, struct sockopt *sopt)
450{
451	struct	inpcb *inp = sotoinpcb(so);
452	int	error, optval;
453
454	if (sopt->sopt_level != IPPROTO_IP) {
455		if ((sopt->sopt_level == SOL_SOCKET) &&
456		    (sopt->sopt_name == SO_SETFIB)) {
457			inp->inp_inc.inc_fibnum = so->so_fibnum;
458			return (0);
459		}
460		return (EINVAL);
461	}
462
463	error = 0;
464	switch (sopt->sopt_dir) {
465	case SOPT_GET:
466		switch (sopt->sopt_name) {
467		case IP_HDRINCL:
468			optval = inp->inp_flags & INP_HDRINCL;
469			error = sooptcopyout(sopt, &optval, sizeof optval);
470			break;
471
472		case IP_FW_ADD:	/* ADD actually returns the body... */
473		case IP_FW_GET:
474		case IP_FW_TABLE_GETSIZE:
475		case IP_FW_TABLE_LIST:
476		case IP_FW_NAT_GET_CONFIG:
477		case IP_FW_NAT_GET_LOG:
478			if (ip_fw_ctl_ptr != NULL)
479				error = ip_fw_ctl_ptr(sopt);
480			else
481				error = ENOPROTOOPT;
482			break;
483
484		case IP_DUMMYNET_GET:
485			if (ip_dn_ctl_ptr != NULL)
486				error = ip_dn_ctl_ptr(sopt);
487			else
488				error = ENOPROTOOPT;
489			break ;
490
491		case MRT_INIT:
492		case MRT_DONE:
493		case MRT_ADD_VIF:
494		case MRT_DEL_VIF:
495		case MRT_ADD_MFC:
496		case MRT_DEL_MFC:
497		case MRT_VERSION:
498		case MRT_ASSERT:
499		case MRT_API_SUPPORT:
500		case MRT_API_CONFIG:
501		case MRT_ADD_BW_UPCALL:
502		case MRT_DEL_BW_UPCALL:
503			error = priv_check(curthread, PRIV_NETINET_MROUTE);
504			if (error != 0)
505				return (error);
506			error = ip_mrouter_get ? ip_mrouter_get(so, sopt) :
507				EOPNOTSUPP;
508			break;
509
510		default:
511			error = ip_ctloutput(so, sopt);
512			break;
513		}
514		break;
515
516	case SOPT_SET:
517		switch (sopt->sopt_name) {
518		case IP_HDRINCL:
519			error = sooptcopyin(sopt, &optval, sizeof optval,
520					    sizeof optval);
521			if (error)
522				break;
523			if (optval)
524				inp->inp_flags |= INP_HDRINCL;
525			else
526				inp->inp_flags &= ~INP_HDRINCL;
527			break;
528
529		case IP_FW_ADD:
530		case IP_FW_DEL:
531		case IP_FW_FLUSH:
532		case IP_FW_ZERO:
533		case IP_FW_RESETLOG:
534		case IP_FW_TABLE_ADD:
535		case IP_FW_TABLE_DEL:
536		case IP_FW_TABLE_FLUSH:
537		case IP_FW_NAT_CFG:
538		case IP_FW_NAT_DEL:
539			if (ip_fw_ctl_ptr != NULL)
540				error = ip_fw_ctl_ptr(sopt);
541			else
542				error = ENOPROTOOPT;
543			break;
544
545		case IP_DUMMYNET_CONFIGURE:
546		case IP_DUMMYNET_DEL:
547		case IP_DUMMYNET_FLUSH:
548			if (ip_dn_ctl_ptr != NULL)
549				error = ip_dn_ctl_ptr(sopt);
550			else
551				error = ENOPROTOOPT ;
552			break ;
553
554		case IP_RSVP_ON:
555			error = priv_check(curthread, PRIV_NETINET_MROUTE);
556			if (error != 0)
557				return (error);
558			error = ip_rsvp_init(so);
559			break;
560
561		case IP_RSVP_OFF:
562			error = priv_check(curthread, PRIV_NETINET_MROUTE);
563			if (error != 0)
564				return (error);
565			error = ip_rsvp_done();
566			break;
567
568		case IP_RSVP_VIF_ON:
569		case IP_RSVP_VIF_OFF:
570			error = priv_check(curthread, PRIV_NETINET_MROUTE);
571			if (error != 0)
572				return (error);
573			error = ip_rsvp_vif ?
574				ip_rsvp_vif(so, sopt) : EINVAL;
575			break;
576
577		case MRT_INIT:
578		case MRT_DONE:
579		case MRT_ADD_VIF:
580		case MRT_DEL_VIF:
581		case MRT_ADD_MFC:
582		case MRT_DEL_MFC:
583		case MRT_VERSION:
584		case MRT_ASSERT:
585		case MRT_API_SUPPORT:
586		case MRT_API_CONFIG:
587		case MRT_ADD_BW_UPCALL:
588		case MRT_DEL_BW_UPCALL:
589			error = priv_check(curthread, PRIV_NETINET_MROUTE);
590			if (error != 0)
591				return (error);
592			error = ip_mrouter_set ? ip_mrouter_set(so, sopt) :
593					EOPNOTSUPP;
594			break;
595
596		default:
597			error = ip_ctloutput(so, sopt);
598			break;
599		}
600		break;
601	}
602
603	return (error);
604}
605
606/*
607 * This function exists solely to receive the PRC_IFDOWN messages which are
608 * sent by if_down().  It looks for an ifaddr whose ifa_addr is sa, and calls
609 * in_ifadown() to remove all routes corresponding to that address.  It also
610 * receives the PRC_IFUP messages from if_up() and reinstalls the interface
611 * routes.
612 */
613void
614rip_ctlinput(int cmd, struct sockaddr *sa, void *vip)
615{
616	INIT_VNET_INET(curvnet);
617	struct in_ifaddr *ia;
618	struct ifnet *ifp;
619	int err;
620	int flags;
621
622	switch (cmd) {
623	case PRC_IFDOWN:
624		TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
625			if (ia->ia_ifa.ifa_addr == sa
626			    && (ia->ia_flags & IFA_ROUTE)) {
627				/*
628				 * in_ifscrub kills the interface route.
629				 */
630				in_ifscrub(ia->ia_ifp, ia);
631				/*
632				 * in_ifadown gets rid of all the rest of the
633				 * routes.  This is not quite the right thing
634				 * to do, but at least if we are running a
635				 * routing process they will come back.
636				 */
637				in_ifadown(&ia->ia_ifa, 0);
638				break;
639			}
640		}
641		break;
642
643	case PRC_IFUP:
644		TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
645			if (ia->ia_ifa.ifa_addr == sa)
646				break;
647		}
648		if (ia == 0 || (ia->ia_flags & IFA_ROUTE))
649			return;
650		flags = RTF_UP;
651		ifp = ia->ia_ifa.ifa_ifp;
652
653		if ((ifp->if_flags & IFF_LOOPBACK)
654		    || (ifp->if_flags & IFF_POINTOPOINT))
655			flags |= RTF_HOST;
656
657		err = rtinit(&ia->ia_ifa, RTM_ADD, flags);
658		if (err == 0)
659			ia->ia_flags |= IFA_ROUTE;
660		break;
661	}
662}
663
664u_long	rip_sendspace = 9216;
665u_long	rip_recvspace = 9216;
666
667SYSCTL_ULONG(_net_inet_raw, OID_AUTO, maxdgram, CTLFLAG_RW,
668    &rip_sendspace, 0, "Maximum outgoing raw IP datagram size");
669SYSCTL_ULONG(_net_inet_raw, OID_AUTO, recvspace, CTLFLAG_RW,
670    &rip_recvspace, 0, "Maximum space for incoming raw IP datagrams");
671
672static int
673rip_attach(struct socket *so, int proto, struct thread *td)
674{
675	INIT_VNET_INET(so->so_vnet);
676	struct inpcb *inp;
677	int error;
678
679	inp = sotoinpcb(so);
680	KASSERT(inp == NULL, ("rip_attach: inp != NULL"));
681
682	error = priv_check(td, PRIV_NETINET_RAW);
683	if (error)
684		return (error);
685	if (proto >= IPPROTO_MAX || proto < 0)
686		return EPROTONOSUPPORT;
687	error = soreserve(so, rip_sendspace, rip_recvspace);
688	if (error)
689		return (error);
690	INP_INFO_WLOCK(&V_ripcbinfo);
691	error = in_pcballoc(so, &V_ripcbinfo);
692	if (error) {
693		INP_INFO_WUNLOCK(&V_ripcbinfo);
694		return (error);
695	}
696	inp = (struct inpcb *)so->so_pcb;
697	inp->inp_vflag |= INP_IPV4;
698	inp->inp_ip_p = proto;
699	inp->inp_ip_ttl = V_ip_defttl;
700	rip_inshash(inp);
701	INP_INFO_WUNLOCK(&V_ripcbinfo);
702	INP_WUNLOCK(inp);
703	return (0);
704}
705
706static void
707rip_detach(struct socket *so)
708{
709	INIT_VNET_INET(so->so_vnet);
710	struct inpcb *inp;
711
712	inp = sotoinpcb(so);
713	KASSERT(inp != NULL, ("rip_detach: inp == NULL"));
714	KASSERT(inp->inp_faddr.s_addr == INADDR_ANY,
715	    ("rip_detach: not closed"));
716
717	INP_INFO_WLOCK(&V_ripcbinfo);
718	INP_WLOCK(inp);
719	rip_delhash(inp);
720	if (so == V_ip_mrouter && ip_mrouter_done)
721		ip_mrouter_done();
722	if (ip_rsvp_force_done)
723		ip_rsvp_force_done(so);
724	if (so == V_ip_rsvpd)
725		ip_rsvp_done();
726	in_pcbdetach(inp);
727	in_pcbfree(inp);
728	INP_INFO_WUNLOCK(&V_ripcbinfo);
729}
730
731static void
732rip_dodisconnect(struct socket *so, struct inpcb *inp)
733{
734
735	INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo);
736	INP_WLOCK_ASSERT(inp);
737
738	rip_delhash(inp);
739	inp->inp_faddr.s_addr = INADDR_ANY;
740	rip_inshash(inp);
741	SOCK_LOCK(so);
742	so->so_state &= ~SS_ISCONNECTED;
743	SOCK_UNLOCK(so);
744}
745
746static void
747rip_abort(struct socket *so)
748{
749	INIT_VNET_INET(so->so_vnet);
750	struct inpcb *inp;
751
752	inp = sotoinpcb(so);
753	KASSERT(inp != NULL, ("rip_abort: inp == NULL"));
754
755	INP_INFO_WLOCK(&V_ripcbinfo);
756	INP_WLOCK(inp);
757	rip_dodisconnect(so, inp);
758	INP_WUNLOCK(inp);
759	INP_INFO_WUNLOCK(&V_ripcbinfo);
760}
761
762static void
763rip_close(struct socket *so)
764{
765	INIT_VNET_INET(so->so_vnet);
766	struct inpcb *inp;
767
768	inp = sotoinpcb(so);
769	KASSERT(inp != NULL, ("rip_close: inp == NULL"));
770
771	INP_INFO_WLOCK(&V_ripcbinfo);
772	INP_WLOCK(inp);
773	rip_dodisconnect(so, inp);
774	INP_WUNLOCK(inp);
775	INP_INFO_WUNLOCK(&V_ripcbinfo);
776}
777
778static int
779rip_disconnect(struct socket *so)
780{
781	INIT_VNET_INET(so->so_vnet);
782	struct inpcb *inp;
783
784	if ((so->so_state & SS_ISCONNECTED) == 0)
785		return (ENOTCONN);
786
787	inp = sotoinpcb(so);
788	KASSERT(inp != NULL, ("rip_disconnect: inp == NULL"));
789
790	INP_INFO_WLOCK(&V_ripcbinfo);
791	INP_WLOCK(inp);
792	rip_dodisconnect(so, inp);
793	INP_WUNLOCK(inp);
794	INP_INFO_WUNLOCK(&V_ripcbinfo);
795	return (0);
796}
797
798static int
799rip_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
800{
801	INIT_VNET_NET(so->so_vnet);
802	INIT_VNET_INET(so->so_vnet);
803	struct sockaddr_in *addr = (struct sockaddr_in *)nam;
804	struct inpcb *inp;
805
806	if (nam->sa_len != sizeof(*addr))
807		return (EINVAL);
808
809	if (!prison_check_ip4(td->td_ucred, &addr->sin_addr))
810		return (EADDRNOTAVAIL);
811
812	if (TAILQ_EMPTY(&V_ifnet) ||
813	    (addr->sin_family != AF_INET && addr->sin_family != AF_IMPLINK) ||
814	    (addr->sin_addr.s_addr &&
815	     ifa_ifwithaddr((struct sockaddr *)addr) == 0))
816		return (EADDRNOTAVAIL);
817
818	inp = sotoinpcb(so);
819	KASSERT(inp != NULL, ("rip_bind: inp == NULL"));
820
821	INP_INFO_WLOCK(&V_ripcbinfo);
822	INP_WLOCK(inp);
823	rip_delhash(inp);
824	inp->inp_laddr = addr->sin_addr;
825	rip_inshash(inp);
826	INP_WUNLOCK(inp);
827	INP_INFO_WUNLOCK(&V_ripcbinfo);
828	return (0);
829}
830
831static int
832rip_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
833{
834	INIT_VNET_NET(so->so_vnet);
835	INIT_VNET_INET(so->so_vnet);
836	struct sockaddr_in *addr = (struct sockaddr_in *)nam;
837	struct inpcb *inp;
838
839	if (nam->sa_len != sizeof(*addr))
840		return (EINVAL);
841	if (TAILQ_EMPTY(&V_ifnet))
842		return (EADDRNOTAVAIL);
843	if (addr->sin_family != AF_INET && addr->sin_family != AF_IMPLINK)
844		return (EAFNOSUPPORT);
845
846	inp = sotoinpcb(so);
847	KASSERT(inp != NULL, ("rip_connect: inp == NULL"));
848
849	INP_INFO_WLOCK(&V_ripcbinfo);
850	INP_WLOCK(inp);
851	rip_delhash(inp);
852	inp->inp_faddr = addr->sin_addr;
853	rip_inshash(inp);
854	soisconnected(so);
855	INP_WUNLOCK(inp);
856	INP_INFO_WUNLOCK(&V_ripcbinfo);
857	return (0);
858}
859
860static int
861rip_shutdown(struct socket *so)
862{
863	struct inpcb *inp;
864
865	inp = sotoinpcb(so);
866	KASSERT(inp != NULL, ("rip_shutdown: inp == NULL"));
867
868	INP_WLOCK(inp);
869	socantsendmore(so);
870	INP_WUNLOCK(inp);
871	return (0);
872}
873
874static int
875rip_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
876    struct mbuf *control, struct thread *td)
877{
878	struct inpcb *inp;
879	u_long dst;
880
881	inp = sotoinpcb(so);
882	KASSERT(inp != NULL, ("rip_send: inp == NULL"));
883
884	/*
885	 * Note: 'dst' reads below are unlocked.
886	 */
887	if (so->so_state & SS_ISCONNECTED) {
888		if (nam) {
889			m_freem(m);
890			return (EISCONN);
891		}
892		dst = inp->inp_faddr.s_addr;	/* Unlocked read. */
893	} else {
894		if (nam == NULL) {
895			m_freem(m);
896			return (ENOTCONN);
897		}
898		dst = ((struct sockaddr_in *)nam)->sin_addr.s_addr;
899	}
900	return (rip_output(m, so, dst));
901}
902
903static int
904rip_pcblist(SYSCTL_HANDLER_ARGS)
905{
906	INIT_VNET_INET(curvnet);
907	int error, i, n;
908	struct inpcb *inp, **inp_list;
909	inp_gen_t gencnt;
910	struct xinpgen xig;
911
912	/*
913	 * The process of preparing the TCB list is too time-consuming and
914	 * resource-intensive to repeat twice on every request.
915	 */
916	if (req->oldptr == 0) {
917		n = V_ripcbinfo.ipi_count;
918		req->oldidx = 2 * (sizeof xig)
919		    + (n + n/8) * sizeof(struct xinpcb);
920		return (0);
921	}
922
923	if (req->newptr != 0)
924		return (EPERM);
925
926	/*
927	 * OK, now we're committed to doing something.
928	 */
929	INP_INFO_RLOCK(&V_ripcbinfo);
930	gencnt = V_ripcbinfo.ipi_gencnt;
931	n = V_ripcbinfo.ipi_count;
932	INP_INFO_RUNLOCK(&V_ripcbinfo);
933
934	xig.xig_len = sizeof xig;
935	xig.xig_count = n;
936	xig.xig_gen = gencnt;
937	xig.xig_sogen = so_gencnt;
938	error = SYSCTL_OUT(req, &xig, sizeof xig);
939	if (error)
940		return (error);
941
942	inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK);
943	if (inp_list == 0)
944		return (ENOMEM);
945
946	INP_INFO_RLOCK(&V_ripcbinfo);
947	for (inp = LIST_FIRST(V_ripcbinfo.ipi_listhead), i = 0; inp && i < n;
948	     inp = LIST_NEXT(inp, inp_list)) {
949		INP_RLOCK(inp);
950		if (inp->inp_gencnt <= gencnt &&
951		    cr_canseeinpcb(req->td->td_ucred, inp) == 0) {
952			/* XXX held references? */
953			inp_list[i++] = inp;
954		}
955		INP_RUNLOCK(inp);
956	}
957	INP_INFO_RUNLOCK(&V_ripcbinfo);
958	n = i;
959
960	error = 0;
961	for (i = 0; i < n; i++) {
962		inp = inp_list[i];
963		INP_RLOCK(inp);
964		if (inp->inp_gencnt <= gencnt) {
965			struct xinpcb xi;
966			bzero(&xi, sizeof(xi));
967			xi.xi_len = sizeof xi;
968			/* XXX should avoid extra copy */
969			bcopy(inp, &xi.xi_inp, sizeof *inp);
970			if (inp->inp_socket)
971				sotoxsocket(inp->inp_socket, &xi.xi_socket);
972			INP_RUNLOCK(inp);
973			error = SYSCTL_OUT(req, &xi, sizeof xi);
974		} else
975			INP_RUNLOCK(inp);
976	}
977	if (!error) {
978		/*
979		 * Give the user an updated idea of our state.  If the
980		 * generation differs from what we told her before, she knows
981		 * that something happened while we were processing this
982		 * request, and it might be necessary to retry.
983		 */
984		INP_INFO_RLOCK(&V_ripcbinfo);
985		xig.xig_gen = V_ripcbinfo.ipi_gencnt;
986		xig.xig_sogen = so_gencnt;
987		xig.xig_count = V_ripcbinfo.ipi_count;
988		INP_INFO_RUNLOCK(&V_ripcbinfo);
989		error = SYSCTL_OUT(req, &xig, sizeof xig);
990	}
991	free(inp_list, M_TEMP);
992	return (error);
993}
994
995SYSCTL_PROC(_net_inet_raw, OID_AUTO/*XXX*/, pcblist, CTLFLAG_RD, 0, 0,
996    rip_pcblist, "S,xinpcb", "List of active raw IP sockets");
997
998struct pr_usrreqs rip_usrreqs = {
999	.pru_abort =		rip_abort,
1000	.pru_attach =		rip_attach,
1001	.pru_bind =		rip_bind,
1002	.pru_connect =		rip_connect,
1003	.pru_control =		in_control,
1004	.pru_detach =		rip_detach,
1005	.pru_disconnect =	rip_disconnect,
1006	.pru_peeraddr =		in_getpeeraddr,
1007	.pru_send =		rip_send,
1008	.pru_shutdown =		rip_shutdown,
1009	.pru_sockaddr =		in_getsockaddr,
1010	.pru_sosetlabel =	in_pcbsosetlabel,
1011	.pru_close =		rip_close,
1012};
1013