raw_ip.c revision 189106
/*-
 * Copyright (c) 1982, 1986, 1988, 1993
 *	The Regents of the University of California.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)raw_ip.c	8.7 (Berkeley) 5/15/95
 */
32
33#include <sys/cdefs.h>
34__FBSDID("$FreeBSD: head/sys/netinet/raw_ip.c 189106 2009-02-27 14:12:05Z bz $");
35
36#include "opt_inet6.h"
37#include "opt_ipsec.h"
38#include "opt_route.h"
39#include "opt_mac.h"
40
41#include <sys/param.h>
42#include <sys/jail.h>
43#include <sys/kernel.h>
44#include <sys/lock.h>
45#include <sys/malloc.h>
46#include <sys/mbuf.h>
47#include <sys/priv.h>
48#include <sys/proc.h>
49#include <sys/protosw.h>
50#include <sys/rwlock.h>
51#include <sys/signalvar.h>
52#include <sys/socket.h>
53#include <sys/socketvar.h>
54#include <sys/sx.h>
55#include <sys/sysctl.h>
56#include <sys/systm.h>
57#include <sys/vimage.h>
58
59#include <vm/uma.h>
60
61#include <net/if.h>
62#include <net/route.h>
63#include <net/vnet.h>
64
65#include <netinet/in.h>
66#include <netinet/in_systm.h>
67#include <netinet/in_pcb.h>
68#include <netinet/in_var.h>
69#include <netinet/ip.h>
70#include <netinet/ip_var.h>
71#include <netinet/ip_mroute.h>
72
73#include <netinet/ip_fw.h>
74#include <netinet/ip_dummynet.h>
75#include <netinet/vinet.h>
76
77#ifdef IPSEC
78#include <netipsec/ipsec.h>
79#endif /*IPSEC*/
80
81#include <security/mac/mac_framework.h>
82
/*
 * Raw IP PCB bookkeeping structure and PCB list head.  They are defined as
 * plain globals only when VIMAGE_GLOBALS is set; the rest of this file
 * refers to them as V_ripcbinfo and V_ripcb.
 */
#ifdef VIMAGE_GLOBALS
struct inpcbinfo	ripcbinfo;
struct inpcbhead	ripcb;
#endif /* VIMAGE_GLOBALS */
87
88/* control hooks for ipfw and dummynet */
89ip_fw_ctl_t *ip_fw_ctl_ptr = NULL;
90ip_dn_ctl_t *ip_dn_ctl_ptr = NULL;
91
92/*
93 * Hooks for multicast routing. They all default to NULL, so leave them not
94 * initialized and rely on BSS being set to 0.
95 */
96
97/*
98 * The socket used to communicate with the multicast routing daemon.
99 */
100#ifdef VIMAGE_GLOBALS
101struct socket  *ip_mrouter;
102#endif
103
104/*
105 * The various mrouter and rsvp functions.
106 */
107int (*ip_mrouter_set)(struct socket *, struct sockopt *);
108int (*ip_mrouter_get)(struct socket *, struct sockopt *);
109int (*ip_mrouter_done)(void);
110int (*ip_mforward)(struct ip *, struct ifnet *, struct mbuf *,
111		   struct ip_moptions *);
112int (*mrt_ioctl)(int, caddr_t, int);
113int (*legal_vif_num)(int);
114u_long (*ip_mcast_src)(int);
115
116void (*rsvp_input_p)(struct mbuf *m, int off);
117int (*ip_rsvp_vif)(struct socket *, struct sockopt *);
118void (*ip_rsvp_force_done)(struct socket *);
119
/*
 * Hash functions
 */

/* Table size handed to hashinit() for the raw PCB hash. */
#define INP_PCBHASH_RAW_SIZE	256

/*
 * Bucket index for a (protocol, local address, foreign address) triple.
 * The value is always at least 1; rip_inshash() uses bucket 0 for PCBs
 * that have a wildcard protocol or address.
 */
#define INP_PCBHASH_RAW(proto, laddr, faddr, mask) \
	(1 + ((proto) + (laddr) + (faddr)) % (mask))
127
128static void
129rip_inshash(struct inpcb *inp)
130{
131	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
132	struct inpcbhead *pcbhash;
133	int hash;
134
135	INP_INFO_WLOCK_ASSERT(pcbinfo);
136	INP_WLOCK_ASSERT(inp);
137
138	if (inp->inp_ip_p != 0 &&
139	    inp->inp_laddr.s_addr != INADDR_ANY &&
140	    inp->inp_faddr.s_addr != INADDR_ANY) {
141		hash = INP_PCBHASH_RAW(inp->inp_ip_p, inp->inp_laddr.s_addr,
142		    inp->inp_faddr.s_addr, pcbinfo->ipi_hashmask);
143	} else
144		hash = 0;
145	pcbhash = &pcbinfo->ipi_hashbase[hash];
146	LIST_INSERT_HEAD(pcbhash, inp, inp_hash);
147}
148
149static void
150rip_delhash(struct inpcb *inp)
151{
152
153	INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo);
154	INP_WLOCK_ASSERT(inp);
155
156	LIST_REMOVE(inp, inp_hash);
157}
158
159/*
160 * Raw interface to IP protocol.
161 */
162
163/*
164 * Initialize raw connection block q.
165 */
166static void
167rip_zone_change(void *tag)
168{
169	INIT_VNET_INET(curvnet);
170
171	uma_zone_set_max(V_ripcbinfo.ipi_zone, maxsockets);
172}
173
/*
 * Zone init callback handed to uma_zcreate() for the "ripcb" zone: set up
 * the lock embedded in the inpcb stored at mem.  The size and flags
 * arguments are not used.  Always returns 0.
 */
static int
rip_inpcb_init(void *mem, int size, int flags)
{
	struct inpcb *pcb;

	pcb = mem;
	INP_LOCK_INIT(pcb, "inp", "rawinp");
	return (0);
}
182
183void
184rip_init(void)
185{
186	INIT_VNET_INET(curvnet);
187
188	INP_INFO_LOCK_INIT(&V_ripcbinfo, "rip");
189	LIST_INIT(&V_ripcb);
190	V_ripcbinfo.ipi_listhead = &V_ripcb;
191	V_ripcbinfo.ipi_hashbase =
192	    hashinit(INP_PCBHASH_RAW_SIZE, M_PCB, &V_ripcbinfo.ipi_hashmask);
193	V_ripcbinfo.ipi_porthashbase =
194	    hashinit(1, M_PCB, &V_ripcbinfo.ipi_porthashmask);
195	V_ripcbinfo.ipi_zone = uma_zcreate("ripcb", sizeof(struct inpcb),
196	    NULL, NULL, rip_inpcb_init, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
197	uma_zone_set_max(V_ripcbinfo.ipi_zone, maxsockets);
198	EVENTHANDLER_REGISTER(maxsockets_change, rip_zone_change, NULL,
199	    EVENTHANDLER_PRI_ANY);
200}
201
202static int
203rip_append(struct inpcb *last, struct ip *ip, struct mbuf *n,
204    struct sockaddr_in *ripsrc)
205{
206	int policyfail = 0;
207
208	INP_RLOCK_ASSERT(last);
209
210#ifdef IPSEC
211	/* check AH/ESP integrity. */
212	if (ipsec4_in_reject(n, last)) {
213		policyfail = 1;
214	}
215#endif /* IPSEC */
216#ifdef MAC
217	if (!policyfail && mac_inpcb_check_deliver(last, n) != 0)
218		policyfail = 1;
219#endif
220	/* Check the minimum TTL for socket. */
221	if (last->inp_ip_minttl && last->inp_ip_minttl > ip->ip_ttl)
222		policyfail = 1;
223	if (!policyfail) {
224		struct mbuf *opts = NULL;
225		struct socket *so;
226
227		so = last->inp_socket;
228		if ((last->inp_flags & INP_CONTROLOPTS) ||
229		    (so->so_options & (SO_TIMESTAMP | SO_BINTIME)))
230			ip_savecontrol(last, &opts, ip, n);
231		SOCKBUF_LOCK(&so->so_rcv);
232		if (sbappendaddr_locked(&so->so_rcv,
233		    (struct sockaddr *)ripsrc, n, opts) == 0) {
234			/* should notify about lost packet */
235			m_freem(n);
236			if (opts)
237				m_freem(opts);
238			SOCKBUF_UNLOCK(&so->so_rcv);
239		} else
240			sorwakeup_locked(so);
241	} else
242		m_freem(n);
243	return (policyfail);
244}
245
246/*
247 * Setup generic address and protocol structures for raw_input routine, then
248 * pass them along with mbuf chain.
249 */
250void
251rip_input(struct mbuf *m, int off)
252{
253	INIT_VNET_INET(curvnet);
254	struct ip *ip = mtod(m, struct ip *);
255	int proto = ip->ip_p;
256	struct inpcb *inp, *last;
257	struct sockaddr_in ripsrc;
258	int hash;
259
260	bzero(&ripsrc, sizeof(ripsrc));
261	ripsrc.sin_len = sizeof(ripsrc);
262	ripsrc.sin_family = AF_INET;
263	ripsrc.sin_addr = ip->ip_src;
264	last = NULL;
265	hash = INP_PCBHASH_RAW(proto, ip->ip_src.s_addr,
266	    ip->ip_dst.s_addr, V_ripcbinfo.ipi_hashmask);
267	INP_INFO_RLOCK(&V_ripcbinfo);
268	LIST_FOREACH(inp, &V_ripcbinfo.ipi_hashbase[hash], inp_hash) {
269		if (inp->inp_ip_p != proto)
270			continue;
271#ifdef INET6
272		/* XXX inp locking */
273		if ((inp->inp_vflag & INP_IPV4) == 0)
274			continue;
275#endif
276		if (inp->inp_laddr.s_addr != ip->ip_dst.s_addr)
277			continue;
278		if (inp->inp_faddr.s_addr != ip->ip_src.s_addr)
279			continue;
280		if (prison_check_ip4(inp->inp_cred, &ip->ip_dst) != 0)
281			continue;
282		if (last != NULL) {
283			struct mbuf *n;
284
285			n = m_copy(m, 0, (int)M_COPYALL);
286			if (n != NULL)
287		    	    (void) rip_append(last, ip, n, &ripsrc);
288			/* XXX count dropped packet */
289			INP_RUNLOCK(last);
290		}
291		INP_RLOCK(inp);
292		last = inp;
293	}
294	LIST_FOREACH(inp, &V_ripcbinfo.ipi_hashbase[0], inp_hash) {
295		if (inp->inp_ip_p && inp->inp_ip_p != proto)
296			continue;
297#ifdef INET6
298		/* XXX inp locking */
299		if ((inp->inp_vflag & INP_IPV4) == 0)
300			continue;
301#endif
302		if (inp->inp_laddr.s_addr &&
303		    inp->inp_laddr.s_addr != ip->ip_dst.s_addr)
304			continue;
305		if (inp->inp_faddr.s_addr &&
306		    inp->inp_faddr.s_addr != ip->ip_src.s_addr)
307			continue;
308		if (prison_check_ip4(inp->inp_cred, &ip->ip_dst) != 0)
309			continue;
310		if (last != NULL) {
311			struct mbuf *n;
312
313			n = m_copy(m, 0, (int)M_COPYALL);
314			if (n != NULL)
315				(void) rip_append(last, ip, n, &ripsrc);
316			/* XXX count dropped packet */
317			INP_RUNLOCK(last);
318		}
319		INP_RLOCK(inp);
320		last = inp;
321	}
322	INP_INFO_RUNLOCK(&V_ripcbinfo);
323	if (last != NULL) {
324		if (rip_append(last, ip, m, &ripsrc) != 0)
325			V_ipstat.ips_delivered--;
326		INP_RUNLOCK(last);
327	} else {
328		m_freem(m);
329		V_ipstat.ips_noproto++;
330		V_ipstat.ips_delivered--;
331	}
332}
333
334/*
335 * Generate IP header and pass packet to ip_output.  Tack on options user may
336 * have setup with control call.
337 */
338int
339rip_output(struct mbuf *m, struct socket *so, u_long dst)
340{
341	INIT_VNET_INET(so->so_vnet);
342	struct ip *ip;
343	int error;
344	struct inpcb *inp = sotoinpcb(so);
345	int flags = ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0) |
346	    IP_ALLOWBROADCAST;
347
348	/*
349	 * If the user handed us a complete IP packet, use it.  Otherwise,
350	 * allocate an mbuf for a header and fill it in.
351	 */
352	if ((inp->inp_flags & INP_HDRINCL) == 0) {
353		if (m->m_pkthdr.len + sizeof(struct ip) > IP_MAXPACKET) {
354			m_freem(m);
355			return(EMSGSIZE);
356		}
357		M_PREPEND(m, sizeof(struct ip), M_DONTWAIT);
358		if (m == NULL)
359			return(ENOBUFS);
360
361		INP_RLOCK(inp);
362		ip = mtod(m, struct ip *);
363		ip->ip_tos = inp->inp_ip_tos;
364		if (inp->inp_flags & INP_DONTFRAG)
365			ip->ip_off = IP_DF;
366		else
367			ip->ip_off = 0;
368		ip->ip_p = inp->inp_ip_p;
369		ip->ip_len = m->m_pkthdr.len;
370		ip->ip_src = inp->inp_laddr;
371		error = prison_get_ip4(inp->inp_cred, &ip->ip_src);
372		if (error != 0) {
373			INP_RUNLOCK(inp);
374			m_freem(m);
375			return (error);
376		}
377		ip->ip_dst.s_addr = dst;
378		ip->ip_ttl = inp->inp_ip_ttl;
379	} else {
380		if (m->m_pkthdr.len > IP_MAXPACKET) {
381			m_freem(m);
382			return(EMSGSIZE);
383		}
384		INP_RLOCK(inp);
385		ip = mtod(m, struct ip *);
386		error = prison_check_ip4(inp->inp_cred, &ip->ip_src);
387		if (error != 0) {
388			INP_RUNLOCK(inp);
389			m_freem(m);
390			return (error);
391		}
392
393		/*
394		 * Don't allow both user specified and setsockopt options,
395		 * and don't allow packet length sizes that will crash.
396		 */
397		if (((ip->ip_hl != (sizeof (*ip) >> 2)) && inp->inp_options)
398		    || (ip->ip_len > m->m_pkthdr.len)
399		    || (ip->ip_len < (ip->ip_hl << 2))) {
400			INP_RUNLOCK(inp);
401			m_freem(m);
402			return (EINVAL);
403		}
404		if (ip->ip_id == 0)
405			ip->ip_id = ip_newid();
406
407		/*
408		 * XXX prevent ip_output from overwriting header fields.
409		 */
410		flags |= IP_RAWOUTPUT;
411		V_ipstat.ips_rawout++;
412	}
413
414	if (inp->inp_flags & INP_ONESBCAST)
415		flags |= IP_SENDONES;
416
417#ifdef MAC
418	mac_inpcb_create_mbuf(inp, m);
419#endif
420
421	error = ip_output(m, inp->inp_options, NULL, flags,
422	    inp->inp_moptions, inp);
423	INP_RUNLOCK(inp);
424	return (error);
425}
426
427/*
428 * Raw IP socket option processing.
429 *
430 * IMPORTANT NOTE regarding access control: Traditionally, raw sockets could
431 * only be created by a privileged process, and as such, socket option
432 * operations to manage system properties on any raw socket were allowed to
433 * take place without explicit additional access control checks.  However,
434 * raw sockets can now also be created in jail(), and therefore explicit
435 * checks are now required.  Likewise, raw sockets can be used by a process
436 * after it gives up privilege, so some caution is required.  For options
437 * passed down to the IP layer via ip_ctloutput(), checks are assumed to be
438 * performed in ip_ctloutput() and therefore no check occurs here.
439 * Unilaterally checking priv_check() here breaks normal IP socket option
440 * operations on raw sockets.
441 *
442 * When adding new socket options here, make sure to add access control
443 * checks here as necessary.
444 */
445int
446rip_ctloutput(struct socket *so, struct sockopt *sopt)
447{
448	struct	inpcb *inp = sotoinpcb(so);
449	int	error, optval;
450
451	if (sopt->sopt_level != IPPROTO_IP) {
452		if ((sopt->sopt_level == SOL_SOCKET) &&
453		    (sopt->sopt_name == SO_SETFIB)) {
454			inp->inp_inc.inc_fibnum = so->so_fibnum;
455			return (0);
456		}
457		return (EINVAL);
458	}
459
460	error = 0;
461	switch (sopt->sopt_dir) {
462	case SOPT_GET:
463		switch (sopt->sopt_name) {
464		case IP_HDRINCL:
465			optval = inp->inp_flags & INP_HDRINCL;
466			error = sooptcopyout(sopt, &optval, sizeof optval);
467			break;
468
469		case IP_FW_ADD:	/* ADD actually returns the body... */
470		case IP_FW_GET:
471		case IP_FW_TABLE_GETSIZE:
472		case IP_FW_TABLE_LIST:
473		case IP_FW_NAT_GET_CONFIG:
474		case IP_FW_NAT_GET_LOG:
475			if (ip_fw_ctl_ptr != NULL)
476				error = ip_fw_ctl_ptr(sopt);
477			else
478				error = ENOPROTOOPT;
479			break;
480
481		case IP_DUMMYNET_GET:
482			if (ip_dn_ctl_ptr != NULL)
483				error = ip_dn_ctl_ptr(sopt);
484			else
485				error = ENOPROTOOPT;
486			break ;
487
488		case MRT_INIT:
489		case MRT_DONE:
490		case MRT_ADD_VIF:
491		case MRT_DEL_VIF:
492		case MRT_ADD_MFC:
493		case MRT_DEL_MFC:
494		case MRT_VERSION:
495		case MRT_ASSERT:
496		case MRT_API_SUPPORT:
497		case MRT_API_CONFIG:
498		case MRT_ADD_BW_UPCALL:
499		case MRT_DEL_BW_UPCALL:
500			error = priv_check(curthread, PRIV_NETINET_MROUTE);
501			if (error != 0)
502				return (error);
503			error = ip_mrouter_get ? ip_mrouter_get(so, sopt) :
504				EOPNOTSUPP;
505			break;
506
507		default:
508			error = ip_ctloutput(so, sopt);
509			break;
510		}
511		break;
512
513	case SOPT_SET:
514		switch (sopt->sopt_name) {
515		case IP_HDRINCL:
516			error = sooptcopyin(sopt, &optval, sizeof optval,
517					    sizeof optval);
518			if (error)
519				break;
520			if (optval)
521				inp->inp_flags |= INP_HDRINCL;
522			else
523				inp->inp_flags &= ~INP_HDRINCL;
524			break;
525
526		case IP_FW_ADD:
527		case IP_FW_DEL:
528		case IP_FW_FLUSH:
529		case IP_FW_ZERO:
530		case IP_FW_RESETLOG:
531		case IP_FW_TABLE_ADD:
532		case IP_FW_TABLE_DEL:
533		case IP_FW_TABLE_FLUSH:
534		case IP_FW_NAT_CFG:
535		case IP_FW_NAT_DEL:
536			if (ip_fw_ctl_ptr != NULL)
537				error = ip_fw_ctl_ptr(sopt);
538			else
539				error = ENOPROTOOPT;
540			break;
541
542		case IP_DUMMYNET_CONFIGURE:
543		case IP_DUMMYNET_DEL:
544		case IP_DUMMYNET_FLUSH:
545			if (ip_dn_ctl_ptr != NULL)
546				error = ip_dn_ctl_ptr(sopt);
547			else
548				error = ENOPROTOOPT ;
549			break ;
550
551		case IP_RSVP_ON:
552			error = priv_check(curthread, PRIV_NETINET_MROUTE);
553			if (error != 0)
554				return (error);
555			error = ip_rsvp_init(so);
556			break;
557
558		case IP_RSVP_OFF:
559			error = priv_check(curthread, PRIV_NETINET_MROUTE);
560			if (error != 0)
561				return (error);
562			error = ip_rsvp_done();
563			break;
564
565		case IP_RSVP_VIF_ON:
566		case IP_RSVP_VIF_OFF:
567			error = priv_check(curthread, PRIV_NETINET_MROUTE);
568			if (error != 0)
569				return (error);
570			error = ip_rsvp_vif ?
571				ip_rsvp_vif(so, sopt) : EINVAL;
572			break;
573
574		case MRT_INIT:
575		case MRT_DONE:
576		case MRT_ADD_VIF:
577		case MRT_DEL_VIF:
578		case MRT_ADD_MFC:
579		case MRT_DEL_MFC:
580		case MRT_VERSION:
581		case MRT_ASSERT:
582		case MRT_API_SUPPORT:
583		case MRT_API_CONFIG:
584		case MRT_ADD_BW_UPCALL:
585		case MRT_DEL_BW_UPCALL:
586			error = priv_check(curthread, PRIV_NETINET_MROUTE);
587			if (error != 0)
588				return (error);
589			error = ip_mrouter_set ? ip_mrouter_set(so, sopt) :
590					EOPNOTSUPP;
591			break;
592
593		default:
594			error = ip_ctloutput(so, sopt);
595			break;
596		}
597		break;
598	}
599
600	return (error);
601}
602
603/*
604 * This function exists solely to receive the PRC_IFDOWN messages which are
605 * sent by if_down().  It looks for an ifaddr whose ifa_addr is sa, and calls
606 * in_ifadown() to remove all routes corresponding to that address.  It also
607 * receives the PRC_IFUP messages from if_up() and reinstalls the interface
608 * routes.
609 */
610void
611rip_ctlinput(int cmd, struct sockaddr *sa, void *vip)
612{
613	INIT_VNET_INET(curvnet);
614	struct in_ifaddr *ia;
615	struct ifnet *ifp;
616	int err;
617	int flags;
618
619	switch (cmd) {
620	case PRC_IFDOWN:
621		TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
622			if (ia->ia_ifa.ifa_addr == sa
623			    && (ia->ia_flags & IFA_ROUTE)) {
624				/*
625				 * in_ifscrub kills the interface route.
626				 */
627				in_ifscrub(ia->ia_ifp, ia);
628				/*
629				 * in_ifadown gets rid of all the rest of the
630				 * routes.  This is not quite the right thing
631				 * to do, but at least if we are running a
632				 * routing process they will come back.
633				 */
634				in_ifadown(&ia->ia_ifa, 0);
635				break;
636			}
637		}
638		break;
639
640	case PRC_IFUP:
641		TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
642			if (ia->ia_ifa.ifa_addr == sa)
643				break;
644		}
645		if (ia == 0 || (ia->ia_flags & IFA_ROUTE))
646			return;
647		flags = RTF_UP;
648		ifp = ia->ia_ifa.ifa_ifp;
649
650		if ((ifp->if_flags & IFF_LOOPBACK)
651		    || (ifp->if_flags & IFF_POINTOPOINT))
652			flags |= RTF_HOST;
653
654		err = rtinit(&ia->ia_ifa, RTM_ADD, flags);
655		if (err == 0)
656			ia->ia_flags |= IFA_ROUTE;
657		break;
658	}
659}
660
661u_long	rip_sendspace = 9216;
662u_long	rip_recvspace = 9216;
663
664SYSCTL_ULONG(_net_inet_raw, OID_AUTO, maxdgram, CTLFLAG_RW,
665    &rip_sendspace, 0, "Maximum outgoing raw IP datagram size");
666SYSCTL_ULONG(_net_inet_raw, OID_AUTO, recvspace, CTLFLAG_RW,
667    &rip_recvspace, 0, "Maximum space for incoming raw IP datagrams");
668
669static int
670rip_attach(struct socket *so, int proto, struct thread *td)
671{
672	INIT_VNET_INET(so->so_vnet);
673	struct inpcb *inp;
674	int error;
675
676	inp = sotoinpcb(so);
677	KASSERT(inp == NULL, ("rip_attach: inp != NULL"));
678
679	error = priv_check(td, PRIV_NETINET_RAW);
680	if (error)
681		return (error);
682	if (proto >= IPPROTO_MAX || proto < 0)
683		return EPROTONOSUPPORT;
684	error = soreserve(so, rip_sendspace, rip_recvspace);
685	if (error)
686		return (error);
687	INP_INFO_WLOCK(&V_ripcbinfo);
688	error = in_pcballoc(so, &V_ripcbinfo);
689	if (error) {
690		INP_INFO_WUNLOCK(&V_ripcbinfo);
691		return (error);
692	}
693	inp = (struct inpcb *)so->so_pcb;
694	inp->inp_vflag |= INP_IPV4;
695	inp->inp_ip_p = proto;
696	inp->inp_ip_ttl = V_ip_defttl;
697	rip_inshash(inp);
698	INP_INFO_WUNLOCK(&V_ripcbinfo);
699	INP_WUNLOCK(inp);
700	return (0);
701}
702
703static void
704rip_detach(struct socket *so)
705{
706	INIT_VNET_INET(so->so_vnet);
707	struct inpcb *inp;
708
709	inp = sotoinpcb(so);
710	KASSERT(inp != NULL, ("rip_detach: inp == NULL"));
711	KASSERT(inp->inp_faddr.s_addr == INADDR_ANY,
712	    ("rip_detach: not closed"));
713
714	INP_INFO_WLOCK(&V_ripcbinfo);
715	INP_WLOCK(inp);
716	rip_delhash(inp);
717	if (so == V_ip_mrouter && ip_mrouter_done)
718		ip_mrouter_done();
719	if (ip_rsvp_force_done)
720		ip_rsvp_force_done(so);
721	if (so == V_ip_rsvpd)
722		ip_rsvp_done();
723	in_pcbdetach(inp);
724	in_pcbfree(inp);
725	INP_INFO_WUNLOCK(&V_ripcbinfo);
726}
727
728static void
729rip_dodisconnect(struct socket *so, struct inpcb *inp)
730{
731
732	INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo);
733	INP_WLOCK_ASSERT(inp);
734
735	rip_delhash(inp);
736	inp->inp_faddr.s_addr = INADDR_ANY;
737	rip_inshash(inp);
738	SOCK_LOCK(so);
739	so->so_state &= ~SS_ISCONNECTED;
740	SOCK_UNLOCK(so);
741}
742
743static void
744rip_abort(struct socket *so)
745{
746	INIT_VNET_INET(so->so_vnet);
747	struct inpcb *inp;
748
749	inp = sotoinpcb(so);
750	KASSERT(inp != NULL, ("rip_abort: inp == NULL"));
751
752	INP_INFO_WLOCK(&V_ripcbinfo);
753	INP_WLOCK(inp);
754	rip_dodisconnect(so, inp);
755	INP_WUNLOCK(inp);
756	INP_INFO_WUNLOCK(&V_ripcbinfo);
757}
758
759static void
760rip_close(struct socket *so)
761{
762	INIT_VNET_INET(so->so_vnet);
763	struct inpcb *inp;
764
765	inp = sotoinpcb(so);
766	KASSERT(inp != NULL, ("rip_close: inp == NULL"));
767
768	INP_INFO_WLOCK(&V_ripcbinfo);
769	INP_WLOCK(inp);
770	rip_dodisconnect(so, inp);
771	INP_WUNLOCK(inp);
772	INP_INFO_WUNLOCK(&V_ripcbinfo);
773}
774
775static int
776rip_disconnect(struct socket *so)
777{
778	INIT_VNET_INET(so->so_vnet);
779	struct inpcb *inp;
780
781	if ((so->so_state & SS_ISCONNECTED) == 0)
782		return (ENOTCONN);
783
784	inp = sotoinpcb(so);
785	KASSERT(inp != NULL, ("rip_disconnect: inp == NULL"));
786
787	INP_INFO_WLOCK(&V_ripcbinfo);
788	INP_WLOCK(inp);
789	rip_dodisconnect(so, inp);
790	INP_WUNLOCK(inp);
791	INP_INFO_WUNLOCK(&V_ripcbinfo);
792	return (0);
793}
794
795static int
796rip_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
797{
798	INIT_VNET_NET(so->so_vnet);
799	INIT_VNET_INET(so->so_vnet);
800	struct sockaddr_in *addr = (struct sockaddr_in *)nam;
801	struct inpcb *inp;
802	int error;
803
804	if (nam->sa_len != sizeof(*addr))
805		return (EINVAL);
806
807	error = prison_check_ip4(td->td_ucred, &addr->sin_addr);
808	if (error != 0)
809		return (error);
810
811	if (TAILQ_EMPTY(&V_ifnet) ||
812	    (addr->sin_family != AF_INET && addr->sin_family != AF_IMPLINK) ||
813	    (addr->sin_addr.s_addr &&
814	     ifa_ifwithaddr((struct sockaddr *)addr) == 0))
815		return (EADDRNOTAVAIL);
816
817	inp = sotoinpcb(so);
818	KASSERT(inp != NULL, ("rip_bind: inp == NULL"));
819
820	INP_INFO_WLOCK(&V_ripcbinfo);
821	INP_WLOCK(inp);
822	rip_delhash(inp);
823	inp->inp_laddr = addr->sin_addr;
824	rip_inshash(inp);
825	INP_WUNLOCK(inp);
826	INP_INFO_WUNLOCK(&V_ripcbinfo);
827	return (0);
828}
829
830static int
831rip_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
832{
833	INIT_VNET_NET(so->so_vnet);
834	INIT_VNET_INET(so->so_vnet);
835	struct sockaddr_in *addr = (struct sockaddr_in *)nam;
836	struct inpcb *inp;
837
838	if (nam->sa_len != sizeof(*addr))
839		return (EINVAL);
840	if (TAILQ_EMPTY(&V_ifnet))
841		return (EADDRNOTAVAIL);
842	if (addr->sin_family != AF_INET && addr->sin_family != AF_IMPLINK)
843		return (EAFNOSUPPORT);
844
845	inp = sotoinpcb(so);
846	KASSERT(inp != NULL, ("rip_connect: inp == NULL"));
847
848	INP_INFO_WLOCK(&V_ripcbinfo);
849	INP_WLOCK(inp);
850	rip_delhash(inp);
851	inp->inp_faddr = addr->sin_addr;
852	rip_inshash(inp);
853	soisconnected(so);
854	INP_WUNLOCK(inp);
855	INP_INFO_WUNLOCK(&V_ripcbinfo);
856	return (0);
857}
858
859static int
860rip_shutdown(struct socket *so)
861{
862	struct inpcb *inp;
863
864	inp = sotoinpcb(so);
865	KASSERT(inp != NULL, ("rip_shutdown: inp == NULL"));
866
867	INP_WLOCK(inp);
868	socantsendmore(so);
869	INP_WUNLOCK(inp);
870	return (0);
871}
872
873static int
874rip_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
875    struct mbuf *control, struct thread *td)
876{
877	struct inpcb *inp;
878	u_long dst;
879
880	inp = sotoinpcb(so);
881	KASSERT(inp != NULL, ("rip_send: inp == NULL"));
882
883	/*
884	 * Note: 'dst' reads below are unlocked.
885	 */
886	if (so->so_state & SS_ISCONNECTED) {
887		if (nam) {
888			m_freem(m);
889			return (EISCONN);
890		}
891		dst = inp->inp_faddr.s_addr;	/* Unlocked read. */
892	} else {
893		if (nam == NULL) {
894			m_freem(m);
895			return (ENOTCONN);
896		}
897		dst = ((struct sockaddr_in *)nam)->sin_addr.s_addr;
898	}
899	return (rip_output(m, so, dst));
900}
901
902static int
903rip_pcblist(SYSCTL_HANDLER_ARGS)
904{
905	INIT_VNET_INET(curvnet);
906	int error, i, n;
907	struct inpcb *inp, **inp_list;
908	inp_gen_t gencnt;
909	struct xinpgen xig;
910
911	/*
912	 * The process of preparing the TCB list is too time-consuming and
913	 * resource-intensive to repeat twice on every request.
914	 */
915	if (req->oldptr == 0) {
916		n = V_ripcbinfo.ipi_count;
917		req->oldidx = 2 * (sizeof xig)
918		    + (n + n/8) * sizeof(struct xinpcb);
919		return (0);
920	}
921
922	if (req->newptr != 0)
923		return (EPERM);
924
925	/*
926	 * OK, now we're committed to doing something.
927	 */
928	INP_INFO_RLOCK(&V_ripcbinfo);
929	gencnt = V_ripcbinfo.ipi_gencnt;
930	n = V_ripcbinfo.ipi_count;
931	INP_INFO_RUNLOCK(&V_ripcbinfo);
932
933	xig.xig_len = sizeof xig;
934	xig.xig_count = n;
935	xig.xig_gen = gencnt;
936	xig.xig_sogen = so_gencnt;
937	error = SYSCTL_OUT(req, &xig, sizeof xig);
938	if (error)
939		return (error);
940
941	inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK);
942	if (inp_list == 0)
943		return (ENOMEM);
944
945	INP_INFO_RLOCK(&V_ripcbinfo);
946	for (inp = LIST_FIRST(V_ripcbinfo.ipi_listhead), i = 0; inp && i < n;
947	     inp = LIST_NEXT(inp, inp_list)) {
948		INP_RLOCK(inp);
949		if (inp->inp_gencnt <= gencnt &&
950		    cr_canseeinpcb(req->td->td_ucred, inp) == 0) {
951			/* XXX held references? */
952			inp_list[i++] = inp;
953		}
954		INP_RUNLOCK(inp);
955	}
956	INP_INFO_RUNLOCK(&V_ripcbinfo);
957	n = i;
958
959	error = 0;
960	for (i = 0; i < n; i++) {
961		inp = inp_list[i];
962		INP_RLOCK(inp);
963		if (inp->inp_gencnt <= gencnt) {
964			struct xinpcb xi;
965
966			bzero(&xi, sizeof(xi));
967			xi.xi_len = sizeof xi;
968			/* XXX should avoid extra copy */
969			bcopy(inp, &xi.xi_inp, sizeof *inp);
970			if (inp->inp_socket)
971				sotoxsocket(inp->inp_socket, &xi.xi_socket);
972			INP_RUNLOCK(inp);
973			error = SYSCTL_OUT(req, &xi, sizeof xi);
974		} else
975			INP_RUNLOCK(inp);
976	}
977	if (!error) {
978		/*
979		 * Give the user an updated idea of our state.  If the
980		 * generation differs from what we told her before, she knows
981		 * that something happened while we were processing this
982		 * request, and it might be necessary to retry.
983		 */
984		INP_INFO_RLOCK(&V_ripcbinfo);
985		xig.xig_gen = V_ripcbinfo.ipi_gencnt;
986		xig.xig_sogen = so_gencnt;
987		xig.xig_count = V_ripcbinfo.ipi_count;
988		INP_INFO_RUNLOCK(&V_ripcbinfo);
989		error = SYSCTL_OUT(req, &xig, sizeof xig);
990	}
991	free(inp_list, M_TEMP);
992	return (error);
993}
994
995SYSCTL_PROC(_net_inet_raw, OID_AUTO/*XXX*/, pcblist, CTLFLAG_RD, 0, 0,
996    rip_pcblist, "S,xinpcb", "List of active raw IP sockets");
997
998struct pr_usrreqs rip_usrreqs = {
999	.pru_abort =		rip_abort,
1000	.pru_attach =		rip_attach,
1001	.pru_bind =		rip_bind,
1002	.pru_connect =		rip_connect,
1003	.pru_control =		in_control,
1004	.pru_detach =		rip_detach,
1005	.pru_disconnect =	rip_disconnect,
1006	.pru_peeraddr =		in_getpeeraddr,
1007	.pru_send =		rip_send,
1008	.pru_shutdown =		rip_shutdown,
1009	.pru_sockaddr =		in_getsockaddr,
1010	.pru_sosetlabel =	in_pcbsosetlabel,
1011	.pru_close =		rip_close,
1012};
1013