raw_ip.c revision 180828
1/*-
2 * Copyright (c) 1982, 1986, 1988, 1993
3 *	The Regents of the University of California.
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 *    notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 * 4. Neither the name of the University nor the names of its contributors
15 *    may be used to endorse or promote products derived from this software
16 *    without specific prior written permission.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28 * SUCH DAMAGE.
29 *
30 *	@(#)raw_ip.c	8.7 (Berkeley) 5/15/95
31 */
32
33#include <sys/cdefs.h>
34__FBSDID("$FreeBSD: head/sys/netinet/raw_ip.c 180828 2008-07-26 17:32:15Z mav $");
35
36#include "opt_inet6.h"
37#include "opt_ipsec.h"
38#include "opt_mac.h"
39
40#include <sys/param.h>
41#include <sys/jail.h>
42#include <sys/kernel.h>
43#include <sys/lock.h>
44#include <sys/malloc.h>
45#include <sys/mbuf.h>
46#include <sys/priv.h>
47#include <sys/proc.h>
48#include <sys/protosw.h>
49#include <sys/signalvar.h>
50#include <sys/socket.h>
51#include <sys/socketvar.h>
52#include <sys/sx.h>
53#include <sys/sysctl.h>
54#include <sys/systm.h>
55
56#include <vm/uma.h>
57
58#include <net/if.h>
59#include <net/route.h>
60
61#include <netinet/in.h>
62#include <netinet/in_systm.h>
63#include <netinet/in_pcb.h>
64#include <netinet/in_var.h>
65#include <netinet/ip.h>
66#include <netinet/ip_var.h>
67#include <netinet/ip_mroute.h>
68
69#include <netinet/ip_fw.h>
70#include <netinet/ip_dummynet.h>
71
72#ifdef IPSEC
73#include <netipsec/ipsec.h>
74#endif /*IPSEC*/
75
76#include <security/mac/mac_framework.h>
77
78struct	inpcbhead ripcb;
79struct	inpcbinfo ripcbinfo;
80
81/* control hooks for ipfw and dummynet */
82ip_fw_ctl_t *ip_fw_ctl_ptr = NULL;
83ip_dn_ctl_t *ip_dn_ctl_ptr = NULL;
84
85/*
86 * Hooks for multicast routing. They all default to NULL, so leave them not
87 * initialized and rely on BSS being set to 0.
88 */
89
90/*
91 * The socket used to communicate with the multicast routing daemon.
92 */
93struct socket  *ip_mrouter;
94
95/*
96 * The various mrouter and rsvp functions.
97 */
98int (*ip_mrouter_set)(struct socket *, struct sockopt *);
99int (*ip_mrouter_get)(struct socket *, struct sockopt *);
100int (*ip_mrouter_done)(void);
101int (*ip_mforward)(struct ip *, struct ifnet *, struct mbuf *,
102		   struct ip_moptions *);
103int (*mrt_ioctl)(int, caddr_t, int);
104int (*legal_vif_num)(int);
105u_long (*ip_mcast_src)(int);
106
107void (*rsvp_input_p)(struct mbuf *m, int off);
108int (*ip_rsvp_vif)(struct socket *, struct sockopt *);
109void (*ip_rsvp_force_done)(struct socket *);
110
/*
 * Raw PCB hashing.
 *
 * INP_PCBHASH_RAW_SIZE is the table size handed to hashinit() by rip_init().
 * INP_PCBHASH_RAW() sums the protocol number and both addresses, reduces the
 * sum modulo 'mask' and adds one, so its result lies in 1..mask and is never
 * 0; rip_inshash() uses bucket 0 for PCBs that lack a protocol or address.
 */
#define	INP_PCBHASH_RAW_SIZE	256
#define	INP_PCBHASH_RAW(proto, laddr, faddr, mask)			\
	(1 + (((proto) + (laddr) + (faddr)) % (mask)))
118
119static void
120rip_inshash(struct inpcb *inp)
121{
122	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
123	struct inpcbhead *pcbhash;
124	int hash;
125
126	INP_INFO_WLOCK_ASSERT(pcbinfo);
127	INP_WLOCK_ASSERT(inp);
128
129	if (inp->inp_ip_p && inp->inp_laddr.s_addr && inp->inp_faddr.s_addr) {
130		hash = INP_PCBHASH_RAW(inp->inp_ip_p, inp->inp_laddr.s_addr,
131		    inp->inp_faddr.s_addr, pcbinfo->ipi_hashmask);
132	} else {
133		hash = 0;
134	}
135	pcbhash = &pcbinfo->ipi_hashbase[hash];
136	LIST_INSERT_HEAD(pcbhash, inp, inp_hash);
137}
138
139static void
140rip_delhash(struct inpcb *inp)
141{
142	INP_WLOCK_ASSERT(inp);
143	LIST_REMOVE(inp, inp_hash);
144}
145
146/*
147 * Raw interface to IP protocol.
148 */
149
/*
 * Allocation zone handling and initialization of the raw connection block
 * (PCB) list and hash tables.
 */
153static void
154rip_zone_change(void *tag)
155{
156
157	uma_zone_set_max(ripcbinfo.ipi_zone, maxsockets);
158}
159
/*
 * Zone init routine passed to uma_zcreate() by rip_init(): initialize the
 * lock embedded in a freshly allocated inpcb.  'size' and 'flags' are not
 * consulted.  Always returns 0.
 */
static int
rip_inpcb_init(void *mem, int size, int flags)
{
	struct inpcb *pcb;

	pcb = (struct inpcb *)mem;
	INP_LOCK_INIT(pcb, "inp", "rawinp");

	return (0);
}
168
169void
170rip_init(void)
171{
172
173	INP_INFO_LOCK_INIT(&ripcbinfo, "rip");
174	LIST_INIT(&ripcb);
175	ripcbinfo.ipi_listhead = &ripcb;
176	ripcbinfo.ipi_hashbase = hashinit(INP_PCBHASH_RAW_SIZE, M_PCB,
177	    &ripcbinfo.ipi_hashmask);
178	ripcbinfo.ipi_porthashbase = hashinit(1, M_PCB,
179	    &ripcbinfo.ipi_porthashmask);
180	ripcbinfo.ipi_zone = uma_zcreate("ripcb", sizeof(struct inpcb),
181	    NULL, NULL, rip_inpcb_init, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
182	uma_zone_set_max(ripcbinfo.ipi_zone, maxsockets);
183	EVENTHANDLER_REGISTER(maxsockets_change, rip_zone_change, NULL,
184	    EVENTHANDLER_PRI_ANY);
185}
186
187static int
188rip_append(struct inpcb *last, struct ip *ip, struct mbuf *n,
189    struct sockaddr_in *ripsrc)
190{
191	int policyfail = 0;
192
193	INP_RLOCK_ASSERT(last);
194
195#ifdef IPSEC
196	/* check AH/ESP integrity. */
197	if (ipsec4_in_reject(n, last)) {
198		policyfail = 1;
199	}
200#endif /* IPSEC */
201#ifdef MAC
202	if (!policyfail && mac_inpcb_check_deliver(last, n) != 0)
203		policyfail = 1;
204#endif
205	/* Check the minimum TTL for socket. */
206	if (last->inp_ip_minttl && last->inp_ip_minttl > ip->ip_ttl)
207		policyfail = 1;
208	if (!policyfail) {
209		struct mbuf *opts = NULL;
210		struct socket *so;
211
212		so = last->inp_socket;
213		if ((last->inp_flags & INP_CONTROLOPTS) ||
214		    (so->so_options & (SO_TIMESTAMP | SO_BINTIME)))
215			ip_savecontrol(last, &opts, ip, n);
216		SOCKBUF_LOCK(&so->so_rcv);
217		if (sbappendaddr_locked(&so->so_rcv,
218		    (struct sockaddr *)ripsrc, n, opts) == 0) {
219			/* should notify about lost packet */
220			m_freem(n);
221			if (opts)
222				m_freem(opts);
223			SOCKBUF_UNLOCK(&so->so_rcv);
224		} else
225			sorwakeup_locked(so);
226	} else
227		m_freem(n);
228	return (policyfail);
229}
230
231/*
232 * Setup generic address and protocol structures for raw_input routine, then
233 * pass them along with mbuf chain.
234 */
235void
236rip_input(struct mbuf *m, int off)
237{
238	struct ip *ip = mtod(m, struct ip *);
239	int proto = ip->ip_p;
240	struct inpcb *inp, *last;
241	struct sockaddr_in ripsrc;
242	int hash;
243
244	bzero(&ripsrc, sizeof(ripsrc));
245	ripsrc.sin_len = sizeof(ripsrc);
246	ripsrc.sin_family = AF_INET;
247	ripsrc.sin_addr = ip->ip_src;
248	last = NULL;
249	INP_INFO_RLOCK(&ripcbinfo);
250	hash = INP_PCBHASH_RAW(proto, ip->ip_src.s_addr,
251	    ip->ip_dst.s_addr, ripcbinfo.ipi_hashmask);
252	LIST_FOREACH(inp, &ripcbinfo.ipi_hashbase[hash], inp_hash) {
253		INP_RLOCK(inp);
254		if (inp->inp_ip_p != proto) {
255	docontinue1:
256			INP_RUNLOCK(inp);
257			continue;
258		}
259#ifdef INET6
260		if ((inp->inp_vflag & INP_IPV4) == 0)
261			goto docontinue1;
262#endif
263		if (inp->inp_laddr.s_addr != ip->ip_dst.s_addr)
264			goto docontinue1;
265		if (inp->inp_faddr.s_addr != ip->ip_src.s_addr)
266			goto docontinue1;
267		if (jailed(inp->inp_socket->so_cred) &&
268		    (htonl(prison_getip(inp->inp_socket->so_cred)) !=
269			ip->ip_dst.s_addr))
270				goto docontinue1;
271		if (last) {
272			struct mbuf *n;
273
274			n = m_copy(m, 0, (int)M_COPYALL);
275			if (n != NULL)
276		    	    (void) rip_append(last, ip, n, &ripsrc);
277			/* XXX count dropped packet */
278			INP_RUNLOCK(last);
279		}
280		last = inp;
281	}
282	LIST_FOREACH(inp, &ripcbinfo.ipi_hashbase[0], inp_hash) {
283		INP_RLOCK(inp);
284		if (inp->inp_ip_p && inp->inp_ip_p != proto) {
285	docontinue:
286			INP_RUNLOCK(inp);
287			continue;
288		}
289#ifdef INET6
290		if ((inp->inp_vflag & INP_IPV4) == 0)
291			goto docontinue;
292#endif
293		if (inp->inp_laddr.s_addr &&
294		    inp->inp_laddr.s_addr != ip->ip_dst.s_addr)
295			goto docontinue;
296		if (inp->inp_faddr.s_addr &&
297		    inp->inp_faddr.s_addr != ip->ip_src.s_addr)
298			goto docontinue;
299		if (jailed(inp->inp_socket->so_cred))
300			if (htonl(prison_getip(inp->inp_socket->so_cred)) !=
301			    ip->ip_dst.s_addr)
302				goto docontinue;
303		if (last) {
304			struct mbuf *n;
305
306			n = m_copy(m, 0, (int)M_COPYALL);
307			if (n != NULL)
308				(void) rip_append(last, ip, n, &ripsrc);
309			/* XXX count dropped packet */
310			INP_RUNLOCK(last);
311		}
312		last = inp;
313	}
314	if (last != NULL) {
315		if (rip_append(last, ip, m, &ripsrc) != 0)
316			ipstat.ips_delivered--;
317		INP_RUNLOCK(last);
318	} else {
319		m_freem(m);
320		ipstat.ips_noproto++;
321		ipstat.ips_delivered--;
322	}
323	INP_INFO_RUNLOCK(&ripcbinfo);
324}
325
326/*
327 * Generate IP header and pass packet to ip_output.  Tack on options user may
328 * have setup with control call.
329 */
330int
331rip_output(struct mbuf *m, struct socket *so, u_long dst)
332{
333	struct ip *ip;
334	int error;
335	struct inpcb *inp = sotoinpcb(so);
336	int flags = ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0) |
337	    IP_ALLOWBROADCAST;
338
339	/*
340	 * If the user handed us a complete IP packet, use it.  Otherwise,
341	 * allocate an mbuf for a header and fill it in.
342	 */
343	if ((inp->inp_flags & INP_HDRINCL) == 0) {
344		if (m->m_pkthdr.len + sizeof(struct ip) > IP_MAXPACKET) {
345			m_freem(m);
346			return(EMSGSIZE);
347		}
348		M_PREPEND(m, sizeof(struct ip), M_DONTWAIT);
349		if (m == NULL)
350			return(ENOBUFS);
351
352		INP_RLOCK(inp);
353		ip = mtod(m, struct ip *);
354		ip->ip_tos = inp->inp_ip_tos;
355		if (inp->inp_flags & INP_DONTFRAG)
356			ip->ip_off = IP_DF;
357		else
358			ip->ip_off = 0;
359		ip->ip_p = inp->inp_ip_p;
360		ip->ip_len = m->m_pkthdr.len;
361		if (jailed(inp->inp_socket->so_cred))
362			ip->ip_src.s_addr =
363			    htonl(prison_getip(inp->inp_socket->so_cred));
364		else
365			ip->ip_src = inp->inp_laddr;
366		ip->ip_dst.s_addr = dst;
367		ip->ip_ttl = inp->inp_ip_ttl;
368	} else {
369		if (m->m_pkthdr.len > IP_MAXPACKET) {
370			m_freem(m);
371			return(EMSGSIZE);
372		}
373		INP_RLOCK(inp);
374		ip = mtod(m, struct ip *);
375		if (jailed(inp->inp_socket->so_cred)) {
376			if (ip->ip_src.s_addr !=
377			    htonl(prison_getip(inp->inp_socket->so_cred))) {
378				INP_RUNLOCK(inp);
379				m_freem(m);
380				return (EPERM);
381			}
382		}
383
384		/*
385		 * Don't allow both user specified and setsockopt options,
386		 * and don't allow packet length sizes that will crash.
387		 */
388		if (((ip->ip_hl != (sizeof (*ip) >> 2)) && inp->inp_options)
389		    || (ip->ip_len > m->m_pkthdr.len)
390		    || (ip->ip_len < (ip->ip_hl << 2))) {
391			INP_RUNLOCK(inp);
392			m_freem(m);
393			return (EINVAL);
394		}
395		if (ip->ip_id == 0)
396			ip->ip_id = ip_newid();
397
398		/*
399		 * XXX prevent ip_output from overwriting header fields.
400		 */
401		flags |= IP_RAWOUTPUT;
402		ipstat.ips_rawout++;
403	}
404
405	if (inp->inp_flags & INP_ONESBCAST)
406		flags |= IP_SENDONES;
407
408#ifdef MAC
409	mac_inpcb_create_mbuf(inp, m);
410#endif
411
412	error = ip_output(m, inp->inp_options, NULL, flags,
413	    inp->inp_moptions, inp);
414	INP_RUNLOCK(inp);
415	return (error);
416}
417
418/*
419 * Raw IP socket option processing.
420 *
421 * IMPORTANT NOTE regarding access control: Traditionally, raw sockets could
422 * only be created by a privileged process, and as such, socket option
423 * operations to manage system properties on any raw socket were allowed to
424 * take place without explicit additional access control checks.  However,
425 * raw sockets can now also be created in jail(), and therefore explicit
426 * checks are now required.  Likewise, raw sockets can be used by a process
427 * after it gives up privilege, so some caution is required.  For options
428 * passed down to the IP layer via ip_ctloutput(), checks are assumed to be
429 * performed in ip_ctloutput() and therefore no check occurs here.
430 * Unilaterally checking priv_check() here breaks normal IP socket option
431 * operations on raw sockets.
432 *
433 * When adding new socket options here, make sure to add access control
434 * checks here as necessary.
435 */
436int
437rip_ctloutput(struct socket *so, struct sockopt *sopt)
438{
439	struct	inpcb *inp = sotoinpcb(so);
440	int	error, optval;
441
442	if (sopt->sopt_level != IPPROTO_IP)
443		return (EINVAL);
444
445	error = 0;
446	switch (sopt->sopt_dir) {
447	case SOPT_GET:
448		switch (sopt->sopt_name) {
449		case IP_HDRINCL:
450			optval = inp->inp_flags & INP_HDRINCL;
451			error = sooptcopyout(sopt, &optval, sizeof optval);
452			break;
453
454		case IP_FW_ADD:	/* ADD actually returns the body... */
455		case IP_FW_GET:
456		case IP_FW_TABLE_GETSIZE:
457		case IP_FW_TABLE_LIST:
458		case IP_FW_NAT_GET_CONFIG:
459		case IP_FW_NAT_GET_LOG:
460			if (ip_fw_ctl_ptr != NULL)
461				error = ip_fw_ctl_ptr(sopt);
462			else
463				error = ENOPROTOOPT;
464			break;
465
466		case IP_DUMMYNET_GET:
467			if (ip_dn_ctl_ptr != NULL)
468				error = ip_dn_ctl_ptr(sopt);
469			else
470				error = ENOPROTOOPT;
471			break ;
472
473		case MRT_INIT:
474		case MRT_DONE:
475		case MRT_ADD_VIF:
476		case MRT_DEL_VIF:
477		case MRT_ADD_MFC:
478		case MRT_DEL_MFC:
479		case MRT_VERSION:
480		case MRT_ASSERT:
481		case MRT_API_SUPPORT:
482		case MRT_API_CONFIG:
483		case MRT_ADD_BW_UPCALL:
484		case MRT_DEL_BW_UPCALL:
485			error = priv_check(curthread, PRIV_NETINET_MROUTE);
486			if (error != 0)
487				return (error);
488			error = ip_mrouter_get ? ip_mrouter_get(so, sopt) :
489				EOPNOTSUPP;
490			break;
491
492		default:
493			error = ip_ctloutput(so, sopt);
494			break;
495		}
496		break;
497
498	case SOPT_SET:
499		switch (sopt->sopt_name) {
500		case IP_HDRINCL:
501			error = sooptcopyin(sopt, &optval, sizeof optval,
502					    sizeof optval);
503			if (error)
504				break;
505			if (optval)
506				inp->inp_flags |= INP_HDRINCL;
507			else
508				inp->inp_flags &= ~INP_HDRINCL;
509			break;
510
511		case IP_FW_ADD:
512		case IP_FW_DEL:
513		case IP_FW_FLUSH:
514		case IP_FW_ZERO:
515		case IP_FW_RESETLOG:
516		case IP_FW_TABLE_ADD:
517		case IP_FW_TABLE_DEL:
518		case IP_FW_TABLE_FLUSH:
519		case IP_FW_NAT_CFG:
520		case IP_FW_NAT_DEL:
521			if (ip_fw_ctl_ptr != NULL)
522				error = ip_fw_ctl_ptr(sopt);
523			else
524				error = ENOPROTOOPT;
525			break;
526
527		case IP_DUMMYNET_CONFIGURE:
528		case IP_DUMMYNET_DEL:
529		case IP_DUMMYNET_FLUSH:
530			if (ip_dn_ctl_ptr != NULL)
531				error = ip_dn_ctl_ptr(sopt);
532			else
533				error = ENOPROTOOPT ;
534			break ;
535
536		case IP_RSVP_ON:
537			error = priv_check(curthread, PRIV_NETINET_MROUTE);
538			if (error != 0)
539				return (error);
540			error = ip_rsvp_init(so);
541			break;
542
543		case IP_RSVP_OFF:
544			error = priv_check(curthread, PRIV_NETINET_MROUTE);
545			if (error != 0)
546				return (error);
547			error = ip_rsvp_done();
548			break;
549
550		case IP_RSVP_VIF_ON:
551		case IP_RSVP_VIF_OFF:
552			error = priv_check(curthread, PRIV_NETINET_MROUTE);
553			if (error != 0)
554				return (error);
555			error = ip_rsvp_vif ?
556				ip_rsvp_vif(so, sopt) : EINVAL;
557			break;
558
559		case MRT_INIT:
560		case MRT_DONE:
561		case MRT_ADD_VIF:
562		case MRT_DEL_VIF:
563		case MRT_ADD_MFC:
564		case MRT_DEL_MFC:
565		case MRT_VERSION:
566		case MRT_ASSERT:
567		case MRT_API_SUPPORT:
568		case MRT_API_CONFIG:
569		case MRT_ADD_BW_UPCALL:
570		case MRT_DEL_BW_UPCALL:
571			error = priv_check(curthread, PRIV_NETINET_MROUTE);
572			if (error != 0)
573				return (error);
574			error = ip_mrouter_set ? ip_mrouter_set(so, sopt) :
575					EOPNOTSUPP;
576			break;
577
578		default:
579			error = ip_ctloutput(so, sopt);
580			break;
581		}
582		break;
583	}
584
585	return (error);
586}
587
588/*
589 * This function exists solely to receive the PRC_IFDOWN messages which are
590 * sent by if_down().  It looks for an ifaddr whose ifa_addr is sa, and calls
591 * in_ifadown() to remove all routes corresponding to that address.  It also
592 * receives the PRC_IFUP messages from if_up() and reinstalls the interface
593 * routes.
594 */
595void
596rip_ctlinput(int cmd, struct sockaddr *sa, void *vip)
597{
598	struct in_ifaddr *ia;
599	struct ifnet *ifp;
600	int err;
601	int flags;
602
603	switch (cmd) {
604	case PRC_IFDOWN:
605		TAILQ_FOREACH(ia, &in_ifaddrhead, ia_link) {
606			if (ia->ia_ifa.ifa_addr == sa
607			    && (ia->ia_flags & IFA_ROUTE)) {
608				/*
609				 * in_ifscrub kills the interface route.
610				 */
611				in_ifscrub(ia->ia_ifp, ia);
612				/*
613				 * in_ifadown gets rid of all the rest of the
614				 * routes.  This is not quite the right thing
615				 * to do, but at least if we are running a
616				 * routing process they will come back.
617				 */
618				in_ifadown(&ia->ia_ifa, 0);
619				break;
620			}
621		}
622		break;
623
624	case PRC_IFUP:
625		TAILQ_FOREACH(ia, &in_ifaddrhead, ia_link) {
626			if (ia->ia_ifa.ifa_addr == sa)
627				break;
628		}
629		if (ia == 0 || (ia->ia_flags & IFA_ROUTE))
630			return;
631		flags = RTF_UP;
632		ifp = ia->ia_ifa.ifa_ifp;
633
634		if ((ifp->if_flags & IFF_LOOPBACK)
635		    || (ifp->if_flags & IFF_POINTOPOINT))
636			flags |= RTF_HOST;
637
638		err = rtinit(&ia->ia_ifa, RTM_ADD, flags);
639		if (err == 0)
640			ia->ia_flags |= IFA_ROUTE;
641		break;
642	}
643}
644
645u_long	rip_sendspace = 9216;
646u_long	rip_recvspace = 9216;
647
648SYSCTL_ULONG(_net_inet_raw, OID_AUTO, maxdgram, CTLFLAG_RW,
649    &rip_sendspace, 0, "Maximum outgoing raw IP datagram size");
650SYSCTL_ULONG(_net_inet_raw, OID_AUTO, recvspace, CTLFLAG_RW,
651    &rip_recvspace, 0, "Maximum space for incoming raw IP datagrams");
652
653static int
654rip_attach(struct socket *so, int proto, struct thread *td)
655{
656	struct inpcb *inp;
657	int error;
658
659	inp = sotoinpcb(so);
660	KASSERT(inp == NULL, ("rip_attach: inp != NULL"));
661
662	error = priv_check(td, PRIV_NETINET_RAW);
663	if (error)
664		return (error);
665	if (proto >= IPPROTO_MAX || proto < 0)
666		return EPROTONOSUPPORT;
667	error = soreserve(so, rip_sendspace, rip_recvspace);
668	if (error)
669		return (error);
670	INP_INFO_WLOCK(&ripcbinfo);
671	error = in_pcballoc(so, &ripcbinfo);
672	if (error) {
673		INP_INFO_WUNLOCK(&ripcbinfo);
674		return (error);
675	}
676	inp = (struct inpcb *)so->so_pcb;
677	inp->inp_vflag |= INP_IPV4;
678	inp->inp_ip_p = proto;
679	inp->inp_ip_ttl = ip_defttl;
680	rip_inshash(inp);
681	INP_INFO_WUNLOCK(&ripcbinfo);
682	INP_WUNLOCK(inp);
683	return (0);
684}
685
686static void
687rip_detach(struct socket *so)
688{
689	struct inpcb *inp;
690
691	inp = sotoinpcb(so);
692	KASSERT(inp != NULL, ("rip_detach: inp == NULL"));
693	KASSERT(inp->inp_faddr.s_addr == INADDR_ANY,
694	    ("rip_detach: not closed"));
695
696	INP_INFO_WLOCK(&ripcbinfo);
697	INP_WLOCK(inp);
698	rip_delhash(inp);
699	if (so == ip_mrouter && ip_mrouter_done)
700		ip_mrouter_done();
701	if (ip_rsvp_force_done)
702		ip_rsvp_force_done(so);
703	if (so == ip_rsvpd)
704		ip_rsvp_done();
705	in_pcbdetach(inp);
706	in_pcbfree(inp);
707	INP_INFO_WUNLOCK(&ripcbinfo);
708}
709
710static void
711rip_dodisconnect(struct socket *so, struct inpcb *inp)
712{
713	INP_WLOCK_ASSERT(inp);
714
715	rip_delhash(inp);
716	inp->inp_faddr.s_addr = INADDR_ANY;
717	rip_inshash(inp);
718	SOCK_LOCK(so);
719	so->so_state &= ~SS_ISCONNECTED;
720	SOCK_UNLOCK(so);
721}
722
723static void
724rip_abort(struct socket *so)
725{
726	struct inpcb *inp;
727
728	inp = sotoinpcb(so);
729	KASSERT(inp != NULL, ("rip_abort: inp == NULL"));
730
731	INP_INFO_WLOCK(&ripcbinfo);
732	INP_WLOCK(inp);
733	rip_dodisconnect(so, inp);
734	INP_WUNLOCK(inp);
735	INP_INFO_WUNLOCK(&ripcbinfo);
736}
737
738static void
739rip_close(struct socket *so)
740{
741	struct inpcb *inp;
742
743	inp = sotoinpcb(so);
744	KASSERT(inp != NULL, ("rip_close: inp == NULL"));
745
746	INP_INFO_WLOCK(&ripcbinfo);
747	INP_WLOCK(inp);
748	rip_dodisconnect(so, inp);
749	INP_WUNLOCK(inp);
750	INP_INFO_WUNLOCK(&ripcbinfo);
751}
752
753static int
754rip_disconnect(struct socket *so)
755{
756	struct inpcb *inp;
757
758	if ((so->so_state & SS_ISCONNECTED) == 0)
759		return (ENOTCONN);
760
761	inp = sotoinpcb(so);
762	KASSERT(inp != NULL, ("rip_disconnect: inp == NULL"));
763
764	INP_INFO_WLOCK(&ripcbinfo);
765	INP_WLOCK(inp);
766	rip_dodisconnect(so, inp);
767	INP_WUNLOCK(inp);
768	INP_INFO_WUNLOCK(&ripcbinfo);
769	return (0);
770}
771
772static int
773rip_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
774{
775	struct sockaddr_in *addr = (struct sockaddr_in *)nam;
776	struct inpcb *inp;
777
778	if (nam->sa_len != sizeof(*addr))
779		return (EINVAL);
780
781	if (jailed(td->td_ucred)) {
782		if (addr->sin_addr.s_addr == INADDR_ANY)
783			addr->sin_addr.s_addr =
784			    htonl(prison_getip(td->td_ucred));
785		if (htonl(prison_getip(td->td_ucred)) != addr->sin_addr.s_addr)
786			return (EADDRNOTAVAIL);
787	}
788
789	if (TAILQ_EMPTY(&ifnet) ||
790	    (addr->sin_family != AF_INET && addr->sin_family != AF_IMPLINK) ||
791	    (addr->sin_addr.s_addr &&
792	     ifa_ifwithaddr((struct sockaddr *)addr) == 0))
793		return (EADDRNOTAVAIL);
794
795	inp = sotoinpcb(so);
796	KASSERT(inp != NULL, ("rip_bind: inp == NULL"));
797
798	INP_INFO_WLOCK(&ripcbinfo);
799	INP_WLOCK(inp);
800	rip_delhash(inp);
801	inp->inp_laddr = addr->sin_addr;
802	rip_inshash(inp);
803	INP_WUNLOCK(inp);
804	INP_INFO_WUNLOCK(&ripcbinfo);
805	return (0);
806}
807
808static int
809rip_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
810{
811	struct sockaddr_in *addr = (struct sockaddr_in *)nam;
812	struct inpcb *inp;
813
814	if (nam->sa_len != sizeof(*addr))
815		return (EINVAL);
816	if (TAILQ_EMPTY(&ifnet))
817		return (EADDRNOTAVAIL);
818	if (addr->sin_family != AF_INET && addr->sin_family != AF_IMPLINK)
819		return (EAFNOSUPPORT);
820
821	inp = sotoinpcb(so);
822	KASSERT(inp != NULL, ("rip_connect: inp == NULL"));
823
824	INP_INFO_WLOCK(&ripcbinfo);
825	INP_WLOCK(inp);
826	rip_delhash(inp);
827	inp->inp_faddr = addr->sin_addr;
828	rip_inshash(inp);
829	soisconnected(so);
830	INP_WUNLOCK(inp);
831	INP_INFO_WUNLOCK(&ripcbinfo);
832	return (0);
833}
834
835static int
836rip_shutdown(struct socket *so)
837{
838	struct inpcb *inp;
839
840	inp = sotoinpcb(so);
841	KASSERT(inp != NULL, ("rip_shutdown: inp == NULL"));
842
843	INP_WLOCK(inp);
844	socantsendmore(so);
845	INP_WUNLOCK(inp);
846	return (0);
847}
848
849static int
850rip_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
851    struct mbuf *control, struct thread *td)
852{
853	struct inpcb *inp;
854	u_long dst;
855
856	inp = sotoinpcb(so);
857	KASSERT(inp != NULL, ("rip_send: inp == NULL"));
858
859	/*
860	 * Note: 'dst' reads below are unlocked.
861	 */
862	if (so->so_state & SS_ISCONNECTED) {
863		if (nam) {
864			m_freem(m);
865			return (EISCONN);
866		}
867		dst = inp->inp_faddr.s_addr;	/* Unlocked read. */
868	} else {
869		if (nam == NULL) {
870			m_freem(m);
871			return (ENOTCONN);
872		}
873		dst = ((struct sockaddr_in *)nam)->sin_addr.s_addr;
874	}
875	return (rip_output(m, so, dst));
876}
877
878static int
879rip_pcblist(SYSCTL_HANDLER_ARGS)
880{
881	int error, i, n;
882	struct inpcb *inp, **inp_list;
883	inp_gen_t gencnt;
884	struct xinpgen xig;
885
886	/*
887	 * The process of preparing the TCB list is too time-consuming and
888	 * resource-intensive to repeat twice on every request.
889	 */
890	if (req->oldptr == 0) {
891		n = ripcbinfo.ipi_count;
892		req->oldidx = 2 * (sizeof xig)
893		    + (n + n/8) * sizeof(struct xinpcb);
894		return (0);
895	}
896
897	if (req->newptr != 0)
898		return (EPERM);
899
900	/*
901	 * OK, now we're committed to doing something.
902	 */
903	INP_INFO_RLOCK(&ripcbinfo);
904	gencnt = ripcbinfo.ipi_gencnt;
905	n = ripcbinfo.ipi_count;
906	INP_INFO_RUNLOCK(&ripcbinfo);
907
908	xig.xig_len = sizeof xig;
909	xig.xig_count = n;
910	xig.xig_gen = gencnt;
911	xig.xig_sogen = so_gencnt;
912	error = SYSCTL_OUT(req, &xig, sizeof xig);
913	if (error)
914		return (error);
915
916	inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK);
917	if (inp_list == 0)
918		return (ENOMEM);
919
920	INP_INFO_RLOCK(&ripcbinfo);
921	for (inp = LIST_FIRST(ripcbinfo.ipi_listhead), i = 0; inp && i < n;
922	     inp = LIST_NEXT(inp, inp_list)) {
923		INP_RLOCK(inp);
924		if (inp->inp_gencnt <= gencnt &&
925		    cr_canseesocket(req->td->td_ucred, inp->inp_socket) == 0) {
926			/* XXX held references? */
927			inp_list[i++] = inp;
928		}
929		INP_RUNLOCK(inp);
930	}
931	INP_INFO_RUNLOCK(&ripcbinfo);
932	n = i;
933
934	error = 0;
935	for (i = 0; i < n; i++) {
936		inp = inp_list[i];
937		INP_RLOCK(inp);
938		if (inp->inp_gencnt <= gencnt) {
939			struct xinpcb xi;
940			bzero(&xi, sizeof(xi));
941			xi.xi_len = sizeof xi;
942			/* XXX should avoid extra copy */
943			bcopy(inp, &xi.xi_inp, sizeof *inp);
944			if (inp->inp_socket)
945				sotoxsocket(inp->inp_socket, &xi.xi_socket);
946			INP_RUNLOCK(inp);
947			error = SYSCTL_OUT(req, &xi, sizeof xi);
948		} else
949			INP_RUNLOCK(inp);
950	}
951	if (!error) {
952		/*
953		 * Give the user an updated idea of our state.  If the
954		 * generation differs from what we told her before, she knows
955		 * that something happened while we were processing this
956		 * request, and it might be necessary to retry.
957		 */
958		INP_INFO_RLOCK(&ripcbinfo);
959		xig.xig_gen = ripcbinfo.ipi_gencnt;
960		xig.xig_sogen = so_gencnt;
961		xig.xig_count = ripcbinfo.ipi_count;
962		INP_INFO_RUNLOCK(&ripcbinfo);
963		error = SYSCTL_OUT(req, &xig, sizeof xig);
964	}
965	free(inp_list, M_TEMP);
966	return (error);
967}
968
969SYSCTL_PROC(_net_inet_raw, OID_AUTO/*XXX*/, pcblist, CTLFLAG_RD, 0, 0,
970    rip_pcblist, "S,xinpcb", "List of active raw IP sockets");
971
972struct pr_usrreqs rip_usrreqs = {
973	.pru_abort =		rip_abort,
974	.pru_attach =		rip_attach,
975	.pru_bind =		rip_bind,
976	.pru_connect =		rip_connect,
977	.pru_control =		in_control,
978	.pru_detach =		rip_detach,
979	.pru_disconnect =	rip_disconnect,
980	.pru_peeraddr =		in_getpeeraddr,
981	.pru_send =		rip_send,
982	.pru_shutdown =		rip_shutdown,
983	.pru_sockaddr =		in_getsockaddr,
984	.pru_sosetlabel =	in_pcbsosetlabel,
985	.pru_close =		rip_close,
986};
987