/*
 * Copyright (c) 2000-2012 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * Copyright (c) 1982, 1986, 1988, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)raw_ip.c	8.7 (Berkeley) 5/15/95
 */
/*
 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
 * support for mandatory and extensible security protections.  This notice
 * is included in support of clause 2.2 (b) of the Apple Public Source
 * License, Version 2.0.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/mcache.h>
#include <sys/proc.h>
#include <sys/domain.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <libkern/OSAtomic.h>
#include <kern/zalloc.h>

#include <pexpert/pexpert.h>

#include <net/if.h>
#include <net/route.h>

#define _IP_VHL
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/in_var.h>
#include <netinet/ip_var.h>
#include <netinet/ip_mroute.h>

#if INET6
#include <netinet6/in6_pcb.h>
#endif /* INET6 */

#include <netinet/ip_fw.h>

#if IPSEC
#include <netinet6/ipsec.h>
#endif /*IPSEC*/

#if DUMMYNET
#include <netinet/ip_dummynet.h>
#endif /* DUMMYNET */

#if CONFIG_MACF_NET
#include <security/mac_framework.h>
#endif /* CONFIG_MACF_NET */

int load_ipfw(void);
int rip_detach(struct socket *);
int rip_abort(struct socket *);
int rip_disconnect(struct socket *);
int rip_bind(struct socket *, struct sockaddr *, struct proc *);
int rip_connect(struct socket *, struct sockaddr *, struct proc *);
int rip_shutdown(struct socket *);

#if IPSEC
extern int ipsec_bypass;
#endif /* IPSEC */

struct	inpcbhead ripcb;
struct	inpcbinfo ripcbinfo;

/* control hooks for ipfw and dummynet */
#if IPFIREWALL
ip_fw_ctl_t *ip_fw_ctl_ptr;
#endif /* IPFIREWALL */
#if DUMMYNET
ip_dn_ctl_t *ip_dn_ctl_ptr;
#endif /* DUMMYNET */

/*
 * Nominal space allocated to a raw ip socket.
 */
#define	RIPSNDQ		8192
#define	RIPRCVQ		8192

/*
 * Raw interface to IP protocol.
 */

/*
 * Initialize raw connection block q.
 */
void
rip_init(void)
{
	struct inpcbinfo *pcbinfo;

	LIST_INIT(&ripcb);
	ripcbinfo.listhead = &ripcb;
	/*
	 * XXX We don't use the hash list for raw IP, but it's easier
	 * to allocate a one entry hash list than it is to check all
	 * over the place for hashbase == NULL.
	 */
	ripcbinfo.hashbase = hashinit(1, M_PCB, &ripcbinfo.hashmask);
	ripcbinfo.porthashbase = hashinit(1, M_PCB, &ripcbinfo.porthashmask);

	ripcbinfo.ipi_zone = (void *) zinit(sizeof(struct inpcb),
					    (4096 * sizeof(struct inpcb)),
					    4096, "ripzone");

	pcbinfo = &ripcbinfo;
	/*
	 * allocate lock group attribute and group for raw IP pcb mutexes
	 */
	pcbinfo->mtx_grp_attr = lck_grp_attr_alloc_init();

	pcbinfo->mtx_grp = lck_grp_alloc_init("ripcb", pcbinfo->mtx_grp_attr);

	/*
	 * allocate the lock attribute for raw IP pcb mutexes
	 */
	pcbinfo->mtx_attr = lck_attr_alloc_init();

	if ((pcbinfo->mtx = lck_rw_alloc_init(pcbinfo->mtx_grp, pcbinfo->mtx_attr)) == NULL)
		return;	/* pretty much dead if this fails... */
}

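/*
 * Prototype source address passed to sbappendaddr() for inbound raw IP
 * packets; rip_input() fills in sin_addr from the received IP header
 * before each delivery.
 */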
static struct	sockaddr_in ripsrc = { sizeof(ripsrc), AF_INET, 0, {0}, {0,0,0,0,0,0,0,0,} };
/*
 * Setup generic address and protocol structures
 * for raw_input routine, then pass them along with
 * mbuf chain.
 */
void
rip_input(struct mbuf *m, int iphlen)
{
	register struct ip *ip = mtod(m, struct ip *);
	register struct inpcb *inp;
	struct inpcb *last = 0;
	struct mbuf *opts = 0;
	int skipit = 0, ret = 0;

	/* Expect 32-bit aligned data pointer on strict-align platforms */
	MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m);

	ripsrc.sin_addr = ip->ip_src;
	lck_rw_lock_shared(ripcbinfo.mtx);
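	/*
	 * Walk every raw PCB and hand a copy of the packet to each socket
	 * whose protocol, local and foreign addresses match; the last match
	 * found receives the original mbuf below, after the loop.
	 */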
	LIST_FOREACH(inp, &ripcb, inp_list) {
#if INET6
		if ((inp->inp_vflag & INP_IPV4) == 0)
			continue;
#endif
		if (inp->inp_ip_p && (inp->inp_ip_p != ip->ip_p))
			continue;
		if (inp->inp_laddr.s_addr &&
		    inp->inp_laddr.s_addr != ip->ip_dst.s_addr)
			continue;
		if (inp->inp_faddr.s_addr &&
		    inp->inp_faddr.s_addr != ip->ip_src.s_addr)
			continue;
		if (last) {
			struct mbuf *n = m_copy(m, 0, (int)M_COPYALL);

			skipit = 0;
#if IPSEC
			/* check AH/ESP integrity. */
			if (ipsec_bypass == 0 && n) {
				if (ipsec4_in_reject_so(n, last->inp_socket)) {
					m_freem(n);
					IPSEC_STAT_INCREMENT(ipsecstat.in_polvio);
					/* do not inject data to pcb */
					skipit = 1;
				}
			}
#endif /*IPSEC*/
#if CONFIG_MACF_NET
			if (n && skipit == 0) {
				if (mac_inpcb_check_deliver(last, n, AF_INET,
				    SOCK_RAW) != 0) {
					m_freem(n);
					skipit = 1;
				}
			}
#endif
			if (n && skipit == 0) {
				int error = 0;
				if ((last->inp_flags & INP_CONTROLOPTS) != 0 ||
				    (last->inp_socket->so_options & SO_TIMESTAMP) != 0 ||
				    (last->inp_socket->so_options & SO_TIMESTAMP_MONOTONIC) != 0) {
					ret = ip_savecontrol(last, &opts, ip, n);
					if (ret != 0) {
						m_freem(n);
						m_freem(opts);
						last = inp;
						continue;
					}
				}
				if (last->inp_flags & INP_STRIPHDR) {
					n->m_len -= iphlen;
					n->m_pkthdr.len -= iphlen;
					n->m_data += iphlen;
				}
				so_recv_data_stat(last->inp_socket, m, 0);
				if (sbappendaddr(&last->inp_socket->so_rcv,
				    (struct sockaddr *)&ripsrc, n,
				    opts, &error) != 0) {
					sorwakeup(last->inp_socket);
				} else {
					if (error) {
						/* should notify about lost packet */
						kprintf("rip_input can't append to socket\n");
					}
				}
				opts = 0;
			}
		}
		last = inp;
	}

	skipit = 0;
#if IPSEC
	/* check AH/ESP integrity. */
	if (ipsec_bypass == 0 && last) {
		if (ipsec4_in_reject_so(m, last->inp_socket)) {
			m_freem(m);
			IPSEC_STAT_INCREMENT(ipsecstat.in_polvio);
			OSAddAtomic(1, &ipstat.ips_delivered);
			/* do not inject data to pcb */
			skipit = 1;
		}
	}
#endif /*IPSEC*/
#if CONFIG_MACF_NET
	if (last && skipit == 0) {
		if (mac_inpcb_check_deliver(last, m, AF_INET, SOCK_RAW) != 0) {
			skipit = 1;
			m_freem(m);
		}
	}
#endif
	if (skipit == 0) {
		if (last) {
			if ((last->inp_flags & INP_CONTROLOPTS) != 0 ||
			    (last->inp_socket->so_options & SO_TIMESTAMP) != 0 ||
			    (last->inp_socket->so_options & SO_TIMESTAMP_MONOTONIC) != 0) {
				ret = ip_savecontrol(last, &opts, ip, m);
				if (ret != 0) {
					m_freem(m);
					m_freem(opts);
					goto unlock;
				}
			}
			if (last->inp_flags & INP_STRIPHDR) {
				m->m_len -= iphlen;
				m->m_pkthdr.len -= iphlen;
				m->m_data += iphlen;
			}
			so_recv_data_stat(last->inp_socket, m, 0);
			if (sbappendaddr(&last->inp_socket->so_rcv,
			    (struct sockaddr *)&ripsrc, m, opts, NULL) != 0) {
				sorwakeup(last->inp_socket);
			} else {
				kprintf("rip_input(2) can't append to socket\n");
			}
		} else {
			m_freem(m);
			OSAddAtomic(1, &ipstat.ips_noproto);
			OSAddAtomic(-1, &ipstat.ips_delivered);
		}
	}
unlock:
	/*
	 * Keep the list locked because socket filter may force the socket lock
	 * to be released when calling sbappendaddr() -- see rdar://7627704
	 */
	lck_rw_done(ripcbinfo.mtx);
}

/*
 * Generate IP header and pass packet to ip_output.
 * Tack on options user may have setup with control call.
 */
int
rip_output(
	struct mbuf *m,
	struct socket *so,
	u_int32_t dst,
	struct mbuf *control)
{
	register struct ip *ip;
	register struct inpcb *inp = sotoinpcb(so);
	int flags = (so->so_options & SO_DONTROUTE) | IP_ALLOWBROADCAST;
	struct ip_out_args ipoa = { IFSCOPE_NONE, { 0 }, IPOAF_SELECT_SRCIF };
	struct ip_moptions *imo;
	int error = 0;
	mbuf_svc_class_t msc = MBUF_SC_UNSPEC;

	if (control != NULL) {
		msc = mbuf_service_class_from_control(control);

		m_freem(control);
	}

	flags |= IP_OUTARGS;
	/* If socket was bound to an ifindex, tell ip_output about it */
	if (inp->inp_flags & INP_BOUND_IF) {
		ipoa.ipoa_boundif = inp->inp_boundifp->if_index;
		ipoa.ipoa_flags |= IPOAF_BOUND_IF;
	}
	if (inp->inp_flags & INP_NO_IFT_CELLULAR)
		ipoa.ipoa_flags |= IPOAF_NO_CELLULAR;

	if (inp->inp_flowhash == 0)
		inp->inp_flowhash = inp_calc_flowhash(inp);

	/*
	 * If the user handed us a complete IP packet, use it.
	 * Otherwise, allocate an mbuf for a header and fill it in.
	 */
	if ((inp->inp_flags & INP_HDRINCL) == 0) {
		if (m->m_pkthdr.len + sizeof(struct ip) > IP_MAXPACKET) {
			m_freem(m);
			return(EMSGSIZE);
		}
		M_PREPEND(m, sizeof(struct ip), M_WAIT);
		if (m == NULL)
			return ENOBUFS;
		ip = mtod(m, struct ip *);
		ip->ip_tos = inp->inp_ip_tos;
		ip->ip_off = 0;
		ip->ip_p = inp->inp_ip_p;
		ip->ip_len = m->m_pkthdr.len;
		ip->ip_src = inp->inp_laddr;
		ip->ip_dst.s_addr = dst;
		ip->ip_ttl = inp->inp_ip_ttl;
	} else {
		if (m->m_pkthdr.len > IP_MAXPACKET) {
			m_freem(m);
			return(EMSGSIZE);
		}
		ip = mtod(m, struct ip *);
		/* don't allow both user specified and setsockopt options,
		   and don't allow packet length sizes that will crash */
		if (((IP_VHL_HL(ip->ip_vhl) != (sizeof (*ip) >> 2))
		     && inp->inp_options)
		    || (ip->ip_len > m->m_pkthdr.len)
		    || (ip->ip_len < (IP_VHL_HL(ip->ip_vhl) << 2))) {
			m_freem(m);
			return EINVAL;
		}
		if (ip->ip_id == 0)
#if RANDOM_IP_ID
			ip->ip_id = ip_randomid();
#else
			ip->ip_id = htons(ip_id++);
#endif
		/* XXX prevent ip_output from overwriting header fields */
		flags |= IP_RAWOUTPUT;
		OSAddAtomic(1, &ipstat.ips_rawout);
	}

	if (inp->inp_laddr.s_addr != INADDR_ANY)
		ipoa.ipoa_flags |= IPOAF_BOUND_SRCADDR;

#if IPSEC
	if (ipsec_bypass == 0 && ipsec_setsocket(m, so) != 0) {
		m_freem(m);
		return ENOBUFS;
	}
#endif /*IPSEC*/

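	/*
	 * The PCB may be caching a route from a previous send; if the global
	 * route generation count has changed since it was cached, drop it so
	 * that ip_output() looks up a fresh route below.
	 */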
	if (inp->inp_route.ro_rt != NULL &&
	    inp->inp_route.ro_rt->generation_id != route_generation) {
		rtfree(inp->inp_route.ro_rt);
		inp->inp_route.ro_rt = NULL;
	}

	set_packet_service_class(m, so, msc, 0);
	m->m_pkthdr.m_flowhash = inp->inp_flowhash;
	m->m_pkthdr.m_fhflags |= PF_TAG_FLOWHASH;

#if CONFIG_MACF_NET
	mac_mbuf_label_associate_inpcb(inp, m);
#endif

	imo = inp->inp_moptions;
	if (imo != NULL)
		IMO_ADDREF(imo);
	/*
	 * The domain lock is held across ip_output, so it is okay
	 * to pass the PCB cached route pointer directly to IP and
	 * the modules beneath it.
	 */
	error = ip_output(m, inp->inp_options, &inp->inp_route, flags,
	    imo, &ipoa);

	if (imo != NULL)
		IMO_REMREF(imo);

	if (inp->inp_route.ro_rt != NULL) {
		struct rtentry *rt = inp->inp_route.ro_rt;
		struct ifnet *outif;

		if ((rt->rt_flags & (RTF_MULTICAST|RTF_BROADCAST)) ||
		    inp->inp_socket == NULL ||
		    !(inp->inp_socket->so_state & SS_ISCONNECTED)) {
			rt = NULL;	/* unusable */
		}
		/*
		 * Always discard the cached route for an unconnected
		 * socket or if it is a multicast/broadcast route.
		 */
		if (rt == NULL) {
			rtfree(inp->inp_route.ro_rt);
			inp->inp_route.ro_rt = NULL;
		}
		/*
		 * If this is a connected socket and the destination
		 * route is unicast, update outif with that of the
		 * route interface used by IP.
		 */
		if (rt != NULL && (outif = rt->rt_ifp) != inp->inp_last_outifp)
			inp->inp_last_outifp = outif;
	}

	return (error);
}

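/*
 * Lazily bring the ipfw subsystem up (and dummynet, when configured) the
 * first time a firewall socket option is seen; returns -1 if the ipfw
 * control hook was not registered.
 */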
#if IPFIREWALL
int
load_ipfw(void)
{
	kern_return_t	err;

	ipfw_init();

#if DUMMYNET
	if (!DUMMYNET_LOADED)
		ip_dn_init();
#endif /* DUMMYNET */
	err = 0;

	return err == 0 && ip_fw_ctl_ptr == NULL ? -1 : err;
}
#endif /* IPFIREWALL */

/*
 * Raw IP socket option processing.
 */
int
rip_ctloutput(struct socket *so, struct sockopt *sopt)
{
	struct	inpcb *inp = sotoinpcb(so);
	int	error, optval;

	/* Allow <SOL_SOCKET,SO_FLUSH> at this level */
	if (sopt->sopt_level != IPPROTO_IP &&
	    !(sopt->sopt_level == SOL_SOCKET && sopt->sopt_name == SO_FLUSH))
		return (EINVAL);

	error = 0;

	switch (sopt->sopt_dir) {
	case SOPT_GET:
		switch (sopt->sopt_name) {
		case IP_HDRINCL:
			optval = inp->inp_flags & INP_HDRINCL;
			error = sooptcopyout(sopt, &optval, sizeof optval);
			break;

		case IP_STRIPHDR:
			optval = inp->inp_flags & INP_STRIPHDR;
			error = sooptcopyout(sopt, &optval, sizeof optval);
			break;

#if IPFIREWALL
		case IP_FW_ADD:
		case IP_FW_GET:
		case IP_OLD_FW_ADD:
		case IP_OLD_FW_GET:
			if (ip_fw_ctl_ptr == 0)
				error = load_ipfw();
			if (ip_fw_ctl_ptr && error == 0)
				error = ip_fw_ctl_ptr(sopt);
			else
				error = ENOPROTOOPT;
			break;
#endif /* IPFIREWALL */

#if DUMMYNET
		case IP_DUMMYNET_GET:
			if (!DUMMYNET_LOADED)
				ip_dn_init();
			if (DUMMYNET_LOADED)
				error = ip_dn_ctl_ptr(sopt);
			else
				error = ENOPROTOOPT;
			break;
#endif /* DUMMYNET */

#if MROUTING
		case MRT_INIT:
		case MRT_DONE:
		case MRT_ADD_VIF:
		case MRT_DEL_VIF:
		case MRT_ADD_MFC:
		case MRT_DEL_MFC:
		case MRT_VERSION:
		case MRT_ASSERT:
			error = ip_mrouter_get(so, sopt);
			break;
#endif /* MROUTING */

		default:
			error = ip_ctloutput(so, sopt);
			break;
		}
		break;

	case SOPT_SET:
		switch (sopt->sopt_name) {
		case IP_HDRINCL:
			error = sooptcopyin(sopt, &optval, sizeof optval,
			    sizeof optval);
			if (error)
				break;
			if (optval)
				inp->inp_flags |= INP_HDRINCL;
			else
				inp->inp_flags &= ~INP_HDRINCL;
			break;

		case IP_STRIPHDR:
			error = sooptcopyin(sopt, &optval, sizeof optval,
			    sizeof optval);
			if (error)
				break;
			if (optval)
				inp->inp_flags |= INP_STRIPHDR;
			else
				inp->inp_flags &= ~INP_STRIPHDR;
			break;

#if IPFIREWALL
		case IP_FW_ADD:
		case IP_FW_DEL:
		case IP_FW_FLUSH:
		case IP_FW_ZERO:
		case IP_FW_RESETLOG:
		case IP_OLD_FW_ADD:
		case IP_OLD_FW_DEL:
		case IP_OLD_FW_FLUSH:
		case IP_OLD_FW_ZERO:
		case IP_OLD_FW_RESETLOG:
			if (ip_fw_ctl_ptr == 0)
				error = load_ipfw();
			if (ip_fw_ctl_ptr && error == 0)
				error = ip_fw_ctl_ptr(sopt);
			else
				error = ENOPROTOOPT;
			break;
#endif /* IPFIREWALL */

#if DUMMYNET
		case IP_DUMMYNET_CONFIGURE:
		case IP_DUMMYNET_DEL:
		case IP_DUMMYNET_FLUSH:
			if (!DUMMYNET_LOADED)
				ip_dn_init();
			if (DUMMYNET_LOADED)
				error = ip_dn_ctl_ptr(sopt);
			else
				error = ENOPROTOOPT;
			break;
#endif /* DUMMYNET */

#if MROUTING
		case IP_RSVP_ON:
			error = ip_rsvp_init(so);
			break;

		case IP_RSVP_OFF:
			error = ip_rsvp_done();
			break;

			/* XXX - should be combined */
		case IP_RSVP_VIF_ON:
			error = ip_rsvp_vif_init(so, sopt);
			break;

		case IP_RSVP_VIF_OFF:
			error = ip_rsvp_vif_done(so, sopt);
			break;

		case MRT_INIT:
		case MRT_DONE:
		case MRT_ADD_VIF:
		case MRT_DEL_VIF:
		case MRT_ADD_MFC:
		case MRT_DEL_MFC:
		case MRT_VERSION:
		case MRT_ASSERT:
			error = ip_mrouter_set(so, sopt);
			break;
#endif /* MROUTING */

		case SO_FLUSH:
			if ((error = sooptcopyin(sopt, &optval, sizeof (optval),
			    sizeof (optval))) != 0)
				break;

			error = inp_flush(inp, optval);
			break;

		default:
			error = ip_ctloutput(so, sopt);
			break;
		}
		break;
	}

	return (error);
}

689
690/*
691 * This function exists solely to receive the PRC_IFDOWN messages which
692 * are sent by if_down().  It looks for an ifaddr whose ifa_addr is sa,
693 * and calls in_ifadown() to remove all routes corresponding to that address.
694 * It also receives the PRC_IFUP messages from if_up() and reinstalls the
695 * interface routes.
696 */
697void
698rip_ctlinput(
699	int cmd,
700	struct sockaddr *sa,
701	__unused void *vip)
702{
703	struct in_ifaddr *ia;
704	struct ifnet *ifp;
705	int err;
706	int flags, done = 0;
707
708	switch (cmd) {
709	case PRC_IFDOWN:
710		lck_rw_lock_shared(in_ifaddr_rwlock);
711		for (ia = in_ifaddrhead.tqh_first; ia;
712		     ia = ia->ia_link.tqe_next) {
713			IFA_LOCK(&ia->ia_ifa);
714			if (ia->ia_ifa.ifa_addr == sa &&
715			    (ia->ia_flags & IFA_ROUTE)) {
716				done = 1;
717				IFA_ADDREF_LOCKED(&ia->ia_ifa);
718				IFA_UNLOCK(&ia->ia_ifa);
719				lck_rw_done(in_ifaddr_rwlock);
720				lck_mtx_lock(rnh_lock);
721				/*
722				 * in_ifscrub kills the interface route.
723				 */
724				in_ifscrub(ia->ia_ifp, ia, 1);
725				/*
726				 * in_ifadown gets rid of all the rest of
727				 * the routes.  This is not quite the right
728				 * thing to do, but at least if we are running
729				 * a routing process they will come back.
730				 */
731				in_ifadown(&ia->ia_ifa, 1);
732				lck_mtx_unlock(rnh_lock);
733				IFA_REMREF(&ia->ia_ifa);
734				break;
735			}
736			IFA_UNLOCK(&ia->ia_ifa);
737		}
738		if (!done)
739			lck_rw_done(in_ifaddr_rwlock);
740		break;
741
742	case PRC_IFUP:
743		lck_rw_lock_shared(in_ifaddr_rwlock);
744		for (ia = in_ifaddrhead.tqh_first; ia;
745		     ia = ia->ia_link.tqe_next) {
746			IFA_LOCK(&ia->ia_ifa);
747			if (ia->ia_ifa.ifa_addr == sa) {
748				/* keep it locked */
749				break;
750			}
751			IFA_UNLOCK(&ia->ia_ifa);
752		}
753		if (ia == NULL || (ia->ia_flags & IFA_ROUTE) ||
754		    (ia->ia_ifa.ifa_debug & IFD_NOTREADY)) {
755			if (ia != NULL)
756				IFA_UNLOCK(&ia->ia_ifa);
757			lck_rw_done(in_ifaddr_rwlock);
758			return;
759		}
760		IFA_ADDREF_LOCKED(&ia->ia_ifa);
761		IFA_UNLOCK(&ia->ia_ifa);
762		lck_rw_done(in_ifaddr_rwlock);
763
764		flags = RTF_UP;
765		ifp = ia->ia_ifa.ifa_ifp;
766
767		if ((ifp->if_flags & IFF_LOOPBACK)
768		    || (ifp->if_flags & IFF_POINTOPOINT))
769			flags |= RTF_HOST;
770
771		err = rtinit(&ia->ia_ifa, RTM_ADD, flags);
772		if (err == 0) {
773			IFA_LOCK_SPIN(&ia->ia_ifa);
774			ia->ia_flags |= IFA_ROUTE;
775			IFA_UNLOCK(&ia->ia_ifa);
776		}
777		IFA_REMREF(&ia->ia_ifa);
778		break;
779	}
780}
781
782u_int32_t	rip_sendspace = RIPSNDQ;
783u_int32_t	rip_recvspace = RIPRCVQ;
784
785SYSCTL_INT(_net_inet_raw, OID_AUTO, maxdgram, CTLFLAG_RW | CTLFLAG_LOCKED,
786    &rip_sendspace, 0, "Maximum outgoing raw IP datagram size");
787SYSCTL_INT(_net_inet_raw, OID_AUTO, recvspace, CTLFLAG_RW | CTLFLAG_LOCKED,
788    &rip_recvspace, 0, "Maximum incoming raw IP datagram size");
789
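/*
 * Attach a new raw IP socket: only privileged sockets may create one.
 * Reserve send/receive buffer space and allocate a PCB marked IPv4,
 * recording the protocol number the socket was created with.
 */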
static int
rip_attach(struct socket *so, int proto, struct proc *p)
{
	struct inpcb *inp;
	int error;

	inp = sotoinpcb(so);
	if (inp)
		panic("rip_attach");
	if ((so->so_state & SS_PRIV) == 0)
		return (EPERM);

	error = soreserve(so, rip_sendspace, rip_recvspace);
	if (error)
		return error;
	error = in_pcballoc(so, &ripcbinfo, p);
	if (error)
		return error;
	inp = (struct inpcb *)so->so_pcb;
	inp->inp_vflag |= INP_IPV4;
	inp->inp_ip_p = proto;
	inp->inp_ip_ttl = ip_defttl;
	return 0;
}

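/*
 * Detach a raw IP socket, tearing down any multicast-routing or RSVP
 * state it owned before releasing the PCB.
 */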
814
815__private_extern__ int
816rip_detach(struct socket *so)
817{
818	struct inpcb *inp;
819
820	inp = sotoinpcb(so);
821	if (inp == 0)
822		panic("rip_detach");
823#if MROUTING
824	if (so == ip_mrouter)
825		ip_mrouter_done();
826	ip_rsvp_force_done(so);
827	if (so == ip_rsvpd)
828		ip_rsvp_done();
829#endif /* MROUTING */
830	in_pcbdetach(inp);
831	return 0;
832}
833
834__private_extern__ int
835rip_abort(struct socket *so)
836{
837	soisdisconnected(so);
838	return rip_detach(so);
839}
840
841__private_extern__ int
842rip_disconnect(struct socket *so)
843{
844	if ((so->so_state & SS_ISCONNECTED) == 0)
845		return ENOTCONN;
846	return rip_abort(so);
847}
848
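/*
 * Bind a raw IP socket: the address must be INADDR_ANY or belong to a
 * local interface; remember the local address and outgoing interface.
 */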
__private_extern__ int
rip_bind(struct socket *so, struct sockaddr *nam, __unused struct proc *p)
{
	struct inpcb *inp = sotoinpcb(so);
	struct sockaddr_in *addr = (struct sockaddr_in *)(void *)nam;
	struct ifaddr *ifa = NULL;
	struct ifnet *outif = NULL;

	if (nam->sa_len != sizeof(*addr))
		return EINVAL;

	if (TAILQ_EMPTY(&ifnet_head) || ((addr->sin_family != AF_INET) &&
	    (addr->sin_family != AF_IMPLINK)) ||
	    (addr->sin_addr.s_addr &&
	    (ifa = ifa_ifwithaddr((struct sockaddr *)addr)) == 0)) {
		return EADDRNOTAVAIL;
	} else if (ifa) {
		IFA_LOCK(ifa);
		outif = ifa->ifa_ifp;
		IFA_UNLOCK(ifa);
		IFA_REMREF(ifa);
	}
	inp->inp_laddr = addr->sin_addr;
	inp->inp_last_outifp = outif;
	return 0;
}

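/*
 * Connect a raw IP socket: record the foreign address so that unaddressed
 * sends go to it and only matching packets are delivered on input.
 */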
__private_extern__ int
rip_connect(struct socket *so, struct sockaddr *nam, __unused struct proc *p)
{
	struct inpcb *inp = sotoinpcb(so);
	struct sockaddr_in *addr = (struct sockaddr_in *)(void *)nam;

	if (nam->sa_len != sizeof(*addr))
		return EINVAL;
	if (TAILQ_EMPTY(&ifnet_head))
		return EADDRNOTAVAIL;
	if ((addr->sin_family != AF_INET) &&
	    (addr->sin_family != AF_IMPLINK))
		return EAFNOSUPPORT;
	inp->inp_faddr = addr->sin_addr;
	soisconnected(so);

	return 0;
}

__private_extern__ int
rip_shutdown(struct socket *so)
{
	socantsendmore(so);
	return 0;
}

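/*
 * Send on a raw IP socket: a destination address may be supplied only
 * for unconnected sockets; connected sockets always use the address
 * recorded by rip_connect().
 */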
__private_extern__ int
rip_send(struct socket *so, __unused int flags, struct mbuf *m, struct sockaddr *nam,
	struct mbuf *control, __unused struct proc *p)
{
	struct inpcb *inp = sotoinpcb(so);
	register u_int32_t dst;

	if (so->so_state & SS_ISCONNECTED) {
		if (nam) {
			m_freem(m);
			return EISCONN;
		}
		dst = inp->inp_faddr.s_addr;
	} else {
		if (nam == NULL) {
			m_freem(m);
			return ENOTCONN;
		}
		dst = ((struct sockaddr_in *)(void *)nam)->sin_addr.s_addr;
	}
	return rip_output(m, so, dst, control);
}

/*
 * Note: rip_unlock is called by various protocols instead of the generic
 * socket_unlock; it handles the socket deallocation on the last reference.
 */
int
rip_unlock(struct socket *so, int refcount, void *debug)
{
	void *lr_saved;
	struct inpcb *inp = sotoinpcb(so);

	if (debug == NULL)
		lr_saved = __builtin_return_address(0);
	else
		lr_saved = debug;

	if (refcount) {
		if (so->so_usecount <= 0) {
			panic("rip_unlock: bad refcount so=%p val=%x lrh= %s\n",
			    so, so->so_usecount, solockhistory_nr(so));
			/* NOTREACHED */
		}
		so->so_usecount--;
		if (so->so_usecount == 0 && (inp->inp_wantcnt == WNT_STOPUSING)) {
			/* cleanup after last reference */
			lck_mtx_unlock(so->so_proto->pr_domain->dom_mtx);
			lck_rw_lock_exclusive(ripcbinfo.mtx);
			if (inp->inp_state != INPCB_STATE_DEAD) {
#if INET6
				if (INP_CHECK_SOCKAF(so, AF_INET6))
					in6_pcbdetach(inp);
				else
#endif /* INET6 */
				in_pcbdetach(inp);
			}
			in_pcbdispose(inp);
			lck_rw_done(ripcbinfo.mtx);
			return(0);
		}
	}
	so->unlock_lr[so->next_unlock_lr] = lr_saved;
	so->next_unlock_lr = (so->next_unlock_lr+1) % SO_LCKDBG_MAX;
	lck_mtx_unlock(so->so_proto->pr_domain->dom_mtx);
	return(0);
}

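/*
 * Sysctl handler that exports the list of raw IP PCBs to userland as an
 * xinpgen header, one xinpcb per socket, and a trailing xinpgen whose
 * generation count lets the caller detect changes made while copying.
 */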
static int
rip_pcblist SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	int error, i, n;
	struct inpcb *inp, **inp_list;
	inp_gen_t gencnt;
	struct xinpgen xig;

	/*
	 * The process of preparing the TCB list is too time-consuming and
	 * resource-intensive to repeat twice on every request.
	 */
	lck_rw_lock_exclusive(ripcbinfo.mtx);
	if (req->oldptr == USER_ADDR_NULL) {
		n = ripcbinfo.ipi_count;
		req->oldidx = 2 * (sizeof xig)
			+ (n + n/8) * sizeof(struct xinpcb);
		lck_rw_done(ripcbinfo.mtx);
		return 0;
	}

	if (req->newptr != USER_ADDR_NULL) {
		lck_rw_done(ripcbinfo.mtx);
		return EPERM;
	}

	/*
	 * OK, now we're committed to doing something.
	 */
	gencnt = ripcbinfo.ipi_gencnt;
	n = ripcbinfo.ipi_count;

	bzero(&xig, sizeof(xig));
	xig.xig_len = sizeof xig;
	xig.xig_count = n;
	xig.xig_gen = gencnt;
	xig.xig_sogen = so_gencnt;
	error = SYSCTL_OUT(req, &xig, sizeof xig);
	if (error) {
		lck_rw_done(ripcbinfo.mtx);
		return error;
	}
	/*
	 * We are done if there is no pcb
	 */
	if (n == 0) {
		lck_rw_done(ripcbinfo.mtx);
		return 0;
	}

	inp_list = _MALLOC(n * sizeof *inp_list, M_TEMP, M_WAITOK);
	if (inp_list == 0) {
		lck_rw_done(ripcbinfo.mtx);
		return ENOMEM;
	}

	for (inp = ripcbinfo.listhead->lh_first, i = 0; inp && i < n;
	     inp = inp->inp_list.le_next) {
		if (inp->inp_gencnt <= gencnt && inp->inp_state != INPCB_STATE_DEAD)
			inp_list[i++] = inp;
	}
	n = i;

	error = 0;
	for (i = 0; i < n; i++) {
		inp = inp_list[i];
		if (inp->inp_gencnt <= gencnt && inp->inp_state != INPCB_STATE_DEAD) {
			struct xinpcb xi;

			bzero(&xi, sizeof(xi));
			xi.xi_len = sizeof xi;
			/* XXX should avoid extra copy */
			inpcb_to_compat(inp, &xi.xi_inp);
			if (inp->inp_socket)
				sotoxsocket(inp->inp_socket, &xi.xi_socket);
			error = SYSCTL_OUT(req, &xi, sizeof xi);
		}
	}
	if (!error) {
		/*
		 * Give the user an updated idea of our state.
		 * If the generation differs from what we told
		 * her before, she knows that something happened
		 * while we were processing this request, and it
		 * might be necessary to retry.
		 */
		bzero(&xig, sizeof(xig));
		xig.xig_len = sizeof xig;
		xig.xig_gen = ripcbinfo.ipi_gencnt;
		xig.xig_sogen = so_gencnt;
		xig.xig_count = ripcbinfo.ipi_count;
		error = SYSCTL_OUT(req, &xig, sizeof xig);
	}
	FREE(inp_list, M_TEMP);
	lck_rw_done(ripcbinfo.mtx);
	return error;
}

SYSCTL_PROC(_net_inet_raw, OID_AUTO/*XXX*/, pcblist, CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0,
	    rip_pcblist, "S,xinpcb", "List of active raw IP sockets");

#if !CONFIG_EMBEDDED

static int
rip_pcblist64 SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	int error, i, n;
	struct inpcb *inp, **inp_list;
	inp_gen_t gencnt;
	struct xinpgen xig;

	/*
	 * The process of preparing the TCB list is too time-consuming and
	 * resource-intensive to repeat twice on every request.
	 */
	lck_rw_lock_exclusive(ripcbinfo.mtx);
	if (req->oldptr == USER_ADDR_NULL) {
		n = ripcbinfo.ipi_count;
		req->oldidx = 2 * (sizeof xig)
			+ (n + n/8) * sizeof(struct xinpcb64);
		lck_rw_done(ripcbinfo.mtx);
		return 0;
	}

	if (req->newptr != USER_ADDR_NULL) {
		lck_rw_done(ripcbinfo.mtx);
		return EPERM;
	}

	/*
	 * OK, now we're committed to doing something.
	 */
	gencnt = ripcbinfo.ipi_gencnt;
	n = ripcbinfo.ipi_count;

	bzero(&xig, sizeof(xig));
	xig.xig_len = sizeof xig;
	xig.xig_count = n;
	xig.xig_gen = gencnt;
	xig.xig_sogen = so_gencnt;
	error = SYSCTL_OUT(req, &xig, sizeof xig);
	if (error) {
		lck_rw_done(ripcbinfo.mtx);
		return error;
	}
	/*
	 * We are done if there is no pcb
	 */
	if (n == 0) {
		lck_rw_done(ripcbinfo.mtx);
		return 0;
	}

	inp_list = _MALLOC(n * sizeof *inp_list, M_TEMP, M_WAITOK);
	if (inp_list == 0) {
		lck_rw_done(ripcbinfo.mtx);
		return ENOMEM;
	}

	for (inp = ripcbinfo.listhead->lh_first, i = 0; inp && i < n;
	     inp = inp->inp_list.le_next) {
		if (inp->inp_gencnt <= gencnt && inp->inp_state != INPCB_STATE_DEAD)
			inp_list[i++] = inp;
	}
	n = i;

	error = 0;
	for (i = 0; i < n; i++) {
		inp = inp_list[i];
		if (inp->inp_gencnt <= gencnt && inp->inp_state != INPCB_STATE_DEAD) {
			struct xinpcb64 xi;

			bzero(&xi, sizeof(xi));
			xi.xi_len = sizeof xi;
			inpcb_to_xinpcb64(inp, &xi);
			if (inp->inp_socket)
				sotoxsocket64(inp->inp_socket, &xi.xi_socket);
			error = SYSCTL_OUT(req, &xi, sizeof xi);
		}
	}
	if (!error) {
		/*
		 * Give the user an updated idea of our state.
		 * If the generation differs from what we told
		 * her before, she knows that something happened
		 * while we were processing this request, and it
		 * might be necessary to retry.
		 */
		bzero(&xig, sizeof(xig));
		xig.xig_len = sizeof xig;
		xig.xig_gen = ripcbinfo.ipi_gencnt;
		xig.xig_sogen = so_gencnt;
		xig.xig_count = ripcbinfo.ipi_count;
		error = SYSCTL_OUT(req, &xig, sizeof xig);
	}
	FREE(inp_list, M_TEMP);
	lck_rw_done(ripcbinfo.mtx);
	return error;
}

SYSCTL_PROC(_net_inet_raw, OID_AUTO, pcblist64, CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0,
	    rip_pcblist64, "S,xinpcb64", "List of active raw IP sockets");

#endif /* !CONFIG_EMBEDDED */

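/*
 * Newer-format PCB list sysctl: defers to the shared get_pcblist_n()
 * helper, which emits the "S,xinpcb_n" records advertised below.
 */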
static int
rip_pcblist_n SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	int error = 0;

	error = get_pcblist_n(IPPROTO_IP, req, &ripcbinfo);

	return error;
}

SYSCTL_PROC(_net_inet_raw, OID_AUTO, pcblist_n, CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0,
	    rip_pcblist_n, "S,xinpcb_n", "List of active raw IP sockets");

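/*
 * Dispatch table wiring the socket layer's user requests to the raw IP
 * handlers above; operations raw sockets do not support (accept, listen,
 * out-of-band receive, etc.) map to the generic *_notsupp stubs.
 */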
struct pr_usrreqs rip_usrreqs = {
	rip_abort, pru_accept_notsupp, rip_attach, rip_bind, rip_connect,
	pru_connect2_notsupp, in_control, rip_detach, rip_disconnect,
	pru_listen_notsupp, in_setpeeraddr, pru_rcvd_notsupp,
	pru_rcvoob_notsupp, rip_send, pru_sense_null, rip_shutdown,
	in_setsockaddr, sosend, soreceive, pru_sopoll_notsupp
};
/* DSEP Review Done pl-20051213-v02 @3253 */