1/*
2 * Copyright (c) 2000-2014 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/*
29 * Copyright (c) 1982, 1986, 1988, 1993
30 *	The Regents of the University of California.  All rights reserved.
31 *
32 * Redistribution and use in source and binary forms, with or without
33 * modification, are permitted provided that the following conditions
34 * are met:
35 * 1. Redistributions of source code must retain the above copyright
36 *    notice, this list of conditions and the following disclaimer.
37 * 2. Redistributions in binary form must reproduce the above copyright
38 *    notice, this list of conditions and the following disclaimer in the
39 *    documentation and/or other materials provided with the distribution.
40 * 3. All advertising materials mentioning features or use of this software
41 *    must display the following acknowledgement:
42 *	This product includes software developed by the University of
43 *	California, Berkeley and its contributors.
44 * 4. Neither the name of the University nor the names of its contributors
45 *    may be used to endorse or promote products derived from this software
46 *    without specific prior written permission.
47 *
48 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
49 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
50 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
51 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
52 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
53 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
54 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
55 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
56 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
57 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
58 * SUCH DAMAGE.
59 *
60 *	@(#)raw_ip.c	8.7 (Berkeley) 5/15/95
61 */
62/*
63 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
64 * support for mandatory and extensible security protections.  This notice
65 * is included in support of clause 2.2 (b) of the Apple Public License,
66 * Version 2.0.
67 */
68
69#include <sys/param.h>
70#include <sys/systm.h>
71#include <sys/kernel.h>
72#include <sys/malloc.h>
73#include <sys/mbuf.h>
74#include <sys/mcache.h>
75#include <sys/proc.h>
76#include <sys/domain.h>
77#include <sys/protosw.h>
78#include <sys/socket.h>
79#include <sys/socketvar.h>
80#include <sys/sysctl.h>
81#include <libkern/OSAtomic.h>
82#include <kern/zalloc.h>
83
84#include <pexpert/pexpert.h>
85
86#include <net/if.h>
87#include <net/route.h>
88
89#define _IP_VHL
90#include <netinet/in.h>
91#include <netinet/in_systm.h>
92#include <netinet/ip.h>
93#include <netinet/in_pcb.h>
94#include <netinet/in_var.h>
95#include <netinet/ip_var.h>
96
97#if INET6
98#include <netinet6/in6_pcb.h>
99#endif /* INET6 */
100
101#include <netinet/ip_fw.h>
102
103#if IPSEC
104#include <netinet6/ipsec.h>
105#endif /*IPSEC*/
106
107#if DUMMYNET
108#include <netinet/ip_dummynet.h>
109#endif
110
111#if CONFIG_MACF_NET
112#include <security/mac_framework.h>
113#endif /* MAC_NET */
114
/*
 * Forward declarations for routines defined later in this file.
 * Note that load_ipfw() is only defined when IPFIREWALL is enabled.
 */
int load_ipfw(void);
int rip_detach(struct socket *);
int rip_abort(struct socket *);
int rip_disconnect(struct socket *);
int rip_bind(struct socket *, struct sockaddr *, struct proc *);
int rip_connect(struct socket *, struct sockaddr *, struct proc *);
int rip_shutdown(struct socket *);

/*
 * ripcb is the list head of all raw IP PCBs; ripcbinfo is the PCB info
 * record that points at it.  Both are set up by rip_init().
 */
struct	inpcbhead ripcb;
struct	inpcbinfo ripcbinfo;

/* control hooks for ipfw and dummynet */
#if IPFIREWALL
ip_fw_ctl_t *ip_fw_ctl_ptr;
#endif /* IPFIREWALL */
#if DUMMYNET
ip_dn_ctl_t *ip_dn_ctl_ptr;
#endif /* DUMMYNET */

/*
 * Nominal space allocated to a raw ip socket.
 * These are the initial values of rip_sendspace and rip_recvspace.
 */
#define	RIPSNDQ		8192
#define	RIPRCVQ		8192
139
140/*
141 * Raw interface to IP protocol.
142 */
143
144/*
145 * Initialize raw connection block q.
146 */
147void
148rip_init(struct protosw *pp, struct domain *dp)
149{
150#pragma unused(dp)
151	static int rip_initialized = 0;
152	struct inpcbinfo *pcbinfo;
153
154	VERIFY((pp->pr_flags & (PR_INITIALIZED|PR_ATTACHED)) == PR_ATTACHED);
155
156	if (rip_initialized)
157		return;
158	rip_initialized = 1;
159
160	LIST_INIT(&ripcb);
161	ripcbinfo.ipi_listhead = &ripcb;
162	/*
163	 * XXX We don't use the hash list for raw IP, but it's easier
164	 * to allocate a one entry hash list than it is to check all
165	 * over the place for ipi_hashbase == NULL.
166	 */
167	ripcbinfo.ipi_hashbase = hashinit(1, M_PCB, &ripcbinfo.ipi_hashmask);
168	ripcbinfo.ipi_porthashbase = hashinit(1, M_PCB, &ripcbinfo.ipi_porthashmask);
169
170	ripcbinfo.ipi_zone = zinit(sizeof(struct inpcb),
171	    (4096 * sizeof(struct inpcb)), 4096, "ripzone");
172
173	pcbinfo = &ripcbinfo;
174        /*
175	 * allocate lock group attribute and group for udp pcb mutexes
176	 */
177	pcbinfo->ipi_lock_grp_attr = lck_grp_attr_alloc_init();
178	pcbinfo->ipi_lock_grp = lck_grp_alloc_init("ripcb", pcbinfo->ipi_lock_grp_attr);
179
180	/*
181	 * allocate the lock attribute for udp pcb mutexes
182	 */
183	pcbinfo->ipi_lock_attr = lck_attr_alloc_init();
184	if ((pcbinfo->ipi_lock = lck_rw_alloc_init(pcbinfo->ipi_lock_grp,
185	    pcbinfo->ipi_lock_attr)) == NULL) {
186		panic("%s: unable to allocate PCB lock\n", __func__);
187		/* NOTREACHED */
188	}
189
190	in_pcbinfo_attach(&ripcbinfo);
191}
192
193static struct	sockaddr_in ripsrc = { sizeof(ripsrc), AF_INET , 0, {0}, {0,0,0,0,0,0,0,0,} };
194/*
195 * Setup generic address and protocol structures
196 * for raw_input routine, then pass them along with
197 * mbuf chain.
198 */
199void
200rip_input(m, iphlen)
201	struct mbuf *m;
202	int iphlen;
203{
204	struct ip *ip = mtod(m, struct ip *);
205	struct inpcb *inp;
206	struct inpcb *last = 0;
207	struct mbuf *opts = 0;
208	int skipit = 0, ret = 0;
209	struct ifnet *ifp = m->m_pkthdr.rcvif;
210
211	/* Expect 32-bit aligned data pointer on strict-align platforms */
212	MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m);
213
214	ripsrc.sin_addr = ip->ip_src;
215	lck_rw_lock_shared(ripcbinfo.ipi_lock);
216	LIST_FOREACH(inp, &ripcb, inp_list) {
217#if INET6
218		if ((inp->inp_vflag & INP_IPV4) == 0)
219			continue;
220#endif
221		if (inp->inp_ip_p && (inp->inp_ip_p != ip->ip_p))
222			continue;
223		if (inp->inp_laddr.s_addr &&
224                  inp->inp_laddr.s_addr != ip->ip_dst.s_addr)
225			continue;
226		if (inp->inp_faddr.s_addr &&
227                  inp->inp_faddr.s_addr != ip->ip_src.s_addr)
228			continue;
229		if (inp_restricted_recv(inp, ifp))
230			continue;
231		if (last) {
232			struct mbuf *n = m_copy(m, 0, (int)M_COPYALL);
233
234			skipit = 0;
235
236#if NECP
237			if (n && !necp_socket_is_allowed_to_send_recv_v4(last, 0, 0, &ip->ip_dst, &ip->ip_src, ifp, NULL)) {
238				m_freem(n);
239				/* do not inject data to pcb */
240				skipit = 1;
241			}
242#endif /* NECP */
243#if CONFIG_MACF_NET
244			if (n && skipit == 0) {
245				if (mac_inpcb_check_deliver(last, n, AF_INET,
246				    SOCK_RAW) != 0) {
247					m_freem(n);
248					skipit = 1;
249				}
250			}
251#endif
252			if (n && skipit == 0) {
253				int error = 0;
254				if ((last->inp_flags & INP_CONTROLOPTS) != 0 ||
255				    (last->inp_socket->so_options & SO_TIMESTAMP) != 0 ||
256				    (last->inp_socket->so_options & SO_TIMESTAMP_MONOTONIC) != 0) {
257					ret = ip_savecontrol(last, &opts, ip, n);
258					if (ret != 0) {
259						m_freem(n);
260						m_freem(opts);
261						last = inp;
262						continue;
263					}
264				}
265				if (last->inp_flags & INP_STRIPHDR) {
266					n->m_len -= iphlen;
267					n->m_pkthdr.len -= iphlen;
268					n->m_data += iphlen;
269				}
270				so_recv_data_stat(last->inp_socket, m, 0);
271				if (sbappendaddr(&last->inp_socket->so_rcv,
272				    (struct sockaddr *)&ripsrc, n,
273				    opts, &error) != 0) {
274					sorwakeup(last->inp_socket);
275				} else {
276					if (error) {
277						/* should notify about lost packet */
278						kprintf("rip_input can't append to socket\n");
279					}
280				}
281				opts = 0;
282			}
283		}
284		last = inp;
285	}
286
287	skipit = 0;
288#if NECP
289	if (last && !necp_socket_is_allowed_to_send_recv_v4(last, 0, 0, &ip->ip_dst, &ip->ip_src, ifp, NULL)) {
290		m_freem(m);
291		OSAddAtomic(1, &ipstat.ips_delivered);
292		/* do not inject data to pcb */
293		skipit = 1;
294	}
295#endif /* NECP */
296#if CONFIG_MACF_NET
297	if (last && skipit == 0) {
298		if (mac_inpcb_check_deliver(last, m, AF_INET, SOCK_RAW) != 0) {
299			skipit = 1;
300			m_freem(m);
301		}
302	}
303#endif
304	if (skipit == 0) {
305		if (last) {
306			if ((last->inp_flags & INP_CONTROLOPTS) != 0 ||
307				(last->inp_socket->so_options & SO_TIMESTAMP) != 0 ||
308				(last->inp_socket->so_options & SO_TIMESTAMP_MONOTONIC) != 0) {
309				ret = ip_savecontrol(last, &opts, ip, m);
310				if (ret != 0) {
311					m_freem(m);
312					m_freem(opts);
313					goto unlock;
314				}
315			}
316			if (last->inp_flags & INP_STRIPHDR) {
317				m->m_len -= iphlen;
318				m->m_pkthdr.len -= iphlen;
319				m->m_data += iphlen;
320			}
321			so_recv_data_stat(last->inp_socket, m, 0);
322			if (sbappendaddr(&last->inp_socket->so_rcv,
323				(struct sockaddr *)&ripsrc, m, opts, NULL) != 0) {
324				sorwakeup(last->inp_socket);
325			} else {
326				kprintf("rip_input(2) can't append to socket\n");
327			}
328		} else {
329			m_freem(m);
330			OSAddAtomic(1, &ipstat.ips_noproto);
331			OSAddAtomic(-1, &ipstat.ips_delivered);
332		}
333	}
334unlock:
335	/*
336	 * Keep the list locked because socket filter may force the socket lock
337	 * to be released when calling sbappendaddr() -- see rdar://7627704
338	 */
339	lck_rw_done(ripcbinfo.ipi_lock);
340}
341
342/*
343 * Generate IP header and pass packet to ip_output.
344 * Tack on options user may have setup with control call.
345 */
346int
347rip_output(
348	struct mbuf *m,
349	struct socket *so,
350	u_int32_t dst,
351	struct mbuf *control)
352{
353	struct ip *ip;
354	struct inpcb *inp = sotoinpcb(so);
355	int flags = (so->so_options & SO_DONTROUTE) | IP_ALLOWBROADCAST;
356	struct ip_out_args ipoa =
357	    { IFSCOPE_NONE, { 0 }, IPOAF_SELECT_SRCIF, 0 };
358	struct ip_moptions *imo;
359	int error = 0;
360	mbuf_svc_class_t msc = MBUF_SC_UNSPEC;
361
362	if (control != NULL) {
363		msc = mbuf_service_class_from_control(control);
364
365		m_freem(control);
366		control = NULL;
367	}
368
369	if (inp == NULL
370#if NECP
371		|| (necp_socket_should_use_flow_divert(inp))
372#endif /* NECP */
373		) {
374		if (m != NULL)
375			m_freem(m);
376		VERIFY(control == NULL);
377		return (inp == NULL ? EINVAL : EPROTOTYPE);
378	}
379
380	flags |= IP_OUTARGS;
381	/* If socket was bound to an ifindex, tell ip_output about it */
382	if (inp->inp_flags & INP_BOUND_IF) {
383		ipoa.ipoa_boundif = inp->inp_boundifp->if_index;
384		ipoa.ipoa_flags |= IPOAF_BOUND_IF;
385	}
386	if (INP_NO_CELLULAR(inp))
387		ipoa.ipoa_flags |=  IPOAF_NO_CELLULAR;
388	if (INP_NO_EXPENSIVE(inp))
389		ipoa.ipoa_flags |=  IPOAF_NO_EXPENSIVE;
390	if (INP_AWDL_UNRESTRICTED(inp))
391		ipoa.ipoa_flags |=  IPOAF_AWDL_UNRESTRICTED;
392
393	if (inp->inp_flowhash == 0)
394		inp->inp_flowhash = inp_calc_flowhash(inp);
395
396	/*
397	 * If the user handed us a complete IP packet, use it.
398	 * Otherwise, allocate an mbuf for a header and fill it in.
399	 */
400	if ((inp->inp_flags & INP_HDRINCL) == 0) {
401		if (m->m_pkthdr.len + sizeof(struct ip) > IP_MAXPACKET) {
402			m_freem(m);
403			return(EMSGSIZE);
404		}
405		M_PREPEND(m, sizeof(struct ip), M_WAIT);
406		if (m == NULL)
407			return ENOBUFS;
408		ip = mtod(m, struct ip *);
409		ip->ip_tos = inp->inp_ip_tos;
410		ip->ip_off = 0;
411		ip->ip_p = inp->inp_ip_p;
412		ip->ip_len = m->m_pkthdr.len;
413		ip->ip_src = inp->inp_laddr;
414		ip->ip_dst.s_addr = dst;
415		ip->ip_ttl = inp->inp_ip_ttl;
416	} else {
417		if (m->m_pkthdr.len > IP_MAXPACKET) {
418			m_freem(m);
419			return(EMSGSIZE);
420		}
421		ip = mtod(m, struct ip *);
422		/* don't allow both user specified and setsockopt options,
423		   and don't allow packet length sizes that will crash */
424		if (((IP_VHL_HL(ip->ip_vhl) != (sizeof (*ip) >> 2))
425		     && inp->inp_options)
426		    || (ip->ip_len > m->m_pkthdr.len)
427		    || (ip->ip_len < (IP_VHL_HL(ip->ip_vhl) << 2))) {
428			m_freem(m);
429			return EINVAL;
430		}
431		if (ip->ip_id == 0)
432			ip->ip_id = ip_randomid();
433		/* XXX prevent ip_output from overwriting header fields */
434		flags |= IP_RAWOUTPUT;
435		OSAddAtomic(1, &ipstat.ips_rawout);
436	}
437
438	if (inp->inp_laddr.s_addr != INADDR_ANY)
439		ipoa.ipoa_flags |= IPOAF_BOUND_SRCADDR;
440
441#if NECP
442	{
443		necp_kernel_policy_id policy_id;
444		if (!necp_socket_is_allowed_to_send_recv_v4(inp, 0, 0, &ip->ip_src, &ip->ip_dst, NULL, &policy_id)) {
445			m_freem(m);
446			return(EHOSTUNREACH);
447		}
448
449		necp_mark_packet_from_socket(m, inp, policy_id);
450	}
451#endif /* NECP */
452
453#if IPSEC
454	if (inp->inp_sp != NULL && ipsec_setsocket(m, so) != 0) {
455		m_freem(m);
456		return ENOBUFS;
457	}
458#endif /*IPSEC*/
459
460	if (ROUTE_UNUSABLE(&inp->inp_route))
461		ROUTE_RELEASE(&inp->inp_route);
462
463	set_packet_service_class(m, so, msc, 0);
464	m->m_pkthdr.pkt_flowsrc = FLOWSRC_INPCB;
465	m->m_pkthdr.pkt_flowid = inp->inp_flowhash;
466	m->m_pkthdr.pkt_flags |= (PKTF_FLOW_ID | PKTF_FLOW_LOCALSRC |
467	    PKTF_FLOW_RAWSOCK);
468	m->m_pkthdr.pkt_proto = inp->inp_ip_p;
469
470#if CONFIG_MACF_NET
471	mac_mbuf_label_associate_inpcb(inp, m);
472#endif
473
474	imo = inp->inp_moptions;
475	if (imo != NULL)
476		IMO_ADDREF(imo);
477	/*
478	 * The domain lock is held across ip_output, so it is okay
479	 * to pass the PCB cached route pointer directly to IP and
480	 * the modules beneath it.
481	 */
482	error = ip_output(m, inp->inp_options, &inp->inp_route, flags,
483	    imo, &ipoa);
484
485	if (imo != NULL)
486		IMO_REMREF(imo);
487
488	if (inp->inp_route.ro_rt != NULL) {
489		struct rtentry *rt = inp->inp_route.ro_rt;
490		struct ifnet *outif;
491
492		if ((rt->rt_flags & (RTF_MULTICAST|RTF_BROADCAST)) ||
493		    inp->inp_socket == NULL ||
494		    !(inp->inp_socket->so_state & SS_ISCONNECTED)) {
495			rt = NULL;	/* unusable */
496		}
497		/*
498		 * Always discard the cached route for unconnected
499		 * socket or if it is a multicast route.
500		 */
501		if (rt == NULL)
502			ROUTE_RELEASE(&inp->inp_route);
503
504		/*
505		 * If this is a connected socket and the destination
506		 * route is unicast, update outif with that of the
507		 * route interface used by IP.
508		 */
509		if (rt != NULL && (outif = rt->rt_ifp) != inp->inp_last_outifp)
510			inp->inp_last_outifp = outif;
511	} else {
512		ROUTE_RELEASE(&inp->inp_route);
513	}
514
515	/*
516	 * If output interface was cellular/expensive, and this socket is
517	 * denied access to it, generate an event.
518	 */
519	if (error != 0 && (ipoa.ipoa_retflags & IPOARF_IFDENIED) &&
520	    (INP_NO_CELLULAR(inp) || INP_NO_EXPENSIVE(inp)))
521		soevent(so, (SO_FILT_HINT_LOCKED|SO_FILT_HINT_IFDENIED));
522
523	return (error);
524}
525
526#if IPFIREWALL
527int
528load_ipfw(void)
529{
530	kern_return_t	err;
531
532	ipfw_init();
533
534#if DUMMYNET
535	if (!DUMMYNET_LOADED)
536		ip_dn_init();
537#endif /* DUMMYNET */
538	err = 0;
539
540	return err == 0 && ip_fw_ctl_ptr == NULL ? -1 : err;
541}
542#endif /* IPFIREWALL */
543
544/*
545 * Raw IP socket option processing.
546 */
547int
548rip_ctloutput(so, sopt)
549	struct socket *so;
550	struct sockopt *sopt;
551{
552	struct	inpcb *inp = sotoinpcb(so);
553	int	error, optval;
554
555	/* Allow <SOL_SOCKET,SO_FLUSH> at this level */
556	if (sopt->sopt_level != IPPROTO_IP &&
557	    !(sopt->sopt_level == SOL_SOCKET && sopt->sopt_name == SO_FLUSH))
558		return (EINVAL);
559
560	error = 0;
561
562	switch (sopt->sopt_dir) {
563	case SOPT_GET:
564		switch (sopt->sopt_name) {
565		case IP_HDRINCL:
566			optval = inp->inp_flags & INP_HDRINCL;
567			error = sooptcopyout(sopt, &optval, sizeof optval);
568			break;
569
570		case IP_STRIPHDR:
571			optval = inp->inp_flags & INP_STRIPHDR;
572			error = sooptcopyout(sopt, &optval, sizeof optval);
573			break;
574
575#if IPFIREWALL
576		case IP_FW_ADD:
577		case IP_FW_GET:
578		case IP_OLD_FW_ADD:
579		case IP_OLD_FW_GET:
580			if (ip_fw_ctl_ptr == 0)
581				error = load_ipfw();
582			if (ip_fw_ctl_ptr && error == 0)
583				error = ip_fw_ctl_ptr(sopt);
584			else
585				error = ENOPROTOOPT;
586			break;
587#endif /* IPFIREWALL */
588
589#if DUMMYNET
590		case IP_DUMMYNET_GET:
591			if (!DUMMYNET_LOADED)
592				ip_dn_init();
593			if (DUMMYNET_LOADED)
594				error = ip_dn_ctl_ptr(sopt);
595			else
596				error = ENOPROTOOPT;
597			break ;
598#endif /* DUMMYNET */
599
600		default:
601			error = ip_ctloutput(so, sopt);
602			break;
603		}
604		break;
605
606	case SOPT_SET:
607		switch (sopt->sopt_name) {
608		case IP_HDRINCL:
609			error = sooptcopyin(sopt, &optval, sizeof optval,
610					    sizeof optval);
611			if (error)
612				break;
613			if (optval)
614				inp->inp_flags |= INP_HDRINCL;
615			else
616				inp->inp_flags &= ~INP_HDRINCL;
617			break;
618
619		case IP_STRIPHDR:
620			error = sooptcopyin(sopt, &optval, sizeof optval,
621			    sizeof optval);
622			if (error)
623				break;
624			if (optval)
625				inp->inp_flags |= INP_STRIPHDR;
626			else
627				inp->inp_flags &= ~INP_STRIPHDR;
628			break;
629
630#if IPFIREWALL
631		case IP_FW_ADD:
632		case IP_FW_DEL:
633		case IP_FW_FLUSH:
634		case IP_FW_ZERO:
635		case IP_FW_RESETLOG:
636		case IP_OLD_FW_ADD:
637		case IP_OLD_FW_DEL:
638		case IP_OLD_FW_FLUSH:
639		case IP_OLD_FW_ZERO:
640		case IP_OLD_FW_RESETLOG:
641			if (ip_fw_ctl_ptr == 0)
642				error = load_ipfw();
643			if (ip_fw_ctl_ptr && error == 0)
644				error = ip_fw_ctl_ptr(sopt);
645			else
646				error = ENOPROTOOPT;
647			break;
648#endif /* IPFIREWALL */
649
650#if DUMMYNET
651		case IP_DUMMYNET_CONFIGURE:
652		case IP_DUMMYNET_DEL:
653		case IP_DUMMYNET_FLUSH:
654			if (!DUMMYNET_LOADED)
655				ip_dn_init();
656			if (DUMMYNET_LOADED)
657				error = ip_dn_ctl_ptr(sopt);
658			else
659				error = ENOPROTOOPT ;
660			break ;
661#endif
662
663		case SO_FLUSH:
664			if ((error = sooptcopyin(sopt, &optval, sizeof (optval),
665			    sizeof (optval))) != 0)
666				break;
667
668			error = inp_flush(inp, optval);
669			break;
670
671		default:
672			error = ip_ctloutput(so, sopt);
673			break;
674		}
675		break;
676	}
677
678	return (error);
679}
680
681/*
682 * This function exists solely to receive the PRC_IFDOWN messages which
683 * are sent by if_down().  It looks for an ifaddr whose ifa_addr is sa,
684 * and calls in_ifadown() to remove all routes corresponding to that address.
685 * It also receives the PRC_IFUP messages from if_up() and reinstalls the
686 * interface routes.
687 */
688void
689rip_ctlinput(
690	int cmd,
691	struct sockaddr *sa,
692	__unused void *vip)
693{
694	struct in_ifaddr *ia;
695	struct ifnet *ifp;
696	int err;
697	int flags, done = 0;
698
699	switch (cmd) {
700	case PRC_IFDOWN:
701		lck_rw_lock_shared(in_ifaddr_rwlock);
702		for (ia = in_ifaddrhead.tqh_first; ia;
703		     ia = ia->ia_link.tqe_next) {
704			IFA_LOCK(&ia->ia_ifa);
705			if (ia->ia_ifa.ifa_addr == sa &&
706			    (ia->ia_flags & IFA_ROUTE)) {
707				done = 1;
708				IFA_ADDREF_LOCKED(&ia->ia_ifa);
709				IFA_UNLOCK(&ia->ia_ifa);
710				lck_rw_done(in_ifaddr_rwlock);
711				lck_mtx_lock(rnh_lock);
712				/*
713				 * in_ifscrub kills the interface route.
714				 */
715				in_ifscrub(ia->ia_ifp, ia, 1);
716				/*
717				 * in_ifadown gets rid of all the rest of
718				 * the routes.  This is not quite the right
719				 * thing to do, but at least if we are running
720				 * a routing process they will come back.
721				 */
722				in_ifadown(&ia->ia_ifa, 1);
723				lck_mtx_unlock(rnh_lock);
724				IFA_REMREF(&ia->ia_ifa);
725				break;
726			}
727			IFA_UNLOCK(&ia->ia_ifa);
728		}
729		if (!done)
730			lck_rw_done(in_ifaddr_rwlock);
731		break;
732
733	case PRC_IFUP:
734		lck_rw_lock_shared(in_ifaddr_rwlock);
735		for (ia = in_ifaddrhead.tqh_first; ia;
736		     ia = ia->ia_link.tqe_next) {
737			IFA_LOCK(&ia->ia_ifa);
738			if (ia->ia_ifa.ifa_addr == sa) {
739				/* keep it locked */
740				break;
741			}
742			IFA_UNLOCK(&ia->ia_ifa);
743		}
744		if (ia == NULL || (ia->ia_flags & IFA_ROUTE) ||
745		    (ia->ia_ifa.ifa_debug & IFD_NOTREADY)) {
746			if (ia != NULL)
747				IFA_UNLOCK(&ia->ia_ifa);
748			lck_rw_done(in_ifaddr_rwlock);
749			return;
750		}
751		IFA_ADDREF_LOCKED(&ia->ia_ifa);
752		IFA_UNLOCK(&ia->ia_ifa);
753		lck_rw_done(in_ifaddr_rwlock);
754
755		flags = RTF_UP;
756		ifp = ia->ia_ifa.ifa_ifp;
757
758		if ((ifp->if_flags & IFF_LOOPBACK)
759		    || (ifp->if_flags & IFF_POINTOPOINT))
760			flags |= RTF_HOST;
761
762		err = rtinit(&ia->ia_ifa, RTM_ADD, flags);
763		if (err == 0) {
764			IFA_LOCK_SPIN(&ia->ia_ifa);
765			ia->ia_flags |= IFA_ROUTE;
766			IFA_UNLOCK(&ia->ia_ifa);
767		}
768		IFA_REMREF(&ia->ia_ifa);
769		break;
770	}
771}
772
/*
 * Send and receive buffer sizes that rip_attach() passes to soreserve()
 * for each new raw socket.  Both are writable through the sysctl
 * entries declared below.
 */
u_int32_t	rip_sendspace = RIPSNDQ;
u_int32_t	rip_recvspace = RIPRCVQ;

/* sysctl entries under the _net_inet_raw node */
SYSCTL_INT(_net_inet_raw, OID_AUTO, maxdgram, CTLFLAG_RW | CTLFLAG_LOCKED,
    &rip_sendspace, 0, "Maximum outgoing raw IP datagram size");
SYSCTL_INT(_net_inet_raw, OID_AUTO, recvspace, CTLFLAG_RW | CTLFLAG_LOCKED,
    &rip_recvspace, 0, "Maximum incoming raw IP datagram size");
/* read-only view of ripcbinfo.ipi_count */
SYSCTL_UINT(_net_inet_raw, OID_AUTO, pcbcount, CTLFLAG_RD | CTLFLAG_LOCKED,
    &ripcbinfo.ipi_count, 0, "Number of active PCBs");
782
783static int
784rip_attach(struct socket *so, int proto, struct proc *p)
785{
786	struct inpcb *inp;
787	int error;
788
789	inp = sotoinpcb(so);
790	if (inp)
791		panic("rip_attach");
792	if ((so->so_state & SS_PRIV) == 0)
793		return (EPERM);
794
795	error = soreserve(so, rip_sendspace, rip_recvspace);
796	if (error)
797		return error;
798	error = in_pcballoc(so, &ripcbinfo, p);
799	if (error)
800		return error;
801	inp = (struct inpcb *)so->so_pcb;
802	inp->inp_vflag |= INP_IPV4;
803	inp->inp_ip_p = proto;
804	inp->inp_ip_ttl = ip_defttl;
805	return 0;
806}
807
808__private_extern__ int
809rip_detach(struct socket *so)
810{
811	struct inpcb *inp;
812
813	inp = sotoinpcb(so);
814	if (inp == 0)
815		panic("rip_detach");
816	in_pcbdetach(inp);
817	return 0;
818}
819
820__private_extern__ int
821rip_abort(struct socket *so)
822{
823	soisdisconnected(so);
824	return rip_detach(so);
825}
826
827__private_extern__ int
828rip_disconnect(struct socket *so)
829{
830	if ((so->so_state & SS_ISCONNECTED) == 0)
831		return ENOTCONN;
832	return rip_abort(so);
833}
834
835__private_extern__ int
836rip_bind(struct socket *so, struct sockaddr *nam, struct proc *p)
837{
838#pragma unused(p)
839	struct inpcb *inp = sotoinpcb(so);
840	struct sockaddr_in sin;
841	struct ifaddr *ifa = NULL;
842	struct ifnet *outif = NULL;
843
844	if (inp == NULL
845#if NECP
846		|| (necp_socket_should_use_flow_divert(inp))
847#endif /* NECP */
848		)
849		return (inp == NULL ? EINVAL : EPROTOTYPE);
850
851	if (nam->sa_len != sizeof (struct sockaddr_in))
852		return (EINVAL);
853
854	/* Sanitized local copy for interface address searches */
855	bzero(&sin, sizeof (sin));
856	sin.sin_family = AF_INET;
857	sin.sin_len = sizeof (struct sockaddr_in);
858	sin.sin_addr.s_addr = SIN(nam)->sin_addr.s_addr;
859
860	if (TAILQ_EMPTY(&ifnet_head) ||
861	    (sin.sin_family != AF_INET && sin.sin_family != AF_IMPLINK) ||
862	    (sin.sin_addr.s_addr && (ifa = ifa_ifwithaddr(SA(&sin))) == 0)) {
863		return (EADDRNOTAVAIL);
864	} else if (ifa) {
865		/*
866		 * Opportunistically determine the outbound
867		 * interface that may be used; this may not
868		 * hold true if we end up using a route
869		 * going over a different interface, e.g.
870		 * when sending to a local address.  This
871		 * will get updated again after sending.
872		 */
873		IFA_LOCK(ifa);
874		outif = ifa->ifa_ifp;
875		IFA_UNLOCK(ifa);
876		IFA_REMREF(ifa);
877	}
878	inp->inp_laddr = sin.sin_addr;
879	inp->inp_last_outifp = outif;
880	return (0);
881}
882
883__private_extern__ int
884rip_connect(struct socket *so, struct sockaddr *nam, __unused  struct proc *p)
885{
886	struct inpcb *inp = sotoinpcb(so);
887	struct sockaddr_in *addr = (struct sockaddr_in *)(void *)nam;
888
889	if (inp == NULL
890#if NECP
891		|| (necp_socket_should_use_flow_divert(inp))
892#endif /* NECP */
893		)
894		return (inp == NULL ? EINVAL : EPROTOTYPE);
895	if (nam->sa_len != sizeof(*addr))
896		return EINVAL;
897	if (TAILQ_EMPTY(&ifnet_head))
898		return EADDRNOTAVAIL;
899	if ((addr->sin_family != AF_INET) &&
900	    (addr->sin_family != AF_IMPLINK))
901		return EAFNOSUPPORT;
902	inp->inp_faddr = addr->sin_addr;
903	soisconnected(so);
904
905	return 0;
906}
907
908__private_extern__ int
909rip_shutdown(struct socket *so)
910{
911	socantsendmore(so);
912	return 0;
913}
914
915__private_extern__ int
916rip_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
917    struct mbuf *control, struct proc *p)
918{
919#pragma unused(flags, p)
920	struct inpcb *inp = sotoinpcb(so);
921	u_int32_t dst;
922	int error = 0;
923
924	if (inp == NULL
925#if NECP
926		|| (necp_socket_should_use_flow_divert(inp) && (error = EPROTOTYPE))
927#endif /* NECP */
928		) {
929		if (inp == NULL)
930			error = EINVAL;
931		else
932			error = EPROTOTYPE;
933		goto bad;
934	}
935
936	if (so->so_state & SS_ISCONNECTED) {
937		if (nam != NULL) {
938			error = EISCONN;
939			goto bad;
940		}
941		dst = inp->inp_faddr.s_addr;
942	} else {
943		if (nam == NULL) {
944			error = ENOTCONN;
945			goto bad;
946		}
947		dst = ((struct sockaddr_in *)(void *)nam)->sin_addr.s_addr;
948	}
949	return (rip_output(m, so, dst, control));
950
951bad:
952	VERIFY(error != 0);
953
954	if (m != NULL)
955		m_freem(m);
956	if (control != NULL)
957		m_freem(control);
958
959	return (error);
960}
961
962/* note: rip_unlock is called from different protos  instead of the generic socket_unlock,
963 * it will handle the socket dealloc on last reference
964 * */
965int
966rip_unlock(struct socket *so, int refcount, void *debug)
967{
968	void *lr_saved;
969	struct inpcb *inp = sotoinpcb(so);
970
971	if (debug == NULL)
972		lr_saved = __builtin_return_address(0);
973	else
974		lr_saved = debug;
975
976	if (refcount) {
977		if (so->so_usecount <= 0) {
978			panic("rip_unlock: bad refoucnt so=%p val=%x lrh= %s\n",
979			    so, so->so_usecount, solockhistory_nr(so));
980			/* NOTREACHED */
981		}
982		so->so_usecount--;
983		if (so->so_usecount == 0 && (inp->inp_wantcnt == WNT_STOPUSING)) {
984			/* cleanup after last reference */
985			lck_mtx_unlock(so->so_proto->pr_domain->dom_mtx);
986			lck_rw_lock_exclusive(ripcbinfo.ipi_lock);
987			if (inp->inp_state != INPCB_STATE_DEAD) {
988#if INET6
989				if (SOCK_CHECK_DOM(so, PF_INET6))
990					in6_pcbdetach(inp);
991				else
992#endif /* INET6 */
993				in_pcbdetach(inp);
994			}
995			in_pcbdispose(inp);
996			lck_rw_done(ripcbinfo.ipi_lock);
997			return(0);
998		}
999	}
1000	so->unlock_lr[so->next_unlock_lr] = lr_saved;
1001	so->next_unlock_lr = (so->next_unlock_lr+1) % SO_LCKDBG_MAX;
1002	lck_mtx_unlock(so->so_proto->pr_domain->dom_mtx);
1003	return(0);
1004}
1005
1006static int
1007rip_pcblist SYSCTL_HANDLER_ARGS
1008{
1009#pragma unused(oidp, arg1, arg2)
1010	int error, i, n;
1011	struct inpcb *inp, **inp_list;
1012	inp_gen_t gencnt;
1013	struct xinpgen xig;
1014
1015	/*
1016	 * The process of preparing the TCB list is too time-consuming and
1017	 * resource-intensive to repeat twice on every request.
1018	 */
1019	lck_rw_lock_exclusive(ripcbinfo.ipi_lock);
1020	if (req->oldptr == USER_ADDR_NULL) {
1021		n = ripcbinfo.ipi_count;
1022		req->oldidx = 2 * (sizeof xig)
1023			+ (n + n/8) * sizeof(struct xinpcb);
1024		lck_rw_done(ripcbinfo.ipi_lock);
1025		return 0;
1026	}
1027
1028	if (req->newptr != USER_ADDR_NULL) {
1029		lck_rw_done(ripcbinfo.ipi_lock);
1030		return EPERM;
1031	}
1032
1033	/*
1034	 * OK, now we're committed to doing something.
1035	 */
1036	gencnt = ripcbinfo.ipi_gencnt;
1037	n = ripcbinfo.ipi_count;
1038
1039	bzero(&xig, sizeof(xig));
1040	xig.xig_len = sizeof xig;
1041	xig.xig_count = n;
1042	xig.xig_gen = gencnt;
1043	xig.xig_sogen = so_gencnt;
1044	error = SYSCTL_OUT(req, &xig, sizeof xig);
1045	if (error) {
1046		lck_rw_done(ripcbinfo.ipi_lock);
1047		return error;
1048	}
1049    /*
1050     * We are done if there is no pcb
1051     */
1052    if (n == 0) {
1053	lck_rw_done(ripcbinfo.ipi_lock);
1054        return 0;
1055    }
1056
1057	inp_list = _MALLOC(n * sizeof *inp_list, M_TEMP, M_WAITOK);
1058	if (inp_list == 0) {
1059		lck_rw_done(ripcbinfo.ipi_lock);
1060		return ENOMEM;
1061	}
1062
1063	for (inp = ripcbinfo.ipi_listhead->lh_first, i = 0; inp && i < n;
1064	     inp = inp->inp_list.le_next) {
1065		if (inp->inp_gencnt <= gencnt && inp->inp_state != INPCB_STATE_DEAD)
1066			inp_list[i++] = inp;
1067	}
1068	n = i;
1069
1070	error = 0;
1071	for (i = 0; i < n; i++) {
1072		inp = inp_list[i];
1073		if (inp->inp_gencnt <= gencnt && inp->inp_state != INPCB_STATE_DEAD) {
1074			struct xinpcb xi;
1075
1076			bzero(&xi, sizeof(xi));
1077			xi.xi_len = sizeof xi;
1078			/* XXX should avoid extra copy */
1079			inpcb_to_compat(inp, &xi.xi_inp);
1080			if (inp->inp_socket)
1081				sotoxsocket(inp->inp_socket, &xi.xi_socket);
1082			error = SYSCTL_OUT(req, &xi, sizeof xi);
1083		}
1084	}
1085	if (!error) {
1086		/*
1087		 * Give the user an updated idea of our state.
1088		 * If the generation differs from what we told
1089		 * her before, she knows that something happened
1090		 * while we were processing this request, and it
1091		 * might be necessary to retry.
1092		 */
1093		bzero(&xig, sizeof(xig));
1094		xig.xig_len = sizeof xig;
1095		xig.xig_gen = ripcbinfo.ipi_gencnt;
1096		xig.xig_sogen = so_gencnt;
1097		xig.xig_count = ripcbinfo.ipi_count;
1098		error = SYSCTL_OUT(req, &xig, sizeof xig);
1099	}
1100	FREE(inp_list, M_TEMP);
1101	lck_rw_done(ripcbinfo.ipi_lock);
1102	return error;
1103}
1104
1105SYSCTL_PROC(_net_inet_raw, OID_AUTO/*XXX*/, pcblist,
1106	    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0,
1107	    rip_pcblist, "S,xinpcb", "List of active raw IP sockets");
1108
1109
1110static int
1111rip_pcblist64 SYSCTL_HANDLER_ARGS
1112{
1113#pragma unused(oidp, arg1, arg2)
1114        int error, i, n;
1115        struct inpcb *inp, **inp_list;
1116        inp_gen_t gencnt;
1117        struct xinpgen xig;
1118
1119        /*
1120         * The process of preparing the TCB list is too time-consuming and
1121         * resource-intensive to repeat twice on every request.
1122         */
1123        lck_rw_lock_exclusive(ripcbinfo.ipi_lock);
1124        if (req->oldptr == USER_ADDR_NULL) {
1125                n = ripcbinfo.ipi_count;
1126                req->oldidx = 2 * (sizeof xig)
1127                        + (n + n/8) * sizeof(struct xinpcb64);
1128                lck_rw_done(ripcbinfo.ipi_lock);
1129                return 0;
1130        }
1131
1132        if (req->newptr != USER_ADDR_NULL) {
1133                lck_rw_done(ripcbinfo.ipi_lock);
1134                return EPERM;
1135        }
1136
1137        /*
1138         * OK, now we're committed to doing something.
1139         */
1140        gencnt = ripcbinfo.ipi_gencnt;
1141        n = ripcbinfo.ipi_count;
1142
1143        bzero(&xig, sizeof(xig));
1144        xig.xig_len = sizeof xig;
1145        xig.xig_count = n;
1146        xig.xig_gen = gencnt;
1147        xig.xig_sogen = so_gencnt;
1148        error = SYSCTL_OUT(req, &xig, sizeof xig);
1149        if (error) {
1150                lck_rw_done(ripcbinfo.ipi_lock);
1151                return error;
1152        }
1153    /*
1154     * We are done if there is no pcb
1155     */
1156    if (n == 0) {
1157        lck_rw_done(ripcbinfo.ipi_lock);
1158        return 0;
1159    }
1160
1161        inp_list = _MALLOC(n * sizeof *inp_list, M_TEMP, M_WAITOK);
1162        if (inp_list == 0) {
1163                lck_rw_done(ripcbinfo.ipi_lock);
1164                return ENOMEM;
1165        }
1166
1167        for (inp = ripcbinfo.ipi_listhead->lh_first, i = 0; inp && i < n;
1168             inp = inp->inp_list.le_next) {
1169                if (inp->inp_gencnt <= gencnt && inp->inp_state != INPCB_STATE_DEAD)
1170                        inp_list[i++] = inp;
1171        }
1172        n = i;
1173
1174        error = 0;
1175        for (i = 0; i < n; i++) {
1176                inp = inp_list[i];
1177                if (inp->inp_gencnt <= gencnt && inp->inp_state != INPCB_STATE_DEAD) {
1178                        struct xinpcb64 xi;
1179
1180                        bzero(&xi, sizeof(xi));
1181                        xi.xi_len = sizeof xi;
1182                        inpcb_to_xinpcb64(inp, &xi);
1183                        if (inp->inp_socket)
1184                                sotoxsocket64(inp->inp_socket, &xi.xi_socket);
1185                        error = SYSCTL_OUT(req, &xi, sizeof xi);
1186                }
1187        }
1188        if (!error) {
1189                /*
1190                 * Give the user an updated idea of our state.
1191                 * If the generation differs from what we told
1192                 * her before, she knows that something happened
1193                 * while we were processing this request, and it
1194                 * might be necessary to retry.
1195                 */
1196                bzero(&xig, sizeof(xig));
1197                xig.xig_len = sizeof xig;
1198                xig.xig_gen = ripcbinfo.ipi_gencnt;
1199                xig.xig_sogen = so_gencnt;
1200                xig.xig_count = ripcbinfo.ipi_count;
1201                error = SYSCTL_OUT(req, &xig, sizeof xig);
1202        }
1203        FREE(inp_list, M_TEMP);
1204        lck_rw_done(ripcbinfo.ipi_lock);
1205        return error;
1206}
1207
1208SYSCTL_PROC(_net_inet_raw, OID_AUTO, pcblist64,
1209            CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0,
1210            rip_pcblist64, "S,xinpcb64", "List of active raw IP sockets");
1211
1212
1213
1214static int
1215rip_pcblist_n SYSCTL_HANDLER_ARGS
1216{
1217#pragma unused(oidp, arg1, arg2)
1218	int error = 0;
1219
1220	error = get_pcblist_n(IPPROTO_IP, req, &ripcbinfo);
1221
1222	return error;
1223}
1224
1225SYSCTL_PROC(_net_inet_raw, OID_AUTO, pcblist_n,
1226            CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0,
1227            rip_pcblist_n, "S,xinpcb_n", "List of active raw IP sockets");
1228
1229struct pr_usrreqs rip_usrreqs = {
1230	.pru_abort =		rip_abort,
1231	.pru_attach =		rip_attach,
1232	.pru_bind =		rip_bind,
1233	.pru_connect =		rip_connect,
1234	.pru_control =		in_control,
1235	.pru_detach =		rip_detach,
1236	.pru_disconnect =	rip_disconnect,
1237	.pru_peeraddr =		in_getpeeraddr,
1238	.pru_send =		rip_send,
1239	.pru_shutdown =		rip_shutdown,
1240	.pru_sockaddr =		in_getsockaddr,
1241	.pru_sosend =		sosend,
1242	.pru_soreceive =	soreceive,
1243};
1244/* DSEP Review Done pl-20051213-v02 @3253 */
1245