1139823Simp/*-
217072Sjulian * Copyright (c) 1982, 1986, 1988, 1993
317072Sjulian *	The Regents of the University of California.  All rights reserved.
417072Sjulian *
517072Sjulian * Redistribution and use in source and binary forms, with or without
617072Sjulian * modification, are permitted provided that the following conditions
717072Sjulian * are met:
817072Sjulian * 1. Redistributions of source code must retain the above copyright
917072Sjulian *    notice, this list of conditions and the following disclaimer.
1017072Sjulian * 2. Redistributions in binary form must reproduce the above copyright
1117072Sjulian *    notice, this list of conditions and the following disclaimer in the
1217072Sjulian *    documentation and/or other materials provided with the distribution.
1317072Sjulian * 4. Neither the name of the University nor the names of its contributors
1417072Sjulian *    may be used to endorse or promote products derived from this software
1517072Sjulian *    without specific prior written permission.
1617072Sjulian *
1717072Sjulian * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
1817072Sjulian * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
1917072Sjulian * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
2017072Sjulian * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
2117072Sjulian * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
2217072Sjulian * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
2317072Sjulian * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
2417072Sjulian * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
2517072Sjulian * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
2617072Sjulian * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
2717072Sjulian * SUCH DAMAGE.
2817072Sjulian */
2917072Sjulian
30172467Ssilby#include <sys/cdefs.h>
31172467Ssilby__FBSDID("$FreeBSD: releng/10.3/sys/netinet/ip_divert.c 241913 2012-10-22 21:09:03Z glebius $");
32172467Ssilby
3332350Seivind#include "opt_inet.h"
34230443Sbz#include "opt_inet6.h"
35188066Srrs#include "opt_sctp.h"
3632350Seivind#ifndef INET
37230452Sbz#error "IPDIVERT requires INET"
3832350Seivind#endif
3932350Seivind
4017072Sjulian#include <sys/param.h>
4164192Sru#include <sys/kernel.h>
4295759Stanimura#include <sys/lock.h>
4317072Sjulian#include <sys/malloc.h>
4417072Sjulian#include <sys/mbuf.h>
45136714Sandre#include <sys/module.h>
46136714Sandre#include <sys/kernel.h>
47164033Srwatson#include <sys/priv.h>
4886183Srwatson#include <sys/proc.h>
4995759Stanimura#include <sys/protosw.h>
5017072Sjulian#include <sys/socket.h>
5117072Sjulian#include <sys/socketvar.h>
5264192Sru#include <sys/sysctl.h>
53201735Sluigi#include <net/vnet.h>
5434923Sbde
5517072Sjulian#include <net/if.h>
56171746Scsjp#include <net/netisr.h>
5717072Sjulian
5817072Sjulian#include <netinet/in.h>
5995759Stanimura#include <netinet/in_pcb.h>
6017072Sjulian#include <netinet/in_systm.h>
6195759Stanimura#include <netinet/in_var.h>
6217072Sjulian#include <netinet/ip.h>
6317072Sjulian#include <netinet/ip_var.h>
64223593Sglebius#ifdef INET6
65223593Sglebius#include <netinet/ip6.h>
66223593Sglebius#include <netinet6/ip6_var.h>
67223593Sglebius#endif
68188066Srrs#ifdef SCTP
69188066Srrs#include <netinet/sctp_crc32.h>
70188066Srrs#endif
7117072Sjulian
72163606Srwatson#include <security/mac/mac_framework.h>
73163606Srwatson
7417072Sjulian/*
7517072Sjulian * Divert sockets
7617072Sjulian */
7717072Sjulian
/*
 * Default socket buffer reservations for divert sockets: 64 KiB, enough
 * to hold a full IP packet, plus 100 bytes.
 */
#define	DIVSNDQ		(64 * 1024 + 100)
#define	DIVRCVQ		(64 * 1024 + 100)
8317072Sjulian
8417072Sjulian/*
85201735Sluigi * Divert sockets work in conjunction with ipfw or other packet filters,
86201735Sluigi * see the divert(4) manpage for features.
87201735Sluigi * Packets are selected by the packet filter and tagged with an
88201735Sluigi * MTAG_IPFW_RULE tag carrying the 'divert port' number (as set by
89201735Sluigi * the packet filter) and information on the matching filter rule for
90201735Sluigi * subsequent reinjection. The divert_port is used to put the packet
91201735Sluigi * on the corresponding divert socket, while the rule number is passed
92201735Sluigi * up (at least partially) as the sin_port in the struct sockaddr.
9336369Sjulian *
94201735Sluigi * Packets written to the divert socket carry in sin_addr a
95201735Sluigi * destination address, and in sin_port the number of the filter rule
96201735Sluigi * after which to continue processing.
 * If the destination address is INADDR_ANY, the packet is treated as
 * outgoing and sent to ip_output(); otherwise it is treated as
99201735Sluigi * incoming and sent to ip_input().
100201735Sluigi * Further, sin_zero carries some information on the interface,
101201735Sluigi * which can be used in the reinject -- see comments in the code.
10254175Sarchie *
10398613Sluigi * On reinjection, processing in ip_input() and ip_output()
10498613Sluigi * will be exactly the same as for the original packet, except that
105201735Sluigi * packet filter processing will start at the rule number after the one
106201735Sluigi * written in the sin_port (ipfw does not allow a rule #0, so sin_port=0
107201735Sluigi * will apply the entire ruleset to the packet).
10817072Sjulian */
10917072Sjulian
110136714Sandre/* Internal variables. */
111215701Sdimstatic VNET_DEFINE(struct inpcbhead, divcb);
112215701Sdimstatic VNET_DEFINE(struct inpcbinfo, divcbinfo);
11317072Sjulian
114195727Srwatson#define	V_divcb				VNET(divcb)
115195727Srwatson#define	V_divcbinfo			VNET(divcbinfo)
116195699Srwatson
11717072Sjulianstatic u_long	div_sendspace = DIVSNDQ;	/* XXX sysctl ? */
11817072Sjulianstatic u_long	div_recvspace = DIVRCVQ;	/* XXX sysctl ? */
11917072Sjulian
120196502Szecstatic eventhandler_tag ip_divert_event_tag;
121196502Szec
12217072Sjulian/*
12317072Sjulian * Initialize divert connection block queue.
12417072Sjulian */
125157927Spsstatic void
126157927Spsdiv_zone_change(void *tag)
127157927Sps{
128157927Sps
129181803Sbz	uma_zone_set_max(V_divcbinfo.ipi_zone, maxsockets);
130157927Sps}
131157927Sps
/*
 * Item init callback handed to in_pcbinfo_init() by div_init():
 * initialize the lock embedded in the inpcb at mem.  The size and flags
 * arguments are not used.  Always returns 0.
 */
static int
div_inpcb_init(void *mem, int size, int flags)
{
	struct inpcb *pcb;

	pcb = mem;
	INP_LOCK_INIT(pcb, "inp", "divinp");

	return (0);
}
140160491Sups
/*
 * Item fini callback handed to in_pcbinfo_init() by div_init():
 * destroy the lock embedded in the inpcb at mem.  The size argument is
 * not used.
 */
static void
div_inpcb_fini(void *mem, int size)
{
	struct inpcb *pcb;

	pcb = mem;
	INP_LOCK_DESTROY(pcb);
}
148160491Sups
149196502Szecstatic void
15017072Sjuliandiv_init(void)
15117072Sjulian{
152169454Srwatson
15317072Sjulian	/*
154205157Srwatson	 * XXX We don't use the hash list for divert IP, but it's easier to
155205157Srwatson	 * allocate one-entry hash lists than it is to check all over the
156205157Srwatson	 * place for hashbase == NULL.
15717072Sjulian	 */
158205157Srwatson	in_pcbinfo_init(&V_divcbinfo, "div", &V_divcb, 1, 1, "divcb",
159222748Srwatson	    div_inpcb_init, div_inpcb_fini, UMA_ZONE_NOFREE,
160222748Srwatson	    IPI_HASHFIELDS_NONE);
16117072Sjulian}
16217072Sjulian
163196502Szecstatic void
164196502Szecdiv_destroy(void)
165196502Szec{
166196502Szec
167205157Srwatson	in_pcbinfo_destroy(&V_divcbinfo);
168196502Szec}
169196502Szec
17017072Sjulian/*
171106152Sfenner * IPPROTO_DIVERT is not in the real IP protocol number space; this
172106152Sfenner * function should never be called.  Just in case, drop any packets.
17317072Sjulian */
174201527Sluigistatic void
17582884Sjuliandiv_input(struct mbuf *m, int off)
17617072Sjulian{
177183550Szec
178196039Srwatson	KMOD_IPSTAT_INC(ips_noproto);
17954175Sarchie	m_freem(m);
18054175Sarchie}
18154175Sarchie
18254175Sarchie/*
18354175Sarchie * Divert a packet by passing it up to the divert socket at port 'port'.
18454175Sarchie *
18554175Sarchie * Setup generic address and protocol structures for div_input routine,
18654175Sarchie * then pass them along with mbuf chain.
18754175Sarchie */
188136714Sandrestatic void
189126239Smlaierdivert_packet(struct mbuf *m, int incoming)
19054175Sarchie{
19126359Sjulian	struct ip *ip;
19226359Sjulian	struct inpcb *inp;
19326359Sjulian	struct socket *sa;
19454175Sarchie	u_int16_t nport;
195119752Ssam	struct sockaddr_in divsrc;
196126239Smlaier	struct m_tag *mtag;
19717072Sjulian
198201527Sluigi	mtag = m_tag_locate(m, MTAG_IPFW_RULE, 0, NULL);
199126239Smlaier	if (mtag == NULL) {
200126239Smlaier		m_freem(m);
201126239Smlaier		return;
202126239Smlaier	}
20326359Sjulian	/* Assure header */
20426359Sjulian	if (m->m_len < sizeof(struct ip) &&
20598613Sluigi	    (m = m_pullup(m, sizeof(struct ip))) == 0)
20626359Sjulian		return;
20726359Sjulian	ip = mtod(m, struct ip *);
20826359Sjulian
209133069Sandre	/* Delayed checksums are currently not compatible with divert. */
210133069Sandre	if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
211133069Sandre		in_delayed_cksum(m);
212133069Sandre		m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
213133069Sandre	}
214188066Srrs#ifdef SCTP
215188066Srrs	if (m->m_pkthdr.csum_flags & CSUM_SCTP) {
216205104Srrs		sctp_delayed_cksum(m, (uint32_t)(ip->ip_hl << 2));
217188066Srrs		m->m_pkthdr.csum_flags &= ~CSUM_SCTP;
218188066Srrs	}
219188066Srrs#endif
220201527Sluigi	bzero(&divsrc, sizeof(divsrc));
221201527Sluigi	divsrc.sin_len = sizeof(divsrc);
222201527Sluigi	divsrc.sin_family = AF_INET;
223201527Sluigi	/* record matching rule, in host format */
224201527Sluigi	divsrc.sin_port = ((struct ipfw_rule_ref *)(mtag+1))->rulenum;
22537433Sjulian	/*
22654175Sarchie	 * Record receive interface address, if any.
22737433Sjulian	 * But only for incoming packets.
22837433Sjulian	 */
22954175Sarchie	if (incoming) {
23017072Sjulian		struct ifaddr *ifa;
231191287Srwatson		struct ifnet *ifp;
23217072Sjulian
23326359Sjulian		/* Sanity check */
234113255Sdes		M_ASSERTPKTHDR(m);
23526359Sjulian
23636364Sjulian		/* Find IP address for receive interface */
237191287Srwatson		ifp = m->m_pkthdr.rcvif;
238195023Srwatson		if_addr_rlock(ifp);
239191287Srwatson		TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
24017072Sjulian			if (ifa->ifa_addr->sa_family != AF_INET)
24117072Sjulian				continue;
24217072Sjulian			divsrc.sin_addr =
24317072Sjulian			    ((struct sockaddr_in *) ifa->ifa_addr)->sin_addr;
24417072Sjulian			break;
24517072Sjulian		}
246195023Srwatson		if_addr_runlock(ifp);
24736903Sjulian	}
24837433Sjulian	/*
24937433Sjulian	 * Record the incoming interface name whenever we have one.
25037433Sjulian	 */
25136903Sjulian	if (m->m_pkthdr.rcvif) {
25236364Sjulian		/*
25336364Sjulian		 * Hide the actual interface name in there in the
25436364Sjulian		 * sin_zero array. XXX This needs to be moved to a
25536364Sjulian		 * different sockaddr type for divert, e.g.
25636364Sjulian		 * sockaddr_div with multiple fields like
25736364Sjulian		 * sockaddr_dl. Presently we have only 7 bytes
25836364Sjulian		 * but that will do for now as most interfaces
25936364Sjulian		 * are 4 or less + 2 or less bytes for unit.
26036364Sjulian		 * There is probably a faster way of doing this,
26136364Sjulian		 * possibly taking it from the sockaddr_dl on the iface.
26236364Sjulian		 * This solves the problem of a P2P link and a LAN interface
26336364Sjulian		 * having the same address, which can result in the wrong
26436364Sjulian		 * interface being assigned to the packet when fed back
26536364Sjulian		 * into the divert socket. Theoretically if the daemon saves
26636364Sjulian		 * and re-uses the sockaddr_in as suggested in the man pages,
26736364Sjulian		 * this iface name will come along for the ride.
26836364Sjulian		 * (see div_output for the other half of this.)
26936364Sjulian		 */
270121816Sbrooks		strlcpy(divsrc.sin_zero, m->m_pkthdr.rcvif->if_xname,
271121816Sbrooks		    sizeof(divsrc.sin_zero));
27217072Sjulian	}
27317072Sjulian
27417072Sjulian	/* Put packet on socket queue, if any */
27517072Sjulian	sa = NULL;
276201527Sluigi	nport = htons((u_int16_t)(((struct ipfw_rule_ref *)(mtag+1))->info));
277181803Sbz	INP_INFO_RLOCK(&V_divcbinfo);
278181803Sbz	LIST_FOREACH(inp, &V_divcb, inp_list) {
279119752Ssam		/* XXX why does only one socket match? */
280119752Ssam		if (inp->inp_lport == nport) {
281180851Smav			INP_RLOCK(inp);
28217072Sjulian			sa = inp->inp_socket;
283131151Srwatson			SOCKBUF_LOCK(&sa->so_rcv);
284131151Srwatson			if (sbappendaddr_locked(&sa->so_rcv,
285119752Ssam			    (struct sockaddr *)&divsrc, m,
286131151Srwatson			    (struct mbuf *)0) == 0) {
287131208Sphk				SOCKBUF_UNLOCK(&sa->so_rcv);
288119752Ssam				sa = NULL;	/* force mbuf reclaim below */
289131151Srwatson			} else
290131151Srwatson				sorwakeup_locked(sa);
291178376Srwatson			INP_RUNLOCK(inp);
292119752Ssam			break;
293119752Ssam		}
29417072Sjulian	}
295181803Sbz	INP_INFO_RUNLOCK(&V_divcbinfo);
296119752Ssam	if (sa == NULL) {
29717072Sjulian		m_freem(m);
298196039Srwatson		KMOD_IPSTAT_INC(ips_noproto);
299196039Srwatson		KMOD_IPSTAT_DEC(ips_delivered);
30017072Sjulian        }
30117072Sjulian}
30217072Sjulian
30317072Sjulian/*
30417072Sjulian * Deliver packet back into the IP processing machinery.
30517072Sjulian *
30617072Sjulian * If no address specified, or address is 0.0.0.0, send to ip_output();
30717072Sjulian * otherwise, send to ip_input() and mark as having been received on
30817072Sjulian * the interface with that address.
30917072Sjulian */
31017072Sjulianstatic int
311169454Srwatsondiv_output(struct socket *so, struct mbuf *m, struct sockaddr_in *sin,
312169454Srwatson    struct mbuf *control)
31317072Sjulian{
314223593Sglebius	struct ip *const ip = mtod(m, struct ip *);
315136073Sgreen	struct m_tag *mtag;
316201527Sluigi	struct ipfw_rule_ref *dt;
31717072Sjulian	int error = 0;
31817072Sjulian
319146182Sglebius	/*
320146182Sglebius	 * An mbuf may hasn't come from userland, but we pretend
321146182Sglebius	 * that it has.
322146182Sglebius	 */
323137630Sglebius	m->m_pkthdr.rcvif = NULL;
324146182Sglebius	m->m_nextpkt = NULL;
325185101Sjulian	M_SETFIB(m, so->so_fibnum);
32698613Sluigi
32717072Sjulian	if (control)
32817072Sjulian		m_freem(control);		/* XXX */
32917072Sjulian
330201527Sluigi	mtag = m_tag_locate(m, MTAG_IPFW_RULE, 0, NULL);
331201527Sluigi	if (mtag == NULL) {
332201527Sluigi		/* this should be normal */
333201527Sluigi		mtag = m_tag_alloc(MTAG_IPFW_RULE, 0,
334201527Sluigi		    sizeof(struct ipfw_rule_ref), M_NOWAIT | M_ZERO);
335137630Sglebius		if (mtag == NULL) {
336137630Sglebius			error = ENOBUFS;
337137630Sglebius			goto cantsend;
338137630Sglebius		}
339137630Sglebius		m_tag_prepend(m, mtag);
340201527Sluigi	}
341201527Sluigi	dt = (struct ipfw_rule_ref *)(mtag+1);
342136073Sgreen
34336903Sjulian	/* Loopback avoidance and state recovery */
34436707Sjulian	if (sin) {
34598613Sluigi		int i;
34637433Sjulian
347201527Sluigi		/* set the starting point. We provide a non-zero slot,
348201527Sluigi		 * but a non_matching chain_id to skip that info and use
349201527Sluigi		 * the rulenum/rule_id.
350201527Sluigi		 */
351201527Sluigi		dt->slot = 1; /* dummy, chain_id is invalid */
352201527Sluigi		dt->chain_id = 0;
353201527Sluigi		dt->rulenum = sin->sin_port+1; /* host format ? */
354201527Sluigi		dt->rule_id = 0;
35536903Sjulian		/*
35698613Sluigi		 * Find receive interface with the given name, stuffed
35798613Sluigi		 * (if it exists) in the sin_zero[] field.
35898613Sluigi		 * The name is user supplied data so don't trust its size
35998613Sluigi		 * or that it is zero terminated.
36036903Sjulian		 */
361110008Sphk		for (i = 0; i < sizeof(sin->sin_zero) && sin->sin_zero[i]; i++)
36298613Sluigi			;
36398613Sluigi		if ( i > 0 && i < sizeof(sin->sin_zero))
36436903Sjulian			m->m_pkthdr.rcvif = ifunit(sin->sin_zero);
36536369Sjulian	}
36617072Sjulian
36717072Sjulian	/* Reinject packet into the system as incoming or outgoing */
36817072Sjulian	if (!sin || sin->sin_addr.s_addr == 0) {
369223593Sglebius		struct mbuf *options = NULL;
370122331Ssam		struct inpcb *inp;
37198613Sluigi
372201527Sluigi		dt->info |= IPFW_IS_DIVERT | IPFW_INFO_OUT;
373122331Ssam		inp = sotoinpcb(so);
374178376Srwatson		INP_RLOCK(inp);
375223593Sglebius		switch (ip->ip_v) {
376223593Sglebius		case IPVERSION:
377223593Sglebius			/*
378223593Sglebius			 * Don't allow both user specified and setsockopt
379223593Sglebius			 * options, and don't allow packet length sizes that
380223593Sglebius			 * will crash.
381223593Sglebius			 */
382223593Sglebius			if ((((ip->ip_hl << 2) != sizeof(struct ip)) &&
383223593Sglebius			    inp->inp_options != NULL) ||
384223593Sglebius			    ((u_short)ntohs(ip->ip_len) > m->m_pkthdr.len)) {
385223593Sglebius				error = EINVAL;
386223593Sglebius				INP_RUNLOCK(inp);
387223593Sglebius				goto cantsend;
388223593Sglebius			}
389223593Sglebius			break;
390223593Sglebius#ifdef INET6
391223593Sglebius		case IPV6_VERSION >> 4:
392223593Sglebius		    {
393223593Sglebius			struct ip6_hdr *const ip6 = mtod(m, struct ip6_hdr *);
39417072Sjulian
395223593Sglebius			/* Don't allow packet length sizes that will crash */
396223593Sglebius			if (((u_short)ntohs(ip6->ip6_plen) > m->m_pkthdr.len)) {
397223593Sglebius				error = EINVAL;
398223593Sglebius				INP_RUNLOCK(inp);
399223593Sglebius				goto cantsend;
400223593Sglebius			}
401224575Sglebius			break;
402223593Sglebius		    }
403223593Sglebius#endif
404223593Sglebius		default:
405223593Sglebius			error = EINVAL;
406223593Sglebius			INP_RUNLOCK(inp);
407223593Sglebius			goto cantsend;
408223593Sglebius		}
409223593Sglebius
410223593Sglebius		/* Send packet to output processing */
411223593Sglebius		KMOD_IPSTAT_INC(ips_rawout);		/* XXX */
412223593Sglebius
413130900Srwatson#ifdef MAC
414223593Sglebius		mac_inpcb_create_mbuf(inp, m);
415130900Srwatson#endif
416223593Sglebius		/*
417223593Sglebius		 * Get ready to inject the packet into ip_output().
418223593Sglebius		 * Just in case socket options were specified on the
419223593Sglebius		 * divert socket, we duplicate them.  This is done
420223593Sglebius		 * to avoid having to hold the PCB locks over the call
421223593Sglebius		 * to ip_output(), as doing this results in a number of
422223593Sglebius		 * lock ordering complexities.
423223593Sglebius		 *
424223593Sglebius		 * Note that we set the multicast options argument for
425223593Sglebius		 * ip_output() to NULL since it should be invariant that
426223593Sglebius		 * they are not present.
427223593Sglebius		 */
428223593Sglebius		KASSERT(inp->inp_moptions == NULL,
429223593Sglebius		    ("multicast options set on a divert socket"));
430223593Sglebius		/*
431223593Sglebius		 * XXXCSJP: It is unclear to me whether or not it makes
432223593Sglebius		 * sense for divert sockets to have options.  However,
433223593Sglebius		 * for now we will duplicate them with the INP locks
434223593Sglebius		 * held so we can use them in ip_output() without
435223593Sglebius		 * requring a reference to the pcb.
436223593Sglebius		 */
437223593Sglebius		if (inp->inp_options != NULL) {
438223593Sglebius			options = m_dup(inp->inp_options, M_NOWAIT);
439223593Sglebius			if (options == NULL) {
440223593Sglebius				INP_RUNLOCK(inp);
441223593Sglebius				error = ENOBUFS;
442223593Sglebius				goto cantsend;
443171746Scsjp			}
444223593Sglebius		}
445223593Sglebius		INP_RUNLOCK(inp);
446223593Sglebius
447223593Sglebius		switch (ip->ip_v) {
448223593Sglebius		case IPVERSION:
449171746Scsjp			error = ip_output(m, options, NULL,
450223593Sglebius			    ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0)
451223593Sglebius			    | IP_ALLOWBROADCAST | IP_RAWOUTPUT, NULL, NULL);
452223593Sglebius			break;
453223593Sglebius#ifdef INET6
454223593Sglebius		case IPV6_VERSION >> 4:
455223593Sglebius			error = ip6_output(m, NULL, NULL, 0, NULL, NULL, NULL);
456223593Sglebius			break;
457223593Sglebius#endif
458122331Ssam		}
459223593Sglebius		if (options != NULL)
460223593Sglebius			m_freem(options);
46117072Sjulian	} else {
462201527Sluigi		dt->info |= IPFW_IS_DIVERT | IPFW_INFO_IN;
46336903Sjulian		if (m->m_pkthdr.rcvif == NULL) {
46443763Sjulian			/*
46598613Sluigi			 * No luck with the name, check by IP address.
46698613Sluigi			 * Clear the port and the ifname to make sure
46798613Sluigi			 * there are no distractions for ifa_ifwithaddr.
46843763Sjulian			 */
46998613Sluigi			struct	ifaddr *ifa;
47098613Sluigi
47143763Sjulian			bzero(sin->sin_zero, sizeof(sin->sin_zero));
47243763Sjulian			sin->sin_port = 0;
47398613Sluigi			ifa = ifa_ifwithaddr((struct sockaddr *) sin);
47498613Sluigi			if (ifa == NULL) {
47536364Sjulian				error = EADDRNOTAVAIL;
47636364Sjulian				goto cantsend;
47736364Sjulian			}
47836364Sjulian			m->m_pkthdr.rcvif = ifa->ifa_ifp;
479194760Srwatson			ifa_free(ifa);
48017072Sjulian		}
481130900Srwatson#ifdef MAC
482172930Srwatson		mac_socket_create_mbuf(so, m);
483130900Srwatson#endif
484171746Scsjp		/* Send packet to input processing via netisr */
485223593Sglebius		switch (ip->ip_v) {
486223593Sglebius		case IPVERSION:
487223593Sglebius			netisr_queue_src(NETISR_IP, (uintptr_t)so, m);
488223593Sglebius			break;
489223593Sglebius#ifdef INET6
490223593Sglebius		case IPV6_VERSION >> 4:
491223593Sglebius			netisr_queue_src(NETISR_IPV6, (uintptr_t)so, m);
492223593Sglebius			break;
493223593Sglebius#endif
494223593Sglebius		default:
495223593Sglebius			error = EINVAL;
496223593Sglebius			goto cantsend;
497223593Sglebius		}
49817072Sjulian	}
49917072Sjulian
500223593Sglebius	return (error);
50117072Sjulian
50217072Sjuliancantsend:
50354175Sarchie	m_freem(m);
504223593Sglebius	return (error);
50517072Sjulian}
50617072Sjulian
50726096Speterstatic int
50883366Sjuliandiv_attach(struct socket *so, int proto, struct thread *td)
50917072Sjulian{
51026096Speter	struct inpcb *inp;
511119752Ssam	int error;
51217072Sjulian
51326096Speter	inp  = sotoinpcb(so);
514157374Srwatson	KASSERT(inp == NULL, ("div_attach: inp != NULL"));
515164033Srwatson	if (td != NULL) {
516164033Srwatson		error = priv_check(td, PRIV_NETINET_DIVERT);
517164033Srwatson		if (error)
518164033Srwatson			return (error);
519164033Srwatson	}
52055009Sshin	error = soreserve(so, div_sendspace, div_recvspace);
521157374Srwatson	if (error)
52255009Sshin		return error;
523181803Sbz	INP_INFO_WLOCK(&V_divcbinfo);
524181803Sbz	error = in_pcballoc(so, &V_divcbinfo);
525119752Ssam	if (error) {
526181803Sbz		INP_INFO_WUNLOCK(&V_divcbinfo);
52726096Speter		return error;
528119752Ssam	}
52926096Speter	inp = (struct inpcb *)so->so_pcb;
530181803Sbz	INP_INFO_WUNLOCK(&V_divcbinfo);
53126096Speter	inp->inp_ip_p = proto;
53264192Sru	inp->inp_vflag |= INP_IPV4;
53326096Speter	inp->inp_flags |= INP_HDRINCL;
534178285Srwatson	INP_WUNLOCK(inp);
53526096Speter	return 0;
53626096Speter}
53717072Sjulian
538157370Srwatsonstatic void
53926096Speterdiv_detach(struct socket *so)
54026096Speter{
54126096Speter	struct inpcb *inp;
54217072Sjulian
543157374Srwatson	inp = sotoinpcb(so);
544157374Srwatson	KASSERT(inp != NULL, ("div_detach: inp == NULL"));
545181803Sbz	INP_INFO_WLOCK(&V_divcbinfo);
546178285Srwatson	INP_WLOCK(inp);
54726096Speter	in_pcbdetach(inp);
548157374Srwatson	in_pcbfree(inp);
549181803Sbz	INP_INFO_WUNLOCK(&V_divcbinfo);
55026096Speter}
55117072Sjulian
55226096Speterstatic int
55383366Sjuliandiv_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
55426096Speter{
55526096Speter	struct inpcb *inp;
55626096Speter	int error;
55717072Sjulian
55826345Speter	inp = sotoinpcb(so);
559157423Srwatson	KASSERT(inp != NULL, ("div_bind: inp == NULL"));
56098664Sluigi	/* in_pcbbind assumes that nam is a sockaddr_in
56159909Spaul	 * and in_pcbbind requires a valid address. Since divert
56259909Spaul	 * sockets don't we need to make sure the address is
56359909Spaul	 * filled in properly.
56459909Spaul	 * XXX -- divert should not be abusing in_pcbind
56559909Spaul	 * and should probably have its own family.
56659909Spaul	 */
56798613Sluigi	if (nam->sa_family != AF_INET)
568157374Srwatson		return EAFNOSUPPORT;
569157374Srwatson	((struct sockaddr_in *)nam)->sin_addr.s_addr = INADDR_ANY;
570181803Sbz	INP_INFO_WLOCK(&V_divcbinfo);
571178285Srwatson	INP_WLOCK(inp);
572222690Srwatson	INP_HASH_WLOCK(&V_divcbinfo);
573157374Srwatson	error = in_pcbbind(inp, nam, td->td_ucred);
574222690Srwatson	INP_HASH_WUNLOCK(&V_divcbinfo);
575178285Srwatson	INP_WUNLOCK(inp);
576181803Sbz	INP_INFO_WUNLOCK(&V_divcbinfo);
57765260Sru	return error;
57826096Speter}
57917072Sjulian
58026096Speterstatic int
58126096Speterdiv_shutdown(struct socket *so)
58226096Speter{
583122331Ssam	struct inpcb *inp;
584122331Ssam
585122331Ssam	inp = sotoinpcb(so);
586157374Srwatson	KASSERT(inp != NULL, ("div_shutdown: inp == NULL"));
587178285Srwatson	INP_WLOCK(inp);
58826096Speter	socantsendmore(so);
589178285Srwatson	INP_WUNLOCK(inp);
59026096Speter	return 0;
59126096Speter}
59217072Sjulian
59326096Speterstatic int
59429327Speterdiv_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
595169454Srwatson    struct mbuf *control, struct thread *td)
59626096Speter{
597183550Szec
59826096Speter	/* Packet must have a header (but that's about it) */
59955009Sshin	if (m->m_len < sizeof (struct ip) &&
60026096Speter	    (m = m_pullup(m, sizeof (struct ip))) == 0) {
601196039Srwatson		KMOD_IPSTAT_INC(ips_toosmall);
60226096Speter		m_freem(m);
60326096Speter		return EINVAL;
60426096Speter	}
60517072Sjulian
60626096Speter	/* Send packet */
60798613Sluigi	return div_output(so, m, (struct sockaddr_in *)nam, control);
60817072Sjulian}
60926096Speter
610201527Sluigistatic void
611122331Ssamdiv_ctlinput(int cmd, struct sockaddr *sa, void *vip)
612122331Ssam{
613122331Ssam        struct in_addr faddr;
614122331Ssam
615122331Ssam	faddr = ((struct sockaddr_in *)sa)->sin_addr;
616122331Ssam	if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY)
617122331Ssam        	return;
618122922Sandre	if (PRC_IS_REDIRECT(cmd))
619122922Sandre		return;
620122331Ssam}
621122331Ssam
62264192Srustatic int
62364192Srudiv_pcblist(SYSCTL_HANDLER_ARGS)
62464192Sru{
625119752Ssam	int error, i, n;
62664192Sru	struct inpcb *inp, **inp_list;
62764192Sru	inp_gen_t gencnt;
62864192Sru	struct xinpgen xig;
62964192Sru
63064192Sru	/*
63164192Sru	 * The process of preparing the TCB list is too time-consuming and
63264192Sru	 * resource-intensive to repeat twice on every request.
63364192Sru	 */
63464192Sru	if (req->oldptr == 0) {
635181803Sbz		n = V_divcbinfo.ipi_count;
636211433Sjhb		n += imax(n / 8, 10);
637211433Sjhb		req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xinpcb);
63864192Sru		return 0;
63964192Sru	}
64064192Sru
64164192Sru	if (req->newptr != 0)
64264192Sru		return EPERM;
64364192Sru
64464192Sru	/*
64564192Sru	 * OK, now we're committed to doing something.
64664192Sru	 */
647181803Sbz	INP_INFO_RLOCK(&V_divcbinfo);
648181803Sbz	gencnt = V_divcbinfo.ipi_gencnt;
649181803Sbz	n = V_divcbinfo.ipi_count;
650181803Sbz	INP_INFO_RUNLOCK(&V_divcbinfo);
65164192Sru
652126253Struckman	error = sysctl_wire_old_buffer(req,
653126253Struckman	    2 * sizeof(xig) + n*sizeof(struct xinpcb));
654126253Struckman	if (error != 0)
655126253Struckman		return (error);
656119752Ssam
65764192Sru	xig.xig_len = sizeof xig;
65864192Sru	xig.xig_count = n;
65964192Sru	xig.xig_gen = gencnt;
66064192Sru	xig.xig_sogen = so_gencnt;
66164192Sru	error = SYSCTL_OUT(req, &xig, sizeof xig);
66264192Sru	if (error)
66364192Sru		return error;
66464192Sru
665111119Simp	inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK);
66664192Sru	if (inp_list == 0)
66764192Sru		return ENOMEM;
66864192Sru
669181803Sbz	INP_INFO_RLOCK(&V_divcbinfo);
670181803Sbz	for (inp = LIST_FIRST(V_divcbinfo.ipi_listhead), i = 0; inp && i < n;
67171999Sphk	     inp = LIST_NEXT(inp, inp_list)) {
672205251Sbz		INP_WLOCK(inp);
673119752Ssam		if (inp->inp_gencnt <= gencnt &&
674205251Sbz		    cr_canseeinpcb(req->td->td_ucred, inp) == 0) {
675205251Sbz			in_pcbref(inp);
67664192Sru			inp_list[i++] = inp;
677205251Sbz		}
678205251Sbz		INP_WUNLOCK(inp);
67964192Sru	}
680181803Sbz	INP_INFO_RUNLOCK(&V_divcbinfo);
68164192Sru	n = i;
68264192Sru
68364192Sru	error = 0;
68464192Sru	for (i = 0; i < n; i++) {
68564192Sru		inp = inp_list[i];
686178376Srwatson		INP_RLOCK(inp);
68764192Sru		if (inp->inp_gencnt <= gencnt) {
68864192Sru			struct xinpcb xi;
689145953Scperciva			bzero(&xi, sizeof(xi));
69064192Sru			xi.xi_len = sizeof xi;
69164192Sru			/* XXX should avoid extra copy */
69264192Sru			bcopy(inp, &xi.xi_inp, sizeof *inp);
69364192Sru			if (inp->inp_socket)
69464192Sru				sotoxsocket(inp->inp_socket, &xi.xi_socket);
695178376Srwatson			INP_RUNLOCK(inp);
69664192Sru			error = SYSCTL_OUT(req, &xi, sizeof xi);
697160491Sups		} else
698178376Srwatson			INP_RUNLOCK(inp);
69964192Sru	}
700205251Sbz	INP_INFO_WLOCK(&V_divcbinfo);
701205251Sbz	for (i = 0; i < n; i++) {
702205251Sbz		inp = inp_list[i];
703222488Srwatson		INP_RLOCK(inp);
704222488Srwatson		if (!in_pcbrele_rlocked(inp))
705222488Srwatson			INP_RUNLOCK(inp);
706205251Sbz	}
707205251Sbz	INP_INFO_WUNLOCK(&V_divcbinfo);
708205251Sbz
70964192Sru	if (!error) {
71064192Sru		/*
71164192Sru		 * Give the user an updated idea of our state.
71264192Sru		 * If the generation differs from what we told
71364192Sru		 * her before, she knows that something happened
71464192Sru		 * while we were processing this request, and it
71564192Sru		 * might be necessary to retry.
71664192Sru		 */
717181803Sbz		INP_INFO_RLOCK(&V_divcbinfo);
718181803Sbz		xig.xig_gen = V_divcbinfo.ipi_gencnt;
71964192Sru		xig.xig_sogen = so_gencnt;
720181803Sbz		xig.xig_count = V_divcbinfo.ipi_count;
721181803Sbz		INP_INFO_RUNLOCK(&V_divcbinfo);
72264192Sru		error = SYSCTL_OUT(req, &xig, sizeof xig);
72364192Sru	}
72464192Sru	free(inp_list, M_TEMP);
72564192Sru	return error;
72664192Sru}
72764192Sru
/*
 * Sysctl tree: the net.inet.divert node and its read-only pcblist entry,
 * which is served by div_pcblist().
 */
#if defined(SYSCTL_NODE)
static SYSCTL_NODE(_net_inet, IPPROTO_DIVERT, divert, CTLFLAG_RW, 0,
    "IPDIVERT");
SYSCTL_PROC(_net_inet_divert, OID_AUTO, pcblist,
    CTLTYPE_OPAQUE | CTLFLAG_RD, NULL, 0, div_pcblist,
    "S,xinpcb", "List of active divert sockets");
#endif
73464192Sru
73526096Speterstruct pr_usrreqs div_usrreqs = {
736137386Sphk	.pru_attach =		div_attach,
737137386Sphk	.pru_bind =		div_bind,
738137386Sphk	.pru_control =		in_control,
739137386Sphk	.pru_detach =		div_detach,
740169462Srwatson	.pru_peeraddr =		in_getpeeraddr,
741137386Sphk	.pru_send =		div_send,
742137386Sphk	.pru_shutdown =		div_shutdown,
743169462Srwatson	.pru_sockaddr =		in_getsockaddr,
744137584Sphk	.pru_sosetlabel =	in_pcbsosetlabel
74526096Speter};
746136714Sandre
747136714Sandrestruct protosw div_protosw = {
748152242Sru	.pr_type =		SOCK_RAW,
749152242Sru	.pr_protocol =		IPPROTO_DIVERT,
750152242Sru	.pr_flags =		PR_ATOMIC|PR_ADDR,
751152242Sru	.pr_input =		div_input,
752152242Sru	.pr_ctlinput =		div_ctlinput,
753152242Sru	.pr_ctloutput =		ip_ctloutput,
754152242Sru	.pr_init =		div_init,
755196502Szec#ifdef VIMAGE
756196502Szec	.pr_destroy =		div_destroy,
757196502Szec#endif
758152242Sru	.pr_usrreqs =		&div_usrreqs
759136714Sandre};
760136714Sandre
761136714Sandrestatic int
762136714Sandrediv_modevent(module_t mod, int type, void *unused)
763136714Sandre{
764136714Sandre	int err = 0;
765196502Szec#ifndef VIMAGE
766136714Sandre	int n;
767196502Szec#endif
768136714Sandre
769136714Sandre	switch (type) {
770136714Sandre	case MOD_LOAD:
771136714Sandre		/*
772136714Sandre		 * Protocol will be initialized by pf_proto_register().
773136714Sandre		 * We don't have to register ip_protox because we are not
774136714Sandre		 * a true IP protocol that goes over the wire.
775136714Sandre		 */
776136714Sandre		err = pf_proto_register(PF_INET, &div_protosw);
777196502Szec		if (err != 0)
778196502Szec			return (err);
779136714Sandre		ip_divert_ptr = divert_packet;
780196502Szec		ip_divert_event_tag = EVENTHANDLER_REGISTER(maxsockets_change,
781196502Szec		    div_zone_change, NULL, EVENTHANDLER_PRI_ANY);
782136714Sandre		break;
783136788Sandre	case MOD_QUIESCE:
784136788Sandre		/*
785136788Sandre		 * IPDIVERT may normally not be unloaded because of the
786136788Sandre		 * potential race conditions.  Tell kldunload we can't be
787136788Sandre		 * unloaded unless the unload is forced.
788136788Sandre		 */
789136788Sandre		err = EPERM;
790136788Sandre		break;
791136714Sandre	case MOD_UNLOAD:
792196502Szec#ifdef VIMAGE
793196502Szec		err = EPERM;
794196502Szec		break;
795196502Szec#else
796136714Sandre		/*
797136788Sandre		 * Forced unload.
798136788Sandre		 *
799136714Sandre		 * Module ipdivert can only be unloaded if no sockets are
800136714Sandre		 * connected.  Maybe this can be changed later to forcefully
801136714Sandre		 * disconnect any open sockets.
802136715Srwatson		 *
803136716Sandre		 * XXXRW: Note that there is a slight race here, as a new
804136716Sandre		 * socket open request could be spinning on the lock and then
805136716Sandre		 * we destroy the lock.
806136714Sandre		 */
807181803Sbz		INP_INFO_WLOCK(&V_divcbinfo);
808181803Sbz		n = V_divcbinfo.ipi_count;
809136714Sandre		if (n != 0) {
810136714Sandre			err = EBUSY;
811181803Sbz			INP_INFO_WUNLOCK(&V_divcbinfo);
812136714Sandre			break;
813136714Sandre		}
814136714Sandre		ip_divert_ptr = NULL;
815136714Sandre		err = pf_proto_unregister(PF_INET, IPPROTO_DIVERT, SOCK_RAW);
816181803Sbz		INP_INFO_WUNLOCK(&V_divcbinfo);
817196502Szec		div_destroy();
818196502Szec		EVENTHANDLER_DEREGISTER(maxsockets_change, ip_divert_event_tag);
819136714Sandre		break;
820196502Szec#endif /* !VIMAGE */
821136714Sandre	default:
822136788Sandre		err = EOPNOTSUPP;
823136714Sandre		break;
824136714Sandre	}
825136714Sandre	return err;
826136714Sandre}
827136714Sandre
828136714Sandrestatic moduledata_t ipdivertmod = {
829136714Sandre        "ipdivert",
830136714Sandre        div_modevent,
831241394Skevlo        0
832136714Sandre};
833136714Sandre
834136714SandreDECLARE_MODULE(ipdivert, ipdivertmod, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY);
835201527SluigiMODULE_DEPEND(ipdivert, ipfw, 2, 2, 2);
836136714SandreMODULE_VERSION(ipdivert, 1);
837