ip_divert.c revision 230443
152506Simp/*- 252506Simp * Copyright (c) 1982, 1986, 1988, 1993 352506Simp * The Regents of the University of California. All rights reserved. 452506Simp * 552506Simp * Redistribution and use in source and binary forms, with or without 652506Simp * modification, are permitted provided that the following conditions 752506Simp * are met: 852506Simp * 1. Redistributions of source code must retain the above copyright 952506Simp * notice, this list of conditions and the following disclaimer. 1052506Simp * 2. Redistributions in binary form must reproduce the above copyright 1152506Simp * notice, this list of conditions and the following disclaimer in the 1252506Simp * documentation and/or other materials provided with the distribution. 1352506Simp * 4. Neither the name of the University nor the names of its contributors 1452506Simp * may be used to endorse or promote products derived from this software 1552506Simp * without specific prior written permission. 1652506Simp * 1752506Simp * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 1852506Simp * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 1952506Simp * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 2052506Simp * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 2152506Simp * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 2252506Simp * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 2352506Simp * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 2452506Simp * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 2552506Simp * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 2652506Simp * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 2752506Simp * SUCH DAMAGE. 2852506Simp */ 2952506Simp 3052506Simp#include <sys/cdefs.h> 3152506Simp__FBSDID("$FreeBSD: head/sys/netinet/ip_divert.c 230443 2012-01-22 02:16:31Z bz $"); 3252506Simp 3352506Simp#include "opt_inet.h" 3452506Simp#include "opt_inet6.h" 3552506Simp#include "opt_sctp.h" 3652506Simp#ifndef INET 3752506Simp#error "IPDIVERT requires INET." 3852506Simp#endif 3952506Simp 4052506Simp#include <sys/param.h> 4152506Simp#include <sys/kernel.h> 4252506Simp#include <sys/lock.h> 4352506Simp#include <sys/malloc.h> 4452506Simp#include <sys/mbuf.h> 4552506Simp#include <sys/module.h> 4652506Simp#include <sys/kernel.h> 4752506Simp#include <sys/priv.h> 4852506Simp#include <sys/proc.h> 4959193Simp#include <sys/protosw.h> 5059193Simp#include <sys/socket.h> 5159193Simp#include <sys/socketvar.h> 5252506Simp#include <sys/sysctl.h> 5359193Simp#include <net/vnet.h> 5452506Simp 5555720Simp#include <net/if.h> 5652506Simp#include <net/netisr.h> 5752506Simp 5855720Simp#include <netinet/in.h> 5952506Simp#include <netinet/in_pcb.h> 6052506Simp#include <netinet/in_systm.h> 6152506Simp#include <netinet/in_var.h> 6252506Simp#include <netinet/ip.h> 6352506Simp#include <netinet/ip_var.h> 6452506Simp#ifdef INET6 6552506Simp#include <netinet/ip6.h> 6652506Simp#include <netinet6/ip6_var.h> 6752506Simp#endif 6852506Simp#ifdef SCTP 6952506Simp#include <netinet/sctp_crc32.h> 7052506Simp#endif 7152506Simp 7254250Simp#include <security/mac/mac_framework.h> 7352506Simp 7452506Simp/* 7555720Simp * Divert sockets 7652506Simp */ 7752506Simp 7852506Simp/* 7952506Simp * Allocate enough space to hold a full IP packet 8052506Simp */ 8152506Simp#define DIVSNDQ (65536 + 100) 8252506Simp#define DIVRCVQ (65536 + 100) 8352506Simp 8452506Simp/* 8552506Simp * Divert sockets work in conjunction with ipfw or other packet filters, 8652506Simp * see the divert(4) manpage for features. 8752506Simp * Packets are selected by the packet filter and tagged with an 8852506Simp * MTAG_IPFW_RULE tag carrying the 'divert port' number (as set by 8952506Simp * the packet filter) and information on the matching filter rule for 9052506Simp * subsequent reinjection. The divert_port is used to put the packet 9152506Simp * on the corresponding divert socket, while the rule number is passed 9252506Simp * up (at least partially) as the sin_port in the struct sockaddr. 9352506Simp * 9452506Simp * Packets written to the divert socket carry in sin_addr a 9552506Simp * destination address, and in sin_port the number of the filter rule 9652506Simp * after which to continue processing. 9758997Simp * If the destination address is INADDR_ANY, the packet is treated as 9852506Simp * as outgoing and sent to ip_output(); otherwise it is treated as 9952506Simp * incoming and sent to ip_input(). 10052506Simp * Further, sin_zero carries some information on the interface, 10152506Simp * which can be used in the reinject -- see comments in the code. 10252506Simp * 10355720Simp * On reinjection, processing in ip_input() and ip_output() 10455720Simp * will be exactly the same as for the original packet, except that 10552506Simp * packet filter processing will start at the rule number after the one 10655720Simp * written in the sin_port (ipfw does not allow a rule #0, so sin_port=0 10755720Simp * will apply the entire ruleset to the packet). 10852506Simp */ 10952506Simp 11052506Simp/* Internal variables. */ 11152506Simpstatic VNET_DEFINE(struct inpcbhead, divcb); 11252506Simpstatic VNET_DEFINE(struct inpcbinfo, divcbinfo); 11352506Simp 11452506Simp#define V_divcb VNET(divcb) 11552506Simp#define V_divcbinfo VNET(divcbinfo) 11652506Simp 11752506Simpstatic u_long div_sendspace = DIVSNDQ; /* XXX sysctl ? */ 11852506Simpstatic u_long div_recvspace = DIVRCVQ; /* XXX sysctl ? */ 11952506Simp 12052506Simpstatic eventhandler_tag ip_divert_event_tag; 12152506Simp 12252506Simp/* 12352506Simp * Initialize divert connection block queue. 12455720Simp */ 12555720Simpstatic void 12659193Simpdiv_zone_change(void *tag) 12755720Simp{ 12855720Simp 12952506Simp uma_zone_set_max(V_divcbinfo.ipi_zone, maxsockets); 13052506Simp} 13159193Simp 13259193Simpstatic int 13355720Simpdiv_inpcb_init(void *mem, int size, int flags) 13455720Simp{ 13559389Simp struct inpcb *inp = mem; 13652506Simp 13752506Simp INP_LOCK_INIT(inp, "inp", "divinp"); 13855720Simp return (0); 13952506Simp} 14052506Simp 14152506Simpstatic void 14252506Simpdiv_inpcb_fini(void *mem, int size) 14352506Simp{ 14452506Simp struct inpcb *inp = mem; 14552506Simp 14652506Simp INP_LOCK_DESTROY(inp); 14752506Simp} 14855720Simp 14952506Simpstatic void 15052506Simpdiv_init(void) 15152506Simp{ 15252506Simp 15352506Simp /* 15452506Simp * XXX We don't use the hash list for divert IP, but it's easier to 15552506Simp * allocate one-entry hash lists than it is to check all over the 15652506Simp * place for hashbase == NULL. 15752506Simp */ 15852506Simp in_pcbinfo_init(&V_divcbinfo, "div", &V_divcb, 1, 1, "divcb", 15952506Simp div_inpcb_init, div_inpcb_fini, UMA_ZONE_NOFREE, 16052506Simp IPI_HASHFIELDS_NONE); 16152506Simp} 16252506Simp 16352506Simpstatic void 16452506Simpdiv_destroy(void) 16552506Simp{ 16652506Simp 16752506Simp in_pcbinfo_destroy(&V_divcbinfo); 16852506Simp} 16952506Simp 17059389Simp/* 17152506Simp * IPPROTO_DIVERT is not in the real IP protocol number space; this 17252506Simp * function should never be called. Just in case, drop any packets. 17352506Simp */ 17452506Simpstatic void 17552506Simpdiv_input(struct mbuf *m, int off) 17652506Simp{ 17752506Simp 17852506Simp KMOD_IPSTAT_INC(ips_noproto); 17952506Simp m_freem(m); 18052506Simp} 18152506Simp 18252506Simp/* 18352506Simp * Divert a packet by passing it up to the divert socket at port 'port'. 18452506Simp * 18552506Simp * Setup generic address and protocol structures for div_input routine, 18652506Simp * then pass them along with mbuf chain. 18752506Simp */ 18852506Simpstatic void 18952506Simpdivert_packet(struct mbuf *m, int incoming) 19052506Simp{ 19152506Simp struct ip *ip; 19252506Simp struct inpcb *inp; 19352506Simp struct socket *sa; 19452506Simp u_int16_t nport; 19552506Simp struct sockaddr_in divsrc; 19652506Simp struct m_tag *mtag; 19752506Simp 19852506Simp mtag = m_tag_locate(m, MTAG_IPFW_RULE, 0, NULL); 19952506Simp if (mtag == NULL) { 20052506Simp m_freem(m); 20152506Simp return; 20252506Simp } 20352506Simp /* Assure header */ 20452506Simp if (m->m_len < sizeof(struct ip) && 20552506Simp (m = m_pullup(m, sizeof(struct ip))) == 0) 20652506Simp return; 20752506Simp ip = mtod(m, struct ip *); 20852506Simp 20952506Simp /* Delayed checksums are currently not compatible with divert. */ 21052506Simp if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { 21152506Simp ip->ip_len = ntohs(ip->ip_len); 21252506Simp in_delayed_cksum(m); 21352506Simp m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; 21452506Simp ip->ip_len = htons(ip->ip_len); 21552506Simp } 21652506Simp#ifdef SCTP 21752506Simp if (m->m_pkthdr.csum_flags & CSUM_SCTP) { 21852506Simp ip->ip_len = ntohs(ip->ip_len); 21952506Simp sctp_delayed_cksum(m, (uint32_t)(ip->ip_hl << 2)); 22052506Simp m->m_pkthdr.csum_flags &= ~CSUM_SCTP; 22152506Simp ip->ip_len = htons(ip->ip_len); 22252506Simp } 22352506Simp#endif 22452506Simp bzero(&divsrc, sizeof(divsrc)); 22552506Simp divsrc.sin_len = sizeof(divsrc); 22652506Simp divsrc.sin_family = AF_INET; 22752506Simp /* record matching rule, in host format */ 22852506Simp divsrc.sin_port = ((struct ipfw_rule_ref *)(mtag+1))->rulenum; 22952506Simp /* 23052506Simp * Record receive interface address, if any. 23152506Simp * But only for incoming packets. 23252506Simp */ 23352506Simp if (incoming) { 23452506Simp struct ifaddr *ifa; 23552506Simp struct ifnet *ifp; 23652506Simp 23752506Simp /* Sanity check */ 23852506Simp M_ASSERTPKTHDR(m); 23952506Simp 24052506Simp /* Find IP address for receive interface */ 24152506Simp ifp = m->m_pkthdr.rcvif; 24255720Simp if_addr_rlock(ifp); 24355720Simp TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { 24452506Simp if (ifa->ifa_addr->sa_family != AF_INET) 24552506Simp continue; 24652506Simp divsrc.sin_addr = 24752506Simp ((struct sockaddr_in *) ifa->ifa_addr)->sin_addr; 24852506Simp break; 24952506Simp } 25052506Simp if_addr_runlock(ifp); 25152506Simp } 25252506Simp /* 25352506Simp * Record the incoming interface name whenever we have one. 25452506Simp */ 25552506Simp if (m->m_pkthdr.rcvif) { 25652506Simp /* 25752506Simp * Hide the actual interface name in there in the 25852506Simp * sin_zero array. XXX This needs to be moved to a 25952506Simp * different sockaddr type for divert, e.g. 26052506Simp * sockaddr_div with multiple fields like 26152506Simp * sockaddr_dl. Presently we have only 7 bytes 26252506Simp * but that will do for now as most interfaces 26352506Simp * are 4 or less + 2 or less bytes for unit. 26452506Simp * There is probably a faster way of doing this, 26552506Simp * possibly taking it from the sockaddr_dl on the iface. 26652506Simp * This solves the problem of a P2P link and a LAN interface 26752506Simp * having the same address, which can result in the wrong 26852506Simp * interface being assigned to the packet when fed back 26952506Simp * into the divert socket. Theoretically if the daemon saves 27052506Simp * and re-uses the sockaddr_in as suggested in the man pages, 27152506Simp * this iface name will come along for the ride. 27252506Simp * (see div_output for the other half of this.) 27352506Simp */ 27452506Simp strlcpy(divsrc.sin_zero, m->m_pkthdr.rcvif->if_xname, 27552506Simp sizeof(divsrc.sin_zero)); 27652506Simp } 27752506Simp 27852506Simp /* Put packet on socket queue, if any */ 27952506Simp sa = NULL; 28052506Simp nport = htons((u_int16_t)(((struct ipfw_rule_ref *)(mtag+1))->info)); 28152506Simp INP_INFO_RLOCK(&V_divcbinfo); 28252506Simp LIST_FOREACH(inp, &V_divcb, inp_list) { 28352506Simp /* XXX why does only one socket match? */ 28452506Simp if (inp->inp_lport == nport) { 28552506Simp INP_RLOCK(inp); 28652506Simp sa = inp->inp_socket; 28752506Simp SOCKBUF_LOCK(&sa->so_rcv); 28852506Simp if (sbappendaddr_locked(&sa->so_rcv, 28952506Simp (struct sockaddr *)&divsrc, m, 29052506Simp (struct mbuf *)0) == 0) { 29152506Simp SOCKBUF_UNLOCK(&sa->so_rcv); 29252506Simp sa = NULL; /* force mbuf reclaim below */ 29352506Simp } else 29452506Simp sorwakeup_locked(sa); 29552506Simp INP_RUNLOCK(inp); 29652506Simp break; 29752506Simp } 29852506Simp } 29952506Simp INP_INFO_RUNLOCK(&V_divcbinfo); 30052506Simp if (sa == NULL) { 30152506Simp m_freem(m); 30252506Simp KMOD_IPSTAT_INC(ips_noproto); 30352506Simp KMOD_IPSTAT_DEC(ips_delivered); 30452506Simp } 30552506Simp} 30652506Simp 30752506Simp/* 30852506Simp * Deliver packet back into the IP processing machinery. 30952506Simp * 31052506Simp * If no address specified, or address is 0.0.0.0, send to ip_output(); 31152506Simp * otherwise, send to ip_input() and mark as having been received on 31252506Simp * the interface with that address. 31352506Simp */ 31452506Simpstatic int 31552506Simpdiv_output(struct socket *so, struct mbuf *m, struct sockaddr_in *sin, 31652506Simp struct mbuf *control) 31752506Simp{ 31852506Simp struct ip *const ip = mtod(m, struct ip *); 31952506Simp struct m_tag *mtag; 32052506Simp struct ipfw_rule_ref *dt; 32152506Simp int error = 0; 32252506Simp 32352506Simp /* 32455720Simp * An mbuf may hasn't come from userland, but we pretend 32555720Simp * that it has. 32655720Simp */ 32755720Simp m->m_pkthdr.rcvif = NULL; 32852506Simp m->m_nextpkt = NULL; 32952506Simp M_SETFIB(m, so->so_fibnum); 33052506Simp 33152506Simp if (control) 33252506Simp m_freem(control); /* XXX */ 33352506Simp 33452506Simp mtag = m_tag_locate(m, MTAG_IPFW_RULE, 0, NULL); 33552506Simp if (mtag == NULL) { 33652506Simp /* this should be normal */ 33752506Simp mtag = m_tag_alloc(MTAG_IPFW_RULE, 0, 33852506Simp sizeof(struct ipfw_rule_ref), M_NOWAIT | M_ZERO); 33952506Simp if (mtag == NULL) { 34052506Simp error = ENOBUFS; 34152506Simp goto cantsend; 34252506Simp } 34352506Simp m_tag_prepend(m, mtag); 34452506Simp } 34552506Simp dt = (struct ipfw_rule_ref *)(mtag+1); 34652506Simp 34752506Simp /* Loopback avoidance and state recovery */ 34852506Simp if (sin) { 34952506Simp int i; 35052506Simp 35152506Simp /* set the starting point. We provide a non-zero slot, 35252506Simp * but a non_matching chain_id to skip that info and use 35352506Simp * the rulenum/rule_id. 35452506Simp */ 35552506Simp dt->slot = 1; /* dummy, chain_id is invalid */ 35652506Simp dt->chain_id = 0; 35752506Simp dt->rulenum = sin->sin_port+1; /* host format ? */ 35852506Simp dt->rule_id = 0; 35952506Simp /* 36052506Simp * Find receive interface with the given name, stuffed 36152506Simp * (if it exists) in the sin_zero[] field. 36252506Simp * The name is user supplied data so don't trust its size 36352506Simp * or that it is zero terminated. 36452506Simp */ 36552506Simp for (i = 0; i < sizeof(sin->sin_zero) && sin->sin_zero[i]; i++) 36652506Simp ; 36752506Simp if ( i > 0 && i < sizeof(sin->sin_zero)) 36852506Simp m->m_pkthdr.rcvif = ifunit(sin->sin_zero); 36952506Simp } 37052506Simp 37152506Simp /* Reinject packet into the system as incoming or outgoing */ 37252506Simp if (!sin || sin->sin_addr.s_addr == 0) { 37352506Simp struct mbuf *options = NULL; 37452506Simp struct inpcb *inp; 37552506Simp 37652506Simp dt->info |= IPFW_IS_DIVERT | IPFW_INFO_OUT; 37752506Simp inp = sotoinpcb(so); 37852506Simp INP_RLOCK(inp); 37952506Simp switch (ip->ip_v) { 38052506Simp case IPVERSION: 38152506Simp /* 38252506Simp * Don't allow both user specified and setsockopt 38352506Simp * options, and don't allow packet length sizes that 38452506Simp * will crash. 38552506Simp */ 38652506Simp if ((((ip->ip_hl << 2) != sizeof(struct ip)) && 38752506Simp inp->inp_options != NULL) || 38852506Simp ((u_short)ntohs(ip->ip_len) > m->m_pkthdr.len)) { 38952506Simp error = EINVAL; 39052506Simp INP_RUNLOCK(inp); 39152506Simp goto cantsend; 39252506Simp } 39352506Simp 39452506Simp /* Convert fields to host order for ip_output() */ 39552506Simp ip->ip_len = ntohs(ip->ip_len); 39652506Simp ip->ip_off = ntohs(ip->ip_off); 39752506Simp break; 39852506Simp#ifdef INET6 39952506Simp case IPV6_VERSION >> 4: 40052506Simp { 40152506Simp struct ip6_hdr *const ip6 = mtod(m, struct ip6_hdr *); 40252506Simp 40352506Simp /* Don't allow packet length sizes that will crash */ 40452506Simp if (((u_short)ntohs(ip6->ip6_plen) > m->m_pkthdr.len)) { 40552506Simp error = EINVAL; 40652506Simp INP_RUNLOCK(inp); 40752506Simp goto cantsend; 40852506Simp } 40952506Simp 41052506Simp ip6->ip6_plen = ntohs(ip6->ip6_plen); 41152506Simp break; 41252506Simp } 41352506Simp#endif 41452506Simp default: 41552506Simp error = EINVAL; 41655720Simp INP_RUNLOCK(inp); 41752506Simp goto cantsend; 41852506Simp } 41952506Simp 42055720Simp /* Send packet to output processing */ 42152506Simp KMOD_IPSTAT_INC(ips_rawout); /* XXX */ 42252506Simp 42352506Simp#ifdef MAC 42452506Simp mac_inpcb_create_mbuf(inp, m); 42552506Simp#endif 42652506Simp /* 42752506Simp * Get ready to inject the packet into ip_output(). 42852506Simp * Just in case socket options were specified on the 42952506Simp * divert socket, we duplicate them. This is done 43064850Simp * to avoid having to hold the PCB locks over the call 43152506Simp * to ip_output(), as doing this results in a number of 43252506Simp * lock ordering complexities. 43352506Simp * 43452506Simp * Note that we set the multicast options argument for 43552506Simp * ip_output() to NULL since it should be invariant that 43652506Simp * they are not present. 43752506Simp */ 43852506Simp KASSERT(inp->inp_moptions == NULL, 43952506Simp ("multicast options set on a divert socket")); 44052506Simp /* 44152506Simp * XXXCSJP: It is unclear to me whether or not it makes 44252506Simp * sense for divert sockets to have options. However, 44352506Simp * for now we will duplicate them with the INP locks 44452506Simp * held so we can use them in ip_output() without 44552506Simp * requring a reference to the pcb. 44652506Simp */ 44752506Simp if (inp->inp_options != NULL) { 44852506Simp options = m_dup(inp->inp_options, M_NOWAIT); 44952506Simp if (options == NULL) { 45052506Simp INP_RUNLOCK(inp); 45152506Simp error = ENOBUFS; 45252506Simp goto cantsend; 45352506Simp } 45452506Simp } 45552506Simp INP_RUNLOCK(inp); 45652506Simp 45752506Simp switch (ip->ip_v) { 45852506Simp case IPVERSION: 45952506Simp error = ip_output(m, options, NULL, 46052506Simp ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0) 46152506Simp | IP_ALLOWBROADCAST | IP_RAWOUTPUT, NULL, NULL); 46252506Simp break; 46352506Simp#ifdef INET6 46452506Simp case IPV6_VERSION >> 4: 46552506Simp error = ip6_output(m, NULL, NULL, 0, NULL, NULL, NULL); 46652506Simp break; 46752506Simp#endif 46852506Simp } 46952506Simp if (options != NULL) 47052506Simp m_freem(options); 47152506Simp } else { 47252506Simp dt->info |= IPFW_IS_DIVERT | IPFW_INFO_IN; 47352506Simp if (m->m_pkthdr.rcvif == NULL) { 47452506Simp /* 47552506Simp * No luck with the name, check by IP address. 47652506Simp * Clear the port and the ifname to make sure 47752506Simp * there are no distractions for ifa_ifwithaddr. 47852506Simp */ 47952506Simp struct ifaddr *ifa; 48052506Simp 48152506Simp bzero(sin->sin_zero, sizeof(sin->sin_zero)); 48252506Simp sin->sin_port = 0; 48352506Simp ifa = ifa_ifwithaddr((struct sockaddr *) sin); 48452506Simp if (ifa == NULL) { 48552506Simp error = EADDRNOTAVAIL; 48652506Simp goto cantsend; 48752506Simp } 48852506Simp m->m_pkthdr.rcvif = ifa->ifa_ifp; 48952506Simp ifa_free(ifa); 49052506Simp } 49152506Simp#ifdef MAC 49252506Simp mac_socket_create_mbuf(so, m); 49352506Simp#endif 49452506Simp /* Send packet to input processing via netisr */ 49552506Simp switch (ip->ip_v) { 49652506Simp case IPVERSION: 49752506Simp netisr_queue_src(NETISR_IP, (uintptr_t)so, m); 49852506Simp break; 49952506Simp#ifdef INET6 50052506Simp case IPV6_VERSION >> 4: 50152506Simp netisr_queue_src(NETISR_IPV6, (uintptr_t)so, m); 50252506Simp break; 50352506Simp#endif 50452506Simp default: 50552506Simp error = EINVAL; 50652506Simp goto cantsend; 50752506Simp } 50852506Simp } 50952506Simp 51052506Simp return (error); 51152506Simp 51252506Simpcantsend: 51352506Simp m_freem(m); 51452506Simp return (error); 51552506Simp} 51652506Simp 51752506Simpstatic int 51852506Simpdiv_attach(struct socket *so, int proto, struct thread *td) 51952506Simp{ 52052506Simp struct inpcb *inp; 52152506Simp int error; 52252506Simp 52352506Simp inp = sotoinpcb(so); 52452506Simp KASSERT(inp == NULL, ("div_attach: inp != NULL")); 52552506Simp if (td != NULL) { 52652506Simp error = priv_check(td, PRIV_NETINET_DIVERT); 52752506Simp if (error) 52852506Simp return (error); 52953813Simp } 53053813Simp error = soreserve(so, div_sendspace, div_recvspace); 53153813Simp if (error) 53253813Simp return error; 53353813Simp INP_INFO_WLOCK(&V_divcbinfo); 53453813Simp error = in_pcballoc(so, &V_divcbinfo); 53553813Simp if (error) { 53652506Simp INP_INFO_WUNLOCK(&V_divcbinfo); 53752506Simp return error; 53852506Simp } 53952506Simp inp = (struct inpcb *)so->so_pcb; 54053813Simp INP_INFO_WUNLOCK(&V_divcbinfo); 54153813Simp inp->inp_ip_p = proto; 54253813Simp inp->inp_vflag |= INP_IPV4; 54353813Simp inp->inp_flags |= INP_HDRINCL; 54453813Simp INP_WUNLOCK(inp); 54553813Simp return 0; 54653813Simp} 54753813Simp 54853813Simpstatic void 54953813Simpdiv_detach(struct socket *so) 55053813Simp{ 55152506Simp struct inpcb *inp; 55252506Simp 55352506Simp inp = sotoinpcb(so); 55452506Simp KASSERT(inp != NULL, ("div_detach: inp == NULL")); 55552506Simp INP_INFO_WLOCK(&V_divcbinfo); 55652506Simp INP_WLOCK(inp); 55752506Simp in_pcbdetach(inp); 55852506Simp in_pcbfree(inp); 55952506Simp INP_INFO_WUNLOCK(&V_divcbinfo); 56052506Simp} 56152506Simp 56252506Simpstatic int 56352506Simpdiv_bind(struct socket *so, struct sockaddr *nam, struct thread *td) 56452506Simp{ 56552506Simp struct inpcb *inp; 56652506Simp int error; 56752506Simp 56852506Simp inp = sotoinpcb(so); 56952506Simp KASSERT(inp != NULL, ("div_bind: inp == NULL")); 57052506Simp /* in_pcbbind assumes that nam is a sockaddr_in 57152506Simp * and in_pcbbind requires a valid address. Since divert 57252506Simp * sockets don't we need to make sure the address is 57352506Simp * filled in properly. 57452506Simp * XXX -- divert should not be abusing in_pcbind 57552506Simp * and should probably have its own family. 57652506Simp */ 57752506Simp if (nam->sa_family != AF_INET) 57852506Simp return EAFNOSUPPORT; 57952506Simp ((struct sockaddr_in *)nam)->sin_addr.s_addr = INADDR_ANY; 58052506Simp INP_INFO_WLOCK(&V_divcbinfo); 58152506Simp INP_WLOCK(inp); 58252506Simp INP_HASH_WLOCK(&V_divcbinfo); 58352506Simp error = in_pcbbind(inp, nam, td->td_ucred); 58452506Simp INP_HASH_WUNLOCK(&V_divcbinfo); 58552506Simp INP_WUNLOCK(inp); 58652506Simp INP_INFO_WUNLOCK(&V_divcbinfo); 58752506Simp return error; 58852506Simp} 58952506Simp 59052506Simpstatic int 59152506Simpdiv_shutdown(struct socket *so) 59255720Simp{ 59352506Simp struct inpcb *inp; 59452506Simp 59552506Simp inp = sotoinpcb(so); 59652506Simp KASSERT(inp != NULL, ("div_shutdown: inp == NULL")); 59752506Simp INP_WLOCK(inp); 59852506Simp socantsendmore(so); 59952506Simp INP_WUNLOCK(inp); 60052506Simp return 0; 60152506Simp} 60252506Simp 60352506Simpstatic int 60452506Simpdiv_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, 60552506Simp struct mbuf *control, struct thread *td) 60652506Simp{ 60752506Simp 60852506Simp /* Packet must have a header (but that's about it) */ 60952506Simp if (m->m_len < sizeof (struct ip) && 61052506Simp (m = m_pullup(m, sizeof (struct ip))) == 0) { 61152506Simp KMOD_IPSTAT_INC(ips_toosmall); 61252506Simp m_freem(m); 61352506Simp return EINVAL; 61452506Simp } 61552506Simp 61652506Simp /* Send packet */ 61752506Simp return div_output(so, m, (struct sockaddr_in *)nam, control); 61852506Simp} 61952506Simp 62052506Simpstatic void 62152506Simpdiv_ctlinput(int cmd, struct sockaddr *sa, void *vip) 62252506Simp{ 62352506Simp struct in_addr faddr; 62452506Simp 62552506Simp faddr = ((struct sockaddr_in *)sa)->sin_addr; 62652506Simp if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY) 62752506Simp return; 62852506Simp if (PRC_IS_REDIRECT(cmd)) 62952506Simp return; 63052506Simp} 63152506Simp 63252506Simpstatic int 63352506Simpdiv_pcblist(SYSCTL_HANDLER_ARGS) 63452506Simp{ 63552506Simp int error, i, n; 63652506Simp struct inpcb *inp, **inp_list; 63752506Simp inp_gen_t gencnt; 63852506Simp struct xinpgen xig; 63952506Simp 64052506Simp /* 64152506Simp * The process of preparing the TCB list is too time-consuming and 64252506Simp * resource-intensive to repeat twice on every request. 64352506Simp */ 64452506Simp if (req->oldptr == 0) { 64552506Simp n = V_divcbinfo.ipi_count; 64652506Simp n += imax(n / 8, 10); 64752506Simp req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xinpcb); 64852506Simp return 0; 64952506Simp } 65052506Simp 65152506Simp if (req->newptr != 0) 65252506Simp return EPERM; 65352506Simp 65452506Simp /* 65552506Simp * OK, now we're committed to doing something. 65652506Simp */ 65752506Simp INP_INFO_RLOCK(&V_divcbinfo); 65852506Simp gencnt = V_divcbinfo.ipi_gencnt; 65952506Simp n = V_divcbinfo.ipi_count; 66052506Simp INP_INFO_RUNLOCK(&V_divcbinfo); 66152506Simp 66252506Simp error = sysctl_wire_old_buffer(req, 66352506Simp 2 * sizeof(xig) + n*sizeof(struct xinpcb)); 66452506Simp if (error != 0) 66552506Simp return (error); 66652506Simp 66752506Simp xig.xig_len = sizeof xig; 66852506Simp xig.xig_count = n; 66952506Simp xig.xig_gen = gencnt; 67052506Simp xig.xig_sogen = so_gencnt; 67152506Simp error = SYSCTL_OUT(req, &xig, sizeof xig); 67252506Simp if (error) 67352506Simp return error; 67452506Simp 67552506Simp inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); 67652506Simp if (inp_list == 0) 67752506Simp return ENOMEM; 67852506Simp 67952506Simp INP_INFO_RLOCK(&V_divcbinfo); 68052506Simp for (inp = LIST_FIRST(V_divcbinfo.ipi_listhead), i = 0; inp && i < n; 68152506Simp inp = LIST_NEXT(inp, inp_list)) { 68252506Simp INP_WLOCK(inp); 68352506Simp if (inp->inp_gencnt <= gencnt && 68452506Simp cr_canseeinpcb(req->td->td_ucred, inp) == 0) { 68552506Simp in_pcbref(inp); 68652506Simp inp_list[i++] = inp; 68752506Simp } 68852506Simp INP_WUNLOCK(inp); 68952506Simp } 69052506Simp INP_INFO_RUNLOCK(&V_divcbinfo); 69152506Simp n = i; 69252506Simp 69352506Simp error = 0; 69452506Simp for (i = 0; i < n; i++) { 69552506Simp inp = inp_list[i]; 69652506Simp INP_RLOCK(inp); 69752506Simp if (inp->inp_gencnt <= gencnt) { 69852506Simp struct xinpcb xi; 69952506Simp bzero(&xi, sizeof(xi)); 70052506Simp xi.xi_len = sizeof xi; 70152506Simp /* XXX should avoid extra copy */ 70252506Simp bcopy(inp, &xi.xi_inp, sizeof *inp); 70352506Simp if (inp->inp_socket) 70452506Simp sotoxsocket(inp->inp_socket, &xi.xi_socket); 70552506Simp INP_RUNLOCK(inp); 70652506Simp error = SYSCTL_OUT(req, &xi, sizeof xi); 70752506Simp } else 70852506Simp INP_RUNLOCK(inp); 70952506Simp } 71052506Simp INP_INFO_WLOCK(&V_divcbinfo); 71152506Simp for (i = 0; i < n; i++) { 71252506Simp inp = inp_list[i]; 71352506Simp INP_RLOCK(inp); 71452506Simp if (!in_pcbrele_rlocked(inp)) 71552506Simp INP_RUNLOCK(inp); 71652506Simp } 71752506Simp INP_INFO_WUNLOCK(&V_divcbinfo); 71852506Simp 71952506Simp if (!error) { 72052506Simp /* 72152506Simp * Give the user an updated idea of our state. 72252506Simp * If the generation differs from what we told 72352506Simp * her before, she knows that something happened 72452506Simp * while we were processing this request, and it 72552506Simp * might be necessary to retry. 72652506Simp */ 72752506Simp INP_INFO_RLOCK(&V_divcbinfo); 72852506Simp xig.xig_gen = V_divcbinfo.ipi_gencnt; 72952506Simp xig.xig_sogen = so_gencnt; 73052506Simp xig.xig_count = V_divcbinfo.ipi_count; 73152506Simp INP_INFO_RUNLOCK(&V_divcbinfo); 73252506Simp error = SYSCTL_OUT(req, &xig, sizeof xig); 73352506Simp } 73452506Simp free(inp_list, M_TEMP); 73552506Simp return error; 73652506Simp} 73752506Simp 73852506Simp#ifdef SYSCTL_NODE 73952506Simpstatic SYSCTL_NODE(_net_inet, IPPROTO_DIVERT, divert, CTLFLAG_RW, 0, 74052506Simp "IPDIVERT"); 74152506SimpSYSCTL_PROC(_net_inet_divert, OID_AUTO, pcblist, CTLTYPE_OPAQUE | CTLFLAG_RD, 74252506Simp NULL, 0, div_pcblist, "S,xinpcb", "List of active divert sockets"); 74352506Simp#endif 74452506Simp 74552506Simpstruct pr_usrreqs div_usrreqs = { 74652506Simp .pru_attach = div_attach, 74752506Simp .pru_bind = div_bind, 74852506Simp .pru_control = in_control, 74952506Simp .pru_detach = div_detach, 75052506Simp .pru_peeraddr = in_getpeeraddr, 75152506Simp .pru_send = div_send, 75252506Simp .pru_shutdown = div_shutdown, 75352506Simp .pru_sockaddr = in_getsockaddr, 75452506Simp .pru_sosetlabel = in_pcbsosetlabel 75552506Simp}; 75652506Simp 75752506Simpstruct protosw div_protosw = { 75852506Simp .pr_type = SOCK_RAW, 75967897Sdwmalone .pr_protocol = IPPROTO_DIVERT, 76052506Simp .pr_flags = PR_ATOMIC|PR_ADDR, 76152506Simp .pr_input = div_input, 76252506Simp .pr_ctlinput = div_ctlinput, 76352506Simp .pr_ctloutput = ip_ctloutput, 76452506Simp .pr_init = div_init, 76552506Simp#ifdef VIMAGE 76652506Simp .pr_destroy = div_destroy, 76752506Simp#endif 76852506Simp .pr_usrreqs = &div_usrreqs 76952506Simp}; 77052506Simp 77152506Simpstatic int 77252506Simpdiv_modevent(module_t mod, int type, void *unused) 77352506Simp{ 77452506Simp int err = 0; 77552506Simp#ifndef VIMAGE 77652506Simp int n; 77752506Simp#endif 77852506Simp 77952506Simp switch (type) { 78052506Simp case MOD_LOAD: 78152506Simp /* 78252506Simp * Protocol will be initialized by pf_proto_register(). 78352506Simp * We don't have to register ip_protox because we are not 78452506Simp * a true IP protocol that goes over the wire. 78552506Simp */ 78652506Simp err = pf_proto_register(PF_INET, &div_protosw); 78752506Simp if (err != 0) 78852506Simp return (err); 78952506Simp ip_divert_ptr = divert_packet; 79052506Simp ip_divert_event_tag = EVENTHANDLER_REGISTER(maxsockets_change, 79152506Simp div_zone_change, NULL, EVENTHANDLER_PRI_ANY); 79252506Simp break; 79352506Simp case MOD_QUIESCE: 79452506Simp /* 79552506Simp * IPDIVERT may normally not be unloaded because of the 79667897Sdwmalone * potential race conditions. Tell kldunload we can't be 79752506Simp * unloaded unless the unload is forced. 79852506Simp */ 79952506Simp err = EPERM; 80052506Simp break; 80152506Simp case MOD_UNLOAD: 80252506Simp#ifdef VIMAGE 80352506Simp err = EPERM; 80452506Simp break; 80552506Simp#else 80652506Simp /* 80752506Simp * Forced unload. 80852506Simp * 80952506Simp * Module ipdivert can only be unloaded if no sockets are 81052506Simp * connected. Maybe this can be changed later to forcefully 81152506Simp * disconnect any open sockets. 81252506Simp * 81352506Simp * XXXRW: Note that there is a slight race here, as a new 81452506Simp * socket open request could be spinning on the lock and then 81552506Simp * we destroy the lock. 81652506Simp */ 81752506Simp INP_INFO_WLOCK(&V_divcbinfo); 81852506Simp n = V_divcbinfo.ipi_count; 81952506Simp if (n != 0) { 82052506Simp err = EBUSY; 82152506Simp INP_INFO_WUNLOCK(&V_divcbinfo); 82252506Simp break; 82352506Simp } 82452506Simp ip_divert_ptr = NULL; 82552506Simp err = pf_proto_unregister(PF_INET, IPPROTO_DIVERT, SOCK_RAW); 82652506Simp INP_INFO_WUNLOCK(&V_divcbinfo); 82752506Simp div_destroy(); 82852506Simp EVENTHANDLER_DEREGISTER(maxsockets_change, ip_divert_event_tag); 82952506Simp break; 83052506Simp#endif /* !VIMAGE */ 83152506Simp default: 83252506Simp err = EOPNOTSUPP; 83352506Simp break; 83452506Simp } 83552506Simp return err; 83652506Simp} 83752506Simp 83852506Simpstatic moduledata_t ipdivertmod = { 83952506Simp "ipdivert", 84052506Simp div_modevent, 84152506Simp 0 84252506Simp}; 84352506Simp 84452506SimpDECLARE_MODULE(ipdivert, ipdivertmod, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY); 84552506SimpMODULE_DEPEND(ipdivert, ipfw, 2, 2, 2); 84652506SimpMODULE_VERSION(ipdivert, 1); 84752506Simp