1/*      $OpenBSD: ip_divert.c,v 1.95 2024/03/05 09:45:13 bluhm Exp $ */
2
3/*
4 * Copyright (c) 2009 Michele Marchetto <michele@openbsd.org>
5 *
6 * Permission to use, copy, modify, and distribute this software for any
7 * purpose with or without fee is hereby granted, provided that the above
8 * copyright notice and this permission notice appear in all copies.
9 *
10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17 */
18
19#include <sys/param.h>
20#include <sys/systm.h>
21#include <sys/mbuf.h>
22#include <sys/protosw.h>
23#include <sys/socket.h>
24#include <sys/socketvar.h>
25#include <sys/sysctl.h>
26
27#include <net/if.h>
28#include <net/route.h>
29#include <net/if_var.h>
30#include <net/netisr.h>
31
32#include <netinet/in.h>
33#include <netinet/in_var.h>
34#include <netinet/ip.h>
35#include <netinet/ip_var.h>
36#include <netinet/in_pcb.h>
37#include <netinet/ip_divert.h>
38#include <netinet/tcp.h>
39#include <netinet/udp.h>
40#include <netinet/ip_icmp.h>
41
42#include <net/pfvar.h>
43
/* PCB table for divert sockets; walked by divert_packet() to find a port. */
struct	inpcbtable	divbtable;
/* Statistics counters, allocated in divert_init(). */
struct	cpumem		*divcounters;

/*
 * Send and receive buffer sizes handed to soreserve() in divert_attach().
 * The compile-time defaults may be overridden by predefining the macros.
 */
#ifndef DIVERT_SENDSPACE
#define DIVERT_SENDSPACE	(65536 + 100)
#endif
u_int   divert_sendspace = DIVERT_SENDSPACE;
#ifndef DIVERT_RECVSPACE
#define DIVERT_RECVSPACE	(65536 + 100)
#endif
u_int   divert_recvspace = DIVERT_RECVSPACE;

/* Initial value of divbhashsize, the hash size passed to in_pcbinit(). */
#ifndef DIVERTHASHSIZE
#define DIVERTHASHSIZE	128
#endif

/*
 * Integer sysctl variables and their accepted range (0 .. INT_MAX),
 * served through sysctl_bounded_arr() in divert_sysctl().
 */
const struct sysctl_bounded_args divertctl_vars[] = {
	{ DIVERTCTL_RECVSPACE, &divert_recvspace, 0, INT_MAX },
	{ DIVERTCTL_SENDSPACE, &divert_sendspace, 0, INT_MAX },
};
64
/*
 * User request dispatch table for divert sockets.  Attach, detach, locking,
 * bind, shutdown and send are implemented in this file; control and the
 * address queries are delegated to the generic in_control(), in_sockaddr()
 * and in_peeraddr() routines.
 */
const struct pr_usrreqs divert_usrreqs = {
	.pru_attach	= divert_attach,
	.pru_detach	= divert_detach,
	.pru_lock	= divert_lock,
	.pru_unlock	= divert_unlock,
	.pru_locked	= divert_locked,
	.pru_bind	= divert_bind,
	.pru_shutdown	= divert_shutdown,
	.pru_send	= divert_send,
	.pru_control	= in_control,
	.pru_sockaddr	= in_sockaddr,
	.pru_peeraddr	= in_peeraddr,
};
78
/* Hash size given to in_pcbinit() when divbtable is set up. */
int divbhashsize = DIVERTHASHSIZE;

/* Defined below; reached from the socket layer through divert_send(). */
int	divert_output(struct inpcb *, struct mbuf *, struct mbuf *,
	    struct mbuf *);
/*
 * Initialize the divert PCB table with divbhashsize and allocate the
 * divs_ncounters statistics counters.
 */
void
divert_init(void)
{
	in_pcbinit(&divbtable, divbhashsize);
	divcounters = counters_alloc(divs_ncounters);
}
89
90int
91divert_output(struct inpcb *inp, struct mbuf *m, struct mbuf *nam,
92    struct mbuf *control)
93{
94	struct sockaddr_in *sin;
95	int error, min_hdrlen, off, dir;
96	struct ip *ip;
97
98	m_freem(control);
99
100	if ((error = in_nam2sin(nam, &sin)))
101		goto fail;
102
103	if (m->m_pkthdr.len > IP_MAXPACKET) {
104		error = EMSGSIZE;
105		goto fail;
106	}
107
108	m = rip_chkhdr(m, NULL);
109	if (m == NULL) {
110		error = EINVAL;
111		goto fail;
112	}
113
114	ip = mtod(m, struct ip *);
115	off = ip->ip_hl << 2;
116
117	dir = (sin->sin_addr.s_addr == INADDR_ANY ? PF_OUT : PF_IN);
118
119	switch (ip->ip_p) {
120	case IPPROTO_TCP:
121		min_hdrlen = sizeof(struct tcphdr);
122		m->m_pkthdr.csum_flags |= M_TCP_CSUM_OUT;
123		break;
124	case IPPROTO_UDP:
125		min_hdrlen = sizeof(struct udphdr);
126		m->m_pkthdr.csum_flags |= M_UDP_CSUM_OUT;
127		break;
128	case IPPROTO_ICMP:
129		min_hdrlen = ICMP_MINLEN;
130		m->m_pkthdr.csum_flags |= M_ICMP_CSUM_OUT;
131		break;
132	default:
133		min_hdrlen = 0;
134		break;
135	}
136	if (min_hdrlen && m->m_pkthdr.len < off + min_hdrlen) {
137		error = EINVAL;
138		goto fail;
139	}
140
141	m->m_pkthdr.pf.flags |= PF_TAG_DIVERTED_PACKET;
142
143	if (dir == PF_IN) {
144		struct rtentry *rt;
145		struct ifnet *ifp;
146
147		rt = rtalloc(sintosa(sin), 0, inp->inp_rtableid);
148		if (!rtisvalid(rt) || !ISSET(rt->rt_flags, RTF_LOCAL)) {
149			rtfree(rt);
150			error = EADDRNOTAVAIL;
151			goto fail;
152		}
153		m->m_pkthdr.ph_ifidx = rt->rt_ifidx;
154		rtfree(rt);
155
156		/*
157		 * Recalculate IP and protocol checksums for the inbound packet
158		 * since the userspace application may have modified the packet
159		 * prior to reinjection.
160		 */
161		in_hdr_cksum_out(m, NULL);
162		in_proto_cksum_out(m, NULL);
163
164		ifp = if_get(m->m_pkthdr.ph_ifidx);
165		if (ifp == NULL) {
166			error = ENETDOWN;
167			goto fail;
168		}
169		ipv4_input(ifp, m);
170		if_put(ifp);
171	} else {
172		m->m_pkthdr.ph_rtableid = inp->inp_rtableid;
173
174		error = ip_output(m, NULL, &inp->inp_route,
175		    IP_ALLOWBROADCAST | IP_RAWOUTPUT, NULL, NULL, 0);
176	}
177
178	divstat_inc(divs_opackets);
179	return (error);
180
181fail:
182	m_freem(m);
183	divstat_inc(divs_errors);
184	return (error);
185}
186
187void
188divert_packet(struct mbuf *m, int dir, u_int16_t divert_port)
189{
190	struct inpcb *inp = NULL;
191	struct socket *so;
192	struct sockaddr_in sin;
193
194	divstat_inc(divs_ipackets);
195
196	if (m->m_len < sizeof(struct ip) &&
197	    (m = m_pullup(m, sizeof(struct ip))) == NULL) {
198		divstat_inc(divs_errors);
199		goto bad;
200	}
201
202	mtx_enter(&divbtable.inpt_mtx);
203	TAILQ_FOREACH(inp, &divbtable.inpt_queue, inp_queue) {
204		if (inp->inp_lport != divert_port)
205			continue;
206		in_pcbref(inp);
207		break;
208	}
209	mtx_leave(&divbtable.inpt_mtx);
210	if (inp == NULL) {
211		divstat_inc(divs_noport);
212		goto bad;
213	}
214
215	memset(&sin, 0, sizeof(sin));
216	sin.sin_family = AF_INET;
217	sin.sin_len = sizeof(sin);
218
219	if (dir == PF_IN) {
220		struct ifaddr *ifa;
221		struct ifnet *ifp;
222
223		ifp = if_get(m->m_pkthdr.ph_ifidx);
224		if (ifp == NULL) {
225			divstat_inc(divs_errors);
226			goto bad;
227		}
228		TAILQ_FOREACH(ifa, &ifp->if_addrlist, ifa_list) {
229			if (ifa->ifa_addr->sa_family != AF_INET)
230				continue;
231			sin.sin_addr = satosin(ifa->ifa_addr)->sin_addr;
232			break;
233		}
234		if_put(ifp);
235	} else {
236		/*
237		 * Calculate IP and protocol checksums for outbound packet
238		 * diverted to userland.  pf rule diverts before cksum offload.
239		 */
240		in_hdr_cksum_out(m, NULL);
241		in_proto_cksum_out(m, NULL);
242	}
243
244	so = inp->inp_socket;
245	mtx_enter(&so->so_rcv.sb_mtx);
246	if (sbappendaddr(so, &so->so_rcv, sintosa(&sin), m, NULL) == 0) {
247		mtx_leave(&so->so_rcv.sb_mtx);
248		divstat_inc(divs_fullsock);
249		goto bad;
250	}
251	mtx_leave(&so->so_rcv.sb_mtx);
252	sorwakeup(so);
253
254	in_pcbunref(inp);
255	return;
256
257 bad:
258	if (inp != NULL)
259		in_pcbunref(inp);
260	m_freem(m);
261}
262
263int
264divert_attach(struct socket *so, int proto, int wait)
265{
266	int error;
267
268	if (so->so_pcb != NULL)
269		return EINVAL;
270	if ((so->so_state & SS_PRIV) == 0)
271		return EACCES;
272
273	error = in_pcballoc(so, &divbtable, wait);
274	if (error)
275		return error;
276
277	error = soreserve(so, divert_sendspace, divert_recvspace);
278	if (error)
279		return error;
280
281	sotoinpcb(so)->inp_flags |= INP_HDRINCL;
282	return (0);
283}
284
285int
286divert_detach(struct socket *so)
287{
288	struct inpcb *inp = sotoinpcb(so);
289
290	soassertlocked(so);
291
292	if (inp == NULL)
293		return (EINVAL);
294
295	in_pcbdetach(inp);
296	return (0);
297}
298
299void
300divert_lock(struct socket *so)
301{
302	struct inpcb *inp = sotoinpcb(so);
303
304	NET_ASSERT_LOCKED();
305	mtx_enter(&inp->inp_mtx);
306}
307
308void
309divert_unlock(struct socket *so)
310{
311	struct inpcb *inp = sotoinpcb(so);
312
313	NET_ASSERT_LOCKED();
314	mtx_leave(&inp->inp_mtx);
315}
316
317int
318divert_locked(struct socket *so)
319{
320	struct inpcb *inp = sotoinpcb(so);
321
322	return mtx_owned(&inp->inp_mtx);
323}
324
/*
 * Bind the divert socket to the address in addr by delegating to
 * in_pcbbind(); its result is returned unchanged.  The socket must be
 * locked.
 */
int
divert_bind(struct socket *so, struct mbuf *addr, struct proc *p)
{
	struct inpcb *pcb;

	pcb = sotoinpcb(so);
	soassertlocked(so);

	return in_pcbbind(pcb, addr, p);
}
333
/*
 * Shut down the sending side of the socket via socantsendmore().
 * Always returns 0.  The socket must be locked.
 */
int
divert_shutdown(struct socket *so)
{
	soassertlocked(so);

	socantsendmore(so);

	return (0);
}
341
/*
 * Send entry point: pass the packet, destination address and control
 * mbufs on to divert_output() for the socket's PCB and return its result.
 * The socket must be locked.
 */
int
divert_send(struct socket *so, struct mbuf *m, struct mbuf *addr,
    struct mbuf *control)
{
	struct inpcb *pcb;
	int rv;

	pcb = sotoinpcb(so);
	soassertlocked(so);

	rv = divert_output(pcb, m, addr, control);
	return (rv);
}
351
352int
353divert_sysctl_divstat(void *oldp, size_t *oldlenp, void *newp)
354{
355	uint64_t counters[divs_ncounters];
356	struct divstat divstat;
357	u_long *words = (u_long *)&divstat;
358	int i;
359
360	CTASSERT(sizeof(divstat) == (nitems(counters) * sizeof(u_long)));
361	memset(&divstat, 0, sizeof divstat);
362	counters_read(divcounters, counters, nitems(counters), NULL);
363
364	for (i = 0; i < nitems(counters); i++)
365		words[i] = (u_long)counters[i];
366
367	return (sysctl_rdstruct(oldp, oldlenp, newp,
368	    &divstat, sizeof(divstat)));
369}
370
371/*
372 * Sysctl for divert variables.
373 */
374int
375divert_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp,
376    size_t newlen)
377{
378	int error;
379
380	/* All sysctl names at this level are terminal. */
381	if (namelen != 1)
382		return (ENOTDIR);
383
384	switch (name[0]) {
385	case DIVERTCTL_STATS:
386		return (divert_sysctl_divstat(oldp, oldlenp, newp));
387	default:
388		NET_LOCK();
389		error = sysctl_bounded_arr(divertctl_vars,
390		    nitems(divertctl_vars), name, namelen, oldp, oldlenp, newp,
391		    newlen);
392		NET_UNLOCK();
393		return (error);
394	}
395	/* NOTREACHED */
396}
397