1/* $OpenBSD: ip_divert.c,v 1.95 2024/03/05 09:45:13 bluhm Exp $ */ 2 3/* 4 * Copyright (c) 2009 Michele Marchetto <michele@openbsd.org> 5 * 6 * Permission to use, copy, modify, and distribute this software for any 7 * purpose with or without fee is hereby granted, provided that the above 8 * copyright notice and this permission notice appear in all copies. 9 * 10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 17 */ 18 19#include <sys/param.h> 20#include <sys/systm.h> 21#include <sys/mbuf.h> 22#include <sys/protosw.h> 23#include <sys/socket.h> 24#include <sys/socketvar.h> 25#include <sys/sysctl.h> 26 27#include <net/if.h> 28#include <net/route.h> 29#include <net/if_var.h> 30#include <net/netisr.h> 31 32#include <netinet/in.h> 33#include <netinet/in_var.h> 34#include <netinet/ip.h> 35#include <netinet/ip_var.h> 36#include <netinet/in_pcb.h> 37#include <netinet/ip_divert.h> 38#include <netinet/tcp.h> 39#include <netinet/udp.h> 40#include <netinet/ip_icmp.h> 41 42#include <net/pfvar.h> 43 44struct inpcbtable divbtable; 45struct cpumem *divcounters; 46 47#ifndef DIVERT_SENDSPACE 48#define DIVERT_SENDSPACE (65536 + 100) 49#endif 50u_int divert_sendspace = DIVERT_SENDSPACE; 51#ifndef DIVERT_RECVSPACE 52#define DIVERT_RECVSPACE (65536 + 100) 53#endif 54u_int divert_recvspace = DIVERT_RECVSPACE; 55 56#ifndef DIVERTHASHSIZE 57#define DIVERTHASHSIZE 128 58#endif 59 60const struct sysctl_bounded_args divertctl_vars[] = { 61 { DIVERTCTL_RECVSPACE, &divert_recvspace, 0, INT_MAX }, 62 { DIVERTCTL_SENDSPACE, &divert_sendspace, 0, INT_MAX }, 63}; 64 65const struct pr_usrreqs divert_usrreqs = { 66 .pru_attach = divert_attach, 67 .pru_detach = divert_detach, 68 .pru_lock = divert_lock, 69 .pru_unlock = divert_unlock, 70 .pru_locked = divert_locked, 71 .pru_bind = divert_bind, 72 .pru_shutdown = divert_shutdown, 73 .pru_send = divert_send, 74 .pru_control = in_control, 75 .pru_sockaddr = in_sockaddr, 76 .pru_peeraddr = in_peeraddr, 77}; 78 79int divbhashsize = DIVERTHASHSIZE; 80 81int divert_output(struct inpcb *, struct mbuf *, struct mbuf *, 82 struct mbuf *); 83void 84divert_init(void) 85{ 86 in_pcbinit(&divbtable, divbhashsize); 87 divcounters = counters_alloc(divs_ncounters); 88} 89 90int 91divert_output(struct inpcb *inp, struct mbuf *m, struct mbuf *nam, 92 struct mbuf *control) 93{ 94 struct sockaddr_in *sin; 95 int error, min_hdrlen, off, dir; 96 struct ip *ip; 97 98 m_freem(control); 99 100 if ((error = in_nam2sin(nam, &sin))) 101 goto fail; 102 103 if (m->m_pkthdr.len > IP_MAXPACKET) { 104 error = EMSGSIZE; 105 goto fail; 106 } 107 108 m = rip_chkhdr(m, NULL); 109 if (m == NULL) { 110 error = EINVAL; 111 goto fail; 112 } 113 114 ip = mtod(m, struct ip *); 115 off = ip->ip_hl << 2; 116 117 dir = (sin->sin_addr.s_addr == INADDR_ANY ? PF_OUT : PF_IN); 118 119 switch (ip->ip_p) { 120 case IPPROTO_TCP: 121 min_hdrlen = sizeof(struct tcphdr); 122 m->m_pkthdr.csum_flags |= M_TCP_CSUM_OUT; 123 break; 124 case IPPROTO_UDP: 125 min_hdrlen = sizeof(struct udphdr); 126 m->m_pkthdr.csum_flags |= M_UDP_CSUM_OUT; 127 break; 128 case IPPROTO_ICMP: 129 min_hdrlen = ICMP_MINLEN; 130 m->m_pkthdr.csum_flags |= M_ICMP_CSUM_OUT; 131 break; 132 default: 133 min_hdrlen = 0; 134 break; 135 } 136 if (min_hdrlen && m->m_pkthdr.len < off + min_hdrlen) { 137 error = EINVAL; 138 goto fail; 139 } 140 141 m->m_pkthdr.pf.flags |= PF_TAG_DIVERTED_PACKET; 142 143 if (dir == PF_IN) { 144 struct rtentry *rt; 145 struct ifnet *ifp; 146 147 rt = rtalloc(sintosa(sin), 0, inp->inp_rtableid); 148 if (!rtisvalid(rt) || !ISSET(rt->rt_flags, RTF_LOCAL)) { 149 rtfree(rt); 150 error = EADDRNOTAVAIL; 151 goto fail; 152 } 153 m->m_pkthdr.ph_ifidx = rt->rt_ifidx; 154 rtfree(rt); 155 156 /* 157 * Recalculate IP and protocol checksums for the inbound packet 158 * since the userspace application may have modified the packet 159 * prior to reinjection. 160 */ 161 in_hdr_cksum_out(m, NULL); 162 in_proto_cksum_out(m, NULL); 163 164 ifp = if_get(m->m_pkthdr.ph_ifidx); 165 if (ifp == NULL) { 166 error = ENETDOWN; 167 goto fail; 168 } 169 ipv4_input(ifp, m); 170 if_put(ifp); 171 } else { 172 m->m_pkthdr.ph_rtableid = inp->inp_rtableid; 173 174 error = ip_output(m, NULL, &inp->inp_route, 175 IP_ALLOWBROADCAST | IP_RAWOUTPUT, NULL, NULL, 0); 176 } 177 178 divstat_inc(divs_opackets); 179 return (error); 180 181fail: 182 m_freem(m); 183 divstat_inc(divs_errors); 184 return (error); 185} 186 187void 188divert_packet(struct mbuf *m, int dir, u_int16_t divert_port) 189{ 190 struct inpcb *inp = NULL; 191 struct socket *so; 192 struct sockaddr_in sin; 193 194 divstat_inc(divs_ipackets); 195 196 if (m->m_len < sizeof(struct ip) && 197 (m = m_pullup(m, sizeof(struct ip))) == NULL) { 198 divstat_inc(divs_errors); 199 goto bad; 200 } 201 202 mtx_enter(&divbtable.inpt_mtx); 203 TAILQ_FOREACH(inp, &divbtable.inpt_queue, inp_queue) { 204 if (inp->inp_lport != divert_port) 205 continue; 206 in_pcbref(inp); 207 break; 208 } 209 mtx_leave(&divbtable.inpt_mtx); 210 if (inp == NULL) { 211 divstat_inc(divs_noport); 212 goto bad; 213 } 214 215 memset(&sin, 0, sizeof(sin)); 216 sin.sin_family = AF_INET; 217 sin.sin_len = sizeof(sin); 218 219 if (dir == PF_IN) { 220 struct ifaddr *ifa; 221 struct ifnet *ifp; 222 223 ifp = if_get(m->m_pkthdr.ph_ifidx); 224 if (ifp == NULL) { 225 divstat_inc(divs_errors); 226 goto bad; 227 } 228 TAILQ_FOREACH(ifa, &ifp->if_addrlist, ifa_list) { 229 if (ifa->ifa_addr->sa_family != AF_INET) 230 continue; 231 sin.sin_addr = satosin(ifa->ifa_addr)->sin_addr; 232 break; 233 } 234 if_put(ifp); 235 } else { 236 /* 237 * Calculate IP and protocol checksums for outbound packet 238 * diverted to userland. pf rule diverts before cksum offload. 239 */ 240 in_hdr_cksum_out(m, NULL); 241 in_proto_cksum_out(m, NULL); 242 } 243 244 so = inp->inp_socket; 245 mtx_enter(&so->so_rcv.sb_mtx); 246 if (sbappendaddr(so, &so->so_rcv, sintosa(&sin), m, NULL) == 0) { 247 mtx_leave(&so->so_rcv.sb_mtx); 248 divstat_inc(divs_fullsock); 249 goto bad; 250 } 251 mtx_leave(&so->so_rcv.sb_mtx); 252 sorwakeup(so); 253 254 in_pcbunref(inp); 255 return; 256 257 bad: 258 if (inp != NULL) 259 in_pcbunref(inp); 260 m_freem(m); 261} 262 263int 264divert_attach(struct socket *so, int proto, int wait) 265{ 266 int error; 267 268 if (so->so_pcb != NULL) 269 return EINVAL; 270 if ((so->so_state & SS_PRIV) == 0) 271 return EACCES; 272 273 error = in_pcballoc(so, &divbtable, wait); 274 if (error) 275 return error; 276 277 error = soreserve(so, divert_sendspace, divert_recvspace); 278 if (error) 279 return error; 280 281 sotoinpcb(so)->inp_flags |= INP_HDRINCL; 282 return (0); 283} 284 285int 286divert_detach(struct socket *so) 287{ 288 struct inpcb *inp = sotoinpcb(so); 289 290 soassertlocked(so); 291 292 if (inp == NULL) 293 return (EINVAL); 294 295 in_pcbdetach(inp); 296 return (0); 297} 298 299void 300divert_lock(struct socket *so) 301{ 302 struct inpcb *inp = sotoinpcb(so); 303 304 NET_ASSERT_LOCKED(); 305 mtx_enter(&inp->inp_mtx); 306} 307 308void 309divert_unlock(struct socket *so) 310{ 311 struct inpcb *inp = sotoinpcb(so); 312 313 NET_ASSERT_LOCKED(); 314 mtx_leave(&inp->inp_mtx); 315} 316 317int 318divert_locked(struct socket *so) 319{ 320 struct inpcb *inp = sotoinpcb(so); 321 322 return mtx_owned(&inp->inp_mtx); 323} 324 325int 326divert_bind(struct socket *so, struct mbuf *addr, struct proc *p) 327{ 328 struct inpcb *inp = sotoinpcb(so); 329 330 soassertlocked(so); 331 return in_pcbbind(inp, addr, p); 332} 333 334int 335divert_shutdown(struct socket *so) 336{ 337 soassertlocked(so); 338 socantsendmore(so); 339 return (0); 340} 341 342int 343divert_send(struct socket *so, struct mbuf *m, struct mbuf *addr, 344 struct mbuf *control) 345{ 346 struct inpcb *inp = sotoinpcb(so); 347 348 soassertlocked(so); 349 return (divert_output(inp, m, addr, control)); 350} 351 352int 353divert_sysctl_divstat(void *oldp, size_t *oldlenp, void *newp) 354{ 355 uint64_t counters[divs_ncounters]; 356 struct divstat divstat; 357 u_long *words = (u_long *)&divstat; 358 int i; 359 360 CTASSERT(sizeof(divstat) == (nitems(counters) * sizeof(u_long))); 361 memset(&divstat, 0, sizeof divstat); 362 counters_read(divcounters, counters, nitems(counters), NULL); 363 364 for (i = 0; i < nitems(counters); i++) 365 words[i] = (u_long)counters[i]; 366 367 return (sysctl_rdstruct(oldp, oldlenp, newp, 368 &divstat, sizeof(divstat))); 369} 370 371/* 372 * Sysctl for divert variables. 373 */ 374int 375divert_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp, 376 size_t newlen) 377{ 378 int error; 379 380 /* All sysctl names at this level are terminal. */ 381 if (namelen != 1) 382 return (ENOTDIR); 383 384 switch (name[0]) { 385 case DIVERTCTL_STATS: 386 return (divert_sysctl_divstat(oldp, oldlenp, newp)); 387 default: 388 NET_LOCK(); 389 error = sysctl_bounded_arr(divertctl_vars, 390 nitems(divertctl_vars), name, namelen, oldp, oldlenp, newp, 391 newlen); 392 NET_UNLOCK(); 393 return (error); 394 } 395 /* NOTREACHED */ 396} 397