1/*-
2 * SPDX-License-Identifier: BSD-3-Clause
3 *
4 * Copyright (c) 2003 Andre Oppermann, Internet Business Solutions AG
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 * 3. The name of the author may not be used to endorse or promote
16 *    products derived from this software without specific prior written
17 *    permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29 * SUCH DAMAGE.
30 */
31
32/*
33 * ip_fastforward gets its speed from processing the forwarded packet to
34 * completion (if_output on the other side) without any queues or netisr's.
 * The receiving interface DMAs the packet into memory, the upper half of
 * the driver calls ip_fastforward, we do our routing table lookup and
 * directly send it off to the outgoing interface, which DMAs the packet to
 * the network card. The only part of the packet we touch with the CPU is the
39 * IP header (unless there are complex firewall rules touching other parts
40 * of the packet, but that is up to you). We are essentially limited by bus
41 * bandwidth and how fast the network card/driver can set up receives and
42 * transmits.
43 *
44 * We handle basic errors, IP header errors, checksum errors,
45 * destination unreachable, fragmentation and fragmentation needed and
46 * report them via ICMP to the sender.
47 *
48 * Else if something is not pure IPv4 unicast forwarding we fall back to
49 * the normal ip_input processing path. We should only be called from
50 * interfaces connected to the outside world.
51 *
52 * Firewalling is fully supported including divert, ipfw fwd and ipfilter
53 * ipnat and address rewrite.
54 *
55 * IPSEC is not supported if this host is a tunnel broker. IPSEC is
56 * supported for connections to/from local host.
57 *
58 * We try to do the least expensive (in CPU ops) checks and operations
59 * first to catch junk with as little overhead as possible.
60 *
61 * We take full advantage of hardware support for IP checksum and
62 * fragmentation offloading.
63 *
 * Historically we did not send ICMP redirects in the fast forwarding path.
 * I have had my own cases where two core routers with Zebra routing suite
 * would send millions of ICMP redirects to connected hosts if the destination
 * router was not the default gateway. In one case it was filling the routing
 * table of a host with approximately 300.000 cloned redirect entries until it
 * ran out of kernel memory. However the networking code proved very robust
 * and it didn't crash or fail in other ways. Redirects are now generated
 * here only when the ipsendredirects knob is enabled and the packet leaves
 * through the interface it arrived on.
71 */
72
73/*
74 * Many thanks to Matt Thomas of NetBSD for basic structure of ip_flow.c which
75 * is being followed here.
76 */
77
78#include <sys/cdefs.h>
79__FBSDID("$FreeBSD$");
80
81#include "opt_ipstealth.h"
82
83#include <sys/param.h>
84#include <sys/systm.h>
85#include <sys/kernel.h>
86#include <sys/malloc.h>
87#include <sys/mbuf.h>
88#include <sys/protosw.h>
89#include <sys/sdt.h>
90#include <sys/socket.h>
91#include <sys/sysctl.h>
92
93#include <net/pfil.h>
94#include <net/if.h>
95#include <net/if_types.h>
96#include <net/if_var.h>
97#include <net/if_dl.h>
98#include <net/route.h>
99#include <net/vnet.h>
100
101#include <netinet/in.h>
102#include <netinet/in_fib.h>
103#include <netinet/in_kdtrace.h>
104#include <netinet/in_systm.h>
105#include <netinet/in_var.h>
106#include <netinet/ip.h>
107#include <netinet/ip_var.h>
108#include <netinet/ip_icmp.h>
109#include <netinet/ip_options.h>
110
111#include <machine/in_cksum.h>
112
/*
 * Shorthand for the per-vnet ipsendredirects variable.  ip_tryforward()
 * tests it before building an ICMP redirect for a forwarded packet.
 */
#define	V_ipsendredirects	VNET(ipsendredirects)
114
115struct mbuf *
116ip_redir_alloc(struct mbuf *m, struct ip *ip, struct in_addr dest,
117    in_addr_t *addr);
118
119
120struct mbuf *
121ip_redir_alloc(struct mbuf *m, struct ip *ip, struct in_addr dest,
122    in_addr_t *addr)
123{
124	struct sockaddr_in s;
125	struct nhop4_extended nh;
126	struct mbuf *mcopy = m_gethdr(M_NOWAIT, m->m_type);
127
128	if (mcopy == NULL)
129		return (NULL);
130
131	if (fib4_lookup_nh_ext(M_GETFIB(m), dest, 0, 0, &nh) != 0)
132		return (NULL);
133
134	if (m_dup_pkthdr(mcopy, m, M_NOWAIT) == 0) {
135		/*
136		 * It's probably ok if the pkthdr dup fails (because
137		 * the deep copy of the tag chain failed), but for now
138		 * be conservative and just discard the copy since
139		 * code below may some day want the tags.
140		 */
141		m_free(mcopy);
142		return (NULL);
143	}
144	mcopy->m_len = min(ntohs(ip->ip_len), M_TRAILINGSPACE(mcopy));
145	mcopy->m_pkthdr.len = mcopy->m_len;
146	m_copydata(m, 0, mcopy->m_len, mtod(mcopy, caddr_t));
147
148	s.sin_len = sizeof(struct sockaddr_in);
149	s.sin_family= AF_INET;
150	s.sin_addr = nh.nh_src;
151
152	if (((nh.nh_flags & (NHF_REDIRECT|NHF_DEFAULT)) == 0)) {
153		struct in_ifaddr *nh_ia = (struct in_ifaddr *)ifaof_ifpforaddr((struct sockaddr *)&s, nh.nh_ifp);
154		u_long src = ntohl(ip->ip_src.s_addr);
155
156		if (nh_ia != NULL && (src & nh_ia->ia_subnetmask) == nh_ia->ia_subnet) {
157			if (nh.nh_flags & NHF_GATEWAY)
158				*addr = nh.nh_addr.s_addr;
159			else
160				*addr = ip->ip_dst.s_addr;
161		}
162	}
163
164
165	return (mcopy);
166}
167
168
169static int
170ip_findroute(struct nhop4_basic *pnh, struct in_addr dest, struct mbuf *m)
171{
172
173	bzero(pnh, sizeof(*pnh));
174	if (fib4_lookup_nh_basic(M_GETFIB(m), dest, 0, 0, pnh) != 0) {
175		IPSTAT_INC(ips_noroute);
176		IPSTAT_INC(ips_cantforward);
177		icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, 0, 0);
178		return (EHOSTUNREACH);
179	}
180	/*
181	 * Drop blackholed traffic and directed broadcasts.
182	 */
183	if ((pnh->nh_flags & (NHF_BLACKHOLE | NHF_BROADCAST)) != 0) {
184		IPSTAT_INC(ips_cantforward);
185		m_freem(m);
186		return (EHOSTUNREACH);
187	}
188
189	if (pnh->nh_flags & NHF_REJECT) {
190		IPSTAT_INC(ips_cantforward);
191		icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, 0, 0);
192		return (EHOSTUNREACH);
193	}
194
195	return (0);
196}
197
198/*
199 * Try to forward a packet based on the destination address.
200 * This is a fast path optimized for the plain forwarding case.
201 * If the packet is handled (and consumed) here then we return NULL;
202 * otherwise mbuf is returned and the packet should be delivered
203 * to ip_input for full processing.
204 */
205struct mbuf *
206ip_tryforward(struct mbuf *m)
207{
208	struct ip *ip;
209	struct mbuf *m0 = NULL;
210	struct nhop4_basic nh;
211	struct sockaddr_in dst;
212	struct in_addr dest, odest, rtdest;
213	uint16_t ip_len, ip_off;
214	int error = 0;
215	struct m_tag *fwd_tag = NULL;
216	struct mbuf *mcopy = NULL;
217	struct in_addr redest;
218	/*
219	 * Are we active and forwarding packets?
220	 */
221
222	M_ASSERTVALID(m);
223	M_ASSERTPKTHDR(m);
224
225#ifdef ALTQ
226	/*
227	 * Is packet dropped by traffic conditioner?
228	 */
229	if (altq_input != NULL && (*altq_input)(m, AF_INET) == 0)
230		goto drop;
231#endif
232
233	/*
234	 * Only IP packets without options
235	 */
236	ip = mtod(m, struct ip *);
237
238	if (ip->ip_hl != (sizeof(struct ip) >> 2)) {
239		if (V_ip_doopts == 1)
240			return m;
241		else if (V_ip_doopts == 2) {
242			icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_FILTER_PROHIB,
243				0, 0);
244			return NULL;	/* mbuf already free'd */
245		}
246		/* else ignore IP options and continue */
247	}
248
249	/*
250	 * Only unicast IP, not from loopback, no L2 or IP broadcast,
251	 * no multicast, no INADDR_ANY
252	 *
253	 * XXX: Probably some of these checks could be direct drop
254	 * conditions.  However it is not clear whether there are some
255	 * hacks or obscure behaviours which make it necessary to
256	 * let ip_input handle it.  We play safe here and let ip_input
257	 * deal with it until it is proven that we can directly drop it.
258	 */
259	if ((m->m_flags & (M_BCAST|M_MCAST)) ||
260	    (m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK) ||
261	    ntohl(ip->ip_src.s_addr) == (u_long)INADDR_BROADCAST ||
262	    ntohl(ip->ip_dst.s_addr) == (u_long)INADDR_BROADCAST ||
263	    IN_MULTICAST(ntohl(ip->ip_src.s_addr)) ||
264	    IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
265	    IN_LINKLOCAL(ntohl(ip->ip_src.s_addr)) ||
266	    IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr)) ||
267	    ip->ip_src.s_addr == INADDR_ANY ||
268	    ip->ip_dst.s_addr == INADDR_ANY )
269		return m;
270
271	/*
272	 * Is it for a local address on this host?
273	 */
274	if (in_localip(ip->ip_dst))
275		return m;
276
277	IPSTAT_INC(ips_total);
278
279	/*
280	 * Step 3: incoming packet firewall processing
281	 */
282
283	odest.s_addr = dest.s_addr = ip->ip_dst.s_addr;
284
285	/*
286	 * Run through list of ipfilter hooks for input packets
287	 */
288	if (!PFIL_HOOKED(&V_inet_pfil_hook))
289		goto passin;
290
291	if (pfil_run_hooks(
292	    &V_inet_pfil_hook, &m, m->m_pkthdr.rcvif, PFIL_IN, 0, NULL) ||
293	    m == NULL)
294		goto drop;
295
296	M_ASSERTVALID(m);
297	M_ASSERTPKTHDR(m);
298
299	ip = mtod(m, struct ip *);	/* m may have changed by pfil hook */
300	dest.s_addr = ip->ip_dst.s_addr;
301
302	/*
303	 * Destination address changed?
304	 */
305	if (odest.s_addr != dest.s_addr) {
306		/*
307		 * Is it now for a local address on this host?
308		 */
309		if (in_localip(dest))
310			goto forwardlocal;
311		/*
312		 * Go on with new destination address
313		 */
314	}
315
316	if (m->m_flags & M_FASTFWD_OURS) {
317		/*
318		 * ipfw changed it for a local address on this host.
319		 */
320		goto forwardlocal;
321	}
322
323passin:
324	/*
325	 * Step 4: decrement TTL and look up route
326	 */
327
328	/*
329	 * Check TTL
330	 */
331#ifdef IPSTEALTH
332	if (!V_ipstealth) {
333#endif
334	if (ip->ip_ttl <= IPTTLDEC) {
335		icmp_error(m, ICMP_TIMXCEED, ICMP_TIMXCEED_INTRANS, 0, 0);
336		return NULL;	/* mbuf already free'd */
337	}
338
339	/*
340	 * Decrement the TTL and incrementally change the IP header checksum.
341	 * Don't bother doing this with hw checksum offloading, it's faster
342	 * doing it right here.
343	 */
344	ip->ip_ttl -= IPTTLDEC;
345	if (ip->ip_sum >= (u_int16_t) ~htons(IPTTLDEC << 8))
346		ip->ip_sum -= ~htons(IPTTLDEC << 8);
347	else
348		ip->ip_sum += htons(IPTTLDEC << 8);
349#ifdef IPSTEALTH
350	}
351#endif
352
353	/*
354	 * Next hop forced by pfil(9) hook?
355	 */
356	if ((m->m_flags & M_IP_NEXTHOP) &&
357	    ((fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL)) != NULL)) {
358		/*
359		 * Now we will find route to forced destination.
360		 */
361		dest.s_addr = ((struct sockaddr_in *)
362			    (fwd_tag + 1))->sin_addr.s_addr;
363		m_tag_delete(m, fwd_tag);
364		m->m_flags &= ~M_IP_NEXTHOP;
365	}
366
367	/*
368	 * Find route to destination.
369	 */
370	if (ip_findroute(&nh, dest, m) != 0)
371		return (NULL);	/* icmp unreach already sent */
372
373	/*
374	 * Avoid second route lookup by caching destination.
375	 */
376	rtdest.s_addr = dest.s_addr;
377
378	/*
379	 * Step 5: outgoing firewall packet processing
380	 */
381	if (!PFIL_HOOKED(&V_inet_pfil_hook))
382		goto passout;
383
384	if (pfil_run_hooks(&V_inet_pfil_hook, &m, nh.nh_ifp, PFIL_OUT, PFIL_FWD,
385	    NULL) || m == NULL) {
386		goto drop;
387	}
388
389	M_ASSERTVALID(m);
390	M_ASSERTPKTHDR(m);
391
392	ip = mtod(m, struct ip *);
393	dest.s_addr = ip->ip_dst.s_addr;
394
395	/*
396	 * Destination address changed?
397	 */
398	if (m->m_flags & M_IP_NEXTHOP)
399		fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL);
400	else
401		fwd_tag = NULL;
402	if (odest.s_addr != dest.s_addr || fwd_tag != NULL) {
403		/*
404		 * Is it now for a local address on this host?
405		 */
406		if (m->m_flags & M_FASTFWD_OURS || in_localip(dest)) {
407forwardlocal:
408			/*
409			 * Return packet for processing by ip_input().
410			 */
411			m->m_flags |= M_FASTFWD_OURS;
412			return (m);
413		}
414		/*
415		 * Redo route lookup with new destination address
416		 */
417		if (fwd_tag) {
418			dest.s_addr = ((struct sockaddr_in *)
419				    (fwd_tag + 1))->sin_addr.s_addr;
420			m_tag_delete(m, fwd_tag);
421			m->m_flags &= ~M_IP_NEXTHOP;
422		}
423		if (dest.s_addr != rtdest.s_addr &&
424		    ip_findroute(&nh, dest, m) != 0)
425			return (NULL);	/* icmp unreach already sent */
426	}
427
428passout:
429	/*
430	 * Step 6: send off the packet
431	 */
432	ip_len = ntohs(ip->ip_len);
433	ip_off = ntohs(ip->ip_off);
434
435	bzero(&dst, sizeof(dst));
436	dst.sin_family = AF_INET;
437	dst.sin_len = sizeof(dst);
438	dst.sin_addr = nh.nh_addr;
439
440	/*
441	 * Handle redirect case.
442	 */
443	redest.s_addr = 0;
444	if (V_ipsendredirects && (nh.nh_ifp == m->m_pkthdr.rcvif))
445		mcopy = ip_redir_alloc(m, ip, dest, &redest.s_addr);
446
447	/*
448	 * Check if packet fits MTU or if hardware will fragment for us
449	 */
450	if (ip_len <= nh.nh_mtu) {
451		/*
452		 * Avoid confusing lower layers.
453		 */
454		m_clrprotoflags(m);
455		/*
456		 * Send off the packet via outgoing interface
457		 */
458		IP_PROBE(send, NULL, NULL, ip, nh.nh_ifp, ip, NULL);
459		error = (*nh.nh_ifp->if_output)(nh.nh_ifp, m,
460		    (struct sockaddr *)&dst, NULL);
461	} else {
462		/*
463		 * Handle EMSGSIZE with icmp reply needfrag for TCP MTU discovery
464		 */
465		if (ip_off & IP_DF) {
466			IPSTAT_INC(ips_cantfrag);
467			icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_NEEDFRAG,
468				0, nh.nh_mtu);
469			goto consumed;
470		} else {
471			/*
472			 * We have to fragment the packet
473			 */
474			m->m_pkthdr.csum_flags |= CSUM_IP;
475			if (ip_fragment(ip, &m, nh.nh_mtu,
476			    nh.nh_ifp->if_hwassist) != 0)
477				goto drop;
478			KASSERT(m != NULL, ("null mbuf and no error"));
479			/*
480			 * Send off the fragments via outgoing interface
481			 */
482			error = 0;
483			do {
484				m0 = m->m_nextpkt;
485				m->m_nextpkt = NULL;
486				/*
487				 * Avoid confusing lower layers.
488				 */
489				m_clrprotoflags(m);
490
491				IP_PROBE(send, NULL, NULL,
492				    mtod(m, struct ip *), nh.nh_ifp,
493				    mtod(m, struct ip *), NULL);
494				/* XXX: we can use cached route here */
495				error = (*nh.nh_ifp->if_output)(nh.nh_ifp, m,
496				    (struct sockaddr *)&dst, NULL);
497				if (error)
498					break;
499			} while ((m = m0) != NULL);
500			if (error) {
501				/* Reclaim remaining fragments */
502				for (m = m0; m; m = m0) {
503					m0 = m->m_nextpkt;
504					m_freem(m);
505				}
506			} else
507				IPSTAT_INC(ips_fragmented);
508		}
509	}
510
511	if (error != 0)
512		IPSTAT_INC(ips_odropped);
513	else {
514		IPSTAT_INC(ips_forward);
515		IPSTAT_INC(ips_fastforward);
516	}
517
518	/* Send required redirect */
519	if (mcopy != NULL) {
520		icmp_error(mcopy, ICMP_REDIRECT, ICMP_REDIRECT_HOST, redest.s_addr, 0);
521		mcopy = NULL; /* Freed by caller */
522	}
523
524consumed:
525	if (mcopy != NULL)
526		m_freem(mcopy);
527	return NULL;
528drop:
529	if (m)
530		m_freem(m);
531	return NULL;
532}
533