nat64_translate.c revision 340541
1/*-
2 * Copyright (c) 2015-2018 Yandex LLC
3 * Copyright (c) 2015-2018 Andrey V. Elsukov <ae@FreeBSD.org>
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
17 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
19 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
20 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
21 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
25 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28#include <sys/cdefs.h>
29__FBSDID("$FreeBSD: stable/11/sys/netpfil/ipfw/nat64/nat64_translate.c 340541 2018-11-18 00:34:24Z ae $");
30
31#include <sys/param.h>
32#include <sys/systm.h>
33#include <sys/counter.h>
34#include <sys/errno.h>
35#include <sys/kernel.h>
36#include <sys/lock.h>
37#include <sys/mbuf.h>
38#include <sys/module.h>
39#include <sys/rmlock.h>
40#include <sys/rwlock.h>
41#include <sys/socket.h>
42#include <sys/queue.h>
43
44#include <net/if.h>
45#include <net/if_var.h>
46#include <net/if_pflog.h>
47#include <net/pfil.h>
48#include <net/netisr.h>
49#include <net/route.h>
50
51#include <netinet/in.h>
52#include <netinet/in_fib.h>
53#include <netinet/ip.h>
54#include <netinet/ip_var.h>
55#include <netinet/ip_fw.h>
56#include <netinet/ip6.h>
57#include <netinet/icmp6.h>
58#include <netinet/ip_icmp.h>
59#include <netinet/tcp.h>
60#include <netinet/udp.h>
61#include <netinet6/in6_var.h>
62#include <netinet6/in6_fib.h>
63#include <netinet6/ip6_var.h>
64
65#include <netpfil/pf/pf.h>
66#include <netpfil/ipfw/ip_fw_private.h>
67#include <machine/in_cksum.h>
68
69#include "ip_fw_nat64.h"
70#include "nat64_translate.h"
71
72
/*
 * Signatures of the two output primitives used by the translator.
 * Both receive the statistics block to update and an opaque log data
 * pointer; the visible implementations skip logging when it is NULL.
 *  - nat64_output_t additionally receives an interface and a destination
 *    sockaddr (the netisr variant ignores them).
 *  - nat64_output_one_t receives only the mbuf.
 */
typedef int (*nat64_output_t)(struct ifnet *, struct mbuf *,
    struct sockaddr *, struct nat64_counters *, void *);
typedef int (*nat64_output_one_t)(struct mbuf *, struct nat64_counters *,
    void *);

/* Forward declarations of the routing helpers and output primitives. */
static int nat64_find_route4(struct nhop4_basic *, struct sockaddr_in *,
    struct mbuf *);
static int nat64_find_route6(struct nhop6_basic *, struct sockaddr_in6 *,
    struct mbuf *);
static int nat64_output_one(struct mbuf *, struct nat64_counters *, void *);
static int nat64_output(struct ifnet *, struct mbuf *, struct sockaddr *,
    struct nat64_counters *, void *);
static int nat64_direct_output_one(struct mbuf *, struct nat64_counters *,
    void *);
static int nat64_direct_output(struct ifnet *, struct mbuf *,
    struct sockaddr *, struct nat64_counters *, void *);

/* A pair of output primitives that together form one output method. */
struct nat64_methods {
	nat64_output_t		output;
	nat64_output_one_t	output_one;
};
/* Output via netisr_queue(): packets are reinjected into the input path. */
static const struct nat64_methods nat64_netisr = {
	.output = nat64_output,
	.output_one = nat64_output_one
};
/* Output via the interface's if_output routine. */
static const struct nat64_methods nat64_direct = {
	.output = nat64_direct_output,
	.output_one = nat64_direct_output_one
};
/* Currently selected output method (per-VNET); netisr is the default. */
VNET_DEFINE_STATIC(const struct nat64_methods *, nat64out) = &nat64_netisr;
#define	V_nat64out	VNET(nat64out)
104
105void
106nat64_set_output_method(int direct)
107{
108
109	V_nat64out = direct != 0 ? &nat64_direct: &nat64_netisr;
110}
111
112int
113nat64_get_output_method(void)
114{
115
116	return (V_nat64out == &nat64_direct ? 1: 0);
117}
118
119static void
120nat64_log(struct pfloghdr *logdata, struct mbuf *m, sa_family_t family)
121{
122
123	logdata->dir = PF_OUT;
124	logdata->af = family;
125	ipfw_bpf_mtap2(logdata, PFLOG_HDRLEN, m);
126}
127
128static int
129nat64_direct_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst,
130    struct nat64_counters *stats, void *logdata)
131{
132	int error;
133
134	if (logdata != NULL)
135		nat64_log(logdata, m, dst->sa_family);
136	error = (*ifp->if_output)(ifp, m, dst, NULL);
137	if (error != 0)
138		NAT64STAT_INC(stats, oerrors);
139	return (error);
140}
141
142static int
143nat64_direct_output_one(struct mbuf *m, struct nat64_counters *stats,
144    void *logdata)
145{
146	struct nhop6_basic nh6;
147	struct nhop4_basic nh4;
148	struct sockaddr_in6 dst6;
149	struct sockaddr_in dst4;
150	struct sockaddr *dst;
151	struct ip6_hdr *ip6;
152	struct ip *ip4;
153	struct ifnet *ifp;
154	int error;
155
156	ip4 = mtod(m, struct ip *);
157	switch (ip4->ip_v) {
158	case IPVERSION:
159		dst4.sin_addr = ip4->ip_dst;
160		error = nat64_find_route4(&nh4, &dst4, m);
161		if (error != 0)
162			NAT64STAT_INC(stats, noroute4);
163		else {
164			ifp = nh4.nh_ifp;
165			dst = (struct sockaddr *)&dst4;
166		}
167		break;
168	case (IPV6_VERSION >> 4):
169		ip6 = mtod(m, struct ip6_hdr *);
170		dst6.sin6_addr = ip6->ip6_dst;
171		error = nat64_find_route6(&nh6, &dst6, m);
172		if (error != 0)
173			NAT64STAT_INC(stats, noroute6);
174		else {
175			ifp = nh6.nh_ifp;
176			dst = (struct sockaddr *)&dst6;
177		}
178		break;
179	default:
180		m_freem(m);
181		NAT64STAT_INC(stats, dropped);
182		DPRINTF(DP_DROPS, "dropped due to unknown IP version");
183		return (EAFNOSUPPORT);
184	}
185	if (error != 0) {
186		m_freem(m);
187		return (EHOSTUNREACH);
188	}
189	if (logdata != NULL)
190		nat64_log(logdata, m, dst->sa_family);
191	error = (*ifp->if_output)(ifp, m, dst, NULL);
192	if (error != 0)
193		NAT64STAT_INC(stats, oerrors);
194	return (error);
195}
196
197static int
198nat64_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst,
199    struct nat64_counters *stats, void *logdata)
200{
201	struct ip *ip4;
202	int ret, af;
203
204	ip4 = mtod(m, struct ip *);
205	switch (ip4->ip_v) {
206	case IPVERSION:
207		af = AF_INET;
208		ret = NETISR_IP;
209		break;
210	case (IPV6_VERSION >> 4):
211		af = AF_INET6;
212		ret = NETISR_IPV6;
213		break;
214	default:
215		m_freem(m);
216		NAT64STAT_INC(stats, dropped);
217		DPRINTF(DP_DROPS, "unknown IP version");
218		return (EAFNOSUPPORT);
219	}
220	if (logdata != NULL)
221		nat64_log(logdata, m, af);
222	ret = netisr_queue(ret, m);
223	if (ret != 0)
224		NAT64STAT_INC(stats, oerrors);
225	return (ret);
226}
227
228static int
229nat64_output_one(struct mbuf *m, struct nat64_counters *stats, void *logdata)
230{
231
232	return (nat64_output(NULL, m, NULL, stats, logdata));
233}
234
235/*
236 * Check the given IPv6 prefix and length according to RFC6052:
237 *   The prefixes can only have one of the following lengths:
238 *   32, 40, 48, 56, 64, or 96 (The Well-Known Prefix is 96 bits long).
239 * Returns zero on success, otherwise EINVAL.
240 */
241int
242nat64_check_prefix6(const struct in6_addr *prefix, int length)
243{
244
245	switch (length) {
246	case 32:
247	case 40:
248	case 48:
249	case 56:
250	case 64:
251		/* Well-known prefix has 96 prefix length */
252		if (IN6_IS_ADDR_WKPFX(prefix))
253			return (EINVAL);
254		/* FALLTHROUGH */
255	case 96:
256		/* Bits 64 to 71 must be set to zero */
257		if (prefix->__u6_addr.__u6_addr8[8] != 0)
258			return (EINVAL);
259		/* Some extra checks */
260		if (IN6_IS_ADDR_MULTICAST(prefix) ||
261		    IN6_IS_ADDR_UNSPECIFIED(prefix) ||
262		    IN6_IS_ADDR_LOOPBACK(prefix))
263			return (EINVAL);
264		return (0);
265	}
266	return (EINVAL);
267}
268
269int
270nat64_check_private_ip4(const struct nat64_config *cfg, in_addr_t ia)
271{
272
273	if (V_nat64_allow_private)
274		return (0);
275
276	/* WKPFX must not be used to represent non-global IPv4 addresses */
277	if (cfg->flags & NAT64_WKPFX) {
278		/* IN_PRIVATE */
279		if ((ia & htonl(0xff000000)) == htonl(0x0a000000) ||
280		    (ia & htonl(0xfff00000)) == htonl(0xac100000) ||
281		    (ia & htonl(0xffff0000)) == htonl(0xc0a80000))
282			return (1);
283		/*
284		 * RFC 5735:
285		 *  192.0.0.0/24 - reserved for IETF protocol assignments
286		 *  192.88.99.0/24 - for use as 6to4 relay anycast addresses
287		 *  198.18.0.0/15 - for use in benchmark tests
288		 *  192.0.2.0/24, 198.51.100.0/24, 203.0.113.0/24 - for use
289		 *   in documentation and example code
290		 */
291		if ((ia & htonl(0xffffff00)) == htonl(0xc0000000) ||
292		    (ia & htonl(0xffffff00)) == htonl(0xc0586300) ||
293		    (ia & htonl(0xfffffe00)) == htonl(0xc6120000) ||
294		    (ia & htonl(0xffffff00)) == htonl(0xc0000200) ||
295		    (ia & htonl(0xfffffe00)) == htonl(0xc6336400) ||
296		    (ia & htonl(0xffffff00)) == htonl(0xcb007100))
297			return (1);
298	}
299	return (0);
300}
301
/*
 * Build an IPv4-embedded IPv6 address.
 *
 *   cfg - supplies the IPv6 prefix (prefix6) and its length (plen6).
 *   ia  - IPv4 address; the byte-order specific shifts below treat it as
 *         being in network byte order.
 *   ip6 - result: the prefix followed by the IPv4 address.
 *
 * Byte 8 of the result is always cleared, so for prefix lengths 40, 48
 * and 56 the IPv4 address is split around that byte.
 * Panics when plen6 is not one of 32, 40, 48, 56, 64 or 96.
 */
void
nat64_embed_ip4(const struct nat64_config *cfg, in_addr_t ia,
    struct in6_addr *ip6)
{

	/* assume the prefix6 is properly filled with zeros */
	bcopy(&cfg->prefix6, ip6, sizeof(*ip6));
	switch (cfg->plen6) {
	case 32:
	case 96:
		/* The address occupies exactly one 32-bit word (1 or 3). */
		ip6->s6_addr32[cfg->plen6 / 32] = ia;
		break;
	case 40:
	case 48:
	case 56:
		/*
		 * The leading bytes of the address are merged into word 1
		 * right after the prefix.  Word 2 receives the remaining
		 * bytes preceded by one duplicated byte that lands in
		 * s6_addr8[8]; that byte is wiped after the switch.
		 */
#if BYTE_ORDER == BIG_ENDIAN
		ip6->s6_addr32[1] = cfg->prefix6.s6_addr32[1] |
		    (ia >> (cfg->plen6 % 32));
		ip6->s6_addr32[2] = ia << (24 - cfg->plen6 % 32);
#elif BYTE_ORDER == LITTLE_ENDIAN
		ip6->s6_addr32[1] = cfg->prefix6.s6_addr32[1] |
		    (ia << (cfg->plen6 % 32));
		ip6->s6_addr32[2] = ia >> (24 - cfg->plen6 % 32);
#endif
		break;
	case 64:
		/* Bytes 9-11 take the first three octets, byte 12 the last. */
#if BYTE_ORDER == BIG_ENDIAN
		ip6->s6_addr32[2] = ia >> 8;
		ip6->s6_addr32[3] = ia << 24;
#elif BYTE_ORDER == LITTLE_ENDIAN
		ip6->s6_addr32[2] = ia << 8;
		ip6->s6_addr32[3] = ia >> 24;
#endif
		break;
	default:
		panic("Wrong plen6");
	};
	/* Bits 64 to 71 are always zero in the resulting address. */
	ip6->s6_addr8[8] = 0;
}
341
/*
 * Extract the IPv4 address embedded into an IPv6 address.
 *
 *   cfg - supplies the prefix length (plen6) and the flags consulted by
 *         nat64_check_private_ip4().
 *   ip6 - IPv4-embedded IPv6 address.
 *
 * Returns the IPv4 address (same byte order as stored in the IPv6
 * address), or 0 when
 *   - the suffix bits or byte 8 are not zero (prefix lengths 32-64),
 *   - plen6 is not a supported prefix length,
 *   - the extracted address fails nat64_check_ip4() or
 *     nat64_check_private_ip4().
 */
in_addr_t
nat64_extract_ip4(const struct nat64_config *cfg, const struct in6_addr *ip6)
{
	in_addr_t ia;

	/*
	 * According to RFC 6052 p2.2:
	 * IPv4-embedded IPv6 addresses are composed of a variable-length
	 * prefix, the embedded IPv4 address, and a variable length suffix.
	 * The suffix bits are reserved for future extensions and SHOULD
	 * be set to zero.
	 */
	/*
	 * First pass: reject addresses with non-zero byte 8 or non-zero
	 * suffix.  Nothing is checked for a 96 bit prefix.
	 */
	switch (cfg->plen6) {
	case 32:
		/* Address in bytes 4-7; bytes 8-15 must be zero. */
		if (ip6->s6_addr32[3] != 0 || ip6->s6_addr32[2] != 0)
			goto badip6;
		break;
	case 40:
		/* Address in bytes 5-7 and 9; bytes 8, 10-15 must be zero. */
		if (ip6->s6_addr32[3] != 0 ||
		    (ip6->s6_addr32[2] & htonl(0xff00ffff)) != 0)
			goto badip6;
		break;
	case 48:
		/* Address in bytes 6-7 and 9-10; bytes 8, 11-15 must be zero. */
		if (ip6->s6_addr32[3] != 0 ||
		    (ip6->s6_addr32[2] & htonl(0xff0000ff)) != 0)
			goto badip6;
		break;
	case 56:
		/* Address in bytes 7 and 9-11; bytes 8, 12-15 must be zero. */
		if (ip6->s6_addr32[3] != 0 || ip6->s6_addr8[8] != 0)
			goto badip6;
		break;
	case 64:
		/* Address in bytes 9-12; bytes 8, 13-15 must be zero. */
		if (ip6->s6_addr8[8] != 0 ||
		    (ip6->s6_addr32[3] & htonl(0x00ffffff)) != 0)
			goto badip6;
	};
	/* Second pass: reassemble the address (inverse of nat64_embed_ip4). */
	switch (cfg->plen6) {
	case 32:
	case 96:
		ia = ip6->s6_addr32[cfg->plen6 / 32];
		break;
	case 40:
	case 48:
	case 56:
		/*
		 * Join the part stored in word 1 with the part stored in
		 * word 2; byte 8 was verified to be zero above, so it does
		 * not disturb the OR.
		 */
#if BYTE_ORDER == BIG_ENDIAN
		ia = (ip6->s6_addr32[1] << (cfg->plen6 % 32)) |
		    (ip6->s6_addr32[2] >> (24 - cfg->plen6 % 32));
#elif BYTE_ORDER == LITTLE_ENDIAN
		ia = (ip6->s6_addr32[1] >> (cfg->plen6 % 32)) |
		    (ip6->s6_addr32[2] << (24 - cfg->plen6 % 32));
#endif
		break;
	case 64:
#if BYTE_ORDER == BIG_ENDIAN
		ia = (ip6->s6_addr32[2] << 8) | (ip6->s6_addr32[3] >> 24);
#elif BYTE_ORDER == LITTLE_ENDIAN
		ia = (ip6->s6_addr32[2] >> 8) | (ip6->s6_addr32[3] << 24);
#endif
		break;
	default:
		/* Unsupported prefix length. */
		return (0);
	};
	if (nat64_check_ip4(ia) != 0 ||
	    nat64_check_private_ip4(cfg, ia) != 0)
		goto badip4;

	return (ia);
badip4:
	DPRINTF(DP_GENERIC | DP_DROPS,
	    "invalid destination address: %08x", ia);
	return (0);
badip6:
	DPRINTF(DP_GENERIC | DP_DROPS, "invalid IPv4-embedded IPv6 address");
	return (0);
}
417
418/*
419 * According to RFC 1624 the equation for incremental checksum update is:
420 *	HC' = ~(~HC + ~m + m')	--	[Eqn. 3]
421 *	HC' = HC - ~m - m'	--	[Eqn. 4]
422 * So, when we are replacing IPv4 addresses to IPv6, we
423 * can assume, that new bytes previously were zeros, and vise versa -
424 * when we replacing IPv6 addresses to IPv4, now unused bytes become
425 * zeros. The payload length in pseudo header has bigger size, but one
426 * half of it should be zero. Using the equation 4 we get:
427 *	HC' = HC - (~m0 + m0')	-- m0 is first changed word
428 *	HC' = (HC - (~m0 + m0')) - (~m1 + m1')	-- m1 is second changed word
429 *	HC' = HC - ~m0 - m0' - ~m1 - m1' - ... =
430 *	  = HC - sum(~m[i] + m'[i])
431 *
432 * The function result should be used as follows:
433 *	IPv6 to IPv4:	HC' = cksum_add(HC, result)
434 *	IPv4 to IPv6:	HC' = cksum_add(HC, ~result)
435 */
436static NAT64NOINLINE uint16_t
437nat64_cksum_convert(struct ip6_hdr *ip6, struct ip *ip)
438{
439	uint32_t sum;
440	uint16_t *p;
441
442	sum = ~ip->ip_src.s_addr >> 16;
443	sum += ~ip->ip_src.s_addr & 0xffff;
444	sum += ~ip->ip_dst.s_addr >> 16;
445	sum += ~ip->ip_dst.s_addr & 0xffff;
446
447	for (p = (uint16_t *)&ip6->ip6_src;
448	    p < (uint16_t *)(&ip6->ip6_src + 2); p++)
449		sum += *p;
450
451	while (sum >> 16)
452		sum = (sum & 0xffff) + (sum >> 16);
453	return (sum);
454}
455
/*
 * Fill in an IPv4 header from the fields of an IPv6 header.
 *
 *   ip6   - original IPv6 header (traffic class and hop limit are used).
 *   frag  - IPv6 fragment header of the packet, or NULL when the packet
 *           is not fragmented.
 *   plen  - payload length to account in ip_len, host byte order.
 *   proto - upper layer protocol; IPPROTO_ICMPV6 is mapped to
 *           IPPROTO_ICMP.
 *   ip    - header to initialize; ip_src and ip_dst must already be set
 *           by the caller since they are covered by the header checksum.
 */
static NAT64NOINLINE void
nat64_init_ip4hdr(const struct ip6_hdr *ip6, const struct ip6_frag *frag,
    uint16_t plen, uint8_t proto, struct ip *ip)
{

	/* assume addresses are already initialized */
	ip->ip_v = IPVERSION;
	ip->ip_hl = sizeof(*ip) >> 2;
	/* The IPv6 traffic class becomes the IPv4 TOS. */
	ip->ip_tos = (ntohl(ip6->ip6_flow) >> 20) & 0xff;
	ip->ip_len = htons(sizeof(*ip) + plen);
	ip->ip_ttl = ip6->ip6_hlim;
	/* Forwarding code will decrement TTL for netisr based output. */
	if (V_nat64out == &nat64_direct)
		ip->ip_ttl -= IPV6_HLIMDEC;
	ip->ip_sum = 0;
	ip->ip_p = (proto == IPPROTO_ICMPV6) ? IPPROTO_ICMP: proto;
	/*
	 * NOTE(review): ip_fillid() runs before ip_off is assigned below;
	 * if it consults ip_off to pick the id, confirm this order is
	 * intended.
	 */
	ip_fillid(ip);
	if (frag != NULL) {
		/* Drop the three flag bits to get the offset in 8-byte units. */
		ip->ip_off = htons(ntohs(frag->ip6f_offlg) >> 3);
		if (frag->ip6f_offlg & IP6F_MORE_FRAG)
			ip->ip_off |= htons(IP_MF);
	} else {
		/* Unfragmented packets are sent with DF set. */
		ip->ip_off = htons(IP_DF);
	}
	/* Must be last: the checksum covers every field set above. */
	ip->ip_sum = in_cksum_hdr(ip);
}
482
/* Room left for payload in one fragment sent over a link with this MTU. */
#define	FRAGSZ(mtu) ((mtu) - sizeof(struct ip6_hdr) - sizeof(struct ip6_frag))
/*
 * Prepend the IPv6 header to the payload in `m' and queue the result,
 * fragmenting it when needed.
 *
 *   stats  - counters to update (nomem, dropped, ifrags, ofrags).
 *   ip6    - prepared IPv6 header; ip6_plen holds the payload length.
 *            The header is modified while fragments are produced.
 *   mq     - queue that receives the resulting packet(s).
 *   m      - payload without an IP header; consumed in all cases.
 *   mtu    - MTU used to size the fragments.
 *   ip_id  - IPv4 identification of the original packet.
 *   ip_off - IPv4 ip_off of the original packet; non-zero means the
 *            original was itself a fragment.
 *            NOTE(review): both values appear to be in network byte
 *            order (see the ntohs()/htons() uses below) - confirm
 *            against the caller.
 *
 * Returns 0 on success, ENOMEM or ENOBUFS on failure.
 */
static NAT64NOINLINE int
nat64_fragment6(struct nat64_counters *stats, struct ip6_hdr *ip6,
    struct mbufq *mq, struct mbuf *m, uint32_t mtu, uint16_t ip_id,
    uint16_t ip_off)
{
	struct ip6_frag ip6f;
	struct mbuf *n;
	uint16_t hlen, len, offset;
	int plen;

	plen = ntohs(ip6->ip6_plen);
	hlen = sizeof(struct ip6_hdr);

	/* Fragmentation isn't needed */
	if (ip_off == 0 && plen <= mtu - hlen) {
		M_PREPEND(m, hlen, M_NOWAIT);
		if (m == NULL) {
			NAT64STAT_INC(stats, nomem);
			return (ENOMEM);
		}
		bcopy(ip6, mtod(m, void *), hlen);
		if (mbufq_enqueue(mq, m) != 0) {
			m_freem(m);
			NAT64STAT_INC(stats, dropped);
			DPRINTF(DP_DROPS, "dropped due to mbufq overflow");
			return (ENOBUFS);
		}
		return (0);
	}

	/* Every fragment carries an IPv6 header plus a fragment header. */
	hlen += sizeof(struct ip6_frag);
	ip6f.ip6f_reserved = 0;
	ip6f.ip6f_nxt = ip6->ip6_nxt;
	ip6->ip6_nxt = IPPROTO_FRAGMENT;
	if (ip_off != 0) {
		/*
		 * We have got an IPv4 fragment.
		 * Use offset value and ip_id from original fragment.
		 */
		ip6f.ip6f_ident = htonl(ntohs(ip_id));
		offset = (ntohs(ip_off) & IP_OFFMASK) << 3;
		NAT64STAT_INC(stats, ifrags);
	} else {
		/* The packet size exceeds interface MTU */
		ip6f.ip6f_ident = htonl(ip6_randomid());
		offset = 0; /* First fragment*/
	}
	/* Cut the payload into pieces, each a multiple of 8 bytes but the last. */
	while (plen > 0 && m != NULL) {
		n = NULL;
		len = FRAGSZ(mtu) & ~7;
		if (len > plen)
			len = plen;
		ip6->ip6_plen = htons(len + sizeof(ip6f));
		/*
		 * The offset is already a multiple of 8, so its low three
		 * bits are free for the flags.  ntohs() is used where
		 * htons() would be conventional; both perform the same
		 * conversion.
		 */
		ip6f.ip6f_offlg = ntohs(offset);
		/* More data follows here or in the original IPv4 datagram. */
		if (len < plen || (ip_off & htons(IP_MF)) != 0)
			ip6f.ip6f_offlg |= IP6F_MORE_FRAG;
		offset += len;
		plen -= len;
		if (plen > 0) {
			/* Keep `len' bytes in m; the rest moves to n. */
			n = m_split(m, len, M_NOWAIT);
			if (n == NULL)
				goto fail;
		}
		M_PREPEND(m, hlen, M_NOWAIT);
		if (m == NULL)
			goto fail;
		bcopy(ip6, mtod(m, void *), sizeof(struct ip6_hdr));
		bcopy(&ip6f, mtodo(m, sizeof(struct ip6_hdr)),
		    sizeof(struct ip6_frag));
		if (mbufq_enqueue(mq, m) != 0)
			goto fail;
		m = n;
	}
	NAT64STAT_ADD(stats, ofrags, mbufq_len(mq));
	return (0);
fail:
	/*
	 * Release the current piece, the not yet processed remainder and
	 * everything queued so far.  Note that a queue overflow is also
	 * accounted as nomem/ENOMEM here.
	 */
	if (m != NULL)
		m_freem(m);
	if (n != NULL)
		m_freem(n);
	mbufq_drain(mq);
	NAT64STAT_INC(stats, nomem);
	return (ENOMEM);
}
568
569static NAT64NOINLINE int
570nat64_find_route6(struct nhop6_basic *pnh, struct sockaddr_in6 *dst,
571    struct mbuf *m)
572{
573
574	if (fib6_lookup_nh_basic(M_GETFIB(m), &dst->sin6_addr, 0, 0, 0,
575	    pnh) != 0)
576		return (EHOSTUNREACH);
577	if (pnh->nh_flags & (NHF_BLACKHOLE | NHF_REJECT))
578		return (EHOSTUNREACH);
579	/*
580	 * XXX: we need to use destination address with embedded scope
581	 * zone id, because LLTABLE uses such form of addresses for lookup.
582	 */
583	dst->sin6_family = AF_INET6;
584	dst->sin6_len = sizeof(*dst);
585	dst->sin6_addr = pnh->nh_addr;
586	if (IN6_IS_SCOPE_LINKLOCAL(&dst->sin6_addr))
587		dst->sin6_addr.s6_addr16[1] =
588		    htons(pnh->nh_ifp->if_index & 0xffff);
589	dst->sin6_port = 0;
590	dst->sin6_scope_id = 0;
591	dst->sin6_flowinfo = 0;
592
593	return (0);
594}
595
/* Upper bound on the number of bytes of the offending packet quoted. */
#define	NAT64_ICMP6_PLEN	64
/*
 * Reply to the IPv6 packet in `m' with an ICMPv6 error message.
 *
 *   m       - offending packet starting with its IPv6 header; always
 *             consumed.
 *   type    - ICMPv6 type; only DST_UNREACH, PACKET_TOO_BIG,
 *             TIME_EXCEEDED and PARAM_PROB are sent.
 *   code    - ICMPv6 code.
 *   mtu     - value stored (in network byte order) into the icmp6_mtu
 *             field of the reply.
 *   stats   - counters to update.
 *   logdata - passed through to the output method.
 *
 * No reply is generated for ICMPv6 error messages and redirects.
 */
static NAT64NOINLINE void
nat64_icmp6_reflect(struct mbuf *m, uint8_t type, uint8_t code, uint32_t mtu,
    struct nat64_counters *stats, void *logdata)
{
	struct icmp6_hdr *icmp6;
	struct ip6_hdr *ip6, *oip6;
	struct mbuf *n;
	int len, plen;

	/* Find the last header: plen is its protocol, len its offset. */
	len = 0;
	plen = nat64_getlasthdr(m, &len);
	if (plen < 0) {
		DPRINTF(DP_DROPS, "mbuf isn't contigious");
		goto freeit;
	}
	/*
	 * Do not send ICMPv6 in reply to ICMPv6 errors.
	 */
	if (plen == IPPROTO_ICMPV6) {
		if (m->m_len < len + sizeof(*icmp6)) {
			DPRINTF(DP_DROPS, "mbuf isn't contigious");
			goto freeit;
		}
		icmp6 = mtodo(m, len);
		/* Types below ICMP6_ECHO_REQUEST are treated as errors. */
		if (icmp6->icmp6_type < ICMP6_ECHO_REQUEST ||
		    icmp6->icmp6_type == ND_REDIRECT) {
			DPRINTF(DP_DROPS, "do not send ICMPv6 in reply to "
			    "ICMPv6 errors");
			goto freeit;
		}
	}
	/*
	 * Rate limiting is currently disabled:
	if (icmp6_ratelimit(&ip6->ip6_src, type, code))
		goto freeit;
		*/
	ip6 = mtod(m, struct ip6_hdr *);
	switch (type) {
	case ICMP6_DST_UNREACH:
	case ICMP6_PACKET_TOO_BIG:
	case ICMP6_TIME_EXCEEDED:
	case ICMP6_PARAM_PROB:
		break;
	default:
		goto freeit;
	}
	/* Calculate length of ICMPv6 payload */
	len = (m->m_pkthdr.len > NAT64_ICMP6_PLEN) ? NAT64_ICMP6_PLEN:
	    m->m_pkthdr.len;

	/* Create new ICMPv6 datagram */
	plen = len + sizeof(struct icmp6_hdr);
	n = m_get2(sizeof(struct ip6_hdr) + plen + max_hdr, M_NOWAIT,
	    MT_HEADER, M_PKTHDR);
	if (n == NULL) {
		NAT64STAT_INC(stats, nomem);
		m_freem(m);
		return;
	}
	/*
	 * Move pkthdr from original mbuf. We should have initialized some
	 * fields, because we can reinject this mbuf to netisr and it will
	 * go through the input path (it requires at least rcvif to be set).
	 * Also do M_ALIGN() to reduce the chance of having to allocate a
	 * new mbuf in the chain when we do M_PREPEND() or some type of
	 * tunneling.
	 */
	m_move_pkthdr(n, m);
	M_ALIGN(n, sizeof(struct ip6_hdr) + plen + max_hdr);

	n->m_len = n->m_pkthdr.len = sizeof(struct ip6_hdr) + plen;
	/* The reply goes back to the sender: swap the addresses. */
	oip6 = mtod(n, struct ip6_hdr *);
	oip6->ip6_src = ip6->ip6_dst;
	oip6->ip6_dst = ip6->ip6_src;
	oip6->ip6_nxt = IPPROTO_ICMPV6;
	/* Zero the whole flow word first, then set the version bits. */
	oip6->ip6_flow = 0;
	oip6->ip6_vfc |= IPV6_VERSION;
	oip6->ip6_hlim = V_ip6_defhlim;
	oip6->ip6_plen = htons(plen);

	icmp6 = mtodo(n, sizeof(struct ip6_hdr));
	icmp6->icmp6_cksum = 0;
	icmp6->icmp6_type = type;
	icmp6->icmp6_code = code;
	icmp6->icmp6_mtu = htonl(mtu);

	/* Quote the beginning of the offending packet after the header. */
	m_copydata(m, 0, len, mtodo(n, sizeof(struct ip6_hdr) +
	    sizeof(struct icmp6_hdr)));
	icmp6->icmp6_cksum = in6_cksum(n, IPPROTO_ICMPV6,
	    sizeof(struct ip6_hdr), plen);
	m_freem(m);
	V_nat64out->output_one(n, stats, logdata);
	return;
freeit:
	NAT64STAT_INC(stats, dropped);
	m_freem(m);
}
693
694static NAT64NOINLINE int
695nat64_find_route4(struct nhop4_basic *pnh, struct sockaddr_in *dst,
696    struct mbuf *m)
697{
698
699	if (fib4_lookup_nh_basic(M_GETFIB(m), dst->sin_addr, 0, 0, pnh) != 0)
700		return (EHOSTUNREACH);
701	if (pnh->nh_flags & (NHF_BLACKHOLE | NHF_BROADCAST | NHF_REJECT))
702		return (EHOSTUNREACH);
703
704	dst->sin_family = AF_INET;
705	dst->sin_len = sizeof(*dst);
706	dst->sin_addr = pnh->nh_addr;
707	dst->sin_port = 0;
708	return (0);
709}
710
711#define	NAT64_ICMP_PLEN	64
712static NAT64NOINLINE void
713nat64_icmp_reflect(struct mbuf *m, uint8_t type,
714    uint8_t code, uint16_t mtu, struct nat64_counters *stats, void *logdata)
715{
716	struct icmp *icmp;
717	struct ip *ip, *oip;
718	struct mbuf *n;
719	int len, plen;
720
721	ip = mtod(m, struct ip *);
722	/* Do not send ICMP error if packet is not the first fragment */
723	if (ip->ip_off & ~ntohs(IP_MF|IP_DF)) {
724		DPRINTF(DP_DROPS, "not first fragment");
725		goto freeit;
726	}
727	/* Do not send ICMP in reply to ICMP errors */
728	if (ip->ip_p == IPPROTO_ICMP) {
729		if (m->m_len < (ip->ip_hl << 2)) {
730			DPRINTF(DP_DROPS, "mbuf isn't contigious");
731			goto freeit;
732		}
733		icmp = mtodo(m, ip->ip_hl << 2);
734		if (!ICMP_INFOTYPE(icmp->icmp_type)) {
735			DPRINTF(DP_DROPS, "do not send ICMP in reply to "
736			    "ICMP errors");
737			goto freeit;
738		}
739	}
740	switch (type) {
741	case ICMP_UNREACH:
742	case ICMP_TIMXCEED:
743	case ICMP_PARAMPROB:
744		break;
745	default:
746		goto freeit;
747	}
748	/* Calculate length of ICMP payload */
749	len = (m->m_pkthdr.len > NAT64_ICMP_PLEN) ? (ip->ip_hl << 2) + 8:
750	    m->m_pkthdr.len;
751
752	/* Create new ICMPv4 datagram */
753	plen = len + sizeof(struct icmphdr) + sizeof(uint32_t);
754	n = m_get2(sizeof(struct ip) + plen + max_hdr, M_NOWAIT,
755	    MT_HEADER, M_PKTHDR);
756	if (n == NULL) {
757		NAT64STAT_INC(stats, nomem);
758		m_freem(m);
759		return;
760	}
761	m_move_pkthdr(n, m);
762	M_ALIGN(n, sizeof(struct ip) + plen + max_hdr);
763
764	n->m_len = n->m_pkthdr.len = sizeof(struct ip) + plen;
765	oip = mtod(n, struct ip *);
766	oip->ip_v = IPVERSION;
767	oip->ip_hl = sizeof(struct ip) >> 2;
768	oip->ip_tos = 0;
769	oip->ip_len = htons(n->m_pkthdr.len);
770	oip->ip_ttl = V_ip_defttl;
771	oip->ip_p = IPPROTO_ICMP;
772	ip_fillid(oip);
773	oip->ip_off = htons(IP_DF);
774	oip->ip_src = ip->ip_dst;
775	oip->ip_dst = ip->ip_src;
776	oip->ip_sum = 0;
777	oip->ip_sum = in_cksum_hdr(oip);
778
779	icmp = mtodo(n, sizeof(struct ip));
780	icmp->icmp_type = type;
781	icmp->icmp_code = code;
782	icmp->icmp_cksum = 0;
783	icmp->icmp_pmvoid = 0;
784	icmp->icmp_nextmtu = htons(mtu);
785	m_copydata(m, 0, len, mtodo(n, sizeof(struct ip) +
786	    sizeof(struct icmphdr) + sizeof(uint32_t)));
787	icmp->icmp_cksum = in_cksum_skip(n, sizeof(struct ip) + plen,
788	    sizeof(struct ip));
789	m_freem(m);
790	V_nat64out->output_one(n, stats, logdata);
791	return;
792freeit:
793	NAT64STAT_INC(stats, dropped);
794	m_freem(m);
795}
796
/*
 * Translate ICMP echo request/reply into ICMPv6.
 *
 *   ip6   - IPv6 header used for the pseudo header (addresses and
 *           ip6_plen must already be final).
 *   icmp6 - ICMP header to rewrite in place.
 *   id    - new ICMP identifier, or 0 to keep the current one.
 *   type  - ICMPv6 type to store.
 *
 * The checksum is updated incrementally for each changed word and then
 * extended to cover the IPv6 pseudo header.
 */
static void
nat64_icmp_handle_echo(struct ip6_hdr *ip6, struct icmp6_hdr *icmp6,
    uint16_t id, uint8_t type)
{
	uint16_t old;

	old = *(uint16_t *)icmp6;	/* save type+code in one word */
	icmp6->icmp6_type = type;
	/* Reflect ICMPv6 -> ICMPv4 type translation in the cksum */
	icmp6->icmp6_cksum = cksum_adjust(icmp6->icmp6_cksum,
	    old, *(uint16_t *)icmp6);
	if (id != 0) {
		old = icmp6->icmp6_id;
		icmp6->icmp6_id = id;
		/* Reflect ICMP id translation in the cksum */
		icmp6->icmp6_cksum = cksum_adjust(icmp6->icmp6_cksum,
		    old, id);
	}
	/* Reflect IPv6 pseudo header in the cksum */
	icmp6->icmp6_cksum = ~in6_cksum_pseudo(ip6, ntohs(ip6->ip6_plen),
	    IPPROTO_ICMPV6, ~icmp6->icmp6_cksum);
}
820
/*
 * Translate the ICMPv4 message found at `offset' in `m' into ICMPv6.
 *
 *   m      - packet holding the ICMPv4 message; consumed unless it is
 *            returned.
 *   ip6    - already prepared outer IPv6 header (kept outside of the
 *            mbuf); its ip6_plen is rewritten for error messages.
 *   icmpid - when non-zero, replaces the ICMP identifier of echo
 *            messages and the source port / identifier of the packet
 *            quoted inside an error message.
 *   offset - offset of the ICMP header inside `m'.
 *   cfg    - translator instance (prefix, flags, statistics).
 *
 * Returns `m' itself for echo request/reply (rewritten in place), a newly
 * allocated mbuf for error messages, or NULL when the message is dropped
 * or memory is exhausted.
 */
static NAT64NOINLINE struct mbuf *
nat64_icmp_translate(struct mbuf *m, struct ip6_hdr *ip6, uint16_t icmpid,
    int offset, struct nat64_config *cfg)
{
	struct ip ip;
	struct icmp *icmp;
	struct tcphdr *tcp;
	struct udphdr *udp;
	struct ip6_hdr *eip6;
	struct mbuf *n;
	uint32_t mtu;
	int len, hlen, plen;
	uint8_t type, code;

	/* Make the fixed part of the ICMP header contiguous. */
	if (m->m_len < offset + ICMP_MINLEN)
		m = m_pullup(m, offset + ICMP_MINLEN);
	if (m == NULL) {
		NAT64STAT_INC(&cfg->stats, nomem);
		return (m);
	}
	/*
	 * `mtu' ends up in the icmp6_mtu field of the translated header;
	 * for parameter problems it carries the translated pointer.
	 */
	mtu = 0;
	icmp = mtodo(m, offset);
	/* RFC 7915 p4.2 */
	switch (icmp->icmp_type) {
	case ICMP_ECHOREPLY:
		type = ICMP6_ECHO_REPLY;
		code = 0;
		break;
	case ICMP_UNREACH:
		type = ICMP6_DST_UNREACH;
		switch (icmp->icmp_code) {
		case ICMP_UNREACH_NET:
		case ICMP_UNREACH_HOST:
		case ICMP_UNREACH_SRCFAIL:
		case ICMP_UNREACH_NET_UNKNOWN:
		case ICMP_UNREACH_HOST_UNKNOWN:
		case ICMP_UNREACH_TOSNET:
		case ICMP_UNREACH_TOSHOST:
			code = ICMP6_DST_UNREACH_NOROUTE;
			break;
		case ICMP_UNREACH_PROTOCOL:
			type = ICMP6_PARAM_PROB;
			code = ICMP6_PARAMPROB_NEXTHEADER;
			break;
		case ICMP_UNREACH_PORT:
			code = ICMP6_DST_UNREACH_NOPORT;
			break;
		case ICMP_UNREACH_NEEDFRAG:
			type = ICMP6_PACKET_TOO_BIG;
			code = 0;
			/* XXX: needs an additional look */
			mtu = max(IPV6_MMTU, ntohs(icmp->icmp_nextmtu) + 20);
			break;
		case ICMP_UNREACH_NET_PROHIB:
		case ICMP_UNREACH_HOST_PROHIB:
		case ICMP_UNREACH_FILTER_PROHIB:
		case ICMP_UNREACH_PRECEDENCE_CUTOFF:
			code = ICMP6_DST_UNREACH_ADMIN;
			break;
		default:
			DPRINTF(DP_DROPS, "Unsupported ICMP type %d, code %d",
			    icmp->icmp_type, icmp->icmp_code);
			goto freeit;
		}
		break;
	case ICMP_TIMXCEED:
		type = ICMP6_TIME_EXCEEDED;
		code = icmp->icmp_code;
		break;
	case ICMP_ECHO:
		type = ICMP6_ECHO_REQUEST;
		code = 0;
		break;
	case ICMP_PARAMPROB:
		type = ICMP6_PARAM_PROB;
		switch (icmp->icmp_code) {
		case ICMP_PARAMPROB_ERRATPTR:
		case ICMP_PARAMPROB_LENGTH:
			code = ICMP6_PARAMPROB_HEADER;
			/* Map the IPv4 header offset to the IPv6 one. */
			switch (icmp->icmp_pptr) {
			case 0: /* Version/IHL */
			case 1: /* Type Of Service */
				mtu = icmp->icmp_pptr;
				break;
			case 2: /* Total Length */
			case 3: mtu = 4; /* Payload Length */
				break;
			case 8: /* Time to Live */
				mtu = 7; /* Hop Limit */
				break;
			case 9: /* Protocol */
				mtu = 6; /* Next Header */
				break;
			case 12: /* Source address */
			case 13:
			case 14:
			case 15:
				mtu = 8;
				break;
			case 16: /* Destination address */
			case 17:
			case 18:
			case 19:
				mtu = 24;
				break;
			default: /* Silently drop */
				DPRINTF(DP_DROPS, "Unsupported ICMP type %d,"
				    " code %d, pptr %d", icmp->icmp_type,
				    icmp->icmp_code, icmp->icmp_pptr);
				goto freeit;
			}
			break;
		default:
			DPRINTF(DP_DROPS, "Unsupported ICMP type %d,"
			    " code %d, pptr %d", icmp->icmp_type,
			    icmp->icmp_code, icmp->icmp_pptr);
			goto freeit;
		}
		break;
	default:
		DPRINTF(DP_DROPS, "Unsupported ICMP type %d, code %d",
		    icmp->icmp_type, icmp->icmp_code);
		goto freeit;
	}
	/*
	 * For echo request/reply we can use the original payload,
	 * but we need to adjust icmp_cksum, because the ICMPv6 cksum covers
	 * the IPv6 pseudo header and ICMPv6 types differ from ICMPv4.
	 */
	if (type == ICMP6_ECHO_REQUEST || type == ICMP6_ECHO_REPLY) {
		nat64_icmp_handle_echo(ip6, ICMP6(icmp), icmpid, type);
		return (m);
	}
	/*
	 * For other types of ICMP messages we need to translate inner
	 * IPv4 header to IPv6 header.
	 * Assume ICMP src is the same as payload dst
	 * E.g. we have ( GWsrc1 , NATIP1 ) in outer header
	 * and          ( NATIP1, Hostdst1 ) in ICMP copy header.
	 * In that case, we already have map for NATIP1 and GWsrc1.
	 * The only thing we need is to copy IPv6 map prefix to
	 * Hostdst1.
	 */
	hlen = offset + ICMP_MINLEN;
	if (m->m_pkthdr.len < hlen + sizeof(struct ip) + ICMP_MINLEN) {
		DPRINTF(DP_DROPS, "Message is too short %d",
		    m->m_pkthdr.len);
		goto freeit;
	}
	/* Work on a private copy of the quoted IPv4 header. */
	m_copydata(m, hlen, sizeof(struct ip), (char *)&ip);
	if (ip.ip_v != IPVERSION) {
		DPRINTF(DP_DROPS, "Wrong IP version %d", ip.ip_v);
		goto freeit;
	}
	/*
	 * NOTE(review): ip_hl of the quoted header is used without a lower
	 * bound check - confirm that a value below 5 is harmless here.
	 */
	hlen += ip.ip_hl << 2; /* Skip inner IP header */
	if (nat64_check_ip4(ip.ip_src.s_addr) != 0 ||
	    nat64_check_ip4(ip.ip_dst.s_addr) != 0 ||
	    nat64_check_private_ip4(cfg, ip.ip_src.s_addr) != 0 ||
	    nat64_check_private_ip4(cfg, ip.ip_dst.s_addr) != 0) {
		DPRINTF(DP_DROPS, "IP addresses checks failed %04x -> %04x",
		    ntohl(ip.ip_src.s_addr), ntohl(ip.ip_dst.s_addr));
		goto freeit;
	}
	/* At least 8 bytes of the quoted ULP header are required. */
	if (m->m_pkthdr.len < hlen + ICMP_MINLEN) {
		DPRINTF(DP_DROPS, "Message is too short %d",
		    m->m_pkthdr.len);
		goto freeit;
	}
#if 0
	/*
	 * Check that inner source matches the outer destination.
	 * XXX: We need some method to convert IPv4 into IPv6 address here,
	 *	and compare IPv6 addresses.
	 */
	if (ip.ip_src.s_addr != nat64_get_ip4(&ip6->ip6_dst)) {
		DPRINTF(DP_GENERIC, "Inner source doesn't match destination ",
		    "%04x vs %04x", ip.ip_src.s_addr,
		    nat64_get_ip4(&ip6->ip6_dst));
		goto freeit;
	}
#endif
	/*
	 * Create new mbuf for ICMPv6 datagram.
	 * NOTE: len is data length just after inner IP header.
	 * It is trimmed so that ICMPv6 header + inner IPv6 header + data
	 * do not exceed NAT64_ICMP6_PLEN.
	 */
	len = m->m_pkthdr.len - hlen;
	if (sizeof(struct ip6_hdr) +
	    sizeof(struct icmp6_hdr) + len > NAT64_ICMP6_PLEN)
		len = NAT64_ICMP6_PLEN - sizeof(struct icmp6_hdr) -
		    sizeof(struct ip6_hdr);
	plen = sizeof(struct icmp6_hdr) + sizeof(struct ip6_hdr) + len;
	n = m_get2(offset + plen + max_hdr, M_NOWAIT, MT_HEADER, M_PKTHDR);
	if (n == NULL) {
		NAT64STAT_INC(&cfg->stats, nomem);
		m_freem(m);
		return (NULL);
	}
	m_move_pkthdr(n, m);
	M_ALIGN(n, offset + plen + max_hdr);
	n->m_len = n->m_pkthdr.len = offset + plen;
	/* Adjust ip6_plen in outer header */
	ip6->ip6_plen = htons(plen);
	/* Construct new inner IPv6 header */
	eip6 = mtodo(n, offset + sizeof(struct icmp6_hdr));
	eip6->ip6_src = ip6->ip6_dst;
	/* Use the fact that we have single /96 prefix for IPv4 map */
	eip6->ip6_dst = ip6->ip6_src;
	nat64_embed_ip4(cfg, ip.ip_dst.s_addr, &eip6->ip6_dst);

	eip6->ip6_flow = htonl(ip.ip_tos << 20);
	eip6->ip6_vfc |= IPV6_VERSION;
	eip6->ip6_hlim = ip.ip_ttl;
	eip6->ip6_plen = htons(ntohs(ip.ip_len) - (ip.ip_hl << 2));
	eip6->ip6_nxt = (ip.ip_p == IPPROTO_ICMP) ? IPPROTO_ICMPV6: ip.ip_p;
	/* Copy the quoted ULP data right after the inner IPv6 header. */
	m_copydata(m, hlen, len, (char *)(eip6 + 1));
	/*
	 * We need to translate source port in the inner ULP header,
	 * and adjust ULP checksum.
	 */
	switch (ip.ip_p) {
	case IPPROTO_TCP:
		/*
		 * NOTE(review): this only guarantees the bytes preceding
		 * th_sum, while th_sum itself is read and written below.
		 * With an 8 byte ICMPv6 header and a 40 byte IPv6 header
		 * the trimming above would leave at most 16 bytes of ULP
		 * data - confirm the checksum field is really covered by
		 * the copied data.
		 */
		if (len < offsetof(struct tcphdr, th_sum))
			break;
		tcp = TCP(eip6 + 1);
		if (icmpid != 0) {
			tcp->th_sum = cksum_adjust(tcp->th_sum,
			    tcp->th_sport, icmpid);
			tcp->th_sport = icmpid;
		}
		tcp->th_sum = cksum_add(tcp->th_sum,
		    ~nat64_cksum_convert(eip6, &ip));
		break;
	case IPPROTO_UDP:
		if (len < offsetof(struct udphdr, uh_sum))
			break;
		udp = UDP(eip6 + 1);
		if (icmpid != 0) {
			udp->uh_sum = cksum_adjust(udp->uh_sum,
			    udp->uh_sport, icmpid);
			udp->uh_sport = icmpid;
		}
		udp->uh_sum = cksum_add(udp->uh_sum,
		    ~nat64_cksum_convert(eip6, &ip));
		break;
	case IPPROTO_ICMP:
		/*
		 * Check if this is an ICMP error message for echo request
		 * that we sent. I.e. ULP in the data containing invoking
		 * packet is IPPROTO_ICMP and its type is ICMP_ECHO.
		 */
		icmp = (struct icmp *)(eip6 + 1);
		if (icmp->icmp_type != ICMP_ECHO) {
			/* freeit releases only m, so drop n here. */
			m_freem(n);
			goto freeit;
		}
		/*
		 * For our client this original datagram should look
		 * like it was an ICMPv6 datagram with type
		 * ICMP6_ECHO_REQUEST. Thus we need to adjust icmp_cksum
		 * and convert type from ICMP_ECHO to ICMP6_ECHO_REQUEST.
		 */
		nat64_icmp_handle_echo(eip6, ICMP6(icmp), icmpid,
		    ICMP6_ECHO_REQUEST);
	}
	/* Everything needed from the original packet has been copied. */
	m_freem(m);
	/* Convert ICMPv4 into ICMPv6 header */
	icmp = mtodo(n, offset);
	ICMP6(icmp)->icmp6_type = type;
	ICMP6(icmp)->icmp6_code = code;
	ICMP6(icmp)->icmp6_mtu = htonl(mtu);
	ICMP6(icmp)->icmp6_cksum = 0;
	/* Checksum = IPv6 pseudo header + ICMPv6 message starting at offset. */
	ICMP6(icmp)->icmp6_cksum = cksum_add(
	    ~in6_cksum_pseudo(ip6, plen, IPPROTO_ICMPV6, 0),
	    in_cksum_skip(n, n->m_pkthdr.len, offset));
	return (n);
freeit:
	m_freem(m);
	NAT64STAT_INC(&cfg->stats, dropped);
	return (NULL);
}
1101
1102int
1103nat64_getlasthdr(struct mbuf *m, int *offset)
1104{
1105	struct ip6_hdr *ip6;
1106	struct ip6_hbh *hbh;
1107	int proto, hlen;
1108
1109	if (offset != NULL)
1110		hlen = *offset;
1111	else
1112		hlen = 0;
1113
1114	if (m->m_len < hlen + sizeof(*ip6))
1115		return (-1);
1116
1117	ip6 = mtodo(m, hlen);
1118	hlen += sizeof(*ip6);
1119	proto = ip6->ip6_nxt;
1120	/* Skip extension headers */
1121	while (proto == IPPROTO_HOPOPTS || proto == IPPROTO_ROUTING ||
1122	    proto == IPPROTO_DSTOPTS) {
1123		hbh = mtodo(m, hlen);
1124		/*
1125		 * We expect mbuf has contigious data up to
1126		 * upper level header.
1127		 */
1128		if (m->m_len < hlen)
1129			return (-1);
1130		/*
1131		 * We doesn't support Jumbo payload option,
1132		 * so return error.
1133		 */
1134		if (proto == IPPROTO_HOPOPTS && ip6->ip6_plen == 0)
1135			return (-1);
1136		proto = hbh->ip6h_nxt;
1137		hlen += (hbh->ip6h_len + 1) << 3;
1138	}
1139	if (offset != NULL)
1140		*offset = hlen;
1141	return (proto);
1142}
1143
1144int
1145nat64_do_handle_ip4(struct mbuf *m, struct in6_addr *saddr,
1146    struct in6_addr *daddr, uint16_t lport, struct nat64_config *cfg,
1147    void *logdata)
1148{
1149	struct nhop6_basic nh;
1150	struct ip6_hdr ip6;
1151	struct sockaddr_in6 dst;
1152	struct ip *ip;
1153	struct mbufq mq;
1154	uint16_t ip_id, ip_off;
1155	uint16_t *csum;
1156	int plen, hlen;
1157	uint8_t proto;
1158
1159	ip = mtod(m, struct ip*);
1160
1161	if (ip->ip_ttl <= IPTTLDEC) {
1162		nat64_icmp_reflect(m, ICMP_TIMXCEED,
1163		    ICMP_TIMXCEED_INTRANS, 0, &cfg->stats, logdata);
1164		return (NAT64RETURN);
1165	}
1166
1167	ip6.ip6_dst = *daddr;
1168	ip6.ip6_src = *saddr;
1169
1170	hlen = ip->ip_hl << 2;
1171	plen = ntohs(ip->ip_len) - hlen;
1172	proto = ip->ip_p;
1173
1174	/* Save ip_id and ip_off, both are in network byte order */
1175	ip_id = ip->ip_id;
1176	ip_off = ip->ip_off & htons(IP_OFFMASK | IP_MF);
1177
1178	/* Fragment length must be multiple of 8 octets */
1179	if ((ip->ip_off & htons(IP_MF)) != 0 && (plen & 0x7) != 0) {
1180		nat64_icmp_reflect(m, ICMP_PARAMPROB,
1181		    ICMP_PARAMPROB_LENGTH, 0, &cfg->stats, logdata);
1182		return (NAT64RETURN);
1183	}
1184	/* Fragmented ICMP is unsupported */
1185	if (proto == IPPROTO_ICMP && ip_off != 0) {
1186		DPRINTF(DP_DROPS, "dropped due to fragmented ICMP");
1187		NAT64STAT_INC(&cfg->stats, dropped);
1188		return (NAT64MFREE);
1189	}
1190
1191	dst.sin6_addr = ip6.ip6_dst;
1192	if (nat64_find_route6(&nh, &dst, m) != 0) {
1193		NAT64STAT_INC(&cfg->stats, noroute6);
1194		nat64_icmp_reflect(m, ICMP_UNREACH, ICMP_UNREACH_HOST, 0,
1195		    &cfg->stats, logdata);
1196		return (NAT64RETURN);
1197	}
1198	if (nh.nh_mtu < plen + sizeof(ip6) &&
1199	    (ip->ip_off & htons(IP_DF)) != 0) {
1200		nat64_icmp_reflect(m, ICMP_UNREACH, ICMP_UNREACH_NEEDFRAG,
1201		    FRAGSZ(nh.nh_mtu) + sizeof(struct ip), &cfg->stats, logdata);
1202		return (NAT64RETURN);
1203	}
1204
1205	ip6.ip6_flow = htonl(ip->ip_tos << 20);
1206	ip6.ip6_vfc |= IPV6_VERSION;
1207	ip6.ip6_hlim = ip->ip_ttl;
1208	/* Forwarding code will decrement TTL for netisr based output. */
1209	if (V_nat64out == &nat64_direct)
1210		ip6.ip6_hlim -= IPTTLDEC;
1211	ip6.ip6_plen = htons(plen);
1212	ip6.ip6_nxt = (proto == IPPROTO_ICMP) ? IPPROTO_ICMPV6: proto;
1213	/* Convert checksums. */
1214	switch (proto) {
1215	case IPPROTO_TCP:
1216		csum = &TCP(mtodo(m, hlen))->th_sum;
1217		if (lport != 0) {
1218			struct tcphdr *tcp = TCP(mtodo(m, hlen));
1219			*csum = cksum_adjust(*csum, tcp->th_dport, lport);
1220			tcp->th_dport = lport;
1221		}
1222		*csum = cksum_add(*csum, ~nat64_cksum_convert(&ip6, ip));
1223		break;
1224	case IPPROTO_UDP:
1225		csum = &UDP(mtodo(m, hlen))->uh_sum;
1226		if (lport != 0) {
1227			struct udphdr *udp = UDP(mtodo(m, hlen));
1228			*csum = cksum_adjust(*csum, udp->uh_dport, lport);
1229			udp->uh_dport = lport;
1230		}
1231		*csum = cksum_add(*csum, ~nat64_cksum_convert(&ip6, ip));
1232		break;
1233	case IPPROTO_ICMP:
1234		m = nat64_icmp_translate(m, &ip6, lport, hlen, cfg);
1235		if (m == NULL)	/* stats already accounted */
1236			return (NAT64RETURN);
1237	}
1238
1239	m_adj(m, hlen);
1240	mbufq_init(&mq, 255);
1241	nat64_fragment6(&cfg->stats, &ip6, &mq, m, nh.nh_mtu, ip_id, ip_off);
1242	while ((m = mbufq_dequeue(&mq)) != NULL) {
1243		if (V_nat64out->output(nh.nh_ifp, m, (struct sockaddr *)&dst,
1244		    &cfg->stats, logdata) != 0)
1245			break;
1246		NAT64STAT_INC(&cfg->stats, opcnt46);
1247	}
1248	mbufq_drain(&mq);
1249	return (NAT64RETURN);
1250}
1251
1252int
1253nat64_handle_icmp6(struct mbuf *m, int hlen, uint32_t aaddr, uint16_t aport,
1254    struct nat64_config *cfg, void *logdata)
1255{
1256	struct ip ip;
1257	struct icmp6_hdr *icmp6;
1258	struct ip6_frag *ip6f;
1259	struct ip6_hdr *ip6, *ip6i;
1260	uint32_t mtu;
1261	int plen, proto;
1262	uint8_t type, code;
1263
1264	if (hlen == 0) {
1265		ip6 = mtod(m, struct ip6_hdr *);
1266		if (nat64_check_ip6(&ip6->ip6_src) != 0 ||
1267		    nat64_check_ip6(&ip6->ip6_dst) != 0)
1268			return (NAT64SKIP);
1269
1270		proto = nat64_getlasthdr(m, &hlen);
1271		if (proto != IPPROTO_ICMPV6) {
1272			DPRINTF(DP_DROPS,
1273			    "dropped due to mbuf isn't contigious");
1274			NAT64STAT_INC(&cfg->stats, dropped);
1275			return (NAT64MFREE);
1276		}
1277	}
1278
1279	/*
1280	 * Translate ICMPv6 type and code to ICMPv4 (RFC7915).
1281	 * NOTE: ICMPv6 echo handled by nat64_do_handle_ip6().
1282	 */
1283	icmp6 = mtodo(m, hlen);
1284	mtu = 0;
1285	switch (icmp6->icmp6_type) {
1286	case ICMP6_DST_UNREACH:
1287		type = ICMP_UNREACH;
1288		switch (icmp6->icmp6_code) {
1289		case ICMP6_DST_UNREACH_NOROUTE:
1290		case ICMP6_DST_UNREACH_BEYONDSCOPE:
1291		case ICMP6_DST_UNREACH_ADDR:
1292			code = ICMP_UNREACH_HOST;
1293			break;
1294		case ICMP6_DST_UNREACH_ADMIN:
1295			code = ICMP_UNREACH_HOST_PROHIB;
1296			break;
1297		case ICMP6_DST_UNREACH_NOPORT:
1298			code = ICMP_UNREACH_PORT;
1299			break;
1300		default:
1301			DPRINTF(DP_DROPS, "Unsupported ICMPv6 type %d,"
1302			    " code %d", icmp6->icmp6_type,
1303			    icmp6->icmp6_code);
1304			NAT64STAT_INC(&cfg->stats, dropped);
1305			return (NAT64MFREE);
1306		}
1307		break;
1308	case ICMP6_PACKET_TOO_BIG:
1309		type = ICMP_UNREACH;
1310		code = ICMP_UNREACH_NEEDFRAG;
1311		mtu = ntohl(icmp6->icmp6_mtu);
1312		if (mtu < IPV6_MMTU) {
1313			DPRINTF(DP_DROPS, "Wrong MTU %d in ICMPv6 type %d,"
1314			    " code %d", mtu, icmp6->icmp6_type,
1315			    icmp6->icmp6_code);
1316			NAT64STAT_INC(&cfg->stats, dropped);
1317			return (NAT64MFREE);
1318		}
1319		/*
1320		 * Adjust MTU to reflect difference between
1321		 * IPv6 an IPv4 headers.
1322		 */
1323		mtu -= sizeof(struct ip6_hdr) - sizeof(struct ip);
1324		break;
1325	case ICMP6_TIME_EXCEEDED:
1326		type = ICMP_TIMXCEED;
1327		code = icmp6->icmp6_code;
1328		break;
1329	case ICMP6_PARAM_PROB:
1330		switch (icmp6->icmp6_code) {
1331		case ICMP6_PARAMPROB_HEADER:
1332			type = ICMP_PARAMPROB;
1333			code = ICMP_PARAMPROB_ERRATPTR;
1334			mtu = ntohl(icmp6->icmp6_pptr);
1335			switch (mtu) {
1336			case 0: /* Version/Traffic Class */
1337			case 1: /* Traffic Class/Flow Label */
1338				break;
1339			case 4: /* Payload Length */
1340			case 5:
1341				mtu = 2;
1342				break;
1343			case 6: /* Next Header */
1344				mtu = 9;
1345				break;
1346			case 7: /* Hop Limit */
1347				mtu = 8;
1348				break;
1349			default:
1350				if (mtu >= 8 && mtu <= 23) {
1351					mtu = 12; /* Source address */
1352					break;
1353				}
1354				if (mtu >= 24 && mtu <= 39) {
1355					mtu = 16; /* Destination address */
1356					break;
1357				}
1358				DPRINTF(DP_DROPS, "Unsupported ICMPv6 type %d,"
1359				    " code %d, pptr %d", icmp6->icmp6_type,
1360				    icmp6->icmp6_code, mtu);
1361				NAT64STAT_INC(&cfg->stats, dropped);
1362				return (NAT64MFREE);
1363			}
1364		case ICMP6_PARAMPROB_NEXTHEADER:
1365			type = ICMP_UNREACH;
1366			code = ICMP_UNREACH_PROTOCOL;
1367			break;
1368		default:
1369			DPRINTF(DP_DROPS, "Unsupported ICMPv6 type %d,"
1370			    " code %d, pptr %d", icmp6->icmp6_type,
1371			    icmp6->icmp6_code, ntohl(icmp6->icmp6_pptr));
1372			NAT64STAT_INC(&cfg->stats, dropped);
1373			return (NAT64MFREE);
1374		}
1375		break;
1376	default:
1377		DPRINTF(DP_DROPS, "Unsupported ICMPv6 type %d, code %d",
1378		    icmp6->icmp6_type, icmp6->icmp6_code);
1379		NAT64STAT_INC(&cfg->stats, dropped);
1380		return (NAT64MFREE);
1381	}
1382
1383	hlen += sizeof(struct icmp6_hdr);
1384	if (m->m_pkthdr.len < hlen + sizeof(struct ip6_hdr) + ICMP_MINLEN) {
1385		NAT64STAT_INC(&cfg->stats, dropped);
1386		DPRINTF(DP_DROPS, "Message is too short %d",
1387		    m->m_pkthdr.len);
1388		return (NAT64MFREE);
1389	}
1390	/*
1391	 * We need at least ICMP_MINLEN bytes of original datagram payload
1392	 * to generate ICMP message. It is nice that ICMP_MINLEN is equal
1393	 * to sizeof(struct ip6_frag). So, if embedded datagram had a fragment
1394	 * header we will not have to do m_pullup() again.
1395	 *
1396	 * What we have here:
1397	 * Outer header: (IPv6iGW, v4mapPRefix+v4exthost)
1398	 * Inner header: (v4mapPRefix+v4host, IPv6iHost) [sport, dport]
1399	 * We need to translate it to:
1400	 *
1401	 * Outer header: (alias_host, v4exthost)
1402	 * Inner header: (v4exthost, alias_host) [sport, alias_port]
1403	 *
1404	 * Assume caller function has checked if v4mapPRefix+v4host
1405	 * matches configured prefix.
1406	 * The only two things we should be provided with are mapping between
1407	 * IPv6iHost <> alias_host and between dport and alias_port.
1408	 */
1409	if (m->m_len < hlen + sizeof(struct ip6_hdr) + ICMP_MINLEN)
1410		m = m_pullup(m, hlen + sizeof(struct ip6_hdr) + ICMP_MINLEN);
1411	if (m == NULL) {
1412		NAT64STAT_INC(&cfg->stats, nomem);
1413		return (NAT64RETURN);
1414	}
1415	ip6 = mtod(m, struct ip6_hdr *);
1416	ip6i = mtodo(m, hlen);
1417	ip6f = NULL;
1418	proto = ip6i->ip6_nxt;
1419	plen = ntohs(ip6i->ip6_plen);
1420	hlen += sizeof(struct ip6_hdr);
1421	if (proto == IPPROTO_FRAGMENT) {
1422		if (m->m_pkthdr.len < hlen + sizeof(struct ip6_frag) +
1423		    ICMP_MINLEN)
1424			goto fail;
1425		ip6f = mtodo(m, hlen);
1426		proto = ip6f->ip6f_nxt;
1427		plen -= sizeof(struct ip6_frag);
1428		hlen += sizeof(struct ip6_frag);
1429		/* Ajust MTU to reflect frag header size */
1430		if (type == ICMP_UNREACH && code == ICMP_UNREACH_NEEDFRAG)
1431			mtu -= sizeof(struct ip6_frag);
1432	}
1433	if (proto != IPPROTO_TCP && proto != IPPROTO_UDP) {
1434		DPRINTF(DP_DROPS, "Unsupported proto %d in the inner header",
1435		    proto);
1436		goto fail;
1437	}
1438	if (nat64_check_ip6(&ip6i->ip6_src) != 0 ||
1439	    nat64_check_ip6(&ip6i->ip6_dst) != 0) {
1440		DPRINTF(DP_DROPS, "Inner addresses do not passes the check");
1441		goto fail;
1442	}
1443	/* Check if outer dst is the same as inner src */
1444	if (!IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &ip6i->ip6_src)) {
1445		DPRINTF(DP_DROPS, "Inner src doesn't match outer dst");
1446		goto fail;
1447	}
1448
1449	/* Now we need to make a fake IPv4 packet to generate ICMP message */
1450	ip.ip_dst.s_addr = aaddr;
1451	ip.ip_src.s_addr = nat64_extract_ip4(cfg, &ip6i->ip6_src);
1452	/* XXX: Make fake ulp header */
1453	if (V_nat64out == &nat64_direct) /* init_ip4hdr will decrement it */
1454		ip6i->ip6_hlim += IPV6_HLIMDEC;
1455	nat64_init_ip4hdr(ip6i, ip6f, plen, proto, &ip);
1456	m_adj(m, hlen - sizeof(struct ip));
1457	bcopy(&ip, mtod(m, void *), sizeof(ip));
1458	nat64_icmp_reflect(m, type, code, (uint16_t)mtu, &cfg->stats,
1459	    logdata);
1460	return (NAT64RETURN);
1461fail:
1462	/*
1463	 * We must call m_freem() because mbuf pointer could be
1464	 * changed with m_pullup().
1465	 */
1466	m_freem(m);
1467	NAT64STAT_INC(&cfg->stats, dropped);
1468	return (NAT64RETURN);
1469}
1470
1471int
1472nat64_do_handle_ip6(struct mbuf *m, uint32_t aaddr, uint16_t aport,
1473    struct nat64_config *cfg, void *logdata)
1474{
1475	struct ip ip;
1476	struct nhop4_basic nh;
1477	struct sockaddr_in dst;
1478	struct ip6_frag *frag;
1479	struct ip6_hdr *ip6;
1480	struct icmp6_hdr *icmp6;
1481	uint16_t *csum;
1482	int plen, hlen, proto;
1483
1484	/*
1485	 * XXX: we expect ipfw_chk() did m_pullup() up to upper level
1486	 * protocol's headers. Also we skip some checks, that ip6_input(),
1487	 * ip6_forward(), ip6_fastfwd() and ipfw_chk() already did.
1488	 */
1489	ip6 = mtod(m, struct ip6_hdr *);
1490	if (nat64_check_ip6(&ip6->ip6_src) != 0 ||
1491	    nat64_check_ip6(&ip6->ip6_dst) != 0) {
1492		return (NAT64SKIP);
1493	}
1494
1495	/* Starting from this point we must not return zero */
1496	ip.ip_src.s_addr = aaddr;
1497	if (nat64_check_ip4(ip.ip_src.s_addr) != 0) {
1498		DPRINTF(DP_GENERIC | DP_DROPS, "invalid source address: %08x",
1499		    ip.ip_src.s_addr);
1500		NAT64STAT_INC(&cfg->stats, dropped);
1501		return (NAT64MFREE);
1502	}
1503
1504	ip.ip_dst.s_addr = nat64_extract_ip4(cfg, &ip6->ip6_dst);
1505	if (ip.ip_dst.s_addr == 0) {
1506		NAT64STAT_INC(&cfg->stats, dropped);
1507		return (NAT64MFREE);
1508	}
1509
1510	if (ip6->ip6_hlim <= IPV6_HLIMDEC) {
1511		nat64_icmp6_reflect(m, ICMP6_TIME_EXCEEDED,
1512		    ICMP6_TIME_EXCEED_TRANSIT, 0, &cfg->stats, logdata);
1513		return (NAT64RETURN);
1514	}
1515
1516	hlen = 0;
1517	plen = ntohs(ip6->ip6_plen);
1518	proto = nat64_getlasthdr(m, &hlen);
1519	if (proto < 0) {
1520		DPRINTF(DP_DROPS, "dropped due to mbuf isn't contigious");
1521		NAT64STAT_INC(&cfg->stats, dropped);
1522		return (NAT64MFREE);
1523	}
1524	frag = NULL;
1525	if (proto == IPPROTO_FRAGMENT) {
1526		/* ipfw_chk should m_pullup up to frag header */
1527		if (m->m_len < hlen + sizeof(*frag)) {
1528			DPRINTF(DP_DROPS,
1529			    "dropped due to mbuf isn't contigious");
1530			NAT64STAT_INC(&cfg->stats, dropped);
1531			return (NAT64MFREE);
1532		}
1533		frag = mtodo(m, hlen);
1534		proto = frag->ip6f_nxt;
1535		hlen += sizeof(*frag);
1536		/* Fragmented ICMPv6 is unsupported */
1537		if (proto == IPPROTO_ICMPV6) {
1538			DPRINTF(DP_DROPS, "dropped due to fragmented ICMPv6");
1539			NAT64STAT_INC(&cfg->stats, dropped);
1540			return (NAT64MFREE);
1541		}
1542		/* Fragment length must be multiple of 8 octets */
1543		if ((frag->ip6f_offlg & IP6F_MORE_FRAG) != 0 &&
1544		    ((plen + sizeof(struct ip6_hdr) - hlen) & 0x7) != 0) {
1545			nat64_icmp6_reflect(m, ICMP6_PARAM_PROB,
1546			    ICMP6_PARAMPROB_HEADER,
1547			    offsetof(struct ip6_hdr, ip6_plen), &cfg->stats,
1548			    logdata);
1549			return (NAT64RETURN);
1550		}
1551	}
1552	plen -= hlen - sizeof(struct ip6_hdr);
1553	if (plen < 0 || m->m_pkthdr.len < plen + hlen) {
1554		DPRINTF(DP_DROPS, "plen %d, pkthdr.len %d, hlen %d",
1555		    plen, m->m_pkthdr.len, hlen);
1556		NAT64STAT_INC(&cfg->stats, dropped);
1557		return (NAT64MFREE);
1558	}
1559
1560	icmp6 = NULL;	/* Make gcc happy */
1561	if (proto == IPPROTO_ICMPV6) {
1562		icmp6 = mtodo(m, hlen);
1563		if (icmp6->icmp6_type != ICMP6_ECHO_REQUEST &&
1564		    icmp6->icmp6_type != ICMP6_ECHO_REPLY)
1565			return (nat64_handle_icmp6(m, hlen, aaddr, aport,
1566			    cfg, logdata));
1567	}
1568	dst.sin_addr.s_addr = ip.ip_dst.s_addr;
1569	if (nat64_find_route4(&nh, &dst, m) != 0) {
1570		NAT64STAT_INC(&cfg->stats, noroute4);
1571		nat64_icmp6_reflect(m, ICMP6_DST_UNREACH,
1572		    ICMP6_DST_UNREACH_NOROUTE, 0, &cfg->stats, logdata);
1573		return (NAT64RETURN);
1574	}
1575	if (nh.nh_mtu < plen + sizeof(ip)) {
1576		nat64_icmp6_reflect(m, ICMP6_PACKET_TOO_BIG, 0, nh.nh_mtu,
1577		    &cfg->stats, logdata);
1578		return (NAT64RETURN);
1579	}
1580	nat64_init_ip4hdr(ip6, frag, plen, proto, &ip);
1581	/* Convert checksums. */
1582	switch (proto) {
1583	case IPPROTO_TCP:
1584		csum = &TCP(mtodo(m, hlen))->th_sum;
1585		if (aport != 0) {
1586			struct tcphdr *tcp = TCP(mtodo(m, hlen));
1587			*csum = cksum_adjust(*csum, tcp->th_sport, aport);
1588			tcp->th_sport = aport;
1589		}
1590		*csum = cksum_add(*csum, nat64_cksum_convert(ip6, &ip));
1591		break;
1592	case IPPROTO_UDP:
1593		csum = &UDP(mtodo(m, hlen))->uh_sum;
1594		if (aport != 0) {
1595			struct udphdr *udp = UDP(mtodo(m, hlen));
1596			*csum = cksum_adjust(*csum, udp->uh_sport, aport);
1597			udp->uh_sport = aport;
1598		}
1599		*csum = cksum_add(*csum, nat64_cksum_convert(ip6, &ip));
1600		break;
1601	case IPPROTO_ICMPV6:
1602		/* Checksum in ICMPv6 covers pseudo header */
1603		csum = &icmp6->icmp6_cksum;
1604		*csum = cksum_add(*csum, in6_cksum_pseudo(ip6, plen,
1605		    IPPROTO_ICMPV6, 0));
1606		/* Convert ICMPv6 types to ICMP */
1607		proto = *(uint16_t *)icmp6; /* save old word for cksum_adjust */
1608		if (icmp6->icmp6_type == ICMP6_ECHO_REQUEST)
1609			icmp6->icmp6_type = ICMP_ECHO;
1610		else /* ICMP6_ECHO_REPLY */
1611			icmp6->icmp6_type = ICMP_ECHOREPLY;
1612		*csum = cksum_adjust(*csum, (uint16_t)proto,
1613		    *(uint16_t *)icmp6);
1614		if (aport != 0) {
1615			uint16_t old_id = icmp6->icmp6_id;
1616			icmp6->icmp6_id = aport;
1617			*csum = cksum_adjust(*csum, old_id, aport);
1618		}
1619		break;
1620	};
1621
1622	m_adj(m, hlen - sizeof(ip));
1623	bcopy(&ip, mtod(m, void *), sizeof(ip));
1624	if (V_nat64out->output(nh.nh_ifp, m, (struct sockaddr *)&dst,
1625	    &cfg->stats, logdata) == 0)
1626		NAT64STAT_INC(&cfg->stats, opcnt64);
1627	return (NAT64RETURN);
1628}
1629
1630