1/*	$NetBSD: ip_encap.c,v 1.77 2022/12/07 08:33:02 knakahara Exp $	*/
2/*	$KAME: ip_encap.c,v 1.73 2001/10/02 08:30:58 itojun Exp $	*/
3
4/*
5 * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
6 * All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 *    notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 * 3. Neither the name of the project nor the names of its contributors
17 *    may be used to endorse or promote products derived from this software
18 *    without specific prior written permission.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30 * SUCH DAMAGE.
31 */
32/*
33 * My grandfather said that there's a devil inside tunnelling technology...
34 *
35 * We have surprisingly many protocols that want packets with IP protocol
36 * #4 or #41.  Here's a list of protocols that want protocol #41:
37 *	RFC1933 configured tunnel
38 *	RFC1933 automatic tunnel
39 *	RFC2401 IPsec tunnel
40 *	RFC2473 IPv6 generic packet tunnelling
41 *	RFC2529 6over4 tunnel
42 *	RFC3056 6to4 tunnel
43 *	isatap tunnel
44 *	mobile-ip6 (uses RFC2473)
 * Here's a list of protocols that want protocol #4:
46 *	RFC1853 IPv4-in-IPv4 tunnelling
47 *	RFC2003 IPv4 encapsulation within IPv4
48 *	RFC2344 reverse tunnelling for mobile-ip4
49 *	RFC2401 IPsec tunnel
 * Well, what can I say.  They impose different en/decapsulation mechanisms
 * from each other, so they need separate protocol handlers.  The only one
 * we can easily determine by protocol # is IPsec, which always has an
 * AH/ESP/IPComp header right after the outer IP header.
54 *
55 * So, clearly good old protosw does not work for protocol #4 and #41.
56 * The code will let you match protocol via src/dst address pair.
57 */
58/* XXX is M_NETADDR correct? */
59
60#include <sys/cdefs.h>
61__KERNEL_RCSID(0, "$NetBSD: ip_encap.c,v 1.77 2022/12/07 08:33:02 knakahara Exp $");
62
63#ifdef _KERNEL_OPT
64#include "opt_mrouting.h"
65#include "opt_inet.h"
66#include "opt_net_mpsafe.h"
67#endif
68
69#include <sys/param.h>
70#include <sys/systm.h>
71#include <sys/socket.h>
72#include <sys/socketvar.h> /* for softnet_lock */
73#include <sys/sockio.h>
74#include <sys/mbuf.h>
75#include <sys/errno.h>
76#include <sys/queue.h>
77#include <sys/kmem.h>
78#include <sys/mutex.h>
79#include <sys/condvar.h>
80#include <sys/psref.h>
81#include <sys/pslist.h>
82#include <sys/thmap.h>
83
84#include <net/if.h>
85
86#include <netinet/in.h>
87#include <netinet/in_systm.h>
88#include <netinet/ip.h>
89#include <netinet/ip_var.h>
90#include <netinet/ip_encap.h>
91#ifdef MROUTING
92#include <netinet/ip_mroute.h>
93#endif /* MROUTING */
94
95#ifdef INET6
96#include <netinet/ip6.h>
97#include <netinet6/ip6_var.h>
98#include <netinet6/ip6protosw.h> /* for struct ip6ctlparam */
99#include <netinet6/in6_var.h>
100#include <netinet6/in6_pcb.h>
101#include <netinet/icmp6.h>
102#endif
103
104#ifdef NET_MPSAFE
105#define ENCAP_MPSAFE	1
106#endif
107
/*
 * Tells encap[46]_lookup() how to map the packet's source/destination
 * addresses onto the "mine"/"yours" pair used for matching: for INBOUND
 * the destination is "mine", otherwise the source is.
 */
enum direction { INBOUND, OUTBOUND };

/* Forward declarations of file-local helpers defined further down. */
#ifdef INET
static struct encaptab *encap4_lookup(struct mbuf *, int, int, enum direction,
    struct psref *);
#endif
#ifdef INET6
static struct encaptab *encap6_lookup(struct mbuf *, int, int, enum direction,
    struct psref *);
#endif
static int encap_add(struct encaptab *);
static int encap_remove(struct encaptab *);
static void encap_afcheck(int, const struct sockaddr *, const struct sockaddr *);
static void encap_key_init(struct encap_key *, const struct sockaddr *,
    const struct sockaddr *);
static void encap_key_inc(struct encap_key *);
124
/*
 * In encap[46]_lookup(), ep->func can sleep (e.g. rtalloc1) while walking
 * encap_table, so the walk cannot stay inside one pserialize_read_enter()
 * section.  The lookups instead pin the current element with a psref and
 * leave the read section around the call.
 */
static struct {
	struct pslist_head	list;	/* attachments added via encap_add() */
	pserialize_t		psz;	/* used by pserialize_perform() on detach */
	struct psref_class	*elem_class; /* for the element of et_list */
} encaptab  __cacheline_aligned = {
	.list = PSLIST_INITIALIZER,
};
#define encap_table encaptab.list

/* State behind encap_lock_enter()/encap_lock_exit()/encap_lock_held(). */
static struct {
	kmutex_t	lock;	/* taken around every update of "busy" */
	kcondvar_t	cv;	/* waited on while "busy" is non-NULL */
	struct lwp	*busy;	/* lwp that entered the lock, or NULL */
} encap_whole __cacheline_aligned;

/* Address-keyed attachments, filled by encap_attach_addr(). */
static thmap_t *encap_map[2];	/* 0 for AF_INET, 1 for AF_INET6 */

/* Set once encapinit() has run; makes repeated calls no-ops. */
static bool encap_initialized = false;
147/*
148 * must be done before other encap interfaces initialization.
149 */
150void
151encapinit(void)
152{
153
154	if (encap_initialized)
155		return;
156
157	encaptab.psz = pserialize_create();
158	encaptab.elem_class = psref_class_create("encapelem", IPL_SOFTNET);
159
160	mutex_init(&encap_whole.lock, MUTEX_DEFAULT, IPL_NONE);
161	cv_init(&encap_whole.cv, "ip_encap cv");
162	encap_whole.busy = NULL;
163
164	encap_initialized = true;
165}
166
167void
168encap_init(void)
169{
170	static int initialized = 0;
171
172	if (initialized)
173		return;
174	initialized++;
175#if 0
176	/*
177	 * we cannot use LIST_INIT() here, since drivers may want to call
178	 * encap_attach(), on driver attach.  encap_init() will be called
179	 * on AF_INET{,6} initialization, which happens after driver
180	 * initialization - using LIST_INIT() here can nuke encap_attach()
181	 * from drivers.
182	 */
183	PSLIST_INIT(&encap_table);
184#endif
185
186	encap_map[0] = thmap_create(0, NULL, THMAP_NOCOPY);
187#ifdef INET6
188	encap_map[1] = thmap_create(0, NULL, THMAP_NOCOPY);
189#endif
190}
191
192#ifdef INET
193static struct encaptab *
194encap4_lookup(struct mbuf *m, int off, int proto, enum direction dir,
195    struct psref *match_psref)
196{
197	struct ip *ip;
198	struct ip_pack4 pack;
199	struct encaptab *ep, *match;
200	int prio, matchprio;
201	int s;
202	thmap_t *emap = encap_map[0];
203	struct encap_key key;
204
205	KASSERT(m->m_len >= sizeof(*ip));
206
207	ip = mtod(m, struct ip *);
208
209	memset(&pack, 0, sizeof(pack));
210	pack.p.sp_len = sizeof(pack);
211	pack.mine.sin_family = pack.yours.sin_family = AF_INET;
212	pack.mine.sin_len = pack.yours.sin_len = sizeof(struct sockaddr_in);
213	if (dir == INBOUND) {
214		pack.mine.sin_addr = ip->ip_dst;
215		pack.yours.sin_addr = ip->ip_src;
216	} else {
217		pack.mine.sin_addr = ip->ip_src;
218		pack.yours.sin_addr = ip->ip_dst;
219	}
220
221	match = NULL;
222	matchprio = 0;
223
224	s = pserialize_read_enter();
225
226	encap_key_init(&key, sintosa(&pack.mine), sintosa(&pack.yours));
227	while ((ep = thmap_get(emap, &key, sizeof(key))) != NULL) {
228		struct psref elem_psref;
229
230		KASSERT(ep->af == AF_INET);
231
232		if (ep->proto >= 0 && ep->proto != proto) {
233			encap_key_inc(&key);
234			continue;
235		}
236
237		psref_acquire(&elem_psref, &ep->psref,
238		    encaptab.elem_class);
239		if (ep->func) {
240			pserialize_read_exit(s);
241			prio = (*ep->func)(m, off, proto, ep->arg);
242			s = pserialize_read_enter();
243		} else {
244			prio = pack.mine.sin_len + pack.yours.sin_len;
245		}
246
247		if (prio <= 0) {
248			psref_release(&elem_psref, &ep->psref,
249			    encaptab.elem_class);
250			encap_key_inc(&key);
251			continue;
252		}
253		if (prio > matchprio) {
254			/* release last matched ep */
255			if (match != NULL)
256				psref_release(match_psref, &match->psref,
257				    encaptab.elem_class);
258
259			psref_copy(match_psref, &elem_psref,
260			    encaptab.elem_class);
261			matchprio = prio;
262			match = ep;
263		}
264
265		psref_release(&elem_psref, &ep->psref,
266		    encaptab.elem_class);
267		encap_key_inc(&key);
268	}
269
270	PSLIST_READER_FOREACH(ep, &encap_table, struct encaptab, chain) {
271		struct psref elem_psref;
272
273		if (ep->af != AF_INET)
274			continue;
275		if (ep->proto >= 0 && ep->proto != proto)
276			continue;
277
278		psref_acquire(&elem_psref, &ep->psref,
279		    encaptab.elem_class);
280		pserialize_read_exit(s);
281		/* ep->func is sleepable. e.g. rtalloc1 */
282		prio = (*ep->func)(m, off, proto, ep->arg);
283		s = pserialize_read_enter();
284
285		/*
286		 * We prioritize the matches by using bit length of the
287		 * matches.  user-supplied matching function
288		 * should return the bit length of the matches (for example,
289		 * if both src/dst are matched for IPv4, 64 should be returned).
290		 * 0 or negative return value means "it did not match".
291		 *
292		 * We need to loop through all the possible candidates
293		 * to get the best match - the search takes O(n) for
294		 * n attachments (i.e. interfaces).
295		 */
296		if (prio <= 0) {
297			psref_release(&elem_psref, &ep->psref,
298			    encaptab.elem_class);
299			continue;
300		}
301		if (prio > matchprio) {
302			/* release last matched ep */
303			if (match != NULL)
304				psref_release(match_psref, &match->psref,
305				    encaptab.elem_class);
306
307			psref_copy(match_psref, &elem_psref,
308			    encaptab.elem_class);
309			matchprio = prio;
310			match = ep;
311		}
312		KASSERTMSG((match == NULL) || psref_held(&match->psref,
313			encaptab.elem_class),
314		    "current match = %p, but not hold its psref", match);
315
316		psref_release(&elem_psref, &ep->psref,
317		    encaptab.elem_class);
318	}
319	pserialize_read_exit(s);
320
321	return match;
322}
323
324void
325encap4_input(struct mbuf *m, int off, int proto)
326{
327	const struct encapsw *esw;
328	struct encaptab *match;
329	struct psref match_psref;
330
331	match = encap4_lookup(m, off, proto, INBOUND, &match_psref);
332	if (match) {
333		/* found a match, "match" has the best one */
334		esw = match->esw;
335		if (esw && esw->encapsw4.pr_input) {
336			(*esw->encapsw4.pr_input)(m, off, proto, match->arg);
337			psref_release(&match_psref, &match->psref,
338			    encaptab.elem_class);
339		} else {
340			psref_release(&match_psref, &match->psref,
341			    encaptab.elem_class);
342			m_freem(m);
343		}
344		return;
345	}
346
347	/* last resort: inject to raw socket */
348	SOFTNET_LOCK_IF_NET_MPSAFE();
349	rip_input(m, off, proto);
350	SOFTNET_UNLOCK_IF_NET_MPSAFE();
351}
352#endif
353
354#ifdef INET6
355static struct encaptab *
356encap6_lookup(struct mbuf *m, int off, int proto, enum direction dir,
357    struct psref *match_psref)
358{
359	struct ip6_hdr *ip6;
360	struct ip_pack6 pack;
361	int prio, matchprio;
362	int s;
363	struct encaptab *ep, *match;
364	thmap_t *emap = encap_map[1];
365	struct encap_key key;
366
367	KASSERT(m->m_len >= sizeof(*ip6));
368
369	ip6 = mtod(m, struct ip6_hdr *);
370
371	memset(&pack, 0, sizeof(pack));
372	pack.p.sp_len = sizeof(pack);
373	pack.mine.sin6_family = pack.yours.sin6_family = AF_INET6;
374	pack.mine.sin6_len = pack.yours.sin6_len = sizeof(struct sockaddr_in6);
375	if (dir == INBOUND) {
376		pack.mine.sin6_addr = ip6->ip6_dst;
377		pack.yours.sin6_addr = ip6->ip6_src;
378	} else {
379		pack.mine.sin6_addr = ip6->ip6_src;
380		pack.yours.sin6_addr = ip6->ip6_dst;
381	}
382
383	match = NULL;
384	matchprio = 0;
385
386	s = pserialize_read_enter();
387
388	encap_key_init(&key, sin6tosa(&pack.mine), sin6tosa(&pack.yours));
389	while ((ep = thmap_get(emap, &key, sizeof(key))) != NULL) {
390		struct psref elem_psref;
391
392		KASSERT(ep->af == AF_INET6);
393
394		if (ep->proto >= 0 && ep->proto != proto) {
395			encap_key_inc(&key);
396			continue;
397		}
398
399		psref_acquire(&elem_psref, &ep->psref,
400		    encaptab.elem_class);
401		if (ep->func) {
402			pserialize_read_exit(s);
403			prio = (*ep->func)(m, off, proto, ep->arg);
404			s = pserialize_read_enter();
405		} else {
406			prio = pack.mine.sin6_len + pack.yours.sin6_len;
407		}
408
409		if (prio <= 0) {
410			psref_release(&elem_psref, &ep->psref,
411			    encaptab.elem_class);
412			encap_key_inc(&key);
413			continue;
414		}
415		if (prio > matchprio) {
416			/* release last matched ep */
417			if (match != NULL)
418				psref_release(match_psref, &match->psref,
419				    encaptab.elem_class);
420
421			psref_copy(match_psref, &elem_psref,
422			    encaptab.elem_class);
423			matchprio = prio;
424			match = ep;
425		}
426		psref_release(&elem_psref, &ep->psref,
427		    encaptab.elem_class);
428		encap_key_inc(&key);
429	}
430
431	PSLIST_READER_FOREACH(ep, &encap_table, struct encaptab, chain) {
432		struct psref elem_psref;
433
434		if (ep->af != AF_INET6)
435			continue;
436		if (ep->proto >= 0 && ep->proto != proto)
437			continue;
438
439		psref_acquire(&elem_psref, &ep->psref,
440		    encaptab.elem_class);
441
442		pserialize_read_exit(s);
443		/* ep->func is sleepable. e.g. rtalloc1 */
444		prio = (*ep->func)(m, off, proto, ep->arg);
445		s = pserialize_read_enter();
446
447		/* see encap4_lookup() for issues here */
448		if (prio <= 0) {
449			psref_release(&elem_psref, &ep->psref,
450			    encaptab.elem_class);
451			continue;
452		}
453		if (prio > matchprio) {
454			/* release last matched ep */
455			if (match != NULL)
456				psref_release(match_psref, &match->psref,
457				    encaptab.elem_class);
458
459			psref_copy(match_psref, &elem_psref,
460			    encaptab.elem_class);
461			matchprio = prio;
462			match = ep;
463		}
464		KASSERTMSG((match == NULL) || psref_held(&match->psref,
465			encaptab.elem_class),
466		    "current match = %p, but not hold its psref", match);
467
468		psref_release(&elem_psref, &ep->psref,
469		    encaptab.elem_class);
470	}
471	pserialize_read_exit(s);
472
473	return match;
474}
475
476int
477encap6_input(struct mbuf **mp, int *offp, int proto)
478{
479	struct mbuf *m = *mp;
480	const struct encapsw *esw;
481	struct encaptab *match;
482	struct psref match_psref;
483	int rv;
484
485	match = encap6_lookup(m, *offp, proto, INBOUND, &match_psref);
486
487	if (match) {
488		/* found a match */
489		esw = match->esw;
490		if (esw && esw->encapsw6.pr_input) {
491			int ret;
492			ret = (*esw->encapsw6.pr_input)(mp, offp, proto,
493			    match->arg);
494			psref_release(&match_psref, &match->psref,
495			    encaptab.elem_class);
496			return ret;
497		} else {
498			psref_release(&match_psref, &match->psref,
499			    encaptab.elem_class);
500			m_freem(m);
501			return IPPROTO_DONE;
502		}
503	}
504
505	/* last resort: inject to raw socket */
506	SOFTNET_LOCK_IF_NET_MPSAFE();
507	rv = rip6_input(mp, offp, proto);
508	SOFTNET_UNLOCK_IF_NET_MPSAFE();
509	return rv;
510}
511#endif
512
/*
 * Link "ep" at the head of encap_table.
 * The caller must hold the encap lock (see encap_lock_enter()).
 * Always returns 0.
 */
static int
encap_add(struct encaptab *ep)
{

	KASSERT(encap_lock_held());

	PSLIST_WRITER_INSERT_HEAD(&encap_table, ep, chain);

	return 0;
}
523
524static int
525encap_remove(struct encaptab *ep)
526{
527	int error = 0;
528
529	KASSERT(encap_lock_held());
530
531	PSLIST_WRITER_REMOVE(ep, chain);
532
533	return error;
534}
535
536static void
537encap_afcheck(int af, const struct sockaddr *sp, const struct sockaddr *dp)
538{
539
540	KASSERT(sp != NULL && dp != NULL);
541	KASSERT(sp->sa_len == dp->sa_len);
542	KASSERT(af == sp->sa_family && af == dp->sa_family);
543
544	socklen_t len __diagused = sockaddr_getsize_by_family(af);
545	KASSERT(len != 0 && len == sp->sa_len && len == dp->sa_len);
546}
547
548const struct encaptab *
549encap_attach_func(int af, int proto,
550    encap_priofunc_t *func,
551    const struct encapsw *esw, void *arg)
552{
553	struct encaptab *ep;
554	int error;
555#ifndef ENCAP_MPSAFE
556	int s;
557
558	s = splsoftnet();
559#endif
560
561	ASSERT_SLEEPABLE();
562
563	/* sanity check on args */
564	KASSERT(func != NULL);
565	KASSERT(af == AF_INET
566#ifdef INET6
567	    || af == AF_INET6
568#endif
569	);
570
571	ep = kmem_alloc(sizeof(*ep), KM_SLEEP);
572	memset(ep, 0, sizeof(*ep));
573
574	ep->af = af;
575	ep->proto = proto;
576	ep->func = func;
577	ep->esw = esw;
578	ep->arg = arg;
579	psref_target_init(&ep->psref, encaptab.elem_class);
580
581	error = encap_add(ep);
582	if (error)
583		goto gc;
584
585	error = 0;
586#ifndef ENCAP_MPSAFE
587	splx(s);
588#endif
589	return ep;
590
591gc:
592	kmem_free(ep, sizeof(*ep));
593#ifndef ENCAP_MPSAFE
594	splx(s);
595#endif
596	return NULL;
597}
598
/*
 * Build a map key from a local/remote address pair.
 * The whole key is zeroed first, so the sequence number starts at 0 and
 * any bytes not written by sockaddr_copy() are zero; the key is later
 * handed to thmap_get()/thmap_put() together with sizeof(key).
 */
static void
encap_key_init(struct encap_key *key,
    const struct sockaddr *local, const struct sockaddr *remote)
{

	memset(key, 0, sizeof(*key));

	sockaddr_copy(&key->local_sa, sizeof(key->local_u), local);
	sockaddr_copy(&key->remote_sa, sizeof(key->remote_u), remote);
}
609
610static void
611encap_key_inc(struct encap_key *key)
612{
613
614	(key->seq)++;
615}
616
617static void
618encap_key_dec(struct encap_key *key)
619{
620
621	(key->seq)--;
622}
623
624static void
625encap_key_copy(struct encap_key *dst, const struct encap_key *src)
626{
627
628	memset(dst, 0, sizeof(*dst));
629	*dst = *src;
630}
631
632/*
633 * src is always my side, and dst is always remote side.
634 * Return value will be necessary as input (cookie) for encap_detach().
635 */
636const struct encaptab *
637encap_attach_addr(int af, int proto,
638    const struct sockaddr *src, const struct sockaddr *dst,
639    encap_priofunc_t *func,
640    const struct encapsw *esw, void *arg)
641{
642	struct encaptab *ep;
643	size_t l;
644	thmap_t *emap;
645	void *retep;
646	struct ip_pack4 *pack4;
647#ifdef INET6
648	struct ip_pack6 *pack6;
649#endif
650
651	ASSERT_SLEEPABLE();
652
653	encap_afcheck(af, src, dst);
654
655	switch (af) {
656	case AF_INET:
657		l = sizeof(*pack4);
658		emap = encap_map[0];
659		break;
660#ifdef INET6
661	case AF_INET6:
662		l = sizeof(*pack6);
663		emap = encap_map[1];
664		break;
665#endif
666	default:
667		return NULL;
668	}
669
670	ep = kmem_zalloc(sizeof(*ep), KM_SLEEP);
671	ep->addrpack = kmem_zalloc(l, KM_SLEEP);
672	ep->addrpack->sa_len = l & 0xff;
673	ep->af = af;
674	ep->proto = proto;
675	ep->flag = IP_ENCAP_ADDR_ENABLE;
676	switch (af) {
677	case AF_INET:
678		pack4 = (struct ip_pack4 *)ep->addrpack;
679		ep->src = (struct sockaddr *)&pack4->mine;
680		ep->dst = (struct sockaddr *)&pack4->yours;
681		break;
682#ifdef INET6
683	case AF_INET6:
684		pack6 = (struct ip_pack6 *)ep->addrpack;
685		ep->src = (struct sockaddr *)&pack6->mine;
686		ep->dst = (struct sockaddr *)&pack6->yours;
687		break;
688#endif
689	}
690	memcpy(ep->src, src, src->sa_len);
691	memcpy(ep->dst, dst, dst->sa_len);
692	ep->esw = esw;
693	ep->arg = arg;
694	ep->func = func;
695	psref_target_init(&ep->psref, encaptab.elem_class);
696
697	encap_key_init(&ep->key, src, dst);
698	while ((retep = thmap_put(emap, &ep->key, sizeof(ep->key), ep)) != ep)
699		encap_key_inc(&ep->key);
700	return ep;
701}
702
703
704/* XXX encap4_ctlinput() is necessary if we set DF=1 on outer IPv4 header */
705
706#ifdef INET6
707void *
708encap6_ctlinput(int cmd, const struct sockaddr *sa, void *d0)
709{
710	void *d = d0;
711	struct ip6_hdr *ip6;
712	struct mbuf *m;
713	int off;
714	struct ip6ctlparam *ip6cp = NULL;
715	int nxt;
716	int s;
717	struct encaptab *ep;
718	const struct encapsw *esw;
719
720	if (sa->sa_family != AF_INET6 ||
721	    sa->sa_len != sizeof(struct sockaddr_in6))
722		return NULL;
723
724	if ((unsigned)cmd >= PRC_NCMDS)
725		return NULL;
726	if (cmd == PRC_HOSTDEAD)
727		d = NULL;
728	else if (cmd == PRC_MSGSIZE)
729		; /* special code is present, see below */
730	else if (inet6ctlerrmap[cmd] == 0)
731		return NULL;
732
733	/* if the parameter is from icmp6, decode it. */
734	if (d != NULL) {
735		ip6cp = (struct ip6ctlparam *)d;
736		m = ip6cp->ip6c_m;
737		ip6 = ip6cp->ip6c_ip6;
738		off = ip6cp->ip6c_off;
739		nxt = ip6cp->ip6c_nxt;
740
741		if (ip6 && cmd == PRC_MSGSIZE) {
742			int valid = 0;
743			struct encaptab *match;
744			struct psref elem_psref;
745
746			/*
747		 	* Check to see if we have a valid encap configuration.
748		 	*/
749			match = encap6_lookup(m, off, nxt, OUTBOUND,
750			    &elem_psref);
751			if (match) {
752				valid++;
753				psref_release(&elem_psref, &match->psref,
754				    encaptab.elem_class);
755			}
756
757			/*
758		 	* Depending on the value of "valid" and routing table
759		 	* size (mtudisc_{hi,lo}wat), we will:
760		 	* - recalcurate the new MTU and create the
761		 	*   corresponding routing entry, or
762		 	* - ignore the MTU change notification.
763		 	*/
764			icmp6_mtudisc_update((struct ip6ctlparam *)d, valid);
765		}
766	} else {
767		m = NULL;
768		ip6 = NULL;
769		nxt = -1;
770	}
771
772	/* inform all listeners */
773
774	s = pserialize_read_enter();
775	PSLIST_READER_FOREACH(ep, &encap_table, struct encaptab, chain) {
776		struct psref elem_psref;
777
778		if (ep->af != AF_INET6)
779			continue;
780		if (ep->proto >= 0 && ep->proto != nxt)
781			continue;
782
783		/* should optimize by looking at address pairs */
784
785		/* XXX need to pass ep->arg or ep itself to listeners */
786		psref_acquire(&elem_psref, &ep->psref,
787		    encaptab.elem_class);
788		esw = ep->esw;
789		if (esw && esw->encapsw6.pr_ctlinput) {
790			pserialize_read_exit(s);
791			/* pr_ctlinput is sleepable. e.g. rtcache_free */
792			(*esw->encapsw6.pr_ctlinput)(cmd, sa, d, ep->arg);
793			s = pserialize_read_enter();
794		}
795		psref_release(&elem_psref, &ep->psref,
796		    encaptab.elem_class);
797	}
798	pserialize_read_exit(s);
799
800	rip6_ctlinput(cmd, sa, d0);
801	return NULL;
802}
803#endif
804
805static int
806encap_detach_addr(const struct encaptab *ep)
807{
808	thmap_t *emap;
809	struct encaptab *retep;
810	struct encaptab *target;
811	void *thgc;
812	struct encap_key key;
813
814	KASSERT(encap_lock_held());
815	KASSERT(ep->flag & IP_ENCAP_ADDR_ENABLE);
816
817	switch (ep->af) {
818	case AF_INET:
819		emap = encap_map[0];
820		break;
821#ifdef INET6
822	case AF_INET6:
823		emap = encap_map[1];
824		break;
825#endif
826	default:
827		return EINVAL;
828	}
829
830	retep = thmap_del(emap, &ep->key, sizeof(ep->key));
831	if (retep != ep) {
832		return ENOENT;
833	}
834	target = retep;
835
836	/*
837	 * To keep continuity, decrement seq after detached encaptab.
838	 */
839	encap_key_copy(&key, &ep->key);
840	encap_key_inc(&key);
841	while ((retep = thmap_del(emap, &key, sizeof(key))) != NULL) {
842		void *pp;
843
844		encap_key_dec(&retep->key);
845		pp = thmap_put(emap, &retep->key, sizeof(retep->key), retep);
846		KASSERT(retep == pp);
847
848		encap_key_inc(&key);
849	}
850
851	thgc = thmap_stage_gc(emap);
852	pserialize_perform(encaptab.psz);
853	thmap_gc(emap, thgc);
854	psref_target_destroy(&target->psref, encaptab.elem_class);
855	kmem_free(target->addrpack, target->addrpack->sa_len);
856	kmem_free(target, sizeof(*target));
857
858	return 0;
859}
860
861int
862encap_detach(const struct encaptab *cookie)
863{
864	const struct encaptab *ep = cookie;
865	struct encaptab *p;
866	int error;
867
868	KASSERT(encap_lock_held());
869
870	if (ep->flag & IP_ENCAP_ADDR_ENABLE)
871		return encap_detach_addr(ep);
872
873	PSLIST_WRITER_FOREACH(p, &encap_table, struct encaptab, chain) {
874		if (p == ep) {
875			error = encap_remove(p);
876			if (error)
877				return error;
878			else
879				break;
880		}
881	}
882	if (p == NULL)
883		return ENOENT;
884
885	pserialize_perform(encaptab.psz);
886	psref_target_destroy(&p->psref,
887	    encaptab.elem_class);
888	kmem_free(p, sizeof(*p));
889
890	return 0;
891}
892
893int
894encap_lock_enter(void)
895{
896	int error;
897
898	mutex_enter(&encap_whole.lock);
899	while (encap_whole.busy != NULL) {
900		error = cv_wait_sig(&encap_whole.cv, &encap_whole.lock);
901		if (error) {
902			mutex_exit(&encap_whole.lock);
903			return error;
904		}
905	}
906	KASSERT(encap_whole.busy == NULL);
907	encap_whole.busy = curlwp;
908	mutex_exit(&encap_whole.lock);
909
910	return 0;
911}
912
913void
914encap_lock_exit(void)
915{
916
917	mutex_enter(&encap_whole.lock);
918	KASSERT(encap_whole.busy == curlwp);
919	encap_whole.busy = NULL;
920	cv_broadcast(&encap_whole.cv);
921	mutex_exit(&encap_whole.lock);
922}
923
924bool
925encap_lock_held(void)
926{
927
928	return (encap_whole.busy == curlwp);
929}
930