1/*	$OpenBSD: if_pfsync.c,v 1.326 2024/05/24 06:38:41 sashan Exp $	*/
2
3/*
4 * Copyright (c) 2002 Michael Shalayeff
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
17 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
19 * IN NO EVENT SHALL THE AUTHOR OR HIS RELATIVES BE LIABLE FOR ANY DIRECT,
20 * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 * SERVICES; LOSS OF MIND, USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
24 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
25 * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
26 * THE POSSIBILITY OF SUCH DAMAGE.
27 */
28
29/*
30 * Copyright (c) 2009, 2022, 2023 David Gwynne <dlg@openbsd.org>
31 *
32 * Permission to use, copy, modify, and distribute this software for any
33 * purpose with or without fee is hereby granted, provided that the above
34 * copyright notice and this permission notice appear in all copies.
35 *
36 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
37 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
38 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
39 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
40 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
41 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
42 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
43 */
44
45#include "bpfilter.h"
46#include "pfsync.h"
47#include "kstat.h"
48
49#include <sys/param.h>
50#include <sys/systm.h>
51#include <sys/time.h>
52#include <sys/malloc.h>
53#include <sys/mbuf.h>
54#include <sys/socket.h>
55#include <sys/ioctl.h>
56#include <sys/timeout.h>
57#include <sys/kernel.h>
58#include <sys/sysctl.h>
59#include <sys/pool.h>
60#include <sys/syslog.h>
61#include <sys/tree.h>
62#include <sys/smr.h>
63#include <sys/percpu.h>
64#include <sys/refcnt.h>
65#include <sys/kstat.h>
66#include <sys/stdarg.h>
67
68#include <net/if.h>
69#include <net/if_types.h>
70#include <net/bpf.h>
71#include <net/netisr.h>
72#include <net/route.h>
73
74#include <netinet/in.h>
75#include <netinet/if_ether.h>
76#include <netinet/ip.h>
77#include <netinet/in_var.h>
78#include <netinet/ip_var.h>
79#include <netinet/ip_ipsp.h>
80#include <netinet/ip_icmp.h>
81#include <netinet/icmp6.h>
82#include <netinet/tcp.h>
83#include <netinet/tcp_seq.h>
84#include <netinet/tcp_fsm.h>
85#include <netinet/udp.h>
86
87#ifdef INET6
88#include <netinet6/in6_var.h>
89#include <netinet/ip6.h>
90#include <netinet6/ip6_var.h>
91#include <netinet6/nd6.h>
92#endif /* INET6 */
93
94#include "carp.h"
95#if NCARP > 0
96#include <netinet/ip_carp.h>
97#endif
98
99#include <net/pfvar.h>
100#include <net/pfvar_priv.h>
101#include <net/if_pfsync.h>
102
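/*
 * the smallest packet pfsync will build: an IPv4 header followed by
 * the pfsync header, before any subheaders or records are queued.
 */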
103#define PFSYNC_MINPKT ( \
104	sizeof(struct ip) + \
105	sizeof(struct pfsync_header))
106
107struct pfsync_softc;
108
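/*
 * a deferral holds the packet that created a state until the peer
 * acknowledges the state insert, or until a short deadline passes.
 */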
109struct pfsync_deferral {
110	TAILQ_ENTRY(pfsync_deferral)		 pd_entry;
111	struct pf_state				*pd_st;
112	struct mbuf				*pd_m;
113	uint64_t				 pd_deadline;
114};
115TAILQ_HEAD(pfsync_deferrals, pfsync_deferral);
116
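/*
 * deferred packets are held for at most PFSYNC_DEFER_NSEC (20ms), and
 * at most PFSYNC_DEFER_LIMIT of them are queued per slice before the
 * oldest ones are pushed out early.
 */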
117#define PFSYNC_DEFER_NSEC	20000000ULL
118#define PFSYNC_DEFER_LIMIT	128
119#define PFSYNC_BULK_SND_IVAL_MS	20
120
121static struct pool pfsync_deferrals_pool;
122
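/*
 * state machine used to request a bulk update from a peer when the
 * interface is brought up. events move it from NONE through START,
 * SENT and BULK to DONE, at which point the carp demotion taken when
 * the request started is dropped again.
 */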
123enum pfsync_bulk_req_state {
124	PFSYNC_BREQ_S_NONE,
125	PFSYNC_BREQ_S_START,
126	PFSYNC_BREQ_S_SENT,
127	PFSYNC_BREQ_S_BULK,
128	PFSYNC_BREQ_S_DONE,
129};
130
131static const char *pfsync_bulk_req_state_names[] = {
132	[PFSYNC_BREQ_S_NONE]		= "none",
133	[PFSYNC_BREQ_S_START]		= "start",
134	[PFSYNC_BREQ_S_SENT]		= "sent",
135	[PFSYNC_BREQ_S_BULK]		= "bulk",
136	[PFSYNC_BREQ_S_DONE]		= "done",
137};
138
139enum pfsync_bulk_req_event {
140	PFSYNC_BREQ_EVT_UP,
141	PFSYNC_BREQ_EVT_DOWN,
142	PFSYNC_BREQ_EVT_TMO,
143	PFSYNC_BREQ_EVT_LINK,
144	PFSYNC_BREQ_EVT_BUS_START,
145	PFSYNC_BREQ_EVT_BUS_END,
146};
147
148static const char *pfsync_bulk_req_event_names[] = {
149	[PFSYNC_BREQ_EVT_UP]		= "up",
150	[PFSYNC_BREQ_EVT_DOWN]		= "down",
151	[PFSYNC_BREQ_EVT_TMO]		= "timeout",
152	[PFSYNC_BREQ_EVT_LINK]		= "link",
153	[PFSYNC_BREQ_EVT_BUS_START]	= "bus-start",
154	[PFSYNC_BREQ_EVT_BUS_END]	= "bus-end",
155};
156
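/*
 * pfsync work is spread over slices to reduce contention. each slice
 * has its own mutex, queues of states and tdbs waiting to be sent,
 * a timeout and tasks on a softnet taskq, and counters exported via
 * kstat. states are mapped to slices by hashing their state key.
 */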
157struct pfsync_slice {
158	struct pfsync_softc	*s_pfsync;
159	struct mutex		 s_mtx;
160
161	struct pf_state_queue	 s_qs[PFSYNC_S_COUNT];
162	TAILQ_HEAD(, tdb)	 s_tdb_q;
163	size_t			 s_len;
164	struct mbuf_list	 s_ml;
165
166	struct taskq		*s_softnet;
167	struct task		 s_task;
168	struct timeout		 s_tmo;
169
170	struct mbuf_queue	 s_sendq;
171	struct task		 s_send;
172
173	struct pfsync_deferrals	 s_deferrals;
174	unsigned int		 s_deferred;
175	struct task		 s_deferrals_task;
176	struct timeout		 s_deferrals_tmo;
177
178	uint64_t		 s_stat_locks;
179	uint64_t		 s_stat_contended;
180	uint64_t		 s_stat_write_nop;
181	uint64_t		 s_stat_task_add;
182	uint64_t		 s_stat_task_run;
183	uint64_t		 s_stat_enqueue;
184	uint64_t		 s_stat_dequeue;
185
186	uint64_t		 s_stat_defer_add;
187	uint64_t		 s_stat_defer_ack;
188	uint64_t		 s_stat_defer_run;
189	uint64_t		 s_stat_defer_overlimit;
190
191	struct kstat		*s_kstat;
192} __aligned(CACHELINESIZE);
193
194#define PFSYNC_SLICE_BITS	 1
195#define PFSYNC_NSLICES		 (1 << PFSYNC_SLICE_BITS)
196
197struct pfsync_softc {
198	struct ifnet		 sc_if;
199	unsigned int		 sc_dead;
200	unsigned int		 sc_up;
201	struct refcnt		 sc_refs;
202
203	/* config */
204	struct in_addr		 sc_syncpeer;
205	unsigned int		 sc_maxupdates;
206	unsigned int		 sc_defer;
207
208	/* operation */
209	unsigned int		 sc_sync_ifidx;
210	unsigned int		 sc_sync_if_down;
211	void			*sc_inm;
212	struct task		 sc_ltask;
213	struct task		 sc_dtask;
214	struct ip		 sc_template;
215
216	struct pfsync_slice	 sc_slices[PFSYNC_NSLICES];
217
218	struct {
219		struct rwlock			 req_lock;
220		struct timeout			 req_tmo;
221		enum pfsync_bulk_req_state	 req_state;
222		unsigned int			 req_tries;
223		unsigned int			 req_demoted;
224	}			 sc_bulk_req;
225
226	struct {
227		struct rwlock			 snd_lock;
228		struct timeout			 snd_tmo;
229		time_t				 snd_requested;
230
231		struct pf_state			*snd_next;
232		struct pf_state			*snd_tail;
233		unsigned int			 snd_again;
234	}			 sc_bulk_snd;
235};
236
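/* the running pfsync instance, if any. read under SMR by the pf hooks. */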
237static struct pfsync_softc	*pfsyncif = NULL;
238static struct cpumem		*pfsynccounters;
239
240static inline void
241pfsyncstat_inc(enum pfsync_counters c)
242{
243	counters_inc(pfsynccounters, c);
244}
245
246static int	pfsync_clone_create(struct if_clone *, int);
247static int	pfsync_clone_destroy(struct ifnet *);
248
249static int	pfsync_output(struct ifnet *, struct mbuf *, struct sockaddr *,
250		    struct rtentry *);
251static void	pfsync_start(struct ifqueue *);
252
253static int	pfsync_ioctl(struct ifnet *, u_long, caddr_t);
254static int	pfsync_up(struct pfsync_softc *);
255static int	pfsync_down(struct pfsync_softc *);
256
257static int	pfsync_set_mtu(struct pfsync_softc *, unsigned int);
258static int	pfsync_set_parent(struct pfsync_softc *,
259		    const struct if_parent *);
260static int	pfsync_get_parent(struct pfsync_softc *, struct if_parent *);
261static int	pfsync_del_parent(struct pfsync_softc *);
262
263static int	pfsync_get_ioc(struct pfsync_softc *, struct ifreq *);
264static int	pfsync_set_ioc(struct pfsync_softc *, struct ifreq *);
265
266static void	pfsync_syncif_link(void *);
267static void	pfsync_syncif_detach(void *);
268
269static void	pfsync_sendout(struct pfsync_softc *, struct mbuf *);
270static void	pfsync_slice_drop(struct pfsync_softc *, struct pfsync_slice *);
271
272static void	pfsync_slice_tmo(void *);
273static void	pfsync_slice_task(void *);
274static void	pfsync_slice_sendq(void *);
275
276static void	pfsync_deferrals_tmo(void *);
277static void	pfsync_deferrals_task(void *);
278static void	pfsync_defer_output(struct pfsync_deferral *);
279
280static void	pfsync_bulk_req_evt(struct pfsync_softc *,
281		    enum pfsync_bulk_req_event);
282static void	pfsync_bulk_req_tmo(void *);
283
284static void	pfsync_bulk_snd_tmo(void *);
285
286#if NKSTAT > 0
287struct pfsync_kstat_data {
288	struct kstat_kv pd_locks;
289	struct kstat_kv pd_contended;
290	struct kstat_kv pd_write_nop;
291	struct kstat_kv pd_task_add;
292	struct kstat_kv pd_task_run;
293	struct kstat_kv pd_enqueue;
294	struct kstat_kv pd_dequeue;
295	struct kstat_kv pd_qdrop;
296
297	struct kstat_kv pd_defer_len;
298	struct kstat_kv pd_defer_add;
299	struct kstat_kv pd_defer_ack;
300	struct kstat_kv pd_defer_run;
301	struct kstat_kv pd_defer_overlimit;
302};
303
304static const struct pfsync_kstat_data pfsync_kstat_tpl = {
305	KSTAT_KV_INITIALIZER("locks",		KSTAT_KV_T_COUNTER64),
306	KSTAT_KV_INITIALIZER("contended",	KSTAT_KV_T_COUNTER64),
307	KSTAT_KV_INITIALIZER("write-nops",	KSTAT_KV_T_COUNTER64),
308	KSTAT_KV_INITIALIZER("send-sched",	KSTAT_KV_T_COUNTER64),
309	KSTAT_KV_INITIALIZER("send-run",	KSTAT_KV_T_COUNTER64),
310	KSTAT_KV_INITIALIZER("enqueues",	KSTAT_KV_T_COUNTER64),
311	KSTAT_KV_INITIALIZER("dequeues",	KSTAT_KV_T_COUNTER64),
312	KSTAT_KV_UNIT_INITIALIZER("qdrops",
313	    KSTAT_KV_T_COUNTER32, KSTAT_KV_U_PACKETS),
314
315	KSTAT_KV_UNIT_INITIALIZER("defer-len",
316	    KSTAT_KV_T_COUNTER32, KSTAT_KV_U_PACKETS),
317	KSTAT_KV_INITIALIZER("defer-add",	KSTAT_KV_T_COUNTER64),
318	KSTAT_KV_INITIALIZER("defer-ack",	KSTAT_KV_T_COUNTER64),
319	KSTAT_KV_INITIALIZER("defer-run",	KSTAT_KV_T_COUNTER64),
320	KSTAT_KV_INITIALIZER("defer-over",	KSTAT_KV_T_COUNTER64),
321};
322
323static int
324pfsync_kstat_copy(struct kstat *ks, void *dst)
325{
326	struct pfsync_slice *s = ks->ks_softc;
327	struct pfsync_kstat_data *pd = dst;
328
329	*pd = pfsync_kstat_tpl;
330	kstat_kv_u64(&pd->pd_locks) = s->s_stat_locks;
331	kstat_kv_u64(&pd->pd_contended) = s->s_stat_contended;
332	kstat_kv_u64(&pd->pd_write_nop) = s->s_stat_write_nop;
333	kstat_kv_u64(&pd->pd_task_add) = s->s_stat_task_add;
334	kstat_kv_u64(&pd->pd_task_run) = s->s_stat_task_run;
335	kstat_kv_u64(&pd->pd_enqueue) = s->s_stat_enqueue;
336	kstat_kv_u64(&pd->pd_dequeue) = s->s_stat_dequeue;
337	kstat_kv_u32(&pd->pd_qdrop) = mq_drops(&s->s_sendq);
338
339	kstat_kv_u32(&pd->pd_defer_len) = s->s_deferred;
340	kstat_kv_u64(&pd->pd_defer_add) = s->s_stat_defer_add;
341	kstat_kv_u64(&pd->pd_defer_ack) = s->s_stat_defer_ack;
342	kstat_kv_u64(&pd->pd_defer_run) = s->s_stat_defer_run;
343	kstat_kv_u64(&pd->pd_defer_overlimit) = s->s_stat_defer_overlimit;
344
345	return (0);
346}
347#endif /* NKSTAT > 0 */
348
349#define PFSYNC_MAX_BULKTRIES	12
350
351struct if_clone	pfsync_cloner =
352    IF_CLONE_INITIALIZER("pfsync", pfsync_clone_create, pfsync_clone_destroy);
353
354void
355pfsyncattach(int npfsync)
356{
357	pfsynccounters = counters_alloc(pfsyncs_ncounters);
358	if_clone_attach(&pfsync_cloner);
359}
360
361static int
362pfsync_clone_create(struct if_clone *ifc, int unit)
363{
364	struct pfsync_softc *sc;
365	struct ifnet *ifp;
366	size_t i, q;
367
368	if (unit != 0)
369		return (ENXIO);
370
371	if (pfsync_deferrals_pool.pr_size == 0) {
372		pool_init(&pfsync_deferrals_pool,
373		    sizeof(struct pfsync_deferral), 0,
374		    IPL_MPFLOOR, 0, "pfdefer", NULL);
375		/* pool_cache_init(&pfsync_deferrals_pool); */
376	}
377
378	sc = malloc(sizeof(*sc), M_DEVBUF, M_WAITOK|M_ZERO|M_CANFAIL);
379	if (sc == NULL)
380		return (ENOMEM);
381
382	/* sc_refs is "owned" by IFF_RUNNING */
383
384	sc->sc_syncpeer.s_addr = INADDR_PFSYNC_GROUP;
385	sc->sc_maxupdates = 128;
386	sc->sc_defer = 0;
387
388	task_set(&sc->sc_ltask, pfsync_syncif_link, sc);
389	task_set(&sc->sc_dtask, pfsync_syncif_detach, sc);
390
391	rw_init(&sc->sc_bulk_req.req_lock, "pfsyncbreq");
392	/* need process context to take net lock to call ip_output */
393	timeout_set_proc(&sc->sc_bulk_req.req_tmo, pfsync_bulk_req_tmo, sc);
394
395	rw_init(&sc->sc_bulk_snd.snd_lock, "pfsyncbsnd");
396	/* need process context to take net lock to call ip_output */
397	timeout_set_proc(&sc->sc_bulk_snd.snd_tmo, pfsync_bulk_snd_tmo, sc);
398
399	ifp = &sc->sc_if;
400	snprintf(ifp->if_xname, sizeof ifp->if_xname, "%s%d",
401	    ifc->ifc_name, unit);
402	ifp->if_softc = sc;
403	ifp->if_ioctl = pfsync_ioctl;
404	ifp->if_output = pfsync_output;
405	ifp->if_qstart = pfsync_start;
406	ifp->if_type = IFT_PFSYNC;
407	ifp->if_hdrlen = sizeof(struct pfsync_header);
408	ifp->if_mtu = ETHERMTU;
409	ifp->if_xflags = IFXF_CLONED | IFXF_MPSAFE;
410
411	for (i = 0; i < nitems(sc->sc_slices); i++) {
412		struct pfsync_slice *s = &sc->sc_slices[i];
413
414		s->s_pfsync = sc;
415
416		mtx_init_flags(&s->s_mtx, IPL_SOFTNET, "pfslice", 0);
417		s->s_softnet = net_tq(i);
418		timeout_set(&s->s_tmo, pfsync_slice_tmo, s);
419		task_set(&s->s_task, pfsync_slice_task, s);
420
421		mq_init(&s->s_sendq, 16, IPL_SOFTNET);
422		task_set(&s->s_send, pfsync_slice_sendq, s);
423
424		s->s_len = PFSYNC_MINPKT;
425		ml_init(&s->s_ml);
426
427		for (q = 0; q < nitems(s->s_qs); q++)
428			TAILQ_INIT(&s->s_qs[q]);
429		TAILQ_INIT(&s->s_tdb_q);
430
431		/* stupid NET_LOCK */
432		timeout_set(&s->s_deferrals_tmo, pfsync_deferrals_tmo, s);
433		task_set(&s->s_deferrals_task, pfsync_deferrals_task, s);
434		TAILQ_INIT(&s->s_deferrals);
435
436#if NKSTAT > 0
437		s->s_kstat = kstat_create(ifp->if_xname, 0, "pfsync-slice", i,
438		    KSTAT_T_KV, 0);
439
440		kstat_set_mutex(s->s_kstat, &s->s_mtx);
441		s->s_kstat->ks_softc = s;
442		s->s_kstat->ks_datalen = sizeof(pfsync_kstat_tpl);
443		s->s_kstat->ks_copy = pfsync_kstat_copy;
444		kstat_install(s->s_kstat);
445#endif
446	}
447
448	if_counters_alloc(ifp);
449	if_attach(ifp);
450	if_alloc_sadl(ifp);
451
452#if NCARP > 0
453	if_addgroup(ifp, "carp");
454#endif
455
456#if NBPFILTER > 0
457	bpfattach(&sc->sc_if.if_bpf, ifp, DLT_PFSYNC, PFSYNC_HDRLEN);
458#endif
459
460	return (0);
461}
462
463static int
464pfsync_clone_destroy(struct ifnet *ifp)
465{
466	struct pfsync_softc *sc = ifp->if_softc;
467#if NKSTAT > 0
468	size_t i;
469#endif
470
471	NET_LOCK();
472	sc->sc_dead = 1;
473
474	if (ISSET(ifp->if_flags, IFF_RUNNING))
475		pfsync_down(sc);
476	NET_UNLOCK();
477
478	if_detach(ifp);
479
480#if NKSTAT > 0
481	for (i = 0; i < nitems(sc->sc_slices); i++) {
482		struct pfsync_slice *s = &sc->sc_slices[i];
483
484		kstat_destroy(s->s_kstat);
485	}
486#endif
487
488	free(sc, M_DEVBUF, sizeof(*sc));
489
490	return (0);
491}
492
493static void
494pfsync_dprintf(struct pfsync_softc *sc, const char *fmt, ...)
495{
496	struct ifnet *ifp = &sc->sc_if;
497	va_list ap;
498
499	if (!ISSET(ifp->if_flags, IFF_DEBUG))
500		return;
501
502	printf("%s: ", ifp->if_xname);
503	va_start(ap, fmt);
504	vprintf(fmt, ap);
505	va_end(ap);
506	printf("\n");
507}
508
509static void
510pfsync_syncif_link(void *arg)
511{
512	struct pfsync_softc *sc = arg;
513	struct ifnet *ifp0;
514	unsigned int sync_if_down = 1;
515
516	ifp0 = if_get(sc->sc_sync_ifidx);
517	if (ifp0 != NULL && LINK_STATE_IS_UP(ifp0->if_link_state)) {
518		pfsync_bulk_req_evt(sc, PFSYNC_BREQ_EVT_LINK);
519		sync_if_down = 0;
520	}
521	if_put(ifp0);
522
523#if NCARP > 0
524	if (sc->sc_sync_if_down != sync_if_down) {
525		carp_group_demote_adj(&sc->sc_if,
526		    sync_if_down ? 1 : -1, "pfsync link");
527	}
528#endif
529
530	sc->sc_sync_if_down = sync_if_down;
531}
532
533static void
534pfsync_syncif_detach(void *arg)
535{
536	struct pfsync_softc *sc = arg;
537	struct ifnet *ifp = &sc->sc_if;
538
539	if (ISSET(ifp->if_flags, IFF_RUNNING)) {
540		pfsync_down(sc);
541		if_down(ifp);
542	}
543
544	sc->sc_sync_ifidx = 0;
545}
546
547static int
548pfsync_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst,
549    struct rtentry *rt)
550{
551	m_freem(m);	/* drop packet */
552	return (EAFNOSUPPORT);
553}
554
555static int
556pfsync_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
557{
558	struct pfsync_softc *sc = ifp->if_softc;
559	struct ifreq *ifr = (struct ifreq *)data;
560	int error = ENOTTY;
561
562	switch (cmd) {
563	case SIOCSIFADDR:
564		error = EOPNOTSUPP;
565		break;
566
567	case SIOCSIFFLAGS:
568		if (ISSET(ifp->if_flags, IFF_UP)) {
569			if (!ISSET(ifp->if_flags, IFF_RUNNING))
570				error = pfsync_up(sc);
571			else
572				error = ENETRESET;
573		} else {
574			if (ISSET(ifp->if_flags, IFF_RUNNING))
575				error = pfsync_down(sc);
576		}
577		break;
578
579	case SIOCSIFMTU:
580		error = pfsync_set_mtu(sc, ifr->ifr_mtu);
581		break;
582
583	case SIOCSIFPARENT:
584		error = pfsync_set_parent(sc, (struct if_parent *)data);
585		break;
586	case SIOCGIFPARENT:
587		error = pfsync_get_parent(sc, (struct if_parent *)data);
588		break;
589	case SIOCDIFPARENT:
590		error = pfsync_del_parent(sc);
591		break;
592
593	case SIOCSETPFSYNC:
594		error = pfsync_set_ioc(sc, ifr);
595		break;
596	case SIOCGETPFSYNC:
597		error = pfsync_get_ioc(sc, ifr);
598		break;
599
600	default:
601		break;
602	}
603
604	if (error == ENETRESET)
605		error = 0;
606
607	return (error);
608}
609
610static int
611pfsync_set_mtu(struct pfsync_softc *sc, unsigned int mtu)
612{
613	struct ifnet *ifp = &sc->sc_if;
614	struct ifnet *ifp0;
615	int error = 0;
616
617	ifp0 = if_get(sc->sc_sync_ifidx);
618	if (ifp0 == NULL)
619		return (EINVAL);
620
621	if (mtu <= PFSYNC_MINPKT || mtu > ifp0->if_mtu) {
622		error = EINVAL;
623		goto put;
624	}
625
626	/* commit */
627	ifp->if_mtu = mtu;
628
629put:
630	if_put(ifp0);
631	return (error);
632}
633
634static int
635pfsync_set_parent(struct pfsync_softc *sc, const struct if_parent *p)
636{
637	struct ifnet *ifp = &sc->sc_if;
638	struct ifnet *ifp0;
639	int error = 0;
640
641	ifp0 = if_unit(p->ifp_parent);
642	if (ifp0 == NULL)
643		return (ENXIO);
644
645	if (ifp0->if_index == sc->sc_sync_ifidx)
646		goto put;
647
648	if (ISSET(ifp->if_flags, IFF_RUNNING)) {
649		error = EBUSY;
650		goto put;
651	}
652
653	/* commit */
654	sc->sc_sync_ifidx = ifp0->if_index;
655
656put:
657	if_put(ifp0);
658	return (error);
659}
660
661static int
662pfsync_get_parent(struct pfsync_softc *sc, struct if_parent *p)
663{
664	struct ifnet *ifp0;
665	int error = 0;
666
667	ifp0 = if_get(sc->sc_sync_ifidx);
668	if (ifp0 == NULL)
669		error = EADDRNOTAVAIL;
670	else
671		strlcpy(p->ifp_parent, ifp0->if_xname, sizeof(p->ifp_parent));
672	if_put(ifp0);
673
674	return (error);
675}
676
677static int
678pfsync_del_parent(struct pfsync_softc *sc)
679{
680	struct ifnet *ifp = &sc->sc_if;
681
682	if (ISSET(ifp->if_flags, IFF_RUNNING))
683		return (EBUSY);
684
685	/* commit */
686	sc->sc_sync_ifidx = 0;
687
688	return (0);
689}
690
691static int
692pfsync_get_ioc(struct pfsync_softc *sc, struct ifreq *ifr)
693{
694	struct pfsyncreq pfsyncr;
695	struct ifnet *ifp0;
696
697	memset(&pfsyncr, 0, sizeof(pfsyncr));
698
699	ifp0 = if_get(sc->sc_sync_ifidx);
700	if (ifp0 != NULL) {
701		strlcpy(pfsyncr.pfsyncr_syncdev, ifp0->if_xname,
702		    sizeof(pfsyncr.pfsyncr_syncdev));
703	}
704	if_put(ifp0);
705
706	pfsyncr.pfsyncr_syncpeer = sc->sc_syncpeer;
707	pfsyncr.pfsyncr_maxupdates = sc->sc_maxupdates;
708	pfsyncr.pfsyncr_defer = sc->sc_defer;
709
710	return (copyout(&pfsyncr, ifr->ifr_data, sizeof(pfsyncr)));
711}
712
713static int
714pfsync_set_ioc(struct pfsync_softc *sc, struct ifreq *ifr)
715{
716	struct ifnet *ifp = &sc->sc_if;
717	struct pfsyncreq pfsyncr;
718	unsigned int sync_ifidx = sc->sc_sync_ifidx;
719	int wantdown = 0;
720	int error;
721
722	error = suser(curproc);
723	if (error != 0)
724		return (error);
725
726	error = copyin(ifr->ifr_data, &pfsyncr, sizeof(pfsyncr));
727	if (error != 0)
728		return (error);
729
730	if (pfsyncr.pfsyncr_maxupdates > 255)
731		return (EINVAL);
732
733	if (pfsyncr.pfsyncr_syncdev[0] != '\0') { /* set */
734		struct ifnet *ifp0 = if_unit(pfsyncr.pfsyncr_syncdev);
735		if (ifp0 == NULL)
736			return (ENXIO);
737
738		if (ifp0->if_index != sync_ifidx)
739			wantdown = 1;
740
741		sync_ifidx = ifp0->if_index;
742		if_put(ifp0);
743	} else { /* del */
744		wantdown = 1;
745		sync_ifidx = 0;
746	}
747
748	if (pfsyncr.pfsyncr_syncpeer.s_addr == INADDR_ANY)
749		pfsyncr.pfsyncr_syncpeer.s_addr = INADDR_PFSYNC_GROUP;
750	if (pfsyncr.pfsyncr_syncpeer.s_addr != sc->sc_syncpeer.s_addr)
751		wantdown = 1;
752
753	if (wantdown && ISSET(ifp->if_flags, IFF_RUNNING))
754		return (EBUSY);
755
756	/* commit */
757	sc->sc_sync_ifidx = sync_ifidx;
758	sc->sc_syncpeer = pfsyncr.pfsyncr_syncpeer;
759	sc->sc_maxupdates = pfsyncr.pfsyncr_maxupdates;
760	sc->sc_defer = pfsyncr.pfsyncr_defer;
761
762	return (0);
763}
764
765static int
766pfsync_up(struct pfsync_softc *sc)
767{
768	struct ifnet *ifp = &sc->sc_if;
769	struct ifnet *ifp0;
770	void *inm = NULL;
771	int error = 0;
772	struct ip *ip;
773
774	NET_ASSERT_LOCKED();
775	KASSERT(!ISSET(ifp->if_flags, IFF_RUNNING));
776
777	if (sc->sc_dead)
778		return (ENXIO);
779
780	/*
781	 * coordinate with pfsync_down(). if sc_up is still up and
782	 * we're here then something else is tearing pfsync down.
783	 */
784	if (sc->sc_up)
785		return (EBUSY);
786
787	if (sc->sc_syncpeer.s_addr == INADDR_ANY ||
788	    sc->sc_syncpeer.s_addr == INADDR_BROADCAST)
789		return (EDESTADDRREQ);
790
791	ifp0 = if_get(sc->sc_sync_ifidx);
792	if (ifp0 == NULL)
793		return (ENXIO);
794
795	if (IN_MULTICAST(sc->sc_syncpeer.s_addr)) {
796		if (!ISSET(ifp0->if_flags, IFF_MULTICAST)) {
797			error = ENODEV;
798			goto put;
799		}
800		inm = in_addmulti(&sc->sc_syncpeer, ifp0);
801		if (inm == NULL) {
802			error = ECONNABORTED;
803			goto put;
804		}
805	}
806
807	sc->sc_up = 1;
808
809	ip = &sc->sc_template;
810	memset(ip, 0, sizeof(*ip));
811	ip->ip_v = IPVERSION;
812	ip->ip_hl = sizeof(*ip) >> 2;
813	ip->ip_tos = IPTOS_LOWDELAY;
814	/* len and id are set later */
815	ip->ip_off = htons(IP_DF);
816	ip->ip_ttl = PFSYNC_DFLTTL;
817	ip->ip_p = IPPROTO_PFSYNC;
818	ip->ip_src.s_addr = INADDR_ANY;
819	ip->ip_dst.s_addr = sc->sc_syncpeer.s_addr;
820
821	/* commit */
822	refcnt_init(&sc->sc_refs); /* IFF_RUNNING kind of owns this */
823
824#if NCARP > 0
825	sc->sc_sync_if_down = 1;
826	carp_group_demote_adj(&sc->sc_if, 1, "pfsync up");
827#endif
828
829	if_linkstatehook_add(ifp0, &sc->sc_ltask);
830	if_detachhook_add(ifp0, &sc->sc_dtask);
831
832	sc->sc_inm = inm;
833	SET(ifp->if_flags, IFF_RUNNING);
834
835	pfsync_bulk_req_evt(sc, PFSYNC_BREQ_EVT_UP);
836
837	refcnt_take(&sc->sc_refs); /* give one to SMR */
838	SMR_PTR_SET_LOCKED(&pfsyncif, sc);
839
840	pfsync_syncif_link(sc); /* try and push the bulk req state forward */
841
842put:
843	if_put(ifp0);
844	return (error);
845}
846
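/*
 * prepend the IPv4 and pfsync headers to a chunk of queued subheaders
 * and records. the addresses, ttl and protocol come from the template
 * set up in pfsync_up(); the lengths and ip id are filled in here.
 */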
847static struct mbuf *
848pfsync_encap(struct pfsync_softc *sc, struct mbuf *m)
849{
850	struct {
851		struct ip		ip;
852		struct pfsync_header	ph;
853	} __packed __aligned(4) *h;
854	unsigned int mlen = m->m_pkthdr.len;
855
856	m = m_prepend(m, sizeof(*h), M_DONTWAIT);
857	if (m == NULL)
858		return (NULL);
859
860	h = mtod(m, void *);
861	memset(h, 0, sizeof(*h));
862
863	mlen += sizeof(h->ph);
864	h->ph.version = PFSYNC_VERSION;
865	h->ph.len = htons(mlen);
866	/* h->ph.pfcksum */
867
868	mlen += sizeof(h->ip);
869	h->ip = sc->sc_template;
870	h->ip.ip_len = htons(mlen);
871	h->ip.ip_id = htons(ip_randomid());
872
873	return (m);
874}
875
876static void
877pfsync_bulk_req_send(struct pfsync_softc *sc)
878{
879	struct {
880		struct pfsync_subheader	subh;
881		struct pfsync_upd_req	ur;
882	} __packed __aligned(4) *h;
883	unsigned mlen = max_linkhdr +
884	    sizeof(struct ip) + sizeof(struct pfsync_header) + sizeof(*h);
885	struct mbuf *m;
886
887	m = m_gethdr(M_DONTWAIT, MT_DATA);
888	if (m == NULL)
889		goto fail;
890
891	if (mlen > MHLEN) {
892		MCLGETL(m, M_DONTWAIT, mlen);
893		if (!ISSET(m->m_flags, M_EXT))
894			goto drop;
895	}
896
897	m_align(m, sizeof(*h));
898	m->m_len = m->m_pkthdr.len = sizeof(*h);
899
900	h = mtod(m, void *);
901	memset(h, 0, sizeof(*h));
902
903	h->subh.action = PFSYNC_ACT_UPD_REQ;
904	h->subh.len = sizeof(h->ur) >> 2;
905	h->subh.count = htons(1);
906
907	h->ur.id = htobe64(0);
908	h->ur.creatorid = htobe32(0);
909
910	m = pfsync_encap(sc, m);
911	if (m == NULL)
912		goto fail;
913
914	pfsync_sendout(sc, m);
915	return;
916
917drop:
918	m_freem(m);
919fail:
920	printf("%s: unable to request bulk update\n", sc->sc_if.if_xname);
921}
922
923static void
924pfsync_bulk_req_nstate(struct pfsync_softc *sc,
925    enum pfsync_bulk_req_state nstate, int seconds)
926{
927	sc->sc_bulk_req.req_state = nstate;
928	if (seconds > 0)
929		timeout_add_sec(&sc->sc_bulk_req.req_tmo, seconds);
930	else
931		timeout_del(&sc->sc_bulk_req.req_tmo);
932}
933
934static void
935pfsync_bulk_req_invstate(struct pfsync_softc *sc,
936    enum pfsync_bulk_req_event evt)
937{
938	panic("%s: unexpected event %s in state %s", sc->sc_if.if_xname,
939	    pfsync_bulk_req_event_names[evt],
940	    pfsync_bulk_req_state_names[sc->sc_bulk_req.req_state]);
941}
942
943static void
944pfsync_bulk_req_nstate_bulk(struct pfsync_softc *sc)
945{
946	/* calculate the number of packets we expect */
947	int t = pf_pool_limits[PF_LIMIT_STATES].limit /
948	    ((sc->sc_if.if_mtu - PFSYNC_MINPKT) /
949	     sizeof(struct pfsync_state));
950
951	/* turn it into seconds */
952	t /= 1000 / PFSYNC_BULK_SND_IVAL_MS;
953
954	if (t == 0)
955		t = 1;
956
957	pfsync_bulk_req_nstate(sc, PFSYNC_BREQ_S_BULK, t * 4);
958}
959
960static inline void
961pfsync_bulk_req_nstate_done(struct pfsync_softc *sc)
962{
963	pfsync_bulk_req_nstate(sc, PFSYNC_BREQ_S_DONE, 0);
964
965	KASSERT(sc->sc_bulk_req.req_demoted == 1);
966	sc->sc_bulk_req.req_demoted = 0;
967
968#if NCARP > 0
969	carp_group_demote_adj(&sc->sc_if, -32, "pfsync done");
970#endif
971}
972
973static void
974pfsync_bulk_req_evt(struct pfsync_softc *sc, enum pfsync_bulk_req_event evt)
975{
976	struct ifnet *ifp = &sc->sc_if;
977
978	rw_enter_write(&sc->sc_bulk_req.req_lock);
979	pfsync_dprintf(sc, "%s state %s evt %s", __func__,
980	    pfsync_bulk_req_state_names[sc->sc_bulk_req.req_state],
981	    pfsync_bulk_req_event_names[evt]);
982
983	if (evt == PFSYNC_BREQ_EVT_DOWN) {
984		/* unconditionally move down */
985		sc->sc_bulk_req.req_tries = 0;
986		pfsync_bulk_req_nstate(sc, PFSYNC_BREQ_S_NONE, 0);
987
988		if (sc->sc_bulk_req.req_demoted) {
989			sc->sc_bulk_req.req_demoted = 0;
990#if NCARP > 0
991			carp_group_demote_adj(&sc->sc_if, -32,
992			    "pfsync down");
993#endif
994		}
995	} else switch (sc->sc_bulk_req.req_state) {
996	case PFSYNC_BREQ_S_NONE:
997		switch (evt) {
998		case PFSYNC_BREQ_EVT_UP:
999			KASSERT(sc->sc_bulk_req.req_demoted == 0);
1000			sc->sc_bulk_req.req_demoted = 1;
1001#if NCARP > 0
1002			carp_group_demote_adj(&sc->sc_if, 32,
1003			    "pfsync start");
1004#endif
1005			pfsync_bulk_req_nstate(sc, PFSYNC_BREQ_S_START, 30);
1006			break;
1007		default:
1008			pfsync_bulk_req_invstate(sc, evt);
1009		}
1010
1011		break;
1012
1013	case PFSYNC_BREQ_S_START:
1014		switch (evt) {
1015		case PFSYNC_BREQ_EVT_LINK:
1016			pfsync_bulk_req_send(sc);
1017			pfsync_bulk_req_nstate(sc, PFSYNC_BREQ_S_SENT, 2);
1018			break;
1019		case PFSYNC_BREQ_EVT_TMO:
1020			pfsync_dprintf(sc, "timeout waiting for link");
1021			pfsync_bulk_req_nstate_done(sc);
1022			break;
1023		case PFSYNC_BREQ_EVT_BUS_START:
1024			pfsync_bulk_req_nstate_bulk(sc);
1025			break;
1026		case PFSYNC_BREQ_EVT_BUS_END:
1027			/* ignore this */
1028			break;
1029		default:
1030			pfsync_bulk_req_invstate(sc, evt);
1031		}
1032		break;
1033
1034	case PFSYNC_BREQ_S_SENT:
1035		switch (evt) {
1036		case PFSYNC_BREQ_EVT_BUS_START:
1037			pfsync_bulk_req_nstate_bulk(sc);
1038			break;
1039		case PFSYNC_BREQ_EVT_BUS_END:
1040		case PFSYNC_BREQ_EVT_LINK:
1041			/* ignore this */
1042			break;
1043		case PFSYNC_BREQ_EVT_TMO:
1044			if (++sc->sc_bulk_req.req_tries <
1045			    PFSYNC_MAX_BULKTRIES) {
1046				pfsync_bulk_req_send(sc);
1047				pfsync_bulk_req_nstate(sc,
1048				    PFSYNC_BREQ_S_SENT, 2);
1049				break;
1050			}
1051
1052			pfsync_dprintf(sc,
1053			    "timeout waiting for bulk transfer start");
1054			pfsync_bulk_req_nstate_done(sc);
1055			break;
1056		default:
1057			pfsync_bulk_req_invstate(sc, evt);
1058		}
1059		break;
1060
1061	case PFSYNC_BREQ_S_BULK:
1062		switch (evt) {
1063		case PFSYNC_BREQ_EVT_BUS_START:
1064		case PFSYNC_BREQ_EVT_LINK:
1065			/* ignore this */
1066			break;
1067		case PFSYNC_BREQ_EVT_BUS_END:
1068			pfsync_bulk_req_nstate_done(sc);
1069			break;
1070		case PFSYNC_BREQ_EVT_TMO:
1071			if (++sc->sc_bulk_req.req_tries <
1072			    PFSYNC_MAX_BULKTRIES) {
1073				pfsync_bulk_req_send(sc);
				pfsync_bulk_req_nstate(sc,
				    PFSYNC_BREQ_S_SENT, 2);
				break;
			}
1077
1078			pfsync_dprintf(sc,
1079			    "timeout waiting for bulk transfer end");
1080			pfsync_bulk_req_nstate_done(sc);
1081			break;
1082		default:
1083			pfsync_bulk_req_invstate(sc, evt);
1084		}
1085		break;
1086
1087	case PFSYNC_BREQ_S_DONE: /* pfsync is up and running */
1088		switch (evt) {
1089		case PFSYNC_BREQ_EVT_BUS_START:
1090		case PFSYNC_BREQ_EVT_BUS_END:
1091		case PFSYNC_BREQ_EVT_LINK:
1092			/* nops */
1093			break;
1094		default:
1095			pfsync_bulk_req_invstate(sc, evt);
1096		}
1097		break;
1098
1099	default:
1100		panic("%s: unknown event %d", ifp->if_xname, evt);
1101		/* NOTREACHED */
1102	}
1103	rw_exit_write(&sc->sc_bulk_req.req_lock);
1104}
1105
1106static void
1107pfsync_bulk_req_tmo(void *arg)
1108{
1109	struct pfsync_softc *sc = arg;
1110
1111	NET_LOCK();
1112	pfsync_bulk_req_evt(sc, PFSYNC_BREQ_EVT_TMO);
1113	NET_UNLOCK();
1114}
1115
1116static int
1117pfsync_down(struct pfsync_softc *sc)
1118{
1119	struct ifnet *ifp = &sc->sc_if;
1120	struct ifnet *ifp0;
1121	struct smr_entry smr;
1122	size_t i;
1123	void *inm = NULL;
1124	unsigned int sndbar = 0;
1125	struct pfsync_deferrals pds = TAILQ_HEAD_INITIALIZER(pds);
1126	struct pfsync_deferral *pd;
1127
1128	NET_ASSERT_LOCKED();
1129	KASSERT(ISSET(ifp->if_flags, IFF_RUNNING));
1130
1131	/*
1132	 * tearing down pfsync involves waiting for pfsync to stop
1133	 * running in various contexts including softnet taskqs.
1134	 * this thread cannot hold netlock while waiting for a
1135	 * barrier in softnet because softnet might be waiting for
1136	 * the netlock. sc->sc_up is used to coordinate with
1137	 * pfsync_up.
1138	 */
1139
1140	CLR(ifp->if_flags, IFF_RUNNING);
1141
1142	ifp0 = if_get(sc->sc_sync_ifidx);
1143	if (ifp0 != NULL) {
1144		if_linkstatehook_del(ifp0, &sc->sc_ltask);
1145		if_detachhook_del(ifp0, &sc->sc_dtask);
1146	}
1147	if_put(ifp0);
1148
1149#if NCARP > 0
1150	if (sc->sc_sync_if_down)
1151		carp_group_demote_adj(&sc->sc_if, -1, "pfsync down");
1152#endif
1153
1154	NET_UNLOCK();
1155
1156	KASSERTMSG(SMR_PTR_GET_LOCKED(&pfsyncif) == sc,
1157	   "pfsyncif %p != sc %p", pfsyncif, sc);
1158	SMR_PTR_SET_LOCKED(&pfsyncif, NULL);
1159	smr_init(&smr);
1160	smr_call(&smr, (void (*)(void *))refcnt_rele_wake, &sc->sc_refs);
1161
1162	/* stop pf producing work before cleaning up the timeouts and tasks */
1163	refcnt_finalize(&sc->sc_refs, "pfsyncfini");
1164
1165	pfsync_bulk_req_evt(sc, PFSYNC_BREQ_EVT_DOWN);
1166
1167	rw_enter_read(&pf_state_list.pfs_rwl);
1168	rw_enter_write(&sc->sc_bulk_snd.snd_lock);
1169	if (sc->sc_bulk_snd.snd_tail != NULL) {
1170		sndbar = !timeout_del(&sc->sc_bulk_snd.snd_tmo);
1171
1172		sc->sc_bulk_snd.snd_again = 0;
1173		sc->sc_bulk_snd.snd_next = NULL;
1174		sc->sc_bulk_snd.snd_tail = NULL;
1175	}
1176	rw_exit_write(&sc->sc_bulk_snd.snd_lock);
1177	rw_exit_read(&pf_state_list.pfs_rwl);
1178
1179	/*
1180	 * do a single barrier for all the timeouts. because the
1181	 * timeouts in each slice are configured the same way, the
1182	 * barrier for one will work for all of them.
1183	 */
1184	for (i = 0; i < nitems(sc->sc_slices); i++) {
1185		struct pfsync_slice *s = &sc->sc_slices[i];
1186
1187		timeout_del(&s->s_tmo);
1188		task_del(s->s_softnet, &s->s_task);
1189		task_del(s->s_softnet, &s->s_send);
1190
1191		timeout_del(&s->s_deferrals_tmo);
1192		task_del(s->s_softnet, &s->s_deferrals_task);
1193	}
1194	timeout_barrier(&sc->sc_slices[0].s_tmo);
1195	timeout_barrier(&sc->sc_bulk_req.req_tmo); /* XXX proc */
1196	if (sndbar) {
1197		/* technically the preceding barrier does the same job */
1198		timeout_barrier(&sc->sc_bulk_snd.snd_tmo);
1199	}
1200	net_tq_barriers("pfsyncbar");
1201
1202	/* pfsync is no longer running */
1203
1204	if (sc->sc_inm != NULL) {
1205		inm = sc->sc_inm;
1206		sc->sc_inm = NULL;
1207	}
1208
1209	for (i = 0; i < nitems(sc->sc_slices); i++) {
1210		struct pfsync_slice *s = &sc->sc_slices[i];
1211		struct pf_state *st;
1212
1213		pfsync_slice_drop(sc, s);
1214		mq_purge(&s->s_sendq);
1215
1216		while ((pd = TAILQ_FIRST(&s->s_deferrals)) != NULL) {
1217			TAILQ_REMOVE(&s->s_deferrals, pd, pd_entry);
1218
1219			st = pd->pd_st;
1220			st->sync_defer = NULL;
1221
1222			TAILQ_INSERT_TAIL(&pds, pd, pd_entry);
1223		}
1224		s->s_deferred = 0;
1225	}
1226
1227	NET_LOCK();
1228	sc->sc_up = 0;
1229
1230	if (inm != NULL)
1231		in_delmulti(inm);
1232
1233	while ((pd = TAILQ_FIRST(&pds)) != NULL) {
1234		TAILQ_REMOVE(&pds, pd, pd_entry);
1235
1236		pfsync_defer_output(pd);
1237	}
1238
1239	return (0);
1240}
1241
1242int
1243pfsync_is_up(void)
1244{
1245	int rv;
1246
1247	smr_read_enter();
1248	rv = SMR_PTR_GET(&pfsyncif) != NULL;
1249	smr_read_leave();
1250
1251	return (rv);
1252}
1253
1254static void
1255pfsync_start(struct ifqueue *ifq)
1256{
1257	ifq_purge(ifq);
1258}
1259
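/*
 * each queue of states (PFSYNC_S_*) has a write handler, the size of
 * the records it produces, and the action used in its subheader.
 */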
1260struct pfsync_q {
1261	void		(*write)(struct pf_state *, void *);
1262	size_t		len;
1263	u_int8_t	action;
1264};
1265
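/*
 * map a state to its slice by hashing the state key and take the slice
 * mutex. the mtx_enter_try dance is only there to count contention.
 */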
1266static struct pfsync_slice *
1267pfsync_slice_enter(struct pfsync_softc *sc, const struct pf_state *st)
1268{
1269	unsigned int idx = st->key[0]->hash % nitems(sc->sc_slices);
1270	struct pfsync_slice *s = &sc->sc_slices[idx];
1271
1272	if (!mtx_enter_try(&s->s_mtx)) {
1273		mtx_enter(&s->s_mtx);
1274		s->s_stat_contended++;
1275	}
1276	s->s_stat_locks++;
1277
1278	return (s);
1279}
1280
1281static void
1282pfsync_slice_leave(struct pfsync_softc *sc, struct pfsync_slice *s)
1283{
1284	mtx_leave(&s->s_mtx);
1285}
1286
1287/* we have one of these for every PFSYNC_S_ */
1288static void	pfsync_out_state(struct pf_state *, void *);
1289static void	pfsync_out_iack(struct pf_state *, void *);
1290static void	pfsync_out_upd_c(struct pf_state *, void *);
1291static void	pfsync_out_del(struct pf_state *, void *);
1292#if defined(IPSEC)
1293static void	pfsync_out_tdb(struct tdb *, void *);
1294#endif
1295
1296static const struct pfsync_q pfsync_qs[] = {
1297	{ pfsync_out_iack,  sizeof(struct pfsync_ins_ack), PFSYNC_ACT_INS_ACK },
1298	{ pfsync_out_upd_c, sizeof(struct pfsync_upd_c),   PFSYNC_ACT_UPD_C },
1299	{ pfsync_out_del,   sizeof(struct pfsync_del_c),   PFSYNC_ACT_DEL_C },
1300	{ pfsync_out_state, sizeof(struct pfsync_state),   PFSYNC_ACT_INS },
1301	{ pfsync_out_state, sizeof(struct pfsync_state),   PFSYNC_ACT_UPD }
1302};
1303
1304static void
1305pfsync_out_state(struct pf_state *st, void *buf)
1306{
1307	struct pfsync_state *sp = buf;
1308
1309	mtx_enter(&st->mtx);
1310	pf_state_export(sp, st);
1311	mtx_leave(&st->mtx);
1312}
1313
1314static void
1315pfsync_out_iack(struct pf_state *st, void *buf)
1316{
1317	struct pfsync_ins_ack *iack = buf;
1318
1319	iack->id = st->id;
1320	iack->creatorid = st->creatorid;
1321}
1322
1323static void
1324pfsync_out_upd_c(struct pf_state *st, void *buf)
1325{
1326	struct pfsync_upd_c *up = buf;
1327
1328	memset(up, 0, sizeof(*up));
1329	up->id = st->id;
1330	up->creatorid = st->creatorid;
1331
1332	mtx_enter(&st->mtx);
1333	pf_state_peer_hton(&st->src, &up->src);
1334	pf_state_peer_hton(&st->dst, &up->dst);
1335	up->timeout = st->timeout;
1336	mtx_leave(&st->mtx);
1337}
1338
1339static void
1340pfsync_out_del(struct pf_state *st, void *buf)
1341{
1342	struct pfsync_del_c *dp = buf;
1343
1344	dp->id = st->id;
1345	dp->creatorid = st->creatorid;
1346
1347	st->sync_state = PFSYNC_S_DEAD;
1348}
1349
1350#if defined(IPSEC)
1351static inline void
1352pfsync_tdb_enter(struct tdb *tdb)
1353{
1354	mtx_enter(&tdb->tdb_mtx);
1355}
1356
1357static inline void
1358pfsync_tdb_leave(struct tdb *tdb)
1359{
1360	unsigned int snapped = ISSET(tdb->tdb_flags, TDBF_PFSYNC_SNAPPED);
1361	mtx_leave(&tdb->tdb_mtx);
1362	if (snapped)
1363		wakeup_one(&tdb->tdb_updates);
1364}
1365#endif /* defined(IPSEC) */
1366
1367static void
1368pfsync_slice_drop(struct pfsync_softc *sc, struct pfsync_slice *s)
1369{
1370	struct pf_state *st;
1371	int q;
1372#if defined(IPSEC)
1373	struct tdb *tdb;
1374#endif
1375
1376	for (q = 0; q < nitems(s->s_qs); q++) {
1377		if (TAILQ_EMPTY(&s->s_qs[q]))
1378			continue;
1379
1380		while ((st = TAILQ_FIRST(&s->s_qs[q])) != NULL) {
1381			TAILQ_REMOVE(&s->s_qs[q], st, sync_list);
1382#ifdef PFSYNC_DEBUG
1383			KASSERT(st->sync_state == q);
1384#endif
1385			st->sync_state = PFSYNC_S_NONE;
1386			pf_state_unref(st);
1387		}
1388	}
1389
1390#if defined(IPSEC)
1391	while ((tdb = TAILQ_FIRST(&s->s_tdb_q)) != NULL) {
1392		TAILQ_REMOVE(&s->s_tdb_q, tdb, tdb_sync_entry);
1393
1394		pfsync_tdb_enter(tdb);
1395		KASSERT(ISSET(tdb->tdb_flags, TDBF_PFSYNC));
1396		CLR(tdb->tdb_flags, TDBF_PFSYNC);
1397		pfsync_tdb_leave(tdb);
1398	}
1399#endif /* defined(IPSEC) */
1400
1401	timeout_del(&s->s_tmo);
1402	s->s_len = PFSYNC_MINPKT;
1403}
1404
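/*
 * build a single pfsync packet out of everything queued on a slice:
 * ip header, pfsync header, then a subheader and records for each
 * non-empty queue. called with the slice mutex held.
 */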
1405static struct mbuf *
1406pfsync_slice_write(struct pfsync_slice *s)
1407{
1408	struct pfsync_softc *sc = s->s_pfsync;
1409	struct mbuf *m;
1410
1411	struct ip *ip;
1412	struct pfsync_header *ph;
1413	struct pfsync_subheader *subh;
1414
1415	unsigned int mlen = max_linkhdr + s->s_len;
1416	unsigned int q, count;
1417	caddr_t ptr;
1418	size_t off;
1419
1420	MUTEX_ASSERT_LOCKED(&s->s_mtx);
1421	if (s->s_len == PFSYNC_MINPKT) {
1422		s->s_stat_write_nop++;
1423		return (NULL);
1424	}
1425
1426	task_del(s->s_softnet, &s->s_task);
1427
1428	m = m_gethdr(M_DONTWAIT, MT_DATA);
1429	if (m == NULL)
1430		goto drop;
1431
1432	if (mlen > MHLEN) {
1433		MCLGETL(m, M_DONTWAIT, mlen);
1434		if (!ISSET(m->m_flags, M_EXT))
1435			goto drop;
1436	}
1437
1438	m_align(m, s->s_len);
1439	m->m_len = m->m_pkthdr.len = s->s_len;
1440
1441	ptr = mtod(m, caddr_t);
1442	off = 0;
1443
1444	ip = (struct ip *)(ptr + off);
1445	off += sizeof(*ip);
1446	*ip = sc->sc_template;
1447	ip->ip_len = htons(m->m_pkthdr.len);
1448	ip->ip_id = htons(ip_randomid());
1449
1450	ph = (struct pfsync_header *)(ptr + off);
1451	off += sizeof(*ph);
1452	memset(ph, 0, sizeof(*ph));
1453	ph->version = PFSYNC_VERSION;
1454	ph->len = htons(m->m_pkthdr.len - sizeof(*ip));
1455
1456	for (q = 0; q < nitems(s->s_qs); q++) {
1457		struct pf_state_queue *psq = &s->s_qs[q];
1458		struct pf_state *st;
1459
1460		if (TAILQ_EMPTY(psq))
1461			continue;
1462
1463		subh = (struct pfsync_subheader *)(ptr + off);
1464		off += sizeof(*subh);
1465
1466		count = 0;
1467		while ((st = TAILQ_FIRST(psq)) != NULL) {
1468			TAILQ_REMOVE(psq, st, sync_list);
1469			count++;
1470
1471			KASSERT(st->sync_state == q);
1472			/* the write handler below may override this */
1473			st->sync_state = PFSYNC_S_NONE;
1474
1475			pfsync_qs[q].write(st, ptr + off);
1476			off += pfsync_qs[q].len;
1477
1478			pf_state_unref(st);
1479		}
1480
1481		subh->action = pfsync_qs[q].action;
1482		subh->len = pfsync_qs[q].len >> 2;
1483		subh->count = htons(count);
1484	}
1485
1486#if defined(IPSEC)
1487	if (!TAILQ_EMPTY(&s->s_tdb_q)) {
1488		struct tdb *tdb;
1489
1490		subh = (struct pfsync_subheader *)(ptr + off);
1491		off += sizeof(*subh);
1492
1493		count = 0;
1494		while ((tdb = TAILQ_FIRST(&s->s_tdb_q)) != NULL) {
1495			TAILQ_REMOVE(&s->s_tdb_q, tdb, tdb_sync_entry);
1496			count++;
1497
1498			pfsync_tdb_enter(tdb);
1499			KASSERT(ISSET(tdb->tdb_flags, TDBF_PFSYNC));
1500
1501			/* get a consistent view of the counters */
1502			pfsync_out_tdb(tdb, ptr + off);
1503
1504			CLR(tdb->tdb_flags, TDBF_PFSYNC);
1505			pfsync_tdb_leave(tdb);
1506
1507			off += sizeof(struct pfsync_tdb);
1508		}
1509
1510		subh->action = PFSYNC_ACT_TDB;
1511		subh->len = sizeof(struct pfsync_tdb) >> 2;
1512		subh->count = htons(count);
1513	}
1514#endif
1515
1516	timeout_del(&s->s_tmo);
1517	s->s_len = PFSYNC_MINPKT;
1518
1519	return (m);
1520drop:
1521	m_freem(m);
1522	pfsyncstat_inc(pfsyncs_onomem);
1523	pfsync_slice_drop(sc, s);
1524	return (NULL);
1525}
1526
1527static void
1528pfsync_sendout(struct pfsync_softc *sc, struct mbuf *m)
1529{
1530	struct ip_moptions imo;
1531	unsigned int len = m->m_pkthdr.len;
1532#if NBPFILTER > 0
1533	caddr_t if_bpf = sc->sc_if.if_bpf;
1534	if (if_bpf)
1535		bpf_mtap(if_bpf, m, BPF_DIRECTION_OUT);
1536#endif
1537
1538	imo.imo_ifidx = sc->sc_sync_ifidx;
1539	imo.imo_ttl = PFSYNC_DFLTTL;
1540	imo.imo_loop = 0;
1541	m->m_pkthdr.ph_rtableid = sc->sc_if.if_rdomain;
1542
1543	if (ip_output(m, NULL, NULL, IP_RAWOUTPUT, &imo, NULL, 0) == 0) {
1544		counters_pkt(sc->sc_if.if_counters, ifc_opackets,
1545		    ifc_obytes, len);
1546		pfsyncstat_inc(pfsyncs_opackets);
1547	} else {
1548		counters_inc(sc->sc_if.if_counters, ifc_oerrors);
1549		pfsyncstat_inc(pfsyncs_oerrors);
1550	}
1551}
1552
1553static void
1554pfsync_slice_tmo(void *arg)
1555{
1556	struct pfsync_slice *s = arg;
1557
1558	task_add(s->s_softnet, &s->s_task);
1559}
1560
1561static void
1562pfsync_slice_sched(struct pfsync_slice *s)
1563{
1564	s->s_stat_task_add++;
1565	task_add(s->s_softnet, &s->s_task);
1566}
1567
1568static void
1569pfsync_slice_task(void *arg)
1570{
1571	struct pfsync_slice *s = arg;
1572	struct mbuf *m;
1573
1574	mtx_enter(&s->s_mtx);
1575	s->s_stat_task_run++;
1576
1577	m = pfsync_slice_write(s);
1578	mtx_leave(&s->s_mtx);
1579	if (m != NULL) {
1580		NET_LOCK();
1581		pfsync_sendout(s->s_pfsync, m);
1582		NET_UNLOCK();
1583	}
1584}
1585
1586static void
1587pfsync_slice_sendq(void *arg)
1588{
1589	struct pfsync_slice *s = arg;
1590	struct mbuf_list ml;
1591	struct mbuf *m;
1592
1593	mq_delist(&s->s_sendq, &ml);
1594	if (ml_empty(&ml))
1595		return;
1596
1597	mtx_enter(&s->s_mtx);
1598	s->s_stat_dequeue++;
1599	mtx_leave(&s->s_mtx);
1600
1601	NET_LOCK();
1602	while ((m = ml_dequeue(&ml)) != NULL)
1603		pfsync_sendout(s->s_pfsync, m);
1604	NET_UNLOCK();
1605}
1606
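/*
 * queue a state on one of the slice queues and account for the space
 * its record (plus a subheader if the queue was empty) will use. if
 * the pending packet would grow past the mtu, write it out and hand
 * it to the send queue first.
 */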
1607static void
1608pfsync_q_ins(struct pfsync_slice *s, struct pf_state *st, unsigned int q)
1609{
1610	size_t nlen = pfsync_qs[q].len;
1611	struct mbuf *m = NULL;
1612
1613	MUTEX_ASSERT_LOCKED(&s->s_mtx);
1614	KASSERT(st->sync_state == PFSYNC_S_NONE);
1615	KASSERT(s->s_len >= PFSYNC_MINPKT);
1616
1617	if (TAILQ_EMPTY(&s->s_qs[q]))
1618		nlen += sizeof(struct pfsync_subheader);
1619
1620	if (s->s_len + nlen > s->s_pfsync->sc_if.if_mtu) {
1621		m = pfsync_slice_write(s);
1622		if (m != NULL) {
1623			s->s_stat_enqueue++;
1624			if (mq_enqueue(&s->s_sendq, m) == 0)
1625				task_add(s->s_softnet, &s->s_send);
1626		}
1627
1628		nlen = sizeof(struct pfsync_subheader) + pfsync_qs[q].len;
1629	}
1630
1631	s->s_len += nlen;
1632	pf_state_ref(st);
1633	TAILQ_INSERT_TAIL(&s->s_qs[q], st, sync_list);
1634	st->sync_state = q;
1635
1636	if (!timeout_pending(&s->s_tmo))
1637		timeout_add_sec(&s->s_tmo, 1);
1638}
1639
1640static void
1641pfsync_q_del(struct pfsync_slice *s, struct pf_state *st)
1642{
1643	unsigned int q = st->sync_state;
1644
1645	MUTEX_ASSERT_LOCKED(&s->s_mtx);
1646	KASSERT(st->sync_state < PFSYNC_S_NONE);
1647
1648	st->sync_state = PFSYNC_S_NONE;
1649	TAILQ_REMOVE(&s->s_qs[q], st, sync_list);
1650	pf_state_unref(st);
1651	s->s_len -= pfsync_qs[q].len;
1652
1653	if (TAILQ_EMPTY(&s->s_qs[q]))
1654		s->s_len -= sizeof(struct pfsync_subheader);
1655}
1656
1657/*
1658 * the pfsync hooks that pf calls
1659 */
1660
1661void
1662pfsync_init_state(struct pf_state *st, const struct pf_state_key *skw,
1663    const struct pf_state_key *sks, int flags)
1664{
1665	/* this is called before pf_state_insert */
1666
1667	if (skw->proto == IPPROTO_PFSYNC)
1668		SET(st->state_flags, PFSTATE_NOSYNC);
1669
1670	if (ISSET(st->state_flags, PFSTATE_NOSYNC)) {
1671		st->sync_state = PFSYNC_S_DEAD;
1672		return;
1673	}
1674
1675	if (ISSET(flags, PFSYNC_SI_IOCTL)) {
1676		/* all good */
1677		return;
1678	}
1679
1680	/* state came off the wire */
1681	if (ISSET(flags, PFSYNC_SI_PFSYNC)) {
1682		if (ISSET(st->state_flags, PFSTATE_ACK)) {
1683			CLR(st->state_flags, PFSTATE_ACK);
1684
1685			/* peer wants an iack, not an insert */
1686			st->sync_state = PFSYNC_S_SYNC;
1687		} else
1688			st->sync_state = PFSYNC_S_PFSYNC;
1689	}
1690}
1691
1692void
1693pfsync_insert_state(struct pf_state *st)
1694{
1695	struct pfsync_softc *sc;
1696
1697	MUTEX_ASSERT_UNLOCKED(&st->mtx);
1698
1699	if (ISSET(st->state_flags, PFSTATE_NOSYNC) ||
1700	    st->sync_state == PFSYNC_S_DEAD)
1701		return;
1702
1703	smr_read_enter();
1704	sc = SMR_PTR_GET(&pfsyncif);
1705	if (sc != NULL) {
1706		struct pfsync_slice *s = pfsync_slice_enter(sc, st);
1707
1708		switch (st->sync_state) {
1709		case PFSYNC_S_UPD_C:
1710			/* we must have lost a race after insert */
1711			pfsync_q_del(s, st);
1712			/* FALLTHROUGH */
1713		case PFSYNC_S_NONE:
1714			pfsync_q_ins(s, st, PFSYNC_S_INS);
1715			break;
1716		case PFSYNC_S_SYNC:
1717			st->sync_state = PFSYNC_S_NONE; /* gross */
1718			pfsync_q_ins(s, st, PFSYNC_S_IACK);
1719			pfsync_slice_sched(s); /* the peer is waiting */
1720			break;
1721		case PFSYNC_S_PFSYNC:
1722			/* state was just inserted by pfsync */
1723			st->sync_state = PFSYNC_S_NONE;
1724			break;
1725		default:
1726			panic("%s: state %p unexpected sync_state %d",
1727			    __func__, st, st->sync_state);
1728			/* NOTREACHED */
1729		}
1730
1731		pfsync_slice_leave(sc, s);
1732	}
1733	smr_read_leave();
1734}
1735
1736void
1737pfsync_update_state(struct pf_state *st)
1738{
1739	struct pfsync_softc *sc;
1740
1741	MUTEX_ASSERT_UNLOCKED(&st->mtx);
1742
1743	if (ISSET(st->state_flags, PFSTATE_NOSYNC) ||
1744	    st->sync_state == PFSYNC_S_DEAD)
1745		return;
1746
1747	smr_read_enter();
1748	sc = SMR_PTR_GET(&pfsyncif);
1749	if (sc != NULL) {
1750		struct pfsync_slice *s = pfsync_slice_enter(sc, st);
1751		int sync = 0;
1752
1753		switch (st->sync_state) {
1754		case PFSYNC_S_UPD_C:
1755		case PFSYNC_S_UPD:
1756			/* we're already handling it */
1757			if (st->key[PF_SK_WIRE]->proto == IPPROTO_TCP) {
1758				st->sync_updates++;
1759				if (st->sync_updates >= sc->sc_maxupdates)
1760					sync = 1;
1761			}
1762			/* FALLTHROUGH */
1763		case PFSYNC_S_INS:
1764		case PFSYNC_S_DEL:
1765		case PFSYNC_S_DEAD:
1766			break;
1767
1768		case PFSYNC_S_IACK:
1769			pfsync_q_del(s, st);
1770			/* FALLTHROUGH */
1771		case PFSYNC_S_NONE:
1772			pfsync_q_ins(s, st, PFSYNC_S_UPD_C);
1773			st->sync_updates = 0;
1774			break;
1775		default:
1776			panic("%s: state %p unexpected sync_state %d",
1777			    __func__, st, st->sync_state);
1778			/* NOTREACHED */
1779		}
1780
1781		if (!sync && (getuptime() - st->pfsync_time) < 2)
1782			sync = 1;
1783
1784		if (sync)
1785			pfsync_slice_sched(s);
1786		pfsync_slice_leave(sc, s);
1787	}
1788	smr_read_leave();
1789}
1790
1791void
1792pfsync_delete_state(struct pf_state *st)
1793{
1794	struct pfsync_softc *sc;
1795
1796	MUTEX_ASSERT_UNLOCKED(&st->mtx);
1797
1798	if (ISSET(st->state_flags, PFSTATE_NOSYNC) ||
1799	    st->sync_state == PFSYNC_S_DEAD)
1800		return;
1801
1802	smr_read_enter();
1803	sc = SMR_PTR_GET(&pfsyncif);
1804	if (sc != NULL) {
1805		struct pfsync_slice *s = pfsync_slice_enter(sc, st);
1806
1807		switch (st->sync_state) {
1808		case PFSYNC_S_INS:
1809			/* let's pretend this never happened */
1810			pfsync_q_del(s, st);
1811			break;
1812
1813		case PFSYNC_S_UPD_C:
1814		case PFSYNC_S_UPD:
1815		case PFSYNC_S_IACK:
1816			pfsync_q_del(s, st);
1817			/* FALLTHROUGH */
1818		case PFSYNC_S_NONE:
1819			pfsync_q_ins(s, st, PFSYNC_S_DEL);
1820			st->sync_updates = 0;
1821			break;
1822		case PFSYNC_S_DEL:
1823		case PFSYNC_S_DEAD:
1824			/* XXX we should count this */
1825			break;
1826		default:
1827			panic("%s: state %p unexpected sync_state %d",
1828			    __func__, st, st->sync_state);
1829			/* NOTREACHED */
1830		}
1831
1832		pfsync_slice_leave(sc, s);
1833	}
1834	smr_read_leave();
1835}
1836
1837struct pfsync_subh_clr {
1838	struct pfsync_subheader	subh;
1839	struct pfsync_clr	clr;
1840} __packed __aligned(4);
1841
1842void
1843pfsync_clear_states(u_int32_t creatorid, const char *ifname)
1844{
1845	struct pfsync_softc *sc;
1846	struct pfsync_subh_clr *h;
1847	struct mbuf *m;
1848	unsigned int hlen, mlen;
1849
1850	smr_read_enter();
1851	sc = SMR_PTR_GET(&pfsyncif);
1852	if (sc != NULL)
1853		refcnt_take(&sc->sc_refs);
1854	smr_read_leave();
1855
1856	if (sc == NULL)
1857		return;
1858
1859	hlen = sizeof(sc->sc_template) +
1860	    sizeof(struct pfsync_header) +
1861	    sizeof(*h);
1862
1863	mlen = max_linkhdr + hlen;
1864
1865	m = m_gethdr(M_DONTWAIT, MT_DATA);
1866	if (m == NULL) {
1867		/* count error */
1868		goto leave;
1869	}
1870
1871	if (mlen > MHLEN) {
1872		MCLGETL(m, M_DONTWAIT, mlen);
1873		if (!ISSET(m->m_flags, M_EXT)) {
1874			m_freem(m);
1875			goto leave;
1876		}
1877	}
1878
1879	m_align(m, sizeof(*h));
1880	h = mtod(m, struct pfsync_subh_clr *);
1881
1882	h->subh.action = PFSYNC_ACT_CLR;
1883	h->subh.len = sizeof(h->clr) >> 2;
1884	h->subh.count = htons(1);
1885
1886	strlcpy(h->clr.ifname, ifname, sizeof(h->clr.ifname));
1887	h->clr.creatorid = creatorid;
1888
1889	m->m_pkthdr.len = m->m_len = sizeof(*h);
1890	m = pfsync_encap(sc, m);
1891	if (m == NULL)
1892		goto leave;
1893
1894	pfsync_sendout(sc, m);
1895leave:
1896	refcnt_rele_wake(&sc->sc_refs);
1897}
1898
1899int
1900pfsync_state_in_use(struct pf_state *st)
1901{
1902	struct pfsync_softc *sc;
1903	int rv = 0;
1904
1905	smr_read_enter();
1906	sc = SMR_PTR_GET(&pfsyncif);
1907	if (sc != NULL) {
1908		/*
1909		 * pfsync bulk sends run inside
1910		 * rw_enter_read(&pf_state_list.pfs_rwl), and this
1911		 * code (pfsync_state_in_use) is only called from the
1912		 * purge code inside
1913		 * rw_enter_write(&pf_state_list.pfs_rwl). therefore,
1914		 * those two sections are exclusive so we can safely
1915		 * look at the bulk send pointers.
1916		 */
1917		/* rw_assert_wrlock(&pf_state_list.pfs_rwl); */
1918		if (sc->sc_bulk_snd.snd_next == st ||
1919		    sc->sc_bulk_snd.snd_tail == st)
1920			rv = 1;
1921	}
1922	smr_read_leave();
1923
1924	return (rv);
1925}
1926
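/*
 * hold on to the packet that created this state until the peer has
 * acknowledged the state insert or the deferral deadline passes, so
 * the peer doesn't see reply traffic for a state it doesn't know yet.
 * returns 1 if pfsync took over the mbuf.
 */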
1927int
1928pfsync_defer(struct pf_state *st, struct mbuf *m)
1929{
1930	struct pfsync_softc *sc;
1931	struct pfsync_slice *s;
1932	struct pfsync_deferral *pd;
1933	int sched = 0;
1934	int rv = 0;
1935
1936	if (ISSET(st->state_flags, PFSTATE_NOSYNC) ||
1937	    ISSET(m->m_flags, M_BCAST|M_MCAST))
1938		return (0);
1939
1940	smr_read_enter();
1941	sc = SMR_PTR_GET(&pfsyncif);
1942	if (sc == NULL || !sc->sc_defer)
1943		goto leave;
1944
	pd = pool_get(&pfsync_deferrals_pool, PR_NOWAIT);
1946	if (pd == NULL) {
1947		goto leave;
1948	}
1949
1950	s = pfsync_slice_enter(sc, st);
1951	s->s_stat_defer_add++;
1952
1953	pd->pd_st = pf_state_ref(st);
1954	pd->pd_m = m;
1955	pd->pd_deadline = getnsecuptime() + PFSYNC_DEFER_NSEC;
1956
1957	m->m_pkthdr.pf.flags |= PF_TAG_GENERATED;
1958	st->sync_defer = pd;
1959
1960	sched = s->s_deferred++;
1961	TAILQ_INSERT_TAIL(&s->s_deferrals, pd, pd_entry);
1962
1963	if (sched == 0)
1964		timeout_add_nsec(&s->s_deferrals_tmo, PFSYNC_DEFER_NSEC);
1965	else if (sched >= PFSYNC_DEFER_LIMIT) {
1966		s->s_stat_defer_overlimit++;
1967		timeout_del(&s->s_deferrals_tmo);
1968		task_add(s->s_softnet, &s->s_deferrals_task);
1969	}
1970
1971	pfsync_slice_sched(s);
1972	pfsync_slice_leave(sc, s);
1973	rv = 1;
1974leave:
1975	smr_read_leave();
1976
1977	return (rv);
1978}
1979
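/*
 * the peer acknowledged this state, so any packet deferred for it can
 * be released and transmitted straight away.
 */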
1980static void
1981pfsync_deferred(struct pfsync_softc *sc, struct pf_state *st)
1982{
1983	struct pfsync_slice *s;
1984	struct pfsync_deferral *pd;
1985
1986	s = pfsync_slice_enter(sc, st);
1987
1988	pd = st->sync_defer;
1989	if (pd != NULL) {
1990		s->s_stat_defer_ack++;
1991
1992		TAILQ_REMOVE(&s->s_deferrals, pd, pd_entry);
1993		s->s_deferred--;
1994
1995		st = pd->pd_st;
1996		st->sync_defer = NULL;
1997	}
1998	pfsync_slice_leave(sc, s);
1999
2000	if (pd != NULL)
2001		pfsync_defer_output(pd);
2002}
2003
2004static void
2005pfsync_deferrals_tmo(void *arg)
2006{
2007	struct pfsync_slice *s = arg;
2008
2009	if (READ_ONCE(s->s_deferred) > 0)
2010		task_add(s->s_softnet, &s->s_deferrals_task);
2011}
2012
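/*
 * push out deferrals that are past their deadline (or over the limit)
 * and transmit the released packets outside the slice mutex.
 */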
2013static void
2014pfsync_deferrals_task(void *arg)
2015{
2016	struct pfsync_slice *s = arg;
2017	struct pfsync_deferral *pd;
2018	struct pf_state *st;
2019	uint64_t now, nsec = 0;
2020	struct pfsync_deferrals pds = TAILQ_HEAD_INITIALIZER(pds);
2021
2022	now = getnsecuptime();
2023
2024	mtx_enter(&s->s_mtx);
2025	s->s_stat_defer_run++; /* maybe move this into the loop */
2026	for (;;) {
2027		pd = TAILQ_FIRST(&s->s_deferrals);
2028		if (pd == NULL)
2029			break;
2030
2031		if (s->s_deferred < PFSYNC_DEFER_LIMIT &&
2032		    now < pd->pd_deadline) {
2033			nsec = pd->pd_deadline - now;
2034			break;
2035		}
2036
2037		TAILQ_REMOVE(&s->s_deferrals, pd, pd_entry);
2038		s->s_deferred--;
2039
2040		/*
2041		 * detach the pd from the state. the pd still refers
2042		 * to the state though.
2043		 */
2044		st = pd->pd_st;
2045		st->sync_defer = NULL;
2046
2047		TAILQ_INSERT_TAIL(&pds, pd, pd_entry);
2048	}
2049	mtx_leave(&s->s_mtx);
2050
2051	if (nsec > 0) {
2052		/* we were looking at a pd, but it wasn't old enough */
2053		timeout_add_nsec(&s->s_deferrals_tmo, nsec);
2054	}
2055
2056	if (TAILQ_EMPTY(&pds))
2057		return;
2058
2059	NET_LOCK();
2060	while ((pd = TAILQ_FIRST(&pds)) != NULL) {
2061		TAILQ_REMOVE(&pds, pd, pd_entry);
2062
2063		pfsync_defer_output(pd);
2064	}
2065	NET_UNLOCK();
2066}
2067
2068static void
2069pfsync_defer_output(struct pfsync_deferral *pd)
2070{
2071	struct pf_pdesc pdesc;
2072	struct pf_state *st = pd->pd_st;
2073
2074	if (st->rt == PF_ROUTETO) {
2075		if (pf_setup_pdesc(&pdesc, st->key[PF_SK_WIRE]->af,
2076		    st->direction, NULL, pd->pd_m, NULL) != PF_PASS)
2077			return;
2078		switch (st->key[PF_SK_WIRE]->af) {
2079		case AF_INET:
2080			pf_route(&pdesc, st);
2081			break;
2082#ifdef INET6
2083		case AF_INET6:
2084			pf_route6(&pdesc, st);
2085			break;
2086#endif /* INET6 */
2087		default:
2088			unhandled_af(st->key[PF_SK_WIRE]->af);
2089		}
2090		pd->pd_m = pdesc.m;
2091	} else {
2092		switch (st->key[PF_SK_WIRE]->af) {
2093		case AF_INET:
2094			ip_output(pd->pd_m, NULL, NULL, 0, NULL, NULL, 0);
2095			break;
2096#ifdef INET6
2097		case AF_INET6:
2098			ip6_output(pd->pd_m, NULL, NULL, 0, NULL, NULL);
2099			break;
2100#endif /* INET6 */
2101		default:
2102			unhandled_af(st->key[PF_SK_WIRE]->af);
2103		}
2104
2105		pd->pd_m = NULL;
2106	}
2107
2108	pf_state_unref(st);
2109	m_freem(pd->pd_m);
2110	pool_put(&pfsync_deferrals_pool, pd);
2111}
2112
2113struct pfsync_subh_bus {
2114	struct pfsync_subheader	subh;
2115	struct pfsync_bus	bus;
2116} __packed __aligned(4);
2117
2118static unsigned int
2119pfsync_bulk_snd_bus(struct pfsync_softc *sc,
2120    struct mbuf *m, const unsigned int space,
2121    uint32_t endtime, uint8_t status)
2122{
2123	struct pfsync_subh_bus *h;
2124	unsigned int nlen;
2125
2126	nlen = m->m_len + sizeof(*h);
2127	if (space < nlen)
2128		return (0);
2129
2130	h = (struct pfsync_subh_bus *)(mtod(m, caddr_t) + m->m_len);
2131	memset(h, 0, sizeof(*h));
2132
2133	h->subh.action = PFSYNC_ACT_BUS;
2134	h->subh.len = sizeof(h->bus) >> 2;
2135	h->subh.count = htons(1);
2136
2137	h->bus.creatorid = pf_status.hostid;
2138	h->bus.endtime = htonl(endtime);
2139	h->bus.status = status;
2140
2141	m->m_len = nlen;
2142
2143	return (1);
2144}
2145
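/*
 * append as many full state records as fit in the remaining space,
 * resuming from snd_next. when the snapshotted tail is reached a
 * terminating bus subheader is added too; otherwise the send timeout
 * is rearmed to continue with the next packet.
 */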
2146static unsigned int
2147pfsync_bulk_snd_states(struct pfsync_softc *sc,
2148    struct mbuf *m, const unsigned int space, unsigned int len)
2149{
2150	struct pf_state *st;
2151	struct pfsync_state *sp;
2152	unsigned int nlen;
2153	unsigned int count = 0;
2154
2155	st = sc->sc_bulk_snd.snd_next;
2156
2157	for (;;) {
2158		nlen = len + sizeof(*sp);
2159		sp = (struct pfsync_state *)(mtod(m, caddr_t) + len);
2160		if (space < nlen)
2161			break;
2162
2163		mtx_enter(&st->mtx);
2164		pf_state_export(sp, st);
2165		mtx_leave(&st->mtx);
2166
2167		/* commit */
2168		count++;
2169		m->m_len = len = nlen;
2170
2171		if (st == sc->sc_bulk_snd.snd_tail) {
2172			if (pfsync_bulk_snd_bus(sc, m, space,
2173			    0, PFSYNC_BUS_END) == 0) {
2174				/* couldn't fit the BUS */
2175				st = NULL;
2176				break;
2177			}
2178
2179			/* this BUS is done */
2180			pfsync_dprintf(sc, "bulk send done (%s)", __func__);
2181			sc->sc_bulk_snd.snd_again = 0; /* XXX */
2182			sc->sc_bulk_snd.snd_next = NULL;
2183			sc->sc_bulk_snd.snd_tail = NULL;
2184			return (count);
2185		}
2186
2187		st = TAILQ_NEXT(st, entry_list);
2188	}
2189
2190	/* there's still work to do */
2191	sc->sc_bulk_snd.snd_next = st;
2192	timeout_add_msec(&sc->sc_bulk_snd.snd_tmo, PFSYNC_BULK_SND_IVAL_MS);
2193
2194	return (count);
2195}
2196
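/*
 * reserve room for a subheader, let pfsync_bulk_snd_states pack
 * states in after it, and only then fill the subheader in with the
 * number of states that actually made it into the packet.
 */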
2197static unsigned int
2198pfsync_bulk_snd_sub(struct pfsync_softc *sc,
2199    struct mbuf *m, const unsigned int space)
2200{
2201	struct pfsync_subheader *subh;
2202	unsigned int count;
2203	unsigned int len, nlen;
2204
2205	len = m->m_len;
2206	nlen = len + sizeof(*subh);
2207	if (nlen > space)
2208		return (0);
2209
2210	subh = (struct pfsync_subheader *)(mtod(m, caddr_t) + len);
2211
2212	/*
2213	 * pfsync_bulk_snd_states only updates m->m_len after it
2214	 * has filled in a state at the offset we gave it.
2215	 */
2216	count = pfsync_bulk_snd_states(sc, m, space, nlen);
2217	if (count == 0)
2218		return (0);
2219
2220	subh->action = PFSYNC_ACT_UPD;
2221	subh->len = sizeof(struct pfsync_state) >> 2;
2222	subh->count = htons(count);
2223
2224	return (count);
2225}
2226
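/*
 * start a bulk send in response to a peer's bulk update request.
 * space is the if_mtu less the IP and pfsync headers, ie, the room
 * left in a single packet for subheaders and their payloads (with
 * the default 1500 byte mtu this is roughly 1460 bytes). if a bulk
 * send is already running, just note that another one was asked for.
 */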
2227static void
2228pfsync_bulk_snd_start(struct pfsync_softc *sc)
2229{
2230	const unsigned int space = sc->sc_if.if_mtu -
2231	    (sizeof(struct ip) + sizeof(struct pfsync_header));
2232	struct mbuf *m;
2233
2234	rw_enter_read(&pf_state_list.pfs_rwl);
2235
2236	rw_enter_write(&sc->sc_bulk_snd.snd_lock);
2237	if (sc->sc_bulk_snd.snd_next != NULL) {
2238		sc->sc_bulk_snd.snd_again = 1;
2239		goto leave;
2240	}
2241
2242	mtx_enter(&pf_state_list.pfs_mtx);
2243	sc->sc_bulk_snd.snd_next = TAILQ_FIRST(&pf_state_list.pfs_list);
2244	sc->sc_bulk_snd.snd_tail = TAILQ_LAST(&pf_state_list.pfs_list,
2245	    pf_state_queue);
2246	mtx_leave(&pf_state_list.pfs_mtx);
2247
2248	m = m_gethdr(M_DONTWAIT, MT_DATA);
2249	if (m == NULL)
2250		goto leave;
2251
2252	MCLGETL(m, M_DONTWAIT, max_linkhdr + sc->sc_if.if_mtu);
2253	if (!ISSET(m->m_flags, M_EXT)) {
2254		/* some error++ */
2255		m_freem(m); /* drop */
2256		goto leave;
2257	}
2258
2259	m_align(m, space);
2260	m->m_len = 0;
2261
2262	if (sc->sc_bulk_snd.snd_tail == NULL) {
2263		pfsync_dprintf(sc, "bulk send empty (%s)", __func__);
2264
2265		/* list is empty */
2266		if (pfsync_bulk_snd_bus(sc, m, space, 0, PFSYNC_BUS_END) == 0)
2267			panic("%s: mtu is too low", __func__);
2268		goto encap;
2269	}
2270
2271	pfsync_dprintf(sc, "bulk send start (%s)", __func__);
2272
2273	/* start a bulk update. */
2274	if (pfsync_bulk_snd_bus(sc, m, space, 0, PFSYNC_BUS_START) == 0)
2275		panic("%s: mtu is too low", __func__);
2276
2277	/* fill it up with state updates. */
2278	pfsync_bulk_snd_sub(sc, m, space);
2279
2280encap:
2281	m->m_pkthdr.len = m->m_len;
2282	m = pfsync_encap(sc, m);
2283	if (m == NULL)
2284		goto leave;
2285
2286	pfsync_sendout(sc, m);
2287
2288leave:
2289	rw_exit_write(&sc->sc_bulk_snd.snd_lock);
2290
2291	rw_exit_read(&pf_state_list.pfs_rwl);
2292}
2293
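/*
 * the bulk send timeout continues an in-progress bulk send. it
 * builds another packet's worth of state updates, or just the BUS
 * END message if the previous packet ran out of room for it.
 */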
2294static void
2295pfsync_bulk_snd_tmo(void *arg)
2296{
2297	struct pfsync_softc *sc = arg;
2298	const unsigned int space = sc->sc_if.if_mtu -
2299	    (sizeof(struct ip) + sizeof(struct pfsync_header));
2300	struct mbuf *m;
2301
2302	m = m_gethdr(M_DONTWAIT, MT_DATA);
2303	if (m == NULL) {
2304		/* some error++ */
2305		/* retry later */
2306		timeout_add_msec(&sc->sc_bulk_snd.snd_tmo,
2307		    PFSYNC_BULK_SND_IVAL_MS);
2308		return;
2309	}
2310
2311	MCLGETL(m, M_DONTWAIT, max_linkhdr + sc->sc_if.if_mtu);
2312	if (!ISSET(m->m_flags, M_EXT)) {
2313		/* some error++ */
2314		m_freem(m);
2315		/* retry later */
2316		timeout_add_msec(&sc->sc_bulk_snd.snd_tmo,
2317		    PFSYNC_BULK_SND_IVAL_MS);
2318		return;
2319	}
2320
2321	m_align(m, space);
2322	m->m_len = 0;
2323
2324	rw_enter_read(&pf_state_list.pfs_rwl);
2325	rw_enter_write(&sc->sc_bulk_snd.snd_lock);
2326
2327	if (sc->sc_bulk_snd.snd_next == NULL) {
2328		/* there was no space in the previous packet for a BUS END */
2329
2330		if (pfsync_bulk_snd_bus(sc, m, space, 0, PFSYNC_BUS_END) == 0)
2331			panic("%s: mtu is too low", __func__);
2332
2333		/* this bulk is done */
2334		pfsync_dprintf(sc, "bulk send done (%s)", __func__);
2335		sc->sc_bulk_snd.snd_again = 0; /* XXX */
2336		sc->sc_bulk_snd.snd_tail = NULL;
2337	} else {
2338		pfsync_dprintf(sc, "bulk send again (%s)", __func__);
2339
2340		/* fill it up with state updates. */
2341		pfsync_bulk_snd_sub(sc, m, space);
2342	}
2343
2344	m->m_pkthdr.len = m->m_len;
2345	m = pfsync_encap(sc, m);
2346
2347	rw_exit_write(&sc->sc_bulk_snd.snd_lock);
2348	rw_exit_read(&pf_state_list.pfs_rwl);
2349
2350	if (m != NULL) {
2351		NET_LOCK();
2352		pfsync_sendout(sc, m);
2353		NET_UNLOCK();
2354	}
2355}
2356
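/*
 * queue a full update for a state a peer has asked about. compressed
 * update and insert-ack queue entries are upgraded to a full update;
 * states already queued for insert, update, or delete are left alone.
 */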
2357static void
2358pfsync_update_state_req(struct pfsync_softc *sc, struct pf_state *st)
2359{
2360	struct pfsync_slice *s = pfsync_slice_enter(sc, st);
2361
2362	switch (st->sync_state) {
2363	case PFSYNC_S_UPD_C:
2364	case PFSYNC_S_IACK:
2365		pfsync_q_del(s, st);
2366		/* FALLTHROUGH */
2367	case PFSYNC_S_NONE:
2368		pfsync_q_ins(s, st, PFSYNC_S_UPD);
2369		break;
2370
2371	case PFSYNC_S_INS:
2372	case PFSYNC_S_UPD:
2373	case PFSYNC_S_DEL:
2374		/* we're already handling it */
2375		break;
2376	default:
2377		panic("%s: state %p unexpected sync_state %d",
2378		    __func__, st, st->sync_state);
2379	}
2380
2381	pfsync_slice_sched(s);
2382	pfsync_slice_leave(sc, s);
2383}
2384
2385#if defined(IPSEC)
2386static void
2387pfsync_out_tdb(struct tdb *tdb, void *buf)
2388{
2389	struct pfsync_tdb *ut = buf;
2390
2391	memset(ut, 0, sizeof(*ut));
2392	ut->spi = tdb->tdb_spi;
2393	memcpy(&ut->dst, &tdb->tdb_dst, sizeof(ut->dst));
2394	/*
2395	 * When a failover happens, the master's rpl is probably above
2396	 * what we see here (we may be up to a second late), so
2397	 * increase it a bit for outbound tdbs to manage most such
2398	 * situations.
2399	 *
2400	 * For now, just add an offset that is likely to be larger
2401	 * than the number of packets we can see in one second. The RFC
2402	 * just says the next packet must have a higher seq value.
2403	 *
2404	 * XXX What is a good algorithm for this? We could use
2405	 * a rate-determined increase, but to know it, we would have
2406	 * to extend struct tdb.
2407	 * XXX pt->rpl can wrap over MAXINT, but if so the real tdb
2408	 * will soon be replaced anyway. For now, just don't handle
2409	 * this edge case.
2410	 */
2411#define RPL_INCR 16384
2412	ut->rpl = htobe64(tdb->tdb_rpl +
2413	    (ISSET(tdb->tdb_flags, TDBF_PFSYNC_RPL) ? RPL_INCR : 0));
2414	ut->cur_bytes = htobe64(tdb->tdb_cur_bytes);
2415	ut->sproto = tdb->tdb_sproto;
2416	ut->rdomain = htons(tdb->tdb_rdomain);
2417}
2418
2419static struct pfsync_slice *
2420pfsync_slice_enter_tdb(struct pfsync_softc *sc, const struct tdb *t)
2421{
2422	/*
2423	 * just use the first slice for all ipsec (for now) until
2424	 * it's more obvious what property (eg, spi) we can distribute
2425	 * tdbs over slices with.
2426	 */
2427	struct pfsync_slice *s = &sc->sc_slices[0];
2428
2429	if (!mtx_enter_try(&s->s_mtx)) {
2430		mtx_enter(&s->s_mtx);
2431		s->s_stat_contended++;
2432	}
2433	s->s_stat_locks++;
2434
2435	return (s);
2436}
2437
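/*
 * tdb messages are accounted for in s_len the same way state
 * messages are: a subheader is charged when the tdb queue goes from
 * empty to non-empty, and if the new message would push the slice
 * past the mtu the queued work is written out first.
 */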
2438static void
2439pfsync_tdb_ins(struct pfsync_slice *s, struct tdb *tdb)
2440{
2441	size_t nlen = sizeof(struct pfsync_tdb);
2442	struct mbuf *m = NULL;
2443
2444	KASSERT(s->s_len >= PFSYNC_MINPKT);
2445
2446	MUTEX_ASSERT_LOCKED(&s->s_mtx);
2447	MUTEX_ASSERT_UNLOCKED(&tdb->tdb_mtx);
2448
2449	if (TAILQ_EMPTY(&s->s_tdb_q))
2450		nlen += sizeof(struct pfsync_subheader);
2451
2452	if (s->s_len + nlen > s->s_pfsync->sc_if.if_mtu) {
2453		m = pfsync_slice_write(s);
2454		if (m != NULL) {
2455			s->s_stat_enqueue++;
2456			if (mq_enqueue(&s->s_sendq, m) == 0)
2457				task_add(s->s_softnet, &s->s_send);
2458		}
2459
2460		nlen = sizeof(struct pfsync_subheader) +
2461		    sizeof(struct pfsync_tdb);
2462	}
2463
2464	s->s_len += nlen;
2465	TAILQ_INSERT_TAIL(&s->s_tdb_q, tdb, tdb_sync_entry);
2466	tdb->tdb_updates = 0;
2467
2468	if (!timeout_pending(&s->s_tmo))
2469		timeout_add_sec(&s->s_tmo, 1);
2470}
2471
2472static void
2473pfsync_tdb_del(struct pfsync_slice *s, struct tdb *tdb)
2474{
2475	MUTEX_ASSERT_LOCKED(&s->s_mtx);
2476	MUTEX_ASSERT_UNLOCKED(&tdb->tdb_mtx);
2477
2478	TAILQ_REMOVE(&s->s_tdb_q, tdb, tdb_sync_entry);
2479
2480	s->s_len -= sizeof(struct pfsync_tdb);
2481	if (TAILQ_EMPTY(&s->s_tdb_q))
2482		s->s_len -= sizeof(struct pfsync_subheader);
2483}
2484
2485/*
2486 * the reference that pfsync has to a tdb is accounted for by the
2487 * TDBF_PFSYNC flag, not by tdb_ref/tdb_unref. tdb_delete_tdb() is
2488 * called after all other references to a tdb are dropped (with
2489 * tdb_unref) as part of the tdb_free() call.
2490 *
2491 * tdb_free() needs to wait for pfsync to let go of the tdb though,
2492 * which would be best handled by a reference count, but tdb_free
2493 * needs the NET_LOCK which pfsync is already fighting with. instead
2494 * use the TDBF_PFSYNC_SNAPPED flag to coordinate the pfsync write/drop
2495 * with tdb_free.
2496 */
2497
2498void
2499pfsync_update_tdb(struct tdb *tdb, int output)
2500{
2501	struct pfsync_softc *sc;
2502
2503	MUTEX_ASSERT_UNLOCKED(&tdb->tdb_mtx);
2504
2505	smr_read_enter();
2506	sc = SMR_PTR_GET(&pfsyncif);
2507	if (sc != NULL) {
2508		struct pfsync_slice *s = pfsync_slice_enter_tdb(sc, tdb);
2509
2510		/* TDBF_PFSYNC is only changed while the slice mtx is held */
2511		if (!ISSET(tdb->tdb_flags, TDBF_PFSYNC)) {
2512			mtx_enter(&tdb->tdb_mtx);
2513			SET(tdb->tdb_flags, TDBF_PFSYNC);
2514			mtx_leave(&tdb->tdb_mtx);
2515
2516			pfsync_tdb_ins(s, tdb);
2517		} else if (++tdb->tdb_updates >= sc->sc_maxupdates)
2518			pfsync_slice_sched(s);
2519
2520		/* XXX no sync timestamp on tdbs to check */
2521
2522		pfsync_slice_leave(sc, s);
2523	}
2524	smr_read_leave();
2525}
2526
2527void
2528pfsync_delete_tdb(struct tdb *tdb)
2529{
2530	struct pfsync_softc *sc;
2531
2532	MUTEX_ASSERT_UNLOCKED(&tdb->tdb_mtx);
2533
2534	smr_read_enter();
2535	sc = SMR_PTR_GET(&pfsyncif);
2536	if (sc != NULL) {
2537		struct pfsync_slice *s = pfsync_slice_enter_tdb(sc, tdb);
2538
2539		/* TDBF_PFSYNC is only changed while the slice mtx is held */
2540		if (ISSET(tdb->tdb_flags, TDBF_PFSYNC)) {
2541			pfsync_tdb_del(s, tdb);
2542
2543			mtx_enter(&tdb->tdb_mtx);
2544			CLR(tdb->tdb_flags, TDBF_PFSYNC);
2545			mtx_leave(&tdb->tdb_mtx);
2546		}
2547
2548		pfsync_slice_leave(sc, s);
2549	}
2550	smr_read_leave();
2551
2552	/*
2553	 * handle pfsync_slice_drop being called from pfsync_down,
2554	 * in which case the smr/slice access above won't work.
2555	 */
2556
2557	mtx_enter(&tdb->tdb_mtx);
2558	SET(tdb->tdb_flags, TDBF_PFSYNC_SNAPPED); /* like a thanos snap */
2559	while (ISSET(tdb->tdb_flags, TDBF_PFSYNC)) {
2560		msleep_nsec(&tdb->tdb_updates, &tdb->tdb_mtx, PWAIT,
2561		    "tdbfree", INFSLP);
2562	}
2563	mtx_leave(&tdb->tdb_mtx);
2564}
2565#endif /* defined(IPSEC) */
2566
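/*
 * received pfsync messages are dispatched on the subheader action
 * via the pfsync_acts table below. len is the minimum length of a
 * single message for that action; pfsync_input skips anything it
 * does not recognise or that advertises a shorter length.
 */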
2567struct pfsync_act {
2568	void (*in)(struct pfsync_softc *, const caddr_t,
2569	    unsigned int, unsigned int);
2570	size_t len;
2571};
2572
2573static void	pfsync_in_clr(struct pfsync_softc *,
2574		    const caddr_t, unsigned int, unsigned int);
2575static void	pfsync_in_iack(struct pfsync_softc *,
2576		    const caddr_t, unsigned int, unsigned int);
2577static void	pfsync_in_upd_c(struct pfsync_softc *,
2578		    const caddr_t, unsigned int, unsigned int);
2579static void	pfsync_in_ureq(struct pfsync_softc *,
2580		    const caddr_t, unsigned int, unsigned int);
2581static void	pfsync_in_del(struct pfsync_softc *,
2582		    const caddr_t, unsigned int, unsigned int);
2583static void	pfsync_in_del_c(struct pfsync_softc *,
2584		    const caddr_t, unsigned int, unsigned int);
2585static void	pfsync_in_bus(struct pfsync_softc *,
2586		    const caddr_t, unsigned int, unsigned int);
2587static void	pfsync_in_tdb(struct pfsync_softc *,
2588		    const caddr_t, unsigned int, unsigned int);
2589static void	pfsync_in_ins(struct pfsync_softc *,
2590		    const caddr_t, unsigned int, unsigned int);
2591static void	pfsync_in_upd(struct pfsync_softc *,
2592		    const caddr_t, unsigned int, unsigned int);
2593
2594static const struct pfsync_act pfsync_acts[] = {
2595	[PFSYNC_ACT_CLR] =
2596	    { pfsync_in_clr,	sizeof(struct pfsync_clr) },
2597	[PFSYNC_ACT_INS_ACK] =
2598	    { pfsync_in_iack,	sizeof(struct pfsync_ins_ack) },
2599	[PFSYNC_ACT_UPD_C] =
2600	    { pfsync_in_upd_c,	sizeof(struct pfsync_upd_c) },
2601	[PFSYNC_ACT_UPD_REQ] =
2602	    { pfsync_in_ureq,	sizeof(struct pfsync_upd_req) },
2603	[PFSYNC_ACT_DEL] =
2604	    { pfsync_in_del,	sizeof(struct pfsync_state) },
2605	[PFSYNC_ACT_DEL_C] =
2606	    { pfsync_in_del_c,	sizeof(struct pfsync_del_c) },
2607	[PFSYNC_ACT_BUS] =
2608	    { pfsync_in_bus,	sizeof(struct pfsync_bus) },
2609	[PFSYNC_ACT_INS] =
2610	    { pfsync_in_ins,	sizeof(struct pfsync_state) },
2611	[PFSYNC_ACT_UPD] =
2612	    { pfsync_in_upd,	sizeof(struct pfsync_state) },
2613	[PFSYNC_ACT_TDB] =
2614	    { pfsync_in_tdb,	sizeof(struct pfsync_tdb) },
2615};
2616
2617static void
2618pfsync_in_skip(struct pfsync_softc *sc,
2619    const caddr_t buf, unsigned int mlen, unsigned int count)
2620{
2621	/* nop */
2622}
2623
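/*
 * parse a received pfsync packet: check it arrived on the syncdev
 * with the expected TTL, strip the IP and pfsync headers, then walk
 * the subheaders, handing each batch of count messages of mlen
 * bytes to the handler for its action.
 */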
2624static struct mbuf *
2625pfsync_input(struct mbuf *m, uint8_t ttl, unsigned int hlen)
2626{
2627	struct pfsync_softc *sc;
2628	struct pfsync_header *ph;
2629	struct pfsync_subheader *subh;
2630	unsigned int len;
2631	void (*in)(struct pfsync_softc *,
2632	    const caddr_t, unsigned int, unsigned int);
2633
2634	pfsyncstat_inc(pfsyncs_ipackets);
2635
2636	if (!pf_status.running)
2637		return (m);
2638
2639	/*
2640	 * pfsyncif is only set if it is up and running correctly.
2641	 */
2642	smr_read_enter();
2643	sc = SMR_PTR_GET(&pfsyncif);
2644	if (sc == NULL)
2645		goto leave;
2646
2647	if (sc->sc_sync_ifidx != m->m_pkthdr.ph_ifidx) {
2648		pfsyncstat_inc(pfsyncs_badif);
2649		goto leave;
2650	}
2651
2652	/* verify that the IP TTL is 255. */
2653	if (ttl != PFSYNC_DFLTTL) {
2654		pfsyncstat_inc(pfsyncs_badttl);
2655		goto leave;
2656	}
2657
2658	m_adj(m, hlen);
2659
2660	if (m->m_pkthdr.len < sizeof(*ph)) {
2661		pfsyncstat_inc(pfsyncs_hdrops);
2662		goto leave;
2663	}
2664	if (m->m_len < sizeof(*ph)) {
2665		m = m_pullup(m, sizeof(*ph));
2666		if (m == NULL)
2667			goto leave;
2668	}
2669
2670	ph = mtod(m, struct pfsync_header *);
2671	if (ph->version != PFSYNC_VERSION) {
2672		pfsyncstat_inc(pfsyncs_badver);
2673		goto leave;
2674	}
2675
2676	len = ntohs(ph->len);
2677	if (m->m_pkthdr.len < len) {
2678		pfsyncstat_inc(pfsyncs_badlen);
2679		goto leave;
2680	}
2681	if (m->m_pkthdr.len > len)
2682		m->m_pkthdr.len = len;
2683
2684	/* ok, it's serious now */
2685	refcnt_take(&sc->sc_refs);
2686	smr_read_leave();
2687
2688	counters_pkt(sc->sc_if.if_counters, ifc_ipackets, ifc_ibytes, len);
2689
2690	m_adj(m, sizeof(*ph));
2691
2692	while (m->m_pkthdr.len >= sizeof(*subh)) {
2693		unsigned int action, mlen, count;
2694
2695		if (m->m_len < sizeof(*subh)) {
2696			m = m_pullup(m, sizeof(*subh));
2697			if (m == NULL)
2698				goto rele;
2699		}
2700		subh = mtod(m, struct pfsync_subheader *);
2701
2702		action = subh->action;
2703		mlen = subh->len << 2;
2704		count = ntohs(subh->count);
2705
2706		if (action >= PFSYNC_ACT_MAX ||
2707		    action >= nitems(pfsync_acts) ||
2708		    mlen < pfsync_acts[subh->action].len) {
2709			/*
2710			 * subheaders are always followed by at least one
2711			 * message, so if the peer is new enough to tell us
2712			 * how big its messages are then we know enough to
2713			 * skip them.
2714			 */
2715			if (count == 0 || mlen == 0) {
2716				pfsyncstat_inc(pfsyncs_badact);
2717				goto rele;
2718			}
2719
2720			in = pfsync_in_skip;
2721		} else {
2722			in = pfsync_acts[action].in;
2723			if (in == NULL)
2724				in = pfsync_in_skip;
2725		}
2726
2727		m_adj(m, sizeof(*subh));
2728		len = mlen * count;
2729		if (len > m->m_pkthdr.len) {
2730			pfsyncstat_inc(pfsyncs_badlen);
2731			goto rele;
2732		}
2733		if (m->m_len < len) {
2734			m = m_pullup(m, len);
2735			if (m == NULL)
2736				goto rele;
2737		}
2738
2739		(*in)(sc, mtod(m, caddr_t), mlen, count);
2740		m_adj(m, len);
2741	}
2742
2743rele:
2744	refcnt_rele_wake(&sc->sc_refs);
2745	return (m);
2746
2747leave:
2748	smr_read_leave();
2749	return (m);
2750}
2751
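/*
 * a CLR message flushes the receiver's states for the given creator
 * id, optionally limited to one interface. the head and tail of the
 * state list are snapshotted first so states inserted while we walk
 * the list are left alone.
 */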
2752static void
2753pfsync_in_clr(struct pfsync_softc *sc,
2754    const caddr_t buf, unsigned int mlen, unsigned int count)
2755{
2756	const struct pfsync_clr *clr;
2757	struct pf_state *head, *tail, *st, *next;
2758	struct pfi_kif *kif;
2759	uint32_t creatorid;
2760	unsigned int i;
2761
2762	rw_enter_read(&pf_state_list.pfs_rwl);
2763
2764	/* get a view of the state list */
2765	mtx_enter(&pf_state_list.pfs_mtx);
2766	head = TAILQ_FIRST(&pf_state_list.pfs_list);
2767	tail = TAILQ_LAST(&pf_state_list.pfs_list, pf_state_queue);
2768	mtx_leave(&pf_state_list.pfs_mtx);
2769
2770	PF_LOCK();
2771	for (i = 0; i < count; i++) {
2772		clr = (struct pfsync_clr *)(buf + i * mlen);
2773
2774		creatorid = clr->creatorid;
2775		if (clr->ifname[0] == '\0')
2776			kif = NULL;
2777		else {
2778			kif = pfi_kif_find(clr->ifname);
2779			if (kif == NULL)
2780				continue;
2781		}
2782
2783		st = NULL;
2784		next = head;
2785
2786		PF_STATE_ENTER_WRITE();
2787		while (st != tail) {
2788			st = next;
2789			next = TAILQ_NEXT(st, entry_list);
2790
2791			if (creatorid != st->creatorid)
2792				continue;
2793			if (kif != NULL && kif != st->kif)
2794				continue;
2795
2796			mtx_enter(&st->mtx);
2797			SET(st->state_flags, PFSTATE_NOSYNC);
2798			mtx_leave(&st->mtx);
2799			pf_remove_state(st);
2800		}
2801		PF_STATE_EXIT_WRITE();
2802	}
2803	PF_UNLOCK();
2804
2805	rw_exit_read(&pf_state_list.pfs_rwl);
2806}
2807
2808static void
2809pfsync_in_ins(struct pfsync_softc *sc,
2810    const caddr_t buf, unsigned int mlen, unsigned int count)
2811{
2812	const struct pfsync_state *sp;
2813	sa_family_t af1, af2;
2814	unsigned int i;
2815
2816	PF_LOCK();
2817	for (i = 0; i < count; i++) {
2818		sp = (struct pfsync_state *)(buf + mlen * i);
2819		af1 = sp->key[0].af;
2820		af2 = sp->key[1].af;
2821
2822		/* check for invalid values */
2823		if (sp->timeout >= PFTM_MAX ||
2824		    sp->src.state > PF_TCPS_PROXY_DST ||
2825		    sp->dst.state > PF_TCPS_PROXY_DST ||
2826		    sp->direction > PF_OUT ||
2827		    (((af1 || af2) &&
2828		     ((af1 != AF_INET && af1 != AF_INET6) ||
2829		      (af2 != AF_INET && af2 != AF_INET6))) ||
2830		     (sp->af != AF_INET && sp->af != AF_INET6))) {
2831			pfsyncstat_inc(pfsyncs_badval);
2832			continue;
2833		}
2834
2835		if (pf_state_import(sp, PFSYNC_SI_PFSYNC) == ENOMEM) {
2836			/* drop out, but process the rest of the actions */
2837			break;
2838		}
2839	}
2840	PF_UNLOCK();
2841}
2842
2843static void
2844pfsync_in_iack(struct pfsync_softc *sc,
2845    const caddr_t buf, unsigned int mlen, unsigned int count)
2846{
2847	const struct pfsync_ins_ack *ia;
2848	struct pf_state_cmp id_key;
2849	struct pf_state *st;
2850	unsigned int i;
2851
2852	for (i = 0; i < count; i++) {
2853		ia = (struct pfsync_ins_ack *)(buf + mlen * i);
2854
2855		id_key.id = ia->id;
2856		id_key.creatorid = ia->creatorid;
2857
2858		PF_STATE_ENTER_READ();
2859		st = pf_find_state_byid(&id_key);
2860		pf_state_ref(st);
2861		PF_STATE_EXIT_READ();
2862		if (st == NULL)
2863			continue;
2864
2865		if (READ_ONCE(st->sync_defer) != NULL)
2866			pfsync_deferred(sc, st);
2867
2868		pf_state_unref(st);
2869	}
2870}
2871
2872static int
2873pfsync_upd_tcp(struct pf_state *st, const struct pfsync_state_peer *src,
2874    const struct pfsync_state_peer *dst)
2875{
2876	int sync = 0;
2877
2878	/*
2879	 * The state should never go backwards except
2880	 * for syn-proxy states.  Neither should the
2881	 * sequence window slide backwards.
2882	 */
2883	if ((st->src.state > src->state &&
2884	    (st->src.state < PF_TCPS_PROXY_SRC ||
2885	     src->state >= PF_TCPS_PROXY_SRC)) ||
2886
2887	    (st->src.state == src->state &&
2888	     SEQ_GT(st->src.seqlo, ntohl(src->seqlo))))
2889		sync++;
2890	else
2891		pf_state_peer_ntoh(src, &st->src);
2892
2893	if ((st->dst.state > dst->state) ||
2894
2895	    (st->dst.state == dst->state &&
2896	     SEQ_GT(st->dst.seqlo, ntohl(dst->seqlo))))
2897		sync++;
2898	else
2899		pf_state_peer_ntoh(dst, &st->dst);
2900
2901	return (sync);
2902}
2903
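/*
 * merge a peer's view of a state into ours. sync counts the
 * directions where our local copy is newer than the peer's; if
 * anything was stale we send our own update back so the peer can
 * catch up, and the expiry and timeout are only refreshed when at
 * most one direction was stale.
 */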
2904static void
2905pfsync_in_updates(struct pfsync_softc *sc, struct pf_state *st,
2906    const struct pfsync_state_peer *src, const struct pfsync_state_peer *dst,
2907    uint8_t timeout)
2908{
2909	struct pf_state_scrub *sscrub = NULL;
2910	struct pf_state_scrub *dscrub = NULL;
2911	int sync;
2912
2913	if (src->scrub.scrub_flag && st->src.scrub == NULL) {
2914		sscrub = pf_state_scrub_get();
2915		if (sscrub == NULL) {
2916			/* inc error? */
2917			goto out;
2918		}
2919	}
2920	if (dst->scrub.scrub_flag && st->dst.scrub == NULL) {
2921		dscrub = pf_state_scrub_get();
2922		if (dscrub == NULL) {
2923			/* inc error? */
2924			goto out;
2925		}
2926	}
2927
2928	if (READ_ONCE(st->sync_defer) != NULL)
2929		pfsync_deferred(sc, st);
2930
2931	mtx_enter(&st->mtx);
2932
2933	/* attach the scrub memory if needed */
2934	if (sscrub != NULL && st->src.scrub == NULL) {
2935		st->src.scrub = sscrub;
2936		sscrub = NULL;
2937	}
2938	if (dscrub != NULL && st->dst.scrub == NULL) {
2939		st->dst.scrub = dscrub;
2940		dscrub = NULL;
2941	}
2942
2943	if (st->key[PF_SK_WIRE]->proto == IPPROTO_TCP)
2944		sync = pfsync_upd_tcp(st, src, dst);
2945	else {
2946		sync = 0;
2947
2948		/*
2949		 * Non-TCP protocol state machines always go
2950		 * forwards.
2951		 */
2952		if (st->src.state > src->state)
2953			sync++;
2954		else
2955			pf_state_peer_ntoh(src, &st->src);
2956
2957		if (st->dst.state > dst->state)
2958			sync++;
2959		else
2960			pf_state_peer_ntoh(dst, &st->dst);
2961	}
2962
2963	st->pfsync_time = getuptime();
2964	if (sync < 2) {
2965		st->expire = st->pfsync_time;
2966		st->timeout = timeout;
2967	}
2968
2969	mtx_leave(&st->mtx);
2970
2971	if (sync) {
2972		pfsyncstat_inc(pfsyncs_stale);
2973		pfsync_update_state(st);
2974	}
2975
2976out:
2977	if (sscrub != NULL)
2978		pf_state_scrub_put(sscrub);
2979	if (dscrub != NULL)
2980		pf_state_scrub_put(dscrub);
2981}
2982
2983
2984static void
2985pfsync_in_upd(struct pfsync_softc *sc,
2986    const caddr_t buf, unsigned int mlen, unsigned int count)
2987{
2988	const struct pfsync_state *sp;
2989	struct pf_state_cmp id_key;
2990	struct pf_state *st;
2991	int error;
2992	unsigned int i;
2993
2994	for (i = 0; i < count; i++) {
2995		sp = (struct pfsync_state *)(buf + mlen * i);
2996
2997		/* check for invalid values */
2998		if (sp->timeout >= PFTM_MAX ||
2999		    sp->src.state > PF_TCPS_PROXY_DST ||
3000		    sp->dst.state > PF_TCPS_PROXY_DST) {
3001			pfsyncstat_inc(pfsyncs_badval);
3002			continue;
3003		}
3004
3005		id_key.id = sp->id;
3006		id_key.creatorid = sp->creatorid;
3007
3008		PF_STATE_ENTER_READ();
3009		st = pf_find_state_byid(&id_key);
3010		pf_state_ref(st);
3011		PF_STATE_EXIT_READ();
3012		if (st == NULL) {
3013			/* insert the update */
3014			PF_LOCK();
3015			error = pf_state_import(sp, PFSYNC_SI_PFSYNC);
3016			if (error)
3017				pfsyncstat_inc(pfsyncs_badstate);
3018			PF_UNLOCK();
3019			continue;
3020		}
3021
3022		pfsync_in_updates(sc, st, &sp->src, &sp->dst, sp->timeout);
3023
3024		pf_state_unref(st);
3025	}
3026}
3027
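/*
 * allocate an mbuf big enough for up to count update requests plus
 * the headers that go in front of them. pfsync_in_upd_c builds the
 * requests back to front with m_prepend, so the data starts at the
 * end of the buffer and the subheader is prepended last.
 */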
3028static struct mbuf *
3029pfsync_upd_req_init(struct pfsync_softc *sc, unsigned int count)
3030{
3031	struct mbuf *m;
3032	unsigned int mlen;
3033
3034	m = m_gethdr(M_DONTWAIT, MT_DATA);
3035	if (m == NULL) {
3036		pfsyncstat_inc(pfsyncs_onomem);
3037		return (NULL);
3038	}
3039
3040	mlen = max_linkhdr + sizeof(sc->sc_template) +
3041	    sizeof(struct pfsync_header) +
3042	    sizeof(struct pfsync_subheader) +
3043	    sizeof(struct pfsync_upd_req) * count;
3044
3045	if (mlen > MHLEN) {
3046		MCLGETL(m, M_DONTWAIT, mlen);
3047		if (!ISSET(m->m_flags, M_EXT)) {
3048			m_freem(m);
3049			return (NULL);
3050		}
3051	}
3052
3053	m_align(m, 0);
3054	m->m_len = 0;
3055
3056	return (m);
3057}
3058
3059static void
3060pfsync_in_upd_c(struct pfsync_softc *sc,
3061    const caddr_t buf, unsigned int mlen, unsigned int count)
3062{
3063	const struct pfsync_upd_c *up;
3064	struct pf_state_cmp id_key;
3065	struct pf_state *st;
3066	unsigned int i;
3067	struct mbuf *m = NULL;
3068	unsigned int rcount = 0;
3069
3070	for (i = 0; i < count; i++) {
3071		up = (struct pfsync_upd_c *)(buf + mlen * i);
3072
3073		/* check for invalid values */
3074		if (up->timeout >= PFTM_MAX ||
3075		    up->src.state > PF_TCPS_PROXY_DST ||
3076		    up->dst.state > PF_TCPS_PROXY_DST) {
3077			pfsyncstat_inc(pfsyncs_badval);
3078			continue;
3079		}
3080
3081		id_key.id = up->id;
3082		id_key.creatorid = up->creatorid;
3083
3084		PF_STATE_ENTER_READ();
3085		st = pf_find_state_byid(&id_key);
3086		pf_state_ref(st);
3087		PF_STATE_EXIT_READ();
3088		if (st == NULL) {
3089			/* We don't have this state. Ask for it. */
3090			struct pfsync_upd_req *ur;
3091
3092			if (m == NULL) {
3093				m = pfsync_upd_req_init(sc, count);
3094				if (m == NULL) {
3095					pfsyncstat_inc(pfsyncs_onomem);
3096					continue;
3097				}
3098			}
3099
3100			m = m_prepend(m, sizeof(*ur), M_DONTWAIT);
3101			if (m == NULL) {
3102				pfsyncstat_inc(pfsyncs_onomem);
3103				continue;
3104			}
3105
3106			ur = mtod(m, struct pfsync_upd_req *);
3107			ur->id = up->id;
3108			ur->creatorid = up->creatorid;
3109			rcount++;
3110
3111			continue;
3112		}
3113
3114		pfsync_in_updates(sc, st, &up->src, &up->dst, up->timeout);
3115
3116		pf_state_unref(st);
3117	}
3118
3119	if (m != NULL) {
3120		struct pfsync_subheader *subh;
3121
3122		m = m_prepend(m, sizeof(*subh), M_DONTWAIT);
3123		if (m == NULL) {
3124			pfsyncstat_inc(pfsyncs_onomem);
3125			return;
3126		}
3127
3128		subh = mtod(m, struct pfsync_subheader *);
3129		subh->action = PFSYNC_ACT_UPD_REQ;
3130		subh->len = sizeof(struct pfsync_upd_req) >> 2;
3131		subh->count = htons(rcount);
3132
3133		m = pfsync_encap(sc, m);
3134		if (m == NULL) {
3135			pfsyncstat_inc(pfsyncs_onomem);
3136			return;
3137		}
3138
3139		pfsync_sendout(sc, m);
3140	}
3141}
3142
3143static void
3144pfsync_in_ureq(struct pfsync_softc *sc,
3145    const caddr_t buf, unsigned int mlen, unsigned int count)
3146{
3147	const struct pfsync_upd_req *ur;
3148	struct pf_state_cmp id_key;
3149	struct pf_state *st;
3150	unsigned int i;
3151
3152	for (i = 0; i < count; i++) {
3153		ur = (struct pfsync_upd_req *)(buf + mlen * i);
3154
3155		id_key.id = ur->id;
3156		id_key.creatorid = ur->creatorid;
3157
3158		if (id_key.id == 0 && id_key.creatorid == 0) {
3159			pfsync_bulk_snd_start(sc);
3160			continue;
3161		}
3162
3163		PF_STATE_ENTER_READ();
3164		st = pf_find_state_byid(&id_key);
3165		if (st != NULL && st->timeout < PFTM_MAX &&
3166		    !ISSET(st->state_flags, PFSTATE_NOSYNC))
3167			pf_state_ref(st);
3168		else
3169			st = NULL;
3170		PF_STATE_EXIT_READ();
3171		if (st == NULL) {
3172			pfsyncstat_inc(pfsyncs_badstate);
3173			continue;
3174		}
3175
3176		pfsync_update_state_req(sc, st);
3177
3178		pf_state_unref(st);
3179	}
3180}
3181
3182static void
3183pfsync_in_del(struct pfsync_softc *sc,
3184    const caddr_t buf, unsigned int mlen, unsigned int count)
3185{
3186	const struct pfsync_state *sp;
3187	struct pf_state_cmp id_key;
3188	struct pf_state *st;
3189	unsigned int i;
3190
3191	PF_LOCK();
3192	PF_STATE_ENTER_WRITE();
3193	for (i = 0; i < count; i++) {
3194		sp = (struct pfsync_state *)(buf + mlen * i);
3195
3196		id_key.id = sp->id;
3197		id_key.creatorid = sp->creatorid;
3198
3199		st = pf_find_state_byid(&id_key);
3200		if (st == NULL) {
3201			pfsyncstat_inc(pfsyncs_badstate);
3202			continue;
3203		}
3204
3205		mtx_enter(&st->mtx);
3206		SET(st->state_flags, PFSTATE_NOSYNC);
3207		mtx_leave(&st->mtx);
3208		pf_remove_state(st);
3209	}
3210	PF_STATE_EXIT_WRITE();
3211	PF_UNLOCK();
3212}
3213
3214static void
3215pfsync_in_del_c(struct pfsync_softc *sc,
3216    const caddr_t buf, unsigned int mlen, unsigned int count)
3217{
3218	const struct pfsync_del_c *sp;
3219	struct pf_state_cmp id_key;
3220	struct pf_state *st;
3221	unsigned int i;
3222
3223	PF_LOCK();
3224	PF_STATE_ENTER_WRITE();
3225	for (i = 0; i < count; i++) {
3226		sp = (struct pfsync_del_c *)(buf + mlen * i);
3227
3228		id_key.id = sp->id;
3229		id_key.creatorid = sp->creatorid;
3230
3231		st = pf_find_state_byid(&id_key);
3232		if (st == NULL) {
3233			pfsyncstat_inc(pfsyncs_badstate);
3234			continue;
3235		}
3236
3237		mtx_enter(&st->mtx);
3238		SET(st->state_flags, PFSTATE_NOSYNC);
3239		mtx_leave(&st->mtx);
3240		pf_remove_state(st);
3241	}
3242	PF_STATE_EXIT_WRITE();
3243	PF_UNLOCK();
3244}
3245
3246static void
3247pfsync_in_bus(struct pfsync_softc *sc,
3248    const caddr_t buf, unsigned int len, unsigned int count)
3249{
3250	const struct pfsync_bus *bus = (struct pfsync_bus *)buf;
3251
3252	switch (bus->status) {
3253	case PFSYNC_BUS_START:
3254		pfsync_bulk_req_evt(sc, PFSYNC_BREQ_EVT_BUS_START);
3255		break;
3256
3257	case PFSYNC_BUS_END:
3258		pfsync_bulk_req_evt(sc, PFSYNC_BREQ_EVT_BUS_END);
3259		break;
3260	}
3261}
3262
3263#if defined(IPSEC)
3264/* Update an in-kernel tdb. Silently fail if no tdb is found. */
3265static void
3266pfsync_update_net_tdb(const struct pfsync_tdb *pt)
3267{
3268	struct tdb *tdb;
3269
3270	NET_ASSERT_LOCKED();
3271
3272	/* check for invalid values */
3273	if (ntohl(pt->spi) <= SPI_RESERVED_MAX ||
3274	    (pt->dst.sa.sa_family != AF_INET &&
3275	     pt->dst.sa.sa_family != AF_INET6))
3276		goto bad;
3277
3278	tdb = gettdb(ntohs(pt->rdomain), pt->spi,
3279	    (union sockaddr_union *)&pt->dst, pt->sproto);
3280	if (tdb) {
3281		uint64_t rpl = betoh64(pt->rpl);
3282		uint64_t cur_bytes = betoh64(pt->cur_bytes);
3283
3284		/* Neither replay nor byte counter should ever decrease. */
3285		mtx_enter(&tdb->tdb_mtx);
3286		if (rpl >= tdb->tdb_rpl &&
3287		    cur_bytes >= tdb->tdb_cur_bytes) {
3288			tdb->tdb_rpl = rpl;
3289			tdb->tdb_cur_bytes = cur_bytes;
3290		}
3291		mtx_leave(&tdb->tdb_mtx);
3292
3293		tdb_unref(tdb);
3294	}
3295	return;
3296
3297 bad:
3298	DPFPRINTF(LOG_WARNING, "pfsync_insert: PFSYNC_ACT_TDB_UPD: "
3299	    "invalid value");
3300	pfsyncstat_inc(pfsyncs_badstate);
3301	return;
3302}
3303#endif
3304
3305static void
3306pfsync_in_tdb(struct pfsync_softc *sc,
3307    const caddr_t buf, unsigned int len, unsigned int count)
3308{
3309#if defined(IPSEC)
3310	const struct pfsync_tdb *tp;
3311	unsigned int i;
3312
3313	for (i = 0; i < count; i++) {
3314		tp = (const struct pfsync_tdb *)(buf + len * i);
3315		pfsync_update_net_tdb(tp);
3316	}
3317#endif
3318}
3319
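/*
 * pfsync packets arriving from the ip stack land here. pfsync_input
 * consumes the contents; the mbuf is always freed and never passed
 * further up the stack.
 */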
3320int
3321pfsync_input4(struct mbuf **mp, int *offp, int proto, int af)
3322{
3323	struct mbuf *m = *mp;
3324	struct ip *ip;
3325
3326	ip = mtod(m, struct ip *);
3327
3328	m = pfsync_input(m, ip->ip_ttl, ip->ip_hl << 2);
3329
3330	m_freem(m);
3331	*mp = NULL;
3332
3333	return (IPPROTO_DONE);
3334}
3335
3336int
3337pfsync_sysctl_pfsyncstat(void *oldp, size_t *oldlenp, void *newp)
3338{
3339	struct pfsyncstats pfsyncstat;
3340
3341	CTASSERT(sizeof(pfsyncstat) == (pfsyncs_ncounters * sizeof(uint64_t)));
3342	memset(&pfsyncstat, 0, sizeof pfsyncstat);
3343	counters_read(pfsynccounters, (uint64_t *)&pfsyncstat,
3344	    pfsyncs_ncounters, NULL);
3345	return (sysctl_rdstruct(oldp, oldlenp, newp,
3346	    &pfsyncstat, sizeof(pfsyncstat)));
3347}
3348
3349int
3350pfsync_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp,
3351    void *newp, size_t newlen)
3352{
3353	/* All sysctl names at this level are terminal. */
3354	if (namelen != 1)
3355		return (ENOTDIR);
3356
3357	switch (name[0]) {
3358	case PFSYNCCTL_STATS:
3359		return (pfsync_sysctl_pfsyncstat(oldp, oldlenp, newp));
3360	default:
3361		return (ENOPROTOOPT);
3362	}
3363}
3364