if_pfsync.c revision 269699
1/*-
2 * Copyright (c) 2002 Michael Shalayeff
3 * Copyright (c) 2012 Gleb Smirnoff <glebius@FreeBSD.org>
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 *    notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
16 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
17 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
18 * IN NO EVENT SHALL THE AUTHOR OR HIS RELATIVES BE LIABLE FOR ANY DIRECT,
19 * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
21 * SERVICES; LOSS OF MIND, USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
23 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
24 * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
25 * THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28/*-
29 * Copyright (c) 2009 David Gwynne <dlg@openbsd.org>
30 *
31 * Permission to use, copy, modify, and distribute this software for any
32 * purpose with or without fee is hereby granted, provided that the above
33 * copyright notice and this permission notice appear in all copies.
34 *
35 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
36 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
37 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
38 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
39 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
40 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
41 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
42 */
43
44/*
45 * $OpenBSD: if_pfsync.c,v 1.110 2009/02/24 05:39:19 dlg Exp $
46 *
47 * Revisions picked from OpenBSD after revision 1.110 import:
48 * 1.119 - don't m_copydata() beyond the len of mbuf in pfsync_input()
49 * 1.118, 1.124, 1.148, 1.149, 1.151, 1.171 - fixes to bulk updates
50 * 1.120, 1.175 - use monotonic time_uptime
51 * 1.122 - reduce number of updates for non-TCP sessions
52 * 1.125, 1.127 - rewrite merge or stale processing
53 * 1.128 - cleanups
54 * 1.146 - bzero() mbuf before sparsely filling it with data
55 * 1.170 - SIOCSIFMTU checks
56 * 1.126, 1.142 - deferred packets processing
57 * 1.173 - correct expire time processing
58 */
59
60#include <sys/cdefs.h>
61__FBSDID("$FreeBSD: head/sys/netpfil/pf/if_pfsync.c 269699 2014-08-08 01:57:15Z kevlo $");
62
63#include "opt_inet.h"
64#include "opt_inet6.h"
65#include "opt_pf.h"
66
67#include <sys/param.h>
68#include <sys/bus.h>
69#include <sys/endian.h>
70#include <sys/interrupt.h>
71#include <sys/kernel.h>
72#include <sys/lock.h>
73#include <sys/mbuf.h>
74#include <sys/module.h>
75#include <sys/mutex.h>
76#include <sys/priv.h>
77#include <sys/protosw.h>
78#include <sys/socket.h>
79#include <sys/sockio.h>
80#include <sys/sysctl.h>
81
82#include <net/bpf.h>
83#include <net/if.h>
84#include <net/if_var.h>
85#include <net/if_clone.h>
86#include <net/if_types.h>
87#include <net/vnet.h>
88#include <net/pfvar.h>
89#include <net/if_pfsync.h>
90
91#include <netinet/if_ether.h>
92#include <netinet/in.h>
93#include <netinet/in_var.h>
94#include <netinet/ip.h>
95#include <netinet/ip_carp.h>
96#include <netinet/ip_var.h>
97#include <netinet/tcp.h>
98#include <netinet/tcp_fsm.h>
99#include <netinet/tcp_seq.h>
100
101#define PFSYNC_MINPKT ( \
102	sizeof(struct ip) + \
103	sizeof(struct pfsync_header) + \
104	sizeof(struct pfsync_subheader) )
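/*
 * PFSYNC_MINPKT is the smallest datagram pfsync will build: an IP
 * header, the pfsync header and a single subheader.  Assuming a
 * 20-byte struct ip, a 20-byte pfsync header and a 4-byte subheader,
 * sc_len starts out at 44 bytes and grows as data is queued.
 */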
105
106struct pfsync_pkt {
107	struct ip *ip;
108	struct in_addr src;
109	u_int8_t flags;
110};
111
112static int	pfsync_upd_tcp(struct pf_state *, struct pfsync_state_peer *,
113		    struct pfsync_state_peer *);
114static int	pfsync_in_clr(struct pfsync_pkt *, struct mbuf *, int, int);
115static int	pfsync_in_ins(struct pfsync_pkt *, struct mbuf *, int, int);
116static int	pfsync_in_iack(struct pfsync_pkt *, struct mbuf *, int, int);
117static int	pfsync_in_upd(struct pfsync_pkt *, struct mbuf *, int, int);
118static int	pfsync_in_upd_c(struct pfsync_pkt *, struct mbuf *, int, int);
119static int	pfsync_in_ureq(struct pfsync_pkt *, struct mbuf *, int, int);
120static int	pfsync_in_del(struct pfsync_pkt *, struct mbuf *, int, int);
121static int	pfsync_in_del_c(struct pfsync_pkt *, struct mbuf *, int, int);
122static int	pfsync_in_bus(struct pfsync_pkt *, struct mbuf *, int, int);
123static int	pfsync_in_tdb(struct pfsync_pkt *, struct mbuf *, int, int);
124static int	pfsync_in_eof(struct pfsync_pkt *, struct mbuf *, int, int);
125static int	pfsync_in_error(struct pfsync_pkt *, struct mbuf *, int, int);
126
127static int (*pfsync_acts[])(struct pfsync_pkt *, struct mbuf *, int, int) = {
128	pfsync_in_clr,			/* PFSYNC_ACT_CLR */
129	pfsync_in_ins,			/* PFSYNC_ACT_INS */
130	pfsync_in_iack,			/* PFSYNC_ACT_INS_ACK */
131	pfsync_in_upd,			/* PFSYNC_ACT_UPD */
132	pfsync_in_upd_c,		/* PFSYNC_ACT_UPD_C */
133	pfsync_in_ureq,			/* PFSYNC_ACT_UPD_REQ */
134	pfsync_in_del,			/* PFSYNC_ACT_DEL */
135	pfsync_in_del_c,		/* PFSYNC_ACT_DEL_C */
136	pfsync_in_error,		/* PFSYNC_ACT_INS_F */
137	pfsync_in_error,		/* PFSYNC_ACT_DEL_F */
138	pfsync_in_bus,			/* PFSYNC_ACT_BUS */
139	pfsync_in_tdb,			/* PFSYNC_ACT_TDB */
140	pfsync_in_eof			/* PFSYNC_ACT_EOF */
141};
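/*
 * pfsync_input() dispatches through this table, indexed by the
 * subheader action byte; in sketch:
 *
 *	if (subh.action >= PFSYNC_ACT_MAX)
 *		goto done;
 *	rv = (*pfsync_acts[subh.action])(&pkt, m, offset, count);
 *
 * The pfsync_in_error() slots cover action numbers this implementation
 * recognizes but does not support.
 */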
142
143struct pfsync_q {
144	void		(*write)(struct pf_state *, void *);
145	size_t		len;
146	u_int8_t	action;
147};
148
149/* we have one of these for every PFSYNC_S_ */
150static void	pfsync_out_state(struct pf_state *, void *);
151static void	pfsync_out_iack(struct pf_state *, void *);
152static void	pfsync_out_upd_c(struct pf_state *, void *);
153static void	pfsync_out_del(struct pf_state *, void *);
154
155static struct pfsync_q pfsync_qs[] = {
156	{ pfsync_out_state, sizeof(struct pfsync_state),   PFSYNC_ACT_INS },
157	{ pfsync_out_iack,  sizeof(struct pfsync_ins_ack), PFSYNC_ACT_INS_ACK },
158	{ pfsync_out_state, sizeof(struct pfsync_state),   PFSYNC_ACT_UPD },
159	{ pfsync_out_upd_c, sizeof(struct pfsync_upd_c),   PFSYNC_ACT_UPD_C },
160	{ pfsync_out_del,   sizeof(struct pfsync_del_c),   PFSYNC_ACT_DEL_C }
161};
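/*
 * The rows above line up, in order, with the PFSYNC_S_* queue numbers
 * kept in st->sync_state: for example, a state queued as PFSYNC_S_UPD_C
 * is serialized by pfsync_out_upd_c() into a struct pfsync_upd_c
 * carried under a PFSYNC_ACT_UPD_C subheader.
 */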
162
163static void	pfsync_q_ins(struct pf_state *, int);
164static void	pfsync_q_del(struct pf_state *);
165
166static void	pfsync_update_state(struct pf_state *);
167
168struct pfsync_upd_req_item {
169	TAILQ_ENTRY(pfsync_upd_req_item)	ur_entry;
170	struct pfsync_upd_req			ur_msg;
171};
172
173struct pfsync_deferral {
174	struct pfsync_softc		*pd_sc;
175	TAILQ_ENTRY(pfsync_deferral)	pd_entry;
176	u_int				pd_refs;
177	struct callout			pd_tmo;
178
179	struct pf_state			*pd_st;
180	struct mbuf			*pd_m;
181};
182
183struct pfsync_softc {
184	/* Configuration */
185	struct ifnet		*sc_ifp;
186	struct ifnet		*sc_sync_if;
187	struct ip_moptions	sc_imo;
188	struct in_addr		sc_sync_peer;
189	uint32_t		sc_flags;
190#define	PFSYNCF_OK		0x00000001
191#define	PFSYNCF_DEFER		0x00000002
192#define	PFSYNCF_PUSH		0x00000004
193	uint8_t			sc_maxupdates;
194	struct ip		sc_template;
195	struct callout		sc_tmo;
196	struct mtx		sc_mtx;
197
198	/* Queued data */
199	size_t			sc_len;
200	TAILQ_HEAD(, pf_state)			sc_qs[PFSYNC_S_COUNT];
201	TAILQ_HEAD(, pfsync_upd_req_item)	sc_upd_req_list;
202	TAILQ_HEAD(, pfsync_deferral)		sc_deferrals;
203	u_int			sc_deferred;
204	void			*sc_plus;
205	size_t			sc_pluslen;
206
207	/* Bulk update info */
208	struct mtx		sc_bulk_mtx;
209	uint32_t		sc_ureq_sent;
210	int			sc_bulk_tries;
211	uint32_t		sc_ureq_received;
212	int			sc_bulk_hashid;
213	uint64_t		sc_bulk_stateid;
214	uint32_t		sc_bulk_creatorid;
215	struct callout		sc_bulk_tmo;
216	struct callout		sc_bulkfail_tmo;
217};
218
219#define	PFSYNC_LOCK(sc)		mtx_lock(&(sc)->sc_mtx)
220#define	PFSYNC_UNLOCK(sc)	mtx_unlock(&(sc)->sc_mtx)
221#define	PFSYNC_LOCK_ASSERT(sc)	mtx_assert(&(sc)->sc_mtx, MA_OWNED)
222
223#define	PFSYNC_BLOCK(sc)	mtx_lock(&(sc)->sc_bulk_mtx)
224#define	PFSYNC_BUNLOCK(sc)	mtx_unlock(&(sc)->sc_bulk_mtx)
225#define	PFSYNC_BLOCK_ASSERT(sc)	mtx_assert(&(sc)->sc_bulk_mtx, MA_OWNED)
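/*
 * sc_mtx protects the queued data in the softc, while sc_bulk_mtx
 * covers the bulk update state machine.  A typical enqueue under the
 * softc lock looks like:
 *
 *	PFSYNC_LOCK(sc);
 *	pfsync_q_ins(st, PFSYNC_S_UPD_C);
 *	pfsync_push(sc);
 *	PFSYNC_UNLOCK(sc);
 */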
226
227static const char pfsyncname[] = "pfsync";
228static MALLOC_DEFINE(M_PFSYNC, pfsyncname, "pfsync(4) data");
229static VNET_DEFINE(struct pfsync_softc	*, pfsyncif) = NULL;
230#define	V_pfsyncif		VNET(pfsyncif)
231static VNET_DEFINE(void *, pfsync_swi_cookie) = NULL;
232#define	V_pfsync_swi_cookie	VNET(pfsync_swi_cookie)
233static VNET_DEFINE(struct pfsyncstats, pfsyncstats);
234#define	V_pfsyncstats		VNET(pfsyncstats)
235static VNET_DEFINE(int, pfsync_carp_adj) = CARP_MAXSKEW;
236#define	V_pfsync_carp_adj	VNET(pfsync_carp_adj)
237
238static void	pfsync_timeout(void *);
239static void	pfsync_push(struct pfsync_softc *);
240static void	pfsyncintr(void *);
241static int	pfsync_multicast_setup(struct pfsync_softc *, struct ifnet *,
242		    void *);
243static void	pfsync_multicast_cleanup(struct pfsync_softc *);
244static void	pfsync_pointers_init(void);
245static void	pfsync_pointers_uninit(void);
246static int	pfsync_init(void);
247static void	pfsync_uninit(void);
248
249SYSCTL_NODE(_net, OID_AUTO, pfsync, CTLFLAG_RW, 0, "PFSYNC");
250SYSCTL_VNET_STRUCT(_net_pfsync, OID_AUTO, stats, CTLFLAG_RW,
251    &VNET_NAME(pfsyncstats), pfsyncstats,
252    "PFSYNC statistics (struct pfsyncstats, net/if_pfsync.h)");
253SYSCTL_INT(_net_pfsync, OID_AUTO, carp_demotion_factor, CTLFLAG_RW,
254    &VNET_NAME(pfsync_carp_adj), 0, "pfsync's CARP demotion factor adjustment");
255
256static int	pfsync_clone_create(struct if_clone *, int, caddr_t);
257static void	pfsync_clone_destroy(struct ifnet *);
258static int	pfsync_alloc_scrub_memory(struct pfsync_state_peer *,
259		    struct pf_state_peer *);
260static int	pfsyncoutput(struct ifnet *, struct mbuf *,
261		    const struct sockaddr *, struct route *);
262static int	pfsyncioctl(struct ifnet *, u_long, caddr_t);
263
264static int	pfsync_defer(struct pf_state *, struct mbuf *);
265static void	pfsync_undefer(struct pfsync_deferral *, int);
266static void	pfsync_undefer_state(struct pf_state *, int);
267static void	pfsync_defer_tmo(void *);
268
269static void	pfsync_request_update(u_int32_t, u_int64_t);
270static void	pfsync_update_state_req(struct pf_state *);
271
272static void	pfsync_drop(struct pfsync_softc *);
273static void	pfsync_sendout(int);
274static void	pfsync_send_plus(void *, size_t);
275
276static void	pfsync_bulk_start(void);
277static void	pfsync_bulk_status(u_int8_t);
278static void	pfsync_bulk_update(void *);
279static void	pfsync_bulk_fail(void *);
280
281#ifdef IPSEC
282static void	pfsync_update_net_tdb(struct pfsync_tdb *);
283#endif
284
285#define PFSYNC_MAX_BULKTRIES	12
286
287VNET_DEFINE(struct if_clone *, pfsync_cloner);
288#define	V_pfsync_cloner	VNET(pfsync_cloner)
289
290static int
291pfsync_clone_create(struct if_clone *ifc, int unit, caddr_t param)
292{
293	struct pfsync_softc *sc;
294	struct ifnet *ifp;
295	int q;
296
297	if (unit != 0)
298		return (EINVAL);
299
300	sc = malloc(sizeof(struct pfsync_softc), M_PFSYNC, M_WAITOK | M_ZERO);
301	sc->sc_flags |= PFSYNCF_OK;
302
303	for (q = 0; q < PFSYNC_S_COUNT; q++)
304		TAILQ_INIT(&sc->sc_qs[q]);
305
306	TAILQ_INIT(&sc->sc_upd_req_list);
307	TAILQ_INIT(&sc->sc_deferrals);
308
309	sc->sc_len = PFSYNC_MINPKT;
310	sc->sc_maxupdates = 128;
311
312	ifp = sc->sc_ifp = if_alloc(IFT_PFSYNC);
313	if (ifp == NULL) {
314		free(sc, M_PFSYNC);
315		return (ENOSPC);
316	}
317	if_initname(ifp, pfsyncname, unit);
318	ifp->if_softc = sc;
319	ifp->if_ioctl = pfsyncioctl;
320	ifp->if_output = pfsyncoutput;
321	ifp->if_type = IFT_PFSYNC;
322	ifp->if_snd.ifq_maxlen = ifqmaxlen;
323	ifp->if_hdrlen = sizeof(struct pfsync_header);
324	ifp->if_mtu = ETHERMTU;
325	mtx_init(&sc->sc_mtx, pfsyncname, NULL, MTX_DEF);
326	mtx_init(&sc->sc_bulk_mtx, "pfsync bulk", NULL, MTX_DEF);
327	callout_init(&sc->sc_tmo, CALLOUT_MPSAFE);
328	callout_init_mtx(&sc->sc_bulk_tmo, &sc->sc_bulk_mtx, 0);
329	callout_init_mtx(&sc->sc_bulkfail_tmo, &sc->sc_bulk_mtx, 0);
330
331	if_attach(ifp);
332
333	bpfattach(ifp, DLT_PFSYNC, PFSYNC_HDRLEN);
334
335	V_pfsyncif = sc;
336
337	return (0);
338}
339
340static void
341pfsync_clone_destroy(struct ifnet *ifp)
342{
343	struct pfsync_softc *sc = ifp->if_softc;
344
345	/*
346	 * At this stage, everything should have already been
347	 * cleared by pfsync_uninit(), and we have only to
348	 * drain callouts.
349	 */
350	while (sc->sc_deferred > 0) {
351		struct pfsync_deferral *pd = TAILQ_FIRST(&sc->sc_deferrals);
352
353		TAILQ_REMOVE(&sc->sc_deferrals, pd, pd_entry);
354		sc->sc_deferred--;
355		if (callout_stop(&pd->pd_tmo)) {
356			pf_release_state(pd->pd_st);
357			m_freem(pd->pd_m);
358			free(pd, M_PFSYNC);
359		} else {
360			pd->pd_refs++;
361			callout_drain(&pd->pd_tmo);
362			free(pd, M_PFSYNC);
363		}
364	}
365
366	callout_drain(&sc->sc_tmo);
367	callout_drain(&sc->sc_bulkfail_tmo);
368	callout_drain(&sc->sc_bulk_tmo);
369
370	if (!(sc->sc_flags & PFSYNCF_OK) && carp_demote_adj_p)
371		(*carp_demote_adj_p)(-V_pfsync_carp_adj, "pfsync destroy");
372	bpfdetach(ifp);
373	if_detach(ifp);
374
375	pfsync_drop(sc);
376
377	if_free(ifp);
378	if (sc->sc_imo.imo_membership)
379		pfsync_multicast_cleanup(sc);
380	mtx_destroy(&sc->sc_mtx);
381	mtx_destroy(&sc->sc_bulk_mtx);
382	free(sc, M_PFSYNC);
383
384	V_pfsyncif = NULL;
385}
386
387static int
388pfsync_alloc_scrub_memory(struct pfsync_state_peer *s,
389    struct pf_state_peer *d)
390{
391	if (s->scrub.scrub_flag && d->scrub == NULL) {
392		d->scrub = uma_zalloc(V_pf_state_scrub_z, M_NOWAIT | M_ZERO);
393		if (d->scrub == NULL)
394			return (ENOMEM);
395	}
396
397	return (0);
398}
399
400
401static int
402pfsync_state_import(struct pfsync_state *sp, u_int8_t flags)
403{
404	struct pfsync_softc *sc = V_pfsyncif;
405#ifndef	__NO_STRICT_ALIGNMENT
406	struct pfsync_state_key key[2];
407#endif
408	struct pfsync_state_key *kw, *ks;
409	struct pf_state	*st = NULL;
410	struct pf_state_key *skw = NULL, *sks = NULL;
411	struct pf_rule *r = NULL;
412	struct pfi_kif	*kif;
413	int error;
414
415	PF_RULES_RASSERT();
416
417	if (sp->creatorid == 0) {
418		if (V_pf_status.debug >= PF_DEBUG_MISC)
419			printf("%s: invalid creator id: %08x\n", __func__,
420			    ntohl(sp->creatorid));
421		return (EINVAL);
422	}
423
424	if ((kif = pfi_kif_find(sp->ifname)) == NULL) {
425		if (V_pf_status.debug >= PF_DEBUG_MISC)
426			printf("%s: unknown interface: %s\n", __func__,
427			    sp->ifname);
428		if (flags & PFSYNC_SI_IOCTL)
429			return (EINVAL);
430		return (0);	/* skip this state */
431	}
432
433	/*
434	 * If the ruleset checksums match or the state is coming from the ioctl,
435	 * it's safe to associate the state with the rule of that number.
436	 */
437	if (sp->rule != htonl(-1) && sp->anchor == htonl(-1) &&
438	    (flags & (PFSYNC_SI_IOCTL | PFSYNC_SI_CKSUM)) && ntohl(sp->rule) <
439	    pf_main_ruleset.rules[PF_RULESET_FILTER].active.rcount)
440		r = pf_main_ruleset.rules[
441		    PF_RULESET_FILTER].active.ptr_array[ntohl(sp->rule)];
442	else
443		r = &V_pf_default_rule;
444
445	if ((r->max_states &&
446	    counter_u64_fetch(r->states_cur) >= r->max_states))
447		goto cleanup;
448
	/*
	 * XXXGL: consider using M_WAITOK in the ioctl path later.
	 */
452	if ((st = uma_zalloc(V_pf_state_z, M_NOWAIT | M_ZERO)) == NULL)
453		goto cleanup;
454
455	if ((skw = uma_zalloc(V_pf_state_key_z, M_NOWAIT)) == NULL)
456		goto cleanup;
457
458#ifndef	__NO_STRICT_ALIGNMENT
459	bcopy(&sp->key, key, sizeof(struct pfsync_state_key) * 2);
460	kw = &key[PF_SK_WIRE];
461	ks = &key[PF_SK_STACK];
462#else
463	kw = &sp->key[PF_SK_WIRE];
464	ks = &sp->key[PF_SK_STACK];
465#endif
466
467	if (PF_ANEQ(&kw->addr[0], &ks->addr[0], sp->af) ||
468	    PF_ANEQ(&kw->addr[1], &ks->addr[1], sp->af) ||
469	    kw->port[0] != ks->port[0] ||
470	    kw->port[1] != ks->port[1]) {
471		sks = uma_zalloc(V_pf_state_key_z, M_NOWAIT);
472		if (sks == NULL)
473			goto cleanup;
474	} else
475		sks = skw;
476
477	/* allocate memory for scrub info */
478	if (pfsync_alloc_scrub_memory(&sp->src, &st->src) ||
479	    pfsync_alloc_scrub_memory(&sp->dst, &st->dst))
480		goto cleanup;
481
482	/* Copy to state key(s). */
483	skw->addr[0] = kw->addr[0];
484	skw->addr[1] = kw->addr[1];
485	skw->port[0] = kw->port[0];
486	skw->port[1] = kw->port[1];
487	skw->proto = sp->proto;
488	skw->af = sp->af;
489	if (sks != skw) {
490		sks->addr[0] = ks->addr[0];
491		sks->addr[1] = ks->addr[1];
492		sks->port[0] = ks->port[0];
493		sks->port[1] = ks->port[1];
494		sks->proto = sp->proto;
495		sks->af = sp->af;
496	}
497
498	/* copy to state */
499	bcopy(&sp->rt_addr, &st->rt_addr, sizeof(st->rt_addr));
500	st->creation = time_uptime - ntohl(sp->creation);
501	st->expire = time_uptime;
502	if (sp->expire) {
503		uint32_t timeout;
504
505		timeout = r->timeout[sp->timeout];
506		if (!timeout)
507			timeout = V_pf_default_rule.timeout[sp->timeout];
508
509		/* sp->expire may have been adaptively scaled by export. */
510		st->expire -= timeout - ntohl(sp->expire);
511	}
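	/*
	 * Worked example: with a 60 second timeout and the peer
	 * reporting 45 seconds remaining, the state is treated as 15
	 * seconds into its timeout, i.e. st->expire = time_uptime - 15.
	 */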
512
513	st->direction = sp->direction;
514	st->log = sp->log;
515	st->timeout = sp->timeout;
516	st->state_flags = sp->state_flags;
517
518	st->id = sp->id;
519	st->creatorid = sp->creatorid;
520	pf_state_peer_ntoh(&sp->src, &st->src);
521	pf_state_peer_ntoh(&sp->dst, &st->dst);
522
523	st->rule.ptr = r;
524	st->nat_rule.ptr = NULL;
525	st->anchor.ptr = NULL;
526	st->rt_kif = NULL;
527
528	st->pfsync_time = time_uptime;
529	st->sync_state = PFSYNC_S_NONE;
530
531	if (!(flags & PFSYNC_SI_IOCTL))
532		st->state_flags |= PFSTATE_NOSYNC;
533
534	if ((error = pf_state_insert(kif, skw, sks, st)) != 0)
535		goto cleanup_state;
536
537	/* XXX when we have nat_rule/anchors, use STATE_INC_COUNTERS */
538	counter_u64_add(r->states_cur, 1);
539	counter_u64_add(r->states_tot, 1);
540
541	if (!(flags & PFSYNC_SI_IOCTL)) {
542		st->state_flags &= ~PFSTATE_NOSYNC;
543		if (st->state_flags & PFSTATE_ACK) {
544			pfsync_q_ins(st, PFSYNC_S_IACK);
545			pfsync_push(sc);
546		}
547	}
548	st->state_flags &= ~PFSTATE_ACK;
549	PF_STATE_UNLOCK(st);
550
551	return (0);
552
553cleanup:
554	error = ENOMEM;
555	if (skw == sks)
556		sks = NULL;
557	if (skw != NULL)
558		uma_zfree(V_pf_state_key_z, skw);
559	if (sks != NULL)
560		uma_zfree(V_pf_state_key_z, sks);
561
562cleanup_state:	/* pf_state_insert() frees the state keys. */
563	if (st) {
564		if (st->dst.scrub)
565			uma_zfree(V_pf_state_scrub_z, st->dst.scrub);
566		if (st->src.scrub)
567			uma_zfree(V_pf_state_scrub_z, st->src.scrub);
568		uma_zfree(V_pf_state_z, st);
569	}
570	return (error);
571}
572
573static int
574pfsync_input(struct mbuf **mp, int *offp __unused, int proto __unused)
575{
576	struct pfsync_softc *sc = V_pfsyncif;
577	struct pfsync_pkt pkt;
578	struct mbuf *m = *mp;
579	struct ip *ip = mtod(m, struct ip *);
580	struct pfsync_header *ph;
581	struct pfsync_subheader subh;
582
583	int offset, len;
584	int rv;
585	uint16_t count;
586
587	*mp = NULL;
588	V_pfsyncstats.pfsyncs_ipackets++;
589
590	/* Verify that we have a sync interface configured. */
591	if (!sc || !sc->sc_sync_if || !V_pf_status.running ||
592	    (sc->sc_ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
593		goto done;
594
595	/* verify that the packet came in on the right interface */
596	if (sc->sc_sync_if != m->m_pkthdr.rcvif) {
597		V_pfsyncstats.pfsyncs_badif++;
598		goto done;
599	}
600
601	sc->sc_ifp->if_ipackets++;
602	sc->sc_ifp->if_ibytes += m->m_pkthdr.len;
603	/* verify that the IP TTL is 255. */
604	if (ip->ip_ttl != PFSYNC_DFLTTL) {
605		V_pfsyncstats.pfsyncs_badttl++;
606		goto done;
607	}
608
609	offset = ip->ip_hl << 2;
610	if (m->m_pkthdr.len < offset + sizeof(*ph)) {
611		V_pfsyncstats.pfsyncs_hdrops++;
612		goto done;
613	}
614
615	if (offset + sizeof(*ph) > m->m_len) {
616		if (m_pullup(m, offset + sizeof(*ph)) == NULL) {
617			V_pfsyncstats.pfsyncs_hdrops++;
618			return (IPPROTO_DONE);
619		}
620		ip = mtod(m, struct ip *);
621	}
622	ph = (struct pfsync_header *)((char *)ip + offset);
623
624	/* verify the version */
625	if (ph->version != PFSYNC_VERSION) {
626		V_pfsyncstats.pfsyncs_badver++;
627		goto done;
628	}
629
630	len = ntohs(ph->len) + offset;
631	if (m->m_pkthdr.len < len) {
632		V_pfsyncstats.pfsyncs_badlen++;
633		goto done;
634	}
635
636	/* Cheaper to grab this now than having to mess with mbufs later */
637	pkt.ip = ip;
638	pkt.src = ip->ip_src;
639	pkt.flags = 0;
640
	/*
	 * Trusting pf_chksum during packet processing, as well as searching
	 * the interface name tree, requires holding PF_RULES_RLOCK().
	 */
645	PF_RULES_RLOCK();
646	if (!bcmp(&ph->pfcksum, &V_pf_status.pf_chksum, PF_MD5_DIGEST_LENGTH))
647		pkt.flags |= PFSYNC_SI_CKSUM;
648
649	offset += sizeof(*ph);
650	while (offset <= len - sizeof(subh)) {
651		m_copydata(m, offset, sizeof(subh), (caddr_t)&subh);
652		offset += sizeof(subh);
653
654		if (subh.action >= PFSYNC_ACT_MAX) {
655			V_pfsyncstats.pfsyncs_badact++;
656			PF_RULES_RUNLOCK();
657			goto done;
658		}
659
660		count = ntohs(subh.count);
661		V_pfsyncstats.pfsyncs_iacts[subh.action] += count;
662		rv = (*pfsync_acts[subh.action])(&pkt, m, offset, count);
663		if (rv == -1) {
664			PF_RULES_RUNLOCK();
665			return (IPPROTO_DONE);
666		}
667
668		offset += rv;
669	}
670	PF_RULES_RUNLOCK();
671
672done:
673	m_freem(m);
674	return (IPPROTO_DONE);
675}
676
677static int
678pfsync_in_clr(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
679{
680	struct pfsync_clr *clr;
681	struct mbuf *mp;
682	int len = sizeof(*clr) * count;
683	int i, offp;
684	u_int32_t creatorid;
685
686	mp = m_pulldown(m, offset, len, &offp);
687	if (mp == NULL) {
688		V_pfsyncstats.pfsyncs_badlen++;
689		return (-1);
690	}
691	clr = (struct pfsync_clr *)(mp->m_data + offp);
692
693	for (i = 0; i < count; i++) {
694		creatorid = clr[i].creatorid;
695
696		if (clr[i].ifname[0] != '\0' &&
697		    pfi_kif_find(clr[i].ifname) == NULL)
698			continue;
699
700		for (int i = 0; i <= pf_hashmask; i++) {
701			struct pf_idhash *ih = &V_pf_idhash[i];
702			struct pf_state *s;
703relock:
704			PF_HASHROW_LOCK(ih);
705			LIST_FOREACH(s, &ih->states, entry) {
706				if (s->creatorid == creatorid) {
707					s->state_flags |= PFSTATE_NOSYNC;
708					pf_unlink_state(s, PF_ENTER_LOCKED);
709					goto relock;
710				}
711			}
712			PF_HASHROW_UNLOCK(ih);
713		}
714	}
715
716	return (len);
717}
718
719static int
720pfsync_in_ins(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
721{
722	struct mbuf *mp;
723	struct pfsync_state *sa, *sp;
724	int len = sizeof(*sp) * count;
725	int i, offp;
726
727	mp = m_pulldown(m, offset, len, &offp);
728	if (mp == NULL) {
729		V_pfsyncstats.pfsyncs_badlen++;
730		return (-1);
731	}
732	sa = (struct pfsync_state *)(mp->m_data + offp);
733
734	for (i = 0; i < count; i++) {
735		sp = &sa[i];
736
737		/* Check for invalid values. */
738		if (sp->timeout >= PFTM_MAX ||
739		    sp->src.state > PF_TCPS_PROXY_DST ||
740		    sp->dst.state > PF_TCPS_PROXY_DST ||
741		    sp->direction > PF_OUT ||
742		    (sp->af != AF_INET && sp->af != AF_INET6)) {
743			if (V_pf_status.debug >= PF_DEBUG_MISC)
744				printf("%s: invalid value\n", __func__);
745			V_pfsyncstats.pfsyncs_badval++;
746			continue;
747		}
748
749		if (pfsync_state_import(sp, pkt->flags) == ENOMEM)
750			/* Drop out, but process the rest of the actions. */
751			break;
752	}
753
754	return (len);
755}
756
757static int
758pfsync_in_iack(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
759{
760	struct pfsync_ins_ack *ia, *iaa;
761	struct pf_state *st;
762
763	struct mbuf *mp;
764	int len = count * sizeof(*ia);
765	int offp, i;
766
767	mp = m_pulldown(m, offset, len, &offp);
768	if (mp == NULL) {
769		V_pfsyncstats.pfsyncs_badlen++;
770		return (-1);
771	}
772	iaa = (struct pfsync_ins_ack *)(mp->m_data + offp);
773
774	for (i = 0; i < count; i++) {
775		ia = &iaa[i];
776
777		st = pf_find_state_byid(ia->id, ia->creatorid);
778		if (st == NULL)
779			continue;
780
781		if (st->state_flags & PFSTATE_ACK) {
782			PFSYNC_LOCK(V_pfsyncif);
783			pfsync_undefer_state(st, 0);
784			PFSYNC_UNLOCK(V_pfsyncif);
785		}
786		PF_STATE_UNLOCK(st);
787	}
	/*
	 * Nothing else to do here: the acks above merely cancel pending
	 * deferrals, and the fixed message size lets us skip the payload.
	 */
792
793	return (count * sizeof(struct pfsync_ins_ack));
794}
795
796static int
797pfsync_upd_tcp(struct pf_state *st, struct pfsync_state_peer *src,
798    struct pfsync_state_peer *dst)
799{
800	int sync = 0;
801
802	PF_STATE_LOCK_ASSERT(st);
803
804	/*
805	 * The state should never go backwards except
806	 * for syn-proxy states.  Neither should the
807	 * sequence window slide backwards.
808	 */
809	if ((st->src.state > src->state &&
810	    (st->src.state < PF_TCPS_PROXY_SRC ||
811	    src->state >= PF_TCPS_PROXY_SRC)) ||
812
813	    (st->src.state == src->state &&
814	    SEQ_GT(st->src.seqlo, ntohl(src->seqlo))))
815		sync++;
816	else
817		pf_state_peer_ntoh(src, &st->src);
818
819	if ((st->dst.state > dst->state) ||
820
821	    (st->dst.state >= TCPS_SYN_SENT &&
822	    SEQ_GT(st->dst.seqlo, ntohl(dst->seqlo))))
823		sync++;
824	else
825		pf_state_peer_ntoh(dst, &st->dst);
826
827	return (sync);
828}
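/*
 * The return value counts peer directions that would move backwards:
 * 0 means the update merged cleanly, 2 means both directions are
 * stale.  The callers below merge the peer data when sync < 2 and,
 * whenever sync > 0, schedule an update of their own so the peer
 * learns about the newer local state.
 */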
829
830static int
831pfsync_in_upd(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
832{
833	struct pfsync_softc *sc = V_pfsyncif;
834	struct pfsync_state *sa, *sp;
835	struct pf_state *st;
836	int sync;
837
838	struct mbuf *mp;
839	int len = count * sizeof(*sp);
840	int offp, i;
841
842	mp = m_pulldown(m, offset, len, &offp);
843	if (mp == NULL) {
844		V_pfsyncstats.pfsyncs_badlen++;
845		return (-1);
846	}
847	sa = (struct pfsync_state *)(mp->m_data + offp);
848
849	for (i = 0; i < count; i++) {
850		sp = &sa[i];
851
852		/* check for invalid values */
853		if (sp->timeout >= PFTM_MAX ||
854		    sp->src.state > PF_TCPS_PROXY_DST ||
855		    sp->dst.state > PF_TCPS_PROXY_DST) {
856			if (V_pf_status.debug >= PF_DEBUG_MISC) {
857				printf("pfsync_input: PFSYNC_ACT_UPD: "
858				    "invalid value\n");
859			}
860			V_pfsyncstats.pfsyncs_badval++;
861			continue;
862		}
863
864		st = pf_find_state_byid(sp->id, sp->creatorid);
865		if (st == NULL) {
866			/* insert the update */
867			if (pfsync_state_import(sp, 0))
868				V_pfsyncstats.pfsyncs_badstate++;
869			continue;
870		}
871
872		if (st->state_flags & PFSTATE_ACK) {
873			PFSYNC_LOCK(sc);
874			pfsync_undefer_state(st, 1);
875			PFSYNC_UNLOCK(sc);
876		}
877
878		if (st->key[PF_SK_WIRE]->proto == IPPROTO_TCP)
879			sync = pfsync_upd_tcp(st, &sp->src, &sp->dst);
880		else {
881			sync = 0;
882
			/*
			 * Non-TCP protocol state machines always go
			 * forward.
			 */
887			if (st->src.state > sp->src.state)
888				sync++;
889			else
890				pf_state_peer_ntoh(&sp->src, &st->src);
891			if (st->dst.state > sp->dst.state)
892				sync++;
893			else
894				pf_state_peer_ntoh(&sp->dst, &st->dst);
895		}
896		if (sync < 2) {
897			pfsync_alloc_scrub_memory(&sp->dst, &st->dst);
898			pf_state_peer_ntoh(&sp->dst, &st->dst);
899			st->expire = time_uptime;
900			st->timeout = sp->timeout;
901		}
902		st->pfsync_time = time_uptime;
903
904		if (sync) {
905			V_pfsyncstats.pfsyncs_stale++;
906
907			pfsync_update_state(st);
908			PF_STATE_UNLOCK(st);
909			PFSYNC_LOCK(sc);
910			pfsync_push(sc);
911			PFSYNC_UNLOCK(sc);
912			continue;
913		}
914		PF_STATE_UNLOCK(st);
915	}
916
917	return (len);
918}
919
920static int
921pfsync_in_upd_c(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
922{
923	struct pfsync_softc *sc = V_pfsyncif;
924	struct pfsync_upd_c *ua, *up;
925	struct pf_state *st;
926	int len = count * sizeof(*up);
927	int sync;
928	struct mbuf *mp;
929	int offp, i;
930
931	mp = m_pulldown(m, offset, len, &offp);
932	if (mp == NULL) {
933		V_pfsyncstats.pfsyncs_badlen++;
934		return (-1);
935	}
936	ua = (struct pfsync_upd_c *)(mp->m_data + offp);
937
938	for (i = 0; i < count; i++) {
939		up = &ua[i];
940
941		/* check for invalid values */
942		if (up->timeout >= PFTM_MAX ||
943		    up->src.state > PF_TCPS_PROXY_DST ||
944		    up->dst.state > PF_TCPS_PROXY_DST) {
945			if (V_pf_status.debug >= PF_DEBUG_MISC) {
946				printf("pfsync_input: "
947				    "PFSYNC_ACT_UPD_C: "
948				    "invalid value\n");
949			}
950			V_pfsyncstats.pfsyncs_badval++;
951			continue;
952		}
953
954		st = pf_find_state_byid(up->id, up->creatorid);
955		if (st == NULL) {
956			/* We don't have this state. Ask for it. */
957			PFSYNC_LOCK(sc);
958			pfsync_request_update(up->creatorid, up->id);
959			PFSYNC_UNLOCK(sc);
960			continue;
961		}
962
963		if (st->state_flags & PFSTATE_ACK) {
964			PFSYNC_LOCK(sc);
965			pfsync_undefer_state(st, 1);
966			PFSYNC_UNLOCK(sc);
967		}
968
969		if (st->key[PF_SK_WIRE]->proto == IPPROTO_TCP)
970			sync = pfsync_upd_tcp(st, &up->src, &up->dst);
971		else {
972			sync = 0;
973
			/*
			 * Non-TCP protocol state machines always go
			 * forward.
			 */
978			if (st->src.state > up->src.state)
979				sync++;
980			else
981				pf_state_peer_ntoh(&up->src, &st->src);
982			if (st->dst.state > up->dst.state)
983				sync++;
984			else
985				pf_state_peer_ntoh(&up->dst, &st->dst);
986		}
987		if (sync < 2) {
988			pfsync_alloc_scrub_memory(&up->dst, &st->dst);
989			pf_state_peer_ntoh(&up->dst, &st->dst);
990			st->expire = time_uptime;
991			st->timeout = up->timeout;
992		}
993		st->pfsync_time = time_uptime;
994
995		if (sync) {
996			V_pfsyncstats.pfsyncs_stale++;
997
998			pfsync_update_state(st);
999			PF_STATE_UNLOCK(st);
1000			PFSYNC_LOCK(sc);
1001			pfsync_push(sc);
1002			PFSYNC_UNLOCK(sc);
1003			continue;
1004		}
1005		PF_STATE_UNLOCK(st);
1006	}
1007
1008	return (len);
1009}
1010
1011static int
1012pfsync_in_ureq(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
1013{
1014	struct pfsync_upd_req *ur, *ura;
1015	struct mbuf *mp;
1016	int len = count * sizeof(*ur);
1017	int i, offp;
1018
1019	struct pf_state *st;
1020
1021	mp = m_pulldown(m, offset, len, &offp);
1022	if (mp == NULL) {
1023		V_pfsyncstats.pfsyncs_badlen++;
1024		return (-1);
1025	}
1026	ura = (struct pfsync_upd_req *)(mp->m_data + offp);
1027
1028	for (i = 0; i < count; i++) {
1029		ur = &ura[i];
1030
1031		if (ur->id == 0 && ur->creatorid == 0)
1032			pfsync_bulk_start();
1033		else {
1034			st = pf_find_state_byid(ur->id, ur->creatorid);
1035			if (st == NULL) {
1036				V_pfsyncstats.pfsyncs_badstate++;
1037				continue;
1038			}
1039			if (st->state_flags & PFSTATE_NOSYNC) {
1040				PF_STATE_UNLOCK(st);
1041				continue;
1042			}
1043
1044			pfsync_update_state_req(st);
1045			PF_STATE_UNLOCK(st);
1046		}
1047	}
1048
1049	return (len);
1050}
1051
1052static int
1053pfsync_in_del(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
1054{
1055	struct mbuf *mp;
1056	struct pfsync_state *sa, *sp;
1057	struct pf_state *st;
1058	int len = count * sizeof(*sp);
1059	int offp, i;
1060
1061	mp = m_pulldown(m, offset, len, &offp);
1062	if (mp == NULL) {
1063		V_pfsyncstats.pfsyncs_badlen++;
1064		return (-1);
1065	}
1066	sa = (struct pfsync_state *)(mp->m_data + offp);
1067
1068	for (i = 0; i < count; i++) {
1069		sp = &sa[i];
1070
1071		st = pf_find_state_byid(sp->id, sp->creatorid);
1072		if (st == NULL) {
1073			V_pfsyncstats.pfsyncs_badstate++;
1074			continue;
1075		}
1076		st->state_flags |= PFSTATE_NOSYNC;
1077		pf_unlink_state(st, PF_ENTER_LOCKED);
1078	}
1079
1080	return (len);
1081}
1082
1083static int
1084pfsync_in_del_c(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
1085{
1086	struct mbuf *mp;
1087	struct pfsync_del_c *sa, *sp;
1088	struct pf_state *st;
1089	int len = count * sizeof(*sp);
1090	int offp, i;
1091
1092	mp = m_pulldown(m, offset, len, &offp);
1093	if (mp == NULL) {
1094		V_pfsyncstats.pfsyncs_badlen++;
1095		return (-1);
1096	}
1097	sa = (struct pfsync_del_c *)(mp->m_data + offp);
1098
1099	for (i = 0; i < count; i++) {
1100		sp = &sa[i];
1101
1102		st = pf_find_state_byid(sp->id, sp->creatorid);
1103		if (st == NULL) {
1104			V_pfsyncstats.pfsyncs_badstate++;
1105			continue;
1106		}
1107
1108		st->state_flags |= PFSTATE_NOSYNC;
1109		pf_unlink_state(st, PF_ENTER_LOCKED);
1110	}
1111
1112	return (len);
1113}
1114
1115static int
1116pfsync_in_bus(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
1117{
1118	struct pfsync_softc *sc = V_pfsyncif;
1119	struct pfsync_bus *bus;
1120	struct mbuf *mp;
1121	int len = count * sizeof(*bus);
1122	int offp;
1123
1124	PFSYNC_BLOCK(sc);
1125
1126	/* If we're not waiting for a bulk update, who cares. */
1127	if (sc->sc_ureq_sent == 0) {
1128		PFSYNC_BUNLOCK(sc);
1129		return (len);
1130	}
1131
1132	mp = m_pulldown(m, offset, len, &offp);
1133	if (mp == NULL) {
1134		PFSYNC_BUNLOCK(sc);
1135		V_pfsyncstats.pfsyncs_badlen++;
1136		return (-1);
1137	}
1138	bus = (struct pfsync_bus *)(mp->m_data + offp);
1139
1140	switch (bus->status) {
1141	case PFSYNC_BUS_START:
1142		callout_reset(&sc->sc_bulkfail_tmo, 4 * hz +
1143		    V_pf_limits[PF_LIMIT_STATES].limit /
1144		    ((sc->sc_ifp->if_mtu - PFSYNC_MINPKT) /
1145		    sizeof(struct pfsync_state)),
1146		    pfsync_bulk_fail, sc);
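		/*
		 * That is four seconds of slack plus roughly one tick
		 * per packet expected in the bulk transfer: with, say,
		 * a state limit of 10000 and four states fitting into
		 * each MTU-sized packet, the callout is armed for
		 * 4 * hz + 2500 ticks.
		 */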
1147		if (V_pf_status.debug >= PF_DEBUG_MISC)
1148			printf("pfsync: received bulk update start\n");
1149		break;
1150
1151	case PFSYNC_BUS_END:
1152		if (time_uptime - ntohl(bus->endtime) >=
1153		    sc->sc_ureq_sent) {
1154			/* that's it, we're happy */
1155			sc->sc_ureq_sent = 0;
1156			sc->sc_bulk_tries = 0;
1157			callout_stop(&sc->sc_bulkfail_tmo);
1158			if (!(sc->sc_flags & PFSYNCF_OK) && carp_demote_adj_p)
1159				(*carp_demote_adj_p)(-V_pfsync_carp_adj,
1160				    "pfsync bulk done");
1161			sc->sc_flags |= PFSYNCF_OK;
1162			if (V_pf_status.debug >= PF_DEBUG_MISC)
1163				printf("pfsync: received valid "
1164				    "bulk update end\n");
1165		} else {
1166			if (V_pf_status.debug >= PF_DEBUG_MISC)
1167				printf("pfsync: received invalid "
1168				    "bulk update end: bad timestamp\n");
1169		}
1170		break;
1171	}
1172	PFSYNC_BUNLOCK(sc);
1173
1174	return (len);
1175}
1176
1177static int
1178pfsync_in_tdb(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
1179{
1180	int len = count * sizeof(struct pfsync_tdb);
1181
1182#if defined(IPSEC)
1183	struct pfsync_tdb *tp;
1184	struct mbuf *mp;
1185	int offp;
1186	int i;
1187	int s;
1188
1189	mp = m_pulldown(m, offset, len, &offp);
1190	if (mp == NULL) {
1191		V_pfsyncstats.pfsyncs_badlen++;
1192		return (-1);
1193	}
1194	tp = (struct pfsync_tdb *)(mp->m_data + offp);
1195
1196	for (i = 0; i < count; i++)
1197		pfsync_update_net_tdb(&tp[i]);
1198#endif
1199
1200	return (len);
1201}
1202
1203#if defined(IPSEC)
1204/* Update an in-kernel tdb. Silently fail if no tdb is found. */
1205static void
1206pfsync_update_net_tdb(struct pfsync_tdb *pt)
1207{
1208	struct tdb		*tdb;
1209	int			 s;
1210
1211	/* check for invalid values */
1212	if (ntohl(pt->spi) <= SPI_RESERVED_MAX ||
1213	    (pt->dst.sa.sa_family != AF_INET &&
1214	    pt->dst.sa.sa_family != AF_INET6))
1215		goto bad;
1216
1217	tdb = gettdb(pt->spi, &pt->dst, pt->sproto);
1218	if (tdb) {
1219		pt->rpl = ntohl(pt->rpl);
1220		pt->cur_bytes = (unsigned long long)be64toh(pt->cur_bytes);
1221
1222		/* Neither replay nor byte counter should ever decrease. */
1223		if (pt->rpl < tdb->tdb_rpl ||
1224		    pt->cur_bytes < tdb->tdb_cur_bytes) {
1225			goto bad;
1226		}
1227
1228		tdb->tdb_rpl = pt->rpl;
1229		tdb->tdb_cur_bytes = pt->cur_bytes;
1230	}
1231	return;
1232
1233bad:
1234	if (V_pf_status.debug >= PF_DEBUG_MISC)
1235		printf("pfsync_insert: PFSYNC_ACT_TDB_UPD: "
1236		    "invalid value\n");
1237	V_pfsyncstats.pfsyncs_badstate++;
1238	return;
1239}
1240#endif
1241
1242
1243static int
1244pfsync_in_eof(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
1245{
1246	/* check if we are at the right place in the packet */
1247	if (offset != m->m_pkthdr.len)
1248		V_pfsyncstats.pfsyncs_badlen++;
1249
1250	/* we're done. free and let the caller return */
1251	m_freem(m);
1252	return (-1);
1253}
1254
1255static int
1256pfsync_in_error(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
1257{
1258	V_pfsyncstats.pfsyncs_badact++;
1259
1260	m_freem(m);
1261	return (-1);
1262}
1263
1264static int
1265pfsyncoutput(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst,
1266	struct route *rt)
1267{
1268	m_freem(m);
1269	return (0);
1270}
1271
1272/* ARGSUSED */
1273static int
1274pfsyncioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
1275{
1276	struct pfsync_softc *sc = ifp->if_softc;
1277	struct ifreq *ifr = (struct ifreq *)data;
1278	struct pfsyncreq pfsyncr;
1279	int error;
1280
1281	switch (cmd) {
1282	case SIOCSIFFLAGS:
1283		PFSYNC_LOCK(sc);
1284		if (ifp->if_flags & IFF_UP) {
1285			ifp->if_drv_flags |= IFF_DRV_RUNNING;
1286			PFSYNC_UNLOCK(sc);
1287			pfsync_pointers_init();
1288		} else {
1289			ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
1290			PFSYNC_UNLOCK(sc);
1291			pfsync_pointers_uninit();
1292		}
1293		break;
1294	case SIOCSIFMTU:
1295		if (!sc->sc_sync_if ||
1296		    ifr->ifr_mtu <= PFSYNC_MINPKT ||
1297		    ifr->ifr_mtu > sc->sc_sync_if->if_mtu)
1298			return (EINVAL);
1299		if (ifr->ifr_mtu < ifp->if_mtu) {
1300			PFSYNC_LOCK(sc);
1301			if (sc->sc_len > PFSYNC_MINPKT)
1302				pfsync_sendout(1);
1303			PFSYNC_UNLOCK(sc);
1304		}
1305		ifp->if_mtu = ifr->ifr_mtu;
1306		break;
1307	case SIOCGETPFSYNC:
1308		bzero(&pfsyncr, sizeof(pfsyncr));
1309		PFSYNC_LOCK(sc);
1310		if (sc->sc_sync_if) {
1311			strlcpy(pfsyncr.pfsyncr_syncdev,
1312			    sc->sc_sync_if->if_xname, IFNAMSIZ);
1313		}
1314		pfsyncr.pfsyncr_syncpeer = sc->sc_sync_peer;
1315		pfsyncr.pfsyncr_maxupdates = sc->sc_maxupdates;
1316		pfsyncr.pfsyncr_defer = (PFSYNCF_DEFER ==
1317		    (sc->sc_flags & PFSYNCF_DEFER));
1318		PFSYNC_UNLOCK(sc);
1319		return (copyout(&pfsyncr, ifr->ifr_data, sizeof(pfsyncr)));
1320
1321	case SIOCSETPFSYNC:
1322	    {
1323		struct ip_moptions *imo = &sc->sc_imo;
1324		struct ifnet *sifp;
1325		struct ip *ip;
1326		void *mship = NULL;
1327
1328		if ((error = priv_check(curthread, PRIV_NETINET_PF)) != 0)
1329			return (error);
1330		if ((error = copyin(ifr->ifr_data, &pfsyncr, sizeof(pfsyncr))))
1331			return (error);
1332
1333		if (pfsyncr.pfsyncr_maxupdates > 255)
1334			return (EINVAL);
1335
1336		if (pfsyncr.pfsyncr_syncdev[0] == 0)
1337			sifp = NULL;
1338		else if ((sifp = ifunit_ref(pfsyncr.pfsyncr_syncdev)) == NULL)
1339			return (EINVAL);
1340
1341		if (sifp != NULL && (
1342		    pfsyncr.pfsyncr_syncpeer.s_addr == 0 ||
1343		    pfsyncr.pfsyncr_syncpeer.s_addr ==
1344		    htonl(INADDR_PFSYNC_GROUP)))
1345			mship = malloc((sizeof(struct in_multi *) *
1346			    IP_MIN_MEMBERSHIPS), M_PFSYNC, M_WAITOK | M_ZERO);
1347
1348		PFSYNC_LOCK(sc);
1349		if (pfsyncr.pfsyncr_syncpeer.s_addr == 0)
1350			sc->sc_sync_peer.s_addr = htonl(INADDR_PFSYNC_GROUP);
1351		else
1352			sc->sc_sync_peer.s_addr =
1353			    pfsyncr.pfsyncr_syncpeer.s_addr;
1354
1355		sc->sc_maxupdates = pfsyncr.pfsyncr_maxupdates;
1356		if (pfsyncr.pfsyncr_defer) {
1357			sc->sc_flags |= PFSYNCF_DEFER;
1358			pfsync_defer_ptr = pfsync_defer;
1359		} else {
1360			sc->sc_flags &= ~PFSYNCF_DEFER;
1361			pfsync_defer_ptr = NULL;
1362		}
1363
1364		if (sifp == NULL) {
1365			if (sc->sc_sync_if)
1366				if_rele(sc->sc_sync_if);
1367			sc->sc_sync_if = NULL;
1368			if (imo->imo_membership)
1369				pfsync_multicast_cleanup(sc);
1370			PFSYNC_UNLOCK(sc);
1371			break;
1372		}
1373
1374		if (sc->sc_len > PFSYNC_MINPKT &&
1375		    (sifp->if_mtu < sc->sc_ifp->if_mtu ||
1376		    (sc->sc_sync_if != NULL &&
1377		    sifp->if_mtu < sc->sc_sync_if->if_mtu) ||
1378		    sifp->if_mtu < MCLBYTES - sizeof(struct ip)))
1379			pfsync_sendout(1);
1380
1381		if (imo->imo_membership)
1382			pfsync_multicast_cleanup(sc);
1383
		if (sc->sc_sync_peer.s_addr == htonl(INADDR_PFSYNC_GROUP)) {
			error = pfsync_multicast_setup(sc, sifp, mship);
			if (error) {
				if_rele(sifp);
				free(mship, M_PFSYNC);
				PFSYNC_UNLOCK(sc);
				return (error);
			}
		}
1392		if (sc->sc_sync_if)
1393			if_rele(sc->sc_sync_if);
1394		sc->sc_sync_if = sifp;
1395
1396		ip = &sc->sc_template;
1397		bzero(ip, sizeof(*ip));
1398		ip->ip_v = IPVERSION;
1399		ip->ip_hl = sizeof(sc->sc_template) >> 2;
1400		ip->ip_tos = IPTOS_LOWDELAY;
1401		/* len and id are set later. */
1402		ip->ip_off = htons(IP_DF);
1403		ip->ip_ttl = PFSYNC_DFLTTL;
1404		ip->ip_p = IPPROTO_PFSYNC;
1405		ip->ip_src.s_addr = INADDR_ANY;
1406		ip->ip_dst.s_addr = sc->sc_sync_peer.s_addr;
1407
1408		/* Request a full state table update. */
1409		if ((sc->sc_flags & PFSYNCF_OK) && carp_demote_adj_p)
1410			(*carp_demote_adj_p)(V_pfsync_carp_adj,
1411			    "pfsync bulk start");
1412		sc->sc_flags &= ~PFSYNCF_OK;
1413		if (V_pf_status.debug >= PF_DEBUG_MISC)
1414			printf("pfsync: requesting bulk update\n");
1415		pfsync_request_update(0, 0);
1416		PFSYNC_UNLOCK(sc);
1417		PFSYNC_BLOCK(sc);
1418		sc->sc_ureq_sent = time_uptime;
1419		callout_reset(&sc->sc_bulkfail_tmo, 5 * hz, pfsync_bulk_fail,
1420		    sc);
1421		PFSYNC_BUNLOCK(sc);
1422
1423		break;
1424	    }
1425	default:
1426		return (ENOTTY);
1427	}
1428
1429	return (0);
1430}
1431
1432static void
1433pfsync_out_state(struct pf_state *st, void *buf)
1434{
1435	struct pfsync_state *sp = buf;
1436
1437	pfsync_state_export(sp, st);
1438}
1439
1440static void
1441pfsync_out_iack(struct pf_state *st, void *buf)
1442{
1443	struct pfsync_ins_ack *iack = buf;
1444
1445	iack->id = st->id;
1446	iack->creatorid = st->creatorid;
1447}
1448
1449static void
1450pfsync_out_upd_c(struct pf_state *st, void *buf)
1451{
1452	struct pfsync_upd_c *up = buf;
1453
1454	bzero(up, sizeof(*up));
1455	up->id = st->id;
1456	pf_state_peer_hton(&st->src, &up->src);
1457	pf_state_peer_hton(&st->dst, &up->dst);
1458	up->creatorid = st->creatorid;
1459	up->timeout = st->timeout;
1460}
1461
1462static void
1463pfsync_out_del(struct pf_state *st, void *buf)
1464{
1465	struct pfsync_del_c *dp = buf;
1466
1467	dp->id = st->id;
1468	dp->creatorid = st->creatorid;
1469	st->state_flags |= PFSTATE_NOSYNC;
1470}
1471
1472static void
1473pfsync_drop(struct pfsync_softc *sc)
1474{
1475	struct pf_state *st, *next;
1476	struct pfsync_upd_req_item *ur;
1477	int q;
1478
1479	for (q = 0; q < PFSYNC_S_COUNT; q++) {
1480		if (TAILQ_EMPTY(&sc->sc_qs[q]))
1481			continue;
1482
1483		TAILQ_FOREACH_SAFE(st, &sc->sc_qs[q], sync_list, next) {
			KASSERT(st->sync_state == q,
			    ("%s: st->sync_state != q", __func__));
1487			st->sync_state = PFSYNC_S_NONE;
1488			pf_release_state(st);
1489		}
1490		TAILQ_INIT(&sc->sc_qs[q]);
1491	}
1492
1493	while ((ur = TAILQ_FIRST(&sc->sc_upd_req_list)) != NULL) {
1494		TAILQ_REMOVE(&sc->sc_upd_req_list, ur, ur_entry);
1495		free(ur, M_PFSYNC);
1496	}
1497
1498	sc->sc_plus = NULL;
1499	sc->sc_len = PFSYNC_MINPKT;
1500}
1501
1502static void
1503pfsync_sendout(int schedswi)
1504{
1505	struct pfsync_softc *sc = V_pfsyncif;
1506	struct ifnet *ifp = sc->sc_ifp;
1507	struct mbuf *m;
1508	struct ip *ip;
1509	struct pfsync_header *ph;
1510	struct pfsync_subheader *subh;
1511	struct pf_state *st;
1512	struct pfsync_upd_req_item *ur;
1513	int offset;
1514	int q, count = 0;
1515
1516	KASSERT(sc != NULL, ("%s: null sc", __func__));
1517	KASSERT(sc->sc_len > PFSYNC_MINPKT,
1518	    ("%s: sc_len %zu", __func__, sc->sc_len));
1519	PFSYNC_LOCK_ASSERT(sc);
1520
1521	if (ifp->if_bpf == NULL && sc->sc_sync_if == NULL) {
1522		pfsync_drop(sc);
1523		return;
1524	}
1525
1526	m = m_get2(max_linkhdr + sc->sc_len, M_NOWAIT, MT_DATA, M_PKTHDR);
1527	if (m == NULL) {
1528		sc->sc_ifp->if_oerrors++;
1529		V_pfsyncstats.pfsyncs_onomem++;
1530		return;
1531	}
1532	m->m_data += max_linkhdr;
1533	m->m_len = m->m_pkthdr.len = sc->sc_len;
1534
1535	/* build the ip header */
1536	ip = (struct ip *)m->m_data;
1537	bcopy(&sc->sc_template, ip, sizeof(*ip));
1538	offset = sizeof(*ip);
1539
1540	ip->ip_len = htons(m->m_pkthdr.len);
1541	ip->ip_id = htons(ip_randomid());
1542
1543	/* build the pfsync header */
1544	ph = (struct pfsync_header *)(m->m_data + offset);
1545	bzero(ph, sizeof(*ph));
1546	offset += sizeof(*ph);
1547
1548	ph->version = PFSYNC_VERSION;
1549	ph->len = htons(sc->sc_len - sizeof(*ip));
1550	bcopy(V_pf_status.pf_chksum, ph->pfcksum, PF_MD5_DIGEST_LENGTH);
1551
1552	/* walk the queues */
1553	for (q = 0; q < PFSYNC_S_COUNT; q++) {
1554		if (TAILQ_EMPTY(&sc->sc_qs[q]))
1555			continue;
1556
1557		subh = (struct pfsync_subheader *)(m->m_data + offset);
1558		offset += sizeof(*subh);
1559
1560		count = 0;
1561		TAILQ_FOREACH(st, &sc->sc_qs[q], sync_list) {
			KASSERT(st->sync_state == q,
			    ("%s: st->sync_state != q", __func__));
			/*
			 * XXXGL: some of the write methods do unlocked
			 * reads of state data :(
			 */
1569			pfsync_qs[q].write(st, m->m_data + offset);
1570			offset += pfsync_qs[q].len;
1571			st->sync_state = PFSYNC_S_NONE;
1572			pf_release_state(st);
1573			count++;
1574		}
1575		TAILQ_INIT(&sc->sc_qs[q]);
1576
1577		bzero(subh, sizeof(*subh));
1578		subh->action = pfsync_qs[q].action;
1579		subh->count = htons(count);
1580		V_pfsyncstats.pfsyncs_oacts[pfsync_qs[q].action] += count;
1581	}
1582
1583	if (!TAILQ_EMPTY(&sc->sc_upd_req_list)) {
1584		subh = (struct pfsync_subheader *)(m->m_data + offset);
1585		offset += sizeof(*subh);
1586
1587		count = 0;
1588		while ((ur = TAILQ_FIRST(&sc->sc_upd_req_list)) != NULL) {
1589			TAILQ_REMOVE(&sc->sc_upd_req_list, ur, ur_entry);
1590
1591			bcopy(&ur->ur_msg, m->m_data + offset,
1592			    sizeof(ur->ur_msg));
1593			offset += sizeof(ur->ur_msg);
1594			free(ur, M_PFSYNC);
1595			count++;
1596		}
1597
1598		bzero(subh, sizeof(*subh));
1599		subh->action = PFSYNC_ACT_UPD_REQ;
1600		subh->count = htons(count);
1601		V_pfsyncstats.pfsyncs_oacts[PFSYNC_ACT_UPD_REQ] += count;
1602	}
1603
1604	/* has someone built a custom region for us to add? */
1605	if (sc->sc_plus != NULL) {
1606		bcopy(sc->sc_plus, m->m_data + offset, sc->sc_pluslen);
1607		offset += sc->sc_pluslen;
1608
1609		sc->sc_plus = NULL;
1610	}
1611
1612	subh = (struct pfsync_subheader *)(m->m_data + offset);
1613	offset += sizeof(*subh);
1614
1615	bzero(subh, sizeof(*subh));
1616	subh->action = PFSYNC_ACT_EOF;
1617	subh->count = htons(1);
1618	V_pfsyncstats.pfsyncs_oacts[PFSYNC_ACT_EOF]++;
1619
1620	/* we're done, let's put it on the wire */
1621	if (ifp->if_bpf) {
1622		m->m_data += sizeof(*ip);
1623		m->m_len = m->m_pkthdr.len = sc->sc_len - sizeof(*ip);
1624		BPF_MTAP(ifp, m);
1625		m->m_data -= sizeof(*ip);
1626		m->m_len = m->m_pkthdr.len = sc->sc_len;
1627	}
1628
1629	if (sc->sc_sync_if == NULL) {
1630		sc->sc_len = PFSYNC_MINPKT;
1631		m_freem(m);
1632		return;
1633	}
1634
1635	sc->sc_ifp->if_opackets++;
1636	sc->sc_ifp->if_obytes += m->m_pkthdr.len;
1637	sc->sc_len = PFSYNC_MINPKT;
1638
1639	if (!_IF_QFULL(&sc->sc_ifp->if_snd))
1640		_IF_ENQUEUE(&sc->sc_ifp->if_snd, m);
1641	else {
1642		m_freem(m);
1643		sc->sc_ifp->if_snd.ifq_drops++;
1644	}
1645	if (schedswi)
1646		swi_sched(V_pfsync_swi_cookie, 0);
1647}
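/*
 * The datagram built above is laid out as: IP header, pfsync header,
 * one subheader plus payload per non-empty queue, any pending update
 * requests, an optional "plus" region, and a terminating
 * PFSYNC_ACT_EOF subheader.
 */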
1648
1649static void
1650pfsync_insert_state(struct pf_state *st)
1651{
1652	struct pfsync_softc *sc = V_pfsyncif;
1653
1654	if (st->state_flags & PFSTATE_NOSYNC)
1655		return;
1656
1657	if ((st->rule.ptr->rule_flag & PFRULE_NOSYNC) ||
1658	    st->key[PF_SK_WIRE]->proto == IPPROTO_PFSYNC) {
1659		st->state_flags |= PFSTATE_NOSYNC;
1660		return;
1661	}
1662
1663	KASSERT(st->sync_state == PFSYNC_S_NONE,
1664		("%s: st->sync_state %u", __func__, st->sync_state));
1665
1666	PFSYNC_LOCK(sc);
1667	if (sc->sc_len == PFSYNC_MINPKT)
1668		callout_reset(&sc->sc_tmo, 1 * hz, pfsync_timeout, V_pfsyncif);
1669
1670	pfsync_q_ins(st, PFSYNC_S_INS);
1671	PFSYNC_UNLOCK(sc);
1672
1673	st->sync_updates = 0;
1674}
1675
1676static int
1677pfsync_defer(struct pf_state *st, struct mbuf *m)
1678{
1679	struct pfsync_softc *sc = V_pfsyncif;
1680	struct pfsync_deferral *pd;
1681
1682	if (m->m_flags & (M_BCAST|M_MCAST))
1683		return (0);
1684
	if (sc == NULL)
		return (0);

	PFSYNC_LOCK(sc);

	if (!(sc->sc_ifp->if_drv_flags & IFF_DRV_RUNNING) ||
	    !(sc->sc_flags & PFSYNCF_DEFER)) {
		PFSYNC_UNLOCK(sc);
		return (0);
	}

	if (sc->sc_deferred >= 128)
		pfsync_undefer(TAILQ_FIRST(&sc->sc_deferrals), 0);

	pd = malloc(sizeof(*pd), M_PFSYNC, M_NOWAIT);
	if (pd == NULL) {
		PFSYNC_UNLOCK(sc);
		return (0);
	}
1699	sc->sc_deferred++;
1700
1701	m->m_flags |= M_SKIP_FIREWALL;
1702	st->state_flags |= PFSTATE_ACK;
1703
1704	pd->pd_sc = sc;
1705	pd->pd_refs = 0;
1706	pd->pd_st = st;
1707	pf_ref_state(st);
1708	pd->pd_m = m;
1709
1710	TAILQ_INSERT_TAIL(&sc->sc_deferrals, pd, pd_entry);
1711	callout_init_mtx(&pd->pd_tmo, &sc->sc_mtx, CALLOUT_RETURNUNLOCKED);
1712	callout_reset(&pd->pd_tmo, 10, pfsync_defer_tmo, pd);
1713
	pfsync_push(sc);
	PFSYNC_UNLOCK(sc);

	return (1);
1717}
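/*
 * pf(4) reaches pfsync_defer() through the pfsync_defer_ptr hook
 * installed in the SIOCSETPFSYNC handler below; the caller side is,
 * in sketch:
 *
 *	if (pfsync_defer_ptr != NULL && pfsync_defer_ptr(st, m))
 *		return;
 *
 * A return of 1 means pfsync now owns the mbuf and will transmit or
 * drop it once the peer acks the state, or when the callout fires.
 */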
1718
1719static void
1720pfsync_undefer(struct pfsync_deferral *pd, int drop)
1721{
1722	struct pfsync_softc *sc = pd->pd_sc;
1723	struct mbuf *m = pd->pd_m;
1724	struct pf_state *st = pd->pd_st;
1725
1726	PFSYNC_LOCK_ASSERT(sc);
1727
1728	TAILQ_REMOVE(&sc->sc_deferrals, pd, pd_entry);
1729	sc->sc_deferred--;
1730	pd->pd_st->state_flags &= ~PFSTATE_ACK;	/* XXX: locking! */
1731	free(pd, M_PFSYNC);
1732	pf_release_state(st);
1733
1734	if (drop)
1735		m_freem(m);
1736	else {
1737		_IF_ENQUEUE(&sc->sc_ifp->if_snd, m);
1738		pfsync_push(sc);
1739	}
1740}
1741
1742static void
1743pfsync_defer_tmo(void *arg)
1744{
1745	struct pfsync_deferral *pd = arg;
1746	struct pfsync_softc *sc = pd->pd_sc;
1747	struct mbuf *m = pd->pd_m;
1748	struct pf_state *st = pd->pd_st;
1749
1750	PFSYNC_LOCK_ASSERT(sc);
1751
1752	CURVNET_SET(m->m_pkthdr.rcvif->if_vnet);
1753
1754	TAILQ_REMOVE(&sc->sc_deferrals, pd, pd_entry);
1755	sc->sc_deferred--;
1756	pd->pd_st->state_flags &= ~PFSTATE_ACK;	/* XXX: locking! */
1757	if (pd->pd_refs == 0)
1758		free(pd, M_PFSYNC);
1759	PFSYNC_UNLOCK(sc);
1760
1761	ip_output(m, NULL, NULL, 0, NULL, NULL);
1762
1763	pf_release_state(st);
1764
1765	CURVNET_RESTORE();
1766}
1767
1768static void
1769pfsync_undefer_state(struct pf_state *st, int drop)
1770{
1771	struct pfsync_softc *sc = V_pfsyncif;
1772	struct pfsync_deferral *pd;
1773
1774	PFSYNC_LOCK_ASSERT(sc);
1775
1776	TAILQ_FOREACH(pd, &sc->sc_deferrals, pd_entry) {
		if (pd->pd_st == st) {
1778			if (callout_stop(&pd->pd_tmo))
1779				pfsync_undefer(pd, drop);
1780			return;
1781		}
1782	}
1783
1784	panic("%s: unable to find deferred state", __func__);
1785}
1786
1787static void
1788pfsync_update_state(struct pf_state *st)
1789{
1790	struct pfsync_softc *sc = V_pfsyncif;
1791	int sync = 0;
1792
1793	PF_STATE_LOCK_ASSERT(st);
1794	PFSYNC_LOCK(sc);
1795
1796	if (st->state_flags & PFSTATE_ACK)
1797		pfsync_undefer_state(st, 0);
1798	if (st->state_flags & PFSTATE_NOSYNC) {
1799		if (st->sync_state != PFSYNC_S_NONE)
1800			pfsync_q_del(st);
1801		PFSYNC_UNLOCK(sc);
1802		return;
1803	}
1804
1805	if (sc->sc_len == PFSYNC_MINPKT)
1806		callout_reset(&sc->sc_tmo, 1 * hz, pfsync_timeout, V_pfsyncif);
1807
1808	switch (st->sync_state) {
1809	case PFSYNC_S_UPD_C:
1810	case PFSYNC_S_UPD:
1811	case PFSYNC_S_INS:
1812		/* we're already handling it */
1813
1814		if (st->key[PF_SK_WIRE]->proto == IPPROTO_TCP) {
1815			st->sync_updates++;
1816			if (st->sync_updates >= sc->sc_maxupdates)
1817				sync = 1;
1818		}
1819		break;
1820
	case PFSYNC_S_IACK:
		pfsync_q_del(st);
		/* FALLTHROUGH */
	case PFSYNC_S_NONE:
1824		pfsync_q_ins(st, PFSYNC_S_UPD_C);
1825		st->sync_updates = 0;
1826		break;
1827
1828	default:
1829		panic("%s: unexpected sync state %d", __func__, st->sync_state);
1830	}
1831
1832	if (sync || (time_uptime - st->pfsync_time) < 2)
1833		pfsync_push(sc);
1834
1835	PFSYNC_UNLOCK(sc);
1836}
1837
1838static void
1839pfsync_request_update(u_int32_t creatorid, u_int64_t id)
1840{
1841	struct pfsync_softc *sc = V_pfsyncif;
1842	struct pfsync_upd_req_item *item;
1843	size_t nlen = sizeof(struct pfsync_upd_req);
1844
1845	PFSYNC_LOCK_ASSERT(sc);
1846
	/*
	 * This code does a bit to prevent multiple update requests for the
	 * same state from being generated.  It searches the current subheader
	 * queue, but it doesn't look into the queue of already packed
	 * datagrams.
	 */
1852	TAILQ_FOREACH(item, &sc->sc_upd_req_list, ur_entry)
1853		if (item->ur_msg.id == id &&
1854		    item->ur_msg.creatorid == creatorid)
1855			return;
1856
1857	item = malloc(sizeof(*item), M_PFSYNC, M_NOWAIT);
1858	if (item == NULL)
1859		return; /* XXX stats */
1860
1861	item->ur_msg.id = id;
1862	item->ur_msg.creatorid = creatorid;
1863
1864	if (TAILQ_EMPTY(&sc->sc_upd_req_list))
1865		nlen += sizeof(struct pfsync_subheader);
1866
1867	if (sc->sc_len + nlen > sc->sc_ifp->if_mtu) {
1868		pfsync_sendout(1);
1869
1870		nlen = sizeof(struct pfsync_subheader) +
1871		    sizeof(struct pfsync_upd_req);
1872	}
1873
1874	TAILQ_INSERT_TAIL(&sc->sc_upd_req_list, item, ur_entry);
1875	sc->sc_len += nlen;
1876}
1877
1878static void
1879pfsync_update_state_req(struct pf_state *st)
1880{
1881	struct pfsync_softc *sc = V_pfsyncif;
1882
1883	PF_STATE_LOCK_ASSERT(st);
1884	PFSYNC_LOCK(sc);
1885
1886	if (st->state_flags & PFSTATE_NOSYNC) {
1887		if (st->sync_state != PFSYNC_S_NONE)
1888			pfsync_q_del(st);
1889		PFSYNC_UNLOCK(sc);
1890		return;
1891	}
1892
1893	switch (st->sync_state) {
	case PFSYNC_S_UPD_C:
	case PFSYNC_S_IACK:
		pfsync_q_del(st);
		/* FALLTHROUGH */
	case PFSYNC_S_NONE:
1898		pfsync_q_ins(st, PFSYNC_S_UPD);
1899		pfsync_push(sc);
1900		break;
1901
1902	case PFSYNC_S_INS:
1903	case PFSYNC_S_UPD:
1904	case PFSYNC_S_DEL:
1905		/* we're already handling it */
1906		break;
1907
1908	default:
1909		panic("%s: unexpected sync state %d", __func__, st->sync_state);
1910	}
1911
1912	PFSYNC_UNLOCK(sc);
1913}
1914
1915static void
1916pfsync_delete_state(struct pf_state *st)
1917{
1918	struct pfsync_softc *sc = V_pfsyncif;
1919
1920	PFSYNC_LOCK(sc);
1921	if (st->state_flags & PFSTATE_ACK)
1922		pfsync_undefer_state(st, 1);
1923	if (st->state_flags & PFSTATE_NOSYNC) {
1924		if (st->sync_state != PFSYNC_S_NONE)
1925			pfsync_q_del(st);
1926		PFSYNC_UNLOCK(sc);
1927		return;
1928	}
1929
1930	if (sc->sc_len == PFSYNC_MINPKT)
1931		callout_reset(&sc->sc_tmo, 1 * hz, pfsync_timeout, V_pfsyncif);
1932
1933	switch (st->sync_state) {
1934	case PFSYNC_S_INS:
1935		/* We never got to tell the world so just forget about it. */
1936		pfsync_q_del(st);
1937		break;
1938
1939	case PFSYNC_S_UPD_C:
1940	case PFSYNC_S_UPD:
1941	case PFSYNC_S_IACK:
1942		pfsync_q_del(st);
1943		/* FALLTHROUGH to putting it on the del list */
1944
1945	case PFSYNC_S_NONE:
1946		pfsync_q_ins(st, PFSYNC_S_DEL);
1947		break;
1948
1949	default:
1950		panic("%s: unexpected sync state %d", __func__, st->sync_state);
1951	}
1952	PFSYNC_UNLOCK(sc);
1953}
1954
1955static void
1956pfsync_clear_states(u_int32_t creatorid, const char *ifname)
1957{
1958	struct pfsync_softc *sc = V_pfsyncif;
1959	struct {
1960		struct pfsync_subheader subh;
1961		struct pfsync_clr clr;
1962	} __packed r;
1963
1964	bzero(&r, sizeof(r));
1965
1966	r.subh.action = PFSYNC_ACT_CLR;
1967	r.subh.count = htons(1);
1968	V_pfsyncstats.pfsyncs_oacts[PFSYNC_ACT_CLR]++;
1969
1970	strlcpy(r.clr.ifname, ifname, sizeof(r.clr.ifname));
1971	r.clr.creatorid = creatorid;
1972
1973	PFSYNC_LOCK(sc);
1974	pfsync_send_plus(&r, sizeof(r));
1975	PFSYNC_UNLOCK(sc);
1976}
1977
1978static void
1979pfsync_q_ins(struct pf_state *st, int q)
1980{
1981	struct pfsync_softc *sc = V_pfsyncif;
1982	size_t nlen = pfsync_qs[q].len;
1983
1984	PFSYNC_LOCK_ASSERT(sc);
1985
1986	KASSERT(st->sync_state == PFSYNC_S_NONE,
1987		("%s: st->sync_state %u", __func__, st->sync_state));
1988	KASSERT(sc->sc_len >= PFSYNC_MINPKT, ("pfsync pkt len is too low %zu",
1989	    sc->sc_len));
1990
1991	if (TAILQ_EMPTY(&sc->sc_qs[q]))
1992		nlen += sizeof(struct pfsync_subheader);
1993
1994	if (sc->sc_len + nlen > sc->sc_ifp->if_mtu) {
1995		pfsync_sendout(1);
1996
1997		nlen = sizeof(struct pfsync_subheader) + pfsync_qs[q].len;
1998	}
1999
2000	sc->sc_len += nlen;
2001	TAILQ_INSERT_TAIL(&sc->sc_qs[q], st, sync_list);
2002	st->sync_state = q;
2003	pf_ref_state(st);
2004}
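/*
 * Length accounting example: the first state placed on an empty
 * PFSYNC_S_UPD_C queue grows sc_len by sizeof(struct pfsync_subheader)
 * plus sizeof(struct pfsync_upd_c); each further state on that queue
 * adds only the latter, and pfsync_q_del() undoes both steps.
 */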
2005
static void
pfsync_q_del(struct pf_state *st)
{
	struct pfsync_softc *sc = V_pfsyncif;
	int q = st->sync_state;

	PFSYNC_LOCK_ASSERT(sc);
	KASSERT(st->sync_state != PFSYNC_S_NONE,
	    ("%s: st->sync_state != PFSYNC_S_NONE", __func__));

	sc->sc_len -= pfsync_qs[q].len;
	TAILQ_REMOVE(&sc->sc_qs[q], st, sync_list);
	st->sync_state = PFSYNC_S_NONE;
	pf_release_state(st);

	if (TAILQ_EMPTY(&sc->sc_qs[q]))
		sc->sc_len -= sizeof(struct pfsync_subheader);
}

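/*
 * A peer has requested a bulk update: send a bulk-start message and
 * kick off the resumable walk over the state table.
 */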
static void
pfsync_bulk_start(void)
{
	struct pfsync_softc *sc = V_pfsyncif;

	if (V_pf_status.debug >= PF_DEBUG_MISC)
		printf("pfsync: received bulk update request\n");

	PFSYNC_BLOCK(sc);

	sc->sc_ureq_received = time_uptime;
	sc->sc_bulk_hashid = 0;
	sc->sc_bulk_stateid = 0;
	pfsync_bulk_status(PFSYNC_BUS_START);
	callout_reset(&sc->sc_bulk_tmo, 1, pfsync_bulk_update, sc);
	PFSYNC_BUNLOCK(sc);
}

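/*
 * Walk the state table, queueing an update for every state that has
 * not already been sent since the bulk request arrived.  The walk is
 * resumable: once the pending packet fills up, the current position
 * is saved and the callout rescheduled to continue from there.
 */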
static void
pfsync_bulk_update(void *arg)
{
	struct pfsync_softc *sc = arg;
	struct pf_state *s;
	int i, sent = 0;

	PFSYNC_BLOCK_ASSERT(sc);
	CURVNET_SET(sc->sc_ifp->if_vnet);

	/*
	 * Start with the last state from the previous invocation.
	 * It may be gone by now; in that case start over from the
	 * saved hash slot.
	 */
	s = pf_find_state_byid(sc->sc_bulk_stateid, sc->sc_bulk_creatorid);

	if (s != NULL)
		i = PF_IDHASH(s);
	else
		i = sc->sc_bulk_hashid;

	for (; i <= pf_hashmask; i++) {
		struct pf_idhash *ih = &V_pf_idhash[i];

		if (s != NULL)
			PF_HASHROW_ASSERT(ih);
		else {
			PF_HASHROW_LOCK(ih);
			s = LIST_FIRST(&ih->states);
		}

		for (; s; s = LIST_NEXT(s, entry)) {
			if (sent > 1 && (sc->sc_ifp->if_mtu - sc->sc_len) <
			    sizeof(struct pfsync_state)) {
				/* We've filled a packet. */
				sc->sc_bulk_hashid = i;
				sc->sc_bulk_stateid = s->id;
				sc->sc_bulk_creatorid = s->creatorid;
				PF_HASHROW_UNLOCK(ih);
				callout_reset(&sc->sc_bulk_tmo, 1,
				    pfsync_bulk_update, sc);
				goto full;
			}

			if (s->sync_state == PFSYNC_S_NONE &&
			    s->timeout < PFTM_MAX &&
			    s->pfsync_time <= sc->sc_ureq_received) {
				pfsync_update_state_req(s);
				sent++;
			}
		}
		PF_HASHROW_UNLOCK(ih);
	}

	/* We're done. */
	pfsync_bulk_status(PFSYNC_BUS_END);

full:
	CURVNET_RESTORE();
}

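/*
 * Send a bulk update status message (start or end) to the peers.
 */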
static void
pfsync_bulk_status(u_int8_t status)
{
	struct {
		struct pfsync_subheader subh;
		struct pfsync_bus bus;
	} __packed r;

	struct pfsync_softc *sc = V_pfsyncif;

	bzero(&r, sizeof(r));

	r.subh.action = PFSYNC_ACT_BUS;
	r.subh.count = htons(1);
	V_pfsyncstats.pfsyncs_oacts[PFSYNC_ACT_BUS]++;

	r.bus.creatorid = V_pf_status.hostid;
	r.bus.endtime = htonl(time_uptime - sc->sc_ureq_received);
	r.bus.status = status;

	PFSYNC_LOCK(sc);
	pfsync_send_plus(&r, sizeof(r));
	PFSYNC_UNLOCK(sc);
}

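/*
 * The bulk update we asked for did not complete in time.  Retry the
 * request up to PFSYNC_MAX_BULKTRIES times, then give up, declare
 * ourselves OK and undo the carp demotion that was in effect while
 * we were out of sync.
 */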
static void
pfsync_bulk_fail(void *arg)
{
	struct pfsync_softc *sc = arg;

	CURVNET_SET(sc->sc_ifp->if_vnet);

	PFSYNC_BLOCK_ASSERT(sc);

	if (sc->sc_bulk_tries++ < PFSYNC_MAX_BULKTRIES) {
		/* Try again. */
		callout_reset(&sc->sc_bulkfail_tmo, 5 * hz,
		    pfsync_bulk_fail, V_pfsyncif);
		PFSYNC_LOCK(sc);
		pfsync_request_update(0, 0);
		PFSYNC_UNLOCK(sc);
	} else {
		/* Pretend that the transfer was OK. */
		sc->sc_ureq_sent = 0;
		sc->sc_bulk_tries = 0;
		PFSYNC_LOCK(sc);
		if (!(sc->sc_flags & PFSYNCF_OK) && carp_demote_adj_p)
			(*carp_demote_adj_p)(-V_pfsync_carp_adj,
			    "pfsync bulk fail");
		sc->sc_flags |= PFSYNCF_OK;
		PFSYNC_UNLOCK(sc);
		if (V_pf_status.debug >= PF_DEBUG_MISC)
			printf("pfsync: failed to receive bulk update\n");
	}

	CURVNET_RESTORE();
}

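/*
 * Tack an extra chunk of data ("plus" data) onto the pending packet
 * and transmit it immediately.
 */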
static void
pfsync_send_plus(void *plus, size_t pluslen)
{
	struct pfsync_softc *sc = V_pfsyncif;

	PFSYNC_LOCK_ASSERT(sc);

	if (sc->sc_len + pluslen > sc->sc_ifp->if_mtu)
		pfsync_sendout(1);

	sc->sc_plus = plus;
	sc->sc_len += (sc->sc_pluslen = pluslen);

	pfsync_sendout(1);
}

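/*
 * Timer callout: request transmission of whatever has accumulated.
 */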
static void
pfsync_timeout(void *arg)
{
	struct pfsync_softc *sc = arg;

	CURVNET_SET(sc->sc_ifp->if_vnet);
	PFSYNC_LOCK(sc);
	pfsync_push(sc);
	PFSYNC_UNLOCK(sc);
	CURVNET_RESTORE();
}

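/*
 * Mark the pending packet for transmission and schedule the software
 * interrupt handler to send it.
 */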
static void
pfsync_push(struct pfsync_softc *sc)
{

	PFSYNC_LOCK_ASSERT(sc);

	sc->sc_flags |= PFSYNCF_PUSH;
	swi_sched(V_pfsync_swi_cookie, 0);
}

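/*
 * Software interrupt handler: send out the pending pfsync packet if
 * we were asked to push, then drain the interface send queue,
 * transmitting both deferred packets and our own pfsync packets.
 */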
static void
pfsyncintr(void *arg)
{
	struct pfsync_softc *sc = arg;
	struct mbuf *m, *n;

	CURVNET_SET(sc->sc_ifp->if_vnet);

	PFSYNC_LOCK(sc);
	if ((sc->sc_flags & PFSYNCF_PUSH) && sc->sc_len > PFSYNC_MINPKT) {
		pfsync_sendout(0);
		sc->sc_flags &= ~PFSYNCF_PUSH;
	}
	_IF_DEQUEUE_ALL(&sc->sc_ifp->if_snd, m);
	PFSYNC_UNLOCK(sc);

	for (; m != NULL; m = n) {
		n = m->m_nextpkt;
		m->m_nextpkt = NULL;

		/*
		 * We distinguish between a deferral packet and our
		 * own pfsync packet based on the M_SKIP_FIREWALL
		 * flag.  This is an XXX hack.
		 */
		if (m->m_flags & M_SKIP_FIREWALL)
			ip_output(m, NULL, NULL, 0, NULL, NULL);
		else if (ip_output(m, NULL, NULL, IP_RAWOUTPUT, &sc->sc_imo,
		    NULL) == 0)
			V_pfsyncstats.pfsyncs_opackets++;
		else
			V_pfsyncstats.pfsyncs_oerrors++;
	}
	CURVNET_RESTORE();
}

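/*
 * Join the pfsync multicast group on the given syncdev and set up
 * the multicast options used for our transmissions.
 */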
static int
pfsync_multicast_setup(struct pfsync_softc *sc, struct ifnet *ifp, void *mship)
{
	struct ip_moptions *imo = &sc->sc_imo;
	int error;

	if (!(ifp->if_flags & IFF_MULTICAST))
		return (EADDRNOTAVAIL);

	imo->imo_membership = (struct in_multi **)mship;
	imo->imo_max_memberships = IP_MIN_MEMBERSHIPS;
	imo->imo_multicast_vif = -1;

	if ((error = in_joingroup(ifp, &sc->sc_sync_peer, NULL,
	    &imo->imo_membership[0])) != 0) {
		imo->imo_membership = NULL;
		return (error);
	}
	imo->imo_num_memberships++;
	imo->imo_multicast_ifp = ifp;
	imo->imo_multicast_ttl = PFSYNC_DFLTTL;
	imo->imo_multicast_loop = 0;

	return (0);
}

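/*
 * Leave the multicast group and release the membership array.
 */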
static void
pfsync_multicast_cleanup(struct pfsync_softc *sc)
{
	struct ip_moptions *imo = &sc->sc_imo;

	in_leavegroup(imo->imo_membership[0], NULL);
	free(imo->imo_membership, M_PFSYNC);
	imo->imo_membership = NULL;
	imo->imo_multicast_ifp = NULL;
}

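/*
 * Protocol switch entry used to hook pfsync_input() into the IPv4
 * stack as IPPROTO_PFSYNC.
 */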
#ifdef INET
extern struct domain inetdomain;
static struct protosw in_pfsync_protosw = {
	.pr_type =		SOCK_RAW,
	.pr_domain =		&inetdomain,
	.pr_protocol =		IPPROTO_PFSYNC,
	.pr_flags =		PR_ATOMIC|PR_ADDR,
	.pr_input =		pfsync_input,
	.pr_output =		(pr_output_t *)rip_output,
	.pr_ctloutput =		rip_ctloutput,
	.pr_usrreqs =		&rip_usrreqs
};
#endif

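/*
 * Hook (or unhook) pfsync into pf(4) by filling in the function
 * pointers pf uses to notify us about state changes.
 */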
static void
pfsync_pointers_init(void)
{

	PF_RULES_WLOCK();
	pfsync_state_import_ptr = pfsync_state_import;
	pfsync_insert_state_ptr = pfsync_insert_state;
	pfsync_update_state_ptr = pfsync_update_state;
	pfsync_delete_state_ptr = pfsync_delete_state;
	pfsync_clear_states_ptr = pfsync_clear_states;
	pfsync_defer_ptr = pfsync_defer;
	PF_RULES_WUNLOCK();
}

static void
pfsync_pointers_uninit(void)
{

	PF_RULES_WLOCK();
	pfsync_state_import_ptr = NULL;
	pfsync_insert_state_ptr = NULL;
	pfsync_update_state_ptr = NULL;
	pfsync_delete_state_ptr = NULL;
	pfsync_clear_states_ptr = NULL;
	pfsync_defer_ptr = NULL;
	PF_RULES_WUNLOCK();
}

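/*
 * Module load path: register the interface cloner and a software
 * interrupt in every vnet, then attach to the IPv4 stack.  On
 * failure, roll back whatever was set up.
 */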
static int
pfsync_init(void)
{
	VNET_ITERATOR_DECL(vnet_iter);
	int error = 0;

	VNET_LIST_RLOCK();
	VNET_FOREACH(vnet_iter) {
		CURVNET_SET(vnet_iter);
		V_pfsync_cloner = if_clone_simple(pfsyncname,
		    pfsync_clone_create, pfsync_clone_destroy, 1);
		error = swi_add(NULL, pfsyncname, pfsyncintr, V_pfsyncif,
		    SWI_NET, INTR_MPSAFE, &V_pfsync_swi_cookie);
		CURVNET_RESTORE();
		if (error)
			goto fail_locked;
	}
	VNET_LIST_RUNLOCK();
#ifdef INET
	error = pf_proto_register(PF_INET, &in_pfsync_protosw);
	if (error)
		goto fail;
	error = ipproto_register(IPPROTO_PFSYNC);
	if (error) {
		pf_proto_unregister(PF_INET, IPPROTO_PFSYNC, SOCK_RAW);
		goto fail;
	}
#endif
	pfsync_pointers_init();

	return (0);

fail:
	VNET_LIST_RLOCK();
fail_locked:
	VNET_FOREACH(vnet_iter) {
		CURVNET_SET(vnet_iter);
		if (V_pfsync_swi_cookie) {
			swi_remove(V_pfsync_swi_cookie);
			if_clone_detach(V_pfsync_cloner);
		}
		CURVNET_RESTORE();
	}
	VNET_LIST_RUNLOCK();

	return (error);
}

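/*
 * Module unload path: detach from pf(4) and the IPv4 stack, then
 * tear down the cloner and software interrupt in every vnet.
 */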
static void
pfsync_uninit(void)
{
	VNET_ITERATOR_DECL(vnet_iter);

	pfsync_pointers_uninit();

	ipproto_unregister(IPPROTO_PFSYNC);
	pf_proto_unregister(PF_INET, IPPROTO_PFSYNC, SOCK_RAW);
	VNET_LIST_RLOCK();
	VNET_FOREACH(vnet_iter) {
		CURVNET_SET(vnet_iter);
		if_clone_detach(V_pfsync_cloner);
		swi_remove(V_pfsync_swi_cookie);
		CURVNET_RESTORE();
	}
	VNET_LIST_RUNLOCK();
}

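/*
 * Module event handler.  Quiescing is refused, so an unload has to
 * be forced; see the comment at MOD_QUIESCE below.
 */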
static int
pfsync_modevent(module_t mod, int type, void *data)
{
	int error = 0;

	switch (type) {
	case MOD_LOAD:
		error = pfsync_init();
		break;
	case MOD_QUIESCE:
		/*
		 * The module cannot be unloaded safely due to race
		 * conditions, so refuse to quiesce.
		 */
		error = EBUSY;
		break;
	case MOD_UNLOAD:
		pfsync_uninit();
		break;
	default:
		error = EINVAL;
		break;
	}

	return (error);
}

static moduledata_t pfsync_mod = {
	pfsyncname,
	pfsync_modevent,
	0
};

#define PFSYNC_MODVER 1

DECLARE_MODULE(pfsync, pfsync_mod, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY);
MODULE_VERSION(pfsync, PFSYNC_MODVER);
MODULE_DEPEND(pfsync, pf, PF_MODVER, PF_MODVER, PF_MODVER);