/*	$OpenBSD: if_pfsync.c,v 1.110 2009/02/24 05:39:19 dlg Exp $	*/

/*
 * Copyright (c) 2002 Michael Shalayeff
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR OR HIS RELATIVES BE LIABLE FOR ANY DIRECT,
 * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF MIND, USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
 * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
 * THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 2009 David Gwynne <dlg@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

/*
 * Revisions picked from OpenBSD after revision 1.110 import:
 * 1.118, 1.124, 1.148, 1.149, 1.151, 1.171 - fixes to bulk updates
 * 1.120, 1.175 - use monotonic time_uptime
 * 1.122 - reduce number of updates for non-TCP sessions
 * 1.128 - cleanups
 * 1.146 - bzero() mbuf before sparsely filling it with data
 * 1.170 - SIOCSIFMTU checks
 * 1.126, 1.142 - deferred packets processing
 * 1.173 - correct expire time processing
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/netpfil/pf/if_pfsync.c 241394 2012-10-10 08:36:38Z kevlo $");

#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_pf.h"

#include <sys/param.h>
#include <sys/bus.h>
#include <sys/endian.h>
#include <sys/interrupt.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mbuf.h>
#include <sys/module.h>
#include <sys/mutex.h>
#include <sys/priv.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <sys/sysctl.h>

#include <net/bpf.h>
#include <net/if.h>
#include <net/if_clone.h>
#include <net/if_types.h>
#include <net/pfvar.h>
#include <net/if_pfsync.h>

#include <netinet/if_ether.h>
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/ip.h>
#include <netinet/ip_carp.h>
#include <netinet/ip_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>

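/*
 * The smallest pfsync datagram we ever emit: an IP header, the pfsync
 * header, one subheader and the trailing EOF block.
 */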
#define PFSYNC_MINPKT ( \
	sizeof(struct ip) + \
	sizeof(struct pfsync_header) + \
	sizeof(struct pfsync_subheader) + \
	sizeof(struct pfsync_eof))

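/* Per-packet context handed to the pfsync_in_*() input parsers. */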
struct pfsync_pkt {
	struct ip *ip;
	struct in_addr src;
	u_int8_t flags;
};

static int	pfsync_upd_tcp(struct pf_state *, struct pfsync_state_peer *,
		    struct pfsync_state_peer *);
static int	pfsync_in_clr(struct pfsync_pkt *, struct mbuf *, int, int);
static int	pfsync_in_ins(struct pfsync_pkt *, struct mbuf *, int, int);
static int	pfsync_in_iack(struct pfsync_pkt *, struct mbuf *, int, int);
static int	pfsync_in_upd(struct pfsync_pkt *, struct mbuf *, int, int);
static int	pfsync_in_upd_c(struct pfsync_pkt *, struct mbuf *, int, int);
static int	pfsync_in_ureq(struct pfsync_pkt *, struct mbuf *, int, int);
static int	pfsync_in_del(struct pfsync_pkt *, struct mbuf *, int, int);
static int	pfsync_in_del_c(struct pfsync_pkt *, struct mbuf *, int, int);
static int	pfsync_in_bus(struct pfsync_pkt *, struct mbuf *, int, int);
static int	pfsync_in_tdb(struct pfsync_pkt *, struct mbuf *, int, int);
static int	pfsync_in_eof(struct pfsync_pkt *, struct mbuf *, int, int);
static int	pfsync_in_error(struct pfsync_pkt *, struct mbuf *, int, int);

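/*
 * Input action dispatch table, indexed by the subheader action; the
 * entries must stay in step with the PFSYNC_ACT_* ordering.
 */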
static int (*pfsync_acts[])(struct pfsync_pkt *, struct mbuf *, int, int) = {
	pfsync_in_clr,			/* PFSYNC_ACT_CLR */
	pfsync_in_ins,			/* PFSYNC_ACT_INS */
	pfsync_in_iack,			/* PFSYNC_ACT_INS_ACK */
	pfsync_in_upd,			/* PFSYNC_ACT_UPD */
	pfsync_in_upd_c,		/* PFSYNC_ACT_UPD_C */
	pfsync_in_ureq,			/* PFSYNC_ACT_UPD_REQ */
	pfsync_in_del,			/* PFSYNC_ACT_DEL */
	pfsync_in_del_c,		/* PFSYNC_ACT_DEL_C */
	pfsync_in_error,		/* PFSYNC_ACT_INS_F */
	pfsync_in_error,		/* PFSYNC_ACT_DEL_F */
	pfsync_in_bus,			/* PFSYNC_ACT_BUS */
	pfsync_in_tdb,			/* PFSYNC_ACT_TDB */
	pfsync_in_eof			/* PFSYNC_ACT_EOF */
};

struct pfsync_q {
	void		(*write)(struct pf_state *, void *);
	size_t		len;
	u_int8_t	action;
};

/* we have one of these for every PFSYNC_S_ */
static void	pfsync_out_state(struct pf_state *, void *);
static void	pfsync_out_iack(struct pf_state *, void *);
static void	pfsync_out_upd_c(struct pf_state *, void *);
static void	pfsync_out_del(struct pf_state *, void *);

static struct pfsync_q pfsync_qs[] = {
	{ pfsync_out_state, sizeof(struct pfsync_state),   PFSYNC_ACT_INS },
	{ pfsync_out_iack,  sizeof(struct pfsync_ins_ack), PFSYNC_ACT_INS_ACK },
	{ pfsync_out_state, sizeof(struct pfsync_state),   PFSYNC_ACT_UPD },
	{ pfsync_out_upd_c, sizeof(struct pfsync_upd_c),   PFSYNC_ACT_UPD_C },
	{ pfsync_out_del,   sizeof(struct pfsync_del_c),   PFSYNC_ACT_DEL_C }
};

static void	pfsync_q_ins(struct pf_state *, int);
static void	pfsync_q_del(struct pf_state *);

static void	pfsync_update_state(struct pf_state *);

struct pfsync_upd_req_item {
	TAILQ_ENTRY(pfsync_upd_req_item)	ur_entry;
	struct pfsync_upd_req			ur_msg;
};

struct pfsync_deferral {
	struct pfsync_softc		*pd_sc;
	TAILQ_ENTRY(pfsync_deferral)	pd_entry;
	u_int				pd_refs;
	struct callout			pd_tmo;

	struct pf_state			*pd_st;
	struct mbuf			*pd_m;
};

struct pfsync_softc {
	/* Configuration */
	struct ifnet		*sc_ifp;
	struct ifnet		*sc_sync_if;
	struct ip_moptions	sc_imo;
	struct in_addr		sc_sync_peer;
	uint32_t		sc_flags;
#define	PFSYNCF_OK		0x00000001
#define	PFSYNCF_DEFER		0x00000002
#define	PFSYNCF_PUSH		0x00000004
	uint8_t			sc_maxupdates;
	struct ip		sc_template;
	struct callout		sc_tmo;
	struct mtx		sc_mtx;

	/* Queued data */
	size_t			sc_len;
	TAILQ_HEAD(, pf_state)			sc_qs[PFSYNC_S_COUNT];
	TAILQ_HEAD(, pfsync_upd_req_item)	sc_upd_req_list;
	TAILQ_HEAD(, pfsync_deferral)		sc_deferrals;
	u_int			sc_deferred;
	void			*sc_plus;
	size_t			sc_pluslen;

	/* Bulk update info */
	struct mtx		sc_bulk_mtx;
	uint32_t		sc_ureq_sent;
	int			sc_bulk_tries;
	uint32_t		sc_ureq_received;
	int			sc_bulk_hashid;
	uint64_t		sc_bulk_stateid;
	uint32_t		sc_bulk_creatorid;
	struct callout		sc_bulk_tmo;
	struct callout		sc_bulkfail_tmo;
};

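/*
 * sc_mtx protects the queued data above; sc_bulk_mtx serializes the
 * bulk update state machine and its callouts.
 */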
#define	PFSYNC_LOCK(sc)		mtx_lock(&(sc)->sc_mtx)
#define	PFSYNC_UNLOCK(sc)	mtx_unlock(&(sc)->sc_mtx)
#define	PFSYNC_LOCK_ASSERT(sc)	mtx_assert(&(sc)->sc_mtx, MA_OWNED)

#define	PFSYNC_BLOCK(sc)	mtx_lock(&(sc)->sc_bulk_mtx)
#define	PFSYNC_BUNLOCK(sc)	mtx_unlock(&(sc)->sc_bulk_mtx)
#define	PFSYNC_BLOCK_ASSERT(sc)	mtx_assert(&(sc)->sc_bulk_mtx, MA_OWNED)

static MALLOC_DEFINE(M_PFSYNC, "pfsync", "pfsync(4) data");
static VNET_DEFINE(struct pfsync_softc	*, pfsyncif) = NULL;
#define	V_pfsyncif		VNET(pfsyncif)
static VNET_DEFINE(void *, pfsync_swi_cookie) = NULL;
#define	V_pfsync_swi_cookie	VNET(pfsync_swi_cookie)
static VNET_DEFINE(struct pfsyncstats, pfsyncstats);
#define	V_pfsyncstats		VNET(pfsyncstats)
static VNET_DEFINE(int, pfsync_carp_adj) = CARP_MAXSKEW;
#define	V_pfsync_carp_adj	VNET(pfsync_carp_adj)

static void	pfsync_timeout(void *);
static void	pfsync_push(struct pfsync_softc *);
static void	pfsyncintr(void *);
static int	pfsync_multicast_setup(struct pfsync_softc *, struct ifnet *,
		    void *);
static void	pfsync_multicast_cleanup(struct pfsync_softc *);
static void	pfsync_pointers_init(void);
static void	pfsync_pointers_uninit(void);
static int	pfsync_init(void);
static void	pfsync_uninit(void);

SYSCTL_NODE(_net, OID_AUTO, pfsync, CTLFLAG_RW, 0, "PFSYNC");
SYSCTL_VNET_STRUCT(_net_pfsync, OID_AUTO, stats, CTLFLAG_RW,
    &VNET_NAME(pfsyncstats), pfsyncstats,
    "PFSYNC statistics (struct pfsyncstats, net/if_pfsync.h)");
SYSCTL_INT(_net_pfsync, OID_AUTO, carp_demotion_factor, CTLFLAG_RW,
    &VNET_NAME(pfsync_carp_adj), 0, "pfsync's CARP demotion factor adjustment");

static int	pfsync_clone_create(struct if_clone *, int, caddr_t);
static void	pfsync_clone_destroy(struct ifnet *);
static int	pfsync_alloc_scrub_memory(struct pfsync_state_peer *,
		    struct pf_state_peer *);
static int	pfsyncoutput(struct ifnet *, struct mbuf *, struct sockaddr *,
		    struct route *);
static int	pfsyncioctl(struct ifnet *, u_long, caddr_t);

static int	pfsync_defer(struct pf_state *, struct mbuf *);
static void	pfsync_undefer(struct pfsync_deferral *, int);
static void	pfsync_undefer_state(struct pf_state *, int);
static void	pfsync_defer_tmo(void *);

static void	pfsync_request_update(u_int32_t, u_int64_t);
static void	pfsync_update_state_req(struct pf_state *);

static void	pfsync_drop(struct pfsync_softc *);
static void	pfsync_sendout(int);
static void	pfsync_send_plus(void *, size_t);

static void	pfsync_bulk_start(void);
static void	pfsync_bulk_status(u_int8_t);
static void	pfsync_bulk_update(void *);
static void	pfsync_bulk_fail(void *);

#ifdef IPSEC
static void	pfsync_update_net_tdb(struct pfsync_tdb *);
#endif

#define PFSYNC_MAX_BULKTRIES	12

VNET_DEFINE(struct ifc_simple_data, pfsync_cloner_data);
VNET_DEFINE(struct if_clone, pfsync_cloner);
#define	V_pfsync_cloner_data	VNET(pfsync_cloner_data)
#define	V_pfsync_cloner		VNET(pfsync_cloner)
IFC_SIMPLE_DECLARE(pfsync, 1);

static int
pfsync_clone_create(struct if_clone *ifc, int unit, caddr_t param)
{
	struct pfsync_softc *sc;
	struct ifnet *ifp;
	int q;

	if (unit != 0)
		return (EINVAL);

	sc = malloc(sizeof(struct pfsync_softc), M_PFSYNC, M_WAITOK | M_ZERO);
	sc->sc_flags |= PFSYNCF_OK;

	for (q = 0; q < PFSYNC_S_COUNT; q++)
		TAILQ_INIT(&sc->sc_qs[q]);

	TAILQ_INIT(&sc->sc_upd_req_list);
	TAILQ_INIT(&sc->sc_deferrals);

	sc->sc_len = PFSYNC_MINPKT;
	sc->sc_maxupdates = 128;

	ifp = sc->sc_ifp = if_alloc(IFT_PFSYNC);
	if (ifp == NULL) {
		free(sc, M_PFSYNC);
		return (ENOSPC);
	}
	if_initname(ifp, ifc->ifc_name, unit);
	ifp->if_softc = sc;
	ifp->if_ioctl = pfsyncioctl;
	ifp->if_output = pfsyncoutput;
	ifp->if_type = IFT_PFSYNC;
	ifp->if_snd.ifq_maxlen = ifqmaxlen;
	ifp->if_hdrlen = sizeof(struct pfsync_header);
	ifp->if_mtu = ETHERMTU;
	mtx_init(&sc->sc_mtx, "pfsync", NULL, MTX_DEF);
	mtx_init(&sc->sc_bulk_mtx, "pfsync bulk", NULL, MTX_DEF);
	callout_init(&sc->sc_tmo, CALLOUT_MPSAFE);
	callout_init_mtx(&sc->sc_bulk_tmo, &sc->sc_bulk_mtx, 0);
	callout_init_mtx(&sc->sc_bulkfail_tmo, &sc->sc_bulk_mtx, 0);

	if_attach(ifp);

	bpfattach(ifp, DLT_PFSYNC, PFSYNC_HDRLEN);

	V_pfsyncif = sc;

	return (0);
}

static void
pfsync_clone_destroy(struct ifnet *ifp)
{
	struct pfsync_softc *sc = ifp->if_softc;

	/*
	 * At this stage, everything should have already been
	 * cleared by pfsync_uninit(), and we have only to
	 * drain callouts.
	 */
	while (sc->sc_deferred > 0) {
		struct pfsync_deferral *pd = TAILQ_FIRST(&sc->sc_deferrals);

		TAILQ_REMOVE(&sc->sc_deferrals, pd, pd_entry);
		sc->sc_deferred--;
		if (callout_stop(&pd->pd_tmo)) {
			pf_release_state(pd->pd_st);
			m_freem(pd->pd_m);
			free(pd, M_PFSYNC);
		} else {
			pd->pd_refs++;
			callout_drain(&pd->pd_tmo);
			free(pd, M_PFSYNC);
		}
	}

	callout_drain(&sc->sc_tmo);
	callout_drain(&sc->sc_bulkfail_tmo);
	callout_drain(&sc->sc_bulk_tmo);

	if (!(sc->sc_flags & PFSYNCF_OK) && carp_demote_adj_p)
		(*carp_demote_adj_p)(-V_pfsync_carp_adj, "pfsync destroy");
	bpfdetach(ifp);
	if_detach(ifp);

	pfsync_drop(sc);

	if_free(ifp);
	if (sc->sc_imo.imo_membership)
		pfsync_multicast_cleanup(sc);
	mtx_destroy(&sc->sc_mtx);
	mtx_destroy(&sc->sc_bulk_mtx);
	free(sc, M_PFSYNC);

	V_pfsyncif = NULL;
}

static int
pfsync_alloc_scrub_memory(struct pfsync_state_peer *s,
    struct pf_state_peer *d)
{
	if (s->scrub.scrub_flag && d->scrub == NULL) {
		d->scrub = uma_zalloc(V_pf_state_scrub_z, M_NOWAIT | M_ZERO);
		if (d->scrub == NULL)
			return (ENOMEM);
	}

	return (0);
}

static int
pfsync_state_import(struct pfsync_state *sp, u_int8_t flags)
{
	struct pfsync_softc *sc = V_pfsyncif;
	struct pf_state	*st = NULL;
	struct pf_state_key *skw = NULL, *sks = NULL;
	struct pf_rule *r = NULL;
	struct pfi_kif	*kif;
	int error;

	PF_RULES_RASSERT();

	if (sp->creatorid == 0 && V_pf_status.debug >= PF_DEBUG_MISC) {
		printf("%s: invalid creator id: %08x\n", __func__,
		    ntohl(sp->creatorid));
		return (EINVAL);
	}

	if ((kif = pfi_kif_find(sp->ifname)) == NULL) {
		if (V_pf_status.debug >= PF_DEBUG_MISC)
			printf("%s: unknown interface: %s\n", __func__,
			    sp->ifname);
		if (flags & PFSYNC_SI_IOCTL)
			return (EINVAL);
		return (0);	/* skip this state */
	}

	/*
	 * If the ruleset checksums match or the state is coming from the ioctl,
	 * it's safe to associate the state with the rule of that number.
	 */
	if (sp->rule != htonl(-1) && sp->anchor == htonl(-1) &&
	    (flags & (PFSYNC_SI_IOCTL | PFSYNC_SI_CKSUM)) && ntohl(sp->rule) <
	    pf_main_ruleset.rules[PF_RULESET_FILTER].active.rcount)
		r = pf_main_ruleset.rules[
		    PF_RULESET_FILTER].active.ptr_array[ntohl(sp->rule)];
	else
		r = &V_pf_default_rule;

	if (r->max_states && r->states_cur >= r->max_states)
		goto cleanup;

	/*
	 * XXXGL: consider using M_WAITOK in the ioctl path.
	 */
	if ((st = uma_zalloc(V_pf_state_z, M_NOWAIT | M_ZERO)) == NULL)
		goto cleanup;

	if ((skw = uma_zalloc(V_pf_state_key_z, M_NOWAIT)) == NULL)
		goto cleanup;

	if (PF_ANEQ(&sp->key[PF_SK_WIRE].addr[0],
	    &sp->key[PF_SK_STACK].addr[0], sp->af) ||
	    PF_ANEQ(&sp->key[PF_SK_WIRE].addr[1],
	    &sp->key[PF_SK_STACK].addr[1], sp->af) ||
	    sp->key[PF_SK_WIRE].port[0] != sp->key[PF_SK_STACK].port[0] ||
	    sp->key[PF_SK_WIRE].port[1] != sp->key[PF_SK_STACK].port[1]) {
		sks = uma_zalloc(V_pf_state_key_z, M_NOWAIT);
		if (sks == NULL)
			goto cleanup;
	} else
		sks = skw;

	/* allocate memory for scrub info */
	if (pfsync_alloc_scrub_memory(&sp->src, &st->src) ||
	    pfsync_alloc_scrub_memory(&sp->dst, &st->dst))
		goto cleanup;

	/* copy to state key(s) */
	skw->addr[0] = sp->key[PF_SK_WIRE].addr[0];
	skw->addr[1] = sp->key[PF_SK_WIRE].addr[1];
	skw->port[0] = sp->key[PF_SK_WIRE].port[0];
	skw->port[1] = sp->key[PF_SK_WIRE].port[1];
	skw->proto = sp->proto;
	skw->af = sp->af;
	if (sks != skw) {
		sks->addr[0] = sp->key[PF_SK_STACK].addr[0];
		sks->addr[1] = sp->key[PF_SK_STACK].addr[1];
		sks->port[0] = sp->key[PF_SK_STACK].port[0];
		sks->port[1] = sp->key[PF_SK_STACK].port[1];
		sks->proto = sp->proto;
		sks->af = sp->af;
	}

	/* copy to state */
	bcopy(&sp->rt_addr, &st->rt_addr, sizeof(st->rt_addr));
	st->creation = time_uptime - ntohl(sp->creation);
	st->expire = time_uptime;
	if (sp->expire) {
		uint32_t timeout;

		timeout = r->timeout[sp->timeout];
		if (!timeout)
			timeout = V_pf_default_rule.timeout[sp->timeout];

		/*
		 * sp->expire is the remaining lifetime and may have been
		 * adaptively scaled by the exporting peer.  Backdate
		 * st->expire by the already-elapsed part of the timeout:
		 * e.g. a 60s timeout with 20s remaining puts expire 40s
		 * in the past, so 20s remain locally as well.
		 */
		st->expire -= timeout - ntohl(sp->expire);
	}

	st->direction = sp->direction;
	st->log = sp->log;
	st->timeout = sp->timeout;
	st->state_flags = sp->state_flags;

	st->id = sp->id;
	st->creatorid = sp->creatorid;
	pf_state_peer_ntoh(&sp->src, &st->src);
	pf_state_peer_ntoh(&sp->dst, &st->dst);

	st->rule.ptr = r;
	st->nat_rule.ptr = NULL;
	st->anchor.ptr = NULL;
	st->rt_kif = NULL;

	st->pfsync_time = time_uptime;
	st->sync_state = PFSYNC_S_NONE;

	/* XXX when we have nat_rule/anchors, use STATE_INC_COUNTERS */
	r->states_cur++;
	r->states_tot++;

	if (!(flags & PFSYNC_SI_IOCTL))
		st->state_flags |= PFSTATE_NOSYNC;

	if ((error = pf_state_insert(kif, skw, sks, st)) != 0) {
		/* XXX when we have nat_rule/anchors, use STATE_DEC_COUNTERS */
		r->states_cur--;
		goto cleanup_state;
	}

	if (!(flags & PFSYNC_SI_IOCTL)) {
		st->state_flags &= ~PFSTATE_NOSYNC;
		if (st->state_flags & PFSTATE_ACK) {
			pfsync_q_ins(st, PFSYNC_S_IACK);
			pfsync_push(sc);
		}
	}
	st->state_flags &= ~PFSTATE_ACK;
	PF_STATE_UNLOCK(st);

	return (0);

cleanup:
	error = ENOMEM;
	if (skw == sks)
		sks = NULL;
	if (skw != NULL)
		uma_zfree(V_pf_state_key_z, skw);
	if (sks != NULL)
		uma_zfree(V_pf_state_key_z, sks);

cleanup_state:	/* pf_state_insert() frees the state keys. */
	if (st) {
		if (st->dst.scrub)
			uma_zfree(V_pf_state_scrub_z, st->dst.scrub);
		if (st->src.scrub)
			uma_zfree(V_pf_state_scrub_z, st->src.scrub);
		uma_zfree(V_pf_state_z, st);
	}
	return (error);
}

static void
pfsync_input(struct mbuf *m, __unused int off)
{
	struct pfsync_softc *sc = V_pfsyncif;
	struct pfsync_pkt pkt;
	struct ip *ip = mtod(m, struct ip *);
	struct pfsync_header *ph;
	struct pfsync_subheader subh;

	int offset;
	int rv;
	uint16_t count;

	V_pfsyncstats.pfsyncs_ipackets++;

	/* Verify that we have a sync interface configured. */
	if (!sc || !sc->sc_sync_if || !V_pf_status.running ||
	    (sc->sc_ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
		goto done;

	/* verify that the packet came in on the right interface */
	if (sc->sc_sync_if != m->m_pkthdr.rcvif) {
		V_pfsyncstats.pfsyncs_badif++;
		goto done;
	}

	sc->sc_ifp->if_ipackets++;
	sc->sc_ifp->if_ibytes += m->m_pkthdr.len;
	/* verify that the IP TTL is 255. */
	if (ip->ip_ttl != PFSYNC_DFLTTL) {
		V_pfsyncstats.pfsyncs_badttl++;
		goto done;
	}

	offset = ip->ip_hl << 2;
	if (m->m_pkthdr.len < offset + sizeof(*ph)) {
		V_pfsyncstats.pfsyncs_hdrops++;
		goto done;
	}

	if (offset + sizeof(*ph) > m->m_len) {
		if (m_pullup(m, offset + sizeof(*ph)) == NULL) {
			V_pfsyncstats.pfsyncs_hdrops++;
			return;
		}
		ip = mtod(m, struct ip *);
	}
	ph = (struct pfsync_header *)((char *)ip + offset);

	/* verify the version */
	if (ph->version != PFSYNC_VERSION) {
		V_pfsyncstats.pfsyncs_badver++;
		goto done;
	}

	/* Cheaper to grab this now than having to mess with mbufs later */
	pkt.ip = ip;
	pkt.src = ip->ip_src;
	pkt.flags = 0;

	/*
	 * Trusting pf_chksum during packet processing, as well as
	 * searching the interface name tree, requires holding
	 * PF_RULES_RLOCK().
	 */
	PF_RULES_RLOCK();
	if (!bcmp(&ph->pfcksum, &V_pf_status.pf_chksum, PF_MD5_DIGEST_LENGTH))
		pkt.flags |= PFSYNC_SI_CKSUM;

	offset += sizeof(*ph);
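	/*
	 * Walk the sub-blocks; each parser returns the number of bytes
	 * it consumed, or -1 once it has taken over (and freed) the
	 * mbuf, which is how the EOF action terminates the loop.
	 */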
	for (;;) {
		m_copydata(m, offset, sizeof(subh), (caddr_t)&subh);
		offset += sizeof(subh);

		if (subh.action >= PFSYNC_ACT_MAX) {
			V_pfsyncstats.pfsyncs_badact++;
			PF_RULES_RUNLOCK();
			goto done;
		}

		count = ntohs(subh.count);
		V_pfsyncstats.pfsyncs_iacts[subh.action] += count;
		rv = (*pfsync_acts[subh.action])(&pkt, m, offset, count);
		if (rv == -1) {
			PF_RULES_RUNLOCK();
			return;
		}

		offset += rv;
	}
	PF_RULES_RUNLOCK();

done:
	m_freem(m);
}

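/*
 * PFSYNC_ACT_CLR: unlink every state matching the given creatorid,
 * optionally restricted to states bound to a named interface.
 */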
static int
pfsync_in_clr(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
{
	struct pfsync_clr *clr;
	struct mbuf *mp;
	int len = sizeof(*clr) * count;
	int i, offp;
	u_int32_t creatorid;

	mp = m_pulldown(m, offset, len, &offp);
	if (mp == NULL) {
		V_pfsyncstats.pfsyncs_badlen++;
		return (-1);
	}
	clr = (struct pfsync_clr *)(mp->m_data + offp);

	for (i = 0; i < count; i++) {
		creatorid = clr[i].creatorid;

		if (clr[i].ifname[0] != '\0' &&
		    pfi_kif_find(clr[i].ifname) == NULL)
			continue;

		for (int i = 0; i <= V_pf_hashmask; i++) {
			struct pf_idhash *ih = &V_pf_idhash[i];
			struct pf_state *s;
relock:
			PF_HASHROW_LOCK(ih);
			LIST_FOREACH(s, &ih->states, entry) {
				if (s->creatorid == creatorid) {
					s->state_flags |= PFSTATE_NOSYNC;
					pf_unlink_state(s, PF_ENTER_LOCKED);
					goto relock;
				}
			}
			PF_HASHROW_UNLOCK(ih);
		}
	}

	return (len);
}

static int
pfsync_in_ins(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
{
	struct mbuf *mp;
	struct pfsync_state *sa, *sp;
	int len = sizeof(*sp) * count;
	int i, offp;

	mp = m_pulldown(m, offset, len, &offp);
	if (mp == NULL) {
		V_pfsyncstats.pfsyncs_badlen++;
		return (-1);
	}
	sa = (struct pfsync_state *)(mp->m_data + offp);

	for (i = 0; i < count; i++) {
		sp = &sa[i];

		/* Check for invalid values. */
		if (sp->timeout >= PFTM_MAX ||
		    sp->src.state > PF_TCPS_PROXY_DST ||
		    sp->dst.state > PF_TCPS_PROXY_DST ||
		    sp->direction > PF_OUT ||
		    (sp->af != AF_INET && sp->af != AF_INET6)) {
			if (V_pf_status.debug >= PF_DEBUG_MISC)
				printf("%s: invalid value\n", __func__);
			V_pfsyncstats.pfsyncs_badval++;
			continue;
		}

		if (pfsync_state_import(sp, pkt->flags) == ENOMEM)
			/* Drop out, but process the rest of the actions. */
			break;
	}

	return (len);
}

static int
pfsync_in_iack(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
{
	struct pfsync_ins_ack *ia, *iaa;
	struct pf_state *st;

	struct mbuf *mp;
	int len = count * sizeof(*ia);
	int offp, i;

	mp = m_pulldown(m, offset, len, &offp);
	if (mp == NULL) {
		V_pfsyncstats.pfsyncs_badlen++;
		return (-1);
	}
	iaa = (struct pfsync_ins_ack *)(mp->m_data + offp);

	for (i = 0; i < count; i++) {
		ia = &iaa[i];

		st = pf_find_state_byid(ia->id, ia->creatorid);
		if (st == NULL)
			continue;

		if (st->state_flags & PFSTATE_ACK) {
			PFSYNC_LOCK(V_pfsyncif);
			pfsync_undefer_state(st, 0);
			PFSYNC_UNLOCK(V_pfsyncif);
		}
		PF_STATE_UNLOCK(st);
	}
	/*
	 * XXX this is not yet implemented, but we know the size of the
	 * message so we can skip it.
	 */

	return (count * sizeof(struct pfsync_ins_ack));
}

static int
pfsync_upd_tcp(struct pf_state *st, struct pfsync_state_peer *src,
    struct pfsync_state_peer *dst)
{
	int sfail = 0;

	PF_STATE_LOCK_ASSERT(st);

	/*
	 * The state should never go backwards except
	 * for syn-proxy states.  Neither should the
	 * sequence window slide backwards.
	 */
	if (st->src.state > src->state &&
	    (st->src.state < PF_TCPS_PROXY_SRC ||
	    src->state >= PF_TCPS_PROXY_SRC))
		sfail = 1;
	else if (SEQ_GT(st->src.seqlo, ntohl(src->seqlo)))
		sfail = 3;
	else if (st->dst.state > dst->state) {
		/*
		 * There might still be useful information about the
		 * src state here, so import that part of the update,
		 * then "fail" so we send our updated state back to
		 * the peer who is missing what we know.
		 */
		pf_state_peer_ntoh(src, &st->src);
		/* XXX do anything with timeouts? */
		sfail = 7;
	} else if (st->dst.state >= TCPS_SYN_SENT &&
	    SEQ_GT(st->dst.seqlo, ntohl(dst->seqlo)))
		sfail = 4;

	return (sfail);
}

static int
pfsync_in_upd(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
{
	struct pfsync_softc *sc = V_pfsyncif;
	struct pfsync_state *sa, *sp;
	struct pf_state_key *sk;
	struct pf_state *st;
	int sfail;

	struct mbuf *mp;
	int len = count * sizeof(*sp);
	int offp, i;

	mp = m_pulldown(m, offset, len, &offp);
	if (mp == NULL) {
		V_pfsyncstats.pfsyncs_badlen++;
		return (-1);
	}
	sa = (struct pfsync_state *)(mp->m_data + offp);

	for (i = 0; i < count; i++) {
		sp = &sa[i];

		/* check for invalid values */
		if (sp->timeout >= PFTM_MAX ||
		    sp->src.state > PF_TCPS_PROXY_DST ||
		    sp->dst.state > PF_TCPS_PROXY_DST) {
			if (V_pf_status.debug >= PF_DEBUG_MISC) {
				printf("pfsync_input: PFSYNC_ACT_UPD: "
				    "invalid value\n");
			}
			V_pfsyncstats.pfsyncs_badval++;
			continue;
		}

		st = pf_find_state_byid(sp->id, sp->creatorid);
		if (st == NULL) {
			/* insert the update */
			if (pfsync_state_import(sp, 0))
				V_pfsyncstats.pfsyncs_badstate++;
			continue;
		}

		if (st->state_flags & PFSTATE_ACK) {
			PFSYNC_LOCK(sc);
			pfsync_undefer_state(st, 1);
			PFSYNC_UNLOCK(sc);
		}

		sk = st->key[PF_SK_WIRE];	/* XXX right one? */
		sfail = 0;
		if (sk->proto == IPPROTO_TCP)
			sfail = pfsync_upd_tcp(st, &sp->src, &sp->dst);
		else {
			/*
			 * Non-TCP protocol state machines always go
			 * forwards.
			 */
			if (st->src.state > sp->src.state)
				sfail = 5;
			else if (st->dst.state > sp->dst.state)
				sfail = 6;
		}

		if (sfail) {
			if (V_pf_status.debug >= PF_DEBUG_MISC) {
				printf("pfsync: %s stale update (%d)"
				    " id: %016llx creatorid: %08x\n",
				    (sfail < 7 ?  "ignoring" : "partial"),
				    sfail, (unsigned long long)be64toh(st->id),
				    ntohl(st->creatorid));
			}
			V_pfsyncstats.pfsyncs_stale++;

			pfsync_update_state(st);
			PF_STATE_UNLOCK(st);
			PFSYNC_LOCK(sc);
			pfsync_push(sc);
			PFSYNC_UNLOCK(sc);
			continue;
		}
		pfsync_alloc_scrub_memory(&sp->dst, &st->dst);
		pf_state_peer_ntoh(&sp->src, &st->src);
		pf_state_peer_ntoh(&sp->dst, &st->dst);
		st->expire = time_uptime;
		st->timeout = sp->timeout;
		st->pfsync_time = time_uptime;
		PF_STATE_UNLOCK(st);
	}

	return (len);
}

static int
pfsync_in_upd_c(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
{
	struct pfsync_softc *sc = V_pfsyncif;
	struct pfsync_upd_c *ua, *up;
	struct pf_state_key *sk;
	struct pf_state *st;

	int len = count * sizeof(*up);
	int sfail;

	struct mbuf *mp;
	int offp, i;

	mp = m_pulldown(m, offset, len, &offp);
	if (mp == NULL) {
		V_pfsyncstats.pfsyncs_badlen++;
		return (-1);
	}
	ua = (struct pfsync_upd_c *)(mp->m_data + offp);

	for (i = 0; i < count; i++) {
		up = &ua[i];

		/* check for invalid values */
		if (up->timeout >= PFTM_MAX ||
		    up->src.state > PF_TCPS_PROXY_DST ||
		    up->dst.state > PF_TCPS_PROXY_DST) {
			if (V_pf_status.debug >= PF_DEBUG_MISC) {
				printf("pfsync_input: "
				    "PFSYNC_ACT_UPD_C: "
				    "invalid value\n");
			}
			V_pfsyncstats.pfsyncs_badval++;
			continue;
		}

		st = pf_find_state_byid(up->id, up->creatorid);
		if (st == NULL) {
			/* We don't have this state. Ask for it. */
			PFSYNC_LOCK(sc);
			pfsync_request_update(up->creatorid, up->id);
			PFSYNC_UNLOCK(sc);
			continue;
		}

		if (st->state_flags & PFSTATE_ACK) {
			PFSYNC_LOCK(sc);
			pfsync_undefer_state(st, 1);
			PFSYNC_UNLOCK(sc);
		}

		sk = st->key[PF_SK_WIRE]; /* XXX right one? */
		sfail = 0;
		if (sk->proto == IPPROTO_TCP)
			sfail = pfsync_upd_tcp(st, &up->src, &up->dst);
		else {
			/*
			 * Non-TCP protocol state machines always go
			 * forwards.
			 */
			if (st->src.state > up->src.state)
				sfail = 5;
			else if (st->dst.state > up->dst.state)
				sfail = 6;
		}

		if (sfail) {
			if (V_pf_status.debug >= PF_DEBUG_MISC) {
				printf("pfsync: ignoring stale update "
				    "(%d) id: %016llx "
				    "creatorid: %08x\n", sfail,
				    (unsigned long long)be64toh(st->id),
				    ntohl(st->creatorid));
			}
			V_pfsyncstats.pfsyncs_stale++;

			pfsync_update_state(st);
			PF_STATE_UNLOCK(st);
			PFSYNC_LOCK(sc);
			pfsync_push(sc);
			PFSYNC_UNLOCK(sc);
			continue;
		}
		pfsync_alloc_scrub_memory(&up->dst, &st->dst);
		pf_state_peer_ntoh(&up->src, &st->src);
		pf_state_peer_ntoh(&up->dst, &st->dst);
		st->expire = time_uptime;
		st->timeout = up->timeout;
		st->pfsync_time = time_uptime;
		PF_STATE_UNLOCK(st);
	}

	return (len);
}

static int
pfsync_in_ureq(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
{
	struct pfsync_upd_req *ur, *ura;
	struct mbuf *mp;
	int len = count * sizeof(*ur);
	int i, offp;

	struct pf_state *st;

	mp = m_pulldown(m, offset, len, &offp);
	if (mp == NULL) {
		V_pfsyncstats.pfsyncs_badlen++;
		return (-1);
	}
	ura = (struct pfsync_upd_req *)(mp->m_data + offp);

	for (i = 0; i < count; i++) {
		ur = &ura[i];

		if (ur->id == 0 && ur->creatorid == 0)
			pfsync_bulk_start();
		else {
			st = pf_find_state_byid(ur->id, ur->creatorid);
			if (st == NULL) {
				V_pfsyncstats.pfsyncs_badstate++;
				continue;
			}
			if (st->state_flags & PFSTATE_NOSYNC) {
				PF_STATE_UNLOCK(st);
				continue;
			}

			pfsync_update_state_req(st);
			PF_STATE_UNLOCK(st);
		}
	}

	return (len);
}

static int
pfsync_in_del(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
{
	struct mbuf *mp;
	struct pfsync_state *sa, *sp;
	struct pf_state *st;
	int len = count * sizeof(*sp);
	int offp, i;

	mp = m_pulldown(m, offset, len, &offp);
	if (mp == NULL) {
		V_pfsyncstats.pfsyncs_badlen++;
		return (-1);
	}
	sa = (struct pfsync_state *)(mp->m_data + offp);

	for (i = 0; i < count; i++) {
		sp = &sa[i];

		st = pf_find_state_byid(sp->id, sp->creatorid);
		if (st == NULL) {
			V_pfsyncstats.pfsyncs_badstate++;
			continue;
		}
		st->state_flags |= PFSTATE_NOSYNC;
		pf_unlink_state(st, PF_ENTER_LOCKED);
	}

	return (len);
}

static int
pfsync_in_del_c(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
{
	struct mbuf *mp;
	struct pfsync_del_c *sa, *sp;
	struct pf_state *st;
	int len = count * sizeof(*sp);
	int offp, i;

	mp = m_pulldown(m, offset, len, &offp);
	if (mp == NULL) {
		V_pfsyncstats.pfsyncs_badlen++;
		return (-1);
	}
	sa = (struct pfsync_del_c *)(mp->m_data + offp);

	for (i = 0; i < count; i++) {
		sp = &sa[i];

		st = pf_find_state_byid(sp->id, sp->creatorid);
		if (st == NULL) {
			V_pfsyncstats.pfsyncs_badstate++;
			continue;
		}

		st->state_flags |= PFSTATE_NOSYNC;
		pf_unlink_state(st, PF_ENTER_LOCKED);
	}

	return (len);
}

static int
pfsync_in_bus(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
{
	struct pfsync_softc *sc = V_pfsyncif;
	struct pfsync_bus *bus;
	struct mbuf *mp;
	int len = count * sizeof(*bus);
	int offp;

	PFSYNC_BLOCK(sc);

	/* If we're not waiting for a bulk update, who cares. */
	if (sc->sc_ureq_sent == 0) {
		PFSYNC_BUNLOCK(sc);
		return (len);
	}

	mp = m_pulldown(m, offset, len, &offp);
	if (mp == NULL) {
		PFSYNC_BUNLOCK(sc);
		V_pfsyncstats.pfsyncs_badlen++;
		return (-1);
	}
	bus = (struct pfsync_bus *)(mp->m_data + offp);

	switch (bus->status) {
	case PFSYNC_BUS_START:
		callout_reset(&sc->sc_bulkfail_tmo, 4 * hz +
		    V_pf_limits[PF_LIMIT_STATES].limit /
		    ((sc->sc_ifp->if_mtu - PFSYNC_MINPKT) /
		    sizeof(struct pfsync_state)),
		    pfsync_bulk_fail, sc);
		if (V_pf_status.debug >= PF_DEBUG_MISC)
			printf("pfsync: received bulk update start\n");
		break;

	case PFSYNC_BUS_END:
		if (time_uptime - ntohl(bus->endtime) >=
		    sc->sc_ureq_sent) {
			/* that's it, we're happy */
			sc->sc_ureq_sent = 0;
			sc->sc_bulk_tries = 0;
			callout_stop(&sc->sc_bulkfail_tmo);
			if (!(sc->sc_flags & PFSYNCF_OK) && carp_demote_adj_p)
				(*carp_demote_adj_p)(-V_pfsync_carp_adj,
				    "pfsync bulk done");
			sc->sc_flags |= PFSYNCF_OK;
			if (V_pf_status.debug >= PF_DEBUG_MISC)
				printf("pfsync: received valid "
				    "bulk update end\n");
		} else {
			if (V_pf_status.debug >= PF_DEBUG_MISC)
				printf("pfsync: received invalid "
				    "bulk update end: bad timestamp\n");
		}
		break;
	}
	PFSYNC_BUNLOCK(sc);

	return (len);
}

static int
pfsync_in_tdb(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
{
	int len = count * sizeof(struct pfsync_tdb);

#if defined(IPSEC)
	struct pfsync_tdb *tp;
	struct mbuf *mp;
	int offp;
	int i;
	int s;

	mp = m_pulldown(m, offset, len, &offp);
	if (mp == NULL) {
		V_pfsyncstats.pfsyncs_badlen++;
		return (-1);
	}
	tp = (struct pfsync_tdb *)(mp->m_data + offp);

	for (i = 0; i < count; i++)
		pfsync_update_net_tdb(&tp[i]);
#endif

	return (len);
}

#if defined(IPSEC)
/* Update an in-kernel tdb. Silently fail if no tdb is found. */
static void
pfsync_update_net_tdb(struct pfsync_tdb *pt)
{
	struct tdb		*tdb;
	int			 s;

	/* check for invalid values */
	if (ntohl(pt->spi) <= SPI_RESERVED_MAX ||
	    (pt->dst.sa.sa_family != AF_INET &&
	    pt->dst.sa.sa_family != AF_INET6))
		goto bad;

	tdb = gettdb(pt->spi, &pt->dst, pt->sproto);
	if (tdb) {
		pt->rpl = ntohl(pt->rpl);
		pt->cur_bytes = (unsigned long long)be64toh(pt->cur_bytes);

		/* Neither replay nor byte counter should ever decrease. */
		if (pt->rpl < tdb->tdb_rpl ||
		    pt->cur_bytes < tdb->tdb_cur_bytes) {
			goto bad;
		}

		tdb->tdb_rpl = pt->rpl;
		tdb->tdb_cur_bytes = pt->cur_bytes;
	}
	return;

bad:
	if (V_pf_status.debug >= PF_DEBUG_MISC)
		printf("pfsync_insert: PFSYNC_ACT_TDB_UPD: "
		    "invalid value\n");
	V_pfsyncstats.pfsyncs_badstate++;
	return;
}
#endif

static int
pfsync_in_eof(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
{
	/* check if we are at the right place in the packet */
	if (offset != m->m_pkthdr.len - sizeof(struct pfsync_eof))
		V_pfsyncstats.pfsyncs_badact++;

	/* we're done. free and let the caller return */
	m_freem(m);
	return (-1);
}

static int
pfsync_in_error(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
{
	V_pfsyncstats.pfsyncs_badact++;

	m_freem(m);
	return (-1);
}

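/*
 * pfsync has no output path of its own; any locally originated
 * packets are simply dropped.
 */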
static int
pfsyncoutput(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst,
	struct route *rt)
{
	m_freem(m);
	return (0);
}

/* ARGSUSED */
static int
pfsyncioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
{
	struct pfsync_softc *sc = ifp->if_softc;
	struct ifreq *ifr = (struct ifreq *)data;
	struct pfsyncreq pfsyncr;
	int error;

	switch (cmd) {
	case SIOCSIFFLAGS:
		PFSYNC_LOCK(sc);
		if (ifp->if_flags & IFF_UP) {
			ifp->if_drv_flags |= IFF_DRV_RUNNING;
			PFSYNC_UNLOCK(sc);
			pfsync_pointers_init();
		} else {
			ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
			PFSYNC_UNLOCK(sc);
			pfsync_pointers_uninit();
		}
		break;
	case SIOCSIFMTU:
		if (!sc->sc_sync_if ||
		    ifr->ifr_mtu <= PFSYNC_MINPKT ||
		    ifr->ifr_mtu > sc->sc_sync_if->if_mtu)
			return (EINVAL);
		if (ifr->ifr_mtu < ifp->if_mtu) {
			PFSYNC_LOCK(sc);
			if (sc->sc_len > PFSYNC_MINPKT)
				pfsync_sendout(1);
			PFSYNC_UNLOCK(sc);
		}
		ifp->if_mtu = ifr->ifr_mtu;
		break;
	case SIOCGETPFSYNC:
		bzero(&pfsyncr, sizeof(pfsyncr));
		PFSYNC_LOCK(sc);
		if (sc->sc_sync_if) {
			strlcpy(pfsyncr.pfsyncr_syncdev,
			    sc->sc_sync_if->if_xname, IFNAMSIZ);
		}
		pfsyncr.pfsyncr_syncpeer = sc->sc_sync_peer;
		pfsyncr.pfsyncr_maxupdates = sc->sc_maxupdates;
		pfsyncr.pfsyncr_defer = (PFSYNCF_DEFER ==
		    (sc->sc_flags & PFSYNCF_DEFER));
		PFSYNC_UNLOCK(sc);
		return (copyout(&pfsyncr, ifr->ifr_data, sizeof(pfsyncr)));

	case SIOCSETPFSYNC:
	    {
		struct ip_moptions *imo = &sc->sc_imo;
		struct ifnet *sifp;
		struct ip *ip;
		void *mship = NULL;

		if ((error = priv_check(curthread, PRIV_NETINET_PF)) != 0)
			return (error);
		if ((error = copyin(ifr->ifr_data, &pfsyncr, sizeof(pfsyncr))))
			return (error);

		if (pfsyncr.pfsyncr_maxupdates > 255)
			return (EINVAL);

		if (pfsyncr.pfsyncr_syncdev[0] == 0)
			sifp = NULL;
		else if ((sifp = ifunit_ref(pfsyncr.pfsyncr_syncdev)) == NULL)
			return (EINVAL);

		if (pfsyncr.pfsyncr_syncpeer.s_addr == 0 && sifp != NULL)
			mship = malloc((sizeof(struct in_multi *) *
			    IP_MIN_MEMBERSHIPS), M_PFSYNC, M_WAITOK | M_ZERO);

		PFSYNC_LOCK(sc);
		if (pfsyncr.pfsyncr_syncpeer.s_addr == 0)
			sc->sc_sync_peer.s_addr = htonl(INADDR_PFSYNC_GROUP);
		else
			sc->sc_sync_peer.s_addr =
			    pfsyncr.pfsyncr_syncpeer.s_addr;

		sc->sc_maxupdates = pfsyncr.pfsyncr_maxupdates;
		if (pfsyncr.pfsyncr_defer) {
			sc->sc_flags |= PFSYNCF_DEFER;
			pfsync_defer_ptr = pfsync_defer;
		} else {
			sc->sc_flags &= ~PFSYNCF_DEFER;
			pfsync_defer_ptr = NULL;
		}

		if (sifp == NULL) {
			if (sc->sc_sync_if)
				if_rele(sc->sc_sync_if);
			sc->sc_sync_if = NULL;
			if (imo->imo_membership)
				pfsync_multicast_cleanup(sc);
			PFSYNC_UNLOCK(sc);
			break;
		}

		if (sc->sc_len > PFSYNC_MINPKT &&
		    (sifp->if_mtu < sc->sc_ifp->if_mtu ||
		    (sc->sc_sync_if != NULL &&
		    sifp->if_mtu < sc->sc_sync_if->if_mtu) ||
		    sifp->if_mtu < MCLBYTES - sizeof(struct ip)))
			pfsync_sendout(1);

		if (imo->imo_membership)
			pfsync_multicast_cleanup(sc);

		if (sc->sc_sync_peer.s_addr == htonl(INADDR_PFSYNC_GROUP)) {
			error = pfsync_multicast_setup(sc, sifp, mship);
			if (error) {
				if_rele(sifp);
				free(mship, M_PFSYNC);
				return (error);
			}
		}
		if (sc->sc_sync_if)
			if_rele(sc->sc_sync_if);
		sc->sc_sync_if = sifp;

		ip = &sc->sc_template;
		bzero(ip, sizeof(*ip));
		ip->ip_v = IPVERSION;
		ip->ip_hl = sizeof(sc->sc_template) >> 2;
		ip->ip_tos = IPTOS_LOWDELAY;
		/* len and id are set later. */
		ip->ip_off = IP_DF;
		ip->ip_ttl = PFSYNC_DFLTTL;
		ip->ip_p = IPPROTO_PFSYNC;
		ip->ip_src.s_addr = INADDR_ANY;
		ip->ip_dst.s_addr = sc->sc_sync_peer.s_addr;

		/* Request a full state table update. */
		if ((sc->sc_flags & PFSYNCF_OK) && carp_demote_adj_p)
			(*carp_demote_adj_p)(V_pfsync_carp_adj,
			    "pfsync bulk start");
		sc->sc_flags &= ~PFSYNCF_OK;
		if (V_pf_status.debug >= PF_DEBUG_MISC)
			printf("pfsync: requesting bulk update\n");
		pfsync_request_update(0, 0);
		PFSYNC_UNLOCK(sc);
		PFSYNC_BLOCK(sc);
		sc->sc_ureq_sent = time_uptime;
		callout_reset(&sc->sc_bulkfail_tmo, 5 * hz, pfsync_bulk_fail,
		    sc);
		PFSYNC_BUNLOCK(sc);

		break;
	    }
	default:
		return (ENOTTY);
	}

	return (0);
}

static void
pfsync_out_state(struct pf_state *st, void *buf)
{
	struct pfsync_state *sp = buf;

	pfsync_state_export(sp, st);
}

static void
pfsync_out_iack(struct pf_state *st, void *buf)
{
	struct pfsync_ins_ack *iack = buf;

	iack->id = st->id;
	iack->creatorid = st->creatorid;
}

static void
pfsync_out_upd_c(struct pf_state *st, void *buf)
{
	struct pfsync_upd_c *up = buf;

	bzero(up, sizeof(*up));
	up->id = st->id;
	pf_state_peer_hton(&st->src, &up->src);
	pf_state_peer_hton(&st->dst, &up->dst);
	up->creatorid = st->creatorid;
	up->timeout = st->timeout;
}

static void
pfsync_out_del(struct pf_state *st, void *buf)
{
	struct pfsync_del_c *dp = buf;

	dp->id = st->id;
	dp->creatorid = st->creatorid;
	st->state_flags |= PFSTATE_NOSYNC;
}

static void
pfsync_drop(struct pfsync_softc *sc)
{
	struct pf_state *st, *next;
	struct pfsync_upd_req_item *ur;
	int q;

	for (q = 0; q < PFSYNC_S_COUNT; q++) {
		if (TAILQ_EMPTY(&sc->sc_qs[q]))
			continue;

		TAILQ_FOREACH_SAFE(st, &sc->sc_qs[q], sync_list, next) {
			KASSERT(st->sync_state == q,
				("%s: st->sync_state == q",
					__func__));
			st->sync_state = PFSYNC_S_NONE;
			pf_release_state(st);
		}
		TAILQ_INIT(&sc->sc_qs[q]);
	}

	while ((ur = TAILQ_FIRST(&sc->sc_upd_req_list)) != NULL) {
		TAILQ_REMOVE(&sc->sc_upd_req_list, ur, ur_entry);
		free(ur, M_PFSYNC);
	}

	sc->sc_plus = NULL;
	sc->sc_len = PFSYNC_MINPKT;
}

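/*
 * Build a single pfsync datagram out of everything queued on the softc
 * and hand it to the sync interface (and to BPF).  Called with sc_mtx
 * held.
 */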
static void
pfsync_sendout(int schedswi)
{
	struct pfsync_softc *sc = V_pfsyncif;
	struct ifnet *ifp = sc->sc_ifp;
	struct mbuf *m;
	struct ip *ip;
	struct pfsync_header *ph;
	struct pfsync_subheader *subh;
	struct pf_state *st;
	struct pfsync_upd_req_item *ur;
	int offset;
	int q, count = 0;

	KASSERT(sc != NULL, ("%s: null sc", __func__));
	KASSERT(sc->sc_len > PFSYNC_MINPKT,
	    ("%s: sc_len %zu", __func__, sc->sc_len));
	PFSYNC_LOCK_ASSERT(sc);

	if (ifp->if_bpf == NULL && sc->sc_sync_if == NULL) {
		pfsync_drop(sc);
		return;
	}

	m = m_get2(M_NOWAIT, MT_DATA, M_PKTHDR, max_linkhdr + sc->sc_len);
	if (m == NULL) {
		sc->sc_ifp->if_oerrors++;
		V_pfsyncstats.pfsyncs_onomem++;
		return;
	}
	m->m_data += max_linkhdr;
	m->m_len = m->m_pkthdr.len = sc->sc_len;

	/* build the ip header */
	ip = (struct ip *)m->m_data;
	bcopy(&sc->sc_template, ip, sizeof(*ip));
	offset = sizeof(*ip);

	ip->ip_len = m->m_pkthdr.len;
	ip->ip_id = htons(ip_randomid());

	/* build the pfsync header */
	ph = (struct pfsync_header *)(m->m_data + offset);
	bzero(ph, sizeof(*ph));
	offset += sizeof(*ph);

	ph->version = PFSYNC_VERSION;
	ph->len = htons(sc->sc_len - sizeof(*ip));
	bcopy(V_pf_status.pf_chksum, ph->pfcksum, PF_MD5_DIGEST_LENGTH);

	/* walk the queues */
	for (q = 0; q < PFSYNC_S_COUNT; q++) {
		if (TAILQ_EMPTY(&sc->sc_qs[q]))
			continue;

		subh = (struct pfsync_subheader *)(m->m_data + offset);
		offset += sizeof(*subh);

		count = 0;
		TAILQ_FOREACH(st, &sc->sc_qs[q], sync_list) {
			KASSERT(st->sync_state == q,
				("%s: st->sync_state == q",
					__func__));
			/*
			 * XXXGL: some of the write methods do unlocked
			 * reads of state data :(
			 */
			pfsync_qs[q].write(st, m->m_data + offset);
			offset += pfsync_qs[q].len;
			st->sync_state = PFSYNC_S_NONE;
			pf_release_state(st);
			count++;
		}
		TAILQ_INIT(&sc->sc_qs[q]);

		bzero(subh, sizeof(*subh));
		subh->action = pfsync_qs[q].action;
		subh->count = htons(count);
		V_pfsyncstats.pfsyncs_oacts[pfsync_qs[q].action] += count;
	}

	if (!TAILQ_EMPTY(&sc->sc_upd_req_list)) {
		subh = (struct pfsync_subheader *)(m->m_data + offset);
		offset += sizeof(*subh);

		count = 0;
		while ((ur = TAILQ_FIRST(&sc->sc_upd_req_list)) != NULL) {
			TAILQ_REMOVE(&sc->sc_upd_req_list, ur, ur_entry);

			bcopy(&ur->ur_msg, m->m_data + offset,
			    sizeof(ur->ur_msg));
			offset += sizeof(ur->ur_msg);
			free(ur, M_PFSYNC);
			count++;
		}

		bzero(subh, sizeof(*subh));
		subh->action = PFSYNC_ACT_UPD_REQ;
		subh->count = htons(count);
		V_pfsyncstats.pfsyncs_oacts[PFSYNC_ACT_UPD_REQ] += count;
	}

	/* has someone built a custom region for us to add? */
	if (sc->sc_plus != NULL) {
		bcopy(sc->sc_plus, m->m_data + offset, sc->sc_pluslen);
		offset += sc->sc_pluslen;

		sc->sc_plus = NULL;
	}

	subh = (struct pfsync_subheader *)(m->m_data + offset);
	offset += sizeof(*subh);

	bzero(subh, sizeof(*subh));
	subh->action = PFSYNC_ACT_EOF;
	subh->count = htons(1);
	V_pfsyncstats.pfsyncs_oacts[PFSYNC_ACT_EOF]++;

	/* XXX write checksum in EOF here */

	/* we're done, let's put it on the wire */
	if (ifp->if_bpf) {
		m->m_data += sizeof(*ip);
		m->m_len = m->m_pkthdr.len = sc->sc_len - sizeof(*ip);
		BPF_MTAP(ifp, m);
		m->m_data -= sizeof(*ip);
		m->m_len = m->m_pkthdr.len = sc->sc_len;
	}

	if (sc->sc_sync_if == NULL) {
		sc->sc_len = PFSYNC_MINPKT;
		m_freem(m);
		return;
	}

	sc->sc_ifp->if_opackets++;
	sc->sc_ifp->if_obytes += m->m_pkthdr.len;
	sc->sc_len = PFSYNC_MINPKT;

	if (!_IF_QFULL(&sc->sc_ifp->if_snd))
		_IF_ENQUEUE(&sc->sc_ifp->if_snd, m);
	else {
		m_freem(m);
		sc->sc_ifp->if_snd.ifq_drops++;
	}
	if (schedswi)
		swi_sched(V_pfsync_swi_cookie, 0);
}

static void
pfsync_insert_state(struct pf_state *st)
{
	struct pfsync_softc *sc = V_pfsyncif;

	if (st->state_flags & PFSTATE_NOSYNC)
		return;

	if ((st->rule.ptr->rule_flag & PFRULE_NOSYNC) ||
	    st->key[PF_SK_WIRE]->proto == IPPROTO_PFSYNC) {
		st->state_flags |= PFSTATE_NOSYNC;
		return;
	}

	KASSERT(st->sync_state == PFSYNC_S_NONE,
		("%s: st->sync_state == PFSYNC_S_NONE", __func__));

	PFSYNC_LOCK(sc);
	if (sc->sc_len == PFSYNC_MINPKT)
		callout_reset(&sc->sc_tmo, 1 * hz, pfsync_timeout, V_pfsyncif);

	pfsync_q_ins(st, PFSYNC_S_INS);
	PFSYNC_UNLOCK(sc);

	st->sync_updates = 0;
}

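/*
 * Defer transmission of the packet that created this state until the
 * state insertion has been acknowledged by a peer, or until the
 * deferral times out.
 */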
static int
pfsync_defer(struct pf_state *st, struct mbuf *m)
{
	struct pfsync_softc *sc = V_pfsyncif;
	struct pfsync_deferral *pd;

	if (m->m_flags & (M_BCAST|M_MCAST))
		return (0);

	if (sc == NULL)
		return (0);

	PFSYNC_LOCK(sc);

	if (!(sc->sc_ifp->if_drv_flags & IFF_DRV_RUNNING) ||
	    !(sc->sc_flags & PFSYNCF_DEFER)) {
		PFSYNC_UNLOCK(sc);
		return (0);
	}

	if (sc->sc_deferred >= 128)
		pfsync_undefer(TAILQ_FIRST(&sc->sc_deferrals), 0);

	pd = malloc(sizeof(*pd), M_PFSYNC, M_NOWAIT);
	if (pd == NULL) {
		PFSYNC_UNLOCK(sc);
		return (0);
	}
	sc->sc_deferred++;

	m->m_flags |= M_SKIP_FIREWALL;
	st->state_flags |= PFSTATE_ACK;

	pd->pd_sc = sc;
	pd->pd_refs = 0;
	pd->pd_st = st;
	pf_ref_state(st);
	pd->pd_m = m;

	TAILQ_INSERT_TAIL(&sc->sc_deferrals, pd, pd_entry);
	callout_init_mtx(&pd->pd_tmo, &sc->sc_mtx, CALLOUT_RETURNUNLOCKED);
	callout_reset(&pd->pd_tmo, 10, pfsync_defer_tmo, pd);

	pfsync_push(sc);

	return (1);
}

static void
pfsync_undefer(struct pfsync_deferral *pd, int drop)
{
	struct pfsync_softc *sc = pd->pd_sc;
	struct mbuf *m = pd->pd_m;
	struct pf_state *st = pd->pd_st;

	PFSYNC_LOCK_ASSERT(sc);

	TAILQ_REMOVE(&sc->sc_deferrals, pd, pd_entry);
	sc->sc_deferred--;
	pd->pd_st->state_flags &= ~PFSTATE_ACK;	/* XXX: locking! */
	free(pd, M_PFSYNC);
	pf_release_state(st);

	if (drop)
		m_freem(m);
	else {
		_IF_ENQUEUE(&sc->sc_ifp->if_snd, m);
		pfsync_push(sc);
	}
}

static void
pfsync_defer_tmo(void *arg)
{
	struct pfsync_deferral *pd = arg;
	struct pfsync_softc *sc = pd->pd_sc;
	struct mbuf *m = pd->pd_m;
	struct pf_state *st = pd->pd_st;

	PFSYNC_LOCK_ASSERT(sc);

	CURVNET_SET(m->m_pkthdr.rcvif->if_vnet);

	TAILQ_REMOVE(&sc->sc_deferrals, pd, pd_entry);
	sc->sc_deferred--;
	pd->pd_st->state_flags &= ~PFSTATE_ACK;	/* XXX: locking! */
	if (pd->pd_refs == 0)
		free(pd, M_PFSYNC);
	PFSYNC_UNLOCK(sc);

	ip_output(m, NULL, NULL, 0, NULL, NULL);

	pf_release_state(st);

	CURVNET_RESTORE();
}

static void
pfsync_undefer_state(struct pf_state *st, int drop)
{
	struct pfsync_softc *sc = V_pfsyncif;
	struct pfsync_deferral *pd;

	PFSYNC_LOCK_ASSERT(sc);

	TAILQ_FOREACH(pd, &sc->sc_deferrals, pd_entry) {
		if (pd->pd_st == st) {
			if (callout_stop(&pd->pd_tmo))
				pfsync_undefer(pd, drop);
			return;
		}
	}

	panic("%s: unable to find deferred state", __func__);
}

static void
pfsync_update_state(struct pf_state *st)
{
	struct pfsync_softc *sc = V_pfsyncif;
	int sync = 0;

	PF_STATE_LOCK_ASSERT(st);
	PFSYNC_LOCK(sc);

	if (st->state_flags & PFSTATE_ACK)
		pfsync_undefer_state(st, 0);
	if (st->state_flags & PFSTATE_NOSYNC) {
		if (st->sync_state != PFSYNC_S_NONE)
			pfsync_q_del(st);
		PFSYNC_UNLOCK(sc);
		return;
	}

	if (sc->sc_len == PFSYNC_MINPKT)
		callout_reset(&sc->sc_tmo, 1 * hz, pfsync_timeout, V_pfsyncif);

	switch (st->sync_state) {
	case PFSYNC_S_UPD_C:
	case PFSYNC_S_UPD:
	case PFSYNC_S_INS:
		/* we're already handling it */

		if (st->key[PF_SK_WIRE]->proto == IPPROTO_TCP) {
			st->sync_updates++;
			if (st->sync_updates >= sc->sc_maxupdates)
				sync = 1;
		}
		break;

	case PFSYNC_S_IACK:
		pfsync_q_del(st);
		/* FALLTHROUGH */
	case PFSYNC_S_NONE:
		pfsync_q_ins(st, PFSYNC_S_UPD_C);
		st->sync_updates = 0;
		break;

	default:
		panic("%s: unexpected sync state %d", __func__, st->sync_state);
	}

	if (sync || (time_uptime - st->pfsync_time) < 2)
		pfsync_push(sc);

	PFSYNC_UNLOCK(sc);
}

static void
pfsync_request_update(u_int32_t creatorid, u_int64_t id)
{
	struct pfsync_softc *sc = V_pfsyncif;
	struct pfsync_upd_req_item *item;
	size_t nlen = sizeof(struct pfsync_upd_req);

	PFSYNC_LOCK_ASSERT(sc);

	/*
	 * This code does a bit to prevent multiple update requests for the
	 * same state being generated.  It searches the current subheader
	 * queue, but it doesn't look into the queue of already-packed
	 * datagrams.
	 */
	TAILQ_FOREACH(item, &sc->sc_upd_req_list, ur_entry)
		if (item->ur_msg.id == id &&
		    item->ur_msg.creatorid == creatorid)
			return;

	item = malloc(sizeof(*item), M_PFSYNC, M_NOWAIT);
	if (item == NULL)
		return; /* XXX stats */

	item->ur_msg.id = id;
	item->ur_msg.creatorid = creatorid;

	if (TAILQ_EMPTY(&sc->sc_upd_req_list))
		nlen += sizeof(struct pfsync_subheader);

	if (sc->sc_len + nlen > sc->sc_ifp->if_mtu) {
		pfsync_sendout(1);

		nlen = sizeof(struct pfsync_subheader) +
		    sizeof(struct pfsync_upd_req);
	}

	TAILQ_INSERT_TAIL(&sc->sc_upd_req_list, item, ur_entry);
	sc->sc_len += nlen;
}

static void
pfsync_update_state_req(struct pf_state *st)
{
	struct pfsync_softc *sc = V_pfsyncif;

	PF_STATE_LOCK_ASSERT(st);
	PFSYNC_LOCK(sc);

	if (st->state_flags & PFSTATE_NOSYNC) {
		if (st->sync_state != PFSYNC_S_NONE)
			pfsync_q_del(st);
		PFSYNC_UNLOCK(sc);
		return;
	}

	switch (st->sync_state) {
	case PFSYNC_S_UPD_C:
	case PFSYNC_S_IACK:
		pfsync_q_del(st);
		/* FALLTHROUGH */
	case PFSYNC_S_NONE:
		pfsync_q_ins(st, PFSYNC_S_UPD);
		pfsync_push(sc);
		break;

	case PFSYNC_S_INS:
	case PFSYNC_S_UPD:
	case PFSYNC_S_DEL:
		/* we're already handling it */
		break;

	default:
		panic("%s: unexpected sync state %d", __func__, st->sync_state);
	}

	PFSYNC_UNLOCK(sc);
}

static void
pfsync_delete_state(struct pf_state *st)
{
	struct pfsync_softc *sc = V_pfsyncif;

	PFSYNC_LOCK(sc);
	if (st->state_flags & PFSTATE_ACK)
		pfsync_undefer_state(st, 1);
	if (st->state_flags & PFSTATE_NOSYNC) {
		if (st->sync_state != PFSYNC_S_NONE)
			pfsync_q_del(st);
		PFSYNC_UNLOCK(sc);
		return;
	}

	if (sc->sc_len == PFSYNC_MINPKT)
		callout_reset(&sc->sc_tmo, 1 * hz, pfsync_timeout, V_pfsyncif);

	switch (st->sync_state) {
	case PFSYNC_S_INS:
		/* We never got to tell the world so just forget about it. */
		pfsync_q_del(st);
		break;

	case PFSYNC_S_UPD_C:
	case PFSYNC_S_UPD:
	case PFSYNC_S_IACK:
		pfsync_q_del(st);
		/* FALLTHROUGH to putting it on the del list */

	case PFSYNC_S_NONE:
		pfsync_q_ins(st, PFSYNC_S_DEL);
		break;

	default:
		panic("%s: unexpected sync state %d", __func__, st->sync_state);
	}
	PFSYNC_UNLOCK(sc);
}

static void
pfsync_clear_states(u_int32_t creatorid, const char *ifname)
{
	struct pfsync_softc *sc = V_pfsyncif;
	struct {
		struct pfsync_subheader subh;
		struct pfsync_clr clr;
	} __packed r;

	bzero(&r, sizeof(r));

	r.subh.action = PFSYNC_ACT_CLR;
	r.subh.count = htons(1);
	V_pfsyncstats.pfsyncs_oacts[PFSYNC_ACT_CLR]++;

	strlcpy(r.clr.ifname, ifname, sizeof(r.clr.ifname));
	r.clr.creatorid = creatorid;

	PFSYNC_LOCK(sc);
	pfsync_send_plus(&r, sizeof(r));
	PFSYNC_UNLOCK(sc);
}

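/*
 * Queue a state for transmission.  sc_len tracks the size of the next
 * datagram; the first entry placed on a queue also accounts for that
 * queue's subheader, and a full datagram is flushed out first.
 */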
1965static void
1966pfsync_q_ins(struct pf_state *st, int q)
1967{
1968	struct pfsync_softc *sc = V_pfsyncif;
1969	size_t nlen = pfsync_qs[q].len;
1970
1971	PFSYNC_LOCK_ASSERT(sc);
1972
1973	KASSERT(st->sync_state == PFSYNC_S_NONE,
1974		("%s: st->sync_state == PFSYNC_S_NONE", __func__));
1975	KASSERT(sc->sc_len >= PFSYNC_MINPKT, ("pfsync pkt len is too low %zu",
1976	    sc->sc_len));
1977
1978	if (TAILQ_EMPTY(&sc->sc_qs[q]))
1979		nlen += sizeof(struct pfsync_subheader);
1980
1981	if (sc->sc_len + nlen > sc->sc_ifp->if_mtu) {
1982		pfsync_sendout(1);
1983
1984		nlen = sizeof(struct pfsync_subheader) + pfsync_qs[q].len;
1985	}
1986
1987	sc->sc_len += nlen;
1988	TAILQ_INSERT_TAIL(&sc->sc_qs[q], st, sync_list);
1989	st->sync_state = q;
1990	pf_ref_state(st);
1991}
1992
static void
pfsync_q_del(struct pf_state *st)
{
	struct pfsync_softc *sc = V_pfsyncif;
	int q = st->sync_state;

	PFSYNC_LOCK_ASSERT(sc);
	KASSERT(st->sync_state != PFSYNC_S_NONE,
	    ("%s: st->sync_state == PFSYNC_S_NONE", __func__));

	sc->sc_len -= pfsync_qs[q].len;
	TAILQ_REMOVE(&sc->sc_qs[q], st, sync_list);
	st->sync_state = PFSYNC_S_NONE;
	pf_release_state(st);

	if (TAILQ_EMPTY(&sc->sc_qs[q]))
		sc->sc_len -= sizeof(struct pfsync_subheader);
}

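/*
 * A peer has requested a bulk update: record the request time, rewind
 * the hash walk to the beginning and schedule the first
 * pfsync_bulk_update() pass.
 */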
static void
pfsync_bulk_start(void)
{
	struct pfsync_softc *sc = V_pfsyncif;

	if (V_pf_status.debug >= PF_DEBUG_MISC)
		printf("pfsync: received bulk update request\n");

	PFSYNC_BLOCK(sc);

	sc->sc_ureq_received = time_uptime;
	sc->sc_bulk_hashid = 0;
	sc->sc_bulk_stateid = 0;
	pfsync_bulk_status(PFSYNC_BUS_START);
	callout_reset(&sc->sc_bulk_tmo, 1, pfsync_bulk_update, sc);
	PFSYNC_BUNLOCK(sc);
}

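/*
 * Callout that walks the state id hash, queueing a full update for
 * every state that is not already queued and has not been synced since
 * the bulk request was received.  When the pending packet fills up,
 * the current position is saved and the callout rescheduled, so the
 * walk resumes where it left off.
 */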
static void
pfsync_bulk_update(void *arg)
{
	struct pfsync_softc *sc = arg;
	struct pf_state *s;
	int i, sent = 0;

	PFSYNC_BLOCK_ASSERT(sc);
	CURVNET_SET(sc->sc_ifp->if_vnet);

	/*
	 * Start with the last state from the previous invocation.
	 * It may have gone away; in that case, start from the
	 * saved hash slot.
	 */
	s = pf_find_state_byid(sc->sc_bulk_stateid, sc->sc_bulk_creatorid);

	if (s != NULL)
		i = PF_IDHASH(s);
	else
		i = sc->sc_bulk_hashid;

	for (; i <= V_pf_hashmask; i++) {
		struct pf_idhash *ih = &V_pf_idhash[i];

		if (s != NULL)
			PF_HASHROW_ASSERT(ih);
		else {
			PF_HASHROW_LOCK(ih);
			s = LIST_FIRST(&ih->states);
		}

		for (; s; s = LIST_NEXT(s, entry)) {
			if (sent > 1 && (sc->sc_ifp->if_mtu - sc->sc_len) <
			    sizeof(struct pfsync_state)) {
				/* We've filled a packet. */
				sc->sc_bulk_hashid = i;
				sc->sc_bulk_stateid = s->id;
				sc->sc_bulk_creatorid = s->creatorid;
				PF_HASHROW_UNLOCK(ih);
				callout_reset(&sc->sc_bulk_tmo, 1,
				    pfsync_bulk_update, sc);
				goto full;
			}

			if (s->sync_state == PFSYNC_S_NONE &&
			    s->timeout < PFTM_MAX &&
			    s->pfsync_time <= sc->sc_ureq_received) {
				PFSYNC_LOCK(sc);
				pfsync_update_state_req(s);
				PFSYNC_UNLOCK(sc);
				sent++;
			}
		}
		PF_HASHROW_UNLOCK(ih);
	}

	/* We're done. */
	pfsync_bulk_status(PFSYNC_BUS_END);

full:
	CURVNET_RESTORE();
}

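/*
 * Send a bulk update status (PFSYNC_ACT_BUS) message announcing the
 * start or the end of a bulk transfer.
 */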
static void
pfsync_bulk_status(u_int8_t status)
{
	struct {
		struct pfsync_subheader subh;
		struct pfsync_bus bus;
	} __packed r;

	struct pfsync_softc *sc = V_pfsyncif;

	bzero(&r, sizeof(r));

	r.subh.action = PFSYNC_ACT_BUS;
	r.subh.count = htons(1);
	V_pfsyncstats.pfsyncs_oacts[PFSYNC_ACT_BUS]++;

	r.bus.creatorid = V_pf_status.hostid;
	r.bus.endtime = htonl(time_uptime - sc->sc_ureq_received);
	r.bus.status = status;

	PFSYNC_LOCK(sc);
	pfsync_send_plus(&r, sizeof(r));
	PFSYNC_UNLOCK(sc);
}

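/*
 * Callout that fires when a bulk update we requested has not completed
 * in time.  Retry up to PFSYNC_MAX_BULKTRIES times; after that, give
 * up and declare ourselves good anyway, lifting the carp demotion.
 */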
static void
pfsync_bulk_fail(void *arg)
{
	struct pfsync_softc *sc = arg;

	CURVNET_SET(sc->sc_ifp->if_vnet);

	PFSYNC_BLOCK_ASSERT(sc);

	if (sc->sc_bulk_tries++ < PFSYNC_MAX_BULKTRIES) {
		/* Try again. */
		callout_reset(&sc->sc_bulkfail_tmo, 5 * hz,
		    pfsync_bulk_fail, V_pfsyncif);
		PFSYNC_LOCK(sc);
		pfsync_request_update(0, 0);
		PFSYNC_UNLOCK(sc);
	} else {
		/* Pretend the transfer was OK. */
		sc->sc_ureq_sent = 0;
		sc->sc_bulk_tries = 0;
		PFSYNC_LOCK(sc);
		if (!(sc->sc_flags & PFSYNCF_OK) && carp_demote_adj_p)
			(*carp_demote_adj_p)(-V_pfsync_carp_adj,
			    "pfsync bulk fail");
		sc->sc_flags |= PFSYNCF_OK;
		PFSYNC_UNLOCK(sc);
		if (V_pf_status.debug >= PF_DEBUG_MISC)
			printf("pfsync: failed to receive bulk update\n");
	}

	CURVNET_RESTORE();
}

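/*
 * Tack an extra, preformatted chunk onto the pending packet and flush
 * it out immediately.
 */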
static void
pfsync_send_plus(void *plus, size_t pluslen)
{
	struct pfsync_softc *sc = V_pfsyncif;

	PFSYNC_LOCK_ASSERT(sc);

	if (sc->sc_len + pluslen > sc->sc_ifp->if_mtu)
		pfsync_sendout(1);

	sc->sc_plus = plus;
	sc->sc_len += (sc->sc_pluslen = pluslen);

	pfsync_sendout(1);
}

static void
pfsync_timeout(void *arg)
{
	struct pfsync_softc *sc = arg;

	CURVNET_SET(sc->sc_ifp->if_vnet);
	PFSYNC_LOCK(sc);
	pfsync_push(sc);
	PFSYNC_UNLOCK(sc);
	CURVNET_RESTORE();
}

static void
pfsync_push(struct pfsync_softc *sc)
{

	PFSYNC_LOCK_ASSERT(sc);

	sc->sc_flags |= PFSYNCF_PUSH;
	swi_sched(V_pfsync_swi_cookie, 0);
}

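/*
 * Software interrupt handler: flush the pending packet if a push was
 * requested, then transmit everything queued on the interface send
 * queue, sending deferred packets through the normal IP output path
 * and pfsync packets as raw IP to the sync peer or multicast group.
 */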
static void
pfsyncintr(void *arg)
{
	struct pfsync_softc *sc = arg;
	struct mbuf *m, *n;

	CURVNET_SET(sc->sc_ifp->if_vnet);

	PFSYNC_LOCK(sc);
	if ((sc->sc_flags & PFSYNCF_PUSH) && sc->sc_len > PFSYNC_MINPKT) {
		pfsync_sendout(0);
		sc->sc_flags &= ~PFSYNCF_PUSH;
	}
	_IF_DEQUEUE_ALL(&sc->sc_ifp->if_snd, m);
	PFSYNC_UNLOCK(sc);

	for (; m != NULL; m = n) {
		n = m->m_nextpkt;
		m->m_nextpkt = NULL;

		/*
		 * We distinguish between a deferral packet and our
		 * own pfsync packet based on the M_SKIP_FIREWALL
		 * flag.  XXX: this is a kludge.
		 */
		if (m->m_flags & M_SKIP_FIREWALL)
			ip_output(m, NULL, NULL, 0, NULL, NULL);
		else if (ip_output(m, NULL, NULL, IP_RAWOUTPUT, &sc->sc_imo,
		    NULL) == 0)
			V_pfsyncstats.pfsyncs_opackets++;
		else
			V_pfsyncstats.pfsyncs_oerrors++;
	}
	CURVNET_RESTORE();
}

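/*
 * Join the pfsync multicast group on the given sync interface and set
 * up the multicast options (TTL, no loopback) used for transmission.
 */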
static int
pfsync_multicast_setup(struct pfsync_softc *sc, struct ifnet *ifp, void *mship)
{
	struct ip_moptions *imo = &sc->sc_imo;
	int error;

	if (!(ifp->if_flags & IFF_MULTICAST))
		return (EADDRNOTAVAIL);

	imo->imo_membership = (struct in_multi **)mship;
	imo->imo_max_memberships = IP_MIN_MEMBERSHIPS;
	imo->imo_multicast_vif = -1;

	if ((error = in_joingroup(ifp, &sc->sc_sync_peer, NULL,
	    &imo->imo_membership[0])) != 0) {
		imo->imo_membership = NULL;
		return (error);
	}
	imo->imo_num_memberships++;
	imo->imo_multicast_ifp = ifp;
	imo->imo_multicast_ttl = PFSYNC_DFLTTL;
	imo->imo_multicast_loop = 0;

	return (0);
}

static void
pfsync_multicast_cleanup(struct pfsync_softc *sc)
{
	struct ip_moptions *imo = &sc->sc_imo;

	in_leavegroup(imo->imo_membership[0], NULL);
	free(imo->imo_membership, M_PFSYNC);
	imo->imo_membership = NULL;
	imo->imo_multicast_ifp = NULL;
}

#ifdef INET
extern struct domain inetdomain;
static struct protosw in_pfsync_protosw = {
	.pr_type =		SOCK_RAW,
	.pr_domain =		&inetdomain,
	.pr_protocol =		IPPROTO_PFSYNC,
	.pr_flags =		PR_ATOMIC|PR_ADDR,
	.pr_input =		pfsync_input,
	.pr_output =		(pr_output_t *)rip_output,
	.pr_ctloutput =		rip_ctloutput,
	.pr_usrreqs =		&rip_usrreqs
};
#endif

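/*
 * Hook pfsync into pf(4): fill in the function pointers that pf calls
 * when states are created, updated or deleted.  The pointers are
 * swapped under the pf rules write lock.
 */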
static void
pfsync_pointers_init(void)
{

	PF_RULES_WLOCK();
	pfsync_state_import_ptr = pfsync_state_import;
	pfsync_insert_state_ptr = pfsync_insert_state;
	pfsync_update_state_ptr = pfsync_update_state;
	pfsync_delete_state_ptr = pfsync_delete_state;
	pfsync_clear_states_ptr = pfsync_clear_states;
	pfsync_defer_ptr = pfsync_defer;
	PF_RULES_WUNLOCK();
}

static void
pfsync_pointers_uninit(void)
{

	PF_RULES_WLOCK();
	pfsync_state_import_ptr = NULL;
	pfsync_insert_state_ptr = NULL;
	pfsync_update_state_ptr = NULL;
	pfsync_delete_state_ptr = NULL;
	pfsync_clear_states_ptr = NULL;
	pfsync_defer_ptr = NULL;
	PF_RULES_WUNLOCK();
}

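/*
 * Module load: attach the interface cloner and set up a software
 * interrupt in every vnet, then register the pfsync protocol with the
 * IP stack and hook into pf.  On failure, tear down whatever was set
 * up in the vnets already visited.
 */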
static int
pfsync_init(void)
{
	VNET_ITERATOR_DECL(vnet_iter);
	int error = 0;

	VNET_LIST_RLOCK();
	VNET_FOREACH(vnet_iter) {
		CURVNET_SET(vnet_iter);
		V_pfsync_cloner = pfsync_cloner;
		V_pfsync_cloner_data = pfsync_cloner_data;
		V_pfsync_cloner.ifc_data = &V_pfsync_cloner_data;
		if_clone_attach(&V_pfsync_cloner);
		error = swi_add(NULL, "pfsync", pfsyncintr, V_pfsyncif,
		    SWI_NET, INTR_MPSAFE, &V_pfsync_swi_cookie);
		CURVNET_RESTORE();
		if (error)
			goto fail_locked;
	}
	VNET_LIST_RUNLOCK();
#ifdef INET
	error = pf_proto_register(PF_INET, &in_pfsync_protosw);
	if (error)
		goto fail;
	error = ipproto_register(IPPROTO_PFSYNC);
	if (error) {
		pf_proto_unregister(PF_INET, IPPROTO_PFSYNC, SOCK_RAW);
		goto fail;
	}
#endif
	pfsync_pointers_init();

	return (0);

fail:
	VNET_LIST_RLOCK();
fail_locked:
	VNET_FOREACH(vnet_iter) {
		CURVNET_SET(vnet_iter);
		if (V_pfsync_swi_cookie) {
			swi_remove(V_pfsync_swi_cookie);
			if_clone_detach(&V_pfsync_cloner);
		}
		CURVNET_RESTORE();
	}
	VNET_LIST_RUNLOCK();

	return (error);
}

static void
pfsync_uninit(void)
{
	VNET_ITERATOR_DECL(vnet_iter);

	pfsync_pointers_uninit();

	ipproto_unregister(IPPROTO_PFSYNC);
	pf_proto_unregister(PF_INET, IPPROTO_PFSYNC, SOCK_RAW);
	VNET_LIST_RLOCK();
	VNET_FOREACH(vnet_iter) {
		CURVNET_SET(vnet_iter);
		if_clone_detach(&V_pfsync_cloner);
		swi_remove(V_pfsync_swi_cookie);
		CURVNET_RESTORE();
	}
	VNET_LIST_RUNLOCK();
}

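/*
 * Module event handler.  MOD_QUIESCE is refused (EBUSY), so a normal
 * kldunload fails; MOD_UNLOAD still runs on a forced unload.
 */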
static int
pfsync_modevent(module_t mod, int type, void *data)
{
	int error = 0;

	switch (type) {
	case MOD_LOAD:
		error = pfsync_init();
		break;
	case MOD_QUIESCE:
		/*
		 * The module should not be unloaded due to race
		 * conditions, so refuse to quiesce.
		 */
		error = EBUSY;
		break;
	case MOD_UNLOAD:
		pfsync_uninit();
		break;
	default:
		error = EINVAL;
		break;
	}

	return (error);
}

static moduledata_t pfsync_mod = {
	"pfsync",
	pfsync_modevent,
	0
};

#define PFSYNC_MODVER 1

DECLARE_MODULE(pfsync, pfsync_mod, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY);
MODULE_VERSION(pfsync, PFSYNC_MODVER);
MODULE_DEPEND(pfsync, pf, PF_MODVER, PF_MODVER, PF_MODVER);