/*-
 * Copyright (c) 2002 Michael Shalayeff
 * Copyright (c) 2012 Gleb Smirnoff <glebius@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR OR HIS RELATIVES BE LIABLE FOR ANY DIRECT,
 * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF MIND, USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
 * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
 * THE POSSIBILITY OF SUCH DAMAGE.
 */

/*-
 * Copyright (c) 2009 David Gwynne <dlg@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

/*
 * $OpenBSD: if_pfsync.c,v 1.110 2009/02/24 05:39:19 dlg Exp $
 *
 * Revisions picked from OpenBSD after revision 1.110 import:
 * 1.119 - don't m_copydata() beyond the len of mbuf in pfsync_input()
 * 1.118, 1.124, 1.148, 1.149, 1.151, 1.171 - fixes to bulk updates
 * 1.120, 1.175 - use monotonic time_uptime
 * 1.122 - reduce number of updates for non-TCP sessions
 * 1.125, 1.127 - rewrite merge or stale processing
 * 1.128 - cleanups
 * 1.146 - bzero() mbuf before sparsely filling it with data
 * 1.170 - SIOCSIFMTU checks
 * 1.126, 1.142 - deferred packets processing
 * 1.173 - correct expire time processing
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/10/sys/netpfil/pf/if_pfsync.c 314667 2017-03-04 13:03:31Z avg $");

#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_pf.h"

#include <sys/param.h>
#include <sys/bus.h>
#include <sys/endian.h>
#include <sys/interrupt.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mbuf.h>
#include <sys/module.h>
#include <sys/mutex.h>
#include <sys/priv.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <sys/sysctl.h>

#include <net/bpf.h>
#include <net/if.h>
#include <net/if_clone.h>
#include <net/if_types.h>
#include <net/pfvar.h>
#include <net/if_pfsync.h>

#include <netinet/if_ether.h>
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/ip.h>
#include <netinet/ip_carp.h>
#include <netinet/ip_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>

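/*
 * The smallest datagram pfsync will send: an IP header, the pfsync header
 * and a single subheader.  sc_len is initialized to this value and grows
 * as states, update requests and the EOF subheader are accounted for, so
 * sc_len > PFSYNC_MINPKT means there is data pending transmission.
 */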
#define PFSYNC_MINPKT ( \
	sizeof(struct ip) + \
	sizeof(struct pfsync_header) + \
	sizeof(struct pfsync_subheader) )

struct pfsync_pkt {
	struct ip *ip;
	struct in_addr src;
	u_int8_t flags;
};

static int	pfsync_upd_tcp(struct pf_state *, struct pfsync_state_peer *,
		    struct pfsync_state_peer *);
static int	pfsync_in_clr(struct pfsync_pkt *, struct mbuf *, int, int);
static int	pfsync_in_ins(struct pfsync_pkt *, struct mbuf *, int, int);
static int	pfsync_in_iack(struct pfsync_pkt *, struct mbuf *, int, int);
static int	pfsync_in_upd(struct pfsync_pkt *, struct mbuf *, int, int);
static int	pfsync_in_upd_c(struct pfsync_pkt *, struct mbuf *, int, int);
static int	pfsync_in_ureq(struct pfsync_pkt *, struct mbuf *, int, int);
static int	pfsync_in_del(struct pfsync_pkt *, struct mbuf *, int, int);
static int	pfsync_in_del_c(struct pfsync_pkt *, struct mbuf *, int, int);
static int	pfsync_in_bus(struct pfsync_pkt *, struct mbuf *, int, int);
static int	pfsync_in_tdb(struct pfsync_pkt *, struct mbuf *, int, int);
static int	pfsync_in_eof(struct pfsync_pkt *, struct mbuf *, int, int);
static int	pfsync_in_error(struct pfsync_pkt *, struct mbuf *, int, int);

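/*
 * Incoming subheaders are dispatched through this table, indexed by
 * subh.action in pfsync_input().  Each handler returns the number of
 * bytes it consumed, or -1 if it took ownership of (and freed) the mbuf.
 */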
static int (*pfsync_acts[])(struct pfsync_pkt *, struct mbuf *, int, int) = {
	pfsync_in_clr,			/* PFSYNC_ACT_CLR */
	pfsync_in_ins,			/* PFSYNC_ACT_INS */
	pfsync_in_iack,			/* PFSYNC_ACT_INS_ACK */
	pfsync_in_upd,			/* PFSYNC_ACT_UPD */
	pfsync_in_upd_c,		/* PFSYNC_ACT_UPD_C */
	pfsync_in_ureq,			/* PFSYNC_ACT_UPD_REQ */
	pfsync_in_del,			/* PFSYNC_ACT_DEL */
	pfsync_in_del_c,		/* PFSYNC_ACT_DEL_C */
	pfsync_in_error,		/* PFSYNC_ACT_INS_F */
	pfsync_in_error,		/* PFSYNC_ACT_DEL_F */
	pfsync_in_bus,			/* PFSYNC_ACT_BUS */
	pfsync_in_tdb,			/* PFSYNC_ACT_TDB */
	pfsync_in_eof			/* PFSYNC_ACT_EOF */
};

struct pfsync_q {
	void		(*write)(struct pf_state *, void *);
	size_t		len;
	u_int8_t	action;
};

/* we have one of these for every PFSYNC_S_ */
static void	pfsync_out_state(struct pf_state *, void *);
static void	pfsync_out_iack(struct pf_state *, void *);
static void	pfsync_out_upd_c(struct pf_state *, void *);
static void	pfsync_out_del(struct pf_state *, void *);

static struct pfsync_q pfsync_qs[] = {
	{ pfsync_out_state, sizeof(struct pfsync_state),   PFSYNC_ACT_INS },
	{ pfsync_out_iack,  sizeof(struct pfsync_ins_ack), PFSYNC_ACT_INS_ACK },
	{ pfsync_out_state, sizeof(struct pfsync_state),   PFSYNC_ACT_UPD },
	{ pfsync_out_upd_c, sizeof(struct pfsync_upd_c),   PFSYNC_ACT_UPD_C },
	{ pfsync_out_del,   sizeof(struct pfsync_del_c),   PFSYNC_ACT_DEL_C }
};

static void	pfsync_q_ins(struct pf_state *, int);
static void	pfsync_q_del(struct pf_state *);

static void	pfsync_update_state(struct pf_state *);

struct pfsync_upd_req_item {
	TAILQ_ENTRY(pfsync_upd_req_item)	ur_entry;
	struct pfsync_upd_req			ur_msg;
};

struct pfsync_deferral {
	struct pfsync_softc		*pd_sc;
	TAILQ_ENTRY(pfsync_deferral)	pd_entry;
	u_int				pd_refs;
	struct callout			pd_tmo;

	struct pf_state			*pd_st;
	struct mbuf			*pd_m;
};

struct pfsync_softc {
	/* Configuration */
	struct ifnet		*sc_ifp;
	struct ifnet		*sc_sync_if;
	struct ip_moptions	sc_imo;
	struct in_addr		sc_sync_peer;
	uint32_t		sc_flags;
#define	PFSYNCF_OK		0x00000001
#define	PFSYNCF_DEFER		0x00000002
#define	PFSYNCF_PUSH		0x00000004
	uint8_t			sc_maxupdates;
	struct ip		sc_template;
	struct callout		sc_tmo;
	struct mtx		sc_mtx;

	/* Queued data */
	size_t			sc_len;
	TAILQ_HEAD(, pf_state)			sc_qs[PFSYNC_S_COUNT];
	TAILQ_HEAD(, pfsync_upd_req_item)	sc_upd_req_list;
	TAILQ_HEAD(, pfsync_deferral)		sc_deferrals;
	u_int			sc_deferred;
	void			*sc_plus;
	size_t			sc_pluslen;

	/* Bulk update info */
	struct mtx		sc_bulk_mtx;
	uint32_t		sc_ureq_sent;
	int			sc_bulk_tries;
	uint32_t		sc_ureq_received;
	int			sc_bulk_hashid;
	uint64_t		sc_bulk_stateid;
	uint32_t		sc_bulk_creatorid;
	struct callout		sc_bulk_tmo;
	struct callout		sc_bulkfail_tmo;
};

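/*
 * sc_mtx guards the transmit machinery (the "Queued data" section of the
 * softc and the deferral list), while sc_bulk_mtx serializes the bulk
 * update handshake state.
 */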
#define	PFSYNC_LOCK(sc)		mtx_lock(&(sc)->sc_mtx)
#define	PFSYNC_UNLOCK(sc)	mtx_unlock(&(sc)->sc_mtx)
#define	PFSYNC_LOCK_ASSERT(sc)	mtx_assert(&(sc)->sc_mtx, MA_OWNED)

#define	PFSYNC_BLOCK(sc)	mtx_lock(&(sc)->sc_bulk_mtx)
#define	PFSYNC_BUNLOCK(sc)	mtx_unlock(&(sc)->sc_bulk_mtx)
#define	PFSYNC_BLOCK_ASSERT(sc)	mtx_assert(&(sc)->sc_bulk_mtx, MA_OWNED)

static const char pfsyncname[] = "pfsync";
static MALLOC_DEFINE(M_PFSYNC, pfsyncname, "pfsync(4) data");
static VNET_DEFINE(struct pfsync_softc	*, pfsyncif) = NULL;
#define	V_pfsyncif		VNET(pfsyncif)
static VNET_DEFINE(void *, pfsync_swi_cookie) = NULL;
#define	V_pfsync_swi_cookie	VNET(pfsync_swi_cookie)
static VNET_DEFINE(struct pfsyncstats, pfsyncstats);
#define	V_pfsyncstats		VNET(pfsyncstats)
static VNET_DEFINE(int, pfsync_carp_adj) = CARP_MAXSKEW;
#define	V_pfsync_carp_adj	VNET(pfsync_carp_adj)

static void	pfsync_timeout(void *);
static void	pfsync_push(struct pfsync_softc *);
static void	pfsyncintr(void *);
static int	pfsync_multicast_setup(struct pfsync_softc *, struct ifnet *,
		    void *);
static void	pfsync_multicast_cleanup(struct pfsync_softc *);
static void	pfsync_pointers_init(void);
static void	pfsync_pointers_uninit(void);
static int	pfsync_init(void);
static void	pfsync_uninit(void);

SYSCTL_NODE(_net, OID_AUTO, pfsync, CTLFLAG_RW, 0, "PFSYNC");
SYSCTL_VNET_STRUCT(_net_pfsync, OID_AUTO, stats, CTLFLAG_RW,
    &VNET_NAME(pfsyncstats), pfsyncstats,
    "PFSYNC statistics (struct pfsyncstats, net/if_pfsync.h)");
SYSCTL_INT(_net_pfsync, OID_AUTO, carp_demotion_factor, CTLFLAG_RW,
    &VNET_NAME(pfsync_carp_adj), 0, "pfsync's CARP demotion factor adjustment");

static int	pfsync_clone_create(struct if_clone *, int, caddr_t);
static void	pfsync_clone_destroy(struct ifnet *);
static int	pfsync_alloc_scrub_memory(struct pfsync_state_peer *,
		    struct pf_state_peer *);
static int	pfsyncoutput(struct ifnet *, struct mbuf *,
		    const struct sockaddr *, struct route *);
static int	pfsyncioctl(struct ifnet *, u_long, caddr_t);

static int	pfsync_defer(struct pf_state *, struct mbuf *);
static void	pfsync_undefer(struct pfsync_deferral *, int);
static void	pfsync_undefer_state(struct pf_state *, int);
static void	pfsync_defer_tmo(void *);

static void	pfsync_request_update(u_int32_t, u_int64_t);
static void	pfsync_update_state_req(struct pf_state *);

static void	pfsync_drop(struct pfsync_softc *);
static void	pfsync_sendout(int);
static void	pfsync_send_plus(void *, size_t);

static void	pfsync_bulk_start(void);
static void	pfsync_bulk_status(u_int8_t);
static void	pfsync_bulk_update(void *);
static void	pfsync_bulk_fail(void *);

#ifdef IPSEC
static void	pfsync_update_net_tdb(struct pfsync_tdb *);
#endif

#define PFSYNC_MAX_BULKTRIES	12

VNET_DEFINE(struct if_clone *, pfsync_cloner);
#define	V_pfsync_cloner	VNET(pfsync_cloner)

static int
pfsync_clone_create(struct if_clone *ifc, int unit, caddr_t param)
{
	struct pfsync_softc *sc;
	struct ifnet *ifp;
	int q;

	if (unit != 0)
		return (EINVAL);

	sc = malloc(sizeof(struct pfsync_softc), M_PFSYNC, M_WAITOK | M_ZERO);
	sc->sc_flags |= PFSYNCF_OK;

	for (q = 0; q < PFSYNC_S_COUNT; q++)
		TAILQ_INIT(&sc->sc_qs[q]);

	TAILQ_INIT(&sc->sc_upd_req_list);
	TAILQ_INIT(&sc->sc_deferrals);

	sc->sc_len = PFSYNC_MINPKT;
	sc->sc_maxupdates = 128;

	ifp = sc->sc_ifp = if_alloc(IFT_PFSYNC);
	if (ifp == NULL) {
		free(sc, M_PFSYNC);
		return (ENOSPC);
	}
	if_initname(ifp, pfsyncname, unit);
	ifp->if_softc = sc;
	ifp->if_ioctl = pfsyncioctl;
	ifp->if_output = pfsyncoutput;
	ifp->if_type = IFT_PFSYNC;
	ifp->if_snd.ifq_maxlen = ifqmaxlen;
	ifp->if_hdrlen = sizeof(struct pfsync_header);
	ifp->if_mtu = ETHERMTU;
	mtx_init(&sc->sc_mtx, pfsyncname, NULL, MTX_DEF);
	mtx_init(&sc->sc_bulk_mtx, "pfsync bulk", NULL, MTX_DEF);
	callout_init(&sc->sc_tmo, 1);
	callout_init_mtx(&sc->sc_bulk_tmo, &sc->sc_bulk_mtx, 0);
	callout_init_mtx(&sc->sc_bulkfail_tmo, &sc->sc_bulk_mtx, 0);

	if_attach(ifp);

	bpfattach(ifp, DLT_PFSYNC, PFSYNC_HDRLEN);

	V_pfsyncif = sc;

	return (0);
}

static void
pfsync_clone_destroy(struct ifnet *ifp)
{
	struct pfsync_softc *sc = ifp->if_softc;

	/*
	 * At this stage, everything should have already been
	 * cleared by pfsync_uninit(), and we have only to
	 * drain callouts.
	 */
	while (sc->sc_deferred > 0) {
		struct pfsync_deferral *pd = TAILQ_FIRST(&sc->sc_deferrals);

		TAILQ_REMOVE(&sc->sc_deferrals, pd, pd_entry);
		sc->sc_deferred--;
		if (callout_stop(&pd->pd_tmo)) {
			pf_release_state(pd->pd_st);
			m_freem(pd->pd_m);
			free(pd, M_PFSYNC);
		} else {
			pd->pd_refs++;
			callout_drain(&pd->pd_tmo);
			free(pd, M_PFSYNC);
		}
	}

	callout_drain(&sc->sc_tmo);
	callout_drain(&sc->sc_bulkfail_tmo);
	callout_drain(&sc->sc_bulk_tmo);

	if (!(sc->sc_flags & PFSYNCF_OK) && carp_demote_adj_p)
		(*carp_demote_adj_p)(-V_pfsync_carp_adj, "pfsync destroy");
	bpfdetach(ifp);
	if_detach(ifp);

	pfsync_drop(sc);

	if_free(ifp);
	if (sc->sc_imo.imo_membership)
		pfsync_multicast_cleanup(sc);
	mtx_destroy(&sc->sc_mtx);
	mtx_destroy(&sc->sc_bulk_mtx);
	free(sc, M_PFSYNC);

	V_pfsyncif = NULL;
}

static int
pfsync_alloc_scrub_memory(struct pfsync_state_peer *s,
    struct pf_state_peer *d)
{
	if (s->scrub.scrub_flag && d->scrub == NULL) {
		d->scrub = uma_zalloc(V_pf_state_scrub_z, M_NOWAIT | M_ZERO);
		if (d->scrub == NULL)
			return (ENOMEM);
	}

	return (0);
}

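/*
 * Create a pf state from a wire-format state, received either from a peer
 * or via the ioctl interface (flags & PFSYNC_SI_IOCTL).  On success the
 * new state has been inserted into the state table; on memory shortage
 * ENOMEM is returned so that callers may abort processing of the
 * remaining states in the message.
 */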
static int
pfsync_state_import(struct pfsync_state *sp, u_int8_t flags)
{
	struct pfsync_softc *sc = V_pfsyncif;
#ifndef	__NO_STRICT_ALIGNMENT
	struct pfsync_state_key key[2];
#endif
	struct pfsync_state_key *kw, *ks;
	struct pf_state	*st = NULL;
	struct pf_state_key *skw = NULL, *sks = NULL;
	struct pf_rule *r = NULL;
	struct pfi_kif	*kif;
	int error;

	PF_RULES_RASSERT();

	if (sp->creatorid == 0) {
		if (V_pf_status.debug >= PF_DEBUG_MISC)
			printf("%s: invalid creator id: %08x\n", __func__,
			    ntohl(sp->creatorid));
		return (EINVAL);
	}

	if ((kif = pfi_kif_find(sp->ifname)) == NULL) {
		if (V_pf_status.debug >= PF_DEBUG_MISC)
			printf("%s: unknown interface: %s\n", __func__,
			    sp->ifname);
		if (flags & PFSYNC_SI_IOCTL)
			return (EINVAL);
		return (0);	/* skip this state */
	}

	/*
	 * If the ruleset checksums match or the state is coming from the
	 * ioctl, it's safe to associate the state with the rule of that
	 * number.
	 */
	if (sp->rule != htonl(-1) && sp->anchor == htonl(-1) &&
	    (flags & (PFSYNC_SI_IOCTL | PFSYNC_SI_CKSUM)) && ntohl(sp->rule) <
	    pf_main_ruleset.rules[PF_RULESET_FILTER].active.rcount)
		r = pf_main_ruleset.rules[
		    PF_RULESET_FILTER].active.ptr_array[ntohl(sp->rule)];
	else
		r = &V_pf_default_rule;

	if (r->max_states &&
	    counter_u64_fetch(r->states_cur) >= r->max_states)
		goto cleanup;

	/*
	 * XXXGL: consider M_WAITOK in ioctl path after.
	 */
	if ((st = uma_zalloc(V_pf_state_z, M_NOWAIT | M_ZERO)) == NULL)
		goto cleanup;

	if ((skw = uma_zalloc(V_pf_state_key_z, M_NOWAIT)) == NULL)
		goto cleanup;

#ifndef	__NO_STRICT_ALIGNMENT
	bcopy(&sp->key, key, sizeof(struct pfsync_state_key) * 2);
	kw = &key[PF_SK_WIRE];
	ks = &key[PF_SK_STACK];
#else
	kw = &sp->key[PF_SK_WIRE];
	ks = &sp->key[PF_SK_STACK];
#endif

	if (PF_ANEQ(&kw->addr[0], &ks->addr[0], sp->af) ||
	    PF_ANEQ(&kw->addr[1], &ks->addr[1], sp->af) ||
	    kw->port[0] != ks->port[0] ||
	    kw->port[1] != ks->port[1]) {
		sks = uma_zalloc(V_pf_state_key_z, M_NOWAIT);
		if (sks == NULL)
			goto cleanup;
	} else
		sks = skw;

	/* allocate memory for scrub info */
	if (pfsync_alloc_scrub_memory(&sp->src, &st->src) ||
	    pfsync_alloc_scrub_memory(&sp->dst, &st->dst))
		goto cleanup;

	/* Copy to state key(s). */
	skw->addr[0] = kw->addr[0];
	skw->addr[1] = kw->addr[1];
	skw->port[0] = kw->port[0];
	skw->port[1] = kw->port[1];
	skw->proto = sp->proto;
	skw->af = sp->af;
	if (sks != skw) {
		sks->addr[0] = ks->addr[0];
		sks->addr[1] = ks->addr[1];
		sks->port[0] = ks->port[0];
		sks->port[1] = ks->port[1];
		sks->proto = sp->proto;
		sks->af = sp->af;
	}

	/* copy to state */
	bcopy(&sp->rt_addr, &st->rt_addr, sizeof(st->rt_addr));
	st->creation = time_uptime - ntohl(sp->creation);
	st->expire = time_uptime;
	if (sp->expire) {
		uint32_t timeout;

		timeout = r->timeout[sp->timeout];
		if (!timeout)
			timeout = V_pf_default_rule.timeout[sp->timeout];

		/* sp->expire may have been adaptively scaled by export. */
		st->expire -= timeout - ntohl(sp->expire);
	}

	st->direction = sp->direction;
	st->log = sp->log;
	st->timeout = sp->timeout;
	st->state_flags = sp->state_flags;

	st->id = sp->id;
	st->creatorid = sp->creatorid;
	pf_state_peer_ntoh(&sp->src, &st->src);
	pf_state_peer_ntoh(&sp->dst, &st->dst);

	st->rule.ptr = r;
	st->nat_rule.ptr = NULL;
	st->anchor.ptr = NULL;
	st->rt_kif = NULL;

	st->pfsync_time = time_uptime;
	st->sync_state = PFSYNC_S_NONE;

	if (!(flags & PFSYNC_SI_IOCTL))
		st->state_flags |= PFSTATE_NOSYNC;

	if ((error = pf_state_insert(kif, skw, sks, st)) != 0)
		goto cleanup_state;

	/* XXX when we have nat_rule/anchors, use STATE_INC_COUNTERS */
	counter_u64_add(r->states_cur, 1);
	counter_u64_add(r->states_tot, 1);

	if (!(flags & PFSYNC_SI_IOCTL)) {
		st->state_flags &= ~PFSTATE_NOSYNC;
		if (st->state_flags & PFSTATE_ACK) {
			pfsync_q_ins(st, PFSYNC_S_IACK);
			pfsync_push(sc);
		}
	}
	st->state_flags &= ~PFSTATE_ACK;
	PF_STATE_UNLOCK(st);

	return (0);

cleanup:
	error = ENOMEM;
	if (skw == sks)
		sks = NULL;
	if (skw != NULL)
		uma_zfree(V_pf_state_key_z, skw);
	if (sks != NULL)
		uma_zfree(V_pf_state_key_z, sks);

cleanup_state:	/* pf_state_insert() frees the state keys. */
	if (st) {
		if (st->dst.scrub)
			uma_zfree(V_pf_state_scrub_z, st->dst.scrub);
		if (st->src.scrub)
			uma_zfree(V_pf_state_scrub_z, st->src.scrub);
		uma_zfree(V_pf_state_z, st);
	}
	return (error);
}

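/*
 * Input path for IPPROTO_PFSYNC packets: after sanity checks (correct
 * interface, TTL of 255, protocol version, length) the packet is walked
 * one subheader at a time and each action is handed to its pfsync_acts[]
 * handler together with the record count from the subheader.
 */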
static void
pfsync_input(struct mbuf *m, __unused int off)
{
	struct pfsync_softc *sc = V_pfsyncif;
	struct pfsync_pkt pkt;
	struct ip *ip = mtod(m, struct ip *);
	struct pfsync_header *ph;
	struct pfsync_subheader subh;

	int offset, len;
	int rv;
	uint16_t count;

	V_pfsyncstats.pfsyncs_ipackets++;

	/* Verify that we have a sync interface configured. */
	if (!sc || !sc->sc_sync_if || !V_pf_status.running ||
	    (sc->sc_ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
		goto done;

	/* verify that the packet came in on the right interface */
	if (sc->sc_sync_if != m->m_pkthdr.rcvif) {
		V_pfsyncstats.pfsyncs_badif++;
		goto done;
	}

	sc->sc_ifp->if_ipackets++;
	sc->sc_ifp->if_ibytes += m->m_pkthdr.len;

	/* verify that the IP TTL is 255. */
	if (ip->ip_ttl != PFSYNC_DFLTTL) {
		V_pfsyncstats.pfsyncs_badttl++;
		goto done;
	}

	offset = ip->ip_hl << 2;
	if (m->m_pkthdr.len < offset + sizeof(*ph)) {
		V_pfsyncstats.pfsyncs_hdrops++;
		goto done;
	}

	if (offset + sizeof(*ph) > m->m_len) {
		if (m_pullup(m, offset + sizeof(*ph)) == NULL) {
			V_pfsyncstats.pfsyncs_hdrops++;
			return;
		}
		ip = mtod(m, struct ip *);
	}
	ph = (struct pfsync_header *)((char *)ip + offset);

	/* verify the version */
	if (ph->version != PFSYNC_VERSION) {
		V_pfsyncstats.pfsyncs_badver++;
		goto done;
	}

	len = ntohs(ph->len) + offset;
	if (m->m_pkthdr.len < len) {
		V_pfsyncstats.pfsyncs_badlen++;
		goto done;
	}

	/* Cheaper to grab this now than having to mess with mbufs later */
	pkt.ip = ip;
	pkt.src = ip->ip_src;
	pkt.flags = 0;

	/*
	 * Trusting pf_chksum during packet processing, as well as seeking
	 * in interface name tree, require holding PF_RULES_RLOCK().
	 */
	PF_RULES_RLOCK();
	if (!bcmp(&ph->pfcksum, &V_pf_status.pf_chksum, PF_MD5_DIGEST_LENGTH))
		pkt.flags |= PFSYNC_SI_CKSUM;

	offset += sizeof(*ph);
	while (offset <= len - sizeof(subh)) {
		m_copydata(m, offset, sizeof(subh), (caddr_t)&subh);
		offset += sizeof(subh);

		if (subh.action >= PFSYNC_ACT_MAX) {
			V_pfsyncstats.pfsyncs_badact++;
			PF_RULES_RUNLOCK();
			goto done;
		}

		count = ntohs(subh.count);
		V_pfsyncstats.pfsyncs_iacts[subh.action] += count;
		rv = (*pfsync_acts[subh.action])(&pkt, m, offset, count);
		if (rv == -1) {
			PF_RULES_RUNLOCK();
			return;
		}

		offset += rv;
	}
	PF_RULES_RUNLOCK();

done:
	m_freem(m);
}

static int
pfsync_in_clr(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
{
	struct pfsync_clr *clr;
	struct mbuf *mp;
	int len = sizeof(*clr) * count;
	int i, offp;
	u_int32_t creatorid;

	mp = m_pulldown(m, offset, len, &offp);
	if (mp == NULL) {
		V_pfsyncstats.pfsyncs_badlen++;
		return (-1);
	}
	clr = (struct pfsync_clr *)(mp->m_data + offp);

	for (i = 0; i < count; i++) {
		creatorid = clr[i].creatorid;

		if (clr[i].ifname[0] != '\0' &&
		    pfi_kif_find(clr[i].ifname) == NULL)
			continue;

		for (int j = 0; j <= pf_hashmask; j++) {
			struct pf_idhash *ih = &V_pf_idhash[j];
			struct pf_state *s;
relock:
			PF_HASHROW_LOCK(ih);
			LIST_FOREACH(s, &ih->states, entry) {
				if (s->creatorid == creatorid) {
					s->state_flags |= PFSTATE_NOSYNC;
					pf_unlink_state(s, PF_ENTER_LOCKED);
					goto relock;
				}
			}
			PF_HASHROW_UNLOCK(ih);
		}
	}

	return (len);
}

static int
pfsync_in_ins(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
{
	struct mbuf *mp;
	struct pfsync_state *sa, *sp;
	int len = sizeof(*sp) * count;
	int i, offp;

	mp = m_pulldown(m, offset, len, &offp);
	if (mp == NULL) {
		V_pfsyncstats.pfsyncs_badlen++;
		return (-1);
	}
	sa = (struct pfsync_state *)(mp->m_data + offp);

	for (i = 0; i < count; i++) {
		sp = &sa[i];

		/* Check for invalid values. */
		if (sp->timeout >= PFTM_MAX ||
		    sp->src.state > PF_TCPS_PROXY_DST ||
		    sp->dst.state > PF_TCPS_PROXY_DST ||
		    sp->direction > PF_OUT ||
		    (sp->af != AF_INET && sp->af != AF_INET6)) {
			if (V_pf_status.debug >= PF_DEBUG_MISC)
				printf("%s: invalid value\n", __func__);
			V_pfsyncstats.pfsyncs_badval++;
			continue;
		}

		if (pfsync_state_import(sp, pkt->flags) == ENOMEM)
			/* Drop out, but process the rest of the actions. */
			break;
	}

	return (len);
}

static int
pfsync_in_iack(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
{
	struct pfsync_ins_ack *ia, *iaa;
	struct pf_state *st;

	struct mbuf *mp;
	int len = count * sizeof(*ia);
	int offp, i;

	mp = m_pulldown(m, offset, len, &offp);
	if (mp == NULL) {
		V_pfsyncstats.pfsyncs_badlen++;
		return (-1);
	}
	iaa = (struct pfsync_ins_ack *)(mp->m_data + offp);

	for (i = 0; i < count; i++) {
		ia = &iaa[i];

		st = pf_find_state_byid(ia->id, ia->creatorid);
		if (st == NULL)
			continue;

		if (st->state_flags & PFSTATE_ACK) {
			PFSYNC_LOCK(V_pfsyncif);
			pfsync_undefer_state(st, 0);
			PFSYNC_UNLOCK(V_pfsyncif);
		}
		PF_STATE_UNLOCK(st);
	}

	return (len);
}

static int
pfsync_upd_tcp(struct pf_state *st, struct pfsync_state_peer *src,
    struct pfsync_state_peer *dst)
{
	int sync = 0;

	PF_STATE_LOCK_ASSERT(st);

	/*
	 * The state should never go backwards except
	 * for syn-proxy states.  Neither should the
	 * sequence window slide backwards.
	 */
	if ((st->src.state > src->state &&
	    (st->src.state < PF_TCPS_PROXY_SRC ||
	    src->state >= PF_TCPS_PROXY_SRC)) ||

	    (st->src.state == src->state &&
	    SEQ_GT(st->src.seqlo, ntohl(src->seqlo))))
		sync++;
	else
		pf_state_peer_ntoh(src, &st->src);

	if ((st->dst.state > dst->state) ||

	    (st->dst.state >= TCPS_SYN_SENT &&
	    SEQ_GT(st->dst.seqlo, ntohl(dst->seqlo))))
		sync++;
	else
		pf_state_peer_ntoh(dst, &st->dst);

	return (sync);
}

static int
pfsync_in_upd(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
{
	struct pfsync_softc *sc = V_pfsyncif;
	struct pfsync_state *sa, *sp;
	struct pf_state *st;
	int sync;

	struct mbuf *mp;
	int len = count * sizeof(*sp);
	int offp, i;

	mp = m_pulldown(m, offset, len, &offp);
	if (mp == NULL) {
		V_pfsyncstats.pfsyncs_badlen++;
		return (-1);
	}
	sa = (struct pfsync_state *)(mp->m_data + offp);

	for (i = 0; i < count; i++) {
		sp = &sa[i];

		/* check for invalid values */
		if (sp->timeout >= PFTM_MAX ||
		    sp->src.state > PF_TCPS_PROXY_DST ||
		    sp->dst.state > PF_TCPS_PROXY_DST) {
			if (V_pf_status.debug >= PF_DEBUG_MISC) {
				printf("pfsync_input: PFSYNC_ACT_UPD: "
				    "invalid value\n");
			}
			V_pfsyncstats.pfsyncs_badval++;
			continue;
		}

		st = pf_find_state_byid(sp->id, sp->creatorid);
		if (st == NULL) {
			/* insert the update */
			if (pfsync_state_import(sp, 0))
				V_pfsyncstats.pfsyncs_badstate++;
			continue;
		}

		if (st->state_flags & PFSTATE_ACK) {
			PFSYNC_LOCK(sc);
			pfsync_undefer_state(st, 1);
			PFSYNC_UNLOCK(sc);
		}

		if (st->key[PF_SK_WIRE]->proto == IPPROTO_TCP)
			sync = pfsync_upd_tcp(st, &sp->src, &sp->dst);
		else {
			sync = 0;

			/*
			 * Non-TCP protocol state machines always go
			 * forward.
			 */
			if (st->src.state > sp->src.state)
				sync++;
			else
				pf_state_peer_ntoh(&sp->src, &st->src);
			if (st->dst.state > sp->dst.state)
				sync++;
			else
				pf_state_peer_ntoh(&sp->dst, &st->dst);
		}
		if (sync < 2) {
			pfsync_alloc_scrub_memory(&sp->dst, &st->dst);
			pf_state_peer_ntoh(&sp->dst, &st->dst);
			st->expire = time_uptime;
			st->timeout = sp->timeout;
		}
		st->pfsync_time = time_uptime;

		if (sync) {
			V_pfsyncstats.pfsyncs_stale++;

			pfsync_update_state(st);
			PF_STATE_UNLOCK(st);
			PFSYNC_LOCK(sc);
			pfsync_push(sc);
			PFSYNC_UNLOCK(sc);
			continue;
		}
		PF_STATE_UNLOCK(st);
	}

	return (len);
}

static int
pfsync_in_upd_c(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
{
	struct pfsync_softc *sc = V_pfsyncif;
	struct pfsync_upd_c *ua, *up;
	struct pf_state *st;
	int len = count * sizeof(*up);
	int sync;
	struct mbuf *mp;
	int offp, i;

	mp = m_pulldown(m, offset, len, &offp);
	if (mp == NULL) {
		V_pfsyncstats.pfsyncs_badlen++;
		return (-1);
	}
	ua = (struct pfsync_upd_c *)(mp->m_data + offp);

	for (i = 0; i < count; i++) {
		up = &ua[i];

		/* check for invalid values */
		if (up->timeout >= PFTM_MAX ||
		    up->src.state > PF_TCPS_PROXY_DST ||
		    up->dst.state > PF_TCPS_PROXY_DST) {
			if (V_pf_status.debug >= PF_DEBUG_MISC) {
				printf("pfsync_input: "
				    "PFSYNC_ACT_UPD_C: "
				    "invalid value\n");
			}
			V_pfsyncstats.pfsyncs_badval++;
			continue;
		}

		st = pf_find_state_byid(up->id, up->creatorid);
		if (st == NULL) {
			/* We don't have this state. Ask for it. */
			PFSYNC_LOCK(sc);
			pfsync_request_update(up->creatorid, up->id);
			PFSYNC_UNLOCK(sc);
			continue;
		}

		if (st->state_flags & PFSTATE_ACK) {
			PFSYNC_LOCK(sc);
			pfsync_undefer_state(st, 1);
			PFSYNC_UNLOCK(sc);
		}

		if (st->key[PF_SK_WIRE]->proto == IPPROTO_TCP)
			sync = pfsync_upd_tcp(st, &up->src, &up->dst);
		else {
			sync = 0;

			/*
			 * Non-TCP protocol state machines always go
			 * forward.
			 */
			if (st->src.state > up->src.state)
				sync++;
			else
				pf_state_peer_ntoh(&up->src, &st->src);
			if (st->dst.state > up->dst.state)
				sync++;
			else
				pf_state_peer_ntoh(&up->dst, &st->dst);
		}
		if (sync < 2) {
			pfsync_alloc_scrub_memory(&up->dst, &st->dst);
			pf_state_peer_ntoh(&up->dst, &st->dst);
			st->expire = time_uptime;
			st->timeout = up->timeout;
		}
		st->pfsync_time = time_uptime;

		if (sync) {
			V_pfsyncstats.pfsyncs_stale++;

			pfsync_update_state(st);
			PF_STATE_UNLOCK(st);
			PFSYNC_LOCK(sc);
			pfsync_push(sc);
			PFSYNC_UNLOCK(sc);
			continue;
		}
		PF_STATE_UNLOCK(st);
	}

	return (len);
}

static int
pfsync_in_ureq(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
{
	struct pfsync_upd_req *ur, *ura;
	struct mbuf *mp;
	int len = count * sizeof(*ur);
	int i, offp;

	struct pf_state *st;

	mp = m_pulldown(m, offset, len, &offp);
	if (mp == NULL) {
		V_pfsyncstats.pfsyncs_badlen++;
		return (-1);
	}
	ura = (struct pfsync_upd_req *)(mp->m_data + offp);

	for (i = 0; i < count; i++) {
		ur = &ura[i];

		if (ur->id == 0 && ur->creatorid == 0)
			pfsync_bulk_start();
		else {
			st = pf_find_state_byid(ur->id, ur->creatorid);
			if (st == NULL) {
				V_pfsyncstats.pfsyncs_badstate++;
				continue;
			}
			if (st->state_flags & PFSTATE_NOSYNC) {
				PF_STATE_UNLOCK(st);
				continue;
			}

			pfsync_update_state_req(st);
			PF_STATE_UNLOCK(st);
		}
	}

	return (len);
}

static int
pfsync_in_del(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
{
	struct mbuf *mp;
	struct pfsync_state *sa, *sp;
	struct pf_state *st;
	int len = count * sizeof(*sp);
	int offp, i;

	mp = m_pulldown(m, offset, len, &offp);
	if (mp == NULL) {
		V_pfsyncstats.pfsyncs_badlen++;
		return (-1);
	}
	sa = (struct pfsync_state *)(mp->m_data + offp);

	for (i = 0; i < count; i++) {
		sp = &sa[i];

		st = pf_find_state_byid(sp->id, sp->creatorid);
		if (st == NULL) {
			V_pfsyncstats.pfsyncs_badstate++;
			continue;
		}
		st->state_flags |= PFSTATE_NOSYNC;
		pf_unlink_state(st, PF_ENTER_LOCKED);
	}

	return (len);
}

static int
pfsync_in_del_c(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
{
	struct mbuf *mp;
	struct pfsync_del_c *sa, *sp;
	struct pf_state *st;
	int len = count * sizeof(*sp);
	int offp, i;

	mp = m_pulldown(m, offset, len, &offp);
	if (mp == NULL) {
		V_pfsyncstats.pfsyncs_badlen++;
		return (-1);
	}
	sa = (struct pfsync_del_c *)(mp->m_data + offp);

	for (i = 0; i < count; i++) {
		sp = &sa[i];

		st = pf_find_state_byid(sp->id, sp->creatorid);
		if (st == NULL) {
			V_pfsyncstats.pfsyncs_badstate++;
			continue;
		}

		st->state_flags |= PFSTATE_NOSYNC;
		pf_unlink_state(st, PF_ENTER_LOCKED);
	}

	return (len);
}

static int
pfsync_in_bus(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
{
	struct pfsync_softc *sc = V_pfsyncif;
	struct pfsync_bus *bus;
	struct mbuf *mp;
	int len = count * sizeof(*bus);
	int offp;

	PFSYNC_BLOCK(sc);

	/* If we're not waiting for a bulk update, who cares. */
	if (sc->sc_ureq_sent == 0) {
		PFSYNC_BUNLOCK(sc);
		return (len);
	}

	mp = m_pulldown(m, offset, len, &offp);
	if (mp == NULL) {
		PFSYNC_BUNLOCK(sc);
		V_pfsyncstats.pfsyncs_badlen++;
		return (-1);
	}
	bus = (struct pfsync_bus *)(mp->m_data + offp);

	switch (bus->status) {
	case PFSYNC_BUS_START:
		callout_reset(&sc->sc_bulkfail_tmo, 4 * hz +
		    V_pf_limits[PF_LIMIT_STATES].limit /
		    ((sc->sc_ifp->if_mtu - PFSYNC_MINPKT) /
		    sizeof(struct pfsync_state)),
		    pfsync_bulk_fail, sc);
		if (V_pf_status.debug >= PF_DEBUG_MISC)
			printf("pfsync: received bulk update start\n");
		break;

	case PFSYNC_BUS_END:
		if (time_uptime - ntohl(bus->endtime) >=
		    sc->sc_ureq_sent) {
			/* that's it, we're happy */
			sc->sc_ureq_sent = 0;
			sc->sc_bulk_tries = 0;
			callout_stop(&sc->sc_bulkfail_tmo);
			if (!(sc->sc_flags & PFSYNCF_OK) && carp_demote_adj_p)
				(*carp_demote_adj_p)(-V_pfsync_carp_adj,
				    "pfsync bulk done");
			sc->sc_flags |= PFSYNCF_OK;
			if (V_pf_status.debug >= PF_DEBUG_MISC)
				printf("pfsync: received valid "
				    "bulk update end\n");
		} else {
			if (V_pf_status.debug >= PF_DEBUG_MISC)
				printf("pfsync: received invalid "
				    "bulk update end: bad timestamp\n");
		}
		break;
	}
	PFSYNC_BUNLOCK(sc);

	return (len);
}

static int
pfsync_in_tdb(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
{
	int len = count * sizeof(struct pfsync_tdb);

#if defined(IPSEC)
	struct pfsync_tdb *tp;
	struct mbuf *mp;
	int offp;
	int i;

	mp = m_pulldown(m, offset, len, &offp);
	if (mp == NULL) {
		V_pfsyncstats.pfsyncs_badlen++;
		return (-1);
	}
	tp = (struct pfsync_tdb *)(mp->m_data + offp);

	for (i = 0; i < count; i++)
		pfsync_update_net_tdb(&tp[i]);
#endif

	return (len);
}

#if defined(IPSEC)
/* Update an in-kernel tdb. Silently fail if no tdb is found. */
static void
pfsync_update_net_tdb(struct pfsync_tdb *pt)
{
	struct tdb		*tdb;

	/* check for invalid values */
	if (ntohl(pt->spi) <= SPI_RESERVED_MAX ||
	    (pt->dst.sa.sa_family != AF_INET &&
	    pt->dst.sa.sa_family != AF_INET6))
		goto bad;

	tdb = gettdb(pt->spi, &pt->dst, pt->sproto);
	if (tdb) {
		pt->rpl = ntohl(pt->rpl);
		pt->cur_bytes = (unsigned long long)be64toh(pt->cur_bytes);

		/* Neither replay nor byte counter should ever decrease. */
		if (pt->rpl < tdb->tdb_rpl ||
		    pt->cur_bytes < tdb->tdb_cur_bytes) {
			goto bad;
		}

		tdb->tdb_rpl = pt->rpl;
		tdb->tdb_cur_bytes = pt->cur_bytes;
	}
	return;

bad:
	if (V_pf_status.debug >= PF_DEBUG_MISC)
		printf("pfsync_insert: PFSYNC_ACT_TDB_UPD: "
		    "invalid value\n");
	V_pfsyncstats.pfsyncs_badstate++;
	return;
}
#endif

static int
pfsync_in_eof(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
{
	/* check if we are at the right place in the packet */
	if (offset != m->m_pkthdr.len)
		V_pfsyncstats.pfsyncs_badlen++;

	/* we're done. free and let the caller return */
	m_freem(m);
	return (-1);
}

static int
pfsync_in_error(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
{
	V_pfsyncstats.pfsyncs_badact++;

	m_freem(m);
	return (-1);
}

static int
pfsyncoutput(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst,
	struct route *rt)
{
	m_freem(m);
	return (0);
}

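/*
 * Interface ioctls.  The pfsync-specific configuration normally comes
 * from ifconfig(8), e.g. (assuming em0 is the synchronization interface):
 *
 *	ifconfig pfsync0 syncdev em0 maxupd 128 defer up
 *
 * which arrives here as SIOCSETPFSYNC carrying a struct pfsyncreq.
 */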
/* ARGSUSED */
static int
pfsyncioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
{
	struct pfsync_softc *sc = ifp->if_softc;
	struct ifreq *ifr = (struct ifreq *)data;
	struct pfsyncreq pfsyncr;
	int error;

	switch (cmd) {
	case SIOCSIFFLAGS:
		PFSYNC_LOCK(sc);
		if (ifp->if_flags & IFF_UP) {
			ifp->if_drv_flags |= IFF_DRV_RUNNING;
			PFSYNC_UNLOCK(sc);
			pfsync_pointers_init();
		} else {
			ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
			PFSYNC_UNLOCK(sc);
			pfsync_pointers_uninit();
		}
		break;
	case SIOCSIFMTU:
		if (!sc->sc_sync_if ||
		    ifr->ifr_mtu <= PFSYNC_MINPKT ||
		    ifr->ifr_mtu > sc->sc_sync_if->if_mtu)
			return (EINVAL);
		if (ifr->ifr_mtu < ifp->if_mtu) {
			PFSYNC_LOCK(sc);
			if (sc->sc_len > PFSYNC_MINPKT)
				pfsync_sendout(1);
			PFSYNC_UNLOCK(sc);
		}
		ifp->if_mtu = ifr->ifr_mtu;
		break;
	case SIOCGETPFSYNC:
		bzero(&pfsyncr, sizeof(pfsyncr));
		PFSYNC_LOCK(sc);
		if (sc->sc_sync_if) {
			strlcpy(pfsyncr.pfsyncr_syncdev,
			    sc->sc_sync_if->if_xname, IFNAMSIZ);
		}
		pfsyncr.pfsyncr_syncpeer = sc->sc_sync_peer;
		pfsyncr.pfsyncr_maxupdates = sc->sc_maxupdates;
		pfsyncr.pfsyncr_defer = (PFSYNCF_DEFER ==
		    (sc->sc_flags & PFSYNCF_DEFER));
		PFSYNC_UNLOCK(sc);
		return (copyout(&pfsyncr, ifr->ifr_data, sizeof(pfsyncr)));

	case SIOCSETPFSYNC:
	    {
		struct ip_moptions *imo = &sc->sc_imo;
		struct ifnet *sifp;
		struct ip *ip;
		void *mship = NULL;

		if ((error = priv_check(curthread, PRIV_NETINET_PF)) != 0)
			return (error);
		if ((error = copyin(ifr->ifr_data, &pfsyncr, sizeof(pfsyncr))))
			return (error);

		if (pfsyncr.pfsyncr_maxupdates > 255)
			return (EINVAL);

		if (pfsyncr.pfsyncr_syncdev[0] == 0)
			sifp = NULL;
		else if ((sifp = ifunit_ref(pfsyncr.pfsyncr_syncdev)) == NULL)
			return (EINVAL);

		if (sifp != NULL && (
		    pfsyncr.pfsyncr_syncpeer.s_addr == 0 ||
		    pfsyncr.pfsyncr_syncpeer.s_addr ==
		    htonl(INADDR_PFSYNC_GROUP)))
			mship = malloc((sizeof(struct in_multi *) *
			    IP_MIN_MEMBERSHIPS), M_PFSYNC, M_WAITOK | M_ZERO);

		PFSYNC_LOCK(sc);
		if (pfsyncr.pfsyncr_syncpeer.s_addr == 0)
			sc->sc_sync_peer.s_addr = htonl(INADDR_PFSYNC_GROUP);
		else
			sc->sc_sync_peer.s_addr =
			    pfsyncr.pfsyncr_syncpeer.s_addr;

		sc->sc_maxupdates = pfsyncr.pfsyncr_maxupdates;
		if (pfsyncr.pfsyncr_defer) {
			sc->sc_flags |= PFSYNCF_DEFER;
			pfsync_defer_ptr = pfsync_defer;
		} else {
			sc->sc_flags &= ~PFSYNCF_DEFER;
			pfsync_defer_ptr = NULL;
		}

		if (sifp == NULL) {
			if (sc->sc_sync_if)
				if_rele(sc->sc_sync_if);
			sc->sc_sync_if = NULL;
			if (imo->imo_membership)
				pfsync_multicast_cleanup(sc);
			PFSYNC_UNLOCK(sc);
			break;
		}

		if (sc->sc_len > PFSYNC_MINPKT &&
		    (sifp->if_mtu < sc->sc_ifp->if_mtu ||
		    (sc->sc_sync_if != NULL &&
		    sifp->if_mtu < sc->sc_sync_if->if_mtu) ||
		    sifp->if_mtu < MCLBYTES - sizeof(struct ip)))
			pfsync_sendout(1);

		if (imo->imo_membership)
			pfsync_multicast_cleanup(sc);

		if (sc->sc_sync_peer.s_addr == htonl(INADDR_PFSYNC_GROUP)) {
			error = pfsync_multicast_setup(sc, sifp, mship);
			if (error) {
				if_rele(sifp);
				free(mship, M_PFSYNC);
				PFSYNC_UNLOCK(sc);
				return (error);
			}
		}
		if (sc->sc_sync_if)
			if_rele(sc->sc_sync_if);
		sc->sc_sync_if = sifp;

		ip = &sc->sc_template;
		bzero(ip, sizeof(*ip));
		ip->ip_v = IPVERSION;
		ip->ip_hl = sizeof(sc->sc_template) >> 2;
		ip->ip_tos = IPTOS_LOWDELAY;
		/* len and id are set later. */
		ip->ip_off = htons(IP_DF);
		ip->ip_ttl = PFSYNC_DFLTTL;
		ip->ip_p = IPPROTO_PFSYNC;
		ip->ip_src.s_addr = INADDR_ANY;
		ip->ip_dst.s_addr = sc->sc_sync_peer.s_addr;

		/* Request a full state table update. */
		if ((sc->sc_flags & PFSYNCF_OK) && carp_demote_adj_p)
			(*carp_demote_adj_p)(V_pfsync_carp_adj,
			    "pfsync bulk start");
		sc->sc_flags &= ~PFSYNCF_OK;
		if (V_pf_status.debug >= PF_DEBUG_MISC)
			printf("pfsync: requesting bulk update\n");
		pfsync_request_update(0, 0);
		PFSYNC_UNLOCK(sc);
		PFSYNC_BLOCK(sc);
		sc->sc_ureq_sent = time_uptime;
		callout_reset(&sc->sc_bulkfail_tmo, 5 * hz, pfsync_bulk_fail,
		    sc);
		PFSYNC_BUNLOCK(sc);

		break;
	    }
	default:
		return (ENOTTY);
	}

	return (0);
}

static void
pfsync_out_state(struct pf_state *st, void *buf)
{
	struct pfsync_state *sp = buf;

	pfsync_state_export(sp, st);
}

static void
pfsync_out_iack(struct pf_state *st, void *buf)
{
	struct pfsync_ins_ack *iack = buf;

	iack->id = st->id;
	iack->creatorid = st->creatorid;
}

static void
pfsync_out_upd_c(struct pf_state *st, void *buf)
{
	struct pfsync_upd_c *up = buf;

	bzero(up, sizeof(*up));
	up->id = st->id;
	pf_state_peer_hton(&st->src, &up->src);
	pf_state_peer_hton(&st->dst, &up->dst);
	up->creatorid = st->creatorid;
	up->timeout = st->timeout;
}

static void
pfsync_out_del(struct pf_state *st, void *buf)
{
	struct pfsync_del_c *dp = buf;

	dp->id = st->id;
	dp->creatorid = st->creatorid;
	st->state_flags |= PFSTATE_NOSYNC;
}

static void
pfsync_drop(struct pfsync_softc *sc)
{
	struct pf_state *st, *next;
	struct pfsync_upd_req_item *ur;
	int q;

	for (q = 0; q < PFSYNC_S_COUNT; q++) {
		if (TAILQ_EMPTY(&sc->sc_qs[q]))
			continue;

		TAILQ_FOREACH_SAFE(st, &sc->sc_qs[q], sync_list, next) {
			KASSERT(st->sync_state == q,
				("%s: st->sync_state == q",
					__func__));
			st->sync_state = PFSYNC_S_NONE;
			pf_release_state(st);
		}
		TAILQ_INIT(&sc->sc_qs[q]);
	}

	while ((ur = TAILQ_FIRST(&sc->sc_upd_req_list)) != NULL) {
		TAILQ_REMOVE(&sc->sc_upd_req_list, ur, ur_entry);
		free(ur, M_PFSYNC);
	}

	sc->sc_plus = NULL;
	sc->sc_len = PFSYNC_MINPKT;
}

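/*
 * Turn the queued data into a single pfsync datagram: an IP header built
 * from sc_template, the pfsync header, one subheader per non-empty state
 * queue followed by its records, any pending update requests, an optional
 * "plus" region and a trailing EOF subheader.  The result is placed on
 * the interface send queue and, if schedswi is set, the software
 * interrupt is scheduled to transmit it.
 */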
static void
pfsync_sendout(int schedswi)
{
	struct pfsync_softc *sc = V_pfsyncif;
	struct ifnet *ifp = sc->sc_ifp;
	struct mbuf *m;
	struct ip *ip;
	struct pfsync_header *ph;
	struct pfsync_subheader *subh;
	struct pf_state *st;
	struct pfsync_upd_req_item *ur;
	int offset;
	int q, count = 0;

	KASSERT(sc != NULL, ("%s: null sc", __func__));
	KASSERT(sc->sc_len > PFSYNC_MINPKT,
	    ("%s: sc_len %zu", __func__, sc->sc_len));
	PFSYNC_LOCK_ASSERT(sc);

	if (ifp->if_bpf == NULL && sc->sc_sync_if == NULL) {
		pfsync_drop(sc);
		return;
	}

	m = m_get2(max_linkhdr + sc->sc_len, M_NOWAIT, MT_DATA, M_PKTHDR);
	if (m == NULL) {
		sc->sc_ifp->if_oerrors++;
		V_pfsyncstats.pfsyncs_onomem++;
		return;
	}
	m->m_data += max_linkhdr;
	m->m_len = m->m_pkthdr.len = sc->sc_len;

	/* build the ip header */
	ip = (struct ip *)m->m_data;
	bcopy(&sc->sc_template, ip, sizeof(*ip));
	offset = sizeof(*ip);

	ip->ip_len = htons(m->m_pkthdr.len);
	ip->ip_id = htons(ip_randomid());

	/* build the pfsync header */
	ph = (struct pfsync_header *)(m->m_data + offset);
	bzero(ph, sizeof(*ph));
	offset += sizeof(*ph);

	ph->version = PFSYNC_VERSION;
	ph->len = htons(sc->sc_len - sizeof(*ip));
	bcopy(V_pf_status.pf_chksum, ph->pfcksum, PF_MD5_DIGEST_LENGTH);

	/* walk the queues */
	for (q = 0; q < PFSYNC_S_COUNT; q++) {
		if (TAILQ_EMPTY(&sc->sc_qs[q]))
			continue;

		subh = (struct pfsync_subheader *)(m->m_data + offset);
		offset += sizeof(*subh);

		count = 0;
		TAILQ_FOREACH(st, &sc->sc_qs[q], sync_list) {
			KASSERT(st->sync_state == q,
				("%s: st->sync_state == q",
					__func__));
			/*
			 * XXXGL: some of the write methods do unlocked
			 * reads of state data :(
			 */
			pfsync_qs[q].write(st, m->m_data + offset);
			offset += pfsync_qs[q].len;
			st->sync_state = PFSYNC_S_NONE;
			pf_release_state(st);
			count++;
		}
		TAILQ_INIT(&sc->sc_qs[q]);

		bzero(subh, sizeof(*subh));
		subh->action = pfsync_qs[q].action;
		subh->count = htons(count);
		V_pfsyncstats.pfsyncs_oacts[pfsync_qs[q].action] += count;
	}

	if (!TAILQ_EMPTY(&sc->sc_upd_req_list)) {
		subh = (struct pfsync_subheader *)(m->m_data + offset);
		offset += sizeof(*subh);

		count = 0;
		while ((ur = TAILQ_FIRST(&sc->sc_upd_req_list)) != NULL) {
			TAILQ_REMOVE(&sc->sc_upd_req_list, ur, ur_entry);

			bcopy(&ur->ur_msg, m->m_data + offset,
			    sizeof(ur->ur_msg));
			offset += sizeof(ur->ur_msg);
			free(ur, M_PFSYNC);
			count++;
		}

		bzero(subh, sizeof(*subh));
		subh->action = PFSYNC_ACT_UPD_REQ;
		subh->count = htons(count);
		V_pfsyncstats.pfsyncs_oacts[PFSYNC_ACT_UPD_REQ] += count;
	}

	/* has someone built a custom region for us to add? */
	if (sc->sc_plus != NULL) {
		bcopy(sc->sc_plus, m->m_data + offset, sc->sc_pluslen);
		offset += sc->sc_pluslen;

		sc->sc_plus = NULL;
	}

	subh = (struct pfsync_subheader *)(m->m_data + offset);
	offset += sizeof(*subh);

	bzero(subh, sizeof(*subh));
	subh->action = PFSYNC_ACT_EOF;
	subh->count = htons(1);
	V_pfsyncstats.pfsyncs_oacts[PFSYNC_ACT_EOF]++;

	/* we're done, let's put it on the wire */
	if (ifp->if_bpf) {
		m->m_data += sizeof(*ip);
		m->m_len = m->m_pkthdr.len = sc->sc_len - sizeof(*ip);
		BPF_MTAP(ifp, m);
		m->m_data -= sizeof(*ip);
		m->m_len = m->m_pkthdr.len = sc->sc_len;
	}

	if (sc->sc_sync_if == NULL) {
		sc->sc_len = PFSYNC_MINPKT;
		m_freem(m);
		return;
	}

	sc->sc_ifp->if_opackets++;
	sc->sc_ifp->if_obytes += m->m_pkthdr.len;
	sc->sc_len = PFSYNC_MINPKT;

	if (!_IF_QFULL(&sc->sc_ifp->if_snd))
		_IF_ENQUEUE(&sc->sc_ifp->if_snd, m);
	else {
		m_freem(m);
		sc->sc_ifp->if_snd.ifq_drops++;
	}
	if (schedswi)
		swi_sched(V_pfsync_swi_cookie, 0);
}

static void
pfsync_insert_state(struct pf_state *st)
{
	struct pfsync_softc *sc = V_pfsyncif;

	if (st->state_flags & PFSTATE_NOSYNC)
		return;

	if ((st->rule.ptr->rule_flag & PFRULE_NOSYNC) ||
	    st->key[PF_SK_WIRE]->proto == IPPROTO_PFSYNC) {
		st->state_flags |= PFSTATE_NOSYNC;
		return;
	}

	KASSERT(st->sync_state == PFSYNC_S_NONE,
		("%s: st->sync_state %u", __func__, st->sync_state));

	PFSYNC_LOCK(sc);
	if (sc->sc_len == PFSYNC_MINPKT)
		callout_reset(&sc->sc_tmo, 1 * hz, pfsync_timeout, V_pfsyncif);

	pfsync_q_ins(st, PFSYNC_S_INS);
	PFSYNC_UNLOCK(sc);

	st->sync_updates = 0;
}

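/*
 * Packet deferral: instead of transmitting the packet that created a
 * state right away, hold on to it until the peer acknowledges the state
 * insertion or a short timeout (10 ticks here) expires.  This gives the
 * peer a chance to learn about the state before it sees traffic for it
 * after a quick failover.  Returns 1 if the mbuf was taken.
 */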
static int
pfsync_defer(struct pf_state *st, struct mbuf *m)
{
	struct pfsync_softc *sc = V_pfsyncif;
	struct pfsync_deferral *pd;

	if (m->m_flags & (M_BCAST|M_MCAST))
		return (0);

	PFSYNC_LOCK(sc);

	if (!(sc->sc_ifp->if_drv_flags & IFF_DRV_RUNNING) ||
	    !(sc->sc_flags & PFSYNCF_DEFER)) {
		PFSYNC_UNLOCK(sc);
		return (0);
	}

	if (sc->sc_deferred >= 128)
		pfsync_undefer(TAILQ_FIRST(&sc->sc_deferrals), 0);

	pd = malloc(sizeof(*pd), M_PFSYNC, M_NOWAIT);
	if (pd == NULL) {
		PFSYNC_UNLOCK(sc);
		return (0);
	}
	sc->sc_deferred++;

	m->m_flags |= M_SKIP_FIREWALL;
	st->state_flags |= PFSTATE_ACK;

	pd->pd_sc = sc;
	pd->pd_refs = 0;
	pd->pd_st = st;
	pf_ref_state(st);
	pd->pd_m = m;

	TAILQ_INSERT_TAIL(&sc->sc_deferrals, pd, pd_entry);
	callout_init_mtx(&pd->pd_tmo, &sc->sc_mtx, CALLOUT_RETURNUNLOCKED);
	callout_reset(&pd->pd_tmo, 10, pfsync_defer_tmo, pd);

	pfsync_push(sc);
	PFSYNC_UNLOCK(sc);

	return (1);
}

static void
pfsync_undefer(struct pfsync_deferral *pd, int drop)
{
	struct pfsync_softc *sc = pd->pd_sc;
	struct mbuf *m = pd->pd_m;
	struct pf_state *st = pd->pd_st;

	PFSYNC_LOCK_ASSERT(sc);

	TAILQ_REMOVE(&sc->sc_deferrals, pd, pd_entry);
	sc->sc_deferred--;
	pd->pd_st->state_flags &= ~PFSTATE_ACK;	/* XXX: locking! */
	free(pd, M_PFSYNC);
	pf_release_state(st);

	if (drop)
		m_freem(m);
	else {
		_IF_ENQUEUE(&sc->sc_ifp->if_snd, m);
		pfsync_push(sc);
	}
}

static void
pfsync_defer_tmo(void *arg)
{
	struct pfsync_deferral *pd = arg;
	struct pfsync_softc *sc = pd->pd_sc;
	struct mbuf *m = pd->pd_m;
	struct pf_state *st = pd->pd_st;

	PFSYNC_LOCK_ASSERT(sc);

	CURVNET_SET(m->m_pkthdr.rcvif->if_vnet);

	TAILQ_REMOVE(&sc->sc_deferrals, pd, pd_entry);
	sc->sc_deferred--;
	pd->pd_st->state_flags &= ~PFSTATE_ACK;	/* XXX: locking! */
	if (pd->pd_refs == 0)
		free(pd, M_PFSYNC);
	PFSYNC_UNLOCK(sc);

	ip_output(m, NULL, NULL, 0, NULL, NULL);

	pf_release_state(st);

	CURVNET_RESTORE();
}

static void
pfsync_undefer_state(struct pf_state *st, int drop)
{
	struct pfsync_softc *sc = V_pfsyncif;
	struct pfsync_deferral *pd;

	PFSYNC_LOCK_ASSERT(sc);

	TAILQ_FOREACH(pd, &sc->sc_deferrals, pd_entry) {
		if (pd->pd_st == st) {
			if (callout_stop(&pd->pd_tmo))
				pfsync_undefer(pd, drop);
			return;
		}
	}

	panic("%s: unable to find deferred state", __func__);
}

static void
pfsync_update_state(struct pf_state *st)
{
	struct pfsync_softc *sc = V_pfsyncif;
	int sync = 0;

	PF_STATE_LOCK_ASSERT(st);
	PFSYNC_LOCK(sc);

	if (st->state_flags & PFSTATE_ACK)
		pfsync_undefer_state(st, 0);
	if (st->state_flags & PFSTATE_NOSYNC) {
		if (st->sync_state != PFSYNC_S_NONE)
			pfsync_q_del(st);
		PFSYNC_UNLOCK(sc);
		return;
	}

	if (sc->sc_len == PFSYNC_MINPKT)
		callout_reset(&sc->sc_tmo, 1 * hz, pfsync_timeout, V_pfsyncif);

	switch (st->sync_state) {
	case PFSYNC_S_UPD_C:
	case PFSYNC_S_UPD:
	case PFSYNC_S_INS:
		/* we're already handling it */

		if (st->key[PF_SK_WIRE]->proto == IPPROTO_TCP) {
			st->sync_updates++;
			if (st->sync_updates >= sc->sc_maxupdates)
				sync = 1;
		}
		break;

	case PFSYNC_S_IACK:
		pfsync_q_del(st);
		/* FALLTHROUGH */
	case PFSYNC_S_NONE:
		pfsync_q_ins(st, PFSYNC_S_UPD_C);
		st->sync_updates = 0;
		break;

	default:
		panic("%s: unexpected sync state %d", __func__, st->sync_state);
	}

	if (sync || (time_uptime - st->pfsync_time) < 2)
		pfsync_push(sc);

	PFSYNC_UNLOCK(sc);
}

static void
pfsync_request_update(u_int32_t creatorid, u_int64_t id)
{
	struct pfsync_softc *sc = V_pfsyncif;
	struct pfsync_upd_req_item *item;
	size_t nlen = sizeof(struct pfsync_upd_req);

	PFSYNC_LOCK_ASSERT(sc);

	/*
	 * Try to keep from generating multiple update requests for the
	 * same state: search the current subheader queue for a duplicate.
	 * Note that this does not look into already packed datagrams.
	 */
	TAILQ_FOREACH(item, &sc->sc_upd_req_list, ur_entry)
		if (item->ur_msg.id == id &&
		    item->ur_msg.creatorid == creatorid)
			return;

	item = malloc(sizeof(*item), M_PFSYNC, M_NOWAIT);
	if (item == NULL)
		return; /* XXX stats */

	item->ur_msg.id = id;
	item->ur_msg.creatorid = creatorid;

	if (TAILQ_EMPTY(&sc->sc_upd_req_list))
		nlen += sizeof(struct pfsync_subheader);

	if (sc->sc_len + nlen > sc->sc_ifp->if_mtu) {
		pfsync_sendout(1);

		nlen = sizeof(struct pfsync_subheader) +
		    sizeof(struct pfsync_upd_req);
	}

	TAILQ_INSERT_TAIL(&sc->sc_upd_req_list, item, ur_entry);
	sc->sc_len += nlen;
}

static void
pfsync_update_state_req(struct pf_state *st)
{
	struct pfsync_softc *sc = V_pfsyncif;

	PF_STATE_LOCK_ASSERT(st);
	PFSYNC_LOCK(sc);

	if (st->state_flags & PFSTATE_NOSYNC) {
		if (st->sync_state != PFSYNC_S_NONE)
			pfsync_q_del(st);
		PFSYNC_UNLOCK(sc);
		return;
	}

	switch (st->sync_state) {
	case PFSYNC_S_UPD_C:
	case PFSYNC_S_IACK:
		pfsync_q_del(st);
		/* FALLTHROUGH */
	case PFSYNC_S_NONE:
		pfsync_q_ins(st, PFSYNC_S_UPD);
		pfsync_push(sc);
		break;

	case PFSYNC_S_INS:
	case PFSYNC_S_UPD:
	case PFSYNC_S_DEL:
		/* we're already handling it */
		break;

	default:
		panic("%s: unexpected sync state %d", __func__, st->sync_state);
	}

	PFSYNC_UNLOCK(sc);
}

static void
pfsync_delete_state(struct pf_state *st)
{
	struct pfsync_softc *sc = V_pfsyncif;

	PFSYNC_LOCK(sc);
	if (st->state_flags & PFSTATE_ACK)
		pfsync_undefer_state(st, 1);
	if (st->state_flags & PFSTATE_NOSYNC) {
		if (st->sync_state != PFSYNC_S_NONE)
			pfsync_q_del(st);
		PFSYNC_UNLOCK(sc);
		return;
	}

	if (sc->sc_len == PFSYNC_MINPKT)
		callout_reset(&sc->sc_tmo, 1 * hz, pfsync_timeout, V_pfsyncif);

	switch (st->sync_state) {
	case PFSYNC_S_INS:
		/* We never got to tell the world so just forget about it. */
		pfsync_q_del(st);
		break;

	case PFSYNC_S_UPD_C:
	case PFSYNC_S_UPD:
	case PFSYNC_S_IACK:
		pfsync_q_del(st);
		/* FALLTHROUGH to putting it on the del list */

	case PFSYNC_S_NONE:
		pfsync_q_ins(st, PFSYNC_S_DEL);
		break;

	default:
		panic("%s: unexpected sync state %d", __func__, st->sync_state);
	}
	PFSYNC_UNLOCK(sc);
}

static void
pfsync_clear_states(u_int32_t creatorid, const char *ifname)
{
	struct pfsync_softc *sc = V_pfsyncif;
	struct {
		struct pfsync_subheader subh;
		struct pfsync_clr clr;
	} __packed r;

	bzero(&r, sizeof(r));

	r.subh.action = PFSYNC_ACT_CLR;
	r.subh.count = htons(1);
	V_pfsyncstats.pfsyncs_oacts[PFSYNC_ACT_CLR]++;

	strlcpy(r.clr.ifname, ifname, sizeof(r.clr.ifname));
	r.clr.creatorid = creatorid;

	PFSYNC_LOCK(sc);
	pfsync_send_plus(&r, sizeof(r));
	PFSYNC_UNLOCK(sc);
}

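/*
 * Queue a state on one of the PFSYNC_S_* queues and grow sc_len
 * accordingly; the first state on a queue also pays for that queue's
 * subheader.  If the record would no longer fit within the interface
 * MTU, the current packet is sent out first.
 */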
1973static void
1974pfsync_q_ins(struct pf_state *st, int q)
1975{
1976	struct pfsync_softc *sc = V_pfsyncif;
1977	size_t nlen = pfsync_qs[q].len;
1978
1979	PFSYNC_LOCK_ASSERT(sc);
1980
1981	KASSERT(st->sync_state == PFSYNC_S_NONE,
1982		("%s: st->sync_state %u", __func__, st->sync_state));
1983	KASSERT(sc->sc_len >= PFSYNC_MINPKT, ("pfsync pkt len is too low %zu",
1984	    sc->sc_len));
1985
1986	if (TAILQ_EMPTY(&sc->sc_qs[q]))
1987		nlen += sizeof(struct pfsync_subheader);
1988
1989	if (sc->sc_len + nlen > sc->sc_ifp->if_mtu) {
1990		pfsync_sendout(1);
1991
1992		nlen = sizeof(struct pfsync_subheader) + pfsync_qs[q].len;
1993	}
1994
1995	sc->sc_len += nlen;
1996	TAILQ_INSERT_TAIL(&sc->sc_qs[q], st, sync_list);
1997	st->sync_state = q;
1998	pf_ref_state(st);
1999}
2000
static void
pfsync_q_del(struct pf_state *st)
{
	struct pfsync_softc *sc = V_pfsyncif;
	int q = st->sync_state;

	PFSYNC_LOCK_ASSERT(sc);
	KASSERT(st->sync_state != PFSYNC_S_NONE,
		("%s: st->sync_state != PFSYNC_S_NONE", __func__));

	sc->sc_len -= pfsync_qs[q].len;
	TAILQ_REMOVE(&sc->sc_qs[q], st, sync_list);
	st->sync_state = PFSYNC_S_NONE;
	pf_release_state(st);

	if (TAILQ_EMPTY(&sc->sc_qs[q]))
		sc->sc_len -= sizeof(struct pfsync_subheader);
}

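/*
 * A peer has requested a bulk update: remember when the request was
 * received, rewind the bulk cursor, announce PFSYNC_BUS_START and
 * schedule the first pfsync_bulk_update() run.
 */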
static void
pfsync_bulk_start(void)
{
	struct pfsync_softc *sc = V_pfsyncif;

	if (V_pf_status.debug >= PF_DEBUG_MISC)
		printf("pfsync: received bulk update request\n");

	PFSYNC_BLOCK(sc);

	sc->sc_ureq_received = time_uptime;
	sc->sc_bulk_hashid = 0;
	sc->sc_bulk_stateid = 0;
	pfsync_bulk_status(PFSYNC_BUS_START);
	callout_reset(&sc->sc_bulk_tmo, 1, pfsync_bulk_update, sc);
	PFSYNC_BUNLOCK(sc);
}

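/*
 * Callout that walks the state id hash, sending out every state that
 * is not already queued and has not been touched since the bulk
 * request came in.  Once the packet fills up, the current position
 * is saved in sc_bulk_hashid/sc_bulk_stateid/sc_bulk_creatorid and
 * the callout reschedules itself; when the walk completes,
 * PFSYNC_BUS_END is announced.
 */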
static void
pfsync_bulk_update(void *arg)
{
	struct pfsync_softc *sc = arg;
	struct pf_state *s;
	int i, sent = 0;

	PFSYNC_BLOCK_ASSERT(sc);
	CURVNET_SET(sc->sc_ifp->if_vnet);

	/*
	 * Start with the last state from the previous invocation.
	 * It may have been purged in the meantime; in that case,
	 * start from its hash slot.
	 */
	s = pf_find_state_byid(sc->sc_bulk_stateid, sc->sc_bulk_creatorid);

	if (s != NULL)
		i = PF_IDHASH(s);
	else
		i = sc->sc_bulk_hashid;

	for (; i <= pf_hashmask; i++) {
		struct pf_idhash *ih = &V_pf_idhash[i];

		if (s != NULL)
			PF_HASHROW_ASSERT(ih);
		else {
			PF_HASHROW_LOCK(ih);
			s = LIST_FIRST(&ih->states);
		}

		for (; s; s = LIST_NEXT(s, entry)) {

			if (sent > 1 && (sc->sc_ifp->if_mtu - sc->sc_len) <
			    sizeof(struct pfsync_state)) {
				/* We've filled a packet. */
				sc->sc_bulk_hashid = i;
				sc->sc_bulk_stateid = s->id;
				sc->sc_bulk_creatorid = s->creatorid;
				PF_HASHROW_UNLOCK(ih);
				callout_reset(&sc->sc_bulk_tmo, 1,
				    pfsync_bulk_update, sc);
				goto full;
			}

			if (s->sync_state == PFSYNC_S_NONE &&
			    s->timeout < PFTM_MAX &&
			    s->pfsync_time <= sc->sc_ureq_received) {
				pfsync_update_state_req(s);
				sent++;
			}
		}
		PF_HASHROW_UNLOCK(ih);
	}

	/* We're done. */
	pfsync_bulk_status(PFSYNC_BUS_END);

full:
	CURVNET_RESTORE();
}

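/*
 * Send a PFSYNC_ACT_BUS message carrying the given status
 * (PFSYNC_BUS_START or PFSYNC_BUS_END) and the number of seconds
 * elapsed since the bulk update request was received.
 */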
static void
pfsync_bulk_status(u_int8_t status)
{
	struct {
		struct pfsync_subheader subh;
		struct pfsync_bus bus;
	} __packed r;

	struct pfsync_softc *sc = V_pfsyncif;

	bzero(&r, sizeof(r));

	r.subh.action = PFSYNC_ACT_BUS;
	r.subh.count = htons(1);
	V_pfsyncstats.pfsyncs_oacts[PFSYNC_ACT_BUS]++;

	r.bus.creatorid = V_pf_status.hostid;
	r.bus.endtime = htonl(time_uptime - sc->sc_ureq_received);
	r.bus.status = status;

	PFSYNC_LOCK(sc);
	pfsync_send_plus(&r, sizeof(r));
	PFSYNC_UNLOCK(sc);
}

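/*
 * Callout that fires if a requested bulk update did not complete in
 * time.  Re-request the update up to PFSYNC_MAX_BULKTRIES times;
 * after that give up, mark ourselves as in sync anyway and lift the
 * carp(4) demotion that was applied while we were out of sync.
 */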
static void
pfsync_bulk_fail(void *arg)
{
	struct pfsync_softc *sc = arg;

	CURVNET_SET(sc->sc_ifp->if_vnet);

	PFSYNC_BLOCK_ASSERT(sc);

	if (sc->sc_bulk_tries++ < PFSYNC_MAX_BULKTRIES) {
		/* Try again. */
		callout_reset(&sc->sc_bulkfail_tmo, 5 * hz,
		    pfsync_bulk_fail, V_pfsyncif);
		PFSYNC_LOCK(sc);
		pfsync_request_update(0, 0);
		PFSYNC_UNLOCK(sc);
	} else {
		/* Pretend the transfer was ok. */
		sc->sc_ureq_sent = 0;
		sc->sc_bulk_tries = 0;
		PFSYNC_LOCK(sc);
		if (!(sc->sc_flags & PFSYNCF_OK) && carp_demote_adj_p)
			(*carp_demote_adj_p)(-V_pfsync_carp_adj,
			    "pfsync bulk fail");
		sc->sc_flags |= PFSYNCF_OK;
		PFSYNC_UNLOCK(sc);
		if (V_pf_status.debug >= PF_DEBUG_MISC)
			printf("pfsync: failed to receive bulk update\n");
	}

	CURVNET_RESTORE();
}

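/*
 * Attach an extra, preformatted chunk ("plus" data) to the pending
 * packet and transmit it immediately, flushing first if the chunk
 * would not fit within the MTU.  Callers keep the chunk on their
 * stack and hold the pfsync lock across the call, e.g.:
 *
 *	PFSYNC_LOCK(sc);
 *	pfsync_send_plus(&r, sizeof(r));
 *	PFSYNC_UNLOCK(sc);
 */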
static void
pfsync_send_plus(void *plus, size_t pluslen)
{
	struct pfsync_softc *sc = V_pfsyncif;

	PFSYNC_LOCK_ASSERT(sc);

	if (sc->sc_len + pluslen > sc->sc_ifp->if_mtu)
		pfsync_sendout(1);

	sc->sc_plus = plus;
	sc->sc_len += (sc->sc_pluslen = pluslen);

	pfsync_sendout(1);
}

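/* The sc_tmo callout: request transmission of whatever is pending. */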
static void
pfsync_timeout(void *arg)
{
	struct pfsync_softc *sc = arg;

	CURVNET_SET(sc->sc_ifp->if_vnet);
	PFSYNC_LOCK(sc);
	pfsync_push(sc);
	PFSYNC_UNLOCK(sc);
	CURVNET_RESTORE();
}

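/*
 * Ask the software interrupt handler to transmit the pending packet.
 */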
static void
pfsync_push(struct pfsync_softc *sc)
{

	PFSYNC_LOCK_ASSERT(sc);

	sc->sc_flags |= PFSYNCF_PUSH;
	swi_sched(V_pfsync_swi_cookie, 0);
}

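/*
 * Software interrupt handler.  Build the pending pfsync packet if a
 * push was requested, then drain if_snd: deferred original packets
 * (marked M_SKIP_FIREWALL) go out as ordinary IP, while our own
 * packets are sent to the multicast group described by sc_imo.
 */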
static void
pfsyncintr(void *arg)
{
	struct pfsync_softc *sc = arg;
	struct mbuf *m, *n;

	CURVNET_SET(sc->sc_ifp->if_vnet);

	PFSYNC_LOCK(sc);
	if ((sc->sc_flags & PFSYNCF_PUSH) && sc->sc_len > PFSYNC_MINPKT) {
		pfsync_sendout(0);
		sc->sc_flags &= ~PFSYNCF_PUSH;
	}
	_IF_DEQUEUE_ALL(&sc->sc_ifp->if_snd, m);
	PFSYNC_UNLOCK(sc);

	for (; m != NULL; m = n) {

		n = m->m_nextpkt;
		m->m_nextpkt = NULL;

		/*
		 * We distinguish between a deferral packet and our
		 * own pfsync packet based on the M_SKIP_FIREWALL
		 * flag.  This is XXX.
		 */
		if (m->m_flags & M_SKIP_FIREWALL)
			ip_output(m, NULL, NULL, 0, NULL, NULL);
		else if (ip_output(m, NULL, NULL, IP_RAWOUTPUT, &sc->sc_imo,
		    NULL) == 0)
			V_pfsyncstats.pfsyncs_opackets++;
		else
			V_pfsyncstats.pfsyncs_oerrors++;
	}
	CURVNET_RESTORE();
}

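/*
 * Join the pfsync multicast group (sc_sync_peer) on the given sync
 * interface and set up sc_imo for our transmissions: default TTL,
 * no loopback of our own packets.
 */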
static int
pfsync_multicast_setup(struct pfsync_softc *sc, struct ifnet *ifp, void *mship)
{
	struct ip_moptions *imo = &sc->sc_imo;
	int error;

	if (!(ifp->if_flags & IFF_MULTICAST))
		return (EADDRNOTAVAIL);

	imo->imo_membership = (struct in_multi **)mship;
	imo->imo_max_memberships = IP_MIN_MEMBERSHIPS;
	imo->imo_multicast_vif = -1;

	if ((error = in_joingroup(ifp, &sc->sc_sync_peer, NULL,
	    &imo->imo_membership[0])) != 0) {
		imo->imo_membership = NULL;
		return (error);
	}
	imo->imo_num_memberships++;
	imo->imo_multicast_ifp = ifp;
	imo->imo_multicast_ttl = PFSYNC_DFLTTL;
	imo->imo_multicast_loop = 0;

	return (0);
}

static void
pfsync_multicast_cleanup(struct pfsync_softc *sc)
{
	struct ip_moptions *imo = &sc->sc_imo;

	in_leavegroup(imo->imo_membership[0], NULL);
	free(imo->imo_membership, M_PFSYNC);
	imo->imo_membership = NULL;
	imo->imo_multicast_ifp = NULL;
}

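/*
 * Protocol switch glue: deliver IPPROTO_PFSYNC packets to
 * pfsync_input(), reusing the raw IP code for everything else.
 */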
#ifdef INET
extern struct domain inetdomain;
static struct protosw in_pfsync_protosw = {
	.pr_type =		SOCK_RAW,
	.pr_domain =		&inetdomain,
	.pr_protocol =		IPPROTO_PFSYNC,
	.pr_flags =		PR_ATOMIC|PR_ADDR,
	.pr_input =		pfsync_input,
	.pr_output =		(pr_output_t *)rip_output,
	.pr_ctloutput =		rip_ctloutput,
	.pr_usrreqs =		&rip_usrreqs
};
#endif

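/*
 * pf(4) calls into this module only through a set of function
 * pointers.  Publish (or clear) them under the rules write lock so
 * that no pf code can be running halfway through the switch.
 */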
static void
pfsync_pointers_init(void)
{

	PF_RULES_WLOCK();
	pfsync_state_import_ptr = pfsync_state_import;
	pfsync_insert_state_ptr = pfsync_insert_state;
	pfsync_update_state_ptr = pfsync_update_state;
	pfsync_delete_state_ptr = pfsync_delete_state;
	pfsync_clear_states_ptr = pfsync_clear_states;
	pfsync_defer_ptr = pfsync_defer;
	PF_RULES_WUNLOCK();
}

static void
pfsync_pointers_uninit(void)
{

	PF_RULES_WLOCK();
	pfsync_state_import_ptr = NULL;
	pfsync_insert_state_ptr = NULL;
	pfsync_update_state_ptr = NULL;
	pfsync_delete_state_ptr = NULL;
	pfsync_clear_states_ptr = NULL;
	pfsync_defer_ptr = NULL;
	PF_RULES_WUNLOCK();
}

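/*
 * Module load: create the interface cloner and the software
 * interrupt handler in every vnet, then register the IPv4 protocol
 * and publish the pf(4) hooks.  On failure everything set up so far
 * is torn down again.
 */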
static int
pfsync_init(void)
{
	VNET_ITERATOR_DECL(vnet_iter);
	int error = 0;

	VNET_LIST_RLOCK();
	VNET_FOREACH(vnet_iter) {
		CURVNET_SET(vnet_iter);
		V_pfsync_cloner = if_clone_simple(pfsyncname,
		    pfsync_clone_create, pfsync_clone_destroy, 1);
		error = swi_add(NULL, pfsyncname, pfsyncintr, V_pfsyncif,
		    SWI_NET, INTR_MPSAFE, &V_pfsync_swi_cookie);
		CURVNET_RESTORE();
		if (error)
			goto fail_locked;
	}
	VNET_LIST_RUNLOCK();
#ifdef INET
	error = pf_proto_register(PF_INET, &in_pfsync_protosw);
	if (error)
		goto fail;
	error = ipproto_register(IPPROTO_PFSYNC);
	if (error) {
		pf_proto_unregister(PF_INET, IPPROTO_PFSYNC, SOCK_RAW);
		goto fail;
	}
#endif
	pfsync_pointers_init();

	return (0);

fail:
	VNET_LIST_RLOCK();
fail_locked:
	VNET_FOREACH(vnet_iter) {
		CURVNET_SET(vnet_iter);
		if (V_pfsync_swi_cookie != NULL)
			swi_remove(V_pfsync_swi_cookie);
		/*
		 * The clone attach may have succeeded even where swi_add()
		 * failed, so check the cloner independently of the cookie.
		 */
		if (V_pfsync_cloner != NULL)
			if_clone_detach(V_pfsync_cloner);
		CURVNET_RESTORE();
	}
	VNET_LIST_RUNLOCK();

	return (error);
}

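/*
 * Module unload, in the reverse order of pfsync_init(): unhook from
 * pf(4) first so no new calls come in, then unregister the protocol
 * and tear down the per-vnet cloners and interrupt handlers.
 */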
static void
pfsync_uninit(void)
{
	VNET_ITERATOR_DECL(vnet_iter);

	pfsync_pointers_uninit();

#ifdef INET
	ipproto_unregister(IPPROTO_PFSYNC);
	pf_proto_unregister(PF_INET, IPPROTO_PFSYNC, SOCK_RAW);
#endif
	VNET_LIST_RLOCK();
	VNET_FOREACH(vnet_iter) {
		CURVNET_SET(vnet_iter);
		if_clone_detach(V_pfsync_cloner);
		swi_remove(V_pfsync_swi_cookie);
		CURVNET_RESTORE();
	}
	VNET_LIST_RUNLOCK();
}

static int
pfsync_modevent(module_t mod, int type, void *data)
{
	int error = 0;

	switch (type) {
	case MOD_LOAD:
		error = pfsync_init();
		break;
	case MOD_QUIESCE:
		/*
		 * Module should not be unloaded due to race conditions.
		 */
		error = EBUSY;
		break;
	case MOD_UNLOAD:
		pfsync_uninit();
		break;
	default:
		error = EINVAL;
		break;
	}

	return (error);
}

static moduledata_t pfsync_mod = {
	pfsyncname,
	pfsync_modevent,
	0
};

#define PFSYNC_MODVER 1

DECLARE_MODULE(pfsync, pfsync_mod, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY);
MODULE_VERSION(pfsync, PFSYNC_MODVER);
MODULE_DEPEND(pfsync, pf, PF_MODVER, PF_MODVER, PF_MODVER);
