ip_fw_table.c revision 315532
1235251Seadler/*-
2244164Seadler * Copyright (c) 2004 Ruslan Ermilov and Vsevolod Lobko.
3270114Sse * Copyright (c) 2014 Yandex LLC
4235251Seadler * Copyright (c) 2014 Alexander V. Chernikov
5235251Seadler *
6235251Seadler * Redistribution and use in source and binary forms, with or without
7235251Seadler * modification, are permitted provided that the following conditions
8235251Seadler * are met:
9235251Seadler * 1. Redistributions of source code must retain the above copyright
10235251Seadler *    notice, this list of conditions and the following disclaimer.
11235251Seadler * 2. Redistributions in binary form must reproduce the above copyright
12270114Sse *    notice, this list of conditions and the following disclaimer in the
13235251Seadler *    documentation and/or other materials provided with the distribution.
14235251Seadler *
15270114Sse * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16235251Seadler * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17235251Seadler * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18235251Seadler * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19235251Seadler * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20235251Seadler * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21270114Sse * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22235251Seadler * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23235251Seadler * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24235251Seadler * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25235251Seadler * SUCH DAMAGE.
26270114Sse */
27235251Seadler
28235251Seadler#include <sys/cdefs.h>
29235251Seadler__FBSDID("$FreeBSD: stable/11/sys/netpfil/ipfw/ip_fw_table.c 315532 2017-03-19 07:34:19Z ae $");
30235251Seadler
31235251Seadler/*
32235251Seadler * Lookup table support for ipfw.
33235251Seadler *
34235251Seadler * This file contains handlers for all generic tables' operations:
35235251Seadler * add/del/flush entries, list/dump tables etc..
36235251Seadler *
37235251Seadler * Table data modification is protected by both UH and runtime lock
38235251Seadler * while reading configuration/data is protected by UH lock.
39270114Sse *
40270114Sse * Lookup algorithms for all table types are located in ip_fw_table_algo.c
41270114Sse */
42270114Sse
43235251Seadler#include "opt_ipfw.h"
44235251Seadler
45235251Seadler#include <sys/param.h>
46235251Seadler#include <sys/systm.h>
47235251Seadler#include <sys/malloc.h>
48235251Seadler#include <sys/kernel.h>
49235251Seadler#include <sys/lock.h>
50235251Seadler#include <sys/rwlock.h>
51270114Sse#include <sys/rmlock.h>
52235251Seadler#include <sys/socket.h>
53235251Seadler#include <sys/socketvar.h>
54235251Seadler#include <sys/queue.h>
55235251Seadler#include <net/if.h>	/* ip_fw.h requires IFNAMSIZ */
56235251Seadler
57235251Seadler#include <netinet/in.h>
58235251Seadler#include <netinet/ip_var.h>	/* struct ipfw_rule_ref */
59235251Seadler#include <netinet/ip_fw.h>
60235251Seadler
61235251Seadler#include <netpfil/ipfw/ip_fw_private.h>
62235251Seadler#include <netpfil/ipfw/ip_fw_table.h>
63235251Seadler
64235251Seadler /*
65235251Seadler * Table has the following `type` concepts:
66235251Seadler *
67235251Seadler * `no.type` represents lookup key type (addr, ifp, uid, etc..)
68235251Seadler * vmask represents bitmask of table values which are present at the moment.
69235251Seadler * Special IPFW_VTYPE_LEGACY ( (uint32_t)-1 ) represents old
70235251Seadler * single-value-for-all approach.
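 *
 * For example (illustrative only, assuming the usual IPFW_VTYPE_* flags
 * from ip_fw.h), a table whose entries only carry `skipto' values would
 * have vmask == IPFW_VTYPE_SKIPTO.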
71235251Seadler */
72235251Seadlerstruct table_config {
73235251Seadler	struct named_object	no;
74235251Seadler	uint8_t		tflags;		/* type flags */
75235251Seadler	uint8_t		locked;		/* 1 if locked from changes */
76235251Seadler	uint8_t		linked;		/* 1 if already linked */
77235251Seadler	uint8_t		ochanged;	/* used by set swapping */
78235251Seadler	uint8_t		vshared;	/* 1 if using shared value array */
79235251Seadler	uint8_t		spare[3];
80235251Seadler	uint32_t	count;		/* Number of records */
81235251Seadler	uint32_t	limit;		/* Max number of records */
82235251Seadler	uint32_t	vmask;		/* bitmask with supported values */
83235251Seadler	uint32_t	ocount;		/* used by set swapping */
84235251Seadler	uint64_t	gencnt;		/* generation count */
85235251Seadler	char		tablename[64];	/* table name */
86235251Seadler	struct table_algo	*ta;	/* Callbacks for given algo */
87235251Seadler	void		*astate;	/* algorithm state */
88235251Seadler	struct table_info	ti_copy;	/* data to put to table_info */
89235251Seadler	struct namedobj_instance	*vi;
90235251Seadler};
91235251Seadler
92235251Seadlerstatic int find_table_err(struct namedobj_instance *ni, struct tid_info *ti,
93235251Seadler    struct table_config **tc);
94235251Seadlerstatic struct table_config *find_table(struct namedobj_instance *ni,
95235251Seadler    struct tid_info *ti);
96235251Seadlerstatic struct table_config *alloc_table_config(struct ip_fw_chain *ch,
97235251Seadler    struct tid_info *ti, struct table_algo *ta, char *adata, uint8_t tflags);
98235251Seadlerstatic void free_table_config(struct namedobj_instance *ni,
99235251Seadler    struct table_config *tc);
100235251Seadlerstatic int create_table_internal(struct ip_fw_chain *ch, struct tid_info *ti,
101235251Seadler    char *aname, ipfw_xtable_info *i, uint16_t *pkidx, int ref);
102235251Seadlerstatic void link_table(struct ip_fw_chain *ch, struct table_config *tc);
103235251Seadlerstatic void unlink_table(struct ip_fw_chain *ch, struct table_config *tc);
104235251Seadlerstatic int find_ref_table(struct ip_fw_chain *ch, struct tid_info *ti,
105235251Seadler    struct tentry_info *tei, uint32_t count, int op, struct table_config **ptc);
106235251Seadler#define	OP_ADD	1
107235251Seadler#define	OP_DEL	0
108235251Seadlerstatic int export_tables(struct ip_fw_chain *ch, ipfw_obj_lheader *olh,
109235251Seadler    struct sockopt_data *sd);
110235251Seadlerstatic void export_table_info(struct ip_fw_chain *ch, struct table_config *tc,
111270114Sse    ipfw_xtable_info *i);
112235251Seadlerstatic int dump_table_tentry(void *e, void *arg);
113235251Seadlerstatic int dump_table_xentry(void *e, void *arg);
114235251Seadler
115235251Seadlerstatic int swap_tables(struct ip_fw_chain *ch, struct tid_info *a,
116235251Seadler    struct tid_info *b);
117235251Seadler
118270114Ssestatic int check_table_name(const char *name);
119270114Ssestatic int check_table_space(struct ip_fw_chain *ch, struct tableop_state *ts,
120270114Sse    struct table_config *tc, struct table_info *ti, uint32_t count);
121270114Ssestatic int destroy_table(struct ip_fw_chain *ch, struct tid_info *ti);
122270114Sse
123270114Ssestatic struct table_algo *find_table_algo(struct tables_config *tableconf,
124270114Sse    struct tid_info *ti, char *name);
125270114Sse
126270114Ssestatic void objheader_to_ti(struct _ipfw_obj_header *oh, struct tid_info *ti);
127270114Ssestatic void ntlv_to_ti(struct _ipfw_obj_ntlv *ntlv, struct tid_info *ti);
128270114Sse
129270114Sse#define	CHAIN_TO_NI(chain)	(CHAIN_TO_TCFG(chain)->namehash)
130270114Sse#define	KIDX_TO_TI(ch, k)	(&(((struct table_info *)(ch)->tablestate)[k]))
131270114Sse
132270114Sse#define	TA_BUF_SZ	128	/* On-stack buffer for add/delete state */
133270310Sse
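/*
 * Invokes the rollback callback of every in-flight table operation,
 * passing @object; each callback decides whether it is affected
 * (see rollback_add_entry() and restart_flush() below).
 */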
134270114Ssevoid
135270114Sserollback_toperation_state(struct ip_fw_chain *ch, void *object)
136270114Sse{
137270114Sse	struct tables_config *tcfg;
138270114Sse	struct op_state *os;
139270114Sse
140	tcfg = CHAIN_TO_TCFG(ch);
141	TAILQ_FOREACH(os, &tcfg->state_list, next)
142		os->func(object, os);
143}
144
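/* Publishes in-flight operation state @ts on the per-chain list. */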
145void
146add_toperation_state(struct ip_fw_chain *ch, struct tableop_state *ts)
147{
148	struct tables_config *tcfg;
149
150	tcfg = CHAIN_TO_TCFG(ch);
151	TAILQ_INSERT_HEAD(&tcfg->state_list, &ts->opstate, next);
152}
153
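/* Removes operation state @ts from the per-chain list. */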
154void
155del_toperation_state(struct ip_fw_chain *ch, struct tableop_state *ts)
156{
157	struct tables_config *tcfg;
158
159	tcfg = CHAIN_TO_TCFG(ch);
160	TAILQ_REMOVE(&tcfg->state_list, &ts->opstate, next);
161}
162
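/* Reference counting helpers for table configs. */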
163void
164tc_ref(struct table_config *tc)
165{
166
167	tc->no.refcnt++;
168}
169
170void
171tc_unref(struct table_config *tc)
172{
173
174	tc->no.refcnt--;
175}
176
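/*
 * Returns a pointer to the slot of the shared value array
 * corresponding to value index @kidx.
 */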
177static struct table_value *
178get_table_value(struct ip_fw_chain *ch, struct table_config *tc, uint32_t kidx)
179{
180	struct table_value *pval;
181
182	pval = (struct table_value *)ch->valuestate;
183
184	return (&pval[kidx]);
185}
186
187
188/*
189 * Checks if we're able to insert/update entry @tei into table
190 * w.r.t @tc limits.
191 * May alter @tei to indicate insertion error / insert
192 * options.
193 *
194 * Returns 0 if operation can be performed.
195 */
196static int
197check_table_limit(struct table_config *tc, struct tentry_info *tei)
198{
199
200	if (tc->limit == 0 || tc->count < tc->limit)
201		return (0);
202
203	if ((tei->flags & TEI_FLAGS_UPDATE) == 0) {
204		/* Notify userland on error cause */
205		tei->flags |= TEI_FLAGS_LIMIT;
206		return (EFBIG);
207	}
208
209	/*
210	 * We have UPDATE flag set.
211	 * Permit updating record (if found),
212	 * but restrict adding new one since we've
213	 * already hit the limit.
214	 */
215	tei->flags |= TEI_FLAGS_DONTADD;
216
217	return (0);
218}
219
220/*
221 * Convert algorithm callback return code into
222 * one of pre-defined states known by userland.
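 * (The TEI_FLAGS_* values set here are later translated into IPFW_TR_*
 * result codes for userland by manage_table_ent_v1().)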
223 */
224static void
225store_tei_result(struct tentry_info *tei, int op, int error, uint32_t num)
226{
227	int flag;
228
229	flag = 0;
230
231	switch (error) {
232	case 0:
233		if (op == OP_ADD && num != 0)
234			flag = TEI_FLAGS_ADDED;
235		if (op == OP_DEL)
236			flag = TEI_FLAGS_DELETED;
237		break;
238	case ENOENT:
239		flag = TEI_FLAGS_NOTFOUND;
240		break;
241	case EEXIST:
242		flag = TEI_FLAGS_EXISTS;
243		break;
244	default:
245		flag = TEI_FLAGS_ERROR;
246	}
247
248	tei->flags |= flag;
249}
250
251/*
252 * Creates and references table with default parameters.
253 * Saves table config, algo and allocated kidx into @ptc, @pta and
254 * @pkidx if non-zero.
255 * Used for table auto-creation to support old binaries.
256 *
257 * Returns 0 on success.
258 */
259static int
260create_table_compat(struct ip_fw_chain *ch, struct tid_info *ti,
261    uint16_t *pkidx)
262{
263	ipfw_xtable_info xi;
264	int error;
265
266	memset(&xi, 0, sizeof(xi));
267	/* Set default value mask for legacy clients */
268	xi.vmask = IPFW_VTYPE_LEGACY;
269
270	error = create_table_internal(ch, ti, NULL, &xi, pkidx, 1);
271	if (error != 0)
272		return (error);
273
274	return (0);
275}
276
277/*
278 * Find and reference existing table optionally
279 * creating new one.
280 *
281 * Saves found table config into @ptc.
282 * Note function may drop/acquire UH_WLOCK.
283 * Returns 0 if table was found/created and referenced
284 * or non-zero return code.
285 */
286static int
287find_ref_table(struct ip_fw_chain *ch, struct tid_info *ti,
288    struct tentry_info *tei, uint32_t count, int op,
289    struct table_config **ptc)
290{
291	struct namedobj_instance *ni;
292	struct table_config *tc;
293	uint16_t kidx;
294	int error;
295
296	IPFW_UH_WLOCK_ASSERT(ch);
297
298	ni = CHAIN_TO_NI(ch);
299	tc = NULL;
300	if ((tc = find_table(ni, ti)) != NULL) {
301		/* check table type */
302		if (tc->no.subtype != ti->type)
303			return (EINVAL);
304
305		if (tc->locked != 0)
306			return (EACCES);
307
308		/* Try to exit early on limit hit */
309		if (op == OP_ADD && count == 1 &&
310		    check_table_limit(tc, tei) != 0)
311			return (EFBIG);
312
313		/* Reference and return */
314		tc->no.refcnt++;
315		*ptc = tc;
316		return (0);
317	}
318
319	if (op == OP_DEL)
320		return (ESRCH);
321
322	/* Compatibility mode: create new table for old clients */
323	if ((tei->flags & TEI_FLAGS_COMPAT) == 0)
324		return (ESRCH);
325
326	IPFW_UH_WUNLOCK(ch);
327	error = create_table_compat(ch, ti, &kidx);
328	IPFW_UH_WLOCK(ch);
329
330	if (error != 0)
331		return (error);
332
333	tc = (struct table_config *)ipfw_objhash_lookup_kidx(ni, kidx);
334	KASSERT(tc != NULL, ("create_table_compat returned bad idx %d", kidx));
335
336	/* OK, now we've got referenced table. */
337	*ptc = tc;
338	return (0);
339}
340
341/*
342 * Rolls back the @added entries already inserted into @tc, using the
343 * state array @ta_buf_m. Assume the following layout:
344 * 1) ADD state (ta_buf_m[0] ... ta_buf_m[added - 1]) for handling update cases
345 * 2) DEL state (ta_buf_m[count] ... ta_buf_m[count + added - 1])
346 *   for storing deleted state
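 *
 * Illustrative layout, with ta->ta_buf_size bytes per slot:
 *   [ ADD 0 | ... | ADD count-1 | DEL 0 | ... | DEL count-1 ]
 *   ^ ta_buf_m                    ^ ta_buf_m + count * ta_buf_size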
347 */
348static void
349rollback_added_entries(struct ip_fw_chain *ch, struct table_config *tc,
350    struct table_info *tinfo, struct tentry_info *tei, caddr_t ta_buf_m,
351    uint32_t count, uint32_t added)
352{
353	struct table_algo *ta;
354	struct tentry_info *ptei;
355	caddr_t v, vv;
356	size_t ta_buf_sz;
357	int error, i;
358	uint32_t num;
359
360	IPFW_UH_WLOCK_ASSERT(ch);
361
362	ta = tc->ta;
363	ta_buf_sz = ta->ta_buf_size;
364	v = ta_buf_m;
365	vv = v + count * ta_buf_sz;
366	for (i = 0; i < added; i++, v += ta_buf_sz, vv += ta_buf_sz) {
367		ptei = &tei[i];
368		if ((ptei->flags & TEI_FLAGS_UPDATED) != 0) {
369
370			/*
371			 * We have old value stored by previous
372			 * call in @ptei->value. Do add once again
373			 * to restore it.
374			 */
375			error = ta->add(tc->astate, tinfo, ptei, v, &num);
376			KASSERT(error == 0, ("rollback UPDATE fail"));
377			KASSERT(num == 0, ("rollback UPDATE fail2"));
378			continue;
379		}
380
381		error = ta->prepare_del(ch, ptei, vv);
382		KASSERT(error == 0, ("pre-rollback INSERT failed"));
383		error = ta->del(tc->astate, tinfo, ptei, vv, &num);
384		KASSERT(error == 0, ("rollback INSERT failed"));
385		tc->count -= num;
386	}
387}
388
389/*
390 * Prepares add/del state for all @count entries in @tei.
391 * Uses either stack buffer (@ta_buf) or allocates a new one.
392 * Stores pointer to allocated buffer back to @ta_buf.
393 *
394 * Returns 0 on success.
395 */
396static int
397prepare_batch_buffer(struct ip_fw_chain *ch, struct table_algo *ta,
398    struct tentry_info *tei, uint32_t count, int op, caddr_t *ta_buf)
399{
400	caddr_t ta_buf_m, v;
401	size_t ta_buf_sz, sz;
402	struct tentry_info *ptei;
403	int error, i;
404
405	error = 0;
406	ta_buf_sz = ta->ta_buf_size;
407	if (count == 1) {
408		/* Single add/delete, use on-stack buffer */
409		memset(*ta_buf, 0, TA_BUF_SZ);
410		ta_buf_m = *ta_buf;
411	} else {
412
413		/*
414		 * Multiple adds/deletes, allocate larger buffer
415		 *
416		 * Note we need 2xcount buffer for add case:
417		 * we have to hold both ADD state
418		 * and DELETE state (this may be needed
419		 * if we need to roll back all changes)
420		 */
421		sz = count * ta_buf_sz;
422		ta_buf_m = malloc((op == OP_ADD) ? sz * 2 : sz, M_TEMP,
423		    M_WAITOK | M_ZERO);
424	}
425
426	v = ta_buf_m;
427	for (i = 0; i < count; i++, v += ta_buf_sz) {
428		ptei = &tei[i];
429		error = (op == OP_ADD) ?
430		    ta->prepare_add(ch, ptei, v) : ta->prepare_del(ch, ptei, v);
431
432		/*
433		 * Some syntax error (incorrect mask, or address, or
434		 * anything). Return error regardless of atomicity
435		 * settings.
436		 */
437		if (error != 0)
438			break;
439	}
440
441	*ta_buf = ta_buf_m;
442	return (error);
443}
444
445/*
446 * Flushes allocated state for each of the @count entries in @tei.
447 * Frees @ta_buf_m if it differs from stack buffer @ta_buf.
448 */
449static void
450flush_batch_buffer(struct ip_fw_chain *ch, struct table_algo *ta,
451    struct tentry_info *tei, uint32_t count, int rollback,
452    caddr_t ta_buf_m, caddr_t ta_buf)
453{
454	caddr_t v;
455	struct tentry_info *ptei;
456	size_t ta_buf_sz;
457	int i;
458
459	ta_buf_sz = ta->ta_buf_size;
460
461	/* Run cleaning callback anyway */
462	v = ta_buf_m;
463	for (i = 0; i < count; i++, v += ta_buf_sz) {
464		ptei = &tei[i];
465		ta->flush_entry(ch, ptei, v);
466		if (ptei->ptv != NULL) {
467			free(ptei->ptv, M_IPFW);
468			ptei->ptv = NULL;
469		}
470	}
471
472	/* Clean up "deleted" state in case of rollback */
473	if (rollback != 0) {
474		v = ta_buf_m + count * ta_buf_sz;
475		for (i = 0; i < count; i++, v += ta_buf_sz)
476			ta->flush_entry(ch, &tei[i], v);
477	}
478
479	if (ta_buf_m != ta_buf)
480		free(ta_buf_m, M_TEMP);
481}
482
483
484static void
485rollback_add_entry(void *object, struct op_state *_state)
486{
487	struct ip_fw_chain *ch;
488	struct tableop_state *ts;
489
490	ts = (struct tableop_state *)_state;
491
492	if (ts->tc != object && ts->ch != object)
493		return;
494
495	ch = ts->ch;
496
497	IPFW_UH_WLOCK_ASSERT(ch);
498
499	/* Call specified unlockers */
500	rollback_table_values(ts);
501
502	/* Indicate we've called */
503	ts->modified = 1;
504}
505
506/*
507 * Adds/updates one or more entries in table @ti.
508 *
509 * Function may drop/reacquire UH wlock multiple times due to
510 * items alloc, algorithm callbacks (check_space), value linkage
511 * (new values, value storage realloc), etc..
512 * Other processes like other adds (which may involve storage resize),
513 * table swaps (which changes table data and may change algo type),
514 * table modify (which may change value mask) may be executed
515 * simultaneously so we need to deal with it.
516 *
517 * The following approach was implemented:
518 * we have per-chain linked list, protected with UH lock.
519 * add_table_entry prepares a special on-stack structure which is passed
520 * to its descendants. Users add this structure to this list before unlock.
521 * After performing needed operations and acquiring UH lock back, each user
522 * checks if structure has changed. If true, it rolls local state back and
523 * returns without error to the caller.
524 * add_table_entry() on its own checks if structure has changed and restarts
525 * its operation from the beginning (goto restart).
526 *
527 * Functions which modify fields of interest (currently
528 *   resize_shared_value_storage() and swap_tables() )
529 * traverse the given list while holding the UH lock immediately before
530 * performing their operations, calling the function provided by each list
531 * entry (currently rollback_add_entry), which performs rollback for all
532 * necessary state and sets appropriate values in the structure indicating
533 * rollback has happened.
534 *
535 * Algo interaction:
536 * Function references @ti first to ensure table won't
537 * disappear or change its type.
538 * After that, prepare_add callback is called for each @tei entry.
539 * Next, we try to add each entry under UH+WLOCK
540 * using add() callback.
541 * Finally, we free all state by calling flush_entry callback
542 * for each @tei.
543 *
544 * Returns 0 on success.
545 */
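/*
 * Illustrative sketch of the restart protocol described above, simplified
 * from the function body below (which is authoritative):
 *
 *	restart:
 *		find_ref_table(...);
 *		add_toperation_state(ch, &ts);	 (publish on-stack state)
 *		IPFW_UH_WUNLOCK(ch);
 *		prepare_batch_buffer(...);	 (may sleep)
 *		IPFW_UH_WLOCK(ch);
 *		del_toperation_state(ch, &ts);
 *		if (ts.modified)		 (someone ran our rollback cb)
 *			goto restart;
 *		... link values, check space, then add under IPFW_WLOCK ...
 */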
546int
547add_table_entry(struct ip_fw_chain *ch, struct tid_info *ti,
548    struct tentry_info *tei, uint8_t flags, uint32_t count)
549{
550	struct table_config *tc;
551	struct table_algo *ta;
552	uint16_t kidx;
553	int error, first_error, i, rollback;
554	uint32_t num, numadd;
555	struct tentry_info *ptei;
556	struct tableop_state ts;
557	char ta_buf[TA_BUF_SZ];
558	caddr_t ta_buf_m, v;
559
560	memset(&ts, 0, sizeof(ts));
561	ta = NULL;
562	IPFW_UH_WLOCK(ch);
563
564	/*
565	 * Find and reference existing table.
566	 */
567restart:
568	if (ts.modified != 0) {
569		IPFW_UH_WUNLOCK(ch);
570		flush_batch_buffer(ch, ta, tei, count, rollback,
571		    ta_buf_m, ta_buf);
572		memset(&ts, 0, sizeof(ts));
573		ta = NULL;
574		IPFW_UH_WLOCK(ch);
575	}
576
577	error = find_ref_table(ch, ti, tei, count, OP_ADD, &tc);
578	if (error != 0) {
579		IPFW_UH_WUNLOCK(ch);
580		return (error);
581	}
582	ta = tc->ta;
583
584	/* Fill in tablestate */
585	ts.ch = ch;
586	ts.opstate.func = rollback_add_entry;
587	ts.tc = tc;
588	ts.vshared = tc->vshared;
589	ts.vmask = tc->vmask;
590	ts.ta = ta;
591	ts.tei = tei;
592	ts.count = count;
593	rollback = 0;
594	add_toperation_state(ch, &ts);
595	IPFW_UH_WUNLOCK(ch);
596
597	/* Allocate memory and prepare record(s) */
598	/* Pass stack buffer by default */
599	ta_buf_m = ta_buf;
600	error = prepare_batch_buffer(ch, ta, tei, count, OP_ADD, &ta_buf_m);
601
602	IPFW_UH_WLOCK(ch);
603	del_toperation_state(ch, &ts);
604	/* Drop reference we've used in first search */
605	tc->no.refcnt--;
606
607	/* Check prepare_batch_buffer() error */
608	if (error != 0)
609		goto cleanup;
610
611	/*
612	 * Check if table swap has happened.
613	 * (so table algo might be changed).
614	 * Restart operation to achieve consistent behavior.
615	 */
616	if (ts.modified != 0)
617		goto restart;
618
619	/*
620	 * Link all values to the shared/per-table value array.
621	 *
622	 * May release/reacquire UH_WLOCK.
623	 */
624	error = ipfw_link_table_values(ch, &ts);
625	if (error != 0)
626		goto cleanup;
627	if (ts.modified != 0)
628		goto restart;
629
630	/*
631	 * Ensure we are able to add all entries without additional
632	 * memory allocations. May release/reacquire UH_WLOCK.
633	 */
634	kidx = tc->no.kidx;
635	error = check_table_space(ch, &ts, tc, KIDX_TO_TI(ch, kidx), count);
636	if (error != 0)
637		goto cleanup;
638	if (ts.modified != 0)
639		goto restart;
640
641	/* We've got valid table in @tc. Let's try to add data */
642	kidx = tc->no.kidx;
643	ta = tc->ta;
644	numadd = 0;
645	first_error = 0;
646
647	IPFW_WLOCK(ch);
648
649	v = ta_buf_m;
650	for (i = 0; i < count; i++, v += ta->ta_buf_size) {
651		ptei = &tei[i];
652		num = 0;
653		/* check limit before adding */
654		if ((error = check_table_limit(tc, ptei)) == 0) {
655			error = ta->add(tc->astate, KIDX_TO_TI(ch, kidx),
656			    ptei, v, &num);
657			/* Set status flag to inform userland */
658			store_tei_result(ptei, OP_ADD, error, num);
659		}
660		if (error == 0) {
661			/* Update number of records to ease limit checking */
662			tc->count += num;
663			numadd += num;
664			continue;
665		}
666
667		if (first_error == 0)
668			first_error = error;
669
670		/*
671		 * Some error has happened. Check our atomicity
672		 * settings: continue if atomicity is not required,
673		 * roll back changes otherwise.
674		 */
675		if ((flags & IPFW_CTF_ATOMIC) == 0)
676			continue;
677
678		rollback_added_entries(ch, tc, KIDX_TO_TI(ch, kidx),
679		    tei, ta_buf_m, count, i);
680
681		rollback = 1;
682		break;
683	}
684
685	IPFW_WUNLOCK(ch);
686
687	ipfw_garbage_table_values(ch, tc, tei, count, rollback);
688
689	/* Permit post-add algorithm grow/rehash. */
690	if (numadd != 0)
691		check_table_space(ch, NULL, tc, KIDX_TO_TI(ch, kidx), 0);
692
693	/* Return first error to user, if any */
694	error = first_error;
695
696cleanup:
697	IPFW_UH_WUNLOCK(ch);
698
699	flush_batch_buffer(ch, ta, tei, count, rollback, ta_buf_m, ta_buf);
700
701	return (error);
702}
703
704/*
705 * Deletes one or more entries in table @ti.
706 *
707 * Returns 0 on success.
708 */
709int
710del_table_entry(struct ip_fw_chain *ch, struct tid_info *ti,
711    struct tentry_info *tei, uint8_t flags, uint32_t count)
712{
713	struct table_config *tc;
714	struct table_algo *ta;
715	struct tentry_info *ptei;
716	uint16_t kidx;
717	int error, first_error, i;
718	uint32_t num, numdel;
719	char ta_buf[TA_BUF_SZ];
720	caddr_t ta_buf_m, v;
721
722	/*
723	 * Find and reference existing table.
724	 */
725	IPFW_UH_WLOCK(ch);
726	error = find_ref_table(ch, ti, tei, count, OP_DEL, &tc);
727	if (error != 0) {
728		IPFW_UH_WUNLOCK(ch);
729		return (error);
730	}
731	ta = tc->ta;
732	IPFW_UH_WUNLOCK(ch);
733
734	/* Allocate memory and prepare record(s) */
735	/* Pass stack buffer by default */
736	ta_buf_m = ta_buf;
737	error = prepare_batch_buffer(ch, ta, tei, count, OP_DEL, &ta_buf_m);
738	if (error != 0)
739		goto cleanup;
740
741	IPFW_UH_WLOCK(ch);
742
743	/* Drop reference we've used in first search */
744	tc->no.refcnt--;
745
746	/*
747	 * Check if table algo is still the same.
748	 * (changed ta may be the result of table swap).
749	 */
750	if (ta != tc->ta) {
751		IPFW_UH_WUNLOCK(ch);
752		error = EINVAL;
753		goto cleanup;
754	}
755
756	kidx = tc->no.kidx;
757	numdel = 0;
758	first_error = 0;
759
760	IPFW_WLOCK(ch);
761	v = ta_buf_m;
762	for (i = 0; i < count; i++, v += ta->ta_buf_size) {
763		ptei = &tei[i];
764		num = 0;
765		error = ta->del(tc->astate, KIDX_TO_TI(ch, kidx), ptei, v,
766		    &num);
767		/* Save state for userland */
768		store_tei_result(ptei, OP_DEL, error, num);
769		if (error != 0 && first_error == 0)
770			first_error = error;
771		tc->count -= num;
772		numdel += num;
773	}
774	IPFW_WUNLOCK(ch);
775
776	/* Unlink non-used values */
777	ipfw_garbage_table_values(ch, tc, tei, count, 0);
778
779	if (numdel != 0) {
780		/* Run post-del hook to permit shrinking */
781		check_table_space(ch, NULL, tc, KIDX_TO_TI(ch, kidx), 0);
782	}
783
784	IPFW_UH_WUNLOCK(ch);
785
786	/* Return first error to user, if any */
787	error = first_error;
788
789cleanup:
790	flush_batch_buffer(ch, ta, tei, count, 0, ta_buf_m, ta_buf);
791
792	return (error);
793}
794
795/*
796 * Ensure that table @tc has enough space to add @count entries without
797 * need for reallocation.
798 *
799 * Callbacks order:
800 * 0) need_modify() (UH_WLOCK) - checks if @count items can be added w/o resize.
801 *
802 * 1) alloc_modify (no locks, M_WAITOK) - alloc new state based on @pflags.
803 * 2) prepare_modify (UH_WLOCK) - copy old data into new storage
804 * 3) modify (UH_WLOCK + WLOCK) - switch pointers
805 * 4) flush_modify (UH_WLOCK) - free state, if needed
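 *    (in the code below, steps 1-4 correspond to the ta->prepare_mod,
 *    ta->fill_mod, ta->modify and ta->flush_mod callbacks, respectively)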
806 *
807 * Returns 0 on success.
808 */
809static int
810check_table_space(struct ip_fw_chain *ch, struct tableop_state *ts,
811    struct table_config *tc, struct table_info *ti, uint32_t count)
812{
813	struct table_algo *ta;
814	uint64_t pflags;
815	char ta_buf[TA_BUF_SZ];
816	int error;
817
818	IPFW_UH_WLOCK_ASSERT(ch);
819
820	error = 0;
821	ta = tc->ta;
822	if (ta->need_modify == NULL)
823		return (0);
824
825	/* Acquire reference not to lose @tc between locks/unlocks */
826	tc->no.refcnt++;
827
828	/*
829	 * TODO: think about avoiding race between large add/large delete
830	 * operation on algorithm which implements shrinking along with
831	 * growing.
832	 */
833	while (true) {
834		pflags = 0;
835		if (ta->need_modify(tc->astate, ti, count, &pflags) == 0) {
836			error = 0;
837			break;
838		}
839
840		/* We have to shrink/grow table */
841		if (ts != NULL)
842			add_toperation_state(ch, ts);
843		IPFW_UH_WUNLOCK(ch);
844
845		memset(&ta_buf, 0, sizeof(ta_buf));
846		error = ta->prepare_mod(ta_buf, &pflags);
847
848		IPFW_UH_WLOCK(ch);
849		if (ts != NULL)
850			del_toperation_state(ch, ts);
851
852		if (error != 0)
853			break;
854
855		if (ts != NULL && ts->modified != 0) {
856
857			/*
858			 * Swap operation has happened
859			 * so we're currently operating on other
860			 * table data. Stop doing this.
861			 */
862			ta->flush_mod(ta_buf);
863			break;
864		}
865
866		/* Check if we still need to alter table */
867		ti = KIDX_TO_TI(ch, tc->no.kidx);
868		if (ta->need_modify(tc->astate, ti, count, &pflags) == 0) {
869			IPFW_UH_WUNLOCK(ch);
870
871			/*
872			 * Other thread has already performed resize.
873			 * Flush our state and return.
874			 */
875			ta->flush_mod(ta_buf);
876			break;
877		}
878
879		error = ta->fill_mod(tc->astate, ti, ta_buf, &pflags);
880		if (error == 0) {
881			/* Do actual modification */
882			IPFW_WLOCK(ch);
883			ta->modify(tc->astate, ti, ta_buf, pflags);
884			IPFW_WUNLOCK(ch);
885		}
886
887		/* Anyway, flush data and retry */
888		ta->flush_mod(ta_buf);
889	}
890
891	tc->no.refcnt--;
892	return (error);
893}
894
895/*
896 * Adds or deletes record in table.
897 * Data layout (v0):
898 * Request: [ ip_fw3_opheader ipfw_table_xentry ]
899 *
900 * Returns 0 on success
901 */
902static int
903manage_table_ent_v0(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
904    struct sockopt_data *sd)
905{
906	ipfw_table_xentry *xent;
907	struct tentry_info tei;
908	struct tid_info ti;
909	struct table_value v;
910	int error, hdrlen, read;
911
912	hdrlen = offsetof(ipfw_table_xentry, k);
913
914	/* Check minimum header size */
915	if (sd->valsize < (sizeof(*op3) + hdrlen))
916		return (EINVAL);
917
918	read = sizeof(ip_fw3_opheader);
919
920	/* Check if xentry len field is valid */
921	xent = (ipfw_table_xentry *)(op3 + 1);
922	if (xent->len < hdrlen || xent->len + read > sd->valsize)
923		return (EINVAL);
924
925	memset(&tei, 0, sizeof(tei));
926	tei.paddr = &xent->k;
927	tei.masklen = xent->masklen;
928	ipfw_import_table_value_legacy(xent->value, &v);
929	tei.pvalue = &v;
930	/* Old requests compatibility */
931	tei.flags = TEI_FLAGS_COMPAT;
932	if (xent->type == IPFW_TABLE_ADDR) {
933		if (xent->len - hdrlen == sizeof(in_addr_t))
934			tei.subtype = AF_INET;
935		else
936			tei.subtype = AF_INET6;
937	}
938
939	memset(&ti, 0, sizeof(ti));
940	ti.uidx = xent->tbl;
941	ti.type = xent->type;
942
943	error = (op3->opcode == IP_FW_TABLE_XADD) ?
944	    add_table_entry(ch, &ti, &tei, 0, 1) :
945	    del_table_entry(ch, &ti, &tei, 0, 1);
946
947	return (error);
948}
949
950/*
951 * Adds or deletes record in table.
952 * Data layout (v1)(current):
953 * Request: [ ipfw_obj_header
954 *   ipfw_obj_ctlv(IPFW_TLV_TBLENT_LIST) [ ipfw_obj_tentry x N ]
955 * ]
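 *
 * Hence the total request size checked below must be exactly
 *   sizeof(ipfw_obj_header) + sizeof(ipfw_obj_ctlv) +
 *   ctlv->count * sizeof(ipfw_obj_tentry).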
956 *
957 * Returns 0 on success
958 */
959static int
960manage_table_ent_v1(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
961    struct sockopt_data *sd)
962{
963	ipfw_obj_tentry *tent, *ptent;
964	ipfw_obj_ctlv *ctlv;
965	ipfw_obj_header *oh;
966	struct tentry_info *ptei, tei, *tei_buf;
967	struct tid_info ti;
968	int error, i, kidx, read;
969
970	/* Check minimum header size */
971	if (sd->valsize < (sizeof(*oh) + sizeof(*ctlv)))
972		return (EINVAL);
973
974	/* Check if passed data is too long */
975	if (sd->valsize != sd->kavail)
976		return (EINVAL);
977
978	oh = (ipfw_obj_header *)sd->kbuf;
979
980	/* Basic length checks for TLVs */
981	if (oh->ntlv.head.length != sizeof(oh->ntlv))
982		return (EINVAL);
983
984	read = sizeof(*oh);
985
986	ctlv = (ipfw_obj_ctlv *)(oh + 1);
987	if (ctlv->head.length + read != sd->valsize)
988		return (EINVAL);
989
990	read += sizeof(*ctlv);
991	tent = (ipfw_obj_tentry *)(ctlv + 1);
992	if (ctlv->count * sizeof(*tent) + read != sd->valsize)
993		return (EINVAL);
994
995	if (ctlv->count == 0)
996		return (0);
997
998	/*
999	 * Mark entire buffer as "read".
1000	 * This instructs the sopt API to write it back
1001	 * after the function returns.
1002	 */
1003	ipfw_get_sopt_header(sd, sd->valsize);
1004
1005	/* Perform basic checks for each entry */
1006	ptent = tent;
1007	kidx = tent->idx;
1008	for (i = 0; i < ctlv->count; i++, ptent++) {
1009		if (ptent->head.length != sizeof(*ptent))
1010			return (EINVAL);
1011		if (ptent->idx != kidx)
1012			return (ENOTSUP);
1013	}
1014
1015	/* Convert data into kernel request objects */
1016	objheader_to_ti(oh, &ti);
1017	ti.type = oh->ntlv.type;
1018	ti.uidx = kidx;
1019
1020	/* Use on-stack buffer for single add/del */
1021	if (ctlv->count == 1) {
1022		memset(&tei, 0, sizeof(tei));
1023		tei_buf = &tei;
1024	} else
1025		tei_buf = malloc(ctlv->count * sizeof(tei), M_TEMP,
1026		    M_WAITOK | M_ZERO);
1027
1028	ptei = tei_buf;
1029	ptent = tent;
1030	for (i = 0; i < ctlv->count; i++, ptent++, ptei++) {
1031		ptei->paddr = &ptent->k;
1032		ptei->subtype = ptent->subtype;
1033		ptei->masklen = ptent->masklen;
1034		if (ptent->head.flags & IPFW_TF_UPDATE)
1035			ptei->flags |= TEI_FLAGS_UPDATE;
1036
1037		ipfw_import_table_value_v1(&ptent->v.value);
1038		ptei->pvalue = (struct table_value *)&ptent->v.value;
1039	}
1040
1041	error = (oh->opheader.opcode == IP_FW_TABLE_XADD) ?
1042	    add_table_entry(ch, &ti, tei_buf, ctlv->flags, ctlv->count) :
1043	    del_table_entry(ch, &ti, tei_buf, ctlv->flags, ctlv->count);
1044
1045	/* Translate result back to userland */
1046	ptei = tei_buf;
1047	ptent = tent;
1048	for (i = 0; i < ctlv->count; i++, ptent++, ptei++) {
1049		if (ptei->flags & TEI_FLAGS_ADDED)
1050			ptent->result = IPFW_TR_ADDED;
1051		else if (ptei->flags & TEI_FLAGS_DELETED)
1052			ptent->result = IPFW_TR_DELETED;
1053		else if (ptei->flags & TEI_FLAGS_UPDATED)
1054			ptent->result = IPFW_TR_UPDATED;
1055		else if (ptei->flags & TEI_FLAGS_LIMIT)
1056			ptent->result = IPFW_TR_LIMIT;
1057		else if (ptei->flags & TEI_FLAGS_ERROR)
1058			ptent->result = IPFW_TR_ERROR;
1059		else if (ptei->flags & TEI_FLAGS_NOTFOUND)
1060			ptent->result = IPFW_TR_NOTFOUND;
1061		else if (ptei->flags & TEI_FLAGS_EXISTS)
1062			ptent->result = IPFW_TR_EXISTS;
1063		ipfw_export_table_value_v1(ptei->pvalue, &ptent->v.value);
1064	}
1065
1066	if (tei_buf != &tei)
1067		free(tei_buf, M_TEMP);
1068
1069	return (error);
1070}
1071
1072/*
1073 * Looks up an entry in given table.
1074 * Data layout (v0)(current):
1075 * Request: [ ipfw_obj_header ipfw_obj_tentry ]
1076 * Reply: [ ipfw_obj_header ipfw_obj_tentry ]
1077 *
1078 * Returns 0 on success
1079 */
1080static int
1081find_table_entry(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
1082    struct sockopt_data *sd)
1083{
1084	ipfw_obj_tentry *tent;
1085	ipfw_obj_header *oh;
1086	struct tid_info ti;
1087	struct table_config *tc;
1088	struct table_algo *ta;
1089	struct table_info *kti;
1090	struct table_value *pval;
1091	struct namedobj_instance *ni;
1092	int error;
1093	size_t sz;
1094
1095	/* Check minimum header size */
1096	sz = sizeof(*oh) + sizeof(*tent);
1097	if (sd->valsize != sz)
1098		return (EINVAL);
1099
1100	oh = (struct _ipfw_obj_header *)ipfw_get_sopt_header(sd, sz);
1101	tent = (ipfw_obj_tentry *)(oh + 1);
1102
1103	/* Basic length checks for TLVs */
1104	if (oh->ntlv.head.length != sizeof(oh->ntlv))
1105		return (EINVAL);
1106
1107	objheader_to_ti(oh, &ti);
1108	ti.type = oh->ntlv.type;
1109	ti.uidx = tent->idx;
1110
1111	IPFW_UH_RLOCK(ch);
1112	ni = CHAIN_TO_NI(ch);
1113
1114	/*
1115	 * Find existing table and check its type.
1116	 */
1117	ta = NULL;
1118	if ((tc = find_table(ni, &ti)) == NULL) {
1119		IPFW_UH_RUNLOCK(ch);
1120		return (ESRCH);
1121	}
1122
1123	/* check table type */
1124	if (tc->no.subtype != ti.type) {
1125		IPFW_UH_RUNLOCK(ch);
1126		return (EINVAL);
1127	}
1128
1129	kti = KIDX_TO_TI(ch, tc->no.kidx);
1130	ta = tc->ta;
1131
1132	if (ta->find_tentry == NULL)
1133		return (ENOTSUP);
1134
1135	error = ta->find_tentry(tc->astate, kti, tent);
1136	if (error == 0) {
1137		pval = get_table_value(ch, tc, tent->v.kidx);
1138		ipfw_export_table_value_v1(pval, &tent->v.value);
1139	}
1140	IPFW_UH_RUNLOCK(ch);
1141
1142	return (error);
1143}
1144
1145/*
1146 * Flushes all entries or destroys given table.
1147 * Data layout (v0)(current):
1148 * Request: [ ipfw_obj_header ]
1149 *
1150 * Returns 0 on success
1151 */
1152static int
1153flush_table_v0(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
1154    struct sockopt_data *sd)
1155{
1156	int error;
1157	struct _ipfw_obj_header *oh;
1158	struct tid_info ti;
1159
1160	if (sd->valsize != sizeof(*oh))
1161		return (EINVAL);
1162
1163	oh = (struct _ipfw_obj_header *)op3;
1164	objheader_to_ti(oh, &ti);
1165
1166	if (op3->opcode == IP_FW_TABLE_XDESTROY)
1167		error = destroy_table(ch, &ti);
1168	else if (op3->opcode == IP_FW_TABLE_XFLUSH)
1169		error = flush_table(ch, &ti);
1170	else
1171		return (ENOTSUP);
1172
1173	return (error);
1174}
1175
1176static void
1177restart_flush(void *object, struct op_state *_state)
1178{
1179	struct tableop_state *ts;
1180
1181	ts = (struct tableop_state *)_state;
1182
1183	if (ts->tc != object)
1184		return;
1185
1186	/* Indicate we've called */
1187	ts->modified = 1;
1188}
1189
1190/*
1191 * Flushes given table.
1192 *
1193 * The function creates a new table instance with the same
1194 * parameters, swaps it with the old one and
1195 * flushes state without holding the runtime WLOCK.
1196 *
1197 * Returns 0 on success.
1198 */
1199int
1200flush_table(struct ip_fw_chain *ch, struct tid_info *ti)
1201{
1202	struct namedobj_instance *ni;
1203	struct table_config *tc;
1204	struct table_algo *ta;
1205	struct table_info ti_old, ti_new, *tablestate;
1206	void *astate_old, *astate_new;
1207	char algostate[64], *pstate;
1208	struct tableop_state ts;
1209	int error, need_gc;
1210	uint16_t kidx;
1211	uint8_t tflags;
1212
1213	/*
1214	 * Stage 1: save table algorithm.
1215	 * Reference found table to ensure it won't disappear.
1216	 */
1217	IPFW_UH_WLOCK(ch);
1218	ni = CHAIN_TO_NI(ch);
1219	if ((tc = find_table(ni, ti)) == NULL) {
1220		IPFW_UH_WUNLOCK(ch);
1221		return (ESRCH);
1222	}
1223	need_gc = 0;
1224	astate_new = NULL;
1225	memset(&ti_new, 0, sizeof(ti_new));
1226restart:
1227	/* Set up swap handler */
1228	memset(&ts, 0, sizeof(ts));
1229	ts.opstate.func = restart_flush;
1230	ts.tc = tc;
1231
1232	ta = tc->ta;
1233	/* Do not flush readonly tables */
1234	if ((ta->flags & TA_FLAG_READONLY) != 0) {
1235		IPFW_UH_WUNLOCK(ch);
1236		return (EACCES);
1237	}
1238	/* Save startup algo parameters */
1239	if (ta->print_config != NULL) {
1240		ta->print_config(tc->astate, KIDX_TO_TI(ch, tc->no.kidx),
1241		    algostate, sizeof(algostate));
1242		pstate = algostate;
1243	} else
1244		pstate = NULL;
1245	tflags = tc->tflags;
1246	tc->no.refcnt++;
1247	add_toperation_state(ch, &ts);
1248	IPFW_UH_WUNLOCK(ch);
1249
1250	/*
1251	 * Stage 1.5: if this is not the first attempt, destroy previous state
1252	 */
1253	if (need_gc != 0) {
1254		ta->destroy(astate_new, &ti_new);
1255		need_gc = 0;
1256	}
1257
1258	/*
1259	 * Stage 2: allocate new table instance using same algo.
1260	 */
1261	memset(&ti_new, 0, sizeof(struct table_info));
1262	error = ta->init(ch, &astate_new, &ti_new, pstate, tflags);
1263
1264	/*
1265	 * Stage 3: swap old state pointers with newly-allocated ones.
1266	 * Decrease refcount.
1267	 */
1268	IPFW_UH_WLOCK(ch);
1269	tc->no.refcnt--;
1270	del_toperation_state(ch, &ts);
1271
1272	if (error != 0) {
1273		IPFW_UH_WUNLOCK(ch);
1274		return (error);
1275	}
1276
1277	/*
1278	 * Restart operation if table swap has happened:
1279	 * even if algo may be the same, algo init parameters
1280	 * may change. Restart operation instead of doing
1281	 * complex checks.
1282	 */
1283	if (ts.modified != 0) {
1284		/* Delay destroying data since we're holding UH lock */
1285		need_gc = 1;
1286		goto restart;
1287	}
1288
1289	ni = CHAIN_TO_NI(ch);
1290	kidx = tc->no.kidx;
1291	tablestate = (struct table_info *)ch->tablestate;
1292
1293	IPFW_WLOCK(ch);
1294	ti_old = tablestate[kidx];
1295	tablestate[kidx] = ti_new;
1296	IPFW_WUNLOCK(ch);
1297
1298	astate_old = tc->astate;
1299	tc->astate = astate_new;
1300	tc->ti_copy = ti_new;
1301	tc->count = 0;
1302
1303	/* Notify algo on real @ti address */
1304	if (ta->change_ti != NULL)
1305		ta->change_ti(tc->astate, &tablestate[kidx]);
1306
1307	/*
1308	 * Stage 4: unref values.
1309	 */
1310	ipfw_unref_table_values(ch, tc, ta, astate_old, &ti_old);
1311	IPFW_UH_WUNLOCK(ch);
1312
1313	/*
1314	 * Stage 5: perform real flush/destroy.
1315	 */
1316	ta->destroy(astate_old, &ti_old);
1317
1318	return (0);
1319}
1320
1321/*
1322 * Swaps two tables.
1323 * Data layout (v0)(current):
1324 * Request: [ ipfw_obj_header ipfw_obj_ntlv ]
1325 *
1326 * Returns 0 on success
1327 */
1328static int
1329swap_table(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
1330    struct sockopt_data *sd)
1331{
1332	int error;
1333	struct _ipfw_obj_header *oh;
1334	struct tid_info ti_a, ti_b;
1335
1336	if (sd->valsize != sizeof(*oh) + sizeof(ipfw_obj_ntlv))
1337		return (EINVAL);
1338
1339	oh = (struct _ipfw_obj_header *)op3;
1340	ntlv_to_ti(&oh->ntlv, &ti_a);
1341	ntlv_to_ti((ipfw_obj_ntlv *)(oh + 1), &ti_b);
1342
1343	error = swap_tables(ch, &ti_a, &ti_b);
1344
1345	return (error);
1346}
1347
1348/*
1349 * Swaps two tables of the same type/valtype.
1350 *
1351 * Checks if tables are compatible and limits
1352 * permit the swap, then actually performs the swap.
1353 *
1354 * Each table consists of 2 different parts:
1355 * config:
1356 *   @tc (with name, set, kidx) and rule bindings, which is "stable".
1357 *   number of items
1358 *   table algo
1359 * runtime:
1360 *   runtime data @ti (ch->tablestate)
1361 *   runtime cache in @tc
1362 *   algo-specific data (@tc->astate)
1363 *
1364 * So we switch:
1365 *  all runtime data
1366 *   number of items
1367 *   table algo
1368 *
1369 * After that we call @ti change handler for each table.
1370 *
1371 * Note that referencing @tc won't protect tc->ta from change.
1372 * XXX: Do we need to restrict swap between locked tables?
1373 * XXX: Do we need to exchange ftype?
1374 *
1375 * Returns 0 on success.
1376 */
1377static int
1378swap_tables(struct ip_fw_chain *ch, struct tid_info *a,
1379    struct tid_info *b)
1380{
1381	struct namedobj_instance *ni;
1382	struct table_config *tc_a, *tc_b;
1383	struct table_algo *ta;
1384	struct table_info ti, *tablestate;
1385	void *astate;
1386	uint32_t count;
1387
1388	/*
1389	 * Stage 1: find both tables and ensure they are of
1390	 * the same type.
1391	 */
1392	IPFW_UH_WLOCK(ch);
1393	ni = CHAIN_TO_NI(ch);
1394	if ((tc_a = find_table(ni, a)) == NULL) {
1395		IPFW_UH_WUNLOCK(ch);
1396		return (ESRCH);
1397	}
1398	if ((tc_b = find_table(ni, b)) == NULL) {
1399		IPFW_UH_WUNLOCK(ch);
1400		return (ESRCH);
1401	}
1402
1403	/* It is very easy to swap between the same table */
1404	if (tc_a == tc_b) {
1405		IPFW_UH_WUNLOCK(ch);
1406		return (0);
1407	}
1408
1409	/* Check type and value are the same */
1410	if (tc_a->no.subtype!=tc_b->no.subtype || tc_a->tflags!=tc_b->tflags) {
1411		IPFW_UH_WUNLOCK(ch);
1412		return (EINVAL);
1413	}
1414
1415	/* Check limits before swap */
1416	if ((tc_a->limit != 0 && tc_b->count > tc_a->limit) ||
1417	    (tc_b->limit != 0 && tc_a->count > tc_b->limit)) {
1418		IPFW_UH_WUNLOCK(ch);
1419		return (EFBIG);
1420	}
1421
1422	/* Check if one of the tables is readonly */
1423	if (((tc_a->ta->flags | tc_b->ta->flags) & TA_FLAG_READONLY) != 0) {
1424		IPFW_UH_WUNLOCK(ch);
1425		return (EACCES);
1426	}
1427
1428	/* Notify we're going to swap */
1429	rollback_toperation_state(ch, tc_a);
1430	rollback_toperation_state(ch, tc_b);
1431
1432	/* Everything is fine, prepare to swap */
1433	tablestate = (struct table_info *)ch->tablestate;
1434	ti = tablestate[tc_a->no.kidx];
1435	ta = tc_a->ta;
1436	astate = tc_a->astate;
1437	count = tc_a->count;
1438
1439	IPFW_WLOCK(ch);
1440	/* a <- b */
1441	tablestate[tc_a->no.kidx] = tablestate[tc_b->no.kidx];
1442	tc_a->ta = tc_b->ta;
1443	tc_a->astate = tc_b->astate;
1444	tc_a->count = tc_b->count;
1445	/* b <- a */
1446	tablestate[tc_b->no.kidx] = ti;
1447	tc_b->ta = ta;
1448	tc_b->astate = astate;
1449	tc_b->count = count;
1450	IPFW_WUNLOCK(ch);
1451
1452	/* Ensure tc.ti copies are in sync */
1453	tc_a->ti_copy = tablestate[tc_a->no.kidx];
1454	tc_b->ti_copy = tablestate[tc_b->no.kidx];
1455
1456	/* Notify both tables on @ti change */
1457	if (tc_a->ta->change_ti != NULL)
1458		tc_a->ta->change_ti(tc_a->astate, &tablestate[tc_a->no.kidx]);
1459	if (tc_b->ta->change_ti != NULL)
1460		tc_b->ta->change_ti(tc_b->astate, &tablestate[tc_b->no.kidx]);
1461
1462	IPFW_UH_WUNLOCK(ch);
1463
1464	return (0);
1465}
1466
1467/*
1468 * Destroys table specified by @ti.
1469 * Data layout (v0)(current):
1470 * Request: [ ip_fw3_opheader ]
1471 *
1472 * Returns 0 on success
1473 */
1474static int
1475destroy_table(struct ip_fw_chain *ch, struct tid_info *ti)
1476{
1477	struct namedobj_instance *ni;
1478	struct table_config *tc;
1479
1480	IPFW_UH_WLOCK(ch);
1481
1482	ni = CHAIN_TO_NI(ch);
1483	if ((tc = find_table(ni, ti)) == NULL) {
1484		IPFW_UH_WUNLOCK(ch);
1485		return (ESRCH);
1486	}
1487
1488	/* Do not permit destroying referenced tables */
1489	if (tc->no.refcnt > 0) {
1490		IPFW_UH_WUNLOCK(ch);
1491		return (EBUSY);
1492	}
1493
1494	IPFW_WLOCK(ch);
1495	unlink_table(ch, tc);
1496	IPFW_WUNLOCK(ch);
1497
1498	/* Free obj index */
1499	if (ipfw_objhash_free_idx(ni, tc->no.kidx) != 0)
1500		printf("Error unlinking kidx %d from table %s\n",
1501		    tc->no.kidx, tc->tablename);
1502
1503	/* Unref values used in tables while holding UH lock */
1504	ipfw_unref_table_values(ch, tc, tc->ta, tc->astate, &tc->ti_copy);
1505	IPFW_UH_WUNLOCK(ch);
1506
1507	free_table_config(ni, tc);
1508
1509	return (0);
1510}
1511
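/*
 * Rounds @v up to the nearest power of 2, e.g. roundup2p(1000) == 1024,
 * roundup2p(128) == 128.
 */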
1512static uint32_t
1513roundup2p(uint32_t v)
1514{
1515
1516	v--;
1517	v |= v >> 1;
1518	v |= v >> 2;
1519	v |= v >> 4;
1520	v |= v >> 8;
1521	v |= v >> 16;
1522	v++;
1523
1524	return (v);
1525}
1526
1527/*
1528 * Grow tables index.
1529 *
1530 * Returns 0 on success.
1531 */
1532int
1533ipfw_resize_tables(struct ip_fw_chain *ch, unsigned int ntables)
1534{
1535	unsigned int ntables_old, tbl;
1536	struct namedobj_instance *ni;
1537	void *new_idx, *old_tablestate, *tablestate;
1538	struct table_info *ti;
1539	struct table_config *tc;
1540	int i, new_blocks;
1541
1542	/* Check new value for validity */
1543	if (ntables == 0)
1544		return (EINVAL);
1545	if (ntables > IPFW_TABLES_MAX)
1546		ntables = IPFW_TABLES_MAX;
1547	/* Align to nearest power of 2 */
1548	ntables = (unsigned int)roundup2p(ntables);
1549
1550	/* Allocate new pointers */
1551	tablestate = malloc(ntables * sizeof(struct table_info),
1552	    M_IPFW, M_WAITOK | M_ZERO);
1553
1554	ipfw_objhash_bitmap_alloc(ntables, (void *)&new_idx, &new_blocks);
1555
1556	IPFW_UH_WLOCK(ch);
1557
1558	tbl = (ntables >= V_fw_tables_max) ? V_fw_tables_max : ntables;
1559	ni = CHAIN_TO_NI(ch);
1560
1561	/* Temporarily restrict decreasing max_tables */
1562	if (ntables < V_fw_tables_max) {
1563
1564		/*
1565		 * FIXME: Check if we really can shrink
1566		 */
1567		IPFW_UH_WUNLOCK(ch);
1568		return (EINVAL);
1569	}
1570
1571	/* Copy table info/indices */
1572	memcpy(tablestate, ch->tablestate, sizeof(struct table_info) * tbl);
1573	ipfw_objhash_bitmap_merge(ni, &new_idx, &new_blocks);
1574
1575	IPFW_WLOCK(ch);
1576
1577	/* Change pointers */
1578	old_tablestate = ch->tablestate;
1579	ch->tablestate = tablestate;
1580	ipfw_objhash_bitmap_swap(ni, &new_idx, &new_blocks);
1581
1582	ntables_old = V_fw_tables_max;
1583	V_fw_tables_max = ntables;
1584
1585	IPFW_WUNLOCK(ch);
1586
1587	/* Notify all consumers that their @ti pointer has changed */
1588	ti = (struct table_info *)ch->tablestate;
1589	for (i = 0; i < tbl; i++, ti++) {
1590		if (ti->lookup == NULL)
1591			continue;
1592		tc = (struct table_config *)ipfw_objhash_lookup_kidx(ni, i);
1593		if (tc == NULL || tc->ta->change_ti == NULL)
1594			continue;
1595
1596		tc->ta->change_ti(tc->astate, ti);
1597	}
1598
1599	IPFW_UH_WUNLOCK(ch);
1600
1601	/* Free old pointers */
1602	free(old_tablestate, M_IPFW);
1603	ipfw_objhash_bitmap_free(new_idx, new_blocks);
1604
1605	return (0);
1606}
1607
1608/*
1609 * Looks up an arbitrary key @paddr of length @plen in table @tbl.
1610 * Stores found value in @val.
1611 *
1612 * Returns 1 if key was found.
1613 */
1614int
1615ipfw_lookup_table(struct ip_fw_chain *ch, uint16_t tbl, uint16_t plen,
1616    void *paddr, uint32_t *val)
1617{
1618	struct table_info *ti;
1619
1620	ti = KIDX_TO_TI(ch, tbl);
1621
1622	return (ti->lookup(ti, paddr, plen, val));
1623}
1624
1625/*
1626 * Info/List/dump support for tables.
1627 *
1628 */
1629
1630/*
1631 * High-level 'get' cmds sysctl handlers
1632 */
1633
1634/*
1635 * Lists all tables currently available in kernel.
1636 * Data layout (v0)(current):
1637 * Request: [ ipfw_obj_lheader ], size = ipfw_obj_lheader.size
1638 * Reply: [ ipfw_obj_lheader ipfw_xtable_info x N ]
1639 *
1640 * Returns 0 on success
1641 */
1642static int
1643list_tables(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
1644    struct sockopt_data *sd)
1645{
1646	struct _ipfw_obj_lheader *olh;
1647	int error;
1648
1649	olh = (struct _ipfw_obj_lheader *)ipfw_get_sopt_header(sd,sizeof(*olh));
1650	if (olh == NULL)
1651		return (EINVAL);
1652	if (sd->valsize < olh->size)
1653		return (EINVAL);
1654
1655	IPFW_UH_RLOCK(ch);
1656	error = export_tables(ch, olh, sd);
1657	IPFW_UH_RUNLOCK(ch);
1658
1659	return (error);
1660}
1661
1662/*
1663 * Store table info to buffer provided by @sd.
1664 * Data layout (v0)(current):
1665 * Request: [ ipfw_obj_header ipfw_xtable_info(empty)]
1666 * Reply: [ ipfw_obj_header ipfw_xtable_info ]
1667 *
1668 * Returns 0 on success.
1669 */
1670static int
1671describe_table(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
1672    struct sockopt_data *sd)
1673{
1674	struct _ipfw_obj_header *oh;
1675	struct table_config *tc;
1676	struct tid_info ti;
1677	size_t sz;
1678
1679	sz = sizeof(*oh) + sizeof(ipfw_xtable_info);
1680	oh = (struct _ipfw_obj_header *)ipfw_get_sopt_header(sd, sz);
1681	if (oh == NULL)
1682		return (EINVAL);
1683
1684	objheader_to_ti(oh, &ti);
1685
1686	IPFW_UH_RLOCK(ch);
1687	if ((tc = find_table(CHAIN_TO_NI(ch), &ti)) == NULL) {
1688		IPFW_UH_RUNLOCK(ch);
1689		return (ESRCH);
1690	}
1691
1692	export_table_info(ch, tc, (ipfw_xtable_info *)(oh + 1));
1693	IPFW_UH_RUNLOCK(ch);
1694
1695	return (0);
1696}
1697
1698/*
1699 * Modifies existing table.
1700 * Data layout (v0)(current):
1701 * Request: [ ipfw_obj_header ipfw_xtable_info ]
1702 *
1703 * Returns 0 on success
1704 */
1705static int
1706modify_table(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
1707    struct sockopt_data *sd)
1708{
1709	struct _ipfw_obj_header *oh;
1710	ipfw_xtable_info *i;
1711	char *tname;
1712	struct tid_info ti;
1713	struct namedobj_instance *ni;
1714	struct table_config *tc;
1715
1716	if (sd->valsize != sizeof(*oh) + sizeof(ipfw_xtable_info))
1717		return (EINVAL);
1718
1719	oh = (struct _ipfw_obj_header *)sd->kbuf;
1720	i = (ipfw_xtable_info *)(oh + 1);
1721
1722	/*
1723	 * Verify user-supplied strings.
1724	 * Check for null-terminated/zero-length strings.
1725	 */
1726	tname = oh->ntlv.name;
1727	if (check_table_name(tname) != 0)
1728		return (EINVAL);
1729
1730	objheader_to_ti(oh, &ti);
1731	ti.type = i->type;
1732
1733	IPFW_UH_WLOCK(ch);
1734	ni = CHAIN_TO_NI(ch);
1735	if ((tc = find_table(ni, &ti)) == NULL) {
1736		IPFW_UH_WUNLOCK(ch);
1737		return (ESRCH);
1738	}
1739
1740	/* Do not support any modifications for readonly tables */
1741	if ((tc->ta->flags & TA_FLAG_READONLY) != 0) {
1742		IPFW_UH_WUNLOCK(ch);
1743		return (EACCES);
1744	}
1745
1746	if ((i->mflags & IPFW_TMFLAGS_LIMIT) != 0)
1747		tc->limit = i->limit;
1748	if ((i->mflags & IPFW_TMFLAGS_LOCK) != 0)
1749		tc->locked = ((i->flags & IPFW_TGFLAGS_LOCKED) != 0);
1750	IPFW_UH_WUNLOCK(ch);
1751
1752	return (0);
1753}
1754
1755/*
1756 * Creates new table.
1757 * Data layout (v0)(current):
1758 * Request: [ ipfw_obj_header ipfw_xtable_info ]
1759 *
1760 * Returns 0 on success
1761 */
1762static int
1763create_table(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
1764    struct sockopt_data *sd)
1765{
1766	struct _ipfw_obj_header *oh;
1767	ipfw_xtable_info *i;
1768	char *tname, *aname;
1769	struct tid_info ti;
1770	struct namedobj_instance *ni;
1771
1772	if (sd->valsize != sizeof(*oh) + sizeof(ipfw_xtable_info))
1773		return (EINVAL);
1774
1775	oh = (struct _ipfw_obj_header *)sd->kbuf;
1776	i = (ipfw_xtable_info *)(oh + 1);
1777
1778	/*
1779	 * Verify user-supplied strings.
1780	 * Check for null-terminated/zero-length strings.
1781	 */
1782	tname = oh->ntlv.name;
1783	aname = i->algoname;
1784	if (check_table_name(tname) != 0 ||
1785	    strnlen(aname, sizeof(i->algoname)) == sizeof(i->algoname))
1786		return (EINVAL);
1787
1788	if (aname[0] == '\0') {
1789		/* Use default algorithm */
1790		aname = NULL;
1791	}
1792
1793	objheader_to_ti(oh, &ti);
1794	ti.type = i->type;
1795
1796	ni = CHAIN_TO_NI(ch);
1797
1798	IPFW_UH_RLOCK(ch);
1799	if (find_table(ni, &ti) != NULL) {
1800		IPFW_UH_RUNLOCK(ch);
1801		return (EEXIST);
1802	}
1803	IPFW_UH_RUNLOCK(ch);
1804
1805	return (create_table_internal(ch, &ti, aname, i, NULL, 0));
1806}
1807
1808/*
1809 * Creates new table based on @ti and @aname.
1810 *
1811 * Assume @aname to be checked and valid.
1812 * Stores allocated table kidx inside @pkidx (if non-NULL).
1813 * Reference created table if @compat is non-zero.
1814 *
1815 * Returns 0 on success.
1816 */
1817static int
1818create_table_internal(struct ip_fw_chain *ch, struct tid_info *ti,
1819    char *aname, ipfw_xtable_info *i, uint16_t *pkidx, int compat)
1820{
1821	struct namedobj_instance *ni;
1822	struct table_config *tc, *tc_new, *tmp;
1823	struct table_algo *ta;
1824	uint16_t kidx;
1825
1826	ni = CHAIN_TO_NI(ch);
1827
1828	ta = find_table_algo(CHAIN_TO_TCFG(ch), ti, aname);
1829	if (ta == NULL)
1830		return (ENOTSUP);
1831
1832	tc = alloc_table_config(ch, ti, ta, aname, i->tflags);
1833	if (tc == NULL)
1834		return (ENOMEM);
1835
1836	tc->vmask = i->vmask;
1837	tc->limit = i->limit;
1838	if (ta->flags & TA_FLAG_READONLY)
1839		tc->locked = 1;
1840	else
1841		tc->locked = (i->flags & IPFW_TGFLAGS_LOCKED) != 0;
1842
1843	IPFW_UH_WLOCK(ch);
1844
1845	/* Check if table has been already created */
1846	tc_new = find_table(ni, ti);
1847	if (tc_new != NULL) {
1848
1849		/*
1850		 * Compat: do not fail if we're
1851		 * requesting to create existing table
1852		 * which has the same type
1853		 */
1854		if (compat == 0 || tc_new->no.subtype != tc->no.subtype) {
1855			IPFW_UH_WUNLOCK(ch);
1856			free_table_config(ni, tc);
1857			return (EEXIST);
1858		}
1859
1860		/* Exchange tc and tc_new for proper refcounting & freeing */
1861		tmp = tc;
1862		tc = tc_new;
1863		tc_new = tmp;
1864	} else {
1865		/* New table */
1866		if (ipfw_objhash_alloc_idx(ni, &kidx) != 0) {
1867			IPFW_UH_WUNLOCK(ch);
1868			printf("Unable to allocate table index."
1869			    " Consider increasing net.inet.ip.fw.tables_max");
1870			free_table_config(ni, tc);
1871			return (EBUSY);
1872		}
1873		tc->no.kidx = kidx;
1874		tc->no.etlv = IPFW_TLV_TBL_NAME;
1875
1876		IPFW_WLOCK(ch);
1877		link_table(ch, tc);
1878		IPFW_WUNLOCK(ch);
1879	}
1880
1881	if (compat != 0)
1882		tc->no.refcnt++;
1883	if (pkidx != NULL)
1884		*pkidx = tc->no.kidx;
1885
1886	IPFW_UH_WUNLOCK(ch);
1887
1888	if (tc_new != NULL)
1889		free_table_config(ni, tc_new);
1890
1891	return (0);
1892}
1893
1894static void
1895ntlv_to_ti(ipfw_obj_ntlv *ntlv, struct tid_info *ti)
1896{
1897
1898	memset(ti, 0, sizeof(struct tid_info));
1899	ti->set = ntlv->set;
1900	ti->uidx = ntlv->idx;
1901	ti->tlvs = ntlv;
1902	ti->tlen = ntlv->head.length;
1903}
1904
1905static void
1906objheader_to_ti(struct _ipfw_obj_header *oh, struct tid_info *ti)
1907{
1908
1909	ntlv_to_ti(&oh->ntlv, ti);
1910}
1911
1912struct namedobj_instance *
1913ipfw_get_table_objhash(struct ip_fw_chain *ch)
1914{
1915
1916	return (CHAIN_TO_NI(ch));
1917}
1918
1919/*
1920 * Exports basic table info as name TLV.
1921 * Used inside dump_static_rules() to provide info
1922 * about all tables referenced by current ruleset.
1923 *
1924 * Returns 0 on success.
1925 */
1926int
1927ipfw_export_table_ntlv(struct ip_fw_chain *ch, uint16_t kidx,
1928    struct sockopt_data *sd)
1929{
1930	struct namedobj_instance *ni;
1931	struct named_object *no;
1932	ipfw_obj_ntlv *ntlv;
1933
1934	ni = CHAIN_TO_NI(ch);
1935
1936	no = ipfw_objhash_lookup_kidx(ni, kidx);
1937	KASSERT(no != NULL, ("invalid table kidx passed"));
1938
1939	ntlv = (ipfw_obj_ntlv *)ipfw_get_sopt_space(sd, sizeof(*ntlv));
1940	if (ntlv == NULL)
1941		return (ENOMEM);
1942
1943	ntlv->head.type = IPFW_TLV_TBL_NAME;
1944	ntlv->head.length = sizeof(*ntlv);
1945	ntlv->idx = no->kidx;
1946	strlcpy(ntlv->name, no->name, sizeof(ntlv->name));
1947
1948	return (0);
1949}
1950
1951struct dump_args {
1952	struct ip_fw_chain *ch;
1953	struct table_info *ti;
1954	struct table_config *tc;
1955	struct sockopt_data *sd;
1956	uint32_t cnt;
1957	uint16_t uidx;
1958	int error;
1959	uint32_t size;
1960	ipfw_table_entry *ent;
1961	ta_foreach_f *f;
1962	void *farg;
1963	ipfw_obj_tentry tent;
1964};
1965
1966static int
1967count_ext_entries(void *e, void *arg)
1968{
1969	struct dump_args *da;
1970
1971	da = (struct dump_args *)arg;
1972	da->cnt++;
1973
1974	return (0);
1975}
1976
1977/*
1978 * Gets number of items from table either using
1979 * internal counter or calling algo callback for
1980 * externally-managed tables.
1981 *
1982 * Returns number of records.
1983 */
1984static uint32_t
1985table_get_count(struct ip_fw_chain *ch, struct table_config *tc)
1986{
1987	struct table_info *ti;
1988	struct table_algo *ta;
1989	struct dump_args da;
1990
1991	ti = KIDX_TO_TI(ch, tc->no.kidx);
1992	ta = tc->ta;
1993
1994	/* Use internal counter for self-managed tables */
1995	if ((ta->flags & TA_FLAG_READONLY) == 0)
1996		return (tc->count);
1997
1998	/* Use callback to quickly get number of items */
1999	if ((ta->flags & TA_FLAG_EXTCOUNTER) != 0)
2000		return (ta->get_count(tc->astate, ti));
2001
2002	/* Count number of items ourselves */
2003	memset(&da, 0, sizeof(da));
2004	ta->foreach(tc->astate, ti, count_ext_entries, &da);
2005
2006	return (da.cnt);
2007}
2008
2009/*
2010 * Exports table @tc info into standard ipfw_xtable_info format.
2011 */
2012static void
2013export_table_info(struct ip_fw_chain *ch, struct table_config *tc,
2014    ipfw_xtable_info *i)
2015{
2016	struct table_info *ti;
2017	struct table_algo *ta;
2018
2019	i->type = tc->no.subtype;
2020	i->tflags = tc->tflags;
2021	i->vmask = tc->vmask;
2022	i->set = tc->no.set;
2023	i->kidx = tc->no.kidx;
2024	i->refcnt = tc->no.refcnt;
2025	i->count = table_get_count(ch, tc);
2026	i->limit = tc->limit;
2027	i->flags |= (tc->locked != 0) ? IPFW_TGFLAGS_LOCKED : 0;
2028	i->size = i->count * sizeof(ipfw_obj_tentry);
2029	i->size += sizeof(ipfw_obj_header) + sizeof(ipfw_xtable_info);
2030	strlcpy(i->tablename, tc->tablename, sizeof(i->tablename));
2031	ti = KIDX_TO_TI(ch, tc->no.kidx);
2032	ta = tc->ta;
2033	if (ta->print_config != NULL) {
2034		/* Use algo function to print table config to string */
2035		ta->print_config(tc->astate, ti, i->algoname,
2036		    sizeof(i->algoname));
2037	} else
2038		strlcpy(i->algoname, ta->name, sizeof(i->algoname));
2039	/* Dump algo-specific data, if possible */
2040	if (ta->dump_tinfo != NULL) {
2041		ta->dump_tinfo(tc->astate, ti, &i->ta_info);
2042		i->ta_info.flags |= IPFW_TATFLAGS_DATA;
2043	}
2044}
2045
2046struct dump_table_args {
2047	struct ip_fw_chain *ch;
2048	struct sockopt_data *sd;
2049};
2050
2051static int
2052export_table_internal(struct namedobj_instance *ni, struct named_object *no,
2053    void *arg)
2054{
2055	ipfw_xtable_info *i;
2056	struct dump_table_args *dta;
2057
2058	dta = (struct dump_table_args *)arg;
2059
2060	i = (ipfw_xtable_info *)ipfw_get_sopt_space(dta->sd, sizeof(*i));
2061	KASSERT(i != NULL, ("previously checked buffer is not enough"));
2062
2063	export_table_info(dta->ch, (struct table_config *)no, i);
2064	return (0);
2065}
2066
2067/*
2068 * Export all tables as ipfw_xtable_info structures to
2069 * storage provided by @sd.
2070 *
2071 * If supplied buffer is too small, fills in required size
2072 * and returns ENOMEM.
2073 * Returns 0 on success.
2074 */
2075static int
2076export_tables(struct ip_fw_chain *ch, ipfw_obj_lheader *olh,
2077    struct sockopt_data *sd)
2078{
2079	uint32_t size;
2080	uint32_t count;
2081	struct dump_table_args dta;
2082
2083	count = ipfw_objhash_count(CHAIN_TO_NI(ch));
2084	size = count * sizeof(ipfw_xtable_info) + sizeof(ipfw_obj_lheader);
2085
2086		/* Fill in header regardless of buffer size */
2087	olh->count = count;
2088	olh->objsize = sizeof(ipfw_xtable_info);
2089
2090	if (size > olh->size) {
2091		olh->size = size;
2092		return (ENOMEM);
2093	}
2094
2095	olh->size = size;
2096
2097	dta.ch = ch;
2098	dta.sd = sd;
2099
2100	ipfw_objhash_foreach(CHAIN_TO_NI(ch), export_table_internal, &dta);
2101
2102	return (0);
2103}
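
/*
 * Example (illustrative userland sketch, not compiled here): driving the
 * two-pass sizing protocol of export_tables() through the IP_FW3 socket
 * option.  It assumes the ipfw_obj_lheader layout from ip_fw.h (opheader,
 * size, count, objsize) and omits the usual userland includes and error
 * handling.
 *
 *	ipfw_obj_lheader req, *olh;
 *	socklen_t len;
 *	char *buf;
 *	int s;
 *
 *	s = socket(AF_INET, SOCK_RAW, IPPROTO_RAW);
 *	memset(&req, 0, sizeof(req));
 *	req.opheader.opcode = IP_FW_TABLES_XLIST;
 *	req.size = sizeof(req);
 *	len = sizeof(req);
 *	if (getsockopt(s, IPPROTO_IP, IP_FW3, &req, &len) != 0 &&
 *	    errno == ENOMEM) {
 *		// Pass 1: kernel filled req.size with the required size.
 *		len = req.size;
 *		buf = calloc(1, len);
 *		olh = (ipfw_obj_lheader *)buf;
 *		olh->opheader.opcode = IP_FW_TABLES_XLIST;
 *		olh->size = len;
 *		getsockopt(s, IPPROTO_IP, IP_FW3, buf, &len);
 *		// buf: [ ipfw_obj_lheader | ipfw_xtable_info x olh->count ]
 *	}
 *
 * The matching IP_FW_TABLES_XLIST handler is registered in scodes[] near
 * the end of this file.
 */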
2104
2105/*
2106 * Dumps all table data
2107 * Data layout (v1)(current):
2108 * Request: [ ipfw_obj_header ], size = ipfw_xtable_info.size
2109 * Reply: [ ipfw_obj_header ipfw_xtable_info ipfw_obj_tentry x N ]
2110 *
2111 * Returns 0 on success
2112 */
2113static int
2114dump_table_v1(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
2115    struct sockopt_data *sd)
2116{
2117	struct _ipfw_obj_header *oh;
2118	ipfw_xtable_info *i;
2119	struct tid_info ti;
2120	struct table_config *tc;
2121	struct table_algo *ta;
2122	struct dump_args da;
2123	uint32_t sz;
2124
2125	sz = sizeof(ipfw_obj_header) + sizeof(ipfw_xtable_info);
2126	oh = (struct _ipfw_obj_header *)ipfw_get_sopt_header(sd, sz);
2127	if (oh == NULL)
2128		return (EINVAL);
2129
2130	i = (ipfw_xtable_info *)(oh + 1);
2131	objheader_to_ti(oh, &ti);
2132
2133	IPFW_UH_RLOCK(ch);
2134	if ((tc = find_table(CHAIN_TO_NI(ch), &ti)) == NULL) {
2135		IPFW_UH_RUNLOCK(ch);
2136		return (ESRCH);
2137	}
2138	export_table_info(ch, tc, i);
2139
2140	if (sd->valsize < i->size) {
2141
2142		/*
2143		 * The submitted buffer is too small. We've already filled
2144		 * in the @i structure with the relevant table info,
2145		 * including the required size, so we can simply return.
2146		 * The buffer will be flushed automatically.
2147		 */
2148		IPFW_UH_RUNLOCK(ch);
2149		return (ENOMEM);
2150	}
2151
2152	/*
2153	 * Do the actual dump in eXtended format
2154	 */
2155	memset(&da, 0, sizeof(da));
2156	da.ch = ch;
2157	da.ti = KIDX_TO_TI(ch, tc->no.kidx);
2158	da.tc = tc;
2159	da.sd = sd;
2160
2161	ta = tc->ta;
2162
2163	ta->foreach(tc->astate, da.ti, dump_table_tentry, &da);
2164	IPFW_UH_RUNLOCK(ch);
2165
2166	return (da.error);
2167}
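
/*
 * Example (illustrative sketch): walking a v1 dump reply produced above.
 * "buf" is assumed to hold a successfully returned
 * [ ipfw_obj_header | ipfw_xtable_info | ipfw_obj_tentry x N ] blob;
 * only the IPv4 (IPFW_TABLE_ADDR / AF_INET) entries are decoded.
 *
 *	ipfw_obj_header *oh = (ipfw_obj_header *)buf;
 *	ipfw_xtable_info *info = (ipfw_xtable_info *)(oh + 1);
 *	ipfw_obj_tentry *tent = (ipfw_obj_tentry *)(info + 1);
 *	char abuf[INET_ADDRSTRLEN];
 *	uint32_t n;
 *
 *	for (n = 0; n < info->count; n++, tent++) {
 *		if (tent->subtype != AF_INET)
 *			continue;
 *		inet_ntop(AF_INET, &tent->k.addr, abuf, sizeof(abuf));
 *		printf("%s/%u\n", abuf, tent->masklen);
 *	}
 */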
2168
2169/*
2170 * Dumps all table data
2171 * Data layout (version 0)(legacy):
2172 * Request: [ ipfw_xtable ], size = IP_FW_TABLE_XGETSIZE()
2173 * Reply: [ ipfw_xtable ipfw_table_xentry x N ]
2174 *
2175 * Returns 0 on success
2176 */
2177static int
2178dump_table_v0(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
2179    struct sockopt_data *sd)
2180{
2181	ipfw_xtable *xtbl;
2182	struct tid_info ti;
2183	struct table_config *tc;
2184	struct table_algo *ta;
2185	struct dump_args da;
2186	size_t sz, count;
2187
2188	xtbl = (ipfw_xtable *)ipfw_get_sopt_header(sd, sizeof(ipfw_xtable));
2189	if (xtbl == NULL)
2190		return (EINVAL);
2191
2192	memset(&ti, 0, sizeof(ti));
2193	ti.uidx = xtbl->tbl;
2194
2195	IPFW_UH_RLOCK(ch);
2196	if ((tc = find_table(CHAIN_TO_NI(ch), &ti)) == NULL) {
2197		IPFW_UH_RUNLOCK(ch);
2198		return (0);
2199	}
2200	count = table_get_count(ch, tc);
2201	sz = count * sizeof(ipfw_table_xentry) + sizeof(ipfw_xtable);
2202
2203	xtbl->cnt = count;
2204	xtbl->size = sz;
2205	xtbl->type = tc->no.subtype;
2206	xtbl->tbl = ti.uidx;
2207
2208	if (sd->valsize < sz) {
2209
2210		/*
2211		 * The submitted buffer is too small. We've already filled
2212		 * in the @xtbl structure with the relevant table info,
2213		 * including the required size, so we can simply return.
2214		 * The buffer will be flushed automatically.
2215		 */
2216		IPFW_UH_RUNLOCK(ch);
2217		return (ENOMEM);
2218	}
2219
2220	/* Do the actual dump in eXtended format */
2221	memset(&da, 0, sizeof(da));
2222	da.ch = ch;
2223	da.ti = KIDX_TO_TI(ch, tc->no.kidx);
2224	da.tc = tc;
2225	da.sd = sd;
2226
2227	ta = tc->ta;
2228
2229	ta->foreach(tc->astate, da.ti, dump_table_xentry, &da);
2230	IPFW_UH_RUNLOCK(ch);
2231
2232	return (0);
2233}
2234
2235/*
2236 * Legacy function to retrieve number of items in table.
2237 */
2238static int
2239get_table_size(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
2240    struct sockopt_data *sd)
2241{
2242	uint32_t *tbl;
2243	struct tid_info ti;
2244	size_t sz;
2245	int error;
2246
2247	sz = sizeof(*op3) + sizeof(uint32_t);
2248	op3 = (ip_fw3_opheader *)ipfw_get_sopt_header(sd, sz);
2249	if (op3 == NULL)
2250		return (EINVAL);
2251
2252	tbl = (uint32_t *)(op3 + 1);
2253	memset(&ti, 0, sizeof(ti));
2254	ti.uidx = *tbl;
2255	IPFW_UH_RLOCK(ch);
2256	error = ipfw_count_xtable(ch, &ti, tbl);
2257	IPFW_UH_RUNLOCK(ch);
2258	return (error);
2259}
2260
2261/*
2262 * Legacy IP_FW_TABLE_GETSIZE handler
2263 */
2264int
2265ipfw_count_table(struct ip_fw_chain *ch, struct tid_info *ti, uint32_t *cnt)
2266{
2267	struct table_config *tc;
2268
2269	if ((tc = find_table(CHAIN_TO_NI(ch), ti)) == NULL)
2270		return (ESRCH);
2271	*cnt = table_get_count(ch, tc);
2272	return (0);
2273}
2274
2275/*
2276 * Legacy IP_FW_TABLE_XGETSIZE handler
2277 */
2278int
2279ipfw_count_xtable(struct ip_fw_chain *ch, struct tid_info *ti, uint32_t *cnt)
2280{
2281	struct table_config *tc;
2282	uint32_t count;
2283
2284	if ((tc = find_table(CHAIN_TO_NI(ch), ti)) == NULL) {
2285		*cnt = 0;
2286		return (0); /* 'table all list' requires success */
2287	}
2288
2289	count = table_get_count(ch, tc);
2290	*cnt = count * sizeof(ipfw_table_xentry);
2291	if (count > 0)
2292		*cnt += sizeof(ipfw_xtable);
2293	return (0);
2294}
2295
2296static int
2297dump_table_entry(void *e, void *arg)
2298{
2299	struct dump_args *da;
2300	struct table_config *tc;
2301	struct table_algo *ta;
2302	ipfw_table_entry *ent;
2303	struct table_value *pval;
2304	int error;
2305
2306	da = (struct dump_args *)arg;
2307
2308	tc = da->tc;
2309	ta = tc->ta;
2310
2311	/* No room left in the caller-supplied buffer, stop the dump */
2312	if (da->cnt == da->size)
2313		return (1);
2314	ent = da->ent++;
2315	ent->tbl = da->uidx;
2316	da->cnt++;
2317
2318	error = ta->dump_tentry(tc->astate, da->ti, e, &da->tent);
2319	if (error != 0)
2320		return (error);
2321
2322	ent->addr = da->tent.k.addr.s_addr;
2323	ent->masklen = da->tent.masklen;
2324	pval = get_table_value(da->ch, da->tc, da->tent.v.kidx);
2325	ent->value = ipfw_export_table_value_legacy(pval);
2326
2327	return (0);
2328}
2329
2330/*
2331 * Dumps table in pre-8.1 legacy format.
2332 */
2333int
2334ipfw_dump_table_legacy(struct ip_fw_chain *ch, struct tid_info *ti,
2335    ipfw_table *tbl)
2336{
2337	struct table_config *tc;
2338	struct table_algo *ta;
2339	struct dump_args da;
2340
2341	tbl->cnt = 0;
2342
2343	if ((tc = find_table(CHAIN_TO_NI(ch), ti)) == NULL)
2344		return (0);	/* XXX: We should return ESRCH */
2345
2346	ta = tc->ta;
2347
2348	/* This dump format supports IPv4 only */
2349	if (tc->no.subtype != IPFW_TABLE_ADDR)
2350		return (0);
2351
2352	memset(&da, 0, sizeof(da));
2353	da.ch = ch;
2354	da.ti = KIDX_TO_TI(ch, tc->no.kidx);
2355	da.tc = tc;
2356	da.ent = &tbl->ent[0];
2357	da.size = tbl->size;
2358
2359	tbl->cnt = 0;
2360	ta->foreach(tc->astate, da.ti, dump_table_entry, &da);
2361	tbl->cnt = da.cnt;
2362
2363	return (0);
2364}
2365
2366/*
2367 * Dumps table entry in eXtended format (v1)(current).
2368 */
2369static int
2370dump_table_tentry(void *e, void *arg)
2371{
2372	struct dump_args *da;
2373	struct table_config *tc;
2374	struct table_algo *ta;
2375	struct table_value *pval;
2376	ipfw_obj_tentry *tent;
2377	int error;
2378
2379	da = (struct dump_args *)arg;
2380
2381	tc = da->tc;
2382	ta = tc->ta;
2383
2384	tent = (ipfw_obj_tentry *)ipfw_get_sopt_space(da->sd, sizeof(*tent));
2385	/* Out of memory, returning */
2386	if (tent == NULL) {
2387		da->error = ENOMEM;
2388		return (1);
2389	}
2390	tent->head.length = sizeof(ipfw_obj_tentry);
2391	tent->idx = da->uidx;
2392
2393	error = ta->dump_tentry(tc->astate, da->ti, e, tent);
2394	if (error != 0)
2395		return (error);
2396
2397	pval = get_table_value(da->ch, da->tc, tent->v.kidx);
2398	ipfw_export_table_value_v1(pval, &tent->v.value);
2399
2400	return (0);
2401}
2402
2403/*
2404 * Dumps table entry in eXtended format (v0).
2405 */
2406static int
2407dump_table_xentry(void *e, void *arg)
2408{
2409	struct dump_args *da;
2410	struct table_config *tc;
2411	struct table_algo *ta;
2412	ipfw_table_xentry *xent;
2413	ipfw_obj_tentry *tent;
2414	struct table_value *pval;
2415	int error;
2416
2417	da = (struct dump_args *)arg;
2418
2419	tc = da->tc;
2420	ta = tc->ta;
2421
2422	xent = (ipfw_table_xentry *)ipfw_get_sopt_space(da->sd, sizeof(*xent));
2423	/* Out of memory, returning */
2424	if (xent == NULL)
2425		return (1);
2426	xent->len = sizeof(ipfw_table_xentry);
2427	xent->tbl = da->uidx;
2428
2429	memset(&da->tent, 0, sizeof(da->tent));
2430	tent = &da->tent;
2431	error = ta->dump_tentry(tc->astate, da->ti, e, tent);
2432	if (error != 0)
2433		return (error);
2434
2435	/* Convert current format to previous one */
2436	xent->masklen = tent->masklen;
2437	pval = get_table_value(da->ch, da->tc, da->tent.v.kidx);
2438	xent->value = ipfw_export_table_value_legacy(pval);
2439	/* Legacy format embeds an IPv4 address in the IPv6 key field */
2440	if (tc->no.subtype == IPFW_TABLE_ADDR && tent->subtype == AF_INET) {
2441		xent->k.addr6.s6_addr32[3] = tent->k.addr.s_addr;
2442		xent->flags = IPFW_TCF_INET;
2443	} else
2444		memcpy(&xent->k, &tent->k, sizeof(xent->k));
2445
2446	return (0);
2447}
2448
2449/*
2450 * Helper function to export table algo data
2451 * to tentry format before calling user function.
2452 *
2453 * Returns 0 on success.
2454 */
2455static int
2456prepare_table_tentry(void *e, void *arg)
2457{
2458	struct dump_args *da;
2459	struct table_config *tc;
2460	struct table_algo *ta;
2461	int error;
2462
2463	da = (struct dump_args *)arg;
2464
2465	tc = da->tc;
2466	ta = tc->ta;
2467
2468	error = ta->dump_tentry(tc->astate, da->ti, e, &da->tent);
2469	if (error != 0)
2470		return (error);
2471
2472	da->f(&da->tent, da->farg);
2473
2474	return (0);
2475}
2476
2477/*
2478 * Allow external consumers to read table entries in standard format.
2479 */
2480int
2481ipfw_foreach_table_tentry(struct ip_fw_chain *ch, uint16_t kidx,
2482    ta_foreach_f *f, void *arg)
2483{
2484	struct namedobj_instance *ni;
2485	struct table_config *tc;
2486	struct table_algo *ta;
2487	struct dump_args da;
2488
2489	ni = CHAIN_TO_NI(ch);
2490
2491	tc = (struct table_config *)ipfw_objhash_lookup_kidx(ni, kidx);
2492	if (tc == NULL)
2493		return (ESRCH);
2494
2495	ta = tc->ta;
2496
2497	memset(&da, 0, sizeof(da));
2498	da.ch = ch;
2499	da.ti = KIDX_TO_TI(ch, tc->no.kidx);
2500	da.tc = tc;
2501	da.f = f;
2502	da.farg = arg;
2503
2504	ta->foreach(tc->astate, da.ti, prepare_table_tentry, &da);
2505
2506	return (0);
2507}
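
/*
 * Example (sketch): an in-kernel consumer iterating a table through
 * ipfw_foreach_table_tentry().  Each entry arrives already converted to
 * ipfw_obj_tentry format by prepare_table_tentry() above.  The callback
 * below merely counts IPv6 entries and is purely illustrative; callers
 * presumably run under the UH lock, since the lookup and the iteration
 * here do not take any locks themselves.
 *
 *	static int
 *	count_inet6_tentries(void *e, void *arg)
 *	{
 *		ipfw_obj_tentry *tent = (ipfw_obj_tentry *)e;
 *		uint32_t *cnt = (uint32_t *)arg;
 *
 *		if (tent->subtype == AF_INET6)
 *			(*cnt)++;
 *		return (0);
 *	}
 *
 *	uint32_t cnt = 0;
 *	if (ipfw_foreach_table_tentry(ch, kidx, count_inet6_tentries,
 *	    &cnt) == 0)
 *		printf("table kidx %u: %u IPv6 entries\n", kidx, cnt);
 */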
2508
2509/*
2510 * Table algorithms
2511 */
2512
2513/*
2514 * Finds algorithm by index, table type or supplied name.
2515 *
2516 * Returns pointer to algo or NULL.
2517 */
2518static struct table_algo *
2519find_table_algo(struct tables_config *tcfg, struct tid_info *ti, char *name)
2520{
2521	int i, l;
2522	struct table_algo *ta;
2523
2524	if (ti->type > IPFW_TABLE_MAXTYPE)
2525		return (NULL);
2526
2527	/* Search by index */
2528	if (ti->atype != 0) {
2529		if (ti->atype > tcfg->algo_count)
2530			return (NULL);
2531		return (tcfg->algo[ti->atype]);
2532	}
2533
2534	if (name == NULL) {
2535		/* Return default algorithm for given type if set */
2536		return (tcfg->def_algo[ti->type]);
2537	}
2538
2539	/* Search by name */
2540	/* TODO: better search */
2541	for (i = 1; i <= tcfg->algo_count; i++) {
2542		ta = tcfg->algo[i];
2543
2544		/*
2545		 * One can supply additional algorithm
2546		 * parameters so we compare only the first word
2547		 * of supplied name:
2548		 * 'addr:chash hsize=32'
2549		 * '^^^^^^^^^'
2550		 *
2551		 */
2552		l = strlen(ta->name);
2553		if (strncmp(name, ta->name, l) != 0)
2554			continue;
2555		if (name[l] != '\0' && name[l] != ' ')
2556			continue;
2557		/* Check if we're requesting proper table type */
2558		if (ti->type != 0 && ti->type != ta->type)
2559			return (NULL);
2560		return (ta);
2561	}
2562
2563	return (NULL);
2564}
2565
2566/*
2567 * Register new table algo @ta.
2568 * Stores algo id inside @idx.
2569 *
2570 * Returns 0 on success.
2571 */
2572int
2573ipfw_add_table_algo(struct ip_fw_chain *ch, struct table_algo *ta, size_t size,
2574    int *idx)
2575{
2576	struct tables_config *tcfg;
2577	struct table_algo *ta_new;
2578	size_t sz;
2579
2580	if (size > sizeof(struct table_algo))
2581		return (EINVAL);
2582
2583	/* Check for the required on-stack size for add/del */
2584	sz = roundup2(ta->ta_buf_size, sizeof(void *));
2585	if (sz > TA_BUF_SZ)
2586		return (EINVAL);
2587
2588	KASSERT(ta->type <= IPFW_TABLE_MAXTYPE,("Increase IPFW_TABLE_MAXTYPE"));
2589
2590	/* Copy algorithm data to stable storage. */
2591	ta_new = malloc(sizeof(struct table_algo), M_IPFW, M_WAITOK | M_ZERO);
2592	memcpy(ta_new, ta, size);
2593
2594	tcfg = CHAIN_TO_TCFG(ch);
2595
2596	KASSERT(tcfg->algo_count < 255, ("Increase algo array size"));
2597
2598	tcfg->algo[++tcfg->algo_count] = ta_new;
2599	ta_new->idx = tcfg->algo_count;
2600
2601	/* Set algorithm as default one for given type */
2602	if ((ta_new->flags & TA_FLAG_DEFAULT) != 0 &&
2603	    tcfg->def_algo[ta_new->type] == NULL)
2604		tcfg->def_algo[ta_new->type] = ta_new;
2605
2606	*idx = ta_new->idx;
2607
2608	return (0);
2609}
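
/*
 * Example (sketch): how a table algorithm module registers itself with
 * ipfw_add_table_algo().  Only the callbacks referenced in this file are
 * shown; a real algorithm also provides the lookup/add/del/prepare
 * callbacks declared in ip_fw_table.h.  All "*_example" identifiers are
 * placeholders.
 *
 *	static struct table_algo addr_example = {
 *		.name		= "addr:example",
 *		.type		= IPFW_TABLE_ADDR,
 *		.flags		= TA_FLAG_DEFAULT,
 *		.ta_buf_size	= sizeof(struct ta_buf_example),
 *		.init		= ta_init_example,
 *		.destroy	= ta_destroy_example,
 *		.foreach	= ta_foreach_example,
 *		.dump_tentry	= ta_dump_tentry_example,
 *	};
 *	static int addr_example_ref;
 *
 *	ipfw_add_table_algo(ch, &addr_example, sizeof(addr_example),
 *	    &addr_example_ref);
 *
 * ipfw_add_table_algo() copies the structure into its own storage, stores
 * the new index via the last argument and, when TA_FLAG_DEFAULT is set,
 * makes the algorithm the default one for its table type.
 */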
2610
2611/*
2612 * Unregisters table algo using @idx as id.
2613 * XXX: It is NOT safe to call this function in any place
2614 * other than ipfw instance destroy handler.
2615 */
2616void
2617ipfw_del_table_algo(struct ip_fw_chain *ch, int idx)
2618{
2619	struct tables_config *tcfg;
2620	struct table_algo *ta;
2621
2622	tcfg = CHAIN_TO_TCFG(ch);
2623
2624	KASSERT(idx <= tcfg->algo_count, ("algo idx %d out of range 1..%d",
2625	    idx, tcfg->algo_count));
2626
2627	ta = tcfg->algo[idx];
2628	KASSERT(ta != NULL, ("algo idx %d is NULL", idx));
2629
2630	if (tcfg->def_algo[ta->type] == ta)
2631		tcfg->def_algo[ta->type] = NULL;
2632
2633	free(ta, M_IPFW);
2634}
2635
2636/*
2637 * Lists all table algorithms currently available.
2638 * Data layout (v0)(current):
2639 * Request: [ ipfw_obj_lheader ], size = ipfw_obj_lheader.size
2640 * Reply: [ ipfw_obj_lheader ipfw_ta_info x N ]
2641 *
2642 * Returns 0 on success
2643 */
2644static int
2645list_table_algo(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
2646    struct sockopt_data *sd)
2647{
2648	struct _ipfw_obj_lheader *olh;
2649	struct tables_config *tcfg;
2650	ipfw_ta_info *i;
2651	struct table_algo *ta;
2652	uint32_t count, n, size;
2653
2654	olh = (struct _ipfw_obj_lheader *)ipfw_get_sopt_header(sd,sizeof(*olh));
2655	if (olh == NULL)
2656		return (EINVAL);
2657	if (sd->valsize < olh->size)
2658		return (EINVAL);
2659
2660	IPFW_UH_RLOCK(ch);
2661	tcfg = CHAIN_TO_TCFG(ch);
2662	count = tcfg->algo_count;
2663	size = count * sizeof(ipfw_ta_info) + sizeof(ipfw_obj_lheader);
2664
2665	/* Fill in header regardless of buffer size */
2666	olh->count = count;
2667	olh->objsize = sizeof(ipfw_ta_info);
2668
2669	if (size > olh->size) {
2670		olh->size = size;
2671		IPFW_UH_RUNLOCK(ch);
2672		return (ENOMEM);
2673	}
2674	olh->size = size;
2675
2676	for (n = 1; n <= count; n++) {
2677		i = (ipfw_ta_info *)ipfw_get_sopt_space(sd, sizeof(*i));
2678		KASSERT(i != NULL, ("previously checked buffer is not enough"));
2679		ta = tcfg->algo[n];
2680		strlcpy(i->algoname, ta->name, sizeof(i->algoname));
2681		i->type = ta->type;
2682		i->refcnt = ta->refcnt;
2683	}
2684
2685	IPFW_UH_RUNLOCK(ch);
2686
2687	return (0);
2688}
2689
2690static int
2691classify_srcdst(ipfw_insn *cmd, uint16_t *puidx, uint8_t *ptype)
2692{
2693	int v;
2694
2695	/* Basic IPv4/IPv6 or u32 lookups. Assume ADDR type by default. */
2696	*puidx = cmd->arg1;
2697	*ptype = IPFW_TABLE_ADDR;
2698
2699	if (F_LEN(cmd) > F_INSN_SIZE(ipfw_insn_u32)) {
2700		/*
2701		 * generic lookup. The key must be
2702		 * in 32bit big-endian format.
2703		 */
2704		v = ((ipfw_insn_u32 *)cmd)->d[1];
2705		switch (v) {
2706		case 0:
2707		case 1:
2708			/* IPv4 src/dst */
2709			break;
2710		case 2:
2711		case 3:
2712			/* src/dst port */
2713			*ptype = IPFW_TABLE_NUMBER;
2714			break;
2715		case 4:
2716			/* uid/gid */
2717			*ptype = IPFW_TABLE_NUMBER;
2718			break;
2719		case 5:
2720			/* jid */
2721			*ptype = IPFW_TABLE_NUMBER;
2722			break;
2723		case 6:
2724			/* dscp */
2725			*ptype = IPFW_TABLE_NUMBER;
2726			break;
2727		}
2728	}
2729
2730	return (0);
2731}
2732
2733static int
2734classify_via(ipfw_insn *cmd, uint16_t *puidx, uint8_t *ptype)
2735{
2736	ipfw_insn_if *cmdif;
2737
2738	/* Interface table, possibly */
2739	cmdif = (ipfw_insn_if *)cmd;
2740	if (cmdif->name[0] != '\1')
2741		return (1);
2742
2743	*ptype = IPFW_TABLE_INTERFACE;
2744	*puidx = cmdif->p.kidx;
2745
2746	return (0);
2747}
2748
2749static int
2750classify_flow(ipfw_insn *cmd, uint16_t *puidx, uint8_t *ptype)
2751{
2752
2753	*puidx = cmd->arg1;
2754	*ptype = IPFW_TABLE_FLOW;
2755
2756	return (0);
2757}
2758
2759static void
2760update_arg1(ipfw_insn *cmd, uint16_t idx)
2761{
2762
2763	cmd->arg1 = idx;
2764}
2765
2766static void
2767update_via(ipfw_insn *cmd, uint16_t idx)
2768{
2769	ipfw_insn_if *cmdif;
2770
2771	cmdif = (ipfw_insn_if *)cmd;
2772	cmdif->p.kidx = idx;
2773}
2774
2775static int
2776table_findbyname(struct ip_fw_chain *ch, struct tid_info *ti,
2777    struct named_object **pno)
2778{
2779	struct table_config *tc;
2780	int error;
2781
2782	IPFW_UH_WLOCK_ASSERT(ch);
2783
2784	error = find_table_err(CHAIN_TO_NI(ch), ti, &tc);
2785	if (error != 0)
2786		return (error);
2787
2788	*pno = &tc->no;
2789	return (0);
2790}
2791
2792/* XXX: sets-sets! */
2793static struct named_object *
2794table_findbykidx(struct ip_fw_chain *ch, uint16_t idx)
2795{
2796	struct namedobj_instance *ni;
2797	struct table_config *tc;
2798
2799	IPFW_UH_WLOCK_ASSERT(ch);
2800	ni = CHAIN_TO_NI(ch);
2801	tc = (struct table_config *)ipfw_objhash_lookup_kidx(ni, idx);
2802	KASSERT(tc != NULL, ("Table with index %d not found", idx));
2803
2804	return (&tc->no);
2805}
2806
2807static int
2808table_manage_sets(struct ip_fw_chain *ch, uint16_t set, uint8_t new_set,
2809    enum ipfw_sets_cmd cmd)
2810{
2811
2812	switch (cmd) {
2813	case SWAP_ALL:
2814	case TEST_ALL:
2815	case MOVE_ALL:
2816		/*
2817		 * Always return success; the real action and decision
2818		 * are made by table_manage_sets_all().
2819		 */
2820		return (0);
2821	case TEST_ONE:
2822	case MOVE_ONE:
2823		/*
2824		 * NOTE: we need to use ipfw_objhash_del/ipfw_objhash_add
2825		 * if the set number is ever used in the hash function. Currently
2826		 * we can just use the generic handler that replaces the set value.
2827		 */
2828		if (V_fw_tables_sets == 0)
2829			return (0);
2830		break;
2831	case COUNT_ONE:
2832		/*
2833		 * Return EOPNOTSUPP for COUNT_ONE when per-set sysctl is
2834		 * disabled. This allows table opcodes to be skipped during the
2835		 * extra checks done when specific rules are moved to another set.
2836		 */
2837		if (V_fw_tables_sets == 0)
2838			return (EOPNOTSUPP);
2839	}
2840	/* Use generic sets handler when per-set sysctl is enabled. */
2841	return (ipfw_obj_manage_sets(CHAIN_TO_NI(ch), IPFW_TLV_TBL_NAME,
2842	    set, new_set, cmd));
2843}
2844
2845/*
2846 * We register several opcode rewriters for lookup tables.
2847 * All table opcodes share the same ETLV type but differ in subtype.
2848 * To avoid invoking the sets handler several times for the XXX_ALL
2849 * commands, we use a separate manage_sets handler. O_RECV has the
2850 * lowest opcode value, so it is consulted first.
2851 */
2852static int
2853table_manage_sets_all(struct ip_fw_chain *ch, uint16_t set, uint8_t new_set,
2854    enum ipfw_sets_cmd cmd)
2855{
2856
2857	switch (cmd) {
2858	case SWAP_ALL:
2859	case TEST_ALL:
2860		/*
2861		 * Return success for TEST_ALL, since nothing prevents
2862		 * moving rules from one set to another: all tables are
2863		 * accessible from every set when the per-set tables sysctl
2864		 * is disabled.
2865		 */
2866	case MOVE_ALL:
2867		if (V_fw_tables_sets == 0)
2868			return (0);
2869		break;
2870	default:
2871		return (table_manage_sets(ch, set, new_set, cmd));
2872	}
2873	/* Use generic sets handler when per-set sysctl is enabled. */
2874	return (ipfw_obj_manage_sets(CHAIN_TO_NI(ch), IPFW_TLV_TBL_NAME,
2875	    set, new_set, cmd));
2876}
2877
2878static struct opcode_obj_rewrite opcodes[] = {
2879	{
2880		.opcode = O_IP_SRC_LOOKUP,
2881		.etlv = IPFW_TLV_TBL_NAME,
2882		.classifier = classify_srcdst,
2883		.update = update_arg1,
2884		.find_byname = table_findbyname,
2885		.find_bykidx = table_findbykidx,
2886		.create_object = create_table_compat,
2887		.manage_sets = table_manage_sets,
2888	},
2889	{
2890		.opcode = O_IP_DST_LOOKUP,
2891		.etlv = IPFW_TLV_TBL_NAME,
2892		.classifier = classify_srcdst,
2893		.update = update_arg1,
2894		.find_byname = table_findbyname,
2895		.find_bykidx = table_findbykidx,
2896		.create_object = create_table_compat,
2897		.manage_sets = table_manage_sets,
2898	},
2899	{
2900		.opcode = O_IP_FLOW_LOOKUP,
2901		.etlv = IPFW_TLV_TBL_NAME,
2902		.classifier = classify_flow,
2903		.update = update_arg1,
2904		.find_byname = table_findbyname,
2905		.find_bykidx = table_findbykidx,
2906		.create_object = create_table_compat,
2907		.manage_sets = table_manage_sets,
2908	},
2909	{
2910		.opcode = O_XMIT,
2911		.etlv = IPFW_TLV_TBL_NAME,
2912		.classifier = classify_via,
2913		.update = update_via,
2914		.find_byname = table_findbyname,
2915		.find_bykidx = table_findbykidx,
2916		.create_object = create_table_compat,
2917		.manage_sets = table_manage_sets,
2918	},
2919	{
2920		.opcode = O_RECV,
2921		.etlv = IPFW_TLV_TBL_NAME,
2922		.classifier = classify_via,
2923		.update = update_via,
2924		.find_byname = table_findbyname,
2925		.find_bykidx = table_findbykidx,
2926		.create_object = create_table_compat,
2927		.manage_sets = table_manage_sets_all,
2928	},
2929	{
2930		.opcode = O_VIA,
2931		.etlv = IPFW_TLV_TBL_NAME,
2932		.classifier = classify_via,
2933		.update = update_via,
2934		.find_byname = table_findbyname,
2935		.find_bykidx = table_findbykidx,
2936		.create_object = create_table_compat,
2937		.manage_sets = table_manage_sets,
2938	},
2939};
2940
2941static int
2942test_sets_cb(struct namedobj_instance *ni __unused, struct named_object *no,
2943    void *arg __unused)
2944{
2945
2946	/* Check that there are no tables in a non-default set */
2947	if (no->set != 0)
2948		return (EBUSY);
2949	return (0);
2950}
2951
2952/*
2953 * Switches between the "set 0" and "rule's set" table bindings.
2954 * Checks all ruleset bindings and permits the change only if
2955 * every binding has both its rule AND its table in the default set (set 0).
2956 *
2957 * Returns 0 on success.
2958 */
2959int
2960ipfw_switch_tables_namespace(struct ip_fw_chain *ch, unsigned int sets)
2961{
2962	struct opcode_obj_rewrite *rw;
2963	struct namedobj_instance *ni;
2964	struct named_object *no;
2965	struct ip_fw *rule;
2966	ipfw_insn *cmd;
2967	int cmdlen, i, l;
2968	uint16_t kidx;
2969	uint8_t subtype;
2970
2971	IPFW_UH_WLOCK(ch);
2972
2973	if (V_fw_tables_sets == sets) {
2974		IPFW_UH_WUNLOCK(ch);
2975		return (0);
2976	}
2977	ni = CHAIN_TO_NI(ch);
2978	if (sets == 0) {
2979		/*
2980		 * Prevent disabling sets support if we have some tables
2981		 * in non-default sets.
2982		 */
2983		if (ipfw_objhash_foreach_type(ni, test_sets_cb,
2984		    NULL, IPFW_TLV_TBL_NAME) != 0) {
2985			IPFW_UH_WUNLOCK(ch);
2986			return (EBUSY);
2987		}
2988	}
2989	/*
2990	 * Scan all rules and examine tables opcodes.
2991	 */
2992	for (i = 0; i < ch->n_rules; i++) {
2993		rule = ch->map[i];
2994
2995		l = rule->cmd_len;
2996		cmd = rule->cmd;
2997		cmdlen = 0;
2998		for ( ;	l > 0 ; l -= cmdlen, cmd += cmdlen) {
2999			cmdlen = F_LEN(cmd);
3000			/* Check only tables opcodes */
3001			for (kidx = 0, rw = opcodes;
3002			    rw < opcodes + nitems(opcodes); rw++) {
3003				if (rw->opcode != cmd->opcode)
3004					continue;
3005				if (rw->classifier(cmd, &kidx, &subtype) == 0)
3006					break;
3007			}
3008			if (kidx == 0)
3009				continue;
3010			no = ipfw_objhash_lookup_kidx(ni, kidx);
3011			/* Check that both the table object and the rule are in set 0 */
3012			if (no->set != 0 || rule->set != 0) {
3013				IPFW_UH_WUNLOCK(ch);
3014				return (EBUSY);
3015			}
3016
3017		}
3018	}
3019	V_fw_tables_sets = sets;
3020	IPFW_UH_WUNLOCK(ch);
3021	return (0);
3022}
3023
3024/*
3025 * Checks table name for validity.
3026 * Enforces basic length checks; the rest
3027 * should be done in userland.
3028 *
3029 * Returns 0 if name is considered valid.
3030 */
3031static int
3032check_table_name(const char *name)
3033{
3034
3035	/*
3036	 * TODO: do some more complicated checks
3037	 */
3038	return (ipfw_check_object_name_generic(name));
3039}
3040
3041/*
3042 * Finds table config based on either legacy index
3043 * or name in ntlv.
3044 * Note @ti structure contains unchecked data from userland.
3045 *
3046 * Returns 0 on success and fills in @tc with the found config.
3047 */
3048static int
3049find_table_err(struct namedobj_instance *ni, struct tid_info *ti,
3050    struct table_config **tc)
3051{
3052	char *name, bname[16];
3053	struct named_object *no;
3054	ipfw_obj_ntlv *ntlv;
3055	uint32_t set;
3056
3057	if (ti->tlvs != NULL) {
3058		ntlv = ipfw_find_name_tlv_type(ti->tlvs, ti->tlen, ti->uidx,
3059		    IPFW_TLV_TBL_NAME);
3060		if (ntlv == NULL)
3061			return (EINVAL);
3062		name = ntlv->name;
3063
3064		/*
3065		 * Use set provided by @ti instead of @ntlv one.
3066		 * This is needed because the sets behavior differs
3067		 * depending on V_fw_tables_sets.
3068		 */
3069		set = (V_fw_tables_sets != 0) ? ti->set : 0;
3070	} else {
3071		snprintf(bname, sizeof(bname), "%d", ti->uidx);
3072		name = bname;
3073		set = 0;
3074	}
3075
3076	no = ipfw_objhash_lookup_name(ni, set, name);
3077	*tc = (struct table_config *)no;
3078
3079	return (0);
3080}
3081
3082/*
3083 * Finds table config based on either legacy index
3084 * or name in ntlv.
3085 * Note @ti structure contains unchecked data from userland.
3086 *
3087 * Returns pointer to table_config or NULL.
3088 */
3089static struct table_config *
3090find_table(struct namedobj_instance *ni, struct tid_info *ti)
3091{
3092	struct table_config *tc;
3093
3094	if (find_table_err(ni, ti, &tc) != 0)
3095		return (NULL);
3096
3097	return (tc);
3098}
3099
3100/*
3101 * Allocate new table config structure using
3102 * specified @algo and @aname.
3103 *
3104 * Returns pointer to config or NULL.
3105 */
3106static struct table_config *
3107alloc_table_config(struct ip_fw_chain *ch, struct tid_info *ti,
3108    struct table_algo *ta, char *aname, uint8_t tflags)
3109{
3110	char *name, bname[16];
3111	struct table_config *tc;
3112	int error;
3113	ipfw_obj_ntlv *ntlv;
3114	uint32_t set;
3115
3116	if (ti->tlvs != NULL) {
3117		ntlv = ipfw_find_name_tlv_type(ti->tlvs, ti->tlen, ti->uidx,
3118		    IPFW_TLV_TBL_NAME);
3119		if (ntlv == NULL)
3120			return (NULL);
3121		name = ntlv->name;
3122		set = ntlv->set;
3123	} else {
3124		/* Compat part: convert number to string representation */
3125		snprintf(bname, sizeof(bname), "%d", ti->uidx);
3126		name = bname;
3127		set = 0;
3128	}
3129
3130	tc = malloc(sizeof(struct table_config), M_IPFW, M_WAITOK | M_ZERO);
3131	tc->no.name = tc->tablename;
3132	tc->no.subtype = ta->type;
3133	tc->no.set = set;
3134	tc->tflags = tflags;
3135	tc->ta = ta;
3136	strlcpy(tc->tablename, name, sizeof(tc->tablename));
3137	/* Set "shared" value type by default */
3138	tc->vshared = 1;
3139
3140	/* Preallocate data structures for new tables */
3141	error = ta->init(ch, &tc->astate, &tc->ti_copy, aname, tflags);
3142	if (error != 0) {
3143		free(tc, M_IPFW);
3144		return (NULL);
3145	}
3146
3147	return (tc);
3148}
3149
3150/*
3151 * Destroys table state and config.
3152 */
3153static void
3154free_table_config(struct namedobj_instance *ni, struct table_config *tc)
3155{
3156
3157	KASSERT(tc->linked == 0, ("free() on linked config"));
3158	/* UH lock MUST NOT be held */
3159
3160	/*
3161	 * We're using ta without any locking/referencing.
3162	 * TODO: fix this if we're going to use unloadable algos.
3163	 */
3164	tc->ta->destroy(tc->astate, &tc->ti_copy);
3165	free(tc, M_IPFW);
3166}
3167
3168/*
3169 * Links @tc to @chain table named instance.
3170 * Sets appropriate type/states in @chain table info.
3171 */
3172static void
3173link_table(struct ip_fw_chain *ch, struct table_config *tc)
3174{
3175	struct namedobj_instance *ni;
3176	struct table_info *ti;
3177	uint16_t kidx;
3178
3179	IPFW_UH_WLOCK_ASSERT(ch);
3180	IPFW_WLOCK_ASSERT(ch);
3181
3182	ni = CHAIN_TO_NI(ch);
3183	kidx = tc->no.kidx;
3184
3185	ipfw_objhash_add(ni, &tc->no);
3186
3187	ti = KIDX_TO_TI(ch, kidx);
3188	*ti = tc->ti_copy;
3189
3190	/* Notify algo on real @ti address */
3191	if (tc->ta->change_ti != NULL)
3192		tc->ta->change_ti(tc->astate, ti);
3193
3194	tc->linked = 1;
3195	tc->ta->refcnt++;
3196}
3197
3198/*
3199 * Unlinks @tc from @chain table named instance.
3200 * Zeroes states in @chain and stores them in @tc.
3201 */
3202static void
3203unlink_table(struct ip_fw_chain *ch, struct table_config *tc)
3204{
3205	struct namedobj_instance *ni;
3206	struct table_info *ti;
3207	uint16_t kidx;
3208
3209	IPFW_UH_WLOCK_ASSERT(ch);
3210	IPFW_WLOCK_ASSERT(ch);
3211
3212	ni = CHAIN_TO_NI(ch);
3213	kidx = tc->no.kidx;
3214
3215	/* Clear state. @ti copy is already saved inside @tc */
3216	ipfw_objhash_del(ni, &tc->no);
3217	ti = KIDX_TO_TI(ch, kidx);
3218	memset(ti, 0, sizeof(struct table_info));
3219	tc->linked = 0;
3220	tc->ta->refcnt--;
3221
3222	/* Notify algo on real @ti address */
3223	if (tc->ta->change_ti != NULL)
3224		tc->ta->change_ti(tc->astate, NULL);
3225}
3226
3227static struct ipfw_sopt_handler	scodes[] = {
3228	{ IP_FW_TABLE_XCREATE,	0,	HDIR_SET,	create_table },
3229	{ IP_FW_TABLE_XDESTROY,	0,	HDIR_SET,	flush_table_v0 },
3230	{ IP_FW_TABLE_XFLUSH,	0,	HDIR_SET,	flush_table_v0 },
3231	{ IP_FW_TABLE_XMODIFY,	0,	HDIR_BOTH,	modify_table },
3232	{ IP_FW_TABLE_XINFO,	0,	HDIR_GET,	describe_table },
3233	{ IP_FW_TABLES_XLIST,	0,	HDIR_GET,	list_tables },
3234	{ IP_FW_TABLE_XLIST,	0,	HDIR_GET,	dump_table_v0 },
3235	{ IP_FW_TABLE_XLIST,	1,	HDIR_GET,	dump_table_v1 },
3236	{ IP_FW_TABLE_XADD,	0,	HDIR_BOTH,	manage_table_ent_v0 },
3237	{ IP_FW_TABLE_XADD,	1,	HDIR_BOTH,	manage_table_ent_v1 },
3238	{ IP_FW_TABLE_XDEL,	0,	HDIR_BOTH,	manage_table_ent_v0 },
3239	{ IP_FW_TABLE_XDEL,	1,	HDIR_BOTH,	manage_table_ent_v1 },
3240	{ IP_FW_TABLE_XFIND,	0,	HDIR_GET,	find_table_entry },
3241	{ IP_FW_TABLE_XSWAP,	0,	HDIR_SET,	swap_table },
3242	{ IP_FW_TABLES_ALIST,	0,	HDIR_GET,	list_table_algo },
3243	{ IP_FW_TABLE_XGETSIZE,	0,	HDIR_GET,	get_table_size },
3244};
3245
3246static int
3247destroy_table_locked(struct namedobj_instance *ni, struct named_object *no,
3248    void *arg)
3249{
3250
3251	unlink_table((struct ip_fw_chain *)arg, (struct table_config *)no);
3252	if (ipfw_objhash_free_idx(ni, no->kidx) != 0)
3253		printf("Error unlinking kidx %d from table %s\n",
3254		    no->kidx, no->name);
3255	free_table_config(ni, (struct table_config *)no);
3256	return (0);
3257}
3258
3259/*
3260 * Shuts tables module down.
3261 */
3262void
3263ipfw_destroy_tables(struct ip_fw_chain *ch, int last)
3264{
3265
3266	IPFW_DEL_SOPT_HANDLER(last, scodes);
3267	IPFW_DEL_OBJ_REWRITER(last, opcodes);
3268
3269	/* Remove all tables from working set */
3270	IPFW_UH_WLOCK(ch);
3271	IPFW_WLOCK(ch);
3272	ipfw_objhash_foreach(CHAIN_TO_NI(ch), destroy_table_locked, ch);
3273	IPFW_WUNLOCK(ch);
3274	IPFW_UH_WUNLOCK(ch);
3275
3276	/* Free the table state array itself */
3277	free(ch->tablestate, M_IPFW);
3278
3279	ipfw_table_value_destroy(ch, last);
3280	ipfw_table_algo_destroy(ch);
3281
3282	ipfw_objhash_destroy(CHAIN_TO_NI(ch));
3283	free(CHAIN_TO_TCFG(ch), M_IPFW);
3284}
3285
3286/*
3287 * Starts tables module.
3288 */
3289int
3290ipfw_init_tables(struct ip_fw_chain *ch, int first)
3291{
3292	struct tables_config *tcfg;
3293
3294	/* Allocate pointers */
3295	ch->tablestate = malloc(V_fw_tables_max * sizeof(struct table_info),
3296	    M_IPFW, M_WAITOK | M_ZERO);
3297
3298	tcfg = malloc(sizeof(struct tables_config), M_IPFW, M_WAITOK | M_ZERO);
3299	tcfg->namehash = ipfw_objhash_create(V_fw_tables_max);
3300	ch->tblcfg = tcfg;
3301
3302	ipfw_table_value_init(ch, first);
3303	ipfw_table_algo_init(ch);
3304
3305	IPFW_ADD_OBJ_REWRITER(first, opcodes);
3306	IPFW_ADD_SOPT_HANDLER(first, scodes);
3307	return (0);
3308}
3309
3310
3311
3312