1/*-
2 * SPDX-License-Identifier: BSD-2-Clause
3 *
4 * Copyright (c) 2004 Ruslan Ermilov and Vsevolod Lobko.
5 * Copyright (c) 2014 Yandex LLC
6 * Copyright (c) 2014 Alexander V. Chernikov
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 *    notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 */
29
30#include <sys/cdefs.h>
31/*
32 * Lookup table support for ipfw.
33 *
34 * This file contains handlers for all generic tables' operations:
35 * add/del/flush entries, list/dump tables etc..
36 *
37 * Table data modification is protected by both UH and runtime lock
38 * while reading configuration/data is protected by UH lock.
39 *
40 * Lookup algorithms for all table types are located in ip_fw_table_algo.c
41 */
42
43#include "opt_ipfw.h"
44
45#include <sys/param.h>
46#include <sys/systm.h>
47#include <sys/malloc.h>
48#include <sys/kernel.h>
49#include <sys/lock.h>
50#include <sys/rwlock.h>
51#include <sys/rmlock.h>
52#include <sys/socket.h>
53#include <sys/socketvar.h>
54#include <sys/queue.h>
55#include <net/if.h>	/* ip_fw.h requires IFNAMSIZ */
56
57#include <netinet/in.h>
58#include <netinet/ip_var.h>	/* struct ipfw_rule_ref */
59#include <netinet/ip_fw.h>
60
61#include <netpfil/ipfw/ip_fw_private.h>
62#include <netpfil/ipfw/ip_fw_table.h>
63
/*
 * Table has the following `type` concepts:
 *
 * `no.subtype` represents lookup key type (addr, ifp, uid, etc..)
 * `vmask` represents bitmask of table values which are present at the moment.
 * Special IPFW_VTYPE_LEGACY ( (uint32_t)-1 ) represents old
 * single-value-for-all approach.
 */
struct table_config {
	struct named_object	no;	/* name, set, kidx, subtype, refcnt */
	uint8_t		tflags;		/* type flags */
	uint8_t		locked;		/* 1 if locked from changes */
	uint8_t		linked;		/* 1 if already linked */
	uint8_t		ochanged;	/* used by set swapping */
	uint8_t		vshared;	/* 1 if using shared value array */
	uint8_t		spare[3];	/* padding, keeps layout aligned */
	uint32_t	count;		/* Number of records */
	uint32_t	limit;		/* Max number of records (0 = none) */
	uint32_t	vmask;		/* bitmask with supported values */
	uint32_t	ocount;		/* used by set swapping */
	uint64_t	gencnt;		/* generation count */
	char		tablename[64];	/* table name */
	struct table_algo	*ta;	/* Callbacks for given algo */
	void		*astate;	/* algorithm state */
	struct table_info	ti_copy;	/* data to put to table_info */
	struct namedobj_instance	*vi;	/* per-table value namehash */
};
91
/* Table config lookup/allocation/linkage helpers */
static int find_table_err(struct namedobj_instance *ni, struct tid_info *ti,
    struct table_config **tc);
static struct table_config *find_table(struct namedobj_instance *ni,
    struct tid_info *ti);
static struct table_config *alloc_table_config(struct ip_fw_chain *ch,
    struct tid_info *ti, struct table_algo *ta, char *adata, uint8_t tflags);
static void free_table_config(struct namedobj_instance *ni,
    struct table_config *tc);
static int create_table_internal(struct ip_fw_chain *ch, struct tid_info *ti,
    char *aname, ipfw_xtable_info *i, uint16_t *pkidx, int ref);
static void link_table(struct ip_fw_chain *ch, struct table_config *tc);
static void unlink_table(struct ip_fw_chain *ch, struct table_config *tc);
static int find_ref_table(struct ip_fw_chain *ch, struct tid_info *ti,
    struct tentry_info *tei, uint32_t count, int op, struct table_config **ptc);
/* Operation selector passed to find_ref_table()/prepare_batch_buffer() */
#define	OP_ADD	1
#define	OP_DEL	0
/* Userland export/dump helpers */
static int export_tables(struct ip_fw_chain *ch, ipfw_obj_lheader *olh,
    struct sockopt_data *sd);
static void export_table_info(struct ip_fw_chain *ch, struct table_config *tc,
    ipfw_xtable_info *i);
static int dump_table_tentry(void *e, void *arg);
static int dump_table_xentry(void *e, void *arg);

static int swap_tables(struct ip_fw_chain *ch, struct tid_info *a,
    struct tid_info *b);

static int check_table_name(const char *name);
static int check_table_space(struct ip_fw_chain *ch, struct tableop_state *ts,
    struct table_config *tc, struct table_info *ti, uint32_t count);
static int destroy_table(struct ip_fw_chain *ch, struct tid_info *ti);

static struct table_algo *find_table_algo(struct tables_config *tableconf,
    struct tid_info *ti, char *name);

/* Request header -> struct tid_info converters */
static void objheader_to_ti(struct _ipfw_obj_header *oh, struct tid_info *ti);
static void ntlv_to_ti(struct _ipfw_obj_ntlv *ntlv, struct tid_info *ti);

/* Shortcuts to the table name hash and the runtime table state array */
#define	CHAIN_TO_NI(chain)	(CHAIN_TO_TCFG(chain)->namehash)
#define	KIDX_TO_TI(ch, k)	(&(((struct table_info *)(ch)->tablestate)[k]))

#define	TA_BUF_SZ	128	/* On-stack buffer for add/delete state */
133
134void
135rollback_toperation_state(struct ip_fw_chain *ch, void *object)
136{
137	struct tables_config *tcfg;
138	struct op_state *os;
139
140	tcfg = CHAIN_TO_TCFG(ch);
141	TAILQ_FOREACH(os, &tcfg->state_list, next)
142		os->func(object, os);
143}
144
145void
146add_toperation_state(struct ip_fw_chain *ch, struct tableop_state *ts)
147{
148	struct tables_config *tcfg;
149
150	tcfg = CHAIN_TO_TCFG(ch);
151	TAILQ_INSERT_HEAD(&tcfg->state_list, &ts->opstate, next);
152}
153
154void
155del_toperation_state(struct ip_fw_chain *ch, struct tableop_state *ts)
156{
157	struct tables_config *tcfg;
158
159	tcfg = CHAIN_TO_TCFG(ch);
160	TAILQ_REMOVE(&tcfg->state_list, &ts->opstate, next);
161}
162
163void
164tc_ref(struct table_config *tc)
165{
166
167	tc->no.refcnt++;
168}
169
170void
171tc_unref(struct table_config *tc)
172{
173
174	tc->no.refcnt--;
175}
176
177static struct table_value *
178get_table_value(struct ip_fw_chain *ch, struct table_config *tc, uint32_t kidx)
179{
180	struct table_value *pval;
181
182	pval = (struct table_value *)ch->valuestate;
183
184	return (&pval[kidx]);
185}
186
187/*
188 * Checks if we're able to insert/update entry @tei into table
189 * w.r.t @tc limits.
190 * May alter @tei to indicate insertion error / insert
191 * options.
192 *
193 * Returns 0 if operation can be performed/
194 */
195static int
196check_table_limit(struct table_config *tc, struct tentry_info *tei)
197{
198
199	if (tc->limit == 0 || tc->count < tc->limit)
200		return (0);
201
202	if ((tei->flags & TEI_FLAGS_UPDATE) == 0) {
203		/* Notify userland on error cause */
204		tei->flags |= TEI_FLAGS_LIMIT;
205		return (EFBIG);
206	}
207
208	/*
209	 * We have UPDATE flag set.
210	 * Permit updating record (if found),
211	 * but restrict adding new one since we've
212	 * already hit the limit.
213	 */
214	tei->flags |= TEI_FLAGS_DONTADD;
215
216	return (0);
217}
218
219/*
220 * Convert algorithm callback return code into
221 * one of pre-defined states known by userland.
222 */
223static void
224store_tei_result(struct tentry_info *tei, int op, int error, uint32_t num)
225{
226	int flag;
227
228	flag = 0;
229
230	switch (error) {
231	case 0:
232		if (op == OP_ADD && num != 0)
233			flag = TEI_FLAGS_ADDED;
234		if (op == OP_DEL)
235			flag = TEI_FLAGS_DELETED;
236		break;
237	case ENOENT:
238		flag = TEI_FLAGS_NOTFOUND;
239		break;
240	case EEXIST:
241		flag = TEI_FLAGS_EXISTS;
242		break;
243	default:
244		flag = TEI_FLAGS_ERROR;
245	}
246
247	tei->flags |= flag;
248}
249
250/*
251 * Creates and references table with default parameters.
252 * Saves table config, algo and allocated kidx info @ptc, @pta and
253 * @pkidx if non-zero.
254 * Used for table auto-creation to support old binaries.
255 *
256 * Returns 0 on success.
257 */
258static int
259create_table_compat(struct ip_fw_chain *ch, struct tid_info *ti,
260    uint16_t *pkidx)
261{
262	ipfw_xtable_info xi;
263	int error;
264
265	memset(&xi, 0, sizeof(xi));
266	/* Set default value mask for legacy clients */
267	xi.vmask = IPFW_VTYPE_LEGACY;
268
269	error = create_table_internal(ch, ti, NULL, &xi, pkidx, 1);
270	if (error != 0)
271		return (error);
272
273	return (0);
274}
275
/*
 * Finds and references an existing table, optionally creating a
 * new one (compatibility mode only).
 *
 * Saves found table config into @ptc.
 * Note: the function may drop/reacquire UH_WLOCK while creating a
 * table for an old client.
 * Returns 0 if table was found/created and referenced
 * or non-zero return code.
 */
static int
find_ref_table(struct ip_fw_chain *ch, struct tid_info *ti,
    struct tentry_info *tei, uint32_t count, int op,
    struct table_config **ptc)
{
	struct namedobj_instance *ni;
	struct table_config *tc;
	uint16_t kidx;
	int error;

	IPFW_UH_WLOCK_ASSERT(ch);

	ni = CHAIN_TO_NI(ch);
	tc = NULL;
	if ((tc = find_table(ni, ti)) != NULL) {
		/* check table type */
		if (tc->no.subtype != ti->type)
			return (EINVAL);

		/* Locked tables reject any modification */
		if (tc->locked != 0)
			return (EACCES);

		/* Try to exit early on limit hit */
		if (op == OP_ADD && count == 1 &&
		    check_table_limit(tc, tei) != 0)
			return (EFBIG);

		/* Reference and return */
		tc->no.refcnt++;
		*ptc = tc;
		return (0);
	}

	/* Never auto-create a table for a deletion request */
	if (op == OP_DEL)
		return (ESRCH);

	/* Compatibility mode: create new table for old clients */
	if ((tei->flags & TEI_FLAGS_COMPAT) == 0)
		return (ESRCH);

	/* Creation may sleep; drop the lock for its duration */
	IPFW_UH_WUNLOCK(ch);
	error = create_table_compat(ch, ti, &kidx);
	IPFW_UH_WLOCK(ch);

	if (error != 0)
		return (error);

	/* Look the new table up by the kidx we were given */
	tc = (struct table_config *)ipfw_objhash_lookup_kidx(ni, kidx);
	KASSERT(tc != NULL, ("create_table_compat returned bad idx %d", kidx));

	/* OK, now we've got referenced table. */
	*ptc = tc;
	return (0);
}
339
/*
 * Rolls back the first @added entries inserted into @tc using the
 * state array @ta_buf_m.
 * Assumes the following buffer layout:
 * 1) ADD state (ta_buf_m[0] ... ta_buf_m[added - 1]) for handling
 *    update cases
 * 2) DEL state (ta_buf_m[count] ... ta_buf_m[count + added - 1])
 *    for storing deleted state
 */
static void
rollback_added_entries(struct ip_fw_chain *ch, struct table_config *tc,
    struct table_info *tinfo, struct tentry_info *tei, caddr_t ta_buf_m,
    uint32_t count, uint32_t added)
{
	struct table_algo *ta;
	struct tentry_info *ptei;
	caddr_t v, vv;
	size_t ta_buf_sz;
	int error __diagused, i;
	uint32_t num;

	IPFW_UH_WLOCK_ASSERT(ch);

	ta = tc->ta;
	ta_buf_sz = ta->ta_buf_size;
	/* @v walks ADD state, @vv walks the parallel DEL state */
	v = ta_buf_m;
	vv = v + count * ta_buf_sz;
	for (i = 0; i < added; i++, v += ta_buf_sz, vv += ta_buf_sz) {
		ptei = &tei[i];
		if ((ptei->flags & TEI_FLAGS_UPDATED) != 0) {
			/*
			 * We have old value stored by previous
			 * call in @ptei->value. Do add once again
			 * to restore it.
			 */
			error = ta->add(tc->astate, tinfo, ptei, v, &num);
			KASSERT(error == 0, ("rollback UPDATE fail"));
			KASSERT(num == 0, ("rollback UPDATE fail2"));
			continue;
		}

		/* Entry was freshly inserted: delete it again */
		error = ta->prepare_del(ch, ptei, vv);
		KASSERT(error == 0, ("pre-rollback INSERT failed"));
		error = ta->del(tc->astate, tinfo, ptei, vv, &num);
		KASSERT(error == 0, ("rollback INSERT failed"));
		tc->count -= num;
	}
}
386
/*
 * Prepares add/del state for all @count entries in @tei.
 * Uses either the caller's stack buffer (@ta_buf, assumed TA_BUF_SZ
 * bytes) for a single entry, or allocates a new one for batches.
 * Stores pointer to the buffer actually used back to @ta_buf.
 *
 * Returns 0 on success.
 */
static int
prepare_batch_buffer(struct ip_fw_chain *ch, struct table_algo *ta,
    struct tentry_info *tei, uint32_t count, int op, caddr_t *ta_buf)
{
	caddr_t ta_buf_m, v;
	size_t ta_buf_sz, sz;
	struct tentry_info *ptei;
	int error, i;

	error = 0;
	ta_buf_sz = ta->ta_buf_size;
	if (count == 1) {
		/* Single add/delete, use on-stack buffer */
		memset(*ta_buf, 0, TA_BUF_SZ);
		ta_buf_m = *ta_buf;
	} else {
		/*
		 * Multiple adds/deletes, allocate larger buffer
		 *
		 * Note we need 2xcount buffer for add case:
		 * we have hold both ADD state
		 * and DELETE state (this may be needed
		 * if we need to rollback all changes)
		 */
		sz = count * ta_buf_sz;
		ta_buf_m = malloc((op == OP_ADD) ? sz * 2 : sz, M_TEMP,
		    M_WAITOK | M_ZERO);
	}

	/* Run the per-entry prepare callback over the first half */
	v = ta_buf_m;
	for (i = 0; i < count; i++, v += ta_buf_sz) {
		ptei = &tei[i];
		error = (op == OP_ADD) ?
		    ta->prepare_add(ch, ptei, v) : ta->prepare_del(ch, ptei, v);

		/*
		 * Some syntax error (incorrect mask, or address, or
		 * anything). Return error regardless of atomicity
		 * settings.
		 */
		if (error != 0)
			break;
	}

	/* Hand the buffer back even on error so caller can flush it */
	*ta_buf = ta_buf_m;
	return (error);
}
441
/*
 * Flushes allocated state for each of the @count entries in @tei.
 * Frees @ta_buf_m if it differs from the stack buffer @ta_buf.
 */
static void
flush_batch_buffer(struct ip_fw_chain *ch, struct table_algo *ta,
    struct tentry_info *tei, uint32_t count, int rollback,
    caddr_t ta_buf_m, caddr_t ta_buf)
{
	caddr_t v;
	struct tentry_info *ptei;
	size_t ta_buf_sz;
	int i;

	ta_buf_sz = ta->ta_buf_size;

	/* Run cleaning callback anyway */
	v = ta_buf_m;
	for (i = 0; i < count; i++, v += ta_buf_sz) {
		ptei = &tei[i];
		ta->flush_entry(ch, ptei, v);
		/* Release value copy saved for potential rollback */
		if (ptei->ptv != NULL) {
			free(ptei->ptv, M_IPFW);
			ptei->ptv = NULL;
		}
	}

	/* Clean up "deleted" state in case of rollback */
	if (rollback != 0) {
		/* DEL state lives in the second half of the buffer */
		v = ta_buf_m + count * ta_buf_sz;
		for (i = 0; i < count; i++, v += ta_buf_sz)
			ta->flush_entry(ch, &tei[i], v);
	}

	if (ta_buf_m != ta_buf)
		free(ta_buf_m, M_TEMP);
}
479
480static void
481rollback_add_entry(void *object, struct op_state *_state)
482{
483	struct ip_fw_chain *ch __diagused;
484	struct tableop_state *ts;
485
486	ts = (struct tableop_state *)_state;
487
488	if (ts->tc != object && ts->ch != object)
489		return;
490
491	ch = ts->ch;
492
493	IPFW_UH_WLOCK_ASSERT(ch);
494
495	/* Call specifid unlockers */
496	rollback_table_values(ts);
497
498	/* Indicate we've called */
499	ts->modified = 1;
500}
501
/*
 * Adds/updates one or more entries in table @ti.
 *
 * Function may drop/reacquire UH wlock multiple times due to
 * items alloc, algorithm callbacks (check_space), value linkage
 * (new values, value storage realloc), etc..
 * Other processes like other adds (which may involve storage resize),
 * table swaps (which changes table data and may change algo type),
 * table modify (which may change value mask) may be executed
 * simultaneously so we need to deal with it.
 *
 * The following approach was implemented:
 * we have per-chain linked list, protected with UH lock.
 * add_table_entry prepares special on-stack structure which is passed
 * to its descendants. Users add this structure to this list before unlock.
 * After performing needed operations and acquiring UH lock back, each user
 * checks if structure has changed. If true, it rolls local state back and
 * returns without error to the caller.
 * add_table_entry() on its own checks if structure has changed and restarts
 * its operation from the beginning (goto restart).
 *
 * Functions which are modifying fields of interest (currently
 *   resize_shared_value_storage() and swap_tables() )
 * traverse given list while holding UH lock immediately before
 * performing their operations calling function provided by list entry
 * ( currently rollback_add_entry ) which performs rollback for all necessary
 * state and sets appropriate values in structure indicating rollback
 * has happened.
 *
 * Algo interaction:
 * Function references @ti first to ensure table won't
 * disappear or change its type.
 * After that, prepare_add callback is called for each @tei entry.
 * Next, we try to add each entry under UH+WHLOCK
 * using add() callback.
 * Finally, we free all state by calling flush_entry callback
 * for each @tei.
 *
 * Returns 0 on success.
 */
int
add_table_entry(struct ip_fw_chain *ch, struct tid_info *ti,
    struct tentry_info *tei, uint8_t flags, uint32_t count)
{
	struct table_config *tc;
	struct table_algo *ta;
	uint16_t kidx;
	int error, first_error, i, rollback;
	uint32_t num, numadd;
	struct tentry_info *ptei;
	struct tableop_state ts;
	char ta_buf[TA_BUF_SZ];
	caddr_t ta_buf_m, v;

	memset(&ts, 0, sizeof(ts));
	ta = NULL;
	IPFW_UH_WLOCK(ch);

	/*
	 * Find and reference existing table.
	 */
restart:
	if (ts.modified != 0) {
		/*
		 * A concurrent modifier rolled our state back:
		 * drop the prepared buffers and start over.
		 */
		IPFW_UH_WUNLOCK(ch);
		flush_batch_buffer(ch, ta, tei, count, rollback,
		    ta_buf_m, ta_buf);
		memset(&ts, 0, sizeof(ts));
		ta = NULL;
		IPFW_UH_WLOCK(ch);
	}

	error = find_ref_table(ch, ti, tei, count, OP_ADD, &tc);
	if (error != 0) {
		IPFW_UH_WUNLOCK(ch);
		return (error);
	}
	ta = tc->ta;

	/* Fill in tablestate */
	ts.ch = ch;
	ts.opstate.func = rollback_add_entry;
	ts.tc = tc;
	ts.vshared = tc->vshared;
	ts.vmask = tc->vmask;
	ts.ta = ta;
	ts.tei = tei;
	ts.count = count;
	rollback = 0;
	/* Publish our state so concurrent modifiers can roll us back */
	add_toperation_state(ch, &ts);
	IPFW_UH_WUNLOCK(ch);

	/* Allocate memory and prepare record(s) */
	/* Pass stack buffer by default */
	ta_buf_m = ta_buf;
	error = prepare_batch_buffer(ch, ta, tei, count, OP_ADD, &ta_buf_m);

	IPFW_UH_WLOCK(ch);
	del_toperation_state(ch, &ts);
	/* Drop reference we've used in first search */
	tc->no.refcnt--;

	/* Check prepare_batch_buffer() error */
	if (error != 0)
		goto cleanup;

	/*
	 * Check if table swap has happened.
	 * (so table algo might be changed).
	 * Restart operation to achieve consistent behavior.
	 */
	if (ts.modified != 0)
		goto restart;

	/*
	 * Link all values to shared/per-table value array.
	 *
	 * May release/reacquire UH_WLOCK.
	 */
	error = ipfw_link_table_values(ch, &ts, flags);
	if (error != 0)
		goto cleanup;
	if (ts.modified != 0)
		goto restart;

	/*
	 * Ensure we are able to add all entries without additional
	 * memory allocations. May release/reacquire UH_WLOCK.
	 */
	kidx = tc->no.kidx;
	error = check_table_space(ch, &ts, tc, KIDX_TO_TI(ch, kidx), count);
	if (error != 0)
		goto cleanup;
	if (ts.modified != 0)
		goto restart;

	/* We've got valid table in @tc. Let's try to add data */
	kidx = tc->no.kidx;
	ta = tc->ta;
	numadd = 0;
	first_error = 0;

	IPFW_WLOCK(ch);

	v = ta_buf_m;
	for (i = 0; i < count; i++, v += ta->ta_buf_size) {
		ptei = &tei[i];
		num = 0;
		/* check limit before adding */
		if ((error = check_table_limit(tc, ptei)) == 0) {
			/*
			 * It should be safe to insert a record w/o
			 * a properly-linked value if atomicity is
			 * not required.
			 *
			 * If the added item does not have a valid value
			 * index, it would get rejected by ta->add().
			 * */
			error = ta->add(tc->astate, KIDX_TO_TI(ch, kidx),
			    ptei, v, &num);
			/* Set status flag to inform userland */
			store_tei_result(ptei, OP_ADD, error, num);
		}
		if (error == 0) {
			/* Update number of records to ease limit checking */
			tc->count += num;
			numadd += num;
			continue;
		}

		if (first_error == 0)
			first_error = error;

		/*
		 * Some error have happened. Check our atomicity
		 * settings: continue if atomicity is not required,
		 * rollback changes otherwise.
		 */
		if ((flags & IPFW_CTF_ATOMIC) == 0)
			continue;

		rollback_added_entries(ch, tc, KIDX_TO_TI(ch, kidx),
		    tei, ta_buf_m, count, i);

		rollback = 1;
		break;
	}

	IPFW_WUNLOCK(ch);

	/* Unlink values which are no longer referenced */
	ipfw_garbage_table_values(ch, tc, tei, count, rollback);

	/* Permit post-add algorithm grow/rehash. */
	if (numadd != 0)
		check_table_space(ch, NULL, tc, KIDX_TO_TI(ch, kidx), 0);

	/* Return first error to user, if any */
	error = first_error;

cleanup:
	IPFW_UH_WUNLOCK(ch);

	flush_batch_buffer(ch, ta, tei, count, rollback, ta_buf_m, ta_buf);

	return (error);
}
707
708/*
709 * Deletes one or more entries in table @ti.
710 *
711 * Returns 0 on success.
712 */
713int
714del_table_entry(struct ip_fw_chain *ch, struct tid_info *ti,
715    struct tentry_info *tei, uint8_t flags, uint32_t count)
716{
717	struct table_config *tc;
718	struct table_algo *ta;
719	struct tentry_info *ptei;
720	uint16_t kidx;
721	int error, first_error, i;
722	uint32_t num, numdel;
723	char ta_buf[TA_BUF_SZ];
724	caddr_t ta_buf_m, v;
725
726	/*
727	 * Find and reference existing table.
728	 */
729	IPFW_UH_WLOCK(ch);
730	error = find_ref_table(ch, ti, tei, count, OP_DEL, &tc);
731	if (error != 0) {
732		IPFW_UH_WUNLOCK(ch);
733		return (error);
734	}
735	ta = tc->ta;
736	IPFW_UH_WUNLOCK(ch);
737
738	/* Allocate memory and prepare record(s) */
739	/* Pass stack buffer by default */
740	ta_buf_m = ta_buf;
741	error = prepare_batch_buffer(ch, ta, tei, count, OP_DEL, &ta_buf_m);
742	if (error != 0)
743		goto cleanup;
744
745	IPFW_UH_WLOCK(ch);
746
747	/* Drop reference we've used in first search */
748	tc->no.refcnt--;
749
750	/*
751	 * Check if table algo is still the same.
752	 * (changed ta may be the result of table swap).
753	 */
754	if (ta != tc->ta) {
755		IPFW_UH_WUNLOCK(ch);
756		error = EINVAL;
757		goto cleanup;
758	}
759
760	kidx = tc->no.kidx;
761	numdel = 0;
762	first_error = 0;
763
764	IPFW_WLOCK(ch);
765	v = ta_buf_m;
766	for (i = 0; i < count; i++, v += ta->ta_buf_size) {
767		ptei = &tei[i];
768		num = 0;
769		error = ta->del(tc->astate, KIDX_TO_TI(ch, kidx), ptei, v,
770		    &num);
771		/* Save state for userland */
772		store_tei_result(ptei, OP_DEL, error, num);
773		if (error != 0 && first_error == 0)
774			first_error = error;
775		tc->count -= num;
776		numdel += num;
777	}
778	IPFW_WUNLOCK(ch);
779
780	/* Unlink non-used values */
781	ipfw_garbage_table_values(ch, tc, tei, count, 0);
782
783	if (numdel != 0) {
784		/* Run post-del hook to permit shrinking */
785		check_table_space(ch, NULL, tc, KIDX_TO_TI(ch, kidx), 0);
786	}
787
788	IPFW_UH_WUNLOCK(ch);
789
790	/* Return first error to user, if any */
791	error = first_error;
792
793cleanup:
794	flush_batch_buffer(ch, ta, tei, count, 0, ta_buf_m, ta_buf);
795
796	return (error);
797}
798
799/*
800 * Ensure that table @tc has enough space to add @count entries without
801 * need for reallocation.
802 *
803 * Callbacks order:
804 * 0) need_modify() (UH_WLOCK) - checks if @count items can be added w/o resize.
805 *
806 * 1) alloc_modify (no locks, M_WAITOK) - alloc new state based on @pflags.
807 * 2) prepare_modifyt (UH_WLOCK) - copy old data into new storage
808 * 3) modify (UH_WLOCK + WLOCK) - switch pointers
809 * 4) flush_modify (UH_WLOCK) - free state, if needed
810 *
811 * Returns 0 on success.
812 */
813static int
814check_table_space(struct ip_fw_chain *ch, struct tableop_state *ts,
815    struct table_config *tc, struct table_info *ti, uint32_t count)
816{
817	struct table_algo *ta;
818	uint64_t pflags;
819	char ta_buf[TA_BUF_SZ];
820	int error;
821
822	IPFW_UH_WLOCK_ASSERT(ch);
823
824	error = 0;
825	ta = tc->ta;
826	if (ta->need_modify == NULL)
827		return (0);
828
829	/* Acquire reference not to loose @tc between locks/unlocks */
830	tc->no.refcnt++;
831
832	/*
833	 * TODO: think about avoiding race between large add/large delete
834	 * operation on algorithm which implements shrinking along with
835	 * growing.
836	 */
837	while (true) {
838		pflags = 0;
839		if (ta->need_modify(tc->astate, ti, count, &pflags) == 0) {
840			error = 0;
841			break;
842		}
843
844		/* We have to shrink/grow table */
845		if (ts != NULL)
846			add_toperation_state(ch, ts);
847		IPFW_UH_WUNLOCK(ch);
848
849		memset(&ta_buf, 0, sizeof(ta_buf));
850		error = ta->prepare_mod(ta_buf, &pflags);
851
852		IPFW_UH_WLOCK(ch);
853		if (ts != NULL)
854			del_toperation_state(ch, ts);
855
856		if (error != 0)
857			break;
858
859		if (ts != NULL && ts->modified != 0) {
860			/*
861			 * Swap operation has happened
862			 * so we're currently operating on other
863			 * table data. Stop doing this.
864			 */
865			ta->flush_mod(ta_buf);
866			break;
867		}
868
869		/* Check if we still need to alter table */
870		ti = KIDX_TO_TI(ch, tc->no.kidx);
871		if (ta->need_modify(tc->astate, ti, count, &pflags) == 0) {
872			IPFW_UH_WUNLOCK(ch);
873
874			/*
875			 * Other thread has already performed resize.
876			 * Flush our state and return.
877			 */
878			ta->flush_mod(ta_buf);
879			break;
880		}
881
882		error = ta->fill_mod(tc->astate, ti, ta_buf, &pflags);
883		if (error == 0) {
884			/* Do actual modification */
885			IPFW_WLOCK(ch);
886			ta->modify(tc->astate, ti, ta_buf, pflags);
887			IPFW_WUNLOCK(ch);
888		}
889
890		/* Anyway, flush data and retry */
891		ta->flush_mod(ta_buf);
892	}
893
894	tc->no.refcnt--;
895	return (error);
896}
897
/*
 * Adds or deletes record in table.
 * Data layout (v0):
 * Request: [ ip_fw3_opheader ipfw_table_xentry ]
 *
 * Returns 0 on success
 */
static int
manage_table_ent_v0(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
    struct sockopt_data *sd)
{
	ipfw_table_xentry *xent;
	struct tentry_info tei;
	struct tid_info ti;
	struct table_value v;
	int error, hdrlen, read;

	/* Fixed-size part of xentry, before the key */
	hdrlen = offsetof(ipfw_table_xentry, k);

	/* Check minimum header size */
	if (sd->valsize < (sizeof(*op3) + hdrlen))
		return (EINVAL);

	read = sizeof(ip_fw3_opheader);

	/* Check if xentry len field is valid */
	xent = (ipfw_table_xentry *)(op3 + 1);
	if (xent->len < hdrlen || xent->len + read > sd->valsize)
		return (EINVAL);

	memset(&tei, 0, sizeof(tei));
	tei.paddr = &xent->k;
	tei.masklen = xent->masklen;
	/* Convert legacy single value to the current value structure */
	ipfw_import_table_value_legacy(xent->value, &v);
	tei.pvalue = &v;
	/* Old requests compatibility */
	tei.flags = TEI_FLAGS_COMPAT;
	/* Deduce address family from the key length */
	if (xent->type == IPFW_TABLE_ADDR) {
		if (xent->len - hdrlen == sizeof(in_addr_t))
			tei.subtype = AF_INET;
		else
			tei.subtype = AF_INET6;
	}

	memset(&ti, 0, sizeof(ti));
	ti.uidx = xent->tbl;
	ti.type = xent->type;

	error = (op3->opcode == IP_FW_TABLE_XADD) ?
	    add_table_entry(ch, &ti, &tei, 0, 1) :
	    del_table_entry(ch, &ti, &tei, 0, 1);

	return (error);
}
952
/*
 * Adds or deletes record in table.
 * Data layout (v1)(current):
 * Request: [ ipfw_obj_header
 *   ipfw_obj_ctlv(IPFW_TLV_TBLENT_LIST) [ ipfw_obj_tentry x N ]
 * ]
 *
 * Returns 0 on success
 */
static int
manage_table_ent_v1(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
    struct sockopt_data *sd)
{
	ipfw_obj_tentry *tent, *ptent;
	ipfw_obj_ctlv *ctlv;
	ipfw_obj_header *oh;
	struct tentry_info *ptei, tei, *tei_buf;
	struct tid_info ti;
	int error, i, kidx, read;

	/* Check minimum header size */
	if (sd->valsize < (sizeof(*oh) + sizeof(*ctlv)))
		return (EINVAL);

	/* Check if passed data is too long */
	if (sd->valsize != sd->kavail)
		return (EINVAL);

	oh = (ipfw_obj_header *)sd->kbuf;

	/* Basic length checks for TLVs */
	if (oh->ntlv.head.length != sizeof(oh->ntlv))
		return (EINVAL);

	read = sizeof(*oh);

	ctlv = (ipfw_obj_ctlv *)(oh + 1);
	if (ctlv->head.length + read != sd->valsize)
		return (EINVAL);

	read += sizeof(*ctlv);
	tent = (ipfw_obj_tentry *)(ctlv + 1);
	/* Declared entry count must match buffer size exactly */
	if (ctlv->count * sizeof(*tent) + read != sd->valsize)
		return (EINVAL);

	if (ctlv->count == 0)
		return (0);

	/*
	 * Mark entire buffer as "read".
	 * This instructs sopt api write it back
	 * after function return.
	 */
	ipfw_get_sopt_header(sd, sd->valsize);

	/* Perform basic checks for each entry */
	ptent = tent;
	kidx = tent->idx;
	for (i = 0; i < ctlv->count; i++, ptent++) {
		if (ptent->head.length != sizeof(*ptent))
			return (EINVAL);
		/* All entries in one request must target one table */
		if (ptent->idx != kidx)
			return (ENOTSUP);
	}

	/* Convert data into kernel request objects */
	objheader_to_ti(oh, &ti);
	ti.type = oh->ntlv.type;
	ti.uidx = kidx;

	/* Use on-stack buffer for single add/del */
	if (ctlv->count == 1) {
		memset(&tei, 0, sizeof(tei));
		tei_buf = &tei;
	} else
		tei_buf = malloc(ctlv->count * sizeof(tei), M_TEMP,
		    M_WAITOK | M_ZERO);

	/* Convert each userland tentry into a kernel tentry_info */
	ptei = tei_buf;
	ptent = tent;
	for (i = 0; i < ctlv->count; i++, ptent++, ptei++) {
		ptei->paddr = &ptent->k;
		ptei->subtype = ptent->subtype;
		ptei->masklen = ptent->masklen;
		if (ptent->head.flags & IPFW_TF_UPDATE)
			ptei->flags |= TEI_FLAGS_UPDATE;

		ipfw_import_table_value_v1(&ptent->v.value);
		ptei->pvalue = (struct table_value *)&ptent->v.value;
	}

	error = (oh->opheader.opcode == IP_FW_TABLE_XADD) ?
	    add_table_entry(ch, &ti, tei_buf, ctlv->flags, ctlv->count) :
	    del_table_entry(ch, &ti, tei_buf, ctlv->flags, ctlv->count);

	/* Translate result back to userland */
	ptei = tei_buf;
	ptent = tent;
	for (i = 0; i < ctlv->count; i++, ptent++, ptei++) {
		if (ptei->flags & TEI_FLAGS_ADDED)
			ptent->result = IPFW_TR_ADDED;
		else if (ptei->flags & TEI_FLAGS_DELETED)
			ptent->result = IPFW_TR_DELETED;
		else if (ptei->flags & TEI_FLAGS_UPDATED)
			ptent->result = IPFW_TR_UPDATED;
		else if (ptei->flags & TEI_FLAGS_LIMIT)
			ptent->result = IPFW_TR_LIMIT;
		else if (ptei->flags & TEI_FLAGS_ERROR)
			ptent->result = IPFW_TR_ERROR;
		else if (ptei->flags & TEI_FLAGS_NOTFOUND)
			ptent->result = IPFW_TR_NOTFOUND;
		else if (ptei->flags & TEI_FLAGS_EXISTS)
			ptent->result = IPFW_TR_EXISTS;
		ipfw_export_table_value_v1(ptei->pvalue, &ptent->v.value);
	}

	if (tei_buf != &tei)
		free(tei_buf, M_TEMP);

	return (error);
}
1074
1075/*
1076 * Looks up an entry in given table.
1077 * Data layout (v0)(current):
1078 * Request: [ ipfw_obj_header ipfw_obj_tentry ]
1079 * Reply: [ ipfw_obj_header ipfw_obj_tentry ]
1080 *
1081 * Returns 0 on success
1082 */
1083static int
1084find_table_entry(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
1085    struct sockopt_data *sd)
1086{
1087	ipfw_obj_tentry *tent;
1088	ipfw_obj_header *oh;
1089	struct tid_info ti;
1090	struct table_config *tc;
1091	struct table_algo *ta;
1092	struct table_info *kti;
1093	struct table_value *pval;
1094	struct namedobj_instance *ni;
1095	int error;
1096	size_t sz;
1097
1098	/* Check minimum header size */
1099	sz = sizeof(*oh) + sizeof(*tent);
1100	if (sd->valsize != sz)
1101		return (EINVAL);
1102
1103	oh = (struct _ipfw_obj_header *)ipfw_get_sopt_header(sd, sz);
1104	tent = (ipfw_obj_tentry *)(oh + 1);
1105
1106	/* Basic length checks for TLVs */
1107	if (oh->ntlv.head.length != sizeof(oh->ntlv))
1108		return (EINVAL);
1109
1110	objheader_to_ti(oh, &ti);
1111	ti.type = oh->ntlv.type;
1112	ti.uidx = tent->idx;
1113
1114	IPFW_UH_RLOCK(ch);
1115	ni = CHAIN_TO_NI(ch);
1116
1117	/*
1118	 * Find existing table and check its type .
1119	 */
1120	ta = NULL;
1121	if ((tc = find_table(ni, &ti)) == NULL) {
1122		IPFW_UH_RUNLOCK(ch);
1123		return (ESRCH);
1124	}
1125
1126	/* check table type */
1127	if (tc->no.subtype != ti.type) {
1128		IPFW_UH_RUNLOCK(ch);
1129		return (EINVAL);
1130	}
1131
1132	kti = KIDX_TO_TI(ch, tc->no.kidx);
1133	ta = tc->ta;
1134
1135	if (ta->find_tentry == NULL)
1136		return (ENOTSUP);
1137
1138	error = ta->find_tentry(tc->astate, kti, tent);
1139	if (error == 0) {
1140		pval = get_table_value(ch, tc, tent->v.kidx);
1141		ipfw_export_table_value_v1(pval, &tent->v.value);
1142	}
1143	IPFW_UH_RUNLOCK(ch);
1144
1145	return (error);
1146}
1147
1148/*
1149 * Flushes all entries or destroys given table.
1150 * Data layout (v0)(current):
1151 * Request: [ ipfw_obj_header ]
1152 *
1153 * Returns 0 on success
1154 */
1155static int
1156flush_table_v0(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
1157    struct sockopt_data *sd)
1158{
1159	int error;
1160	struct _ipfw_obj_header *oh;
1161	struct tid_info ti;
1162
1163	if (sd->valsize != sizeof(*oh))
1164		return (EINVAL);
1165
1166	oh = (struct _ipfw_obj_header *)op3;
1167	objheader_to_ti(oh, &ti);
1168
1169	if (op3->opcode == IP_FW_TABLE_XDESTROY)
1170		error = destroy_table(ch, &ti);
1171	else if (op3->opcode == IP_FW_TABLE_XFLUSH)
1172		error = flush_table(ch, &ti);
1173	else
1174		return (ENOTSUP);
1175
1176	return (error);
1177}
1178
1179static void
1180restart_flush(void *object, struct op_state *_state)
1181{
1182	struct tableop_state *ts;
1183
1184	ts = (struct tableop_state *)_state;
1185
1186	if (ts->tc != object)
1187		return;
1188
1189	/* Indicate we've called */
1190	ts->modified = 1;
1191}
1192
1193/*
1194 * Flushes given table.
1195 *
1196 * Function create new table instance with the same
1197 * parameters, swaps it with old one and
1198 * flushes state without holding runtime WLOCK.
1199 *
1200 * Returns 0 on success.
1201 */
int
flush_table(struct ip_fw_chain *ch, struct tid_info *ti)
{
	struct namedobj_instance *ni;
	struct table_config *tc;
	struct table_algo *ta;
	struct table_info ti_old, ti_new, *tablestate;
	void *astate_old, *astate_new;
	char algostate[64], *pstate;
	struct tableop_state ts;
	int error, need_gc;
	uint16_t kidx;
	uint8_t tflags;

	/*
	 * Stage 1: save table algorithm.
	 * Reference found table to ensure it won't disappear.
	 */
	IPFW_UH_WLOCK(ch);
	ni = CHAIN_TO_NI(ch);
	if ((tc = find_table(ni, ti)) == NULL) {
		IPFW_UH_WUNLOCK(ch);
		return (ESRCH);
	}
	need_gc = 0;
	astate_new = NULL;
	memset(&ti_new, 0, sizeof(ti_new));
restart:
	/* Set up swap handler so we learn if someone swaps our table */
	memset(&ts, 0, sizeof(ts));
	ts.opstate.func = restart_flush;
	ts.tc = tc;

	ta = tc->ta;
	/* Do not flush readonly tables */
	if ((ta->flags & TA_FLAG_READONLY) != 0) {
		IPFW_UH_WUNLOCK(ch);
		return (EACCES);
	}
	/* Save startup algo parameters so the new instance matches */
	if (ta->print_config != NULL) {
		ta->print_config(tc->astate, KIDX_TO_TI(ch, tc->no.kidx),
		    algostate, sizeof(algostate));
		pstate = algostate;
	} else
		pstate = NULL;
	tflags = tc->tflags;
	/* Hold a reference: we drop the UH lock for the allocation below */
	tc->no.refcnt++;
	add_toperation_state(ch, &ts);
	IPFW_UH_WUNLOCK(ch);

	/*
	 * Stage 1.5: if this is not the first attempt, destroy previous state
	 */
	if (need_gc != 0) {
		ta->destroy(astate_new, &ti_new);
		need_gc = 0;
	}

	/*
	 * Stage 2: allocate new table instance using same algo.
	 * Done without any locks held (ta->init may sleep).
	 */
	memset(&ti_new, 0, sizeof(struct table_info));
	error = ta->init(ch, &astate_new, &ti_new, pstate, tflags);

	/*
	 * Stage 3: swap old state pointers with newly-allocated ones.
	 * Decrease refcount.
	 */
	IPFW_UH_WLOCK(ch);
	tc->no.refcnt--;
	del_toperation_state(ch, &ts);

	if (error != 0) {
		IPFW_UH_WUNLOCK(ch);
		return (error);
	}

	/*
	 * Restart operation if table swap has happened:
	 * even if algo may be the same, algo init parameters
	 * may change. Restart operation instead of doing
	 * complex checks.
	 */
	if (ts.modified != 0) {
		/* Delay destroying data since we're holding UH lock */
		need_gc = 1;
		goto restart;
	}

	ni = CHAIN_TO_NI(ch);
	kidx = tc->no.kidx;
	tablestate = (struct table_info *)ch->tablestate;

	/* Runtime WLOCK: swap lookup state seen by the packet path */
	IPFW_WLOCK(ch);
	ti_old = tablestate[kidx];
	tablestate[kidx] = ti_new;
	IPFW_WUNLOCK(ch);

	astate_old = tc->astate;
	tc->astate = astate_new;
	tc->ti_copy = ti_new;
	tc->count = 0;

	/* Notify algo on real @ti address */
	if (ta->change_ti != NULL)
		ta->change_ti(tc->astate, &tablestate[kidx]);

	/*
	 * Stage 4: unref values.
	 */
	ipfw_unref_table_values(ch, tc, ta, astate_old, &ti_old);
	IPFW_UH_WUNLOCK(ch);

	/*
	 * Stage 5: perform real flush/destroy.
	 * Old state is no longer reachable, so no locks are needed.
	 */
	ta->destroy(astate_old, &ti_old);

	return (0);
}
1323
1324/*
1325 * Swaps two tables.
1326 * Data layout (v0)(current):
1327 * Request: [ ipfw_obj_header ipfw_obj_ntlv ]
1328 *
1329 * Returns 0 on success
1330 */
1331static int
1332swap_table(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
1333    struct sockopt_data *sd)
1334{
1335	int error;
1336	struct _ipfw_obj_header *oh;
1337	struct tid_info ti_a, ti_b;
1338
1339	if (sd->valsize != sizeof(*oh) + sizeof(ipfw_obj_ntlv))
1340		return (EINVAL);
1341
1342	oh = (struct _ipfw_obj_header *)op3;
1343	ntlv_to_ti(&oh->ntlv, &ti_a);
1344	ntlv_to_ti((ipfw_obj_ntlv *)(oh + 1), &ti_b);
1345
1346	error = swap_tables(ch, &ti_a, &ti_b);
1347
1348	return (error);
1349}
1350
1351/*
1352 * Swaps two tables of the same type/valtype.
1353 *
 * Checks that the tables are compatible and that their limits
 * permit the swap, then actually performs the swap.
1356 *
1357 * Each table consists of 2 different parts:
1358 * config:
1359 *   @tc (with name, set, kidx) and rule bindings, which is "stable".
1360 *   number of items
1361 *   table algo
1362 * runtime:
1363 *   runtime data @ti (ch->tablestate)
1364 *   runtime cache in @tc
1365 *   algo-specific data (@tc->astate)
1366 *
1367 * So we switch:
1368 *  all runtime data
1369 *   number of items
1370 *   table algo
1371 *
1372 * After that we call @ti change handler for each table.
1373 *
1374 * Note that referencing @tc won't protect tc->ta from change.
1375 * XXX: Do we need to restrict swap between locked tables?
1376 * XXX: Do we need to exchange ftype?
1377 *
1378 * Returns 0 on success.
1379 */
static int
swap_tables(struct ip_fw_chain *ch, struct tid_info *a,
    struct tid_info *b)
{
	struct namedobj_instance *ni;
	struct table_config *tc_a, *tc_b;
	struct table_algo *ta;
	struct table_info ti, *tablestate;
	void *astate;
	uint32_t count;

	/*
	 * Stage 1: find both tables and ensure they are of
	 * the same type.
	 */
	IPFW_UH_WLOCK(ch);
	ni = CHAIN_TO_NI(ch);
	if ((tc_a = find_table(ni, a)) == NULL) {
		IPFW_UH_WUNLOCK(ch);
		return (ESRCH);
	}
	if ((tc_b = find_table(ni, b)) == NULL) {
		IPFW_UH_WUNLOCK(ch);
		return (ESRCH);
	}

	/* It is very easy to swap between the same table */
	if (tc_a == tc_b) {
		IPFW_UH_WUNLOCK(ch);
		return (0);
	}

	/* Check type and value are the same */
	if (tc_a->no.subtype!=tc_b->no.subtype || tc_a->tflags!=tc_b->tflags) {
		IPFW_UH_WUNLOCK(ch);
		return (EINVAL);
	}

	/* Check limits before swap: neither side may exceed its cap */
	if ((tc_a->limit != 0 && tc_b->count > tc_a->limit) ||
	    (tc_b->limit != 0 && tc_a->count > tc_b->limit)) {
		IPFW_UH_WUNLOCK(ch);
		return (EFBIG);
	}

	/* Check if one of the tables is readonly */
	if (((tc_a->ta->flags | tc_b->ta->flags) & TA_FLAG_READONLY) != 0) {
		IPFW_UH_WUNLOCK(ch);
		return (EACCES);
	}

	/* Notify we're going to swap: abort in-flight add/del operations */
	rollback_toperation_state(ch, tc_a);
	rollback_toperation_state(ch, tc_b);

	/* Everything is fine, prepare to swap. Stash A's runtime state. */
	tablestate = (struct table_info *)ch->tablestate;
	ti = tablestate[tc_a->no.kidx];
	ta = tc_a->ta;
	astate = tc_a->astate;
	count = tc_a->count;

	/* Runtime WLOCK: exchange state visible to the packet path */
	IPFW_WLOCK(ch);
	/* a <- b */
	tablestate[tc_a->no.kidx] = tablestate[tc_b->no.kidx];
	tc_a->ta = tc_b->ta;
	tc_a->astate = tc_b->astate;
	tc_a->count = tc_b->count;
	/* b <- a */
	tablestate[tc_b->no.kidx] = ti;
	tc_b->ta = ta;
	tc_b->astate = astate;
	tc_b->count = count;
	IPFW_WUNLOCK(ch);

	/* Ensure tc.ti copies are in sync */
	tc_a->ti_copy = tablestate[tc_a->no.kidx];
	tc_b->ti_copy = tablestate[tc_b->no.kidx];

	/* Notify both tables on @ti change */
	if (tc_a->ta->change_ti != NULL)
		tc_a->ta->change_ti(tc_a->astate, &tablestate[tc_a->no.kidx]);
	if (tc_b->ta->change_ti != NULL)
		tc_b->ta->change_ti(tc_b->astate, &tablestate[tc_b->no.kidx]);

	IPFW_UH_WUNLOCK(ch);

	return (0);
}
1469
1470/*
1471 * Destroys table specified by @ti.
1472 * Data layout (v0)(current):
1473 * Request: [ ip_fw3_opheader ]
1474 *
1475 * Returns 0 on success
1476 */
static int
destroy_table(struct ip_fw_chain *ch, struct tid_info *ti)
{
	struct namedobj_instance *ni;
	struct table_config *tc;

	IPFW_UH_WLOCK(ch);

	ni = CHAIN_TO_NI(ch);
	if ((tc = find_table(ni, ti)) == NULL) {
		IPFW_UH_WUNLOCK(ch);
		return (ESRCH);
	}

	/* Do not permit destroying referenced tables */
	if (tc->no.refcnt > 0) {
		IPFW_UH_WUNLOCK(ch);
		return (EBUSY);
	}

	/* Runtime WLOCK: make table invisible to the packet path */
	IPFW_WLOCK(ch);
	unlink_table(ch, tc);
	IPFW_WUNLOCK(ch);

	/* Free obj index */
	if (ipfw_objhash_free_idx(ni, tc->no.kidx) != 0)
		printf("Error unlinking kidx %d from table %s\n",
		    tc->no.kidx, tc->tablename);

	/* Unref values used in tables while holding UH lock */
	ipfw_unref_table_values(ch, tc, tc->ta, tc->astate, &tc->ti_copy);
	IPFW_UH_WUNLOCK(ch);

	/* Actual destruction happens lock-free: tc is unreachable now */
	free_table_config(ni, tc);

	return (0);
}
1514
1515static uint32_t
1516roundup2p(uint32_t v)
1517{
1518
1519	return (1 << fls(v - 1));
1520}
1521
1522/*
1523 * Grow tables index.
1524 *
1525 * Returns 0 on success.
1526 */
1527int
1528ipfw_resize_tables(struct ip_fw_chain *ch, unsigned int ntables)
1529{
1530	unsigned int tbl;
1531	struct namedobj_instance *ni;
1532	void *new_idx, *old_tablestate, *tablestate;
1533	struct table_info *ti;
1534	struct table_config *tc;
1535	int i, new_blocks;
1536
1537	/* Check new value for validity */
1538	if (ntables == 0)
1539		return (EINVAL);
1540	if (ntables > IPFW_TABLES_MAX)
1541		ntables = IPFW_TABLES_MAX;
1542	/* Alight to nearest power of 2 */
1543	ntables = (unsigned int)roundup2p(ntables);
1544
1545	/* Allocate new pointers */
1546	tablestate = malloc(ntables * sizeof(struct table_info),
1547	    M_IPFW, M_WAITOK | M_ZERO);
1548
1549	ipfw_objhash_bitmap_alloc(ntables, (void *)&new_idx, &new_blocks);
1550
1551	IPFW_UH_WLOCK(ch);
1552
1553	tbl = (ntables >= V_fw_tables_max) ? V_fw_tables_max : ntables;
1554	ni = CHAIN_TO_NI(ch);
1555
1556	/* Temporary restrict decreasing max_tables */
1557	if (ntables < V_fw_tables_max) {
1558		/*
1559		 * FIXME: Check if we really can shrink
1560		 */
1561		IPFW_UH_WUNLOCK(ch);
1562		return (EINVAL);
1563	}
1564
1565	/* Copy table info/indices */
1566	memcpy(tablestate, ch->tablestate, sizeof(struct table_info) * tbl);
1567	ipfw_objhash_bitmap_merge(ni, &new_idx, &new_blocks);
1568
1569	IPFW_WLOCK(ch);
1570
1571	/* Change pointers */
1572	old_tablestate = ch->tablestate;
1573	ch->tablestate = tablestate;
1574	ipfw_objhash_bitmap_swap(ni, &new_idx, &new_blocks);
1575
1576	V_fw_tables_max = ntables;
1577
1578	IPFW_WUNLOCK(ch);
1579
1580	/* Notify all consumers that their @ti pointer has changed */
1581	ti = (struct table_info *)ch->tablestate;
1582	for (i = 0; i < tbl; i++, ti++) {
1583		if (ti->lookup == NULL)
1584			continue;
1585		tc = (struct table_config *)ipfw_objhash_lookup_kidx(ni, i);
1586		if (tc == NULL || tc->ta->change_ti == NULL)
1587			continue;
1588
1589		tc->ta->change_ti(tc->astate, ti);
1590	}
1591
1592	IPFW_UH_WUNLOCK(ch);
1593
1594	/* Free old pointers */
1595	free(old_tablestate, M_IPFW);
1596	ipfw_objhash_bitmap_free(new_idx, new_blocks);
1597
1598	return (0);
1599}
1600
1601/*
1602 * Lookup table's named object by its @kidx.
1603 */
struct named_object *
ipfw_objhash_lookup_table_kidx(struct ip_fw_chain *ch, uint16_t kidx)
{

	/* Thin wrapper: resolve @kidx within the tables' object hash. */
	return (ipfw_objhash_lookup_kidx(CHAIN_TO_NI(ch), kidx));
}
1610
1611/*
1612 * Take reference to table specified in @ntlv.
1613 * On success return its @kidx.
1614 */
1615int
1616ipfw_ref_table(struct ip_fw_chain *ch, ipfw_obj_ntlv *ntlv, uint16_t *kidx)
1617{
1618	struct tid_info ti;
1619	struct table_config *tc;
1620	int error;
1621
1622	IPFW_UH_WLOCK_ASSERT(ch);
1623
1624	ntlv_to_ti(ntlv, &ti);
1625	error = find_table_err(CHAIN_TO_NI(ch), &ti, &tc);
1626	if (error != 0)
1627		return (error);
1628
1629	if (tc == NULL)
1630		return (ESRCH);
1631
1632	tc_ref(tc);
1633	*kidx = tc->no.kidx;
1634
1635	return (0);
1636}
1637
1638void
1639ipfw_unref_table(struct ip_fw_chain *ch, uint16_t kidx)
1640{
1641
1642	struct namedobj_instance *ni;
1643	struct named_object *no;
1644
1645	IPFW_UH_WLOCK_ASSERT(ch);
1646	ni = CHAIN_TO_NI(ch);
1647	no = ipfw_objhash_lookup_kidx(ni, kidx);
1648	KASSERT(no != NULL, ("Table with index %d not found", kidx));
1649	no->refcnt--;
1650}
1651
1652/*
1653 * Lookup an arbitrary key @paddr of length @plen in table @tbl.
1654 * Stores found value in @val.
1655 *
1656 * Returns 1 if key was found.
1657 */
1658int
1659ipfw_lookup_table(struct ip_fw_chain *ch, uint16_t tbl, uint16_t plen,
1660    void *paddr, uint32_t *val)
1661{
1662	struct table_info *ti;
1663
1664	ti = KIDX_TO_TI(ch, tbl);
1665
1666	return (ti->lookup(ti, paddr, plen, val));
1667}
1668
1669/*
1670 * Info/List/dump support for tables.
1671 *
1672 */
1673
1674/*
1675 * High-level 'get' cmds sysctl handlers
1676 */
1677
1678/*
1679 * Lists all tables currently available in kernel.
1680 * Data layout (v0)(current):
1681 * Request: [ ipfw_obj_lheader ], size = ipfw_obj_lheader.size
1682 * Reply: [ ipfw_obj_lheader ipfw_xtable_info x N ]
1683 *
1684 * Returns 0 on success
1685 */
1686static int
1687list_tables(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
1688    struct sockopt_data *sd)
1689{
1690	struct _ipfw_obj_lheader *olh;
1691	int error;
1692
1693	olh = (struct _ipfw_obj_lheader *)ipfw_get_sopt_header(sd,sizeof(*olh));
1694	if (olh == NULL)
1695		return (EINVAL);
1696	if (sd->valsize < olh->size)
1697		return (EINVAL);
1698
1699	IPFW_UH_RLOCK(ch);
1700	error = export_tables(ch, olh, sd);
1701	IPFW_UH_RUNLOCK(ch);
1702
1703	return (error);
1704}
1705
1706/*
1707 * Store table info to buffer provided by @sd.
1708 * Data layout (v0)(current):
1709 * Request: [ ipfw_obj_header ipfw_xtable_info(empty)]
1710 * Reply: [ ipfw_obj_header ipfw_xtable_info ]
1711 *
1712 * Returns 0 on success.
1713 */
1714static int
1715describe_table(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
1716    struct sockopt_data *sd)
1717{
1718	struct _ipfw_obj_header *oh;
1719	struct table_config *tc;
1720	struct tid_info ti;
1721	size_t sz;
1722
1723	sz = sizeof(*oh) + sizeof(ipfw_xtable_info);
1724	oh = (struct _ipfw_obj_header *)ipfw_get_sopt_header(sd, sz);
1725	if (oh == NULL)
1726		return (EINVAL);
1727
1728	objheader_to_ti(oh, &ti);
1729
1730	IPFW_UH_RLOCK(ch);
1731	if ((tc = find_table(CHAIN_TO_NI(ch), &ti)) == NULL) {
1732		IPFW_UH_RUNLOCK(ch);
1733		return (ESRCH);
1734	}
1735
1736	export_table_info(ch, tc, (ipfw_xtable_info *)(oh + 1));
1737	IPFW_UH_RUNLOCK(ch);
1738
1739	return (0);
1740}
1741
1742/*
1743 * Modifies existing table.
1744 * Data layout (v0)(current):
1745 * Request: [ ipfw_obj_header ipfw_xtable_info ]
1746 *
1747 * Returns 0 on success
1748 */
1749static int
1750modify_table(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
1751    struct sockopt_data *sd)
1752{
1753	struct _ipfw_obj_header *oh;
1754	ipfw_xtable_info *i;
1755	char *tname;
1756	struct tid_info ti;
1757	struct namedobj_instance *ni;
1758	struct table_config *tc;
1759
1760	if (sd->valsize != sizeof(*oh) + sizeof(ipfw_xtable_info))
1761		return (EINVAL);
1762
1763	oh = (struct _ipfw_obj_header *)sd->kbuf;
1764	i = (ipfw_xtable_info *)(oh + 1);
1765
1766	/*
1767	 * Verify user-supplied strings.
1768	 * Check for null-terminated/zero-length strings/
1769	 */
1770	tname = oh->ntlv.name;
1771	if (check_table_name(tname) != 0)
1772		return (EINVAL);
1773
1774	objheader_to_ti(oh, &ti);
1775	ti.type = i->type;
1776
1777	IPFW_UH_WLOCK(ch);
1778	ni = CHAIN_TO_NI(ch);
1779	if ((tc = find_table(ni, &ti)) == NULL) {
1780		IPFW_UH_WUNLOCK(ch);
1781		return (ESRCH);
1782	}
1783
1784	/* Do not support any modifications for readonly tables */
1785	if ((tc->ta->flags & TA_FLAG_READONLY) != 0) {
1786		IPFW_UH_WUNLOCK(ch);
1787		return (EACCES);
1788	}
1789
1790	if ((i->mflags & IPFW_TMFLAGS_LIMIT) != 0)
1791		tc->limit = i->limit;
1792	if ((i->mflags & IPFW_TMFLAGS_LOCK) != 0)
1793		tc->locked = ((i->flags & IPFW_TGFLAGS_LOCKED) != 0);
1794	IPFW_UH_WUNLOCK(ch);
1795
1796	return (0);
1797}
1798
1799/*
1800 * Creates new table.
1801 * Data layout (v0)(current):
1802 * Request: [ ipfw_obj_header ipfw_xtable_info ]
1803 *
1804 * Returns 0 on success
1805 */
1806static int
1807create_table(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
1808    struct sockopt_data *sd)
1809{
1810	struct _ipfw_obj_header *oh;
1811	ipfw_xtable_info *i;
1812	char *tname, *aname;
1813	struct tid_info ti;
1814	struct namedobj_instance *ni;
1815
1816	if (sd->valsize != sizeof(*oh) + sizeof(ipfw_xtable_info))
1817		return (EINVAL);
1818
1819	oh = (struct _ipfw_obj_header *)sd->kbuf;
1820	i = (ipfw_xtable_info *)(oh + 1);
1821
1822	/*
1823	 * Verify user-supplied strings.
1824	 * Check for null-terminated/zero-length strings/
1825	 */
1826	tname = oh->ntlv.name;
1827	aname = i->algoname;
1828	if (check_table_name(tname) != 0 ||
1829	    strnlen(aname, sizeof(i->algoname)) == sizeof(i->algoname))
1830		return (EINVAL);
1831
1832	if (aname[0] == '\0') {
1833		/* Use default algorithm */
1834		aname = NULL;
1835	}
1836
1837	objheader_to_ti(oh, &ti);
1838	ti.type = i->type;
1839
1840	ni = CHAIN_TO_NI(ch);
1841
1842	IPFW_UH_RLOCK(ch);
1843	if (find_table(ni, &ti) != NULL) {
1844		IPFW_UH_RUNLOCK(ch);
1845		return (EEXIST);
1846	}
1847	IPFW_UH_RUNLOCK(ch);
1848
1849	return (create_table_internal(ch, &ti, aname, i, NULL, 0));
1850}
1851
1852/*
1853 * Creates new table based on @ti and @aname.
1854 *
1855 * Assume @aname to be checked and valid.
1856 * Stores allocated table kidx inside @pkidx (if non-NULL).
1857 * Reference created table if @compat is non-zero.
1858 *
1859 * Returns 0 on success.
1860 */
1861static int
1862create_table_internal(struct ip_fw_chain *ch, struct tid_info *ti,
1863    char *aname, ipfw_xtable_info *i, uint16_t *pkidx, int compat)
1864{
1865	struct namedobj_instance *ni;
1866	struct table_config *tc, *tc_new, *tmp;
1867	struct table_algo *ta;
1868	uint16_t kidx;
1869
1870	ni = CHAIN_TO_NI(ch);
1871
1872	ta = find_table_algo(CHAIN_TO_TCFG(ch), ti, aname);
1873	if (ta == NULL)
1874		return (ENOTSUP);
1875
1876	tc = alloc_table_config(ch, ti, ta, aname, i->tflags);
1877	if (tc == NULL)
1878		return (ENOMEM);
1879
1880	tc->vmask = i->vmask;
1881	tc->limit = i->limit;
1882	if (ta->flags & TA_FLAG_READONLY)
1883		tc->locked = 1;
1884	else
1885		tc->locked = (i->flags & IPFW_TGFLAGS_LOCKED) != 0;
1886
1887	IPFW_UH_WLOCK(ch);
1888
1889	/* Check if table has been already created */
1890	tc_new = find_table(ni, ti);
1891	if (tc_new != NULL) {
1892		/*
1893		 * Compat: do not fail if we're
1894		 * requesting to create existing table
1895		 * which has the same type
1896		 */
1897		if (compat == 0 || tc_new->no.subtype != tc->no.subtype) {
1898			IPFW_UH_WUNLOCK(ch);
1899			free_table_config(ni, tc);
1900			return (EEXIST);
1901		}
1902
1903		/* Exchange tc and tc_new for proper refcounting & freeing */
1904		tmp = tc;
1905		tc = tc_new;
1906		tc_new = tmp;
1907	} else {
1908		/* New table */
1909		if (ipfw_objhash_alloc_idx(ni, &kidx) != 0) {
1910			IPFW_UH_WUNLOCK(ch);
1911			printf("Unable to allocate table index."
1912			    " Consider increasing net.inet.ip.fw.tables_max");
1913			free_table_config(ni, tc);
1914			return (EBUSY);
1915		}
1916		tc->no.kidx = kidx;
1917		tc->no.etlv = IPFW_TLV_TBL_NAME;
1918
1919		link_table(ch, tc);
1920	}
1921
1922	if (compat != 0)
1923		tc->no.refcnt++;
1924	if (pkidx != NULL)
1925		*pkidx = tc->no.kidx;
1926
1927	IPFW_UH_WUNLOCK(ch);
1928
1929	if (tc_new != NULL)
1930		free_table_config(ni, tc_new);
1931
1932	return (0);
1933}
1934
1935static void
1936ntlv_to_ti(ipfw_obj_ntlv *ntlv, struct tid_info *ti)
1937{
1938
1939	memset(ti, 0, sizeof(struct tid_info));
1940	ti->set = ntlv->set;
1941	ti->uidx = ntlv->idx;
1942	ti->tlvs = ntlv;
1943	ti->tlen = ntlv->head.length;
1944}
1945
/*
 * Fills in @ti from the name TLV embedded in object header @oh.
 */
static void
objheader_to_ti(struct _ipfw_obj_header *oh, struct tid_info *ti)
{

	ntlv_to_ti(&oh->ntlv, ti);
}
1952
/*
 * Returns the named-object hash holding all table configs of @ch.
 */
struct namedobj_instance *
ipfw_get_table_objhash(struct ip_fw_chain *ch)
{

	return (CHAIN_TO_NI(ch));
}
1959
1960/*
1961 * Exports basic table info as name TLV.
1962 * Used inside dump_static_rules() to provide info
1963 * about all tables referenced by current ruleset.
1964 *
1965 * Returns 0 on success.
1966 */
1967int
1968ipfw_export_table_ntlv(struct ip_fw_chain *ch, uint16_t kidx,
1969    struct sockopt_data *sd)
1970{
1971	struct namedobj_instance *ni;
1972	struct named_object *no;
1973	ipfw_obj_ntlv *ntlv;
1974
1975	ni = CHAIN_TO_NI(ch);
1976
1977	no = ipfw_objhash_lookup_kidx(ni, kidx);
1978	KASSERT(no != NULL, ("invalid table kidx passed"));
1979
1980	ntlv = (ipfw_obj_ntlv *)ipfw_get_sopt_space(sd, sizeof(*ntlv));
1981	if (ntlv == NULL)
1982		return (ENOMEM);
1983
1984	ntlv->head.type = IPFW_TLV_TBL_NAME;
1985	ntlv->head.length = sizeof(*ntlv);
1986	ntlv->idx = no->kidx;
1987	strlcpy(ntlv->name, no->name, sizeof(ntlv->name));
1988
1989	return (0);
1990}
1991
/*
 * Shared state for table enumeration callbacks
 * (count_ext_entries, dump_table_entry, dump_table_tentry, ...).
 */
struct dump_args {
	struct ip_fw_chain *ch;		/* chain being dumped */
	struct table_info *ti;		/* runtime table info */
	struct table_config *tc;	/* table configuration */
	struct sockopt_data *sd;	/* output buffer (v1 dumps) */
	uint32_t cnt;			/* entries emitted/counted so far */
	uint16_t uidx;			/* userland table index to report */
	int error;			/* first error seen by callback */
	uint32_t size;			/* output capacity (legacy dumps) */
	ipfw_table_entry *ent;		/* next legacy output slot */
	ta_foreach_f *f;		/* optional chained callback */
	void *farg;			/* argument for @f */
	ipfw_obj_tentry tent;		/* scratch entry for conversion */
};
2006
2007static int
2008count_ext_entries(void *e, void *arg)
2009{
2010	struct dump_args *da;
2011
2012	da = (struct dump_args *)arg;
2013	da->cnt++;
2014
2015	return (0);
2016}
2017
2018/*
2019 * Gets number of items from table either using
2020 * internal counter or calling algo callback for
2021 * externally-managed tables.
2022 *
2023 * Returns number of records.
2024 */
2025static uint32_t
2026table_get_count(struct ip_fw_chain *ch, struct table_config *tc)
2027{
2028	struct table_info *ti;
2029	struct table_algo *ta;
2030	struct dump_args da;
2031
2032	ti = KIDX_TO_TI(ch, tc->no.kidx);
2033	ta = tc->ta;
2034
2035	/* Use internal counter for self-managed tables */
2036	if ((ta->flags & TA_FLAG_READONLY) == 0)
2037		return (tc->count);
2038
2039	/* Use callback to quickly get number of items */
2040	if ((ta->flags & TA_FLAG_EXTCOUNTER) != 0)
2041		return (ta->get_count(tc->astate, ti));
2042
2043	/* Count number of iterms ourselves */
2044	memset(&da, 0, sizeof(da));
2045	ta->foreach(tc->astate, ti, count_ext_entries, &da);
2046
2047	return (da.cnt);
2048}
2049
2050/*
2051 * Exports table @tc info into standard ipfw_xtable_info format.
2052 */
2053static void
2054export_table_info(struct ip_fw_chain *ch, struct table_config *tc,
2055    ipfw_xtable_info *i)
2056{
2057	struct table_info *ti;
2058	struct table_algo *ta;
2059
2060	i->type = tc->no.subtype;
2061	i->tflags = tc->tflags;
2062	i->vmask = tc->vmask;
2063	i->set = tc->no.set;
2064	i->kidx = tc->no.kidx;
2065	i->refcnt = tc->no.refcnt;
2066	i->count = table_get_count(ch, tc);
2067	i->limit = tc->limit;
2068	i->flags |= (tc->locked != 0) ? IPFW_TGFLAGS_LOCKED : 0;
2069	i->size = i->count * sizeof(ipfw_obj_tentry);
2070	i->size += sizeof(ipfw_obj_header) + sizeof(ipfw_xtable_info);
2071	strlcpy(i->tablename, tc->tablename, sizeof(i->tablename));
2072	ti = KIDX_TO_TI(ch, tc->no.kidx);
2073	ta = tc->ta;
2074	if (ta->print_config != NULL) {
2075		/* Use algo function to print table config to string */
2076		ta->print_config(tc->astate, ti, i->algoname,
2077		    sizeof(i->algoname));
2078	} else
2079		strlcpy(i->algoname, ta->name, sizeof(i->algoname));
2080	/* Dump algo-specific data, if possible */
2081	if (ta->dump_tinfo != NULL) {
2082		ta->dump_tinfo(tc->astate, ti, &i->ta_info);
2083		i->ta_info.flags |= IPFW_TATFLAGS_DATA;
2084	}
2085}
2086
/* Argument bundle for export_table_internal() objhash iteration. */
struct dump_table_args {
	struct ip_fw_chain *ch;		/* chain being exported */
	struct sockopt_data *sd;	/* pre-sized output buffer */
};
2091
2092static int
2093export_table_internal(struct namedobj_instance *ni, struct named_object *no,
2094    void *arg)
2095{
2096	ipfw_xtable_info *i;
2097	struct dump_table_args *dta;
2098
2099	dta = (struct dump_table_args *)arg;
2100
2101	i = (ipfw_xtable_info *)ipfw_get_sopt_space(dta->sd, sizeof(*i));
2102	KASSERT(i != NULL, ("previously checked buffer is not enough"));
2103
2104	export_table_info(dta->ch, (struct table_config *)no, i);
2105	return (0);
2106}
2107
2108/*
2109 * Export all tables as ipfw_xtable_info structures to
2110 * storage provided by @sd.
2111 *
2112 * If supplied buffer is too small, fills in required size
2113 * and returns ENOMEM.
2114 * Returns 0 on success.
2115 */
2116static int
2117export_tables(struct ip_fw_chain *ch, ipfw_obj_lheader *olh,
2118    struct sockopt_data *sd)
2119{
2120	uint32_t size;
2121	uint32_t count;
2122	struct dump_table_args dta;
2123
2124	count = ipfw_objhash_count(CHAIN_TO_NI(ch));
2125	size = count * sizeof(ipfw_xtable_info) + sizeof(ipfw_obj_lheader);
2126
2127	/* Fill in header regadless of buffer size */
2128	olh->count = count;
2129	olh->objsize = sizeof(ipfw_xtable_info);
2130
2131	if (size > olh->size) {
2132		olh->size = size;
2133		return (ENOMEM);
2134	}
2135
2136	olh->size = size;
2137
2138	dta.ch = ch;
2139	dta.sd = sd;
2140
2141	ipfw_objhash_foreach(CHAIN_TO_NI(ch), export_table_internal, &dta);
2142
2143	return (0);
2144}
2145
2146/*
2147 * Dumps all table data
2148 * Data layout (v1)(current):
2149 * Request: [ ipfw_obj_header ], size = ipfw_xtable_info.size
2150 * Reply: [ ipfw_obj_header ipfw_xtable_info ipfw_obj_tentry x N ]
2151 *
2152 * Returns 0 on success
2153 */
static int
dump_table_v1(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
    struct sockopt_data *sd)
{
	struct _ipfw_obj_header *oh;
	ipfw_xtable_info *i;
	struct tid_info ti;
	struct table_config *tc;
	struct table_algo *ta;
	struct dump_args da;
	uint32_t sz;

	sz = sizeof(ipfw_obj_header) + sizeof(ipfw_xtable_info);
	oh = (struct _ipfw_obj_header *)ipfw_get_sopt_header(sd, sz);
	if (oh == NULL)
		return (EINVAL);

	i = (ipfw_xtable_info *)(oh + 1);
	objheader_to_ti(oh, &ti);

	IPFW_UH_RLOCK(ch);
	if ((tc = find_table(CHAIN_TO_NI(ch), &ti)) == NULL) {
		IPFW_UH_RUNLOCK(ch);
		return (ESRCH);
	}
	/* Always export table info, even if entries won't fit */
	export_table_info(ch, tc, i);

	if (sd->valsize < i->size) {
		/*
		 * Submitted buffer size is not enough.
		 * We've already filled in @i structure with
		 * relevant table info including size, so we
		 * can return. Buffer will be flushed automatically.
		 */
		IPFW_UH_RUNLOCK(ch);
		return (ENOMEM);
	}

	/*
	 * Do the actual dump in eXtended format
	 */
	memset(&da, 0, sizeof(da));
	da.ch = ch;
	da.ti = KIDX_TO_TI(ch, tc->no.kidx);
	da.tc = tc;
	da.sd = sd;

	ta = tc->ta;

	/* da.error is set by the callback on output-space exhaustion */
	ta->foreach(tc->astate, da.ti, dump_table_tentry, &da);
	IPFW_UH_RUNLOCK(ch);

	return (da.error);
}
2208
2209/*
2210 * Dumps all table data
2211 * Data layout (version 0)(legacy):
2212 * Request: [ ipfw_xtable ], size = IP_FW_TABLE_XGETSIZE()
2213 * Reply: [ ipfw_xtable ipfw_table_xentry x N ]
2214 *
2215 * Returns 0 on success
2216 */
static int
dump_table_v0(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
    struct sockopt_data *sd)
{
	ipfw_xtable *xtbl;
	struct tid_info ti;
	struct table_config *tc;
	struct table_algo *ta;
	struct dump_args da;
	size_t sz, count;

	xtbl = (ipfw_xtable *)ipfw_get_sopt_header(sd, sizeof(ipfw_xtable));
	if (xtbl == NULL)
		return (EINVAL);

	memset(&ti, 0, sizeof(ti));
	ti.uidx = xtbl->tbl;

	IPFW_UH_RLOCK(ch);
	if ((tc = find_table(CHAIN_TO_NI(ch), &ti)) == NULL) {
		/* Legacy behavior: missing table is not an error here */
		IPFW_UH_RUNLOCK(ch);
		return (0);
	}
	count = table_get_count(ch, tc);
	sz = count * sizeof(ipfw_table_xentry) + sizeof(ipfw_xtable);

	xtbl->cnt = count;
	xtbl->size = sz;
	xtbl->type = tc->no.subtype;
	xtbl->tbl = ti.uidx;

	if (sd->valsize < sz) {
		/*
		 * Submitted buffer size is not enough.
		 * We've already filled in @xtbl structure with
		 * relevant table info including size, so we
		 * can return. Buffer will be flushed automatically.
		 */
		IPFW_UH_RUNLOCK(ch);
		return (ENOMEM);
	}

	/* Do the actual dump in eXtended format */
	memset(&da, 0, sizeof(da));
	da.ch = ch;
	da.ti = KIDX_TO_TI(ch, tc->no.kidx);
	da.tc = tc;
	da.sd = sd;

	ta = tc->ta;

	ta->foreach(tc->astate, da.ti, dump_table_xentry, &da);
	IPFW_UH_RUNLOCK(ch);

	return (0);
}
2273
2274/*
2275 * Legacy function to retrieve number of items in table.
2276 */
2277static int
2278get_table_size(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
2279    struct sockopt_data *sd)
2280{
2281	uint32_t *tbl;
2282	struct tid_info ti;
2283	size_t sz;
2284	int error;
2285
2286	sz = sizeof(*op3) + sizeof(uint32_t);
2287	op3 = (ip_fw3_opheader *)ipfw_get_sopt_header(sd, sz);
2288	if (op3 == NULL)
2289		return (EINVAL);
2290
2291	tbl = (uint32_t *)(op3 + 1);
2292	memset(&ti, 0, sizeof(ti));
2293	ti.uidx = *tbl;
2294	IPFW_UH_RLOCK(ch);
2295	error = ipfw_count_xtable(ch, &ti, tbl);
2296	IPFW_UH_RUNLOCK(ch);
2297	return (error);
2298}
2299
2300/*
2301 * Legacy IP_FW_TABLE_GETSIZE handler
2302 */
2303int
2304ipfw_count_table(struct ip_fw_chain *ch, struct tid_info *ti, uint32_t *cnt)
2305{
2306	struct table_config *tc;
2307
2308	if ((tc = find_table(CHAIN_TO_NI(ch), ti)) == NULL)
2309		return (ESRCH);
2310	*cnt = table_get_count(ch, tc);
2311	return (0);
2312}
2313
2314/*
2315 * Legacy IP_FW_TABLE_XGETSIZE handler
2316 */
2317int
2318ipfw_count_xtable(struct ip_fw_chain *ch, struct tid_info *ti, uint32_t *cnt)
2319{
2320	struct table_config *tc;
2321	uint32_t count;
2322
2323	if ((tc = find_table(CHAIN_TO_NI(ch), ti)) == NULL) {
2324		*cnt = 0;
2325		return (0); /* 'table all list' requires success */
2326	}
2327
2328	count = table_get_count(ch, tc);
2329	*cnt = count * sizeof(ipfw_table_xentry);
2330	if (count > 0)
2331		*cnt += sizeof(ipfw_xtable);
2332	return (0);
2333}
2334
/*
 * ta->foreach() callback: converts one entry to the pre-8.1
 * legacy ipfw_table_entry format (IPv4 only).
 * Returns non-zero to stop the traversal.
 */
static int
dump_table_entry(void *e, void *arg)
{
	struct dump_args *da;
	struct table_config *tc;
	struct table_algo *ta;
	ipfw_table_entry *ent;
	struct table_value *pval;
	int error;

	da = (struct dump_args *)arg;

	tc = da->tc;
	ta = tc->ta;

	/* Out of memory, returning */
	if (da->cnt == da->size)
		return (1);
	/* NOTE(review): the slot is consumed (ent/cnt advanced) even if
	 * dump_tentry below fails — presumably acceptable for this
	 * best-effort legacy format. */
	ent = da->ent++;
	ent->tbl = da->uidx;
	da->cnt++;

	error = ta->dump_tentry(tc->astate, da->ti, e, &da->tent);
	if (error != 0)
		return (error);

	ent->addr = da->tent.k.addr.s_addr;
	ent->masklen = da->tent.masklen;
	/* Resolve value kidx into the legacy 32-bit value */
	pval = get_table_value(da->ch, da->tc, da->tent.v.kidx);
	ent->value = ipfw_export_table_value_legacy(pval);

	return (0);
}
2368
2369/*
2370 * Dumps table in pre-8.1 legacy format.
2371 */
2372int
2373ipfw_dump_table_legacy(struct ip_fw_chain *ch, struct tid_info *ti,
2374    ipfw_table *tbl)
2375{
2376	struct table_config *tc;
2377	struct table_algo *ta;
2378	struct dump_args da;
2379
2380	tbl->cnt = 0;
2381
2382	if ((tc = find_table(CHAIN_TO_NI(ch), ti)) == NULL)
2383		return (0);	/* XXX: We should return ESRCH */
2384
2385	ta = tc->ta;
2386
2387	/* This dump format supports IPv4 only */
2388	if (tc->no.subtype != IPFW_TABLE_ADDR)
2389		return (0);
2390
2391	memset(&da, 0, sizeof(da));
2392	da.ch = ch;
2393	da.ti = KIDX_TO_TI(ch, tc->no.kidx);
2394	da.tc = tc;
2395	da.ent = &tbl->ent[0];
2396	da.size = tbl->size;
2397
2398	tbl->cnt = 0;
2399	ta->foreach(tc->astate, da.ti, dump_table_entry, &da);
2400	tbl->cnt = da.cnt;
2401
2402	return (0);
2403}
2404
2405/*
2406 * Dumps table entry in eXtended format (v1)(current).
2407 */
static int
dump_table_tentry(void *e, void *arg)
{
	struct dump_args *da;
	struct table_config *tc;
	struct table_algo *ta;
	struct table_value *pval;
	ipfw_obj_tentry *tent;
	int error;

	da = (struct dump_args *)arg;

	tc = da->tc;
	ta = tc->ta;

	/* Reserve space for one entry directly in the sockopt buffer */
	tent = (ipfw_obj_tentry *)ipfw_get_sopt_space(da->sd, sizeof(*tent));
	/* Out of memory, returning */
	if (tent == NULL) {
		/* Record ENOMEM so the caller can distinguish truncation */
		da->error = ENOMEM;
		return (1);
	}
	tent->head.length = sizeof(ipfw_obj_tentry);
	tent->idx = da->uidx;

	/* Algorithm fills in key/mask/value index for entry @e */
	error = ta->dump_tentry(tc->astate, da->ti, e, tent);
	if (error != 0)
		return (error);

	/* Resolve value kidx into the exported v1 value format */
	pval = get_table_value(da->ch, da->tc, tent->v.kidx);
	ipfw_export_table_value_v1(pval, &tent->v.value);

	return (0);
}
2441
2442/*
2443 * Dumps table entry in eXtended format (v0).
2444 */
2445static int
2446dump_table_xentry(void *e, void *arg)
2447{
2448	struct dump_args *da;
2449	struct table_config *tc;
2450	struct table_algo *ta;
2451	ipfw_table_xentry *xent;
2452	ipfw_obj_tentry *tent;
2453	struct table_value *pval;
2454	int error;
2455
2456	da = (struct dump_args *)arg;
2457
2458	tc = da->tc;
2459	ta = tc->ta;
2460
2461	xent = (ipfw_table_xentry *)ipfw_get_sopt_space(da->sd, sizeof(*xent));
2462	/* Out of memory, returning */
2463	if (xent == NULL)
2464		return (1);
2465	xent->len = sizeof(ipfw_table_xentry);
2466	xent->tbl = da->uidx;
2467
2468	memset(&da->tent, 0, sizeof(da->tent));
2469	tent = &da->tent;
2470	error = ta->dump_tentry(tc->astate, da->ti, e, tent);
2471	if (error != 0)
2472		return (error);
2473
2474	/* Convert current format to previous one */
2475	xent->masklen = tent->masklen;
2476	pval = get_table_value(da->ch, da->tc, da->tent.v.kidx);
2477	xent->value = ipfw_export_table_value_legacy(pval);
2478	/* Apply some hacks */
2479	if (tc->no.subtype == IPFW_TABLE_ADDR && tent->subtype == AF_INET) {
2480		xent->k.addr6.s6_addr32[3] = tent->k.addr.s_addr;
2481		xent->flags = IPFW_TCF_INET;
2482	} else
2483		memcpy(&xent->k, &tent->k, sizeof(xent->k));
2484
2485	return (0);
2486}
2487
2488/*
2489 * Helper function to export table algo data
2490 * to tentry format before calling user function.
2491 *
2492 * Returns 0 on success.
2493 */
static int
prepare_table_tentry(void *e, void *arg)
{
	struct dump_args *da;
	struct table_config *tc;
	struct table_algo *ta;
	int error;

	da = (struct dump_args *)arg;

	tc = da->tc;
	ta = tc->ta;

	/* Render raw algo entry @e into the generic tentry format */
	error = ta->dump_tentry(tc->astate, da->ti, e, &da->tent);
	if (error != 0)
		return (error);

	/* Hand the converted entry to the external consumer callback */
	da->f(&da->tent, da->farg);

	return (0);
}
2515
2516/*
2517 * Allow external consumers to read table entries in standard format.
2518 */
int
ipfw_foreach_table_tentry(struct ip_fw_chain *ch, uint16_t kidx,
    ta_foreach_f *f, void *arg)
{
	struct namedobj_instance *ni;
	struct table_config *tc;
	struct table_algo *ta;
	struct dump_args da;

	/* NOTE(review): no lock is taken here — caller presumably holds
	 * the UH lock to keep @tc stable; confirm against callers. */
	ni = CHAIN_TO_NI(ch);

	tc = (struct table_config *)ipfw_objhash_lookup_kidx(ni, kidx);
	if (tc == NULL)
		return (ESRCH);

	ta = tc->ta;

	memset(&da, 0, sizeof(da));
	da.ch = ch;
	da.ti = KIDX_TO_TI(ch, tc->no.kidx);
	da.tc = tc;
	da.f = f;
	da.farg = arg;

	/* prepare_table_tentry() converts each entry and invokes @f */
	ta->foreach(tc->astate, da.ti, prepare_table_tentry, &da);

	return (0);
}
2547
2548/*
2549 * Table algorithms
2550 */
2551
2552/*
2553 * Finds algorithm by index, table type or supplied name.
2554 *
2555 * Returns pointer to algo or NULL.
2556 */
static struct table_algo *
find_table_algo(struct tables_config *tcfg, struct tid_info *ti, char *name)
{
	int i, l;
	struct table_algo *ta;

	if (ti->type > IPFW_TABLE_MAXTYPE)
		return (NULL);

	/* Search by index (algo array is 1-based, see ipfw_add_table_algo) */
	if (ti->atype != 0) {
		if (ti->atype > tcfg->algo_count)
			return (NULL);
		return (tcfg->algo[ti->atype]);
	}

	if (name == NULL) {
		/* Return default algorithm for given type if set */
		return (tcfg->def_algo[ti->type]);
	}

	/* Search by name */
	/* TODO: better search */
	for (i = 1; i <= tcfg->algo_count; i++) {
		ta = tcfg->algo[i];

		/*
		 * One can supply additional algorithm
		 * parameters so we compare only the first word
		 * of supplied name:
		 * 'addr:chash hsize=32'
		 * '^^^^^^^^^'
		 *
		 */
		l = strlen(ta->name);
		if (strncmp(name, ta->name, l) != 0)
			continue;
		/* Reject prefix-only matches: next char must end the word */
		if (name[l] != '\0' && name[l] != ' ')
			continue;
		/* Check if we're requesting proper table type */
		if (ti->type != 0 && ti->type != ta->type)
			return (NULL);
		return (ta);
	}

	return (NULL);
}
2604
2605/*
2606 * Register new table algo @ta.
2607 * Stores algo id inside @idx.
2608 *
2609 * Returns 0 on success.
2610 */
int
ipfw_add_table_algo(struct ip_fw_chain *ch, struct table_algo *ta, size_t size,
    int *idx)
{
	struct tables_config *tcfg;
	struct table_algo *ta_new;
	size_t sz;

	/* Caller's structure must not be newer (larger) than ours */
	if (size > sizeof(struct table_algo))
		return (EINVAL);

	/* Check for the required on-stack size for add/del */
	sz = roundup2(ta->ta_buf_size, sizeof(void *));
	if (sz > TA_BUF_SZ)
		return (EINVAL);

	KASSERT(ta->type <= IPFW_TABLE_MAXTYPE,("Increase IPFW_TABLE_MAXTYPE"));

	/* Copy algorithm data to stable storage. */
	ta_new = malloc(sizeof(struct table_algo), M_IPFW, M_WAITOK | M_ZERO);
	memcpy(ta_new, ta, size);

	tcfg = CHAIN_TO_TCFG(ch);

	KASSERT(tcfg->algo_count < 255, ("Increase algo array size"));

	/* Slot 0 is unused: index 0 means "no algo" in tid_info.atype */
	tcfg->algo[++tcfg->algo_count] = ta_new;
	ta_new->idx = tcfg->algo_count;

	/* Set algorithm as default one for given type */
	if ((ta_new->flags & TA_FLAG_DEFAULT) != 0 &&
	    tcfg->def_algo[ta_new->type] == NULL)
		tcfg->def_algo[ta_new->type] = ta_new;

	*idx = ta_new->idx;

	return (0);
}
2649
2650/*
2651 * Unregisters table algo using @idx as id.
2652 * XXX: It is NOT safe to call this function in any place
2653 * other than ipfw instance destroy handler.
2654 */
void
ipfw_del_table_algo(struct ip_fw_chain *ch, int idx)
{
	struct tables_config *tcfg;
	struct table_algo *ta;

	tcfg = CHAIN_TO_TCFG(ch);

	KASSERT(idx <= tcfg->algo_count, ("algo idx %d out of range 1..%d",
	    idx, tcfg->algo_count));

	ta = tcfg->algo[idx];
	KASSERT(ta != NULL, ("algo idx %d is NULL", idx));

	/* Drop default-algo reference before freeing */
	if (tcfg->def_algo[ta->type] == ta)
		tcfg->def_algo[ta->type] = NULL;

	/* NOTE: tcfg->algo[idx] is left dangling; safe only because this
	 * runs from the instance destroy handler (see comment above). */
	free(ta, M_IPFW);
}
2674
2675/*
2676 * Lists all table algorithms currently available.
2677 * Data layout (v0)(current):
2678 * Request: [ ipfw_obj_lheader ], size = ipfw_obj_lheader.size
2679 * Reply: [ ipfw_obj_lheader ipfw_ta_info x N ]
2680 *
2681 * Returns 0 on success
2682 */
static int
list_table_algo(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
    struct sockopt_data *sd)
{
	struct _ipfw_obj_lheader *olh;
	struct tables_config *tcfg;
	ipfw_ta_info *i;
	struct table_algo *ta;
	uint32_t count, n, size;

	olh = (struct _ipfw_obj_lheader *)ipfw_get_sopt_header(sd,sizeof(*olh));
	if (olh == NULL)
		return (EINVAL);
	if (sd->valsize < olh->size)
		return (EINVAL);

	IPFW_UH_RLOCK(ch);
	tcfg = CHAIN_TO_TCFG(ch);
	count = tcfg->algo_count;
	size = count * sizeof(ipfw_ta_info) + sizeof(ipfw_obj_lheader);

	/* Fill in header regardless of buffer size */
	olh->count = count;
	olh->objsize = sizeof(ipfw_ta_info);

	/* Buffer too small: report required size so caller can retry */
	if (size > olh->size) {
		olh->size = size;
		IPFW_UH_RUNLOCK(ch);
		return (ENOMEM);
	}
	olh->size = size;

	/* Algo array is 1-based, see ipfw_add_table_algo() */
	for (n = 1; n <= count; n++) {
		i = (ipfw_ta_info *)ipfw_get_sopt_space(sd, sizeof(*i));
		KASSERT(i != NULL, ("previously checked buffer is not enough"));
		ta = tcfg->algo[n];
		strlcpy(i->algoname, ta->name, sizeof(i->algoname));
		i->type = ta->type;
		i->refcnt = ta->refcnt;
	}

	IPFW_UH_RUNLOCK(ch);

	return (0);
}
2728
2729static int
2730classify_srcdst(ipfw_insn *cmd, uint16_t *puidx, uint8_t *ptype)
2731{
2732	/* Basic IPv4/IPv6 or u32 lookups */
2733	*puidx = cmd->arg1;
2734	/* Assume ADDR by default */
2735	*ptype = IPFW_TABLE_ADDR;
2736	int v;
2737
2738	if (F_LEN(cmd) > F_INSN_SIZE(ipfw_insn_u32)) {
2739		/*
2740		 * generic lookup. The key must be
2741		 * in 32bit big-endian format.
2742		 */
2743		v = ((ipfw_insn_u32 *)cmd)->d[1];
2744		switch (v) {
2745		case LOOKUP_DST_IP:
2746		case LOOKUP_SRC_IP:
2747			break;
2748		case LOOKUP_DST_PORT:
2749		case LOOKUP_SRC_PORT:
2750		case LOOKUP_UID:
2751		case LOOKUP_JAIL:
2752		case LOOKUP_DSCP:
2753		case LOOKUP_MARK:
2754			*ptype = IPFW_TABLE_NUMBER;
2755			break;
2756		case LOOKUP_DST_MAC:
2757		case LOOKUP_SRC_MAC:
2758			*ptype = IPFW_TABLE_MAC;
2759			break;
2760		}
2761	}
2762
2763	return (0);
2764}
2765
2766static int
2767classify_via(ipfw_insn *cmd, uint16_t *puidx, uint8_t *ptype)
2768{
2769	ipfw_insn_if *cmdif;
2770
2771	/* Interface table, possibly */
2772	cmdif = (ipfw_insn_if *)cmd;
2773	if (cmdif->name[0] != '\1')
2774		return (1);
2775
2776	*ptype = IPFW_TABLE_INTERFACE;
2777	*puidx = cmdif->p.kidx;
2778
2779	return (0);
2780}
2781
2782static int
2783classify_flow(ipfw_insn *cmd, uint16_t *puidx, uint8_t *ptype)
2784{
2785
2786	*puidx = cmd->arg1;
2787	*ptype = IPFW_TABLE_FLOW;
2788
2789	return (0);
2790}
2791
2792static int
2793classify_mac_lookup(ipfw_insn *cmd, uint16_t *puidx, uint8_t *ptype)
2794{
2795	*puidx = cmd->arg1;
2796	*ptype = IPFW_TABLE_MAC;
2797	return (0);
2798}
2799
/* Rewriter: store new table kidx into the opcode's arg1 field. */
static void
update_arg1(ipfw_insn *cmd, uint16_t idx)
{

	cmd->arg1 = idx;
}
2806
2807static void
2808update_via(ipfw_insn *cmd, uint16_t idx)
2809{
2810	ipfw_insn_if *cmdif;
2811
2812	cmdif = (ipfw_insn_if *)cmd;
2813	cmdif->p.kidx = idx;
2814}
2815
/*
 * Rewriter callback: resolve table named object by tid_info.
 * Returns 0 and fills @pno on success.
 */
static int
table_findbyname(struct ip_fw_chain *ch, struct tid_info *ti,
    struct named_object **pno)
{
	struct table_config *tc;
	int error;

	IPFW_UH_WLOCK_ASSERT(ch);

	/* NOTE: find_table_err() may return 0 with tc == NULL (not found);
	 * in that case *pno is set to &NULL->no — assumes callers only use
	 * a found object. TODO confirm against find_table_err() semantics. */
	error = find_table_err(CHAIN_TO_NI(ch), ti, &tc);
	if (error != 0)
		return (error);

	*pno = &tc->no;
	return (0);
}
2832
2833/* XXX: sets-sets! */
2834static struct named_object *
2835table_findbykidx(struct ip_fw_chain *ch, uint16_t idx)
2836{
2837	struct namedobj_instance *ni;
2838	struct table_config *tc;
2839
2840	IPFW_UH_WLOCK_ASSERT(ch);
2841	ni = CHAIN_TO_NI(ch);
2842	tc = (struct table_config *)ipfw_objhash_lookup_kidx(ni, idx);
2843	KASSERT(tc != NULL, ("Table with index %d not found", idx));
2844
2845	return (&tc->no);
2846}
2847
/*
 * Rewriter callback handling set operations for a single table opcode.
 * Behavior depends on the V_fw_tables_sets sysctl: with per-set tables
 * disabled, most per-object operations degenerate to no-ops.
 */
static int
table_manage_sets(struct ip_fw_chain *ch, uint16_t set, uint8_t new_set,
    enum ipfw_sets_cmd cmd)
{

	switch (cmd) {
	case SWAP_ALL:
	case TEST_ALL:
	case MOVE_ALL:
		/*
		 * Always return success, the real action and decision
		 * should make table_manage_sets_all().
		 */
		return (0);
	case TEST_ONE:
	case MOVE_ONE:
		/*
		 * NOTE: we need to use ipfw_objhash_del/ipfw_objhash_add
		 * if set number will be used in hash function. Currently
		 * we can just use generic handler that replaces set value.
		 */
		if (V_fw_tables_sets == 0)
			return (0);
		break;
	case COUNT_ONE:
		/*
		 * Return EOPNOTSUPP for COUNT_ONE when per-set sysctl is
		 * disabled. This allow skip table's opcodes from additional
		 * checks when specific rules moved to another set.
		 */
		if (V_fw_tables_sets == 0)
			return (EOPNOTSUPP);
		/* otherwise fall out of the switch to the generic handler */
	}
	/* Use generic sets handler when per-set sysctl is enabled. */
	return (ipfw_obj_manage_sets(CHAIN_TO_NI(ch), IPFW_TLV_TBL_NAME,
	    set, new_set, cmd));
}
2885
2886/*
2887 * We register several opcode rewriters for lookup tables.
2888 * All tables opcodes have the same ETLV type, but different subtype.
2889 * To avoid invoking sets handler several times for XXX_ALL commands,
2890 * we use separate manage_sets handler. O_RECV has the lowest value,
2891 * so it should be called first.
2892 */
/*
 * XXX_ALL set handler, attached only to O_RECV (lowest table opcode,
 * hence invoked first) so the whole-set operation runs exactly once.
 * Per-object commands are delegated to table_manage_sets().
 */
static int
table_manage_sets_all(struct ip_fw_chain *ch, uint16_t set, uint8_t new_set,
    enum ipfw_sets_cmd cmd)
{

	switch (cmd) {
	case SWAP_ALL:
	case TEST_ALL:
		/*
		 * Return success for TEST_ALL, since nothing prevents
		 * move rules from one set to another. All tables are
		 * accessible from all sets when per-set tables sysctl
		 * is disabled.
		 */
	case MOVE_ALL:
		if (V_fw_tables_sets == 0)
			return (0);
		break;
	default:
		return (table_manage_sets(ch, set, new_set, cmd));
	}
	/* Use generic sets handler when per-set sysctl is enabled. */
	return (ipfw_obj_manage_sets(CHAIN_TO_NI(ch), IPFW_TLV_TBL_NAME,
	    set, new_set, cmd));
}
2918
/*
 * Opcode rewriters for all table-referencing opcodes. All share the
 * IPFW_TLV_TBL_NAME etlv; they differ in classifier (where the table
 * index is stored) and in the manage_sets handler (see comment above:
 * only O_RECV uses table_manage_sets_all).
 */
static struct opcode_obj_rewrite opcodes[] = {
	{
		.opcode = O_IP_SRC_LOOKUP,
		.etlv = IPFW_TLV_TBL_NAME,
		.classifier = classify_srcdst,
		.update = update_arg1,
		.find_byname = table_findbyname,
		.find_bykidx = table_findbykidx,
		.create_object = create_table_compat,
		.manage_sets = table_manage_sets,
	},
	{
		.opcode = O_IP_DST_LOOKUP,
		.etlv = IPFW_TLV_TBL_NAME,
		.classifier = classify_srcdst,
		.update = update_arg1,
		.find_byname = table_findbyname,
		.find_bykidx = table_findbykidx,
		.create_object = create_table_compat,
		.manage_sets = table_manage_sets,
	},
	{
		.opcode = O_IP_FLOW_LOOKUP,
		.etlv = IPFW_TLV_TBL_NAME,
		.classifier = classify_flow,
		.update = update_arg1,
		.find_byname = table_findbyname,
		.find_bykidx = table_findbykidx,
		.create_object = create_table_compat,
		.manage_sets = table_manage_sets,
	},
	{
		.opcode = O_MAC_SRC_LOOKUP,
		.etlv = IPFW_TLV_TBL_NAME,
		.classifier = classify_mac_lookup,
		.update = update_arg1,
		.find_byname = table_findbyname,
		.find_bykidx = table_findbykidx,
		.create_object = create_table_compat,
		.manage_sets = table_manage_sets,
	},
	{
		.opcode = O_MAC_DST_LOOKUP,
		.etlv = IPFW_TLV_TBL_NAME,
		.classifier = classify_mac_lookup,
		.update = update_arg1,
		.find_byname = table_findbyname,
		.find_bykidx = table_findbykidx,
		.create_object = create_table_compat,
		.manage_sets = table_manage_sets,
	},
	{
		.opcode = O_XMIT,
		.etlv = IPFW_TLV_TBL_NAME,
		.classifier = classify_via,
		.update = update_via,
		.find_byname = table_findbyname,
		.find_bykidx = table_findbykidx,
		.create_object = create_table_compat,
		.manage_sets = table_manage_sets,
	},
	{
		/* O_RECV: lowest opcode value, carries the _all handler */
		.opcode = O_RECV,
		.etlv = IPFW_TLV_TBL_NAME,
		.classifier = classify_via,
		.update = update_via,
		.find_byname = table_findbyname,
		.find_bykidx = table_findbykidx,
		.create_object = create_table_compat,
		.manage_sets = table_manage_sets_all,
	},
	{
		.opcode = O_VIA,
		.etlv = IPFW_TLV_TBL_NAME,
		.classifier = classify_via,
		.update = update_via,
		.find_byname = table_findbyname,
		.find_bykidx = table_findbykidx,
		.create_object = create_table_compat,
		.manage_sets = table_manage_sets,
	},
};
3001
3002static int
3003test_sets_cb(struct namedobj_instance *ni __unused, struct named_object *no,
3004    void *arg __unused)
3005{
3006
3007	/* Check that there aren't any tables in not default set */
3008	if (no->set != 0)
3009		return (EBUSY);
3010	return (0);
3011}
3012
3013/*
3014 * Switch between "set 0" and "rule's set" table binding,
3015 * Check all ruleset bindings and permits changing
3016 * IFF each binding has both rule AND table in default set (set 0).
3017 *
3018 * Returns 0 on success.
3019 */
int
ipfw_switch_tables_namespace(struct ip_fw_chain *ch, unsigned int sets)
{
	struct opcode_obj_rewrite *rw;
	struct namedobj_instance *ni;
	struct named_object *no;
	struct ip_fw *rule;
	ipfw_insn *cmd;
	int cmdlen, i, l;
	uint16_t kidx;
	uint8_t subtype;

	IPFW_UH_WLOCK(ch);

	/* Nothing to do if the sysctl already has the requested value */
	if (V_fw_tables_sets == sets) {
		IPFW_UH_WUNLOCK(ch);
		return (0);
	}
	ni = CHAIN_TO_NI(ch);
	if (sets == 0) {
		/*
		 * Prevent disabling sets support if we have some tables
		 * in not default sets.
		 */
		if (ipfw_objhash_foreach_type(ni, test_sets_cb,
		    NULL, IPFW_TLV_TBL_NAME) != 0) {
			IPFW_UH_WUNLOCK(ch);
			return (EBUSY);
		}
	}
	/*
	 * Scan all rules and examine tables opcodes.
	 */
	for (i = 0; i < ch->n_rules; i++) {
		rule = ch->map[i];

		l = rule->cmd_len;
		cmd = rule->cmd;
		cmdlen = 0;
		for ( ;	l > 0 ; l -= cmdlen, cmd += cmdlen) {
			cmdlen = F_LEN(cmd);
			/* Check only tables opcodes */
			for (kidx = 0, rw = opcodes;
			    rw < opcodes + nitems(opcodes); rw++) {
				if (rw->opcode != cmd->opcode)
					continue;
				if (rw->classifier(cmd, &kidx, &subtype) == 0)
					break;
			}
			/* kidx stays 0 when the opcode references no table */
			if (kidx == 0)
				continue;
			/* NOTE(review): assumes a referenced kidx always
			 * resolves; @no is dereferenced without NULL check. */
			no = ipfw_objhash_lookup_kidx(ni, kidx);
			/* Check if both table object and rule has the set 0 */
			if (no->set != 0 || rule->set != 0) {
				IPFW_UH_WUNLOCK(ch);
				return (EBUSY);
			}
		}
	}
	V_fw_tables_sets = sets;
	IPFW_UH_WUNLOCK(ch);
	return (0);
}
3083
3084/*
3085 * Checks table name for validity.
3086 * Enforce basic length checks, the rest
3087 * should be done in userland.
3088 *
3089 * Returns 0 if name is considered valid.
3090 */
static int
check_table_name(const char *name)
{

	/*
	 * TODO: do some more complicated checks
	 */
	/* Generic object-name validation covers length/charset basics */
	return (ipfw_check_object_name_generic(name));
}
3100
3101/*
3102 * Finds table config based on either legacy index
3103 * or name in ntlv.
3104 * Note @ti structure contains unchecked data from userland.
3105 *
3106 * Returns 0 in success and fills in @tc with found config
3107 */
static int
find_table_err(struct namedobj_instance *ni, struct tid_info *ti,
    struct table_config **tc)
{
	char *name, bname[16];
	struct named_object *no;
	ipfw_obj_ntlv *ntlv;
	uint32_t set;

	if (ti->tlvs != NULL) {
		/* Modern path: table referenced by name TLV */
		ntlv = ipfw_find_name_tlv_type(ti->tlvs, ti->tlen, ti->uidx,
		    IPFW_TLV_TBL_NAME);
		if (ntlv == NULL)
			return (EINVAL);
		name = ntlv->name;

		/*
		 * Use set provided by @ti instead of @ntlv one.
		 * This is needed due to different sets behavior
		 * controlled by V_fw_tables_sets.
		 */
		set = (V_fw_tables_sets != 0) ? ti->set : 0;
	} else {
		/* Legacy path: numeric index converted to a name string */
		snprintf(bname, sizeof(bname), "%d", ti->uidx);
		name = bname;
		set = 0;
	}

	/* Note: lookup miss is NOT an error — *tc is set to NULL and
	 * 0 is returned; callers must check the pointer themselves. */
	no = ipfw_objhash_lookup_name(ni, set, name);
	*tc = (struct table_config *)no;

	return (0);
}
3141
3142/*
3143 * Finds table config based on either legacy index
3144 * or name in ntlv.
3145 * Note @ti structure contains unchecked data from userland.
3146 *
3147 * Returns pointer to table_config or NULL.
3148 */
3149static struct table_config *
3150find_table(struct namedobj_instance *ni, struct tid_info *ti)
3151{
3152	struct table_config *tc;
3153
3154	if (find_table_err(ni, ti, &tc) != 0)
3155		return (NULL);
3156
3157	return (tc);
3158}
3159
3160/*
3161 * Allocate new table config structure using
3162 * specified @algo and @aname.
3163 *
3164 * Returns pointer to config or NULL.
3165 */
static struct table_config *
alloc_table_config(struct ip_fw_chain *ch, struct tid_info *ti,
    struct table_algo *ta, char *aname, uint8_t tflags)
{
	char *name, bname[16];
	struct table_config *tc;
	int error;
	ipfw_obj_ntlv *ntlv;
	uint32_t set;

	if (ti->tlvs != NULL) {
		ntlv = ipfw_find_name_tlv_type(ti->tlvs, ti->tlen, ti->uidx,
		    IPFW_TLV_TBL_NAME);
		if (ntlv == NULL)
			return (NULL);
		name = ntlv->name;
		/* Honor the TLV set only when per-set tables are enabled */
		set = (V_fw_tables_sets == 0) ? 0 : ntlv->set;
	} else {
		/* Compat part: convert number to string representation */
		snprintf(bname, sizeof(bname), "%d", ti->uidx);
		name = bname;
		set = 0;
	}

	tc = malloc(sizeof(struct table_config), M_IPFW, M_WAITOK | M_ZERO);
	tc->no.name = tc->tablename;
	tc->no.subtype = ta->type;
	tc->no.set = set;
	tc->tflags = tflags;
	tc->ta = ta;
	strlcpy(tc->tablename, name, sizeof(tc->tablename));
	/* Set "shared" value type by default */
	tc->vshared = 1;

	/* Preallocate data structures for new tables */
	error = ta->init(ch, &tc->astate, &tc->ti_copy, aname, tflags);
	if (error != 0) {
		/* Algo refused to initialize: drop the half-built config */
		free(tc, M_IPFW);
		return (NULL);
	}

	return (tc);
}
3209
3210/*
3211 * Destroys table state and config.
3212 */
static void
free_table_config(struct namedobj_instance *ni, struct table_config *tc)
{

	/* Must have been detached via unlink_table() first */
	KASSERT(tc->linked == 0, ("free() on linked config"));
	/* UH lock MUST NOT be held */

	/*
	 * We're using ta without any locking/referencing.
	 * TODO: fix this if we're going to use unloadable algos.
	 */
	tc->ta->destroy(tc->astate, &tc->ti_copy);
	free(tc, M_IPFW);
}
3227
3228/*
3229 * Links @tc to @chain table named instance.
3230 * Sets appropriate type/states in @chain table info.
3231 */
static void
link_table(struct ip_fw_chain *ch, struct table_config *tc)
{
	struct namedobj_instance *ni;
	struct table_info *ti;
	uint16_t kidx;

	IPFW_UH_WLOCK_ASSERT(ch);

	ni = CHAIN_TO_NI(ch);
	kidx = tc->no.kidx;

	/* Make the table visible by name/kidx lookups */
	ipfw_objhash_add(ni, &tc->no);

	/* Publish runtime lookup state into the per-chain ti array */
	ti = KIDX_TO_TI(ch, kidx);
	*ti = tc->ti_copy;

	/* Notify algo on real @ti address */
	if (tc->ta->change_ti != NULL)
		tc->ta->change_ti(tc->astate, ti);

	tc->linked = 1;
	tc->ta->refcnt++;
}
3256
3257/*
3258 * Unlinks @tc from @chain table named instance.
3259 * Zeroes states in @chain and stores them in @tc.
3260 */
static void
unlink_table(struct ip_fw_chain *ch, struct table_config *tc)
{
	struct namedobj_instance *ni;
	struct table_info *ti;
	uint16_t kidx;

	/* Both locks needed: runtime lookups read the ti array we zero */
	IPFW_UH_WLOCK_ASSERT(ch);
	IPFW_WLOCK_ASSERT(ch);

	ni = CHAIN_TO_NI(ch);
	kidx = tc->no.kidx;

	/* Clear state. @ti copy is already saved inside @tc */
	ipfw_objhash_del(ni, &tc->no);
	ti = KIDX_TO_TI(ch, kidx);
	memset(ti, 0, sizeof(struct table_info));
	tc->linked = 0;
	tc->ta->refcnt--;

	/* Notify algo on real @ti address */
	if (tc->ta->change_ti != NULL)
		tc->ta->change_ti(tc->astate, NULL);
}
3285
/* ip_fw3 sockopt handlers for all table-related opcodes. */
static struct ipfw_sopt_handler	scodes[] = {
	{ IP_FW_TABLE_XCREATE,	0,	HDIR_SET,	create_table },
	{ IP_FW_TABLE_XDESTROY,	0,	HDIR_SET,	flush_table_v0 },
	{ IP_FW_TABLE_XFLUSH,	0,	HDIR_SET,	flush_table_v0 },
	{ IP_FW_TABLE_XMODIFY,	0,	HDIR_BOTH,	modify_table },
	{ IP_FW_TABLE_XINFO,	0,	HDIR_GET,	describe_table },
	{ IP_FW_TABLES_XLIST,	0,	HDIR_GET,	list_tables },
	{ IP_FW_TABLE_XLIST,	0,	HDIR_GET,	dump_table_v0 },
	{ IP_FW_TABLE_XLIST,	1,	HDIR_GET,	dump_table_v1 },
	{ IP_FW_TABLE_XADD,	0,	HDIR_BOTH,	manage_table_ent_v0 },
	{ IP_FW_TABLE_XADD,	1,	HDIR_BOTH,	manage_table_ent_v1 },
	{ IP_FW_TABLE_XDEL,	0,	HDIR_BOTH,	manage_table_ent_v0 },
	{ IP_FW_TABLE_XDEL,	1,	HDIR_BOTH,	manage_table_ent_v1 },
	{ IP_FW_TABLE_XFIND,	0,	HDIR_GET,	find_table_entry },
	{ IP_FW_TABLE_XSWAP,	0,	HDIR_SET,	swap_table },
	{ IP_FW_TABLES_ALIST,	0,	HDIR_GET,	list_table_algo },
	{ IP_FW_TABLE_XGETSIZE,	0,	HDIR_GET,	get_table_size },
};
3304
/*
 * objhash iterator callback used at module teardown: unlinks and
 * frees a single table. Both UH and runtime write locks are held
 * by the caller (see ipfw_destroy_tables()).
 */
static int
destroy_table_locked(struct namedobj_instance *ni, struct named_object *no,
    void *arg)
{

	unlink_table((struct ip_fw_chain *)arg, (struct table_config *)no);
	if (ipfw_objhash_free_idx(ni, no->kidx) != 0)
		printf("Error unlinking kidx %d from table %s\n",
		    no->kidx, no->name);
	free_table_config(ni, (struct table_config *)no);
	return (0);
}
3317
3318/*
3319 * Shuts tables module down.
3320 */
void
ipfw_destroy_tables(struct ip_fw_chain *ch, int last)
{

	/* Unregister control-plane entry points first */
	IPFW_DEL_SOPT_HANDLER(last, scodes);
	IPFW_DEL_OBJ_REWRITER(last, opcodes);

	/* Remove all tables from working set */
	IPFW_UH_WLOCK(ch);
	IPFW_WLOCK(ch);
	ipfw_objhash_foreach(CHAIN_TO_NI(ch), destroy_table_locked, ch);
	IPFW_WUNLOCK(ch);
	IPFW_UH_WUNLOCK(ch);

	/* Free pointers itself */
	free(ch->tablestate, M_IPFW);

	ipfw_table_value_destroy(ch, last);
	ipfw_table_algo_destroy(ch);

	/* Tear down name hash and the tables config container */
	ipfw_objhash_destroy(CHAIN_TO_NI(ch));
	free(CHAIN_TO_TCFG(ch), M_IPFW);
}
3344
3345/*
3346 * Starts tables module.
3347 */
int
ipfw_init_tables(struct ip_fw_chain *ch, int first)
{
	struct tables_config *tcfg;

	/* Allocate pointers */
	ch->tablestate = malloc(V_fw_tables_max * sizeof(struct table_info),
	    M_IPFW, M_WAITOK | M_ZERO);

	tcfg = malloc(sizeof(struct tables_config), M_IPFW, M_WAITOK | M_ZERO);
	tcfg->namehash = ipfw_objhash_create(V_fw_tables_max);
	ch->tblcfg = tcfg;

	/* Sub-modules: value storage and lookup algorithms */
	ipfw_table_value_init(ch, first);
	ipfw_table_algo_init(ch);

	/* Register opcode rewriters and sockopt handlers last,
	 * once internal state is ready to serve requests. */
	IPFW_ADD_OBJ_REWRITER(first, opcodes);
	IPFW_ADD_SOPT_HANDLER(first, scodes);
	return (0);
}
3368