1/*-
2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3 *
4 * Copyright (c) 2004 Ruslan Ermilov and Vsevolod Lobko.
5 * Copyright (c) 2014 Yandex LLC
6 * Copyright (c) 2014 Alexander V. Chernikov
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 *    notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 */
29
30#include <sys/cdefs.h>
31__FBSDID("$FreeBSD$");
32
/*
 * Lookup table support for ipfw.
 *
 * This file contains handlers for all generic table operations:
 * add/del/flush entries, list/dump tables, etc.
 *
 * Table data modification is protected by both the UH and runtime locks,
 * while reading configuration/data is protected by the UH lock alone.
 *
 * Lookup algorithms for all table types are located in ip_fw_table_algo.c.
 */
44
45#include "opt_ipfw.h"
46
47#include <sys/param.h>
48#include <sys/systm.h>
49#include <sys/malloc.h>
50#include <sys/kernel.h>
51#include <sys/lock.h>
52#include <sys/rwlock.h>
53#include <sys/rmlock.h>
54#include <sys/socket.h>
55#include <sys/socketvar.h>
56#include <sys/queue.h>
57#include <net/if.h>	/* ip_fw.h requires IFNAMSIZ */
58#include <net/pfil.h>
59
60#include <netinet/in.h>
61#include <netinet/ip_var.h>	/* struct ipfw_rule_ref */
62#include <netinet/ip_fw.h>
63
64#include <netpfil/ipfw/ip_fw_private.h>
65#include <netpfil/ipfw/ip_fw_table.h>
66
/*
 * A table has the following `type` concepts:
 *
 * `no.type` represents the lookup key type (addr, ifp, uid, etc.).
 * `vmask` is a bitmask of the value types currently present in the table.
 * The special value IPFW_VTYPE_LEGACY ((uint32_t)-1) represents the old
 * single-value-for-all approach.
 */
75struct table_config {
76	struct named_object	no;
77	uint8_t		tflags;		/* type flags */
78	uint8_t		locked;		/* 1 if locked from changes */
79	uint8_t		linked;		/* 1 if already linked */
80	uint8_t		ochanged;	/* used by set swapping */
81	uint8_t		vshared;	/* 1 if using shared value array */
82	uint8_t		spare[3];
83	uint32_t	count;		/* Number of records */
84	uint32_t	limit;		/* Max number of records */
85	uint32_t	vmask;		/* bitmask with supported values */
86	uint32_t	ocount;		/* used by set swapping */
87	uint64_t	gencnt;		/* generation count */
88	char		tablename[64];	/* table name */
89	struct table_algo	*ta;	/* Callbacks for given algo */
90	void		*astate;	/* algorithm state */
91	struct table_info	ti_copy;	/* data to put to table_info */
92	struct namedobj_instance	*vi;
93};
94
95static int find_table_err(struct namedobj_instance *ni, struct tid_info *ti,
96    struct table_config **tc);
97static struct table_config *find_table(struct namedobj_instance *ni,
98    struct tid_info *ti);
99static struct table_config *alloc_table_config(struct ip_fw_chain *ch,
100    struct tid_info *ti, struct table_algo *ta, char *adata, uint8_t tflags);
101static void free_table_config(struct namedobj_instance *ni,
102    struct table_config *tc);
103static int create_table_internal(struct ip_fw_chain *ch, struct tid_info *ti,
104    char *aname, ipfw_xtable_info *i, uint16_t *pkidx, int ref);
105static void link_table(struct ip_fw_chain *ch, struct table_config *tc);
106static void unlink_table(struct ip_fw_chain *ch, struct table_config *tc);
107static int find_ref_table(struct ip_fw_chain *ch, struct tid_info *ti,
108    struct tentry_info *tei, uint32_t count, int op, struct table_config **ptc);
109#define	OP_ADD	1
110#define	OP_DEL	0
111static int export_tables(struct ip_fw_chain *ch, ipfw_obj_lheader *olh,
112    struct sockopt_data *sd);
113static void export_table_info(struct ip_fw_chain *ch, struct table_config *tc,
114    ipfw_xtable_info *i);
115static int dump_table_tentry(void *e, void *arg);
116static int dump_table_xentry(void *e, void *arg);
117
118static int swap_tables(struct ip_fw_chain *ch, struct tid_info *a,
119    struct tid_info *b);
120
121static int check_table_name(const char *name);
122static int check_table_space(struct ip_fw_chain *ch, struct tableop_state *ts,
123    struct table_config *tc, struct table_info *ti, uint32_t count);
124static int destroy_table(struct ip_fw_chain *ch, struct tid_info *ti);
125
126static struct table_algo *find_table_algo(struct tables_config *tableconf,
127    struct tid_info *ti, char *name);
128
129static void objheader_to_ti(struct _ipfw_obj_header *oh, struct tid_info *ti);
130static void ntlv_to_ti(struct _ipfw_obj_ntlv *ntlv, struct tid_info *ti);
131
132#define	CHAIN_TO_NI(chain)	(CHAIN_TO_TCFG(chain)->namehash)
133#define	KIDX_TO_TI(ch, k)	(&(((struct table_info *)(ch)->tablestate)[k]))
134
135#define	TA_BUF_SZ	128	/* On-stack buffer for add/delete state */
136
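/*
 * Pending table-operation state tracking.
 *
 * rollback_toperation_state() walks the per-chain list of in-flight
 * table operations and invokes each entry's rollback callback with
 * @object, so that operations touching the affected table (or chain)
 * can undo their partially-applied state.
 * add_toperation_state() and del_toperation_state() link and unlink a
 * tableop_state on/from that list; callers hold the UH lock around
 * these calls.
 */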
137void
138rollback_toperation_state(struct ip_fw_chain *ch, void *object)
139{
140	struct tables_config *tcfg;
141	struct op_state *os;
142
143	tcfg = CHAIN_TO_TCFG(ch);
144	TAILQ_FOREACH(os, &tcfg->state_list, next)
145		os->func(object, os);
146}
147
148void
149add_toperation_state(struct ip_fw_chain *ch, struct tableop_state *ts)
150{
151	struct tables_config *tcfg;
152
153	tcfg = CHAIN_TO_TCFG(ch);
154	TAILQ_INSERT_HEAD(&tcfg->state_list, &ts->opstate, next);
155}
156
157void
158del_toperation_state(struct ip_fw_chain *ch, struct tableop_state *ts)
159{
160	struct tables_config *tcfg;
161
162	tcfg = CHAIN_TO_TCFG(ch);
163	TAILQ_REMOVE(&tcfg->state_list, &ts->opstate, next);
164}
165
166void
167tc_ref(struct table_config *tc)
168{
169
170	tc->no.refcnt++;
171}
172
173void
174tc_unref(struct table_config *tc)
175{
176
177	tc->no.refcnt--;
178}
179
180static struct table_value *
181get_table_value(struct ip_fw_chain *ch, struct table_config *tc, uint32_t kidx)
182{
183	struct table_value *pval;
184
185	pval = (struct table_value *)ch->valuestate;
186
187	return (&pval[kidx]);
188}
189
190
/*
 * Checks whether entry @tei can be inserted/updated into the table
 * w.r.t. @tc limits.
 * May alter @tei flags to indicate the insertion error or insert
 * options.
 *
 * Returns 0 if the operation can be performed.
 */
199static int
200check_table_limit(struct table_config *tc, struct tentry_info *tei)
201{
202
203	if (tc->limit == 0 || tc->count < tc->limit)
204		return (0);
205
206	if ((tei->flags & TEI_FLAGS_UPDATE) == 0) {
207		/* Notify userland on error cause */
208		tei->flags |= TEI_FLAGS_LIMIT;
209		return (EFBIG);
210	}
211
212	/*
213	 * We have UPDATE flag set.
214	 * Permit updating record (if found),
215	 * but restrict adding new one since we've
216	 * already hit the limit.
217	 */
218	tei->flags |= TEI_FLAGS_DONTADD;
219
220	return (0);
221}
222
223/*
224 * Convert algorithm callback return code into
225 * one of pre-defined states known by userland.
226 */
227static void
228store_tei_result(struct tentry_info *tei, int op, int error, uint32_t num)
229{
230	int flag;
231
232	flag = 0;
233
234	switch (error) {
235	case 0:
236		if (op == OP_ADD && num != 0)
237			flag = TEI_FLAGS_ADDED;
238		if (op == OP_DEL)
239			flag = TEI_FLAGS_DELETED;
240		break;
241	case ENOENT:
242		flag = TEI_FLAGS_NOTFOUND;
243		break;
244	case EEXIST:
245		flag = TEI_FLAGS_EXISTS;
246		break;
247	default:
248		flag = TEI_FLAGS_ERROR;
249	}
250
251	tei->flags |= flag;
252}
253
/*
 * Creates and references a table with default parameters.
 * Saves the allocated table kidx into @pkidx if non-NULL.
 * Used for table auto-creation to support old binaries.
 *
 * Returns 0 on success.
 */
262static int
263create_table_compat(struct ip_fw_chain *ch, struct tid_info *ti,
264    uint16_t *pkidx)
265{
266	ipfw_xtable_info xi;
267	int error;
268
269	memset(&xi, 0, sizeof(xi));
270	/* Set default value mask for legacy clients */
271	xi.vmask = IPFW_VTYPE_LEGACY;
272
273	error = create_table_internal(ch, ti, NULL, &xi, pkidx, 1);
274	if (error != 0)
275		return (error);
276
277	return (0);
278}
279
/*
 * Finds and references an existing table, optionally
 * creating a new one.
 *
 * Saves the found table config into @ptc.
 * Note that the function may drop/reacquire UH_WLOCK.
 * Returns 0 if the table was found/created and referenced,
 * non-zero otherwise.
 */
289static int
290find_ref_table(struct ip_fw_chain *ch, struct tid_info *ti,
291    struct tentry_info *tei, uint32_t count, int op,
292    struct table_config **ptc)
293{
294	struct namedobj_instance *ni;
295	struct table_config *tc;
296	uint16_t kidx;
297	int error;
298
299	IPFW_UH_WLOCK_ASSERT(ch);
300
301	ni = CHAIN_TO_NI(ch);
302	tc = NULL;
303	if ((tc = find_table(ni, ti)) != NULL) {
304		/* check table type */
305		if (tc->no.subtype != ti->type)
306			return (EINVAL);
307
308		if (tc->locked != 0)
309			return (EACCES);
310
311		/* Try to exit early on limit hit */
312		if (op == OP_ADD && count == 1 &&
313		    check_table_limit(tc, tei) != 0)
314			return (EFBIG);
315
316		/* Reference and return */
317		tc->no.refcnt++;
318		*ptc = tc;
319		return (0);
320	}
321
322	if (op == OP_DEL)
323		return (ESRCH);
324
325	/* Compatibility mode: create new table for old clients */
326	if ((tei->flags & TEI_FLAGS_COMPAT) == 0)
327		return (ESRCH);
328
329	IPFW_UH_WUNLOCK(ch);
330	error = create_table_compat(ch, ti, &kidx);
331	IPFW_UH_WLOCK(ch);
332
333	if (error != 0)
334		return (error);
335
336	tc = (struct table_config *)ipfw_objhash_lookup_kidx(ni, kidx);
337	KASSERT(tc != NULL, ("create_table_compat returned bad idx %d", kidx));
338
339	/* OK, now we've got referenced table. */
340	*ptc = tc;
341	return (0);
342}
343
/*
 * Rolls back the @added entries already inserted into @tc using the
 * state array @ta_buf_m.
 * Assumes the following layout:
 * 1) ADD state (ta_buf_m[0] ... ta_buf_m[added - 1]) for handling update cases
 * 2) DEL state (ta_buf_m[count] ... ta_buf_m[count + added - 1])
 *    for storing deleted state
 */
351static void
352rollback_added_entries(struct ip_fw_chain *ch, struct table_config *tc,
353    struct table_info *tinfo, struct tentry_info *tei, caddr_t ta_buf_m,
354    uint32_t count, uint32_t added)
355{
356	struct table_algo *ta;
357	struct tentry_info *ptei;
358	caddr_t v, vv;
359	size_t ta_buf_sz;
360	int error, i;
361	uint32_t num;
362
363	IPFW_UH_WLOCK_ASSERT(ch);
364
365	ta = tc->ta;
366	ta_buf_sz = ta->ta_buf_size;
367	v = ta_buf_m;
368	vv = v + count * ta_buf_sz;
369	for (i = 0; i < added; i++, v += ta_buf_sz, vv += ta_buf_sz) {
370		ptei = &tei[i];
371		if ((ptei->flags & TEI_FLAGS_UPDATED) != 0) {
372
373			/*
374			 * We have old value stored by previous
375			 * call in @ptei->value. Do add once again
376			 * to restore it.
377			 */
378			error = ta->add(tc->astate, tinfo, ptei, v, &num);
379			KASSERT(error == 0, ("rollback UPDATE fail"));
380			KASSERT(num == 0, ("rollback UPDATE fail2"));
381			continue;
382		}
383
384		error = ta->prepare_del(ch, ptei, vv);
385		KASSERT(error == 0, ("pre-rollback INSERT failed"));
386		error = ta->del(tc->astate, tinfo, ptei, vv, &num);
387		KASSERT(error == 0, ("rollback INSERT failed"));
388		tc->count -= num;
389	}
390}
391
392/*
393 * Prepares add/del state for all @count entries in @tei.
394 * Uses either stack buffer (@ta_buf) or allocates a new one.
395 * Stores pointer to allocated buffer back to @ta_buf.
396 *
397 * Returns 0 on success.
398 */
399static int
400prepare_batch_buffer(struct ip_fw_chain *ch, struct table_algo *ta,
401    struct tentry_info *tei, uint32_t count, int op, caddr_t *ta_buf)
402{
403	caddr_t ta_buf_m, v;
404	size_t ta_buf_sz, sz;
405	struct tentry_info *ptei;
406	int error, i;
407
408	error = 0;
409	ta_buf_sz = ta->ta_buf_size;
410	if (count == 1) {
411		/* Single add/delete, use on-stack buffer */
412		memset(*ta_buf, 0, TA_BUF_SZ);
413		ta_buf_m = *ta_buf;
414	} else {
415
		/*
		 * Multiple adds/deletes: allocate a larger buffer.
		 *
		 * Note we need a 2x count buffer for the add case:
		 * we have to hold both the ADD state
		 * and the DELETE state (the latter may be needed
		 * if we have to roll back all changes).
		 */
424		sz = count * ta_buf_sz;
425		ta_buf_m = malloc((op == OP_ADD) ? sz * 2 : sz, M_TEMP,
426		    M_WAITOK | M_ZERO);
427	}
428
429	v = ta_buf_m;
430	for (i = 0; i < count; i++, v += ta_buf_sz) {
431		ptei = &tei[i];
432		error = (op == OP_ADD) ?
433		    ta->prepare_add(ch, ptei, v) : ta->prepare_del(ch, ptei, v);
434
435		/*
436		 * Some syntax error (incorrect mask, or address, or
437		 * anything). Return error regardless of atomicity
438		 * settings.
439		 */
440		if (error != 0)
441			break;
442	}
443
444	*ta_buf = ta_buf_m;
445	return (error);
446}
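
/*
 * Illustrative sketch (comment only) of the batch buffer allocated
 * above for an OP_ADD request with @count entries, each slot being
 * @ta->ta_buf_size bytes:
 *
 *	[ ADD 0 ] ... [ ADD count-1 ] [ DEL 0 ] ... [ DEL count-1 ]
 *	^ ta_buf_m                    ^ ta_buf_m + count * ta_buf_sz
 *
 * The DEL half is filled by rollback_added_entries() when an atomic
 * add fails part-way, and is cleaned by flush_batch_buffer() when a
 * rollback has occurred.
 */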
447
/*
 * Flushes the allocated state for each of the @count entries in @tei.
 * Frees @ta_buf_m if it differs from the stack buffer @ta_buf.
 */
452static void
453flush_batch_buffer(struct ip_fw_chain *ch, struct table_algo *ta,
454    struct tentry_info *tei, uint32_t count, int rollback,
455    caddr_t ta_buf_m, caddr_t ta_buf)
456{
457	caddr_t v;
458	struct tentry_info *ptei;
459	size_t ta_buf_sz;
460	int i;
461
462	ta_buf_sz = ta->ta_buf_size;
463
464	/* Run cleaning callback anyway */
465	v = ta_buf_m;
466	for (i = 0; i < count; i++, v += ta_buf_sz) {
467		ptei = &tei[i];
468		ta->flush_entry(ch, ptei, v);
469		if (ptei->ptv != NULL) {
470			free(ptei->ptv, M_IPFW);
471			ptei->ptv = NULL;
472		}
473	}
474
475	/* Clean up "deleted" state in case of rollback */
476	if (rollback != 0) {
477		v = ta_buf_m + count * ta_buf_sz;
478		for (i = 0; i < count; i++, v += ta_buf_sz)
479			ta->flush_entry(ch, &tei[i], v);
480	}
481
482	if (ta_buf_m != ta_buf)
483		free(ta_buf_m, M_TEMP);
484}
485
486
487static void
488rollback_add_entry(void *object, struct op_state *_state)
489{
490	struct ip_fw_chain *ch;
491	struct tableop_state *ts;
492
493	ts = (struct tableop_state *)_state;
494
495	if (ts->tc != object && ts->ch != object)
496		return;
497
498	ch = ts->ch;
499
500	IPFW_UH_WLOCK_ASSERT(ch);
501
	/* Call the specified unlockers */
	rollback_table_values(ts);

	/* Indicate that rollback has been performed */
506	ts->modified = 1;
507}
508
/*
 * Adds/updates one or more entries in table @ti.
 *
 * The function may drop/reacquire the UH wlock multiple times due to
 * item allocation, algorithm callbacks (check_space), value linkage
 * (new values, value storage realloc), etc.
 * Other operations such as other adds (which may involve storage resize),
 * table swaps (which change table data and may change the algo type) and
 * table modifications (which may change the value mask) may be executed
 * simultaneously, so we need to deal with that.
 *
 * The following approach was implemented:
 * we have a per-chain linked list, protected by the UH lock.
 * add_table_entry() prepares a special on-stack structure which is passed
 * to its descendants. Users add this structure to the list before unlocking.
 * After performing the needed operations and re-acquiring the UH lock, each
 * user checks whether the structure has changed. If so, it rolls its local
 * state back and returns to the caller without error.
 * add_table_entry() itself checks whether the structure has changed and
 * restarts its operation from the beginning (goto restart).
 *
 * Functions which modify fields of interest (currently
 * resize_shared_value_storage() and swap_tables()) traverse this list,
 * while holding the UH lock, immediately before performing their operations,
 * calling the function provided by each list entry (currently
 * rollback_add_entry()), which rolls back all necessary state and sets the
 * appropriate fields in the structure indicating that a rollback has
 * happened.
 *
 * Algo interaction:
 * The function references @ti first to ensure the table won't
 * disappear or change its type.
 * After that, the prepare_add callback is called for each @tei entry.
 * Next, we try to add each entry under UH+WLOCK
 * using the add() callback.
 * Finally, we free all state by calling the flush_entry callback
 * for each @tei.
 *
 * Returns 0 on success.
 */
549int
550add_table_entry(struct ip_fw_chain *ch, struct tid_info *ti,
551    struct tentry_info *tei, uint8_t flags, uint32_t count)
552{
553	struct table_config *tc;
554	struct table_algo *ta;
555	uint16_t kidx;
556	int error, first_error, i, rollback;
557	uint32_t num, numadd;
558	struct tentry_info *ptei;
559	struct tableop_state ts;
560	char ta_buf[TA_BUF_SZ];
561	caddr_t ta_buf_m, v;
562
563	memset(&ts, 0, sizeof(ts));
564	ta = NULL;
565	IPFW_UH_WLOCK(ch);
566
567	/*
568	 * Find and reference existing table.
569	 */
570restart:
571	if (ts.modified != 0) {
572		IPFW_UH_WUNLOCK(ch);
573		flush_batch_buffer(ch, ta, tei, count, rollback,
574		    ta_buf_m, ta_buf);
575		memset(&ts, 0, sizeof(ts));
576		ta = NULL;
577		IPFW_UH_WLOCK(ch);
578	}
579
580	error = find_ref_table(ch, ti, tei, count, OP_ADD, &tc);
581	if (error != 0) {
582		IPFW_UH_WUNLOCK(ch);
583		return (error);
584	}
585	ta = tc->ta;
586
587	/* Fill in tablestate */
588	ts.ch = ch;
589	ts.opstate.func = rollback_add_entry;
590	ts.tc = tc;
591	ts.vshared = tc->vshared;
592	ts.vmask = tc->vmask;
593	ts.ta = ta;
594	ts.tei = tei;
595	ts.count = count;
596	rollback = 0;
597	add_toperation_state(ch, &ts);
598	IPFW_UH_WUNLOCK(ch);
599
600	/* Allocate memory and prepare record(s) */
601	/* Pass stack buffer by default */
602	ta_buf_m = ta_buf;
603	error = prepare_batch_buffer(ch, ta, tei, count, OP_ADD, &ta_buf_m);
604
605	IPFW_UH_WLOCK(ch);
606	del_toperation_state(ch, &ts);
607	/* Drop reference we've used in first search */
608	tc->no.refcnt--;
609
610	/* Check prepare_batch_buffer() error */
611	if (error != 0)
612		goto cleanup;
613
	/*
	 * Check whether a table swap has happened
	 * (in which case the table algo might have changed).
	 * Restart the operation to achieve consistent behavior.
	 */
619	if (ts.modified != 0)
620		goto restart;
621
	/*
	 * Link all values to the shared/per-table value array.
	 *
	 * May release/reacquire UH_WLOCK.
	 */
627	error = ipfw_link_table_values(ch, &ts);
628	if (error != 0)
629		goto cleanup;
630	if (ts.modified != 0)
631		goto restart;
632
633	/*
634	 * Ensure we are able to add all entries without additional
635	 * memory allocations. May release/reacquire UH_WLOCK.
636	 */
637	kidx = tc->no.kidx;
638	error = check_table_space(ch, &ts, tc, KIDX_TO_TI(ch, kidx), count);
639	if (error != 0)
640		goto cleanup;
641	if (ts.modified != 0)
642		goto restart;
643
644	/* We've got valid table in @tc. Let's try to add data */
645	kidx = tc->no.kidx;
646	ta = tc->ta;
647	numadd = 0;
648	first_error = 0;
649
650	IPFW_WLOCK(ch);
651
652	v = ta_buf_m;
653	for (i = 0; i < count; i++, v += ta->ta_buf_size) {
654		ptei = &tei[i];
655		num = 0;
656		/* check limit before adding */
657		if ((error = check_table_limit(tc, ptei)) == 0) {
658			error = ta->add(tc->astate, KIDX_TO_TI(ch, kidx),
659			    ptei, v, &num);
660			/* Set status flag to inform userland */
661			store_tei_result(ptei, OP_ADD, error, num);
662		}
663		if (error == 0) {
664			/* Update number of records to ease limit checking */
665			tc->count += num;
666			numadd += num;
667			continue;
668		}
669
670		if (first_error == 0)
671			first_error = error;
672
		/*
		 * Some error has happened. Check our atomicity
		 * settings: continue if atomicity is not required,
		 * roll back the changes otherwise.
		 */
678		if ((flags & IPFW_CTF_ATOMIC) == 0)
679			continue;
680
681		rollback_added_entries(ch, tc, KIDX_TO_TI(ch, kidx),
682		    tei, ta_buf_m, count, i);
683
684		rollback = 1;
685		break;
686	}
687
688	IPFW_WUNLOCK(ch);
689
690	ipfw_garbage_table_values(ch, tc, tei, count, rollback);
691
692	/* Permit post-add algorithm grow/rehash. */
693	if (numadd != 0)
694		check_table_space(ch, NULL, tc, KIDX_TO_TI(ch, kidx), 0);
695
696	/* Return first error to user, if any */
697	error = first_error;
698
699cleanup:
700	IPFW_UH_WUNLOCK(ch);
701
702	flush_batch_buffer(ch, ta, tei, count, rollback, ta_buf_m, ta_buf);
703
704	return (error);
705}
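
/*
 * Minimal kernel-side usage sketch (illustrative only; it mirrors what
 * the sockopt handlers below do). The table index and value here are
 * hypothetical; adding a single IPv4 prefix to table 1 could look like:
 *
 *	struct tid_info ti;
 *	struct tentry_info tei;
 *	struct table_value v;
 *	struct in_addr a;
 *	int error;
 *
 *	memset(&ti, 0, sizeof(ti));
 *	ti.uidx = 1;
 *	ti.type = IPFW_TABLE_ADDR;
 *
 *	memset(&tei, 0, sizeof(tei));
 *	memset(&v, 0, sizeof(v));
 *	a.s_addr = htonl(0xc0a80000);	(192.168.0.0)
 *	tei.paddr = &a;
 *	tei.masklen = 24;
 *	tei.subtype = AF_INET;
 *	tei.pvalue = &v;
 *
 *	error = add_table_entry(ch, &ti, &tei, 0, 1);
 */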
706
707/*
708 * Deletes one or more entries in table @ti.
709 *
710 * Returns 0 on success.
711 */
712int
713del_table_entry(struct ip_fw_chain *ch, struct tid_info *ti,
714    struct tentry_info *tei, uint8_t flags, uint32_t count)
715{
716	struct table_config *tc;
717	struct table_algo *ta;
718	struct tentry_info *ptei;
719	uint16_t kidx;
720	int error, first_error, i;
721	uint32_t num, numdel;
722	char ta_buf[TA_BUF_SZ];
723	caddr_t ta_buf_m, v;
724
725	/*
726	 * Find and reference existing table.
727	 */
728	IPFW_UH_WLOCK(ch);
729	error = find_ref_table(ch, ti, tei, count, OP_DEL, &tc);
730	if (error != 0) {
731		IPFW_UH_WUNLOCK(ch);
732		return (error);
733	}
734	ta = tc->ta;
735	IPFW_UH_WUNLOCK(ch);
736
737	/* Allocate memory and prepare record(s) */
738	/* Pass stack buffer by default */
739	ta_buf_m = ta_buf;
740	error = prepare_batch_buffer(ch, ta, tei, count, OP_DEL, &ta_buf_m);
741	if (error != 0)
742		goto cleanup;
743
744	IPFW_UH_WLOCK(ch);
745
746	/* Drop reference we've used in first search */
747	tc->no.refcnt--;
748
	/*
	 * Check whether the table algo is still the same
	 * (a changed ta may be the result of a table swap).
	 */
753	if (ta != tc->ta) {
754		IPFW_UH_WUNLOCK(ch);
755		error = EINVAL;
756		goto cleanup;
757	}
758
759	kidx = tc->no.kidx;
760	numdel = 0;
761	first_error = 0;
762
763	IPFW_WLOCK(ch);
764	v = ta_buf_m;
765	for (i = 0; i < count; i++, v += ta->ta_buf_size) {
766		ptei = &tei[i];
767		num = 0;
768		error = ta->del(tc->astate, KIDX_TO_TI(ch, kidx), ptei, v,
769		    &num);
770		/* Save state for userland */
771		store_tei_result(ptei, OP_DEL, error, num);
772		if (error != 0 && first_error == 0)
773			first_error = error;
774		tc->count -= num;
775		numdel += num;
776	}
777	IPFW_WUNLOCK(ch);
778
779	/* Unlink non-used values */
780	ipfw_garbage_table_values(ch, tc, tei, count, 0);
781
782	if (numdel != 0) {
783		/* Run post-del hook to permit shrinking */
784		check_table_space(ch, NULL, tc, KIDX_TO_TI(ch, kidx), 0);
785	}
786
787	IPFW_UH_WUNLOCK(ch);
788
789	/* Return first error to user, if any */
790	error = first_error;
791
792cleanup:
793	flush_batch_buffer(ch, ta, tei, count, 0, ta_buf_m, ta_buf);
794
795	return (error);
796}
797
/*
 * Ensures that table @tc has enough space to add @count entries without
 * the need for reallocation.
 *
 * Callback order:
 * 0) need_modify() (UH_WLOCK) - checks if @count items can be added w/o a resize.
 *
 * 1) prepare_mod() (no locks, M_WAITOK) - allocate new state based on @pflags.
 * 2) fill_mod() (UH_WLOCK) - copy old data into the new storage.
 * 3) modify() (UH_WLOCK + WLOCK) - switch pointers.
 * 4) flush_mod() (UH_WLOCK) - free state, if needed.
 *
 * Returns 0 on success.
 */
812static int
813check_table_space(struct ip_fw_chain *ch, struct tableop_state *ts,
814    struct table_config *tc, struct table_info *ti, uint32_t count)
815{
816	struct table_algo *ta;
817	uint64_t pflags;
818	char ta_buf[TA_BUF_SZ];
819	int error;
820
821	IPFW_UH_WLOCK_ASSERT(ch);
822
823	error = 0;
824	ta = tc->ta;
825	if (ta->need_modify == NULL)
826		return (0);
827
	/* Acquire a reference so as not to lose @tc between lock/unlock cycles */
829	tc->no.refcnt++;
830
831	/*
832	 * TODO: think about avoiding race between large add/large delete
833	 * operation on algorithm which implements shrinking along with
834	 * growing.
835	 */
836	while (true) {
837		pflags = 0;
838		if (ta->need_modify(tc->astate, ti, count, &pflags) == 0) {
839			error = 0;
840			break;
841		}
842
843		/* We have to shrink/grow table */
844		if (ts != NULL)
845			add_toperation_state(ch, ts);
846		IPFW_UH_WUNLOCK(ch);
847
848		memset(&ta_buf, 0, sizeof(ta_buf));
849		error = ta->prepare_mod(ta_buf, &pflags);
850
851		IPFW_UH_WLOCK(ch);
852		if (ts != NULL)
853			del_toperation_state(ch, ts);
854
855		if (error != 0)
856			break;
857
858		if (ts != NULL && ts->modified != 0) {
859
860			/*
861			 * Swap operation has happened
862			 * so we're currently operating on other
863			 * table data. Stop doing this.
864			 */
865			ta->flush_mod(ta_buf);
866			break;
867		}
868
869		/* Check if we still need to alter table */
870		ti = KIDX_TO_TI(ch, tc->no.kidx);
871		if (ta->need_modify(tc->astate, ti, count, &pflags) == 0) {
872			IPFW_UH_WUNLOCK(ch);
873
874			/*
875			 * Other thread has already performed resize.
876			 * Flush our state and return.
877			 */
878			ta->flush_mod(ta_buf);
879			break;
880		}
881
882		error = ta->fill_mod(tc->astate, ti, ta_buf, &pflags);
883		if (error == 0) {
884			/* Do actual modification */
885			IPFW_WLOCK(ch);
886			ta->modify(tc->astate, ti, ta_buf, pflags);
887			IPFW_WUNLOCK(ch);
888		}
889
890		/* Anyway, flush data and retry */
891		ta->flush_mod(ta_buf);
892	}
893
894	tc->no.refcnt--;
895	return (error);
896}
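
/*
 * Hypothetical algorithm-side sketch of the modification contract
 * driven by check_table_space() above (the "my_" names are made up;
 * real implementations live in ip_fw_table_algo.c):
 *
 *	static int
 *	my_need_modify(void *state, struct table_info *ti, uint32_t count,
 *	    uint64_t *pflags)
 *	{
 *		(return non-zero and encode the desired new size into
 *		 *pflags if @count more items do not fit)
 *	}
 *
 *	prepare_mod(ta_buf, &pflags)          allocate new storage, may sleep
 *	fill_mod(state, ti, ta_buf, &pflags)  copy existing items over
 *	modify(state, ti, ta_buf, pflags)     publish pointers under WLOCK
 *	flush_mod(ta_buf)                     free temporary state
 */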
897
898/*
 * Adds or deletes a record in a table.
900 * Data layout (v0):
901 * Request: [ ip_fw3_opheader ipfw_table_xentry ]
902 *
903 * Returns 0 on success
904 */
905static int
906manage_table_ent_v0(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
907    struct sockopt_data *sd)
908{
909	ipfw_table_xentry *xent;
910	struct tentry_info tei;
911	struct tid_info ti;
912	struct table_value v;
913	int error, hdrlen, read;
914
915	hdrlen = offsetof(ipfw_table_xentry, k);
916
917	/* Check minimum header size */
918	if (sd->valsize < (sizeof(*op3) + hdrlen))
919		return (EINVAL);
920
921	read = sizeof(ip_fw3_opheader);
922
923	/* Check if xentry len field is valid */
924	xent = (ipfw_table_xentry *)(op3 + 1);
925	if (xent->len < hdrlen || xent->len + read > sd->valsize)
926		return (EINVAL);
927
928	memset(&tei, 0, sizeof(tei));
929	tei.paddr = &xent->k;
930	tei.masklen = xent->masklen;
931	ipfw_import_table_value_legacy(xent->value, &v);
932	tei.pvalue = &v;
933	/* Old requests compatibility */
934	tei.flags = TEI_FLAGS_COMPAT;
935	if (xent->type == IPFW_TABLE_ADDR) {
936		if (xent->len - hdrlen == sizeof(in_addr_t))
937			tei.subtype = AF_INET;
938		else
939			tei.subtype = AF_INET6;
940	}
941
942	memset(&ti, 0, sizeof(ti));
943	ti.uidx = xent->tbl;
944	ti.type = xent->type;
945
946	error = (op3->opcode == IP_FW_TABLE_XADD) ?
947	    add_table_entry(ch, &ti, &tei, 0, 1) :
948	    del_table_entry(ch, &ti, &tei, 0, 1);
949
950	return (error);
951}
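
/*
 * Illustrative layout of the legacy (v0) add/delete request consumed
 * above. A hypothetical userland caller would typically pass, via the
 * IP_FW3 socket option, a buffer shaped like:
 *
 *	struct {
 *		ip_fw3_opheader   op;	opcode IP_FW_TABLE_XADD selects add,
 *					any other table-entry opcode delete
 *		ipfw_table_xentry xent;	tbl, type, masklen, value and the
 *					key in xent.k
 *	} req;
 *
 * xent.len must cover the header plus the key actually supplied; for
 * IPFW_TABLE_ADDR an in_addr_t-sized key is treated as AF_INET and
 * anything longer as AF_INET6.
 */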
952
953/*
 * Adds or deletes a record in a table.
955 * Data layout (v1)(current):
956 * Request: [ ipfw_obj_header
957 *   ipfw_obj_ctlv(IPFW_TLV_TBLENT_LIST) [ ipfw_obj_tentry x N ]
958 * ]
959 *
960 * Returns 0 on success
961 */
962static int
963manage_table_ent_v1(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
964    struct sockopt_data *sd)
965{
966	ipfw_obj_tentry *tent, *ptent;
967	ipfw_obj_ctlv *ctlv;
968	ipfw_obj_header *oh;
969	struct tentry_info *ptei, tei, *tei_buf;
970	struct tid_info ti;
971	int error, i, kidx, read;
972
973	/* Check minimum header size */
974	if (sd->valsize < (sizeof(*oh) + sizeof(*ctlv)))
975		return (EINVAL);
976
977	/* Check if passed data is too long */
978	if (sd->valsize != sd->kavail)
979		return (EINVAL);
980
981	oh = (ipfw_obj_header *)sd->kbuf;
982
983	/* Basic length checks for TLVs */
984	if (oh->ntlv.head.length != sizeof(oh->ntlv))
985		return (EINVAL);
986
987	read = sizeof(*oh);
988
989	ctlv = (ipfw_obj_ctlv *)(oh + 1);
990	if (ctlv->head.length + read != sd->valsize)
991		return (EINVAL);
992
993	read += sizeof(*ctlv);
994	tent = (ipfw_obj_tentry *)(ctlv + 1);
995	if (ctlv->count * sizeof(*tent) + read != sd->valsize)
996		return (EINVAL);
997
998	if (ctlv->count == 0)
999		return (0);
1000
	/*
	 * Mark the entire buffer as "read".
	 * This instructs the sopt API to write it back
	 * after the function returns.
	 */
1006	ipfw_get_sopt_header(sd, sd->valsize);
1007
1008	/* Perform basic checks for each entry */
1009	ptent = tent;
1010	kidx = tent->idx;
1011	for (i = 0; i < ctlv->count; i++, ptent++) {
1012		if (ptent->head.length != sizeof(*ptent))
1013			return (EINVAL);
1014		if (ptent->idx != kidx)
1015			return (ENOTSUP);
1016	}
1017
1018	/* Convert data into kernel request objects */
1019	objheader_to_ti(oh, &ti);
1020	ti.type = oh->ntlv.type;
1021	ti.uidx = kidx;
1022
1023	/* Use on-stack buffer for single add/del */
1024	if (ctlv->count == 1) {
1025		memset(&tei, 0, sizeof(tei));
1026		tei_buf = &tei;
1027	} else
1028		tei_buf = malloc(ctlv->count * sizeof(tei), M_TEMP,
1029		    M_WAITOK | M_ZERO);
1030
1031	ptei = tei_buf;
1032	ptent = tent;
1033	for (i = 0; i < ctlv->count; i++, ptent++, ptei++) {
1034		ptei->paddr = &ptent->k;
1035		ptei->subtype = ptent->subtype;
1036		ptei->masklen = ptent->masklen;
1037		if (ptent->head.flags & IPFW_TF_UPDATE)
1038			ptei->flags |= TEI_FLAGS_UPDATE;
1039
1040		ipfw_import_table_value_v1(&ptent->v.value);
1041		ptei->pvalue = (struct table_value *)&ptent->v.value;
1042	}
1043
1044	error = (oh->opheader.opcode == IP_FW_TABLE_XADD) ?
1045	    add_table_entry(ch, &ti, tei_buf, ctlv->flags, ctlv->count) :
1046	    del_table_entry(ch, &ti, tei_buf, ctlv->flags, ctlv->count);
1047
1048	/* Translate result back to userland */
1049	ptei = tei_buf;
1050	ptent = tent;
1051	for (i = 0; i < ctlv->count; i++, ptent++, ptei++) {
1052		if (ptei->flags & TEI_FLAGS_ADDED)
1053			ptent->result = IPFW_TR_ADDED;
1054		else if (ptei->flags & TEI_FLAGS_DELETED)
1055			ptent->result = IPFW_TR_DELETED;
1056		else if (ptei->flags & TEI_FLAGS_UPDATED)
1057			ptent->result = IPFW_TR_UPDATED;
1058		else if (ptei->flags & TEI_FLAGS_LIMIT)
1059			ptent->result = IPFW_TR_LIMIT;
1060		else if (ptei->flags & TEI_FLAGS_ERROR)
1061			ptent->result = IPFW_TR_ERROR;
1062		else if (ptei->flags & TEI_FLAGS_NOTFOUND)
1063			ptent->result = IPFW_TR_NOTFOUND;
1064		else if (ptei->flags & TEI_FLAGS_EXISTS)
1065			ptent->result = IPFW_TR_EXISTS;
1066		ipfw_export_table_value_v1(ptei->pvalue, &ptent->v.value);
1067	}
1068
1069	if (tei_buf != &tei)
1070		free(tei_buf, M_TEMP);
1071
1072	return (error);
1073}
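
/*
 * Illustrative (hypothetical) construction of a v1 batched request as
 * parsed above: an ipfw_obj_header followed by an ipfw_obj_ctlv of
 * type IPFW_TLV_TBLENT_LIST holding N ipfw_obj_tentry records, all
 * referring to the same table index:
 *
 *	ipfw_obj_header  oh;       oh.ntlv names the table,
 *	                           oh.ntlv.head.length = sizeof(oh.ntlv),
 *	                           oh.opheader.opcode = IP_FW_TABLE_XADD
 *	ipfw_obj_ctlv    ctlv;     ctlv.count = N,
 *	                           ctlv.head.length = sizeof(ctlv) +
 *	                               N * sizeof(ipfw_obj_tentry)
 *	ipfw_obj_tentry  tent[N];  each with head.length = sizeof(*tent),
 *	                           idx, masklen, subtype, key in k and
 *	                           value in v.value
 *
 * On return, each tentry's result field carries one of the IPFW_TR_*
 * codes translated from the TEI_FLAGS_* set by add/del_table_entry().
 */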
1074
1075/*
1076 * Looks up an entry in given table.
1077 * Data layout (v0)(current):
1078 * Request: [ ipfw_obj_header ipfw_obj_tentry ]
1079 * Reply: [ ipfw_obj_header ipfw_obj_tentry ]
1080 *
1081 * Returns 0 on success
1082 */
1083static int
1084find_table_entry(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
1085    struct sockopt_data *sd)
1086{
1087	ipfw_obj_tentry *tent;
1088	ipfw_obj_header *oh;
1089	struct tid_info ti;
1090	struct table_config *tc;
1091	struct table_algo *ta;
1092	struct table_info *kti;
1093	struct table_value *pval;
1094	struct namedobj_instance *ni;
1095	int error;
1096	size_t sz;
1097
1098	/* Check minimum header size */
1099	sz = sizeof(*oh) + sizeof(*tent);
1100	if (sd->valsize != sz)
1101		return (EINVAL);
1102
1103	oh = (struct _ipfw_obj_header *)ipfw_get_sopt_header(sd, sz);
1104	tent = (ipfw_obj_tentry *)(oh + 1);
1105
1106	/* Basic length checks for TLVs */
1107	if (oh->ntlv.head.length != sizeof(oh->ntlv))
1108		return (EINVAL);
1109
1110	objheader_to_ti(oh, &ti);
1111	ti.type = oh->ntlv.type;
1112	ti.uidx = tent->idx;
1113
1114	IPFW_UH_RLOCK(ch);
1115	ni = CHAIN_TO_NI(ch);
1116
	/*
	 * Find the existing table and check its type.
	 */
1120	ta = NULL;
1121	if ((tc = find_table(ni, &ti)) == NULL) {
1122		IPFW_UH_RUNLOCK(ch);
1123		return (ESRCH);
1124	}
1125
1126	/* check table type */
1127	if (tc->no.subtype != ti.type) {
1128		IPFW_UH_RUNLOCK(ch);
1129		return (EINVAL);
1130	}
1131
1132	kti = KIDX_TO_TI(ch, tc->no.kidx);
1133	ta = tc->ta;
1134
	if (ta->find_tentry == NULL) {
		IPFW_UH_RUNLOCK(ch);
		return (ENOTSUP);
	}
1137
1138	error = ta->find_tentry(tc->astate, kti, tent);
1139	if (error == 0) {
1140		pval = get_table_value(ch, tc, tent->v.kidx);
1141		ipfw_export_table_value_v1(pval, &tent->v.value);
1142	}
1143	IPFW_UH_RUNLOCK(ch);
1144
1145	return (error);
1146}
1147
1148/*
1149 * Flushes all entries or destroys given table.
1150 * Data layout (v0)(current):
1151 * Request: [ ipfw_obj_header ]
1152 *
1153 * Returns 0 on success
1154 */
1155static int
1156flush_table_v0(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
1157    struct sockopt_data *sd)
1158{
1159	int error;
1160	struct _ipfw_obj_header *oh;
1161	struct tid_info ti;
1162
1163	if (sd->valsize != sizeof(*oh))
1164		return (EINVAL);
1165
1166	oh = (struct _ipfw_obj_header *)op3;
1167	objheader_to_ti(oh, &ti);
1168
1169	if (op3->opcode == IP_FW_TABLE_XDESTROY)
1170		error = destroy_table(ch, &ti);
1171	else if (op3->opcode == IP_FW_TABLE_XFLUSH)
1172		error = flush_table(ch, &ti);
1173	else
1174		return (ENOTSUP);
1175
1176	return (error);
1177}
1178
1179static void
1180restart_flush(void *object, struct op_state *_state)
1181{
1182	struct tableop_state *ts;
1183
1184	ts = (struct tableop_state *)_state;
1185
1186	if (ts->tc != object)
1187		return;
1188
1189	/* Indicate we've called */
1190	ts->modified = 1;
1191}
1192
/*
 * Flushes the given table.
 *
 * The function creates a new table instance with the same
 * parameters, swaps it with the old one and
 * flushes the old state without holding the runtime WLOCK.
 *
 * Returns 0 on success.
 */
1202int
1203flush_table(struct ip_fw_chain *ch, struct tid_info *ti)
1204{
1205	struct namedobj_instance *ni;
1206	struct table_config *tc;
1207	struct table_algo *ta;
1208	struct table_info ti_old, ti_new, *tablestate;
1209	void *astate_old, *astate_new;
1210	char algostate[64], *pstate;
1211	struct tableop_state ts;
1212	int error, need_gc;
1213	uint16_t kidx;
1214	uint8_t tflags;
1215
1216	/*
1217	 * Stage 1: save table algorithm.
1218	 * Reference found table to ensure it won't disappear.
1219	 */
1220	IPFW_UH_WLOCK(ch);
1221	ni = CHAIN_TO_NI(ch);
1222	if ((tc = find_table(ni, ti)) == NULL) {
1223		IPFW_UH_WUNLOCK(ch);
1224		return (ESRCH);
1225	}
1226	need_gc = 0;
1227	astate_new = NULL;
1228	memset(&ti_new, 0, sizeof(ti_new));
1229restart:
1230	/* Set up swap handler */
1231	memset(&ts, 0, sizeof(ts));
1232	ts.opstate.func = restart_flush;
1233	ts.tc = tc;
1234
1235	ta = tc->ta;
1236	/* Do not flush readonly tables */
1237	if ((ta->flags & TA_FLAG_READONLY) != 0) {
1238		IPFW_UH_WUNLOCK(ch);
1239		return (EACCES);
1240	}
1241	/* Save startup algo parameters */
1242	if (ta->print_config != NULL) {
1243		ta->print_config(tc->astate, KIDX_TO_TI(ch, tc->no.kidx),
1244		    algostate, sizeof(algostate));
1245		pstate = algostate;
1246	} else
1247		pstate = NULL;
1248	tflags = tc->tflags;
1249	tc->no.refcnt++;
1250	add_toperation_state(ch, &ts);
1251	IPFW_UH_WUNLOCK(ch);
1252
1253	/*
1254	 * Stage 1.5: if this is not the first attempt, destroy previous state
1255	 */
1256	if (need_gc != 0) {
1257		ta->destroy(astate_new, &ti_new);
1258		need_gc = 0;
1259	}
1260
1261	/*
1262	 * Stage 2: allocate new table instance using same algo.
1263	 */
1264	memset(&ti_new, 0, sizeof(struct table_info));
1265	error = ta->init(ch, &astate_new, &ti_new, pstate, tflags);
1266
1267	/*
1268	 * Stage 3: swap old state pointers with newly-allocated ones.
1269	 * Decrease refcount.
1270	 */
1271	IPFW_UH_WLOCK(ch);
1272	tc->no.refcnt--;
1273	del_toperation_state(ch, &ts);
1274
1275	if (error != 0) {
1276		IPFW_UH_WUNLOCK(ch);
1277		return (error);
1278	}
1279
1280	/*
1281	 * Restart operation if table swap has happened:
1282	 * even if algo may be the same, algo init parameters
1283	 * may change. Restart operation instead of doing
1284	 * complex checks.
1285	 */
1286	if (ts.modified != 0) {
1287		/* Delay destroying data since we're holding UH lock */
1288		need_gc = 1;
1289		goto restart;
1290	}
1291
1292	ni = CHAIN_TO_NI(ch);
1293	kidx = tc->no.kidx;
1294	tablestate = (struct table_info *)ch->tablestate;
1295
1296	IPFW_WLOCK(ch);
1297	ti_old = tablestate[kidx];
1298	tablestate[kidx] = ti_new;
1299	IPFW_WUNLOCK(ch);
1300
1301	astate_old = tc->astate;
1302	tc->astate = astate_new;
1303	tc->ti_copy = ti_new;
1304	tc->count = 0;
1305
1306	/* Notify algo on real @ti address */
1307	if (ta->change_ti != NULL)
1308		ta->change_ti(tc->astate, &tablestate[kidx]);
1309
1310	/*
1311	 * Stage 4: unref values.
1312	 */
1313	ipfw_unref_table_values(ch, tc, ta, astate_old, &ti_old);
1314	IPFW_UH_WUNLOCK(ch);
1315
1316	/*
1317	 * Stage 5: perform real flush/destroy.
1318	 */
1319	ta->destroy(astate_old, &ti_old);
1320
1321	return (0);
1322}
1323
1324/*
1325 * Swaps two tables.
1326 * Data layout (v0)(current):
1327 * Request: [ ipfw_obj_header ipfw_obj_ntlv ]
1328 *
1329 * Returns 0 on success
1330 */
1331static int
1332swap_table(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
1333    struct sockopt_data *sd)
1334{
1335	int error;
1336	struct _ipfw_obj_header *oh;
1337	struct tid_info ti_a, ti_b;
1338
1339	if (sd->valsize != sizeof(*oh) + sizeof(ipfw_obj_ntlv))
1340		return (EINVAL);
1341
1342	oh = (struct _ipfw_obj_header *)op3;
1343	ntlv_to_ti(&oh->ntlv, &ti_a);
1344	ntlv_to_ti((ipfw_obj_ntlv *)(oh + 1), &ti_b);
1345
1346	error = swap_tables(ch, &ti_a, &ti_b);
1347
1348	return (error);
1349}
1350
/*
 * Swaps two tables of the same type/valtype.
 *
 * Checks whether the tables are compatible and their limits
 * permit the swap, then actually performs the swap.
 *
 * Each table consists of 2 different parts:
 * config:
 *   @tc (with name, set, kidx) and rule bindings, which is "stable"
 *   number of items
 *   table algo
 * runtime:
 *   runtime data @ti (ch->tablestate)
 *   runtime cache in @tc
 *   algo-specific data (@tc->astate)
 *
 * So we switch:
 *   all runtime data
 *   number of items
 *   table algo
 *
 * After that we call the @ti change handler for each table.
 *
 * Note that referencing @tc won't protect tc->ta from change.
 * XXX: Do we need to restrict swaps between locked tables?
 * XXX: Do we need to exchange ftype?
 *
 * Returns 0 on success.
 */
1380static int
1381swap_tables(struct ip_fw_chain *ch, struct tid_info *a,
1382    struct tid_info *b)
1383{
1384	struct namedobj_instance *ni;
1385	struct table_config *tc_a, *tc_b;
1386	struct table_algo *ta;
1387	struct table_info ti, *tablestate;
1388	void *astate;
1389	uint32_t count;
1390
1391	/*
1392	 * Stage 1: find both tables and ensure they are of
1393	 * the same type.
1394	 */
1395	IPFW_UH_WLOCK(ch);
1396	ni = CHAIN_TO_NI(ch);
1397	if ((tc_a = find_table(ni, a)) == NULL) {
1398		IPFW_UH_WUNLOCK(ch);
1399		return (ESRCH);
1400	}
1401	if ((tc_b = find_table(ni, b)) == NULL) {
1402		IPFW_UH_WUNLOCK(ch);
1403		return (ESRCH);
1404	}
1405
	/* Swapping a table with itself is a no-op */
1407	if (tc_a == tc_b) {
1408		IPFW_UH_WUNLOCK(ch);
1409		return (0);
1410	}
1411
	/* Check type and value are the same */
	if (tc_a->no.subtype != tc_b->no.subtype ||
	    tc_a->tflags != tc_b->tflags) {
1414		IPFW_UH_WUNLOCK(ch);
1415		return (EINVAL);
1416	}
1417
1418	/* Check limits before swap */
1419	if ((tc_a->limit != 0 && tc_b->count > tc_a->limit) ||
1420	    (tc_b->limit != 0 && tc_a->count > tc_b->limit)) {
1421		IPFW_UH_WUNLOCK(ch);
1422		return (EFBIG);
1423	}
1424
1425	/* Check if one of the tables is readonly */
1426	if (((tc_a->ta->flags | tc_b->ta->flags) & TA_FLAG_READONLY) != 0) {
1427		IPFW_UH_WUNLOCK(ch);
1428		return (EACCES);
1429	}
1430
1431	/* Notify we're going to swap */
1432	rollback_toperation_state(ch, tc_a);
1433	rollback_toperation_state(ch, tc_b);
1434
1435	/* Everything is fine, prepare to swap */
1436	tablestate = (struct table_info *)ch->tablestate;
1437	ti = tablestate[tc_a->no.kidx];
1438	ta = tc_a->ta;
1439	astate = tc_a->astate;
1440	count = tc_a->count;
1441
1442	IPFW_WLOCK(ch);
1443	/* a <- b */
1444	tablestate[tc_a->no.kidx] = tablestate[tc_b->no.kidx];
1445	tc_a->ta = tc_b->ta;
1446	tc_a->astate = tc_b->astate;
1447	tc_a->count = tc_b->count;
1448	/* b <- a */
1449	tablestate[tc_b->no.kidx] = ti;
1450	tc_b->ta = ta;
1451	tc_b->astate = astate;
1452	tc_b->count = count;
1453	IPFW_WUNLOCK(ch);
1454
1455	/* Ensure tc.ti copies are in sync */
1456	tc_a->ti_copy = tablestate[tc_a->no.kidx];
1457	tc_b->ti_copy = tablestate[tc_b->no.kidx];
1458
1459	/* Notify both tables on @ti change */
1460	if (tc_a->ta->change_ti != NULL)
1461		tc_a->ta->change_ti(tc_a->astate, &tablestate[tc_a->no.kidx]);
1462	if (tc_b->ta->change_ti != NULL)
1463		tc_b->ta->change_ti(tc_b->astate, &tablestate[tc_b->no.kidx]);
1464
1465	IPFW_UH_WUNLOCK(ch);
1466
1467	return (0);
1468}
1469
1470/*
1471 * Destroys table specified by @ti.
1472 * Data layout (v0)(current):
1473 * Request: [ ip_fw3_opheader ]
1474 *
1475 * Returns 0 on success
1476 */
1477static int
1478destroy_table(struct ip_fw_chain *ch, struct tid_info *ti)
1479{
1480	struct namedobj_instance *ni;
1481	struct table_config *tc;
1482
1483	IPFW_UH_WLOCK(ch);
1484
1485	ni = CHAIN_TO_NI(ch);
1486	if ((tc = find_table(ni, ti)) == NULL) {
1487		IPFW_UH_WUNLOCK(ch);
1488		return (ESRCH);
1489	}
1490
1491	/* Do not permit destroying referenced tables */
1492	if (tc->no.refcnt > 0) {
1493		IPFW_UH_WUNLOCK(ch);
1494		return (EBUSY);
1495	}
1496
1497	IPFW_WLOCK(ch);
1498	unlink_table(ch, tc);
1499	IPFW_WUNLOCK(ch);
1500
1501	/* Free obj index */
1502	if (ipfw_objhash_free_idx(ni, tc->no.kidx) != 0)
1503		printf("Error unlinking kidx %d from table %s\n",
1504		    tc->no.kidx, tc->tablename);
1505
1506	/* Unref values used in tables while holding UH lock */
1507	ipfw_unref_table_values(ch, tc, tc->ta, tc->astate, &tc->ti_copy);
1508	IPFW_UH_WUNLOCK(ch);
1509
1510	free_table_config(ni, tc);
1511
1512	return (0);
1513}
1514
1515static uint32_t
1516roundup2p(uint32_t v)
1517{
1518
1519	v--;
1520	v |= v >> 1;
1521	v |= v >> 2;
1522	v |= v >> 4;
1523	v |= v >> 8;
1524	v |= v >> 16;
1525	v++;
1526
1527	return (v);
1528}
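
/*
 * Example: the bit-smearing above rounds up to the next power of two,
 * e.g. roundup2p(1000) == 1024 and roundup2p(1024) == 1024.
 */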
1529
/*
 * Grows the tables index.
 *
 * Returns 0 on success.
 */
1535int
1536ipfw_resize_tables(struct ip_fw_chain *ch, unsigned int ntables)
1537{
1538	unsigned int ntables_old, tbl;
1539	struct namedobj_instance *ni;
1540	void *new_idx, *old_tablestate, *tablestate;
1541	struct table_info *ti;
1542	struct table_config *tc;
1543	int i, new_blocks;
1544
1545	/* Check new value for validity */
1546	if (ntables == 0)
1547		return (EINVAL);
1548	if (ntables > IPFW_TABLES_MAX)
1549		ntables = IPFW_TABLES_MAX;
	/* Round up to the nearest power of 2 */
1551	ntables = (unsigned int)roundup2p(ntables);
1552
1553	/* Allocate new pointers */
1554	tablestate = malloc(ntables * sizeof(struct table_info),
1555	    M_IPFW, M_WAITOK | M_ZERO);
1556
1557	ipfw_objhash_bitmap_alloc(ntables, (void *)&new_idx, &new_blocks);
1558
1559	IPFW_UH_WLOCK(ch);
1560
1561	tbl = (ntables >= V_fw_tables_max) ? V_fw_tables_max : ntables;
1562	ni = CHAIN_TO_NI(ch);
1563
	/* Temporarily restrict decreasing max_tables */
1565	if (ntables < V_fw_tables_max) {
1566
1567		/*
1568		 * FIXME: Check if we really can shrink
1569		 */
1570		IPFW_UH_WUNLOCK(ch);
1571		return (EINVAL);
1572	}
1573
1574	/* Copy table info/indices */
1575	memcpy(tablestate, ch->tablestate, sizeof(struct table_info) * tbl);
1576	ipfw_objhash_bitmap_merge(ni, &new_idx, &new_blocks);
1577
1578	IPFW_WLOCK(ch);
1579
1580	/* Change pointers */
1581	old_tablestate = ch->tablestate;
1582	ch->tablestate = tablestate;
1583	ipfw_objhash_bitmap_swap(ni, &new_idx, &new_blocks);
1584
1585	ntables_old = V_fw_tables_max;
1586	V_fw_tables_max = ntables;
1587
1588	IPFW_WUNLOCK(ch);
1589
1590	/* Notify all consumers that their @ti pointer has changed */
1591	ti = (struct table_info *)ch->tablestate;
1592	for (i = 0; i < tbl; i++, ti++) {
1593		if (ti->lookup == NULL)
1594			continue;
1595		tc = (struct table_config *)ipfw_objhash_lookup_kidx(ni, i);
1596		if (tc == NULL || tc->ta->change_ti == NULL)
1597			continue;
1598
1599		tc->ta->change_ti(tc->astate, ti);
1600	}
1601
1602	IPFW_UH_WUNLOCK(ch);
1603
1604	/* Free old pointers */
1605	free(old_tablestate, M_IPFW);
1606	ipfw_objhash_bitmap_free(new_idx, new_blocks);
1607
1608	return (0);
1609}
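
/*
 * A typical way to reach this grow path is raising the
 * net.inet.ip.fw.tables_max sysctl (the same knob mentioned in the
 * allocation-failure message in create_table_internal() below), e.g.:
 *
 *	sysctl net.inet.ip.fw.tables_max=2048
 *
 * The requested value is rounded up to a power of two, and shrinking
 * is currently rejected as noted above.
 */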
1610
1611/*
1612 * Lookup table's named object by its @kidx.
1613 */
1614struct named_object *
1615ipfw_objhash_lookup_table_kidx(struct ip_fw_chain *ch, uint16_t kidx)
1616{
1617
1618	return (ipfw_objhash_lookup_kidx(CHAIN_TO_NI(ch), kidx));
1619}
1620
1621/*
1622 * Take reference to table specified in @ntlv.
1623 * On success return its @kidx.
1624 */
1625int
1626ipfw_ref_table(struct ip_fw_chain *ch, ipfw_obj_ntlv *ntlv, uint16_t *kidx)
1627{
1628	struct tid_info ti;
1629	struct table_config *tc;
1630	int error;
1631
1632	IPFW_UH_WLOCK_ASSERT(ch);
1633
1634	ntlv_to_ti(ntlv, &ti);
1635	error = find_table_err(CHAIN_TO_NI(ch), &ti, &tc);
1636	if (error != 0)
1637		return (error);
1638
1639	if (tc == NULL)
1640		return (ESRCH);
1641
1642	tc_ref(tc);
1643	*kidx = tc->no.kidx;
1644
1645	return (0);
1646}
1647
1648void
1649ipfw_unref_table(struct ip_fw_chain *ch, uint16_t kidx)
1650{
1651
1652	struct namedobj_instance *ni;
1653	struct named_object *no;
1654
1655	IPFW_UH_WLOCK_ASSERT(ch);
1656	ni = CHAIN_TO_NI(ch);
1657	no = ipfw_objhash_lookup_kidx(ni, kidx);
1658	KASSERT(no != NULL, ("Table with index %d not found", kidx));
1659	no->refcnt--;
1660}
1661
/*
 * Looks up an arbitrary key @paddr of length @plen in table @tbl.
 * Stores the found value in @val.
 *
 * Returns 1 if the key was found.
 */
1668int
1669ipfw_lookup_table(struct ip_fw_chain *ch, uint16_t tbl, uint16_t plen,
1670    void *paddr, uint32_t *val)
1671{
1672	struct table_info *ti;
1673
1674	ti = KIDX_TO_TI(ch, tbl);
1675
1676	return (ti->lookup(ti, paddr, plen, val));
1677}
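
/*
 * Illustrative (hypothetical) use, looking up an IPv4 address held in
 * @src against table @kidx:
 *
 *	struct in_addr src;	(filled from the packet)
 *	uint32_t val;
 *
 *	if (ipfw_lookup_table(ch, kidx, sizeof(in_addr_t), &src, &val) != 0) {
 *		(matched: @val now holds the stored table value)
 *	}
 *
 * Note no locking is performed here; the caller is assumed to provide
 * the usual ipfw runtime protection.
 */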
1678
/*
 * Info/List/dump support for tables.
 */

/*
 * High-level 'get' command sockopt handlers
 */
1687
1688/*
1689 * Lists all tables currently available in kernel.
1690 * Data layout (v0)(current):
1691 * Request: [ ipfw_obj_lheader ], size = ipfw_obj_lheader.size
1692 * Reply: [ ipfw_obj_lheader ipfw_xtable_info x N ]
1693 *
1694 * Returns 0 on success
1695 */
1696static int
1697list_tables(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
1698    struct sockopt_data *sd)
1699{
1700	struct _ipfw_obj_lheader *olh;
1701	int error;
1702
1703	olh = (struct _ipfw_obj_lheader *)ipfw_get_sopt_header(sd,sizeof(*olh));
1704	if (olh == NULL)
1705		return (EINVAL);
1706	if (sd->valsize < olh->size)
1707		return (EINVAL);
1708
1709	IPFW_UH_RLOCK(ch);
1710	error = export_tables(ch, olh, sd);
1711	IPFW_UH_RUNLOCK(ch);
1712
1713	return (error);
1714}
1715
1716/*
1717 * Store table info to buffer provided by @sd.
1718 * Data layout (v0)(current):
1719 * Request: [ ipfw_obj_header ipfw_xtable_info(empty)]
1720 * Reply: [ ipfw_obj_header ipfw_xtable_info ]
1721 *
1722 * Returns 0 on success.
1723 */
1724static int
1725describe_table(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
1726    struct sockopt_data *sd)
1727{
1728	struct _ipfw_obj_header *oh;
1729	struct table_config *tc;
1730	struct tid_info ti;
1731	size_t sz;
1732
1733	sz = sizeof(*oh) + sizeof(ipfw_xtable_info);
1734	oh = (struct _ipfw_obj_header *)ipfw_get_sopt_header(sd, sz);
1735	if (oh == NULL)
1736		return (EINVAL);
1737
1738	objheader_to_ti(oh, &ti);
1739
1740	IPFW_UH_RLOCK(ch);
1741	if ((tc = find_table(CHAIN_TO_NI(ch), &ti)) == NULL) {
1742		IPFW_UH_RUNLOCK(ch);
1743		return (ESRCH);
1744	}
1745
1746	export_table_info(ch, tc, (ipfw_xtable_info *)(oh + 1));
1747	IPFW_UH_RUNLOCK(ch);
1748
1749	return (0);
1750}
1751
1752/*
1753 * Modifies existing table.
1754 * Data layout (v0)(current):
1755 * Request: [ ipfw_obj_header ipfw_xtable_info ]
1756 *
1757 * Returns 0 on success
1758 */
1759static int
1760modify_table(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
1761    struct sockopt_data *sd)
1762{
1763	struct _ipfw_obj_header *oh;
1764	ipfw_xtable_info *i;
1765	char *tname;
1766	struct tid_info ti;
1767	struct namedobj_instance *ni;
1768	struct table_config *tc;
1769
1770	if (sd->valsize != sizeof(*oh) + sizeof(ipfw_xtable_info))
1771		return (EINVAL);
1772
1773	oh = (struct _ipfw_obj_header *)sd->kbuf;
1774	i = (ipfw_xtable_info *)(oh + 1);
1775
	/*
	 * Verify user-supplied strings.
	 * Check for null-terminated/zero-length strings.
	 */
1780	tname = oh->ntlv.name;
1781	if (check_table_name(tname) != 0)
1782		return (EINVAL);
1783
1784	objheader_to_ti(oh, &ti);
1785	ti.type = i->type;
1786
1787	IPFW_UH_WLOCK(ch);
1788	ni = CHAIN_TO_NI(ch);
1789	if ((tc = find_table(ni, &ti)) == NULL) {
1790		IPFW_UH_WUNLOCK(ch);
1791		return (ESRCH);
1792	}
1793
1794	/* Do not support any modifications for readonly tables */
1795	if ((tc->ta->flags & TA_FLAG_READONLY) != 0) {
1796		IPFW_UH_WUNLOCK(ch);
1797		return (EACCES);
1798	}
1799
1800	if ((i->mflags & IPFW_TMFLAGS_LIMIT) != 0)
1801		tc->limit = i->limit;
1802	if ((i->mflags & IPFW_TMFLAGS_LOCK) != 0)
1803		tc->locked = ((i->flags & IPFW_TGFLAGS_LOCKED) != 0);
1804	IPFW_UH_WUNLOCK(ch);
1805
1806	return (0);
1807}
1808
1809/*
1810 * Creates new table.
1811 * Data layout (v0)(current):
1812 * Request: [ ipfw_obj_header ipfw_xtable_info ]
1813 *
1814 * Returns 0 on success
1815 */
1816static int
1817create_table(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
1818    struct sockopt_data *sd)
1819{
1820	struct _ipfw_obj_header *oh;
1821	ipfw_xtable_info *i;
1822	char *tname, *aname;
1823	struct tid_info ti;
1824	struct namedobj_instance *ni;
1825
1826	if (sd->valsize != sizeof(*oh) + sizeof(ipfw_xtable_info))
1827		return (EINVAL);
1828
1829	oh = (struct _ipfw_obj_header *)sd->kbuf;
1830	i = (ipfw_xtable_info *)(oh + 1);
1831
	/*
	 * Verify user-supplied strings.
	 * Check for null-terminated/zero-length strings.
	 */
1836	tname = oh->ntlv.name;
1837	aname = i->algoname;
1838	if (check_table_name(tname) != 0 ||
1839	    strnlen(aname, sizeof(i->algoname)) == sizeof(i->algoname))
1840		return (EINVAL);
1841
1842	if (aname[0] == '\0') {
1843		/* Use default algorithm */
1844		aname = NULL;
1845	}
1846
1847	objheader_to_ti(oh, &ti);
1848	ti.type = i->type;
1849
1850	ni = CHAIN_TO_NI(ch);
1851
1852	IPFW_UH_RLOCK(ch);
1853	if (find_table(ni, &ti) != NULL) {
1854		IPFW_UH_RUNLOCK(ch);
1855		return (EEXIST);
1856	}
1857	IPFW_UH_RUNLOCK(ch);
1858
1859	return (create_table_internal(ch, &ti, aname, i, NULL, 0));
1860}
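
/*
 * Illustrative (hypothetical) filling of the ipfw_xtable_info part of
 * a create request as parsed above, e.g. for an address table limited
 * to 1000 entries:
 *
 *	ipfw_xtable_info i;
 *
 *	memset(&i, 0, sizeof(i));
 *	i.type = IPFW_TABLE_ADDR;
 *	i.vmask = IPFW_VTYPE_LEGACY;
 *	i.limit = 1000;
 *	strlcpy(i.algoname, "addr:radix", sizeof(i.algoname));
 *
 * An empty algoname selects the default algorithm for the table type;
 * "addr:radix" is assumed here as a commonly used algorithm name for
 * IPFW_TABLE_ADDR tables, but any algorithm registered in
 * ip_fw_table_algo.c may be named instead.
 */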
1861
/*
 * Creates a new table based on @ti and @aname.
 *
 * Assumes @aname to be checked and valid.
 * Stores the allocated table kidx in @pkidx (if non-NULL).
 * References the created table if @compat is non-zero.
 *
 * Returns 0 on success.
 */
1871static int
1872create_table_internal(struct ip_fw_chain *ch, struct tid_info *ti,
1873    char *aname, ipfw_xtable_info *i, uint16_t *pkidx, int compat)
1874{
1875	struct namedobj_instance *ni;
1876	struct table_config *tc, *tc_new, *tmp;
1877	struct table_algo *ta;
1878	uint16_t kidx;
1879
1880	ni = CHAIN_TO_NI(ch);
1881
1882	ta = find_table_algo(CHAIN_TO_TCFG(ch), ti, aname);
1883	if (ta == NULL)
1884		return (ENOTSUP);
1885
1886	tc = alloc_table_config(ch, ti, ta, aname, i->tflags);
1887	if (tc == NULL)
1888		return (ENOMEM);
1889
1890	tc->vmask = i->vmask;
1891	tc->limit = i->limit;
1892	if (ta->flags & TA_FLAG_READONLY)
1893		tc->locked = 1;
1894	else
1895		tc->locked = (i->flags & IPFW_TGFLAGS_LOCKED) != 0;
1896
1897	IPFW_UH_WLOCK(ch);
1898
	/* Check if the table has already been created */
1900	tc_new = find_table(ni, ti);
1901	if (tc_new != NULL) {
1902
1903		/*
1904		 * Compat: do not fail if we're
1905		 * requesting to create existing table
1906		 * which has the same type
1907		 */
1908		if (compat == 0 || tc_new->no.subtype != tc->no.subtype) {
1909			IPFW_UH_WUNLOCK(ch);
1910			free_table_config(ni, tc);
1911			return (EEXIST);
1912		}
1913
1914		/* Exchange tc and tc_new for proper refcounting & freeing */
1915		tmp = tc;
1916		tc = tc_new;
1917		tc_new = tmp;
1918	} else {
1919		/* New table */
1920		if (ipfw_objhash_alloc_idx(ni, &kidx) != 0) {
1921			IPFW_UH_WUNLOCK(ch);
			printf("Unable to allocate table index."
			    " Consider increasing net.inet.ip.fw.tables_max\n");
1924			free_table_config(ni, tc);
1925			return (EBUSY);
1926		}
1927		tc->no.kidx = kidx;
1928		tc->no.etlv = IPFW_TLV_TBL_NAME;
1929
1930		link_table(ch, tc);
1931	}
1932
1933	if (compat != 0)
1934		tc->no.refcnt++;
1935	if (pkidx != NULL)
1936		*pkidx = tc->no.kidx;
1937
1938	IPFW_UH_WUNLOCK(ch);
1939
1940	if (tc_new != NULL)
1941		free_table_config(ni, tc_new);
1942
1943	return (0);
1944}
1945
1946static void
1947ntlv_to_ti(ipfw_obj_ntlv *ntlv, struct tid_info *ti)
1948{
1949
1950	memset(ti, 0, sizeof(struct tid_info));
1951	ti->set = ntlv->set;
1952	ti->uidx = ntlv->idx;
1953	ti->tlvs = ntlv;
1954	ti->tlen = ntlv->head.length;
1955}
1956
1957static void
1958objheader_to_ti(struct _ipfw_obj_header *oh, struct tid_info *ti)
1959{
1960
1961	ntlv_to_ti(&oh->ntlv, ti);
1962}
1963
1964struct namedobj_instance *
1965ipfw_get_table_objhash(struct ip_fw_chain *ch)
1966{
1967
1968	return (CHAIN_TO_NI(ch));
1969}
1970
1971/*
1972 * Exports basic table info as name TLV.
1973 * Used inside dump_static_rules() to provide info
1974 * about all tables referenced by current ruleset.
1975 *
1976 * Returns 0 on success.
1977 */
1978int
1979ipfw_export_table_ntlv(struct ip_fw_chain *ch, uint16_t kidx,
1980    struct sockopt_data *sd)
1981{
1982	struct namedobj_instance *ni;
1983	struct named_object *no;
1984	ipfw_obj_ntlv *ntlv;
1985
1986	ni = CHAIN_TO_NI(ch);
1987
1988	no = ipfw_objhash_lookup_kidx(ni, kidx);
1989	KASSERT(no != NULL, ("invalid table kidx passed"));
1990
1991	ntlv = (ipfw_obj_ntlv *)ipfw_get_sopt_space(sd, sizeof(*ntlv));
1992	if (ntlv == NULL)
1993		return (ENOMEM);
1994
1995	ntlv->head.type = IPFW_TLV_TBL_NAME;
1996	ntlv->head.length = sizeof(*ntlv);
1997	ntlv->idx = no->kidx;
1998	strlcpy(ntlv->name, no->name, sizeof(ntlv->name));
1999
2000	return (0);
2001}
2002
2003struct dump_args {
2004	struct ip_fw_chain *ch;
2005	struct table_info *ti;
2006	struct table_config *tc;
2007	struct sockopt_data *sd;
2008	uint32_t cnt;
2009	uint16_t uidx;
2010	int error;
2011	uint32_t size;
2012	ipfw_table_entry *ent;
2013	ta_foreach_f *f;
2014	void *farg;
2015	ipfw_obj_tentry tent;
2016};
2017
2018static int
2019count_ext_entries(void *e, void *arg)
2020{
2021	struct dump_args *da;
2022
2023	da = (struct dump_args *)arg;
2024	da->cnt++;
2025
2026	return (0);
2027}
2028
2029/*
2030 * Gets number of items from table either using
2031 * internal counter or calling algo callback for
2032 * externally-managed tables.
2033 *
2034 * Returns number of records.
2035 */
2036static uint32_t
2037table_get_count(struct ip_fw_chain *ch, struct table_config *tc)
2038{
2039	struct table_info *ti;
2040	struct table_algo *ta;
2041	struct dump_args da;
2042
2043	ti = KIDX_TO_TI(ch, tc->no.kidx);
2044	ta = tc->ta;
2045
2046	/* Use internal counter for self-managed tables */
2047	if ((ta->flags & TA_FLAG_READONLY) == 0)
2048		return (tc->count);
2049
2050	/* Use callback to quickly get number of items */
2051	if ((ta->flags & TA_FLAG_EXTCOUNTER) != 0)
2052		return (ta->get_count(tc->astate, ti));
2053
	/* Count the number of items ourselves */
2055	memset(&da, 0, sizeof(da));
2056	ta->foreach(tc->astate, ti, count_ext_entries, &da);
2057
2058	return (da.cnt);
2059}
2060
2061/*
2062 * Exports table @tc info into standard ipfw_xtable_info format.
2063 */
2064static void
2065export_table_info(struct ip_fw_chain *ch, struct table_config *tc,
2066    ipfw_xtable_info *i)
2067{
2068	struct table_info *ti;
2069	struct table_algo *ta;
2070
2071	i->type = tc->no.subtype;
2072	i->tflags = tc->tflags;
2073	i->vmask = tc->vmask;
2074	i->set = tc->no.set;
2075	i->kidx = tc->no.kidx;
2076	i->refcnt = tc->no.refcnt;
2077	i->count = table_get_count(ch, tc);
2078	i->limit = tc->limit;
2079	i->flags |= (tc->locked != 0) ? IPFW_TGFLAGS_LOCKED : 0;
2080	i->size = i->count * sizeof(ipfw_obj_tentry);
2081	i->size += sizeof(ipfw_obj_header) + sizeof(ipfw_xtable_info);
2082	strlcpy(i->tablename, tc->tablename, sizeof(i->tablename));
2083	ti = KIDX_TO_TI(ch, tc->no.kidx);
2084	ta = tc->ta;
2085	if (ta->print_config != NULL) {
2086		/* Use algo function to print table config to string */
2087		ta->print_config(tc->astate, ti, i->algoname,
2088		    sizeof(i->algoname));
2089	} else
2090		strlcpy(i->algoname, ta->name, sizeof(i->algoname));
2091	/* Dump algo-specific data, if possible */
2092	if (ta->dump_tinfo != NULL) {
2093		ta->dump_tinfo(tc->astate, ti, &i->ta_info);
2094		i->ta_info.flags |= IPFW_TATFLAGS_DATA;
2095	}
2096}
2097
2098struct dump_table_args {
2099	struct ip_fw_chain *ch;
2100	struct sockopt_data *sd;
2101};
2102
2103static int
2104export_table_internal(struct namedobj_instance *ni, struct named_object *no,
2105    void *arg)
2106{
2107	ipfw_xtable_info *i;
2108	struct dump_table_args *dta;
2109
2110	dta = (struct dump_table_args *)arg;
2111
2112	i = (ipfw_xtable_info *)ipfw_get_sopt_space(dta->sd, sizeof(*i));
2113	KASSERT(i != NULL, ("previously checked buffer is not enough"));
2114
2115	export_table_info(dta->ch, (struct table_config *)no, i);
2116	return (0);
2117}
2118
2119/*
2120 * Export all tables as ipfw_xtable_info structures to
2121 * storage provided by @sd.
2122 *
2123 * If supplied buffer is too small, fills in required size
2124 * and returns ENOMEM.
2125 * Returns 0 on success.
2126 */
2127static int
2128export_tables(struct ip_fw_chain *ch, ipfw_obj_lheader *olh,
2129    struct sockopt_data *sd)
2130{
2131	uint32_t size;
2132	uint32_t count;
2133	struct dump_table_args dta;
2134
2135	count = ipfw_objhash_count(CHAIN_TO_NI(ch));
2136	size = count * sizeof(ipfw_xtable_info) + sizeof(ipfw_obj_lheader);
2137
	/* Fill in header regardless of buffer size */
2139	olh->count = count;
2140	olh->objsize = sizeof(ipfw_xtable_info);
2141
2142	if (size > olh->size) {
2143		olh->size = size;
2144		return (ENOMEM);
2145	}
2146
2147	olh->size = size;
2148
2149	dta.ch = ch;
2150	dta.sd = sd;
2151
2152	ipfw_objhash_foreach(CHAIN_TO_NI(ch), export_table_internal, &dta);
2153
2154	return (0);
2155}
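
/*
 * Illustrative only: a userland consumer (ipfw(8) does something similar)
 * is expected to probe the required size and retry on ENOMEM, roughly as
 * in the following sketch (error handling trimmed; `s' is assumed to be
 * the raw socket used for ipfw sockopts):
 *
 *	ipfw_obj_lheader *olh;
 *	socklen_t sz;
 *
 *	sz = sizeof(*olh);
 *	for (;;) {
 *		olh = calloc(1, sz);
 *		olh->opheader.opcode = IP_FW_TABLES_XLIST;
 *		olh->size = sz;
 *		if (getsockopt(s, IPPROTO_IP, IP_FW3, olh, &sz) == 0)
 *			break;	// olh->count infos follow the header
 *		sz = olh->size;	// required size reported by the kernel
 *		free(olh);
 *		if (errno != ENOMEM)
 *			return (errno);
 *	}
 */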
2156
2157/*
2158 * Dumps all table data
2159 * Data layout (v1)(current):
2160 * Request: [ ipfw_obj_header ], size = ipfw_xtable_info.size
2161 * Reply: [ ipfw_obj_header ipfw_xtable_info ipfw_obj_tentry x N ]
2162 *
2163 * Returns 0 on success
2164 */
2165static int
2166dump_table_v1(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
2167    struct sockopt_data *sd)
2168{
2169	struct _ipfw_obj_header *oh;
2170	ipfw_xtable_info *i;
2171	struct tid_info ti;
2172	struct table_config *tc;
2173	struct table_algo *ta;
2174	struct dump_args da;
2175	uint32_t sz;
2176
2177	sz = sizeof(ipfw_obj_header) + sizeof(ipfw_xtable_info);
2178	oh = (struct _ipfw_obj_header *)ipfw_get_sopt_header(sd, sz);
2179	if (oh == NULL)
2180		return (EINVAL);
2181
2182	i = (ipfw_xtable_info *)(oh + 1);
2183	objheader_to_ti(oh, &ti);
2184
2185	IPFW_UH_RLOCK(ch);
2186	if ((tc = find_table(CHAIN_TO_NI(ch), &ti)) == NULL) {
2187		IPFW_UH_RUNLOCK(ch);
2188		return (ESRCH);
2189	}
2190	export_table_info(ch, tc, i);
2191
	if (sd->valsize < i->size) {
		/*
		 * Submitted buffer size is not enough.
		 * We've already filled in the @i structure with
		 * relevant table info including size, so we
		 * can return. Buffer will be flushed automatically.
		 */
2200		IPFW_UH_RUNLOCK(ch);
2201		return (ENOMEM);
2202	}
2203
2204	/*
2205	 * Do the actual dump in eXtended format
2206	 */
2207	memset(&da, 0, sizeof(da));
2208	da.ch = ch;
2209	da.ti = KIDX_TO_TI(ch, tc->no.kidx);
2210	da.tc = tc;
2211	da.sd = sd;
2212
2213	ta = tc->ta;
2214
2215	ta->foreach(tc->astate, da.ti, dump_table_tentry, &da);
2216	IPFW_UH_RUNLOCK(ch);
2217
2218	return (da.error);
2219}
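
/*
 * Illustrative only: a sketch of how a userland caller might build the
 * v1 request above (ipfw(8) follows this pattern; `s' and `bufsize' are
 * as in the previous sketch, and "hosts" is just an example table name):
 *
 *	ipfw_obj_header *oh;
 *
 *	oh = calloc(1, bufsize);
 *	oh->opheader.opcode = IP_FW_TABLE_XLIST;
 *	oh->opheader.version = 1;
 *	oh->idx = 1;
 *	oh->ntlv.head.type = IPFW_TLV_TBL_NAME;
 *	oh->ntlv.head.length = sizeof(oh->ntlv);
 *	oh->ntlv.idx = 1;
 *	strlcpy(oh->ntlv.name, "hosts", sizeof(oh->ntlv.name));
 *	getsockopt(s, IPPROTO_IP, IP_FW3, oh, &bufsize);
 *	// on success: oh, then ipfw_xtable_info, then ipfw_obj_tentry x N
 */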
2220
2221/*
2222 * Dumps all table data
2223 * Data layout (version 0)(legacy):
2224 * Request: [ ipfw_xtable ], size = IP_FW_TABLE_XGETSIZE()
2225 * Reply: [ ipfw_xtable ipfw_table_xentry x N ]
2226 *
2227 * Returns 0 on success
2228 */
2229static int
2230dump_table_v0(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
2231    struct sockopt_data *sd)
2232{
2233	ipfw_xtable *xtbl;
2234	struct tid_info ti;
2235	struct table_config *tc;
2236	struct table_algo *ta;
2237	struct dump_args da;
2238	size_t sz, count;
2239
2240	xtbl = (ipfw_xtable *)ipfw_get_sopt_header(sd, sizeof(ipfw_xtable));
2241	if (xtbl == NULL)
2242		return (EINVAL);
2243
2244	memset(&ti, 0, sizeof(ti));
2245	ti.uidx = xtbl->tbl;
2246
2247	IPFW_UH_RLOCK(ch);
2248	if ((tc = find_table(CHAIN_TO_NI(ch), &ti)) == NULL) {
2249		IPFW_UH_RUNLOCK(ch);
2250		return (0);
2251	}
2252	count = table_get_count(ch, tc);
2253	sz = count * sizeof(ipfw_table_xentry) + sizeof(ipfw_xtable);
2254
2255	xtbl->cnt = count;
2256	xtbl->size = sz;
2257	xtbl->type = tc->no.subtype;
2258	xtbl->tbl = ti.uidx;
2259
	if (sd->valsize < sz) {
		/*
		 * Submitted buffer size is not enough.
		 * We've already filled in @xtbl with relevant
		 * table info including size, so we can return.
		 * Buffer will be flushed automatically.
		 */
2268		IPFW_UH_RUNLOCK(ch);
2269		return (ENOMEM);
2270	}
2271
2272	/* Do the actual dump in eXtended format */
2273	memset(&da, 0, sizeof(da));
2274	da.ch = ch;
2275	da.ti = KIDX_TO_TI(ch, tc->no.kidx);
2276	da.tc = tc;
2277	da.sd = sd;
2278
2279	ta = tc->ta;
2280
2281	ta->foreach(tc->astate, da.ti, dump_table_xentry, &da);
2282	IPFW_UH_RUNLOCK(ch);
2283
2284	return (0);
2285}
2286
2287/*
2288 * Legacy function to retrieve number of items in table.
2289 */
2290static int
2291get_table_size(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
2292    struct sockopt_data *sd)
2293{
2294	uint32_t *tbl;
2295	struct tid_info ti;
2296	size_t sz;
2297	int error;
2298
2299	sz = sizeof(*op3) + sizeof(uint32_t);
2300	op3 = (ip_fw3_opheader *)ipfw_get_sopt_header(sd, sz);
2301	if (op3 == NULL)
2302		return (EINVAL);
2303
2304	tbl = (uint32_t *)(op3 + 1);
2305	memset(&ti, 0, sizeof(ti));
2306	ti.uidx = *tbl;
2307	IPFW_UH_RLOCK(ch);
2308	error = ipfw_count_xtable(ch, &ti, tbl);
2309	IPFW_UH_RUNLOCK(ch);
2310	return (error);
2311}
2312
2313/*
2314 * Legacy IP_FW_TABLE_GETSIZE handler
2315 */
2316int
2317ipfw_count_table(struct ip_fw_chain *ch, struct tid_info *ti, uint32_t *cnt)
2318{
2319	struct table_config *tc;
2320
2321	if ((tc = find_table(CHAIN_TO_NI(ch), ti)) == NULL)
2322		return (ESRCH);
2323	*cnt = table_get_count(ch, tc);
2324	return (0);
2325}
2326
2327/*
2328 * Legacy IP_FW_TABLE_XGETSIZE handler
2329 */
2330int
2331ipfw_count_xtable(struct ip_fw_chain *ch, struct tid_info *ti, uint32_t *cnt)
2332{
2333	struct table_config *tc;
2334	uint32_t count;
2335
2336	if ((tc = find_table(CHAIN_TO_NI(ch), ti)) == NULL) {
2337		*cnt = 0;
2338		return (0); /* 'table all list' requires success */
2339	}
2340
2341	count = table_get_count(ch, tc);
2342	*cnt = count * sizeof(ipfw_table_xentry);
2343	if (count > 0)
2344		*cnt += sizeof(ipfw_xtable);
2345	return (0);
2346}
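
/*
 * E.g. (illustrative): a table holding 100 entries is reported as
 * 100 * sizeof(ipfw_table_xentry) + sizeof(ipfw_xtable) bytes, which is
 * exactly the buffer a subsequent IP_FW_TABLE_XLIST (v0) dump needs.
 */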
2347
2348static int
2349dump_table_entry(void *e, void *arg)
2350{
2351	struct dump_args *da;
2352	struct table_config *tc;
2353	struct table_algo *ta;
2354	ipfw_table_entry *ent;
2355	struct table_value *pval;
2356	int error;
2357
2358	da = (struct dump_args *)arg;
2359
2360	tc = da->tc;
2361	ta = tc->ta;
2362
	/* Out of room in the caller-supplied buffer, stop the dump */
2364	if (da->cnt == da->size)
2365		return (1);
2366	ent = da->ent++;
2367	ent->tbl = da->uidx;
2368	da->cnt++;
2369
2370	error = ta->dump_tentry(tc->astate, da->ti, e, &da->tent);
2371	if (error != 0)
2372		return (error);
2373
2374	ent->addr = da->tent.k.addr.s_addr;
2375	ent->masklen = da->tent.masklen;
2376	pval = get_table_value(da->ch, da->tc, da->tent.v.kidx);
2377	ent->value = ipfw_export_table_value_legacy(pval);
2378
2379	return (0);
2380}
2381
2382/*
2383 * Dumps table in pre-8.1 legacy format.
2384 */
2385int
2386ipfw_dump_table_legacy(struct ip_fw_chain *ch, struct tid_info *ti,
2387    ipfw_table *tbl)
2388{
2389	struct table_config *tc;
2390	struct table_algo *ta;
2391	struct dump_args da;
2392
2393	tbl->cnt = 0;
2394
2395	if ((tc = find_table(CHAIN_TO_NI(ch), ti)) == NULL)
2396		return (0);	/* XXX: We should return ESRCH */
2397
2398	ta = tc->ta;
2399
2400	/* This dump format supports IPv4 only */
2401	if (tc->no.subtype != IPFW_TABLE_ADDR)
2402		return (0);
2403
2404	memset(&da, 0, sizeof(da));
2405	da.ch = ch;
2406	da.ti = KIDX_TO_TI(ch, tc->no.kidx);
2407	da.tc = tc;
2408	da.ent = &tbl->ent[0];
2409	da.size = tbl->size;
2410
2411	tbl->cnt = 0;
2412	ta->foreach(tc->astate, da.ti, dump_table_entry, &da);
2413	tbl->cnt = da.cnt;
2414
2415	return (0);
2416}
2417
2418/*
2419 * Dumps table entry in eXtended format (v1)(current).
2420 */
2421static int
2422dump_table_tentry(void *e, void *arg)
2423{
2424	struct dump_args *da;
2425	struct table_config *tc;
2426	struct table_algo *ta;
2427	struct table_value *pval;
2428	ipfw_obj_tentry *tent;
2429	int error;
2430
2431	da = (struct dump_args *)arg;
2432
2433	tc = da->tc;
2434	ta = tc->ta;
2435
2436	tent = (ipfw_obj_tentry *)ipfw_get_sopt_space(da->sd, sizeof(*tent));
2437	/* Out of memory, returning */
2438	if (tent == NULL) {
2439		da->error = ENOMEM;
2440		return (1);
2441	}
2442	tent->head.length = sizeof(ipfw_obj_tentry);
2443	tent->idx = da->uidx;
2444
2445	error = ta->dump_tentry(tc->astate, da->ti, e, tent);
2446	if (error != 0)
2447		return (error);
2448
2449	pval = get_table_value(da->ch, da->tc, tent->v.kidx);
2450	ipfw_export_table_value_v1(pval, &tent->v.value);
2451
2452	return (0);
2453}
2454
2455/*
2456 * Dumps table entry in eXtended format (v0).
2457 */
2458static int
2459dump_table_xentry(void *e, void *arg)
2460{
2461	struct dump_args *da;
2462	struct table_config *tc;
2463	struct table_algo *ta;
2464	ipfw_table_xentry *xent;
2465	ipfw_obj_tentry *tent;
2466	struct table_value *pval;
2467	int error;
2468
2469	da = (struct dump_args *)arg;
2470
2471	tc = da->tc;
2472	ta = tc->ta;
2473
2474	xent = (ipfw_table_xentry *)ipfw_get_sopt_space(da->sd, sizeof(*xent));
2475	/* Out of memory, returning */
2476	if (xent == NULL)
2477		return (1);
2478	xent->len = sizeof(ipfw_table_xentry);
2479	xent->tbl = da->uidx;
2480
2481	memset(&da->tent, 0, sizeof(da->tent));
2482	tent = &da->tent;
2483	error = ta->dump_tentry(tc->astate, da->ti, e, tent);
2484	if (error != 0)
2485		return (error);
2486
2487	/* Convert current format to previous one */
2488	xent->masklen = tent->masklen;
2489	pval = get_table_value(da->ch, da->tc, da->tent.v.kidx);
2490	xent->value = ipfw_export_table_value_legacy(pval);
2491	/* Apply some hacks */
2492	if (tc->no.subtype == IPFW_TABLE_ADDR && tent->subtype == AF_INET) {
2493		xent->k.addr6.s6_addr32[3] = tent->k.addr.s_addr;
2494		xent->flags = IPFW_TCF_INET;
2495	} else
2496		memcpy(&xent->k, &tent->k, sizeof(xent->k));
2497
2498	return (0);
2499}
2500
2501/*
2502 * Helper function to export table algo data
2503 * to tentry format before calling user function.
2504 *
2505 * Returns 0 on success.
2506 */
2507static int
2508prepare_table_tentry(void *e, void *arg)
2509{
2510	struct dump_args *da;
2511	struct table_config *tc;
2512	struct table_algo *ta;
2513	int error;
2514
2515	da = (struct dump_args *)arg;
2516
2517	tc = da->tc;
2518	ta = tc->ta;
2519
2520	error = ta->dump_tentry(tc->astate, da->ti, e, &da->tent);
2521	if (error != 0)
2522		return (error);
2523
2524	da->f(&da->tent, da->farg);
2525
2526	return (0);
2527}
2528
2529/*
2530 * Allow external consumers to read table entries in standard format.
2531 */
2532int
2533ipfw_foreach_table_tentry(struct ip_fw_chain *ch, uint16_t kidx,
2534    ta_foreach_f *f, void *arg)
2535{
2536	struct namedobj_instance *ni;
2537	struct table_config *tc;
2538	struct table_algo *ta;
2539	struct dump_args da;
2540
2541	ni = CHAIN_TO_NI(ch);
2542
2543	tc = (struct table_config *)ipfw_objhash_lookup_kidx(ni, kidx);
2544	if (tc == NULL)
2545		return (ESRCH);
2546
2547	ta = tc->ta;
2548
2549	memset(&da, 0, sizeof(da));
2550	da.ch = ch;
2551	da.ti = KIDX_TO_TI(ch, tc->no.kidx);
2552	da.tc = tc;
2553	da.f = f;
2554	da.farg = arg;
2555
2556	ta->foreach(tc->astate, da.ti, prepare_table_tentry, &da);
2557
2558	return (0);
2559}
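
/*
 * Illustrative only: a minimal in-kernel consumer could look like the
 * sketch below; print_tentry() is a hypothetical callback matching
 * ta_foreach_f, and the caller is responsible for whatever locking it
 * needs around table/ruleset changes:
 *
 *	static int
 *	print_tentry(void *e, void *arg)
 *	{
 *		ipfw_obj_tentry *tent = e;
 *
 *		printf("masklen %d value kidx %d\n",
 *		    tent->masklen, tent->v.kidx);
 *		return (0);
 *	}
 *	...
 *	error = ipfw_foreach_table_tentry(ch, kidx, print_tentry, NULL);
 */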
2560
2561/*
2562 * Table algorithms
2563 */
2564
2565/*
2566 * Finds algorithm by index, table type or supplied name.
2567 *
2568 * Returns pointer to algo or NULL.
2569 */
2570static struct table_algo *
2571find_table_algo(struct tables_config *tcfg, struct tid_info *ti, char *name)
2572{
2573	int i, l;
2574	struct table_algo *ta;
2575
2576	if (ti->type > IPFW_TABLE_MAXTYPE)
2577		return (NULL);
2578
2579	/* Search by index */
2580	if (ti->atype != 0) {
2581		if (ti->atype > tcfg->algo_count)
2582			return (NULL);
2583		return (tcfg->algo[ti->atype]);
2584	}
2585
2586	if (name == NULL) {
2587		/* Return default algorithm for given type if set */
2588		return (tcfg->def_algo[ti->type]);
2589	}
2590
2591	/* Search by name */
2592	/* TODO: better search */
2593	for (i = 1; i <= tcfg->algo_count; i++) {
2594		ta = tcfg->algo[i];
2595
2596		/*
2597		 * One can supply additional algorithm
2598		 * parameters so we compare only the first word
2599		 * of supplied name:
2600		 * 'addr:chash hsize=32'
2601		 * '^^^^^^^^^'
2602		 *
2603		 */
2604		l = strlen(ta->name);
2605		if (strncmp(name, ta->name, l) != 0)
2606			continue;
2607		if (name[l] != '\0' && name[l] != ' ')
2608			continue;
2609		/* Check if we're requesting proper table type */
2610		if (ti->type != 0 && ti->type != ta->type)
2611			return (NULL);
2612		return (ta);
2613	}
2614
2615	return (NULL);
2616}
2617
2618/*
2619 * Register new table algo @ta.
2620 * Stores algo id inside @idx.
2621 *
2622 * Returns 0 on success.
2623 */
2624int
2625ipfw_add_table_algo(struct ip_fw_chain *ch, struct table_algo *ta, size_t size,
2626    int *idx)
2627{
2628	struct tables_config *tcfg;
2629	struct table_algo *ta_new;
2630	size_t sz;
2631
2632	if (size > sizeof(struct table_algo))
2633		return (EINVAL);
2634
2635	/* Check for the required on-stack size for add/del */
2636	sz = roundup2(ta->ta_buf_size, sizeof(void *));
2637	if (sz > TA_BUF_SZ)
2638		return (EINVAL);
2639
	KASSERT(ta->type <= IPFW_TABLE_MAXTYPE,
	    ("Increase IPFW_TABLE_MAXTYPE"));
2641
2642	/* Copy algorithm data to stable storage. */
2643	ta_new = malloc(sizeof(struct table_algo), M_IPFW, M_WAITOK | M_ZERO);
2644	memcpy(ta_new, ta, size);
2645
2646	tcfg = CHAIN_TO_TCFG(ch);
2647
2648	KASSERT(tcfg->algo_count < 255, ("Increase algo array size"));
2649
2650	tcfg->algo[++tcfg->algo_count] = ta_new;
2651	ta_new->idx = tcfg->algo_count;
2652
2653	/* Set algorithm as default one for given type */
2654	if ((ta_new->flags & TA_FLAG_DEFAULT) != 0 &&
2655	    tcfg->def_algo[ta_new->type] == NULL)
2656		tcfg->def_algo[ta_new->type] = ta_new;
2657
2658	*idx = ta_new->idx;
2659
2660	return (0);
2661}
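
/*
 * Illustrative only: an algorithm module (see ip_fw_table_algo.c)
 * registers itself roughly as sketched below; the "example" names and
 * the omitted callbacks are placeholders, not real identifiers:
 *
 *	static struct table_algo addr_example = {
 *		.name		= "addr:example",
 *		.type		= IPFW_TABLE_ADDR,
 *		.ta_buf_size	= sizeof(struct ta_buf_example),
 *		.init		= ta_init_example,
 *		.destroy	= ta_destroy_example,
 *		.foreach	= ta_foreach_example,
 *		.dump_tentry	= ta_dump_tentry_example,
 *		// lookup/add/del and the other callbacks go here
 *	};
 *
 *	ipfw_add_table_algo(ch, &addr_example, sizeof(addr_example),
 *	    &addr_example.idx);
 */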
2662
2663/*
2664 * Unregisters table algo using @idx as id.
2665 * XXX: It is NOT safe to call this function in any place
2666 * other than ipfw instance destroy handler.
2667 */
2668void
2669ipfw_del_table_algo(struct ip_fw_chain *ch, int idx)
2670{
2671	struct tables_config *tcfg;
2672	struct table_algo *ta;
2673
2674	tcfg = CHAIN_TO_TCFG(ch);
2675
2676	KASSERT(idx <= tcfg->algo_count, ("algo idx %d out of range 1..%d",
2677	    idx, tcfg->algo_count));
2678
2679	ta = tcfg->algo[idx];
2680	KASSERT(ta != NULL, ("algo idx %d is NULL", idx));
2681
2682	if (tcfg->def_algo[ta->type] == ta)
2683		tcfg->def_algo[ta->type] = NULL;
2684
2685	free(ta, M_IPFW);
2686}
2687
2688/*
2689 * Lists all table algorithms currently available.
2690 * Data layout (v0)(current):
2691 * Request: [ ipfw_obj_lheader ], size = ipfw_obj_lheader.size
2692 * Reply: [ ipfw_obj_lheader ipfw_ta_info x N ]
2693 *
2694 * Returns 0 on success
2695 */
2696static int
2697list_table_algo(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
2698    struct sockopt_data *sd)
2699{
2700	struct _ipfw_obj_lheader *olh;
2701	struct tables_config *tcfg;
2702	ipfw_ta_info *i;
2703	struct table_algo *ta;
2704	uint32_t count, n, size;
2705
	olh = (struct _ipfw_obj_lheader *)ipfw_get_sopt_header(sd,
	    sizeof(*olh));
2707	if (olh == NULL)
2708		return (EINVAL);
2709	if (sd->valsize < olh->size)
2710		return (EINVAL);
2711
2712	IPFW_UH_RLOCK(ch);
2713	tcfg = CHAIN_TO_TCFG(ch);
2714	count = tcfg->algo_count;
2715	size = count * sizeof(ipfw_ta_info) + sizeof(ipfw_obj_lheader);
2716
	/* Fill in header regardless of buffer size */
2718	olh->count = count;
2719	olh->objsize = sizeof(ipfw_ta_info);
2720
2721	if (size > olh->size) {
2722		olh->size = size;
2723		IPFW_UH_RUNLOCK(ch);
2724		return (ENOMEM);
2725	}
2726	olh->size = size;
2727
2728	for (n = 1; n <= count; n++) {
2729		i = (ipfw_ta_info *)ipfw_get_sopt_space(sd, sizeof(*i));
2730		KASSERT(i != NULL, ("previously checked buffer is not enough"));
2731		ta = tcfg->algo[n];
2732		strlcpy(i->algoname, ta->name, sizeof(i->algoname));
2733		i->type = ta->type;
2734		i->refcnt = ta->refcnt;
2735	}
2736
2737	IPFW_UH_RUNLOCK(ch);
2738
2739	return (0);
2740}
2741
2742static int
2743classify_srcdst(ipfw_insn *cmd, uint16_t *puidx, uint8_t *ptype)
2744{
	int v;

	/* Basic IPv4/IPv6 or u32 lookups */
	*puidx = cmd->arg1;
	/* Assume ADDR by default */
	*ptype = IPFW_TABLE_ADDR;
2750
2751	if (F_LEN(cmd) > F_INSN_SIZE(ipfw_insn_u32)) {
2752		/*
2753		 * generic lookup. The key must be
2754		 * in 32bit big-endian format.
2755		 */
2756		v = ((ipfw_insn_u32 *)cmd)->d[1];
2757		switch (v) {
2758		case 0:
2759		case 1:
2760			/* IPv4 src/dst */
2761			break;
2762		case 2:
2763		case 3:
2764			/* src/dst port */
2765			*ptype = IPFW_TABLE_NUMBER;
2766			break;
2767		case 4:
2768			/* uid/gid */
2769			*ptype = IPFW_TABLE_NUMBER;
2770			break;
2771		case 5:
2772			/* jid */
2773			*ptype = IPFW_TABLE_NUMBER;
2774			break;
2775		case 6:
2776			/* dscp */
2777			*ptype = IPFW_TABLE_NUMBER;
2778			break;
2779		}
2780	}
2781
2782	return (0);
2783}
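
/*
 * For reference (based on ipfw(8) syntax): plain "from table(X)" /
 * "to table(X)" matches use the short instruction form, while
 * "lookup {dst-ip|dst-port|src-ip|src-port|uid|jail|dscp} X" emits the
 * longer form whose d[1] selects one of the cases in the switch above.
 */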
2784
2785static int
2786classify_via(ipfw_insn *cmd, uint16_t *puidx, uint8_t *ptype)
2787{
2788	ipfw_insn_if *cmdif;
2789
2790	/* Interface table, possibly */
2791	cmdif = (ipfw_insn_if *)cmd;
2792	if (cmdif->name[0] != '\1')
2793		return (1);
2794
2795	*ptype = IPFW_TABLE_INTERFACE;
2796	*puidx = cmdif->p.kidx;
2797
2798	return (0);
2799}
2800
2801static int
2802classify_flow(ipfw_insn *cmd, uint16_t *puidx, uint8_t *ptype)
2803{
2804
2805	*puidx = cmd->arg1;
2806	*ptype = IPFW_TABLE_FLOW;
2807
2808	return (0);
2809}
2810
2811static void
2812update_arg1(ipfw_insn *cmd, uint16_t idx)
2813{
2814
2815	cmd->arg1 = idx;
2816}
2817
2818static void
2819update_via(ipfw_insn *cmd, uint16_t idx)
2820{
2821	ipfw_insn_if *cmdif;
2822
2823	cmdif = (ipfw_insn_if *)cmd;
2824	cmdif->p.kidx = idx;
2825}
2826
2827static int
2828table_findbyname(struct ip_fw_chain *ch, struct tid_info *ti,
2829    struct named_object **pno)
2830{
2831	struct table_config *tc;
2832	int error;
2833
2834	IPFW_UH_WLOCK_ASSERT(ch);
2835
2836	error = find_table_err(CHAIN_TO_NI(ch), ti, &tc);
2837	if (error != 0)
2838		return (error);
2839
2840	*pno = &tc->no;
2841	return (0);
2842}
2843
2844/* XXX: sets-sets! */
2845static struct named_object *
2846table_findbykidx(struct ip_fw_chain *ch, uint16_t idx)
2847{
2848	struct namedobj_instance *ni;
2849	struct table_config *tc;
2850
2851	IPFW_UH_WLOCK_ASSERT(ch);
2852	ni = CHAIN_TO_NI(ch);
2853	tc = (struct table_config *)ipfw_objhash_lookup_kidx(ni, idx);
2854	KASSERT(tc != NULL, ("Table with index %d not found", idx));
2855
2856	return (&tc->no);
2857}
2858
2859static int
2860table_manage_sets(struct ip_fw_chain *ch, uint16_t set, uint8_t new_set,
2861    enum ipfw_sets_cmd cmd)
2862{
2863
2864	switch (cmd) {
2865	case SWAP_ALL:
2866	case TEST_ALL:
2867	case MOVE_ALL:
2868		/*
		 * Always return success; the real action and decision
		 * are made by table_manage_sets_all().
2871		 */
2872		return (0);
2873	case TEST_ONE:
2874	case MOVE_ONE:
2875		/*
2876		 * NOTE: we need to use ipfw_objhash_del/ipfw_objhash_add
		 * if the set number is ever used in the hash function.
		 * Currently we can just use the generic handler that
		 * replaces the set value.
2879		 */
2880		if (V_fw_tables_sets == 0)
2881			return (0);
2882		break;
2883	case COUNT_ONE:
2884		/*
		 * Return EOPNOTSUPP for COUNT_ONE when the per-set sysctl is
		 * disabled. This allows table opcodes to be skipped from the
		 * additional checks when specific rules are moved to another set.
2888		 */
2889		if (V_fw_tables_sets == 0)
2890			return (EOPNOTSUPP);
2891	}
2892	/* Use generic sets handler when per-set sysctl is enabled. */
2893	return (ipfw_obj_manage_sets(CHAIN_TO_NI(ch), IPFW_TLV_TBL_NAME,
2894	    set, new_set, cmd));
2895}
2896
2897/*
2898 * We register several opcode rewriters for lookup tables.
 * All table opcodes have the same ETLV type, but different subtypes.
 * To avoid invoking the sets handler several times for XXX_ALL commands,
 * we use a separate manage_sets handler. O_RECV has the lowest opcode
 * value, so it is called first.
2903 */
2904static int
2905table_manage_sets_all(struct ip_fw_chain *ch, uint16_t set, uint8_t new_set,
2906    enum ipfw_sets_cmd cmd)
2907{
2908
2909	switch (cmd) {
2910	case SWAP_ALL:
2911	case TEST_ALL:
2912		/*
2913		 * Return success for TEST_ALL, since nothing prevents
		 * moving rules from one set to another. All tables are
2915		 * accessible from all sets when per-set tables sysctl
2916		 * is disabled.
2917		 */
2918	case MOVE_ALL:
2919		if (V_fw_tables_sets == 0)
2920			return (0);
2921		break;
2922	default:
2923		return (table_manage_sets(ch, set, new_set, cmd));
2924	}
2925	/* Use generic sets handler when per-set sysctl is enabled. */
2926	return (ipfw_obj_manage_sets(CHAIN_TO_NI(ch), IPFW_TLV_TBL_NAME,
2927	    set, new_set, cmd));
2928}
2929
2930static struct opcode_obj_rewrite opcodes[] = {
2931	{
2932		.opcode = O_IP_SRC_LOOKUP,
2933		.etlv = IPFW_TLV_TBL_NAME,
2934		.classifier = classify_srcdst,
2935		.update = update_arg1,
2936		.find_byname = table_findbyname,
2937		.find_bykidx = table_findbykidx,
2938		.create_object = create_table_compat,
2939		.manage_sets = table_manage_sets,
2940	},
2941	{
2942		.opcode = O_IP_DST_LOOKUP,
2943		.etlv = IPFW_TLV_TBL_NAME,
2944		.classifier = classify_srcdst,
2945		.update = update_arg1,
2946		.find_byname = table_findbyname,
2947		.find_bykidx = table_findbykidx,
2948		.create_object = create_table_compat,
2949		.manage_sets = table_manage_sets,
2950	},
2951	{
2952		.opcode = O_IP_FLOW_LOOKUP,
2953		.etlv = IPFW_TLV_TBL_NAME,
2954		.classifier = classify_flow,
2955		.update = update_arg1,
2956		.find_byname = table_findbyname,
2957		.find_bykidx = table_findbykidx,
2958		.create_object = create_table_compat,
2959		.manage_sets = table_manage_sets,
2960	},
2961	{
2962		.opcode = O_XMIT,
2963		.etlv = IPFW_TLV_TBL_NAME,
2964		.classifier = classify_via,
2965		.update = update_via,
2966		.find_byname = table_findbyname,
2967		.find_bykidx = table_findbykidx,
2968		.create_object = create_table_compat,
2969		.manage_sets = table_manage_sets,
2970	},
2971	{
2972		.opcode = O_RECV,
2973		.etlv = IPFW_TLV_TBL_NAME,
2974		.classifier = classify_via,
2975		.update = update_via,
2976		.find_byname = table_findbyname,
2977		.find_bykidx = table_findbykidx,
2978		.create_object = create_table_compat,
2979		.manage_sets = table_manage_sets_all,
2980	},
2981	{
2982		.opcode = O_VIA,
2983		.etlv = IPFW_TLV_TBL_NAME,
2984		.classifier = classify_via,
2985		.update = update_via,
2986		.find_byname = table_findbyname,
2987		.find_bykidx = table_findbykidx,
2988		.create_object = create_table_compat,
2989		.manage_sets = table_manage_sets,
2990	},
2991};
2992
2993static int
2994test_sets_cb(struct namedobj_instance *ni __unused, struct named_object *no,
2995    void *arg __unused)
2996{
2997
	/* Check that there are no tables in a non-default set */
2999	if (no->set != 0)
3000		return (EBUSY);
3001	return (0);
3002}
3003
3004/*
 * Switches between "set 0" and "rule's set" table binding.
 * Checks all ruleset bindings and permits changing
 * IFF each binding has both the rule AND the table in the default set (set 0).
3008 *
3009 * Returns 0 on success.
3010 */
3011int
3012ipfw_switch_tables_namespace(struct ip_fw_chain *ch, unsigned int sets)
3013{
3014	struct opcode_obj_rewrite *rw;
3015	struct namedobj_instance *ni;
3016	struct named_object *no;
3017	struct ip_fw *rule;
3018	ipfw_insn *cmd;
3019	int cmdlen, i, l;
3020	uint16_t kidx;
3021	uint8_t subtype;
3022
3023	IPFW_UH_WLOCK(ch);
3024
3025	if (V_fw_tables_sets == sets) {
3026		IPFW_UH_WUNLOCK(ch);
3027		return (0);
3028	}
3029	ni = CHAIN_TO_NI(ch);
3030	if (sets == 0) {
3031		/*
3032		 * Prevent disabling sets support if we have some tables
		 * in non-default sets.
3034		 */
3035		if (ipfw_objhash_foreach_type(ni, test_sets_cb,
3036		    NULL, IPFW_TLV_TBL_NAME) != 0) {
3037			IPFW_UH_WUNLOCK(ch);
3038			return (EBUSY);
3039		}
3040	}
3041	/*
3042	 * Scan all rules and examine tables opcodes.
3043	 */
3044	for (i = 0; i < ch->n_rules; i++) {
3045		rule = ch->map[i];
3046
3047		l = rule->cmd_len;
3048		cmd = rule->cmd;
3049		cmdlen = 0;
3050		for ( ;	l > 0 ; l -= cmdlen, cmd += cmdlen) {
3051			cmdlen = F_LEN(cmd);
3052			/* Check only tables opcodes */
3053			for (kidx = 0, rw = opcodes;
3054			    rw < opcodes + nitems(opcodes); rw++) {
3055				if (rw->opcode != cmd->opcode)
3056					continue;
3057				if (rw->classifier(cmd, &kidx, &subtype) == 0)
3058					break;
3059			}
3060			if (kidx == 0)
3061				continue;
3062			no = ipfw_objhash_lookup_kidx(ni, kidx);
			/* Check that both table object and rule are in set 0 */
3064			if (no->set != 0 || rule->set != 0) {
3065				IPFW_UH_WUNLOCK(ch);
3066				return (EBUSY);
3067			}
3068
3069		}
3070	}
3071	V_fw_tables_sets = sets;
3072	IPFW_UH_WUNLOCK(ch);
3073	return (0);
3074}
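
/*
 * Illustrative only: this path is normally exercised through the
 * per-set tables sysctl backing V_fw_tables_sets, e.g.:
 *
 *	# sysctl net.inet.ip.fw.tables_sets=1
 */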
3075
3076/*
3077 * Checks table name for validity.
 * Enforces basic length checks; the rest
 * should be done in userland.
3080 *
3081 * Returns 0 if name is considered valid.
3082 */
3083static int
3084check_table_name(const char *name)
3085{
3086
3087	/*
3088	 * TODO: do some more complicated checks
3089	 */
3090	return (ipfw_check_object_name_generic(name));
3091}
3092
3093/*
3094 * Finds table config based on either legacy index
3095 * or name in ntlv.
3096 * Note @ti structure contains unchecked data from userland.
3097 *
 * Returns 0 on success and fills in @tc with the found config.
3099 */
3100static int
3101find_table_err(struct namedobj_instance *ni, struct tid_info *ti,
3102    struct table_config **tc)
3103{
3104	char *name, bname[16];
3105	struct named_object *no;
3106	ipfw_obj_ntlv *ntlv;
3107	uint32_t set;
3108
3109	if (ti->tlvs != NULL) {
3110		ntlv = ipfw_find_name_tlv_type(ti->tlvs, ti->tlen, ti->uidx,
3111		    IPFW_TLV_TBL_NAME);
3112		if (ntlv == NULL)
3113			return (EINVAL);
3114		name = ntlv->name;
3115
3116		/*
3117		 * Use set provided by @ti instead of @ntlv one.
3118		 * This is needed due to different sets behavior
3119		 * controlled by V_fw_tables_sets.
3120		 */
3121		set = (V_fw_tables_sets != 0) ? ti->set : 0;
3122	} else {
3123		snprintf(bname, sizeof(bname), "%d", ti->uidx);
3124		name = bname;
3125		set = 0;
3126	}
3127
3128	no = ipfw_objhash_lookup_name(ni, set, name);
3129	*tc = (struct table_config *)no;
3130
3131	return (0);
3132}
3133
3134/*
3135 * Finds table config based on either legacy index
3136 * or name in ntlv.
3137 * Note @ti structure contains unchecked data from userland.
3138 *
3139 * Returns pointer to table_config or NULL.
3140 */
3141static struct table_config *
3142find_table(struct namedobj_instance *ni, struct tid_info *ti)
3143{
3144	struct table_config *tc;
3145
3146	if (find_table_err(ni, ti, &tc) != 0)
3147		return (NULL);
3148
3149	return (tc);
3150}
3151
3152/*
3153 * Allocate new table config structure using
3154 * specified @algo and @aname.
3155 *
3156 * Returns pointer to config or NULL.
3157 */
3158static struct table_config *
3159alloc_table_config(struct ip_fw_chain *ch, struct tid_info *ti,
3160    struct table_algo *ta, char *aname, uint8_t tflags)
3161{
3162	char *name, bname[16];
3163	struct table_config *tc;
3164	int error;
3165	ipfw_obj_ntlv *ntlv;
3166	uint32_t set;
3167
3168	if (ti->tlvs != NULL) {
3169		ntlv = ipfw_find_name_tlv_type(ti->tlvs, ti->tlen, ti->uidx,
3170		    IPFW_TLV_TBL_NAME);
3171		if (ntlv == NULL)
3172			return (NULL);
3173		name = ntlv->name;
3174		set = (V_fw_tables_sets == 0) ? 0 : ntlv->set;
3175	} else {
3176		/* Compat part: convert number to string representation */
3177		snprintf(bname, sizeof(bname), "%d", ti->uidx);
3178		name = bname;
3179		set = 0;
3180	}
3181
3182	tc = malloc(sizeof(struct table_config), M_IPFW, M_WAITOK | M_ZERO);
3183	tc->no.name = tc->tablename;
3184	tc->no.subtype = ta->type;
3185	tc->no.set = set;
3186	tc->tflags = tflags;
3187	tc->ta = ta;
3188	strlcpy(tc->tablename, name, sizeof(tc->tablename));
3189	/* Set "shared" value type by default */
3190	tc->vshared = 1;
3191
3192	/* Preallocate data structures for new tables */
3193	error = ta->init(ch, &tc->astate, &tc->ti_copy, aname, tflags);
3194	if (error != 0) {
3195		free(tc, M_IPFW);
3196		return (NULL);
3197	}
3198
3199	return (tc);
3200}
3201
3202/*
3203 * Destroys table state and config.
3204 */
3205static void
3206free_table_config(struct namedobj_instance *ni, struct table_config *tc)
3207{
3208
3209	KASSERT(tc->linked == 0, ("free() on linked config"));
3210	/* UH lock MUST NOT be held */
3211
3212	/*
3213	 * We're using ta without any locking/referencing.
3214	 * TODO: fix this if we're going to use unloadable algos.
3215	 */
3216	tc->ta->destroy(tc->astate, &tc->ti_copy);
3217	free(tc, M_IPFW);
3218}
3219
3220/*
3221 * Links @tc to @chain table named instance.
3222 * Sets appropriate type/states in @chain table info.
3223 */
3224static void
3225link_table(struct ip_fw_chain *ch, struct table_config *tc)
3226{
3227	struct namedobj_instance *ni;
3228	struct table_info *ti;
3229	uint16_t kidx;
3230
3231	IPFW_UH_WLOCK_ASSERT(ch);
3232
3233	ni = CHAIN_TO_NI(ch);
3234	kidx = tc->no.kidx;
3235
3236	ipfw_objhash_add(ni, &tc->no);
3237
3238	ti = KIDX_TO_TI(ch, kidx);
3239	*ti = tc->ti_copy;
3240
3241	/* Notify algo on real @ti address */
3242	if (tc->ta->change_ti != NULL)
3243		tc->ta->change_ti(tc->astate, ti);
3244
3245	tc->linked = 1;
3246	tc->ta->refcnt++;
3247}
3248
3249/*
3250 * Unlinks @tc from @chain table named instance.
3251 * Zeroes states in @chain and stores them in @tc.
3252 */
3253static void
3254unlink_table(struct ip_fw_chain *ch, struct table_config *tc)
3255{
3256	struct namedobj_instance *ni;
3257	struct table_info *ti;
3258	uint16_t kidx;
3259
3260	IPFW_UH_WLOCK_ASSERT(ch);
3261	IPFW_WLOCK_ASSERT(ch);
3262
3263	ni = CHAIN_TO_NI(ch);
3264	kidx = tc->no.kidx;
3265
3266	/* Clear state. @ti copy is already saved inside @tc */
3267	ipfw_objhash_del(ni, &tc->no);
3268	ti = KIDX_TO_TI(ch, kidx);
3269	memset(ti, 0, sizeof(struct table_info));
3270	tc->linked = 0;
3271	tc->ta->refcnt--;
3272
3273	/* Notify algo on real @ti address */
3274	if (tc->ta->change_ti != NULL)
3275		tc->ta->change_ti(tc->astate, NULL);
3276}
3277
3278static struct ipfw_sopt_handler	scodes[] = {
3279	{ IP_FW_TABLE_XCREATE,	0,	HDIR_SET,	create_table },
3280	{ IP_FW_TABLE_XDESTROY,	0,	HDIR_SET,	flush_table_v0 },
3281	{ IP_FW_TABLE_XFLUSH,	0,	HDIR_SET,	flush_table_v0 },
3282	{ IP_FW_TABLE_XMODIFY,	0,	HDIR_BOTH,	modify_table },
3283	{ IP_FW_TABLE_XINFO,	0,	HDIR_GET,	describe_table },
3284	{ IP_FW_TABLES_XLIST,	0,	HDIR_GET,	list_tables },
3285	{ IP_FW_TABLE_XLIST,	0,	HDIR_GET,	dump_table_v0 },
3286	{ IP_FW_TABLE_XLIST,	1,	HDIR_GET,	dump_table_v1 },
3287	{ IP_FW_TABLE_XADD,	0,	HDIR_BOTH,	manage_table_ent_v0 },
3288	{ IP_FW_TABLE_XADD,	1,	HDIR_BOTH,	manage_table_ent_v1 },
3289	{ IP_FW_TABLE_XDEL,	0,	HDIR_BOTH,	manage_table_ent_v0 },
3290	{ IP_FW_TABLE_XDEL,	1,	HDIR_BOTH,	manage_table_ent_v1 },
3291	{ IP_FW_TABLE_XFIND,	0,	HDIR_GET,	find_table_entry },
3292	{ IP_FW_TABLE_XSWAP,	0,	HDIR_SET,	swap_table },
3293	{ IP_FW_TABLES_ALIST,	0,	HDIR_GET,	list_table_algo },
3294	{ IP_FW_TABLE_XGETSIZE,	0,	HDIR_GET,	get_table_size },
3295};
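
/*
 * Illustrative only: these handlers are reached through the IP_FW3
 * socket option; userland puts one of the codes above (and the matching
 * version) into the leading ip_fw3_opheader and issues getsockopt() for
 * HDIR_GET handlers or setsockopt() for HDIR_SET ones. A minimal sketch
 * for IP_FW_TABLE_XGETSIZE, assuming `s' is ipfw's raw socket:
 *
 *	char buf[sizeof(ip_fw3_opheader) + sizeof(uint32_t)];
 *	ip_fw3_opheader *op3 = (ip_fw3_opheader *)buf;
 *	socklen_t len = sizeof(buf);
 *
 *	memset(buf, 0, sizeof(buf));
 *	op3->opcode = IP_FW_TABLE_XGETSIZE;
 *	*(uint32_t *)(op3 + 1) = 123;	// legacy table number
 *	getsockopt(s, IPPROTO_IP, IP_FW3, buf, &len);
 *	// *(uint32_t *)(op3 + 1) now holds the dump size in bytes
 */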
3296
3297static int
3298destroy_table_locked(struct namedobj_instance *ni, struct named_object *no,
3299    void *arg)
3300{
3301
3302	unlink_table((struct ip_fw_chain *)arg, (struct table_config *)no);
3303	if (ipfw_objhash_free_idx(ni, no->kidx) != 0)
3304		printf("Error unlinking kidx %d from table %s\n",
3305		    no->kidx, no->name);
3306	free_table_config(ni, (struct table_config *)no);
3307	return (0);
3308}
3309
3310/*
3311 * Shuts tables module down.
3312 */
3313void
3314ipfw_destroy_tables(struct ip_fw_chain *ch, int last)
3315{
3316
3317	IPFW_DEL_SOPT_HANDLER(last, scodes);
3318	IPFW_DEL_OBJ_REWRITER(last, opcodes);
3319
3320	/* Remove all tables from working set */
3321	IPFW_UH_WLOCK(ch);
3322	IPFW_WLOCK(ch);
3323	ipfw_objhash_foreach(CHAIN_TO_NI(ch), destroy_table_locked, ch);
3324	IPFW_WUNLOCK(ch);
3325	IPFW_UH_WUNLOCK(ch);
3326
	/* Free the table_info array itself */
3328	free(ch->tablestate, M_IPFW);
3329
3330	ipfw_table_value_destroy(ch, last);
3331	ipfw_table_algo_destroy(ch);
3332
3333	ipfw_objhash_destroy(CHAIN_TO_NI(ch));
3334	free(CHAIN_TO_TCFG(ch), M_IPFW);
3335}
3336
3337/*
3338 * Starts tables module.
3339 */
3340int
3341ipfw_init_tables(struct ip_fw_chain *ch, int first)
3342{
3343	struct tables_config *tcfg;
3344
3345	/* Allocate pointers */
3346	ch->tablestate = malloc(V_fw_tables_max * sizeof(struct table_info),
3347	    M_IPFW, M_WAITOK | M_ZERO);
3348
3349	tcfg = malloc(sizeof(struct tables_config), M_IPFW, M_WAITOK | M_ZERO);
3350	tcfg->namehash = ipfw_objhash_create(V_fw_tables_max);
3351	ch->tblcfg = tcfg;
3352
3353	ipfw_table_value_init(ch, first);
3354	ipfw_table_algo_init(ch);
3355
3356	IPFW_ADD_OBJ_REWRITER(first, opcodes);
3357	IPFW_ADD_SOPT_HANDLER(first, scodes);
3358	return (0);
3359}
3360