ip_fw_table.c revision 282286
1/*-
2 * Copyright (c) 2004 Ruslan Ermilov and Vsevolod Lobko.
3 * Copyright (c) 2014 Yandex LLC
4 * Copyright (c) 2014 Alexander V. Chernikov
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 *    notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25 * SUCH DAMAGE.
26 */
27
28#include <sys/cdefs.h>
29__FBSDID("$FreeBSD: head/sys/netpfil/ipfw/ip_fw_table.c 282286 2015-04-30 21:51:12Z melifaro $");
30
31/*
32 * Lookup table support for ipfw.
33 *
34 * This file contains handlers for all generic tables' operations:
35 * add/del/flush entries, list/dump tables etc.
36 *
37 * Table data modification is protected by both the UH and runtime locks,
38 * while reading configuration/data is protected by the UH lock alone.
39 *
40 * Lookup algorithms for all table types are located in ip_fw_table_algo.c
41 */
42
43#include "opt_ipfw.h"
44
45#include <sys/param.h>
46#include <sys/systm.h>
47#include <sys/malloc.h>
48#include <sys/kernel.h>
49#include <sys/lock.h>
50#include <sys/rwlock.h>
51#include <sys/rmlock.h>
52#include <sys/socket.h>
53#include <sys/socketvar.h>
54#include <sys/queue.h>
55#include <net/if.h>	/* ip_fw.h requires IFNAMSIZ */
56
57#include <netinet/in.h>
58#include <netinet/ip_var.h>	/* struct ipfw_rule_ref */
59#include <netinet/ip_fw.h>
60
61#include <netpfil/ipfw/ip_fw_private.h>
62#include <netpfil/ipfw/ip_fw_table.h>
63
64/*
65 * Table has the following `type` concepts:
66 *
67 * `no.type` represents lookup key type (addr, ifp, uid, etc.)
68 * vmask represents bitmask of table values which are present at the moment.
69 * Special IPFW_VTYPE_LEGACY ( (uint32_t)-1 ) represents the old
70 * single-value-for-all approach.
71 */
72struct table_config {
73	struct named_object	no;
74	uint8_t		tflags;		/* type flags */
75	uint8_t		locked;		/* 1 if locked from changes */
76	uint8_t		linked;		/* 1 if already linked */
77	uint8_t		ochanged;	/* used by set swapping */
78	uint8_t		vshared;	/* 1 if using shared value array */
79	uint8_t		spare[3];
80	uint32_t	count;		/* Number of records */
81	uint32_t	limit;		/* Max number of records */
82	uint32_t	vmask;		/* bitmask with supported values */
83	uint32_t	ocount;		/* used by set swapping */
84	uint64_t	gencnt;		/* generation count */
85	char		tablename[64];	/* table name */
86	struct table_algo	*ta;	/* Callbacks for given algo */
87	void		*astate;	/* algorithm state */
88	struct table_info	ti_copy;	/* data to put to table_info */
89	struct namedobj_instance	*vi;
90};
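
/*
 * Illustrative note (an assumption, not taken from this file): with the
 * IPFW_VTYPE_* bit definitions from <netinet/ip_fw.h>, a table whose
 * entries carry both a skipto target and a fib number would typically
 * have its value mask composed as
 *
 *	tc->vmask = IPFW_VTYPE_SKIPTO | IPFW_VTYPE_FIB;
 *
 * while tables created for legacy clients use IPFW_VTYPE_LEGACY (all bits
 * set), i.e. a single value interpreted for every purpose.
 */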
91
92static int find_table_err(struct namedobj_instance *ni, struct tid_info *ti,
93    struct table_config **tc);
94static struct table_config *find_table(struct namedobj_instance *ni,
95    struct tid_info *ti);
96static struct table_config *alloc_table_config(struct ip_fw_chain *ch,
97    struct tid_info *ti, struct table_algo *ta, char *adata, uint8_t tflags);
98static void free_table_config(struct namedobj_instance *ni,
99    struct table_config *tc);
100static int create_table_internal(struct ip_fw_chain *ch, struct tid_info *ti,
101    char *aname, ipfw_xtable_info *i, uint16_t *pkidx, int ref);
102static void link_table(struct ip_fw_chain *ch, struct table_config *tc);
103static void unlink_table(struct ip_fw_chain *ch, struct table_config *tc);
104static int find_ref_table(struct ip_fw_chain *ch, struct tid_info *ti,
105    struct tentry_info *tei, uint32_t count, int op, struct table_config **ptc);
106#define	OP_ADD	1
107#define	OP_DEL	0
108static int export_tables(struct ip_fw_chain *ch, ipfw_obj_lheader *olh,
109    struct sockopt_data *sd);
110static void export_table_info(struct ip_fw_chain *ch, struct table_config *tc,
111    ipfw_xtable_info *i);
112static int dump_table_tentry(void *e, void *arg);
113static int dump_table_xentry(void *e, void *arg);
114
115static int swap_tables(struct ip_fw_chain *ch, struct tid_info *a,
116    struct tid_info *b);
117
118static int check_table_space(struct ip_fw_chain *ch, struct tableop_state *ts,
119    struct table_config *tc, struct table_info *ti, uint32_t count);
120static int destroy_table(struct ip_fw_chain *ch, struct tid_info *ti);
121
122static struct table_algo *find_table_algo(struct tables_config *tableconf,
123    struct tid_info *ti, char *name);
124
125static void objheader_to_ti(struct _ipfw_obj_header *oh, struct tid_info *ti);
126static void ntlv_to_ti(struct _ipfw_obj_ntlv *ntlv, struct tid_info *ti);
127
128#define	CHAIN_TO_NI(chain)	(CHAIN_TO_TCFG(chain)->namehash)
129#define	KIDX_TO_TI(ch, k)	(&(((struct table_info *)(ch)->tablestate)[k]))
130
131#define	TA_BUF_SZ	128	/* On-stack buffer for add/delete state */
132
133void
134rollback_toperation_state(struct ip_fw_chain *ch, void *object)
135{
136	struct tables_config *tcfg;
137	struct op_state *os;
138
139	tcfg = CHAIN_TO_TCFG(ch);
140	TAILQ_FOREACH(os, &tcfg->state_list, next)
141		os->func(object, os);
142}
143
144void
145add_toperation_state(struct ip_fw_chain *ch, struct tableop_state *ts)
146{
147	struct tables_config *tcfg;
148
149	tcfg = CHAIN_TO_TCFG(ch);
150	TAILQ_INSERT_HEAD(&tcfg->state_list, &ts->opstate, next);
151}
152
153void
154del_toperation_state(struct ip_fw_chain *ch, struct tableop_state *ts)
155{
156	struct tables_config *tcfg;
157
158	tcfg = CHAIN_TO_TCFG(ch);
159	TAILQ_REMOVE(&tcfg->state_list, &ts->opstate, next);
160}
161
162void
163tc_ref(struct table_config *tc)
164{
165
166	tc->no.refcnt++;
167}
168
169void
170tc_unref(struct table_config *tc)
171{
172
173	tc->no.refcnt--;
174}
175
176static struct table_value *
177get_table_value(struct ip_fw_chain *ch, struct table_config *tc, uint32_t kidx)
178{
179	struct table_value *pval;
180
181	pval = (struct table_value *)ch->valuestate;
182
183	return (&pval[kidx]);
184}
185
186
187/*
188 * Checks if we're able to insert/update entry @tei into table
189 * w.r.t @tc limits.
190 * May alter @tei to indicate insertion error / insert
191 * options.
192 *
193 * Returns 0 if operation can be performed.
194 */
195static int
196check_table_limit(struct table_config *tc, struct tentry_info *tei)
197{
198
199	if (tc->limit == 0 || tc->count < tc->limit)
200		return (0);
201
202	if ((tei->flags & TEI_FLAGS_UPDATE) == 0) {
203		/* Notify userland on error cause */
204		tei->flags |= TEI_FLAGS_LIMIT;
205		return (EFBIG);
206	}
207
208	/*
209	 * We have UPDATE flag set.
210	 * Permit updating record (if found),
211	 * but restrict adding new one since we've
212	 * already hit the limit.
213	 */
214	tei->flags |= TEI_FLAGS_DONTADD;
215
216	return (0);
217}
218
219/*
220 * Convert algorithm callback return code into
221 * one of pre-defined states known by userland.
222 */
223static void
224store_tei_result(struct tentry_info *tei, int op, int error, uint32_t num)
225{
226	int flag;
227
228	flag = 0;
229
230	switch (error) {
231	case 0:
232		if (op == OP_ADD && num != 0)
233			flag = TEI_FLAGS_ADDED;
234		if (op == OP_DEL)
235			flag = TEI_FLAGS_DELETED;
236		break;
237	case ENOENT:
238		flag = TEI_FLAGS_NOTFOUND;
239		break;
240	case EEXIST:
241		flag = TEI_FLAGS_EXISTS;
242		break;
243	default:
244		flag = TEI_FLAGS_ERROR;
245	}
246
247	tei->flags |= flag;
248}
249
250/*
251 * Creates and references table with default parameters.
252 * Saves table config, algo and allocated kidx into @ptc, @pta and
253 * @pkidx if non-zero.
254 * Used for table auto-creation to support old binaries.
255 *
256 * Returns 0 on success.
257 */
258static int
259create_table_compat(struct ip_fw_chain *ch, struct tid_info *ti,
260    uint16_t *pkidx)
261{
262	ipfw_xtable_info xi;
263	int error;
264
265	memset(&xi, 0, sizeof(xi));
266	/* Set default value mask for legacy clients */
267	xi.vmask = IPFW_VTYPE_LEGACY;
268
269	error = create_table_internal(ch, ti, NULL, &xi, pkidx, 1);
270	if (error != 0)
271		return (error);
272
273	return (0);
274}
275
276/*
277 * Finds and references an existing table, optionally
278 * creating a new one.
279 *
280 * Saves found table config into @ptc.
281 * Note function may drop/acquire UH_WLOCK.
282 * Returns 0 if table was found/created and referenced
283 * or a non-zero error code otherwise.
284 */
285static int
286find_ref_table(struct ip_fw_chain *ch, struct tid_info *ti,
287    struct tentry_info *tei, uint32_t count, int op,
288    struct table_config **ptc)
289{
290	struct namedobj_instance *ni;
291	struct table_config *tc;
292	uint16_t kidx;
293	int error;
294
295	IPFW_UH_WLOCK_ASSERT(ch);
296
297	ni = CHAIN_TO_NI(ch);
298	tc = NULL;
299	if ((tc = find_table(ni, ti)) != NULL) {
300		/* check table type */
301		if (tc->no.subtype != ti->type)
302			return (EINVAL);
303
304		if (tc->locked != 0)
305			return (EACCES);
306
307		/* Try to exit early on limit hit */
308		if (op == OP_ADD && count == 1 &&
309		    check_table_limit(tc, tei) != 0)
310			return (EFBIG);
311
312		/* Reference and return */
313		tc->no.refcnt++;
314		*ptc = tc;
315		return (0);
316	}
317
318	if (op == OP_DEL)
319		return (ESRCH);
320
321	/* Compatibility mode: create new table for old clients */
322	if ((tei->flags & TEI_FLAGS_COMPAT) == 0)
323		return (ESRCH);
324
325	IPFW_UH_WUNLOCK(ch);
326	error = create_table_compat(ch, ti, &kidx);
327	IPFW_UH_WLOCK(ch);
328
329	if (error != 0)
330		return (error);
331
332	tc = (struct table_config *)ipfw_objhash_lookup_kidx(ni, kidx);
333	KASSERT(tc != NULL, ("create_table_compat returned bad idx %d", kidx));
334
335	/* OK, now we've got referenced table. */
336	*ptc = tc;
337	return (0);
338}
339
340/*
341 * Rolls back entries already added to @tc using state array @ta_buf_m.
342 * Assumes the following layout:
343 * 1) ADD state (ta_buf_m[0] ... ta_buf_m[added - 1]) for handling update cases
344 * 2) DEL state (ta_buf_m[count] ... ta_buf_m[count + added - 1])
345 *   for storing deleted state
346 */
347static void
348rollback_added_entries(struct ip_fw_chain *ch, struct table_config *tc,
349    struct table_info *tinfo, struct tentry_info *tei, caddr_t ta_buf_m,
350    uint32_t count, uint32_t added)
351{
352	struct table_algo *ta;
353	struct tentry_info *ptei;
354	caddr_t v, vv;
355	size_t ta_buf_sz;
356	int error, i;
357	uint32_t num;
358
359	IPFW_UH_WLOCK_ASSERT(ch);
360
361	ta = tc->ta;
362	ta_buf_sz = ta->ta_buf_size;
363	v = ta_buf_m;
364	vv = v + count * ta_buf_sz;
365	for (i = 0; i < added; i++, v += ta_buf_sz, vv += ta_buf_sz) {
366		ptei = &tei[i];
367		if ((ptei->flags & TEI_FLAGS_UPDATED) != 0) {
368
369			/*
370			 * We have old value stored by previous
371			 * call in @ptei->value. Do add once again
372			 * to restore it.
373			 */
374			error = ta->add(tc->astate, tinfo, ptei, v, &num);
375			KASSERT(error == 0, ("rollback UPDATE fail"));
376			KASSERT(num == 0, ("rollback UPDATE fail2"));
377			continue;
378		}
379
380		error = ta->prepare_del(ch, ptei, vv);
381		KASSERT(error == 0, ("pre-rollback INSERT failed"));
382		error = ta->del(tc->astate, tinfo, ptei, vv, &num);
383		KASSERT(error == 0, ("rollback INSERT failed"));
384		tc->count -= num;
385	}
386}
387
388/*
389 * Prepares add/del state for all @count entries in @tei.
390 * Uses either stack buffer (@ta_buf) or allocates a new one.
391 * Stores pointer to allocated buffer back to @ta_buf.
392 *
393 * Returns 0 on success.
394 */
395static int
396prepare_batch_buffer(struct ip_fw_chain *ch, struct table_algo *ta,
397    struct tentry_info *tei, uint32_t count, int op, caddr_t *ta_buf)
398{
399	caddr_t ta_buf_m, v;
400	size_t ta_buf_sz, sz;
401	struct tentry_info *ptei;
402	int error, i;
403
404	error = 0;
405	ta_buf_sz = ta->ta_buf_size;
406	if (count == 1) {
407		/* Single add/delete, use on-stack buffer */
408		memset(*ta_buf, 0, TA_BUF_SZ);
409		ta_buf_m = *ta_buf;
410	} else {
411
412		/*
413		 * Multiple adds/deletes, allocate larger buffer
414		 *
415		 * Note we need 2xcount buffer for add case:
416		 * we have to hold both ADD state
417		 * and DELETE state (this may be needed
418		 * if we need to roll back all changes)
419		 */
420		sz = count * ta_buf_sz;
421		ta_buf_m = malloc((op == OP_ADD) ? sz * 2 : sz, M_TEMP,
422		    M_WAITOK | M_ZERO);
423	}
424
425	v = ta_buf_m;
426	for (i = 0; i < count; i++, v += ta_buf_sz) {
427		ptei = &tei[i];
428		error = (op == OP_ADD) ?
429		    ta->prepare_add(ch, ptei, v) : ta->prepare_del(ch, ptei, v);
430
431		/*
432		 * Some syntax error (incorrect mask, or address, or
433		 * anything). Return error regardless of atomicity
434		 * settings.
435		 */
436		if (error != 0)
437			break;
438	}
439
440	*ta_buf = ta_buf_m;
441	return (error);
442}
443
444/*
445 * Flushes allocated state for each @count entries in @tei.
446 * Frees @ta_buf_m if differs from stack buffer @ta_buf.
447 */
448static void
449flush_batch_buffer(struct ip_fw_chain *ch, struct table_algo *ta,
450    struct tentry_info *tei, uint32_t count, int rollback,
451    caddr_t ta_buf_m, caddr_t ta_buf)
452{
453	caddr_t v;
454	struct tentry_info *ptei;
455	size_t ta_buf_sz;
456	int i;
457
458	ta_buf_sz = ta->ta_buf_size;
459
460	/* Run cleaning callback anyway */
461	v = ta_buf_m;
462	for (i = 0; i < count; i++, v += ta_buf_sz) {
463		ptei = &tei[i];
464		ta->flush_entry(ch, ptei, v);
465		if (ptei->ptv != NULL) {
466			free(ptei->ptv, M_IPFW);
467			ptei->ptv = NULL;
468		}
469	}
470
471	/* Clean up "deleted" state in case of rollback */
472	if (rollback != 0) {
473		v = ta_buf_m + count * ta_buf_sz;
474		for (i = 0; i < count; i++, v += ta_buf_sz)
475			ta->flush_entry(ch, &tei[i], v);
476	}
477
478	if (ta_buf_m != ta_buf)
479		free(ta_buf_m, M_TEMP);
480}
481
482
483static void
484rollback_add_entry(void *object, struct op_state *_state)
485{
486	struct ip_fw_chain *ch;
487	struct tableop_state *ts;
488
489	ts = (struct tableop_state *)_state;
490
491	if (ts->tc != object && ts->ch != object)
492		return;
493
494	ch = ts->ch;
495
496	IPFW_UH_WLOCK_ASSERT(ch);
497
498	/* Call specific unlockers */
499	rollback_table_values(ts);
500
501	/* Indicate we've been called */
502	ts->modified = 1;
503}
504
505/*
506 * Adds/updates one or more entries in table @ti.
507 *
508 * Function may drop/reacquire UH wlock multiple times due to
509 * items alloc, algorithm callbacks (check_space), value linkage
510 * (new values, value storage realloc), etc.
511 * Other processes like other adds (which may involve storage resize),
512 * table swaps (which change table data and may change algo type),
513 * table modify (which may change value mask) may be executed
514 * simultaneously so we need to deal with it.
515 *
516 * The following approach was implemented:
517 * we have per-chain linked list, protected with UH lock.
518 * add_table_entry prepares special on-stack structure which is passed
519 * to its descendants. Users add this structure to this list before unlock.
520 * After performing needed operations and acquiring UH lock back, each user
521 * checks if structure has changed. If true, it rolls local state back and
522 * returns without error to the caller.
523 * add_table_entry() on its own checks if structure has changed and restarts
524 * its operation from the beginning (goto restart).
525 *
526 * Functions which are modifying fields of interest (currently
527 *   resize_shared_value_storage() and swap_tables() )
528 * traverse the given list while holding UH lock immediately before
529 * performing their operations, calling the function provided by the list entry
530 * ( currently rollback_add_entry  ) which performs rollback for all necessary
531 * state and sets appropriate values in structure indicating rollback
532 * has happened.
533 *
534 * Algo interaction:
535 * Function references @ti first to ensure table won't
536 * disappear or change its type.
537 * After that, prepare_add callback is called for each @tei entry.
538 * Next, we try to add each entry under UH+WLOCK
539 * using add() callback.
540 * Finally, we free all state by calling flush_entry callback
541 * for each @tei.
542 *
543 * Returns 0 on success.
544 */
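/*
 * Writer-side sketch of the protocol described above (illustrative only;
 * swap_tables() below is a real user of it):
 *
 *	IPFW_UH_WLOCK(ch);
 *	...
 *	rollback_toperation_state(ch, tc);   (notify pending adders)
 *	... swap/resize table data ...
 *	IPFW_UH_WUNLOCK(ch);
 *
 * Each pending add_table_entry() then observes ts.modified != 0 after
 * re-acquiring the UH lock and restarts from scratch.
 */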
545int
546add_table_entry(struct ip_fw_chain *ch, struct tid_info *ti,
547    struct tentry_info *tei, uint8_t flags, uint32_t count)
548{
549	struct table_config *tc;
550	struct table_algo *ta;
551	uint16_t kidx;
552	int error, first_error, i, rollback;
553	uint32_t num, numadd;
554	struct tentry_info *ptei;
555	struct tableop_state ts;
556	char ta_buf[TA_BUF_SZ];
557	caddr_t ta_buf_m, v;
558
559	memset(&ts, 0, sizeof(ts));
560	ta = NULL;
561	IPFW_UH_WLOCK(ch);
562
563	/*
564	 * Find and reference existing table.
565	 */
566restart:
567	if (ts.modified != 0) {
568		IPFW_UH_WUNLOCK(ch);
569		flush_batch_buffer(ch, ta, tei, count, rollback,
570		    ta_buf_m, ta_buf);
571		memset(&ts, 0, sizeof(ts));
572		ta = NULL;
573		IPFW_UH_WLOCK(ch);
574	}
575
576	error = find_ref_table(ch, ti, tei, count, OP_ADD, &tc);
577	if (error != 0) {
578		IPFW_UH_WUNLOCK(ch);
579		return (error);
580	}
581	ta = tc->ta;
582
583	/* Fill in tablestate */
584	ts.ch = ch;
585	ts.opstate.func = rollback_add_entry;
586	ts.tc = tc;
587	ts.vshared = tc->vshared;
588	ts.vmask = tc->vmask;
589	ts.ta = ta;
590	ts.tei = tei;
591	ts.count = count;
592	rollback = 0;
593	add_toperation_state(ch, &ts);
594	IPFW_UH_WUNLOCK(ch);
595
596	/* Allocate memory and prepare record(s); use stack buffer by default */
597	ta_buf_m = ta_buf;
598	error = prepare_batch_buffer(ch, ta, tei, count, OP_ADD, &ta_buf_m);
599
600	IPFW_UH_WLOCK(ch);
601	del_toperation_state(ch, &ts);
602	/* Drop reference we've used in first search */
603	tc->no.refcnt--;
604	/* Check prepare_batch_buffer() error */
605	if (error != 0)
606		goto cleanup;
607
608	/*
609	 * Check if table swap has happened
610	 * (so table algo might have changed).
611	 * Restart operation to achieve consistent behavior.
612	 */
613	if (ts.modified != 0)
614		goto restart;
615
616	/*
617	 * Link all values to the shared/per-table value array.
618	 *
619	 * May release/reacquire UH_WLOCK.
620	 */
621	error = ipfw_link_table_values(ch, &ts);
622	if (error != 0)
623		goto cleanup;
624	if (ts.modified != 0)
625		goto restart;
626
627	/*
628	 * Ensure we are able to add all entries without additional
629	 * memory allocations. May release/reacquire UH_WLOCK.
630	 */
631	kidx = tc->no.kidx;
632	error = check_table_space(ch, &ts, tc, KIDX_TO_TI(ch, kidx), count);
633	if (error != 0)
634		goto cleanup;
635	if (ts.modified != 0)
636		goto restart;
637
638	/* We've got valid table in @tc. Let's try to add data */
639	kidx = tc->no.kidx;
640	ta = tc->ta;
641	numadd = 0;
642	first_error = 0;
643
644	IPFW_WLOCK(ch);
645
646	v = ta_buf_m;
647	for (i = 0; i < count; i++, v += ta->ta_buf_size) {
648		ptei = &tei[i];
649		num = 0;
650		/* check limit before adding */
651		if ((error = check_table_limit(tc, ptei)) == 0) {
652			error = ta->add(tc->astate, KIDX_TO_TI(ch, kidx),
653			    ptei, v, &num);
654			/* Set status flag to inform userland */
655			store_tei_result(ptei, OP_ADD, error, num);
656		}
657		if (error == 0) {
658			/* Update number of records to ease limit checking */
659			tc->count += num;
660			numadd += num;
661			continue;
662		}
663
664		if (first_error == 0)
665			first_error = error;
666
667		/*
668		 * Some error has happened. Check our atomicity
669		 * settings: continue if atomicity is not required,
670		 * rollback changes otherwise.
671		 */
672		if ((flags & IPFW_CTF_ATOMIC) == 0)
673			continue;
674
675		rollback_added_entries(ch, tc, KIDX_TO_TI(ch, kidx),
676		    tei, ta_buf_m, count, i);
677
678		rollback = 1;
679		break;
680	}
681
682	IPFW_WUNLOCK(ch);
683
684	ipfw_garbage_table_values(ch, tc, tei, count, rollback);
685
686	/* Permit post-add algorithm grow/rehash. */
687	if (numadd != 0)
688		check_table_space(ch, NULL, tc, KIDX_TO_TI(ch, kidx), 0);
689
690	/* Return first error to user, if any */
691	error = first_error;
692
693cleanup:
694	IPFW_UH_WUNLOCK(ch);
695
696	flush_batch_buffer(ch, ta, tei, count, rollback, ta_buf_m, ta_buf);
697
698	return (error);
699}
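
/*
 * Minimal in-kernel usage sketch (assumption: modeled on what
 * manage_table_ent_v0() below does for a single IPv4 address; the table
 * must already exist unless tei.flags includes TEI_FLAGS_COMPAT):
 *
 *	struct tentry_info tei;
 *	struct tid_info ti;
 *	struct table_value v;
 *
 *	memset(&tei, 0, sizeof(tei));
 *	memset(&ti, 0, sizeof(ti));
 *	tei.paddr = &addr;		(in_addr_t key)
 *	tei.subtype = AF_INET;
 *	tei.masklen = 32;
 *	tei.pvalue = &v;		(pre-filled struct table_value)
 *	ti.uidx = table_number;
 *	ti.type = IPFW_TABLE_ADDR;
 *	error = add_table_entry(ch, &ti, &tei, 0, 1);
 */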
700
701/*
702 * Deletes one or more entries in table @ti.
703 *
704 * Returns 0 on success.
705 */
706int
707del_table_entry(struct ip_fw_chain *ch, struct tid_info *ti,
708    struct tentry_info *tei, uint8_t flags, uint32_t count)
709{
710	struct table_config *tc;
711	struct table_algo *ta;
712	struct tentry_info *ptei;
713	uint16_t kidx;
714	int error, first_error, i;
715	uint32_t num, numdel;
716	char ta_buf[TA_BUF_SZ];
717	caddr_t ta_buf_m, v;
718
719	/*
720	 * Find and reference existing table.
721	 */
722	IPFW_UH_WLOCK(ch);
723	error = find_ref_table(ch, ti, tei, count, OP_DEL, &tc);
724	if (error != 0) {
725		IPFW_UH_WUNLOCK(ch);
726		return (error);
727	}
728	ta = tc->ta;
729	IPFW_UH_WUNLOCK(ch);
730
731	/* Allocate memory and prepare record(s) */
732	/* Pass stack buffer by default */
733	ta_buf_m = ta_buf;
734	error = prepare_batch_buffer(ch, ta, tei, count, OP_DEL, &ta_buf_m);
735	if (error != 0)
736		goto cleanup;
737
738	IPFW_UH_WLOCK(ch);
739
740	/* Drop reference we've used in first search */
741	tc->no.refcnt--;
742
743	/*
744	 * Check if table algo is still the same.
745	 * (changed ta may be the result of table swap).
746	 */
747	if (ta != tc->ta) {
748		IPFW_UH_WUNLOCK(ch);
749		error = EINVAL;
750		goto cleanup;
751	}
752
753	kidx = tc->no.kidx;
754	numdel = 0;
755	first_error = 0;
756
757	IPFW_WLOCK(ch);
758	v = ta_buf_m;
759	for (i = 0; i < count; i++, v += ta->ta_buf_size) {
760		ptei = &tei[i];
761		num = 0;
762		error = ta->del(tc->astate, KIDX_TO_TI(ch, kidx), ptei, v,
763		    &num);
764		/* Save state for userland */
765		store_tei_result(ptei, OP_DEL, error, num);
766		if (error != 0 && first_error == 0)
767			first_error = error;
768		tc->count -= num;
769		numdel += num;
770	}
771	IPFW_WUNLOCK(ch);
772
773	/* Unlink unused values */
774	ipfw_garbage_table_values(ch, tc, tei, count, 0);
775
776	if (numdel != 0) {
777		/* Run post-del hook to permit shrinking */
778		check_table_space(ch, NULL, tc, KIDX_TO_TI(ch, kidx), 0);
779	}
780
781	IPFW_UH_WUNLOCK(ch);
782
783	/* Return first error to user, if any */
784	error = first_error;
785
786cleanup:
787	flush_batch_buffer(ch, ta, tei, count, 0, ta_buf_m, ta_buf);
788
789	return (error);
790}
791
792/*
793 * Ensure that table @tc has enough space to add @count entries without
794 * need for reallocation.
795 *
796 * Callbacks order:
797 * 0) need_modify() (UH_WLOCK) - checks if @count items can be added w/o resize.
798 *
799 * 1) alloc_modify (no locks, M_WAITOK) - alloc new state based on @pflags.
800 * 2) prepare_modifyt (UH_WLOCK) - copy old data into new storage
801 * 3) modify (UH_WLOCK + WLOCK) - switch pointers
802 * 4) flush_modify (UH_WLOCK) - free state, if needed
803 *
804 * Returns 0 on success.
805 */
806static int
807check_table_space(struct ip_fw_chain *ch, struct tableop_state *ts,
808    struct table_config *tc, struct table_info *ti, uint32_t count)
809{
810	struct table_algo *ta;
811	uint64_t pflags;
812	char ta_buf[TA_BUF_SZ];
813	int error;
814
815	IPFW_UH_WLOCK_ASSERT(ch);
816
817	error = 0;
818	ta = tc->ta;
819	if (ta->need_modify == NULL)
820		return (0);
821
822	/* Acquire reference not to lose @tc between locks/unlocks */
823	tc->no.refcnt++;
824
825	/*
826	 * TODO: think about avoiding race between large add/large delete
827	 * operations on an algorithm which implements shrinking along with
828	 * growing.
829	 */
830	while (true) {
831		pflags = 0;
832		if (ta->need_modify(tc->astate, ti, count, &pflags) == 0) {
833			error = 0;
834			break;
835		}
836
837		/* We have to shrink/grow table */
838		if (ts != NULL)
839			add_toperation_state(ch, ts);
840		IPFW_UH_WUNLOCK(ch);
841
842		memset(&ta_buf, 0, sizeof(ta_buf));
843		error = ta->prepare_mod(ta_buf, &pflags);
844
845		IPFW_UH_WLOCK(ch);
846		if (ts != NULL)
847			del_toperation_state(ch, ts);
848
849		if (error != 0)
850			break;
851
852		if (ts != NULL && ts->modified != 0) {
853
854			/*
855			 * Swap operation has happened
856			 * so we're currently operating on other
857			 * table data. Stop doing this.
858			 */
859			ta->flush_mod(ta_buf);
860			break;
861		}
862
863		/* Check if we still need to alter table */
864		ti = KIDX_TO_TI(ch, tc->no.kidx);
865		if (ta->need_modify(tc->astate, ti, count, &pflags) == 0) {
866			/*
867			 * Other thread has already performed resize.
868			 * Flush our state and return with the UH lock
869			 * still held: every exit from this loop must
870			 * leave the lock held, as the caller expects.
871			 */
872			ta->flush_mod(ta_buf);
873			break;
874		}
875
876		error = ta->fill_mod(tc->astate, ti, ta_buf, &pflags);
877		if (error == 0) {
878			/* Do actual modification */
879			IPFW_WLOCK(ch);
880			ta->modify(tc->astate, ti, ta_buf, pflags);
881			IPFW_WUNLOCK(ch);
882		}
883
884		/* Anyway, flush data and retry */
885		ta->flush_mod(ta_buf);
886	}
887
888	tc->no.refcnt--;
889	return (error);
890}
891
892/*
893 * Adds or deletes record in table.
894 * Data layout (v0):
895 * Request: [ ip_fw3_opheader ipfw_table_xentry ]
896 *
897 * Returns 0 on success
898 */
899static int
900manage_table_ent_v0(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
901    struct sockopt_data *sd)
902{
903	ipfw_table_xentry *xent;
904	struct tentry_info tei;
905	struct tid_info ti;
906	struct table_value v;
907	int error, hdrlen, read;
908
909	hdrlen = offsetof(ipfw_table_xentry, k);
910
911	/* Check minimum header size */
912	if (sd->valsize < (sizeof(*op3) + hdrlen))
913		return (EINVAL);
914
915	read = sizeof(ip_fw3_opheader);
916
917	/* Check if xentry len field is valid */
918	xent = (ipfw_table_xentry *)(op3 + 1);
919	if (xent->len < hdrlen || xent->len + read > sd->valsize)
920		return (EINVAL);
921
922	memset(&tei, 0, sizeof(tei));
923	tei.paddr = &xent->k;
924	tei.masklen = xent->masklen;
925	ipfw_import_table_value_legacy(xent->value, &v);
926	tei.pvalue = &v;
927	/* Compatibility with old requests */
928	tei.flags = TEI_FLAGS_COMPAT;
929	if (xent->type == IPFW_TABLE_ADDR) {
930		if (xent->len - hdrlen == sizeof(in_addr_t))
931			tei.subtype = AF_INET;
932		else
933			tei.subtype = AF_INET6;
934	}
935
936	memset(&ti, 0, sizeof(ti));
937	ti.uidx = xent->tbl;
938	ti.type = xent->type;
939
940	error = (op3->opcode == IP_FW_TABLE_XADD) ?
941	    add_table_entry(ch, &ti, &tei, 0, 1) :
942	    del_table_entry(ch, &ti, &tei, 0, 1);
943
944	return (error);
945}
946
947/*
948 * Adds or deletes record in table.
949 * Data layout (v1)(current):
950 * Request: [ ipfw_obj_header
951 *   ipfw_obj_ctlv(IPFW_TLV_TBLENT_LIST) [ ipfw_obj_tentry x N ]
952 * ]
953 *
954 * Returns 0 on success
955 */
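/*
 * Size sketch for a single-entry request, derived from the length checks
 * in the function below (illustrative, not normative):
 *
 *	sd->valsize == sizeof(ipfw_obj_header) + sizeof(ipfw_obj_ctlv) +
 *	    1 * sizeof(ipfw_obj_tentry);
 *	ctlv->head.length == sd->valsize - sizeof(ipfw_obj_header);
 */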
956static int
957manage_table_ent_v1(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
958    struct sockopt_data *sd)
959{
960	ipfw_obj_tentry *tent, *ptent;
961	ipfw_obj_ctlv *ctlv;
962	ipfw_obj_header *oh;
963	struct tentry_info *ptei, tei, *tei_buf;
964	struct tid_info ti;
965	int error, i, kidx, read;
966
967	/* Check minimum header size */
968	if (sd->valsize < (sizeof(*oh) + sizeof(*ctlv)))
969		return (EINVAL);
970
971	/* Check if passed data is too long */
972	if (sd->valsize != sd->kavail)
973		return (EINVAL);
974
975	oh = (ipfw_obj_header *)sd->kbuf;
976
977	/* Basic length checks for TLVs */
978	if (oh->ntlv.head.length != sizeof(oh->ntlv))
979		return (EINVAL);
980
981	read = sizeof(*oh);
982
983	ctlv = (ipfw_obj_ctlv *)(oh + 1);
984	if (ctlv->head.length + read != sd->valsize)
985		return (EINVAL);
986
987	read += sizeof(*ctlv);
988	tent = (ipfw_obj_tentry *)(ctlv + 1);
989	if (ctlv->count * sizeof(*tent) + read != sd->valsize)
990		return (EINVAL);
991
992	if (ctlv->count == 0)
993		return (0);
994
995	/*
996	 * Mark entire buffer as "read".
997	 * This instructs the sopt API to write it back
998	 * after the function returns.
999	 */
1000	ipfw_get_sopt_header(sd, sd->valsize);
1001
1002	/* Perform basic checks for each entry */
1003	ptent = tent;
1004	kidx = tent->idx;
1005	for (i = 0; i < ctlv->count; i++, ptent++) {
1006		if (ptent->head.length != sizeof(*ptent))
1007			return (EINVAL);
1008		if (ptent->idx != kidx)
1009			return (ENOTSUP);
1010	}
1011
1012	/* Convert data into kernel request objects */
1013	objheader_to_ti(oh, &ti);
1014	ti.type = oh->ntlv.type;
1015	ti.uidx = kidx;
1016
1017	/* Use on-stack buffer for single add/del */
1018	if (ctlv->count == 1) {
1019		memset(&tei, 0, sizeof(tei));
1020		tei_buf = &tei;
1021	} else
1022		tei_buf = malloc(ctlv->count * sizeof(tei), M_TEMP,
1023		    M_WAITOK | M_ZERO);
1024
1025	ptei = tei_buf;
1026	ptent = tent;
1027	for (i = 0; i < ctlv->count; i++, ptent++, ptei++) {
1028		ptei->paddr = &ptent->k;
1029		ptei->subtype = ptent->subtype;
1030		ptei->masklen = ptent->masklen;
1031		if (ptent->head.flags & IPFW_TF_UPDATE)
1032			ptei->flags |= TEI_FLAGS_UPDATE;
1033
1034		ipfw_import_table_value_v1(&ptent->v.value);
1035		ptei->pvalue = (struct table_value *)&ptent->v.value;
1036	}
1037
1038	error = (oh->opheader.opcode == IP_FW_TABLE_XADD) ?
1039	    add_table_entry(ch, &ti, tei_buf, ctlv->flags, ctlv->count) :
1040	    del_table_entry(ch, &ti, tei_buf, ctlv->flags, ctlv->count);
1041
1042	/* Translate result back to userland */
1043	ptei = tei_buf;
1044	ptent = tent;
1045	for (i = 0; i < ctlv->count; i++, ptent++, ptei++) {
1046		if (ptei->flags & TEI_FLAGS_ADDED)
1047			ptent->result = IPFW_TR_ADDED;
1048		else if (ptei->flags & TEI_FLAGS_DELETED)
1049			ptent->result = IPFW_TR_DELETED;
1050		else if (ptei->flags & TEI_FLAGS_UPDATED)
1051			ptent->result = IPFW_TR_UPDATED;
1052		else if (ptei->flags & TEI_FLAGS_LIMIT)
1053			ptent->result = IPFW_TR_LIMIT;
1054		else if (ptei->flags & TEI_FLAGS_ERROR)
1055			ptent->result = IPFW_TR_ERROR;
1056		else if (ptei->flags & TEI_FLAGS_NOTFOUND)
1057			ptent->result = IPFW_TR_NOTFOUND;
1058		else if (ptei->flags & TEI_FLAGS_EXISTS)
1059			ptent->result = IPFW_TR_EXISTS;
1060		ipfw_export_table_value_v1(ptei->pvalue, &ptent->v.value);
1061	}
1062
1063	if (tei_buf != &tei)
1064		free(tei_buf, M_TEMP);
1065
1066	return (error);
1067}
1068
1069/*
1070 * Looks up an entry in given table.
1071 * Data layout (v0)(current):
1072 * Request: [ ipfw_obj_header ipfw_obj_tentry ]
1073 * Reply: [ ipfw_obj_header ipfw_obj_tentry ]
1074 *
1075 * Returns 0 on success
1076 */
1077static int
1078find_table_entry(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
1079    struct sockopt_data *sd)
1080{
1081	ipfw_obj_tentry *tent;
1082	ipfw_obj_header *oh;
1083	struct tid_info ti;
1084	struct table_config *tc;
1085	struct table_algo *ta;
1086	struct table_info *kti;
1087	struct namedobj_instance *ni;
1088	int error;
1089	size_t sz;
1090
1091	/* Check minimum header size */
1092	sz = sizeof(*oh) + sizeof(*tent);
1093	if (sd->valsize != sz)
1094		return (EINVAL);
1095
1096	oh = (struct _ipfw_obj_header *)ipfw_get_sopt_header(sd, sz);
1097	tent = (ipfw_obj_tentry *)(oh + 1);
1098
1099	/* Basic length checks for TLVs */
1100	if (oh->ntlv.head.length != sizeof(oh->ntlv))
1101		return (EINVAL);
1102
1103	objheader_to_ti(oh, &ti);
1104	ti.type = oh->ntlv.type;
1105	ti.uidx = tent->idx;
1106
1107	IPFW_UH_RLOCK(ch);
1108	ni = CHAIN_TO_NI(ch);
1109
1110	/*
1111	 * Find existing table and check its type.
1112	 */
1113	ta = NULL;
1114	if ((tc = find_table(ni, &ti)) == NULL) {
1115		IPFW_UH_RUNLOCK(ch);
1116		return (ESRCH);
1117	}
1118
1119	/* check table type */
1120	if (tc->no.subtype != ti.type) {
1121		IPFW_UH_RUNLOCK(ch);
1122		return (EINVAL);
1123	}
1124
1125	kti = KIDX_TO_TI(ch, tc->no.kidx);
1126	ta = tc->ta;
1127
1128	if (ta->find_tentry == NULL)
1129		error = ENOTSUP;
1130	else
1131		error = ta->find_tentry(tc->astate, kti, tent);
1132
1133	IPFW_UH_RUNLOCK(ch);
1134
1135	return (error);
1136}
1137
1138/*
1139 * Flushes all entries or destroys given table.
1140 * Data layout (v0)(current):
1141 * Request: [ ipfw_obj_header ]
1142 *
1143 * Returns 0 on success
1144 */
1145static int
1146flush_table_v0(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
1147    struct sockopt_data *sd)
1148{
1149	int error;
1150	struct _ipfw_obj_header *oh;
1151	struct tid_info ti;
1152
1153	if (sd->valsize != sizeof(*oh))
1154		return (EINVAL);
1155
1156	oh = (struct _ipfw_obj_header *)op3;
1157	objheader_to_ti(oh, &ti);
1158
1159	if (op3->opcode == IP_FW_TABLE_XDESTROY)
1160		error = destroy_table(ch, &ti);
1161	else if (op3->opcode == IP_FW_TABLE_XFLUSH)
1162		error = flush_table(ch, &ti);
1163	else
1164		return (ENOTSUP);
1165
1166	return (error);
1167}
1168
1169static void
1170restart_flush(void *object, struct op_state *_state)
1171{
1172	struct tableop_state *ts;
1173
1174	ts = (struct tableop_state *)_state;
1175
1176	if (ts->tc != object)
1177		return;
1178
1179	/* Indicate we've called */
1180	/* Indicate we've been called */
1181}
1182
1183/*
1184 * Flushes given table.
1185 *
1186 * Function creates a new table instance with the same
1187 * parameters, swaps it with the old one and
1188 * flushes state without holding the runtime WLOCK.
1189 *
1190 * Returns 0 on success.
1191 */
1192int
1193flush_table(struct ip_fw_chain *ch, struct tid_info *ti)
1194{
1195	struct namedobj_instance *ni;
1196	struct table_config *tc;
1197	struct table_algo *ta;
1198	struct table_info ti_old, ti_new, *tablestate;
1199	void *astate_old, *astate_new;
1200	char algostate[64], *pstate;
1201	struct tableop_state ts;
1202	int error, need_gc;
1203	uint16_t kidx;
1204	uint8_t tflags;
1205
1206	/*
1207	 * Stage 1: save table algorithm.
1208	 * Reference found table to ensure it won't disappear.
1209	 */
1210	IPFW_UH_WLOCK(ch);
1211	ni = CHAIN_TO_NI(ch);
1212	if ((tc = find_table(ni, ti)) == NULL) {
1213		IPFW_UH_WUNLOCK(ch);
1214		return (ESRCH);
1215	}
1216	need_gc = 0;
1217	astate_new = NULL;
1218	memset(&ti_new, 0, sizeof(ti_new));
1219restart:
1220	/* Set up swap handler */
1221	memset(&ts, 0, sizeof(ts));
1222	ts.opstate.func = restart_flush;
1223	ts.tc = tc;
1224
1225	ta = tc->ta;
1226	/* Do not flush readonly tables */
1227	if ((ta->flags & TA_FLAG_READONLY) != 0) {
1228		IPFW_UH_WUNLOCK(ch);
1229		return (EACCES);
1230	}
1231	/* Save startup algo parameters */
1232	if (ta->print_config != NULL) {
1233		ta->print_config(tc->astate, KIDX_TO_TI(ch, tc->no.kidx),
1234		    algostate, sizeof(algostate));
1235		pstate = algostate;
1236	} else
1237		pstate = NULL;
1238	tflags = tc->tflags;
1239	tc->no.refcnt++;
1240	add_toperation_state(ch, &ts);
1241	IPFW_UH_WUNLOCK(ch);
1242
1243	/*
1244	 * Stage 1.5: if this is not the first attempt, destroy previous state
1245	 */
1246	if (need_gc != 0) {
1247		ta->destroy(astate_new, &ti_new);
1248		need_gc = 0;
1249	}
1250
1251	/*
1252	 * Stage 2: allocate new table instance using same algo.
1253	 */
1254	memset(&ti_new, 0, sizeof(struct table_info));
1255	error = ta->init(ch, &astate_new, &ti_new, pstate, tflags);
1256
1257	/*
1258	 * Stage 3: swap old state pointers with newly-allocated ones.
1259	 * Decrease refcount.
1260	 */
1261	IPFW_UH_WLOCK(ch);
1262	tc->no.refcnt--;
1263	del_toperation_state(ch, &ts);
1264
1265	if (error != 0) {
1266		IPFW_UH_WUNLOCK(ch);
1267		return (error);
1268	}
1269
1270	/*
1271	 * Restart operation if table swap has happened:
1272	 * even if algo may be the same, algo init parameters
1273	 * may change. Restart operation instead of doing
1274	 * complex checks.
1275	 */
1276	if (ts.modified != 0) {
1277		/* Delay destroying data since we're holding UH lock */
1278		need_gc = 1;
1279		goto restart;
1280	}
1281
1282	ni = CHAIN_TO_NI(ch);
1283	kidx = tc->no.kidx;
1284	tablestate = (struct table_info *)ch->tablestate;
1285
1286	IPFW_WLOCK(ch);
1287	ti_old = tablestate[kidx];
1288	tablestate[kidx] = ti_new;
1289	IPFW_WUNLOCK(ch);
1290
1291	astate_old = tc->astate;
1292	tc->astate = astate_new;
1293	tc->ti_copy = ti_new;
1294	tc->count = 0;
1295
1296	/* Notify algo on real @ti address */
1297	if (ta->change_ti != NULL)
1298		ta->change_ti(tc->astate, &tablestate[kidx]);
1299
1300	/*
1301	 * Stage 4: unref values.
1302	 */
1303	ipfw_unref_table_values(ch, tc, ta, astate_old, &ti_old);
1304	IPFW_UH_WUNLOCK(ch);
1305
1306	/*
1307	 * Stage 5: perform real flush/destroy.
1308	 */
1309	ta->destroy(astate_old, &ti_old);
1310
1311	return (0);
1312}
1313
1314/*
1315 * Swaps two tables.
1316 * Data layout (v0)(current):
1317 * Request: [ ipfw_obj_header ipfw_obj_ntlv ]
1318 *
1319 * Returns 0 on success
1320 */
1321static int
1322swap_table(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
1323    struct sockopt_data *sd)
1324{
1325	int error;
1326	struct _ipfw_obj_header *oh;
1327	struct tid_info ti_a, ti_b;
1328
1329	if (sd->valsize != sizeof(*oh) + sizeof(ipfw_obj_ntlv))
1330		return (EINVAL);
1331
1332	oh = (struct _ipfw_obj_header *)op3;
1333	ntlv_to_ti(&oh->ntlv, &ti_a);
1334	ntlv_to_ti((ipfw_obj_ntlv *)(oh + 1), &ti_b);
1335
1336	error = swap_tables(ch, &ti_a, &ti_b);
1337
1338	return (error);
1339}
1340
1341/*
1342 * Swaps two tables of the same type/valtype.
1343 *
1344 * Checks if tables are compatible and limits
1345 * permit the swap, then actually performs the swap.
1346 *
1347 * Each table consists of 2 different parts:
1348 * config:
1349 *   @tc (with name, set, kidx) and rule bindings, which is "stable".
1350 *   number of items
1351 *   table algo
1352 * runtime:
1353 *   runtime data @ti (ch->tablestate)
1354 *   runtime cache in @tc
1355 *   algo-specific data (@tc->astate)
1356 *
1357 * So we switch:
1358 *  all runtime data
1359 *   number of items
1360 *   table algo
1361 *
1362 * After that we call @ti change handler for each table.
1363 *
1364 * Note that referencing @tc won't protect tc->ta from change.
1365 * XXX: Do we need to restrict swap between locked tables?
1366 * XXX: Do we need to exchange ftype?
1367 *
1368 * Returns 0 on success.
1369 */
1370static int
1371swap_tables(struct ip_fw_chain *ch, struct tid_info *a,
1372    struct tid_info *b)
1373{
1374	struct namedobj_instance *ni;
1375	struct table_config *tc_a, *tc_b;
1376	struct table_algo *ta;
1377	struct table_info ti, *tablestate;
1378	void *astate;
1379	uint32_t count;
1380
1381	/*
1382	 * Stage 1: find both tables and ensure they are of
1383	 * the same type.
1384	 */
1385	IPFW_UH_WLOCK(ch);
1386	ni = CHAIN_TO_NI(ch);
1387	if ((tc_a = find_table(ni, a)) == NULL) {
1388		IPFW_UH_WUNLOCK(ch);
1389		return (ESRCH);
1390	}
1391	if ((tc_b = find_table(ni, b)) == NULL) {
1392		IPFW_UH_WUNLOCK(ch);
1393		return (ESRCH);
1394	}
1395
1396	/* Swapping a table with itself is a no-op */
1397	if (tc_a == tc_b) {
1398		IPFW_UH_WUNLOCK(ch);
1399		return (0);
1400	}
1401
1402	/* Check type and value are the same */
1403	if (tc_a->no.subtype!=tc_b->no.subtype || tc_a->tflags!=tc_b->tflags) {
1404		IPFW_UH_WUNLOCK(ch);
1405		return (EINVAL);
1406	}
1407
1408	/* Check limits before swap */
1409	if ((tc_a->limit != 0 && tc_b->count > tc_a->limit) ||
1410	    (tc_b->limit != 0 && tc_a->count > tc_b->limit)) {
1411		IPFW_UH_WUNLOCK(ch);
1412		return (EFBIG);
1413	}
1414
1415	/* Check if one of the tables is readonly */
1416	if (((tc_a->ta->flags | tc_b->ta->flags) & TA_FLAG_READONLY) != 0) {
1417		IPFW_UH_WUNLOCK(ch);
1418		return (EACCES);
1419	}
1420
1421	/* Notify we're going to swap */
1422	rollback_toperation_state(ch, tc_a);
1423	rollback_toperation_state(ch, tc_b);
1424
1425	/* Everything is fine, prepare to swap */
1426	tablestate = (struct table_info *)ch->tablestate;
1427	ti = tablestate[tc_a->no.kidx];
1428	ta = tc_a->ta;
1429	astate = tc_a->astate;
1430	count = tc_a->count;
1431
1432	IPFW_WLOCK(ch);
1433	/* a <- b */
1434	tablestate[tc_a->no.kidx] = tablestate[tc_b->no.kidx];
1435	tc_a->ta = tc_b->ta;
1436	tc_a->astate = tc_b->astate;
1437	tc_a->count = tc_b->count;
1438	/* b <- a */
1439	tablestate[tc_b->no.kidx] = ti;
1440	tc_b->ta = ta;
1441	tc_b->astate = astate;
1442	tc_b->count = count;
1443	IPFW_WUNLOCK(ch);
1444
1445	/* Ensure tc.ti copies are in sync */
1446	tc_a->ti_copy = tablestate[tc_a->no.kidx];
1447	tc_b->ti_copy = tablestate[tc_b->no.kidx];
1448
1449	/* Notify both tables on @ti change */
1450	if (tc_a->ta->change_ti != NULL)
1451		tc_a->ta->change_ti(tc_a->astate, &tablestate[tc_a->no.kidx]);
1452	if (tc_b->ta->change_ti != NULL)
1453		tc_b->ta->change_ti(tc_b->astate, &tablestate[tc_b->no.kidx]);
1454
1455	IPFW_UH_WUNLOCK(ch);
1456
1457	return (0);
1458}
1459
1460/*
1461 * Destroys table specified by @ti.
1462 * Data layout (v0)(current):
1463 * Request: [ ip_fw3_opheader ]
1464 *
1465 * Returns 0 on success
1466 */
1467static int
1468destroy_table(struct ip_fw_chain *ch, struct tid_info *ti)
1469{
1470	struct namedobj_instance *ni;
1471	struct table_config *tc;
1472
1473	IPFW_UH_WLOCK(ch);
1474
1475	ni = CHAIN_TO_NI(ch);
1476	if ((tc = find_table(ni, ti)) == NULL) {
1477		IPFW_UH_WUNLOCK(ch);
1478		return (ESRCH);
1479	}
1480
1481	/* Do not permit destroying referenced tables */
1482	if (tc->no.refcnt > 0) {
1483		IPFW_UH_WUNLOCK(ch);
1484		return (EBUSY);
1485	}
1486
1487	IPFW_WLOCK(ch);
1488	unlink_table(ch, tc);
1489	IPFW_WUNLOCK(ch);
1490
1491	/* Free obj index */
1492	if (ipfw_objhash_free_idx(ni, tc->no.kidx) != 0)
1493		printf("Error unlinking kidx %d from table %s\n",
1494		    tc->no.kidx, tc->tablename);
1495
1496	/* Unref values used in tables while holding UH lock */
1497	ipfw_unref_table_values(ch, tc, tc->ta, tc->astate, &tc->ti_copy);
1498	IPFW_UH_WUNLOCK(ch);
1499
1500	free_table_config(ni, tc);
1501
1502	return (0);
1503}
1504
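/*
 * Rounds @v up to the smallest power of 2 that is >= @v,
 * e.g. roundup2p(100) == 128 and roundup2p(128) == 128.
 */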
1505static uint32_t
1506roundup2p(uint32_t v)
1507{
1508
1509	v--;
1510	v |= v >> 1;
1511	v |= v >> 2;
1512	v |= v >> 4;
1513	v |= v >> 8;
1514	v |= v >> 16;
1515	v++;
1516
1517	return (v);
1518}
1519
1520/*
1521 * Grow tables index.
1522 *
1523 * Returns 0 on success.
1524 */
1525int
1526ipfw_resize_tables(struct ip_fw_chain *ch, unsigned int ntables)
1527{
1528	unsigned int ntables_old, tbl;
1529	struct namedobj_instance *ni;
1530	void *new_idx, *old_tablestate, *tablestate;
1531	struct table_info *ti;
1532	struct table_config *tc;
1533	int i, new_blocks;
1534
1535	/* Check new value for validity */
1536	if (ntables == 0)
1537		return (EINVAL);
1538	if (ntables > IPFW_TABLES_MAX)
1539		ntables = IPFW_TABLES_MAX;
1540	/* Align to the next power of 2 */
1541	ntables = (unsigned int)roundup2p(ntables);
1542
1543	/* Allocate new pointers */
1544	tablestate = malloc(ntables * sizeof(struct table_info),
1545	    M_IPFW, M_WAITOK | M_ZERO);
1546
1547	ipfw_objhash_bitmap_alloc(ntables, (void *)&new_idx, &new_blocks);
1548
1549	IPFW_UH_WLOCK(ch);
1550
1551	tbl = (ntables >= V_fw_tables_max) ? V_fw_tables_max : ntables;
1552	ni = CHAIN_TO_NI(ch);
1553
1554	/* Temporarily restrict decreasing max_tables */
1555	if (ntables < V_fw_tables_max) {
1556
1557		/*
1558		 * FIXME: Check if we really can shrink
1559		 */
1560		IPFW_UH_WUNLOCK(ch);
1561		return (EINVAL);
1562	}
1563
1564	/* Copy table info/indices */
1565	memcpy(tablestate, ch->tablestate, sizeof(struct table_info) * tbl);
1566	ipfw_objhash_bitmap_merge(ni, &new_idx, &new_blocks);
1567
1568	IPFW_WLOCK(ch);
1569
1570	/* Change pointers */
1571	old_tablestate = ch->tablestate;
1572	ch->tablestate = tablestate;
1573	ipfw_objhash_bitmap_swap(ni, &new_idx, &new_blocks);
1574
1575	ntables_old = V_fw_tables_max;
1576	V_fw_tables_max = ntables;
1577
1578	IPFW_WUNLOCK(ch);
1579
1580	/* Notify all consumers that their @ti pointer has changed */
1581	ti = (struct table_info *)ch->tablestate;
1582	for (i = 0; i < tbl; i++, ti++) {
1583		if (ti->lookup == NULL)
1584			continue;
1585		tc = (struct table_config *)ipfw_objhash_lookup_kidx(ni, i);
1586		if (tc == NULL || tc->ta->change_ti == NULL)
1587			continue;
1588
1589		tc->ta->change_ti(tc->astate, ti);
1590	}
1591
1592	IPFW_UH_WUNLOCK(ch);
1593
1594	/* Free old pointers */
1595	free(old_tablestate, M_IPFW);
1596	ipfw_objhash_bitmap_free(new_idx, new_blocks);
1597
1598	return (0);
1599}
1600
1601/*
1602 * Switches between "set 0" and "rule's set" table binding.
1603 * Checks all ruleset bindings and permits the change
1604 * IFF each binding has both rule AND table in the default set (set 0).
1605 *
1606 * Returns 0 on success.
1607 */
1608int
1609ipfw_switch_tables_namespace(struct ip_fw_chain *ch, unsigned int sets)
1610{
1611	struct namedobj_instance *ni;
1612	struct named_object *no;
1613	struct ip_fw *rule;
1614	ipfw_insn *cmd;
1615	int cmdlen, i, l;
1616	uint16_t kidx;
1617
1618	IPFW_UH_WLOCK(ch);
1619
1620	if (V_fw_tables_sets == sets) {
1621		IPFW_UH_WUNLOCK(ch);
1622		return (0);
1623	}
1624
1625	ni = CHAIN_TO_NI(ch);
1626
1627	/*
1628	 * Scan all rules and examine tables opcodes.
1629	 */
1630	for (i = 0; i < ch->n_rules; i++) {
1631		rule = ch->map[i];
1632
1633		l = rule->cmd_len;
1634		cmd = rule->cmd;
1635		cmdlen = 0;
1636		for ( ;	l > 0 ; l -= cmdlen, cmd += cmdlen) {
1637			cmdlen = F_LEN(cmd);
1638
1639			if (classify_opcode_kidx(cmd, &kidx) != 0)
1640				continue;
1641
1642			no = ipfw_objhash_lookup_kidx(ni, kidx);
1643
1644			/* Check if both table object and rule have set 0 */
1645			if (no->set != 0 || rule->set != 0) {
1646				IPFW_UH_WUNLOCK(ch);
1647				return (EBUSY);
1648			}
1649
1650		}
1651	}
1652	V_fw_tables_sets = sets;
1653
1654	IPFW_UH_WUNLOCK(ch);
1655
1656	return (0);
1657}
1658
1659/*
1660 * Lookup an IP @addr in table @tbl.
1661 * Stores found value in @val.
1662 *
1663 * Returns 1 if @addr was found.
1664 */
1665int
1666ipfw_lookup_table(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr,
1667    uint32_t *val)
1668{
1669	struct table_info *ti;
1670
1671	ti = KIDX_TO_TI(ch, tbl);
1672
1673	return (ti->lookup(ti, &addr, sizeof(in_addr_t), val));
1674}
1675
1676/*
1677 * Lookup an arbitrary key @paddr of length @plen in table @tbl.
1678 * Stores found value in @val.
1679 *
1680 * Returns 1 if key was found.
1681 */
1682int
1683ipfw_lookup_table_extended(struct ip_fw_chain *ch, uint16_t tbl, uint16_t plen,
1684    void *paddr, uint32_t *val)
1685{
1686	struct table_info *ti;
1687
1688	ti = KIDX_TO_TI(ch, tbl);
1689
1690	return (ti->lookup(ti, paddr, plen, val));
1691}
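
/*
 * Usage sketch (assumption: mirrors how the packet-inspection code in
 * ip_fw2.c consumes these helpers for table-lookup opcodes; variable
 * names here are illustrative):
 *
 *	uint32_t tval;
 *
 *	if (ipfw_lookup_table(chain, kidx, ip->ip_src.s_addr, &tval) != 0)
 *		... matched: tval holds the entry value ...
 *
 * IPv6 or interface keys go through ipfw_lookup_table_extended() with the
 * appropriate key length.
 */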
1692
1693/*
1694 * Info/List/dump support for tables.
1695 *
1696 */
1697
1698/*
1699 * High-level 'get' cmds sysctl handlers
1700 */
1701
1702/*
1703 * Lists all tables currently available in kernel.
1704 * Data layout (v0)(current):
1705 * Request: [ ipfw_obj_lheader ], size = ipfw_obj_lheader.size
1706 * Reply: [ ipfw_obj_lheader ipfw_xtable_info x N ]
1707 *
1708 * Returns 0 on success
1709 */
1710static int
1711list_tables(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
1712    struct sockopt_data *sd)
1713{
1714	struct _ipfw_obj_lheader *olh;
1715	int error;
1716
1717	olh = (struct _ipfw_obj_lheader *)ipfw_get_sopt_header(sd,sizeof(*olh));
1718	if (olh == NULL)
1719		return (EINVAL);
1720	if (sd->valsize < olh->size)
1721		return (EINVAL);
1722
1723	IPFW_UH_RLOCK(ch);
1724	error = export_tables(ch, olh, sd);
1725	IPFW_UH_RUNLOCK(ch);
1726
1727	return (error);
1728}
1729
1730/*
1731 * Store table info to buffer provided by @sd.
1732 * Data layout (v0)(current):
1733 * Request: [ ipfw_obj_header ipfw_xtable_info(empty)]
1734 * Reply: [ ipfw_obj_header ipfw_xtable_info ]
1735 *
1736 * Returns 0 on success.
1737 */
1738static int
1739describe_table(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
1740    struct sockopt_data *sd)
1741{
1742	struct _ipfw_obj_header *oh;
1743	struct table_config *tc;
1744	struct tid_info ti;
1745	size_t sz;
1746
1747	sz = sizeof(*oh) + sizeof(ipfw_xtable_info);
1748	oh = (struct _ipfw_obj_header *)ipfw_get_sopt_header(sd, sz);
1749	if (oh == NULL)
1750		return (EINVAL);
1751
1752	objheader_to_ti(oh, &ti);
1753
1754	IPFW_UH_RLOCK(ch);
1755	if ((tc = find_table(CHAIN_TO_NI(ch), &ti)) == NULL) {
1756		IPFW_UH_RUNLOCK(ch);
1757		return (ESRCH);
1758	}
1759
1760	export_table_info(ch, tc, (ipfw_xtable_info *)(oh + 1));
1761	IPFW_UH_RUNLOCK(ch);
1762
1763	return (0);
1764}
1765
1766/*
1767 * Modifies existing table.
1768 * Data layout (v0)(current):
1769 * Request: [ ipfw_obj_header ipfw_xtable_info ]
1770 *
1771 * Returns 0 on success
1772 */
1773static int
1774modify_table(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
1775    struct sockopt_data *sd)
1776{
1777	struct _ipfw_obj_header *oh;
1778	ipfw_xtable_info *i;
1779	char *tname;
1780	struct tid_info ti;
1781	struct namedobj_instance *ni;
1782	struct table_config *tc;
1783
1784	if (sd->valsize != sizeof(*oh) + sizeof(ipfw_xtable_info))
1785		return (EINVAL);
1786
1787	oh = (struct _ipfw_obj_header *)sd->kbuf;
1788	i = (ipfw_xtable_info *)(oh + 1);
1789
1790	/*
1791	 * Verify user-supplied strings.
1792	 * Check for null-terminated/zero-length strings.
1793	 */
1794	tname = oh->ntlv.name;
1795	if (ipfw_check_table_name(tname) != 0)
1796		return (EINVAL);
1797
1798	objheader_to_ti(oh, &ti);
1799	ti.type = i->type;
1800
1801	IPFW_UH_WLOCK(ch);
1802	ni = CHAIN_TO_NI(ch);
1803	if ((tc = find_table(ni, &ti)) == NULL) {
1804		IPFW_UH_WUNLOCK(ch);
1805		return (ESRCH);
1806	}
1807
1808	/* Do not support any modifications for readonly tables */
1809	if ((tc->ta->flags & TA_FLAG_READONLY) != 0) {
1810		IPFW_UH_WUNLOCK(ch);
1811		return (EACCES);
1812	}
1813
1814	if ((i->mflags & IPFW_TMFLAGS_LIMIT) != 0)
1815		tc->limit = i->limit;
1816	if ((i->mflags & IPFW_TMFLAGS_LOCK) != 0)
1817		tc->locked = ((i->flags & IPFW_TGFLAGS_LOCKED) != 0);
1818	IPFW_UH_WUNLOCK(ch);
1819
1820	return (0);
1821}
1822
1823/*
1824 * Creates new table.
1825 * Data layout (v0)(current):
1826 * Request: [ ipfw_obj_header ipfw_xtable_info ]
1827 *
1828 * Returns 0 on success
1829 */
1830static int
1831create_table(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
1832    struct sockopt_data *sd)
1833{
1834	struct _ipfw_obj_header *oh;
1835	ipfw_xtable_info *i;
1836	char *tname, *aname;
1837	struct tid_info ti;
1838	struct namedobj_instance *ni;
1839
1840	if (sd->valsize != sizeof(*oh) + sizeof(ipfw_xtable_info))
1841		return (EINVAL);
1842
1843	oh = (struct _ipfw_obj_header *)sd->kbuf;
1844	i = (ipfw_xtable_info *)(oh + 1);
1845
1846	/*
1847	 * Verify user-supplied strings.
1848	 * Check for null-terminated/zero-length strings.
1849	 */
1850	tname = oh->ntlv.name;
1851	aname = i->algoname;
1852	if (ipfw_check_table_name(tname) != 0 ||
1853	    strnlen(aname, sizeof(i->algoname)) == sizeof(i->algoname))
1854		return (EINVAL);
1855
1856	if (aname[0] == '\0') {
1857		/* Use default algorithm */
1858		aname = NULL;
1859	}
1860
1861	objheader_to_ti(oh, &ti);
1862	ti.type = i->type;
1863
1864	ni = CHAIN_TO_NI(ch);
1865
1866	IPFW_UH_RLOCK(ch);
1867	if (find_table(ni, &ti) != NULL) {
1868		IPFW_UH_RUNLOCK(ch);
1869		return (EEXIST);
1870	}
1871	IPFW_UH_RUNLOCK(ch);
1872
1873	return (create_table_internal(ch, &ti, aname, i, NULL, 0));
1874}
1875
1876/*
1877 * Creates new table based on @ti and @aname.
1878 *
1879 * Relies on table name checking inside find_name_tlv()
1880 * Assume @aname to be checked and valid.
1881 * Stores allocated table kidx inside @pkidx (if non-NULL).
1882 * Reference created table if @compat is non-zero.
1883 *
1884 * Returns 0 on success.
1885 */
1886static int
1887create_table_internal(struct ip_fw_chain *ch, struct tid_info *ti,
1888    char *aname, ipfw_xtable_info *i, uint16_t *pkidx, int compat)
1889{
1890	struct namedobj_instance *ni;
1891	struct table_config *tc, *tc_new, *tmp;
1892	struct table_algo *ta;
1893	uint16_t kidx;
1894
1895	ni = CHAIN_TO_NI(ch);
1896
1897	ta = find_table_algo(CHAIN_TO_TCFG(ch), ti, aname);
1898	if (ta == NULL)
1899		return (ENOTSUP);
1900
1901	tc = alloc_table_config(ch, ti, ta, aname, i->tflags);
1902	if (tc == NULL)
1903		return (ENOMEM);
1904
1905	tc->vmask = i->vmask;
1906	tc->limit = i->limit;
1907	if (ta->flags & TA_FLAG_READONLY)
1908		tc->locked = 1;
1909	else
1910		tc->locked = (i->flags & IPFW_TGFLAGS_LOCKED) != 0;
1911
1912	IPFW_UH_WLOCK(ch);
1913
1914	/* Check if table has been already created */
1915	tc_new = find_table(ni, ti);
1916	if (tc_new != NULL) {
1917
1918		/*
1919		 * Compat: do not fail if we're
1920		 * requesting to create existing table
1921		 * which has the same type
1922		 */
1923		if (compat == 0 || tc_new->no.subtype != tc->no.subtype) {
1924			IPFW_UH_WUNLOCK(ch);
1925			free_table_config(ni, tc);
1926			return (EEXIST);
1927		}
1928
1929		/* Exchange tc and tc_new for proper refcounting & freeing */
1930		tmp = tc;
1931		tc = tc_new;
1932		tc_new = tmp;
1933	} else {
1934		/* New table */
1935		if (ipfw_objhash_alloc_idx(ni, &kidx) != 0) {
1936			IPFW_UH_WUNLOCK(ch);
1937			printf("Unable to allocate table index."
1938			    " Consider increasing net.inet.ip.fw.tables_max\n");
1939			free_table_config(ni, tc);
1940			return (EBUSY);
1941		}
1942		tc->no.kidx = kidx;
1943		tc->no.etlv = IPFW_TLV_TBL_NAME;
1944
1945		IPFW_WLOCK(ch);
1946		link_table(ch, tc);
1947		IPFW_WUNLOCK(ch);
1948	}
1949
1950	if (compat != 0)
1951		tc->no.refcnt++;
1952	if (pkidx != NULL)
1953		*pkidx = tc->no.kidx;
1954
1955	IPFW_UH_WUNLOCK(ch);
1956
1957	if (tc_new != NULL)
1958		free_table_config(ni, tc_new);
1959
1960	return (0);
1961}
1962
1963static void
1964ntlv_to_ti(ipfw_obj_ntlv *ntlv, struct tid_info *ti)
1965{
1966
1967	memset(ti, 0, sizeof(struct tid_info));
1968	ti->set = ntlv->set;
1969	ti->uidx = ntlv->idx;
1970	ti->tlvs = ntlv;
1971	ti->tlen = ntlv->head.length;
1972}
1973
1974static void
1975objheader_to_ti(struct _ipfw_obj_header *oh, struct tid_info *ti)
1976{
1977
1978	ntlv_to_ti(&oh->ntlv, ti);
1979}
1980
1981struct namedobj_instance *
1982ipfw_get_table_objhash(struct ip_fw_chain *ch)
1983{
1984
1985	return (CHAIN_TO_NI(ch));
1986}
1987
1988/*
1989 * Exports basic table info as name TLV.
1990 * Used inside dump_static_rules() to provide info
1991 * about all tables referenced by current ruleset.
1992 *
1993 * Returns 0 on success.
1994 */
1995int
1996ipfw_export_table_ntlv(struct ip_fw_chain *ch, uint16_t kidx,
1997    struct sockopt_data *sd)
1998{
1999	struct namedobj_instance *ni;
2000	struct named_object *no;
2001	ipfw_obj_ntlv *ntlv;
2002
2003	ni = CHAIN_TO_NI(ch);
2004
2005	no = ipfw_objhash_lookup_kidx(ni, kidx);
2006	KASSERT(no != NULL, ("invalid table kidx passed"));
2007
2008	ntlv = (ipfw_obj_ntlv *)ipfw_get_sopt_space(sd, sizeof(*ntlv));
2009	if (ntlv == NULL)
2010		return (ENOMEM);
2011
2012	ntlv->head.type = IPFW_TLV_TBL_NAME;
2013	ntlv->head.length = sizeof(*ntlv);
2014	ntlv->idx = no->kidx;
2015	strlcpy(ntlv->name, no->name, sizeof(ntlv->name));
2016
2017	return (0);
2018}
2019
2020struct dump_args {
2021	struct ip_fw_chain *ch;
2022	struct table_info *ti;
2023	struct table_config *tc;
2024	struct sockopt_data *sd;
2025	uint32_t cnt;
2026	uint16_t uidx;
2027	int error;
2028	uint32_t size;
2029	ipfw_table_entry *ent;
2030	ta_foreach_f *f;
2031	void *farg;
2032	ipfw_obj_tentry tent;
2033};
2034
2035static int
2036count_ext_entries(void *e, void *arg)
2037{
2038	struct dump_args *da;
2039
2040	da = (struct dump_args *)arg;
2041	da->cnt++;
2042
2043	return (0);
2044}
2045
2046/*
2047 * Gets number of items from table either using
2048 * internal counter or calling algo callback for
2049 * externally-managed tables.
2050 *
2051 * Returns number of records.
2052 */
2053static uint32_t
2054table_get_count(struct ip_fw_chain *ch, struct table_config *tc)
2055{
2056	struct table_info *ti;
2057	struct table_algo *ta;
2058	struct dump_args da;
2059
2060	ti = KIDX_TO_TI(ch, tc->no.kidx);
2061	ta = tc->ta;
2062
2063	/* Use internal counter for self-managed tables */
2064	if ((ta->flags & TA_FLAG_READONLY) == 0)
2065		return (tc->count);
2066
2067	/* Use callback to quickly get number of items */
2068	if ((ta->flags & TA_FLAG_EXTCOUNTER) != 0)
2069		return (ta->get_count(tc->astate, ti));
2070
2071	/* Count number of items ourselves */
2072	memset(&da, 0, sizeof(da));
2073	ta->foreach(tc->astate, ti, count_ext_entries, &da);
2074
2075	return (da.cnt);
2076}
2077
2078/*
2079 * Exports table @tc info into standard ipfw_xtable_info format.
2080 */
2081static void
2082export_table_info(struct ip_fw_chain *ch, struct table_config *tc,
2083    ipfw_xtable_info *i)
2084{
2085	struct table_info *ti;
2086	struct table_algo *ta;
2087
2088	i->type = tc->no.subtype;
2089	i->tflags = tc->tflags;
2090	i->vmask = tc->vmask;
2091	i->set = tc->no.set;
2092	i->kidx = tc->no.kidx;
2093	i->refcnt = tc->no.refcnt;
2094	i->count = table_get_count(ch, tc);
2095	i->limit = tc->limit;
2096	i->flags |= (tc->locked != 0) ? IPFW_TGFLAGS_LOCKED : 0;
2097	i->size = tc->count * sizeof(ipfw_obj_tentry);
2098	i->size += sizeof(ipfw_obj_header) + sizeof(ipfw_xtable_info);
2099	strlcpy(i->tablename, tc->tablename, sizeof(i->tablename));
2100	ti = KIDX_TO_TI(ch, tc->no.kidx);
2101	ta = tc->ta;
2102	if (ta->print_config != NULL) {
2103		/* Use algo function to print table config to string */
2104		ta->print_config(tc->astate, ti, i->algoname,
2105		    sizeof(i->algoname));
2106	} else
2107		strlcpy(i->algoname, ta->name, sizeof(i->algoname));
2108	/* Dump algo-specific data, if possible */
2109	if (ta->dump_tinfo != NULL) {
2110		ta->dump_tinfo(tc->astate, ti, &i->ta_info);
2111		i->ta_info.flags |= IPFW_TATFLAGS_DATA;
2112	}
2113}
2114
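/* Arguments passed to the per-table export callback of export_tables(). */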
2115struct dump_table_args {
2116	struct ip_fw_chain *ch;
2117	struct sockopt_data *sd;
2118};
2119
2120static void
2121export_table_internal(struct namedobj_instance *ni, struct named_object *no,
2122    void *arg)
2123{
2124	ipfw_xtable_info *i;
2125	struct dump_table_args *dta;
2126
2127	dta = (struct dump_table_args *)arg;
2128
2129	i = (ipfw_xtable_info *)ipfw_get_sopt_space(dta->sd, sizeof(*i));
2130	KASSERT(i != NULL, ("previously checked buffer is not enough"));
2131
2132	export_table_info(dta->ch, (struct table_config *)no, i);
2133}
2134
2135/*
2136 * Export all tables as ipfw_xtable_info structures to
2137 * storage provided by @sd.
2138 *
2139 * If supplied buffer is too small, fills in required size
2140 * and returns ENOMEM.
2141 * Returns 0 on success.
2142 */
2143static int
2144export_tables(struct ip_fw_chain *ch, ipfw_obj_lheader *olh,
2145    struct sockopt_data *sd)
2146{
2147	uint32_t size;
2148	uint32_t count;
2149	struct dump_table_args dta;
2150
2151	count = ipfw_objhash_count(CHAIN_TO_NI(ch));
2152	size = count * sizeof(ipfw_xtable_info) + sizeof(ipfw_obj_lheader);
2153
2154	/* Fill in header regardless of buffer size */
2155	olh->count = count;
2156	olh->objsize = sizeof(ipfw_xtable_info);
2157
2158	if (size > olh->size) {
2159		olh->size = size;
2160		return (ENOMEM);
2161	}
2162
2163	olh->size = size;
2164
2165	dta.ch = ch;
2166	dta.sd = sd;
2167
2168	ipfw_objhash_foreach(CHAIN_TO_NI(ch), export_table_internal, &dta);
2169
2170	return (0);
2171}
2172
2173/*
2174 * Dumps all table data
2175 * Data layout (v1)(current):
2176 * Request: [ ipfw_obj_header ], size = ipfw_xtable_info.size
2177 * Reply: [ ipfw_obj_header ipfw_xtable_info ipfw_obj_tentry x N ]
2178 *
2179 * Returns 0 on success
2180 */
2181static int
2182dump_table_v1(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
2183    struct sockopt_data *sd)
2184{
2185	struct _ipfw_obj_header *oh;
2186	ipfw_xtable_info *i;
2187	struct tid_info ti;
2188	struct table_config *tc;
2189	struct table_algo *ta;
2190	struct dump_args da;
2191	uint32_t sz;
2192
2193	sz = sizeof(ipfw_obj_header) + sizeof(ipfw_xtable_info);
2194	oh = (struct _ipfw_obj_header *)ipfw_get_sopt_header(sd, sz);
2195	if (oh == NULL)
2196		return (EINVAL);
2197
2198	i = (ipfw_xtable_info *)(oh + 1);
2199	objheader_to_ti(oh, &ti);
2200
2201	IPFW_UH_RLOCK(ch);
2202	if ((tc = find_table(CHAIN_TO_NI(ch), &ti)) == NULL) {
2203		IPFW_UH_RUNLOCK(ch);
2204		return (ESRCH);
2205	}
2206	export_table_info(ch, tc, i);
2207
2208	if (sd->valsize < i->size) {
2209
2210		/*
2211		 * Submitted buffer size is not enough.
2212		 * We've already filled in the @i structure with
2213		 * relevant table info including size, so we
2214		 * can return. Buffer will be flushed automatically.
2215		 */
2216		IPFW_UH_RUNLOCK(ch);
2217		return (ENOMEM);
2218	}
2219
2220	/*
2221	 * Do the actual dump in eXtended format
2222	 */
2223	memset(&da, 0, sizeof(da));
2224	da.ch = ch;
2225	da.ti = KIDX_TO_TI(ch, tc->no.kidx);
2226	da.tc = tc;
2227	da.sd = sd;
2228
2229	ta = tc->ta;
2230
2231	ta->foreach(tc->astate, da.ti, dump_table_tentry, &da);
2232	IPFW_UH_RUNLOCK(ch);
2233
2234	return (da.error);
2235}
2236
2237/*
2238 * Dumps all table data
2239 * Data layout (version 0)(legacy):
2240 * Request: [ ipfw_xtable ], size = IP_FW_TABLE_XGETSIZE()
2241 * Reply: [ ipfw_xtable ipfw_table_xentry x N ]
2242 *
2243 * Returns 0 on success
2244 */
2245static int
2246dump_table_v0(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
2247    struct sockopt_data *sd)
2248{
2249	ipfw_xtable *xtbl;
2250	struct tid_info ti;
2251	struct table_config *tc;
2252	struct table_algo *ta;
2253	struct dump_args da;
2254	size_t sz, count;
2255
2256	xtbl = (ipfw_xtable *)ipfw_get_sopt_header(sd, sizeof(ipfw_xtable));
2257	if (xtbl == NULL)
2258		return (EINVAL);
2259
2260	memset(&ti, 0, sizeof(ti));
2261	ti.uidx = xtbl->tbl;
2262
2263	IPFW_UH_RLOCK(ch);
2264	if ((tc = find_table(CHAIN_TO_NI(ch), &ti)) == NULL) {
2265		IPFW_UH_RUNLOCK(ch);
2266		return (0);
2267	}
2268	count = table_get_count(ch, tc);
2269	sz = count * sizeof(ipfw_table_xentry) + sizeof(ipfw_xtable);
2270
2271	xtbl->cnt = count;
2272	xtbl->size = sz;
2273	xtbl->type = tc->no.subtype;
2274	xtbl->tbl = ti.uidx;
2275
2276	if (sd->valsize < sz) {
2277
2278		/*
2279		 * Submitted buffer size is not enough.
2280		 * We've already filled in the @xtbl structure with
2281		 * relevant table info including size, so we
2282		 * can return. Buffer will be flushed automatically.
2283		 */
2284		IPFW_UH_RUNLOCK(ch);
2285		return (ENOMEM);
2286	}
2287
2288	/* Do the actual dump in eXtended format */
2289	memset(&da, 0, sizeof(da));
2290	da.ch = ch;
2291	da.ti = KIDX_TO_TI(ch, tc->no.kidx);
2292	da.tc = tc;
2293	da.sd = sd;
2294
2295	ta = tc->ta;
2296
2297	ta->foreach(tc->astate, da.ti, dump_table_xentry, &da);
2298	IPFW_UH_RUNLOCK(ch);
2299
2300	return (0);
2301}
2302
2303/*
2304 * Legacy function to retrieve number of items in table.
2305 */
2306static int
2307get_table_size(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
2308    struct sockopt_data *sd)
2309{
2310	uint32_t *tbl;
2311	struct tid_info ti;
2312	size_t sz;
2313	int error;
2314
2315	sz = sizeof(*op3) + sizeof(uint32_t);
2316	op3 = (ip_fw3_opheader *)ipfw_get_sopt_header(sd, sz);
2317	if (op3 == NULL)
2318		return (EINVAL);
2319
2320	tbl = (uint32_t *)(op3 + 1);
2321	memset(&ti, 0, sizeof(ti));
2322	ti.uidx = *tbl;
2323	IPFW_UH_RLOCK(ch);
2324	error = ipfw_count_xtable(ch, &ti, tbl);
2325	IPFW_UH_RUNLOCK(ch);
2326	return (error);
2327}
2328
2329/*
2330 * Legacy IP_FW_TABLE_GETSIZE handler
2331 */
2332int
2333ipfw_count_table(struct ip_fw_chain *ch, struct tid_info *ti, uint32_t *cnt)
2334{
2335	struct table_config *tc;
2336
2337	if ((tc = find_table(CHAIN_TO_NI(ch), ti)) == NULL)
2338		return (ESRCH);
2339	*cnt = table_get_count(ch, tc);
2340	return (0);
2341}
2342
2343/*
2344 * Legacy IP_FW_TABLE_XGETSIZE handler
2345 */
2346int
2347ipfw_count_xtable(struct ip_fw_chain *ch, struct tid_info *ti, uint32_t *cnt)
2348{
2349	struct table_config *tc;
2350	uint32_t count;
2351
2352	if ((tc = find_table(CHAIN_TO_NI(ch), ti)) == NULL) {
2353		*cnt = 0;
2354		return (0); /* 'table all list' requires success */
2355	}
2356
2357	count = table_get_count(ch, tc);
2358	*cnt = count * sizeof(ipfw_table_xentry);
2359	if (count > 0)
2360		*cnt += sizeof(ipfw_xtable);
2361	return (0);
2362}
2363
2364static int
2365dump_table_entry(void *e, void *arg)
2366{
2367	struct dump_args *da;
2368	struct table_config *tc;
2369	struct table_algo *ta;
2370	ipfw_table_entry *ent;
2371	struct table_value *pval;
2372	int error;
2373
2374	da = (struct dump_args *)arg;
2375
2376	tc = da->tc;
2377	ta = tc->ta;
2378
2379	/* Destination buffer is full, stop the dump */
2380	if (da->cnt == da->size)
2381		return (1);
2382	ent = da->ent++;
2383	ent->tbl = da->uidx;
2384	da->cnt++;
2385
2386	error = ta->dump_tentry(tc->astate, da->ti, e, &da->tent);
2387	if (error != 0)
2388		return (error);
2389
2390	ent->addr = da->tent.k.addr.s_addr;
2391	ent->masklen = da->tent.masklen;
2392	pval = get_table_value(da->ch, da->tc, da->tent.v.kidx);
2393	ent->value = ipfw_export_table_value_legacy(pval);
2394
2395	return (0);
2396}
2397
2398/*
2399 * Dumps table in pre-8.1 legacy format.
2400 */
2401int
2402ipfw_dump_table_legacy(struct ip_fw_chain *ch, struct tid_info *ti,
2403    ipfw_table *tbl)
2404{
2405	struct table_config *tc;
2406	struct table_algo *ta;
2407	struct dump_args da;
2408
2409	tbl->cnt = 0;
2410
2411	if ((tc = find_table(CHAIN_TO_NI(ch), ti)) == NULL)
2412		return (0);	/* XXX: We should return ESRCH */
2413
2414	ta = tc->ta;
2415
2416	/* This dump format supports IPv4 only */
2417	if (tc->no.subtype != IPFW_TABLE_ADDR)
2418		return (0);
2419
2420	memset(&da, 0, sizeof(da));
2421	da.ch = ch;
2422	da.ti = KIDX_TO_TI(ch, tc->no.kidx);
2423	da.tc = tc;
2424	da.ent = &tbl->ent[0];
2425	da.size = tbl->size;
2426
2427	tbl->cnt = 0;
2428	ta->foreach(tc->astate, da.ti, dump_table_entry, &da);
2429	tbl->cnt = da.cnt;
2430
2431	return (0);
2432}
2433
2434/*
2435 * Dumps table entry in eXtended format (v1)(current).
2436 */
2437static int
2438dump_table_tentry(void *e, void *arg)
2439{
2440	struct dump_args *da;
2441	struct table_config *tc;
2442	struct table_algo *ta;
2443	struct table_value *pval;
2444	ipfw_obj_tentry *tent;
2445	int error;
2446
2447	da = (struct dump_args *)arg;
2448
2449	tc = da->tc;
2450	ta = tc->ta;
2451
2452	tent = (ipfw_obj_tentry *)ipfw_get_sopt_space(da->sd, sizeof(*tent));
2453	/* Out of memory, returning */
2454	if (tent == NULL) {
2455		da->error = ENOMEM;
2456		return (1);
2457	}
2458	tent->head.length = sizeof(ipfw_obj_tentry);
2459	tent->idx = da->uidx;
2460
2461	error = ta->dump_tentry(tc->astate, da->ti, e, tent);
2462	if (error != 0)
2463		return (error);
2464
2465	pval = get_table_value(da->ch, da->tc, tent->v.kidx);
2466	ipfw_export_table_value_v1(pval, &tent->v.value);
2467
2468	return (0);
2469}
2470
2471/*
2472 * Dumps table entry in eXtended format (v0).
2473 */
2474static int
2475dump_table_xentry(void *e, void *arg)
2476{
2477	struct dump_args *da;
2478	struct table_config *tc;
2479	struct table_algo *ta;
2480	ipfw_table_xentry *xent;
2481	ipfw_obj_tentry *tent;
2482	struct table_value *pval;
2483	int error;
2484
2485	da = (struct dump_args *)arg;
2486
2487	tc = da->tc;
2488	ta = tc->ta;
2489
2490	xent = (ipfw_table_xentry *)ipfw_get_sopt_space(da->sd, sizeof(*xent));
2491	/* Out of memory, returning */
2492	if (xent == NULL)
2493		return (1);
2494	xent->len = sizeof(ipfw_table_xentry);
2495	xent->tbl = da->uidx;
2496
2497	memset(&da->tent, 0, sizeof(da->tent));
2498	tent = &da->tent;
2499	error = ta->dump_tentry(tc->astate, da->ti, e, tent);
2500	if (error != 0)
2501		return (error);
2502
2503	/* Convert current format to previous one */
2504	xent->masklen = tent->masklen;
2505	pval = get_table_value(da->ch, da->tc, da->tent.v.kidx);
2506	xent->value = ipfw_export_table_value_legacy(pval);
2507	/* Apply some hacks */
2508	if (tc->no.subtype == IPFW_TABLE_ADDR && tent->subtype == AF_INET) {
2509		xent->k.addr6.s6_addr32[3] = tent->k.addr.s_addr;
2510		xent->flags = IPFW_TCF_INET;
2511	} else
2512		memcpy(&xent->k, &tent->k, sizeof(xent->k));
2513
2514	return (0);
2515}
2516
2517/*
2518 * Helper function to export table algo data
2519 * to tentry format before calling user function.
2520 *
2521 * Returns 0 on success.
2522 */
2523static int
2524prepare_table_tentry(void *e, void *arg)
2525{
2526	struct dump_args *da;
2527	struct table_config *tc;
2528	struct table_algo *ta;
2529	int error;
2530
2531	da = (struct dump_args *)arg;
2532
2533	tc = da->tc;
2534	ta = tc->ta;
2535
2536	error = ta->dump_tentry(tc->astate, da->ti, e, &da->tent);
2537	if (error != 0)
2538		return (error);
2539
2540	da->f(&da->tent, da->farg);
2541
2542	return (0);
2543}
2544
2545/*
2546 * Allow external consumers to read table entries in standard format.
2547 */
2548int
2549ipfw_foreach_table_tentry(struct ip_fw_chain *ch, uint16_t kidx,
2550    ta_foreach_f *f, void *arg)
2551{
2552	struct namedobj_instance *ni;
2553	struct table_config *tc;
2554	struct table_algo *ta;
2555	struct dump_args da;
2556
2557	ni = CHAIN_TO_NI(ch);
2558
2559	tc = (struct table_config *)ipfw_objhash_lookup_kidx(ni, kidx);
2560	if (tc == NULL)
2561		return (ESRCH);
2562
2563	ta = tc->ta;
2564
2565	memset(&da, 0, sizeof(da));
2566	da.ch = ch;
2567	da.ti = KIDX_TO_TI(ch, tc->no.kidx);
2568	da.tc = tc;
2569	da.f = f;
2570	da.farg = arg;
2571
2572	ta->foreach(tc->astate, da.ti, prepare_table_tentry, &da);
2573
2574	return (0);
2575}
2576
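/*
 * A minimal usage sketch (hypothetical consumer, for illustration only):
 * the callback has the same shape as count_ext_entries() above and
 * receives a pointer to the ipfw_obj_tentry prepared by
 * prepare_table_tentry() plus the opaque @arg.
 *
 *	static int
 *	count_cb(void *tent, void *arg)
 *	{
 *
 *		(*(uint32_t *)arg)++;
 *		return (0);
 *	}
 *	...
 *	uint32_t n = 0;
 *	error = ipfw_foreach_table_tentry(ch, kidx, count_cb, &n);
 */
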
2577/*
2578 * Table algorithms
2579 */
2580
2581/*
2582 * Finds algorithm by index, table type or supplied name.
2583 *
2584 * Returns pointer to algo or NULL.
2585 */
2586static struct table_algo *
2587find_table_algo(struct tables_config *tcfg, struct tid_info *ti, char *name)
2588{
2589	int i, l;
2590	struct table_algo *ta;
2591
2592	if (ti->type > IPFW_TABLE_MAXTYPE)
2593		return (NULL);
2594
2595	/* Search by index */
2596	if (ti->atype != 0) {
2597		if (ti->atype > tcfg->algo_count)
2598			return (NULL);
2599		return (tcfg->algo[ti->atype]);
2600	}
2601
2602	if (name == NULL) {
2603		/* Return default algorithm for given type if set */
2604		return (tcfg->def_algo[ti->type]);
2605	}
2606
2607	/* Search by name */
2608	/* TODO: better search */
2609	for (i = 1; i <= tcfg->algo_count; i++) {
2610		ta = tcfg->algo[i];
2611
2612		/*
2613		 * One can supply additional algorithm
2614		 * parameters so we compare only the first word
2615		 * of the supplied name:
2616		 * 'addr:chash hsize=32'
2617		 * '^^^^^^^^^'
2618		 *
2619		 */
2620		l = strlen(ta->name);
2621		if (strncmp(name, ta->name, l) != 0)
2622			continue;
2623		if (name[l] != '\0' && name[l] != ' ')
2624			continue;
2625		/* Check if we're requesting proper table type */
2626		if (ti->type != 0 && ti->type != ta->type)
2627			return (NULL);
2628		return (ta);
2629	}
2630
2631	return (NULL);
2632}
2633
2634/*
2635 * Register new table algo @ta.
2636 * Stores algo id inside @idx.
2637 *
2638 * Returns 0 on success.
2639 */
2640int
2641ipfw_add_table_algo(struct ip_fw_chain *ch, struct table_algo *ta, size_t size,
2642    int *idx)
2643{
2644	struct tables_config *tcfg;
2645	struct table_algo *ta_new;
2646	size_t sz;
2647
2648	if (size > sizeof(struct table_algo))
2649		return (EINVAL);
2650
2651	/* Check for the required on-stack size for add/del */
2652	sz = roundup2(ta->ta_buf_size, sizeof(void *));
2653	if (sz > TA_BUF_SZ)
2654		return (EINVAL);
2655
2656	KASSERT(ta->type <= IPFW_TABLE_MAXTYPE,("Increase IPFW_TABLE_MAXTYPE"));
2657
2658	/* Copy algorithm data to stable storage. */
2659	ta_new = malloc(sizeof(struct table_algo), M_IPFW, M_WAITOK | M_ZERO);
2660	memcpy(ta_new, ta, size);
2661
2662	tcfg = CHAIN_TO_TCFG(ch);
2663
2664	KASSERT(tcfg->algo_count < 255, ("Increase algo array size"));
2665
2666	tcfg->algo[++tcfg->algo_count] = ta_new;
2667	ta_new->idx = tcfg->algo_count;
2668
2669	/* Set algorithm as default one for given type */
2670	if ((ta_new->flags & TA_FLAG_DEFAULT) != 0 &&
2671	    tcfg->def_algo[ta_new->type] == NULL)
2672		tcfg->def_algo[ta_new->type] = ta_new;
2673
2674	*idx = ta_new->idx;
2675
2676	return (0);
2677}
2678
2679/*
2680 * Unregisters table algo using @idx as id.
2681 * XXX: It is NOT safe to call this function in any place
2682 * other than ipfw instance destroy handler.
2683 */
2684void
2685ipfw_del_table_algo(struct ip_fw_chain *ch, int idx)
2686{
2687	struct tables_config *tcfg;
2688	struct table_algo *ta;
2689
2690	tcfg = CHAIN_TO_TCFG(ch);
2691
2692	KASSERT(idx <= tcfg->algo_count, ("algo idx %d out of range 1..%d",
2693	    idx, tcfg->algo_count));
2694
2695	ta = tcfg->algo[idx];
2696	KASSERT(ta != NULL, ("algo idx %d is NULL", idx));
2697
2698	if (tcfg->def_algo[ta->type] == ta)
2699		tcfg->def_algo[ta->type] = NULL;
2700
2701	free(ta, M_IPFW);
2702}
2703
2704/*
2705 * Lists all table algorithms currently available.
2706 * Data layout (v0)(current):
2707 * Request: [ ipfw_obj_lheader ], size = ipfw_obj_lheader.size
2708 * Reply: [ ipfw_obj_lheader ipfw_ta_info x N ]
2709 *
2710 * Returns 0 on success
2711 */
2712static int
2713list_table_algo(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
2714    struct sockopt_data *sd)
2715{
2716	struct _ipfw_obj_lheader *olh;
2717	struct tables_config *tcfg;
2718	ipfw_ta_info *i;
2719	struct table_algo *ta;
2720	uint32_t count, n, size;
2721
2722	olh = (struct _ipfw_obj_lheader *)ipfw_get_sopt_header(sd,sizeof(*olh));
2723	if (olh == NULL)
2724		return (EINVAL);
2725	if (sd->valsize < olh->size)
2726		return (EINVAL);
2727
2728	IPFW_UH_RLOCK(ch);
2729	tcfg = CHAIN_TO_TCFG(ch);
2730	count = tcfg->algo_count;
2731	size = count * sizeof(ipfw_ta_info) + sizeof(ipfw_obj_lheader);
2732
2733	/* Fill in header regardless of buffer size */
2734	olh->count = count;
2735	olh->objsize = sizeof(ipfw_ta_info);
2736
2737	if (size > olh->size) {
2738		olh->size = size;
2739		IPFW_UH_RUNLOCK(ch);
2740		return (ENOMEM);
2741	}
2742	olh->size = size;
2743
2744	for (n = 1; n <= count; n++) {
2745		i = (ipfw_ta_info *)ipfw_get_sopt_space(sd, sizeof(*i));
2746		KASSERT(i != NULL, ("previously checked buffer is not enough"));
2747		ta = tcfg->algo[n];
2748		strlcpy(i->algoname, ta->name, sizeof(i->algoname));
2749		i->type = ta->type;
2750		i->refcnt = ta->refcnt;
2751	}
2752
2753	IPFW_UH_RUNLOCK(ch);
2754
2755	return (0);
2756}
2757
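/*
 * Classify callbacks for table-referencing opcodes: store the table
 * index from the opcode into @puidx and the expected table type into
 * @ptype. classify_srcdst() additionally inspects the optional lookup
 * key selector of O_IP_SRC_LOOKUP/O_IP_DST_LOOKUP to tell address
 * tables from number tables.
 */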
2758static int
2759classify_srcdst(ipfw_insn *cmd, uint16_t *puidx, uint8_t *ptype)
2760{
2761	int v;
2762	/* Basic IPv4/IPv6 or u32 lookups */
2763	*puidx = cmd->arg1;
2764	/* Assume ADDR by default */
2765	*ptype = IPFW_TABLE_ADDR;
2766
2767	if (F_LEN(cmd) > F_INSN_SIZE(ipfw_insn_u32)) {
2768		/*
2769		 * Generic lookup. The key must be
2770		 * in 32-bit big-endian format.
2771		 */
2772		v = ((ipfw_insn_u32 *)cmd)->d[1];
2773		switch (v) {
2774		case 0:
2775		case 1:
2776			/* IPv4 src/dst */
2777			break;
2778		case 2:
2779		case 3:
2780			/* src/dst port */
2781			*ptype = IPFW_TABLE_NUMBER;
2782			break;
2783		case 4:
2784			/* uid/gid */
2785			*ptype = IPFW_TABLE_NUMBER;
2786			break;
2787		case 5:
2788			/* jid */
2789			*ptype = IPFW_TABLE_NUMBER;
2790			break;
2791		case 6:
2792			/* dscp */
2793			*ptype = IPFW_TABLE_NUMBER;
2794			break;
2795		}
2796	}
2797
2798	return (0);
2799}
2800
2801static int
2802classify_via(ipfw_insn *cmd, uint16_t *puidx, uint8_t *ptype)
2803{
2804	ipfw_insn_if *cmdif;
2805
2806	/* Interface table, possibly */
2807	cmdif = (ipfw_insn_if *)cmd;
2808	if (cmdif->name[0] != '\1')
2809		return (1);
2810
2811	*ptype = IPFW_TABLE_INTERFACE;
2812	*puidx = cmdif->p.kidx;
2813
2814	return (0);
2815}
2816
2817static int
2818classify_flow(ipfw_insn *cmd, uint16_t *puidx, uint8_t *ptype)
2819{
2820
2821	*puidx = cmd->arg1;
2822	*ptype = IPFW_TABLE_FLOW;
2823
2824	return (0);
2825}
2826
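/* Update callbacks: write the kernel table index back into an opcode. */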
2827static void
2828update_arg1(ipfw_insn *cmd, uint16_t idx)
2829{
2830
2831	cmd->arg1 = idx;
2832}
2833
2834static void
2835update_via(ipfw_insn *cmd, uint16_t idx)
2836{
2837	ipfw_insn_if *cmdif;
2838
2839	cmdif = (ipfw_insn_if *)cmd;
2840	cmdif->p.kidx = idx;
2841}
2842
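/*
 * Lookup callbacks used by the opcode rewriter to resolve a table
 * reference (by name/index or by kernel index) into its named object.
 */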
2843static int
2844table_findbyname(struct ip_fw_chain *ch, struct tid_info *ti,
2845    struct named_object **pno)
2846{
2847	struct table_config *tc;
2848	int error;
2849
2850	IPFW_UH_WLOCK_ASSERT(ch);
2851
2852	error = find_table_err(CHAIN_TO_NI(ch), ti, &tc);
2853	if (error != 0)
2854		return (error);
2855
2856	*pno = &tc->no;
2857	return (0);
2858}
2859
2860/* XXX: sets-sets! */
2861static struct named_object *
2862table_findbykidx(struct ip_fw_chain *ch, uint16_t idx)
2863{
2864	struct namedobj_instance *ni;
2865	struct table_config *tc;
2866
2867	IPFW_UH_WLOCK_ASSERT(ch);
2868	ni = CHAIN_TO_NI(ch);
2869	tc = (struct table_config *)ipfw_objhash_lookup_kidx(ni, idx);
2870	KASSERT(tc != NULL, ("Table with index %d not found", idx));
2871
2872	return (&tc->no);
2873}
2874
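/*
 * Rewrite descriptors for every table-referencing opcode: each entry
 * binds classify/update/lookup/auto-create callbacks to an opcode.
 */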
2875static struct opcode_obj_rewrite opcodes[] = {
2876	{
2877		O_IP_SRC_LOOKUP, IPFW_TLV_TBL_NAME,
2878		classify_srcdst, update_arg1,
2879		table_findbyname, table_findbykidx, create_table_compat
2880	},
2881	{
2882		O_IP_DST_LOOKUP, IPFW_TLV_TBL_NAME,
2883		classify_srcdst, update_arg1,
2884		table_findbyname, table_findbykidx, create_table_compat
2885	},
2886	{
2887		O_IP_FLOW_LOOKUP, IPFW_TLV_TBL_NAME,
2888		classify_flow, update_arg1,
2889		table_findbyname, table_findbykidx, create_table_compat
2890	},
2891	{
2892		O_XMIT, IPFW_TLV_TBL_NAME,
2893		classify_via, update_via,
2894		table_findbyname, table_findbykidx, create_table_compat
2895	},
2896	{
2897		O_RECV, IPFW_TLV_TBL_NAME,
2898		classify_via, update_via,
2899		table_findbyname, table_findbykidx, create_table_compat
2900	},
2901	{
2902		O_VIA, IPFW_TLV_TBL_NAME,
2903		classify_via, update_via,
2904		table_findbyname, table_findbykidx, create_table_compat
2905	},
2906};
2907
2908
2909/*
2910 * Checks table name for validity.
2911 * Enforce basic length checks, the rest
2912 * should be done in userland.
2913 *
2914 * Returns 0 if name is considered valid.
2915 */
2916int
2917ipfw_check_table_name(char *name)
2918{
2919	int nsize;
2920	ipfw_obj_ntlv *ntlv = NULL;
2921
2922	nsize = sizeof(ntlv->name);
2923
2924	if (strnlen(name, nsize) == nsize)
2925		return (EINVAL);
2926
2927	if (name[0] == '\0')
2928		return (EINVAL);
2929
2930	/*
2931	 * TODO: do some more complicated checks
2932	 */
2933
2934	return (0);
2935}
2936
2937/*
2938 * Find tablename TLV by @uid.
2939 * Check @tlvs for valid data inside.
2940 *
2941 * Returns pointer to found TLV or NULL.
2942 */
2943static ipfw_obj_ntlv *
2944find_name_tlv(void *tlvs, int len, uint16_t uidx)
2945{
2946	ipfw_obj_ntlv *ntlv;
2947	uintptr_t pa, pe;
2948	int l;
2949
2950	pa = (uintptr_t)tlvs;
2951	pe = pa + len;
2952	l = 0;
2953	for (; pa < pe; pa += l) {
2954		ntlv = (ipfw_obj_ntlv *)pa;
2955		l = ntlv->head.length;
2956
2957		if (l != sizeof(*ntlv))
2958			return (NULL);
2959
2960		if (ntlv->head.type != IPFW_TLV_TBL_NAME)
2961			continue;
2962
2963		if (ntlv->idx != uidx)
2964			continue;
2965
2966		if (ipfw_check_table_name(ntlv->name) != 0)
2967			return (NULL);
2968
2969		return (ntlv);
2970	}
2971
2972	return (NULL);
2973}
2974
2975/*
2976 * Finds table config based on either legacy index
2977 * or name in ntlv.
2978 * Note @ti structure contains unchecked data from userland.
2979 *
2980 * Returns 0 on success and fills in @tc with the found config.
2981 */
2982static int
2983find_table_err(struct namedobj_instance *ni, struct tid_info *ti,
2984    struct table_config **tc)
2985{
2986	char *name, bname[16];
2987	struct named_object *no;
2988	ipfw_obj_ntlv *ntlv;
2989	uint32_t set;
2990
2991	if (ti->tlvs != NULL) {
2992		ntlv = find_name_tlv(ti->tlvs, ti->tlen, ti->uidx);
2993		if (ntlv == NULL)
2994			return (EINVAL);
2995		name = ntlv->name;
2996
2997		/*
2998		 * Use set provided by @ti instead of @ntlv one.
2999		 * This is needed due to different sets behavior
3000		 * controlled by V_fw_tables_sets.
3001		 */
3002		set = ti->set;
3003	} else {
3004		snprintf(bname, sizeof(bname), "%d", ti->uidx);
3005		name = bname;
3006		set = 0;
3007	}
3008
3009	no = ipfw_objhash_lookup_name(ni, set, name);
3010	*tc = (struct table_config *)no;
3011
3012	return (0);
3013}
3014
3015/*
3016 * Finds table config based on either legacy index
3017 * or name in ntlv.
3018 * Note @ti structure contains unchecked data from userland.
3019 *
3020 * Returns pointer to table_config or NULL.
3021 */
3022static struct table_config *
3023find_table(struct namedobj_instance *ni, struct tid_info *ti)
3024{
3025	struct table_config *tc;
3026
3027	if (find_table_err(ni, ti, &tc) != 0)
3028		return (NULL);
3029
3030	return (tc);
3031}
3032
3033/*
3034 * Allocate new table config structure using
3035 * specified @algo and @aname.
3036 *
3037 * Returns pointer to config or NULL.
3038 */
3039static struct table_config *
3040alloc_table_config(struct ip_fw_chain *ch, struct tid_info *ti,
3041    struct table_algo *ta, char *aname, uint8_t tflags)
3042{
3043	char *name, bname[16];
3044	struct table_config *tc;
3045	int error;
3046	ipfw_obj_ntlv *ntlv;
3047	uint32_t set;
3048
3049	if (ti->tlvs != NULL) {
3050		ntlv = find_name_tlv(ti->tlvs, ti->tlen, ti->uidx);
3051		if (ntlv == NULL)
3052			return (NULL);
3053		name = ntlv->name;
3054		set = ntlv->set;
3055	} else {
3056		/* Compat part: convert number to string representation */
3057		snprintf(bname, sizeof(bname), "%d", ti->uidx);
3058		name = bname;
3059		set = 0;
3060	}
3061
3062	tc = malloc(sizeof(struct table_config), M_IPFW, M_WAITOK | M_ZERO);
3063	tc->no.name = tc->tablename;
3064	tc->no.subtype = ta->type;
3065	tc->no.set = set;
3066	tc->tflags = tflags;
3067	tc->ta = ta;
3068	strlcpy(tc->tablename, name, sizeof(tc->tablename));
3069	/* Set "shared" value type by default */
3070	tc->vshared = 1;
3071
3072	/* Preallocate data structures for new tables */
3073	error = ta->init(ch, &tc->astate, &tc->ti_copy, aname, tflags);
3074	if (error != 0) {
3075		free(tc, M_IPFW);
3076		return (NULL);
3077	}
3078
3079	return (tc);
3080}
3081
3082/*
3083 * Destroys table state and config.
3084 */
3085static void
3086free_table_config(struct namedobj_instance *ni, struct table_config *tc)
3087{
3088
3089	KASSERT(tc->linked == 0, ("free() on linked config"));
3090	/* UH lock MUST NOT be held */
3091
3092	/*
3093	 * We're using ta without any locking/referencing.
3094	 * TODO: fix this if we're going to use unloadable algos.
3095	 */
3096	tc->ta->destroy(tc->astate, &tc->ti_copy);
3097	free(tc, M_IPFW);
3098}
3099
3100/*
3101 * Links @tc into the named-object instance of @ch.
3102 * Sets the appropriate type/state in the @ch table info.
3103 */
3104static void
3105link_table(struct ip_fw_chain *ch, struct table_config *tc)
3106{
3107	struct namedobj_instance *ni;
3108	struct table_info *ti;
3109	uint16_t kidx;
3110
3111	IPFW_UH_WLOCK_ASSERT(ch);
3112	IPFW_WLOCK_ASSERT(ch);
3113
3114	ni = CHAIN_TO_NI(ch);
3115	kidx = tc->no.kidx;
3116
3117	ipfw_objhash_add(ni, &tc->no);
3118
3119	ti = KIDX_TO_TI(ch, kidx);
3120	*ti = tc->ti_copy;
3121
3122	/* Notify algo on real @ti address */
3123	if (tc->ta->change_ti != NULL)
3124		tc->ta->change_ti(tc->astate, ti);
3125
3126	tc->linked = 1;
3127	tc->ta->refcnt++;
3128}
3129
3130/*
3131 * Unlinks @tc from the named-object instance of @ch.
3132 * Zeroes the runtime state in @ch; a copy is kept in @tc.
3133 */
3134static void
3135unlink_table(struct ip_fw_chain *ch, struct table_config *tc)
3136{
3137	struct namedobj_instance *ni;
3138	struct table_info *ti;
3139	uint16_t kidx;
3140
3141	IPFW_UH_WLOCK_ASSERT(ch);
3142	IPFW_WLOCK_ASSERT(ch);
3143
3144	ni = CHAIN_TO_NI(ch);
3145	kidx = tc->no.kidx;
3146
3147	/* Clear state. @ti copy is already saved inside @tc */
3148	ipfw_objhash_del(ni, &tc->no);
3149	ti = KIDX_TO_TI(ch, kidx);
3150	memset(ti, 0, sizeof(struct table_info));
3151	tc->linked = 0;
3152	tc->ta->refcnt--;
3153
3154	/* Notify algo on real @ti address */
3155	if (tc->ta->change_ti != NULL)
3156		tc->ta->change_ti(tc->astate, NULL);
3157}
3158
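/* Arguments for the set swap/move callbacks below. */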
3159struct swap_table_args {
3160	int set;
3161	int new_set;
3162	int mv;
3163};
3164
3165/*
3166 * Change set for each matching table.
3167 *
3168 * Ensure we dispatch each table only once by setting/checking
3169 * the ochanged field.
3170 */
3171static void
3172swap_table_set(struct namedobj_instance *ni, struct named_object *no,
3173    void *arg)
3174{
3175	struct table_config *tc;
3176	struct swap_table_args *sta;
3177
3178	tc = (struct table_config *)no;
3179	sta = (struct swap_table_args *)arg;
3180
3181	if (no->set != sta->set && (no->set != sta->new_set || sta->mv != 0))
3182		return;
3183
3184	if (tc->ochanged != 0)
3185		return;
3186
3187	tc->ochanged = 1;
3188	ipfw_objhash_del(ni, no);
3189	if (no->set == sta->set)
3190		no->set = sta->new_set;
3191	else
3192		no->set = sta->set;
3193	ipfw_objhash_add(ni, no);
3194}
3195
3196/*
3197 * Cleans up ochange field for all tables.
3198 */
3199static void
3200clean_table_set_data(struct namedobj_instance *ni, struct named_object *no,
3201    void *arg)
3202{
3203	struct table_config *tc;
3204	struct swap_table_args *sta;
3205
3206	tc = (struct table_config *)no;
3207	sta = (struct swap_table_args *)arg;
3208
3209	tc->ochanged = 0;
3210}
3211
3212/*
3213 * Swaps tables within two sets.
3214 */
3215void
3216ipfw_swap_tables_sets(struct ip_fw_chain *ch, uint32_t set,
3217    uint32_t new_set, int mv)
3218{
3219	struct swap_table_args sta;
3220
3221	IPFW_UH_WLOCK_ASSERT(ch);
3222
3223	sta.set = set;
3224	sta.new_set = new_set;
3225	sta.mv = mv;
3226
3227	ipfw_objhash_foreach(CHAIN_TO_NI(ch), swap_table_set, &sta);
3228	ipfw_objhash_foreach(CHAIN_TO_NI(ch), clean_table_set_data, &sta);
3229}
3230
3231/*
3232 * Move all tables which are referenced by rules in @rt to set @new_set.
3233 * Makes sure that all relevant tables are referenced ONLY by the given rules.
3234 *
3235 * Returns 0 on success.
3236 */
3237int
3238ipfw_move_tables_sets(struct ip_fw_chain *ch, ipfw_range_tlv *rt,
3239    uint32_t new_set)
3240{
3241	struct ip_fw *rule;
3242	struct table_config *tc;
3243	struct named_object *no;
3244	struct namedobj_instance *ni;
3245	int bad, i, l, cmdlen;
3246	uint16_t kidx;
3247	ipfw_insn *cmd;
3248
3249	IPFW_UH_WLOCK_ASSERT(ch);
3250
3251	ni = CHAIN_TO_NI(ch);
3252
3253	/* Stage 1: count number of references by given rules */
3254	for (i = 0; i < ch->n_rules - 1; i++) {
3255		rule = ch->map[i];
3256		if (ipfw_match_range(rule, rt) == 0)
3257			continue;
3258
3259		l = rule->cmd_len;
3260		cmd = rule->cmd;
3261		cmdlen = 0;
3262		for ( ;	l > 0 ; l -= cmdlen, cmd += cmdlen) {
3263			cmdlen = F_LEN(cmd);
3264			if (classify_opcode_kidx(cmd, &kidx) != 0)
3265				continue;
3266			no = ipfw_objhash_lookup_kidx(ni, kidx);
3267			KASSERT(no != NULL,
3268			    ("objhash lookup failed on index %d", kidx));
3269			tc = (struct table_config *)no;
3270			tc->ocount++;
3271		}
3272
3273	}
3274
3275	/* Stage 2: verify "ownership" */
3276	bad = 0;
3277	for (i = 0; i < ch->n_rules - 1; i++) {
3278		rule = ch->map[i];
3279		if (ipfw_match_range(rule, rt) == 0)
3280			continue;
3281
3282		l = rule->cmd_len;
3283		cmd = rule->cmd;
3284		cmdlen = 0;
3285		for ( ;	l > 0 ; l -= cmdlen, cmd += cmdlen) {
3286			cmdlen = F_LEN(cmd);
3287			if (classify_opcode_kidx(cmd, &kidx) != 0)
3288				continue;
3289			no = ipfw_objhash_lookup_kidx(ni, kidx);
3290			KASSERT(no != NULL,
3291			    ("objhash lookup failed on index %d", kidx));
3292			tc = (struct table_config *)no;
3293			if (tc->no.refcnt != tc->ocount) {
3294
3295				/*
3296				 * The reference counts differ:
3297				 * other rule(s) are holding a reference to the given
3298				 * table, so it is not possible to change its set.
3299				 *
3300				 * Note that refcnt may account for references
3301				 * from some about-to-be-added rules.
3302				 * Since we don't know their number (and even
3303				 * whether they will be added) it is perfectly OK
3304				 * to return an error here.
3305				 */
3306				bad = 1;
3307				break;
3308			}
3309		}
3310
3311		if (bad != 0)
3312			break;
3313	}
3314
3315	/* Stage 3: change set or cleanup */
3316	for (i = 0; i < ch->n_rules - 1; i++) {
3317		rule = ch->map[i];
3318		if (ipfw_match_range(rule, rt) == 0)
3319			continue;
3320
3321		l = rule->cmd_len;
3322		cmd = rule->cmd;
3323		cmdlen = 0;
3324		for ( ;	l > 0 ; l -= cmdlen, cmd += cmdlen) {
3325			cmdlen = F_LEN(cmd);
3326			if (classify_opcode_kidx(cmd, &kidx) != 0)
3327				continue;
3328			no = ipfw_objhash_lookup_kidx(ni, kidx);
3329			KASSERT(no != NULL,
3330			    ("objhash lookup failed on index %d", kidx));
3331			tc = (struct table_config *)no;
3332
3333			tc->ocount = 0;
3334			if (bad != 0)
3335				continue;
3336
3337			/* Actually change set. */
3338			ipfw_objhash_del(ni, no);
3339			no->set = new_set;
3340			ipfw_objhash_add(ni, no);
3341		}
3342	}
3343
3344	return (bad);
3345}
3346
3347/*
3348 * Finds and bumps refcount for objects referenced by given @rule.
3349 * Auto-creates non-existing tables.
3350 * Fills in @oib array with userland/kernel indexes.
3351 *
3352 * Returns 0 on success.
3353 */
3354static int
3355ref_rule_objects(struct ip_fw_chain *ch, struct ip_fw *rule,
3356    struct rule_check_info *ci, struct obj_idx *oib, struct tid_info *ti)
3357{
3358	int cmdlen, error, l, numnew;
3359	ipfw_insn *cmd;
3360	struct obj_idx *pidx;
3361	int found, unresolved;
3362
3363	pidx = oib;
3364	l = rule->cmd_len;
3365	cmd = rule->cmd;
3366	cmdlen = 0;
3367	error = 0;
3368	numnew = 0;
3369	found = 0;
3370	unresolved = 0;
3371
3372	IPFW_UH_WLOCK(ch);
3373
3374	/* Increase refcount on each existing referenced table. */
3375	for ( ;	l > 0 ; l -= cmdlen, cmd += cmdlen) {
3376		cmdlen = F_LEN(cmd);
3377
3378		error = ref_opcode_object(ch, cmd, ti, pidx, &found, &unresolved);
3379		if (error != 0)
3380			break;
3381		if (found || unresolved) {
3382			pidx->off = rule->cmd_len - l;
3383			pidx++;
3384		}
3385		/*
3386		 * Compatibility stuff for old clients:
3387		 * prepare to manually create non-existing objects.
3388		 */
3389		if (unresolved)
3390			numnew++;
3391	}
3392
3393	if (error != 0) {
3394		/* Unref everything we have already done */
3395		unref_oib_objects(ch, rule->cmd, oib, pidx);
3396		IPFW_UH_WUNLOCK(ch);
3397		return (error);
3398	}
3399
3400	IPFW_UH_WUNLOCK(ch);
3401
3402	found = pidx - oib;
3403	KASSERT(found == ci->object_opcodes,
3404	    ("refcount inconsistency: found: %d total: %d",
3405	    found, ci->object_opcodes));
3406
3407	/* Perform auto-creation for non-existing objects */
3408	if (numnew != 0)
3409		error = create_objects_compat(ch, rule->cmd, oib, pidx, ti);
3410
3411	return (error);
3412}
3413
3414/*
3415 * Checks if each opcode references a table of the appropriate type.
3416 * Bumps the reference count for each table found.
3417 * Rewrites user-supplied opcode values with kernel ones.
3418 *
3419 * Returns 0 on success and appropriate error code otherwise.
3420 */
3421int
3422ipfw_rewrite_rule_uidx(struct ip_fw_chain *chain,
3423    struct rule_check_info *ci)
3424{
3425	int error;
3426	ipfw_insn *cmd;
3427	uint8_t type;
3428	struct obj_idx *p, *pidx_first, *pidx_last;
3429	struct tid_info ti;
3430
3431	/*
3432	 * Prepare an array for storing opcode indices.
3433	 * Use stack allocation by default.
3434	 */
3435	if (ci->object_opcodes <= (sizeof(ci->obuf)/sizeof(ci->obuf[0]))) {
3436		/* Stack */
3437		pidx_first = ci->obuf;
3438	} else
3439		pidx_first = malloc(ci->object_opcodes * sizeof(struct obj_idx),
3440		    M_IPFW, M_WAITOK | M_ZERO);
3441
3442	pidx_last = pidx_first + ci->object_opcodes;
3443	error = 0;
3444	type = 0;
3445	memset(&ti, 0, sizeof(ti));
3446
3447	/*
3448	 * Use default set for looking up tables (old way) or
3449	 * use set rule is assigned to (new way).
3450	 */
3451	ti.set = (V_fw_tables_sets != 0) ? ci->krule->set : 0;
3452	if (ci->ctlv != NULL) {
3453		ti.tlvs = (void *)(ci->ctlv + 1);
3454		ti.tlen = ci->ctlv->head.length - sizeof(ipfw_obj_ctlv);
3455	}
3456
3457	/* Reference all used tables and other objects */
3458	error = ref_rule_objects(chain, ci->krule, ci, pidx_first, &ti);
3459	if (error != 0)
3460		goto free;
3461
3462	/* Perform rule rewrite */
3463	p = pidx_first;
3464	for (p = pidx_first; p < pidx_last; p++) {
3465		cmd = ci->krule->cmd + p->off;
3466		update_opcode_kidx(cmd, p->kidx);
3467	}
3468
3469free:
3470	if (pidx_first != ci->obuf)
3471		free(pidx_first, M_IPFW);
3472
3473	return (error);
3474}
3475
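/* Sockopt handlers for the table-related IP_FW3 commands. */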
3476static struct ipfw_sopt_handler	scodes[] = {
3477	{ IP_FW_TABLE_XCREATE,	0,	HDIR_SET,	create_table },
3478	{ IP_FW_TABLE_XDESTROY,	0,	HDIR_SET,	flush_table_v0 },
3479	{ IP_FW_TABLE_XFLUSH,	0,	HDIR_SET,	flush_table_v0 },
3480	{ IP_FW_TABLE_XMODIFY,	0,	HDIR_BOTH,	modify_table },
3481	{ IP_FW_TABLE_XINFO,	0,	HDIR_GET,	describe_table },
3482	{ IP_FW_TABLES_XLIST,	0,	HDIR_GET,	list_tables },
3483	{ IP_FW_TABLE_XLIST,	0,	HDIR_GET,	dump_table_v0 },
3484	{ IP_FW_TABLE_XLIST,	1,	HDIR_GET,	dump_table_v1 },
3485	{ IP_FW_TABLE_XADD,	0,	HDIR_BOTH,	manage_table_ent_v0 },
3486	{ IP_FW_TABLE_XADD,	1,	HDIR_BOTH,	manage_table_ent_v1 },
3487	{ IP_FW_TABLE_XDEL,	0,	HDIR_BOTH,	manage_table_ent_v0 },
3488	{ IP_FW_TABLE_XDEL,	1,	HDIR_BOTH,	manage_table_ent_v1 },
3489	{ IP_FW_TABLE_XFIND,	0,	HDIR_GET,	find_table_entry },
3490	{ IP_FW_TABLE_XSWAP,	0,	HDIR_SET,	swap_table },
3491	{ IP_FW_TABLES_ALIST,	0,	HDIR_GET,	list_table_algo },
3492	{ IP_FW_TABLE_XGETSIZE,	0,	HDIR_GET,	get_table_size },
3493};
3494
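/*
 * Objhash walker callback: unlinks a single table from the runtime,
 * frees its kernel index and destroys its config. Called by
 * ipfw_destroy_tables() with both UH and runtime locks held.
 */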
3495static void
3496destroy_table_locked(struct namedobj_instance *ni, struct named_object *no,
3497    void *arg)
3498{
3499
3500	unlink_table((struct ip_fw_chain *)arg, (struct table_config *)no);
3501	if (ipfw_objhash_free_idx(ni, no->kidx) != 0)
3502		printf("Error unlinking kidx %d from table %s\n",
3503		    no->kidx, no->name);
3504	free_table_config(ni, (struct table_config *)no);
3505}
3506
3507/*
3508 * Shuts tables module down.
3509 */
3510void
3511ipfw_destroy_tables(struct ip_fw_chain *ch, int last)
3512{
3513
3514	IPFW_DEL_SOPT_HANDLER(last, scodes);
3515	IPFW_DEL_OBJ_REWRITER(last, opcodes);
3516
3517	/* Remove all tables from working set */
3518	IPFW_UH_WLOCK(ch);
3519	IPFW_WLOCK(ch);
3520	ipfw_objhash_foreach(CHAIN_TO_NI(ch), destroy_table_locked, ch);
3521	IPFW_WUNLOCK(ch);
3522	IPFW_UH_WUNLOCK(ch);
3523
3524	/* Free the table_info array itself */
3525	free(ch->tablestate, M_IPFW);
3526
3527	ipfw_table_value_destroy(ch, last);
3528	ipfw_table_algo_destroy(ch);
3529
3530	ipfw_objhash_destroy(CHAIN_TO_NI(ch));
3531	free(CHAIN_TO_TCFG(ch), M_IPFW);
3532}
3533
3534/*
3535 * Starts tables module.
3536 */
3537int
3538ipfw_init_tables(struct ip_fw_chain *ch, int first)
3539{
3540	struct tables_config *tcfg;
3541
3542	/* Allocate pointers */
3543	ch->tablestate = malloc(V_fw_tables_max * sizeof(struct table_info),
3544	    M_IPFW, M_WAITOK | M_ZERO);
3545
3546	tcfg = malloc(sizeof(struct tables_config), M_IPFW, M_WAITOK | M_ZERO);
3547	tcfg->namehash = ipfw_objhash_create(V_fw_tables_max);
3548	ch->tblcfg = tcfg;
3549
3550	ipfw_table_value_init(ch, first);
3551	ipfw_table_algo_init(ch);
3552
3553	IPFW_ADD_OBJ_REWRITER(first, opcodes);
3554	IPFW_ADD_SOPT_HANDLER(first, scodes);
3555	return (0);
3556}
3557