/*-
 * Copyright (C) 1997-2003
 *	Sony Computer Science Laboratories Inc.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $KAME: altq_subr.c,v 1.21 2003/11/06 06:32:53 kjc Exp $
 */

#include "opt_altq.h"
#include "opt_inet.h"
#include "opt_inet6.h"

#include <sys/param.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/kernel.h>
#include <sys/errno.h>
#include <sys/syslog.h>
#include <sys/sysctl.h>
#include <sys/queue.h>

#include <net/if.h>
#include <net/if_var.h>
#include <net/if_private.h>
#include <net/if_dl.h>
#include <net/if_types.h>
#include <net/vnet.h>

#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#ifdef INET6
#include <netinet/ip6.h>
#endif
#include <netinet/tcp.h>
#include <netinet/udp.h>

#include <netpfil/pf/pf.h>
#include <netpfil/pf/pf_altq.h>
#include <net/altq/altq.h>

/* machine dependent clock related includes */
#include <sys/bus.h>
#include <sys/cpu.h>
#include <sys/eventhandler.h>
#include <machine/clock.h>
#if defined(__amd64__) || defined(__i386__)
#include <machine/cpufunc.h>		/* for pentium tsc */
#include <machine/specialreg.h>		/* for CPUID_TSC */
#include <machine/md_var.h>		/* for cpu_feature */
#endif /* __amd64__ || __i386__ */

/*
 * internal function prototypes
 */
static void	tbr_timeout(void *);
static struct mbuf *tbr_dequeue(struct ifaltq *, int);
static int tbr_timer = 0;	/* token bucket regulator timer */
static struct callout tbr_callout;

#ifdef ALTQ3_CLFIER_COMPAT
static int	extract_ports4(struct mbuf *, struct ip *, struct flowinfo_in *);
#ifdef INET6
static int	extract_ports6(struct mbuf *, struct ip6_hdr *,
			       struct flowinfo_in6 *);
#endif
static int	apply_filter4(u_int32_t, struct flow_filter *,
			      struct flowinfo_in *);
static int	apply_ppfilter4(u_int32_t, struct flow_filter *,
				struct flowinfo_in *);
#ifdef INET6
static int	apply_filter6(u_int32_t, struct flow_filter6 *,
			      struct flowinfo_in6 *);
#endif
static int	apply_tosfilter4(u_int32_t, struct flow_filter *,
				 struct flowinfo_in *);
static u_long	get_filt_handle(struct acc_classifier *, int);
static struct acc_filter *filth_to_filtp(struct acc_classifier *, u_long);
static u_int32_t filt2fibmask(struct flow_filter *);

static void	ip4f_cache(struct ip *, struct flowinfo_in *);
static int	ip4f_lookup(struct ip *, struct flowinfo_in *);
static int	ip4f_init(void);
static struct ip4_frag	*ip4f_alloc(void);
static void	ip4f_free(struct ip4_frag *);
#endif /* ALTQ3_CLFIER_COMPAT */

#ifdef ALTQ
SYSCTL_NODE(_kern_features, OID_AUTO, altq, CTLFLAG_RD | CTLFLAG_CAPRD, 0,
    "ALTQ packet queuing");

#define	ALTQ_FEATURE(name, desc)					\
	SYSCTL_INT_WITH_LABEL(_kern_features_altq, OID_AUTO, name,	\
	    CTLFLAG_RD | CTLFLAG_CAPRD, SYSCTL_NULL_INT_PTR, 1,		\
	    desc, "feature")

#ifdef ALTQ_CBQ
ALTQ_FEATURE(cbq, "ALTQ Class Based Queuing discipline");
#endif
#ifdef ALTQ_CODEL
ALTQ_FEATURE(codel, "ALTQ Controlled Delay discipline");
#endif
#ifdef ALTQ_RED
ALTQ_FEATURE(red, "ALTQ Random Early Detection discipline");
#endif
#ifdef ALTQ_RIO
ALTQ_FEATURE(rio, "ALTQ Random Early Drop discipline");
#endif
#ifdef ALTQ_HFSC
ALTQ_FEATURE(hfsc, "ALTQ Hierarchical Packet Scheduler discipline");
#endif
#ifdef ALTQ_PRIQ
ALTQ_FEATURE(priq, "ALTQ Priority Queuing discipline");
#endif
#ifdef ALTQ_FAIRQ
ALTQ_FEATURE(fairq, "ALTQ Fair Queuing discipline");
#endif
#endif

/*
 * alternate queueing support routines
 */

/* look up the queue state by the interface name and the queueing type. */
void *
altq_lookup(char *name, int type)
{
	struct ifnet *ifp;

	if ((ifp = ifunit(name)) != NULL) {
		/* read if_snd unlocked */
		if (type != ALTQT_NONE && ifp->if_snd.altq_type == type)
			return (ifp->if_snd.altq_disc);
	}

	return NULL;
}

int
altq_attach(struct ifaltq *ifq, int type, void *discipline,
	int (*enqueue)(struct ifaltq *, struct mbuf *, struct altq_pktattr *),
	struct mbuf *(*dequeue)(struct ifaltq *, int),
	int (*request)(struct ifaltq *, int, void *))
{
	IFQ_LOCK(ifq);
	if (!ALTQ_IS_READY(ifq)) {
		IFQ_UNLOCK(ifq);
		return ENXIO;
	}

	ifq->altq_type     = type;
	ifq->altq_disc     = discipline;
	ifq->altq_enqueue  = enqueue;
	ifq->altq_dequeue  = dequeue;
	ifq->altq_request  = request;
	ifq->altq_flags &= (ALTQF_CANTCHANGE|ALTQF_ENABLED);
	IFQ_UNLOCK(ifq);
	return 0;
}

int
altq_detach(struct ifaltq *ifq)
{
	IFQ_LOCK(ifq);

	if (!ALTQ_IS_READY(ifq)) {
		IFQ_UNLOCK(ifq);
		return ENXIO;
	}
	if (ALTQ_IS_ENABLED(ifq)) {
		IFQ_UNLOCK(ifq);
		return EBUSY;
	}
	if (!ALTQ_IS_ATTACHED(ifq)) {
		IFQ_UNLOCK(ifq);
		return (0);
	}

	ifq->altq_type     = ALTQT_NONE;
	ifq->altq_disc     = NULL;
	ifq->altq_enqueue  = NULL;
	ifq->altq_dequeue  = NULL;
	ifq->altq_request  = NULL;
	ifq->altq_flags &= ALTQF_CANTCHANGE;

	IFQ_UNLOCK(ifq);
	return 0;
}

int
altq_enable(struct ifaltq *ifq)
{
	int s;

	IFQ_LOCK(ifq);

	if (!ALTQ_IS_READY(ifq)) {
		IFQ_UNLOCK(ifq);
		return ENXIO;
	}
	if (ALTQ_IS_ENABLED(ifq)) {
		IFQ_UNLOCK(ifq);
		return 0;
	}

	s = splnet();
	IFQ_PURGE_NOLOCK(ifq);
	ASSERT(ifq->ifq_len == 0);
	ifq->ifq_drv_maxlen = 0;		/* disable bulk dequeue */
	ifq->altq_flags |= ALTQF_ENABLED;
	splx(s);

	IFQ_UNLOCK(ifq);
	return 0;
}

int
altq_disable(struct ifaltq *ifq)
{
	int s;

	IFQ_LOCK(ifq);
	if (!ALTQ_IS_ENABLED(ifq)) {
		IFQ_UNLOCK(ifq);
		return 0;
	}

	s = splnet();
	IFQ_PURGE_NOLOCK(ifq);
	ASSERT(ifq->ifq_len == 0);
	ifq->altq_flags &= ~(ALTQF_ENABLED);
	splx(s);

	IFQ_UNLOCK(ifq);
	return 0;
}

#ifdef ALTQ_DEBUG
void
altq_assert(const char *file, int line, const char *failedexpr)
{
	(void)printf("altq assertion \"%s\" failed: file \"%s\", line %d\n",
		     failedexpr, file, line);
	panic("altq assertion");
	/* NOTREACHED */
}
#endif

/*
 * internal representation of token bucket parameters
 *	rate:	(byte_per_unittime << TBR_SHIFT)  / machclk_freq
 *		(((bits_per_sec) / 8) << TBR_SHIFT) / machclk_freq
 *	depth:	byte << TBR_SHIFT
 *
 */
#define	TBR_SHIFT	29
#define	TBR_SCALE(x)	((int64_t)(x) << TBR_SHIFT)
#define	TBR_UNSCALE(x)	((x) >> TBR_SHIFT)
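
/*
 * Worked example of the fixed-point scaling above (illustrative values,
 * not from the original source): for a profile rate of 10 Mbps and a
 * machclk_freq of 1 GHz,
 *	tbr_rate = ((10000000 / 8) << TBR_SHIFT) / 1000000000 = 671088
 * scaled bytes of credit per machclk tick, and a 10000-byte bucket scales
 * to tbr_depth = 10000 << TBR_SHIFT.  TBR_UNSCALE() recovers plain bytes.
 */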

static struct mbuf *
tbr_dequeue(struct ifaltq *ifq, int op)
{
	struct tb_regulator *tbr;
	struct mbuf *m;
	int64_t interval;
	u_int64_t now;

	IFQ_LOCK_ASSERT(ifq);
	tbr = ifq->altq_tbr;
	if (op == ALTDQ_REMOVE && tbr->tbr_lastop == ALTDQ_POLL) {
		/* if this is a remove after poll, bypass tbr check */
	} else {
		/* update token only when it is negative */
		if (tbr->tbr_token <= 0) {
			now = read_machclk();
			interval = now - tbr->tbr_last;
			if (interval >= tbr->tbr_filluptime)
				tbr->tbr_token = tbr->tbr_depth;
			else {
				tbr->tbr_token += interval * tbr->tbr_rate;
				if (tbr->tbr_token > tbr->tbr_depth)
					tbr->tbr_token = tbr->tbr_depth;
			}
			tbr->tbr_last = now;
		}
		/* if token is still negative, don't allow dequeue */
		if (tbr->tbr_token <= 0)
			return (NULL);
	}

	if (ALTQ_IS_ENABLED(ifq))
		m = (*ifq->altq_dequeue)(ifq, op);
	else {
		if (op == ALTDQ_POLL)
			_IF_POLL(ifq, m);
		else
			_IF_DEQUEUE(ifq, m);
	}

	if (m != NULL && op == ALTDQ_REMOVE)
		tbr->tbr_token -= TBR_SCALE(m_pktlen(m));
	tbr->tbr_lastop = op;
	return (m);
}

/*
 * set a token bucket regulator.
 * if the specified rate is zero, the token bucket regulator is deleted.
 */
int
tbr_set(struct ifaltq *ifq, struct tb_profile *profile)
{
	struct tb_regulator *tbr, *otbr;

	if (tbr_dequeue_ptr == NULL)
		tbr_dequeue_ptr = tbr_dequeue;

	if (machclk_freq == 0)
		init_machclk();
	if (machclk_freq == 0) {
		printf("tbr_set: no cpu clock available!\n");
		return (ENXIO);
	}

	IFQ_LOCK(ifq);
	if (profile->rate == 0) {
		/* delete this tbr */
		if ((tbr = ifq->altq_tbr) == NULL) {
			IFQ_UNLOCK(ifq);
			return (ENOENT);
		}
		ifq->altq_tbr = NULL;
		free(tbr, M_DEVBUF);
		IFQ_UNLOCK(ifq);
		return (0);
	}

	tbr = malloc(sizeof(struct tb_regulator), M_DEVBUF, M_NOWAIT | M_ZERO);
	if (tbr == NULL) {
		IFQ_UNLOCK(ifq);
		return (ENOMEM);
	}

	tbr->tbr_rate = TBR_SCALE(profile->rate / 8) / machclk_freq;
	tbr->tbr_depth = TBR_SCALE(profile->depth);
	if (tbr->tbr_rate > 0)
		tbr->tbr_filluptime = tbr->tbr_depth / tbr->tbr_rate;
	else
		tbr->tbr_filluptime = LLONG_MAX;
	/*
	 *  The longest time between tbr_dequeue() calls will be about 1
	 *  system tick, as the callout that drives it is scheduled once per
	 *  tick.  The refill-time detection logic in tbr_dequeue() can only
	 *  properly detect the passage of up to LLONG_MAX machclk ticks.
	 *  Therefore, in order for this logic to function properly in the
	 *  extreme case, the maximum value of tbr_filluptime should be
	 *  LLONG_MAX less one system tick's worth of machclk ticks less
	 *  some additional slop factor (here one more system tick's worth
	 *  of machclk ticks).
	 */
	if (tbr->tbr_filluptime > (LLONG_MAX - 2 * machclk_per_tick))
		tbr->tbr_filluptime = LLONG_MAX - 2 * machclk_per_tick;
	tbr->tbr_token = tbr->tbr_depth;
	tbr->tbr_last = read_machclk();
	tbr->tbr_lastop = ALTDQ_REMOVE;

	otbr = ifq->altq_tbr;
	ifq->altq_tbr = tbr;	/* set the new tbr */

	if (otbr != NULL)
		free(otbr, M_DEVBUF);
	else {
		if (tbr_timer == 0) {
			CALLOUT_RESET(&tbr_callout, 1, tbr_timeout, (void *)0);
			tbr_timer = 1;
		}
	}
	IFQ_UNLOCK(ifq);
	return (0);
}
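
/*
 * Usage sketch (illustrative, not part of the original code): a caller
 * would configure a 10 Mbps regulator with a 10 KB bucket roughly as
 * follows, where "ifp" is assumed to be a valid struct ifnet pointer:
 *
 *	struct tb_profile profile;
 *
 *	profile.rate = 10000000;	(bits per second)
 *	profile.depth = 10000;		(bucket depth in bytes)
 *	error = tbr_set(&ifp->if_snd, &profile);
 *
 * Passing a profile with rate == 0 deletes the regulator again.
 */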

/*
 * tbr_timeout goes through the interface list, and kicks the drivers
 * if necessary.
 *
 * MPSAFE
 */
static void
tbr_timeout(void *arg)
{
	VNET_ITERATOR_DECL(vnet_iter);
	struct ifnet *ifp;
	struct epoch_tracker et;
	int active;

	active = 0;
	NET_EPOCH_ENTER(et);
	VNET_LIST_RLOCK_NOSLEEP();
	VNET_FOREACH(vnet_iter) {
		CURVNET_SET(vnet_iter);
		for (ifp = CK_STAILQ_FIRST(&V_ifnet); ifp;
		    ifp = CK_STAILQ_NEXT(ifp, if_link)) {
			/* read from if_snd unlocked */
			if (!TBR_IS_ENABLED(&ifp->if_snd))
				continue;
			active++;
			if (!IFQ_IS_EMPTY(&ifp->if_snd) &&
			    ifp->if_start != NULL)
				(*ifp->if_start)(ifp);
		}
		CURVNET_RESTORE();
	}
	VNET_LIST_RUNLOCK_NOSLEEP();
	NET_EPOCH_EXIT(et);
	if (active > 0)
		CALLOUT_RESET(&tbr_callout, 1, tbr_timeout, (void *)0);
	else
		tbr_timer = 0;	/* don't need tbr_timer anymore */
}

/*
 * attach a discipline to the interface.  if one already exists, it is
 * overridden.
 * Locking is done in the discipline specific attach functions. Basically
 * they call back to altq_attach which takes care of the attach and locking.
 */
int
altq_pfattach(struct pf_altq *a)
{
	int error = 0;

	switch (a->scheduler) {
	case ALTQT_NONE:
		break;
#ifdef ALTQ_CBQ
	case ALTQT_CBQ:
		error = cbq_pfattach(a);
		break;
#endif
#ifdef ALTQ_PRIQ
	case ALTQT_PRIQ:
		error = priq_pfattach(a);
		break;
#endif
#ifdef ALTQ_HFSC
	case ALTQT_HFSC:
		error = hfsc_pfattach(a);
		break;
#endif
#ifdef ALTQ_FAIRQ
	case ALTQT_FAIRQ:
		error = fairq_pfattach(a);
		break;
#endif
#ifdef ALTQ_CODEL
	case ALTQT_CODEL:
		error = codel_pfattach(a);
		break;
#endif
	default:
		error = ENXIO;
	}

	return (error);
}

/*
 * detach a discipline from the interface.
 * it is possible that the discipline was already overridden by another
 * discipline.
 */
int
altq_pfdetach(struct pf_altq *a)
{
	struct ifnet *ifp;
	int s, error = 0;

	if ((ifp = ifunit(a->ifname)) == NULL)
		return (EINVAL);

	/* if this discipline is no longer referenced, just return */
	/* read unlocked from if_snd */
	if (a->altq_disc == NULL || a->altq_disc != ifp->if_snd.altq_disc)
		return (0);

	s = splnet();
	/* read unlocked from if_snd, _disable and _detach take care */
	if (ALTQ_IS_ENABLED(&ifp->if_snd))
		error = altq_disable(&ifp->if_snd);
	if (error == 0)
		error = altq_detach(&ifp->if_snd);
	splx(s);

	return (error);
}

/*
 * add a discipline or a queue
 * Locking is done in the discipline-specific functions with regard to
 * malloc with WAITOK; it is also not yet clear which lock to use.
 */
int
altq_add(struct ifnet *ifp, struct pf_altq *a)
{
	int error = 0;

	if (a->qname[0] != 0)
		return (altq_add_queue(a));

	if (machclk_freq == 0)
		init_machclk();
	if (machclk_freq == 0)
		panic("altq_add: no cpu clock");

	switch (a->scheduler) {
#ifdef ALTQ_CBQ
	case ALTQT_CBQ:
		error = cbq_add_altq(ifp, a);
		break;
#endif
#ifdef ALTQ_PRIQ
	case ALTQT_PRIQ:
		error = priq_add_altq(ifp, a);
		break;
#endif
#ifdef ALTQ_HFSC
	case ALTQT_HFSC:
		error = hfsc_add_altq(ifp, a);
		break;
#endif
#ifdef ALTQ_FAIRQ
	case ALTQT_FAIRQ:
		error = fairq_add_altq(ifp, a);
		break;
#endif
#ifdef ALTQ_CODEL
	case ALTQT_CODEL:
		error = codel_add_altq(ifp, a);
		break;
#endif
	default:
		error = ENXIO;
	}

	return (error);
}

/*
 * remove a discipline or a queue
 * It is yet unclear what lock to use to protect this operation; the
 * discipline-specific functions will determine and grab it
 */
int
altq_remove(struct pf_altq *a)
{
	int error = 0;

	if (a->qname[0] != 0)
		return (altq_remove_queue(a));

	switch (a->scheduler) {
#ifdef ALTQ_CBQ
	case ALTQT_CBQ:
		error = cbq_remove_altq(a);
		break;
#endif
#ifdef ALTQ_PRIQ
	case ALTQT_PRIQ:
		error = priq_remove_altq(a);
		break;
#endif
#ifdef ALTQ_HFSC
	case ALTQT_HFSC:
		error = hfsc_remove_altq(a);
		break;
#endif
#ifdef ALTQ_FAIRQ
	case ALTQT_FAIRQ:
		error = fairq_remove_altq(a);
		break;
#endif
#ifdef ALTQ_CODEL
	case ALTQT_CODEL:
		error = codel_remove_altq(a);
		break;
#endif
	default:
		error = ENXIO;
	}

	return (error);
}

/*
 * add a queue to the discipline
 * It is yet unclear what lock to use to protect this operation; the
 * discipline-specific functions will determine and grab it
 */
int
altq_add_queue(struct pf_altq *a)
{
	int error = 0;

	switch (a->scheduler) {
#ifdef ALTQ_CBQ
	case ALTQT_CBQ:
		error = cbq_add_queue(a);
		break;
#endif
#ifdef ALTQ_PRIQ
	case ALTQT_PRIQ:
		error = priq_add_queue(a);
		break;
#endif
#ifdef ALTQ_HFSC
	case ALTQT_HFSC:
		error = hfsc_add_queue(a);
		break;
#endif
#ifdef ALTQ_FAIRQ
	case ALTQT_FAIRQ:
		error = fairq_add_queue(a);
		break;
#endif
	default:
		error = ENXIO;
	}

	return (error);
}

/*
 * remove a queue from the discipline
 * It is yet unclear what lock to use to protect this operation; the
 * discipline-specific functions will determine and grab it
 */
int
altq_remove_queue(struct pf_altq *a)
{
	int error = 0;

	switch (a->scheduler) {
#ifdef ALTQ_CBQ
	case ALTQT_CBQ:
		error = cbq_remove_queue(a);
		break;
#endif
#ifdef ALTQ_PRIQ
	case ALTQT_PRIQ:
		error = priq_remove_queue(a);
		break;
#endif
#ifdef ALTQ_HFSC
	case ALTQT_HFSC:
		error = hfsc_remove_queue(a);
		break;
#endif
#ifdef ALTQ_FAIRQ
	case ALTQT_FAIRQ:
		error = fairq_remove_queue(a);
		break;
#endif
	default:
		error = ENXIO;
	}

	return (error);
}

/*
 * get queue statistics
 * Locking is done in the discipline-specific functions with regard to
 * copyout operations; it is also not yet clear which lock to use.
 */
int
altq_getqstats(struct pf_altq *a, void *ubuf, int *nbytes, int version)
{
	int error = 0;

	switch (a->scheduler) {
#ifdef ALTQ_CBQ
	case ALTQT_CBQ:
		error = cbq_getqstats(a, ubuf, nbytes, version);
		break;
#endif
#ifdef ALTQ_PRIQ
	case ALTQT_PRIQ:
		error = priq_getqstats(a, ubuf, nbytes, version);
		break;
#endif
#ifdef ALTQ_HFSC
	case ALTQT_HFSC:
		error = hfsc_getqstats(a, ubuf, nbytes, version);
		break;
#endif
#ifdef ALTQ_FAIRQ
	case ALTQT_FAIRQ:
		error = fairq_getqstats(a, ubuf, nbytes, version);
		break;
#endif
#ifdef ALTQ_CODEL
	case ALTQT_CODEL:
		error = codel_getqstats(a, ubuf, nbytes, version);
		break;
#endif
	default:
		error = ENXIO;
	}

	return (error);
}

/*
 * read and write diffserv field in IPv4 or IPv6 header
 */
u_int8_t
read_dsfield(struct mbuf *m, struct altq_pktattr *pktattr)
{
	struct mbuf *m0;
	u_int8_t ds_field = 0;

	if (pktattr == NULL ||
	    (pktattr->pattr_af != AF_INET && pktattr->pattr_af != AF_INET6))
		return ((u_int8_t)0);

	/* verify that pattr_hdr is within the mbuf data */
	for (m0 = m; m0 != NULL; m0 = m0->m_next)
		if ((pktattr->pattr_hdr >= m0->m_data) &&
		    (pktattr->pattr_hdr < m0->m_data + m0->m_len))
			break;
	if (m0 == NULL) {
		/* ick, pattr_hdr is stale */
		pktattr->pattr_af = AF_UNSPEC;
#ifdef ALTQ_DEBUG
		printf("read_dsfield: can't locate header!\n");
#endif
		return ((u_int8_t)0);
	}

	if (pktattr->pattr_af == AF_INET) {
		struct ip *ip = (struct ip *)pktattr->pattr_hdr;

		if (ip->ip_v != 4)
			return ((u_int8_t)0);	/* version mismatch! */
		ds_field = ip->ip_tos;
	}
#ifdef INET6
	else if (pktattr->pattr_af == AF_INET6) {
		struct ip6_hdr *ip6 = (struct ip6_hdr *)pktattr->pattr_hdr;
		u_int32_t flowlabel;

		flowlabel = ntohl(ip6->ip6_flow);
		if ((flowlabel >> 28) != 6)
			return ((u_int8_t)0);	/* version mismatch! */
		ds_field = (flowlabel >> 20) & 0xff;
	}
#endif
	return (ds_field);
}

void
write_dsfield(struct mbuf *m, struct altq_pktattr *pktattr, u_int8_t dsfield)
{
	struct mbuf *m0;

	if (pktattr == NULL ||
	    (pktattr->pattr_af != AF_INET && pktattr->pattr_af != AF_INET6))
		return;

	/* verify that pattr_hdr is within the mbuf data */
	for (m0 = m; m0 != NULL; m0 = m0->m_next)
		if ((pktattr->pattr_hdr >= m0->m_data) &&
		    (pktattr->pattr_hdr < m0->m_data + m0->m_len))
			break;
	if (m0 == NULL) {
		/* ick, pattr_hdr is stale */
		pktattr->pattr_af = AF_UNSPEC;
#ifdef ALTQ_DEBUG
		printf("write_dsfield: can't locate header!\n");
#endif
		return;
	}

	if (pktattr->pattr_af == AF_INET) {
		struct ip *ip = (struct ip *)pktattr->pattr_hdr;
		u_int8_t old;
		int32_t sum;

		if (ip->ip_v != 4)
			return;		/* version mismatch! */
		old = ip->ip_tos;
		dsfield |= old & 3;	/* leave CU bits */
		if (old == dsfield)
			return;
		ip->ip_tos = dsfield;
		/*
		 * update checksum (from RFC1624)
		 *	   HC' = ~(~HC + ~m + m')
		 */
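		/*
		 * Worked example (illustrative, not from the original
		 * source): with old tos 0x00 and dsfield 0xb8, the folded
		 * update below is sum = ~HC + 0xff00 + 0xff + 0xb8; after
		 * the one's-complement folds and the final ~, ip_sum equals
		 * what a full recomputation over the modified header gives.
		 */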
		sum = ~ntohs(ip->ip_sum) & 0xffff;
		sum += 0xff00 + (~old & 0xff) + dsfield;
		sum = (sum >> 16) + (sum & 0xffff);
		sum += (sum >> 16);  /* add carry */

		ip->ip_sum = htons(~sum & 0xffff);
	}
#ifdef INET6
	else if (pktattr->pattr_af == AF_INET6) {
		struct ip6_hdr *ip6 = (struct ip6_hdr *)pktattr->pattr_hdr;
		u_int32_t flowlabel;

		flowlabel = ntohl(ip6->ip6_flow);
		if ((flowlabel >> 28) != 6)
			return;		/* version mismatch! */
		flowlabel = (flowlabel & 0xf03fffff) | (dsfield << 20);
		ip6->ip6_flow = htonl(flowlabel);
	}
#endif
	return;
}
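
/*
 * Usage sketch (illustrative, not part of the original code): to mark a
 * classified packet with the Expedited Forwarding code point (DSCP 46,
 * i.e. a ds field of 0xb8 once shifted past the two CU bits):
 *
 *	write_dsfield(m, pktattr, 0xb8);
 *
 * read_dsfield(m, pktattr) would then return 0xb8 (plus any CU bits
 * preserved by write_dsfield).
 */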

/*
 * high resolution clock support taking advantage of a machine dependent
 * high resolution time counter (e.g., timestamp counter of intel pentium).
 * we assume
 *  - 64-bit-long monotonically-increasing counter
 *  - frequency range is 100MHz-4GHz (CPU speed)
 */
/* if pcc is not available or disabled, emulate 256MHz using microtime() */
#define	MACHCLK_SHIFT	8

int machclk_usepcc;
u_int32_t machclk_freq;
u_int32_t machclk_per_tick;

#if defined(__i386__) && defined(__NetBSD__)
extern u_int64_t cpu_tsc_freq;
#endif

/* Update TSC freq with the value indicated by the caller. */
static void
tsc_freq_changed(void *arg, const struct cf_level *level, int status)
{
	/* If there was an error during the transition, don't do anything. */
	if (status != 0)
		return;

#if defined(__amd64__) || defined(__i386__)
	/* If TSC is P-state invariant, don't do anything. */
	if (tsc_is_invariant)
		return;
#endif

	/* Total setting for this level gives the new frequency in MHz. */
	init_machclk();
}
EVENTHANDLER_DEFINE(cpufreq_post_change, tsc_freq_changed, NULL,
    EVENTHANDLER_PRI_LAST);

static void
init_machclk_setup(void)
{
	callout_init(&tbr_callout, 1);

	machclk_usepcc = 1;

#if (!defined(__amd64__) && !defined(__i386__)) || defined(ALTQ_NOPCC)
	machclk_usepcc = 0;
#endif
#if defined(__FreeBSD__) && defined(SMP)
	machclk_usepcc = 0;
#endif
#if defined(__NetBSD__) && defined(MULTIPROCESSOR)
	machclk_usepcc = 0;
#endif
#if defined(__amd64__) || defined(__i386__)
	/* check if TSC is available */
	if ((cpu_feature & CPUID_TSC) == 0 ||
	    atomic_load_acq_64(&tsc_freq) == 0)
		machclk_usepcc = 0;
#endif
}

void
init_machclk(void)
{
	static int called;

	/* Call one-time initialization function. */
	if (!called) {
		init_machclk_setup();
		called = 1;
	}

	if (machclk_usepcc == 0) {
		/* emulate 256MHz using microtime() */
		machclk_freq = 1000000 << MACHCLK_SHIFT;
		machclk_per_tick = machclk_freq / hz;
#ifdef ALTQ_DEBUG
		printf("altq: emulate %uHz cpu clock\n", machclk_freq);
#endif
		return;
	}

	/*
	 * if the clock frequency (of Pentium TSC or Alpha PCC) is
	 * accessible, just use it.
	 */
#if defined(__amd64__) || defined(__i386__)
	machclk_freq = atomic_load_acq_64(&tsc_freq);
#endif

	/*
	 * if we don't know the clock frequency, measure it.
	 */
	if (machclk_freq == 0) {
		static int	wait;
		struct timeval	tv_start, tv_end;
		u_int64_t	start, end, diff;
		int		timo;

		microtime(&tv_start);
		start = read_machclk();
		timo = hz;	/* 1 sec */
		(void)tsleep(&wait, PWAIT | PCATCH, "init_machclk", timo);
		microtime(&tv_end);
		end = read_machclk();
		diff = (u_int64_t)(tv_end.tv_sec - tv_start.tv_sec) * 1000000
		    + tv_end.tv_usec - tv_start.tv_usec;
		if (diff != 0)
			machclk_freq = (u_int)((end - start) * 1000000 / diff);
	}

	machclk_per_tick = machclk_freq / hz;

#ifdef ALTQ_DEBUG
	printf("altq: CPU clock: %uHz\n", machclk_freq);
#endif
}

#if defined(__OpenBSD__) && defined(__i386__)
static __inline u_int64_t
rdtsc(void)
{
	u_int64_t rv;
	__asm __volatile(".byte 0x0f, 0x31" : "=A" (rv));
	return (rv);
}
#endif /* __OpenBSD__ && __i386__ */

u_int64_t
read_machclk(void)
{
	u_int64_t val;

	if (machclk_usepcc) {
#if defined(__amd64__) || defined(__i386__)
		val = rdtsc();
#else
		panic("read_machclk");
#endif
	} else {
		struct timeval tv, boottime;

		microtime(&tv);
		getboottime(&boottime);
		val = (((u_int64_t)(tv.tv_sec - boottime.tv_sec) * 1000000
		    + tv.tv_usec) << MACHCLK_SHIFT);
	}
	return (val);
}

#ifdef ALTQ3_CLFIER_COMPAT

#ifndef IPPROTO_ESP
#define	IPPROTO_ESP	50		/* encapsulating security payload */
#endif
#ifndef IPPROTO_AH
#define	IPPROTO_AH	51		/* authentication header */
#endif

/*
 * extract flow information from a given packet.
 * filt_bmask shows the flowinfo fields required.
 * we assume the ip header is in one mbuf, and addresses and ports are
 * in network byte order.
 */
int
altq_extractflow(struct mbuf *m, int af, struct flowinfo *flow,
    u_int32_t filt_bmask)
{

	switch (af) {
	case PF_INET: {
		struct flowinfo_in *fin;
		struct ip *ip;

		ip = mtod(m, struct ip *);

		if (ip->ip_v != 4)
			break;

		fin = (struct flowinfo_in *)flow;
		fin->fi_len = sizeof(struct flowinfo_in);
		fin->fi_family = AF_INET;

		fin->fi_proto = ip->ip_p;
		fin->fi_tos = ip->ip_tos;

		fin->fi_src.s_addr = ip->ip_src.s_addr;
		fin->fi_dst.s_addr = ip->ip_dst.s_addr;

		if (filt_bmask & FIMB4_PORTS)
			/* if port info is required, extract port numbers */
			extract_ports4(m, ip, fin);
		else {
			fin->fi_sport = 0;
			fin->fi_dport = 0;
			fin->fi_gpi = 0;
		}
		return (1);
	}

#ifdef INET6
	case PF_INET6: {
		struct flowinfo_in6 *fin6;
		struct ip6_hdr *ip6;

		ip6 = mtod(m, struct ip6_hdr *);
		/* should we check the ip version? */

		fin6 = (struct flowinfo_in6 *)flow;
		fin6->fi6_len = sizeof(struct flowinfo_in6);
		fin6->fi6_family = AF_INET6;

		fin6->fi6_proto = ip6->ip6_nxt;
		fin6->fi6_tclass   = IPV6_TRAFFIC_CLASS(ip6);

		fin6->fi6_flowlabel = ip6->ip6_flow & htonl(0x000fffff);
		fin6->fi6_src = ip6->ip6_src;
		fin6->fi6_dst = ip6->ip6_dst;

		if ((filt_bmask & FIMB6_PORTS) ||
		    ((filt_bmask & FIMB6_PROTO)
		     && ip6->ip6_nxt > IPPROTO_IPV6))
			/*
			 * if port info is required, or proto is required
			 * but there are option headers, extract port
			 * and protocol numbers.
			 */
			extract_ports6(m, ip6, fin6);
		else {
			fin6->fi6_sport = 0;
			fin6->fi6_dport = 0;
			fin6->fi6_gpi = 0;
		}
		return (1);
	}
#endif /* INET6 */

	default:
		break;
	}

	/* failed */
	flow->fi_len = sizeof(struct flowinfo);
	flow->fi_family = AF_UNSPEC;
	return (0);
}

/*
 * helper routine to extract port numbers
 */
/* structure for ipsec and ipv6 option header template */
struct _opt6 {
	u_int8_t	opt6_nxt;	/* next header */
	u_int8_t	opt6_hlen;	/* header extension length */
	u_int16_t	_pad;
	u_int32_t	ah_spi;		/* security parameter index
					   for authentication header */
};
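
/*
 * Note on the length arithmetic below (added commentary): for AH,
 * opt6_hlen is the RFC 4302 payload length in 32-bit words minus 2, so
 * the full header spans 8 + opt6_hlen * 4 bytes; for the generic IPv6
 * extension headers, opt6_hlen counts 8-octet units beyond the first,
 * so the header spans (opt6_hlen + 1) * 8 bytes.
 */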

/*
 * extract port numbers from an IPv4 packet.
 */
static int
extract_ports4(struct mbuf *m, struct ip *ip, struct flowinfo_in *fin)
{
	struct mbuf *m0;
	u_short ip_off;
	u_int8_t proto;
	int	off;

	fin->fi_sport = 0;
	fin->fi_dport = 0;
	fin->fi_gpi = 0;

	ip_off = ntohs(ip->ip_off);
	/* if it is a fragment, try cached fragment info */
	if (ip_off & IP_OFFMASK) {
		ip4f_lookup(ip, fin);
		return (1);
	}

	/* locate the mbuf containing the protocol header */
	for (m0 = m; m0 != NULL; m0 = m0->m_next)
		if (((caddr_t)ip >= m0->m_data) &&
		    ((caddr_t)ip < m0->m_data + m0->m_len))
			break;
	if (m0 == NULL) {
#ifdef ALTQ_DEBUG
		printf("extract_ports4: can't locate header! ip=%p\n", ip);
#endif
		return (0);
	}
	off = ((caddr_t)ip - m0->m_data) + (ip->ip_hl << 2);
	proto = ip->ip_p;

#ifdef ALTQ_IPSEC
 again:
#endif
	while (off >= m0->m_len) {
		off -= m0->m_len;
		m0 = m0->m_next;
		if (m0 == NULL)
			return (0);  /* bogus ip_hl! */
	}
	if (m0->m_len < off + 4)
		return (0);

	switch (proto) {
	case IPPROTO_TCP:
	case IPPROTO_UDP: {
		struct udphdr *udp;

		udp = (struct udphdr *)(mtod(m0, caddr_t) + off);
		fin->fi_sport = udp->uh_sport;
		fin->fi_dport = udp->uh_dport;
		fin->fi_proto = proto;
		}
		break;

#ifdef ALTQ_IPSEC
	case IPPROTO_ESP:
		if (fin->fi_gpi == 0) {
			u_int32_t *gpi;

			gpi = (u_int32_t *)(mtod(m0, caddr_t) + off);
			fin->fi_gpi   = *gpi;
		}
		fin->fi_proto = proto;
		break;

	case IPPROTO_AH: {
			/* get next header and header length */
			struct _opt6 *opt6;

			opt6 = (struct _opt6 *)(mtod(m0, caddr_t) + off);
			proto = opt6->opt6_nxt;
			off += 8 + (opt6->opt6_hlen * 4);
			if (fin->fi_gpi == 0 && m0->m_len >= off + 8)
				fin->fi_gpi = opt6->ah_spi;
		}
		/* go to the next header */
		goto again;
#endif  /* ALTQ_IPSEC */

	default:
		fin->fi_proto = proto;
		return (0);
	}

	/* if this is the first fragment, cache it. */
	if (ip_off & IP_MF)
		ip4f_cache(ip, fin);

	return (1);
}

#ifdef INET6
static int
extract_ports6(struct mbuf *m, struct ip6_hdr *ip6,
    struct flowinfo_in6 *fin6)
{
	struct mbuf *m0;
	int	off;
	u_int8_t proto;

	fin6->fi6_gpi   = 0;
	fin6->fi6_sport = 0;
	fin6->fi6_dport = 0;

	/* locate the mbuf containing the protocol header */
	for (m0 = m; m0 != NULL; m0 = m0->m_next)
		if (((caddr_t)ip6 >= m0->m_data) &&
		    ((caddr_t)ip6 < m0->m_data + m0->m_len))
			break;
	if (m0 == NULL) {
#ifdef ALTQ_DEBUG
		printf("extract_ports6: can't locate header! ip6=%p\n", ip6);
#endif
		return (0);
	}
	off = ((caddr_t)ip6 - m0->m_data) + sizeof(struct ip6_hdr);

	proto = ip6->ip6_nxt;
	do {
		while (off >= m0->m_len) {
			off -= m0->m_len;
			m0 = m0->m_next;
			if (m0 == NULL)
				return (0);
		}
		if (m0->m_len < off + 4)
			return (0);

		switch (proto) {
		case IPPROTO_TCP:
		case IPPROTO_UDP: {
			struct udphdr *udp;

			udp = (struct udphdr *)(mtod(m0, caddr_t) + off);
			fin6->fi6_sport = udp->uh_sport;
			fin6->fi6_dport = udp->uh_dport;
			fin6->fi6_proto = proto;
			}
			return (1);

		case IPPROTO_ESP:
			if (fin6->fi6_gpi == 0) {
				u_int32_t *gpi;

				gpi = (u_int32_t *)(mtod(m0, caddr_t) + off);
				fin6->fi6_gpi   = *gpi;
			}
			fin6->fi6_proto = proto;
			return (1);

		case IPPROTO_AH: {
			/* get next header and header length */
			struct _opt6 *opt6;

			opt6 = (struct _opt6 *)(mtod(m0, caddr_t) + off);
			if (fin6->fi6_gpi == 0 && m0->m_len >= off + 8)
				fin6->fi6_gpi = opt6->ah_spi;
			proto = opt6->opt6_nxt;
			off += 8 + (opt6->opt6_hlen * 4);
			/* go to the next header */
			break;
			}

		case IPPROTO_HOPOPTS:
		case IPPROTO_ROUTING:
		case IPPROTO_DSTOPTS: {
			/* get next header and header length */
			struct _opt6 *opt6;

			opt6 = (struct _opt6 *)(mtod(m0, caddr_t) + off);
			proto = opt6->opt6_nxt;
			off += (opt6->opt6_hlen + 1) * 8;
			/* go to the next header */
			break;
			}

		case IPPROTO_FRAGMENT:
			/* IPv6 fragments are not supported yet */
		default:
			fin6->fi6_proto = proto;
			return (0);
		}
	} while (1);
	/*NOTREACHED*/
}
#endif /* INET6 */

/*
 * altq common classifier
 */
int
acc_add_filter(struct acc_classifier *classifier, struct flow_filter *filter,
    void *class, u_long *phandle)
{
	struct acc_filter *afp, *prev, *tmp;
	int	i, s;

#ifdef INET6
	if (filter->ff_flow.fi_family != AF_INET &&
	    filter->ff_flow.fi_family != AF_INET6)
		return (EINVAL);
#else
	if (filter->ff_flow.fi_family != AF_INET)
		return (EINVAL);
#endif

	/* M_WAITOK cannot fail, and M_ZERO replaces the explicit bzero(). */
	afp = malloc(sizeof(struct acc_filter), M_DEVBUF, M_WAITOK | M_ZERO);

	afp->f_filter = *filter;
	afp->f_class = class;

	i = ACC_WILDCARD_INDEX;
	if (filter->ff_flow.fi_family == AF_INET) {
		struct flow_filter *filter4 = &afp->f_filter;

		/*
		 * if address is 0, it's a wildcard.  if address mask
		 * isn't set, use full mask.
		 */
		if (filter4->ff_flow.fi_dst.s_addr == 0)
			filter4->ff_mask.mask_dst.s_addr = 0;
		else if (filter4->ff_mask.mask_dst.s_addr == 0)
			filter4->ff_mask.mask_dst.s_addr = 0xffffffff;
		if (filter4->ff_flow.fi_src.s_addr == 0)
			filter4->ff_mask.mask_src.s_addr = 0;
		else if (filter4->ff_mask.mask_src.s_addr == 0)
			filter4->ff_mask.mask_src.s_addr = 0xffffffff;

		/* clear extra bits in addresses  */
		filter4->ff_flow.fi_dst.s_addr &=
		    filter4->ff_mask.mask_dst.s_addr;
		filter4->ff_flow.fi_src.s_addr &=
		    filter4->ff_mask.mask_src.s_addr;

		/*
		 * if dst address is a wildcard, use hash-entry
		 * ACC_WILDCARD_INDEX.
		 */
		if (filter4->ff_mask.mask_dst.s_addr != 0xffffffff)
			i = ACC_WILDCARD_INDEX;
		else
			i = ACC_GET_HASH_INDEX(filter4->ff_flow.fi_dst.s_addr);
	}
#ifdef INET6
	else if (filter->ff_flow.fi_family == AF_INET6) {
		struct flow_filter6 *filter6 =
			(struct flow_filter6 *)&afp->f_filter;
#ifndef IN6MASK0 /* taken from kame ipv6 */
#define	IN6MASK0	{{{ 0, 0, 0, 0 }}}
#define	IN6MASK128	{{{ 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff }}}
		const struct in6_addr in6mask0 = IN6MASK0;
		const struct in6_addr in6mask128 = IN6MASK128;
#endif

		if (IN6_IS_ADDR_UNSPECIFIED(&filter6->ff_flow6.fi6_dst))
			filter6->ff_mask6.mask6_dst = in6mask0;
		else if (IN6_IS_ADDR_UNSPECIFIED(&filter6->ff_mask6.mask6_dst))
			filter6->ff_mask6.mask6_dst = in6mask128;
		if (IN6_IS_ADDR_UNSPECIFIED(&filter6->ff_flow6.fi6_src))
			filter6->ff_mask6.mask6_src = in6mask0;
		else if (IN6_IS_ADDR_UNSPECIFIED(&filter6->ff_mask6.mask6_src))
			filter6->ff_mask6.mask6_src = in6mask128;

		/* clear extra bits in addresses  */
		for (i = 0; i < 16; i++)
			filter6->ff_flow6.fi6_dst.s6_addr[i] &=
			    filter6->ff_mask6.mask6_dst.s6_addr[i];
		for (i = 0; i < 16; i++)
			filter6->ff_flow6.fi6_src.s6_addr[i] &=
			    filter6->ff_mask6.mask6_src.s6_addr[i];

		if (filter6->ff_flow6.fi6_flowlabel == 0)
			i = ACC_WILDCARD_INDEX;
		else
			i = ACC_GET_HASH_INDEX(filter6->ff_flow6.fi6_flowlabel);
	}
#endif /* INET6 */

	afp->f_handle = get_filt_handle(classifier, i);

	/* update filter bitmask */
	afp->f_fbmask = filt2fibmask(filter);
	classifier->acc_fbmask |= afp->f_fbmask;

	/*
	 * add this filter to the filter list.
	 * filters are ordered from the highest rule number.
	 */
	s = splnet();
	prev = NULL;
	LIST_FOREACH(tmp, &classifier->acc_filters[i], f_chain) {
		if (tmp->f_filter.ff_ruleno > afp->f_filter.ff_ruleno)
			prev = tmp;
		else
			break;
	}
	if (prev == NULL)
		LIST_INSERT_HEAD(&classifier->acc_filters[i], afp, f_chain);
	else
		LIST_INSERT_AFTER(prev, afp, f_chain);
	splx(s);

	*phandle = afp->f_handle;
	return (0);
}

int
acc_delete_filter(struct acc_classifier *classifier, u_long handle)
{
	struct acc_filter *afp;
	int	s;

	if ((afp = filth_to_filtp(classifier, handle)) == NULL)
		return (EINVAL);

	s = splnet();
	LIST_REMOVE(afp, f_chain);
	splx(s);

	free(afp, M_DEVBUF);

	/* todo: update filt_bmask */

	return (0);
}

/*
 * delete filters referencing the specified class.
 * if the all flag is not 0, delete all the filters.
 */
int
acc_discard_filters(struct acc_classifier *classifier, void *class, int all)
{
	struct acc_filter *afp;
	int	i, s;

	s = splnet();
	for (i = 0; i < ACC_FILTER_TABLESIZE; i++) {
		do {
			LIST_FOREACH(afp, &classifier->acc_filters[i], f_chain)
				if (all || afp->f_class == class) {
					LIST_REMOVE(afp, f_chain);
					free(afp, M_DEVBUF);
					/* start again from the head */
					break;
				}
		} while (afp != NULL);
	}
	splx(s);

	if (all)
		classifier->acc_fbmask = 0;

	return (0);
}

void *
acc_classify(void *clfier, struct mbuf *m, int af)
{
	struct acc_classifier *classifier;
	struct flowinfo flow;
	struct acc_filter *afp;
	int	i;

	classifier = (struct acc_classifier *)clfier;
	altq_extractflow(m, af, &flow, classifier->acc_fbmask);

	if (flow.fi_family == AF_INET) {
		struct flowinfo_in *fp = (struct flowinfo_in *)&flow;

		if ((classifier->acc_fbmask & FIMB4_ALL) == FIMB4_TOS) {
			/* only tos is used */
			LIST_FOREACH(afp,
				 &classifier->acc_filters[ACC_WILDCARD_INDEX],
				 f_chain)
				if (apply_tosfilter4(afp->f_fbmask,
						     &afp->f_filter, fp))
					/* filter matched */
					return (afp->f_class);
		} else if ((classifier->acc_fbmask &
			(~(FIMB4_PROTO|FIMB4_SPORT|FIMB4_DPORT) & FIMB4_ALL))
		    == 0) {
			/* only proto and ports are used */
			LIST_FOREACH(afp,
				 &classifier->acc_filters[ACC_WILDCARD_INDEX],
				 f_chain)
				if (apply_ppfilter4(afp->f_fbmask,
						    &afp->f_filter, fp))
					/* filter matched */
					return (afp->f_class);
		} else {
			/* get the filter hash entry from its dest address */
			i = ACC_GET_HASH_INDEX(fp->fi_dst.s_addr);
			do {
				/*
				 * go through this loop twice.  first for dst
				 * hash, second for wildcards.
				 */
				LIST_FOREACH(afp, &classifier->acc_filters[i],
					     f_chain)
					if (apply_filter4(afp->f_fbmask,
							  &afp->f_filter, fp))
						/* filter matched */
						return (afp->f_class);

				/*
				 * check again for filters with a dst addr
				 * wildcard.
				 * (daddr == 0 || dmask != 0xffffffff).
				 */
				if (i != ACC_WILDCARD_INDEX)
					i = ACC_WILDCARD_INDEX;
				else
					break;
			} while (1);
		}
	}
#ifdef INET6
	else if (flow.fi_family == AF_INET6) {
		struct flowinfo_in6 *fp6 = (struct flowinfo_in6 *)&flow;

		/* get the filter hash entry from its flow ID */
		if (fp6->fi6_flowlabel != 0)
			i = ACC_GET_HASH_INDEX(fp6->fi6_flowlabel);
		else
			/* flowlabel can be zero */
			i = ACC_WILDCARD_INDEX;

		/* go through this loop twice.  first for flow hash, second
		   for wildcards. */
		do {
			LIST_FOREACH(afp, &classifier->acc_filters[i], f_chain)
				if (apply_filter6(afp->f_fbmask,
					(struct flow_filter6 *)&afp->f_filter,
					fp6))
					/* filter matched */
					return (afp->f_class);

			/*
			 * check again for filters with a wildcard.
			 */
			if (i != ACC_WILDCARD_INDEX)
				i = ACC_WILDCARD_INDEX;
			else
				break;
		} while (1);
	}
#endif /* INET6 */

	/* no filter matched */
	return (NULL);
}

static int
apply_filter4(u_int32_t fbmask, struct flow_filter *filt,
    struct flowinfo_in *pkt)
{
	if (filt->ff_flow.fi_family != AF_INET)
		return (0);
	if ((fbmask & FIMB4_SPORT) && filt->ff_flow.fi_sport != pkt->fi_sport)
		return (0);
	if ((fbmask & FIMB4_DPORT) && filt->ff_flow.fi_dport != pkt->fi_dport)
		return (0);
	if ((fbmask & FIMB4_DADDR) &&
	    filt->ff_flow.fi_dst.s_addr !=
	    (pkt->fi_dst.s_addr & filt->ff_mask.mask_dst.s_addr))
		return (0);
	if ((fbmask & FIMB4_SADDR) &&
	    filt->ff_flow.fi_src.s_addr !=
	    (pkt->fi_src.s_addr & filt->ff_mask.mask_src.s_addr))
		return (0);
	if ((fbmask & FIMB4_PROTO) && filt->ff_flow.fi_proto != pkt->fi_proto)
		return (0);
	if ((fbmask & FIMB4_TOS) && filt->ff_flow.fi_tos !=
	    (pkt->fi_tos & filt->ff_mask.mask_tos))
		return (0);
	if ((fbmask & FIMB4_GPI) && filt->ff_flow.fi_gpi != (pkt->fi_gpi))
		return (0);
	/* match */
	return (1);
}

/*
 * filter matching function optimized for a common case that checks
 * only protocol and port numbers
 */
static int
apply_ppfilter4(u_int32_t fbmask, struct flow_filter *filt,
    struct flowinfo_in *pkt)
{
	if (filt->ff_flow.fi_family != AF_INET)
		return (0);
	if ((fbmask & FIMB4_SPORT) && filt->ff_flow.fi_sport != pkt->fi_sport)
		return (0);
	if ((fbmask & FIMB4_DPORT) && filt->ff_flow.fi_dport != pkt->fi_dport)
		return (0);
	if ((fbmask & FIMB4_PROTO) && filt->ff_flow.fi_proto != pkt->fi_proto)
		return (0);
	/* match */
	return (1);
}

/*
 * filter matching function only for tos field.
 */
static int
apply_tosfilter4(u_int32_t fbmask, struct flow_filter *filt,
    struct flowinfo_in *pkt)
{
	if (filt->ff_flow.fi_family != AF_INET)
		return (0);
	if ((fbmask & FIMB4_TOS) && filt->ff_flow.fi_tos !=
	    (pkt->fi_tos & filt->ff_mask.mask_tos))
		return (0);
	/* match */
	return (1);
}

#ifdef INET6
static int
apply_filter6(u_int32_t fbmask, struct flow_filter6 *filt,
    struct flowinfo_in6 *pkt)
{
	int i;

	if (filt->ff_flow6.fi6_family != AF_INET6)
		return (0);
	if ((fbmask & FIMB6_FLABEL) &&
	    filt->ff_flow6.fi6_flowlabel != pkt->fi6_flowlabel)
		return (0);
	if ((fbmask & FIMB6_PROTO) &&
	    filt->ff_flow6.fi6_proto != pkt->fi6_proto)
		return (0);
	if ((fbmask & FIMB6_SPORT) &&
	    filt->ff_flow6.fi6_sport != pkt->fi6_sport)
		return (0);
	if ((fbmask & FIMB6_DPORT) &&
	    filt->ff_flow6.fi6_dport != pkt->fi6_dport)
		return (0);
	if (fbmask & FIMB6_SADDR) {
		for (i = 0; i < 4; i++)
			if (filt->ff_flow6.fi6_src.s6_addr32[i] !=
			    (pkt->fi6_src.s6_addr32[i] &
			     filt->ff_mask6.mask6_src.s6_addr32[i]))
				return (0);
	}
	if (fbmask & FIMB6_DADDR) {
		for (i = 0; i < 4; i++)
			if (filt->ff_flow6.fi6_dst.s6_addr32[i] !=
			    (pkt->fi6_dst.s6_addr32[i] &
			     filt->ff_mask6.mask6_dst.s6_addr32[i]))
				return (0);
	}
	if ((fbmask & FIMB6_TCLASS) &&
	    filt->ff_flow6.fi6_tclass !=
	    (pkt->fi6_tclass & filt->ff_mask6.mask6_tclass))
		return (0);
	if ((fbmask & FIMB6_GPI) &&
	    filt->ff_flow6.fi6_gpi != pkt->fi6_gpi)
		return (0);
	/* match */
	return (1);
}
#endif /* INET6 */

/*
 *  filter handle:
 *	bit 20-28: index to the filter hash table
 *	bit  0-19: unique id in the hash bucket.
 */
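/*
 * Worked example (illustrative): a filter stored in hash bucket 5 with
 * unique id 42 gets the handle (5 << 20) | 42 == 0x50002a, from which
 * filth_to_filtp() recovers bucket 5 via ACC_GET_HINDEX() and then
 * matches the full handle within that bucket's list.
 */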
static u_long
get_filt_handle(struct acc_classifier *classifier, int i)
{
	static u_long handle_number = 1;
	u_long	handle;
	struct acc_filter *afp;

	while (1) {
		handle = handle_number++ & 0x000fffff;

		if (LIST_EMPTY(&classifier->acc_filters[i]))
			break;

		LIST_FOREACH(afp, &classifier->acc_filters[i], f_chain)
			if ((afp->f_handle & 0x000fffff) == handle)
				break;
		if (afp == NULL)
			break;
		/* this handle is already used, try again */
	}

	return ((i << 20) | handle);
}

/* convert filter handle to filter pointer */
static struct acc_filter *
filth_to_filtp(struct acc_classifier *classifier, u_long handle)
{
	struct acc_filter *afp;
	int	i;

	i = ACC_GET_HINDEX(handle);

	LIST_FOREACH(afp, &classifier->acc_filters[i], f_chain)
		if (afp->f_handle == handle)
			return (afp);

	return (NULL);
}

/* create flowinfo bitmask */
static u_int32_t
filt2fibmask(struct flow_filter *filt)
{
	u_int32_t mask = 0;
#ifdef INET6
	struct flow_filter6 *filt6;
#endif

	switch (filt->ff_flow.fi_family) {
	case AF_INET:
		if (filt->ff_flow.fi_proto != 0)
			mask |= FIMB4_PROTO;
		if (filt->ff_flow.fi_tos != 0)
			mask |= FIMB4_TOS;
		if (filt->ff_flow.fi_dst.s_addr != 0)
			mask |= FIMB4_DADDR;
		if (filt->ff_flow.fi_src.s_addr != 0)
			mask |= FIMB4_SADDR;
		if (filt->ff_flow.fi_sport != 0)
			mask |= FIMB4_SPORT;
		if (filt->ff_flow.fi_dport != 0)
			mask |= FIMB4_DPORT;
		if (filt->ff_flow.fi_gpi != 0)
			mask |= FIMB4_GPI;
		break;
#ifdef INET6
	case AF_INET6:
		filt6 = (struct flow_filter6 *)filt;

		if (filt6->ff_flow6.fi6_proto != 0)
			mask |= FIMB6_PROTO;
		if (filt6->ff_flow6.fi6_tclass != 0)
			mask |= FIMB6_TCLASS;
		if (!IN6_IS_ADDR_UNSPECIFIED(&filt6->ff_flow6.fi6_dst))
			mask |= FIMB6_DADDR;
		if (!IN6_IS_ADDR_UNSPECIFIED(&filt6->ff_flow6.fi6_src))
			mask |= FIMB6_SADDR;
		if (filt6->ff_flow6.fi6_sport != 0)
			mask |= FIMB6_SPORT;
		if (filt6->ff_flow6.fi6_dport != 0)
			mask |= FIMB6_DPORT;
		if (filt6->ff_flow6.fi6_gpi != 0)
			mask |= FIMB6_GPI;
		if (filt6->ff_flow6.fi6_flowlabel != 0)
			mask |= FIMB6_FLABEL;
		break;
#endif /* INET6 */
	}
	return (mask);
}

/*
 * helper functions to handle IPv4 fragments.
 * currently only in-sequence fragments are handled.
 *	- fragment info is cached in an LRU list.
 *	- when a first fragment is found, cache its flow info.
 *	- when a non-first fragment is found, lookup the cache.
 */

struct ip4_frag {
	TAILQ_ENTRY(ip4_frag) ip4f_chain;
	char    ip4f_valid;
	u_short ip4f_id;
	struct flowinfo_in ip4f_info;
};

static TAILQ_HEAD(ip4f_list, ip4_frag) ip4f_list; /* IPv4 fragment cache */

#define	IP4F_TABSIZE		16	/* IPv4 fragment cache size */
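
/*
 * Illustrative trace (added commentary): for a UDP datagram split into
 * three fragments, extract_ports4() reads the ports from fragment 1
 * (offset 0, MF set) and caches them via ip4f_cache(); fragments 2 and 3
 * carry no UDP header, so ip4f_lookup() supplies the cached ports keyed
 * on (ip_id, src, dst, proto), and the entry is freed when the fragment
 * with MF clear is seen.
 */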

static void
ip4f_cache(struct ip *ip, struct flowinfo_in *fin)
{
	struct ip4_frag *fp;

	if (TAILQ_EMPTY(&ip4f_list)) {
		/* first time call, allocate fragment cache entries. */
		if (ip4f_init() < 0)
			/* allocation failed! */
			return;
	}

	fp = ip4f_alloc();
	fp->ip4f_id = ip->ip_id;
	fp->ip4f_info.fi_proto = ip->ip_p;
	fp->ip4f_info.fi_src.s_addr = ip->ip_src.s_addr;
	fp->ip4f_info.fi_dst.s_addr = ip->ip_dst.s_addr;

	/* save port numbers */
	fp->ip4f_info.fi_sport = fin->fi_sport;
	fp->ip4f_info.fi_dport = fin->fi_dport;
	fp->ip4f_info.fi_gpi   = fin->fi_gpi;
}

static int
ip4f_lookup(struct ip *ip, struct flowinfo_in *fin)
{
	struct ip4_frag *fp;

	for (fp = TAILQ_FIRST(&ip4f_list); fp != NULL && fp->ip4f_valid;
	     fp = TAILQ_NEXT(fp, ip4f_chain))
		if (ip->ip_id == fp->ip4f_id &&
		    ip->ip_src.s_addr == fp->ip4f_info.fi_src.s_addr &&
		    ip->ip_dst.s_addr == fp->ip4f_info.fi_dst.s_addr &&
		    ip->ip_p == fp->ip4f_info.fi_proto) {
			/* found the matching entry */
			fin->fi_sport = fp->ip4f_info.fi_sport;
			fin->fi_dport = fp->ip4f_info.fi_dport;
			fin->fi_gpi   = fp->ip4f_info.fi_gpi;

			if ((ntohs(ip->ip_off) & IP_MF) == 0)
				/* this is the last fragment,
				   release the entry. */
				ip4f_free(fp);

			return (1);
		}

	/* no matching entry found */
	return (0);
}

static int
ip4f_init(void)
{
	struct ip4_frag *fp;
	int i;

	TAILQ_INIT(&ip4f_list);
	for (i = 0; i < IP4F_TABSIZE; i++) {
		fp = malloc(sizeof(struct ip4_frag),
		       M_DEVBUF, M_NOWAIT);
		if (fp == NULL) {
			printf("ip4f_init: can't alloc %dth entry!\n", i);
			if (i == 0)
				return (-1);
			return (0);
		}
		fp->ip4f_valid = 0;
		TAILQ_INSERT_TAIL(&ip4f_list, fp, ip4f_chain);
	}
	return (0);
}

static struct ip4_frag *
ip4f_alloc(void)
{
	struct ip4_frag *fp;

	/* reclaim an entry at the tail, put it at the head */
	fp = TAILQ_LAST(&ip4f_list, ip4f_list);
	TAILQ_REMOVE(&ip4f_list, fp, ip4f_chain);
	fp->ip4f_valid = 1;
	TAILQ_INSERT_HEAD(&ip4f_list, fp, ip4f_chain);
	return (fp);
}

static void
ip4f_free(struct ip4_frag *fp)
{
	TAILQ_REMOVE(&ip4f_list, fp, ip4f_chain);
	fp->ip4f_valid = 0;
	TAILQ_INSERT_TAIL(&ip4f_list, fp, ip4f_chain);
}

#endif /* ALTQ3_CLFIER_COMPAT */