/*-
 * Copyright (C) 1997-2003
 *	Sony Computer Science Laboratories Inc.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $KAME: altq_subr.c,v 1.21 2003/11/06 06:32:53 kjc Exp $
 * $FreeBSD$
 */

#include "opt_altq.h"
#include "opt_inet.h"
#include "opt_inet6.h"

#include <sys/param.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/kernel.h>
#include <sys/errno.h>
#include <sys/syslog.h>
#include <sys/sysctl.h>
#include <sys/queue.h>

#include <net/if.h>
#include <net/if_var.h>
#include <net/if_dl.h>
#include <net/if_types.h>
#include <net/vnet.h>

#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#ifdef INET6
#include <netinet/ip6.h>
#endif
#include <netinet/tcp.h>
#include <netinet/udp.h>

#include <netpfil/pf/pf.h>
#include <netpfil/pf/pf_altq.h>
#include <net/altq/altq.h>

/* machine dependent clock related includes */
#include <sys/bus.h>
#include <sys/cpu.h>
#include <sys/eventhandler.h>
#include <machine/clock.h>
#if defined(__amd64__) || defined(__i386__)
#include <machine/cpufunc.h>		/* for pentium tsc */
#include <machine/specialreg.h>		/* for CPUID_TSC */
#include <machine/md_var.h>		/* for cpu_feature */
#endif /* __amd64__ || __i386__ */

/*
 * internal function prototypes
 */
static void	tbr_timeout(void *);
int (*altq_input)(struct mbuf *, int) = NULL;
static struct mbuf *tbr_dequeue(struct ifaltq *, int);
static int tbr_timer = 0;	/* token bucket regulator timer */
#if !defined(__FreeBSD__) || (__FreeBSD_version < 600000)
static struct callout tbr_callout = CALLOUT_INITIALIZER;
#else
static struct callout tbr_callout;
#endif

#ifdef ALTQ3_CLFIER_COMPAT
static int 	extract_ports4(struct mbuf *, struct ip *, struct flowinfo_in *);
#ifdef INET6
static int 	extract_ports6(struct mbuf *, struct ip6_hdr *,
			       struct flowinfo_in6 *);
#endif
static int	apply_filter4(u_int32_t, struct flow_filter *,
			      struct flowinfo_in *);
static int	apply_ppfilter4(u_int32_t, struct flow_filter *,
				struct flowinfo_in *);
#ifdef INET6
static int	apply_filter6(u_int32_t, struct flow_filter6 *,
			      struct flowinfo_in6 *);
#endif
static int	apply_tosfilter4(u_int32_t, struct flow_filter *,
				 struct flowinfo_in *);
static u_long	get_filt_handle(struct acc_classifier *, int);
static struct acc_filter *filth_to_filtp(struct acc_classifier *, u_long);
static u_int32_t filt2fibmask(struct flow_filter *);

static void 	ip4f_cache(struct ip *, struct flowinfo_in *);
static int 	ip4f_lookup(struct ip *, struct flowinfo_in *);
static int 	ip4f_init(void);
static struct ip4_frag	*ip4f_alloc(void);
static void 	ip4f_free(struct ip4_frag *);
#endif /* ALTQ3_CLFIER_COMPAT */

/*
 * alternate queueing support routines
 */

/* look up the queue state by the interface name and the queueing type. */
void *
altq_lookup(name, type)
	char *name;
	int type;
{
	struct ifnet *ifp;

	if ((ifp = ifunit(name)) != NULL) {
		/* read if_snd unlocked */
		if (type != ALTQT_NONE && ifp->if_snd.altq_type == type)
			return (ifp->if_snd.altq_disc);
	}

	return NULL;
}

int
altq_attach(ifq, type, discipline, enqueue, dequeue, request, clfier, classify)
	struct ifaltq *ifq;
	int type;
	void *discipline;
	int (*enqueue)(struct ifaltq *, struct mbuf *, struct altq_pktattr *);
	struct mbuf *(*dequeue)(struct ifaltq *, int);
	int (*request)(struct ifaltq *, int, void *);
	void *clfier;
	void *(*classify)(void *, struct mbuf *, int);
{
	IFQ_LOCK(ifq);
	if (!ALTQ_IS_READY(ifq)) {
		IFQ_UNLOCK(ifq);
		return ENXIO;
	}

	ifq->altq_type     = type;
	ifq->altq_disc     = discipline;
	ifq->altq_enqueue  = enqueue;
	ifq->altq_dequeue  = dequeue;
	ifq->altq_request  = request;
	ifq->altq_clfier   = clfier;
	ifq->altq_classify = classify;
	ifq->altq_flags &= (ALTQF_CANTCHANGE|ALTQF_ENABLED);
	IFQ_UNLOCK(ifq);
	return 0;
}

int
altq_detach(ifq)
	struct ifaltq *ifq;
{
	IFQ_LOCK(ifq);

	if (!ALTQ_IS_READY(ifq)) {
		IFQ_UNLOCK(ifq);
		return ENXIO;
	}
	if (ALTQ_IS_ENABLED(ifq)) {
		IFQ_UNLOCK(ifq);
		return EBUSY;
	}
	if (!ALTQ_IS_ATTACHED(ifq)) {
		IFQ_UNLOCK(ifq);
		return (0);
	}

	ifq->altq_type     = ALTQT_NONE;
	ifq->altq_disc     = NULL;
	ifq->altq_enqueue  = NULL;
	ifq->altq_dequeue  = NULL;
	ifq->altq_request  = NULL;
	ifq->altq_clfier   = NULL;
	ifq->altq_classify = NULL;
	ifq->altq_flags &= ALTQF_CANTCHANGE;

	IFQ_UNLOCK(ifq);
	return 0;
}

int
altq_enable(ifq)
	struct ifaltq *ifq;
{
	int s;

	IFQ_LOCK(ifq);

	if (!ALTQ_IS_READY(ifq)) {
		IFQ_UNLOCK(ifq);
		return ENXIO;
	}
	if (ALTQ_IS_ENABLED(ifq)) {
		IFQ_UNLOCK(ifq);
		return 0;
	}

	s = splnet();
	IFQ_PURGE_NOLOCK(ifq);
	ASSERT(ifq->ifq_len == 0);
	ifq->ifq_drv_maxlen = 0;		/* disable bulk dequeue */
	ifq->altq_flags |= ALTQF_ENABLED;
	if (ifq->altq_clfier != NULL)
		ifq->altq_flags |= ALTQF_CLASSIFY;
	splx(s);

	IFQ_UNLOCK(ifq);
	return 0;
}

int
altq_disable(ifq)
	struct ifaltq *ifq;
{
	int s;

	IFQ_LOCK(ifq);
	if (!ALTQ_IS_ENABLED(ifq)) {
		IFQ_UNLOCK(ifq);
		return 0;
	}

	s = splnet();
	IFQ_PURGE_NOLOCK(ifq);
	ASSERT(ifq->ifq_len == 0);
	ifq->altq_flags &= ~(ALTQF_ENABLED|ALTQF_CLASSIFY);
	splx(s);

	IFQ_UNLOCK(ifq);
	return 0;
}

#ifdef ALTQ_DEBUG
void
altq_assert(file, line, failedexpr)
	const char *file, *failedexpr;
	int line;
{
	(void)printf("altq assertion \"%s\" failed: file \"%s\", line %d\n",
		     failedexpr, file, line);
	panic("altq assertion");
	/* NOTREACHED */
}
#endif

/*
 * internal representation of token bucket parameters
 *	rate:	(byte_per_unittime << TBR_SHIFT) / machclk_freq
 *		= (((bits_per_sec) / 8) << TBR_SHIFT) / machclk_freq
 *	depth:	byte << TBR_SHIFT
 */
#define	TBR_SHIFT	29
#define	TBR_SCALE(x)	((int64_t)(x) << TBR_SHIFT)
#define	TBR_UNSCALE(x)	((x) >> TBR_SHIFT)
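/*
 * Worked example (illustrative numbers, not taken from this file):
 * for a 10Mbps profile with machclk_freq of 1GHz,
 *	tbr_rate = ((10000000 / 8) << TBR_SHIFT) / 1000000000 = 671088,
 * i.e. roughly 0.00125 bytes of credit per machclk tick once unscaled
 * by TBR_UNSCALE(), which is 1.25MB/s, or 10Mbps again.
 */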

static struct mbuf *
tbr_dequeue(ifq, op)
	struct ifaltq *ifq;
	int op;
{
	struct tb_regulator *tbr;
	struct mbuf *m;
	int64_t interval;
	u_int64_t now;

	IFQ_LOCK_ASSERT(ifq);
	tbr = ifq->altq_tbr;
	if (op == ALTDQ_REMOVE && tbr->tbr_lastop == ALTDQ_POLL) {
		/* if this is a remove after poll, bypass tbr check */
	} else {
		/* update the token only when it is non-positive */
		if (tbr->tbr_token <= 0) {
			now = read_machclk();
			interval = now - tbr->tbr_last;
			if (interval >= tbr->tbr_filluptime)
				tbr->tbr_token = tbr->tbr_depth;
			else {
				tbr->tbr_token += interval * tbr->tbr_rate;
				if (tbr->tbr_token > tbr->tbr_depth)
					tbr->tbr_token = tbr->tbr_depth;
			}
			tbr->tbr_last = now;
		}
		/* if the token is still not positive, don't allow dequeue */
		if (tbr->tbr_token <= 0)
			return (NULL);
	}

	if (ALTQ_IS_ENABLED(ifq))
		m = (*ifq->altq_dequeue)(ifq, op);
	else {
		if (op == ALTDQ_POLL)
			_IF_POLL(ifq, m);
		else
			_IF_DEQUEUE(ifq, m);
	}

	if (m != NULL && op == ALTDQ_REMOVE)
		tbr->tbr_token -= TBR_SCALE(m_pktlen(m));
	tbr->tbr_lastop = op;
	return (m);
}

/*
 * set a token bucket regulator.
 * if the specified rate is zero, the token bucket regulator is deleted.
 */
int
tbr_set(ifq, profile)
	struct ifaltq *ifq;
	struct tb_profile *profile;
{
	struct tb_regulator *tbr, *otbr;

	if (tbr_dequeue_ptr == NULL)
		tbr_dequeue_ptr = tbr_dequeue;

	if (machclk_freq == 0)
		init_machclk();
	if (machclk_freq == 0) {
		printf("tbr_set: no cpu clock available!\n");
		return (ENXIO);
	}

	IFQ_LOCK(ifq);
	if (profile->rate == 0) {
		/* delete this tbr */
		if ((tbr = ifq->altq_tbr) == NULL) {
			IFQ_UNLOCK(ifq);
			return (ENOENT);
		}
		ifq->altq_tbr = NULL;
		free(tbr, M_DEVBUF);
		IFQ_UNLOCK(ifq);
		return (0);
	}

	tbr = malloc(sizeof(struct tb_regulator), M_DEVBUF, M_NOWAIT | M_ZERO);
	if (tbr == NULL) {
		IFQ_UNLOCK(ifq);
		return (ENOMEM);
	}

	tbr->tbr_rate = TBR_SCALE(profile->rate / 8) / machclk_freq;
	tbr->tbr_depth = TBR_SCALE(profile->depth);
	if (tbr->tbr_rate > 0)
		tbr->tbr_filluptime = tbr->tbr_depth / tbr->tbr_rate;
	else
		tbr->tbr_filluptime = LLONG_MAX;
	/*
	 *  The longest time between tbr_dequeue() calls will be about 1
	 *  system tick, as the callout that drives it is scheduled once per
	 *  tick.  The refill-time detection logic in tbr_dequeue() can only
	 *  properly detect the passage of up to LLONG_MAX machclk ticks.
	 *  Therefore, in order for this logic to function properly in the
	 *  extreme case, the maximum value of tbr_filluptime should be
	 *  LLONG_MAX less one system tick's worth of machclk ticks less
	 *  some additional slop factor (here one more system tick's worth
	 *  of machclk ticks).
	 */
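	/*
	 * For example (hypothetical values): with machclk_freq = 1GHz and
	 * hz = 1000, machclk_per_tick is 1000000, so tbr_filluptime is
	 * capped at LLONG_MAX - 2000000 machclk ticks.
	 */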
	if (tbr->tbr_filluptime > (LLONG_MAX - 2 * machclk_per_tick))
		tbr->tbr_filluptime = LLONG_MAX - 2 * machclk_per_tick;
	tbr->tbr_token = tbr->tbr_depth;
	tbr->tbr_last = read_machclk();
	tbr->tbr_lastop = ALTDQ_REMOVE;

	otbr = ifq->altq_tbr;
	ifq->altq_tbr = tbr;	/* set the new tbr */

	if (otbr != NULL)
		free(otbr, M_DEVBUF);
	else {
		if (tbr_timer == 0) {
			CALLOUT_RESET(&tbr_callout, 1, tbr_timeout, (void *)0);
			tbr_timer = 1;
		}
	}
	IFQ_UNLOCK(ifq);
	return (0);
}
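
/*
 * Minimal usage sketch (hypothetical caller, not part of this file):
 * install a 10Mbps regulator with a 1500-byte bucket on an interface's
 * send queue, then delete it again by setting the rate to zero:
 *
 *	struct tb_profile profile;
 *
 *	profile.rate = 10000000;	(bits per second)
 *	profile.depth = 1500;		(bytes)
 *	error = tbr_set(&ifp->if_snd, &profile);
 *	...
 *	profile.rate = 0;
 *	error = tbr_set(&ifp->if_snd, &profile);
 */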

/*
 * tbr_timeout goes through the interface list, and kicks the drivers
 * if necessary.
 *
 * MPSAFE
 */
static void
tbr_timeout(arg)
	void *arg;
{
	VNET_ITERATOR_DECL(vnet_iter);
	struct ifnet *ifp;
	struct epoch_tracker et;
	int active;

	active = 0;
	NET_EPOCH_ENTER(et);
	VNET_LIST_RLOCK_NOSLEEP();
	VNET_FOREACH(vnet_iter) {
		CURVNET_SET(vnet_iter);
		for (ifp = CK_STAILQ_FIRST(&V_ifnet); ifp;
		    ifp = CK_STAILQ_NEXT(ifp, if_link)) {
			/* read from if_snd unlocked */
			if (!TBR_IS_ENABLED(&ifp->if_snd))
				continue;
			active++;
			if (!IFQ_IS_EMPTY(&ifp->if_snd) &&
			    ifp->if_start != NULL)
				(*ifp->if_start)(ifp);
		}
		CURVNET_RESTORE();
	}
	VNET_LIST_RUNLOCK_NOSLEEP();
	NET_EPOCH_EXIT(et);
	if (active > 0)
		CALLOUT_RESET(&tbr_callout, 1, tbr_timeout, (void *)0);
	else
		tbr_timer = 0;	/* don't need tbr_timer anymore */
}

/*
 * attach a discipline to the interface.  if one already exists, it is
 * overridden.
 * Locking is done in the discipline-specific attach functions, which
 * call back into altq_attach(); altq_attach() takes care of the attach
 * and the locking.
 */
int
altq_pfattach(struct pf_altq *a)
{
	int error = 0;

	switch (a->scheduler) {
	case ALTQT_NONE:
		break;
#ifdef ALTQ_CBQ
	case ALTQT_CBQ:
		error = cbq_pfattach(a);
		break;
#endif
#ifdef ALTQ_PRIQ
	case ALTQT_PRIQ:
		error = priq_pfattach(a);
		break;
#endif
#ifdef ALTQ_HFSC
	case ALTQT_HFSC:
		error = hfsc_pfattach(a);
		break;
#endif
#ifdef ALTQ_FAIRQ
	case ALTQT_FAIRQ:
		error = fairq_pfattach(a);
		break;
#endif
#ifdef ALTQ_CODEL
	case ALTQT_CODEL:
		error = codel_pfattach(a);
		break;
#endif
	default:
		error = ENXIO;
	}

	return (error);
}

/*
 * detach a discipline from the interface.
 * it is possible that the discipline was already overridden by another
 * discipline.
 */
int
altq_pfdetach(struct pf_altq *a)
{
	struct ifnet *ifp;
	int s, error = 0;

	if ((ifp = ifunit(a->ifname)) == NULL)
		return (EINVAL);

	/* if this discipline is no longer referenced, just return */
	/* read unlocked from if_snd */
	if (a->altq_disc == NULL || a->altq_disc != ifp->if_snd.altq_disc)
		return (0);

	s = splnet();
	/* read unlocked from if_snd, _disable and _detach take care */
	if (ALTQ_IS_ENABLED(&ifp->if_snd))
		error = altq_disable(&ifp->if_snd);
	if (error == 0)
		error = altq_detach(&ifp->if_snd);
	splx(s);

	return (error);
}

/*
 * add a discipline or a queue.
 * Locking is done in the discipline-specific functions (they may
 * malloc with M_WAITOK); it is not yet clear which lock to use.
 */
int
altq_add(struct ifnet *ifp, struct pf_altq *a)
{
	int error = 0;

	if (a->qname[0] != 0)
		return (altq_add_queue(a));

	if (machclk_freq == 0)
		init_machclk();
	if (machclk_freq == 0)
		panic("altq_add: no cpu clock");

	switch (a->scheduler) {
#ifdef ALTQ_CBQ
	case ALTQT_CBQ:
		error = cbq_add_altq(ifp, a);
		break;
#endif
#ifdef ALTQ_PRIQ
	case ALTQT_PRIQ:
		error = priq_add_altq(ifp, a);
		break;
#endif
#ifdef ALTQ_HFSC
	case ALTQT_HFSC:
		error = hfsc_add_altq(ifp, a);
		break;
#endif
#ifdef ALTQ_FAIRQ
	case ALTQT_FAIRQ:
		error = fairq_add_altq(ifp, a);
		break;
#endif
#ifdef ALTQ_CODEL
	case ALTQT_CODEL:
		error = codel_add_altq(ifp, a);
		break;
#endif
	default:
		error = ENXIO;
	}

	return (error);
}

/*
 * remove a discipline or a queue.
 * It is not yet clear which lock should protect this operation; the
 * discipline-specific functions will determine and grab it.
 */
int
altq_remove(struct pf_altq *a)
{
	int error = 0;

	if (a->qname[0] != 0)
		return (altq_remove_queue(a));

	switch (a->scheduler) {
#ifdef ALTQ_CBQ
	case ALTQT_CBQ:
		error = cbq_remove_altq(a);
		break;
#endif
#ifdef ALTQ_PRIQ
	case ALTQT_PRIQ:
		error = priq_remove_altq(a);
		break;
#endif
#ifdef ALTQ_HFSC
	case ALTQT_HFSC:
		error = hfsc_remove_altq(a);
		break;
#endif
#ifdef ALTQ_FAIRQ
	case ALTQT_FAIRQ:
		error = fairq_remove_altq(a);
		break;
#endif
#ifdef ALTQ_CODEL
	case ALTQT_CODEL:
		error = codel_remove_altq(a);
		break;
#endif
	default:
		error = ENXIO;
	}

	return (error);
}

/*
 * add a queue to the discipline.
 * It is not yet clear which lock should protect this operation; the
 * discipline-specific functions will determine and grab it.
 */
int
altq_add_queue(struct pf_altq *a)
{
	int error = 0;

	switch (a->scheduler) {
#ifdef ALTQ_CBQ
	case ALTQT_CBQ:
		error = cbq_add_queue(a);
		break;
#endif
#ifdef ALTQ_PRIQ
	case ALTQT_PRIQ:
		error = priq_add_queue(a);
		break;
#endif
#ifdef ALTQ_HFSC
	case ALTQT_HFSC:
		error = hfsc_add_queue(a);
		break;
#endif
#ifdef ALTQ_FAIRQ
	case ALTQT_FAIRQ:
		error = fairq_add_queue(a);
		break;
#endif
	default:
		error = ENXIO;
	}

	return (error);
}

/*
 * remove a queue from the discipline.
 * It is not yet clear which lock should protect this operation; the
 * discipline-specific functions will determine and grab it.
 */
int
altq_remove_queue(struct pf_altq *a)
{
	int error = 0;

	switch (a->scheduler) {
#ifdef ALTQ_CBQ
	case ALTQT_CBQ:
		error = cbq_remove_queue(a);
		break;
#endif
#ifdef ALTQ_PRIQ
	case ALTQT_PRIQ:
		error = priq_remove_queue(a);
		break;
#endif
#ifdef ALTQ_HFSC
	case ALTQT_HFSC:
		error = hfsc_remove_queue(a);
		break;
#endif
#ifdef ALTQ_FAIRQ
	case ALTQT_FAIRQ:
		error = fairq_remove_queue(a);
		break;
#endif
	default:
		error = ENXIO;
	}

	return (error);
}

/*
 * get queue statistics.
 * Locking is done in the discipline-specific functions around the
 * copyout operations; it is not yet clear which lock to use.
 */
int
altq_getqstats(struct pf_altq *a, void *ubuf, int *nbytes, int version)
{
	int error = 0;

	switch (a->scheduler) {
#ifdef ALTQ_CBQ
	case ALTQT_CBQ:
		error = cbq_getqstats(a, ubuf, nbytes, version);
		break;
#endif
#ifdef ALTQ_PRIQ
	case ALTQT_PRIQ:
		error = priq_getqstats(a, ubuf, nbytes, version);
		break;
#endif
#ifdef ALTQ_HFSC
	case ALTQT_HFSC:
		error = hfsc_getqstats(a, ubuf, nbytes, version);
		break;
#endif
#ifdef ALTQ_FAIRQ
	case ALTQT_FAIRQ:
		error = fairq_getqstats(a, ubuf, nbytes, version);
		break;
#endif
#ifdef ALTQ_CODEL
	case ALTQT_CODEL:
		error = codel_getqstats(a, ubuf, nbytes, version);
		break;
#endif
	default:
		error = ENXIO;
	}

	return (error);
}

/*
 * read and write diffserv field in IPv4 or IPv6 header
 */
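/*
 * In IPv4 the DS field is the TOS byte itself.  In IPv6 it is the
 * traffic class, i.e. bits 20-27 of the host-order flow word, which
 * is why the code below uses (flowlabel >> 20) & 0xff.
 */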
u_int8_t
read_dsfield(m, pktattr)
	struct mbuf *m;
	struct altq_pktattr *pktattr;
{
	struct mbuf *m0;
	u_int8_t ds_field = 0;

	if (pktattr == NULL ||
	    (pktattr->pattr_af != AF_INET && pktattr->pattr_af != AF_INET6))
		return ((u_int8_t)0);

	/* verify that pattr_hdr is within the mbuf data */
	for (m0 = m; m0 != NULL; m0 = m0->m_next)
		if ((pktattr->pattr_hdr >= m0->m_data) &&
		    (pktattr->pattr_hdr < m0->m_data + m0->m_len))
			break;
	if (m0 == NULL) {
		/* ick, pattr_hdr is stale */
		pktattr->pattr_af = AF_UNSPEC;
#ifdef ALTQ_DEBUG
		printf("read_dsfield: can't locate header!\n");
#endif
		return ((u_int8_t)0);
	}

	if (pktattr->pattr_af == AF_INET) {
		struct ip *ip = (struct ip *)pktattr->pattr_hdr;

		if (ip->ip_v != 4)
			return ((u_int8_t)0);	/* version mismatch! */
		ds_field = ip->ip_tos;
	}
#ifdef INET6
	else if (pktattr->pattr_af == AF_INET6) {
		struct ip6_hdr *ip6 = (struct ip6_hdr *)pktattr->pattr_hdr;
		u_int32_t flowlabel;

		flowlabel = ntohl(ip6->ip6_flow);
		if ((flowlabel >> 28) != 6)
			return ((u_int8_t)0);	/* version mismatch! */
		ds_field = (flowlabel >> 20) & 0xff;
	}
#endif
	return (ds_field);
}

void
write_dsfield(struct mbuf *m, struct altq_pktattr *pktattr, u_int8_t dsfield)
{
	struct mbuf *m0;

	if (pktattr == NULL ||
	    (pktattr->pattr_af != AF_INET && pktattr->pattr_af != AF_INET6))
		return;

	/* verify that pattr_hdr is within the mbuf data */
	for (m0 = m; m0 != NULL; m0 = m0->m_next)
		if ((pktattr->pattr_hdr >= m0->m_data) &&
		    (pktattr->pattr_hdr < m0->m_data + m0->m_len))
			break;
	if (m0 == NULL) {
		/* ick, pattr_hdr is stale */
		pktattr->pattr_af = AF_UNSPEC;
#ifdef ALTQ_DEBUG
		printf("write_dsfield: can't locate header!\n");
#endif
		return;
	}

	if (pktattr->pattr_af == AF_INET) {
		struct ip *ip = (struct ip *)pktattr->pattr_hdr;
		u_int8_t old;
		int32_t sum;

		if (ip->ip_v != 4)
			return;		/* version mismatch! */
		old = ip->ip_tos;
		dsfield |= old & 3;	/* leave CU bits */
		if (old == dsfield)
			return;
		ip->ip_tos = dsfield;
		/*
		 * update checksum (from RFC1624)
		 *	   HC' = ~(~HC + ~m + m')
		 */
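		/*
		 * E.g. (illustrative): rewriting TOS 0x00 to 0xb8 adds
		 * ~m + m' = 0xff00 + (~0x00 & 0xff) + 0xb8 = 0x100b7
		 * to the complemented sum before the carries are folded
		 * back in below.
		 */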
		sum = ~ntohs(ip->ip_sum) & 0xffff;
		sum += 0xff00 + (~old & 0xff) + dsfield;
		sum = (sum >> 16) + (sum & 0xffff);
		sum += (sum >> 16);  /* add carry */

		ip->ip_sum = htons(~sum & 0xffff);
	}
#ifdef INET6
	else if (pktattr->pattr_af == AF_INET6) {
		struct ip6_hdr *ip6 = (struct ip6_hdr *)pktattr->pattr_hdr;
		u_int32_t flowlabel;

		flowlabel = ntohl(ip6->ip6_flow);
		if ((flowlabel >> 28) != 6)
			return;		/* version mismatch! */
		flowlabel = (flowlabel & 0xf03fffff) | (dsfield << 20);
		ip6->ip6_flow = htonl(flowlabel);
	}
#endif
	return;
}

/*
 * high resolution clock support taking advantage of a machine dependent
 * high resolution time counter (e.g., timestamp counter of intel pentium).
 * we assume
 *  - 64-bit-long monotonically-increasing counter
 *  - frequency range is 100M-4GHz (CPU speed)
 */
/* if pcc is not available or disabled, emulate 256MHz using microtime() */
#define	MACHCLK_SHIFT	8
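/*
 * With MACHCLK_SHIFT of 8 the emulated clock runs at
 * 1000000 << 8 = 256000000 ticks per second: read_machclk() then
 * returns microseconds since boot shifted left by 8 bits.
 */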

int machclk_usepcc;
u_int32_t machclk_freq;
u_int32_t machclk_per_tick;

#if defined(__i386__) && defined(__NetBSD__)
extern u_int64_t cpu_tsc_freq;
#endif

#if (__FreeBSD_version >= 700035)
/* Update TSC freq with the value indicated by the caller. */
static void
tsc_freq_changed(void *arg, const struct cf_level *level, int status)
{
	/* If there was an error during the transition, don't do anything. */
	if (status != 0)
		return;

#if (__FreeBSD_version >= 701102) && (defined(__amd64__) || defined(__i386__))
	/* If TSC is P-state invariant, don't do anything. */
	if (tsc_is_invariant)
		return;
#endif

	/* Total setting for this level gives the new frequency in MHz. */
	init_machclk();
}
EVENTHANDLER_DEFINE(cpufreq_post_change, tsc_freq_changed, NULL,
    EVENTHANDLER_PRI_LAST);
#endif /* __FreeBSD_version >= 700035 */

static void
init_machclk_setup(void)
{
#if (__FreeBSD_version >= 600000)
	callout_init(&tbr_callout, 0);
#endif

	machclk_usepcc = 1;

#if (!defined(__amd64__) && !defined(__i386__)) || defined(ALTQ_NOPCC)
	machclk_usepcc = 0;
#endif
#if defined(__FreeBSD__) && defined(SMP)
	machclk_usepcc = 0;
#endif
#if defined(__NetBSD__) && defined(MULTIPROCESSOR)
	machclk_usepcc = 0;
#endif
#if defined(__amd64__) || defined(__i386__)
	/* check if TSC is available */
	if ((cpu_feature & CPUID_TSC) == 0 ||
	    atomic_load_acq_64(&tsc_freq) == 0)
		machclk_usepcc = 0;
#endif
}

void
init_machclk(void)
{
	static int called;

	/* Call one-time initialization function. */
	if (!called) {
		init_machclk_setup();
		called = 1;
	}

	if (machclk_usepcc == 0) {
		/* emulate 256MHz using microtime() */
		machclk_freq = 1000000 << MACHCLK_SHIFT;
		machclk_per_tick = machclk_freq / hz;
#ifdef ALTQ_DEBUG
		printf("altq: emulate %uHz cpu clock\n", machclk_freq);
#endif
		return;
	}

	/*
	 * if the clock frequency (of Pentium TSC or Alpha PCC) is
	 * accessible, just use it.
	 */
#if defined(__amd64__) || defined(__i386__)
	machclk_freq = atomic_load_acq_64(&tsc_freq);
#endif

	/*
	 * if we don't know the clock frequency, measure it.
	 */
	if (machclk_freq == 0) {
		static int	wait;
		struct timeval	tv_start, tv_end;
		u_int64_t	start, end, diff;
		int		timo;

		microtime(&tv_start);
		start = read_machclk();
		timo = hz;	/* 1 sec */
		(void)tsleep(&wait, PWAIT | PCATCH, "init_machclk", timo);
		microtime(&tv_end);
		end = read_machclk();
		diff = (u_int64_t)(tv_end.tv_sec - tv_start.tv_sec) * 1000000
		    + tv_end.tv_usec - tv_start.tv_usec;
		if (diff != 0)
			machclk_freq = (u_int)((end - start) * 1000000 / diff);
	}
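	/*
	 * E.g. (illustrative): if read_machclk() advanced by 2400000000
	 * over a measured interval of 1000000us, machclk_freq becomes
	 * 2400000000Hz (a 2.4GHz TSC).
	 */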

	machclk_per_tick = machclk_freq / hz;

#ifdef ALTQ_DEBUG
	printf("altq: CPU clock: %uHz\n", machclk_freq);
#endif
}

#if defined(__OpenBSD__) && defined(__i386__)
static __inline u_int64_t
rdtsc(void)
{
	u_int64_t rv;
	__asm __volatile(".byte 0x0f, 0x31" : "=A" (rv));
	return (rv);
}
#endif /* __OpenBSD__ && __i386__ */

u_int64_t
read_machclk(void)
{
	u_int64_t val;

	if (machclk_usepcc) {
#if defined(__amd64__) || defined(__i386__)
		val = rdtsc();
#else
		panic("read_machclk");
#endif
	} else {
		struct timeval tv, boottime;

		microtime(&tv);
		getboottime(&boottime);
		val = (((u_int64_t)(tv.tv_sec - boottime.tv_sec) * 1000000
		    + tv.tv_usec) << MACHCLK_SHIFT);
	}
	return (val);
}

#ifdef ALTQ3_CLFIER_COMPAT

#ifndef IPPROTO_ESP
#define	IPPROTO_ESP	50		/* encapsulating security payload */
#endif
#ifndef IPPROTO_AH
#define	IPPROTO_AH	51		/* authentication header */
#endif

/*
 * extract flow information from a given packet.
 * filt_bmask shows the flowinfo fields required.
 * we assume the ip header is in one mbuf, and addresses and ports are
 * in network byte order.
 */
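/*
 * Typical use (see acc_classify() below; a minimal sketch):
 *
 *	struct flowinfo flow;
 *
 *	if (altq_extractflow(m, AF_INET, &flow, FIMB4_ALL) == 0)
 *		... extraction failed, flow.fi_family is AF_UNSPEC ...
 */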
int
altq_extractflow(m, af, flow, filt_bmask)
	struct mbuf *m;
	int af;
	struct flowinfo *flow;
	u_int32_t	filt_bmask;
{

	switch (af) {
	case PF_INET: {
		struct flowinfo_in *fin;
		struct ip *ip;

		ip = mtod(m, struct ip *);

		if (ip->ip_v != 4)
			break;

		fin = (struct flowinfo_in *)flow;
		fin->fi_len = sizeof(struct flowinfo_in);
		fin->fi_family = AF_INET;

		fin->fi_proto = ip->ip_p;
		fin->fi_tos = ip->ip_tos;

		fin->fi_src.s_addr = ip->ip_src.s_addr;
		fin->fi_dst.s_addr = ip->ip_dst.s_addr;

		if (filt_bmask & FIMB4_PORTS)
			/* if port info is required, extract port numbers */
			extract_ports4(m, ip, fin);
		else {
			fin->fi_sport = 0;
			fin->fi_dport = 0;
			fin->fi_gpi = 0;
		}
		return (1);
	}

#ifdef INET6
	case PF_INET6: {
		struct flowinfo_in6 *fin6;
		struct ip6_hdr *ip6;

		ip6 = mtod(m, struct ip6_hdr *);
		/* should we check the ip version? */

		fin6 = (struct flowinfo_in6 *)flow;
		fin6->fi6_len = sizeof(struct flowinfo_in6);
		fin6->fi6_family = AF_INET6;

		fin6->fi6_proto = ip6->ip6_nxt;
		fin6->fi6_tclass   = IPV6_TRAFFIC_CLASS(ip6);

		fin6->fi6_flowlabel = ip6->ip6_flow & htonl(0x000fffff);
		fin6->fi6_src = ip6->ip6_src;
		fin6->fi6_dst = ip6->ip6_dst;

		if ((filt_bmask & FIMB6_PORTS) ||
		    ((filt_bmask & FIMB6_PROTO)
		     && ip6->ip6_nxt > IPPROTO_IPV6))
			/*
			 * if port info is required, or proto is required
			 * but there are option headers, extract port
			 * and protocol numbers.
			 */
			extract_ports6(m, ip6, fin6);
		else {
			fin6->fi6_sport = 0;
			fin6->fi6_dport = 0;
			fin6->fi6_gpi = 0;
		}
		return (1);
	}
#endif /* INET6 */

	default:
		break;
	}

	/* failed */
	flow->fi_len = sizeof(struct flowinfo);
	flow->fi_family = AF_UNSPEC;
	return (0);
}

/*
 * helper routine to extract port numbers
 */
/* structure for ipsec and ipv6 option header template */
struct _opt6 {
	u_int8_t	opt6_nxt;	/* next header */
	u_int8_t	opt6_hlen;	/* header extension length */
	u_int16_t	_pad;
	u_int32_t	ah_spi;		/* security parameter index
					   for authentication header */
};
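/*
 * The single 8-byte template covers both uses: for AH, opt6_hlen is the
 * payload length in 4-byte units beyond the fixed 8 bytes (hence
 * "off += 8 + (opt6_hlen * 4)" below), while for the IPv6 option
 * headers it counts 8-byte units not including the first one (hence
 * "off += (opt6_hlen + 1) * 8").
 */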

/*
 * extract port numbers from an ipv4 packet.
 */
static int
extract_ports4(m, ip, fin)
	struct mbuf *m;
	struct ip *ip;
	struct flowinfo_in *fin;
{
	struct mbuf *m0;
	u_short ip_off;
	u_int8_t proto;
	int 	off;

	fin->fi_sport = 0;
	fin->fi_dport = 0;
	fin->fi_gpi = 0;

	ip_off = ntohs(ip->ip_off);
	/* if it is a fragment, try cached fragment info */
	if (ip_off & IP_OFFMASK) {
		ip4f_lookup(ip, fin);
		return (1);
	}

	/* locate the mbuf containing the protocol header */
	for (m0 = m; m0 != NULL; m0 = m0->m_next)
		if (((caddr_t)ip >= m0->m_data) &&
		    ((caddr_t)ip < m0->m_data + m0->m_len))
			break;
	if (m0 == NULL) {
#ifdef ALTQ_DEBUG
		printf("extract_ports4: can't locate header! ip=%p\n", ip);
#endif
		return (0);
	}
	off = ((caddr_t)ip - m0->m_data) + (ip->ip_hl << 2);
	proto = ip->ip_p;

#ifdef ALTQ_IPSEC
 again:
#endif
	while (off >= m0->m_len) {
		off -= m0->m_len;
		m0 = m0->m_next;
		if (m0 == NULL)
			return (0);  /* bogus ip_hl! */
	}
	if (m0->m_len < off + 4)
		return (0);

	switch (proto) {
	case IPPROTO_TCP:
	case IPPROTO_UDP: {
		struct udphdr *udp;

		udp = (struct udphdr *)(mtod(m0, caddr_t) + off);
		fin->fi_sport = udp->uh_sport;
		fin->fi_dport = udp->uh_dport;
		fin->fi_proto = proto;
		}
		break;

#ifdef ALTQ_IPSEC
	case IPPROTO_ESP:
		if (fin->fi_gpi == 0) {
			u_int32_t *gpi;

			gpi = (u_int32_t *)(mtod(m0, caddr_t) + off);
			fin->fi_gpi   = *gpi;
		}
		fin->fi_proto = proto;
		break;

	case IPPROTO_AH: {
			/* get next header and header length */
			struct _opt6 *opt6;

			opt6 = (struct _opt6 *)(mtod(m0, caddr_t) + off);
			proto = opt6->opt6_nxt;
			off += 8 + (opt6->opt6_hlen * 4);
			if (fin->fi_gpi == 0 && m0->m_len >= off + 8)
				fin->fi_gpi = opt6->ah_spi;
		}
		/* go to the next header */
		goto again;
#endif  /* ALTQ_IPSEC */

	default:
		fin->fi_proto = proto;
		return (0);
	}

	/* if this is a first fragment, cache it. */
	if (ip_off & IP_MF)
		ip4f_cache(ip, fin);

	return (1);
}

#ifdef INET6
static int
extract_ports6(m, ip6, fin6)
	struct mbuf *m;
	struct ip6_hdr *ip6;
	struct flowinfo_in6 *fin6;
{
	struct mbuf *m0;
	int	off;
	u_int8_t proto;

	fin6->fi6_gpi   = 0;
	fin6->fi6_sport = 0;
	fin6->fi6_dport = 0;

	/* locate the mbuf containing the protocol header */
	for (m0 = m; m0 != NULL; m0 = m0->m_next)
		if (((caddr_t)ip6 >= m0->m_data) &&
		    ((caddr_t)ip6 < m0->m_data + m0->m_len))
			break;
	if (m0 == NULL) {
#ifdef ALTQ_DEBUG
		printf("extract_ports6: can't locate header! ip6=%p\n", ip6);
#endif
		return (0);
	}
	off = ((caddr_t)ip6 - m0->m_data) + sizeof(struct ip6_hdr);

	proto = ip6->ip6_nxt;
	do {
		while (off >= m0->m_len) {
			off -= m0->m_len;
			m0 = m0->m_next;
			if (m0 == NULL)
				return (0);
		}
		if (m0->m_len < off + 4)
			return (0);

		switch (proto) {
		case IPPROTO_TCP:
		case IPPROTO_UDP: {
			struct udphdr *udp;

			udp = (struct udphdr *)(mtod(m0, caddr_t) + off);
			fin6->fi6_sport = udp->uh_sport;
			fin6->fi6_dport = udp->uh_dport;
			fin6->fi6_proto = proto;
			}
			return (1);

		case IPPROTO_ESP:
			if (fin6->fi6_gpi == 0) {
				u_int32_t *gpi;

				gpi = (u_int32_t *)(mtod(m0, caddr_t) + off);
				fin6->fi6_gpi   = *gpi;
			}
			fin6->fi6_proto = proto;
			return (1);

		case IPPROTO_AH: {
			/* get next header and header length */
			struct _opt6 *opt6;

			opt6 = (struct _opt6 *)(mtod(m0, caddr_t) + off);
			if (fin6->fi6_gpi == 0 && m0->m_len >= off + 8)
				fin6->fi6_gpi = opt6->ah_spi;
			proto = opt6->opt6_nxt;
			off += 8 + (opt6->opt6_hlen * 4);
			/* go to the next header */
			break;
			}

		case IPPROTO_HOPOPTS:
		case IPPROTO_ROUTING:
		case IPPROTO_DSTOPTS: {
			/* get next header and header length */
			struct _opt6 *opt6;

			opt6 = (struct _opt6 *)(mtod(m0, caddr_t) + off);
			proto = opt6->opt6_nxt;
			off += (opt6->opt6_hlen + 1) * 8;
			/* go to the next header */
			break;
			}

		case IPPROTO_FRAGMENT:
			/* ipv6 fragmentation is not supported yet */
		default:
			fin6->fi6_proto = proto;
			return (0);
		}
	} while (1);
	/*NOTREACHED*/
}
#endif /* INET6 */

/*
 * altq common classifier
 */
int
acc_add_filter(classifier, filter, class, phandle)
	struct acc_classifier *classifier;
	struct flow_filter *filter;
	void	*class;
	u_long	*phandle;
{
	struct acc_filter *afp, *prev, *tmp;
	int	i, s;

#ifdef INET6
	if (filter->ff_flow.fi_family != AF_INET &&
	    filter->ff_flow.fi_family != AF_INET6)
		return (EINVAL);
#else
	if (filter->ff_flow.fi_family != AF_INET)
		return (EINVAL);
#endif

	afp = malloc(sizeof(struct acc_filter),
	       M_DEVBUF, M_WAITOK);
	if (afp == NULL)
		return (ENOMEM);
	bzero(afp, sizeof(struct acc_filter));

	afp->f_filter = *filter;
	afp->f_class = class;

	i = ACC_WILDCARD_INDEX;
	if (filter->ff_flow.fi_family == AF_INET) {
		struct flow_filter *filter4 = &afp->f_filter;

		/*
		 * if address is 0, it's a wildcard.  if address mask
		 * isn't set, use full mask.
		 */
		if (filter4->ff_flow.fi_dst.s_addr == 0)
			filter4->ff_mask.mask_dst.s_addr = 0;
		else if (filter4->ff_mask.mask_dst.s_addr == 0)
			filter4->ff_mask.mask_dst.s_addr = 0xffffffff;
		if (filter4->ff_flow.fi_src.s_addr == 0)
			filter4->ff_mask.mask_src.s_addr = 0;
		else if (filter4->ff_mask.mask_src.s_addr == 0)
			filter4->ff_mask.mask_src.s_addr = 0xffffffff;

		/* clear extra bits in addresses  */
		filter4->ff_flow.fi_dst.s_addr &=
		    filter4->ff_mask.mask_dst.s_addr;
		filter4->ff_flow.fi_src.s_addr &=
		    filter4->ff_mask.mask_src.s_addr;

		/*
		 * if dst address is a wildcard, use hash-entry
		 * ACC_WILDCARD_INDEX.
		 */
		if (filter4->ff_mask.mask_dst.s_addr != 0xffffffff)
			i = ACC_WILDCARD_INDEX;
		else
			i = ACC_GET_HASH_INDEX(filter4->ff_flow.fi_dst.s_addr);
	}
#ifdef INET6
	else if (filter->ff_flow.fi_family == AF_INET6) {
		struct flow_filter6 *filter6 =
			(struct flow_filter6 *)&afp->f_filter;
#ifndef IN6MASK0 /* taken from kame ipv6 */
#define	IN6MASK0	{{{ 0, 0, 0, 0 }}}
#define	IN6MASK128	{{{ 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff }}}
		const struct in6_addr in6mask0 = IN6MASK0;
		const struct in6_addr in6mask128 = IN6MASK128;
#endif

		if (IN6_IS_ADDR_UNSPECIFIED(&filter6->ff_flow6.fi6_dst))
			filter6->ff_mask6.mask6_dst = in6mask0;
		else if (IN6_IS_ADDR_UNSPECIFIED(&filter6->ff_mask6.mask6_dst))
			filter6->ff_mask6.mask6_dst = in6mask128;
		if (IN6_IS_ADDR_UNSPECIFIED(&filter6->ff_flow6.fi6_src))
			filter6->ff_mask6.mask6_src = in6mask0;
		else if (IN6_IS_ADDR_UNSPECIFIED(&filter6->ff_mask6.mask6_src))
			filter6->ff_mask6.mask6_src = in6mask128;

		/* clear extra bits in addresses  */
		for (i = 0; i < 16; i++)
			filter6->ff_flow6.fi6_dst.s6_addr[i] &=
			    filter6->ff_mask6.mask6_dst.s6_addr[i];
		for (i = 0; i < 16; i++)
			filter6->ff_flow6.fi6_src.s6_addr[i] &=
			    filter6->ff_mask6.mask6_src.s6_addr[i];

		if (filter6->ff_flow6.fi6_flowlabel == 0)
			i = ACC_WILDCARD_INDEX;
		else
			i = ACC_GET_HASH_INDEX(filter6->ff_flow6.fi6_flowlabel);
	}
#endif /* INET6 */

	afp->f_handle = get_filt_handle(classifier, i);

	/* update filter bitmask */
	afp->f_fbmask = filt2fibmask(filter);
	classifier->acc_fbmask |= afp->f_fbmask;

	/*
	 * add this filter to the filter list.
	 * filters are ordered from the highest rule number.
	 */
	s = splnet();
	prev = NULL;
	LIST_FOREACH(tmp, &classifier->acc_filters[i], f_chain) {
		if (tmp->f_filter.ff_ruleno > afp->f_filter.ff_ruleno)
			prev = tmp;
		else
			break;
	}
	if (prev == NULL)
		LIST_INSERT_HEAD(&classifier->acc_filters[i], afp, f_chain);
	else
		LIST_INSERT_AFTER(prev, afp, f_chain);
	splx(s);

	*phandle = afp->f_handle;
	return (0);
}

int
acc_delete_filter(classifier, handle)
	struct acc_classifier *classifier;
	u_long handle;
{
	struct acc_filter *afp;
	int	s;

	if ((afp = filth_to_filtp(classifier, handle)) == NULL)
		return (EINVAL);

	s = splnet();
	LIST_REMOVE(afp, f_chain);
	splx(s);

	free(afp, M_DEVBUF);

	/* todo: update filt_bmask */

	return (0);
}

/*
 * delete filters that reference the specified class.
 * if the 'all' flag is nonzero, delete all the filters.
 */
1463acc_discard_filters(classifier, class, all)
1464	struct acc_classifier *classifier;
1465	void	*class;
1466	int	all;
1467{
1468	struct acc_filter *afp;
1469	int	i, s;
1470
1471	s = splnet();
1472	for (i = 0; i < ACC_FILTER_TABLESIZE; i++) {
1473		do {
1474			LIST_FOREACH(afp, &classifier->acc_filters[i], f_chain)
1475				if (all || afp->f_class == class) {
1476					LIST_REMOVE(afp, f_chain);
1477					free(afp, M_DEVBUF);
1478					/* start again from the head */
1479					break;
1480				}
1481		} while (afp != NULL);
1482	}
1483	splx(s);
1484
1485	if (all)
1486		classifier->acc_fbmask = 0;
1487
1488	return (0);
1489}
1490
1491void *
1492acc_classify(clfier, m, af)
1493	void *clfier;
1494	struct mbuf *m;
1495	int af;
1496{
1497	struct acc_classifier *classifier;
1498	struct flowinfo flow;
1499	struct acc_filter *afp;
1500	int	i;
1501
1502	classifier = (struct acc_classifier *)clfier;
1503	altq_extractflow(m, af, &flow, classifier->acc_fbmask);
1504
1505	if (flow.fi_family == AF_INET) {
1506		struct flowinfo_in *fp = (struct flowinfo_in *)&flow;
1507
1508		if ((classifier->acc_fbmask & FIMB4_ALL) == FIMB4_TOS) {
1509			/* only tos is used */
1510			LIST_FOREACH(afp,
1511				 &classifier->acc_filters[ACC_WILDCARD_INDEX],
1512				 f_chain)
1513				if (apply_tosfilter4(afp->f_fbmask,
1514						     &afp->f_filter, fp))
1515					/* filter matched */
1516					return (afp->f_class);
1517		} else if ((classifier->acc_fbmask &
1518			(~(FIMB4_PROTO|FIMB4_SPORT|FIMB4_DPORT) & FIMB4_ALL))
1519		    == 0) {
1520			/* only proto and ports are used */
1521			LIST_FOREACH(afp,
1522				 &classifier->acc_filters[ACC_WILDCARD_INDEX],
1523				 f_chain)
1524				if (apply_ppfilter4(afp->f_fbmask,
1525						    &afp->f_filter, fp))
1526					/* filter matched */
1527					return (afp->f_class);
1528		} else {
1529			/* get the filter hash entry from its dest address */
1530			i = ACC_GET_HASH_INDEX(fp->fi_dst.s_addr);
1531			do {
1532				/*
1533				 * go through this loop twice.  first for dst
1534				 * hash, second for wildcards.
1535				 */
1536				LIST_FOREACH(afp, &classifier->acc_filters[i],
1537					     f_chain)
1538					if (apply_filter4(afp->f_fbmask,
1539							  &afp->f_filter, fp))
1540						/* filter matched */
1541						return (afp->f_class);
1542
1543				/*
1544				 * check again for filters with a dst addr
1545				 * wildcard.
1546				 * (daddr == 0 || dmask != 0xffffffff).
1547				 */
1548				if (i != ACC_WILDCARD_INDEX)
1549					i = ACC_WILDCARD_INDEX;
1550				else
1551					break;
1552			} while (1);
1553		}
1554	}
1555#ifdef INET6
1556	else if (flow.fi_family == AF_INET6) {
1557		struct flowinfo_in6 *fp6 = (struct flowinfo_in6 *)&flow;
1558
1559		/* get the filter hash entry from its flow ID */
1560		if (fp6->fi6_flowlabel != 0)
1561			i = ACC_GET_HASH_INDEX(fp6->fi6_flowlabel);
1562		else
			/* flowlabel can be zero */
			i = ACC_WILDCARD_INDEX;

		/* go through this loop twice.  first for flow hash, second
		   for wildcards. */
		do {
			LIST_FOREACH(afp, &classifier->acc_filters[i], f_chain)
				if (apply_filter6(afp->f_fbmask,
					(struct flow_filter6 *)&afp->f_filter,
					fp6))
					/* filter matched */
					return (afp->f_class);

			/*
			 * check again for filters with a wildcard.
			 */
			if (i != ACC_WILDCARD_INDEX)
				i = ACC_WILDCARD_INDEX;
			else
				break;
		} while (1);
	}
#endif /* INET6 */

	/* no filter matched */
	return (NULL);
}

static int
apply_filter4(fbmask, filt, pkt)
	u_int32_t	fbmask;
	struct flow_filter *filt;
	struct flowinfo_in *pkt;
{
	if (filt->ff_flow.fi_family != AF_INET)
		return (0);
	if ((fbmask & FIMB4_SPORT) && filt->ff_flow.fi_sport != pkt->fi_sport)
		return (0);
	if ((fbmask & FIMB4_DPORT) && filt->ff_flow.fi_dport != pkt->fi_dport)
		return (0);
	if ((fbmask & FIMB4_DADDR) &&
	    filt->ff_flow.fi_dst.s_addr !=
	    (pkt->fi_dst.s_addr & filt->ff_mask.mask_dst.s_addr))
		return (0);
	if ((fbmask & FIMB4_SADDR) &&
	    filt->ff_flow.fi_src.s_addr !=
	    (pkt->fi_src.s_addr & filt->ff_mask.mask_src.s_addr))
		return (0);
	if ((fbmask & FIMB4_PROTO) && filt->ff_flow.fi_proto != pkt->fi_proto)
		return (0);
	if ((fbmask & FIMB4_TOS) && filt->ff_flow.fi_tos !=
	    (pkt->fi_tos & filt->ff_mask.mask_tos))
		return (0);
	if ((fbmask & FIMB4_GPI) && filt->ff_flow.fi_gpi != (pkt->fi_gpi))
		return (0);
	/* match */
	return (1);
}

/*
 * filter matching function optimized for a common case that checks
 * only protocol and port numbers
 */
static int
apply_ppfilter4(fbmask, filt, pkt)
	u_int32_t	fbmask;
	struct flow_filter *filt;
	struct flowinfo_in *pkt;
{
	if (filt->ff_flow.fi_family != AF_INET)
		return (0);
	if ((fbmask & FIMB4_SPORT) && filt->ff_flow.fi_sport != pkt->fi_sport)
		return (0);
	if ((fbmask & FIMB4_DPORT) && filt->ff_flow.fi_dport != pkt->fi_dport)
		return (0);
	if ((fbmask & FIMB4_PROTO) && filt->ff_flow.fi_proto != pkt->fi_proto)
		return (0);
	/* match */
	return (1);
}

/*
 * filter matching function only for tos field.
 */
static int
apply_tosfilter4(fbmask, filt, pkt)
	u_int32_t	fbmask;
	struct flow_filter *filt;
	struct flowinfo_in *pkt;
{
	if (filt->ff_flow.fi_family != AF_INET)
		return (0);
	if ((fbmask & FIMB4_TOS) && filt->ff_flow.fi_tos !=
	    (pkt->fi_tos & filt->ff_mask.mask_tos))
		return (0);
	/* match */
	return (1);
}

#ifdef INET6
static int
apply_filter6(fbmask, filt, pkt)
	u_int32_t	fbmask;
	struct flow_filter6 *filt;
	struct flowinfo_in6 *pkt;
{
	int i;

	if (filt->ff_flow6.fi6_family != AF_INET6)
		return (0);
	if ((fbmask & FIMB6_FLABEL) &&
	    filt->ff_flow6.fi6_flowlabel != pkt->fi6_flowlabel)
		return (0);
	if ((fbmask & FIMB6_PROTO) &&
	    filt->ff_flow6.fi6_proto != pkt->fi6_proto)
		return (0);
	if ((fbmask & FIMB6_SPORT) &&
	    filt->ff_flow6.fi6_sport != pkt->fi6_sport)
		return (0);
	if ((fbmask & FIMB6_DPORT) &&
	    filt->ff_flow6.fi6_dport != pkt->fi6_dport)
		return (0);
	if (fbmask & FIMB6_SADDR) {
		for (i = 0; i < 4; i++)
			if (filt->ff_flow6.fi6_src.s6_addr32[i] !=
			    (pkt->fi6_src.s6_addr32[i] &
			     filt->ff_mask6.mask6_src.s6_addr32[i]))
				return (0);
	}
	if (fbmask & FIMB6_DADDR) {
		for (i = 0; i < 4; i++)
			if (filt->ff_flow6.fi6_dst.s6_addr32[i] !=
			    (pkt->fi6_dst.s6_addr32[i] &
			     filt->ff_mask6.mask6_dst.s6_addr32[i]))
				return (0);
	}
	if ((fbmask & FIMB6_TCLASS) &&
	    filt->ff_flow6.fi6_tclass !=
	    (pkt->fi6_tclass & filt->ff_mask6.mask6_tclass))
		return (0);
	if ((fbmask & FIMB6_GPI) &&
	    filt->ff_flow6.fi6_gpi != pkt->fi6_gpi)
		return (0);
	/* match */
	return (1);
}
#endif /* INET6 */

/*
 *  filter handle:
 *	bit 20-28: index to the filter hash table
 *	bit  0-19: unique id in the hash bucket.
 */
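/*
 * E.g. unique id 2 in hash bucket 5 yields the handle
 * (5 << 20) | 2 = 0x500002; ACC_GET_HINDEX() recovers the bucket
 * index from the upper bits.
 */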
static u_long
get_filt_handle(classifier, i)
	struct acc_classifier *classifier;
	int	i;
{
	static u_long handle_number = 1;
	u_long 	handle;
	struct acc_filter *afp;

	while (1) {
		handle = handle_number++ & 0x000fffff;

		if (LIST_EMPTY(&classifier->acc_filters[i]))
			break;

		LIST_FOREACH(afp, &classifier->acc_filters[i], f_chain)
			if ((afp->f_handle & 0x000fffff) == handle)
				break;
		if (afp == NULL)
			break;
		/* this handle is already used, try again */
	}

	return ((i << 20) | handle);
}

/* convert filter handle to filter pointer */
static struct acc_filter *
filth_to_filtp(classifier, handle)
	struct acc_classifier *classifier;
	u_long handle;
{
	struct acc_filter *afp;
	int	i;

	i = ACC_GET_HINDEX(handle);

	LIST_FOREACH(afp, &classifier->acc_filters[i], f_chain)
		if (afp->f_handle == handle)
			return (afp);

	return (NULL);
}

/* create flowinfo bitmask */
static u_int32_t
filt2fibmask(filt)
	struct flow_filter *filt;
{
	u_int32_t mask = 0;
#ifdef INET6
	struct flow_filter6 *filt6;
#endif

	switch (filt->ff_flow.fi_family) {
	case AF_INET:
		if (filt->ff_flow.fi_proto != 0)
			mask |= FIMB4_PROTO;
		if (filt->ff_flow.fi_tos != 0)
			mask |= FIMB4_TOS;
		if (filt->ff_flow.fi_dst.s_addr != 0)
			mask |= FIMB4_DADDR;
		if (filt->ff_flow.fi_src.s_addr != 0)
			mask |= FIMB4_SADDR;
		if (filt->ff_flow.fi_sport != 0)
			mask |= FIMB4_SPORT;
		if (filt->ff_flow.fi_dport != 0)
			mask |= FIMB4_DPORT;
		if (filt->ff_flow.fi_gpi != 0)
			mask |= FIMB4_GPI;
		break;
#ifdef INET6
	case AF_INET6:
		filt6 = (struct flow_filter6 *)filt;

		if (filt6->ff_flow6.fi6_proto != 0)
			mask |= FIMB6_PROTO;
		if (filt6->ff_flow6.fi6_tclass != 0)
			mask |= FIMB6_TCLASS;
		if (!IN6_IS_ADDR_UNSPECIFIED(&filt6->ff_flow6.fi6_dst))
			mask |= FIMB6_DADDR;
		if (!IN6_IS_ADDR_UNSPECIFIED(&filt6->ff_flow6.fi6_src))
			mask |= FIMB6_SADDR;
		if (filt6->ff_flow6.fi6_sport != 0)
			mask |= FIMB6_SPORT;
		if (filt6->ff_flow6.fi6_dport != 0)
			mask |= FIMB6_DPORT;
		if (filt6->ff_flow6.fi6_gpi != 0)
			mask |= FIMB6_GPI;
		if (filt6->ff_flow6.fi6_flowlabel != 0)
			mask |= FIMB6_FLABEL;
		break;
#endif /* INET6 */
	}
	return (mask);
}

/*
 * helper functions to handle IPv4 fragments.
 * currently only in-sequence fragments are handled.
 *	- fragment info is cached in an LRU list.
 *	- when a first fragment is found, cache its flow info.
 *	- when a non-first fragment is found, lookup the cache.
 */
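/*
 * Concretely: extract_ports4() calls ip4f_cache() when it sees a first
 * fragment (offset 0, IP_MF set) and ip4f_lookup() for later fragments;
 * the last fragment (IP_MF clear) releases the cache entry.  The cache
 * is a fixed IP4F_TABSIZE-entry list kept in LRU order by ip4f_alloc()
 * and ip4f_free() below.
 */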

struct ip4_frag {
    TAILQ_ENTRY(ip4_frag) ip4f_chain;
    char    ip4f_valid;
    u_short ip4f_id;
    struct flowinfo_in ip4f_info;
};

static TAILQ_HEAD(ip4f_list, ip4_frag) ip4f_list; /* IPv4 fragment cache */

#define	IP4F_TABSIZE		16	/* IPv4 fragment cache size */

static void
ip4f_cache(ip, fin)
	struct ip *ip;
	struct flowinfo_in *fin;
{
	struct ip4_frag *fp;

	if (TAILQ_EMPTY(&ip4f_list)) {
		/* first time call, allocate fragment cache entries. */
		if (ip4f_init() < 0)
			/* allocation failed! */
			return;
	}

	fp = ip4f_alloc();
	fp->ip4f_id = ip->ip_id;
	fp->ip4f_info.fi_proto = ip->ip_p;
	fp->ip4f_info.fi_src.s_addr = ip->ip_src.s_addr;
	fp->ip4f_info.fi_dst.s_addr = ip->ip_dst.s_addr;

	/* save port numbers */
	fp->ip4f_info.fi_sport = fin->fi_sport;
	fp->ip4f_info.fi_dport = fin->fi_dport;
	fp->ip4f_info.fi_gpi   = fin->fi_gpi;
}

static int
ip4f_lookup(ip, fin)
	struct ip *ip;
	struct flowinfo_in *fin;
{
	struct ip4_frag *fp;

	for (fp = TAILQ_FIRST(&ip4f_list); fp != NULL && fp->ip4f_valid;
	     fp = TAILQ_NEXT(fp, ip4f_chain))
		if (ip->ip_id == fp->ip4f_id &&
		    ip->ip_src.s_addr == fp->ip4f_info.fi_src.s_addr &&
		    ip->ip_dst.s_addr == fp->ip4f_info.fi_dst.s_addr &&
		    ip->ip_p == fp->ip4f_info.fi_proto) {
			/* found the matching entry */
			fin->fi_sport = fp->ip4f_info.fi_sport;
			fin->fi_dport = fp->ip4f_info.fi_dport;
			fin->fi_gpi   = fp->ip4f_info.fi_gpi;

			if ((ntohs(ip->ip_off) & IP_MF) == 0)
				/* this is the last fragment,
				   release the entry. */
				ip4f_free(fp);

			return (1);
		}

	/* no matching entry found */
	return (0);
}

static int
ip4f_init(void)
{
	struct ip4_frag *fp;
	int i;

	TAILQ_INIT(&ip4f_list);
	for (i = 0; i < IP4F_TABSIZE; i++) {
		fp = malloc(sizeof(struct ip4_frag),
		       M_DEVBUF, M_NOWAIT);
		if (fp == NULL) {
			printf("ip4f_init: can't alloc %dth entry!\n", i);
			if (i == 0)
				return (-1);
			return (0);
		}
		fp->ip4f_valid = 0;
		TAILQ_INSERT_TAIL(&ip4f_list, fp, ip4f_chain);
	}
	return (0);
}

static struct ip4_frag *
ip4f_alloc(void)
{
	struct ip4_frag *fp;

	/* reclaim an entry at the tail, put it at the head */
	fp = TAILQ_LAST(&ip4f_list, ip4f_list);
	TAILQ_REMOVE(&ip4f_list, fp, ip4f_chain);
	fp->ip4f_valid = 1;
	TAILQ_INSERT_HEAD(&ip4f_list, fp, ip4f_chain);
	return (fp);
}

static void
ip4f_free(fp)
	struct ip4_frag *fp;
{
	TAILQ_REMOVE(&ip4f_list, fp, ip4f_chain);
	fp->ip4f_valid = 0;
	TAILQ_INSERT_TAIL(&ip4f_list, fp, ip4f_chain);
}

#endif /* ALTQ3_CLFIER_COMPAT */