dn_sched_fq_pie.c revision 318885
1/*
2 * FQ_PIE - The FlowQueue-PIE scheduler/AQM
3 *
4 * $FreeBSD: stable/11/sys/netpfil/ipfw/dn_sched_fq_pie.c 318885 2017-05-25 17:22:13Z truckman $
5 *
6 * Copyright (C) 2016 Centre for Advanced Internet Architectures,
7 *  Swinburne University of Technology, Melbourne, Australia.
8 * Portions of this code were made possible in part by a gift from
9 *  The Comcast Innovation Fund.
10 * Implemented by Rasool Al-Saadi <ralsaadi@swin.edu.au>
11 *
12 * Redistribution and use in source and binary forms, with or without
13 * modification, are permitted provided that the following conditions
14 * are met:
15 * 1. Redistributions of source code must retain the above copyright
16 *    notice, this list of conditions and the following disclaimer.
17 * 2. Redistributions in binary form must reproduce the above copyright
18 *    notice, this list of conditions and the following disclaimer in the
19 *    documentation and/or other materials provided with the distribution.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31 * SUCH DAMAGE.
32 */
33
/* Important note:
 * As there is no official document for the FQ-PIE specification, we used
 * the FQ-CoDel algorithm with some modifications to implement FQ-PIE.
 * This FQ-PIE implementation is a beta version and has not been tested
 * extensively. Our FQ-PIE uses a stand-alone PIE AQM per sub-queue. By
 * default, timestamps are used to calculate queue delay instead of the
 * departure rate estimation method. Although departure rate estimation is
 * available as a testing option, the results could be incorrect. Moreover,
 * the option to turn PIE on and off is available, but it does not work
 * properly in this version.
 */
44
45
46#ifdef _KERNEL
47#include <sys/malloc.h>
48#include <sys/socket.h>
49#include <sys/kernel.h>
50#include <sys/mbuf.h>
51#include <sys/lock.h>
52#include <sys/module.h>
53#include <sys/mutex.h>
54#include <net/if.h>	/* IFNAMSIZ */
55#include <netinet/in.h>
56#include <netinet/ip_var.h>		/* ipfw_rule_ref */
57#include <netinet/ip_fw.h>	/* flow_id */
58#include <netinet/ip_dummynet.h>
59
60#include <sys/proc.h>
61#include <sys/rwlock.h>
62
63#include <netpfil/ipfw/ip_fw_private.h>
64#include <sys/sysctl.h>
65#include <netinet/ip.h>
66#include <netinet/ip6.h>
67#include <netinet/ip_icmp.h>
68#include <netinet/tcp.h>
69#include <netinet/udp.h>
70#include <sys/queue.h>
71#include <sys/hash.h>
72
73#include <netpfil/ipfw/dn_heap.h>
74#include <netpfil/ipfw/ip_dn_private.h>
75
76#include <netpfil/ipfw/dn_aqm.h>
77#include <netpfil/ipfw/dn_aqm_pie.h>
78#include <netpfil/ipfw/dn_sched.h>
79
80#else
81#include <dn_test.h>
82#endif
83
/*
 * Numeric identifier of the FQ-PIE scheduler.
 * NOTE(review): presumably the dummynet scheduler type number; confirm
 * against the fq_pie_desc definition.
 */
#define DN_SCHED_FQ_PIE 7

/* Head type for the tail queues that chain flows (sub-queues) together. */
STAILQ_HEAD(fq_pie_list, fq_pie_flow) ;
88
89/* FQ_PIE parameters including PIE */
90struct dn_sch_fq_pie_parms {
91	struct dn_aqm_pie_parms	pcfg;	/* PIE configuration Parameters */
92	/* FQ_PIE Parameters */
93	uint32_t flows_cnt;	/* number of flows */
94	uint32_t limit;	/* hard limit of FQ_PIE queue size*/
95	uint32_t quantum;
96};
97
/* Statistics kept for each flow (sub-queue); see fq_update_stats(). */
struct flow_stats {
	/* cumulative counters, bumped on every enqueue attempt */
	uint64_t	tot_pkts;
	uint64_t	tot_bytes;

	/* current occupancy */
	uint32_t	length;		/* queue length, in packets */
	uint32_t	len_bytes;	/* queue length, in bytes */

	uint32_t	drops;		/* packets dropped from this flow */
};
106
107/* A flow of packets (sub-queue)*/
108struct fq_pie_flow {
109	struct mq	mq;	/* list of packets */
110	struct flow_stats stats;	/* statistics */
111	int deficit;
112	int active;		/* 1: flow is active (in a list) */
113	struct pie_status pst;	/* pie status variables */
114	struct fq_pie_si_extra *psi_extra;
115	STAILQ_ENTRY(fq_pie_flow) flowchain;
116};
117
118/* extra fq_pie scheduler configurations */
119struct fq_pie_schk {
120	struct dn_sch_fq_pie_parms cfg;
121};
122
123
/*
 * Extra state variables of an fq_pie scheduler instance.
 *
 * This structure is kept separate from struct fq_pie_si so that the number
 * of active sub-queues and the flows array pointer are preserved even after
 * the scheduler instance itself has been destroyed.  Preserving these
 * variables lets fqpie_callout_cleanup() free the allocated memory
 * independently of fq_pie_free_sched().
 */
struct fq_pie_si_extra {
	uint32_t		nr_active_q;	/* number of active queues */
	struct fq_pie_flow	*flows;		/* array of flows (queues) */
};
135
136/* fq_pie scheduler instance */
137struct fq_pie_si {
138	struct dn_sch_inst _si;	/* standard scheduler instance. SHOULD BE FIRST */
139	struct dn_queue main_q; /* main queue is after si directly */
140	uint32_t perturbation; 	/* random value */
141	struct fq_pie_list newflows;	/* list of new queues */
142	struct fq_pie_list oldflows;	/* list of old queues */
143	struct fq_pie_si_extra *si_extra; /* extra state vars*/
144};
145
146
147static struct dn_alg fq_pie_desc;
148
149/*  Default FQ-PIE parameters including PIE */
150/*  PIE defaults
151 * target=15ms, max_burst=150ms, max_ecnth=0.1,
152 * alpha=0.125, beta=1.25, tupdate=15ms
153 * FQ-
154 * flows=1024, limit=10240, quantum =1514
155 */
156struct dn_sch_fq_pie_parms
157 fq_pie_sysctl = {{15000 * AQM_TIME_1US, 15000 * AQM_TIME_1US,
158	150000 * AQM_TIME_1US, PIE_SCALE * 0.1, PIE_SCALE * 0.125,
159	PIE_SCALE * 1.25,	PIE_CAPDROP_ENABLED | PIE_DERAND_ENABLED},
160	1024, 10240, 1514};
161
162static int
163fqpie_sysctl_alpha_beta_handler(SYSCTL_HANDLER_ARGS)
164{
165	int error;
166	long  value;
167
168	if (!strcmp(oidp->oid_name,"alpha"))
169		value = fq_pie_sysctl.pcfg.alpha;
170	else
171		value = fq_pie_sysctl.pcfg.beta;
172
173	value = value * 1000 / PIE_SCALE;
174	error = sysctl_handle_long(oidp, &value, 0, req);
175	if (error != 0 || req->newptr == NULL)
176		return (error);
177	if (value < 1 || value > 7 * PIE_SCALE)
178		return (EINVAL);
179	value = (value * PIE_SCALE) / 1000;
180	if (!strcmp(oidp->oid_name,"alpha"))
181			fq_pie_sysctl.pcfg.alpha = value;
182	else
183		fq_pie_sysctl.pcfg.beta = value;
184	return (0);
185}
186
187static int
188fqpie_sysctl_target_tupdate_maxb_handler(SYSCTL_HANDLER_ARGS)
189{
190	int error;
191	long  value;
192
193	if (!strcmp(oidp->oid_name,"target"))
194		value = fq_pie_sysctl.pcfg.qdelay_ref;
195	else if (!strcmp(oidp->oid_name,"tupdate"))
196		value = fq_pie_sysctl.pcfg.tupdate;
197	else
198		value = fq_pie_sysctl.pcfg.max_burst;
199
200	value = value / AQM_TIME_1US;
201	error = sysctl_handle_long(oidp, &value, 0, req);
202	if (error != 0 || req->newptr == NULL)
203		return (error);
204	if (value < 1 || value > 10 * AQM_TIME_1S)
205		return (EINVAL);
206	value = value * AQM_TIME_1US;
207
208	if (!strcmp(oidp->oid_name,"target"))
209		fq_pie_sysctl.pcfg.qdelay_ref  = value;
210	else if (!strcmp(oidp->oid_name,"tupdate"))
211		fq_pie_sysctl.pcfg.tupdate  = value;
212	else
213		fq_pie_sysctl.pcfg.max_burst = value;
214	return (0);
215}
216
217static int
218fqpie_sysctl_max_ecnth_handler(SYSCTL_HANDLER_ARGS)
219{
220	int error;
221	long  value;
222
223	value = fq_pie_sysctl.pcfg.max_ecnth;
224	value = value * 1000 / PIE_SCALE;
225	error = sysctl_handle_long(oidp, &value, 0, req);
226	if (error != 0 || req->newptr == NULL)
227		return (error);
228	if (value < 1 || value > PIE_SCALE)
229		return (EINVAL);
230	value = (value * PIE_SCALE) / 1000;
231	fq_pie_sysctl.pcfg.max_ecnth = value;
232	return (0);
233}
234
235/* define FQ- PIE sysctl variables */
236SYSBEGIN(f4)
237SYSCTL_DECL(_net_inet);
238SYSCTL_DECL(_net_inet_ip);
239SYSCTL_DECL(_net_inet_ip_dummynet);
240static SYSCTL_NODE(_net_inet_ip_dummynet, OID_AUTO, fqpie,
241	CTLFLAG_RW, 0, "FQ_PIE");
242
243#ifdef SYSCTL_NODE
244
245SYSCTL_PROC(_net_inet_ip_dummynet_fqpie, OID_AUTO, target,
246	CTLTYPE_LONG | CTLFLAG_RW, NULL, 0,
247	fqpie_sysctl_target_tupdate_maxb_handler, "L",
248	"queue target in microsecond");
249
250SYSCTL_PROC(_net_inet_ip_dummynet_fqpie, OID_AUTO, tupdate,
251	CTLTYPE_LONG | CTLFLAG_RW, NULL, 0,
252	fqpie_sysctl_target_tupdate_maxb_handler, "L",
253	"the frequency of drop probability calculation in microsecond");
254
255SYSCTL_PROC(_net_inet_ip_dummynet_fqpie, OID_AUTO, max_burst,
256	CTLTYPE_LONG | CTLFLAG_RW, NULL, 0,
257	fqpie_sysctl_target_tupdate_maxb_handler, "L",
258	"Burst allowance interval in microsecond");
259
260SYSCTL_PROC(_net_inet_ip_dummynet_fqpie, OID_AUTO, max_ecnth,
261	CTLTYPE_LONG | CTLFLAG_RW, NULL, 0,
262	fqpie_sysctl_max_ecnth_handler, "L",
263	"ECN safeguard threshold scaled by 1000");
264
265SYSCTL_PROC(_net_inet_ip_dummynet_fqpie, OID_AUTO, alpha,
266	CTLTYPE_LONG | CTLFLAG_RW, NULL, 0,
267	fqpie_sysctl_alpha_beta_handler, "L", "PIE alpha scaled by 1000");
268
269SYSCTL_PROC(_net_inet_ip_dummynet_fqpie, OID_AUTO, beta,
270	CTLTYPE_LONG | CTLFLAG_RW, NULL, 0,
271	fqpie_sysctl_alpha_beta_handler, "L", "beta scaled by 1000");
272
273SYSCTL_UINT(_net_inet_ip_dummynet_fqpie, OID_AUTO, quantum,
274	CTLFLAG_RW, &fq_pie_sysctl.quantum, 1514, "quantum for FQ_PIE");
275SYSCTL_UINT(_net_inet_ip_dummynet_fqpie, OID_AUTO, flows,
276	CTLFLAG_RW, &fq_pie_sysctl.flows_cnt, 1024, "Number of queues for FQ_PIE");
277SYSCTL_UINT(_net_inet_ip_dummynet_fqpie, OID_AUTO, limit,
278	CTLFLAG_RW, &fq_pie_sysctl.limit, 10240, "limit for FQ_PIE");
279#endif
280
281/* Helper function to update queue&main-queue and scheduler statistics.
282 * negative len & drop -> drop
283 * negative len -> dequeue
284 * positive len -> enqueue
285 * positive len + drop -> drop during enqueue
286 */
287__inline static void
288fq_update_stats(struct fq_pie_flow *q, struct fq_pie_si *si, int len,
289	int drop)
290{
291	int inc = 0;
292
293	if (len < 0)
294		inc = -1;
295	else if (len > 0)
296		inc = 1;
297
298	if (drop) {
299		si->main_q.ni.drops ++;
300		q->stats.drops ++;
301		si->_si.ni.drops ++;
302		io_pkt_drop ++;
303	}
304
305	if (!drop || (drop && len < 0)) {
306		/* Update stats for the main queue */
307		si->main_q.ni.length += inc;
308		si->main_q.ni.len_bytes += len;
309
310		/*update sub-queue stats */
311		q->stats.length += inc;
312		q->stats.len_bytes += len;
313
314		/*update scheduler instance stats */
315		si->_si.ni.length += inc;
316		si->_si.ni.len_bytes += len;
317	}
318
319	if (inc > 0) {
320		si->main_q.ni.tot_bytes += len;
321		si->main_q.ni.tot_pkts ++;
322
323		q->stats.tot_bytes +=len;
324		q->stats.tot_pkts++;
325
326		si->_si.ni.tot_bytes +=len;
327		si->_si.ni.tot_pkts ++;
328	}
329
330}
331
332/*
333 * Extract a packet from the head of sub-queue 'q'
334 * Return a packet or NULL if the queue is empty.
335 * If getts is set, also extract packet's timestamp from mtag.
336 */
337__inline static struct mbuf *
338fq_pie_extract_head(struct fq_pie_flow *q, aqm_time_t *pkt_ts,
339	struct fq_pie_si *si, int getts)
340{
341	struct mbuf *m = q->mq.head;
342
343	if (m == NULL)
344		return m;
345	q->mq.head = m->m_nextpkt;
346
347	fq_update_stats(q, si, -m->m_pkthdr.len, 0);
348
349	if (si->main_q.ni.length == 0) /* queue is now idle */
350			si->main_q.q_time = dn_cfg.curr_time;
351
352	if (getts) {
353		/* extract packet timestamp*/
354		struct m_tag *mtag;
355		mtag = m_tag_locate(m, MTAG_ABI_COMPAT, DN_AQM_MTAG_TS, NULL);
356		if (mtag == NULL){
357			D("PIE timestamp mtag not found!");
358			*pkt_ts = 0;
359		} else {
360			*pkt_ts = *(aqm_time_t *)(mtag + 1);
361			m_tag_delete(m,mtag);
362		}
363	}
364	return m;
365}
366
367/*
368 * Callout function for drop probability calculation
369 * This function is called over tupdate ms and takes pointer of FQ-PIE
370 * flow as an argument
371  */
372static void
373fq_calculate_drop_prob(void *x)
374{
375	struct fq_pie_flow *q = (struct fq_pie_flow *) x;
376	struct pie_status *pst = &q->pst;
377	struct dn_aqm_pie_parms *pprms;
378	int64_t p, prob, oldprob;
379	aqm_time_t now;
380	int p_isneg;
381
382	now = AQM_UNOW;
383	pprms = pst->parms;
384	prob = pst->drop_prob;
385
386	/* calculate current qdelay */
387	if (pprms->flags & PIE_DEPRATEEST_ENABLED) {
388		pst->current_qdelay = ((uint64_t)q->stats.len_bytes  * pst->avg_dq_time)
389			>> PIE_DQ_THRESHOLD_BITS;
390	}
391
392	/* calculate drop probability */
393	p = (int64_t)pprms->alpha *
394		((int64_t)pst->current_qdelay - (int64_t)pprms->qdelay_ref);
395	p +=(int64_t) pprms->beta *
396		((int64_t)pst->current_qdelay - (int64_t)pst->qdelay_old);
397
398	/* take absolute value so right shift result is well defined */
399	p_isneg = p < 0;
400	if (p_isneg) {
401		p = -p;
402	}
403
404	/* We PIE_MAX_PROB shift by 12-bits to increase the division precision  */
405	p *= (PIE_MAX_PROB << 12) / AQM_TIME_1S;
406
407	/* auto-tune drop probability */
408	if (prob < (PIE_MAX_PROB / 1000000)) /* 0.000001 */
409		p >>= 11 + PIE_FIX_POINT_BITS + 12;
410	else if (prob < (PIE_MAX_PROB / 100000)) /* 0.00001 */
411		p >>= 9 + PIE_FIX_POINT_BITS + 12;
412	else if (prob < (PIE_MAX_PROB / 10000)) /* 0.0001 */
413		p >>= 7 + PIE_FIX_POINT_BITS + 12;
414	else if (prob < (PIE_MAX_PROB / 1000)) /* 0.001 */
415		p >>= 5 + PIE_FIX_POINT_BITS + 12;
416	else if (prob < (PIE_MAX_PROB / 100)) /* 0.01 */
417		p >>= 3 + PIE_FIX_POINT_BITS + 12;
418	else if (prob < (PIE_MAX_PROB / 10)) /* 0.1 */
419		p >>= 1 + PIE_FIX_POINT_BITS + 12;
420	else
421		p >>= PIE_FIX_POINT_BITS + 12;
422
423	oldprob = prob;
424
425	if (p_isneg) {
426		prob = prob - p;
427
428		/* check for multiplication underflow */
429		if (prob > oldprob) {
430			prob= 0;
431			D("underflow");
432		}
433	} else {
434		/* Cap Drop adjustment */
435		if ((pprms->flags & PIE_CAPDROP_ENABLED) &&
436		    prob >= PIE_MAX_PROB / 10 &&
437		    p > PIE_MAX_PROB / 50 ) {
438			p = PIE_MAX_PROB / 50;
439		}
440
441		prob = prob + p;
442
443		/* check for multiplication overflow */
444		if (prob<oldprob) {
445			D("overflow");
446			prob= PIE_MAX_PROB;
447		}
448	}
449
450	/*
451	 * decay the drop probability exponentially
452	 * and restrict it to range 0 to PIE_MAX_PROB
453	 */
454	if (prob < 0) {
455		prob = 0;
456	} else {
457		if (pst->current_qdelay == 0 && pst->qdelay_old == 0) {
458			/* 0.98 ~= 1- 1/64 */
459			prob = prob - (prob >> 6);
460		}
461
462		if (prob > PIE_MAX_PROB) {
463			prob = PIE_MAX_PROB;
464		}
465	}
466
467	pst->drop_prob = prob;
468
469	/* store current delay value */
470	pst->qdelay_old = pst->current_qdelay;
471
472	/* update burst allowance */
473	if ((pst->sflags & PIE_ACTIVE) && pst->burst_allowance) {
474		if (pst->burst_allowance > pprms->tupdate)
475			pst->burst_allowance -= pprms->tupdate;
476		else
477			pst->burst_allowance = 0;
478	}
479
480	if (pst->sflags & PIE_ACTIVE)
481	callout_reset_sbt(&pst->aqm_pie_callout,
482		(uint64_t)pprms->tupdate * SBT_1US,
483		0, fq_calculate_drop_prob, q, 0);
484
485	mtx_unlock(&pst->lock_mtx);
486}
487
488/*
489 * Reset PIE variables & activate the queue
490 */
491__inline static void
492fq_activate_pie(struct fq_pie_flow *q)
493{
494	struct pie_status *pst = &q->pst;
495	struct dn_aqm_pie_parms *pprms;
496
497	mtx_lock(&pst->lock_mtx);
498	pprms = pst->parms;
499
500	pprms = pst->parms;
501	pst->drop_prob = 0;
502	pst->qdelay_old = 0;
503	pst->burst_allowance = pprms->max_burst;
504	pst->accu_prob = 0;
505	pst->dq_count = 0;
506	pst->avg_dq_time = 0;
507	pst->sflags = PIE_INMEASUREMENT | PIE_ACTIVE;
508	pst->measurement_start = AQM_UNOW;
509
510	callout_reset_sbt(&pst->aqm_pie_callout,
511		(uint64_t)pprms->tupdate * SBT_1US,
512		0, fq_calculate_drop_prob, q, 0);
513
514	mtx_unlock(&pst->lock_mtx);
515}
516
517
518 /*
519  * Deactivate PIE and stop probe update callout
520  */
521__inline static void
522fq_deactivate_pie(struct pie_status *pst)
523{
524	mtx_lock(&pst->lock_mtx);
525	pst->sflags &= ~(PIE_ACTIVE | PIE_INMEASUREMENT);
526	callout_stop(&pst->aqm_pie_callout);
527	//D("PIE Deactivated");
528	mtx_unlock(&pst->lock_mtx);
529}
530
531 /*
532  * Initialize PIE for sub-queue 'q'
533  */
534static int
535pie_init(struct fq_pie_flow *q, struct fq_pie_schk *fqpie_schk)
536{
537	struct pie_status *pst=&q->pst;
538	struct dn_aqm_pie_parms *pprms = pst->parms;
539
540	int err = 0;
541	if (!pprms){
542		D("AQM_PIE is not configured");
543		err = EINVAL;
544	} else {
545		q->psi_extra->nr_active_q++;
546
547		/* For speed optimization, we caculate 1/3 queue size once here */
548		// XXX limit divided by number of queues divided by 3 ???
549		pst->one_third_q_size = (fqpie_schk->cfg.limit /
550			fqpie_schk->cfg.flows_cnt) / 3;
551
552		mtx_init(&pst->lock_mtx, "mtx_pie", NULL, MTX_DEF);
553		callout_init_mtx(&pst->aqm_pie_callout, &pst->lock_mtx,
554			CALLOUT_RETURNUNLOCKED);
555	}
556
557	return err;
558}
559
560/*
561 * callout function to destroy PIE lock, and free fq_pie flows and fq_pie si
562 * extra memory when number of active sub-queues reaches zero.
563 * 'x' is a fq_pie_flow to be destroyed
564 */
565static void
566fqpie_callout_cleanup(void *x)
567{
568	struct fq_pie_flow *q = x;
569	struct pie_status *pst = &q->pst;
570	struct fq_pie_si_extra *psi_extra;
571
572	mtx_unlock(&pst->lock_mtx);
573	mtx_destroy(&pst->lock_mtx);
574	psi_extra = q->psi_extra;
575
576	DN_BH_WLOCK();
577	psi_extra->nr_active_q--;
578
579	/* when all sub-queues are destroyed, free flows fq_pie extra vars memory */
580	if (!psi_extra->nr_active_q) {
581		free(psi_extra->flows, M_DUMMYNET);
582		free(psi_extra, M_DUMMYNET);
583		fq_pie_desc.ref_count--;
584	}
585	DN_BH_WUNLOCK();
586}
587
588/*
589 * Clean up PIE status for sub-queue 'q'
590 * Stop callout timer and destroy mtx using fqpie_callout_cleanup() callout.
591 */
592static int
593pie_cleanup(struct fq_pie_flow *q)
594{
595	struct pie_status *pst  = &q->pst;
596
597	mtx_lock(&pst->lock_mtx);
598	callout_reset_sbt(&pst->aqm_pie_callout,
599		SBT_1US, 0, fqpie_callout_cleanup, q, 0);
600	mtx_unlock(&pst->lock_mtx);
601	return 0;
602}
603
604/*
605 * Dequeue and return a pcaket from sub-queue 'q' or NULL if 'q' is empty.
606 * Also, caculate depature time or queue delay using timestamp
607 */
608 static struct mbuf *
609pie_dequeue(struct fq_pie_flow *q, struct fq_pie_si *si)
610{
611	struct mbuf *m;
612	struct dn_aqm_pie_parms *pprms;
613	struct pie_status *pst;
614	aqm_time_t now;
615	aqm_time_t pkt_ts, dq_time;
616	int32_t w;
617
618	pst  = &q->pst;
619	pprms = q->pst.parms;
620
621	/*we extarct packet ts only when Departure Rate Estimation dis not used*/
622	m = fq_pie_extract_head(q, &pkt_ts, si,
623		!(pprms->flags & PIE_DEPRATEEST_ENABLED));
624
625	if (!m || !(pst->sflags & PIE_ACTIVE))
626		return m;
627
628	now = AQM_UNOW;
629	if (pprms->flags & PIE_DEPRATEEST_ENABLED) {
630		/* calculate average depature time */
631		if(pst->sflags & PIE_INMEASUREMENT) {
632			pst->dq_count += m->m_pkthdr.len;
633
634			if (pst->dq_count >= PIE_DQ_THRESHOLD) {
635				dq_time = now - pst->measurement_start;
636
637				/*
638				 * if we don't have old avg dq_time i.e PIE is (re)initialized,
639				 * don't use weight to calculate new avg_dq_time
640				 */
641				if(pst->avg_dq_time == 0)
642					pst->avg_dq_time = dq_time;
643				else {
644					/*
645					 * weight = PIE_DQ_THRESHOLD/2^6, but we scaled
646					 * weight by 2^8. Thus, scaled
647					 * weight = PIE_DQ_THRESHOLD /2^8
648					 * */
649					w = PIE_DQ_THRESHOLD >> 8;
650					pst->avg_dq_time = (dq_time* w
651						+ (pst->avg_dq_time * ((1L << 8) - w))) >> 8;
652					pst->sflags &= ~PIE_INMEASUREMENT;
653				}
654			}
655		}
656
657		/*
658		 * Start new measurment cycle when the queue has
659		 *  PIE_DQ_THRESHOLD worth of bytes.
660		 */
661		if(!(pst->sflags & PIE_INMEASUREMENT) &&
662			q->stats.len_bytes >= PIE_DQ_THRESHOLD) {
663			pst->sflags |= PIE_INMEASUREMENT;
664			pst->measurement_start = now;
665			pst->dq_count = 0;
666		}
667	}
668	/* Optionally, use packet timestamp to estimate queue delay */
669	else
670		pst->current_qdelay = now - pkt_ts;
671
672	return m;
673}
674
675
676 /*
677 * Enqueue a packet in q, subject to space and FQ-PIE queue management policy
678 * (whose parameters are in q->fs).
679 * Update stats for the queue and the scheduler.
680 * Return 0 on success, 1 on drop. The packet is consumed anyways.
681 */
682static int
683pie_enqueue(struct fq_pie_flow *q, struct mbuf* m, struct fq_pie_si *si)
684{
685	uint64_t len;
686	struct pie_status *pst;
687	struct dn_aqm_pie_parms *pprms;
688	int t;
689
690	len = m->m_pkthdr.len;
691	pst  = &q->pst;
692	pprms = pst->parms;
693	t = ENQUE;
694
695	/* drop/mark the packet when PIE is active and burst time elapsed */
696	if (pst->sflags & PIE_ACTIVE && pst->burst_allowance == 0
697		&& drop_early(pst, q->stats.len_bytes) == DROP) {
698			/*
699			 * if drop_prob over ECN threshold, drop the packet
700			 * otherwise mark and enqueue it.
701			 */
702			if (pprms->flags & PIE_ECN_ENABLED && pst->drop_prob <
703				(pprms->max_ecnth << (PIE_PROB_BITS - PIE_FIX_POINT_BITS))
704				&& ecn_mark(m))
705				t = ENQUE;
706			else
707				t = DROP;
708		}
709
710	/* Turn PIE on when 1/3 of the queue is full */
711	if (!(pst->sflags & PIE_ACTIVE) && q->stats.len_bytes >=
712		pst->one_third_q_size) {
713		fq_activate_pie(q);
714	}
715
716	/*  reset burst tolerance and optinally turn PIE off*/
717	if (pst->drop_prob == 0 && pst->current_qdelay < (pprms->qdelay_ref >> 1)
718		&& pst->qdelay_old < (pprms->qdelay_ref >> 1)) {
719
720			pst->burst_allowance = pprms->max_burst;
721		if (pprms->flags & PIE_ON_OFF_MODE_ENABLED && q->stats.len_bytes<=0)
722			fq_deactivate_pie(pst);
723	}
724
725	/* Use timestamp if Departure Rate Estimation mode is disabled */
726	if (t != DROP && !(pprms->flags & PIE_DEPRATEEST_ENABLED)) {
727		/* Add TS to mbuf as a TAG */
728		struct m_tag *mtag;
729		mtag = m_tag_locate(m, MTAG_ABI_COMPAT, DN_AQM_MTAG_TS, NULL);
730		if (mtag == NULL)
731			mtag = m_tag_alloc(MTAG_ABI_COMPAT, DN_AQM_MTAG_TS,
732				sizeof(aqm_time_t), M_NOWAIT);
733		if (mtag == NULL) {
734			m_freem(m);
735			t = DROP;
736		}
737		*(aqm_time_t *)(mtag + 1) = AQM_UNOW;
738		m_tag_prepend(m, mtag);
739	}
740
741	if (t != DROP) {
742		mq_append(&q->mq, m);
743		fq_update_stats(q, si, len, 0);
744		return 0;
745	} else {
746		fq_update_stats(q, si, len, 1);
747		pst->accu_prob = 0;
748		FREE_PKT(m);
749		return 1;
750	}
751
752	return 0;
753}
754
755/* Drop a packet form the head of FQ-PIE sub-queue */
756static void
757pie_drop_head(struct fq_pie_flow *q, struct fq_pie_si *si)
758{
759	struct mbuf *m = q->mq.head;
760
761	if (m == NULL)
762		return;
763	q->mq.head = m->m_nextpkt;
764
765	fq_update_stats(q, si, -m->m_pkthdr.len, 1);
766
767	if (si->main_q.ni.length == 0) /* queue is now idle */
768			si->main_q.q_time = dn_cfg.curr_time;
769	/* reset accu_prob after packet drop */
770	q->pst.accu_prob = 0;
771
772	FREE_PKT(m);
773}
774
775/*
776 * Classify a packet to queue number using Jenkins hash function.
777 * Return: queue number
778 * the input of the hash are protocol no, perturbation, src IP, dst IP,
779 * src port, dst port,
780 */
781static inline int
782fq_pie_classify_flow(struct mbuf *m, uint16_t fcount, struct fq_pie_si *si)
783{
784	struct ip *ip;
785	struct tcphdr *th;
786	struct udphdr *uh;
787	uint8_t tuple[41];
788	uint16_t hash=0;
789
790//#ifdef INET6
791	struct ip6_hdr *ip6;
792	int isip6;
793	isip6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0;
794
795	if(isip6) {
796		ip6 = mtod(m, struct ip6_hdr *);
797		*((uint8_t *) &tuple[0]) = ip6->ip6_nxt;
798		*((uint32_t *) &tuple[1]) = si->perturbation;
799		memcpy(&tuple[5], ip6->ip6_src.s6_addr, 16);
800		memcpy(&tuple[21], ip6->ip6_dst.s6_addr, 16);
801
802		switch (ip6->ip6_nxt) {
803		case IPPROTO_TCP:
804			th = (struct tcphdr *)(ip6 + 1);
805			*((uint16_t *) &tuple[37]) = th->th_dport;
806			*((uint16_t *) &tuple[39]) = th->th_sport;
807			break;
808
809		case IPPROTO_UDP:
810			uh = (struct udphdr *)(ip6 + 1);
811			*((uint16_t *) &tuple[37]) = uh->uh_dport;
812			*((uint16_t *) &tuple[39]) = uh->uh_sport;
813			break;
814		default:
815			memset(&tuple[37], 0, 4);
816		}
817
818		hash = jenkins_hash(tuple, 41, HASHINIT) %  fcount;
819		return hash;
820	}
821//#endif
822
823	/* IPv4 */
824	ip = mtod(m, struct ip *);
825	*((uint8_t *) &tuple[0]) = ip->ip_p;
826	*((uint32_t *) &tuple[1]) = si->perturbation;
827	*((uint32_t *) &tuple[5]) = ip->ip_src.s_addr;
828	*((uint32_t *) &tuple[9]) = ip->ip_dst.s_addr;
829
830	switch (ip->ip_p) {
831		case IPPROTO_TCP:
832			th = (struct tcphdr *)(ip + 1);
833			*((uint16_t *) &tuple[13]) = th->th_dport;
834			*((uint16_t *) &tuple[15]) = th->th_sport;
835			break;
836
837		case IPPROTO_UDP:
838			uh = (struct udphdr *)(ip + 1);
839			*((uint16_t *) &tuple[13]) = uh->uh_dport;
840			*((uint16_t *) &tuple[15]) = uh->uh_sport;
841			break;
842		default:
843			memset(&tuple[13], 0, 4);
844	}
845	hash = jenkins_hash(tuple, 17, HASHINIT) % fcount;
846
847	return hash;
848}
849
850/*
851 * Enqueue a packet into an appropriate queue according to
852 * FQ-CoDe; algorithm.
853 */
854static int
855fq_pie_enqueue(struct dn_sch_inst *_si, struct dn_queue *_q,
856	struct mbuf *m)
857{
858	struct fq_pie_si *si;
859	struct fq_pie_schk *schk;
860	struct dn_sch_fq_pie_parms *param;
861	struct dn_queue *mainq;
862	struct fq_pie_flow *flows;
863	int idx, drop, i, maxidx;
864
865	mainq = (struct dn_queue *)(_si + 1);
866	si = (struct fq_pie_si *)_si;
867	flows = si->si_extra->flows;
868	schk = (struct fq_pie_schk *)(si->_si.sched+1);
869	param = &schk->cfg;
870
871	 /* classify a packet to queue number*/
872	idx = fq_pie_classify_flow(m, param->flows_cnt, si);
873
874	/* enqueue packet into appropriate queue using PIE AQM.
875	 * Note: 'pie_enqueue' function returns 1 only when it unable to
876	 * add timestamp to packet (no limit check)*/
877	drop = pie_enqueue(&flows[idx], m, si);
878
879	/* pie unable to timestamp a packet */
880	if (drop)
881		return 1;
882
883	/* If the flow (sub-queue) is not active ,then add it to tail of
884	 * new flows list, initialize and activate it.
885	 */
886	if (!flows[idx].active) {
887		STAILQ_INSERT_TAIL(&si->newflows, &flows[idx], flowchain);
888		flows[idx].deficit = param->quantum;
889		fq_activate_pie(&flows[idx]);
890		flows[idx].active = 1;
891	}
892
893	/* check the limit for all queues and remove a packet from the
894	 * largest one
895	 */
896	if (mainq->ni.length > schk->cfg.limit) {
897		/* find first active flow */
898		for (maxidx = 0; maxidx < schk->cfg.flows_cnt; maxidx++)
899			if (flows[maxidx].active)
900				break;
901		if (maxidx < schk->cfg.flows_cnt) {
902			/* find the largest sub- queue */
903			for (i = maxidx + 1; i < schk->cfg.flows_cnt; i++)
904				if (flows[i].active && flows[i].stats.length >
905					flows[maxidx].stats.length)
906					maxidx = i;
907			pie_drop_head(&flows[maxidx], si);
908			drop = 1;
909		}
910	}
911
912	return drop;
913}
914
915/*
916 * Dequeue a packet from an appropriate queue according to
917 * FQ-CoDel algorithm.
918 */
919static struct mbuf *
920fq_pie_dequeue(struct dn_sch_inst *_si)
921{
922	struct fq_pie_si *si;
923	struct fq_pie_schk *schk;
924	struct dn_sch_fq_pie_parms *param;
925	struct fq_pie_flow *f;
926	struct mbuf *mbuf;
927	struct fq_pie_list *fq_pie_flowlist;
928
929	si = (struct fq_pie_si *)_si;
930	schk = (struct fq_pie_schk *)(si->_si.sched+1);
931	param = &schk->cfg;
932
933	do {
934		/* select a list to start with */
935		if (STAILQ_EMPTY(&si->newflows))
936			fq_pie_flowlist = &si->oldflows;
937		else
938			fq_pie_flowlist = &si->newflows;
939
940		/* Both new and old queue lists are empty, return NULL */
941		if (STAILQ_EMPTY(fq_pie_flowlist))
942			return NULL;
943
944		f = STAILQ_FIRST(fq_pie_flowlist);
945		while (f != NULL)	{
946			/* if there is no flow(sub-queue) deficit, increase deficit
947			 * by quantum, move the flow to the tail of old flows list
948			 * and try another flow.
949			 * Otherwise, the flow will be used for dequeue.
950			 */
951			if (f->deficit < 0) {
952				 f->deficit += param->quantum;
953				 STAILQ_REMOVE_HEAD(fq_pie_flowlist, flowchain);
954				 STAILQ_INSERT_TAIL(&si->oldflows, f, flowchain);
955			 } else
956				 break;
957
958			f = STAILQ_FIRST(fq_pie_flowlist);
959		}
960
961		/* the new flows list is empty, try old flows list */
962		if (STAILQ_EMPTY(fq_pie_flowlist))
963			continue;
964
965		/* Dequeue a packet from the selected flow */
966		mbuf = pie_dequeue(f, si);
967
968		/* pie did not return a packet */
969		if (!mbuf) {
970			/* If the selected flow belongs to new flows list, then move
971			 * it to the tail of old flows list. Otherwise, deactivate it and
972			 * remove it from the old list and
973			 */
974			if (fq_pie_flowlist == &si->newflows) {
975				STAILQ_REMOVE_HEAD(fq_pie_flowlist, flowchain);
976				STAILQ_INSERT_TAIL(&si->oldflows, f, flowchain);
977			}	else {
978				f->active = 0;
979				fq_deactivate_pie(&f->pst);
980				STAILQ_REMOVE_HEAD(fq_pie_flowlist, flowchain);
981			}
982			/* start again */
983			continue;
984		}
985
986		/* we have a packet to return,
987		 * update flow deficit and return the packet*/
988		f->deficit -= mbuf->m_pkthdr.len;
989		return mbuf;
990
991	} while (1);
992
993	/* unreachable point */
994	return NULL;
995}
996
997/*
998 * Initialize fq_pie scheduler instance.
999 * also, allocate memory for flows array.
1000 */
1001static int
1002fq_pie_new_sched(struct dn_sch_inst *_si)
1003{
1004	struct fq_pie_si *si;
1005	struct dn_queue *q;
1006	struct fq_pie_schk *schk;
1007	struct fq_pie_flow *flows;
1008	int i;
1009
1010	si = (struct fq_pie_si *)_si;
1011	schk = (struct fq_pie_schk *)(_si->sched+1);
1012
1013	if(si->si_extra) {
1014		D("si already configured!");
1015		return 0;
1016	}
1017
1018	/* init the main queue */
1019	q = &si->main_q;
1020	set_oid(&q->ni.oid, DN_QUEUE, sizeof(*q));
1021	q->_si = _si;
1022	q->fs = _si->sched->fs;
1023
1024	/* allocate memory for scheduler instance extra vars */
1025	si->si_extra = malloc(sizeof(struct fq_pie_si_extra),
1026		 M_DUMMYNET, M_NOWAIT | M_ZERO);
1027	if (si->si_extra == NULL) {
1028		D("cannot allocate memory for fq_pie si extra vars");
1029		return ENOMEM ;
1030	}
1031	/* allocate memory for flows array */
1032	si->si_extra->flows = malloc(schk->cfg.flows_cnt * sizeof(struct fq_pie_flow),
1033		 M_DUMMYNET, M_NOWAIT | M_ZERO);
1034	flows = si->si_extra->flows;
1035	if (flows == NULL) {
1036		free(si->si_extra, M_DUMMYNET);
1037		si->si_extra = NULL;
1038		D("cannot allocate memory for fq_pie flows");
1039		return ENOMEM ;
1040	}
1041
1042	/* init perturbation for this si */
1043	si->perturbation = random();
1044	si->si_extra->nr_active_q = 0;
1045
1046	/* init the old and new flows lists */
1047	STAILQ_INIT(&si->newflows);
1048	STAILQ_INIT(&si->oldflows);
1049
1050	/* init the flows (sub-queues) */
1051	for (i = 0; i < schk->cfg.flows_cnt; i++) {
1052		flows[i].pst.parms = &schk->cfg.pcfg;
1053		flows[i].psi_extra = si->si_extra;
1054		pie_init(&flows[i], schk);
1055	}
1056
1057	fq_pie_desc.ref_count++;
1058
1059	return 0;
1060}
1061
1062
1063/*
1064 * Free fq_pie scheduler instance.
1065 */
1066static int
1067fq_pie_free_sched(struct dn_sch_inst *_si)
1068{
1069	struct fq_pie_si *si;
1070	struct fq_pie_schk *schk;
1071	struct fq_pie_flow *flows;
1072	int i;
1073
1074	si = (struct fq_pie_si *)_si;
1075	schk = (struct fq_pie_schk *)(_si->sched+1);
1076	flows = si->si_extra->flows;
1077	for (i = 0; i < schk->cfg.flows_cnt; i++) {
1078		pie_cleanup(&flows[i]);
1079	}
1080	si->si_extra = NULL;
1081	return 0;
1082}
1083
/*
 * Configure the FQ-PIE scheduler.
 * The configuration for the scheduler is passed from ipfw userland.
 */
1088static int
1089fq_pie_config(struct dn_schk *_schk)
1090{
1091	struct fq_pie_schk *schk;
1092	struct dn_extra_parms *ep;
1093	struct dn_sch_fq_pie_parms *fqp_cfg;
1094
1095	schk = (struct fq_pie_schk *)(_schk+1);
1096	ep = (struct dn_extra_parms *) _schk->cfg;
1097
1098	/* par array contains fq_pie configuration as follow
1099	 * PIE: 0- qdelay_ref,1- tupdate, 2- max_burst
1100	 * 3- max_ecnth, 4- alpha, 5- beta, 6- flags
1101	 * FQ_PIE: 7- quantum, 8- limit, 9- flows
1102	 */
1103	if (ep && ep->oid.len ==sizeof(*ep) &&
1104		ep->oid.subtype == DN_SCH_PARAMS) {
1105
1106		fqp_cfg = &schk->cfg;
1107		if (ep->par[0] < 0)
1108			fqp_cfg->pcfg.qdelay_ref = fq_pie_sysctl.pcfg.qdelay_ref;
1109		else
1110			fqp_cfg->pcfg.qdelay_ref = ep->par[0];
1111		if (ep->par[1] < 0)
1112			fqp_cfg->pcfg.tupdate = fq_pie_sysctl.pcfg.tupdate;
1113		else
1114			fqp_cfg->pcfg.tupdate = ep->par[1];
1115		if (ep->par[2] < 0)
1116			fqp_cfg->pcfg.max_burst = fq_pie_sysctl.pcfg.max_burst;
1117		else
1118			fqp_cfg->pcfg.max_burst = ep->par[2];
1119		if (ep->par[3] < 0)
1120			fqp_cfg->pcfg.max_ecnth = fq_pie_sysctl.pcfg.max_ecnth;
1121		else
1122			fqp_cfg->pcfg.max_ecnth = ep->par[3];
1123		if (ep->par[4] < 0)
1124			fqp_cfg->pcfg.alpha = fq_pie_sysctl.pcfg.alpha;
1125		else
1126			fqp_cfg->pcfg.alpha = ep->par[4];
1127		if (ep->par[5] < 0)
1128			fqp_cfg->pcfg.beta = fq_pie_sysctl.pcfg.beta;
1129		else
1130			fqp_cfg->pcfg.beta = ep->par[5];
1131		if (ep->par[6] < 0)
1132			fqp_cfg->pcfg.flags = 0;
1133		else
1134			fqp_cfg->pcfg.flags = ep->par[6];
1135
1136		/* FQ configurations */
1137		if (ep->par[7] < 0)
1138			fqp_cfg->quantum = fq_pie_sysctl.quantum;
1139		else
1140			fqp_cfg->quantum = ep->par[7];
1141		if (ep->par[8] < 0)
1142			fqp_cfg->limit = fq_pie_sysctl.limit;
1143		else
1144			fqp_cfg->limit = ep->par[8];
1145		if (ep->par[9] < 0)
1146			fqp_cfg->flows_cnt = fq_pie_sysctl.flows_cnt;
1147		else
1148			fqp_cfg->flows_cnt = ep->par[9];
1149
1150		/* Bound the configurations */
1151		fqp_cfg->pcfg.qdelay_ref = BOUND_VAR(fqp_cfg->pcfg.qdelay_ref,
1152			1, 5 * AQM_TIME_1S);
1153		fqp_cfg->pcfg.tupdate = BOUND_VAR(fqp_cfg->pcfg.tupdate,
1154			1, 5 * AQM_TIME_1S);
1155		fqp_cfg->pcfg.max_burst = BOUND_VAR(fqp_cfg->pcfg.max_burst,
1156			0, 5 * AQM_TIME_1S);
1157		fqp_cfg->pcfg.max_ecnth = BOUND_VAR(fqp_cfg->pcfg.max_ecnth,
1158			0, PIE_SCALE);
1159		fqp_cfg->pcfg.alpha = BOUND_VAR(fqp_cfg->pcfg.alpha, 0, 7 * PIE_SCALE);
1160		fqp_cfg->pcfg.beta = BOUND_VAR(fqp_cfg->pcfg.beta, 0, 7 * PIE_SCALE);
1161
1162		fqp_cfg->quantum = BOUND_VAR(fqp_cfg->quantum,1,9000);
1163		fqp_cfg->limit= BOUND_VAR(fqp_cfg->limit,1,20480);
1164		fqp_cfg->flows_cnt= BOUND_VAR(fqp_cfg->flows_cnt,1,65536);
1165	}
1166	else {
1167		D("Wrong parameters for fq_pie scheduler");
1168		return 1;
1169	}
1170
1171	return 0;
1172}
1173
1174/*
1175 * Return FQ-PIE scheduler configurations
1176 * the configurations for the scheduler is passed to userland.
1177 */
1178static int
1179fq_pie_getconfig (struct dn_schk *_schk, struct dn_extra_parms *ep) {
1180
1181	struct fq_pie_schk *schk = (struct fq_pie_schk *)(_schk+1);
1182	struct dn_sch_fq_pie_parms *fqp_cfg;
1183
1184	fqp_cfg = &schk->cfg;
1185
1186	strcpy(ep->name, fq_pie_desc.name);
1187	ep->par[0] = fqp_cfg->pcfg.qdelay_ref;
1188	ep->par[1] = fqp_cfg->pcfg.tupdate;
1189	ep->par[2] = fqp_cfg->pcfg.max_burst;
1190	ep->par[3] = fqp_cfg->pcfg.max_ecnth;
1191	ep->par[4] = fqp_cfg->pcfg.alpha;
1192	ep->par[5] = fqp_cfg->pcfg.beta;
1193	ep->par[6] = fqp_cfg->pcfg.flags;
1194
1195	ep->par[7] = fqp_cfg->quantum;
1196	ep->par[8] = fqp_cfg->limit;
1197	ep->par[9] = fqp_cfg->flows_cnt;
1198
1199	return 0;
1200}
1201
1202/*
1203 *  FQ-PIE scheduler descriptor
1204 * contains the type of the scheduler, the name, the size of extra
1205 * data structures, and function pointers.
1206 */
1207static struct dn_alg fq_pie_desc = {
1208	_SI( .type = )  DN_SCHED_FQ_PIE,
1209	_SI( .name = ) "FQ_PIE",
1210	_SI( .flags = ) 0,
1211
1212	_SI( .schk_datalen = ) sizeof(struct fq_pie_schk),
1213	_SI( .si_datalen = ) sizeof(struct fq_pie_si) - sizeof(struct dn_sch_inst),
1214	_SI( .q_datalen = ) 0,
1215
1216	_SI( .enqueue = ) fq_pie_enqueue,
1217	_SI( .dequeue = ) fq_pie_dequeue,
1218	_SI( .config = ) fq_pie_config, /* new sched i.e. sched X config ...*/
1219	_SI( .destroy = ) NULL,  /*sched x delete */
1220	_SI( .new_sched = ) fq_pie_new_sched, /* new schd instance */
1221	_SI( .free_sched = ) fq_pie_free_sched,	/* delete schd instance */
1222	_SI( .new_fsk = ) NULL,
1223	_SI( .free_fsk = ) NULL,
1224	_SI( .new_queue = ) NULL,
1225	_SI( .free_queue = ) NULL,
1226	_SI( .getconfig = )  fq_pie_getconfig,
1227	_SI( .ref_count = ) 0
1228};
1229
1230DECLARE_DNSCHED_MODULE(dn_fq_pie, &fq_pie_desc);
1231