/*
 * FQ_Codel - The FlowQueue-Codel scheduler/AQM
 *
 * $FreeBSD$
 *
 * Copyright (C) 2016 Centre for Advanced Internet Architectures,
 *  Swinburne University of Technology, Melbourne, Australia.
 * Portions of this code were made possible in part by a gift from
 *  The Comcast Innovation Fund.
 * Implemented by Rasool Al-Saadi <ralsaadi@swin.edu.au>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#ifdef _KERNEL
#include <sys/malloc.h>
#include <sys/socket.h>
//#include <sys/socketvar.h>
#include <sys/kernel.h>
#include <sys/mbuf.h>
#include <sys/module.h>
#include <net/if.h>	/* IFNAMSIZ */
#include <netinet/in.h>
#include <netinet/ip_var.h>		/* ipfw_rule_ref */
#include <netinet/ip_fw.h>	/* flow_id */
#include <netinet/ip_dummynet.h>

#include <sys/lock.h>
#include <sys/proc.h>
#include <sys/rwlock.h>

#include <netpfil/ipfw/ip_fw_private.h>
#include <sys/sysctl.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/ip_icmp.h>
#include <netinet/tcp.h>
#include <netinet/udp.h>
#include <sys/queue.h>
#include <sys/hash.h>

#include <netpfil/ipfw/dn_heap.h>
#include <netpfil/ipfw/ip_dn_private.h>

#include <netpfil/ipfw/dn_aqm.h>
#include <netpfil/ipfw/dn_aqm_codel.h>
#include <netpfil/ipfw/dn_sched.h>
#include <netpfil/ipfw/dn_sched_fq_codel.h>
#include <netpfil/ipfw/dn_sched_fq_codel_helper.h>

#else
#include <dn_test.h>
#endif

/* NOTE: The fq_codel module reimplements the CoDel AQM functions
 * because fq_codel uses a different flow (sub-queue) structure, and
 * dn_queue carries many variables a flow (sub-queue) does not need,
 * i.e. we avoid extra overhead (88 bytes vs 208 bytes).
 * Also, the CoDel functions manage the stats of the sub-queues as well
 * as the main queue.
 */

#define DN_SCHED_FQ_CODEL 6

static struct dn_alg fq_codel_desc;

/* fq_codel default parameters, including CoDel's: 5ms target, 100ms
 * interval, ECN enabled, 1024 flows, a 10240 packet limit and a
 * 1514 byte quantum.
 */
struct dn_sch_fq_codel_parms
fq_codel_sysctl = {{5000 * AQM_TIME_1US, 100000 * AQM_TIME_1US,
	CODEL_ECN_ENABLED}, 1024, 10240, 1514};

static int
fqcodel_sysctl_interval_handler(SYSCTL_HANDLER_ARGS)
{
	int error;
	long value;

	value = fq_codel_sysctl.ccfg.interval;
	value /= AQM_TIME_1US;
	error = sysctl_handle_long(oidp, &value, 0, req);
	if (error != 0 || req->newptr == NULL)
		return (error);
	if (value < 1 || value > 100 * AQM_TIME_1S)
		return (EINVAL);
	fq_codel_sysctl.ccfg.interval = value * AQM_TIME_1US;

	return (0);
}

static int
fqcodel_sysctl_target_handler(SYSCTL_HANDLER_ARGS)
{
	int error;
	long value;

	value = fq_codel_sysctl.ccfg.target;
	value /= AQM_TIME_1US;
	error = sysctl_handle_long(oidp, &value, 0, req);
	if (error != 0 || req->newptr == NULL)
		return (error);
	if (value < 1 || value > 5 * AQM_TIME_1S)
		return (EINVAL);
	fq_codel_sysctl.ccfg.target = value * AQM_TIME_1US;

	return (0);
}


SYSBEGIN(f4)

SYSCTL_DECL(_net_inet);
SYSCTL_DECL(_net_inet_ip);
SYSCTL_DECL(_net_inet_ip_dummynet);
static SYSCTL_NODE(_net_inet_ip_dummynet, OID_AUTO, fqcodel,
	CTLFLAG_RW, 0, "FQ_CODEL");

#ifdef SYSCTL_NODE

SYSCTL_PROC(_net_inet_ip_dummynet_fqcodel, OID_AUTO, target,
	CTLTYPE_LONG | CTLFLAG_RW, NULL, 0, fqcodel_sysctl_target_handler, "L",
	"FQ_CoDel target in microseconds");
SYSCTL_PROC(_net_inet_ip_dummynet_fqcodel, OID_AUTO, interval,
	CTLTYPE_LONG | CTLFLAG_RW, NULL, 0, fqcodel_sysctl_interval_handler, "L",
	"FQ_CoDel interval in microseconds");

SYSCTL_UINT(_net_inet_ip_dummynet_fqcodel, OID_AUTO, quantum,
	CTLFLAG_RW, &fq_codel_sysctl.quantum, 1514, "FQ_CoDel quantum");
SYSCTL_UINT(_net_inet_ip_dummynet_fqcodel, OID_AUTO, flows,
	CTLFLAG_RW, &fq_codel_sysctl.flows_cnt, 1024,
	"Number of queues for FQ_CoDel");
SYSCTL_UINT(_net_inet_ip_dummynet_fqcodel, OID_AUTO, limit,
	CTLFLAG_RW, &fq_codel_sysctl.limit, 10240, "FQ_CoDel queues size limit");
#endif

/* Drop a packet from the head of a codel queue */
static void
codel_drop_head(struct fq_codel_flow *q, struct fq_codel_si *si)
{
	struct mbuf *m = q->mq.head;

	if (m == NULL)
		return;
	q->mq.head = m->m_nextpkt;

	fq_update_stats(q, si, -m->m_pkthdr.len, 1);

	if (si->main_q.ni.length == 0) /* queue is now idle */
		si->main_q.q_time = V_dn_cfg.curr_time;

	FREE_PKT(m);
}

/* Enqueue a packet 'm' to a queue 'q' and add a timestamp to that packet.
 * Return 1 when unable to add the timestamp, otherwise return 0.
 */
static int
codel_enqueue(struct fq_codel_flow *q, struct mbuf *m, struct fq_codel_si *si)
{
	uint64_t len;

	len = m->m_pkthdr.len;
	/* track the maximum packet size seen so far */
	if (len > q->cst.maxpkt_size)
		q->cst.maxpkt_size = len;

	/* Add timestamp to mbuf as MTAG */
	struct m_tag *mtag;
	mtag = m_tag_locate(m, MTAG_ABI_COMPAT, DN_AQM_MTAG_TS, NULL);
	if (mtag == NULL)
		mtag = m_tag_alloc(MTAG_ABI_COMPAT, DN_AQM_MTAG_TS, sizeof(aqm_time_t),
			M_NOWAIT);
	if (mtag == NULL)
		goto drop;
	*(aqm_time_t *)(mtag + 1) = AQM_UNOW;
	m_tag_prepend(m, mtag);

	mq_append(&q->mq, m);
	fq_update_stats(q, si, len, 0);
	return 0;

drop:
	fq_update_stats(q, si, len, 1);
	m_freem(m);
	return 1;
}

/*
 * Classify a packet into a queue (flow) number using the Jenkins hash
 * function. The hash input is: protocol number, perturbation, src IP,
 * dst IP, src port and dst port.
 * Return: the queue number.
 */
static inline int
fq_codel_classify_flow(struct mbuf *m, uint16_t fcount, struct fq_codel_si *si)
{
	struct ip *ip;
	struct tcphdr *th;
	struct udphdr *uh;
	uint8_t tuple[41];
	uint16_t hash = 0;

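	/*
	 * Hash-tuple layout (descriptive note): byte 0 is the protocol
	 * number, bytes 1-4 the per-instance perturbation, followed by the
	 * source and destination addresses and the ports.  IPv6 uses all 41
	 * bytes (16-byte addresses, ports at offsets 37/39); IPv4 uses only
	 * the first 17 bytes (4-byte addresses, ports at offsets 13/15).
	 */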
	ip = (struct ip *)mtodo(m, dn_tag_get(m)->iphdr_off);
//#ifdef INET6
	struct ip6_hdr *ip6;
	int isip6;
	isip6 = (ip->ip_v == 6);

	if (isip6) {
		ip6 = (struct ip6_hdr *)ip;
		*((uint8_t *) &tuple[0]) = ip6->ip6_nxt;
		*((uint32_t *) &tuple[1]) = si->perturbation;
		memcpy(&tuple[5], ip6->ip6_src.s6_addr, 16);
		memcpy(&tuple[21], ip6->ip6_dst.s6_addr, 16);

		switch (ip6->ip6_nxt) {
		case IPPROTO_TCP:
			th = (struct tcphdr *)(ip6 + 1);
			*((uint16_t *) &tuple[37]) = th->th_dport;
			*((uint16_t *) &tuple[39]) = th->th_sport;
			break;

		case IPPROTO_UDP:
			uh = (struct udphdr *)(ip6 + 1);
			*((uint16_t *) &tuple[37]) = uh->uh_dport;
			*((uint16_t *) &tuple[39]) = uh->uh_sport;
			break;
		default:
			memset(&tuple[37], 0, 4);
		}

		hash = jenkins_hash(tuple, 41, HASHINIT) % fcount;
		return hash;
	}
//#endif

	/* IPv4 */
	*((uint8_t *) &tuple[0]) = ip->ip_p;
	*((uint32_t *) &tuple[1]) = si->perturbation;
	*((uint32_t *) &tuple[5]) = ip->ip_src.s_addr;
	*((uint32_t *) &tuple[9]) = ip->ip_dst.s_addr;

	switch (ip->ip_p) {
	case IPPROTO_TCP:
		th = (struct tcphdr *)(ip + 1);
		*((uint16_t *) &tuple[13]) = th->th_dport;
		*((uint16_t *) &tuple[15]) = th->th_sport;
		break;

	case IPPROTO_UDP:
		uh = (struct udphdr *)(ip + 1);
		*((uint16_t *) &tuple[13]) = uh->uh_dport;
		*((uint16_t *) &tuple[15]) = uh->uh_sport;
		break;
	default:
		memset(&tuple[13], 0, 4);
	}
	hash = jenkins_hash(tuple, 17, HASHINIT) % fcount;

	return hash;
}

/*
 * Enqueue a packet into an appropriate queue according to
 * the FQ_CODEL algorithm.
 */
static int
fq_codel_enqueue(struct dn_sch_inst *_si, struct dn_queue *_q,
	struct mbuf *m)
{
	struct fq_codel_si *si;
	struct fq_codel_schk *schk;
	struct dn_sch_fq_codel_parms *param;
	struct dn_queue *mainq;
	int idx, drop, i, maxidx;

	mainq = (struct dn_queue *)(_si + 1);
	si = (struct fq_codel_si *)_si;
	schk = (struct fq_codel_schk *)(si->_si.sched+1);
	param = &schk->cfg;

	/* classify the packet into a sub-queue (flow) number */
	idx = fq_codel_classify_flow(m, param->flows_cnt, si);
	/* enqueue the packet into the appropriate sub-queue using CoDel AQM.
	 * Note: 'codel_enqueue' returns 1 only when it is unable to add a
	 * timestamp to the packet (no limit check is done there). */
	drop = codel_enqueue(&si->flows[idx], m, si);

	/* codel was unable to timestamp the packet */
	if (drop)
		return 1;

	/* If the flow (sub-queue) is not active, then add it to the tail of
	 * the new flows list, initialize it and activate it.
	 */
	if (!si->flows[idx].active) {
		STAILQ_INSERT_TAIL(&si->newflows, &si->flows[idx], flowchain);
		si->flows[idx].deficit = param->quantum;
		si->flows[idx].cst.dropping = false;
		si->flows[idx].cst.first_above_time = 0;
		si->flows[idx].active = 1;
		//D("activate %d",idx);
	}

	/* Check the limit for all queues; if exceeded, remove a packet from
	 * the head of the largest sub-queue.
	 */
	if (mainq->ni.length > schk->cfg.limit) {
		D("over limit");
		/* find first active flow */
		for (maxidx = 0; maxidx < schk->cfg.flows_cnt; maxidx++)
			if (si->flows[maxidx].active)
				break;
		if (maxidx < schk->cfg.flows_cnt) {
			/* find the largest sub-queue */
			for (i = maxidx + 1; i < schk->cfg.flows_cnt; i++)
				if (si->flows[i].active && si->flows[i].stats.length >
					si->flows[maxidx].stats.length)
					maxidx = i;
			codel_drop_head(&si->flows[maxidx], si);
			D("maxidx = %d", maxidx);
			drop = 1;
		}
	}

	return drop;
}


/*
 * Dequeue a packet from an appropriate queue according to
 * the FQ_CODEL algorithm.
 */
static struct mbuf *
fq_codel_dequeue(struct dn_sch_inst *_si)
{
	struct fq_codel_si *si;
	struct fq_codel_schk *schk;
	struct dn_sch_fq_codel_parms *param;
	struct fq_codel_flow *f;
	struct mbuf *mbuf;
	struct fq_codel_list *fq_codel_flowlist;

	si = (struct fq_codel_si *)_si;
	schk = (struct fq_codel_schk *)(si->_si.sched+1);
	param = &schk->cfg;

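	/*
	 * Deficit round-robin over the new-flows and old-flows lists, as in
	 * the FQ-CoDel algorithm: new flows are served first, and a flow
	 * that has exhausted its deficit is given one more quantum and
	 * moved to the tail of the old-flows list.
	 */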
	do {
		/* select a list to start with */
		if (STAILQ_EMPTY(&si->newflows))
			fq_codel_flowlist = &si->oldflows;
		else
			fq_codel_flowlist = &si->newflows;

		/* Both the new and old flow lists are empty, return NULL */
		if (STAILQ_EMPTY(fq_codel_flowlist))
			return NULL;

		f = STAILQ_FIRST(fq_codel_flowlist);
		while (f != NULL) {
			/* If the flow (sub-queue) has no deficit left, increase
			 * its deficit by a quantum, move the flow to the tail of
			 * the old flows list and try another flow.
			 * Otherwise, this flow will be used for the dequeue.
			 */
			if (f->deficit < 0) {
				f->deficit += param->quantum;
				STAILQ_REMOVE_HEAD(fq_codel_flowlist, flowchain);
				STAILQ_INSERT_TAIL(&si->oldflows, f, flowchain);
			} else
				break;

			f = STAILQ_FIRST(fq_codel_flowlist);
		}

		/* the new flows list is empty, try the old flows list */
		if (STAILQ_EMPTY(fq_codel_flowlist))
			continue;

		/* Dequeue a packet from the selected flow */
		mbuf = fqc_codel_dequeue(f, si);

		/* CoDel did not return a packet */
		if (!mbuf) {
			/* If the selected flow belongs to the new flows list, then
			 * move it to the tail of the old flows list. Otherwise,
			 * deactivate it and remove it from the old flows list.
			 */
			if (fq_codel_flowlist == &si->newflows) {
				STAILQ_REMOVE_HEAD(fq_codel_flowlist, flowchain);
				STAILQ_INSERT_TAIL(&si->oldflows, f, flowchain);
			} else {
				f->active = 0;
				STAILQ_REMOVE_HEAD(fq_codel_flowlist, flowchain);
			}
			/* start again */
			continue;
		}

		/* We have a packet to return; update the flow deficit and
		 * return the packet. */
		f->deficit -= mbuf->m_pkthdr.len;
		return mbuf;

	} while (1);

	/* unreachable point */
	return NULL;
}

/*
 * Initialize an fq_codel scheduler instance.
 * Also, allocate memory for the flows array.
 */
static int
fq_codel_new_sched(struct dn_sch_inst *_si)
{
	struct fq_codel_si *si;
	struct dn_queue *q;
	struct fq_codel_schk *schk;
	int i;

	si = (struct fq_codel_si *)_si;
	schk = (struct fq_codel_schk *)(_si->sched+1);

	if (si->flows) {
		D("si already configured!");
		return 0;
	}

	/* init the main queue */
	q = &si->main_q;
	set_oid(&q->ni.oid, DN_QUEUE, sizeof(*q));
	q->_si = _si;
	q->fs = _si->sched->fs;
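	/* Note: the main queue only carries the aggregate counters for this
	 * instance; the packets themselves live in the per-flow sub-queues. */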

	/* allocate memory for the flows array */
	si->flows = mallocarray(schk->cfg.flows_cnt,
	    sizeof(struct fq_codel_flow), M_DUMMYNET, M_NOWAIT | M_ZERO);
	if (si->flows == NULL) {
		D("cannot allocate memory for fq_codel flows array");
		return ENOMEM;
	}

	/* init perturbation for this si */
	si->perturbation = random();

	/* init the old and new flows lists */
	STAILQ_INIT(&si->newflows);
	STAILQ_INIT(&si->oldflows);

	/* init the flows (sub-queues) */
	for (i = 0; i < schk->cfg.flows_cnt; i++) {
		/* init codel */
		si->flows[i].cst.maxpkt_size = 500;
	}

	fq_codel_desc.ref_count++;
	return 0;
}

/*
 * Free fq_codel scheduler instance.
 */
static int
fq_codel_free_sched(struct dn_sch_inst *_si)
{
	struct fq_codel_si *si = (struct fq_codel_si *)_si;

	/* free the flows array */
	free(si->flows, M_DUMMYNET);
	si->flows = NULL;
	fq_codel_desc.ref_count--;

	return 0;
}

/*
 * Configure the fq_codel scheduler.
 * The configuration for the scheduler is passed from userland.
 */
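/*
 * Example userland configuration (a sketch; this assumes the ipfw(8)
 * dummynet syntax for this scheduler, whose keywords map to the par[]
 * entries handled below):
 *   ipfw sched 1 config type fq_codel target 5ms interval 100ms \
 *       quantum 1514 limit 10240 flows 1024 ecn
 */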
static int
fq_codel_config(struct dn_schk *_schk)
{
	struct fq_codel_schk *schk;
	struct dn_extra_parms *ep;
	struct dn_sch_fq_codel_parms *fqc_cfg;

	schk = (struct fq_codel_schk *)(_schk+1);
	ep = (struct dn_extra_parms *) _schk->cfg;

	/* The par array contains the fq_codel configuration as follows:
	 * CoDel: 0- target, 1- interval, 2- flags
	 * FQ_CODEL: 3- quantum, 4- limit, 5- flows
	 */
	if (ep && ep->oid.len == sizeof(*ep) &&
		ep->oid.subtype == DN_SCH_PARAMS) {

		fqc_cfg = &schk->cfg;
		if (ep->par[0] < 0)
			fqc_cfg->ccfg.target = fq_codel_sysctl.ccfg.target;
		else
			fqc_cfg->ccfg.target = ep->par[0] * AQM_TIME_1US;

		if (ep->par[1] < 0)
			fqc_cfg->ccfg.interval = fq_codel_sysctl.ccfg.interval;
		else
			fqc_cfg->ccfg.interval = ep->par[1] * AQM_TIME_1US;

		if (ep->par[2] < 0)
			fqc_cfg->ccfg.flags = 0;
		else
			fqc_cfg->ccfg.flags = ep->par[2];

		/* FQ configurations */
		if (ep->par[3] < 0)
			fqc_cfg->quantum = fq_codel_sysctl.quantum;
		else
			fqc_cfg->quantum = ep->par[3];

		if (ep->par[4] < 0)
			fqc_cfg->limit = fq_codel_sysctl.limit;
		else
			fqc_cfg->limit = ep->par[4];

		if (ep->par[5] < 0)
			fqc_cfg->flows_cnt = fq_codel_sysctl.flows_cnt;
		else
			fqc_cfg->flows_cnt = ep->par[5];

		/* Bound the configurations */
		fqc_cfg->ccfg.target = BOUND_VAR(fqc_cfg->ccfg.target, 1,
			5 * AQM_TIME_1S);
		fqc_cfg->ccfg.interval = BOUND_VAR(fqc_cfg->ccfg.interval, 1,
			100 * AQM_TIME_1S);

		fqc_cfg->quantum = BOUND_VAR(fqc_cfg->quantum, 1, 9000);
		fqc_cfg->limit = BOUND_VAR(fqc_cfg->limit, 1, 20480);
		fqc_cfg->flows_cnt = BOUND_VAR(fqc_cfg->flows_cnt, 1, 65536);
	} else
		return 1;

	return 0;
}

/*
 * Return the fq_codel scheduler configuration.
 * The configuration for the scheduler is passed to userland.
 */
static int
fq_codel_getconfig(struct dn_schk *_schk, struct dn_extra_parms *ep)
{
	struct fq_codel_schk *schk = (struct fq_codel_schk *)(_schk+1);
	struct dn_sch_fq_codel_parms *fqc_cfg;

	fqc_cfg = &schk->cfg;

	strcpy(ep->name, fq_codel_desc.name);
	ep->par[0] = fqc_cfg->ccfg.target / AQM_TIME_1US;
	ep->par[1] = fqc_cfg->ccfg.interval / AQM_TIME_1US;
	ep->par[2] = fqc_cfg->ccfg.flags;

	ep->par[3] = fqc_cfg->quantum;
	ep->par[4] = fqc_cfg->limit;
	ep->par[5] = fqc_cfg->flows_cnt;

	return 0;
}

/*
 * fq_codel scheduler descriptor
 * contains the type of the scheduler, the name, the size of extra
 * data structures, and function pointers.
 */
static struct dn_alg fq_codel_desc = {
	_SI( .type = )  DN_SCHED_FQ_CODEL,
	_SI( .name = ) "FQ_CODEL",
	_SI( .flags = ) 0,

	_SI( .schk_datalen = ) sizeof(struct fq_codel_schk),
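	/* Note: si_datalen below is the instance-private area beyond the
	 * dn_sch_inst header that dummynet allocates for every instance,
	 * hence the subtraction. */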
	_SI( .si_datalen = ) sizeof(struct fq_codel_si) - sizeof(struct dn_sch_inst),
	_SI( .q_datalen = ) 0,

	_SI( .enqueue = ) fq_codel_enqueue,
	_SI( .dequeue = ) fq_codel_dequeue,
	_SI( .config = ) fq_codel_config, /* new sched, i.e. sched X config ... */
	_SI( .destroy = ) NULL, /* sched X delete */
	_SI( .new_sched = ) fq_codel_new_sched, /* new sched instance */
	_SI( .free_sched = ) fq_codel_free_sched, /* delete sched instance */
	_SI( .new_fsk = ) NULL,
	_SI( .free_fsk = ) NULL,
	_SI( .new_queue = ) NULL,
	_SI( .free_queue = ) NULL,
	_SI( .getconfig = )  fq_codel_getconfig,
	_SI( .ref_count = ) 0
};

DECLARE_DNSCHED_MODULE(dn_fq_codel, &fq_codel_desc);
