1/* 2 * FQ_Codel - The FlowQueue-Codel scheduler/AQM 3 * 4 * $FreeBSD$ 5 * 6 * Copyright (C) 2016 Centre for Advanced Internet Architectures, 7 * Swinburne University of Technology, Melbourne, Australia. 8 * Portions of this code were made possible in part by a gift from 9 * The Comcast Innovation Fund. 10 * Implemented by Rasool Al-Saadi <ralsaadi@swin.edu.au> 11 * 12 * Redistribution and use in source and binary forms, with or without 13 * modification, are permitted provided that the following conditions 14 * are met: 15 * 1. Redistributions of source code must retain the above copyright 16 * notice, this list of conditions and the following disclaimer. 17 * 2. Redistributions in binary form must reproduce the above copyright 18 * notice, this list of conditions and the following disclaimer in the 19 * documentation and/or other materials provided with the distribution. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 31 * SUCH DAMAGE. 32 */ 33 34#ifdef _KERNEL 35#include <sys/malloc.h> 36#include <sys/socket.h> 37//#include <sys/socketvar.h> 38#include <sys/kernel.h> 39#include <sys/mbuf.h> 40#include <sys/module.h> 41#include <net/if.h> /* IFNAMSIZ */ 42#include <netinet/in.h> 43#include <netinet/ip_var.h> /* ipfw_rule_ref */ 44#include <netinet/ip_fw.h> /* flow_id */ 45#include <netinet/ip_dummynet.h> 46 47#include <sys/lock.h> 48#include <sys/proc.h> 49#include <sys/rwlock.h> 50 51#include <netpfil/ipfw/ip_fw_private.h> 52#include <sys/sysctl.h> 53#include <netinet/ip.h> 54#include <netinet/ip6.h> 55#include <netinet/ip_icmp.h> 56#include <netinet/tcp.h> 57#include <netinet/udp.h> 58#include <sys/queue.h> 59#include <sys/hash.h> 60 61#include <netpfil/ipfw/dn_heap.h> 62#include <netpfil/ipfw/ip_dn_private.h> 63 64#include <netpfil/ipfw/dn_aqm.h> 65#include <netpfil/ipfw/dn_aqm_codel.h> 66#include <netpfil/ipfw/dn_sched.h> 67#include <netpfil/ipfw/dn_sched_fq_codel.h> 68#include <netpfil/ipfw/dn_sched_fq_codel_helper.h> 69 70#else 71#include <dn_test.h> 72#endif 73 74/* NOTE: In fq_codel module, we reimplements CoDel AQM functions 75 * because fq_codel use different flows (sub-queues) structure and 76 * dn_queue includes many variables not needed by a flow (sub-queue 77 * )i.e. avoid extra overhead (88 bytes vs 208 bytes). 78 * Also, CoDel functions manages stats of sub-queues as well as the main queue. 79 */ 80 81#define DN_SCHED_FQ_CODEL 6 82 83static struct dn_alg fq_codel_desc; 84 85/* fq_codel default parameters including codel */ 86struct dn_sch_fq_codel_parms 87fq_codel_sysctl = {{5000 * AQM_TIME_1US, 100000 * AQM_TIME_1US, 88 CODEL_ECN_ENABLED}, 1024, 10240, 1514}; 89 90static int 91fqcodel_sysctl_interval_handler(SYSCTL_HANDLER_ARGS) 92{ 93 int error; 94 long value; 95 96 value = fq_codel_sysctl.ccfg.interval; 97 value /= AQM_TIME_1US; 98 error = sysctl_handle_long(oidp, &value, 0, req); 99 if (error != 0 || req->newptr == NULL) 100 return (error); 101 if (value < 1 || value > 100 * AQM_TIME_1S) 102 return (EINVAL); 103 fq_codel_sysctl.ccfg.interval = value * AQM_TIME_1US ; 104 105 return (0); 106} 107 108static int 109fqcodel_sysctl_target_handler(SYSCTL_HANDLER_ARGS) 110{ 111 int error; 112 long value; 113 114 value = fq_codel_sysctl.ccfg.target; 115 value /= AQM_TIME_1US; 116 error = sysctl_handle_long(oidp, &value, 0, req); 117 if (error != 0 || req->newptr == NULL) 118 return (error); 119 if (value < 1 || value > 5 * AQM_TIME_1S) 120 return (EINVAL); 121 fq_codel_sysctl.ccfg.target = value * AQM_TIME_1US ; 122 123 return (0); 124} 125 126 127SYSBEGIN(f4) 128 129SYSCTL_DECL(_net_inet); 130SYSCTL_DECL(_net_inet_ip); 131SYSCTL_DECL(_net_inet_ip_dummynet); 132static SYSCTL_NODE(_net_inet_ip_dummynet, OID_AUTO, fqcodel, 133 CTLFLAG_RW, 0, "FQ_CODEL"); 134 135#ifdef SYSCTL_NODE 136 137SYSCTL_PROC(_net_inet_ip_dummynet_fqcodel, OID_AUTO, target, 138 CTLTYPE_LONG | CTLFLAG_RW, NULL, 0, fqcodel_sysctl_target_handler, "L", 139 "FQ_CoDel target in microsecond"); 140SYSCTL_PROC(_net_inet_ip_dummynet_fqcodel, OID_AUTO, interval, 141 CTLTYPE_LONG | CTLFLAG_RW, NULL, 0, fqcodel_sysctl_interval_handler, "L", 142 "FQ_CoDel interval in microsecond"); 143 144SYSCTL_UINT(_net_inet_ip_dummynet_fqcodel, OID_AUTO, quantum, 145 CTLFLAG_RW, &fq_codel_sysctl.quantum, 1514, "FQ_CoDel quantum"); 146SYSCTL_UINT(_net_inet_ip_dummynet_fqcodel, OID_AUTO, flows, 147 CTLFLAG_RW, &fq_codel_sysctl.flows_cnt, 1024, 148 "Number of queues for FQ_CoDel"); 149SYSCTL_UINT(_net_inet_ip_dummynet_fqcodel, OID_AUTO, limit, 150 CTLFLAG_RW, &fq_codel_sysctl.limit, 10240, "FQ_CoDel queues size limit"); 151#endif 152 153/* Drop a packet form the head of codel queue */ 154static void 155codel_drop_head(struct fq_codel_flow *q, struct fq_codel_si *si) 156{ 157 struct mbuf *m = q->mq.head; 158 159 if (m == NULL) 160 return; 161 q->mq.head = m->m_nextpkt; 162 163 fq_update_stats(q, si, -m->m_pkthdr.len, 1); 164 165 if (si->main_q.ni.length == 0) /* queue is now idle */ 166 si->main_q.q_time = V_dn_cfg.curr_time; 167 168 FREE_PKT(m); 169} 170 171/* Enqueue a packet 'm' to a queue 'q' and add timestamp to that packet. 172 * Return 1 when unable to add timestamp, otherwise return 0 173 */ 174static int 175codel_enqueue(struct fq_codel_flow *q, struct mbuf *m, struct fq_codel_si *si) 176{ 177 uint64_t len; 178 179 len = m->m_pkthdr.len; 180 /* finding maximum packet size */ 181 if (len > q->cst.maxpkt_size) 182 q->cst.maxpkt_size = len; 183 184 /* Add timestamp to mbuf as MTAG */ 185 struct m_tag *mtag; 186 mtag = m_tag_locate(m, MTAG_ABI_COMPAT, DN_AQM_MTAG_TS, NULL); 187 if (mtag == NULL) 188 mtag = m_tag_alloc(MTAG_ABI_COMPAT, DN_AQM_MTAG_TS, sizeof(aqm_time_t), 189 M_NOWAIT); 190 if (mtag == NULL) 191 goto drop; 192 *(aqm_time_t *)(mtag + 1) = AQM_UNOW; 193 m_tag_prepend(m, mtag); 194 195 mq_append(&q->mq, m); 196 fq_update_stats(q, si, len, 0); 197 return 0; 198 199drop: 200 fq_update_stats(q, si, len, 1); 201 m_freem(m); 202 return 1; 203} 204 205/* 206 * Classify a packet to queue number using Jenkins hash function. 207 * Return: queue number 208 * the input of the hash are protocol no, perturbation, src IP, dst IP, 209 * src port, dst port, 210 */ 211static inline int 212fq_codel_classify_flow(struct mbuf *m, uint16_t fcount, struct fq_codel_si *si) 213{ 214 struct ip *ip; 215 struct tcphdr *th; 216 struct udphdr *uh; 217 uint8_t tuple[41]; 218 uint16_t hash=0; 219 220 ip = (struct ip *)mtodo(m, dn_tag_get(m)->iphdr_off); 221//#ifdef INET6 222 struct ip6_hdr *ip6; 223 int isip6; 224 isip6 = (ip->ip_v == 6); 225 226 if(isip6) { 227 ip6 = (struct ip6_hdr *)ip; 228 *((uint8_t *) &tuple[0]) = ip6->ip6_nxt; 229 *((uint32_t *) &tuple[1]) = si->perturbation; 230 memcpy(&tuple[5], ip6->ip6_src.s6_addr, 16); 231 memcpy(&tuple[21], ip6->ip6_dst.s6_addr, 16); 232 233 switch (ip6->ip6_nxt) { 234 case IPPROTO_TCP: 235 th = (struct tcphdr *)(ip6 + 1); 236 *((uint16_t *) &tuple[37]) = th->th_dport; 237 *((uint16_t *) &tuple[39]) = th->th_sport; 238 break; 239 240 case IPPROTO_UDP: 241 uh = (struct udphdr *)(ip6 + 1); 242 *((uint16_t *) &tuple[37]) = uh->uh_dport; 243 *((uint16_t *) &tuple[39]) = uh->uh_sport; 244 break; 245 default: 246 memset(&tuple[37], 0, 4); 247 248 } 249 250 hash = jenkins_hash(tuple, 41, HASHINIT) % fcount; 251 return hash; 252 } 253//#endif 254 255 /* IPv4 */ 256 *((uint8_t *) &tuple[0]) = ip->ip_p; 257 *((uint32_t *) &tuple[1]) = si->perturbation; 258 *((uint32_t *) &tuple[5]) = ip->ip_src.s_addr; 259 *((uint32_t *) &tuple[9]) = ip->ip_dst.s_addr; 260 261 switch (ip->ip_p) { 262 case IPPROTO_TCP: 263 th = (struct tcphdr *)(ip + 1); 264 *((uint16_t *) &tuple[13]) = th->th_dport; 265 *((uint16_t *) &tuple[15]) = th->th_sport; 266 break; 267 268 case IPPROTO_UDP: 269 uh = (struct udphdr *)(ip + 1); 270 *((uint16_t *) &tuple[13]) = uh->uh_dport; 271 *((uint16_t *) &tuple[15]) = uh->uh_sport; 272 break; 273 default: 274 memset(&tuple[13], 0, 4); 275 276 } 277 hash = jenkins_hash(tuple, 17, HASHINIT) % fcount; 278 279 return hash; 280} 281 282/* 283 * Enqueue a packet into an appropriate queue according to 284 * FQ_CODEL algorithm. 285 */ 286static int 287fq_codel_enqueue(struct dn_sch_inst *_si, struct dn_queue *_q, 288 struct mbuf *m) 289{ 290 struct fq_codel_si *si; 291 struct fq_codel_schk *schk; 292 struct dn_sch_fq_codel_parms *param; 293 struct dn_queue *mainq; 294 int idx, drop, i, maxidx; 295 296 mainq = (struct dn_queue *)(_si + 1); 297 si = (struct fq_codel_si *)_si; 298 schk = (struct fq_codel_schk *)(si->_si.sched+1); 299 param = &schk->cfg; 300 301 /* classify a packet to queue number*/ 302 idx = fq_codel_classify_flow(m, param->flows_cnt, si); 303 /* enqueue packet into appropriate queue using CoDel AQM. 304 * Note: 'codel_enqueue' function returns 1 only when it unable to 305 * add timestamp to packet (no limit check)*/ 306 drop = codel_enqueue(&si->flows[idx], m, si); 307 308 /* codel unable to timestamp a packet */ 309 if (drop) 310 return 1; 311 312 /* If the flow (sub-queue) is not active ,then add it to the tail of 313 * new flows list, initialize and activate it. 314 */ 315 if (!si->flows[idx].active ) { 316 STAILQ_INSERT_TAIL(&si->newflows, &si->flows[idx], flowchain); 317 si->flows[idx].deficit = param->quantum; 318 si->flows[idx].cst.dropping = false; 319 si->flows[idx].cst.first_above_time = 0; 320 si->flows[idx].active = 1; 321 //D("activate %d",idx); 322 } 323 324 /* check the limit for all queues and remove a packet from the 325 * largest one 326 */ 327 if (mainq->ni.length > schk->cfg.limit) { D("over limit"); 328 /* find first active flow */ 329 for (maxidx = 0; maxidx < schk->cfg.flows_cnt; maxidx++) 330 if (si->flows[maxidx].active) 331 break; 332 if (maxidx < schk->cfg.flows_cnt) { 333 /* find the largest sub- queue */ 334 for (i = maxidx + 1; i < schk->cfg.flows_cnt; i++) 335 if (si->flows[i].active && si->flows[i].stats.length > 336 si->flows[maxidx].stats.length) 337 maxidx = i; 338 codel_drop_head(&si->flows[maxidx], si); 339 D("maxidx = %d",maxidx); 340 drop = 1; 341 } 342 } 343 344 return drop; 345} 346 347/* 348 * Dequeue a packet from an appropriate queue according to 349 * FQ_CODEL algorithm. 350 */ 351static struct mbuf * 352fq_codel_dequeue(struct dn_sch_inst *_si) 353{ 354 struct fq_codel_si *si; 355 struct fq_codel_schk *schk; 356 struct dn_sch_fq_codel_parms *param; 357 struct fq_codel_flow *f; 358 struct mbuf *mbuf; 359 struct fq_codel_list *fq_codel_flowlist; 360 361 si = (struct fq_codel_si *)_si; 362 schk = (struct fq_codel_schk *)(si->_si.sched+1); 363 param = &schk->cfg; 364 365 do { 366 /* select a list to start with */ 367 if (STAILQ_EMPTY(&si->newflows)) 368 fq_codel_flowlist = &si->oldflows; 369 else 370 fq_codel_flowlist = &si->newflows; 371 372 /* Both new and old queue lists are empty, return NULL */ 373 if (STAILQ_EMPTY(fq_codel_flowlist)) 374 return NULL; 375 376 f = STAILQ_FIRST(fq_codel_flowlist); 377 while (f != NULL) { 378 /* if there is no flow(sub-queue) deficit, increase deficit 379 * by quantum, move the flow to the tail of old flows list 380 * and try another flow. 381 * Otherwise, the flow will be used for dequeue. 382 */ 383 if (f->deficit < 0) { 384 f->deficit += param->quantum; 385 STAILQ_REMOVE_HEAD(fq_codel_flowlist, flowchain); 386 STAILQ_INSERT_TAIL(&si->oldflows, f, flowchain); 387 } else 388 break; 389 390 f = STAILQ_FIRST(fq_codel_flowlist); 391 } 392 393 /* the new flows list is empty, try old flows list */ 394 if (STAILQ_EMPTY(fq_codel_flowlist)) 395 continue; 396 397 /* Dequeue a packet from the selected flow */ 398 mbuf = fqc_codel_dequeue(f, si); 399 400 /* Codel did not return a packet */ 401 if (!mbuf) { 402 /* If the selected flow belongs to new flows list, then move 403 * it to the tail of old flows list. Otherwise, deactivate it and 404 * remove it from the old list and 405 */ 406 if (fq_codel_flowlist == &si->newflows) { 407 STAILQ_REMOVE_HEAD(fq_codel_flowlist, flowchain); 408 STAILQ_INSERT_TAIL(&si->oldflows, f, flowchain); 409 } else { 410 f->active = 0; 411 STAILQ_REMOVE_HEAD(fq_codel_flowlist, flowchain); 412 } 413 /* start again */ 414 continue; 415 } 416 417 /* we have a packet to return, 418 * update flow deficit and return the packet*/ 419 f->deficit -= mbuf->m_pkthdr.len; 420 return mbuf; 421 422 } while (1); 423 424 /* unreachable point */ 425 return NULL; 426} 427 428/* 429 * Initialize fq_codel scheduler instance. 430 * also, allocate memory for flows array. 431 */ 432static int 433fq_codel_new_sched(struct dn_sch_inst *_si) 434{ 435 struct fq_codel_si *si; 436 struct dn_queue *q; 437 struct fq_codel_schk *schk; 438 int i; 439 440 si = (struct fq_codel_si *)_si; 441 schk = (struct fq_codel_schk *)(_si->sched+1); 442 443 if(si->flows) { 444 D("si already configured!"); 445 return 0; 446 } 447 448 /* init the main queue */ 449 q = &si->main_q; 450 set_oid(&q->ni.oid, DN_QUEUE, sizeof(*q)); 451 q->_si = _si; 452 q->fs = _si->sched->fs; 453 454 /* allocate memory for flows array */ 455 si->flows = mallocarray(schk->cfg.flows_cnt, 456 sizeof(struct fq_codel_flow), M_DUMMYNET, M_NOWAIT | M_ZERO); 457 if (si->flows == NULL) { 458 D("cannot allocate memory for fq_codel configuration parameters"); 459 return ENOMEM ; 460 } 461 462 /* init perturbation for this si */ 463 si->perturbation = random(); 464 465 /* init the old and new flows lists */ 466 STAILQ_INIT(&si->newflows); 467 STAILQ_INIT(&si->oldflows); 468 469 /* init the flows (sub-queues) */ 470 for (i = 0; i < schk->cfg.flows_cnt; i++) { 471 /* init codel */ 472 si->flows[i].cst.maxpkt_size = 500; 473 } 474 475 fq_codel_desc.ref_count++; 476 return 0; 477} 478 479/* 480 * Free fq_codel scheduler instance. 481 */ 482static int 483fq_codel_free_sched(struct dn_sch_inst *_si) 484{ 485 struct fq_codel_si *si = (struct fq_codel_si *)_si ; 486 487 /* free the flows array */ 488 free(si->flows , M_DUMMYNET); 489 si->flows = NULL; 490 fq_codel_desc.ref_count--; 491 492 return 0; 493} 494 495/* 496 * Configure fq_codel scheduler. 497 * the configurations for the scheduler is passed from userland. 498 */ 499static int 500fq_codel_config(struct dn_schk *_schk) 501{ 502 struct fq_codel_schk *schk; 503 struct dn_extra_parms *ep; 504 struct dn_sch_fq_codel_parms *fqc_cfg; 505 506 schk = (struct fq_codel_schk *)(_schk+1); 507 ep = (struct dn_extra_parms *) _schk->cfg; 508 509 /* par array contains fq_codel configuration as follow 510 * Codel: 0- target,1- interval, 2- flags 511 * FQ_CODEL: 3- quantum, 4- limit, 5- flows 512 */ 513 if (ep && ep->oid.len ==sizeof(*ep) && 514 ep->oid.subtype == DN_SCH_PARAMS) { 515 516 fqc_cfg = &schk->cfg; 517 if (ep->par[0] < 0) 518 fqc_cfg->ccfg.target = fq_codel_sysctl.ccfg.target; 519 else 520 fqc_cfg->ccfg.target = ep->par[0] * AQM_TIME_1US; 521 522 if (ep->par[1] < 0) 523 fqc_cfg->ccfg.interval = fq_codel_sysctl.ccfg.interval; 524 else 525 fqc_cfg->ccfg.interval = ep->par[1] * AQM_TIME_1US; 526 527 if (ep->par[2] < 0) 528 fqc_cfg->ccfg.flags = 0; 529 else 530 fqc_cfg->ccfg.flags = ep->par[2]; 531 532 /* FQ configurations */ 533 if (ep->par[3] < 0) 534 fqc_cfg->quantum = fq_codel_sysctl.quantum; 535 else 536 fqc_cfg->quantum = ep->par[3]; 537 538 if (ep->par[4] < 0) 539 fqc_cfg->limit = fq_codel_sysctl.limit; 540 else 541 fqc_cfg->limit = ep->par[4]; 542 543 if (ep->par[5] < 0) 544 fqc_cfg->flows_cnt = fq_codel_sysctl.flows_cnt; 545 else 546 fqc_cfg->flows_cnt = ep->par[5]; 547 548 /* Bound the configurations */ 549 fqc_cfg->ccfg.target = BOUND_VAR(fqc_cfg->ccfg.target, 1 , 550 5 * AQM_TIME_1S); ; 551 fqc_cfg->ccfg.interval = BOUND_VAR(fqc_cfg->ccfg.interval, 1, 552 100 * AQM_TIME_1S); 553 554 fqc_cfg->quantum = BOUND_VAR(fqc_cfg->quantum,1, 9000); 555 fqc_cfg->limit= BOUND_VAR(fqc_cfg->limit,1,20480); 556 fqc_cfg->flows_cnt= BOUND_VAR(fqc_cfg->flows_cnt,1,65536); 557 } 558 else 559 return 1; 560 561 return 0; 562} 563 564/* 565 * Return fq_codel scheduler configurations 566 * the configurations for the scheduler is passed to userland. 567 */ 568static int 569fq_codel_getconfig (struct dn_schk *_schk, struct dn_extra_parms *ep) { 570 571 struct fq_codel_schk *schk = (struct fq_codel_schk *)(_schk+1); 572 struct dn_sch_fq_codel_parms *fqc_cfg; 573 574 fqc_cfg = &schk->cfg; 575 576 strcpy(ep->name, fq_codel_desc.name); 577 ep->par[0] = fqc_cfg->ccfg.target / AQM_TIME_1US; 578 ep->par[1] = fqc_cfg->ccfg.interval / AQM_TIME_1US; 579 ep->par[2] = fqc_cfg->ccfg.flags; 580 581 ep->par[3] = fqc_cfg->quantum; 582 ep->par[4] = fqc_cfg->limit; 583 ep->par[5] = fqc_cfg->flows_cnt; 584 585 return 0; 586} 587 588/* 589 * fq_codel scheduler descriptor 590 * contains the type of the scheduler, the name, the size of extra 591 * data structures, and function pointers. 592 */ 593static struct dn_alg fq_codel_desc = { 594 _SI( .type = ) DN_SCHED_FQ_CODEL, 595 _SI( .name = ) "FQ_CODEL", 596 _SI( .flags = ) 0, 597 598 _SI( .schk_datalen = ) sizeof(struct fq_codel_schk), 599 _SI( .si_datalen = ) sizeof(struct fq_codel_si) - sizeof(struct dn_sch_inst), 600 _SI( .q_datalen = ) 0, 601 602 _SI( .enqueue = ) fq_codel_enqueue, 603 _SI( .dequeue = ) fq_codel_dequeue, 604 _SI( .config = ) fq_codel_config, /* new sched i.e. sched X config ...*/ 605 _SI( .destroy = ) NULL, /*sched x delete */ 606 _SI( .new_sched = ) fq_codel_new_sched, /* new schd instance */ 607 _SI( .free_sched = ) fq_codel_free_sched, /* delete schd instance */ 608 _SI( .new_fsk = ) NULL, 609 _SI( .free_fsk = ) NULL, 610 _SI( .new_queue = ) NULL, 611 _SI( .free_queue = ) NULL, 612 _SI( .getconfig = ) fq_codel_getconfig, 613 _SI( .ref_count = ) 0 614}; 615 616DECLARE_DNSCHED_MODULE(dn_fq_codel, &fq_codel_desc); 617