1// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
2
3/* COMMON Applications Kept Enhanced (CAKE) discipline
4 *
5 * Copyright (C) 2014-2018 Jonathan Morton <chromatix99@gmail.com>
6 * Copyright (C) 2015-2018 Toke Høiland-Jørgensen <toke@toke.dk>
7 * Copyright (C) 2014-2018 Dave Täht <dave.taht@gmail.com>
8 * Copyright (C) 2015-2018 Sebastian Moeller <moeller0@gmx.de>
9 * (C) 2015-2018 Kevin Darbyshire-Bryant <kevin@darbyshire-bryant.me.uk>
10 * Copyright (C) 2017-2018 Ryan Mounce <ryan@mounce.com.au>
11 *
12 * The CAKE Principles:
13 *		   (or, how to have your cake and eat it too)
14 *
15 * This is a combination of several shaping, AQM and FQ techniques into one
16 * easy-to-use package:
17 *
18 * - An overall bandwidth shaper, to move the bottleneck away from dumb CPE
19 *   equipment and bloated MACs.  This operates in deficit mode (as in sch_fq),
20 *   eliminating the need for any sort of burst parameter (eg. token bucket
21 *   depth).  Burst support is limited to that necessary to overcome scheduling
22 *   latency.
23 *
24 * - A Diffserv-aware priority queue, giving more priority to certain classes,
25 *   up to a specified fraction of bandwidth.  Above that bandwidth threshold,
26 *   the priority is reduced to avoid starving other tins.
27 *
28 * - Each priority tin has a separate Flow Queue system, to isolate traffic
29 *   flows from each other.  This prevents a burst on one flow from increasing
30 *   the delay to another.  Flows are distributed to queues using a
31 *   set-associative hash function.
32 *
33 * - Each queue is actively managed by Cobalt, which is a combination of the
34 *   Codel and Blue AQM algorithms.  This serves flows fairly, and signals
35 *   congestion early via ECN (if available) and/or packet drops, to keep
36 *   latency low.  The codel parameters are auto-tuned based on the bandwidth
37 *   setting, as is necessary at low bandwidths.
38 *
39 * The configuration parameters are kept deliberately simple for ease of use.
40 * Everything has sane defaults.  Complete generality of configuration is *not*
41 * a goal.
42 *
43 * The priority queue operates according to a weighted DRR scheme, combined with
44 * a bandwidth tracker which reuses the shaper logic to detect which side of the
45 * bandwidth sharing threshold the tin is operating on.  This determines whether a
46 * priority-based weight (high) or a bandwidth-based weight (low) is used for
47 * that tin in the current pass.
48 *
49 * This qdisc was inspired by Eric Dumazet's fq_codel code, which he kindly
50 * granted us permission to leverage.
51 */
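
/* A worked example of the deficit-mode shaping described above (illustrative
 * numbers, not taken from the code): the shaper keeps a virtual transmit
 * clock which each packet advances by its own serialisation time,
 *   time_next = time_this + ((len * rate_ns) >> rate_shft)
 * At 100 Mbit/s a byte takes 80 ns on the wire, so a 1514-byte frame advances
 * the clock by roughly 121 us; packets are only released once real time has
 * caught up with that clock, which is why no token-bucket depth or burst
 * parameter is needed.
 */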
52
53#include <linux/module.h>
54#include <linux/types.h>
55#include <linux/kernel.h>
56#include <linux/jiffies.h>
57#include <linux/string.h>
58#include <linux/in.h>
59#include <linux/errno.h>
60#include <linux/init.h>
61#include <linux/skbuff.h>
62#include <linux/jhash.h>
63#include <linux/slab.h>
64#include <linux/vmalloc.h>
65#include <linux/reciprocal_div.h>
66#include <net/netlink.h>
67#include <linux/if_vlan.h>
68#include <net/gso.h>
69#include <net/pkt_sched.h>
70#include <net/pkt_cls.h>
71#include <net/tcp.h>
72#include <net/flow_dissector.h>
73
74#if IS_ENABLED(CONFIG_NF_CONNTRACK)
75#include <net/netfilter/nf_conntrack_core.h>
76#endif
77
78#define CAKE_SET_WAYS (8)
79#define CAKE_MAX_TINS (8)
80#define CAKE_QUEUES (1024)
81#define CAKE_FLOW_MASK 63
82#define CAKE_FLOW_NAT_FLAG 64
83
84/* struct cobalt_params - contains codel and blue parameters
85 * @interval:	codel initial drop rate
86 * @target:     maximum persistent sojourn time & blue update rate
87 * @mtu_time:   serialisation delay of maximum-size packet
88 * @p_inc:      increment of blue drop probability (0.32 fxp)
89 * @p_dec:      decrement of blue drop probability (0.32 fxp)
90 */
91struct cobalt_params {
92	u64	interval;
93	u64	target;
94	u64	mtu_time;
95	u32	p_inc;
96	u32	p_dec;
97};
98
99/* struct cobalt_vars - contains codel and blue variables
100 * @count:		codel dropping frequency
101 * @rec_inv_sqrt:	reciprocal value of sqrt(count) >> 1
102 * @drop_next:		time to drop next packet, or when we dropped last
103 * @blue_timer:		Blue time to next drop
104 * @p_drop:		BLUE drop probability (0.32 fxp)
105 * @dropping:		set if in dropping state
106 * @ecn_marked:		set if marked
107 */
108struct cobalt_vars {
109	u32	count;
110	u32	rec_inv_sqrt;
111	ktime_t	drop_next;
112	ktime_t	blue_timer;
113	u32     p_drop;
114	bool	dropping;
115	bool    ecn_marked;
116};
117
118enum {
119	CAKE_SET_NONE = 0,
120	CAKE_SET_SPARSE,
121	CAKE_SET_SPARSE_WAIT, /* counted in SPARSE, actually in BULK */
122	CAKE_SET_BULK,
123	CAKE_SET_DECAYING
124};
125
126struct cake_flow {
127	/* this stuff is all needed per-flow at dequeue time */
128	struct sk_buff	  *head;
129	struct sk_buff	  *tail;
130	struct list_head  flowchain;
131	s32		  deficit;
132	u32		  dropped;
133	struct cobalt_vars cvars;
134	u16		  srchost; /* index into cake_host table */
135	u16		  dsthost;
136	u8		  set;
137}; /* please try to keep this structure <= 64 bytes */
138
139struct cake_host {
140	u32 srchost_tag;
141	u32 dsthost_tag;
142	u16 srchost_bulk_flow_count;
143	u16 dsthost_bulk_flow_count;
144};
145
146struct cake_heap_entry {
147	u16 t:3, b:10;
148};
149
150struct cake_tin_data {
151	struct cake_flow flows[CAKE_QUEUES];
152	u32	backlogs[CAKE_QUEUES];
153	u32	tags[CAKE_QUEUES]; /* for set association */
154	u16	overflow_idx[CAKE_QUEUES];
155	struct cake_host hosts[CAKE_QUEUES]; /* for triple isolation */
156	u16	flow_quantum;
157
158	struct cobalt_params cparams;
159	u32	drop_overlimit;
160	u16	bulk_flow_count;
161	u16	sparse_flow_count;
162	u16	decaying_flow_count;
163	u16	unresponsive_flow_count;
164
165	u32	max_skblen;
166
167	struct list_head new_flows;
168	struct list_head old_flows;
169	struct list_head decaying_flows;
170
171	/* time_next = time_this + ((len * rate_ns) >> rate_shft) */
172	ktime_t	time_next_packet;
173	u64	tin_rate_ns;
174	u64	tin_rate_bps;
175	u16	tin_rate_shft;
176
177	u16	tin_quantum;
178	s32	tin_deficit;
179	u32	tin_backlog;
180	u32	tin_dropped;
181	u32	tin_ecn_mark;
182
183	u32	packets;
184	u64	bytes;
185
186	u32	ack_drops;
187
188	/* moving averages */
189	u64 avge_delay;
190	u64 peak_delay;
191	u64 base_delay;
192
193	/* hash function stats */
194	u32	way_directs;
195	u32	way_hits;
196	u32	way_misses;
197	u32	way_collisions;
198}; /* number of tins is small, so size of this struct doesn't matter much */
199
200struct cake_sched_data {
201	struct tcf_proto __rcu *filter_list; /* optional external classifier */
202	struct tcf_block *block;
203	struct cake_tin_data *tins;
204
205	struct cake_heap_entry overflow_heap[CAKE_QUEUES * CAKE_MAX_TINS];
206	u16		overflow_timeout;
207
208	u16		tin_cnt;
209	u8		tin_mode;
210	u8		flow_mode;
211	u8		ack_filter;
212	u8		atm_mode;
213
214	u32		fwmark_mask;
215	u16		fwmark_shft;
216
217	/* time_next = time_this + ((len * rate_ns) >> rate_shft) */
218	u16		rate_shft;
219	ktime_t		time_next_packet;
220	ktime_t		failsafe_next_packet;
221	u64		rate_ns;
222	u64		rate_bps;
223	u16		rate_flags;
224	s16		rate_overhead;
225	u16		rate_mpu;
226	u64		interval;
227	u64		target;
228
229	/* resource tracking */
230	u32		buffer_used;
231	u32		buffer_max_used;
232	u32		buffer_limit;
233	u32		buffer_config_limit;
234
235	/* indices for dequeue */
236	u16		cur_tin;
237	u16		cur_flow;
238
239	struct qdisc_watchdog watchdog;
240	const u8	*tin_index;
241	const u8	*tin_order;
242
243	/* bandwidth capacity estimate */
244	ktime_t		last_packet_time;
245	ktime_t		avg_window_begin;
246	u64		avg_packet_interval;
247	u64		avg_window_bytes;
248	u64		avg_peak_bandwidth;
249	ktime_t		last_reconfig_time;
250
251	/* packet length stats */
252	u32		avg_netoff;
253	u16		max_netlen;
254	u16		max_adjlen;
255	u16		min_netlen;
256	u16		min_adjlen;
257};
258
259enum {
260	CAKE_FLAG_OVERHEAD	   = BIT(0),
261	CAKE_FLAG_AUTORATE_INGRESS = BIT(1),
262	CAKE_FLAG_INGRESS	   = BIT(2),
263	CAKE_FLAG_WASH		   = BIT(3),
264	CAKE_FLAG_SPLIT_GSO	   = BIT(4)
265};
266
267/* COBALT operates the Codel and BLUE algorithms in parallel, in order to
268 * obtain the best features of each.  Codel is excellent on flows which
269 * respond to congestion signals in a TCP-like way.  BLUE is more effective on
270 * unresponsive flows.
271 */
272
273struct cobalt_skb_cb {
274	ktime_t enqueue_time;
275	u32     adjusted_len;
276};
277
278static u64 us_to_ns(u64 us)
279{
280	return us * NSEC_PER_USEC;
281}
282
283static struct cobalt_skb_cb *get_cobalt_cb(const struct sk_buff *skb)
284{
285	qdisc_cb_private_validate(skb, sizeof(struct cobalt_skb_cb));
286	return (struct cobalt_skb_cb *)qdisc_skb_cb(skb)->data;
287}
288
289static ktime_t cobalt_get_enqueue_time(const struct sk_buff *skb)
290{
291	return get_cobalt_cb(skb)->enqueue_time;
292}
293
294static void cobalt_set_enqueue_time(struct sk_buff *skb,
295				    ktime_t now)
296{
297	get_cobalt_cb(skb)->enqueue_time = now;
298}
299
300static u16 quantum_div[CAKE_QUEUES + 1] = {0};
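
/* quantum_div[i] is filled in at init time (elsewhere in this file) with
 * approximately 65535 / i.  This lets cake_enqueue() below give a new flow a
 * deficit of roughly flow_quantum / host_load by computing
 * (flow_quantum * quantum_div[host_load]) >> 16, i.e. a fixed-point multiply
 * rather than a per-packet division.
 */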
301
302/* Diffserv lookup tables */
303
304static const u8 precedence[] = {
305	0, 0, 0, 0, 0, 0, 0, 0,
306	1, 1, 1, 1, 1, 1, 1, 1,
307	2, 2, 2, 2, 2, 2, 2, 2,
308	3, 3, 3, 3, 3, 3, 3, 3,
309	4, 4, 4, 4, 4, 4, 4, 4,
310	5, 5, 5, 5, 5, 5, 5, 5,
311	6, 6, 6, 6, 6, 6, 6, 6,
312	7, 7, 7, 7, 7, 7, 7, 7,
313};
314
315static const u8 diffserv8[] = {
316	2, 0, 1, 2, 4, 2, 2, 2,
317	1, 2, 1, 2, 1, 2, 1, 2,
318	5, 2, 4, 2, 4, 2, 4, 2,
319	3, 2, 3, 2, 3, 2, 3, 2,
320	6, 2, 3, 2, 3, 2, 3, 2,
321	6, 2, 2, 2, 6, 2, 6, 2,
322	7, 2, 2, 2, 2, 2, 2, 2,
323	7, 2, 2, 2, 2, 2, 2, 2,
324};
325
326static const u8 diffserv4[] = {
327	0, 1, 0, 0, 2, 0, 0, 0,
328	1, 0, 0, 0, 0, 0, 0, 0,
329	2, 0, 2, 0, 2, 0, 2, 0,
330	2, 0, 2, 0, 2, 0, 2, 0,
331	3, 0, 2, 0, 2, 0, 2, 0,
332	3, 0, 0, 0, 3, 0, 3, 0,
333	3, 0, 0, 0, 0, 0, 0, 0,
334	3, 0, 0, 0, 0, 0, 0, 0,
335};
336
337static const u8 diffserv3[] = {
338	0, 1, 0, 0, 2, 0, 0, 0,
339	1, 0, 0, 0, 0, 0, 0, 0,
340	0, 0, 0, 0, 0, 0, 0, 0,
341	0, 0, 0, 0, 0, 0, 0, 0,
342	0, 0, 0, 0, 0, 0, 0, 0,
343	0, 0, 0, 0, 2, 0, 2, 0,
344	2, 0, 0, 0, 0, 0, 0, 0,
345	2, 0, 0, 0, 0, 0, 0, 0,
346};
347
348static const u8 besteffort[] = {
349	0, 0, 0, 0, 0, 0, 0, 0,
350	0, 0, 0, 0, 0, 0, 0, 0,
351	0, 0, 0, 0, 0, 0, 0, 0,
352	0, 0, 0, 0, 0, 0, 0, 0,
353	0, 0, 0, 0, 0, 0, 0, 0,
354	0, 0, 0, 0, 0, 0, 0, 0,
355	0, 0, 0, 0, 0, 0, 0, 0,
356	0, 0, 0, 0, 0, 0, 0, 0,
357};
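
/* Each lookup table above is indexed by the 6-bit DSCP recovered in
 * cake_handle_diffserv() (DS field >> 2) and yields a tin index for
 * cake_select_tin().  For example, in diffserv4[] the EF codepoint (DSCP 46)
 * maps to tin 3 and CS1 (DSCP 8) maps to tin 1, while codepoints given no
 * special treatment fall through to tin 0.
 */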
358
359/* tin priority order for stats dumping */
360
361static const u8 normal_order[] = {0, 1, 2, 3, 4, 5, 6, 7};
362static const u8 bulk_order[] = {1, 0, 2, 3};
363
364#define REC_INV_SQRT_CACHE (16)
365static u32 cobalt_rec_inv_sqrt_cache[REC_INV_SQRT_CACHE] = {0};
366
367/* http://en.wikipedia.org/wiki/Methods_of_computing_square_roots
368 * new_invsqrt = (invsqrt / 2) * (3 - count * invsqrt^2)
369 *
370 * Here, invsqrt is a fixed point number (< 1.0), 32bit mantissa, aka Q0.32
371 */
372
373static void cobalt_newton_step(struct cobalt_vars *vars)
374{
375	u32 invsqrt, invsqrt2;
376	u64 val;
377
378	invsqrt = vars->rec_inv_sqrt;
379	invsqrt2 = ((u64)invsqrt * invsqrt) >> 32;
380	val = (3LL << 32) - ((u64)vars->count * invsqrt2);
381
382	val >>= 2; /* avoid overflow in following multiply */
383	val = (val * invsqrt) >> (32 - 2 + 1);
384
385	vars->rec_inv_sqrt = val;
386}
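
/* Worked example of the step above: for count == 2, starting from an
 * estimate of 1.0 (0xFFFFFFFF), successive steps give roughly 0.5, 0.625,
 * 0.693 and 0.705, converging on 1/sqrt(2) ~= 0.707.  This slow convergence
 * at small counts is why cobalt_cache_init() below runs four Newton steps
 * per cached entry.
 */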
387
388static void cobalt_invsqrt(struct cobalt_vars *vars)
389{
390	if (vars->count < REC_INV_SQRT_CACHE)
391		vars->rec_inv_sqrt = cobalt_rec_inv_sqrt_cache[vars->count];
392	else
393		cobalt_newton_step(vars);
394}
395
396/* There is a big difference in timing between the accurate values placed in
397 * the cache and the approximations given by a single Newton step for small
398 * count values, particularly when stepping from count 1 to 2 or vice versa.
399 * Above 16, a single Newton step gives sufficient accuracy in either
400 * direction, given the precision stored.
401 *
402 * The magnitude of the error when stepping up to count 2 is such as to give
403 * the value that *should* have been produced at count 4.
404 */
405
406static void cobalt_cache_init(void)
407{
408	struct cobalt_vars v;
409
410	memset(&v, 0, sizeof(v));
411	v.rec_inv_sqrt = ~0U;
412	cobalt_rec_inv_sqrt_cache[0] = v.rec_inv_sqrt;
413
414	for (v.count = 1; v.count < REC_INV_SQRT_CACHE; v.count++) {
415		cobalt_newton_step(&v);
416		cobalt_newton_step(&v);
417		cobalt_newton_step(&v);
418		cobalt_newton_step(&v);
419
420		cobalt_rec_inv_sqrt_cache[v.count] = v.rec_inv_sqrt;
421	}
422}
423
424static void cobalt_vars_init(struct cobalt_vars *vars)
425{
426	memset(vars, 0, sizeof(*vars));
427
428	if (!cobalt_rec_inv_sqrt_cache[0]) {
429		cobalt_cache_init();
430		cobalt_rec_inv_sqrt_cache[0] = ~0;
431	}
432}
433
434/* CoDel control_law is t + interval/sqrt(count)
435 * We maintain in rec_inv_sqrt the reciprocal value of sqrt(count) to avoid
436 * both sqrt() and divide operations.
437 */
438static ktime_t cobalt_control(ktime_t t,
439			      u64 interval,
440			      u32 rec_inv_sqrt)
441{
442	return ktime_add_ns(t, reciprocal_scale(interval,
443						rec_inv_sqrt));
444}
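
/* Since rec_inv_sqrt holds 1/sqrt(count) in Q0.32, reciprocal_scale()
 * effectively computes (interval * rec_inv_sqrt) >> 32, i.e.
 * interval / sqrt(count).  For example, at count == 4 the reciprocal is
 * about 0x80000000 (0.5), so a 100 ms interval schedules the next signal
 * 50 ms out; at count == 16 the spacing shrinks to roughly 25 ms.
 */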
445
446/* Call this when a packet had to be dropped due to queue overflow.  Returns
447 * true if the BLUE state was quiescent before but active after this call.
448 */
449static bool cobalt_queue_full(struct cobalt_vars *vars,
450			      struct cobalt_params *p,
451			      ktime_t now)
452{
453	bool up = false;
454
455	if (ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) {
456		up = !vars->p_drop;
457		vars->p_drop += p->p_inc;
458		if (vars->p_drop < p->p_inc)
459			vars->p_drop = ~0;
460		vars->blue_timer = now;
461	}
462	vars->dropping = true;
463	vars->drop_next = now;
464	if (!vars->count)
465		vars->count = 1;
466
467	return up;
468}
469
470/* Call this when the queue was serviced but turned out to be empty.  Returns
471 * true if the BLUE state was active before but quiescent after this call.
472 */
473static bool cobalt_queue_empty(struct cobalt_vars *vars,
474			       struct cobalt_params *p,
475			       ktime_t now)
476{
477	bool down = false;
478
479	if (vars->p_drop &&
480	    ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) {
481		if (vars->p_drop < p->p_dec)
482			vars->p_drop = 0;
483		else
484			vars->p_drop -= p->p_dec;
485		vars->blue_timer = now;
486		down = !vars->p_drop;
487	}
488	vars->dropping = false;
489
490	if (vars->count && ktime_to_ns(ktime_sub(now, vars->drop_next)) >= 0) {
491		vars->count--;
492		cobalt_invsqrt(vars);
493		vars->drop_next = cobalt_control(vars->drop_next,
494						 p->interval,
495						 vars->rec_inv_sqrt);
496	}
497
498	return down;
499}
500
501/* Call this with a freshly dequeued packet for possible congestion marking.
502 * Returns true as an instruction to drop the packet, false for delivery.
503 */
504static bool cobalt_should_drop(struct cobalt_vars *vars,
505			       struct cobalt_params *p,
506			       ktime_t now,
507			       struct sk_buff *skb,
508			       u32 bulk_flows)
509{
510	bool next_due, over_target, drop = false;
511	ktime_t schedule;
512	u64 sojourn;
513
514/* The 'schedule' variable records, in its sign, whether 'now' is before or
515 * after 'drop_next'.  This allows 'drop_next' to be updated before the next
516 * scheduling decision is actually branched, without destroying that
517 * information.  Similarly, the first 'schedule' value calculated is preserved
518 * in the boolean 'next_due'.
519 *
520 * As for 'drop_next', we take advantage of the fact that 'interval' is both
521 * the delay between first exceeding 'target' and the first signalling event,
522 * *and* the scaling factor for the signalling frequency.  It's therefore very
523 * natural to use a single mechanism for both purposes, and eliminates a
524 * significant amount of reference Codel's spaghetti code.  To help with this,
525 * both the '0' and '1' entries in the invsqrt cache are 0xFFFFFFFF, as close
526 * as possible to 1.0 in fixed-point.
527 */
528
529	sojourn = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb)));
530	schedule = ktime_sub(now, vars->drop_next);
531	over_target = sojourn > p->target &&
532		      sojourn > p->mtu_time * bulk_flows * 2 &&
533		      sojourn > p->mtu_time * 4;
534	next_due = vars->count && ktime_to_ns(schedule) >= 0;
535
536	vars->ecn_marked = false;
537
538	if (over_target) {
539		if (!vars->dropping) {
540			vars->dropping = true;
541			vars->drop_next = cobalt_control(now,
542							 p->interval,
543							 vars->rec_inv_sqrt);
544		}
545		if (!vars->count)
546			vars->count = 1;
547	} else if (vars->dropping) {
548		vars->dropping = false;
549	}
550
551	if (next_due && vars->dropping) {
552		/* Use ECN mark if possible, otherwise drop */
553		drop = !(vars->ecn_marked = INET_ECN_set_ce(skb));
554
555		vars->count++;
556		if (!vars->count)
557			vars->count--;
558		cobalt_invsqrt(vars);
559		vars->drop_next = cobalt_control(vars->drop_next,
560						 p->interval,
561						 vars->rec_inv_sqrt);
562		schedule = ktime_sub(now, vars->drop_next);
563	} else {
564		while (next_due) {
565			vars->count--;
566			cobalt_invsqrt(vars);
567			vars->drop_next = cobalt_control(vars->drop_next,
568							 p->interval,
569							 vars->rec_inv_sqrt);
570			schedule = ktime_sub(now, vars->drop_next);
571			next_due = vars->count && ktime_to_ns(schedule) >= 0;
572		}
573	}
574
575	/* Simple BLUE implementation.  Lack of ECN is deliberate. */
576	if (vars->p_drop)
577		drop |= (get_random_u32() < vars->p_drop);
578
579	/* Overload the drop_next field as an activity timeout */
580	if (!vars->count)
581		vars->drop_next = ktime_add_ns(now, p->interval);
582	else if (ktime_to_ns(schedule) > 0 && !drop)
583		vars->drop_next = now;
584
585	return drop;
586}
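
/* Worked example of the over_target test above (illustrative numbers): with
 * a 5 ms target and an mtu_time of 1.2 ms (roughly a 1500-byte packet at
 * 10 Mbit/s), a single bulk flow needs a sojourn above
 * max(5, 2 * 1 * 1.2, 4 * 1.2) = 5 ms before Codel engages, but with eight
 * bulk flows the floor rises to max(5, 2 * 8 * 1.2, 4.8) = 19.2 ms.  The
 * threshold thus scales with how many flows legitimately share the link's
 * serialisation capacity.
 */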
587
588static bool cake_update_flowkeys(struct flow_keys *keys,
589				 const struct sk_buff *skb)
590{
591#if IS_ENABLED(CONFIG_NF_CONNTRACK)
592	struct nf_conntrack_tuple tuple = {};
593	bool rev = !skb->_nfct, upd = false;
594	__be32 ip;
595
596	if (skb_protocol(skb, true) != htons(ETH_P_IP))
597		return false;
598
599	if (!nf_ct_get_tuple_skb(&tuple, skb))
600		return false;
601
602	ip = rev ? tuple.dst.u3.ip : tuple.src.u3.ip;
603	if (ip != keys->addrs.v4addrs.src) {
604		keys->addrs.v4addrs.src = ip;
605		upd = true;
606	}
607	ip = rev ? tuple.src.u3.ip : tuple.dst.u3.ip;
608	if (ip != keys->addrs.v4addrs.dst) {
609		keys->addrs.v4addrs.dst = ip;
610		upd = true;
611	}
612
613	if (keys->ports.ports) {
614		__be16 port;
615
616		port = rev ? tuple.dst.u.all : tuple.src.u.all;
617		if (port != keys->ports.src) {
618			keys->ports.src = port;
619			upd = true;
620		}
621		port = rev ? tuple.src.u.all : tuple.dst.u.all;
622		if (port != keys->ports.dst) {
623			keys->ports.dst = port;
624			upd = true;
625		}
626	}
627	return upd;
628#else
629	return false;
630#endif
631}
632
633/* Cake has several subtle multiple bit settings. In these cases the
634 * helpers below also match when triple-isolate mode is selected.
635 */
636
637static bool cake_dsrc(int flow_mode)
638{
639	return (flow_mode & CAKE_FLOW_DUAL_SRC) == CAKE_FLOW_DUAL_SRC;
640}
641
642static bool cake_ddst(int flow_mode)
643{
644	return (flow_mode & CAKE_FLOW_DUAL_DST) == CAKE_FLOW_DUAL_DST;
645}
646
647static u32 cake_hash(struct cake_tin_data *q, const struct sk_buff *skb,
648		     int flow_mode, u16 flow_override, u16 host_override)
649{
650	bool hash_flows = (!flow_override && !!(flow_mode & CAKE_FLOW_FLOWS));
651	bool hash_hosts = (!host_override && !!(flow_mode & CAKE_FLOW_HOSTS));
652	bool nat_enabled = !!(flow_mode & CAKE_FLOW_NAT_FLAG);
653	u32 flow_hash = 0, srchost_hash = 0, dsthost_hash = 0;
654	u16 reduced_hash, srchost_idx, dsthost_idx;
655	struct flow_keys keys, host_keys;
656	bool use_skbhash = skb->l4_hash;
657
658	if (unlikely(flow_mode == CAKE_FLOW_NONE))
659		return 0;
660
661	/* If both overrides are set, or we can use the SKB hash and nat mode is
662	 * disabled, we can skip packet dissection entirely. If nat mode is
663	 * enabled there's another check below after doing the conntrack lookup.
664	 */
665	if ((!hash_flows || (use_skbhash && !nat_enabled)) && !hash_hosts)
666		goto skip_hash;
667
668	skb_flow_dissect_flow_keys(skb, &keys,
669				   FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);
670
671	/* Don't use the SKB hash if we change the lookup keys from conntrack */
672	if (nat_enabled && cake_update_flowkeys(&keys, skb))
673		use_skbhash = false;
674
675	/* If we can still use the SKB hash and don't need the host hash, we can
676	 * skip the rest of the hashing procedure
677	 */
678	if (use_skbhash && !hash_hosts)
679		goto skip_hash;
680
681	/* flow_hash_from_keys() sorts the addresses by value, so we have
682	 * to preserve their order in a separate data structure to treat
683	 * src and dst host addresses as independently selectable.
684	 */
685	host_keys = keys;
686	host_keys.ports.ports     = 0;
687	host_keys.basic.ip_proto  = 0;
688	host_keys.keyid.keyid     = 0;
689	host_keys.tags.flow_label = 0;
690
691	switch (host_keys.control.addr_type) {
692	case FLOW_DISSECTOR_KEY_IPV4_ADDRS:
693		host_keys.addrs.v4addrs.src = 0;
694		dsthost_hash = flow_hash_from_keys(&host_keys);
695		host_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
696		host_keys.addrs.v4addrs.dst = 0;
697		srchost_hash = flow_hash_from_keys(&host_keys);
698		break;
699
700	case FLOW_DISSECTOR_KEY_IPV6_ADDRS:
701		memset(&host_keys.addrs.v6addrs.src, 0,
702		       sizeof(host_keys.addrs.v6addrs.src));
703		dsthost_hash = flow_hash_from_keys(&host_keys);
704		host_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
705		memset(&host_keys.addrs.v6addrs.dst, 0,
706		       sizeof(host_keys.addrs.v6addrs.dst));
707		srchost_hash = flow_hash_from_keys(&host_keys);
708		break;
709
710	default:
711		dsthost_hash = 0;
712		srchost_hash = 0;
713	}
714
715	/* This *must* be after the above switch, since as a
716	 * side-effect it sorts the src and dst addresses.
717	 */
718	if (hash_flows && !use_skbhash)
719		flow_hash = flow_hash_from_keys(&keys);
720
721skip_hash:
722	if (flow_override)
723		flow_hash = flow_override - 1;
724	else if (use_skbhash && (flow_mode & CAKE_FLOW_FLOWS))
725		flow_hash = skb->hash;
726	if (host_override) {
727		dsthost_hash = host_override - 1;
728		srchost_hash = host_override - 1;
729	}
730
731	if (!(flow_mode & CAKE_FLOW_FLOWS)) {
732		if (flow_mode & CAKE_FLOW_SRC_IP)
733			flow_hash ^= srchost_hash;
734
735		if (flow_mode & CAKE_FLOW_DST_IP)
736			flow_hash ^= dsthost_hash;
737	}
738
739	reduced_hash = flow_hash % CAKE_QUEUES;
740
741	/* set-associative hashing */
742	/* fast path if no hash collision (direct lookup succeeds) */
743	if (likely(q->tags[reduced_hash] == flow_hash &&
744		   q->flows[reduced_hash].set)) {
745		q->way_directs++;
746	} else {
747		u32 inner_hash = reduced_hash % CAKE_SET_WAYS;
748		u32 outer_hash = reduced_hash - inner_hash;
749		bool allocate_src = false;
750		bool allocate_dst = false;
751		u32 i, k;
752
753		/* check if any active queue in the set is reserved for
754		 * this flow.
755		 */
756		for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
757		     i++, k = (k + 1) % CAKE_SET_WAYS) {
758			if (q->tags[outer_hash + k] == flow_hash) {
759				if (i)
760					q->way_hits++;
761
762				if (!q->flows[outer_hash + k].set) {
763					/* need to increment host refcnts */
764					allocate_src = cake_dsrc(flow_mode);
765					allocate_dst = cake_ddst(flow_mode);
766				}
767
768				goto found;
769			}
770		}
771
772		/* no queue is reserved for this flow, look for an
773		 * empty one.
774		 */
775		for (i = 0; i < CAKE_SET_WAYS;
776			 i++, k = (k + 1) % CAKE_SET_WAYS) {
777			if (!q->flows[outer_hash + k].set) {
778				q->way_misses++;
779				allocate_src = cake_dsrc(flow_mode);
780				allocate_dst = cake_ddst(flow_mode);
781				goto found;
782			}
783		}
784
785		/* With no empty queues, default to the original
786		 * queue, accept the collision, update the host tags.
787		 */
788		q->way_collisions++;
789		if (q->flows[outer_hash + k].set == CAKE_SET_BULK) {
790			q->hosts[q->flows[reduced_hash].srchost].srchost_bulk_flow_count--;
791			q->hosts[q->flows[reduced_hash].dsthost].dsthost_bulk_flow_count--;
792		}
793		allocate_src = cake_dsrc(flow_mode);
794		allocate_dst = cake_ddst(flow_mode);
795found:
796		/* reserve queue for future packets in same flow */
797		reduced_hash = outer_hash + k;
798		q->tags[reduced_hash] = flow_hash;
799
800		if (allocate_src) {
801			srchost_idx = srchost_hash % CAKE_QUEUES;
802			inner_hash = srchost_idx % CAKE_SET_WAYS;
803			outer_hash = srchost_idx - inner_hash;
804			for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
805				i++, k = (k + 1) % CAKE_SET_WAYS) {
806				if (q->hosts[outer_hash + k].srchost_tag ==
807				    srchost_hash)
808					goto found_src;
809			}
810			for (i = 0; i < CAKE_SET_WAYS;
811				i++, k = (k + 1) % CAKE_SET_WAYS) {
812				if (!q->hosts[outer_hash + k].srchost_bulk_flow_count)
813					break;
814			}
815			q->hosts[outer_hash + k].srchost_tag = srchost_hash;
816found_src:
817			srchost_idx = outer_hash + k;
818			if (q->flows[reduced_hash].set == CAKE_SET_BULK)
819				q->hosts[srchost_idx].srchost_bulk_flow_count++;
820			q->flows[reduced_hash].srchost = srchost_idx;
821		}
822
823		if (allocate_dst) {
824			dsthost_idx = dsthost_hash % CAKE_QUEUES;
825			inner_hash = dsthost_idx % CAKE_SET_WAYS;
826			outer_hash = dsthost_idx - inner_hash;
827			for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
828			     i++, k = (k + 1) % CAKE_SET_WAYS) {
829				if (q->hosts[outer_hash + k].dsthost_tag ==
830				    dsthost_hash)
831					goto found_dst;
832			}
833			for (i = 0; i < CAKE_SET_WAYS;
834			     i++, k = (k + 1) % CAKE_SET_WAYS) {
835				if (!q->hosts[outer_hash + k].dsthost_bulk_flow_count)
836					break;
837			}
838			q->hosts[outer_hash + k].dsthost_tag = dsthost_hash;
839found_dst:
840			dsthost_idx = outer_hash + k;
841			if (q->flows[reduced_hash].set == CAKE_SET_BULK)
842				q->hosts[dsthost_idx].dsthost_bulk_flow_count++;
843			q->flows[reduced_hash].dsthost = dsthost_idx;
844		}
845	}
846
847	return reduced_hash;
848}
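
/* Example of the set-associative layout used above: reduced_hash picks a
 * nominal queue out of CAKE_QUEUES, and the 8-way set containing it starts
 * at outer_hash = reduced_hash - (reduced_hash % CAKE_SET_WAYS).  For
 * reduced_hash == 1023 that is queues 1016..1023, probed in the order 1023,
 * 1016, 1017, ..., 1022.  A matching tag reuses its queue, an unused queue
 * is claimed on a miss, and only when the whole set is busy does the flow
 * share a queue with another (counted as a way collision).
 */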
849
850/* helper functions: might be changed when/if skb uses a standard list_head */
851/* remove one skb from head of slot queue */
852
853static struct sk_buff *dequeue_head(struct cake_flow *flow)
854{
855	struct sk_buff *skb = flow->head;
856
857	if (skb) {
858		flow->head = skb->next;
859		skb_mark_not_on_list(skb);
860	}
861
862	return skb;
863}
864
865/* add skb to flow queue (tail add) */
866
867static void flow_queue_add(struct cake_flow *flow, struct sk_buff *skb)
868{
869	if (!flow->head)
870		flow->head = skb;
871	else
872		flow->tail->next = skb;
873	flow->tail = skb;
874	skb->next = NULL;
875}
876
877static struct iphdr *cake_get_iphdr(const struct sk_buff *skb,
878				    struct ipv6hdr *buf)
879{
880	unsigned int offset = skb_network_offset(skb);
881	struct iphdr *iph;
882
883	iph = skb_header_pointer(skb, offset, sizeof(struct iphdr), buf);
884
885	if (!iph)
886		return NULL;
887
888	if (iph->version == 4 && iph->protocol == IPPROTO_IPV6)
889		return skb_header_pointer(skb, offset + iph->ihl * 4,
890					  sizeof(struct ipv6hdr), buf);
891
892	else if (iph->version == 4)
893		return iph;
894
895	else if (iph->version == 6)
896		return skb_header_pointer(skb, offset, sizeof(struct ipv6hdr),
897					  buf);
898
899	return NULL;
900}
901
902static struct tcphdr *cake_get_tcphdr(const struct sk_buff *skb,
903				      void *buf, unsigned int bufsize)
904{
905	unsigned int offset = skb_network_offset(skb);
906	const struct ipv6hdr *ipv6h;
907	const struct tcphdr *tcph;
908	const struct iphdr *iph;
909	struct ipv6hdr _ipv6h;
910	struct tcphdr _tcph;
911
912	ipv6h = skb_header_pointer(skb, offset, sizeof(_ipv6h), &_ipv6h);
913
914	if (!ipv6h)
915		return NULL;
916
917	if (ipv6h->version == 4) {
918		iph = (struct iphdr *)ipv6h;
919		offset += iph->ihl * 4;
920
921		/* special-case 6in4 tunnelling, as that is a common way to get
922		 * v6 connectivity in the home
923		 */
924		if (iph->protocol == IPPROTO_IPV6) {
925			ipv6h = skb_header_pointer(skb, offset,
926						   sizeof(_ipv6h), &_ipv6h);
927
928			if (!ipv6h || ipv6h->nexthdr != IPPROTO_TCP)
929				return NULL;
930
931			offset += sizeof(struct ipv6hdr);
932
933		} else if (iph->protocol != IPPROTO_TCP) {
934			return NULL;
935		}
936
937	} else if (ipv6h->version == 6) {
938		if (ipv6h->nexthdr != IPPROTO_TCP)
939			return NULL;
940
941		offset += sizeof(struct ipv6hdr);
942	} else {
943		return NULL;
944	}
945
946	tcph = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph);
947	if (!tcph || tcph->doff < 5)
948		return NULL;
949
950	return skb_header_pointer(skb, offset,
951				  min(__tcp_hdrlen(tcph), bufsize), buf);
952}
953
954static const void *cake_get_tcpopt(const struct tcphdr *tcph,
955				   int code, int *oplen)
956{
957	/* inspired by tcp_parse_options in tcp_input.c */
958	int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr);
959	const u8 *ptr = (const u8 *)(tcph + 1);
960
961	while (length > 0) {
962		int opcode = *ptr++;
963		int opsize;
964
965		if (opcode == TCPOPT_EOL)
966			break;
967		if (opcode == TCPOPT_NOP) {
968			length--;
969			continue;
970		}
971		if (length < 2)
972			break;
973		opsize = *ptr++;
974		if (opsize < 2 || opsize > length)
975			break;
976
977		if (opcode == code) {
978			*oplen = opsize;
979			return ptr;
980		}
981
982		ptr += opsize - 2;
983		length -= opsize;
984	}
985
986	return NULL;
987}
988
989/* Compare two SACK sequences. A sequence is considered greater if it SACKs more
990 * bytes than the other. In the case where both sequences ACK bytes that the
991 * other doesn't, A is considered greater. DSACKs in A also make A be
992 * considered greater.
993 *
994 * @return -1, 0 or 1 as normal compare functions
995 */
996static int cake_tcph_sack_compare(const struct tcphdr *tcph_a,
997				  const struct tcphdr *tcph_b)
998{
999	const struct tcp_sack_block_wire *sack_a, *sack_b;
1000	u32 ack_seq_a = ntohl(tcph_a->ack_seq);
1001	u32 bytes_a = 0, bytes_b = 0;
1002	int oplen_a, oplen_b;
1003	bool first = true;
1004
1005	sack_a = cake_get_tcpopt(tcph_a, TCPOPT_SACK, &oplen_a);
1006	sack_b = cake_get_tcpopt(tcph_b, TCPOPT_SACK, &oplen_b);
1007
1008	/* pointers point to option contents */
1009	oplen_a -= TCPOLEN_SACK_BASE;
1010	oplen_b -= TCPOLEN_SACK_BASE;
1011
1012	if (sack_a && oplen_a >= sizeof(*sack_a) &&
1013	    (!sack_b || oplen_b < sizeof(*sack_b)))
1014		return -1;
1015	else if (sack_b && oplen_b >= sizeof(*sack_b) &&
1016		 (!sack_a || oplen_a < sizeof(*sack_a)))
1017		return 1;
1018	else if ((!sack_a || oplen_a < sizeof(*sack_a)) &&
1019		 (!sack_b || oplen_b < sizeof(*sack_b)))
1020		return 0;
1021
1022	while (oplen_a >= sizeof(*sack_a)) {
1023		const struct tcp_sack_block_wire *sack_tmp = sack_b;
1024		u32 start_a = get_unaligned_be32(&sack_a->start_seq);
1025		u32 end_a = get_unaligned_be32(&sack_a->end_seq);
1026		int oplen_tmp = oplen_b;
1027		bool found = false;
1028
1029		/* DSACK; always considered greater to prevent dropping */
1030		if (before(start_a, ack_seq_a))
1031			return -1;
1032
1033		bytes_a += end_a - start_a;
1034
1035		while (oplen_tmp >= sizeof(*sack_tmp)) {
1036			u32 start_b = get_unaligned_be32(&sack_tmp->start_seq);
1037			u32 end_b = get_unaligned_be32(&sack_tmp->end_seq);
1038
1039			/* first time through we count the total size */
1040			if (first)
1041				bytes_b += end_b - start_b;
1042
1043			if (!after(start_b, start_a) && !before(end_b, end_a)) {
1044				found = true;
1045				if (!first)
1046					break;
1047			}
1048			oplen_tmp -= sizeof(*sack_tmp);
1049			sack_tmp++;
1050		}
1051
1052		if (!found)
1053			return -1;
1054
1055		oplen_a -= sizeof(*sack_a);
1056		sack_a++;
1057		first = false;
1058	}
1059
1060	/* If we made it this far, all ranges SACKed by A are covered by B, so
1061	 * either the SACKs are equal, or B SACKs more bytes.
1062	 */
1063	return bytes_b > bytes_a ? 1 : 0;
1064}
1065
1066static void cake_tcph_get_tstamp(const struct tcphdr *tcph,
1067				 u32 *tsval, u32 *tsecr)
1068{
1069	const u8 *ptr;
1070	int opsize;
1071
1072	ptr = cake_get_tcpopt(tcph, TCPOPT_TIMESTAMP, &opsize);
1073
1074	if (ptr && opsize == TCPOLEN_TIMESTAMP) {
1075		*tsval = get_unaligned_be32(ptr);
1076		*tsecr = get_unaligned_be32(ptr + 4);
1077	}
1078}
1079
1080static bool cake_tcph_may_drop(const struct tcphdr *tcph,
1081			       u32 tstamp_new, u32 tsecr_new)
1082{
1083	/* inspired by tcp_parse_options in tcp_input.c */
1084	int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr);
1085	const u8 *ptr = (const u8 *)(tcph + 1);
1086	u32 tstamp, tsecr;
1087
1088	/* 3 reserved flags must be unset to avoid future breakage
1089	 * ACK must be set
1090	 * ECE/CWR are handled separately
1091	 * All other flags URG/PSH/RST/SYN/FIN must be unset
1092	 * 0x0FFF0000 = all TCP flags (confirm ACK=1, others zero)
1093	 * 0x00C00000 = CWR/ECE (handled separately)
1094	 * 0x0F3F0000 = 0x0FFF0000 & ~0x00C00000
1095	 */
1096	if (((tcp_flag_word(tcph) &
1097	      cpu_to_be32(0x0F3F0000)) != TCP_FLAG_ACK))
1098		return false;
1099
1100	while (length > 0) {
1101		int opcode = *ptr++;
1102		int opsize;
1103
1104		if (opcode == TCPOPT_EOL)
1105			break;
1106		if (opcode == TCPOPT_NOP) {
1107			length--;
1108			continue;
1109		}
1110		if (length < 2)
1111			break;
1112		opsize = *ptr++;
1113		if (opsize < 2 || opsize > length)
1114			break;
1115
1116		switch (opcode) {
1117		case TCPOPT_MD5SIG: /* doesn't influence state */
1118			break;
1119
1120		case TCPOPT_SACK: /* stricter checking performed later */
1121			if (opsize % 8 != 2)
1122				return false;
1123			break;
1124
1125		case TCPOPT_TIMESTAMP:
1126			/* only drop timestamps lower than new */
1127			if (opsize != TCPOLEN_TIMESTAMP)
1128				return false;
1129			tstamp = get_unaligned_be32(ptr);
1130			tsecr = get_unaligned_be32(ptr + 4);
1131			if (after(tstamp, tstamp_new) ||
1132			    after(tsecr, tsecr_new))
1133				return false;
1134			break;
1135
1136		case TCPOPT_MSS:  /* these should only be set on SYN */
1137		case TCPOPT_WINDOW:
1138		case TCPOPT_SACK_PERM:
1139		case TCPOPT_FASTOPEN:
1140		case TCPOPT_EXP:
1141		default: /* don't drop if any unknown options are present */
1142			return false;
1143		}
1144
1145		ptr += opsize - 2;
1146		length -= opsize;
1147	}
1148
1149	return true;
1150}
1151
1152static struct sk_buff *cake_ack_filter(struct cake_sched_data *q,
1153				       struct cake_flow *flow)
1154{
1155	bool aggressive = q->ack_filter == CAKE_ACK_AGGRESSIVE;
1156	struct sk_buff *elig_ack = NULL, *elig_ack_prev = NULL;
1157	struct sk_buff *skb_check, *skb_prev = NULL;
1158	const struct ipv6hdr *ipv6h, *ipv6h_check;
1159	unsigned char _tcph[64], _tcph_check[64];
1160	const struct tcphdr *tcph, *tcph_check;
1161	const struct iphdr *iph, *iph_check;
1162	struct ipv6hdr _iph, _iph_check;
1163	const struct sk_buff *skb;
1164	int seglen, num_found = 0;
1165	u32 tstamp = 0, tsecr = 0;
1166	__be32 elig_flags = 0;
1167	int sack_comp;
1168
1169	/* no other possible ACKs to filter */
1170	if (flow->head == flow->tail)
1171		return NULL;
1172
1173	skb = flow->tail;
1174	tcph = cake_get_tcphdr(skb, _tcph, sizeof(_tcph));
1175	iph = cake_get_iphdr(skb, &_iph);
1176	if (!tcph)
1177		return NULL;
1178
1179	cake_tcph_get_tstamp(tcph, &tstamp, &tsecr);
1180
1181	/* the 'triggering' packet need only have the ACK flag set.
1182	 * also check that SYN is not set, as there won't be any previous ACKs.
1183	 */
1184	if ((tcp_flag_word(tcph) &
1185	     (TCP_FLAG_ACK | TCP_FLAG_SYN)) != TCP_FLAG_ACK)
1186		return NULL;
1187
1188	/* the 'triggering' ACK is at the tail of the queue, we have already
1189	 * returned if it is the only packet in the flow. loop through the rest
1190	 * of the queue looking for pure ACKs with the same 5-tuple as the
1191	 * triggering one.
1192	 */
1193	for (skb_check = flow->head;
1194	     skb_check && skb_check != skb;
1195	     skb_prev = skb_check, skb_check = skb_check->next) {
1196		iph_check = cake_get_iphdr(skb_check, &_iph_check);
1197		tcph_check = cake_get_tcphdr(skb_check, &_tcph_check,
1198					     sizeof(_tcph_check));
1199
1200		/* only TCP packets with matching 5-tuple are eligible, and only
1201		 * drop safe headers
1202		 */
1203		if (!tcph_check || iph->version != iph_check->version ||
1204		    tcph_check->source != tcph->source ||
1205		    tcph_check->dest != tcph->dest)
1206			continue;
1207
1208		if (iph_check->version == 4) {
1209			if (iph_check->saddr != iph->saddr ||
1210			    iph_check->daddr != iph->daddr)
1211				continue;
1212
1213			seglen = iph_totlen(skb, iph_check) -
1214				       (4 * iph_check->ihl);
1215		} else if (iph_check->version == 6) {
1216			ipv6h = (struct ipv6hdr *)iph;
1217			ipv6h_check = (struct ipv6hdr *)iph_check;
1218
1219			if (ipv6_addr_cmp(&ipv6h_check->saddr, &ipv6h->saddr) ||
1220			    ipv6_addr_cmp(&ipv6h_check->daddr, &ipv6h->daddr))
1221				continue;
1222
1223			seglen = ntohs(ipv6h_check->payload_len);
1224		} else {
1225			WARN_ON(1);  /* shouldn't happen */
1226			continue;
1227		}
1228
1229		/* If the ECE/CWR flags changed from the previous eligible
1230		 * packet in the same flow, we should no longer be dropping that
1231		 * previous packet as this would lose information.
1232		 */
1233		if (elig_ack && (tcp_flag_word(tcph_check) &
1234				 (TCP_FLAG_ECE | TCP_FLAG_CWR)) != elig_flags) {
1235			elig_ack = NULL;
1236			elig_ack_prev = NULL;
1237			num_found--;
1238		}
1239
1240		/* Check TCP options and flags, don't drop ACKs with segment
1241		 * data, and don't drop ACKs with a higher cumulative ACK
1242		 * counter than the triggering packet. Check ACK seqno here to
1243		 * avoid parsing SACK options of packets we are going to exclude
1244		 * anyway.
1245		 */
1246		if (!cake_tcph_may_drop(tcph_check, tstamp, tsecr) ||
1247		    (seglen - __tcp_hdrlen(tcph_check)) != 0 ||
1248		    after(ntohl(tcph_check->ack_seq), ntohl(tcph->ack_seq)))
1249			continue;
1250
1251		/* Check SACK options. The triggering packet must SACK more data
1252		 * than the ACK under consideration, or SACK the same range but
1253		 * have a larger cumulative ACK counter. The latter is a
1254		 * pathological case, but is contained in the following check
1255		 * anyway, just to be safe.
1256		 */
1257		sack_comp = cake_tcph_sack_compare(tcph_check, tcph);
1258
1259		if (sack_comp < 0 ||
1260		    (ntohl(tcph_check->ack_seq) == ntohl(tcph->ack_seq) &&
1261		     sack_comp == 0))
1262			continue;
1263
1264		/* At this point we have found an eligible pure ACK to drop; if
1265		 * we are in aggressive mode, we are done. Otherwise, keep
1266		 * searching unless this is the second eligible ACK we
1267		 * found.
1268		 *
1269		 * Since we want to drop ACK closest to the head of the queue,
1270		 * save the first eligible ACK we find, even if we need to loop
1271		 * again.
1272		 */
1273		if (!elig_ack) {
1274			elig_ack = skb_check;
1275			elig_ack_prev = skb_prev;
1276			elig_flags = (tcp_flag_word(tcph_check)
1277				      & (TCP_FLAG_ECE | TCP_FLAG_CWR));
1278		}
1279
1280		if (num_found++ > 0)
1281			goto found;
1282	}
1283
1284	/* We made it through the queue without finding two eligible ACKs. If
1285	 * we found a single eligible ACK we can drop it in aggressive mode if
1286	 * we can guarantee that this does not interfere with ECN flag
1287	 * information. We ensure this by dropping it only if the enqueued
1288	 * packet is consecutive with the eligible ACK, and their flags match.
1289	 */
1290	if (elig_ack && aggressive && elig_ack->next == skb &&
1291	    (elig_flags == (tcp_flag_word(tcph) &
1292			    (TCP_FLAG_ECE | TCP_FLAG_CWR))))
1293		goto found;
1294
1295	return NULL;
1296
1297found:
1298	if (elig_ack_prev)
1299		elig_ack_prev->next = elig_ack->next;
1300	else
1301		flow->head = elig_ack->next;
1302
1303	skb_mark_not_on_list(elig_ack);
1304
1305	return elig_ack;
1306}
1307
1308static u64 cake_ewma(u64 avg, u64 sample, u32 shift)
1309{
1310	avg -= avg >> shift;
1311	avg += sample >> shift;
1312	return avg;
1313}
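
/* cake_ewma() is an exponentially weighted moving average with weight
 * 2^-shift on the new sample: avg' = avg - avg/2^shift + sample/2^shift.
 * shift == 2 moves a quarter of the way towards the sample (fast attack),
 * shift == 8 only 1/256 of the way (slow decay); cake_enqueue() below uses
 * exactly that asymmetric 2-vs-8 split for the ingress rate estimator.
 */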
1314
1315static u32 cake_calc_overhead(struct cake_sched_data *q, u32 len, u32 off)
1316{
1317	if (q->rate_flags & CAKE_FLAG_OVERHEAD)
1318		len -= off;
1319
1320	if (q->max_netlen < len)
1321		q->max_netlen = len;
1322	if (q->min_netlen > len)
1323		q->min_netlen = len;
1324
1325	len += q->rate_overhead;
1326
1327	if (len < q->rate_mpu)
1328		len = q->rate_mpu;
1329
1330	if (q->atm_mode == CAKE_ATM_ATM) {
1331		len += 47;
1332		len /= 48;
1333		len *= 53;
1334	} else if (q->atm_mode == CAKE_ATM_PTM) {
1335		/* Add one byte per 64 bytes or part thereof.
1336		 * This is conservative and easier to calculate than the
1337		 * precise value.
1338		 */
1339		len += (len + 63) / 64;
1340	}
1341
1342	if (q->max_adjlen < len)
1343		q->max_adjlen = len;
1344	if (q->min_adjlen > len)
1345		q->min_adjlen = len;
1346
1347	return len;
1348}
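
/* Worked example of the framing compensation above (the 18-byte overhead is
 * illustrative): a 100-byte packet plus 18 bytes of configured overhead is
 * 118 bytes.  In ATM mode that becomes (118 + 47) / 48 = 3 cells of 53
 * bytes, i.e. 159 bytes on the wire; in PTM mode it becomes
 * 118 + (118 + 63) / 64 = 120 bytes.  The hypothetical helper below is not
 * used by the qdisc; it is the same ATM rounding written on its own,
 * assuming the usual 48 payload bytes per 53-byte cell.
 */
static inline u32 cake_example_atm_wire_len(u32 len)
{
	return DIV_ROUND_UP(len, 48) * 53;	/* whole 53-byte cells */
}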
1349
1350static u32 cake_overhead(struct cake_sched_data *q, const struct sk_buff *skb)
1351{
1352	const struct skb_shared_info *shinfo = skb_shinfo(skb);
1353	unsigned int hdr_len, last_len = 0;
1354	u32 off = skb_network_offset(skb);
1355	u32 len = qdisc_pkt_len(skb);
1356	u16 segs = 1;
1357
1358	q->avg_netoff = cake_ewma(q->avg_netoff, off << 16, 8);
1359
1360	if (!shinfo->gso_size)
1361		return cake_calc_overhead(q, len, off);
1362
1363	/* borrowed from qdisc_pkt_len_init() */
1364	hdr_len = skb_transport_offset(skb);
1365
1366	/* + transport layer */
1367	if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 |
1368						SKB_GSO_TCPV6))) {
1369		const struct tcphdr *th;
1370		struct tcphdr _tcphdr;
1371
1372		th = skb_header_pointer(skb, hdr_len,
1373					sizeof(_tcphdr), &_tcphdr);
1374		if (likely(th))
1375			hdr_len += __tcp_hdrlen(th);
1376	} else {
1377		struct udphdr _udphdr;
1378
1379		if (skb_header_pointer(skb, hdr_len,
1380				       sizeof(_udphdr), &_udphdr))
1381			hdr_len += sizeof(struct udphdr);
1382	}
1383
1384	if (unlikely(shinfo->gso_type & SKB_GSO_DODGY))
1385		segs = DIV_ROUND_UP(skb->len - hdr_len,
1386				    shinfo->gso_size);
1387	else
1388		segs = shinfo->gso_segs;
1389
1390	len = shinfo->gso_size + hdr_len;
1391	last_len = skb->len - shinfo->gso_size * (segs - 1);
1392
1393	return (cake_calc_overhead(q, len, off) * (segs - 1) +
1394		cake_calc_overhead(q, last_len, off));
1395}
1396
1397static void cake_heap_swap(struct cake_sched_data *q, u16 i, u16 j)
1398{
1399	struct cake_heap_entry ii = q->overflow_heap[i];
1400	struct cake_heap_entry jj = q->overflow_heap[j];
1401
1402	q->overflow_heap[i] = jj;
1403	q->overflow_heap[j] = ii;
1404
1405	q->tins[ii.t].overflow_idx[ii.b] = j;
1406	q->tins[jj.t].overflow_idx[jj.b] = i;
1407}
1408
1409static u32 cake_heap_get_backlog(const struct cake_sched_data *q, u16 i)
1410{
1411	struct cake_heap_entry ii = q->overflow_heap[i];
1412
1413	return q->tins[ii.t].backlogs[ii.b];
1414}
1415
1416static void cake_heapify(struct cake_sched_data *q, u16 i)
1417{
1418	static const u32 a = CAKE_MAX_TINS * CAKE_QUEUES;
1419	u32 mb = cake_heap_get_backlog(q, i);
1420	u32 m = i;
1421
1422	while (m < a) {
1423		u32 l = m + m + 1;
1424		u32 r = l + 1;
1425
1426		if (l < a) {
1427			u32 lb = cake_heap_get_backlog(q, l);
1428
1429			if (lb > mb) {
1430				m  = l;
1431				mb = lb;
1432			}
1433		}
1434
1435		if (r < a) {
1436			u32 rb = cake_heap_get_backlog(q, r);
1437
1438			if (rb > mb) {
1439				m  = r;
1440				mb = rb;
1441			}
1442		}
1443
1444		if (m != i) {
1445			cake_heap_swap(q, i, m);
1446			i = m;
1447		} else {
1448			break;
1449		}
1450	}
1451}
1452
1453static void cake_heapify_up(struct cake_sched_data *q, u16 i)
1454{
1455	while (i > 0 && i < CAKE_MAX_TINS * CAKE_QUEUES) {
1456		u16 p = (i - 1) >> 1;
1457		u32 ib = cake_heap_get_backlog(q, i);
1458		u32 pb = cake_heap_get_backlog(q, p);
1459
1460		if (ib > pb) {
1461			cake_heap_swap(q, i, p);
1462			i = p;
1463		} else {
1464			break;
1465		}
1466	}
1467}
1468
1469static int cake_advance_shaper(struct cake_sched_data *q,
1470			       struct cake_tin_data *b,
1471			       struct sk_buff *skb,
1472			       ktime_t now, bool drop)
1473{
1474	u32 len = get_cobalt_cb(skb)->adjusted_len;
1475
1476	/* charge packet bandwidth to this tin
1477	 * and to the global shaper.
1478	 */
1479	if (q->rate_ns) {
1480		u64 tin_dur = (len * b->tin_rate_ns) >> b->tin_rate_shft;
1481		u64 global_dur = (len * q->rate_ns) >> q->rate_shft;
1482		u64 failsafe_dur = global_dur + (global_dur >> 1);
1483
1484		if (ktime_before(b->time_next_packet, now))
1485			b->time_next_packet = ktime_add_ns(b->time_next_packet,
1486							   tin_dur);
1487
1488		else if (ktime_before(b->time_next_packet,
1489				      ktime_add_ns(now, tin_dur)))
1490			b->time_next_packet = ktime_add_ns(now, tin_dur);
1491
1492		q->time_next_packet = ktime_add_ns(q->time_next_packet,
1493						   global_dur);
1494		if (!drop)
1495			q->failsafe_next_packet = \
1496				ktime_add_ns(q->failsafe_next_packet,
1497					     failsafe_dur);
1498	}
1499	return len;
1500}
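
/* Worked example (illustrative numbers): with an adjusted length of 1518
 * bytes at a 50 Mbit/s global rate (160 ns per byte), the global clock
 * advances by about 243 us and the failsafe clock by 1.5x that, about
 * 364 us.  Since cake_dequeue() only throttles while *both* the shaper and
 * failsafe clocks are in the future, the failsafe bounds how long a
 * mis-estimated rate can stall the queue.
 */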
1501
1502static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free)
1503{
1504	struct cake_sched_data *q = qdisc_priv(sch);
1505	ktime_t now = ktime_get();
1506	u32 idx = 0, tin = 0, len;
1507	struct cake_heap_entry qq;
1508	struct cake_tin_data *b;
1509	struct cake_flow *flow;
1510	struct sk_buff *skb;
1511
1512	if (!q->overflow_timeout) {
1513		int i;
1514		/* Build fresh max-heap */
1515		for (i = CAKE_MAX_TINS * CAKE_QUEUES / 2; i >= 0; i--)
1516			cake_heapify(q, i);
1517	}
1518	q->overflow_timeout = 65535;
1519
1520	/* select longest queue for pruning */
1521	qq  = q->overflow_heap[0];
1522	tin = qq.t;
1523	idx = qq.b;
1524
1525	b = &q->tins[tin];
1526	flow = &b->flows[idx];
1527	skb = dequeue_head(flow);
1528	if (unlikely(!skb)) {
1529		/* heap has gone wrong, rebuild it next time */
1530		q->overflow_timeout = 0;
1531		return idx + (tin << 16);
1532	}
1533
1534	if (cobalt_queue_full(&flow->cvars, &b->cparams, now))
1535		b->unresponsive_flow_count++;
1536
1537	len = qdisc_pkt_len(skb);
1538	q->buffer_used      -= skb->truesize;
1539	b->backlogs[idx]    -= len;
1540	b->tin_backlog      -= len;
1541	sch->qstats.backlog -= len;
1542	qdisc_tree_reduce_backlog(sch, 1, len);
1543
1544	flow->dropped++;
1545	b->tin_dropped++;
1546	sch->qstats.drops++;
1547
1548	if (q->rate_flags & CAKE_FLAG_INGRESS)
1549		cake_advance_shaper(q, b, skb, now, true);
1550
1551	__qdisc_drop(skb, to_free);
1552	sch->q.qlen--;
1553
1554	cake_heapify(q, 0);
1555
1556	return idx + (tin << 16);
1557}
1558
1559static u8 cake_handle_diffserv(struct sk_buff *skb, bool wash)
1560{
1561	const int offset = skb_network_offset(skb);
1562	u16 *buf, buf_;
1563	u8 dscp;
1564
1565	switch (skb_protocol(skb, true)) {
1566	case htons(ETH_P_IP):
1567		buf = skb_header_pointer(skb, offset, sizeof(buf_), &buf_);
1568		if (unlikely(!buf))
1569			return 0;
1570
1571		/* ToS is in the second byte of iphdr */
1572		dscp = ipv4_get_dsfield((struct iphdr *)buf) >> 2;
1573
1574		if (wash && dscp) {
1575			const int wlen = offset + sizeof(struct iphdr);
1576
1577			if (!pskb_may_pull(skb, wlen) ||
1578			    skb_try_make_writable(skb, wlen))
1579				return 0;
1580
1581			ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, 0);
1582		}
1583
1584		return dscp;
1585
1586	case htons(ETH_P_IPV6):
1587		buf = skb_header_pointer(skb, offset, sizeof(buf_), &buf_);
1588		if (unlikely(!buf))
1589			return 0;
1590
1591		/* Traffic class is in the first and second bytes of ipv6hdr */
1592		dscp = ipv6_get_dsfield((struct ipv6hdr *)buf) >> 2;
1593
1594		if (wash && dscp) {
1595			const int wlen = offset + sizeof(struct ipv6hdr);
1596
1597			if (!pskb_may_pull(skb, wlen) ||
1598			    skb_try_make_writable(skb, wlen))
1599				return 0;
1600
1601			ipv6_change_dsfield(ipv6_hdr(skb), INET_ECN_MASK, 0);
1602		}
1603
1604		return dscp;
1605
1606	case htons(ETH_P_ARP):
1607		return 0x38;  /* CS7 - Net Control */
1608
1609	default:
1610		/* If there is no Diffserv field, treat as best-effort */
1611		return 0;
1612	}
1613}
1614
1615static struct cake_tin_data *cake_select_tin(struct Qdisc *sch,
1616					     struct sk_buff *skb)
1617{
1618	struct cake_sched_data *q = qdisc_priv(sch);
1619	u32 tin, mark;
1620	bool wash;
1621	u8 dscp;
1622
1623	/* Tin selection: Default to diffserv-based selection, allow overriding
1624	 * using firewall marks or skb->priority. Call DSCP parsing early if
1625	 * wash is enabled, otherwise defer to below to skip unneeded parsing.
1626	 */
1627	mark = (skb->mark & q->fwmark_mask) >> q->fwmark_shft;
1628	wash = !!(q->rate_flags & CAKE_FLAG_WASH);
1629	if (wash)
1630		dscp = cake_handle_diffserv(skb, wash);
1631
1632	if (q->tin_mode == CAKE_DIFFSERV_BESTEFFORT)
1633		tin = 0;
1634
1635	else if (mark && mark <= q->tin_cnt)
1636		tin = q->tin_order[mark - 1];
1637
1638	else if (TC_H_MAJ(skb->priority) == sch->handle &&
1639		 TC_H_MIN(skb->priority) > 0 &&
1640		 TC_H_MIN(skb->priority) <= q->tin_cnt)
1641		tin = q->tin_order[TC_H_MIN(skb->priority) - 1];
1642
1643	else {
1644		if (!wash)
1645			dscp = cake_handle_diffserv(skb, wash);
1646		tin = q->tin_index[dscp];
1647
1648		if (unlikely(tin >= q->tin_cnt))
1649			tin = 0;
1650	}
1651
1652	return &q->tins[tin];
1653}
1654
1655static u32 cake_classify(struct Qdisc *sch, struct cake_tin_data **t,
1656			 struct sk_buff *skb, int flow_mode, int *qerr)
1657{
1658	struct cake_sched_data *q = qdisc_priv(sch);
1659	struct tcf_proto *filter;
1660	struct tcf_result res;
1661	u16 flow = 0, host = 0;
1662	int result;
1663
1664	filter = rcu_dereference_bh(q->filter_list);
1665	if (!filter)
1666		goto hash;
1667
1668	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
1669	result = tcf_classify(skb, NULL, filter, &res, false);
1670
1671	if (result >= 0) {
1672#ifdef CONFIG_NET_CLS_ACT
1673		switch (result) {
1674		case TC_ACT_STOLEN:
1675		case TC_ACT_QUEUED:
1676		case TC_ACT_TRAP:
1677			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
1678			fallthrough;
1679		case TC_ACT_SHOT:
1680			return 0;
1681		}
1682#endif
1683		if (TC_H_MIN(res.classid) <= CAKE_QUEUES)
1684			flow = TC_H_MIN(res.classid);
1685		if (TC_H_MAJ(res.classid) <= (CAKE_QUEUES << 16))
1686			host = TC_H_MAJ(res.classid) >> 16;
1687	}
1688hash:
1689	*t = cake_select_tin(sch, skb);
1690	return cake_hash(*t, skb, flow_mode, flow, host) + 1;
1691}
1692
1693static void cake_reconfigure(struct Qdisc *sch);
1694
1695static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
1696			struct sk_buff **to_free)
1697{
1698	struct cake_sched_data *q = qdisc_priv(sch);
1699	int len = qdisc_pkt_len(skb);
1700	int ret;
1701	struct sk_buff *ack = NULL;
1702	ktime_t now = ktime_get();
1703	struct cake_tin_data *b;
1704	struct cake_flow *flow;
1705	u32 idx;
1706
1707	/* choose flow to insert into */
1708	idx = cake_classify(sch, &b, skb, q->flow_mode, &ret);
1709	if (idx == 0) {
1710		if (ret & __NET_XMIT_BYPASS)
1711			qdisc_qstats_drop(sch);
1712		__qdisc_drop(skb, to_free);
1713		return ret;
1714	}
1715	idx--;
1716	flow = &b->flows[idx];
1717
1718	/* ensure shaper state isn't stale */
1719	if (!b->tin_backlog) {
1720		if (ktime_before(b->time_next_packet, now))
1721			b->time_next_packet = now;
1722
1723		if (!sch->q.qlen) {
1724			if (ktime_before(q->time_next_packet, now)) {
1725				q->failsafe_next_packet = now;
1726				q->time_next_packet = now;
1727			} else if (ktime_after(q->time_next_packet, now) &&
1728				   ktime_after(q->failsafe_next_packet, now)) {
1729				u64 next = \
1730					min(ktime_to_ns(q->time_next_packet),
1731					    ktime_to_ns(
1732						   q->failsafe_next_packet));
1733				sch->qstats.overlimits++;
1734				qdisc_watchdog_schedule_ns(&q->watchdog, next);
1735			}
1736		}
1737	}
1738
1739	if (unlikely(len > b->max_skblen))
1740		b->max_skblen = len;
1741
1742	if (skb_is_gso(skb) && q->rate_flags & CAKE_FLAG_SPLIT_GSO) {
1743		struct sk_buff *segs, *nskb;
1744		netdev_features_t features = netif_skb_features(skb);
1745		unsigned int slen = 0, numsegs = 0;
1746
1747		segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
1748		if (IS_ERR_OR_NULL(segs))
1749			return qdisc_drop(skb, sch, to_free);
1750
1751		skb_list_walk_safe(segs, segs, nskb) {
1752			skb_mark_not_on_list(segs);
1753			qdisc_skb_cb(segs)->pkt_len = segs->len;
1754			cobalt_set_enqueue_time(segs, now);
1755			get_cobalt_cb(segs)->adjusted_len = cake_overhead(q,
1756									  segs);
1757			flow_queue_add(flow, segs);
1758
1759			sch->q.qlen++;
1760			numsegs++;
1761			slen += segs->len;
1762			q->buffer_used += segs->truesize;
1763			b->packets++;
1764		}
1765
1766		/* stats */
1767		b->bytes	    += slen;
1768		b->backlogs[idx]    += slen;
1769		b->tin_backlog      += slen;
1770		sch->qstats.backlog += slen;
1771		q->avg_window_bytes += slen;
1772
1773		qdisc_tree_reduce_backlog(sch, 1-numsegs, len-slen);
1774		consume_skb(skb);
1775	} else {
1776		/* not splitting */
1777		cobalt_set_enqueue_time(skb, now);
1778		get_cobalt_cb(skb)->adjusted_len = cake_overhead(q, skb);
1779		flow_queue_add(flow, skb);
1780
1781		if (q->ack_filter)
1782			ack = cake_ack_filter(q, flow);
1783
1784		if (ack) {
1785			b->ack_drops++;
1786			sch->qstats.drops++;
1787			b->bytes += qdisc_pkt_len(ack);
1788			len -= qdisc_pkt_len(ack);
1789			q->buffer_used += skb->truesize - ack->truesize;
1790			if (q->rate_flags & CAKE_FLAG_INGRESS)
1791				cake_advance_shaper(q, b, ack, now, true);
1792
1793			qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(ack));
1794			consume_skb(ack);
1795		} else {
1796			sch->q.qlen++;
1797			q->buffer_used      += skb->truesize;
1798		}
1799
1800		/* stats */
1801		b->packets++;
1802		b->bytes	    += len;
1803		b->backlogs[idx]    += len;
1804		b->tin_backlog      += len;
1805		sch->qstats.backlog += len;
1806		q->avg_window_bytes += len;
1807	}
1808
1809	if (q->overflow_timeout)
1810		cake_heapify_up(q, b->overflow_idx[idx]);
1811
1812	/* incoming bandwidth capacity estimate */
1813	if (q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS) {
1814		u64 packet_interval = \
1815			ktime_to_ns(ktime_sub(now, q->last_packet_time));
1816
1817		if (packet_interval > NSEC_PER_SEC)
1818			packet_interval = NSEC_PER_SEC;
1819
1820		/* filter out short-term bursts, eg. wifi aggregation */
1821		q->avg_packet_interval = \
1822			cake_ewma(q->avg_packet_interval,
1823				  packet_interval,
1824				  (packet_interval > q->avg_packet_interval ?
1825					  2 : 8));
1826
1827		q->last_packet_time = now;
1828
1829		if (packet_interval > q->avg_packet_interval) {
1830			u64 window_interval = \
1831				ktime_to_ns(ktime_sub(now,
1832						      q->avg_window_begin));
1833			u64 b = q->avg_window_bytes * (u64)NSEC_PER_SEC;
1834
1835			b = div64_u64(b, window_interval);
1836			q->avg_peak_bandwidth =
1837				cake_ewma(q->avg_peak_bandwidth, b,
1838					  b > q->avg_peak_bandwidth ? 2 : 8);
1839			q->avg_window_bytes = 0;
1840			q->avg_window_begin = now;
1841
1842			if (ktime_after(now,
1843					ktime_add_ms(q->last_reconfig_time,
1844						     250))) {
1845				q->rate_bps = (q->avg_peak_bandwidth * 15) >> 4;
1846				cake_reconfigure(sch);
1847			}
1848		}
1849	} else {
1850		q->avg_window_bytes = 0;
1851		q->last_packet_time = now;
1852	}
1853
1854	/* flowchain */
1855	if (!flow->set || flow->set == CAKE_SET_DECAYING) {
1856		struct cake_host *srchost = &b->hosts[flow->srchost];
1857		struct cake_host *dsthost = &b->hosts[flow->dsthost];
1858		u16 host_load = 1;
1859
1860		if (!flow->set) {
1861			list_add_tail(&flow->flowchain, &b->new_flows);
1862		} else {
1863			b->decaying_flow_count--;
1864			list_move_tail(&flow->flowchain, &b->new_flows);
1865		}
1866		flow->set = CAKE_SET_SPARSE;
1867		b->sparse_flow_count++;
1868
1869		if (cake_dsrc(q->flow_mode))
1870			host_load = max(host_load, srchost->srchost_bulk_flow_count);
1871
1872		if (cake_ddst(q->flow_mode))
1873			host_load = max(host_load, dsthost->dsthost_bulk_flow_count);
1874
1875		flow->deficit = (b->flow_quantum *
1876				 quantum_div[host_load]) >> 16;
1877	} else if (flow->set == CAKE_SET_SPARSE_WAIT) {
1878		struct cake_host *srchost = &b->hosts[flow->srchost];
1879		struct cake_host *dsthost = &b->hosts[flow->dsthost];
1880
1881		/* this flow was empty, accounted as a sparse flow, but actually
1882		 * in the bulk rotation.
1883		 */
1884		flow->set = CAKE_SET_BULK;
1885		b->sparse_flow_count--;
1886		b->bulk_flow_count++;
1887
1888		if (cake_dsrc(q->flow_mode))
1889			srchost->srchost_bulk_flow_count++;
1890
1891		if (cake_ddst(q->flow_mode))
1892			dsthost->dsthost_bulk_flow_count++;
1893
1894	}
1895
1896	if (q->buffer_used > q->buffer_max_used)
1897		q->buffer_max_used = q->buffer_used;
1898
1899	if (q->buffer_used > q->buffer_limit) {
1900		u32 dropped = 0;
1901
1902		while (q->buffer_used > q->buffer_limit) {
1903			dropped++;
1904			cake_drop(sch, to_free);
1905		}
1906		b->drop_overlimit += dropped;
1907	}
1908	return NET_XMIT_SUCCESS;
1909}
1910
1911static struct sk_buff *cake_dequeue_one(struct Qdisc *sch)
1912{
1913	struct cake_sched_data *q = qdisc_priv(sch);
1914	struct cake_tin_data *b = &q->tins[q->cur_tin];
1915	struct cake_flow *flow = &b->flows[q->cur_flow];
1916	struct sk_buff *skb = NULL;
1917	u32 len;
1918
1919	if (flow->head) {
1920		skb = dequeue_head(flow);
1921		len = qdisc_pkt_len(skb);
1922		b->backlogs[q->cur_flow] -= len;
1923		b->tin_backlog		 -= len;
1924		sch->qstats.backlog      -= len;
1925		q->buffer_used		 -= skb->truesize;
1926		sch->q.qlen--;
1927
1928		if (q->overflow_timeout)
1929			cake_heapify(q, b->overflow_idx[q->cur_flow]);
1930	}
1931	return skb;
1932}
1933
1934/* Discard leftover packets from a tin no longer in use. */
1935static void cake_clear_tin(struct Qdisc *sch, u16 tin)
1936{
1937	struct cake_sched_data *q = qdisc_priv(sch);
1938	struct sk_buff *skb;
1939
1940	q->cur_tin = tin;
1941	for (q->cur_flow = 0; q->cur_flow < CAKE_QUEUES; q->cur_flow++)
1942		while (!!(skb = cake_dequeue_one(sch)))
1943			kfree_skb(skb);
1944}
1945
1946static struct sk_buff *cake_dequeue(struct Qdisc *sch)
1947{
1948	struct cake_sched_data *q = qdisc_priv(sch);
1949	struct cake_tin_data *b = &q->tins[q->cur_tin];
1950	struct cake_host *srchost, *dsthost;
1951	ktime_t now = ktime_get();
1952	struct cake_flow *flow;
1953	struct list_head *head;
1954	bool first_flow = true;
1955	struct sk_buff *skb;
1956	u16 host_load;
1957	u64 delay;
1958	u32 len;
1959
1960begin:
1961	if (!sch->q.qlen)
1962		return NULL;
1963
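	/* The shaper keeps two wake-up times, time_next_packet and
	 * failsafe_next_packet.  Dequeue stalls only while both are in the
	 * future, and the watchdog is armed for whichever expires first, so
	 * one schedule being pushed far ahead cannot stall the qdisc.
	 */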
1964	/* global hard shaper */
1965	if (ktime_after(q->time_next_packet, now) &&
1966	    ktime_after(q->failsafe_next_packet, now)) {
1967		u64 next = min(ktime_to_ns(q->time_next_packet),
1968			       ktime_to_ns(q->failsafe_next_packet));
1969
1970		sch->qstats.overlimits++;
1971		qdisc_watchdog_schedule_ns(&q->watchdog, next);
1972		return NULL;
1973	}
1974
1975	/* Choose a class to work on. */
1976	if (!q->rate_ns) {
1977		/* In unlimited mode, can't rely on shaper timings, just balance
1978		 * with DRR
1979		 */
1980		bool wrapped = false, empty = true;
1981
1982		while (b->tin_deficit < 0 ||
1983		       !(b->sparse_flow_count + b->bulk_flow_count)) {
1984			if (b->tin_deficit <= 0)
1985				b->tin_deficit += b->tin_quantum;
1986			if (b->sparse_flow_count + b->bulk_flow_count)
1987				empty = false;
1988
1989			q->cur_tin++;
1990			b++;
1991			if (q->cur_tin >= q->tin_cnt) {
1992				q->cur_tin = 0;
1993				b = q->tins;
1994
1995				if (wrapped) {
1996					/* It's possible for q->qlen to be
1997					 * nonzero when we actually have no
1998					 * packets anywhere.
1999					 */
2000					if (empty)
2001						return NULL;
2002				} else {
2003					wrapped = true;
2004				}
2005			}
2006		}
2007	} else {
2008		/* In shaped mode, choose:
2009		 * - Highest-priority tin with queue and meeting schedule, or
2010		 * - The earliest-scheduled tin with queue.
2011		 */
2012		ktime_t best_time = KTIME_MAX;
2013		int tin, best_tin = 0;
2014
2015		for (tin = 0; tin < q->tin_cnt; tin++) {
2016			b = q->tins + tin;
2017			if ((b->sparse_flow_count + b->bulk_flow_count) > 0) {
2018				ktime_t time_to_pkt =
2019					ktime_sub(b->time_next_packet, now);
2020
2021				if (ktime_to_ns(time_to_pkt) <= 0 ||
2022				    ktime_compare(time_to_pkt,
2023						  best_time) <= 0) {
2024					best_time = time_to_pkt;
2025					best_tin = tin;
2026				}
2027			}
2028		}
2029
2030		q->cur_tin = best_tin;
2031		b = q->tins + best_tin;
2032
2033		/* No point in going further if no packets to deliver. */
2034		if (unlikely(!(b->sparse_flow_count + b->bulk_flow_count)))
2035			return NULL;
2036	}
2037
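	/* Flow service order within the tin: for the first flow of this
	 * dequeue call the decaying list is tried first; after that, new
	 * (sparse) flows are served ahead of old (bulk) flows, falling back
	 * to the decaying list and finally to re-choosing a tin.
	 */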
2038retry:
2039	/* service this class */
2040	head = &b->decaying_flows;
2041	if (!first_flow || list_empty(head)) {
2042		head = &b->new_flows;
2043		if (list_empty(head)) {
2044			head = &b->old_flows;
2045			if (unlikely(list_empty(head))) {
2046				head = &b->decaying_flows;
2047				if (unlikely(list_empty(head)))
2048					goto begin;
2049			}
2050		}
2051	}
2052	flow = list_first_entry(head, struct cake_flow, flowchain);
2053	q->cur_flow = flow - b->flows;
2054	first_flow = false;
2055
2056	/* triple isolation (modified DRR++) */
2057	srchost = &b->hosts[flow->srchost];
2058	dsthost = &b->hosts[flow->dsthost];
2059	host_load = 1;
2060
2061	/* flow isolation (DRR++) */
2062	if (flow->deficit <= 0) {
2063		/* Keep flows that have exhausted their deficit out of the
2064		 * sparse and decaying rotations.  A non-empty flow can never be
2065		 * in the decaying rotation, so flows there never need a deficit.
2066		 */
2067		if (flow->set == CAKE_SET_SPARSE) {
2068			if (flow->head) {
2069				b->sparse_flow_count--;
2070				b->bulk_flow_count++;
2071
2072				if (cake_dsrc(q->flow_mode))
2073					srchost->srchost_bulk_flow_count++;
2074
2075				if (cake_ddst(q->flow_mode))
2076					dsthost->dsthost_bulk_flow_count++;
2077
2078				flow->set = CAKE_SET_BULK;
2079			} else {
2080				/* we've moved it to the bulk rotation for
2081				 * correct deficit accounting but we still want
2082				 * to count it as a sparse flow, not a bulk one.
2083				 */
2084				flow->set = CAKE_SET_SPARSE_WAIT;
2085			}
2086		}
2087
2088		if (cake_dsrc(q->flow_mode))
2089			host_load = max(host_load, srchost->srchost_bulk_flow_count);
2090
2091		if (cake_ddst(q->flow_mode))
2092			host_load = max(host_load, dsthost->dsthost_bulk_flow_count);
2093
2094		WARN_ON(host_load > CAKE_QUEUES);
2095
2096		/* Adding get_random_u16() dithers the >>16 truncation below, so
2097		 * rounding errors do not accumulate across successive refills.
2098		 */
2099		flow->deficit += (b->flow_quantum * quantum_div[host_load] +
2100				  get_random_u16()) >> 16;
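		/* e.g. with flow_quantum = 1514 and host_load = 3,
		 * quantum_div[3] = 21845 and (1514 * 21845 + rnd) >> 16 is
		 * 504 or 505, averaging ~1514/3, so per-host fairness costs
		 * no division on this path.
		 */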
2101		list_move_tail(&flow->flowchain, &b->old_flows);
2102
2103		goto retry;
2104	}
2105
2106	/* Retrieve a packet via the AQM */
2107	while (1) {
2108		skb = cake_dequeue_one(sch);
2109		if (!skb) {
2110			/* this queue was actually empty */
2111			if (cobalt_queue_empty(&flow->cvars, &b->cparams, now))
2112				b->unresponsive_flow_count--;
2113
2114			if (flow->cvars.p_drop || flow->cvars.count ||
2115			    ktime_before(now, flow->cvars.drop_next)) {
2116				/* keep in the flowchain until the state has
2117				 * decayed to rest
2118				 */
2119				list_move_tail(&flow->flowchain,
2120					       &b->decaying_flows);
2121				if (flow->set == CAKE_SET_BULK) {
2122					b->bulk_flow_count--;
2123
2124					if (cake_dsrc(q->flow_mode))
2125						srchost->srchost_bulk_flow_count--;
2126
2127					if (cake_ddst(q->flow_mode))
2128						dsthost->dsthost_bulk_flow_count--;
2129
2130					b->decaying_flow_count++;
2131				} else if (flow->set == CAKE_SET_SPARSE ||
2132					   flow->set == CAKE_SET_SPARSE_WAIT) {
2133					b->sparse_flow_count--;
2134					b->decaying_flow_count++;
2135				}
2136				flow->set = CAKE_SET_DECAYING;
2137			} else {
2138				/* remove empty queue from the flowchain */
2139				list_del_init(&flow->flowchain);
2140				if (flow->set == CAKE_SET_SPARSE ||
2141				    flow->set == CAKE_SET_SPARSE_WAIT)
2142					b->sparse_flow_count--;
2143				else if (flow->set == CAKE_SET_BULK) {
2144					b->bulk_flow_count--;
2145
2146					if (cake_dsrc(q->flow_mode))
2147						srchost->srchost_bulk_flow_count--;
2148
2149					if (cake_ddst(q->flow_mode))
2150						dsthost->dsthost_bulk_flow_count--;
2151
2152				} else
2153					b->decaying_flow_count--;
2154
2155				flow->set = CAKE_SET_NONE;
2156			}
2157			goto begin;
2158		}
2159
2160		/* Last packet in queue may be marked, shouldn't be dropped */
2161		if (!cobalt_should_drop(&flow->cvars, &b->cparams, now, skb,
2162					(b->bulk_flow_count *
2163					 !!(q->rate_flags &
2164					    CAKE_FLAG_INGRESS))) ||
2165		    !flow->head)
2166			break;
2167
2168		/* drop this packet, get another one */
2169		if (q->rate_flags & CAKE_FLAG_INGRESS) {
2170			len = cake_advance_shaper(q, b, skb,
2171						  now, true);
2172			flow->deficit -= len;
2173			b->tin_deficit -= len;
2174		}
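		/* In ingress mode the shaper is charged even for packets we
		 * drop, since they already crossed the bottleneck link before
		 * reaching this qdisc; the deficit adjustments above keep the
		 * DRR accounting consistent with that.
		 */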
2175		flow->dropped++;
2176		b->tin_dropped++;
2177		qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb));
2178		qdisc_qstats_drop(sch);
2179		kfree_skb(skb);
2180		if (q->rate_flags & CAKE_FLAG_INGRESS)
2181			goto retry;
2182	}
2183
2184	b->tin_ecn_mark += !!flow->cvars.ecn_marked;
2185	qdisc_bstats_update(sch, skb);
2186
2187	/* collect delay stats */
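	/* Three EWMAs over the measured sojourn time: avge_delay is a plain
	 * 1/256-weighted average, peak_delay rises fast (1/4) and decays
	 * slowly (1/256), and base_delay does the opposite, tracking the
	 * lowest delays seen.
	 */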
2188	delay = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb)));
2189	b->avge_delay = cake_ewma(b->avge_delay, delay, 8);
2190	b->peak_delay = cake_ewma(b->peak_delay, delay,
2191				  delay > b->peak_delay ? 2 : 8);
2192	b->base_delay = cake_ewma(b->base_delay, delay,
2193				  delay < b->base_delay ? 2 : 8);
2194
2195	len = cake_advance_shaper(q, b, skb, now, false);
2196	flow->deficit -= len;
2197	b->tin_deficit -= len;
2198
2199	if (ktime_after(q->time_next_packet, now) && sch->q.qlen) {
2200		u64 next = min(ktime_to_ns(q->time_next_packet),
2201			       ktime_to_ns(q->failsafe_next_packet));
2202
2203		qdisc_watchdog_schedule_ns(&q->watchdog, next);
2204	} else if (!sch->q.qlen) {
2205		int i;
2206
2207		for (i = 0; i < q->tin_cnt; i++) {
2208			if (q->tins[i].decaying_flow_count) {
2209				ktime_t next =
2210					ktime_add_ns(now,
2211						     q->tins[i].cparams.target);
2212
2213				qdisc_watchdog_schedule_ns(&q->watchdog,
2214							   ktime_to_ns(next));
2215				break;
2216			}
2217		}
2218	}
2219
2220	if (q->overflow_timeout)
2221		q->overflow_timeout--;
2222
2223	return skb;
2224}
2225
2226static void cake_reset(struct Qdisc *sch)
2227{
2228	struct cake_sched_data *q = qdisc_priv(sch);
2229	u32 c;
2230
2231	if (!q->tins)
2232		return;
2233
2234	for (c = 0; c < CAKE_MAX_TINS; c++)
2235		cake_clear_tin(sch, c);
2236}
2237
2238static const struct nla_policy cake_policy[TCA_CAKE_MAX + 1] = {
2239	[TCA_CAKE_BASE_RATE64]   = { .type = NLA_U64 },
2240	[TCA_CAKE_DIFFSERV_MODE] = { .type = NLA_U32 },
2241	[TCA_CAKE_ATM]		 = { .type = NLA_U32 },
2242	[TCA_CAKE_FLOW_MODE]     = { .type = NLA_U32 },
2243	[TCA_CAKE_OVERHEAD]      = { .type = NLA_S32 },
2244	[TCA_CAKE_RTT]		 = { .type = NLA_U32 },
2245	[TCA_CAKE_TARGET]	 = { .type = NLA_U32 },
2246	[TCA_CAKE_AUTORATE]      = { .type = NLA_U32 },
2247	[TCA_CAKE_MEMORY]	 = { .type = NLA_U32 },
2248	[TCA_CAKE_NAT]		 = { .type = NLA_U32 },
2249	[TCA_CAKE_RAW]		 = { .type = NLA_U32 },
2250	[TCA_CAKE_WASH]		 = { .type = NLA_U32 },
2251	[TCA_CAKE_MPU]		 = { .type = NLA_U32 },
2252	[TCA_CAKE_INGRESS]	 = { .type = NLA_U32 },
2253	[TCA_CAKE_ACK_FILTER]	 = { .type = NLA_U32 },
2254	[TCA_CAKE_SPLIT_GSO]	 = { .type = NLA_U32 },
2255	[TCA_CAKE_FWMARK]	 = { .type = NLA_U32 },
2256};
2257
2258static void cake_set_rate(struct cake_tin_data *b, u64 rate, u32 mtu,
2259			  u64 target_ns, u64 rtt_est_ns)
2260{
2261	/* convert byte-rate into time-per-byte, clamping the rate to MIN_RATE
2262	 * so the resulting schedule always unwedges in reasonable time.
2263	 */
2264	static const u64 MIN_RATE = 64;
2265	u32 byte_target = mtu;
2266	u64 byte_target_ns;
2267	u8  rate_shft = 0;
2268	u64 rate_ns = 0;
2269
2270	b->flow_quantum = 1514;
2271	if (rate) {
2272		b->flow_quantum = max(min(rate >> 12, 1514ULL), 300ULL);
2273		rate_shft = 34;
2274		rate_ns = ((u64)NSEC_PER_SEC) << rate_shft;
2275		rate_ns = div64_u64(rate_ns, max(MIN_RATE, rate));
2276		while (!!(rate_ns >> 34)) {
2277			rate_ns >>= 1;
2278			rate_shft--;
2279		}
2280	} /* else unlimited, ie. zero delay */
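	/* rate_ns / 2^rate_shft is nanoseconds per byte, normalised so that
	 * rate_ns stays below 2^34 and later len * rate_ns products fit
	 * comfortably in 64 bits.  For example, assuming rate is in bytes
	 * per second (as the autorate estimator produces): 12.5 Mbyte/s
	 * (100 Mbit/s) is 80 ns per byte, stored as rate_ns ~= 80 << 27 with
	 * rate_shft = 27.
	 */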
2281
2282	b->tin_rate_bps  = rate;
2283	b->tin_rate_ns   = rate_ns;
2284	b->tin_rate_shft = rate_shft;
2285
2286	byte_target_ns = (byte_target * rate_ns) >> rate_shft;
2287
2288	b->cparams.target = max((byte_target_ns * 3) / 2, target_ns);
2289	b->cparams.interval = max(rtt_est_ns +
2290				     b->cparams.target - target_ns,
2291				     b->cparams.target * 2);
2292	b->cparams.mtu_time = byte_target_ns;
2293	b->cparams.p_inc = 1 << 24; /* 1/256 */
2294	b->cparams.p_dec = 1 << 20; /* 1/4096 */
2295}
2296
2297static int cake_config_besteffort(struct Qdisc *sch)
2298{
2299	struct cake_sched_data *q = qdisc_priv(sch);
2300	struct cake_tin_data *b = &q->tins[0];
2301	u32 mtu = psched_mtu(qdisc_dev(sch));
2302	u64 rate = q->rate_bps;
2303
2304	q->tin_cnt = 1;
2305
2306	q->tin_index = besteffort;
2307	q->tin_order = normal_order;
2308
2309	cake_set_rate(b, rate, mtu,
2310		      us_to_ns(q->target), us_to_ns(q->interval));
2311	b->tin_quantum = 65535;
2312
2313	return 0;
2314}
2315
2316static int cake_config_precedence(struct Qdisc *sch)
2317{
2318	/* convert high-level (user visible) parameters into internal format */
2319	struct cake_sched_data *q = qdisc_priv(sch);
2320	u32 mtu = psched_mtu(qdisc_dev(sch));
2321	u64 rate = q->rate_bps;
2322	u32 quantum = 256;
2323	u32 i;
2324
2325	q->tin_cnt = 8;
2326	q->tin_index = precedence;
2327	q->tin_order = normal_order;
2328
2329	for (i = 0; i < q->tin_cnt; i++) {
2330		struct cake_tin_data *b = &q->tins[i];
2331
2332		cake_set_rate(b, rate, mtu, us_to_ns(q->target),
2333			      us_to_ns(q->interval));
2334
2335		b->tin_quantum = max_t(u16, 1U, quantum);
2336
2337		/* calculate next class's parameters: 7/8 of the previous tin's rate and quantum */
2338		rate  *= 7;
2339		rate >>= 3;
2340
2341		quantum  *= 7;
2342		quantum >>= 3;
2343	}
2344
2345	return 0;
2346}
2347
2348/*	List of known Diffserv codepoints:
2349 *
2350 *	Default Forwarding (DF/CS0) - Best Effort
2351 *	Max Throughput (TOS2)
2352 *	Min Delay (TOS4)
2353 *	LLT "La" (TOS5)
2354 *	Assured Forwarding 1 (AF1x) - x3
2355 *	Assured Forwarding 2 (AF2x) - x3
2356 *	Assured Forwarding 3 (AF3x) - x3
2357 *	Assured Forwarding 4 (AF4x) - x3
2358 *	Precedence Class 1 (CS1)
2359 *	Precedence Class 2 (CS2)
2360 *	Precedence Class 3 (CS3)
2361 *	Precedence Class 4 (CS4)
2362 *	Precedence Class 5 (CS5)
2363 *	Precedence Class 6 (CS6)
2364 *	Precedence Class 7 (CS7)
2365 *	Voice Admit (VA)
2366 *	Expedited Forwarding (EF)
2367 *	Lower Effort (LE)
2368 *
2369 *	Total 26 codepoints.
2370 */
2371
2372/*	List of traffic classes in RFC 4594, updated by RFC 8622:
2373 *		(roughly descending order of contended priority)
2374 *		(roughly ascending order of uncontended throughput)
2375 *
2376 *	Network Control (CS6,CS7)      - routing traffic
2377 *	Telephony (EF,VA)              - aka. VoIP streams
2378 *	Signalling (CS5)               - VoIP setup
2379 *	Multimedia Conferencing (AF4x) - aka. video calls
2380 *	Realtime Interactive (CS4)     - eg. games
2381 *	Multimedia Streaming (AF3x)    - eg. YouTube, NetFlix, Twitch
2382 *	Broadcast Video (CS3)
2383 *	Low-Latency Data (AF2x,TOS4)      - eg. database
2384 *	Ops, Admin, Management (CS2)      - eg. ssh
2385 *	Standard Service (DF & unrecognised codepoints)
2386 *	High-Throughput Data (AF1x,TOS2)  - eg. web traffic
2387 *	Low-Priority Data (LE,CS1)        - eg. BitTorrent
2388 *
2389 *	Total 12 traffic classes.
2390 */
2391
2392static int cake_config_diffserv8(struct Qdisc *sch)
2393{
2394/*	Pruned list of traffic classes for typical applications:
2395 *
2396 *		Network Control          (CS6, CS7)
2397 *		Minimum Latency          (EF, VA, CS5, CS4)
2398 *		Interactive Shell        (CS2)
2399 *		Low Latency Transactions (AF2x, TOS4)
2400 *		Video Streaming          (AF4x, AF3x, CS3)
2401 *		Bog Standard             (DF etc.)
2402 *		High Throughput          (AF1x, TOS2, CS1)
2403 *		Background Traffic       (LE)
2404 *
2405 *		Total 8 traffic classes.
2406 */
2407
2408	struct cake_sched_data *q = qdisc_priv(sch);
2409	u32 mtu = psched_mtu(qdisc_dev(sch));
2410	u64 rate = q->rate_bps;
2411	u32 quantum = 256;
2412	u32 i;
2413
2414	q->tin_cnt = 8;
2415
2416	/* codepoint to class mapping */
2417	q->tin_index = diffserv8;
2418	q->tin_order = normal_order;
2419
2420	/* class characteristics */
2421	for (i = 0; i < q->tin_cnt; i++) {
2422		struct cake_tin_data *b = &q->tins[i];
2423
2424		cake_set_rate(b, rate, mtu, us_to_ns(q->target),
2425			      us_to_ns(q->interval));
2426
2427		b->tin_quantum = max_t(u16, 1U, quantum);
2428
2429		/* calculate next class's parameters */
2430		rate  *= 7;
2431		rate >>= 3;
2432
2433		quantum  *= 7;
2434		quantum >>= 3;
2435	}
2436
2437	return 0;
2438}
2439
2440static int cake_config_diffserv4(struct Qdisc *sch)
2441{
2442/*  Further pruned list of traffic classes for four-class system:
2443 *
2444 *	    Latency Sensitive  (CS7, CS6, EF, VA, CS5, CS4)
2445 *	    Streaming Media    (AF4x, AF3x, CS3, AF2x, TOS4, CS2)
2446 *	    Best Effort        (DF, AF1x, TOS2, and those not specified)
2447 *	    Background Traffic (LE, CS1)
2448 *
2449 *		Total 4 traffic classes.
2450 */
2451
2452	struct cake_sched_data *q = qdisc_priv(sch);
2453	u32 mtu = psched_mtu(qdisc_dev(sch));
2454	u64 rate = q->rate_bps;
2455	u32 quantum = 1024;
2456
2457	q->tin_cnt = 4;
2458
2459	/* codepoint to class mapping */
2460	q->tin_index = diffserv4;
2461	q->tin_order = bulk_order;
2462
2463	/* class characteristics */
2464	cake_set_rate(&q->tins[0], rate, mtu,
2465		      us_to_ns(q->target), us_to_ns(q->interval));
2466	cake_set_rate(&q->tins[1], rate >> 4, mtu,
2467		      us_to_ns(q->target), us_to_ns(q->interval));
2468	cake_set_rate(&q->tins[2], rate >> 1, mtu,
2469		      us_to_ns(q->target), us_to_ns(q->interval));
2470	cake_set_rate(&q->tins[3], rate >> 2, mtu,
2471		      us_to_ns(q->target), us_to_ns(q->interval));
2472
2473	/* bandwidth-sharing weights */
2474	q->tins[0].tin_quantum = quantum;
2475	q->tins[1].tin_quantum = quantum >> 4;
2476	q->tins[2].tin_quantum = quantum >> 1;
2477	q->tins[3].tin_quantum = quantum >> 2;
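	/* i.e. shaper thresholds of 100%, 6.25%, 50% and 25% of the base
	 * rate, with DRR weights in the same proportions.
	 */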
2478
2479	return 0;
2480}
2481
2482static int cake_config_diffserv3(struct Qdisc *sch)
2483{
2484/*  Simplified Diffserv structure with 3 tins.
2485 *		Latency Sensitive	(CS7, CS6, EF, VA, TOS4)
2486 *		Best Effort
2487 *		Low Priority		(LE, CS1)
2488 */
2489	struct cake_sched_data *q = qdisc_priv(sch);
2490	u32 mtu = psched_mtu(qdisc_dev(sch));
2491	u64 rate = q->rate_bps;
2492	u32 quantum = 1024;
2493
2494	q->tin_cnt = 3;
2495
2496	/* codepoint to class mapping */
2497	q->tin_index = diffserv3;
2498	q->tin_order = bulk_order;
2499
2500	/* class characteristics */
2501	cake_set_rate(&q->tins[0], rate, mtu,
2502		      us_to_ns(q->target), us_to_ns(q->interval));
2503	cake_set_rate(&q->tins[1], rate >> 4, mtu,
2504		      us_to_ns(q->target), us_to_ns(q->interval));
2505	cake_set_rate(&q->tins[2], rate >> 2, mtu,
2506		      us_to_ns(q->target), us_to_ns(q->interval));
2507
2508	/* bandwidth-sharing weights */
2509	q->tins[0].tin_quantum = quantum;
2510	q->tins[1].tin_quantum = quantum >> 4;
2511	q->tins[2].tin_quantum = quantum >> 2;
2512
2513	return 0;
2514}
2515
2516static void cake_reconfigure(struct Qdisc *sch)
2517{
2518	struct cake_sched_data *q = qdisc_priv(sch);
2519	int c, ft;
2520
2521	switch (q->tin_mode) {
2522	case CAKE_DIFFSERV_BESTEFFORT:
2523		ft = cake_config_besteffort(sch);
2524		break;
2525
2526	case CAKE_DIFFSERV_PRECEDENCE:
2527		ft = cake_config_precedence(sch);
2528		break;
2529
2530	case CAKE_DIFFSERV_DIFFSERV8:
2531		ft = cake_config_diffserv8(sch);
2532		break;
2533
2534	case CAKE_DIFFSERV_DIFFSERV4:
2535		ft = cake_config_diffserv4(sch);
2536		break;
2537
2538	case CAKE_DIFFSERV_DIFFSERV3:
2539	default:
2540		ft = cake_config_diffserv3(sch);
2541		break;
2542	}
2543
2544	for (c = q->tin_cnt; c < CAKE_MAX_TINS; c++) {
2545		cake_clear_tin(sch, c);
2546		q->tins[c].cparams.mtu_time = q->tins[ft].cparams.mtu_time;
2547	}
2548
2549	q->rate_ns   = q->tins[ft].tin_rate_ns;
2550	q->rate_shft = q->tins[ft].tin_rate_shft;
2551
2552	if (q->buffer_config_limit) {
2553		q->buffer_limit = q->buffer_config_limit;
2554	} else if (q->rate_bps) {
2555		u64 t = q->rate_bps * q->interval;
2556
2557		do_div(t, USEC_PER_SEC / 4);
2558		q->buffer_limit = max_t(u32, t, 4U << 20);
2559	} else {
2560		q->buffer_limit = ~0;
2561	}
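	/* With a shaping rate set and no explicit memory limit, the limit
	 * above is roughly four times the bandwidth-delay product
	 * (rate * interval / (USEC_PER_SEC / 4)), floored at 4 MiB; e.g.
	 * 12.5 Mbyte/s with a 100 ms interval gives 5 Mbytes.  The min()
	 * below then caps it at sch->limit packets of device MTU, or the
	 * configured memory limit if that is larger.
	 */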
2562
2563	sch->flags &= ~TCQ_F_CAN_BYPASS;
2564
2565	q->buffer_limit = min(q->buffer_limit,
2566			      max(sch->limit * psched_mtu(qdisc_dev(sch)),
2567				  q->buffer_config_limit));
2568}
2569
2570static int cake_change(struct Qdisc *sch, struct nlattr *opt,
2571		       struct netlink_ext_ack *extack)
2572{
2573	struct cake_sched_data *q = qdisc_priv(sch);
2574	struct nlattr *tb[TCA_CAKE_MAX + 1];
2575	int err;
2576
2577	err = nla_parse_nested_deprecated(tb, TCA_CAKE_MAX, opt, cake_policy,
2578					  extack);
2579	if (err < 0)
2580		return err;
2581
2582	if (tb[TCA_CAKE_NAT]) {
2583#if IS_ENABLED(CONFIG_NF_CONNTRACK)
2584		q->flow_mode &= ~CAKE_FLOW_NAT_FLAG;
2585		q->flow_mode |= CAKE_FLOW_NAT_FLAG *
2586			!!nla_get_u32(tb[TCA_CAKE_NAT]);
2587#else
2588		NL_SET_ERR_MSG_ATTR(extack, tb[TCA_CAKE_NAT],
2589				    "No conntrack support in kernel");
2590		return -EOPNOTSUPP;
2591#endif
2592	}
2593
2594	if (tb[TCA_CAKE_BASE_RATE64])
2595		q->rate_bps = nla_get_u64(tb[TCA_CAKE_BASE_RATE64]);
2596
2597	if (tb[TCA_CAKE_DIFFSERV_MODE])
2598		q->tin_mode = nla_get_u32(tb[TCA_CAKE_DIFFSERV_MODE]);
2599
2600	if (tb[TCA_CAKE_WASH]) {
2601		if (!!nla_get_u32(tb[TCA_CAKE_WASH]))
2602			q->rate_flags |= CAKE_FLAG_WASH;
2603		else
2604			q->rate_flags &= ~CAKE_FLAG_WASH;
2605	}
2606
2607	if (tb[TCA_CAKE_FLOW_MODE])
2608		q->flow_mode = ((q->flow_mode & CAKE_FLOW_NAT_FLAG) |
2609				(nla_get_u32(tb[TCA_CAKE_FLOW_MODE]) &
2610					CAKE_FLOW_MASK));
2611
2612	if (tb[TCA_CAKE_ATM])
2613		q->atm_mode = nla_get_u32(tb[TCA_CAKE_ATM]);
2614
2615	if (tb[TCA_CAKE_OVERHEAD]) {
2616		q->rate_overhead = nla_get_s32(tb[TCA_CAKE_OVERHEAD]);
2617		q->rate_flags |= CAKE_FLAG_OVERHEAD;
2618
2619		q->max_netlen = 0;
2620		q->max_adjlen = 0;
2621		q->min_netlen = ~0;
2622		q->min_adjlen = ~0;
2623	}
2624
2625	if (tb[TCA_CAKE_RAW]) {
2626		q->rate_flags &= ~CAKE_FLAG_OVERHEAD;
2627
2628		q->max_netlen = 0;
2629		q->max_adjlen = 0;
2630		q->min_netlen = ~0;
2631		q->min_adjlen = ~0;
2632	}
2633
2634	if (tb[TCA_CAKE_MPU])
2635		q->rate_mpu = nla_get_u32(tb[TCA_CAKE_MPU]);
2636
2637	if (tb[TCA_CAKE_RTT]) {
2638		q->interval = nla_get_u32(tb[TCA_CAKE_RTT]);
2639
2640		if (!q->interval)
2641			q->interval = 1;
2642	}
2643
2644	if (tb[TCA_CAKE_TARGET]) {
2645		q->target = nla_get_u32(tb[TCA_CAKE_TARGET]);
2646
2647		if (!q->target)
2648			q->target = 1;
2649	}
2650
2651	if (tb[TCA_CAKE_AUTORATE]) {
2652		if (!!nla_get_u32(tb[TCA_CAKE_AUTORATE]))
2653			q->rate_flags |= CAKE_FLAG_AUTORATE_INGRESS;
2654		else
2655			q->rate_flags &= ~CAKE_FLAG_AUTORATE_INGRESS;
2656	}
2657
2658	if (tb[TCA_CAKE_INGRESS]) {
2659		if (!!nla_get_u32(tb[TCA_CAKE_INGRESS]))
2660			q->rate_flags |= CAKE_FLAG_INGRESS;
2661		else
2662			q->rate_flags &= ~CAKE_FLAG_INGRESS;
2663	}
2664
2665	if (tb[TCA_CAKE_ACK_FILTER])
2666		q->ack_filter = nla_get_u32(tb[TCA_CAKE_ACK_FILTER]);
2667
2668	if (tb[TCA_CAKE_MEMORY])
2669		q->buffer_config_limit = nla_get_u32(tb[TCA_CAKE_MEMORY]);
2670
2671	if (tb[TCA_CAKE_SPLIT_GSO]) {
2672		if (!!nla_get_u32(tb[TCA_CAKE_SPLIT_GSO]))
2673			q->rate_flags |= CAKE_FLAG_SPLIT_GSO;
2674		else
2675			q->rate_flags &= ~CAKE_FLAG_SPLIT_GSO;
2676	}
2677
2678	if (tb[TCA_CAKE_FWMARK]) {
2679		q->fwmark_mask = nla_get_u32(tb[TCA_CAKE_FWMARK]);
2680		q->fwmark_shft = q->fwmark_mask ? __ffs(q->fwmark_mask) : 0;
2681	}
2682
2683	if (q->tins) {
2684		sch_tree_lock(sch);
2685		cake_reconfigure(sch);
2686		sch_tree_unlock(sch);
2687	}
2688
2689	return 0;
2690}
2691
2692static void cake_destroy(struct Qdisc *sch)
2693{
2694	struct cake_sched_data *q = qdisc_priv(sch);
2695
2696	qdisc_watchdog_cancel(&q->watchdog);
2697	tcf_block_put(q->block);
2698	kvfree(q->tins);
2699}
2700
2701static int cake_init(struct Qdisc *sch, struct nlattr *opt,
2702		     struct netlink_ext_ack *extack)
2703{
2704	struct cake_sched_data *q = qdisc_priv(sch);
2705	int i, j, err;
2706
2707	sch->limit = 10240;
2708	q->tin_mode = CAKE_DIFFSERV_DIFFSERV3;
2709	q->flow_mode  = CAKE_FLOW_TRIPLE;
2710
2711	q->rate_bps = 0; /* unlimited by default */
2712
2713	q->interval = 100000; /* 100ms default */
2714	q->target   =   5000; /* 5ms: codel RFC argues
2715			       * for 5 to 10% of interval
2716			       */
2717	q->rate_flags |= CAKE_FLAG_SPLIT_GSO;
2718	q->cur_tin = 0;
2719	q->cur_flow  = 0;
2720
2721	qdisc_watchdog_init(&q->watchdog, sch);
2722
2723	if (opt) {
2724		err = cake_change(sch, opt, extack);
2725
2726		if (err)
2727			return err;
2728	}
2729
2730	err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
2731	if (err)
2732		return err;
2733
2734	quantum_div[0] = ~0;
2735	for (i = 1; i <= CAKE_QUEUES; i++)
2736		quantum_div[i] = 65535 / i;
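	/* Reciprocal table: quantum_div[i] = 65535 / i lets the enqueue and
	 * dequeue paths scale a flow's quantum by 1/host_load with a multiply
	 * and a 16-bit shift instead of a per-packet division.
	 */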
2737
2738	q->tins = kvcalloc(CAKE_MAX_TINS, sizeof(struct cake_tin_data),
2739			   GFP_KERNEL);
2740	if (!q->tins)
2741		return -ENOMEM;
2742
2743	for (i = 0; i < CAKE_MAX_TINS; i++) {
2744		struct cake_tin_data *b = q->tins + i;
2745
2746		INIT_LIST_HEAD(&b->new_flows);
2747		INIT_LIST_HEAD(&b->old_flows);
2748		INIT_LIST_HEAD(&b->decaying_flows);
2749		b->sparse_flow_count = 0;
2750		b->bulk_flow_count = 0;
2751		b->decaying_flow_count = 0;
2752
2753		for (j = 0; j < CAKE_QUEUES; j++) {
2754			struct cake_flow *flow = b->flows + j;
2755			u32 k = j * CAKE_MAX_TINS + i;
2756
2757			INIT_LIST_HEAD(&flow->flowchain);
2758			cobalt_vars_init(&flow->cvars);
2759
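			/* Each (tin, flow) pair gets one slot in the overflow
			 * heap; overflow_idx[] is the back-pointer that lets
			 * enqueue and dequeue re-heapify this queue's entry
			 * when its backlog changes.
			 */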
2760			q->overflow_heap[k].t = i;
2761			q->overflow_heap[k].b = j;
2762			b->overflow_idx[j] = k;
2763		}
2764	}
2765
2766	cake_reconfigure(sch);
2767	q->avg_peak_bandwidth = q->rate_bps;
2768	q->min_netlen = ~0;
2769	q->min_adjlen = ~0;
2770	return 0;
2771}
2772
2773static int cake_dump(struct Qdisc *sch, struct sk_buff *skb)
2774{
2775	struct cake_sched_data *q = qdisc_priv(sch);
2776	struct nlattr *opts;
2777
2778	opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
2779	if (!opts)
2780		goto nla_put_failure;
2781
2782	if (nla_put_u64_64bit(skb, TCA_CAKE_BASE_RATE64, q->rate_bps,
2783			      TCA_CAKE_PAD))
2784		goto nla_put_failure;
2785
2786	if (nla_put_u32(skb, TCA_CAKE_FLOW_MODE,
2787			q->flow_mode & CAKE_FLOW_MASK))
2788		goto nla_put_failure;
2789
2790	if (nla_put_u32(skb, TCA_CAKE_RTT, q->interval))
2791		goto nla_put_failure;
2792
2793	if (nla_put_u32(skb, TCA_CAKE_TARGET, q->target))
2794		goto nla_put_failure;
2795
2796	if (nla_put_u32(skb, TCA_CAKE_MEMORY, q->buffer_config_limit))
2797		goto nla_put_failure;
2798
2799	if (nla_put_u32(skb, TCA_CAKE_AUTORATE,
2800			!!(q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS)))
2801		goto nla_put_failure;
2802
2803	if (nla_put_u32(skb, TCA_CAKE_INGRESS,
2804			!!(q->rate_flags & CAKE_FLAG_INGRESS)))
2805		goto nla_put_failure;
2806
2807	if (nla_put_u32(skb, TCA_CAKE_ACK_FILTER, q->ack_filter))
2808		goto nla_put_failure;
2809
2810	if (nla_put_u32(skb, TCA_CAKE_NAT,
2811			!!(q->flow_mode & CAKE_FLOW_NAT_FLAG)))
2812		goto nla_put_failure;
2813
2814	if (nla_put_u32(skb, TCA_CAKE_DIFFSERV_MODE, q->tin_mode))
2815		goto nla_put_failure;
2816
2817	if (nla_put_u32(skb, TCA_CAKE_WASH,
2818			!!(q->rate_flags & CAKE_FLAG_WASH)))
2819		goto nla_put_failure;
2820
2821	if (nla_put_u32(skb, TCA_CAKE_OVERHEAD, q->rate_overhead))
2822		goto nla_put_failure;
2823
2824	if (!(q->rate_flags & CAKE_FLAG_OVERHEAD))
2825		if (nla_put_u32(skb, TCA_CAKE_RAW, 0))
2826			goto nla_put_failure;
2827
2828	if (nla_put_u32(skb, TCA_CAKE_ATM, q->atm_mode))
2829		goto nla_put_failure;
2830
2831	if (nla_put_u32(skb, TCA_CAKE_MPU, q->rate_mpu))
2832		goto nla_put_failure;
2833
2834	if (nla_put_u32(skb, TCA_CAKE_SPLIT_GSO,
2835			!!(q->rate_flags & CAKE_FLAG_SPLIT_GSO)))
2836		goto nla_put_failure;
2837
2838	if (nla_put_u32(skb, TCA_CAKE_FWMARK, q->fwmark_mask))
2839		goto nla_put_failure;
2840
2841	return nla_nest_end(skb, opts);
2842
2843nla_put_failure:
2844	return -1;
2845}
2846
2847static int cake_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
2848{
2849	struct nlattr *stats = nla_nest_start_noflag(d->skb, TCA_STATS_APP);
2850	struct cake_sched_data *q = qdisc_priv(sch);
2851	struct nlattr *tstats, *ts;
2852	int i;
2853
2854	if (!stats)
2855		return -1;
2856
2857#define PUT_STAT_U32(attr, data) do {				       \
2858		if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
2859			goto nla_put_failure;			       \
2860	} while (0)
2861#define PUT_STAT_U64(attr, data) do {				       \
2862		if (nla_put_u64_64bit(d->skb, TCA_CAKE_STATS_ ## attr, \
2863					data, TCA_CAKE_STATS_PAD)) \
2864			goto nla_put_failure;			       \
2865	} while (0)
2866
2867	PUT_STAT_U64(CAPACITY_ESTIMATE64, q->avg_peak_bandwidth);
2868	PUT_STAT_U32(MEMORY_LIMIT, q->buffer_limit);
2869	PUT_STAT_U32(MEMORY_USED, q->buffer_max_used);
2870	PUT_STAT_U32(AVG_NETOFF, ((q->avg_netoff + 0x8000) >> 16));
2871	PUT_STAT_U32(MAX_NETLEN, q->max_netlen);
2872	PUT_STAT_U32(MAX_ADJLEN, q->max_adjlen);
2873	PUT_STAT_U32(MIN_NETLEN, q->min_netlen);
2874	PUT_STAT_U32(MIN_ADJLEN, q->min_adjlen);
2875
2876#undef PUT_STAT_U32
2877#undef PUT_STAT_U64
2878
2879	tstats = nla_nest_start_noflag(d->skb, TCA_CAKE_STATS_TIN_STATS);
2880	if (!tstats)
2881		goto nla_put_failure;
2882
2883#define PUT_TSTAT_U32(attr, data) do {					\
2884		if (nla_put_u32(d->skb, TCA_CAKE_TIN_STATS_ ## attr, data)) \
2885			goto nla_put_failure;				\
2886	} while (0)
2887#define PUT_TSTAT_U64(attr, data) do {					\
2888		if (nla_put_u64_64bit(d->skb, TCA_CAKE_TIN_STATS_ ## attr, \
2889					data, TCA_CAKE_TIN_STATS_PAD))	\
2890			goto nla_put_failure;				\
2891	} while (0)
2892
2893	for (i = 0; i < q->tin_cnt; i++) {
2894		struct cake_tin_data *b = &q->tins[q->tin_order[i]];
2895
2896		ts = nla_nest_start_noflag(d->skb, i + 1);
2897		if (!ts)
2898			goto nla_put_failure;
2899
2900		PUT_TSTAT_U64(THRESHOLD_RATE64, b->tin_rate_bps);
2901		PUT_TSTAT_U64(SENT_BYTES64, b->bytes);
2902		PUT_TSTAT_U32(BACKLOG_BYTES, b->tin_backlog);
2903
2904		PUT_TSTAT_U32(TARGET_US,
2905			      ktime_to_us(ns_to_ktime(b->cparams.target)));
2906		PUT_TSTAT_U32(INTERVAL_US,
2907			      ktime_to_us(ns_to_ktime(b->cparams.interval)));
2908
2909		PUT_TSTAT_U32(SENT_PACKETS, b->packets);
2910		PUT_TSTAT_U32(DROPPED_PACKETS, b->tin_dropped);
2911		PUT_TSTAT_U32(ECN_MARKED_PACKETS, b->tin_ecn_mark);
2912		PUT_TSTAT_U32(ACKS_DROPPED_PACKETS, b->ack_drops);
2913
2914		PUT_TSTAT_U32(PEAK_DELAY_US,
2915			      ktime_to_us(ns_to_ktime(b->peak_delay)));
2916		PUT_TSTAT_U32(AVG_DELAY_US,
2917			      ktime_to_us(ns_to_ktime(b->avge_delay)));
2918		PUT_TSTAT_U32(BASE_DELAY_US,
2919			      ktime_to_us(ns_to_ktime(b->base_delay)));
2920
2921		PUT_TSTAT_U32(WAY_INDIRECT_HITS, b->way_hits);
2922		PUT_TSTAT_U32(WAY_MISSES, b->way_misses);
2923		PUT_TSTAT_U32(WAY_COLLISIONS, b->way_collisions);
2924
2925		PUT_TSTAT_U32(SPARSE_FLOWS, b->sparse_flow_count +
2926					    b->decaying_flow_count);
2927		PUT_TSTAT_U32(BULK_FLOWS, b->bulk_flow_count);
2928		PUT_TSTAT_U32(UNRESPONSIVE_FLOWS, b->unresponsive_flow_count);
2929		PUT_TSTAT_U32(MAX_SKBLEN, b->max_skblen);
2930
2931		PUT_TSTAT_U32(FLOW_QUANTUM, b->flow_quantum);
2932		nla_nest_end(d->skb, ts);
2933	}
2934
2935#undef PUT_TSTAT_U32
2936#undef PUT_TSTAT_U64
2937
2938	nla_nest_end(d->skb, tstats);
2939	return nla_nest_end(d->skb, stats);
2940
2941nla_put_failure:
2942	nla_nest_cancel(d->skb, stats);
2943	return -1;
2944}
2945
2946static struct Qdisc *cake_leaf(struct Qdisc *sch, unsigned long arg)
2947{
2948	return NULL;
2949}
2950
2951static unsigned long cake_find(struct Qdisc *sch, u32 classid)
2952{
2953	return 0;
2954}
2955
2956static unsigned long cake_bind(struct Qdisc *sch, unsigned long parent,
2957			       u32 classid)
2958{
2959	return 0;
2960}
2961
2962static void cake_unbind(struct Qdisc *q, unsigned long cl)
2963{
2964}
2965
2966static struct tcf_block *cake_tcf_block(struct Qdisc *sch, unsigned long cl,
2967					struct netlink_ext_ack *extack)
2968{
2969	struct cake_sched_data *q = qdisc_priv(sch);
2970
2971	if (cl)
2972		return NULL;
2973	return q->block;
2974}
2975
2976static int cake_dump_class(struct Qdisc *sch, unsigned long cl,
2977			   struct sk_buff *skb, struct tcmsg *tcm)
2978{
2979	tcm->tcm_handle |= TC_H_MIN(cl);
2980	return 0;
2981}
2982
2983static int cake_dump_class_stats(struct Qdisc *sch, unsigned long cl,
2984				 struct gnet_dump *d)
2985{
2986	struct cake_sched_data *q = qdisc_priv(sch);
2987	const struct cake_flow *flow = NULL;
2988	struct gnet_stats_queue qs = { 0 };
2989	struct nlattr *stats;
2990	u32 idx = cl - 1;
2991
2992	if (idx < CAKE_QUEUES * q->tin_cnt) {
2993		const struct cake_tin_data *b =
2994			&q->tins[q->tin_order[idx / CAKE_QUEUES]];
2995		const struct sk_buff *skb;
2996
2997		flow = &b->flows[idx % CAKE_QUEUES];
2998
2999		if (flow->head) {
3000			sch_tree_lock(sch);
3001			skb = flow->head;
3002			while (skb) {
3003				qs.qlen++;
3004				skb = skb->next;
3005			}
3006			sch_tree_unlock(sch);
3007		}
3008		qs.backlog = b->backlogs[idx % CAKE_QUEUES];
3009		qs.drops = flow->dropped;
3010	}
3011	if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
3012		return -1;
3013	if (flow) {
3014		ktime_t now = ktime_get();
3015
3016		stats = nla_nest_start_noflag(d->skb, TCA_STATS_APP);
3017		if (!stats)
3018			return -1;
3019
3020#define PUT_STAT_U32(attr, data) do {				       \
3021		if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
3022			goto nla_put_failure;			       \
3023	} while (0)
3024#define PUT_STAT_S32(attr, data) do {				       \
3025		if (nla_put_s32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
3026			goto nla_put_failure;			       \
3027	} while (0)
3028
3029		PUT_STAT_S32(DEFICIT, flow->deficit);
3030		PUT_STAT_U32(DROPPING, flow->cvars.dropping);
3031		PUT_STAT_U32(COBALT_COUNT, flow->cvars.count);
3032		PUT_STAT_U32(P_DROP, flow->cvars.p_drop);
3033		if (flow->cvars.p_drop) {
3034			PUT_STAT_S32(BLUE_TIMER_US,
3035				     ktime_to_us(
3036					     ktime_sub(now,
3037						       flow->cvars.blue_timer)));
3038		}
3039		if (flow->cvars.dropping) {
3040			PUT_STAT_S32(DROP_NEXT_US,
3041				     ktime_to_us(
3042					     ktime_sub(now,
3043						       flow->cvars.drop_next)));
3044		}
3045
3046		if (nla_nest_end(d->skb, stats) < 0)
3047			return -1;
3048	}
3049
3050	return 0;
3051
3052nla_put_failure:
3053	nla_nest_cancel(d->skb, stats);
3054	return -1;
3055}
3056
3057static void cake_walk(struct Qdisc *sch, struct qdisc_walker *arg)
3058{
3059	struct cake_sched_data *q = qdisc_priv(sch);
3060	unsigned int i, j;
3061
3062	if (arg->stop)
3063		return;
3064
3065	for (i = 0; i < q->tin_cnt; i++) {
3066		struct cake_tin_data *b = &q->tins[q->tin_order[i]];
3067
3068		for (j = 0; j < CAKE_QUEUES; j++) {
3069			if (list_empty(&b->flows[j].flowchain)) {
3070				arg->count++;
3071				continue;
3072			}
3073			if (!tc_qdisc_stats_dump(sch, i * CAKE_QUEUES + j + 1,
3074						 arg))
3075				break;
3076		}
3077	}
3078}
3079
3080static const struct Qdisc_class_ops cake_class_ops = {
3081	.leaf		=	cake_leaf,
3082	.find		=	cake_find,
3083	.tcf_block	=	cake_tcf_block,
3084	.bind_tcf	=	cake_bind,
3085	.unbind_tcf	=	cake_unbind,
3086	.dump		=	cake_dump_class,
3087	.dump_stats	=	cake_dump_class_stats,
3088	.walk		=	cake_walk,
3089};
3090
3091static struct Qdisc_ops cake_qdisc_ops __read_mostly = {
3092	.cl_ops		=	&cake_class_ops,
3093	.id		=	"cake",
3094	.priv_size	=	sizeof(struct cake_sched_data),
3095	.enqueue	=	cake_enqueue,
3096	.dequeue	=	cake_dequeue,
3097	.peek		=	qdisc_peek_dequeued,
3098	.init		=	cake_init,
3099	.reset		=	cake_reset,
3100	.destroy	=	cake_destroy,
3101	.change		=	cake_change,
3102	.dump		=	cake_dump,
3103	.dump_stats	=	cake_dump_stats,
3104	.owner		=	THIS_MODULE,
3105};
3106MODULE_ALIAS_NET_SCH("cake");
3107
3108static int __init cake_module_init(void)
3109{
3110	return register_qdisc(&cake_qdisc_ops);
3111}
3112
3113static void __exit cake_module_exit(void)
3114{
3115	unregister_qdisc(&cake_qdisc_ops);
3116}
3117
3118module_init(cake_module_init)
3119module_exit(cake_module_exit)
3120MODULE_AUTHOR("Jonathan Morton");
3121MODULE_LICENSE("Dual BSD/GPL");
3122MODULE_DESCRIPTION("The CAKE shaper.");
3123