/*
 * net/sched/sch_netem.c	Network emulator
 *
 * 		This program is free software; you can redistribute it and/or
 * 		modify it under the terms of the GNU General Public License
 * 		as published by the Free Software Foundation; either version
 * 		2 of the License.
 *
 *  		Many of the algorithms and ideas for this came from
 *		NIST Net which is not copyrighted.
 *
 * Authors:	Stephen Hemminger <shemminger@osdl.org>
 *		Catalin(ux aka Dino) BOIE <catab at umbrella dot ro>
 */

#include <linux/module.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>

#include <net/netlink.h>
#include <net/pkt_sched.h>

#define VERSION "1.2"

/*	Network Emulation Queuing algorithm.
	====================================

	Sources: [1] Mark Carson, Darrin Santay, "NIST Net - A Linux-based
		 Network Emulation Tool"
		 [2] Luigi Rizzo, DummyNet for FreeBSD

	 ----------------------------------------------------------------

	 This started out as a simple way to delay outgoing packets to
	 test TCP but has grown to include most of the functionality
	 of a full-blown network emulator like NISTnet. It can delay
	 packets and add random jitter (and correlation). The random
	 distribution can be loaded from a table as well to provide
	 normal, Pareto, or experimental curves. Packet loss,
	 duplication, and reordering can also be emulated.

	 This qdisc does not do classification; that can be handled by
	 layering other disciplines on top of it. It does not need to do
	 bandwidth control either, since that can be handled by using a
	 token bucket or other rate control.
*/
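
/*
 * Illustrative userspace usage (informational comment, not derived from
 * this file): netem is normally configured with the iproute2 "tc" tool,
 * which builds the netlink options parsed below. A rough sketch -- exact
 * option names and syntax depend on the iproute2 version:
 *
 *   tc qdisc add dev eth0 root netem delay 100ms 10ms 25%
 *   tc qdisc change dev eth0 root netem loss 0.3% duplicate 1% corrupt 0.1%
 *
 * Bandwidth limiting is not done here; it is layered separately, e.g. by
 * combining netem with a token bucket (tbf) qdisc.
 */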

struct netem_sched_data {
	struct Qdisc	*qdisc;
	struct qdisc_watchdog watchdog;

	psched_tdiff_t latency;
	psched_tdiff_t jitter;

	u32 loss;
	u32 limit;
	u32 counter;
	u32 gap;
	u32 duplicate;
	u32 reorder;
	u32 corrupt;

	struct crndstate {
		u32 last;
		u32 rho;
	} delay_cor, loss_cor, dup_cor, reorder_cor, corrupt_cor;

	struct disttable {
		u32  size;
		s16 table[0];
	} *delay_dist;
};

/* Time stamp put into socket buffer control block */
struct netem_skb_cb {
	psched_time_t	time_to_send;
};

static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb)
{
	BUILD_BUG_ON(sizeof(skb->cb) <
		sizeof(struct qdisc_skb_cb) + sizeof(struct netem_skb_cb));
	return (struct netem_skb_cb *)qdisc_skb_cb(skb)->data;
}

/* init_crandom - initialize correlated random number generator
 * Use entropy source for initial seed.
 */
static void init_crandom(struct crndstate *state, unsigned long rho)
{
	state->rho = rho;
	state->last = net_random();
}

/* get_crandom - correlated random number generator
 * Next number depends on last value.
 * rho is scaled to avoid floating point.
 */
static u32 get_crandom(struct crndstate *state)
{
	u64 value, rho;
	unsigned long answer;

	if (state->rho == 0)	/* no correlation */
		return net_random();

	value = net_random();
	rho = (u64)state->rho + 1;
	answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32;
	state->last = answer;
	return answer;
}
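
/*
 * Arithmetic note on get_crandom() (restating the code above, no new
 * behaviour): rho expresses the correlation as a fraction of 2^32, and the
 * result is the weighted blend (value*(2^32 - rho) + last*rho) / 2^32.
 * For example, with rho around 0x80000000 (~50%) the new sample lands
 * roughly halfway between a fresh random value and the previous output;
 * with rho near 2^32 the sequence barely moves; with rho == 0 the early
 * return gives plain net_random().
 */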

/* tabledist - return a pseudo-randomly distributed value with mean mu and
 * std deviation sigma.  Uses table lookup to approximate the desired
 * distribution, and a uniformly-distributed pseudo-random source.
 */
static psched_tdiff_t tabledist(psched_tdiff_t mu, psched_tdiff_t sigma,
				struct crndstate *state,
				const struct disttable *dist)
{
	psched_tdiff_t x;
	long t;
	u32 rnd;

	if (sigma == 0)
		return mu;

	rnd = get_crandom(state);

	/* default uniform distribution */
	if (dist == NULL)
		return (rnd % (2*sigma)) - sigma + mu;

	t = dist->table[rnd % dist->size];
	x = (sigma % NETEM_DIST_SCALE) * t;
	if (x >= 0)
		x += NETEM_DIST_SCALE/2;
	else
		x -= NETEM_DIST_SCALE/2;

	return  x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu;
}
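
/*
 * Fixed-point note on tabledist() (restating the code above): table entries
 * are deviations in units of sigma/NETEM_DIST_SCALE, so the intended result
 * is mu + t*sigma/NETEM_DIST_SCALE. sigma is split into quotient and
 * remainder so the multiply cannot overflow, and the remainder term is
 * rounded before the final division. Illustrative numbers, assuming the
 * usual NETEM_DIST_SCALE of 8192: sigma = 10000 ticks and t = 8192 yield
 * mu + 10000, i.e. one "sigma" above the mean.
 */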

/*
 * Insert one skb into qdisc.
 * Note: parent depends on return value to account for queue length.
 * 	NET_XMIT_DROP: queue length didn't change.
 *      NET_XMIT_SUCCESS: one skb was queued.
 */
static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	/* We don't fill cb now as skb_unshare() may invalidate it */
	struct netem_skb_cb *cb;
	struct sk_buff *skb2;
	int ret;
	int count = 1;

	pr_debug("netem_enqueue skb=%p\n", skb);

	/* Random duplication */
	if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor))
		++count;

	/* Random packet drop 0 => none, ~0 => all */
	if (q->loss && q->loss >= get_crandom(&q->loss_cor))
		--count;

	if (count == 0) {
		sch->qstats.drops++;
		kfree_skb(skb);
		return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
	}

	skb_orphan(skb);

	/*
	 * If we need to duplicate packet, then re-insert at top of the
	 * qdisc tree, since parent queuer expects that only one
	 * skb will be queued.
	 */
	if (count > 1 && (skb2 = skb_clone(skb, GFP_ATOMIC)) != NULL) {
		struct Qdisc *rootq = qdisc_root(sch);
		u32 dupsave = q->duplicate; /* prevent duplicating a dup... */
		q->duplicate = 0;

		qdisc_enqueue_root(skb2, rootq);
		q->duplicate = dupsave;
	}

	/*
	 * Randomized packet corruption.
	 * Make a copy if needed since we are modifying the data.
	 * If the packet is going to be hardware checksummed, then
	 * do it now in software before we mangle it.
	 */
	if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor)) {
		if (!(skb = skb_unshare(skb, GFP_ATOMIC)) ||
		    (skb->ip_summed == CHECKSUM_PARTIAL &&
		     skb_checksum_help(skb))) {
			sch->qstats.drops++;
			return NET_XMIT_DROP;
		}

		skb->data[net_random() % skb_headlen(skb)] ^= 1<<(net_random() % 8);
	}

	cb = netem_skb_cb(skb);
	if (q->gap == 0 || 		/* not doing reordering */
	    q->counter < q->gap || 	/* inside last reordering gap */
	    q->reorder < get_crandom(&q->reorder_cor)) {
		psched_time_t now;
		psched_tdiff_t delay;

		delay = tabledist(q->latency, q->jitter,
				  &q->delay_cor, q->delay_dist);

		now = psched_get_time();
		cb->time_to_send = now + delay;
		++q->counter;
		ret = qdisc_enqueue(skb, q->qdisc);
	} else {
		/*
		 * Do re-ordering by putting one out of N packets at the front
		 * of the queue.
		 */
		cb->time_to_send = psched_get_time();
		q->counter = 0;

		__skb_queue_head(&q->qdisc->q, skb);
		q->qdisc->qstats.backlog += qdisc_pkt_len(skb);
		q->qdisc->qstats.requeues++;
		ret = NET_XMIT_SUCCESS;
	}

	if (likely(ret == NET_XMIT_SUCCESS)) {
		sch->q.qlen++;
		sch->bstats.bytes += qdisc_pkt_len(skb);
		sch->bstats.packets++;
	} else if (net_xmit_drop_count(ret)) {
		sch->qstats.drops++;
	}

	pr_debug("netem: enqueue ret %d\n", ret);
	return ret;
}
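
/*
 * Reordering semantics of netem_enqueue(), with example numbers
 * (illustrative only): with gap = 5 and a 25% reorder probability, the
 * first five packets after a reorder always take the delayed path; each
 * packet after that is a candidate and, with (correlated) probability 25%,
 * is queued at the head with an immediate time_to_send, overtaking packets
 * still waiting out their delay and resetting the counter.
 */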

static unsigned int netem_drop(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	unsigned int len = 0;

	if (q->qdisc->ops->drop && (len = q->qdisc->ops->drop(q->qdisc)) != 0) {
		sch->q.qlen--;
		sch->qstats.drops++;
	}
	return len;
}
static struct sk_buff *netem_dequeue(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;

	if (sch->flags & TCQ_F_THROTTLED)
		return NULL;

	skb = q->qdisc->ops->peek(q->qdisc);
	if (skb) {
		const struct netem_skb_cb *cb = netem_skb_cb(skb);
		psched_time_t now = psched_get_time();

		/* has the packet's send time arrived? */
		if (cb->time_to_send <= now) {
			skb = qdisc_dequeue_peeked(q->qdisc);
			if (unlikely(!skb))
				return NULL;

#ifdef CONFIG_NET_CLS_ACT
			/*
			 * If it's at ingress let's pretend the delay is
			 * from the network (tstamp will be updated).
			 */
			if (G_TC_FROM(skb->tc_verd) & AT_INGRESS)
				skb->tstamp.tv64 = 0;
#endif
			pr_debug("netem_dequeue: return skb=%p\n", skb);
			sch->q.qlen--;
			return skb;
		}

		qdisc_watchdog_schedule(&q->watchdog, cb->time_to_send);
	}

	return NULL;
}

static void netem_reset(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	qdisc_reset(q->qdisc);
	sch->q.qlen = 0;
	qdisc_watchdog_cancel(&q->watchdog);
}

/*
 * Distribution data is a variable size payload containing
 * signed 16 bit values.
 */
static int get_dist_table(struct Qdisc *sch, const struct nlattr *attr)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	unsigned long n = nla_len(attr)/sizeof(__s16);
	const __s16 *data = nla_data(attr);
	spinlock_t *root_lock;
	struct disttable *d;
	int i;

	if (n > 65536)
		return -EINVAL;

	d = kmalloc(sizeof(*d) + n*sizeof(d->table[0]), GFP_KERNEL);
	if (!d)
		return -ENOMEM;

	d->size = n;
	for (i = 0; i < n; i++)
		d->table[i] = data[i];

	root_lock = qdisc_root_sleeping_lock(sch);

	spin_lock_bh(root_lock);
	kfree(q->delay_dist);
	q->delay_dist = d;
	spin_unlock_bh(root_lock);
	return 0;
}
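
/*
 * The table loaded above normally comes from userspace: iproute2 ships
 * precomputed delay distributions (normal, pareto, paretonormal) and a
 * helper for building experimental ones from measured data. Each signed
 * 16-bit entry is a deviation in units of sigma/NETEM_DIST_SCALE and is
 * consumed by tabledist(). (This paragraph describes the userspace side
 * and is informational only.)
 */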

static void get_correlation(struct Qdisc *sch, const struct nlattr *attr)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	const struct tc_netem_corr *c = nla_data(attr);

	init_crandom(&q->delay_cor, c->delay_corr);
	init_crandom(&q->loss_cor, c->loss_corr);
	init_crandom(&q->dup_cor, c->dup_corr);
}

static void get_reorder(struct Qdisc *sch, const struct nlattr *attr)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	const struct tc_netem_reorder *r = nla_data(attr);

	q->reorder = r->probability;
	init_crandom(&q->reorder_cor, r->correlation);
}

static void get_corrupt(struct Qdisc *sch, const struct nlattr *attr)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	const struct tc_netem_corrupt *r = nla_data(attr);

	q->corrupt = r->probability;
	init_crandom(&q->corrupt_cor, r->correlation);
}

static const struct nla_policy netem_policy[TCA_NETEM_MAX + 1] = {
	[TCA_NETEM_CORR]	= { .len = sizeof(struct tc_netem_corr) },
	[TCA_NETEM_REORDER]	= { .len = sizeof(struct tc_netem_reorder) },
	[TCA_NETEM_CORRUPT]	= { .len = sizeof(struct tc_netem_corrupt) },
};

static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla,
		      const struct nla_policy *policy, int len)
{
	int nested_len = nla_len(nla) - NLA_ALIGN(len);

	if (nested_len < 0)
		return -EINVAL;
	if (nested_len >= nla_attr_size(0))
		return nla_parse(tb, maxtype, nla_data(nla) + NLA_ALIGN(len),
				 nested_len, policy);
	memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1));
	return 0;
}
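
/*
 * parse_attr() exists for backwards compatibility: netem's TCA_OPTIONS
 * payload is a fixed struct tc_netem_qopt, optionally followed by nested
 * attributes (correlation, delay distribution, reorder, corrupt). The
 * helper skips the aligned fixed header and parses whatever attributes
 * remain, or zeroes the attribute table when only the old-style struct
 * was supplied.
 */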

/* Parse netlink message to set options */
static int netem_change(struct Qdisc *sch, struct nlattr *opt)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	struct nlattr *tb[TCA_NETEM_MAX + 1];
	struct tc_netem_qopt *qopt;
	int ret;

	if (opt == NULL)
		return -EINVAL;

	qopt = nla_data(opt);
	ret = parse_attr(tb, TCA_NETEM_MAX, opt, netem_policy, sizeof(*qopt));
	if (ret < 0)
		return ret;

	ret = fifo_set_limit(q->qdisc, qopt->limit);
	if (ret) {
		pr_debug("netem: can't set fifo limit\n");
		return ret;
	}

	q->latency = qopt->latency;
	q->jitter = qopt->jitter;
	q->limit = qopt->limit;
	q->gap = qopt->gap;
	q->counter = 0;
	q->loss = qopt->loss;
	q->duplicate = qopt->duplicate;

	/* For compatibility with earlier versions:
	 * if gap is set, assume 100% reorder probability.
	 */
	if (q->gap)
		q->reorder = ~0;

	if (tb[TCA_NETEM_CORR])
		get_correlation(sch, tb[TCA_NETEM_CORR]);

	if (tb[TCA_NETEM_DELAY_DIST]) {
		ret = get_dist_table(sch, tb[TCA_NETEM_DELAY_DIST]);
		if (ret)
			return ret;
	}

	if (tb[TCA_NETEM_REORDER])
		get_reorder(sch, tb[TCA_NETEM_REORDER]);

	if (tb[TCA_NETEM_CORRUPT])
		get_corrupt(sch, tb[TCA_NETEM_CORRUPT]);

	return 0;
}

/*
 * Special-case version of FIFO queue for use by netem.
 * It queues packets in order of the timestamps in their skb control blocks.
 */
struct fifo_sched_data {
	u32 limit;
	psched_time_t oldest;
};

static int tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
{
	struct fifo_sched_data *q = qdisc_priv(sch);
	struct sk_buff_head *list = &sch->q;
	psched_time_t tnext = netem_skb_cb(nskb)->time_to_send;
	struct sk_buff *skb;

	if (likely(skb_queue_len(list) < q->limit)) {
		/* Optimize for add at tail */
		if (likely(skb_queue_empty(list) || tnext >= q->oldest)) {
			q->oldest = tnext;
			return qdisc_enqueue_tail(nskb, sch);
		}

		skb_queue_reverse_walk(list, skb) {
			const struct netem_skb_cb *cb = netem_skb_cb(skb);

			if (tnext >= cb->time_to_send)
				break;
		}

		__skb_queue_after(list, skb, nskb);

		sch->qstats.backlog += qdisc_pkt_len(nskb);
		sch->bstats.bytes += qdisc_pkt_len(nskb);
		sch->bstats.packets++;

		return NET_XMIT_SUCCESS;
	}

	return qdisc_reshape_fail(nskb, sch);
}
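
/*
 * tfifo_enqueue() keeps the internal queue sorted by time_to_send: the
 * common case of monotonically increasing send times appends at the tail
 * in O(1), while an out-of-order timestamp walks the list from the tail,
 * so the worst case is linear in queue length. Because the queue stays
 * time-ordered, netem_dequeue() only ever needs to look at the head.
 */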

static int tfifo_init(struct Qdisc *sch, struct nlattr *opt)
{
	struct fifo_sched_data *q = qdisc_priv(sch);

	if (opt) {
		struct tc_fifo_qopt *ctl = nla_data(opt);
		if (nla_len(opt) < sizeof(*ctl))
			return -EINVAL;

		q->limit = ctl->limit;
	} else
		q->limit = max_t(u32, qdisc_dev(sch)->tx_queue_len, 1);

	q->oldest = PSCHED_PASTPERFECT;
	return 0;
}

static int tfifo_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct fifo_sched_data *q = qdisc_priv(sch);
	struct tc_fifo_qopt opt = { .limit = q->limit };

	NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
	return skb->len;

nla_put_failure:
	return -1;
}

static struct Qdisc_ops tfifo_qdisc_ops __read_mostly = {
	.id		=	"tfifo",
	.priv_size	=	sizeof(struct fifo_sched_data),
	.enqueue	=	tfifo_enqueue,
	.dequeue	=	qdisc_dequeue_head,
	.peek		=	qdisc_peek_head,
	.drop		=	qdisc_queue_drop,
	.init		=	tfifo_init,
	.reset		=	qdisc_reset_queue,
	.change		=	tfifo_init,
	.dump		=	tfifo_dump,
};

static int netem_init(struct Qdisc *sch, struct nlattr *opt)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	int ret;

	if (!opt)
		return -EINVAL;

	qdisc_watchdog_init(&q->watchdog, sch);

	q->qdisc = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue,
				     &tfifo_qdisc_ops,
				     TC_H_MAKE(sch->handle, 1));
	if (!q->qdisc) {
		pr_debug("netem: qdisc create failed\n");
		return -ENOMEM;
	}

	ret = netem_change(sch, opt);
	if (ret) {
		pr_debug("netem: change failed\n");
		qdisc_destroy(q->qdisc);
	}
	return ret;
}

static void netem_destroy(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	qdisc_watchdog_cancel(&q->watchdog);
	qdisc_destroy(q->qdisc);
	kfree(q->delay_dist);
}

static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	const struct netem_sched_data *q = qdisc_priv(sch);
	unsigned char *b = skb_tail_pointer(skb);
	struct nlattr *nla = (struct nlattr *) b;
	struct tc_netem_qopt qopt;
	struct tc_netem_corr cor;
	struct tc_netem_reorder reorder;
	struct tc_netem_corrupt corrupt;

	qopt.latency = q->latency;
	qopt.jitter = q->jitter;
	qopt.limit = q->limit;
	qopt.loss = q->loss;
	qopt.gap = q->gap;
	qopt.duplicate = q->duplicate;
	NLA_PUT(skb, TCA_OPTIONS, sizeof(qopt), &qopt);

	cor.delay_corr = q->delay_cor.rho;
	cor.loss_corr = q->loss_cor.rho;
	cor.dup_corr = q->dup_cor.rho;
	NLA_PUT(skb, TCA_NETEM_CORR, sizeof(cor), &cor);

	reorder.probability = q->reorder;
	reorder.correlation = q->reorder_cor.rho;
	NLA_PUT(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder);

	corrupt.probability = q->corrupt;
	corrupt.correlation = q->corrupt_cor.rho;
	NLA_PUT(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt);

	nla->nla_len = skb_tail_pointer(skb) - b;

	return skb->len;

nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static struct Qdisc_ops netem_qdisc_ops __read_mostly = {
	.id		=	"netem",
	.priv_size	=	sizeof(struct netem_sched_data),
	.enqueue	=	netem_enqueue,
	.dequeue	=	netem_dequeue,
	.peek		=	qdisc_peek_dequeued,
	.drop		=	netem_drop,
	.init		=	netem_init,
	.reset		=	netem_reset,
	.destroy	=	netem_destroy,
	.change		=	netem_change,
	.dump		=	netem_dump,
	.owner		=	THIS_MODULE,
};


static int __init netem_module_init(void)
{
	pr_info("netem: version " VERSION "\n");
	return register_qdisc(&netem_qdisc_ops);
}
static void __exit netem_module_exit(void)
{
	unregister_qdisc(&netem_qdisc_ops);
}
module_init(netem_module_init)
module_exit(netem_module_exit)
MODULE_LICENSE("GPL");