/*
 * Common Block IO controller cgroup interface
 *
 * Based on ideas and code from CFQ, CFS and BFQ:
 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
 *
 * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
 *		      Paolo Valente <paolo.valente@unimore.it>
 *
 * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
 *		      Nauman Rafique <nauman@google.com>
 */
#include <linux/ioprio.h>
#include <linux/seq_file.h>
#include <linux/kdev_t.h>
#include <linux/module.h>
#include <linux/err.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include "blk-cgroup.h"
#include <linux/genhd.h>

#define MAX_KEY_LEN 100

static DEFINE_SPINLOCK(blkio_list_lock);
static LIST_HEAD(blkio_list);

struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT };
EXPORT_SYMBOL_GPL(blkio_root_cgroup);

static struct cgroup_subsys_state *blkiocg_create(struct cgroup_subsys *,
						  struct cgroup *);
static int blkiocg_can_attach(struct cgroup_subsys *, struct cgroup *,
			      struct task_struct *, bool);
static void blkiocg_attach(struct cgroup_subsys *, struct cgroup *,
			   struct cgroup *, struct task_struct *, bool);
static void blkiocg_destroy(struct cgroup_subsys *, struct cgroup *);
static int blkiocg_populate(struct cgroup_subsys *, struct cgroup *);

struct cgroup_subsys blkio_subsys = {
	.name = "blkio",
	.create = blkiocg_create,
	.can_attach = blkiocg_can_attach,
	.attach = blkiocg_attach,
	.destroy = blkiocg_destroy,
	.populate = blkiocg_populate,
#ifdef CONFIG_BLK_CGROUP
	/* note: blkio_subsys_id is otherwise defined in blk-cgroup.h */
	.subsys_id = blkio_subsys_id,
#endif
	.use_id = 1,
	.module = THIS_MODULE,
};
EXPORT_SYMBOL_GPL(blkio_subsys);

/* Must be called with blkcg->lock held */
static inline void blkio_policy_insert_node(struct blkio_cgroup *blkcg,
					    struct blkio_policy_node *pn)
{
	list_add(&pn->node, &blkcg->policy_list);
}

/* Must be called with blkcg->lock held */
static inline void blkio_policy_delete_node(struct blkio_policy_node *pn)
{
	list_del(&pn->node);
}

/* Must be called with blkcg->lock held */
static struct blkio_policy_node *
blkio_policy_search_node(const struct blkio_cgroup *blkcg, dev_t dev)
{
	struct blkio_policy_node *pn;

	list_for_each_entry(pn, &blkcg->policy_list, node) {
		if (pn->dev == dev)
			return pn;
	}

	return NULL;
}

struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup)
{
	return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id),
			    struct blkio_cgroup, css);
}
EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup);

/*
 * Add to the appropriate stat variable depending on the request type.
 * This should be called with the blkg->stats_lock held.
 */
static void blkio_add_stat(uint64_t *stat, uint64_t add, bool direction,
				bool sync)
{
	if (direction)
		stat[BLKIO_STAT_WRITE] += add;
	else
		stat[BLKIO_STAT_READ] += add;
	if (sync)
		stat[BLKIO_STAT_SYNC] += add;
	else
		stat[BLKIO_STAT_ASYNC] += add;
}
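
/*
 * Example: blkio_add_stat(stat, 1, false, true) accounts one synchronous
 * read, bumping both the BLKIO_STAT_READ and BLKIO_STAT_SYNC buckets.
 * Read + Write totals are only summed up at read-out time, in
 * blkio_get_stat().
 */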

/*
 * Decrements the appropriate stat variable depending on the request type.
 * BUGs if the value is already zero.
 * This should be called with the blkg->stats_lock held.
 */
static void blkio_check_and_dec_stat(uint64_t *stat, bool direction, bool sync)
{
	if (direction) {
		BUG_ON(stat[BLKIO_STAT_WRITE] == 0);
		stat[BLKIO_STAT_WRITE]--;
	} else {
		BUG_ON(stat[BLKIO_STAT_READ] == 0);
		stat[BLKIO_STAT_READ]--;
	}
	if (sync) {
		BUG_ON(stat[BLKIO_STAT_SYNC] == 0);
		stat[BLKIO_STAT_SYNC]--;
	} else {
		BUG_ON(stat[BLKIO_STAT_ASYNC] == 0);
		stat[BLKIO_STAT_ASYNC]--;
	}
}

#ifdef CONFIG_DEBUG_BLK_CGROUP
/* This should be called with the blkg->stats_lock held. */
static void blkio_set_start_group_wait_time(struct blkio_group *blkg,
						struct blkio_group *curr_blkg)
{
	if (blkio_blkg_waiting(&blkg->stats))
		return;
	if (blkg == curr_blkg)
		return;
	blkg->stats.start_group_wait_time = sched_clock();
	blkio_mark_blkg_waiting(&blkg->stats);
}

/* This should be called with the blkg->stats_lock held. */
static void blkio_update_group_wait_time(struct blkio_group_stats *stats)
{
	unsigned long long now;

	if (!blkio_blkg_waiting(stats))
		return;

	now = sched_clock();
	if (time_after64(now, stats->start_group_wait_time))
		stats->group_wait_time += now - stats->start_group_wait_time;
	blkio_clear_blkg_waiting(stats);
}

/* This should be called with the blkg->stats_lock held. */
static void blkio_end_empty_time(struct blkio_group_stats *stats)
{
	unsigned long long now;

	if (!blkio_blkg_empty(stats))
		return;

	now = sched_clock();
	if (time_after64(now, stats->start_empty_time))
		stats->empty_time += now - stats->start_empty_time;
	blkio_clear_blkg_empty(stats);
}

void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg)
{
	unsigned long flags;

	spin_lock_irqsave(&blkg->stats_lock, flags);
	BUG_ON(blkio_blkg_idling(&blkg->stats));
	blkg->stats.start_idle_time = sched_clock();
	blkio_mark_blkg_idling(&blkg->stats);
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_set_idle_time_stats);

void blkiocg_update_idle_time_stats(struct blkio_group *blkg)
{
	unsigned long flags;
	unsigned long long now;
	struct blkio_group_stats *stats;

	spin_lock_irqsave(&blkg->stats_lock, flags);
	stats = &blkg->stats;
	if (blkio_blkg_idling(stats)) {
		now = sched_clock();
		if (time_after64(now, stats->start_idle_time))
			stats->idle_time += now - stats->start_idle_time;
		blkio_clear_blkg_idling(stats);
	}
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_idle_time_stats);

void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg)
{
	unsigned long flags;
	struct blkio_group_stats *stats;

	spin_lock_irqsave(&blkg->stats_lock, flags);
	stats = &blkg->stats;
	stats->avg_queue_size_sum +=
			stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] +
			stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE];
	stats->avg_queue_size_samples++;
	blkio_update_group_wait_time(stats);
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_avg_queue_size_stats);
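
/*
 * The average queue size itself is only derived at read time:
 * blkio_get_stat() divides avg_queue_size_sum by avg_queue_size_samples
 * when BLKIO_STAT_AVG_QUEUE_SIZE is requested.
 */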

void blkiocg_set_start_empty_time(struct blkio_group *blkg)
{
	unsigned long flags;
	struct blkio_group_stats *stats;

	spin_lock_irqsave(&blkg->stats_lock, flags);
	stats = &blkg->stats;

	if (stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] ||
			stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE]) {
		spin_unlock_irqrestore(&blkg->stats_lock, flags);
		return;
	}

	/*
	 * The group is already marked empty. This can happen if the cfqq got
	 * a new request in the parent group and moved to this group while it
	 * was being added to the service tree. Just ignore the event and
	 * move on.
	 */
	if (blkio_blkg_empty(stats)) {
		spin_unlock_irqrestore(&blkg->stats_lock, flags);
		return;
	}

	stats->start_empty_time = sched_clock();
	blkio_mark_blkg_empty(stats);
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_set_start_empty_time);

void blkiocg_update_dequeue_stats(struct blkio_group *blkg,
			unsigned long dequeue)
{
	blkg->stats.dequeue += dequeue;
}
EXPORT_SYMBOL_GPL(blkiocg_update_dequeue_stats);
#else
static inline void blkio_set_start_group_wait_time(struct blkio_group *blkg,
					struct blkio_group *curr_blkg) {}
static inline void blkio_end_empty_time(struct blkio_group_stats *stats) {}
#endif

void blkiocg_update_io_add_stats(struct blkio_group *blkg,
			struct blkio_group *curr_blkg, bool direction,
			bool sync)
{
	unsigned long flags;

	spin_lock_irqsave(&blkg->stats_lock, flags);
	blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED], 1, direction,
			sync);
	blkio_end_empty_time(&blkg->stats);
	blkio_set_start_group_wait_time(blkg, curr_blkg);
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_io_add_stats);

void blkiocg_update_io_remove_stats(struct blkio_group *blkg,
						bool direction, bool sync)
{
	unsigned long flags;

	spin_lock_irqsave(&blkg->stats_lock, flags);
	blkio_check_and_dec_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED],
					direction, sync);
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_io_remove_stats);

void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time)
{
	unsigned long flags;

	spin_lock_irqsave(&blkg->stats_lock, flags);
	blkg->stats.time += time;
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used);

void blkiocg_update_dispatch_stats(struct blkio_group *blkg,
				uint64_t bytes, bool direction, bool sync)
{
	struct blkio_group_stats *stats;
	unsigned long flags;

	spin_lock_irqsave(&blkg->stats_lock, flags);
	stats = &blkg->stats;
	stats->sectors += bytes >> 9;
	blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICED], 1, direction,
			sync);
	blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_BYTES], bytes,
			direction, sync);
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_dispatch_stats);
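
/*
 * Example: dispatching one 4KB synchronous read adds 8 to stats->sectors
 * (bytes >> 9, i.e. 512-byte sectors), 1 to the Read and Sync buckets of
 * BLKIO_STAT_SERVICED and 4096 to the same buckets of
 * BLKIO_STAT_SERVICE_BYTES.
 */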

void blkiocg_update_completion_stats(struct blkio_group *blkg,
	uint64_t start_time, uint64_t io_start_time, bool direction, bool sync)
{
	struct blkio_group_stats *stats;
	unsigned long flags;
	unsigned long long now = sched_clock();

	spin_lock_irqsave(&blkg->stats_lock, flags);
	stats = &blkg->stats;
	if (time_after64(now, io_start_time))
		blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_TIME],
				now - io_start_time, direction, sync);
	if (time_after64(io_start_time, start_time))
		blkio_add_stat(stats->stat_arr[BLKIO_STAT_WAIT_TIME],
				io_start_time - start_time, direction, sync);
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_completion_stats);

void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction,
					bool sync)
{
	unsigned long flags;

	spin_lock_irqsave(&blkg->stats_lock, flags);
	blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_MERGED], 1, direction,
			sync);
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats);

void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
			struct blkio_group *blkg, void *key, dev_t dev)
{
	unsigned long flags;

	spin_lock_irqsave(&blkcg->lock, flags);
	spin_lock_init(&blkg->stats_lock);
	rcu_assign_pointer(blkg->key, key);
	blkg->blkcg_id = css_id(&blkcg->css);
	hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
	spin_unlock_irqrestore(&blkcg->lock, flags);
	/* Need to take css reference ? */
	cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path));
	blkg->dev = dev;
}
EXPORT_SYMBOL_GPL(blkiocg_add_blkio_group);

static void __blkiocg_del_blkio_group(struct blkio_group *blkg)
{
	hlist_del_init_rcu(&blkg->blkcg_node);
	blkg->blkcg_id = 0;
}

/*
 * Returns 0 if the blkio_group was still on the cgroup list. Otherwise
 * returns 1, indicating that the blkio_group was unhashed by the time we
 * got to it.
 */
int blkiocg_del_blkio_group(struct blkio_group *blkg)
{
	struct blkio_cgroup *blkcg;
	unsigned long flags;
	struct cgroup_subsys_state *css;
	int ret = 1;

	rcu_read_lock();
	css = css_lookup(&blkio_subsys, blkg->blkcg_id);
	if (css) {
		blkcg = container_of(css, struct blkio_cgroup, css);
		spin_lock_irqsave(&blkcg->lock, flags);
		if (!hlist_unhashed(&blkg->blkcg_node)) {
			__blkiocg_del_blkio_group(blkg);
			ret = 0;
		}
		spin_unlock_irqrestore(&blkcg->lock, flags);
	}

	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL_GPL(blkiocg_del_blkio_group);

/* called under rcu_read_lock(). */
struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key)
{
	struct blkio_group *blkg;
	struct hlist_node *n;
	void *__key;

	hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {
		__key = blkg->key;
		if (__key == key)
			return blkg;
	}

	return NULL;
}
EXPORT_SYMBOL_GPL(blkiocg_lookup_group);

#define SHOW_FUNCTION(__VAR)						\
static u64 blkiocg_##__VAR##_read(struct cgroup *cgroup,		\
				       struct cftype *cftype)		\
{									\
	struct blkio_cgroup *blkcg;					\
									\
	blkcg = cgroup_to_blkio_cgroup(cgroup);				\
	return (u64)blkcg->__VAR;					\
}

SHOW_FUNCTION(weight);
#undef SHOW_FUNCTION

static int
blkiocg_weight_write(struct cgroup *cgroup, struct cftype *cftype, u64 val)
{
	struct blkio_cgroup *blkcg;
	struct blkio_group *blkg;
	struct hlist_node *n;
	struct blkio_policy_type *blkiop;
	struct blkio_policy_node *pn;

	if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX)
		return -EINVAL;

	blkcg = cgroup_to_blkio_cgroup(cgroup);
	spin_lock(&blkio_list_lock);
	spin_lock_irq(&blkcg->lock);
	blkcg->weight = (unsigned int)val;

	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
		pn = blkio_policy_search_node(blkcg, blkg->dev);

		if (pn)
			continue;

		list_for_each_entry(blkiop, &blkio_list, list)
			blkiop->ops.blkio_update_group_weight_fn(blkg,
					blkcg->weight);
	}
	spin_unlock_irq(&blkcg->lock);
	spin_unlock(&blkio_list_lock);
	return 0;
}
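
/*
 * Note that groups which have a per-device rule (a blkio_policy_node for
 * blkg->dev) are skipped above: a device-specific weight always overrides
 * the cgroup-wide default written through blkio.weight.
 */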

static int
blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val)
{
	struct blkio_cgroup *blkcg;
	struct blkio_group *blkg;
	struct blkio_group_stats *stats;
	struct hlist_node *n;
	uint64_t queued[BLKIO_STAT_TOTAL];
	int i;
#ifdef CONFIG_DEBUG_BLK_CGROUP
	bool idling, waiting, empty;
	unsigned long long now = sched_clock();
#endif

	blkcg = cgroup_to_blkio_cgroup(cgroup);
	spin_lock_irq(&blkcg->lock);
	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
		spin_lock(&blkg->stats_lock);
		stats = &blkg->stats;
#ifdef CONFIG_DEBUG_BLK_CGROUP
		idling = blkio_blkg_idling(stats);
		waiting = blkio_blkg_waiting(stats);
		empty = blkio_blkg_empty(stats);
#endif
		for (i = 0; i < BLKIO_STAT_TOTAL; i++)
			queued[i] = stats->stat_arr[BLKIO_STAT_QUEUED][i];
		memset(stats, 0, sizeof(struct blkio_group_stats));
		for (i = 0; i < BLKIO_STAT_TOTAL; i++)
			stats->stat_arr[BLKIO_STAT_QUEUED][i] = queued[i];
#ifdef CONFIG_DEBUG_BLK_CGROUP
		if (idling) {
			blkio_mark_blkg_idling(stats);
			stats->start_idle_time = now;
		}
		if (waiting) {
			blkio_mark_blkg_waiting(stats);
			stats->start_group_wait_time = now;
		}
		if (empty) {
			blkio_mark_blkg_empty(stats);
			stats->start_empty_time = now;
		}
#endif
		spin_unlock(&blkg->stats_lock);
	}
	spin_unlock_irq(&blkcg->lock);
	return 0;
}
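
/*
 * The QUEUED counters are deliberately carried across the reset: the
 * corresponding requests are still queued, and zeroing the counters here
 * would trip the BUG_ON() in blkio_check_and_dec_stat() once those requests
 * are dispatched. The idling/waiting/empty debug states are similarly
 * preserved, restarted from the current time.
 */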

static void blkio_get_key_name(enum stat_sub_type type, dev_t dev, char *str,
				int chars_left, bool diskname_only)
{
	snprintf(str, chars_left, "%d:%d", MAJOR(dev), MINOR(dev));
	chars_left -= strlen(str);
	if (chars_left <= 0) {
		printk(KERN_WARNING
			"Possibly incorrect cgroup stat display format");
		return;
	}
	if (diskname_only)
		return;
	switch (type) {
	case BLKIO_STAT_READ:
		strlcat(str, " Read", chars_left);
		break;
	case BLKIO_STAT_WRITE:
		strlcat(str, " Write", chars_left);
		break;
	case BLKIO_STAT_SYNC:
		strlcat(str, " Sync", chars_left);
		break;
	case BLKIO_STAT_ASYNC:
		strlcat(str, " Async", chars_left);
		break;
	case BLKIO_STAT_TOTAL:
		strlcat(str, " Total", chars_left);
		break;
	default:
		strlcat(str, " Invalid", chars_left);
	}
}
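
/*
 * Example: for dev 8:16 and BLKIO_STAT_WRITE the key becomes "8:16 Write";
 * with diskname_only set it is just "8:16".
 */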

static uint64_t blkio_fill_stat(char *str, int chars_left, uint64_t val,
				struct cgroup_map_cb *cb, dev_t dev)
{
	blkio_get_key_name(0, dev, str, chars_left, true);
	cb->fill(cb, str, val);
	return val;
}

/* This should be called with blkg->stats_lock held */
static uint64_t blkio_get_stat(struct blkio_group *blkg,
		struct cgroup_map_cb *cb, dev_t dev, enum stat_type type)
{
	uint64_t disk_total;
	char key_str[MAX_KEY_LEN];
	enum stat_sub_type sub_type;

	if (type == BLKIO_STAT_TIME)
		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
					blkg->stats.time, cb, dev);
	if (type == BLKIO_STAT_SECTORS)
		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
					blkg->stats.sectors, cb, dev);
#ifdef CONFIG_DEBUG_BLK_CGROUP
	if (type == BLKIO_STAT_AVG_QUEUE_SIZE) {
		uint64_t sum = blkg->stats.avg_queue_size_sum;
		uint64_t samples = blkg->stats.avg_queue_size_samples;
		if (samples)
			do_div(sum, samples);
		else
			sum = 0;
		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, sum, cb, dev);
	}
	if (type == BLKIO_STAT_GROUP_WAIT_TIME)
		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
					blkg->stats.group_wait_time, cb, dev);
	if (type == BLKIO_STAT_IDLE_TIME)
		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
					blkg->stats.idle_time, cb, dev);
	if (type == BLKIO_STAT_EMPTY_TIME)
		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
					blkg->stats.empty_time, cb, dev);
	if (type == BLKIO_STAT_DEQUEUE)
		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
					blkg->stats.dequeue, cb, dev);
#endif

	for (sub_type = BLKIO_STAT_READ; sub_type < BLKIO_STAT_TOTAL;
			sub_type++) {
		blkio_get_key_name(sub_type, dev, key_str, MAX_KEY_LEN, false);
		cb->fill(cb, key_str, blkg->stats.stat_arr[type][sub_type]);
	}
	disk_total = blkg->stats.stat_arr[type][BLKIO_STAT_READ] +
			blkg->stats.stat_arr[type][BLKIO_STAT_WRITE];
	blkio_get_key_name(BLKIO_STAT_TOTAL, dev, key_str, MAX_KEY_LEN, false);
	cb->fill(cb, key_str, disk_total);
	return disk_total;
}

#define SHOW_FUNCTION_PER_GROUP(__VAR, type, show_total)		\
static int blkiocg_##__VAR##_read(struct cgroup *cgroup,		\
		struct cftype *cftype, struct cgroup_map_cb *cb)	\
{									\
	struct blkio_cgroup *blkcg;					\
	struct blkio_group *blkg;					\
	struct hlist_node *n;						\
	uint64_t cgroup_total = 0;					\
									\
	if (!cgroup_lock_live_group(cgroup))				\
		return -ENODEV;						\
									\
	blkcg = cgroup_to_blkio_cgroup(cgroup);				\
	rcu_read_lock();						\
	hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {\
		if (blkg->dev) {					\
			spin_lock_irq(&blkg->stats_lock);		\
			cgroup_total += blkio_get_stat(blkg, cb,	\
						blkg->dev, type);	\
			spin_unlock_irq(&blkg->stats_lock);		\
		}							\
	}								\
	if (show_total)							\
		cb->fill(cb, "Total", cgroup_total);			\
	rcu_read_unlock();						\
	cgroup_unlock();						\
	return 0;							\
}
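
/*
 * The instantiations below expand to the blkiocg_*_read() map handlers that
 * are wired into blkio_files[] further down; show_total controls whether a
 * cgroup-wide "Total" line is emitted.
 */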

SHOW_FUNCTION_PER_GROUP(time, BLKIO_STAT_TIME, 0);
SHOW_FUNCTION_PER_GROUP(sectors, BLKIO_STAT_SECTORS, 0);
SHOW_FUNCTION_PER_GROUP(io_service_bytes, BLKIO_STAT_SERVICE_BYTES, 1);
SHOW_FUNCTION_PER_GROUP(io_serviced, BLKIO_STAT_SERVICED, 1);
SHOW_FUNCTION_PER_GROUP(io_service_time, BLKIO_STAT_SERVICE_TIME, 1);
SHOW_FUNCTION_PER_GROUP(io_wait_time, BLKIO_STAT_WAIT_TIME, 1);
SHOW_FUNCTION_PER_GROUP(io_merged, BLKIO_STAT_MERGED, 1);
SHOW_FUNCTION_PER_GROUP(io_queued, BLKIO_STAT_QUEUED, 1);
#ifdef CONFIG_DEBUG_BLK_CGROUP
SHOW_FUNCTION_PER_GROUP(dequeue, BLKIO_STAT_DEQUEUE, 0);
SHOW_FUNCTION_PER_GROUP(avg_queue_size, BLKIO_STAT_AVG_QUEUE_SIZE, 0);
SHOW_FUNCTION_PER_GROUP(group_wait_time, BLKIO_STAT_GROUP_WAIT_TIME, 0);
SHOW_FUNCTION_PER_GROUP(idle_time, BLKIO_STAT_IDLE_TIME, 0);
SHOW_FUNCTION_PER_GROUP(empty_time, BLKIO_STAT_EMPTY_TIME, 0);
#endif
#undef SHOW_FUNCTION_PER_GROUP

static int blkio_check_dev_num(dev_t dev)
{
	int part = 0;
	struct gendisk *disk;

	disk = get_gendisk(dev, &part);
	if (!disk || part)
		return -ENODEV;

	return 0;
}

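/*
 * Parse a per-device weight rule of the form "<major>:<minor> <weight>",
 * e.g. "8:16 300" as written to blkio.weight_device. A weight of 0 removes
 * the rule for that device (see blkiocg_weight_device_write()).
 */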
static int blkio_policy_parse_and_set(char *buf,
				      struct blkio_policy_node *newpn)
{
	char *s[4], *p, *major_s = NULL, *minor_s = NULL;
	int ret;
	unsigned long major, minor, temp;
	int i = 0;
	dev_t dev;

	memset(s, 0, sizeof(s));

	while ((p = strsep(&buf, " ")) != NULL) {
		if (!*p)
			continue;

		s[i++] = p;

		/* Don't parse past three fields; extra input is rejected below */
		if (i == 3)
			break;
	}

	if (i != 2)
		return -EINVAL;

	p = strsep(&s[0], ":");
	if (p != NULL)
		major_s = p;
	else
		return -EINVAL;

	minor_s = s[0];
	if (!minor_s)
		return -EINVAL;

	ret = strict_strtoul(major_s, 10, &major);
	if (ret)
		return -EINVAL;

	ret = strict_strtoul(minor_s, 10, &minor);
	if (ret)
		return -EINVAL;

	dev = MKDEV(major, minor);

	ret = blkio_check_dev_num(dev);
	if (ret)
		return ret;

	newpn->dev = dev;

	if (s[1] == NULL)
		return -EINVAL;

	ret = strict_strtoul(s[1], 10, &temp);
	if (ret || (temp < BLKIO_WEIGHT_MIN && temp > 0) ||
	    temp > BLKIO_WEIGHT_MAX)
		return -EINVAL;

	newpn->weight = temp;

	return 0;
}

unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
			      dev_t dev)
{
	struct blkio_policy_node *pn;

	pn = blkio_policy_search_node(blkcg, dev);
	if (pn)
		return pn->weight;
	else
		return blkcg->weight;
}
EXPORT_SYMBOL_GPL(blkcg_get_weight);

static int blkiocg_weight_device_write(struct cgroup *cgrp, struct cftype *cft,
				       const char *buffer)
{
	int ret = 0;
	char *buf;
	struct blkio_policy_node *newpn, *pn;
	struct blkio_cgroup *blkcg;
	struct blkio_group *blkg;
	int keep_newpn = 0;
	struct hlist_node *n;
	struct blkio_policy_type *blkiop;

	buf = kstrdup(buffer, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;

	newpn = kzalloc(sizeof(*newpn), GFP_KERNEL);
	if (!newpn) {
		ret = -ENOMEM;
		goto free_buf;
	}

	ret = blkio_policy_parse_and_set(buf, newpn);
	if (ret)
		goto free_newpn;

	blkcg = cgroup_to_blkio_cgroup(cgrp);

	spin_lock_irq(&blkcg->lock);

	pn = blkio_policy_search_node(blkcg, newpn->dev);
	if (!pn) {
		if (newpn->weight != 0) {
			blkio_policy_insert_node(blkcg, newpn);
			keep_newpn = 1;
		}
		spin_unlock_irq(&blkcg->lock);
		goto update_io_group;
	}

	if (newpn->weight == 0) {
		/* weight == 0 means deleting a specific weight */
		blkio_policy_delete_node(pn);
		spin_unlock_irq(&blkcg->lock);
		goto update_io_group;
	}

	pn->weight = newpn->weight;
	spin_unlock_irq(&blkcg->lock);

update_io_group:
	/* update weight for each cfqg */
	spin_lock(&blkio_list_lock);
	spin_lock_irq(&blkcg->lock);

	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
		if (newpn->dev == blkg->dev) {
			list_for_each_entry(blkiop, &blkio_list, list)
				blkiop->ops.blkio_update_group_weight_fn(blkg,
							 newpn->weight ?
							 newpn->weight :
							 blkcg->weight);
		}
	}

	spin_unlock_irq(&blkcg->lock);
	spin_unlock(&blkio_list_lock);

free_newpn:
	if (!keep_newpn)
		kfree(newpn);
free_buf:
	kfree(buf);
	return ret;
}

static int blkiocg_weight_device_read(struct cgroup *cgrp, struct cftype *cft,
				      struct seq_file *m)
{
	struct blkio_cgroup *blkcg;
	struct blkio_policy_node *pn;

	seq_printf(m, "dev\tweight\n");

	blkcg = cgroup_to_blkio_cgroup(cgrp);
	if (!list_empty(&blkcg->policy_list)) {
		spin_lock_irq(&blkcg->lock);
		list_for_each_entry(pn, &blkcg->policy_list, node) {
			seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev),
				   MINOR(pn->dev), pn->weight);
		}
		spin_unlock_irq(&blkcg->lock);
	}

	return 0;
}
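
/*
 * Example output for a single rule on 8:16 with weight 300:
 *
 *	dev	weight
 *	8:16	300
 */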

struct cftype blkio_files[] = {
	{
		.name = "weight_device",
		.read_seq_string = blkiocg_weight_device_read,
		.write_string = blkiocg_weight_device_write,
		.max_write_len = 256,
	},
	{
		.name = "weight",
		.read_u64 = blkiocg_weight_read,
		.write_u64 = blkiocg_weight_write,
	},
	{
		.name = "time",
		.read_map = blkiocg_time_read,
	},
	{
		.name = "sectors",
		.read_map = blkiocg_sectors_read,
	},
	{
		.name = "io_service_bytes",
		.read_map = blkiocg_io_service_bytes_read,
	},
	{
		.name = "io_serviced",
		.read_map = blkiocg_io_serviced_read,
	},
	{
		.name = "io_service_time",
		.read_map = blkiocg_io_service_time_read,
	},
	{
		.name = "io_wait_time",
		.read_map = blkiocg_io_wait_time_read,
	},
	{
		.name = "io_merged",
		.read_map = blkiocg_io_merged_read,
	},
	{
		.name = "io_queued",
		.read_map = blkiocg_io_queued_read,
	},
	{
		.name = "reset_stats",
		.write_u64 = blkiocg_reset_stats,
	},
#ifdef CONFIG_DEBUG_BLK_CGROUP
	{
		.name = "avg_queue_size",
		.read_map = blkiocg_avg_queue_size_read,
	},
	{
		.name = "group_wait_time",
		.read_map = blkiocg_group_wait_time_read,
	},
	{
		.name = "idle_time",
		.read_map = blkiocg_idle_time_read,
	},
	{
		.name = "empty_time",
		.read_map = blkiocg_empty_time_read,
	},
	{
		.name = "dequeue",
		.read_map = blkiocg_dequeue_read,
	},
#endif
};
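
/*
 * The cgroup core prefixes each file name with the subsystem name, so the
 * entries above show up as blkio.weight, blkio.weight_device,
 * blkio.io_service_bytes, etc. in every cgroup directory.
 */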

static int blkiocg_populate(struct cgroup_subsys *subsys, struct cgroup *cgroup)
{
	return cgroup_add_files(cgroup, subsys, blkio_files,
				ARRAY_SIZE(blkio_files));
}

static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup)
{
	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
	unsigned long flags;
	struct blkio_group *blkg;
	void *key;
	struct blkio_policy_type *blkiop;
	struct blkio_policy_node *pn, *pntmp;

	rcu_read_lock();
	do {
		spin_lock_irqsave(&blkcg->lock, flags);

		if (hlist_empty(&blkcg->blkg_list)) {
			spin_unlock_irqrestore(&blkcg->lock, flags);
			break;
		}

		blkg = hlist_entry(blkcg->blkg_list.first, struct blkio_group,
					blkcg_node);
		key = rcu_dereference(blkg->key);
		__blkiocg_del_blkio_group(blkg);

		spin_unlock_irqrestore(&blkcg->lock, flags);

		/*
		 * This blkio_group is being unlinked as the associated cgroup
		 * is going away. Let all the registered IO controlling
		 * policies know about this event.
		 */
		spin_lock(&blkio_list_lock);
		list_for_each_entry(blkiop, &blkio_list, list)
			blkiop->ops.blkio_unlink_group_fn(key, blkg);
		spin_unlock(&blkio_list_lock);
	} while (1);

	list_for_each_entry_safe(pn, pntmp, &blkcg->policy_list, node) {
		blkio_policy_delete_node(pn);
		kfree(pn);
	}

	free_css_id(&blkio_subsys, &blkcg->css);
	rcu_read_unlock();
	if (blkcg != &blkio_root_cgroup)
		kfree(blkcg);
}

static struct cgroup_subsys_state *
blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup)
{
	struct blkio_cgroup *blkcg;
	struct cgroup *parent = cgroup->parent;

	if (!parent) {
		blkcg = &blkio_root_cgroup;
		goto done;
	}

	/* Currently we do not support hierarchies deeper than two levels (0,1) */
	if (parent != cgroup->top_cgroup)
		return ERR_PTR(-EPERM);

	blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
	if (!blkcg)
		return ERR_PTR(-ENOMEM);

	blkcg->weight = BLKIO_WEIGHT_DEFAULT;
done:
	spin_lock_init(&blkcg->lock);
	INIT_HLIST_HEAD(&blkcg->blkg_list);

	INIT_LIST_HEAD(&blkcg->policy_list);
	return &blkcg->css;
}

/*
 * We cannot support shared io contexts, as we have no means to support
 * two tasks with the same ioc in two different groups without major rework
 * of the main cic data structures.  For now we allow a task to change
 * its cgroup only if it's the only owner of its ioc.
 */
static int blkiocg_can_attach(struct cgroup_subsys *subsys,
				struct cgroup *cgroup, struct task_struct *tsk,
				bool threadgroup)
{
	struct io_context *ioc;
	int ret = 0;

	/* task_lock() is needed to avoid races with exit_io_context() */
	task_lock(tsk);
	ioc = tsk->io_context;
	if (ioc && atomic_read(&ioc->nr_tasks) > 1)
		ret = -EINVAL;
	task_unlock(tsk);

	return ret;
}

static void blkiocg_attach(struct cgroup_subsys *subsys, struct cgroup *cgroup,
				struct cgroup *prev, struct task_struct *tsk,
				bool threadgroup)
{
	struct io_context *ioc;

	task_lock(tsk);
	ioc = tsk->io_context;
	if (ioc)
		ioc->cgroup_changed = 1;
	task_unlock(tsk);
}

void blkio_policy_register(struct blkio_policy_type *blkiop)
{
	spin_lock(&blkio_list_lock);
	list_add_tail(&blkiop->list, &blkio_list);
	spin_unlock(&blkio_list_lock);
}
EXPORT_SYMBOL_GPL(blkio_policy_register);

void blkio_policy_unregister(struct blkio_policy_type *blkiop)
{
	spin_lock(&blkio_list_lock);
	list_del_init(&blkiop->list);
	spin_unlock(&blkio_list_lock);
}
EXPORT_SYMBOL_GPL(blkio_policy_unregister);
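
/*
 * Illustrative sketch (not part of this file) of how an IO scheduler hooks
 * into the controller. The exact struct blkio_policy_type layout lives in
 * blk-cgroup.h; the CFQ hookup in cfq-iosched.c looks roughly like this:
 *
 *	static struct blkio_policy_type blkio_policy_cfq = {
 *		.ops = {
 *			.blkio_unlink_group_fn = cfq_unlink_blkio_group,
 *			.blkio_update_group_weight_fn =
 *						cfq_update_blkio_group_weight,
 *		},
 *	};
 *
 *	blkio_policy_register(&blkio_policy_cfq);	(at init)
 *	blkio_policy_unregister(&blkio_policy_cfq);	(at exit)
 */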

static int __init init_cgroup_blkio(void)
{
	return cgroup_load_subsys(&blkio_subsys);
}

static void __exit exit_cgroup_blkio(void)
{
	cgroup_unload_subsys(&blkio_subsys);
}

module_init(init_cgroup_blkio);
module_exit(exit_cgroup_blkio);
MODULE_LICENSE("GPL");