/*
 * CFQ, or complete fairness queueing, disk scheduler.
 *
 * Based on ideas from a previously unfinished io
 * scheduler (round robin per-process disk scheduling) and Andrea Arcangeli.
 *
 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
 */
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/elevator.h>
#include <linux/jiffies.h>
#include <linux/rbtree.h>
#include <linux/ioprio.h>
#include <linux/blktrace_api.h>
#include "cfq.h"

/*
 * tunables
 */
/* max queue in one round of service */
static const int cfq_quantum = 8;
static const int cfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
/* maximum backwards seek, in KiB */
static const int cfq_back_max = 16 * 1024;
/* penalty of a backwards seek */
static const int cfq_back_penalty = 2;
static const int cfq_slice_sync = HZ / 10;
static int cfq_slice_async = HZ / 25;
static const int cfq_slice_async_rq = 2;
static int cfq_slice_idle = HZ / 125;
static int cfq_group_idle = HZ / 125;
static const int cfq_target_latency = HZ * 3/10; /* 300 ms */
static const int cfq_hist_divisor = 4;

/*
 * offset from end of service tree
 */
#define CFQ_IDLE_DELAY		(HZ / 5)

/*
 * below this threshold, we consider thinktime immediate
 */
#define CFQ_MIN_TT		(2)

#define CFQ_SLICE_SCALE		(5)
#define CFQ_HW_QUEUE_MIN	(5)
#define CFQ_SERVICE_SHIFT	12

#define CFQQ_SEEK_THR		(sector_t)(8 * 100)
#define CFQQ_CLOSE_THR		(sector_t)(8 * 1024)
#define CFQQ_SECT_THR_NONROT	(sector_t)(2 * 32)
#define CFQQ_SEEKY(cfqq)	(hweight32(cfqq->seek_history) > 32/8)

#define RQ_CIC(rq)		\
	((struct cfq_io_context *) (rq)->elevator_private)
#define RQ_CFQQ(rq)	(struct cfq_queue *) ((rq)->elevator_private2)
#define RQ_CFQG(rq)	(struct cfq_group *) ((rq)->elevator_private3)

static struct kmem_cache *cfq_pool;
static struct kmem_cache *cfq_ioc_pool;

static DEFINE_PER_CPU(unsigned long, cfq_ioc_count);
static struct completion *ioc_gone;
static DEFINE_SPINLOCK(ioc_gone_lock);

static DEFINE_SPINLOCK(cic_index_lock);
static DEFINE_IDA(cic_index_ida);

#define CFQ_PRIO_LISTS		IOPRIO_BE_NR
#define cfq_class_idle(cfqq)	((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
#define cfq_class_rt(cfqq)	((cfqq)->ioprio_class == IOPRIO_CLASS_RT)

#define sample_valid(samples)	((samples) > 80)
#define rb_entry_cfqg(node)	rb_entry((node), struct cfq_group, rb_node)

/*
 * Most of our rbtree usage is for sorting with min extraction, so
 * if we cache the leftmost node we don't have to walk down the tree
 * to find it. Idea borrowed from Ingo Molnar's CFS scheduler. We should
 * move this into the elevator for the rq sorting as well.
 */
struct cfq_rb_root {
	struct rb_root rb;
	struct rb_node *left;
	unsigned count;
	unsigned total_weight;
	u64 min_vdisktime;
	struct rb_node *active;
};
#define CFQ_RB_ROOT	(struct cfq_rb_root) { .rb = RB_ROOT, .left = NULL, \
			.count = 0, .min_vdisktime = 0, }

/*
 * Per process-grouping structure
 */
struct cfq_queue {
	/* reference count */
	atomic_t ref;
	/* various state flags, see below */
	unsigned int flags;
	/* parent cfq_data */
	struct cfq_data *cfqd;
	/* service_tree member */
	struct rb_node rb_node;
	/* service_tree key */
	unsigned long rb_key;
	/* prio tree member */
	struct rb_node p_node;
	/* prio tree root we belong to, if any */
	struct rb_root *p_root;
	/* sorted list of pending requests */
	struct rb_root sort_list;
	/* if fifo isn't expired, next request to serve */
	struct request *next_rq;
	/* requests queued in sort_list */
	int queued[2];
	/* currently allocated requests */
	int allocated[2];
	/* fifo list of requests in sort_list */
	struct list_head fifo;

	/* time when queue got scheduled in to dispatch first request. */
	unsigned long dispatch_start;
	unsigned int allocated_slice;
	unsigned int slice_dispatch;
	/* time when first request from queue completed and slice started. */
	unsigned long slice_start;
	unsigned long slice_end;
	long slice_resid;

	/* pending metadata requests */
	int meta_pending;
	/* number of requests that are on the dispatch list or inside driver */
	int dispatched;

	/* io prio of this group */
	unsigned short ioprio, org_ioprio;
	unsigned short ioprio_class, org_ioprio_class;

	pid_t pid;

	u32 seek_history;
	sector_t last_request_pos;

	struct cfq_rb_root *service_tree;
	struct cfq_queue *new_cfqq;
	struct cfq_group *cfqg;
	struct cfq_group *orig_cfqg;
	/* Number of sectors dispatched from queue in single dispatch round */
	unsigned long nr_sectors;
};

/*
 * First index in the service_trees.
 * IDLE is handled separately, as it uses its own service tree.
 */
enum wl_prio_t {
	BE_WORKLOAD = 0,
	RT_WORKLOAD = 1,
	IDLE_WORKLOAD = 2,
};

/*
 * Second index in the service_trees.
 */
enum wl_type_t {
	ASYNC_WORKLOAD = 0,
	SYNC_NOIDLE_WORKLOAD = 1,
	SYNC_WORKLOAD = 2
};

/* This is per cgroup per device grouping structure */
struct cfq_group {
	/* group service_tree member */
	struct rb_node rb_node;

	/* group service_tree key */
	u64 vdisktime;
	unsigned int weight;
	bool on_st;

	/* number of cfqq currently on this group */
	int nr_cfqq;

	/* Per group busy queues average. Useful for workload slice calc. */
	unsigned int busy_queues_avg[2];
	/*
	 * rr lists of queues with requests, one rr for each priority class.
	 * Counts are embedded in the cfq_rb_root
	 */
	struct cfq_rb_root service_trees[2][3];
	struct cfq_rb_root service_tree_idle;

	unsigned long saved_workload_slice;
	enum wl_type_t saved_workload;
	enum wl_prio_t saved_serving_prio;
	struct blkio_group blkg;
#ifdef CONFIG_CFQ_GROUP_IOSCHED
	struct hlist_node cfqd_node;
	atomic_t ref;
#endif
	/* number of requests that are on the dispatch list or inside driver */
	int dispatched;
};

/*
 * Per block device queue structure
 */
struct cfq_data {
	struct request_queue *queue;
	/* Root service tree for cfq_groups */
	struct cfq_rb_root grp_service_tree;
	struct cfq_group root_group;

	/*
	 * The priority currently being served
	 */
	enum wl_prio_t serving_prio;
	enum wl_type_t serving_type;
	unsigned long workload_expires;
	struct cfq_group *serving_group;
	bool noidle_tree_requires_idle;

	/*
	 * Each priority tree is sorted by next_request position. These
	 * trees are used when determining if two or more queues are
	 * interleaving requests (see cfq_close_cooperator).
	 */
	struct rb_root prio_trees[CFQ_PRIO_LISTS];

	unsigned int busy_queues;

	int rq_in_driver;
	int rq_in_flight[2];

	/*
	 * queue-depth detection
	 */
	int rq_queued;
	int hw_tag;
	/*
	 * hw_tag can be
	 * -1 => indeterminate, (cfq will behave as if NCQ is present, to allow better detection)
	 *  1 => NCQ is present (hw_tag_est_depth is the estimated max depth)
	 *  0 => no NCQ
	 */
	int hw_tag_est_depth;
	unsigned int hw_tag_samples;

	/*
	 * idle window management
	 */
	struct timer_list idle_slice_timer;
	struct work_struct unplug_work;

	struct cfq_queue *active_queue;
	struct cfq_io_context *active_cic;

	/*
	 * async queue for each priority case
	 */
	struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR];
	struct cfq_queue *async_idle_cfqq;

	sector_t last_position;

	/*
	 * tunables, see top of file
	 */
	unsigned int cfq_quantum;
	unsigned int cfq_fifo_expire[2];
	unsigned int cfq_back_penalty;
	unsigned int cfq_back_max;
	unsigned int cfq_slice[2];
	unsigned int cfq_slice_async_rq;
	unsigned int cfq_slice_idle;
	unsigned int cfq_group_idle;
	unsigned int cfq_latency;
	unsigned int cfq_group_isolation;

	unsigned int cic_index;
	struct list_head cic_list;

	/*
	 * Fallback dummy cfqq for extreme OOM conditions
	 */
	struct cfq_queue oom_cfqq;

	unsigned long last_delayed_sync;

	/* List of cfq groups being managed on this device */
	struct hlist_head cfqg_list;
	struct rcu_head rcu;
};

static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);

static struct cfq_rb_root *service_tree_for(struct cfq_group *cfqg,
					    enum wl_prio_t prio,
					    enum wl_type_t type)
{
	if (!cfqg)
		return NULL;

	if (prio == IDLE_WORKLOAD)
		return &cfqg->service_tree_idle;

	return &cfqg->service_trees[prio][type];
}

enum cfqq_state_flags {
	CFQ_CFQQ_FLAG_on_rr = 0,	/* on round-robin busy list */
	CFQ_CFQQ_FLAG_wait_request,	/* waiting for a request */
	CFQ_CFQQ_FLAG_must_dispatch,	/* must be allowed a dispatch */
	CFQ_CFQQ_FLAG_must_alloc_slice,	/* per-slice must_alloc flag */
	CFQ_CFQQ_FLAG_fifo_expire,	/* FIFO checked in this slice */
	CFQ_CFQQ_FLAG_idle_window,	/* slice idling enabled */
	CFQ_CFQQ_FLAG_prio_changed,	/* task priority has changed */
	CFQ_CFQQ_FLAG_slice_new,	/* no requests dispatched in slice */
	CFQ_CFQQ_FLAG_sync,		/* synchronous queue */
	CFQ_CFQQ_FLAG_coop,		/* cfqq is shared */
	CFQ_CFQQ_FLAG_split_coop,	/* shared cfqq will be split */
	CFQ_CFQQ_FLAG_deep,		/* sync cfqq experienced large depth */
	CFQ_CFQQ_FLAG_wait_busy,	/* Waiting for next request */
};

#define CFQ_CFQQ_FNS(name)						\
static inline void cfq_mark_cfqq_##name(struct cfq_queue *cfqq)		\
{									\
	(cfqq)->flags |= (1 << CFQ_CFQQ_FLAG_##name);			\
}									\
static inline void cfq_clear_cfqq_##name(struct cfq_queue *cfqq)	\
{									\
	(cfqq)->flags &= ~(1 << CFQ_CFQQ_FLAG_##name);			\
}									\
static inline int cfq_cfqq_##name(const struct cfq_queue *cfqq)		\
{									\
	return ((cfqq)->flags & (1 << CFQ_CFQQ_FLAG_##name)) != 0;	\
}

CFQ_CFQQ_FNS(on_rr);
CFQ_CFQQ_FNS(wait_request);
CFQ_CFQQ_FNS(must_dispatch);
CFQ_CFQQ_FNS(must_alloc_slice);
CFQ_CFQQ_FNS(fifo_expire);
CFQ_CFQQ_FNS(idle_window);
CFQ_CFQQ_FNS(prio_changed);
CFQ_CFQQ_FNS(slice_new);
CFQ_CFQQ_FNS(sync);
CFQ_CFQQ_FNS(coop);
CFQ_CFQQ_FNS(split_coop);
CFQ_CFQQ_FNS(deep);
CFQ_CFQQ_FNS(wait_busy);
#undef CFQ_CFQQ_FNS

#ifdef CONFIG_CFQ_GROUP_IOSCHED
#define cfq_log_cfqq(cfqd, cfqq, fmt, args...)	\
	blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \
			cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \
			blkg_path(&(cfqq)->cfqg->blkg), ##args);

#define cfq_log_cfqg(cfqd, cfqg, fmt, args...)	\
	blk_add_trace_msg((cfqd)->queue, "%s " fmt, \
			blkg_path(&(cfqg)->blkg), ##args);	\

#else
#define cfq_log_cfqq(cfqd, cfqq, fmt, args...)	\
	blk_add_trace_msg((cfqd)->queue, "cfq%d " fmt, (cfqq)->pid, ##args)
#define cfq_log_cfqg(cfqd, cfqg, fmt, args...)		do {} while (0);
#endif
#define cfq_log(cfqd, fmt, args...)	\
	blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args)

/* Traverses through cfq group service trees */
#define for_each_cfqg_st(cfqg, i, j, st) \
	for (i = 0; i <= IDLE_WORKLOAD; i++) \
		for (j = 0, st = i < IDLE_WORKLOAD ? &cfqg->service_trees[i][j]\
			: &cfqg->service_tree_idle; \
			(i < IDLE_WORKLOAD && j <= SYNC_WORKLOAD) || \
			(i == IDLE_WORKLOAD && j == 0); \
			j++, st = i < IDLE_WORKLOAD ? \
			&cfqg->service_trees[i][j]: NULL) \


static inline bool iops_mode(struct cfq_data *cfqd)
{
	/*
	 * If we are not idling on queues and it is a NCQ drive, parallel
	 * execution of requests is on and measuring time is not possible
	 * in most of the cases until and unless we drive shallower queue
	 * depths and that becomes a performance bottleneck. In such cases
	 * switch to start providing fairness in terms of number of IOs.
	 */
	if (!cfqd->cfq_slice_idle && cfqd->hw_tag)
		return true;
	else
		return false;
}

static inline enum wl_prio_t cfqq_prio(struct cfq_queue *cfqq)
{
	if (cfq_class_idle(cfqq))
		return IDLE_WORKLOAD;
	if (cfq_class_rt(cfqq))
		return RT_WORKLOAD;
	return BE_WORKLOAD;
}


static enum wl_type_t cfqq_type(struct cfq_queue *cfqq)
{
	if (!cfq_cfqq_sync(cfqq))
		return ASYNC_WORKLOAD;
	if (!cfq_cfqq_idle_window(cfqq))
		return SYNC_NOIDLE_WORKLOAD;
	return SYNC_WORKLOAD;
}

static inline int cfq_group_busy_queues_wl(enum wl_prio_t wl,
					   struct cfq_data *cfqd,
					   struct cfq_group *cfqg)
{
	if (wl == IDLE_WORKLOAD)
		return cfqg->service_tree_idle.count;

	return cfqg->service_trees[wl][ASYNC_WORKLOAD].count
		+ cfqg->service_trees[wl][SYNC_NOIDLE_WORKLOAD].count
		+ cfqg->service_trees[wl][SYNC_WORKLOAD].count;
}

static inline int cfqg_busy_async_queues(struct cfq_data *cfqd,
					 struct cfq_group *cfqg)
{
	return cfqg->service_trees[RT_WORKLOAD][ASYNC_WORKLOAD].count
		+ cfqg->service_trees[BE_WORKLOAD][ASYNC_WORKLOAD].count;
}

static void cfq_dispatch_insert(struct request_queue *, struct request *);
static struct cfq_queue *cfq_get_queue(struct cfq_data *, bool,
				       struct io_context *, gfp_t);
static struct cfq_io_context *cfq_cic_lookup(struct cfq_data *,
					     struct io_context *);

static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_context *cic,
					    bool is_sync)
{
	return cic->cfqq[is_sync];
}

static inline void cic_set_cfqq(struct cfq_io_context *cic,
				struct cfq_queue *cfqq, bool is_sync)
{
	cic->cfqq[is_sync] = cfqq;
}

#define CIC_DEAD_KEY	1ul
#define CIC_DEAD_INDEX_SHIFT	1

static inline void *cfqd_dead_key(struct cfq_data *cfqd)
{
	return (void *)(cfqd->cic_index << CIC_DEAD_INDEX_SHIFT | CIC_DEAD_KEY);
}

static inline struct cfq_data *cic_to_cfqd(struct cfq_io_context *cic)
{
	struct cfq_data *cfqd = cic->key;

	if (unlikely((unsigned long) cfqd & CIC_DEAD_KEY))
		return NULL;

	return cfqd;
}

/*
 * We regard a request as SYNC if it's either a read or has the SYNC bit
 * set (in which case it could also be a direct WRITE).
 */
static inline bool cfq_bio_sync(struct bio *bio)
{
	return bio_data_dir(bio) == READ || (bio->bi_rw & REQ_SYNC);
}

/*
 * scheduler run of queue, if there are requests pending and no one in the
 * driver that will restart queueing
 */
static inline void cfq_schedule_dispatch(struct cfq_data *cfqd)
{
	if (cfqd->busy_queues) {
		cfq_log(cfqd, "schedule dispatch");
		kblockd_schedule_work(cfqd->queue, &cfqd->unplug_work);
	}
}

static int cfq_queue_empty(struct request_queue *q)
{
	struct cfq_data *cfqd = q->elevator->elevator_data;

	return !cfqd->rq_queued;
}

/*
 * Scale schedule slice based on io priority. Use the sync time slice only
 * if a queue is marked sync and has sync io queued. A sync queue with async
 * io only, should not get full sync slice length.
 */
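/*
 * Illustrative numbers (assuming HZ = 1000, so cfq_slice_sync = 100ms and
 * CFQ_SLICE_SCALE = 5): an ioprio 0 sync queue gets 100 + 20 * (4 - 0) =
 * 180ms, ioprio 4 gets the base 100ms, and ioprio 7 gets 100 - 60 = 40ms.
 */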
static inline int cfq_prio_slice(struct cfq_data *cfqd, bool sync,
				 unsigned short prio)
{
	const int base_slice = cfqd->cfq_slice[sync];

	WARN_ON(prio >= IOPRIO_BE_NR);

	return base_slice + (base_slice/CFQ_SLICE_SCALE * (4 - prio));
}

static inline int
cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
	return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio);
}

static inline u64 cfq_scale_slice(unsigned long delta, struct cfq_group *cfqg)
{
	u64 d = delta << CFQ_SERVICE_SHIFT;

	d = d * BLKIO_WEIGHT_DEFAULT;
	do_div(d, cfqg->weight);
	return d;
}

static inline u64 max_vdisktime(u64 min_vdisktime, u64 vdisktime)
{
	s64 delta = (s64)(vdisktime - min_vdisktime);
	if (delta > 0)
		min_vdisktime = vdisktime;

	return min_vdisktime;
}

static inline u64 min_vdisktime(u64 min_vdisktime, u64 vdisktime)
{
	s64 delta = (s64)(vdisktime - min_vdisktime);
	if (delta < 0)
		min_vdisktime = vdisktime;

	return min_vdisktime;
}

static void update_min_vdisktime(struct cfq_rb_root *st)
{
	u64 vdisktime = st->min_vdisktime;
	struct cfq_group *cfqg;

	if (st->active) {
		cfqg = rb_entry_cfqg(st->active);
		vdisktime = cfqg->vdisktime;
	}

	if (st->left) {
		cfqg = rb_entry_cfqg(st->left);
		vdisktime = min_vdisktime(vdisktime, cfqg->vdisktime);
	}

	st->min_vdisktime = max_vdisktime(st->min_vdisktime, vdisktime);
}

/*
 * get averaged number of queues of RT/BE priority.
 * average is updated, with a formula that gives more weight to higher numbers,
 * to quickly follow sudden increases and decrease slowly
 */

static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd,
					struct cfq_group *cfqg, bool rt)
{
	unsigned min_q, max_q;
	unsigned mult = cfq_hist_divisor - 1;
	unsigned round = cfq_hist_divisor / 2;
	unsigned busy = cfq_group_busy_queues_wl(rt, cfqd, cfqg);

	min_q = min(cfqg->busy_queues_avg[rt], busy);
	max_q = max(cfqg->busy_queues_avg[rt], busy);
	cfqg->busy_queues_avg[rt] = (mult * max_q + min_q + round) /
		cfq_hist_divisor;
	return cfqg->busy_queues_avg[rt];
}

static inline unsigned
cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg)
{
	struct cfq_rb_root *st = &cfqd->grp_service_tree;

	return cfq_target_latency * cfqg->weight / st->total_weight;
}

static inline void
cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
	unsigned slice = cfq_prio_to_slice(cfqd, cfqq);
	if (cfqd->cfq_latency) {
		/*
		 * interested queues (we consider only the ones with the same
		 * priority class in the cfq group)
		 */
		unsigned iq = cfq_group_get_avg_queues(cfqd, cfqq->cfqg,
						cfq_class_rt(cfqq));
		unsigned sync_slice = cfqd->cfq_slice[1];
		unsigned expect_latency = sync_slice * iq;
		unsigned group_slice = cfq_group_slice(cfqd, cfqq->cfqg);

		if (expect_latency > group_slice) {
			unsigned base_low_slice = 2 * cfqd->cfq_slice_idle;
			/* scale low_slice according to IO priority
			 * and sync vs async */
			unsigned low_slice =
				min(slice, base_low_slice * slice / sync_slice);
			/* the adapted slice value is scaled to fit all iqs
			 * into the target latency */
			slice = max(slice * group_slice / expect_latency,
				    low_slice);
		}
	}
	cfqq->slice_start = jiffies;
	cfqq->slice_end = jiffies + slice;
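	/*
	 * Remember the slice we handed out; cfq_cfqq_slice_usage() later caps
	 * the time charged to the group at allocated_slice.
	 */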
	cfqq->allocated_slice = slice;
	cfq_log_cfqq(cfqd, cfqq, "set_slice=%lu", cfqq->slice_end - jiffies);
}

/*
 * We need to wrap this check in cfq_cfqq_slice_new(), since ->slice_end
 * isn't valid until the first request from the dispatch is activated
 * and the slice time set.
 */
static inline bool cfq_slice_used(struct cfq_queue *cfqq)
{
	if (cfq_cfqq_slice_new(cfqq))
		return 0;
	if (time_before(jiffies, cfqq->slice_end))
		return 0;

	return 1;
}

/*
 * Lifted from AS - choose which of rq1 and rq2 is best served now.
 * We choose the request that is closest to the head right now. Distance
 * behind the head is penalized and only allowed to a certain extent.
 */
static struct request *
cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2, sector_t last)
{
	sector_t s1, s2, d1 = 0, d2 = 0;
	unsigned long back_max;
#define CFQ_RQ1_WRAP	0x01 /* request 1 wraps */
#define CFQ_RQ2_WRAP	0x02 /* request 2 wraps */
	unsigned wrap = 0; /* bit mask: requests behind the disk head? */

	if (rq1 == NULL || rq1 == rq2)
		return rq2;
	if (rq2 == NULL)
		return rq1;

	if (rq_is_sync(rq1) && !rq_is_sync(rq2))
		return rq1;
	else if (rq_is_sync(rq2) && !rq_is_sync(rq1))
		return rq2;
	if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META))
		return rq1;
	else if ((rq2->cmd_flags & REQ_META) &&
		 !(rq1->cmd_flags & REQ_META))
		return rq2;

	s1 = blk_rq_pos(rq1);
	s2 = blk_rq_pos(rq2);

	/*
	 * by definition, 1KiB is 2 sectors
	 */
	back_max = cfqd->cfq_back_max * 2;

	/*
	 * Strict one way elevator _except_ in the case where we allow
	 * short backward seeks which are biased as twice the cost of a
	 * similar forward seek.
	 */
	if (s1 >= last)
		d1 = s1 - last;
	else if (s1 + back_max >= last)
		d1 = (last - s1) * cfqd->cfq_back_penalty;
	else
		wrap |= CFQ_RQ1_WRAP;

	if (s2 >= last)
		d2 = s2 - last;
	else if (s2 + back_max >= last)
		d2 = (last - s2) * cfqd->cfq_back_penalty;
	else
		wrap |= CFQ_RQ2_WRAP;

	/* Found required data */

	/*
	 * By doing switch() on the bit mask "wrap" we avoid having to
	 * check two variables for all permutations: --> faster!
	 */
	switch (wrap) {
	case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
		if (d1 < d2)
			return rq1;
		else if (d2 < d1)
			return rq2;
		else {
			if (s1 >= s2)
				return rq1;
			else
				return rq2;
		}

	case CFQ_RQ2_WRAP:
		return rq1;
	case CFQ_RQ1_WRAP:
		return rq2;
	case (CFQ_RQ1_WRAP|CFQ_RQ2_WRAP): /* both rqs wrapped */
	default:
		/*
		 * Since both rqs are wrapped,
		 * start with the one that's further behind head
		 * (--> only *one* back seek required),
		 * since back seek takes more time than forward.
		 */
		if (s1 <= s2)
			return rq1;
		else
			return rq2;
	}
}

/*
 * The below is leftmost cache rbtree addon
 */
static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root)
{
	/* Service tree is empty */
	if (!root->count)
		return NULL;

	if (!root->left)
		root->left = rb_first(&root->rb);

	if (root->left)
		return rb_entry(root->left, struct cfq_queue, rb_node);

	return NULL;
}

static struct cfq_group *cfq_rb_first_group(struct cfq_rb_root *root)
{
	if (!root->left)
		root->left = rb_first(&root->rb);

	if (root->left)
		return rb_entry_cfqg(root->left);

	return NULL;
}

static void rb_erase_init(struct rb_node *n, struct rb_root *root)
{
	rb_erase(n, root);
	RB_CLEAR_NODE(n);
}

static void cfq_rb_erase(struct rb_node *n, struct cfq_rb_root *root)
{
	if (root->left == n)
		root->left = NULL;
	rb_erase_init(n, &root->rb);
	--root->count;
}

/*
 * would be nice to take fifo expire time into account as well
 */
static struct request *
cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
		  struct request *last)
{
	struct rb_node *rbnext = rb_next(&last->rb_node);
	struct rb_node *rbprev = rb_prev(&last->rb_node);
	struct request *next = NULL, *prev = NULL;

	BUG_ON(RB_EMPTY_NODE(&last->rb_node));

	if (rbprev)
		prev = rb_entry_rq(rbprev);

	if (rbnext)
		next = rb_entry_rq(rbnext);
	else {
		rbnext = rb_first(&cfqq->sort_list);
		if (rbnext && rbnext != &last->rb_node)
			next = rb_entry_rq(rbnext);
	}

	return cfq_choose_req(cfqd, next, prev, blk_rq_pos(last));
}

static unsigned long cfq_slice_offset(struct cfq_data *cfqd,
				      struct cfq_queue *cfqq)
{
	/*
	 * just an approximation, should be ok.
	 */
	return (cfqq->cfqg->nr_cfqq - 1) * (cfq_prio_slice(cfqd, 1, 0) -
		cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio));
}

static inline s64
cfqg_key(struct cfq_rb_root *st, struct cfq_group *cfqg)
{
	return cfqg->vdisktime - st->min_vdisktime;
}

static void
__cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
{
	struct rb_node **node = &st->rb.rb_node;
	struct rb_node *parent = NULL;
	struct cfq_group *__cfqg;
	s64 key = cfqg_key(st, cfqg);
	int left = 1;

	while (*node != NULL) {
		parent = *node;
		__cfqg = rb_entry_cfqg(parent);

		if (key < cfqg_key(st, __cfqg))
			node = &parent->rb_left;
		else {
			node = &parent->rb_right;
			left = 0;
		}
	}

	if (left)
		st->left = &cfqg->rb_node;

	rb_link_node(&cfqg->rb_node, parent, node);
	rb_insert_color(&cfqg->rb_node, &st->rb);
}

static void
cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
{
	struct cfq_rb_root *st = &cfqd->grp_service_tree;
	struct cfq_group *__cfqg;
	struct rb_node *n;

	cfqg->nr_cfqq++;
	if (cfqg->on_st)
		return;

	/*
	 * Currently put the group at the end. Later implement something
	 * so that groups get lesser vtime based on their weights, so that
	 * a group does not lose everything if it was not continuously backlogged.
	 */
	n = rb_last(&st->rb);
	if (n) {
		__cfqg = rb_entry_cfqg(n);
		cfqg->vdisktime = __cfqg->vdisktime + CFQ_IDLE_DELAY;
	} else
		cfqg->vdisktime = st->min_vdisktime;

	__cfq_group_service_tree_add(st, cfqg);
	cfqg->on_st = true;
	st->total_weight += cfqg->weight;
}

static void
cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
{
	struct cfq_rb_root *st = &cfqd->grp_service_tree;

	if (st->active == &cfqg->rb_node)
		st->active = NULL;

	BUG_ON(cfqg->nr_cfqq < 1);
	cfqg->nr_cfqq--;

	/* If there are other cfq queues under this group, don't delete it */
	if (cfqg->nr_cfqq)
		return;

	cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
	cfqg->on_st = false;
	st->total_weight -= cfqg->weight;
	if (!RB_EMPTY_NODE(&cfqg->rb_node))
		cfq_rb_erase(&cfqg->rb_node, st);
	cfqg->saved_workload_slice = 0;
	cfq_blkiocg_update_dequeue_stats(&cfqg->blkg, 1);
}

static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq)
{
	unsigned int slice_used;

	/*
	 * Queue got expired before even a single request completed or
	 * got expired immediately after first request completion.
	 */
	if (!cfqq->slice_start || cfqq->slice_start == jiffies) {
		/*
		 * Also charge the seek time incurred to the group, otherwise
		 * if there are multiple queues in the group, each can dispatch
		 * a single request on seeky media and cause lots of seek time
		 * and group will never know it.
		 */
		slice_used = max_t(unsigned, (jiffies - cfqq->dispatch_start),
					1);
	} else {
		slice_used = jiffies - cfqq->slice_start;
		if (slice_used > cfqq->allocated_slice)
			slice_used = cfqq->allocated_slice;
	}

	return slice_used;
}

static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
				struct cfq_queue *cfqq)
{
	struct cfq_rb_root *st = &cfqd->grp_service_tree;
	unsigned int used_sl, charge;
	int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg)
			- cfqg->service_tree_idle.count;

	BUG_ON(nr_sync < 0);
	used_sl = charge = cfq_cfqq_slice_usage(cfqq);

	if (iops_mode(cfqd))
		charge = cfqq->slice_dispatch;
	else if (!cfq_cfqq_sync(cfqq) && !nr_sync)
		charge = cfqq->allocated_slice;

	/* Can't update vdisktime while group is on service tree */
	cfq_rb_erase(&cfqg->rb_node, st);
	cfqg->vdisktime += cfq_scale_slice(charge, cfqg);
	__cfq_group_service_tree_add(st, cfqg);

	/* This group is being expired. Save the context */
	if (time_after(cfqd->workload_expires, jiffies)) {
		cfqg->saved_workload_slice = cfqd->workload_expires
						- jiffies;
		cfqg->saved_workload = cfqd->serving_type;
		cfqg->saved_serving_prio = cfqd->serving_prio;
	} else
		cfqg->saved_workload_slice = 0;

	cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime,
					st->min_vdisktime);
	cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u disp=%u charge=%u iops=%u"
			" sect=%u", used_sl, cfqq->slice_dispatch, charge,
			iops_mode(cfqd), cfqq->nr_sectors);
	cfq_blkiocg_update_timeslice_used(&cfqg->blkg, used_sl);
	cfq_blkiocg_set_start_empty_time(&cfqg->blkg);
}

#ifdef CONFIG_CFQ_GROUP_IOSCHED
static inline struct cfq_group *cfqg_of_blkg(struct blkio_group *blkg)
{
	if (blkg)
		return container_of(blkg, struct cfq_group, blkg);
	return NULL;
}

void
cfq_update_blkio_group_weight(struct blkio_group *blkg, unsigned int weight)
{
	cfqg_of_blkg(blkg)->weight = weight;
}

static struct cfq_group *
cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
{
	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
	struct cfq_group *cfqg = NULL;
	void *key = cfqd;
	int i, j;
	struct cfq_rb_root *st;
	struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
	unsigned int major, minor;

	cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
	if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
		sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
		cfqg->blkg.dev = MKDEV(major, minor);
		goto done;
	}
	if (cfqg || !create)
		goto done;

	cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node);
	if (!cfqg)
		goto done;

	for_each_cfqg_st(cfqg, i, j, st)
		*st = CFQ_RB_ROOT;
	RB_CLEAR_NODE(&cfqg->rb_node);

	/*
	 * Take the initial reference that will be released on destroy
	 * This can be thought of as a joint reference by cgroup and
	 * elevator which will be dropped by either elevator exit
	 * or cgroup deletion path depending on who is exiting first.
	 */
	atomic_set(&cfqg->ref, 1);

	/*
	 * Add group onto cgroup list. It might happen that bdi->dev is
	 * not initialized yet. Initialize this new group without major
	 * and minor info and this info will be filled in once a new thread
	 * comes for IO. See code above.
	 */
	if (bdi->dev) {
		sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
		cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
					MKDEV(major, minor));
	} else
		cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
					0);

	cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);

	/* Add group on cfqd list */
	hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);

done:
	return cfqg;
}

/*
 * Search for the cfq group current task belongs to. If create = 1, then also
 * create the cfq group if it does not exist. request_queue lock must be held.
 */
static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create)
{
	struct cgroup *cgroup;
	struct cfq_group *cfqg = NULL;

	rcu_read_lock();
	cgroup = task_cgroup(current, blkio_subsys_id);
	cfqg = cfq_find_alloc_cfqg(cfqd, cgroup, create);
	if (!cfqg && create)
		cfqg = &cfqd->root_group;
	rcu_read_unlock();
	return cfqg;
}

static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg)
{
	atomic_inc(&cfqg->ref);
	return cfqg;
}

static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg)
{
	/* Currently, all async queues are mapped to root group */
	if (!cfq_cfqq_sync(cfqq))
		cfqg = &cfqq->cfqd->root_group;

	cfqq->cfqg = cfqg;
	/* cfqq reference on cfqg */
	atomic_inc(&cfqq->cfqg->ref);
}

static void cfq_put_cfqg(struct cfq_group *cfqg)
{
	struct cfq_rb_root *st;
	int i, j;

	BUG_ON(atomic_read(&cfqg->ref) <= 0);
	if (!atomic_dec_and_test(&cfqg->ref))
		return;
	for_each_cfqg_st(cfqg, i, j, st)
		BUG_ON(!RB_EMPTY_ROOT(&st->rb) || st->active != NULL);
	kfree(cfqg);
}

static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg)
{
	/* Something wrong if we are trying to remove same group twice */
	BUG_ON(hlist_unhashed(&cfqg->cfqd_node));

	hlist_del_init(&cfqg->cfqd_node);

	/*
	 * Put the reference taken at the time of creation so that when all
	 * queues are gone, group can be destroyed.
	 */
	cfq_put_cfqg(cfqg);
}

static void cfq_release_cfq_groups(struct cfq_data *cfqd)
{
	struct hlist_node *pos, *n;
	struct cfq_group *cfqg;

	hlist_for_each_entry_safe(cfqg, pos, n, &cfqd->cfqg_list, cfqd_node) {
		/*
		 * If cgroup removal path got to blk_group first and removed
		 * it from cgroup list, then it will take care of destroying
		 * cfqg also.
		 */
		if (!cfq_blkiocg_del_blkio_group(&cfqg->blkg))
			cfq_destroy_cfqg(cfqd, cfqg);
	}
}

/*
 * Blk cgroup controller notification saying that blkio_group object is being
 * delinked as associated cgroup object is going away. That also means that
 * no new IO will come in this group. So get rid of this group as soon as
 * any pending IO in the group is finished.
 *
 * This function is called under rcu_read_lock(). key is the rcu protected
 * pointer. That means "key" is a valid cfq_data pointer as long as we are
 * holding the rcu read lock.
 *
 * "key" was fetched from blkio_group under blkio_cgroup->lock. That means
 * it should not be NULL as even if elevator was exiting, cgroup deletion
 * path got to it first.
 */
void cfq_unlink_blkio_group(void *key, struct blkio_group *blkg)
{
	unsigned long flags;
	struct cfq_data *cfqd = key;

	spin_lock_irqsave(cfqd->queue->queue_lock, flags);
	cfq_destroy_cfqg(cfqd, cfqg_of_blkg(blkg));
	spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
}

#else /* GROUP_IOSCHED */
static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create)
{
	return &cfqd->root_group;
}

static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg)
{
	return cfqg;
}

static inline void
cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) {
	cfqq->cfqg = cfqg;
}

static void cfq_release_cfq_groups(struct cfq_data *cfqd) {}
static inline void cfq_put_cfqg(struct cfq_group *cfqg) {}

#endif /* GROUP_IOSCHED */

/*
 * The cfqd->service_trees holds all pending cfq_queue's that have
 * requests waiting to be processed. It is sorted in the order that
 * we will service the queues.
 */
static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
				 bool add_front)
{
	struct rb_node **p, *parent;
	struct cfq_queue *__cfqq;
	unsigned long rb_key;
	struct cfq_rb_root *service_tree;
	int left;
	int new_cfqq = 1;
	int group_changed = 0;

#ifdef CONFIG_CFQ_GROUP_IOSCHED
	if (!cfqd->cfq_group_isolation
	    && cfqq_type(cfqq) == SYNC_NOIDLE_WORKLOAD
	    && cfqq->cfqg && cfqq->cfqg != &cfqd->root_group) {
		/* Move this cfq to root group */
		cfq_log_cfqq(cfqd, cfqq, "moving to root group");
		if (!RB_EMPTY_NODE(&cfqq->rb_node))
			cfq_group_service_tree_del(cfqd, cfqq->cfqg);
		cfqq->orig_cfqg = cfqq->cfqg;
		cfqq->cfqg = &cfqd->root_group;
		atomic_inc(&cfqd->root_group.ref);
		group_changed = 1;
	} else if (!cfqd->cfq_group_isolation
		   && cfqq_type(cfqq) == SYNC_WORKLOAD && cfqq->orig_cfqg) {
		/* cfqq is sequential now and needs to go back to its original group */
		BUG_ON(cfqq->cfqg != &cfqd->root_group);
		if (!RB_EMPTY_NODE(&cfqq->rb_node))
			cfq_group_service_tree_del(cfqd, cfqq->cfqg);
		cfq_put_cfqg(cfqq->cfqg);
		cfqq->cfqg = cfqq->orig_cfqg;
		cfqq->orig_cfqg = NULL;
		group_changed = 1;
		cfq_log_cfqq(cfqd, cfqq, "moved to origin group");
	}
#endif

	service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq),
						cfqq_type(cfqq));
	if (cfq_class_idle(cfqq)) {
		rb_key = CFQ_IDLE_DELAY;
		parent = rb_last(&service_tree->rb);
		if (parent && parent != &cfqq->rb_node) {
			__cfqq = rb_entry(parent, struct cfq_queue, rb_node);
			rb_key += __cfqq->rb_key;
		} else
			rb_key += jiffies;
	} else if (!add_front) {
		/*
		 * Get our rb key offset. Subtract any residual slice
		 * value carried from last service. A negative resid
		 * count indicates slice overrun, and this should position
		 * the next service time further away in the tree.
		 */
		rb_key = cfq_slice_offset(cfqd, cfqq) + jiffies;
		rb_key -= cfqq->slice_resid;
		cfqq->slice_resid = 0;
	} else {
		rb_key = -HZ;
		__cfqq = cfq_rb_first(service_tree);
		rb_key += __cfqq ? __cfqq->rb_key : jiffies;
	}

	if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
		new_cfqq = 0;
		/*
		 * same position, nothing more to do
		 */
		if (rb_key == cfqq->rb_key &&
		    cfqq->service_tree == service_tree)
			return;

		cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree);
		cfqq->service_tree = NULL;
	}

	left = 1;
	parent = NULL;
	cfqq->service_tree = service_tree;
	p = &service_tree->rb.rb_node;
	while (*p) {
		struct rb_node **n;

		parent = *p;
		__cfqq = rb_entry(parent, struct cfq_queue, rb_node);

		/*
		 * sort by key, that represents service time.
		 */
		if (time_before(rb_key, __cfqq->rb_key))
			n = &(*p)->rb_left;
		else {
			n = &(*p)->rb_right;
			left = 0;
		}

		p = n;
	}

	if (left)
		service_tree->left = &cfqq->rb_node;

	cfqq->rb_key = rb_key;
	rb_link_node(&cfqq->rb_node, parent, p);
	rb_insert_color(&cfqq->rb_node, &service_tree->rb);
	service_tree->count++;
	if ((add_front || !new_cfqq) && !group_changed)
		return;
	cfq_group_service_tree_add(cfqd, cfqq->cfqg);
}

static struct cfq_queue *
cfq_prio_tree_lookup(struct cfq_data *cfqd, struct rb_root *root,
		     sector_t sector, struct rb_node **ret_parent,
		     struct rb_node ***rb_link)
{
	struct rb_node **p, *parent;
	struct cfq_queue *cfqq = NULL;

	parent = NULL;
	p = &root->rb_node;
	while (*p) {
		struct rb_node **n;

		parent = *p;
		cfqq = rb_entry(parent, struct cfq_queue, p_node);

		/*
		 * Sort strictly based on sector. Smallest to the left,
		 * largest to the right.
		 */
		if (sector > blk_rq_pos(cfqq->next_rq))
			n = &(*p)->rb_right;
		else if (sector < blk_rq_pos(cfqq->next_rq))
			n = &(*p)->rb_left;
		else
			break;
		p = n;
		cfqq = NULL;
	}

	*ret_parent = parent;
	if (rb_link)
		*rb_link = p;
	return cfqq;
}

static void cfq_prio_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
	struct rb_node **p, *parent;
	struct cfq_queue *__cfqq;

	if (cfqq->p_root) {
		rb_erase(&cfqq->p_node, cfqq->p_root);
		cfqq->p_root = NULL;
	}

	if (cfq_class_idle(cfqq))
		return;
	if (!cfqq->next_rq)
		return;

	cfqq->p_root = &cfqd->prio_trees[cfqq->org_ioprio];
	__cfqq = cfq_prio_tree_lookup(cfqd, cfqq->p_root,
				      blk_rq_pos(cfqq->next_rq), &parent, &p);
	if (!__cfqq) {
		rb_link_node(&cfqq->p_node, parent, p);
		rb_insert_color(&cfqq->p_node, cfqq->p_root);
	} else
		cfqq->p_root = NULL;
}

/*
 * Update cfqq's position in the service tree.
 */
static void cfq_resort_rr_list(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
	/*
	 * Resorting requires the cfqq to be on the RR list already.
	 */
	if (cfq_cfqq_on_rr(cfqq)) {
		cfq_service_tree_add(cfqd, cfqq, 0);
		cfq_prio_tree_add(cfqd, cfqq);
	}
}

/*
 * add to busy list of queues for service, trying to be fair in ordering
 * the pending list according to last request service
 */
static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
	cfq_log_cfqq(cfqd, cfqq, "add_to_rr");
	BUG_ON(cfq_cfqq_on_rr(cfqq));
	cfq_mark_cfqq_on_rr(cfqq);
	cfqd->busy_queues++;

	cfq_resort_rr_list(cfqd, cfqq);
}

/*
 * Called when the cfqq no longer has requests pending, remove it from
 * the service tree.
 */
static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
	cfq_log_cfqq(cfqd, cfqq, "del_from_rr");
	BUG_ON(!cfq_cfqq_on_rr(cfqq));
	cfq_clear_cfqq_on_rr(cfqq);

	if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
		cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree);
		cfqq->service_tree = NULL;
	}
	if (cfqq->p_root) {
		rb_erase(&cfqq->p_node, cfqq->p_root);
		cfqq->p_root = NULL;
	}

	cfq_group_service_tree_del(cfqd, cfqq->cfqg);
	BUG_ON(!cfqd->busy_queues);
	cfqd->busy_queues--;
}

/*
 * rb tree support functions
 */
static void cfq_del_rq_rb(struct request *rq)
{
	struct cfq_queue *cfqq = RQ_CFQQ(rq);
	const int sync = rq_is_sync(rq);

	BUG_ON(!cfqq->queued[sync]);
	cfqq->queued[sync]--;

	elv_rb_del(&cfqq->sort_list, rq);

	if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list)) {
		/*
		 * Queue will be deleted from service tree when we actually
		 * expire it later. Right now just remove it from prio tree
		 * as it is empty.
		 */
		if (cfqq->p_root) {
			rb_erase(&cfqq->p_node, cfqq->p_root);
			cfqq->p_root = NULL;
		}
	}
}

static void cfq_add_rq_rb(struct request *rq)
{
	struct cfq_queue *cfqq = RQ_CFQQ(rq);
	struct cfq_data *cfqd = cfqq->cfqd;
	struct request *__alias, *prev;

	cfqq->queued[rq_is_sync(rq)]++;

	/*
	 * looks a little odd, but the first insert might return an alias.
	 * if that happens, put the alias on the dispatch list
	 */
	while ((__alias = elv_rb_add(&cfqq->sort_list, rq)) != NULL)
		cfq_dispatch_insert(cfqd->queue, __alias);

	if (!cfq_cfqq_on_rr(cfqq))
		cfq_add_cfqq_rr(cfqd, cfqq);

	/*
	 * check if this request is a better next-serve candidate
	 */
	prev = cfqq->next_rq;
	cfqq->next_rq = cfq_choose_req(cfqd, cfqq->next_rq, rq, cfqd->last_position);

	/*
	 * adjust priority tree position, if ->next_rq changes
	 */
	if (prev != cfqq->next_rq)
		cfq_prio_tree_add(cfqd, cfqq);

	BUG_ON(!cfqq->next_rq);
}

static void cfq_reposition_rq_rb(struct cfq_queue *cfqq, struct request *rq)
{
	elv_rb_del(&cfqq->sort_list, rq);
	cfqq->queued[rq_is_sync(rq)]--;
	cfq_blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg,
					rq_data_dir(rq), rq_is_sync(rq));
	cfq_add_rq_rb(rq);
	cfq_blkiocg_update_io_add_stats(&(RQ_CFQG(rq))->blkg,
			&cfqq->cfqd->serving_group->blkg, rq_data_dir(rq),
			rq_is_sync(rq));
}

static struct request *
cfq_find_rq_fmerge(struct cfq_data *cfqd, struct bio *bio)
{
	struct task_struct *tsk = current;
	struct cfq_io_context *cic;
	struct cfq_queue *cfqq;

	cic = cfq_cic_lookup(cfqd, tsk->io_context);
	if (!cic)
		return NULL;

	cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio));
	if (cfqq) {
		sector_t sector = bio->bi_sector + bio_sectors(bio);

		return elv_rb_find(&cfqq->sort_list, sector);
	}

	return NULL;
}

static void cfq_activate_request(struct request_queue *q, struct request *rq)
{
	struct cfq_data *cfqd = q->elevator->elevator_data;

	cfqd->rq_in_driver++;
	cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "activate rq, drv=%d",
						cfqd->rq_in_driver);

	cfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
}

static void cfq_deactivate_request(struct request_queue *q, struct request *rq)
{
	struct cfq_data *cfqd = q->elevator->elevator_data;

	WARN_ON(!cfqd->rq_in_driver);
	cfqd->rq_in_driver--;
	cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "deactivate rq, drv=%d",
						cfqd->rq_in_driver);
}

static void cfq_remove_request(struct request *rq)
{
	struct cfq_queue *cfqq = RQ_CFQQ(rq);

	if (cfqq->next_rq == rq)
		cfqq->next_rq = cfq_find_next_rq(cfqq->cfqd, cfqq, rq);

	list_del_init(&rq->queuelist);
	cfq_del_rq_rb(rq);

	cfqq->cfqd->rq_queued--;
	cfq_blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg,
					rq_data_dir(rq), rq_is_sync(rq));
	if (rq->cmd_flags & REQ_META) {
		WARN_ON(!cfqq->meta_pending);
		cfqq->meta_pending--;
	}
}

static int cfq_merge(struct request_queue *q, struct request **req,
		     struct bio *bio)
{
	struct cfq_data *cfqd = q->elevator->elevator_data;
	struct request *__rq;

	__rq = cfq_find_rq_fmerge(cfqd, bio);
	if (__rq && elv_rq_merge_ok(__rq, bio)) {
		*req = __rq;
		return ELEVATOR_FRONT_MERGE;
	}

	return ELEVATOR_NO_MERGE;
}

static void cfq_merged_request(struct request_queue *q, struct request *req,
			       int type)
{
	if (type == ELEVATOR_FRONT_MERGE) {
		struct cfq_queue *cfqq = RQ_CFQQ(req);

		cfq_reposition_rq_rb(cfqq, req);
	}
}

static void cfq_bio_merged(struct request_queue *q, struct request *req,
			   struct bio *bio)
{
	cfq_blkiocg_update_io_merged_stats(&(RQ_CFQG(req))->blkg,
					bio_data_dir(bio), cfq_bio_sync(bio));
}

static void
cfq_merged_requests(struct request_queue *q, struct request *rq,
		    struct request *next)
{
	struct cfq_queue *cfqq = RQ_CFQQ(rq);
	/*
	 * reposition in fifo if next is older than rq
	 */
	if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
	    time_before(rq_fifo_time(next), rq_fifo_time(rq))) {
		list_move(&rq->queuelist, &next->queuelist);
		rq_set_fifo_time(rq, rq_fifo_time(next));
	}

	if (cfqq->next_rq == next)
		cfqq->next_rq = rq;
	cfq_remove_request(next);
	cfq_blkiocg_update_io_merged_stats(&(RQ_CFQG(rq))->blkg,
					rq_data_dir(next), rq_is_sync(next));
}

static int cfq_allow_merge(struct request_queue *q, struct request *rq,
			   struct bio *bio)
{
	struct cfq_data *cfqd = q->elevator->elevator_data;
	struct cfq_io_context *cic;
	struct cfq_queue *cfqq;

	/*
	 * Disallow merge of a sync bio into an async request.
	 */
	if (cfq_bio_sync(bio) && !rq_is_sync(rq))
		return false;

	/*
	 * Lookup the cfqq that this bio will be queued with. Allow
	 * merge only if rq is queued there.
	 */
	cic = cfq_cic_lookup(cfqd, current->io_context);
	if (!cic)
		return false;

	cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio));
	return cfqq == RQ_CFQQ(rq);
}

static inline void cfq_del_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
	del_timer(&cfqd->idle_slice_timer);
	cfq_blkiocg_update_idle_time_stats(&cfqq->cfqg->blkg);
}

static void __cfq_set_active_queue(struct cfq_data *cfqd,
				   struct cfq_queue *cfqq)
{
	if (cfqq) {
		cfq_log_cfqq(cfqd, cfqq, "set_active wl_prio:%d wl_type:%d",
				cfqd->serving_prio, cfqd->serving_type);
		cfq_blkiocg_update_avg_queue_size_stats(&cfqq->cfqg->blkg);
		cfqq->slice_start = 0;
		cfqq->dispatch_start = jiffies;
		cfqq->allocated_slice = 0;
		cfqq->slice_end = 0;
		cfqq->slice_dispatch = 0;
		cfqq->nr_sectors = 0;

		cfq_clear_cfqq_wait_request(cfqq);
		cfq_clear_cfqq_must_dispatch(cfqq);
		cfq_clear_cfqq_must_alloc_slice(cfqq);
		cfq_clear_cfqq_fifo_expire(cfqq);
		cfq_mark_cfqq_slice_new(cfqq);

		cfq_del_timer(cfqd, cfqq);
	}

	cfqd->active_queue = cfqq;
}

/*
 * current cfqq expired its slice (or was too idle), select new one
 */
static void
__cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
		    bool timed_out)
{
	cfq_log_cfqq(cfqd, cfqq, "slice expired t=%d", timed_out);

	if (cfq_cfqq_wait_request(cfqq))
		cfq_del_timer(cfqd, cfqq);

	cfq_clear_cfqq_wait_request(cfqq);
	cfq_clear_cfqq_wait_busy(cfqq);

	/*
	 * If this cfqq is shared between multiple processes, check to
	 * make sure that those processes are still issuing I/Os within
	 * the mean seek distance. If not, it may be time to break the
	 * queues apart again.
	 */
	if (cfq_cfqq_coop(cfqq) && CFQQ_SEEKY(cfqq))
		cfq_mark_cfqq_split_coop(cfqq);

	/*
	 * store what was left of this slice, if the queue idled/timed out
	 */
	if (timed_out && !cfq_cfqq_slice_new(cfqq)) {
		cfqq->slice_resid = cfqq->slice_end - jiffies;
		cfq_log_cfqq(cfqd, cfqq, "resid=%ld", cfqq->slice_resid);
	}

	cfq_group_served(cfqd, cfqq->cfqg, cfqq);

	if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list))
		cfq_del_cfqq_rr(cfqd, cfqq);

	cfq_resort_rr_list(cfqd, cfqq);

	if (cfqq == cfqd->active_queue)
		cfqd->active_queue = NULL;

	if (&cfqq->cfqg->rb_node == cfqd->grp_service_tree.active)
		cfqd->grp_service_tree.active = NULL;

	if (cfqd->active_cic) {
		put_io_context(cfqd->active_cic->ioc);
		cfqd->active_cic = NULL;
	}
}

static inline void cfq_slice_expired(struct cfq_data *cfqd, bool timed_out)
{
	struct cfq_queue *cfqq = cfqd->active_queue;

	if (cfqq)
		__cfq_slice_expired(cfqd, cfqq, timed_out);
}

/*
 * Get next queue for service. Unless we have a queue preemption,
 * we'll simply select the first cfqq in the service tree.
 */
static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
{
	struct cfq_rb_root *service_tree =
		service_tree_for(cfqd->serving_group, cfqd->serving_prio,
					cfqd->serving_type);

	if (!cfqd->rq_queued)
		return NULL;

	/* There is nothing to dispatch */
	if (!service_tree)
		return NULL;
	if (RB_EMPTY_ROOT(&service_tree->rb))
		return NULL;
	return cfq_rb_first(service_tree);
}

static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd)
{
	struct cfq_group *cfqg;
	struct cfq_queue *cfqq;
	int i, j;
	struct cfq_rb_root *st;

	if (!cfqd->rq_queued)
		return NULL;

	cfqg = cfq_get_next_cfqg(cfqd);
	if (!cfqg)
		return NULL;

	for_each_cfqg_st(cfqg, i, j, st)
		if ((cfqq = cfq_rb_first(st)) != NULL)
			return cfqq;
	return NULL;
}

/*
 * Get and set a new active queue for service.
 */
static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd,
					      struct cfq_queue *cfqq)
{
	if (!cfqq)
		cfqq = cfq_get_next_queue(cfqd);

	__cfq_set_active_queue(cfqd, cfqq);
	return cfqq;
}

static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd,
					  struct request *rq)
{
	if (blk_rq_pos(rq) >= cfqd->last_position)
		return blk_rq_pos(rq) - cfqd->last_position;
	else
		return cfqd->last_position - blk_rq_pos(rq);
}

static inline int cfq_rq_close(struct cfq_data *cfqd, struct cfq_queue *cfqq,
			       struct request *rq)
{
	return cfq_dist_from_last(cfqd, rq) <= CFQQ_CLOSE_THR;
}

static struct cfq_queue *cfqq_close(struct cfq_data *cfqd,
				    struct cfq_queue *cur_cfqq)
{
	struct rb_root *root = &cfqd->prio_trees[cur_cfqq->org_ioprio];
	struct rb_node *parent, *node;
	struct cfq_queue *__cfqq;
	sector_t sector = cfqd->last_position;

	if (RB_EMPTY_ROOT(root))
		return NULL;

	/*
	 * First, if we find a request starting at the end of the last
	 * request, choose it.
	 */
	__cfqq = cfq_prio_tree_lookup(cfqd, root, sector, &parent, NULL);
	if (__cfqq)
		return __cfqq;

	/*
	 * If the exact sector wasn't found, the parent of the NULL leaf
	 * will contain the closest sector.
	 */
	__cfqq = rb_entry(parent, struct cfq_queue, p_node);
	if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq))
		return __cfqq;

	if (blk_rq_pos(__cfqq->next_rq) < sector)
		node = rb_next(&__cfqq->p_node);
	else
		node = rb_prev(&__cfqq->p_node);
	if (!node)
		return NULL;

	__cfqq = rb_entry(node, struct cfq_queue, p_node);
	if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq))
		return __cfqq;

	return NULL;
}

/*
 * cfqd - obvious
 * cur_cfqq - passed in so that we don't decide that the current queue is
 *            closely cooperating with itself.
 *
 * So, basically we're assuming that cur_cfqq has dispatched at least
 * one request, and that cfqd->last_position reflects a position on the disk
 * associated with the I/O issued by cur_cfqq. I'm not sure this is a valid
 * assumption.
 */
static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd,
					      struct cfq_queue *cur_cfqq)
{
	struct cfq_queue *cfqq;

	if (cfq_class_idle(cur_cfqq))
		return NULL;
	if (!cfq_cfqq_sync(cur_cfqq))
		return NULL;
	if (CFQQ_SEEKY(cur_cfqq))
		return NULL;

	/*
	 * Don't search priority tree if it's the only queue in the group.
	 */
	if (cur_cfqq->cfqg->nr_cfqq == 1)
		return NULL;

	/*
	 * We should notice if some of the queues are cooperating, eg
	 * working closely on the same area of the disk. In that case,
	 * we can group them together and don't waste time idling.
	 */
	cfqq = cfqq_close(cfqd, cur_cfqq);
	if (!cfqq)
		return NULL;

	/* If new queue belongs to different cfq_group, don't choose it */
	if (cur_cfqq->cfqg != cfqq->cfqg)
		return NULL;

	/*
	 * It only makes sense to merge sync queues.
	 */
	if (!cfq_cfqq_sync(cfqq))
		return NULL;
	if (CFQQ_SEEKY(cfqq))
		return NULL;

	/*
	 * Do not merge queues of different priority classes
	 */
	if (cfq_class_rt(cfqq) != cfq_class_rt(cur_cfqq))
		return NULL;

	return cfqq;
}

/*
 * Determine whether we should enforce idle window for this queue.
 */

static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
	enum wl_prio_t prio = cfqq_prio(cfqq);
	struct cfq_rb_root *service_tree = cfqq->service_tree;

	BUG_ON(!service_tree);
	BUG_ON(!service_tree->count);

	if (!cfqd->cfq_slice_idle)
		return false;

	/* We never do for idle class queues. */
	if (prio == IDLE_WORKLOAD)
		return false;

	/* We do for queues that were marked with idle window flag. */
	if (cfq_cfqq_idle_window(cfqq) &&
	   !(blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag))
		return true;

	/*
	 * Otherwise, we do only if they are the last ones
	 * in their service tree.
	 */
	if (service_tree->count == 1 && cfq_cfqq_sync(cfqq))
		return 1;
	cfq_log_cfqq(cfqd, cfqq, "Not idling. st->count:%d",
			service_tree->count);
	return 0;
}

static void cfq_arm_slice_timer(struct cfq_data *cfqd)
{
	struct cfq_queue *cfqq = cfqd->active_queue;
	struct cfq_io_context *cic;
	unsigned long sl, group_idle = 0;

	/*
	 * SSD device without seek penalty, disable idling. But only do so
	 * for devices that support queuing, otherwise we still have a problem
	 * with sync vs async workloads.
	 */
	if (blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag)
		return;

	WARN_ON(!RB_EMPTY_ROOT(&cfqq->sort_list));
	WARN_ON(cfq_cfqq_slice_new(cfqq));

	/*
	 * idle is disabled, either manually or by past process history
	 */
	if (!cfq_should_idle(cfqd, cfqq)) {
		/* no queue idling. Check for group idling */
		if (cfqd->cfq_group_idle)
			group_idle = cfqd->cfq_group_idle;
		else
			return;
	}

	/*
	 * still active requests from this queue, don't idle
	 */
	if (cfqq->dispatched)
		return;

	/*
	 * task has exited, don't wait
	 */
	cic = cfqd->active_cic;
	if (!cic || !atomic_read(&cic->ioc->nr_tasks))
		return;

	/*
	 * If our average think time is larger than the remaining time
	 * slice, then don't idle. This avoids overrunning the allotted
	 * time slice.
	 */
	if (sample_valid(cic->ttime_samples) &&
	    (cfqq->slice_end - jiffies < cic->ttime_mean)) {
		cfq_log_cfqq(cfqd, cfqq, "Not idling. think_time:%d",
				cic->ttime_mean);
		return;
	}

	/* There are other queues in the group, don't do group idle */
	if (group_idle && cfqq->cfqg->nr_cfqq > 1)
		return;

	cfq_mark_cfqq_wait_request(cfqq);

	if (group_idle)
		sl = cfqd->cfq_group_idle;
	else
		sl = cfqd->cfq_slice_idle;

	mod_timer(&cfqd->idle_slice_timer, jiffies + sl);
	cfq_blkiocg_update_set_idle_time_stats(&cfqq->cfqg->blkg);
	cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu group_idle: %d", sl,
			group_idle ? 1 : 0);
}

/*
 * Move request from internal lists to the request queue dispatch list.
1973 */ 1974static void cfq_dispatch_insert(struct request_queue *q, struct request *rq) 1975{ 1976 struct cfq_data *cfqd = q->elevator->elevator_data; 1977 struct cfq_queue *cfqq = RQ_CFQQ(rq); 1978 1979 cfq_log_cfqq(cfqd, cfqq, "dispatch_insert"); 1980 1981 cfqq->next_rq = cfq_find_next_rq(cfqd, cfqq, rq); 1982 cfq_remove_request(rq); 1983 cfqq->dispatched++; 1984 (RQ_CFQG(rq))->dispatched++; 1985 elv_dispatch_sort(q, rq); 1986 1987 cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++; 1988 cfqq->nr_sectors += blk_rq_sectors(rq); 1989 cfq_blkiocg_update_dispatch_stats(&cfqq->cfqg->blkg, blk_rq_bytes(rq), 1990 rq_data_dir(rq), rq_is_sync(rq)); 1991} 1992 1993/* 1994 * return expired entry, or NULL to just start from scratch in rbtree 1995 */ 1996static struct request *cfq_check_fifo(struct cfq_queue *cfqq) 1997{ 1998 struct request *rq = NULL; 1999 2000 if (cfq_cfqq_fifo_expire(cfqq)) 2001 return NULL; 2002 2003 cfq_mark_cfqq_fifo_expire(cfqq); 2004 2005 if (list_empty(&cfqq->fifo)) 2006 return NULL; 2007 2008 rq = rq_entry_fifo(cfqq->fifo.next); 2009 if (time_before(jiffies, rq_fifo_time(rq))) 2010 rq = NULL; 2011 2012 cfq_log_cfqq(cfqq->cfqd, cfqq, "fifo=%p", rq); 2013 return rq; 2014} 2015 2016static inline int 2017cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq) 2018{ 2019 const int base_rq = cfqd->cfq_slice_async_rq; 2020 2021 WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR); 2022 2023 return 2 * (base_rq + base_rq * (CFQ_PRIO_LISTS - 1 - cfqq->ioprio)); 2024} 2025 2026/* 2027 * Must be called with the queue_lock held. 2028 */ 2029static int cfqq_process_refs(struct cfq_queue *cfqq) 2030{ 2031 int process_refs, io_refs; 2032 2033 io_refs = cfqq->allocated[READ] + cfqq->allocated[WRITE]; 2034 process_refs = atomic_read(&cfqq->ref) - io_refs; 2035 BUG_ON(process_refs < 0); 2036 return process_refs; 2037} 2038 2039static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq) 2040{ 2041 int process_refs, new_process_refs; 2042 struct cfq_queue *__cfqq; 2043 2044 /* 2045 * If there are no process references on the new_cfqq, then it is 2046 * unsafe to follow the ->new_cfqq chain as other cfqq's in the 2047 * chain may have dropped their last reference (not just their 2048 * last process reference). 2049 */ 2050 if (!cfqq_process_refs(new_cfqq)) 2051 return; 2052 2053 /* Avoid a circular list and skip interim queue merges */ 2054 while ((__cfqq = new_cfqq->new_cfqq)) { 2055 if (__cfqq == cfqq) 2056 return; 2057 new_cfqq = __cfqq; 2058 } 2059 2060 process_refs = cfqq_process_refs(cfqq); 2061 new_process_refs = cfqq_process_refs(new_cfqq); 2062 /* 2063 * If the process for the cfqq has gone away, there is no 2064 * sense in merging the queues. 2065 */ 2066 if (process_refs == 0 || new_process_refs == 0) 2067 return; 2068 2069 /* 2070 * Merge in the direction of the lesser amount of work. 
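 *
 * Worked example (hypothetical counts): if cfqq holds 3 process references
 * and new_cfqq holds 1, the else branch below points new_cfqq->new_cfqq at
 * cfqq, i.e. the queue with fewer process references is the one redirected,
 * and the merge target's ref count is bumped by that smaller number via
 * atomic_add().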
2071 */ 2072 if (new_process_refs >= process_refs) { 2073 cfqq->new_cfqq = new_cfqq; 2074 atomic_add(process_refs, &new_cfqq->ref); 2075 } else { 2076 new_cfqq->new_cfqq = cfqq; 2077 atomic_add(new_process_refs, &cfqq->ref); 2078 } 2079} 2080 2081static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd, 2082 struct cfq_group *cfqg, enum wl_prio_t prio) 2083{ 2084 struct cfq_queue *queue; 2085 int i; 2086 bool key_valid = false; 2087 unsigned long lowest_key = 0; 2088 enum wl_type_t cur_best = SYNC_NOIDLE_WORKLOAD; 2089 2090 for (i = 0; i <= SYNC_WORKLOAD; ++i) { 2091 /* select the one with lowest rb_key */ 2092 queue = cfq_rb_first(service_tree_for(cfqg, prio, i)); 2093 if (queue && 2094 (!key_valid || time_before(queue->rb_key, lowest_key))) { 2095 lowest_key = queue->rb_key; 2096 cur_best = i; 2097 key_valid = true; 2098 } 2099 } 2100 2101 return cur_best; 2102} 2103 2104static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg) 2105{ 2106 unsigned slice; 2107 unsigned count; 2108 struct cfq_rb_root *st; 2109 unsigned group_slice; 2110 2111 if (!cfqg) { 2112 cfqd->serving_prio = IDLE_WORKLOAD; 2113 cfqd->workload_expires = jiffies + 1; 2114 return; 2115 } 2116 2117 /* Choose next priority. RT > BE > IDLE */ 2118 if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg)) 2119 cfqd->serving_prio = RT_WORKLOAD; 2120 else if (cfq_group_busy_queues_wl(BE_WORKLOAD, cfqd, cfqg)) 2121 cfqd->serving_prio = BE_WORKLOAD; 2122 else { 2123 cfqd->serving_prio = IDLE_WORKLOAD; 2124 cfqd->workload_expires = jiffies + 1; 2125 return; 2126 } 2127 2128 /* 2129 * For RT and BE, we have to choose also the type 2130 * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload 2131 * expiration time 2132 */ 2133 st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type); 2134 count = st->count; 2135 2136 /* 2137 * check workload expiration, and that we still have other queues ready 2138 */ 2139 if (count && !time_after(jiffies, cfqd->workload_expires)) 2140 return; 2141 2142 /* otherwise select new workload type */ 2143 cfqd->serving_type = 2144 cfq_choose_wl(cfqd, cfqg, cfqd->serving_prio); 2145 st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type); 2146 count = st->count; 2147 2148 /* 2149 * the workload slice is computed as a fraction of target latency 2150 * proportional to the number of queues in that workload, over 2151 * all the queues in the same priority class 2152 */ 2153 group_slice = cfq_group_slice(cfqd, cfqg); 2154 2155 slice = group_slice * count / 2156 max_t(unsigned, cfqg->busy_queues_avg[cfqd->serving_prio], 2157 cfq_group_busy_queues_wl(cfqd->serving_prio, cfqd, cfqg)); 2158 2159 if (cfqd->serving_type == ASYNC_WORKLOAD) { 2160 unsigned int tmp; 2161 2162 /* 2163 * Async queues are currently system wide. Just taking 2164 * proportion of queues with-in same group will lead to higher 2165 * async ratio system wide as generally root group is going 2166 * to have higher weight. A more accurate thing would be to 2167 * calculate system wide asnc/sync ratio. 2168 */ 2169 tmp = cfq_target_latency * cfqg_busy_async_queues(cfqd, cfqg); 2170 tmp = tmp/cfqd->busy_queues; 2171 slice = min_t(unsigned, slice, tmp); 2172 2173 /* async workload slice is scaled down according to 2174 * the sync/async slice ratio. 
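 *
 * E.g. with illustrative numbers: slice = 80 jiffies, cfq_slice[0] (async)
 * = 40 and cfq_slice[1] (sync) = 100 leaves the async workload with
 * 80 * 40 / 100 = 32 jiffies.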
*/ 2175 slice = slice * cfqd->cfq_slice[0] / cfqd->cfq_slice[1]; 2176 } else 2177 /* sync workload slice is at least 2 * cfq_slice_idle */ 2178 slice = max(slice, 2 * cfqd->cfq_slice_idle); 2179 2180 slice = max_t(unsigned, slice, CFQ_MIN_TT); 2181 cfq_log(cfqd, "workload slice:%d", slice); 2182 cfqd->workload_expires = jiffies + slice; 2183 cfqd->noidle_tree_requires_idle = false; 2184} 2185 2186static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd) 2187{ 2188 struct cfq_rb_root *st = &cfqd->grp_service_tree; 2189 struct cfq_group *cfqg; 2190 2191 if (RB_EMPTY_ROOT(&st->rb)) 2192 return NULL; 2193 cfqg = cfq_rb_first_group(st); 2194 st->active = &cfqg->rb_node; 2195 update_min_vdisktime(st); 2196 return cfqg; 2197} 2198 2199static void cfq_choose_cfqg(struct cfq_data *cfqd) 2200{ 2201 struct cfq_group *cfqg = cfq_get_next_cfqg(cfqd); 2202 2203 cfqd->serving_group = cfqg; 2204 2205 /* Restore the workload type data */ 2206 if (cfqg->saved_workload_slice) { 2207 cfqd->workload_expires = jiffies + cfqg->saved_workload_slice; 2208 cfqd->serving_type = cfqg->saved_workload; 2209 cfqd->serving_prio = cfqg->saved_serving_prio; 2210 } else 2211 cfqd->workload_expires = jiffies - 1; 2212 2213 choose_service_tree(cfqd, cfqg); 2214} 2215 2216/* 2217 * Select a queue for service. If we have a current active queue, 2218 * check whether to continue servicing it, or retrieve and set a new one. 2219 */ 2220static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd) 2221{ 2222 struct cfq_queue *cfqq, *new_cfqq = NULL; 2223 2224 cfqq = cfqd->active_queue; 2225 if (!cfqq) 2226 goto new_queue; 2227 2228 if (!cfqd->rq_queued) 2229 return NULL; 2230 2231 /* 2232 * We were waiting for group to get backlogged. Expire the queue 2233 */ 2234 if (cfq_cfqq_wait_busy(cfqq) && !RB_EMPTY_ROOT(&cfqq->sort_list)) 2235 goto expire; 2236 2237 /* 2238 * The active queue has run out of time, expire it and select new. 2239 */ 2240 if (cfq_slice_used(cfqq) && !cfq_cfqq_must_dispatch(cfqq)) { 2241 /* 2242 * If slice had not expired at the completion of last request 2243 * we might not have turned on wait_busy flag. Don't expire 2244 * the queue yet. Allow the group to get backlogged. 2245 * 2246 * The very fact that we have used the slice, that means we 2247 * have been idling all along on this queue and it should be 2248 * ok to wait for this request to complete. 2249 */ 2250 if (cfqq->cfqg->nr_cfqq == 1 && RB_EMPTY_ROOT(&cfqq->sort_list) 2251 && cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) { 2252 cfqq = NULL; 2253 goto keep_queue; 2254 } else 2255 goto check_group_idle; 2256 } 2257 2258 /* 2259 * The active queue has requests and isn't expired, allow it to 2260 * dispatch. 2261 */ 2262 if (!RB_EMPTY_ROOT(&cfqq->sort_list)) 2263 goto keep_queue; 2264 2265 /* 2266 * If another queue has a request waiting within our mean seek 2267 * distance, let it run. The expire code will check for close 2268 * cooperators and put the close queue at the front of the service 2269 * tree. If possible, merge the expiring queue with the new cfqq. 2270 */ 2271 new_cfqq = cfq_close_cooperator(cfqd, cfqq); 2272 if (new_cfqq) { 2273 if (!cfqq->new_cfqq) 2274 cfq_setup_merge(cfqq, new_cfqq); 2275 goto expire; 2276 } 2277 2278 /* 2279 * No requests pending. If the active queue still has requests in 2280 * flight or is idling for a new request, allow either of these 2281 * conditions to happen (or time out) before selecting a new queue. 
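 *
 * In code terms the two cases checked below are: an armed idle_slice_timer
 * (we are already idling and will be kicked when it fires or a new request
 * arrives), and cfqq->dispatched together with cfq_should_idle() (requests
 * are still in flight and the queue is worth waiting for).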
2282 */ 2283 if (timer_pending(&cfqd->idle_slice_timer)) { 2284 cfqq = NULL; 2285 goto keep_queue; 2286 } 2287 2288 if (cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) { 2289 cfqq = NULL; 2290 goto keep_queue; 2291 } 2292 2293 /* 2294 * If group idle is enabled and there are requests dispatched from 2295 * this group, wait for requests to complete. 2296 */ 2297check_group_idle: 2298 if (cfqd->cfq_group_idle && cfqq->cfqg->nr_cfqq == 1 2299 && cfqq->cfqg->dispatched) { 2300 cfqq = NULL; 2301 goto keep_queue; 2302 } 2303 2304expire: 2305 cfq_slice_expired(cfqd, 0); 2306new_queue: 2307 /* 2308 * Current queue expired. Check if we have to switch to a new 2309 * service tree 2310 */ 2311 if (!new_cfqq) 2312 cfq_choose_cfqg(cfqd); 2313 2314 cfqq = cfq_set_active_queue(cfqd, new_cfqq); 2315keep_queue: 2316 return cfqq; 2317} 2318 2319static int __cfq_forced_dispatch_cfqq(struct cfq_queue *cfqq) 2320{ 2321 int dispatched = 0; 2322 2323 while (cfqq->next_rq) { 2324 cfq_dispatch_insert(cfqq->cfqd->queue, cfqq->next_rq); 2325 dispatched++; 2326 } 2327 2328 BUG_ON(!list_empty(&cfqq->fifo)); 2329 2330 /* By default cfqq is not expired if it is empty. Do it explicitly */ 2331 __cfq_slice_expired(cfqq->cfqd, cfqq, 0); 2332 return dispatched; 2333} 2334 2335/* 2336 * Drain our current requests. Used for barriers and when switching 2337 * io schedulers on-the-fly. 2338 */ 2339static int cfq_forced_dispatch(struct cfq_data *cfqd) 2340{ 2341 struct cfq_queue *cfqq; 2342 int dispatched = 0; 2343 2344 /* Expire the timeslice of the current active queue first */ 2345 cfq_slice_expired(cfqd, 0); 2346 while ((cfqq = cfq_get_next_queue_forced(cfqd)) != NULL) { 2347 __cfq_set_active_queue(cfqd, cfqq); 2348 dispatched += __cfq_forced_dispatch_cfqq(cfqq); 2349 } 2350 2351 BUG_ON(cfqd->busy_queues); 2352 2353 cfq_log(cfqd, "forced_dispatch=%d", dispatched); 2354 return dispatched; 2355} 2356 2357static inline bool cfq_slice_used_soon(struct cfq_data *cfqd, 2358 struct cfq_queue *cfqq) 2359{ 2360 /* the queue hasn't finished any request, can't estimate */ 2361 if (cfq_cfqq_slice_new(cfqq)) 2362 return 1; 2363 if (time_after(jiffies + cfqd->cfq_slice_idle * cfqq->dispatched, 2364 cfqq->slice_end)) 2365 return 1; 2366 2367 return 0; 2368} 2369 2370static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq) 2371{ 2372 unsigned int max_dispatch; 2373 2374 /* 2375 * Drain async requests before we start sync IO 2376 */ 2377 if (cfq_should_idle(cfqd, cfqq) && cfqd->rq_in_flight[BLK_RW_ASYNC]) 2378 return false; 2379 2380 /* 2381 * If this is an async queue and we have sync IO in flight, let it wait 2382 */ 2383 if (cfqd->rq_in_flight[BLK_RW_SYNC] && !cfq_cfqq_sync(cfqq)) 2384 return false; 2385 2386 max_dispatch = max_t(unsigned int, cfqd->cfq_quantum / 2, 1); 2387 if (cfq_class_idle(cfqq)) 2388 max_dispatch = 1; 2389 2390 /* 2391 * Does this cfqq already have too much IO in flight? 2392 */ 2393 if (cfqq->dispatched >= max_dispatch) { 2394 /* 2395 * idle queue must always only have a single IO in flight 2396 */ 2397 if (cfq_class_idle(cfqq)) 2398 return false; 2399 2400 /* 2401 * We have other queues, don't allow more IO from this one 2402 */ 2403 if (cfqd->busy_queues > 1 && cfq_slice_used_soon(cfqd, cfqq)) 2404 return false; 2405 2406 /* 2407 * Sole queue user, no limit 2408 */ 2409 if (cfqd->busy_queues == 1) 2410 max_dispatch = -1; 2411 else 2412 /* 2413 * Normally we start throttling cfqq when cfq_quantum/2 2414 * requests have been dispatched. 
But we can drive 2415 * deeper queue depths at the beginning of slice 2416 * subjected to upper limit of cfq_quantum. 2417 * */ 2418 max_dispatch = cfqd->cfq_quantum; 2419 } 2420 2421 /* 2422 * Async queues must wait a bit before being allowed dispatch. 2423 * We also ramp up the dispatch depth gradually for async IO, 2424 * based on the last sync IO we serviced 2425 */ 2426 if (!cfq_cfqq_sync(cfqq) && cfqd->cfq_latency) { 2427 unsigned long last_sync = jiffies - cfqd->last_delayed_sync; 2428 unsigned int depth; 2429 2430 depth = last_sync / cfqd->cfq_slice[1]; 2431 if (!depth && !cfqq->dispatched) 2432 depth = 1; 2433 if (depth < max_dispatch) 2434 max_dispatch = depth; 2435 } 2436 2437 /* 2438 * If we're below the current max, allow a dispatch 2439 */ 2440 return cfqq->dispatched < max_dispatch; 2441} 2442 2443/* 2444 * Dispatch a request from cfqq, moving them to the request queue 2445 * dispatch list. 2446 */ 2447static bool cfq_dispatch_request(struct cfq_data *cfqd, struct cfq_queue *cfqq) 2448{ 2449 struct request *rq; 2450 2451 BUG_ON(RB_EMPTY_ROOT(&cfqq->sort_list)); 2452 2453 if (!cfq_may_dispatch(cfqd, cfqq)) 2454 return false; 2455 2456 /* 2457 * follow expired path, else get first next available 2458 */ 2459 rq = cfq_check_fifo(cfqq); 2460 if (!rq) 2461 rq = cfqq->next_rq; 2462 2463 /* 2464 * insert request into driver dispatch list 2465 */ 2466 cfq_dispatch_insert(cfqd->queue, rq); 2467 2468 if (!cfqd->active_cic) { 2469 struct cfq_io_context *cic = RQ_CIC(rq); 2470 2471 atomic_long_inc(&cic->ioc->refcount); 2472 cfqd->active_cic = cic; 2473 } 2474 2475 return true; 2476} 2477 2478/* 2479 * Find the cfqq that we need to service and move a request from that to the 2480 * dispatch list 2481 */ 2482static int cfq_dispatch_requests(struct request_queue *q, int force) 2483{ 2484 struct cfq_data *cfqd = q->elevator->elevator_data; 2485 struct cfq_queue *cfqq; 2486 2487 if (!cfqd->busy_queues) 2488 return 0; 2489 2490 if (unlikely(force)) 2491 return cfq_forced_dispatch(cfqd); 2492 2493 cfqq = cfq_select_queue(cfqd); 2494 if (!cfqq) 2495 return 0; 2496 2497 /* 2498 * Dispatch a request from this cfqq, if it is allowed 2499 */ 2500 if (!cfq_dispatch_request(cfqd, cfqq)) 2501 return 0; 2502 2503 cfqq->slice_dispatch++; 2504 cfq_clear_cfqq_must_dispatch(cfqq); 2505 2506 /* 2507 * expire an async queue immediately if it has used up its slice. idle 2508 * queue always expire after 1 dispatch round. 2509 */ 2510 if (cfqd->busy_queues > 1 && ((!cfq_cfqq_sync(cfqq) && 2511 cfqq->slice_dispatch >= cfq_prio_to_maxrq(cfqd, cfqq)) || 2512 cfq_class_idle(cfqq))) { 2513 cfqq->slice_end = jiffies + 1; 2514 cfq_slice_expired(cfqd, 0); 2515 } 2516 2517 cfq_log_cfqq(cfqd, cfqq, "dispatched a request"); 2518 return 1; 2519} 2520 2521/* 2522 * task holds one reference to the queue, dropped when task exits. each rq 2523 * in-flight on this queue also holds a reference, dropped when rq is freed. 2524 * 2525 * Each cfq queue took a reference on the parent group. Drop it now. 2526 * queue lock must be held here. 
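 *
 * Note that cfqq_process_refs() above relies on exactly this convention:
 * the total ->ref minus the allocated[] request references is the number
 * of process (task) references still pinning the queue.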
2527 */ 2528static void cfq_put_queue(struct cfq_queue *cfqq) 2529{ 2530 struct cfq_data *cfqd = cfqq->cfqd; 2531 struct cfq_group *cfqg, *orig_cfqg; 2532 2533 BUG_ON(atomic_read(&cfqq->ref) <= 0); 2534 2535 if (!atomic_dec_and_test(&cfqq->ref)) 2536 return; 2537 2538 cfq_log_cfqq(cfqd, cfqq, "put_queue"); 2539 BUG_ON(rb_first(&cfqq->sort_list)); 2540 BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]); 2541 cfqg = cfqq->cfqg; 2542 orig_cfqg = cfqq->orig_cfqg; 2543 2544 if (unlikely(cfqd->active_queue == cfqq)) { 2545 __cfq_slice_expired(cfqd, cfqq, 0); 2546 cfq_schedule_dispatch(cfqd); 2547 } 2548 2549 BUG_ON(cfq_cfqq_on_rr(cfqq)); 2550 kmem_cache_free(cfq_pool, cfqq); 2551 cfq_put_cfqg(cfqg); 2552 if (orig_cfqg) 2553 cfq_put_cfqg(orig_cfqg); 2554} 2555 2556/* 2557 * Must always be called with the rcu_read_lock() held 2558 */ 2559static void 2560__call_for_each_cic(struct io_context *ioc, 2561 void (*func)(struct io_context *, struct cfq_io_context *)) 2562{ 2563 struct cfq_io_context *cic; 2564 struct hlist_node *n; 2565 2566 hlist_for_each_entry_rcu(cic, n, &ioc->cic_list, cic_list) 2567 func(ioc, cic); 2568} 2569 2570/* 2571 * Call func for each cic attached to this ioc. 2572 */ 2573static void 2574call_for_each_cic(struct io_context *ioc, 2575 void (*func)(struct io_context *, struct cfq_io_context *)) 2576{ 2577 rcu_read_lock(); 2578 __call_for_each_cic(ioc, func); 2579 rcu_read_unlock(); 2580} 2581 2582static void cfq_cic_free_rcu(struct rcu_head *head) 2583{ 2584 struct cfq_io_context *cic; 2585 2586 cic = container_of(head, struct cfq_io_context, rcu_head); 2587 2588 kmem_cache_free(cfq_ioc_pool, cic); 2589 elv_ioc_count_dec(cfq_ioc_count); 2590 2591 if (ioc_gone) { 2592 /* 2593 * CFQ scheduler is exiting, grab exit lock and check 2594 * the pending io context count. If it hits zero, 2595 * complete ioc_gone and set it back to NULL 2596 */ 2597 spin_lock(&ioc_gone_lock); 2598 if (ioc_gone && !elv_ioc_count_read(cfq_ioc_count)) { 2599 complete(ioc_gone); 2600 ioc_gone = NULL; 2601 } 2602 spin_unlock(&ioc_gone_lock); 2603 } 2604} 2605 2606static void cfq_cic_free(struct cfq_io_context *cic) 2607{ 2608 call_rcu(&cic->rcu_head, cfq_cic_free_rcu); 2609} 2610 2611static void cic_free_func(struct io_context *ioc, struct cfq_io_context *cic) 2612{ 2613 unsigned long flags; 2614 unsigned long dead_key = (unsigned long) cic->key; 2615 2616 BUG_ON(!(dead_key & CIC_DEAD_KEY)); 2617 2618 spin_lock_irqsave(&ioc->lock, flags); 2619 radix_tree_delete(&ioc->radix_root, dead_key >> CIC_DEAD_INDEX_SHIFT); 2620 hlist_del_rcu(&cic->cic_list); 2621 spin_unlock_irqrestore(&ioc->lock, flags); 2622 2623 cfq_cic_free(cic); 2624} 2625 2626/* 2627 * Must be called with rcu_read_lock() held or preemption otherwise disabled. 2628 * Only two callers of this - ->dtor() which is called with the rcu_read_lock(), 2629 * and ->trim() which is called with the task lock held 2630 */ 2631static void cfq_free_io_context(struct io_context *ioc) 2632{ 2633 /* 2634 * ioc->refcount is zero here, or we are called from elv_unregister(), 2635 * so no more cic's are allowed to be linked into this ioc. So it 2636 * should be ok to iterate over the known list, we will see all cic's 2637 * since no new ones are added. 
2638 */ 2639 __call_for_each_cic(ioc, cic_free_func); 2640} 2641 2642static void cfq_put_cooperator(struct cfq_queue *cfqq) 2643{ 2644 struct cfq_queue *__cfqq, *next; 2645 2646 /* 2647 * If this queue was scheduled to merge with another queue, be 2648 * sure to drop the reference taken on that queue (and others in 2649 * the merge chain). See cfq_setup_merge and cfq_merge_cfqqs. 2650 */ 2651 __cfqq = cfqq->new_cfqq; 2652 while (__cfqq) { 2653 if (__cfqq == cfqq) { 2654 WARN(1, "cfqq->new_cfqq loop detected\n"); 2655 break; 2656 } 2657 next = __cfqq->new_cfqq; 2658 cfq_put_queue(__cfqq); 2659 __cfqq = next; 2660 } 2661} 2662 2663static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq) 2664{ 2665 if (unlikely(cfqq == cfqd->active_queue)) { 2666 __cfq_slice_expired(cfqd, cfqq, 0); 2667 cfq_schedule_dispatch(cfqd); 2668 } 2669 2670 cfq_put_cooperator(cfqq); 2671 2672 cfq_put_queue(cfqq); 2673} 2674 2675static void __cfq_exit_single_io_context(struct cfq_data *cfqd, 2676 struct cfq_io_context *cic) 2677{ 2678 struct io_context *ioc = cic->ioc; 2679 2680 list_del_init(&cic->queue_list); 2681 2682 /* 2683 * Make sure dead mark is seen for dead queues 2684 */ 2685 smp_wmb(); 2686 cic->key = cfqd_dead_key(cfqd); 2687 2688 if (ioc->ioc_data == cic) 2689 rcu_assign_pointer(ioc->ioc_data, NULL); 2690 2691 if (cic->cfqq[BLK_RW_ASYNC]) { 2692 cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_ASYNC]); 2693 cic->cfqq[BLK_RW_ASYNC] = NULL; 2694 } 2695 2696 if (cic->cfqq[BLK_RW_SYNC]) { 2697 cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_SYNC]); 2698 cic->cfqq[BLK_RW_SYNC] = NULL; 2699 } 2700} 2701 2702static void cfq_exit_single_io_context(struct io_context *ioc, 2703 struct cfq_io_context *cic) 2704{ 2705 struct cfq_data *cfqd = cic_to_cfqd(cic); 2706 2707 if (cfqd) { 2708 struct request_queue *q = cfqd->queue; 2709 unsigned long flags; 2710 2711 spin_lock_irqsave(q->queue_lock, flags); 2712 2713 /* 2714 * Ensure we get a fresh copy of the ->key to prevent 2715 * race between exiting task and queue 2716 */ 2717 smp_read_barrier_depends(); 2718 if (cic->key == cfqd) 2719 __cfq_exit_single_io_context(cfqd, cic); 2720 2721 spin_unlock_irqrestore(q->queue_lock, flags); 2722 } 2723} 2724 2725/* 2726 * The process that ioc belongs to has exited, we need to clean up 2727 * and put the internal structures we have that belongs to that process. 
2728 */ 2729static void cfq_exit_io_context(struct io_context *ioc) 2730{ 2731 call_for_each_cic(ioc, cfq_exit_single_io_context); 2732} 2733 2734static struct cfq_io_context * 2735cfq_alloc_io_context(struct cfq_data *cfqd, gfp_t gfp_mask) 2736{ 2737 struct cfq_io_context *cic; 2738 2739 cic = kmem_cache_alloc_node(cfq_ioc_pool, gfp_mask | __GFP_ZERO, 2740 cfqd->queue->node); 2741 if (cic) { 2742 cic->last_end_request = jiffies; 2743 INIT_LIST_HEAD(&cic->queue_list); 2744 INIT_HLIST_NODE(&cic->cic_list); 2745 cic->dtor = cfq_free_io_context; 2746 cic->exit = cfq_exit_io_context; 2747 elv_ioc_count_inc(cfq_ioc_count); 2748 } 2749 2750 return cic; 2751} 2752 2753static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc) 2754{ 2755 struct task_struct *tsk = current; 2756 int ioprio_class; 2757 2758 if (!cfq_cfqq_prio_changed(cfqq)) 2759 return; 2760 2761 ioprio_class = IOPRIO_PRIO_CLASS(ioc->ioprio); 2762 switch (ioprio_class) { 2763 default: 2764 printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class); 2765 case IOPRIO_CLASS_NONE: 2766 /* 2767 * no prio set, inherit CPU scheduling settings 2768 */ 2769 cfqq->ioprio = task_nice_ioprio(tsk); 2770 cfqq->ioprio_class = task_nice_ioclass(tsk); 2771 break; 2772 case IOPRIO_CLASS_RT: 2773 cfqq->ioprio = task_ioprio(ioc); 2774 cfqq->ioprio_class = IOPRIO_CLASS_RT; 2775 break; 2776 case IOPRIO_CLASS_BE: 2777 cfqq->ioprio = task_ioprio(ioc); 2778 cfqq->ioprio_class = IOPRIO_CLASS_BE; 2779 break; 2780 case IOPRIO_CLASS_IDLE: 2781 cfqq->ioprio_class = IOPRIO_CLASS_IDLE; 2782 cfqq->ioprio = 7; 2783 cfq_clear_cfqq_idle_window(cfqq); 2784 break; 2785 } 2786 2787 /* 2788 * keep track of original prio settings in case we have to temporarily 2789 * elevate the priority of this queue 2790 */ 2791 cfqq->org_ioprio = cfqq->ioprio; 2792 cfqq->org_ioprio_class = cfqq->ioprio_class; 2793 cfq_clear_cfqq_prio_changed(cfqq); 2794} 2795 2796static void changed_ioprio(struct io_context *ioc, struct cfq_io_context *cic) 2797{ 2798 struct cfq_data *cfqd = cic_to_cfqd(cic); 2799 struct cfq_queue *cfqq; 2800 unsigned long flags; 2801 2802 if (unlikely(!cfqd)) 2803 return; 2804 2805 spin_lock_irqsave(cfqd->queue->queue_lock, flags); 2806 2807 cfqq = cic->cfqq[BLK_RW_ASYNC]; 2808 if (cfqq) { 2809 struct cfq_queue *new_cfqq; 2810 new_cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic->ioc, 2811 GFP_ATOMIC); 2812 if (new_cfqq) { 2813 cic->cfqq[BLK_RW_ASYNC] = new_cfqq; 2814 cfq_put_queue(cfqq); 2815 } 2816 } 2817 2818 cfqq = cic->cfqq[BLK_RW_SYNC]; 2819 if (cfqq) 2820 cfq_mark_cfqq_prio_changed(cfqq); 2821 2822 spin_unlock_irqrestore(cfqd->queue->queue_lock, flags); 2823} 2824 2825static void cfq_ioc_set_ioprio(struct io_context *ioc) 2826{ 2827 call_for_each_cic(ioc, changed_ioprio); 2828 ioc->ioprio_changed = 0; 2829} 2830 2831static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq, 2832 pid_t pid, bool is_sync) 2833{ 2834 RB_CLEAR_NODE(&cfqq->rb_node); 2835 RB_CLEAR_NODE(&cfqq->p_node); 2836 INIT_LIST_HEAD(&cfqq->fifo); 2837 2838 atomic_set(&cfqq->ref, 0); 2839 cfqq->cfqd = cfqd; 2840 2841 cfq_mark_cfqq_prio_changed(cfqq); 2842 2843 if (is_sync) { 2844 if (!cfq_class_idle(cfqq)) 2845 cfq_mark_cfqq_idle_window(cfqq); 2846 cfq_mark_cfqq_sync(cfqq); 2847 } 2848 cfqq->pid = pid; 2849} 2850 2851#ifdef CONFIG_CFQ_GROUP_IOSCHED 2852static void changed_cgroup(struct io_context *ioc, struct cfq_io_context *cic) 2853{ 2854 struct cfq_queue *sync_cfqq = cic_to_cfqq(cic, 1); 2855 struct cfq_data *cfqd = cic_to_cfqd(cic); 2856 unsigned long flags; 2857 struct 
request_queue *q; 2858 2859 if (unlikely(!cfqd)) 2860 return; 2861 2862 q = cfqd->queue; 2863 2864 spin_lock_irqsave(q->queue_lock, flags); 2865 2866 if (sync_cfqq) { 2867 /* 2868 * Drop reference to sync queue. A new sync queue will be 2869 * assigned in new group upon arrival of a fresh request. 2870 */ 2871 cfq_log_cfqq(cfqd, sync_cfqq, "changed cgroup"); 2872 cic_set_cfqq(cic, NULL, 1); 2873 cfq_put_queue(sync_cfqq); 2874 } 2875 2876 spin_unlock_irqrestore(q->queue_lock, flags); 2877} 2878 2879static void cfq_ioc_set_cgroup(struct io_context *ioc) 2880{ 2881 call_for_each_cic(ioc, changed_cgroup); 2882 ioc->cgroup_changed = 0; 2883} 2884#endif /* CONFIG_CFQ_GROUP_IOSCHED */ 2885 2886static struct cfq_queue * 2887cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync, 2888 struct io_context *ioc, gfp_t gfp_mask) 2889{ 2890 struct cfq_queue *cfqq, *new_cfqq = NULL; 2891 struct cfq_io_context *cic; 2892 struct cfq_group *cfqg; 2893 2894retry: 2895 cfqg = cfq_get_cfqg(cfqd, 1); 2896 cic = cfq_cic_lookup(cfqd, ioc); 2897 /* cic always exists here */ 2898 cfqq = cic_to_cfqq(cic, is_sync); 2899 2900 /* 2901 * Always try a new alloc if we fell back to the OOM cfqq 2902 * originally, since it should just be a temporary situation. 2903 */ 2904 if (!cfqq || cfqq == &cfqd->oom_cfqq) { 2905 cfqq = NULL; 2906 if (new_cfqq) { 2907 cfqq = new_cfqq; 2908 new_cfqq = NULL; 2909 } else if (gfp_mask & __GFP_WAIT) { 2910 spin_unlock_irq(cfqd->queue->queue_lock); 2911 new_cfqq = kmem_cache_alloc_node(cfq_pool, 2912 gfp_mask | __GFP_ZERO, 2913 cfqd->queue->node); 2914 spin_lock_irq(cfqd->queue->queue_lock); 2915 if (new_cfqq) 2916 goto retry; 2917 } else { 2918 cfqq = kmem_cache_alloc_node(cfq_pool, 2919 gfp_mask | __GFP_ZERO, 2920 cfqd->queue->node); 2921 } 2922 2923 if (cfqq) { 2924 cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync); 2925 cfq_init_prio_data(cfqq, ioc); 2926 cfq_link_cfqq_cfqg(cfqq, cfqg); 2927 cfq_log_cfqq(cfqd, cfqq, "alloced"); 2928 } else 2929 cfqq = &cfqd->oom_cfqq; 2930 } 2931 2932 if (new_cfqq) 2933 kmem_cache_free(cfq_pool, new_cfqq); 2934 2935 return cfqq; 2936} 2937 2938static struct cfq_queue ** 2939cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio) 2940{ 2941 switch (ioprio_class) { 2942 case IOPRIO_CLASS_RT: 2943 return &cfqd->async_cfqq[0][ioprio]; 2944 case IOPRIO_CLASS_BE: 2945 return &cfqd->async_cfqq[1][ioprio]; 2946 case IOPRIO_CLASS_IDLE: 2947 return &cfqd->async_idle_cfqq; 2948 default: 2949 BUG(); 2950 } 2951} 2952 2953static struct cfq_queue * 2954cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct io_context *ioc, 2955 gfp_t gfp_mask) 2956{ 2957 const int ioprio = task_ioprio(ioc); 2958 const int ioprio_class = task_ioprio_class(ioc); 2959 struct cfq_queue **async_cfqq = NULL; 2960 struct cfq_queue *cfqq = NULL; 2961 2962 if (!is_sync) { 2963 async_cfqq = cfq_async_queue_prio(cfqd, ioprio_class, ioprio); 2964 cfqq = *async_cfqq; 2965 } 2966 2967 if (!cfqq) 2968 cfqq = cfq_find_alloc_queue(cfqd, is_sync, ioc, gfp_mask); 2969 2970 /* 2971 * pin the queue now that it's allocated, scheduler exit will prune it 2972 */ 2973 if (!is_sync && !(*async_cfqq)) { 2974 atomic_inc(&cfqq->ref); 2975 *async_cfqq = cfqq; 2976 } 2977 2978 atomic_inc(&cfqq->ref); 2979 return cfqq; 2980} 2981 2982/* 2983 * We drop cfq io contexts lazily, so we may find a dead one. 
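 *
 * "Dead" means the cic's key no longer points at a live cfqd: on queue exit
 * the key is overwritten with cfqd_dead_key(), which has CIC_DEAD_KEY set,
 * so a lookup that finds such a cic only has to unlink it from the ioc's
 * radix tree and hlist and free it.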
2984 */ 2985static void 2986cfq_drop_dead_cic(struct cfq_data *cfqd, struct io_context *ioc, 2987 struct cfq_io_context *cic) 2988{ 2989 unsigned long flags; 2990 2991 WARN_ON(!list_empty(&cic->queue_list)); 2992 BUG_ON(cic->key != cfqd_dead_key(cfqd)); 2993 2994 spin_lock_irqsave(&ioc->lock, flags); 2995 2996 BUG_ON(ioc->ioc_data == cic); 2997 2998 radix_tree_delete(&ioc->radix_root, cfqd->cic_index); 2999 hlist_del_rcu(&cic->cic_list); 3000 spin_unlock_irqrestore(&ioc->lock, flags); 3001 3002 cfq_cic_free(cic); 3003} 3004 3005static struct cfq_io_context * 3006cfq_cic_lookup(struct cfq_data *cfqd, struct io_context *ioc) 3007{ 3008 struct cfq_io_context *cic; 3009 unsigned long flags; 3010 3011 if (unlikely(!ioc)) 3012 return NULL; 3013 3014 rcu_read_lock(); 3015 3016 /* 3017 * we maintain a last-hit cache, to avoid browsing over the tree 3018 */ 3019 cic = rcu_dereference(ioc->ioc_data); 3020 if (cic && cic->key == cfqd) { 3021 rcu_read_unlock(); 3022 return cic; 3023 } 3024 3025 do { 3026 cic = radix_tree_lookup(&ioc->radix_root, cfqd->cic_index); 3027 rcu_read_unlock(); 3028 if (!cic) 3029 break; 3030 if (unlikely(cic->key != cfqd)) { 3031 cfq_drop_dead_cic(cfqd, ioc, cic); 3032 rcu_read_lock(); 3033 continue; 3034 } 3035 3036 spin_lock_irqsave(&ioc->lock, flags); 3037 rcu_assign_pointer(ioc->ioc_data, cic); 3038 spin_unlock_irqrestore(&ioc->lock, flags); 3039 break; 3040 } while (1); 3041 3042 return cic; 3043} 3044 3045/* 3046 * Add cic into ioc, using cfqd as the search key. This enables us to lookup 3047 * the process specific cfq io context when entered from the block layer. 3048 * Also adds the cic to a per-cfqd list, used when this queue is removed. 3049 */ 3050static int cfq_cic_link(struct cfq_data *cfqd, struct io_context *ioc, 3051 struct cfq_io_context *cic, gfp_t gfp_mask) 3052{ 3053 unsigned long flags; 3054 int ret; 3055 3056 ret = radix_tree_preload(gfp_mask); 3057 if (!ret) { 3058 cic->ioc = ioc; 3059 cic->key = cfqd; 3060 3061 spin_lock_irqsave(&ioc->lock, flags); 3062 ret = radix_tree_insert(&ioc->radix_root, 3063 cfqd->cic_index, cic); 3064 if (!ret) 3065 hlist_add_head_rcu(&cic->cic_list, &ioc->cic_list); 3066 spin_unlock_irqrestore(&ioc->lock, flags); 3067 3068 radix_tree_preload_end(); 3069 3070 if (!ret) { 3071 spin_lock_irqsave(cfqd->queue->queue_lock, flags); 3072 list_add(&cic->queue_list, &cfqd->cic_list); 3073 spin_unlock_irqrestore(cfqd->queue->queue_lock, flags); 3074 } 3075 } 3076 3077 if (ret) 3078 printk(KERN_ERR "cfq: cic link failed!\n"); 3079 3080 return ret; 3081} 3082 3083/* 3084 * Setup general io context and cfq io context. There can be several cfq 3085 * io contexts per general io context, if this process is doing io to more 3086 * than one device managed by cfq. 
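 *
 * The flow below is lookup-or-create: find an existing cic for this cfqd
 * via cfq_cic_lookup(), otherwise allocate one and link it into the ioc's
 * radix tree and the per-cfqd cic_list via cfq_cic_link(). Pending ioprio
 * (and, with CFQ_GROUP_IOSCHED, cgroup) changes on the ioc are applied
 * before returning.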
3087 */ 3088static struct cfq_io_context * 3089cfq_get_io_context(struct cfq_data *cfqd, gfp_t gfp_mask) 3090{ 3091 struct io_context *ioc = NULL; 3092 struct cfq_io_context *cic; 3093 3094 might_sleep_if(gfp_mask & __GFP_WAIT); 3095 3096 ioc = get_io_context(gfp_mask, cfqd->queue->node); 3097 if (!ioc) 3098 return NULL; 3099 3100 cic = cfq_cic_lookup(cfqd, ioc); 3101 if (cic) 3102 goto out; 3103 3104 cic = cfq_alloc_io_context(cfqd, gfp_mask); 3105 if (cic == NULL) 3106 goto err; 3107 3108 if (cfq_cic_link(cfqd, ioc, cic, gfp_mask)) 3109 goto err_free; 3110 3111out: 3112 smp_read_barrier_depends(); 3113 if (unlikely(ioc->ioprio_changed)) 3114 cfq_ioc_set_ioprio(ioc); 3115 3116#ifdef CONFIG_CFQ_GROUP_IOSCHED 3117 if (unlikely(ioc->cgroup_changed)) 3118 cfq_ioc_set_cgroup(ioc); 3119#endif 3120 return cic; 3121err_free: 3122 cfq_cic_free(cic); 3123err: 3124 put_io_context(ioc); 3125 return NULL; 3126} 3127 3128static void 3129cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_io_context *cic) 3130{ 3131 unsigned long elapsed = jiffies - cic->last_end_request; 3132 unsigned long ttime = min(elapsed, 2UL * cfqd->cfq_slice_idle); 3133 3134 cic->ttime_samples = (7*cic->ttime_samples + 256) / 8; 3135 cic->ttime_total = (7*cic->ttime_total + 256*ttime) / 8; 3136 cic->ttime_mean = (cic->ttime_total + 128) / cic->ttime_samples; 3137} 3138 3139static void 3140cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_queue *cfqq, 3141 struct request *rq) 3142{ 3143 sector_t sdist = 0; 3144 sector_t n_sec = blk_rq_sectors(rq); 3145 if (cfqq->last_request_pos) { 3146 if (cfqq->last_request_pos < blk_rq_pos(rq)) 3147 sdist = blk_rq_pos(rq) - cfqq->last_request_pos; 3148 else 3149 sdist = cfqq->last_request_pos - blk_rq_pos(rq); 3150 } 3151 3152 cfqq->seek_history <<= 1; 3153 if (blk_queue_nonrot(cfqd->queue)) 3154 cfqq->seek_history |= (n_sec < CFQQ_SECT_THR_NONROT); 3155 else 3156 cfqq->seek_history |= (sdist > CFQQ_SEEK_THR); 3157} 3158 3159/* 3160 * Disable idle window if the process thinks too long or seeks so much that 3161 * it doesn't matter 3162 */ 3163static void 3164cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq, 3165 struct cfq_io_context *cic) 3166{ 3167 int old_idle, enable_idle; 3168 3169 /* 3170 * Don't idle for async or idle io prio class 3171 */ 3172 if (!cfq_cfqq_sync(cfqq) || cfq_class_idle(cfqq)) 3173 return; 3174 3175 enable_idle = old_idle = cfq_cfqq_idle_window(cfqq); 3176 3177 if (cfqq->queued[0] + cfqq->queued[1] >= 4) 3178 cfq_mark_cfqq_deep(cfqq); 3179 3180 if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle || 3181 (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq))) 3182 enable_idle = 0; 3183 else if (sample_valid(cic->ttime_samples)) { 3184 if (cic->ttime_mean > cfqd->cfq_slice_idle) 3185 enable_idle = 0; 3186 else 3187 enable_idle = 1; 3188 } 3189 3190 if (old_idle != enable_idle) { 3191 cfq_log_cfqq(cfqd, cfqq, "idle=%d", enable_idle); 3192 if (enable_idle) 3193 cfq_mark_cfqq_idle_window(cfqq); 3194 else 3195 cfq_clear_cfqq_idle_window(cfqq); 3196 } 3197} 3198 3199/* 3200 * Check if new_cfqq should preempt the currently active queue. Return 0 for 3201 * no or if we aren't sure, a 1 will cause a preempt. 
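 *
 * In short: idle-class queues never preempt, RT preempts non-RT, a sync
 * request preempts a non-sync active queue, and preemption never crosses
 * cfq_group boundaries. Beyond that it is a heuristic based on slice
 * usage, the sync-noidle tree, metadata requests and request proximity.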
3202 */ 3203static bool 3204cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, 3205 struct request *rq) 3206{ 3207 struct cfq_queue *cfqq; 3208 3209 cfqq = cfqd->active_queue; 3210 if (!cfqq) 3211 return false; 3212 3213 if (cfq_class_idle(new_cfqq)) 3214 return false; 3215 3216 if (cfq_class_idle(cfqq)) 3217 return true; 3218 3219 /* 3220 * Don't allow a non-RT request to preempt an ongoing RT cfqq timeslice. 3221 */ 3222 if (cfq_class_rt(cfqq) && !cfq_class_rt(new_cfqq)) 3223 return false; 3224 3225 /* 3226 * if the new request is sync, but the currently running queue is 3227 * not, let the sync request have priority. 3228 */ 3229 if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq)) 3230 return true; 3231 3232 if (new_cfqq->cfqg != cfqq->cfqg) 3233 return false; 3234 3235 if (cfq_slice_used(cfqq)) 3236 return true; 3237 3238 /* Allow preemption only if we are idling on sync-noidle tree */ 3239 if (cfqd->serving_type == SYNC_NOIDLE_WORKLOAD && 3240 cfqq_type(new_cfqq) == SYNC_NOIDLE_WORKLOAD && 3241 new_cfqq->service_tree->count == 2 && 3242 RB_EMPTY_ROOT(&cfqq->sort_list)) 3243 return true; 3244 3245 /* 3246 * So both queues are sync. Let the new request get disk time if 3247 * it's a metadata request and the current queue is doing regular IO. 3248 */ 3249 if ((rq->cmd_flags & REQ_META) && !cfqq->meta_pending) 3250 return true; 3251 3252 /* 3253 * Allow an RT request to pre-empt an ongoing non-RT cfqq timeslice. 3254 */ 3255 if (cfq_class_rt(new_cfqq) && !cfq_class_rt(cfqq)) 3256 return true; 3257 3258 if (!cfqd->active_cic || !cfq_cfqq_wait_request(cfqq)) 3259 return false; 3260 3261 /* 3262 * if this request is as-good as one we would expect from the 3263 * current cfqq, let it preempt 3264 */ 3265 if (cfq_rq_close(cfqd, cfqq, rq)) 3266 return true; 3267 3268 return false; 3269} 3270 3271/* 3272 * cfqq preempts the active queue. if we allowed preempt with no slice left, 3273 * let it have half of its nominal slice. 3274 */ 3275static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) 3276{ 3277 cfq_log_cfqq(cfqd, cfqq, "preempt"); 3278 cfq_slice_expired(cfqd, 1); 3279 3280 /* 3281 * Put the new queue at the front of the of the current list, 3282 * so we know that it will be selected next. 3283 */ 3284 BUG_ON(!cfq_cfqq_on_rr(cfqq)); 3285 3286 cfq_service_tree_add(cfqd, cfqq, 1); 3287 3288 cfqq->slice_end = 0; 3289 cfq_mark_cfqq_slice_new(cfqq); 3290} 3291 3292/* 3293 * Called when a new fs request (rq) is added (to cfqq). Check if there's 3294 * something we should do about it 3295 */ 3296static void 3297cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq, 3298 struct request *rq) 3299{ 3300 struct cfq_io_context *cic = RQ_CIC(rq); 3301 3302 cfqd->rq_queued++; 3303 if (rq->cmd_flags & REQ_META) 3304 cfqq->meta_pending++; 3305 3306 cfq_update_io_thinktime(cfqd, cic); 3307 cfq_update_io_seektime(cfqd, cfqq, rq); 3308 cfq_update_idle_window(cfqd, cfqq, cic); 3309 3310 cfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); 3311 3312 if (cfqq == cfqd->active_queue) { 3313 /* 3314 * Remember that we saw a request from this process, but 3315 * don't start queuing just yet. Otherwise we risk seeing lots 3316 * of tiny requests, because we disrupt the normal plugging 3317 * and merging. If the request is already larger than a single 3318 * page, let it rip immediately. For that case we assume that 3319 * merging is already done. 
Ditto for a busy system that 3320 * has other work pending, don't risk delaying until the 3321 * idle timer unplug to continue working. 3322 */ 3323 if (cfq_cfqq_wait_request(cfqq)) { 3324 if (blk_rq_bytes(rq) > PAGE_CACHE_SIZE || 3325 cfqd->busy_queues > 1) { 3326 cfq_del_timer(cfqd, cfqq); 3327 cfq_clear_cfqq_wait_request(cfqq); 3328 __blk_run_queue(cfqd->queue); 3329 } else { 3330 cfq_blkiocg_update_idle_time_stats( 3331 &cfqq->cfqg->blkg); 3332 cfq_mark_cfqq_must_dispatch(cfqq); 3333 } 3334 } 3335 } else if (cfq_should_preempt(cfqd, cfqq, rq)) { 3336 /* 3337 * not the active queue - expire current slice if it is 3338 * idle and has expired it's mean thinktime or this new queue 3339 * has some old slice time left and is of higher priority or 3340 * this new queue is RT and the current one is BE 3341 */ 3342 cfq_preempt_queue(cfqd, cfqq); 3343 __blk_run_queue(cfqd->queue); 3344 } 3345} 3346 3347static void cfq_insert_request(struct request_queue *q, struct request *rq) 3348{ 3349 struct cfq_data *cfqd = q->elevator->elevator_data; 3350 struct cfq_queue *cfqq = RQ_CFQQ(rq); 3351 3352 cfq_log_cfqq(cfqd, cfqq, "insert_request"); 3353 cfq_init_prio_data(cfqq, RQ_CIC(rq)->ioc); 3354 3355 rq_set_fifo_time(rq, jiffies + cfqd->cfq_fifo_expire[rq_is_sync(rq)]); 3356 list_add_tail(&rq->queuelist, &cfqq->fifo); 3357 cfq_add_rq_rb(rq); 3358 cfq_blkiocg_update_io_add_stats(&(RQ_CFQG(rq))->blkg, 3359 &cfqd->serving_group->blkg, rq_data_dir(rq), 3360 rq_is_sync(rq)); 3361 cfq_rq_enqueued(cfqd, cfqq, rq); 3362} 3363 3364/* 3365 * Update hw_tag based on peak queue depth over 50 samples under 3366 * sufficient load. 3367 */ 3368static void cfq_update_hw_tag(struct cfq_data *cfqd) 3369{ 3370 struct cfq_queue *cfqq = cfqd->active_queue; 3371 3372 if (cfqd->rq_in_driver > cfqd->hw_tag_est_depth) 3373 cfqd->hw_tag_est_depth = cfqd->rq_in_driver; 3374 3375 if (cfqd->hw_tag == 1) 3376 return; 3377 3378 if (cfqd->rq_queued <= CFQ_HW_QUEUE_MIN && 3379 cfqd->rq_in_driver <= CFQ_HW_QUEUE_MIN) 3380 return; 3381 3382 /* 3383 * If active queue hasn't enough requests and can idle, cfq might not 3384 * dispatch sufficient requests to hardware. Don't zero hw_tag in this 3385 * case 3386 */ 3387 if (cfqq && cfq_cfqq_idle_window(cfqq) && 3388 cfqq->dispatched + cfqq->queued[0] + cfqq->queued[1] < 3389 CFQ_HW_QUEUE_MIN && cfqd->rq_in_driver < CFQ_HW_QUEUE_MIN) 3390 return; 3391 3392 if (cfqd->hw_tag_samples++ < 50) 3393 return; 3394 3395 if (cfqd->hw_tag_est_depth >= CFQ_HW_QUEUE_MIN) 3396 cfqd->hw_tag = 1; 3397 else 3398 cfqd->hw_tag = 0; 3399} 3400 3401static bool cfq_should_wait_busy(struct cfq_data *cfqd, struct cfq_queue *cfqq) 3402{ 3403 struct cfq_io_context *cic = cfqd->active_cic; 3404 3405 /* If the queue already has requests, don't wait */ 3406 if (!RB_EMPTY_ROOT(&cfqq->sort_list)) 3407 return false; 3408 3409 /* If there are other queues in the group, don't wait */ 3410 if (cfqq->cfqg->nr_cfqq > 1) 3411 return false; 3412 3413 if (cfq_slice_used(cfqq)) 3414 return true; 3415 3416 /* if slice left is less than think time, wait busy */ 3417 if (cic && sample_valid(cic->ttime_samples) 3418 && (cfqq->slice_end - jiffies < cic->ttime_mean)) 3419 return true; 3420 3421 /* 3422 * If think times is less than a jiffy than ttime_mean=0 and above 3423 * will not be true. It might happen that slice has not expired yet 3424 * but will expire soon (4-5 ns) during select_queue(). To cover the 3425 * case where think time is less than a jiffy, mark the queue wait 3426 * busy if only 1 jiffy is left in the slice. 
3427 */ 3428 if (cfqq->slice_end - jiffies == 1) 3429 return true; 3430 3431 return false; 3432} 3433 3434static void cfq_completed_request(struct request_queue *q, struct request *rq) 3435{ 3436 struct cfq_queue *cfqq = RQ_CFQQ(rq); 3437 struct cfq_data *cfqd = cfqq->cfqd; 3438 const int sync = rq_is_sync(rq); 3439 unsigned long now; 3440 3441 now = jiffies; 3442 cfq_log_cfqq(cfqd, cfqq, "complete rqnoidle %d", 3443 !!(rq->cmd_flags & REQ_NOIDLE)); 3444 3445 cfq_update_hw_tag(cfqd); 3446 3447 WARN_ON(!cfqd->rq_in_driver); 3448 WARN_ON(!cfqq->dispatched); 3449 cfqd->rq_in_driver--; 3450 cfqq->dispatched--; 3451 (RQ_CFQG(rq))->dispatched--; 3452 cfq_blkiocg_update_completion_stats(&cfqq->cfqg->blkg, 3453 rq_start_time_ns(rq), rq_io_start_time_ns(rq), 3454 rq_data_dir(rq), rq_is_sync(rq)); 3455 3456 cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--; 3457 3458 if (sync) { 3459 RQ_CIC(rq)->last_end_request = now; 3460 if (!time_after(rq->start_time + cfqd->cfq_fifo_expire[1], now)) 3461 cfqd->last_delayed_sync = now; 3462 } 3463 3464 /* 3465 * If this is the active queue, check if it needs to be expired, 3466 * or if we want to idle in case it has no pending requests. 3467 */ 3468 if (cfqd->active_queue == cfqq) { 3469 const bool cfqq_empty = RB_EMPTY_ROOT(&cfqq->sort_list); 3470 3471 if (cfq_cfqq_slice_new(cfqq)) { 3472 cfq_set_prio_slice(cfqd, cfqq); 3473 cfq_clear_cfqq_slice_new(cfqq); 3474 } 3475 3476 /* 3477 * Should we wait for next request to come in before we expire 3478 * the queue. 3479 */ 3480 if (cfq_should_wait_busy(cfqd, cfqq)) { 3481 unsigned long extend_sl = cfqd->cfq_slice_idle; 3482 if (!cfqd->cfq_slice_idle) 3483 extend_sl = cfqd->cfq_group_idle; 3484 cfqq->slice_end = jiffies + extend_sl; 3485 cfq_mark_cfqq_wait_busy(cfqq); 3486 cfq_log_cfqq(cfqd, cfqq, "will busy wait"); 3487 } 3488 3489 /* 3490 * Idling is not enabled on: 3491 * - expired queues 3492 * - idle-priority queues 3493 * - async queues 3494 * - queues with still some requests queued 3495 * - when there is a close cooperator 3496 */ 3497 if (cfq_slice_used(cfqq) || cfq_class_idle(cfqq)) 3498 cfq_slice_expired(cfqd, 1); 3499 else if (sync && cfqq_empty && 3500 !cfq_close_cooperator(cfqd, cfqq)) { 3501 cfqd->noidle_tree_requires_idle |= 3502 !(rq->cmd_flags & REQ_NOIDLE); 3503 /* 3504 * Idling is enabled for SYNC_WORKLOAD. 3505 * SYNC_NOIDLE_WORKLOAD idles at the end of the tree 3506 * only if we processed at least one !REQ_NOIDLE request 3507 */ 3508 if (cfqd->serving_type == SYNC_WORKLOAD 3509 || cfqd->noidle_tree_requires_idle 3510 || cfqq->cfqg->nr_cfqq == 1) 3511 cfq_arm_slice_timer(cfqd); 3512 } 3513 } 3514 3515 if (!cfqd->rq_in_driver) 3516 cfq_schedule_dispatch(cfqd); 3517} 3518 3519/* 3520 * we temporarily boost lower priority queues if they are holding fs exclusive 3521 * resources. 
they are boosted to normal prio (CLASS_BE/4) 3522 */ 3523static void cfq_prio_boost(struct cfq_queue *cfqq) 3524{ 3525 if (has_fs_excl()) { 3526 /* 3527 * boost idle prio on transactions that would lock out other 3528 * users of the filesystem 3529 */ 3530 if (cfq_class_idle(cfqq)) 3531 cfqq->ioprio_class = IOPRIO_CLASS_BE; 3532 if (cfqq->ioprio > IOPRIO_NORM) 3533 cfqq->ioprio = IOPRIO_NORM; 3534 } else { 3535 /* 3536 * unboost the queue (if needed) 3537 */ 3538 cfqq->ioprio_class = cfqq->org_ioprio_class; 3539 cfqq->ioprio = cfqq->org_ioprio; 3540 } 3541} 3542 3543static inline int __cfq_may_queue(struct cfq_queue *cfqq) 3544{ 3545 if (cfq_cfqq_wait_request(cfqq) && !cfq_cfqq_must_alloc_slice(cfqq)) { 3546 cfq_mark_cfqq_must_alloc_slice(cfqq); 3547 return ELV_MQUEUE_MUST; 3548 } 3549 3550 return ELV_MQUEUE_MAY; 3551} 3552 3553static int cfq_may_queue(struct request_queue *q, int rw) 3554{ 3555 struct cfq_data *cfqd = q->elevator->elevator_data; 3556 struct task_struct *tsk = current; 3557 struct cfq_io_context *cic; 3558 struct cfq_queue *cfqq; 3559 3560 /* 3561 * don't force setup of a queue from here, as a call to may_queue 3562 * does not necessarily imply that a request actually will be queued. 3563 * so just lookup a possibly existing queue, or return 'may queue' 3564 * if that fails 3565 */ 3566 cic = cfq_cic_lookup(cfqd, tsk->io_context); 3567 if (!cic) 3568 return ELV_MQUEUE_MAY; 3569 3570 cfqq = cic_to_cfqq(cic, rw_is_sync(rw)); 3571 if (cfqq) { 3572 cfq_init_prio_data(cfqq, cic->ioc); 3573 cfq_prio_boost(cfqq); 3574 3575 return __cfq_may_queue(cfqq); 3576 } 3577 3578 return ELV_MQUEUE_MAY; 3579} 3580 3581/* 3582 * queue lock held here 3583 */ 3584static void cfq_put_request(struct request *rq) 3585{ 3586 struct cfq_queue *cfqq = RQ_CFQQ(rq); 3587 3588 if (cfqq) { 3589 const int rw = rq_data_dir(rq); 3590 3591 BUG_ON(!cfqq->allocated[rw]); 3592 cfqq->allocated[rw]--; 3593 3594 put_io_context(RQ_CIC(rq)->ioc); 3595 3596 rq->elevator_private = NULL; 3597 rq->elevator_private2 = NULL; 3598 3599 /* Put down rq reference on cfqg */ 3600 cfq_put_cfqg(RQ_CFQG(rq)); 3601 rq->elevator_private3 = NULL; 3602 3603 cfq_put_queue(cfqq); 3604 } 3605} 3606 3607static struct cfq_queue * 3608cfq_merge_cfqqs(struct cfq_data *cfqd, struct cfq_io_context *cic, 3609 struct cfq_queue *cfqq) 3610{ 3611 cfq_log_cfqq(cfqd, cfqq, "merging with queue %p", cfqq->new_cfqq); 3612 cic_set_cfqq(cic, cfqq->new_cfqq, 1); 3613 cfq_mark_cfqq_coop(cfqq->new_cfqq); 3614 cfq_put_queue(cfqq); 3615 return cic_to_cfqq(cic, 1); 3616} 3617 3618/* 3619 * Returns NULL if a new cfqq should be allocated, or the old cfqq if this 3620 * was the last process referring to said cfqq. 3621 */ 3622static struct cfq_queue * 3623split_cfqq(struct cfq_io_context *cic, struct cfq_queue *cfqq) 3624{ 3625 if (cfqq_process_refs(cfqq) == 1) { 3626 cfqq->pid = current->pid; 3627 cfq_clear_cfqq_coop(cfqq); 3628 cfq_clear_cfqq_split_coop(cfqq); 3629 return cfqq; 3630 } 3631 3632 cic_set_cfqq(cic, NULL, 1); 3633 3634 cfq_put_cooperator(cfqq); 3635 3636 cfq_put_queue(cfqq); 3637 return NULL; 3638} 3639/* 3640 * Allocate cfq data structures associated with this request. 
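 *
 * On success the request carries its context in the elevator_private
 * fields: the cfq_io_context, the cfq_queue and a reference on the
 * cfq_group; cfq_put_request() above is the matching teardown.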
3641 */ 3642static int 3643cfq_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask) 3644{ 3645 struct cfq_data *cfqd = q->elevator->elevator_data; 3646 struct cfq_io_context *cic; 3647 const int rw = rq_data_dir(rq); 3648 const bool is_sync = rq_is_sync(rq); 3649 struct cfq_queue *cfqq; 3650 unsigned long flags; 3651 3652 might_sleep_if(gfp_mask & __GFP_WAIT); 3653 3654 cic = cfq_get_io_context(cfqd, gfp_mask); 3655 3656 spin_lock_irqsave(q->queue_lock, flags); 3657 3658 if (!cic) 3659 goto queue_fail; 3660 3661new_queue: 3662 cfqq = cic_to_cfqq(cic, is_sync); 3663 if (!cfqq || cfqq == &cfqd->oom_cfqq) { 3664 cfqq = cfq_get_queue(cfqd, is_sync, cic->ioc, gfp_mask); 3665 cic_set_cfqq(cic, cfqq, is_sync); 3666 } else { 3667 /* 3668 * If the queue was seeky for too long, break it apart. 3669 */ 3670 if (cfq_cfqq_coop(cfqq) && cfq_cfqq_split_coop(cfqq)) { 3671 cfq_log_cfqq(cfqd, cfqq, "breaking apart cfqq"); 3672 cfqq = split_cfqq(cic, cfqq); 3673 if (!cfqq) 3674 goto new_queue; 3675 } 3676 3677 /* 3678 * Check to see if this queue is scheduled to merge with 3679 * another, closely cooperating queue. The merging of 3680 * queues happens here as it must be done in process context. 3681 * The reference on new_cfqq was taken in merge_cfqqs. 3682 */ 3683 if (cfqq->new_cfqq) 3684 cfqq = cfq_merge_cfqqs(cfqd, cic, cfqq); 3685 } 3686 3687 cfqq->allocated[rw]++; 3688 atomic_inc(&cfqq->ref); 3689 3690 spin_unlock_irqrestore(q->queue_lock, flags); 3691 3692 rq->elevator_private = cic; 3693 rq->elevator_private2 = cfqq; 3694 rq->elevator_private3 = cfq_ref_get_cfqg(cfqq->cfqg); 3695 return 0; 3696 3697queue_fail: 3698 if (cic) 3699 put_io_context(cic->ioc); 3700 3701 cfq_schedule_dispatch(cfqd); 3702 spin_unlock_irqrestore(q->queue_lock, flags); 3703 cfq_log(cfqd, "set_request fail"); 3704 return 1; 3705} 3706 3707static void cfq_kick_queue(struct work_struct *work) 3708{ 3709 struct cfq_data *cfqd = 3710 container_of(work, struct cfq_data, unplug_work); 3711 struct request_queue *q = cfqd->queue; 3712 3713 spin_lock_irq(q->queue_lock); 3714 __blk_run_queue(cfqd->queue); 3715 spin_unlock_irq(q->queue_lock); 3716} 3717 3718/* 3719 * Timer running if the active_queue is currently idling inside its time slice 3720 */ 3721static void cfq_idle_slice_timer(unsigned long data) 3722{ 3723 struct cfq_data *cfqd = (struct cfq_data *) data; 3724 struct cfq_queue *cfqq; 3725 unsigned long flags; 3726 int timed_out = 1; 3727 3728 cfq_log(cfqd, "idle timer fired"); 3729 3730 spin_lock_irqsave(cfqd->queue->queue_lock, flags); 3731 3732 cfqq = cfqd->active_queue; 3733 if (cfqq) { 3734 timed_out = 0; 3735 3736 /* 3737 * We saw a request before the queue expired, let it through 3738 */ 3739 if (cfq_cfqq_must_dispatch(cfqq)) 3740 goto out_kick; 3741 3742 /* 3743 * expired 3744 */ 3745 if (cfq_slice_used(cfqq)) 3746 goto expire; 3747 3748 /* 3749 * only expire and reinvoke request handler, if there are 3750 * other queues with pending requests 3751 */ 3752 if (!cfqd->busy_queues) 3753 goto out_cont; 3754 3755 /* 3756 * not expired and it has a request pending, let it dispatch 3757 */ 3758 if (!RB_EMPTY_ROOT(&cfqq->sort_list)) 3759 goto out_kick; 3760 3761 /* 3762 * Queue depth flag is reset only when the idle didn't succeed 3763 */ 3764 cfq_clear_cfqq_deep(cfqq); 3765 } 3766expire: 3767 cfq_slice_expired(cfqd, timed_out); 3768out_kick: 3769 cfq_schedule_dispatch(cfqd); 3770out_cont: 3771 spin_unlock_irqrestore(cfqd->queue->queue_lock, flags); 3772} 3773 3774static void cfq_shutdown_timer_wq(struct 
cfq_data *cfqd) 3775{ 3776 del_timer_sync(&cfqd->idle_slice_timer); 3777 cancel_work_sync(&cfqd->unplug_work); 3778} 3779 3780static void cfq_put_async_queues(struct cfq_data *cfqd) 3781{ 3782 int i; 3783 3784 for (i = 0; i < IOPRIO_BE_NR; i++) { 3785 if (cfqd->async_cfqq[0][i]) 3786 cfq_put_queue(cfqd->async_cfqq[0][i]); 3787 if (cfqd->async_cfqq[1][i]) 3788 cfq_put_queue(cfqd->async_cfqq[1][i]); 3789 } 3790 3791 if (cfqd->async_idle_cfqq) 3792 cfq_put_queue(cfqd->async_idle_cfqq); 3793} 3794 3795static void cfq_cfqd_free(struct rcu_head *head) 3796{ 3797 kfree(container_of(head, struct cfq_data, rcu)); 3798} 3799 3800static void cfq_exit_queue(struct elevator_queue *e) 3801{ 3802 struct cfq_data *cfqd = e->elevator_data; 3803 struct request_queue *q = cfqd->queue; 3804 3805 cfq_shutdown_timer_wq(cfqd); 3806 3807 spin_lock_irq(q->queue_lock); 3808 3809 if (cfqd->active_queue) 3810 __cfq_slice_expired(cfqd, cfqd->active_queue, 0); 3811 3812 while (!list_empty(&cfqd->cic_list)) { 3813 struct cfq_io_context *cic = list_entry(cfqd->cic_list.next, 3814 struct cfq_io_context, 3815 queue_list); 3816 3817 __cfq_exit_single_io_context(cfqd, cic); 3818 } 3819 3820 cfq_put_async_queues(cfqd); 3821 cfq_release_cfq_groups(cfqd); 3822 cfq_blkiocg_del_blkio_group(&cfqd->root_group.blkg); 3823 3824 spin_unlock_irq(q->queue_lock); 3825 3826 cfq_shutdown_timer_wq(cfqd); 3827 3828 spin_lock(&cic_index_lock); 3829 ida_remove(&cic_index_ida, cfqd->cic_index); 3830 spin_unlock(&cic_index_lock); 3831 3832 /* Wait for cfqg->blkg->key accessors to exit their grace periods. */ 3833 call_rcu(&cfqd->rcu, cfq_cfqd_free); 3834} 3835 3836static int cfq_alloc_cic_index(void) 3837{ 3838 int index, error; 3839 3840 do { 3841 if (!ida_pre_get(&cic_index_ida, GFP_KERNEL)) 3842 return -ENOMEM; 3843 3844 spin_lock(&cic_index_lock); 3845 error = ida_get_new(&cic_index_ida, &index); 3846 spin_unlock(&cic_index_lock); 3847 if (error && error != -EAGAIN) 3848 return error; 3849 } while (error); 3850 3851 return index; 3852} 3853 3854static void *cfq_init_queue(struct request_queue *q) 3855{ 3856 struct cfq_data *cfqd; 3857 int i, j; 3858 struct cfq_group *cfqg; 3859 struct cfq_rb_root *st; 3860 3861 i = cfq_alloc_cic_index(); 3862 if (i < 0) 3863 return NULL; 3864 3865 cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node); 3866 if (!cfqd) 3867 return NULL; 3868 3869 cfqd->cic_index = i; 3870 3871 /* Init root service tree */ 3872 cfqd->grp_service_tree = CFQ_RB_ROOT; 3873 3874 /* Init root group */ 3875 cfqg = &cfqd->root_group; 3876 for_each_cfqg_st(cfqg, i, j, st) 3877 *st = CFQ_RB_ROOT; 3878 RB_CLEAR_NODE(&cfqg->rb_node); 3879 3880 /* Give preference to root group over other groups */ 3881 cfqg->weight = 2*BLKIO_WEIGHT_DEFAULT; 3882 3883#ifdef CONFIG_CFQ_GROUP_IOSCHED 3884 /* 3885 * Take a reference to root group which we never drop. This is just 3886 * to make sure that cfq_put_cfqg() does not try to kfree root group 3887 */ 3888 atomic_set(&cfqg->ref, 1); 3889 rcu_read_lock(); 3890 cfq_blkiocg_add_blkio_group(&blkio_root_cgroup, &cfqg->blkg, 3891 (void *)cfqd, 0); 3892 rcu_read_unlock(); 3893#endif 3894 /* 3895 * Not strictly needed (since RB_ROOT just clears the node and we 3896 * zeroed cfqd on alloc), but better be safe in case someone decides 3897 * to add magic to the rb code 3898 */ 3899 for (i = 0; i < CFQ_PRIO_LISTS; i++) 3900 cfqd->prio_trees[i] = RB_ROOT; 3901 3902 /* 3903 * Our fallback cfqq if cfq_find_alloc_queue() runs into OOM issues. 
3904 * Grab a permanent reference to it, so that the normal code flow 3905 * will not attempt to free it. 3906 */ 3907 cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0); 3908 atomic_inc(&cfqd->oom_cfqq.ref); 3909 cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, &cfqd->root_group); 3910 3911 INIT_LIST_HEAD(&cfqd->cic_list); 3912 3913 cfqd->queue = q; 3914 3915 init_timer(&cfqd->idle_slice_timer); 3916 cfqd->idle_slice_timer.function = cfq_idle_slice_timer; 3917 cfqd->idle_slice_timer.data = (unsigned long) cfqd; 3918 3919 INIT_WORK(&cfqd->unplug_work, cfq_kick_queue); 3920 3921 cfqd->cfq_quantum = cfq_quantum; 3922 cfqd->cfq_fifo_expire[0] = cfq_fifo_expire[0]; 3923 cfqd->cfq_fifo_expire[1] = cfq_fifo_expire[1]; 3924 cfqd->cfq_back_max = cfq_back_max; 3925 cfqd->cfq_back_penalty = cfq_back_penalty; 3926 cfqd->cfq_slice[0] = cfq_slice_async; 3927 cfqd->cfq_slice[1] = cfq_slice_sync; 3928 cfqd->cfq_slice_async_rq = cfq_slice_async_rq; 3929 cfqd->cfq_slice_idle = cfq_slice_idle; 3930 cfqd->cfq_group_idle = cfq_group_idle; 3931 cfqd->cfq_latency = 1; 3932 cfqd->cfq_group_isolation = 0; 3933 cfqd->hw_tag = -1; 3934 /* 3935 * we optimistically start assuming sync ops weren't delayed in last 3936 * second, in order to have larger depth for async operations. 3937 */ 3938 cfqd->last_delayed_sync = jiffies - HZ; 3939 return cfqd; 3940} 3941 3942static void cfq_slab_kill(void) 3943{ 3944 /* 3945 * Caller already ensured that pending RCU callbacks are completed, 3946 * so we should have no busy allocations at this point. 3947 */ 3948 if (cfq_pool) 3949 kmem_cache_destroy(cfq_pool); 3950 if (cfq_ioc_pool) 3951 kmem_cache_destroy(cfq_ioc_pool); 3952} 3953 3954static int __init cfq_slab_setup(void) 3955{ 3956 cfq_pool = KMEM_CACHE(cfq_queue, 0); 3957 if (!cfq_pool) 3958 goto fail; 3959 3960 cfq_ioc_pool = KMEM_CACHE(cfq_io_context, 0); 3961 if (!cfq_ioc_pool) 3962 goto fail; 3963 3964 return 0; 3965fail: 3966 cfq_slab_kill(); 3967 return -ENOMEM; 3968} 3969 3970/* 3971 * sysfs parts below --> 3972 */ 3973static ssize_t 3974cfq_var_show(unsigned int var, char *page) 3975{ 3976 return sprintf(page, "%d\n", var); 3977} 3978 3979static ssize_t 3980cfq_var_store(unsigned int *var, const char *page, size_t count) 3981{ 3982 char *p = (char *) page; 3983 3984 *var = simple_strtoul(p, &p, 10); 3985 return count; 3986} 3987 3988#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \ 3989static ssize_t __FUNC(struct elevator_queue *e, char *page) \ 3990{ \ 3991 struct cfq_data *cfqd = e->elevator_data; \ 3992 unsigned int __data = __VAR; \ 3993 if (__CONV) \ 3994 __data = jiffies_to_msecs(__data); \ 3995 return cfq_var_show(__data, (page)); \ 3996} 3997SHOW_FUNCTION(cfq_quantum_show, cfqd->cfq_quantum, 0); 3998SHOW_FUNCTION(cfq_fifo_expire_sync_show, cfqd->cfq_fifo_expire[1], 1); 3999SHOW_FUNCTION(cfq_fifo_expire_async_show, cfqd->cfq_fifo_expire[0], 1); 4000SHOW_FUNCTION(cfq_back_seek_max_show, cfqd->cfq_back_max, 0); 4001SHOW_FUNCTION(cfq_back_seek_penalty_show, cfqd->cfq_back_penalty, 0); 4002SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1); 4003SHOW_FUNCTION(cfq_group_idle_show, cfqd->cfq_group_idle, 1); 4004SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1); 4005SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1); 4006SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0); 4007SHOW_FUNCTION(cfq_low_latency_show, cfqd->cfq_latency, 0); 4008SHOW_FUNCTION(cfq_group_isolation_show, cfqd->cfq_group_isolation, 0); 4009#undef SHOW_FUNCTION 4010 4011#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ 
#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \
static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \
{ \
	struct cfq_data *cfqd = e->elevator_data; \
	unsigned int __data; \
	int ret = cfq_var_store(&__data, (page), count); \
	if (__data < (MIN)) \
		__data = (MIN); \
	else if (__data > (MAX)) \
		__data = (MAX); \
	if (__CONV) \
		*(__PTR) = msecs_to_jiffies(__data); \
	else \
		*(__PTR) = __data; \
	return ret; \
}
STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, UINT_MAX, 0);
STORE_FUNCTION(cfq_fifo_expire_sync_store, &cfqd->cfq_fifo_expire[1], 1,
		UINT_MAX, 1);
STORE_FUNCTION(cfq_fifo_expire_async_store, &cfqd->cfq_fifo_expire[0], 1,
		UINT_MAX, 1);
STORE_FUNCTION(cfq_back_seek_max_store, &cfqd->cfq_back_max, 0, UINT_MAX, 0);
STORE_FUNCTION(cfq_back_seek_penalty_store, &cfqd->cfq_back_penalty, 1,
		UINT_MAX, 0);
STORE_FUNCTION(cfq_slice_idle_store, &cfqd->cfq_slice_idle, 0, UINT_MAX, 1);
STORE_FUNCTION(cfq_group_idle_store, &cfqd->cfq_group_idle, 0, UINT_MAX, 1);
STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1);
STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1);
STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1,
		UINT_MAX, 0);
STORE_FUNCTION(cfq_low_latency_store, &cfqd->cfq_latency, 0, 1, 0);
STORE_FUNCTION(cfq_group_isolation_store, &cfqd->cfq_group_isolation, 0, 1, 0);
#undef STORE_FUNCTION

#define CFQ_ATTR(name) \
	__ATTR(name, S_IRUGO|S_IWUSR, cfq_##name##_show, cfq_##name##_store)

static struct elv_fs_entry cfq_attrs[] = {
	CFQ_ATTR(quantum),
	CFQ_ATTR(fifo_expire_sync),
	CFQ_ATTR(fifo_expire_async),
	CFQ_ATTR(back_seek_max),
	CFQ_ATTR(back_seek_penalty),
	CFQ_ATTR(slice_sync),
	CFQ_ATTR(slice_async),
	CFQ_ATTR(slice_async_rq),
	CFQ_ATTR(slice_idle),
	CFQ_ATTR(group_idle),
	CFQ_ATTR(low_latency),
	CFQ_ATTR(group_isolation),
	__ATTR_NULL
};

static struct elevator_type iosched_cfq = {
	.ops = {
		.elevator_merge_fn = cfq_merge,
		.elevator_merged_fn = cfq_merged_request,
		.elevator_merge_req_fn = cfq_merged_requests,
		.elevator_allow_merge_fn = cfq_allow_merge,
		.elevator_bio_merged_fn = cfq_bio_merged,
		.elevator_dispatch_fn = cfq_dispatch_requests,
		.elevator_add_req_fn = cfq_insert_request,
		.elevator_activate_req_fn = cfq_activate_request,
		.elevator_deactivate_req_fn = cfq_deactivate_request,
		.elevator_queue_empty_fn = cfq_queue_empty,
		.elevator_completed_req_fn = cfq_completed_request,
		.elevator_former_req_fn = elv_rb_former_request,
		.elevator_latter_req_fn = elv_rb_latter_request,
		.elevator_set_req_fn = cfq_set_request,
		.elevator_put_req_fn = cfq_put_request,
		.elevator_may_queue_fn = cfq_may_queue,
		.elevator_init_fn = cfq_init_queue,
		.elevator_exit_fn = cfq_exit_queue,
		.trim = cfq_free_io_context,
	},
	.elevator_attrs = cfq_attrs,
	.elevator_name = "cfq",
	.elevator_owner = THIS_MODULE,
};

#ifdef CONFIG_CFQ_GROUP_IOSCHED
static struct blkio_policy_type blkio_policy_cfq = {
	.ops = {
		.blkio_unlink_group_fn = cfq_unlink_blkio_group,
		.blkio_update_group_weight_fn = cfq_update_blkio_group_weight,
	},
};
#else
static struct blkio_policy_type blkio_policy_cfq;
#endif

static int __init cfq_init(void)
{
	/*
	 * could be 0 on HZ < 1000 setups
	 * (e.g. HZ == 100 makes HZ / 125 == 0), so clamp those slices to at
	 * least one jiffy.
	 */
	if (!cfq_slice_async)
		cfq_slice_async = 1;
	if (!cfq_slice_idle)
		cfq_slice_idle = 1;

#ifdef CONFIG_CFQ_GROUP_IOSCHED
	if (!cfq_group_idle)
		cfq_group_idle = 1;
#else
	cfq_group_idle = 0;
#endif
	if (cfq_slab_setup())
		return -ENOMEM;

	elv_register(&iosched_cfq);
	blkio_policy_register(&blkio_policy_cfq);

	return 0;
}

static void __exit cfq_exit(void)
{
	DECLARE_COMPLETION_ONSTACK(all_gone);
	blkio_policy_unregister(&blkio_policy_cfq);
	elv_unregister(&iosched_cfq);
	ioc_gone = &all_gone;
	/* ioc_gone's update must be visible before reading ioc_count */
	smp_wmb();

	/*
	 * this also protects us from entering cfq_slab_kill() with
	 * pending RCU callbacks
	 */
	if (elv_ioc_count_read(cfq_ioc_count))
		wait_for_completion(&all_gone);
	ida_destroy(&cic_index_ida);
	cfq_slab_kill();
}

module_init(cfq_init);
module_exit(cfq_exit);

MODULE_AUTHOR("Jens Axboe");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Completely Fair Queueing IO scheduler");
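
/*
 * Usage sketch (illustrative only, not part of the scheduler): once the
 * module is registered via elv_register(), the scheduler can be selected
 * per block device and the cfq_attrs tunables above appear under sysfs.
 * The paths below assume a device named "sda"; substitute as appropriate.
 *
 *	# echo cfq > /sys/block/sda/queue/scheduler
 *	# cat /sys/block/sda/queue/iosched/slice_idle    (reported in msecs)
 *	# echo 0 > /sys/block/sda/queue/iosched/slice_idle
 */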