Version:  2.6.24 2.6.25 2.6.26 2.6.27 2.6.28

Architecture:  x86 m68k m68knommu mips powerpc sh blackfin

Linux/block/cfq-iosched.c

  1 /*
  2  *  CFQ, or complete fairness queueing, disk scheduler.
  3  *
  4  *  Based on ideas from a previously unfinished io
  5  *  scheduler (round robin per-process disk scheduling) and Andrea Arcangeli.
  6  *
  7  *  Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
  8  */
  9 #include <linux/module.h>
 10 #include <linux/blkdev.h>
 11 #include <linux/elevator.h>
 12 #include <linux/rbtree.h>
 13 #include <linux/ioprio.h>
 14 #include <linux/blktrace_api.h>
 15 
 16 /*
 17  * tunables
 18  */
 19 /* max queue in one round of service */
 20 static const int cfq_quantum = 4;
 21 static const int cfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
 22 /* maximum backwards seek, in KiB */
 23 static const int cfq_back_max = 16 * 1024;
 24 /* penalty of a backwards seek */
 25 static const int cfq_back_penalty = 2;
 26 static const int cfq_slice_sync = HZ / 10;
 27 static int cfq_slice_async = HZ / 25;
 28 static const int cfq_slice_async_rq = 2;
 29 static int cfq_slice_idle = HZ / 125;
 30 
 31 /*
 32  * offset from end of service tree
 33  */
 34 #define CFQ_IDLE_DELAY          (HZ / 5)
 35 
 36 /*
 37  * below this threshold, we consider thinktime immediate
 38  */
 39 #define CFQ_MIN_TT              (2)
 40 
 41 #define CFQ_SLICE_SCALE         (5)
 42 #define CFQ_HW_QUEUE_MIN        (5)
 43 
 44 #define RQ_CIC(rq)              \
 45         ((struct cfq_io_context *) (rq)->elevator_private)
 46 #define RQ_CFQQ(rq)             (struct cfq_queue *) ((rq)->elevator_private2)
 47 
 48 static struct kmem_cache *cfq_pool;
 49 static struct kmem_cache *cfq_ioc_pool;
 50 
 51 static DEFINE_PER_CPU(unsigned long, ioc_count);
 52 static struct completion *ioc_gone;
 53 static DEFINE_SPINLOCK(ioc_gone_lock);
 54 
 55 #define CFQ_PRIO_LISTS          IOPRIO_BE_NR
 56 #define cfq_class_idle(cfqq)    ((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
 57 #define cfq_class_rt(cfqq)      ((cfqq)->ioprio_class == IOPRIO_CLASS_RT)
 58 
 59 #define ASYNC                   (0)
 60 #define SYNC                    (1)
 61 
 62 #define sample_valid(samples)   ((samples) > 80)
 63 
 64 /*
 65  * Most of our rbtree usage is for sorting with min extraction, so
 66  * if we cache the leftmost node we don't have to walk down the tree
 67  * to find it. Idea borrowed from Ingo Molnars CFS scheduler. We should
 68  * move this into the elevator for the rq sorting as well.
 69  */
 70 struct cfq_rb_root {
 71         struct rb_root rb;
 72         struct rb_node *left;
 73 };
 74 #define CFQ_RB_ROOT     (struct cfq_rb_root) { RB_ROOT, NULL, }
 75 
 76 /*
 77  * Per block device queue structure
 78  */
 79 struct cfq_data {
 80         struct request_queue *queue;
 81 
 82         /*
 83          * rr list of queues with requests and the count of them
 84          */
 85         struct cfq_rb_root service_tree;
 86         unsigned int busy_queues;
 87 
 88         int rq_in_driver;
 89         int sync_flight;
 90 
 91         /*
 92          * queue-depth detection
 93          */
 94         int rq_queued;
 95         int hw_tag;
 96         int hw_tag_samples;
 97         int rq_in_driver_peak;
 98 
 99         /*
100          * idle window management
101          */
102         struct timer_list idle_slice_timer;
103         struct work_struct unplug_work;
104 
105         struct cfq_queue *active_queue;
106         struct cfq_io_context *active_cic;
107 
108         /*
109          * async queue for each priority case
110          */
111         struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR];
112         struct cfq_queue *async_idle_cfqq;
113 
114         sector_t last_position;
115         unsigned long last_end_request;
116 
117         /*
118          * tunables, see top of file
119          */
120         unsigned int cfq_quantum;
121         unsigned int cfq_fifo_expire[2];
122         unsigned int cfq_back_penalty;
123         unsigned int cfq_back_max;
124         unsigned int cfq_slice[2];
125         unsigned int cfq_slice_async_rq;
126         unsigned int cfq_slice_idle;
127 
128         struct list_head cic_list;
129 };
130 
131 /*
132  * Per process-grouping structure
133  */
134 struct cfq_queue {
135         /* reference count */
136         atomic_t ref;
137         /* various state flags, see below */
138         unsigned int flags;
139         /* parent cfq_data */
140         struct cfq_data *cfqd;
141         /* service_tree member */
142         struct rb_node rb_node;
143         /* service_tree key */
144         unsigned long rb_key;
145         /* sorted list of pending requests */
146         struct rb_root sort_list;
147         /* if fifo isn't expired, next request to serve */
148         struct request *next_rq;
149         /* requests queued in sort_list */
150         int queued[2];
151         /* currently allocated requests */
152         int allocated[2];
153         /* fifo list of requests in sort_list */
154         struct list_head fifo;
155 
156         unsigned long slice_end;
157         long slice_resid;
158 
159         /* pending metadata requests */
160         int meta_pending;
161         /* number of requests that are on the dispatch list or inside driver */
162         int dispatched;
163 
164         /* io prio of this group */
165         unsigned short ioprio, org_ioprio;
166         unsigned short ioprio_class, org_ioprio_class;
167 
168         pid_t pid;
169 };
170 
171 enum cfqq_state_flags {
172         CFQ_CFQQ_FLAG_on_rr = 0,        /* on round-robin busy list */
173         CFQ_CFQQ_FLAG_wait_request,     /* waiting for a request */
174         CFQ_CFQQ_FLAG_must_alloc,       /* must be allowed rq alloc */
175         CFQ_CFQQ_FLAG_must_alloc_slice, /* per-slice must_alloc flag */
176         CFQ_CFQQ_FLAG_must_dispatch,    /* must dispatch, even if expired */
177         CFQ_CFQQ_FLAG_fifo_expire,      /* FIFO checked in this slice */
178         CFQ_CFQQ_FLAG_idle_window,      /* slice idling enabled */
179         CFQ_CFQQ_FLAG_prio_changed,     /* task priority has changed */
180         CFQ_CFQQ_FLAG_queue_new,        /* queue never been serviced */
181         CFQ_CFQQ_FLAG_slice_new,        /* no requests dispatched in slice */
182         CFQ_CFQQ_FLAG_sync,             /* synchronous queue */
183 };
184 
185 #define CFQ_CFQQ_FNS(name)                                              \
186 static inline void cfq_mark_cfqq_##name(struct cfq_queue *cfqq)         \
187 {                                                                       \
188         (cfqq)->flags |= (1 << CFQ_CFQQ_FLAG_##name);                   \
189 }                                                                       \
190 static inline void cfq_clear_cfqq_##name(struct cfq_queue *cfqq)        \
191 {                                                                       \
192         (cfqq)->flags &= ~(1 << CFQ_CFQQ_FLAG_##name);                  \
193 }                                                                       \
194 static inline int cfq_cfqq_##name(const struct cfq_queue *cfqq)         \
195 {                                                                       \
196         return ((cfqq)->flags & (1 << CFQ_CFQQ_FLAG_##name)) != 0;      \
197 }
198 
199 CFQ_CFQQ_FNS(on_rr);
200 CFQ_CFQQ_FNS(wait_request);
201 CFQ_CFQQ_FNS(must_alloc);
202 CFQ_CFQQ_FNS(must_alloc_slice);
203 CFQ_CFQQ_FNS(must_dispatch);
204 CFQ_CFQQ_FNS(fifo_expire);
205 CFQ_CFQQ_FNS(idle_window);
206 CFQ_CFQQ_FNS(prio_changed);
207 CFQ_CFQQ_FNS(queue_new);
208 CFQ_CFQQ_FNS(slice_new);
209 CFQ_CFQQ_FNS(sync);
210 #undef CFQ_CFQQ_FNS
211 
212 #define cfq_log_cfqq(cfqd, cfqq, fmt, args...)  \
213         blk_add_trace_msg((cfqd)->queue, "cfq%d " fmt, (cfqq)->pid, ##args)
214 #define cfq_log(cfqd, fmt, args...)     \
215         blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args)
216 
217 static void cfq_dispatch_insert(struct request_queue *, struct request *);
218 static struct cfq_queue *cfq_get_queue(struct cfq_data *, int,
219                                        struct io_context *, gfp_t);
220 static struct cfq_io_context *cfq_cic_lookup(struct cfq_data *,
221                                                 struct io_context *);
222 
223 static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_context *cic,
224                                             int is_sync)
225 {
226         return cic->cfqq[!!is_sync];
227 }
228 
229 static inline void cic_set_cfqq(struct cfq_io_context *cic,
230                                 struct cfq_queue *cfqq, int is_sync)
231 {
232         cic->cfqq[!!is_sync] = cfqq;
233 }
234 
235 /*
236  * We regard a request as SYNC, if it's either a read or has the SYNC bit
237  * set (in which case it could also be direct WRITE).
238  */
239 static inline int cfq_bio_sync(struct bio *bio)
240 {
241         if (bio_data_dir(bio) == READ || bio_sync(bio))
242                 return 1;
243 
244         return 0;
245 }
246 
247 /*
248  * scheduler run of queue, if there are requests pending and no one in the
249  * driver that will restart queueing
250  */
251 static inline void cfq_schedule_dispatch(struct cfq_data *cfqd)
252 {
253         if (cfqd->busy_queues) {
254                 cfq_log(cfqd, "schedule dispatch");
255                 kblockd_schedule_work(cfqd->queue, &cfqd->unplug_work);
256         }
257 }
258 
259 static int cfq_queue_empty(struct request_queue *q)
260 {
261         struct cfq_data *cfqd = q->elevator->elevator_data;
262 
263         return !cfqd->busy_queues;
264 }
265 
266 /*
267  * Scale schedule slice based on io priority. Use the sync time slice only
268  * if a queue is marked sync and has sync io queued. A sync queue with async
269  * io only, should not get full sync slice length.
270  */
271 static inline int cfq_prio_slice(struct cfq_data *cfqd, int sync,
272                                  unsigned short prio)
273 {
274         const int base_slice = cfqd->cfq_slice[sync];
275 
276         WARN_ON(prio >= IOPRIO_BE_NR);
277 
278         return base_slice + (base_slice/CFQ_SLICE_SCALE * (4 - prio));
279 }
280 
281 static inline int
282 cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
283 {
284         return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio);
285 }
286 
287 static inline void
288 cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
289 {
290         cfqq->slice_end = cfq_prio_to_slice(cfqd, cfqq) + jiffies;
291         cfq_log_cfqq(cfqd, cfqq, "set_slice=%lu", cfqq->slice_end - jiffies);
292 }
293 
294 /*
295  * We need to wrap this check in cfq_cfqq_slice_new(), since ->slice_end
296  * isn't valid until the first request from the dispatch is activated
297  * and the slice time set.
298  */
299 static inline int cfq_slice_used(struct cfq_queue *cfqq)
300 {
301         if (cfq_cfqq_slice_new(cfqq))
302                 return 0;
303         if (time_before(jiffies, cfqq->slice_end))
304                 return 0;
305 
306         return 1;
307 }
308 
309 /*
310  * Lifted from AS - choose which of rq1 and rq2 that is best served now.
311  * We choose the request that is closest to the head right now. Distance
312  * behind the head is penalized and only allowed to a certain extent.
313  */
314 static struct request *
315 cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2)
316 {
317         sector_t last, s1, s2, d1 = 0, d2 = 0;
318         unsigned long back_max;
319 #define CFQ_RQ1_WRAP    0x01 /* request 1 wraps */
320 #define CFQ_RQ2_WRAP    0x02 /* request 2 wraps */
321         unsigned wrap = 0; /* bit mask: requests behind the disk head? */
322 
323         if (rq1 == NULL || rq1 == rq2)
324                 return rq2;
325         if (rq2 == NULL)
326                 return rq1;
327 
328         if (rq_is_sync(rq1) && !rq_is_sync(rq2))
329                 return rq1;
330         else if (rq_is_sync(rq2) && !rq_is_sync(rq1))
331                 return rq2;
332         if (rq_is_meta(rq1) && !rq_is_meta(rq2))
333                 return rq1;
334         else if (rq_is_meta(rq2) && !rq_is_meta(rq1))
335                 return rq2;
336 
337         s1 = rq1->sector;
338         s2 = rq2->sector;
339 
340         last = cfqd->last_position;
341 
342         /*
343          * by definition, 1KiB is 2 sectors
344          */
345         back_max = cfqd->cfq_back_max * 2;
346 
347         /*
348          * Strict one way elevator _except_ in the case where we allow
349          * short backward seeks which are biased as twice the cost of a
350          * similar forward seek.
351          */
352         if (s1 >= last)
353                 d1 = s1 - last;
354         else if (s1 + back_max >= last)
355                 d1 = (last - s1) * cfqd->cfq_back_penalty;
356         else
357                 wrap |= CFQ_RQ1_WRAP;
358 
359         if (s2 >= last)
360                 d2 = s2 - last;
361         else if (s2 + back_max >= last)
362                 d2 = (last - s2) * cfqd->cfq_back_penalty;
363         else
364                 wrap |= CFQ_RQ2_WRAP;
365 
366         /* Found required data */
367 
368         /*
369          * By doing switch() on the bit mask "wrap" we avoid having to
370          * check two variables for all permutations: --> faster!
371          */
372         switch (wrap) {
373         case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
374                 if (d1 < d2)
375                         return rq1;
376                 else if (d2 < d1)
377                         return rq2;
378                 else {
379                         if (s1 >= s2)
380                                 return rq1;
381                         else
382                                 return rq2;
383                 }
384 
385         case CFQ_RQ2_WRAP:
386                 return rq1;
387         case CFQ_RQ1_WRAP:
388                 return rq2;
389         case (CFQ_RQ1_WRAP|CFQ_RQ2_WRAP): /* both rqs wrapped */
390         default:
391                 /*
392                  * Since both rqs are wrapped,
393                  * start with the one that's further behind head
394                  * (--> only *one* back seek required),
395                  * since back seek takes more time than forward.
396                  */
397                 if (s1 <= s2)
398                         return rq1;
399                 else
400                         return rq2;
401         }
402 }
403 
404 /*
405  * The below is leftmost cache rbtree addon
406  */
407 static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root)
408 {
409         if (!root->left)
410                 root->left = rb_first(&root->rb);
411 
412         if (root->left)
413                 return rb_entry(root->left, struct cfq_queue, rb_node);
414 
415         return NULL;
416 }
417 
418 static void cfq_rb_erase(struct rb_node *n, struct cfq_rb_root *root)
419 {
420         if (root->left == n)
421                 root->left = NULL;
422 
423         rb_erase(n, &root->rb);
424         RB_CLEAR_NODE(n);
425 }
426 
427 /*
428  * would be nice to take fifo expire time into account as well
429  */
430 static struct request *
431 cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
432                   struct request *last)
433 {
434         struct rb_node *rbnext = rb_next(&last->rb_node);
435         struct rb_node *rbprev = rb_prev(&last->rb_node);
436         struct request *next = NULL, *prev = NULL;
437 
438         BUG_ON(RB_EMPTY_NODE(&last->rb_node));
439 
440         if (rbprev)
441                 prev = rb_entry_rq(rbprev);
442 
443         if (rbnext)
444                 next = rb_entry_rq(rbnext);
445         else {
446                 rbnext = rb_first(&cfqq->sort_list);
447                 if (rbnext && rbnext != &last->rb_node)
448                         next = rb_entry_rq(rbnext);
449         }
450 
451         return cfq_choose_req(cfqd, next, prev);
452 }
453 
454 static unsigned long cfq_slice_offset(struct cfq_data *cfqd,
455                                       struct cfq_queue *cfqq)
456 {
457         /*
458          * just an approximation, should be ok.
459          */
460         return (cfqd->busy_queues - 1) * (cfq_prio_slice(cfqd, 1, 0) -
461                        cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio));
462 }
463 
464 /*
465  * The cfqd->service_tree holds all pending cfq_queue's that have
466  * requests waiting to be processed. It is sorted in the order that
467  * we will service the queues.
468  */
469 static void cfq_service_tree_add(struct cfq_data *cfqd,
470                                     struct cfq_queue *cfqq, int add_front)
471 {
472         struct rb_node **p, *parent;
473         struct cfq_queue *__cfqq;
474         unsigned long rb_key;
475         int left;
476 
477         if (cfq_class_idle(cfqq)) {
478                 rb_key = CFQ_IDLE_DELAY;
479                 parent = rb_last(&cfqd->service_tree.rb);
480                 if (parent && parent != &cfqq->rb_node) {
481                         __cfqq = rb_entry(parent, struct cfq_queue, rb_node);
482                         rb_key += __cfqq->rb_key;
483                 } else
484                         rb_key += jiffies;
485         } else if (!add_front) {
486                 rb_key = cfq_slice_offset(cfqd, cfqq) + jiffies;
487                 rb_key += cfqq->slice_resid;
488                 cfqq->slice_resid = 0;
489         } else
490                 rb_key = 0;
491 
492         if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
493                 /*
494                  * same position, nothing more to do
495                  */
496                 if (rb_key == cfqq->rb_key)
497                         return;
498 
499                 cfq_rb_erase(&cfqq->rb_node, &cfqd->service_tree);
500         }
501 
502         left = 1;
503         parent = NULL;
504         p = &cfqd->service_tree.rb.rb_node;
505         while (*p) {
506                 struct rb_node **n;
507 
508                 parent = *p;
509                 __cfqq = rb_entry(parent, struct cfq_queue, rb_node);
510 
511                 /*
512                  * sort RT queues first, we always want to give
513                  * preference to them. IDLE queues goes to the back.
514                  * after that, sort on the next service time.
515                  */
516                 if (cfq_class_rt(cfqq) > cfq_class_rt(__cfqq))
517                         n = &(*p)->rb_left;
518                 else if (cfq_class_rt(cfqq) < cfq_class_rt(__cfqq))
519                         n = &(*p)->rb_right;
520                 else if (cfq_class_idle(cfqq) < cfq_class_idle(__cfqq))
521                         n = &(*p)->rb_left;
522                 else if (cfq_class_idle(cfqq) > cfq_class_idle(__cfqq))
523                         n = &(*p)->rb_right;
524                 else if (rb_key < __cfqq->rb_key)
525                         n = &(*p)->rb_left;
526                 else
527                         n = &(*p)->rb_right;
528 
529                 if (n == &(*p)->rb_right)
530                         left = 0;
531 
532                 p = n;
533         }
534 
535         if (left)
536                 cfqd->service_tree.left = &cfqq->rb_node;
537 
538         cfqq->rb_key = rb_key;
539         rb_link_node(&cfqq->rb_node, parent, p);
540         rb_insert_color(&cfqq->rb_node, &cfqd->service_tree.rb);
541 }
542 
543 /*
544  * Update cfqq's position in the service tree.
545  */
546 static void cfq_resort_rr_list(struct cfq_data *cfqd, struct cfq_queue *cfqq)
547 {
548         /*
549          * Resorting requires the cfqq to be on the RR list already.
550          */
551         if (cfq_cfqq_on_rr(cfqq))
552                 cfq_service_tree_add(cfqd, cfqq, 0);
553 }
554 
555 /*
556  * add to busy list of queues for service, trying to be fair in ordering
557  * the pending list according to last request service
558  */
559 static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
560 {
561         cfq_log_cfqq(cfqd, cfqq, "add_to_rr");
562         BUG_ON(cfq_cfqq_on_rr(cfqq));
563         cfq_mark_cfqq_on_rr(cfqq);
564         cfqd->busy_queues++;
565 
566         cfq_resort_rr_list(cfqd, cfqq);
567 }
568 
569 /*
570  * Called when the cfqq no longer has requests pending, remove it from
571  * the service tree.
572  */
573 static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
574 {
575         cfq_log_cfqq(cfqd, cfqq, "del_from_rr");
576         BUG_ON(!cfq_cfqq_on_rr(cfqq));
577         cfq_clear_cfqq_on_rr(cfqq);
578 
579         if (!RB_EMPTY_NODE(&cfqq->rb_node))
580                 cfq_rb_erase(&cfqq->rb_node, &cfqd->service_tree);
581 
582         BUG_ON(!cfqd->busy_queues);
583         cfqd->busy_queues--;
584 }
585 
586 /*
587  * rb tree support functions
588  */
589 static void cfq_del_rq_rb(struct request *rq)
590 {
591         struct cfq_queue *cfqq = RQ_CFQQ(rq);
592         struct cfq_data *cfqd = cfqq->cfqd;
593         const int sync = rq_is_sync(rq);
594 
595         BUG_ON(!cfqq->queued[sync]);
596         cfqq->queued[sync]--;
597 
598         elv_rb_del(&cfqq->sort_list, rq);
599 
600         if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list))
601                 cfq_del_cfqq_rr(cfqd, cfqq);
602 }
603 
604 static void cfq_add_rq_rb(struct request *rq)
605 {
606         struct cfq_queue *cfqq = RQ_CFQQ(rq);
607         struct cfq_data *cfqd = cfqq->cfqd;
608         struct request *__alias;
609 
610         cfqq->queued[rq_is_sync(rq)]++;
611 
612         /*
613          * looks a little odd, but the first insert might return an alias.
614          * if that happens, put the alias on the dispatch list
615          */
616         while ((__alias = elv_rb_add(&cfqq->sort_list, rq)) != NULL)
617                 cfq_dispatch_insert(cfqd->queue, __alias);
618 
619         if (!cfq_cfqq_on_rr(cfqq))
620                 cfq_add_cfqq_rr(cfqd, cfqq);
621 
622         /*
623          * check if this request is a better next-serve candidate
624          */
625         cfqq->next_rq = cfq_choose_req(cfqd, cfqq->next_rq, rq);
626         BUG_ON(!cfqq->next_rq);
627 }
628 
629 static void cfq_reposition_rq_rb(struct cfq_queue *cfqq, struct request *rq)
630 {
631         elv_rb_del(&cfqq->sort_list, rq);
632         cfqq->queued[rq_is_sync(rq)]--;
633         cfq_add_rq_rb(rq);
634 }
635 
636 static struct request *
637 cfq_find_rq_fmerge(struct cfq_data *cfqd, struct bio *bio)
638 {
639         struct task_struct *tsk = current;
640         struct cfq_io_context *cic;
641         struct cfq_queue *cfqq;
642 
643         cic = cfq_cic_lookup(cfqd, tsk->io_context);
644         if (!cic)
645                 return NULL;
646 
647         cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio));
648         if (cfqq) {
649                 sector_t sector = bio->bi_sector + bio_sectors(bio);
650 
651                 return elv_rb_find(&cfqq->sort_list, sector);
652         }
653 
654         return NULL;
655 }
656 
657 static void cfq_activate_request(struct request_queue *q, struct request *rq)
658 {
659         struct cfq_data *cfqd = q->elevator->elevator_data;
660 
661         cfqd->rq_in_driver++;
662         cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "activate rq, drv=%d",
663                                                 cfqd->rq_in_driver);
664 
665         cfqd->last_position = rq->hard_sector + rq->hard_nr_sectors;
666 }
667 
668 static void cfq_deactivate_request(struct request_queue *q, struct request *rq)
669 {
670         struct cfq_data *cfqd = q->elevator->elevator_data;
671 
672         WARN_ON(!cfqd->rq_in_driver);
673         cfqd->rq_in_driver--;
674         cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "deactivate rq, drv=%d",
675                                                 cfqd->rq_in_driver);
676 }
677 
678 static void cfq_remove_request(struct request *rq)
679 {
680         struct cfq_queue *cfqq = RQ_CFQQ(rq);
681 
682         if (cfqq->next_rq == rq)
683                 cfqq->next_rq = cfq_find_next_rq(cfqq->cfqd, cfqq, rq);
684 
685         list_del_init(&rq->queuelist);
686         cfq_del_rq_rb(rq);
687 
688         cfqq->cfqd->rq_queued--;
689         if (rq_is_meta(rq)) {
690                 WARN_ON(!cfqq->meta_pending);
691                 cfqq->meta_pending--;
692         }
693 }
694 
695 static int cfq_merge(struct request_queue *q, struct request **req,
696                      struct bio *bio)
697 {
698         struct cfq_data *cfqd = q->elevator->elevator_data;
699         struct request *__rq;
700 
701         __rq = cfq_find_rq_fmerge(cfqd, bio);
702         if (__rq && elv_rq_merge_ok(__rq, bio)) {
703                 *req = __rq;
704                 return ELEVATOR_FRONT_MERGE;
705         }
706 
707         return ELEVATOR_NO_MERGE;
708 }
709 
710 static void cfq_merged_request(struct request_queue *q, struct request *req,
711                                int type)
712 {
713         if (type == ELEVATOR_FRONT_MERGE) {
714                 struct cfq_queue *cfqq = RQ_CFQQ(req);
715 
716                 cfq_reposition_rq_rb(cfqq, req);
717         }
718 }
719 
720 static void
721 cfq_merged_requests(struct request_queue *q, struct request *rq,
722                     struct request *next)
723 {
724         /*
725          * reposition in fifo if next is older than rq
726          */
727         if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
728             time_before(next->start_time, rq->start_time))
729                 list_move(&rq->queuelist, &next->queuelist);
730 
731         cfq_remove_request(next);
732 }
733 
734 static int cfq_allow_merge(struct request_queue *q, struct request *rq,
735                            struct bio *bio)
736 {
737         struct cfq_data *cfqd = q->elevator->elevator_data;
738         struct cfq_io_context *cic;
739         struct cfq_queue *cfqq;
740 
741         /*
742          * Disallow merge of a sync bio into an async request.
743          */
744         if (cfq_bio_sync(bio) && !rq_is_sync(rq))
745                 return 0;
746 
747         /*
748          * Lookup the cfqq that this bio will be queued with. Allow
749          * merge only if rq is queued there.
750          */
751         cic = cfq_cic_lookup(cfqd, current->io_context);
752         if (!cic)
753                 return 0;
754 
755         cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio));
756         if (cfqq == RQ_CFQQ(rq))
757                 return 1;
758 
759         return 0;
760 }
761 
762 static void __cfq_set_active_queue(struct cfq_data *cfqd,
763                                    struct cfq_queue *cfqq)
764 {
765         if (cfqq) {
766                 cfq_log_cfqq(cfqd, cfqq, "set_active");
767                 cfqq->slice_end = 0;
768                 cfq_clear_cfqq_must_alloc_slice(cfqq);
769                 cfq_clear_cfqq_fifo_expire(cfqq);
770                 cfq_mark_cfqq_slice_new(cfqq);
771                 cfq_clear_cfqq_queue_new(cfqq);
772         }
773 
774         cfqd->active_queue = cfqq;
775 }
776 
777 /*
778  * current cfqq expired its slice (or was too idle), select new one
779  */
780 static void
781 __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
782                     int timed_out)
783 {
784         cfq_log_cfqq(cfqd, cfqq, "slice expired t=%d", timed_out);
785 
786         if (cfq_cfqq_wait_request(cfqq))
787                 del_timer(&cfqd->idle_slice_timer);
788 
789         cfq_clear_cfqq_must_dispatch(cfqq);
790         cfq_clear_cfqq_wait_request(cfqq);
791 
792         /*
793          * store what was left of this slice, if the queue idled/timed out
794          */
795         if (timed_out && !cfq_cfqq_slice_new(cfqq)) {
796                 cfqq->slice_resid = cfqq->slice_end - jiffies;
797                 cfq_log_cfqq(cfqd, cfqq, "resid=%ld", cfqq->slice_resid);
798         }
799 
800         cfq_resort_rr_list(cfqd, cfqq);
801 
802         if (cfqq == cfqd->active_queue)
803                 cfqd->active_queue = NULL;
804 
805         if (cfqd->active_cic) {
806                 put_io_context(cfqd->active_cic->ioc);
807                 cfqd->active_cic = NULL;
808         }
809 }
810 
811 static inline void cfq_slice_expired(struct cfq_data *cfqd, int timed_out)
812 {
813         struct cfq_queue *cfqq = cfqd->active_queue;
814 
815         if (cfqq)
816                 __cfq_slice_expired(cfqd, cfqq, timed_out);
817 }
818 
819 /*
820  * Get next queue for service. Unless we have a queue preemption,
821  * we'll simply select the first cfqq in the service tree.
822  */
823 static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
824 {
825         if (RB_EMPTY_ROOT(&cfqd->service_tree.rb))
826                 return NULL;
827 
828         return cfq_rb_first(&cfqd->service_tree);
829 }
830 
831 /*
832  * Get and set a new active queue for service.
833  */
834 static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd)
835 {
836         struct cfq_queue *cfqq;
837 
838         cfqq = cfq_get_next_queue(cfqd);
839         __cfq_set_active_queue(cfqd, cfqq);
840         return cfqq;
841 }
842 
843 static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd,
844                                           struct request *rq)
845 {
846         if (rq->sector >= cfqd->last_position)
847                 return rq->sector - cfqd->last_position;
848         else
849                 return cfqd->last_position - rq->sector;
850 }
851 
852 static inline int cfq_rq_close(struct cfq_data *cfqd, struct request *rq)
853 {
854         struct cfq_io_context *cic = cfqd->active_cic;
855 
856         if (!sample_valid(cic->seek_samples))
857                 return 0;
858 
859         return cfq_dist_from_last(cfqd, rq) <= cic->seek_mean;
860 }
861 
862 static int cfq_close_cooperator(struct cfq_data *cfq_data,
863                                 struct cfq_queue *cfqq)
864 {
865         /*
866          * We should notice if some of the queues are cooperating, eg
867          * working closely on the same area of the disk. In that case,
868          * we can group them together and don't waste time idling.
869          */
870         return 0;
871 }
872 
873 #define CIC_SEEKY(cic) ((cic)->seek_mean > (8 * 1024))
874 
875 static void cfq_arm_slice_timer(struct cfq_data *cfqd)
876 {
877         struct cfq_queue *cfqq = cfqd->active_queue;
878         struct cfq_io_context *cic;
879         unsigned long sl;
880 
881         /*
882          * SSD device without seek penalty, disable idling. But only do so
883          * for devices that support queuing, otherwise we still have a problem
884          * with sync vs async workloads.
885          */
886         if (blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag)
887                 return;
888 
889         WARN_ON(!RB_EMPTY_ROOT(&cfqq->sort_list));
890         WARN_ON(cfq_cfqq_slice_new(cfqq));
891 
892         /*
893          * idle is disabled, either manually or by past process history
894          */
895         if (!cfqd->cfq_slice_idle || !cfq_cfqq_idle_window(cfqq))
896                 return;
897 
898         /*
899          * still requests with the driver, don't idle
900          */
901         if (cfqd->rq_in_driver)
902                 return;
903 
904         /*
905          * task has exited, don't wait
906          */
907         cic = cfqd->active_cic;
908         if (!cic || !atomic_read(&cic->ioc->nr_tasks))
909                 return;
910 
911         /*
912          * See if this prio level has a good candidate
913          */
914         if (cfq_close_cooperator(cfqd, cfqq) &&
915             (sample_valid(cic->ttime_samples) && cic->ttime_mean > 2))
916                 return;
917 
918         cfq_mark_cfqq_must_dispatch(cfqq);
919         cfq_mark_cfqq_wait_request(cfqq);
920 
921         /*
922          * we don't want to idle for seeks, but we do want to allow
923          * fair distribution of slice time for a process doing back-to-back
924          * seeks. so allow a little bit of time for him to submit a new rq
925          */
926         sl = cfqd->cfq_slice_idle;
927         if (sample_valid(cic->seek_samples) && CIC_SEEKY(cic))
928                 sl = min(sl, msecs_to_jiffies(CFQ_MIN_TT));
929 
930         mod_timer(&cfqd->idle_slice_timer, jiffies + sl);
931         cfq_log(cfqd, "arm_idle: %lu", sl);
932 }
933 
934 /*
935  * Move request from internal lists to the request queue dispatch list.
936  */
937 static void cfq_dispatch_insert(struct request_queue *q, struct request *rq)
938 {
939         struct cfq_data *cfqd = q->elevator->elevator_data;
940         struct cfq_queue *cfqq = RQ_CFQQ(rq);
941 
942         cfq_log_cfqq(cfqd, cfqq, "dispatch_insert");
943 
944         cfq_remove_request(rq);
945         cfqq->dispatched++;
946         elv_dispatch_sort(q, rq);
947 
948         if (cfq_cfqq_sync(cfqq))
949                 cfqd->sync_flight++;
950 }
951 
952 /*
953  * return expired entry, or NULL to just start from scratch in rbtree
954  */
955 static struct request *cfq_check_fifo(struct cfq_queue *cfqq)
956 {
957         struct cfq_data *cfqd = cfqq->cfqd;
958         struct request *rq;
959         int fifo;
960 
961         if (cfq_cfqq_fifo_expire(cfqq))
962                 return NULL;
963 
964         cfq_mark_cfqq_fifo_expire(cfqq);
965 
966         if (list_empty(&cfqq->fifo))
967                 return NULL;
968 
969         fifo = cfq_cfqq_sync(cfqq);
970         rq = rq_entry_fifo(cfqq->fifo.next);
971 
972         if (time_before(jiffies, rq->start_time + cfqd->cfq_fifo_expire[fifo]))
973                 rq = NULL;
974 
975         cfq_log_cfqq(cfqd, cfqq, "fifo=%p", rq);
976         return rq;
977 }
978 
979 static inline int
980 cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
981 {
982         const int base_rq = cfqd->cfq_slice_async_rq;
983 
984         WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR);
985 
986         return 2 * (base_rq + base_rq * (CFQ_PRIO_LISTS - 1 - cfqq->ioprio));
987 }
988 
989 /*
990  * Select a queue for service. If we have a current active queue,
991  * check whether to continue servicing it, or retrieve and set a new one.
992  */
993 static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
994 {
995         struct cfq_queue *cfqq;
996 
997         cfqq = cfqd->active_queue;
998         if (!cfqq)
999                 goto new_queue;
1000 
1001         /*
1002          * The active queue has run out of time, expire it and select new.
1003          */
1004         if (cfq_slice_used(cfqq))
1005                 goto expire;
1006 
1007         /*
1008          * The active queue has requests and isn't expired, allow it to
1009          * dispatch.
1010          */
1011         if (!RB_EMPTY_ROOT(&cfqq->sort_list))
1012                 goto keep_queue;
1013 
1014         /*
1015          * No requests pending. If the active queue still has requests in
1016          * flight or is idling for a new request, allow either of these
1017          * conditions to happen (or time out) before selecting a new queue.
1018          */
1019         if (timer_pending(&cfqd->idle_slice_timer) ||
1020             (cfqq->dispatched && cfq_cfqq_idle_window(cfqq))) {
1021                 cfqq = NULL;
1022                 goto keep_queue;
1023         }
1024 
1025 expire:
1026         cfq_slice_expired(cfqd, 0);
1027 new_queue:
1028         cfqq = cfq_set_active_queue(cfqd);
1029 keep_queue:
1030         return cfqq;
1031 }
1032 
1033 /*
1034  * Dispatch some requests from cfqq, moving them to the request queue
1035  * dispatch list.
1036  */
1037 static int
1038 __cfq_dispatch_requests(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1039                         int max_dispatch)
1040 {
1041         int dispatched = 0;
1042 
1043         BUG_ON(RB_EMPTY_ROOT(&cfqq->sort_list));
1044 
1045         do {
1046                 struct request *rq;
1047 
1048                 /*
1049                  * follow expired path, else get first next available
1050                  */
1051                 rq = cfq_check_fifo(cfqq);
1052                 if (rq == NULL)
1053                         rq = cfqq->next_rq;
1054 
1055                 /*
1056                  * finally, insert request into driver dispatch list
1057                  */
1058                 cfq_dispatch_insert(cfqd->queue, rq);
1059 
1060                 dispatched++;
1061 
1062                 if (!cfqd->active_cic) {
1063                         atomic_inc(&RQ_CIC(rq)->ioc->refcount);
1064                         cfqd->active_cic = RQ_CIC(rq);
1065                 }
1066 
1067                 if (RB_EMPTY_ROOT(&cfqq->sort_list))
1068                         break;
1069 
1070         } while (dispatched < max_dispatch);
1071 
1072         /*
1073          * expire an async queue immediately if it has used up its slice. idle
1074          * queue always expire after 1 dispatch round.
1075          */
1076         if (cfqd->busy_queues > 1 && ((!cfq_cfqq_sync(cfqq) &&
1077             dispatched >= cfq_prio_to_maxrq(cfqd, cfqq)) ||
1078             cfq_class_idle(cfqq))) {
1079                 cfqq->slice_end = jiffies + 1;
1080                 cfq_slice_expired(cfqd, 0);
1081         }
1082 
1083         return dispatched;
1084 }
1085 
1086 static int __cfq_forced_dispatch_cfqq(struct cfq_queue *cfqq)
1087 {
1088         int dispatched = 0;
1089 
1090         while (cfqq->next_rq) {
1091                 cfq_dispatch_insert(cfqq->cfqd->queue, cfqq->next_rq);
10