Linux IO Scheduler -- Deadline
Linux IO Scheduler -- Deadline
原理:
Deadline调度器对一个请求的多方面特性进行权衡来进行调度,以期既能满足块设备扇区的顺序访问,又兼顾到一个请求不会在队列中等待太久导致饿死。
Deadline调度器为了兼顾这两个方面,通过红黑树来对请求按起始扇区序号进行排序,称为sort_list;通过fifo对请求按它们的生成时间进行排序,称为fifo_list。
batching:
每当确定了一个传输方向(读或写),那么将会从相应的sort_list中将一批连续请求dispatch到requst_queue的请求队列里,具体的数目由fifo_batch(default:16)来确定。
扇区:
磁盘的每一面被分为很多条磁道,即表面上的一些同心圆,越接近中心,圆就越小。而每一个磁道又按512个字节为单位划分为等分,叫做扇区.
总体来讲,deadline算法对request进行了优先权控制调度,主要表现在如下几个方面:
1)读写请求分离,读请求具有高优先调度权,除非写请求即将被饿死的时候,才会去调度处理写请求。这种处理可以保证读请求的延迟时间最小化。
2)对请求的顺序批量处理。对那些地址临近的顺序化请求,deadline给予了高优先级处理权。
例如一个写请求得到调度后,其临近的request会在紧接着的调度过程中被处理掉。这种顺序批量处理的方法可以最大程度的减少磁盘抖动。
3)保证每个请求的延迟时间。每个请求都赋予了一个最大延迟时间,如果达到延迟时间的上限,那么这个请求就会被提前处理掉,
此时,会破坏磁盘访问的顺序化特征,会影响性能,但是,保证了每个请求的最大延迟时间。
数据结构:
struct deadline_data {
/*
* run time data
*/
/*
* requests (deadline_rq s) are present on both sort_list and fifo_list
*/
struct rb_root sort_list[2]; //按照扇区排列的读写红黑树
struct list_head fifo_list[2]; //按照到期时间排列的先进先出的读写FIFO
/*
* next in sort order. read, write or both are NULL
*/
struct request *next_rq[2]; //记录批量处理请求的下一个请求
unsigned int batching; /* number of sequential requests made */
sector_t last_sector; /* head position */
unsigned int starved; /* times reads have starved writes */ //写饥饿的次数
/*
* settings that change how the i/o scheduler behaves
*/
int fifo_expire[2]; //读写超时时间,默认读是500ms,写是5s(5000ms)
int fifo_batch; //批量处理请求的数量限制,默认是16
int writes_starved; //写饥饿的最大限制,默认是2
int front_merges;
};
static struct elevator_type iosched_deadline = {
.ops = {
.elevator_merge_fn = deadline_merge, //检查是否可以向前合并
.elevator_merged_fn = deadline_merged_request, //合并bio到请求后的处理,向前合并后,将请求从红黑树中删除,重新插入
.elevator_merge_req_fn = deadline_merged_requests, //请求之间合并后的处理,根据请求的到期时间修改,并从deadline调度器中删除next请求
.elevator_dispatch_fn = deadline_dispatch_requests, //派发请求到请求队列的派发队列
.elevator_add_req_fn = deadline_add_request, //添加请求rq到红黑树和FIFO队列
.elevator_queue_empty_fn = deadline_queue_empty, //判断调度队列是否为空
.elevator_former_req_fn = elv_rb_former_request, //获取前一个请求
.elevator_latter_req_fn = elv_rb_latter_request, //获取后一个请求
.elevator_init_fn = deadline_init_queue, //初始化调度器的私有数据
.elevator_exit_fn = deadline_exit_queue, //释放调度器的私有数据
},
.elevator_attrs = deadline_attrs,
.elevator_name = "deadline",
.elevator_owner = THIS_MODULE,
};
include/linux/blkdev.h
#define rq_data_dir(rq) ((rq)->cmd_flags & 1)
include/linux/rbtree.h:132:#define RB_ROOT (struct rb_root) { NULL, }
(1)deadline_add_request
97 /*
98 * add rq to rbtree and fifo
99 */
100 static void
101 deadline_add_request(struct request_queue *q, struct request *rq)
102 {
103 struct deadline_data *dd = q->elevator->elevator_data;
// 请求的传送方向
104 const int data_dir = rq_data_dir(rq);
105
106 deadline_add_rq_rb(dd, rq);
107
108 /*
109 * set expire time and add to fifo list
110 */
// 设置超时时间,这个请求在超时时间(jiffies + dd->fifo_expire[data_dir])必须得到响应
111 rq_set_fifo_time(rq, jiffies + dd->fifo_expire[data_dir]);
112 list_add_tail(&rq->queuelist, &dd->fifo_list[data_dir]);
113 }
全局变量jiffies用来记录自系统启动以来产生的节拍的总数。系统运行时间以秒为单位,等于jiffies/HZ(HZ为内核配置的每秒节拍数)。
linux/blkdev.h
#define rq_data_dir(rq) ((rq)->cmd_flags & 1)
这个宏从请求中抽取传送的方向; 0 返回值表示从设备中读, 1 返回值表示写入设备.
76 static void
77 deadline_add_rq_rb(struct deadline_data *dd, struct request *rq)
78 {
// 决定request插入哪棵树上
79 struct rb_root *root = deadline_rb_root(dd, rq);
80 struct request *__alias;
81 // rb tree如果存在相等扇区的request, 则删除old request, 否则将request加入rb tree
82 while (unlikely(__alias = elv_rb_add(root, rq)))
83 deadline_move_request(dd, __alias);
84 }
56 static inline struct rb_root *
57 deadline_rb_root(struct deadline_data *dd, struct request *rq)
58 {
59 return &dd->sort_list[rq_data_dir(rq)];
60 }
360 struct request *elv_rb_add(struct rb_root *root, struct request *rq)
361 {
362 struct rb_node **p = &root->rb_node;
363 struct rb_node *parent = NULL;
364 struct request *__rq;
365 // 查找与rq在同一扇区的request,并返回old request指针
366 while (*p) {
367 parent = *p;
// 用来获得包含struct rb_node的结构体(struct request)的首地址
368 __rq = rb_entry(parent, struct request, rb_node);
369 // 比较rq的扇区号与__rq的扇区号
370 if (blk_rq_pos(rq) < blk_rq_pos(__rq))
371 p = &(*p)->rb_left;
372 else if (blk_rq_pos(rq) > blk_rq_pos(__rq))
373 p = &(*p)->rb_right;
374 else
375 return __rq;
376 }
377 //如果没找到,则将rq插入红黑树中
378 rb_link_node(&rq->rb_node, parent, p);
379 rb_insert_color(&rq->rb_node, root);
380 return NULL;
381 }
382 EXPORT_SYMBOL(elv_rb_add);
#define rb_entry(ptr, type, member) container_of(ptr, type, member) //用来获得包含struct rb_node的结构体的首地址
获取请求操作的起始扇区(sector cursor)
852 static inline sector_t blk_rq_pos(const struct request *rq)
853 {
854 return rq->__sector;
855 }
206 static void
207 deadline_move_request(struct deadline_data *dd, struct request *rq)
208 {
209 const int data_dir = rq_data_dir(rq);
210
211 dd->next_rq[READ] = NULL;
212 dd->next_rq[WRITE] = NULL;
213 dd->next_rq[data_dir] = deadline_latter_request(rq);
214
215 dd->last_sector = rq_end_sector(rq);
216
217 /*
218 * take it off the sort and fifo list, move
219 * to dispatch queue
220 */
221 deadline_move_to_dispatch(dd, rq);
222 }
191 /*
192 * move request from sort list to dispatch queue.
193 */
194 static inline void
195 deadline_move_to_dispatch(struct deadline_data *dd, struct request *rq)
196 {
197 struct request_queue *q = rq->q;
198 // remove rq from rbtree and fifo
199 deadline_remove_request(q, rq);
200 elv_dispatch_add_tail(q, rq);
201 }
456 /*
457 * Insert rq into dispatch queue of q. Queue lock must be held on
458 * entry. rq is added to the back of the dispatch queue. To be used by
459 * specific elevators.
460 */
461 void elv_dispatch_add_tail(struct request_queue *q, struct request *rq)
462 {
463 if (q->last_merge == rq)
464 q->last_merge = NULL;
465
466 elv_rqhash_del(q, rq);
467
468 q->nr_sorted--;
469
470 q->end_sector = rq_end_sector(rq);
471 q->boundary_rq = rq;
472 list_add_tail(&rq->queuelist, &q->queue_head);
473 }
474 EXPORT_SYMBOL(elv_dispatch_add_tail);
475
./block/elevator.c
309 static inline void __elv_rqhash_del(struct request *rq)
310 {
311 hlist_del_init(&rq->hash);
312 }
313
./include/linux/list.h
583 static inline void hlist_del_init(struct hlist_node *n)
584 {
585 if (!hlist_unhashed(n)) {
586 __hlist_del(n);
587 INIT_HLIST_NODE(n);
588 }
589 }
struct hlist_head{
struct hlist_node *first;
}
struct hlist_node {
struct hlist_node *next,**pprev;
}
pprev指向前一个节点(可能是表头)中的next(对于表头则是first)指针
567 static inline void __hlist_del(struct hlist_node *n)
568 {
569 struct hlist_node *next = n->next; // 获取next节点的指针
570 struct hlist_node **pprev = n->pprev;// 保留n的pprev域(即头结点first域的地址),此时 pprev = &first
/**
* 由于pprev = &first,故*pprev = next,相当于 first = next
* 即将hash桶的头结点指针指向原来的第二个结点
*/
571 *pprev = next;
572 if (next)
573 next->pprev = pprev; // 将n的下个结点的pprev域设置为头结点first域的地址
574 }
(2)为什么需要merge request?
试想一下,你们在学校食堂排队吃饭时,第1,3,5号人说他要水煮牛肉,而第2,4,6号人说他要鱼香肉丝,
那聪明的厨师会一次做3人量(合并)的水煮牛肉先给1,3,5,
再一次性地做3人量的鱼香肉丝分别给2,4,6号人,并且大家对这种"插队"没有异议。
这回厨师可高兴:"6个人只做两次"。
试想一下,不管是对于机械硬盘也好,还是以闪存为基础的ssd,u盘,sd卡也好,将请求合并,意味着和这些东东上的控制器打交道的次数少了。
传输变成更大块的连续传输(bulk),性能自然就提高了。尤其是对u盘和sd卡,因为它们的控制器太慢了。
有人可能会觉得ssd不需要,虽然ssd的4k随机性能很好,但是再好也被它的连续性能完爆。机械硬盘更是如此。所以这种合并是合理的
deadline_merge,此函数只是判断bio是否满足front merge条件:如果满足,返回 ELEVATOR_FRONT_MERGE 并通过*req带回可合并的请求;不满足,返回 ELEVATOR_NO_MERGE
126 static int
127 deadline_merge(struct request_queue *q, struct request **req, struct bio *bio)
128 {
129 struct deadline_data *dd = q->elevator->elevator_data;
130 struct request *__rq;
131 int ret;
132
133 /*
134 * check for front merge
135 */
136 if (dd->front_merges) {
// 得到bio请求的结束扇区地址
137 sector_t sector = bio->bi_sector + bio_sectors(bio);
138 //在同方向(读或写)红黑树中查找请求(rb tree中)的起始扇区地址与带合并请求(bio)的结束扇区地址相同的请求。两个同方向的请求,只有地址(sector)连续, 才考虑merge
139 __rq = elv_rb_find(&dd->sort_list[bio_data_dir(bio)], sector);
140 if (__rq) {
141 BUG_ON(sector != blk_rq_pos(__rq));
142 // 判断是否满足merge条件
143 if (elv_rq_merge_ok(__rq, bio)) {
144 ret = ELEVATOR_FRONT_MERGE;
145 goto out;
146 }
147 }
148 }
149
150 return ELEVATOR_NO_MERGE;
151 out:
// 更新为即将合并后的request
152 *req = __rq;
153 return ret;
154 }
linux/bio.h
62 struct bio {
63 sector_t bi_sector; /* device address in 512 byte
64 sectors */
65 struct bio *bi_next; /* request queue link */
66 struct block_device *bi_bdev;
67 unsigned long bi_flags; /* status, command, etc */
68 unsigned long bi_rw; /* bottom bits READ/WRITE,
69 * top bits priority
70 */
71
72 unsigned short bi_vcnt; /* how many bio_vec's */
73 unsigned short bi_idx; /* current index into bvl_vec */
74
75 /* Number of segments in this BIO after
76 * physical address coalescing is performed.
77 */
78 unsigned int bi_phys_segments;
79
80 unsigned int bi_size; /* residual I/O count */
81
82 /*
83 * To keep track of the max segment size, we account for the
84 * sizes of the first and last mergeable segments in this bio.
85 */
86 unsigned int bi_seg_front_size;
87 unsigned int bi_seg_back_size;
88
89 unsigned int bi_max_vecs; /* max bvl_vecs we can hold */
90
91 unsigned int bi_comp_cpu; /* completion CPU */
92
93 atomic_t bi_cnt; /* pin count */
94
95 struct bio_vec *bi_io_vec; /* the actual vec list */
96
97 bio_end_io_t *bi_end_io;
98
99 void *bi_private;
100 #if defined(CONFIG_BLK_DEV_INTEGRITY)
101 struct bio_integrity_payload *bi_integrity; /* data integrity */
102 #endif
103
104 bio_destructor_t *bi_destructor; /* destructor */
105
106 /*
107 * We can inline a number of vecs at the end of the bio, to avoid
108 * double allocations for a small number of bio_vecs. This member
109 * MUST obviously be kept at the very end of the bio.
110 */
111 struct bio_vec bi_inline_vecs[0];
112 };
//提取bio的大小,以扇区为单位
213 #define bio_sectors(bio) ((bio)->bi_size >> 9)
392 struct request *elv_rb_find(struct rb_root *root, sector_t sector)
393 {
394 struct rb_node *n = root->rb_node;
395 struct request *rq;
396
397 while (n) {
// 用来获得包含struct rb_node的结构体(struct request)的首地址
398 rq = rb_entry(n, struct request, rb_node);
399
400 if (sector < blk_rq_pos(rq))
401 n = n->rb_left;
402 else if (sector > blk_rq_pos(rq))
403 n = n->rb_right;
404 else
405 return rq;
406 }
407
408 return NULL;
409 }
(3)由于前插入改变了原红黑树节点的值,所以要将节点删除再重新进行插入
156 static void deadline_merged_request(struct request_queue *q,
157 struct request *req, int type)
158 {
159 struct deadline_data *dd = q->elevator->elevator_data;
160
161 /*
162 * if the merge was a front merge, we need to reposition request
163 */
164 if (type == ELEVATOR_FRONT_MERGE) {
// 从rb tree中删除req对应的node
165 elv_rb_del(deadline_rb_root(dd, req), req);
// 将req插入rb tree,重新排序
166 deadline_add_rq_rb(dd, req);
167 }
168 }
169
384 void elv_rb_del(struct rb_root *root, struct request *rq)
385 {
386 BUG_ON(RB_EMPTY_NODE(&rq->rb_node));
387 rb_erase(&rq->rb_node, root);
388 RB_CLEAR_NODE(&rq->rb_node);
389 }
390 EXPORT_SYMBOL(elv_rb_del);
(4)deadline_merged_requests负责bio插入的善后工作
想想刚才举的厨师的例子,厨师把1,3,5号人的请求合并成第一次了,那对应的第3次和第5次就可以不做了。
170 static void
171 deadline_merged_requests(struct request_queue *q, struct request *req,
172 struct request *next)
173 {
174 /*
175 * if next expires before rq, assign its expire time to rq
176 * and move into next position (next will be deleted) in fifo
177 */
178 if (!list_empty(&req->queuelist) && !list_empty(&next->queuelist)) {
179 if (time_before(rq_fifo_time(next), rq_fifo_time(req))) {
// 如果next的超时时间早于req,将req移到next在fifo中的位置之后
180 list_move(&req->queuelist, &next->queuelist);
// 将req的超时时间设置为next的超时时间
181 rq_set_fifo_time(req, rq_fifo_time(next));
182 }
183 }
184
185 /*
186 * kill knowledge of next, this one is a goner
187 */
188 deadline_remove_request(q, next);
189 }
static inline void list_move(struct list_head *list, struct list_head *head)
{
__list_del(list->prev, list->next);
list_add(list, head);
}
115 /*
116 * remove rq from rbtree and fifo.
117 */
118 static void deadline_remove_request(struct request_queue *q, struct request *rq)
119 {
120 struct deadline_data *dd = q->elevator->elevator_data;
121 // 从fifo中移除
122 rq_fifo_clear(rq);
// 从rb tree中移除
123 deadline_del_rq_rb(dd, rq);
124 }
86 static inline void
87 deadline_del_rq_rb(struct deadline_data *dd, struct request *rq)
88 {
89 const int data_dir = rq_data_dir(rq);
90 // 更新next_rq信息
91 if (dd->next_rq[data_dir] == rq)
92 dd->next_rq[data_dir] = deadline_latter_request(rq);
93
94 elv_rb_del(deadline_rb_root(dd, rq), rq);
95 }
96
241 /*
242 * deadline_dispatch_requests selects the best request according to
243 * read/write expire, fifo_batch, etc
244 */
245 static int deadline_dispatch_requests(struct request_queue *q, int force)
246 {
247 struct deadline_data *dd = q->elevator->elevator_data;
248 const int reads = !list_empty(&dd->fifo_list[READ]);
249 const int writes = !list_empty(&dd->fifo_list[WRITE]);
250 struct request *rq;
251 int data_dir;
252
253 /*
254 * batches are currently reads XOR writes
255 */
256 if (dd->next_rq[WRITE])
257 rq = dd->next_rq[WRITE]; // 获取写方向的下一个请求
258 else
259 rq = dd->next_rq[READ]; // 获取读方向的下一个请求
260
261 if (rq && dd->batching < dd->fifo_batch) // 存在同一方向的批量处理请求,且未达到阈值(16),就优先处理批量请求
262 /* we have a next request are still entitled to batch */
263 goto dispatch_request;
264
265 /*
266 * at this point we are not running a batch. select the appropriate
267 * data direction (read / write)
268 */
269
270 if (reads) {
271 BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[READ]));
272 // 如果有写请求,并且写请求饥饿已经超过阈值,则先处理写请求
273 if (writes && (dd->starved++ >= dd->writes_starved))
274 goto dispatch_writes;
275
276 data_dir = READ;
277
278 goto dispatch_find_request;
279 }
280
281 /*
282 * there are either no reads or writes have been starved
283 */
284
285 if (writes) {
286 dispatch_writes:
287 BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[WRITE]));
288
289 dd->starved = 0;
290
291 data_dir = WRITE;
292
293 goto dispatch_find_request;
294 }
295
296 return 0;
297
298 dispatch_find_request:
299 /* //我们不是处理批量请求,则选择数据方向上最优的请求
300 * we are not running a batch, find best request for selected data_dir
301 */
//该请求方向的FIFO队头请求已经超时(到期),或不存在可继续批量处理的请求,则从FIFO队列头取超时时间最早的请求
302 if (deadline_check_fifo(dd, data_dir) || !dd->next_rq[data_dir]) {
303 /*
304 * A deadline has expired, the last request was in the other
305 * direction, or we have run out of higher-sectored requests.
306 * Start again from the request with the earliest expiry time.
307 */
308 rq = rq_entry_fifo(dd->fifo_list[data_dir].next);
309 } else {
310 /* //否则,按照扇区顺序处理请求
311 * The last req was the same dir and we have a next request in
312 * sort order. No expired requests so continue on from here.
313 */
314 rq = dd->next_rq[data_dir];
315 }
316
317 dd->batching = 0;
318
319 dispatch_request:
320 /*
321 * rq is the selected appropriate request.
322 */
323 dd->batching++;
// 移动一个请求到请求队列的派发队列
324 deadline_move_request(dd, rq);
325
326 return 1;
327 }
224 /*
225 * deadline_check_fifo returns 0 if there are no expired requests on the fifo,
226 * 1 otherwise. Requires !list_empty(&dd->fifo_list[data_dir])
227 */
228 static inline int deadline_check_fifo(struct deadline_data *dd, int ddir)
229 {
230 struct request *rq = rq_entry_fifo(dd->fifo_list[ddir].next);
231
232 /*
233 * rq is expired!
234 */
235 if (time_after(jiffies, rq_fifo_time(rq)))
236 return 1;
237
238 return 0;
239 }
// 获取前一个请求
1104 struct request *elv_rb_former_request(struct request_queue *q,
1105 struct request *rq)
1106 {
1107 struct rb_node *rbprev = rb_prev(&rq->rb_node);
1108
1109 if (rbprev)
1110 return rb_entry_rq(rbprev);
1111
1112 return NULL;
1113 }
1114 EXPORT_SYMBOL(elv_rb_former_request);
1115
// 获取后一个请求
1116 struct request *elv_rb_latter_request(struct request_queue *q,
1117 struct request *rq)
1118 {
1119 struct rb_node *rbnext = rb_next(&rq->rb_node);
1120
1121 if (rbnext)
1122 return rb_entry_rq(rbnext);
1123
1124 return NULL;
1125 }
原理:
Deadline调度器对一个请求的多方面特性进行权衡来进行调度,以期既能满足块设备扇区的顺序访问,又兼顾到一个请求不会在队列中等待太久导致饿死。
Deadline调度器为了兼顾这两个方面,通过红黑树来对请求按起始扇区序号进行排序,称为sort_list;通过fifo对请求按它们的生成时间进行排序,称为fifo_list。
batching:
每当确定了一个传输方向(读或写),那么将会从相应的sort_list中将一批连续请求dispatch到requst_queue的请求队列里,具体的数目由fifo_batch(default:16)来确定。
扇区:
磁盘的每一面被分为很多条磁道,即表面上的一些同心圆,越接近中心,圆就越小。而每一个磁道又按512个字节为单位划分为等分,叫做扇区.
总体来讲,deadline算法对request进行了优先权控制调度,主要表现在如下几个方面:
1)读写请求分离,读请求具有高优先调度权,除非写请求即将被饿死的时候,才会去调度处理写请求。这种处理可以保证读请求的延迟时间最小化。
2)对请求的顺序批量处理。对那些地址临近的顺序化请求,deadline给予了高优先级处理权。
例如一个写请求得到调度后,其临近的request会在紧接着的调度过程中被处理掉。这种顺序批量处理的方法可以最大程度的减少磁盘抖动。
3)保证每个请求的延迟时间。每个请求都赋予了一个最大延迟时间,如果达到延迟时间的上限,那么这个请求就会被提前处理掉,
此时,会破坏磁盘访问的顺序化特征,会影响性能,但是,保证了每个请求的最大延迟时间。
数据结构:
struct deadline_data {
/*
* run time data
*/
/*
* requests (deadline_rq s) are present on both sort_list and fifo_list
*/
struct rb_root sort_list[2]; //按照扇区排列的读写红黑树
struct list_head fifo_list[2]; //按照到期时间排列的先进先出的读写FIFO
/*
* next in sort order. read, write or both are NULL
*/
struct request *next_rq[2]; //记录批量处理请求的下一个请求
unsigned int batching; /* number of sequential requests made */
sector_t last_sector; /* head position */
unsigned int starved; /* times reads have starved writes */ //写饥饿的次数
/*
* settings that change how the i/o scheduler behaves
*/
int fifo_expire[2]; //读写超时时间,默认读是500ms,写是5s(5000ms)
int fifo_batch; //批量处理请求的数量限制,默认是16
int writes_starved; //写饥饿的最大限制,默认是2
int front_merges;
};
static struct elevator_type iosched_deadline = {
.ops = {
.elevator_merge_fn = deadline_merge, //检查是否可以向前合并
.elevator_merged_fn = deadline_merged_request, //合并bio到请求后的处理,向前合并后,将请求从红黑树中删除,重新插入
.elevator_merge_req_fn = deadline_merged_requests, //请求之间合并后的处理,根据请求的到期时间修改,并从deadline调度器中删除next请求
.elevator_dispatch_fn = deadline_dispatch_requests, //派发请求到请求队列的派发队列
.elevator_add_req_fn = deadline_add_request, //添加请求rq到红黑树和FIFO队列
.elevator_queue_empty_fn = deadline_queue_empty, //判断调度队列是否为空
.elevator_former_req_fn = elv_rb_former_request, //获取前一个请求
.elevator_latter_req_fn = elv_rb_latter_request, //获取后一个请求
.elevator_init_fn = deadline_init_queue, //初始化调度器的私有数据
.elevator_exit_fn = deadline_exit_queue, //释放调度器的私有数据
},
.elevator_attrs = deadline_attrs,
.elevator_name = "deadline",
.elevator_owner = THIS_MODULE,
};
include/linux/blkdev.h
#define rq_data_dir(rq) ((rq)->cmd_flags & 1)
include/linux/rbtree.h:132:#define RB_ROOT (struct rb_root) { NULL, }
(1)deadline_add_request
97 /*
98 * add rq to rbtree and fifo
99 */
100 static void
101 deadline_add_request(struct request_queue *q, struct request *rq)
102 {
103 struct deadline_data *dd = q->elevator->elevator_data;
// 请求的传送方向
104 const int data_dir = rq_data_dir(rq);
105
106 deadline_add_rq_rb(dd, rq);
107
108 /*
109 * set expire time and add to fifo list
110 */
// 设置超时时间,这个请求在超时时间(jiffies + dd->fifo_expire[data_dir])必须得到响应
111 rq_set_fifo_time(rq, jiffies + dd->fifo_expire[data_dir]);
112 list_add_tail(&rq->queuelist, &dd->fifo_list[data_dir]);
113 }
全局变量jiffies用来记录自系统启动以来产生的节拍的总数。系统运行时间以秒为单位,等于jiffies/HZ(HZ为内核配置的每秒节拍数)。
linux/blkdev.h
#define rq_data_dir(rq) ((rq)->cmd_flags & 1)
这个宏从请求中抽取传送的方向; 0 返回值表示从设备中读, 1 返回值表示写入设备.
76 static void
77 deadline_add_rq_rb(struct deadline_data *dd, struct request *rq)
78 {
// 决定request插入哪棵树上
79 struct rb_root *root = deadline_rb_root(dd, rq);
80 struct request *__alias;
81 // rb tree如果存在相等扇区的request, 则删除old request, 否则将request加入rb tree
82 while (unlikely(__alias = elv_rb_add(root, rq)))
83 deadline_move_request(dd, __alias);
84 }
56 static inline struct rb_root *
57 deadline_rb_root(struct deadline_data *dd, struct request *rq)
58 {
59 return &dd->sort_list[rq_data_dir(rq)];
60 }
360 struct request *elv_rb_add(struct rb_root *root, struct request *rq)
361 {
362 struct rb_node **p = &root->rb_node;
363 struct rb_node *parent = NULL;
364 struct request *__rq;
365 // 查找与rq在同一扇区的request,并返回old request指针
366 while (*p) {
367 parent = *p;
// 用来获得包含struct rb_node的结构体(struct request)的首地址
368 __rq = rb_entry(parent, struct request, rb_node);
369 // 比较rq的扇区号与__rq的扇区号
370 if (blk_rq_pos(rq) < blk_rq_pos(__rq))
371 p = &(*p)->rb_left;
372 else if (blk_rq_pos(rq) > blk_rq_pos(__rq))
373 p = &(*p)->rb_right;
374 else
375 return __rq;
376 }
377 //如果没找到,则将rq插入红黑树中
378 rb_link_node(&rq->rb_node, parent, p);
379 rb_insert_color(&rq->rb_node, root);
380 return NULL;
381 }
382 EXPORT_SYMBOL(elv_rb_add);
#define rb_entry(ptr, type, member) container_of(ptr, type, member) //用来获得包含struct rb_node的结构体的首地址
获取请求操作的起始扇区(sector cursor)
852 static inline sector_t blk_rq_pos(const struct request *rq)
853 {
854 return rq->__sector;
855 }
206 static void
207 deadline_move_request(struct deadline_data *dd, struct request *rq)
208 {
209 const int data_dir = rq_data_dir(rq);
210
211 dd->next_rq[READ] = NULL;
212 dd->next_rq[WRITE] = NULL;
213 dd->next_rq[data_dir] = deadline_latter_request(rq);
214
215 dd->last_sector = rq_end_sector(rq);
216
217 /*
218 * take it off the sort and fifo list, move
219 * to dispatch queue
220 */
221 deadline_move_to_dispatch(dd, rq);
222 }
191 /*
192 * move request from sort list to dispatch queue.
193 */
194 static inline void
195 deadline_move_to_dispatch(struct deadline_data *dd, struct request *rq)
196 {
197 struct request_queue *q = rq->q;
198 // remove rq from rbtree and fifo
199 deadline_remove_request(q, rq);
200 elv_dispatch_add_tail(q, rq);
201 }
456 /*
457 * Insert rq into dispatch queue of q. Queue lock must be held on
458 * entry. rq is added to the back of the dispatch queue. To be used by
459 * specific elevators.
460 */
461 void elv_dispatch_add_tail(struct request_queue *q, struct request *rq)
462 {
463 if (q->last_merge == rq)
464 q->last_merge = NULL;
465
466 elv_rqhash_del(q, rq);
467
468 q->nr_sorted--;
469
470 q->end_sector = rq_end_sector(rq);
471 q->boundary_rq = rq;
472 list_add_tail(&rq->queuelist, &q->queue_head);
473 }
474 EXPORT_SYMBOL(elv_dispatch_add_tail);
475
./block/elevator.c
309 static inline void __elv_rqhash_del(struct request *rq)
310 {
311 hlist_del_init(&rq->hash);
312 }
313
./include/linux/list.h
583 static inline void hlist_del_init(struct hlist_node *n)
584 {
585 if (!hlist_unhashed(n)) {
586 __hlist_del(n);
587 INIT_HLIST_NODE(n);
588 }
589 }
struct hlist_head{
struct hlist_node *first;
}
struct hlist_node {
struct hlist_node *next,**pprev;
}
pprev指向前一个节点(可能是表头)中的next(对于表头则是first)指针
567 static inline void __hlist_del(struct hlist_node *n)
568 {
569 struct hlist_node *next = n->next; // 获取next节点的指针
570 struct hlist_node **pprev = n->pprev;// 保留n的pprev域(即头结点first域的地址),此时 pprev = &first
/**
* 由于pprev = &first,故*pprev = next,相当于 first = next
* 即将hash桶的头结点指针指向原来的第二个结点
*/
571 *pprev = next;
572 if (next)
573 next->pprev = pprev; // 将n的下个结点的pprev域设置为头结点first域的地址
574 }
(2)为什么需要merge request?
试想一下,你们在学校食堂排队吃饭时,第1,3,5号人说他要水煮牛肉,而第2,4,6号人说他要鱼香肉丝,
那聪明的厨师会一次做3人量(合并)的水煮牛肉先给1,3,5,
再一次性地做3人量的鱼香肉丝分别给2,4,6号人,并且大家对这种"插队"没有异议。
这回厨师可高兴:"6个人只做两次"。
试想一下,不管是对于机械硬盘也好,还是以闪存为基础的ssd,u盘,sd卡也好,将请求合并,意味着和这些东东上的控制器打交道的次数少了。
传输变成更大块的连续传输(bulk),性能自然就提高了。尤其是对u盘和sd卡,因为它们的控制器太慢了。
有人可能会觉得ssd不需要,虽然ssd的4k随机性能很好,但是再好也被它的连续性能完爆。机械硬盘更是如此。所以这种合并是合理的
deadline_merge,此函数只是判断bio是否满足front merge条件:如果满足,返回 ELEVATOR_FRONT_MERGE 并通过*req带回可合并的请求;不满足,返回 ELEVATOR_NO_MERGE
126 static int
127 deadline_merge(struct request_queue *q, struct request **req, struct bio *bio)
128 {
129 struct deadline_data *dd = q->elevator->elevator_data;
130 struct request *__rq;
131 int ret;
132
133 /*
134 * check for front merge
135 */
136 if (dd->front_merges) {
// 得到bio请求的结束扇区地址
137 sector_t sector = bio->bi_sector + bio_sectors(bio);
138 //在同方向(读或写)红黑树中查找请求(rb tree中)的起始扇区地址与带合并请求(bio)的结束扇区地址相同的请求。两个同方向的请求,只有地址(sector)连续, 才考虑merge
139 __rq = elv_rb_find(&dd->sort_list[bio_data_dir(bio)], sector);
140 if (__rq) {
141 BUG_ON(sector != blk_rq_pos(__rq));
142 // 判断是否满足merge条件
143 if (elv_rq_merge_ok(__rq, bio)) {
144 ret = ELEVATOR_FRONT_MERGE;
145 goto out;
146 }
147 }
148 }
149
150 return ELEVATOR_NO_MERGE;
151 out:
// 更新为即将合并后的request
152 *req = __rq;
153 return ret;
154 }
linux/bio.h
62 struct bio {
63 sector_t bi_sector; /* device address in 512 byte
64 sectors */
65 struct bio *bi_next; /* request queue link */
66 struct block_device *bi_bdev;
67 unsigned long bi_flags; /* status, command, etc */
68 unsigned long bi_rw; /* bottom bits READ/WRITE,
69 * top bits priority
70 */
71
72 unsigned short bi_vcnt; /* how many bio_vec's */
73 unsigned short bi_idx; /* current index into bvl_vec */
74
75 /* Number of segments in this BIO after
76 * physical address coalescing is performed.
77 */
78 unsigned int bi_phys_segments;
79
80 unsigned int bi_size; /* residual I/O count */
81
82 /*
83 * To keep track of the max segment size, we account for the
84 * sizes of the first and last mergeable segments in this bio.
85 */
86 unsigned int bi_seg_front_size;
87 unsigned int bi_seg_back_size;
88
89 unsigned int bi_max_vecs; /* max bvl_vecs we can hold */
90
91 unsigned int bi_comp_cpu; /* completion CPU */
92
93 atomic_t bi_cnt; /* pin count */
94
95 struct bio_vec *bi_io_vec; /* the actual vec list */
96
97 bio_end_io_t *bi_end_io;
98
99 void *bi_private;
100 #if defined(CONFIG_BLK_DEV_INTEGRITY)
101 struct bio_integrity_payload *bi_integrity; /* data integrity */
102 #endif
103
104 bio_destructor_t *bi_destructor; /* destructor */
105
106 /*
107 * We can inline a number of vecs at the end of the bio, to avoid
108 * double allocations for a small number of bio_vecs. This member
109 * MUST obviously be kept at the very end of the bio.
110 */
111 struct bio_vec bi_inline_vecs[0];
112 };
//提取bio的大小,以扇区为单位
213 #define bio_sectors(bio) ((bio)->bi_size >> 9)
392 struct request *elv_rb_find(struct rb_root *root, sector_t sector)
393 {
394 struct rb_node *n = root->rb_node;
395 struct request *rq;
396
397 while (n) {
// 用来获得包含struct rb_node的结构体(struct request)的首地址
398 rq = rb_entry(n, struct request, rb_node);
399
400 if (sector < blk_rq_pos(rq))
401 n = n->rb_left;
402 else if (sector > blk_rq_pos(rq))
403 n = n->rb_right;
404 else
405 return rq;
406 }
407
408 return NULL;
409 }
(3)由于前插入改变了原红黑树节点的值,所以要将节点删除再重新进行插入
156 static void deadline_merged_request(struct request_queue *q,
157 struct request *req, int type)
158 {
159 struct deadline_data *dd = q->elevator->elevator_data;
160
161 /*
162 * if the merge was a front merge, we need to reposition request
163 */
164 if (type == ELEVATOR_FRONT_MERGE) {
// 从rb tree中删除req对应的node
165 elv_rb_del(deadline_rb_root(dd, req), req);
// 将req插入rb tree,重新排序
166 deadline_add_rq_rb(dd, req);
167 }
168 }
169
384 void elv_rb_del(struct rb_root *root, struct request *rq)
385 {
386 BUG_ON(RB_EMPTY_NODE(&rq->rb_node));
387 rb_erase(&rq->rb_node, root);
388 RB_CLEAR_NODE(&rq->rb_node);
389 }
390 EXPORT_SYMBOL(elv_rb_del);
(4)deadline_merged_requests负责bio插入的善后工作
想想刚才举的厨师的例子,厨师把1,3,5号人的请求合并成第一次了,那对应的第3次和第5次就可以不做了。
170 static void
171 deadline_merged_requests(struct request_queue *q, struct request *req,
172 struct request *next)
173 {
174 /*
175 * if next expires before rq, assign its expire time to rq
176 * and move into next position (next will be deleted) in fifo
177 */
178 if (!list_empty(&req->queuelist) && !list_empty(&next->queuelist)) {
179 if (time_before(rq_fifo_time(next), rq_fifo_time(req))) {
// 如果next的超时时间早于req,将req移到next在fifo中的位置之后
180 list_move(&req->queuelist, &next->queuelist);
// 将req的超时时间设置为next的超时时间
181 rq_set_fifo_time(req, rq_fifo_time(next));
182 }
183 }
184
185 /*
186 * kill knowledge of next, this one is a goner
187 */
188 deadline_remove_request(q, next);
189 }
static inline void list_move(struct list_head *list, struct list_head *head)
{
__list_del(list->prev, list->next);
list_add(list, head);
}
115 /*
116 * remove rq from rbtree and fifo.
117 */
118 static void deadline_remove_request(struct request_queue *q, struct request *rq)
119 {
120 struct deadline_data *dd = q->elevator->elevator_data;
121 // 从fifo中移除
122 rq_fifo_clear(rq);
// 从rb tree中移除
123 deadline_del_rq_rb(dd, rq);
124 }
86 static inline void
87 deadline_del_rq_rb(struct deadline_data *dd, struct request *rq)
88 {
89 const int data_dir = rq_data_dir(rq);
90 // 更新next_rq信息
91 if (dd->next_rq[data_dir] == rq)
92 dd->next_rq[data_dir] = deadline_latter_request(rq);
93
94 elv_rb_del(deadline_rb_root(dd, rq), rq);
95 }
96
241 /*
242 * deadline_dispatch_requests selects the best request according to
243 * read/write expire, fifo_batch, etc
244 */
245 static int deadline_dispatch_requests(struct request_queue *q, int force)
246 {
247 struct deadline_data *dd = q->elevator->elevator_data;
248 const int reads = !list_empty(&dd->fifo_list[READ]);
249 const int writes = !list_empty(&dd->fifo_list[WRITE]);
250 struct request *rq;
251 int data_dir;
252
253 /*
254 * batches are currently reads XOR writes
255 */
256 if (dd->next_rq[WRITE])
257 rq = dd->next_rq[WRITE]; // 获取写方向的下一个请求
258 else
259 rq = dd->next_rq[READ]; // 获取读方向的下一个请求
260
261 if (rq && dd->batching < dd->fifo_batch) // 存在同一方向的批量处理请求,且未达到阈值(16),就优先处理批量请求
262 /* we have a next request are still entitled to batch */
263 goto dispatch_request;
264
265 /*
266 * at this point we are not running a batch. select the appropriate
267 * data direction (read / write)
268 */
269
270 if (reads) {
271 BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[READ]));
272 // 如果有写请求,并且写请求饥饿已经超过阈值,则先处理写请求
273 if (writes && (dd->starved++ >= dd->writes_starved))
274 goto dispatch_writes;
275
276 data_dir = READ;
277
278 goto dispatch_find_request;
279 }
280
281 /*
282 * there are either no reads or writes have been starved
283 */
284
285 if (writes) {
286 dispatch_writes:
287 BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[WRITE]));
288
289 dd->starved = 0;
290
291 data_dir = WRITE;
292
293 goto dispatch_find_request;
294 }
295
296 return 0;
297
298 dispatch_find_request:
299 /* //我们不是处理批量请求,则选择数据方向上最优的请求
300 * we are not running a batch, find best request for selected data_dir
301 */
//该请求方向的FIFO队头请求已经超时(到期),或不存在可继续批量处理的请求,则从FIFO队列头取超时时间最早的请求
302 if (deadline_check_fifo(dd, data_dir) || !dd->next_rq[data_dir]) {
303 /*
304 * A deadline has expired, the last request was in the other
305 * direction, or we have run out of higher-sectored requests.
306 * Start again from the request with the earliest expiry time.
307 */
308 rq = rq_entry_fifo(dd->fifo_list[data_dir].next);
309 } else {
310 /* //否则,按照扇区顺序处理请求
311 * The last req was the same dir and we have a next request in
312 * sort order. No expired requests so continue on from here.
313 */
314 rq = dd->next_rq[data_dir];
315 }
316
317 dd->batching = 0;
318
319 dispatch_request:
320 /*
321 * rq is the selected appropriate request.
322 */
323 dd->batching++;
// 移动一个请求到请求队列的派发队列
324 deadline_move_request(dd, rq);
325
326 return 1;
327 }
224 /*
225 * deadline_check_fifo returns 0 if there are no expired requests on the fifo,
226 * 1 otherwise. Requires !list_empty(&dd->fifo_list[data_dir])
227 */
228 static inline int deadline_check_fifo(struct deadline_data *dd, int ddir)
229 {
230 struct request *rq = rq_entry_fifo(dd->fifo_list[ddir].next);
231
232 /*
233 * rq is expired!
234 */
235 if (time_after(jiffies, rq_fifo_time(rq)))
236 return 1;
237
238 return 0;
239 }
// 获取前一个请求
1104 struct request *elv_rb_former_request(struct request_queue *q,
1105 struct request *rq)
1106 {
1107 struct rb_node *rbprev = rb_prev(&rq->rb_node);
1108
1109 if (rbprev)
1110 return rb_entry_rq(rbprev);
1111
1112 return NULL;
1113 }
1114 EXPORT_SYMBOL(elv_rb_former_request);
1115
// 获取后一个请求
1116 struct request *elv_rb_latter_request(struct request_queue *q,
1117 struct request *rq)
1118 {
1119 struct rb_node *rbnext = rb_next(&rq->rb_node);
1120
1121 if (rbnext)
1122 return rb_entry_rq(rbnext);
1123
1124 return NULL;
1125 }