LKML Archive on lore.kernel.org
help / color / mirror / Atom feed
* [PATCH 1/6 v4] cfq-iosched: Introduce cfq_entity for CFQ queue
       [not found] <4D51ED26.8050809@cn.fujitsu.com>
@ 2011-02-10  7:46 ` Gui Jianfeng
  2011-02-10  7:47 ` [PATCH 2/6 v4] cfq-iosched: Introduce cfq_entity for CFQ group Gui Jianfeng
                   ` (4 subsequent siblings)
  5 siblings, 0 replies; 40+ messages in thread
From: Gui Jianfeng @ 2011-02-10  7:46 UTC (permalink / raw)
  To: Vivek Goyal, Jens Axboe
  Cc: Gui Jianfeng, Shaohua Li, lkml, Chad Talbott, Divyesh Shah

Introduce cfq_entity for CFQ queue

Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
---
 block/cfq-iosched.c |  118 ++++++++++++++++++++++++++++++++------------------
 1 files changed, 75 insertions(+), 43 deletions(-)

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index f083bda..939c6a6 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -92,19 +92,29 @@ struct cfq_rb_root {
 			.count = 0, .min_vdisktime = 0, }
 
 /*
+ * This is the CFQ queue scheduling entity, which is scheduled on a service tree.
+ */
+struct cfq_entity {
+	/* service tree */
+	struct cfq_rb_root *service_tree;
+	/* service_tree member */
+	struct rb_node rb_node;
+	/* service_tree key, represent the position on the tree */
+	unsigned long rb_key;
+};
+
+/*
  * Per process-grouping structure
  */
 struct cfq_queue {
+	/* The schedule entity */
+	struct cfq_entity cfqe;
 	/* reference count */
 	int ref;
 	/* various state flags, see below */
 	unsigned int flags;
 	/* parent cfq_data */
 	struct cfq_data *cfqd;
-	/* service_tree member */
-	struct rb_node rb_node;
-	/* service_tree key */
-	unsigned long rb_key;
 	/* prio tree member */
 	struct rb_node p_node;
 	/* prio tree root we belong to, if any */
@@ -143,7 +153,6 @@ struct cfq_queue {
 	u32 seek_history;
 	sector_t last_request_pos;
 
-	struct cfq_rb_root *service_tree;
 	struct cfq_queue *new_cfqq;
 	struct cfq_group *cfqg;
 	struct cfq_group *orig_cfqg;
@@ -302,6 +311,15 @@ struct cfq_data {
 	struct rcu_head rcu;
 };
 
+static inline struct cfq_queue *
+cfqq_of_entity(struct cfq_entity *cfqe)
+{
+	if (cfqe)
+		return container_of(cfqe, struct cfq_queue, cfqe);
+
+	return NULL;
+}
+
 static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);
 
 static struct cfq_rb_root *service_tree_for(struct cfq_group *cfqg,
@@ -743,7 +761,7 @@ cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2,
 /*
  * The below is leftmost cache rbtree addon
  */
-static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root)
+static struct cfq_entity *cfq_rb_first(struct cfq_rb_root *root)
 {
 	/* Service tree is empty */
 	if (!root->count)
@@ -753,7 +771,7 @@ static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root)
 		root->left = rb_first(&root->rb);
 
 	if (root->left)
-		return rb_entry(root->left, struct cfq_queue, rb_node);
+		return rb_entry(root->left, struct cfq_entity, rb_node);
 
 	return NULL;
 }
@@ -1171,21 +1189,24 @@ static inline void cfq_put_cfqg(struct cfq_group *cfqg) {}
 static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 				 bool add_front)
 {
+	struct cfq_entity *cfqe;
 	struct rb_node **p, *parent;
-	struct cfq_queue *__cfqq;
+	struct cfq_entity *__cfqe;
 	unsigned long rb_key;
 	struct cfq_rb_root *service_tree;
 	int left;
 	int new_cfqq = 1;
 	int group_changed = 0;
 
+	cfqe = &cfqq->cfqe;
+
 #ifdef CONFIG_CFQ_GROUP_IOSCHED
 	if (!cfqd->cfq_group_isolation
 	    && cfqq_type(cfqq) == SYNC_NOIDLE_WORKLOAD
 	    && cfqq->cfqg && cfqq->cfqg != &cfqd->root_group) {
 		/* Move this cfq to root group */
 		cfq_log_cfqq(cfqd, cfqq, "moving to root group");
-		if (!RB_EMPTY_NODE(&cfqq->rb_node))
+		if (!RB_EMPTY_NODE(&cfqe->rb_node))
 			cfq_group_service_tree_del(cfqd, cfqq->cfqg);
 		cfqq->orig_cfqg = cfqq->cfqg;
 		cfqq->cfqg = &cfqd->root_group;
@@ -1195,7 +1216,7 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 		   && cfqq_type(cfqq) == SYNC_WORKLOAD && cfqq->orig_cfqg) {
 		/* cfqq is sequential now needs to go to its original group */
 		BUG_ON(cfqq->cfqg != &cfqd->root_group);
-		if (!RB_EMPTY_NODE(&cfqq->rb_node))
+		if (!RB_EMPTY_NODE(&cfqe->rb_node))
 			cfq_group_service_tree_del(cfqd, cfqq->cfqg);
 		cfq_put_cfqg(cfqq->cfqg);
 		cfqq->cfqg = cfqq->orig_cfqg;
@@ -1210,9 +1231,9 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 	if (cfq_class_idle(cfqq)) {
 		rb_key = CFQ_IDLE_DELAY;
 		parent = rb_last(&service_tree->rb);
-		if (parent && parent != &cfqq->rb_node) {
-			__cfqq = rb_entry(parent, struct cfq_queue, rb_node);
-			rb_key += __cfqq->rb_key;
+		if (parent && parent != &cfqe->rb_node) {
+			__cfqe = rb_entry(parent, struct cfq_entity, rb_node);
+			rb_key += __cfqe->rb_key;
 		} else
 			rb_key += jiffies;
 	} else if (!add_front) {
@@ -1227,37 +1248,37 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 		cfqq->slice_resid = 0;
 	} else {
 		rb_key = -HZ;
-		__cfqq = cfq_rb_first(service_tree);
-		rb_key += __cfqq ? __cfqq->rb_key : jiffies;
+		__cfqe = cfq_rb_first(service_tree);
+		rb_key += __cfqe ? __cfqe->rb_key : jiffies;
 	}
 
-	if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
+	if (!RB_EMPTY_NODE(&cfqe->rb_node)) {
 		new_cfqq = 0;
 		/*
 		 * same position, nothing more to do
 		 */
-		if (rb_key == cfqq->rb_key &&
-		    cfqq->service_tree == service_tree)
+		if (rb_key == cfqe->rb_key &&
+		    cfqe->service_tree == service_tree)
 			return;
 
-		cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree);
-		cfqq->service_tree = NULL;
+		cfq_rb_erase(&cfqe->rb_node, cfqe->service_tree);
+		cfqe->service_tree = NULL;
 	}
 
 	left = 1;
 	parent = NULL;
-	cfqq->service_tree = service_tree;
+	cfqe->service_tree = service_tree;
 	p = &service_tree->rb.rb_node;
 	while (*p) {
 		struct rb_node **n;
 
 		parent = *p;
-		__cfqq = rb_entry(parent, struct cfq_queue, rb_node);
+		__cfqe = rb_entry(parent, struct cfq_entity, rb_node);
 
 		/*
 		 * sort by key, that represents service time.
 		 */
-		if (time_before(rb_key, __cfqq->rb_key))
+		if (time_before(rb_key, __cfqe->rb_key))
 			n = &(*p)->rb_left;
 		else {
 			n = &(*p)->rb_right;
@@ -1268,11 +1289,11 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 	}
 
 	if (left)
-		service_tree->left = &cfqq->rb_node;
+		service_tree->left = &cfqe->rb_node;
 
-	cfqq->rb_key = rb_key;
-	rb_link_node(&cfqq->rb_node, parent, p);
-	rb_insert_color(&cfqq->rb_node, &service_tree->rb);
+	cfqe->rb_key = rb_key;
+	rb_link_node(&cfqe->rb_node, parent, p);
+	rb_insert_color(&cfqe->rb_node, &service_tree->rb);
 	service_tree->count++;
 	if ((add_front || !new_cfqq) && !group_changed)
 		return;
@@ -1374,13 +1395,16 @@ static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
  */
 static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 {
+	struct cfq_entity *cfqe;
 	cfq_log_cfqq(cfqd, cfqq, "del_from_rr");
 	BUG_ON(!cfq_cfqq_on_rr(cfqq));
 	cfq_clear_cfqq_on_rr(cfqq);
 
-	if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
-		cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree);
-		cfqq->service_tree = NULL;
+	cfqe = &cfqq->cfqe;
+
+	if (!RB_EMPTY_NODE(&cfqe->rb_node)) {
+		cfq_rb_erase(&cfqe->rb_node, cfqe->service_tree);
+		cfqe->service_tree = NULL;
 	}
 	if (cfqq->p_root) {
 		rb_erase(&cfqq->p_node, cfqq->p_root);
@@ -1708,13 +1732,13 @@ static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
 		return NULL;
 	if (RB_EMPTY_ROOT(&service_tree->rb))
 		return NULL;
-	return cfq_rb_first(service_tree);
+	return cfqq_of_entity(cfq_rb_first(service_tree));
 }
 
 static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd)
 {
 	struct cfq_group *cfqg;
-	struct cfq_queue *cfqq;
+	struct cfq_entity *cfqe;
 	int i, j;
 	struct cfq_rb_root *st;
 
@@ -1725,9 +1749,11 @@ static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd)
 	if (!cfqg)
 		return NULL;
 
-	for_each_cfqg_st(cfqg, i, j, st)
-		if ((cfqq = cfq_rb_first(st)) != NULL)
-			return cfqq;
+	for_each_cfqg_st(cfqg, i, j, st) {
+		cfqe = cfq_rb_first(st);
+		if (cfqe != NULL)
+			return cfqq_of_entity(cfqe);
+	}
 	return NULL;
 }
 
@@ -1864,9 +1890,12 @@ static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd,
 
 static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 {
+	struct cfq_entity *cfqe;
 	enum wl_prio_t prio = cfqq_prio(cfqq);
-	struct cfq_rb_root *service_tree = cfqq->service_tree;
+	struct cfq_rb_root *service_tree;
 
+	cfqe = &cfqq->cfqe;
+	service_tree = cfqe->service_tree;
 	BUG_ON(!service_tree);
 	BUG_ON(!service_tree->count);
 
@@ -2076,7 +2105,7 @@ static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq)
 static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
 				struct cfq_group *cfqg, enum wl_prio_t prio)
 {
-	struct cfq_queue *queue;
+	struct cfq_entity *cfqe;
 	int i;
 	bool key_valid = false;
 	unsigned long lowest_key = 0;
@@ -2084,10 +2113,10 @@ static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
 
 	for (i = 0; i <= SYNC_WORKLOAD; ++i) {
 		/* select the one with lowest rb_key */
-		queue = cfq_rb_first(service_tree_for(cfqg, prio, i));
-		if (queue &&
-		    (!key_valid || time_before(queue->rb_key, lowest_key))) {
-			lowest_key = queue->rb_key;
+		cfqe = cfq_rb_first(service_tree_for(cfqg, prio, i));
+		if (cfqe &&
+		    (!key_valid || time_before(cfqe->rb_key, lowest_key))) {
+			lowest_key = cfqe->rb_key;
 			cur_best = i;
 			key_valid = true;
 		}
@@ -2835,7 +2864,10 @@ static void cfq_ioc_set_ioprio(struct io_context *ioc)
 static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 			  pid_t pid, bool is_sync)
 {
-	RB_CLEAR_NODE(&cfqq->rb_node);
+	struct cfq_entity *cfqe;
+
+	cfqe = &cfqq->cfqe;
+	RB_CLEAR_NODE(&cfqe->rb_node);
 	RB_CLEAR_NODE(&cfqq->p_node);
 	INIT_LIST_HEAD(&cfqq->fifo);
 
@@ -3244,7 +3276,7 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
 	/* Allow preemption only if we are idling on sync-noidle tree */
 	if (cfqd->serving_type == SYNC_NOIDLE_WORKLOAD &&
 	    cfqq_type(new_cfqq) == SYNC_NOIDLE_WORKLOAD &&
-	    new_cfqq->service_tree->count == 2 &&
+	    new_cfqq->cfqe.service_tree->count == 2 &&
 	    RB_EMPTY_ROOT(&cfqq->sort_list))
 		return true;
 
-- 
1.7.1



^ permalink raw reply	[flat|nested] 40+ messages in thread

* [PATCH 2/6 v4] cfq-iosched: Introduce cfq_entity for CFQ group
       [not found] <4D51ED26.8050809@cn.fujitsu.com>
  2011-02-10  7:46 ` [PATCH 1/6 v4] cfq-iosched: Introduce cfq_entity for CFQ queue Gui Jianfeng
@ 2011-02-10  7:47 ` Gui Jianfeng
  2011-02-10  7:47 ` [PATCH 3/6 v4] cfq-iosched: Introduce vdisktime and io weight for CFQ queue Gui Jianfeng
                   ` (3 subsequent siblings)
  5 siblings, 0 replies; 40+ messages in thread
From: Gui Jianfeng @ 2011-02-10  7:47 UTC (permalink / raw)
  To: Vivek Goyal, Jens Axboe
  Cc: Gui Jianfeng, Shaohua Li, lkml, Chad Talbott, Divyesh Shah

Introduce cfq_entity for CFQ group

Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
---
 block/cfq-iosched.c |  111 +++++++++++++++++++++++++++++++--------------------
 1 files changed, 67 insertions(+), 44 deletions(-)

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 939c6a6..f3a126e 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -73,7 +73,7 @@ static DEFINE_IDA(cic_index_ida);
 #define cfq_class_rt(cfqq)	((cfqq)->ioprio_class == IOPRIO_CLASS_RT)
 
 #define sample_valid(samples)	((samples) > 80)
-#define rb_entry_cfqg(node)	rb_entry((node), struct cfq_group, rb_node)
+#define rb_entry_entity(node)	rb_entry((node), struct cfq_entity, rb_node)
 
 /*
  * Most of our rbtree usage is for sorting with min extraction, so
@@ -101,6 +101,11 @@ struct cfq_entity {
 	struct rb_node rb_node;
 	/* service_tree key, represent the position on the tree */
 	unsigned long rb_key;
+
+	/* group service_tree key */
+	u64 vdisktime;
+	bool is_group_entity;
+	unsigned int weight;
 };
 
 /*
@@ -182,12 +187,8 @@ enum wl_type_t {
 
 /* This is per cgroup per device grouping structure */
 struct cfq_group {
-	/* group service_tree member */
-	struct rb_node rb_node;
-
-	/* group service_tree key */
-	u64 vdisktime;
-	unsigned int weight;
+	/* cfq group sched entity */
+	struct cfq_entity cfqe;
 
 	/* number of cfqq currently on this group */
 	int nr_cfqq;
@@ -314,12 +315,22 @@ struct cfq_data {
 static inline struct cfq_queue *
 cfqq_of_entity(struct cfq_entity *cfqe)
 {
-	if (cfqe)
+	if (cfqe && !cfqe->is_group_entity)
 		return container_of(cfqe, struct cfq_queue, cfqe);
 
 	return NULL;
 }
 
+static inline struct cfq_group *
+cfqg_of_entity(struct cfq_entity *cfqe)
+{
+	if (cfqe && cfqe->is_group_entity)
+		return container_of(cfqe, struct cfq_group, cfqe);
+
+	return NULL;
+}
+
+
 static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);
 
 static struct cfq_rb_root *service_tree_for(struct cfq_group *cfqg,
@@ -547,12 +558,12 @@ cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 	return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio);
 }
 
-static inline u64 cfq_scale_slice(unsigned long delta, struct cfq_group *cfqg)
+static inline u64 cfq_scale_slice(unsigned long delta, struct cfq_entity *cfqe)
 {
 	u64 d = delta << CFQ_SERVICE_SHIFT;
 
 	d = d * BLKIO_WEIGHT_DEFAULT;
-	do_div(d, cfqg->weight);
+	do_div(d, cfqe->weight);
 	return d;
 }
 
@@ -577,11 +588,11 @@ static inline u64 min_vdisktime(u64 min_vdisktime, u64 vdisktime)
 static void update_min_vdisktime(struct cfq_rb_root *st)
 {
 	u64 vdisktime = st->min_vdisktime;
-	struct cfq_group *cfqg;
+	struct cfq_entity *cfqe;
 
 	if (st->left) {
-		cfqg = rb_entry_cfqg(st->left);
-		vdisktime = min_vdisktime(vdisktime, cfqg->vdisktime);
+		cfqe = rb_entry_entity(st->left);
+		vdisktime = min_vdisktime(vdisktime, cfqe->vdisktime);
 	}
 
 	st->min_vdisktime = max_vdisktime(st->min_vdisktime, vdisktime);
@@ -612,8 +623,9 @@ static inline unsigned
 cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg)
 {
 	struct cfq_rb_root *st = &cfqd->grp_service_tree;
+	struct cfq_entity *cfqe = &cfqg->cfqe;
 
-	return cfq_target_latency * cfqg->weight / st->total_weight;
+	return cfq_target_latency * cfqe->weight / st->total_weight;
 }
 
 static inline void
@@ -776,13 +788,13 @@ static struct cfq_entity *cfq_rb_first(struct cfq_rb_root *root)
 	return NULL;
 }
 
-static struct cfq_group *cfq_rb_first_group(struct cfq_rb_root *root)
+static struct cfq_entity *cfq_rb_first_entity(struct cfq_rb_root *root)
 {
 	if (!root->left)
 		root->left = rb_first(&root->rb);
 
 	if (root->left)
-		return rb_entry_cfqg(root->left);
+		return rb_entry_entity(root->left);
 
 	return NULL;
 }
@@ -839,9 +851,9 @@ static unsigned long cfq_slice_offset(struct cfq_data *cfqd,
 }
 
 static inline s64
-cfqg_key(struct cfq_rb_root *st, struct cfq_group *cfqg)
+entity_key(struct cfq_rb_root *st, struct cfq_entity *entity)
 {
-	return cfqg->vdisktime - st->min_vdisktime;
+	return entity->vdisktime - st->min_vdisktime;
 }
 
 static void
@@ -849,15 +861,16 @@ __cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
 {
 	struct rb_node **node = &st->rb.rb_node;
 	struct rb_node *parent = NULL;
-	struct cfq_group *__cfqg;
-	s64 key = cfqg_key(st, cfqg);
+	struct cfq_entity *__cfqe;
+	struct cfq_entity *cfqe = &cfqg->cfqe;
+	s64 key = entity_key(st, cfqe);
 	int left = 1;
 
 	while (*node != NULL) {
 		parent = *node;
-		__cfqg = rb_entry_cfqg(parent);
+		__cfqe = rb_entry_entity(parent);
 
-		if (key < cfqg_key(st, __cfqg))
+		if (key < entity_key(st, __cfqe))
 			node = &parent->rb_left;
 		else {
 			node = &parent->rb_right;
@@ -866,21 +879,22 @@ __cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
 	}
 
 	if (left)
-		st->left = &cfqg->rb_node;
+		st->left = &cfqe->rb_node;
 
-	rb_link_node(&cfqg->rb_node, parent, node);
-	rb_insert_color(&cfqg->rb_node, &st->rb);
+	rb_link_node(&cfqe->rb_node, parent, node);
+	rb_insert_color(&cfqe->rb_node, &st->rb);
 }
 
 static void
 cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
 {
 	struct cfq_rb_root *st = &cfqd->grp_service_tree;
-	struct cfq_group *__cfqg;
+	struct cfq_entity *cfqe = &cfqg->cfqe;
+	struct cfq_entity *__cfqe;
 	struct rb_node *n;
 
 	cfqg->nr_cfqq++;
-	if (!RB_EMPTY_NODE(&cfqg->rb_node))
+	if (!RB_EMPTY_NODE(&cfqe->rb_node))
 		return;
 
 	/*
@@ -890,19 +904,20 @@ cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
 	 */
 	n = rb_last(&st->rb);
 	if (n) {
-		__cfqg = rb_entry_cfqg(n);
-		cfqg->vdisktime = __cfqg->vdisktime + CFQ_IDLE_DELAY;
+		__cfqe = rb_entry_entity(n);
+		cfqe->vdisktime = __cfqe->vdisktime + CFQ_IDLE_DELAY;
 	} else
-		cfqg->vdisktime = st->min_vdisktime;
+		cfqe->vdisktime = st->min_vdisktime;
 
 	__cfq_group_service_tree_add(st, cfqg);
-	st->total_weight += cfqg->weight;
+	st->total_weight += cfqe->weight;
 }
 
 static void
 cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
 {
 	struct cfq_rb_root *st = &cfqd->grp_service_tree;
+	struct cfq_entity *cfqe = &cfqg->cfqe;
 
 	BUG_ON(cfqg->nr_cfqq < 1);
 	cfqg->nr_cfqq--;
@@ -912,9 +927,9 @@ cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
 		return;
 
 	cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
-	st->total_weight -= cfqg->weight;
-	if (!RB_EMPTY_NODE(&cfqg->rb_node))
-		cfq_rb_erase(&cfqg->rb_node, st);
+	st->total_weight -= cfqe->weight;
+	if (!RB_EMPTY_NODE(&cfqe->rb_node))
+		cfq_rb_erase(&cfqe->rb_node, st);
 	cfqg->saved_workload_slice = 0;
 	cfq_blkiocg_update_dequeue_stats(&cfqg->blkg, 1);
 }
@@ -952,6 +967,7 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
 	unsigned int used_sl, charge;
 	int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg)
 			- cfqg->service_tree_idle.count;
+	struct cfq_entity *cfqe = &cfqg->cfqe;
 
 	BUG_ON(nr_sync < 0);
 	used_sl = charge = cfq_cfqq_slice_usage(cfqq);
@@ -962,8 +978,8 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
 		charge = cfqq->allocated_slice;
 
 	/* Can't update vdisktime while group is on service tree */
-	cfq_rb_erase(&cfqg->rb_node, st);
-	cfqg->vdisktime += cfq_scale_slice(charge, cfqg);
+	cfq_rb_erase(&cfqe->rb_node, st);
+	cfqe->vdisktime += cfq_scale_slice(charge, cfqe);
 	__cfq_group_service_tree_add(st, cfqg);
 
 	/* This group is being expired. Save the context */
@@ -975,8 +991,8 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
 	} else
 		cfqg->saved_workload_slice = 0;
 
-	cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime,
-					st->min_vdisktime);
+	cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu",
+		     cfqe->vdisktime, st->min_vdisktime);
 	cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u disp=%u charge=%u iops=%u"
 			" sect=%u", used_sl, cfqq->slice_dispatch, charge,
 			iops_mode(cfqd), cfqq->nr_sectors);
@@ -995,7 +1011,7 @@ static inline struct cfq_group *cfqg_of_blkg(struct blkio_group *blkg)
 void cfq_update_blkio_group_weight(void *key, struct blkio_group *blkg,
 					unsigned int weight)
 {
-	cfqg_of_blkg(blkg)->weight = weight;
+	cfqg_of_blkg(blkg)->cfqe.weight = weight;
 }
 
 static struct cfq_group *
@@ -1024,7 +1040,9 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
 
 	for_each_cfqg_st(cfqg, i, j, st)
 		*st = CFQ_RB_ROOT;
-	RB_CLEAR_NODE(&cfqg->rb_node);
+	RB_CLEAR_NODE(&cfqg->cfqe.rb_node);
+
+	cfqg->cfqe.is_group_entity = true;
 
 	/*
 	 * Take the initial reference that will be released on destroy
@@ -1048,7 +1066,7 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
 		cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
 					0);
 
-	cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
+	cfqg->cfqe.weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
 
 	/* Add group on cfqd list */
 	hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
@@ -2209,10 +2227,13 @@ static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd)
 {
 	struct cfq_rb_root *st = &cfqd->grp_service_tree;
 	struct cfq_group *cfqg;
+	struct cfq_entity *cfqe;
 
 	if (RB_EMPTY_ROOT(&st->rb))
 		return NULL;
-	cfqg = cfq_rb_first_group(st);
+	cfqe = cfq_rb_first_entity(st);
+	cfqg = cfqg_of_entity(cfqe);
+	BUG_ON(!cfqg);
 	update_min_vdisktime(st);
 	return cfqg;
 }
@@ -2871,6 +2892,7 @@ static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 	RB_CLEAR_NODE(&cfqq->p_node);
 	INIT_LIST_HEAD(&cfqq->fifo);
 
+	cfqe->is_group_entity = false;
 	cfqq->ref = 0;
 	cfqq->cfqd = cfqd;
 
@@ -3907,10 +3929,11 @@ static void *cfq_init_queue(struct request_queue *q)
 	cfqg = &cfqd->root_group;
 	for_each_cfqg_st(cfqg, i, j, st)
 		*st = CFQ_RB_ROOT;
-	RB_CLEAR_NODE(&cfqg->rb_node);
+	RB_CLEAR_NODE(&cfqg->cfqe.rb_node);
 
 	/* Give preference to root group over other groups */
-	cfqg->weight = 2*BLKIO_WEIGHT_DEFAULT;
+	cfqg->cfqe.weight = 2*BLKIO_WEIGHT_DEFAULT;
+	cfqg->cfqe.is_group_entity = true;
 
 #ifdef CONFIG_CFQ_GROUP_IOSCHED
 	/*
-- 
1.7.1





^ permalink raw reply	[flat|nested] 40+ messages in thread

* [PATCH 3/6 v4] cfq-iosched: Introduce vdisktime and io weight for CFQ queue
       [not found] <4D51ED26.8050809@cn.fujitsu.com>
  2011-02-10  7:46 ` [PATCH 1/6 v4] cfq-iosched: Introduce cfq_entity for CFQ queue Gui Jianfeng
  2011-02-10  7:47 ` [PATCH 2/6 v4] cfq-iosched: Introduce cfq_entity for CFQ group Gui Jianfeng
@ 2011-02-10  7:47 ` Gui Jianfeng
  2011-02-10 19:29   ` Vivek Goyal
                     ` (2 more replies)
  2011-02-10  7:47 ` [PATCH 4/6 v4] cfq-iosched: Extract some common code of service tree handling for CFQ queue and CFQ group Gui Jianfeng
                   ` (2 subsequent siblings)
  5 siblings, 3 replies; 40+ messages in thread
From: Gui Jianfeng @ 2011-02-10  7:47 UTC (permalink / raw)
  To: Vivek Goyal, Jens Axboe
  Cc: Gui Jianfeng, Shaohua Li, lkml, Chad Talbott, Divyesh Shah

Introduce vdisktime and io weight for CFQ queue scheduling. Currently, io priority
maps to a weight in the range [100,1000]. This patch also gets rid of the
cfq_slice_offset() logic and makes CFQ queues use the same scheduling algorithm as
CFQ groups do. This helps with scheduling CFQ queues and groups on the same service tree.

Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
---
 block/cfq-iosched.c |  219 +++++++++++++++++++++++++++++++++++++++------------
 1 files changed, 167 insertions(+), 52 deletions(-)

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index f3a126e..41cef2e 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -39,6 +39,13 @@ static const int cfq_hist_divisor = 4;
  */
 #define CFQ_IDLE_DELAY		(HZ / 5)
 
+/* 
+ * The base boosting value.
+ */
+#define CFQ_BOOST_SYNC_BASE          (HZ / 10)
+#define CFQ_BOOST_ASYNC_BASE          (HZ / 25)
+
+
 /*
  * below this threshold, we consider thinktime immediate
  */
@@ -99,10 +106,7 @@ struct cfq_entity {
 	struct cfq_rb_root *service_tree;
 	/* service_tree member */
 	struct rb_node rb_node;
-	/* service_tree key, represent the position on the tree */
-	unsigned long rb_key;
-
-	/* group service_tree key */
+	/* service_tree key */
 	u64 vdisktime;
 	bool is_group_entity;
 	unsigned int weight;
@@ -114,6 +118,8 @@ struct cfq_entity {
 struct cfq_queue {
 	/* The schedule entity */
 	struct cfq_entity cfqe;
+	/* Reposition time */
+	unsigned long reposition_time;
 	/* reference count */
 	int ref;
 	/* various state flags, see below */
@@ -312,6 +318,24 @@ struct cfq_data {
 	struct rcu_head rcu;
 };
 
+/*
+ * Map io priority(7 ~ 0) to io weight(100 ~ 1000) as follows
+ *     prio       0    1     2    3    4    5    6     7
+ *     weight  1000  868   740  612  484  356  228   100
+ */
+static inline unsigned int cfq_prio_to_weight(unsigned short ioprio)
+{
+	unsigned int step;
+
+	BUG_ON(ioprio >= IOPRIO_BE_NR);
+
+	step = (BLKIO_WEIGHT_MAX - BLKIO_WEIGHT_MIN) / (IOPRIO_BE_NR - 1);
+	if (ioprio == 0)
+		return BLKIO_WEIGHT_MAX;
+
+	return BLKIO_WEIGHT_MIN + (IOPRIO_BE_NR - ioprio - 1) * step;
+}
+
 static inline struct cfq_queue *
 cfqq_of_entity(struct cfq_entity *cfqe)
 {
@@ -840,16 +864,6 @@ cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 	return cfq_choose_req(cfqd, next, prev, blk_rq_pos(last));
 }
 
-static unsigned long cfq_slice_offset(struct cfq_data *cfqd,
-				      struct cfq_queue *cfqq)
-{
-	/*
-	 * just an approximation, should be ok.
-	 */
-	return (cfqq->cfqg->nr_cfqq - 1) * (cfq_prio_slice(cfqd, 1, 0) -
-		       cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio));
-}
-
 static inline s64
 entity_key(struct cfq_rb_root *st, struct cfq_entity *entity)
 {
@@ -1199,6 +1213,21 @@ static inline void cfq_put_cfqg(struct cfq_group *cfqg) {}
 
 #endif /* GROUP_IOSCHED */
 
+static inline u64 cfq_get_boost(struct cfq_data *cfqd,
+				 struct cfq_queue *cfqq)
+{
+	u64 d;
+
+	if (cfq_cfqq_sync(cfqq))
+		d = CFQ_BOOST_SYNC_BASE << CFQ_SERVICE_SHIFT;
+	else
+		d = CFQ_BOOST_ASYNC_BASE << CFQ_SERVICE_SHIFT;
+
+	d = d * BLKIO_WEIGHT_DEFAULT;
+	do_div(d, cfqq->cfqe.weight);
+	return d;
+}
+
 /*
  * The cfqd->service_trees holds all pending cfq_queue's that have
  * requests waiting to be processed. It is sorted in the order that
@@ -1210,13 +1239,14 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 	struct cfq_entity *cfqe;
 	struct rb_node **p, *parent;
 	struct cfq_entity *__cfqe;
-	unsigned long rb_key;
-	struct cfq_rb_root *service_tree;
+	struct cfq_rb_root *service_tree, *orig_st;
 	int left;
 	int new_cfqq = 1;
 	int group_changed = 0;
+	s64 key;
 
 	cfqe = &cfqq->cfqe;
+	orig_st = cfqe->service_tree;
 
 #ifdef CONFIG_CFQ_GROUP_IOSCHED
 	if (!cfqd->cfq_group_isolation
@@ -1224,8 +1254,15 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 	    && cfqq->cfqg && cfqq->cfqg != &cfqd->root_group) {
 		/* Move this cfq to root group */
 		cfq_log_cfqq(cfqd, cfqq, "moving to root group");
-		if (!RB_EMPTY_NODE(&cfqe->rb_node))
+		if (!RB_EMPTY_NODE(&cfqe->rb_node)) {
 			cfq_group_service_tree_del(cfqd, cfqq->cfqg);
+			/*
+			 * Group changed, dequeue this CFQ queue from the
+			 * original service tree.
+			 */
+			cfq_rb_erase(&cfqe->rb_node, cfqe->service_tree);
+			orig_st->total_weight -= cfqe->weight;
+		}
 		cfqq->orig_cfqg = cfqq->cfqg;
 		cfqq->cfqg = &cfqd->root_group;
 		cfqd->root_group.ref++;
@@ -1234,8 +1271,15 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 		   && cfqq_type(cfqq) == SYNC_WORKLOAD && cfqq->orig_cfqg) {
 		/* cfqq is sequential now needs to go to its original group */
 		BUG_ON(cfqq->cfqg != &cfqd->root_group);
-		if (!RB_EMPTY_NODE(&cfqe->rb_node))
+		if (!RB_EMPTY_NODE(&cfqe->rb_node)) {
 			cfq_group_service_tree_del(cfqd, cfqq->cfqg);
+			/*
+			 * Group changed, dequeue this CFQ queue from the
+			 * original service tree.
+			 */
+			cfq_rb_erase(&cfqe->rb_node, cfqe->service_tree);
+			orig_st->total_weight -= cfqe->weight;
+		}
 		cfq_put_cfqg(cfqq->cfqg);
 		cfqq->cfqg = cfqq->orig_cfqg;
 		cfqq->orig_cfqg = NULL;
@@ -1246,47 +1290,68 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 
 	service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq),
 						cfqq_type(cfqq));
+	if (RB_EMPTY_NODE(&cfqe->rb_node)) {
+		/*
+		 * If this CFQ queue moves to another group, the original
+		 * vdisktime makes no sense any more, reset the vdisktime
+		 * here.
+		 */
+		parent = rb_last(&service_tree->rb);
+		if (parent) {
+			u64 pos_offset;
+
+			/*
+			 * Estimate the position according to its weight and
+			 * ioprio.
+			 */
+			pos_offset = cfq_get_boost(cfqd, cfqq);
+			/* Debug purpose, should remove. */
+			cfq_log_cfqq(cfqd, cfqq, "pos_offset: %llu",
+				     pos_offset);
+			cfqe->vdisktime = service_tree->min_vdisktime +
+						pos_offset;
+		} else
+			cfqe->vdisktime = service_tree->min_vdisktime;
+
+		goto insert;
+	}
+
+	/*
+	 * If we get here, this CFQ queue is already on the service tree, so
+	 * dequeue it first.
+	 */
+	cfq_rb_erase(&cfqe->rb_node, cfqe->service_tree);
+	orig_st->total_weight -= cfqe->weight;
+
+	new_cfqq = 0;
+
 	if (cfq_class_idle(cfqq)) {
-		rb_key = CFQ_IDLE_DELAY;
 		parent = rb_last(&service_tree->rb);
 		if (parent && parent != &cfqe->rb_node) {
 			__cfqe = rb_entry(parent, struct cfq_entity, rb_node);
-			rb_key += __cfqe->rb_key;
+			cfqe->vdisktime = __cfqe->vdisktime + CFQ_IDLE_DELAY;
 		} else
-			rb_key += jiffies;
+			cfqe->vdisktime = service_tree->min_vdisktime;
 	} else if (!add_front) {
 		/*
-		 * Get our rb key offset. Subtract any residual slice
-		 * value carried from last service. A negative resid
-		 * count indicates slice overrun, and this should position
-		 * the next service time further away in the tree.
+		 * We charge the CFQ queue for the time it has run, and
+		 * reposition it on the service tree.
 		 */
-		rb_key = cfq_slice_offset(cfqd, cfqq) + jiffies;
-		rb_key -= cfqq->slice_resid;
+		unsigned int used_sl;
+
+		used_sl = cfq_cfqq_slice_usage(cfqq);
+		cfqe->vdisktime += cfq_scale_slice(used_sl, cfqe);
 		cfqq->slice_resid = 0;
 	} else {
-		rb_key = -HZ;
-		__cfqe = cfq_rb_first(service_tree);
-		rb_key += __cfqe ? __cfqe->rb_key : jiffies;
-	}
-
-	if (!RB_EMPTY_NODE(&cfqe->rb_node)) {
-		new_cfqq = 0;
-		/*
-		 * same position, nothing more to do
-		 */
-		if (rb_key == cfqe->rb_key &&
-		    cfqe->service_tree == service_tree)
-			return;
-
-		cfq_rb_erase(&cfqe->rb_node, cfqe->service_tree);
-		cfqe->service_tree = NULL;
+		cfqe->vdisktime = service_tree->min_vdisktime;
 	}
 
+insert:
 	left = 1;
 	parent = NULL;
 	cfqe->service_tree = service_tree;
 	p = &service_tree->rb.rb_node;
+	key = entity_key(service_tree, cfqe);
 	while (*p) {
 		struct rb_node **n;
 
@@ -1296,7 +1361,7 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 		/*
 		 * sort by key, that represents service time.
 		 */
-		if (time_before(rb_key, __cfqe->rb_key))
+		if (key < entity_key(service_tree, __cfqe))
 			n = &(*p)->rb_left;
 		else {
 			n = &(*p)->rb_right;
@@ -1309,10 +1374,12 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 	if (left)
 		service_tree->left = &cfqe->rb_node;
 
-	cfqe->rb_key = rb_key;
 	rb_link_node(&cfqe->rb_node, parent, p);
 	rb_insert_color(&cfqe->rb_node, &service_tree->rb);
+	update_min_vdisktime(service_tree);
 	service_tree->count++;
+	service_tree->total_weight += cfqe->weight;
+	cfqq->reposition_time = jiffies;
 	if ((add_front || !new_cfqq) && !group_changed)
 		return;
 	cfq_group_service_tree_add(cfqd, cfqq->cfqg);
@@ -1414,14 +1481,18 @@ static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 {
 	struct cfq_entity *cfqe;
+	struct cfq_rb_root *service_tree;
+
 	cfq_log_cfqq(cfqd, cfqq, "del_from_rr");
 	BUG_ON(!cfq_cfqq_on_rr(cfqq));
 	cfq_clear_cfqq_on_rr(cfqq);
 
 	cfqe = &cfqq->cfqe;
+	service_tree = cfqe->service_tree;
 
 	if (!RB_EMPTY_NODE(&cfqe->rb_node)) {
 		cfq_rb_erase(&cfqe->rb_node, cfqe->service_tree);
+		service_tree->total_weight -= cfqe->weight;
 		cfqe->service_tree = NULL;
 	}
 	if (cfqq->p_root) {
@@ -2120,23 +2191,36 @@ static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq)
 	}
 }
 
+/*
+ * The time when a CFQ queue is put onto a service tree is recorded in
+ * cfqq->reposition_time. Currently, we check the first priority CFQ queues
+ * on each service tree, and select the workload type that contains the lowest
+ * reposition_time CFQ queue among them.
+ */
 static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
 				struct cfq_group *cfqg, enum wl_prio_t prio)
 {
 	struct cfq_entity *cfqe;
+	struct cfq_queue *cfqq;
+	unsigned long lowest_start_time;
 	int i;
-	bool key_valid = false;
-	unsigned long lowest_key = 0;
+	bool time_valid = false;
 	enum wl_type_t cur_best = SYNC_NOIDLE_WORKLOAD;
 
+	/*
+	 * TODO: We may take io priority and io class into account when
+	 * choosing a workload type. But for the time being just make use of
+	 * reposition_time only.
+	 */
 	for (i = 0; i <= SYNC_WORKLOAD; ++i) {
-		/* select the one with lowest rb_key */
 		cfqe = cfq_rb_first(service_tree_for(cfqg, prio, i));
-		if (cfqe &&
-		    (!key_valid || time_before(cfqe->rb_key, lowest_key))) {
-			lowest_key = cfqe->rb_key;
+		cfqq = cfqq_of_entity(cfqe);
+		if (cfqe && (!time_valid ||
+			     time_before(cfqq->reposition_time,
+					 lowest_start_time))) {
+			lowest_start_time = cfqq->reposition_time;
 			cur_best = i;
-			key_valid = true;
+			time_valid = true;
 		}
 	}
 
@@ -2808,10 +2892,13 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc)
 {
 	struct task_struct *tsk = current;
 	int ioprio_class;
+	struct cfq_entity *cfqe;
 
 	if (!cfq_cfqq_prio_changed(cfqq))
 		return;
 
+	cfqe = &cfqq->cfqe;
+
 	ioprio_class = IOPRIO_PRIO_CLASS(ioc->ioprio);
 	switch (ioprio_class) {
 	default:
@@ -2838,6 +2925,17 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc)
 		break;
 	}
 
+	if (!RB_EMPTY_NODE(&cfqe->rb_node)) {
+		/*
+		 * If this CFQ entity is already on service tree, we need to
+		 * adjust service tree's total weight accordingly.
+		 */
+		cfqe->service_tree->total_weight -= cfqe->weight;
+		cfqe->weight = cfq_prio_to_weight(cfqq->ioprio);
+		cfqe->service_tree->total_weight += cfqe->weight;
+	} else
+		cfqe->weight = cfq_prio_to_weight(cfqq->ioprio);
+
 	/*
 	 * keep track of original prio settings in case we have to temporarily
 	 * elevate the priority of this queue
@@ -3572,6 +3670,9 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
  */
 static void cfq_prio_boost(struct cfq_queue *cfqq)
 {
+	struct cfq_entity *cfqe;
+
+	cfqe = &cfqq->cfqe;
 	if (has_fs_excl()) {
 		/*
 		 * boost idle prio on transactions that would lock out other
@@ -3588,6 +3689,20 @@ static void cfq_prio_boost(struct cfq_queue *cfqq)
 		cfqq->ioprio_class = cfqq->org_ioprio_class;
 		cfqq->ioprio = cfqq->org_ioprio;
 	}
+
+	/*
+	 * update the io weight if io priority gets changed.
+	 */
+	if (!RB_EMPTY_NODE(&cfqe->rb_node)) {
+		/*
+		 * If this CFQ entity is already on service tree, we need to
+		 * adjust service tree's total weight accordingly.
+		 */
+		cfqe->service_tree->total_weight -= cfqe->weight;
+		cfqe->weight = cfq_prio_to_weight(cfqq->ioprio);
+		cfqe->service_tree->total_weight += cfqe->weight;
+	} else
+		cfqe->weight = cfq_prio_to_weight(cfqq->ioprio);
 }
 
 static inline int __cfq_may_queue(struct cfq_queue *cfqq)
-- 
1.7.1





^ permalink raw reply	[flat|nested] 40+ messages in thread

* [PATCH 4/6 v4] cfq-iosched: Extract some common code of service tree handling for CFQ queue and CFQ group
       [not found] <4D51ED26.8050809@cn.fujitsu.com>
                   ` (2 preceding siblings ...)
  2011-02-10  7:47 ` [PATCH 3/6 v4] cfq-iosched: Introduce vdisktime and io weight for CFQ queue Gui Jianfeng
@ 2011-02-10  7:47 ` Gui Jianfeng
  2011-02-10  7:47 ` [PATCH 5/6 v4] cfq-iosched: CFQ group hierarchical scheduling and use_hierarchy interface Gui Jianfeng
  2011-02-10  7:47 ` [PATCH 6/6 v4] blkio-cgroup: Document for blkio.use_hierarchy interface Gui Jianfeng
  5 siblings, 0 replies; 40+ messages in thread
From: Gui Jianfeng @ 2011-02-10  7:47 UTC (permalink / raw)
  To: Vivek Goyal, Jens Axboe
  Cc: Gui Jianfeng, Shaohua Li, lkml, Chad Talbott, Divyesh Shah

Extract some common code of service tree handling for CFQ queue
and CFQ group. This helps when CFQ queue and CFQ group are scheduled
together.

Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
---
 block/cfq-iosched.c |   86 +++++++++++++++++++++-----------------------------
 1 files changed, 36 insertions(+), 50 deletions(-)

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 41cef2e..aa3eda8 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -871,12 +871,11 @@ entity_key(struct cfq_rb_root *st, struct cfq_entity *entity)
 }
 
 static void
-__cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
+__cfq_entity_service_tree_add(struct cfq_rb_root *st, struct cfq_entity *cfqe)
 {
 	struct rb_node **node = &st->rb.rb_node;
 	struct rb_node *parent = NULL;
 	struct cfq_entity *__cfqe;
-	struct cfq_entity *cfqe = &cfqg->cfqe;
 	s64 key = entity_key(st, cfqe);
 	int left = 1;
 
@@ -900,6 +899,14 @@ __cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
 }
 
 static void
+cfq_entity_service_tree_add(struct cfq_rb_root *st, struct cfq_entity *cfqe)
+{
+	__cfq_entity_service_tree_add(st, cfqe);
+	st->count++;
+	st->total_weight += cfqe->weight;
+}
+
+static void
 cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
 {
 	struct cfq_rb_root *st = &cfqd->grp_service_tree;
@@ -923,8 +930,23 @@ cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
 	} else
 		cfqe->vdisktime = st->min_vdisktime;
 
-	__cfq_group_service_tree_add(st, cfqg);
-	st->total_weight += cfqe->weight;
+	cfq_entity_service_tree_add(st, cfqe);
+}
+
+static void
+__cfq_entity_service_tree_del(struct cfq_rb_root *st, struct cfq_entity *cfqe)
+{
+	cfq_rb_erase(&cfqe->rb_node, st);
+}
+
+static void
+cfq_entity_service_tree_del(struct cfq_rb_root *st, struct cfq_entity *cfqe)
+{
+	if (!RB_EMPTY_NODE(&cfqe->rb_node)) {
+		__cfq_entity_service_tree_del(st, cfqe);
+		st->total_weight -= cfqe->weight;
+		cfqe->service_tree = NULL;
+	}
 }
 
 static void
@@ -941,9 +963,7 @@ cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
 		return;
 
 	cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
-	st->total_weight -= cfqe->weight;
-	if (!RB_EMPTY_NODE(&cfqe->rb_node))
-		cfq_rb_erase(&cfqe->rb_node, st);
+	cfq_entity_service_tree_del(st, cfqe);
 	cfqg->saved_workload_slice = 0;
 	cfq_blkiocg_update_dequeue_stats(&cfqg->blkg, 1);
 }
@@ -992,9 +1012,9 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
 		charge = cfqq->allocated_slice;
 
 	/* Can't update vdisktime while group is on service tree */
-	cfq_rb_erase(&cfqe->rb_node, st);
+	__cfq_entity_service_tree_del(st, cfqe);
 	cfqe->vdisktime += cfq_scale_slice(charge, cfqe);
-	__cfq_group_service_tree_add(st, cfqg);
+	__cfq_entity_service_tree_add(st, cfqe);
 
 	/* This group is being expired. Save the context */
 	if (time_after(cfqd->workload_expires, jiffies)) {
@@ -1237,13 +1257,11 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 				 bool add_front)
 {
 	struct cfq_entity *cfqe;
-	struct rb_node **p, *parent;
+	struct rb_node *parent;
 	struct cfq_entity *__cfqe;
 	struct cfq_rb_root *service_tree, *orig_st;
-	int left;
 	int new_cfqq = 1;
 	int group_changed = 0;
-	s64 key;
 
 	cfqe = &cfqq->cfqe;
 	orig_st = cfqe->service_tree;
@@ -1260,8 +1278,7 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 			 * Group changed, dequeue this CFQ queue from the
 			 * original service tree.
 			 */
-			cfq_rb_erase(&cfqe->rb_node, cfqe->service_tree);
-			orig_st->total_weight -= cfqe->weight;
+			cfq_entity_service_tree_del(orig_st, cfqe);
 		}
 		cfqq->orig_cfqg = cfqq->cfqg;
 		cfqq->cfqg = &cfqd->root_group;
@@ -1277,8 +1294,7 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 			 * Group changed, dequeue this CFQ queue from the
 			 * original service tree.
 			 */
-			cfq_rb_erase(&cfqe->rb_node, cfqe->service_tree);
-			orig_st->total_weight -= cfqe->weight;
+			cfq_entity_service_tree_del(orig_st, cfqe);
 		}
 		cfq_put_cfqg(cfqq->cfqg);
 		cfqq->cfqg = cfqq->orig_cfqg;
@@ -1320,8 +1336,7 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 	 * Ok, we get here, this CFQ queue is on the service tree, dequeue it
 	 * firstly.
 	 */
-	cfq_rb_erase(&cfqe->rb_node, cfqe->service_tree);
-	orig_st->total_weight -= cfqe->weight;
+	cfq_entity_service_tree_del(orig_st, cfqe);
 
 	new_cfqq = 0;
 
@@ -1347,38 +1362,11 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 	}
 
 insert:
-	left = 1;
-	parent = NULL;
 	cfqe->service_tree = service_tree;
-	p = &service_tree->rb.rb_node;
-	key = entity_key(service_tree, cfqe);
-	while (*p) {
-		struct rb_node **n;
-
-		parent = *p;
-		__cfqe = rb_entry(parent, struct cfq_entity, rb_node);
-
-		/*
-		 * sort by key, that represents service time.
-		 */
-		if (key < entity_key(service_tree, __cfqe))
-			n = &(*p)->rb_left;
-		else {
-			n = &(*p)->rb_right;
-			left = 0;
-		}
-
-		p = n;
-	}
 
-	if (left)
-		service_tree->left = &cfqe->rb_node;
-
-	rb_link_node(&cfqe->rb_node, parent, p);
-	rb_insert_color(&cfqe->rb_node, &service_tree->rb);
+	/* Add cfqq onto service tree. */
+	cfq_entity_service_tree_add(service_tree, cfqe);
 	update_min_vdisktime(service_tree);
-	service_tree->count++;
-	service_tree->total_weight += cfqe->weight;
 	cfqq->reposition_time = jiffies;
 	if ((add_front || !new_cfqq) && !group_changed)
 		return;
@@ -1491,9 +1479,7 @@ static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 	service_tree = cfqe->service_tree;
 
 	if (!RB_EMPTY_NODE(&cfqe->rb_node)) {
-		cfq_rb_erase(&cfqe->rb_node, cfqe->service_tree);
-		service_tree->total_weight -= cfqe->weight;
-		cfqe->service_tree = NULL;
+		cfq_entity_service_tree_del(service_tree, cfqe);
 	}
 	if (cfqq->p_root) {
 		rb_erase(&cfqq->p_node, cfqq->p_root);
-- 
1.7.1





^ permalink raw reply	[flat|nested] 40+ messages in thread

* [PATCH 5/6 v4] cfq-iosched: CFQ group hierarchical scheduling and use_hierarchy interface
       [not found] <4D51ED26.8050809@cn.fujitsu.com>
                   ` (3 preceding siblings ...)
  2011-02-10  7:47 ` [PATCH 4/6 v4] cfq-iosched: Extract some common code of service tree handling for CFQ queue and CFQ group Gui Jianfeng
@ 2011-02-10  7:47 ` Gui Jianfeng
  2011-02-10 20:57   ` Vivek Goyal
  2011-02-17  0:31   ` Justin TerAvest
  2011-02-10  7:47 ` [PATCH 6/6 v4] blkio-cgroup: Document for blkio.use_hierarchy interface Gui Jianfeng
  5 siblings, 2 replies; 40+ messages in thread
From: Gui Jianfeng @ 2011-02-10  7:47 UTC (permalink / raw)
  To: Vivek Goyal, Jens Axboe
  Cc: Gui Jianfeng, Shaohua Li, lkml, Chad Talbott, Divyesh Shah

CFQ group hierarchical scheduling and use_hierarchy interface.

Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
---
 block/blk-cgroup.c  |   61 +++++-
 block/blk-cgroup.h  |    3 +
 block/cfq-iosched.c |  603 +++++++++++++++++++++++++++++++++++++--------------
 3 files changed, 500 insertions(+), 167 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 455768a..c55fecd 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -25,7 +25,10 @@
 static DEFINE_SPINLOCK(blkio_list_lock);
 static LIST_HEAD(blkio_list);
 
-struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT };
+struct blkio_cgroup blkio_root_cgroup = {
+	.weight = 2*BLKIO_WEIGHT_DEFAULT,
+	.use_hierarchy = 0
+};
 EXPORT_SYMBOL_GPL(blkio_root_cgroup);
 
 static struct cgroup_subsys_state *blkiocg_create(struct cgroup_subsys *,
@@ -454,6 +457,7 @@ static void __blkiocg_del_blkio_group(struct blkio_group *blkg)
 	blkg->blkcg_id = 0;
 }
 
+
 /*
  * returns 0 if blkio_group was still on cgroup list. Otherwise returns 1
  * indicating that blk_group was unhashed by the time we got to it.
@@ -765,6 +769,12 @@ unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
 }
 EXPORT_SYMBOL_GPL(blkcg_get_weight);
 
+unsigned int blkcg_get_use_hierarchy(struct blkio_cgroup *blkcg)
+{
+	return blkcg->use_hierarchy;
+}
+EXPORT_SYMBOL_GPL(blkcg_get_use_hierarchy);
+
 uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg, dev_t dev)
 {
 	struct blkio_policy_node *pn;
@@ -1202,6 +1212,8 @@ static u64 blkiocg_file_read_u64 (struct cgroup *cgrp, struct cftype *cft) {
 		switch(name) {
 		case BLKIO_PROP_weight:
 			return (u64)blkcg->weight;
+		case BLKIO_PROP_use_hierarchy:
+			return (u64)blkcg->use_hierarchy;
 		}
 		break;
 	default:
@@ -1210,6 +1222,36 @@ static u64 blkiocg_file_read_u64 (struct cgroup *cgrp, struct cftype *cft) {
 	return 0;
 }
 
+static int blkio_use_hierarchy_write(struct cgroup *cgrp, u64 val)
+{
+	struct cgroup *parent = cgrp->parent;
+	struct blkio_cgroup *blkcg, *parent_blkcg = NULL;
+	int ret = 0;
+
+	if (val != 0 && val != 1)
+		return -EINVAL;
+
+	blkcg = cgroup_to_blkio_cgroup(cgrp);
+	if (parent)
+		parent_blkcg = cgroup_to_blkio_cgroup(parent);
+
+	cgroup_lock();
+	/*
+	 * If parent's use_hierarchy is set, we can't make any modifications
+	 * in the child subtrees. If it is unset, then the change can occur,
+	 * provided the current cgroup has no children.
+	 */
+	if (!parent_blkcg || !parent_blkcg->use_hierarchy) {
+		if (list_empty(&cgrp->children))
+			blkcg->use_hierarchy = val;
+		else
+			ret = -EBUSY;
+	} else
+		ret = -EINVAL;
+	cgroup_unlock();
+	return ret;
+}
+
 static int
 blkiocg_file_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
 {
@@ -1224,6 +1266,8 @@ blkiocg_file_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
 		switch(name) {
 		case BLKIO_PROP_weight:
 			return blkio_weight_write(blkcg, val);
+		case BLKIO_PROP_use_hierarchy:
+			return blkio_use_hierarchy_write(cgrp, val);
 		}
 		break;
 	default:
@@ -1301,6 +1345,13 @@ struct cftype blkio_files[] = {
 		.name = "reset_stats",
 		.write_u64 = blkiocg_reset_stats,
 	},
+	{
+		.name = "use_hierarchy",
+		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
+					     BLKIO_PROP_use_hierarchy),
+		.read_u64 = blkiocg_file_read_u64,
+		.write_u64 = blkiocg_file_write_u64,
+	},
 #ifdef CONFIG_BLK_DEV_THROTTLING
 	{
 		.name = "throttle.read_bps_device",
@@ -1444,7 +1495,7 @@ static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup)
 static struct cgroup_subsys_state *
 blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup)
 {
-	struct blkio_cgroup *blkcg;
+	struct blkio_cgroup *blkcg, *parent_blkcg = NULL;
 	struct cgroup *parent = cgroup->parent;
 
 	if (!parent) {
@@ -1452,6 +1503,7 @@ blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup)
 		goto done;
 	}
 
+	parent_blkcg = cgroup_to_blkio_cgroup(parent);
 	blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
 	if (!blkcg)
 		return ERR_PTR(-ENOMEM);
@@ -1462,6 +1514,11 @@ done:
 	INIT_HLIST_HEAD(&blkcg->blkg_list);
 
 	INIT_LIST_HEAD(&blkcg->policy_list);
+	if (parent)
+		blkcg->use_hierarchy = parent_blkcg->use_hierarchy;
+	else
+		blkcg->use_hierarchy = 0;
+
 	return &blkcg->css;
 }
 
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index ea4861b..5b4b351 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -90,6 +90,7 @@ enum blkcg_file_name_prop {
 	BLKIO_PROP_idle_time,
 	BLKIO_PROP_empty_time,
 	BLKIO_PROP_dequeue,
+	BLKIO_PROP_use_hierarchy,
 };
 
 /* cgroup files owned by throttle policy */
@@ -105,6 +106,7 @@ enum blkcg_file_name_throtl {
 struct blkio_cgroup {
 	struct cgroup_subsys_state css;
 	unsigned int weight;
+	bool use_hierarchy;
 	spinlock_t lock;
 	struct hlist_head blkg_list;
 	struct list_head policy_list; /* list of blkio_policy_node */
@@ -179,6 +181,7 @@ struct blkio_policy_node {
 
 extern unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
 				     dev_t dev);
+extern unsigned int blkcg_get_use_hierarchy(struct blkio_cgroup *blkcg);
 extern uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg,
 				     dev_t dev);
 extern uint64_t blkcg_get_write_bps(struct blkio_cgroup *blkcg,
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index aa3eda8..0e21d27 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -110,6 +110,9 @@ struct cfq_entity {
 	u64 vdisktime;
 	bool is_group_entity;
 	unsigned int weight;
+	struct cfq_entity *parent;
+	/* Reposition time */
+	unsigned long reposition_time;
 };
 
 /*
@@ -118,8 +121,6 @@ struct cfq_entity {
 struct cfq_queue {
 	/* The schedule entity */
 	struct cfq_entity cfqe;
-	/* Reposition time */
-	unsigned long reposition_time;
 	/* reference count */
 	int ref;
 	/* various state flags, see below */
@@ -199,6 +200,9 @@ struct cfq_group {
 	/* number of cfqq currently on this group */
 	int nr_cfqq;
 
+	/* number of sub cfq groups */
+	int nr_subgp;
+
 	/*
 	 * Per group busy queus average. Useful for workload slice calc. We
 	 * create the array for each prio class but at run time it is used
@@ -234,10 +238,11 @@ struct cfq_group {
  */
 struct cfq_data {
 	struct request_queue *queue;
-	/* Root service tree for cfq_groups */
-	struct cfq_rb_root grp_service_tree;
 	struct cfq_group root_group;
 
+	/* cfq group schedule in flat or hierarchy manner. */
+	bool use_hierarchy;
+
 	/*
 	 * The priority currently being served
 	 */
@@ -246,6 +251,9 @@ struct cfq_data {
 	unsigned long workload_expires;
 	struct cfq_group *serving_group;
 
+	/* Service tree for cfq group flat scheduling mode. */
+	struct cfq_rb_root grp_service_tree;
+
 	/*
 	 * Each priority tree is sorted by next_request position.  These
 	 * trees are used when determining if two or more queues are
@@ -355,8 +363,6 @@ cfqg_of_entity(struct cfq_entity *cfqe)
 }
 
 
-static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);
-
 static struct cfq_rb_root *service_tree_for(struct cfq_group *cfqg,
 					    enum wl_prio_t prio,
 					    enum wl_type_t type)
@@ -643,13 +649,50 @@ static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd,
 	return cfqg->busy_queues_avg[rt];
 }
 
+static inline unsigned int
+cfq_group_get_total_weight(struct cfq_group *cfqg)
+{
+	int i, j;
+	struct cfq_rb_root *st;
+	unsigned int total_weight = 0;
+
+	for_each_cfqg_st(cfqg, i, j, st) {
+		total_weight += st->total_weight;
+	}
+
+	return total_weight;
+}
+
 static inline unsigned
 cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg)
 {
-	struct cfq_rb_root *st = &cfqd->grp_service_tree;
 	struct cfq_entity *cfqe = &cfqg->cfqe;
+	struct cfq_rb_root *st;
+	int group_slice = cfq_target_latency;
+	unsigned int grp_total_weight;
+	struct cfq_group *p_cfqg;
+
+	/*
+	 * Calculate group slice in a hierarchical way.
+	 * Note, the calculation is across all service trees under a group.
+	 */
+	do {
+		if (cfqe->parent) {
+			p_cfqg = cfqg_of_entity(cfqe->parent);
+			grp_total_weight = cfq_group_get_total_weight(p_cfqg);
+			group_slice = group_slice * cfqe->weight /
+					grp_total_weight;
+		} else {
+			/* For top level groups */
+			st = cfqe->service_tree;
+			group_slice = group_slice * cfqe->weight /
+					st->total_weight;
+		}
 
-	return cfq_target_latency * cfqe->weight / st->total_weight;
+		cfqe = cfqe->parent;
+	} while (cfqe);
+
+	return group_slice;
 }
 
 static inline void
@@ -672,7 +715,8 @@ cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 			/* scale low_slice according to IO priority
 			 * and sync vs async */
 			unsigned low_slice =
-				min(slice, base_low_slice * slice / sync_slice);
+				min(slice, base_low_slice * slice /
+				    sync_slice);
 			/* the adapted slice value is scaled to fit all iqs
 			 * into the target latency */
 			slice = max(slice * group_slice / expect_latency,
@@ -812,17 +856,6 @@ static struct cfq_entity *cfq_rb_first(struct cfq_rb_root *root)
 	return NULL;
 }
 
-static struct cfq_entity *cfq_rb_first_entity(struct cfq_rb_root *root)
-{
-	if (!root->left)
-		root->left = rb_first(&root->rb);
-
-	if (root->left)
-		return rb_entry_entity(root->left);
-
-	return NULL;
-}
-
 static void rb_erase_init(struct rb_node *n, struct rb_root *root)
 {
 	rb_erase(n, root);
@@ -896,12 +929,15 @@ __cfq_entity_service_tree_add(struct cfq_rb_root *st, struct cfq_entity *cfqe)
 
 	rb_link_node(&cfqe->rb_node, parent, node);
 	rb_insert_color(&cfqe->rb_node, &st->rb);
+
+	update_min_vdisktime(st);
 }
 
 static void
 cfq_entity_service_tree_add(struct cfq_rb_root *st, struct cfq_entity *cfqe)
 {
 	__cfq_entity_service_tree_add(st, cfqe);
+	cfqe->reposition_time = jiffies;
 	st->count++;
 	st->total_weight += cfqe->weight;
 }
@@ -909,34 +945,52 @@ cfq_entity_service_tree_add(struct cfq_rb_root *st, struct cfq_entity *cfqe)
 static void
 cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
 {
-	struct cfq_rb_root *st = &cfqd->grp_service_tree;
 	struct cfq_entity *cfqe = &cfqg->cfqe;
-	struct cfq_entity *__cfqe;
 	struct rb_node *n;
+	struct cfq_entity *entity;
+	struct cfq_rb_root *st;
+	struct cfq_group *__cfqg;
 
 	cfqg->nr_cfqq++;
+
 	if (!RB_EMPTY_NODE(&cfqe->rb_node))
 		return;
 
 	/*
-	 * Currently put the group at the end. Later implement something
-	 * so that groups get lesser vtime based on their weights, so that
-	 * if group does not loose all if it was not continously backlogged.
+	 * Enqueue this group and its ancestors onto their service tree.
 	 */
-	n = rb_last(&st->rb);
-	if (n) {
-		__cfqe = rb_entry_entity(n);
-		cfqe->vdisktime = __cfqe->vdisktime + CFQ_IDLE_DELAY;
-	} else
-		cfqe->vdisktime = st->min_vdisktime;
+	while (cfqe) {
+		if (!RB_EMPTY_NODE(&cfqe->rb_node))
+			return;
 
-	cfq_entity_service_tree_add(st, cfqe);
+		/*
+		 * Currently put the group at the end. Later implement
+		 * something so that groups get lesser vtime based on
+		 * their weights, so that a group does not lose all
+		 * if it was not continuously backlogged.
+		 */
+		st = cfqe->service_tree;
+		n = rb_last(&st->rb);
+		if (n) {
+			entity = rb_entry_entity(n);
+			cfqe->vdisktime = entity->vdisktime +
+				CFQ_IDLE_DELAY;
+		} else
+			cfqe->vdisktime = st->min_vdisktime;
+
+		cfq_entity_service_tree_add(st, cfqe);
+		cfqe = cfqe->parent;
+		__cfqg = cfqg_of_entity(cfqe);
+		if (__cfqg)
+			__cfqg->nr_subgp++;
+	}
 }
 
 static void
 __cfq_entity_service_tree_del(struct cfq_rb_root *st, struct cfq_entity *cfqe)
 {
 	cfq_rb_erase(&cfqe->rb_node, st);
+	update_min_vdisktime(st);
 }
 
 static void
@@ -945,27 +999,43 @@ cfq_entity_service_tree_del(struct cfq_rb_root *st, struct cfq_entity *cfqe)
 	if (!RB_EMPTY_NODE(&cfqe->rb_node)) {
 		__cfq_entity_service_tree_del(st, cfqe);
 		st->total_weight -= cfqe->weight;
-		cfqe->service_tree = NULL;
 	}
 }
 
 static void
 cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
 {
-	struct cfq_rb_root *st = &cfqd->grp_service_tree;
 	struct cfq_entity *cfqe = &cfqg->cfqe;
+	struct cfq_group *__cfqg, *p_cfqg;
 
 	BUG_ON(cfqg->nr_cfqq < 1);
 	cfqg->nr_cfqq--;
 
-	/* If there are other cfq queues under this group, don't delete it */
-	if (cfqg->nr_cfqq)
+	/*
+	 * If there are other cfq queues under this group, or there are other
+	 * cfq groups under this group, don't delete it.
+	 */
+	if (cfqg->nr_cfqq || cfqg->nr_subgp)
 		return;
 
-	cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
-	cfq_entity_service_tree_del(st, cfqe);
-	cfqg->saved_workload_slice = 0;
-	cfq_blkiocg_update_dequeue_stats(&cfqg->blkg, 1);
+	/*
+	 * Dequeue this group and its ancestors from their service
+	 * tree.
+	 */
+	while (cfqe) {
+		__cfqg = cfqg_of_entity(cfqe);
+		p_cfqg = cfqg_of_entity(cfqe->parent);
+		cfq_entity_service_tree_del(cfqe->service_tree, cfqe);
+		cfq_blkiocg_update_dequeue_stats(&__cfqg->blkg, 1);
+		cfq_log_cfqg(cfqd, __cfqg, "del_from_rr group");
+		__cfqg->saved_workload_slice = 0;
+		cfqe = cfqe->parent;
+		if (p_cfqg) {
+			p_cfqg->nr_subgp--;
+			if (p_cfqg->nr_cfqq || p_cfqg->nr_subgp)
+				return;
+		}
+	}
 }
 
 static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq)
@@ -997,7 +1067,6 @@ static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq)
 static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
 				struct cfq_queue *cfqq)
 {
-	struct cfq_rb_root *st = &cfqd->grp_service_tree;
 	unsigned int used_sl, charge;
 	int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg)
 			- cfqg->service_tree_idle.count;
@@ -1011,10 +1080,23 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
 	else if (!cfq_cfqq_sync(cfqq) && !nr_sync)
 		charge = cfqq->allocated_slice;
 
-	/* Can't update vdisktime while group is on service tree */
-	__cfq_entity_service_tree_del(st, cfqe);
-	cfqe->vdisktime += cfq_scale_slice(charge, cfqe);
-	__cfq_entity_service_tree_add(st, cfqe);
+	/*
+	 * Update the vdisktime on the whole chain.
+	 */
+	while (cfqe) {
+		struct cfq_rb_root *st = cfqe->service_tree;
+
+		/*
+		 * Can't update vdisktime while group is on service
+		 * tree.
+		 */
+		__cfq_entity_service_tree_del(st, cfqe);
+		cfqe->vdisktime += cfq_scale_slice(charge, cfqe);
+		__cfq_entity_service_tree_add(st, cfqe);
+		st->count++;
+		cfqe->reposition_time = jiffies;
+		cfqe = cfqe->parent;
+	}
 
 	/* This group is being expired. Save the context */
 	if (time_after(cfqd->workload_expires, jiffies)) {
@@ -1026,7 +1108,8 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
 		cfqg->saved_workload_slice = 0;
 
 	cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu",
-		     cfqe->vdisktime, st->min_vdisktime);
+		     cfqg->cfqe.vdisktime,
+		     cfqg->cfqe.service_tree->min_vdisktime);
 	cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u disp=%u charge=%u iops=%u"
 			" sect=%u", used_sl, cfqq->slice_dispatch, charge,
 			iops_mode(cfqd), cfqq->nr_sectors);
@@ -1048,35 +1131,27 @@ void cfq_update_blkio_group_weight(void *key, struct blkio_group *blkg,
 	cfqg_of_blkg(blkg)->cfqe.weight = weight;
 }
 
-static struct cfq_group *
-cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
+static void init_cfqe(struct blkio_cgroup *blkcg,
+				    struct cfq_group *cfqg)
+{
+	struct cfq_entity *cfqe = &cfqg->cfqe;
+
+	cfqe->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
+	RB_CLEAR_NODE(&cfqe->rb_node);
+	cfqe->is_group_entity = true;
+	cfqe->parent = NULL;
+}
+
+static void init_cfqg(struct cfq_data *cfqd, struct blkio_cgroup *blkcg,
+		      struct cfq_group *cfqg)
 {
-	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
-	struct cfq_group *cfqg = NULL;
-	void *key = cfqd;
 	int i, j;
 	struct cfq_rb_root *st;
-	struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
 	unsigned int major, minor;
-
-	cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
-	if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
-		sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
-		cfqg->blkg.dev = MKDEV(major, minor);
-		goto done;
-	}
-	if (cfqg || !create)
-		goto done;
-
-	cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node);
-	if (!cfqg)
-		goto done;
+	struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
 
 	for_each_cfqg_st(cfqg, i, j, st)
 		*st = CFQ_RB_ROOT;
-	RB_CLEAR_NODE(&cfqg->cfqe.rb_node);
-
-	cfqg->cfqe.is_group_entity = true;
 
 	/*
 	 * Take the initial reference that will be released on destroy
@@ -1086,24 +1161,199 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
 	 */
 	cfqg->ref = 1;
 
+	/* Add group onto cgroup list */
+	sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
+	cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
+				    MKDEV(major, minor));
+	/* Initialize group entity */
+	init_cfqe(blkcg, cfqg);
+	/* Add group on cfqd list */
+	hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
+}
+
+static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg);
+
+static void uninit_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg)
+{
+	if (!cfq_blkiocg_del_blkio_group(&cfqg->blkg))
+		cfq_destroy_cfqg(cfqd, cfqg);
+}
+
+static void cfqg_set_parent(struct cfq_data *cfqd, struct cfq_group *cfqg,
+			    struct cfq_group *p_cfqg)
+{
+	struct cfq_entity *cfqe, *p_cfqe;
+
+	cfqe = &cfqg->cfqe;
+
 	/*
-	 * Add group onto cgroup list. It might happen that bdi->dev is
-	 * not initiliazed yet. Initialize this new group without major
-	 * and minor info and this info will be filled in once a new thread
-	 * comes for IO. See code above.
+	 * 1. If use_hierarchy of the CGroup where cfqg's parent stays is not
+	 *    set, we put this cfqg onto global service tree.
+	 * 2. If cfqg is root cfqg, put it onto global service tree.
 	 */
-	if (bdi->dev) {
-		sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
-		cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
-					MKDEV(major, minor));
-	} else
-		cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
-					0);
+	if (!p_cfqg) {
+		cfqe->service_tree = &cfqd->grp_service_tree;
+		cfqe->parent = NULL;
+		return;
+	}
 
-	cfqg->cfqe.weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
+	p_cfqe = &p_cfqg->cfqe;
 
-	/* Add group on cfqd list */
-	hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
+	cfqe->parent = p_cfqe;
+
+	/*
+	 * Currently, just put cfq group entity on "BE:SYNC" workload
+	 * service tree.
+	 */
+	cfqe->service_tree = service_tree_for(p_cfqg, BE_WORKLOAD,
+						      SYNC_WORKLOAD);
+	/* child reference */
+	p_cfqg->ref++;
+}
+
+static struct cfq_group *cfqg_get_parent(struct cfq_group * cfqg)
+{
+	struct cfq_entity *cfqe, *p_cfqe;
+
+	if (!cfqg)
+		return NULL;
+
+	cfqe = &cfqg->cfqe;
+	p_cfqe = cfqe->parent;
+	if (!p_cfqe)
+		return NULL;
+
+	return cfqg_of_entity(p_cfqe);
+}
+
+static struct cfq_group *
+cfqg_chain_alloc(struct cfq_data *cfqd, struct cgroup *cgroup)
+{
+	struct blkio_cgroup *blkcg;
+	struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
+	unsigned int major, minor;
+	struct cfq_group *cfqg, *leaf_cfqg, *child_cfqg, *tmp_cfqg;
+	void *key = cfqd;
+
+	/*
+	 * If CGroup's use_hierarchy is unset, we only need to allocate one
+	 * CFQ group, and this group will be put onto the "grp_service_tree".
+	 * We don't need to check whether the cfqg exists, the caller has
+	 * already checked it.
+	 */
+	blkcg = cgroup_to_blkio_cgroup(cgroup);
+	if (!blkcg_get_use_hierarchy(blkcg)) {
+		cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC,
+				    cfqd->queue->node);
+		if (!cfqg)
+			return NULL;
+
+		init_cfqg(cfqd, blkcg, cfqg);
+		cfqg_set_parent(cfqd, cfqg, NULL);
+		return cfqg;
+	}
+
+	/*
+	 * Allocate the CFQ group chain until we meet the group we've already
+	 * allocated before, or to the CGroup whose use_hierarchy is not set.
+	 */
+	leaf_cfqg = NULL;
+	child_cfqg = NULL;
+	for (; cgroup != NULL; cgroup = cgroup->parent) {
+		blkcg = cgroup_to_blkio_cgroup(cgroup);
+		cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
+		if (cfqg) {
+			if (!cfqg->blkg.dev && bdi->dev &&
+			    dev_name(bdi->dev)) {
+				sscanf(dev_name(bdi->dev), "%u:%u",
+				       &major, &minor);
+				cfqg->blkg.dev = MKDEV(major, minor);
+			}
+
+			/*
+			 * Initialization of parent doesn't finish yet, get
+			 * it done.
+			 */
+			if (child_cfqg) {
+				if (blkcg_get_use_hierarchy(blkcg))
+					cfqg_set_parent(cfqd, child_cfqg,
+							cfqg);
+				else
+					cfqg_set_parent(cfqd, child_cfqg,
+							NULL);
+			}
+
+			/* chain has already been built */
+			break;
+		}
+
+		/*
+		 * We only allocate a cfqg when the corresponding cgroup's
+		 * use_hierarchy is set.
+		 */
+		if (blkcg_get_use_hierarchy(blkcg)) {
+			cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC,
+					    cfqd->queue->node);
+			if (!cfqg)
+				goto clean_up;
+
+			if (!leaf_cfqg)
+				leaf_cfqg = cfqg;
+
+			init_cfqg(cfqd, blkcg, cfqg);
+		} else {
+			cfqg = NULL;
+		}
+
+		if (child_cfqg)
+			cfqg_set_parent(cfqd, child_cfqg, cfqg);
+
+		/*
+		 * This CGroup's use_hierarchy isn't set, this means the CFQ
+		 * group chain has been built.
+		 */
+		if (!blkcg_get_use_hierarchy(blkcg))
+			break;
+
+		child_cfqg = cfqg;
+	}
+
+	return leaf_cfqg;
+
+clean_up:
+	/* clean up the allocated cfq groups. */
+	while (leaf_cfqg) {
+		tmp_cfqg = leaf_cfqg;
+		leaf_cfqg = cfqg_get_parent(leaf_cfqg);
+		uninit_cfqg(cfqd, tmp_cfqg);
+	}
+
+	return NULL;
+}
+
+static struct cfq_group *
+cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
+{
+	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
+	struct cfq_group *cfqg = NULL;
+	void *key = cfqd;
+	struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
+	unsigned int major, minor;
+
+	cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
+	if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
+		sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
+		cfqg->blkg.dev = MKDEV(major, minor);
+		goto done;
+	}
+	if (cfqg || !create)
+		goto done;
+
+	/*
+	 * Allocate the CFQ group chain up to the root group, or until we
+	 * meet a CGroup with use_hierarchy disabled.
+	 */
+	cfqg = cfqg_chain_alloc(cfqd, cgroup);
 
 done:
 	return cfqg;
@@ -1148,6 +1398,7 @@ static void cfq_put_cfqg(struct cfq_group *cfqg)
 {
 	struct cfq_rb_root *st;
 	int i, j;
+	struct cfq_group *p_cfqg;
 
 	BUG_ON(cfqg->ref <= 0);
 	cfqg->ref--;
@@ -1155,6 +1406,22 @@ static void cfq_put_cfqg(struct cfq_group *cfqg)
 		return;
 	for_each_cfqg_st(cfqg, i, j, st)
 		BUG_ON(!RB_EMPTY_ROOT(&st->rb));
+
+	do {
+		p_cfqg = cfqg_get_parent(cfqg);
+		kfree(cfqg);
+		cfqg = NULL;
+		/*
+		 * Drop the reference taken by children, if nobody references
+		 * parent group, we need delete the parent also.
+		 */
+		if (p_cfqg) {
+			p_cfqg->ref--;
+			if (p_cfqg->ref == 0)
+				cfqg = p_cfqg;
+		}
+	} while (cfqg);
+
 	kfree(cfqg);
 }
 
@@ -1321,9 +1588,6 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 			 * ioprio.
 			 */
 			pos_offset = cfq_get_boost(cfqd, cfqq);
-			/* Debug purpose, should remove. */
-			cfq_log_cfqq(cfqd, cfqq, "pos_offset: %llu",
-				     pos_offset);
 			cfqe->vdisktime = service_tree->min_vdisktime +
 						pos_offset;
 		} else
@@ -1365,9 +1629,8 @@ insert:
 	cfqe->service_tree = service_tree;
 
 	/* Add cfqq onto service tree. */
+
 	cfq_entity_service_tree_add(service_tree, cfqe);
-	update_min_vdisktime(service_tree);
-	cfqq->reposition_time = jiffies;
 	if ((add_front || !new_cfqq) && !group_changed)
 		return;
 	cfq_group_service_tree_add(cfqd, cfqq->cfqg);
@@ -1810,28 +2073,43 @@ static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
 	return cfqq_of_entity(cfq_rb_first(service_tree));
 }
 
-static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd)
+struct cfq_rb_root *choose_service_tree_forced(struct cfq_group *cfqg)
 {
-	struct cfq_group *cfqg;
-	struct cfq_entity *cfqe;
 	int i, j;
 	struct cfq_rb_root *st;
 
-	if (!cfqd->rq_queued)
-		return NULL;
+	for_each_cfqg_st(cfqg, i, j, st) {
+		if (st->count != 0)
+			return st;
+	}
 
-	cfqg = cfq_get_next_cfqg(cfqd);
-	if (!cfqg)
+	return NULL;
+}
+
+static struct cfq_entity *
+cfq_get_next_entity_forced(struct cfq_data *cfqd)
+{
+	struct cfq_entity *cfqe;
+	struct cfq_rb_root *st = &cfqd->grp_service_tree;
+	struct cfq_group *cfqg;
+
+	if (!cfqd->rq_queued)
 		return NULL;
 
-	for_each_cfqg_st(cfqg, i, j, st) {
+	do {
 		cfqe = cfq_rb_first(st);
-		if (cfqe != NULL)
-			return cfqq_of_entity(cfqe);
-	}
+		if (cfqe && !cfqe->is_group_entity)
+			return cfqe;
+		else if (cfqe && cfqe->is_group_entity)
+			cfqg = cfqg_of_entity(cfqe);
+
+		st = choose_service_tree_forced(cfqg);
+	} while (st);
+
 	return NULL;
 }
 
+
 /*
  * Get and set a new active qu



^ permalink raw reply	[flat|nested] 40+ messages in thread

* [PATCH 6/6 v4] blkio-cgroup: Document for blkio.use_hierarchy interface
       [not found] <4D51ED26.8050809@cn.fujitsu.com>
                   ` (4 preceding siblings ...)
  2011-02-10  7:47 ` [PATCH 5/6 v4] cfq-iosched: CFQ group hierarchical scheduling and use_hierarchy interface Gui Jianfeng
@ 2011-02-10  7:47 ` Gui Jianfeng
  5 siblings, 0 replies; 40+ messages in thread
From: Gui Jianfeng @ 2011-02-10  7:47 UTC (permalink / raw)
  To: Vivek Goyal, Jens Axboe
  Cc: Gui Jianfeng, Shaohua Li, lkml, Chad Talbott, Divyesh Shah

Document for blkio.use_hierarchy interface

Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
---
 Documentation/cgroups/blkio-controller.txt |   81 +++++++++++++++++++++-------
 1 files changed, 62 insertions(+), 19 deletions(-)

diff --git a/Documentation/cgroups/blkio-controller.txt b/Documentation/cgroups/blkio-controller.txt
index 4ed7b5c..24399f4 100644
--- a/Documentation/cgroups/blkio-controller.txt
+++ b/Documentation/cgroups/blkio-controller.txt
@@ -91,30 +91,62 @@ Throttling/Upper Limit policy
 
 Hierarchical Cgroups
 ====================
-- Currently none of the IO control policy supports hierarhical groups. But
-  cgroup interface does allow creation of hierarhical cgroups and internally
-  IO policies treat them as flat hierarchy.
+- The cgroup interface allows creation of hierarchical cgroups. Internally,
+  IO policies can treat them either as a flat hierarchy or as a true
+  hierarchy, so both hierarchical and flat bandwidth division are
+  supported. "blkio.use_hierarchy" can be used to switch between flat
+  mode and hierarchical mode.
 
-  So this patch will allow creation of cgroup hierarhcy but at the backend
-  everything will be treated as flat. So if somebody created a hierarchy like
-  as follows.
+  Note: Currently, "blkio.use_hierarchy" only affects proportional bandwidth
+  division. The throttling logic still treats everything as flat.
 
-			root
-			/  \
-		     test1 test2
-			|
-		     test3
+  Consider the following CGroup hierarchy:
 
-  CFQ and throttling will practically treat all groups at same level.
+			  Root
+			/  |   \
+		     Grp1  Grp2 tsk1
+	            /  \
+		 Grp3 tsk2
 
-				pivot
-			     /  |   \  \
-			root  test1 test2  test3
+  If blkio.use_hierarchy is disabled in all CGroups, CFQ will practically treat all groups
+  at the same level.
 
-  Down the line we can implement hierarchical accounting/control support
-  and also introduce a new cgroup file "use_hierarchy" which will control
-  whether cgroup hierarchy is viewed as flat or hierarchical by the policy..
-  This is how memory controller also has implemented the things.
+			     Pivot tree
+			    /  |   |   \
+			Root Grp1 Grp2 Grp3
+			 /     |
+			tsk1  tsk2
+
+  If blkio.use_hierarchy is enabled in the Root group, all children inherit it, so every
+  child group has use_hierarchy=1 set automatically and the hierarchy looks as follows.
+
+		       Pivot tree
+			   |
+			  Root
+			/  |   \
+		     Grp1  Grp2 tsk1
+	            /  \
+		 Grp3 tsk2
+
+  If blkio.use_hierarchy is enabled in Grp1 and Grp3, CFQ will view these groups and their
+  tasks the same way the CGroup hierarchy does. It looks as follows.
+
+
+			     Pivot tree
+			    /    |    \
+			  Root  Grp1  Grp2
+			  /     /  \
+		       tsk1   Grp3 tsk2
+
+  Root, Grp1 and Grp2 are treated at the same level under Pivot tree. tsk1 stays under Root.
+  Grp3 and tsk2 are treated at the same level under Grp1. Below is the mapping between
+  task io priority and io weight:
+
+	    prio       0    1     2    3    4    5    6     7
+	    weight  1000  868   740  612  484  356  228   100
+
+  Note: Regardless of the use_hierarchy setting in Root group, Root group is always put onto
+  Pivot tree.
 
 Various user visible config options
 ===================================
@@ -169,6 +201,17 @@ Proportional weight policy files
 	  dev     weight
 	  8:16    300
 
+- blkio.use_hierarchy
+	- Switch between hierarchical mode and flat mode as stated above.
+	  blkio.use_hierarchy == 1 means hierarchical mode is enabled.
+	  blkio.use_hierarchy == 0 means flat mode is enabled.
+	  You can set this interface only if there isn't any child CGroup under
+	  this CGroup. If a CGroup's blkio.use_hierarchy is set, newly created
+	  children will inherit it. It is not allowed to unset it in children.
+	  The default mode in Root CGroup is flat.
+	  blkio.use_hierarchy only works for proportional bandwidth division
+	  as of today and doesn't have any effect on throttling logic.
+
 - blkio.time
 	- disk time allocated to cgroup per device in milliseconds. First
 	  two fields specify the major and minor number of the device and
-- 
1.7.1





^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH 3/6 v4] cfq-iosched: Introduce vdisktime and io weight for CFQ queue
  2011-02-10  7:47 ` [PATCH 3/6 v4] cfq-iosched: Introduce vdisktime and io weight for CFQ queue Gui Jianfeng
@ 2011-02-10 19:29   ` Vivek Goyal
  2011-02-12  1:20     ` Gui Jianfeng
  2011-02-14 18:13   ` Vivek Goyal
  2011-02-14 23:32   ` Justin TerAvest
  2 siblings, 1 reply; 40+ messages in thread
From: Vivek Goyal @ 2011-02-10 19:29 UTC (permalink / raw)
  To: Gui Jianfeng; +Cc: Jens Axboe, Shaohua Li, lkml, Chad Talbott, Divyesh Shah

On Thu, Feb 10, 2011 at 03:47:16PM +0800, Gui Jianfeng wrote:
> Introduce vdisktime and io weight for CFQ queue scheduling. Currently, io priority
> maps to a range [100,1000]. It also gets rid of cfq_slice_offset() logic and makes
> use the same scheduling algorithm as CFQ group does. This helps for CFQ queue and
> group scheduling on the same service tree.
> 
> Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
> ---
>  block/cfq-iosched.c |  219 +++++++++++++++++++++++++++++++++++++++------------
>  1 files changed, 167 insertions(+), 52 deletions(-)
> 
> diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
> index f3a126e..41cef2e 100644
> --- a/block/cfq-iosched.c
> +++ b/block/cfq-iosched.c
> @@ -39,6 +39,13 @@ static const int cfq_hist_divisor = 4;
>   */
>  #define CFQ_IDLE_DELAY		(HZ / 5)
>  
> +/* 
> + * The base boosting value.
> + */
> +#define CFQ_BOOST_SYNC_BASE          (HZ / 10)
> +#define CFQ_BOOST_ASYNC_BASE          (HZ / 25)
> +

These are the same as cfq_slice_sync and cfq_slice_async. Looking at the
boost logic, this is equivalent to starting a new queue/group as
if it were being requeued after consuming a full slice. So maybe we can divide
it by some constant, say 4 or something like that. This is a minor
point though, as this algorithm will kind of evolve and we will learn
what works best.

Secondly, I think the SYNC vs ASYNC logic seems to be reversed.
We would like to give ASYNC queues a higher boost (put these farther in the
tree) and a lesser boost to SYNC queues. Looks like the above constants will
do the reverse?


[..]
> +	if (RB_EMPTY_NODE(&cfqe->rb_node)) {
> +		/*
> +		 * If this CFQ queue moves to another group, the original
> +		 * vdisktime makes no sense any more, reset the vdisktime
> +		 * here.
> +		 */
> +		parent = rb_last(&service_tree->rb);
> +		if (parent) {
> +			u64 pos_offset;
> +
> +			/*
> +			 * Estimate the position according to its weight and
> +			 * ioprio.
> +			 */
> +			pos_offset = cfq_get_boost(cfqd, cfqq);
> +			/* Debug purpose, should remove. */
> +			cfq_log_cfqq(cfqd, cfqq, "pos_offset: %llu",
> +				     pos_offset);

You wanted to get rid of above debugging comment?

Thanks
Vivek

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH 5/6 v4] cfq-iosched: CFQ group hierarchical scheduling and use_hierarchy interface
  2011-02-10  7:47 ` [PATCH 5/6 v4] cfq-iosched: CFQ group hierarchical scheduling and use_hierarchy interface Gui Jianfeng
@ 2011-02-10 20:57   ` Vivek Goyal
  2011-02-12  2:21     ` Gui Jianfeng
  2011-02-14  3:20     ` Gui Jianfeng
  2011-02-17  0:31   ` Justin TerAvest
  1 sibling, 2 replies; 40+ messages in thread
From: Vivek Goyal @ 2011-02-10 20:57 UTC (permalink / raw)
  To: Gui Jianfeng; +Cc: Jens Axboe, Shaohua Li, lkml, Chad Talbott, Divyesh Shah

On Thu, Feb 10, 2011 at 03:47:45PM +0800, Gui Jianfeng wrote:
> CFQ group hierarchical scheduling and use_hierarchy interface.
> 

Hi Gui,

I have done a quick high level review. Some minor comments inline.

[..]
>  struct cfq_data {
>  	struct request_queue *queue;
> -	/* Root service tree for cfq_groups */
> -	struct cfq_rb_root grp_service_tree;
>  	struct cfq_group root_group;
>  
> +	/* cfq group schedule in flat or hierarchy manner. */
> +	bool use_hierarchy;
> +

This seems to be redundant now? Nobody is using it?

>  	/*
>  	 * The priority currently being served
>  	 */
> @@ -246,6 +251,9 @@ struct cfq_data {
>  	unsigned long workload_expires;
>  	struct cfq_group *serving_group;
>  
> +	/* Service tree for cfq group flat scheduling mode. */
> +	struct cfq_rb_root grp_service_tree;

The above comment is misleading. This service tree is now used for both
flat and hierarchical mode.

[..]
>  static void
>  cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
>  {
> -	struct cfq_rb_root *st = &cfqd->grp_service_tree;
>  	struct cfq_entity *cfqe = &cfqg->cfqe;
> -	struct cfq_entity *__cfqe;
>  	struct rb_node *n;
> +	struct cfq_entity *entity;
> +	struct cfq_rb_root *st;
> +	struct cfq_group *__cfqg;
>  
>  	cfqg->nr_cfqq++;
> +
>  	if (!RB_EMPTY_NODE(&cfqe->rb_node))
>  		return;
>  
>  	/*
> -	 * Currently put the group at the end. Later implement something
> -	 * so that groups get lesser vtime based on their weights, so that
> -	 * if group does not loose all if it was not continously backlogged.
> +	 * Enqueue this group and its ancestors onto their service tree.
>  	 */
> -	n = rb_last(&st->rb);
> -	if (n) {
> -		__cfqe = rb_entry_entity(n);
> -		cfqe->vdisktime = __cfqe->vdisktime + CFQ_IDLE_DELAY;
> -	} else
> -		cfqe->vdisktime = st->min_vdisktime;
> +	while (cfqe) {
> +		if (!RB_EMPTY_NODE(&cfqe->rb_node))
> +			return;
>  
> -	cfq_entity_service_tree_add(st, cfqe);
> +		/*
> +		 * Currently put the group at the end. Later implement
> +		 * something so that groups get lesser vtime based on
> +		 * their weights, so that if group does not loose all
> +		 * if it was not continously backlogged.
> +		 */

Can we use vdisktime boost logic for groups also? I think it can be a separate
patch in the series (the last one). Keeping it as a separate patch will
also help you to coordinate with chad's patch.

> +		st = cfqe->service_tree;

Group entities set their service tree when they get allocated and retain
this pointer even when they get deleted from the service tree. Queue entities
seem to have it set to NULL when they get deleted from the service tree, and it
gets set again when the queue is inserted. It would be nice if we
could fix this discrepancy and keep it consistent. I think clearing
cfqe->service_tree is the better idea, and then calculating it again for
groups as well.

[..]
>  
> -static struct cfq_group *
> -cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
> +static void init_cfqe(struct blkio_cgroup *blkcg,
> +				    struct cfq_group *cfqg)

As you are using this function for initializing group entity, possibly
rename it to init_group_entity() or init_group_cfqe() etc.

[..]
> +static struct cfq_group *
> +cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
> +{
> +	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
> +	struct cfq_group *cfqg = NULL;
> +	void *key = cfqd;
> +	struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
> +	unsigned int major, minor;
> +
> +	cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
> +	if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
> +		sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
> +		cfqg->blkg.dev = MKDEV(major, minor);
> +		goto done;
> +	}

Should we make the update of this info hierarchical?

Thanks
Vivek

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH 3/6 v4] cfq-iosched: Introduce vdisktime and io weight for CFQ queue
  2011-02-10 19:29   ` Vivek Goyal
@ 2011-02-12  1:20     ` Gui Jianfeng
  2011-02-14 16:58       ` Vivek Goyal
  0 siblings, 1 reply; 40+ messages in thread
From: Gui Jianfeng @ 2011-02-12  1:20 UTC (permalink / raw)
  To: Vivek Goyal; +Cc: Jens Axboe, Shaohua Li, lkml, Chad Talbott, Divyesh Shah

Vivek Goyal wrote:
> On Thu, Feb 10, 2011 at 03:47:16PM +0800, Gui Jianfeng wrote:
>> Introduce vdisktime and io weight for CFQ queue scheduling. Currently, io priority
>> maps to a range [100,1000]. It also gets rid of cfq_slice_offset() logic and makes
>> use the same scheduling algorithm as CFQ group does. This helps for CFQ queue and
>> group scheduling on the same service tree.
>>
>> Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
>> ---
>>  block/cfq-iosched.c |  219 +++++++++++++++++++++++++++++++++++++++------------
>>  1 files changed, 167 insertions(+), 52 deletions(-)
>>
>> diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
>> index f3a126e..41cef2e 100644
>> --- a/block/cfq-iosched.c
>> +++ b/block/cfq-iosched.c
>> @@ -39,6 +39,13 @@ static const int cfq_hist_divisor = 4;
>>   */
>>  #define CFQ_IDLE_DELAY		(HZ / 5)
>>  
>> +/* 
>> + * The base boosting value.
>> + */
>> +#define CFQ_BOOST_SYNC_BASE          (HZ / 10)
>> +#define CFQ_BOOST_ASYNC_BASE          (HZ / 25)
>> +
> 
> These are same as cfq_slice_sync and cfq_slice_async. Looking at
> boost logic, this is equivalent of starting a new queue/group as
> if it is being requeued after conuming a full slice. So may be we can divide
> it by some const number say 4 or something like that. This is a minor
> point though as this algorimthm will kind of evolve and we will learn
> what works best.
> 
> Secondly, I think you wanted to SYNC vs ASYNC logic seem to be reversed.
> We would like to give ASYNC queues higher boost (Put these farther in 
> tree) and lesser boost to SYNC queues. Looks like above constants will
> do the reverse? 

Hi Vivek,

Currently, SYNC and ASYNC queues are in different service trees, so they don't
impact each other. Here, I really do want to use this logic.

Thanks,
Gui

> 
> 
> [..]
>> +	if (RB_EMPTY_NODE(&cfqe->rb_node)) {
>> +		/*
>> +		 * If this CFQ queue moves to another group, the original
>> +		 * vdisktime makes no sense any more, reset the vdisktime
>> +		 * here.
>> +		 */
>> +		parent = rb_last(&service_tree->rb);
>> +		if (parent) {
>> +			u64 pos_offset;
>> +
>> +			/*
>> +			 * Estimate the position according to its weight and
>> +			 * ioprio.
>> +			 */
>> +			pos_offset = cfq_get_boost(cfqd, cfqq);
>> +			/* Debug purpose, should remove. */
>> +			cfq_log_cfqq(cfqd, cfqq, "pos_offset: %llu",
>> +				     pos_offset);
> 
> You wanted to get rid of above debugging comment?
> 
> Thanks
> Vivek
> 

-- 
Regards
Gui Jianfeng

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH 5/6 v4] cfq-iosched: CFQ group hierarchical scheduling and use_hierarchy interface
  2011-02-10 20:57   ` Vivek Goyal
@ 2011-02-12  2:21     ` Gui Jianfeng
  2011-02-14 18:04       ` Vivek Goyal
  2011-02-14  3:20     ` Gui Jianfeng
  1 sibling, 1 reply; 40+ messages in thread
From: Gui Jianfeng @ 2011-02-12  2:21 UTC (permalink / raw)
  To: Vivek Goyal; +Cc: Jens Axboe, Shaohua Li, lkml, Chad Talbott, Divyesh Shah

Vivek Goyal wrote:
> On Thu, Feb 10, 2011 at 03:47:45PM +0800, Gui Jianfeng wrote:
>> CFQ group hierarchical scheduling and use_hierarchy interface.
>>
> 
> Hi Gui,
> 
> I have done a quick high level review. Some minor comments inline.
> 
> [..]
>>  struct cfq_data {
>>  	struct request_queue *queue;
>> -	/* Root service tree for cfq_groups */
>> -	struct cfq_rb_root grp_service_tree;
>>  	struct cfq_group root_group;
>>  
>> +	/* cfq group schedule in flat or hierarchy manner. */
>> +	bool use_hierarchy;
>> +
> 
> This seems to be redundant now? Nobody is using it?

Ahh, I think so.

> 
>>  	/*
>>  	 * The priority currently being served
>>  	 */
>> @@ -246,6 +251,9 @@ struct cfq_data {
>>  	unsigned long workload_expires;
>>  	struct cfq_group *serving_group;
>>  
>> +	/* Service tree for cfq group flat scheduling mode. */
>> +	struct cfq_rb_root grp_service_tree;
> 
> Above comment is misleading. This service tree is now used both for
> flat as well as hierarhical mode.

Will modify.

> 
> [..]
>>  static void
>>  cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
>>  {
>> -	struct cfq_rb_root *st = &cfqd->grp_service_tree;
>>  	struct cfq_entity *cfqe = &cfqg->cfqe;
>> -	struct cfq_entity *__cfqe;
>>  	struct rb_node *n;
>> +	struct cfq_entity *entity;
>> +	struct cfq_rb_root *st;
>> +	struct cfq_group *__cfqg;
>>  
>>  	cfqg->nr_cfqq++;
>> +
>>  	if (!RB_EMPTY_NODE(&cfqe->rb_node))
>>  		return;
>>  
>>  	/*
>> -	 * Currently put the group at the end. Later implement something
>> -	 * so that groups get lesser vtime based on their weights, so that
>> -	 * if group does not loose all if it was not continously backlogged.
>> +	 * Enqueue this group and its ancestors onto their service tree.
>>  	 */
>> -	n = rb_last(&st->rb);
>> -	if (n) {
>> -		__cfqe = rb_entry_entity(n);
>> -		cfqe->vdisktime = __cfqe->vdisktime + CFQ_IDLE_DELAY;
>> -	} else
>> -		cfqe->vdisktime = st->min_vdisktime;
>> +	while (cfqe) {
>> +		if (!RB_EMPTY_NODE(&cfqe->rb_node))
>> +			return;
>>  
>> -	cfq_entity_service_tree_add(st, cfqe);
>> +		/*
>> +		 * Currently put the group at the end. Later implement
>> +		 * something so that groups get lesser vtime based on
>> +		 * their weights, so that if group does not loose all
>> +		 * if it was not continously backlogged.
>> +		 */
> 
> Can we use vdisktime boost logic for groups also? I think it can be a separate
> patch in the series (the last one). Keeping it as a separate patch will
> also help you to coordinate with chad's patch.

Making a separate patch makes more sense; I will do it as soon as this series gets merged.

> 
>> +		st = cfqe->service_tree;
> 
> Group entity set their service tree when they get allocated and retain
> this pointer even when they get deleted from serivce tree. Queue entities
> seem to have it NULL when they get deleted from service tree and it
> gets set again when queue is getting inserted. It would be nice if we
> can fix this discrepancy and keep it consistent. I think clearing up
> cfqe->service_tree is a better idea and then calculate it again for
> group also.

Ok, will consider to change.

> 
> [..]
>>  
>> -static struct cfq_group *
>> -cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
>> +static void init_cfqe(struct blkio_cgroup *blkcg,
>> +				    struct cfq_group *cfqg)
> 
> As you are using this function for initializing group entity, possibly
> rename it to init_group_entity() or init_group_cfqe() etc.

Sure.

> 
> [..]
>> +static struct cfq_group *
>> +cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
>> +{
>> +	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
>> +	struct cfq_group *cfqg = NULL;
>> +	void *key = cfqd;
>> +	struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
>> +	unsigned int major, minor;
>> +
>> +	cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
>> +	if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
>> +		sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
>> +		cfqg->blkg.dev = MKDEV(major, minor);
>> +		goto done;
>> +	}
> 
> Should we make this updation of this info hierarhical?

IMHO, it's fine to defer the update until we really get the cfqg.

Will post an updated version.

Thanks,
Gui

> 
> Thanks
> Vivek
> 

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH 5/6 v4] cfq-iosched: CFQ group hierarchical scheduling and use_hierarchy interface
  2011-02-10 20:57   ` Vivek Goyal
  2011-02-12  2:21     ` Gui Jianfeng
@ 2011-02-14  3:20     ` Gui Jianfeng
  2011-02-14 18:10       ` Vivek Goyal
  1 sibling, 1 reply; 40+ messages in thread
From: Gui Jianfeng @ 2011-02-14  3:20 UTC (permalink / raw)
  To: Vivek Goyal; +Cc: Jens Axboe, Shaohua Li, lkml, Chad Talbott, Divyesh Shah

Vivek Goyal wrote:
> On Thu, Feb 10, 2011 at 03:47:45PM +0800, Gui Jianfeng wrote:
>> CFQ group hierarchical scheduling and use_hierarchy interface.
>>
> 
> Hi Gui,
> 
> I have done a quick high level review. Some minor comments inline.
> 
> [..]
>>  struct cfq_data {
>>  	struct request_queue *queue;
>> -	/* Root service tree for cfq_groups */
>> -	struct cfq_rb_root grp_service_tree;
>>  	struct cfq_group root_group;
>>  
>> +	/* cfq group schedule in flat or hierarchy manner. */
>> +	bool use_hierarchy;
>> +
> 
> This seems to be redundant now? Nobody is using it?
> 
>>  	/*
>>  	 * The priority currently being served
>>  	 */
>> @@ -246,6 +251,9 @@ struct cfq_data {
>>  	unsigned long workload_expires;
>>  	struct cfq_group *serving_group;
>>  
>> +	/* Service tree for cfq group flat scheduling mode. */
>> +	struct cfq_rb_root grp_service_tree;
> 
> Above comment is misleading. This service tree is now used both for
> flat as well as hierarhical mode.
> 
> [..]
>>  static void
>>  cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
>>  {
>> -	struct cfq_rb_root *st = &cfqd->grp_service_tree;
>>  	struct cfq_entity *cfqe = &cfqg->cfqe;
>> -	struct cfq_entity *__cfqe;
>>  	struct rb_node *n;
>> +	struct cfq_entity *entity;
>> +	struct cfq_rb_root *st;
>> +	struct cfq_group *__cfqg;
>>  
>>  	cfqg->nr_cfqq++;
>> +
>>  	if (!RB_EMPTY_NODE(&cfqe->rb_node))
>>  		return;
>>  
>>  	/*
>> -	 * Currently put the group at the end. Later implement something
>> -	 * so that groups get lesser vtime based on their weights, so that
>> -	 * if group does not loose all if it was not continously backlogged.
>> +	 * Enqueue this group and its ancestors onto their service tree.
>>  	 */
>> -	n = rb_last(&st->rb);
>> -	if (n) {
>> -		__cfqe = rb_entry_entity(n);
>> -		cfqe->vdisktime = __cfqe->vdisktime + CFQ_IDLE_DELAY;
>> -	} else
>> -		cfqe->vdisktime = st->min_vdisktime;
>> +	while (cfqe) {
>> +		if (!RB_EMPTY_NODE(&cfqe->rb_node))
>> +			return;
>>  
>> -	cfq_entity_service_tree_add(st, cfqe);
>> +		/*
>> +		 * Currently put the group at the end. Later implement
>> +		 * something so that groups get lesser vtime based on
>> +		 * their weights, so that if group does not loose all
>> +		 * if it was not continously backlogged.
>> +		 */
> 
> Can we use vdisktime boost logic for groups also? I think it can be a separate
> patch in the series (the last one). Keeping it as a separate patch will
> also help you to coordinate with chad's patch.
> 
>> +		st = cfqe->service_tree;
> 
> Group entity set their service tree when they get allocated and retain
> this pointer even when they get deleted from serivce tree. Queue entities
> seem to have it NULL when they get deleted from service tree and it
> gets set again when queue is getting inserted. It would be nice if we
> can fix this discrepancy and keep it consistent. I think clearing up
> cfqe->service_tree is a better idea and then calculate it again for
> group also.

Vivek,

Currently, a cfq queue might change its workload type and io class, so we need to recalculate
its service_tree. But for cfq groups, IMHO we don't need to add this complexity for the
time being.
I think we can add this change as soon as different io classes or workload types are
introduced. What do you think?

Thanks,
Gui

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH 3/6 v4] cfq-iosched: Introduce vdisktime and io weight for CFQ queue
  2011-02-12  1:20     ` Gui Jianfeng
@ 2011-02-14 16:58       ` Vivek Goyal
  2011-02-15  1:53         ` Gui Jianfeng
  0 siblings, 1 reply; 40+ messages in thread
From: Vivek Goyal @ 2011-02-14 16:58 UTC (permalink / raw)
  To: Gui Jianfeng; +Cc: Jens Axboe, Shaohua Li, lkml, Chad Talbott, Divyesh Shah

On Sat, Feb 12, 2011 at 09:20:58AM +0800, Gui Jianfeng wrote:
> Vivek Goyal wrote:
> > On Thu, Feb 10, 2011 at 03:47:16PM +0800, Gui Jianfeng wrote:
> >> Introduce vdisktime and io weight for CFQ queue scheduling. Currently, io priority
> >> maps to a range [100,1000]. It also gets rid of cfq_slice_offset() logic and makes
> >> use the same scheduling algorithm as CFQ group does. This helps for CFQ queue and
> >> group scheduling on the same service tree.
> >>
> >> Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
> >> ---
> >>  block/cfq-iosched.c |  219 +++++++++++++++++++++++++++++++++++++++------------
> >>  1 files changed, 167 insertions(+), 52 deletions(-)
> >>
> >> diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
> >> index f3a126e..41cef2e 100644
> >> --- a/block/cfq-iosched.c
> >> +++ b/block/cfq-iosched.c
> >> @@ -39,6 +39,13 @@ static const int cfq_hist_divisor = 4;
> >>   */
> >>  #define CFQ_IDLE_DELAY		(HZ / 5)
> >>  
> >> +/* 
> >> + * The base boosting value.
> >> + */
> >> +#define CFQ_BOOST_SYNC_BASE          (HZ / 10)
> >> +#define CFQ_BOOST_ASYNC_BASE          (HZ / 25)
> >> +
> > 
> > These are same as cfq_slice_sync and cfq_slice_async. Looking at
> > boost logic, this is equivalent of starting a new queue/group as
> > if it is being requeued after conuming a full slice. So may be we can divide
> > it by some const number say 4 or something like that. This is a minor
> > point though as this algorimthm will kind of evolve and we will learn
> > what works best.
> > 
> > Secondly, I think you wanted to SYNC vs ASYNC logic seem to be reversed.
> > We would like to give ASYNC queues higher boost (Put these farther in 
> > tree) and lesser boost to SYNC queues. Looks like above constants will
> > do the reverse? 
> 
> Hi Vivek,
> 
> Currently, SYNC and ASYNC queues are in different service tree, they don't
> impact each other. Here, I Really want use this logic.

Ok, SYNC and ASYNC are on separate service trees, so their vtimes are not
comparable (as of today; down the line one might want to look at those for
better workload selection logic).

Anyway, since the two are on separate trees, why should we have separate
boosting constants for them? How does it help?

Thanks
Vivek

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH 5/6 v4] cfq-iosched: CFQ group hierarchical scheduling and use_hierarchy interface
  2011-02-12  2:21     ` Gui Jianfeng
@ 2011-02-14 18:04       ` Vivek Goyal
  2011-02-15  2:38         ` Gui Jianfeng
  0 siblings, 1 reply; 40+ messages in thread
From: Vivek Goyal @ 2011-02-14 18:04 UTC (permalink / raw)
  To: Gui Jianfeng; +Cc: Jens Axboe, Shaohua Li, lkml, Chad Talbott, Divyesh Shah

On Sat, Feb 12, 2011 at 10:21:47AM +0800, Gui Jianfeng wrote:
[..]
> >> +static struct cfq_group *
> >> +cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
> >> +{
> >> +	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
> >> +	struct cfq_group *cfqg = NULL;
> >> +	void *key = cfqd;
> >> +	struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
> >> +	unsigned int major, minor;
> >> +
> >> +	cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
> >> +	if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
> >> +		sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
> >> +		cfqg->blkg.dev = MKDEV(major, minor);
> >> +		goto done;
> >> +	}
> > 
> > Should we make this updation of this info hierarhical?
> 
> IMHO, it's fine to defer the updation when we really get the cfqg.

But if the cfqg is already present, we will never hit the allocation path
again. So if somebody creates a 2-3 level deep hierarchy and does IO
only in the child cgroups, the parent cgroups will potentially not get
their device info updated, hence no visible stats?

Thanks
Vivek

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH 5/6 v4] cfq-iosched: CFQ group hierarchical scheduling and use_hierarchy interface
  2011-02-14  3:20     ` Gui Jianfeng
@ 2011-02-14 18:10       ` Vivek Goyal
  0 siblings, 0 replies; 40+ messages in thread
From: Vivek Goyal @ 2011-02-14 18:10 UTC (permalink / raw)
  To: Gui Jianfeng; +Cc: Jens Axboe, Shaohua Li, lkml, Chad Talbott, Divyesh Shah

On Mon, Feb 14, 2011 at 11:20:33AM +0800, Gui Jianfeng wrote:
> Vivek Goyal wrote:
> > On Thu, Feb 10, 2011 at 03:47:45PM +0800, Gui Jianfeng wrote:
> >> CFQ group hierarchical scheduling and use_hierarchy interface.
> >>
> > 
> > Hi Gui,
> > 
> > I have done a quick high level review. Some minor comments inline.
> > 
> > [..]
> >>  struct cfq_data {
> >>  	struct request_queue *queue;
> >> -	/* Root service tree for cfq_groups */
> >> -	struct cfq_rb_root grp_service_tree;
> >>  	struct cfq_group root_group;
> >>  
> >> +	/* cfq group schedule in flat or hierarchy manner. */
> >> +	bool use_hierarchy;
> >> +
> > 
> > This seems to be redundant now? Nobody is using it?
> > 
> >>  	/*
> >>  	 * The priority currently being served
> >>  	 */
> >> @@ -246,6 +251,9 @@ struct cfq_data {
> >>  	unsigned long workload_expires;
> >>  	struct cfq_group *serving_group;
> >>  
> >> +	/* Service tree for cfq group flat scheduling mode. */
> >> +	struct cfq_rb_root grp_service_tree;
> > 
> > > The above comment is misleading. This service tree is now used for
> > > both flat and hierarchical modes.
> > 
> > [..]
> >>  static void
> >>  cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
> >>  {
> >> -	struct cfq_rb_root *st = &cfqd->grp_service_tree;
> >>  	struct cfq_entity *cfqe = &cfqg->cfqe;
> >> -	struct cfq_entity *__cfqe;
> >>  	struct rb_node *n;
> >> +	struct cfq_entity *entity;
> >> +	struct cfq_rb_root *st;
> >> +	struct cfq_group *__cfqg;
> >>  
> >>  	cfqg->nr_cfqq++;
> >> +
> >>  	if (!RB_EMPTY_NODE(&cfqe->rb_node))
> >>  		return;
> >>  
> >>  	/*
> >> -	 * Currently put the group at the end. Later implement something
> >> -	 * so that groups get lesser vtime based on their weights, so that
> >> -	 * if group does not loose all if it was not continously backlogged.
> >> +	 * Enqueue this group and its ancestors onto their service tree.
> >>  	 */
> >> -	n = rb_last(&st->rb);
> >> -	if (n) {
> >> -		__cfqe = rb_entry_entity(n);
> >> -		cfqe->vdisktime = __cfqe->vdisktime + CFQ_IDLE_DELAY;
> >> -	} else
> >> -		cfqe->vdisktime = st->min_vdisktime;
> >> +	while (cfqe) {
> >> +		if (!RB_EMPTY_NODE(&cfqe->rb_node))
> >> +			return;
> >>  
> >> -	cfq_entity_service_tree_add(st, cfqe);
> >> +		/*
> >> +		 * Currently put the group at the end. Later implement
> >> +		 * something so that groups get lesser vtime based on
> >> +		 * their weights, so that if group does not loose all
> >> +		 * if it was not continously backlogged.
> >> +		 */
> > 
> > Can we use vdisktime boost logic for groups also? I think it can be a separate
> > patch in the series (the last one). Keeping it as a separate patch will
> > also help you to coordinate with chad's patch.
> > 
> >> +		st = cfqe->service_tree;
> > 
> > > Group entities set their service tree when they get allocated and retain
> > > this pointer even when they get deleted from the service tree. Queue entities
> > > seem to have it set to NULL when they get deleted from the service tree, and
> > > it gets set again when the queue is inserted. It would be nice if we
> > > could fix this discrepancy and keep it consistent. I think clearing
> > > cfqe->service_tree is the better idea, and then calculating it again for
> > > groups as well.
> 
> Vivek,
> 
> Currently, a cfq queue might change its workload type and io class, so we need to recalculate
> its service_tree. But for cfq groups, IMHO we don't need to add this complexity for the
> time being.
> I think we can add this change as soon as different io classes or workload types are
> introduced. What do you think?

Ok, that's fine.

Thanks
Vivek

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH 3/6 v4] cfq-iosched: Introduce vdisktime and io weight for CFQ queue
  2011-02-10  7:47 ` [PATCH 3/6 v4] cfq-iosched: Introduce vdisktime and io weight for CFQ queue Gui Jianfeng
  2011-02-10 19:29   ` Vivek Goyal
@ 2011-02-14 18:13   ` Vivek Goyal
  2011-02-15  1:46     ` Gui Jianfeng
  2011-02-18  6:04     ` Gui Jianfeng
  2011-02-14 23:32   ` Justin TerAvest
  2 siblings, 2 replies; 40+ messages in thread
From: Vivek Goyal @ 2011-02-14 18:13 UTC (permalink / raw)
  To: Gui Jianfeng; +Cc: Jens Axboe, Shaohua Li, lkml, Chad Talbott, Divyesh Shah

On Thu, Feb 10, 2011 at 03:47:16PM +0800, Gui Jianfeng wrote:

[..]
> +/*
> + * The time when a CFQ queue is put onto a service tree is recoreded in
> + * cfqq->reposition_time. Currently, we check the first priority CFQ queues
> + * on each service tree, and select the workload type that contains the lowest
> + * reposition_time CFQ queue among them.
> + */
>  static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
>  				struct cfq_group *cfqg, enum wl_prio_t prio)
>  {
>  	struct cfq_entity *cfqe;
> +	struct cfq_queue *cfqq;
> +	unsigned long lowest_start_time;
>  	int i;
> -	bool key_valid = false;
> -	unsigned long lowest_key = 0;
> +	bool time_valid = false;
>  	enum wl_type_t cur_best = SYNC_NOIDLE_WORKLOAD;
>  
> +	/*
> +	 * TODO: We may take io priority and io class into account when
> +	 * choosing a workload type. But for the time being just make use of
> +	 * reposition_time only.
> +	 */
>  	for (i = 0; i <= SYNC_WORKLOAD; ++i) {
> -		/* select the one with lowest rb_key */
>  		cfqe = cfq_rb_first(service_tree_for(cfqg, prio, i));
> -		if (cfqe &&
> -		    (!key_valid || time_before(cfqe->rb_key, lowest_key))) {
> -			lowest_key = cfqe->rb_key;
> +		cfqq = cfqq_of_entity(cfqe);
> +		if (cfqe && (!time_valid ||
> +			     time_before(cfqq->reposition_time,
> +					 lowest_start_time))) {
> +			lowest_start_time = cfqq->reposition_time;

Gui,

Have you had a chance to run some mixed workloads in a group (some sync,
some async and some sync-idle queues), and see how the latency and throughput
of the sync-idle workload change due to this "reposition_time" logic? I
just want to make sure that the latency of the sync-noidle workload does not
go up, as that's the workload that people care about and that gets noticed first.

Thanks
Vivek

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH 3/6 v4] cfq-iosched: Introduce vdisktime and io weight for CFQ queue
  2011-02-10  7:47 ` [PATCH 3/6 v4] cfq-iosched: Introduce vdisktime and io weight for CFQ queue Gui Jianfeng
  2011-02-10 19:29   ` Vivek Goyal
  2011-02-14 18:13   ` Vivek Goyal
@ 2011-02-14 23:32   ` Justin TerAvest
  2011-02-15  1:44     ` Gui Jianfeng
  2 siblings, 1 reply; 40+ messages in thread
From: Justin TerAvest @ 2011-02-14 23:32 UTC (permalink / raw)
  To: Gui Jianfeng
  Cc: Vivek Goyal, Jens Axboe, Shaohua Li, lkml, Chad Talbott, Divyesh Shah

On Wed, Feb 9, 2011 at 11:47 PM, Gui Jianfeng
<guijianfeng@cn.fujitsu.com> wrote:
> Introduce vdisktime and io weight for CFQ queue scheduling. Currently, io priority
> maps to a range [100,1000]. It also gets rid of cfq_slice_offset() logic and makes
> use the same scheduling algorithm as CFQ group does. This helps for CFQ queue and
> group scheduling on the same service tree.

Hi Gui,
I have a couple of questions inline.

Thanks,
Justin

>
> Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
> ---
>  block/cfq-iosched.c |  219 +++++++++++++++++++++++++++++++++++++++------------
>  1 files changed, 167 insertions(+), 52 deletions(-)
>
> diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
> index f3a126e..41cef2e 100644
> --- a/block/cfq-iosched.c
> +++ b/block/cfq-iosched.c
> @@ -39,6 +39,13 @@ static const int cfq_hist_divisor = 4;
>  */
>  #define CFQ_IDLE_DELAY         (HZ / 5)
>
> +/*
> + * The base boosting value.
> + */
> +#define CFQ_BOOST_SYNC_BASE          (HZ / 10)
> +#define CFQ_BOOST_ASYNC_BASE          (HZ / 25)
> +
> +
>  /*
>  * below this threshold, we consider thinktime immediate
>  */
> @@ -99,10 +106,7 @@ struct cfq_entity {
>        struct cfq_rb_root *service_tree;
>        /* service_tree member */
>        struct rb_node rb_node;
> -       /* service_tree key, represent the position on the tree */
> -       unsigned long rb_key;
> -
> -       /* group service_tree key */
> +       /* service_tree key */
>        u64 vdisktime;
>        bool is_group_entity;
>        unsigned int weight;
> @@ -114,6 +118,8 @@ struct cfq_entity {
>  struct cfq_queue {
>        /* The schedule entity */
>        struct cfq_entity cfqe;
> +       /* Reposition time */
> +       unsigned long reposition_time;

Can this be addition time or something else instead? This is set, even
when we are not repositioning among service trees.

>        /* reference count */
>        int ref;
>        /* various state flags, see below */
> @@ -312,6 +318,24 @@ struct cfq_data {
>        struct rcu_head rcu;
>  };
>
> +/*
> + * Map io priority(7 ~ 0) to io weight(100 ~ 1000) as follows
> + *     prio       0    1     2    3    4    5    6     7
> + *     weight  1000  868   740  612  484  356  228   100
> + */
> +static inline unsigned int cfq_prio_to_weight(unsigned short ioprio)
> +{
> +       unsigned int step;
> +
> +       BUG_ON(ioprio >= IOPRIO_BE_NR);
> +
> +       step = (BLKIO_WEIGHT_MAX - BLKIO_WEIGHT_MIN) / (IOPRIO_BE_NR - 1);
> +       if (ioprio == 0)
> +               return BLKIO_WEIGHT_MAX;
> +
> +       return BLKIO_WEIGHT_MIN + (IOPRIO_BE_NR - ioprio - 1) * step;
> +}
> +
>  static inline struct cfq_queue *
>  cfqq_of_entity(struct cfq_entity *cfqe)
>  {
> @@ -840,16 +864,6 @@ cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
>        return cfq_choose_req(cfqd, next, prev, blk_rq_pos(last));
>  }
>
> -static unsigned long cfq_slice_offset(struct cfq_data *cfqd,
> -                                     struct cfq_queue *cfqq)
> -{
> -       /*
> -        * just an approximation, should be ok.
> -        */
> -       return (cfqq->cfqg->nr_cfqq - 1) * (cfq_prio_slice(cfqd, 1, 0) -
> -                      cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio));
> -}
> -
>  static inline s64
>  entity_key(struct cfq_rb_root *st, struct cfq_entity *entity)
>  {
> @@ -1199,6 +1213,21 @@ static inline void cfq_put_cfqg(struct cfq_group *cfqg) {}
>
>  #endif /* GROUP_IOSCHED */
>
> +static inline u64 cfq_get_boost(struct cfq_data *cfqd,
> +                                struct cfq_queue *cfqq)
> +{
> +       u64 d;
> +
> +       if (cfq_cfqq_sync(cfqq))
> +               d = CFQ_BOOST_SYNC_BASE << CFQ_SERVICE_SHIFT;
> +       else
> +               d = CFQ_BOOST_ASYNC_BASE << CFQ_SERVICE_SHIFT;
> +
> +       d = d * BLKIO_WEIGHT_DEFAULT;
> +       do_div(d, cfqq->cfqe.weight);
> +       return d;
> +}

The logic for cfq_get_boost() looks a lot like cfq_scale_slice().
Instead of duplicating code, can't it just be
u64 d;
if (cfq_cfqq_sync(cfqq))
        return cfq_scale_slice(CFQ_BOOST_SYNC_BASE, cfqq->cfqe);
else
        return cfq_scale_slice(CFQ_BOOST_ASYNC_BASE, cfqq->cfqe);


> +
>  /*
>  * The cfqd->service_trees holds all pending cfq_queue's that have
>  * requests waiting to be processed. It is sorted in the order that
> @@ -1210,13 +1239,14 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
>        struct cfq_entity *cfqe;
>        struct rb_node **p, *parent;
>        struct cfq_entity *__cfqe;
> -       unsigned long rb_key;
> -       struct cfq_rb_root *service_tree;
> +       struct cfq_rb_root *service_tree, *orig_st;
>        int left;
>        int new_cfqq = 1;
>        int group_changed = 0;
> +       s64 key;
>
>        cfqe = &cfqq->cfqe;
> +       orig_st = cfqe->service_tree;
>
>  #ifdef CONFIG_CFQ_GROUP_IOSCHED
>        if (!cfqd->cfq_group_isolation
> @@ -1224,8 +1254,15 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
>            && cfqq->cfqg && cfqq->cfqg != &cfqd->root_group) {
>                /* Move this cfq to root group */
>                cfq_log_cfqq(cfqd, cfqq, "moving to root group");
> -               if (!RB_EMPTY_NODE(&cfqe->rb_node))
> +               if (!RB_EMPTY_NODE(&cfqe->rb_node)) {
>                        cfq_group_service_tree_del(cfqd, cfqq->cfqg);
> +                       /*
> +                        * Group changed, dequeue this CFQ queue from the
> +                        * original service tree.
> +                        */
> +                       cfq_rb_erase(&cfqe->rb_node, cfqe->service_tree);
> +                       orig_st->total_weight -= cfqe->weight;
> +               }
>                cfqq->orig_cfqg = cfqq->cfqg;
>                cfqq->cfqg = &cfqd->root_group;
>                cfqd->root_group.ref++;
> @@ -1234,8 +1271,15 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
>                   && cfqq_type(cfqq) == SYNC_WORKLOAD && cfqq->orig_cfqg) {
>                /* cfqq is sequential now needs to go to its original group */
>                BUG_ON(cfqq->cfqg != &cfqd->root_group);
> -               if (!RB_EMPTY_NODE(&cfqe->rb_node))
> +               if (!RB_EMPTY_NODE(&cfqe->rb_node)) {
>                        cfq_group_service_tree_del(cfqd, cfqq->cfqg);
> +                       /*
> +                        * Group changed, dequeue this CFQ queue from the
> +                        * original service tree.
> +                        */
> +                       cfq_rb_erase(&cfqe->rb_node, cfqe->service_tree);
> +                       orig_st->total_weight -= cfqe->weight;
> +               }
>                cfq_put_cfqg(cfqq->cfqg);
>                cfqq->cfqg = cfqq->orig_cfqg;
>                cfqq->orig_cfqg = NULL;
> @@ -1246,47 +1290,68 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
>
>        service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq),
>                                                cfqq_type(cfqq));
> +       if (RB_EMPTY_NODE(&cfqe->rb_node)) {
> +               /*
> +                * If this CFQ queue moves to another group, the original
> +                * vdisktime makes no sense any more, reset the vdisktime
> +                * here.
> +                */
> +               parent = rb_last(&service_tree->rb);
> +               if (parent) {
> +                       u64 pos_offset;
> +
> +                       /*
> +                        * Estimate the position according to its weight and
> +                        * ioprio.
> +                        */
> +                       pos_offset = cfq_get_boost(cfqd, cfqq);
> +                       /* Debug purpose, should remove. */
> +                       cfq_log_cfqq(cfqd, cfqq, "pos_offset: %llu",
> +                                    pos_offset);
> +                       cfqe->vdisktime = service_tree->min_vdisktime +
> +                                               pos_offset;
> +               } else
> +                       cfqe->vdisktime = service_tree->min_vdisktime;
> +
> +               goto insert;
> +       }
> +
> +       /*
> +        * Ok, we get here, this CFQ queue is on the service tree, dequeue it
> +        * firstly.
> +        */
> +       cfq_rb_erase(&cfqe->rb_node, cfqe->service_tree);
> +       orig_st->total_weight -= cfqe->weight;
> +
> +       new_cfqq = 0;
> +
>        if (cfq_class_idle(cfqq)) {
> -               rb_key = CFQ_IDLE_DELAY;
>                parent = rb_last(&service_tree->rb);
>                if (parent && parent != &cfqe->rb_node) {
>                        __cfqe = rb_entry(parent, struct cfq_entity, rb_node);
> -                       rb_key += __cfqe->rb_key;
> +                       cfqe->vdisktime = __cfqe->vdisktime + CFQ_IDLE_DELAY;
>                } else
> -                       rb_key += jiffies;
> +                       cfqe->vdisktime = service_tree->min_vdisktime;
>        } else if (!add_front) {
>                /*
> -                * Get our rb key offset. Subtract any residual slice
> -                * value carried from last service. A negative resid
> -                * count indicates slice overrun, and this should position
> -                * the next service time further away in the tree.
> +                * We charge the CFQ queue by the time this queue runs, and
> +                * repsition it on the service tree.
>                 */
> -               rb_key = cfq_slice_offset(cfqd, cfqq) + jiffies;
> -               rb_key -= cfqq->slice_resid;
> +               unsigned int used_sl;
> +
> +               used_sl = cfq_cfqq_slice_usage(cfqq);
> +               cfqe->vdisktime += cfq_scale_slice(used_sl, cfqe);
>                cfqq->slice_resid = 0;
>        } else {
> -               rb_key = -HZ;
> -               __cfqe = cfq_rb_first(service_tree);
> -               rb_key += __cfqe ? __cfqe->rb_key : jiffies;
> -       }
> -
> -       if (!RB_EMPTY_NODE(&cfqe->rb_node)) {
> -               new_cfqq = 0;
> -               /*
> -                * same position, nothing more to do
> -                */
> -               if (rb_key == cfqe->rb_key &&
> -                   cfqe->service_tree == service_tree)
> -                       return;
> -
> -               cfq_rb_erase(&cfqe->rb_node, cfqe->service_tree);
> -               cfqe->service_tree = NULL;
> +               cfqe->vdisktime = service_tree->min_vdisktime;
>        }
>
> +insert:
>        left = 1;
>        parent = NULL;
>        cfqe->service_tree = service_tree;
>        p = &service_tree->rb.rb_node;
> +       key = entity_key(service_tree, cfqe);
>        while (*p) {
>                struct rb_node **n;
>
> @@ -1296,7 +1361,7 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
>                /*
>                 * sort by key, that represents service time.
>                 */
> -               if (time_before(rb_key, __cfqe->rb_key))
> +               if (key < entity_key(service_tree, __cfqe))
>                        n = &(*p)->rb_left;
>                else {
>                        n = &(*p)->rb_right;
> @@ -1309,10 +1374,12 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
>        if (left)
>                service_tree->left = &cfqe->rb_node;
>
> -       cfqe->rb_key = rb_key;
>        rb_link_node(&cfqe->rb_node, parent, p);
>        rb_insert_color(&cfqe->rb_node, &service_tree->rb);
> +       update_min_vdisktime(service_tree);
>        service_tree->count++;
> +       service_tree->total_weight += cfqe->weight;

I'm confused by this line. Why are we doing some adjustment for cfqe
weight that we weren't doing previously? I think
cfq_group_service_tree_add below will still do the total_weight
adjustment.

> +       cfqq->reposition_time = jiffies;
>        if ((add_front || !new_cfqq) && !group_changed)
>                return;
>        cfq_group_service_tree_add(cfqd, cfqq->cfqg);
> @@ -1414,14 +1481,18 @@ static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
>  static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
>  {
>        struct cfq_entity *cfqe;
> +       struct cfq_rb_root *service_tree;
> +
>        cfq_log_cfqq(cfqd, cfqq, "del_from_rr");
>        BUG_ON(!cfq_cfqq_on_rr(cfqq));
>        cfq_clear_cfqq_on_rr(cfqq);
>
>        cfqe = &cfqq->cfqe;
> +       service_tree = cfqe->service_tree;
>
>        if (!RB_EMPTY_NODE(&cfqe->rb_node)) {
>                cfq_rb_erase(&cfqe->rb_node, cfqe->service_tree);
> +               service_tree->total_weight -= cfqe->weight;
>                cfqe->service_tree = NULL;
>        }
>        if (cfqq->p_root) {
> @@ -2120,23 +2191,36 @@ static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq)
>        }
>  }
>
> +/*
> + * The time when a CFQ queue is put onto a service tree is recoreded in
> + * cfqq->reposition_time. Currently, we check the first priority CFQ queues
> + * on each service tree, and select the workload type that contains the lowest
> + * reposition_time CFQ queue among them.
> + */
>  static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
>                                struct cfq_group *cfqg, enum wl_prio_t prio)
>  {
>        struct cfq_entity *cfqe;
> +       struct cfq_queue *cfqq;
> +       unsigned long lowest_start_time;
>        int i;
> -       bool key_valid = false;
> -       unsigned long lowest_key = 0;
> +       bool time_valid = false;
>        enum wl_type_t cur_best = SYNC_NOIDLE_WORKLOAD;
>
> +       /*
> +        * TODO: We may take io priority and io class into account when
> +        * choosing a workload type. But for the time being just make use of
> +        * reposition_time only.
> +        */
>        for (i = 0; i <= SYNC_WORKLOAD; ++i) {
> -               /* select the one with lowest rb_key */
>                cfqe = cfq_rb_first(service_tree_for(cfqg, prio, i));
> -               if (cfqe &&
> -                   (!key_valid || time_before(cfqe->rb_key, lowest_key))) {
> -                       lowest_key = cfqe->rb_key;
> +               cfqq = cfqq_of_entity(cfqe);
> +               if (cfqe && (!time_valid ||
> +                            time_before(cfqq->reposition_time,
> +                                        lowest_start_time))) {
> +                       lowest_start_time = cfqq->reposition_time;
>                        cur_best = i;
> -                       key_valid = true;
> +                       time_valid = true;
>                }
>        }
>
> @@ -2808,10 +2892,13 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc)
>  {
>        struct task_struct *tsk = current;
>        int ioprio_class;
> +       struct cfq_entity *cfqe;
>
>        if (!cfq_cfqq_prio_changed(cfqq))
>                return;
>
> +       cfqe = &cfqq->cfqe;
> +
>        ioprio_class = IOPRIO_PRIO_CLASS(ioc->ioprio);
>        switch (ioprio_class) {
>        default:
> @@ -2838,6 +2925,17 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc)
>                break;
>        }
>
> +       if (!RB_EMPTY_NODE(&cfqe->rb_node)) {
> +               /*
> +                * If this CFQ entity is already on service tree, we need to
> +                * adjust service tree's total weight accordingly.
> +                */
> +               cfqe->service_tree->total_weight -= cfqe->weight;
> +               cfqe->weight = cfq_prio_to_weight(cfqq->ioprio);
> +               cfqe->service_tree->total_weight += cfqe->weight;
> +       } else
> +               cfqe->weight = cfq_prio_to_weight(cfqq->ioprio);
> +
>        /*
>         * keep track of original prio settings in case we have to temporarily
>         * elevate the priority of this queue
> @@ -3572,6 +3670,9 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
>  */
>  static void cfq_prio_boost(struct cfq_queue *cfqq)
>  {
> +       struct cfq_entity *cfqe;
> +
> +       cfqe = &cfqq->cfqe;
>        if (has_fs_excl()) {
>                /*
>                 * boost idle prio on transactions that would lock out other
> @@ -3588,6 +3689,20 @@ static void cfq_prio_boost(struct cfq_queue *cfqq)
>                cfqq->ioprio_class = cfqq->org_ioprio_class;
>                cfqq->ioprio = cfqq->org_ioprio;
>        }
> +
> +       /*
> +        * update the io weight if io priority gets changed.
> +        */
> +       if (!RB_EMPTY_NODE(&cfqe->rb_node)) {
> +               /*
> +                * If this CFQ entity is already on service tree, we need to
> +                * adjust service tree's total weight accordingly.
> +                */
> +               cfqe->service_tree->total_weight -= cfqe->weight;
> +               cfqe->weight = cfq_prio_to_weight(cfqq->ioprio);
> +               cfqe->service_tree->total_weight += cfqe->weight;
> +       } else
> +               cfqe->weight = cfq_prio_to_weight(cfqq->ioprio);
>  }
>
>  static inline int __cfq_may_queue(struct cfq_queue *cfqq)
> --
> 1.7.1
>
>
>
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
>

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH 3/6 v4] cfq-iosched: Introduce vdisktime and io weight for CFQ queue
  2011-02-14 23:32   ` Justin TerAvest
@ 2011-02-15  1:44     ` Gui Jianfeng
  2011-02-15 14:21       ` Vivek Goyal
  0 siblings, 1 reply; 40+ messages in thread
From: Gui Jianfeng @ 2011-02-15  1:44 UTC (permalink / raw)
  To: Justin TerAvest
  Cc: Vivek Goyal, Jens Axboe, Shaohua Li, lkml, Chad Talbott, Divyesh Shah

Justin TerAvest wrote:
> On Wed, Feb 9, 2011 at 11:47 PM, Gui Jianfeng
> <guijianfeng@cn.fujitsu.com> wrote:
>> Introduce vdisktime and io weight for CFQ queue scheduling. Currently, io priority
>> maps to a range [100,1000]. It also gets rid of cfq_slice_offset() logic and makes
>> use the same scheduling algorithm as CFQ group does. This helps for CFQ queue and
>> group scheduling on the same service tree.
> 
> Hi Gui,
> I have a couple of questions inline.
> 
> Thanks,
> Justin
> 
>> Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
>> ---
>>  block/cfq-iosched.c |  219 +++++++++++++++++++++++++++++++++++++++------------
>>  1 files changed, 167 insertions(+), 52 deletions(-)
>>
>> diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
>> index f3a126e..41cef2e 100644
>> --- a/block/cfq-iosched.c
>> +++ b/block/cfq-iosched.c
>> @@ -39,6 +39,13 @@ static const int cfq_hist_divisor = 4;
>>  */
>>  #define CFQ_IDLE_DELAY         (HZ / 5)
>>
>> +/*
>> + * The base boosting value.
>> + */
>> +#define CFQ_BOOST_SYNC_BASE          (HZ / 10)
>> +#define CFQ_BOOST_ASYNC_BASE          (HZ / 25)
>> +
>> +
>>  /*
>>  * below this threshold, we consider thinktime immediate
>>  */
>> @@ -99,10 +106,7 @@ struct cfq_entity {
>>        struct cfq_rb_root *service_tree;
>>        /* service_tree member */
>>        struct rb_node rb_node;
>> -       /* service_tree key, represent the position on the tree */
>> -       unsigned long rb_key;
>> -
>> -       /* group service_tree key */
>> +       /* service_tree key */
>>        u64 vdisktime;
>>        bool is_group_entity;
>>        unsigned int weight;
>> @@ -114,6 +118,8 @@ struct cfq_entity {
>>  struct cfq_queue {
>>        /* The schedule entity */
>>        struct cfq_entity cfqe;
>> +       /* Reposition time */
>> +       unsigned long reposition_time;
> 
> Can this be addition time or something else instead? This is set, even
> when we are not repositioning among service trees.

Hi Justin,

how about position_time :)

> 
>>        /* reference count */
>>        int ref;
>>        /* various state flags, see below */
>> @@ -312,6 +318,24 @@ struct cfq_data {
>>        struct rcu_head rcu;
>>  };
>>
>> +/*
>> + * Map io priority(7 ~ 0) to io weight(100 ~ 1000) as follows
>> + *     prio       0    1     2    3    4    5    6     7
>> + *     weight  1000  868   740  612  484  356  228   100
>> + */
>> +static inline unsigned int cfq_prio_to_weight(unsigned short ioprio)
>> +{
>> +       unsigned int step;
>> +
>> +       BUG_ON(ioprio >= IOPRIO_BE_NR);
>> +
>> +       step = (BLKIO_WEIGHT_MAX - BLKIO_WEIGHT_MIN) / (IOPRIO_BE_NR - 1);
>> +       if (ioprio == 0)
>> +               return BLKIO_WEIGHT_MAX;
>> +
>> +       return BLKIO_WEIGHT_MIN + (IOPRIO_BE_NR - ioprio - 1) * step;
>> +}
>> +
>>  static inline struct cfq_queue *
>>  cfqq_of_entity(struct cfq_entity *cfqe)
>>  {
>> @@ -840,16 +864,6 @@ cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
>>        return cfq_choose_req(cfqd, next, prev, blk_rq_pos(last));
>>  }
>>
>> -static unsigned long cfq_slice_offset(struct cfq_data *cfqd,
>> -                                     struct cfq_queue *cfqq)
>> -{
>> -       /*
>> -        * just an approximation, should be ok.
>> -        */
>> -       return (cfqq->cfqg->nr_cfqq - 1) * (cfq_prio_slice(cfqd, 1, 0) -
>> -                      cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio));
>> -}
>> -
>>  static inline s64
>>  entity_key(struct cfq_rb_root *st, struct cfq_entity *entity)
>>  {
>> @@ -1199,6 +1213,21 @@ static inline void cfq_put_cfqg(struct cfq_group *cfqg) {}
>>
>>  #endif /* GROUP_IOSCHED */
>>
>> +static inline u64 cfq_get_boost(struct cfq_data *cfqd,
>> +                                struct cfq_queue *cfqq)
>> +{
>> +       u64 d;
>> +
>> +       if (cfq_cfqq_sync(cfqq))
>> +               d = CFQ_BOOST_SYNC_BASE << CFQ_SERVICE_SHIFT;
>> +       else
>> +               d = CFQ_BOOST_ASYNC_BASE << CFQ_SERVICE_SHIFT;
>> +
>> +       d = d * BLKIO_WEIGHT_DEFAULT;
>> +       do_div(d, cfqq->cfqe.weight);
>> +       return d;
>> +}
> 
> The logic for cfq_get_boost() looks a lot like cfq_scale_slice().
> Instead of duplicating code, can't it just be
> u64 d;
> if (cfq_cfqq_sync(cfqq))
>         return cfq_scale_slice(CFQ_BOOST_SYNC_BASE, cfqq->cfqe);
> else
>         return cfq_scale_slice(CFQ_BOOST_ASYNC_BASE, cfqq->cfqe);
> 

Ok, I think this should work.

> 
>> +
>>  /*
>>  * The cfqd->service_trees holds all pending cfq_queue's that have
...

> I'm confused by this line. Why are we doing some adjustment for cfqe
> weight that we weren't doing previously? I think
> cfq_group_service_tree_add below will still do the total_weight
> adjustment.

A later patch does the integration for cfqq and cfqg.

Thanks,
Gui

> 
>> +       cfqq->reposition_time = jiffies;
>>        if ((add_front || !new_cfqq) && !group_changed)
>>                return;
>>        cfq_group_service_tree_add(cfqd, cfqq->cfqg);
>> @@ -1414,14 +1481,18 @@ static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
>>  static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
>>  {

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH 3/6 v4] cfq-iosched: Introduce vdisktime and io weight for CFQ queue
  2011-02-14 18:13   ` Vivek Goyal
@ 2011-02-15  1:46     ` Gui Jianfeng
  2011-02-18  6:04     ` Gui Jianfeng
  1 sibling, 0 replies; 40+ messages in thread
From: Gui Jianfeng @ 2011-02-15  1:46 UTC (permalink / raw)
  To: Vivek Goyal; +Cc: Jens Axboe, Shaohua Li, lkml, Chad Talbott, Divyesh Shah

Vivek Goyal wrote:
> On Thu, Feb 10, 2011 at 03:47:16PM +0800, Gui Jianfeng wrote:
> 
> [..]
>> +/*
>> + * The time when a CFQ queue is put onto a service tree is recoreded in
>> + * cfqq->reposition_time. Currently, we check the first priority CFQ queues
>> + * on each service tree, and select the workload type that contains the lowest
>> + * reposition_time CFQ queue among them.
>> + */
>>  static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
>>  				struct cfq_group *cfqg, enum wl_prio_t prio)
>>  {
>>  	struct cfq_entity *cfqe;
>> +	struct cfq_queue *cfqq;
>> +	unsigned long lowest_start_time;
>>  	int i;
>> -	bool key_valid = false;
>> -	unsigned long lowest_key = 0;
>> +	bool time_valid = false;
>>  	enum wl_type_t cur_best = SYNC_NOIDLE_WORKLOAD;
>>  
>> +	/*
>> +	 * TODO: We may take io priority and io class into account when
>> +	 * choosing a workload type. But for the time being just make use of
>> +	 * reposition_time only.
>> +	 */
>>  	for (i = 0; i <= SYNC_WORKLOAD; ++i) {
>> -		/* select the one with lowest rb_key */
>>  		cfqe = cfq_rb_first(service_tree_for(cfqg, prio, i));
>> -		if (cfqe &&
>> -		    (!key_valid || time_before(cfqe->rb_key, lowest_key))) {
>> -			lowest_key = cfqe->rb_key;
>> +		cfqq = cfqq_of_entity(cfqe);
>> +		if (cfqe && (!time_valid ||
>> +			     time_before(cfqq->reposition_time,
>> +					 lowest_start_time))) {
>> +			lowest_start_time = cfqq->reposition_time;
> 
> Gui,
> 
> Have you had a chance to run some mixed workloads in a group (some sync,
> some async and some sync-idle queues), and see how the latency and throughput
> of the sync-idle workload change due to this "reposition_time" logic? I
> just want to make sure that the latency of the sync-noidle workload does not
> go up, as that's the workload that people care about and that gets noticed first.

Hi Vivek

Will do some tests.

Thanks,
Gui

> 
> Thanks
> Vivek
> 

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH 3/6 v4] cfq-iosched: Introduce vdisktime and io weight for CFQ queue
  2011-02-14 16:58       ` Vivek Goyal
@ 2011-02-15  1:53         ` Gui Jianfeng
  2011-02-15 14:24           ` Vivek Goyal
  0 siblings, 1 reply; 40+ messages in thread
From: Gui Jianfeng @ 2011-02-15  1:53 UTC (permalink / raw)
  To: Vivek Goyal; +Cc: Jens Axboe, Shaohua Li, lkml, Chad Talbott, Divyesh Shah

Vivek Goyal wrote:
> On Sat, Feb 12, 2011 at 09:20:58AM +0800, Gui Jianfeng wrote:
>> Vivek Goyal wrote:
>>> On Thu, Feb 10, 2011 at 03:47:16PM +0800, Gui Jianfeng wrote:
>>>> Introduce vdisktime and io weight for CFQ queue scheduling. Currently, io priority
>>>> maps to a range [100,1000]. It also gets rid of cfq_slice_offset() logic and makes
>>>> use the same scheduling algorithm as CFQ group does. This helps for CFQ queue and
>>>> group scheduling on the same service tree.
>>>>
>>>> Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
>>>> ---
>>>>  block/cfq-iosched.c |  219 +++++++++++++++++++++++++++++++++++++++------------
>>>>  1 files changed, 167 insertions(+), 52 deletions(-)
>>>>
>>>> diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
>>>> index f3a126e..41cef2e 100644
>>>> --- a/block/cfq-iosched.c
>>>> +++ b/block/cfq-iosched.c
>>>> @@ -39,6 +39,13 @@ static const int cfq_hist_divisor = 4;
>>>>   */
>>>>  #define CFQ_IDLE_DELAY		(HZ / 5)
>>>>  
>>>> +/* 
>>>> + * The base boosting value.
>>>> + */
>>>> +#define CFQ_BOOST_SYNC_BASE          (HZ / 10)
>>>> +#define CFQ_BOOST_ASYNC_BASE          (HZ / 25)
>>>> +
>>> These are same as cfq_slice_sync and cfq_slice_async. Looking at
>>> boost logic, this is equivalent of starting a new queue/group as
>>> if it is being requeued after conuming a full slice. So may be we can divide
>>> it by some const number say 4 or something like that. This is a minor
>>> point though as this algorimthm will kind of evolve and we will learn
>>> what works best.
>>>
>>> Secondly, I think you wanted to SYNC vs ASYNC logic seem to be reversed.
>>> We would like to give ASYNC queues higher boost (Put these farther in 
>>> tree) and lesser boost to SYNC queues. Looks like above constants will
>>> do the reverse? 
>> Hi Vivek,
>>
>> Currently, SYNC and ASYNC queues are in different service tree, they don't
>> impact each other. Here, I Really want use this logic.
> 
> Ok, SYNC and ASYNC are on separate service tree so their vtime are not
> comparable (as of today, down the line one might want to look at those for
> better workload selection logic).
> 
> Anyway, because two are on seprate tree so why should we have separate
> boosting constants for them? How does it help?

Here if we are using CFQ_BOOST_SYNC_BASE for both, I think it might boost
too much for an ASYNC cfqe as compared to others on the same service tree (async).
So I make charging and boosting follow the same base.

Thanks,
Gui

> 
> Thanks
> Vivek
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
> 

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH 5/6 v4] cfq-iosched: CFQ group hierarchical scheduling and use_hierarchy interface
  2011-02-14 18:04       ` Vivek Goyal
@ 2011-02-15  2:38         ` Gui Jianfeng
  2011-02-15 14:27           ` Vivek Goyal
  0 siblings, 1 reply; 40+ messages in thread
From: Gui Jianfeng @ 2011-02-15  2:38 UTC (permalink / raw)
  To: Vivek Goyal; +Cc: Jens Axboe, Shaohua Li, lkml, Chad Talbott, Divyesh Shah

Vivek Goyal wrote:
> On Sat, Feb 12, 2011 at 10:21:47AM +0800, Gui Jianfeng wrote:
> [..]
>>>> +static struct cfq_group *
>>>> +cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
>>>> +{
>>>> +	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
>>>> +	struct cfq_group *cfqg = NULL;
>>>> +	void *key = cfqd;
>>>> +	struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
>>>> +	unsigned int major, minor;
>>>> +
>>>> +	cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
>>>> +	if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
>>>> +		sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
>>>> +		cfqg->blkg.dev = MKDEV(major, minor);
>>>> +		goto done;
>>>> +	}
>>> Should we make this updation of this info hierarhical?
>> IMHO, it's fine to defer the updation when we really get the cfqg.
> 
> But if cfqg is alrady present, we will never hit the allocation path 
> again. So if somebody creates 2-3 level deep hierarchy and does IO
> only in the children cgroup, parent cgroups will potentially not get
> device info updated hence no visible stats?

Ahh, I see your concern. But do we really need to show the stats even if
a cgroup doesn't issue any IO on a given device? 
For example, on a vanilla kernel, when we create a new cgroup and check the
stats, say blkio.io_service_bytes, we will get "Total 0", and no disk
specific stats.
Currently, as soon as a cgroup issues one IO request, cfqg->blkg.dev will
be updated.

Thanks,
Gui

> 
> Thanks
> Vivek
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
> 

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH 3/6 v4] cfq-iosched: Introduce vdisktime and io weight for CFQ queue
  2011-02-15  1:44     ` Gui Jianfeng
@ 2011-02-15 14:21       ` Vivek Goyal
  0 siblings, 0 replies; 40+ messages in thread
From: Vivek Goyal @ 2011-02-15 14:21 UTC (permalink / raw)
  To: Gui Jianfeng
  Cc: Justin TerAvest, Jens Axboe, Shaohua Li, lkml, Chad Talbott,
	Divyesh Shah

On Tue, Feb 15, 2011 at 09:44:44AM +0800, Gui Jianfeng wrote:

[..]
> >> +static inline u64 cfq_get_boost(struct cfq_data *cfqd,
> >> +                                struct cfq_queue *cfqq)
> >> +{
> >> +       u64 d;
> >> +
> >> +       if (cfq_cfqq_sync(cfqq))
> >> +               d = CFQ_BOOST_SYNC_BASE << CFQ_SERVICE_SHIFT;
> >> +       else
> >> +               d = CFQ_BOOST_ASYNC_BASE << CFQ_SERVICE_SHIFT;
> >> +
> >> +       d = d * BLKIO_WEIGHT_DEFAULT;
> >> +       do_div(d, cfqq->cfqe.weight);
> >> +       return d;
> >> +}
> > 
> > The logic for cfq_get_boost() looks a lot like cfq_scale_slice().
> > Instead of duplicating code, can't it just be
> > u64 d;
> > if (cfq_cfqq_sync(cfqq))
> >         return cfq_scale_slice(CFQ_BOOST_SYNC_BASE, cfqq->cfqe);
> > else
> >         return cfq_scale_slice(CFQ_BOOST_ASYNC_BASE, cfqq->cfqe);
> > 

I still think that we should use smaller values for CFQ_BOOST_SYNC_BASE
because otherwise what it means is that for freshly backlogged queues
we assume that these have already used one slice and then requeue these
accordingly. Instead it should be the reverse, where freshly backlogged queues
should get preference over existing queues which have been hogging the disk
for a long time.

Thanks
Vivek 

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH 3/6 v4] cfq-iosched: Introduce vdisktime and io weight for CFQ queue
  2011-02-15  1:53         ` Gui Jianfeng
@ 2011-02-15 14:24           ` Vivek Goyal
  2011-02-16  1:06             ` Gui Jianfeng
  0 siblings, 1 reply; 40+ messages in thread
From: Vivek Goyal @ 2011-02-15 14:24 UTC (permalink / raw)
  To: Gui Jianfeng; +Cc: Jens Axboe, Shaohua Li, lkml, Chad Talbott, Divyesh Shah

On Tue, Feb 15, 2011 at 09:53:58AM +0800, Gui Jianfeng wrote:
> Vivek Goyal wrote:
> > On Sat, Feb 12, 2011 at 09:20:58AM +0800, Gui Jianfeng wrote:
> >> Vivek Goyal wrote:
> >>> On Thu, Feb 10, 2011 at 03:47:16PM +0800, Gui Jianfeng wrote:
> >>>> Introduce vdisktime and io weight for CFQ queue scheduling. Currently, io priority
> >>>> maps to a range [100,1000]. It also gets rid of cfq_slice_offset() logic and makes
> >>>> use the same scheduling algorithm as CFQ group does. This helps for CFQ queue and
> >>>> group scheduling on the same service tree.
> >>>>
> >>>> Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
> >>>> ---
> >>>>  block/cfq-iosched.c |  219 +++++++++++++++++++++++++++++++++++++++------------
> >>>>  1 files changed, 167 insertions(+), 52 deletions(-)
> >>>>
> >>>> diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
> >>>> index f3a126e..41cef2e 100644
> >>>> --- a/block/cfq-iosched.c
> >>>> +++ b/block/cfq-iosched.c
> >>>> @@ -39,6 +39,13 @@ static const int cfq_hist_divisor = 4;
> >>>>   */
> >>>>  #define CFQ_IDLE_DELAY		(HZ / 5)
> >>>>  
> >>>> +/* 
> >>>> + * The base boosting value.
> >>>> + */
> >>>> +#define CFQ_BOOST_SYNC_BASE          (HZ / 10)
> >>>> +#define CFQ_BOOST_ASYNC_BASE          (HZ / 25)
> >>>> +
> >>> These are same as cfq_slice_sync and cfq_slice_async. Looking at
> >>> boost logic, this is equivalent of starting a new queue/group as
> >>> if it is being requeued after conuming a full slice. So may be we can divide
> >>> it by some const number say 4 or something like that. This is a minor
> >>> point though as this algorimthm will kind of evolve and we will learn
> >>> what works best.
> >>>
> >>> Secondly, I think you wanted to SYNC vs ASYNC logic seem to be reversed.
> >>> We would like to give ASYNC queues higher boost (Put these farther in 
> >>> tree) and lesser boost to SYNC queues. Looks like above constants will
> >>> do the reverse? 
> >> Hi Vivek,
> >>
> >> Currently, SYNC and ASYNC queues are in different service tree, they don't
> >> impact each other. Here, I Really want use this logic.
> > 
> > Ok, SYNC and ASYNC are on separate service tree so their vtime are not
> > comparable (as of today, down the line one might want to look at those for
> > better workload selection logic).
> > 
> > Anyway, because two are on seprate tree so why should we have separate
> > boosting constants for them? How does it help?
> 
> Here if we are using CFQ_BOOST_SYNC_BASE for both, I think it might boost
> too much for an ASYNC cfqe as compare to others on the same service tree(async).
> So I make charging and boosting follow the same base.

Ok, that makes sense. So as suggested in other mails, let's use an even
smaller base so that freshly backlogged queues get smaller vdisktimes
as compared to existing queues which are using disks for a longer time.

Thanks
Vivek

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH 5/6 v4] cfq-iosched: CFQ group hierarchical scheduling and use_hierarchy interface
  2011-02-15  2:38         ` Gui Jianfeng
@ 2011-02-15 14:27           ` Vivek Goyal
  2011-02-16  1:44             ` Gui Jianfeng
  0 siblings, 1 reply; 40+ messages in thread
From: Vivek Goyal @ 2011-02-15 14:27 UTC (permalink / raw)
  To: Gui Jianfeng; +Cc: Jens Axboe, Shaohua Li, lkml, Chad Talbott, Divyesh Shah

On Tue, Feb 15, 2011 at 10:38:32AM +0800, Gui Jianfeng wrote:
> Vivek Goyal wrote:
> > On Sat, Feb 12, 2011 at 10:21:47AM +0800, Gui Jianfeng wrote:
> > [..]
> >>>> +static struct cfq_group *
> >>>> +cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
> >>>> +{
> >>>> +	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
> >>>> +	struct cfq_group *cfqg = NULL;
> >>>> +	void *key = cfqd;
> >>>> +	struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
> >>>> +	unsigned int major, minor;
> >>>> +
> >>>> +	cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
> >>>> +	if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
> >>>> +		sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
> >>>> +		cfqg->blkg.dev = MKDEV(major, minor);
> >>>> +		goto done;
> >>>> +	}
> >>> Should we make this updation of this info hierarhical?
> >> IMHO, it's fine to defer the updation when we really get the cfqg.
> > 
> > But if cfqg is alrady present, we will never hit the allocation path 
> > again. So if somebody creates 2-3 level deep hierarchy and does IO
> > only in the children cgroup, parent cgroups will potentially not get
> > device info updated hence no visible stats?
> 
> Ahh, I see your concern. But do we really need to show the stats even if
> a cgroup doesn't issue any IO on a given device? 

I am assuming that once use_hierarchy=1, you are aggregating the stats
in parent cgroups? So if a child services 5 IOs, these are accounted
to parent group also when use_hierarchy=1?

What happens in case of the memory cgroup controller?

Thanks
Vivek

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH 3/6 v4] cfq-iosched: Introduce vdisktime and io weight for CFQ queue
  2011-02-15 14:24           ` Vivek Goyal
@ 2011-02-16  1:06             ` Gui Jianfeng
  0 siblings, 0 replies; 40+ messages in thread
From: Gui Jianfeng @ 2011-02-16  1:06 UTC (permalink / raw)
  To: Vivek Goyal; +Cc: Jens Axboe, Shaohua Li, lkml, Chad Talbott, Divyesh Shah

Vivek Goyal wrote:
> On Tue, Feb 15, 2011 at 09:53:58AM +0800, Gui Jianfeng wrote:
>> Vivek Goyal wrote:
>>> On Sat, Feb 12, 2011 at 09:20:58AM +0800, Gui Jianfeng wrote:
>>>> Vivek Goyal wrote:
>>>>> On Thu, Feb 10, 2011 at 03:47:16PM +0800, Gui Jianfeng wrote:
>>>>>> Introduce vdisktime and io weight for CFQ queue scheduling. Currently, io priority
>>>>>> maps to a range [100,1000]. It also gets rid of cfq_slice_offset() logic and makes
>>>>>> use the same scheduling algorithm as CFQ group does. This helps for CFQ queue and
>>>>>> group scheduling on the same service tree.
>>>>>>
>>>>>> Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
>>>>>> ---
>>>>>>  block/cfq-iosched.c |  219 +++++++++++++++++++++++++++++++++++++++------------
>>>>>>  1 files changed, 167 insertions(+), 52 deletions(-)
>>>>>>
>>>>>> diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
>>>>>> index f3a126e..41cef2e 100644
>>>>>> --- a/block/cfq-iosched.c
>>>>>> +++ b/block/cfq-iosched.c
>>>>>> @@ -39,6 +39,13 @@ static const int cfq_hist_divisor = 4;
>>>>>>   */
>>>>>>  #define CFQ_IDLE_DELAY		(HZ / 5)
>>>>>>  
>>>>>> +/* 
>>>>>> + * The base boosting value.
>>>>>> + */
>>>>>> +#define CFQ_BOOST_SYNC_BASE          (HZ / 10)
>>>>>> +#define CFQ_BOOST_ASYNC_BASE          (HZ / 25)
>>>>>> +
>>>>> These are same as cfq_slice_sync and cfq_slice_async. Looking at
>>>>> boost logic, this is equivalent of starting a new queue/group as
>>>>> if it is being requeued after conuming a full slice. So may be we can divide
>>>>> it by some const number say 4 or something like that. This is a minor
>>>>> point though as this algorimthm will kind of evolve and we will learn
>>>>> what works best.
>>>>>
>>>>> Secondly, I think you wanted to SYNC vs ASYNC logic seem to be reversed.
>>>>> We would like to give ASYNC queues higher boost (Put these farther in 
>>>>> tree) and lesser boost to SYNC queues. Looks like above constants will
>>>>> do the reverse? 
>>>> Hi Vivek,
>>>>
>>>> Currently, SYNC and ASYNC queues are in different service tree, they don't
>>>> impact each other. Here, I Really want use this logic.
>>> Ok, SYNC and ASYNC are on separate service tree so their vtime are not
>>> comparable (as of today, down the line one might want to look at those for
>>> better workload selection logic).
>>>
>>> Anyway, because two are on seprate tree so why should we have separate
>>> boosting constants for them? How does it help?
>> Here if we are using CFQ_BOOST_SYNC_BASE for both, I think it might boost
>> too much for an ASYNC cfqe as compare to others on the same service tree(async).
>> So I make charging and boosting follow the same base.
> 
> Ok, that makes sense. So as suggested in other mails, lets use a even
> smaller base so that freshly backlogged queues get smaller vdisktimes
> as compared to existing queues which are using disks for longer time.

Ok, that makes sense.

Thanks
Gui

> 
> Thanks
> Vivek
> 

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH 5/6 v4] cfq-iosched: CFQ group hierarchical scheduling and use_hierarchy interface
  2011-02-15 14:27           ` Vivek Goyal
@ 2011-02-16  1:44             ` Gui Jianfeng
  2011-02-16 14:17               ` Vivek Goyal
  2011-02-16 17:22               ` Divyesh Shah
  0 siblings, 2 replies; 40+ messages in thread
From: Gui Jianfeng @ 2011-02-16  1:44 UTC (permalink / raw)
  To: Vivek Goyal; +Cc: Jens Axboe, Shaohua Li, lkml, Chad Talbott, Divyesh Shah

Vivek Goyal wrote:
> On Tue, Feb 15, 2011 at 10:38:32AM +0800, Gui Jianfeng wrote:
>> Vivek Goyal wrote:
>>> On Sat, Feb 12, 2011 at 10:21:47AM +0800, Gui Jianfeng wrote:
>>> [..]
>>>>>> +static struct cfq_group *
>>>>>> +cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
>>>>>> +{
>>>>>> +	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
>>>>>> +	struct cfq_group *cfqg = NULL;
>>>>>> +	void *key = cfqd;
>>>>>> +	struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
>>>>>> +	unsigned int major, minor;
>>>>>> +
>>>>>> +	cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
>>>>>> +	if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
>>>>>> +		sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
>>>>>> +		cfqg->blkg.dev = MKDEV(major, minor);
>>>>>> +		goto done;
>>>>>> +	}
>>>>> Should we make this updation of this info hierarhical?
>>>> IMHO, it's fine to defer the updation when we really get the cfqg.
>>> But if cfqg is alrady present, we will never hit the allocation path 
>>> again. So if somebody creates 2-3 level deep hierarchy and does IO
>>> only in the children cgroup, parent cgroups will potentially not get
>>> device info updated hence no visible stats?
>> Ahh, I see your concern. But do we really need to show the stats even if
>> a cgroup doesn't issue any IO on a given device? 
> 
> I am assuming that once use_hierarchy=1, you are aggregating the stats
> in parent cgroups? So if a child services 5 IOs, these are accounted
> to parent group also when user_hier=1?
> 
> What happens in case of  memoy cgroup controller?

Hmm, it seems memcg aggregates stats in the parent group.
But do we really need to do that in kernel? I think it's easier to do it in
userland, and it makes kernel much simpler.

Thanks,
Gui

> 
> Thanks
> Vivek
> 

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH 5/6 v4] cfq-iosched: CFQ group hierarchical scheduling and use_hierarchy interface
  2011-02-16  1:44             ` Gui Jianfeng
@ 2011-02-16 14:17               ` Vivek Goyal
  2011-02-17  1:22                 ` Gui Jianfeng
  2011-02-16 17:22               ` Divyesh Shah
  1 sibling, 1 reply; 40+ messages in thread
From: Vivek Goyal @ 2011-02-16 14:17 UTC (permalink / raw)
  To: Gui Jianfeng; +Cc: Jens Axboe, Shaohua Li, lkml, Chad Talbott, Divyesh Shah

On Wed, Feb 16, 2011 at 09:44:39AM +0800, Gui Jianfeng wrote:
> Vivek Goyal wrote:
> > On Tue, Feb 15, 2011 at 10:38:32AM +0800, Gui Jianfeng wrote:
> >> Vivek Goyal wrote:
> >>> On Sat, Feb 12, 2011 at 10:21:47AM +0800, Gui Jianfeng wrote:
> >>> [..]
> >>>>>> +static struct cfq_group *
> >>>>>> +cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
> >>>>>> +{
> >>>>>> +	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
> >>>>>> +	struct cfq_group *cfqg = NULL;
> >>>>>> +	void *key = cfqd;
> >>>>>> +	struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
> >>>>>> +	unsigned int major, minor;
> >>>>>> +
> >>>>>> +	cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
> >>>>>> +	if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
> >>>>>> +		sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
> >>>>>> +		cfqg->blkg.dev = MKDEV(major, minor);
> >>>>>> +		goto done;
> >>>>>> +	}
> >>>>> Should we make this updation of this info hierarhical?
> >>>> IMHO, it's fine to defer the updation when we really get the cfqg.
> >>> But if cfqg is alrady present, we will never hit the allocation path 
> >>> again. So if somebody creates 2-3 level deep hierarchy and does IO
> >>> only in the children cgroup, parent cgroups will potentially not get
> >>> device info updated hence no visible stats?
> >> Ahh, I see your concern. But do we really need to show the stats even if
> >> a cgroup doesn't issue any IO on a given device? 
> > 
> > I am assuming that once use_hierarchy=1, you are aggregating the stats
> > in parent cgroups? So if a child services 5 IOs, these are accounted
> > to parent group also when user_hier=1?
> > 
> > What happens in case of  memoy cgroup controller?
> 
> Hmm, it seems memcg aggregating stats in parent group.
> But do we really need to do that in kernel? I think it's easier to do it in
> userland, and it makes kernel much simpler.

I think at some point of time hierarchical aggregated stats will also be
required. I am also looking at "memory.stat" file of memory controller
and they seem to be reporting both aggregated as well as individual group
stats.

So we can probably skip implementing hierarchical stats in this patchset
and implement it on a need basis in future.

Thanks
Vivek

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH 5/6 v4] cfq-iosched: CFQ group hierarchical scheduling and use_hierarchy interface
  2011-02-16  1:44             ` Gui Jianfeng
  2011-02-16 14:17               ` Vivek Goyal
@ 2011-02-16 17:22               ` Divyesh Shah
  2011-02-16 17:28                 ` Divyesh Shah
  1 sibling, 1 reply; 40+ messages in thread
From: Divyesh Shah @ 2011-02-16 17:22 UTC (permalink / raw)
  To: Gui Jianfeng; +Cc: Vivek Goyal, Jens Axboe, Shaohua Li, lkml, Chad Talbott

On Tue, Feb 15, 2011 at 5:44 PM, Gui Jianfeng
<guijianfeng@cn.fujitsu.com> wrote:
> Hmm, it seems memcg aggregating stats in parent group.
> But do we really need to do that in kernel? I think it's easier to do it in
> userland, and it makes kernel much simpler.
>

I would prefer having stats aggregated up the hierarchy. One trick we
used at Google earlier was to do lazy updates for most stats. So we
would accumulate stats for a given timeslice and at the end of that
timeslice propagate those counts all the way up to the parent. For
device info like this, the update can be instantaneous since it's not a
very frequent event. It would also be very useful to distinguish
between stats for the cgroup itself vs total values accumulated from
the subtree rooted at that cgroup like memcg does.

-- 
Thanks,
Divyesh

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH 5/6 v4] cfq-iosched: CFQ group hierarchical scheduling and use_hierarchy interface
  2011-02-16 17:22               ` Divyesh Shah
@ 2011-02-16 17:28                 ` Divyesh Shah
  2011-02-16 18:06                   ` Vivek Goyal
  0 siblings, 1 reply; 40+ messages in thread
From: Divyesh Shah @ 2011-02-16 17:28 UTC (permalink / raw)
  To: Gui Jianfeng; +Cc: Vivek Goyal, Jens Axboe, Shaohua Li, lkml, Chad Talbott

On Wed, Feb 16, 2011 at 9:22 AM, Divyesh Shah <dpshah@google.com> wrote:
> On Tue, Feb 15, 2011 at 5:44 PM, Gui Jianfeng
> <guijianfeng@cn.fujitsu.com> wrote:
>> Hmm, it seems memcg aggregating stats in parent group.
>> But do we really need to do that in kernel? I think it's easier to do it in
>> userland, and it makes kernel much simpler.
>>
>
> I would prefer having stats aggregated up the hierarchy. One trick we
> used at Google earlier was to do lazy updates for most stats. So we

Note that this was on an ancient version of the blkio controller :). As
Vivek mentioned,
it may be ok to add the hierarchical accounting later.

> would accumulate stats for a given timeslice and at the end of that
> timeslice propagate those counts all the way up to the parent. For
> device info like this, the update can be instantaneous since its not a
> very frequent event. It would also be very useful to distinguish
> between stats for the cgroup itself vs total values accumulated from
> the subtree rooted at that cgroup like memcg does.
>
> --
> Thanks,
> Divyesh
>



-- 
Thanks,
Divyesh

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH 5/6 v4] cfq-iosched: CFQ group hierarchical scheduling and use_hierarchy interface
  2011-02-16 17:28                 ` Divyesh Shah
@ 2011-02-16 18:06                   ` Vivek Goyal
  0 siblings, 0 replies; 40+ messages in thread
From: Vivek Goyal @ 2011-02-16 18:06 UTC (permalink / raw)
  To: Divyesh Shah; +Cc: Gui Jianfeng, Jens Axboe, Shaohua Li, lkml, Chad Talbott

On Wed, Feb 16, 2011 at 09:28:07AM -0800, Divyesh Shah wrote:
> On Wed, Feb 16, 2011 at 9:22 AM, Divyesh Shah <dpshah@google.com> wrote:
> > On Tue, Feb 15, 2011 at 5:44 PM, Gui Jianfeng
> > <guijianfeng@cn.fujitsu.com> wrote:
> >> Hmm, it seems memcg aggregating stats in parent group.
> >> But do we really need to do that in kernel? I think it's easier to do it in
> >> userland, and it makes kernel much simpler.
> >>
> >
> > I would prefer having stats aggregated up the hierarchy. One trick we
> > used at Google earlier was to do lazy updates for most stats. So we
> 
> Note that this was on ancient version of the blkio controller :). As
> Vivek mentioned,
> it may be ok to add the hierarchical accounting later.

One improvement we can probably do is make accounting per cpu and make
it lockless.

Thanks
Vivek

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH 5/6 v4] cfq-iosched: CFQ group hierarchical scheduling and use_hierarchy interface
  2011-02-10  7:47 ` [PATCH 5/6 v4] cfq-iosched: CFQ group hierarchical scheduling and use_hierarchy interface Gui Jianfeng
  2011-02-10 20:57   ` Vivek Goyal
@ 2011-02-17  0:31   ` Justin TerAvest
  2011-02-17  1:21     ` Gui Jianfeng
  2011-02-17 10:39     ` Alan Cox
  1 sibling, 2 replies; 40+ messages in thread
From: Justin TerAvest @ 2011-02-17  0:31 UTC (permalink / raw)
  To: Gui Jianfeng
  Cc: Vivek Goyal, Jens Axboe, Shaohua Li, lkml, Chad Talbott, Divyesh Shah

After a quick read,

It's sad that we have to have so many use_hierarchy checks; it seems
like we're asking for bugs, especially in the future when one codepath
gets updated but not the other.

CodingStyle says we should only have one declaration per line.

I feel like there is an implicit assumption that groups and tasks
should not be children of the same parent; that is, a group should
contain only groups, or only tasks, but I don't see this enforced;
there's just an assumption that BE:SYNC is "good enough" for that
comparison. This smells like something that will be tweaked/tuned for
fairness later. :( Why don't we just prevent this from happening?

The clean_up label in chain_alloc() is strange; I don't think the goto
is necessary at all. I found that method generally hard to understand.
It's doing a lot.

It's possible that some of these can't be worked around.


On Wed, Feb 9, 2011 at 11:47 PM, Gui Jianfeng
<guijianfeng@cn.fujitsu.com> wrote:
> CFQ group hierarchical scheduling and use_hierarchy interface.
>
> Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
> ---
>  block/blk-cgroup.c  |   61 +++++-
>  block/blk-cgroup.h  |    3 +
>  block/cfq-iosched.c |  603 +++++++++++++++++++++++++++++++++++++--------------
>  3 files changed, 500 insertions(+), 167 deletions(-)
>
> diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
> index 455768a..c55fecd 100644
> --- a/block/blk-cgroup.c
> +++ b/block/blk-cgroup.c
> @@ -25,7 +25,10 @@
>  static DEFINE_SPINLOCK(blkio_list_lock);
>  static LIST_HEAD(blkio_list);
>
> -struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT };
> +struct blkio_cgroup blkio_root_cgroup = {
> +       .weight = 2*BLKIO_WEIGHT_DEFAULT,
> +       .use_hierarchy = 0
> +};
>  EXPORT_SYMBOL_GPL(blkio_root_cgroup);
>
>  static struct cgroup_subsys_state *blkiocg_create(struct cgroup_subsys *,
> @@ -454,6 +457,7 @@ static void __blkiocg_del_blkio_group(struct blkio_group *blkg)
>        blkg->blkcg_id = 0;
>  }
>
> +
>  /*
>  * returns 0 if blkio_group was still on cgroup list. Otherwise returns 1
>  * indicating that blk_group was unhashed by the time we got to it.
> @@ -765,6 +769,12 @@ unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
>  }
>  EXPORT_SYMBOL_GPL(blkcg_get_weight);
>
> +unsigned int blkcg_get_use_hierarchy(struct blkio_cgroup *blkcg)
> +{
> +       return blkcg->use_hierarchy;
> +}
> +EXPORT_SYMBOL_GPL(blkcg_get_use_hierarchy);
> +
>  uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg, dev_t dev)
>  {
>        struct blkio_policy_node *pn;
> @@ -1202,6 +1212,8 @@ static u64 blkiocg_file_read_u64 (struct cgroup *cgrp, struct cftype *cft) {
>                switch(name) {
>                case BLKIO_PROP_weight:
>                        return (u64)blkcg->weight;
> +               case BLKIO_PROP_use_hierarchy:
> +                       return (u64)blkcg->use_hierarchy;
>                }
>                break;
>        default:
> @@ -1210,6 +1222,36 @@ static u64 blkiocg_file_read_u64 (struct cgroup *cgrp, struct cftype *cft) {
>        return 0;
>  }
>
> +static int blkio_use_hierarchy_write(struct cgroup *cgrp, u64 val)
> +{
> +       struct cgroup *parent = cgrp->parent;
> +       struct blkio_cgroup *blkcg, *parent_blkcg = NULL;
> +       int ret = 0;
> +
> +       if (val != 0 && val != 1)
> +               return -EINVAL;
> +
> +       blkcg = cgroup_to_blkio_cgroup(cgrp);
> +       if (parent)
> +               parent_blkcg = cgroup_to_blkio_cgroup(parent);
> +
> +       cgroup_lock();
> +       /*
> +        * If parent's use_hierarchy is set, we can't make any modifications
> +        * in the child subtrees. If it is unset, then the change can occur,
> +        * provided the current cgroup has no children.
> +        */
> +       if (!parent_blkcg || !parent_blkcg->use_hierarchy) {
> +               if (list_empty(&cgrp->children))
> +                       blkcg->use_hierarchy = val;
> +               else
> +                       ret = -EBUSY;
> +       } else
> +               ret = -EINVAL;
> +       cgroup_unlock();
> +       return ret;
> +}
> +
>  static int
>  blkiocg_file_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
>  {
> @@ -1224,6 +1266,8 @@ blkiocg_file_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
>                switch(name) {
>                case BLKIO_PROP_weight:
>                        return blkio_weight_write(blkcg, val);
> +               case BLKIO_PROP_use_hierarchy:
> +                       return blkio_use_hierarchy_write(cgrp, val);
>                }
>                break;
>        default:
> @@ -1301,6 +1345,13 @@ struct cftype blkio_files[] = {
>                .name = "reset_stats",
>                .write_u64 = blkiocg_reset_stats,
>        },
> +       {
> +               .name = "use_hierarchy",
> +               .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
> +                                            BLKIO_PROP_use_hierarchy),
> +               .read_u64 = blkiocg_file_read_u64,
> +               .write_u64 = blkiocg_file_write_u64,
> +       },
>  #ifdef CONFIG_BLK_DEV_THROTTLING
>        {
>                .name = "throttle.read_bps_device",
> @@ -1444,7 +1495,7 @@ static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup)
>  static struct cgroup_subsys_state *
>  blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup)
>  {
> -       struct blkio_cgroup *blkcg;
> +       struct blkio_cgroup *blkcg, *parent_blkcg = NULL;
>        struct cgroup *parent = cgroup->parent;
>
>        if (!parent) {
> @@ -1452,6 +1503,7 @@ blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup)
>                goto done;
>        }
>
> +       parent_blkcg = cgroup_to_blkio_cgroup(parent);
>        blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
>        if (!blkcg)
>                return ERR_PTR(-ENOMEM);
> @@ -1462,6 +1514,11 @@ done:
>        INIT_HLIST_HEAD(&blkcg->blkg_list);
>
>        INIT_LIST_HEAD(&blkcg->policy_list);
> +       if (parent)
> +               blkcg->use_hierarchy = parent_blkcg->use_hierarchy;
> +       else
> +               blkcg->use_hierarchy = 0;
> +
>        return &blkcg->css;
>  }
>
> diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
> index ea4861b..5b4b351 100644
> --- a/block/blk-cgroup.h
> +++ b/block/blk-cgroup.h
> @@ -90,6 +90,7 @@ enum blkcg_file_name_prop {
>        BLKIO_PROP_idle_time,
>        BLKIO_PROP_empty_time,
>        BLKIO_PROP_dequeue,
> +       BLKIO_PROP_use_hierarchy,
>  };
>
>  /* cgroup files owned by throttle policy */
> @@ -105,6 +106,7 @@ enum blkcg_file_name_throtl {
>  struct blkio_cgroup {
>        struct cgroup_subsys_state css;
>        unsigned int weight;
> +       bool use_hierarchy;
>        spinlock_t lock;
>        struct hlist_head blkg_list;
>        struct list_head policy_list; /* list of blkio_policy_node */
> @@ -179,6 +181,7 @@ struct blkio_policy_node {
>
>  extern unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
>                                     dev_t dev);
> +extern unsigned int blkcg_get_use_hierarchy(struct blkio_cgroup *blkcg);
>  extern uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg,
>                                     dev_t dev);
>  extern uint64_t blkcg_get_write_bps(struct blkio_cgroup *blkcg,
> diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
> index aa3eda8..0e21d27 100644
> --- a/block/cfq-iosched.c
> +++ b/block/cfq-iosched.c
> @@ -110,6 +110,9 @@ struct cfq_entity {
>        u64 vdisktime;
>        bool is_group_entity;
>        unsigned int weight;
> +       struct cfq_entity *parent;
> +       /* Reposition time */
> +       unsigned long reposition_time;
>  };
>
>  /*
> @@ -118,8 +121,6 @@ struct cfq_entity {
>  struct cfq_queue {
>        /* The schedule entity */
>        struct cfq_entity cfqe;
> -       /* Reposition time */
> -       unsigned long reposition_time;
>        /* reference count */
>        int ref;
>        /* various state flags, see below */
> @@ -199,6 +200,9 @@ struct cfq_group {
>        /* number of cfqq currently on this group */
>        int nr_cfqq;
>
> +       /* number of sub cfq groups */
> +       int nr_subgp;
> +
>        /*
>         * Per group busy queus average. Useful for workload slice calc. We
>         * create the array for each prio class but at run time it is used
> @@ -234,10 +238,11 @@ struct cfq_group {
>  */
>  struct cfq_data {
>        struct request_queue *queue;
> -       /* Root service tree for cfq_groups */
> -       struct cfq_rb_root grp_service_tree;
>        struct cfq_group root_group;
>
> +       /* cfq group schedule in flat or hierarchy manner. */
> +       bool use_hierarchy;
> +
>        /*
>         * The priority currently being served
>         */
> @@ -246,6 +251,9 @@ struct cfq_data {
>        unsigned long workload_expires;
>        struct cfq_group *serving_group;
>
> +       /* Service tree for cfq group flat scheduling mode. */
> +       struct cfq_rb_root grp_service_tree;
> +
>        /*
>         * Each priority tree is sorted by next_request position.  These
>         * trees are used when determining if two or more queues are
> @@ -355,8 +363,6 @@ cfqg_of_entity(struct cfq_entity *cfqe)
>  }
>
>
> -static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);
> -
>  static struct cfq_rb_root *service_tree_for(struct cfq_group *cfqg,
>                                            enum wl_prio_t prio,
>                                            enum wl_type_t type)
> @@ -643,13 +649,50 @@ static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd,
>        return cfqg->busy_queues_avg[rt];
>  }
>
> +static inline unsigned int
> +cfq_group_get_total_weight(struct cfq_group *cfqg)
> +{
> +       int i, j;
> +       struct cfq_rb_root *st;
> +       unsigned int total_weight = 0;
> +
> +       for_each_cfqg_st(cfqg, i, j, st) {
> +               total_weight += st->total_weight;
> +       }
> +
> +       return total_weight;
> +}
> +
>  static inline unsigned
>  cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg)
>  {
> -       struct cfq_rb_root *st = &cfqd->grp_service_tree;
>        struct cfq_entity *cfqe = &cfqg->cfqe;
> +       struct cfq_rb_root *st;
> +       int group_slice = cfq_target_latency;
> +       unsigned int grp_total_weight;
> +       struct cfq_group *p_cfqg;
> +
> +       /*
> +        * Calculate group slice in a hierarchical way.
> +        * Note, the calculation is cross all service trees under a group.
> +        */
> +       do {
> +               if (cfqe->parent) {
> +                       p_cfqg = cfqg_of_entity(cfqe->parent);
> +                       grp_total_weight = cfq_group_get_total_weight(p_cfqg);
> +                       group_slice = group_slice * cfqe->weight /
> +                                       grp_total_weight;
> +               } else {
> +                       /* For top level groups */
> +                       st = cfqe->service_tree;
> +                       group_slice = group_slice * cfqe->weight /
> +                                       st->total_weight;
> +               }
>
> -       return cfq_target_latency * cfqe->weight / st->total_weight;
> +               cfqe = cfqe->parent;
> +       } while (cfqe);
> +
> +       return group_slice;
>  }
>
>  static inline void
> @@ -672,7 +715,8 @@ cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
>                        /* scale low_slice according to IO priority
>                         * and sync vs async */
>                        unsigned low_slice =
> -                               min(slice, base_low_slice * slice / sync_slice);
> +                               min(slice, base_low_slice * slice /
> +                                   sync_slice);
>                        /* the adapted slice value is scaled to fit all iqs
>                         * into the target latency */
>                        slice = max(slice * group_slice / expect_latency,
> @@ -812,17 +856,6 @@ static struct cfq_entity *cfq_rb_first(struct cfq_rb_root *root)
>        return NULL;
>  }
>
> -static struct cfq_entity *cfq_rb_first_entity(struct cfq_rb_root *root)
> -{
> -       if (!root->left)
> -               root->left = rb_first(&root->rb);
> -
> -       if (root->left)
> -               return rb_entry_entity(root->left);
> -
> -       return NULL;
> -}
> -
>  static void rb_erase_init(struct rb_node *n, struct rb_root *root)
>  {
>        rb_erase(n, root);
> @@ -896,12 +929,15 @@ __cfq_entity_service_tree_add(struct cfq_rb_root *st, struct cfq_entity *cfqe)
>
>        rb_link_node(&cfqe->rb_node, parent, node);
>        rb_insert_color(&cfqe->rb_node, &st->rb);
> +
> +       update_min_vdisktime(st);
>  }
>
>  static void
>  cfq_entity_service_tree_add(struct cfq_rb_root *st, struct cfq_entity *cfqe)
>  {
>        __cfq_entity_service_tree_add(st, cfqe);
> +       cfqe->reposition_time = jiffies;
>        st->count++;
>        st->total_weight += cfqe->weight;
>  }
> @@ -909,34 +945,52 @@ cfq_entity_service_tree_add(struct cfq_rb_root *st, struct cfq_entity *cfqe)
>  static void
>  cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
>  {
> -       struct cfq_rb_root *st = &cfqd->grp_service_tree;
>        struct cfq_entity *cfqe = &cfqg->cfqe;
> -       struct cfq_entity *__cfqe;
>        struct rb_node *n;
> +       struct cfq_entity *entity;
> +       struct cfq_rb_root *st;
> +       struct cfq_group *__cfqg;
>
>        cfqg->nr_cfqq++;
> +
>        if (!RB_EMPTY_NODE(&cfqe->rb_node))
>                return;
>
>        /*
> -        * Currently put the group at the end. Later implement something
> -        * so that groups get lesser vtime based on their weights, so that
> -        * if group does not loose all if it was not continously backlogged.
> +        * Enqueue this group and its ancestors onto their service tree.
>         */
> -       n = rb_last(&st->rb);
> -       if (n) {
> -               __cfqe = rb_entry_entity(n);
> -               cfqe->vdisktime = __cfqe->vdisktime + CFQ_IDLE_DELAY;
> -       } else
> -               cfqe->vdisktime = st->min_vdisktime;
> +       while (cfqe) {
> +               if (!RB_EMPTY_NODE(&cfqe->rb_node))
> +                       return;
>
> -       cfq_entity_service_tree_add(st, cfqe);
> +               /*
> +                * Currently put the group at the end. Later implement
> +                * something so that groups get lesser vtime based on
> +                * their weights, so that if group does not loose all
> +                * if it was not continously backlogged.
> +                */
> +               st = cfqe->service_tree;
> +               n = rb_last(&st->rb);
> +               if (n) {
> +                       entity = rb_entry_entity(n);
> +                       cfqe->vdisktime = entity->vdisktime +
> +                               CFQ_IDLE_DELAY;
> +               } else
> +                       cfqe->vdisktime = st->min_vdisktime;
> +
> +               cfq_entity_service_tree_add(st, cfqe);
> +               cfqe = cfqe->parent;
> +               __cfqg = cfqg_of_entity(cfqe);
> +               if (__cfqg)
> +                       __cfqg->nr_subgp++;
> +       }
>  }
>
>  static void
>  __cfq_entity_service_tree_del(struct cfq_rb_root *st, struct cfq_entity *cfqe)
>  {
>        cfq_rb_erase(&cfqe->rb_node, st);
> +       update_min_vdisktime(st);
>  }
>
>  static void
> @@ -945,27 +999,43 @@ cfq_entity_service_tree_del(struct cfq_rb_root *st, struct cfq_entity *cfqe)
>        if (!RB_EMPTY_NODE(&cfqe->rb_node)) {
>                __cfq_entity_service_tree_del(st, cfqe);
>                st->total_weight -= cfqe->weight;
> -               cfqe->service_tree = NULL;
>        }
>  }
>
>  static void
>  cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
>  {
> -       struct cfq_rb_root *st = &cfqd->grp_service_tree;
>        struct cfq_entity *cfqe = &cfqg->cfqe;
> +       struct cfq_group *__cfqg, *p_cfqg;
>
>        BUG_ON(cfqg->nr_cfqq < 1);
>        cfqg->nr_cfqq--;
>
> -       /* If there are other cfq queues under this group, don't delete it */
> -       if (cfqg->nr_cfqq)
> +       /*
> +        * If there are other cfq queues under this group, or there are other
> +        * cfq groups under this group, don't delete it.
> +        */
> +       if (cfqg->nr_cfqq || cfqg->nr_subgp)
>                return;
>
> -       cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
> -       cfq_entity_service_tree_del(st, cfqe);
> -       cfqg->saved_workload_slice = 0;
> -       cfq_blkiocg_update_dequeue_stats(&cfqg->blkg, 1);
> +       /*
> +        * Dequeue this group and its ancestors from their service
> +        * tree.
> +        */
> +       while (cfqe) {
> +               __cfqg = cfqg_of_entity(cfqe);
> +               p_cfqg = cfqg_of_entity(cfqe->parent);
> +               cfq_entity_service_tree_del(cfqe->service_tree, cfqe);
> +               cfq_blkiocg_update_dequeue_stats(&__cfqg->blkg, 1);
> +               cfq_log_cfqg(cfqd, __cfqg, "del_from_rr group");
> +               __cfqg->saved_workload_slice = 0;
> +               cfqe = cfqe->parent;
> +               if (p_cfqg) {
> +                       p_cfqg->nr_subgp--;
> +                       if (p_cfqg->nr_cfqq || p_cfqg->nr_subgp)
> +                               return;
> +               }
> +       }
>  }
>
>  static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq)
> @@ -997,7 +1067,6 @@ static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq)
>  static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
>                                struct cfq_queue *cfqq)
>  {
> -       struct cfq_rb_root *st = &cfqd->grp_service_tree;
>        unsigned int used_sl, charge;
>        int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg)
>                        - cfqg->service_tree_idle.count;
> @@ -1011,10 +1080,23 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
>        else if (!cfq_cfqq_sync(cfqq) && !nr_sync)
>                charge = cfqq->allocated_slice;
>
> -       /* Can't update vdisktime while group is on service tree */
> -       __cfq_entity_service_tree_del(st, cfqe);
> -       cfqe->vdisktime += cfq_scale_slice(charge, cfqe);
> -       __cfq_entity_service_tree_add(st, cfqe);
> +       /*
> +        * Update the vdisktime on the whole chain.
> +        */
> +       while (cfqe) {
> +               struct cfq_rb_root *st = cfqe->service_tree;
> +
> +               /*
> +                * Can't update vdisktime while group is on service
> +                * tree.
> +                */
> +               __cfq_entity_service_tree_del(st, cfqe);
> +               cfqe->vdisktime += cfq_scale_slice(charge, cfqe);
> +               __cfq_entity_service_tree_add(st, cfqe);
> +               st->count++;
> +               cfqe->reposition_time = jiffies;
> +               cfqe = cfqe->parent;
> +       }
>
>        /* This group is being expired. Save the context */
>        if (time_after(cfqd->workload_expires, jiffies)) {
> @@ -1026,7 +1108,8 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
>                cfqg->saved_workload_slice = 0;
>
>        cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu",
> -                    cfqe->vdisktime, st->min_vdisktime);
> +                    cfqg->cfqe.vdisktime,
> +                    cfqg->cfqe.service_tree->min_vdisktime);
>        cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u disp=%u charge=%u iops=%u"
>                        " sect=%u", used_sl, cfqq->slice_dispatch, charge,
>                        iops_mode(cfqd), cfqq->nr_sectors);
> @@ -1048,35 +1131,27 @@ void cfq_update_blkio_group_weight(void *key, struct blkio_group *blkg,
>        cfqg_of_blkg(blkg)->cfqe.weight = weight;
>  }
>
> -static struct cfq_group *
> -cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
> +static void init_cfqe(struct blkio_cgroup *blkcg,
> +                                   struct cfq_group *cfqg)
> +{
> +       struct cfq_entity *cfqe = &cfqg->cfqe;
> +
> +       cfqe->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
> +       RB_CLEAR_NODE(&cfqe->rb_node);
> +       cfqe->is_group_entity = true;
> +       cfqe->parent = NULL;
> +}
> +
> +static void init_cfqg(struct cfq_data *cfqd, struct blkio_cgroup *blkcg,
> +                     struct cfq_group *cfqg)
>  {
> -       struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
> -       struct cfq_group *cfqg = NULL;
> -       void *key = cfqd;
>        int i, j;
>        struct cfq_rb_root *st;
> -       struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
>        unsigned int major, minor;
> -
> -       cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
> -       if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
> -               sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
> -               cfqg->blkg.dev = MKDEV(major, minor);
> -               goto done;
> -       }
> -       if (cfqg || !create)
> -               goto done;
> -
> -       cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node);
> -       if (!cfqg)
> -               goto done;
> +       struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
>
>        for_each_cfqg_st(cfqg, i, j, st)
>                *st = CFQ_RB_ROOT;
> -       RB_CLEAR_NODE(&cfqg->cfqe.rb_node);
> -
> -       cfqg->cfqe.is_group_entity = true;
>
>        /*
>         * Take the initial reference that will be released on destroy
> @@ -1086,24 +1161,199 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
>         */
>        cfqg->ref = 1;
>
> +       /* Add group onto cgroup list */
> +       sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
> +       cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
> +                                   MKDEV(major, minor));
> +       /* Initiate group entity */
> +       init_cfqe(blkcg, cfqg);
> +       /* Add group on cfqd list */
> +       hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
> +}
> +
> +static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg);
> +
> +static void uninit_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg)
> +{
> +       if (!cfq_blkiocg_del_blkio_group(&cfqg->blkg))
> +               cfq_destroy_cfqg(cfqd, cfqg);
> +}
> +
> +static void cfqg_set_parent(struct cfq_data *cfqd, struct cfq_group *cfqg,
> +                           struct cfq_group *p_cfqg)
> +{
> +       struct cfq_entity *cfqe, *p_cfqe;
> +
> +       cfqe = &cfqg->cfqe;
> +
>        /*
> -        * Add group onto cgroup list. It might happen that bdi->dev is
> -        * not initiliazed yet. Initialize this new group without major
> -        * and minor info and this info will be filled in once a new thread
> -        * comes for IO. See code above.
> +        * 1. If use_hierarchy of the CGroup where cfqg's parent stays is not
> +        *    set, we put this cfqg onto global service tree.
> +        * 2. If cfqg is root cfqg, put it onto global service tree.
>         */
> -       if (bdi->dev) {
> -               sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
> -               cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
> -                                       MKDEV(major, minor));
> -       } else
> -               cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
> -                                       0);
> +       if (!p_cfqg) {
> +               cfqe->service_tree = &cfqd->grp_service_tree;
> +               cfqe->parent = NULL;
> +               return;
> +       }
>
> -       cfqg->cfqe.weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
> +       p_cfqe = &p_cfqg->cfqe;
>
> -       /* Add group on cfqd list */
> -       hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
> +       cfqe->parent = p_cfqe;
> +
> +       /*
> +        * Currently, just put cfq group entity on "BE:SYNC" workload
> +        * service tree.
> +        */
> +       cfqe->service_tree = service_tree_for(p_cfqg, BE_WORKLOAD,
> +                                                     SYNC_WORKLOAD);
> +       /* child reference */
> +       p_cfqg->ref++;
> +}
> +
> +static struct cfq_group *cfqg_get_parent(struct cfq_group * cfqg)
> +{
> +       struct cfq_entity *cfqe, *p_cfqe;
> +
> +       if (!cfqg)
> +               return NULL;
> +
> +       cfqe = &cfqg->cfqe;
> +       p_cfqe = cfqe->parent;
> +       if (!p_cfqe)
> +               return NULL;
> +
> +       return cfqg_of_entity(p_cfqe);
> +}
> +
> +static struct cfq_group *
> +cfqg_chain_alloc(struct cfq_data *cfqd, struct cgroup *cgroup)
> +{
> +       struct blkio_cgroup *blkcg;
> +       struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
> +       unsigned int major, minor;
> +       struct cfq_group *cfqg, *leaf_cfqg, *child_cfqg, *tmp_cfqg;
> +       void *key = cfqd;
> +
> +       /*
> +        * If CGroup's use_hierarchy is unset, we just need to allocate only
> +        * one CFQ group, and this group will put onto the "grp_service_tree".
> +        * We don't need to check whether the cfqg exists, the caller has
> +        * already checked it.
> +        */
> +       blkcg = cgroup_to_blkio_cgroup(cgroup);
> +       if (!blkcg_get_use_hierarchy(blkcg)) {
> +               cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC,
> +                                   cfqd->queue->node);
> +               if (!cfqg)
> +                       return NULL;
> +
> +               init_cfqg(cfqd, blkcg, cfqg);
> +               cfqg_set_parent(cfqd, cfqg, NULL);
> +               return cfqg;
> +       }
> +
> +       /*
> +        * Allocate the CFQ group chain until we meet the group we'v already
> +        * allocated before, or to the CGroup whose use_hierarchy is not set.
> +        */
> +       leaf_cfqg = NULL;
> +       child_cfqg = NULL;
> +       for (; cgroup != NULL; cgroup = cgroup->parent) {
> +               blkcg = cgroup_to_blkio_cgroup(cgroup);
> +               cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
> +               if (cfqg) {
> +                       if (!cfqg->blkg.dev && bdi->dev &&
> +                           dev_name(bdi->dev)) {
> +                               sscanf(dev_name(bdi->dev), "%u:%u",
> +                                      &major, &minor);
> +                               cfqg->blkg.dev = MKDEV(major, minor);
> +                       }
> +
> +                       /*
> +                        * Initialization of parent doesn't finish yet, get
> +                        * it done.
> +                        */
> +                       if (child_cfqg) {
> +                               if (blkcg_get_use_hierarchy(blkcg))
> +                                       cfqg_set_parent(cfqd, child_cfqg,
> +                                                       cfqg);
> +                               else
> +                                       cfqg_set_parent(cfqd, child_cfqg,
> +                                                       NULL);
> +                       }
> +
> +                       /* chain has already been built */
> +                       break;
> +               }
> +
> +               /*
> +                * We only allocate a cfqg that the corresponding cgroup's
> +                * use_hierarchy is set.
> +                */
> +               if (blkcg_get_use_hierarchy(blkcg)) {
> +                       cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC,
> +                                           cfqd->queue->node);
> +                       if (!cfqg)
> +                               goto clean_up;
> +
> +                       if (!leaf_cfqg)
> +                               leaf_cfqg = cfqg;
> +
> +                       init_cfqg(cfqd, blkcg, cfqg);
> +               } else {
> +                       cfqg = NULL;
> +               }
> +
> +               if (child_cfqg)
> +                       cfqg_set_parent(cfqd, child_cfqg, cfqg);
> +
> +               /*
> +                * This CGroup's use_hierarchy isn't set, this means the CFQ
> +                * group chain has been built.
> +                */
> +               if (!blkcg_get_use_hierarchy(blkcg))
> +                       break;
> +
> +               child_cfqg = cfqg;
> +       }
> +
> +       return leaf_cfqg;
> +
> +clean_up:
> +       /* clean up the allocated cfq groups. */
> +       while (leaf_cfqg) {
> +               tmp_cfqg = leaf_cfqg;
> +               leaf_cfqg = cfqg_get_parent(leaf_cfqg);
> +               uninit_cfqg(cfqd, tmp_cfqg);
> +       }
> +
> +       return NULL;
> +}
> +
> +static struct cfq_group *
> +cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
> +{
> +       struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
> +       struct cfq_group *cfqg = NULL;
> +       void *key = cfqd;
> +       struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
> +       unsigned int major, minor;
> +
> +       cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
> +       if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
> +               sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
> +               cfqg->blkg.dev = MKDEV(major, minor);
> +               goto done;
> +       }
> +       if (cfqg || !create)
> +               goto done;
> +
> +       /*
> +        * Allocate CFQ group chain to the root group or we meet the CGroup
> +        * with use_hierarchy disabled.
> +        */
> +       cfqg = cfqg_chain_alloc(cfqd, cgroup);
>
>  done:
>        return cfqg;
> @@ -1148,6 +1398,7 @@ static void cfq_put_cfqg(struct cfq_group *cfqg)
>  {
>        struct cfq_rb_root *st;
>        int i, j;
> +       struct cfq_group *p_cfqg;
>
>        BUG_ON(cfqg->ref <= 0);
>        cfqg->ref--;
> @@ -1155,6 +1406,22 @@ static void cfq_put_cfqg(struct cfq_group *cfqg)
>                return;
>        for_each_cfqg_st(cfqg, i, j, st)
>                BUG_ON(!RB_EMPTY_ROOT(&st->rb));
> +
> +       do {
> +               p_cfqg = cfqg_get_parent(cfqg);
> +               kfree(cfqg);
> +               cfqg = NULL;
> +               /*
> +                * Drop the reference taken by children, if nobody references
> +                * parent group, we need delete the parent also.
> +                */
> +               if (p_cfqg) {
> +                       p_cfqg->ref--;
> +                       if (p_cfqg->ref == 0)
> +                               cfqg = p_cfqg;
> +               }
> +       } while (cfqg);
> +
>        kfree(cfqg);
>  }
>
> @@ -1321,9 +1588,6 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
>                         * ioprio.
>                         */
>                        pos_offset = cfq_get_boost(cfqd, cfqq);
> -                       /* Debug purpose, should remove. */
> -                       cfq_log_cfqq(cfqd, cfqq, "pos_offset: %llu",
> -                                    pos_offset);
>                        cfqe->vdisktime = service_tree->min_vdisktime +
>                                                pos_offset;
>                } else
> @@ -1365,9 +1629,8 @@ insert:
>        cfqe->service_tree = service_tree;
>
>        /* Add cfqq onto service tree. */
> +
>        cfq_entity_service_tree_add(service_tree, cfqe);
> -       update_min_vdisktime(service_tree);
> -       cfqq->reposition_time = jiffies;
>        if ((add_front || !new_cfqq) && !group_changed)
>                return;
>        cfq_group_service_tree_add(cfqd, cfqq->cfqg);
> @@ -1810,28 +2073,43 @@ static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
>        return cfqq_of_entity(cfq_rb_first(service_tree));
>  }
>
> -static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd)
> +struct cfq_rb_root *choose_service_tree_forced(struct cfq_group *cfqg)
>  {
> -       struct cfq_group *cfqg;
> -       struct cfq_entity *cfqe;
>        int i, j;
>        struct cfq_rb_root *st;
>
> -       if (!cfqd->rq_queued)
> -               return NULL;
> +       for_each_cfqg_st(cfqg, i, j, st) {
> +               if (st->count != 0)
> +                       return st;
> +       }
>
> -       cfqg = cfq_get_next_cfqg(cfqd);
> -       if (!cfqg)
> +       return NULL;
> +}
> +
> +static struct cfq_entity *
> +cfq_get_next_entity_forced(struct cfq_data *cfqd)
> +{
> +       struct cfq_entity *cfqe;
> +       struct cfq_rb_root *st = &cfqd->grp_service_tree;
> +       struct cfq_group *cfqg;
> +
> +       if (!cfqd->rq_queued)
>                return NULL;
>
> -       for_each_cfqg_st(cfqg, i, j, st) {
> +       do {
>                cfqe = cfq_rb_first(st);
> -               if (cfqe != NULL)
> -                       return cfqq_of_entity(cfqe);
> -       }
> +               if (cfqe && !cfqe->is_group_entity)
> +                       return cfqe;
> +               else if (cfqe && cfqe->is_group_entity)
> +                       cfqg = cfqg_of_entity(cfqe);
> +
> +               st = choose_service_tree_forced(cfqg);
> +       } while (st);
> +
>        return NULL;
>  }
>
> +
>  /*
>  * Get and set a new active qu
>
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
>

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH 5/6 v4] cfq-iosched: CFQ group hierarchical scheduling and use_hierarchy interface
  2011-02-17  0:31   ` Justin TerAvest
@ 2011-02-17  1:21     ` Gui Jianfeng
  2011-02-17 17:36       ` Justin TerAvest
  2011-02-17 10:39     ` Alan Cox
  1 sibling, 1 reply; 40+ messages in thread
From: Gui Jianfeng @ 2011-02-17  1:21 UTC (permalink / raw)
  To: Justin TerAvest
  Cc: Vivek Goyal, Jens Axboe, Shaohua Li, lkml, Chad Talbott, Divyesh Shah

Justin TerAvest wrote:
> After a quick read,
> 
> It's sad that we have to have so many use_hierarchy checks; it seems
> like we're asking for bugs, especially in the future when one codepath
> gets updated but not the other.
> 
> CodingStyle says we should only have one declaration per line.
> 
> I feel like there is an implicit assumption that groups and tasks
> should not be children of the same parent; that is, a group should
> contain only groups, or only tasks, but I don't see this enforced;
> there's just an assumption that BE:SYNC is "good enough" for that
> comparison. This smells like something that will be tweaked/tuned for
> fairness later. :( Why don't we just prevent this from happening?

Hi Justin,

Thanks for reviewing.

Previously, I posted a very first version that made a group contain only
groups or only tasks. But I think it's more flexible to treat groups and
tasks at the same level. I think Vivek and Jens have the same opinion.
We discussed this in the thread at http://lkml.org/lkml/2010/8/30/30

> 
> The clean_up label in chain_alloc() is strange; I don't think the goto
> is necessary at all. I found that method generally hard to understand.
> It's doing a lot.

I don't understand why clean_up isn't needed.
When we fail to allocate a cfq group at some level, we have to clean up
all groups in the chain that we have already allocated.

Thanks,
Gui

> 
> It's possible that some of these can't be worked around.
> 
> 
> On Wed, Feb 9, 2011 at 11:47 PM, Gui Jianfeng
> <guijianfeng@cn.fujitsu.com> wrote:
>> CFQ group hierarchical scheduling and use_hierarchy interface.
>>
>> Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
>> ---
>>  block/blk-cgroup.c  |   61 +++++-
>>  block/blk-cgroup.h  |    3 +
>>  block/cfq-iosched.c |  603 +++++++++++++++++++++++++++++++++++++--------------
>>  3 files changed, 500 insertions(+), 167 deletions(-)
>>
>> diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
>> index 455768a..c55fecd 100644
>> --- a/block/blk-cgroup.c
>> +++ b/block/blk-cgroup.c
>> @@ -25,7 +25,10 @@
>>  static DEFINE_SPINLOCK(blkio_list_lock);
>>  static LIST_HEAD(blkio_list);
>>
>> -struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT };
>> +struct blkio_cgroup blkio_root_cgroup = {
>> +       .weight = 2*BLKIO_WEIGHT_DEFAULT,
>> +       .use_hierarchy = 0
>> +};
>>  EXPORT_SYMBOL_GPL(blkio_root_cgroup);
>>
>>  static struct cgroup_subsys_state *blkiocg_create(struct cgroup_subsys *,
>> @@ -454,6 +457,7 @@ static void __blkiocg_del_blkio_group(struct blkio_group *blkg)
>>        blkg->blkcg_id = 0;
>>  }
>>
>> +
>>  /*
>>  * returns 0 if blkio_group was still on cgroup list. Otherwise returns 1
>>  * indicating that blk_group was unhashed by the time we got to it.
>> @@ -765,6 +769,12 @@ unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
>>  }
>>  EXPORT_SYMBOL_GPL(blkcg_get_weight);
>>
>> +unsigned int blkcg_get_use_hierarchy(struct blkio_cgroup *blkcg)
>> +{
>> +       return blkcg->use_hierarchy;
>> +}
>> +EXPORT_SYMBOL_GPL(blkcg_get_use_hierarchy);
>> +
>>  uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg, dev_t dev)
>>  {
>>        struct blkio_policy_node *pn;
>> @@ -1202,6 +1212,8 @@ static u64 blkiocg_file_read_u64 (struct cgroup *cgrp, struct cftype *cft) {
>>                switch(name) {
>>                case BLKIO_PROP_weight:
>>                        return (u64)blkcg->weight;
>> +               case BLKIO_PROP_use_hierarchy:
>> +                       return (u64)blkcg->use_hierarchy;
>>                }
>>                break;
>>        default:
>> @@ -1210,6 +1222,36 @@ static u64 blkiocg_file_read_u64 (struct cgroup *cgrp, struct cftype *cft) {
>>        return 0;
>>  }
>>
>> +static int blkio_use_hierarchy_write(struct cgroup *cgrp, u64 val)
>> +{
>> +       struct cgroup *parent = cgrp->parent;
>> +       struct blkio_cgroup *blkcg, *parent_blkcg = NULL;
>> +       int ret = 0;
>> +
>> +       if (val != 0 && val != 1)
>> +               return -EINVAL;
>> +
>> +       blkcg = cgroup_to_blkio_cgroup(cgrp);
>> +       if (parent)
>> +               parent_blkcg = cgroup_to_blkio_cgroup(parent);
>> +
>> +       cgroup_lock();
>> +       /*
>> +        * If parent's use_hierarchy is set, we can't make any modifications
>> +        * in the child subtrees. If it is unset, then the change can occur,
>> +        * provided the current cgroup has no children.
>> +        */
>> +       if (!parent_blkcg || !parent_blkcg->use_hierarchy) {
>> +               if (list_empty(&cgrp->children))
>> +                       blkcg->use_hierarchy = val;
>> +               else
>> +                       ret = -EBUSY;
>> +       } else
>> +               ret = -EINVAL;
>> +       cgroup_unlock();
>> +       return ret;
>> +}
>> +
>>  static int
>>  blkiocg_file_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
>>  {
>> @@ -1224,6 +1266,8 @@ blkiocg_file_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
>>                switch(name) {
>>                case BLKIO_PROP_weight:
>>                        return blkio_weight_write(blkcg, val);
>> +               case BLKIO_PROP_use_hierarchy:
>> +                       return blkio_use_hierarchy_write(cgrp, val);
>>                }
>>                break;
>>        default:
>> @@ -1301,6 +1345,13 @@ struct cftype blkio_files[] = {
>>                .name = "reset_stats",
>>                .write_u64 = blkiocg_reset_stats,
>>        },
>> +       {
>> +               .name = "use_hierarchy",
>> +               .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
>> +                                            BLKIO_PROP_use_hierarchy),
>> +               .read_u64 = blkiocg_file_read_u64,
>> +               .write_u64 = blkiocg_file_write_u64,
>> +       },
>>  #ifdef CONFIG_BLK_DEV_THROTTLING
>>        {
>>                .name = "throttle.read_bps_device",
>> @@ -1444,7 +1495,7 @@ static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup)
>>  static struct cgroup_subsys_state *
>>  blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup)
>>  {
>> -       struct blkio_cgroup *blkcg;
>> +       struct blkio_cgroup *blkcg, *parent_blkcg = NULL;
>>        struct cgroup *parent = cgroup->parent;
>>
>>        if (!parent) {
>> @@ -1452,6 +1503,7 @@ blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup)
>>                goto done;
>>        }
>>
>> +       parent_blkcg = cgroup_to_blkio_cgroup(parent);
>>        blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
>>        if (!blkcg)
>>                return ERR_PTR(-ENOMEM);
>> @@ -1462,6 +1514,11 @@ done:
>>        INIT_HLIST_HEAD(&blkcg->blkg_list);
>>
>>        INIT_LIST_HEAD(&blkcg->policy_list);
>> +       if (parent)
>> +               blkcg->use_hierarchy = parent_blkcg->use_hierarchy;
>> +       else
>> +               blkcg->use_hierarchy = 0;
>> +
>>        return &blkcg->css;
>>  }
>>
>> diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
>> index ea4861b..5b4b351 100644
>> --- a/block/blk-cgroup.h
>> +++ b/block/blk-cgroup.h
>> @@ -90,6 +90,7 @@ enum blkcg_file_name_prop {
>>        BLKIO_PROP_idle_time,
>>        BLKIO_PROP_empty_time,
>>        BLKIO_PROP_dequeue,
>> +       BLKIO_PROP_use_hierarchy,
>>  };
>>
>>  /* cgroup files owned by throttle policy */
>> @@ -105,6 +106,7 @@ enum blkcg_file_name_throtl {
>>  struct blkio_cgroup {
>>        struct cgroup_subsys_state css;
>>        unsigned int weight;
>> +       bool use_hierarchy;
>>        spinlock_t lock;
>>        struct hlist_head blkg_list;
>>        struct list_head policy_list; /* list of blkio_policy_node */
>> @@ -179,6 +181,7 @@ struct blkio_policy_node {
>>
>>  extern unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
>>                                     dev_t dev);
>> +extern unsigned int blkcg_get_use_hierarchy(struct blkio_cgroup *blkcg);
>>  extern uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg,
>>                                     dev_t dev);
>>  extern uint64_t blkcg_get_write_bps(struct blkio_cgroup *blkcg,
>> diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
>> index aa3eda8..0e21d27 100644
>> --- a/block/cfq-iosched.c
>> +++ b/block/cfq-iosched.c
>> @@ -110,6 +110,9 @@ struct cfq_entity {
>>        u64 vdisktime;
>>        bool is_group_entity;
>>        unsigned int weight;
>> +       struct cfq_entity *parent;
>> +       /* Reposition time */
>> +       unsigned long reposition_time;
>>  };
>>
>>  /*
>> @@ -118,8 +121,6 @@ struct cfq_entity {
>>  struct cfq_queue {
>>        /* The schedule entity */
>>        struct cfq_entity cfqe;
>> -       /* Reposition time */
>> -       unsigned long reposition_time;
>>        /* reference count */
>>        int ref;
>>        /* various state flags, see below */
>> @@ -199,6 +200,9 @@ struct cfq_group {
>>        /* number of cfqq currently on this group */
>>        int nr_cfqq;
>>
>> +       /* number of sub cfq groups */
>> +       int nr_subgp;
>> +
>>        /*
>>         * Per group busy queus average. Useful for workload slice calc. We
>>         * create the array for each prio class but at run time it is used
>> @@ -234,10 +238,11 @@ struct cfq_group {
>>  */
>>  struct cfq_data {
>>        struct request_queue *queue;
>> -       /* Root service tree for cfq_groups */
>> -       struct cfq_rb_root grp_service_tree;
>>        struct cfq_group root_group;
>>
>> +       /* cfq group schedule in flat or hierarchy manner. */
>> +       bool use_hierarchy;
>> +
>>        /*
>>         * The priority currently being served
>>         */
>> @@ -246,6 +251,9 @@ struct cfq_data {
>>        unsigned long workload_expires;
>>        struct cfq_group *serving_group;
>>
>> +       /* Service tree for cfq group flat scheduling mode. */
>> +       struct cfq_rb_root grp_service_tree;
>> +
>>        /*
>>         * Each priority tree is sorted by next_request position.  These
>>         * trees are used when determining if two or more queues are
>> @@ -355,8 +363,6 @@ cfqg_of_entity(struct cfq_entity *cfqe)
>>  }
>>
>>
>> -static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);
>> -
>>  static struct cfq_rb_root *service_tree_for(struct cfq_group *cfqg,
>>                                            enum wl_prio_t prio,
>>                                            enum wl_type_t type)
>> @@ -643,13 +649,50 @@ static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd,
>>        return cfqg->busy_queues_avg[rt];
>>  }
>>
>> +static inline unsigned int
>> +cfq_group_get_total_weight(struct cfq_group *cfqg)
>> +{
>> +       int i, j;
>> +       struct cfq_rb_root *st;
>> +       unsigned int total_weight = 0;
>> +
>> +       for_each_cfqg_st(cfqg, i, j, st) {
>> +               total_weight += st->total_weight;
>> +       }
>> +
>> +       return total_weight;
>> +}
>> +
>>  static inline unsigned
>>  cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg)
>>  {
>> -       struct cfq_rb_root *st = &cfqd->grp_service_tree;
>>        struct cfq_entity *cfqe = &cfqg->cfqe;
>> +       struct cfq_rb_root *st;
>> +       int group_slice = cfq_target_latency;
>> +       unsigned int grp_total_weight;
>> +       struct cfq_group *p_cfqg;
>> +
>> +       /*
>> +        * Calculate group slice in a hierarchical way.
>> +        * Note, the calculation is cross all service trees under a group.
>> +        */
>> +       do {
>> +               if (cfqe->parent) {
>> +                       p_cfqg = cfqg_of_entity(cfqe->parent);
>> +                       grp_total_weight = cfq_group_get_total_weight(p_cfqg);
>> +                       group_slice = group_slice * cfqe->weight /
>> +                                       grp_total_weight;
>> +               } else {
>> +                       /* For top level groups */
>> +                       st = cfqe->service_tree;
>> +                       group_slice = group_slice * cfqe->weight /
>> +                                       st->total_weight;
>> +               }
>>
>> -       return cfq_target_latency * cfqe->weight / st->total_weight;
>> +               cfqe = cfqe->parent;
>> +       } while (cfqe);
>> +
>> +       return group_slice;
>>  }
>>
>>  static inline void
>> @@ -672,7 +715,8 @@ cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
>>                        /* scale low_slice according to IO priority
>>                         * and sync vs async */
>>                        unsigned low_slice =
>> -                               min(slice, base_low_slice * slice / sync_slice);
>> +                               min(slice, base_low_slice * slice /
>> +                                   sync_slice);
>>                        /* the adapted slice value is scaled to fit all iqs
>>                         * into the target latency */
>>                        slice = max(slice * group_slice / expect_latency,
>> @@ -812,17 +856,6 @@ static struct cfq_entity *cfq_rb_first(struct cfq_rb_root *root)
>>        return NULL;
>>  }
>>
>> -static struct cfq_entity *cfq_rb_first_entity(struct cfq_rb_root *root)
>> -{
>> -       if (!root->left)
>> -               root->left = rb_first(&root->rb);
>> -
>> -       if (root->left)
>> -               return rb_entry_entity(root->left);
>> -
>> -       return NULL;
>> -}
>> -
>>  static void rb_erase_init(struct rb_node *n, struct rb_root *root)
>>  {
>>        rb_erase(n, root);
>> @@ -896,12 +929,15 @@ __cfq_entity_service_tree_add(struct cfq_rb_root *st, struct cfq_entity *cfqe)
>>
>>        rb_link_node(&cfqe->rb_node, parent, node);
>>        rb_insert_color(&cfqe->rb_node, &st->rb);
>> +
>> +       update_min_vdisktime(st);
>>  }
>>
>>  static void
>>  cfq_entity_service_tree_add(struct cfq_rb_root *st, struct cfq_entity *cfqe)
>>  {
>>        __cfq_entity_service_tree_add(st, cfqe);
>> +       cfqe->reposition_time = jiffies;
>>        st->count++;
>>        st->total_weight += cfqe->weight;
>>  }
>> @@ -909,34 +945,52 @@ cfq_entity_service_tree_add(struct cfq_rb_root *st, struct cfq_entity *cfqe)
>>  static void
>>  cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
>>  {
>> -       struct cfq_rb_root *st = &cfqd->grp_service_tree;
>>        struct cfq_entity *cfqe = &cfqg->cfqe;
>> -       struct cfq_entity *__cfqe;
>>        struct rb_node *n;
>> +       struct cfq_entity *entity;
>> +       struct cfq_rb_root *st;
>> +       struct cfq_group *__cfqg;
>>
>>        cfqg->nr_cfqq++;
>> +
>>        if (!RB_EMPTY_NODE(&cfqe->rb_node))
>>                return;
>>
>>        /*
>> -        * Currently put the group at the end. Later implement something
>> -        * so that groups get lesser vtime based on their weights, so that
>> -        * if group does not loose all if it was not continously backlogged.
>> +        * Enqueue this group and its ancestors onto their service tree.
>>         */
>> -       n = rb_last(&st->rb);
>> -       if (n) {
>> -               __cfqe = rb_entry_entity(n);
>> -               cfqe->vdisktime = __cfqe->vdisktime + CFQ_IDLE_DELAY;
>> -       } else
>> -               cfqe->vdisktime = st->min_vdisktime;
>> +       while (cfqe) {
>> +               if (!RB_EMPTY_NODE(&cfqe->rb_node))
>> +                       return;
>>
>> -       cfq_entity_service_tree_add(st, cfqe);
>> +               /*
>> +                * Currently put the group at the end. Later implement
>> +                * something so that groups get lesser vtime based on
>> +                * their weights, so that if group does not loose all
>> +                * if it was not continously backlogged.
>> +                */
>> +               st = cfqe->service_tree;
>> +               n = rb_last(&st->rb);
>> +               if (n) {
>> +                       entity = rb_entry_entity(n);
>> +                       cfqe->vdisktime = entity->vdisktime +
>> +                               CFQ_IDLE_DELAY;
>> +               } else
>> +                       cfqe->vdisktime = st->min_vdisktime;
>> +
>> +               cfq_entity_service_tree_add(st, cfqe);
>> +               cfqe = cfqe->parent;
>> +               __cfqg = cfqg_of_entity(cfqe);
>> +               if (__cfqg)
>> +                       __cfqg->nr_subgp++;
>> +       }
>>  }
>>
>>  static void
>>  __cfq_entity_service_tree_del(struct cfq_rb_root *st, struct cfq_entity *cfqe)
>>  {
>>        cfq_rb_erase(&cfqe->rb_node, st);
>> +       update_min_vdisktime(st);
>>  }
>>
>>  static void
>> @@ -945,27 +999,43 @@ cfq_entity_service_tree_del(struct cfq_rb_root *st, struct cfq_entity *cfqe)
>>        if (!RB_EMPTY_NODE(&cfqe->rb_node)) {
>>                __cfq_entity_service_tree_del(st, cfqe);
>>                st->total_weight -= cfqe->weight;
>> -               cfqe->service_tree = NULL;
>>        }
>>  }
>>
>>  static void
>>  cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
>>  {
>> -       struct cfq_rb_root *st = &cfqd->grp_service_tree;
>>        struct cfq_entity *cfqe = &cfqg->cfqe;
>> +       struct cfq_group *__cfqg, *p_cfqg;
>>
>>        BUG_ON(cfqg->nr_cfqq < 1);
>>        cfqg->nr_cfqq--;
>>
>> -       /* If there are other cfq queues under this group, don't delete it */
>> -       if (cfqg->nr_cfqq)
>> +       /*
>> +        * If there are other cfq queues under this group, or there are other
>> +        * cfq groups under this group, don't delete it.
>> +        */
>> +       if (cfqg->nr_cfqq || cfqg->nr_subgp)
>>                return;
>>
>> -       cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
>> -       cfq_entity_service_tree_del(st, cfqe);
>> -       cfqg->saved_workload_slice = 0;
>> -       cfq_blkiocg_update_dequeue_stats(&cfqg->blkg, 1);
>> +       /*
>> +        * Dequeue this group and its ancestors from their service
>> +        * tree.
>> +        */
>> +       while (cfqe) {
>> +               __cfqg = cfqg_of_entity(cfqe);
>> +               p_cfqg = cfqg_of_entity(cfqe->parent);
>> +               cfq_entity_service_tree_del(cfqe->service_tree, cfqe);
>> +               cfq_blkiocg_update_dequeue_stats(&__cfqg->blkg, 1);
>> +               cfq_log_cfqg(cfqd, __cfqg, "del_from_rr group");
>> +               __cfqg->saved_workload_slice = 0;
>> +               cfqe = cfqe->parent;
>> +               if (p_cfqg) {
>> +                       p_cfqg->nr_subgp--;
>> +                       if (p_cfqg->nr_cfqq || p_cfqg->nr_subgp)
>> +                               return;
>> +               }
>> +       }
>>  }
>>
>>  static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq)
>> @@ -997,7 +1067,6 @@ static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq)
>>  static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
>>                                struct cfq_queue *cfqq)
>>  {
>> -       struct cfq_rb_root *st = &cfqd->grp_service_tree;
>>        unsigned int used_sl, charge;
>>        int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg)
>>                        - cfqg->service_tree_idle.count;
>> @@ -1011,10 +1080,23 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
>>        else if (!cfq_cfqq_sync(cfqq) && !nr_sync)
>>                charge = cfqq->allocated_slice;
>>
>> -       /* Can't update vdisktime while group is on service tree */
>> -       __cfq_entity_service_tree_del(st, cfqe);
>> -       cfqe->vdisktime += cfq_scale_slice(charge, cfqe);
>> -       __cfq_entity_service_tree_add(st, cfqe);
>> +       /*
>> +        * Update the vdisktime on the whole chain.
>> +        */
>> +       while (cfqe) {
>> +               struct cfq_rb_root *st = cfqe->service_tree;
>> +
>> +               /*
>> +                * Can't update vdisktime while group is on service
>> +                * tree.
>> +                */
>> +               __cfq_entity_service_tree_del(st, cfqe);
>> +               cfqe->vdisktime += cfq_scale_slice(charge, cfqe);
>> +               __cfq_entity_service_tree_add(st, cfqe);
>> +               st->count++;
>> +               cfqe->reposition_time = jiffies;
>> +               cfqe = cfqe->parent;
>> +       }
>>
>>        /* This group is being expired. Save the context */
>>        if (time_after(cfqd->workload_expires, jiffies)) {
>> @@ -1026,7 +1108,8 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
>>                cfqg->saved_workload_slice = 0;
>>
>>        cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu",
>> -                    cfqe->vdisktime, st->min_vdisktime);
>> +                    cfqg->cfqe.vdisktime,
>> +                    cfqg->cfqe.service_tree->min_vdisktime);
>>        cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u disp=%u charge=%u iops=%u"
>>                        " sect=%u", used_sl, cfqq->slice_dispatch, charge,
>>                        iops_mode(cfqd), cfqq->nr_sectors);
>> @@ -1048,35 +1131,27 @@ void cfq_update_blkio_group_weight(void *key, struct blkio_group *blkg,
>>        cfqg_of_blkg(blkg)->cfqe.weight = weight;
>>  }
>>
>> -static struct cfq_group *
>> -cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
>> +static void init_cfqe(struct blkio_cgroup *blkcg,
>> +                                   struct cfq_group *cfqg)
>> +{
>> +       struct cfq_entity *cfqe = &cfqg->cfqe;
>> +
>> +       cfqe->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
>> +       RB_CLEAR_NODE(&cfqe->rb_node);
>> +       cfqe->is_group_entity = true;
>> +       cfqe->parent = NULL;
>> +}
>> +
>> +static void init_cfqg(struct cfq_data *cfqd, struct blkio_cgroup *blkcg,
>> +                     struct cfq_group *cfqg)
>>  {
>> -       struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
>> -       struct cfq_group *cfqg = NULL;
>> -       void *key = cfqd;
>>        int i, j;
>>        struct cfq_rb_root *st;
>> -       struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
>>        unsigned int major, minor;
>> -
>> -       cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
>> -       if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
>> -               sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
>> -               cfqg->blkg.dev = MKDEV(major, minor);
>> -               goto done;
>> -       }
>> -       if (cfqg || !create)
>> -               goto done;
>> -
>> -       cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node);
>> -       if (!cfqg)
>> -               goto done;
>> +       struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
>>
>>        for_each_cfqg_st(cfqg, i, j, st)
>>                *st = CFQ_RB_ROOT;
>> -       RB_CLEAR_NODE(&cfqg->cfqe.rb_node);
>> -
>> -       cfqg->cfqe.is_group_entity = true;
>>
>>        /*
>>         * Take the initial reference that will be released on destroy
>> @@ -1086,24 +1161,199 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
>>         */
>>        cfqg->ref = 1;
>>
>> +       /* Add group onto cgroup list */
>> +       sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
>> +       cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
>> +                                   MKDEV(major, minor));
>> +       /* Initiate group entity */
>> +       init_cfqe(blkcg, cfqg);
>> +       /* Add group on cfqd list */
>> +       hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
>> +}
>> +
>> +static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg);
>> +
>> +static void uninit_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg)
>> +{
>> +       if (!cfq_blkiocg_del_blkio_group(&cfqg->blkg))
>> +               cfq_destroy_cfqg(cfqd, cfqg);
>> +}
>> +
>> +static void cfqg_set_parent(struct cfq_data *cfqd, struct cfq_group *cfqg,
>> +                           struct cfq_group *p_cfqg)
>> +{
>> +       struct cfq_entity *cfqe, *p_cfqe;
>> +
>> +       cfqe = &cfqg->cfqe;
>> +
>>        /*
>> -        * Add group onto cgroup list. It might happen that bdi->dev is
>> -        * not initiliazed yet. Initialize this new group without major
>> -        * and minor info and this info will be filled in once a new thread
>> -        * comes for IO. See code above.
>> +        * 1. If use_hierarchy of the CGroup where cfqg's parent stays is not
>> +        *    set, we put this cfqg onto global service tree.
>> +        * 2. If cfqg is root cfqg, put it onto global service tree.
>>         */
>> -       if (bdi->dev) {
>> -               sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
>> -               cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
>> -                                       MKDEV(major, minor));
>> -       } else
>> -               cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
>> -                                       0);
>> +       if (!p_cfqg) {
>> +               cfqe->service_tree = &cfqd->grp_service_tree;
>> +               cfqe->parent = NULL;
>> +               return;
>> +       }
>>
>> -       cfqg->cfqe.weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
>> +       p_cfqe = &p_cfqg->cfqe;
>>
>> -       /* Add group on cfqd list */
>> -       hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
>> +       cfqe->parent = p_cfqe;
>> +
>> +       /*
>> +        * Currently, just put cfq group entity on "BE:SYNC" workload
>> +        * service tree.
>> +        */
>> +       cfqe->service_tree = service_tree_for(p_cfqg, BE_WORKLOAD,
>> +                                                     SYNC_WORKLOAD);
>> +       /* child reference */
>> +       p_cfqg->ref++;
>> +}
>> +
>> +static struct cfq_group *cfqg_get_parent(struct cfq_group * cfqg)
>> +{
>> +       struct cfq_entity *cfqe, *p_cfqe;
>> +
>> +       if (!cfqg)
>> +               return NULL;
>> +
>> +       cfqe = &cfqg->cfqe;
>> +       p_cfqe = cfqe->parent;
>> +       if (!p_cfqe)
>> +               return NULL;
>> +
>> +       return cfqg_of_entity(p_cfqe);
>> +}
>> +
>> +static struct cfq_group *
>> +cfqg_chain_alloc(struct cfq_data *cfqd, struct cgroup *cgroup)
>> +{
>> +       struct blkio_cgroup *blkcg;
>> +       struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
>> +       unsigned int major, minor;
>> +       struct cfq_group *cfqg, *leaf_cfqg, *child_cfqg, *tmp_cfqg;
>> +       void *key = cfqd;
>> +
>> +       /*
>> +        * If CGroup's use_hierarchy is unset, we just need to allocate only
>> +        * one CFQ group, and this group will put onto the "grp_service_tree".
>> +        * We don't need to check whether the cfqg exists, the caller has
>> +        * already checked it.
>> +        */
>> +       blkcg = cgroup_to_blkio_cgroup(cgroup);
>> +       if (!blkcg_get_use_hierarchy(blkcg)) {
>> +               cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC,
>> +                                   cfqd->queue->node);
>> +               if (!cfqg)
>> +                       return NULL;
>> +
>> +               init_cfqg(cfqd, blkcg, cfqg);
>> +               cfqg_set_parent(cfqd, cfqg, NULL);
>> +               return cfqg;
>> +       }
>> +
>> +       /*
>> +        * Allocate the CFQ group chain until we meet the group we'v already
>> +        * allocated before, or to the CGroup whose use_hierarchy is not set.
>> +        */
>> +       leaf_cfqg = NULL;
>> +       child_cfqg = NULL;
>> +       for (; cgroup != NULL; cgroup = cgroup->parent) {
>> +               blkcg = cgroup_to_blkio_cgroup(cgroup);
>> +               cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
>> +               if (cfqg) {
>> +                       if (!cfqg->blkg.dev && bdi->dev &&
>> +                           dev_name(bdi->dev)) {
>> +                               sscanf(dev_name(bdi->dev), "%u:%u",
>> +                                      &major, &minor);
>> +                               cfqg->blkg.dev = MKDEV(major, minor);
>> +                       }
>> +
>> +                       /*
>> +                        * Initialization of parent doesn't finish yet, get
>> +                        * it done.
>> +                        */
>> +                       if (child_cfqg) {
>> +                               if (blkcg_get_use_hierarchy(blkcg))
>> +                                       cfqg_set_parent(cfqd, child_cfqg,
>> +                                                       cfqg);
>> +                               else
>> +                                       cfqg_set_parent(cfqd, child_cfqg,
>> +                                                       NULL);
>> +                       }
>> +
>> +                       /* chain has already been built */
>> +                       break;
>> +               }
>> +
>> +               /*
>> +                * We only allocate a cfqg that the corresponding cgroup's
>> +                * use_hierarchy is set.
>> +                */
>> +               if (blkcg_get_use_hierarchy(blkcg)) {
>> +                       cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC,
>> +                                           cfqd->queue->node);
>> +                       if (!cfqg)
>> +                               goto clean_up;
>> +
>> +                       if (!leaf_cfqg)
>> +                               leaf_cfqg = cfqg;
>> +
>> +                       init_cfqg(cfqd, blkcg, cfqg);
>> +               } else {
>> +                       cfqg = NULL;
>> +               }
>> +
>> +               if (child_cfqg)
>> +                       cfqg_set_parent(cfqd, child_cfqg, cfqg);
>> +
>> +               /*
>> +                * This CGroup's use_hierarchy isn't set, this means the CFQ
>> +                * group chain has been built.
>> +                */
>> +               if (!blkcg_get_use_hierarchy(blkcg))
>> +                       break;
>> +
>> +               child_cfqg = cfqg;
>> +       }
>> +
>> +       return leaf_cfqg;
>> +
>> +clean_up:
>> +       /* clean up the allocated cfq groups. */
>> +       while (leaf_cfqg) {
>> +               tmp_cfqg = leaf_cfqg;
>> +               leaf_cfqg = cfqg_get_parent(leaf_cfqg);
>> +               uninit_cfqg(cfqd, tmp_cfqg);
>> +       }
>> +
>> +       return NULL;
>> +}
>> +
>> +static struct cfq_group *
>> +cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
>> +{
>> +       struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
>> +       struct cfq_group *cfqg = NULL;
>> +       void *key = cfqd;
>> +       struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
>> +       unsigned int major, minor;
>> +
>> +       cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
>> +       if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
>> +               sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
>> +               cfqg->blkg.dev = MKDEV(major, minor);
>> +               goto done;
>> +       }
>> +       if (cfqg || !create)
>> +               goto done;
>> +
>> +       /*
>> +        * Allocate CFQ group chain to the root group or we meet the CGroup
>> +        * with use_hierarchy disabled.
>> +        */
>> +       cfqg = cfqg_chain_alloc(cfqd, cgroup);
>>
>>  done:
>>        return cfqg;
>> @@ -1148,6 +1398,7 @@ static void cfq_put_cfqg(struct cfq_group *cfqg)
>>  {
>>        struct cfq_rb_root *st;
>>        int i, j;
>> +       struct cfq_group *p_cfqg;
>>
>>        BUG_ON(cfqg->ref <= 0);
>>        cfqg->ref--;
>> @@ -1155,6 +1406,22 @@ static void cfq_put_cfqg(struct cfq_group *cfqg)
>>                return;
>>        for_each_cfqg_st(cfqg, i, j, st)
>>                BUG_ON(!RB_EMPTY_ROOT(&st->rb));
>> +
>> +       do {
>> +               p_cfqg = cfqg_get_parent(cfqg);
>> +               kfree(cfqg);
>> +               cfqg = NULL;
>> +               /*
>> +                * Drop the reference taken by children, if nobody references
>> +                * parent group, we need delete the parent also.
>> +                */
>> +               if (p_cfqg) {
>> +                       p_cfqg->ref--;
>> +                       if (p_cfqg->ref == 0)
>> +                               cfqg = p_cfqg;
>> +               }
>> +       } while (cfqg);
>> +
>>        kfree(cfqg);
>>  }
>>
>> @@ -1321,9 +1588,6 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
>>                         * ioprio.
>>                         */
>>                        pos_offset = cfq_get_boost(cfqd, cfqq);
>> -                       /* Debug purpose, should remove. */
>> -                       cfq_log_cfqq(cfqd, cfqq, "pos_offset: %llu",
>> -                                    pos_offset);
>>                        cfqe->vdisktime = service_tree->min_vdisktime +
>>                                                pos_offset;
>>                } else
>> @@ -1365,9 +1629,8 @@ insert:
>>        cfqe->service_tree = service_tree;
>>
>>        /* Add cfqq onto service tree. */
>> +
>>        cfq_entity_service_tree_add(service_tree, cfqe);
>> -       update_min_vdisktime(service_tree);
>> -       cfqq->reposition_time = jiffies;
>>        if ((add_front || !new_cfqq) && !group_changed)
>>                return;
>>        cfq_group_service_tree_add(cfqd, cfqq->cfqg);
>> @@ -1810,28 +2073,43 @@ static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
>>        return cfqq_of_entity(cfq_rb_first(service_tree));
>>  }
>>
>> -static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd)
>> +struct cfq_rb_root *choose_service_tree_forced(struct cfq_group *cfqg)
>>  {
>> -       struct cfq_group *cfqg;
>> -       struct cfq_entity *cfqe;
>>        int i, j;
>>        struct cfq_rb_root *st;
>>
>> -       if (!cfqd->rq_queued)
>> -               return NULL;
>> +       for_each_cfqg_st(cfqg, i, j, st) {
>> +               if (st->count != 0)
>> +                       return st;
>> +       }
>>
>> -       cfqg = cfq_get_next_cfqg(cfqd);
>> -       if (!cfqg)
>> +       return NULL;
>> +}
>> +
>> +static struct cfq_entity *
>> +cfq_get_next_entity_forced(struct cfq_data *cfqd)
>> +{
>> +       struct cfq_entity *cfqe;
>> +       struct cfq_rb_root *st = &cfqd->grp_service_tree;
>> +       struct cfq_group *cfqg;
>> +
>> +       if (!cfqd->rq_queued)
>>                return NULL;
>>
>> -       for_each_cfqg_st(cfqg, i, j, st) {
>> +       do {
>>                cfqe = cfq_rb_first(st);
>> -               if (cfqe != NULL)
>> -                       return cfqq_of_entity(cfqe);
>> -       }
>> +               if (cfqe && !cfqe->is_group_entity)
>> +                       return cfqe;
>> +               else if (cfqe && cfqe->is_group_entity)
>> +                       cfqg = cfqg_of_entity(cfqe);
>> +
>> +               st = choose_service_tree_forced(cfqg);
>> +       } while (st);
>> +
>>        return NULL;
>>  }
>>
>> +
>>  /*
>>  * Get and set a new active qu
>>
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>> Please read the FAQ at  http://www.tux.org/lkml/
>>
> 

-- 
Regards
Gui Jianfeng

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH 5/6 v4] cfq-iosched: CFQ group hierarchical scheduling and use_hierarchy interface
  2011-02-16 14:17               ` Vivek Goyal
@ 2011-02-17  1:22                 ` Gui Jianfeng
  0 siblings, 0 replies; 40+ messages in thread
From: Gui Jianfeng @ 2011-02-17  1:22 UTC (permalink / raw)
  To: Vivek Goyal; +Cc: Jens Axboe, Shaohua Li, lkml, Chad Talbott, Divyesh Shah

Vivek Goyal wrote:
> On Wed, Feb 16, 2011 at 09:44:39AM +0800, Gui Jianfeng wrote:
>> Vivek Goyal wrote:
>>> On Tue, Feb 15, 2011 at 10:38:32AM +0800, Gui Jianfeng wrote:
>>>> Vivek Goyal wrote:
>>>>> On Sat, Feb 12, 2011 at 10:21:47AM +0800, Gui Jianfeng wrote:
>>>>> [..]
>>>>>>>> +static struct cfq_group *
>>>>>>>> +cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
>>>>>>>> +{
>>>>>>>> +	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
>>>>>>>> +	struct cfq_group *cfqg = NULL;
>>>>>>>> +	void *key = cfqd;
>>>>>>>> +	struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
>>>>>>>> +	unsigned int major, minor;
>>>>>>>> +
>>>>>>>> +	cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
>>>>>>>> +	if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
>>>>>>>> +		sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
>>>>>>>> +		cfqg->blkg.dev = MKDEV(major, minor);
>>>>>>>> +		goto done;
>>>>>>>> +	}
>>>>>>> Should we make the update of this info hierarchical?
>>>>>> IMHO, it's fine to defer the updation when we really get the cfqg.
>>>>> But if cfqg is already present, we will never hit the allocation path
>>>>> again. So if somebody creates 2-3 level deep hierarchy and does IO
>>>>> only in the children cgroup, parent cgroups will potentially not get
>>>>> device info updated hence no visible stats?
>>>> Ahh, I see your concern. But do we really need to show the stats even if
>>>> a cgroup doesn't issue any IO on a given device? 
>>> I am assuming that once use_hierarchy=1, you are aggregating the stats
>>> in parent cgroups? So if a child services 5 IOs, these are accounted
>>> to parent group also when user_hier=1?
>>>
>>> What happens in the case of the memory cgroup controller?
>> Hmm, it seems memcg aggregates stats in the parent group.
>> But do we really need to do that in kernel? I think it's easier to do it in
>> userland, and it makes kernel much simpler.
> 
> I think at some point of time hierarchical aggregated stats will also be
> required. I am also looking at "memory.stat" file of memory controller
> and they seem to be reporting both aggregated as well as individual group
> stats.
> 
> So we can probably skip implementing hierarchical stats in this patchset
> and implement it on a need basis in future.

Ok, I agree.

Thanks,
Gui

> 
> Thanks
> Vivek
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
> 

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH 5/6 v4] cfq-iosched: CFQ group hierarchical scheduling and use_hierarchy interface
  2011-02-17  0:31   ` Justin TerAvest
  2011-02-17  1:21     ` Gui Jianfeng
@ 2011-02-17 10:39     ` Alan Cox
  1 sibling, 0 replies; 40+ messages in thread
From: Alan Cox @ 2011-02-17 10:39 UTC (permalink / raw)
  To: Justin TerAvest
  Cc: Gui Jianfeng, Vivek Goyal, Jens Axboe, Shaohua Li, lkml,
	Chad Talbott, Divyesh Shah

> CodingStyle says we should only have one declaration per line.

CodingStyle is a guide, not a religious document over which common-sense
deviations should be sacrificed.


^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH 5/6 v4] cfq-iosched: CFQ group hierarchical scheduling and use_hierarchy interface
  2011-02-17  1:21     ` Gui Jianfeng
@ 2011-02-17 17:36       ` Justin TerAvest
  2011-02-18  1:14         ` Gui Jianfeng
  0 siblings, 1 reply; 40+ messages in thread
From: Justin TerAvest @ 2011-02-17 17:36 UTC (permalink / raw)
  To: Gui Jianfeng
  Cc: Vivek Goyal, Jens Axboe, Shaohua Li, lkml, Chad Talbott, Divyesh Shah

On Wed, Feb 16, 2011 at 5:21 PM, Gui Jianfeng
<guijianfeng@cn.fujitsu.com> wrote:
> Justin TerAvest wrote:
>> After a quick read,
>>
>> It's sad that we have to have so many use_hierarchy checks; it seems
>> like we're asking for bugs, especially in the future when one codepath
>> gets updated but not the other.
>>
>> CodingStyle says we should only have one declaration per line.
>>
>> I feel like there is an implicit assumption that groups and tasks
>> should not be children of the same parent; that is, a group should
>> contain only groups, or only tasks, but I don't see this enforced;
>> there's just an assumption that BE:SYNC is "good enough" for that
>> comparison. This smells like something that will be tweaked/tuned for
>> fairness later. :( Why don't we just prevent this from happening?
>
> Hi Justin,
>
> Thanks for reviewing.
>
> Previously, I posted very first version that makes a group containing only
> groups or only tasks. But I think it's more flexible to treat groups and
> tasks at the same level. I think Vivek and Jens have the same opinion.
> We had discussed in this thread http://lkml.org/lkml/2010/8/30/30

Hi Gui,
Thanks for pointing me at the earlier discussion, the decisions make a
lot more sense now.

>
>>
>> The clean_up label in chain_alloc() is strange; I don't think the goto
>> is necessary at all. I found that method generally hard to understand.
>> It's doing a lot.
>
> I don't understand why clean_up isn't needed.
> When we fail to allocate a cfq group at some level, we have to clean up
> all groups in the chain that we have already allocated.

Cleaning up is necessary, but the label is only used from one place.
Why add the goto and the label when the code below "clean_up" can just
be moved inside the following condition?
+                       if (!cfqg)



Thanks,
Justin

>
> Thanks,
> Gui
>
>>
>> It's possible that some of these can't be worked around.
>>
>>
>> On Wed, Feb 9, 2011 at 11:47 PM, Gui Jianfeng
>> <guijianfeng@cn.fujitsu.com> wrote:
>>> CFQ group hierarchical scheduling and use_hierarchy interface.
>>>
>>> Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
>>> ---
>>>  block/blk-cgroup.c  |   61 +++++-
>>>  block/blk-cgroup.h  |    3 +
>>>  block/cfq-iosched.c |  603 +++++++++++++++++++++++++++++++++++++--------------
>>>  3 files changed, 500 insertions(+), 167 deletions(-)
>>>
>>> diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
>>> index 455768a..c55fecd 100644
>>> --- a/block/blk-cgroup.c
>>> +++ b/block/blk-cgroup.c
>>> @@ -25,7 +25,10 @@
>>>  static DEFINE_SPINLOCK(blkio_list_lock);
>>>  static LIST_HEAD(blkio_list);
>>>
>>> -struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT };
>>> +struct blkio_cgroup blkio_root_cgroup = {
>>> +       .weight = 2*BLKIO_WEIGHT_DEFAULT,
>>> +       .use_hierarchy = 0
>>> +};
>>>  EXPORT_SYMBOL_GPL(blkio_root_cgroup);
>>>
>>>  static struct cgroup_subsys_state *blkiocg_create(struct cgroup_subsys *,
>>> @@ -454,6 +457,7 @@ static void __blkiocg_del_blkio_group(struct blkio_group *blkg)
>>>        blkg->blkcg_id = 0;
>>>  }
>>>
>>> +
>>>  /*
>>>  * returns 0 if blkio_group was still on cgroup list. Otherwise returns 1
>>>  * indicating that blk_group was unhashed by the time we got to it.
>>> @@ -765,6 +769,12 @@ unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
>>>  }
>>>  EXPORT_SYMBOL_GPL(blkcg_get_weight);
>>>
>>> +unsigned int blkcg_get_use_hierarchy(struct blkio_cgroup *blkcg)
>>> +{
>>> +       return blkcg->use_hierarchy;
>>> +}
>>> +EXPORT_SYMBOL_GPL(blkcg_get_use_hierarchy);
>>> +
>>>  uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg, dev_t dev)
>>>  {
>>>        struct blkio_policy_node *pn;
>>> @@ -1202,6 +1212,8 @@ static u64 blkiocg_file_read_u64 (struct cgroup *cgrp, struct cftype *cft) {
>>>                switch(name) {
>>>                case BLKIO_PROP_weight:
>>>                        return (u64)blkcg->weight;
>>> +               case BLKIO_PROP_use_hierarchy:
>>> +                       return (u64)blkcg->use_hierarchy;
>>>                }
>>>                break;
>>>        default:
>>> @@ -1210,6 +1222,36 @@ static u64 blkiocg_file_read_u64 (struct cgroup *cgrp, struct cftype *cft) {
>>>        return 0;
>>>  }
>>>
>>> +static int blkio_use_hierarchy_write(struct cgroup *cgrp, u64 val)
>>> +{
>>> +       struct cgroup *parent = cgrp->parent;
>>> +       struct blkio_cgroup *blkcg, *parent_blkcg = NULL;
>>> +       int ret = 0;
>>> +
>>> +       if (val != 0 && val != 1)
>>> +               return -EINVAL;
>>> +
>>> +       blkcg = cgroup_to_blkio_cgroup(cgrp);
>>> +       if (parent)
>>> +               parent_blkcg = cgroup_to_blkio_cgroup(parent);
>>> +
>>> +       cgroup_lock();
>>> +       /*
>>> +        * If parent's use_hierarchy is set, we can't make any modifications
>>> +        * in the child subtrees. If it is unset, then the change can occur,
>>> +        * provided the current cgroup has no children.
>>> +        */
>>> +       if (!parent_blkcg || !parent_blkcg->use_hierarchy) {
>>> +               if (list_empty(&cgrp->children))
>>> +                       blkcg->use_hierarchy = val;
>>> +               else
>>> +                       ret = -EBUSY;
>>> +       } else
>>> +               ret = -EINVAL;
>>> +       cgroup_unlock();
>>> +       return ret;
>>> +}
>>> +
>>>  static int
>>>  blkiocg_file_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
>>>  {
>>> @@ -1224,6 +1266,8 @@ blkiocg_file_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
>>>                switch(name) {
>>>                case BLKIO_PROP_weight:
>>>                        return blkio_weight_write(blkcg, val);
>>> +               case BLKIO_PROP_use_hierarchy:
>>> +                       return blkio_use_hierarchy_write(cgrp, val);
>>>                }
>>>                break;
>>>        default:
>>> @@ -1301,6 +1345,13 @@ struct cftype blkio_files[] = {
>>>                .name = "reset_stats",
>>>                .write_u64 = blkiocg_reset_stats,
>>>        },
>>> +       {
>>> +               .name = "use_hierarchy",
>>> +               .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
>>> +                                            BLKIO_PROP_use_hierarchy),
>>> +               .read_u64 = blkiocg_file_read_u64,
>>> +               .write_u64 = blkiocg_file_write_u64,
>>> +       },
>>>  #ifdef CONFIG_BLK_DEV_THROTTLING
>>>        {
>>>                .name = "throttle.read_bps_device",
>>> @@ -1444,7 +1495,7 @@ static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup)
>>>  static struct cgroup_subsys_state *
>>>  blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup)
>>>  {
>>> -       struct blkio_cgroup *blkcg;
>>> +       struct blkio_cgroup *blkcg, *parent_blkcg = NULL;
>>>        struct cgroup *parent = cgroup->parent;
>>>
>>>        if (!parent) {
>>> @@ -1452,6 +1503,7 @@ blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup)
>>>                goto done;
>>>        }
>>>
>>> +       parent_blkcg = cgroup_to_blkio_cgroup(parent);
>>>        blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
>>>        if (!blkcg)
>>>                return ERR_PTR(-ENOMEM);
>>> @@ -1462,6 +1514,11 @@ done:
>>>        INIT_HLIST_HEAD(&blkcg->blkg_list);
>>>
>>>        INIT_LIST_HEAD(&blkcg->policy_list);
>>> +       if (parent)
>>> +               blkcg->use_hierarchy = parent_blkcg->use_hierarchy;
>>> +       else
>>> +               blkcg->use_hierarchy = 0;
>>> +
>>>        return &blkcg->css;
>>>  }
>>>
>>> diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
>>> index ea4861b..5b4b351 100644
>>> --- a/block/blk-cgroup.h
>>> +++ b/block/blk-cgroup.h
>>> @@ -90,6 +90,7 @@ enum blkcg_file_name_prop {
>>>        BLKIO_PROP_idle_time,
>>>        BLKIO_PROP_empty_time,
>>>        BLKIO_PROP_dequeue,
>>> +       BLKIO_PROP_use_hierarchy,
>>>  };
>>>
>>>  /* cgroup files owned by throttle policy */
>>> @@ -105,6 +106,7 @@ enum blkcg_file_name_throtl {
>>>  struct blkio_cgroup {
>>>        struct cgroup_subsys_state css;
>>>        unsigned int weight;
>>> +       bool use_hierarchy;
>>>        spinlock_t lock;
>>>        struct hlist_head blkg_list;
>>>        struct list_head policy_list; /* list of blkio_policy_node */
>>> @@ -179,6 +181,7 @@ struct blkio_policy_node {
>>>
>>>  extern unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
>>>                                     dev_t dev);
>>> +extern unsigned int blkcg_get_use_hierarchy(struct blkio_cgroup *blkcg);
>>>  extern uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg,
>>>                                     dev_t dev);
>>>  extern uint64_t blkcg_get_write_bps(struct blkio_cgroup *blkcg,
>>> diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
>>> index aa3eda8..0e21d27 100644
>>> --- a/block/cfq-iosched.c
>>> +++ b/block/cfq-iosched.c
>>> @@ -110,6 +110,9 @@ struct cfq_entity {
>>>        u64 vdisktime;
>>>        bool is_group_entity;
>>>        unsigned int weight;
>>> +       struct cfq_entity *parent;
>>> +       /* Reposition time */
>>> +       unsigned long reposition_time;
>>>  };
>>>
>>>  /*
>>> @@ -118,8 +121,6 @@ struct cfq_entity {
>>>  struct cfq_queue {
>>>        /* The schedule entity */
>>>        struct cfq_entity cfqe;
>>> -       /* Reposition time */
>>> -       unsigned long reposition_time;
>>>        /* reference count */
>>>        int ref;
>>>        /* various state flags, see below */
>>> @@ -199,6 +200,9 @@ struct cfq_group {
>>>        /* number of cfqq currently on this group */
>>>        int nr_cfqq;
>>>
>>> +       /* number of sub cfq groups */
>>> +       int nr_subgp;
>>> +
>>>        /*
>>>         * Per group busy queus average. Useful for workload slice calc. We
>>>         * create the array for each prio class but at run time it is used
>>> @@ -234,10 +238,11 @@ struct cfq_group {
>>>  */
>>>  struct cfq_data {
>>>        struct request_queue *queue;
>>> -       /* Root service tree for cfq_groups */
>>> -       struct cfq_rb_root grp_service_tree;
>>>        struct cfq_group root_group;
>>>
>>> +       /* cfq group schedule in flat or hierarchy manner. */
>>> +       bool use_hierarchy;
>>> +
>>>        /*
>>>         * The priority currently being served
>>>         */
>>> @@ -246,6 +251,9 @@ struct cfq_data {
>>>        unsigned long workload_expires;
>>>        struct cfq_group *serving_group;
>>>
>>> +       /* Service tree for cfq group flat scheduling mode. */
>>> +       struct cfq_rb_root grp_service_tree;
>>> +
>>>        /*
>>>         * Each priority tree is sorted by next_request position.  These
>>>         * trees are used when determining if two or more queues are
>>> @@ -355,8 +363,6 @@ cfqg_of_entity(struct cfq_entity *cfqe)
>>>  }
>>>
>>>
>>> -static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);
>>> -
>>>  static struct cfq_rb_root *service_tree_for(struct cfq_group *cfqg,
>>>                                            enum wl_prio_t prio,
>>>                                            enum wl_type_t type)
>>> @@ -643,13 +649,50 @@ static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd,
>>>        return cfqg->busy_queues_avg[rt];
>>>  }
>>>
>>> +static inline unsigned int
>>> +cfq_group_get_total_weight(struct cfq_group *cfqg)
>>> +{
>>> +       int i, j;
>>> +       struct cfq_rb_root *st;
>>> +       unsigned int total_weight = 0;
>>> +
>>> +       for_each_cfqg_st(cfqg, i, j, st) {
>>> +               total_weight += st->total_weight;
>>> +       }
>>> +
>>> +       return total_weight;
>>> +}
>>> +
>>>  static inline unsigned
>>>  cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg)
>>>  {
>>> -       struct cfq_rb_root *st = &cfqd->grp_service_tree;
>>>        struct cfq_entity *cfqe = &cfqg->cfqe;
>>> +       struct cfq_rb_root *st;
>>> +       int group_slice = cfq_target_latency;
>>> +       unsigned int grp_total_weight;
>>> +       struct cfq_group *p_cfqg;
>>> +
>>> +       /*
>>> +        * Calculate group slice in a hierarchical way.
>>> +        * Note, the calculation is cross all service trees under a group.
>>> +        */
>>> +       do {
>>> +               if (cfqe->parent) {
>>> +                       p_cfqg = cfqg_of_entity(cfqe->parent);
>>> +                       grp_total_weight = cfq_group_get_total_weight(p_cfqg);
>>> +                       group_slice = group_slice * cfqe->weight /
>>> +                                       grp_total_weight;
>>> +               } else {
>>> +                       /* For top level groups */
>>> +                       st = cfqe->service_tree;
>>> +                       group_slice = group_slice * cfqe->weight /
>>> +                                       st->total_weight;
>>> +               }
>>>
>>> -       return cfq_target_latency * cfqe->weight / st->total_weight;
>>> +               cfqe = cfqe->parent;
>>> +       } while (cfqe);
>>> +
>>> +       return group_slice;
>>>  }
>>>
>>>  static inline void
>>> @@ -672,7 +715,8 @@ cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
>>>                        /* scale low_slice according to IO priority
>>>                         * and sync vs async */
>>>                        unsigned low_slice =
>>> -                               min(slice, base_low_slice * slice / sync_slice);
>>> +                               min(slice, base_low_slice * slice /
>>> +                                   sync_slice);
>>>                        /* the adapted slice value is scaled to fit all iqs
>>>                         * into the target latency */
>>>                        slice = max(slice * group_slice / expect_latency,
>>> @@ -812,17 +856,6 @@ static struct cfq_entity *cfq_rb_first(struct cfq_rb_root *root)
>>>        return NULL;
>>>  }
>>>
>>> -static struct cfq_entity *cfq_rb_first_entity(struct cfq_rb_root *root)
>>> -{
>>> -       if (!root->left)
>>> -               root->left = rb_first(&root->rb);
>>> -
>>> -       if (root->left)
>>> -               return rb_entry_entity(root->left);
>>> -
>>> -       return NULL;
>>> -}
>>> -
>>>  static void rb_erase_init(struct rb_node *n, struct rb_root *root)
>>>  {
>>>        rb_erase(n, root);
>>> @@ -896,12 +929,15 @@ __cfq_entity_service_tree_add(struct cfq_rb_root *st, struct cfq_entity *cfqe)
>>>
>>>        rb_link_node(&cfqe->rb_node, parent, node);
>>>        rb_insert_color(&cfqe->rb_node, &st->rb);
>>> +
>>> +       update_min_vdisktime(st);
>>>  }
>>>
>>>  static void
>>>  cfq_entity_service_tree_add(struct cfq_rb_root *st, struct cfq_entity *cfqe)
>>>  {
>>>        __cfq_entity_service_tree_add(st, cfqe);
>>> +       cfqe->reposition_time = jiffies;
>>>        st->count++;
>>>        st->total_weight += cfqe->weight;
>>>  }
>>> @@ -909,34 +945,52 @@ cfq_entity_service_tree_add(struct cfq_rb_root *st, struct cfq_entity *cfqe)
>>>  static void
>>>  cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
>>>  {
>>> -       struct cfq_rb_root *st = &cfqd->grp_service_tree;
>>>        struct cfq_entity *cfqe = &cfqg->cfqe;
>>> -       struct cfq_entity *__cfqe;
>>>        struct rb_node *n;
>>> +       struct cfq_entity *entity;
>>> +       struct cfq_rb_root *st;
>>> +       struct cfq_group *__cfqg;
>>>
>>>        cfqg->nr_cfqq++;
>>> +
>>>        if (!RB_EMPTY_NODE(&cfqe->rb_node))
>>>                return;
>>>
>>>        /*
>>> -        * Currently put the group at the end. Later implement something
>>> -        * so that groups get lesser vtime based on their weights, so that
>>> -        * if group does not loose all if it was not continously backlogged.
>>> +        * Enqueue this group and its ancestors onto their service tree.
>>>         */
>>> -       n = rb_last(&st->rb);
>>> -       if (n) {
>>> -               __cfqe = rb_entry_entity(n);
>>> -               cfqe->vdisktime = __cfqe->vdisktime + CFQ_IDLE_DELAY;
>>> -       } else
>>> -               cfqe->vdisktime = st->min_vdisktime;
>>> +       while (cfqe) {
>>> +               if (!RB_EMPTY_NODE(&cfqe->rb_node))
>>> +                       return;
>>>
>>> -       cfq_entity_service_tree_add(st, cfqe);
>>> +               /*
>>> +                * Currently put the group at the end. Later implement
>>> +                * something so that groups get lesser vtime based on
>>> +                * their weights, so that if group does not loose all
>>> +                * if it was not continously backlogged.
>>> +                */
>>> +               st = cfqe->service_tree;
>>> +               n = rb_last(&st->rb);
>>> +               if (n) {
>>> +                       entity = rb_entry_entity(n);
>>> +                       cfqe->vdisktime = entity->vdisktime +
>>> +                               CFQ_IDLE_DELAY;
>>> +               } else
>>> +                       cfqe->vdisktime = st->min_vdisktime;
>>> +
>>> +               cfq_entity_service_tree_add(st, cfqe);
>>> +               cfqe = cfqe->parent;
>>> +               __cfqg = cfqg_of_entity(cfqe);
>>> +               if (__cfqg)
>>> +                       __cfqg->nr_subgp++;
>>> +       }
>>>  }
>>>
>>>  static void
>>>  __cfq_entity_service_tree_del(struct cfq_rb_root *st, struct cfq_entity *cfqe)
>>>  {
>>>        cfq_rb_erase(&cfqe->rb_node, st);
>>> +       update_min_vdisktime(st);
>>>  }
>>>
>>>  static void
>>> @@ -945,27 +999,43 @@ cfq_entity_service_tree_del(struct cfq_rb_root *st, struct cfq_entity *cfqe)
>>>        if (!RB_EMPTY_NODE(&cfqe->rb_node)) {
>>>                __cfq_entity_service_tree_del(st, cfqe);
>>>                st->total_weight -= cfqe->weight;
>>> -               cfqe->service_tree = NULL;
>>>        }
>>>  }
>>>
>>>  static void
>>>  cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
>>>  {
>>> -       struct cfq_rb_root *st = &cfqd->grp_service_tree;
>>>        struct cfq_entity *cfqe = &cfqg->cfqe;
>>> +       struct cfq_group *__cfqg, *p_cfqg;
>>>
>>>        BUG_ON(cfqg->nr_cfqq < 1);
>>>        cfqg->nr_cfqq--;
>>>
>>> -       /* If there are other cfq queues under this group, don't delete it */
>>> -       if (cfqg->nr_cfqq)
>>> +       /*
>>> +        * If there are other cfq queues under this group, or there are other
>>> +        * cfq groups under this group, don't delete it.
>>> +        */
>>> +       if (cfqg->nr_cfqq || cfqg->nr_subgp)
>>>                return;
>>>
>>> -       cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
>>> -       cfq_entity_service_tree_del(st, cfqe);
>>> -       cfqg->saved_workload_slice = 0;
>>> -       cfq_blkiocg_update_dequeue_stats(&cfqg->blkg, 1);
>>> +       /*
>>> +        * Dequeue this group and its ancestors from their service
>>> +        * tree.
>>> +        */
>>> +       while (cfqe) {
>>> +               __cfqg = cfqg_of_entity(cfqe);
>>> +               p_cfqg = cfqg_of_entity(cfqe->parent);
>>> +               cfq_entity_service_tree_del(cfqe->service_tree, cfqe);
>>> +               cfq_blkiocg_update_dequeue_stats(&__cfqg->blkg, 1);
>>> +               cfq_log_cfqg(cfqd, __cfqg, "del_from_rr group");
>>> +               __cfqg->saved_workload_slice = 0;
>>> +               cfqe = cfqe->parent;
>>> +               if (p_cfqg) {
>>> +                       p_cfqg->nr_subgp--;
>>> +                       if (p_cfqg->nr_cfqq || p_cfqg->nr_subgp)
>>> +                               return;
>>> +               }
>>> +       }
>>>  }
>>>
>>>  static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq)
>>> @@ -997,7 +1067,6 @@ static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq)
>>>  static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
>>>                                struct cfq_queue *cfqq)
>>>  {
>>> -       struct cfq_rb_root *st = &cfqd->grp_service_tree;
>>>        unsigned int used_sl, charge;
>>>        int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg)
>>>                        - cfqg->service_tree_idle.count;
>>> @@ -1011,10 +1080,23 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
>>>        else if (!cfq_cfqq_sync(cfqq) && !nr_sync)
>>>                charge = cfqq->allocated_slice;
>>>
>>> -       /* Can't update vdisktime while group is on service tree */
>>> -       __cfq_entity_service_tree_del(st, cfqe);
>>> -       cfqe->vdisktime += cfq_scale_slice(charge, cfqe);
>>> -       __cfq_entity_service_tree_add(st, cfqe);
>>> +       /*
>>> +        * Update the vdisktime on the whole chain.
>>> +        */
>>> +       while (cfqe) {
>>> +               struct cfq_rb_root *st = cfqe->service_tree;
>>> +
>>> +               /*
>>> +                * Can't update vdisktime while group is on service
>>> +                * tree.
>>> +                */
>>> +               __cfq_entity_service_tree_del(st, cfqe);
>>> +               cfqe->vdisktime += cfq_scale_slice(charge, cfqe);
>>> +               __cfq_entity_service_tree_add(st, cfqe);
>>> +               st->count++;
>>> +               cfqe->reposition_time = jiffies;
>>> +               cfqe = cfqe->parent;
>>> +       }
>>>
>>>        /* This group is being expired. Save the context */
>>>        if (time_after(cfqd->workload_expires, jiffies)) {
>>> @@ -1026,7 +1108,8 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
>>>                cfqg->saved_workload_slice = 0;
>>>
>>>        cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu",
>>> -                    cfqe->vdisktime, st->min_vdisktime);
>>> +                    cfqg->cfqe.vdisktime,
>>> +                    cfqg->cfqe.service_tree->min_vdisktime);
>>>        cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u disp=%u charge=%u iops=%u"
>>>                        " sect=%u", used_sl, cfqq->slice_dispatch, charge,
>>>                        iops_mode(cfqd), cfqq->nr_sectors);
>>> @@ -1048,35 +1131,27 @@ void cfq_update_blkio_group_weight(void *key, struct blkio_group *blkg,
>>>        cfqg_of_blkg(blkg)->cfqe.weight = weight;
>>>  }
>>>
>>> -static struct cfq_group *
>>> -cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
>>> +static void init_cfqe(struct blkio_cgroup *blkcg,
>>> +                                   struct cfq_group *cfqg)
>>> +{
>>> +       struct cfq_entity *cfqe = &cfqg->cfqe;
>>> +
>>> +       cfqe->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
>>> +       RB_CLEAR_NODE(&cfqe->rb_node);
>>> +       cfqe->is_group_entity = true;
>>> +       cfqe->parent = NULL;
>>> +}
>>> +
>>> +static void init_cfqg(struct cfq_data *cfqd, struct blkio_cgroup *blkcg,
>>> +                     struct cfq_group *cfqg)
>>>  {
>>> -       struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
>>> -       struct cfq_group *cfqg = NULL;
>>> -       void *key = cfqd;
>>>        int i, j;
>>>        struct cfq_rb_root *st;
>>> -       struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
>>>        unsigned int major, minor;
>>> -
>>> -       cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
>>> -       if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
>>> -               sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
>>> -               cfqg->blkg.dev = MKDEV(major, minor);
>>> -               goto done;
>>> -       }
>>> -       if (cfqg || !create)
>>> -               goto done;
>>> -
>>> -       cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node);
>>> -       if (!cfqg)
>>> -               goto done;
>>> +       struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
>>>
>>>        for_each_cfqg_st(cfqg, i, j, st)
>>>                *st = CFQ_RB_ROOT;
>>> -       RB_CLEAR_NODE(&cfqg->cfqe.rb_node);
>>> -
>>> -       cfqg->cfqe.is_group_entity = true;
>>>
>>>        /*
>>>         * Take the initial reference that will be released on destroy
>>> @@ -1086,24 +1161,199 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
>>>         */
>>>        cfqg->ref = 1;
>>>
>>> +       /* Add group onto cgroup list */
>>> +       sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
>>> +       cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
>>> +                                   MKDEV(major, minor));
>>> +       /* Initiate group entity */
>>> +       init_cfqe(blkcg, cfqg);
>>> +       /* Add group on cfqd list */
>>> +       hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
>>> +}
>>> +
>>> +static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg);
>>> +
>>> +static void uninit_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg)
>>> +{
>>> +       if (!cfq_blkiocg_del_blkio_group(&cfqg->blkg))
>>> +               cfq_destroy_cfqg(cfqd, cfqg);
>>> +}
>>> +
>>> +static void cfqg_set_parent(struct cfq_data *cfqd, struct cfq_group *cfqg,
>>> +                           struct cfq_group *p_cfqg)
>>> +{
>>> +       struct cfq_entity *cfqe, *p_cfqe;
>>> +
>>> +       cfqe = &cfqg->cfqe;
>>> +
>>>        /*
>>> -        * Add group onto cgroup list. It might happen that bdi->dev is
>>> -        * not initiliazed yet. Initialize this new group without major
>>> -        * and minor info and this info will be filled in once a new thread
>>> -        * comes for IO. See code above.
>>> +        * 1. If use_hierarchy of the CGroup where cfqg's parent stays is not
>>> +        *    set, we put this cfqg onto global service tree.
>>> +        * 2. If cfqg is root cfqg, put it onto global service tree.
>>>         */
>>> -       if (bdi->dev) {
>>> -               sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
>>> -               cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
>>> -                                       MKDEV(major, minor));
>>> -       } else
>>> -               cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
>>> -                                       0);
>>> +       if (!p_cfqg) {
>>> +               cfqe->service_tree = &cfqd->grp_service_tree;
>>> +               cfqe->parent = NULL;
>>> +               return;
>>> +       }
>>>
>>> -       cfqg->cfqe.weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
>>> +       p_cfqe = &p_cfqg->cfqe;
>>>
>>> -       /* Add group on cfqd list */
>>> -       hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
>>> +       cfqe->parent = p_cfqe;
>>> +
>>> +       /*
>>> +        * Currently, just put cfq group entity on "BE:SYNC" workload
>>> +        * service tree.
>>> +        */
>>> +       cfqe->service_tree = service_tree_for(p_cfqg, BE_WORKLOAD,
>>> +                                                     SYNC_WORKLOAD);
>>> +       /* child reference */
>>> +       p_cfqg->ref++;
>>> +}
>>> +
>>> +static struct cfq_group *cfqg_get_parent(struct cfq_group * cfqg)
>>> +{
>>> +       struct cfq_entity *cfqe, *p_cfqe;
>>> +
>>> +       if (!cfqg)
>>> +               return NULL;
>>> +
>>> +       cfqe = &cfqg->cfqe;
>>> +       p_cfqe = cfqe->parent;
>>> +       if (!p_cfqe)
>>> +               return NULL;
>>> +
>>> +       return cfqg_of_entity(p_cfqe);
>>> +}
>>> +
>>> +static struct cfq_group *
>>> +cfqg_chain_alloc(struct cfq_data *cfqd, struct cgroup *cgroup)
>>> +{
>>> +       struct blkio_cgroup *blkcg;
>>> +       struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
>>> +       unsigned int major, minor;
>>> +       struct cfq_group *cfqg, *leaf_cfqg, *child_cfqg, *tmp_cfqg;
>>> +       void *key = cfqd;
>>> +
>>> +       /*
>>> +        * If CGroup's use_hierarchy is unset, we just need to allocate only
>>> +        * one CFQ group, and this group will put onto the "grp_service_tree".
>>> +        * We don't need to check whether the cfqg exists, the caller has
>>> +        * already checked it.
>>> +        */
>>> +       blkcg = cgroup_to_blkio_cgroup(cgroup);
>>> +       if (!blkcg_get_use_hierarchy(blkcg)) {
>>> +               cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC,
>>> +                                   cfqd->queue->node);
>>> +               if (!cfqg)
>>> +                       return NULL;
>>> +
>>> +               init_cfqg(cfqd, blkcg, cfqg);
>>> +               cfqg_set_parent(cfqd, cfqg, NULL);
>>> +               return cfqg;
>>> +       }
>>> +
>>> +       /*
>>> +        * Allocate the CFQ group chain until we meet the group we'v already
>>> +        * allocated before, or to the CGroup whose use_hierarchy is not set.
>>> +        */
>>> +       leaf_cfqg = NULL;
>>> +       child_cfqg = NULL;
>>> +       for (; cgroup != NULL; cgroup = cgroup->parent) {
>>> +               blkcg = cgroup_to_blkio_cgroup(cgroup);
>>> +               cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
>>> +               if (cfqg) {
>>> +                       if (!cfqg->blkg.dev && bdi->dev &&
>>> +                           dev_name(bdi->dev)) {
>>> +                               sscanf(dev_name(bdi->dev), "%u:%u",
>>> +                                      &major, &minor);
>>> +                               cfqg->blkg.dev = MKDEV(major, minor);
>>> +                       }
>>> +
>>> +                       /*
>>> +                        * Initialization of parent doesn't finish yet, get
>>> +                        * it done.
>>> +                        */
>>> +                       if (child_cfqg) {
>>> +                               if (blkcg_get_use_hierarchy(blkcg))
>>> +                                       cfqg_set_parent(cfqd, child_cfqg,
>>> +                                                       cfqg);
>>> +                               else
>>> +                                       cfqg_set_parent(cfqd, child_cfqg,
>>> +                                                       NULL);
>>> +                       }
>>> +
>>> +                       /* chain has already been built */
>>> +                       break;
>>> +               }
>>> +
>>> +               /*
>>> +                * We only allocate a cfqg that the corresponding cgroup's
>>> +                * use_hierarchy is set.
>>> +                */
>>> +               if (blkcg_get_use_hierarchy(blkcg)) {
>>> +                       cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC,
>>> +                                           cfqd->queue->node);
>>> +                       if (!cfqg)
>>> +                               goto clean_up;
>>> +
>>> +                       if (!leaf_cfqg)
>>> +                               leaf_cfqg = cfqg;
>>> +
>>> +                       init_cfqg(cfqd, blkcg, cfqg);
>>> +               } else {
>>> +                       cfqg = NULL;
>>> +               }
>>> +
>>> +               if (child_cfqg)
>>> +                       cfqg_set_parent(cfqd, child_cfqg, cfqg);
>>> +
>>> +               /*
>>> +                * This CGroup's use_hierarchy isn't set, this means the CFQ
>>> +                * group chain has been built.
>>> +                */
>>> +               if (!blkcg_get_use_hierarchy(blkcg))
>>> +                       break;
>>> +
>>> +               child_cfqg = cfqg;
>>> +       }
>>> +
>>> +       return leaf_cfqg;
>>> +
>>> +clean_up:
>>> +       /* clean up the allocated cfq groups. */
>>> +       while (leaf_cfqg) {
>>> +               tmp_cfqg = leaf_cfqg;
>>> +               leaf_cfqg = cfqg_get_parent(leaf_cfqg);
>>> +               uninit_cfqg(cfqd, tmp_cfqg);
>>> +       }
>>> +
>>> +       return NULL;
>>> +}
>>> +
>>> +static struct cfq_group *
>>> +cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
>>> +{
>>> +       struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
>>> +       struct cfq_group *cfqg = NULL;
>>> +       void *key = cfqd;
>>> +       struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
>>> +       unsigned int major, minor;
>>> +
>>> +       cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
>>> +       if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
>>> +               sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
>>> +               cfqg->blkg.dev = MKDEV(major, minor);
>>> +               goto done;
>>> +       }
>>> +       if (cfqg || !create)
>>> +               goto done;
>>> +
>>> +       /*
>>> +        * Allocate CFQ group chain to the root group or we meet the CGroup
>>> +        * with use_hierarchy disabled.
>>> +        */
>>> +       cfqg = cfqg_chain_alloc(cfqd, cgroup);
>>>
>>>  done:
>>>        return cfqg;
>>> @@ -1148,6 +1398,7 @@ static void cfq_put_cfqg(struct cfq_group *cfqg)
>>>  {
>>>        struct cfq_rb_root *st;
>>>        int i, j;
>>> +       struct cfq_group *p_cfqg;
>>>
>>>        BUG_ON(cfqg->ref <= 0);
>>>        cfqg->ref--;
>>> @@ -1155,6 +1406,22 @@ static void cfq_put_cfqg(struct cfq_group *cfqg)
>>>                return;
>>>        for_each_cfqg_st(cfqg, i, j, st)
>>>                BUG_ON(!RB_EMPTY_ROOT(&st->rb));
>>> +
>>> +       do {
>>> +               p_cfqg = cfqg_get_parent(cfqg);
>>> +               kfree(cfqg);
>>> +               cfqg = NULL;
>>> +               /*
>>> +                * Drop the reference taken by children, if nobody references
>>> +                * parent group, we need delete the parent also.
>>> +                */
>>> +               if (p_cfqg) {
>>> +                       p_cfqg->ref--;
>>> +                       if (p_cfqg->ref == 0)
>>> +                               cfqg = p_cfqg;
>>> +               }
>>> +       } while (cfqg);
>>> +
>>>        kfree(cfqg);
>>>  }
>>>
>>> @@ -1321,9 +1588,6 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
>>>                         * ioprio.
>>>                         */
>>>                        pos_offset = cfq_get_boost(cfqd, cfqq);
>>> -                       /* Debug purpose, should remove. */
>>> -                       cfq_log_cfqq(cfqd, cfqq, "pos_offset: %llu",
>>> -                                    pos_offset);
>>>                        cfqe->vdisktime = service_tree->min_vdisktime +
>>>                                                pos_offset;
>>>                } else
>>> @@ -1365,9 +1629,8 @@ insert:
>>>        cfqe->service_tree = service_tree;
>>>
>>>        /* Add cfqq onto service tree. */
>>> +
>>>        cfq_entity_service_tree_add(service_tree, cfqe);
>>> -       update_min_vdisktime(service_tree);
>>> -       cfqq->reposition_time = jiffies;
>>>        if ((add_front || !new_cfqq) && !group_changed)
>>>                return;
>>>        cfq_group_service_tree_add(cfqd, cfqq->cfqg);
>>> @@ -1810,28 +2073,43 @@ static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
>>>        return cfqq_of_entity(cfq_rb_first(service_tree));
>>>  }
>>>
>>> -static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd)
>>> +struct cfq_rb_root *choose_service_tree_forced(struct cfq_group *cfqg)
>>>  {
>>> -       struct cfq_group *cfqg;
>>> -       struct cfq_entity *cfqe;
>>>        int i, j;
>>>        struct cfq_rb_root *st;
>>>
>>> -       if (!cfqd->rq_queued)
>>> -               return NULL;
>>> +       for_each_cfqg_st(cfqg, i, j, st) {
>>> +               if (st->count != 0)
>>> +                       return st;
>>> +       }
>>>
>>> -       cfqg = cfq_get_next_cfqg(cfqd);
>>> -       if (!cfqg)
>>> +       return NULL;
>>> +}
>>> +
>>> +static struct cfq_entity *
>>> +cfq_get_next_entity_forced(struct cfq_data *cfqd)
>>> +{
>>> +       struct cfq_entity *cfqe;
>>> +       struct cfq_rb_root *st = &cfqd->grp_service_tree;
>>> +       struct cfq_group *cfqg;
>>> +
>>> +       if (!cfqd->rq_queued)
>>>                return NULL;
>>>
>>> -       for_each_cfqg_st(cfqg, i, j, st) {
>>> +       do {
>>>                cfqe = cfq_rb_first(st);
>>> -               if (cfqe != NULL)
>>> -                       return cfqq_of_entity(cfqe);
>>> -       }
>>> +               if (cfqe && !cfqe->is_group_entity)
>>> +                       return cfqe;
>>> +               else if (cfqe && cfqe->is_group_entity)
>>> +                       cfqg = cfqg_of_entity(cfqe);
>>> +
>>> +               st = choose_service_tree_forced(cfqg);
>>> +       } while (st);
>>> +
>>>        return NULL;
>>>  }
>>>
>>> +
>>>  /*
>>>  * Get and set a new active qu
>>>
>>>
>>> --
>>> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
>>> the body of a message to majordomo@vger.kernel.org
>>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>>> Please read the FAQ at  http://www.tux.org/lkml/
>>>
>>
>
> --
> Regards
> Gui Jianfeng
>

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH 5/6 v4] cfq-iosched: CFQ group hierarchical scheduling and use_hierarchy interface
  2011-02-17 17:36       ` Justin TerAvest
@ 2011-02-18  1:14         ` Gui Jianfeng
  0 siblings, 0 replies; 40+ messages in thread
From: Gui Jianfeng @ 2011-02-18  1:14 UTC (permalink / raw)
  To: Justin TerAvest
  Cc: Vivek Goyal, Jens Axboe, Shaohua Li, lkml, Chad Talbott, Divyesh Shah

Justin TerAvest wrote:
> On Wed, Feb 16, 2011 at 5:21 PM, Gui Jianfeng
> <guijianfeng@cn.fujitsu.com> wrote:
>> Justin TerAvest wrote:
>>> After a quick read,
>>>
>>> It's sad that we have to have so many use_hierarchy checks; it seems
>>> like we're asking for bugs, especially in the future when one codepath
>>> gets updated but not the other.
>>>
>>> CodingStyle says we should only have one declaration per line.
>>>
>>> I feel like there is an implicit assumption that groups and tasks
>>> should not be children of the same parent; that is, a group should
>>> contain only groups, or only tasks, but I don't see this enforced;
>>> there's just and assumption that BE:SYNC is "good enough" for that
>>> comparison. This smells like something that will be tweaked/tuned for
>>> fairness later. :( Why don't we just prevent this from happening?
>> Hi Justin,
>>
>> Thanks for reviewing.
>>
>> Previously, I posted very first version that makes a group containing only
>> groups or only tasks. But I think it's more flexible to treat groups and
>> tasks at the same level. I think Vivek and Jens have the same opinion.
>> We had discussed in this thread http://lkml.org/lkml/2010/8/30/30
> 
> Hi Gui,
> Thanks for pointing me at the earlier discussion, the decisions make a
> lot more sense now.
> 
>>> The clean_up label in chain_alloc() is strange; I don't think the goto
>>> is necessary at all. I found that method generally hard to understand.
>>> It's doing a lot.
>> I don't understand why clean_up isn't needed.
>> When we fail to allocate a cfq group at some level, we have to clean up
>> all groups in the chain that we have already allocated.
> 
> Cleaning up is necessary, but the label is only used from one place.
> Why add the goto and the label when the code below "clean_up" can just
> be moved inside the condition
> +                       if (!cfqg)

It's common in the kernel to put error processing at the end of a function. ;)

Thanks,
Gui


^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH 3/6 v4] cfq-iosched: Introduce vdisktime and io weight for CFQ queue
  2011-02-14 18:13   ` Vivek Goyal
  2011-02-15  1:46     ` Gui Jianfeng
@ 2011-02-18  6:04     ` Gui Jianfeng
  2011-02-18 14:54       ` Vivek Goyal
  1 sibling, 1 reply; 40+ messages in thread
From: Gui Jianfeng @ 2011-02-18  6:04 UTC (permalink / raw)
  To: Vivek Goyal; +Cc: Jens Axboe, Shaohua Li, lkml, Chad Talbott, Divyesh Shah

Vivek Goyal wrote:
> On Thu, Feb 10, 2011 at 03:47:16PM +0800, Gui Jianfeng wrote:
> 
> [..]
>> +/*
>> + * The time when a CFQ queue is put onto a service tree is recoreded in
>> + * cfqq->reposition_time. Currently, we check the first priority CFQ queues
>> + * on each service tree, and select the workload type that contains the lowest
>> + * reposition_time CFQ queue among them.
>> + */
>>  static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
>>  				struct cfq_group *cfqg, enum wl_prio_t prio)
>>  {
>>  	struct cfq_entity *cfqe;
>> +	struct cfq_queue *cfqq;
>> +	unsigned long lowest_start_time;
>>  	int i;
>> -	bool key_valid = false;
>> -	unsigned long lowest_key = 0;
>> +	bool time_valid = false;
>>  	enum wl_type_t cur_best = SYNC_NOIDLE_WORKLOAD;
>>  
>> +	/*
>> +	 * TODO: We may take io priority and io class into account when
>> +	 * choosing a workload type. But for the time being just make use of
>> +	 * reposition_time only.
>> +	 */
>>  	for (i = 0; i <= SYNC_WORKLOAD; ++i) {
>> -		/* select the one with lowest rb_key */
>>  		cfqe = cfq_rb_first(service_tree_for(cfqg, prio, i));
>> -		if (cfqe &&
>> -		    (!key_valid || time_before(cfqe->rb_key, lowest_key))) {
>> -			lowest_key = cfqe->rb_key;
>> +		cfqq = cfqq_of_entity(cfqe);
>> +		if (cfqe && (!time_valid ||
>> +			     time_before(cfqq->reposition_time,
>> +					 lowest_start_time))) {
>> +			lowest_start_time = cfqq->reposition_time;
> 
> Gui,
> 
> Have you had a chance to run some mixed workloads in a group (some sync,
> some async and some sync-idle queues), and see how latency and throughput
> of sync-idle workload changes due to this "resposition_time" logic. I 
> just want to make sure that latency of sync-noidle workload does not
> go up as that's the workload that people care and gets noticed first.

Hi Vivek,

I made a quick test by using fio. It seems the numbers change little
between the vanilla kernel and the patched kernel.


Vanilla:    SYNC read            SYNC-NOIDLE read      ASYNC write  
         1. 23,640KB/s 5.40 ---- 6,696KB/s 19.07 ---- 50,142KB/s 128.00
         2. 24,459KB/s 5.22 ---- 6,775KB/s 18.86 ---- 47,349KB/s 129.89
         3. 25,929KB/s 4.93 ---- 7,378KB/s 17.32 ---- 32,350KB/s 131.88

Patched:   SYNC read            SYNC-NOIDLE read      ASYNC write  
        1. 24,000KB/s 5.32 ---- 6,942KB/s 18.39 ---- 30,860KB/s 135.95
        2. 23,678KB/s 5.40 ---- 7,274KB/s 17.58 ---- 67,432KB/s 120.44
        3. 23,004KB/s 5.55 ---- 6,621KB/s 19.30 ---- 36,536KB/s 148.64

Thanks,
Gui

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH 3/6 v4] cfq-iosched: Introduce vdisktime and io weight for CFQ queue
  2011-02-18  6:04     ` Gui Jianfeng
@ 2011-02-18 14:54       ` Vivek Goyal
  2011-02-21  1:13         ` Gui Jianfeng
  2011-02-21  5:55         ` Gui Jianfeng
  0 siblings, 2 replies; 40+ messages in thread
From: Vivek Goyal @ 2011-02-18 14:54 UTC (permalink / raw)
  To: Gui Jianfeng; +Cc: Jens Axboe, Shaohua Li, lkml, Chad Talbott, Divyesh Shah

On Fri, Feb 18, 2011 at 02:04:18PM +0800, Gui Jianfeng wrote:
> Vivek Goyal wrote:
> > On Thu, Feb 10, 2011 at 03:47:16PM +0800, Gui Jianfeng wrote:
> > 
> > [..]
> >> +/*
> >> + * The time when a CFQ queue is put onto a service tree is recoreded in
> >> + * cfqq->reposition_time. Currently, we check the first priority CFQ queues
> >> + * on each service tree, and select the workload type that contains the lowest
> >> + * reposition_time CFQ queue among them.
> >> + */
> >>  static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
> >>  				struct cfq_group *cfqg, enum wl_prio_t prio)
> >>  {
> >>  	struct cfq_entity *cfqe;
> >> +	struct cfq_queue *cfqq;
> >> +	unsigned long lowest_start_time;
> >>  	int i;
> >> -	bool key_valid = false;
> >> -	unsigned long lowest_key = 0;
> >> +	bool time_valid = false;
> >>  	enum wl_type_t cur_best = SYNC_NOIDLE_WORKLOAD;
> >>  
> >> +	/*
> >> +	 * TODO: We may take io priority and io class into account when
> >> +	 * choosing a workload type. But for the time being just make use of
> >> +	 * reposition_time only.
> >> +	 */
> >>  	for (i = 0; i <= SYNC_WORKLOAD; ++i) {
> >> -		/* select the one with lowest rb_key */
> >>  		cfqe = cfq_rb_first(service_tree_for(cfqg, prio, i));
> >> -		if (cfqe &&
> >> -		    (!key_valid || time_before(cfqe->rb_key, lowest_key))) {
> >> -			lowest_key = cfqe->rb_key;
> >> +		cfqq = cfqq_of_entity(cfqe);
> >> +		if (cfqe && (!time_valid ||
> >> +			     time_before(cfqq->reposition_time,
> >> +					 lowest_start_time))) {
> >> +			lowest_start_time = cfqq->reposition_time;
> > 
> > Gui,
> > 
> > Have you had a chance to run some mixed workloads in a group (some sync,
> > some async and some sync-idle queues), and see how latency and throughput
> > of sync-idle workload changes due to this "resposition_time" logic. I 
> > just want to make sure that latency of sync-noidle workload does not
> > go up as that's the workload that people care and gets noticed first.
> 
> Hi Vivek,
> 
> I made a quick test by using fio. It seems the number changes little
> between vanilla kernel and patched kernel.
> 
> 
> Vanilla:    SYNC read            SYNC-NOIDLE read      ASYNC write  
>          1. 23,640KB/s 5.40 ---- 6,696KB/s 19.07 ---- 50,142KB/s 128.00
>          2. 24,459KB/s 5.22 ---- 6,775KB/s 18.86 ---- 47,349KB/s 129.89
>          3. 25,929KB/s 4.93 ---- 7,378KB/s 17.32 ---- 32,350KB/s 131.88
> 
> Patched:   SYNC read            SYNC-NOIDLE read      ASYNC write  
>         1. 24,000KB/s 5.32 ---- 6,942KB/s 18.39 ---- 30,860KB/s 135.95
>         2. 23,678KB/s 5.40 ---- 7,274KB/s 17.58 ---- 67,432KB/s 120.44
>         3. 23,004KB/s 5.55 ---- 6,621KB/s 19.30 ---- 36,536KB/s 148.64

Hi Gui,

Do you also have latency numbers? I am especially interested max completion
latencies of SYNC-NOIDLE workload.

Thanks
Vivek

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH 3/6 v4] cfq-iosched: Introduce vdisktime and io weight for CFQ queue
  2011-02-18 14:54       ` Vivek Goyal
@ 2011-02-21  1:13         ` Gui Jianfeng
  2011-02-21  5:55         ` Gui Jianfeng
  1 sibling, 0 replies; 40+ messages in thread
From: Gui Jianfeng @ 2011-02-21  1:13 UTC (permalink / raw)
  To: Vivek Goyal; +Cc: Jens Axboe, Shaohua Li, lkml, Chad Talbott, Divyesh Shah

Vivek Goyal wrote:
> On Fri, Feb 18, 2011 at 02:04:18PM +0800, Gui Jianfeng wrote:
>> Vivek Goyal wrote:
>>> On Thu, Feb 10, 2011 at 03:47:16PM +0800, Gui Jianfeng wrote:
>>>
>>> [..]
>>>> +/*
>>>> + * The time when a CFQ queue is put onto a service tree is recoreded in
>>>> + * cfqq->reposition_time. Currently, we check the first priority CFQ queues
>>>> + * on each service tree, and select the workload type that contains the lowest
>>>> + * reposition_time CFQ queue among them.
>>>> + */
>>>>  static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
>>>>  				struct cfq_group *cfqg, enum wl_prio_t prio)
>>>>  {
>>>>  	struct cfq_entity *cfqe;
>>>> +	struct cfq_queue *cfqq;
>>>> +	unsigned long lowest_start_time;
>>>>  	int i;
>>>> -	bool key_valid = false;
>>>> -	unsigned long lowest_key = 0;
>>>> +	bool time_valid = false;
>>>>  	enum wl_type_t cur_best = SYNC_NOIDLE_WORKLOAD;
>>>>  
>>>> +	/*
>>>> +	 * TODO: We may take io priority and io class into account when
>>>> +	 * choosing a workload type. But for the time being just make use of
>>>> +	 * reposition_time only.
>>>> +	 */
>>>>  	for (i = 0; i <= SYNC_WORKLOAD; ++i) {
>>>> -		/* select the one with lowest rb_key */
>>>>  		cfqe = cfq_rb_first(service_tree_for(cfqg, prio, i));
>>>> -		if (cfqe &&
>>>> -		    (!key_valid || time_before(cfqe->rb_key, lowest_key))) {
>>>> -			lowest_key = cfqe->rb_key;
>>>> +		cfqq = cfqq_of_entity(cfqe);
>>>> +		if (cfqe && (!time_valid ||
>>>> +			     time_before(cfqq->reposition_time,
>>>> +					 lowest_start_time))) {
>>>> +			lowest_start_time = cfqq->reposition_time;
>>> Gui,
>>>
>>> Have you had a chance to run some mixed workloads in a group (some sync,
>>> some async and some sync-idle queues), and see how latency and throughput
>>> of sync-idle workload changes due to this "resposition_time" logic. I 
>>> just want to make sure that latency of sync-noidle workload does not
>>> go up as that's the workload that people care and gets noticed first.
>> Hi Vivek,
>>
>> I made a quick test by using fio. It seems the number changes little
>> between vanilla kernel and patched kernel.
>>
>>
>> Vanilla:    SYNC read            SYNC-NOIDLE read      ASYNC write  
>>          1. 23,640KB/s 5.40 ---- 6,696KB/s 19.07 ---- 50,142KB/s 128.00
>>          2. 24,459KB/s 5.22 ---- 6,775KB/s 18.86 ---- 47,349KB/s 129.89
>>          3. 25,929KB/s 4.93 ---- 7,378KB/s 17.32 ---- 32,350KB/s 131.88
>>
>> Patched:   SYNC read            SYNC-NOIDLE read      ASYNC write  
>>         1. 24,000KB/s 5.32 ---- 6,942KB/s 18.39 ---- 30,860KB/s 135.95
>>         2. 23,678KB/s 5.40 ---- 7,274KB/s 17.58 ---- 67,432KB/s 120.44
>>         3. 23,004KB/s 5.55 ---- 6,621KB/s 19.30 ---- 36,536KB/s 148.64
> 
> Hi Gui,
> 
> Do you also have latency numbers? I am especially interested max completion
> latencies of SYNC-NOIDLE workload.

Vivek,

The number behind bandwidth is the average completion latency which is extracted 
from the fio output.
For example, 23,640KB/s 5.40, the average completion latency is 5.40 usec.
I'll re-test to check *max* completion latencies.

Thanks,
Gui

> 
> Thanks
> Vivek
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
> 

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH 3/6 v4] cfq-iosched: Introduce vdisktime and io weight for CFQ queue
  2011-02-18 14:54       ` Vivek Goyal
  2011-02-21  1:13         ` Gui Jianfeng
@ 2011-02-21  5:55         ` Gui Jianfeng
  2011-02-21 15:41           ` Vivek Goyal
  1 sibling, 1 reply; 40+ messages in thread
From: Gui Jianfeng @ 2011-02-21  5:55 UTC (permalink / raw)
  To: Vivek Goyal; +Cc: Jens Axboe, Shaohua Li, lkml, Chad Talbott, Divyesh Shah

Vivek Goyal wrote:
> On Fri, Feb 18, 2011 at 02:04:18PM +0800, Gui Jianfeng wrote:
>> Vivek Goyal wrote:
>>> On Thu, Feb 10, 2011 at 03:47:16PM +0800, Gui Jianfeng wrote:
>>>
>>> [..]
>>>> +/*
>>>> + * The time when a CFQ queue is put onto a service tree is recoreded in
>>>> + * cfqq->reposition_time. Currently, we check the first priority CFQ queues
>>>> + * on each service tree, and select the workload type that contains the lowest
>>>> + * reposition_time CFQ queue among them.
>>>> + */
>>>>  static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
>>>>  				struct cfq_group *cfqg, enum wl_prio_t prio)
>>>>  {
>>>>  	struct cfq_entity *cfqe;
>>>> +	struct cfq_queue *cfqq;
>>>> +	unsigned long lowest_start_time;
>>>>  	int i;
>>>> -	bool key_valid = false;
>>>> -	unsigned long lowest_key = 0;
>>>> +	bool time_valid = false;
>>>>  	enum wl_type_t cur_best = SYNC_NOIDLE_WORKLOAD;
>>>>  
>>>> +	/*
>>>> +	 * TODO: We may take io priority and io class into account when
>>>> +	 * choosing a workload type. But for the time being just make use of
>>>> +	 * reposition_time only.
>>>> +	 */
>>>>  	for (i = 0; i <= SYNC_WORKLOAD; ++i) {
>>>> -		/* select the one with lowest rb_key */
>>>>  		cfqe = cfq_rb_first(service_tree_for(cfqg, prio, i));
>>>> -		if (cfqe &&
>>>> -		    (!key_valid || time_before(cfqe->rb_key, lowest_key))) {
>>>> -			lowest_key = cfqe->rb_key;
>>>> +		cfqq = cfqq_of_entity(cfqe);
>>>> +		if (cfqe && (!time_valid ||
>>>> +			     time_before(cfqq->reposition_time,
>>>> +					 lowest_start_time))) {
>>>> +			lowest_start_time = cfqq->reposition_time;
>>> Gui,
>>>
>>> Have you had a chance to run some mixed workloads in a group (some sync,
>>> some async and some sync-idle queues), and see how latency and throughput
>>> of sync-idle workload changes due to this "resposition_time" logic. I 
>>> just want to make sure that latency of sync-noidle workload does not
>>> go up as that's the workload that people care and gets noticed first.
>> Hi Vivek,
>>
>> I made a quick test by using fio. It seems the number changes little
>> between vanilla kernel and patched kernel.
>>
>>
>> Vanilla:    SYNC read            SYNC-NOIDLE read      ASYNC write  
>>          1. 23,640KB/s 5.40 ---- 6,696KB/s 19.07 ---- 50,142KB/s 128.00
>>          2. 24,459KB/s 5.22 ---- 6,775KB/s 18.86 ---- 47,349KB/s 129.89
>>          3. 25,929KB/s 4.93 ---- 7,378KB/s 17.32 ---- 32,350KB/s 131.88
>>
>> Patched:   SYNC read            SYNC-NOIDLE read      ASYNC write  
>>         1. 24,000KB/s 5.32 ---- 6,942KB/s 18.39 ---- 30,860KB/s 135.95
>>         2. 23,678KB/s 5.40 ---- 7,274KB/s 17.58 ---- 67,432KB/s 120.44
>>         3. 23,004KB/s 5.55 ---- 6,621KB/s 19.30 ---- 36,536KB/s 148.64
> 
> Hi Gui,
> 
> Do you also have latency numbers? I am especially interested max completion
> latencies of SYNC-NOIDLE workload.

Vivek,

Here are some numbers about latency between vanilla and patched kernel.
I tested 4 times for each. It seems no latency regression happens.

Vanilla:
1. clat (msec): min=1, max=302, avg=18.19, stdev=39.80
2. clat (msec): min=1, max=201, avg=17.76, stdev=31.90
3. clat (msec): min=1, max=303, avg=18.64, stdev=41.30
4. clat (msec): min=1, max=370, avg=17.43, stdev=35.09

Patched:
1. clat (msec): min=1, max=176, avg=19.00, stdev=32.98
2. clat (msec): min=1, max=175, avg=17.75, stdev=32.41
3. clat (msec): min=1, max=191, avg=19.11, stdev=33.28
4. clat (msec): min=1, max=176, avg=17.11, stdev=32.99

Thanks,
Gui

> 
> Thanks
> Vivek
> 

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH 3/6 v4] cfq-iosched: Introduce vdisktime and io weight for CFQ queue
  2011-02-21  5:55         ` Gui Jianfeng
@ 2011-02-21 15:41           ` Vivek Goyal
  0 siblings, 0 replies; 40+ messages in thread
From: Vivek Goyal @ 2011-02-21 15:41 UTC (permalink / raw)
  To: Gui Jianfeng; +Cc: Jens Axboe, Shaohua Li, lkml, Chad Talbott, Divyesh Shah

On Mon, Feb 21, 2011 at 01:55:38PM +0800, Gui Jianfeng wrote:
> Vivek Goyal wrote:
> > On Fri, Feb 18, 2011 at 02:04:18PM +0800, Gui Jianfeng wrote:
> >> Vivek Goyal wrote:
> >>> On Thu, Feb 10, 2011 at 03:47:16PM +0800, Gui Jianfeng wrote:
> >>>
> >>> [..]
> >>>> +/*
> >>>> + * The time when a CFQ queue is put onto a service tree is recoreded in
> >>>> + * cfqq->reposition_time. Currently, we check the first priority CFQ queues
> >>>> + * on each service tree, and select the workload type that contains the lowest
> >>>> + * reposition_time CFQ queue among them.
> >>>> + */
> >>>>  static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
> >>>>  				struct cfq_group *cfqg, enum wl_prio_t prio)
> >>>>  {
> >>>>  	struct cfq_entity *cfqe;
> >>>> +	struct cfq_queue *cfqq;
> >>>> +	unsigned long lowest_start_time;
> >>>>  	int i;
> >>>> -	bool key_valid = false;
> >>>> -	unsigned long lowest_key = 0;
> >>>> +	bool time_valid = false;
> >>>>  	enum wl_type_t cur_best = SYNC_NOIDLE_WORKLOAD;
> >>>>  
> >>>> +	/*
> >>>> +	 * TODO: We may take io priority and io class into account when
> >>>> +	 * choosing a workload type. But for the time being just make use of
> >>>> +	 * reposition_time only.
> >>>> +	 */
> >>>>  	for (i = 0; i <= SYNC_WORKLOAD; ++i) {
> >>>> -		/* select the one with lowest rb_key */
> >>>>  		cfqe = cfq_rb_first(service_tree_for(cfqg, prio, i));
> >>>> -		if (cfqe &&
> >>>> -		    (!key_valid || time_before(cfqe->rb_key, lowest_key))) {
> >>>> -			lowest_key = cfqe->rb_key;
> >>>> +		cfqq = cfqq_of_entity(cfqe);
> >>>> +		if (cfqe && (!time_valid ||
> >>>> +			     time_before(cfqq->reposition_time,
> >>>> +					 lowest_start_time))) {
> >>>> +			lowest_start_time = cfqq->reposition_time;
> >>> Gui,
> >>>
> >>> Have you had a chance to run some mixed workloads in a group (some sync,
> >>> some async and some sync-idle queues), and see how latency and throughput
> >>> of sync-idle workload changes due to this "resposition_time" logic. I 
> >>> just want to make sure that latency of sync-noidle workload does not
> >>> go up as that's the workload that people care and gets noticed first.
> >> Hi Vivek,
> >>
> >> I made a quick test by using fio. It seems the number changes little
> >> between vanilla kernel and patched kernel.
> >>
> >>
> >> Vanilla:    SYNC read            SYNC-NOIDLE read      ASYNC write  
> >>          1. 23,640KB/s 5.40 ---- 6,696KB/s 19.07 ---- 50,142KB/s 128.00
> >>          2. 24,459KB/s 5.22 ---- 6,775KB/s 18.86 ---- 47,349KB/s 129.89
> >>          3. 25,929KB/s 4.93 ---- 7,378KB/s 17.32 ---- 32,350KB/s 131.88
> >>
> >> Patched:   SYNC read            SYNC-NOIDLE read      ASYNC write  
> >>         1. 24,000KB/s 5.32 ---- 6,942KB/s 18.39 ---- 30,860KB/s 135.95
> >>         2. 23,678KB/s 5.40 ---- 7,274KB/s 17.58 ---- 67,432KB/s 120.44
> >>         3. 23,004KB/s 5.55 ---- 6,621KB/s 19.30 ---- 36,536KB/s 148.64
> > 
> > Hi Gui,
> > 
> > Do you also have latency numbers? I am especially interested max completion
> > latencies of SYNC-NOIDLE workload.
> 
> Vivek,
> 
> Here some numbers about latency between vanilla and patched kernel.
> I tested 4 times for each. It seems no latency regression happens.
> 
> Vanilla:
> 1. clat (msec): min=1, max=302, avg=18.19, stdev=39.80
> 2. clat (msec): min=1, max=201, avg=17.76, stdev=31.90
> 3. clat (msec): min=1, max=303, avg=18.64, stdev=41.30
> 4. clat (msec): min=1, max=370, avg=17.43, stdev=35.09
> 
> Patched:
> 1. clat (msec): min=1, max=176, avg=19.00, stdev=32.98
> 2. clat (msec): min=1, max=175, avg=17.75, stdev=32.41
> 3. clat (msec): min=1, max=191, avg=19.11, stdev=33.28
> 4. clat (msec): min=1, max=176, avg=17.11, stdev=32.99

Thanks Gui. In fact they seem to have improved a bit for sync-noidle
workload. So there are no major issues in presence of other SYNC-IDLE
and ASYNC workload. I wanted to be sure of that. If we run into issues,
we will tweak the workload selection logic further.

Thanks
Vivek

^ permalink raw reply	[flat|nested] 40+ messages in thread

end of thread, other threads:[~2011-02-21 15:41 UTC | newest]

Thread overview: 40+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <4D51ED26.8050809@cn.fujitsu.com>
2011-02-10  7:46 ` [PATCH 1/6 v4] cfq-iosched: Introduce cfq_entity for CFQ queue Gui Jianfeng
2011-02-10  7:47 ` [PATCH 2/6 v4] cfq-iosched: Introduce cfq_entity for CFQ group Gui Jianfeng
2011-02-10  7:47 ` [PATCH 3/6 v4] cfq-iosched: Introduce vdisktime and io weight for CFQ queue Gui Jianfeng
2011-02-10 19:29   ` Vivek Goyal
2011-02-12  1:20     ` Gui Jianfeng
2011-02-14 16:58       ` Vivek Goyal
2011-02-15  1:53         ` Gui Jianfeng
2011-02-15 14:24           ` Vivek Goyal
2011-02-16  1:06             ` Gui Jianfeng
2011-02-14 18:13   ` Vivek Goyal
2011-02-15  1:46     ` Gui Jianfeng
2011-02-18  6:04     ` Gui Jianfeng
2011-02-18 14:54       ` Vivek Goyal
2011-02-21  1:13         ` Gui Jianfeng
2011-02-21  5:55         ` Gui Jianfeng
2011-02-21 15:41           ` Vivek Goyal
2011-02-14 23:32   ` Justin TerAvest
2011-02-15  1:44     ` Gui Jianfeng
2011-02-15 14:21       ` Vivek Goyal
2011-02-10  7:47 ` [PATCH 4/6 v4] cfq-iosched: Extract some common code of service tree handling for CFQ queue and CFQ group Gui Jianfeng
2011-02-10  7:47 ` [PATCH 5/6 v4] cfq-iosched: CFQ group hierarchical scheduling and use_hierarchy interface Gui Jianfeng
2011-02-10 20:57   ` Vivek Goyal
2011-02-12  2:21     ` Gui Jianfeng
2011-02-14 18:04       ` Vivek Goyal
2011-02-15  2:38         ` Gui Jianfeng
2011-02-15 14:27           ` Vivek Goyal
2011-02-16  1:44             ` Gui Jianfeng
2011-02-16 14:17               ` Vivek Goyal
2011-02-17  1:22                 ` Gui Jianfeng
2011-02-16 17:22               ` Divyesh Shah
2011-02-16 17:28                 ` Divyesh Shah
2011-02-16 18:06                   ` Vivek Goyal
2011-02-14  3:20     ` Gui Jianfeng
2011-02-14 18:10       ` Vivek Goyal
2011-02-17  0:31   ` Justin TerAvest
2011-02-17  1:21     ` Gui Jianfeng
2011-02-17 17:36       ` Justin TerAvest
2011-02-18  1:14         ` Gui Jianfeng
2011-02-17 10:39     ` Alan Cox
2011-02-10  7:47 ` [PATCH 6/6 v4] blkio-cgroup: Document for blkio.use_hierarchy interface Gui Jianfeng

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).