From 4ebc1c61d6185604c97fd0b0355ab668052044ab Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:54:57 -0700 Subject: cfq-iosched: simplify control flow in cfq_get_queue() cfq_get_queue()'s control flow looks like the following. async_cfqq = NULL; cfqq = NULL; if (!is_sync) { ... async_cfqq = ...; cfqq = *async_cfqq; } if (!cfqq) cfqq = ...; if (!is_sync && !(*async_cfqq)) ...; The only thing the local variable init, the second if, and the async_cfqq test in the third if achieve is to skip cfqq creation and installation if *async_cfqq was already non-NULL. This is needlessly complicated with different tests examining the same condition. Simplify it to the following. if (!is_sync) { ... async_cfqq = ...; cfqq = *async_cfqq; if (cfqq) goto out; } cfqq = ...; if (!is_sync) ...; out: Signed-off-by: Tejun Heo Acked-by: Jeff Moyer Cc: Vivek Goyal Cc: Arianna Avanzini Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index c62bb2e650b8..e91093d0c0eb 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -3731,8 +3731,8 @@ cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic, { int ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio); int ioprio = IOPRIO_PRIO_DATA(cic->ioprio); - struct cfq_queue **async_cfqq = NULL; - struct cfq_queue *cfqq = NULL; + struct cfq_queue **async_cfqq; + struct cfq_queue *cfqq; if (!is_sync) { if (!ioprio_valid(cic->ioprio)) { @@ -3742,19 +3742,20 @@ cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic, } async_cfqq = cfq_async_queue_prio(cfqd, ioprio_class, ioprio); cfqq = *async_cfqq; + if (cfqq) + goto out; } - if (!cfqq) - cfqq = cfq_find_alloc_queue(cfqd, is_sync, cic, bio, gfp_mask); + cfqq = cfq_find_alloc_queue(cfqd, is_sync, cic, bio, gfp_mask); /* * pin the queue now that it's allocated, scheduler exit will prune it */ - if (!is_sync && !(*async_cfqq)) { + if (!is_sync) { cfqq->ref++; *async_cfqq = cfqq; } - +out: cfqq->ref++; return cfqq; } -- cgit v1.2.3 From 95e5d6f62693f27d9011ec307eb32c6126314ea3 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:54:58 -0700 Subject: cfq-iosched: fix async oom queue handling Async cfqq's (cfq_queue's) are shared across cfq_data. When cfq_get_queue() obtains a new queue from cfq_find_alloc_queue(), it stashes the pointer in cfq_data and reuses it from then on; however, the function doesn't consider that cfq_find_alloc_queue() may return the oom_cfqq under memory pressure and installs the returned queue unconditionally. If the oom_cfqq is installed as an async cfqq, cfq_set_request() will continue calling cfq_get_queue() hoping to replace it with a proper queue; however, cfq_get_queue() will keep returning the cached queue for the slot - the oom_cfqq. Fix it by skipping caching if the queue is the oom one.
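For illustration, here is a minimal, self-contained user-space sketch of the failure mode (names are made up, not the kernel's): a shared fallback object must never be installed in the per-slot cache, or the slot can never recover from a transient allocation failure.

#include <stdio.h>
#include <stdlib.h>

struct queue { int ref; };

static struct queue oom_q = { .ref = 1 };	/* permanent shared fallback */
static struct queue *async_slot;		/* cached per-slot queue */

static struct queue *get_queue(void)
{
	struct queue *q;

	if (async_slot)
		return async_slot;	/* cache hit: allocation never retried */

	q = calloc(1, sizeof(*q));
	if (!q)
		q = &oom_q;		/* degrade gracefully under pressure */

	/* the fix: never install the fallback in the cache slot, so a
	 * later call can attempt a real allocation again */
	if (q != &oom_q)
		async_slot = q;
	q->ref++;
	return q;
}

int main(void)
{
	printf("fell back to oom queue: %d\n", get_queue() == &oom_q);
	return 0;
}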
Signed-off-by: Tejun Heo Acked-by: Jeff Moyer Cc: Vivek Goyal Cc: Arianna Avanzini Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index e91093d0c0eb..baf0b706169d 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -3751,7 +3751,7 @@ cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic, /* * pin the queue now that it's allocated, scheduler exit will prune it */ - if (!is_sync) { + if (!is_sync && cfqq != &cfqd->oom_cfqq) { cfqq->ref++; *async_cfqq = cfqq; } -- cgit v1.2.3 From bce6133b09013f70d41a678d262a12147ed43889 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:54:59 -0700 Subject: cfq-iosched: fix oom cfq_queue ref leak in cfq_set_request() If the cfq_queue cached in cfq_io_cq is the oom one, cfq_set_request() replaces it by invoking cfq_get_queue() again without putting the oom queue, leaking the reference it was holding. While oom queues are not released through reference counting, they're still reference counted and this can theoretically lead to the reference count overflowing and incorrectly invoking the usual release path on it. Fix it by making cfq_set_request() put the ref it was holding. Signed-off-by: Tejun Heo Acked-by: Jeff Moyer Cc: Vivek Goyal Cc: Arianna Avanzini Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index baf0b706169d..8acfb7a3b2ba 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -4299,6 +4299,8 @@ cfq_set_request(struct request_queue *q, struct request *rq, struct bio *bio, new_queue: cfqq = cic_to_cfqq(cic, is_sync); if (!cfqq || cfqq == &cfqd->oom_cfqq) { + if (cfqq) + cfq_put_queue(cfqq); cfqq = cfq_get_queue(cfqd, is_sync, cic, bio, gfp_mask); cic_set_cfqq(cic, cfqq, is_sync); } else { -- cgit v1.2.3 From 563180a44b7d7978f44e9776eedfbbc550c2398d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:00 -0700 Subject: cfq-iosched: minor cleanups * Some code was accessing cic->cfqq[] directly. Always use cic_to_cfqq() and cic_set_cfqq(). * check_ioprio_changed() doesn't need to verify cfq_get_queue()'s return for NULL. It's always non-NULL. Simplify accordingly. This patch doesn't cause any functional changes.
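For reference, the accessor pair the cleanup standardizes on is shaped roughly like this (a sketch of the cfq-iosched.c helpers; the bool selects the sync or async element of the two-entry array):

static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_cq *cic, bool is_sync)
{
	return cic->cfqq[is_sync];	/* the bool doubles as the index */
}

static inline void cic_set_cfqq(struct cfq_io_cq *cic, struct cfq_queue *cfqq,
				bool is_sync)
{
	cic->cfqq[is_sync] = cfqq;
}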
Signed-off-by: Tejun Heo Acked-by: Jeff Moyer Cc: Vivek Goyal Cc: Arianna Avanzini Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 8acfb7a3b2ba..bf6f49cca311 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -3506,14 +3506,14 @@ static void cfq_exit_icq(struct io_cq *icq) struct cfq_io_cq *cic = icq_to_cic(icq); struct cfq_data *cfqd = cic_to_cfqd(cic); - if (cic->cfqq[BLK_RW_ASYNC]) { - cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_ASYNC]); - cic->cfqq[BLK_RW_ASYNC] = NULL; + if (cic_to_cfqq(cic, false)) { + cfq_exit_cfqq(cfqd, cic_to_cfqq(cic, false)); + cic_set_cfqq(cic, NULL, false); } - if (cic->cfqq[BLK_RW_SYNC]) { - cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_SYNC]); - cic->cfqq[BLK_RW_SYNC] = NULL; + if (cic_to_cfqq(cic, true)) { + cfq_exit_cfqq(cfqd, cic_to_cfqq(cic, true)); + cic_set_cfqq(cic, NULL, true); } } @@ -3572,18 +3572,14 @@ static void check_ioprio_changed(struct cfq_io_cq *cic, struct bio *bio) if (unlikely(!cfqd) || likely(cic->ioprio == ioprio)) return; - cfqq = cic->cfqq[BLK_RW_ASYNC]; + cfqq = cic_to_cfqq(cic, false); if (cfqq) { - struct cfq_queue *new_cfqq; - new_cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic, bio, - GFP_ATOMIC); - if (new_cfqq) { - cic->cfqq[BLK_RW_ASYNC] = new_cfqq; - cfq_put_queue(cfqq); - } + cfq_put_queue(cfqq); + cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic, bio, GFP_ATOMIC); + cic_set_cfqq(cic, cfqq, false); } - cfqq = cic->cfqq[BLK_RW_SYNC]; + cfqq = cic_to_cfqq(cic, true); if (cfqq) cfq_mark_cfqq_prio_changed(cfqq); -- cgit v1.2.3 From d93a11f1cd890d4ea72f7cef75fac56801b099b3 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:01 -0700 Subject: blkcg, cfq-iosched: use GFP_NOWAIT instead of GFP_ATOMIC for non-critical allocations blkcg performs several allocations to track IOs per cgroup and enforce resource control. Most of these allocations are performed lazily on demand in the IO path and thus can't involve the reclaim path. Currently, these allocations use GFP_ATOMIC; however, blkcg can gracefully deal with occasional failures of these allocations by punting IOs to the root cgroup and there's no reason to reach into the emergency reserve. This patch replaces GFP_ATOMIC with GFP_NOWAIT for the following allocations. * bdi_writeback_congested and blkcg_gq allocations in blkg_create(). * radix tree node allocations for blkcg->blkg_tree. * cfq_queue allocation on ioprio changes. Signed-off-by: Tejun Heo Suggested-and-Reviewed-by: Jeff Moyer Suggested-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 8 ++++---- block/cfq-iosched.c | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index d6283b3f5db5..1db904f95502 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -188,7 +188,7 @@ EXPORT_SYMBOL_GPL(blkg_lookup); /* * If @new_blkg is %NULL, this function tries to allocate a new one as - * necessary using %GFP_ATOMIC. @new_blkg is always consumed on return. + * necessary using %GFP_NOWAIT. @new_blkg is always consumed on return.
*/ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct request_queue *q, @@ -208,7 +208,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, } wb_congested = wb_congested_get_create(&q->backing_dev_info, - blkcg->css.id, GFP_ATOMIC); + blkcg->css.id, GFP_NOWAIT); if (!wb_congested) { ret = -ENOMEM; goto err_put_css; } /* allocate */ if (!new_blkg) { - new_blkg = blkg_alloc(blkcg, q, GFP_ATOMIC); + new_blkg = blkg_alloc(blkcg, q, GFP_NOWAIT); if (unlikely(!new_blkg)) { ret = -ENOMEM; goto err_put_congested; } @@ -882,7 +882,7 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css) done: spin_lock_init(&blkcg->lock); - INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_ATOMIC); + INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_NOWAIT); INIT_HLIST_HEAD(&blkcg->blkg_list); #ifdef CONFIG_CGROUP_WRITEBACK INIT_LIST_HEAD(&blkcg->cgwb_list); diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index bf6f49cca311..5f119292a254 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -3575,7 +3575,7 @@ static void check_ioprio_changed(struct cfq_io_cq *cic, struct bio *bio) cfqq = cic_to_cfqq(cic, false); if (cfqq) { cfq_put_queue(cfqq); - cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic, bio, GFP_ATOMIC); + cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic, bio, GFP_NOWAIT); cic_set_cfqq(cic, cfqq, false); } -- cgit v1.2.3 From 2da8de0bb799bf2bdfa893e5a1e294eb6bafba62 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:02 -0700 Subject: cfq-iosched: remove @gfp_mask from cfq_find_alloc_queue() Even when allocations fail, cfq_find_alloc_queue() always returns a valid cfq_queue by falling back to the oom cfq_queue. As such, there isn't much point in taking @gfp_mask and trying "harder" if __GFP_WAIT is set. GFP_NOWAIT allocations don't fail often and even when they do the degraded behavior is acceptable and temporary. After all, the only reason get_request(), which ultimately determines the gfp_mask, cares about __GFP_WAIT is to guarantee request allocation, assuming IO forward progress, for callers which are willing to wait. There's no reason for cfq_find_alloc_queue() to behave differently on __GFP_WAIT when it already has a fallback mechanism. Remove @gfp_mask from cfq_find_alloc_queue() and propagate the changes to its callers. This simplifies the function quite a bit and will help make async queues per-cfq_group. v2: Updated to reflect GFP_ATOMIC -> GFP_NOWAIT.
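The distinction being relied on: GFP_ATOMIC may dip into the emergency memory reserves, while GFP_NOWAIT simply fails fast without entering reclaim. Since every caller here can fall back to the oom queue, the allocation sites end up shaped like the sketch below.

	/* sketch: fail-fast allocation with a graceful fallback; no
	 * emergency reserves are touched and reclaim is never entered */
	cfqq = kmem_cache_alloc_node(cfq_pool, GFP_NOWAIT | __GFP_ZERO,
				     cfqd->queue->node);
	if (!cfqq)
		cfqq = &cfqd->oom_cfqq;	/* temporary degraded mode */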
Signed-off-by: Tejun Heo Reviewed-by: Jeff Moyer Cc: Vivek Goyal Cc: Arianna Avanzini Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 45 ++++++++++----------------------------------- 1 file changed, 10 insertions(+), 35 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 5f119292a254..146b03d64b7e 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -883,8 +883,7 @@ static inline int cfqg_busy_async_queues(struct cfq_data *cfqd, static void cfq_dispatch_insert(struct request_queue *, struct request *); static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, bool is_sync, - struct cfq_io_cq *cic, struct bio *bio, - gfp_t gfp_mask); + struct cfq_io_cq *cic, struct bio *bio); static inline struct cfq_io_cq *icq_to_cic(struct io_cq *icq) { @@ -3575,7 +3574,7 @@ static void check_ioprio_changed(struct cfq_io_cq *cic, struct bio *bio) cfqq = cic_to_cfqq(cic, false); if (cfqq) { cfq_put_queue(cfqq); - cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic, bio, GFP_NOWAIT); + cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic, bio); cic_set_cfqq(cic, cfqq, false); } @@ -3643,13 +3642,12 @@ static inline void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) { static struct cfq_queue * cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic, - struct bio *bio, gfp_t gfp_mask) + struct bio *bio) { struct blkcg *blkcg; - struct cfq_queue *cfqq, *new_cfqq = NULL; + struct cfq_queue *cfqq; struct cfq_group *cfqg; -retry: rcu_read_lock(); blkcg = bio_blkcg(bio); @@ -3666,27 +3664,9 @@ retry: * originally, since it should just be a temporary situation. */ if (!cfqq || cfqq == &cfqd->oom_cfqq) { - cfqq = NULL; - if (new_cfqq) { - cfqq = new_cfqq; - new_cfqq = NULL; - } else if (gfp_mask & __GFP_WAIT) { - rcu_read_unlock(); - spin_unlock_irq(cfqd->queue->queue_lock); - new_cfqq = kmem_cache_alloc_node(cfq_pool, - gfp_mask | __GFP_ZERO, - cfqd->queue->node); - spin_lock_irq(cfqd->queue->queue_lock); - if (new_cfqq) - goto retry; - else - return &cfqd->oom_cfqq; - } else { - cfqq = kmem_cache_alloc_node(cfq_pool, - gfp_mask | __GFP_ZERO, - cfqd->queue->node); - } - + cfqq = kmem_cache_alloc_node(cfq_pool, + GFP_NOWAIT | __GFP_ZERO, + cfqd->queue->node); if (cfqq) { cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync); cfq_init_prio_data(cfqq, cic); @@ -3696,9 +3676,6 @@ retry: cfqq = &cfqd->oom_cfqq; } out: - if (new_cfqq) - kmem_cache_free(cfq_pool, new_cfqq); - rcu_read_unlock(); return cfqq; } @@ -3723,7 +3700,7 @@ cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio) static struct cfq_queue * cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic, - struct bio *bio, gfp_t gfp_mask) + struct bio *bio) { int ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio); int ioprio = IOPRIO_PRIO_DATA(cic->ioprio); @@ -3742,7 +3719,7 @@ cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic, goto out; } - cfqq = cfq_find_alloc_queue(cfqd, is_sync, cic, bio, gfp_mask); + cfqq = cfq_find_alloc_queue(cfqd, is_sync, cic, bio); /* * pin the queue now that it's allocated, scheduler exit will prune it @@ -4286,8 +4263,6 @@ cfq_set_request(struct request_queue *q, struct request *rq, struct bio *bio, const bool is_sync = rq_is_sync(rq); struct cfq_queue *cfqq; - might_sleep_if(gfp_mask & __GFP_WAIT); - spin_lock_irq(q->queue_lock); check_ioprio_changed(cic, bio); @@ -4297,7 +4272,7 @@ new_queue: if (!cfqq || cfqq == &cfqd->oom_cfqq) { if (cfqq) cfq_put_queue(cfqq); - cfqq = cfq_get_queue(cfqd, 
is_sync, cic, bio, gfp_mask); + cfqq = cfq_get_queue(cfqd, is_sync, cic, bio); cic_set_cfqq(cic, cfqq, is_sync); } else { /* -- cgit v1.2.3 From 322731ed0dd2d8a7f11307e0444257f48580a0de Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:03 -0700 Subject: cfq-iosched: move cfq_group determination from cfq_find_alloc_queue() to cfq_get_queue() This is necessary for making async cfq_queues per-cfq_group instead of per-cfq_data. While this change makes cfq_get_queue() perform RCU locking and look up cfq_group even when it reuses an async queue, the extra overhead is extremely unlikely to be noticeable given that this is already sitting behind the cic->cfqq[] cache and the overall cost of cfq operation. Signed-off-by: Tejun Heo Reviewed-by: Jeff Moyer Cc: Vivek Goyal Cc: Arianna Avanzini Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 146b03d64b7e..f3ea8a198def 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -3641,21 +3641,10 @@ static inline void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) { #endif /* CONFIG_CFQ_GROUP_IOSCHED */ static struct cfq_queue * -cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic, - struct bio *bio) +cfq_find_alloc_queue(struct cfq_data *cfqd, struct cfq_group *cfqg, bool is_sync, + struct cfq_io_cq *cic, struct bio *bio) { - struct blkcg *blkcg; struct cfq_queue *cfqq; - struct cfq_group *cfqg; - - rcu_read_lock(); - - blkcg = bio_blkcg(bio); - cfqg = cfq_lookup_create_cfqg(cfqd, blkcg); - if (!cfqg) { - cfqq = &cfqd->oom_cfqq; - goto out; - } cfqq = cic_to_cfqq(cic, is_sync); @@ -3675,8 +3664,6 @@ cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic, } else cfqq = &cfqd->oom_cfqq; } -out: - rcu_read_unlock(); return cfqq; } @@ -3706,6 +3693,14 @@ cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic, int ioprio = IOPRIO_PRIO_DATA(cic->ioprio); struct cfq_queue **async_cfqq; struct cfq_queue *cfqq; + struct cfq_group *cfqg; + + rcu_read_lock(); + cfqg = cfq_lookup_create_cfqg(cfqd, bio_blkcg(bio)); + if (!cfqg) { + cfqq = &cfqd->oom_cfqq; + goto out; + } if (!is_sync) { if (!ioprio_valid(cic->ioprio)) { @@ -3719,7 +3714,7 @@ cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic, goto out; } - cfqq = cfq_find_alloc_queue(cfqd, is_sync, cic, bio); + cfqq = cfq_find_alloc_queue(cfqd, cfqg, is_sync, cic, bio); /* * pin the queue now that it's allocated, scheduler exit will prune it @@ -3730,6 +3725,7 @@ cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic, } out: cfqq->ref++; + rcu_read_unlock(); return cfqq; } -- cgit v1.2.3 From d4aad7ff04dfd00f2a69356a48054d6be84dda31 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:04 -0700 Subject: cfq-iosched: fold cfq_find_alloc_queue() into cfq_get_queue() cfq_find_alloc_queue() checks whether a queue actually needs to be allocated, which is unnecessary as its sole caller, cfq_get_queue(), only calls it if so. Also, the oom queue fallback logic is scattered between cfq_get_queue() and cfq_find_alloc_queue(). There really isn't much going on in the latter and things can be made simpler by folding it into cfq_get_queue(). This patch collapses cfq_find_alloc_queue() into cfq_get_queue().
The change is fairly straightforward with one exception - async_cfqq is now initialized to NULL and the "!is_sync" test in the last if conditional is replaced with an "async_cfqq" test. This is because gcc (5.1.1) gets confused for some reason and warns that async_cfqq may be used uninitialized otherwise. Oh well, the code isn't necessarily worse this way. This patch doesn't cause any functional difference. v2: Updated to reflect GFP_ATOMIC -> GFP_NOWAIT. Signed-off-by: Tejun Heo Reviewed-by: Jeff Moyer Cc: Vivek Goyal Cc: Arianna Avanzini Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 49 +++++++++++++++---------------------------------- 1 file changed, 15 insertions(+), 34 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index f3ea8a198def..fa2a2f7dbecd 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -3640,33 +3640,6 @@ static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) static inline void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) { } #endif /* CONFIG_CFQ_GROUP_IOSCHED */ -static struct cfq_queue * -cfq_find_alloc_queue(struct cfq_data *cfqd, struct cfq_group *cfqg, bool is_sync, - struct cfq_io_cq *cic, struct bio *bio) -{ - struct cfq_queue *cfqq; - - cfqq = cic_to_cfqq(cic, is_sync); - - /* - * Always try a new alloc if we fell back to the OOM cfqq - * originally, since it should just be a temporary situation. - */ - if (!cfqq || cfqq == &cfqd->oom_cfqq) { - cfqq = kmem_cache_alloc_node(cfq_pool, - GFP_NOWAIT | __GFP_ZERO, - cfqd->queue->node); - if (cfqq) { - cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync); - cfq_init_prio_data(cfqq, cic); - cfq_link_cfqq_cfqg(cfqq, cfqg); - cfq_log_cfqq(cfqd, cfqq, "alloced"); - } else - cfqq = &cfqd->oom_cfqq; - } - return cfqq; -} - static struct cfq_queue ** cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio) { @@ -3691,7 +3664,7 @@ cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic, { int ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio); int ioprio = IOPRIO_PRIO_DATA(cic->ioprio); - struct cfq_queue **async_cfqq; + struct cfq_queue **async_cfqq = NULL; struct cfq_queue *cfqq; struct cfq_group *cfqg; @@ -3714,12 +3687,20 @@ cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic, goto out; } - cfqq = cfq_find_alloc_queue(cfqd, cfqg, is_sync, cic, bio); + cfqq = kmem_cache_alloc_node(cfq_pool, GFP_NOWAIT | __GFP_ZERO, + cfqd->queue->node); + if (!cfqq) { + cfqq = &cfqd->oom_cfqq; + goto out; + } + + cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync); + cfq_init_prio_data(cfqq, cic); + cfq_link_cfqq_cfqg(cfqq, cfqg); + cfq_log_cfqq(cfqd, cfqq, "alloced"); - /* - * pin the queue now that it's allocated, scheduler exit will prune it - */ - if (!is_sync && cfqq != &cfqd->oom_cfqq) { + if (async_cfqq) { + /* a new async queue is created, pin and remember */ cfqq->ref++; *async_cfqq = cfqq; } @@ -4469,7 +4450,7 @@ static int cfq_init_queue(struct request_queue *q, struct elevator_type *e) cfqd->prio_trees[i] = RB_ROOT; /* - * Our fallback cfqq if cfq_find_alloc_queue() runs into OOM issues. + * Our fallback cfqq if cfq_get_queue() runs into OOM issues. * Grab a permanent reference to it, so that the normal code flow * will not attempt to free it. oom_cfqq is linked to root_group * but shouldn't hold a reference as it'll never be unlinked.
Lose -- cgit v1.2.3 From 60a837077e2b5672ebbd4c8bd67ca443951bbc92 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:05 -0700 Subject: cfq-iosched: charge async IOs to the appropriate blkcg's instead of the root Up until now, all async IOs were queued to async queues which are shared across the whole request_queue, which means that blkcg resource control is completely void on async IOs including all writeback IOs. It was done this way because writeback didn't support cgroups and there was no way of telling which writeback IO belonged to which cgroup; however, writeback recently became cgroup aware and writeback bio's are sent down properly tagged with the blkcg's to charge them against. This patch makes async cfq_queues per-cfq_group instead of per-cfq_data so that each async IO is charged to the blkcg that it was tagged for instead of unconditionally attributing it to root. * cfq_data->async_cfqq and ->async_idle_cfqq are moved to cfq_group and alloc / destroy paths are updated accordingly. * cfq_link_cfqq_cfqg() no longer overrides @cfqg to root for async queues. * check_blkcg_changed() now also invalidates async queues as they no longer stay the same across cgroups. After this patch, cfq's proportional IO control through blkio.weight works correctly when cgroup writeback is in use. Signed-off-by: Tejun Heo Reviewed-by: Jeff Moyer Cc: Vivek Goyal Cc: Arianna Avanzini Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 85 ++++++++++++++++++++++++++--------------------------- 1 file changed, 42 insertions(+), 43 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index fa2a2f7dbecd..9c9ec7cc9f99 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -305,6 +305,11 @@ struct cfq_group { struct cfq_ttime ttime; struct cfqg_stats stats; /* stats for this cfqg */ struct cfqg_stats dead_stats; /* stats pushed from dead children */ + + /* async queue for each priority case */ + struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR]; + struct cfq_queue *async_idle_cfqq; + }; struct cfq_io_cq { @@ -370,12 +375,6 @@ struct cfq_data { struct cfq_queue *active_queue; struct cfq_io_cq *active_cic; - /* - * async queue for each priority case - */ - struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR]; - struct cfq_queue *async_idle_cfqq; - sector_t last_position; /* @@ -401,6 +400,7 @@ struct cfq_data { }; static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd); +static void cfq_put_queue(struct cfq_queue *cfqq); static struct cfq_rb_root *st_for(struct cfq_group *cfqg, enum wl_class_t class, @@ -1596,13 +1596,26 @@ static void cfq_pd_init(struct blkcg_gq *blkg) static void cfq_pd_offline(struct blkcg_gq *blkg) { + struct cfq_group *cfqg = blkg_to_cfqg(blkg); + int i; + + for (i = 0; i < IOPRIO_BE_NR; i++) { + if (cfqg->async_cfqq[0][i]) + cfq_put_queue(cfqg->async_cfqq[0][i]); + if (cfqg->async_cfqq[1][i]) + cfq_put_queue(cfqg->async_cfqq[1][i]); + } + + if (cfqg->async_idle_cfqq) + cfq_put_queue(cfqg->async_idle_cfqq); + /* * @blkg is going offline and will be ignored by * blkg_[rw]stat_recursive_sum(). Transfer stats to the parent so * that they don't get lost. If IOs complete after this point, the * stats for them will be lost. Oh well...
*/ - cfqg_stats_xfer_dead(blkg_to_cfqg(blkg)); + cfqg_stats_xfer_dead(cfqg); } /* offset delta from cfqg->stats to cfqg->dead_stats */ @@ -1665,10 +1678,6 @@ static struct cfq_group *cfq_lookup_create_cfqg(struct cfq_data *cfqd, static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) { - /* Currently, all async queues are mapped to root group */ - if (!cfq_cfqq_sync(cfqq)) - cfqg = cfqq->cfqd->root_group; - cfqq->cfqg = cfqg; /* cfqq reference on cfqg */ cfqg_get(cfqg); @@ -3609,7 +3618,7 @@ static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq, static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) { struct cfq_data *cfqd = cic_to_cfqd(cic); - struct cfq_queue *sync_cfqq; + struct cfq_queue *cfqq; uint64_t serial_nr; rcu_read_lock(); @@ -3623,15 +3632,22 @@ static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) if (unlikely(!cfqd) || likely(cic->blkcg_serial_nr == serial_nr)) return; - sync_cfqq = cic_to_cfqq(cic, 1); - if (sync_cfqq) { - /* - * Drop reference to sync queue. A new sync queue will be - * assigned in new group upon arrival of a fresh request. - */ - cfq_log_cfqq(cfqd, sync_cfqq, "changed cgroup"); - cic_set_cfqq(cic, NULL, 1); - cfq_put_queue(sync_cfqq); + /* + * Drop reference to queues. New queues will be assigned in new + * group upon arrival of fresh requests. + */ + cfqq = cic_to_cfqq(cic, false); + if (cfqq) { + cfq_log_cfqq(cfqd, cfqq, "changed cgroup"); + cic_set_cfqq(cic, NULL, false); + cfq_put_queue(cfqq); + } + + cfqq = cic_to_cfqq(cic, true); + if (cfqq) { + cfq_log_cfqq(cfqd, cfqq, "changed cgroup"); + cic_set_cfqq(cic, NULL, true); + cfq_put_queue(cfqq); } cic->blkcg_serial_nr = serial_nr; @@ -3641,18 +3657,18 @@ static inline void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) { #endif /* CONFIG_CFQ_GROUP_IOSCHED */ static struct cfq_queue ** -cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio) +cfq_async_queue_prio(struct cfq_group *cfqg, int ioprio_class, int ioprio) { switch (ioprio_class) { case IOPRIO_CLASS_RT: - return &cfqd->async_cfqq[0][ioprio]; + return &cfqg->async_cfqq[0][ioprio]; case IOPRIO_CLASS_NONE: ioprio = IOPRIO_NORM; /* fall through */ case IOPRIO_CLASS_BE: - return &cfqd->async_cfqq[1][ioprio]; + return &cfqg->async_cfqq[1][ioprio]; case IOPRIO_CLASS_IDLE: - return &cfqd->async_idle_cfqq; + return &cfqg->async_idle_cfqq; default: BUG(); } @@ -3681,7 +3697,7 @@ cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic, ioprio = task_nice_ioprio(tsk); ioprio_class = task_nice_ioclass(tsk); } - async_cfqq = cfq_async_queue_prio(cfqd, ioprio_class, ioprio); + async_cfqq = cfq_async_queue_prio(cfqg, ioprio_class, ioprio); cfqq = *async_cfqq; if (cfqq) goto out; @@ -4355,21 +4371,6 @@ static void cfq_shutdown_timer_wq(struct cfq_data *cfqd) cancel_work_sync(&cfqd->unplug_work); } -static void cfq_put_async_queues(struct cfq_data *cfqd) -{ - int i; - - for (i = 0; i < IOPRIO_BE_NR; i++) { - if (cfqd->async_cfqq[0][i]) - cfq_put_queue(cfqd->async_cfqq[0][i]); - if (cfqd->async_cfqq[1][i]) - cfq_put_queue(cfqd->async_cfqq[1][i]); - } - - if (cfqd->async_idle_cfqq) - cfq_put_queue(cfqd->async_idle_cfqq); -} - static void cfq_exit_queue(struct elevator_queue *e) { struct cfq_data *cfqd = e->elevator_data; @@ -4382,8 +4383,6 @@ static void cfq_exit_queue(struct elevator_queue *e) if (cfqd->active_queue) __cfq_slice_expired(cfqd, cfqd->active_queue, 0); - cfq_put_async_queues(cfqd); - spin_unlock_irq(q->queue_lock); 
cfq_shutdown_timer_wq(cfqd); -- cgit v1.2.3 From 994b78327458ea14a1743196ee0560c73ace37f3 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:07 -0700 Subject: blkcg: use blkg_free() in blkcg_init_queue() failure path When blkcg_init_queue() fails midway after creating a new blkg, it performs kfree() directly; however, this doesn't free the policy data areas. Make it use blkg_free() instead. In turn, blkg_free() is updated to handle the root request_list special case. While this fixes a possible memory leak, it's on an unlikely failure path of an already cold path and the size leaked per occurrence is minuscule too. I don't think it needs to be tagged for -stable. Signed-off-by: Tejun Heo Cc: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 1db904f95502..cd251830038a 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -70,7 +70,8 @@ static void blkg_free(struct blkcg_gq *blkg) for (i = 0; i < BLKCG_MAX_POLS; i++) kfree(blkg->pd[i]); - blk_exit_rl(&blkg->rl); + if (blkg->blkcg != &blkcg_root) + blk_exit_rl(&blkg->rl); kfree(blkg); } @@ -938,7 +939,7 @@ int blkcg_init_queue(struct request_queue *q) radix_tree_preload_end(); if (IS_ERR(blkg)) { - kfree(new_blkg); + blkg_free(new_blkg); return PTR_ERR(blkg); } -- cgit v1.2.3 From bc915e61cde25d0b429f536cec9e83039bf23504 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:08 -0700 Subject: blkcg: remove unnecessary blkcg_root handling from css_alloc/free paths blkcg_css_alloc() bypasses policy data allocation and blkcg_css_free() bypasses policy data and blkcg freeing for blkcg_root. There's no reason to treat policy data any differently for blkcg_root. If the root css gets allocated after policies are registered, the policy registration path will add policy data; otherwise, the alloc path will. The free path is never invoked for root csses. This patch removes the unnecessary special handling of blkcg_root from the css_alloc/free paths.
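Condensed, the resulting css_alloc flow keeps exactly one root special case (a sketch, not the full function):

	/* sketch: only the blkcg object itself differs for the root */
	if (!parent_css)
		blkcg = &blkcg_root;		/* statically allocated at boot */
	else
		blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
	/* from here on, per-policy data is set up identically for root and
	 * non-root: whichever of css_alloc and policy registration runs
	 * second allocates the missing entries, and css_free never runs
	 * for the root css */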
Signed-off-by: Tejun Heo Cc: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index cd251830038a..f91a4e09e0c9 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -823,18 +823,15 @@ static void blkcg_css_offline(struct cgroup_subsys_state *css) static void blkcg_css_free(struct cgroup_subsys_state *css) { struct blkcg *blkcg = css_to_blkcg(css); + int i; mutex_lock(&blkcg_pol_mutex); list_del(&blkcg->all_blkcgs_node); mutex_unlock(&blkcg_pol_mutex); - if (blkcg != &blkcg_root) { - int i; - - for (i = 0; i < BLKCG_MAX_POLS; i++) - kfree(blkcg->pd[i]); - kfree(blkcg); - } + for (i = 0; i < BLKCG_MAX_POLS; i++) + kfree(blkcg->pd[i]); + kfree(blkcg); } static struct cgroup_subsys_state * @@ -848,13 +845,12 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css) if (!parent_css) { blkcg = &blkcg_root; - goto done; - } - - blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL); - if (!blkcg) { - ret = ERR_PTR(-ENOMEM); - goto free_blkcg; + } else { + blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL); + if (!blkcg) { + ret = ERR_PTR(-ENOMEM); + goto free_blkcg; + } } for (i = 0; i < BLKCG_MAX_POLS ; i++) { @@ -881,7 +877,6 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css) pol->cpd_init_fn(blkcg); } -done: spin_lock_init(&blkcg->lock); INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_NOWAIT); INIT_HLIST_HEAD(&blkcg->blkg_list); -- cgit v1.2.3 From 4c55f4f9ad3001ac1fefdd8d8ca7641d18558e23 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:09 -0700 Subject: blkcg: restructure blkg_policy_data allocation in blkcg_activate_policy() When a policy gets activated, it needs to allocate and install its policy data on all existing blkg's (blkcg_gq's). Because blkg iteration is protected by a spinlock, it currently counts the total number of blkg's in the system, allocates the matching number of policy data on a list and installs them during a single iteration. This can be simplified by using speculative GFP_NOWAIT allocations while iterating and falling back to a preallocated policy data on failure. If the preallocated one has already been consumed, it releases the lock, preallocates with GFP_KERNEL and then restarts the iteration. This can be a bit more expensive than before but policy activation is a very cold path and shouldn't matter.
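The pattern generalizes beyond blkcg. A self-contained user-space sketch (hypothetical names; in the kernel the in-lock attempt uses GFP_NOWAIT and the out-of-lock refill uses GFP_KERNEL):

#include <stdlib.h>
#include <pthread.h>

struct item { struct item *next; void *pd; };	/* stand-in for a blkg */

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

static int activate_policy(struct item *head, size_t pd_size)
{
	void *prealloc = NULL;

retry:
	if (!prealloc) {
		prealloc = calloc(1, pd_size);	/* "may sleep": outside lock */
		if (!prealloc)
			return -1;
	}

	pthread_mutex_lock(&list_lock);
	for (struct item *it = head; it; it = it->next) {
		void *pd;

		if (it->pd)			/* done on an earlier pass */
			continue;
		pd = calloc(1, pd_size);	/* speculative, fail-fast */
		if (!pd) {
			pd = prealloc;		/* fall back to the prealloc */
			prealloc = NULL;
		}
		if (!pd) {			/* prealloc consumed earlier */
			pthread_mutex_unlock(&list_lock);
			goto retry;		/* refill, then rescan */
		}
		it->pd = pd;
	}
	pthread_mutex_unlock(&list_lock);
	free(prealloc);				/* may be NULL */
	return 0;
}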
Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 55 ++++++++++++++++++---------------------------- include/linux/blk-cgroup.h | 3 --- 2 files changed, 21 insertions(+), 37 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index f91a4e09e0c9..9e9b0df339ee 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1047,65 +1047,52 @@ EXPORT_SYMBOL_GPL(blkio_cgrp_subsys); int blkcg_activate_policy(struct request_queue *q, const struct blkcg_policy *pol) { - LIST_HEAD(pds); + struct blkg_policy_data *pd_prealloc = NULL; struct blkcg_gq *blkg; - struct blkg_policy_data *pd, *nd; - int cnt = 0, ret; + int ret; if (blkcg_policy_enabled(q, pol)) return 0; - /* count and allocate policy_data for all existing blkgs */ blk_queue_bypass_start(q); - spin_lock_irq(q->queue_lock); - list_for_each_entry(blkg, &q->blkg_list, q_node) - cnt++; - spin_unlock_irq(q->queue_lock); - - /* allocate per-blkg policy data for all existing blkgs */ - while (cnt--) { - pd = kzalloc_node(pol->pd_size, GFP_KERNEL, q->node); - if (!pd) { +pd_prealloc: + if (!pd_prealloc) { + pd_prealloc = kzalloc_node(pol->pd_size, GFP_KERNEL, q->node); + if (!pd_prealloc) { ret = -ENOMEM; - goto out_free; + goto out_bypass_end; } - list_add_tail(&pd->alloc_node, &pds); } - /* - * Install the allocated pds and cpds. With @q bypassing, no new blkg - * should have been created while the queue lock was dropped. - */ spin_lock_irq(q->queue_lock); list_for_each_entry(blkg, &q->blkg_list, q_node) { - if (WARN_ON(list_empty(&pds))) { - /* umm... this shouldn't happen, just abort */ - ret = -ENOMEM; - goto out_unlock; - } - pd = list_first_entry(&pds, struct blkg_policy_data, alloc_node); - list_del_init(&pd->alloc_node); + struct blkg_policy_data *pd; - /* grab blkcg lock too while installing @pd on @blkg */ - spin_lock(&blkg->blkcg->lock); + if (blkg->pd[pol->plid]) + continue; + + pd = kzalloc_node(pol->pd_size, GFP_NOWAIT, q->node); + if (!pd) + swap(pd, pd_prealloc); + if (!pd) { + spin_unlock_irq(q->queue_lock); + goto pd_prealloc; + } blkg->pd[pol->plid] = pd; pd->blkg = blkg; pd->plid = pol->plid; pol->pd_init_fn(blkg); - - spin_unlock(&blkg->blkcg->lock); } __set_bit(pol->plid, q->blkcg_pols); ret = 0; -out_unlock: + spin_unlock_irq(q->queue_lock); -out_free: +out_bypass_end: blk_queue_bypass_end(q); - list_for_each_entry_safe(pd, nd, &pds, alloc_node) - kfree(pd); + kfree(pd_prealloc); return ret; } EXPORT_SYMBOL_GPL(blkcg_activate_policy); diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 9711fc277c02..db822880242a 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -80,9 +80,6 @@ struct blkg_policy_data { /* the blkg and policy id this per-policy data belongs to */ struct blkcg_gq *blkg; int plid; - - /* used during policy activation */ - struct list_head alloc_node; }; /* -- cgit v1.2.3 From 3e41871046bfe0ba7d122a1f14f0c1db2dca0256 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:10 -0700 Subject: blkcg: make blkcg_activate_policy() allow NULL ->pd_init_fn blkg_create() allows NULL ->pd_init_fn() but blkcg_activate_policy() doesn't. As both in-kernel policies implement ->pd_init_fn, it currently doesn't break anything. Update blkcg_activate_policy() so that its behavior is consistent with blkg_create(). 
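The resulting call site is the usual optional-callback guard, in sketch form:

	/* invoke the method only if the policy provides one */
	if (pol->pd_init_fn)
		pol->pd_init_fn(blkg);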
Signed-off-by: Tejun Heo Cc: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 9e9b0df339ee..4defbbabc0ff 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1083,7 +1083,8 @@ pd_prealloc: blkg->pd[pol->plid] = pd; pd->blkg = blkg; pd->plid = pol->plid; - pol->pd_init_fn(blkg); + if (pol->pd_init_fn) + pol->pd_init_fn(blkg); } __set_bit(pol->plid, q->blkcg_pols); -- cgit v1.2.3 From 001bea73e70efdf48a9e00188cf302f6b6aed2bf Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:11 -0700 Subject: blkcg: replace blkcg_policy->pd_size with ->pd_alloc/free_fn() methods A blkg (blkcg_gq) represents the relationship between a cgroup and request_queue. Each active policy has a pd (blkg_policy_data) on each blkg. The pd's were allocated by blkcg core and each policy could request to allocate extra space at the end by setting blkcg_policy->pd_size larger than the size of pd. This is a bit unusual but was done this way mostly to simplify error handling and all the existing use cases could be handled this way; however, this is becoming too restrictive now that percpu memory can be allocated without blocking. This introduces two new mandatory blkcg_policy methods - pd_alloc_fn() and pd_free_fn() - which are used to allocate and release pd for a given policy. As pd allocation is now done from policy side, it can simply allocate a larger area which embeds pd at the beginning. This change makes ->pd_size pointless. Removed. Signed-off-by: Tejun Heo Cc: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 21 +++++++++++---------- block/blk-throttle.c | 13 ++++++++++++- block/cfq-iosched.c | 13 ++++++++++++- include/linux/blk-cgroup.h | 18 +++++++++--------- 4 files changed, 44 insertions(+), 21 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 4defbbabc0ff..d1bc6099bd1e 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -68,7 +68,8 @@ static void blkg_free(struct blkcg_gq *blkg) return; for (i = 0; i < BLKCG_MAX_POLS; i++) - kfree(blkg->pd[i]); + if (blkg->pd[i]) + blkcg_policy[i]->pd_free_fn(blkg->pd[i]); if (blkg->blkcg != &blkcg_root) blk_exit_rl(&blkg->rl); @@ -114,7 +115,7 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q, continue; /* alloc per-policy data and attach it to blkg */ - pd = kzalloc_node(pol->pd_size, gfp_mask, q->node); + pd = pol->pd_alloc_fn(gfp_mask, q->node); if (!pd) goto err_free; @@ -1057,7 +1058,7 @@ int blkcg_activate_policy(struct request_queue *q, blk_queue_bypass_start(q); pd_prealloc: if (!pd_prealloc) { - pd_prealloc = kzalloc_node(pol->pd_size, GFP_KERNEL, q->node); + pd_prealloc = pol->pd_alloc_fn(GFP_KERNEL, q->node); if (!pd_prealloc) { ret = -ENOMEM; goto out_bypass_end; @@ -1072,7 +1073,7 @@ pd_prealloc: if (blkg->pd[pol->plid]) continue; - pd = kzalloc_node(pol->pd_size, GFP_NOWAIT, q->node); + pd = pol->pd_alloc_fn(GFP_NOWAIT, q->node); if (!pd) swap(pd, pd_prealloc); if (!pd) { @@ -1093,7 +1094,8 @@ pd_prealloc: spin_unlock_irq(q->queue_lock); out_bypass_end: blk_queue_bypass_end(q); - kfree(pd_prealloc); + if (pd_prealloc) + pol->pd_free_fn(pd_prealloc); return ret; } EXPORT_SYMBOL_GPL(blkcg_activate_policy); @@ -1128,8 +1130,10 @@ void blkcg_deactivate_policy(struct request_queue *q, if (pol->pd_exit_fn) pol->pd_exit_fn(blkg); - kfree(blkg->pd[pol->plid]); - blkg->pd[pol->plid] = NULL; + if 
(blkg->pd[pol->plid]) { + pol->pd_free_fn(blkg->pd[pol->plid]); + blkg->pd[pol->plid] = NULL; + } spin_unlock(&blkg->blkcg->lock); } @@ -1151,9 +1155,6 @@ int blkcg_policy_register(struct blkcg_policy *pol) struct blkcg *blkcg; int i, ret; - if (WARN_ON(pol->pd_size < sizeof(struct blkg_policy_data))) - return -EINVAL; - mutex_lock(&blkcg_pol_register_mutex); mutex_lock(&blkcg_pol_mutex); diff --git a/block/blk-throttle.c b/block/blk-throttle.c index b23193518ac7..f1dd691c5359 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -403,6 +403,11 @@ static void throtl_service_queue_exit(struct throtl_service_queue *sq) del_timer_sync(&sq->pending_timer); } +static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node) +{ + return kzalloc_node(sizeof(struct throtl_grp), gfp, node); +} + static void throtl_pd_init(struct blkcg_gq *blkg) { struct throtl_grp *tg = blkg_to_tg(blkg); @@ -493,6 +498,11 @@ static void throtl_pd_exit(struct blkcg_gq *blkg) throtl_service_queue_exit(&tg->service_queue); } +static void throtl_pd_free(struct blkg_policy_data *pd) +{ + kfree(pd); +} + static void throtl_pd_reset_stats(struct blkcg_gq *blkg) { struct throtl_grp *tg = blkg_to_tg(blkg); @@ -1468,12 +1478,13 @@ static void throtl_shutdown_wq(struct request_queue *q) } static struct blkcg_policy blkcg_policy_throtl = { - .pd_size = sizeof(struct throtl_grp), .cftypes = throtl_files, + .pd_alloc_fn = throtl_pd_alloc, .pd_init_fn = throtl_pd_init, .pd_online_fn = throtl_pd_online, .pd_exit_fn = throtl_pd_exit, + .pd_free_fn = throtl_pd_free, .pd_reset_stats_fn = throtl_pd_reset_stats, }; diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 9c9ec7cc9f99..69ce2883099e 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1582,6 +1582,11 @@ static void cfq_cpd_init(const struct blkcg *blkcg) } } +static struct blkg_policy_data *cfq_pd_alloc(gfp_t gfp, int node) +{ + return kzalloc_node(sizeof(struct cfq_group), gfp, node); +} + static void cfq_pd_init(struct blkcg_gq *blkg) { struct cfq_group *cfqg = blkg_to_cfqg(blkg); @@ -1618,6 +1623,11 @@ static void cfq_pd_offline(struct blkcg_gq *blkg) cfqg_stats_xfer_dead(cfqg); } +static void cfq_pd_free(struct blkg_policy_data *pd) +{ + return kfree(pd); +} + /* offset delta from cfqg->stats to cfqg->dead_stats */ static const int dead_stats_off_delta = offsetof(struct cfq_group, dead_stats) - offsetof(struct cfq_group, stats); @@ -4633,13 +4643,14 @@ static struct elevator_type iosched_cfq = { #ifdef CONFIG_CFQ_GROUP_IOSCHED static struct blkcg_policy blkcg_policy_cfq = { - .pd_size = sizeof(struct cfq_group), .cpd_size = sizeof(struct cfq_group_data), .cftypes = cfq_blkcg_files, .cpd_init_fn = cfq_cpd_init, + .pd_alloc_fn = cfq_pd_alloc, .pd_init_fn = cfq_pd_init, .pd_offline_fn = cfq_pd_offline, + .pd_free_fn = cfq_pd_free, .pd_reset_stats_fn = cfq_pd_reset_stats, }; #endif diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index db822880242a..bd173ea360ce 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -68,13 +68,11 @@ struct blkg_rwstat { * request_queue (q). This is used by blkcg policies which need to track * information per blkcg - q pair. * - * There can be multiple active blkcg policies and each has its private - * data on each blkg, the size of which is determined by - * blkcg_policy->pd_size. blkcg core allocates and frees such areas - * together with blkg and invokes pd_init/exit_fn() methods. 
- * - * Such private data must embed struct blkg_policy_data (pd) at the - * beginning and pd_size can't be smaller than pd. + * There can be multiple active blkcg policies and each blkg:policy pair is + * represented by a blkg_policy_data which is allocated and freed by each + * policy's pd_alloc/free_fn() methods. A policy can allocate private data + * area by allocating larger data structure which embeds blkg_policy_data + * at the beginning. */ struct blkg_policy_data { /* the blkg and policy id this per-policy data belongs to */ struct blkcg_gq *blkg; int plid; @@ -126,16 +124,16 @@ struct blkcg_gq { }; typedef void (blkcg_pol_init_cpd_fn)(const struct blkcg *blkcg); +typedef struct blkg_policy_data *(blkcg_pol_alloc_pd_fn)(gfp_t gfp, int node); typedef void (blkcg_pol_init_pd_fn)(struct blkcg_gq *blkg); typedef void (blkcg_pol_online_pd_fn)(struct blkcg_gq *blkg); typedef void (blkcg_pol_offline_pd_fn)(struct blkcg_gq *blkg); typedef void (blkcg_pol_exit_pd_fn)(struct blkcg_gq *blkg); +typedef void (blkcg_pol_free_pd_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkcg_gq *blkg); struct blkcg_policy { int plid; - /* policy specific private data size */ - size_t pd_size; /* policy specific per-blkcg data size */ size_t cpd_size; /* cgroup files for the policy */ @@ -143,10 +141,12 @@ struct blkcg_policy { /* operations */ blkcg_pol_init_cpd_fn *cpd_init_fn; + blkcg_pol_alloc_pd_fn *pd_alloc_fn; blkcg_pol_init_pd_fn *pd_init_fn; blkcg_pol_online_pd_fn *pd_online_fn; blkcg_pol_offline_pd_fn *pd_offline_fn; blkcg_pol_exit_pd_fn *pd_exit_fn; + blkcg_pol_free_pd_fn *pd_free_fn; blkcg_pol_reset_pd_stats_fn *pd_reset_stats_fn; }; -- cgit v1.2.3 From 4fb72036fbf9c28de7a64b1d3f19b4ce9da1c6bf Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:12 -0700 Subject: blk-throttle: remove asynchronous percpu stats allocation mechanism Because the percpu allocator couldn't do non-blocking allocations, blk-throttle was forced to implement an ad-hoc asynchronous allocation mechanism for its percpu stats for cases where blkg's (blkcg_gq's) are allocated from an IO path without sleepable context. Now that the percpu allocator can handle gfp_mask and blkg_policy_data alloc / free are handled by policy methods, the ad-hoc asynchronous allocation mechanism can be replaced with direct allocation from tg_stats_alloc_fn(). Rip it out. This ensures that an active throtl_grp always has valid non-NULL ->stats_cpu. Remove checks on it.
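The enabler is that the percpu allocator now accepts a gfp mask, so the stats can be allocated fail-fast right in ->pd_alloc_fn() (a sketch of the call shape; gfp is GFP_NOWAIT when coming from the IO path, GFP_KERNEL otherwise):

	/* sketch: direct, possibly non-blocking percpu allocation */
	tg->stats_cpu = alloc_percpu_gfp(struct tg_stats_cpu, gfp);
	if (!tg->stats_cpu) {
		kfree(tg);
		return NULL;	/* the whole pd allocation fails cleanly */
	}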
Signed-off-by: Tejun Heo Cc: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-throttle.c | 112 ++++++++++++--------------------------------------- 1 file changed, 25 insertions(+), 87 deletions(-) (limited to 'block') diff --git a/block/blk-throttle.c b/block/blk-throttle.c index f1dd691c5359..3c869768cfdd 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -144,9 +144,6 @@ struct throtl_grp { /* Per cpu stats pointer */ struct tg_stats_cpu __percpu *stats_cpu; - - /* List of tgs waiting for per cpu stats memory to be allocated */ - struct list_head stats_alloc_node; }; struct throtl_data @@ -168,13 +165,6 @@ struct throtl_data struct work_struct dispatch_work; }; -/* list and work item to allocate percpu group stats */ -static DEFINE_SPINLOCK(tg_stats_alloc_lock); -static LIST_HEAD(tg_stats_alloc_list); - -static void tg_stats_alloc_fn(struct work_struct *); -static DECLARE_DELAYED_WORK(tg_stats_alloc_work, tg_stats_alloc_fn); - static void throtl_pending_timer_fn(unsigned long arg); static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd) @@ -256,53 +246,6 @@ static struct throtl_data *sq_to_td(struct throtl_service_queue *sq) } \ } while (0) -static void tg_stats_init(struct tg_stats_cpu *tg_stats) -{ - blkg_rwstat_init(&tg_stats->service_bytes); - blkg_rwstat_init(&tg_stats->serviced); -} - -/* - * Worker for allocating per cpu stat for tgs. This is scheduled on the - * system_wq once there are some groups on the alloc_list waiting for - * allocation. - */ -static void tg_stats_alloc_fn(struct work_struct *work) -{ - static struct tg_stats_cpu *stats_cpu; /* this fn is non-reentrant */ - struct delayed_work *dwork = to_delayed_work(work); - bool empty = false; - -alloc_stats: - if (!stats_cpu) { - int cpu; - - stats_cpu = alloc_percpu(struct tg_stats_cpu); - if (!stats_cpu) { - /* allocation failed, try again after some time */ - schedule_delayed_work(dwork, msecs_to_jiffies(10)); - return; - } - for_each_possible_cpu(cpu) - tg_stats_init(per_cpu_ptr(stats_cpu, cpu)); - } - - spin_lock_irq(&tg_stats_alloc_lock); - - if (!list_empty(&tg_stats_alloc_list)) { - struct throtl_grp *tg = list_first_entry(&tg_stats_alloc_list, - struct throtl_grp, - stats_alloc_node); - swap(tg->stats_cpu, stats_cpu); - list_del_init(&tg->stats_alloc_node); - } - - empty = list_empty(&tg_stats_alloc_list); - spin_unlock_irq(&tg_stats_alloc_lock); - if (!empty) - goto alloc_stats; -} - static void throtl_qnode_init(struct throtl_qnode *qn, struct throtl_grp *tg) { INIT_LIST_HEAD(&qn->node); @@ -405,7 +348,27 @@ static void throtl_service_queue_exit(struct throtl_service_queue *sq) static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node) { - return kzalloc_node(sizeof(struct throtl_grp), gfp, node); + struct throtl_grp *tg; + int cpu; + + tg = kzalloc_node(sizeof(*tg), gfp, node); + if (!tg) + return NULL; + + tg->stats_cpu = alloc_percpu_gfp(struct tg_stats_cpu, gfp); + if (!tg->stats_cpu) { + kfree(tg); + return NULL; + } + + for_each_possible_cpu(cpu) { + struct tg_stats_cpu *stats_cpu = per_cpu_ptr(tg->stats_cpu, cpu); + + blkg_rwstat_init(&stats_cpu->service_bytes); + blkg_rwstat_init(&stats_cpu->serviced); + } + + return &tg->pd; } static void throtl_pd_init(struct blkcg_gq *blkg) @@ -413,7 +376,6 @@ static void throtl_pd_init(struct blkcg_gq *blkg) struct throtl_grp *tg = blkg_to_tg(blkg); struct throtl_data *td = blkg->q->td; struct throtl_service_queue *parent_sq; - unsigned long flags; int rw; /* @@ -448,16 +410,6 @@ static void throtl_pd_init(struct blkcg_gq 
*blkg) tg->bps[WRITE] = -1; tg->iops[READ] = -1; tg->iops[WRITE] = -1; - - /* - * Ugh... We need to perform per-cpu allocation for tg->stats_cpu - * but percpu allocator can't be called from IO path. Queue tg on - * tg_stats_alloc_list and allocate from work item. - */ - spin_lock_irqsave(&tg_stats_alloc_lock, flags); - list_add(&tg->stats_alloc_node, &tg_stats_alloc_list); - schedule_delayed_work(&tg_stats_alloc_work, 0); - spin_unlock_irqrestore(&tg_stats_alloc_lock, flags); } /* @@ -487,20 +439,16 @@ static void throtl_pd_online(struct blkcg_gq *blkg) static void throtl_pd_exit(struct blkcg_gq *blkg) { struct throtl_grp *tg = blkg_to_tg(blkg); - unsigned long flags; - - spin_lock_irqsave(&tg_stats_alloc_lock, flags); - list_del_init(&tg->stats_alloc_node); - spin_unlock_irqrestore(&tg_stats_alloc_lock, flags); - - free_percpu(tg->stats_cpu); throtl_service_queue_exit(&tg->service_queue); } static void throtl_pd_free(struct blkg_policy_data *pd) { - kfree(pd); + struct throtl_grp *tg = pd_to_tg(pd); + + free_percpu(tg->stats_cpu); + kfree(tg); } static void throtl_pd_reset_stats(struct blkcg_gq *blkg) @@ -508,9 +456,6 @@ static void throtl_pd_reset_stats(struct blkcg_gq *blkg) struct throtl_grp *tg = blkg_to_tg(blkg); int cpu; - if (tg->stats_cpu == NULL) - return; - for_each_possible_cpu(cpu) { struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu); @@ -973,10 +918,6 @@ static void throtl_update_dispatch_stats(struct blkcg_gq *blkg, u64 bytes, struct tg_stats_cpu *stats_cpu; unsigned long flags; - /* If per cpu stats are not allocated yet, don't do any accounting. */ - if (tg->stats_cpu == NULL) - return; - /* * Disabling interrupts to provide mutual exclusion between two * writes on same cpu. It probably is not needed for 64bit. Not @@ -1302,9 +1243,6 @@ static u64 tg_prfill_cpu_rwstat(struct seq_file *sf, struct blkg_rwstat rwstat = { }, tmp; int i, cpu; - if (tg->stats_cpu == NULL) - return 0; - for_each_possible_cpu(cpu) { struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu); -- cgit v1.2.3 From b2ce2643cc705aa9043642d7b6248ccfd8e20629 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:13 -0700 Subject: blk-throttle: clean up blkg_policy_data alloc/init/exit/free methods With the recent addition of alloc and free methods, things became messier. This patch reorganizes them as follows. * ->pd_alloc_fn() Responsible for allocation and static initializations - the ones which can be done independent of where the pd might be attached. * ->pd_init_fn() Initializations which require the knowledge of where the pd is attached. * ->pd_free_fn() The counterpart of pd_alloc_fn(). Static de-init and freeing. This leaves ->pd_exit_fn() without any users. Removed. While at it, collapse a one-liner function throtl_pd_exit(), which has only one user, into its user.
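A hypothetical policy following this convention would be shaped like the sketch below (foo_* names are made up for illustration; the pd must be embedded in the policy's own struct so blkg_to_foo()/pd_to_foo() can map back):

static struct blkg_policy_data *foo_pd_alloc(gfp_t gfp, int node)
{
	struct foo_grp *fg = kzalloc_node(sizeof(*fg), gfp, node);

	if (!fg)
		return NULL;
	foo_init_defaults(fg);		/* placement-independent setup only */
	return &fg->pd;
}

static void foo_pd_init(struct blkcg_gq *blkg)
{
	/* setup that needs to know where the pd ended up attached */
	blkg_to_foo(blkg)->parent = foo_parent_of(blkg);
}

static void foo_pd_free(struct blkg_policy_data *pd)
{
	kfree(pd_to_foo(pd));		/* static de-init, undoing pd_alloc */
}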
Signed-off-by: Tejun Heo Cc: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 11 --------- block/blk-throttle.c | 57 ++++++++++++++++------------------------------ block/cfq-iosched.c | 15 ++++++++---- include/linux/blk-cgroup.h | 2 -- 4 files changed, 31 insertions(+), 54 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index d1bc6099bd1e..acfb09af58a5 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -402,15 +402,6 @@ static void blkg_destroy_all(struct request_queue *q) void __blkg_release_rcu(struct rcu_head *rcu_head) { struct blkcg_gq *blkg = container_of(rcu_head, struct blkcg_gq, rcu_head); - int i; - - /* tell policies that this one is being freed */ - for (i = 0; i < BLKCG_MAX_POLS; i++) { - struct blkcg_policy *pol = blkcg_policy[i]; - - if (blkg->pd[i] && pol->pd_exit_fn) - pol->pd_exit_fn(blkg); - } /* release the blkcg and parent blkg refs this blkg has been holding */ css_put(&blkg->blkcg->css); @@ -1127,8 +1118,6 @@ void blkcg_deactivate_policy(struct request_queue *q, if (pol->pd_offline_fn) pol->pd_offline_fn(blkg); - if (pol->pd_exit_fn) - pol->pd_exit_fn(blkg); if (blkg->pd[pol->plid]) { pol->pd_free_fn(blkg->pd[pol->plid]); diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 3c869768cfdd..c3a235b8ec7e 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -330,26 +330,19 @@ static struct bio *throtl_pop_queued(struct list_head *queued, } /* init a service_queue, assumes the caller zeroed it */ -static void throtl_service_queue_init(struct throtl_service_queue *sq, - struct throtl_service_queue *parent_sq) +static void throtl_service_queue_init(struct throtl_service_queue *sq) { INIT_LIST_HEAD(&sq->queued[0]); INIT_LIST_HEAD(&sq->queued[1]); sq->pending_tree = RB_ROOT; - sq->parent_sq = parent_sq; setup_timer(&sq->pending_timer, throtl_pending_timer_fn, (unsigned long)sq); } -static void throtl_service_queue_exit(struct throtl_service_queue *sq) -{ - del_timer_sync(&sq->pending_timer); -} - static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node) { struct throtl_grp *tg; - int cpu; + int rw, cpu; tg = kzalloc_node(sizeof(*tg), gfp, node); if (!tg) @@ -361,6 +354,19 @@ static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node) return NULL; } + throtl_service_queue_init(&tg->service_queue); + + for (rw = READ; rw <= WRITE; rw++) { + throtl_qnode_init(&tg->qnode_on_self[rw], tg); + throtl_qnode_init(&tg->qnode_on_parent[rw], tg); + } + + RB_CLEAR_NODE(&tg->rb_node); + tg->bps[READ] = -1; + tg->bps[WRITE] = -1; + tg->iops[READ] = -1; + tg->iops[WRITE] = -1; + for_each_possible_cpu(cpu) { struct tg_stats_cpu *stats_cpu = per_cpu_ptr(tg->stats_cpu, cpu); @@ -375,8 +381,7 @@ static void throtl_pd_init(struct blkcg_gq *blkg) { struct throtl_grp *tg = blkg_to_tg(blkg); struct throtl_data *td = blkg->q->td; - struct throtl_service_queue *parent_sq; - int rw; + struct throtl_service_queue *sq = &tg->service_queue; /* * If on the default hierarchy, we switch to properly hierarchical @@ -391,25 +396,10 @@ static void throtl_pd_init(struct blkcg_gq *blkg) * Limits of a group don't interact with limits of other groups * regardless of the position of the group in the hierarchy. 
*/ - parent_sq = &td->service_queue; - + sq->parent_sq = &td->service_queue; if (cgroup_on_dfl(blkg->blkcg->css.cgroup) && blkg->parent) - parent_sq = &blkg_to_tg(blkg->parent)->service_queue; - - throtl_service_queue_init(&tg->service_queue, parent_sq); - - for (rw = READ; rw <= WRITE; rw++) { - throtl_qnode_init(&tg->qnode_on_self[rw], tg); - throtl_qnode_init(&tg->qnode_on_parent[rw], tg); - } - - RB_CLEAR_NODE(&tg->rb_node); + sq->parent_sq = &blkg_to_tg(blkg->parent)->service_queue; tg->td = td; - - tg->bps[READ] = -1; - tg->bps[WRITE] = -1; - tg->iops[READ] = -1; - tg->iops[WRITE] = -1; } /* @@ -436,17 +426,11 @@ static void throtl_pd_online(struct blkcg_gq *blkg) tg_update_has_rules(blkg_to_tg(blkg)); } -static void throtl_pd_exit(struct blkcg_gq *blkg) -{ - struct throtl_grp *tg = blkg_to_tg(blkg); - - throtl_service_queue_exit(&tg->service_queue); -} - static void throtl_pd_free(struct blkg_policy_data *pd) { struct throtl_grp *tg = pd_to_tg(pd); + del_timer_sync(&tg->service_queue.pending_timer); free_percpu(tg->stats_cpu); kfree(tg); } @@ -1421,7 +1405,6 @@ static struct blkcg_policy blkcg_policy_throtl = { .pd_alloc_fn = throtl_pd_alloc, .pd_init_fn = throtl_pd_init, .pd_online_fn = throtl_pd_online, - .pd_exit_fn = throtl_pd_exit, .pd_free_fn = throtl_pd_free, .pd_reset_stats_fn = throtl_pd_reset_stats, }; @@ -1616,7 +1599,7 @@ int blk_throtl_init(struct request_queue *q) return -ENOMEM; INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn); - throtl_service_queue_init(&td->service_queue, NULL); + throtl_service_queue_init(&td->service_queue); q->td = td; td->queue = q; diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 69ce2883099e..4b795c7250ef 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1584,7 +1584,17 @@ static void cfq_cpd_init(const struct blkcg *blkcg) static struct blkg_policy_data *cfq_pd_alloc(gfp_t gfp, int node) { - return kzalloc_node(sizeof(struct cfq_group), gfp, node); + struct cfq_group *cfqg; + + cfqg = kzalloc_node(sizeof(*cfqg), gfp, node); + if (!cfqg) + return NULL; + + cfq_init_cfqg_base(cfqg); + cfqg_stats_init(&cfqg->stats); + cfqg_stats_init(&cfqg->dead_stats); + + return &cfqg->pd; } static void cfq_pd_init(struct blkcg_gq *blkg) @@ -1592,11 +1602,8 @@ static void cfq_pd_init(struct blkcg_gq *blkg) struct cfq_group *cfqg = blkg_to_cfqg(blkg); struct cfq_group_data *cgd = blkcg_to_cfqgd(blkg->blkcg); - cfq_init_cfqg_base(cfqg); cfqg->weight = cgd->weight; cfqg->leaf_weight = cgd->leaf_weight; - cfqg_stats_init(&cfqg->stats); - cfqg_stats_init(&cfqg->dead_stats); } static void cfq_pd_offline(struct blkcg_gq *blkg) diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index bd173ea360ce..9879469b1b38 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -128,7 +128,6 @@ typedef struct blkg_policy_data *(blkcg_pol_alloc_pd_fn)(gfp_t gfp, int node); typedef void (blkcg_pol_init_pd_fn)(struct blkcg_gq *blkg); typedef void (blkcg_pol_online_pd_fn)(struct blkcg_gq *blkg); typedef void (blkcg_pol_offline_pd_fn)(struct blkcg_gq *blkg); -typedef void (blkcg_pol_exit_pd_fn)(struct blkcg_gq *blkg); typedef void (blkcg_pol_free_pd_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkcg_gq *blkg); @@ -145,7 +144,6 @@ struct blkcg_policy { blkcg_pol_init_pd_fn *pd_init_fn; blkcg_pol_online_pd_fn *pd_online_fn; blkcg_pol_offline_pd_fn *pd_offline_fn; - blkcg_pol_exit_pd_fn *pd_exit_fn; blkcg_pol_free_pd_fn *pd_free_fn; blkcg_pol_reset_pd_stats_fn 
*pd_reset_stats_fn; }; -- cgit v1.2.3 From a9520cd6f2ac1fbbf206b915946534c6dddbaae2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:14 -0700 Subject: blkcg: make blkcg_policy methods take a pointer to blkcg_policy_data The newly added ->pd_alloc_fn() and ->pd_free_fn() deal with pd (blkg_policy_data) while the older ones use blkg (blkcg_gq). As using blkg doesn't make sense for ->pd_alloc_fn() and after allocation pd can always be mapped to blkg and given that these are policy-specific methods, it makes sense to converge on pd. This patch makes all methods deal with pd instead of blkg. Most conversions are trivial. In blk-cgroup.c, a couple method invocation sites now test whether pd exists instead of policy state for consistency. This shouldn't cause any behavioral differences. Signed-off-by: Tejun Heo Cc: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 18 ++++++++---------- block/blk-throttle.c | 13 +++++++------ block/cfq-iosched.c | 14 +++++++------- include/linux/blk-cgroup.h | 8 ++++---- 4 files changed, 26 insertions(+), 27 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index acfb09af58a5..8343450cffe2 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -242,7 +242,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct blkcg_policy *pol = blkcg_policy[i]; if (blkg->pd[i] && pol->pd_init_fn) - pol->pd_init_fn(blkg); + pol->pd_init_fn(blkg->pd[i]); } /* insert */ @@ -256,7 +256,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct blkcg_policy *pol = blkcg_policy[i]; if (blkg->pd[i] && pol->pd_online_fn) - pol->pd_online_fn(blkg); + pol->pd_online_fn(blkg->pd[i]); } } blkg->online = true; @@ -347,7 +347,7 @@ static void blkg_destroy(struct blkcg_gq *blkg) struct blkcg_policy *pol = blkcg_policy[i]; if (blkg->pd[i] && pol->pd_offline_fn) - pol->pd_offline_fn(blkg); + pol->pd_offline_fn(blkg->pd[i]); } blkg->online = false; @@ -468,9 +468,8 @@ static int blkcg_reset_stats(struct cgroup_subsys_state *css, for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; - if (blkcg_policy_enabled(blkg->q, pol) && - pol->pd_reset_stats_fn) - pol->pd_reset_stats_fn(blkg); + if (blkg->pd[i] && pol->pd_reset_stats_fn) + pol->pd_reset_stats_fn(blkg->pd[i]); } } @@ -1076,7 +1075,7 @@ pd_prealloc: pd->blkg = blkg; pd->plid = pol->plid; if (pol->pd_init_fn) - pol->pd_init_fn(blkg); + pol->pd_init_fn(pd); } __set_bit(pol->plid, q->blkcg_pols); @@ -1116,10 +1115,9 @@ void blkcg_deactivate_policy(struct request_queue *q, /* grab blkcg lock too while removing @pd from @blkg */ spin_lock(&blkg->blkcg->lock); - if (pol->pd_offline_fn) - pol->pd_offline_fn(blkg); - if (blkg->pd[pol->plid]) { + if (pol->pd_offline_fn) + pol->pd_offline_fn(blkg->pd[pol->plid]); pol->pd_free_fn(blkg->pd[pol->plid]); blkg->pd[pol->plid] = NULL; } diff --git a/block/blk-throttle.c b/block/blk-throttle.c index c3a235b8ec7e..c2c75477a6b2 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -377,9 +377,10 @@ static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node) return &tg->pd; } -static void throtl_pd_init(struct blkcg_gq *blkg) +static void throtl_pd_init(struct blkg_policy_data *pd) { - struct throtl_grp *tg = blkg_to_tg(blkg); + struct throtl_grp *tg = pd_to_tg(pd); + struct blkcg_gq *blkg = tg_to_blkg(tg); struct throtl_data *td = blkg->q->td; struct throtl_service_queue *sq = &tg->service_queue; @@ -417,13 +418,13 @@ static void tg_update_has_rules(struct throtl_grp 
*tg) (tg->bps[rw] != -1 || tg->iops[rw] != -1); } -static void throtl_pd_online(struct blkcg_gq *blkg) +static void throtl_pd_online(struct blkg_policy_data *pd) { /* * We don't want new groups to escape the limits of its ancestors. * Update has_rules[] after a new group is brought online. */ - tg_update_has_rules(blkg_to_tg(blkg)); + tg_update_has_rules(pd_to_tg(pd)); } static void throtl_pd_free(struct blkg_policy_data *pd) @@ -435,9 +436,9 @@ static void throtl_pd_free(struct blkg_policy_data *pd) kfree(tg); } -static void throtl_pd_reset_stats(struct blkcg_gq *blkg) +static void throtl_pd_reset_stats(struct blkg_policy_data *pd) { - struct throtl_grp *tg = blkg_to_tg(blkg); + struct throtl_grp *tg = pd_to_tg(pd); int cpu; for_each_possible_cpu(cpu) { diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 4b795c7250ef..95e6b0cbaa84 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1597,18 +1597,18 @@ static struct blkg_policy_data *cfq_pd_alloc(gfp_t gfp, int node) return &cfqg->pd; } -static void cfq_pd_init(struct blkcg_gq *blkg) +static void cfq_pd_init(struct blkg_policy_data *pd) { - struct cfq_group *cfqg = blkg_to_cfqg(blkg); - struct cfq_group_data *cgd = blkcg_to_cfqgd(blkg->blkcg); + struct cfq_group *cfqg = pd_to_cfqg(pd); + struct cfq_group_data *cgd = blkcg_to_cfqgd(pd->blkg->blkcg); cfqg->weight = cgd->weight; cfqg->leaf_weight = cgd->leaf_weight; } -static void cfq_pd_offline(struct blkcg_gq *blkg) +static void cfq_pd_offline(struct blkg_policy_data *pd) { - struct cfq_group *cfqg = blkg_to_cfqg(blkg); + struct cfq_group *cfqg = pd_to_cfqg(pd); int i; for (i = 0; i < IOPRIO_BE_NR; i++) { @@ -1661,9 +1661,9 @@ static struct blkg_rwstat cfqg_rwstat_pd_recursive_sum(struct blkg_policy_data * return a; } -static void cfq_pd_reset_stats(struct blkcg_gq *blkg) +static void cfq_pd_reset_stats(struct blkg_policy_data *pd) { - struct cfq_group *cfqg = blkg_to_cfqg(blkg); + struct cfq_group *cfqg = pd_to_cfqg(pd); cfqg_stats_reset(&cfqg->stats); cfqg_stats_reset(&cfqg->dead_stats); diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 9879469b1b38..ddd4b8b252c7 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -125,11 +125,11 @@ struct blkcg_gq { typedef void (blkcg_pol_init_cpd_fn)(const struct blkcg *blkcg); typedef struct blkg_policy_data *(blkcg_pol_alloc_pd_fn)(gfp_t gfp, int node); -typedef void (blkcg_pol_init_pd_fn)(struct blkcg_gq *blkg); -typedef void (blkcg_pol_online_pd_fn)(struct blkcg_gq *blkg); -typedef void (blkcg_pol_offline_pd_fn)(struct blkcg_gq *blkg); +typedef void (blkcg_pol_init_pd_fn)(struct blkg_policy_data *pd); +typedef void (blkcg_pol_online_pd_fn)(struct blkg_policy_data *pd); +typedef void (blkcg_pol_offline_pd_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_free_pd_fn)(struct blkg_policy_data *pd); -typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkcg_gq *blkg); +typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkg_policy_data *pd); struct blkcg_policy { int plid; -- cgit v1.2.3 From 814376483e7d85b69a70634633f1f9d01c6ee0cf Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:15 -0700 Subject: blkcg: minor updates around blkcg_policy_data * Rename blkcg->pd[] to blkcg->cpd[] so that cpd is consistently used for blkcg_policy_data. * Make blkcg_policy->cpd_init_fn() take blkcg_policy_data instead of blkcg. This makes it consistent with blkg_policy_data methods and to-be-added cpd alloc/free methods. 
* blkcg_policy_data->blkcg and cpd_to_blkcg() added so that cpd_init_fn() can determine the associated blkcg from blkcg_policy_data. v2: blkcg_policy_data->blkcg initializations were missing. Added. Signed-off-by: Tejun Heo Cc: Vivek Goyal Cc: Arianna Avanzini Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 24 +++++++++++++----------- block/cfq-iosched.c | 11 +++++------ include/linux/blk-cgroup.h | 14 ++++++++++---- 3 files changed, 28 insertions(+), 21 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 8343450cffe2..247c42c8c83b 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -821,7 +821,7 @@ static void blkcg_css_free(struct cgroup_subsys_state *css) mutex_unlock(&blkcg_pol_mutex); for (i = 0; i < BLKCG_MAX_POLS; i++) - kfree(blkcg->pd[i]); + kfree(blkcg->cpd[i]); kfree(blkcg); } @@ -857,15 +857,16 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css) if (!pol || !pol->cpd_size) continue; - BUG_ON(blkcg->pd[i]); + BUG_ON(blkcg->cpd[i]); cpd = kzalloc(pol->cpd_size, GFP_KERNEL); if (!cpd) { ret = ERR_PTR(-ENOMEM); goto free_pd_blkcg; } - blkcg->pd[i] = cpd; + blkcg->cpd[i] = cpd; + cpd->blkcg = blkcg; cpd->plid = i; - pol->cpd_init_fn(blkcg); + pol->cpd_init_fn(cpd); } spin_lock_init(&blkcg->lock); @@ -881,7 +882,7 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css) free_pd_blkcg: for (i--; i >= 0; i--) - kfree(blkcg->pd[i]); + kfree(blkcg->cpd[i]); free_blkcg: kfree(blkcg); mutex_unlock(&blkcg_pol_mutex); @@ -1168,9 +1169,10 @@ int blkcg_policy_register(struct blkcg_policy *pol) goto err_free_cpds; } - blkcg->pd[pol->plid] = cpd; + blkcg->cpd[pol->plid] = cpd; + cpd->blkcg = blkcg; cpd->plid = pol->plid; - pol->cpd_init_fn(blkcg); + pol->cpd_init_fn(cpd); } } @@ -1186,8 +1188,8 @@ int blkcg_policy_register(struct blkcg_policy *pol) err_free_cpds: if (pol->cpd_size) { list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) { - kfree(blkcg->pd[pol->plid]); - blkcg->pd[pol->plid] = NULL; + kfree(blkcg->cpd[pol->plid]); + blkcg->cpd[pol->plid] = NULL; } } blkcg_policy[pol->plid] = NULL; @@ -1222,8 +1224,8 @@ void blkcg_policy_unregister(struct blkcg_policy *pol) if (pol->cpd_size) { list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) { - kfree(blkcg->pd[pol->plid]); - blkcg->pd[pol->plid] = NULL; + kfree(blkcg->cpd[pol->plid]); + blkcg->cpd[pol->plid] = NULL; } } blkcg_policy[pol->plid] = NULL; diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 95e6b0cbaa84..dd6ea9ee6245 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -220,7 +220,7 @@ struct cfqg_stats { /* Per-cgroup data */ struct cfq_group_data { /* must be the first member */ - struct blkcg_policy_data pd; + struct blkcg_policy_data cpd; unsigned int weight; unsigned int leaf_weight; @@ -612,7 +612,7 @@ static inline struct cfq_group *pd_to_cfqg(struct blkg_policy_data *pd) static struct cfq_group_data *cpd_to_cfqgd(struct blkcg_policy_data *cpd) { - return cpd ? container_of(cpd, struct cfq_group_data, pd) : NULL; + return cpd ? 
container_of(cpd, struct cfq_group_data, cpd) : NULL; } static inline struct blkcg_gq *cfqg_to_blkg(struct cfq_group *cfqg) @@ -1568,12 +1568,11 @@ static void cfqg_stats_init(struct cfqg_stats *stats) #endif } -static void cfq_cpd_init(const struct blkcg *blkcg) +static void cfq_cpd_init(struct blkcg_policy_data *cpd) { - struct cfq_group_data *cgd = - cpd_to_cfqgd(blkcg->pd[blkcg_policy_cfq.plid]); + struct cfq_group_data *cgd = cpd_to_cfqgd(cpd); - if (blkcg == &blkcg_root) { + if (cpd_to_blkcg(cpd) == &blkcg_root) { cgd->weight = 2 * CFQ_WEIGHT_DEFAULT; cgd->leaf_weight = 2 * CFQ_WEIGHT_DEFAULT; } else { diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index ddd4b8b252c7..7988d4749fff 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -45,7 +45,7 @@ struct blkcg { struct blkcg_gq *blkg_hint; struct hlist_head blkg_list; - struct blkcg_policy_data *pd[BLKCG_MAX_POLS]; + struct blkcg_policy_data *cpd[BLKCG_MAX_POLS]; struct list_head all_blkcgs_node; #ifdef CONFIG_CGROUP_WRITEBACK @@ -88,7 +88,8 @@ struct blkg_policy_data { * each policy handle per-blkcg data. */ struct blkcg_policy_data { - /* the policy id this per-policy data belongs to */ + /* the blkcg and policy id this per-policy data belongs to */ + struct blkcg *blkcg; int plid; }; @@ -123,7 +124,7 @@ struct blkcg_gq { struct rcu_head rcu_head; }; -typedef void (blkcg_pol_init_cpd_fn)(const struct blkcg *blkcg); +typedef void (blkcg_pol_init_cpd_fn)(struct blkcg_policy_data *cpd); typedef struct blkg_policy_data *(blkcg_pol_alloc_pd_fn)(gfp_t gfp, int node); typedef void (blkcg_pol_init_pd_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_online_pd_fn)(struct blkg_policy_data *pd); @@ -243,7 +244,7 @@ static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, static inline struct blkcg_policy_data *blkcg_to_cpd(struct blkcg *blkcg, struct blkcg_policy *pol) { - return blkcg ? blkcg->pd[pol->plid] : NULL; + return blkcg ? blkcg->cpd[pol->plid] : NULL; } /** @@ -257,6 +258,11 @@ static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) return pd ? pd->blkg : NULL; } +static inline struct blkcg *cpd_to_blkcg(struct blkcg_policy_data *cpd) +{ + return cpd ? cpd->blkcg : NULL; +} + /** * blkg_path - format cgroup path of blkg * @blkg: blkg of interest -- cgit v1.2.3 From e4a9bde9589fdc51283755cdd75d47b27ca7c6fb Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:16 -0700 Subject: blkcg: replace blkcg_policy->cpd_size with ->cpd_alloc/free_fn() methods Each active policy has a cpd (blkcg_policy_data) on each blkcg. The cpd's were allocated by blkcg core and each policy could request to allocate extra space at the end by setting blkcg_policy->cpd_size larger than the size of cpd. This is a bit unusual but blkg (blkcg_gq) policy data used to be handled this way too so it made sense to be consistent; however, blkg policy data switched to alloc/free callbacks. This patch makes similar changes to cpd handling. blkcg_policy->cpd_alloc/free_fn() are added to replace ->cpd_size. As cpd allocation is now done from policy side, it can simply allocate a larger area which embeds cpd at the beginning. As ->cpd_alloc_fn() may be able to perform all necessary initializations, this patch makes ->cpd_init_fn() optional. 
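For illustration, the embedding pattern the new callbacks expect might look like the following minimal sketch ("example_cpd" and its field are hypothetical names, not part of the patch; the real conversion for cfq is in the diff below):

	struct example_cpd {
		struct blkcg_policy_data cpd;	/* embedded at the beginning */
		unsigned int some_setting;	/* policy-private area */
	};

	static struct blkcg_policy_data *example_cpd_alloc(gfp_t gfp)
	{
		struct example_cpd *ecpd = kzalloc(sizeof(*ecpd), gfp);

		return ecpd ? &ecpd->cpd : NULL;	/* core only sees the cpd */
	}

	static void example_cpd_free(struct blkcg_policy_data *cpd)
	{
		kfree(container_of(cpd, struct example_cpd, cpd));
	}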
Signed-off-by: Tejun Heo Cc: Vivek Goyal Cc: Arianna Avanzini Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 39 ++++++++++++++++++++++++--------------- block/cfq-iosched.c | 19 ++++++++++++++++++- include/linux/blk-cgroup.h | 17 ++++++++++------- 3 files changed, 52 insertions(+), 23 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 247c42c8c83b..2b4354b6b5de 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -817,11 +817,15 @@ static void blkcg_css_free(struct cgroup_subsys_state *css) int i; mutex_lock(&blkcg_pol_mutex); + list_del(&blkcg->all_blkcgs_node); - mutex_unlock(&blkcg_pol_mutex); for (i = 0; i < BLKCG_MAX_POLS; i++) - kfree(blkcg->cpd[i]); + if (blkcg->cpd[i]) + blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]); + + mutex_unlock(&blkcg_pol_mutex); + kfree(blkcg); } @@ -854,11 +858,10 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css) * check if the policy requires any specific per-cgroup * data: if it does, allocate and initialize it. */ - if (!pol || !pol->cpd_size) + if (!pol || !pol->cpd_alloc_fn) continue; - BUG_ON(blkcg->cpd[i]); - cpd = kzalloc(pol->cpd_size, GFP_KERNEL); + cpd = pol->cpd_alloc_fn(GFP_KERNEL); if (!cpd) { ret = ERR_PTR(-ENOMEM); goto free_pd_blkcg; @@ -866,7 +869,8 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css) blkcg->cpd[i] = cpd; cpd->blkcg = blkcg; cpd->plid = i; - pol->cpd_init_fn(cpd); + if (pol->cpd_init_fn) + pol->cpd_init_fn(cpd); } spin_lock_init(&blkcg->lock); @@ -882,7 +886,8 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css) free_pd_blkcg: for (i--; i >= 0; i--) - kfree(blkcg->cpd[i]); + if (blkcg->cpd[i]) + blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]); free_blkcg: kfree(blkcg); mutex_unlock(&blkcg_pol_mutex); @@ -1159,11 +1164,11 @@ int blkcg_policy_register(struct blkcg_policy *pol) blkcg_policy[pol->plid] = pol; /* allocate and install cpd's */ - if (pol->cpd_size) { + if (pol->cpd_alloc_fn) { list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) { struct blkcg_policy_data *cpd; - cpd = kzalloc(pol->cpd_size, GFP_KERNEL); + cpd = pol->cpd_alloc_fn(GFP_KERNEL); if (!cpd) { mutex_unlock(&blkcg_pol_mutex); goto err_free_cpds; @@ -1186,10 +1191,12 @@ int blkcg_policy_register(struct blkcg_policy *pol) return 0; err_free_cpds: - if (pol->cpd_size) { + if (pol->cpd_alloc_fn) { list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) { - kfree(blkcg->cpd[pol->plid]); - blkcg->cpd[pol->plid] = NULL; + if (blkcg->cpd[pol->plid]) { + pol->cpd_free_fn(blkcg->cpd[pol->plid]); + blkcg->cpd[pol->plid] = NULL; + } } } blkcg_policy[pol->plid] = NULL; @@ -1222,10 +1229,12 @@ void blkcg_policy_unregister(struct blkcg_policy *pol) /* remove cpds and unregister */ mutex_lock(&blkcg_pol_mutex); - if (pol->cpd_size) { + if (pol->cpd_alloc_fn) { list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) { - kfree(blkcg->cpd[pol->plid]); - blkcg->cpd[pol->plid] = NULL; + if (blkcg->cpd[pol->plid]) { + pol->cpd_free_fn(blkcg->cpd[pol->plid]); + blkcg->cpd[pol->plid] = NULL; + } } } blkcg_policy[pol->plid] = NULL; diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index dd6ea9ee6245..a4429b366820 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1568,6 +1568,16 @@ static void cfqg_stats_init(struct cfqg_stats *stats) #endif } +static struct blkcg_policy_data *cfq_cpd_alloc(gfp_t gfp) +{ + struct cfq_group_data *cgd; + + cgd = kzalloc(sizeof(*cgd), GFP_KERNEL); + if (!cgd) + return NULL; + return &cgd->cpd; +} + static void cfq_cpd_init(struct blkcg_policy_data *cpd) { 
struct cfq_group_data *cgd = cpd_to_cfqgd(cpd); @@ -1581,6 +1591,11 @@ static void cfq_cpd_init(struct blkcg_policy_data *cpd) } } +static void cfq_cpd_free(struct blkcg_policy_data *cpd) +{ + kfree(cpd_to_cfqgd(cpd)); +} + static struct blkg_policy_data *cfq_pd_alloc(gfp_t gfp, int node) { struct cfq_group *cfqg; @@ -4649,10 +4664,12 @@ static struct elevator_type iosched_cfq = { #ifdef CONFIG_CFQ_GROUP_IOSCHED static struct blkcg_policy blkcg_policy_cfq = { - .cpd_size = sizeof(struct cfq_group_data), .cftypes = cfq_blkcg_files, + .cpd_alloc_fn = cfq_cpd_alloc, .cpd_init_fn = cfq_cpd_init, + .cpd_free_fn = cfq_cpd_free, + .pd_alloc_fn = cfq_pd_alloc, .pd_init_fn = cfq_pd_init, .pd_offline_fn = cfq_pd_offline, diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 7988d4749fff..15f2382bc723 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -81,11 +81,11 @@ struct blkg_policy_data { }; /* - * Policies that need to keep per-blkcg data which is independent - * from any request_queue associated to it must specify its size - * with the cpd_size field of the blkcg_policy structure and - * embed a blkcg_policy_data in it. cpd_init() is invoked to let - * each policy handle per-blkcg data. + * Policies that need to keep per-blkcg data which is independent from any + * request_queue associated to it should implement cpd_alloc/free_fn() + * methods. A policy can allocate private data area by allocating larger + * data structure which embeds blkcg_policy_data at the beginning. + * cpd_init() is invoked to let each policy handle per-blkcg data. */ struct blkcg_policy_data { /* the blkcg and policy id this per-policy data belongs to */ @@ -124,7 +124,9 @@ struct blkcg_gq { struct rcu_head rcu_head; }; +typedef struct blkcg_policy_data *(blkcg_pol_alloc_cpd_fn)(gfp_t gfp); typedef void (blkcg_pol_init_cpd_fn)(struct blkcg_policy_data *cpd); +typedef void (blkcg_pol_free_cpd_fn)(struct blkcg_policy_data *cpd); typedef struct blkg_policy_data *(blkcg_pol_alloc_pd_fn)(gfp_t gfp, int node); typedef void (blkcg_pol_init_pd_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_online_pd_fn)(struct blkg_policy_data *pd); @@ -134,13 +136,14 @@ typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkg_policy_data *pd); struct blkcg_policy { int plid; - /* policy specific per-blkcg data size */ - size_t cpd_size; /* cgroup files for the policy */ struct cftype *cftypes; /* operations */ + blkcg_pol_alloc_cpd_fn *cpd_alloc_fn; blkcg_pol_init_cpd_fn *cpd_init_fn; + blkcg_pol_free_cpd_fn *cpd_free_fn; + blkcg_pol_alloc_pd_fn *pd_alloc_fn; blkcg_pol_init_pd_fn *pd_init_fn; blkcg_pol_online_pd_fn *pd_online_fn; -- cgit v1.2.3 From 24f290466f79a6497f1654f64b9a841872cba3ca Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:17 -0700 Subject: blkcg: inline [__]blkg_lookup() blkg_lookup() checks whether the target queue is bypassing and, if not, calls __blkg_lookup() which first checks the lookup hint and then performs a radix tree walk. The operations up to hint checking are trivial and there are many users of this function. This patch inlines blkg_lookup() and the fast path part of __blkg_lookup(). The radix tree lookup and hint update are now in blkg_lookup_slowpath(). This will help consolidate blkg handling by making it easier to move the root blkcg short-circuit into the inlined lookup fast path.
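The calling convention doesn't change; a lookup still runs under the RCU read lock, but the common hint-hit case now costs only the inlined check (a caller sketch, not taken from the patch):

	rcu_read_lock();
	blkg = blkg_lookup(blkcg, q);	/* hint hit avoids blkg_lookup_slowpath() */
	if (blkg)
		pd = blkg_to_pd(blkg, pol);
	rcu_read_unlock();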
Signed-off-by: Tejun Heo Cc: Vivek Goyal Cc: Arianna Avanzini Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 38 ++--------------------------------- include/linux/blk-cgroup.h | 49 ++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 47 insertions(+), 40 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 2b4354b6b5de..52e637ed9d7e 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -131,26 +131,11 @@ err_free: return NULL; } -/** - * __blkg_lookup - internal version of blkg_lookup() - * @blkcg: blkcg of interest - * @q: request_queue of interest - * @update_hint: whether to update lookup hint with the result or not - * - * This is internal version and shouldn't be used by policy - * implementations. Looks up blkgs for the @blkcg - @q pair regardless of - * @q's bypass state. If @update_hint is %true, the caller should be - * holding @q->queue_lock and lookup hint is updated on success. - */ -struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q, - bool update_hint) +struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg, + struct request_queue *q, bool update_hint) { struct blkcg_gq *blkg; - blkg = rcu_dereference(blkcg->blkg_hint); - if (blkg && blkg->q == q) - return blkg; - /* * Hint didn't match. Look up from the radix tree. Note that the * hint can only be updated under queue_lock as otherwise @blkg @@ -169,25 +154,6 @@ struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q, return NULL; } -/** - * blkg_lookup - lookup blkg for the specified blkcg - q pair - * @blkcg: blkcg of interest - * @q: request_queue of interest - * - * Lookup blkg for the @blkcg - @q pair. This function should be called - * under RCU read lock and is guaranteed to return %NULL if @q is bypassing - * - see blk_queue_bypass_start() for details. - */ -struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q) -{ - WARN_ON_ONCE(!rcu_read_lock_held()); - - if (unlikely(blk_queue_bypass(q))) - return NULL; - return __blkg_lookup(blkcg, q, false); -} -EXPORT_SYMBOL_GPL(blkg_lookup); - /* * If @new_blkg is %NULL, this function tries to allocate a new one as * necessary using %GFP_NOWAIT. @new_blkg is always consumed on return. diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 15f2382bc723..d5b54aa50582 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -155,7 +155,8 @@ struct blkcg_policy { extern struct blkcg blkcg_root; extern struct cgroup_subsys_state * const blkcg_root_css; -struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q); +struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg, + struct request_queue *q, bool update_hint); struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, struct request_queue *q); int blkcg_init_queue(struct request_queue *q); @@ -231,6 +232,49 @@ static inline struct blkcg *blkcg_parent(struct blkcg *blkcg) return css_to_blkcg(blkcg->css.parent); } +/** + * __blkg_lookup - internal version of blkg_lookup() + * @blkcg: blkcg of interest + * @q: request_queue of interest + * @update_hint: whether to update lookup hint with the result or not + * + * This is internal version and shouldn't be used by policy + * implementations. Looks up blkgs for the @blkcg - @q pair regardless of + * @q's bypass state. If @update_hint is %true, the caller should be + * holding @q->queue_lock and lookup hint is updated on success. 
+ */ +static inline struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, + struct request_queue *q, + bool update_hint) +{ + struct blkcg_gq *blkg; + + blkg = rcu_dereference(blkcg->blkg_hint); + if (blkg && blkg->q == q) + return blkg; + + return blkg_lookup_slowpath(blkcg, q, update_hint); +} + +/** + * blkg_lookup - lookup blkg for the specified blkcg - q pair + * @blkcg: blkcg of interest + * @q: request_queue of interest + * + * Lookup blkg for the @blkcg - @q pair. This function should be called + * under RCU read lock and is guaranteed to return %NULL if @q is bypassing + * - see blk_queue_bypass_start() for details. + */ +static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, + struct request_queue *q) +{ + WARN_ON_ONCE(!rcu_read_lock_held()); + + if (unlikely(blk_queue_bypass(q))) + return NULL; + return __blkg_lookup(blkcg, q, false); +} + /** * blkg_to_pdata - get policy private data * @blkg: blkg of interest @@ -313,9 +357,6 @@ static inline void blkg_put(struct blkcg_gq *blkg) call_rcu(&blkg->rcu_head, __blkg_release_rcu); } -struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q, - bool update_hint); - /** * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants * @d_blkg: loop cursor pointing to the current descendant -- cgit v1.2.3 From 85b6bc9db6d5ab6980b43c38b5cbd11d24414ce4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:18 -0700 Subject: blkcg: move root blkg lookup optimization from throtl_lookup_tg() to __blkg_lookup() Currently, both throttle and cfq policies implement their own root blkg (blkcg_gq) lookup fast path. This patch moves root blkg optimization from throtl_lookup_tg() to __blkg_lookup(). cfq-iosched currently doesn't use blkg_lookup() but will be converted and drop the optimization too. Signed-off-by: Tejun Heo Cc: Vivek Goyal Cc: Arianna Avanzini Signed-off-by: Jens Axboe --- block/blk-throttle.c | 7 ------- include/linux/blk-cgroup.h | 3 +++ 2 files changed, 3 insertions(+), 7 deletions(-) (limited to 'block') diff --git a/block/blk-throttle.c b/block/blk-throttle.c index c2c75477a6b2..1f63fc834dc3 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -452,13 +452,6 @@ static void throtl_pd_reset_stats(struct blkg_policy_data *pd) static struct throtl_grp *throtl_lookup_tg(struct throtl_data *td, struct blkcg *blkcg) { - /* - * This is the common case when there are no blkcgs. Avoid lookup - * in this case - */ - if (blkcg == &blkcg_root) - return td_root_tg(td); - return blkg_to_tg(blkg_lookup(blkcg, td->queue)); } diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index d5b54aa50582..0609bce69f68 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -249,6 +249,9 @@ static inline struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, { struct blkcg_gq *blkg; + if (blkcg == &blkcg_root) + return q->root_blkg; + blkg = rcu_dereference(blkcg->blkg_hint); if (blkg && blkg->q == q) return blkg; -- cgit v1.2.3 From c9589f03e490956628ff91a1da133216dc796b63 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:19 -0700 Subject: blk-throttle: improve queue bypass handling If a queue is bypassing, all blkcg policies should become noops but blk-throttle wasn't. It only became noop if the queue was dying. While this wouldn't lead to an oops as falling back to the root blkg is safe in this case, this can be a bit surprising - a bypassing queue could still be applying throttle limits. 
Fix it by removing the blk_queue_dying() test in throtl_lookup_create_tg() and instead testing blk_queue_bypass() in blk_throtl_bio(), bypassing before doing anything else. Signed-off-by: Tejun Heo Cc: Vivek Goyal Cc: Arianna Avanzini Signed-off-by: Jens Axboe --- block/blk-throttle.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 1f63fc834dc3..900a777e01c2 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -475,7 +475,7 @@ static struct throtl_grp *throtl_lookup_create_tg(struct throtl_data *td, /* if %NULL and @q is alive, fall back to root_tg */ if (!IS_ERR(blkg)) tg = blkg_to_tg(blkg); - else if (!blk_queue_dying(q)) + else tg = td_root_tg(td); } @@ -1438,10 +1438,11 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio) * IO group */ spin_lock_irq(q->queue_lock); - tg = throtl_lookup_create_tg(td, blkcg); - if (unlikely(!tg)) + + if (unlikely(blk_queue_bypass(q))) goto out_unlock; + tg = throtl_lookup_create_tg(td, blkcg); sq = &tg->service_queue; while (true) { -- cgit v1.2.3 From ae11889636111199dbcf47283b4167f578b69472 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:20 -0700 Subject: blkcg: consolidate blkg creation in blkcg_bio_issue_check() blkg (blkcg_gq) is currently created by blkcg policies invoking blkg_lookup_create(), which ends up repeating roughly the same code in different policies. Theoretically, this lazy per-policy creation can avoid the overhead of looking up and/or creating blkg's if blkcg is enabled but no policy is in use; however, the cost of blkg lookup / creation is very low, especially if only the root blkcg is in use - which is highly likely if no blkcg policy is in active use - as it boils down to a single very predictable conditional and surrounding RCU protection. This patch consolidates blkg creation to a new function blkcg_bio_issue_check() which is called during bio issue from generic_make_request_checks(). blkcg_bio_issue_check() is now the only function which tries to create missing blkg's. The subsequent policy and request_list operations just perform blkg_lookup() and, if the blkg is missing, fall back to the root. * blk_get_rl() no longer tries to create blkg. It uses blkg_lookup() instead of blkg_lookup_create(). * blk_throtl_bio() is now called from blkcg_bio_issue_check() with rcu read locked and blkg already looked up. Both throtl_lookup_tg() and throtl_lookup_create_tg() are dropped. * cfq is similarly updated. cfq_lookup_create_cfqg() is replaced with cfq_lookup_cfqg() which uses blkg_lookup(). This consolidates blkg handling and avoids unnecessary blkg creation retries under memory pressure. In addition, this provides a common bio entry point into blkcg where things like common accounting can be performed. v2: Build fixes for !CONFIG_CFQ_GROUP_IOSCHED and !CONFIG_BLK_DEV_THROTTLING.
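In outline, the issue path after this patch becomes (a sketch of the control flow, not literal code):

	generic_make_request_checks(bio)
	  blkcg_bio_issue_check(q, bio)
	    blkg_lookup(blkcg, q)
	    blkg_lookup_create(blkcg, q)	/* only if missing; the sole creation point */
	    blk_throtl_bio(q, blkg, bio)	/* called with RCU read locked */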
Signed-off-by: Tejun Heo Cc: Vivek Goyal Cc: Arianna Avanzini Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 2 +- block/blk-core.c | 4 +-- block/blk-throttle.c | 72 ++++------------------------------------------ block/blk.h | 5 ---- block/cfq-iosched.c | 33 +++++++-------------- include/linux/blk-cgroup.h | 40 ++++++++++++++++++++++++-- 6 files changed, 57 insertions(+), 99 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 52e637ed9d7e..097c4a670fa4 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -153,6 +153,7 @@ struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg, return NULL; } +EXPORT_SYMBOL_GPL(blkg_lookup_slowpath); /* * If @new_blkg is %NULL, this function tries to allocate a new one as @@ -295,7 +296,6 @@ struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, return blkg; } } -EXPORT_SYMBOL_GPL(blkg_lookup_create); static void blkg_destroy(struct blkcg_gq *blkg) { diff --git a/block/blk-core.c b/block/blk-core.c index 627ed0c593fb..140094f4eaca 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1889,8 +1889,8 @@ generic_make_request_checks(struct bio *bio) */ create_io_context(GFP_ATOMIC, q->node); - if (blk_throtl_bio(q, bio)) - return false; /* throttled, will be resubmitted later */ + if (!blkcg_bio_issue_check(q, bio)) + return false; trace_block_bio_queue(q, bio); return true; diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 900a777e01c2..29c22ed4b073 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -182,11 +182,6 @@ static inline struct blkcg_gq *tg_to_blkg(struct throtl_grp *tg) return pd_to_blkg(&tg->pd); } -static inline struct throtl_grp *td_root_tg(struct throtl_data *td) -{ - return blkg_to_tg(td->queue->root_blkg); -} - /** * sq_to_tg - return the throl_grp the specified service queue belongs to * @sq: the throtl_service_queue of interest @@ -449,39 +444,6 @@ static void throtl_pd_reset_stats(struct blkg_policy_data *pd) } } -static struct throtl_grp *throtl_lookup_tg(struct throtl_data *td, - struct blkcg *blkcg) -{ - return blkg_to_tg(blkg_lookup(blkcg, td->queue)); -} - -static struct throtl_grp *throtl_lookup_create_tg(struct throtl_data *td, - struct blkcg *blkcg) -{ - struct request_queue *q = td->queue; - struct throtl_grp *tg = NULL; - - /* - * This is the common case when there are no blkcgs. 
Avoid lookup - * in this case - */ - if (blkcg == &blkcg_root) { - tg = td_root_tg(td); - } else { - struct blkcg_gq *blkg; - - blkg = blkg_lookup_create(blkcg, q); - - /* if %NULL and @q is alive, fall back to root_tg */ - if (!IS_ERR(blkg)) - tg = blkg_to_tg(blkg); - else - tg = td_root_tg(td); - } - - return tg; -} - static struct throtl_grp * throtl_rb_first(struct throtl_service_queue *parent_sq) { @@ -1403,46 +1365,26 @@ static struct blkcg_policy blkcg_policy_throtl = { .pd_reset_stats_fn = throtl_pd_reset_stats, }; -bool blk_throtl_bio(struct request_queue *q, struct bio *bio) +bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg, + struct bio *bio) { - struct throtl_data *td = q->td; struct throtl_qnode *qn = NULL; - struct throtl_grp *tg; + struct throtl_grp *tg = blkg_to_tg(blkg ?: q->root_blkg); struct throtl_service_queue *sq; bool rw = bio_data_dir(bio); - struct blkcg *blkcg; bool throttled = false; + WARN_ON_ONCE(!rcu_read_lock_held()); + /* see throtl_charge_bio() */ - if (bio->bi_rw & REQ_THROTTLED) + if ((bio->bi_rw & REQ_THROTTLED) || !tg->has_rules[rw]) goto out; - /* - * A throtl_grp pointer retrieved under rcu can be used to access - * basic fields like stats and io rates. If a group has no rules, - * just update the dispatch stats in lockless manner and return. - */ - rcu_read_lock(); - blkcg = bio_blkcg(bio); - tg = throtl_lookup_tg(td, blkcg); - if (tg) { - if (!tg->has_rules[rw]) { - throtl_update_dispatch_stats(tg_to_blkg(tg), - bio->bi_iter.bi_size, bio->bi_rw); - goto out_unlock_rcu; - } - } - - /* - * Either group has not been allocated yet or it is not an unlimited - * IO group - */ spin_lock_irq(q->queue_lock); if (unlikely(blk_queue_bypass(q))) goto out_unlock; - tg = throtl_lookup_create_tg(td, blkcg); sq = &tg->service_queue; while (true) { @@ -1507,8 +1449,6 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio) out_unlock: spin_unlock_irq(q->queue_lock); -out_unlock_rcu: - rcu_read_unlock(); out: /* * As multiple blk-throtls may stack in the same issue path, we diff --git a/block/blk.h b/block/blk.h index 026d9594142b..d905b26fbff5 100644 --- a/block/blk.h +++ b/block/blk.h @@ -266,15 +266,10 @@ static inline struct io_context *create_io_context(gfp_t gfp_mask, int node) * Internal throttling interface */ #ifdef CONFIG_BLK_DEV_THROTTLING -extern bool blk_throtl_bio(struct request_queue *q, struct bio *bio); extern void blk_throtl_drain(struct request_queue *q); extern int blk_throtl_init(struct request_queue *q); extern void blk_throtl_exit(struct request_queue *q); #else /* CONFIG_BLK_DEV_THROTTLING */ -static inline bool blk_throtl_bio(struct request_queue *q, struct bio *bio) -{ - return false; -} static inline void blk_throtl_drain(struct request_queue *q) { } static inline int blk_throtl_init(struct request_queue *q) { return 0; } static inline void blk_throtl_exit(struct request_queue *q) { } diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index a4429b366820..0994f3b523a8 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1683,28 +1683,15 @@ static void cfq_pd_reset_stats(struct blkg_policy_data *pd) cfqg_stats_reset(&cfqg->dead_stats); } -/* - * Search for the cfq group current task belongs to. request_queue lock must - * be held. 
- */ -static struct cfq_group *cfq_lookup_create_cfqg(struct cfq_data *cfqd, - struct blkcg *blkcg) +static struct cfq_group *cfq_lookup_cfqg(struct cfq_data *cfqd, + struct blkcg *blkcg) { - struct request_queue *q = cfqd->queue; - struct cfq_group *cfqg = NULL; - - /* avoid lookup for the common case where there's no blkcg */ - if (blkcg == &blkcg_root) { - cfqg = cfqd->root_group; - } else { - struct blkcg_gq *blkg; - - blkg = blkg_lookup_create(blkcg, q); - if (!IS_ERR(blkg)) - cfqg = blkg_to_cfqg(blkg); - } + struct blkcg_gq *blkg; - return cfqg; + blkg = blkg_lookup(blkcg, cfqd->queue); + if (likely(blkg)) + return blkg_to_cfqg(blkg); + return NULL; } static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) @@ -2108,8 +2095,8 @@ static struct cftype cfq_blkcg_files[] = { { } /* terminate */ }; #else /* GROUP_IOSCHED */ -static struct cfq_group *cfq_lookup_create_cfqg(struct cfq_data *cfqd, - struct blkcg *blkcg) +static struct cfq_group *cfq_lookup_cfqg(struct cfq_data *cfqd, + struct blkcg *blkcg) { return cfqd->root_group; } @@ -3716,7 +3703,7 @@ cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic, struct cfq_group *cfqg; rcu_read_lock(); - cfqg = cfq_lookup_create_cfqg(cfqd, bio_blkcg(bio)); + cfqg = cfq_lookup_cfqg(cfqd, bio_blkcg(bio)); if (!cfqg) { cfqq = &cfqd->oom_cfqq; goto out; diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 0609bce69f68..4d1659c7f84b 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -421,8 +421,8 @@ static inline struct request_list *blk_get_rl(struct request_queue *q, * or if either the blkcg or queue is going away. Fall back to * root_rl in such cases. */ - blkg = blkg_lookup_create(blkcg, q); - if (unlikely(IS_ERR(blkg))) + blkg = blkg_lookup(blkcg, q); + if (unlikely(!blkg)) goto root_rl; blkg_get(blkg); @@ -636,6 +636,39 @@ static inline void blkg_rwstat_merge(struct blkg_rwstat *to, u64_stats_update_end(&to->syncp); } +#ifdef CONFIG_BLK_DEV_THROTTLING +extern bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg, + struct bio *bio); +#else +static inline bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg, + struct bio *bio) { return false; } +#endif + +static inline bool blkcg_bio_issue_check(struct request_queue *q, + struct bio *bio) +{ + struct blkcg *blkcg; + struct blkcg_gq *blkg; + bool throtl = false; + + rcu_read_lock(); + blkcg = bio_blkcg(bio); + + blkg = blkg_lookup(blkcg, q); + if (unlikely(!blkg)) { + spin_lock_irq(q->queue_lock); + blkg = blkg_lookup_create(blkcg, q); + if (IS_ERR(blkg)) + blkg = NULL; + spin_unlock_irq(q->queue_lock); + } + + throtl = blk_throtl_bio(q, blkg, bio); + + rcu_read_unlock(); + return !throtl; +} + #else /* CONFIG_BLK_CGROUP */ struct blkcg { @@ -689,6 +722,9 @@ static inline void blk_put_rl(struct request_list *rl) { } static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl) { } static inline struct request_list *blk_rq_rl(struct request *rq) { return &rq->q->root_rl; } +static inline bool blkcg_bio_issue_check(struct request_queue *q, + struct bio *bio) { return true; } + #define blk_queue_for_each_rl(rl, q) \ for ((rl) = &(q)->root_rl; (rl); (rl) = NULL) -- cgit v1.2.3 From e6269c44546755094979ab53609e6e203a68c8ff Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:21 -0700 Subject: blkcg: add blkg_[rw]stat->aux_cnt and replace cfq_group->dead_stats with it cgroup stats are local to each cgroup and don't propagate to ancestors by default.
When recursive stats are necessary, the sum is calculated over all the descendants. This initially was for backward compatibility to support both group-local and recursive stats but this mode of operation makes general sense as stat update is much hotter than reporting those stats. This however ends up losing recursive stats when a child is removed. To work around this, cfq-iosched adds its stats to its parent cfq_group->dead_stats which is summed up together when calculating recursive stats. It's planned that the core stats will be moved to blkcg_gq, so we want to move the mechanism for keeping track of the stats of dead children from cfq to blkcg core. This patch adds blkg_[rw]stat->aux_cnt which are atomic64_t's keeping track of auxiliary counts which are excluded when reading local counts but included for recursive. blkg_[rw]stat_merge() which were used by cfq to implement dead_stats are replaced by blkg_[rw]stat_add_aux(), and cfq now forwards stats of a dead cgroup to the aux counts of parent->stats instead of separate ->dead_stats. This will also help make blkg_[rw]stats per-cpu. Signed-off-by: Tejun Heo Cc: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 10 ++++--- block/cfq-iosched.c | 67 +++++++++++++--------------------------------- include/linux/blk-cgroup.h | 46 ++++++++++++++++++++++--------- 3 files changed, 57 insertions(+), 66 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 097c4a670fa4..ff79b52d1a0e 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -584,7 +584,7 @@ EXPORT_SYMBOL_GPL(blkg_prfill_rwstat); * @off: offset to the blkg_stat in @pd * * Collect the blkg_stat specified by @off from @pd and all its online - * descendants and return the sum. The caller must be holding the queue + * descendants and their aux counts. The caller must be holding the queue * lock for online tests. */ u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off) @@ -602,7 +602,8 @@ u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off) struct blkg_stat *stat = (void *)pos_pd + off; if (pos_blkg->online) - sum += blkg_stat_read(stat); + sum += blkg_stat_read(stat) + + atomic64_read(&stat->aux_cnt); } rcu_read_unlock(); @@ -616,7 +617,7 @@ EXPORT_SYMBOL_GPL(blkg_stat_recursive_sum); * @off: offset to the blkg_stat in @pd * * Collect the blkg_rwstat specified by @off from @pd and all its online - * descendants and return the sum. The caller must be holding the queue + * descendants and their aux counts. The caller must be holding the queue * lock for online tests.
*/ struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd, @@ -642,7 +643,8 @@ struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd, tmp = blkg_rwstat_read(rwstat); for (i = 0; i < BLKG_RWSTAT_NR; i++) - sum.cnt[i] += tmp.cnt[i]; + sum.cnt[i] += tmp.cnt[i] + + atomic64_read(&rwstat->aux_cnt[i]); } rcu_read_unlock(); diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 0994f3b523a8..b272cfff7364 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -304,7 +304,6 @@ struct cfq_group { int dispatched; struct cfq_ttime ttime; struct cfqg_stats stats; /* stats for this cfqg */ - struct cfqg_stats dead_stats; /* stats pushed from dead children */ /* async queue for each priority case */ struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR]; @@ -736,28 +735,28 @@ static void cfqg_stats_reset(struct cfqg_stats *stats) } /* @to += @from */ -static void cfqg_stats_merge(struct cfqg_stats *to, struct cfqg_stats *from) +static void cfqg_stats_add_aux(struct cfqg_stats *to, struct cfqg_stats *from) { /* queued stats shouldn't be cleared */ - blkg_rwstat_merge(&to->service_bytes, &from->service_bytes); - blkg_rwstat_merge(&to->serviced, &from->serviced); - blkg_rwstat_merge(&to->merged, &from->merged); - blkg_rwstat_merge(&to->service_time, &from->service_time); - blkg_rwstat_merge(&to->wait_time, &from->wait_time); - blkg_stat_merge(&from->time, &from->time); + blkg_rwstat_add_aux(&to->service_bytes, &from->service_bytes); + blkg_rwstat_add_aux(&to->serviced, &from->serviced); + blkg_rwstat_add_aux(&to->merged, &from->merged); + blkg_rwstat_add_aux(&to->service_time, &from->service_time); + blkg_rwstat_add_aux(&to->wait_time, &from->wait_time); + blkg_stat_add_aux(&from->time, &from->time); #ifdef CONFIG_DEBUG_BLK_CGROUP - blkg_stat_merge(&to->unaccounted_time, &from->unaccounted_time); - blkg_stat_merge(&to->avg_queue_size_sum, &from->avg_queue_size_sum); - blkg_stat_merge(&to->avg_queue_size_samples, &from->avg_queue_size_samples); - blkg_stat_merge(&to->dequeue, &from->dequeue); - blkg_stat_merge(&to->group_wait_time, &from->group_wait_time); - blkg_stat_merge(&to->idle_time, &from->idle_time); - blkg_stat_merge(&to->empty_time, &from->empty_time); + blkg_stat_add_aux(&to->unaccounted_time, &from->unaccounted_time); + blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum); + blkg_stat_add_aux(&to->avg_queue_size_samples, &from->avg_queue_size_samples); + blkg_stat_add_aux(&to->dequeue, &from->dequeue); + blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time); + blkg_stat_add_aux(&to->idle_time, &from->idle_time); + blkg_stat_add_aux(&to->empty_time, &from->empty_time); #endif } /* - * Transfer @cfqg's stats to its parent's dead_stats so that the ancestors' + * Transfer @cfqg's stats to its parent's aux counts so that the ancestors' * recursive stats can still account for the amount used by this cfqg after * it's gone. 
*/ @@ -770,10 +769,8 @@ static void cfqg_stats_xfer_dead(struct cfq_group *cfqg) if (unlikely(!parent)) return; - cfqg_stats_merge(&parent->dead_stats, &cfqg->stats); - cfqg_stats_merge(&parent->dead_stats, &cfqg->dead_stats); + cfqg_stats_add_aux(&parent->stats, &cfqg->stats); cfqg_stats_reset(&cfqg->stats); - cfqg_stats_reset(&cfqg->dead_stats); } #else /* CONFIG_CFQ_GROUP_IOSCHED */ @@ -1606,7 +1603,6 @@ static struct blkg_policy_data *cfq_pd_alloc(gfp_t gfp, int node) cfq_init_cfqg_base(cfqg); cfqg_stats_init(&cfqg->stats); - cfqg_stats_init(&cfqg->dead_stats); return &cfqg->pd; } @@ -1649,38 +1645,11 @@ static void cfq_pd_free(struct blkg_policy_data *pd) return kfree(pd); } -/* offset delta from cfqg->stats to cfqg->dead_stats */ -static const int dead_stats_off_delta = offsetof(struct cfq_group, dead_stats) - - offsetof(struct cfq_group, stats); - -/* to be used by recursive prfill, sums live and dead stats recursively */ -static u64 cfqg_stat_pd_recursive_sum(struct blkg_policy_data *pd, int off) -{ - u64 sum = 0; - - sum += blkg_stat_recursive_sum(pd, off); - sum += blkg_stat_recursive_sum(pd, off + dead_stats_off_delta); - return sum; -} - -/* to be used by recursive prfill, sums live and dead rwstats recursively */ -static struct blkg_rwstat cfqg_rwstat_pd_recursive_sum(struct blkg_policy_data *pd, - int off) -{ - struct blkg_rwstat a, b; - - a = blkg_rwstat_recursive_sum(pd, off); - b = blkg_rwstat_recursive_sum(pd, off + dead_stats_off_delta); - blkg_rwstat_merge(&a, &b); - return a; -} - static void cfq_pd_reset_stats(struct blkg_policy_data *pd) { struct cfq_group *cfqg = pd_to_cfqg(pd); cfqg_stats_reset(&cfqg->stats); - cfqg_stats_reset(&cfqg->dead_stats); } static struct cfq_group *cfq_lookup_cfqg(struct cfq_data *cfqd, @@ -1883,7 +1852,7 @@ static int cfqg_print_rwstat(struct seq_file *sf, void *v) static u64 cfqg_prfill_stat_recursive(struct seq_file *sf, struct blkg_policy_data *pd, int off) { - u64 sum = cfqg_stat_pd_recursive_sum(pd, off); + u64 sum = blkg_stat_recursive_sum(pd, off); return __blkg_prfill_u64(sf, pd, sum); } @@ -1891,7 +1860,7 @@ static u64 cfqg_prfill_stat_recursive(struct seq_file *sf, static u64 cfqg_prfill_rwstat_recursive(struct seq_file *sf, struct blkg_policy_data *pd, int off) { - struct blkg_rwstat sum = cfqg_rwstat_pd_recursive_sum(pd, off); + struct blkg_rwstat sum = blkg_rwstat_recursive_sum(pd, off); return __blkg_prfill_rwstat(sf, pd, &sum); } diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 4d1659c7f84b..e8092276af58 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -53,14 +53,20 @@ struct blkcg { #endif }; +/* + * blkg_[rw]stat->aux_cnt is excluded for local stats but included for + * recursive. Used to carry stats of dead children. + */ struct blkg_stat { struct u64_stats_sync syncp; uint64_t cnt; + atomic64_t aux_cnt; }; struct blkg_rwstat { struct u64_stats_sync syncp; uint64_t cnt[BLKG_RWSTAT_NR]; + atomic64_t aux_cnt[BLKG_RWSTAT_NR]; }; /* @@ -483,6 +489,7 @@ struct request_list *__blk_queue_next_rl(struct request_list *rl, static inline void blkg_stat_init(struct blkg_stat *stat) { u64_stats_init(&stat->syncp); + atomic64_set(&stat->aux_cnt, 0); } /** @@ -504,8 +511,9 @@ static inline void blkg_stat_add(struct blkg_stat *stat, uint64_t val) * blkg_stat_read - read the current value of a blkg_stat * @stat: blkg_stat to read * - * Read the current value of @stat. This function can be called without - * synchroniztion and takes care of u64 atomicity. 
+ * Read the current value of @stat. The returned value doesn't include the + * aux count. This function can be called without synchroniztion and takes + * care of u64 atomicity. */ static inline uint64_t blkg_stat_read(struct blkg_stat *stat) { @@ -527,23 +535,31 @@ static inline uint64_t blkg_stat_read(struct blkg_stat *stat) static inline void blkg_stat_reset(struct blkg_stat *stat) { stat->cnt = 0; + atomic64_set(&stat->aux_cnt, 0); } /** - * blkg_stat_merge - merge a blkg_stat into another + * blkg_stat_add_aux - add a blkg_stat into another's aux count * @to: the destination blkg_stat * @from: the source * - * Add @from's count to @to. + * Add @from's count including the aux one to @to's aux count. */ -static inline void blkg_stat_merge(struct blkg_stat *to, struct blkg_stat *from) +static inline void blkg_stat_add_aux(struct blkg_stat *to, + struct blkg_stat *from) { - blkg_stat_add(to, blkg_stat_read(from)); + atomic64_add(blkg_stat_read(from) + atomic64_read(&from->aux_cnt), + &to->aux_cnt); } static inline void blkg_rwstat_init(struct blkg_rwstat *rwstat) { + int i; + u64_stats_init(&rwstat->syncp); + + for (i = 0; i < BLKG_RWSTAT_NR; i++) + atomic64_set(&rwstat->aux_cnt[i], 0); } /** @@ -614,26 +630,30 @@ static inline uint64_t blkg_rwstat_total(struct blkg_rwstat *rwstat) */ static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat) { + int i; + memset(rwstat->cnt, 0, sizeof(rwstat->cnt)); + + for (i = 0; i < BLKG_RWSTAT_NR; i++) + atomic64_set(&rwstat->aux_cnt[i], 0); } /** - * blkg_rwstat_merge - merge a blkg_rwstat into another + * blkg_rwstat_add_aux - add a blkg_rwstat into another's aux count * @to: the destination blkg_rwstat * @from: the source * - * Add @from's counts to @to. + * Add @from's count including the aux one to @to's aux count. */ -static inline void blkg_rwstat_merge(struct blkg_rwstat *to, - struct blkg_rwstat *from) +static inline void blkg_rwstat_add_aux(struct blkg_rwstat *to, + struct blkg_rwstat *from) { struct blkg_rwstat v = blkg_rwstat_read(from); int i; - u64_stats_update_begin(&to->syncp); for (i = 0; i < BLKG_RWSTAT_NR; i++) - to->cnt[i] += v.cnt[i]; - u64_stats_update_end(&to->syncp); + atomic64_add(v.cnt[i] + atomic64_read(&from->aux_cnt[i]), + &to->aux_cnt[i]); } #ifdef CONFIG_BLK_DEV_THROTTLING -- cgit v1.2.3 From 24bdb8ef068ebdc2a57ce715f0ab22d5da32832a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:22 -0700 Subject: blkcg: make blkcg_[rw]stat per-cpu blkcg_[rw]stat are used as stat counters for blkcg policies. They aren't per-cpu by themselves and blk-throttle makes them per-cpu by wrapping around them. This patch makes blkcg_[rw]stat per-cpu and drops the ad-hoc per-cpu wrapping in blk-throttle. * blkg_[rw]stat->cnt is replaced with cpu_cnt which is struct percpu_counter. This makes syncp unnecessary as remote accesses are handled by percpu_counter itself. * blkg_[rw]stat_init() can now fail due to percpu allocation failure and thus are updated to return int. * percpu_counters need explicit freeing. blkg_[rw]stat_exit() added. * As blkg_rwstat->cpu_cnt[] can't be read directly anymore, reading and summing results are stored in ->aux_cnt[] instead. * Custom per-cpu stat implementation in blk-throttle is removed. This makes all blkcg stat counters per-cpu without complicating policy implementations.
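For background, the percpu_counter primitives this conversion relies on behave roughly as follows (a usage sketch; the exact call sites are in the diff below). Allocation can fail and the counter must be destroyed explicitly, which is why blkg_[rw]stat_init() now returns int and blkg_[rw]stat_exit() exists:

	struct percpu_counter cnt;
	s64 sum;

	if (percpu_counter_init(&cnt, 0, GFP_KERNEL))	/* percpu alloc may fail */
		return -ENOMEM;

	/* huge batch keeps updates CPU-local; per-cpu drift doesn't matter here */
	__percpu_counter_add(&cnt, 1, BLKG_STAT_CPU_BATCH);

	sum = percpu_counter_sum_positive(&cnt);	/* fold all per-cpu deltas */

	percpu_counter_destroy(&cnt);			/* explicit freeing required */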
Signed-off-by: Tejun Heo Cc: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 10 ++-- block/blk-throttle.c | 89 +++++++++++---------------------- block/cfq-iosched.c | 70 +++++++++++++++++++------- include/linux/blk-cgroup.h | 120 +++++++++++++++++++++++++-------------------- 4 files changed, 153 insertions(+), 136 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index ff79b52d1a0e..02a2d029b5a5 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -539,9 +539,10 @@ u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, for (i = 0; i < BLKG_RWSTAT_NR; i++) seq_printf(sf, "%s %s %llu\n", dname, rwstr[i], - (unsigned long long)rwstat->cnt[i]); + (unsigned long long)atomic64_read(&rwstat->aux_cnt[i])); - v = rwstat->cnt[BLKG_RWSTAT_READ] + rwstat->cnt[BLKG_RWSTAT_WRITE]; + v = atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_READ]) + + atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_WRITE]); seq_printf(sf, "%s Total %llu\n", dname, (unsigned long long)v); return v; } @@ -643,8 +644,9 @@ struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd, tmp = blkg_rwstat_read(rwstat); for (i = 0; i < BLKG_RWSTAT_NR; i++) - sum.cnt[i] += tmp.cnt[i] + - atomic64_read(&rwstat->aux_cnt[i]); + atomic64_add(atomic64_read(&tmp.aux_cnt[i]) + + atomic64_read(&rwstat->aux_cnt[i]), + &sum.aux_cnt[i]); } rcu_read_unlock(); diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 29c22ed4b073..c0b2263a222a 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -83,14 +83,6 @@ enum tg_state_flags { #define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node) -/* Per-cpu group stats */ -struct tg_stats_cpu { - /* total bytes transferred */ - struct blkg_rwstat service_bytes; - /* total IOs serviced, post merge */ - struct blkg_rwstat serviced; -}; - struct throtl_grp { /* must be the first member */ struct blkg_policy_data pd; @@ -142,8 +134,10 @@ struct throtl_grp { unsigned long slice_start[2]; unsigned long slice_end[2]; - /* Per cpu stats pointer */ - struct tg_stats_cpu __percpu *stats_cpu; + /* total bytes transferred */ + struct blkg_rwstat service_bytes; + /* total IOs serviced, post merge */ + struct blkg_rwstat serviced; }; struct throtl_data @@ -337,17 +331,15 @@ static void throtl_service_queue_init(struct throtl_service_queue *sq) static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node) { struct throtl_grp *tg; - int rw, cpu; + int rw; tg = kzalloc_node(sizeof(*tg), gfp, node); if (!tg) - return NULL; + goto err; - tg->stats_cpu = alloc_percpu_gfp(struct tg_stats_cpu, gfp); - if (!tg->stats_cpu) { - kfree(tg); - return NULL; - } + if (blkg_rwstat_init(&tg->service_bytes, gfp) || + blkg_rwstat_init(&tg->serviced, gfp)) + goto err_free_tg; throtl_service_queue_init(&tg->service_queue); @@ -362,14 +354,14 @@ static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node) tg->iops[READ] = -1; tg->iops[WRITE] = -1; - for_each_possible_cpu(cpu) { - struct tg_stats_cpu *stats_cpu = per_cpu_ptr(tg->stats_cpu, cpu); - - blkg_rwstat_init(&stats_cpu->service_bytes); - blkg_rwstat_init(&stats_cpu->serviced); - } - return &tg->pd; + +err_free_tg: + blkg_rwstat_exit(&tg->serviced); + blkg_rwstat_exit(&tg->service_bytes); + kfree(tg); +err: + return NULL; } static void throtl_pd_init(struct blkg_policy_data *pd) @@ -427,21 +419,17 @@ static void throtl_pd_free(struct blkg_policy_data *pd) struct throtl_grp *tg = pd_to_tg(pd); del_timer_sync(&tg->service_queue.pending_timer); - 
free_percpu(tg->stats_cpu); + blkg_rwstat_exit(&tg->serviced); + blkg_rwstat_exit(&tg->service_bytes); kfree(tg); } static void throtl_pd_reset_stats(struct blkg_policy_data *pd) { struct throtl_grp *tg = pd_to_tg(pd); - int cpu; - for_each_possible_cpu(cpu) { - struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu); - - blkg_rwstat_reset(&sc->service_bytes); - blkg_rwstat_reset(&sc->serviced); - } + blkg_rwstat_reset(&tg->service_bytes); + blkg_rwstat_reset(&tg->serviced); } static struct throtl_grp * @@ -855,7 +843,6 @@ static void throtl_update_dispatch_stats(struct blkcg_gq *blkg, u64 bytes, int rw) { struct throtl_grp *tg = blkg_to_tg(blkg); - struct tg_stats_cpu *stats_cpu; unsigned long flags; /* @@ -865,10 +852,8 @@ static void throtl_update_dispatch_stats(struct blkcg_gq *blkg, u64 bytes, */ local_irq_save(flags); - stats_cpu = this_cpu_ptr(tg->stats_cpu); - - blkg_rwstat_add(&stats_cpu->serviced, rw, 1); - blkg_rwstat_add(&stats_cpu->service_bytes, rw, bytes); + blkg_rwstat_add(&tg->serviced, rw, 1); + blkg_rwstat_add(&tg->service_bytes, rw, bytes); local_irq_restore(flags); } @@ -1176,27 +1161,9 @@ static void blk_throtl_dispatch_work_fn(struct work_struct *work) } } -static u64 tg_prfill_cpu_rwstat(struct seq_file *sf, - struct blkg_policy_data *pd, int off) -{ - struct throtl_grp *tg = pd_to_tg(pd); - struct blkg_rwstat rwstat = { }, tmp; - int i, cpu; - - for_each_possible_cpu(cpu) { - struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu); - - tmp = blkg_rwstat_read((void *)sc + off); - for (i = 0; i < BLKG_RWSTAT_NR; i++) - rwstat.cnt[i] += tmp.cnt[i]; - } - - return __blkg_prfill_rwstat(sf, pd, &rwstat); -} - -static int tg_print_cpu_rwstat(struct seq_file *sf, void *v) +static int tg_print_rwstat(struct seq_file *sf, void *v) { - blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_cpu_rwstat, + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat, &blkcg_policy_throtl, seq_cft(sf)->private, true); return 0; } @@ -1337,13 +1304,13 @@ static struct cftype throtl_files[] = { }, { .name = "throttle.io_service_bytes", - .private = offsetof(struct tg_stats_cpu, service_bytes), - .seq_show = tg_print_cpu_rwstat, + .private = offsetof(struct throtl_grp, service_bytes), + .seq_show = tg_print_rwstat, }, { .name = "throttle.io_serviced", - .private = offsetof(struct tg_stats_cpu, serviced), - .seq_show = tg_print_cpu_rwstat, + .private = offsetof(struct throtl_grp, serviced), + .seq_show = tg_print_rwstat, }, { } /* terminate */ }; diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index b272cfff7364..71e55c91ee98 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1542,27 +1542,55 @@ static void cfq_init_cfqg_base(struct cfq_group *cfqg) } #ifdef CONFIG_CFQ_GROUP_IOSCHED -static void cfqg_stats_init(struct cfqg_stats *stats) +static void cfqg_stats_exit(struct cfqg_stats *stats) { - blkg_rwstat_init(&stats->service_bytes); - blkg_rwstat_init(&stats->serviced); - blkg_rwstat_init(&stats->merged); - blkg_rwstat_init(&stats->service_time); - blkg_rwstat_init(&stats->wait_time); - blkg_rwstat_init(&stats->queued); + blkg_rwstat_exit(&stats->service_bytes); + blkg_rwstat_exit(&stats->serviced); + blkg_rwstat_exit(&stats->merged); + blkg_rwstat_exit(&stats->service_time); + blkg_rwstat_exit(&stats->wait_time); + blkg_rwstat_exit(&stats->queued); - blkg_stat_init(&stats->sectors); - blkg_stat_init(&stats->time); + blkg_stat_exit(&stats->sectors); + blkg_stat_exit(&stats->time); +#ifdef CONFIG_DEBUG_BLK_CGROUP + 
blkg_stat_exit(&stats->unaccounted_time); + blkg_stat_exit(&stats->avg_queue_size_sum); + blkg_stat_exit(&stats->avg_queue_size_samples); + blkg_stat_exit(&stats->dequeue); + blkg_stat_exit(&stats->group_wait_time); + blkg_stat_exit(&stats->idle_time); + blkg_stat_exit(&stats->empty_time); +#endif +} + +static int cfqg_stats_init(struct cfqg_stats *stats, gfp_t gfp) +{ + if (blkg_rwstat_init(&stats->service_bytes, gfp) || + blkg_rwstat_init(&stats->serviced, gfp) || + blkg_rwstat_init(&stats->merged, gfp) || + blkg_rwstat_init(&stats->service_time, gfp) || + blkg_rwstat_init(&stats->wait_time, gfp) || + blkg_rwstat_init(&stats->queued, gfp) || + + blkg_stat_init(&stats->sectors, gfp) || + blkg_stat_init(&stats->time, gfp)) + goto err; #ifdef CONFIG_DEBUG_BLK_CGROUP - blkg_stat_init(&stats->unaccounted_time); - blkg_stat_init(&stats->avg_queue_size_sum); - blkg_stat_init(&stats->avg_queue_size_samples); - blkg_stat_init(&stats->dequeue); - blkg_stat_init(&stats->group_wait_time); - blkg_stat_init(&stats->idle_time); - blkg_stat_init(&stats->empty_time); + if (blkg_stat_init(&stats->unaccounted_time, gfp) || + blkg_stat_init(&stats->avg_queue_size_sum, gfp) || + blkg_stat_init(&stats->avg_queue_size_samples, gfp) || + blkg_stat_init(&stats->dequeue, gfp) || + blkg_stat_init(&stats->group_wait_time, gfp) || + blkg_stat_init(&stats->idle_time, gfp) || + blkg_stat_init(&stats->empty_time, gfp)) + goto err; #endif + return 0; +err: + cfqg_stats_exit(stats); + return -ENOMEM; } static struct blkcg_policy_data *cfq_cpd_alloc(gfp_t gfp) @@ -1602,7 +1630,10 @@ static struct blkg_policy_data *cfq_pd_alloc(gfp_t gfp, int node) return NULL; cfq_init_cfqg_base(cfqg); - cfqg_stats_init(&cfqg->stats); + if (cfqg_stats_init(&cfqg->stats, gfp)) { + kfree(cfqg); + return NULL; + } return &cfqg->pd; } @@ -1642,7 +1673,10 @@ static void cfq_pd_offline(struct blkg_policy_data *pd) static void cfq_pd_free(struct blkg_policy_data *pd) { - return kfree(pd); + struct cfq_group *cfqg = pd_to_cfqg(pd); + + cfqg_stats_exit(&cfqg->stats); + return kfree(cfqg); } static void cfq_pd_reset_stats(struct blkg_policy_data *pd) diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index e8092276af58..fdc7ac08b1ce 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -14,12 +14,15 @@ */ #include -#include +#include #include #include #include #include +/* percpu_counter batch for blkg_[rw]stats, per-cpu drift doesn't matter */ +#define BLKG_STAT_CPU_BATCH (INT_MAX / 2) + /* Max limits for throttle policy */ #define THROTL_IOPS_MAX UINT_MAX @@ -55,17 +58,16 @@ struct blkcg { /* * blkg_[rw]stat->aux_cnt is excluded for local stats but included for - * recursive. Used to carry stats of dead children. + * recursive. Used to carry stats of dead children, and, for blkg_rwstat, + * to carry result values from read and sum operations. 
*/ struct blkg_stat { - struct u64_stats_sync syncp; - uint64_t cnt; + struct percpu_counter cpu_cnt; atomic64_t aux_cnt; }; struct blkg_rwstat { - struct u64_stats_sync syncp; - uint64_t cnt[BLKG_RWSTAT_NR]; + struct percpu_counter cpu_cnt[BLKG_RWSTAT_NR]; atomic64_t aux_cnt[BLKG_RWSTAT_NR]; }; @@ -486,10 +488,21 @@ struct request_list *__blk_queue_next_rl(struct request_list *rl, #define blk_queue_for_each_rl(rl, q) \ for ((rl) = &(q)->root_rl; (rl); (rl) = __blk_queue_next_rl((rl), (q))) -static inline void blkg_stat_init(struct blkg_stat *stat) +static inline int blkg_stat_init(struct blkg_stat *stat, gfp_t gfp) { - u64_stats_init(&stat->syncp); + int ret; + + ret = percpu_counter_init(&stat->cpu_cnt, 0, gfp); + if (ret) + return ret; + atomic64_set(&stat->aux_cnt, 0); + return 0; +} + +static inline void blkg_stat_exit(struct blkg_stat *stat) +{ + percpu_counter_destroy(&stat->cpu_cnt); } /** @@ -497,35 +510,21 @@ static inline void blkg_stat_init(struct blkg_stat *stat) * @stat: target blkg_stat * @val: value to add * - * Add @val to @stat. The caller is responsible for synchronizing calls to - * this function. + * Add @val to @stat. The caller must ensure that IRQ on the same CPU + * don't re-enter this function for the same counter. */ static inline void blkg_stat_add(struct blkg_stat *stat, uint64_t val) { - u64_stats_update_begin(&stat->syncp); - stat->cnt += val; - u64_stats_update_end(&stat->syncp); + __percpu_counter_add(&stat->cpu_cnt, val, BLKG_STAT_CPU_BATCH); } /** * blkg_stat_read - read the current value of a blkg_stat * @stat: blkg_stat to read - * - * Read the current value of @stat. The returned value doesn't include the - * aux count. This function can be called without synchroniztion and takes - * care of u64 atomicity. */ static inline uint64_t blkg_stat_read(struct blkg_stat *stat) { - unsigned int start; - uint64_t v; - - do { - start = u64_stats_fetch_begin_irq(&stat->syncp); - v = stat->cnt; - } while (u64_stats_fetch_retry_irq(&stat->syncp, start)); - - return v; + return percpu_counter_sum_positive(&stat->cpu_cnt); } /** @@ -534,7 +533,7 @@ static inline uint64_t blkg_stat_read(struct blkg_stat *stat) */ static inline void blkg_stat_reset(struct blkg_stat *stat) { - stat->cnt = 0; + percpu_counter_set(&stat->cpu_cnt, 0); atomic64_set(&stat->aux_cnt, 0); } @@ -552,14 +551,28 @@ static inline void blkg_stat_add_aux(struct blkg_stat *to, &to->aux_cnt); } -static inline void blkg_rwstat_init(struct blkg_rwstat *rwstat) +static inline int blkg_rwstat_init(struct blkg_rwstat *rwstat, gfp_t gfp) { - int i; + int i, ret; + + for (i = 0; i < BLKG_RWSTAT_NR; i++) { + ret = percpu_counter_init(&rwstat->cpu_cnt[i], 0, gfp); + if (ret) { + while (--i >= 0) + percpu_counter_destroy(&rwstat->cpu_cnt[i]); + return ret; + } + atomic64_set(&rwstat->aux_cnt[i], 0); + } + return 0; +} - u64_stats_init(&rwstat->syncp); +static inline void blkg_rwstat_exit(struct blkg_rwstat *rwstat) +{ + int i; for (i = 0; i < BLKG_RWSTAT_NR; i++) - atomic64_set(&rwstat->aux_cnt[i], 0); + percpu_counter_destroy(&rwstat->cpu_cnt[i]); } /** @@ -574,39 +587,38 @@ static inline void blkg_rwstat_init(struct blkg_rwstat *rwstat) static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat, int rw, uint64_t val) { - u64_stats_update_begin(&rwstat->syncp); + struct percpu_counter *cnt; if (rw & REQ_WRITE) - rwstat->cnt[BLKG_RWSTAT_WRITE] += val; + cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_WRITE]; else - rwstat->cnt[BLKG_RWSTAT_READ] += val; + cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_READ]; + + 
__percpu_counter_add(cnt, val, BLKG_STAT_CPU_BATCH); + if (rw & REQ_SYNC) - rwstat->cnt[BLKG_RWSTAT_SYNC] += val; + cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_SYNC]; else - rwstat->cnt[BLKG_RWSTAT_ASYNC] += val; + cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_ASYNC]; - u64_stats_update_end(&rwstat->syncp); + __percpu_counter_add(cnt, val, BLKG_STAT_CPU_BATCH); } /** * blkg_rwstat_read - read the current values of a blkg_rwstat * @rwstat: blkg_rwstat to read * - * Read the current snapshot of @rwstat and return it as the return value. - * This function can be called without synchronization and takes care of - * u64 atomicity. + * Read the current snapshot of @rwstat and return it in the aux counts. */ static inline struct blkg_rwstat blkg_rwstat_read(struct blkg_rwstat *rwstat) { - unsigned int start; - struct blkg_rwstat tmp; - - do { - start = u64_stats_fetch_begin_irq(&rwstat->syncp); - tmp = *rwstat; - } while (u64_stats_fetch_retry_irq(&rwstat->syncp, start)); + struct blkg_rwstat result; + int i; - return tmp; + for (i = 0; i < BLKG_RWSTAT_NR; i++) + atomic64_set(&result.aux_cnt[i], + percpu_counter_sum_positive(&rwstat->cpu_cnt[i])); + return result; } /** @@ -621,7 +633,8 @@ static inline uint64_t blkg_rwstat_total(struct blkg_rwstat *rwstat) { struct blkg_rwstat tmp = blkg_rwstat_read(rwstat); - return tmp.cnt[BLKG_RWSTAT_READ] + tmp.cnt[BLKG_RWSTAT_WRITE]; + return atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) + + atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]); } /** @@ -632,10 +645,10 @@ static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat) { int i; - memset(rwstat->cnt, 0, sizeof(rwstat->cnt)); - - for (i = 0; i < BLKG_RWSTAT_NR; i++) + for (i = 0; i < BLKG_RWSTAT_NR; i++) { + percpu_counter_set(&rwstat->cpu_cnt[i], 0); atomic64_set(&rwstat->aux_cnt[i], 0); + } } /** @@ -652,7 +665,8 @@ static inline void blkg_rwstat_add_aux(struct blkg_rwstat *to, int i; for (i = 0; i < BLKG_RWSTAT_NR; i++) - atomic64_add(v.cnt[i] + atomic64_read(&from->aux_cnt[i]), + atomic64_add(atomic64_read(&v.aux_cnt[i]) + + atomic64_read(&from->aux_cnt[i]), &to->aux_cnt[i]); } -- cgit v1.2.3 From f12c74cab1635d67077ce8cc40da88b57980f637 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:23 -0700 Subject: blkcg: make blkg_[rw]stat_recursive_sum() able to index into blkcg_gq Currently, blkg_[rw]stat_recursive_sum() assumes that the target counter is located in pd (blkg_policy_data); however, some counters are planned to be moved to blkg (blkcg_gq). This patch updates blkg_[rw]stat_recursive_sum() to take blkg and blkg_policy pointers instead of pd. If policy is NULL, it indexes into blkg. If non-NULL, into the blkg's pd of the policy. The existing usages are updated to maintain the current behaviors.
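To make the new indexing convention concrete, the resolution step amounts to the following sketch (stat_ptr() is a hypothetical helper invented here for illustration; blkg_to_pd() and the structure names are the kernel's own):

	/*
	 * Illustrative only: resolve @off against either the policy's
	 * per-blkg data or the blkg itself, mirroring the convention
	 * described above.  Works for both blkg_stat and blkg_rwstat,
	 * hence the void pointer.
	 */
	static void *stat_ptr(struct blkcg_gq *blkg, struct blkcg_policy *pol,
			      int off)
	{
		if (pol)	/* counter embedded in the policy's pd */
			return (void *)blkg_to_pd(blkg, pol) + off;
		return (void *)blkg + off;	/* counter embedded in the blkg */
	}

A policy caller keeps passing offsets relative to its policy data, e.g. blkg_stat_recursive_sum(blkg, &blkcg_policy_cfq, offsetof(struct cfq_group, stats.time)), while a counter embedded in the blkg itself (added by a later patch in this series) is reached with a NULL policy, e.g. blkg_rwstat_recursive_sum(blkg, NULL, offsetof(struct blkcg_gq, stat_bytes)).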
Signed-off-by: Tejun Heo Cc: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 69 ++++++++++++++++++++++++++++------------------ block/cfq-iosched.c | 8 +++--- include/linux/blk-cgroup.h | 7 +++-- 3 files changed, 50 insertions(+), 34 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 02a2d029b5a5..b26320720a3c 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -581,30 +581,39 @@ EXPORT_SYMBOL_GPL(blkg_prfill_rwstat); /** * blkg_stat_recursive_sum - collect hierarchical blkg_stat - * @pd: policy private data of interest - * @off: offset to the blkg_stat in @pd + * @blkg: blkg of interest + * @pol: blkcg_policy which contains the blkg_stat + * @off: offset to the blkg_stat in blkg_policy_data or @blkg * - * Collect the blkg_stat specified by @off from @pd and all its online - * descendants and their aux counts. The caller must be holding the queue - * lock for online tests. + * Collect the blkg_stat specified by @blkg, @pol and @off and all its + * online descendants and their aux counts. The caller must be holding the + * queue lock for online tests. + * + * If @pol is NULL, blkg_stat is at @off bytes into @blkg; otherwise, it is + * at @off bytes into @blkg's blkg_policy_data of the policy. */ -u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off) +u64 blkg_stat_recursive_sum(struct blkcg_gq *blkg, + struct blkcg_policy *pol, int off) { - struct blkcg_policy *pol = blkcg_policy[pd->plid]; struct blkcg_gq *pos_blkg; struct cgroup_subsys_state *pos_css; u64 sum = 0; - lockdep_assert_held(pd->blkg->q->queue_lock); + lockdep_assert_held(blkg->q->queue_lock); rcu_read_lock(); - blkg_for_each_descendant_pre(pos_blkg, pos_css, pd_to_blkg(pd)) { - struct blkg_policy_data *pos_pd = blkg_to_pd(pos_blkg, pol); - struct blkg_stat *stat = (void *)pos_pd + off; + blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) { + struct blkg_stat *stat; + + if (!pos_blkg->online) + continue; - if (pos_blkg->online) - sum += blkg_stat_read(stat) + - atomic64_read(&stat->aux_cnt); + if (pol) + stat = (void *)blkg_to_pd(pos_blkg, pol) + off; + else + stat = (void *)blkg + off; + + sum += blkg_stat_read(stat) + atomic64_read(&stat->aux_cnt); } rcu_read_unlock(); @@ -614,33 +623,39 @@ EXPORT_SYMBOL_GPL(blkg_stat_recursive_sum); /** * blkg_rwstat_recursive_sum - collect hierarchical blkg_rwstat - * @pd: policy private data of interest - * @off: offset to the blkg_stat in @pd + * @blkg: blkg of interest + * @pol: blkcg_policy which contains the blkg_rwstat + * @off: offset to the blkg_rwstat in blkg_policy_data or @blkg * - * Collect the blkg_rwstat specified by @off from @pd and all its online - * descendants and their aux counts. The caller must be holding the queue - * lock for online tests. + * Collect the blkg_rwstat specified by @blkg, @pol and @off and all its + * online descendants and their aux counts. The caller must be holding the + * queue lock for online tests. + * + * If @pol is NULL, blkg_rwstat is at @off bytes into @blkg; otherwise, it + * is at @off bytes into @blkg's blkg_policy_data of the policy. 
*/ -struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd, - int off) +struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, + struct blkcg_policy *pol, int off) { - struct blkcg_policy *pol = blkcg_policy[pd->plid]; struct blkcg_gq *pos_blkg; struct cgroup_subsys_state *pos_css; struct blkg_rwstat sum = { }; int i; - lockdep_assert_held(pd->blkg->q->queue_lock); + lockdep_assert_held(blkg->q->queue_lock); rcu_read_lock(); - blkg_for_each_descendant_pre(pos_blkg, pos_css, pd_to_blkg(pd)) { - struct blkg_policy_data *pos_pd = blkg_to_pd(pos_blkg, pol); - struct blkg_rwstat *rwstat = (void *)pos_pd + off; - struct blkg_rwstat tmp; + blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) { + struct blkg_rwstat *rwstat, tmp; if (!pos_blkg->online) continue; + if (pol) + rwstat = (void *)blkg_to_pd(pos_blkg, pol) + off; + else + rwstat = (void *)pos_blkg + off; + tmp = blkg_rwstat_read(rwstat); for (i = 0; i < BLKG_RWSTAT_NR; i++) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 71e55c91ee98..e861cc1ebd62 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1886,16 +1886,16 @@ static int cfqg_print_rwstat(struct seq_file *sf, void *v) static u64 cfqg_prfill_stat_recursive(struct seq_file *sf, struct blkg_policy_data *pd, int off) { - u64 sum = blkg_stat_recursive_sum(pd, off); - + u64 sum = blkg_stat_recursive_sum(pd_to_blkg(pd), + &blkcg_policy_cfq, off); return __blkg_prfill_u64(sf, pd, sum); } static u64 cfqg_prfill_rwstat_recursive(struct seq_file *sf, struct blkg_policy_data *pd, int off) { - struct blkg_rwstat sum = blkg_rwstat_recursive_sum(pd, off); - + struct blkg_rwstat sum = blkg_rwstat_recursive_sum(pd_to_blkg(pd), + &blkcg_policy_cfq, off); return __blkg_prfill_rwstat(sf, pd, &sum); } diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index fdc7ac08b1ce..4630ce8f9425 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -191,9 +191,10 @@ u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off); u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, int off); -u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off); -struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd, - int off); +u64 blkg_stat_recursive_sum(struct blkcg_gq *blkg, + struct blkcg_policy *pol, int off); +struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, + struct blkcg_policy *pol, int off); struct blkg_conf_ctx { struct gendisk *disk; struct blkcg_gq *blkg; -- cgit v1.2.3 From 77ea733884eb5520f22c36def1309fe2ab61633e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:24 -0700 Subject: blkcg: move io_service_bytes and io_serviced stats into blkcg_gq Currently, both cfq-iosched and blk-throttle keep track of io_service_bytes and io_serviced stats. While keeping track of them separately may be useful during development, it doesn't make much sense otherwise. Also, blk-throttle was counting bios as IOs while cfq-iosched was counting requests, which is more confusing than informative. This patch adds ->stat_bytes and ->stat_ios to blkg (blkcg_gq), removes the counterparts from cfq-iosched and blk-throttle and lets them print from the common blkg counters. The common counters are incremented during bio issue in blkcg_bio_issue_check(). The outputs are still filtered by whether the policy has blkg_policy_data on a given blkg, so cfq's output won't show up if it has never been used for a given blkg.
The only times when the outputs would differ significantly are when policies are attached on the fly or elevators are switched back and forth. Those are quite exceptional operations and I don't think they warrant keeping separate counters. v3: Update blkio-controller.txt accordingly. v2: Account IOs during bio issues instead of request completions so that bio-based drivers can be handled the same way. Signed-off-by: Tejun Heo Cc: Vivek Goyal Signed-off-by: Jens Axboe --- Documentation/cgroups/blkio-controller.txt | 24 ++------ block/blk-cgroup.c | 98 ++++++++++++++++++++++++++++++ block/blk-throttle.c | 73 ++-------------------- block/cfq-iosched.c | 32 +++------- include/linux/blk-cgroup.h | 14 +++++ 5 files changed, 133 insertions(+), 108 deletions(-) (limited to 'block') diff --git a/Documentation/cgroups/blkio-controller.txt b/Documentation/cgroups/blkio-controller.txt index 68b6a6a470b0..12686bec37b9 100644 --- a/Documentation/cgroups/blkio-controller.txt +++ b/Documentation/cgroups/blkio-controller.txt @@ -201,7 +201,7 @@ Proportional weight policy files specifies the number of bytes. - blkio.io_serviced - - Number of IOs completed to/from the disk by the group. These + - Number of IOs (bio) issued to the disk by the group. These are further divided by the type of operation - read or write, sync or async. First two fields specify the major and minor number of the device, third field specifies the operation type and the fourth field @@ -327,18 +327,11 @@ Note: If both BW and IOPS rules are specified for a device, then IO is subjected to both the constraints. - blkio.throttle.io_serviced - - Number of IOs (bio) completed to/from the disk by the group (as - seen by throttling policy). These are further divided by the type - of operation - read or write, sync or async. First two fields specify - the major and minor number of the device, third field specifies the - operation type and the fourth field specifies the number of IOs. - - blkio.io_serviced does accounting as seen by CFQ and counts are in - number of requests (struct request). On the other hand, - blkio.throttle.io_serviced counts number of IO in terms of number - of bios as seen by throttling policy. These bios can later be - merged by elevator and total number of requests completed can be - lesser. + - Number of IOs (bio) issued to the disk by the group. These + are further divided by the type of operation - read or write, sync + or async. First two fields specify the major and minor number of the + device, third field specifies the operation type and the fourth field + specifies the number of IOs. - blkio.throttle.io_service_bytes - Number of bytes transferred to/from the disk by the group. These @@ -347,11 +340,6 @@ Note: If both BW and IOPS rules are specified for a device, then IO is device, third field specifies the operation type and the fourth field specifies the number of bytes. - These numbers should roughly be same as blkio.io_service_bytes as - updated by CFQ. The difference between two is that - blkio.io_service_bytes will not be updated if CFQ is not operating - on request queue. 
- Common files among various policies ----------------------------------- - blkio.reset_stats diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index b26320720a3c..a25263ca39ca 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -73,6 +73,9 @@ static void blkg_free(struct blkcg_gq *blkg) if (blkg->blkcg != &blkcg_root) blk_exit_rl(&blkg->rl); + + blkg_rwstat_exit(&blkg->stat_ios); + blkg_rwstat_exit(&blkg->stat_bytes); kfree(blkg); } @@ -95,6 +98,10 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q, if (!blkg) return NULL; + if (blkg_rwstat_init(&blkg->stat_bytes, gfp_mask) || + blkg_rwstat_init(&blkg->stat_ios, gfp_mask)) + goto err_free; + blkg->q = q; INIT_LIST_HEAD(&blkg->q_node); blkg->blkcg = blkcg; @@ -300,6 +307,7 @@ struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, static void blkg_destroy(struct blkcg_gq *blkg) { struct blkcg *blkcg = blkg->blkcg; + struct blkcg_gq *parent = blkg->parent; int i; lockdep_assert_held(blkg->q->queue_lock); @@ -315,6 +323,12 @@ static void blkg_destroy(struct blkcg_gq *blkg) if (blkg->pd[i] && pol->pd_offline_fn) pol->pd_offline_fn(blkg->pd[i]); } + + if (parent) { + blkg_rwstat_add_aux(&parent->stat_bytes, &blkg->stat_bytes); + blkg_rwstat_add_aux(&parent->stat_ios, &blkg->stat_ios); + } + blkg->online = false; radix_tree_delete(&blkcg->blkg_tree, blkg->q->id); @@ -431,6 +445,9 @@ static int blkcg_reset_stats(struct cgroup_subsys_state *css, * anyway. If you get hit by a race, retry. */ hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) { + blkg_rwstat_reset(&blkg->stat_bytes); + blkg_rwstat_reset(&blkg->stat_ios); + for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; @@ -579,6 +596,87 @@ u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, } EXPORT_SYMBOL_GPL(blkg_prfill_rwstat); +static u64 blkg_prfill_rwstat_field(struct seq_file *sf, + struct blkg_policy_data *pd, int off) +{ + struct blkg_rwstat rwstat = blkg_rwstat_read((void *)pd->blkg + off); + + return __blkg_prfill_rwstat(sf, pd, &rwstat); +} + +/** + * blkg_print_stat_bytes - seq_show callback for blkg->stat_bytes + * @sf: seq_file to print to + * @v: unused + * + * To be used as cftype->seq_show to print blkg->stat_bytes. + * cftype->private must be set to the blkcg_policy. + */ +int blkg_print_stat_bytes(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + blkg_prfill_rwstat_field, (void *)seq_cft(sf)->private, + offsetof(struct blkcg_gq, stat_bytes), true); + return 0; +} +EXPORT_SYMBOL_GPL(blkg_print_stat_bytes); + +/** + * blkg_print_stat_ios - seq_show callback for blkg->stat_ios + * @sf: seq_file to print to + * @v: unused + * + * To be used as cftype->seq_show to print blkg->stat_ios. cftype->private + * must be set to the blkcg_policy.
+ */ +int blkg_print_stat_ios(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + blkg_prfill_rwstat_field, (void *)seq_cft(sf)->private, + offsetof(struct blkcg_gq, stat_ios), true); + return 0; +} +EXPORT_SYMBOL_GPL(blkg_print_stat_ios); + +static u64 blkg_prfill_rwstat_field_recursive(struct seq_file *sf, + struct blkg_policy_data *pd, + int off) +{ + struct blkg_rwstat rwstat = blkg_rwstat_recursive_sum(pd->blkg, + NULL, off); + return __blkg_prfill_rwstat(sf, pd, &rwstat); +} + +/** + * blkg_print_stat_bytes_recursive - recursive version of blkg_print_stat_bytes + * @sf: seq_file to print to + * @v: unused + */ +int blkg_print_stat_bytes_recursive(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + blkg_prfill_rwstat_field_recursive, + (void *)seq_cft(sf)->private, + offsetof(struct blkcg_gq, stat_bytes), true); + return 0; +} +EXPORT_SYMBOL_GPL(blkg_print_stat_bytes_recursive); + +/** + * blkg_print_stat_ios_recursive - recursive version of blkg_print_stat_ios + * @sf: seq_file to print to + * @v: unused + */ +int blkg_print_stat_ios_recursive(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + blkg_prfill_rwstat_field_recursive, + (void *)seq_cft(sf)->private, + offsetof(struct blkcg_gq, stat_ios), true); + return 0; +} +EXPORT_SYMBOL_GPL(blkg_print_stat_ios_recursive); + /** * blkg_stat_recursive_sum - collect hierarchical blkg_stat * @blkg: blkg of interest diff --git a/block/blk-throttle.c b/block/blk-throttle.c index c0b2263a222a..bd3e4b2c7a9d 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -133,11 +133,6 @@ struct throtl_grp { /* When did we start a new slice */ unsigned long slice_start[2]; unsigned long slice_end[2]; - - /* total bytes transferred */ - struct blkg_rwstat service_bytes; - /* total IOs serviced, post merge */ - struct blkg_rwstat serviced; }; struct throtl_data @@ -335,11 +330,7 @@ static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node) tg = kzalloc_node(sizeof(*tg), gfp, node); if (!tg) - goto err; - - if (blkg_rwstat_init(&tg->service_bytes, gfp) || - blkg_rwstat_init(&tg->serviced, gfp)) - goto err_free_tg; + return NULL; throtl_service_queue_init(&tg->service_queue); @@ -355,13 +346,6 @@ static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node) tg->iops[WRITE] = -1; return &tg->pd; - -err_free_tg: - blkg_rwstat_exit(&tg->serviced); - blkg_rwstat_exit(&tg->service_bytes); - kfree(tg); -err: - return NULL; } static void throtl_pd_init(struct blkg_policy_data *pd) @@ -419,19 +403,9 @@ static void throtl_pd_free(struct blkg_policy_data *pd) struct throtl_grp *tg = pd_to_tg(pd); del_timer_sync(&tg->service_queue.pending_timer); - blkg_rwstat_exit(&tg->serviced); - blkg_rwstat_exit(&tg->service_bytes); kfree(tg); } -static void throtl_pd_reset_stats(struct blkg_policy_data *pd) -{ - struct throtl_grp *tg = pd_to_tg(pd); - - blkg_rwstat_reset(&tg->service_bytes); - blkg_rwstat_reset(&tg->serviced); -} - static struct throtl_grp * throtl_rb_first(struct throtl_service_queue *parent_sq) { @@ -839,25 +813,6 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio, return 0; } -static void throtl_update_dispatch_stats(struct blkcg_gq *blkg, u64 bytes, - int rw) -{ - struct throtl_grp *tg = blkg_to_tg(blkg); - unsigned long flags; - - /* - * Disabling interrupts to provide mutual exclusion between two - * writes on same cpu. It probably is not needed for 64bit. Not - * optimizing that case yet. 
- */ - local_irq_save(flags); - - blkg_rwstat_add(&tg->serviced, rw, 1); - blkg_rwstat_add(&tg->service_bytes, rw, bytes); - - local_irq_restore(flags); -} - static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) { bool rw = bio_data_dir(bio); @@ -871,17 +826,9 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) * more than once as a throttled bio will go through blk-throtl the * second time when it eventually gets issued. Set it when a bio * is being charged to a tg. - * - * Dispatch stats aren't recursive and each @bio should only be - * accounted by the @tg it was originally associated with. Let's - * update the stats when setting REQ_THROTTLED for the first time - * which is guaranteed to be for the @bio's original tg. */ - if (!(bio->bi_rw & REQ_THROTTLED)) { + if (!(bio->bi_rw & REQ_THROTTLED)) bio->bi_rw |= REQ_THROTTLED; - throtl_update_dispatch_stats(tg_to_blkg(tg), - bio->bi_iter.bi_size, bio->bi_rw); - } } /** @@ -1161,13 +1108,6 @@ static void blk_throtl_dispatch_work_fn(struct work_struct *work) } } -static int tg_print_rwstat(struct seq_file *sf, void *v) -{ - blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat, - &blkcg_policy_throtl, seq_cft(sf)->private, true); - return 0; -} - static u64 tg_prfill_conf_u64(struct seq_file *sf, struct blkg_policy_data *pd, int off) { @@ -1304,13 +1244,13 @@ static struct cftype throtl_files[] = { }, { .name = "throttle.io_service_bytes", - .private = offsetof(struct throtl_grp, service_bytes), - .seq_show = tg_print_rwstat, + .private = (unsigned long)&blkcg_policy_throtl, + .seq_show = blkg_print_stat_bytes, }, { .name = "throttle.io_serviced", - .private = offsetof(struct throtl_grp, serviced), - .seq_show = tg_print_rwstat, + .private = (unsigned long)&blkcg_policy_throtl, + .seq_show = blkg_print_stat_ios, }, { } /* terminate */ }; @@ -1329,7 +1269,6 @@ static struct blkcg_policy blkcg_policy_throtl = { .pd_init_fn = throtl_pd_init, .pd_online_fn = throtl_pd_online, .pd_free_fn = throtl_pd_free, - .pd_reset_stats_fn = throtl_pd_reset_stats, }; bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg, diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index e861cc1ebd62..a948d4df3fc3 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -177,10 +177,6 @@ enum wl_type_t { struct cfqg_stats { #ifdef CONFIG_CFQ_GROUP_IOSCHED - /* total bytes transferred */ - struct blkg_rwstat service_bytes; - /* total IOs serviced, post merge */ - struct blkg_rwstat serviced; /* number of ios merged */ struct blkg_rwstat merged; /* total time spent on device in ns, may not be accurate w/ queueing */ @@ -696,8 +692,6 @@ static inline void cfqg_stats_update_dispatch(struct cfq_group *cfqg, uint64_t bytes, int rw) { blkg_stat_add(&cfqg->stats.sectors, bytes >> 9); - blkg_rwstat_add(&cfqg->stats.serviced, rw, 1); - blkg_rwstat_add(&cfqg->stats.service_bytes, rw, bytes); } static inline void cfqg_stats_update_completion(struct cfq_group *cfqg, @@ -717,8 +711,6 @@ static inline void cfqg_stats_update_completion(struct cfq_group *cfqg, static void cfqg_stats_reset(struct cfqg_stats *stats) { /* queued stats shouldn't be cleared */ - blkg_rwstat_reset(&stats->service_bytes); - blkg_rwstat_reset(&stats->serviced); blkg_rwstat_reset(&stats->merged); blkg_rwstat_reset(&stats->service_time); blkg_rwstat_reset(&stats->wait_time); @@ -738,8 +730,6 @@ static void cfqg_stats_reset(struct cfqg_stats *stats) static void cfqg_stats_add_aux(struct cfqg_stats *to, struct cfqg_stats *from) { /* queued 
stats shouldn't be cleared */ - blkg_rwstat_add_aux(&to->service_bytes, &from->service_bytes); - blkg_rwstat_add_aux(&to->serviced, &from->serviced); blkg_rwstat_add_aux(&to->merged, &from->merged); blkg_rwstat_add_aux(&to->service_time, &from->service_time); blkg_rwstat_add_aux(&to->wait_time, &from->wait_time); @@ -1544,8 +1534,6 @@ static void cfq_init_cfqg_base(struct cfq_group *cfqg) #ifdef CONFIG_CFQ_GROUP_IOSCHED static void cfqg_stats_exit(struct cfqg_stats *stats) { - blkg_rwstat_exit(&stats->service_bytes); - blkg_rwstat_exit(&stats->serviced); blkg_rwstat_exit(&stats->merged); blkg_rwstat_exit(&stats->service_time); blkg_rwstat_exit(&stats->wait_time); @@ -1566,9 +1554,7 @@ static void cfqg_stats_exit(struct cfqg_stats *stats) static int cfqg_stats_init(struct cfqg_stats *stats, gfp_t gfp) { - if (blkg_rwstat_init(&stats->service_bytes, gfp) || - blkg_rwstat_init(&stats->serviced, gfp) || - blkg_rwstat_init(&stats->merged, gfp) || + if (blkg_rwstat_init(&stats->merged, gfp) || blkg_rwstat_init(&stats->service_time, gfp) || blkg_rwstat_init(&stats->wait_time, gfp) || blkg_rwstat_init(&stats->queued, gfp) || @@ -1994,13 +1980,13 @@ static struct cftype cfq_blkcg_files[] = { }, { .name = "io_service_bytes", - .private = offsetof(struct cfq_group, stats.service_bytes), - .seq_show = cfqg_print_rwstat, + .private = (unsigned long)&blkcg_policy_cfq, + .seq_show = blkg_print_stat_bytes, }, { .name = "io_serviced", - .private = offsetof(struct cfq_group, stats.serviced), - .seq_show = cfqg_print_rwstat, + .private = (unsigned long)&blkcg_policy_cfq, + .seq_show = blkg_print_stat_ios, }, { .name = "io_service_time", @@ -2036,13 +2022,13 @@ static struct cftype cfq_blkcg_files[] = { }, { .name = "io_service_bytes_recursive", - .private = offsetof(struct cfq_group, stats.service_bytes), - .seq_show = cfqg_print_rwstat_recursive, + .private = (unsigned long)&blkcg_policy_cfq, + .seq_show = blkg_print_stat_bytes_recursive, }, { .name = "io_serviced_recursive", - .private = offsetof(struct cfq_group, stats.serviced), - .seq_show = cfqg_print_rwstat_recursive, + .private = (unsigned long)&blkcg_policy_cfq, + .seq_show = blkg_print_stat_ios_recursive, }, { .name = "io_service_time_recursive", diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 4630ce8f9425..286e1bde249f 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -127,6 +127,9 @@ struct blkcg_gq { /* is this blkg online? 
protected by both blkcg and q locks */ bool online; + struct blkg_rwstat stat_bytes; + struct blkg_rwstat stat_ios; + struct blkg_policy_data *pd[BLKCG_MAX_POLS]; struct rcu_head rcu_head; @@ -190,6 +193,10 @@ u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off); u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, int off); +int blkg_print_stat_bytes(struct seq_file *sf, void *v); +int blkg_print_stat_ios(struct seq_file *sf, void *v); +int blkg_print_stat_bytes_recursive(struct seq_file *sf, void *v); +int blkg_print_stat_ios_recursive(struct seq_file *sf, void *v); u64 blkg_stat_recursive_sum(struct blkcg_gq *blkg, struct blkcg_policy *pol, int off); @@ -700,6 +707,13 @@ static inline bool blkcg_bio_issue_check(struct request_queue *q, throtl = blk_throtl_bio(q, blkg, bio); + if (!throtl) { + blkg = blkg ?: q->root_blkg; + blkg_rwstat_add(&blkg->stat_bytes, bio->bi_flags, + bio->bi_iter.bi_size); + blkg_rwstat_add(&blkg->stat_ios, bio->bi_flags, 1); + } + rcu_read_unlock(); return !throtl; } -- cgit v1.2.3 From 702747cabe737fe9b358739443d539f10cc7c715 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:25 -0700 Subject: blkcg: remove cfqg_stats->sectors cfqg_stats->sectors is a blkg_stat which keeps track of the total number of sectors serviced; however, this can be trivially calculated from blkcg_gq->stat_bytes. The only thing necessary is adding up READs and WRITEs and then dividing by sector size. Remove cfqg_stats->sectors and make cfq print "sectors" and "sectors_recursive" from stat_bytes. While this is a bit more code, it removes duplicate stat allocations and updates and ensures that the reported stats stay in tune with each other.
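The replacement calculation is small enough to show as a sketch (blkg_sectors() is a hypothetical name used only for illustration; blkg_rwstat_total() and blkcg_gq->stat_bytes are the helper and field introduced by the preceding patches):

	/*
	 * Illustrative only: total sectors serviced are just the READ +
	 * WRITE byte counts of the common per-blkg counter scaled down
	 * by the 512-byte sector size.
	 */
	static u64 blkg_sectors(struct blkcg_gq *blkg)
	{
		return blkg_rwstat_total(&blkg->stat_bytes) >> 9;
	}

This is essentially what cfqg_prfill_sectors() in the diff below computes before handing the value to __blkg_prfill_u64().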
Signed-off-by: Tejun Heo Cc: Vivek Goyal Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 55 +++++++++++++++++++++++++++++++++++------------------ 1 file changed, 36 insertions(+), 19 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index a948d4df3fc3..395476ab14fe 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -185,8 +185,6 @@ struct cfqg_stats { struct blkg_rwstat wait_time; /* number of IOs queued up */ struct blkg_rwstat queued; - /* total sectors transferred */ - struct blkg_stat sectors; /* total disk time and nr sectors dispatched by this group */ struct blkg_stat time; #ifdef CONFIG_DEBUG_BLK_CGROUP @@ -688,12 +686,6 @@ static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, int rw) blkg_rwstat_add(&cfqg->stats.merged, rw, 1); } -static inline void cfqg_stats_update_dispatch(struct cfq_group *cfqg, - uint64_t bytes, int rw) -{ - blkg_stat_add(&cfqg->stats.sectors, bytes >> 9); -} - static inline void cfqg_stats_update_completion(struct cfq_group *cfqg, uint64_t start_time, uint64_t io_start_time, int rw) { @@ -782,8 +774,6 @@ static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg, unsigned long time, unsigned long unaccounted_time) { } static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg, int rw) { } static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, int rw) { } -static inline void cfqg_stats_update_dispatch(struct cfq_group *cfqg, - uint64_t bytes, int rw) { } static inline void cfqg_stats_update_completion(struct cfq_group *cfqg, uint64_t start_time, uint64_t io_start_time, int rw) { } @@ -1538,8 +1528,6 @@ static void cfqg_stats_exit(struct cfqg_stats *stats) blkg_rwstat_exit(&stats->service_time); blkg_rwstat_exit(&stats->wait_time); blkg_rwstat_exit(&stats->queued); - - blkg_stat_exit(&stats->sectors); blkg_stat_exit(&stats->time); #ifdef CONFIG_DEBUG_BLK_CGROUP blkg_stat_exit(&stats->unaccounted_time); @@ -1558,8 +1546,6 @@ static int cfqg_stats_init(struct cfqg_stats *stats, gfp_t gfp) blkg_rwstat_init(&stats->service_time, gfp) || blkg_rwstat_init(&stats->wait_time, gfp) || blkg_rwstat_init(&stats->queued, gfp) || - - blkg_stat_init(&stats->sectors, gfp) || blkg_stat_init(&stats->time, gfp)) goto err; @@ -1901,6 +1887,40 @@ static int cfqg_print_rwstat_recursive(struct seq_file *sf, void *v) return 0; } +static u64 cfqg_prfill_sectors(struct seq_file *sf, struct blkg_policy_data *pd, + int off) +{ + u64 sum = blkg_rwstat_total(&pd->blkg->stat_bytes); + + return __blkg_prfill_u64(sf, pd, sum >> 9); +} + +static int cfqg_print_stat_sectors(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + cfqg_prfill_sectors, &blkcg_policy_cfq, 0, false); + return 0; +} + +static u64 cfqg_prfill_sectors_recursive(struct seq_file *sf, + struct blkg_policy_data *pd, int off) +{ + struct blkg_rwstat tmp = blkg_rwstat_recursive_sum(pd->blkg, NULL, + offsetof(struct blkcg_gq, stat_bytes)); + u64 sum = atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) + + atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]); + + return __blkg_prfill_u64(sf, pd, sum >> 9); +} + +static int cfqg_print_stat_sectors_recursive(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + cfqg_prfill_sectors_recursive, &blkcg_policy_cfq, 0, + false); + return 0; +} + #ifdef CONFIG_DEBUG_BLK_CGROUP static u64 cfqg_prfill_avg_queue_size(struct seq_file *sf, struct blkg_policy_data *pd, int off) @@ -1975,8 +1995,7 @@ static struct cftype 
cfq_blkcg_files[] = { }, { .name = "sectors", - .private = offsetof(struct cfq_group, stats.sectors), - .seq_show = cfqg_print_stat, + .seq_show = cfqg_print_stat_sectors, }, { .name = "io_service_bytes", @@ -2017,8 +2036,7 @@ static struct cftype cfq_blkcg_files[] = { }, { .name = "sectors_recursive", - .private = offsetof(struct cfq_group, stats.sectors), - .seq_show = cfqg_print_stat_recursive, + .seq_show = cfqg_print_stat_sectors_recursive, }, { .name = "io_service_bytes_recursive", @@ -2888,7 +2906,6 @@ static void cfq_dispatch_insert(struct request_queue *q, struct request *rq) cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++; cfqq->nr_sectors += blk_rq_sectors(rq); - cfqg_stats_update_dispatch(cfqq->cfqg, blk_rq_bytes(rq), rq->cmd_flags); } /* -- cgit v1.2.3 From 3a7faeada20d72f07d3a7b13454859025cd50a36 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:26 -0700 Subject: blkcg: reduce stack usage of blkg_rwstat_recursive_sum() The recent percpu conversion of blkg_rwstat triggered the following warning in certain configurations. block/blk-cgroup.c:654:1: warning: the frame size of 1360 bytes is larger than 1024 bytes This is because blkg_rwstat now contains four percpu_counter which can be pretty big depending on debug options although it shouldn't be a problem in production configs. This patch removes one of the two local blkg_rwstat variables used by blkg_rwstat_recursive_sum() to reduce stack usage. Signed-off-by: Tejun Heo Cc: Vivek Goyal Reported-by: kbuild test robot Link: http://article.gmane.org/gmane.linux.kernel.cgroups/13835 Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index a25263ca39ca..c82c5dbf9187 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -744,7 +744,7 @@ struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, rcu_read_lock(); blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) { - struct blkg_rwstat *rwstat, tmp; + struct blkg_rwstat *rwstat; if (!pos_blkg->online) continue; @@ -754,12 +754,10 @@ struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, else rwstat = (void *)pos_blkg + off; - tmp = blkg_rwstat_read(rwstat); - for (i = 0; i < BLKG_RWSTAT_NR; i++) - atomic64_add(atomic64_read(&tmp.aux_cnt[i]) + - atomic64_read(&rwstat->aux_cnt[i]), - &sum.aux_cnt[i]); + atomic64_add(atomic64_read(&rwstat->aux_cnt[i]) + + percpu_counter_sum_positive(&rwstat->cpu_cnt[i]), + &sum.aux_cnt[i]); } rcu_read_unlock(); -- cgit v1.2.3 From 5332dfc36483d2373d980526145789a354af2d49 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:27 -0700 Subject: blkcg: remove unnecessary NULL checks from __cfqg_set_weight_device() blkg_to_cfqg() and blkcg_to_cfqgd() on a valid blkg with the policy enabled are guaranteed to return non-NULL and the counterpart in blk-throttle doesn't have these checks either. Remove the spurious NULL checks. 
Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 395476ab14fe..bcf40266bd0a 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1752,12 +1752,10 @@ static ssize_t __cfqg_set_weight_device(struct kernfs_open_file *of, if (ret) return ret; - ret = -EINVAL; cfqg = blkg_to_cfqg(ctx.blkg); cfqgd = blkcg_to_cfqgd(blkcg); - if (!cfqg || !cfqgd) - goto err; + ret = -EINVAL; if (!ctx.v || (ctx.v >= CFQ_WEIGHT_MIN && ctx.v <= CFQ_WEIGHT_MAX)) { if (!is_leaf_weight) { cfqg->dev_weight = ctx.v; @@ -1769,7 +1767,6 @@ static ssize_t __cfqg_set_weight_device(struct kernfs_open_file *of, ret = 0; } -err: blkg_conf_finish(&ctx); return ret ?: nbytes; } -- cgit v1.2.3 From 20386ce0143899ccb5bcbda714436a82d3029f33 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:28 -0700 Subject: blkcg: refine error codes returned during blkcg configuration blkcg currently returns -EINVAL for most errors which can be pretty confusing given that the failure modes are quite varied. Update the error returns so that * -EINVAL only for syntactic errors. * -ERANGE if the value is out of range. * -ENODEV if the target device can't be found. * -EOPNOTSUPP if the policy is not enabled on the target device. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 12 ++++++------ block/cfq-iosched.c | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index c82c5dbf9187..0af3bff198ed 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -179,7 +179,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, /* blkg holds a reference to blkcg */ if (!css_tryget_online(&blkcg->css)) { - ret = -EINVAL; + ret = -ENODEV; goto err_free_blkg; } @@ -205,7 +205,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, if (blkcg_parent(blkcg)) { blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false); if (WARN_ON_ONCE(!blkg->parent)) { - ret = -EINVAL; + ret = -ENODEV; goto err_put_congested; } blkg_get(blkg->parent); @@ -279,7 +279,7 @@ struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, * we shouldn't allow anything to go through for a bypassing queue. */ if (unlikely(blk_queue_bypass(q))) - return ERR_PTR(blk_queue_dying(q) ? -EINVAL : -EBUSY); + return ERR_PTR(blk_queue_dying(q) ? 
-ENODEV : -EBUSY); blkg = __blkg_lookup(blkcg, q, true); if (blkg) @@ -792,10 +792,10 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, disk = get_gendisk(MKDEV(major, minor), &part); if (!disk) - return -EINVAL; + return -ENODEV; if (part) { put_disk(disk); - return -EINVAL; + return -ENODEV; } rcu_read_lock(); @@ -804,7 +804,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, if (blkcg_policy_enabled(disk->queue, pol)) blkg = blkg_lookup_create(blkcg, disk->queue); else - blkg = ERR_PTR(-EINVAL); + blkg = ERR_PTR(-EOPNOTSUPP); if (IS_ERR(blkg)) { ret = PTR_ERR(blkg); diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index bcf40266bd0a..38277e3ce286 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1755,7 +1755,7 @@ static ssize_t __cfqg_set_weight_device(struct kernfs_open_file *of, cfqg = blkg_to_cfqg(ctx.blkg); cfqgd = blkcg_to_cfqgd(blkcg); - ret = -EINVAL; + ret = -ERANGE; if (!ctx.v || (ctx.v >= CFQ_WEIGHT_MIN && ctx.v <= CFQ_WEIGHT_MAX)) { if (!is_leaf_weight) { cfqg->dev_weight = ctx.v; -- cgit v1.2.3 From c165b3e3c7bb68c2ed55a5ac2623f030d01d9567 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:29 -0700 Subject: blkcg: rename subsystem name from blkio to io blkio interface has become messy over time and is currently the largest. In addition to the inconsistent naming scheme, it has multiple stat files which report more or less the same thing, a number of debug stat files which expose internal details which shouldn't have been part of the public interface in the first place, recursive and non-recursive stats and leaf and non-leaf knobs. Both recursive vs. non-recursive and leaf vs. non-leaf distinctions don't make any sense on the unified hierarchy as only leaf cgroups can contain processes. cgroups is going through a major interface revision with the unified hierarchy involving significant fundamental usage changes and given that a significant portion of the interface doesn't make sense anymore, it's a good time to reorganize the interface. As the first step, this patch renames the external visible subsystem name from "blkio" to "io". This is more concise, matches the other two major subsystem names, "cpu" and "memory", and better suited as blkcg will be involved in anything writeback related too whether an actual block device is involved or not. As the subsystem legacy_name is set to "blkio", the only userland visible change outside the unified hierarchy is that blkcg is reported as "io" instead of "blkio" in the subsystem initialized message during boot. On the unified hierarchy, blkcg now appears as "io". 
Signed-off-by: Tejun Heo Cc: Li Zefan Cc: Johannes Weiner Cc: cgroups@vger.kernel.org Signed-off-by: Jens Axboe --- block/bio.c | 2 +- block/blk-cgroup.c | 7 ++++--- include/linux/backing-dev.h | 2 +- include/linux/blk-cgroup.h | 4 ++-- include/linux/cgroup_subsys.h | 2 +- mm/backing-dev.c | 4 ++-- 6 files changed, 11 insertions(+), 10 deletions(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index d6e5ba3399f0..c52222c6c69b 100644 --- a/block/bio.c +++ b/block/bio.c @@ -2046,7 +2046,7 @@ int bio_associate_current(struct bio *bio) get_io_context_active(ioc); bio->bi_ioc = ioc; - bio->bi_css = task_get_css(current, blkio_cgrp_id); + bio->bi_css = task_get_css(current, io_cgrp_id); return 0; } EXPORT_SYMBOL_GPL(bio_associate_current); diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 0af3bff198ed..fc197ea4c992 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1089,12 +1089,13 @@ static int blkcg_can_attach(struct cgroup_subsys_state *css, return ret; } -struct cgroup_subsys blkio_cgrp_subsys = { +struct cgroup_subsys io_cgrp_subsys = { .css_alloc = blkcg_css_alloc, .css_offline = blkcg_css_offline, .css_free = blkcg_css_free, .can_attach = blkcg_can_attach, .legacy_cftypes = blkcg_files, + .legacy_name = "blkio", #ifdef CONFIG_MEMCG /* * This ensures that, if available, memcg is automatically enabled @@ -1104,7 +1105,7 @@ struct cgroup_subsys blkio_cgrp_subsys = { .depends_on = 1 << memory_cgrp_id, #endif }; -EXPORT_SYMBOL_GPL(blkio_cgrp_subsys); +EXPORT_SYMBOL_GPL(io_cgrp_subsys); /** * blkcg_activate_policy - activate a blkcg policy on a request_queue @@ -1266,7 +1267,7 @@ int blkcg_policy_register(struct blkcg_policy *pol) /* everything is in place, add intf files for the new policy */ if (pol->cftypes) - WARN_ON(cgroup_add_legacy_cftypes(&blkio_cgrp_subsys, + WARN_ON(cgroup_add_legacy_cftypes(&io_cgrp_subsys, pol->cftypes)); mutex_unlock(&blkcg_pol_register_mutex); return 0; diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 23ebb946e66f..5a5d79ee256f 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -286,7 +286,7 @@ static inline struct bdi_writeback *wb_find_current(struct backing_dev_info *bdi * %current's blkcg equals the effective blkcg of its memcg. No * need to use the relatively expensive cgroup_get_e_css(). 
*/ - if (likely(wb && wb->blkcg_css == task_css(current, blkio_cgrp_id))) + if (likely(wb && wb->blkcg_css == task_css(current, io_cgrp_id))) return wb; return NULL; } diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 286e1bde249f..db89acd2a864 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -221,7 +221,7 @@ static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css) static inline struct blkcg *task_blkcg(struct task_struct *tsk) { - return css_to_blkcg(task_css(tsk, blkio_cgrp_id)); + return css_to_blkcg(task_css(tsk, io_cgrp_id)); } static inline struct blkcg *bio_blkcg(struct bio *bio) @@ -234,7 +234,7 @@ static inline struct blkcg *bio_blkcg(struct bio *bio) static inline struct cgroup_subsys_state * task_get_blkcg_css(struct task_struct *task) { - return task_get_css(task, blkio_cgrp_id); + return task_get_css(task, io_cgrp_id); } /** diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h index e4a96fb14403..86b5056104df 100644 --- a/include/linux/cgroup_subsys.h +++ b/include/linux/cgroup_subsys.h @@ -16,7 +16,7 @@ SUBSYS(cpuacct) #endif #if IS_ENABLED(CONFIG_BLK_CGROUP) -SUBSYS(blkio) +SUBSYS(io) #endif #if IS_ENABLED(CONFIG_MEMCG) diff --git a/mm/backing-dev.c b/mm/backing-dev.c index dac5bf59309d..d0ee90e72867 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -523,7 +523,7 @@ static int cgwb_create(struct backing_dev_info *bdi, int ret = 0; memcg = mem_cgroup_from_css(memcg_css); - blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &blkio_cgrp_subsys); + blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys); blkcg = css_to_blkcg(blkcg_css); memcg_cgwb_list = mem_cgroup_cgwb_list(memcg); blkcg_cgwb_list = &blkcg->cgwb_list; @@ -645,7 +645,7 @@ struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi, /* see whether the blkcg association has changed */ blkcg_css = cgroup_get_e_css(memcg_css->cgroup, - &blkio_cgrp_subsys); + &io_cgrp_subsys); if (unlikely(wb->blkcg_css != blkcg_css || !wb_tryget(wb))) wb = NULL; -- cgit v1.2.3 From 880f50e228f80626dff6327a6e281e40286f5228 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:30 -0700 Subject: blkcg: mark existing cftypes as legacy blkcg is about to grow interface for the unified hierarchy. Add legacy to existing cftypes. * blkcg_policy->cftypes -> blkcg_policy->legacy_cftypes * blk-cgroup.c:blkcg_files -> blkcg_legacy_files * cfq-iosched.c:cfq_blkcg_files -> cfq_blkcg_legacy_files * blk-throttle.c:throtl_files -> throtl_legacy_files Pure renames. No functional change. 
Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 12 ++++++------ block/blk-throttle.c | 4 ++-- block/cfq-iosched.c | 4 ++-- include/linux/blk-cgroup.h | 2 +- 4 files changed, 11 insertions(+), 11 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index fc197ea4c992..429726bed560 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -847,7 +847,7 @@ void blkg_conf_finish(struct blkg_conf_ctx *ctx) } EXPORT_SYMBOL_GPL(blkg_conf_finish); -struct cftype blkcg_files[] = { +struct cftype blkcg_legacy_files[] = { { .name = "reset_stats", .write_u64 = blkcg_reset_stats, @@ -1094,7 +1094,7 @@ struct cgroup_subsys io_cgrp_subsys = { .css_offline = blkcg_css_offline, .css_free = blkcg_css_free, .can_attach = blkcg_can_attach, - .legacy_cftypes = blkcg_files, + .legacy_cftypes = blkcg_legacy_files, .legacy_name = "blkio", #ifdef CONFIG_MEMCG /* @@ -1266,9 +1266,9 @@ int blkcg_policy_register(struct blkcg_policy *pol) mutex_unlock(&blkcg_pol_mutex); /* everything is in place, add intf files for the new policy */ - if (pol->cftypes) + if (pol->legacy_cftypes) WARN_ON(cgroup_add_legacy_cftypes(&io_cgrp_subsys, - pol->cftypes)); + pol->legacy_cftypes)); mutex_unlock(&blkcg_pol_register_mutex); return 0; @@ -1305,8 +1305,8 @@ void blkcg_policy_unregister(struct blkcg_policy *pol) goto out_unlock; /* kill the intf files first */ - if (pol->cftypes) - cgroup_rm_cftypes(pol->cftypes); + if (pol->legacy_cftypes) + cgroup_rm_cftypes(pol->legacy_cftypes); /* remove cpds and unregister */ mutex_lock(&blkcg_pol_mutex); diff --git a/block/blk-throttle.c b/block/blk-throttle.c index bd3e4b2c7a9d..8b4f6b81bb72 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -1217,7 +1217,7 @@ static ssize_t tg_set_conf_uint(struct kernfs_open_file *of, return tg_set_conf(of, buf, nbytes, off, false); } -static struct cftype throtl_files[] = { +static struct cftype throtl_legacy_files[] = { { .name = "throttle.read_bps_device", .private = offsetof(struct throtl_grp, bps[READ]), @@ -1263,7 +1263,7 @@ static void throtl_shutdown_wq(struct request_queue *q) } static struct blkcg_policy blkcg_policy_throtl = { - .cftypes = throtl_files, + .legacy_cftypes = throtl_legacy_files, .pd_alloc_fn = throtl_pd_alloc, .pd_init_fn = throtl_pd_init, diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 38277e3ce286..baa8459f7064 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1944,7 +1944,7 @@ static int cfqg_print_avg_queue_size(struct seq_file *sf, void *v) } #endif /* CONFIG_DEBUG_BLK_CGROUP */ -static struct cftype cfq_blkcg_files[] = { +static struct cftype cfq_blkcg_legacy_files[] = { /* on root, weight is mapped to leaf_weight */ { .name = "weight_device", @@ -4654,7 +4654,7 @@ static struct elevator_type iosched_cfq = { #ifdef CONFIG_CFQ_GROUP_IOSCHED static struct blkcg_policy blkcg_policy_cfq = { - .cftypes = cfq_blkcg_files, + .legacy_cftypes = cfq_blkcg_legacy_files, .cpd_alloc_fn = cfq_cpd_alloc, .cpd_init_fn = cfq_cpd_init, diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index db89acd2a864..6e016e6fee87 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -148,7 +148,7 @@ typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkg_policy_data *pd); struct blkcg_policy { int plid; /* cgroup files for the policy */ - struct cftype *cftypes; + struct cftype *legacy_cftypes; /* operations */ blkcg_pol_alloc_cpd_fn *cpd_alloc_fn; -- cgit v1.2.3 From 36aa9e5f591e84d67aad2c5bff75e413d77660dd Mon Sep 
17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:31 -0700 Subject: blkcg: move body parsing from blkg_conf_prep() to its callers Currently, blkg_conf_prep() expects input to be of the following form MAJ:MIN NUM and reads the NUM part into blkg_conf_ctx->v. This is quite restrictive and gets in the way of implementing blkcg interface for the unified hierarchy. This patch updates blkg_conf_prep() so that it expects MAJ:MIN BODY_STR where BODY_STR is an arbitrary string. blkg_conf_ctx->v is replaced with ->body, which is a char pointer pointing to the start of BODY_STR. Parsing of the body is moved to blkg_conf_prep()'s callers. To allow using, for example, strsep() on blkg_conf_ctx->body, it is a non-const pointer, and to accommodate that, const is dropped from @input too. This doesn't cause any behavior changes. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 22 ++++++++++++++-------- block/blk-throttle.c | 18 ++++++++++++------ block/cfq-iosched.c | 17 +++++++++++------ include/linux/blk-cgroup.h | 4 ++-- 4 files changed, 39 insertions(+), 22 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 429726bed560..8ee1ca4d4f2f 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -24,6 +24,7 @@ #include #include #include +#include <linux/ctype.h> #include #include "blk.h" @@ -773,23 +774,28 @@ EXPORT_SYMBOL_GPL(blkg_rwstat_recursive_sum); * @ctx: blkg_conf_ctx to be filled * * Parse per-blkg config update from @input and initialize @ctx with the - * result. @ctx->blkg points to the blkg to be updated and @ctx->v the new - * value. This function returns with RCU read lock and queue lock held and - * must be paired with blkg_conf_finish(). + * result. @ctx->blkg points to the blkg to be updated and @ctx->body the + * part of @input following MAJ:MIN. This function returns with RCU read + * lock and queue lock held and must be paired with blkg_conf_finish(). 
*/ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, - const char *input, struct blkg_conf_ctx *ctx) + char *input, struct blkg_conf_ctx *ctx) __acquires(rcu) __acquires(disk->queue->queue_lock) { struct gendisk *disk; struct blkcg_gq *blkg; unsigned int major, minor; - unsigned long long v; - int part, ret; + int key_len, part, ret; + char *body; - if (sscanf(input, "%u:%u %llu", &major, &minor, &v) != 3) + if (sscanf(input, "%u:%u%n", &major, &minor, &key_len) != 2) return -EINVAL; + body = input + key_len; + if (!isspace(*body)) + return -EINVAL; + body = skip_spaces(body); + disk = get_gendisk(MKDEV(major, minor), &part); if (!disk) return -ENODEV; @@ -826,7 +832,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, ctx->disk = disk; ctx->blkg = blkg; - ctx->v = v; + ctx->body = body; return 0; } EXPORT_SYMBOL_GPL(blkg_conf_prep); diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 8b4f6b81bb72..0e17c8f92d25 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -1154,21 +1154,25 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of, struct blkcg_gq *blkg; struct cgroup_subsys_state *pos_css; int ret; + u64 v; ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx); if (ret) return ret; + ret = -EINVAL; + if (sscanf(ctx.body, "%llu", &v) != 1) + goto out_finish; + if (!v) + v = -1; + tg = blkg_to_tg(ctx.blkg); sq = &tg->service_queue; - if (!ctx.v) - ctx.v = -1; - if (is_u64) - *(u64 *)((void *)tg + of_cft(of)->private) = ctx.v; + *(u64 *)((void *)tg + of_cft(of)->private) = v; else - *(unsigned int *)((void *)tg + of_cft(of)->private) = ctx.v; + *(unsigned int *)((void *)tg + of_cft(of)->private) = v; throtl_log(&tg->service_queue, "limit change rbps=%llu wbps=%llu riops=%u wiops=%u", @@ -1201,8 +1205,10 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of, throtl_schedule_next_dispatch(sq->parent_sq, true); } + ret = 0; +out_finish: blkg_conf_finish(&ctx); - return nbytes; + return ret ?: nbytes; } static ssize_t tg_set_conf_u64(struct kernfs_open_file *of, diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index baa8459f7064..ea88d8905bbd 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1747,26 +1747,31 @@ static ssize_t __cfqg_set_weight_device(struct kernfs_open_file *of, struct cfq_group *cfqg; struct cfq_group_data *cfqgd; int ret; + u64 v; ret = blkg_conf_prep(blkcg, &blkcg_policy_cfq, buf, &ctx); if (ret) return ret; + ret = -EINVAL; + if (sscanf(ctx.body, "%llu", &v) != 1) + goto out_finish; + cfqg = blkg_to_cfqg(ctx.blkg); cfqgd = blkcg_to_cfqgd(blkcg); ret = -ERANGE; - if (!ctx.v || (ctx.v >= CFQ_WEIGHT_MIN && ctx.v <= CFQ_WEIGHT_MAX)) { + if (!v || (v >= CFQ_WEIGHT_MIN && v <= CFQ_WEIGHT_MAX)) { if (!is_leaf_weight) { - cfqg->dev_weight = ctx.v; - cfqg->new_weight = ctx.v ?: cfqgd->weight; + cfqg->dev_weight = v; + cfqg->new_weight = v ?: cfqgd->weight; } else { - cfqg->dev_leaf_weight = ctx.v; - cfqg->new_leaf_weight = ctx.v ?: cfqgd->leaf_weight; + cfqg->dev_leaf_weight = v; + cfqg->new_leaf_weight = v ?: cfqgd->leaf_weight; } ret = 0; } - +out_finish: blkg_conf_finish(&ctx); return ret ?: nbytes; } diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 6e016e6fee87..85a4d989ae43 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -206,11 +206,11 @@ struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, struct blkg_conf_ctx { struct gendisk *disk; struct blkcg_gq *blkg; - u64 v; + char *body; }; int 
blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, - const char *input, struct blkg_conf_ctx *ctx); + char *input, struct blkg_conf_ctx *ctx); void blkg_conf_finish(struct blkg_conf_ctx *ctx); -- cgit v1.2.3 From 69948b070ee2bc3cc253e862cbe2bb09b173d7bd Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:32 -0700 Subject: blkcg: separate out tg_conf_updated() from tg_set_conf() tg_set_conf() largely consists of parsing and setting the new config and the follow-up application and propagation. This patch separates out the latter part into tg_conf_updated(). This will be used to implement the interface for the unified hierarchy. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-throttle.c | 60 ++++++++++++++++++++++++++++------------------ 1 file changed, 32 insertions(+), 28 deletions(-) (limited to 'block') diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 0e17c8f92d25..a8bb2fd8f523 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -1144,35 +1144,11 @@ static int tg_print_conf_uint(struct seq_file *sf, void *v) return 0; } -static ssize_t tg_set_conf(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off, bool is_u64) +static void tg_conf_updated(struct throtl_grp *tg) { - struct blkcg *blkcg = css_to_blkcg(of_css(of)); - struct blkg_conf_ctx ctx; - struct throtl_grp *tg; - struct throtl_service_queue *sq; - struct blkcg_gq *blkg; + struct throtl_service_queue *sq = &tg->service_queue; struct cgroup_subsys_state *pos_css; - int ret; - u64 v; - - ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx); - if (ret) - return ret; - - ret = -EINVAL; - if (sscanf(ctx.body, "%llu", &v) != 1) - goto out_finish; - if (!v) - v = -1; - - tg = blkg_to_tg(ctx.blkg); - sq = &tg->service_queue; - - if (is_u64) - *(u64 *)((void *)tg + of_cft(of)->private) = v; - else - *(unsigned int *)((void *)tg + of_cft(of)->private) = v; + struct blkcg_gq *blkg; throtl_log(&tg->service_queue, "limit change rbps=%llu wbps=%llu riops=%u wiops=%u", @@ -1186,7 +1162,7 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of, * restrictions in the whole hierarchy and allows them to bypass * blk-throttle. */ - blkg_for_each_descendant_pre(blkg, pos_css, ctx.blkg) + blkg_for_each_descendant_pre(blkg, pos_css, tg_to_blkg(tg)) tg_update_has_rules(blkg_to_tg(blkg)); /* @@ -1204,7 +1180,35 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of, tg_update_disptime(tg); throtl_schedule_next_dispatch(sq->parent_sq, true); } +} + +static ssize_t tg_set_conf(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off, bool is_u64) +{ + struct blkcg *blkcg = css_to_blkcg(of_css(of)); + struct blkg_conf_ctx ctx; + struct throtl_grp *tg; + int ret; + u64 v; + + ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx); + if (ret) + return ret; + + ret = -EINVAL; + if (sscanf(ctx.body, "%llu", &v) != 1) + goto out_finish; + if (!v) + v = -1; + + tg = blkg_to_tg(ctx.blkg); + + if (is_u64) + *(u64 *)((void *)tg + of_cft(of)->private) = v; + else + *(unsigned int *)((void *)tg + of_cft(of)->private) = v; + tg_conf_updated(tg); ret = 0; out_finish: blkg_conf_finish(&ctx); -- cgit v1.2.3 From dd165eb3bb4ef16bcdb75417add40633f38c52b8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:33 -0700 Subject: blkcg: misc preparations for unified hierarchy interface * Export blkg_dev_name() * Drop unnecessary @cft from __cfq_set_weight(). 
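As an aside on the body-parsing convention established earlier in this series: the following is a minimal user-space sketch (an illustration only, not the kernel code; the device numbers and body string are made up) of the MAJ:MIN/body split that blkg_conf_prep() now performs with sscanf()'s %n and skip_spaces():

#include <ctype.h>
#include <stdio.h>

int main(void)
{
	char input[] = "8:16 rbps=1048576 wiops=max";
	unsigned int major, minor;
	int key_len;
	char *body;

	/* %n records how many characters the MAJ:MIN key consumed */
	if (sscanf(input, "%u:%u%n", &major, &minor, &key_len) != 2)
		return 1;
	body = input + key_len;
	if (!isspace((unsigned char)*body))	/* reject e.g. "8:16junk" */
		return 1;
	while (isspace((unsigned char)*body))	/* stand-in for skip_spaces() */
		body++;
	printf("dev %u:%u body \"%s\"\n", major, minor, body);
	return 0;
}

Everything after the MAJ:MIN key is handed back verbatim, which is what lets each caller apply its own sscanf()/strsep() parsing to ctx.body.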
Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 3 ++- block/cfq-iosched.c | 8 ++++---- include/linux/blk-cgroup.h | 1 + 3 files changed, 7 insertions(+), 5 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 8ee1ca4d4f2f..b5e72d756be1 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -462,13 +462,14 @@ static int blkcg_reset_stats(struct cgroup_subsys_state *css, return 0; } -static const char *blkg_dev_name(struct blkcg_gq *blkg) +const char *blkg_dev_name(struct blkcg_gq *blkg) { /* some drivers (floppy) instantiate a queue w/o disk registered */ if (blkg->q->backing_dev_info.dev) return dev_name(blkg->q->backing_dev_info.dev); return NULL; } +EXPORT_SYMBOL_GPL(blkg_dev_name); /** * blkcg_print_blkgs - helper for printing per-blkg data diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index ea88d8905bbd..7a7230136e43 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1788,8 +1788,8 @@ static ssize_t cfqg_set_leaf_weight_device(struct kernfs_open_file *of, return __cfqg_set_weight_device(of, buf, nbytes, off, true); } -static int __cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft, - u64 val, bool is_leaf_weight) +static int __cfq_set_weight(struct cgroup_subsys_state *css, u64 val, + bool is_leaf_weight) { struct blkcg *blkcg = css_to_blkcg(css); struct blkcg_gq *blkg; @@ -1834,13 +1834,13 @@ out: static int cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft, u64 val) { - return __cfq_set_weight(css, cft, val, false); + return __cfq_set_weight(css, val, false); } static int cfq_set_leaf_weight(struct cgroup_subsys_state *css, struct cftype *cft, u64 val) { - return __cfq_set_weight(css, cft, val, true); + return __cfq_set_weight(css, val, true); } static int cfqg_print_stat(struct seq_file *sf, void *v) diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 85a4d989ae43..b270aef519c6 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -182,6 +182,7 @@ int blkcg_activate_policy(struct request_queue *q, void blkcg_deactivate_policy(struct request_queue *q, const struct blkcg_policy *pol); +const char *blkg_dev_name(struct blkcg_gq *blkg); void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg, u64 (*prfill)(struct seq_file *, struct blkg_policy_data *, int), -- cgit v1.2.3 From 2ee867dcfa2eaef1063b686da55c35878b2da4a2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:34 -0700 Subject: blkcg: implement interface for the unified hierarchy The blkcg interface grew to be the biggest of all controllers and unfortunately most inconsistent too. The interface files are inconsistent with a number of close duplicates. Some files have recursive variants while others don't. There's a distinction between normal and leaf weights which isn't intuitive and there are a lot of stat knobs which don't make much sense outside of debugging and expose too many implementation details to userland. In the unified hierarchy, everything is always hierarchical and internal nodes can't have tasks, rendering moot the two structural issues twisting the current interface. The interface has to be updated significantly anyway and this is a good chance to revamp it as a whole. This patch implements the blkcg interface for the unified hierarchy. * (from a previous patch) blkcg is identified by "io" instead of "blkio" on the unified hierarchy. 
Given that the whole interface is updated anyway, the rename shouldn't carry noticeable conversion overhead. * The original interface, which consisted of 27 files, is replaced with the following three files. io.stat : per-blkcg stats io.weight : per-cgroup and per-cgroup-queue weight settings io.max : per-cgroup-queue bps and iops max limits Documentation/cgroups/unified-hierarchy.txt updated accordingly. v2: blkcg_policy->dfl_cftypes wasn't removed on blkcg_policy_unregister(), corrupting the cftypes list. Fixed. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- Documentation/cgroups/unified-hierarchy.txt | 61 ++++++++++++++- block/blk-cgroup.c | 53 +++++++++++++ block/blk-throttle.c | 112 ++++++++++++++++++++++++++++ block/cfq-iosched.c | 61 +++++++++++++-- include/linux/blk-cgroup.h | 1 + 5 files changed, 279 insertions(+), 9 deletions(-) (limited to 'block') diff --git a/Documentation/cgroups/unified-hierarchy.txt b/Documentation/cgroups/unified-hierarchy.txt index 1ee9caf29e57..bd1ce15d5178 100644 --- a/Documentation/cgroups/unified-hierarchy.txt +++ b/Documentation/cgroups/unified-hierarchy.txt @@ -27,7 +27,7 @@ CONTENTS 5-3-1. Format 5-3-2. Control Knobs 5-4. Per-Controller Changes - 5-4-1. blkio + 5-4-1. io 5-4-2. cpuset 5-4-3. memory 6. Planned Changes @@ -203,7 +203,7 @@ other issues. The mapping from nice level to weight isn't obvious or universal, and there are various other knobs which simply aren't available for tasks. -The blkio controller implicitly creates a hidden leaf node for each +The io controller implicitly creates a hidden leaf node for each cgroup to host the tasks. The hidden leaf has its own copies of all the knobs with "leaf_" prefixed. While this allows equivalent control over internal tasks, it's with serious drawbacks. It always adds an @@ -438,9 +438,62 @@ may be specified in any order and not all pairs have to be specified. 5-4. Per-Controller Changes -5-4-1. blkio +5-4-1. io -- blk-throttle becomes properly hierarchical. +- blkio is renamed to io. The interface is overhauled anyway. The + new name is more in line with the other two major controllers, cpu + and memory, and better suited given that it may be used for cgroup + writeback without involving block layer. + +- Everything including stat is always hierarchical making separate + recursive stat files pointless and, as no internal node can have + tasks, leaf weights are meaningless. The operation model is + simplified and the interface is overhauled accordingly. + + io.stat + + The stat file. The reported stats are from the point where + bios are issued to request_queue. The stats are counted + independent of which policies are enabled. Each line in the + file follows the following format. More fields may later be + added at the end. + + $MAJ:$MIN rbytes=$RBYTES wbytes=$WBYTES rios=$RIOS wios=$WIOS + + io.weight + + The weight setting, currently only available and effective if + cfq-iosched is in use for the target device. The weight is + between 10 and 1000 and defaults to 500. The first line + always contains the default weight in the following format to + use when per-device setting is missing. + + default $WEIGHT + + Subsequent lines list per-device weights of the following + format. + + $MAJ:$MIN $WEIGHT + + Writing "$WEIGHT" or "default $WEIGHT" changes the default + setting. Writing "$MAJ:$MIN $WEIGHT" sets per-device weight + while "$MAJ:$MIN default" clears it. + + This file is available only on non-root cgroups. 
+ + io.max + + The maximum bandwidth and/or iops setting, only available if + blk-throttle is enabled. The file is of the following format. + + $MAJ:$MIN rbps=$RBPS wbps=$WBPS riops=$RIOPS wiops=$WIOPS + + ${R|W}BPS are read/write bytes per second and ${R|W}IOPS are + read/write IOs per second. "max" indicates no limit. Writing + to the file follows the same format but the individual + settings may be omitted or specified in any order. + + This file is available only on non-root cgroups. 5-4-2. cpuset diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index b5e72d756be1..88bdb73bd5e0 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -854,6 +854,53 @@ void blkg_conf_finish(struct blkg_conf_ctx *ctx) } EXPORT_SYMBOL_GPL(blkg_conf_finish); +static int blkcg_print_stat(struct seq_file *sf, void *v) +{ + struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); + struct blkcg_gq *blkg; + + rcu_read_lock(); + + hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) { + const char *dname; + struct blkg_rwstat rwstat; + u64 rbytes, wbytes, rios, wios; + + dname = blkg_dev_name(blkg); + if (!dname) + continue; + + spin_lock_irq(blkg->q->queue_lock); + + rwstat = blkg_rwstat_recursive_sum(blkg, NULL, + offsetof(struct blkcg_gq, stat_bytes)); + rbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]); + wbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]); + + rwstat = blkg_rwstat_recursive_sum(blkg, NULL, + offsetof(struct blkcg_gq, stat_ios)); + rios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]); + wios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]); + + spin_unlock_irq(blkg->q->queue_lock); + + if (rbytes || wbytes || rios || wios) + seq_printf(sf, "%s rbytes=%llu wbytes=%llu rios=%llu wios=%llu\n", + dname, rbytes, wbytes, rios, wios); + } + + rcu_read_unlock(); + return 0; +} + +struct cftype blkcg_files[] = { + { + .name = "stat", + .seq_show = blkcg_print_stat, + }, + { } /* terminate */ +}; + struct cftype blkcg_legacy_files[] = { { .name = "reset_stats", @@ -1101,6 +1148,7 @@ struct cgroup_subsys io_cgrp_subsys = { .css_offline = blkcg_css_offline, .css_free = blkcg_css_free, .can_attach = blkcg_can_attach, + .dfl_cftypes = blkcg_files, .legacy_cftypes = blkcg_legacy_files, .legacy_name = "blkio", #ifdef CONFIG_MEMCG @@ -1273,6 +1321,9 @@ int blkcg_policy_register(struct blkcg_policy *pol) mutex_unlock(&blkcg_pol_mutex); /* everything is in place, add intf files for the new policy */ + if (pol->dfl_cftypes) + WARN_ON(cgroup_add_dfl_cftypes(&io_cgrp_subsys, + pol->dfl_cftypes)); if (pol->legacy_cftypes) WARN_ON(cgroup_add_legacy_cftypes(&io_cgrp_subsys, pol->legacy_cftypes)); @@ -1312,6 +1363,8 @@ void blkcg_policy_unregister(struct blkcg_policy *pol) goto out_unlock; /* kill the intf files first */ + if (pol->dfl_cftypes) + cgroup_rm_cftypes(pol->dfl_cftypes); if (pol->legacy_cftypes) cgroup_rm_cftypes(pol->legacy_cftypes); diff --git a/block/blk-throttle.c b/block/blk-throttle.c index a8bb2fd8f523..c75a2636dd40 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -1265,6 +1265,117 @@ static struct cftype throtl_legacy_files[] = { { } /* terminate */ }; +static u64 tg_prfill_max(struct seq_file *sf, struct blkg_policy_data *pd, + int off) +{ + struct throtl_grp *tg = pd_to_tg(pd); + const char *dname = blkg_dev_name(pd->blkg); + char bufs[4][21] = { "max", "max", "max", "max" }; + + if (!dname) + return 0; + if (tg->bps[READ] == -1 && tg->bps[WRITE] == -1 && + tg->iops[READ] == -1 && tg->iops[WRITE] == -1) + return 0; + + if (tg->bps[READ] != -1) + 
snprintf(bufs[0], sizeof(bufs[0]), "%llu", tg->bps[READ]); + if (tg->bps[WRITE] != -1) + snprintf(bufs[1], sizeof(bufs[1]), "%llu", tg->bps[WRITE]); + if (tg->iops[READ] != -1) + snprintf(bufs[2], sizeof(bufs[2]), "%u", tg->iops[READ]); + if (tg->iops[WRITE] != -1) + snprintf(bufs[3], sizeof(bufs[3]), "%u", tg->iops[WRITE]); + + seq_printf(sf, "%s rbps=%s wbps=%s riops=%s wiops=%s\n", + dname, bufs[0], bufs[1], bufs[2], bufs[3]); + return 0; +} + +static int tg_print_max(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_max, + &blkcg_policy_throtl, seq_cft(sf)->private, false); + return 0; +} + +static ssize_t tg_set_max(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct blkcg *blkcg = css_to_blkcg(of_css(of)); + struct blkg_conf_ctx ctx; + struct throtl_grp *tg; + u64 v[4]; + int ret; + + ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx); + if (ret) + return ret; + + tg = blkg_to_tg(ctx.blkg); + + v[0] = tg->bps[READ]; + v[1] = tg->bps[WRITE]; + v[2] = tg->iops[READ]; + v[3] = tg->iops[WRITE]; + + while (true) { + char tok[27]; /* wiops=18446744073709551616 */ + char *p; + u64 val = -1; + int len; + + if (sscanf(ctx.body, "%26s%n", tok, &len) != 1) + break; + if (tok[0] == '\0') + break; + ctx.body += len; + + ret = -EINVAL; + p = tok; + strsep(&p, "="); + if (!p || (sscanf(p, "%llu", &val) != 1 && strcmp(p, "max"))) + goto out_finish; + + ret = -ERANGE; + if (!val) + goto out_finish; + + ret = -EINVAL; + if (!strcmp(tok, "rbps")) + v[0] = val; + else if (!strcmp(tok, "wbps")) + v[1] = val; + else if (!strcmp(tok, "riops")) + v[2] = min_t(u64, val, UINT_MAX); + else if (!strcmp(tok, "wiops")) + v[3] = min_t(u64, val, UINT_MAX); + else + goto out_finish; + } + + tg->bps[READ] = v[0]; + tg->bps[WRITE] = v[1]; + tg->iops[READ] = v[2]; + tg->iops[WRITE] = v[3]; + + tg_conf_updated(tg); + ret = 0; +out_finish: + blkg_conf_finish(&ctx); + return ret ?: nbytes; +} + +static struct cftype throtl_files[] = { + { + .name = "max", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = tg_print_max, + .write = tg_set_max, + }, + { } /* terminate */ +}; + static void throtl_shutdown_wq(struct request_queue *q) { struct throtl_data *td = q->td; @@ -1273,6 +1384,7 @@ static void throtl_shutdown_wq(struct request_queue *q) } static struct blkcg_policy blkcg_policy_throtl = { + .dfl_cftypes = throtl_files, .legacy_cftypes = throtl_legacy_files, .pd_alloc_fn = throtl_pd_alloc, diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 7a7230136e43..97da5719c87a 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1740,7 +1740,7 @@ static int cfq_print_leaf_weight(struct seq_file *sf, void *v) static ssize_t __cfqg_set_weight_device(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off, - bool is_leaf_weight) + bool on_dfl, bool is_leaf_weight) { struct blkcg *blkcg = css_to_blkcg(of_css(of)); struct blkg_conf_ctx ctx; @@ -1753,9 +1753,17 @@ static ssize_t __cfqg_set_weight_device(struct kernfs_open_file *of, if (ret) return ret; - ret = -EINVAL; - if (sscanf(ctx.body, "%llu", &v) != 1) + if (sscanf(ctx.body, "%llu", &v) == 1) { + /* require "default" on dfl */ + ret = -ERANGE; + if (!v && on_dfl) + goto out_finish; + } else if (!strcmp(strim(ctx.body), "default")) { + v = 0; + } else { + ret = -EINVAL; goto out_finish; + } cfqg = blkg_to_cfqg(ctx.blkg); cfqgd = blkcg_to_cfqgd(blkcg); @@ -1779,13 +1787,13 @@ out_finish: static ssize_t cfqg_set_weight_device(struct kernfs_open_file *of, char *buf, size_t 
nbytes, loff_t off) { - return __cfqg_set_weight_device(of, buf, nbytes, off, false); + return __cfqg_set_weight_device(of, buf, nbytes, off, false, false); } static ssize_t cfqg_set_leaf_weight_device(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { - return __cfqg_set_weight_device(of, buf, nbytes, off, true); + return __cfqg_set_weight_device(of, buf, nbytes, off, false, true); } static int __cfq_set_weight(struct cgroup_subsys_state *css, u64 val, @@ -2103,6 +2111,48 @@ static struct cftype cfq_blkcg_legacy_files[] = { #endif /* CONFIG_DEBUG_BLK_CGROUP */ { } /* terminate */ }; + +static int cfq_print_weight_on_dfl(struct seq_file *sf, void *v) +{ + struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); + struct cfq_group_data *cgd = blkcg_to_cfqgd(blkcg); + + seq_printf(sf, "default %u\n", cgd->weight); + blkcg_print_blkgs(sf, blkcg, cfqg_prfill_weight_device, + &blkcg_policy_cfq, 0, false); + return 0; +} + +static ssize_t cfq_set_weight_on_dfl(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + char *endp; + int ret; + u64 v; + + buf = strim(buf); + + /* "WEIGHT" or "default WEIGHT" sets the default weight */ + v = simple_strtoull(buf, &endp, 0); + if (*endp == '\0' || sscanf(buf, "default %llu", &v) == 1) { + ret = __cfq_set_weight(of_css(of), v, false); + return ret ?: nbytes; + } + + /* "MAJ:MIN WEIGHT" */ + return __cfqg_set_weight_device(of, buf, nbytes, off, true, false); +} + +static struct cftype cfq_blkcg_files[] = { + { + .name = "weight", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = cfq_print_weight_on_dfl, + .write = cfq_set_weight_on_dfl, + }, + { } /* terminate */ +}; + #else /* GROUP_IOSCHED */ static struct cfq_group *cfq_lookup_cfqg(struct cfq_data *cfqd, struct blkcg *blkcg) @@ -4659,6 +4709,7 @@ static struct elevator_type iosched_cfq = { #ifdef CONFIG_CFQ_GROUP_IOSCHED static struct blkcg_policy blkcg_policy_cfq = { + .dfl_cftypes = cfq_blkcg_files, .legacy_cftypes = cfq_blkcg_legacy_files, .cpd_alloc_fn = cfq_cpd_alloc, diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index b270aef519c6..9a7c4bd45fff 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -148,6 +148,7 @@ typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkg_policy_data *pd); struct blkcg_policy { int plid; /* cgroup files for the policy */ + struct cftype *dfl_cftypes; struct cftype *legacy_cftypes; /* operations */ -- cgit v1.2.3 From 3ecca62931ee6a30822a1ab7299bc8b8a21e5288 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:35 -0700 Subject: blkcg: s/CFQ_WEIGHT_*/CFQ_WEIGHT_LEGACY_*/ blkcg is going to switch to the cgroup common weight range as defined by CGROUP_WEIGHT_* on the unified hierarchy. In preparation, rename CFQ_WEIGHT_* constants to CFQ_WEIGHT_LEGACY_*. 
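As an aside before the weight-range switch: the io.max parser introduced two patches back tokenizes its "key=value" pairs with strsep(). A stripped-down user-space sketch of that idiom (an illustration with a made-up input string, not the kernel code):

#include <stdio.h>
#include <string.h>

int main(void)
{
	char buf[] = "rbps=1048576 wiops=max";
	char *body = buf;
	char *tok;

	while ((tok = strsep(&body, " ")) != NULL) {
		unsigned long long val = -1;	/* -1 doubles as "no limit" */
		char *p = tok;

		strsep(&p, "=");	/* tok keeps the key, p the value */
		if (!p || (sscanf(p, "%llu", &val) != 1 && strcmp(p, "max")))
			return 1;	/* malformed pair */
		printf("%s -> %llu\n", tok, val);
	}
	return 0;
}

The kernel version additionally rejects zero limits and clamps riops/wiops to UINT_MAX; tokens may appear in any order, and omitted keys simply keep their previous values.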
Signed-off-by: Tejun Heo Cc: Vivek Goyal Cc: Arianna Avanzini Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 97da5719c87a..0fe721eaff28 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -68,9 +68,9 @@ static struct kmem_cache *cfq_pool; #define rb_entry_cfqg(node) rb_entry((node), struct cfq_group, rb_node) /* blkio-related constants */ -#define CFQ_WEIGHT_MIN 10 -#define CFQ_WEIGHT_MAX 1000 -#define CFQ_WEIGHT_DEFAULT 500 +#define CFQ_WEIGHT_LEGACY_MIN 10 +#define CFQ_WEIGHT_LEGACY_DFL 500 +#define CFQ_WEIGHT_LEGACY_MAX 1000 struct cfq_ttime { unsigned long last_end_request; @@ -1580,11 +1580,11 @@ static void cfq_cpd_init(struct blkcg_policy_data *cpd) struct cfq_group_data *cgd = cpd_to_cfqgd(cpd); if (cpd_to_blkcg(cpd) == &blkcg_root) { - cgd->weight = 2 * CFQ_WEIGHT_DEFAULT; - cgd->leaf_weight = 2 * CFQ_WEIGHT_DEFAULT; + cgd->weight = 2 * CFQ_WEIGHT_LEGACY_DFL; + cgd->leaf_weight = 2 * CFQ_WEIGHT_LEGACY_DFL; } else { - cgd->weight = CFQ_WEIGHT_DEFAULT; - cgd->leaf_weight = CFQ_WEIGHT_DEFAULT; + cgd->weight = CFQ_WEIGHT_LEGACY_DFL; + cgd->leaf_weight = CFQ_WEIGHT_LEGACY_DFL; } } @@ -1769,7 +1769,7 @@ static ssize_t __cfqg_set_weight_device(struct kernfs_open_file *of, cfqgd = blkcg_to_cfqgd(blkcg); ret = -ERANGE; - if (!v || (v >= CFQ_WEIGHT_MIN && v <= CFQ_WEIGHT_MAX)) { + if (!v || (v >= CFQ_WEIGHT_LEGACY_MIN && v <= CFQ_WEIGHT_LEGACY_MAX)) { if (!is_leaf_weight) { cfqg->dev_weight = v; cfqg->new_weight = v ?: cfqgd->weight; @@ -1804,7 +1804,7 @@ static int __cfq_set_weight(struct cgroup_subsys_state *css, u64 val, struct cfq_group_data *cfqgd; int ret = 0; - if (val < CFQ_WEIGHT_MIN || val > CFQ_WEIGHT_MAX) + if (val < CFQ_WEIGHT_LEGACY_MIN || val > CFQ_WEIGHT_LEGACY_MAX) return -EINVAL; spin_lock_irq(&blkcg->lock); @@ -4513,8 +4513,8 @@ static int cfq_init_queue(struct request_queue *q, struct elevator_type *e) cfq_init_cfqg_base(cfqd->root_group); #endif - cfqd->root_group->weight = 2 * CFQ_WEIGHT_DEFAULT; - cfqd->root_group->leaf_weight = 2 * CFQ_WEIGHT_DEFAULT; + cfqd->root_group->weight = 2 * CFQ_WEIGHT_LEGACY_DFL; + cfqd->root_group->leaf_weight = 2 * CFQ_WEIGHT_LEGACY_DFL; /* * Not strictly needed (since RB_ROOT just clears the node and we -- cgit v1.2.3 From 69d7fde5909b614114343974cfc52cb8ff30b544 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:36 -0700 Subject: blkcg: use CGROUP_WEIGHT_* scale for io.weight on the unified hierarchy cgroup is trying to make the interface consistent across different controllers. For weight-based resource control, the knob should have the range [1, 10000] and default to 100. This patch updates cfq-iosched so that the weight range conforms. The internal calculations have enough range and the widening of the weight range shouldn't cause any problem. * blkcg_policy->cpd_bind_fn() is added. If present, this is invoked when blkcg is attached to a hierarchy. * cfq_cpd_init() is updated to use the new default value on the unified hierarchy. * cfq_cpd_bind() callback is implemented to clear per-blkg configs and apply the default config matching the hierarchy type. * cfqd->root_group->[leaf_]weight initialization in cfq_init_queue() is moved into the !CONFIG_CFQ_GROUP_IOSCHED block. cfq_cpd_bind() is now responsible for setting the initial weights when blkcg is enabled. 
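For reference, the resulting bounds selection can be sketched in a few lines of user-space C (constant values taken from the text above, not from the kernel headers):

#include <stdbool.h>
#include <stdio.h>

/* as stated above: [1, 10000] on the default hierarchy,
 * [10, 1000] on the legacy one */
#define CGROUP_WEIGHT_MIN	1
#define CGROUP_WEIGHT_MAX	10000
#define CFQ_WEIGHT_LEGACY_MIN	10
#define CFQ_WEIGHT_LEGACY_MAX	1000

static bool weight_in_range(unsigned long long val, bool on_dfl)
{
	unsigned int min = on_dfl ? CGROUP_WEIGHT_MIN : CFQ_WEIGHT_LEGACY_MIN;
	unsigned int max = on_dfl ? CGROUP_WEIGHT_MAX : CFQ_WEIGHT_LEGACY_MAX;

	return val >= min && val <= max;
}

int main(void)
{
	/* 5000 is a valid io.weight but out of range for blkio.weight */
	printf("dfl: %d legacy: %d\n",
	       weight_in_range(5000, true), weight_in_range(5000, false));
	return 0;
}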
Signed-off-by: Tejun Heo Cc: Vivek Goyal Cc: Arianna Avanzini Signed-off-by: Jens Axboe --- Documentation/cgroups/unified-hierarchy.txt | 2 +- block/blk-cgroup.c | 21 +++++++++++ block/cfq-iosched.c | 55 +++++++++++++++++++++-------- include/linux/blk-cgroup.h | 2 ++ 4 files changed, 64 insertions(+), 16 deletions(-) (limited to 'block') diff --git a/Documentation/cgroups/unified-hierarchy.txt b/Documentation/cgroups/unified-hierarchy.txt index bd1ce15d5178..e0975c2cf03d 100644 --- a/Documentation/cgroups/unified-hierarchy.txt +++ b/Documentation/cgroups/unified-hierarchy.txt @@ -464,7 +464,7 @@ may be specified in any order and not all pairs have to be specified. The weight setting, currently only available and effective if cfq-iosched is in use for the target device. The weight is - between 10 and 1000 and defaults to 500. The first line + between 1 and 10000 and defaults to 100. The first line always contains the default weight in the following format to use when per-device setting is missing. diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 88bdb73bd5e0..ac8370cb2515 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1143,11 +1143,32 @@ static int blkcg_can_attach(struct cgroup_subsys_state *css, return ret; } +static void blkcg_bind(struct cgroup_subsys_state *root_css) +{ + int i; + + mutex_lock(&blkcg_pol_mutex); + + for (i = 0; i < BLKCG_MAX_POLS; i++) { + struct blkcg_policy *pol = blkcg_policy[i]; + struct blkcg *blkcg; + + if (!pol || !pol->cpd_bind_fn) + continue; + + list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) + if (blkcg->cpd[pol->plid]) + pol->cpd_bind_fn(blkcg->cpd[pol->plid]); + } + mutex_unlock(&blkcg_pol_mutex); +} + struct cgroup_subsys io_cgrp_subsys = { .css_alloc = blkcg_css_alloc, .css_offline = blkcg_css_offline, .css_free = blkcg_css_free, .can_attach = blkcg_can_attach, + .bind = blkcg_bind, .dfl_cftypes = blkcg_files, .legacy_cftypes = blkcg_legacy_files, .legacy_name = "blkio", diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 0fe721eaff28..04de88463a98 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1522,6 +1522,9 @@ static void cfq_init_cfqg_base(struct cfq_group *cfqg) } #ifdef CONFIG_CFQ_GROUP_IOSCHED +static int __cfq_set_weight(struct cgroup_subsys_state *css, u64 val, + bool on_dfl, bool reset_dev, bool is_leaf_weight); + static void cfqg_stats_exit(struct cfqg_stats *stats) { blkg_rwstat_exit(&stats->merged); @@ -1578,14 +1581,14 @@ static struct blkcg_policy_data *cfq_cpd_alloc(gfp_t gfp) static void cfq_cpd_init(struct blkcg_policy_data *cpd) { struct cfq_group_data *cgd = cpd_to_cfqgd(cpd); + unsigned int weight = cgroup_on_dfl(blkcg_root.css.cgroup) ? + CGROUP_WEIGHT_DFL : CFQ_WEIGHT_LEGACY_DFL; - if (cpd_to_blkcg(cpd) == &blkcg_root) { - cgd->weight = 2 * CFQ_WEIGHT_LEGACY_DFL; - cgd->leaf_weight = 2 * CFQ_WEIGHT_LEGACY_DFL; - } else { - cgd->weight = CFQ_WEIGHT_LEGACY_DFL; - cgd->leaf_weight = CFQ_WEIGHT_LEGACY_DFL; - } + if (cpd_to_blkcg(cpd) == &blkcg_root) + weight *= 2; + + cgd->weight = weight; + cgd->leaf_weight = weight; } static void cfq_cpd_free(struct blkcg_policy_data *cpd) @@ -1593,6 +1596,19 @@ static void cfq_cpd_free(struct blkcg_policy_data *cpd) kfree(cpd_to_cfqgd(cpd)); } +static void cfq_cpd_bind(struct blkcg_policy_data *cpd) +{ + struct blkcg *blkcg = cpd_to_blkcg(cpd); + bool on_dfl = cgroup_on_dfl(blkcg_root.css.cgroup); + unsigned int weight = on_dfl ? 
CGROUP_WEIGHT_DFL : CFQ_WEIGHT_LEGACY_DFL; + + if (blkcg == &blkcg_root) + weight *= 2; + + WARN_ON_ONCE(__cfq_set_weight(&blkcg->css, weight, on_dfl, true, false)); + WARN_ON_ONCE(__cfq_set_weight(&blkcg->css, weight, on_dfl, true, true)); +} + static struct blkg_policy_data *cfq_pd_alloc(gfp_t gfp, int node) { struct cfq_group *cfqg; @@ -1742,6 +1758,8 @@ static ssize_t __cfqg_set_weight_device(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off, bool on_dfl, bool is_leaf_weight) { + unsigned int min = on_dfl ? CGROUP_WEIGHT_MIN : CFQ_WEIGHT_LEGACY_MIN; + unsigned int max = on_dfl ? CGROUP_WEIGHT_MAX : CFQ_WEIGHT_LEGACY_MAX; struct blkcg *blkcg = css_to_blkcg(of_css(of)); struct blkg_conf_ctx ctx; struct cfq_group *cfqg; @@ -1769,7 +1787,7 @@ static ssize_t __cfqg_set_weight_device(struct kernfs_open_file *of, cfqgd = blkcg_to_cfqgd(blkcg); ret = -ERANGE; - if (!v || (v >= CFQ_WEIGHT_LEGACY_MIN && v <= CFQ_WEIGHT_LEGACY_MAX)) { + if (!v || (v >= min && v <= max)) { if (!is_leaf_weight) { cfqg->dev_weight = v; cfqg->new_weight = v ?: cfqgd->weight; @@ -1797,15 +1815,17 @@ static ssize_t cfqg_set_leaf_weight_device(struct kernfs_open_file *of, } static int __cfq_set_weight(struct cgroup_subsys_state *css, u64 val, - bool is_leaf_weight) + bool on_dfl, bool reset_dev, bool is_leaf_weight) { + unsigned int min = on_dfl ? CGROUP_WEIGHT_MIN : CFQ_WEIGHT_LEGACY_MIN; + unsigned int max = on_dfl ? CGROUP_WEIGHT_MAX : CFQ_WEIGHT_LEGACY_MAX; struct blkcg *blkcg = css_to_blkcg(css); struct blkcg_gq *blkg; struct cfq_group_data *cfqgd; int ret = 0; - if (val < CFQ_WEIGHT_LEGACY_MIN || val > CFQ_WEIGHT_LEGACY_MAX) - return -EINVAL; + if (val < min || val > max) + return -ERANGE; spin_lock_irq(&blkcg->lock); cfqgd = blkcg_to_cfqgd(blkcg); @@ -1826,9 +1846,13 @@ static int __cfq_set_weight(struct cgroup_subsys_state *css, u64 val, continue; if (!is_leaf_weight) { + if (reset_dev) + cfqg->dev_weight = 0; if (!cfqg->dev_weight) cfqg->new_weight = cfqgd->weight; } else { + if (reset_dev) + cfqg->dev_leaf_weight = 0; if (!cfqg->dev_leaf_weight) cfqg->new_leaf_weight = cfqgd->leaf_weight; } @@ -1842,13 +1866,13 @@ out: static int cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft, u64 val) { - return __cfq_set_weight(css, val, false); + return __cfq_set_weight(css, val, false, false, false); } static int cfq_set_leaf_weight(struct cgroup_subsys_state *css, struct cftype *cft, u64 val) { - return __cfq_set_weight(css, val, true); + return __cfq_set_weight(css, val, false, false, true); } static int cfqg_print_stat(struct seq_file *sf, void *v) @@ -2135,7 +2159,7 @@ static ssize_t cfq_set_weight_on_dfl(struct kernfs_open_file *of, /* "WEIGHT" or "default WEIGHT" sets the default weight */ v = simple_strtoull(buf, &endp, 0); if (*endp == '\0' || sscanf(buf, "default %llu", &v) == 1) { - ret = __cfq_set_weight(of_css(of), v, false); + ret = __cfq_set_weight(of_css(of), v, true, false, false); return ret ?: nbytes; } @@ -4512,9 +4536,9 @@ static int cfq_init_queue(struct request_queue *q, struct elevator_type *e) goto out_free; cfq_init_cfqg_base(cfqd->root_group); -#endif cfqd->root_group->weight = 2 * CFQ_WEIGHT_LEGACY_DFL; cfqd->root_group->leaf_weight = 2 * CFQ_WEIGHT_LEGACY_DFL; +#endif /* * Not strictly needed (since RB_ROOT just clears the node and we @@ -4715,6 +4739,7 @@ static struct blkcg_policy blkcg_policy_cfq = { .cpd_alloc_fn = cfq_cpd_alloc, .cpd_init_fn = cfq_cpd_init, .cpd_free_fn = cfq_cpd_free, + .cpd_bind_fn = cfq_cpd_bind, .pd_alloc_fn = cfq_pd_alloc, 
.pd_init_fn = cfq_pd_init, diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 9a7c4bd45fff..0a5cc7a1109b 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -138,6 +138,7 @@ struct blkcg_gq { typedef struct blkcg_policy_data *(blkcg_pol_alloc_cpd_fn)(gfp_t gfp); typedef void (blkcg_pol_init_cpd_fn)(struct blkcg_policy_data *cpd); typedef void (blkcg_pol_free_cpd_fn)(struct blkcg_policy_data *cpd); +typedef void (blkcg_pol_bind_cpd_fn)(struct blkcg_policy_data *cpd); typedef struct blkg_policy_data *(blkcg_pol_alloc_pd_fn)(gfp_t gfp, int node); typedef void (blkcg_pol_init_pd_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_online_pd_fn)(struct blkg_policy_data *pd); @@ -155,6 +156,7 @@ struct blkcg_policy { blkcg_pol_alloc_cpd_fn *cpd_alloc_fn; blkcg_pol_init_cpd_fn *cpd_init_fn; blkcg_pol_free_cpd_fn *cpd_free_fn; + blkcg_pol_bind_cpd_fn *cpd_bind_fn; blkcg_pol_alloc_pd_fn *pd_alloc_fn; blkcg_pol_init_pd_fn *pd_init_fn; -- cgit v1.2.3
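To close out the series: cfq_set_weight_on_dfl() above accepts three input forms for io.weight. A user-space sketch of the same dispatch logic (illustrative only; the kernel operates on a strim()'ed buffer with simple_strtoull()):

#include <stdio.h>
#include <stdlib.h>

/* classify one already-trimmed line written to io.weight */
static void classify(const char *buf)
{
	char *endp;
	unsigned long long v;

	/* "WEIGHT" or "default WEIGHT" updates the default weight */
	v = strtoull(buf, &endp, 0);
	if (*endp == '\0' || sscanf(buf, "default %llu", &v) == 1) {
		printf("set default weight to %llu\n", v);
		return;
	}
	/* everything else falls through to "MAJ:MIN WEIGHT" handling */
	printf("per-device rule: \"%s\"\n", buf);
}

int main(void)
{
	classify("100");
	classify("default 250");
	classify("8:16 500");
	return 0;
}

The fall-through case reuses __cfqg_set_weight_device(), so writing "MAJ:MIN default" clears a per-device rule, as the "default" branch added earlier in the series shows.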