LKML Archive on lore.kernel.org
help / color / mirror / Atom feed
* [PATCH V2 0/7] loop: cleanup charging io to mem/blkcg
@ 2021-07-28  8:16 Ming Lei
  2021-07-28  8:16 ` [PATCH V2 1/7] mm: memcontrol: add helper of memcg_get_e_css Ming Lei
                   ` (6 more replies)
  0 siblings, 7 replies; 9+ messages in thread
From: Ming Lei @ 2021-07-28  8:16 UTC (permalink / raw)
  To: Jens Axboe
  Cc: linux-kernel, linux-block, Christoph Hellwig, Dan Schatzberg, Ming Lei

Hello Guys,

Cleanup charging io to mem/blkcg a bit:

- avoid to store blkcg_css/memcg_css in loop_cmd, and store blkcg_css in
loop_worker instead

- avoid to acquire ->lo_work_lock in IO path

- simplify blkcg_css query via xarray

- other misc cleanup

V2:
	- add helper of memcg_get_e_css
	- cleanup #ifdef
	- improve the last patch, as discussed with Dan Schatzberg


Ming Lei (7):
  mm: memcontrol: add helper of memcg_get_e_css
  loop: clean up blkcg association
  loop: conver timer for monitoring idle worker into dwork
  loop: add __loop_free_idle_workers() for covering freeing workers in
    clearing FD
  loop: improve loop_process_work
  loop: use xarray to store workers
  loop: don't add worker into idle list

 drivers/block/loop.c       | 320 ++++++++++++++++++++-----------------
 drivers/block/loop.h       |   7 +-
 include/linux/memcontrol.h |   8 +
 3 files changed, 188 insertions(+), 147 deletions(-)

-- 
2.31.1


^ permalink raw reply	[flat|nested] 9+ messages in thread

* [PATCH V2 1/7] mm: memcontrol: add helper of memcg_get_e_css
  2021-07-28  8:16 [PATCH V2 0/7] loop: cleanup charging io to mem/blkcg Ming Lei
@ 2021-07-28  8:16 ` Ming Lei
  2021-07-28  8:16 ` [PATCH V2 2/7] loop: clean up blkcg association Ming Lei
                   ` (5 subsequent siblings)
  6 siblings, 0 replies; 9+ messages in thread
From: Ming Lei @ 2021-07-28  8:16 UTC (permalink / raw)
  To: Jens Axboe
  Cc: linux-kernel, linux-block, Christoph Hellwig, Dan Schatzberg,
	Ming Lei, Andrew Morton

And helper of memcg_get_e_css() so that the consumer needn't to
call cgroup_get_e_css(cgroup, &memory_cgrp_subsys) directly, since
&memory_cgrp_subsys has to be used in case that MEMCG is enabled.

Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
 include/linux/memcontrol.h | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index bfe5c486f4ad..67c35793fdce 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1101,6 +1101,10 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
 						gfp_t gfp_mask,
 						unsigned long *total_scanned);
 
+static inline struct cgroup_subsys_state *memcg_get_e_css(struct cgroup *cgrp)
+{
+	return cgroup_get_e_css(cgrp, &memory_cgrp_subsys);
+}
 #else /* CONFIG_MEMCG */
 
 #define MEM_CGROUP_ID_SHIFT	0
@@ -1456,6 +1460,10 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
 {
 	return 0;
 }
+static inline struct cgroup_subsys_state *memcg_get_e_css(struct cgroup *cgrp)
+{
+	return NULL;
+}
 #endif /* CONFIG_MEMCG */
 
 static inline void __inc_lruvec_kmem_state(void *p, enum node_stat_item idx)
-- 
2.31.1


^ permalink raw reply	[flat|nested] 9+ messages in thread

* [PATCH V2 2/7] loop: clean up blkcg association
  2021-07-28  8:16 [PATCH V2 0/7] loop: cleanup charging io to mem/blkcg Ming Lei
  2021-07-28  8:16 ` [PATCH V2 1/7] mm: memcontrol: add helper of memcg_get_e_css Ming Lei
@ 2021-07-28  8:16 ` Ming Lei
  2021-07-28  8:16 ` [PATCH V2 3/7] loop: conver timer for monitoring idle worker into dwork Ming Lei
                   ` (4 subsequent siblings)
  6 siblings, 0 replies; 9+ messages in thread
From: Ming Lei @ 2021-07-28  8:16 UTC (permalink / raw)
  To: Jens Axboe
  Cc: linux-kernel, linux-block, Christoph Hellwig, Dan Schatzberg, Ming Lei

Each loop_worker is responsible for running requests originated from
same blkcg, so:

1) associate with kthread in the entry of loop_process_work(), and
disassociate in the end of this function, then we can avoid to do
both for each request.

2) remove ->blkcg_css and ->memcg_css from 'loop_cmd' since both are
per loop_worker.

Also kill #ifdef in the related functions.

Acked-by: Dan Schatzberg <schatzberg.dan@gmail.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
 drivers/block/loop.c | 81 +++++++++++++++++++++++---------------------
 drivers/block/loop.h |  2 --
 2 files changed, 43 insertions(+), 40 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index f37b9e3d833c..1756c17d3936 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -936,23 +936,46 @@ static inline int queue_on_root_worker(struct cgroup_subsys_state *css)
 {
 	return !css || css == blkcg_root_css;
 }
+static struct cgroup_subsys_state *loop_rq_blkcg_css(struct loop_cmd *cmd)
+{
+	struct request *rq = blk_mq_rq_from_pdu(cmd);
+	/* always use the first bio's css */
+	struct blkcg *blkcg = bio_blkcg(rq->bio);
+
+	if (blkcg)
+		return &blkcg->css;
+	return NULL;
+}
 #else
 static inline int queue_on_root_worker(struct cgroup_subsys_state *css)
 {
 	return !css;
 }
+static struct cgroup_subsys_state *loop_rq_blkcg_css(struct loop_cmd *cmd)
+{
+	return NULL;
+}
 #endif
 
+static struct cgroup_subsys_state *loop_rq_get_memcg_css(
+		struct cgroup_subsys_state *blkcg_css)
+{
+	if (blkcg_css)
+		return memcg_get_e_css(blkcg_css->cgroup);
+	return NULL;
+}
+
 static void loop_queue_work(struct loop_device *lo, struct loop_cmd *cmd)
 {
 	struct rb_node **node = &(lo->worker_tree.rb_node), *parent = NULL;
 	struct loop_worker *cur_worker, *worker = NULL;
 	struct work_struct *work;
 	struct list_head *cmd_list;
+	struct cgroup_subsys_state *blkcg_css = loop_rq_blkcg_css(cmd);
 
 	spin_lock_irq(&lo->lo_work_lock);
 
-	if (queue_on_root_worker(cmd->blkcg_css))
+	if (queue_on_root_worker(blkcg_css))
 		goto queue_work;
 
 	node = &lo->worker_tree.rb_node;
@@ -960,10 +983,10 @@ static void loop_queue_work(struct loop_device *lo, struct loop_cmd *cmd)
 	while (*node) {
 		parent = *node;
 		cur_worker = container_of(*node, struct loop_worker, rb_node);
-		if (cur_worker->blkcg_css == cmd->blkcg_css) {
+		if (cur_worker->blkcg_css == blkcg_css) {
 			worker = cur_worker;
 			break;
-		} else if ((long)cur_worker->blkcg_css < (long)cmd->blkcg_css) {
+		} else if ((long)cur_worker->blkcg_css < (long)blkcg_css) {
 			node = &(*node)->rb_left;
 		} else {
 			node = &(*node)->rb_right;
@@ -977,15 +1000,10 @@ static void loop_queue_work(struct loop_device *lo, struct loop_cmd *cmd)
 	 * In the event we cannot allocate a worker, just queue on the
 	 * rootcg worker and issue the I/O as the rootcg
 	 */
-	if (!worker) {
-		cmd->blkcg_css = NULL;
-		if (cmd->memcg_css)
-			css_put(cmd->memcg_css);
-		cmd->memcg_css = NULL;
+	if (!worker)
 		goto queue_work;
-	}
 
-	worker->blkcg_css = cmd->blkcg_css;
+	worker->blkcg_css = blkcg_css;
 	css_get(worker->blkcg_css);
 	INIT_WORK(&worker->work, loop_workfn);
 	INIT_LIST_HEAD(&worker->cmd_list);
@@ -2098,19 +2116,6 @@ static blk_status_t loop_queue_rq(struct blk_mq_hw_ctx *hctx,
 		break;
 	}
 
-	/* always use the first bio's css */
-	cmd->blkcg_css = NULL;
-	cmd->memcg_css = NULL;
-#ifdef CONFIG_BLK_CGROUP
-	if (rq->bio && rq->bio->bi_blkg) {
-		cmd->blkcg_css = &bio_blkcg(rq->bio)->css;
-#ifdef CONFIG_MEMCG
-		cmd->memcg_css =
-			cgroup_get_e_css(cmd->blkcg_css->cgroup,
-					&memory_cgrp_subsys);
-#endif
-	}
-#endif
 	loop_queue_work(lo, cmd);
 
 	return BLK_STS_OK;
@@ -2122,28 +2127,14 @@ static void loop_handle_cmd(struct loop_cmd *cmd)
 	const bool write = op_is_write(req_op(rq));
 	struct loop_device *lo = rq->q->queuedata;
 	int ret = 0;
-	struct mem_cgroup *old_memcg = NULL;
 
 	if (write && (lo->lo_flags & LO_FLAGS_READ_ONLY)) {
 		ret = -EIO;
 		goto failed;
 	}
 
-	if (cmd->blkcg_css)
-		kthread_associate_blkcg(cmd->blkcg_css);
-	if (cmd->memcg_css)
-		old_memcg = set_active_memcg(
-			mem_cgroup_from_css(cmd->memcg_css));
-
 	ret = do_req_filebacked(lo, rq);
 
-	if (cmd->blkcg_css)
-		kthread_associate_blkcg(NULL);
-
-	if (cmd->memcg_css) {
-		set_active_memcg(old_memcg);
-		css_put(cmd->memcg_css);
-	}
  failed:
 	/* complete non-aio request */
 	if (!cmd->use_aio || ret) {
@@ -2199,7 +2190,21 @@ static void loop_workfn(struct work_struct *work)
 {
 	struct loop_worker *worker =
 		container_of(work, struct loop_worker, work);
-	loop_process_work(worker, &worker->cmd_list, worker->lo);
+	struct mem_cgroup *old_memcg = NULL;
+	struct cgroup_subsys_state *memcg_css = NULL;
+
+	kthread_associate_blkcg(worker->blkcg_css);
+	memcg_css = loop_rq_get_memcg_css(worker->blkcg_css);
+	if (memcg_css) {
+		old_memcg = set_active_memcg(
+				mem_cgroup_from_css(memcg_css));
+		loop_process_work(worker, &worker->cmd_list, worker->lo);
+		set_active_memcg(old_memcg);
+		css_put(memcg_css);
+	} else {
+		loop_process_work(worker, &worker->cmd_list, worker->lo);
+	}
+	kthread_associate_blkcg(NULL);
 }
 
 static void loop_rootcg_workfn(struct work_struct *work)
diff --git a/drivers/block/loop.h b/drivers/block/loop.h
index 1988899db63a..a52a3fd89457 100644
--- a/drivers/block/loop.h
+++ b/drivers/block/loop.h
@@ -77,8 +77,6 @@ struct loop_cmd {
 	long ret;
 	struct kiocb iocb;
 	struct bio_vec *bvec;
-	struct cgroup_subsys_state *blkcg_css;
-	struct cgroup_subsys_state *memcg_css;
 };
 
 /* Support for loadable transfer modules */
-- 
2.31.1


^ permalink raw reply	[flat|nested] 9+ messages in thread

* [PATCH V2 3/7] loop: conver timer for monitoring idle worker into dwork
  2021-07-28  8:16 [PATCH V2 0/7] loop: cleanup charging io to mem/blkcg Ming Lei
  2021-07-28  8:16 ` [PATCH V2 1/7] mm: memcontrol: add helper of memcg_get_e_css Ming Lei
  2021-07-28  8:16 ` [PATCH V2 2/7] loop: clean up blkcg association Ming Lei
@ 2021-07-28  8:16 ` Ming Lei
  2021-07-28  8:16 ` [PATCH V2 4/7] loop: add __loop_free_idle_workers() for covering freeing workers in clearing FD Ming Lei
                   ` (3 subsequent siblings)
  6 siblings, 0 replies; 9+ messages in thread
From: Ming Lei @ 2021-07-28  8:16 UTC (permalink / raw)
  To: Jens Axboe
  Cc: linux-kernel, linux-block, Christoph Hellwig, Dan Schatzberg, Ming Lei

Not necessary to use a timer to do that, and dwork is just fine,
then we don't need to always disable interrupt when acquiring
->loop_work_lock.

Acked-by: Dan Schatzberg <schatzberg.dan@gmail.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
 drivers/block/loop.c | 75 ++++++++++++++++++++++----------------------
 drivers/block/loop.h |  2 +-
 2 files changed, 38 insertions(+), 39 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 1756c17d3936..f63bd75e80ba 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -929,7 +929,6 @@ struct loop_worker {
 
 static void loop_workfn(struct work_struct *work);
 static void loop_rootcg_workfn(struct work_struct *work);
-static void loop_free_idle_workers(struct timer_list *timer);
 
 #ifdef CONFIG_BLK_CGROUP
 static inline int queue_on_root_worker(struct cgroup_subsys_state *css)
@@ -973,7 +972,7 @@ static void loop_queue_work(struct loop_device *lo, struct loop_cmd *cmd)
 	struct list_head *cmd_list;
 	struct cgroup_subsys_state *blkcg_css = loop_rq_blkcg_css(cmd);
 
-	spin_lock_irq(&lo->lo_work_lock);
+	spin_lock(&lo->lo_work_lock);
 
 	if (queue_on_root_worker(blkcg_css))
 		goto queue_work;
@@ -1028,7 +1027,7 @@ static void loop_queue_work(struct loop_device *lo, struct loop_cmd *cmd)
 	}
 	list_add_tail(&cmd->list_entry, cmd_list);
 	queue_work(lo->workqueue, work);
-	spin_unlock_irq(&lo->lo_work_lock);
+	spin_unlock(&lo->lo_work_lock);
 }
 
 static void loop_update_rotational(struct loop_device *lo)
@@ -1150,6 +1149,33 @@ loop_set_status_from_info(struct loop_device *lo,
 	return 0;
 }
 
+static void loop_set_timer(struct loop_device *lo)
+{
+	schedule_delayed_work(&lo->idle_work, LOOP_IDLE_WORKER_TIMEOUT);
+}
+
+static void loop_free_idle_workers(struct work_struct *work)
+{
+	struct loop_device *lo = container_of(work, struct loop_device,
+			idle_work.work);
+	struct loop_worker *pos, *worker;
+
+	spin_lock(&lo->lo_work_lock);
+	list_for_each_entry_safe(worker, pos, &lo->idle_worker_list,
+				idle_list) {
+		if (time_is_after_jiffies(worker->last_ran_at +
+						LOOP_IDLE_WORKER_TIMEOUT))
+			break;
+		list_del(&worker->idle_list);
+		rb_erase(&worker->rb_node, &lo->worker_tree);
+		css_put(worker->blkcg_css);
+		kfree(worker);
+	}
+	if (!list_empty(&lo->idle_worker_list))
+		loop_set_timer(lo);
+	spin_unlock(&lo->lo_work_lock);
+}
+
 static int loop_configure(struct loop_device *lo, fmode_t mode,
 			  struct block_device *bdev,
 			  const struct loop_config *config)
@@ -1229,8 +1255,7 @@ static int loop_configure(struct loop_device *lo, fmode_t mode,
 	INIT_LIST_HEAD(&lo->rootcg_cmd_list);
 	INIT_LIST_HEAD(&lo->idle_worker_list);
 	lo->worker_tree = RB_ROOT;
-	timer_setup(&lo->timer, loop_free_idle_workers,
-		TIMER_DEFERRABLE);
+	INIT_DELAYED_WORK(&lo->idle_work, loop_free_idle_workers);
 	lo->use_dio = lo->lo_flags & LO_FLAGS_DIRECT_IO;
 	lo->lo_device = bdev;
 	lo->lo_backing_file = file;
@@ -1320,7 +1345,7 @@ static int __loop_clr_fd(struct loop_device *lo, bool release)
 	blk_mq_freeze_queue(lo->lo_queue);
 
 	destroy_workqueue(lo->workqueue);
-	spin_lock_irq(&lo->lo_work_lock);
+	spin_lock(&lo->lo_work_lock);
 	list_for_each_entry_safe(worker, pos, &lo->idle_worker_list,
 				idle_list) {
 		list_del(&worker->idle_list);
@@ -1328,8 +1353,8 @@ static int __loop_clr_fd(struct loop_device *lo, bool release)
 		css_put(worker->blkcg_css);
 		kfree(worker);
 	}
-	spin_unlock_irq(&lo->lo_work_lock);
-	del_timer_sync(&lo->timer);
+	spin_unlock(&lo->lo_work_lock);
+	cancel_delayed_work_sync(&lo->idle_work);
 
 	spin_lock_irq(&lo->lo_lock);
 	lo->lo_backing_file = NULL;
@@ -2147,11 +2172,6 @@ static void loop_handle_cmd(struct loop_cmd *cmd)
 	}
 }
 
-static void loop_set_timer(struct loop_device *lo)
-{
-	timer_reduce(&lo->timer, jiffies + LOOP_IDLE_WORKER_TIMEOUT);
-}
-
 static void loop_process_work(struct loop_worker *worker,
 			struct list_head *cmd_list, struct loop_device *lo)
 {
@@ -2159,17 +2179,17 @@ static void loop_process_work(struct loop_worker *worker,
 	struct loop_cmd *cmd;
 
 	current->flags |= PF_LOCAL_THROTTLE | PF_MEMALLOC_NOIO;
-	spin_lock_irq(&lo->lo_work_lock);
+	spin_lock(&lo->lo_work_lock);
 	while (!list_empty(cmd_list)) {
 		cmd = container_of(
 			cmd_list->next, struct loop_cmd, list_entry);
 		list_del(cmd_list->next);
-		spin_unlock_irq(&lo->lo_work_lock);
+		spin_unlock(&lo->lo_work_lock);
 
 		loop_handle_cmd(cmd);
 		cond_resched();
 
-		spin_lock_irq(&lo->lo_work_lock);
+		spin_lock(&lo->lo_work_lock);
 	}
 
 	/*
@@ -2182,7 +2202,7 @@ static void loop_process_work(struct loop_worker *worker,
 		list_add_tail(&worker->idle_list, &lo->idle_worker_list);
 		loop_set_timer(lo);
 	}
-	spin_unlock_irq(&lo->lo_work_lock);
+	spin_unlock(&lo->lo_work_lock);
 	current->flags = orig_flags;
 }
 
@@ -2214,27 +2234,6 @@ static void loop_rootcg_workfn(struct work_struct *work)
 	loop_process_work(NULL, &lo->rootcg_cmd_list, lo);
 }
 
-static void loop_free_idle_workers(struct timer_list *timer)
-{
-	struct loop_device *lo = container_of(timer, struct loop_device, timer);
-	struct loop_worker *pos, *worker;
-
-	spin_lock_irq(&lo->lo_work_lock);
-	list_for_each_entry_safe(worker, pos, &lo->idle_worker_list,
-				idle_list) {
-		if (time_is_after_jiffies(worker->last_ran_at +
-						LOOP_IDLE_WORKER_TIMEOUT))
-			break;
-		list_del(&worker->idle_list);
-		rb_erase(&worker->rb_node, &lo->worker_tree);
-		css_put(worker->blkcg_css);
-		kfree(worker);
-	}
-	if (!list_empty(&lo->idle_worker_list))
-		loop_set_timer(lo);
-	spin_unlock_irq(&lo->lo_work_lock);
-}
-
 static const struct blk_mq_ops loop_mq_ops = {
 	.queue_rq       = loop_queue_rq,
 	.complete	= lo_complete_rq,
diff --git a/drivers/block/loop.h b/drivers/block/loop.h
index a52a3fd89457..9df889af1bcf 100644
--- a/drivers/block/loop.h
+++ b/drivers/block/loop.h
@@ -60,7 +60,7 @@ struct loop_device {
 	struct list_head        rootcg_cmd_list;
 	struct list_head        idle_worker_list;
 	struct rb_root          worker_tree;
-	struct timer_list       timer;
+	struct delayed_work	idle_work;
 	bool			use_dio;
 	bool			sysfs_inited;
 
-- 
2.31.1


^ permalink raw reply	[flat|nested] 9+ messages in thread

* [PATCH V2 4/7] loop: add __loop_free_idle_workers() for covering freeing workers in clearing FD
  2021-07-28  8:16 [PATCH V2 0/7] loop: cleanup charging io to mem/blkcg Ming Lei
                   ` (2 preceding siblings ...)
  2021-07-28  8:16 ` [PATCH V2 3/7] loop: conver timer for monitoring idle worker into dwork Ming Lei
@ 2021-07-28  8:16 ` Ming Lei
  2021-07-28  8:16 ` [PATCH V2 5/7] loop: improve loop_process_work Ming Lei
                   ` (2 subsequent siblings)
  6 siblings, 0 replies; 9+ messages in thread
From: Ming Lei @ 2021-07-28  8:16 UTC (permalink / raw)
  To: Jens Axboe
  Cc: linux-kernel, linux-block, Christoph Hellwig, Dan Schatzberg, Ming Lei

Add the helper, so we can remove the duplicated code for freeing all
workers in clearing FD.

Acked-by: Dan Schatzberg <schatzberg.dan@gmail.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
 drivers/block/loop.c | 25 +++++++++++--------------
 1 file changed, 11 insertions(+), 14 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index f63bd75e80ba..4c524c72d976 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1154,16 +1154,14 @@ static void loop_set_timer(struct loop_device *lo)
 	schedule_delayed_work(&lo->idle_work, LOOP_IDLE_WORKER_TIMEOUT);
 }
 
-static void loop_free_idle_workers(struct work_struct *work)
+static void __loop_free_idle_workers(struct loop_device *lo, bool force)
 {
-	struct loop_device *lo = container_of(work, struct loop_device,
-			idle_work.work);
 	struct loop_worker *pos, *worker;
 
 	spin_lock(&lo->lo_work_lock);
 	list_for_each_entry_safe(worker, pos, &lo->idle_worker_list,
 				idle_list) {
-		if (time_is_after_jiffies(worker->last_ran_at +
+		if (!force && time_is_after_jiffies(worker->last_ran_at +
 						LOOP_IDLE_WORKER_TIMEOUT))
 			break;
 		list_del(&worker->idle_list);
@@ -1176,6 +1174,14 @@ static void loop_free_idle_workers(struct work_struct *work)
 	spin_unlock(&lo->lo_work_lock);
 }
 
+static void loop_free_idle_workers(struct work_struct *work)
+{
+	struct loop_device *lo = container_of(work, struct loop_device,
+			idle_work.work);
+
+	__loop_free_idle_workers(lo, false);
+}
+
 static int loop_configure(struct loop_device *lo, fmode_t mode,
 			  struct block_device *bdev,
 			  const struct loop_config *config)
@@ -1324,7 +1330,6 @@ static int __loop_clr_fd(struct loop_device *lo, bool release)
 	int err = 0;
 	bool partscan = false;
 	int lo_number;
-	struct loop_worker *pos, *worker;
 
 	mutex_lock(&lo->lo_mutex);
 	if (WARN_ON_ONCE(lo->lo_state != Lo_rundown)) {
@@ -1345,15 +1350,7 @@ static int __loop_clr_fd(struct loop_device *lo, bool release)
 	blk_mq_freeze_queue(lo->lo_queue);
 
 	destroy_workqueue(lo->workqueue);
-	spin_lock(&lo->lo_work_lock);
-	list_for_each_entry_safe(worker, pos, &lo->idle_worker_list,
-				idle_list) {
-		list_del(&worker->idle_list);
-		rb_erase(&worker->rb_node, &lo->worker_tree);
-		css_put(worker->blkcg_css);
-		kfree(worker);
-	}
-	spin_unlock(&lo->lo_work_lock);
+	__loop_free_idle_workers(lo, true);
 	cancel_delayed_work_sync(&lo->idle_work);
 
 	spin_lock_irq(&lo->lo_lock);
-- 
2.31.1


^ permalink raw reply	[flat|nested] 9+ messages in thread

* [PATCH V2 5/7] loop: improve loop_process_work
  2021-07-28  8:16 [PATCH V2 0/7] loop: cleanup charging io to mem/blkcg Ming Lei
                   ` (3 preceding siblings ...)
  2021-07-28  8:16 ` [PATCH V2 4/7] loop: add __loop_free_idle_workers() for covering freeing workers in clearing FD Ming Lei
@ 2021-07-28  8:16 ` Ming Lei
  2021-07-28  8:16 ` [PATCH V2 6/7] loop: use xarray to store workers Ming Lei
  2021-07-28  8:16 ` [PATCH V2 7/7] loop: don't add worker into idle list Ming Lei
  6 siblings, 0 replies; 9+ messages in thread
From: Ming Lei @ 2021-07-28  8:16 UTC (permalink / raw)
  To: Jens Axboe
  Cc: linux-kernel, linux-block, Christoph Hellwig, Dan Schatzberg, Ming Lei

Avoid to acquire the spinlock for handling every loop command, and hold
lock once for taking all commands.

Acked-by: Dan Schatzberg <schatzberg.dan@gmail.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
 drivers/block/loop.c | 21 +++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 4c524c72d976..74682c0f3f86 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -2174,21 +2174,26 @@ static void loop_process_work(struct loop_worker *worker,
 {
 	int orig_flags = current->flags;
 	struct loop_cmd *cmd;
+	LIST_HEAD(list);
 
 	current->flags |= PF_LOCAL_THROTTLE | PF_MEMALLOC_NOIO;
+
 	spin_lock(&lo->lo_work_lock);
-	while (!list_empty(cmd_list)) {
-		cmd = container_of(
-			cmd_list->next, struct loop_cmd, list_entry);
-		list_del(cmd_list->next);
-		spin_unlock(&lo->lo_work_lock);
+ again:
+	list_splice_init(cmd_list, &list);
+	spin_unlock(&lo->lo_work_lock);
 
-		loop_handle_cmd(cmd);
-		cond_resched();
+	while (!list_empty(&list)) {
+		cmd = list_first_entry(&list, struct loop_cmd, list_entry);
+		list_del_init(&cmd->list_entry);
 
-		spin_lock(&lo->lo_work_lock);
+		loop_handle_cmd(cmd);
 	}
 
+	spin_lock(&lo->lo_work_lock);
+	if (!list_empty(cmd_list))
+		goto again;
+
 	/*
 	 * We only add to the idle list if there are no pending cmds
 	 * *and* the worker will not run again which ensures that it
-- 
2.31.1


^ permalink raw reply	[flat|nested] 9+ messages in thread

* [PATCH V2 6/7] loop: use xarray to store workers
  2021-07-28  8:16 [PATCH V2 0/7] loop: cleanup charging io to mem/blkcg Ming Lei
                   ` (4 preceding siblings ...)
  2021-07-28  8:16 ` [PATCH V2 5/7] loop: improve loop_process_work Ming Lei
@ 2021-07-28  8:16 ` Ming Lei
  2021-07-28  8:16 ` [PATCH V2 7/7] loop: don't add worker into idle list Ming Lei
  6 siblings, 0 replies; 9+ messages in thread
From: Ming Lei @ 2021-07-28  8:16 UTC (permalink / raw)
  To: Jens Axboe
  Cc: linux-kernel, linux-block, Christoph Hellwig, Dan Schatzberg, Ming Lei

css->id is unique in io controller wide, so replace rbtree with xarray
for querying/storing 'blkcg_css' by using css->id as key, then code is
simplified a lot.

Acked-by: Dan Schatzberg <schatzberg.dan@gmail.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
 drivers/block/loop.c | 36 ++++++++++++++----------------------
 drivers/block/loop.h |  3 ++-
 2 files changed, 16 insertions(+), 23 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 74682c0f3f86..9b22e5469ed3 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -918,7 +918,6 @@ static void loop_config_discard(struct loop_device *lo)
 }
 
 struct loop_worker {
-	struct rb_node rb_node;
 	struct work_struct work;
 	struct list_head cmd_list;
 	struct list_head idle_list;
@@ -966,35 +965,23 @@ static struct cgroup_subsys_state *loop_rq_get_memcg_css(
 
 static void loop_queue_work(struct loop_device *lo, struct loop_cmd *cmd)
 {
-	struct rb_node **node = &(lo->worker_tree.rb_node), *parent = NULL;
-	struct loop_worker *cur_worker, *worker = NULL;
+	struct loop_worker *worker = NULL;
 	struct work_struct *work;
 	struct list_head *cmd_list;
 	struct cgroup_subsys_state *blkcg_css = loop_rq_blkcg_css(cmd);
+	gfp_t gfp = GFP_NOWAIT | __GFP_NOWARN;
 
 	spin_lock(&lo->lo_work_lock);
 
 	if (queue_on_root_worker(blkcg_css))
 		goto queue_work;
 
-	node = &lo->worker_tree.rb_node;
-
-	while (*node) {
-		parent = *node;
-		cur_worker = container_of(*node, struct loop_worker, rb_node);
-		if (cur_worker->blkcg_css == blkcg_css) {
-			worker = cur_worker;
-			break;
-		} else if ((long)cur_worker->blkcg_css < (long)blkcg_css) {
-			node = &(*node)->rb_left;
-		} else {
-			node = &(*node)->rb_right;
-		}
-	}
+	/* css->id is unique in each cgroup subsystem */
+	worker = xa_load(&lo->workers, blkcg_css->id);
 	if (worker)
 		goto queue_work;
 
-	worker = kzalloc(sizeof(struct loop_worker), GFP_NOWAIT | __GFP_NOWARN);
+	worker = kzalloc(sizeof(*worker), gfp);
 	/*
 	 * In the event we cannot allocate a worker, just queue on the
 	 * rootcg worker and issue the I/O as the rootcg
@@ -1008,8 +995,12 @@ static void loop_queue_work(struct loop_device *lo, struct loop_cmd *cmd)
 	INIT_LIST_HEAD(&worker->cmd_list);
 	INIT_LIST_HEAD(&worker->idle_list);
 	worker->lo = lo;
-	rb_link_node(&worker->rb_node, parent, node);
-	rb_insert_color(&worker->rb_node, &lo->worker_tree);
+
+	if (xa_err(xa_store(&lo->workers, blkcg_css->id, worker, gfp))) {
+		kfree(worker);
+		worker = NULL;
+	}
+
 queue_work:
 	if (worker) {
 		/*
@@ -1165,7 +1156,7 @@ static void __loop_free_idle_workers(struct loop_device *lo, bool force)
 						LOOP_IDLE_WORKER_TIMEOUT))
 			break;
 		list_del(&worker->idle_list);
-		rb_erase(&worker->rb_node, &lo->worker_tree);
+		xa_erase(&lo->workers, worker->blkcg_css->id);
 		css_put(worker->blkcg_css);
 		kfree(worker);
 	}
@@ -1260,7 +1251,7 @@ static int loop_configure(struct loop_device *lo, fmode_t mode,
 	INIT_WORK(&lo->rootcg_work, loop_rootcg_workfn);
 	INIT_LIST_HEAD(&lo->rootcg_cmd_list);
 	INIT_LIST_HEAD(&lo->idle_worker_list);
-	lo->worker_tree = RB_ROOT;
+	xa_init(&lo->workers);
 	INIT_DELAYED_WORK(&lo->idle_work, loop_free_idle_workers);
 	lo->use_dio = lo->lo_flags & LO_FLAGS_DIRECT_IO;
 	lo->lo_device = bdev;
@@ -1352,6 +1343,7 @@ static int __loop_clr_fd(struct loop_device *lo, bool release)
 	destroy_workqueue(lo->workqueue);
 	__loop_free_idle_workers(lo, true);
 	cancel_delayed_work_sync(&lo->idle_work);
+	xa_destroy(&lo->workers);
 
 	spin_lock_irq(&lo->lo_lock);
 	lo->lo_backing_file = NULL;
diff --git a/drivers/block/loop.h b/drivers/block/loop.h
index 9df889af1bcf..cab34da1e1bb 100644
--- a/drivers/block/loop.h
+++ b/drivers/block/loop.h
@@ -14,6 +14,7 @@
 #include <linux/blk-mq.h>
 #include <linux/spinlock.h>
 #include <linux/mutex.h>
+#include <linux/xarray.h>
 #include <uapi/linux/loop.h>
 
 /* Possible states of device */
@@ -59,7 +60,7 @@ struct loop_device {
 	struct work_struct      rootcg_work;
 	struct list_head        rootcg_cmd_list;
 	struct list_head        idle_worker_list;
-	struct rb_root          worker_tree;
+	struct xarray		workers;
 	struct delayed_work	idle_work;
 	bool			use_dio;
 	bool			sysfs_inited;
-- 
2.31.1


^ permalink raw reply	[flat|nested] 9+ messages in thread

* [PATCH V2 7/7] loop: don't add worker into idle list
  2021-07-28  8:16 [PATCH V2 0/7] loop: cleanup charging io to mem/blkcg Ming Lei
                   ` (5 preceding siblings ...)
  2021-07-28  8:16 ` [PATCH V2 6/7] loop: use xarray to store workers Ming Lei
@ 2021-07-28  8:16 ` Ming Lei
  2021-07-30 16:52   ` Dan Schatzberg
  6 siblings, 1 reply; 9+ messages in thread
From: Ming Lei @ 2021-07-28  8:16 UTC (permalink / raw)
  To: Jens Axboe
  Cc: linux-kernel, linux-block, Christoph Hellwig, Dan Schatzberg, Ming Lei

We can retrieve any workers via xarray, so not add it into idle list.
Meantime reduce .lo_work_lock coverage, especially we don't need that
in IO path except for adding/deleting worker into xarray.

Also replace .last_ran_at with .reclaim_time, which is set when adding
loop command into worker->cmd_list. Meantime reclaim the worker when
the worker is expired and no any pending commands.

Cc: Dan Schatzberg <schatzberg.dan@gmail.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
 drivers/block/loop.c | 172 ++++++++++++++++++++++++++-----------------
 1 file changed, 104 insertions(+), 68 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 9b22e5469ed3..e4b292b2d8f4 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -920,10 +920,11 @@ static void loop_config_discard(struct loop_device *lo)
 struct loop_worker {
 	struct work_struct work;
 	struct list_head cmd_list;
-	struct list_head idle_list;
 	struct loop_device *lo;
 	struct cgroup_subsys_state *blkcg_css;
-	unsigned long last_ran_at;
+	unsigned long reclaim_time;
+	spinlock_t lock;
+	refcount_t refcnt;
 };
 
 static void loop_workfn(struct work_struct *work);
@@ -963,62 +964,93 @@ static struct cgroup_subsys_state *loop_rq_get_memcg_css(
 	return NULL;
 }
 
-static void loop_queue_work(struct loop_device *lo, struct loop_cmd *cmd)
+static struct loop_worker *loop_alloc_or_get_worker(struct loop_device *lo,
+		struct cgroup_subsys_state *blkcg_css)
 {
-	struct loop_worker *worker = NULL;
-	struct work_struct *work;
-	struct list_head *cmd_list;
-	struct cgroup_subsys_state *blkcg_css = loop_rq_blkcg_css(cmd);
 	gfp_t gfp = GFP_NOWAIT | __GFP_NOWARN;
+	struct loop_worker *worker = kzalloc(sizeof(*worker), gfp);
+	struct loop_worker *worker_old;
 
-	spin_lock(&lo->lo_work_lock);
-
-	if (queue_on_root_worker(blkcg_css))
-		goto queue_work;
-
-	/* css->id is unique in each cgroup subsystem */
-	worker = xa_load(&lo->workers, blkcg_css->id);
-	if (worker)
-		goto queue_work;
-
-	worker = kzalloc(sizeof(*worker), gfp);
-	/*
-	 * In the event we cannot allocate a worker, just queue on the
-	 * rootcg worker and issue the I/O as the rootcg
-	 */
 	if (!worker)
-		goto queue_work;
+		return NULL;
 
 	worker->blkcg_css = blkcg_css;
-	css_get(worker->blkcg_css);
 	INIT_WORK(&worker->work, loop_workfn);
 	INIT_LIST_HEAD(&worker->cmd_list);
-	INIT_LIST_HEAD(&worker->idle_list);
 	worker->lo = lo;
+	spin_lock_init(&worker->lock);
+	refcount_set(&worker->refcnt, 2);	/* INIT + INC */
 
-	if (xa_err(xa_store(&lo->workers, blkcg_css->id, worker, gfp))) {
+	spin_lock(&lo->lo_work_lock);
+	/* maybe someone is storing a new worker */
+	worker_old = xa_load(&lo->workers, blkcg_css->id);
+	if (!worker_old || !refcount_inc_not_zero(&worker_old->refcnt)) {
+		if (xa_err(xa_store(&lo->workers, blkcg_css->id, worker, gfp))) {
+			kfree(worker);
+			worker = NULL;
+		} else {
+			if (!work_pending(&lo->idle_work.work))
+				schedule_delayed_work(&lo->idle_work,
+						LOOP_IDLE_WORKER_TIMEOUT);
+			css_get(worker->blkcg_css);
+		}
+	} else {
 		kfree(worker);
-		worker = NULL;
+		worker = worker_old;
 	}
+	spin_unlock(&lo->lo_work_lock);
 
-queue_work:
-	if (worker) {
+	return worker;
+}
+
+static void loop_release_worker(struct loop_worker *worker)
+{
+	css_put(worker->blkcg_css);
+	kfree_rcu(worker);
+}
+
+static void loop_queue_work(struct loop_device *lo, struct loop_cmd *cmd)
+{
+	struct loop_worker *worker = NULL;
+	struct work_struct *work;
+	struct list_head *cmd_list;
+	struct cgroup_subsys_state *blkcg_css = loop_rq_blkcg_css(cmd);
+	spinlock_t	*lock;
+
+	if (!queue_on_root_worker(blkcg_css)) {
+		int ret = 0;
+
+		rcu_read_lock();
+		/* css->id is unique in each cgroup subsystem */
+		worker = xa_load(&lo->workers, blkcg_css->id);
+		if (worker)
+			ret = refcount_inc_not_zero(&worker->refcnt);
+		rcu_read_unlock();
+
+		if (!worker || !ret)
+			worker = loop_alloc_or_get_worker(lo, blkcg_css);
 		/*
-		 * We need to remove from the idle list here while
-		 * holding the lock so that the idle timer doesn't
-		 * free the worker
+		 * In the event we cannot allocate a worker, just queue on the
+		 * rootcg worker and issue the I/O as the rootcg
 		 */
-		if (!list_empty(&worker->idle_list))
-			list_del_init(&worker->idle_list);
+	}
+
+	if (worker) {
 		work = &worker->work;
 		cmd_list = &worker->cmd_list;
+		lock = &worker->lock;
 	} else {
 		work = &lo->rootcg_work;
 		cmd_list = &lo->rootcg_cmd_list;
+		lock = &lo->lo_work_lock;
 	}
+
+	spin_lock(lock);
 	list_add_tail(&cmd->list_entry, cmd_list);
+	if (worker)
+		worker->reclaim_time = jiffies + LOOP_IDLE_WORKER_TIMEOUT;
+	spin_unlock(lock);
 	queue_work(lo->workqueue, work);
-	spin_unlock(&lo->lo_work_lock);
 }
 
 static void loop_update_rotational(struct loop_device *lo)
@@ -1140,28 +1172,38 @@ loop_set_status_from_info(struct loop_device *lo,
 	return 0;
 }
 
-static void loop_set_timer(struct loop_device *lo)
+static bool loop_need_reclaim_worker(struct loop_worker *worker)
 {
-	schedule_delayed_work(&lo->idle_work, LOOP_IDLE_WORKER_TIMEOUT);
+	bool reclaim = false;
+
+	spin_lock(&worker->lock);
+	if (list_empty(&worker->cmd_list) &&
+			time_is_before_jiffies(worker->reclaim_time))
+		reclaim = true;
+	else
+		reclaim = false;
+	spin_unlock(&worker->lock);
+
+	return reclaim;
 }
 
 static void __loop_free_idle_workers(struct loop_device *lo, bool force)
 {
-	struct loop_worker *pos, *worker;
+	struct loop_worker *worker;
+	unsigned long id;
 
 	spin_lock(&lo->lo_work_lock);
-	list_for_each_entry_safe(worker, pos, &lo->idle_worker_list,
-				idle_list) {
-		if (!force && time_is_after_jiffies(worker->last_ran_at +
-						LOOP_IDLE_WORKER_TIMEOUT))
-			break;
-		list_del(&worker->idle_list);
-		xa_erase(&lo->workers, worker->blkcg_css->id);
-		css_put(worker->blkcg_css);
-		kfree(worker);
+	xa_for_each(&lo->workers, id, worker) {
+		if (!force && !loop_need_reclaim_worker(worker))
+			continue;
+
+		xa_erase(&worker->lo->workers, worker->blkcg_css->id);
+		if (refcount_dec_and_test(&worker->refcnt))
+			loop_release_worker(worker);
 	}
-	if (!list_empty(&lo->idle_worker_list))
-		loop_set_timer(lo);
+	if (!xa_empty(&lo->workers))
+		schedule_delayed_work(&lo->idle_work,
+				LOOP_IDLE_WORKER_TIMEOUT);
 	spin_unlock(&lo->lo_work_lock);
 }
 
@@ -2162,42 +2204,36 @@ static void loop_handle_cmd(struct loop_cmd *cmd)
 }
 
 static void loop_process_work(struct loop_worker *worker,
-			struct list_head *cmd_list, struct loop_device *lo)
+			struct list_head *cmd_list, spinlock_t *lock)
 {
 	int orig_flags = current->flags;
 	struct loop_cmd *cmd;
 	LIST_HEAD(list);
+	int cnt = 0;
 
 	current->flags |= PF_LOCAL_THROTTLE | PF_MEMALLOC_NOIO;
 
-	spin_lock(&lo->lo_work_lock);
+	spin_lock(lock);
  again:
 	list_splice_init(cmd_list, &list);
-	spin_unlock(&lo->lo_work_lock);
+	spin_unlock(lock);
 
 	while (!list_empty(&list)) {
 		cmd = list_first_entry(&list, struct loop_cmd, list_entry);
 		list_del_init(&cmd->list_entry);
 
 		loop_handle_cmd(cmd);
+		cnt++;
 	}
 
-	spin_lock(&lo->lo_work_lock);
+	spin_lock(lock);
 	if (!list_empty(cmd_list))
 		goto again;
-
-	/*
-	 * We only add to the idle list if there are no pending cmds
-	 * *and* the worker will not run again which ensures that it
-	 * is safe to free any worker on the idle list
-	 */
-	if (worker && !work_pending(&worker->work)) {
-		worker->last_ran_at = jiffies;
-		list_add_tail(&worker->idle_list, &lo->idle_worker_list);
-		loop_set_timer(lo);
-	}
-	spin_unlock(&lo->lo_work_lock);
+	spin_unlock(lock);
 	current->flags = orig_flags;
+
+	if (worker && refcount_sub_and_test(cnt, &worker->refcnt))
+		loop_release_worker(worker);
 }
 
 static void loop_workfn(struct work_struct *work)
@@ -2212,11 +2248,11 @@ static void loop_workfn(struct work_struct *work)
 	if (memcg_css) {
 		old_memcg = set_active_memcg(
 				mem_cgroup_from_css(memcg_css));
-		loop_process_work(worker, &worker->cmd_list, worker->lo);
+		loop_process_work(worker, &worker->cmd_list, &worker->lock);
 		set_active_memcg(old_memcg);
 		css_put(memcg_css);
 	} else {
-		loop_process_work(worker, &worker->cmd_list, worker->lo);
+		loop_process_work(worker, &worker->cmd_list, &worker->lock);
 	}
 	kthread_associate_blkcg(NULL);
 }
@@ -2225,7 +2261,7 @@ static void loop_rootcg_workfn(struct work_struct *work)
 {
 	struct loop_device *lo =
 		container_of(work, struct loop_device, rootcg_work);
-	loop_process_work(NULL, &lo->rootcg_cmd_list, lo);
+	loop_process_work(NULL, &lo->rootcg_cmd_list, &lo->lo_work_lock);
 }
 
 static const struct blk_mq_ops loop_mq_ops = {
-- 
2.31.1


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH V2 7/7] loop: don't add worker into idle list
  2021-07-28  8:16 ` [PATCH V2 7/7] loop: don't add worker into idle list Ming Lei
@ 2021-07-30 16:52   ` Dan Schatzberg
  0 siblings, 0 replies; 9+ messages in thread
From: Dan Schatzberg @ 2021-07-30 16:52 UTC (permalink / raw)
  To: Ming Lei; +Cc: Jens Axboe, linux-kernel, linux-block, Christoph Hellwig

On Wed, Jul 28, 2021 at 04:16:38PM +0800, Ming Lei wrote:
> We can retrieve any workers via xarray, so not add it into idle list.
> Meantime reduce .lo_work_lock coverage, especially we don't need that
> in IO path except for adding/deleting worker into xarray.
> 
> Also replace .last_ran_at with .reclaim_time, which is set when adding
> loop command into worker->cmd_list. Meantime reclaim the worker when
> the worker is expired and no any pending commands.
> 
> Cc: Dan Schatzberg <schatzberg.dan@gmail.com>
> Signed-off-by: Ming Lei <ming.lei@redhat.com>
> ---
>  drivers/block/loop.c | 172 ++++++++++++++++++++++++++-----------------
>  1 file changed, 104 insertions(+), 68 deletions(-)
> 
> diff --git a/drivers/block/loop.c b/drivers/block/loop.c
> index 9b22e5469ed3..e4b292b2d8f4 100644
> --- a/drivers/block/loop.c
> +++ b/drivers/block/loop.c
> @@ -920,10 +920,11 @@ static void loop_config_discard(struct loop_device *lo)
>  struct loop_worker {
>  	struct work_struct work;
>  	struct list_head cmd_list;
> -	struct list_head idle_list;
>  	struct loop_device *lo;
>  	struct cgroup_subsys_state *blkcg_css;
> -	unsigned long last_ran_at;
> +	unsigned long reclaim_time;
> +	spinlock_t lock;
> +	refcount_t refcnt;
>  };
>  
>  static void loop_workfn(struct work_struct *work);
> @@ -963,62 +964,93 @@ static struct cgroup_subsys_state *loop_rq_get_memcg_css(
>  	return NULL;
>  }
>  
> -static void loop_queue_work(struct loop_device *lo, struct loop_cmd *cmd)
> +static struct loop_worker *loop_alloc_or_get_worker(struct loop_device *lo,
> +		struct cgroup_subsys_state *blkcg_css)
>  {
> -	struct loop_worker *worker = NULL;
> -	struct work_struct *work;
> -	struct list_head *cmd_list;
> -	struct cgroup_subsys_state *blkcg_css = loop_rq_blkcg_css(cmd);
>  	gfp_t gfp = GFP_NOWAIT | __GFP_NOWARN;
> +	struct loop_worker *worker = kzalloc(sizeof(*worker), gfp);
> +	struct loop_worker *worker_old;
>  
> -	spin_lock(&lo->lo_work_lock);
> -
> -	if (queue_on_root_worker(blkcg_css))
> -		goto queue_work;
> -
> -	/* css->id is unique in each cgroup subsystem */
> -	worker = xa_load(&lo->workers, blkcg_css->id);
> -	if (worker)
> -		goto queue_work;
> -
> -	worker = kzalloc(sizeof(*worker), gfp);
> -	/*
> -	 * In the event we cannot allocate a worker, just queue on the
> -	 * rootcg worker and issue the I/O as the rootcg
> -	 */
>  	if (!worker)
> -		goto queue_work;
> +		return NULL;
>  
>  	worker->blkcg_css = blkcg_css;
> -	css_get(worker->blkcg_css);
>  	INIT_WORK(&worker->work, loop_workfn);
>  	INIT_LIST_HEAD(&worker->cmd_list);
> -	INIT_LIST_HEAD(&worker->idle_list);
>  	worker->lo = lo;
> +	spin_lock_init(&worker->lock);
> +	refcount_set(&worker->refcnt, 2);	/* INIT + INC */
>  
> -	if (xa_err(xa_store(&lo->workers, blkcg_css->id, worker, gfp))) {
> +	spin_lock(&lo->lo_work_lock);
> +	/* maybe someone is storing a new worker */
> +	worker_old = xa_load(&lo->workers, blkcg_css->id);
> +	if (!worker_old || !refcount_inc_not_zero(&worker_old->refcnt)) {
> +		if (xa_err(xa_store(&lo->workers, blkcg_css->id, worker, gfp))) {
> +			kfree(worker);
> +			worker = NULL;
> +		} else {
> +			if (!work_pending(&lo->idle_work.work))
> +				schedule_delayed_work(&lo->idle_work,
> +						LOOP_IDLE_WORKER_TIMEOUT);
> +			css_get(worker->blkcg_css);
> +		}
> +	} else {
>  		kfree(worker);
> -		worker = NULL;
> +		worker = worker_old;
>  	}
> +	spin_unlock(&lo->lo_work_lock);
>  
> -queue_work:
> -	if (worker) {
> +	return worker;
> +}
> +
> +static void loop_release_worker(struct loop_worker *worker)
> +{
> +	css_put(worker->blkcg_css);
> +	kfree_rcu(worker);
> +}
> +
> +static void loop_queue_work(struct loop_device *lo, struct loop_cmd *cmd)
> +{
> +	struct loop_worker *worker = NULL;
> +	struct work_struct *work;
> +	struct list_head *cmd_list;
> +	struct cgroup_subsys_state *blkcg_css = loop_rq_blkcg_css(cmd);
> +	spinlock_t	*lock;
> +
> +	if (!queue_on_root_worker(blkcg_css)) {
> +		int ret = 0;
> +
> +		rcu_read_lock();
> +		/* css->id is unique in each cgroup subsystem */
> +		worker = xa_load(&lo->workers, blkcg_css->id);
> +		if (worker)
> +			ret = refcount_inc_not_zero(&worker->refcnt);
> +		rcu_read_unlock();
> +
> +		if (!worker || !ret)
> +			worker = loop_alloc_or_get_worker(lo, blkcg_css);
>  		/*
> -		 * We need to remove from the idle list here while
> -		 * holding the lock so that the idle timer doesn't
> -		 * free the worker
> +		 * In the event we cannot allocate a worker, just queue on the
> +		 * rootcg worker and issue the I/O as the rootcg
>  		 */
> -		if (!list_empty(&worker->idle_list))
> -			list_del_init(&worker->idle_list);
> +	}
> +
> +	if (worker) {
>  		work = &worker->work;
>  		cmd_list = &worker->cmd_list;
> +		lock = &worker->lock;
>  	} else {
>  		work = &lo->rootcg_work;
>  		cmd_list = &lo->rootcg_cmd_list;
> +		lock = &lo->lo_work_lock;
>  	}
> +
> +	spin_lock(lock);
>  	list_add_tail(&cmd->list_entry, cmd_list);
> +	if (worker)
> +		worker->reclaim_time = jiffies + LOOP_IDLE_WORKER_TIMEOUT;
> +	spin_unlock(lock);
>  	queue_work(lo->workqueue, work);
> -	spin_unlock(&lo->lo_work_lock);
>  }
>  
>  static void loop_update_rotational(struct loop_device *lo)
> @@ -1140,28 +1172,38 @@ loop_set_status_from_info(struct loop_device *lo,
>  	return 0;
>  }
>  
> -static void loop_set_timer(struct loop_device *lo)
> +static bool loop_need_reclaim_worker(struct loop_worker *worker)
>  {
> -	schedule_delayed_work(&lo->idle_work, LOOP_IDLE_WORKER_TIMEOUT);
> +	bool reclaim = false;

You don't need both this initialization and the else block below. I
think it's easier to read if you just remove one of them.

> +
> +	spin_lock(&worker->lock);
> +	if (list_empty(&worker->cmd_list) &&
> +			time_is_before_jiffies(worker->reclaim_time))
> +		reclaim = true;
> +	else
> +		reclaim = false;
> +	spin_unlock(&worker->lock);
> +
> +	return reclaim;
>  }
>  
>  static void __loop_free_idle_workers(struct loop_device *lo, bool force)
>  {
> -	struct loop_worker *pos, *worker;
> +	struct loop_worker *worker;
> +	unsigned long id;
>  
>  	spin_lock(&lo->lo_work_lock);
> -	list_for_each_entry_safe(worker, pos, &lo->idle_worker_list,
> -				idle_list) {
> -		if (!force && time_is_after_jiffies(worker->last_ran_at +
> -						LOOP_IDLE_WORKER_TIMEOUT))
> -			break;
> -		list_del(&worker->idle_list);
> -		xa_erase(&lo->workers, worker->blkcg_css->id);
> -		css_put(worker->blkcg_css);
> -		kfree(worker);
> +	xa_for_each(&lo->workers, id, worker) {
> +		if (!force && !loop_need_reclaim_worker(worker))
> +			continue;
> +
> +		xa_erase(&worker->lo->workers, worker->blkcg_css->id);
> +		if (refcount_dec_and_test(&worker->refcnt))
> +			loop_release_worker(worker);
>  	}
> -	if (!list_empty(&lo->idle_worker_list))
> -		loop_set_timer(lo);
> +	if (!xa_empty(&lo->workers))
> +		schedule_delayed_work(&lo->idle_work,
> +				LOOP_IDLE_WORKER_TIMEOUT);
>  	spin_unlock(&lo->lo_work_lock);
>  }
>  
> @@ -2162,42 +2204,36 @@ static void loop_handle_cmd(struct loop_cmd *cmd)
>  }
>  
>  static void loop_process_work(struct loop_worker *worker,
> -			struct list_head *cmd_list, struct loop_device *lo)
> +			struct list_head *cmd_list, spinlock_t *lock)
>  {
>  	int orig_flags = current->flags;
>  	struct loop_cmd *cmd;
>  	LIST_HEAD(list);
> +	int cnt = 0;
>  
>  	current->flags |= PF_LOCAL_THROTTLE | PF_MEMALLOC_NOIO;
>  
> -	spin_lock(&lo->lo_work_lock);
> +	spin_lock(lock);
>   again:
>  	list_splice_init(cmd_list, &list);
> -	spin_unlock(&lo->lo_work_lock);
> +	spin_unlock(lock);
>  
>  	while (!list_empty(&list)) {
>  		cmd = list_first_entry(&list, struct loop_cmd, list_entry);
>  		list_del_init(&cmd->list_entry);
>  
>  		loop_handle_cmd(cmd);
> +		cnt++;
>  	}
>  
> -	spin_lock(&lo->lo_work_lock);
> +	spin_lock(lock);
>  	if (!list_empty(cmd_list))
>  		goto again;
> -
> -	/*
> -	 * We only add to the idle list if there are no pending cmds
> -	 * *and* the worker will not run again which ensures that it
> -	 * is safe to free any worker on the idle list
> -	 */
> -	if (worker && !work_pending(&worker->work)) {
> -		worker->last_ran_at = jiffies;
> -		list_add_tail(&worker->idle_list, &lo->idle_worker_list);
> -		loop_set_timer(lo);
> -	}
> -	spin_unlock(&lo->lo_work_lock);
> +	spin_unlock(lock);
>  	current->flags = orig_flags;
> +
> +	if (worker && refcount_sub_and_test(cnt, &worker->refcnt))
> +		loop_release_worker(worker);
>  }
>  
>  static void loop_workfn(struct work_struct *work)
> @@ -2212,11 +2248,11 @@ static void loop_workfn(struct work_struct *work)
>  	if (memcg_css) {
>  		old_memcg = set_active_memcg(
>  				mem_cgroup_from_css(memcg_css));
> -		loop_process_work(worker, &worker->cmd_list, worker->lo);
> +		loop_process_work(worker, &worker->cmd_list, &worker->lock);
>  		set_active_memcg(old_memcg);
>  		css_put(memcg_css);
>  	} else {
> -		loop_process_work(worker, &worker->cmd_list, worker->lo);
> +		loop_process_work(worker, &worker->cmd_list, &worker->lock);
>  	}
>  	kthread_associate_blkcg(NULL);
>  }
> @@ -2225,7 +2261,7 @@ static void loop_rootcg_workfn(struct work_struct *work)
>  {
>  	struct loop_device *lo =
>  		container_of(work, struct loop_device, rootcg_work);
> -	loop_process_work(NULL, &lo->rootcg_cmd_list, lo);
> +	loop_process_work(NULL, &lo->rootcg_cmd_list, &lo->lo_work_lock);
>  }
>  
>  static const struct blk_mq_ops loop_mq_ops = {
> -- 
> 2.31.1
> 

This is all great.

Acked-by: Dan Schatzberg <schatzberg.dan@gmail.com>

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2021-07-30 16:52 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-07-28  8:16 [PATCH V2 0/7] loop: cleanup charging io to mem/blkcg Ming Lei
2021-07-28  8:16 ` [PATCH V2 1/7] mm: memcontrol: add helper of memcg_get_e_css Ming Lei
2021-07-28  8:16 ` [PATCH V2 2/7] loop: clean up blkcg association Ming Lei
2021-07-28  8:16 ` [PATCH V2 3/7] loop: conver timer for monitoring idle worker into dwork Ming Lei
2021-07-28  8:16 ` [PATCH V2 4/7] loop: add __loop_free_idle_workers() for covering freeing workers in clearing FD Ming Lei
2021-07-28  8:16 ` [PATCH V2 5/7] loop: improve loop_process_work Ming Lei
2021-07-28  8:16 ` [PATCH V2 6/7] loop: use xarray to store workers Ming Lei
2021-07-28  8:16 ` [PATCH V2 7/7] loop: don't add worker into idle list Ming Lei
2021-07-30 16:52   ` Dan Schatzberg

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).