LKML Archive on lore.kernel.org
help / color / mirror / Atom feed
* [PATCH] f2fs: use rwlock instead of rwsem for journal
@ 2021-07-22  1:41 Jaegeuk Kim
  2021-07-22 13:41 ` [f2fs-dev] " Chao Yu
  2021-07-22 17:44 ` [PATCH v2] f2fs: don't sleep while grabing nat_tree_lock Jaegeuk Kim
  0 siblings, 2 replies; 7+ messages in thread
From: Jaegeuk Kim @ 2021-07-22  1:41 UTC (permalink / raw)
  To: linux-kernel, linux-f2fs-devel; +Cc: Jaegeuk Kim

This tries to fix priority inversion in the below condition resulting in
long checkpoint delay.

f2fs_get_node_info()
 - nat_tree_lock
  -> sleep in journal_rwsem
                                     checkpoint
                                     - waiting for nat_tree_lock

Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/node.c    | 16 ++++++++--------
 fs/f2fs/segment.c | 22 +++++++++++-----------
 fs/f2fs/segment.h |  2 +-
 3 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 0be9e2d7120e..821a40e02fb4 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -567,13 +567,13 @@ int f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid,
 	memset(&ne, 0, sizeof(struct f2fs_nat_entry));
 
 	/* Check current segment summary */
-	down_read(&curseg->journal_rwsem);
+	read_lock(&curseg->journal_rwlock);
 	i = f2fs_lookup_journal_in_cursum(journal, NAT_JOURNAL, nid, 0);
 	if (i >= 0) {
 		ne = nat_in_journal(journal, i);
 		node_info_from_raw_nat(ni, &ne);
 	}
-	up_read(&curseg->journal_rwsem);
+	read_unlock(&curseg->journal_rwlock);
 	if (i >= 0) {
 		up_read(&nm_i->nat_tree_lock);
 		goto cache;
@@ -2338,7 +2338,7 @@ static void scan_curseg_cache(struct f2fs_sb_info *sbi)
 	struct f2fs_journal *journal = curseg->journal;
 	int i;
 
-	down_read(&curseg->journal_rwsem);
+	read_lock(&curseg->journal_rwlock);
 	for (i = 0; i < nats_in_cursum(journal); i++) {
 		block_t addr;
 		nid_t nid;
@@ -2350,7 +2350,7 @@ static void scan_curseg_cache(struct f2fs_sb_info *sbi)
 		else
 			remove_free_nid(sbi, nid);
 	}
-	up_read(&curseg->journal_rwsem);
+	read_unlock(&curseg->journal_rwlock);
 }
 
 static void scan_free_nid_bits(struct f2fs_sb_info *sbi)
@@ -2799,7 +2799,7 @@ static void remove_nats_in_journal(struct f2fs_sb_info *sbi)
 	struct f2fs_journal *journal = curseg->journal;
 	int i;
 
-	down_write(&curseg->journal_rwsem);
+	write_lock(&curseg->journal_rwlock);
 	for (i = 0; i < nats_in_cursum(journal); i++) {
 		struct nat_entry *ne;
 		struct f2fs_nat_entry raw_ne;
@@ -2831,7 +2831,7 @@ static void remove_nats_in_journal(struct f2fs_sb_info *sbi)
 		__set_nat_cache_dirty(nm_i, ne);
 	}
 	update_nats_in_cursum(journal, -i);
-	up_write(&curseg->journal_rwsem);
+	write_unlock(&curseg->journal_rwlock);
 }
 
 static void __adjust_nat_entry_set(struct nat_entry_set *nes,
@@ -2906,7 +2906,7 @@ static int __flush_nat_entry_set(struct f2fs_sb_info *sbi,
 		to_journal = false;
 
 	if (to_journal) {
-		down_write(&curseg->journal_rwsem);
+		write_lock(&curseg->journal_rwlock);
 	} else {
 		page = get_next_nat_page(sbi, start_nid);
 		if (IS_ERR(page))
@@ -2946,7 +2946,7 @@ static int __flush_nat_entry_set(struct f2fs_sb_info *sbi,
 	}
 
 	if (to_journal) {
-		up_write(&curseg->journal_rwsem);
+		write_unlock(&curseg->journal_rwlock);
 	} else {
 		__update_nat_bits(sbi, start_nid, page);
 		f2fs_put_page(page, 1);
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index f9b7fb785e1d..c397c1c271ec 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -2442,9 +2442,9 @@ static void write_current_sum_page(struct f2fs_sb_info *sbi,
 
 	mutex_lock(&curseg->curseg_mutex);
 
-	down_read(&curseg->journal_rwsem);
+	read_lock(&curseg->journal_rwlock);
 	memcpy(&dst->journal, curseg->journal, SUM_JOURNAL_SIZE);
-	up_read(&curseg->journal_rwsem);
+	read_unlock(&curseg->journal_rwlock);
 
 	memcpy(dst->entries, src->entries, SUM_ENTRY_SIZE);
 	memcpy(&dst->footer, &src->footer, SUM_FOOTER_SIZE);
@@ -3876,9 +3876,9 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type)
 	mutex_lock(&curseg->curseg_mutex);
 
 	/* update journal info */
-	down_write(&curseg->journal_rwsem);
+	write_lock(&curseg->journal_rwlock);
 	memcpy(curseg->journal, &sum->journal, SUM_JOURNAL_SIZE);
-	up_write(&curseg->journal_rwsem);
+	write_unlock(&curseg->journal_rwlock);
 
 	memcpy(curseg->sum_blk->entries, sum->entries, SUM_ENTRY_SIZE);
 	memcpy(&curseg->sum_blk->footer, &sum->footer, SUM_FOOTER_SIZE);
@@ -4136,7 +4136,7 @@ static void remove_sits_in_journal(struct f2fs_sb_info *sbi)
 	struct f2fs_journal *journal = curseg->journal;
 	int i;
 
-	down_write(&curseg->journal_rwsem);
+	write_lock(&curseg->journal_rwlock);
 	for (i = 0; i < sits_in_cursum(journal); i++) {
 		unsigned int segno;
 		bool dirtied;
@@ -4148,7 +4148,7 @@ static void remove_sits_in_journal(struct f2fs_sb_info *sbi)
 			add_sit_entry(segno, &SM_I(sbi)->sit_entry_set);
 	}
 	update_sits_in_cursum(journal, -i);
-	up_write(&curseg->journal_rwsem);
+	write_unlock(&curseg->journal_rwlock);
 }
 
 /*
@@ -4204,7 +4204,7 @@ void f2fs_flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 			to_journal = false;
 
 		if (to_journal) {
-			down_write(&curseg->journal_rwsem);
+			write_lock(&curseg->journal_rwlock);
 		} else {
 			page = get_next_sit_page(sbi, start_segno);
 			raw_sit = page_address(page);
@@ -4251,7 +4251,7 @@ void f2fs_flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 		}
 
 		if (to_journal)
-			up_write(&curseg->journal_rwsem);
+			write_unlock(&curseg->journal_rwlock);
 		else
 			f2fs_put_page(page, 1);
 
@@ -4432,7 +4432,7 @@ static int build_curseg(struct f2fs_sb_info *sbi)
 		array[i].sum_blk = f2fs_kzalloc(sbi, PAGE_SIZE, GFP_KERNEL);
 		if (!array[i].sum_blk)
 			return -ENOMEM;
-		init_rwsem(&array[i].journal_rwsem);
+		rwlock_init(&array[i].journal_rwlock);
 		array[i].journal = f2fs_kzalloc(sbi,
 				sizeof(struct f2fs_journal), GFP_KERNEL);
 		if (!array[i].journal)
@@ -4509,7 +4509,7 @@ static int build_sit_entries(struct f2fs_sb_info *sbi)
 		start_blk += readed;
 	} while (start_blk < sit_blk_cnt);
 
-	down_read(&curseg->journal_rwsem);
+	read_lock(&curseg->journal_rwlock);
 	for (i = 0; i < sits_in_cursum(journal); i++) {
 		unsigned int old_valid_blocks;
 
@@ -4551,7 +4551,7 @@ static int build_sit_entries(struct f2fs_sb_info *sbi)
 							old_valid_blocks;
 		}
 	}
-	up_read(&curseg->journal_rwsem);
+	read_unlock(&curseg->journal_rwlock);
 
 	if (!err && total_node_blocks != valid_node_count(sbi)) {
 		f2fs_err(sbi, "SIT is corrupted node# %u vs %u",
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index 050230c70a53..afb9041dc2e6 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -306,7 +306,7 @@ struct victim_selection {
 struct curseg_info {
 	struct mutex curseg_mutex;		/* lock for consistency */
 	struct f2fs_summary_block *sum_blk;	/* cached summary block */
-	struct rw_semaphore journal_rwsem;	/* protect journal area */
+	rwlock_t journal_rwlock;		/* protect journal area */
 	struct f2fs_journal *journal;		/* cached journal info */
 	unsigned char alloc_type;		/* current allocation type */
 	unsigned short seg_type;		/* segment type like CURSEG_XXX_TYPE */
-- 
2.32.0.402.g57bb445576-goog


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [f2fs-dev] [PATCH] f2fs: use rwlock instead of rwsem for journal
  2021-07-22  1:41 [PATCH] f2fs: use rwlock instead of rwsem for journal Jaegeuk Kim
@ 2021-07-22 13:41 ` Chao Yu
  2021-07-22 17:44 ` [PATCH v2] f2fs: don't sleep while grabing nat_tree_lock Jaegeuk Kim
  1 sibling, 0 replies; 7+ messages in thread
From: Chao Yu @ 2021-07-22 13:41 UTC (permalink / raw)
  To: Jaegeuk Kim, linux-kernel, linux-f2fs-devel

On 2021/7/22 9:41, Jaegeuk Kim wrote:
> This tries to fix priority inversion in the below condition resulting in
> long checkpoint delay.
> 
> f2fs_get_node_info()
>   - nat_tree_lock
>    -> sleep in journal_rwsem
>                                       checkpoint
>                                       - waiting for nat_tree_lock
> 
> Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
> ---
>   fs/f2fs/node.c    | 16 ++++++++--------
>   fs/f2fs/segment.c | 22 +++++++++++-----------
>   fs/f2fs/segment.h |  2 +-
>   3 files changed, 20 insertions(+), 20 deletions(-)
> 
> diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
> index 0be9e2d7120e..821a40e02fb4 100644
> --- a/fs/f2fs/node.c
> +++ b/fs/f2fs/node.c
> @@ -567,13 +567,13 @@ int f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid,
>   	memset(&ne, 0, sizeof(struct f2fs_nat_entry));
>   
>   	/* Check current segment summary */
> -	down_read(&curseg->journal_rwsem);
> +	read_lock(&curseg->journal_rwlock);

It seem journal_rwsem covers too many operations, e.g.

- scan_curseg_cache
  - add_free_nid
   - f2fs_kmem_cache_alloc
    - kmem_cache_alloc(__GFP_NOFAIL)

Thanks,

>   	i = f2fs_lookup_journal_in_cursum(journal, NAT_JOURNAL, nid, 0);
>   	if (i >= 0) {
>   		ne = nat_in_journal(journal, i);
>   		node_info_from_raw_nat(ni, &ne);
>   	}
> -	up_read(&curseg->journal_rwsem);
> +	read_unlock(&curseg->journal_rwlock);
>   	if (i >= 0) {
>   		up_read(&nm_i->nat_tree_lock);
>   		goto cache;
> @@ -2338,7 +2338,7 @@ static void scan_curseg_cache(struct f2fs_sb_info *sbi)
>   	struct f2fs_journal *journal = curseg->journal;
>   	int i;
>   
> -	down_read(&curseg->journal_rwsem);
> +	read_lock(&curseg->journal_rwlock);
>   	for (i = 0; i < nats_in_cursum(journal); i++) {
>   		block_t addr;
>   		nid_t nid;
> @@ -2350,7 +2350,7 @@ static void scan_curseg_cache(struct f2fs_sb_info *sbi)
>   		else
>   			remove_free_nid(sbi, nid);
>   	}
> -	up_read(&curseg->journal_rwsem);
> +	read_unlock(&curseg->journal_rwlock);
>   }
>   
>   static void scan_free_nid_bits(struct f2fs_sb_info *sbi)
> @@ -2799,7 +2799,7 @@ static void remove_nats_in_journal(struct f2fs_sb_info *sbi)
>   	struct f2fs_journal *journal = curseg->journal;
>   	int i;
>   
> -	down_write(&curseg->journal_rwsem);
> +	write_lock(&curseg->journal_rwlock);
>   	for (i = 0; i < nats_in_cursum(journal); i++) {
>   		struct nat_entry *ne;
>   		struct f2fs_nat_entry raw_ne;
> @@ -2831,7 +2831,7 @@ static void remove_nats_in_journal(struct f2fs_sb_info *sbi)
>   		__set_nat_cache_dirty(nm_i, ne);
>   	}
>   	update_nats_in_cursum(journal, -i);
> -	up_write(&curseg->journal_rwsem);
> +	write_unlock(&curseg->journal_rwlock);
>   }
>   
>   static void __adjust_nat_entry_set(struct nat_entry_set *nes,
> @@ -2906,7 +2906,7 @@ static int __flush_nat_entry_set(struct f2fs_sb_info *sbi,
>   		to_journal = false;
>   
>   	if (to_journal) {
> -		down_write(&curseg->journal_rwsem);
> +		write_lock(&curseg->journal_rwlock);
>   	} else {
>   		page = get_next_nat_page(sbi, start_nid);
>   		if (IS_ERR(page))
> @@ -2946,7 +2946,7 @@ static int __flush_nat_entry_set(struct f2fs_sb_info *sbi,
>   	}
>   
>   	if (to_journal) {
> -		up_write(&curseg->journal_rwsem);
> +		write_unlock(&curseg->journal_rwlock);
>   	} else {
>   		__update_nat_bits(sbi, start_nid, page);
>   		f2fs_put_page(page, 1);
> diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
> index f9b7fb785e1d..c397c1c271ec 100644
> --- a/fs/f2fs/segment.c
> +++ b/fs/f2fs/segment.c
> @@ -2442,9 +2442,9 @@ static void write_current_sum_page(struct f2fs_sb_info *sbi,
>   
>   	mutex_lock(&curseg->curseg_mutex);
>   
> -	down_read(&curseg->journal_rwsem);
> +	read_lock(&curseg->journal_rwlock);
>   	memcpy(&dst->journal, curseg->journal, SUM_JOURNAL_SIZE);
> -	up_read(&curseg->journal_rwsem);
> +	read_unlock(&curseg->journal_rwlock);
>   
>   	memcpy(dst->entries, src->entries, SUM_ENTRY_SIZE);
>   	memcpy(&dst->footer, &src->footer, SUM_FOOTER_SIZE);
> @@ -3876,9 +3876,9 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type)
>   	mutex_lock(&curseg->curseg_mutex);
>   
>   	/* update journal info */
> -	down_write(&curseg->journal_rwsem);
> +	write_lock(&curseg->journal_rwlock);
>   	memcpy(curseg->journal, &sum->journal, SUM_JOURNAL_SIZE);
> -	up_write(&curseg->journal_rwsem);
> +	write_unlock(&curseg->journal_rwlock);
>   
>   	memcpy(curseg->sum_blk->entries, sum->entries, SUM_ENTRY_SIZE);
>   	memcpy(&curseg->sum_blk->footer, &sum->footer, SUM_FOOTER_SIZE);
> @@ -4136,7 +4136,7 @@ static void remove_sits_in_journal(struct f2fs_sb_info *sbi)
>   	struct f2fs_journal *journal = curseg->journal;
>   	int i;
>   
> -	down_write(&curseg->journal_rwsem);
> +	write_lock(&curseg->journal_rwlock);
>   	for (i = 0; i < sits_in_cursum(journal); i++) {
>   		unsigned int segno;
>   		bool dirtied;
> @@ -4148,7 +4148,7 @@ static void remove_sits_in_journal(struct f2fs_sb_info *sbi)
>   			add_sit_entry(segno, &SM_I(sbi)->sit_entry_set);
>   	}
>   	update_sits_in_cursum(journal, -i);
> -	up_write(&curseg->journal_rwsem);
> +	write_unlock(&curseg->journal_rwlock);
>   }
>   
>   /*
> @@ -4204,7 +4204,7 @@ void f2fs_flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
>   			to_journal = false;
>   
>   		if (to_journal) {
> -			down_write(&curseg->journal_rwsem);
> +			write_lock(&curseg->journal_rwlock);
>   		} else {
>   			page = get_next_sit_page(sbi, start_segno);
>   			raw_sit = page_address(page);
> @@ -4251,7 +4251,7 @@ void f2fs_flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
>   		}
>   
>   		if (to_journal)
> -			up_write(&curseg->journal_rwsem);
> +			write_unlock(&curseg->journal_rwlock);
>   		else
>   			f2fs_put_page(page, 1);
>   
> @@ -4432,7 +4432,7 @@ static int build_curseg(struct f2fs_sb_info *sbi)
>   		array[i].sum_blk = f2fs_kzalloc(sbi, PAGE_SIZE, GFP_KERNEL);
>   		if (!array[i].sum_blk)
>   			return -ENOMEM;
> -		init_rwsem(&array[i].journal_rwsem);
> +		rwlock_init(&array[i].journal_rwlock);
>   		array[i].journal = f2fs_kzalloc(sbi,
>   				sizeof(struct f2fs_journal), GFP_KERNEL);
>   		if (!array[i].journal)
> @@ -4509,7 +4509,7 @@ static int build_sit_entries(struct f2fs_sb_info *sbi)
>   		start_blk += readed;
>   	} while (start_blk < sit_blk_cnt);
>   
> -	down_read(&curseg->journal_rwsem);
> +	read_lock(&curseg->journal_rwlock);
>   	for (i = 0; i < sits_in_cursum(journal); i++) {
>   		unsigned int old_valid_blocks;
>   
> @@ -4551,7 +4551,7 @@ static int build_sit_entries(struct f2fs_sb_info *sbi)
>   							old_valid_blocks;
>   		}
>   	}
> -	up_read(&curseg->journal_rwsem);
> +	read_unlock(&curseg->journal_rwlock);
>   
>   	if (!err && total_node_blocks != valid_node_count(sbi)) {
>   		f2fs_err(sbi, "SIT is corrupted node# %u vs %u",
> diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
> index 050230c70a53..afb9041dc2e6 100644
> --- a/fs/f2fs/segment.h
> +++ b/fs/f2fs/segment.h
> @@ -306,7 +306,7 @@ struct victim_selection {
>   struct curseg_info {
>   	struct mutex curseg_mutex;		/* lock for consistency */
>   	struct f2fs_summary_block *sum_blk;	/* cached summary block */
> -	struct rw_semaphore journal_rwsem;	/* protect journal area */
> +	rwlock_t journal_rwlock;		/* protect journal area */
>   	struct f2fs_journal *journal;		/* cached journal info */
>   	unsigned char alloc_type;		/* current allocation type */
>   	unsigned short seg_type;		/* segment type like CURSEG_XXX_TYPE */
> 

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH v2] f2fs: don't sleep while grabing nat_tree_lock
  2021-07-22  1:41 [PATCH] f2fs: use rwlock instead of rwsem for journal Jaegeuk Kim
  2021-07-22 13:41 ` [f2fs-dev] " Chao Yu
@ 2021-07-22 17:44 ` Jaegeuk Kim
  2021-07-23  0:39   ` [f2fs-dev] " Chao Yu
  2021-07-23  2:35   ` [PATCH v3] " Jaegeuk Kim
  1 sibling, 2 replies; 7+ messages in thread
From: Jaegeuk Kim @ 2021-07-22 17:44 UTC (permalink / raw)
  To: linux-kernel, linux-f2fs-devel

This tries to fix priority inversion in the below condition resulting in
long checkpoint delay.

f2fs_get_node_info()
 - nat_tree_lock
  -> sleep to grab journal_rwsem by contention

                                     checkpoint
                                     - waiting for nat_tree_lock

In order to let checkpoint go, let's release nat_tree_lock, if there's a
journal_rwsem contention.

Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 Change log from v1:
  - drop rwlock but take retry logic to reduce lock-holding time

 fs/f2fs/node.c | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 0be9e2d7120e..c60ba4179bb2 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -552,7 +552,7 @@ int f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid,
 	int i;
 
 	ni->nid = nid;
-
+retry:
 	/* Check nat cache */
 	down_read(&nm_i->nat_tree_lock);
 	e = __lookup_nat_cache(nm_i, nid);
@@ -564,10 +564,16 @@ int f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid,
 		return 0;
 	}
 
-	memset(&ne, 0, sizeof(struct f2fs_nat_entry));
+	/*
+	 * Check current segment summary by trying to grab journal_rwsem first.
+	 * This sem is on the critical path on the checkpoint requiring the above
+	 * nat_tree_lock. Therefore, we should retry, if we failed to grab here.
+	 */
+	if (!down_read_trylock(&curseg->journal_rwsem)) {
+		up_read(&nm_i->nat_tree_lock);
+		goto retry;
+	}
 
-	/* Check current segment summary */
-	down_read(&curseg->journal_rwsem);
 	i = f2fs_lookup_journal_in_cursum(journal, NAT_JOURNAL, nid, 0);
 	if (i >= 0) {
 		ne = nat_in_journal(journal, i);
-- 
2.32.0.432.gabb21c7263-goog


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [f2fs-dev] [PATCH v2] f2fs: don't sleep while grabing nat_tree_lock
  2021-07-22 17:44 ` [PATCH v2] f2fs: don't sleep while grabing nat_tree_lock Jaegeuk Kim
@ 2021-07-23  0:39   ` Chao Yu
  2021-07-23  2:35   ` [PATCH v3] " Jaegeuk Kim
  1 sibling, 0 replies; 7+ messages in thread
From: Chao Yu @ 2021-07-23  0:39 UTC (permalink / raw)
  To: Jaegeuk Kim, linux-kernel, linux-f2fs-devel

On 2021/7/23 1:44, Jaegeuk Kim wrote:
> This tries to fix priority inversion in the below condition resulting in
> long checkpoint delay.
> 
> f2fs_get_node_info()
>   - nat_tree_lock
>    -> sleep to grab journal_rwsem by contention
> 
>                                       checkpoint
>                                       - waiting for nat_tree_lock
> 
> In order to let checkpoint go, let's release nat_tree_lock, if there's a
> journal_rwsem contention.

Write lock of nat_tree_lock is held from many places, how about just
retrying unlock/lock only if checkpoint() is flushing nat blocks?

---
  fs/f2fs/f2fs.h | 1 +
  fs/f2fs/node.c | 8 ++++++--
  2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index ad8f99d7235f..05f41a15fda4 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -877,6 +877,7 @@ struct f2fs_nm_info {
  	spinlock_t nat_list_lock;	/* protect clean nat entry list */
  	unsigned int nat_cnt[MAX_NAT_STATE]; /* the # of cached nat entries */
  	unsigned int nat_blocks;	/* # of nat blocks */
+	bool flushing_nat;		/* indicate checkpoint() is flushing nat blocks */

  	/* free node ids management */
  	struct radix_tree_root free_nid_root;/* root of the free_nid cache */
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index c60ba4179bb2..2caa171a68f8 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -569,7 +569,7 @@ int f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid,
  	 * This sem is on the critical path on the checkpoint requiring the above
  	 * nat_tree_lock. Therefore, we should retry, if we failed to grab here.
  	 */
-	if (!down_read_trylock(&curseg->journal_rwsem)) {
+	if (!down_read_trylock(&curseg->journal_rwsem) && nm_i->flushing_nat) {
  		up_read(&nm_i->nat_tree_lock);
  		goto retry;
  	}
@@ -2981,6 +2981,8 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
  	LIST_HEAD(sets);
  	int err = 0;

+	nm_i->flushing_nat = true;
+
  	/*
  	 * during unmount, let's flush nat_bits before checking
  	 * nat_cnt[DIRTY_NAT].
@@ -2992,7 +2994,7 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
  	}

  	if (!nm_i->nat_cnt[DIRTY_NAT])
-		return 0;
+		goto out;

  	down_write(&nm_i->nat_tree_lock);

@@ -3026,6 +3028,8 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
  	up_write(&nm_i->nat_tree_lock);
  	/* Allow dirty nats by node block allocation in write_begin */

+out:
+	nm_i->flushing_nat = false;
  	return err;
  }

-- 
2.22.1



^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH v3] f2fs: don't sleep while grabing nat_tree_lock
  2021-07-22 17:44 ` [PATCH v2] f2fs: don't sleep while grabing nat_tree_lock Jaegeuk Kim
  2021-07-23  0:39   ` [f2fs-dev] " Chao Yu
@ 2021-07-23  2:35   ` Jaegeuk Kim
  2021-07-23  2:41     ` [f2fs-dev] " Chao Yu
  2021-07-23  4:27     ` [f2fs-dev] [PATCH v4] " Jaegeuk Kim
  1 sibling, 2 replies; 7+ messages in thread
From: Jaegeuk Kim @ 2021-07-23  2:35 UTC (permalink / raw)
  To: linux-kernel, linux-f2fs-devel

This tries to fix priority inversion in the below condition resulting in
long checkpoint delay.

f2fs_get_node_info()
 - nat_tree_lock
  -> sleep to grab journal_rwsem by contention

                                     checkpoint
                                     - waiting for nat_tree_lock

In order to let checkpoint go, let's release nat_tree_lock, if there's a
journal_rwsem contention.

Signed-off-by: Daeho Jeong <daehojeong@google.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 Change log from v2:
  - don't bother checkpoint

 fs/f2fs/node.c | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 0be9e2d7120e..b26642daa3d2 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -552,7 +552,7 @@ int f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid,
 	int i;
 
 	ni->nid = nid;
-
+retry:
 	/* Check nat cache */
 	down_read(&nm_i->nat_tree_lock);
 	e = __lookup_nat_cache(nm_i, nid);
@@ -564,10 +564,19 @@ int f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid,
 		return 0;
 	}
 
-	memset(&ne, 0, sizeof(struct f2fs_nat_entry));
+	/*
+	 * Check current segment summary by trying to grab journal_rwsem first.
+	 * This sem is on the critical path on the checkpoint requiring the above
+	 * nat_tree_lock. Therefore, we should retry, if we failed to grab here
+	 * while not bothering checkpoint.
+	 */
+	if (rwsem_is_locked(&sbi->cp_global_sem)) {
+		down_read(&curseg->journal_rwsem);
+	} else if (!down_read_trylock(&curseg->journal_rwsem)) {
+		up_read(&nm_i->nat_tree_lock);
+		goto retry;
+	}
 
-	/* Check current segment summary */
-	down_read(&curseg->journal_rwsem);
 	i = f2fs_lookup_journal_in_cursum(journal, NAT_JOURNAL, nid, 0);
 	if (i >= 0) {
 		ne = nat_in_journal(journal, i);
-- 
2.32.0.432.gabb21c7263-goog


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [f2fs-dev] [PATCH v3] f2fs: don't sleep while grabing nat_tree_lock
  2021-07-23  2:35   ` [PATCH v3] " Jaegeuk Kim
@ 2021-07-23  2:41     ` Chao Yu
  2021-07-23  4:27     ` [f2fs-dev] [PATCH v4] " Jaegeuk Kim
  1 sibling, 0 replies; 7+ messages in thread
From: Chao Yu @ 2021-07-23  2:41 UTC (permalink / raw)
  To: Jaegeuk Kim, linux-kernel, linux-f2fs-devel

On 2021/7/23 10:35, Jaegeuk Kim wrote:
> This tries to fix priority inversion in the below condition resulting in
> long checkpoint delay.
> 
> f2fs_get_node_info()
>   - nat_tree_lock
>    -> sleep to grab journal_rwsem by contention
> 
>                                       checkpoint
>                                       - waiting for nat_tree_lock
> 
> In order to let checkpoint go, let's release nat_tree_lock, if there's a
> journal_rwsem contention.
> 
> Signed-off-by: Daeho Jeong <daehojeong@google.com>
> Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
> ---
>   Change log from v2:
>    - don't bother checkpoint
> 
>   fs/f2fs/node.c | 17 +++++++++++++----
>   1 file changed, 13 insertions(+), 4 deletions(-)
> 
> diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
> index 0be9e2d7120e..b26642daa3d2 100644
> --- a/fs/f2fs/node.c
> +++ b/fs/f2fs/node.c
> @@ -552,7 +552,7 @@ int f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid,
>   	int i;
>   
>   	ni->nid = nid;
> -
> +retry:
>   	/* Check nat cache */
>   	down_read(&nm_i->nat_tree_lock);
>   	e = __lookup_nat_cache(nm_i, nid);
> @@ -564,10 +564,19 @@ int f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid,
>   		return 0;
>   	}
>   
> -	memset(&ne, 0, sizeof(struct f2fs_nat_entry));
> +	/*
> +	 * Check current segment summary by trying to grab journal_rwsem first.
> +	 * This sem is on the critical path on the checkpoint requiring the above
> +	 * nat_tree_lock. Therefore, we should retry, if we failed to grab here
> +	 * while not bothering checkpoint.
> +	 */
> +	if (rwsem_is_locked(&sbi->cp_global_sem)) {

You mean: if (!rwsem_is_locked()) ?

IMO, once CP is processing, all readers who needs read nat will be blocked...

Thanks,

> +		down_read(&curseg->journal_rwsem);
> +	} else if (!down_read_trylock(&curseg->journal_rwsem)) {
> +		up_read(&nm_i->nat_tree_lock);
> +		goto retry;
> +	}
>   
> -	/* Check current segment summary */
> -	down_read(&curseg->journal_rwsem);
>   	i = f2fs_lookup_journal_in_cursum(journal, NAT_JOURNAL, nid, 0);
>   	if (i >= 0) {
>   		ne = nat_in_journal(journal, i);
> 

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [f2fs-dev] [PATCH v4] f2fs: don't sleep while grabing nat_tree_lock
  2021-07-23  2:35   ` [PATCH v3] " Jaegeuk Kim
  2021-07-23  2:41     ` [f2fs-dev] " Chao Yu
@ 2021-07-23  4:27     ` Jaegeuk Kim
  1 sibling, 0 replies; 7+ messages in thread
From: Jaegeuk Kim @ 2021-07-23  4:27 UTC (permalink / raw)
  To: linux-kernel, linux-f2fs-devel

This tries to fix priority inversion in the below condition resulting in
long checkpoint delay.

f2fs_get_node_info()
 - nat_tree_lock
  -> sleep to grab journal_rwsem by contention

                                     checkpoint
                                     - waiting for nat_tree_lock

In order to let checkpoint go, let's release nat_tree_lock, if there's a
journal_rwsem contention.

Signed-off-by: Daeho Jeong <daehojeong@google.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---

 Change log from v3:
  - fix wrong condition check

 fs/f2fs/node.c | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 0be9e2d7120e..c945a9730f3c 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -552,7 +552,7 @@ int f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid,
 	int i;
 
 	ni->nid = nid;
-
+retry:
 	/* Check nat cache */
 	down_read(&nm_i->nat_tree_lock);
 	e = __lookup_nat_cache(nm_i, nid);
@@ -564,10 +564,19 @@ int f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid,
 		return 0;
 	}
 
-	memset(&ne, 0, sizeof(struct f2fs_nat_entry));
+	/*
+	 * Check current segment summary by trying to grab journal_rwsem first.
+	 * This sem is on the critical path on the checkpoint requiring the above
+	 * nat_tree_lock. Therefore, we should retry, if we failed to grab here
+	 * while not bothering checkpoint.
+	 */
+	if (!rwsem_is_locked(&sbi->cp_global_sem)) {
+		down_read(&curseg->journal_rwsem);
+	} else if (!down_read_trylock(&curseg->journal_rwsem)) {
+		up_read(&nm_i->nat_tree_lock);
+		goto retry;
+	}
 
-	/* Check current segment summary */
-	down_read(&curseg->journal_rwsem);
 	i = f2fs_lookup_journal_in_cursum(journal, NAT_JOURNAL, nid, 0);
 	if (i >= 0) {
 		ne = nat_in_journal(journal, i);
-- 
2.32.0.432.gabb21c7263-goog


^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2021-07-23  4:27 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-07-22  1:41 [PATCH] f2fs: use rwlock instead of rwsem for journal Jaegeuk Kim
2021-07-22 13:41 ` [f2fs-dev] " Chao Yu
2021-07-22 17:44 ` [PATCH v2] f2fs: don't sleep while grabing nat_tree_lock Jaegeuk Kim
2021-07-23  0:39   ` [f2fs-dev] " Chao Yu
2021-07-23  2:35   ` [PATCH v3] " Jaegeuk Kim
2021-07-23  2:41     ` [f2fs-dev] " Chao Yu
2021-07-23  4:27     ` [f2fs-dev] [PATCH v4] " Jaegeuk Kim

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).