LKML Archive on lore.kernel.org
help / color / mirror / Atom feed
* [PATCH 1/2] zram: free meta out of init_lock
@ 2015-01-23  5:58 Minchan Kim
  2015-01-23  5:58 ` [PATCH 2/2] zram: protect zram->stat race with init_lock Minchan Kim
                   ` (2 more replies)
  0 siblings, 3 replies; 32+ messages in thread
From: Minchan Kim @ 2015-01-23  5:58 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-kernel, linux-mm, Nitin Gupta, Jerome Marchand,
	Sergey Senozhatsky, Minchan Kim

We don't need to call zram_meta_free, zcomp_destroy and zs_free
under init_lock. The only thing reset has to do under init_lock to
avoid races is set zram->meta to NULL (i.e., the init_done state).
This patch does that.

Signed-off-by: Minchan Kim <minchan@kernel.org>
---
 drivers/block/zram/zram_drv.c | 28 ++++++++++++++++------------
 1 file changed, 16 insertions(+), 12 deletions(-)

diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 9250b3f54a8f..0299d82275e7 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -708,6 +708,7 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity)
 {
 	size_t index;
 	struct zram_meta *meta;
+	struct zcomp *comp;
 
 	down_write(&zram->init_lock);
 
@@ -719,20 +720,10 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity)
 	}
 
 	meta = zram->meta;
-	/* Free all pages that are still in this zram device */
-	for (index = 0; index < zram->disksize >> PAGE_SHIFT; index++) {
-		unsigned long handle = meta->table[index].handle;
-		if (!handle)
-			continue;
-
-		zs_free(meta->mem_pool, handle);
-	}
-
-	zcomp_destroy(zram->comp);
+	comp = zram->comp;
+	zram->meta = NULL;
 	zram->max_comp_streams = 1;
 
-	zram_meta_free(zram->meta);
-	zram->meta = NULL;
 	/* Reset stats */
 	memset(&zram->stats, 0, sizeof(zram->stats));
 
@@ -742,6 +733,19 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity)
 
 	up_write(&zram->init_lock);
 
+	/* Free all pages that are still in this zram device */
+	for (index = 0; index < zram->disksize >> PAGE_SHIFT; index++) {
+		unsigned long handle = meta->table[index].handle;
+
+		if (!handle)
+			continue;
+
+		zs_free(meta->mem_pool, handle);
+	}
+
+	zcomp_destroy(comp);
+	zram_meta_free(meta);
+
 	/*
 	 * Revalidate disk out of the init_lock to avoid lockdep splat.
 	 * It's okay because disk's capacity is protected by init_lock
-- 
1.9.1


^ permalink raw reply	[flat|nested] 32+ messages in thread

* [PATCH 2/2] zram: protect zram->stat race with init_lock
  2015-01-23  5:58 [PATCH 1/2] zram: free meta out of init_lock Minchan Kim
@ 2015-01-23  5:58 ` Minchan Kim
  2015-01-23 13:45   ` Jerome Marchand
  2015-01-23 14:38   ` Sergey Senozhatsky
  2015-01-23 13:07 ` [PATCH 1/2] zram: free meta out of init_lock Jerome Marchand
  2015-01-23 14:24 ` Sergey Senozhatsky
  2 siblings, 2 replies; 32+ messages in thread
From: Minchan Kim @ 2015-01-23  5:58 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-kernel, linux-mm, Nitin Gupta, Jerome Marchand,
	Sergey Senozhatsky, Minchan Kim

The zram->stat handling should be protected by init_lock.
Otherwise, a user could see a stale value from the stats.

Signed-off-by: Minchan Kim <minchan@kernel.org>
---

I don't think it's stable material. The race is rare in practice
and reading a stale stat value is not critical.

 drivers/block/zram/zram_drv.c | 37 ++++++++++++++++++++++++++++---------
 1 file changed, 28 insertions(+), 9 deletions(-)

diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 0299d82275e7..53f176f590b0 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -48,8 +48,13 @@ static ssize_t name##_show(struct device *d,		\
 				struct device_attribute *attr, char *b)	\
 {									\
 	struct zram *zram = dev_to_zram(d);				\
-	return scnprintf(b, PAGE_SIZE, "%llu\n",			\
-		(u64)atomic64_read(&zram->stats.name));			\
+	u64 val = 0;							\
+									\
+	down_read(&zram->init_lock);					\
+	if (init_done(zram))						\
+		val = atomic64_read(&zram->stats.name);			\
+	up_read(&zram->init_lock);					\
+	return scnprintf(b, PAGE_SIZE, "%llu\n", val);			\
 }									\
 static DEVICE_ATTR_RO(name);
 
@@ -67,8 +72,14 @@ static ssize_t disksize_show(struct device *dev,
 		struct device_attribute *attr, char *buf)
 {
 	struct zram *zram = dev_to_zram(dev);
+	u64 val = 0;
+
+	down_read(&zram->init_lock);
+	if (init_done(zram))
+		val = zram->disksize;
+	up_read(&zram->init_lock);
 
-	return scnprintf(buf, PAGE_SIZE, "%llu\n", zram->disksize);
+	return scnprintf(buf, PAGE_SIZE, "%llu\n", val);
 }
 
 static ssize_t initstate_show(struct device *dev,
@@ -88,9 +99,14 @@ static ssize_t orig_data_size_show(struct device *dev,
 		struct device_attribute *attr, char *buf)
 {
 	struct zram *zram = dev_to_zram(dev);
+	u64 val = 0;
+
+	down_read(&zram->init_lock);
+	if (init_done(zram))
+		val = atomic64_read(&zram->stats.pages_stored) << PAGE_SHIFT;
+	up_read(&zram->init_lock);
 
-	return scnprintf(buf, PAGE_SIZE, "%llu\n",
-		(u64)(atomic64_read(&zram->stats.pages_stored)) << PAGE_SHIFT);
+	return scnprintf(buf, PAGE_SIZE, "%llu\n", val);
 }
 
 static ssize_t mem_used_total_show(struct device *dev,
@@ -957,10 +973,6 @@ static int zram_rw_page(struct block_device *bdev, sector_t sector,
 	struct bio_vec bv;
 
 	zram = bdev->bd_disk->private_data;
-	if (!valid_io_request(zram, sector, PAGE_SIZE)) {
-		atomic64_inc(&zram->stats.invalid_io);
-		return -EINVAL;
-	}
 
 	down_read(&zram->init_lock);
 	if (unlikely(!init_done(zram))) {
@@ -968,6 +980,13 @@ static int zram_rw_page(struct block_device *bdev, sector_t sector,
 		goto out_unlock;
 	}
 
+	if (!valid_io_request(zram, sector, PAGE_SIZE)) {
+		atomic64_inc(&zram->stats.invalid_io);
+		err = -EINVAL;
+		goto out_unlock;
+	}
+
+
 	index = sector >> SECTORS_PER_PAGE_SHIFT;
 	offset = sector & (SECTORS_PER_PAGE - 1) << SECTOR_SHIFT;
 
-- 
1.9.1


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/2] zram: free meta out of init_lock
  2015-01-23  5:58 [PATCH 1/2] zram: free meta out of init_lock Minchan Kim
  2015-01-23  5:58 ` [PATCH 2/2] zram: protect zram->stat race with init_lock Minchan Kim
@ 2015-01-23 13:07 ` Jerome Marchand
  2015-01-23 14:24 ` Sergey Senozhatsky
  2 siblings, 0 replies; 32+ messages in thread
From: Jerome Marchand @ 2015-01-23 13:07 UTC (permalink / raw)
  To: Minchan Kim, Andrew Morton
  Cc: linux-kernel, linux-mm, Nitin Gupta, Sergey Senozhatsky

[-- Attachment #1: Type: text/plain, Size: 1348 bytes --]

On 01/23/2015 06:58 AM, Minchan Kim wrote:
> We don't need to call zram_meta_free, zcomp_destroy and zs_free
> under init_lock. What we need to prevent race with init_lock
> in reset is setting NULL into zram->meta (ie, init_done).
> This patch does it.
> 
> Signed-off-by: Minchan Kim <minchan@kernel.org>

Acked-by: Jerome Marchand <jmarchan@redhat.com>

On a side note, when zram->meta replaced init_done, no comment was
added in zram structure to explain that. Things could be made more
explicit.

---
Subject: [PATCH] zram: explicitly state that zram->meta is used to determine
 the init state

zram->meta is used to determine the initialization state of a zram structure.
This patch adds a comment to zram structure to make this clear.

Signed-off-by: Jerome Marchand <jmarchan@redhat.com>
---
 drivers/block/zram/zram_drv.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
index b05a816..551569a 100644
--- a/drivers/block/zram/zram_drv.h
+++ b/drivers/block/zram/zram_drv.h
@@ -99,7 +99,7 @@ struct zram_meta {
 };
 
 struct zram {
-	struct zram_meta *meta;
+	struct zram_meta *meta;	/* also used to determine the init state */
 	struct request_queue *queue;
 	struct gendisk *disk;
 	struct zcomp *comp;
-- 
1.9.3


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 473 bytes --]

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 2/2] zram: protect zram->stat race with init_lock
  2015-01-23  5:58 ` [PATCH 2/2] zram: protect zram->stat race with init_lock Minchan Kim
@ 2015-01-23 13:45   ` Jerome Marchand
  2015-01-23 14:38   ` Sergey Senozhatsky
  1 sibling, 0 replies; 32+ messages in thread
From: Jerome Marchand @ 2015-01-23 13:45 UTC (permalink / raw)
  To: Minchan Kim, Andrew Morton
  Cc: linux-kernel, linux-mm, Nitin Gupta, Sergey Senozhatsky

[-- Attachment #1: Type: text/plain, Size: 269 bytes --]

On 01/23/2015 06:58 AM, Minchan Kim wrote:
> The zram->stat handling should be procted by init_lock.
> Otherwise, user could see stale value from the stat.
> 
> Signed-off-by: Minchan Kim <minchan@kernel.org>

Acked-by: Jerome Marchand <jmarchan@redhat.com>



[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 473 bytes --]

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/2] zram: free meta out of init_lock
  2015-01-23  5:58 [PATCH 1/2] zram: free meta out of init_lock Minchan Kim
  2015-01-23  5:58 ` [PATCH 2/2] zram: protect zram->stat race with init_lock Minchan Kim
  2015-01-23 13:07 ` [PATCH 1/2] zram: free meta out of init_lock Jerome Marchand
@ 2015-01-23 14:24 ` Sergey Senozhatsky
  2015-01-23 14:48   ` Jerome Marchand
  2 siblings, 1 reply; 32+ messages in thread
From: Sergey Senozhatsky @ 2015-01-23 14:24 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Andrew Morton, linux-kernel, linux-mm, Nitin Gupta,
	Jerome Marchand, Sergey Senozhatsky

On (01/23/15 14:58), Minchan Kim wrote:
> We don't need to call zram_meta_free, zcomp_destroy and zs_free
> under init_lock. What we need to prevent race with init_lock
> in reset is setting NULL into zram->meta (ie, init_done).
> This patch does it.
> 
> Signed-off-by: Minchan Kim <minchan@kernel.org>
> ---
>  drivers/block/zram/zram_drv.c | 28 ++++++++++++++++------------
>  1 file changed, 16 insertions(+), 12 deletions(-)
> 
> diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
> index 9250b3f54a8f..0299d82275e7 100644
> --- a/drivers/block/zram/zram_drv.c
> +++ b/drivers/block/zram/zram_drv.c
> @@ -708,6 +708,7 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity)
>  {
>  	size_t index;
>  	struct zram_meta *meta;
> +	struct zcomp *comp;
>  
>  	down_write(&zram->init_lock);
>  
> @@ -719,20 +720,10 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity)
>  	}
>  
>  	meta = zram->meta;
> -	/* Free all pages that are still in this zram device */
> -	for (index = 0; index < zram->disksize >> PAGE_SHIFT; index++) {
> -		unsigned long handle = meta->table[index].handle;
> -		if (!handle)
> -			continue;
> -
> -		zs_free(meta->mem_pool, handle);
> -	}
> -
> -	zcomp_destroy(zram->comp);

I'm not so sure about moving zcomp destruction. if we had detached it
from zram, then yes. otherwise, think of the zram ->destroy vs ->init race.

suppose,
CPU1 waits for down_write() init lock in disksize_store() with new comp already allocated;
CPU0 detaches ->meta and releases write init lock;
CPU1 grabs the lock and does zram->comp = comp;
CPU0 reaches the point of zcomp_destroy(zram->comp);


I'd probably prefer to keep zcomp destruction in its current place. I
see little real value in introducing zcomp detaching and moving
destruction out of init_lock.

	-ss

> +	comp = zram->comp;
> +	zram->meta = NULL;
>  	zram->max_comp_streams = 1;
>  
> -	zram_meta_free(zram->meta);
> -	zram->meta = NULL;
>  	/* Reset stats */
>  	memset(&zram->stats, 0, sizeof(zram->stats));
>  
> @@ -742,6 +733,19 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity)
>  
>  	up_write(&zram->init_lock);
>  
> +	/* Free all pages that are still in this zram device */
> +	for (index = 0; index < zram->disksize >> PAGE_SHIFT; index++) {
> +		unsigned long handle = meta->table[index].handle;
> +
> +		if (!handle)
> +			continue;
> +
> +		zs_free(meta->mem_pool, handle);
> +	}
> +
> +	zcomp_destroy(comp);
> +	zram_meta_free(meta);
> +
>  	/*
>  	 * Revalidate disk out of the init_lock to avoid lockdep splat.
>  	 * It's okay because disk's capacity is protected by init_lock
> -- 
> 1.9.1
> 

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 2/2] zram: protect zram->stat race with init_lock
  2015-01-23  5:58 ` [PATCH 2/2] zram: protect zram->stat race with init_lock Minchan Kim
  2015-01-23 13:45   ` Jerome Marchand
@ 2015-01-23 14:38   ` Sergey Senozhatsky
  2015-01-24 13:17     ` Ganesh Mahendran
  1 sibling, 1 reply; 32+ messages in thread
From: Sergey Senozhatsky @ 2015-01-23 14:38 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Andrew Morton, linux-kernel, linux-mm, Nitin Gupta,
	Jerome Marchand, Sergey Senozhatsky

On (01/23/15 14:58), Minchan Kim wrote:
> The zram->stat handling should be procted by init_lock.
> Otherwise, user could see stale value from the stat.
> 
> Signed-off-by: Minchan Kim <minchan@kernel.org>
> ---
> 
> I don't think it's stable material. The race is rare in real practice
> and this stale stat value read is not a critical.
> 
>  drivers/block/zram/zram_drv.c | 37 ++++++++++++++++++++++++++++---------
>  1 file changed, 28 insertions(+), 9 deletions(-)
> 
> diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
> index 0299d82275e7..53f176f590b0 100644
> --- a/drivers/block/zram/zram_drv.c
> +++ b/drivers/block/zram/zram_drv.c
> @@ -48,8 +48,13 @@ static ssize_t name##_show(struct device *d,		\
>  				struct device_attribute *attr, char *b)	\
>  {									\

a side note: I wasn't Cc'd on that patchset and found out about it only after
it had been merged. I'm not sure I understand why it has been renamed from the
specific zram_X_show to X_show. what gives?


can't help it, it catches my eye every time: that rename has broken the
original formatting:


diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 9250b3f..c567af5 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -44,7 +44,7 @@ static const char *default_compressor = "lzo";
 static unsigned int num_devices = 1;
 
 #define ZRAM_ATTR_RO(name)						\
-static ssize_t name##_show(struct device *d,		\
+static ssize_t name##_show(struct device *d,				\
 				struct device_attribute *attr, char *b)	\
 {									\
 	struct zram *zram = dev_to_zram(d);				\



I don't have any objections. but do we really want to wrap atomic ops in
a semaphore? is it really such a serious race?


	-ss

>  	struct zram *zram = dev_to_zram(d);				\
> -	return scnprintf(b, PAGE_SIZE, "%llu\n",			\
> -		(u64)atomic64_read(&zram->stats.name));			\
> +	u64 val = 0;							\
> +									\
> +	down_read(&zram->init_lock);					\
> +	if (init_done(zram))						\
> +		val = atomic64_read(&zram->stats.name);			\
> +	up_read(&zram->init_lock);					\
> +	return scnprintf(b, PAGE_SIZE, "%llu\n", val);			\
>  }									\
>  static DEVICE_ATTR_RO(name);
>  
> @@ -67,8 +72,14 @@ static ssize_t disksize_show(struct device *dev,
>  		struct device_attribute *attr, char *buf)
>  {
>  	struct zram *zram = dev_to_zram(dev);
> +	u64 val = 0;
> +
> +	down_read(&zram->init_lock);
> +	if (init_done(zram))
> +		val = zram->disksize;
> +	up_read(&zram->init_lock);
>  
> -	return scnprintf(buf, PAGE_SIZE, "%llu\n", zram->disksize);
> +	return scnprintf(buf, PAGE_SIZE, "%llu\n", val);
>  }
>  
>  static ssize_t initstate_show(struct device *dev,
> @@ -88,9 +99,14 @@ static ssize_t orig_data_size_show(struct device *dev,
>  		struct device_attribute *attr, char *buf)
>  {
>  	struct zram *zram = dev_to_zram(dev);
> +	u64 val = 0;
> +
> +	down_read(&zram->init_lock);
> +	if (init_done(zram))
> +		val = atomic64_read(&zram->stats.pages_stored) << PAGE_SHIFT;
> +	up_read(&zram->init_lock);
>  
> -	return scnprintf(buf, PAGE_SIZE, "%llu\n",
> -		(u64)(atomic64_read(&zram->stats.pages_stored)) << PAGE_SHIFT);
> +	return scnprintf(buf, PAGE_SIZE, "%llu\n", val);
>  }
>  
>  static ssize_t mem_used_total_show(struct device *dev,
> @@ -957,10 +973,6 @@ static int zram_rw_page(struct block_device *bdev, sector_t sector,
>  	struct bio_vec bv;
>  
>  	zram = bdev->bd_disk->private_data;
> -	if (!valid_io_request(zram, sector, PAGE_SIZE)) {
> -		atomic64_inc(&zram->stats.invalid_io);
> -		return -EINVAL;
> -	}
>  
>  	down_read(&zram->init_lock);
>  	if (unlikely(!init_done(zram))) {
> @@ -968,6 +980,13 @@ static int zram_rw_page(struct block_device *bdev, sector_t sector,
>  		goto out_unlock;
>  	}
>  
> +	if (!valid_io_request(zram, sector, PAGE_SIZE)) {
> +		atomic64_inc(&zram->stats.invalid_io);
> +		err = -EINVAL;
> +		goto out_unlock;
> +	}
> +
> +
>  	index = sector >> SECTORS_PER_PAGE_SHIFT;
>  	offset = sector & (SECTORS_PER_PAGE - 1) << SECTOR_SHIFT;
>  
> -- 
> 1.9.1
> 

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/2] zram: free meta out of init_lock
  2015-01-23 14:24 ` Sergey Senozhatsky
@ 2015-01-23 14:48   ` Jerome Marchand
  2015-01-23 15:47     ` Sergey Senozhatsky
  0 siblings, 1 reply; 32+ messages in thread
From: Jerome Marchand @ 2015-01-23 14:48 UTC (permalink / raw)
  To: Sergey Senozhatsky, Minchan Kim
  Cc: Andrew Morton, linux-kernel, linux-mm, Nitin Gupta

[-- Attachment #1: Type: text/plain, Size: 3067 bytes --]

On 01/23/2015 03:24 PM, Sergey Senozhatsky wrote:
> On (01/23/15 14:58), Minchan Kim wrote:
>> We don't need to call zram_meta_free, zcomp_destroy and zs_free
>> under init_lock. What we need to prevent race with init_lock
>> in reset is setting NULL into zram->meta (ie, init_done).
>> This patch does it.
>>
>> Signed-off-by: Minchan Kim <minchan@kernel.org>
>> ---
>>  drivers/block/zram/zram_drv.c | 28 ++++++++++++++++------------
>>  1 file changed, 16 insertions(+), 12 deletions(-)
>>
>> diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
>> index 9250b3f54a8f..0299d82275e7 100644
>> --- a/drivers/block/zram/zram_drv.c
>> +++ b/drivers/block/zram/zram_drv.c
>> @@ -708,6 +708,7 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity)
>>  {
>>  	size_t index;
>>  	struct zram_meta *meta;
>> +	struct zcomp *comp;
>>  
>>  	down_write(&zram->init_lock);
>>  
>> @@ -719,20 +720,10 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity)
>>  	}
>>  
>>  	meta = zram->meta;
>> -	/* Free all pages that are still in this zram device */
>> -	for (index = 0; index < zram->disksize >> PAGE_SHIFT; index++) {
>> -		unsigned long handle = meta->table[index].handle;
>> -		if (!handle)
>> -			continue;
>> -
>> -		zs_free(meta->mem_pool, handle);
>> -	}
>> -
>> -	zcomp_destroy(zram->comp);
> 
> I'm not so sure about moving zcomp destruction. if we would have detached it
> from zram, then yes. otherwise, think of zram ->destoy vs ->init race.
> 
> suppose,
> CPU1 waits for down_write() init lock in disksize_store() with new comp already allocated;
> CPU0 detaches ->meta and releases write init lock;
> CPU1 grabs the lock and does zram->comp = comp;
> CPU0 reaches the point of zcomp_destroy(zram->comp);

I don't see your point: this patch does not call
zcomp_destroy(zram->comp) anymore, but zcomp_destroy(comp), where comp is
the old zram->comp.

> 
> 
> I'd probably prefer to keep zcomp destruction on its current place. I
> see a little real value in introducing zcomp detaching and moving
> destruction out of init_lock.
> 
> 	-ss
> 
>> +	comp = zram->comp;
>> +	zram->meta = NULL;
>>  	zram->max_comp_streams = 1;
>>  
>> -	zram_meta_free(zram->meta);
>> -	zram->meta = NULL;
>>  	/* Reset stats */
>>  	memset(&zram->stats, 0, sizeof(zram->stats));
>>  
>> @@ -742,6 +733,19 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity)
>>  
>>  	up_write(&zram->init_lock);
>>  
>> +	/* Free all pages that are still in this zram device */
>> +	for (index = 0; index < zram->disksize >> PAGE_SHIFT; index++) {
>> +		unsigned long handle = meta->table[index].handle;
>> +
>> +		if (!handle)
>> +			continue;
>> +
>> +		zs_free(meta->mem_pool, handle);
>> +	}
>> +
>> +	zcomp_destroy(comp);
>> +	zram_meta_free(meta);
>> +
>>  	/*
>>  	 * Revalidate disk out of the init_lock to avoid lockdep splat.
>>  	 * It's okay because disk's capacity is protected by init_lock
>> -- 
>> 1.9.1
>>



[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 473 bytes --]

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/2] zram: free meta out of init_lock
  2015-01-23 14:48   ` Jerome Marchand
@ 2015-01-23 15:47     ` Sergey Senozhatsky
  2015-01-26  1:33       ` Minchan Kim
  0 siblings, 1 reply; 32+ messages in thread
From: Sergey Senozhatsky @ 2015-01-23 15:47 UTC (permalink / raw)
  To: Jerome Marchand
  Cc: Sergey Senozhatsky, Minchan Kim, Andrew Morton, linux-kernel,
	linux-mm, Nitin Gupta

On (01/23/15 15:48), Jerome Marchand wrote:
> Date: Fri, 23 Jan 2015 15:48:05 +0100
> From: Jerome Marchand <jmarchan@redhat.com>
> To: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>, Minchan Kim
>  <minchan@kernel.org>
> CC: Andrew Morton <akpm@linux-foundation.org>,
>  linux-kernel@vger.kernel.org, linux-mm@kvack.org, Nitin Gupta
>  <ngupta@vflare.org>
> Subject: Re: [PATCH 1/2] zram: free meta out of init_lock
> User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:31.0) Gecko/20100101
>  Thunderbird/31.3.0
> 
> On 01/23/2015 03:24 PM, Sergey Senozhatsky wrote:
> > On (01/23/15 14:58), Minchan Kim wrote:
> >> We don't need to call zram_meta_free, zcomp_destroy and zs_free
> >> under init_lock. What we need to prevent race with init_lock
> >> in reset is setting NULL into zram->meta (ie, init_done).
> >> This patch does it.
> >>
> >> Signed-off-by: Minchan Kim <minchan@kernel.org>
> >> ---
> >>  drivers/block/zram/zram_drv.c | 28 ++++++++++++++++------------
> >>  1 file changed, 16 insertions(+), 12 deletions(-)
> >>
> >> diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
> >> index 9250b3f54a8f..0299d82275e7 100644
> >> --- a/drivers/block/zram/zram_drv.c
> >> +++ b/drivers/block/zram/zram_drv.c
> >> @@ -708,6 +708,7 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity)
> >>  {
> >>  	size_t index;
> >>  	struct zram_meta *meta;
> >> +	struct zcomp *comp;
> >>  
> >>  	down_write(&zram->init_lock);
> >>  
> >> @@ -719,20 +720,10 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity)
> >>  	}
> >>  
> >>  	meta = zram->meta;
> >> -	/* Free all pages that are still in this zram device */
> >> -	for (index = 0; index < zram->disksize >> PAGE_SHIFT; index++) {
> >> -		unsigned long handle = meta->table[index].handle;
> >> -		if (!handle)
> >> -			continue;
> >> -
> >> -		zs_free(meta->mem_pool, handle);
> >> -	}
> >> -
> >> -	zcomp_destroy(zram->comp);
> > 
> > I'm not so sure about moving zcomp destruction. if we would have detached it
> > from zram, then yes. otherwise, think of zram ->destoy vs ->init race.
> > 
> > suppose,
> > CPU1 waits for down_write() init lock in disksize_store() with new comp already allocated;
> > CPU0 detaches ->meta and releases write init lock;
> > CPU1 grabs the lock and does zram->comp = comp;
> > CPU0 reaches the point of zcomp_destroy(zram->comp);
> 
> I don't see your point: this patch does not call
> zcomp_destroy(zram->comp) anymore, but zram_destroy(comp), where comp is
> the old zram->comp.


oh... yes. sorry! my bad.



anyway, on second thought, do we even want to destroy meta out of init_lock?

I mean, it will let you init a new device quicker. but... assume you have a
30G zram (or any other bad-enough number). on CPU0 you reset the device --
iterate over the 30G meta->table, etc. -- out of init_lock.
on CPU1 you concurrently re-init the device and request 30G again.

how bad can that be?



disksize_store called on an already initialised device is also not so perfect.
we will first try to allocate ->meta (vmalloc pages for another 30G),
then allocate comp, then down_write() the init lock, only to find out that the
device is already initialised and we need to release the allocated memory.



maybe we'd better keep ->meta destruction under init_lock and additionally
move ->meta and ->comp allocation under init_lock in disksize_store()?

like the following one:

---

 drivers/block/zram/zram_drv.c | 25 +++++++++++++------------
 1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 9250b3f..827ab21 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -765,9 +765,18 @@ static ssize_t disksize_store(struct device *dev,
 		return -EINVAL;
 
 	disksize = PAGE_ALIGN(disksize);
+	down_write(&zram->init_lock);
+	if (init_done(zram)) {
+		up_write(&zram->init_lock);
+		pr_info("Cannot change disksize for initialized device\n");
+		return -EBUSY;
+	}
+
 	meta = zram_meta_alloc(zram->disk->first_minor, disksize);
-	if (!meta)
-		return -ENOMEM;
+	if (!meta) {
+		err = -ENOMEM;
+		goto out_unlock;
+	}
 
 	comp = zcomp_create(zram->compressor, zram->max_comp_streams);
 	if (IS_ERR(comp)) {
@@ -777,13 +786,6 @@ static ssize_t disksize_store(struct device *dev,
 		goto out_free_meta;
 	}
 
-	down_write(&zram->init_lock);
-	if (init_done(zram)) {
-		pr_info("Cannot change disksize for initialized device\n");
-		err = -EBUSY;
-		goto out_destroy_comp;
-	}
-
 	zram->meta = meta;
 	zram->comp = comp;
 	zram->disksize = disksize;
@@ -799,11 +801,10 @@ static ssize_t disksize_store(struct device *dev,
 
 	return len;
 
-out_destroy_comp:
-	up_write(&zram->init_lock);
-	zcomp_destroy(comp);
 out_free_meta:
 	zram_meta_free(meta);
+out_unlock:
+	up_write(&zram->init_lock);
 	return err;
 }
 

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 2/2] zram: protect zram->stat race with init_lock
  2015-01-23 14:38   ` Sergey Senozhatsky
@ 2015-01-24 13:17     ` Ganesh Mahendran
  2015-01-25 14:38       ` Sergey Senozhatsky
  0 siblings, 1 reply; 32+ messages in thread
From: Ganesh Mahendran @ 2015-01-24 13:17 UTC (permalink / raw)
  To: Sergey Senozhatsky
  Cc: Minchan Kim, Andrew Morton, linux-kernel, Linux-MM, Nitin Gupta,
	Jerome Marchand

Hello Sergey

2015-01-23 22:38 GMT+08:00 Sergey Senozhatsky <sergey.senozhatsky@gmail.com>:
> On (01/23/15 14:58), Minchan Kim wrote:
>> The zram->stat handling should be procted by init_lock.
>> Otherwise, user could see stale value from the stat.
>>
>> Signed-off-by: Minchan Kim <minchan@kernel.org>
>> ---
>>
>> I don't think it's stable material. The race is rare in real practice
>> and this stale stat value read is not a critical.
>>
>>  drivers/block/zram/zram_drv.c | 37 ++++++++++++++++++++++++++++---------
>>  1 file changed, 28 insertions(+), 9 deletions(-)
>>
>> diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
>> index 0299d82275e7..53f176f590b0 100644
>> --- a/drivers/block/zram/zram_drv.c
>> +++ b/drivers/block/zram/zram_drv.c
>> @@ -48,8 +48,13 @@ static ssize_t name##_show(struct device *d,               \
>>                               struct device_attribute *attr, char *b) \
>>  {                                                                    \
>
> a side note: I wasn't Cc'd in that patchset and found out it only when it's
> been merged. I'm not sure I understand, why it has been renamed from specific
> zram_X_show to X_show. what gives?

I changed from zram_attr_##name##_show to name##_show in commit:
fcf1bce zram: use DEVICE_ATTR_[RW|RO|WO] to define zram sys device attribute

I just wanted to keep the names consistent with the others, like
disksize_show(), initstate_show().

Thanks.

>
>
> can't help, catches my eye every time, that rename has broken the original
> formatting:
>
>
> diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
> index 9250b3f..c567af5 100644
> --- a/drivers/block/zram/zram_drv.c
> +++ b/drivers/block/zram/zram_drv.c
> @@ -44,7 +44,7 @@ static const char *default_compressor = "lzo";
>  static unsigned int num_devices = 1;
>
>  #define ZRAM_ATTR_RO(name)                                             \
> -static ssize_t name##_show(struct device *d,           \
> +static ssize_t name##_show(struct device *d,                           \
>                                 struct device_attribute *attr, char *b) \
>  {                                                                      \
>         struct zram *zram = dev_to_zram(d);                             \
>
>
>
> I don't have any objections. but do we really want to wrap atomic ops in
> semaphore? it is really such serious race?
>
>
>         -ss
>
>>       struct zram *zram = dev_to_zram(d);                             \
>> -     return scnprintf(b, PAGE_SIZE, "%llu\n",                        \
>> -             (u64)atomic64_read(&zram->stats.name));                 \
>> +     u64 val = 0;                                                    \
>> +                                                                     \
>> +     down_read(&zram->init_lock);                                    \
>> +     if (init_done(zram))                                            \
>> +             val = atomic64_read(&zram->stats.name);                 \
>> +     up_read(&zram->init_lock);                                      \
>> +     return scnprintf(b, PAGE_SIZE, "%llu\n", val);                  \
>>  }                                                                    \
>>  static DEVICE_ATTR_RO(name);
>>
>> @@ -67,8 +72,14 @@ static ssize_t disksize_show(struct device *dev,
>>               struct device_attribute *attr, char *buf)
>>  {
>>       struct zram *zram = dev_to_zram(dev);
>> +     u64 val = 0;
>> +
>> +     down_read(&zram->init_lock);
>> +     if (init_done(zram))
>> +             val = zram->disksize;
>> +     up_read(&zram->init_lock);
>>
>> -     return scnprintf(buf, PAGE_SIZE, "%llu\n", zram->disksize);
>> +     return scnprintf(buf, PAGE_SIZE, "%llu\n", val);
>>  }
>>
>>  static ssize_t initstate_show(struct device *dev,
>> @@ -88,9 +99,14 @@ static ssize_t orig_data_size_show(struct device *dev,
>>               struct device_attribute *attr, char *buf)
>>  {
>>       struct zram *zram = dev_to_zram(dev);
>> +     u64 val = 0;
>> +
>> +     down_read(&zram->init_lock);
>> +     if (init_done(zram))
>> +             val = atomic64_read(&zram->stats.pages_stored) << PAGE_SHIFT;
>> +     up_read(&zram->init_lock);
>>
>> -     return scnprintf(buf, PAGE_SIZE, "%llu\n",
>> -             (u64)(atomic64_read(&zram->stats.pages_stored)) << PAGE_SHIFT);
>> +     return scnprintf(buf, PAGE_SIZE, "%llu\n", val);
>>  }
>>
>>  static ssize_t mem_used_total_show(struct device *dev,
>> @@ -957,10 +973,6 @@ static int zram_rw_page(struct block_device *bdev, sector_t sector,
>>       struct bio_vec bv;
>>
>>       zram = bdev->bd_disk->private_data;
>> -     if (!valid_io_request(zram, sector, PAGE_SIZE)) {
>> -             atomic64_inc(&zram->stats.invalid_io);
>> -             return -EINVAL;
>> -     }
>>
>>       down_read(&zram->init_lock);
>>       if (unlikely(!init_done(zram))) {
>> @@ -968,6 +980,13 @@ static int zram_rw_page(struct block_device *bdev, sector_t sector,
>>               goto out_unlock;
>>       }
>>
>> +     if (!valid_io_request(zram, sector, PAGE_SIZE)) {
>> +             atomic64_inc(&zram->stats.invalid_io);
>> +             err = -EINVAL;
>> +             goto out_unlock;
>> +     }
>> +
>> +
>>       index = sector >> SECTORS_PER_PAGE_SHIFT;
>>       offset = sector & (SECTORS_PER_PAGE - 1) << SECTOR_SHIFT;
>>
>> --
>> 1.9.1
>>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 2/2] zram: protect zram->stat race with init_lock
  2015-01-24 13:17     ` Ganesh Mahendran
@ 2015-01-25 14:38       ` Sergey Senozhatsky
  0 siblings, 0 replies; 32+ messages in thread
From: Sergey Senozhatsky @ 2015-01-25 14:38 UTC (permalink / raw)
  To: Ganesh Mahendran
  Cc: Sergey Senozhatsky, Minchan Kim, Andrew Morton, linux-kernel,
	Linux-MM, Nitin Gupta, Jerome Marchand

Hello,

On (01/24/15 21:17), Ganesh Mahendran wrote:
> Hello Sergey
> 
> 2015-01-23 22:38 GMT+08:00 Sergey Senozhatsky <sergey.senozhatsky@gmail.com>:
> > On (01/23/15 14:58), Minchan Kim wrote:
> >> The zram->stat handling should be procted by init_lock.
> >> Otherwise, user could see stale value from the stat.
> >>
> >> Signed-off-by: Minchan Kim <minchan@kernel.org>
> >> ---
> >>
> >> I don't think it's stable material. The race is rare in real practice
> >> and this stale stat value read is not a critical.
> >>
> >>  drivers/block/zram/zram_drv.c | 37 ++++++++++++++++++++++++++++---------
> >>  1 file changed, 28 insertions(+), 9 deletions(-)
> >>
> >> diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
> >> index 0299d82275e7..53f176f590b0 100644
> >> --- a/drivers/block/zram/zram_drv.c
> >> +++ b/drivers/block/zram/zram_drv.c
> >> @@ -48,8 +48,13 @@ static ssize_t name##_show(struct device *d,               \
> >>                               struct device_attribute *attr, char *b) \
> >>  {                                                                    \
> >
> > a side note: I wasn't Cc'd in that patchset and found out it only when it's
> > been merged. I'm not sure I understand, why it has been renamed from specific
> > zram_X_show to X_show. what gives?
> 
> I changed from zram_attr_##name##_show to name##_show in commit:
> fcf1bce zram: use DEVICE_ATTR_[RW|RO|WO] to define zram sys device attribute
> 
> I just want to keep the name consistent with others, like
> disksize_show(), initstate_show().

aha, I see.

	-ss

> Thanks.
> 
> >
> >
> > can't help, catches my eye every time, that rename has broken the original
> > formatting:
> >
> >
> > diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
> > index 9250b3f..c567af5 100644
> > --- a/drivers/block/zram/zram_drv.c
> > +++ b/drivers/block/zram/zram_drv.c
> > @@ -44,7 +44,7 @@ static const char *default_compressor = "lzo";
> >  static unsigned int num_devices = 1;
> >
> >  #define ZRAM_ATTR_RO(name)                                             \
> > -static ssize_t name##_show(struct device *d,           \
> > +static ssize_t name##_show(struct device *d,                           \
> >                                 struct device_attribute *attr, char *b) \
> >  {                                                                      \
> >         struct zram *zram = dev_to_zram(d);                             \
> >
> >
> >
> > I don't have any objections. but do we really want to wrap atomic ops in
> > semaphore? it is really such serious race?
> >
> >
> >         -ss
> >
> >>       struct zram *zram = dev_to_zram(d);                             \
> >> -     return scnprintf(b, PAGE_SIZE, "%llu\n",                        \
> >> -             (u64)atomic64_read(&zram->stats.name));                 \
> >> +     u64 val = 0;                                                    \
> >> +                                                                     \
> >> +     down_read(&zram->init_lock);                                    \
> >> +     if (init_done(zram))                                            \
> >> +             val = atomic64_read(&zram->stats.name);                 \
> >> +     up_read(&zram->init_lock);                                      \
> >> +     return scnprintf(b, PAGE_SIZE, "%llu\n", val);                  \
> >>  }                                                                    \
> >>  static DEVICE_ATTR_RO(name);
> >>
> >> @@ -67,8 +72,14 @@ static ssize_t disksize_show(struct device *dev,
> >>               struct device_attribute *attr, char *buf)
> >>  {
> >>       struct zram *zram = dev_to_zram(dev);
> >> +     u64 val = 0;
> >> +
> >> +     down_read(&zram->init_lock);
> >> +     if (init_done(zram))
> >> +             val = zram->disksize;
> >> +     up_read(&zram->init_lock);
> >>
> >> -     return scnprintf(buf, PAGE_SIZE, "%llu\n", zram->disksize);
> >> +     return scnprintf(buf, PAGE_SIZE, "%llu\n", val);
> >>  }
> >>
> >>  static ssize_t initstate_show(struct device *dev,
> >> @@ -88,9 +99,14 @@ static ssize_t orig_data_size_show(struct device *dev,
> >>               struct device_attribute *attr, char *buf)
> >>  {
> >>       struct zram *zram = dev_to_zram(dev);
> >> +     u64 val = 0;
> >> +
> >> +     down_read(&zram->init_lock);
> >> +     if (init_done(zram))
> >> +             val = atomic64_read(&zram->stats.pages_stored) << PAGE_SHIFT;
> >> +     up_read(&zram->init_lock);
> >>
> >> -     return scnprintf(buf, PAGE_SIZE, "%llu\n",
> >> -             (u64)(atomic64_read(&zram->stats.pages_stored)) << PAGE_SHIFT);
> >> +     return scnprintf(buf, PAGE_SIZE, "%llu\n", val);
> >>  }
> >>
> >>  static ssize_t mem_used_total_show(struct device *dev,
> >> @@ -957,10 +973,6 @@ static int zram_rw_page(struct block_device *bdev, sector_t sector,
> >>       struct bio_vec bv;
> >>
> >>       zram = bdev->bd_disk->private_data;
> >> -     if (!valid_io_request(zram, sector, PAGE_SIZE)) {
> >> -             atomic64_inc(&zram->stats.invalid_io);
> >> -             return -EINVAL;
> >> -     }
> >>
> >>       down_read(&zram->init_lock);
> >>       if (unlikely(!init_done(zram))) {
> >> @@ -968,6 +980,13 @@ static int zram_rw_page(struct block_device *bdev, sector_t sector,
> >>               goto out_unlock;
> >>       }
> >>
> >> +     if (!valid_io_request(zram, sector, PAGE_SIZE)) {
> >> +             atomic64_inc(&zram->stats.invalid_io);
> >> +             err = -EINVAL;
> >> +             goto out_unlock;
> >> +     }
> >> +
> >> +
> >>       index = sector >> SECTORS_PER_PAGE_SHIFT;
> >>       offset = sector & (SECTORS_PER_PAGE - 1) << SECTOR_SHIFT;
> >>
> >> --
> >> 1.9.1
> >>
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > Please read the FAQ at  http://www.tux.org/lkml/
> 

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/2] zram: free meta out of init_lock
  2015-01-23 15:47     ` Sergey Senozhatsky
@ 2015-01-26  1:33       ` Minchan Kim
  2015-01-26 14:17         ` Sergey Senozhatsky
  2015-01-26 14:34         ` Jerome Marchand
  0 siblings, 2 replies; 32+ messages in thread
From: Minchan Kim @ 2015-01-26  1:33 UTC (permalink / raw)
  To: Sergey Senozhatsky
  Cc: Jerome Marchand, Andrew Morton, linux-kernel, linux-mm, Nitin Gupta

Hello,

On Sat, Jan 24, 2015 at 12:47:07AM +0900, Sergey Senozhatsky wrote:
> On (01/23/15 15:48), Jerome Marchand wrote:
> > Date: Fri, 23 Jan 2015 15:48:05 +0100
> > From: Jerome Marchand <jmarchan@redhat.com>
> > To: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>, Minchan Kim
> >  <minchan@kernel.org>
> > CC: Andrew Morton <akpm@linux-foundation.org>,
> >  linux-kernel@vger.kernel.org, linux-mm@kvack.org, Nitin Gupta
> >  <ngupta@vflare.org>
> > Subject: Re: [PATCH 1/2] zram: free meta out of init_lock
> > User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:31.0) Gecko/20100101
> >  Thunderbird/31.3.0
> > 
> > On 01/23/2015 03:24 PM, Sergey Senozhatsky wrote:
> > > On (01/23/15 14:58), Minchan Kim wrote:
> > >> We don't need to call zram_meta_free, zcomp_destroy and zs_free
> > >> under init_lock. What we need to prevent race with init_lock
> > >> in reset is setting NULL into zram->meta (ie, init_done).
> > >> This patch does it.
> > >>
> > >> Signed-off-by: Minchan Kim <minchan@kernel.org>
> > >> ---
> > >>  drivers/block/zram/zram_drv.c | 28 ++++++++++++++++------------
> > >>  1 file changed, 16 insertions(+), 12 deletions(-)
> > >>
> > >> diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
> > >> index 9250b3f54a8f..0299d82275e7 100644
> > >> --- a/drivers/block/zram/zram_drv.c
> > >> +++ b/drivers/block/zram/zram_drv.c
> > >> @@ -708,6 +708,7 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity)
> > >>  {
> > >>  	size_t index;
> > >>  	struct zram_meta *meta;
> > >> +	struct zcomp *comp;
> > >>  
> > >>  	down_write(&zram->init_lock);
> > >>  
> > >> @@ -719,20 +720,10 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity)
> > >>  	}
> > >>  
> > >>  	meta = zram->meta;
> > >> -	/* Free all pages that are still in this zram device */
> > >> -	for (index = 0; index < zram->disksize >> PAGE_SHIFT; index++) {
> > >> -		unsigned long handle = meta->table[index].handle;
> > >> -		if (!handle)
> > >> -			continue;
> > >> -
> > >> -		zs_free(meta->mem_pool, handle);
> > >> -	}
> > >> -
> > >> -	zcomp_destroy(zram->comp);
> > > 
> > > I'm not so sure about moving zcomp destruction. if we would have detached it
> > > from zram, then yes. otherwise, think of zram ->destoy vs ->init race.
> > > 
> > > suppose,
> > > CPU1 waits for down_write() init lock in disksize_store() with new comp already allocated;
> > > CPU0 detaches ->meta and releases write init lock;
> > > CPU1 grabs the lock and does zram->comp = comp;
> > > CPU0 reaches the point of zcomp_destroy(zram->comp);
> > 
> > I don't see your point: this patch does not call
> > zcomp_destroy(zram->comp) anymore, but zram_destroy(comp), where comp is
> > the old zram->comp.
> 
> 
> oh... yes. sorry! my bad.
> 
> 
> 
> anyway, on a second thought, do we even want to destoy meta out of init_lock?
> 
> I mean, it will let you init new device quicker. but... assume, you have
> 30G zram (or any other bad-enough number). on CPU0 you reset device -- iterate
> over 30G meta->table, etc. out of init_lock.
> on CPU1 you concurrently re-init device and request again 30G.
> 
> how bad that can be?
> 
> 
> 
> diskstore called on already initialised device is also not so perfect.
> we first will try to allocate ->meta (vmalloc pages for another 30G),
> then allocate comp, then down_write() init lock to find out that device
> is initialised and we need to release allocated memory.
> 
> 
> 
> may be we better keep ->meta destruction under init_lock and additionally
> move ->meta and ->comp allocation under init_lock in disksize_store()?
> 
> like the following one:
> 
> ---
> 
>  drivers/block/zram/zram_drv.c | 25 +++++++++++++------------
>  1 file changed, 13 insertions(+), 12 deletions(-)
> 
> diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
> index 9250b3f..827ab21 100644
> --- a/drivers/block/zram/zram_drv.c
> +++ b/drivers/block/zram/zram_drv.c
> @@ -765,9 +765,18 @@ static ssize_t disksize_store(struct device *dev,
>  		return -EINVAL;
>  
>  	disksize = PAGE_ALIGN(disksize);
> +	down_write(&zram->init_lock);
> +	if (init_done(zram)) {
> +		up_write(&zram->init_lock);
> +		pr_info("Cannot change disksize for initialized device\n");
> +		return -EBUSY;
> +	}
> +
>  	meta = zram_meta_alloc(zram->disk->first_minor, disksize);
> -	if (!meta)
> -		return -ENOMEM;
> +	if (!meta) {
> +		err = -ENOMEM;
> +		goto out_unlock;
> +	}
>  
>  	comp = zcomp_create(zram->compressor, zram->max_comp_streams);
>  	if (IS_ERR(comp)) {
> @@ -777,13 +786,6 @@ static ssize_t disksize_store(struct device *dev,
>  		goto out_free_meta;
>  	}
>  
> -	down_write(&zram->init_lock);
> -	if (init_done(zram)) {
> -		pr_info("Cannot change disksize for initialized device\n");
> -		err = -EBUSY;
> -		goto out_destroy_comp;
> -	}
> -
>  	zram->meta = meta;
>  	zram->comp = comp;
>  	zram->disksize = disksize;
> @@ -799,11 +801,10 @@ static ssize_t disksize_store(struct device *dev,
>  
>  	return len;
>  
> -out_destroy_comp:
> -	up_write(&zram->init_lock);
> -	zcomp_destroy(comp);
>  out_free_meta:
>  	zram_meta_free(meta);
> +out_unlock:
> +	up_write(&zram->init_lock);
>  	return err;
>  }
>  

The init_lock is really troublesome. We can't call zram_meta_alloc
under init_lock because of a lockdep report. Please keep that in mind.
zram_rw_page is one of the functions in the reclaim path and it holds
init_lock as a read lock, while here we hold it as a write lock.
It's a false positive, so we could shut lockdep up with an
annotation, but I don't want that; I want to work with lockdep rather
than disable it. Also, there are other paths that use init_lock to
protect other data, and they would be victims of lockdep too.

I didn't explain the motivation for this patch because I didn't want
to waste you busy guys' time. Let me explain it now. It was another
lockdep report about init_lock, triggered by kmem_cache_destroy in the
zsmalloc compaction work. That's why this patchset was originally one
of the patches in the compaction series.

Yes, the ideal would be to remove zram's horrible init_lock at this point
and make the code simpler and clearer, but I don't want the zsmalloc
compaction work to get stuck on that. Having said that, I feel it's time
to revisit removing init_lock.
At the least, I will think about a solution to kill init_lock.

Thanks!



^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/2] zram: free meta out of init_lock
  2015-01-26  1:33       ` Minchan Kim
@ 2015-01-26 14:17         ` Sergey Senozhatsky
  2015-01-26 16:00           ` Minchan Kim
  2015-01-26 14:34         ` Jerome Marchand
  1 sibling, 1 reply; 32+ messages in thread
From: Sergey Senozhatsky @ 2015-01-26 14:17 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Sergey Senozhatsky, Jerome Marchand, Andrew Morton, linux-kernel,
	linux-mm, Nitin Gupta, sergey.senozhatsky.work

Hello,

On (01/26/15 10:33), Minchan Kim wrote:
> Hello,
> 
> On Sat, Jan 24, 2015 at 12:47:07AM +0900, Sergey Senozhatsky wrote:
> > On (01/23/15 15:48), Jerome Marchand wrote:
> > > On 01/23/2015 03:24 PM, Sergey Senozhatsky wrote:
> > > > On (01/23/15 14:58), Minchan Kim wrote:
> > > >> We don't need to call zram_meta_free, zcomp_destroy and zs_free
> > > >> under init_lock. What we need to prevent race with init_lock
> > > >> in reset is setting NULL into zram->meta (ie, init_done).
> > > >> This patch does it.
> > > >>
> > > >> Signed-off-by: Minchan Kim <minchan@kernel.org>
> > > >> ---
> > > >>  drivers/block/zram/zram_drv.c | 28 ++++++++++++++++------------
> > > >>  1 file changed, 16 insertions(+), 12 deletions(-)
> > > >>
> > > >> diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
> > > >> index 9250b3f54a8f..0299d82275e7 100644
> > > >> --- a/drivers/block/zram/zram_drv.c
> > > >> +++ b/drivers/block/zram/zram_drv.c
> > > >> @@ -708,6 +708,7 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity)
> > > >>  {
> > > >>  	size_t index;
> > > >>  	struct zram_meta *meta;
> > > >> +	struct zcomp *comp;
> > > >>  
> > > >>  	down_write(&zram->init_lock);
> > > >>  
> > > >> @@ -719,20 +720,10 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity)
> > > >>  	}
> > > >>  
> > > >>  	meta = zram->meta;
> > > >> -	/* Free all pages that are still in this zram device */
> > > >> -	for (index = 0; index < zram->disksize >> PAGE_SHIFT; index++) {
> > > >> -		unsigned long handle = meta->table[index].handle;
> > > >> -		if (!handle)
> > > >> -			continue;
> > > >> -
> > > >> -		zs_free(meta->mem_pool, handle);
> > > >> -	}
> > > >> -
> > > >> -	zcomp_destroy(zram->comp);
> > > > 
> > > > I'm not so sure about moving zcomp destruction. if we would have detached it
> > > > from zram, then yes. otherwise, think of zram ->destoy vs ->init race.
> > > > 
> > > > suppose,
> > > > CPU1 waits for down_write() init lock in disksize_store() with new comp already allocated;
> > > > CPU0 detaches ->meta and releases write init lock;
> > > > CPU1 grabs the lock and does zram->comp = comp;
> > > > CPU0 reaches the point of zcomp_destroy(zram->comp);
> > > 
> > > I don't see your point: this patch does not call
> > > zcomp_destroy(zram->comp) anymore, but zram_destroy(comp), where comp is
> > > the old zram->comp.
> > 
> > 
> > oh... yes. sorry! my bad.
> > 
> > 
> > 
> > anyway, on a second thought, do we even want to destoy meta out of init_lock?
> > 
> > I mean, it will let you init new device quicker. but... assume, you have
> > 30G zram (or any other bad-enough number). on CPU0 you reset device -- iterate
> > over 30G meta->table, etc. out of init_lock.
> > on CPU1 you concurrently re-init device and request again 30G.
> > 
> > how bad that can be?
> > 
> > 
> > 
> > diskstore called on already initialised device is also not so perfect.
> > we first will try to allocate ->meta (vmalloc pages for another 30G),
> > then allocate comp, then down_write() init lock to find out that device
> > is initialised and we need to release allocated memory.
> > 
> > 
> > 
> > may be we better keep ->meta destruction under init_lock and additionally
> > move ->meta and ->comp allocation under init_lock in disksize_store()?
> > 
> > like the following one:
> > 
> > ---
> > 
> >  drivers/block/zram/zram_drv.c | 25 +++++++++++++------------
> >  1 file changed, 13 insertions(+), 12 deletions(-)
> > 
> > diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
> > index 9250b3f..827ab21 100644
> > --- a/drivers/block/zram/zram_drv.c
> > +++ b/drivers/block/zram/zram_drv.c
> > @@ -765,9 +765,18 @@ static ssize_t disksize_store(struct device *dev,
> >  		return -EINVAL;
> >  
> >  	disksize = PAGE_ALIGN(disksize);
> > +	down_write(&zram->init_lock);
> > +	if (init_done(zram)) {
> > +		up_write(&zram->init_lock);
> > +		pr_info("Cannot change disksize for initialized device\n");
> > +		return -EBUSY;
> > +	}
> > +
> >  	meta = zram_meta_alloc(zram->disk->first_minor, disksize);
> > -	if (!meta)
> > -		return -ENOMEM;
> > +	if (!meta) {
> > +		err = -ENOMEM;
> > +		goto out_unlock;
> > +	}
> >  
> >  	comp = zcomp_create(zram->compressor, zram->max_comp_streams);
> >  	if (IS_ERR(comp)) {
> > @@ -777,13 +786,6 @@ static ssize_t disksize_store(struct device *dev,
> >  		goto out_free_meta;
> >  	}
> >  
> > -	down_write(&zram->init_lock);
> > -	if (init_done(zram)) {
> > -		pr_info("Cannot change disksize for initialized device\n");
> > -		err = -EBUSY;
> > -		goto out_destroy_comp;
> > -	}
> > -
> >  	zram->meta = meta;
> >  	zram->comp = comp;
> >  	zram->disksize = disksize;
> > @@ -799,11 +801,10 @@ static ssize_t disksize_store(struct device *dev,
> >  
> >  	return len;
> >  
> > -out_destroy_comp:
> > -	up_write(&zram->init_lock);
> > -	zcomp_destroy(comp);
> >  out_free_meta:
> >  	zram_meta_free(meta);
> > +out_unlock:
> > +	up_write(&zram->init_lock);
> >  	return err;
> >  }
> >  
> 
> The init_lock is really troublesome. We can't do call zram_meta_alloc
> under init_lock due to lockdep report. Please keep in mind.
>

ah... I do recall it, thanks for your reminder.


> The zram_rw_page is one of the function under reclaim path and hold it
> as read_lock while here holds it as write_lock.
> It's a false positive so that we might could make shut lockdep up
> by annotation but I don't want it but want to work with lockdep rather
> than disable. As well, there are other pathes to use init_lock to
> protect other data where would be victims of lockdep.
> 
> I didn't tell the motivation of this patch because it made you busy
> guys wasted. Let me tell it now. It was another lockdep report by
> kmem_cache_destroy for zsmalloc compaction about init_lock. That's why
> the patchset was one of the patch in compaction.
>
> Yes, the ideal is to remove horrible init_lock of zram in this phase and
> make code more simple and clear but I don't want to stuck zsmalloc
> compaction by the work.


> Having said that, I feel it's time to revisit
> to remove init_lock.
> At least, I will think over to find a solution to kill init_lock.

hm, can't think of anything quick...

	-ss

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/2] zram: free meta out of init_lock
  2015-01-26  1:33       ` Minchan Kim
  2015-01-26 14:17         ` Sergey Senozhatsky
@ 2015-01-26 14:34         ` Jerome Marchand
  2015-01-26 15:52           ` Minchan Kim
  1 sibling, 1 reply; 32+ messages in thread
From: Jerome Marchand @ 2015-01-26 14:34 UTC (permalink / raw)
  To: Minchan Kim, Sergey Senozhatsky
  Cc: Andrew Morton, linux-kernel, linux-mm, Nitin Gupta

[-- Attachment #1: Type: text/plain, Size: 6836 bytes --]

On 01/26/2015 02:33 AM, Minchan Kim wrote:
> Hello,
> 
> On Sat, Jan 24, 2015 at 12:47:07AM +0900, Sergey Senozhatsky wrote:
>> On (01/23/15 15:48), Jerome Marchand wrote:
>>> Date: Fri, 23 Jan 2015 15:48:05 +0100
>>> From: Jerome Marchand <jmarchan@redhat.com>
>>> To: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>, Minchan Kim
>>>  <minchan@kernel.org>
>>> CC: Andrew Morton <akpm@linux-foundation.org>,
>>>  linux-kernel@vger.kernel.org, linux-mm@kvack.org, Nitin Gupta
>>>  <ngupta@vflare.org>
>>> Subject: Re: [PATCH 1/2] zram: free meta out of init_lock
>>> User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:31.0) Gecko/20100101
>>>  Thunderbird/31.3.0
>>>
>>> On 01/23/2015 03:24 PM, Sergey Senozhatsky wrote:
>>>> On (01/23/15 14:58), Minchan Kim wrote:
>>>>> We don't need to call zram_meta_free, zcomp_destroy and zs_free
>>>>> under init_lock. What we need to prevent race with init_lock
>>>>> in reset is setting NULL into zram->meta (ie, init_done).
>>>>> This patch does it.
>>>>>
>>>>> Signed-off-by: Minchan Kim <minchan@kernel.org>
>>>>> ---
>>>>>  drivers/block/zram/zram_drv.c | 28 ++++++++++++++++------------
>>>>>  1 file changed, 16 insertions(+), 12 deletions(-)
>>>>>
>>>>> diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
>>>>> index 9250b3f54a8f..0299d82275e7 100644
>>>>> --- a/drivers/block/zram/zram_drv.c
>>>>> +++ b/drivers/block/zram/zram_drv.c
>>>>> @@ -708,6 +708,7 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity)
>>>>>  {
>>>>>  	size_t index;
>>>>>  	struct zram_meta *meta;
>>>>> +	struct zcomp *comp;
>>>>>  
>>>>>  	down_write(&zram->init_lock);
>>>>>  
>>>>> @@ -719,20 +720,10 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity)
>>>>>  	}
>>>>>  
>>>>>  	meta = zram->meta;
>>>>> -	/* Free all pages that are still in this zram device */
>>>>> -	for (index = 0; index < zram->disksize >> PAGE_SHIFT; index++) {
>>>>> -		unsigned long handle = meta->table[index].handle;
>>>>> -		if (!handle)
>>>>> -			continue;
>>>>> -
>>>>> -		zs_free(meta->mem_pool, handle);
>>>>> -	}
>>>>> -
>>>>> -	zcomp_destroy(zram->comp);
>>>>
>>>> I'm not so sure about moving zcomp destruction. if we would have detached it
>>>> from zram, then yes. otherwise, think of zram ->destoy vs ->init race.
>>>>
>>>> suppose,
>>>> CPU1 waits for down_write() init lock in disksize_store() with new comp already allocated;
>>>> CPU0 detaches ->meta and releases write init lock;
>>>> CPU1 grabs the lock and does zram->comp = comp;
>>>> CPU0 reaches the point of zcomp_destroy(zram->comp);
>>>
>>> I don't see your point: this patch does not call
>>> zcomp_destroy(zram->comp) anymore, but zram_destroy(comp), where comp is
>>> the old zram->comp.
>>
>>
>> oh... yes. sorry! my bad.
>>
>>
>>
>> anyway, on a second thought, do we even want to destoy meta out of init_lock?
>>
>> I mean, it will let you init new device quicker. but... assume, you have
>> 30G zram (or any other bad-enough number). on CPU0 you reset device -- iterate
>> over 30G meta->table, etc. out of init_lock.
>> on CPU1 you concurrently re-init device and request again 30G.
>>
>> how bad that can be?
>>
>>
>>
>> diskstore called on already initialised device is also not so perfect.
>> we first will try to allocate ->meta (vmalloc pages for another 30G),
>> then allocate comp, then down_write() init lock to find out that device
>> is initialised and we need to release allocated memory.
>>
>>
>>
>> may be we better keep ->meta destruction under init_lock and additionally
>> move ->meta and ->comp allocation under init_lock in disksize_store()?
>>
>> like the following one:
>>
>> ---
>>
>>  drivers/block/zram/zram_drv.c | 25 +++++++++++++------------
>>  1 file changed, 13 insertions(+), 12 deletions(-)
>>
>> diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
>> index 9250b3f..827ab21 100644
>> --- a/drivers/block/zram/zram_drv.c
>> +++ b/drivers/block/zram/zram_drv.c
>> @@ -765,9 +765,18 @@ static ssize_t disksize_store(struct device *dev,
>>  		return -EINVAL;
>>  
>>  	disksize = PAGE_ALIGN(disksize);
>> +	down_write(&zram->init_lock);
>> +	if (init_done(zram)) {
>> +		up_write(&zram->init_lock);
>> +		pr_info("Cannot change disksize for initialized device\n");
>> +		return -EBUSY;
>> +	}
>> +
>>  	meta = zram_meta_alloc(zram->disk->first_minor, disksize);
>> -	if (!meta)
>> -		return -ENOMEM;
>> +	if (!meta) {
>> +		err = -ENOMEM;
>> +		goto out_unlock;
>> +	}
>>  
>>  	comp = zcomp_create(zram->compressor, zram->max_comp_streams);
>>  	if (IS_ERR(comp)) {
>> @@ -777,13 +786,6 @@ static ssize_t disksize_store(struct device *dev,
>>  		goto out_free_meta;
>>  	}
>>  
>> -	down_write(&zram->init_lock);
>> -	if (init_done(zram)) {
>> -		pr_info("Cannot change disksize for initialized device\n");
>> -		err = -EBUSY;
>> -		goto out_destroy_comp;
>> -	}
>> -
>>  	zram->meta = meta;
>>  	zram->comp = comp;
>>  	zram->disksize = disksize;
>> @@ -799,11 +801,10 @@ static ssize_t disksize_store(struct device *dev,
>>  
>>  	return len;
>>  
>> -out_destroy_comp:
>> -	up_write(&zram->init_lock);
>> -	zcomp_destroy(comp);
>>  out_free_meta:
>>  	zram_meta_free(meta);
>> +out_unlock:
>> +	up_write(&zram->init_lock);
>>  	return err;
>>  }
>>  
> 
> The init_lock is really troublesome. We can't do call zram_meta_alloc
> under init_lock due to lockdep report. Please keep in mind.
> The zram_rw_page is one of the function under reclaim path and hold it
> as read_lock while here holds it as write_lock.
> It's a false positive so that we might could make shut lockdep up
> by annotation but I don't want it but want to work with lockdep rather
> than disable. As well, there are other pathes to use init_lock to
> protect other data where would be victims of lockdep.
> 
> I didn't tell the motivation of this patch because it made you busy
> guys wasted. Let me tell it now.

In my experience, reading a short explanation takes much less time than
trying to figure out why something is done the way it is. Please add
this explanation to the patch description. It might be very useful in
the future to someone "git-blaming" this code.

Jerome

> It was another lockdep report by
> kmem_cache_destroy for zsmalloc compaction about init_lock. That's why
> the patchset was one of the patch in compaction.
> 
> Yes, the ideal is to remove horrible init_lock of zram in this phase and
> make code more simple and clear but I don't want to stuck zsmalloc
> compaction by the work. Having said that, I feel it's time to revisit
> to remove init_lock.
> At least, I will think over to find a solution to kill init_lock.
> 
> Thanks!
> 
> 



[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 473 bytes --]

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/2] zram: free meta out of init_lock
  2015-01-26 14:34         ` Jerome Marchand
@ 2015-01-26 15:52           ` Minchan Kim
  0 siblings, 0 replies; 32+ messages in thread
From: Minchan Kim @ 2015-01-26 15:52 UTC (permalink / raw)
  To: Jerome Marchand
  Cc: Sergey Senozhatsky, Andrew Morton, linux-kernel, linux-mm, Nitin Gupta

Hello,

On Mon, Jan 26, 2015 at 03:34:06PM +0100, Jerome Marchand wrote:
> On 01/26/2015 02:33 AM, Minchan Kim wrote:
> > Hello,
> > 
> > On Sat, Jan 24, 2015 at 12:47:07AM +0900, Sergey Senozhatsky wrote:
> >> On (01/23/15 15:48), Jerome Marchand wrote:
> >>> Date: Fri, 23 Jan 2015 15:48:05 +0100
> >>> From: Jerome Marchand <jmarchan@redhat.com>
> >>> To: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>, Minchan Kim
> >>>  <minchan@kernel.org>
> >>> CC: Andrew Morton <akpm@linux-foundation.org>,
> >>>  linux-kernel@vger.kernel.org, linux-mm@kvack.org, Nitin Gupta
> >>>  <ngupta@vflare.org>
> >>> Subject: Re: [PATCH 1/2] zram: free meta out of init_lock
> >>> User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:31.0) Gecko/20100101
> >>>  Thunderbird/31.3.0
> >>>
> >>> On 01/23/2015 03:24 PM, Sergey Senozhatsky wrote:
> >>>> On (01/23/15 14:58), Minchan Kim wrote:
> >>>>> We don't need to call zram_meta_free, zcomp_destroy and zs_free
> >>>>> under init_lock. What we need to prevent race with init_lock
> >>>>> in reset is setting NULL into zram->meta (ie, init_done).
> >>>>> This patch does it.
> >>>>>
> >>>>> Signed-off-by: Minchan Kim <minchan@kernel.org>
> >>>>> ---
> >>>>>  drivers/block/zram/zram_drv.c | 28 ++++++++++++++++------------
> >>>>>  1 file changed, 16 insertions(+), 12 deletions(-)
> >>>>>
> >>>>> diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
> >>>>> index 9250b3f54a8f..0299d82275e7 100644
> >>>>> --- a/drivers/block/zram/zram_drv.c
> >>>>> +++ b/drivers/block/zram/zram_drv.c
> >>>>> @@ -708,6 +708,7 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity)
> >>>>>  {
> >>>>>  	size_t index;
> >>>>>  	struct zram_meta *meta;
> >>>>> +	struct zcomp *comp;
> >>>>>  
> >>>>>  	down_write(&zram->init_lock);
> >>>>>  
> >>>>> @@ -719,20 +720,10 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity)
> >>>>>  	}
> >>>>>  
> >>>>>  	meta = zram->meta;
> >>>>> -	/* Free all pages that are still in this zram device */
> >>>>> -	for (index = 0; index < zram->disksize >> PAGE_SHIFT; index++) {
> >>>>> -		unsigned long handle = meta->table[index].handle;
> >>>>> -		if (!handle)
> >>>>> -			continue;
> >>>>> -
> >>>>> -		zs_free(meta->mem_pool, handle);
> >>>>> -	}
> >>>>> -
> >>>>> -	zcomp_destroy(zram->comp);
> >>>>
> >>>> I'm not so sure about moving zcomp destruction. if we would have detached it
> >>>> from zram, then yes. otherwise, think of zram ->destoy vs ->init race.
> >>>>
> >>>> suppose,
> >>>> CPU1 waits for down_write() init lock in disksize_store() with new comp already allocated;
> >>>> CPU0 detaches ->meta and releases write init lock;
> >>>> CPU1 grabs the lock and does zram->comp = comp;
> >>>> CPU0 reaches the point of zcomp_destroy(zram->comp);
> >>>
> >>> I don't see your point: this patch does not call
> >>> zcomp_destroy(zram->comp) anymore, but zram_destroy(comp), where comp is
> >>> the old zram->comp.
> >>
> >>
> >> oh... yes. sorry! my bad.
> >>
> >>
> >>
> >> anyway, on a second thought, do we even want to destoy meta out of init_lock?
> >>
> >> I mean, it will let you init new device quicker. but... assume, you have
> >> 30G zram (or any other bad-enough number). on CPU0 you reset device -- iterate
> >> over 30G meta->table, etc. out of init_lock.
> >> on CPU1 you concurrently re-init device and request again 30G.
> >>
> >> how bad that can be?
> >>
> >>
> >>
> >> diskstore called on already initialised device is also not so perfect.
> >> we first will try to allocate ->meta (vmalloc pages for another 30G),
> >> then allocate comp, then down_write() init lock to find out that device
> >> is initialised and we need to release allocated memory.
> >>
> >>
> >>
> >> may be we better keep ->meta destruction under init_lock and additionally
> >> move ->meta and ->comp allocation under init_lock in disksize_store()?
> >>
> >> like the following one:
> >>
> >> ---
> >>
> >>  drivers/block/zram/zram_drv.c | 25 +++++++++++++------------
> >>  1 file changed, 13 insertions(+), 12 deletions(-)
> >>
> >> diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
> >> index 9250b3f..827ab21 100644
> >> --- a/drivers/block/zram/zram_drv.c
> >> +++ b/drivers/block/zram/zram_drv.c
> >> @@ -765,9 +765,18 @@ static ssize_t disksize_store(struct device *dev,
> >>  		return -EINVAL;
> >>  
> >>  	disksize = PAGE_ALIGN(disksize);
> >> +	down_write(&zram->init_lock);
> >> +	if (init_done(zram)) {
> >> +		up_write(&zram->init_lock);
> >> +		pr_info("Cannot change disksize for initialized device\n");
> >> +		return -EBUSY;
> >> +	}
> >> +
> >>  	meta = zram_meta_alloc(zram->disk->first_minor, disksize);
> >> -	if (!meta)
> >> -		return -ENOMEM;
> >> +	if (!meta) {
> >> +		err = -ENOMEM;
> >> +		goto out_unlock;
> >> +	}
> >>  
> >>  	comp = zcomp_create(zram->compressor, zram->max_comp_streams);
> >>  	if (IS_ERR(comp)) {
> >> @@ -777,13 +786,6 @@ static ssize_t disksize_store(struct device *dev,
> >>  		goto out_free_meta;
> >>  	}
> >>  
> >> -	down_write(&zram->init_lock);
> >> -	if (init_done(zram)) {
> >> -		pr_info("Cannot change disksize for initialized device\n");
> >> -		err = -EBUSY;
> >> -		goto out_destroy_comp;
> >> -	}
> >> -
> >>  	zram->meta = meta;
> >>  	zram->comp = comp;
> >>  	zram->disksize = disksize;
> >> @@ -799,11 +801,10 @@ static ssize_t disksize_store(struct device *dev,
> >>  
> >>  	return len;
> >>  
> >> -out_destroy_comp:
> >> -	up_write(&zram->init_lock);
> >> -	zcomp_destroy(comp);
> >>  out_free_meta:
> >>  	zram_meta_free(meta);
> >> +out_unlock:
> >> +	up_write(&zram->init_lock);
> >>  	return err;
> >>  }
> >>  
> > 
> > The init_lock is really troublesome. We can't do call zram_meta_alloc
> > under init_lock due to lockdep report. Please keep in mind.
> > The zram_rw_page is one of the function under reclaim path and hold it
> > as read_lock while here holds it as write_lock.
> > It's a false positive so that we might could make shut lockdep up
> > by annotation but I don't want it but want to work with lockdep rather
> > than disable. As well, there are other pathes to use init_lock to
> > protect other data where would be victims of lockdep.
> > 
> > I didn't tell the motivation of this patch because it made you busy
> > guys wasted. Let me tell it now.
> 
> In my experience, reading a short explanation takes much less time that
> trying to figure out why something is done the way it is. Please add
> this explanation to the patch description. It might be very useful in
> the future to someone "git-blaming" this code.

This patch has two goals.

1. Avoid unnecessary lock
2. Prepare for the init_lock lockdep splat that comes with the upcoming zsmalloc compaction.

The compaction work isn't in mainline yet, so I thought I didn't need
to mention 2: if this patch got merged first on the strength of 1 alone,
before the compaction work, everyone would be happy without wasting time
looking into the lockdep splat.

Anyway, I will send an idea for removing init_lock from the rw path.
Thanks!

> 
> Jerome
> 
> > It was another lockdep report by
> > kmem_cache_destroy for zsmalloc compaction about init_lock. That's why
> > the patchset was one of the patch in compaction.
> > 
> > Yes, the ideal is to remove horrible init_lock of zram in this phase and
> > make code more simple and clear but I don't want to stuck zsmalloc
> > compaction by the work. Having said that, I feel it's time to revisit
> > to remove init_lock.
> > At least, I will think over to find a solution to kill init_lock.
> > 
> > Thanks!
> > 
> > 
> 
> 



^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/2] zram: free meta out of init_lock
  2015-01-26 14:17         ` Sergey Senozhatsky
@ 2015-01-26 16:00           ` Minchan Kim
  2015-01-27  2:17             ` Sergey Senozhatsky
  0 siblings, 1 reply; 32+ messages in thread
From: Minchan Kim @ 2015-01-26 16:00 UTC (permalink / raw)
  To: Sergey Senozhatsky
  Cc: Jerome Marchand, Andrew Morton, linux-kernel, linux-mm,
	Nitin Gupta, sergey.senozhatsky.work

On Mon, Jan 26, 2015 at 11:17:09PM +0900, Sergey Senozhatsky wrote:
> Hello,
> 
> On (01/26/15 10:33), Minchan Kim wrote:
> > Hello,
> > 
> > On Sat, Jan 24, 2015 at 12:47:07AM +0900, Sergey Senozhatsky wrote:
> > > On (01/23/15 15:48), Jerome Marchand wrote:
> > > > On 01/23/2015 03:24 PM, Sergey Senozhatsky wrote:
> > > > > On (01/23/15 14:58), Minchan Kim wrote:
> > > > >> We don't need to call zram_meta_free, zcomp_destroy and zs_free
> > > > >> under init_lock. What we need to prevent race with init_lock
> > > > >> in reset is setting NULL into zram->meta (ie, init_done).
> > > > >> This patch does it.
> > > > >>
> > > > >> Signed-off-by: Minchan Kim <minchan@kernel.org>
> > > > >> ---
> > > > >>  drivers/block/zram/zram_drv.c | 28 ++++++++++++++++------------
> > > > >>  1 file changed, 16 insertions(+), 12 deletions(-)
> > > > >>
> > > > >> diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
> > > > >> index 9250b3f54a8f..0299d82275e7 100644
> > > > >> --- a/drivers/block/zram/zram_drv.c
> > > > >> +++ b/drivers/block/zram/zram_drv.c
> > > > >> @@ -708,6 +708,7 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity)
> > > > >>  {
> > > > >>  	size_t index;
> > > > >>  	struct zram_meta *meta;
> > > > >> +	struct zcomp *comp;
> > > > >>  
> > > > >>  	down_write(&zram->init_lock);
> > > > >>  
> > > > >> @@ -719,20 +720,10 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity)
> > > > >>  	}
> > > > >>  
> > > > >>  	meta = zram->meta;
> > > > >> -	/* Free all pages that are still in this zram device */
> > > > >> -	for (index = 0; index < zram->disksize >> PAGE_SHIFT; index++) {
> > > > >> -		unsigned long handle = meta->table[index].handle;
> > > > >> -		if (!handle)
> > > > >> -			continue;
> > > > >> -
> > > > >> -		zs_free(meta->mem_pool, handle);
> > > > >> -	}
> > > > >> -
> > > > >> -	zcomp_destroy(zram->comp);
> > > > > 
> > > > > I'm not so sure about moving zcomp destruction. if we would have detached it
> > > > > from zram, then yes. otherwise, think of zram ->destoy vs ->init race.
> > > > > 
> > > > > suppose,
> > > > > CPU1 waits for down_write() init lock in disksize_store() with new comp already allocated;
> > > > > CPU0 detaches ->meta and releases write init lock;
> > > > > CPU1 grabs the lock and does zram->comp = comp;
> > > > > CPU0 reaches the point of zcomp_destroy(zram->comp);
> > > > 
> > > > I don't see your point: this patch does not call
> > > > zcomp_destroy(zram->comp) anymore, but zram_destroy(comp), where comp is
> > > > the old zram->comp.
> > > 
> > > 
> > > oh... yes. sorry! my bad.
> > > 
> > > 
> > > 
> > > anyway, on a second thought, do we even want to destoy meta out of init_lock?
> > > 
> > > I mean, it will let you init new device quicker. but... assume, you have
> > > 30G zram (or any other bad-enough number). on CPU0 you reset device -- iterate
> > > over 30G meta->table, etc. out of init_lock.
> > > on CPU1 you concurrently re-init device and request again 30G.
> > > 
> > > how bad that can be?
> > > 
> > > 
> > > 
> > > diskstore called on already initialised device is also not so perfect.
> > > we first will try to allocate ->meta (vmalloc pages for another 30G),
> > > then allocate comp, then down_write() init lock to find out that device
> > > is initialised and we need to release allocated memory.
> > > 
> > > 
> > > 
> > > may be we better keep ->meta destruction under init_lock and additionally
> > > move ->meta and ->comp allocation under init_lock in disksize_store()?
> > > 
> > > like the following one:
> > > 
> > > ---
> > > 
> > >  drivers/block/zram/zram_drv.c | 25 +++++++++++++------------
> > >  1 file changed, 13 insertions(+), 12 deletions(-)
> > > 
> > > diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
> > > index 9250b3f..827ab21 100644
> > > --- a/drivers/block/zram/zram_drv.c
> > > +++ b/drivers/block/zram/zram_drv.c
> > > @@ -765,9 +765,18 @@ static ssize_t disksize_store(struct device *dev,
> > >  		return -EINVAL;
> > >  
> > >  	disksize = PAGE_ALIGN(disksize);
> > > +	down_write(&zram->init_lock);
> > > +	if (init_done(zram)) {
> > > +		up_write(&zram->init_lock);
> > > +		pr_info("Cannot change disksize for initialized device\n");
> > > +		return -EBUSY;
> > > +	}
> > > +
> > >  	meta = zram_meta_alloc(zram->disk->first_minor, disksize);
> > > -	if (!meta)
> > > -		return -ENOMEM;
> > > +	if (!meta) {
> > > +		err = -ENOMEM;
> > > +		goto out_unlock;
> > > +	}
> > >  
> > >  	comp = zcomp_create(zram->compressor, zram->max_comp_streams);
> > >  	if (IS_ERR(comp)) {
> > > @@ -777,13 +786,6 @@ static ssize_t disksize_store(struct device *dev,
> > >  		goto out_free_meta;
> > >  	}
> > >  
> > > -	down_write(&zram->init_lock);
> > > -	if (init_done(zram)) {
> > > -		pr_info("Cannot change disksize for initialized device\n");
> > > -		err = -EBUSY;
> > > -		goto out_destroy_comp;
> > > -	}
> > > -
> > >  	zram->meta = meta;
> > >  	zram->comp = comp;
> > >  	zram->disksize = disksize;
> > > @@ -799,11 +801,10 @@ static ssize_t disksize_store(struct device *dev,
> > >  
> > >  	return len;
> > >  
> > > -out_destroy_comp:
> > > -	up_write(&zram->init_lock);
> > > -	zcomp_destroy(comp);
> > >  out_free_meta:
> > >  	zram_meta_free(meta);
> > > +out_unlock:
> > > +	up_write(&zram->init_lock);
> > >  	return err;
> > >  }
> > >  
> > 
> > The init_lock is really troublesome. We can't do call zram_meta_alloc
> > under init_lock due to lockdep report. Please keep in mind.
> >
> 
> ah... I do recall it, thanks for your reminder.
> 
> 
> > The zram_rw_page is one of the function under reclaim path and hold it
> > as read_lock while here holds it as write_lock.
> > It's a false positive so that we might could make shut lockdep up
> > by annotation but I don't want it but want to work with lockdep rather
> > than disable. As well, there are other pathes to use init_lock to
> > protect other data where would be victims of lockdep.
> > 
> > I didn't tell the motivation of this patch because it made you busy
> > guys wasted. Let me tell it now. It was another lockdep report by
> > kmem_cache_destroy for zsmalloc compaction about init_lock. That's why
> > the patchset was one of the patch in compaction.
> >
> > Yes, the ideal is to remove horrible init_lock of zram in this phase and
> > make code more simple and clear but I don't want to stuck zsmalloc
> > compaction by the work.
> 
> 
> > Having said that, I feel it's time to revisit
> > to remove init_lock.
> > At least, I will think over to find a solution to kill init_lock.
> 
> hm, can't think of anything quick...
> 
> 	-ss

Hello guys,

How about this?

It's based on Ganesh's patch.
https://lkml.org/lkml/2015/1/24/50

>From afda9fd2f6c40dd0745d8a6babe78c5cbdceddf5 Mon Sep 17 00:00:00 2001
From: Minchan Kim <minchan@kernel.org>
Date: Mon, 26 Jan 2015 14:34:10 +0900
Subject: [RFC] zram: remove init_lock in zram_make_request

An admin could reset zram while I/O is in progress, so we have
used zram->init_lock as a read-side lock in the I/O path to prevent
sudden freeing of zram meta.

However, the init_lock is really troublesome.
We can't call zram_meta_alloc under init_lock due to a lockdep splat
because zram_rw_page is one of the functions in the reclaim path and
holds it as read_lock while other places in process context hold it
as write_lock. So, we have done the allocation outside the lock to avoid
the lockdep warning, but it's not good for readability and finally, I met
another lockdep splat between init_lock and cpu_hotplug from
kmem_cache_destroy while working on zsmalloc compaction. :(

Yes, the ideal is to remove the horrible init_lock of zram in the rw path.
This patch removes it from the rw path and instead adds an init_done bool
variable to check that initialization is done, using smp_[wmb|rmb] and
srcu_[un]read_lock to prevent sudden zram meta freeing
during I/O operation.

Signed-off-by: Minchan Kim <minchan@kernel.org>
---
 drivers/block/zram/zram_drv.c | 76 +++++++++++++++++++++++++++++--------------
 drivers/block/zram/zram_drv.h |  5 +++
 2 files changed, 57 insertions(+), 24 deletions(-)

diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index a598ada817f0..e06ff975f997 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -32,6 +32,7 @@
 #include <linux/string.h>
 #include <linux/vmalloc.h>
 #include <linux/err.h>
+#include <linux/srcu.h>
 
 #include "zram_drv.h"
 
@@ -53,9 +54,16 @@ static ssize_t name##_show(struct device *d,		\
 }									\
 static DEVICE_ATTR_RO(name);
 
-static inline int init_done(struct zram *zram)
+static inline bool init_done(struct zram *zram)
 {
-	return zram->meta != NULL;
+	/*
+	 * init_done can be used without holding zram->init_lock in
+	 * read/write handler(ie, zram_make_request) but we should make sure
+	 * that zram->init_done should set up after meta initialization is
+	 * done. Look at disksize_store.
+	 */
+	smp_rmb();
+	return zram->init_done;
 }
 
 static inline struct zram *dev_to_zram(struct device *dev)
@@ -326,6 +334,10 @@ static void zram_meta_free(struct zram_meta *meta)
 	kfree(meta);
 }
 
+static void rcu_zram_do_nothing(struct rcu_head *unused)
+{
+}
+
 static struct zram_meta *zram_meta_alloc(int device_id, u64 disksize)
 {
 	char pool_name[8];
@@ -726,11 +738,8 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity)
 		return;
 	}
 
-	zcomp_destroy(zram->comp);
 	zram->max_comp_streams = 1;
 
-	zram_meta_free(zram->meta);
-	zram->meta = NULL;
 	/* Reset stats */
 	memset(&zram->stats, 0, sizeof(zram->stats));
 
@@ -738,8 +747,12 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity)
 	if (reset_capacity)
 		set_capacity(zram->disk, 0);
 
+	zram->init_done = false;
+	call_srcu(&zram->srcu, &zram->rcu, rcu_zram_do_nothing);
+	synchronize_srcu(&zram->srcu);
+	zram_meta_free(zram->meta);
+	zcomp_destroy(zram->comp);
 	up_write(&zram->init_lock);
-
 	/*
 	 * Revalidate disk out of the init_lock to avoid lockdep splat.
 	 * It's okay because disk's capacity is protected by init_lock
@@ -762,10 +775,19 @@ static ssize_t disksize_store(struct device *dev,
 	if (!disksize)
 		return -EINVAL;
 
+	down_write(&zram->init_lock);
+	if (init_done(zram)) {
+		pr_info("Cannot change disksize for initialized device\n");
+		up_write(&zram->init_lock);
+		return -EBUSY;
+	}
+
 	disksize = PAGE_ALIGN(disksize);
 	meta = zram_meta_alloc(zram->disk->first_minor, disksize);
-	if (!meta)
+	if (!meta) {
+		up_write(&zram->init_lock);
 		return -ENOMEM;
+	}
 
 	comp = zcomp_create(zram->compressor, zram->max_comp_streams);
 	if (IS_ERR(comp)) {
@@ -775,17 +797,17 @@ static ssize_t disksize_store(struct device *dev,
 		goto out_free_meta;
 	}
 
-	down_write(&zram->init_lock);
-	if (init_done(zram)) {
-		pr_info("Cannot change disksize for initialized device\n");
-		err = -EBUSY;
-		goto out_destroy_comp;
-	}
-
 	zram->meta = meta;
 	zram->comp = comp;
 	zram->disksize = disksize;
 	set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT);
+	/*
+	 * Store operation of struct zram fields should complete
+	 * before init_done set up because zram_bvec_rw doesn't
+	 * hold an zram->init_lock.
+	 */
+	smp_wmb();
+	zram->init_done = true;
 	up_write(&zram->init_lock);
 
 	/*
@@ -797,10 +819,8 @@ static ssize_t disksize_store(struct device *dev,
 
 	return len;
 
-out_destroy_comp:
-	up_write(&zram->init_lock);
-	zcomp_destroy(comp);
 out_free_meta:
+	up_write(&zram->init_lock);
 	zram_meta_free(meta);
 	return err;
 }
@@ -905,9 +925,10 @@ out:
  */
 static void zram_make_request(struct request_queue *queue, struct bio *bio)
 {
+	int idx;
 	struct zram *zram = queue->queuedata;
 
-	down_read(&zram->init_lock);
+	idx = srcu_read_lock(&zram->srcu);
 	if (unlikely(!init_done(zram)))
 		goto error;
 
@@ -918,12 +939,12 @@ static void zram_make_request(struct request_queue *queue, struct bio *bio)
 	}
 
 	__zram_make_request(zram, bio);
-	up_read(&zram->init_lock);
+	srcu_read_unlock(&zram->srcu, idx);
 
 	return;
 
 error:
-	up_read(&zram->init_lock);
+	srcu_read_unlock(&zram->srcu, idx);
 	bio_io_error(bio);
 }
 
@@ -945,18 +966,20 @@ static void zram_slot_free_notify(struct block_device *bdev,
 static int zram_rw_page(struct block_device *bdev, sector_t sector,
 		       struct page *page, int rw)
 {
-	int offset, err;
+	int offset, err, idx;
 	u32 index;
 	struct zram *zram;
 	struct bio_vec bv;
 
 	zram = bdev->bd_disk->private_data;
+	idx = srcu_read_lock(&zram->srcu);
+
 	if (!valid_io_request(zram, sector, PAGE_SIZE)) {
 		atomic64_inc(&zram->stats.invalid_io);
+		srcu_read_unlock(&zram->srcu, idx);
 		return -EINVAL;
 	}
 
-	down_read(&zram->init_lock);
 	if (unlikely(!init_done(zram))) {
 		err = -EIO;
 		goto out_unlock;
@@ -971,7 +994,7 @@ static int zram_rw_page(struct block_device *bdev, sector_t sector,
 
 	err = zram_bvec_rw(zram, &bv, index, offset, rw);
 out_unlock:
-	up_read(&zram->init_lock);
+	srcu_read_unlock(&zram->srcu, idx);
 	/*
 	 * If I/O fails, just return error(ie, non-zero) without
 	 * calling page_endio.
@@ -1041,6 +1064,11 @@ static int create_device(struct zram *zram, int device_id)
 
 	init_rwsem(&zram->init_lock);
 
+	if (init_srcu_struct(&zram->srcu)) {
+		pr_err("Error initialize srcu for device %d\n", device_id);
+		goto out;
+	}
+
 	zram->queue = blk_alloc_queue(GFP_KERNEL);
 	if (!zram->queue) {
 		pr_err("Error allocating disk queue for device %d\n",
@@ -1125,8 +1153,8 @@ static void destroy_device(struct zram *zram)
 
 	del_gendisk(zram->disk);
 	put_disk(zram->disk);
-
 	blk_cleanup_queue(zram->queue);
+	cleanup_srcu_struct(&zram->srcu);
 }
 
 static int __init zram_init(void)
diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
index e492f6bf11f1..2042c310aea8 100644
--- a/drivers/block/zram/zram_drv.h
+++ b/drivers/block/zram/zram_drv.h
@@ -105,8 +105,13 @@ struct zram {
 	struct gendisk *disk;
 	struct zcomp *comp;
 
+	struct srcu_struct srcu;
+	struct rcu_head rcu;
+
 	/* Prevent concurrent execution of device init, reset and R/W request */
 	struct rw_semaphore init_lock;
+	bool init_done;
+
 	/*
 	 * This is the limit on amount of *uncompressed* worth of data
 	 * we can store in a disk.
-- 
1.9.1


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/2] zram: free meta out of init_lock
  2015-01-26 16:00           ` Minchan Kim
@ 2015-01-27  2:17             ` Sergey Senozhatsky
  2015-01-27  3:18               ` Minchan Kim
  0 siblings, 1 reply; 32+ messages in thread
From: Sergey Senozhatsky @ 2015-01-27  2:17 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Sergey Senozhatsky, Jerome Marchand, Andrew Morton, linux-kernel,
	linux-mm, Nitin Gupta, sergey.senozhatsky.work

On (01/27/15 01:00), Minchan Kim wrote:
> On Mon, Jan 26, 2015 at 11:17:09PM +0900, Sergey Senozhatsky wrote:
> > Hello,
> > 
> > On (01/26/15 10:33), Minchan Kim wrote:
> > > Hello,
> > > 
> > > On Sat, Jan 24, 2015 at 12:47:07AM +0900, Sergey Senozhatsky wrote:
> > > > On (01/23/15 15:48), Jerome Marchand wrote:
> > > > > On 01/23/2015 03:24 PM, Sergey Senozhatsky wrote:
> > > > > > On (01/23/15 14:58), Minchan Kim wrote:
> > > > > >> We don't need to call zram_meta_free, zcomp_destroy and zs_free
> > > > > >> under init_lock. What we need to prevent race with init_lock
> > > > > >> in reset is setting NULL into zram->meta (ie, init_done).
> > > > > >> This patch does it.
> > > > > >>
> > > > > >> Signed-off-by: Minchan Kim <minchan@kernel.org>
> > > > > >> ---
> > > > > >>  drivers/block/zram/zram_drv.c | 28 ++++++++++++++++------------
> > > > > >>  1 file changed, 16 insertions(+), 12 deletions(-)
> > > > > >>
> > > > > >> diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
> > > > > >> index 9250b3f54a8f..0299d82275e7 100644
> > > > > >> --- a/drivers/block/zram/zram_drv.c
> > > > > >> +++ b/drivers/block/zram/zram_drv.c
> > > > > >> @@ -708,6 +708,7 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity)
> > > > > >>  {
> > > > > >>  	size_t index;
> > > > > >>  	struct zram_meta *meta;
> > > > > >> +	struct zcomp *comp;
> > > > > >>  
> > > > > >>  	down_write(&zram->init_lock);
> > > > > >>  
> > > > > >> @@ -719,20 +720,10 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity)
> > > > > >>  	}
> > > > > >>  
> > > > > >>  	meta = zram->meta;
> > > > > >> -	/* Free all pages that are still in this zram device */
> > > > > >> -	for (index = 0; index < zram->disksize >> PAGE_SHIFT; index++) {
> > > > > >> -		unsigned long handle = meta->table[index].handle;
> > > > > >> -		if (!handle)
> > > > > >> -			continue;
> > > > > >> -
> > > > > >> -		zs_free(meta->mem_pool, handle);
> > > > > >> -	}
> > > > > >> -
> > > > > >> -	zcomp_destroy(zram->comp);
> > > > > > 
> > > > > > I'm not so sure about moving zcomp destruction. if we would have detached it
> > > > > > from zram, then yes. otherwise, think of zram ->destoy vs ->init race.
> > > > > > 
> > > > > > suppose,
> > > > > > CPU1 waits for down_write() init lock in disksize_store() with new comp already allocated;
> > > > > > CPU0 detaches ->meta and releases write init lock;
> > > > > > CPU1 grabs the lock and does zram->comp = comp;
> > > > > > CPU0 reaches the point of zcomp_destroy(zram->comp);
> > > > > 
> > > > > I don't see your point: this patch does not call
> > > > > zcomp_destroy(zram->comp) anymore, but zram_destroy(comp), where comp is
> > > > > the old zram->comp.
> > > > 
> > > > 
> > > > oh... yes. sorry! my bad.
> > > > 
> > > > 
> > > > 
> > > > anyway, on a second thought, do we even want to destoy meta out of init_lock?
> > > > 
> > > > I mean, it will let you init new device quicker. but... assume, you have
> > > > 30G zram (or any other bad-enough number). on CPU0 you reset device -- iterate
> > > > over 30G meta->table, etc. out of init_lock.
> > > > on CPU1 you concurrently re-init device and request again 30G.
> > > > 
> > > > how bad that can be?
> > > > 
> > > > 
> > > > 
> > > > diskstore called on already initialised device is also not so perfect.
> > > > we first will try to allocate ->meta (vmalloc pages for another 30G),
> > > > then allocate comp, then down_write() init lock to find out that device
> > > > is initialised and we need to release allocated memory.
> > > > 
> > > > 
> > > > 
> > > > may be we better keep ->meta destruction under init_lock and additionally
> > > > move ->meta and ->comp allocation under init_lock in disksize_store()?
> > > > 
> > > > like the following one:
> > > > 
> > > > ---
> > > > 
> > > >  drivers/block/zram/zram_drv.c | 25 +++++++++++++------------
> > > >  1 file changed, 13 insertions(+), 12 deletions(-)
> > > > 
> > > > diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
> > > > index 9250b3f..827ab21 100644
> > > > --- a/drivers/block/zram/zram_drv.c
> > > > +++ b/drivers/block/zram/zram_drv.c
> > > > @@ -765,9 +765,18 @@ static ssize_t disksize_store(struct device *dev,
> > > >  		return -EINVAL;
> > > >  
> > > >  	disksize = PAGE_ALIGN(disksize);
> > > > +	down_write(&zram->init_lock);
> > > > +	if (init_done(zram)) {
> > > > +		up_write(&zram->init_lock);
> > > > +		pr_info("Cannot change disksize for initialized device\n");
> > > > +		return -EBUSY;
> > > > +	}
> > > > +
> > > >  	meta = zram_meta_alloc(zram->disk->first_minor, disksize);
> > > > -	if (!meta)
> > > > -		return -ENOMEM;
> > > > +	if (!meta) {
> > > > +		err = -ENOMEM;
> > > > +		goto out_unlock;
> > > > +	}
> > > >  
> > > >  	comp = zcomp_create(zram->compressor, zram->max_comp_streams);
> > > >  	if (IS_ERR(comp)) {
> > > > @@ -777,13 +786,6 @@ static ssize_t disksize_store(struct device *dev,
> > > >  		goto out_free_meta;
> > > >  	}
> > > >  
> > > > -	down_write(&zram->init_lock);
> > > > -	if (init_done(zram)) {
> > > > -		pr_info("Cannot change disksize for initialized device\n");
> > > > -		err = -EBUSY;
> > > > -		goto out_destroy_comp;
> > > > -	}
> > > > -
> > > >  	zram->meta = meta;
> > > >  	zram->comp = comp;
> > > >  	zram->disksize = disksize;
> > > > @@ -799,11 +801,10 @@ static ssize_t disksize_store(struct device *dev,
> > > >  
> > > >  	return len;
> > > >  
> > > > -out_destroy_comp:
> > > > -	up_write(&zram->init_lock);
> > > > -	zcomp_destroy(comp);
> > > >  out_free_meta:
> > > >  	zram_meta_free(meta);
> > > > +out_unlock:
> > > > +	up_write(&zram->init_lock);
> > > >  	return err;
> > > >  }
> > > >  
> > > 
> > > The init_lock is really troublesome. We can't do call zram_meta_alloc
> > > under init_lock due to lockdep report. Please keep in mind.
> > >
> > 
> > ah... I do recall it, thanks for your reminder.
> > 
> > 
> > > The zram_rw_page is one of the function under reclaim path and hold it
> > > as read_lock while here holds it as write_lock.
> > > It's a false positive so that we might could make shut lockdep up
> > > by annotation but I don't want it but want to work with lockdep rather
> > > than disable. As well, there are other pathes to use init_lock to
> > > protect other data where would be victims of lockdep.
> > > 
> > > I didn't tell the motivation of this patch because it made you busy
> > > guys wasted. Let me tell it now. It was another lockdep report by
> > > kmem_cache_destroy for zsmalloc compaction about init_lock. That's why
> > > the patchset was one of the patch in compaction.
> > >
> > > Yes, the ideal is to remove horrible init_lock of zram in this phase and
> > > make code more simple and clear but I don't want to stuck zsmalloc
> > > compaction by the work.
> > 
> > 
> > > Having said that, I feel it's time to revisit
> > > to remove init_lock.
> > > At least, I will think over to find a solution to kill init_lock.
> > 
> > hm, can't think of anything quick...
> > 
> > 	-ss
> 
> Hello guys,
> 
> How about this?
> 
> It's based on Ganesh's patch.
> https://lkml.org/lkml/2015/1/24/50
(I see no similarities with Ganesh's patch)

hm, you probably meant this one https://lkml.org/lkml/2015/1/23/406


at a glance this makes things a bit more complicated, so I need to think more.

> From afda9fd2f6c40dd0745d8a6babe78c5cbdceddf5 Mon Sep 17 00:00:00 2001
> From: Minchan Kim <minchan@kernel.org>
> Date: Mon, 26 Jan 2015 14:34:10 +0900
> Subject: [RFC] zram: remove init_lock in zram_make_request
> 
> Admin could reset zram during I/O operation going on so we have
> used zram->init_lock as read-side lock in I/O path to prevent
> sudden zram meta freeing.
> 
> However, the init_lock is really troublesome.
> We can't do call zram_meta_alloc under init_lock due to lockdep splat
> because zram_rw_page is one of the function under reclaim path and
> hold it as read_lock while other places in process context hold it
> as write_lock. So, we have used allocation out of the lock to avoid
> lockdep warn but it's not good for readability and fainally, I met
> another lockdep splat between init_lock and cpu_hotpulug from
> kmem_cache_destroy during wokring zsmalloc compaction. :(
> 
> Yes, the ideal is to remove horrible init_lock of zram in rw path.
> This patch removes it in rw path and instead, put init_done bool
> variable to check initialization done with smp_[wmb|rmb] and
> srcu_[un]read_lock to prevent sudden zram meta freeing
> during I/O operation.
> 
> Signed-off-by: Minchan Kim <minchan@kernel.org>
> ---
>  drivers/block/zram/zram_drv.c | 76 +++++++++++++++++++++++++++++--------------
>  drivers/block/zram/zram_drv.h |  5 +++
>  2 files changed, 57 insertions(+), 24 deletions(-)
> 
> diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
> index a598ada817f0..e06ff975f997 100644
> --- a/drivers/block/zram/zram_drv.c
> +++ b/drivers/block/zram/zram_drv.c
> @@ -32,6 +32,7 @@
>  #include <linux/string.h>
>  #include <linux/vmalloc.h>
>  #include <linux/err.h>
> +#include <linux/srcu.h>
>  
>  #include "zram_drv.h"
>  
> @@ -53,9 +54,16 @@ static ssize_t name##_show(struct device *d,		\
>  }									\
>  static DEVICE_ATTR_RO(name);
>  
> -static inline int init_done(struct zram *zram)
> +static inline bool init_done(struct zram *zram)
>  {
> -	return zram->meta != NULL;
> +	/*
> +	 * init_done can be used without holding zram->init_lock in
> +	 * read/write handler(ie, zram_make_request) but we should make sure
> +	 * that zram->init_done should set up after meta initialization is
> +	 * done. Look at disksize_store.
> +	 */
> +	smp_rmb();
> +	return zram->init_done;

->init_done returns back :)
can we rely on write ->meta; wmb; --- rmb; read ->meta?

how much performance do we lose on barriers?

>  }
>  
>  static inline struct zram *dev_to_zram(struct device *dev)
> @@ -326,6 +334,10 @@ static void zram_meta_free(struct zram_meta *meta)
>  	kfree(meta);
>  }
>  
> +static void rcu_zram_do_nothing(struct rcu_head *unused)
> +{
> +}
> +
>  static struct zram_meta *zram_meta_alloc(int device_id, u64 disksize)
>  {
>  	char pool_name[8];
> @@ -726,11 +738,8 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity)
>  		return;
>  	}
>  
> -	zcomp_destroy(zram->comp);
>  	zram->max_comp_streams = 1;
>  
> -	zram_meta_free(zram->meta);
> -	zram->meta = NULL;
>  	/* Reset stats */
>  	memset(&zram->stats, 0, sizeof(zram->stats));
>  
> @@ -738,8 +747,12 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity)
>  	if (reset_capacity)
>  		set_capacity(zram->disk, 0);
>  
> +	zram->init_done = false;

missing wmb?

I think we'd also better put a comment after every wmb/rmb, like

	smp_wmb(); /* pairs with rmb() in foo() */


> +	call_srcu(&zram->srcu, &zram->rcu, rcu_zram_do_nothing);
> +	synchronize_srcu(&zram->srcu);
> +	zram_meta_free(zram->meta);
> +	zcomp_destroy(zram->comp);
>  	up_write(&zram->init_lock);
> -
>  	/*
>  	 * Revalidate disk out of the init_lock to avoid lockdep splat.
>  	 * It's okay because disk's capacity is protected by init_lock
> @@ -762,10 +775,19 @@ static ssize_t disksize_store(struct device *dev,
>  	if (!disksize)
>  		return -EINVAL;
>  
> +	down_write(&zram->init_lock);
> +	if (init_done(zram)) {
> +		pr_info("Cannot change disksize for initialized device\n");
> +		up_write(&zram->init_lock);
> +		return -EBUSY;
> +	}
> +
>  	disksize = PAGE_ALIGN(disksize);
>  	meta = zram_meta_alloc(zram->disk->first_minor, disksize);
> -	if (!meta)
> +	if (!meta) {
> +		up_write(&zram->init_lock);
>  		return -ENOMEM;
> +	}
>  
>  	comp = zcomp_create(zram->compressor, zram->max_comp_streams);
>  	if (IS_ERR(comp)) {
> @@ -775,17 +797,17 @@ static ssize_t disksize_store(struct device *dev,
>  		goto out_free_meta;
>  	}
>  
> -	down_write(&zram->init_lock);
> -	if (init_done(zram)) {
> -		pr_info("Cannot change disksize for initialized device\n");
> -		err = -EBUSY;
> -		goto out_destroy_comp;
> -	}
> -
>  	zram->meta = meta;
>  	zram->comp = comp;
>  	zram->disksize = disksize;
>  	set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT);
> +	/*
> +	 * Store operation of struct zram fields should complete
> +	 * before init_done set up because zram_bvec_rw doesn't
> +	 * hold an zram->init_lock.
> +	 */
> +	smp_wmb();
> +	zram->init_done = true;
>  	up_write(&zram->init_lock);
>  
>  	/*
> @@ -797,10 +819,8 @@ static ssize_t disksize_store(struct device *dev,
>  
>  	return len;
>  
> -out_destroy_comp:
> -	up_write(&zram->init_lock);
> -	zcomp_destroy(comp);
>  out_free_meta:
> +	up_write(&zram->init_lock);
>  	zram_meta_free(meta);

 zram_meta_free(meta);
 up_write(&zram->init_lock);

 ?

>  	return err;
>  }
> @@ -905,9 +925,10 @@ out:
>   */
>  static void zram_make_request(struct request_queue *queue, struct bio *bio)
>  {
> +	int idx;
>  	struct zram *zram = queue->queuedata;
>  
> -	down_read(&zram->init_lock);
> +	idx = srcu_read_lock(&zram->srcu);
>  	if (unlikely(!init_done(zram)))
>  		goto error;
>  
> @@ -918,12 +939,12 @@ static void zram_make_request(struct request_queue *queue, struct bio *bio)
>  	}
>  
>  	__zram_make_request(zram, bio);
> -	up_read(&zram->init_lock);
> +	srcu_read_unlock(&zram->srcu, idx);
>  
>  	return;
>  
>  error:
> -	up_read(&zram->init_lock);
> +	srcu_read_unlock(&zram->srcu, idx);
>  	bio_io_error(bio);
>  }
>  
> @@ -945,18 +966,20 @@ static void zram_slot_free_notify(struct block_device *bdev,
>  static int zram_rw_page(struct block_device *bdev, sector_t sector,
>  		       struct page *page, int rw)
>  {
> -	int offset, err;
> +	int offset, err, idx;
>  	u32 index;
>  	struct zram *zram;
>  	struct bio_vec bv;
>  
>  	zram = bdev->bd_disk->private_data;
> +	idx = srcu_read_lock(&zram->srcu);
> +
>  	if (!valid_io_request(zram, sector, PAGE_SIZE)) {
>  		atomic64_inc(&zram->stats.invalid_io);
> +		srcu_read_unlock(&zram->srcu, idx);
>  		return -EINVAL;
>  	}
>  
> -	down_read(&zram->init_lock);
>  	if (unlikely(!init_done(zram))) {
>  		err = -EIO;
>  		goto out_unlock;
> @@ -971,7 +994,7 @@ static int zram_rw_page(struct block_device *bdev, sector_t sector,
>  
>  	err = zram_bvec_rw(zram, &bv, index, offset, rw);
>  out_unlock:
> -	up_read(&zram->init_lock);
> +	srcu_read_unlock(&zram->srcu, idx);
>  	/*
>  	 * If I/O fails, just return error(ie, non-zero) without
>  	 * calling page_endio.
> @@ -1041,6 +1064,11 @@ static int create_device(struct zram *zram, int device_id)
>  
>  	init_rwsem(&zram->init_lock);
>  
> +	if (init_srcu_struct(&zram->srcu)) {
> +		pr_err("Error initialize srcu for device %d\n", device_id);
> +		goto out;
> +	}
> +
>  	zram->queue = blk_alloc_queue(GFP_KERNEL);
>  	if (!zram->queue) {
>  		pr_err("Error allocating disk queue for device %d\n",
> @@ -1125,8 +1153,8 @@ static void destroy_device(struct zram *zram)
>  
>  	del_gendisk(zram->disk);
>  	put_disk(zram->disk);
> -
>  	blk_cleanup_queue(zram->queue);
> +	cleanup_srcu_struct(&zram->srcu);
>  }
>  
>  static int __init zram_init(void)
> diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
> index e492f6bf11f1..2042c310aea8 100644
> --- a/drivers/block/zram/zram_drv.h
> +++ b/drivers/block/zram/zram_drv.h
> @@ -105,8 +105,13 @@ struct zram {
>  	struct gendisk *disk;
>  	struct zcomp *comp;
>  
> +	struct srcu_struct srcu;
> +	struct rcu_head rcu;
> +
>  	/* Prevent concurrent execution of device init, reset and R/W request */
>  	struct rw_semaphore init_lock;
> +	bool init_done;
> +
>  	/*
>  	 * This is the limit on amount of *uncompressed* worth of data
>  	 * we can store in a disk.
> -- 
> 1.9.1
> 

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/2] zram: free meta out of init_lock
  2015-01-27  2:17             ` Sergey Senozhatsky
@ 2015-01-27  3:18               ` Minchan Kim
  2015-01-27  4:03                 ` Sergey Senozhatsky
  0 siblings, 1 reply; 32+ messages in thread
From: Minchan Kim @ 2015-01-27  3:18 UTC (permalink / raw)
  To: Sergey Senozhatsky
  Cc: Sergey Senozhatsky, Jerome Marchand, Andrew Morton, linux-kernel,
	linux-mm, Nitin Gupta

Hello Sergey,

On Tue, Jan 27, 2015 at 11:17:04AM +0900, Sergey Senozhatsky wrote:
> On (01/27/15 01:00), Minchan Kim wrote:
> > On Mon, Jan 26, 2015 at 11:17:09PM +0900, Sergey Senozhatsky wrote:
> > > Hello,
> > > 
> > > On (01/26/15 10:33), Minchan Kim wrote:
> > > > Hello,
> > > > 
> > > > On Sat, Jan 24, 2015 at 12:47:07AM +0900, Sergey Senozhatsky wrote:
> > > > > On (01/23/15 15:48), Jerome Marchand wrote:
> > > > > > On 01/23/2015 03:24 PM, Sergey Senozhatsky wrote:
> > > > > > > On (01/23/15 14:58), Minchan Kim wrote:
> > > > > > >> We don't need to call zram_meta_free, zcomp_destroy and zs_free
> > > > > > >> under init_lock. What we need to prevent race with init_lock
> > > > > > >> in reset is setting NULL into zram->meta (ie, init_done).
> > > > > > >> This patch does it.
> > > > > > >>
> > > > > > >> Signed-off-by: Minchan Kim <minchan@kernel.org>
> > > > > > >> ---
> > > > > > >>  drivers/block/zram/zram_drv.c | 28 ++++++++++++++++------------
> > > > > > >>  1 file changed, 16 insertions(+), 12 deletions(-)
> > > > > > >>
> > > > > > >> diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
> > > > > > >> index 9250b3f54a8f..0299d82275e7 100644
> > > > > > >> --- a/drivers/block/zram/zram_drv.c
> > > > > > >> +++ b/drivers/block/zram/zram_drv.c
> > > > > > >> @@ -708,6 +708,7 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity)
> > > > > > >>  {
> > > > > > >>  	size_t index;
> > > > > > >>  	struct zram_meta *meta;
> > > > > > >> +	struct zcomp *comp;
> > > > > > >>  
> > > > > > >>  	down_write(&zram->init_lock);
> > > > > > >>  
> > > > > > >> @@ -719,20 +720,10 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity)
> > > > > > >>  	}
> > > > > > >>  
> > > > > > >>  	meta = zram->meta;
> > > > > > >> -	/* Free all pages that are still in this zram device */
> > > > > > >> -	for (index = 0; index < zram->disksize >> PAGE_SHIFT; index++) {
> > > > > > >> -		unsigned long handle = meta->table[index].handle;
> > > > > > >> -		if (!handle)
> > > > > > >> -			continue;
> > > > > > >> -
> > > > > > >> -		zs_free(meta->mem_pool, handle);
> > > > > > >> -	}
> > > > > > >> -
> > > > > > >> -	zcomp_destroy(zram->comp);
> > > > > > > 
> > > > > > > I'm not so sure about moving zcomp destruction. if we would have detached it
> > > > > > > from zram, then yes. otherwise, think of zram ->destoy vs ->init race.
> > > > > > > 
> > > > > > > suppose,
> > > > > > > CPU1 waits for down_write() init lock in disksize_store() with new comp already allocated;
> > > > > > > CPU0 detaches ->meta and releases write init lock;
> > > > > > > CPU1 grabs the lock and does zram->comp = comp;
> > > > > > > CPU0 reaches the point of zcomp_destroy(zram->comp);
> > > > > > 
> > > > > > I don't see your point: this patch does not call
> > > > > > zcomp_destroy(zram->comp) anymore, but zram_destroy(comp), where comp is
> > > > > > the old zram->comp.
> > > > > 
> > > > > 
> > > > > oh... yes. sorry! my bad.
> > > > > 
> > > > > 
> > > > > 
> > > > > anyway, on a second thought, do we even want to destoy meta out of init_lock?
> > > > > 
> > > > > I mean, it will let you init new device quicker. but... assume, you have
> > > > > 30G zram (or any other bad-enough number). on CPU0 you reset device -- iterate
> > > > > over 30G meta->table, etc. out of init_lock.
> > > > > on CPU1 you concurrently re-init device and request again 30G.
> > > > > 
> > > > > how bad that can be?
> > > > > 
> > > > > 
> > > > > 
> > > > > diskstore called on already initialised device is also not so perfect.
> > > > > we first will try to allocate ->meta (vmalloc pages for another 30G),
> > > > > then allocate comp, then down_write() init lock to find out that device
> > > > > is initialised and we need to release allocated memory.
> > > > > 
> > > > > 
> > > > > 
> > > > > may be we better keep ->meta destruction under init_lock and additionally
> > > > > move ->meta and ->comp allocation under init_lock in disksize_store()?
> > > > > 
> > > > > like the following one:
> > > > > 
> > > > > ---
> > > > > 
> > > > >  drivers/block/zram/zram_drv.c | 25 +++++++++++++------------
> > > > >  1 file changed, 13 insertions(+), 12 deletions(-)
> > > > > 
> > > > > diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
> > > > > index 9250b3f..827ab21 100644
> > > > > --- a/drivers/block/zram/zram_drv.c
> > > > > +++ b/drivers/block/zram/zram_drv.c
> > > > > @@ -765,9 +765,18 @@ static ssize_t disksize_store(struct device *dev,
> > > > >  		return -EINVAL;
> > > > >  
> > > > >  	disksize = PAGE_ALIGN(disksize);
> > > > > +	down_write(&zram->init_lock);
> > > > > +	if (init_done(zram)) {
> > > > > +		up_write(&zram->init_lock);
> > > > > +		pr_info("Cannot change disksize for initialized device\n");
> > > > > +		return -EBUSY;
> > > > > +	}
> > > > > +
> > > > >  	meta = zram_meta_alloc(zram->disk->first_minor, disksize);
> > > > > -	if (!meta)
> > > > > -		return -ENOMEM;
> > > > > +	if (!meta) {
> > > > > +		err = -ENOMEM;
> > > > > +		goto out_unlock;
> > > > > +	}
> > > > >  
> > > > >  	comp = zcomp_create(zram->compressor, zram->max_comp_streams);
> > > > >  	if (IS_ERR(comp)) {
> > > > > @@ -777,13 +786,6 @@ static ssize_t disksize_store(struct device *dev,
> > > > >  		goto out_free_meta;
> > > > >  	}
> > > > >  
> > > > > -	down_write(&zram->init_lock);
> > > > > -	if (init_done(zram)) {
> > > > > -		pr_info("Cannot change disksize for initialized device\n");
> > > > > -		err = -EBUSY;
> > > > > -		goto out_destroy_comp;
> > > > > -	}
> > > > > -
> > > > >  	zram->meta = meta;
> > > > >  	zram->comp = comp;
> > > > >  	zram->disksize = disksize;
> > > > > @@ -799,11 +801,10 @@ static ssize_t disksize_store(struct device *dev,
> > > > >  
> > > > >  	return len;
> > > > >  
> > > > > -out_destroy_comp:
> > > > > -	up_write(&zram->init_lock);
> > > > > -	zcomp_destroy(comp);
> > > > >  out_free_meta:
> > > > >  	zram_meta_free(meta);
> > > > > +out_unlock:
> > > > > +	up_write(&zram->init_lock);
> > > > >  	return err;
> > > > >  }
> > > > >  
> > > > 
> > > > The init_lock is really troublesome. We can't do call zram_meta_alloc
> > > > under init_lock due to lockdep report. Please keep in mind.
> > > >
> > > 
> > > ah... I do recall it, thanks for your reminder.
> > > 
> > > 
> > > > The zram_rw_page is one of the function under reclaim path and hold it
> > > > as read_lock while here holds it as write_lock.
> > > > It's a false positive so that we might could make shut lockdep up
> > > > by annotation but I don't want it but want to work with lockdep rather
> > > > than disable. As well, there are other pathes to use init_lock to
> > > > protect other data where would be victims of lockdep.
> > > > 
> > > > I didn't tell the motivation of this patch because it made you busy
> > > > guys wasted. Let me tell it now. It was another lockdep report by
> > > > kmem_cache_destroy for zsmalloc compaction about init_lock. That's why
> > > > the patchset was one of the patch in compaction.
> > > >
> > > > Yes, the ideal is to remove horrible init_lock of zram in this phase and
> > > > make code more simple and clear but I don't want to stuck zsmalloc
> > > > compaction by the work.
> > > 
> > > 
> > > > Having said that, I feel it's time to revisit
> > > > to remove init_lock.
> > > > At least, I will think over to find a solution to kill init_lock.
> > > 
> > > hm, can't think of anything quick...
> > > 
> > > 	-ss
> > 
> > Hello guys,
> > 
> > How about this?
> > 
> > It's based on Ganesh's patch.
> > https://lkml.org/lkml/2015/1/24/50
> (I see no similarities with Ganesh's patch)
> 
> hm, you probably meant this one https://lkml.org/lkml/2015/1/23/406
> 
> 
> at glance this makes things a bit more complicated, so I need to think more.
> 
> > From afda9fd2f6c40dd0745d8a6babe78c5cbdceddf5 Mon Sep 17 00:00:00 2001
> > From: Minchan Kim <minchan@kernel.org>
> > Date: Mon, 26 Jan 2015 14:34:10 +0900
> > Subject: [RFC] zram: remove init_lock in zram_make_request
> > 
> > Admin could reset zram during I/O operation going on so we have
> > used zram->init_lock as read-side lock in I/O path to prevent
> > sudden zram meta freeing.
> > 
> > However, the init_lock is really troublesome.
> > We can't do call zram_meta_alloc under init_lock due to lockdep splat
> > because zram_rw_page is one of the function under reclaim path and
> > hold it as read_lock while other places in process context hold it
> > as write_lock. So, we have used allocation out of the lock to avoid
> > lockdep warn but it's not good for readability and fainally, I met
> > another lockdep splat between init_lock and cpu_hotpulug from
> > kmem_cache_destroy during wokring zsmalloc compaction. :(
> > 
> > Yes, the ideal is to remove horrible init_lock of zram in rw path.
> > This patch removes it in rw path and instead, put init_done bool
> > variable to check initialization done with smp_[wmb|rmb] and
> > srcu_[un]read_lock to prevent sudden zram meta freeing
> > during I/O operation.
> > 
> > Signed-off-by: Minchan Kim <minchan@kernel.org>
> > ---
> >  drivers/block/zram/zram_drv.c | 76 +++++++++++++++++++++++++++++--------------
> >  drivers/block/zram/zram_drv.h |  5 +++
> >  2 files changed, 57 insertions(+), 24 deletions(-)
> > 
> > diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
> > index a598ada817f0..e06ff975f997 100644
> > --- a/drivers/block/zram/zram_drv.c
> > +++ b/drivers/block/zram/zram_drv.c
> > @@ -32,6 +32,7 @@
> >  #include <linux/string.h>
> >  #include <linux/vmalloc.h>
> >  #include <linux/err.h>
> > +#include <linux/srcu.h>
> >  
> >  #include "zram_drv.h"
> >  
> > @@ -53,9 +54,16 @@ static ssize_t name##_show(struct device *d,		\
> >  }									\
> >  static DEVICE_ATTR_RO(name);
> >  
> > -static inline int init_done(struct zram *zram)
> > +static inline bool init_done(struct zram *zram)
> >  {
> > -	return zram->meta != NULL;
> > +	/*
> > +	 * init_done can be used without holding zram->init_lock in
> > +	 * read/write handler(ie, zram_make_request) but we should make sure
> > +	 * that zram->init_done should set up after meta initialization is
> > +	 * done. Look at disksize_store.
> > +	 */
> > +	smp_rmb();
> > +	return zram->init_done;
> 
> ->init_done returns back :)


> can we rely on write ->meta; wmb; --- rmb; read ->meta?

Might be possible.

> 
> how much performance do we lose on barriers?

I think it's not much compared to locking, which does far more (i.e.,
barrier, fairness, spin on owner and so on) than such a simple barrier.

> 
> >  }
> >  
> >  static inline struct zram *dev_to_zram(struct device *dev)
> > @@ -326,6 +334,10 @@ static void zram_meta_free(struct zram_meta *meta)
> >  	kfree(meta);
> >  }
> >  
> > +static void rcu_zram_do_nothing(struct rcu_head *unused)
> > +{
> > +}
> > +
> >  static struct zram_meta *zram_meta_alloc(int device_id, u64 disksize)
> >  {
> >  	char pool_name[8];
> > @@ -726,11 +738,8 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity)
> >  		return;
> >  	}
> >  
> > -	zcomp_destroy(zram->comp);
> >  	zram->max_comp_streams = 1;
> >  
> > -	zram_meta_free(zram->meta);
> > -	zram->meta = NULL;
> >  	/* Reset stats */
> >  	memset(&zram->stats, 0, sizeof(zram->stats));
> >  
> > @@ -738,8 +747,12 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity)
> >  	if (reset_capacity)
> >  		set_capacity(zram->disk, 0);
> >  
> > +	zram->init_done = false;
> 
> missing wmb?

I thought about it, but the comment for call_srcu says
"each cpu is guaranteed to have executed a full memory barrier", so
I decided we don't need it. Right? (i.e., please double check)

> 
> I think we also better put comments after every wmb/rmb. like
> 
> 	smp_wmb(); /* pairs with rmb() in foo() */

I already put such comments on the other smp_rmb/wmb.
If that's not what you want, please suggest an alternative. :)

> 
> 
> > +	call_srcu(&zram->srcu, &zram->rcu, rcu_zram_do_nothing);
> > +	synchronize_srcu(&zram->srcu);
> > +	zram_meta_free(zram->meta);
> > +	zcomp_destroy(zram->comp);
> >  	up_write(&zram->init_lock);
> > -
> >  	/*
> >  	 * Revalidate disk out of the init_lock to avoid lockdep splat.
> >  	 * It's okay because disk's capacity is protected by init_lock
> > @@ -762,10 +775,19 @@ static ssize_t disksize_store(struct device *dev,
> >  	if (!disksize)
> >  		return -EINVAL;
> >  
> > +	down_write(&zram->init_lock);
> > +	if (init_done(zram)) {
> > +		pr_info("Cannot change disksize for initialized device\n");
> > +		up_write(&zram->init_lock);
> > +		return -EBUSY;
> > +	}
> > +
> >  	disksize = PAGE_ALIGN(disksize);
> >  	meta = zram_meta_alloc(zram->disk->first_minor, disksize);
> > -	if (!meta)
> > +	if (!meta) {
> > +		up_write(&zram->init_lock);
> >  		return -ENOMEM;
> > +	}
> >  
> >  	comp = zcomp_create(zram->compressor, zram->max_comp_streams);
> >  	if (IS_ERR(comp)) {
> > @@ -775,17 +797,17 @@ static ssize_t disksize_store(struct device *dev,
> >  		goto out_free_meta;
> >  	}
> >  
> > -	down_write(&zram->init_lock);
> > -	if (init_done(zram)) {
> > -		pr_info("Cannot change disksize for initialized device\n");
> > -		err = -EBUSY;
> > -		goto out_destroy_comp;
> > -	}
> > -
> >  	zram->meta = meta;
> >  	zram->comp = comp;
> >  	zram->disksize = disksize;
> >  	set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT);
> > +	/*
> > +	 * Store operation of struct zram fields should complete
> > +	 * before init_done set up because zram_bvec_rw doesn't
> > +	 * hold an zram->init_lock.
> > +	 */
> > +	smp_wmb();
> > +	zram->init_done = true;
> >  	up_write(&zram->init_lock);
> >  
> >  	/*
> > @@ -797,10 +819,8 @@ static ssize_t disksize_store(struct device *dev,
> >  
> >  	return len;
> >  
> > -out_destroy_comp:
> > -	up_write(&zram->init_lock);
> > -	zcomp_destroy(comp);
> >  out_free_meta:
> > +	up_write(&zram->init_lock);
> >  	zram_meta_free(meta);
> 
>  zram_meta_free(meta);
>  up_write(&zram->init_lock);
> 
>  ?

I don't think we should release meta under init_lock.
Do you have any reason I am missing?

> 
> >  	return err;
> >  }
> > @@ -905,9 +925,10 @@ out:
> >   */
> >  static void zram_make_request(struct request_queue *queue, struct bio *bio)
> >  {
> > +	int idx;
> >  	struct zram *zram = queue->queuedata;
> >  
> > -	down_read(&zram->init_lock);
> > +	idx = srcu_read_lock(&zram->srcu);
> >  	if (unlikely(!init_done(zram)))
> >  		goto error;
> >  
> > @@ -918,12 +939,12 @@ static void zram_make_request(struct request_queue *queue, struct bio *bio)
> >  	}
> >  
> >  	__zram_make_request(zram, bio);
> > -	up_read(&zram->init_lock);
> > +	srcu_read_unlock(&zram->srcu, idx);
> >  
> >  	return;
> >  
> >  error:
> > -	up_read(&zram->init_lock);
> > +	srcu_read_unlock(&zram->srcu, idx);
> >  	bio_io_error(bio);
> >  }
> >  
> > @@ -945,18 +966,20 @@ static void zram_slot_free_notify(struct block_device *bdev,
> >  static int zram_rw_page(struct block_device *bdev, sector_t sector,
> >  		       struct page *page, int rw)
> >  {
> > -	int offset, err;
> > +	int offset, err, idx;
> >  	u32 index;
> >  	struct zram *zram;
> >  	struct bio_vec bv;
> >  
> >  	zram = bdev->bd_disk->private_data;
> > +	idx = srcu_read_lock(&zram->srcu);
> > +
> >  	if (!valid_io_request(zram, sector, PAGE_SIZE)) {
> >  		atomic64_inc(&zram->stats.invalid_io);
> > +		srcu_read_unlock(&zram->srcu, idx);
> >  		return -EINVAL;
> >  	}
> >  
> > -	down_read(&zram->init_lock);
> >  	if (unlikely(!init_done(zram))) {
> >  		err = -EIO;
> >  		goto out_unlock;
> > @@ -971,7 +994,7 @@ static int zram_rw_page(struct block_device *bdev, sector_t sector,
> >  
> >  	err = zram_bvec_rw(zram, &bv, index, offset, rw);
> >  out_unlock:
> > -	up_read(&zram->init_lock);
> > +	srcu_read_unlock(&zram->srcu, idx);
> >  	/*
> >  	 * If I/O fails, just return error(ie, non-zero) without
> >  	 * calling page_endio.
> > @@ -1041,6 +1064,11 @@ static int create_device(struct zram *zram, int device_id)
> >  
> >  	init_rwsem(&zram->init_lock);
> >  
> > +	if (init_srcu_struct(&zram->srcu)) {
> > +		pr_err("Error initialize srcu for device %d\n", device_id);
> > +		goto out;
> > +	}
> > +
> >  	zram->queue = blk_alloc_queue(GFP_KERNEL);
> >  	if (!zram->queue) {
> >  		pr_err("Error allocating disk queue for device %d\n",
> > @@ -1125,8 +1153,8 @@ static void destroy_device(struct zram *zram)
> >  
> >  	del_gendisk(zram->disk);
> >  	put_disk(zram->disk);
> > -
> >  	blk_cleanup_queue(zram->queue);
> > +	cleanup_srcu_struct(&zram->srcu);
> >  }
> >  
> >  static int __init zram_init(void)
> > diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
> > index e492f6bf11f1..2042c310aea8 100644
> > --- a/drivers/block/zram/zram_drv.h
> > +++ b/drivers/block/zram/zram_drv.h
> > @@ -105,8 +105,13 @@ struct zram {
> >  	struct gendisk *disk;
> >  	struct zcomp *comp;
> >  
> > +	struct srcu_struct srcu;
> > +	struct rcu_head rcu;
> > +
> >  	/* Prevent concurrent execution of device init, reset and R/W request */
> >  	struct rw_semaphore init_lock;
> > +	bool init_done;
> > +
> >  	/*
> >  	 * This is the limit on amount of *uncompressed* worth of data
> >  	 * we can store in a disk.
> > -- 
> > 1.9.1
> > 

-- 
Kind regards,
Minchan Kim

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/2] zram: free meta out of init_lock
  2015-01-27  3:18               ` Minchan Kim
@ 2015-01-27  4:03                 ` Sergey Senozhatsky
  2015-01-28  0:15                   ` Minchan Kim
  0 siblings, 1 reply; 32+ messages in thread
From: Sergey Senozhatsky @ 2015-01-27  4:03 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Sergey Senozhatsky, Sergey Senozhatsky, Jerome Marchand,
	Andrew Morton, linux-kernel, linux-mm, Nitin Gupta

Hello,

On (01/27/15 12:18), Minchan Kim wrote:
> Hello Sergey,
> 
> On Tue, Jan 27, 2015 at 11:17:04AM +0900, Sergey Senozhatsky wrote:
> > On (01/27/15 01:00), Minchan Kim wrote:
> > > On Mon, Jan 26, 2015 at 11:17:09PM +0900, Sergey Senozhatsky wrote:
> > > > Hello,
> > > > 
> > > > On (01/26/15 10:33), Minchan Kim wrote:
> > > > > Hello,
> > > > > 
> > > > > On Sat, Jan 24, 2015 at 12:47:07AM +0900, Sergey Senozhatsky wrote:
> > > > > > On (01/23/15 15:48), Jerome Marchand wrote:
> > > > > > > On 01/23/2015 03:24 PM, Sergey Senozhatsky wrote:
> > > > > > > > On (01/23/15 14:58), Minchan Kim wrote:
> > > > > > > >> We don't need to call zram_meta_free, zcomp_destroy and zs_free
> > > > > > > >> under init_lock. What we need to prevent race with init_lock
> > > > > > > >> in reset is setting NULL into zram->meta (ie, init_done).
> > > > > > > >> This patch does it.
> > > > > > > >>
> > > > > > > >> Signed-off-by: Minchan Kim <minchan@kernel.org>
> > > > > > > >> ---
> > > > > > > >>  drivers/block/zram/zram_drv.c | 28 ++++++++++++++++------------
> > > > > > > >>  1 file changed, 16 insertions(+), 12 deletions(-)
> > > > > > > >>
> > > > > > > >> diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
> > > > > > > >> index 9250b3f54a8f..0299d82275e7 100644
> > > > > > > >> --- a/drivers/block/zram/zram_drv.c
> > > > > > > >> +++ b/drivers/block/zram/zram_drv.c
> > > > > > > >> @@ -708,6 +708,7 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity)
> > > > > > > >>  {
> > > > > > > >>  	size_t index;
> > > > > > > >>  	struct zram_meta *meta;
> > > > > > > >> +	struct zcomp *comp;
> > > > > > > >>  
> > > > > > > >>  	down_write(&zram->init_lock);
> > > > > > > >>  
> > > > > > > >> @@ -719,20 +720,10 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity)
> > > > > > > >>  	}
> > > > > > > >>  
> > > > > > > >>  	meta = zram->meta;
> > > > > > > >> -	/* Free all pages that are still in this zram device */
> > > > > > > >> -	for (index = 0; index < zram->disksize >> PAGE_SHIFT; index++) {
> > > > > > > >> -		unsigned long handle = meta->table[index].handle;
> > > > > > > >> -		if (!handle)
> > > > > > > >> -			continue;
> > > > > > > >> -
> > > > > > > >> -		zs_free(meta->mem_pool, handle);
> > > > > > > >> -	}
> > > > > > > >> -
> > > > > > > >> -	zcomp_destroy(zram->comp);
> > > > > > > > 
> > > > > > > > I'm not so sure about moving zcomp destruction. if we would have detached it
> > > > > > > > from zram, then yes. otherwise, think of zram ->destoy vs ->init race.
> > > > > > > > 
> > > > > > > > suppose,
> > > > > > > > CPU1 waits for down_write() init lock in disksize_store() with new comp already allocated;
> > > > > > > > CPU0 detaches ->meta and releases write init lock;
> > > > > > > > CPU1 grabs the lock and does zram->comp = comp;
> > > > > > > > CPU0 reaches the point of zcomp_destroy(zram->comp);
> > > > > > > 
> > > > > > > I don't see your point: this patch does not call
> > > > > > > zcomp_destroy(zram->comp) anymore, but zram_destroy(comp), where comp is
> > > > > > > the old zram->comp.
> > > > > > 
> > > > > > 
> > > > > > oh... yes. sorry! my bad.
> > > > > > 
> > > > > > 
> > > > > > 
> > > > > > anyway, on a second thought, do we even want to destoy meta out of init_lock?
> > > > > > 
> > > > > > I mean, it will let you init new device quicker. but... assume, you have
> > > > > > 30G zram (or any other bad-enough number). on CPU0 you reset device -- iterate
> > > > > > over 30G meta->table, etc. out of init_lock.
> > > > > > on CPU1 you concurrently re-init device and request again 30G.
> > > > > > 
> > > > > > how bad that can be?
> > > > > > 
> > > > > > 
> > > > > > 
> > > > > > diskstore called on already initialised device is also not so perfect.
> > > > > > we first will try to allocate ->meta (vmalloc pages for another 30G),
> > > > > > then allocate comp, then down_write() init lock to find out that device
> > > > > > is initialised and we need to release allocated memory.
> > > > > > 
> > > > > > 
> > > > > > 
> > > > > > may be we better keep ->meta destruction under init_lock and additionally
> > > > > > move ->meta and ->comp allocation under init_lock in disksize_store()?
> > > > > > 
> > > > > > like the following one:
> > > > > > 
> > > > > > ---
> > > > > > 
> > > > > >  drivers/block/zram/zram_drv.c | 25 +++++++++++++------------
> > > > > >  1 file changed, 13 insertions(+), 12 deletions(-)
> > > > > > 
> > > > > > diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
> > > > > > index 9250b3f..827ab21 100644
> > > > > > --- a/drivers/block/zram/zram_drv.c
> > > > > > +++ b/drivers/block/zram/zram_drv.c
> > > > > > @@ -765,9 +765,18 @@ static ssize_t disksize_store(struct device *dev,
> > > > > >  		return -EINVAL;
> > > > > >  
> > > > > >  	disksize = PAGE_ALIGN(disksize);
> > > > > > +	down_write(&zram->init_lock);
> > > > > > +	if (init_done(zram)) {
> > > > > > +		up_write(&zram->init_lock);
> > > > > > +		pr_info("Cannot change disksize for initialized device\n");
> > > > > > +		return -EBUSY;
> > > > > > +	}
> > > > > > +
> > > > > >  	meta = zram_meta_alloc(zram->disk->first_minor, disksize);
> > > > > > -	if (!meta)
> > > > > > -		return -ENOMEM;
> > > > > > +	if (!meta) {
> > > > > > +		err = -ENOMEM;
> > > > > > +		goto out_unlock;
> > > > > > +	}
> > > > > >  
> > > > > >  	comp = zcomp_create(zram->compressor, zram->max_comp_streams);
> > > > > >  	if (IS_ERR(comp)) {
> > > > > > @@ -777,13 +786,6 @@ static ssize_t disksize_store(struct device *dev,
> > > > > >  		goto out_free_meta;
> > > > > >  	}
> > > > > >  
> > > > > > -	down_write(&zram->init_lock);
> > > > > > -	if (init_done(zram)) {
> > > > > > -		pr_info("Cannot change disksize for initialized device\n");
> > > > > > -		err = -EBUSY;
> > > > > > -		goto out_destroy_comp;
> > > > > > -	}
> > > > > > -
> > > > > >  	zram->meta = meta;
> > > > > >  	zram->comp = comp;
> > > > > >  	zram->disksize = disksize;
> > > > > > @@ -799,11 +801,10 @@ static ssize_t disksize_store(struct device *dev,
> > > > > >  
> > > > > >  	return len;
> > > > > >  
> > > > > > -out_destroy_comp:
> > > > > > -	up_write(&zram->init_lock);
> > > > > > -	zcomp_destroy(comp);
> > > > > >  out_free_meta:
> > > > > >  	zram_meta_free(meta);
> > > > > > +out_unlock:
> > > > > > +	up_write(&zram->init_lock);
> > > > > >  	return err;
> > > > > >  }
> > > > > >  
> > > > > 
> > > > > The init_lock is really troublesome. We can't do call zram_meta_alloc
> > > > > under init_lock due to lockdep report. Please keep in mind.
> > > > >
> > > > 
> > > > ah... I do recall it, thanks for your reminder.
> > > > 
> > > > 
> > > > > The zram_rw_page is one of the function under reclaim path and hold it
> > > > > as read_lock while here holds it as write_lock.
> > > > > It's a false positive so that we might could make shut lockdep up
> > > > > by annotation but I don't want it but want to work with lockdep rather
> > > > > than disable. As well, there are other pathes to use init_lock to
> > > > > protect other data where would be victims of lockdep.
> > > > > 
> > > > > I didn't tell the motivation of this patch because it made you busy
> > > > > guys wasted. Let me tell it now. It was another lockdep report by
> > > > > kmem_cache_destroy for zsmalloc compaction about init_lock. That's why
> > > > > the patchset was one of the patch in compaction.
> > > > >
> > > > > Yes, the ideal is to remove horrible init_lock of zram in this phase and
> > > > > make code more simple and clear but I don't want to stuck zsmalloc
> > > > > compaction by the work.
> > > > 
> > > > 
> > > > > Having said that, I feel it's time to revisit
> > > > > to remove init_lock.
> > > > > At least, I will think over to find a solution to kill init_lock.
> > > > 
> > > > hm, can't think of anything quick...
> > > > 
> > > > 	-ss
> > > 
> > > Hello guys,
> > > 
> > > How about this?
> > > 
> > > It's based on Ganesh's patch.
> > > https://lkml.org/lkml/2015/1/24/50
> > (I see no similarities with Ganesh's patch)
> > 
> > hm, you probably meant this one https://lkml.org/lkml/2015/1/23/406
> > 
> > 
> > at glance this makes things a bit more complicated, so I need to think more.
> > 
> > > From afda9fd2f6c40dd0745d8a6babe78c5cbdceddf5 Mon Sep 17 00:00:00 2001
> > > From: Minchan Kim <minchan@kernel.org>
> > > Date: Mon, 26 Jan 2015 14:34:10 +0900
> > > Subject: [RFC] zram: remove init_lock in zram_make_request
> > > 
> > > Admin could reset zram during I/O operation going on so we have
> > > used zram->init_lock as read-side lock in I/O path to prevent
> > > sudden zram meta freeing.
> > > 
> > > However, the init_lock is really troublesome.
> > > We can't do call zram_meta_alloc under init_lock due to lockdep splat
> > > because zram_rw_page is one of the function under reclaim path and
> > > hold it as read_lock while other places in process context hold it
> > > as write_lock. So, we have used allocation out of the lock to avoid
> > > lockdep warn but it's not good for readability and fainally, I met
> > > another lockdep splat between init_lock and cpu_hotpulug from
> > > kmem_cache_destroy during wokring zsmalloc compaction. :(
> > > 
> > > Yes, the ideal is to remove horrible init_lock of zram in rw path.
> > > This patch removes it in rw path and instead, put init_done bool
> > > variable to check initialization done with smp_[wmb|rmb] and
> > > srcu_[un]read_lock to prevent sudden zram meta freeing
> > > during I/O operation.
> > > 
> > > Signed-off-by: Minchan Kim <minchan@kernel.org>
> > > ---
> > >  drivers/block/zram/zram_drv.c | 76 +++++++++++++++++++++++++++++--------------
> > >  drivers/block/zram/zram_drv.h |  5 +++
> > >  2 files changed, 57 insertions(+), 24 deletions(-)
> > > 
> > > diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
> > > index a598ada817f0..e06ff975f997 100644
> > > --- a/drivers/block/zram/zram_drv.c
> > > +++ b/drivers/block/zram/zram_drv.c
> > > @@ -32,6 +32,7 @@
> > >  #include <linux/string.h>
> > >  #include <linux/vmalloc.h>
> > >  #include <linux/err.h>
> > > +#include <linux/srcu.h>
> > >  
> > >  #include "zram_drv.h"
> > >  
> > > @@ -53,9 +54,16 @@ static ssize_t name##_show(struct device *d,		\
> > >  }									\
> > >  static DEVICE_ATTR_RO(name);
> > >  
> > > -static inline int init_done(struct zram *zram)
> > > +static inline bool init_done(struct zram *zram)
> > >  {
> > > -	return zram->meta != NULL;
> > > +	/*
> > > +	 * init_done can be used without holding zram->init_lock in
> > > +	 * read/write handler(ie, zram_make_request) but we should make sure
> > > +	 * that zram->init_done should set up after meta initialization is
> > > +	 * done. Look at disksize_store.
> > > +	 */
> > > +	smp_rmb();
> > > +	return zram->init_done;
> > 
> > ->init_done returns back :)
> 
> 
> > can we rely on write ->meta; wmb; --- rmb; read ->meta?
> 
> Might be possible.
> 
> > 
> > how much performance do we lose on barriers?
> 
> I think it's not too much than locking which does more than(ie,
> barrier, fairness, spin on owner and so on) such simple barrier.
> 
> > 
> > >  }
> > >  
> > >  static inline struct zram *dev_to_zram(struct device *dev)
> > > @@ -326,6 +334,10 @@ static void zram_meta_free(struct zram_meta *meta)
> > >  	kfree(meta);
> > >  }
> > >  
> > > +static void rcu_zram_do_nothing(struct rcu_head *unused)
> > > +{
> > > +}
> > > +
> > >  static struct zram_meta *zram_meta_alloc(int device_id, u64 disksize)
> > >  {
> > >  	char pool_name[8];
> > > @@ -726,11 +738,8 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity)
> > >  		return;
> > >  	}
> > >  
> > > -	zcomp_destroy(zram->comp);
> > >  	zram->max_comp_streams = 1;
> > >  
> > > -	zram_meta_free(zram->meta);
> > > -	zram->meta = NULL;
> > >  	/* Reset stats */
> > >  	memset(&zram->stats, 0, sizeof(zram->stats));
> > >  
> > > @@ -738,8 +747,12 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity)
> > >  	if (reset_capacity)
> > >  		set_capacity(zram->disk, 0);
> > >  
> > > +	zram->init_done = false;
> > 
> > missing wmb?
> 
> I thouht about it but when I read comment from call_srcu as follows
> "each cpu is guaranteed to have executed a full memory barrier",
> I decided we don't need it. Right? (ie, double check)
> 

hm, need to think about it.

> > 
> > I think we also better put comments after every wmb/rmb. like
> > 
> > 	smp_wmb(); /* pairs with rmb() in foo() */
> 
> I already put the comment in other smp_rmb/wmb.
> If it's not what you want, please suggest me. :)
> 

they are fine. it was a minor nitpick.
I just read on the list that people want to explicitly show which wmb
pairs with which rmb. but we have only two of them, so it's not
a big deal.

> > 
> > 
> > > +	call_srcu(&zram->srcu, &zram->rcu, rcu_zram_do_nothing);
> > > +	synchronize_srcu(&zram->srcu);
> > > +	zram_meta_free(zram->meta);
> > > +	zcomp_destroy(zram->comp);
> > >  	up_write(&zram->init_lock);
> > > -
> > >  	/*
> > >  	 * Revalidate disk out of the init_lock to avoid lockdep splat.
> > >  	 * It's okay because disk's capacity is protected by init_lock
> > > @@ -762,10 +775,19 @@ static ssize_t disksize_store(struct device *dev,
> > >  	if (!disksize)
> > >  		return -EINVAL;
> > >  
> > > +	down_write(&zram->init_lock);
> > > +	if (init_done(zram)) {
> > > +		pr_info("Cannot change disksize for initialized device\n");
> > > +		up_write(&zram->init_lock);
> > > +		return -EBUSY;
> > > +	}
> > > +
> > >  	disksize = PAGE_ALIGN(disksize);
> > >  	meta = zram_meta_alloc(zram->disk->first_minor, disksize);
> > > -	if (!meta)
> > > +	if (!meta) {
> > > +		up_write(&zram->init_lock);
> > >  		return -ENOMEM;
> > > +	}
> > >  
> > >  	comp = zcomp_create(zram->compressor, zram->max_comp_streams);
> > >  	if (IS_ERR(comp)) {
> > > @@ -775,17 +797,17 @@ static ssize_t disksize_store(struct device *dev,
> > >  		goto out_free_meta;
> > >  	}
> > >  
> > > -	down_write(&zram->init_lock);
> > > -	if (init_done(zram)) {
> > > -		pr_info("Cannot change disksize for initialized device\n");
> > > -		err = -EBUSY;
> > > -		goto out_destroy_comp;
> > > -	}
> > > -
> > >  	zram->meta = meta;
> > >  	zram->comp = comp;
> > >  	zram->disksize = disksize;
> > >  	set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT);
> > > +	/*
> > > +	 * Store operation of struct zram fields should complete
> > > +	 * before init_done set up because zram_bvec_rw doesn't
> > > +	 * hold an zram->init_lock.
> > > +	 */
> > > +	smp_wmb();
> > > +	zram->init_done = true;
> > >  	up_write(&zram->init_lock);
> > >  
> > >  	/*
> > > @@ -797,10 +819,8 @@ static ssize_t disksize_store(struct device *dev,
> > >  
> > >  	return len;
> > >  
> > > -out_destroy_comp:
> > > -	up_write(&zram->init_lock);
> > > -	zcomp_destroy(comp);
> > >  out_free_meta:
> > > +	up_write(&zram->init_lock);
> > >  	zram_meta_free(meta);
> > 
> >  zram_meta_free(meta);
> >  up_write(&zram->init_lock);
> > 
> >  ?
> 
> I don't think we should release meta under init_lock.
> Do you have any reason I am missing?
> 

well, just theoretical.
forbid concurrent initialization until we completely rollback.

             CPU0                                     CPU1

echo 30G > /.../zram0/disksize
meta = vmalloc(pages for 30G)

out_free_meta:                              echo 30G > /.../zram0/disksize
	up_write(&zram->init_lock);         meta = vmalloc(pages for 30G)
	zram_meta_free(meta);               ^^^^ 30G + 30G
                                            out_free_meta:
                                                   ....
	-ss

> > 
> > >  	return err;
> > >  }
> > > @@ -905,9 +925,10 @@ out:
> > >   */
> > >  static void zram_make_request(struct request_queue *queue, struct bio *bio)
> > >  {
> > > +	int idx;
> > >  	struct zram *zram = queue->queuedata;
> > >  
> > > -	down_read(&zram->init_lock);
> > > +	idx = srcu_read_lock(&zram->srcu);
> > >  	if (unlikely(!init_done(zram)))
> > >  		goto error;
> > >  
> > > @@ -918,12 +939,12 @@ static void zram_make_request(struct request_queue *queue, struct bio *bio)
> > >  	}
> > >  
> > >  	__zram_make_request(zram, bio);
> > > -	up_read(&zram->init_lock);
> > > +	srcu_read_unlock(&zram->srcu, idx);
> > >  
> > >  	return;
> > >  
> > >  error:
> > > -	up_read(&zram->init_lock);
> > > +	srcu_read_unlock(&zram->srcu, idx);
> > >  	bio_io_error(bio);
> > >  }
> > >  
> > > @@ -945,18 +966,20 @@ static void zram_slot_free_notify(struct block_device *bdev,
> > >  static int zram_rw_page(struct block_device *bdev, sector_t sector,
> > >  		       struct page *page, int rw)
> > >  {
> > > -	int offset, err;
> > > +	int offset, err, idx;
> > >  	u32 index;
> > >  	struct zram *zram;
> > >  	struct bio_vec bv;
> > >  
> > >  	zram = bdev->bd_disk->private_data;
> > > +	idx = srcu_read_lock(&zram->srcu);
> > > +
> > >  	if (!valid_io_request(zram, sector, PAGE_SIZE)) {
> > >  		atomic64_inc(&zram->stats.invalid_io);
> > > +		srcu_read_unlock(&zram->srcu, idx);
> > >  		return -EINVAL;
> > >  	}
> > >  
> > > -	down_read(&zram->init_lock);
> > >  	if (unlikely(!init_done(zram))) {
> > >  		err = -EIO;
> > >  		goto out_unlock;
> > > @@ -971,7 +994,7 @@ static int zram_rw_page(struct block_device *bdev, sector_t sector,
> > >  
> > >  	err = zram_bvec_rw(zram, &bv, index, offset, rw);
> > >  out_unlock:
> > > -	up_read(&zram->init_lock);
> > > +	srcu_read_unlock(&zram->srcu, idx);
> > >  	/*
> > >  	 * If I/O fails, just return error(ie, non-zero) without
> > >  	 * calling page_endio.
> > > @@ -1041,6 +1064,11 @@ static int create_device(struct zram *zram, int device_id)
> > >  
> > >  	init_rwsem(&zram->init_lock);
> > >  
> > > +	if (init_srcu_struct(&zram->srcu)) {
> > > +		pr_err("Error initialize srcu for device %d\n", device_id);
> > > +		goto out;
> > > +	}
> > > +
> > >  	zram->queue = blk_alloc_queue(GFP_KERNEL);
> > >  	if (!zram->queue) {
> > >  		pr_err("Error allocating disk queue for device %d\n",
> > > @@ -1125,8 +1153,8 @@ static void destroy_device(struct zram *zram)
> > >  
> > >  	del_gendisk(zram->disk);
> > >  	put_disk(zram->disk);
> > > -
> > >  	blk_cleanup_queue(zram->queue);
> > > +	cleanup_srcu_struct(&zram->srcu);
> > >  }
> > >  
> > >  static int __init zram_init(void)
> > > diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
> > > index e492f6bf11f1..2042c310aea8 100644
> > > --- a/drivers/block/zram/zram_drv.h
> > > +++ b/drivers/block/zram/zram_drv.h
> > > @@ -105,8 +105,13 @@ struct zram {
> > >  	struct gendisk *disk;
> > >  	struct zcomp *comp;
> > >  
> > > +	struct srcu_struct srcu;
> > > +	struct rcu_head rcu;
> > > +
> > >  	/* Prevent concurrent execution of device init, reset and R/W request */
> > >  	struct rw_semaphore init_lock;
> > > +	bool init_done;
> > > +
> > >  	/*
> > >  	 * This is the limit on amount of *uncompressed* worth of data
> > >  	 * we can store in a disk.
> > > -- 
> > > 1.9.1
> > > 
> 
> -- 
> Kind regards,
> Minchan Kim
> 

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/2] zram: free meta out of init_lock
  2015-01-27  4:03                 ` Sergey Senozhatsky
@ 2015-01-28  0:15                   ` Minchan Kim
  2015-01-28  0:22                     ` Minchan Kim
  2015-01-28  0:24                     ` Sergey Senozhatsky
  0 siblings, 2 replies; 32+ messages in thread
From: Minchan Kim @ 2015-01-28  0:15 UTC (permalink / raw)
  To: Sergey Senozhatsky
  Cc: Sergey Senozhatsky, Jerome Marchand, Andrew Morton, linux-kernel,
	linux-mm, Nitin Gupta

Hello Sergey,

On Tue, Jan 27, 2015 at 01:03:05PM +0900, Sergey Senozhatsky wrote:
> Hello,
> 
> On (01/27/15 12:18), Minchan Kim wrote:
> > Hello Sergey,
> > 
> > On Tue, Jan 27, 2015 at 11:17:04AM +0900, Sergey Senozhatsky wrote:
> > > On (01/27/15 01:00), Minchan Kim wrote:
> > > > On Mon, Jan 26, 2015 at 11:17:09PM +0900, Sergey Senozhatsky wrote:
> > > > > Hello,
> > > > > 
> > > > > On (01/26/15 10:33), Minchan Kim wrote:
> > > > > > Hello,
> > > > > > 
> > > > > > On Sat, Jan 24, 2015 at 12:47:07AM +0900, Sergey Senozhatsky wrote:
> > > > > > > On (01/23/15 15:48), Jerome Marchand wrote:
> > > > > > > > On 01/23/2015 03:24 PM, Sergey Senozhatsky wrote:
> > > > > > > > > On (01/23/15 14:58), Minchan Kim wrote:
> > > > > > > > >> We don't need to call zram_meta_free, zcomp_destroy and zs_free
> > > > > > > > >> under init_lock. What we need to prevent race with init_lock
> > > > > > > > >> in reset is setting NULL into zram->meta (ie, init_done).
> > > > > > > > >> This patch does it.
> > > > > > > > >>
> > > > > > > > >> Signed-off-by: Minchan Kim <minchan@kernel.org>
> > > > > > > > >> ---
> > > > > > > > >>  drivers/block/zram/zram_drv.c | 28 ++++++++++++++++------------
> > > > > > > > >>  1 file changed, 16 insertions(+), 12 deletions(-)
> > > > > > > > >>
> > > > > > > > >> diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
> > > > > > > > >> index 9250b3f54a8f..0299d82275e7 100644
> > > > > > > > >> --- a/drivers/block/zram/zram_drv.c
> > > > > > > > >> +++ b/drivers/block/zram/zram_drv.c
> > > > > > > > >> @@ -708,6 +708,7 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity)
> > > > > > > > >>  {
> > > > > > > > >>  	size_t index;
> > > > > > > > >>  	struct zram_meta *meta;
> > > > > > > > >> +	struct zcomp *comp;
> > > > > > > > >>  
> > > > > > > > >>  	down_write(&zram->init_lock);
> > > > > > > > >>  
> > > > > > > > >> @@ -719,20 +720,10 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity)
> > > > > > > > >>  	}
> > > > > > > > >>  
> > > > > > > > >>  	meta = zram->meta;
> > > > > > > > >> -	/* Free all pages that are still in this zram device */
> > > > > > > > >> -	for (index = 0; index < zram->disksize >> PAGE_SHIFT; index++) {
> > > > > > > > >> -		unsigned long handle = meta->table[index].handle;
> > > > > > > > >> -		if (!handle)
> > > > > > > > >> -			continue;
> > > > > > > > >> -
> > > > > > > > >> -		zs_free(meta->mem_pool, handle);
> > > > > > > > >> -	}
> > > > > > > > >> -
> > > > > > > > >> -	zcomp_destroy(zram->comp);
> > > > > > > > > 
> > > > > > > > > I'm not so sure about moving zcomp destruction. if we would have detached it
> > > > > > > > > from zram, then yes. otherwise, think of zram ->destoy vs ->init race.
> > > > > > > > > 
> > > > > > > > > suppose,
> > > > > > > > > CPU1 waits for down_write() init lock in disksize_store() with new comp already allocated;
> > > > > > > > > CPU0 detaches ->meta and releases write init lock;
> > > > > > > > > CPU1 grabs the lock and does zram->comp = comp;
> > > > > > > > > CPU0 reaches the point of zcomp_destroy(zram->comp);
> > > > > > > > 
> > > > > > > > I don't see your point: this patch does not call
> > > > > > > > zcomp_destroy(zram->comp) anymore, but zram_destroy(comp), where comp is
> > > > > > > > the old zram->comp.
> > > > > > > 
> > > > > > > 
> > > > > > > oh... yes. sorry! my bad.
> > > > > > > 
> > > > > > > 
> > > > > > > 
> > > > > > > anyway, on a second thought, do we even want to destoy meta out of init_lock?
> > > > > > > 
> > > > > > > I mean, it will let you init new device quicker. but... assume, you have
> > > > > > > 30G zram (or any other bad-enough number). on CPU0 you reset device -- iterate
> > > > > > > over 30G meta->table, etc. out of init_lock.
> > > > > > > on CPU1 you concurrently re-init device and request again 30G.
> > > > > > > 
> > > > > > > how bad that can be?
> > > > > > > 
> > > > > > > 
> > > > > > > 
> > > > > > > diskstore called on already initialised device is also not so perfect.
> > > > > > > we first will try to allocate ->meta (vmalloc pages for another 30G),
> > > > > > > then allocate comp, then down_write() init lock to find out that device
> > > > > > > is initialised and we need to release allocated memory.
> > > > > > > 
> > > > > > > 
> > > > > > > 
> > > > > > > may be we better keep ->meta destruction under init_lock and additionally
> > > > > > > move ->meta and ->comp allocation under init_lock in disksize_store()?
> > > > > > > 
> > > > > > > like the following one:
> > > > > > > 
> > > > > > > ---
> > > > > > > 
> > > > > > >  drivers/block/zram/zram_drv.c | 25 +++++++++++++------------
> > > > > > >  1 file changed, 13 insertions(+), 12 deletions(-)
> > > > > > > 
> > > > > > > diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
> > > > > > > index 9250b3f..827ab21 100644
> > > > > > > --- a/drivers/block/zram/zram_drv.c
> > > > > > > +++ b/drivers/block/zram/zram_drv.c
> > > > > > > @@ -765,9 +765,18 @@ static ssize_t disksize_store(struct device *dev,
> > > > > > >  		return -EINVAL;
> > > > > > >  
> > > > > > >  	disksize = PAGE_ALIGN(disksize);
> > > > > > > +	down_write(&zram->init_lock);
> > > > > > > +	if (init_done(zram)) {
> > > > > > > +		up_write(&zram->init_lock);
> > > > > > > +		pr_info("Cannot change disksize for initialized device\n");
> > > > > > > +		return -EBUSY;
> > > > > > > +	}
> > > > > > > +
> > > > > > >  	meta = zram_meta_alloc(zram->disk->first_minor, disksize);
> > > > > > > -	if (!meta)
> > > > > > > -		return -ENOMEM;
> > > > > > > +	if (!meta) {
> > > > > > > +		err = -ENOMEM;
> > > > > > > +		goto out_unlock;
> > > > > > > +	}
> > > > > > >  
> > > > > > >  	comp = zcomp_create(zram->compressor, zram->max_comp_streams);
> > > > > > >  	if (IS_ERR(comp)) {
> > > > > > > @@ -777,13 +786,6 @@ static ssize_t disksize_store(struct device *dev,
> > > > > > >  		goto out_free_meta;
> > > > > > >  	}
> > > > > > >  
> > > > > > > -	down_write(&zram->init_lock);
> > > > > > > -	if (init_done(zram)) {
> > > > > > > -		pr_info("Cannot change disksize for initialized device\n");
> > > > > > > -		err = -EBUSY;
> > > > > > > -		goto out_destroy_comp;
> > > > > > > -	}
> > > > > > > -
> > > > > > >  	zram->meta = meta;
> > > > > > >  	zram->comp = comp;
> > > > > > >  	zram->disksize = disksize;
> > > > > > > @@ -799,11 +801,10 @@ static ssize_t disksize_store(struct device *dev,
> > > > > > >  
> > > > > > >  	return len;
> > > > > > >  
> > > > > > > -out_destroy_comp:
> > > > > > > -	up_write(&zram->init_lock);
> > > > > > > -	zcomp_destroy(comp);
> > > > > > >  out_free_meta:
> > > > > > >  	zram_meta_free(meta);
> > > > > > > +out_unlock:
> > > > > > > +	up_write(&zram->init_lock);
> > > > > > >  	return err;
> > > > > > >  }
> > > > > > >  
> > > > > > 
> > > > > > The init_lock is really troublesome. We can't do call zram_meta_alloc
> > > > > > under init_lock due to lockdep report. Please keep in mind.
> > > > > >
> > > > > 
> > > > > ah... I do recall it, thanks for your reminder.
> > > > > 
> > > > > 
> > > > > > The zram_rw_page is one of the function under reclaim path and hold it
> > > > > > as read_lock while here holds it as write_lock.
> > > > > > > It's a false positive, so we could shut lockdep up with an
> > > > > > > annotation, but I don't want that; I would rather work with lockdep
> > > > > > > than disable it. Also, there are other paths that use init_lock to
> > > > > > > protect other data, and they would be victims of lockdep too.
> > > > > > 
> > > > > > I didn't tell the motivation of this patch because it made you busy
> > > > > > guys wasted. Let me tell it now. It was another lockdep report by
> > > > > > kmem_cache_destroy for zsmalloc compaction about init_lock. That's why
> > > > > > the patchset was one of the patch in compaction.
> > > > > >
> > > > > > Yes, the ideal is to remove horrible init_lock of zram in this phase and
> > > > > > make code more simple and clear but I don't want to stuck zsmalloc
> > > > > > compaction by the work.
> > > > > 
> > > > > 
> > > > > > Having said that, I feel it's time to revisit
> > > > > > to remove init_lock.
> > > > > > At least, I will think over to find a solution to kill init_lock.
> > > > > 
> > > > > hm, can't think of anything quick...
> > > > > 
> > > > > 	-ss
> > > > 
> > > > Hello guys,
> > > > 
> > > > How about this?
> > > > 
> > > > It's based on Ganesh's patch.
> > > > https://lkml.org/lkml/2015/1/24/50
> > > (I see no similarities with Ganesh's patch)
> > > 
> > > hm, you probably meant this one https://lkml.org/lkml/2015/1/23/406
> > > 
> > > 
> > > at glance this makes things a bit more complicated, so I need to think more.
> > > 
> > > > From afda9fd2f6c40dd0745d8a6babe78c5cbdceddf5 Mon Sep 17 00:00:00 2001
> > > > From: Minchan Kim <minchan@kernel.org>
> > > > Date: Mon, 26 Jan 2015 14:34:10 +0900
> > > > Subject: [RFC] zram: remove init_lock in zram_make_request
> > > > 
> > > > Admin could reset zram during I/O operation going on so we have
> > > > used zram->init_lock as read-side lock in I/O path to prevent
> > > > sudden zram meta freeing.
> > > > 
> > > > However, the init_lock is really troublesome.
> > > > We can't do call zram_meta_alloc under init_lock due to lockdep splat
> > > > because zram_rw_page is one of the function under reclaim path and
> > > > hold it as read_lock while other places in process context hold it
> > > > as write_lock. So, we have used allocation out of the lock to avoid
> > > > > lockdep warn but it's not good for readability and finally, I met
> > > > > another lockdep splat between init_lock and cpu_hotplug from
> > > > > kmem_cache_destroy while working on zsmalloc compaction. :(
> > > > 
> > > > Yes, the ideal is to remove horrible init_lock of zram in rw path.
> > > > This patch removes it in rw path and instead, put init_done bool
> > > > variable to check initialization done with smp_[wmb|rmb] and
> > > > srcu_[un]read_lock to prevent sudden zram meta freeing
> > > > during I/O operation.
> > > > 
> > > > Signed-off-by: Minchan Kim <minchan@kernel.org>
> > > > ---
> > > >  drivers/block/zram/zram_drv.c | 76 +++++++++++++++++++++++++++++--------------
> > > >  drivers/block/zram/zram_drv.h |  5 +++
> > > >  2 files changed, 57 insertions(+), 24 deletions(-)
> > > > 
> > > > diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
> > > > index a598ada817f0..e06ff975f997 100644
> > > > --- a/drivers/block/zram/zram_drv.c
> > > > +++ b/drivers/block/zram/zram_drv.c
> > > > @@ -32,6 +32,7 @@
> > > >  #include <linux/string.h>
> > > >  #include <linux/vmalloc.h>
> > > >  #include <linux/err.h>
> > > > +#include <linux/srcu.h>
> > > >  
> > > >  #include "zram_drv.h"
> > > >  
> > > > @@ -53,9 +54,16 @@ static ssize_t name##_show(struct device *d,		\
> > > >  }									\
> > > >  static DEVICE_ATTR_RO(name);
> > > >  
> > > > -static inline int init_done(struct zram *zram)
> > > > +static inline bool init_done(struct zram *zram)
> > > >  {
> > > > -	return zram->meta != NULL;
> > > > +	/*
> > > > +	 * init_done can be used without holding zram->init_lock in
> > > > +	 * read/write handler(ie, zram_make_request) but we should make sure
> > > > +	 * that zram->init_done should set up after meta initialization is
> > > > +	 * done. Look at disksize_store.
> > > > +	 */
> > > > +	smp_rmb();
> > > > +	return zram->init_done;
> > > 
> > > ->init_done returns back :)
> > 
> > 
> > > can we rely on write ->meta; wmb; --- rmb; read ->meta?
> > 
> > Might be possible.

Now that I think about it, it's impossible with zram->meta because
we need to nullify it before call_srcu but pre-existing SRCU read-side
critical sections can access zram->meta.
Anyway, introducing a new variable should not be a party-pooper.

> > 
> > > 
> > > how much performance do we lose on barriers?
> > 
> > I think it's not too much than locking which does more than(ie,
> > barrier, fairness, spin on owner and so on) such simple barrier.
> > 
> > > 
> > > >  }
> > > >  
> > > >  static inline struct zram *dev_to_zram(struct device *dev)
> > > > @@ -326,6 +334,10 @@ static void zram_meta_free(struct zram_meta *meta)
> > > >  	kfree(meta);
> > > >  }
> > > >  
> > > > +static void rcu_zram_do_nothing(struct rcu_head *unused)
> > > > +{
> > > > +}
> > > > +
> > > >  static struct zram_meta *zram_meta_alloc(int device_id, u64 disksize)
> > > >  {
> > > >  	char pool_name[8];
> > > > @@ -726,11 +738,8 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity)
> > > >  		return;
> > > >  	}
> > > >  
> > > > -	zcomp_destroy(zram->comp);
> > > >  	zram->max_comp_streams = 1;
> > > >  
> > > > -	zram_meta_free(zram->meta);
> > > > -	zram->meta = NULL;
> > > >  	/* Reset stats */
> > > >  	memset(&zram->stats, 0, sizeof(zram->stats));
> > > >  
> > > > @@ -738,8 +747,12 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity)
> > > >  	if (reset_capacity)
> > > >  		set_capacity(zram->disk, 0);
> > > >  
> > > > +	zram->init_done = false;
> > > 
> > > missing wmb?
> > 
> > I thought about it but when I read the comment from call_srcu as follows
> > "each cpu is guaranteed to have executed a full memory barrier",
> > I decided we don't need it. Right? (ie, double check)
> > 
> 
> hm, need to think about it.

Another idea is to use kick_all_cpus_sync, not srcu.
With that, we don't need to add more instructions to the rw path.
I will try it.

> 
> > > 
> > > I think we also better put comments after every wmb/rmb. like
> > > 
> > > 	smp_wmb(); /* pairs with rmb() in foo() */
> > 
> > I already put the comment in other smp_rmb/wmb.
> > If it's not what you want, please suggest me. :)
> > 
> 
> they are fine. it was a minor nitpick.
> I just read in the list that guys want to explicitly show which wmb
> corresponds to which rmb. but we have only two of them, so it's not
> a big deal.
> 
> > > 
> > > 
> > > > +	call_srcu(&zram->srcu, &zram->rcu, rcu_zram_do_nothing);
> > > > +	synchronize_srcu(&zram->srcu);
> > > > +	zram_meta_free(zram->meta);
> > > > +	zcomp_destroy(zram->comp);
> > > >  	up_write(&zram->init_lock);
> > > > -
> > > >  	/*
> > > >  	 * Revalidate disk out of the init_lock to avoid lockdep splat.
> > > >  	 * It's okay because disk's capacity is protected by init_lock
> > > > @@ -762,10 +775,19 @@ static ssize_t disksize_store(struct device *dev,
> > > >  	if (!disksize)
> > > >  		return -EINVAL;
> > > >  
> > > > +	down_write(&zram->init_lock);
> > > > +	if (init_done(zram)) {
> > > > +		pr_info("Cannot change disksize for initialized device\n");
> > > > +		up_write(&zram->init_lock);
> > > > +		return -EBUSY;
> > > > +	}
> > > > +
> > > >  	disksize = PAGE_ALIGN(disksize);
> > > >  	meta = zram_meta_alloc(zram->disk->first_minor, disksize);
> > > > -	if (!meta)
> > > > +	if (!meta) {
> > > > +		up_write(&zram->init_lock);
> > > >  		return -ENOMEM;
> > > > +	}
> > > >  
> > > >  	comp = zcomp_create(zram->compressor, zram->max_comp_streams);
> > > >  	if (IS_ERR(comp)) {
> > > > @@ -775,17 +797,17 @@ static ssize_t disksize_store(struct device *dev,
> > > >  		goto out_free_meta;
> > > >  	}
> > > >  
> > > > -	down_write(&zram->init_lock);
> > > > -	if (init_done(zram)) {
> > > > -		pr_info("Cannot change disksize for initialized device\n");
> > > > -		err = -EBUSY;
> > > > -		goto out_destroy_comp;
> > > > -	}
> > > > -
> > > >  	zram->meta = meta;
> > > >  	zram->comp = comp;
> > > >  	zram->disksize = disksize;
> > > >  	set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT);
> > > > +	/*
> > > > +	 * Store operation of struct zram fields should complete
> > > > +	 * before init_done set up because zram_bvec_rw doesn't
> > > > +	 * hold an zram->init_lock.
> > > > +	 */
> > > > +	smp_wmb();
> > > > +	zram->init_done = true;
> > > >  	up_write(&zram->init_lock);
> > > >  
> > > >  	/*
> > > > @@ -797,10 +819,8 @@ static ssize_t disksize_store(struct device *dev,
> > > >  
> > > >  	return len;
> > > >  
> > > > -out_destroy_comp:
> > > > -	up_write(&zram->init_lock);
> > > > -	zcomp_destroy(comp);
> > > >  out_free_meta:
> > > > +	up_write(&zram->init_lock);
> > > >  	zram_meta_free(meta);
> > > 
> > >  zram_meta_free(meta);
> > >  up_write(&zram->init_lock);
> > > 
> > >  ?
> > 
> > I don't think we should release meta under init_lock.
> > Do you have any reason I am missing?
> > 
> 
> well, just theoretical.
> forbid concurrent initialization until we completely rollback.
> 
>              CPU0                                     CPU1
> 
> echo 30G > /.../zram0/disksize
> meta = vmalloc(pages for 30G)
> 
> out_free_meta:                              echo 30G > /.../zram0/disksize
> 	up_write(&zram->init_lock);         meta = vmalloc(pages for 30G)
> 	zram_meta_free(meta);               ^^^^ 30G + 30G
>                                             out_free_meta:
>                                                    ....
> 	-ss

It might but as it is, we have allocated meta out of the lock.
If it turns out to be a real problem, it's easy to fix it by this work
(ie, we could alloc/free meta under init_lock).
IOW, it should be another patch so I don't want to take care of it
in this work.

> 
> > > 
> > > >  	return err;
> > > >  }
> > > > @@ -905,9 +925,10 @@ out:
> > > >   */
> > > >  static void zram_make_request(struct request_queue *queue, struct bio *bio)
> > > >  {
> > > > +	int idx;
> > > >  	struct zram *zram = queue->queuedata;
> > > >  
> > > > -	down_read(&zram->init_lock);
> > > > +	idx = srcu_read_lock(&zram->srcu);
> > > >  	if (unlikely(!init_done(zram)))
> > > >  		goto error;
> > > >  
> > > > @@ -918,12 +939,12 @@ static void zram_make_request(struct request_queue *queue, struct bio *bio)
> > > >  	}
> > > >  
> > > >  	__zram_make_request(zram, bio);
> > > > -	up_read(&zram->init_lock);
> > > > +	srcu_read_unlock(&zram->srcu, idx);
> > > >  
> > > >  	return;
> > > >  
> > > >  error:
> > > > -	up_read(&zram->init_lock);
> > > > +	srcu_read_unlock(&zram->srcu, idx);
> > > >  	bio_io_error(bio);
> > > >  }
> > > >  
> > > > @@ -945,18 +966,20 @@ static void zram_slot_free_notify(struct block_device *bdev,
> > > >  static int zram_rw_page(struct block_device *bdev, sector_t sector,
> > > >  		       struct page *page, int rw)
> > > >  {
> > > > -	int offset, err;
> > > > +	int offset, err, idx;
> > > >  	u32 index;
> > > >  	struct zram *zram;
> > > >  	struct bio_vec bv;
> > > >  
> > > >  	zram = bdev->bd_disk->private_data;
> > > > +	idx = srcu_read_lock(&zram->srcu);
> > > > +
> > > >  	if (!valid_io_request(zram, sector, PAGE_SIZE)) {
> > > >  		atomic64_inc(&zram->stats.invalid_io);
> > > > +		srcu_read_unlock(&zram->srcu, idx);
> > > >  		return -EINVAL;
> > > >  	}
> > > >  
> > > > -	down_read(&zram->init_lock);
> > > >  	if (unlikely(!init_done(zram))) {
> > > >  		err = -EIO;
> > > >  		goto out_unlock;
> > > > @@ -971,7 +994,7 @@ static int zram_rw_page(struct block_device *bdev, sector_t sector,
> > > >  
> > > >  	err = zram_bvec_rw(zram, &bv, index, offset, rw);
> > > >  out_unlock:
> > > > -	up_read(&zram->init_lock);
> > > > +	srcu_read_unlock(&zram->srcu, idx);
> > > >  	/*
> > > >  	 * If I/O fails, just return error(ie, non-zero) without
> > > >  	 * calling page_endio.
> > > > @@ -1041,6 +1064,11 @@ static int create_device(struct zram *zram, int device_id)
> > > >  
> > > >  	init_rwsem(&zram->init_lock);
> > > >  
> > > > +	if (init_srcu_struct(&zram->srcu)) {
> > > > +		pr_err("Error initialize srcu for device %d\n", device_id);
> > > > +		goto out;
> > > > +	}
> > > > +
> > > >  	zram->queue = blk_alloc_queue(GFP_KERNEL);
> > > >  	if (!zram->queue) {
> > > >  		pr_err("Error allocating disk queue for device %d\n",
> > > > @@ -1125,8 +1153,8 @@ static void destroy_device(struct zram *zram)
> > > >  
> > > >  	del_gendisk(zram->disk);
> > > >  	put_disk(zram->disk);
> > > > -
> > > >  	blk_cleanup_queue(zram->queue);
> > > > +	cleanup_srcu_struct(&zram->srcu);
> > > >  }
> > > >  
> > > >  static int __init zram_init(void)
> > > > diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
> > > > index e492f6bf11f1..2042c310aea8 100644
> > > > --- a/drivers/block/zram/zram_drv.h
> > > > +++ b/drivers/block/zram/zram_drv.h
> > > > @@ -105,8 +105,13 @@ struct zram {
> > > >  	struct gendisk *disk;
> > > >  	struct zcomp *comp;
> > > >  
> > > > +	struct srcu_struct srcu;
> > > > +	struct rcu_head rcu;
> > > > +
> > > >  	/* Prevent concurrent execution of device init, reset and R/W request */
> > > >  	struct rw_semaphore init_lock;
> > > > +	bool init_done;
> > > > +
> > > >  	/*
> > > >  	 * This is the limit on amount of *uncompressed* worth of data
> > > >  	 * we can store in a disk.
> > > > -- 
> > > > 1.9.1
> > > > 
> > 
> > -- 
> > Kind regards,
> > Minchan Kim
> > 

-- 
Kind regards,
Minchan Kim

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/2] zram: free meta out of init_lock
  2015-01-28  0:15                   ` Minchan Kim
@ 2015-01-28  0:22                     ` Minchan Kim
  2015-01-28  2:07                       ` Sergey Senozhatsky
  2015-01-28  0:24                     ` Sergey Senozhatsky
  1 sibling, 1 reply; 32+ messages in thread
From: Minchan Kim @ 2015-01-28  0:22 UTC (permalink / raw)
  To: Sergey Senozhatsky
  Cc: Sergey Senozhatsky, Jerome Marchand, Andrew Morton, linux-kernel,
	linux-mm, Nitin Gupta

On Wed, Jan 28, 2015 at 09:15:27AM +0900, Minchan Kim wrote:
> Hello Sergey,
> 
> On Tue, Jan 27, 2015 at 01:03:05PM +0900, Sergey Senozhatsky wrote:
> > Hello,
> > 
> > On (01/27/15 12:18), Minchan Kim wrote:
> > > Hello Sergey,
> > > 
> > > On Tue, Jan 27, 2015 at 11:17:04AM +0900, Sergey Senozhatsky wrote:
> > > > On (01/27/15 01:00), Minchan Kim wrote:
> > > > > On Mon, Jan 26, 2015 at 11:17:09PM +0900, Sergey Senozhatsky wrote:
> > > > > > Hello,
> > > > > > 
> > > > > > On (01/26/15 10:33), Minchan Kim wrote:
> > > > > > > Hello,
> > > > > > > 
> > > > > > > On Sat, Jan 24, 2015 at 12:47:07AM +0900, Sergey Senozhatsky wrote:
> > > > > > > > On (01/23/15 15:48), Jerome Marchand wrote:
> > > > > > > > > On 01/23/2015 03:24 PM, Sergey Senozhatsky wrote:
> > > > > > > > > > On (01/23/15 14:58), Minchan Kim wrote:
> > > > > > > > > >> We don't need to call zram_meta_free, zcomp_destroy and zs_free
> > > > > > > > > >> under init_lock. What we need to prevent race with init_lock
> > > > > > > > > >> in reset is setting NULL into zram->meta (ie, init_done).
> > > > > > > > > >> This patch does it.
> > > > > > > > > >>
> > > > > > > > > >> Signed-off-by: Minchan Kim <minchan@kernel.org>
> > > > > > > > > >> ---
> > > > > > > > > >>  drivers/block/zram/zram_drv.c | 28 ++++++++++++++++------------
> > > > > > > > > >>  1 file changed, 16 insertions(+), 12 deletions(-)
> > > > > > > > > >>
> > > > > > > > > >> diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
> > > > > > > > > >> index 9250b3f54a8f..0299d82275e7 100644
> > > > > > > > > >> --- a/drivers/block/zram/zram_drv.c
> > > > > > > > > >> +++ b/drivers/block/zram/zram_drv.c
> > > > > > > > > >> @@ -708,6 +708,7 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity)
> > > > > > > > > >>  {
> > > > > > > > > >>  	size_t index;
> > > > > > > > > >>  	struct zram_meta *meta;
> > > > > > > > > >> +	struct zcomp *comp;
> > > > > > > > > >>  
> > > > > > > > > >>  	down_write(&zram->init_lock);
> > > > > > > > > >>  
> > > > > > > > > >> @@ -719,20 +720,10 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity)
> > > > > > > > > >>  	}
> > > > > > > > > >>  
> > > > > > > > > >>  	meta = zram->meta;
> > > > > > > > > >> -	/* Free all pages that are still in this zram device */
> > > > > > > > > >> -	for (index = 0; index < zram->disksize >> PAGE_SHIFT; index++) {
> > > > > > > > > >> -		unsigned long handle = meta->table[index].handle;
> > > > > > > > > >> -		if (!handle)
> > > > > > > > > >> -			continue;
> > > > > > > > > >> -
> > > > > > > > > >> -		zs_free(meta->mem_pool, handle);
> > > > > > > > > >> -	}
> > > > > > > > > >> -
> > > > > > > > > >> -	zcomp_destroy(zram->comp);
> > > > > > > > > > 
> > > > > > > > > > I'm not so sure about moving zcomp destruction. if we would have detached it
> > > > > > > > > > from zram, then yes. otherwise, think of zram ->destoy vs ->init race.
> > > > > > > > > > 
> > > > > > > > > > suppose,
> > > > > > > > > > CPU1 waits for down_write() init lock in disksize_store() with new comp already allocated;
> > > > > > > > > > CPU0 detaches ->meta and releases write init lock;
> > > > > > > > > > CPU1 grabs the lock and does zram->comp = comp;
> > > > > > > > > > CPU0 reaches the point of zcomp_destroy(zram->comp);
> > > > > > > > > 
> > > > > > > > > I don't see your point: this patch does not call
> > > > > > > > > zcomp_destroy(zram->comp) anymore, but zram_destroy(comp), where comp is
> > > > > > > > > the old zram->comp.
> > > > > > > > 
> > > > > > > > 
> > > > > > > > oh... yes. sorry! my bad.
> > > > > > > > 
> > > > > > > > 
> > > > > > > > 
> > > > > > > > anyway, on a second thought, do we even want to destoy meta out of init_lock?
> > > > > > > > 
> > > > > > > > I mean, it will let you init new device quicker. but... assume, you have
> > > > > > > > 30G zram (or any other bad-enough number). on CPU0 you reset device -- iterate
> > > > > > > > over 30G meta->table, etc. out of init_lock.
> > > > > > > > on CPU1 you concurrently re-init device and request again 30G.
> > > > > > > > 
> > > > > > > > how bad that can be?
> > > > > > > > 
> > > > > > > > 
> > > > > > > > 
> > > > > > > > diskstore called on already initialised device is also not so perfect.
> > > > > > > > we first will try to allocate ->meta (vmalloc pages for another 30G),
> > > > > > > > then allocate comp, then down_write() init lock to find out that device
> > > > > > > > is initialised and we need to release allocated memory.
> > > > > > > > 
> > > > > > > > 
> > > > > > > > 
> > > > > > > > may be we better keep ->meta destruction under init_lock and additionally
> > > > > > > > move ->meta and ->comp allocation under init_lock in disksize_store()?
> > > > > > > > 
> > > > > > > > like the following one:
> > > > > > > > 
> > > > > > > > ---
> > > > > > > > 
> > > > > > > >  drivers/block/zram/zram_drv.c | 25 +++++++++++++------------
> > > > > > > >  1 file changed, 13 insertions(+), 12 deletions(-)
> > > > > > > > 
> > > > > > > > diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
> > > > > > > > index 9250b3f..827ab21 100644
> > > > > > > > --- a/drivers/block/zram/zram_drv.c
> > > > > > > > +++ b/drivers/block/zram/zram_drv.c
> > > > > > > > @@ -765,9 +765,18 @@ static ssize_t disksize_store(struct device *dev,
> > > > > > > >  		return -EINVAL;
> > > > > > > >  
> > > > > > > >  	disksize = PAGE_ALIGN(disksize);
> > > > > > > > +	down_write(&zram->init_lock);
> > > > > > > > +	if (init_done(zram)) {
> > > > > > > > +		up_write(&zram->init_lock);
> > > > > > > > +		pr_info("Cannot change disksize for initialized device\n");
> > > > > > > > +		return -EBUSY;
> > > > > > > > +	}
> > > > > > > > +
> > > > > > > >  	meta = zram_meta_alloc(zram->disk->first_minor, disksize);
> > > > > > > > -	if (!meta)
> > > > > > > > -		return -ENOMEM;
> > > > > > > > +	if (!meta) {
> > > > > > > > +		err = -ENOMEM;
> > > > > > > > +		goto out_unlock;
> > > > > > > > +	}
> > > > > > > >  
> > > > > > > >  	comp = zcomp_create(zram->compressor, zram->max_comp_streams);
> > > > > > > >  	if (IS_ERR(comp)) {
> > > > > > > > @@ -777,13 +786,6 @@ static ssize_t disksize_store(struct device *dev,
> > > > > > > >  		goto out_free_meta;
> > > > > > > >  	}
> > > > > > > >  
> > > > > > > > -	down_write(&zram->init_lock);
> > > > > > > > -	if (init_done(zram)) {
> > > > > > > > -		pr_info("Cannot change disksize for initialized device\n");
> > > > > > > > -		err = -EBUSY;
> > > > > > > > -		goto out_destroy_comp;
> > > > > > > > -	}
> > > > > > > > -
> > > > > > > >  	zram->meta = meta;
> > > > > > > >  	zram->comp = comp;
> > > > > > > >  	zram->disksize = disksize;
> > > > > > > > @@ -799,11 +801,10 @@ static ssize_t disksize_store(struct device *dev,
> > > > > > > >  
> > > > > > > >  	return len;
> > > > > > > >  
> > > > > > > > -out_destroy_comp:
> > > > > > > > -	up_write(&zram->init_lock);
> > > > > > > > -	zcomp_destroy(comp);
> > > > > > > >  out_free_meta:
> > > > > > > >  	zram_meta_free(meta);
> > > > > > > > +out_unlock:
> > > > > > > > +	up_write(&zram->init_lock);
> > > > > > > >  	return err;
> > > > > > > >  }
> > > > > > > >  
> > > > > > > 
> > > > > > > The init_lock is really troublesome. We can't do call zram_meta_alloc
> > > > > > > under init_lock due to lockdep report. Please keep in mind.
> > > > > > >
> > > > > > 
> > > > > > ah... I do recall it, thanks for your reminder.
> > > > > > 
> > > > > > 
> > > > > > > The zram_rw_page is one of the function under reclaim path and hold it
> > > > > > > as read_lock while here holds it as write_lock.
> > > > > > > It's a false positive so that we might could make shut lockdep up
> > > > > > > by annotation but I don't want it but want to work with lockdep rather
> > > > > > > than disable. As well, there are other pathes to use init_lock to
> > > > > > > protect other data where would be victims of lockdep.
> > > > > > > 
> > > > > > > I didn't tell the motivation of this patch because it made you busy
> > > > > > > guys wasted. Let me tell it now. It was another lockdep report by
> > > > > > > kmem_cache_destroy for zsmalloc compaction about init_lock. That's why
> > > > > > > the patchset was one of the patch in compaction.
> > > > > > >
> > > > > > > Yes, the ideal is to remove horrible init_lock of zram in this phase and
> > > > > > > make code more simple and clear but I don't want to stuck zsmalloc
> > > > > > > compaction by the work.
> > > > > > 
> > > > > > 
> > > > > > > Having said that, I feel it's time to revisit
> > > > > > > to remove init_lock.
> > > > > > > At least, I will think over to find a solution to kill init_lock.
> > > > > > 
> > > > > > hm, can't think of anything quick...
> > > > > > 
> > > > > > 	-ss
> > > > > 
> > > > > Hello guys,
> > > > > 
> > > > > How about this?
> > > > > 
> > > > > It's based on Ganesh's patch.
> > > > > https://lkml.org/lkml/2015/1/24/50
> > > > (I see no similarities with Ganesh's patch)
> > > > 
> > > > hm, you probably meant this one https://lkml.org/lkml/2015/1/23/406
> > > > 
> > > > 
> > > > at glance this makes things a bit more complicated, so I need to think more.
> > > > 
> > > > > From afda9fd2f6c40dd0745d8a6babe78c5cbdceddf5 Mon Sep 17 00:00:00 2001
> > > > > From: Minchan Kim <minchan@kernel.org>
> > > > > Date: Mon, 26 Jan 2015 14:34:10 +0900
> > > > > Subject: [RFC] zram: remove init_lock in zram_make_request
> > > > > 
> > > > > Admin could reset zram during I/O operation going on so we have
> > > > > used zram->init_lock as read-side lock in I/O path to prevent
> > > > > sudden zram meta freeing.
> > > > > 
> > > > > However, the init_lock is really troublesome.
> > > > > We can't do call zram_meta_alloc under init_lock due to lockdep splat
> > > > > because zram_rw_page is one of the function under reclaim path and
> > > > > hold it as read_lock while other places in process context hold it
> > > > > as write_lock. So, we have used allocation out of the lock to avoid
> > > > > lockdep warn but it's not good for readability and fainally, I met
> > > > > another lockdep splat between init_lock and cpu_hotpulug from
> > > > > kmem_cache_destroy during wokring zsmalloc compaction. :(
> > > > > 
> > > > > Yes, the ideal is to remove horrible init_lock of zram in rw path.
> > > > > This patch removes it in rw path and instead, put init_done bool
> > > > > variable to check initialization done with smp_[wmb|rmb] and
> > > > > srcu_[un]read_lock to prevent sudden zram meta freeing
> > > > > during I/O operation.
> > > > > 
> > > > > Signed-off-by: Minchan Kim <minchan@kernel.org>
> > > > > ---
> > > > >  drivers/block/zram/zram_drv.c | 76 +++++++++++++++++++++++++++++--------------
> > > > >  drivers/block/zram/zram_drv.h |  5 +++
> > > > >  2 files changed, 57 insertions(+), 24 deletions(-)
> > > > > 
> > > > > diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
> > > > > index a598ada817f0..e06ff975f997 100644
> > > > > --- a/drivers/block/zram/zram_drv.c
> > > > > +++ b/drivers/block/zram/zram_drv.c
> > > > > @@ -32,6 +32,7 @@
> > > > >  #include <linux/string.h>
> > > > >  #include <linux/vmalloc.h>
> > > > >  #include <linux/err.h>
> > > > > +#include <linux/srcu.h>
> > > > >  
> > > > >  #include "zram_drv.h"
> > > > >  
> > > > > @@ -53,9 +54,16 @@ static ssize_t name##_show(struct device *d,		\
> > > > >  }									\
> > > > >  static DEVICE_ATTR_RO(name);
> > > > >  
> > > > > -static inline int init_done(struct zram *zram)
> > > > > +static inline bool init_done(struct zram *zram)
> > > > >  {
> > > > > -	return zram->meta != NULL;
> > > > > +	/*
> > > > > +	 * init_done can be used without holding zram->init_lock in
> > > > > +	 * read/write handler(ie, zram_make_request) but we should make sure
> > > > > +	 * that zram->init_done should set up after meta initialization is
> > > > > +	 * done. Look at disksize_store.
> > > > > +	 */
> > > > > +	smp_rmb();
> > > > > +	return zram->init_done;
> > > > 
> > > > ->init_done returns back :)
> > > 
> > > 
> > > > can we rely on write ->meta; wmb; --- rmb; read ->meta?
> > > 
> > > Might be possible.
> 
> Now that I think about it, it's impossible with zram->meta because
> we need to nullify it before call_srcu but pre-existing SRCU read-side
> critical sections can access zram->meta.
> Anyway, introducing a new variable should be not a party-pooper.
> 
> > > 
> > > > 
> > > > how much performance do we lose on barriers?
> > > 
> > > I think it's not too much than locking which does more than(ie,
> > > barrier, fairness, spin on owner and so on) such simple barrier.
> > > 
> > > > 
> > > > >  }
> > > > >  
> > > > >  static inline struct zram *dev_to_zram(struct device *dev)
> > > > > @@ -326,6 +334,10 @@ static void zram_meta_free(struct zram_meta *meta)
> > > > >  	kfree(meta);
> > > > >  }
> > > > >  
> > > > > +static void rcu_zram_do_nothing(struct rcu_head *unused)
> > > > > +{
> > > > > +}
> > > > > +
> > > > >  static struct zram_meta *zram_meta_alloc(int device_id, u64 disksize)
> > > > >  {
> > > > >  	char pool_name[8];
> > > > > @@ -726,11 +738,8 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity)
> > > > >  		return;
> > > > >  	}
> > > > >  
> > > > > -	zcomp_destroy(zram->comp);
> > > > >  	zram->max_comp_streams = 1;
> > > > >  
> > > > > -	zram_meta_free(zram->meta);
> > > > > -	zram->meta = NULL;
> > > > >  	/* Reset stats */
> > > > >  	memset(&zram->stats, 0, sizeof(zram->stats));
> > > > >  
> > > > > @@ -738,8 +747,12 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity)
> > > > >  	if (reset_capacity)
> > > > >  		set_capacity(zram->disk, 0);
> > > > >  
> > > > > +	zram->init_done = false;
> > > > 
> > > > missing wmb?
> > > 
> > > I thouht about it but when I read comment from call_srcu as follows
> > > "each cpu is guaranteed to have executed a full memory barrier",
> > > I decided we don't need it. Right? (ie, double check)
> > > 
> > 
> > hm, need to think about it.
> 
> Another idea is to use kick_all_cpus_sync, not srcu.
> With that, we don't need to add more instruction in rw path.
> I will try it.

>From 560478040d2e08c61796e67d0c3ee519ae67ac0f Mon Sep 17 00:00:00 2001
From: Minchan Kim <minchan@kernel.org>
Date: Mon, 26 Jan 2015 14:34:10 +0900
Subject: [PATCH] zram: remove init_lock in zram_make_request

An admin could reset zram while I/O is in progress, so we have
used zram->init_lock as a read-side lock in the I/O path to prevent
zram meta from being freed underneath in-flight I/O.

However, the init_lock is really troublesome.
We can't call zram_meta_alloc under init_lock due to a lockdep splat
because zram_rw_page is one of the functions in the reclaim path and
holds it as read_lock while other places in process context hold it
as write_lock. So, we have done the allocation outside the lock to
avoid the lockdep warning, but that's not good for readability and,
finally, I met another lockdep splat between init_lock and cpu_hotplug
from kmem_cache_destroy while working on zsmalloc compaction. :(

Yes, the ideal is to remove the horrible init_lock of zram from the
rw path. This patch removes it from the rw path and instead uses
kick_all_cpus_sync and a bool init_done variable, together with
smp_[wmb|rmb], to check whether initialization is done.

Once kick_all_cpus_sync returns, no CPU can access zram meta
any more because of the init_done check in zram_make_request, so
it's safe to free meta. So, finally, we avoid taking init_lock in
reclaim context, so we are free from the deadlock.

Signed-off-by: Minchan Kim <minchan@kernel.org>
---
 drivers/block/zram/zram_drv.c | 70 +++++++++++++++++++++++++------------------
 drivers/block/zram/zram_drv.h |  2 ++
 2 files changed, 43 insertions(+), 29 deletions(-)

diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index a598ada817f0..404602b1932e 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -53,9 +53,16 @@ static ssize_t name##_show(struct device *d,		\
 }									\
 static DEVICE_ATTR_RO(name);
 
-static inline int init_done(struct zram *zram)
+static inline bool init_done(struct zram *zram)
 {
-	return zram->meta != NULL;
+	/*
+	 * init_done can be used without holding zram->init_lock in
+	 * read/write handler(ie, zram_make_request) but we should make sure
+	 * that zram->init_done should set up after meta initialization is
+	 * done. Look at disksize_store.
+	 */
+	smp_rmb();
+	return zram->init_done;
 }
 
 static inline struct zram *dev_to_zram(struct device *dev)
@@ -726,11 +733,8 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity)
 		return;
 	}
 
-	zcomp_destroy(zram->comp);
 	zram->max_comp_streams = 1;
 
-	zram_meta_free(zram->meta);
-	zram->meta = NULL;
 	/* Reset stats */
 	memset(&zram->stats, 0, sizeof(zram->stats));
 
@@ -738,8 +742,16 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity)
 	if (reset_capacity)
 		set_capacity(zram->disk, 0);
 
+	zram->init_done = false;
+	/* don't need smp_wmb because kick_all_cpus_sync does */
+	kick_all_cpus_sync();
+	/*
+	 * From now on, any read/write cannot access zram meta data
+	 * by init_done in the handler.
+	 */
+	zram_meta_free(zram->meta);
+	zcomp_destroy(zram->comp);
 	up_write(&zram->init_lock);
-
 	/*
 	 * Revalidate disk out of the init_lock to avoid lockdep splat.
 	 * It's okay because disk's capacity is protected by init_lock
@@ -762,10 +774,19 @@ static ssize_t disksize_store(struct device *dev,
 	if (!disksize)
 		return -EINVAL;
 
+	down_write(&zram->init_lock);
+	if (init_done(zram)) {
+		pr_info("Cannot change disksize for initialized device\n");
+		up_write(&zram->init_lock);
+		return -EBUSY;
+	}
+
 	disksize = PAGE_ALIGN(disksize);
 	meta = zram_meta_alloc(zram->disk->first_minor, disksize);
-	if (!meta)
+	if (!meta) {
+		up_write(&zram->init_lock);
 		return -ENOMEM;
+	}
 
 	comp = zcomp_create(zram->compressor, zram->max_comp_streams);
 	if (IS_ERR(comp)) {
@@ -775,17 +796,17 @@ static ssize_t disksize_store(struct device *dev,
 		goto out_free_meta;
 	}
 
-	down_write(&zram->init_lock);
-	if (init_done(zram)) {
-		pr_info("Cannot change disksize for initialized device\n");
-		err = -EBUSY;
-		goto out_destroy_comp;
-	}
-
 	zram->meta = meta;
 	zram->comp = comp;
 	zram->disksize = disksize;
 	set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT);
+	/*
+	 * Store operation of struct zram fields should complete
+	 * before init_done set up because zram_bvec_rw doesn't
+	 * hold an zram->init_lock.
+	 */
+	smp_wmb();
+	zram->init_done = true;
 	up_write(&zram->init_lock);
 
 	/*
@@ -797,10 +818,8 @@ static ssize_t disksize_store(struct device *dev,
 
 	return len;
 
-out_destroy_comp:
-	up_write(&zram->init_lock);
-	zcomp_destroy(comp);
 out_free_meta:
+	up_write(&zram->init_lock);
 	zram_meta_free(meta);
 	return err;
 }
@@ -907,7 +926,6 @@ static void zram_make_request(struct request_queue *queue, struct bio *bio)
 {
 	struct zram *zram = queue->queuedata;
 
-	down_read(&zram->init_lock);
 	if (unlikely(!init_done(zram)))
 		goto error;
 
@@ -918,12 +936,10 @@ static void zram_make_request(struct request_queue *queue, struct bio *bio)
 	}
 
 	__zram_make_request(zram, bio);
-	up_read(&zram->init_lock);
 
 	return;
 
 error:
-	up_read(&zram->init_lock);
 	bio_io_error(bio);
 }
 
@@ -951,17 +967,16 @@ static int zram_rw_page(struct block_device *bdev, sector_t sector,
 	struct bio_vec bv;
 
 	zram = bdev->bd_disk->private_data;
+
+	/* This should be another patch */
+	if (unlikely(!init_done(zram)))
+		return -EIO;
+
 	if (!valid_io_request(zram, sector, PAGE_SIZE)) {
 		atomic64_inc(&zram->stats.invalid_io);
 		return -EINVAL;
 	}
 
-	down_read(&zram->init_lock);
-	if (unlikely(!init_done(zram))) {
-		err = -EIO;
-		goto out_unlock;
-	}
-
 	index = sector >> SECTORS_PER_PAGE_SHIFT;
 	offset = sector & (SECTORS_PER_PAGE - 1) << SECTOR_SHIFT;
 
@@ -970,8 +985,6 @@ static int zram_rw_page(struct block_device *bdev, sector_t sector,
 	bv.bv_offset = 0;
 
 	err = zram_bvec_rw(zram, &bv, index, offset, rw);
-out_unlock:
-	up_read(&zram->init_lock);
 	/*
 	 * If I/O fails, just return error(ie, non-zero) without
 	 * calling page_endio.
@@ -1125,7 +1138,6 @@ static void destroy_device(struct zram *zram)
 
 	del_gendisk(zram->disk);
 	put_disk(zram->disk);
-
 	blk_cleanup_queue(zram->queue);
 }
 
diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
index e492f6bf11f1..dca265654285 100644
--- a/drivers/block/zram/zram_drv.h
+++ b/drivers/block/zram/zram_drv.h
@@ -107,6 +107,8 @@ struct zram {
 
 	/* Prevent concurrent execution of device init, reset and R/W request */
 	struct rw_semaphore init_lock;
+	bool init_done;
+
 	/*
 	 * This is the limit on amount of *uncompressed* worth of data
 	 * we can store in a disk.
-- 
1.9.1


-- 
Kind regards,
Minchan Kim

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/2] zram: free meta out of init_lock
  2015-01-28  0:15                   ` Minchan Kim
  2015-01-28  0:22                     ` Minchan Kim
@ 2015-01-28  0:24                     ` Sergey Senozhatsky
  2015-01-28  0:59                       ` Minchan Kim
  1 sibling, 1 reply; 32+ messages in thread
From: Sergey Senozhatsky @ 2015-01-28  0:24 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Sergey Senozhatsky, Sergey Senozhatsky, Jerome Marchand,
	Andrew Morton, linux-kernel, linux-mm, Nitin Gupta

Hello,

On (01/28/15 09:15), Minchan Kim wrote:
> > > > > > > On Sat, Jan 24, 2015 at 12:47:07AM +0900, Sergey Senozhatsky wrote:
> > > > > > > > On (01/23/15 15:48), Jerome Marchand wrote:
> > > > > > > > > On 01/23/2015 03:24 PM, Sergey Senozhatsky wrote:
> > > > > > > > > > On (01/23/15 14:58), Minchan Kim wrote:
> > > > > > > > > >> We don't need to call zram_meta_free, zcomp_destroy and zs_free
> > > > > > > > > >> under init_lock. What we need to prevent race with init_lock
> > > > > > > > > >> in reset is setting NULL into zram->meta (ie, init_done).
> > > > > > > > > >> This patch does it.
> > > > > > > > > >>
> > > > > > > > > >> Signed-off-by: Minchan Kim <minchan@kernel.org>
> > > > > > > > > >> ---
> > > > > > > > > >>  drivers/block/zram/zram_drv.c | 28 ++++++++++++++++------------
> > > > > > > > > >>  1 file changed, 16 insertions(+), 12 deletions(-)
> > > > > > > > > >>
> > > > > > > > > >> diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
> > > > > > > > > >> index 9250b3f54a8f..0299d82275e7 100644
> > > > > > > > > >> --- a/drivers/block/zram/zram_drv.c
> > > > > > > > > >> +++ b/drivers/block/zram/zram_drv.c
> > > > > > > > > >> @@ -708,6 +708,7 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity)
> > > > > > > > > >>  {
> > > > > > > > > >>  	size_t index;
> > > > > > > > > >>  	struct zram_meta *meta;
> > > > > > > > > >> +	struct zcomp *comp;
> > > > > > > > > >>  
> > > > > > > > > >>  	down_write(&zram->init_lock);
> > > > > > > > > >>  
> > > > > > > > > >> @@ -719,20 +720,10 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity)
> > > > > > > > > >>  	}
> > > > > > > > > >>  
> > > > > > > > > >>  	meta = zram->meta;
> > > > > > > > > >> -	/* Free all pages that are still in this zram device */
> > > > > > > > > >> -	for (index = 0; index < zram->disksize >> PAGE_SHIFT; index++) {
> > > > > > > > > >> -		unsigned long handle = meta->table[index].handle;
> > > > > > > > > >> -		if (!handle)
> > > > > > > > > >> -			continue;
> > > > > > > > > >> -
> > > > > > > > > >> -		zs_free(meta->mem_pool, handle);
> > > > > > > > > >> -	}
> > > > > > > > > >> -
> > > > > > > > > >> -	zcomp_destroy(zram->comp);
> > > > > > > > > > 
> > > > > > > > > > I'm not so sure about moving zcomp destruction. if we would have detached it
> > > > > > > > > > from zram, then yes. otherwise, think of zram ->destoy vs ->init race.
> > > > > > > > > > 
> > > > > > > > > > suppose,
> > > > > > > > > > CPU1 waits for down_write() init lock in disksize_store() with new comp already allocated;
> > > > > > > > > > CPU0 detaches ->meta and releases write init lock;
> > > > > > > > > > CPU1 grabs the lock and does zram->comp = comp;
> > > > > > > > > > CPU0 reaches the point of zcomp_destroy(zram->comp);
> > > > > > > > > 
> > > > > > > > > I don't see your point: this patch does not call
> > > > > > > > > zcomp_destroy(zram->comp) anymore, but zram_destroy(comp), where comp is
> > > > > > > > > the old zram->comp.
> > > > > > > > 
> > > > > > > > 
> > > > > > > > oh... yes. sorry! my bad.
> > > > > > > > 
> > > > > > > > 
> > > > > > > > 
> > > > > > > > anyway, on a second thought, do we even want to destoy meta out of init_lock?
> > > > > > > > 
> > > > > > > > I mean, it will let you init new device quicker. but... assume, you have
> > > > > > > > 30G zram (or any other bad-enough number). on CPU0 you reset device -- iterate
> > > > > > > > over 30G meta->table, etc. out of init_lock.
> > > > > > > > on CPU1 you concurrently re-init device and request again 30G.
> > > > > > > > 
> > > > > > > > how bad that can be?
> > > > > > > > 
> > > > > > > > 
> > > > > > > > 
> > > > > > > > diskstore called on already initialised device is also not so perfect.
> > > > > > > > we first will try to allocate ->meta (vmalloc pages for another 30G),
> > > > > > > > then allocate comp, then down_write() init lock to find out that device
> > > > > > > > is initialised and we need to release allocated memory.
> > > > > > > > 
> > > > > > > > 
> > > > > > > > 
> > > > > > > > may be we better keep ->meta destruction under init_lock and additionally
> > > > > > > > move ->meta and ->comp allocation under init_lock in disksize_store()?
> > > > > > > > 
> > > > > > > > like the following one:
> > > > > > > > 
> > > > > > > > ---
> > > > > > > > 
> > > > > > > >  drivers/block/zram/zram_drv.c | 25 +++++++++++++------------
> > > > > > > >  1 file changed, 13 insertions(+), 12 deletions(-)
> > > > > > > > 
> > > > > > > > diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
> > > > > > > > index 9250b3f..827ab21 100644
> > > > > > > > --- a/drivers/block/zram/zram_drv.c
> > > > > > > > +++ b/drivers/block/zram/zram_drv.c
> > > > > > > > @@ -765,9 +765,18 @@ static ssize_t disksize_store(struct device *dev,
> > > > > > > >  		return -EINVAL;
> > > > > > > >  
> > > > > > > >  	disksize = PAGE_ALIGN(disksize);
> > > > > > > > +	down_write(&zram->init_lock);
> > > > > > > > +	if (init_done(zram)) {
> > > > > > > > +		up_write(&zram->init_lock);
> > > > > > > > +		pr_info("Cannot change disksize for initialized device\n");
> > > > > > > > +		return -EBUSY;
> > > > > > > > +	}
> > > > > > > > +
> > > > > > > >  	meta = zram_meta_alloc(zram->disk->first_minor, disksize);
> > > > > > > > -	if (!meta)
> > > > > > > > -		return -ENOMEM;
> > > > > > > > +	if (!meta) {
> > > > > > > > +		err = -ENOMEM;
> > > > > > > > +		goto out_unlock;
> > > > > > > > +	}
> > > > > > > >  
> > > > > > > >  	comp = zcomp_create(zram->compressor, zram->max_comp_streams);
> > > > > > > >  	if (IS_ERR(comp)) {
> > > > > > > > @@ -777,13 +786,6 @@ static ssize_t disksize_store(struct device *dev,
> > > > > > > >  		goto out_free_meta;
> > > > > > > >  	}
> > > > > > > >  
> > > > > > > > -	down_write(&zram->init_lock);
> > > > > > > > -	if (init_done(zram)) {
> > > > > > > > -		pr_info("Cannot change disksize for initialized device\n");
> > > > > > > > -		err = -EBUSY;
> > > > > > > > -		goto out_destroy_comp;
> > > > > > > > -	}
> > > > > > > > -
> > > > > > > >  	zram->meta = meta;
> > > > > > > >  	zram->comp = comp;
> > > > > > > >  	zram->disksize = disksize;
> > > > > > > > @@ -799,11 +801,10 @@ static ssize_t disksize_store(struct device *dev,
> > > > > > > >  
> > > > > > > >  	return len;
> > > > > > > >  
> > > > > > > > -out_destroy_comp:
> > > > > > > > -	up_write(&zram->init_lock);
> > > > > > > > -	zcomp_destroy(comp);
> > > > > > > >  out_free_meta:
> > > > > > > >  	zram_meta_free(meta);
> > > > > > > > +out_unlock:
> > > > > > > > +	up_write(&zram->init_lock);
> > > > > > > >  	return err;
> > > > > > > >  }
> > > > > > > >  
> > > > > > > 
> > > > > > > The init_lock is really troublesome. We can't do call zram_meta_alloc
> > > > > > > under init_lock due to lockdep report. Please keep in mind.
> > > > > > >
> > > > > > 
> > > > > > ah... I do recall it, thanks for your reminder.
> > > > > > 
> > > > > > 
> > > > > > > The zram_rw_page is one of the function under reclaim path and hold it
> > > > > > > as read_lock while here holds it as write_lock.
> > > > > > > It's a false positive so that we might could make shut lockdep up
> > > > > > > by annotation but I don't want it but want to work with lockdep rather
> > > > > > > than disable. As well, there are other pathes to use init_lock to
> > > > > > > protect other data where would be victims of lockdep.
> > > > > > > 
> > > > > > > I didn't tell the motivation of this patch because it made you busy
> > > > > > > guys wasted. Let me tell it now. It was another lockdep report by
> > > > > > > kmem_cache_destroy for zsmalloc compaction about init_lock. That's why
> > > > > > > the patchset was one of the patch in compaction.
> > > > > > >
> > > > > > > Yes, the ideal is to remove horrible init_lock of zram in this phase and
> > > > > > > make code more simple and clear but I don't want to stuck zsmalloc
> > > > > > > compaction by the work.
> > > > > > 
> > > > > > 
> > > > > > > Having said that, I feel it's time to revisit
> > > > > > > to remove init_lock.
> > > > > > > At least, I will think over to find a solution to kill init_lock.
> > > > > > 
> > > > > > hm, can't think of anything quick...
> > > > > > 
> > > > > > 	-ss
> > > > > 
> > > > > Hello guys,
> > > > > 
> > > > > How about this?
> > > > > 
> > > > > It's based on Ganesh's patch.
> > > > > https://lkml.org/lkml/2015/1/24/50
> > > > (I see no similarities with Ganesh's patch)
> > > > 
> > > > hm, you probably meant this one https://lkml.org/lkml/2015/1/23/406
> > > > 
> > > > 
> > > > at glance this makes things a bit more complicated, so I need to think more.
> > > > 
> > > > > From afda9fd2f6c40dd0745d8a6babe78c5cbdceddf5 Mon Sep 17 00:00:00 2001
> > > > > From: Minchan Kim <minchan@kernel.org>
> > > > > Date: Mon, 26 Jan 2015 14:34:10 +0900
> > > > > Subject: [RFC] zram: remove init_lock in zram_make_request
> > > > > 
> > > > > Admin could reset zram during I/O operation going on so we have
> > > > > used zram->init_lock as read-side lock in I/O path to prevent
> > > > > sudden zram meta freeing.
> > > > > 
> > > > > However, the init_lock is really troublesome.
> > > > > We can't do call zram_meta_alloc under init_lock due to lockdep splat
> > > > > because zram_rw_page is one of the function under reclaim path and
> > > > > hold it as read_lock while other places in process context hold it
> > > > > as write_lock. So, we have used allocation out of the lock to avoid
> > > > > lockdep warn but it's not good for readability and fainally, I met
> > > > > another lockdep splat between init_lock and cpu_hotpulug from
> > > > > kmem_cache_destroy during wokring zsmalloc compaction. :(
> > > > > 
> > > > > Yes, the ideal is to remove horrible init_lock of zram in rw path.
> > > > > This patch removes it in rw path and instead, put init_done bool
> > > > > variable to check initialization done with smp_[wmb|rmb] and
> > > > > srcu_[un]read_lock to prevent sudden zram meta freeing
> > > > > during I/O operation.
> > > > > 
> > > > > Signed-off-by: Minchan Kim <minchan@kernel.org>
> > > > > ---
> > > > >  drivers/block/zram/zram_drv.c | 76 +++++++++++++++++++++++++++++--------------
> > > > >  drivers/block/zram/zram_drv.h |  5 +++
> > > > >  2 files changed, 57 insertions(+), 24 deletions(-)
> > > > > 
> > > > > diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
> > > > > index a598ada817f0..e06ff975f997 100644
> > > > > --- a/drivers/block/zram/zram_drv.c
> > > > > +++ b/drivers/block/zram/zram_drv.c
> > > > > @@ -32,6 +32,7 @@
> > > > >  #include <linux/string.h>
> > > > >  #include <linux/vmalloc.h>
> > > > >  #include <linux/err.h>
> > > > > +#include <linux/srcu.h>
> > > > >  
> > > > >  #include "zram_drv.h"
> > > > >  
> > > > > @@ -53,9 +54,16 @@ static ssize_t name##_show(struct device *d,		\
> > > > >  }									\
> > > > >  static DEVICE_ATTR_RO(name);
> > > > >  
> > > > > -static inline int init_done(struct zram *zram)
> > > > > +static inline bool init_done(struct zram *zram)
> > > > >  {
> > > > > -	return zram->meta != NULL;
> > > > > +	/*
> > > > > +	 * init_done can be used without holding zram->init_lock in
> > > > > +	 * read/write handler(ie, zram_make_request) but we should make sure
> > > > > +	 * that zram->init_done should set up after meta initialization is
> > > > > +	 * done. Look at disksize_store.
> > > > > +	 */
> > > > > +	smp_rmb();
> > > > > +	return zram->init_done;
> > > > 
> > > > ->init_done returns back :)
> > > 
> > > 
> > > > can we rely on write ->meta; wmb; --- rmb; read ->meta?
> > > 
> > > Might be possible.
> 
> Now that I think about it, it's impossible with zram->meta because
> we need to nullify it before call_srcu but pre-existing SRCU read-side
> critical sections can access zram->meta.
> Anyway, introducing a new variable should be not a party-pooper.
> 
> > > 
> > > > 
> > > > how much performance do we lose on barriers?
> > > 
> > > I think it's not too much than locking which does more than(ie,
> > > barrier, fairness, spin on owner and so on) such simple barrier.
> > > 
> > > > 
> > > > >  }
> > > > >  
> > > > >  static inline struct zram *dev_to_zram(struct device *dev)
> > > > > @@ -326,6 +334,10 @@ static void zram_meta_free(struct zram_meta *meta)
> > > > >  	kfree(meta);
> > > > >  }
> > > > >  
> > > > > +static void rcu_zram_do_nothing(struct rcu_head *unused)
> > > > > +{
> > > > > +}
> > > > > +
> > > > >  static struct zram_meta *zram_meta_alloc(int device_id, u64 disksize)
> > > > >  {
> > > > >  	char pool_name[8];
> > > > > @@ -726,11 +738,8 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity)
> > > > >  		return;
> > > > >  	}
> > > > >  
> > > > > -	zcomp_destroy(zram->comp);
> > > > >  	zram->max_comp_streams = 1;
> > > > >  
> > > > > -	zram_meta_free(zram->meta);
> > > > > -	zram->meta = NULL;
> > > > >  	/* Reset stats */
> > > > >  	memset(&zram->stats, 0, sizeof(zram->stats));
> > > > >  
> > > > > @@ -738,8 +747,12 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity)
> > > > >  	if (reset_capacity)
> > > > >  		set_capacity(zram->disk, 0);
> > > > >  
> > > > > +	zram->init_done = false;
> > > > 
> > > > missing wmb?
> > > 
> > > > I thought about it but when I read comment from call_srcu as follows
> > > "each cpu is guaranteed to have executed a full memory barrier",
> > > I decided we don't need it. Right? (ie, double check)
> > > 
> > 
> > hm, need to think about it.
> 
> Another idea is to use kick_all_cpus_sync, not srcu.
> With that, we don't need to add more instruction in rw path.
> I will try it.
> 

hm, that will kick all cpus out of idle.

> > 
> > > > 
> > > > I think we also better put comments after every wmb/rmb. like
> > > > 
> > > > 	smp_wmb(); /* pairs with rmb() in foo() */
> > > 
> > > I already put the comment in other smp_rmb/wmb.
> > > If it's not what you want, please suggest me. :)
> > > 
> > 
> > they are fine. it was a minor nitpick.
> > I just read in the list that guys want to explicitly show which wmb
> > corresponds to which rmb. but we have only two of them, so it's not
> > a big deal.
> > 
> > > > 
> > > > 
> > > > > +	call_srcu(&zram->srcu, &zram->rcu, rcu_zram_do_nothing);
> > > > > +	synchronize_srcu(&zram->srcu);
> > > > > +	zram_meta_free(zram->meta);
> > > > > +	zcomp_destroy(zram->comp);
> > > > >  	up_write(&zram->init_lock);
> > > > > -
> > > > >  	/*
> > > > >  	 * Revalidate disk out of the init_lock to avoid lockdep splat.
> > > > >  	 * It's okay because disk's capacity is protected by init_lock
> > > > > @@ -762,10 +775,19 @@ static ssize_t disksize_store(struct device *dev,
> > > > >  	if (!disksize)
> > > > >  		return -EINVAL;
> > > > >  
> > > > > +	down_write(&zram->init_lock);
> > > > > +	if (init_done(zram)) {
> > > > > +		pr_info("Cannot change disksize for initialized device\n");
> > > > > +		up_write(&zram->init_lock);
> > > > > +		return -EBUSY;
> > > > > +	}
> > > > > +
> > > > >  	disksize = PAGE_ALIGN(disksize);
> > > > >  	meta = zram_meta_alloc(zram->disk->first_minor, disksize);
> > > > > -	if (!meta)
> > > > > +	if (!meta) {
> > > > > +		up_write(&zram->init_lock);
> > > > >  		return -ENOMEM;
> > > > > +	}
> > > > >  
> > > > >  	comp = zcomp_create(zram->compressor, zram->max_comp_streams);
> > > > >  	if (IS_ERR(comp)) {
> > > > > @@ -775,17 +797,17 @@ static ssize_t disksize_store(struct device *dev,
> > > > >  		goto out_free_meta;
> > > > >  	}
> > > > >  
> > > > > -	down_write(&zram->init_lock);
> > > > > -	if (init_done(zram)) {
> > > > > -		pr_info("Cannot change disksize for initialized device\n");
> > > > > -		err = -EBUSY;
> > > > > -		goto out_destroy_comp;
> > > > > -	}
> > > > > -
> > > > >  	zram->meta = meta;
> > > > >  	zram->comp = comp;
> > > > >  	zram->disksize = disksize;
> > > > >  	set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT);
> > > > > +	/*
> > > > > +	 * Store operation of struct zram fields should complete
> > > > > +	 * before init_done set up because zram_bvec_rw doesn't
> > > > > +	 * hold an zram->init_lock.
> > > > > +	 */
> > > > > +	smp_wmb();
> > > > > +	zram->init_done = true;
> > > > >  	up_write(&zram->init_lock);
> > > > >  
> > > > >  	/*
> > > > > @@ -797,10 +819,8 @@ static ssize_t disksize_store(struct device *dev,
> > > > >  
> > > > >  	return len;
> > > > >  
> > > > > -out_destroy_comp:
> > > > > -	up_write(&zram->init_lock);
> > > > > -	zcomp_destroy(comp);
> > > > >  out_free_meta:
> > > > > +	up_write(&zram->init_lock);
> > > > >  	zram_meta_free(meta);
> > > > 
> > > >  zram_meta_free(meta);
> > > >  up_write(&zram->init_lock);
> > > > 
> > > >  ?
> > > 
> > > I don't think we should release meta under init_lock.
> > > Do you have any reason I am missing?
> > > 
> > 
> > well, just theoretical.
> > forbid concurrent initialization until we completely rollback.
> > 
> >              CPU0                                     CPU1
> > 
> > echo 30G > /.../zram0/disksize
> > meta = vmalloc(pages for 30G)
> > 
> > out_free_meta:                              echo 30G > /.../zram0/disksize
> > 	up_write(&zram->init_lock);         meta = vmalloc(pages for 30G)
> > 	zram_meta_free(meta);               ^^^^ 30G + 30G
> >                                             out_free_meta:
> >                                                    ....
> > 	-ss
> 
> It might but as it is, we have allocated meta out of the lock.
> if it turns out to be a real problem, it's easy to fix it by this work
> (ie, we could alloc/free meta under init_lock).
> IOW, it should be another patch so I don't want to take care of it
> in this work.
> 

fair enough.

	-ss

> > 
> > > > 
> > > > >  	return err;
> > > > >  }
> > > > > @@ -905,9 +925,10 @@ out:
> > > > >   */
> > > > >  static void zram_make_request(struct request_queue *queue, struct bio *bio)
> > > > >  {
> > > > > +	int idx;
> > > > >  	struct zram *zram = queue->queuedata;
> > > > >  
> > > > > -	down_read(&zram->init_lock);
> > > > > +	idx = srcu_read_lock(&zram->srcu);
> > > > >  	if (unlikely(!init_done(zram)))
> > > > >  		goto error;
> > > > >  
> > > > > @@ -918,12 +939,12 @@ static void zram_make_request(struct request_queue *queue, struct bio *bio)
> > > > >  	}
> > > > >  
> > > > >  	__zram_make_request(zram, bio);
> > > > > -	up_read(&zram->init_lock);
> > > > > +	srcu_read_unlock(&zram->srcu, idx);
> > > > >  
> > > > >  	return;
> > > > >  
> > > > >  error:
> > > > > -	up_read(&zram->init_lock);
> > > > > +	srcu_read_unlock(&zram->srcu, idx);
> > > > >  	bio_io_error(bio);
> > > > >  }
> > > > >  
> > > > > @@ -945,18 +966,20 @@ static void zram_slot_free_notify(struct block_device *bdev,
> > > > >  static int zram_rw_page(struct block_device *bdev, sector_t sector,
> > > > >  		       struct page *page, int rw)
> > > > >  {
> > > > > -	int offset, err;
> > > > > +	int offset, err, idx;
> > > > >  	u32 index;
> > > > >  	struct zram *zram;
> > > > >  	struct bio_vec bv;
> > > > >  
> > > > >  	zram = bdev->bd_disk->private_data;
> > > > > +	idx = srcu_read_lock(&zram->srcu);
> > > > > +
> > > > >  	if (!valid_io_request(zram, sector, PAGE_SIZE)) {
> > > > >  		atomic64_inc(&zram->stats.invalid_io);
> > > > > +		srcu_read_unlock(&zram->srcu, idx);
> > > > >  		return -EINVAL;
> > > > >  	}
> > > > >  
> > > > > -	down_read(&zram->init_lock);
> > > > >  	if (unlikely(!init_done(zram))) {
> > > > >  		err = -EIO;
> > > > >  		goto out_unlock;
> > > > > @@ -971,7 +994,7 @@ static int zram_rw_page(struct block_device *bdev, sector_t sector,
> > > > >  
> > > > >  	err = zram_bvec_rw(zram, &bv, index, offset, rw);
> > > > >  out_unlock:
> > > > > -	up_read(&zram->init_lock);
> > > > > +	srcu_read_unlock(&zram->srcu, idx);
> > > > >  	/*
> > > > >  	 * If I/O fails, just return error(ie, non-zero) without
> > > > >  	 * calling page_endio.
> > > > > @@ -1041,6 +1064,11 @@ static int create_device(struct zram *zram, int device_id)
> > > > >  
> > > > >  	init_rwsem(&zram->init_lock);
> > > > >  
> > > > > +	if (init_srcu_struct(&zram->srcu)) {
> > > > > +		pr_err("Error initialize srcu for device %d\n", device_id);
> > > > > +		goto out;
> > > > > +	}
> > > > > +
> > > > >  	zram->queue = blk_alloc_queue(GFP_KERNEL);
> > > > >  	if (!zram->queue) {
> > > > >  		pr_err("Error allocating disk queue for device %d\n",
> > > > > @@ -1125,8 +1153,8 @@ static void destroy_device(struct zram *zram)
> > > > >  
> > > > >  	del_gendisk(zram->disk);
> > > > >  	put_disk(zram->disk);
> > > > > -
> > > > >  	blk_cleanup_queue(zram->queue);
> > > > > +	cleanup_srcu_struct(&zram->srcu);
> > > > >  }
> > > > >  
> > > > >  static int __init zram_init(void)
> > > > > diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
> > > > > index e492f6bf11f1..2042c310aea8 100644
> > > > > --- a/drivers/block/zram/zram_drv.h
> > > > > +++ b/drivers/block/zram/zram_drv.h
> > > > > @@ -105,8 +105,13 @@ struct zram {
> > > > >  	struct gendisk *disk;
> > > > >  	struct zcomp *comp;
> > > > >  
> > > > > +	struct srcu_struct srcu;
> > > > > +	struct rcu_head rcu;
> > > > > +
> > > > >  	/* Prevent concurrent execution of device init, reset and R/W request */
> > > > >  	struct rw_semaphore init_lock;
> > > > > +	bool init_done;
> > > > > +
> > > > >  	/*
> > > > >  	 * This is the limit on amount of *uncompressed* worth of data
> > > > >  	 * we can store in a disk.
> > > > > -- 
> > > > > 1.9.1
> > > > > 
> > > 
> > > -- 
> > > Kind regards,
> > > Minchan Kim
> > > 
> 
> -- 
> Kind regards,
> Minchan Kim
> 

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/2] zram: free meta out of init_lock
  2015-01-28  0:24                     ` Sergey Senozhatsky
@ 2015-01-28  0:59                       ` Minchan Kim
  0 siblings, 0 replies; 32+ messages in thread
From: Minchan Kim @ 2015-01-28  0:59 UTC (permalink / raw)
  To: Sergey Senozhatsky
  Cc: Sergey Senozhatsky, Jerome Marchand, Andrew Morton, linux-kernel,
	linux-mm, Nitin Gupta

On Wed, Jan 28, 2015 at 09:24:49AM +0900, Sergey Senozhatsky wrote:
> Hello,
> 
> On (01/28/15 09:15), Minchan Kim wrote:
> > > > > > > > On Sat, Jan 24, 2015 at 12:47:07AM +0900, Sergey Senozhatsky wrote:
> > > > > > > > > On (01/23/15 15:48), Jerome Marchand wrote:
> > > > > > > > > > On 01/23/2015 03:24 PM, Sergey Senozhatsky wrote:
> > > > > > > > > > > On (01/23/15 14:58), Minchan Kim wrote:
> > > > > > > > > > >> We don't need to call zram_meta_free, zcomp_destroy and zs_free
> > > > > > > > > > >> under init_lock. What we need to prevent race with init_lock
> > > > > > > > > > >> in reset is setting NULL into zram->meta (ie, init_done).
> > > > > > > > > > >> This patch does it.
> > > > > > > > > > >>
> > > > > > > > > > >> Signed-off-by: Minchan Kim <minchan@kernel.org>
> > > > > > > > > > >> ---
> > > > > > > > > > >>  drivers/block/zram/zram_drv.c | 28 ++++++++++++++++------------
> > > > > > > > > > >>  1 file changed, 16 insertions(+), 12 deletions(-)
> > > > > > > > > > >>
> > > > > > > > > > >> diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
> > > > > > > > > > >> index 9250b3f54a8f..0299d82275e7 100644
> > > > > > > > > > >> --- a/drivers/block/zram/zram_drv.c
> > > > > > > > > > >> +++ b/drivers/block/zram/zram_drv.c
> > > > > > > > > > >> @@ -708,6 +708,7 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity)
> > > > > > > > > > >>  {
> > > > > > > > > > >>  	size_t index;
> > > > > > > > > > >>  	struct zram_meta *meta;
> > > > > > > > > > >> +	struct zcomp *comp;
> > > > > > > > > > >>  
> > > > > > > > > > >>  	down_write(&zram->init_lock);
> > > > > > > > > > >>  
> > > > > > > > > > >> @@ -719,20 +720,10 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity)
> > > > > > > > > > >>  	}
> > > > > > > > > > >>  
> > > > > > > > > > >>  	meta = zram->meta;
> > > > > > > > > > >> -	/* Free all pages that are still in this zram device */
> > > > > > > > > > >> -	for (index = 0; index < zram->disksize >> PAGE_SHIFT; index++) {
> > > > > > > > > > >> -		unsigned long handle = meta->table[index].handle;
> > > > > > > > > > >> -		if (!handle)
> > > > > > > > > > >> -			continue;
> > > > > > > > > > >> -
> > > > > > > > > > >> -		zs_free(meta->mem_pool, handle);
> > > > > > > > > > >> -	}
> > > > > > > > > > >> -
> > > > > > > > > > >> -	zcomp_destroy(zram->comp);
> > > > > > > > > > > 
> > > > > > > > > > > I'm not so sure about moving zcomp destruction. if we would have detached it
> > > > > > > > > > > from zram, then yes. otherwise, think of zram ->destoy vs ->init race.
> > > > > > > > > > > 
> > > > > > > > > > > suppose,
> > > > > > > > > > > CPU1 waits for down_write() init lock in disksize_store() with new comp already allocated;
> > > > > > > > > > > CPU0 detaches ->meta and releases write init lock;
> > > > > > > > > > > CPU1 grabs the lock and does zram->comp = comp;
> > > > > > > > > > > CPU0 reaches the point of zcomp_destroy(zram->comp);
> > > > > > > > > > 
> > > > > > > > > > I don't see your point: this patch does not call
> > > > > > > > > > zcomp_destroy(zram->comp) anymore, but zram_destroy(comp), where comp is
> > > > > > > > > > the old zram->comp.
> > > > > > > > > 
> > > > > > > > > 
> > > > > > > > > oh... yes. sorry! my bad.
> > > > > > > > > 
> > > > > > > > > 
> > > > > > > > > 
> > > > > > > > > > anyway, on a second thought, do we even want to destroy meta out of init_lock?
> > > > > > > > > 
> > > > > > > > > I mean, it will let you init new device quicker. but... assume, you have
> > > > > > > > > 30G zram (or any other bad-enough number). on CPU0 you reset device -- iterate
> > > > > > > > > over 30G meta->table, etc. out of init_lock.
> > > > > > > > > on CPU1 you concurrently re-init device and request again 30G.
> > > > > > > > > 
> > > > > > > > > how bad that can be?
> > > > > > > > > 
> > > > > > > > > 
> > > > > > > > > 
> > > > > > > > > diskstore called on already initialised device is also not so perfect.
> > > > > > > > > we first will try to allocate ->meta (vmalloc pages for another 30G),
> > > > > > > > > then allocate comp, then down_write() init lock to find out that device
> > > > > > > > > is initialised and we need to release allocated memory.
> > > > > > > > > 
> > > > > > > > > 
> > > > > > > > > 
> > > > > > > > > may be we better keep ->meta destruction under init_lock and additionally
> > > > > > > > > move ->meta and ->comp allocation under init_lock in disksize_store()?
> > > > > > > > > 
> > > > > > > > > like the following one:
> > > > > > > > > 
> > > > > > > > > ---
> > > > > > > > > 
> > > > > > > > >  drivers/block/zram/zram_drv.c | 25 +++++++++++++------------
> > > > > > > > >  1 file changed, 13 insertions(+), 12 deletions(-)
> > > > > > > > > 
> > > > > > > > > diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
> > > > > > > > > index 9250b3f..827ab21 100644
> > > > > > > > > --- a/drivers/block/zram/zram_drv.c
> > > > > > > > > +++ b/drivers/block/zram/zram_drv.c
> > > > > > > > > @@ -765,9 +765,18 @@ static ssize_t disksize_store(struct device *dev,
> > > > > > > > >  		return -EINVAL;
> > > > > > > > >  
> > > > > > > > >  	disksize = PAGE_ALIGN(disksize);
> > > > > > > > > +	down_write(&zram->init_lock);
> > > > > > > > > +	if (init_done(zram)) {
> > > > > > > > > +		up_write(&zram->init_lock);
> > > > > > > > > +		pr_info("Cannot change disksize for initialized device\n");
> > > > > > > > > +		return -EBUSY;
> > > > > > > > > +	}
> > > > > > > > > +
> > > > > > > > >  	meta = zram_meta_alloc(zram->disk->first_minor, disksize);
> > > > > > > > > -	if (!meta)
> > > > > > > > > -		return -ENOMEM;
> > > > > > > > > +	if (!meta) {
> > > > > > > > > +		err = -ENOMEM;
> > > > > > > > > +		goto out_unlock;
> > > > > > > > > +	}
> > > > > > > > >  
> > > > > > > > >  	comp = zcomp_create(zram->compressor, zram->max_comp_streams);
> > > > > > > > >  	if (IS_ERR(comp)) {
> > > > > > > > > @@ -777,13 +786,6 @@ static ssize_t disksize_store(struct device *dev,
> > > > > > > > >  		goto out_free_meta;
> > > > > > > > >  	}
> > > > > > > > >  
> > > > > > > > > -	down_write(&zram->init_lock);
> > > > > > > > > -	if (init_done(zram)) {
> > > > > > > > > -		pr_info("Cannot change disksize for initialized device\n");
> > > > > > > > > -		err = -EBUSY;
> > > > > > > > > -		goto out_destroy_comp;
> > > > > > > > > -	}
> > > > > > > > > -
> > > > > > > > >  	zram->meta = meta;
> > > > > > > > >  	zram->comp = comp;
> > > > > > > > >  	zram->disksize = disksize;
> > > > > > > > > @@ -799,11 +801,10 @@ static ssize_t disksize_store(struct device *dev,
> > > > > > > > >  
> > > > > > > > >  	return len;
> > > > > > > > >  
> > > > > > > > > -out_destroy_comp:
> > > > > > > > > -	up_write(&zram->init_lock);
> > > > > > > > > -	zcomp_destroy(comp);
> > > > > > > > >  out_free_meta:
> > > > > > > > >  	zram_meta_free(meta);
> > > > > > > > > +out_unlock:
> > > > > > > > > +	up_write(&zram->init_lock);
> > > > > > > > >  	return err;
> > > > > > > > >  }
> > > > > > > > >  
> > > > > > > > 
> > > > > > > > The init_lock is really troublesome. We can't do call zram_meta_alloc
> > > > > > > > under init_lock due to lockdep report. Please keep in mind.
> > > > > > > >
> > > > > > > 
> > > > > > > ah... I do recall it, thanks for your reminder.
> > > > > > > 
> > > > > > > 
> > > > > > > > The zram_rw_page is one of the function under reclaim path and hold it
> > > > > > > > as read_lock while here holds it as write_lock.
> > > > > > > > It's a false positive so that we might could make shut lockdep up
> > > > > > > > by annotation but I don't want it but want to work with lockdep rather
> > > > > > > > > than disable. As well, there are other paths to use init_lock to
> > > > > > > > protect other data where would be victims of lockdep.
> > > > > > > > 
> > > > > > > > I didn't tell the motivation of this patch because it made you busy
> > > > > > > > guys wasted. Let me tell it now. It was another lockdep report by
> > > > > > > > kmem_cache_destroy for zsmalloc compaction about init_lock. That's why
> > > > > > > > the patchset was one of the patch in compaction.
> > > > > > > >
> > > > > > > > Yes, the ideal is to remove horrible init_lock of zram in this phase and
> > > > > > > > make code more simple and clear but I don't want to stuck zsmalloc
> > > > > > > > compaction by the work.
> > > > > > > 
> > > > > > > 
> > > > > > > > Having said that, I feel it's time to revisit
> > > > > > > > to remove init_lock.
> > > > > > > > At least, I will think over to find a solution to kill init_lock.
> > > > > > > 
> > > > > > > hm, can't think of anything quick...
> > > > > > > 
> > > > > > > 	-ss
> > > > > > 
> > > > > > Hello guys,
> > > > > > 
> > > > > > How about this?
> > > > > > 
> > > > > > It's based on Ganesh's patch.
> > > > > > https://lkml.org/lkml/2015/1/24/50
> > > > > (I see no similarities with Ganesh's patch)
> > > > > 
> > > > > hm, you probably meant this one https://lkml.org/lkml/2015/1/23/406
> > > > > 
> > > > > 
> > > > > at glance this makes things a bit more complicated, so I need to think more.
> > > > > 
> > > > > > From afda9fd2f6c40dd0745d8a6babe78c5cbdceddf5 Mon Sep 17 00:00:00 2001
> > > > > > From: Minchan Kim <minchan@kernel.org>
> > > > > > Date: Mon, 26 Jan 2015 14:34:10 +0900
> > > > > > Subject: [RFC] zram: remove init_lock in zram_make_request
> > > > > > 
> > > > > > Admin could reset zram during I/O operation going on so we have
> > > > > > used zram->init_lock as read-side lock in I/O path to prevent
> > > > > > sudden zram meta freeing.
> > > > > > 
> > > > > > However, the init_lock is really troublesome.
> > > > > > We can't do call zram_meta_alloc under init_lock due to lockdep splat
> > > > > > because zram_rw_page is one of the function under reclaim path and
> > > > > > hold it as read_lock while other places in process context hold it
> > > > > > as write_lock. So, we have used allocation out of the lock to avoid
> > > > > > > lockdep warn but it's not good for readability and finally, I met
> > > > > > > another lockdep splat between init_lock and cpu_hotplug from
> > > > > > > kmem_cache_destroy while working on zsmalloc compaction. :(
> > > > > > 
> > > > > > Yes, the ideal is to remove horrible init_lock of zram in rw path.
> > > > > > This patch removes it in rw path and instead, put init_done bool
> > > > > > variable to check initialization done with smp_[wmb|rmb] and
> > > > > > srcu_[un]read_lock to prevent sudden zram meta freeing
> > > > > > during I/O operation.
> > > > > > 
> > > > > > Signed-off-by: Minchan Kim <minchan@kernel.org>
> > > > > > ---
> > > > > >  drivers/block/zram/zram_drv.c | 76 +++++++++++++++++++++++++++++--------------
> > > > > >  drivers/block/zram/zram_drv.h |  5 +++
> > > > > >  2 files changed, 57 insertions(+), 24 deletions(-)
> > > > > > 
> > > > > > diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
> > > > > > index a598ada817f0..e06ff975f997 100644
> > > > > > --- a/drivers/block/zram/zram_drv.c
> > > > > > +++ b/drivers/block/zram/zram_drv.c
> > > > > > @@ -32,6 +32,7 @@
> > > > > >  #include <linux/string.h>
> > > > > >  #include <linux/vmalloc.h>
> > > > > >  #include <linux/err.h>
> > > > > > +#include <linux/srcu.h>
> > > > > >  
> > > > > >  #include "zram_drv.h"
> > > > > >  
> > > > > > @@ -53,9 +54,16 @@ static ssize_t name##_show(struct device *d,		\
> > > > > >  }									\
> > > > > >  static DEVICE_ATTR_RO(name);
> > > > > >  
> > > > > > -static inline int init_done(struct zram *zram)
> > > > > > +static inline bool init_done(struct zram *zram)
> > > > > >  {
> > > > > > -	return zram->meta != NULL;
> > > > > > +	/*
> > > > > > +	 * init_done can be used without holding zram->init_lock in
> > > > > > +	 * read/write handler(ie, zram_make_request) but we should make sure
> > > > > > +	 * that zram->init_done should set up after meta initialization is
> > > > > > +	 * done. Look at disksize_store.
> > > > > > +	 */
> > > > > > +	smp_rmb();
> > > > > > +	return zram->init_done;
> > > > > 
> > > > > ->init_done returns back :)
> > > > 
> > > > 
> > > > > can we rely on write ->meta; wmb; --- rmb; read ->meta?
> > > > 
> > > > Might be possible.
> > 
> > Now that I think about it, it's impossible with zram->meta because
> > we need to nullify it before call_srcu but pre-existing SRCU read-side
> > critical sections can access zram->meta.
> > Anyway, introducing a new variable should be not a party-pooper.
> > 
> > > > 
> > > > > 
> > > > > how much performance do we lose on barriers?
> > > > 
> > > > I think it's not too much than locking which does more than(ie,
> > > > barrier, fairness, spin on owner and so on) such simple barrier.
> > > > 
> > > > > 
> > > > > >  }
> > > > > >  
> > > > > >  static inline struct zram *dev_to_zram(struct device *dev)
> > > > > > @@ -326,6 +334,10 @@ static void zram_meta_free(struct zram_meta *meta)
> > > > > >  	kfree(meta);
> > > > > >  }
> > > > > >  
> > > > > > +static void rcu_zram_do_nothing(struct rcu_head *unused)
> > > > > > +{
> > > > > > +}
> > > > > > +
> > > > > >  static struct zram_meta *zram_meta_alloc(int device_id, u64 disksize)
> > > > > >  {
> > > > > >  	char pool_name[8];
> > > > > > @@ -726,11 +738,8 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity)
> > > > > >  		return;
> > > > > >  	}
> > > > > >  
> > > > > > -	zcomp_destroy(zram->comp);
> > > > > >  	zram->max_comp_streams = 1;
> > > > > >  
> > > > > > -	zram_meta_free(zram->meta);
> > > > > > -	zram->meta = NULL;
> > > > > >  	/* Reset stats */
> > > > > >  	memset(&zram->stats, 0, sizeof(zram->stats));
> > > > > >  
> > > > > > @@ -738,8 +747,12 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity)
> > > > > >  	if (reset_capacity)
> > > > > >  		set_capacity(zram->disk, 0);
> > > > > >  
> > > > > > +	zram->init_done = false;
> > > > > 
> > > > > missing wmb?
> > > > 
> > > > > I thought about it but when I read comment from call_srcu as follows
> > > > "each cpu is guaranteed to have executed a full memory barrier",
> > > > I decided we don't need it. Right? (ie, double check)
> > > > 
> > > 
> > > hm, need to think about it.
> > 
> > Another idea is to use kick_all_cpus_sync, not srcu.
> > With that, we don't need to add more instruction in rw path.
> > I will try it.
> > 
> 
> hm, that will kick all cpus out of idle.

It just calls smp_call_function, which is used in a lot of places
by arch code and drivers via on_each_cpu, and I don't think resetting
zram is a frequent activity.
Anyway, I'm okay either way. Just want to show the concept
and let's decide the way and go forward. :)

-- 
Kind regards,
Minchan Kim

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/2] zram: free meta out of init_lock
  2015-01-28  0:22                     ` Minchan Kim
@ 2015-01-28  2:07                       ` Sergey Senozhatsky
  2015-01-28  2:57                         ` Minchan Kim
  0 siblings, 1 reply; 32+ messages in thread
From: Sergey Senozhatsky @ 2015-01-28  2:07 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Sergey Senozhatsky, Sergey Senozhatsky, Jerome Marchand,
	Andrew Morton, linux-kernel, linux-mm, Nitin Gupta

On (01/28/15 09:22), Minchan Kim wrote:
> On Wed, Jan 28, 2015 at 09:15:27AM +0900, Minchan Kim wrote:
> > Hello Sergey,
> > 
> > On Tue, Jan 27, 2015 at 01:03:05PM +0900, Sergey Senozhatsky wrote:
> > > Hello,
> > > 
> > > On (01/27/15 12:18), Minchan Kim wrote:
> > > > Hello Sergey,
> > > > 
> > > > On Tue, Jan 27, 2015 at 11:17:04AM +0900, Sergey Senozhatsky wrote:
> > > > > On (01/27/15 01:00), Minchan Kim wrote:
> > > > > > On Mon, Jan 26, 2015 at 11:17:09PM +0900, Sergey Senozhatsky wrote:
> > > > > > > Hello,
> > > > > > > 
> > > > > > > On (01/26/15 10:33), Minchan Kim wrote:
> > > > > > > > Hello,
> > > > > > > > 
> > > > > > > > On Sat, Jan 24, 2015 at 12:47:07AM +0900, Sergey Senozhatsky wrote:
> > > > > > > > > On (01/23/15 15:48), Jerome Marchand wrote:
> > > > > > > > > > On 01/23/2015 03:24 PM, Sergey Senozhatsky wrote:
> > > > > > > > > > > On (01/23/15 14:58), Minchan Kim wrote:
> > > > > > > > > > >> We don't need to call zram_meta_free, zcomp_destroy and zs_free
> > > > > > > > > > >> under init_lock. What we need to prevent race with init_lock
> > > > > > > > > > >> in reset is setting NULL into zram->meta (ie, init_done).
> > > > > > > > > > >> This patch does it.
> > > > > > > > > > >>
> > > > > > > > > > >> Signed-off-by: Minchan Kim <minchan@kernel.org>
> > > > > > > > > > >> ---
> > > > > > > > > > >>  drivers/block/zram/zram_drv.c | 28 ++++++++++++++++------------
> > > > > > > > > > >>  1 file changed, 16 insertions(+), 12 deletions(-)
> > > > > > > > > > >>
> > > > > > > > > > >> diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
> > > > > > > > > > >> index 9250b3f54a8f..0299d82275e7 100644
> > > > > > > > > > >> --- a/drivers/block/zram/zram_drv.c
> > > > > > > > > > >> +++ b/drivers/block/zram/zram_drv.c
> > > > > > > > > > >> @@ -708,6 +708,7 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity)
> > > > > > > > > > >>  {
> > > > > > > > > > >>  	size_t index;
> > > > > > > > > > >>  	struct zram_meta *meta;
> > > > > > > > > > >> +	struct zcomp *comp;
> > > > > > > > > > >>  
> > > > > > > > > > >>  	down_write(&zram->init_lock);
> > > > > > > > > > >>  
> > > > > > > > > > >> @@ -719,20 +720,10 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity)
> > > > > > > > > > >>  	}
> > > > > > > > > > >>  
> > > > > > > > > > >>  	meta = zram->meta;
> > > > > > > > > > >> -	/* Free all pages that are still in this zram device */
> > > > > > > > > > >> -	for (index = 0; index < zram->disksize >> PAGE_SHIFT; index++) {
> > > > > > > > > > >> -		unsigned long handle = meta->table[index].handle;
> > > > > > > > > > >> -		if (!handle)
> > > > > > > > > > >> -			continue;
> > > > > > > > > > >> -
> > > > > > > > > > >> -		zs_free(meta->mem_pool, handle);
> > > > > > > > > > >> -	}
> > > > > > > > > > >> -
> > > > > > > > > > >> -	zcomp_destroy(zram->comp);
> > > > > > > > > > > 
> > > > > > > > > > > I'm not so sure about moving zcomp destruction. if we would have detached it
> > > > > > > > > > > from zram, then yes. otherwise, think of zram ->destoy vs ->init race.
> > > > > > > > > > > 
> > > > > > > > > > > suppose,
> > > > > > > > > > > CPU1 waits for down_write() init lock in disksize_store() with new comp already allocated;
> > > > > > > > > > > CPU0 detaches ->meta and releases write init lock;
> > > > > > > > > > > CPU1 grabs the lock and does zram->comp = comp;
> > > > > > > > > > > CPU0 reaches the point of zcomp_destroy(zram->comp);
> > > > > > > > > > 
> > > > > > > > > > I don't see your point: this patch does not call
> > > > > > > > > > zcomp_destroy(zram->comp) anymore, but zram_destroy(comp), where comp is
> > > > > > > > > > the old zram->comp.
> > > > > > > > > 
> > > > > > > > > 
> > > > > > > > > oh... yes. sorry! my bad.
> > > > > > > > > 
> > > > > > > > > 
> > > > > > > > > 
> > > > > > > > > anyway, on a second thought, do we even want to destoy meta out of init_lock?
> > > > > > > > > 
> > > > > > > > > I mean, it will let you init new device quicker. but... assume, you have
> > > > > > > > > 30G zram (or any other bad-enough number). on CPU0 you reset device -- iterate
> > > > > > > > > over 30G meta->table, etc. out of init_lock.
> > > > > > > > > on CPU1 you concurrently re-init device and request again 30G.
> > > > > > > > > 
> > > > > > > > > how bad that can be?
> > > > > > > > > 
> > > > > > > > > 
> > > > > > > > > 
> > > > > > > > > diskstore called on already initialised device is also not so perfect.
> > > > > > > > > we first will try to allocate ->meta (vmalloc pages for another 30G),
> > > > > > > > > then allocate comp, then down_write() init lock to find out that device
> > > > > > > > > is initialised and we need to release allocated memory.
> > > > > > > > > 
> > > > > > > > > 
> > > > > > > > > 
> > > > > > > > > may be we better keep ->meta destruction under init_lock and additionally
> > > > > > > > > move ->meta and ->comp allocation under init_lock in disksize_store()?
> > > > > > > > > 
> > > > > > > > > like the following one:
> > > > > > > > > 
> > > > > > > > > ---
> > > > > > > > > 
> > > > > > > > >  drivers/block/zram/zram_drv.c | 25 +++++++++++++------------
> > > > > > > > >  1 file changed, 13 insertions(+), 12 deletions(-)
> > > > > > > > > 
> > > > > > > > > diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
> > > > > > > > > index 9250b3f..827ab21 100644
> > > > > > > > > --- a/drivers/block/zram/zram_drv.c
> > > > > > > > > +++ b/drivers/block/zram/zram_drv.c
> > > > > > > > > @@ -765,9 +765,18 @@ static ssize_t disksize_store(struct device *dev,
> > > > > > > > >  		return -EINVAL;
> > > > > > > > >  
> > > > > > > > >  	disksize = PAGE_ALIGN(disksize);
> > > > > > > > > +	down_write(&zram->init_lock);
> > > > > > > > > +	if (init_done(zram)) {
> > > > > > > > > +		up_write(&zram->init_lock);
> > > > > > > > > +		pr_info("Cannot change disksize for initialized device\n");
> > > > > > > > > +		return -EBUSY;
> > > > > > > > > +	}
> > > > > > > > > +
> > > > > > > > >  	meta = zram_meta_alloc(zram->disk->first_minor, disksize);
> > > > > > > > > -	if (!meta)
> > > > > > > > > -		return -ENOMEM;
> > > > > > > > > +	if (!meta) {
> > > > > > > > > +		err = -ENOMEM;
> > > > > > > > > +		goto out_unlock;
> > > > > > > > > +	}
> > > > > > > > >  
> > > > > > > > >  	comp = zcomp_create(zram->compressor, zram->max_comp_streams);
> > > > > > > > >  	if (IS_ERR(comp)) {
> > > > > > > > > @@ -777,13 +786,6 @@ static ssize_t disksize_store(struct device *dev,
> > > > > > > > >  		goto out_free_meta;
> > > > > > > > >  	}
> > > > > > > > >  
> > > > > > > > > -	down_write(&zram->init_lock);
> > > > > > > > > -	if (init_done(zram)) {
> > > > > > > > > -		pr_info("Cannot change disksize for initialized device\n");
> > > > > > > > > -		err = -EBUSY;
> > > > > > > > > -		goto out_destroy_comp;
> > > > > > > > > -	}
> > > > > > > > > -
> > > > > > > > >  	zram->meta = meta;
> > > > > > > > >  	zram->comp = comp;
> > > > > > > > >  	zram->disksize = disksize;
> > > > > > > > > @@ -799,11 +801,10 @@ static ssize_t disksize_store(struct device *dev,
> > > > > > > > >  
> > > > > > > > >  	return len;
> > > > > > > > >  
> > > > > > > > > -out_destroy_comp:
> > > > > > > > > -	up_write(&zram->init_lock);
> > > > > > > > > -	zcomp_destroy(comp);
> > > > > > > > >  out_free_meta:
> > > > > > > > >  	zram_meta_free(meta);
> > > > > > > > > +out_unlock:
> > > > > > > > > +	up_write(&zram->init_lock);
> > > > > > > > >  	return err;
> > > > > > > > >  }
> > > > > > > > >  
> > > > > > > > 
> > > > > > > > The init_lock is really troublesome. We can't do call zram_meta_alloc
> > > > > > > > under init_lock due to lockdep report. Please keep in mind.
> > > > > > > >
> > > > > > > 
> > > > > > > ah... I do recall it, thanks for your reminder.
> > > > > > > 
> > > > > > > 
> > > > > > > > The zram_rw_page is one of the function under reclaim path and hold it
> > > > > > > > as read_lock while here holds it as write_lock.
> > > > > > > > It's a false positive so that we might could make shut lockdep up
> > > > > > > > by annotation but I don't want it but want to work with lockdep rather
> > > > > > > > than disable. As well, there are other pathes to use init_lock to
> > > > > > > > protect other data where would be victims of lockdep.
> > > > > > > > 
> > > > > > > > I didn't tell the motivation of this patch because it made you busy
> > > > > > > > guys wasted. Let me tell it now. It was another lockdep report by
> > > > > > > > kmem_cache_destroy for zsmalloc compaction about init_lock. That's why
> > > > > > > > the patchset was one of the patch in compaction.
> > > > > > > >
> > > > > > > > Yes, the ideal is to remove horrible init_lock of zram in this phase and
> > > > > > > > make code more simple and clear but I don't want to stuck zsmalloc
> > > > > > > > compaction by the work.
> > > > > > > 
> > > > > > > 
> > > > > > > > Having said that, I feel it's time to revisit
> > > > > > > > to remove init_lock.
> > > > > > > > At least, I will think over to find a solution to kill init_lock.
> > > > > > > 
> > > > > > > hm, can't think of anything quick...
> > > > > > > 
> > > > > > > 	-ss
> > > > > > 
> > > > > > Hello guys,
> > > > > > 
> > > > > > How about this?
> > > > > > 
> > > > > > It's based on Ganesh's patch.
> > > > > > https://lkml.org/lkml/2015/1/24/50
> > > > > (I see no similarities with Ganesh's patch)
> > > > > 
> > > > > hm, you probably meant this one https://lkml.org/lkml/2015/1/23/406
> > > > > 
> > > > > 
> > > > > at glance this makes things a bit more complicated, so I need to think more.
> > > > > 
> > > > > > From afda9fd2f6c40dd0745d8a6babe78c5cbdceddf5 Mon Sep 17 00:00:00 2001
> > > > > > From: Minchan Kim <minchan@kernel.org>
> > > > > > Date: Mon, 26 Jan 2015 14:34:10 +0900
> > > > > > Subject: [RFC] zram: remove init_lock in zram_make_request
> > > > > > 
> > > > > > Admin could reset zram during I/O operation going on so we have
> > > > > > used zram->init_lock as read-side lock in I/O path to prevent
> > > > > > sudden zram meta freeing.
> > > > > > 
> > > > > > However, the init_lock is really troublesome.
> > > > > > We can't do call zram_meta_alloc under init_lock due to lockdep splat
> > > > > > because zram_rw_page is one of the function under reclaim path and
> > > > > > hold it as read_lock while other places in process context hold it
> > > > > > as write_lock. So, we have used allocation out of the lock to avoid
> > > > > > lockdep warn but it's not good for readability and finally, I met
> > > > > > another lockdep splat between init_lock and cpu_hotplug from
> > > > > > kmem_cache_destroy while working on zsmalloc compaction. :(
> > > > > > 
> > > > > > Yes, the ideal is to remove horrible init_lock of zram in rw path.
> > > > > > This patch removes it in rw path and instead, put init_done bool
> > > > > > variable to check initialization done with smp_[wmb|rmb] and
> > > > > > srcu_[un]read_lock to prevent sudden zram meta freeing
> > > > > > during I/O operation.
> > > > > > 
> > > > > > Signed-off-by: Minchan Kim <minchan@kernel.org>
> > > > > > ---
> > > > > >  drivers/block/zram/zram_drv.c | 76 +++++++++++++++++++++++++++++--------------
> > > > > >  drivers/block/zram/zram_drv.h |  5 +++
> > > > > >  2 files changed, 57 insertions(+), 24 deletions(-)
> > > > > > 
> > > > > > diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
> > > > > > index a598ada817f0..e06ff975f997 100644
> > > > > > --- a/drivers/block/zram/zram_drv.c
> > > > > > +++ b/drivers/block/zram/zram_drv.c
> > > > > > @@ -32,6 +32,7 @@
> > > > > >  #include <linux/string.h>
> > > > > >  #include <linux/vmalloc.h>
> > > > > >  #include <linux/err.h>
> > > > > > +#include <linux/srcu.h>
> > > > > >  
> > > > > >  #include "zram_drv.h"
> > > > > >  
> > > > > > @@ -53,9 +54,16 @@ static ssize_t name##_show(struct device *d,		\
> > > > > >  }									\
> > > > > >  static DEVICE_ATTR_RO(name);
> > > > > >  
> > > > > > -static inline int init_done(struct zram *zram)
> > > > > > +static inline bool init_done(struct zram *zram)
> > > > > >  {
> > > > > > -	return zram->meta != NULL;
> > > > > > +	/*
> > > > > > +	 * init_done can be used without holding zram->init_lock in
> > > > > > +	 * read/write handler(ie, zram_make_request) but we should make sure
> > > > > > +	 * that zram->init_done should set up after meta initialization is
> > > > > > +	 * done. Look at disksize_store.
> > > > > > +	 */
> > > > > > +	smp_rmb();
> > > > > > +	return zram->init_done;
> > > > > 
> > > > > ->init_done returns back :)
> > > > 
> > > > 
> > > > > can we rely on write ->meta; wmb; --- rmb; read ->meta?
> > > > 
> > > > Might be possible.
> > 
> > Now that I think about it, it's impossible with zram->meta because
> > we need to nullify it before call_srcu but pre-existing SRCU read-side
> > critical sections can access zram->meta.
> > Anyway, introducing a new variable should not be a party-pooper.
> > 
> > > > 
> > > > > 
> > > > > how much performance do we lose on barriers?
> > > > 
> > > > I think it costs much less than locking, which does more (ie,
> > > > barrier, fairness, spin on owner and so on) than such a simple barrier.
> > > > 
> > > > > 
> > > > > >  }
> > > > > >  
> > > > > >  static inline struct zram *dev_to_zram(struct device *dev)
> > > > > > @@ -326,6 +334,10 @@ static void zram_meta_free(struct zram_meta *meta)
> > > > > >  	kfree(meta);
> > > > > >  }
> > > > > >  
> > > > > > +static void rcu_zram_do_nothing(struct rcu_head *unused)
> > > > > > +{
> > > > > > +}
> > > > > > +
> > > > > >  static struct zram_meta *zram_meta_alloc(int device_id, u64 disksize)
> > > > > >  {
> > > > > >  	char pool_name[8];
> > > > > > @@ -726,11 +738,8 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity)
> > > > > >  		return;
> > > > > >  	}
> > > > > >  
> > > > > > -	zcomp_destroy(zram->comp);
> > > > > >  	zram->max_comp_streams = 1;
> > > > > >  
> > > > > > -	zram_meta_free(zram->meta);
> > > > > > -	zram->meta = NULL;
> > > > > >  	/* Reset stats */
> > > > > >  	memset(&zram->stats, 0, sizeof(zram->stats));
> > > > > >  
> > > > > > @@ -738,8 +747,12 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity)
> > > > > >  	if (reset_capacity)
> > > > > >  		set_capacity(zram->disk, 0);
> > > > > >  
> > > > > > +	zram->init_done = false;
> > > > > 
> > > > > missing wmb?
> > > > 
> > > > I thought about it but when I read the comment from call_srcu as follows
> > > > "each cpu is guaranteed to have executed a full memory barrier",
> > > > I decided we don't need it. Right? (ie, double check)
> > > > 
> > > 
> > > hm, need to think about it.
> > 
> > Another idea is to use kick_all_cpus_sync, not srcu.
> > With that, we don't need to add more instruction in rw path.
> > I will try it.
> 
> From 560478040d2e08c61796e67d0c3ee519ae67ac0f Mon Sep 17 00:00:00 2001
> From: Minchan Kim <minchan@kernel.org>
> Date: Mon, 26 Jan 2015 14:34:10 +0900
> Subject: [PATCH] zram: remove init_lock in zram_make_request
> 
> Admin could reset zram during I/O operation going on so we have
> used zram->init_lock as read-side lock in I/O path to prevent
> sudden zram meta freeing.
> 
> However, the init_lock is really troublesome.
> We can't do call zram_meta_alloc under init_lock due to lockdep splat
> because zram_rw_page is one of the function under reclaim path and
> hold it as read_lock while other places in process context hold it
> as write_lock. So, we have used allocation out of the lock to avoid
> lockdep warn but it's not good for readability and finally, I met
> another lockdep splat between init_lock and cpu_hotplug from
> kmem_cache_destroy while working on zsmalloc compaction. :(
> 
> Yes, the ideal is to remove horrible init_lock of zram in rw path.
> This patch removes it in rw path and instead, use kick_all_cpus_sync
> and a bool init_done variable to check initialization done with
> smp_[wmb|rmb].
> 
> Once kick_all_cpus_sync returns, no CPU can access zram meta
> any more due to the init_done check in zram_make_request, so it's
> safe to free meta. So, finally, we avoid init_lock in reclaim
> context and are free from the deadlock.
> 
> Signed-off-by: Minchan Kim <minchan@kernel.org>

I like it better.

> ---
>  drivers/block/zram/zram_drv.c | 70 +++++++++++++++++++++++++------------------
>  drivers/block/zram/zram_drv.h |  2 ++
>  2 files changed, 43 insertions(+), 29 deletions(-)
> 
> diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
> index a598ada817f0..404602b1932e 100644
> --- a/drivers/block/zram/zram_drv.c
> +++ b/drivers/block/zram/zram_drv.c
> @@ -53,9 +53,16 @@ static ssize_t name##_show(struct device *d,		\
>  }									\
>  static DEVICE_ATTR_RO(name);
>  
> -static inline int init_done(struct zram *zram)
> +static inline bool init_done(struct zram *zram)
>  {
> -	return zram->meta != NULL;
> +	/*
> +	 * init_done can be used without holding zram->init_lock in
> +	 * read/write handler(ie, zram_make_request) but we should make sure
> +	 * that zram->init_done should set up after meta initialization is
> +	 * done. Look at disksize_store.
> +	 */
> +	smp_rmb();
> +	return zram->init_done;
>  }
>  

so now we can
	smp_rmb();
	return zram->meta != NULL;

right?

>  static inline struct zram *dev_to_zram(struct device *dev)
> @@ -726,11 +733,8 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity)
>  		return;
>  	}
>  
> -	zcomp_destroy(zram->comp);
>  	zram->max_comp_streams = 1;
>  
> -	zram_meta_free(zram->meta);
> -	zram->meta = NULL;
>  	/* Reset stats */
>  	memset(&zram->stats, 0, sizeof(zram->stats));
>  
> @@ -738,8 +742,16 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity)
>  	if (reset_capacity)
>  		set_capacity(zram->disk, 0);
>  
> +	zram->init_done = false;
> +	/* don't need smp_wmb because kick_all_cpus_sync does */
> +	kick_all_cpus_sync();

first, how about
	meta = zram->meta;
	zram->meta = NULL;
	kick_all_cpus_sync();

	zram_meta_free(meta);
	zcomp_destroy(zram->comp);
	..


second,
after kick_all_cpus_sync() new RW operations will see false init_done().
bdev->bd_holders protects against resetting a device which has a
read/write operation ongoing on another CPU.

I need to refresh my memory on how ->bd_holders is actually
incremented/decremented. can the following race condition take place?

	CPU0					CPU1
reset_store()
bdev->bd_holders == false
					zram_make_request
						-rm- down_read(&zram->init_lock);
					init_done(zram) == true
zram_reset_device()			valid_io_request()
					__zram_make_request
down_write(&zram->init_lock);		zram_bvec_rw
[..]
set_capacity(zram->disk, 0);
zram->init_done = false;
kick_all_cpus_sync();			zram_bvec_write or zram_bvec_read()
zram_meta_free(zram->meta);		
zcomp_destroy(zram->comp);		zcomp_compress() or zcomp_decompress()

> +	/*
> +	 * From now on, any read/write cannot access zram meta data
> +	 * by init_done in the handler.
> +	 */
> +	zram_meta_free(zram->meta);
> +	zcomp_destroy(zram->comp);
>  	up_write(&zram->init_lock);
> -
>  	/*
>  	 * Revalidate disk out of the init_lock to avoid lockdep splat.
>  	 * It's okay because disk's capacity is protected by init_lock
> @@ -762,10 +774,19 @@ static ssize_t disksize_store(struct device *dev,
>  	if (!disksize)
>  		return -EINVAL;
>  
> +	down_write(&zram->init_lock);
> +	if (init_done(zram)) {
> +		pr_info("Cannot change disksize for initialized device\n");
> +		up_write(&zram->init_lock);
> +		return -EBUSY;
> +	}
> +
>  	disksize = PAGE_ALIGN(disksize);
>  	meta = zram_meta_alloc(zram->disk->first_minor, disksize);
> -	if (!meta)
> +	if (!meta) {
> +		up_write(&zram->init_lock);
>  		return -ENOMEM;
> +	}
>  
>  	comp = zcomp_create(zram->compressor, zram->max_comp_streams);
>  	if (IS_ERR(comp)) {
> @@ -775,17 +796,17 @@ static ssize_t disksize_store(struct device *dev,
>  		goto out_free_meta;
>  	}
>  
> -	down_write(&zram->init_lock);
> -	if (init_done(zram)) {
> -		pr_info("Cannot change disksize for initialized device\n");
> -		err = -EBUSY;
> -		goto out_destroy_comp;
> -	}
> -
>  	zram->meta = meta;
>  	zram->comp = comp;
>  	zram->disksize = disksize;
>  	set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT);
> +	/*
> +	 * Store operation of struct zram fields should complete
> +	 * before init_done set up because zram_bvec_rw doesn't
> +	 * hold an zram->init_lock.
> +	 */
> +	smp_wmb();
> +	zram->init_done = true;

	zram->meta = meta;
	smp_wmb();

	?

>  	up_write(&zram->init_lock);
>  
>  	/*
> @@ -797,10 +818,8 @@ static ssize_t disksize_store(struct device *dev,
>  
>  	return len;
>  
> -out_destroy_comp:
> -	up_write(&zram->init_lock);
> -	zcomp_destroy(comp);
>  out_free_meta:
> +	up_write(&zram->init_lock);
>  	zram_meta_free(meta);
>  	return err;
>  }
> @@ -907,7 +926,6 @@ static void zram_make_request(struct request_queue *queue, struct bio *bio)
>  {
>  	struct zram *zram = queue->queuedata;
>  
> -	down_read(&zram->init_lock);
>  	if (unlikely(!init_done(zram)))
>  		goto error;
>  
> @@ -918,12 +936,10 @@ static void zram_make_request(struct request_queue *queue, struct bio *bio)
>  	}
>  
>  	__zram_make_request(zram, bio);
> -	up_read(&zram->init_lock);
>  
>  	return;
>  
>  error:
> -	up_read(&zram->init_lock);
>  	bio_io_error(bio);
>  }
>  
> @@ -951,17 +967,16 @@ static int zram_rw_page(struct block_device *bdev, sector_t sector,
>  	struct bio_vec bv;
>  
>  	zram = bdev->bd_disk->private_data;
> +
> +	/* This should be another patch */

why? do you want to have this comment in the code?

	-ss

> +	if (unlikely(!init_done(zram)))
> +		return -EIO;
> +
>  	if (!valid_io_request(zram, sector, PAGE_SIZE)) {
>  		atomic64_inc(&zram->stats.invalid_io);
>  		return -EINVAL;
>  	}
>  
> -	down_read(&zram->init_lock);
> -	if (unlikely(!init_done(zram))) {
> -		err = -EIO;
> -		goto out_unlock;
> -	}
> -
>  	index = sector >> SECTORS_PER_PAGE_SHIFT;
>  	offset = sector & (SECTORS_PER_PAGE - 1) << SECTOR_SHIFT;
>  
> @@ -970,8 +985,6 @@ static int zram_rw_page(struct block_device *bdev, sector_t sector,
>  	bv.bv_offset = 0;
>  
>  	err = zram_bvec_rw(zram, &bv, index, offset, rw);
> -out_unlock:
> -	up_read(&zram->init_lock);
>  	/*
>  	 * If I/O fails, just return error(ie, non-zero) without
>  	 * calling page_endio.
> @@ -1125,7 +1138,6 @@ static void destroy_device(struct zram *zram)
>  
>  	del_gendisk(zram->disk);
>  	put_disk(zram->disk);
> -
>  	blk_cleanup_queue(zram->queue);
>  }
>  
> diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
> index e492f6bf11f1..dca265654285 100644
> --- a/drivers/block/zram/zram_drv.h
> +++ b/drivers/block/zram/zram_drv.h
> @@ -107,6 +107,8 @@ struct zram {
>  
>  	/* Prevent concurrent execution of device init, reset and R/W request */
>  	struct rw_semaphore init_lock;
> +	bool init_done;
> +
>  	/*
>  	 * This is the limit on amount of *uncompressed* worth of data
>  	 * we can store in a disk.
> -- 
> 1.9.1
> 
> 
> -- 
> Kind regards,
> Minchan Kim
> 

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/2] zram: free meta out of init_lock
  2015-01-28  2:07                       ` Sergey Senozhatsky
@ 2015-01-28  2:57                         ` Minchan Kim
  2015-01-28  3:53                           ` Sergey Senozhatsky
  0 siblings, 1 reply; 32+ messages in thread
From: Minchan Kim @ 2015-01-28  2:57 UTC (permalink / raw)
  To: Sergey Senozhatsky
  Cc: Sergey Senozhatsky, Jerome Marchand, Andrew Morton, linux-kernel,
	linux-mm, Nitin Gupta

On Wed, Jan 28, 2015 at 11:07:59AM +0900, Sergey Senozhatsky wrote:
> On (01/28/15 09:22), Minchan Kim wrote:
> > On Wed, Jan 28, 2015 at 09:15:27AM +0900, Minchan Kim wrote:
> > > Hello Sergey,
> > > 
> > > On Tue, Jan 27, 2015 at 01:03:05PM +0900, Sergey Senozhatsky wrote:
> > > > Hello,
> > > > 
> > > > On (01/27/15 12:18), Minchan Kim wrote:
> > > > > Hello Sergey,
> > > > > 
> > > > > On Tue, Jan 27, 2015 at 11:17:04AM +0900, Sergey Senozhatsky wrote:
> > > > > > On (01/27/15 01:00), Minchan Kim wrote:
> > > > > > > On Mon, Jan 26, 2015 at 11:17:09PM +0900, Sergey Senozhatsky wrote:
> > > > > > > > Hello,
> > > > > > > > 
> > > > > > > > On (01/26/15 10:33), Minchan Kim wrote:
> > > > > > > > > Hello,
> > > > > > > > > 
> > > > > > > > > On Sat, Jan 24, 2015 at 12:47:07AM +0900, Sergey Senozhatsky wrote:
> > > > > > > > > > On (01/23/15 15:48), Jerome Marchand wrote:
> > > > > > > > > > > On 01/23/2015 03:24 PM, Sergey Senozhatsky wrote:
> > > > > > > > > > > > On (01/23/15 14:58), Minchan Kim wrote:
> > > > > > > > > > > >> We don't need to call zram_meta_free, zcomp_destroy and zs_free
> > > > > > > > > > > >> under init_lock. What we need to prevent race with init_lock
> > > > > > > > > > > >> in reset is setting NULL into zram->meta (ie, init_done).
> > > > > > > > > > > >> This patch does it.
> > > > > > > > > > > >>
> > > > > > > > > > > >> Signed-off-by: Minchan Kim <minchan@kernel.org>
> > > > > > > > > > > >> ---
> > > > > > > > > > > >>  drivers/block/zram/zram_drv.c | 28 ++++++++++++++++------------
> > > > > > > > > > > >>  1 file changed, 16 insertions(+), 12 deletions(-)
> > > > > > > > > > > >>
> > > > > > > > > > > >> diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
> > > > > > > > > > > >> index 9250b3f54a8f..0299d82275e7 100644
> > > > > > > > > > > >> --- a/drivers/block/zram/zram_drv.c
> > > > > > > > > > > >> +++ b/drivers/block/zram/zram_drv.c
> > > > > > > > > > > >> @@ -708,6 +708,7 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity)
> > > > > > > > > > > >>  {
> > > > > > > > > > > >>  	size_t index;
> > > > > > > > > > > >>  	struct zram_meta *meta;
> > > > > > > > > > > >> +	struct zcomp *comp;
> > > > > > > > > > > >>  
> > > > > > > > > > > >>  	down_write(&zram->init_lock);
> > > > > > > > > > > >>  
> > > > > > > > > > > >> @@ -719,20 +720,10 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity)
> > > > > > > > > > > >>  	}
> > > > > > > > > > > >>  
> > > > > > > > > > > >>  	meta = zram->meta;
> > > > > > > > > > > >> -	/* Free all pages that are still in this zram device */
> > > > > > > > > > > >> -	for (index = 0; index < zram->disksize >> PAGE_SHIFT; index++) {
> > > > > > > > > > > >> -		unsigned long handle = meta->table[index].handle;
> > > > > > > > > > > >> -		if (!handle)
> > > > > > > > > > > >> -			continue;
> > > > > > > > > > > >> -
> > > > > > > > > > > >> -		zs_free(meta->mem_pool, handle);
> > > > > > > > > > > >> -	}
> > > > > > > > > > > >> -
> > > > > > > > > > > >> -	zcomp_destroy(zram->comp);
> > > > > > > > > > > > 
> > > > > > > > > > > > I'm not so sure about moving zcomp destruction. if we would have detached it
> > > > > > > > > > > > from zram, then yes. otherwise, think of zram ->destoy vs ->init race.
> > > > > > > > > > > > 
> > > > > > > > > > > > suppose,
> > > > > > > > > > > > CPU1 waits for down_write() init lock in disksize_store() with new comp already allocated;
> > > > > > > > > > > > CPU0 detaches ->meta and releases write init lock;
> > > > > > > > > > > > CPU1 grabs the lock and does zram->comp = comp;
> > > > > > > > > > > > CPU0 reaches the point of zcomp_destroy(zram->comp);
> > > > > > > > > > > 
> > > > > > > > > > > I don't see your point: this patch does not call
> > > > > > > > > > > zcomp_destroy(zram->comp) anymore, but zram_destroy(comp), where comp is
> > > > > > > > > > > the old zram->comp.
> > > > > > > > > > 
> > > > > > > > > > 
> > > > > > > > > > oh... yes. sorry! my bad.
> > > > > > > > > > 
> > > > > > > > > > 
> > > > > > > > > > 
> > > > > > > > > > anyway, on a second thought, do we even want to destoy meta out of init_lock?
> > > > > > > > > > 
> > > > > > > > > > I mean, it will let you init new device quicker. but... assume, you have
> > > > > > > > > > 30G zram (or any other bad-enough number). on CPU0 you reset device -- iterate
> > > > > > > > > > over 30G meta->table, etc. out of init_lock.
> > > > > > > > > > on CPU1 you concurrently re-init device and request again 30G.
> > > > > > > > > > 
> > > > > > > > > > how bad that can be?
> > > > > > > > > > 
> > > > > > > > > > 
> > > > > > > > > > 
> > > > > > > > > > diskstore called on already initialised device is also not so perfect.
> > > > > > > > > > we first will try to allocate ->meta (vmalloc pages for another 30G),
> > > > > > > > > > then allocate comp, then down_write() init lock to find out that device
> > > > > > > > > > is initialised and we need to release allocated memory.
> > > > > > > > > > 
> > > > > > > > > > 
> > > > > > > > > > 
> > > > > > > > > > may be we better keep ->meta destruction under init_lock and additionally
> > > > > > > > > > move ->meta and ->comp allocation under init_lock in disksize_store()?
> > > > > > > > > > 
> > > > > > > > > > like the following one:
> > > > > > > > > > 
> > > > > > > > > > ---
> > > > > > > > > > 
> > > > > > > > > >  drivers/block/zram/zram_drv.c | 25 +++++++++++++------------
> > > > > > > > > >  1 file changed, 13 insertions(+), 12 deletions(-)
> > > > > > > > > > 
> > > > > > > > > > diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
> > > > > > > > > > index 9250b3f..827ab21 100644
> > > > > > > > > > --- a/drivers/block/zram/zram_drv.c
> > > > > > > > > > +++ b/drivers/block/zram/zram_drv.c
> > > > > > > > > > @@ -765,9 +765,18 @@ static ssize_t disksize_store(struct device *dev,
> > > > > > > > > >  		return -EINVAL;
> > > > > > > > > >  
> > > > > > > > > >  	disksize = PAGE_ALIGN(disksize);
> > > > > > > > > > +	down_write(&zram->init_lock);
> > > > > > > > > > +	if (init_done(zram)) {
> > > > > > > > > > +		up_write(&zram->init_lock);
> > > > > > > > > > +		pr_info("Cannot change disksize for initialized device\n");
> > > > > > > > > > +		return -EBUSY;
> > > > > > > > > > +	}
> > > > > > > > > > +
> > > > > > > > > >  	meta = zram_meta_alloc(zram->disk->first_minor, disksize);
> > > > > > > > > > -	if (!meta)
> > > > > > > > > > -		return -ENOMEM;
> > > > > > > > > > +	if (!meta) {
> > > > > > > > > > +		err = -ENOMEM;
> > > > > > > > > > +		goto out_unlock;
> > > > > > > > > > +	}
> > > > > > > > > >  
> > > > > > > > > >  	comp = zcomp_create(zram->compressor, zram->max_comp_streams);
> > > > > > > > > >  	if (IS_ERR(comp)) {
> > > > > > > > > > @@ -777,13 +786,6 @@ static ssize_t disksize_store(struct device *dev,
> > > > > > > > > >  		goto out_free_meta;
> > > > > > > > > >  	}
> > > > > > > > > >  
> > > > > > > > > > -	down_write(&zram->init_lock);
> > > > > > > > > > -	if (init_done(zram)) {
> > > > > > > > > > -		pr_info("Cannot change disksize for initialized device\n");
> > > > > > > > > > -		err = -EBUSY;
> > > > > > > > > > -		goto out_destroy_comp;
> > > > > > > > > > -	}
> > > > > > > > > > -
> > > > > > > > > >  	zram->meta = meta;
> > > > > > > > > >  	zram->comp = comp;
> > > > > > > > > >  	zram->disksize = disksize;
> > > > > > > > > > @@ -799,11 +801,10 @@ static ssize_t disksize_store(struct device *dev,
> > > > > > > > > >  
> > > > > > > > > >  	return len;
> > > > > > > > > >  
> > > > > > > > > > -out_destroy_comp:
> > > > > > > > > > -	up_write(&zram->init_lock);
> > > > > > > > > > -	zcomp_destroy(comp);
> > > > > > > > > >  out_free_meta:
> > > > > > > > > >  	zram_meta_free(meta);
> > > > > > > > > > +out_unlock:
> > > > > > > > > > +	up_write(&zram->init_lock);
> > > > > > > > > >  	return err;
> > > > > > > > > >  }
> > > > > > > > > >  
> > > > > > > > > 
> > > > > > > > > The init_lock is really troublesome. We can't do call zram_meta_alloc
> > > > > > > > > under init_lock due to lockdep report. Please keep in mind.
> > > > > > > > >
> > > > > > > > 
> > > > > > > > ah... I do recall it, thanks for your reminder.
> > > > > > > > 
> > > > > > > > 
> > > > > > > > > The zram_rw_page is one of the function under reclaim path and hold it
> > > > > > > > > as read_lock while here holds it as write_lock.
> > > > > > > > > It's a false positive so that we might could make shut lockdep up
> > > > > > > > > by annotation but I don't want it but want to work with lockdep rather
> > > > > > > > > than disable. As well, there are other pathes to use init_lock to
> > > > > > > > > protect other data where would be victims of lockdep.
> > > > > > > > > 
> > > > > > > > > I didn't tell the motivation of this patch because it made you busy
> > > > > > > > > guys wasted. Let me tell it now. It was another lockdep report by
> > > > > > > > > kmem_cache_destroy for zsmalloc compaction about init_lock. That's why
> > > > > > > > > the patchset was one of the patch in compaction.
> > > > > > > > >
> > > > > > > > > Yes, the ideal is to remove horrible init_lock of zram in this phase and
> > > > > > > > > make code more simple and clear but I don't want to stuck zsmalloc
> > > > > > > > > compaction by the work.
> > > > > > > > 
> > > > > > > > 
> > > > > > > > > Having said that, I feel it's time to revisit
> > > > > > > > > to remove init_lock.
> > > > > > > > > At least, I will think over to find a solution to kill init_lock.
> > > > > > > > 
> > > > > > > > hm, can't think of anything quick...
> > > > > > > > 
> > > > > > > > 	-ss
> > > > > > > 
> > > > > > > Hello guys,
> > > > > > > 
> > > > > > > How about this?
> > > > > > > 
> > > > > > > It's based on Ganesh's patch.
> > > > > > > https://lkml.org/lkml/2015/1/24/50
> > > > > > (I see no similarities with Ganesh's patch)
> > > > > > 
> > > > > > hm, you probably meant this one https://lkml.org/lkml/2015/1/23/406
> > > > > > 
> > > > > > 
> > > > > > at glance this makes things a bit more complicated, so I need to think more.
> > > > > > 
> > > > > > > From afda9fd2f6c40dd0745d8a6babe78c5cbdceddf5 Mon Sep 17 00:00:00 2001
> > > > > > > From: Minchan Kim <minchan@kernel.org>
> > > > > > > Date: Mon, 26 Jan 2015 14:34:10 +0900
> > > > > > > Subject: [RFC] zram: remove init_lock in zram_make_request
> > > > > > > 
> > > > > > > Admin could reset zram during I/O operation going on so we have
> > > > > > > used zram->init_lock as read-side lock in I/O path to prevent
> > > > > > > sudden zram meta freeing.
> > > > > > > 
> > > > > > > However, the init_lock is really troublesome.
> > > > > > > We can't do call zram_meta_alloc under init_lock due to lockdep splat
> > > > > > > because zram_rw_page is one of the function under reclaim path and
> > > > > > > hold it as read_lock while other places in process context hold it
> > > > > > > as write_lock. So, we have used allocation out of the lock to avoid
> > > > > > > lockdep warn but it's not good for readability and fainally, I met
> > > > > > > another lockdep splat between init_lock and cpu_hotpulug from
> > > > > > > kmem_cache_destroy during wokring zsmalloc compaction. :(
> > > > > > > 
> > > > > > > Yes, the ideal is to remove horrible init_lock of zram in rw path.
> > > > > > > This patch removes it in rw path and instead, put init_done bool
> > > > > > > variable to check initialization done with smp_[wmb|rmb] and
> > > > > > > srcu_[un]read_lock to prevent sudden zram meta freeing
> > > > > > > during I/O operation.
> > > > > > > 
> > > > > > > Signed-off-by: Minchan Kim <minchan@kernel.org>
> > > > > > > ---
> > > > > > >  drivers/block/zram/zram_drv.c | 76 +++++++++++++++++++++++++++++--------------
> > > > > > >  drivers/block/zram/zram_drv.h |  5 +++
> > > > > > >  2 files changed, 57 insertions(+), 24 deletions(-)
> > > > > > > 
> > > > > > > diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
> > > > > > > index a598ada817f0..e06ff975f997 100644
> > > > > > > --- a/drivers/block/zram/zram_drv.c
> > > > > > > +++ b/drivers/block/zram/zram_drv.c
> > > > > > > @@ -32,6 +32,7 @@
> > > > > > >  #include <linux/string.h>
> > > > > > >  #include <linux/vmalloc.h>
> > > > > > >  #include <linux/err.h>
> > > > > > > +#include <linux/srcu.h>
> > > > > > >  
> > > > > > >  #include "zram_drv.h"
> > > > > > >  
> > > > > > > @@ -53,9 +54,16 @@ static ssize_t name##_show(struct device *d,		\
> > > > > > >  }									\
> > > > > > >  static DEVICE_ATTR_RO(name);
> > > > > > >  
> > > > > > > -static inline int init_done(struct zram *zram)
> > > > > > > +static inline bool init_done(struct zram *zram)
> > > > > > >  {
> > > > > > > -	return zram->meta != NULL;
> > > > > > > +	/*
> > > > > > > +	 * init_done can be used without holding zram->init_lock in
> > > > > > > +	 * read/write handler(ie, zram_make_request) but we should make sure
> > > > > > > +	 * that zram->init_done should set up after meta initialization is
> > > > > > > +	 * done. Look at disksize_store.
> > > > > > > +	 */
> > > > > > > +	smp_rmb();
> > > > > > > +	return zram->init_done;
> > > > > > 
> > > > > > ->init_done returns back :)
> > > > > 
> > > > > 
> > > > > > can we rely on write ->meta; wmb; --- rmb; read ->meta?
> > > > > 
> > > > > Might be possible.
> > > 
> > > Now that I think about it, it's impossible with zram->meta because
> > > we need to nullify it before call_srcu but pre-existing SRCU read-side
> > > critical sections can access zram->meta.
> > > Anyway, introducing a new variable should be not a party-pooper.
> > > 
> > > > > 
> > > > > > 
> > > > > > how much performance do we lose on barriers?
> > > > > 
> > > > > I think it's not too much than locking which does more than(ie,
> > > > > barrier, fairness, spin on owner and so on) such simple barrier.
> > > > > 
> > > > > > 
> > > > > > >  }
> > > > > > >  
> > > > > > >  static inline struct zram *dev_to_zram(struct device *dev)
> > > > > > > @@ -326,6 +334,10 @@ static void zram_meta_free(struct zram_meta *meta)
> > > > > > >  	kfree(meta);
> > > > > > >  }
> > > > > > >  
> > > > > > > +static void rcu_zram_do_nothing(struct rcu_head *unused)
> > > > > > > +{
> > > > > > > +}
> > > > > > > +
> > > > > > >  static struct zram_meta *zram_meta_alloc(int device_id, u64 disksize)
> > > > > > >  {
> > > > > > >  	char pool_name[8];
> > > > > > > @@ -726,11 +738,8 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity)
> > > > > > >  		return;
> > > > > > >  	}
> > > > > > >  
> > > > > > > -	zcomp_destroy(zram->comp);
> > > > > > >  	zram->max_comp_streams = 1;
> > > > > > >  
> > > > > > > -	zram_meta_free(zram->meta);
> > > > > > > -	zram->meta = NULL;
> > > > > > >  	/* Reset stats */
> > > > > > >  	memset(&zram->stats, 0, sizeof(zram->stats));
> > > > > > >  
> > > > > > > @@ -738,8 +747,12 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity)
> > > > > > >  	if (reset_capacity)
> > > > > > >  		set_capacity(zram->disk, 0);
> > > > > > >  
> > > > > > > +	zram->init_done = false;
> > > > > > 
> > > > > > missing wmb?
> > > > > 
> > > > > I thouht about it but when I read comment from call_srcu as follows
> > > > > "each cpu is guaranteed to have executed a full memory barrier",
> > > > > I decided we don't need it. Right? (ie, double check)
> > > > > 
> > > > 
> > > > hm, need to think about it.
> > > 
> > > Another idea is to use kick_all_cpus_sync, not srcu.
> > > With that, we don't need to add more instruction in rw path.
> > > I will try it.
> > 
> > From 560478040d2e08c61796e67d0c3ee519ae67ac0f Mon Sep 17 00:00:00 2001
> > From: Minchan Kim <minchan@kernel.org>
> > Date: Mon, 26 Jan 2015 14:34:10 +0900
> > Subject: [PATCH] zram: remove init_lock in zram_make_request
> > 
> > Admin could reset zram during I/O operation going on so we have
> > used zram->init_lock as read-side lock in I/O path to prevent
> > sudden zram meta freeing.
> > 
> > However, the init_lock is really troublesome.
> > We can't do call zram_meta_alloc under init_lock due to lockdep splat
> > because zram_rw_page is one of the function under reclaim path and
> > hold it as read_lock while other places in process context hold it
> > as write_lock. So, we have used allocation out of the lock to avoid
> > lockdep warn but it's not good for readability and finally, I met
> > another lockdep splat between init_lock and cpu_hotpulug from
> > kmem_cache_destroy during wokring zsmalloc compaction. :(
> > 
> > Yes, the ideal is to remove horrible init_lock of zram in rw path.
> > This patch removes it in rw path and instead, use kick_all_cpus_sync
> > and a bool init_done variable to check initialization done with
> > smp_[wmb|rmb].
> > 
> > Upon kick_all_cpus_sync returns, any CPU cannot access zram meta
> > any more due to init_done in zram_make_request so it's safe to
> > free meta. So, finally, we avoids init_lock in reclaim context
> > so we are free for deadlock.
> > 
> > Signed-off-by: Minchan Kim <minchan@kernel.org>
> 
> I like it better.
> 
> > ---
> >  drivers/block/zram/zram_drv.c | 70 +++++++++++++++++++++++++------------------
> >  drivers/block/zram/zram_drv.h |  2 ++
> >  2 files changed, 43 insertions(+), 29 deletions(-)
> > 
> > diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
> > index a598ada817f0..404602b1932e 100644
> > --- a/drivers/block/zram/zram_drv.c
> > +++ b/drivers/block/zram/zram_drv.c
> > @@ -53,9 +53,16 @@ static ssize_t name##_show(struct device *d,		\
> >  }									\
> >  static DEVICE_ATTR_RO(name);
> >  
> > -static inline int init_done(struct zram *zram)
> > +static inline bool init_done(struct zram *zram)
> >  {
> > -	return zram->meta != NULL;
> > +	/*
> > +	 * init_done can be used without holding zram->init_lock in
> > +	 * read/write handler(ie, zram_make_request) but we should make sure
> > +	 * that zram->init_done should set up after meta initialization is
> > +	 * done. Look at disksize_store.
> > +	 */
> > +	smp_rmb();
> > +	return zram->init_done;
> >  }
> >  
> 
> so now we can
> 	smp_rmb();
> 	return zram->meta != NULL;
> 
> right?
> 
> >  static inline struct zram *dev_to_zram(struct device *dev)
> > @@ -726,11 +733,8 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity)
> >  		return;
> >  	}
> >  
> > -	zcomp_destroy(zram->comp);
> >  	zram->max_comp_streams = 1;
> >  
> > -	zram_meta_free(zram->meta);
> > -	zram->meta = NULL;
> >  	/* Reset stats */
> >  	memset(&zram->stats, 0, sizeof(zram->stats));
> >  
> > @@ -738,8 +742,16 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity)
> >  	if (reset_capacity)
> >  		set_capacity(zram->disk, 0);
> >  
> > +	zram->init_done = false;
> > +	/* don't need smp_wmb because kick_all_cpus_sync does */
> > +	kick_all_cpus_sync();
> 
> first, how about
> 	meta = zram->meta;
> 	zram->meta = NULL;
> 	kick_all_cpus_sync();
> 
> 	zram_meta_free(meta);
> 	zcomp_destroy(zram->comp);
> 	..
> 
> 
> second,
> after kick_all_cpus_sync() new RW operations will see false init_done().
> bdev->bd_holders protects from resetting device which has read/write
> operation ongoing on the onther CPU.
> 
> I need to refresh on how ->bd_holders actually incremented/decremented.
> can the following race condition take a place?
> 
> 	CPU0					CPU1
> reset_store()
> bdev->bd_holders == false
> 					zram_make_request
> 						-rm- down_read(&zram->init_lock);
> 					init_done(zram) == true
> zram_reset_device()			valid_io_request()
> 					__zram_make_request
> down_write(&zram->init_lock);		zram_bvec_rw
> [..]
> set_capacity(zram->disk, 0);
> zram->init_done = false;
> kick_all_cpus_sync();			zram_bvec_write or zram_bvec_read()
> zram_meta_free(zram->meta);		
> zcomp_destroy(zram->comp);		zcomp_compress() or zcomp_decompress()

You're absolutely right. I forgot that the rw path can block, so
kick_all_cpus_sync doesn't work for our case, unfortunately.
So, I want to go with srcu. Do you agree, or do you have another suggestion?

> 
> > +	/*
> > +	 * From now on, any read/write cannot access zram meta data
> > +	 * by init_done in the handler.
> > +	 */
> > +	zram_meta_free(zram->meta);
> > +	zcomp_destroy(zram->comp);
> >  	up_write(&zram->init_lock);
> > -
> >  	/*
> >  	 * Revalidate disk out of the init_lock to avoid lockdep splat.
> >  	 * It's okay because disk's capacity is protected by init_lock
> > @@ -762,10 +774,19 @@ static ssize_t disksize_store(struct device *dev,
> >  	if (!disksize)
> >  		return -EINVAL;
> >  
> > +	down_write(&zram->init_lock);
> > +	if (init_done(zram)) {
> > +		pr_info("Cannot change disksize for initialized device\n");
> > +		up_write(&zram->init_lock);
> > +		return -EBUSY;
> > +	}
> > +
> >  	disksize = PAGE_ALIGN(disksize);
> >  	meta = zram_meta_alloc(zram->disk->first_minor, disksize);
> > -	if (!meta)
> > +	if (!meta) {
> > +		up_write(&zram->init_lock);
> >  		return -ENOMEM;
> > +	}
> >  
> >  	comp = zcomp_create(zram->compressor, zram->max_comp_streams);
> >  	if (IS_ERR(comp)) {
> > @@ -775,17 +796,17 @@ static ssize_t disksize_store(struct device *dev,
> >  		goto out_free_meta;
> >  	}
> >  
> > -	down_write(&zram->init_lock);
> > -	if (init_done(zram)) {
> > -		pr_info("Cannot change disksize for initialized device\n");
> > -		err = -EBUSY;
> > -		goto out_destroy_comp;
> > -	}
> > -
> >  	zram->meta = meta;
> >  	zram->comp = comp;
> >  	zram->disksize = disksize;
> >  	set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT);
> > +	/*
> > +	 * Store operation of struct zram fields should complete
> > +	 * before init_done set up because zram_bvec_rw doesn't
> > +	 * hold an zram->init_lock.
> > +	 */
> > +	smp_wmb();
> > +	zram->init_done = true;
> 
> 	zram->meta = meta;
> 	smp_wmb();
> 
> 	?
> 
> >  	up_write(&zram->init_lock);
> >  
> >  	/*
> > @@ -797,10 +818,8 @@ static ssize_t disksize_store(struct device *dev,
> >  
> >  	return len;
> >  
> > -out_destroy_comp:
> > -	up_write(&zram->init_lock);
> > -	zcomp_destroy(comp);
> >  out_free_meta:
> > +	up_write(&zram->init_lock);
> >  	zram_meta_free(meta);
> >  	return err;
> >  }
> > @@ -907,7 +926,6 @@ static void zram_make_request(struct request_queue *queue, struct bio *bio)
> >  {
> >  	struct zram *zram = queue->queuedata;
> >  
> > -	down_read(&zram->init_lock);
> >  	if (unlikely(!init_done(zram)))
> >  		goto error;
> >  
> > @@ -918,12 +936,10 @@ static void zram_make_request(struct request_queue *queue, struct bio *bio)
> >  	}
> >  
> >  	__zram_make_request(zram, bio);
> > -	up_read(&zram->init_lock);
> >  
> >  	return;
> >  
> >  error:
> > -	up_read(&zram->init_lock);
> >  	bio_io_error(bio);
> >  }
> >  
> > @@ -951,17 +967,16 @@ static int zram_rw_page(struct block_device *bdev, sector_t sector,
> >  	struct bio_vec bv;
> >  
> >  	zram = bdev->bd_disk->private_data;
> > +
> > +	/* This should be another patch */
> 
> why? do you want to have this comment in the code?

It's a change I should split out into a separate patch, but I didn't in this RFC.
I just wanted to note that.

> 
> 	-ss
> 
> > +	if (unlikely(!init_done(zram)))
> > +		return -EIO;
> > +
> >  	if (!valid_io_request(zram, sector, PAGE_SIZE)) {
> >  		atomic64_inc(&zram->stats.invalid_io);
> >  		return -EINVAL;
> >  	}
> >  
> > -	down_read(&zram->init_lock);
> > -	if (unlikely(!init_done(zram))) {
> > -		err = -EIO;
> > -		goto out_unlock;
> > -	}
> > -
> >  	index = sector >> SECTORS_PER_PAGE_SHIFT;
> >  	offset = sector & (SECTORS_PER_PAGE - 1) << SECTOR_SHIFT;
> >  
> > @@ -970,8 +985,6 @@ static int zram_rw_page(struct block_device *bdev, sector_t sector,
> >  	bv.bv_offset = 0;
> >  
> >  	err = zram_bvec_rw(zram, &bv, index, offset, rw);
> > -out_unlock:
> > -	up_read(&zram->init_lock);
> >  	/*
> >  	 * If I/O fails, just return error(ie, non-zero) without
> >  	 * calling page_endio.
> > @@ -1125,7 +1138,6 @@ static void destroy_device(struct zram *zram)
> >  
> >  	del_gendisk(zram->disk);
> >  	put_disk(zram->disk);
> > -
> >  	blk_cleanup_queue(zram->queue);
> >  }
> >  
> > diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
> > index e492f6bf11f1..dca265654285 100644
> > --- a/drivers/block/zram/zram_drv.h
> > +++ b/drivers/block/zram/zram_drv.h
> > @@ -107,6 +107,8 @@ struct zram {
> >  
> >  	/* Prevent concurrent execution of device init, reset and R/W request */
> >  	struct rw_semaphore init_lock;
> > +	bool init_done;
> > +
> >  	/*
> >  	 * This is the limit on amount of *uncompressed* worth of data
> >  	 * we can store in a disk.
> > -- 
> > 1.9.1
> > 
> > 
> > -- 
> > Kind regards,
> > Minchan Kim
> > 
> 
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@kvack.org.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

-- 
Kind regards,
Minchan Kim

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/2] zram: free meta out of init_lock
  2015-01-28  2:57                         ` Minchan Kim
@ 2015-01-28  3:53                           ` Sergey Senozhatsky
  2015-01-28  4:07                             ` Sergey Senozhatsky
  2015-01-28  4:55                             ` Minchan Kim
  0 siblings, 2 replies; 32+ messages in thread
From: Sergey Senozhatsky @ 2015-01-28  3:53 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Sergey Senozhatsky, Sergey Senozhatsky, Jerome Marchand,
	Andrew Morton, linux-kernel, linux-mm, Nitin Gupta

On (01/28/15 11:57), Minchan Kim wrote:
[..]
> > second,
> > after kick_all_cpus_sync() new RW operations will see false init_done().
> > bdev->bd_holders protects from resetting device which has read/write
> > operation ongoing on the onther CPU.
> > 
> > I need to refresh on how ->bd_holders actually incremented/decremented.
> > can the following race condition take a place?
> > 
> > 	CPU0					CPU1
> > reset_store()
> > bdev->bd_holders == false
> > 					zram_make_request
> > 						-rm- down_read(&zram->init_lock);
> > 					init_done(zram) == true
> > zram_reset_device()			valid_io_request()
> > 					__zram_make_request
> > down_write(&zram->init_lock);		zram_bvec_rw
> > [..]
> > set_capacity(zram->disk, 0);
> > zram->init_done = false;
> > kick_all_cpus_sync();			zram_bvec_write or zram_bvec_read()
> > zram_meta_free(zram->meta);		
> > zcomp_destroy(zram->comp);		zcomp_compress() or zcomp_decompress()
> 
> You're absolutely right. I forgot rw path is blockable so
> kick_all_cpus_sync doesn't work for our case, unfortunately.
> So, I want to go with srcu. Do you agree? or another suggestion?

yes, I think we need to take a second look at the srcu approach.

	-ss

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/2] zram: free meta out of init_lock
  2015-01-28  3:53                           ` Sergey Senozhatsky
@ 2015-01-28  4:07                             ` Sergey Senozhatsky
  2015-01-28  4:50                               ` Sergey Senozhatsky
  2015-01-28  4:55                             ` Minchan Kim
  1 sibling, 1 reply; 32+ messages in thread
From: Sergey Senozhatsky @ 2015-01-28  4:07 UTC (permalink / raw)
  To: Sergey Senozhatsky
  Cc: Minchan Kim, Sergey Senozhatsky, Jerome Marchand, Andrew Morton,
	linux-kernel, linux-mm, Nitin Gupta

On (01/28/15 12:53), Sergey Senozhatsky wrote:
> > So, I want to go with srcu. Do you agree? or another suggestion?
> 
> yes, I think we need to take a second look on srcu approach.
> 

... or we can ask lockdep to stop false alarming us and leave it as is.
I wouldn't say that ->init_lock is so hard to understand.
just as an option.

	-ss

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/2] zram: free meta out of init_lock
  2015-01-28  4:07                             ` Sergey Senozhatsky
@ 2015-01-28  4:50                               ` Sergey Senozhatsky
  2015-01-28  4:58                                 ` Minchan Kim
  0 siblings, 1 reply; 32+ messages in thread
From: Sergey Senozhatsky @ 2015-01-28  4:50 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Sergey Senozhatsky, Sergey Senozhatsky, Jerome Marchand,
	Andrew Morton, linux-kernel, linux-mm, Nitin Gupta

On (01/28/15 13:07), Sergey Senozhatsky wrote:
> On (01/28/15 12:53), Sergey Senozhatsky wrote:
> > > So, I want to go with srcu. Do you agree? or another suggestion?
> > 
> > yes, I think we need to take a second look on srcu approach.
> > 
> 
> ... or we can ask lockdep to stop false alarming us and leave it as is.
> I wouldn't say that ->init_lock is so hard to understand.
> just as an option.
> 

so... returning to the performance implications of barriers.

x86_64, lzo, 4 comp streams, 2G zram, ext4, mount -o rw,relatime,data=ordered

 ./iozone -t 3 -R -r 16K -s 60M -I +Z

       test           base          srcu
"  Initial write " 1299639.75   1277621.03
"        Rewrite " 2139387.50   2004663.94
"           Read " 6193415.00   5091000.00
"        Re-read " 6199050.38   4814297.88
"   Reverse Read " 4693868.88   4367201.75
"    Stride read " 4470633.75   4247550.00
"    Random read " 5115339.50   4517352.75
" Mixed workload " 4340747.06   3880517.31
"   Random write " 1982369.75   1892456.25
"         Pwrite " 1352550.22   1248667.78
"          Pread " 2853150.06   2445154.41
"         Fwrite " 2367397.81   2262384.56
"          Fread " 8100746.50   7578071.75

not good.

	-ss

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/2] zram: free meta out of init_lock
  2015-01-28  3:53                           ` Sergey Senozhatsky
  2015-01-28  4:07                             ` Sergey Senozhatsky
@ 2015-01-28  4:55                             ` Minchan Kim
  1 sibling, 0 replies; 32+ messages in thread
From: Minchan Kim @ 2015-01-28  4:55 UTC (permalink / raw)
  To: Sergey Senozhatsky
  Cc: Sergey Senozhatsky, Jerome Marchand, Andrew Morton, linux-kernel,
	linux-mm, Nitin Gupta

On Wed, Jan 28, 2015 at 12:53:54PM +0900, Sergey Senozhatsky wrote:
> On (01/28/15 11:57), Minchan Kim wrote:
> [..]
> > > second,
> > > after kick_all_cpus_sync() new RW operations will see false init_done().
> > > bdev->bd_holders protects from resetting device which has read/write
> > > operation ongoing on the onther CPU.
> > > 
> > > I need to refresh on how ->bd_holders actually incremented/decremented.
> > > can the following race condition take a place?
> > > 
> > > 	CPU0					CPU1
> > > reset_store()
> > > bdev->bd_holders == false
> > > 					zram_make_request
> > > 						-rm- down_read(&zram->init_lock);
> > > 					init_done(zram) == true
> > > zram_reset_device()			valid_io_request()
> > > 					__zram_make_request
> > > down_write(&zram->init_lock);		zram_bvec_rw
> > > [..]
> > > set_capacity(zram->disk, 0);
> > > zram->init_done = false;
> > > kick_all_cpus_sync();			zram_bvec_write or zram_bvec_read()
> > > zram_meta_free(zram->meta);		
> > > zcomp_destroy(zram->comp);		zcomp_compress() or zcomp_decompress()
> > 
> > You're absolutely right. I forgot rw path is blockable so
> > kick_all_cpus_sync doesn't work for our case, unfortunately.
> > So, I want to go with srcu. Do you agree? or another suggestion?
> 
> yes, I think we need to take a second look on srcu approach.
> 
> 	-ss

Another idea is to introduce an atomic refcount on zram to manage meta's lifetime,
so that the rw path gets a ref right before using the meta and puts it when done.
If the refcount is negative, nobody should proceed with it.

However, I guess we can do it more simply and more scalably with srcu rather than
introducing a new atomic count. ;-)

-- 
Kind regards,
Minchan Kim

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/2] zram: free meta out of init_lock
  2015-01-28  4:50                               ` Sergey Senozhatsky
@ 2015-01-28  4:58                                 ` Minchan Kim
  2015-01-28  5:35                                   ` Minchan Kim
  0 siblings, 1 reply; 32+ messages in thread
From: Minchan Kim @ 2015-01-28  4:58 UTC (permalink / raw)
  To: Sergey Senozhatsky
  Cc: Sergey Senozhatsky, Jerome Marchand, Andrew Morton, linux-kernel,
	linux-mm, Nitin Gupta

On Wed, Jan 28, 2015 at 01:50:28PM +0900, Sergey Senozhatsky wrote:
> On (01/28/15 13:07), Sergey Senozhatsky wrote:
> > On (01/28/15 12:53), Sergey Senozhatsky wrote:
> > > > So, I want to go with srcu. Do you agree? or another suggestion?
> > > 
> > > yes, I think we need to take a second look on srcu approach.
> > > 
> > 
> > ... or we can ask lockdep to stop false alarming us and leave it as is.
> > I wouldn't say that ->init_lock is so hard to understand.
> > just as an option.
> > 
> 
> so... returning back to barriers performance implications.
> 
> x86_64, lzo, 4 comp streams, 2G zram, ext4, mount -o rw,relatime,data=ordered
> 
>  ./iozone -t 3 -R -r 16K -s 60M -I +Z
> 
>        test           base          srcu
> "  Initial write " 1299639.75   1277621.03
> "        Rewrite " 2139387.50   2004663.94
> "           Read " 6193415.00   5091000.00
> "        Re-read " 6199050.38   4814297.88
> "   Reverse Read " 4693868.88   4367201.75
> "    Stride read " 4470633.75   4247550.00
> "    Random read " 5115339.50   4517352.75
> " Mixed workload " 4340747.06   3880517.31
> "   Random write " 1982369.75   1892456.25
> "         Pwrite " 1352550.22   1248667.78
> "          Pread " 2853150.06   2445154.41
> "         Fwrite " 2367397.81   2262384.56
> "          Fread " 8100746.50   7578071.75
> 
> not good.
> 

Oops, I never thought it could make a measurable performance difference.
I will investigate it.

Thanks a lot, Sergey!

> 	-ss

-- 
Kind regards,
Minchan Kim

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/2] zram: free meta out of init_lock
  2015-01-28  4:58                                 ` Minchan Kim
@ 2015-01-28  5:35                                   ` Minchan Kim
  2015-01-28  6:08                                     ` Sergey Senozhatsky
  0 siblings, 1 reply; 32+ messages in thread
From: Minchan Kim @ 2015-01-28  5:35 UTC (permalink / raw)
  To: Sergey Senozhatsky
  Cc: Sergey Senozhatsky, Jerome Marchand, Andrew Morton, linux-kernel,
	linux-mm, Nitin Gupta

On Wed, Jan 28, 2015 at 01:58:55PM +0900, Minchan Kim wrote:
> On Wed, Jan 28, 2015 at 01:50:28PM +0900, Sergey Senozhatsky wrote:
> > On (01/28/15 13:07), Sergey Senozhatsky wrote:
> > > On (01/28/15 12:53), Sergey Senozhatsky wrote:
> > > > > So, I want to go with srcu. Do you agree? or another suggestion?
> > > > 
> > > > yes, I think we need to take a second look on srcu approach.
> > > > 
> > > 
> > > ... or we can ask lockdep to stop false alarming us and leave it as is.
> > > I wouldn't say that ->init_lock is so hard to understand.
> > > just as an option.
> > > 
> > 
> > so... returning back to barriers performance implications.
> > 
> > x86_64, lzo, 4 comp streams, 2G zram, ext4, mount -o rw,relatime,data=ordered
> > 
> >  ./iozone -t 3 -R -r 16K -s 60M -I +Z
> > 
> >        test           base          srcu
> > "  Initial write " 1299639.75   1277621.03
> > "        Rewrite " 2139387.50   2004663.94
> > "           Read " 6193415.00   5091000.00
> > "        Re-read " 6199050.38   4814297.88
> > "   Reverse Read " 4693868.88   4367201.75
> > "    Stride read " 4470633.75   4247550.00
> > "    Random read " 5115339.50   4517352.75
> > " Mixed workload " 4340747.06   3880517.31
> > "   Random write " 1982369.75   1892456.25
> > "         Pwrite " 1352550.22   1248667.78
> > "          Pread " 2853150.06   2445154.41
> > "         Fwrite " 2367397.81   2262384.56
> > "          Fread " 8100746.50   7578071.75
> > 
> > not good.
> > 
> 
> Oops, I never thought it could make mesurable performance.
> I will investigate it.
> 
> Thanks a lot, Sergey!

Sergey, is the data consistent across repeated runs?

I tested it with dd on /dev/zram0 without any FS on my KVM
and I cannot see any measurable performance gap.
Hmm, I will try it on a real machine.

-- 
Kind regards,
Minchan Kim

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/2] zram: free meta out of init_lock
  2015-01-28  5:35                                   ` Minchan Kim
@ 2015-01-28  6:08                                     ` Sergey Senozhatsky
  2015-01-28  6:10                                       ` Sergey Senozhatsky
  0 siblings, 1 reply; 32+ messages in thread
From: Sergey Senozhatsky @ 2015-01-28  6:08 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Sergey Senozhatsky, Sergey Senozhatsky, Jerome Marchand,
	Andrew Morton, linux-kernel, linux-mm, Nitin Gupta

On (01/28/15 14:35), Minchan Kim wrote:
> I tested it with dd on /dev/zram0 without any FS on my KVM
> and I cannot see any measureable performance gap.
> Hmm, I will try it on real machine.

hm... no, it's 100% stable

 ./iozone -t 3 -R -r 16K -s 60M -I +Z

        test           base        srcu

 "  Initial write " 1274320.94  1251996.78
 "        Rewrite " 1965783.94  1994964.06
 "           Read " 4994070.75  4785895.88
 "        Re-read " 5134244.62  5010810.50
 "   Reverse Read " 4098531.38  4049988.38
 "    Stride read " 4577775.75  4263884.50
 "    Random read " 4131315.75  4636718.38
 " Mixed workload " 3675635.25  3854783.06
 "   Random write " 1832045.12  1863511.31
 "         Pwrite " 1238366.59  1258660.47
 "          Pread " 2475710.28  2404201.75
 "         Fwrite " 2410579.94  2396443.25
 "          Fread " 7723248.00  7127479.75

 "  Initial write " 1325167.41  1321517.41
 "        Rewrite " 2044098.62  2161141.06
 "           Read " 5267661.12  6203909.25
 "        Re-read " 5458601.62  5773477.12
 "   Reverse Read " 5001896.25  5103856.12
 "    Stride read " 4858877.62  5003335.25
 "    Random read " 4620529.88  4685374.62
 " Mixed workload " 3868978.19  3939195.31
 "   Random write " 2037816.75  1949729.56
 "         Pwrite " 1298255.91  1323038.47
 "          Pread " 2688768.09  2957903.06
 "         Fwrite " 2482632.44  2351247.50
 "          Fread " 7905214.75  7500859.75

 "  Initial write " 1334890.88  1332275.59
 "        Rewrite " 2061126.00  2152643.69
 "           Read " 5749209.88  5652791.62
 "        Re-read " 5845869.00  6261777.25
 "   Reverse Read " 4681375.12  4875618.50
 "    Stride read " 4760689.75  5242670.00
 "    Random read " 5112395.75  4650536.62
 " Mixed workload " 4129292.06  4075847.88
 "   Random write " 2067824.19  2022719.88
 "         Pwrite " 1328648.88  1334709.97
 "          Pread " 2607281.94  2581113.12
 "         Fwrite " 2404771.38  2348427.62
 "          Fread " 7903982.75  7486812.50

	-ss

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/2] zram: free meta out of init_lock
  2015-01-28  6:08                                     ` Sergey Senozhatsky
@ 2015-01-28  6:10                                       ` Sergey Senozhatsky
  0 siblings, 0 replies; 32+ messages in thread
From: Sergey Senozhatsky @ 2015-01-28  6:10 UTC (permalink / raw)
  To: Sergey Senozhatsky
  Cc: Minchan Kim, Sergey Senozhatsky, Jerome Marchand, Andrew Morton,
	linux-kernel, linux-mm, Nitin Gupta

On (01/28/15 15:08), Sergey Senozhatsky wrote:
> hm... no, it's 100% stable

sorry, should be "it's NOT 100% stable".

	-ss

>  ./iozone -t 3 -R -r 16K -s 60M -I +Z
> 
>         test           base        srcu
> 
>  "  Initial write " 1274320.94  1251996.78
>  "        Rewrite " 1965783.94  1994964.06
>  "           Read " 4994070.75  4785895.88
>  "        Re-read " 5134244.62  5010810.50
>  "   Reverse Read " 4098531.38  4049988.38
>  "    Stride read " 4577775.75  4263884.50
>  "    Random read " 4131315.75  4636718.38
>  " Mixed workload " 3675635.25  3854783.06
>  "   Random write " 1832045.12  1863511.31
>  "         Pwrite " 1238366.59  1258660.47
>  "          Pread " 2475710.28  2404201.75
>  "         Fwrite " 2410579.94  2396443.25
>  "          Fread " 7723248.00  7127479.75
> 
>  "  Initial write " 1325167.41  1321517.41
>  "        Rewrite " 2044098.62  2161141.06
>  "           Read " 5267661.12  6203909.25
>  "        Re-read " 5458601.62  5773477.12
>  "   Reverse Read " 5001896.25  5103856.12
>  "    Stride read " 4858877.62  5003335.25
>  "    Random read " 4620529.88  4685374.62
>  " Mixed workload " 3868978.19  3939195.31
>  "   Random write " 2037816.75  1949729.56
>  "         Pwrite " 1298255.91  1323038.47
>  "          Pread " 2688768.09  2957903.06
>  "         Fwrite " 2482632.44  2351247.50
>  "          Fread " 7905214.75  7500859.75
> 
>  "  Initial write " 1334890.88  1332275.59
>  "        Rewrite " 2061126.00  2152643.69
>  "           Read " 5749209.88  5652791.62
>  "        Re-read " 5845869.00  6261777.25
>  "   Reverse Read " 4681375.12  4875618.50
>  "    Stride read " 4760689.75  5242670.00
>  "    Random read " 5112395.75  4650536.62
>  " Mixed workload " 4129292.06  4075847.88
>  "   Random write " 2067824.19  2022719.88
>  "         Pwrite " 1328648.88  1334709.97
>  "          Pread " 2607281.94  2581113.12
>  "         Fwrite " 2404771.38  2348427.62
>  "          Fread " 7903982.75  7486812.50
> 
> 	-ss
> 

^ permalink raw reply	[flat|nested] 32+ messages in thread

end of thread, other threads:[~2015-01-28  6:11 UTC | newest]

Thread overview: 32+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-01-23  5:58 [PATCH 1/2] zram: free meta out of init_lock Minchan Kim
2015-01-23  5:58 ` [PATCH 2/2] zram: protect zram->stat race with init_lock Minchan Kim
2015-01-23 13:45   ` Jerome Marchand
2015-01-23 14:38   ` Sergey Senozhatsky
2015-01-24 13:17     ` Ganesh Mahendran
2015-01-25 14:38       ` Sergey Senozhatsky
2015-01-23 13:07 ` [PATCH 1/2] zram: free meta out of init_lock Jerome Marchand
2015-01-23 14:24 ` Sergey Senozhatsky
2015-01-23 14:48   ` Jerome Marchand
2015-01-23 15:47     ` Sergey Senozhatsky
2015-01-26  1:33       ` Minchan Kim
2015-01-26 14:17         ` Sergey Senozhatsky
2015-01-26 16:00           ` Minchan Kim
2015-01-27  2:17             ` Sergey Senozhatsky
2015-01-27  3:18               ` Minchan Kim
2015-01-27  4:03                 ` Sergey Senozhatsky
2015-01-28  0:15                   ` Minchan Kim
2015-01-28  0:22                     ` Minchan Kim
2015-01-28  2:07                       ` Sergey Senozhatsky
2015-01-28  2:57                         ` Minchan Kim
2015-01-28  3:53                           ` Sergey Senozhatsky
2015-01-28  4:07                             ` Sergey Senozhatsky
2015-01-28  4:50                               ` Sergey Senozhatsky
2015-01-28  4:58                                 ` Minchan Kim
2015-01-28  5:35                                   ` Minchan Kim
2015-01-28  6:08                                     ` Sergey Senozhatsky
2015-01-28  6:10                                       ` Sergey Senozhatsky
2015-01-28  4:55                             ` Minchan Kim
2015-01-28  0:24                     ` Sergey Senozhatsky
2015-01-28  0:59                       ` Minchan Kim
2015-01-26 14:34         ` Jerome Marchand
2015-01-26 15:52           ` Minchan Kim

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).