LKML Archive on lore.kernel.org
help / color / mirror / Atom feed
* [RESEND] [PATCH] block: create ioctl to discard-or-zeroout a range of blocks
@ 2015-01-29  2:00 Darrick J. Wong
  2015-01-29 10:02 ` Arnd Bergmann
                   ` (2 more replies)
  0 siblings, 3 replies; 7+ messages in thread
From: Darrick J. Wong @ 2015-01-29  2:00 UTC (permalink / raw)
  To: Jens Axboe, Christoph Hellwig
  Cc: linux-kernel, linux-fsdevel, linux-api, Jeff Layton, J. Bruce Fields

Create a new ioctl to expose the block layer's newfound ability to
issue either a zeroing discard, a WRITE SAME with a zero page, or a
regular write with the zero page.  This BLKZEROOUT2 ioctl takes
{start, length, flags} as parameters.  So far, the only flag available
is to enable the zeroing discard part -- without it, the call invokes
the old BLKZEROOUT behavior.  start and length have the same meaning
as in BLKZEROOUT.

Furthermore, because BLKZEROOUT2 issues commands directly to the
storage device, we must invalidate the page cache (as a regular
O_DIRECT write would do) to avoid returning stale cache contents at a
later time.

This patch depends on "block: Add discard flag to
blkdev_issue_zeroout() function" in Jens' for-3.20/core branch.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 block/ioctl.c           |   45 ++++++++++++++++++++++++++++++++++++++-------
 include/uapi/linux/fs.h |    7 +++++++
 2 files changed, 45 insertions(+), 7 deletions(-)

diff --git a/block/ioctl.c b/block/ioctl.c
index 7d8befd..ff623d5 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -186,19 +186,39 @@ static int blk_ioctl_discard(struct block_device *bdev, uint64_t start,
 }
 
 static int blk_ioctl_zeroout(struct block_device *bdev, uint64_t start,
-			     uint64_t len)
+			     uint64_t len, uint32_t flags)
 {
+	int ret;
+	struct address_space *mapping;
+	uint64_t end = start + len - 1;
+
+	if (flags & ~BLKZEROOUT2_DISCARD_OK)
+		return -EINVAL;
 	if (start & 511)
 		return -EINVAL;
 	if (len & 511)
 		return -EINVAL;
-	start >>= 9;
-	len >>= 9;
-
-	if (start + len > (i_size_read(bdev->bd_inode) >> 9))
+	if (end >= i_size_read(bdev->bd_inode))
 		return -EINVAL;
 
-	return blkdev_issue_zeroout(bdev, start, len, GFP_KERNEL, false);
+	/* Invalidate the page cache, including dirty pages */
+	mapping = bdev->bd_inode->i_mapping;
+	truncate_inode_pages_range(mapping, start, end);
+
+	ret = blkdev_issue_zeroout(bdev, start >> 9, len >> 9, GFP_KERNEL,
+				   flags & BLKZEROOUT2_DISCARD_OK);
+	if (ret)
+		goto out;
+
+	/*
+	 * Invalidate again; if someone wandered in and dirtied a page,
+	 * the caller will be given -EBUSY.
+	 */
+	ret = invalidate_inode_pages2_range(mapping,
+					    start >> PAGE_CACHE_SHIFT,
+					    end >> PAGE_CACHE_SHIFT);
+out:
+	return ret;
 }
 
 static int put_ushort(unsigned long arg, unsigned short val)
@@ -326,7 +346,18 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
 		if (copy_from_user(range, (void __user *)arg, sizeof(range)))
 			return -EFAULT;
 
-		return blk_ioctl_zeroout(bdev, range[0], range[1]);
+		return blk_ioctl_zeroout(bdev, range[0], range[1], 0);
+	}
+	case BLKZEROOUT2: {
+		struct blkzeroout2 p;
+
+		if (!(mode & FMODE_WRITE))
+			return -EBADF;
+
+		if (copy_from_user(&p, (void __user *)arg, sizeof(p)))
+			return -EFAULT;
+
+		return blk_ioctl_zeroout(bdev, p.start, p.length, p.flags);
 	}
 
 	case HDIO_GETGEO: {
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index 3735fa0..54d24ea 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -150,6 +150,13 @@ struct inodes_stat_t {
 #define BLKSECDISCARD _IO(0x12,125)
 #define BLKROTATIONAL _IO(0x12,126)
 #define BLKZEROOUT _IO(0x12,127)
+struct blkzeroout2 {
+	__u64 start;
+	__u64 length;
+	__u32 flags;
+};
+#define BLKZEROOUT2_DISCARD_OK	1
+#define BLKZEROOUT2 _IOR(0x12, 127, struct blkzeroout2)
 
 #define BMAP_IOCTL 1		/* obsolete - kept for compatibility */
 #define FIBMAP	   _IO(0x00,1)	/* bmap access */

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [RESEND] [PATCH] block: create ioctl to discard-or-zeroout a range of blocks
  2015-01-29  2:00 [RESEND] [PATCH] block: create ioctl to discard-or-zeroout a range of blocks Darrick J. Wong
@ 2015-01-29 10:02 ` Arnd Bergmann
  2015-01-29 16:28   ` Elliott, Robert (Server Storage)
  2015-01-29 19:01   ` Darrick J. Wong
  2015-01-30  0:48 ` Martin K. Petersen
  2015-02-13  8:51 ` Darrick J. Wong
  2 siblings, 2 replies; 7+ messages in thread
From: Arnd Bergmann @ 2015-01-29 10:02 UTC (permalink / raw)
  To: Darrick J. Wong
  Cc: Jens Axboe, Christoph Hellwig, linux-kernel, linux-fsdevel,
	linux-api, Jeff Layton, J. Bruce Fields

On Wednesday 28 January 2015 18:00:25 Darrick J. Wong wrote:
> Create a new ioctl to expose the block layer's newfound ability to
> issue either a zeroing discard, a WRITE SAME with a zero page, or a
> regular write with the zero page.  This BLKZEROOUT2 ioctl takes
> {start, length, flags} as parameters.  So far, the only flag available
> is to enable the zeroing discard part -- without it, the call invokes
> the old BLKZEROOUT behavior.  start and length have the same meaning
> as in BLKZEROOUT.
> 
> Furthermore, because BLKZEROOUT2 issues commands directly to the
> storage device, we must invalidate the page cache (as a regular
> O_DIRECT write would do) to avoid returning stale cache contents at a
> later time.
> 
> This patch depends on "block: Add discard flag to
> blkdev_issue_zeroout() function" in Jens' for-3.20/core branch.
> 
> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> 

Would this work ok for devices that fill discarded areas with all-ones
instead of all-zeroes? I believe SD cards can do either.

	Arnd

^ permalink raw reply	[flat|nested] 7+ messages in thread

* RE: [RESEND] [PATCH] block: create ioctl to discard-or-zeroout a range of blocks
  2015-01-29 10:02 ` Arnd Bergmann
@ 2015-01-29 16:28   ` Elliott, Robert (Server Storage)
  2015-01-29 19:01   ` Darrick J. Wong
  1 sibling, 0 replies; 7+ messages in thread
From: Elliott, Robert (Server Storage) @ 2015-01-29 16:28 UTC (permalink / raw)
  To: Arnd Bergmann, Darrick J. Wong
  Cc: Jens Axboe, Christoph Hellwig, linux-kernel, linux-fsdevel,
	linux-api, Jeff Layton, J. Bruce Fields



> -----Original Message-----
> From: linux-kernel-owner@vger.kernel.org [mailto:linux-kernel-
> owner@vger.kernel.org] On Behalf Of Arnd Bergmann
> Sent: Thursday, 29 January, 2015 4:03 AM
> To: Darrick J. Wong
> Cc: Jens Axboe; Christoph Hellwig; linux-kernel@vger.kernel.org; linux-
> fsdevel@vger.kernel.org; linux-api@vger.kernel.org; Jeff Layton; J. Bruce
> Fields
> Subject: Re: [RESEND] [PATCH] block: create ioctl to discard-or-zeroout a
> range of blocks
> 
> On Wednesday 28 January 2015 18:00:25 Darrick J. Wong wrote:
> > Create a new ioctl to expose the block layer's newfound ability to
> > issue either a zeroing discard, a WRITE SAME with a zero page, or a
> > regular write with the zero page.  This BLKZEROOUT2 ioctl takes
> > {start, length, flags} as parameters.  So far, the only flag available
> > is to enable the zeroing discard part -- without it, the call invokes
> > the old BLKZEROOUT behavior.  start and length have the same meaning
> > as in BLKZEROOUT.
> >
> > Furthermore, because BLKZEROOUT2 issues commands directly to the
> > storage device, we must invalidate the page cache (as a regular
> > O_DIRECT write would do) to avoid returning stale cache contents at a
> > later time.
> >
> > This patch depends on "block: Add discard flag to
> > blkdev_issue_zeroout() function" in Jens' for-3.20/core branch.
> >
> > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> >
> 
> Would this work ok for devices that fill discarded areas with all-ones
> instead of all-zeroes? I believe SD cards can do either.
> 
> 	Arnd

RAID software stacks need predictable data, and 0 is a lot easier to
deal with than 0xff in Galois field (XOR-based) math.

The SCSI and ATA standards both define bits to report if the device is
designed to return 0 after discard; they don't have a way to report
that any other particular non-zero value is returned.

---
Rob Elliott    HP Server Storage



^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [RESEND] [PATCH] block: create ioctl to discard-or-zeroout a range of blocks
  2015-01-29 10:02 ` Arnd Bergmann
  2015-01-29 16:28   ` Elliott, Robert (Server Storage)
@ 2015-01-29 19:01   ` Darrick J. Wong
  2015-01-29 22:19     ` Arnd Bergmann
  1 sibling, 1 reply; 7+ messages in thread
From: Darrick J. Wong @ 2015-01-29 19:01 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: Jens Axboe, Christoph Hellwig, linux-kernel, linux-fsdevel,
	linux-api, Jeff Layton, J. Bruce Fields

On Thu, Jan 29, 2015 at 11:02:58AM +0100, Arnd Bergmann wrote:
> On Wednesday 28 January 2015 18:00:25 Darrick J. Wong wrote:
> > Create a new ioctl to expose the block layer's newfound ability to
> > issue either a zeroing discard, a WRITE SAME with a zero page, or a
> > regular write with the zero page.  This BLKZEROOUT2 ioctl takes
> > {start, length, flags} as parameters.  So far, the only flag available
> > is to enable the zeroing discard part -- without it, the call invokes
> > the old BLKZEROOUT behavior.  start and length have the same meaning
> > as in BLKZEROOUT.
> > 
> > Furthermore, because BLKZEROOUT2 issues commands directly to the
> > storage device, we must invalidate the page cache (as a regular
> > O_DIRECT write would do) to avoid returning stale cache contents at a
> > later time.
> > 
> > This patch depends on "block: Add discard flag to
> > blkdev_issue_zeroout() function" in Jens' for-3.20/core branch.
> > 
> > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > 
> 
> Would this work ok for devices that fill discarded areas with all-ones
> instead of all-zeroes? I believe SD cards can do either.

It won't do all-ones, because the underlying blkdev_issue_zeroout call only
knows how to tell the device to write zeroes or perform a discard if the flag
is set and the device is whitelisted.  This patch only exposes the existing
kernel call to userspace.

(All-ones could be plumbed into the storage stack, but that would have to be a
separate patch.)

--D

> 
> 	Arnd
> --
> To unsubscribe from this list: send the line "unsubscribe linux-api" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [RESEND] [PATCH] block: create ioctl to discard-or-zeroout a range of blocks
  2015-01-29 19:01   ` Darrick J. Wong
@ 2015-01-29 22:19     ` Arnd Bergmann
  0 siblings, 0 replies; 7+ messages in thread
From: Arnd Bergmann @ 2015-01-29 22:19 UTC (permalink / raw)
  To: Darrick J. Wong
  Cc: Jens Axboe, Christoph Hellwig, linux-kernel, linux-fsdevel,
	linux-api, Jeff Layton, J. Bruce Fields

On Thursday 29 January 2015 11:01:02 Darrick J. Wong wrote:
> It won't do all-ones, because the underlying blkdev_issue_zeroout call only
> knows how to tell the device to write zeroes or perform a discard if the flag
> is set and the device is whitelisted.  This patch only exposes the existing
> kernel call to userspace.
>
> (All-ones could be plumbed into the storage stack, but that would have to be a
> separate patch.)

I see. I've just checked the code in the mmc core and it seems to
get the correct information for both mmc and sd devices (they use
different bits to announce the feature).

So as long as the devices follow the spec, everything is fine here.

Thanks Darrick and Robert for the quick reply!

	Arnd

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [RESEND] [PATCH] block: create ioctl to discard-or-zeroout a range of blocks
  2015-01-29  2:00 [RESEND] [PATCH] block: create ioctl to discard-or-zeroout a range of blocks Darrick J. Wong
  2015-01-29 10:02 ` Arnd Bergmann
@ 2015-01-30  0:48 ` Martin K. Petersen
  2015-02-13  8:51 ` Darrick J. Wong
  2 siblings, 0 replies; 7+ messages in thread
From: Martin K. Petersen @ 2015-01-30  0:48 UTC (permalink / raw)
  To: Darrick J. Wong
  Cc: Jens Axboe, Christoph Hellwig, linux-kernel, linux-fsdevel,
	linux-api, Jeff Layton, J. Bruce Fields

>>>>> "Darrick" == Darrick J Wong <darrick.wong@oracle.com> writes:

Darrick> Create a new ioctl to expose the block layer's newfound ability
Darrick> to issue either a zeroing discard, a WRITE SAME with a zero
Darrick> page, or a regular write with the zero page.  This BLKZEROOUT2
Darrick> ioctl takes {start, length, flags} as parameters.  So far, the
Darrick> only flag available is to enable the zeroing discard part --
Darrick> without it, the call invokes the old BLKZEROOUT behavior.
Darrick> start and length have the same meaning as in BLKZEROOUT.

Darrick> Furthermore, because BLKZEROOUT2 issues commands directly to
Darrick> the storage device, we must invalidate the page cache (as a
Darrick> regular O_DIRECT write would do) to avoid returning stale cache
Darrick> contents at a later time.

Looks good to me, Darrick.

Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>

-- 
Martin K. Petersen	Oracle Linux Engineering

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [RESEND] [PATCH] block: create ioctl to discard-or-zeroout a range of blocks
  2015-01-29  2:00 [RESEND] [PATCH] block: create ioctl to discard-or-zeroout a range of blocks Darrick J. Wong
  2015-01-29 10:02 ` Arnd Bergmann
  2015-01-30  0:48 ` Martin K. Petersen
@ 2015-02-13  8:51 ` Darrick J. Wong
  2 siblings, 0 replies; 7+ messages in thread
From: Darrick J. Wong @ 2015-02-13  8:51 UTC (permalink / raw)
  To: Jens Axboe, Christoph Hellwig
  Cc: linux-kernel, linux-fsdevel, linux-api, Jeff Layton, J. Bruce Fields

So, uh, it's been a couple of weeks...

Jens: Any comments?  Nobody's objected to either the function or the interface;
can this go in -next?

--D

On Wed, Jan 28, 2015 at 06:00:25PM -0800, Darrick J. Wong wrote:
> Create a new ioctl to expose the block layer's newfound ability to
> issue either a zeroing discard, a WRITE SAME with a zero page, or a
> regular write with the zero page.  This BLKZEROOUT2 ioctl takes
> {start, length, flags} as parameters.  So far, the only flag available
> is to enable the zeroing discard part -- without it, the call invokes
> the old BLKZEROOUT behavior.  start and length have the same meaning
> as in BLKZEROOUT.
> 
> Furthermore, because BLKZEROOUT2 issues commands directly to the
> storage device, we must invalidate the page cache (as a regular
> O_DIRECT write would do) to avoid returning stale cache contents at a
> later time.
> 
> This patch depends on "block: Add discard flag to
> blkdev_issue_zeroout() function" in Jens' for-3.20/core branch.
> 
> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> ---
>  block/ioctl.c           |   45 ++++++++++++++++++++++++++++++++++++++-------
>  include/uapi/linux/fs.h |    7 +++++++
>  2 files changed, 45 insertions(+), 7 deletions(-)
> 
> diff --git a/block/ioctl.c b/block/ioctl.c
> index 7d8befd..ff623d5 100644
> --- a/block/ioctl.c
> +++ b/block/ioctl.c
> @@ -186,19 +186,39 @@ static int blk_ioctl_discard(struct block_device *bdev, uint64_t start,
>  }
>  
>  static int blk_ioctl_zeroout(struct block_device *bdev, uint64_t start,
> -			     uint64_t len)
> +			     uint64_t len, uint32_t flags)
>  {
> +	int ret;
> +	struct address_space *mapping;
> +	uint64_t end = start + len - 1;
> +
> +	if (flags & ~BLKZEROOUT2_DISCARD_OK)
> +		return -EINVAL;
>  	if (start & 511)
>  		return -EINVAL;
>  	if (len & 511)
>  		return -EINVAL;
> -	start >>= 9;
> -	len >>= 9;
> -
> -	if (start + len > (i_size_read(bdev->bd_inode) >> 9))
> +	if (end >= i_size_read(bdev->bd_inode))
>  		return -EINVAL;
>  
> -	return blkdev_issue_zeroout(bdev, start, len, GFP_KERNEL, false);
> +	/* Invalidate the page cache, including dirty pages */
> +	mapping = bdev->bd_inode->i_mapping;
> +	truncate_inode_pages_range(mapping, start, end);
> +
> +	ret = blkdev_issue_zeroout(bdev, start >> 9, len >> 9, GFP_KERNEL,
> +				   flags & BLKZEROOUT2_DISCARD_OK);
> +	if (ret)
> +		goto out;
> +
> +	/*
> +	 * Invalidate again; if someone wandered in and dirtied a page,
> +	 * the caller will be given -EBUSY.
> +	 */
> +	ret = invalidate_inode_pages2_range(mapping,
> +					    start >> PAGE_CACHE_SHIFT,
> +					    end >> PAGE_CACHE_SHIFT);
> +out:
> +	return ret;
>  }
>  
>  static int put_ushort(unsigned long arg, unsigned short val)
> @@ -326,7 +346,18 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
>  		if (copy_from_user(range, (void __user *)arg, sizeof(range)))
>  			return -EFAULT;
>  
> -		return blk_ioctl_zeroout(bdev, range[0], range[1]);
> +		return blk_ioctl_zeroout(bdev, range[0], range[1], 0);
> +	}
> +	case BLKZEROOUT2: {
> +		struct blkzeroout2 p;
> +
> +		if (!(mode & FMODE_WRITE))
> +			return -EBADF;
> +
> +		if (copy_from_user(&p, (void __user *)arg, sizeof(p)))
> +			return -EFAULT;
> +
> +		return blk_ioctl_zeroout(bdev, p.start, p.length, p.flags);
>  	}
>  
>  	case HDIO_GETGEO: {
> diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
> index 3735fa0..54d24ea 100644
> --- a/include/uapi/linux/fs.h
> +++ b/include/uapi/linux/fs.h
> @@ -150,6 +150,13 @@ struct inodes_stat_t {
>  #define BLKSECDISCARD _IO(0x12,125)
>  #define BLKROTATIONAL _IO(0x12,126)
>  #define BLKZEROOUT _IO(0x12,127)
> +struct blkzeroout2 {
> +	__u64 start;
> +	__u64 length;
> +	__u32 flags;
> +};
> +#define BLKZEROOUT2_DISCARD_OK	1
> +#define BLKZEROOUT2 _IOR(0x12, 127, struct blkzeroout2)
>  
>  #define BMAP_IOCTL 1		/* obsolete - kept for compatibility */
>  #define FIBMAP	   _IO(0x00,1)	/* bmap access */
> --
> To unsubscribe from this list: send the line "unsubscribe linux-api" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2015-02-13  8:51 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-01-29  2:00 [RESEND] [PATCH] block: create ioctl to discard-or-zeroout a range of blocks Darrick J. Wong
2015-01-29 10:02 ` Arnd Bergmann
2015-01-29 16:28   ` Elliott, Robert (Server Storage)
2015-01-29 19:01   ` Darrick J. Wong
2015-01-29 22:19     ` Arnd Bergmann
2015-01-30  0:48 ` Martin K. Petersen
2015-02-13  8:51 ` Darrick J. Wong

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).