LKML Archive on lore.kernel.org
help / color / mirror / Atom feed
* [patch] rfc: introduce /dev/hugetlb
@ 2007-03-23  8:44 Ken Chen
  2007-03-23 15:03 ` William Lee Irwin III
                   ` (3 more replies)
  0 siblings, 4 replies; 17+ messages in thread
From: Ken Chen @ 2007-03-23  8:44 UTC (permalink / raw)
  To: Adam Litke
  Cc: Andrew Morton, Arjan van de Ven, William Lee Irwin III,
	Christoph Hellwig, linux-mm, linux-kernel

On 3/21/07, Adam Litke <agl@us.ibm.com> wrote:
> The main reason I am advocating a set of pagetable_operations is to
> enable the development of a new hugetlb interface.  During the hugetlb
> BOFS at OLS last year, we talked about a character device that would
> behave like /dev/zero.  Many of the people were talking about how they
> just wanted to create MAP_PRIVATE hugetlb mappings without all the fuss
> about the hugetlbfs filesystem.  /dev/zero is a familiar interface for
> getting anonymous memory so bringing that model to huge pages would make
> programming for anonymous huge pages easier.

I think we have enough infrastructure currently in hugetlbfs to
implement what Adam wants for something like a /dev/hugetlb char
device (except we can't afford to have a zero hugetlb page since it
will be too costly on some arch).

I really like the idea of having something similar to /dev/zero for
hugetlb page.  So I coded it up on top of existing hugetlbfs.  The
core change is really small and half of the patch is really just
moving things around.  I think this at least can partially fulfill the
goal.


Signed-off-by: Ken Chen <kenchen@google.com>

diff --git a/drivers/char/mem.c b/drivers/char/mem.c
index f5c160c..56e58f5 100644
--- a/drivers/char/mem.c
+++ b/drivers/char/mem.c
@@ -27,6 +27,7 @@
 #include <linux/bootmem.h>
 #include <linux/pipe_fs_i.h>
 #include <linux/pfn.h>
+#include <linux/hugetlb.h>

 #include <asm/uaccess.h>
 #include <asm/io.h>
@@ -872,6 +873,13 @@ static const struct file_operations oldmem_fops = {
 };
 #endif

+#ifdef CONFIG_HUGETLBFS
+static const struct file_operations hugetlb_fops = {
+	.mmap			= hugetlb_zero_setup,
+	.get_unmapped_area	= hugetlb_get_unmapped_area,
+};
+#endif
+
 static ssize_t kmsg_write(struct file * file, const char __user * buf,
 			  size_t count, loff_t *ppos)
 {
@@ -939,6 +947,11 @@ static int memory_open(struct inode *
 			filp->f_op = &oldmem_fops;
 			break;
 #endif
+#ifdef CONFIG_HUGETLBFS
+		case 13:
+			filp->f_op = &hugetlb_fops;
+			break;
+#endif
 		default:
 			return -ENXIO;
 	}
@@ -971,6 +984,9 @@ static const struct {
 #ifdef CONFIG_CRASH_DUMP
 	{12,"oldmem",    S_IRUSR | S_IWUSR | S_IRGRP, &oldmem_fops},
 #endif
+#ifdef CONFIG_HUGETLBFS
+	{13, "hugetlb",S_IRUGO | S_IWUGO,	    &hugetlb_fops},
+#endif
 };

 static struct class *mem_class;
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 8c718a3..af24664 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -97,12 +97,7 @@ out:
 /*
  * Called under down_write(mmap_sem).
  */
-
-#ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
-unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
-		unsigned long len, unsigned long pgoff, unsigned long flags);
-#else
-static unsigned long
+unsigned long
 hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 		unsigned long len, unsigned long pgoff, unsigned long flags)
 {
@@ -150,7 +145,6 @@ full_search:
 		addr = ALIGN(vma->vm_end, HPAGE_SIZE);
 	}
 }
-#endif

 /*
  * Read a page. Again trivial. If it didn't already exist
@@ -734,7 +728,7 @@ static int can_do_hugetlb_shm(void)
 			can_do_mlock());
 }

-struct file *hugetlb_zero_setup(size_t size)
+struct file *hugetlb_file_setup(size_t size, int resv)
 {
 	int error = -ENOMEM;
 	struct file *file;
@@ -771,7 +765,7 @@ struct file *hugetlb_zero_setup(size_t size)
 		goto out_file;

 	error = -ENOMEM;
-	if (hugetlb_reserve_pages(inode, 0, size >> HPAGE_SHIFT))
+	if (resv && hugetlb_reserve_pages(inode, 0, size >> HPAGE_SHIFT))
 		goto out_inode;

 	d_instantiate(dentry, inode);
@@ -795,6 +789,18 @@ out_shm_unlock:
 	return ERR_PTR(error);
 }

+int hugetlb_zero_setup(struct file *file, struct vm_area_struct *vma)
+{
+	file = hugetlb_file_setup(vma->vm_end - vma->vm_start, 0);
+	if (IS_ERR(file))
+		return PTR_ERR(file);
+
+	if (vma->vm_file)
+		fput(vma->vm_file);
+	vma->vm_file = file;
+	return hugetlbfs_file_mmap(file, vma);
+}
+
 static int __init init_hugetlbfs_fs(void)
 {
 	int error;
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 3f3e7a6..d2a2190 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -163,9 +163,12 @@ static inline struct hugetlbfs_sb_info *

 extern const struct file_operations hugetlbfs_file_operations;
 extern struct vm_operations_struct hugetlb_vm_ops;
-struct file *hugetlb_zero_setup(size_t);
+struct file *hugetlb_file_setup(size_t, int);
+int hugetlb_zero_setup(struct file *, struct vm_area_struct *);
 int hugetlb_get_quota(struct address_space *mapping);
 void hugetlb_put_quota(struct address_space *mapping);
+unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
+		unsigned long len, unsigned long pgoff, unsigned long flags);

 static inline int is_file_hugepages(struct file *file)
 {
@@ -185,7 +188,7 @@ static inline void set_file_hugepages(struct file *file)

 #define is_file_hugepages(file)		0
 #define set_file_hugepages(file)	BUG()
-#define hugetlb_zero_setup(size)	ERR_PTR(-ENOSYS)
+#define hugetlb_file_setup(size, resv)	ERR_PTR(-ENOSYS)

 #endif /* !CONFIG_HUGETLBFS */

diff --git a/ipc/shm.c b/ipc/shm.c
index 4fefbad..c64643f 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -366,7 +366,7 @@ static int newseg (struct ipc_namespace *ns

 	if (shmflg & SHM_HUGETLB) {
 		/* hugetlb_zero_setup takes care of mlock user accounting */
-		file = hugetlb_zero_setup(size);
+		file = hugetlb_file_setup(size, 1);
 		shp->mlock_user = current->user;
 	} else {
 		int acctflag = VM_ACCOUNT;

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [patch] rfc: introduce /dev/hugetlb
  2007-03-23  8:44 [patch] rfc: introduce /dev/hugetlb Ken Chen
@ 2007-03-23 15:03 ` William Lee Irwin III
  2007-03-23 21:56   ` Ken Chen
  2007-03-23 15:03 ` Mel Gorman
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 17+ messages in thread
From: William Lee Irwin III @ 2007-03-23 15:03 UTC (permalink / raw)
  To: Ken Chen
  Cc: Adam Litke, Andrew Morton, Arjan van de Ven, Christoph Hellwig,
	linux-mm, linux-kernel

On Fri, Mar 23, 2007 at 01:44:38AM -0700, Ken Chen wrote:
> I think we have enough infrastructure currently in hugetlbfs to
> implement what Adam wants for something like a /dev/hugetlb char
> device (except we can't afford to have a zero hugetlb page since it
> will be too costly on some arch).
> I really like the idea of having something similar to /dev/zero for
> hugetlb page.  So I coded it up on top of existing hugetlbfs.  The
> core change is really small and half of the patch is really just
> moving things around.  I think this at least can partially fulfill the
> goal.
> Signed-off-by: Ken Chen <kenchen@google.com>

I like this patch a lot, though I'm not likely to get around to testing
it today. If userspace testcode is available that would be great to see
posted so I can just boot into things and run that.


-- wli

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [patch] rfc: introduce /dev/hugetlb
  2007-03-23  8:44 [patch] rfc: introduce /dev/hugetlb Ken Chen
  2007-03-23 15:03 ` William Lee Irwin III
@ 2007-03-23 15:03 ` Mel Gorman
  2007-03-23 15:09   ` William Lee Irwin III
  2007-03-23 21:08 ` Benjamin Herrenschmidt
  2007-03-24  4:58 ` Andrew Morton
  3 siblings, 1 reply; 17+ messages in thread
From: Mel Gorman @ 2007-03-23 15:03 UTC (permalink / raw)
  To: Ken Chen
  Cc: Adam Litke, Andrew Morton, Arjan van de Ven,
	William Lee Irwin III, Christoph Hellwig, linux-mm, linux-kernel

On Fri, 23 Mar 2007, Ken Chen wrote:

> On 3/21/07, Adam Litke <agl@us.ibm.com> wrote:
>> The main reason I am advocating a set of pagetable_operations is to
>> enable the development of a new hugetlb interface.  During the hugetlb
>> BOFS at OLS last year, we talked about a character device that would
>> behave like /dev/zero.  Many of the people were talking about how they
>> just wanted to create MAP_PRIVATE hugetlb mappings without all the fuss
>> about the hugetlbfs filesystem.  /dev/zero is a familiar interface for
>> getting anonymous memory so bringing that model to huge pages would make
>> programming for anonymous huge pages easier.
>
> I think we have enough infrastructure currently in hugetlbfs to
> implement what Adam wants for something like a /dev/hugetlb char
> device (except we can't afford to have a zero hugetlb page since it
> will be too costly on some arch).
>
> I really like the idea of having something similar to /dev/zero for
> hugetlb page.  So I coded it up on top of existing hugetlbfs.  The
> core change is really small and half of the patch is really just
> moving things around.  I think this at least can partially fulfill the
> goal.
>

Good stuff. Let's take a look.

>
> Signed-off-by: Ken Chen <kenchen@google.com>
>
> diff --git a/drivers/char/mem.c b/drivers/char/mem.c
> index f5c160c..56e58f5 100644
> --- a/drivers/char/mem.c
> +++ b/drivers/char/mem.c
> @@ -27,6 +27,7 @@
> #include <linux/bootmem.h>
> #include <linux/pipe_fs_i.h>
> #include <linux/pfn.h>
> +#include <linux/hugetlb.h>
>
> #include <asm/uaccess.h>
> #include <asm/io.h>
> @@ -872,6 +873,13 @@ static const struct file_operations oldmem_fops = {
> };
> #endif
>
> +#ifdef CONFIG_HUGETLBFS
> +static const struct file_operations hugetlb_fops = {
> +	.mmap			= hugetlb_zero_setup,
> +	.get_unmapped_area	= hugetlb_get_unmapped_area,
> +};
> +#endif

Ok, so we'd behave similar to shared memory and use the internal mount. 
Seems reasonable

> +
> static ssize_t kmsg_write(struct file * file, const char __user * buf,
> 			  size_t count, loff_t *ppos)
> {
> @@ -939,6 +947,11 @@ static int memory_open(struct inode *
> 			filp->f_op = &oldmem_fops;
> 			break;
> #endif
> +#ifdef CONFIG_HUGETLBFS
> +		case 13:
> +			filp->f_op = &hugetlb_fops;
> +			break;
> +#endif
> 		default:
> 			return -ENXIO;
> 	}
> @@ -971,6 +984,9 @@ static const struct {
> #ifdef CONFIG_CRASH_DUMP
> 	{12,"oldmem",    S_IRUSR | S_IWUSR | S_IRGRP, &oldmem_fops},
> #endif
> +#ifdef CONFIG_HUGETLBFS
> +	{13, "hugetlb",S_IRUGO | S_IWUGO,	    &hugetlb_fops},
> +#endif
> };
>
> static struct class *mem_class;
> diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
> index 8c718a3..af24664 100644
> --- a/fs/hugetlbfs/inode.c
> +++ b/fs/hugetlbfs/inode.c
> @@ -97,12 +97,7 @@ out:
> /*
> * Called under down_write(mmap_sem).
> */
> -
> -#ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
> -unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long 
> addr,
> -		unsigned long len, unsigned long pgoff, unsigned long flags);
> -#else
> -static unsigned long
> +unsigned long
> hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
> 		unsigned long len, unsigned long pgoff, unsigned long flags)

What is going on here? Why do arches not get to specify a 
get_unmapped_area any more?

> {
> @@ -150,7 +145,6 @@ full_search:
> 		addr = ALIGN(vma->vm_end, HPAGE_SIZE);
> 	}
> }
> -#endif
>
> /*
> * Read a page. Again trivial. If it didn't already exist
> @@ -734,7 +728,7 @@ static int can_do_hugetlb_shm(void)
> 			can_do_mlock());
> }
>
> -struct file *hugetlb_zero_setup(size_t size)
> +struct file *hugetlb_file_setup(size_t size, int resv)
> {
> 	int error = -ENOMEM;
> 	struct file *file;
> @@ -771,7 +765,7 @@ struct file *hugetlb_zero_setup(size_t size)
> 		goto out_file;
>
> 	error = -ENOMEM;
> -	if (hugetlb_reserve_pages(inode, 0, size >> HPAGE_SHIFT))
> +	if (resv && hugetlb_reserve_pages(inode, 0, size >> HPAGE_SHIFT))
> 		goto out_inode;
>

This looks like it should be a separate patch altogether. At first glance, 
it seems reasonable enough - just not jammed in with a char device.

> 	d_instantiate(dentry, inode);
> @@ -795,6 +789,18 @@ out_shm_unlock:
> 	return ERR_PTR(error);
> }
>
> +int hugetlb_zero_setup(struct file *file, struct vm_area_struct *vma)
> +{
> +	file = hugetlb_file_setup(vma->vm_end - vma->vm_start, 0);
> +	if (IS_ERR(file))
> +		return PTR_ERR(file);
> +
> +	if (vma->vm_file)
> +		fput(vma->vm_file);
> +	vma->vm_file = file;
> +	return hugetlbfs_file_mmap(file, vma);
> +}
> +
> static int __init init_hugetlbfs_fs(void)
> {
> 	int error;
> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> index 3f3e7a6..d2a2190 100644
> --- a/include/linux/hugetlb.h
> +++ b/include/linux/hugetlb.h
> @@ -163,9 +163,12 @@ static inline struct hugetlbfs_sb_info *
>
> extern const struct file_operations hugetlbfs_file_operations;
> extern struct vm_operations_struct hugetlb_vm_ops;
> -struct file *hugetlb_zero_setup(size_t);
> +struct file *hugetlb_file_setup(size_t, int);
> +int hugetlb_zero_setup(struct file *, struct vm_area_struct *);
> int hugetlb_get_quota(struct address_space *mapping);
> void hugetlb_put_quota(struct address_space *mapping);
> +unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long 
> addr,
> +		unsigned long len, unsigned long pgoff, unsigned long flags);
>

Still not clear why hugetlb_get_unmapped_area() is moving around.

> static inline int is_file_hugepages(struct file *file)
> {
> @@ -185,7 +188,7 @@ static inline void set_file_hugepages(struct file *file)
>
> #define is_file_hugepages(file)		0
> #define set_file_hugepages(file)	BUG()
> -#define hugetlb_zero_setup(size)	ERR_PTR(-ENOSYS)
> +#define hugetlb_file_setup(size, resv)	ERR_PTR(-ENOSYS)
>
> #endif /* !CONFIG_HUGETLBFS */
>
> diff --git a/ipc/shm.c b/ipc/shm.c
> index 4fefbad..c64643f 100644
> --- a/ipc/shm.c
> +++ b/ipc/shm.c
> @@ -366,7 +366,7 @@ static int newseg (struct ipc_namespace *ns
>
> 	if (shmflg & SHM_HUGETLB) {
> 		/* hugetlb_zero_setup takes care of mlock user accounting */
> -		file = hugetlb_zero_setup(size);
> +		file = hugetlb_file_setup(size, 1);
> 		shp->mlock_user = current->user;
> 	} else {
> 		int acctflag = VM_ACCOUNT;
>

Otherwise, seems promising.

-- 
Mel Gorman
Part-time Phd Student                          Linux Technology Center
University of Limerick                         IBM Dublin Software Lab

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [patch] rfc: introduce /dev/hugetlb
  2007-03-23 15:03 ` Mel Gorman
@ 2007-03-23 15:09   ` William Lee Irwin III
  2007-03-23 15:15     ` Mel Gorman
  0 siblings, 1 reply; 17+ messages in thread
From: William Lee Irwin III @ 2007-03-23 15:09 UTC (permalink / raw)
  To: Mel Gorman
  Cc: Ken Chen, Adam Litke, Andrew Morton, Arjan van de Ven,
	Christoph Hellwig, linux-mm, linux-kernel

On Fri, 23 Mar 2007, Ken Chen wrote:
> >-#ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
> >-unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long 
> >addr,
> >-		unsigned long len, unsigned long pgoff, unsigned long flags);
> >-#else
> >-static unsigned long
> >+unsigned long
> >hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
> >		unsigned long len, unsigned long pgoff, unsigned long flags)
> 
On Fri, Mar 23, 2007 at 03:03:57PM +0000, Mel Gorman wrote:
> What is going on here? Why do arches not get to specify a 
> get_unmapped_area any more?

Lack of compiletesting beyond x86-64 in all probability.


-- wli

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [patch] rfc: introduce /dev/hugetlb
  2007-03-23 15:09   ` William Lee Irwin III
@ 2007-03-23 15:15     ` Mel Gorman
  2007-03-23 15:30       ` William Lee Irwin III
  0 siblings, 1 reply; 17+ messages in thread
From: Mel Gorman @ 2007-03-23 15:15 UTC (permalink / raw)
  To: William Lee Irwin III
  Cc: Ken Chen, Adam Litke, Andrew Morton, Arjan van de Ven,
	Christoph Hellwig, linux-mm, linux-kernel

On Fri, 23 Mar 2007, William Lee Irwin III wrote:

> On Fri, 23 Mar 2007, Ken Chen wrote:
>>> -#ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
>>> -unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long
>>> addr,
>>> -		unsigned long len, unsigned long pgoff, unsigned long flags);
>>> -#else
>>> -static unsigned long
>>> +unsigned long
>>> hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
>>> 		unsigned long len, unsigned long pgoff, unsigned long flags)
>>
> On Fri, Mar 23, 2007 at 03:03:57PM +0000, Mel Gorman wrote:
>> What is going on here? Why do arches not get to specify a
>> get_unmapped_area any more?
>
> Lack of compiletesting beyond x86-64 in all probability.
>

Ok, this will go kablamo on Power then even if it compiles. I don't 
consider it a fundamental problem though. For the purposes of an RFC, it's 
grand and something that can be worked with.

-- 
Mel Gorman
Part-time Phd Student                          Linux Technology Center
University of Limerick                         IBM Dublin Software Lab

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [patch] rfc: introduce /dev/hugetlb
  2007-03-23 15:15     ` Mel Gorman
@ 2007-03-23 15:30       ` William Lee Irwin III
  2007-03-23 22:04         ` Ken Chen
  0 siblings, 1 reply; 17+ messages in thread
From: William Lee Irwin III @ 2007-03-23 15:30 UTC (permalink / raw)
  To: Mel Gorman
  Cc: Ken Chen, Adam Litke, Andrew Morton, Arjan van de Ven,
	Christoph Hellwig, linux-mm, linux-kernel

On Fri, 23 Mar 2007, William Lee Irwin III wrote:
>> Lack of compiletesting beyond x86-64 in all probability.

On Fri, Mar 23, 2007 at 03:15:55PM +0000, Mel Gorman wrote:
> Ok, this will go kablamo on Power then even if it compiles. I don't 
> consider it a fundamental problem though. For the purposes of an RFC, it's 
> grand and something that can be worked with.

He needs to un-#ifdef the prototype (which he already does), but he
needs to leave the definition under #ifdef while removing the static
qualifier. A relatively minor fixup.


-- wli

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [patch] rfc: introduce /dev/hugetlb
  2007-03-23  8:44 [patch] rfc: introduce /dev/hugetlb Ken Chen
  2007-03-23 15:03 ` William Lee Irwin III
  2007-03-23 15:03 ` Mel Gorman
@ 2007-03-23 21:08 ` Benjamin Herrenschmidt
  2007-03-24  4:58 ` Andrew Morton
  3 siblings, 0 replies; 17+ messages in thread
From: Benjamin Herrenschmidt @ 2007-03-23 21:08 UTC (permalink / raw)
  To: Ken Chen
  Cc: Adam Litke, Andrew Morton, Arjan van de Ven,
	William Lee Irwin III, Christoph Hellwig, linux-mm, linux-kernel


> -#ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
> -unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
> -		unsigned long len, unsigned long pgoff, unsigned long flags);
> -#else
> -static unsigned long
> +unsigned long
>  hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
>  		unsigned long len, unsigned long pgoff, unsigned long flags)
>  {
> @@ -150,7 +145,6 @@ full_search:
>  		addr = ALIGN(vma->vm_end, HPAGE_SIZE);
>  	}
>  }
> -#endif

WTF ? get_unmapped_area() -has- to be arch in some platforms like
power...

I'm trying to improve the whole get_unmapped_area() to better handle
multiple constraints (cacheability, page size, ...) though I haven't
quite yet settled on an interface I like.

Ben.



^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [patch] rfc: introduce /dev/hugetlb
  2007-03-23 15:03 ` William Lee Irwin III
@ 2007-03-23 21:56   ` Ken Chen
  0 siblings, 0 replies; 17+ messages in thread
From: Ken Chen @ 2007-03-23 21:56 UTC (permalink / raw)
  To: William Lee Irwin III
  Cc: Adam Litke, Andrew Morton, Arjan van de Ven, Christoph Hellwig,
	linux-mm, linux-kernel

On 3/23/07, William Lee Irwin III <wli@holomorphy.com> wrote:
> I like this patch a lot, though I'm not likely to get around to testing
> it today. If userspace testcode is available that would be great to see
> posted so I can just boot into things and run that.

Here is the test code that I used:
(warning: x86 centric)

#include <stdlib.h>
#include <stdio.h>
#include <fcntl.h>
#include <sys/mman.h>

#define SIZE	(4*1024*1024UL)

/*
 * Smoke test for the proposed /dev/hugetlb character device.
 *
 * Opens the device, maps SIZE bytes of it MAP_PRIVATE, then writes one byte
 * every 4096 bytes across the whole mapping.
 *
 * Exit status: 0 on success, 1 if open() fails, 2 if mmap() fails.
 */
int main(void)
{
	int fd;
	size_t i;	/* unsigned, so the comparison against SIZE is not mixed-sign */
	char *addr;

	fd = open("/dev/hugetlb", O_RDWR);
	if (fd == -1) {
		perror("open failure");
		exit(1);
	}

	/* No address hint: let the kernel choose where to place the mapping. */
	addr = mmap(NULL, SIZE, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
	if (addr == MAP_FAILED) {
		perror("mmap failure");
		exit(2);
	}

	/* Store to the mapping at a 4096-byte stride, from offset 0 up to SIZE. */
	for (i = 0; i < SIZE; i += 4096)
		addr[i] = 1;

	printf("success!\n");

	/* Explicit status rather than relying on C99's implicit return from main. */
	return 0;
}

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [patch] rfc: introduce /dev/hugetlb
  2007-03-23 15:30       ` William Lee Irwin III
@ 2007-03-23 22:04         ` Ken Chen
  0 siblings, 0 replies; 17+ messages in thread
From: Ken Chen @ 2007-03-23 22:04 UTC (permalink / raw)
  To: William Lee Irwin III
  Cc: Mel Gorman, Adam Litke, Andrew Morton, Arjan van de Ven,
	Christoph Hellwig, linux-mm, linux-kernel,
	Benjamin Herrenschmidt

On 3/23/07, William Lee Irwin III <wli@holomorphy.com> wrote:
> On Fri, 23 Mar 2007, William Lee Irwin III wrote:
> >> Lack of compiletesting beyond x86-64 in all probability.
>
> On Fri, Mar 23, 2007 at 03:15:55PM +0000, Mel Gorman wrote:
> > Ok, this will go kablamo on Power then even if it compiles. I don't
> > consider it a fundamental problem though. For the purposes of an RFC, it's
> > grand and something that can be worked with.
>
> He needs to un-#ifdef the prototype (which he already does), but he
> needs to leave the definition under #ifdef while removing the static
> qualifier. A relatively minor fixup.

Yes, sorry about that for lack of access to non-x86-64 machines.  I
needed to move the function prototype to hugetlb.h and evidently
removed the #ifdef by mistake.  I'm not going to touch this in my next
clean up patch, instead I will just declare char specific
file_operations struct in hugetlbfs and then have char device
reference it.

But nevertheless, hugetlb_get_unmapped_area function prototype  better
be in a header file somewhere.

- Ken

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [patch] rfc: introduce /dev/hugetlb
  2007-03-23  8:44 [patch] rfc: introduce /dev/hugetlb Ken Chen
                   ` (2 preceding siblings ...)
  2007-03-23 21:08 ` Benjamin Herrenschmidt
@ 2007-03-24  4:58 ` Andrew Morton
  2007-03-24  5:32   ` Nish Aravamudan
  2007-03-25 10:22   ` Arjan van de Ven
  3 siblings, 2 replies; 17+ messages in thread
From: Andrew Morton @ 2007-03-24  4:58 UTC (permalink / raw)
  To: Ken Chen
  Cc: Adam Litke, Arjan van de Ven, William Lee Irwin III,
	Christoph Hellwig, linux-mm, linux-kernel

On Fri, 23 Mar 2007 01:44:38 -0700 "Ken Chen" <kenchen@google.com> wrote:

> On 3/21/07, Adam Litke <agl@us.ibm.com> wrote:
> > The main reason I am advocating a set of pagetable_operations is to
> > enable the development of a new hugetlb interface.  During the hugetlb
> > BOFS at OLS last year, we talked about a character device that would
> > behave like /dev/zero.  Many of the people were talking about how they
> > just wanted to create MAP_PRIVATE hugetlb mappings without all the fuss
> > about the hugetlbfs filesystem.  /dev/zero is a familiar interface for
> > getting anonymous memory so bringing that model to huge pages would make
> > programming for anonymous huge pages easier.
> 
> I think we have enough infrastructure currently in hugetlbfs to
> implement what Adam wants for something like a /dev/hugetlb char
> device (except we can't afford to have a zero hugetlb page since it
> will be too costly on some arch).
> 
> I really like the idea of having something similar to /dev/zero for
> hugetlb page.  So I coded it up on top of existing hugetlbfs.  The
> core change is really small and half of the patch is really just
> moving things around.  I think this at least can partially fulfill the
> goal.

Standing back and looking at this...

afaict the whole reason for this work is to provide a quick-n-easy way to
get private mappings of hugetlb pages.  With the emphasis on quick-n-easy.

We can do the same with hugetlbfs, but that involves (horror) "fuss".

The way to avoid "fuss" is of course to do it once, do it properly then stick
it in a library which everyone uses.

But libraries are hard, for a number of distributional reasons.  It is
easier for us to distribute this functionality within the kernel.  In fact,
if Linus's tree included a ./userspace/libkernel/libhugetlb/ then we'd
probably provide this functionality in there.

This comes up regularly, and it's pretty sad.

Probably the kernel team should be maintaining, via existing processes, a
separate libkernel project, to fix these distributional problems.  The
advantage in this case is of course that our new hugetlb functionality
would be available to people on 2.6.18 kernels, not only on 2.6.22 and
later.

Am I wrong?

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [patch] rfc: introduce /dev/hugetlb
  2007-03-24  4:58 ` Andrew Morton
@ 2007-03-24  5:32   ` Nish Aravamudan
  2007-03-24  6:12     ` Andrew Morton
  2007-03-25 10:22   ` Arjan van de Ven
  1 sibling, 1 reply; 17+ messages in thread
From: Nish Aravamudan @ 2007-03-24  5:32 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Ken Chen, Adam Litke, Arjan van de Ven, William Lee Irwin III,
	Christoph Hellwig, linux-mm, linux-kernel

On 3/23/07, Andrew Morton <akpm@linux-foundation.org> wrote:
> On Fri, 23 Mar 2007 01:44:38 -0700 "Ken Chen" <kenchen@google.com> wrote:
>
> > On 3/21/07, Adam Litke <agl@us.ibm.com> wrote:
> > > The main reason I am advocating a set of pagetable_operations is to
> > > enable the development of a new hugetlb interface.  During the hugetlb
> > > BOFS at OLS last year, we talked about a character device that would
> > > behave like /dev/zero.  Many of the people were talking about how they
> > > just wanted to create MAP_PRIVATE hugetlb mappings without all the fuss
> > > about the hugetlbfs filesystem.  /dev/zero is a familiar interface for
> > > getting anonymous memory so bringing that model to huge pages would make
> > > programming for anonymous huge pages easier.
> >
> > I think we have enough infrastructure currently in hugetlbfs to
> > implement what Adam wants for something like a /dev/hugetlb char
> > device (except we can't afford to have a zero hugetlb page since it
> > will be too costly on some arch).
> >
> > I really like the idea of having something similar to /dev/zero for
> > hugetlb page.  So I coded it up on top of existing hugetlbfs.  The
> > core change is really small and half of the patch is really just
> > moving things around.  I think this at least can partially fulfill the
> > goal.
>
> Standing back and looking at this...
>
> afaict the whole reason for this work is to provide a quick-n-easy way to
> get private mappings of hugetlb pages.  With the emphasis on quick-n-easy.

I agree.

> We can do the same with hugetlbfs, but that involves (horror) "fuss".

Yes.

> The way to avoid "fuss" is of course to do it once, do it properly then stick
> it in a library which everyone uses.

That's sort of what libhugetlbfs
(http://sourceforge.net/projects/libhugetlbfs for stable releases,
http://libhugetlbfs.ozlabs.org/ for development snapshots/git tree) is
for; while it currently only tries to abstract/provide functionality
via hugetlbfs, that's mostly because that is the only interface
available (or was, pending some sort of char dev being merged).

> But libraries are hard, for a number of distributional reasons.  It is
> easier for us to distribute this functionality within the kernel.  In fact,
> if Linus's tree included a ./userspace/libkernel/libhugetlb/ then we'd
> probably provide this functionality in there.

libhugetlbfs is available for both SLES10 and RHEL5.

> This comes up regularly, and it's pretty sad.

I agree. There is simply some functionality that is *very* closely
tied to the kernel.

> Probably the kernel team should be maintaining, via existing processes, a
> separate libkernel project, to fix these distributional problems.  The
> advantage in this case is of course that our new hugetlb functionality
> would be available to people on 2.6.18 kernels, not only on 2.6.22 and
> later.

That sounds like a good idea. For this hugetlb stuff, though, I plan
on simply taking advantage of /dev/hugetlb (or whatever it is called)
if it exists, and otherwise falling back to hugetlbfs (which
admittedly requires some admin intervention (mounting hugetlbfs,
permissions, and such), but then again, so does using hugepages in the
first place (either at boot-time or via /proc/sys/vm/nr_hugepages)).
Is that what you mean by available to 2.6.18 (falling back to
hugetlbfs) and 2.6.22 (using the chardev)?

> Am I wrong?

I don't think so. And hugepages are hard enough to use (and with
enough architecture specific quirks) that it was worth creating
libhugetlbfs. While having some nice features like segment remapping
and overriding malloc, it is also meant to provide an API that is
useful for general use of hugepages: we currently export
gethugepagesize(), hugetlbfs_test_path() (verify a path is a valid
hugetlbfs mount), hugetlbfs_find_path() (gives you the hugetlbfs
mount) and hugetlbfs_unlinked_fd() (gives you an unlinked file in the
hugetlbfs mount).

Then again, maybe I'm missing some much bigger picture here and you
meant something completely different -- sorry for the noise in that
case :/

Thanks,
Nish

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [patch] rfc: introduce /dev/hugetlb
  2007-03-24  5:32   ` Nish Aravamudan
@ 2007-03-24  6:12     ` Andrew Morton
  2007-03-24  6:57       ` Sam Ravnborg
  2007-03-24  7:11       ` Ken Chen
  0 siblings, 2 replies; 17+ messages in thread
From: Andrew Morton @ 2007-03-24  6:12 UTC (permalink / raw)
  To: Nish Aravamudan
  Cc: Ken Chen, Adam Litke, Arjan van de Ven, William Lee Irwin III,
	Christoph Hellwig, linux-mm, linux-kernel

On Fri, 23 Mar 2007 22:32:31 -0700 "Nish Aravamudan" <nish.aravamudan@gmail.com> wrote:

> > Probably the kernel team should be maintaining, via existing processes, a
> > separate libkernel project, to fix these distributional problems.  The
> > advantage in this case is of course that our new hugetlb functionality
> > would be available to people on 2.6.18 kernels, not only on 2.6.22 and
> > later.
> 
> That sounds like a good idea. For this hugetlb stuff, though, I plan
> on simply taking advantage of /dev/hugetlb (or whatever it is called)
> if it exists, and otherwise falling back to hugetlbfs (which
> admittedly requires some admin intervention (mounting hugetlbfs,
> permissions, and such), but then again, so does using hugepages in the
> first place (either at boot-time or via /proc/sys/vm/nr_hugepages)).
> Is that what you mean by available to 2.6.18 (falling back to
> hugetlbfs) and 2.6.22 (using the chardev)?

My point is:

a) Ken observes that obtaining private hugetlb memory via hugetlbfs
   involves "fuss".

b) the libhugetlbfs maintainers then go off and implement a no-fuss way of
   doing this.

c) voila, people can now use the new no-fuss interface on older kernels.
   Whereas Ken's kernel patch would require that they upgrade to a new
   kernel.

It wasn't a very big point ;) I'm assuming that users find that upgrading
libhugetlb is less costly than upgrading their kernel.


> > Am I wrong?
> 
> I don't think so. And hugepages are hard enough to use (and with
> enough architecture specific quirks) that it was worth creating
> libhugetlbfs. While having some nice features like segment remapping
> and overriding malloc, it is also meant to provide an API that is
> useful for general use of hugepages: we currently export
> gethugepagesize(), hugetlbfs_test_path() (verify a path is a valid
> hugetlbfs mount), hugetlbfs_find_path() (gives you the hugetlbfs
> mount) and hugetlbfs_unlinked_fd() (gives you an unlinked file in the
> hugetlbfs mount).
> 
> Then again, maybe I'm missing some much bigger picture here and you
> meant something completely different -- sorry for the noise in that
> case :/

You got it.

The fact that a kernel interface is "hard to use" really shouldn't be an
issue for us, because that hardness can be addressed in libraries.  Kernel
interfaces should be good, and complete, and maintainable, and etcetera. 
If that means that they end up hard to use, well, that's not necessarily a
bad thing.  I'm not sure that in all cases we want to be optimising for
ease-of-use just because libraries-are-hard.


But for non-programming reasons, we're just not there yet: people want to
program direct to the kernel interfaces simply because of the
distribution/coordination problems with libraries.  It would be nice to fix
that problem.


For a counter-example, look at futexes.  Their kernel interfaces are
*damned* hard to use.  But practically nobody is affected by that because
glibc solved the problem and programmers just use the pthread API.

More of this, please ;)


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [patch] rfc: introduce /dev/hugetlb
  2007-03-24  6:12     ` Andrew Morton
@ 2007-03-24  6:57       ` Sam Ravnborg
  2007-03-24  7:41         ` Andrew Morton
  2007-03-24  7:11       ` Ken Chen
  1 sibling, 1 reply; 17+ messages in thread
From: Sam Ravnborg @ 2007-03-24  6:57 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Nish Aravamudan, Ken Chen, Adam Litke, Arjan van de Ven,
	William Lee Irwin III, Christoph Hellwig, linux-mm, linux-kernel

> 
> But for non-programming reasons, we're just not there yet: people want to
> program direct to the kernel interfaces simply because of the
> distribution/coordination problems with libraries.  It would be nice to fix
> that problem.

What is then needed to get a small subset of user-space in the kernel-development cycle?
Maybe a topic worth to take up at LKS...

The build system is anyway ready but that's the smallest issue of all :-(

	Sam

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [patch] rfc: introduce /dev/hugetlb
  2007-03-24  6:12     ` Andrew Morton
  2007-03-24  6:57       ` Sam Ravnborg
@ 2007-03-24  7:11       ` Ken Chen
  2007-03-24  7:39         ` Andrew Morton
  1 sibling, 1 reply; 17+ messages in thread
From: Ken Chen @ 2007-03-24  7:11 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Nish Aravamudan, Adam Litke, Arjan van de Ven,
	William Lee Irwin III, Christoph Hellwig, linux-mm, linux-kernel

On 3/23/07, Andrew Morton <akpm@linux-foundation.org> wrote:
> a) Ken observes that obtaining private hugetlb memory via hugetlbfs
>    involves "fuss".
>
> b) the libhugetlbfs maintainers then go off and implement a no-fuss way of
>    doing this.

Hmm, what started this thread was libhugetlbfs maintainer complained
how "fuss" it was to create private hugetlb mapping and suggested an
even bigger kernel change with pagetable_operations API.  The new API
was designed with an end goal of introduce /dev/hugetlb (as one of the
feature, they might be thinking more).  What motivated me here is to
point out that we can achieve the same goal of having a /dev/hugetlb
with existing hugetlbfs infrastructure and the implementation is
relatively straightforward.  What it also buys us is a bit more
flexibility to the end user who wants to use the interface directly.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [patch] rfc: introduce /dev/hugetlb
  2007-03-24  7:11       ` Ken Chen
@ 2007-03-24  7:39         ` Andrew Morton
  0 siblings, 0 replies; 17+ messages in thread
From: Andrew Morton @ 2007-03-24  7:39 UTC (permalink / raw)
  To: Ken Chen
  Cc: Nish Aravamudan, Adam Litke, Arjan van de Ven,
	William Lee Irwin III, Christoph Hellwig, linux-mm, linux-kernel

On Sat, 24 Mar 2007 00:11:32 -0700 "Ken Chen" <kenchen@google.com> wrote:

> On 3/23/07, Andrew Morton <akpm@linux-foundation.org> wrote:
> > a) Ken observes that obtaining private hugetlb memory via hugetlbfs
> >    involves "fuss".
> >
> > b) the libhugetlbfs maintainers then go off and implement a no-fuss way of
> >    doing this.
> 
> Hmm, what started this thread was libhugetlbfs maintainer complained
> how "fuss" it was to create private hugetlb mapping and suggested an
> even bigger kernel change with pagetable_operations API.

OK.  I wasn't paying particularly close attention.  But my rant still
stands ;)

>  The new API
> was designed with an end goal of introduce /dev/hugetlb (as one of the
> feature, they might be thinking more).  What motivated me here is to
> point out that we can achieve the same goal of having a /dev/hugetlb
> with existing hugetlbfs infrastructure and the implementation is
> relatively straightforward.  What it also buys us is a bit more
> flexibility to the end user who wants to use the interface directly.

OK.

Why is it a "fuss" to do this with hugetlbfs files, btw?

Having read back through the thread, the only substantiation I can really
see is

  The pagetable_operations API opens up possibilities to do some
  additional (and completely sane) things.  For example, I have a patch
  that alters the character device code below to make use of a hugetlb
  ZERO_PAGE.  This eliminates almost all the up-front fault time, allowing
  pages to be COW'ed only when first written to.  We cannot do things like
  this with hugetlbfs anymore because we have a set of complex semantics to
  preserve.


Why is this actually a useful feature?

What does "complex semantics to preserve" mean?


I dunno.  I see a lot of code flying around, but comparatively little
effort to describe the actual problems which we're trying to solve.


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [patch] rfc: introduce /dev/hugetlb
  2007-03-24  6:57       ` Sam Ravnborg
@ 2007-03-24  7:41         ` Andrew Morton
  0 siblings, 0 replies; 17+ messages in thread
From: Andrew Morton @ 2007-03-24  7:41 UTC (permalink / raw)
  To: Sam Ravnborg
  Cc: Nish Aravamudan, Ken Chen, Adam Litke, Arjan van de Ven,
	William Lee Irwin III, Christoph Hellwig, linux-mm, linux-kernel

On Sat, 24 Mar 2007 07:57:52 +0100 Sam Ravnborg <sam@ravnborg.org> wrote:

> > 
> > But for non-programming reasons, we're just not there yet: people want to
> > program direct to the kernel interfaces simply because of the
> > distribution/coordination problems with libraries.  It would be nice to fix
> > that problem.
> 
> What is then needed to get a small subset of user-space in the kernel-development cycle?

Someone to lead the work, mainly.  It would be a large effort, a lot of
time and email traffic.

> Maybe a topic worth to take up at LKS...

Well, perhaps.  But unless someone with suitable experience has enough time
and energy to spare to make it happen, it won't be happening.


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [patch] rfc: introduce /dev/hugetlb
  2007-03-24  4:58 ` Andrew Morton
  2007-03-24  5:32   ` Nish Aravamudan
@ 2007-03-25 10:22   ` Arjan van de Ven
  1 sibling, 0 replies; 17+ messages in thread
From: Arjan van de Ven @ 2007-03-25 10:22 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Ken Chen, Adam Litke, William Lee Irwin III, Christoph Hellwig,
	linux-mm, linux-kernel


> But libraries are hard, for a number of distributional reasons.  

I don't see why this is the case to be honest.
You can ask distros to ship your library, and if it's a sensible one,
they will. And if you can't wait, you can always bundle the library with
your application, it's really not a big deal to do that properly.

That's not a reason to make it a harder problem by tying a library to
the kernel source... in fact I know enterprise distros are more likely
to uprev a library than to uprev a kernel.... tying them together you
get the worst of both worlds....

-- 
if you want to mail me at work (you don't), use arjan (at) linux.intel.com
Test the interaction between Linux and your BIOS via http://www.linuxfirmwarekit.org


^ permalink raw reply	[flat|nested] 17+ messages in thread

end of thread, other threads:[~2007-03-25 10:23 UTC | newest]

Thread overview: 17+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2007-03-23  8:44 [patch] rfc: introduce /dev/hugetlb Ken Chen
2007-03-23 15:03 ` William Lee Irwin III
2007-03-23 21:56   ` Ken Chen
2007-03-23 15:03 ` Mel Gorman
2007-03-23 15:09   ` William Lee Irwin III
2007-03-23 15:15     ` Mel Gorman
2007-03-23 15:30       ` William Lee Irwin III
2007-03-23 22:04         ` Ken Chen
2007-03-23 21:08 ` Benjamin Herrenschmidt
2007-03-24  4:58 ` Andrew Morton
2007-03-24  5:32   ` Nish Aravamudan
2007-03-24  6:12     ` Andrew Morton
2007-03-24  6:57       ` Sam Ravnborg
2007-03-24  7:41         ` Andrew Morton
2007-03-24  7:11       ` Ken Chen
2007-03-24  7:39         ` Andrew Morton
2007-03-25 10:22   ` Arjan van de Ven

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).