LKML Archive on lore.kernel.org
help / color / mirror / Atom feed
* READDIRPLUS max mount option
@ 2008-03-07 19:37 Kyle Rose
  2008-03-07 19:59 ` Trond Myklebust
  0 siblings, 1 reply; 6+ messages in thread
From: Kyle Rose @ 2008-03-07 19:37 UTC (permalink / raw)
  To: Linux Kernel Mailing List

[-- Attachment #1: Type: text/plain, Size: 653 bytes --]

I have a very specific use for an NFS mount over a WAN, and allowing for 
much larger expected READDIRPLUS requests actually improves performance 
by at least a factor of 10 by eliminating the round-trip latency that 
results from the application's single-threaded 
readdir/stat/stat/stat/... behavior.  Rather than maintain a hacked 
kernel on my end, I'd rather the READDIRPLUS limit be a mount option.  
Hence, the following patch.  It defaults to the old behavior 
(8*PAGE_SIZE), but with a properly-prepared mount binary will allow the 
client to specify a limit.

I'm not subscribed to the list, so please CC me in any relevant discussion.

Kyle


[-- Attachment #2: readdirplusmax.patch --]
[-- Type: text/x-patch, Size: 10063 bytes --]

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index c5c0175..97cb997 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -675,6 +675,8 @@ static int nfs_init_server(struct nfs_server *server,
 	server->acdirmin = data->acdirmin * HZ;
 	server->acdirmax = data->acdirmax * HZ;
 
+	server->readdirplusmax = data->readdirplusmax;
+
 	/* Start lockd here, before we might error out */
 	error = nfs_start_lockd(server);
 	if (error < 0)
@@ -806,6 +808,7 @@ static void nfs_server_copy_userdata(struct nfs_server *target, struct nfs_serve
 	target->acregmax = source->acregmax;
 	target->acdirmin = source->acdirmin;
 	target->acdirmax = source->acdirmax;
+	target->readdirplusmax = source->readdirplusmax;
 	target->caps = source->caps;
 }
 
@@ -1062,6 +1065,8 @@ static int nfs4_init_server(struct nfs_server *server,
 	server->acdirmin = data->acdirmin * HZ;
 	server->acdirmax = data->acdirmax * HZ;
 
+	server->readdirplusmax = data->readdirplusmax;
+
 	error = nfs_init_server_rpcclient(server, &timeparms, data->auth_flavors[0]);
 
 error:
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 966a885..644239f 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -234,9 +234,6 @@ nfs_init_locked(struct inode *inode, void *opaque)
 	return 0;
 }
 
-/* Don't use READDIRPLUS on directories that we believe are too large */
-#define NFS_LIMIT_READDIRPLUS (8*PAGE_SIZE)
-
 /*
  * This is our front-end to iget that looks up inodes by file handle
  * instead of inode number.
@@ -290,7 +287,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
 			inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->dir_inode_ops;
 			inode->i_fop = &nfs_dir_operations;
 			if (nfs_server_capable(inode, NFS_CAP_READDIRPLUS)
-			    && fattr->size <= NFS_LIMIT_READDIRPLUS)
+			    && (NFS_READDIRPLUS_MAX(inode) == 0 || fattr->size <= NFS_READDIRPLUS_MAX(inode)))
 				set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags);
 			/* Deal with crossing mountpoints */
 			if (!nfs_fsid_equal(&NFS_SB(sb)->fsid, &fattr->fsid)) {
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 9319927..91426a0 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -36,6 +36,7 @@ struct nfs_parsed_mount_data {
 	int			timeo, retrans;
 	int			acregmin, acregmax,
 				acdirmin, acdirmax;
+	int			readdirplusmax;
 	int			namlen;
 	unsigned int		bsize;
 	unsigned int		auth_flavor_len;
diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index 531379d..c5a9eb5 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -119,7 +119,7 @@ static int mount_port __initdata = 0;		/* Mount daemon port number */
 enum {
 	/* Options that take integer arguments */
 	Opt_port, Opt_rsize, Opt_wsize, Opt_timeo, Opt_retrans, Opt_acregmin,
-	Opt_acregmax, Opt_acdirmin, Opt_acdirmax,
+	Opt_acregmax, Opt_acdirmin, Opt_acdirmax, Opt_readdirplusmax,
 	/* Options that take no arguments */
 	Opt_soft, Opt_hard, Opt_intr,
 	Opt_nointr, Opt_posix, Opt_noposix, Opt_cto, Opt_nocto, Opt_ac, 
@@ -139,6 +139,7 @@ static match_table_t __initdata tokens = {
 	{Opt_acregmax, "acregmax=%u"},
 	{Opt_acdirmin, "acdirmin=%u"},
 	{Opt_acdirmax, "acdirmax=%u"},
+	{Opt_readdirplusmax, "readdirplusmax=%u"},
 	{Opt_soft, "soft"},
 	{Opt_hard, "hard"},
 	{Opt_intr, "intr"},
@@ -221,6 +222,9 @@ static int __init root_nfs_parse(char *name, char *buf)
 			case Opt_acdirmax:
 				nfs_data.acdirmax = option;
 				break;
+			case Opt_readdirplusmax:
+				nfs_data.readdirplusmax = option;
+				break;
 			case Opt_soft:
 				nfs_data.flags |= NFS_MOUNT_SOFT;
 				break;
@@ -292,15 +296,17 @@ static int __init root_nfs_name(char *name)
 
 	/* Set some default values */
 	memset(&nfs_data, 0, sizeof(nfs_data));
-	nfs_port          = -1;
-	nfs_data.version  = NFS_MOUNT_VERSION;
-	nfs_data.flags    = NFS_MOUNT_NONLM;	/* No lockd in nfs root yet */
-	nfs_data.rsize    = NFS_DEF_FILE_IO_SIZE;
-	nfs_data.wsize    = NFS_DEF_FILE_IO_SIZE;
-	nfs_data.acregmin = 3;
-	nfs_data.acregmax = 60;
-	nfs_data.acdirmin = 30;
-	nfs_data.acdirmax = 60;
+	nfs_port                = -1;
+	nfs_data.version        = NFS_MOUNT_VERSION;
+	nfs_data.flags          = NFS_MOUNT_NONLM;	/* No lockd in nfs root yet */
+	nfs_data.rsize          = NFS_DEF_FILE_IO_SIZE;
+	nfs_data.wsize          = NFS_DEF_FILE_IO_SIZE;
+	nfs_data.acregmin       = 3;
+	nfs_data.acregmax       = 60;
+	nfs_data.acdirmin       = 30;
+	nfs_data.acdirmax       = 60;
+	nfs_data.readdirplusmax = 8*PAGE_SIZE;
+
 	strcpy(buf, NFS_ROOT);
 
 	/* Process options received from the remote server */
@@ -348,6 +354,8 @@ static void __init root_nfs_print(void)
 	printk(KERN_NOTICE "Root-NFS:     acreg (min,max) = (%d,%d), acdir (min,max) = (%d,%d)\n",
 		nfs_data.acregmin, nfs_data.acregmax,
 		nfs_data.acdirmin, nfs_data.acdirmax);
+	printk(KERN_NOTICE "Root-NFS:     readdirplusmax = %d\n",
+		nfs_data.readdirplusmax);
 	printk(KERN_NOTICE "Root-NFS:     nfsd port = %d, mountd port = %d, flags = %08x\n",
 		nfs_port, mount_port, nfs_data.flags);
 }
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index fcf4b98..87dfdfb 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -83,6 +83,7 @@ enum {
 	Opt_acregmin, Opt_acregmax,
 	Opt_acdirmin, Opt_acdirmax,
 	Opt_actimeo,
+	Opt_readdirplusmax,
 	Opt_namelen,
 	Opt_mountport,
 	Opt_mountvers,
@@ -136,6 +137,7 @@ static match_table_t nfs_mount_option_tokens = {
 	{ Opt_acdirmin, "acdirmin=%u" },
 	{ Opt_acdirmax, "acdirmax=%u" },
 	{ Opt_actimeo, "actimeo=%u" },
+	{ Opt_readdirplusmax, "readdirplusmax=%u" },
 	{ Opt_userspace, "retry=%u" },
 	{ Opt_namelen, "namlen=%u" },
 	{ Opt_mountport, "mountport=%u" },
@@ -474,6 +476,7 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
 		seq_printf(m, ",acdirmin=%d", nfss->acdirmin/HZ);
 	if (nfss->acdirmax != 60*HZ || showdefaults)
 		seq_printf(m, ",acdirmax=%d", nfss->acdirmax/HZ);
+	seq_printf(m, ",readdirplusmax=%d", nfss->readdirplusmax);
 	for (nfs_infop = nfs_info; nfs_infop->flag; nfs_infop++) {
 		if (nfss->flags & nfs_infop->flag)
 			seq_puts(m, nfs_infop->str);
@@ -851,6 +854,10 @@ static int nfs_parse_mount_options(char *raw,
 			mnt->acdirmin =
 			mnt->acdirmax = option;
 			break;
+		case Opt_readdirplusmax:
+			if (match_int(args, &mnt->readdirplusmax))
+				return 0;
+			break;
 		case Opt_namelen:
 			if (match_int(args, &mnt->namlen))
 				return 0;
@@ -1190,6 +1197,8 @@ static int nfs_validate_mount_data(void *options,
 	case 5:
 		memset(data->context, 0, sizeof(data->context));
 	case 6:
+		data->readdirplusmax = 8*PAGE_SIZE;
+	case 7:
 		if (data->flags & NFS_MOUNT_VER3)
 			mntfh->size = data->root.size;
 		else
@@ -1217,6 +1226,7 @@ static int nfs_validate_mount_data(void *options,
 		args->acregmax		= data->acregmax;
 		args->acdirmin		= data->acdirmin;
 		args->acdirmax		= data->acdirmax;
+		args->readdirplusmax	= data->readdirplusmax;
 
 		memcpy(&args->nfs_server.address, &data->addr,
 		       sizeof(data->addr));
@@ -1428,6 +1438,8 @@ static int nfs_compare_mount_options(const struct super_block *s, const struct n
 		goto Ebusy;
 	if (a->acdirmax != b->acdirmax)
 		goto Ebusy;
+	if (a->readdirplusmax != b->readdirplusmax)
+		goto Ebusy;
 	if (clnt_a->cl_auth->au_flavor != clnt_b->cl_auth->au_flavor)
 		goto Ebusy;
 	return 1;
@@ -1757,6 +1769,8 @@ static int nfs4_validate_mount_data(void *options,
 
 	switch (data->version) {
 	case 1:
+		data->readdirplusmax = 8*PAGE_SIZE;
+	case 2:
 		ap = (struct sockaddr_in *)&args->nfs_server.address;
 		if (data->host_addrlen > sizeof(args->nfs_server.address))
 			goto out_no_address;
@@ -1816,6 +1830,7 @@ static int nfs4_validate_mount_data(void *options,
 		args->acregmax	= data->acregmax;
 		args->acdirmin	= data->acdirmin;
 		args->acdirmax	= data->acdirmax;
+		args->readdirplusmax = data->readdirplusmax;
 		args->nfs_server.protocol = data->proto;
 
 		break;
diff --git a/include/linux/nfs4_mount.h b/include/linux/nfs4_mount.h
index a0dcf66..ee9c21d 100644
--- a/include/linux/nfs4_mount.h
+++ b/include/linux/nfs4_mount.h
@@ -16,7 +16,7 @@
  * mount-to-kernel version compatibility.  Some of these aren't used yet
  * but here they are anyway.
  */
-#define NFS4_MOUNT_VERSION	1
+#define NFS4_MOUNT_VERSION	2
 
 struct nfs_string {
 	unsigned int len;
@@ -53,6 +53,8 @@ struct nfs4_mount_data {
 	/* Pseudo-flavours to use for authentication. See RFC2623 */
 	int auth_flavourlen;			/* 1 */
 	int __user *auth_flavours;		/* 1 */
+
+	int readdirplusmax;			/* 2 */
 };
 
 /* bits in the flags field */
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index a69ba80..d7401b4 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -274,6 +274,11 @@ static inline int nfs_server_capable(struct inode *inode, int cap)
 	return NFS_SERVER(inode)->caps & cap;
 }
 
+static inline unsigned int NFS_READDIRPLUS_MAX(struct inode *inode)
+{
+	return NFS_SERVER(inode)->readdirplusmax;
+}
+
 static inline int NFS_USE_READDIRPLUS(struct inode *inode)
 {
 	return test_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags);
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index 3423c67..d596065 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -99,6 +99,8 @@ struct nfs_server {
 	unsigned int		acdirmin;
 	unsigned int		acdirmax;
 	unsigned int		namelen;
+	unsigned int		readdirplusmax;	/* max believed dir size for
+						   READDIRPLUS */
 
 	struct nfs_fsid		fsid;
 	__u64			maxfilesize;	/* maximum file size */
diff --git a/include/linux/nfs_mount.h b/include/linux/nfs_mount.h
index df7c6b7..686d185 100644
--- a/include/linux/nfs_mount.h
+++ b/include/linux/nfs_mount.h
@@ -20,7 +20,7 @@
  * mount-to-kernel version compatibility.  Some of these aren't used yet
  * but here they are anyway.
  */
-#define NFS_MOUNT_VERSION	6
+#define NFS_MOUNT_VERSION	7
 #define NFS_MAX_CONTEXT_LEN	256
 
 struct nfs_mount_data {
@@ -43,6 +43,7 @@ struct nfs_mount_data {
 	struct nfs3_fh	root;			/* 4 */
 	int		pseudoflavor;		/* 5 */
 	char		context[NFS_MAX_CONTEXT_LEN + 1];	/* 6 */
+	int		readdirplusmax;		/* 7 */
 };
 
 /* bits in the flags field */

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: READDIRPLUS max mount option
  2008-03-07 19:37 READDIRPLUS max mount option Kyle Rose
@ 2008-03-07 19:59 ` Trond Myklebust
  2008-03-07 20:09   ` Kyle Rose
  0 siblings, 1 reply; 6+ messages in thread
From: Trond Myklebust @ 2008-03-07 19:59 UTC (permalink / raw)
  To: Kyle Rose; +Cc: Linux Kernel Mailing List, linux-nfs


On Fri, 2008-03-07 at 14:37 -0500, Kyle Rose wrote:
> I have a very specific use for an NFS mount over a WAN, and allowing for 
> much larger expected READDIRPLUS requests actually improves performance 
> by at least a factor of 10 by eliminating the round-trip latency that 
> results from the application's single-threaded 
> readdir/stat/stat/stat/... behavior.  Rather than maintain a hacked 
> kernel on my end, I'd rather the READDIRPLUS limit be a mount option.  
> Hence, the following patch.  It defaults to the old behavior 
> (8*PAGE_SIZE), but with a properly-prepared mount binary will allow the 
> client to specify a limit.
> 
> I'm not subscribed to the list, so please CC me in any relevant discussion.
> 
> Kyle

(adding cc to linux-nfs@vger.kernel.org)

The binary mount format is frozen forever, so the changes to nfs_mount.h
and nfs4_mount.h are definitely NACKed.

Otherwise, it would be nice to know why this absolutely has to be made a
mount option rather than just having a system-wide option (either a
module/boot parameter or a sysctl) to control the behaviour of all
mounts.

Cheers
  Trond


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: READDIRPLUS max mount option
  2008-03-07 19:59 ` Trond Myklebust
@ 2008-03-07 20:09   ` Kyle Rose
  2008-03-07 20:42     ` Trond Myklebust
  0 siblings, 1 reply; 6+ messages in thread
From: Kyle Rose @ 2008-03-07 20:09 UTC (permalink / raw)
  To: Trond Myklebust; +Cc: Linux Kernel Mailing List, linux-nfs


> The binary mount format is frozen forever, so the changes to nfs_mount.h
> and nfs4_mount.h are definitely NACKed.
>   
Ah. :-)  So is there no way to add mount options, or is there a 
different mechanism today?
> Otherwise, it would be nice to know why this absolutely has to be made a
> mount option rather than just having a system-wide option (either a
> module/boot parameter or a sysctl) to control the behaviour of all
> mounts.
>   
I mount multiple remote file systems.  Only one of them I own, so I'm 
willing to potentially hammer it with huge READDIRPLUS requests, while 
the others probably deserve more benign behavior. ;-)

In general, I think having system-wide defaults somewhere in proc is 
helpful---and certainly superior to a constant in the source---but there 
should really be mount-specific overrides wherever the system-wide 
default might not be globally appropriate.

Kyle


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: READDIRPLUS max mount option
  2008-03-07 20:09   ` Kyle Rose
@ 2008-03-07 20:42     ` Trond Myklebust
  2008-03-07 21:04       ` Kyle Rose
  0 siblings, 1 reply; 6+ messages in thread
From: Trond Myklebust @ 2008-03-07 20:42 UTC (permalink / raw)
  To: Kyle Rose; +Cc: Linux Kernel Mailing List, linux-nfs


On Fri, 2008-03-07 at 15:09 -0500, Kyle Rose wrote:
> > The binary mount format is frozen forever, so the changes to nfs_mount.h
> > and nfs4_mount.h are definitely NACKed.
> >   
> Ah. :-)  So is there no way to add mount options, or is there a 
> different mechanism today?

Newer versions of 'mount' should use the text-based interface.

> > Otherwise, it would be nice to know why this absolutely has to be made a
> > mount option rather than just having a system-wide option (either a
> > module/boot parameter or a sysctl) to control the behaviour of all
> > mounts.
> >   
> I mount multiple remote file systems.  Only one of them I own, so I'm 
> willing to potentially hammer it with huge READDIRPLUS requests, while 
> the others probably deserve more benign behavior. ;-)

The size of the actual READDIRPLUS requests is completely unaffected by
your patch. Your change actually means that the client will continue to
use READDIRPLUS on very large directories instead of falling back to
readdir.
The reason for falling back to readdir is that value of readdirplus
tends to decrease with larger directories as the cost of caching all
those dentries, attributes and filehandles both on the client and the
server goes up.

If you want a faster readdir(), you will find that splitting those huge
directories up into smaller subdirs is an alternative solution that
tends to scale much better on both client and server.

> In general, I think having system-wide defaults somewhere in proc is 
> helpful---and certainly superior to a constant in the source---but there 
> should really be mount-specific overrides wherever the system-wide 
> default might not be globally appropriate.

Having hundreds of mount options for minor tweaks is not an acceptable
practice. Each mount option needs to be abundantly justified.

Since we're talking about what is really a quite arbitrary limit, I can
certainly see an argument for why we might want a way to change it, but
I'm still not convinced that we need to be setting this parameter at the
mountpoint level.

Cheers
  Trond


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: READDIRPLUS max mount option
  2008-03-07 20:42     ` Trond Myklebust
@ 2008-03-07 21:04       ` Kyle Rose
  2008-03-09 16:16         ` Jan Engelhardt
  0 siblings, 1 reply; 6+ messages in thread
From: Kyle Rose @ 2008-03-07 21:04 UTC (permalink / raw)
  To: Trond Myklebust; +Cc: Linux Kernel Mailing List, linux-nfs


> The size of the actual READDIRPLUS requests is completely unaffected by
> your patch. Your change actually means that the client will continue to
> use READDIRPLUS on very large directories instead of falling back to
> readdir.
>   
Sorry to be imprecise.  "Size of request" should be "size of response" 
or "cost of request".  The meaning is clear, I think.
> If you want a faster readdir(), you will find that splitting those huge
> directories up into smaller subdirs is an alternative solution that
> tends to scale much better on both client and server.
>   
Agreed that this is probably the least terrible of the available 
solutions, but in my specific case it requires a more extensive 
modification to my software than the relatively minor kernel change.
> Having hundreds of mount options for minor tweaks is not an acceptable
> practice. Each mount option needs to be abundantly justified.
>   
Regarding your straw man, nobody's proposing hundreds of mount options.  
I imagine the effort required to implement each one would keep such a 
thing from happening. ;-)
> Since we're talking about what is really a quite arbitrary limit, I can
> certainly see an argument for why we might want a way to change it, but
> I'm still not convinced that we need to be setting this parameter at the
> mountpoint level.
Fair enough.  A proc entry to alter this globally would be an acceptable 
compromise for me, even if my local sysadmins might not like it.

Kyle

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: READDIRPLUS max mount option
  2008-03-07 21:04       ` Kyle Rose
@ 2008-03-09 16:16         ` Jan Engelhardt
  0 siblings, 0 replies; 6+ messages in thread
From: Jan Engelhardt @ 2008-03-09 16:16 UTC (permalink / raw)
  To: Kyle Rose; +Cc: Trond Myklebust, Linux Kernel Mailing List, linux-nfs


On Mar 7 2008 16:04, Kyle Rose wrote:
>> Since we're talking about what is really a quite arbitrary limit, I can
>> certainly see an argument for why we might want a way to change it, but
>> I'm still not convinced that we need to be setting this parameter at the
>> mountpoint level.
>
> Fair enough.  A proc entry to alter this globally would be an acceptable
> compromise for me, even if my local sysadmins might not like it.
>
sysfs, no? A module parameter is easy for a global default and is
the minimum thing to do :)

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2008-03-09 16:16 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2008-03-07 19:37 READDIRPLUS max mount option Kyle Rose
2008-03-07 19:59 ` Trond Myklebust
2008-03-07 20:09   ` Kyle Rose
2008-03-07 20:42     ` Trond Myklebust
2008-03-07 21:04       ` Kyle Rose
2008-03-09 16:16         ` Jan Engelhardt

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).