LKML Archive on lore.kernel.org
help / color / mirror / Atom feed
* [patch 1/6] mempolicy: convert MPOL constants to enum
@ 2008-03-01  0:44 David Rientjes
  2008-03-01  0:45 ` [patch 2/6] mempolicy: support optional mode flags David Rientjes
  2008-03-03  7:17 ` [patch 1/6] mempolicy: convert MPOL constants to enum Andrew Morton
  0 siblings, 2 replies; 12+ messages in thread
From: David Rientjes @ 2008-03-01  0:44 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Paul Jackson, Christoph Lameter, Lee Schermerhorn, Andi Kleen,
	linux-kernel

The mempolicy mode constants, MPOL_DEFAULT, MPOL_PREFERRED, MPOL_BIND,
and MPOL_INTERLEAVE, are better declared as part of an enum since they
are sequentially numbered and cannot be combined.

The policy member of struct mempolicy is also converted from type short
to type unsigned short.  A negative policy does not have any legitimate
meaning, so it is possible to change its type in preparation for adding
optional mode flags later.

The equivalent member of struct shmem_sb_info is also changed from int
to unsigned short.

For compatibility, the policy formal to get_mempolicy() remains as a
pointer to an int:

	int get_mempolicy(int *policy, unsigned long *nmask,
			  unsigned long maxnode, unsigned long addr,
			  unsigned long flags);

although the only possible values are within the range of type unsigned short.

Cc: Paul Jackson <pj@sgi.com>
Cc: Christoph Lameter <clameter@sgi.com>
Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: David Rientjes <rientjes@google.com>
---
 include/linux/mempolicy.h |   19 ++++++++++---------
 include/linux/shmem_fs.h  |    2 +-
 mm/mempolicy.c            |   27 ++++++++++++++++-----------
 mm/shmem.c                |    9 +++++----
 4 files changed, 32 insertions(+), 25 deletions(-)

diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -9,12 +9,13 @@
  */
 
 /* Policies */
-#define MPOL_DEFAULT	0
-#define MPOL_PREFERRED	1
-#define MPOL_BIND	2
-#define MPOL_INTERLEAVE	3
-
-#define MPOL_MAX MPOL_INTERLEAVE
+enum {
+	MPOL_DEFAULT,
+	MPOL_PREFERRED,
+	MPOL_BIND,
+	MPOL_INTERLEAVE,
+	MPOL_MAX,	/* always last member of enum */
+};
 
 /* Flags for get_mem_policy */
 #define MPOL_F_NODE	(1<<0)	/* return next IL mode instead of node mask */
@@ -62,7 +63,7 @@ struct mm_struct;
  */
 struct mempolicy {
 	atomic_t refcnt;
-	short policy; 	/* See MPOL_* above */
+	unsigned short policy; 	/* See MPOL_* above */
 	union {
 		struct zonelist  *zonelist;	/* bind */
 		short 		 preferred_node; /* preferred */
@@ -133,7 +134,7 @@ struct shared_policy {
 	spinlock_t lock;
 };
 
-void mpol_shared_policy_init(struct shared_policy *info, int policy,
+void mpol_shared_policy_init(struct shared_policy *info, unsigned short policy,
 				nodemask_t *nodes);
 int mpol_set_shared_policy(struct shared_policy *info,
 				struct vm_area_struct *vma,
@@ -200,7 +201,7 @@ static inline int mpol_set_shared_policy(struct shared_policy *info,
 }
 
 static inline void mpol_shared_policy_init(struct shared_policy *info,
-					int policy, nodemask_t *nodes)
+				unsigned short policy, nodemask_t *nodes)
 {
 }
 
diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -34,7 +34,7 @@ struct shmem_sb_info {
 	uid_t uid;		    /* Mount uid for root directory */
 	gid_t gid;		    /* Mount gid for root directory */
 	mode_t mode;		    /* Mount mode for root directory */
-	int policy;		    /* Default NUMA memory alloc policy */
+	unsigned short policy;	    /* Default NUMA memory alloc policy */
 	nodemask_t policy_nodes;    /* nodemask for preferred and bind */
 };
 
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -114,7 +114,7 @@ static void mpol_rebind_policy(struct mempolicy *pol,
                                const nodemask_t *newmask);
 
 /* Do sanity checking on a policy */
-static int mpol_check_policy(int mode, nodemask_t *nodes)
+static int mpol_check_policy(unsigned short mode, nodemask_t *nodes)
 {
 	int was_empty, is_empty;
 
@@ -159,6 +159,8 @@ static int mpol_check_policy(int mode, nodemask_t *nodes)
 		if (!was_empty && is_empty)
 			return -EINVAL;
 		break;
+	default:
+		BUG();
 	}
 	return 0;
 }
@@ -201,7 +203,7 @@ static struct zonelist *bind_zonelist(nodemask_t *nodes)
 }
 
 /* Create a new policy */
-static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
+static struct mempolicy *mpol_new(unsigned short mode, nodemask_t *nodes)
 {
 	struct mempolicy *policy;
 
@@ -235,6 +237,8 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
 			return error_code;
 		}
 		break;
+	default:
+		BUG();
 	}
 	policy->policy = mode;
 	policy->cpuset_mems_allowed = cpuset_mems_allowed(current);
@@ -479,7 +483,7 @@ static void mpol_set_task_struct_flag(void)
 }
 
 /* Set the process memory policy */
-static long do_set_mempolicy(int mode, nodemask_t *nodes)
+static long do_set_mempolicy(unsigned short mode, nodemask_t *nodes)
 {
 	struct mempolicy *new;
 
@@ -781,7 +785,7 @@ static struct page *new_vma_page(struct page *page, unsigned long private, int *
 #endif
 
 static long do_mbind(unsigned long start, unsigned long len,
-		     unsigned long mode, nodemask_t *nmask,
+		     unsigned short mode, nodemask_t *nmask,
 		     unsigned long flags)
 {
 	struct vm_area_struct *vma;
@@ -791,9 +795,8 @@ static long do_mbind(unsigned long start, unsigned long len,
 	int err;
 	LIST_HEAD(pagelist);
 
-	if ((flags & ~(unsigned long)(MPOL_MF_STRICT |
+	if (flags & ~(unsigned long)(MPOL_MF_STRICT |
 				      MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
-	    || mode > MPOL_MAX)
 		return -EINVAL;
 	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
 		return -EPERM;
@@ -826,7 +829,7 @@ static long do_mbind(unsigned long start, unsigned long len,
 	if (!new)
 		flags |= MPOL_MF_DISCONTIG_OK;
 
-	pr_debug("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
+	pr_debug("mbind %lx-%lx mode:%d nodes:%lx\n",start,start+len,
 		 mode, nmask ? nodes_addr(*nmask)[0] : -1);
 
 	down_write(&mm->mmap_sem);
@@ -927,6 +930,8 @@ asmlinkage long sys_mbind(unsigned long start, unsigned long len,
 	nodemask_t nodes;
 	int err;
 
+	if (mode >= MPOL_MAX)
+		return -EINVAL;
 	err = get_nodes(&nodes, nmask, maxnode);
 	if (err)
 		return err;
@@ -940,7 +945,7 @@ asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
 	int err;
 	nodemask_t nodes;
 
-	if (mode < 0 || mode > MPOL_MAX)
+	if (mode < 0 || mode >= MPOL_MAX)
 		return -EINVAL;
 	err = get_nodes(&nodes, nmask, maxnode);
 	if (err)
@@ -1206,7 +1211,7 @@ static unsigned interleave_nodes(struct mempolicy *policy)
  */
 unsigned slab_node(struct mempolicy *policy)
 {
-	int pol = policy ? policy->policy : MPOL_DEFAULT;
+	unsigned short pol = policy ? policy->policy : MPOL_DEFAULT;
 
 	switch (pol) {
 	case MPOL_INTERLEAVE:
@@ -1634,7 +1639,7 @@ restart:
 	return 0;
 }
 
-void mpol_shared_policy_init(struct shared_policy *info, int policy,
+void mpol_shared_policy_init(struct shared_policy *info, unsigned short policy,
 				nodemask_t *policy_nodes)
 {
 	info->root = RB_ROOT;
@@ -1853,7 +1858,7 @@ static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
 	char *p = buffer;
 	int l;
 	nodemask_t nodes;
-	int mode = pol ? pol->policy : MPOL_DEFAULT;
+	unsigned short mode = pol ? pol->policy : MPOL_DEFAULT;
 
 	switch (mode) {
 	case MPOL_DEFAULT:
diff --git a/mm/shmem.c b/mm/shmem.c
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1082,7 +1082,8 @@ redirty:
 
 #ifdef CONFIG_NUMA
 #ifdef CONFIG_TMPFS
-static int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_nodes)
+static int shmem_parse_mpol(char *value, unsigned short *policy,
+			    nodemask_t *policy_nodes)
 {
 	char *nodelist = strchr(value, ':');
 	int err = 1;
@@ -1131,7 +1132,7 @@ out:
 	return err;
 }
 
-static void shmem_show_mpol(struct seq_file *seq, int policy,
+static void shmem_show_mpol(struct seq_file *seq, unsigned short policy,
 			    const nodemask_t policy_nodes)
 {
 	char *policy_string;
@@ -1200,13 +1201,13 @@ static struct page *shmem_alloc_page(gfp_t gfp,
 }
 #else /* !CONFIG_NUMA */
 #ifdef CONFIG_TMPFS
-static inline int shmem_parse_mpol(char *value, int *policy,
+static inline int shmem_parse_mpol(char *value, unsigned short *policy,
 						nodemask_t *policy_nodes)
 {
 	return 1;
 }
 
-static inline void shmem_show_mpol(struct seq_file *seq, int policy,
+static inline void shmem_show_mpol(struct seq_file *seq, unsigned short policy,
 			    const nodemask_t policy_nodes)
 {
 }

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [patch 2/6] mempolicy: support optional mode flags
  2008-03-01  0:44 [patch 1/6] mempolicy: convert MPOL constants to enum David Rientjes
@ 2008-03-01  0:45 ` David Rientjes
  2008-03-01  0:45   ` [patch 3/6] mempolicy: add MPOL_F_STATIC_NODES flag David Rientjes
  2008-03-03  7:17 ` [patch 1/6] mempolicy: convert MPOL constants to enum Andrew Morton
  1 sibling, 1 reply; 12+ messages in thread
From: David Rientjes @ 2008-03-01  0:45 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Paul Jackson, Christoph Lameter, Lee Schermerhorn, Andi Kleen,
	linux-kernel

With the evolution of mempolicies, it is necessary to support mempolicy
mode flags that specify how the policy shall behave in certain
circumstances.  The most immediate need for mode flag support is to
suppress remapping the nodemask of a policy at the time of rebind.

Both the mempolicy mode and flags are passed by the user in the 'int
policy' formal of either the set_mempolicy() or mbind() syscall.  A new
constant, MPOL_MODE_FLAGS, represents the union of legal optional flags
that may be passed as part of this int.  Mempolicies that include illegal
flags as part of their policy are rejected as invalid.

An additional member to struct mempolicy is added to support the mode
flags:

	struct mempolicy {
		...
		unsigned short policy;
		unsigned short flags;
	}

The splitting of the 'int' actual passed by the user is done in
sys_set_mempolicy() and sys_mbind() for their respective syscalls.  This
is done by intersecting the actual with MPOL_MODE_FLAGS, rejecting the
syscall if there are additional flags, and storing it in the new 'flags'
member of struct mempolicy.  The intersection of the actual with
~MPOL_MODE_FLAGS is stored in the 'policy' member of the struct and all
current users of pol->policy remain unchanged.

The union of the policy mode and optional mode flags is passed back to
the user in get_mempolicy().

This combination of mode and flags within the same actual does not break 
userspace code that relies on get_mempolicy(&policy, ...) and either

	switch (policy) {
	case MPOL_BIND:
		...
	case MPOL_INTERLEAVE:
		...
	};

statements or

	if (policy == MPOL_INTERLEAVE) {
		...
	}

statements.  Such applications would need to use optional mode flags when
calling set_mempolicy() or mbind() for these previously implemented
statements to stop working.  If an application does start using optional
mode flags, it will need to mask the optional flags off the policy in
switch and conditional statements that only test mode.

An additional member is also added to struct shmem_sb_info to store the
optional mode flags.

Cc: Paul Jackson <pj@sgi.com>
Cc: Christoph Lameter <clameter@sgi.com>
Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: David Rientjes <rientjes@google.com>
---
 fs/hugetlbfs/inode.c      |    2 +-
 include/linux/mempolicy.h |   20 +++++++++++++++--
 include/linux/shmem_fs.h  |    1 +
 mm/mempolicy.c            |   51 +++++++++++++++++++++++++++-----------------
 mm/shmem.c                |   24 ++++++++++++++-------
 5 files changed, 66 insertions(+), 32 deletions(-)

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -504,7 +504,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid,
 		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 		INIT_LIST_HEAD(&inode->i_mapping->private_list);
 		info = HUGETLBFS_I(inode);
-		mpol_shared_policy_init(&info->policy, MPOL_DEFAULT, NULL);
+		mpol_shared_policy_init(&info->policy, MPOL_DEFAULT, 0, NULL);
 		switch (mode & S_IFMT) {
 		default:
 			init_special_inode(inode, mode, dev);
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -8,6 +8,12 @@
  * Copyright 2003,2004 Andi Kleen SuSE Labs
  */
 
+/*
+ * Both the MPOL_* mempolicy mode and the MPOL_F_* optional mode flags are
+ * passed by the user to either set_mempolicy() or mbind() in an 'int' actual.
+ * The MPOL_MODE_FLAGS macro determines the legal set of optional mode flags.
+ */
+
 /* Policies */
 enum {
 	MPOL_DEFAULT,
@@ -17,7 +23,14 @@ enum {
 	MPOL_MAX,	/* always last member of enum */
 };
 
-/* Flags for get_mem_policy */
+/* Flags for set_mempolicy */
+/*
+ * MPOL_MODE_FLAGS is the union of all possible optional mode flags passed to
+ * either set_mempolicy() or mbind().
+ */
+#define MPOL_MODE_FLAGS		(0)
+
+/* Flags for get_mempolicy */
 #define MPOL_F_NODE	(1<<0)	/* return next IL mode instead of node mask */
 #define MPOL_F_ADDR	(1<<1)	/* look up vma using address */
 #define MPOL_F_MEMS_ALLOWED (1<<2) /* return allowed memories */
@@ -64,6 +77,7 @@ struct mm_struct;
 struct mempolicy {
 	atomic_t refcnt;
 	unsigned short policy; 	/* See MPOL_* above */
+	unsigned short flags;	/* See set_mempolicy() MPOL_F_* above */
 	union {
 		struct zonelist  *zonelist;	/* bind */
 		short 		 preferred_node; /* preferred */
@@ -135,7 +149,7 @@ struct shared_policy {
 };
 
 void mpol_shared_policy_init(struct shared_policy *info, unsigned short policy,
-				nodemask_t *nodes);
+				unsigned short flags, nodemask_t *nodes);
 int mpol_set_shared_policy(struct shared_policy *info,
 				struct vm_area_struct *vma,
 				struct mempolicy *new);
@@ -201,7 +215,7 @@ static inline int mpol_set_shared_policy(struct shared_policy *info,
 }
 
 static inline void mpol_shared_policy_init(struct shared_policy *info,
-				unsigned short policy, nodemask_t *nodes)
+		unsigned short policy, unsigned short flags, nodemask_t *nodes)
 {
 }
 
diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -35,6 +35,7 @@ struct shmem_sb_info {
 	gid_t gid;		    /* Mount gid for root directory */
 	mode_t mode;		    /* Mount mode for root directory */
 	unsigned short policy;	    /* Default NUMA memory alloc policy */
+	unsigned short flags;	    /* Optional mempolicy flags */
 	nodemask_t policy_nodes;    /* nodemask for preferred and bind */
 };
 
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -203,12 +203,13 @@ static struct zonelist *bind_zonelist(nodemask_t *nodes)
 }
 
 /* Create a new policy */
-static struct mempolicy *mpol_new(unsigned short mode, nodemask_t *nodes)
+static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
+				  nodemask_t *nodes)
 {
 	struct mempolicy *policy;
 
-	pr_debug("setting mode %d nodes[0] %lx\n",
-		 mode, nodes ? nodes_addr(*nodes)[0] : -1);
+	pr_debug("setting mode %d flags %d nodes[0] %lx\n",
+		 mode, flags, nodes ? nodes_addr(*nodes)[0] : -1);
 
 	if (mode == MPOL_DEFAULT)
 		return NULL;
@@ -241,6 +242,7 @@ static struct mempolicy *mpol_new(unsigned short mode, nodemask_t *nodes)
 		BUG();
 	}
 	policy->policy = mode;
+	policy->flags = flags;
 	policy->cpuset_mems_allowed = cpuset_mems_allowed(current);
 	return policy;
 }
@@ -483,13 +485,14 @@ static void mpol_set_task_struct_flag(void)
 }
 
 /* Set the process memory policy */
-static long do_set_mempolicy(unsigned short mode, nodemask_t *nodes)
+static long do_set_mempolicy(unsigned short mode, unsigned short flags,
+			     nodemask_t *nodes)
 {
 	struct mempolicy *new;
 
 	if (mpol_check_policy(mode, nodes))
 		return -EINVAL;
-	new = mpol_new(mode, nodes);
+	new = mpol_new(mode, flags, nodes);
 	if (IS_ERR(new))
 		return PTR_ERR(new);
 	mpol_free(current->mempolicy);
@@ -595,7 +598,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
 			goto out;
 		}
 	} else
-		*policy = pol->policy;
+		*policy = pol->policy | pol->flags;
 
 	if (vma) {
 		up_read(&current->mm->mmap_sem);
@@ -785,8 +788,8 @@ static struct page *new_vma_page(struct page *page, unsigned long private, int *
 #endif
 
 static long do_mbind(unsigned long start, unsigned long len,
-		     unsigned short mode, nodemask_t *nmask,
-		     unsigned long flags)
+		     unsigned short mode, unsigned short mode_flags,
+		     nodemask_t *nmask, unsigned long flags)
 {
 	struct vm_area_struct *vma;
 	struct mm_struct *mm = current->mm;
@@ -818,7 +821,7 @@ static long do_mbind(unsigned long start, unsigned long len,
 	if (mpol_check_policy(mode, nmask))
 		return -EINVAL;
 
-	new = mpol_new(mode, nmask);
+	new = mpol_new(mode, mode_flags, nmask);
 	if (IS_ERR(new))
 		return PTR_ERR(new);
 
@@ -829,8 +832,9 @@ static long do_mbind(unsigned long start, unsigned long len,
 	if (!new)
 		flags |= MPOL_MF_DISCONTIG_OK;
 
-	pr_debug("mbind %lx-%lx mode:%d nodes:%lx\n",start,start+len,
-		 mode, nmask ? nodes_addr(*nmask)[0] : -1);
+	pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
+		 start, start + len, mode, mode_flags,
+		 nmask ? nodes_addr(*nmask)[0] : -1);
 
 	down_write(&mm->mmap_sem);
 	vma = check_range(mm, start, end, nmask,
@@ -929,13 +933,16 @@ asmlinkage long sys_mbind(unsigned long start, unsigned long len,
 {
 	nodemask_t nodes;
 	int err;
+	unsigned short mode_flags;
 
+	mode_flags = mode & MPOL_MODE_FLAGS;
+	mode &= ~MPOL_MODE_FLAGS;
 	if (mode >= MPOL_MAX)
 		return -EINVAL;
 	err = get_nodes(&nodes, nmask, maxnode);
 	if (err)
 		return err;
-	return do_mbind(start, len, mode, &nodes, flags);
+	return do_mbind(start, len, mode, mode_flags, &nodes, flags);
 }
 
 /* Set the process memory policy */
@@ -944,13 +951,16 @@ asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
 {
 	int err;
 	nodemask_t nodes;
+	unsigned short flags;
 
-	if (mode < 0 || mode >= MPOL_MAX)
+	flags = mode & MPOL_MODE_FLAGS;
+	mode &= ~MPOL_MODE_FLAGS;
+	if ((unsigned int)mode >= MPOL_MAX)
 		return -EINVAL;
 	err = get_nodes(&nodes, nmask, maxnode);
 	if (err)
 		return err;
-	return do_set_mempolicy(mode, &nodes);
+	return do_set_mempolicy(mode, flags, &nodes);
 }
 
 asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
@@ -1640,7 +1650,7 @@ restart:
 }
 
 void mpol_shared_policy_init(struct shared_policy *info, unsigned short policy,
-				nodemask_t *policy_nodes)
+			unsigned short flags, nodemask_t *policy_nodes)
 {
 	info->root = RB_ROOT;
 	spin_lock_init(&info->lock);
@@ -1649,7 +1659,7 @@ void mpol_shared_policy_init(struct shared_policy *info, unsigned short policy,
 		struct mempolicy *newpol;
 
 		/* Falls back to MPOL_DEFAULT on any error */
-		newpol = mpol_new(policy, policy_nodes);
+		newpol = mpol_new(policy, flags, policy_nodes);
 		if (!IS_ERR(newpol)) {
 			/* Create pseudo-vma that contains just the policy */
 			struct vm_area_struct pvma;
@@ -1670,9 +1680,10 @@ int mpol_set_shared_policy(struct shared_policy *info,
 	struct sp_node *new = NULL;
 	unsigned long sz = vma_pages(vma);
 
-	pr_debug("set_shared_policy %lx sz %lu %d %lx\n",
+	pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
 		 vma->vm_pgoff,
-		 sz, npol? npol->policy : -1,
+		 sz, npol ? npol->policy : -1,
+		 npol ? npol->flags : -1,
 		 npol ? nodes_addr(npol->v.nodes)[0] : -1);
 
 	if (npol) {
@@ -1745,14 +1756,14 @@ void __init numa_policy_init(void)
 	if (unlikely(nodes_empty(interleave_nodes)))
 		node_set(prefer, interleave_nodes);
 
-	if (do_set_mempolicy(MPOL_INTERLEAVE, &interleave_nodes))
+	if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
 		printk("numa_policy_init: interleaving failed\n");
 }
 
 /* Reset policy of current process to default */
 void numa_default_policy(void)
 {
-	do_set_mempolicy(MPOL_DEFAULT, NULL);
+	do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
 }
 
 /* Migrate a policy to a different set of nodes */
diff --git a/mm/shmem.c b/mm/shmem.c
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1083,9 +1083,10 @@ redirty:
 #ifdef CONFIG_NUMA
 #ifdef CONFIG_TMPFS
 static int shmem_parse_mpol(char *value, unsigned short *policy,
-			    nodemask_t *policy_nodes)
+			unsigned short *mode_flags, nodemask_t *policy_nodes)
 {
 	char *nodelist = strchr(value, ':');
+	char *flags = strchr(value, '=');
 	int err = 1;
 
 	if (nodelist) {
@@ -1096,6 +1097,8 @@ static int shmem_parse_mpol(char *value, unsigned short *policy,
 		if (!nodes_subset(*policy_nodes, node_states[N_HIGH_MEMORY]))
 			goto out;
 	}
+	if (flags)
+		*flags++ = '\0';
 	if (!strcmp(value, "default")) {
 		*policy = MPOL_DEFAULT;
 		/* Don't allow a nodelist */
@@ -1125,6 +1128,8 @@ static int shmem_parse_mpol(char *value, unsigned short *policy,
 			*policy_nodes = node_states[N_HIGH_MEMORY];
 		err = 0;
 	}
+	if (flags) {
+	}
 out:
 	/* Restore string for error message */
 	if (nodelist)
@@ -1133,7 +1138,7 @@ out:
 }
 
 static void shmem_show_mpol(struct seq_file *seq, unsigned short policy,
-			    const nodemask_t policy_nodes)
+			unsigned short flags, const nodemask_t policy_nodes)
 {
 	char *policy_string;
 
@@ -1202,13 +1207,13 @@ static struct page *shmem_alloc_page(gfp_t gfp,
 #else /* !CONFIG_NUMA */
 #ifdef CONFIG_TMPFS
 static inline int shmem_parse_mpol(char *value, unsigned short *policy,
-						nodemask_t *policy_nodes)
+				unsigned short *flags, nodemask_t *policy_nodes)
 {
 	return 1;
 }
 
 static inline void shmem_show_mpol(struct seq_file *seq, unsigned short policy,
-			    const nodemask_t policy_nodes)
+			unsigned short flags, const nodemask_t policy_nodes)
 {
 }
 #endif /* CONFIG_TMPFS */
@@ -1578,7 +1583,7 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
 			inode->i_op = &shmem_inode_operations;
 			inode->i_fop = &shmem_file_operations;
 			mpol_shared_policy_init(&info->policy, sbinfo->policy,
-							&sbinfo->policy_nodes);
+					sbinfo->flags, &sbinfo->policy_nodes);
 			break;
 		case S_IFDIR:
 			inc_nlink(inode);
@@ -1592,7 +1597,7 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
 			 * Must not load anything in the rbtree,
 			 * mpol_free_shared_policy will not be called.
 			 */
-			mpol_shared_policy_init(&info->policy, MPOL_DEFAULT,
+			mpol_shared_policy_init(&info->policy, MPOL_DEFAULT, 0,
 						NULL);
 			break;
 		}
@@ -2209,7 +2214,7 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
 				goto bad_val;
 		} else if (!strcmp(this_char,"mpol")) {
 			if (shmem_parse_mpol(value, &sbinfo->policy,
-					     &sbinfo->policy_nodes))
+				&sbinfo->flags, &sbinfo->policy_nodes))
 				goto bad_val;
 		} else {
 			printk(KERN_ERR "tmpfs: Bad mount option %s\n",
@@ -2261,6 +2266,7 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
 	sbinfo->max_inodes  = config.max_inodes;
 	sbinfo->free_inodes = config.max_inodes - inodes;
 	sbinfo->policy      = config.policy;
+	sbinfo->flags       = config.flags;
 	sbinfo->policy_nodes = config.policy_nodes;
 out:
 	spin_unlock(&sbinfo->stat_lock);
@@ -2282,7 +2288,8 @@ static int shmem_show_options(struct seq_file *seq, struct vfsmount *vfs)
 		seq_printf(seq, ",uid=%u", sbinfo->uid);
 	if (sbinfo->gid != 0)
 		seq_printf(seq, ",gid=%u", sbinfo->gid);
-	shmem_show_mpol(seq, sbinfo->policy, sbinfo->policy_nodes);
+	shmem_show_mpol(seq, sbinfo->policy, sbinfo->flags,
+			sbinfo->policy_nodes);
 	return 0;
 }
 #endif /* CONFIG_TMPFS */
@@ -2313,6 +2320,7 @@ static int shmem_fill_super(struct super_block *sb,
 	sbinfo->uid = current->fsuid;
 	sbinfo->gid = current->fsgid;
 	sbinfo->policy = MPOL_DEFAULT;
+	sbinfo->flags = 0;
 	sbinfo->policy_nodes = node_states[N_HIGH_MEMORY];
 	sb->s_fs_info = sbinfo;
 

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [patch 3/6] mempolicy: add MPOL_F_STATIC_NODES flag
  2008-03-01  0:45 ` [patch 2/6] mempolicy: support optional mode flags David Rientjes
@ 2008-03-01  0:45   ` David Rientjes
  2008-03-01  0:45     ` [patch 4/6] mempolicy: add bitmap_onto() and bitmap_fold() operations David Rientjes
  0 siblings, 1 reply; 12+ messages in thread
From: David Rientjes @ 2008-03-01  0:45 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Paul Jackson, Christoph Lameter, Lee Schermerhorn, Andi Kleen,
	linux-kernel

Add an optional mempolicy mode flag, MPOL_F_STATIC_NODES, that suppresses
the node remap when the policy is rebound.

Adds another member to struct mempolicy, nodemask_t user_nodemask, as
part of a union with cpuset_mems_allowed:

	struct mempolicy {
		...
		union {
			nodemask_t cpuset_mems_allowed;
			nodemask_t user_nodemask;
		} w;
	}

that stores the nodemask that the user passed when he or she created
the mempolicy via set_mempolicy() or mbind().  When using
MPOL_F_STATIC_NODES, which is passed with any mempolicy mode, the user's
passed nodemask intersected with the VMA or task's allowed nodes is always
used when determining the preferred node, building the MPOL_BIND zonelist,
or creating the interleave nodemask.  This happens whenever the policy is 
rebound, including when a task's cpuset assignment changes or the cpuset's
mems are changed.

This creates an interesting side-effect in that it allows the mempolicy
"intent" to lie dormant and unaffected until it has access to the node(s)
that it desires.  For example, if you currently ask for an interleaved
policy over a set of nodes that you do not have access to, the mempolicy
is not created and the task continues to use the previous policy.  With
this change, however, it is possible to create the same mempolicy; it is
only effected when access to nodes in the nodemask is acquired.

It is also possible to mount tmpfs with the static nodemask behavior when
specifying a node or nodemask.  To do this, simply add "=static"
immediately following the mempolicy mode at mount time:

	mount -o remount mpol=interleave=static:1-3

Also removes mpol_check_policy() and folds its logic into mpol_new() since
it is now obsoleted.  The unused vma_mpol_equal() is also removed.

Cc: Paul Jackson <pj@sgi.com>
Cc: Christoph Lameter <clameter@sgi.com>
Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: David Rientjes <rientjes@google.com>
---
 include/linux/mempolicy.h |   11 ++-
 mm/mempolicy.c            |  208 ++++++++++++++++++++++++---------------------
 mm/shmem.c                |    2 +
 3 files changed, 119 insertions(+), 102 deletions(-)

diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -24,11 +24,13 @@ enum {
 };
 
 /* Flags for set_mempolicy */
+#define MPOL_F_STATIC_NODES	(1 << 15)
+
 /*
  * MPOL_MODE_FLAGS is the union of all possible optional mode flags passed to
  * either set_mempolicy() or mbind().
  */
-#define MPOL_MODE_FLAGS		(0)
+#define MPOL_MODE_FLAGS		(MPOL_F_STATIC_NODES)
 
 /* Flags for get_mempolicy */
 #define MPOL_F_NODE	(1<<0)	/* return next IL mode instead of node mask */
@@ -84,7 +86,10 @@ struct mempolicy {
 		nodemask_t	 nodes;		/* interleave */
 		/* undefined for default */
 	} v;
-	nodemask_t cpuset_mems_allowed;	/* mempolicy relative to these nodes */
+	union {
+		nodemask_t cpuset_mems_allowed;	/* relative to these nodes */
+		nodemask_t user_nodemask;	/* nodemask passed by user */
+	} w;
 };
 
 /*
@@ -123,7 +128,6 @@ static inline int mpol_equal(struct mempolicy *a, struct mempolicy *b)
 		return 1;
 	return __mpol_equal(a, b);
 }
-#define vma_mpol_equal(a,b) mpol_equal(vma_policy(a), vma_policy(b))
 
 /* Could later add inheritance of the process policy here. */
 
@@ -188,7 +192,6 @@ static inline int mpol_equal(struct mempolicy *a, struct mempolicy *b)
 {
 	return 1;
 }
-#define vma_mpol_equal(a,b) 1
 
 #define mpol_set_vma_default(vma) do {} while(0)
 
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -113,58 +113,6 @@ struct mempolicy default_policy = {
 static void mpol_rebind_policy(struct mempolicy *pol,
                                const nodemask_t *newmask);
 
-/* Do sanity checking on a policy */
-static int mpol_check_policy(unsigned short mode, nodemask_t *nodes)
-{
-	int was_empty, is_empty;
-
-	if (!nodes)
-		return 0;
-
-	/*
-	 * "Contextualize" the in-coming nodemast for cpusets:
-	 * Remember whether in-coming nodemask was empty,  If not,
-	 * restrict the nodes to the allowed nodes in the cpuset.
-	 * This is guaranteed to be a subset of nodes with memory.
-	 */
-	cpuset_update_task_memory_state();
-	is_empty = was_empty = nodes_empty(*nodes);
-	if (!was_empty) {
-		nodes_and(*nodes, *nodes, cpuset_current_mems_allowed);
-		is_empty = nodes_empty(*nodes);	/* after "contextualization" */
-	}
-
-	switch (mode) {
-	case MPOL_DEFAULT:
-		/*
-		 * require caller to specify an empty nodemask
-		 * before "contextualization"
-		 */
-		if (!was_empty)
-			return -EINVAL;
-		break;
-	case MPOL_BIND:
-	case MPOL_INTERLEAVE:
-		/*
-		 * require at least 1 valid node after "contextualization"
-		 */
-		if (is_empty)
-			return -EINVAL;
-		break;
-	case MPOL_PREFERRED:
-		/*
-		 * Did caller specify invalid nodes?
-		 * Don't silently accept this as "local allocation".
-		 */
-		if (!was_empty && is_empty)
-			return -EINVAL;
-		break;
-	default:
-		BUG();
-	}
-	return 0;
-}
-
 /* Generate a custom zonelist for the BIND policy. */
 static struct zonelist *bind_zonelist(nodemask_t *nodes)
 {
@@ -202,40 +150,48 @@ static struct zonelist *bind_zonelist(nodemask_t *nodes)
 	return zl;
 }
 
+static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
+{
+	return pol->flags & MPOL_F_STATIC_NODES;
+}
+
 /* Create a new policy */
 static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
 				  nodemask_t *nodes)
 {
 	struct mempolicy *policy;
+	nodemask_t cpuset_context_nmask;
+	void *error = ERR_PTR(-EINVAL);
 
 	pr_debug("setting mode %d flags %d nodes[0] %lx\n",
 		 mode, flags, nodes ? nodes_addr(*nodes)[0] : -1);
 
 	if (mode == MPOL_DEFAULT)
-		return NULL;
+		return (nodes && nodes_weight(*nodes)) ? error : NULL;
 	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
 	if (!policy)
 		return ERR_PTR(-ENOMEM);
 	atomic_set(&policy->refcnt, 1);
+	cpuset_update_task_memory_state();
+	nodes_and(cpuset_context_nmask, *nodes, cpuset_current_mems_allowed);
 	switch (mode) {
 	case MPOL_INTERLEAVE:
-		policy->v.nodes = *nodes;
-		if (nodes_weight(policy->v.nodes) == 0) {
-			kmem_cache_free(policy_cache, policy);
-			return ERR_PTR(-EINVAL);
-		}
+		if (nodes_empty(*nodes) || nodes_empty(cpuset_context_nmask))
+			goto free;
+		policy->v.nodes = cpuset_context_nmask;
 		break;
 	case MPOL_PREFERRED:
-		policy->v.preferred_node = first_node(*nodes);
+		policy->v.preferred_node = first_node(cpuset_context_nmask);
 		if (policy->v.preferred_node >= MAX_NUMNODES)
-			policy->v.preferred_node = -1;
+			goto free;
 		break;
 	case MPOL_BIND:
-		policy->v.zonelist = bind_zonelist(nodes);
+		if (nodes_empty(*nodes))
+			goto free;
+		policy->v.zonelist = bind_zonelist(&cpuset_context_nmask);
 		if (IS_ERR(policy->v.zonelist)) {
-			void *error_code = policy->v.zonelist;
-			kmem_cache_free(policy_cache, policy);
-			return error_code;
+			error = policy->v.zonelist;
+			goto free;
 		}
 		break;
 	default:
@@ -243,8 +199,15 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
 	}
 	policy->policy = mode;
 	policy->flags = flags;
-	policy->cpuset_mems_allowed = cpuset_mems_allowed(current);
+	if (mpol_store_user_nodemask(policy))
+		policy->w.user_nodemask = *nodes;
+	else
+		policy->w.cpuset_mems_allowed = cpuset_mems_allowed(current);
 	return policy;
+
+free:
+	kmem_cache_free(policy_cache, policy);
+	return error;
 }
 
 static void gather_stats(struct page *, void *, int pte_dirty);
@@ -490,15 +453,14 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
 {
 	struct mempolicy *new;
 
-	if (mpol_check_policy(mode, nodes))
-		return -EINVAL;
 	new = mpol_new(mode, flags, nodes);
 	if (IS_ERR(new))
 		return PTR_ERR(new);
 	mpol_free(current->mempolicy);
 	current->mempolicy = new;
 	mpol_set_task_struct_flag();
-	if (new && new->policy == MPOL_INTERLEAVE)
+	if (new && new->policy == MPOL_INTERLEAVE &&
+	    nodes_weight(new->v.nodes))
 		current->il_next = first_node(new->v.nodes);
 	return 0;
 }
@@ -818,9 +780,6 @@ static long do_mbind(unsigned long start, unsigned long len,
 	if (end == start)
 		return 0;
 
-	if (mpol_check_policy(mode, nmask))
-		return -EINVAL;
-
 	new = mpol_new(mode, mode_flags, nmask);
 	if (IS_ERR(new))
 		return PTR_ERR(new);
@@ -1211,7 +1170,8 @@ static unsigned interleave_nodes(struct mempolicy *policy)
 	next = next_node(nid, policy->v.nodes);
 	if (next >= MAX_NUMNODES)
 		next = first_node(policy->v.nodes);
-	me->il_next = next;
+	if (next < MAX_NUMNODES)
+		me->il_next = next;
 	return nid;
 }
 
@@ -1249,10 +1209,13 @@ static unsigned offset_il_node(struct mempolicy *pol,
 		struct vm_area_struct *vma, unsigned long off)
 {
 	unsigned nnodes = nodes_weight(pol->v.nodes);
-	unsigned target = (unsigned)off % nnodes;
+	unsigned target;
 	int c;
 	int nid = -1;
 
+	if (!nnodes)
+		return numa_node_id();
+	target = (unsigned int)off % nnodes;
 	c = 0;
 	do {
 		nid = next_node(nid, pol->v.nodes);
@@ -1457,6 +1420,16 @@ struct mempolicy *__mpol_copy(struct mempolicy *old)
 	return new;
 }
 
+static int mpol_match_intent(const struct mempolicy *a,
+			     const struct mempolicy *b)
+{
+	if (a->flags != b->flags)
+		return 0;
+	if (!mpol_store_user_nodemask(a))
+		return 1;
+	return nodes_equal(a->w.user_nodemask, b->w.user_nodemask);
+}
+
 /* Slow path of a mempolicy comparison */
 int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
 {
@@ -1464,6 +1437,8 @@ int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
 		return 0;
 	if (a->policy != b->policy)
 		return 0;
+	if (a->policy != MPOL_DEFAULT && !mpol_match_intent(a, b))
+		return 0;
 	switch (a->policy) {
 	case MPOL_DEFAULT:
 		return 1;
@@ -1770,54 +1745,79 @@ void numa_default_policy(void)
 static void mpol_rebind_policy(struct mempolicy *pol,
 			       const nodemask_t *newmask)
 {
-	nodemask_t *mpolmask;
 	nodemask_t tmp;
+	int static_nodes;
 
 	if (!pol)
 		return;
-	mpolmask = &pol->cpuset_mems_allowed;
-	if (nodes_equal(*mpolmask, *newmask))
+	static_nodes = pol->flags & MPOL_F_STATIC_NODES;
+	if (!mpol_store_user_nodemask(pol) &&
+	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
 		return;
 
 	switch (pol->policy) {
 	case MPOL_DEFAULT:
 		break;
 	case MPOL_INTERLEAVE:
-		nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask);
+		if (static_nodes)
+			nodes_and(tmp, pol->w.user_nodemask, *newmask);
+		else {
+			nodes_remap(tmp, pol->v.nodes,
+				    pol->w.cpuset_mems_allowed, *newmask);
+			pol->w.cpuset_mems_allowed = *newmask;
+		}
 		pol->v.nodes = tmp;
-		*mpolmask = *newmask;
-		current->il_next = node_remap(current->il_next,
-						*mpolmask, *newmask);
+		if (!node_isset(current->il_next, tmp)) {
+			current->il_next = next_node(current->il_next, tmp);
+			if (current->il_next >= MAX_NUMNODES)
+				current->il_next = first_node(tmp);
+			if (current->il_next >= MAX_NUMNODES)
+				current->il_next = numa_node_id();
+		}
 		break;
 	case MPOL_PREFERRED:
-		pol->v.preferred_node = node_remap(pol->v.preferred_node,
-						*mpolmask, *newmask);
-		*mpolmask = *newmask;
+		if (static_nodes) {
+			int node = first_node(pol->w.user_nodemask);
+
+			if (node_isset(node, *newmask))
+				pol->v.preferred_node = node;
+			else
+				pol->v.preferred_node = -1;
+		} else {
+			pol->v.preferred_node = node_remap(pol->v.preferred_node,
+					pol->w.cpuset_mems_allowed, *newmask);
+			pol->w.cpuset_mems_allowed = *newmask;
+		}
 		break;
 	case MPOL_BIND: {
-		nodemask_t nodes;
-		struct zone **z;
 		struct zonelist *zonelist;
 
-		nodes_clear(nodes);
-		for (z = pol->v.zonelist->zones; *z; z++)
-			node_set(zone_to_nid(*z), nodes);
-		nodes_remap(tmp, nodes, *mpolmask, *newmask);
-		nodes = tmp;
-
-		zonelist = bind_zonelist(&nodes);
+		if (static_nodes)
+			nodes_and(tmp, pol->w.user_nodemask, *newmask);
+		else {
+			nodemask_t nodes;
+			struct zone **z;
+
+			nodes_clear(nodes);
+			for (z = pol->v.zonelist->zones; *z; z++)
+				node_set(zone_to_nid(*z), nodes);
+			nodes_remap(tmp, nodes, pol->w.cpuset_mems_allowed,
+				    *newmask);
+			pol->w.cpuset_mems_allowed = *newmask;
+		}
 
 		/* If no mem, then zonelist is NULL and we keep old zonelist.
 		 * If that old zonelist has no remaining mems_allowed nodes,
 		 * then zonelist_policy() will "FALL THROUGH" to MPOL_DEFAULT.
 		 */
-
-		if (!IS_ERR(zonelist)) {
-			/* Good - got mem - substitute new zonelist */
-			kfree(pol->v.zonelist);
-			pol->v.zonelist = zonelist;
+		if (nodes_weight(tmp)) {
+			zonelist = bind_zonelist(&tmp);
+			if (!IS_ERR(zonelist)) {
+				/* Good - got mem - substitute new zonelist */
+				kfree(pol->v.zonelist);
+				pol->v.zonelist = zonelist;
+			}
 		}
-		*mpolmask = *newmask;
 		break;
 	}
 	default:
@@ -1870,6 +1870,7 @@ static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
 	int l;
 	nodemask_t nodes;
 	unsigned short mode = pol ? pol->policy : MPOL_DEFAULT;
+	unsigned short flags = pol ? pol->flags : 0;
 
 	switch (mode) {
 	case MPOL_DEFAULT:
@@ -1901,6 +1902,17 @@ static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
 	strcpy(p, policy_types[mode]);
 	p += l;
 
+	if (flags) {
+		int need_bar = 0;
+
+		if (buffer + maxlen < p + 2)
+			return -ENOSPC;
+		*p++ = '=';
+
+		if (flags & MPOL_F_STATIC_NODES)
+			p += sprintf(p, "%sstatic", need_bar++ ? "|" : "");
+	}
+
 	if (!nodes_empty(nodes)) {
 		if (buffer + maxlen < p + 2)
 			return -ENOSPC;
diff --git a/mm/shmem.c b/mm/shmem.c
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1129,6 +1129,8 @@ static int shmem_parse_mpol(char *value, unsigned short *policy,
 		err = 0;
 	}
 	if (flags) {
+		if (!strcmp(flags, "static"))
+			*mode_flags |= MPOL_F_STATIC_NODES;
 	}
 out:
 	/* Restore string for error message */

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [patch 4/6] mempolicy: add bitmap_onto() and bitmap_fold() operations
  2008-03-01  0:45   ` [patch 3/6] mempolicy: add MPOL_F_STATIC_NODES flag David Rientjes
@ 2008-03-01  0:45     ` David Rientjes
  2008-03-01  0:45       ` [patch 5/6] mempolicy: add MPOL_F_RELATIVE_NODES flag David Rientjes
  0 siblings, 1 reply; 12+ messages in thread
From: David Rientjes @ 2008-03-01  0:45 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Paul Jackson, Christoph Lameter, Lee Schermerhorn, Andi Kleen,
	kosaki.motohiro, mel, ray-lk, linux-kernel

From: Paul Jackson <pj@sgi.com>

The following adds two more bitmap operators, bitmap_onto() and
bitmap_fold(), with the usual cpumask and nodemask wrappers.

The bitmap_onto() operator computes one bitmap relative to
another.  If the n-th bit in the origin mask is set, then the
m-th bit of the destination mask will be set, where m is
the position of the n-th set bit in the relative mask.

The bitmap_fold() operator folds a bitmap into a second that
has bit m set iff the input bitmap has some bit n set, where
m == n mod sz, for the specified sz value.

There are two substantive changes between this patch and its
predecessor bitmap_relative:
 1) Renamed bitmap_relative() to be bitmap_onto().
 2) Added bitmap_fold().

The essential motivation for bitmap_onto() is to provide
a mechanism for converting a cpuset-relative CPU or Node
mask to an absolute mask.  Cpuset relative masks are written
as if the current task were in a cpuset whose CPUs or
Nodes were just the consecutive ones numbered 0..N-1, for
some N.  The bitmap_onto() operator is provided in anticipation
of adding support for the first such cpuset relative mask,
by the mbind() and set_mempolicy() system calls, using a
planned flag of MPOL_F_RELATIVE_NODES.  These bitmap operators
(and their nodemask wrappers, in particular) will be used in
code that converts the user specified cpuset relative memory
policy to a specific system node numbered policy, given the
current mems_allowed of the task's cpuset.

Such cpuset relative mempolicies will address two deficiencies
of the existing interface between cpusets and mempolicies:
 1) A task cannot at present reliably establish a cpuset
    relative mempolicy because there is an essential race
    condition, in that the task's cpuset may be changed in
    between the time the task can query its cpuset placement,
    and the time the task can issue the applicable mbind or
    set_mempolicy system call.
 2) A task cannot at present establish what cpuset relative
    mempolicy it would like to have, if it is in a smaller
    cpuset than it might have mempolicy preferences for,
    because the existing interface only allows specifying
    mempolicies for nodes currently allowed by the cpuset.

Cpuset relative mempolicies are useful for tasks that don't
distinguish particularly between one CPU or Node and another,
but only between how many of each are allowed, and the proper
placement of threads and memory pages on the various CPUs and
Nodes available.

The motivation for the added bitmap_fold() can be seen in
the following example.

Let's say an application has specified some mempolicies
that presume 16 memory nodes, including say a mempolicy that
specified MPOL_F_RELATIVE_NODES (cpuset relative) nodes 12-15.
Then lets say that application is crammed into a cpuset that only
has 8 memory nodes, 0-7.  If one just uses bitmap_onto(), this
mempolicy, mapped to that cpuset, would ignore the requested
relative nodes above 7, leaving it empty of nodes.  That's not
good; better to fold the higher nodes down, so that some nodes
are included in the resulting mapped mempolicy.  In this case,
the mempolicy nodes 12-15 are taken modulo 8 (the weight of the
mems_allowed of the confining cpuset), resulting in a mempolicy
specifying nodes 4-7.

Signed-off-by: Paul Jackson <pj@sgi.com>
Cc: Lee.Schermerhorn@hp.com
Cc: clameter@sgi.com
Cc: ak@suse.de
Cc: mel@csn.ul.ie
Cc: kosaki.motohiro@jp.fujitsu.com
Cc: ray-lk@madrabbit.org
Signed-off-by: David Rientjes <rientjes@google.com>
---
 include/linux/bitmap.h   |    6 ++
 include/linux/cpumask.h  |   22 ++++++-
 include/linux/nodemask.h |   22 ++++++-
 lib/bitmap.c             |  158 ++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 206 insertions(+), 2 deletions(-)

diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
--- a/include/linux/bitmap.h
+++ b/include/linux/bitmap.h
@@ -46,6 +46,8 @@
  * bitmap_shift_left(dst, src, n, nbits)	*dst = *src << n
  * bitmap_remap(dst, src, old, new, nbits)	*dst = map(old, new)(src)
  * bitmap_bitremap(oldbit, old, new, nbits)	newbit = map(old, new)(oldbit)
+ * bitmap_onto(dst, orig, relmap, nbits)	*dst = orig relative to relmap
+ * bitmap_fold(dst, orig, sz, nbits)		dst bits = orig bits mod sz
  * bitmap_scnprintf(buf, len, src, nbits)	Print bitmap src to buf
  * bitmap_parse(buf, buflen, dst, nbits)	Parse bitmap dst from kernel buf
  * bitmap_parse_user(ubuf, ulen, dst, nbits)	Parse bitmap dst from user buf
@@ -120,6 +122,10 @@ extern void bitmap_remap(unsigned long *dst, const unsigned long *src,
 		const unsigned long *old, const unsigned long *new, int bits);
 extern int bitmap_bitremap(int oldbit,
 		const unsigned long *old, const unsigned long *new, int bits);
+extern void bitmap_onto(unsigned long *dst, const unsigned long *orig,
+		const unsigned long *relmap, int bits);
+extern void bitmap_fold(unsigned long *dst, const unsigned long *orig,
+		int sz, int bits);
 extern int bitmap_find_free_region(unsigned long *bitmap, int bits, int order);
 extern void bitmap_release_region(unsigned long *bitmap, int pos, int order);
 extern int bitmap_allocate_region(unsigned long *bitmap, int pos, int order);
diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -14,6 +14,8 @@
  * bitmap_scnlistprintf() and bitmap_parselist(), also in bitmap.c.
  * For details of cpu_remap(), see bitmap_bitremap in lib/bitmap.c
  * For details of cpus_remap(), see bitmap_remap in lib/bitmap.c.
+ * For details of cpus_onto(), see bitmap_onto in lib/bitmap.c.
+ * For details of cpus_fold(), see bitmap_fold in lib/bitmap.c.
  *
  * The available cpumask operations are:
  *
@@ -53,7 +55,9 @@
  * int cpulist_scnprintf(buf, len, mask) Format cpumask as list for printing
  * int cpulist_parse(buf, map)		Parse ascii string as cpulist
  * int cpu_remap(oldbit, old, new)	newbit = map(old, new)(oldbit)
- * int cpus_remap(dst, src, old, new)	*dst = map(old, new)(src)
+ * void cpus_remap(dst, src, old, new)	*dst = map(old, new)(src)
+ * void cpus_onto(dst, orig, relmap)	*dst = orig relative to relmap
+ * void cpus_fold(dst, orig, sz)	dst bits = orig bits mod sz
  *
  * for_each_cpu_mask(cpu, mask)		for-loop cpu over mask
  *
@@ -311,6 +315,22 @@ static inline void __cpus_remap(cpumask_t *dstp, const cpumask_t *srcp,
 	bitmap_remap(dstp->bits, srcp->bits, oldp->bits, newp->bits, nbits);
 }
 
+#define cpus_onto(dst, orig, relmap) \
+		__cpus_onto(&(dst), &(orig), &(relmap), NR_CPUS)
+static inline void __cpus_onto(cpumask_t *dstp, const cpumask_t *origp,
+		const cpumask_t *relmapp, int nbits)
+{
+	bitmap_onto(dstp->bits, origp->bits, relmapp->bits, nbits);
+}
+
+#define cpus_fold(dst, orig, sz) \
+		__cpus_fold(&(dst), &(orig), sz, NR_CPUS)
+static inline void __cpus_fold(cpumask_t *dstp, const cpumask_t *origp,
+		int sz, int nbits)
+{
+	bitmap_fold(dstp->bits, origp->bits, sz, nbits);
+}
+
 #if NR_CPUS > 1
 #define for_each_cpu_mask(cpu, mask)		\
 	for ((cpu) = first_cpu(mask);		\
diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h
--- a/include/linux/nodemask.h
+++ b/include/linux/nodemask.h
@@ -14,6 +14,8 @@
  * bitmap_scnlistprintf() and bitmap_parselist(), also in bitmap.c.
  * For details of node_remap(), see bitmap_bitremap in lib/bitmap.c.
  * For details of nodes_remap(), see bitmap_remap in lib/bitmap.c.
+ * For details of nodes_onto(), see bitmap_onto in lib/bitmap.c.
+ * For details of nodes_fold(), see bitmap_fold in lib/bitmap.c.
  *
  * The available nodemask operations are:
  *
@@ -55,7 +57,9 @@
  * int nodelist_scnprintf(buf, len, mask) Format nodemask as list for printing
  * int nodelist_parse(buf, map)		Parse ascii string as nodelist
  * int node_remap(oldbit, old, new)	newbit = map(old, new)(oldbit)
- * int nodes_remap(dst, src, old, new)	*dst = map(old, new)(dst)
+ * void nodes_remap(dst, src, old, new)	*dst = map(old, new)(src)
+ * void nodes_onto(dst, orig, relmap)	*dst = orig relative to relmap
+ * void nodes_fold(dst, orig, sz)	dst bits = orig bits mod sz
  *
  * for_each_node_mask(node, mask)	for-loop node over mask
  *
@@ -326,6 +330,22 @@ static inline void __nodes_remap(nodemask_t *dstp, const nodemask_t *srcp,
 	bitmap_remap(dstp->bits, srcp->bits, oldp->bits, newp->bits, nbits);
 }
 
+#define nodes_onto(dst, orig, relmap) \
+		__nodes_onto(&(dst), &(orig), &(relmap), MAX_NUMNODES)
+static inline void __nodes_onto(nodemask_t *dstp, const nodemask_t *origp,
+		const nodemask_t *relmapp, int nbits)
+{
+	bitmap_onto(dstp->bits, origp->bits, relmapp->bits, nbits);
+}
+
+#define nodes_fold(dst, orig, sz) \
+		__nodes_fold(&(dst), &(orig), sz, MAX_NUMNODES)
+static inline void __nodes_fold(nodemask_t *dstp, const nodemask_t *origp,
+		int sz, int nbits)
+{
+	bitmap_fold(dstp->bits, origp->bits, sz, nbits);
+}
+
 #if MAX_NUMNODES > 1
 #define for_each_node_mask(node, mask)			\
 	for ((node) = first_node(mask);			\
diff --git a/lib/bitmap.c b/lib/bitmap.c
--- a/lib/bitmap.c
+++ b/lib/bitmap.c
@@ -698,6 +698,164 @@ int bitmap_bitremap(int oldbit, const unsigned long *old,
 }
 EXPORT_SYMBOL(bitmap_bitremap);
 
+/**
+ * bitmap_onto - translate one bitmap relative to another
+ *	@dst: resulting translated bitmap
+ * 	@orig: original untranslated bitmap
+ * 	@relmap: bitmap relative to which translated
+ *	@bits: number of bits in each of these bitmaps
+ *
+ * Set the n-th bit of @dst iff there exists some m such that the
+ * n-th bit of @relmap is set, the m-th bit of @orig is set, and
+ * the n-th bit of @relmap is also the m-th _set_ bit of @relmap.
+ * (If you understood the previous sentence the first time your
+ * read it, you're overqualified for your current job.)
+ *
+ * In other words, @orig is mapped onto (surjectively) @dst,
+ * using the the map { <n, m> | the n-th bit of @relmap is the
+ * m-th set bit of @relmap }.
+ *
+ * Any set bits in @orig above bit number W, where W is the
+ * weight of (number of set bits in) @relmap are mapped nowhere.
+ * In particular, if for all bits m set in @orig, m >= W, then
+ * @dst will end up empty.  In situations where the possibility
+ * of such an empty result is not desired, one way to avoid it is
+ * to use the bitmap_fold() operator, below, to first fold the
+ * @orig bitmap over itself so that all its set bits x are in the
+ * range 0 <= x < W.  The bitmap_fold() operator does this by
+ * setting the bit (m % W) in @dst, for each bit (m) set in @orig.
+ *
+ * Example [1] for bitmap_onto():
+ *  Let's say @relmap has bits 30-39 set, and @orig has bits
+ *  1, 3, 5, 7, 9 and 11 set.  Then on return from this routine,
+ *  @dst will have bits 31, 33, 35, 37 and 39 set.
+ *
+ *  When bit 0 is set in @orig, it means turn on the bit in
+ *  @dst corresponding to whatever is the first bit (if any)
+ *  that is turned on in @relmap.  Since bit 0 was off in the
+ *  above example, we leave off that bit (bit 30) in @dst.
+ *
+ *  When bit 1 is set in @orig (as in the above example), it
+ *  means turn on the bit in @dst corresponding to whatever
+ *  is the second bit that is turned on in @relmap.  The second
+ *  bit in @relmap that was turned on in the above example was
+ *  bit 31, so we turned on bit 31 in @dst.
+ *
+ *  Similarly, we turned on bits 33, 35, 37 and 39 in @dst,
+ *  because they were the 4th, 6th, 8th and 10th set bits
+ *  set in @relmap, and the 4th, 6th, 8th and 10th bits of
+ *  @orig (i.e. bits 3, 5, 7 and 9) were also set.
+ *
+ *  When bit 11 is set in @orig, it means turn on the bit in
+ *  @dst corresponding to whatever is the twelth bit that is
+ *  turned on in @relmap.  In the above example, there were
+ *  only ten bits turned on in @relmap (30..39), so that bit
+ *  11 was set in @orig had no affect on @dst.
+ *
+ * Example [2] for bitmap_fold() + bitmap_onto():
+ *  Let's say @relmap has these ten bits set:
+ *		40 41 42 43 45 48 53 61 74 95
+ *  (for the curious, that's 40 plus the first ten terms of the
+ *  Fibonacci sequence.)
+ *
+ *  Further lets say we use the following code, invoking
+ *  bitmap_fold() then bitmap_onto, as suggested above to
+ *  avoid the possitility of an empty @dst result:
+ *
+ *	unsigned long *tmp;	// a temporary bitmap's bits
+ *
+ *	bitmap_fold(tmp, orig, bitmap_weight(relmap, bits), bits);
+ *	bitmap_onto(dst, tmp, relmap, bits);
+ *
+ *  Then this table shows what various values of @dst would be, for
+ *  various @orig's.  I list the zero-based positions of each set bit.
+ *  The tmp column shows the intermediate result, as computed by
+ *  using bitmap_fold() to fold the @orig bitmap modulo ten
+ *  (the weight of @relmap).
+ *
+ *      @orig           tmp            @dst
+ *      0                0             40
+ *      1                1             41
+ *      9                9             95
+ *      10               0             40 (*)
+ *      1 3 5 7          1 3 5 7       41 43 48 61
+ *      0 1 2 3 4        0 1 2 3 4     40 41 42 43 45
+ *      0 9 18 27        0 9 8 7       40 61 74 95
+ *      0 10 20 30       0             40
+ *      0 11 22 33       0 1 2 3       40 41 42 43
+ *      0 12 24 36       0 2 4 6       40 42 45 53
+ *      78 102 211       1 2 8         41 42 74 (*)
+ *
+ * (*) For these marked lines, if we hadn't first done bitmap_fold()
+ *     into tmp, then the @dst result would have been empty.
+ *
+ * If either of @orig or @relmap is empty (no set bits), then @dst
+ * will be returned empty.
+ *
+ * If (as explained above) the only set bits in @orig are in positions
+ * m where m >= W, (where W is the weight of @relmap) then @dst will
+ * once again be returned empty.
+ *
+ * All bits in @dst not set by the above rule are cleared.
+ */
+void bitmap_onto(unsigned long *dst, const unsigned long *orig,
+			const unsigned long *relmap, int bits)
+{
+	int n, m;       	/* same meaning as in above comment */
+
+	if (dst == orig)	/* following doesn't handle inplace mappings */
+		return;
+	bitmap_zero(dst, bits);
+
+	/*
+	 * The following code is a more efficient, but less
+	 * obvious, equivalent to the loop:
+	 *	for (m = 0; m < bitmap_weight(relmap, bits); m++) {
+	 *		n = bitmap_ord_to_pos(orig, m, bits);
+	 *		if (test_bit(m, orig))
+	 *			set_bit(n, dst);
+	 *	}
+	 */
+
+	m = 0;
+	for (n = find_first_bit(relmap, bits);
+	     n < bits;
+	     n = find_next_bit(relmap, bits, n + 1)) {
+		/* m == bitmap_pos_to_ord(relmap, n, bits) */
+		if (test_bit(m, orig))
+			set_bit(n, dst);
+		m++;
+	}
+}
+EXPORT_SYMBOL(bitmap_onto);
+
+/**
+ * bitmap_fold - fold larger bitmap into smaller, modulo specified size
+ *	@dst: resulting smaller bitmap
+ *	@orig: original larger bitmap
+ *	@sz: specified size
+ *	@bits: number of bits in each of these bitmaps
+ *
+ * For each bit oldbit in @orig, set bit oldbit mod @sz in @dst.
+ * Clear all other bits in @dst.  See further the comment and
+ * Example [2] for bitmap_onto() for why and how to use this.
+ */
+void bitmap_fold(unsigned long *dst, const unsigned long *orig,
+			int sz, int bits)
+{
+	int oldbit;
+
+	if (dst == orig)	/* following doesn't handle inplace mappings */
+		return;
+	bitmap_zero(dst, bits);
+
+	for (oldbit = find_first_bit(orig, bits);
+	     oldbit < bits;
+	     oldbit = find_next_bit(orig, bits, oldbit + 1))
+		set_bit(oldbit % sz, dst);
+}
+EXPORT_SYMBOL(bitmap_fold);
+
 /*
  * Common code for bitmap_*_region() routines.
  *	bitmap: array of unsigned longs corresponding to the bitmap

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [patch 5/6] mempolicy: add MPOL_F_RELATIVE_NODES flag
  2008-03-01  0:45     ` [patch 4/6] mempolicy: add bitmap_onto() and bitmap_fold() operations David Rientjes
@ 2008-03-01  0:45       ` David Rientjes
  2008-03-01  0:45         ` [patch 6/6] mempolicy: update NUMA memory policy documentation David Rientjes
  0 siblings, 1 reply; 12+ messages in thread
From: David Rientjes @ 2008-03-01  0:45 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Paul Jackson, Christoph Lameter, Lee Schermerhorn, Andi Kleen,
	linux-kernel

Adds another optional mode flag, MPOL_F_RELATIVE_NODES, that specifies
nodemasks passed via set_mempolicy() or mbind() should be considered
relative to the current task's mems_allowed.

When the mempolicy is created, the passed nodemask is folded and mapped
onto the current task's mems_allowed.  For example, consider a task
using set_mempolicy() to pass MPOL_INTERLEAVE | MPOL_F_RELATIVE_NODES
with a nodemask of 1-3.  If current's mems_allowed is 4-7, the resulting
nodemask is 5-7 (the second, third, and fourth node of mems_allowed).

If the same task is attached to a cpuset, the mempolicy nodemask is
rebound each time the mems are changed.  Some possible rebinds and
results are:

	mems			result
	1-3			1-3
	1-7			2-4
	1,5-6			1,5-6
	1,5-7			5-7

Likewise, the zonelist built for MPOL_BIND acts on the set of zones
assigned to the resultant nodemask from the relative remap.

In the MPOL_PREFERRED case, the preferred node is remapped from the
currently effective nodemask to the relative nodemask.

This mempolicy mode flag was conceived of by Paul Jackson <pj@sgi.com>.

Cc: Paul Jackson <pj@sgi.com>
Cc: Christoph Lameter <clameter@sgi.com>
Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: David Rientjes <rientjes@google.com>
---
 include/linux/mempolicy.h |    3 ++-
 mm/mempolicy.c            |   36 ++++++++++++++++++++++++++++++++++--
 mm/shmem.c                |    6 ++++++
 3 files changed, 42 insertions(+), 3 deletions(-)

diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -25,12 +25,13 @@ enum {
 
 /* Flags for set_mempolicy */
 #define MPOL_F_STATIC_NODES	(1 << 15)
+#define MPOL_F_RELATIVE_NODES	(1 << 14)
 
 /*
  * MPOL_MODE_FLAGS is the union of all possible optional mode flags passed to
  * either set_mempolicy() or mbind().
  */
-#define MPOL_MODE_FLAGS		(MPOL_F_STATIC_NODES)
+#define MPOL_MODE_FLAGS		(MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES)
 
 /* Flags for get_mempolicy */
 #define MPOL_F_NODE	(1<<0)	/* return next IL mode instead of node mask */
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -152,7 +152,15 @@ static struct zonelist *bind_zonelist(nodemask_t *nodes)
 
 static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
 {
-	return pol->flags & MPOL_F_STATIC_NODES;
+	return pol->flags & (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES);
+}
+
+static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
+				   const nodemask_t *rel)
+{
+	nodemask_t tmp;
+	nodes_fold(tmp, *orig, nodes_weight(*rel));
+	nodes_onto(*ret, tmp, *rel);
 }
 
 /* Create a new policy */
@@ -173,7 +181,12 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
 		return ERR_PTR(-ENOMEM);
 	atomic_set(&policy->refcnt, 1);
 	cpuset_update_task_memory_state();
-	nodes_and(cpuset_context_nmask, *nodes, cpuset_current_mems_allowed);
+	if (flags & MPOL_F_RELATIVE_NODES)
+		mpol_relative_nodemask(&cpuset_context_nmask, nodes,
+				       &cpuset_current_mems_allowed);
+	else
+		nodes_and(cpuset_context_nmask, *nodes,
+			  cpuset_current_mems_allowed);
 	switch (mode) {
 	case MPOL_INTERLEAVE:
 		if (nodes_empty(*nodes) || nodes_empty(cpuset_context_nmask))
@@ -898,6 +911,9 @@ asmlinkage long sys_mbind(unsigned long start, unsigned long len,
 	mode &= ~MPOL_MODE_FLAGS;
 	if (mode >= MPOL_MAX)
 		return -EINVAL;
+	if ((mode_flags & MPOL_F_STATIC_NODES) &&
+	    (mode_flags & MPOL_F_RELATIVE_NODES))
+		return -EINVAL;
 	err = get_nodes(&nodes, nmask, maxnode);
 	if (err)
 		return err;
@@ -916,6 +932,8 @@ asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
 	mode &= ~MPOL_MODE_FLAGS;
 	if ((unsigned int)mode >= MPOL_MAX)
 		return -EINVAL;
+	if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
+		return -EINVAL;
 	err = get_nodes(&nodes, nmask, maxnode);
 	if (err)
 		return err;
@@ -1747,10 +1765,12 @@ static void mpol_rebind_policy(struct mempolicy *pol,
 {
 	nodemask_t tmp;
 	int static_nodes;
+	int relative_nodes;
 
 	if (!pol)
 		return;
 	static_nodes = pol->flags & MPOL_F_STATIC_NODES;
+	relative_nodes = pol->flags & MPOL_F_RELATIVE_NODES;
 	if (!mpol_store_user_nodemask(pol) &&
 	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
 		return;
@@ -1761,6 +1781,9 @@ static void mpol_rebind_policy(struct mempolicy *pol,
 	case MPOL_INTERLEAVE:
 		if (static_nodes)
 			nodes_and(tmp, pol->w.user_nodemask, *newmask);
+		else if (relative_nodes)
+			mpol_relative_nodemask(&tmp, &pol->w.user_nodemask,
+					       newmask);
 		else {
 			nodes_remap(tmp, pol->v.nodes,
 				    pol->w.cpuset_mems_allowed, *newmask);
@@ -1783,6 +1806,10 @@ static void mpol_rebind_policy(struct mempolicy *pol,
 				pol->v.preferred_node = node;
 			else
 				pol->v.preferred_node = -1;
+		} else if (relative_nodes) {
+			mpol_relative_nodemask(&tmp, &pol->w.user_nodemask,
+					       newmask);
+			pol->v.preferred_node = first_node(tmp);
 		} else {
 			pol->v.preferred_node = node_remap(pol->v.preferred_node,
 					pol->w.cpuset_mems_allowed, *newmask);
@@ -1794,6 +1821,9 @@ static void mpol_rebind_policy(struct mempolicy *pol,
 
 		if (static_nodes)
 			nodes_and(tmp, pol->w.user_nodemask, *newmask);
+		else if (relative_nodes)
+			mpol_relative_nodemask(&tmp, &pol->w.user_nodemask,
+					       newmask);
 		else {
 			nodemask_t nodes;
 			struct zone **z;
@@ -1911,6 +1941,8 @@ static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
 
 		if (flags & MPOL_F_STATIC_NODES)
 			p += sprintf(p, "%sstatic", need_bar++ ? "|" : "");
+		if (flags & MPOL_F_RELATIVE_NODES)
+			p += sprintf(p, "%srelative", need_bar++ ? "|" : "");
 	}
 
 	if (!nodes_empty(nodes)) {
diff --git a/mm/shmem.c b/mm/shmem.c
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1131,6 +1131,12 @@ static int shmem_parse_mpol(char *value, unsigned short *policy,
 	if (flags) {
 		if (!strcmp(flags, "static"))
 			*mode_flags |= MPOL_F_STATIC_NODES;
+		if (!strcmp(flags, "relative"))
+			*mode_flags |= MPOL_F_RELATIVE_NODES;
+
+		if ((*mode_flags & MPOL_F_STATIC_NODES) &&
+		    (*mode_flags & MPOL_F_RELATIVE_NODES))
+			err = 1;
 	}
 out:
 	/* Restore string for error message */

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [patch 6/6] mempolicy: update NUMA memory policy documentation
  2008-03-01  0:45       ` [patch 5/6] mempolicy: add MPOL_F_RELATIVE_NODES flag David Rientjes
@ 2008-03-01  0:45         ` David Rientjes
  2008-03-01  0:55           ` Randy Dunlap
  0 siblings, 1 reply; 12+ messages in thread
From: David Rientjes @ 2008-03-01  0:45 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Paul Jackson, Christoph Lameter, Lee Schermerhorn, Andi Kleen,
	Randy Dunlap, linux-kernel

Updates Documentation/vm/numa_memory_policy.txt and
Documentation/filesystems/tmpfs.txt to describe optional mempolicy mode
flags.

Cc: Christoph Lameter <clameter@sgi.com>
Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
Cc: Andi Kleen <ak@suse.de>
Cc: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: David Rientjes <rientjes@google.com>
Signed-off-by: Paul Jackson <pj@sgi.com>
---
 Documentation/filesystems/tmpfs.txt     |   12 +++
 Documentation/vm/numa_memory_policy.txt |  131 +++++++++++++++++++++++-------
 2 files changed, 112 insertions(+), 31 deletions(-)

diff --git a/Documentation/filesystems/tmpfs.txt b/Documentation/filesystems/tmpfs.txt
--- a/Documentation/filesystems/tmpfs.txt
+++ b/Documentation/filesystems/tmpfs.txt
@@ -92,6 +92,18 @@ NodeList format is a comma-separated list of decimal numbers and ranges,
 a range being two hyphen-separated decimal numbers, the smallest and
 largest node numbers in the range.  For example, mpol=bind:0-3,5,7,9-15
 
+NUMA memory allocation policies have optional flags that can be used in
+conjunction with their modes.  These optional flags can be specified
+when tmpfs is mounted by appending them to the mode before the NodeList.
+See Documentation/vm/numa_memory_policy.txt for a list of all available
+memory allocation policy mode flags.
+
+	=static		is equivalent to	MPOL_F_STATIC_NODES
+	=relative	is equivalent to	MPOL_F_RELATIVE_NODES
+
+For example, mpol=bind=static:NodeList, is the equivalent of an
+allocation policy of MPOL_BIND | MPOL_F_STATIC_NODES.
+
 Note that trying to mount a tmpfs with an mpol option will fail if the
 running kernel does not support NUMA; and will fail if its nodelist
 specifies a node which is not online.  If your system relies on that
diff --git a/Documentation/vm/numa_memory_policy.txt b/Documentation/vm/numa_memory_policy.txt
--- a/Documentation/vm/numa_memory_policy.txt
+++ b/Documentation/vm/numa_memory_policy.txt
@@ -135,9 +135,11 @@ most general to most specific:
 
 Components of Memory Policies
 
-    A Linux memory policy is a tuple consisting of a "mode" and an optional set
-    of nodes.  The mode determine the behavior of the policy, while the
-    optional set of nodes can be viewed as the arguments to the behavior.
+    A Linux memory policy consists of a "mode", optional mode flags, and an
+    optional set of nodes.  The mode determines the behavior of the policy,
+    the optional mode flags determine the behavior of the mode, and the
+    optional set of nodes can be viewed as the arguments to the policy
+    behavior.
 
    Internally, memory policies are implemented by a reference counted
    structure, struct mempolicy.  Details of this structure will be discussed
@@ -179,7 +181,8 @@ Components of Memory Policies
 	    on a non-shared region of the address space.  However, see
 	    MPOL_PREFERRED below.
 
-	    The Default mode does not use the optional set of nodes.
+	    It is an error for the set of nodes specified for this policy to
+	    be non-empty.
 
 	MPOL_BIND:  This mode specifies that memory must come from the
 	set of nodes specified by the policy.
@@ -231,6 +234,80 @@ Components of Memory Policies
 	    the temporary interleaved system default policy works in this
 	    mode.
 
+   Linux memory policy supports the following optional mode flags:
+
+	MPOL_F_STATIC_NODES:  This flag specifies that the nodemask passed by
+	the user should not be remapped if the task or VMA's set of allowed
+	nodes changes after the memory policy has been defined.
+
+	    Without this flag, anytime a mempolicy is rebound because of a
+	    change in the set of allowed nodes, the node (Preferred) or
+	    nodemask (Bind, Interleave) is remapped to the new set of
+	    allowed nodes.  This may result in nodes being used that were
+	    previously undesired.
+
+	    With this flag, if the user-specified nodes overlap with the
+	    nodes allowed by the task's cpuset, then the memory policy is
+	    applied to their intersection.  If the two sets of nodes do not
+	    overlap, the Default policy is used.
+
+	    For example, consider a task that is attached to a cpuset with
+	    mems 1-3 that sets an Interleave policy over the same set.  If
+	    the cpuset's mems change to 3-5, the Interleave will now occur
+	    over nodes 3, 4, and 5.  With this flag, however, since only node
+	    3 is allowed from the user's nodemask, the "interleave" only
+	    occurs over that node.  If no nodes from the user's nodemask are
+	    now allowed, the Default behavior is used.
+
+	    MPOL_F_STATIC_NODES cannot be used with MPOL_F_RELATIVE_NODES.
+
+	MPOL_F_RELATIVE_NODES:  This flag specifies that the nodemask passed
+	by the user will be mapped relative to the task or VMA's set of
+	allowed nodes.  The kernel stores the user-passed nodemask, and if
+	the set of allowed nodes changes, then that original nodemask will
+	be remapped relative to the new set of allowed nodes.
+
+	    Without this flag (and without MPOL_F_STATIC_NODES), anytime a
+	    mempolicy is rebound because of a change in the set of allowed
+	    nodes, the node (Preferred) or nodemask (Bind, Interleave) is
+	    remapped to the new set of allowed nodes.  That remap may not
+	    preserve the relative nature of the user's passed nodemask to its
+	    set of allowed nodes upon successive rebinds: a nodemask of
+	    1,3,5 may be remapped to 7-9 and then to 1-3 if the set of
+	    allowed nodes is restored to its original state.
+
+	    With this flag, the remap is done so that the node numbers from
+	    the user's passed nodemask are relative to the set of allowed
+	    nodes.  In other words, if nodes 0, 2, and 4 are set in the user's
+	    nodemask, the policy will be effected over the first (and in the
+	    Bind or Interleave case, the third and fifth) nodes in the set of
+	    allowed nodes.  The nodemask passed by the user represents nodes
+	    relative to task or VMA's set of allowed nodes.
+
+	    If the user's nodemask includes nodes that are outside the range
+	    of the new set of allowed nodes (for example, node 5 is set in
+	    the user's nodemask when the set of allowed nodes is only 0-3),
+	    then the remap wraps around to the beginning of the nodemask and,
+	    if not already set, sets the node in the mempolicy nodemask.
+
+	    For example, consider a task that is attached to a cpuset with
+	    mems 2-5 that sets an Interleave policy over the same set with
+	    MPOL_F_RELATIVE_NODES.  If the cpuset's mems change to 3-7, the
+	    interleave now occurs over nodes 3,5-6.  If the cpuset's mems
+	    then change to 0,2-3,5, then the interleave occurs over nodes
+	    0,3,5.
+
+	    Thanks to the consistent remapping, applications preparing
+	    nodemasks to specify memory policies using this flag should
+	    disregard their current, actual cpuset imposed memory placement
+	    and prepare the nodemask as if they were always located on
+	    memory nodes 0 to N-1, where N is the number of memory nodes the
+	    policy is intended to manage.  Let the kernel then remap to the
+	    set of memory nodes allowed by the task's cpuset, as that may
+	    change over time.
+
+	    MPOL_F_RELATIVE_NODES cannot be used with MPOL_F_STATIC_NODES.
+
 MEMORY POLICY APIs
 
 Linux supports 3 system calls for controlling memory policy.  These APIS
@@ -251,7 +328,9 @@ Set [Task] Memory Policy:
 	Set's the calling task's "task/process memory policy" to mode
 	specified by the 'mode' argument and the set of nodes defined
 	by 'nmask'.  'nmask' points to a bit mask of node ids containing
-	at least 'maxnode' ids.
+	at least 'maxnode' ids.  Optional mode flags may be passed by
+	combining the 'mode' argument with the flag (for example:
+	MPOL_INTERLEAVE | MPOL_F_STATIC_NODES).
 
 	See the set_mempolicy(2) man page for more details
 
@@ -303,29 +382,19 @@ MEMORY POLICIES AND CPUSETS
 Memory policies work within cpusets as described above.  For memory policies
 that require a node or set of nodes, the nodes are restricted to the set of
 nodes whose memories are allowed by the cpuset constraints.  If the nodemask
-specified for the policy contains nodes that are not allowed by the cpuset, or
-the intersection of the set of nodes specified for the policy and the set of
-nodes with memory is the empty set, the policy is considered invalid
-and cannot be installed.
-
-The interaction of memory policies and cpusets can be problematic for a
-couple of reasons:
-
-1) the memory policy APIs take physical node id's as arguments.  As mentioned
-   above, it is illegal to specify nodes that are not allowed in the cpuset.
-   The application must query the allowed nodes using the get_mempolicy()
-   API with the MPOL_F_MEMS_ALLOWED flag to determine the allowed nodes and
-   restrict itself to those nodes.  However, the resources available to a
-   cpuset can be changed by the system administrator, or a workload manager
-   application, at any time.  So, a task may still get errors attempting to
-   specify policy nodes, and must query the allowed memories again.
-
-2) when tasks in two cpusets share access to a memory region, such as shared
-   memory segments created by shmget() of mmap() with the MAP_ANONYMOUS and
-   MAP_SHARED flags, and any of the tasks install shared policy on the region,
-   only nodes whose memories are allowed in both cpusets may be used in the
-   policies.  Obtaining this information requires "stepping outside" the
-   memory policy APIs to use the cpuset information and requires that one
-   know in what cpusets other task might be attaching to the shared region.
-   Furthermore, if the cpusets' allowed memory sets are disjoint, "local"
-   allocation is the only valid policy.
+specified for the policy contains nodes that are not allowed by the cpuset and
+MPOL_F_RELATIVE_NODES is not used, the intersection of the set of nodes
+specified for the policy and the set of nodes with memory is used.  If the
+result is the empty set, the policy is considered invalid and cannot be
+installed.  If MPOL_F_RELATIVE_NODES is used, the policy's nodes are mapped
+onto and folded into the task's set of allowed nodes as previously described.
+
+The interaction of memory policies and cpusets can be problematic when tasks
+in two cpusets share access to a memory region, such as shared memory segments
+created by shmget() or mmap() with the MAP_ANONYMOUS and MAP_SHARED flags, and
+any of the tasks install shared policy on the region, only nodes whose
+memories are allowed in both cpusets may be used in the policies.  Obtaining
+this information requires "stepping outside" the memory policy APIs to use the
+cpuset information and requires that one know in what cpusets other tasks might
+be attaching to the shared region.  Furthermore, if the cpusets' allowed
+memory sets are disjoint, "local" allocation is the only valid policy.

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [patch 6/6] mempolicy: update NUMA memory policy documentation
  2008-03-01  0:45         ` [patch 6/6] mempolicy: update NUMA memory policy documentation David Rientjes
@ 2008-03-01  0:55           ` Randy Dunlap
  2008-03-01  1:14             ` David Rientjes
  0 siblings, 1 reply; 12+ messages in thread
From: Randy Dunlap @ 2008-03-01  0:55 UTC (permalink / raw)
  To: David Rientjes
  Cc: Andrew Morton, Paul Jackson, Christoph Lameter, Lee Schermerhorn,
	Andi Kleen, linux-kernel

David Rientjes wrote:
> Updates Documentation/vm/numa_memory_policy.txt and
> Documentation/filesystems/tmpfs.txt to describe optional mempolicy mode
> flags.
> 
> Cc: Christoph Lameter <clameter@sgi.com>
> Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
> Cc: Andi Kleen <ak@suse.de>
> Cc: Randy Dunlap <randy.dunlap@oracle.com>
> Signed-off-by: David Rientjes <rientjes@google.com>
> Signed-off-by: Paul Jackson <pj@sgi.com>
> ---
>  Documentation/filesystems/tmpfs.txt     |   12 +++
>  Documentation/vm/numa_memory_policy.txt |  131 +++++++++++++++++++++++-------
>  2 files changed, 112 insertions(+), 31 deletions(-)
> 
> diff --git a/Documentation/vm/numa_memory_policy.txt b/Documentation/vm/numa_memory_policy.txt
> --- a/Documentation/vm/numa_memory_policy.txt
> +++ b/Documentation/vm/numa_memory_policy.txt

> @@ -231,6 +234,80 @@ Components of Memory Policies
>  	    the temporary interleaved system default policy works in this
>  	    mode.
>  
> +   Linux memory policy supports the following optional mode flag:

                                                               flags:

> +
> +	MPOL_F_STATIC_NODES:  This flag specifies that the nodemask passed by
> +	the user should not be remapped if the task or VMA's set of allowed
> +	nodes changes after the memory policy has been defined.
> +
> +	    Without this flag, anytime a mempolicy is rebound because of a
> +	    change in the set of allowed nodes, the node (Preferred) or
> +	    nodemask (Bind, Interleave) is remapped to the new set of
> +	    allowed nodes.  This may result in nodes being used that were
> +	    previously undesired.
> +
> +	    With this flag, if the user specified nodes overlap with the

                                   user-specified

> +	    nodes allowed by the task's cpuset, then the memory policy is
> +	    applied to their intersection.  If the two sets of nodes do not
> +	    overlap, the Default policy is used.
> +
> +	    For example, consider a task that is attached to a cpuset with
> +	    mems 1-3 that sets an Interleave policy over the same set.  If
> +	    the cpuset's mems change to 3-5, the Interleave will now occur
> +	    over nodes 3, 4, and 5.  With this flag, however, since only node
> +	    3 is allowed from the user's nodemask, the "interleave" only
> +	    occurs over that node.  If no nodes from the user's nodemask are
> +	    now allowed, the Default behavior is used.
> +
> +	    MPOL_F_STATIC_NODES cannot be used with MPOL_F_RELATIVE_NODES.
> +


-- 
~Randy

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [patch 6/6] mempolicy: update NUMA memory policy documentation
  2008-03-01  0:55           ` Randy Dunlap
@ 2008-03-01  1:14             ` David Rientjes
  0 siblings, 0 replies; 12+ messages in thread
From: David Rientjes @ 2008-03-01  1:14 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Randy Dunlap, Paul Jackson, linux-kernel

On Fri, 29 Feb 2008, Randy Dunlap wrote:

> David Rientjes wrote:
> > Updates Documentation/vm/numa_memory_policy.txt and
> > Documentation/filesystems/tmpfs.txt to describe optional mempolicy mode
> > flags.
> > 
> > Cc: Christoph Lameter <clameter@sgi.com>
> > Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
> > Cc: Andi Kleen <ak@suse.de>
> > Cc: Randy Dunlap <randy.dunlap@oracle.com>
> > Signed-off-by: David Rientjes <rientjes@google.com>
> > Signed-off-by: Paul Jackson <pj@sgi.com>
> > ---
> >  Documentation/filesystems/tmpfs.txt     |   12 +++
> >  Documentation/vm/numa_memory_policy.txt |  131
> > +++++++++++++++++++++++-------
> >  2 files changed, 112 insertions(+), 31 deletions(-)
> > 
> > diff --git a/Documentation/vm/numa_memory_policy.txt
> > b/Documentation/vm/numa_memory_policy.txt
> > --- a/Documentation/vm/numa_memory_policy.txt
> > +++ b/Documentation/vm/numa_memory_policy.txt
> 
> > @@ -231,6 +234,80 @@ Components of Memory Policies
> >  	    the temporary interleaved system default policy works in this
> >  	    mode.
> >  +   Linux memory policy supports the following optional mode flag:
> 
>                                                               flags:
> 
> > +
> > +	MPOL_F_STATIC_NODES:  This flag specifies that the nodemask passed by
> > +	the user should not be remapped if the task or VMA's set of allowed
> > +	nodes changes after the memory policy has been defined.
> > +
> > +	    Without this flag, anytime a mempolicy is rebound because of a
> > +	    change in the set of allowed nodes, the node (Preferred) or
> > +	    nodemask (Bind, Interleave) is remapped to the new set of
> > +	    allowed nodes.  This may result in nodes being used that were
> > +	    previously undesired.
> > +
> > +	    With this flag, if the user specified nodes overlap with the
> 
>                                   user-specified
> 
> > +	    nodes allowed by the task's cpuset, then the memory policy is
> > +	    applied to their intersection.  If the two sets of nodes do not
> > +	    overlap, the Default policy is used.
> > +
> > +	    For example, consider a task that is attached to a cpuset with
> > +	    mems 1-3 that sets an Interleave policy over the same set.  If
> > +	    the cpuset's mems change to 3-5, the Interleave will now occur
> > +	    over nodes 3, 4, and 5.  With this flag, however, since only node
> > +	    3 is allowed from the user's nodemask, the "interleave" only
> > +	    occurs over that node.  If no nodes from the user's nodemask are
> > +	    now allowed, the Default behavior is used.
> > +
> > +	    MPOL_F_STATIC_NODES cannot be used with MPOL_F_RELATIVE_NODES.
> > +
> 

Andrew, please fold this into 
mempolicy-update-numa-memory-policy-documentation.patch if this patchset is 
added to -mm.

Corrects two errors in Documentation/vm/numa_memory_policy.txt as identified by 
Randy Dunlap <randy.dunlap@oracle.com>.

Cc: Randy Dunlap <randy.dunlap@oracle.com>
Cc: Paul Jackson <pj@sgi.com>
Signed-off-by: David Rientjes <rientjes@google.com>
---
 Documentation/vm/numa_memory_policy.txt |    4 ++--
 1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/Documentation/vm/numa_memory_policy.txt b/Documentation/vm/numa_memory_policy.txt
--- a/Documentation/vm/numa_memory_policy.txt
+++ b/Documentation/vm/numa_memory_policy.txt
@@ -234,7 +234,7 @@ Components of Memory Policies
 	    the temporary interleaved system default policy works in this
 	    mode.
 
-   Linux memory policy supports the following optional mode flag:
+   Linux memory policy supports the following optional mode flags:
 
 	MPOL_F_STATIC_NODES:  This flag specifies that the nodemask passed by
 	the user should not be remapped if the task or VMA's set of allowed
@@ -246,7 +246,7 @@ Components of Memory Policies
 	    allowed nodes.  This may result in nodes being used that were
 	    previously undesired.
 
-	    With this flag, if the user specified nodes overlap with the
+	    With this flag, if the user-specified nodes overlap with the
 	    nodes allowed by the task's cpuset, then the memory policy is
 	    applied to their intersection.  If the two sets of nodes do not
 	    overlap, the Default policy is used.

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [patch 1/6] mempolicy: convert MPOL constants to enum
  2008-03-01  0:44 [patch 1/6] mempolicy: convert MPOL constants to enum David Rientjes
  2008-03-01  0:45 ` [patch 2/6] mempolicy: support optional mode flags David Rientjes
@ 2008-03-03  7:17 ` Andrew Morton
  1 sibling, 0 replies; 12+ messages in thread
From: Andrew Morton @ 2008-03-03  7:17 UTC (permalink / raw)
  To: David Rientjes
  Cc: Paul Jackson, Christoph Lameter, Lee Schermerhorn, Andi Kleen,
	linux-kernel

On Fri, 29 Feb 2008 16:44:59 -0800 (PST) David Rientjes <rientjes@google.com> wrote:

> The mempolicy mode constants, MPOL_DEFAULT, MPOL_PREFERRED, MPOL_BIND,
> and MPOL_INTERLEAVE, are better declared as part of an enum since they
> are sequentially numbered and cannot be combined.
> 
> The policy member of struct mempolicy is also converted from type short
> to type unsigned short.  A negative policy does not have any legitimate
> meaning, so it is possible to change its type in preparation for adding
> optional mode flags later.
> 
> The equivalent member of struct shmem_sb_info is also changed from int
> to unsigned short.
> 
> For compatibility, the policy formal to get_mempolicy() remains as a
> pointer to an int:
> 
> 	int get_mempolicy(int *policy, unsigned long *nmask,
> 			  unsigned long maxnode, unsigned long addr,
> 			  unsigned long flags);
> 
> although the only possible values is the range of type unsigned short.

The collisions between this work and Mel's

mm-use-zonelists-instead-of-zones-when-direct-reclaiming-pages.patch
mm-introduce-node_zonelist-for-accessing-the-zonelist-for-a-gfp-mask.patch
mm-remember-what-the-preferred-zone-is-for-zone_statistics.patch
mm-use-two-zonelist-that-are-filtered-by-gfp-mask.patch
mm-have-zonelist-contains-structs-with-both-a-zone-pointer-and-zone_idx.patch
mm-filter-based-on-a-nodemask-as-well-as-a-gfp_mask.patch

are more than I'm comfortable with fixing, sorry.

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [patch 6/6] mempolicy: update NUMA memory policy documentation
  2008-02-26 17:34           ` Paul Jackson
@ 2008-02-26 21:23             ` David Rientjes
  0 siblings, 0 replies; 12+ messages in thread
From: David Rientjes @ 2008-02-26 21:23 UTC (permalink / raw)
  To: Paul Jackson
  Cc: akpm, clameter, Lee.Schermerhorn, ak, randy.dunlap, linux-kernel

On Tue, 26 Feb 2008, Paul Jackson wrote:

> Hmmm ... I've read this several times now ... still can't figure out
> what it's saying ;).  And it doesn't really explain key aspects of
> MPOL_F_RELATIVE_NODES, such as that it provides cpuset relative
> numbering (use nodes 0..N-1, regardless of your current cpuset, to
> refer to the first N nodes in whatever is your current cpuset.)
> 

I'll elaborate more on the MPOL_F_RELATIVE_NODES effect.

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [patch 6/6] mempolicy: update NUMA memory policy documentation
  2008-02-25 15:35         ` [patch 6/6] mempolicy: update NUMA memory policy documentation David Rientjes
@ 2008-02-26 17:34           ` Paul Jackson
  2008-02-26 21:23             ` David Rientjes
  0 siblings, 1 reply; 12+ messages in thread
From: Paul Jackson @ 2008-02-26 17:34 UTC (permalink / raw)
  To: David Rientjes
  Cc: akpm, clameter, Lee.Schermerhorn, ak, randy.dunlap, linux-kernel

+	MPOL_F_RELATIVE_NODES:  This flag specifies that the nodemask passed
+	by the user should remain in the same context as it is for the
+	current task or VMA's set of accessible nodes after the memory
+	policy has been defined.
+
+	    Without this flag (and without MPOL_F_STATIC_NODES), anytime a
+	    mempolicy is rebound because of a change in the set of
+	    accessible nodes, the node (Preferred) or nodemask (Bind,
+	    Interleave) is remapped to the new set of accessible nodes.
+	    With this flag, the remap is done to ensure the context of the
+	    previous nodemask with its set of allowed mems is preserved.

Hmmm ... I've read this several times now ... still can't figure out
what it's saying ;).  And it doesn't really explain key aspects of
MPOL_F_RELATIVE_NODES, such as that it provides cpuset relative
numbering (use nodes 0..N-1, regardless of your current cpuset, to
refer to the first N nodes in whatever is your current cpuset.)

Perhaps we'd be further ahead of the game if you started with the
documentation changes to Documentation/vm/numa_memory_policy.txt,
in my patch:
	Date: Sun, 23 Dec 2007 22:24:02 -0600
	From: Paul Jackson <pj@sgi.com>
	To: David Rientjes <rientjes@google.com>
	Cc: Lee.Schermerhorn@hp.com, clameter@sgi.com
	Subject: Re: [RFC] cpuset relative memory policies - second choice
	Message-Id: <20071223222402.5486bf0a.pj@sgi.com>

Change MPOL_MASK_REL to MPOL_F_RELATIVE_NODES and similar changes.

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.940.382.4214

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [patch 6/6] mempolicy: update NUMA memory policy documentation
  2008-02-25 15:35       ` [patch 5/6] mempolicy: add MPOL_F_RELATIVE_NODES flag David Rientjes
@ 2008-02-25 15:35         ` David Rientjes
  2008-02-26 17:34           ` Paul Jackson
  0 siblings, 1 reply; 12+ messages in thread
From: David Rientjes @ 2008-02-25 15:35 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Paul Jackson, Christoph Lameter, Lee Schermerhorn, Andi Kleen,
	Randy Dunlap, linux-kernel

Updates Documentation/vm/numa_memory_policy.txt and
Documentation/filesystems/tmpfs.txt to describe optional mempolicy mode
flags.

Cc: Paul Jackson <pj@sgi.com>
Cc: Christoph Lameter <clameter@sgi.com>
Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
Cc: Andi Kleen <ak@suse.de>
Cc: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: David Rientjes <rientjes@google.com>
---
 Documentation/filesystems/tmpfs.txt     |   19 +++++++++++
 Documentation/vm/numa_memory_policy.txt |   54 ++++++++++++++++++++++++++++--
 2 files changed, 69 insertions(+), 4 deletions(-)

diff --git a/Documentation/filesystems/tmpfs.txt b/Documentation/filesystems/tmpfs.txt
--- a/Documentation/filesystems/tmpfs.txt
+++ b/Documentation/filesystems/tmpfs.txt
@@ -92,6 +92,25 @@ NodeList format is a comma-separated list of decimal numbers and ranges,
 a range being two hyphen-separated decimal numbers, the smallest and
 largest node numbers in the range.  For example, mpol=bind:0-3,5,7,9-15
 
+It is possible to specify a static NodeList by appending '=static' to
+the memory policy mode in the mpol= argument.  This will require that
+tasks or VMA's restricted to a subset of allowed nodes are only allowed
+to effect the memory policy over those nodes.  No remapping of the
+NodeList when the policy is rebound, which is the default behavior, is
+allowed when '=static' is specified.  For example:
+
+mpol=bind=static:NodeList	will only allocate from each node in
+				the NodeList without remapping the
+				NodeList if the policy is rebound
+
+It is also possible to specify a relative NodeList by appending
+'=relative' to the memory policy mode in the mpol= argument.  When the
+allowed nodes of a task or VMA changes, the mempolicy nodemask is
+rebound to maintain the same context as the previously bound nodemask.
+For example, consider a relative mempolicy nodemask of 1-3 for a task
+that is allowed access to nodes 0-4.  If those permissions change to
+allow access to 3-7 instead, the mempolicy nodemask becomes 4-6.
+
 Note that trying to mount a tmpfs with an mpol option will fail if the
 running kernel does not support NUMA; and will fail if its nodelist
 specifies a node which is not online.  If your system relies on that
diff --git a/Documentation/vm/numa_memory_policy.txt b/Documentation/vm/numa_memory_policy.txt
--- a/Documentation/vm/numa_memory_policy.txt
+++ b/Documentation/vm/numa_memory_policy.txt
@@ -135,9 +135,11 @@ most general to most specific:
 
 Components of Memory Policies
 
-    A Linux memory policy is a tuple consisting of a "mode" and an optional set
-    of nodes.  The mode determine the behavior of the policy, while the
-    optional set of nodes can be viewed as the arguments to the behavior.
+    A Linux memory policy consists of a "mode", optional mode flags, and an
+    optional set of nodes.  The mode determines the behavior of the policy,
+    the optional mode flags determine the behavior of the mode, and the
+    optional set of nodes can be viewed as the arguments to the policy
+    behavior.
 
    Internally, memory policies are implemented by a reference counted
    structure, struct mempolicy.  Details of this structure will be discussed
@@ -231,6 +233,48 @@ Components of Memory Policies
 	    the temporary interleaved system default policy works in this
 	    mode.
 
+   Linux memory policy supports the following optional mode flag:
+
+	MPOL_F_STATIC_NODES:  This flag specifies that the nodemask passed by
+	the user should not be remapped if the task or VMA's set of accessible
+	nodes changes after the memory policy has been defined.
+
+	    Without this flag, anytime a mempolicy is rebound because of a
+	    change in the set of accessible nodes, the node (Preferred) or
+	    nodemask (Bind, Interleave) is remapped to the new set of
+	    accessible nodes.  This may result in nodes being used that were
+	    previously undesired.  With this flag, the policy is either
+	    effected over the user's specified nodemask or the Default
+	    behavior is used.
+
+	    For example, consider a task that is attached to a cpuset with
+	    mems 1-3 that sets an Interleave policy over the same set.  If
+	    the cpuset's mems change to 3-5, the Interleave will now occur
+	    over nodes 3, 4, and 5.  With this flag, however, since only
+	    node 3 is accessible from the user's nodemask, the "interleave"
+	    only occurs over that node.  If no nodes from the user's
+	    nodemask are now accessible, the Default behavior is used.
+
+	MPOL_F_RELATIVE_NODES:  This flag specifies that the nodemask passed
+	by the user should remain in the same context as it is for the
+	current task or VMA's set of accessible nodes after the memory
+	policy has been defined.
+
+	    Without this flag (and without MPOL_F_STATIC_NODES), anytime a
+	    mempolicy is rebound because of a change in the set of
+	    accessible nodes, the node (Preferred) or nodemask (Bind,
+	    Interleave) is remapped to the new set of accessible nodes.
+	    With this flag, the remap is done to ensure the context of the
+	    previous nodemask with its set of allowed mems is preserved.
+
+	    For example, consider a task that is attached to a cpuset with
+	    mems 1-3 that sets an Interleave policy over the same set.  If
+	    the cpuset's mems change to 3-7, the Interleave will now occur
+	    over nodes 3, 4, and 5.  With this flag, however, since a
+	    nodemask of 1-3 represents the contextually second, third, and
+	    fourth nodes of the allowed mems, the Interleave now occurs
+	    over nodes 4-6.
+
 MEMORY POLICY APIs
 
 Linux supports 3 system calls for controlling memory policy.  These APIS
@@ -251,7 +295,9 @@ Set [Task] Memory Policy:
 	Set's the calling task's "task/process memory policy" to mode
 	specified by the 'mode' argument and the set of nodes defined
 	by 'nmask'.  'nmask' points to a bit mask of node ids containing
-	at least 'maxnode' ids.
+	at least 'maxnode' ids.  Optional mode flags may be passed by
+	combining the 'mode' argument with the flag (for example:
+	MPOL_INTERLEAVE | MPOL_F_STATIC_NODES).
 
 	See the set_mempolicy(2) man page for more details
 

^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2008-03-03  7:18 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2008-03-01  0:44 [patch 1/6] mempolicy: convert MPOL constants to enum David Rientjes
2008-03-01  0:45 ` [patch 2/6] mempolicy: support optional mode flags David Rientjes
2008-03-01  0:45   ` [patch 3/6] mempolicy: add MPOL_F_STATIC_NODES flag David Rientjes
2008-03-01  0:45     ` [patch 4/6] mempolicy: add bitmap_onto() and bitmap_fold() operations David Rientjes
2008-03-01  0:45       ` [patch 5/6] mempolicy: add MPOL_F_RELATIVE_NODES flag David Rientjes
2008-03-01  0:45         ` [patch 6/6] mempolicy: update NUMA memory policy documentation David Rientjes
2008-03-01  0:55           ` Randy Dunlap
2008-03-01  1:14             ` David Rientjes
2008-03-03  7:17 ` [patch 1/6] mempolicy: convert MPOL constants to enum Andrew Morton
  -- strict thread matches above, loose matches on Subject: below --
2008-02-25 15:35 David Rientjes
2008-02-25 15:35 ` [patch 2/6] mempolicy: support optional mode flags David Rientjes
2008-02-25 15:35   ` [patch 3/6] mempolicy: add MPOL_F_STATIC_NODES flag David Rientjes
2008-02-25 15:35     ` [patch 4/6] mempolicy: add bitmap_onto() and bitmap_fold() operations David Rientjes
2008-02-25 15:35       ` [patch 5/6] mempolicy: add MPOL_F_RELATIVE_NODES flag David Rientjes
2008-02-25 15:35         ` [patch 6/6] mempolicy: update NUMA memory policy documentation David Rientjes
2008-02-26 17:34           ` Paul Jackson
2008-02-26 21:23             ` David Rientjes

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).