LKML Archive on lore.kernel.org
* [PATCH] change global zonelist order on NUMA v2
@ 2007-04-26  9:34 KAMEZAWA Hiroyuki
  2007-04-26  9:47 ` Andi Kleen
  2007-04-26 21:57 ` Lee Schermerhorn
  0 siblings, 2 replies; 19+ messages in thread
From: KAMEZAWA Hiroyuki @ 2007-04-26  9:34 UTC (permalink / raw)
  To: LKML; +Cc: Linux-MM, AKPM, Christoph Lameter, Andi Kleen


Changelog from V1 -> V2
- sysctl name is changed to be relaxed_zone_order
- NORMAL->NORMAL->....->DMA->DMA->DMA order (new ordering) is now default.
  NORMAL->DMA->NORMAL->DMA order (old ordering) is optional.
- added boot option to set relaxed_zone_order. ia64 is supported now.
- Added documentation

patch is against 2.6.21-rc7-mm2. tested on ia64 NUMA box. works well.

-Kame
> from here

Make zonelist creation policy selectable from sysctl v2.

[Description]
Assume a 2-node NUMA system where only node(0) has ZONE_DMA.
(ia64's ZONE_DMA is memory below 4GB, the equivalent of x86_64's ZONE_DMA32.)

In this case, current default (node0's) zonelist order is

Node(0)'s NORMAL -> Node(0)'s DMA -> Node(1)'s NORMAL.

This means Node(0)'s DMA will be used before Node(1)'s NORMAL.

This patch changes *default* zone order to

Node(0)'s NORMAL -> Node(1)'s NORMAL -> Node(0)'s DMA.

But if Node(0)'s memory is too small (near or below 4GB), Node(0)'s processes have
to allocate memory from Node(1) even if there is free memory on Node(0).
Some applications/users will dislike this.
This patch adds a knob to change zonelist ordering.
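
For illustration, a minimal user-space sketch, not part of the patch (the
2-node layout is the one assumed above), that prints both fallback orders:

/* sketch.c - print both zonelist orders for the 2-node example above.
 * Illustrative only: the topology table below is an assumption. */
#include <stdio.h>

enum zone_type { ZONE_DMA, ZONE_NORMAL, NR_ZONES };
static const char *zname[NR_ZONES] = { "DMA", "NORMAL" };
/* populated[node][zone]: only node(0) has ZONE_DMA */
static const int populated[2][NR_ZONES] = { { 1, 1 }, { 0, 1 } };

static void print_order(const char *tag, int node_major)
{
	int n, z;

	printf("%s:", tag);
	if (node_major) {		/* old order: node, then zone */
		for (n = 0; n < 2; n++)
			for (z = ZONE_NORMAL; z >= ZONE_DMA; z--)
				if (populated[n][z])
					printf(" Node(%d)%s", n, zname[z]);
	} else {			/* new order: zone, then node */
		for (z = ZONE_NORMAL; z >= ZONE_DMA; z--)
			for (n = 0; n < 2; n++)
				if (populated[n][z])
					printf(" Node(%d)%s", n, zname[z]);
	}
	printf("\n");
}

int main(void)
{
	print_order("relaxed_zone_order=0 (zone order, new default)", 0);
	print_order("relaxed_zone_order=1 (node order, old style)", 1);
	return 0;
}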

[What this patch adds]

command:
% echo 1 > /proc/sys/vm/relaxed_zone_order

will rebuild the zonelist in the following order (old style):

Node(0)'s NORMAL -> Node(0)'s DMA -> Node(1)'s NORMAL.

And you can specify the "relaxed_zone_order" boot option if the arch supports it.
But this style of zonelist can easily cause OOM-Kill because of ZONE_DMA
exhaustion. Be careful.

command:
% echo 0 > /proc/sys/vm/relaxed_zone_order
will rebuild the zonelist as
Node(0)'s NORMAL -> Node(1)'s NORMAL -> Node(0)'s DMA.

Added ia64 support and tested on ia64 2-node NUMA. Works well.

Signed-Off-By: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>

Index: linux-2.6.21-rc7-mm2/kernel/sysctl.c
===================================================================
--- linux-2.6.21-rc7-mm2.orig/kernel/sysctl.c
+++ linux-2.6.21-rc7-mm2/kernel/sysctl.c
@@ -80,6 +80,7 @@ extern int sysctl_drop_caches;
 extern int percpu_pagelist_fraction;
 extern int compat_log;
 extern int maps_protect;
+extern int sysctl_relaxed_zone_order;
 
 #if defined(CONFIG_ADAPTIVE_READAHEAD)
 extern int readahead_ratio;
@@ -893,6 +894,15 @@ static ctl_table vm_table[] = {
 		.extra1		= &zero,
 		.extra2		= &one_hundred,
 	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "relaxed_zone_order",
+		.data		= &sysctl_relaxed_zone_order,
+		.maxlen		= sizeof(sysctl_relaxed_zone_order),
+		.mode		= 0644,
+		.proc_handler	= &sysctl_relaxed_zone_order_handler,
+		.strategy	= &sysctl_intvec,
+	},
 #endif
 #if defined(CONFIG_X86_32) || \
    (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL))
Index: linux-2.6.21-rc7-mm2/mm/page_alloc.c
===================================================================
--- linux-2.6.21-rc7-mm2.orig/mm/page_alloc.c
+++ linux-2.6.21-rc7-mm2/mm/page_alloc.c
@@ -2045,7 +2045,7 @@ static int __meminit build_zonelists_nod
 
 #ifdef CONFIG_NUMA
 #define MAX_NODE_LOAD (num_online_nodes())
-static int __meminitdata node_load[MAX_NUMNODES];
+static int node_load[MAX_NUMNODES];
 /**
  * find_next_best_node - find the next node that should appear in a given node's fallback list
  * @node: node whose fallback list we're appending
@@ -2060,7 +2060,7 @@ static int __meminitdata node_load[MAX_N
  * on them otherwise.
  * It returns -1 if no node is found.
  */
-static int __meminit find_next_best_node(int node, nodemask_t *used_node_mask)
+static int find_next_best_node(int node, nodemask_t *used_node_mask)
 {
 	int n, val;
 	int min_val = INT_MAX;
@@ -2106,7 +2106,10 @@ static int __meminit find_next_best_node
 	return best_node;
 }
 
-static void __meminit build_zonelists(pg_data_t *pgdat)
+/*
+ * Build zonelists based on node locality.
+ */
+static void build_zonelists_locality_aware(pg_data_t *pgdat)
 {
 	int j, node, local_node;
 	enum zone_type i;
@@ -2155,6 +2158,81 @@ static void __meminit build_zonelists(pg
 	}
 }
 
+/*
+ * Build zonelist based on zone priority.
+ */
+static int node_order[MAX_NUMNODES];
+static void build_zonelists_zone_aware(pg_data_t *pgdat)
+{
+	int i, j, pos, zone_type, node, load;
+	nodemask_t used_mask;
+	int local_node, prev_node;
+	struct zone *z;
+	struct zonelist *zonelist;
+
+	for (i = 0; i < MAX_NR_ZONES; i++) {
+		zonelist = pgdat->node_zonelists + i;
+		zonelist->zones[0] = NULL;
+	}
+	memset(node_order, 0, sizeof(node_order));
+	local_node = pgdat->node_id;
+	load = num_online_nodes();
+	prev_node = local_node;
+	nodes_clear(used_mask);
+	j = 0;
+	while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
+		int distance = node_distance(local_node, node);
+		if (distance > RECLAIM_DISTANCE)
+			zone_reclaim_mode = 1;
+		if (distance != node_distance(local_node, prev_node))
+			node_load[node] = load;
+		node_order[j++] = node;
+		prev_node = node;
+		load--;
+	}
+	/* calculate node order */
+	for (i = 0; i < MAX_NR_ZONES; i++) {
+		zonelist = pgdat->node_zonelists + i;
+		pos = 0;
+		for (zone_type = i; zone_type >= 0; zone_type--) {
+			for (j = 0; j < num_online_nodes(); j++) {
+				node = node_order[j];
+				z = &NODE_DATA(node)->node_zones[zone_type];
+				if (populated_zone(z))
+					zonelist->zones[pos++] = z;
+			}
+		}
+		zonelist->zones[pos] = NULL;
+	}
+}
+
+int sysctl_relaxed_zone_order = 0;
+
+static void build_zonelists(pg_data_t *pgdat)
+{
+	if (sysctl_relaxed_zone_order)
+		build_zonelists_locality_aware(pgdat);
+	else
+		build_zonelists_zone_aware(pgdat);
+}
+
+int sysctl_relaxed_zone_order_handler(ctl_table *table, int write,
+		struct file *file, void __user *buffer, size_t *length,
+		loff_t *ppos)
+{
+	int oldval = sysctl_relaxed_zone_order;
+	proc_dointvec_minmax(table, write, file, buffer, length, ppos);
+	if (write && (oldval != sysctl_relaxed_zone_order))
+		build_all_zonelists();
+	return 0;
+}
+
+int __init cmdline_parse_relaxed_zone_order(char *p)
+{
+	sysctl_relaxed_zone_order = 1;
+	return 0;
+}
+
 /* Construct the zonelist performance cache - see further mmzone.h */
 static void __meminit build_zonelist_cache(pg_data_t *pgdat)
 {
@@ -2222,7 +2300,7 @@ static void __meminit build_zonelist_cac
 #endif	/* CONFIG_NUMA */
 
 /* return values int ....just for stop_machine_run() */
-static int __meminit __build_all_zonelists(void *dummy)
+static int __build_all_zonelists(void *dummy)
 {
 	int nid;
 
@@ -2233,12 +2311,13 @@ static int __meminit __build_all_zonelis
 	return 0;
 }
 
-void __meminit build_all_zonelists(void)
+void build_all_zonelists(void)
 {
 	if (system_state == SYSTEM_BOOTING) {
 		__build_all_zonelists(NULL);
 		cpuset_init_current_mems_allowed();
 	} else {
+		memset(node_load, 0, sizeof(node_load));
 		/* we have to stop all cpus to guaranntee there is no user
 		   of zonelist */
 		stop_machine_run(__build_all_zonelists, NULL, NR_CPUS);
Index: linux-2.6.21-rc7-mm2/include/linux/mmzone.h
===================================================================
--- linux-2.6.21-rc7-mm2.orig/include/linux/mmzone.h
+++ linux-2.6.21-rc7-mm2/include/linux/mmzone.h
@@ -608,6 +608,11 @@ int sysctl_min_unmapped_ratio_sysctl_han
 int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *, int,
 			struct file *, void __user *, size_t *, loff_t *);
 
+extern int sysctl_relaxed_zone_order_handler(struct ctl_table *, int,
+			struct file *, void __user *, size_t *, loff_t *);
+
+extern int cmdline_parse_relaxed_zone_order(char *p);
+
 #include <linux/topology.h>
 /* Returns the number of the current Node. */
 #ifndef numa_node_id
Index: linux-2.6.21-rc7-mm2/Documentation/kernel-parameters.txt
===================================================================
--- linux-2.6.21-rc7-mm2.orig/Documentation/kernel-parameters.txt
+++ linux-2.6.21-rc7-mm2/Documentation/kernel-parameters.txt
@@ -1500,6 +1500,10 @@ and is between 256 and 4096 characters. 
 			Format: <reboot_mode>[,<reboot_mode2>[,...]]
 			See arch/*/kernel/reboot.c or arch/*/kernel/process.c			
 
+	relaxed_zone_order [KNL,BOOT]
+			Give memory allocation priority to locality rather
+			than zone class. See Documentation/sysctl/vm.txt
+
 	reserve=	[KNL,BUGS] Force the kernel to ignore some iomem area
 
 	reservetop=	[X86-32]
Index: linux-2.6.21-rc7-mm2/Documentation/sysctl/vm.txt
===================================================================
--- linux-2.6.21-rc7-mm2.orig/Documentation/sysctl/vm.txt
+++ linux-2.6.21-rc7-mm2/Documentation/sysctl/vm.txt
@@ -34,6 +34,7 @@ Currently, these files are in /proc/sys/
 - swap_prefetch
 - readahead_ratio
 - readahead_hit_rate
+- relaxed_zone_order
 
 ==============================================================
 
@@ -275,3 +276,24 @@ Possible values can be:
 The larger value, the more capabilities, with more possible overheads.
 
 The default value is 1.
+
+=============================================================
+
+relaxed_zone_order
+
+This sysctl is only for NUMA.
+This allows you to allocate local memory more aggressively.
+Assume a 2-node NUMA system. The kernel memory allocation order on Node(0)
+is as follows with relaxed_zone_order=0 (the default):
+==
+Node(0)NORMAL -> Node(1)NORMAL -> Node(0)DMA -> Node(1)DMA(if any)
+==
+If relaxed_zone_order is set to 1, this order changes to
+==
+Node(0)NORMAL -> Node(0)DMA -> Node(1)NORMAL -> Node(1)DMA
+==
+This lets you use more local memory. But in this case, ZONE_DMA is
+used more eagerly than by default, so OOM-KILL in ZONE_DMA can happen more easily.
+
+The default value is 0.
+
Index: linux-2.6.21-rc7-mm2/arch/ia64/mm/discontig.c
===================================================================
--- linux-2.6.21-rc7-mm2.orig/arch/ia64/mm/discontig.c
+++ linux-2.6.21-rc7-mm2/arch/ia64/mm/discontig.c
@@ -27,6 +27,8 @@
 #include <asm/numa.h>
 #include <asm/sections.h>
 
+
+early_param("relaxed_zone_order", cmdline_parse_relaxed_zone_order);
 /*
  * Track per-node information needed to setup the boot memory allocator, the
  * per-node areas, and the real VM.



* Re: [PATCH] change global zonelist order on NUMA v2
  2007-04-26  9:34 [PATCH] change global zonelist order on NUMA v2 KAMEZAWA Hiroyuki
@ 2007-04-26  9:47 ` Andi Kleen
  2007-04-26 10:10   ` KAMEZAWA Hiroyuki
  2007-04-26 15:46   ` Christoph Lameter
  2007-04-26 21:57 ` Lee Schermerhorn
  1 sibling, 2 replies; 19+ messages in thread
From: Andi Kleen @ 2007-04-26  9:47 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki; +Cc: LKML, Linux-MM, AKPM, Christoph Lameter

On Thursday 26 April 2007 11:34:17 KAMEZAWA Hiroyuki wrote:
> 
> Changelog from V1 -> V2
> - sysctl name is changed to be relaxed_zone_order
> - NORMAL->NORMAL->....->DMA->DMA->DMA order (new ordering) is now default.
>   NORMAL->DMA->NORMAL->DMA order (old ordering) is optional.
> - added boot option to set relaxed_zone_order. ia64 is supported now.
> - Added documentation
> 
> patch is against 2.6.21-rc7-mm2. tested on ia64 NUMA box. works well.

IMHO the change should be default (without any options) unless someone
can come up with a good reason why not. On x86-64 it should be definitely
default.

If there is a good reason on some architecture or machine a user option is also not a 
good idea, but instead it should be set automatically by that architecture or machine
on boot.

-Andi



* Re: [PATCH] change global zonelist order on NUMA v2
  2007-04-26  9:47 ` Andi Kleen
@ 2007-04-26 10:10   ` KAMEZAWA Hiroyuki
  2007-04-26 10:53     ` [PATCH] change global zonelist order on NUMA v3 KAMEZAWA Hiroyuki
  2007-04-26 15:48     ` [PATCH] change global zonelist order on NUMA v2 Christoph Lameter
  2007-04-26 15:46   ` Christoph Lameter
  1 sibling, 2 replies; 19+ messages in thread
From: KAMEZAWA Hiroyuki @ 2007-04-26 10:10 UTC (permalink / raw)
  To: Andi Kleen; +Cc: linux-kernel, linux-mm, akpm, clameter

On Thu, 26 Apr 2007 11:47:44 +0200
Andi Kleen <ak@suse.de> wrote:

> On Thursday 26 April 2007 11:34:17 KAMEZAWA Hiroyuki wrote:
> > 
> > Changelog from V1 -> V2
> > - sysctl name is changed to be relaxed_zone_order
> > - NORMAL->NORMAL->....->DMA->DMA->DMA order (new ordering) is now default.
> >   NORMAL->DMA->NORMAL->DMA order (old ordering) is optional.
> > - added boot option to set relaxed_zone_order. ia64 is supported now.
> > - Added documentation
> > 
> > patch is against 2.6.21-rc7-mm2. tested on ia64 NUMA box. works well.
> 
> IMHO the change should be default (without any options) unless someone
> can come up with a good reason why not. On x86-64 it should be definitely
> default.
> 
> If there is a good reason on some architecture or machine a user option is also not a 
> good idea, but instead it should be set automatically by that architecture or machine
> on boot.
> 
Hmm...sounds reasonable. 

I have 2 ideas for an automatic way:

(1)Use new zonelist ordering always and move init_task's tied cpu to a
  cpu on the best node. 
  Child processes will start in good nodes even if Node 0 has small memory.

(2) Set Node's local highest zone to the top of zonelist.

I like (1). Does anyone have an idea?
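
A rough, not compile-tested sketch of what (1) could look like
(pick_best_node() is a placeholder for some heuristic; set_cpus_allowed()
and node_to_cpumask() are existing interfaces in this kernel series):

#include <linux/sched.h>
#include <linux/cpumask.h>
#include <linux/topology.h>

static void __init move_init_to_best_node(void)
{
	int nid = pick_best_node();	/* placeholder: e.g. the node with
					 * the most free NORMAL memory */
	if (nid >= 0) {
		cpumask_t mask = node_to_cpumask(nid);

		/* tie init to the best node so children inherit it */
		if (!cpus_empty(mask))
			set_cpus_allowed(&init_task, mask);
	}
}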

-Kame



* [PATCH] change global zonelist order on NUMA v3
  2007-04-26 10:10   ` KAMEZAWA Hiroyuki
@ 2007-04-26 10:53     ` KAMEZAWA Hiroyuki
  2007-04-26 16:00       ` Lee Schermerhorn
  2007-04-26 15:48     ` [PATCH] change global zonelist order on NUMA v2 Christoph Lameter
  1 sibling, 1 reply; 19+ messages in thread
From: KAMEZAWA Hiroyuki @ 2007-04-26 10:53 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki; +Cc: ak, linux-kernel, linux-mm, akpm, clameter

Changelog V2 -> V3

- removed zone ordering selection knobs...

much simpler one. just changing zonelist ordering.
tested on ia64 NUMA works well as expected.

-Kame


change zonelist order on NUMA v3.

[Description]
Assume a 2-node NUMA system where only node(0) has ZONE_DMA.
(ia64's ZONE_DMA is memory below 4GB, the equivalent of x86_64's ZONE_DMA32.)

In this case, current default (node0's) zonelist order is

Node(0)'s NORMAL -> Node(0)'s DMA -> Node(1)'s NORMAL.

This means Node(0)'s DMA will be used before Node(1)'s NORMAL.
This will cause OOM on ZONE_DMA easily.

This patch changes *default* zone order to

Node(0)'s NORMAL -> Node(1)'s NORMAL -> Node(0)'s DMA.

tested ia64 2-Node NUMA. works well.

Signed-Off-By: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>

Index: linux-2.6.21-rc7-mm2/mm/page_alloc.c
===================================================================
--- linux-2.6.21-rc7-mm2.orig/mm/page_alloc.c
+++ linux-2.6.21-rc7-mm2/mm/page_alloc.c
@@ -2023,6 +2023,7 @@ void show_free_areas(void)
  *
  * Add all populated zones of a node to the zonelist.
  */
+#ifndef CONFIG_NUMA
 static int __meminit build_zonelists_node(pg_data_t *pgdat,
 			struct zonelist *zonelist, int nr_zones, enum zone_type zone_type)
 {
@@ -2042,6 +2043,7 @@ static int __meminit build_zonelists_nod
 	} while (zone_type);
 	return nr_zones;
 }
+#endif
 
 #ifdef CONFIG_NUMA
 #define MAX_NODE_LOAD (num_online_nodes())
@@ -2106,52 +2108,51 @@ static int __meminit find_next_best_node
 	return best_node;
 }
 
+/*
+ * Build zonelist based on zone priority.
+ */
+static int __meminitdata node_order[MAX_NUMNODES];
 static void __meminit build_zonelists(pg_data_t *pgdat)
 {
-	int j, node, local_node;
-	enum zone_type i;
-	int prev_node, load;
-	struct zonelist *zonelist;
+	int i, j, pos, zone_type, node, load;
 	nodemask_t used_mask;
+	int local_node, prev_node;
+	struct zone *z;
+	struct zonelist *zonelist;
 
-	/* initialize zonelists */
 	for (i = 0; i < MAX_NR_ZONES; i++) {
 		zonelist = pgdat->node_zonelists + i;
 		zonelist->zones[0] = NULL;
 	}
-
-	/* NUMA-aware ordering of nodes */
+	memset(node_order, 0, sizeof(node_order));
 	local_node = pgdat->node_id;
 	load = num_online_nodes();
 	prev_node = local_node;
 	nodes_clear(used_mask);
+	j = 0;
 	while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
 		int distance = node_distance(local_node, node);
-
-		/*
-		 * If another node is sufficiently far away then it is better
-		 * to reclaim pages in a zone before going off node.
-		 */
 		if (distance > RECLAIM_DISTANCE)
 			zone_reclaim_mode = 1;
-
-		/*
-		 * We don't want to pressure a particular node.
-		 * So adding penalty to the first node in same
-		 * distance group to make it round-robin.
-		 */
-
 		if (distance != node_distance(local_node, prev_node))
-			node_load[node] += load;
+			node_load[node] = load;
+		node_order[j++] = node;
 		prev_node = node;
 		load--;
-		for (i = 0; i < MAX_NR_ZONES; i++) {
-			zonelist = pgdat->node_zonelists + i;
-			for (j = 0; zonelist->zones[j] != NULL; j++);
-
-	 		j = build_zonelists_node(NODE_DATA(node), zonelist, j, i);
-			zonelist->zones[j] = NULL;
+	}
+	/* calculate node order */
+	for (i = 0; i < MAX_NR_ZONES; i++) {
+		zonelist = pgdat->node_zonelists + i;
+		pos = 0;
+		for (zone_type = i; zone_type >= 0; zone_type--) {
+			for (j = 0; j < num_online_nodes(); j++) {
+				node = node_order[j];
+				z = &NODE_DATA(node)->node_zones[zone_type];
+				if (populated_zone(z))
+					zonelist->zones[pos++] = z;
+			}
 		}
+		zonelist->zones[pos] = NULL;
 	}
 }
 
@@ -2239,6 +2240,7 @@ void __meminit build_all_zonelists(void)
 		__build_all_zonelists(NULL);
 		cpuset_init_current_mems_allowed();
 	} else {
+		memset(node_load, 0, sizeof(node_load));
 		/* we have to stop all cpus to guaranntee there is no user
 		   of zonelist */
 		stop_machine_run(__build_all_zonelists, NULL, NR_CPUS);



* Re: [PATCH] change global zonelist order on NUMA v2
  2007-04-26  9:47 ` Andi Kleen
  2007-04-26 10:10   ` KAMEZAWA Hiroyuki
@ 2007-04-26 15:46   ` Christoph Lameter
  2007-04-26 15:51     ` Andi Kleen
  1 sibling, 1 reply; 19+ messages in thread
From: Christoph Lameter @ 2007-04-26 15:46 UTC (permalink / raw)
  To: Andi Kleen; +Cc: KAMEZAWA Hiroyuki, LKML, Linux-MM, AKPM

On Thu, 26 Apr 2007, Andi Kleen wrote:

> On Thursday 26 April 2007 11:34:17 KAMEZAWA Hiroyuki wrote:
> > 
> > Changelog from V1 -> V2
> > - sysctl name is changed to be relaxed_zone_order
> > - NORMAL->NORMAL->....->DMA->DMA->DMA order (new ordering) is now default.
> >   NORMAL->DMA->NORMAL->DMA order (old ordering) is optional.
> > - added boot option to set relaxed_zone_order. ia64 is supported now.
> > - Added documentation
> > 
> > patch is against 2.6.21-rc7-mm2. tested on ia64 NUMA box. works well.
> 
> IMHO the change should be default (without any options) unless someone
> can come up with a good reason why not. On x86-64 it should be definitely
> default.

It is not a good idea if node 0 has both DMA and NORMAL memory and normal 
memory is a small fraction of node memory. In that case lots of 
allocations get redirected to node 1.
 
> If there is a good reason on some architecture or machine a user option is also not a 
> good idea, but instead it should be set automatically by that architecture or machine
> on boot.

Right. That was my thinking.
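
For concreteness, a minimal user-space sketch of how such a case could be
detected automatically (the page-count arrays and the 1/2 threshold are
illustrative assumptions, not from any posted patch):

#include <stdio.h>

enum { ORDER_ZONE, ORDER_NODE };

/* low[n] = pages in DMA/DMA32 zones of node n, total[n] = all pages */
static int pick_zonelist_order(int nr, const unsigned long *low,
			       const unsigned long *total)
{
	int n;

	for (n = 0; n < nr; n++) {
		/* If low zones dominate a node, zone order would push
		 * most of that node's allocations off-node. */
		if (total[n] && low[n] > total[n] / 2)
			return ORDER_NODE;
	}
	return ORDER_ZONE;	/* low memory is scarce: keep it last */
}

int main(void)
{
	/* the case above: node 0 is mostly DMA, NORMAL is a small fraction */
	unsigned long low[2] = { 900000, 0 };
	unsigned long total[2] = { 1000000, 1000000 };

	printf("%s order\n",
	       pick_zonelist_order(2, low, total) == ORDER_NODE ?
	       "node" : "zone");
	return 0;
}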



* Re: [PATCH] change global zonelist order on NUMA v2
  2007-04-26 10:10   ` KAMEZAWA Hiroyuki
  2007-04-26 10:53     ` [PATCH] change global zonelist order on NUMA v3 KAMEZAWA Hiroyuki
@ 2007-04-26 15:48     ` Christoph Lameter
  2007-04-27  0:27       ` KAMEZAWA Hiroyuki
  1 sibling, 1 reply; 19+ messages in thread
From: Christoph Lameter @ 2007-04-26 15:48 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki; +Cc: Andi Kleen, linux-kernel, linux-mm, akpm

On Thu, 26 Apr 2007, KAMEZAWA Hiroyuki wrote:

> (1)Use new zonelist ordering always and move init_task's tied cpu to a
>   cpu on the best node. 
>   Child processes will start in good nodes even if Node 0 has small memory.

How about renumbering the nodes? Node 0 is the one with no DMA memory and 
node 1 may be the one with the DMA? That would take care of things even 
without core modifications. We can start on node 0 (which hardware 1) and 
consume the required memory for boot there not impacting the node with the 
DMA memory.




* Re: [PATCH] change global zonelist order on NUMA v2
  2007-04-26 15:46   ` Christoph Lameter
@ 2007-04-26 15:51     ` Andi Kleen
  0 siblings, 0 replies; 19+ messages in thread
From: Andi Kleen @ 2007-04-26 15:51 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: KAMEZAWA Hiroyuki, LKML, Linux-MM, AKPM

On Thursday 26 April 2007 17:46:35 Christoph Lameter wrote:

> 
> It is not a good idea if node 0 has both DMA and NORMAL memory and normal 
> memory is a small fraction of node memory. In that case lots of 
> allocations get redirected to node 1.

Good point yes. On x86-64 you might even have ZONE_DMA on node 0/1 and NORMAL
only on 3. I guess this needs to be detected somehow.

-Andi




* Re: [PATCH] change global zonelist order on NUMA v3
  2007-04-26 10:53     ` [PATCH] change global zonelist order on NUMA v3 KAMEZAWA Hiroyuki
@ 2007-04-26 16:00       ` Lee Schermerhorn
  2007-04-26 16:06         ` Christoph Lameter
  0 siblings, 1 reply; 19+ messages in thread
From: Lee Schermerhorn @ 2007-04-26 16:00 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: ak, linux-kernel, linux-mm, akpm, clameter, Eric Whitney

On Thu, 2007-04-26 at 19:53 +0900, KAMEZAWA Hiroyuki wrote:
> Changelog V2 -> V3
> 
> - removed zone ordering selection knobs...
> 
> much simpler one. just changing zonelist ordering.
> tested on ia64 NUMA works well as expected.
> 
> -Kame
> 
> 
> change zonelist order on NUMA v3.
> 
> [Description]
> Assume a 2-node NUMA system where only node(0) has ZONE_DMA.
> (ia64's ZONE_DMA is memory below 4GB, the equivalent of x86_64's ZONE_DMA32.)
> 
> In this case, current default (node0's) zonelist order is
> 
> Node(0)'s NORMAL -> Node(0)'s DMA -> Node(1)'s NORMAL.
> 
> This means Node(0)'s DMA will be used before Node(1)'s NORMAL.
> This will cause OOM on ZONE_DMA easily.

We have a similar situation on HP Integrity [ia64-based] platforms.
The platform supports cache-line interleaved memory or cell local
memory--a firmware configuration option.  Even configured for "100% Cell
Local Memory [CLM]", we have a small amount of interleaved memory at
physical address zero.  [I think the DIG spec may require this?  CLM
shows up at some ridiculously high physical address.]  Here, "small
amount" means ~512MB for a 4-node system and ~1G for a 16-node system.
This shows up as the only DMA memory [below 4G] on the system. 

The interleaved memory shows up as a pseudo-node "N" in an N-node
platform.  I.e., nodes 0-N-1 represent the real physical nodes and node
N is the pseudo-node containing only interleaved memory [no cpus nor
IO].  The firmware tells us, via the SLIT, that the interleaved
pseudo-node is closer to all physical nodes than any other real
node--apparently based on the theory that the average latency is less
because it contains some local memory.  This means that with the current
zone ordering, the DMA zone ends up as the second zone in each node's
Normal zonelist.  Thus, we are subject to the same DMA zone exhaustion
that this patch addresses.

I have tested the patch on our platforms and it appears to work as
advertised.

Thanks, Kame!

Question:  why remove the comments below?  Especially the ones that
attempt to explain the rationale for the logic?

Lee

> 
> This patch changes *default* zone order to
> 
> Node(0)'s NORMAL -> Node(1)'s NORMAL -> Node(0)'s DMA.
> 
> tested ia64 2-Node NUMA. works well.
> 
> Signed-Off-By: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> 
> Index: linux-2.6.21-rc7-mm2/mm/page_alloc.c
> ===================================================================
> --- linux-2.6.21-rc7-mm2.orig/mm/page_alloc.c
> +++ linux-2.6.21-rc7-mm2/mm/page_alloc.c
> @@ -2023,6 +2023,7 @@ void show_free_areas(void)
>   *
>   * Add all populated zones of a node to the zonelist.
>   */
> +#ifndef CONFIG_NUMA
>  static int __meminit build_zonelists_node(pg_data_t *pgdat,
>  			struct zonelist *zonelist, int nr_zones, enum zone_type zone_type)
>  {
> @@ -2042,6 +2043,7 @@ static int __meminit build_zonelists_nod
>  	} while (zone_type);
>  	return nr_zones;
>  }
> +#endif
>  
>  #ifdef CONFIG_NUMA
>  #define MAX_NODE_LOAD (num_online_nodes())
> @@ -2106,52 +2108,51 @@ static int __meminit find_next_best_node
>  	return best_node;
>  }
>  
> +/*
> + * Build zonelist based on zone priority.
> + */
> +static int __meminitdata node_order[MAX_NUMNODES];
>  static void __meminit build_zonelists(pg_data_t *pgdat)
>  {
> -	int j, node, local_node;
> -	enum zone_type i;
> -	int prev_node, load;
> -	struct zonelist *zonelist;
> +	int i, j, pos, zone_type, node, load;
>  	nodemask_t used_mask;
> +	int local_node, prev_node;
> +	struct zone *z;
> +	struct zonelist *zonelist;
>  
> -	/* initialize zonelists */
>  	for (i = 0; i < MAX_NR_ZONES; i++) {
>  		zonelist = pgdat->node_zonelists + i;
>  		zonelist->zones[0] = NULL;
>  	}
> -
> -	/* NUMA-aware ordering of nodes */
> +	memset(node_order, 0, sizeof(node_order));
>  	local_node = pgdat->node_id;
>  	load = num_online_nodes();
>  	prev_node = local_node;
>  	nodes_clear(used_mask);
> +	j = 0;
>  	while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
>  		int distance = node_distance(local_node, node);
> -
> -		/*
> -		 * If another node is sufficiently far away then it is better
> -		 * to reclaim pages in a zone before going off node.
> -		 */
>  		if (distance > RECLAIM_DISTANCE)
>  			zone_reclaim_mode = 1;
> -
> -		/*
> -		 * We don't want to pressure a particular node.
> -		 * So adding penalty to the first node in same
> -		 * distance group to make it round-robin.
> -		 */
> -
>  		if (distance != node_distance(local_node, prev_node))
> -			node_load[node] += load;
> +			node_load[node] = load;
> +		node_order[j++] = node;
>  		prev_node = node;
>  		load--;
> -		for (i = 0; i < MAX_NR_ZONES; i++) {
> -			zonelist = pgdat->node_zonelists + i;
> -			for (j = 0; zonelist->zones[j] != NULL; j++);
> -
> -	 		j = build_zonelists_node(NODE_DATA(node), zonelist, j, i);
> -			zonelist->zones[j] = NULL;
> +	}
> +	/* calculate node order */
> +	for (i = 0; i < MAX_NR_ZONES; i++) {
> +		zonelist = pgdat->node_zonelists + i;
> +		pos = 0;
> +		for (zone_type = i; zone_type >= 0; zone_type--) {
> +			for (j = 0; j < num_online_nodes(); j++) {
> +				node = node_order[j];
> +				z = &NODE_DATA(node)->node_zones[zone_type];
> +				if (populated_zone(z))
> +					zonelist->zones[pos++] = z;
> +			}
>  		}
> +		zonelist->zones[pos] = NULL;
>  	}
>  }
>  
> @@ -2239,6 +2240,7 @@ void __meminit build_all_zonelists(void)
>  		__build_all_zonelists(NULL);
>  		cpuset_init_current_mems_allowed();
>  	} else {
> +		memset(node_load, 0, sizeof(node_load));
>  		/* we have to stop all cpus to guaranntee there is no user
>  		   of zonelist */
>  		stop_machine_run(__build_all_zonelists, NULL, NR_CPUS);
> 



* Re: [PATCH] change global zonelist order on NUMA v3
  2007-04-26 16:00       ` Lee Schermerhorn
@ 2007-04-26 16:06         ` Christoph Lameter
  2007-04-26 16:29           ` Lee Schermerhorn
  0 siblings, 1 reply; 19+ messages in thread
From: Christoph Lameter @ 2007-04-26 16:06 UTC (permalink / raw)
  To: Lee Schermerhorn
  Cc: KAMEZAWA Hiroyuki, ak, linux-kernel, linux-mm, akpm, Eric Whitney

Hmmmm... One additional easy way to fix this would be to create a DMA 
node and place it very distant to other nodes. This would make it a 
precious system resource that is only used for

1. GFP_DMA allocations

2. If the memory on the other nodes is exhausted.



* Re: [PATCH] change global zonelist order on NUMA v3
  2007-04-26 16:06         ` Christoph Lameter
@ 2007-04-26 16:29           ` Lee Schermerhorn
  2007-04-26 16:36             ` Christoph Lameter
  0 siblings, 1 reply; 19+ messages in thread
From: Lee Schermerhorn @ 2007-04-26 16:29 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: KAMEZAWA Hiroyuki, ak, linux-kernel, linux-mm, akpm, Eric Whitney

On Thu, 2007-04-26 at 09:06 -0700, Christoph Lameter wrote:
> Hmmmm... One additional easy way to fix this would be to create a DMA 
> node and place it very distant to other nodes. This would make it a 
> precious system resource that is only used for
> 
> 1. GFP_DMA allocations
> 
> 2. If the memory on the other nodes is exhausted.
> 

This would solve the problem for "100% CLM" configurations where the
only thing in the interleaved pseudo-node is DMA zone.  However, we can
configure any %-age of CLM between 0% [fully interleaved, pseudo-SMP]
and "100%" [which is not really, as I've mentioned].  Interestingly,
older revs of our firmware set the SLIT distance for the interleaved
pseudo-node to 255 [or such], so it was always last.  Then someone
decided that the interleaved node was effectively closer than other
nodes...

I have been considering an HP-platform-specific boot option [handled by
a new ia64 machine vec op] to re-distance the interleaved node, but for
other platforms, such as Kame's, I think we still need the ability to
move the DMA zones last in the Normal zone lists.  Or, exclude them
altogether?

Lee



* Re: [PATCH] change global zonelist order on NUMA v3
  2007-04-26 16:29           ` Lee Schermerhorn
@ 2007-04-26 16:36             ` Christoph Lameter
  0 siblings, 0 replies; 19+ messages in thread
From: Christoph Lameter @ 2007-04-26 16:36 UTC (permalink / raw)
  To: Lee Schermerhorn
  Cc: KAMEZAWA Hiroyuki, ak, linux-kernel, linux-mm, akpm, Eric Whitney

On Thu, 26 Apr 2007, Lee Schermerhorn wrote:

> I have been considering an HP-platform-specific boot option [handled by
> a new ia64 machine vec op] to re-distance the interleaved node, but for
> other platforms, such as Kame's, I think we still need the ability to
> move the DMA zones last in the Normal zone lists.  Or, exclude them
> altogether?

Maybe a solution would be to have a dma_penalty option on boot? The dma
penalty is added to the dma zone. If it's higher than zero then the dma
zone will become a node at that distance from other nodes.

The default is zero which would leave it as is.

If you boot with

	dma_penalty=40

then a new SLIT entry is generated for the DMA zone and it's put at that
distance.
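
For illustration, a hypothetical sketch of how such an option could be wired
up (early_param() is the existing boot-option hook; dma_penalty itself and
the distance-table update are only this proposal, not merged code):

#include <linux/init.h>
#include <linux/kernel.h>

/* Hypothetical: parse "dma_penalty=" and stash it for the arch code. */
static int dma_penalty __initdata;

static int __init setup_dma_penalty(char *s)
{
	if (s)
		dma_penalty = simple_strtoul(s, NULL, 0);
	return 0;
}
early_param("dma_penalty", setup_dma_penalty);

/*
 * The arch's distance-table setup could then push the DMA-only
 * pseudo-node away from everyone, e.g. (pseudocode --
 * node_is_dma_only() and slit[][] are placeholders):
 *
 *	if (dma_penalty && node_is_dma_only(nid))
 *		slit[from][nid] += dma_penalty;
 */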



* Re: [PATCH] change global zonelist order on NUMA v2
  2007-04-26  9:34 [PATCH] change global zonelist order on NUMA v2 KAMEZAWA Hiroyuki
  2007-04-26  9:47 ` Andi Kleen
@ 2007-04-26 21:57 ` Lee Schermerhorn
  2007-04-26 22:07   ` Christoph Lameter
  2007-04-27  0:41   ` KAMEZAWA Hiroyuki
  1 sibling, 2 replies; 19+ messages in thread
From: Lee Schermerhorn @ 2007-04-26 21:57 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki; +Cc: LKML, Linux-MM, AKPM, Christoph Lameter, Andi Kleen

On Thu, 2007-04-26 at 18:34 +0900, KAMEZAWA Hiroyuki wrote:
> Changelog from V1 -> V2
> - sysctl name is changed to be relaxed_zone_order
> - NORMAL->NORMAL->....->DMA->DMA->DMA order (new ordering) is now default.
>   NORMAL->DMA->NORMAL->DMA order (old ordering) is optional.
> - added boot option to set relaxed_zone_order. ia64 is supported now.
> - Added documentation
> 
> patch is against 2.6.21-rc7-mm2. tested on ia64 NUMA box. works well.

[PATCH] factor/rework change zonelist order patch

Against 2.6.21-rc7 atop KAMEZAWA Hiroyuki's "change global zonelist
order on NUMA v2" patch.

This patch reworks Kame's patch to select the zonelist order as
follows:

1) factor common code out of the build_zonelists_*_aware() functions.
   Renamed these functions to "build_zonelists_in_{node|zone}_order()".
   Restored the comments about zone_reclaim and node "loading" in 
   build_zonelists().  Diff stats for page_alloc.c are inflated by some
   code reorg/movement [or maybe not].

2) renamed the sysctl and boot parameter to "numa_zonelist_order".  I had
   already started this against the v1 patch when Kame came out with his
   v2 patch, so I kept that name here.  One can specify values:

     "[Nn]ode" | "[Dd]efault" | "0" => default/node order
     "[Zz]one" | "1"                => alternate/zone order

   Being lazy, I only check the 1st character of the parameter string.

   Differentiate between default and explicitly specified "node" order
   in case we want to add arch-specific auto-tuning.  Admin/Operator can
   still override by specifying a non-default mode.

   Note that the sense of this switch [0/1] is opposite that of the
   "relaxed_zone_order" in Kame's v2 patch.  I.e., same as the v1 patch.
   Easy to change if we want the new behavior to become the default.

3) kept early_param() definition for boot parameter in mm/page_alloc.c,
   along with the handler function.  One less file to modify.

4) modified the two Documentation additions to match these changes.

I've tested various combinations [non-exhaustive], with an ad hoc
instrumentation patch, and it appears to work as expected [as I expect,
anyway] on ia64 NUMA.

Question:  do we need to rebuild the zonelist caches when we reorder
           the zones?  The z_to_n[] array appears to be dependent on
           the zonelist order... 

Also:      I see the "Movable" zones show up in 21-rc7-mm2.  This patch
           will cause Movable zone to overflow to remote movable zones
           before using local Normal memory in non-default, zone order.
           Is this what we want?

Signed-off-by:  Lee Schermerhorn <lee.schermerhorn@hp.com>

 Documentation/kernel-parameters.txt |   11 +
 Documentation/sysctl/vm.txt         |   38 +++--
 arch/ia64/mm/discontig.c            |    2 
 include/linux/mmzone.h              |    6 
 kernel/sysctl.c                     |   11 -
 mm/page_alloc.c                     |  229 ++++++++++++++++++++++--------------
 6 files changed, 182 insertions(+), 115 deletions(-)

Index: Linux/mm/page_alloc.c
===================================================================
--- Linux.orig/mm/page_alloc.c	2007-04-26 14:06:17.000000000 -0400
+++ Linux/mm/page_alloc.c	2007-04-26 16:51:31.000000000 -0400
@@ -2024,7 +2024,8 @@ void show_free_areas(void)
  * Add all populated zones of a node to the zonelist.
  */
 static int __meminit build_zonelists_node(pg_data_t *pgdat,
-			struct zonelist *zonelist, int nr_zones, enum zone_type zone_type)
+			struct zonelist *zonelist, int nr_zones,
+			enum zone_type zone_type)
 {
 	struct zone *zone;
 
@@ -2107,130 +2108,155 @@ static int find_next_best_node(int node,
 }
 
 /*
- * Build zonelists based on node locality.
+ * numa_zonelist_order:
+ *  0 [default] = order by ([node] distance, -zonetype)
+ *  1           = order by (-zonetype, [node] distance)
+ */
+static int zonelist_order = 0;
+
+/*
+ * command line option "numa_zonelist_order"
+ *      = "[dD]efault|[nN]ode"|"0" - default, order by node locality,
+ *         then zone within node.
+ *	= "[zZ]one"|"1" - order by zone, then by locality within zone
+ */
+char numa_zonelist_order[NUMA_ZONELIST_ORDER_LEN] = "default";
+
+static int __parse_numa_zonelist_order(char *s)
+{
+	if (*s == 'd' || *s == 'D') {
+		strncpy(numa_zonelist_order, "default",
+					NUMA_ZONELIST_ORDER_LEN);
+		zonelist_order = 0;
+	} else if (*s == 'n' || *s == 'N' || *s == '0') {
+		strncpy(numa_zonelist_order, "node",
+					NUMA_ZONELIST_ORDER_LEN);
+		zonelist_order = 0;
+	} else if (*s == 'z' || *s == 'Z' || *s == '1') {
+		strncpy(numa_zonelist_order, "zone",
+					NUMA_ZONELIST_ORDER_LEN);
+		zonelist_order = 1;
+	} else {
+		printk(KERN_WARNING
+			"Ignoring invalid numa_zonelist_order value:  "
+			"%s\n", s);
+		return -EINVAL;
+	}
+	return 0;
+}
+
+static __init int setup_numa_zonelist_order(char *s)
+{
+	if (s)
+		return __parse_numa_zonelist_order(s);
+	return 0;
+}
+early_param("numa_zonelist_order", setup_numa_zonelist_order);
+
+/*
+ * Build zonelists ordered by node and zones within node.
+ * This results in maximum locality--normal zone overflows into local
+ * DMA zone, if any--but risks exhausting DMA zone.
  */
-static void build_zonelists_locality_aware(pg_data_t *pgdat)
+static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
 {
-	int j, node, local_node;
 	enum zone_type i;
-	int prev_node, load;
+	int j;
 	struct zonelist *zonelist;
-	nodemask_t used_mask;
 
-	/* initialize zonelists */
 	for (i = 0; i < MAX_NR_ZONES; i++) {
 		zonelist = pgdat->node_zonelists + i;
-		zonelist->zones[0] = NULL;
-	}
+		for (j = 0; zonelist->zones[j] != NULL; j++);
 
-	/* NUMA-aware ordering of nodes */
-	local_node = pgdat->node_id;
-	load = num_online_nodes();
-	prev_node = local_node;
-	nodes_clear(used_mask);
-	while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
-		int distance = node_distance(local_node, node);
-
-		/*
-		 * If another node is sufficiently far away then it is better
-		 * to reclaim pages in a zone before going off node.
-		 */
-		if (distance > RECLAIM_DISTANCE)
-			zone_reclaim_mode = 1;
+ 		j = build_zonelists_node(NODE_DATA(node), zonelist, j, i);
+		zonelist->zones[j] = NULL;
+	}
+}
 
-		/*
-		 * We don't want to pressure a particular node.
-		 * So adding penalty to the first node in same
-		 * distance group to make it round-robin.
-		 */
+/*
+ * Build zonelists ordered by zone and nodes within zones.
+ * This results in conserving DMA zone[s] until all Normal memory is
+ * exhausted, but results in overflowing to remote node while memory
+ * may still exist in local DMA zone.
+ */
+static int node_order[MAX_NUMNODES];
 
-		if (distance != node_distance(local_node, prev_node))
-			node_load[node] += load;
-		prev_node = node;
-		load--;
-		for (i = 0; i < MAX_NR_ZONES; i++) {
-			zonelist = pgdat->node_zonelists + i;
-			for (j = 0; zonelist->zones[j] != NULL; j++);
+static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
+{
+	enum zone_type i;
+	int pos, j, node;
+	int zone_type;		/* needs to be signed */
+	struct zone *z;
+	struct zonelist *zonelist;
 
-	 		j = build_zonelists_node(NODE_DATA(node), zonelist, j, i);
-			zonelist->zones[j] = NULL;
+	for (i = 0; i < MAX_NR_ZONES; i++) {
+		zonelist = pgdat->node_zonelists + i;
+		pos = 0;
+		for (zone_type = i; zone_type >= 0; zone_type--) {
+			for (j = 0; j < nr_nodes; j++) {
+				node = node_order[j];
+				z = &NODE_DATA(node)->node_zones[zone_type];
+				if (populated_zone(z))
+					zonelist->zones[pos++] = z;
+			}
 		}
+		zonelist->zones[pos] = NULL;
 	}
 }
 
-/*
- * Build zonelist based on zone priority.
- */
-static int node_order[MAX_NUMNODES];
-static void build_zonelists_zone_aware(pg_data_t *pgdat)
+static void build_zonelists(pg_data_t *pgdat)
 {
-	int i, j, pos, zone_type, node, load;
+	int j, node, load;
+	enum zone_type i;
 	nodemask_t used_mask;
 	int local_node, prev_node;
-	struct zone *z;
 	struct zonelist *zonelist;
 
+	/* initialize zonelists */
 	for (i = 0; i < MAX_NR_ZONES; i++) {
 		zonelist = pgdat->node_zonelists + i;
 		zonelist->zones[0] = NULL;
 	}
-	memset(node_order, 0, sizeof(node_order));
+
+	/* NUMA-aware ordering of nodes */
 	local_node = pgdat->node_id;
 	load = num_online_nodes();
 	prev_node = local_node;
 	nodes_clear(used_mask);
+
+	memset(node_order, 0, sizeof(node_order));
 	j = 0;
+
 	while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
 		int distance = node_distance(local_node, node);
+
+		/*
+		 * If another node is sufficiently far away then it is better
+		 * to reclaim pages in a zone before going off node.
+		 */
 		if (distance > RECLAIM_DISTANCE)
 			zone_reclaim_mode = 1;
+
+		/*
+		 * We don't want to pressure a particular node.
+		 * So adding penalty to the first node in same
+		 * distance group to make it round-robin.
+		 */
 		if (distance != node_distance(local_node, prev_node))
 			node_load[node] = load;
-		node_order[j++] = node;
+
 		prev_node = node;
 		load--;
+		if (!zonelist_order)	/* default */
+			build_zonelists_in_node_order(pgdat, node);
+		else
+			node_order[j++] = node;	/* remember order */
 	}
-	/* calculate node order */
-	for (i = 0; i < MAX_NR_ZONES; i++) {
-		zonelist = pgdat->node_zonelists + i;
-		pos = 0;
-		for (zone_type = i; zone_type >= 0; zone_type--) {
-			for (j = 0; j < num_online_nodes(); j++) {
-				node = node_order[j];
-				z = &NODE_DATA(node)->node_zones[zone_type];
-				if (populated_zone(z))
-					zonelist->zones[pos++] = z;
-			}
-		}
-		zonelist->zones[pos] = NULL;
-	}
-}
 
-int sysctl_relaxed_zone_order = 0;
-
-static void build_zonelists(pg_data_t *pgdat)
-{
-	if (sysctl_relaxed_zone_order)
-		build_zonelists_locality_aware(pgdat);
-	else
-		build_zonelists_zone_aware(pgdat);
-}
-
-int sysctl_relaxed_zone_order_handler(ctl_table *table, int write,
-		struct file *file, void __user *buffer, size_t *length,
-		loff_t *ppos)
-{
-	int oldval = sysctl_relaxed_zone_order;
-	proc_dointvec_minmax(table, write, file, buffer, length, ppos);
-	if (write && (oldval != sysctl_relaxed_zone_order))
-		build_all_zonelists();
-	return 0;
-}
-
-int __init cmdline_parse_relaxed_zone_order(char *p)
-{
-	sysctl_relaxed_zone_order = 1;
-	return 0;
+	if (zonelist_order) {
+		/* calculate node order -- i.e., DMA last! */
+		build_zonelists_in_zone_order(pgdat, j);
+	}
 }
 
 /* Construct the zonelist performance cache - see further mmzone.h */
@@ -2251,6 +2277,37 @@ static void __meminit build_zonelist_cac
 	}
 }
 
+/*
+ * sysctl handler for numa_zonelist_order
+ */
+int numa_zonelist_order_handler(ctl_table *table, int write,
+		struct file *file, void __user *buffer, size_t *length,
+		loff_t *ppos)
+{
+	char saved_string[NUMA_ZONELIST_ORDER_LEN];
+	int ret;
+
+	if (write)
+		strncpy(saved_string, (char*)table->data,
+			NUMA_ZONELIST_ORDER_LEN);
+	ret = proc_dostring(table, write, file, buffer, length, ppos);
+	if (ret)
+		return ret;
+	if (write) {
+		int oldval = zonelist_order;
+		if (__parse_numa_zonelist_order((char*)table->data)) {
+			/*
+			 * bogus value.  restore saved string
+			 */
+			strncpy((char*)table->data, saved_string,
+				NUMA_ZONELIST_ORDER_LEN);
+			zonelist_order = oldval;
+		} else if (oldval != zonelist_order)
+			build_all_zonelists();
+	}
+	return 0;
+}
+
 #else	/* CONFIG_NUMA */
 
 static void __meminit build_zonelists(pg_data_t *pgdat)
Index: Linux/kernel/sysctl.c
===================================================================
--- Linux.orig/kernel/sysctl.c	2007-04-26 14:06:17.000000000 -0400
+++ Linux/kernel/sysctl.c	2007-04-26 16:46:29.000000000 -0400
@@ -80,7 +80,6 @@ extern int sysctl_drop_caches;
 extern int percpu_pagelist_fraction;
 extern int compat_log;
 extern int maps_protect;
-extern int sysctl_relaxed_zone_order;
 
 #if defined(CONFIG_ADAPTIVE_READAHEAD)
 extern int readahead_ratio;
@@ -896,12 +895,12 @@ static ctl_table vm_table[] = {
 	},
 	{
 		.ctl_name	= CTL_UNNUMBERED,
-		.procname	= "relaxed_zone_order",
-		.data		= &sysctl_relaxed_zone_order,
-		.maxlen		= sizeof(sysctl_relaxed_zone_order),
+		.procname	= "numa_zonelist_order",
+		.data		= &numa_zonelist_order,
+		.maxlen		= NUMA_ZONELIST_ORDER_LEN,
 		.mode		= 0644,
-		.proc_handler	= &sysctl_relaxed_zone_order_handler,
-		.strategy	= &sysctl_intvec,
+		.proc_handler	= &numa_zonelist_order_handler,
+		.strategy	= &sysctl_string,
 	},
 #endif
 #if defined(CONFIG_X86_32) || \
Index: Linux/include/linux/mmzone.h
===================================================================
--- Linux.orig/include/linux/mmzone.h	2007-04-26 13:35:49.000000000 -0400
+++ Linux/include/linux/mmzone.h	2007-04-26 16:51:15.000000000 -0400
@@ -608,10 +608,10 @@ int sysctl_min_unmapped_ratio_sysctl_han
 int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *, int,
 			struct file *, void __user *, size_t *, loff_t *);
 
-extern int sysctl_relaxed_zone_order_handler(struct ctl_table *, int,
+extern int numa_zonelist_order_handler(struct ctl_table *, int,
 			struct file *, void __user *, size_t *, loff_t *);
-
-extern int cmdline_parse_relaxed_zone_order(char *p);
+extern char numa_zonelist_order[];
+#define NUMA_ZONELIST_ORDER_LEN 16	/* string buffer size */
 
 #include <linux/topology.h>
 /* Returns the number of the current Node. */
Index: Linux/arch/ia64/mm/discontig.c
===================================================================
--- Linux.orig/arch/ia64/mm/discontig.c	2007-04-26 13:35:49.000000000 -0400
+++ Linux/arch/ia64/mm/discontig.c	2007-04-26 14:10:23.000000000 -0400
@@ -27,8 +27,6 @@
 #include <asm/numa.h>
 #include <asm/sections.h>
 
-
-early_param("relaxed_zone_order", cmdline_parse_relaxed_zone_order);
 /*
  * Track per-node information needed to setup the boot memory allocator, the
  * per-node areas, and the real VM.
Index: Linux/Documentation/kernel-parameters.txt
===================================================================
--- Linux.orig/Documentation/kernel-parameters.txt	2007-04-26 13:35:49.000000000 -0400
+++ Linux/Documentation/kernel-parameters.txt	2007-04-26 15:38:54.000000000 -0400
@@ -1500,9 +1500,14 @@ and is between 256 and 4096 characters. 
 			Format: <reboot_mode>[,<reboot_mode2>[,...]]
 			See arch/*/kernel/reboot.c or arch/*/kernel/process.c			
 
-	relaxed_zone_order [KNL,BOOT]
-			give memory allocation priority to locality rather
-			than zone class. See Documentation/sysctl/vm.txt
+	numa_zonelist_order [KNL,BOOT]
+			Select memory allocation zonelist order for NUMA
+			platform.  Default a.k.a. "Node order" orders the
+			zonelists by node [locality], then zones within
+			nodes.  "Zone order" orders the zonelists by zone,
+			then nodes within the zone.  This moves DMA zone,
+			if any, to the end of the allocation lists.
+			See also Documentation/sysctl/vm.txt
 
 	reserve=	[KNL,BUGS] Force the kernel to ignore some iomem area
 
Index: Linux/Documentation/sysctl/vm.txt
===================================================================
--- Linux.orig/Documentation/sysctl/vm.txt	2007-04-26 13:35:49.000000000 -0400
+++ Linux/Documentation/sysctl/vm.txt	2007-04-26 15:48:20.000000000 -0400
@@ -34,7 +34,7 @@ Currently, these files are in /proc/sys/
 - swap_prefetch
 - readahead_ratio
 - readahead_hit_rate
-- relaxed_zone_order
+- numa_zonelist_order
 
 ==============================================================
 
@@ -279,21 +279,29 @@ The default value is 1.
 
 =============================================================
 
-relaxed_zone_order
+numa_zonelist_order
 
 This sysctl is only for NUMA.
-This allows you to allocate local memory more aggresively.
-Assume 2 Node NUMA.The kernel memory allocateion order on Node(0)
-is following. relaxed_zone_order=0 in this case.(default)
-==
-Node(0)NORMAL -> Node(1)NORMAL -> Node(0)DMA -> Node(1)DMA(if any)
-==
-If set to relaxed_zone_order=1, This option changes this order to be
-==
-Node(0)NORMAL -> Node(0)DMA -> Node(1)NORMA -> Node(1)DMA
-==
-Then you can use more local memory. But, in this case, ZONE_DMA can be
-used more eagerly than default. Then, OOM-KILL in ZONE_DMA can happen easier.
 
-The default value is 0.
+numa_zonelist_order selects the order of the memory allocation zonelists.
+The default order [a.k.a. "node order"] orders the zonelists by node, then
+by zone within each node.  For example, assume 2 Node NUMA.  The default
+kernel memory allocation order on Node(0) will be:
+
+	Node(0)NORMAL -> Node(0)DMA -> Node(1)NORMAL -> Node(1)DMA(if any)
+
+Thus, allocations that request Node(0) NORMAL may overflow onto Node(0)DMA
+first.  This provides maximum locality, but risks exhausting all of DMA
+memory while NORMAL memory exists elsewhere on the system.  This can result
+in OOM-KILL in ZONE_DMA.  You can specify "[Dd]efault", "[Nn]ode" or "0" to
+request default/node order.
+
+If numa_zonelist_order is set to "zone" order, the kernel memory allocation
+order on Node(0) becomes:
+
+	Node(0)NORMAL -> Node(1)NORMAL -> Node(0)DMA -> Node(1)DMA(if any)
+
+In this mode, DMA memory will be used in place of NORMAL memory only when
+all NORMAL zones are exhausted.  Specify "[Zz]one" or "1" for zone order.
+
 




* Re: [PATCH] change global zonelist order on NUMA v2
  2007-04-26 21:57 ` Lee Schermerhorn
@ 2007-04-26 22:07   ` Christoph Lameter
  2007-04-27  0:41   ` KAMEZAWA Hiroyuki
  1 sibling, 0 replies; 19+ messages in thread
From: Christoph Lameter @ 2007-04-26 22:07 UTC (permalink / raw)
  To: Lee Schermerhorn; +Cc: KAMEZAWA Hiroyuki, LKML, Linux-MM, AKPM, Andi Kleen

On Thu, 26 Apr 2007, Lee Schermerhorn wrote:

> Against 2.6.21-rc7 atop KAMEZAWA Hiroyuki's "change global zonelist
> order on NUMA v2" patch.

Hmmm.. hmmm... serious hackery here. Isn't there some way to simplify the
core impact and make the arch select a strategy? A boot option would have
less impact (I am a bit concerned about switching zonelists midstream).

The arch should be able to specify a default zone order. So the best thing 
would be to make the zone orders configurable in the page allocator and 
then have the arch code determine a default order depending on the 
hardware that we are running on.

Make sure that the !CONFIG_ZONE_DMA case works.

What about ZONE_DMA32 support?


* Re: [PATCH] change global zonelist order on NUMA v2
  2007-04-26 15:48     ` [PATCH] change global zonelist order on NUMA v2 Christoph Lameter
@ 2007-04-27  0:27       ` KAMEZAWA Hiroyuki
  2007-04-27  1:25         ` Christoph Lameter
  2007-04-30 14:09         ` Lee Schermerhorn
  0 siblings, 2 replies; 19+ messages in thread
From: KAMEZAWA Hiroyuki @ 2007-04-27  0:27 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: ak, linux-kernel, linux-mm, akpm

On Thu, 26 Apr 2007 08:48:19 -0700 (PDT)
Christoph Lameter <clameter@sgi.com> wrote:

> On Thu, 26 Apr 2007, KAMEZAWA Hiroyuki wrote:
> 
> > (1)Use new zonelist ordering always and move init_task's tied cpu to a
> >   cpu on the best node. 
> >   Child processes will start in good nodes even if Node 0 has small memory.
> 
> How about renumbering the nodes? Node 0 is the one with no DMA memory and 
> node 1 may be the one with the DMA? That would take care of things even 
> without core modifications. We can start on node 0 (which hardware 1) and 
> consume the required memory for boot there not impacting the node with the 
> DMA memory.
> 
It seems a bit complicated. If we do so, following can occur,

Node1: cpu0,1,2,3
Node0: cpu4,5,6,7

the system layout may not look like what users imagine.

-Kame



* Re: [PATCH] change global zonelist order on NUMA v2
  2007-04-26 21:57 ` Lee Schermerhorn
  2007-04-26 22:07   ` Christoph Lameter
@ 2007-04-27  0:41   ` KAMEZAWA Hiroyuki
  1 sibling, 0 replies; 19+ messages in thread
From: KAMEZAWA Hiroyuki @ 2007-04-27  0:41 UTC (permalink / raw)
  To: Lee Schermerhorn; +Cc: linux-kernel, linux-mm, akpm, clameter, ak

On Thu, 26 Apr 2007 17:57:40 -0400
Lee Schermerhorn <Lee.Schermerhorn@hp.com> wrote:

> On Thu, 2007-04-26 at 18:34 +0900, KAMEZAWA Hiroyuki wrote:
> > Changelog from V1 -> V2
> > - sysctl name is changed to be relaxed_zone_order
> > - NORMAL->NORMAL->....->DMA->DMA->DMA order (new ordering) is now default.
> >   NORMAL->DMA->NORMAL->DMA order (old ordering) is optional.
> > - added boot option to set relaxed_zone_order. ia64 is supported now.
> > - Added documentation
> > 
> > patch is against 2.6.21-rc7-mm2. tested on ia64 NUMA box. works well.
> 
> [PATCH] factor/rework change zonelist order patch
> 
> Against 2.6.21-rc7 atop KAMEZAWA Hiroyuki's "change global zonelist
> order on NUMA v2" patch.
> 
Hi, this looks 'easier-to-read' than mine. Thanks.


> 3) kept early_param() definition for boot parameter in mm/page_alloc.c,
>    along with the handler function.  One less file to modify.
> 
I put early_param() in the arch-dependent part just because no generic code
except for pci seems to call it. If it is allowed, I welcome this change.


> 4) modified the two Documentation additions to match these changes.
> 

> I've tested various combinations [non-exhaustive], with an ad hoc
> instrumentation patch, and it appears to work as expected [as I expect,
> anyway] on ia64 NUMA.
> 
> Question:  do we need to rebuild the zonelist caches when we reorder
>            the zones?  The z_to_n[] array appears to be dependent on
>            the zonelist order... 
> 
maybe no.


> Also:      I see the "Movable" zones show up in 21-rc7-mm2.  This patch
>            will cause Movable zone to overflow to remote movable zones
>            before using local Normal memory in non-default, zone order.
>            Is this what we want?
> 
From my point of view, it's what I want. What we have to do is
establish a way to create ZONE_MOVABLE with a suitable size on each node.

I'll merge your change to my set and add "automatic detection" support.

Thank you.
-Kame



* Re: [PATCH] change global zonelist order on NUMA v2
  2007-04-27  0:27       ` KAMEZAWA Hiroyuki
@ 2007-04-27  1:25         ` Christoph Lameter
  2007-04-27  1:50           ` KAMEZAWA Hiroyuki
  2007-04-30 15:03           ` Lee Schermerhorn
  2007-04-30 14:09         ` Lee Schermerhorn
  1 sibling, 2 replies; 19+ messages in thread
From: Christoph Lameter @ 2007-04-27  1:25 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki; +Cc: ak, linux-kernel, linux-mm, akpm

On Fri, 27 Apr 2007, KAMEZAWA Hiroyuki wrote:

> > DMA memory.
> > 
> It seems a bit complicated. If we do so, following can occur,
> 
> Node1: cpu0,1,2,3
> Node0: cpu4,5,6,7

We were discussing a two node NUMA system. If you have more put it onto 
the last.


* Re: [PATCH] change global zonelist order on NUMA v2
  2007-04-27  1:25         ` Christoph Lameter
@ 2007-04-27  1:50           ` KAMEZAWA Hiroyuki
  2007-04-30 15:03           ` Lee Schermerhorn
  1 sibling, 0 replies; 19+ messages in thread
From: KAMEZAWA Hiroyuki @ 2007-04-27  1:50 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: ak, linux-kernel, linux-mm, akpm

On Thu, 26 Apr 2007 18:25:10 -0700 (PDT)
Christoph Lameter <clameter@sgi.com> wrote:

> On Fri, 27 Apr 2007, KAMEZAWA Hiroyuki wrote:
> 
> > > DMA memory.
> > > 
> > It seems a bit complicated. If we do so, following can occur,
> > 
> > Node1: cpu0,1,2,3
> > Node0: cpu4,5,6,7
> 
> We were discussing a two node NUMA system. If you have more put it onto 
> the last.
> 
Hmm, from a technical point of view, renumbering may be an option.
But I feel that it doesn't look natural and differs from users' expectations...

-Kame



* Re: [PATCH] change global zonelist order on NUMA v2
  2007-04-27  0:27       ` KAMEZAWA Hiroyuki
  2007-04-27  1:25         ` Christoph Lameter
@ 2007-04-30 14:09         ` Lee Schermerhorn
  1 sibling, 0 replies; 19+ messages in thread
From: Lee Schermerhorn @ 2007-04-30 14:09 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Christoph Lameter, ak, linux-kernel, linux-mm, akpm, mike.stroyan

On Fri, 2007-04-27 at 09:27 +0900, KAMEZAWA Hiroyuki wrote:
> On Thu, 26 Apr 2007 08:48:19 -0700 (PDT)
> Christoph Lameter <clameter@sgi.com> wrote:
> 
> > On Thu, 26 Apr 2007, KAMEZAWA Hiroyuki wrote:
> > 
> > > (1)Use new zonelist ordering always and move init_task's tied cpu to a
> > >   cpu on the best node. 
> > >   Child processes will start in good nodes even if Node 0 has small memory.
> > 
> > How about renumbering the nodes? Node 0 is the one with no DMA memory and 
> > node 1 may be the one with the DMA? That would take care of things even 
> > without core modifications. We can start on node 0 (which hardware 1) and 
> > consume the required memory for boot there not impacting the node with the 
> > DMA memory.
> > 
> It seems a bit complicated. If we do so, following can occur,
> 
> Node1: cpu0,1,2,3
> Node0: cpu4,5,6,7
> 
> the system layout may not look like what users imagine.

Interesting.  A colleague recently showed me that this can occur on HP
platforms if we boot from, say, node 1 instead of node 0.  The kernel
doesn't mind because it maintains a translation of cpus to nodes and
vice versa.  Applications don't need to mind if they use libnuma's
numa_node_to_cpus(), rather than assume a fixed relationship.  But I
agree that it may surprise some people when/if node_id !=
cpu_id/cpus_per_node.
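
For example, a small sketch against the libnuma 1.x interface (the buffer
size is an assumption) that prints the real mapping rather than assuming one:

/* print-cpumap.c - dump the cpu-to-node mapping via libnuma.
 * Build with: gcc print-cpumap.c -lnuma */
#include <numa.h>
#include <stdio.h>

int main(void)
{
	unsigned long mask[16];		/* room for 1024 cpus (assumption) */
	int node, cpu;

	if (numa_available() < 0) {
		fprintf(stderr, "NUMA not available\n");
		return 1;
	}
	for (node = 0; node <= numa_max_node(); node++) {
		if (numa_node_to_cpus(node, mask, sizeof(mask)) < 0)
			continue;	/* e.g. a memory-only pseudo-node */
		printf("node %d:", node);
		for (cpu = 0; cpu < (int)(sizeof(mask) * 8); cpu++)
			if (mask[cpu / (8 * sizeof(long))] &
			    (1UL << (cpu % (8 * sizeof(long)))))
				printf(" cpu%d", cpu);
		printf("\n");
	}
	return 0;
}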

Lee



* Re: [PATCH] change global zonelist order on NUMA v2
  2007-04-27  1:25         ` Christoph Lameter
  2007-04-27  1:50           ` KAMEZAWA Hiroyuki
@ 2007-04-30 15:03           ` Lee Schermerhorn
  1 sibling, 0 replies; 19+ messages in thread
From: Lee Schermerhorn @ 2007-04-30 15:03 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: KAMEZAWA Hiroyuki, ak, linux-kernel, linux-mm, akpm

On Thu, 2007-04-26 at 18:25 -0700, Christoph Lameter wrote:
> On Fri, 27 Apr 2007, KAMEZAWA Hiroyuki wrote:
> 
> > > DMA memory.
> > > 
> > It seems a bit complicated. If we do so, following can occur,
> > 
> > Node1: cpu0,1,2,3
> > Node0: cpu4,5,6,7
> 
> We were discussing a two node NUMA system. If you have more put it onto 
> the last.

Doesn't this [renumbering nodes] just move the problem to that "last"
node?  I.e., when one attempts to allocate normal memory from the last
node, it will overflow to the DMA zone.  What we need is for any DMA[32]
zone[s] to be last in [or excluded from?] the Normal/Movable/High/...
zonelist for each node.  That is what Kame's patch does.

Lee



