LKML Archive on lore.kernel.org
* [RFC][-mm] [1/2] Simple stats for cpu resource controller
@ 2008-03-26 18:18 Balaji Rao
  2008-03-26 19:00 ` Paul Menage
  2008-03-26 19:58 ` Peter Zijlstra
  0 siblings, 2 replies; 16+ messages in thread
From: Balaji Rao @ 2008-03-26 18:18 UTC (permalink / raw)
  To: linux-kernel; +Cc: menage, balbir, containers, dhaval

This patch implements trivial statistics for the cpu controller.

Signed-off-by: Balaji Rao <balajirrao@gmail.com>
CC: Balbir Singh <balbir@linux.vnet.ibm.com>
CC: Dhaval Giani <dhaval@linux.vnet.ibm.com>

diff --git a/kernel/sched.c b/kernel/sched.c
index 9fbfa05..eac9333 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -164,10 +164,38 @@ struct cfs_rq;
 
 static LIST_HEAD(task_groups);
 
+#ifdef CONFIG_CGROUP_SCHED
+enum cpu_cgroup_stat_index {
+	CPU_CGROUP_STAT_UTIME, /* Usertime of the task group */
+	CPU_CGROUP_STAT_STIME, /* Kerneltime of the task group */
+
+	CPU_CGROUP_STAT_NSTATS,
+};
+
+struct cpu_cgroup_stat_cpu {
+	s64 count[CPU_CGROUP_STAT_NSTATS];
+} ____cacheline_aligned_in_smp;
+
+struct cpu_cgroup_stat {
+	struct cpu_cgroup_stat_cpu cpustat[NR_CPUS];
+};
+
+/* Called under irq disable. */
+static void __cpu_cgroup_stat_add_safe(struct cpu_cgroup_stat *stat,
+		enum cpu_cgroup_stat_index idx, int val)
+{
+	int cpu = smp_processor_id();
+
+	BUG_ON(!irqs_disabled());
+	stat->cpustat[cpu].count[idx] += val;
+}
+#endif
+
 /* task group related information */
 struct task_group {
 #ifdef CONFIG_CGROUP_SCHED
 	struct cgroup_subsys_state css;
+	struct cpu_cgroup_stat stat;
 #endif
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -3670,6 +3698,16 @@ void account_user_time(struct task_struct *p, cputime_t cputime)
 		cpustat->nice = cputime64_add(cpustat->nice, tmp);
 	else
 		cpustat->user = cputime64_add(cpustat->user, tmp);
+
+	/* Charge the task's group */
+#ifdef CONFIG_CGROUP_SCHED 
+	{
+	struct task_group *tg;
+	tg = task_group(p);
+	__cpu_cgroup_stat_add_safe(&tg->stat, CPU_CGROUP_STAT_UTIME,
+			cputime_to_msecs(cputime));
+	}
+#endif
 }
 
 /*
@@ -3733,6 +3771,15 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
 		cpustat->idle = cputime64_add(cpustat->idle, tmp);
 	/* Account for system time used */
 	acct_update_integrals(p);
+
+#ifdef CONFIG_CGROUP_SCHED
+	{
+	struct task_group *tg;
+	tg = task_group(p);
+	__cpu_cgroup_stat_add_safe(&tg->stat, CPU_CGROUP_STAT_STIME,
+			cputime_to_msecs(cputime));
+	}
+#endif
 }
 
 /*
@@ -7939,6 +7986,40 @@ static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
 
 	return (u64) tg->shares;
 }
+
+static s64 cpu_cgroup_read_stat(struct cpu_cgroup_stat *stat,
+		enum cpu_cgroup_stat_index idx)
+{
+	int cpu;
+	s64 ret = 0;
+	for_each_possible_cpu(cpu)
+		ret += stat->cpustat[cpu].count[idx];
+	return ret;
+}
+
+static const struct cpu_cgroup_stat_desc {
+	const char *msg;
+	u64 unit;
+} cpu_cgroup_stat_desc[] = {
+	[CPU_CGROUP_STAT_UTIME] = { "utime", 1, },
+	[CPU_CGROUP_STAT_STIME] = { "stime", 1, },
+};
+
+static int cpu_cgroup_stats_show(struct cgroup *cgrp, struct cftype *cft,
+				struct cgroup_map_cb *cb)
+{
+	struct task_group *tg = cgroup_tg(cgrp);
+	struct cpu_cgroup_stat *stat = &tg->stat;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(stat->cpustat[0].count); i++) {
+		s64 val;
+		val = cpu_cgroup_read_stat(stat, i);
+		val *= cpu_cgroup_stat_desc[i].unit;
+		cb->fill(cb, cpu_cgroup_stat_desc[i].msg, val);
+	}
+	return 0;
+}
 #endif
 
 #ifdef CONFIG_RT_GROUP_SCHED
@@ -7961,6 +8042,11 @@ static struct cftype cpu_files[] = {
 		.read_u64 = cpu_shares_read_u64,
 		.write_u64 = cpu_shares_write_u64,
 	},
+
+	{
+		.name = "stat",
+		.read_map = cpu_cgroup_stats_show,
+	},
 #endif
 #ifdef CONFIG_RT_GROUP_SCHED
 	{



* Re: [RFC][-mm] [1/2] Simple stats for cpu resource controller
  2008-03-26 18:18 [RFC][-mm] [1/2] Simple stats for cpu resource controller Balaji Rao
@ 2008-03-26 19:00 ` Paul Menage
  2008-03-26 19:58 ` Peter Zijlstra
  1 sibling, 0 replies; 16+ messages in thread
From: Paul Menage @ 2008-03-26 19:00 UTC (permalink / raw)
  To: Balaji Rao; +Cc: linux-kernel, balbir, containers, dhaval

On Wed, Mar 26, 2008 at 11:18 AM, Balaji Rao <balajirrao@gmail.com> wrote:
>  +
>  +static s64 cpu_cgroup_read_stat(struct cpu_cgroup_stat *stat,
>  +               enum cpu_cgroup_stat_index idx)
>  +{
>  +       int cpu;
>  +       s64 ret = 0;
>  +       for_each_possible_cpu(cpu)
>  +               ret += stat->cpustat[cpu].count[idx];

On a 32-bit architecture I think this could race with a non-atomic
update that crosses a 32-bit boundary and get a corrupted result.

Paul
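
To make the race Paul describes concrete, here is a small user-space sketch (purely illustrative, not kernel code; it assumes a 32-bit build, e.g. gcc -m32 -O2 -pthread, and may need a few runs on a multi-core box to trigger). The plain 64-bit load in the reader is performed as two 32-bit loads, so it can observe a concurrent non-atomic update with only one half applied, and the monotonically growing counter appears to jump backwards:

/* torn-read illustration -- hypothetical test program, not part of the patch */
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

static volatile uint64_t counter;
static volatile int stop;

static void *writer(void *arg)
{
	while (!stop)
		counter += 0x80000000ULL;	/* non-atomic 64-bit add, carries into the high word often */
	return NULL;
}

int main(void)
{
	pthread_t t;
	uint64_t prev = 0, now;
	long i;

	pthread_create(&t, NULL, writer, NULL);
	for (i = 0; i < 100000000L; i++) {
		now = counter;			/* two 32-bit loads on a 32-bit build */
		if (now < prev) {		/* the counter only ever grows: a read was torn */
			printf("torn read: %llu after %llu\n",
			       (unsigned long long)now, (unsigned long long)prev);
			break;
		}
		prev = now;
	}
	stop = 1;
	pthread_join(t, NULL);
	return 0;
}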


* Re: [RFC][-mm] [1/2] Simple stats for cpu resource controller
  2008-03-26 18:18 [RFC][-mm] [1/2] Simple stats for cpu resource controller Balaji Rao
  2008-03-26 19:00 ` Paul Menage
@ 2008-03-26 19:58 ` Peter Zijlstra
  2008-03-28 10:02   ` Balaji Rao
  1 sibling, 1 reply; 16+ messages in thread
From: Peter Zijlstra @ 2008-03-26 19:58 UTC (permalink / raw)
  To: Balaji Rao; +Cc: linux-kernel, menage, balbir, containers, dhaval

On Wed, 2008-03-26 at 23:48 +0530, Balaji Rao wrote:
> This patch implements trivial statistics for the cpu controller.
> 
> Signed-off-by: Balaji Rao <balajirrao@gmail.com>
> CC: Balbir Singh <balbir@linux.vnet.ibm.com>
> CC: Dhaval Giani <dhaval@linux.vnet.ibm.com>
> 
> diff --git a/kernel/sched.c b/kernel/sched.c
> index 9fbfa05..eac9333 100644
> --- a/kernel/sched.c
> +++ b/kernel/sched.c
> @@ -164,10 +164,38 @@ struct cfs_rq;
>  
>  static LIST_HEAD(task_groups);
>  
> +#ifdef CONFIG_CGROUP_SCHED
> +enum cpu_cgroup_stat_index {
> +	CPU_CGROUP_STAT_UTIME, /* Usertime of the task group */
> +	CPU_CGROUP_STAT_STIME, /* Kerneltime of the task group */
> +
> +	CPU_CGROUP_STAT_NSTATS,
> +};
> +
> +struct cpu_cgroup_stat_cpu {
> +	s64 count[CPU_CGROUP_STAT_NSTATS];
> +} ____cacheline_aligned_in_smp;
> +
> +struct cpu_cgroup_stat {
> +	struct cpu_cgroup_stat_cpu cpustat[NR_CPUS];
> +};
> +
> +/* Called under irq disable. */
> +static void __cpu_cgroup_stat_add_safe(struct cpu_cgroup_stat *stat,
> +		enum cpu_cgroup_stat_index idx, int val)

What is safe about this function?

> +{
> +	int cpu = smp_processor_id();
> +
> +	BUG_ON(!irqs_disabled());
> +	stat->cpustat[cpu].count[idx] += val;
> +}
> +#endif
> +
>  /* task group related information */
>  struct task_group {
>  #ifdef CONFIG_CGROUP_SCHED
>  	struct cgroup_subsys_state css;
> +	struct cpu_cgroup_stat stat;
>  #endif
>  
>  #ifdef CONFIG_FAIR_GROUP_SCHED
> @@ -3670,6 +3698,16 @@ void account_user_time(struct task_struct *p, cputime_t cputime)
>  		cpustat->nice = cputime64_add(cpustat->nice, tmp);
>  	else
>  		cpustat->user = cputime64_add(cpustat->user, tmp);
> +
> +	/* Charge the task's group */
> +#ifdef CONFIG_CGROUP_SCHED 
> +	{
> +	struct task_group *tg;
> +	tg = task_group(p);
> +	__cpu_cgroup_stat_add_safe(&tg->stat, CPU_CGROUP_STAT_UTIME,
> +			cputime_to_msecs(cputime));
> +	}
> +#endif
>  }
>  
>  /*
> @@ -3733,6 +3771,15 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
>  		cpustat->idle = cputime64_add(cpustat->idle, tmp);
>  	/* Account for system time used */
>  	acct_update_integrals(p);
> +
> +#ifdef CONFIG_CGROUP_SCHED
> +	{
> +	struct task_group *tg;
> +	tg = task_group(p);
> +	__cpu_cgroup_stat_add_safe(&tg->stat, CPU_CGROUP_STAT_STIME,
> +			cputime_to_msecs(cputime));
> +	}
> +#endif
>  }

So both of these are tick based? The normal CFS [us]time stats are not.

>  /*
> @@ -7939,6 +7986,40 @@ static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
>  
>  	return (u64) tg->shares;
>  }
> +
> +static s64 cpu_cgroup_read_stat(struct cpu_cgroup_stat *stat,
> +		enum cpu_cgroup_stat_index idx)
> +{
> +	int cpu;
> +	s64 ret = 0;
> +	for_each_possible_cpu(cpu)
> +		ret += stat->cpustat[cpu].count[idx];
> +	return ret;
> +}
> +
> +static const struct cpu_cgroup_stat_desc {
> +	const char *msg;
> +	u64 unit;
> +} cpu_cgroup_stat_desc[] = {
> +	[CPU_CGROUP_STAT_UTIME] = { "utime", 1, },
> +	[CPU_CGROUP_STAT_STIME] = { "stime", 1, },
> +};
> +
> +static int cpu_cgroup_stats_show(struct cgroup *cgrp, struct cftype *cft,
> +				struct cgroup_map_cb *cb)
> +{
> +	struct task_group *tg = cgroup_tg(cgrp);
> +	struct cpu_cgroup_stat *stat = &tg->stat;
> +	int i;
> +
> +	for (i = 0; i < ARRAY_SIZE(stat->cpustat[0].count); i++) {
> +		s64 val;
> +		val = cpu_cgroup_read_stat(stat, i);
> +		val *= cpu_cgroup_stat_desc[i].unit;
> +		cb->fill(cb, cpu_cgroup_stat_desc[i].msg, val);
> +	}
> +	return 0;
> +}
>  #endif
>  
>  #ifdef CONFIG_RT_GROUP_SCHED
> @@ -7961,6 +8042,11 @@ static struct cftype cpu_files[] = {
>  		.read_u64 = cpu_shares_read_u64,
>  		.write_u64 = cpu_shares_write_u64,
>  	},
> +
> +	{
> +		.name = "stat",
> +		.read_map = cpu_cgroup_stats_show,
> +	},
>  #endif
>  #ifdef CONFIG_RT_GROUP_SCHED
>  	{
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/



* Re: [RFC][-mm] [1/2] Simple stats for cpu resource controller
  2008-03-26 19:58 ` Peter Zijlstra
@ 2008-03-28 10:02   ` Balaji Rao
  2008-03-28 10:17     ` Peter Zijlstra
  0 siblings, 1 reply; 16+ messages in thread
From: Balaji Rao @ 2008-03-28 10:02 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: linux-kernel, menage, balbir, containers, dhaval

On Thursday 27 March 2008 01:28:10 am Peter Zijlstra wrote:
> On Wed, 2008-03-26 at 23:48 +0530, Balaji Rao wrote:
<snip>
> > +/* Called under irq disable. */
> > +static void __cpu_cgroup_stat_add_safe(struct cpu_cgroup_stat *stat,
> > +		enum cpu_cgroup_stat_index idx, int val)
> 
> What is safe about this function?
> 
That it can be called only from an interrupt context.
> > +{
> > +	int cpu = smp_processor_id();
> > +
> > +	BUG_ON(!irqs_disabled());
> > +	stat->cpustat[cpu].count[idx] += val;
> > +}
> > +#endif
> > +
> >  /* task group related information */
> >  struct task_group {
> >  #ifdef CONFIG_CGROUP_SCHED
> >  	struct cgroup_subsys_state css;
> > +	struct cpu_cgroup_stat stat;
> >  #endif
> >  
> >  #ifdef CONFIG_FAIR_GROUP_SCHED
> > @@ -3670,6 +3698,16 @@ void account_user_time(struct task_struct *p, cputime_t cputime)
> >  		cpustat->nice = cputime64_add(cpustat->nice, tmp);
> >  	else
> >  		cpustat->user = cputime64_add(cpustat->user, tmp);
> > +
> > +	/* Charge the task's group */
> > +#ifdef CONFIG_CGROUP_SCHED 
> > +	{
> > +	struct task_group *tg;
> > +	tg = task_group(p);
> > +	__cpu_cgroup_stat_add_safe(&tg->stat, CPU_CGROUP_STAT_UTIME,
> > +			cputime_to_msecs(cputime));
> > +	}
> > +#endif
> >  }
> >  
> >  /*
> > @@ -3733,6 +3771,15 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
> >  		cpustat->idle = cputime64_add(cpustat->idle, tmp);
> >  	/* Account for system time used */
> >  	acct_update_integrals(p);
> > +
> > +#ifdef CONFIG_CGROUP_SCHED
> > +	{
> > +	struct task_group *tg;
> > +	tg = task_group(p);
> > +	__cpu_cgroup_stat_add_safe(&tg->stat, CPU_CGROUP_STAT_STIME,
> > +			cputime_to_msecs(cputime));
> > +	}
> > +#endif
> >  }
> 
> So both of these are tick based? The normal CFS [us]time stats are not.
> 
Hmmm.. Yeah, right. So I should use the approach used by task_utime and task_stime when reporting it, right?
> >  /*
> > @@ -7939,6 +7986,40 @@ static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
> >  
> >  	return (u64) tg->shares;
> >  }
> > +

Thanks for the review.
-- 
regards,
balaji rao


* Re: [RFC][-mm] [1/2] Simple stats for cpu resource controller
  2008-03-28 10:02   ` Balaji Rao
@ 2008-03-28 10:17     ` Peter Zijlstra
  0 siblings, 0 replies; 16+ messages in thread
From: Peter Zijlstra @ 2008-03-28 10:17 UTC (permalink / raw)
  To: Balaji Rao; +Cc: linux-kernel, menage, balbir, containers, dhaval

On Fri, 2008-03-28 at 15:32 +0530, Balaji Rao wrote:
> On Thursday 27 March 2008 01:28:10 am Peter Zijlstra wrote:
> > On Wed, 2008-03-26 at 23:48 +0530, Balaji Rao wrote:
> <snip>
> > > +/* Called under irq disable. */
> > > +static void __cpu_cgroup_stat_add_safe(struct cpu_cgroup_stat *stat,
> > > +		enum cpu_cgroup_stat_index idx, int val)
> > 
> > What is safe about this function?
> > 
> That it can be called only from an interrupt context.

It can be called from any context that has hardirqs disabled. And the __
prefix suggests as much; no need to tag _safe to the end as well, since we
never do that.

> > > +{
> > > +	int cpu = smp_processor_id();
> > > +
> > > +	BUG_ON(!irqs_disabled());
> > > +	stat->cpustat[cpu].count[idx] += val;
> > > +}
> > > +#endif
> > > +
> > >  /* task group related information */
> > >  struct task_group {
> > >  #ifdef CONFIG_CGROUP_SCHED
> > >  	struct cgroup_subsys_state css;
> > > +	struct cpu_cgroup_stat stat;
> > >  #endif
> > >  
> > >  #ifdef CONFIG_FAIR_GROUP_SCHED
> > > @@ -3670,6 +3698,16 @@ void account_user_time(struct task_struct *p, cputime_t cputime)
> > >  		cpustat->nice = cputime64_add(cpustat->nice, tmp);
> > >  	else
> > >  		cpustat->user = cputime64_add(cpustat->user, tmp);
> > > +
> > > +	/* Charge the task's group */
> > > +#ifdef CONFIG_CGROUP_SCHED 
> > > +	{
> > > +	struct task_group *tg;
> > > +	tg = task_group(p);
> > > +	__cpu_cgroup_stat_add_safe(&tg->stat, CPU_CGROUP_STAT_UTIME,
> > > +			cputime_to_msecs(cputime));
> > > +	}
> > > +#endif
> > >  }
> > >  
> > >  /*
> > > @@ -3733,6 +3771,15 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
> > >  		cpustat->idle = cputime64_add(cpustat->idle, tmp);
> > >  	/* Account for system time used */
> > >  	acct_update_integrals(p);
> > > +
> > > +#ifdef CONFIG_CGROUP_SCHED
> > > +	{
> > > +	struct task_group *tg;
> > > +	tg = task_group(p);
> > > +	__cpu_cgroup_stat_add_safe(&tg->stat, CPU_CGROUP_STAT_STIME,
> > > +			cputime_to_msecs(cputime));
> > > +	}
> > > +#endif
> > >  }
> > 
> > So both of these are tick based? The normal CFS [us]time stats are not.
> > 
> Hmmm.. Yea, right. So I should use the approach used by task_utime and task_stime when reporting it, right ?

Not sure what you want to use this for, but yeah, that makes most sense.

That is, I do know _what_ you want to use it for, just not sure which
requirements you put on it. The pure tick based thing might be good
enough for most purposes; the runtime proportion thing is just more
accurate.
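
For reference, the runtime-proportion approach mentioned above would look roughly like the sketch below: keep the tick-based samples only for the utime:stime ratio, and take the magnitude from the group's precise CFS runtime, in the spirit of task_utime()/task_stime(). The helper names cpu_cgroup_runtime() and cpu_cgroup_scaled_utime() are hypothetical, CONFIG_FAIR_GROUP_SCHED is assumed, and overflow handling is omitted; this is a sketch of the idea, not a drop-in change:

/* sketch: scale precise CFS runtime by the tick-based utime:stime ratio */
static u64 cpu_cgroup_runtime(struct task_group *tg)
{
	u64 runtime = 0;
	int cpu;

	/* assumes CONFIG_FAIR_GROUP_SCHED: tg->se[] are the group's per-cpu entities */
	for_each_possible_cpu(cpu)
		runtime += tg->se[cpu]->sum_exec_runtime;	/* nanoseconds */
	return runtime;
}

static u64 cpu_cgroup_scaled_utime(struct task_group *tg)
{
	u64 utime = cpu_cgroup_read_stat(&tg->stat, CPU_CGROUP_STAT_UTIME);
	u64 stime = cpu_cgroup_read_stat(&tg->stat, CPU_CGROUP_STAT_STIME);
	u64 total = utime + stime;
	u64 temp;

	if (!total)
		return 0;
	temp = cpu_cgroup_runtime(tg);	/* precise magnitude, in ns */
	temp *= utime;			/* tick-based ratio; overflow ignored in this sketch */
	do_div(temp, (u32)total);
	return temp;			/* user portion of the group's runtime, in ns */
}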

> > >  /*
> > > @@ -7939,6 +7986,40 @@ static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
> > >  
> > >  	return (u64) tg->shares;
> > >  }
> > > +
> 
> Thanks for the review.



* Re: [RFC][-mm] [1/2] Simple stats for cpu resource controller
  2008-04-10 16:09       ` Balaji Rao
@ 2008-04-10 16:25         ` Peter Zijlstra
  0 siblings, 0 replies; 16+ messages in thread
From: Peter Zijlstra @ 2008-04-10 16:25 UTC (permalink / raw)
  To: Balaji Rao
  Cc: Dhaval Giani, linux-kernel, containers, menage, balbir,
	Srivatsa Vaddagiri

On Thu, 2008-04-10 at 21:39 +0530, Balaji Rao wrote:
> On Monday 07 April 2008 06:54:53 pm Peter Zijlstra wrote:
> > On Sun, 2008-04-06 at 02:01 +0530, Balaji Rao wrote:
> > 
> > > > > +static s64 cpu_cgroup_read_stat(struct cpu_cgroup_stat *stat,
> > > > > +		enum cpu_cgroup_stat_index idx)
> > > > > +{
> > > > > +	int cpu;
> > > > > +	s64 ret = 0;
> > > > > +	unsigned long flags;
> > > > 
> > > > > +
> > > > > +	local_irq_save(flags);
> > > > 
> > > > I am just wondering. Is local_irq_save() enough?
> > > > 
> > > Hmmm.. You are right. This does not prevent concurrent updates on other CPUs
> > > from crossing a 32bit boundary. Am not sure how to do this in a safe way. I
> > > can only think of using atomic64_t now..
> > > 
> > > > > +	for_each_possible_cpu(cpu)
> > > > > +		ret += stat->cpustat[cpu].count[idx];
> > > > > +	local_irq_restore(flags);
> > > > > +
> > > > > +	return ret;
> > > > > +}
> > > > > +
> > 
> > So many stats to steal code from,.. but you didn't :-(
> > 
> > Look at mm/vmstat.c, that is a rather complete example.
> > 
> > The trick to solving the above is to use per cpu deltas instead, the
> > deltas can be machine word size and are thus always read in an atomic
> > manner (provided they are also naturally aligned).
> > 
> > 
> Hi Peter,
> 
> This won't work for time-based statistics. At nsec granularity, a word can hold 
> a time value of up to ~4s. 

4 seconds is plenty for a delta; most increments are in the ms range.

> I propose to solve this problem by using a lock to protect the statistics, but 
> only on 32bit architectures.
> 
> I'm not sure how good a solution this is, but that's the best I can think of 
> ATM. 

Not needed, keep per cpu word deltas and fold into a global u64 counter
while holding a lock.
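
A sketch of that scheme (hypothetical names, not a posted patch): each cpu keeps a machine-word sized delta that it updates locklessly with irqs disabled, and the deltas are folded into a lock-protected u64 total from time to time. The fold is assumed to run on the cpu that owns the delta (as mm/vmstat.c arranges), so the unlocked update cannot race with the zeroing; and because the deltas are word sized and naturally aligned, readers can sum them without tearing even on 32-bit:

struct cpu_cgroup_stat_cpu {
	unsigned long count[CPU_CGROUP_STAT_NSTATS];	/* word-sized deltas */
} ____cacheline_aligned_in_smp;

struct cpu_cgroup_stat {
	spinlock_t lock;
	u64 total[CPU_CGROUP_STAT_NSTATS];		/* folded totals, lock-protected */
	struct cpu_cgroup_stat_cpu cpustat[NR_CPUS];
};

/* update side: called on the local cpu with irqs disabled, no lock needed */
static void __cpu_cgroup_stat_add(struct cpu_cgroup_stat *stat,
		enum cpu_cgroup_stat_index idx, unsigned long val)
{
	stat->cpustat[smp_processor_id()].count[idx] += val;
}

/* fold the local cpu's deltas into the 64-bit totals, e.g. once per tick */
static void cpu_cgroup_stat_fold(struct cpu_cgroup_stat *stat)
{
	int cpu = smp_processor_id();
	unsigned long flags;
	int i;

	spin_lock_irqsave(&stat->lock, flags);
	for (i = 0; i < CPU_CGROUP_STAT_NSTATS; i++) {
		stat->total[i] += stat->cpustat[cpu].count[i];
		stat->cpustat[cpu].count[i] = 0;
	}
	spin_unlock_irqrestore(&stat->lock, flags);
}

/* read side: consistent on 32-bit too, modulo deltas not yet folded */
static u64 cpu_cgroup_stat_read(struct cpu_cgroup_stat *stat,
		enum cpu_cgroup_stat_index idx)
{
	unsigned long flags;
	u64 ret;
	int cpu;

	spin_lock_irqsave(&stat->lock, flags);
	ret = stat->total[idx];
	spin_unlock_irqrestore(&stat->lock, flags);

	/* word-sized and naturally aligned: these reads cannot tear */
	for_each_possible_cpu(cpu)
		ret += stat->cpustat[cpu].count[idx];
	return ret;
}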




* Re: [RFC][-mm] [1/2] Simple stats for cpu resource controller
  2008-04-07 13:24     ` Peter Zijlstra
@ 2008-04-10 16:09       ` Balaji Rao
  2008-04-10 16:25         ` Peter Zijlstra
  0 siblings, 1 reply; 16+ messages in thread
From: Balaji Rao @ 2008-04-10 16:09 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Dhaval Giani, linux-kernel, containers, menage, balbir,
	Srivatsa Vaddagiri

On Monday 07 April 2008 06:54:53 pm Peter Zijlstra wrote:
> On Sun, 2008-04-06 at 02:01 +0530, Balaji Rao wrote:
> 
> > > > +static s64 cpu_cgroup_read_stat(struct cpu_cgroup_stat *stat,
> > > > +		enum cpu_cgroup_stat_index idx)
> > > > +{
> > > > +	int cpu;
> > > > +	s64 ret = 0;
> > > > +	unsigned long flags;
> > > 
> > > > +
> > > > +	local_irq_save(flags);
> > > 
> > > I am just wondering. Is local_irq_save() enough?
> > > 
> > Hmmm.. You are right. This does not prevent concurrent updates on other CPUs
> > from crossing a 32bit boundary. Am not sure how to do this in a safe way. I
> > can only think of using atomic64_t now..
> > 
> > > > +	for_each_possible_cpu(cpu)
> > > > +		ret += stat->cpustat[cpu].count[idx];
> > > > +	local_irq_restore(flags);
> > > > +
> > > > +	return ret;
> > > > +}
> > > > +
> 
> So many stats to steal code from,.. but you didn't :-(
> 
> Look at mm/vmstat.c, that is a rather complete example.
> 
> The trick to solving the above is to use per cpu deltas instead, the
> deltas can be machine word size and are thus always read in an atomic
> manner (provided they are also naturally aligned).
> 
> 
Hi Peter,

This won't work for time-based statistics. At nsec granularity, a word can hold 
a time value of up to ~4s. 

I propose to solve this problem by using a lock to protect the statistics, but 
only on 32bit architectures.

I'm not sure how good a solution this is, but that's the best I can think of 
ATM. 
-- 
regards,
Balaji Rao
Dept. of Mechanical Engineering,
National Institute of Technology Karnataka, India


* Re: [RFC][-mm] [1/2] Simple stats for cpu resource controller
  2008-04-05 20:31   ` Balaji Rao
  2008-04-05 20:59     ` Dhaval Giani
@ 2008-04-07 13:24     ` Peter Zijlstra
  2008-04-10 16:09       ` Balaji Rao
  1 sibling, 1 reply; 16+ messages in thread
From: Peter Zijlstra @ 2008-04-07 13:24 UTC (permalink / raw)
  To: Balaji Rao
  Cc: Dhaval Giani, linux-kernel, containers, menage, balbir,
	Srivatsa Vaddagiri

On Sun, 2008-04-06 at 02:01 +0530, Balaji Rao wrote:

> > > +static s64 cpu_cgroup_read_stat(struct cpu_cgroup_stat *stat,
> > > +		enum cpu_cgroup_stat_index idx)
> > > +{
> > > +	int cpu;
> > > +	s64 ret = 0;
> > > +	unsigned long flags;
> > 
> > > +
> > > +	local_irq_save(flags);
> > 
> > I am just wondering. Is local_irq_save() enough?
> > 
> Hmmm.. You are right. This does not prevent concurrent updates on other CPUs 
> from crossing a 32bit boundary. Am not sure how to do this in a safe way. I 
> can only think of using atomic64_t now..
> 
> > > +	for_each_possible_cpu(cpu)
> > > +		ret += stat->cpustat[cpu].count[idx];
> > > +	local_irq_restore(flags);
> > > +
> > > +	return ret;
> > > +}
> > > +

So many stats to steal code from,.. but you didn't :-(

Look at mm/vmstat.c, that is a rather complete example.

The trick to solving the above is to use per cpu deltas instead, the
deltas can be machine word size and are thus always read in an atomic
manner (provided they are also naturally aligned).
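
In the mm/vmstat.c style referred to above, the fold does not even need a separate timer: the update path itself spills the per-cpu delta into the shared 64-bit counter once it reaches a small threshold, so the word-sized delta always stays far from overflow. A compressed sketch of that pattern, assuming the same lock/total/per-cpu-delta layout as in the fold sketch above (hypothetical names; the real mm/vmstat.c code is per-zone and more involved):

#define CPU_CGROUP_STAT_THRESHOLD	32

/* update with irqs disabled on the local cpu; spill early, keep the delta tiny */
static void __cpu_cgroup_stat_add(struct cpu_cgroup_stat *stat,
		enum cpu_cgroup_stat_index idx, unsigned long val)
{
	unsigned long *delta = &stat->cpustat[smp_processor_id()].count[idx];
	unsigned long flags;

	*delta += val;
	if (*delta < CPU_CGROUP_STAT_THRESHOLD)
		return;

	spin_lock_irqsave(&stat->lock, flags);
	stat->total[idx] += *delta;
	*delta = 0;
	spin_unlock_irqrestore(&stat->lock, flags);
}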



* Re: [RFC][-mm] [1/2] Simple stats for cpu resource controller
  2008-04-05 21:21       ` Balaji Rao
@ 2008-04-06  5:12         ` Balbir Singh
  0 siblings, 0 replies; 16+ messages in thread
From: Balbir Singh @ 2008-04-06  5:12 UTC (permalink / raw)
  To: Balaji Rao
  Cc: Dhaval Giani, linux-kernel, containers, menage, a.p.zijlstra,
	balbir, Srivatsa Vaddagiri

Balaji Rao wrote:
> On Sunday 06 April 2008 02:29:14 am Dhaval Giani wrote:
>> On Sun, Apr 06, 2008 at 02:01:52AM +0530, Balaji Rao wrote:
>>> On Sunday 06 April 2008 01:10:41 am Dhaval Giani wrote:
>>>>> +};
>>>>> +
>>>>> +struct cpu_cgroup_stat_cpu {
>>>>> +	s64 count[CPU_CGROUP_STAT_NSTATS];
>>>> u64? time does not go negative :)
>>> Right. But these stats are not only going to measure time. We need the same
>>> variables for measuring other stats as well. I'm not sure if we would 
>>> encounter scheduler stats that would count negative.
>>>
>>> Balbir, what do you say ?
>> I would prefer to keep the stats logically separate. So something like
>> struct cpu_cgroup_stat_cpu {
>> 	u64 time[];
>> 	s64 some_other_stat;
>> }
>> and so on. (I am not sure, is there some advantage gained by using
>> structs?) Makes the code more maintainable imho.
>>
> This would break the generic nature of __cpu_cgroup_stat_add. It's not a nice 
> thing in my opinion.
> 

I prefer keeping stats in the array as Balaji has done; it makes it easier to do
batch processing on the stats.



-- 
	Warm Regards,
	Balbir Singh
	Linux Technology Center
	IBM, ISTL


* Re: [RFC][-mm] [1/2] Simple stats for cpu resource controller
  2008-04-05 20:59     ` Dhaval Giani
@ 2008-04-05 21:21       ` Balaji Rao
  2008-04-06  5:12         ` Balbir Singh
  0 siblings, 1 reply; 16+ messages in thread
From: Balaji Rao @ 2008-04-05 21:21 UTC (permalink / raw)
  To: Dhaval Giani
  Cc: linux-kernel, containers, menage, a.p.zijlstra, balbir,
	Srivatsa Vaddagiri

On Sunday 06 April 2008 02:29:14 am Dhaval Giani wrote:
> On Sun, Apr 06, 2008 at 02:01:52AM +0530, Balaji Rao wrote:
> > On Sunday 06 April 2008 01:10:41 am Dhaval Giani wrote:
> > > > +};
> > > > +
> > > > +struct cpu_cgroup_stat_cpu {
> > > > +	s64 count[CPU_CGROUP_STAT_NSTATS];
> > > 
> > > u64? time does not go negative :)
> > Right. But these stats are not only going to measure time. We need the same
> > variables for measuring other stats as well. I'm not sure if we would 
> > encounter scheduler stats that would count negative.
> > 
> > Balbir, what do you say ?
> 
> I would prefer to keep the stats logically separate. So something like
> struct cpu_cgroup_stat_cpu {
> 	u64 time[];
> 	s64 some_other_stat;
> }
> and so on. (I am not sure, is there some advantage gained by using
> structs?) Makes the code more maintainable imho.
> 
This would break the generic nature of __cpu_cgroup_stat_add. It's not a nice 
thing in my opinion.

-- 
regards,
Balaji Rao
Dept. of Mechanical Engineering,
National Institute of Technology Karnataka, India


* Re: [RFC][-mm] [1/2] Simple stats for cpu resource controller
  2008-04-05 20:31   ` Balaji Rao
@ 2008-04-05 20:59     ` Dhaval Giani
  2008-04-05 21:21       ` Balaji Rao
  2008-04-07 13:24     ` Peter Zijlstra
  1 sibling, 1 reply; 16+ messages in thread
From: Dhaval Giani @ 2008-04-05 20:59 UTC (permalink / raw)
  To: Balaji Rao
  Cc: linux-kernel, containers, menage, a.p.zijlstra, balbir,
	Srivatsa Vaddagiri

On Sun, Apr 06, 2008 at 02:01:52AM +0530, Balaji Rao wrote:
> On Sunday 06 April 2008 01:10:41 am Dhaval Giani wrote:
> > > +};
> > > +
> > > +struct cpu_cgroup_stat_cpu {
> > > +	s64 count[CPU_CGROUP_STAT_NSTATS];
> > 
> > u64? time does not go negative :)
> Right. But these stats are not only going to measure time. We need the same 
> variables for measuring other stats as well. I'm not sure if we would 
> encounter scheduler stats that would count negative.
> 
> Balbir, what do you say ?

I would prefer to keep the stats logically separate. So something like
struct cpu_cgroup_stat_cpu {
	u64 time[];
	s64 some_other_stat;
}
and so on. (I am not sure, is there some advantage gained by using
structs?) Makes the code more maintainable imho.

> 
> > count also is not very clear? Can you give a more descriptive name?
> > 
> ok. How does 'value' look  ?
> 
> <snip>
> 
> > > +static s64 cpu_cgroup_read_stat(struct cpu_cgroup_stat *stat,
> > > +		enum cpu_cgroup_stat_index idx)
> > > +{
> > > +	int cpu;
> > > +	s64 ret = 0;
> > > +	unsigned long flags;
> > 
> > > +
> > > +	local_irq_save(flags);
> > 
> > I am just wondering. Is local_irq_save() enough?
> > 
> Hmmm.. You are right. This does not prevent concurrent updates on other CPUs 
> from crossing a 32bit boundary. Am not sure how to do this in a safe way. I 
> can only think of using atomic64_t now..
> 

I am going to answer that one when I am awake :-)

-- 
regards,
Dhaval


* Re: [RFC][-mm] [1/2] Simple stats for cpu resource controller
  2008-04-05 19:40 ` Dhaval Giani
@ 2008-04-05 20:31   ` Balaji Rao
  2008-04-05 20:59     ` Dhaval Giani
  2008-04-07 13:24     ` Peter Zijlstra
  0 siblings, 2 replies; 16+ messages in thread
From: Balaji Rao @ 2008-04-05 20:31 UTC (permalink / raw)
  To: Dhaval Giani
  Cc: linux-kernel, containers, menage, a.p.zijlstra, balbir,
	Srivatsa Vaddagiri

On Sunday 06 April 2008 01:10:41 am Dhaval Giani wrote:
<snip>
> > 
> > +#ifdef CONFIG_CGROUP_SCHED
> > +enum cpu_cgroup_stat_index {
> > +	CPU_CGROUP_STAT_UTIME, /* Usertime of the task group */
> > +	CPU_CGROUP_STAT_STIME, /* Kerneltime of the task group */
> > +
> > +	CPU_CGROUP_STAT_NSTATS,
> 
> why the extra space?
Just to keep things clearly separated. If you've not noticed, 
CPU_CGROUP_STAT_NSTATS is not a stat.
> 
> > +};
> > +
> > +struct cpu_cgroup_stat_cpu {
> > +	s64 count[CPU_CGROUP_STAT_NSTATS];
> 
> u64? time does not go negative :)
Right. But these stats are not only going to measure time. We need the same 
variables for measuring other stats as well. I'm not sure if we would 
encounter scheduler stats that would count negative.

Balbir, what do you say ?

> count also is not very clear? Can you give a more descriptive name?
> 
ok. How does 'value' look  ?

<snip>

> > +static s64 cpu_cgroup_read_stat(struct cpu_cgroup_stat *stat,
> > +		enum cpu_cgroup_stat_index idx)
> > +{
> > +	int cpu;
> > +	s64 ret = 0;
> > +	unsigned long flags;
> 
> > +
> > +	local_irq_save(flags);
> 
> I am just wondering. Is local_irq_save() enough?
> 
Hmmm.. You are right. This does not prevent concurrent updates on other CPUs 
from crossing a 32bit boundary. Am not sure how to do this in a safe way. I 
can only think of using atomic64_t now..
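
For what it is worth, the atomic64_t variant would look roughly like the sketch below. It only helps where the architecture actually provides atomic64_t (64-bit architectures, at this point), and every update becomes an atomic read-modify-write, which is likely costlier than the per-cpu delta schemes discussed elsewhere in the thread; a hypothetical sketch, not a posted patch:

struct cpu_cgroup_stat_cpu {
	atomic64_t count[CPU_CGROUP_STAT_NSTATS];
} ____cacheline_aligned_in_smp;

struct cpu_cgroup_stat {
	struct cpu_cgroup_stat_cpu cpustat[NR_CPUS];
};

static void __cpu_cgroup_stat_add(struct cpu_cgroup_stat *stat,
		enum cpu_cgroup_stat_index idx, int val)
{
	/* atomic 64-bit add: cannot be observed half-done */
	atomic64_add(val, &stat->cpustat[smp_processor_id()].count[idx]);
}

static s64 cpu_cgroup_read_stat(struct cpu_cgroup_stat *stat,
		enum cpu_cgroup_stat_index idx)
{
	s64 ret = 0;
	int cpu;

	/* atomic 64-bit loads: no torn reads, no lock needed */
	for_each_possible_cpu(cpu)
		ret += atomic64_read(&stat->cpustat[cpu].count[idx]);
	return ret;
}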

> > +	for_each_possible_cpu(cpu)
> > +		ret += stat->cpustat[cpu].count[idx];
> > +	local_irq_restore(flags);
> > +
> > +	return ret;
> > +}
> > +

Thanks for the review.

-- 
regards,
Balaji Rao
Dept. of Mechanical Engineering,
National Institute of Technology Karnataka, India


* Re: [RFC][-mm] [1/2] Simple stats for cpu resource controller
  2008-04-05 18:09 Balaji Rao
  2008-04-05 18:56 ` Dhaval Giani
@ 2008-04-05 19:40 ` Dhaval Giani
  2008-04-05 20:31   ` Balaji Rao
  1 sibling, 1 reply; 16+ messages in thread
From: Dhaval Giani @ 2008-04-05 19:40 UTC (permalink / raw)
  To: Balaji Rao; +Cc: linux-kernel, containers, menage, a.p.zijlstra, balbir

On Sat, Apr 05, 2008 at 11:39:46PM +0530, Balaji Rao wrote:
> This patch implements some trivial statistics for the CPU controller.
> 
> As per our current requirements, we have decided to keep the stats tick based as opposed to using CFS precise accounting.
> 
> Signed-off-by: Balaji Rao <balajirrao@gmail.com>
> CC: Balbir Singh <balbir@linux.vnet.ibm.com>
> CC: Dhaval Giani <dhaval@linux.vnet.ibm.com>
> 
> diff --git a/kernel/sched.c b/kernel/sched.c
> index 8206fda..e2acf06 100644
> --- a/kernel/sched.c
> +++ b/kernel/sched.c
> @@ -164,10 +164,38 @@ struct cfs_rq;
> 
>  static LIST_HEAD(task_groups);
> 
> +#ifdef CONFIG_CGROUP_SCHED
> +enum cpu_cgroup_stat_index {
> +	CPU_CGROUP_STAT_UTIME, /* Usertime of the task group */
> +	CPU_CGROUP_STAT_STIME, /* Kerneltime of the task group */
> +
> +	CPU_CGROUP_STAT_NSTATS,

why the extra space?

> +};
> +
> +struct cpu_cgroup_stat_cpu {
> +	s64 count[CPU_CGROUP_STAT_NSTATS];

u64? time does not go negative :)
count also is not very clear? Can you give a more descriptive name?

> +} ____cacheline_aligned_in_smp;
> +
> +struct cpu_cgroup_stat {
> +	struct cpu_cgroup_stat_cpu cpustat[NR_CPUS];
> +};
> +
> +/* Called under irq disable. */
> +static void __cpu_cgroup_stat_add(struct cpu_cgroup_stat *stat,
> +		enum cpu_cgroup_stat_index idx, int val)
> +{
> +	int cpu = smp_processor_id();
> +
> +	BUG_ON(!irqs_disabled());
> +	stat->cpustat[cpu].count[idx] += val;
> +}
> +#endif
> +
>  /* task group related information */
>  struct task_group {
>  #ifdef CONFIG_CGROUP_SCHED
>  	struct cgroup_subsys_state css;
> +	struct cpu_cgroup_stat stat;
>  #endif
> 
>  #ifdef CONFIG_FAIR_GROUP_SCHED
> @@ -3733,6 +3761,16 @@ void account_user_time(struct task_struct *p, cputime_t cputime)
>  		cpustat->nice = cputime64_add(cpustat->nice, tmp);
>  	else
>  		cpustat->user = cputime64_add(cpustat->user, tmp);
> +
> +	/* Charge the task's group */
> +#ifdef CONFIG_CGROUP_SCHED
> +	{
> +	struct task_group *tg;
> +	tg = task_group(p);
> +	__cpu_cgroup_stat_add(&tg->stat, CPU_CGROUP_STAT_UTIME,
> +			cputime_to_msecs(cputime));
> +	}
> +#endif
>  }
> 
>  /*
> @@ -3796,6 +3834,15 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
>  		cpustat->idle = cputime64_add(cpustat->idle, tmp);
>  	/* Account for system time used */
>  	acct_update_integrals(p);
> +
> +#ifdef CONFIG_CGROUP_SCHED
> +	{
> +	struct task_group *tg;
> +	tg = task_group(p);
> +	__cpu_cgroup_stat_add(&tg->stat, CPU_CGROUP_STAT_STIME,
> +			cputime_to_msecs(cputime));
> +	}
> +#endif
>  }
> 
>  /*
> @@ -8004,6 +8051,45 @@ static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
> 
>  	return (u64) tg->shares;
>  }
> +
> +static s64 cpu_cgroup_read_stat(struct cpu_cgroup_stat *stat,
> +		enum cpu_cgroup_stat_index idx)
> +{
> +	int cpu;
> +	s64 ret = 0;
> +	unsigned long flags;
> +
> +	local_irq_save(flags);

I am just wondering. Is local_irq_save() enough?

> +	for_each_possible_cpu(cpu)
> +		ret += stat->cpustat[cpu].count[idx];
> +	local_irq_restore(flags);
> +
> +	return ret;
> +}
> +
> +static const struct cpu_cgroup_stat_desc {
> +	const char *msg;
> +	u64 unit;
> +} cpu_cgroup_stat_desc[] = {
> +	[CPU_CGROUP_STAT_UTIME] = { "utime", 1, },
> +	[CPU_CGROUP_STAT_STIME] = { "stime", 1, },
> +};
> +
> +static int cpu_cgroup_stats_show(struct cgroup *cgrp, struct cftype *cft,
> +				struct cgroup_map_cb *cb)
> +{
> +	struct task_group *tg = cgroup_tg(cgrp);
> +	struct cpu_cgroup_stat *stat = &tg->stat;
> +	int i;
> +
> +	for (i = 0; i < ARRAY_SIZE(stat->cpustat[0].count); i++) {
> +		s64 val;
> +		val = cpu_cgroup_read_stat(stat, i);
> +		val *= cpu_cgroup_stat_desc[i].unit;
> +		cb->fill(cb, cpu_cgroup_stat_desc[i].msg, val);
> +	}
> +	return 0;
> +}
>  #endif
> 
>  #ifdef CONFIG_RT_GROUP_SCHED
> @@ -8026,6 +8112,11 @@ static struct cftype cpu_files[] = {
>  		.read_u64 = cpu_shares_read_u64,
>  		.write_u64 = cpu_shares_write_u64,
>  	},
> +
> +	{
> +		.name = "stat",
> +		.read_map = cpu_cgroup_stats_show,
> +	},
>  #endif
>  #ifdef CONFIG_RT_GROUP_SCHED
>  	{
> 

-- 
regards,
Dhaval


* Re: [RFC][-mm] [1/2] Simple stats for cpu resource controller
  2008-04-05 18:56 ` Dhaval Giani
@ 2008-04-05 19:09   ` Balaji Rao
  0 siblings, 0 replies; 16+ messages in thread
From: Balaji Rao @ 2008-04-05 19:09 UTC (permalink / raw)
  To: Dhaval Giani
  Cc: linux-kernel, containers, menage, a.p.zijlstra, balbir,
	Ingo Molnar, Srivatsa Vaddagiri

On Sunday 06 April 2008 12:26:16 am Dhaval Giani wrote:
> On Sat, Apr 05, 2008 at 11:39:46PM +0530, Balaji Rao wrote:
> > This patch implements some trivial statistics for the CPU controller.
> > 
> > As per our current requirements, we have decided to keep the stats tick based as opposed to using CFS precise accounting.
> > 
> 
> A quick question: why? Everywhere in the CFS we use precise accounting
> (even within the CPU controller). It doesn't make sense to use ticks here
> then.
> 
Using CFS precise accounting has a cost (see task_utime and task_stime). 
Currently our requirements don't call for precise statistics.

-- 
regards,
Balaji Rao
Dept. of Mechanical Engineering,
National Institute of Technology Karnataka, India


* Re: [RFC][-mm] [1/2] Simple stats for cpu resource controller
  2008-04-05 18:09 Balaji Rao
@ 2008-04-05 18:56 ` Dhaval Giani
  2008-04-05 19:09   ` Balaji Rao
  2008-04-05 19:40 ` Dhaval Giani
  1 sibling, 1 reply; 16+ messages in thread
From: Dhaval Giani @ 2008-04-05 18:56 UTC (permalink / raw)
  To: Balaji Rao
  Cc: linux-kernel, containers, menage, a.p.zijlstra, balbir,
	Ingo Molnar, Srivatsa Vaddagiri

On Sat, Apr 05, 2008 at 11:39:46PM +0530, Balaji Rao wrote:
> This patch implements some trivial statistics for the CPU controller.
> 
> As per our current requirements, we have decided to keep the stats tick based as opposed to using CFS precise accounting.
> 

A quick question: why? Everywhere in the CFS we use precise accounting
(even within the CPU controller). It doesn't make sense to use ticks here
then.

Will review and respond soon.

Thanks,
Dhaval

> Signed-off-by: Balaji Rao <balajirrao@gmail.com>
> CC: Balbir Singh <balbir@linux.vnet.ibm.com>
> CC: Dhaval Giani <dhaval@linux.vnet.ibm.com>
> 
> diff --git a/kernel/sched.c b/kernel/sched.c
> index 8206fda..e2acf06 100644
> --- a/kernel/sched.c
> +++ b/kernel/sched.c
> @@ -164,10 +164,38 @@ struct cfs_rq;
> 
>  static LIST_HEAD(task_groups);
> 
> +#ifdef CONFIG_CGROUP_SCHED
> +enum cpu_cgroup_stat_index {
> +	CPU_CGROUP_STAT_UTIME, /* Usertime of the task group */
> +	CPU_CGROUP_STAT_STIME, /* Kerneltime of the task group */
> +
> +	CPU_CGROUP_STAT_NSTATS,
> +};
> +
> +struct cpu_cgroup_stat_cpu {
> +	s64 count[CPU_CGROUP_STAT_NSTATS];
> +} ____cacheline_aligned_in_smp;
> +
> +struct cpu_cgroup_stat {
> +	struct cpu_cgroup_stat_cpu cpustat[NR_CPUS];
> +};
> +
> +/* Called under irq disable. */
> +static void __cpu_cgroup_stat_add(struct cpu_cgroup_stat *stat,
> +		enum cpu_cgroup_stat_index idx, int val)
> +{
> +	int cpu = smp_processor_id();
> +
> +	BUG_ON(!irqs_disabled());
> +	stat->cpustat[cpu].count[idx] += val;
> +}
> +#endif
> +
>  /* task group related information */
>  struct task_group {
>  #ifdef CONFIG_CGROUP_SCHED
>  	struct cgroup_subsys_state css;
> +	struct cpu_cgroup_stat stat;
>  #endif
> 
>  #ifdef CONFIG_FAIR_GROUP_SCHED
> @@ -3733,6 +3761,16 @@ void account_user_time(struct task_struct *p, cputime_t cputime)
>  		cpustat->nice = cputime64_add(cpustat->nice, tmp);
>  	else
>  		cpustat->user = cputime64_add(cpustat->user, tmp);
> +
> +	/* Charge the task's group */
> +#ifdef CONFIG_CGROUP_SCHED
> +	{
> +	struct task_group *tg;
> +	tg = task_group(p);
> +	__cpu_cgroup_stat_add(&tg->stat, CPU_CGROUP_STAT_UTIME,
> +			cputime_to_msecs(cputime));
> +	}
> +#endif
>  }
> 
>  /*
> @@ -3796,6 +3834,15 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
>  		cpustat->idle = cputime64_add(cpustat->idle, tmp);
>  	/* Account for system time used */
>  	acct_update_integrals(p);
> +
> +#ifdef CONFIG_CGROUP_SCHED
> +	{
> +	struct task_group *tg;
> +	tg = task_group(p);
> +	__cpu_cgroup_stat_add(&tg->stat, CPU_CGROUP_STAT_STIME,
> +			cputime_to_msecs(cputime));
> +	}
> +#endif
>  }
> 
>  /*
> @@ -8004,6 +8051,45 @@ static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
> 
>  	return (u64) tg->shares;
>  }
> +
> +static s64 cpu_cgroup_read_stat(struct cpu_cgroup_stat *stat,
> +		enum cpu_cgroup_stat_index idx)
> +{
> +	int cpu;
> +	s64 ret = 0;
> +	unsigned long flags;
> +
> +	local_irq_save(flags);
> +	for_each_possible_cpu(cpu)
> +		ret += stat->cpustat[cpu].count[idx];
> +	local_irq_restore(flags);
> +
> +	return ret;
> +}
> +
> +static const struct cpu_cgroup_stat_desc {
> +	const char *msg;
> +	u64 unit;
> +} cpu_cgroup_stat_desc[] = {
> +	[CPU_CGROUP_STAT_UTIME] = { "utime", 1, },
> +	[CPU_CGROUP_STAT_STIME] = { "stime", 1, },
> +};
> +
> +static int cpu_cgroup_stats_show(struct cgroup *cgrp, struct cftype *cft,
> +				struct cgroup_map_cb *cb)
> +{
> +	struct task_group *tg = cgroup_tg(cgrp);
> +	struct cpu_cgroup_stat *stat = &tg->stat;
> +	int i;
> +
> +	for (i = 0; i < ARRAY_SIZE(stat->cpustat[0].count); i++) {
> +		s64 val;
> +		val = cpu_cgroup_read_stat(stat, i);
> +		val *= cpu_cgroup_stat_desc[i].unit;
> +		cb->fill(cb, cpu_cgroup_stat_desc[i].msg, val);
> +	}
> +	return 0;
> +}
>  #endif
> 
>  #ifdef CONFIG_RT_GROUP_SCHED
> @@ -8026,6 +8112,11 @@ static struct cftype cpu_files[] = {
>  		.read_u64 = cpu_shares_read_u64,
>  		.write_u64 = cpu_shares_write_u64,
>  	},
> +
> +	{
> +		.name = "stat",
> +		.read_map = cpu_cgroup_stats_show,
> +	},
>  #endif
>  #ifdef CONFIG_RT_GROUP_SCHED
>  	{
> 
> -- 
> regards,
> Balaji Rao
> Dept. of Mechanical Engineering,
> National Institute of Technology Karnataka, India

-- 
regards,
Dhaval


* [RFC][-mm] [1/2] Simple stats for cpu resource controller
@ 2008-04-05 18:09 Balaji Rao
  2008-04-05 18:56 ` Dhaval Giani
  2008-04-05 19:40 ` Dhaval Giani
  0 siblings, 2 replies; 16+ messages in thread
From: Balaji Rao @ 2008-04-05 18:09 UTC (permalink / raw)
  To: linux-kernel; +Cc: containers, dhaval, menage, a.p.zijlstra, balbir

This patch implements some trivial statistics for the CPU controller.

As per our current requirements, we have decided to keep the stats tick based as opposed to using CFS precise accounting.

Signed-off-by: Balaji Rao <balajirrao@gmail.com>
CC: Balbir Singh <balbir@linux.vnet.ibm.com>
CC: Dhaval Giani <dhaval@linux.vnet.ibm.com>

diff --git a/kernel/sched.c b/kernel/sched.c
index 8206fda..e2acf06 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -164,10 +164,38 @@ struct cfs_rq;
 
 static LIST_HEAD(task_groups);
 
+#ifdef CONFIG_CGROUP_SCHED
+enum cpu_cgroup_stat_index {
+	CPU_CGROUP_STAT_UTIME, /* Usertime of the task group */
+	CPU_CGROUP_STAT_STIME, /* Kerneltime of the task group */
+
+	CPU_CGROUP_STAT_NSTATS,
+};
+
+struct cpu_cgroup_stat_cpu {
+	s64 count[CPU_CGROUP_STAT_NSTATS];
+} ____cacheline_aligned_in_smp;
+
+struct cpu_cgroup_stat {
+	struct cpu_cgroup_stat_cpu cpustat[NR_CPUS];
+};
+
+/* Called under irq disable. */
+static void __cpu_cgroup_stat_add(struct cpu_cgroup_stat *stat,
+		enum cpu_cgroup_stat_index idx, int val)
+{
+	int cpu = smp_processor_id();
+
+	BUG_ON(!irqs_disabled());
+	stat->cpustat[cpu].count[idx] += val;
+}
+#endif
+
 /* task group related information */
 struct task_group {
 #ifdef CONFIG_CGROUP_SCHED
 	struct cgroup_subsys_state css;
+	struct cpu_cgroup_stat stat;
 #endif
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -3733,6 +3761,16 @@ void account_user_time(struct task_struct *p, cputime_t cputime)
 		cpustat->nice = cputime64_add(cpustat->nice, tmp);
 	else
 		cpustat->user = cputime64_add(cpustat->user, tmp);
+
+	/* Charge the task's group */
+#ifdef CONFIG_CGROUP_SCHED
+	{
+	struct task_group *tg;
+	tg = task_group(p);
+	__cpu_cgroup_stat_add(&tg->stat, CPU_CGROUP_STAT_UTIME,
+			cputime_to_msecs(cputime));
+	}
+#endif
 }
 
 /*
@@ -3796,6 +3834,15 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
 		cpustat->idle = cputime64_add(cpustat->idle, tmp);
 	/* Account for system time used */
 	acct_update_integrals(p);
+
+#ifdef CONFIG_CGROUP_SCHED
+	{
+	struct task_group *tg;
+	tg = task_group(p);
+	__cpu_cgroup_stat_add(&tg->stat, CPU_CGROUP_STAT_STIME,
+			cputime_to_msecs(cputime));
+	}
+#endif
 }
 
 /*
@@ -8004,6 +8051,45 @@ static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
 
 	return (u64) tg->shares;
 }
+
+static s64 cpu_cgroup_read_stat(struct cpu_cgroup_stat *stat,
+		enum cpu_cgroup_stat_index idx)
+{
+	int cpu;
+	s64 ret = 0;
+	unsigned long flags;
+
+	local_irq_save(flags);
+	for_each_possible_cpu(cpu)
+		ret += stat->cpustat[cpu].count[idx];
+	local_irq_restore(flags);
+
+	return ret;
+}
+
+static const struct cpu_cgroup_stat_desc {
+	const char *msg;
+	u64 unit;
+} cpu_cgroup_stat_desc[] = {
+	[CPU_CGROUP_STAT_UTIME] = { "utime", 1, },
+	[CPU_CGROUP_STAT_STIME] = { "stime", 1, },
+};
+
+static int cpu_cgroup_stats_show(struct cgroup *cgrp, struct cftype *cft,
+				struct cgroup_map_cb *cb)
+{
+	struct task_group *tg = cgroup_tg(cgrp);
+	struct cpu_cgroup_stat *stat = &tg->stat;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(stat->cpustat[0].count); i++) {
+		s64 val;
+		val = cpu_cgroup_read_stat(stat, i);
+		val *= cpu_cgroup_stat_desc[i].unit;
+		cb->fill(cb, cpu_cgroup_stat_desc[i].msg, val);
+	}
+	return 0;
+}
 #endif
 
 #ifdef CONFIG_RT_GROUP_SCHED
@@ -8026,6 +8112,11 @@ static struct cftype cpu_files[] = {
 		.read_u64 = cpu_shares_read_u64,
 		.write_u64 = cpu_shares_write_u64,
 	},
+
+	{
+		.name = "stat",
+		.read_map = cpu_cgroup_stats_show,
+	},
 #endif
 #ifdef CONFIG_RT_GROUP_SCHED
 	{

-- 
regards,
Balaji Rao
Dept. of Mechanical Engineering,
National Institute of Technology Karnataka, India
