LKML Archive on lore.kernel.org
help / color / mirror / Atom feed
* [RFC,PATCH] CELL PPU Oprofile SPU profiling updated patch
@ 2007-02-06  0:28 Carl Love
  2007-02-06  0:46 ` Paul Mackerras
                   ` (2 more replies)
  0 siblings, 3 replies; 20+ messages in thread
From: Carl Love @ 2007-02-06  0:28 UTC (permalink / raw)
  To: linux-kernel, cbe-oss-dev, linuxppc-dev

This is the first update to the patch previously posted by Maynard
Johnson as "PATCH 4/4. Add support to OProfile for profiling CELL".  

There are still a few items from the comments being discussed
specifically how to profile the dynamic code for the SPFS context switch
code and how to deal with dynamic code stubs for library support.  Our
proposal is to assign the samples from the SPFS and dynamic library code
to an anonymous sample bucket.  The support for properly handling the
symbol extraction in these cases would be deferred to a later SDK.    

There is also a bug in profiling overlay code that we are investigating.

Subject: Add support to OProfile for profiling Cell BE SPUs

From: Maynard Johnson <maynardj@us.ibm.com>

This patch updates the existing arch/powerpc/oprofile/op_model_cell.c
to add in the SPU profiling capabilities.  In addition, a 'cell'
subdirectory
was added to arch/powerpc/oprofile to hold Cell-specific SPU profiling
code.

Signed-off-by: Carl Love <carll@us.ibm.com>
Signed-off-by: Maynard Johnson <mpjohn@us.ibm.com>

Index: linux-2.6.20-rc1/arch/powerpc/configs/cell_defconfig
===================================================================
--- linux-2.6.20-rc1.orig/arch/powerpc/configs/cell_defconfig	2007-01-18
16:43:14.230540320 -0600
+++ linux-2.6.20-rc1/arch/powerpc/configs/cell_defconfig	2007-02-01
17:21:46.928875304 -0600
@@ -1403,7 +1403,7 @@
 # Instrumentation Support
 #
 CONFIG_PROFILING=y
-CONFIG_OPROFILE=y
+CONFIG_OPROFILE=m
 # CONFIG_KPROBES is not set
 
 #
Index: linux-2.6.20-rc1/arch/powerpc/oprofile/cell/pr_util.h
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.20-rc1/arch/powerpc/oprofile/cell/pr_util.h	2007-02-03
15:56:01.094856152 -0600
@@ -0,0 +1,78 @@
+ /*
+ * Cell Broadband Engine OProfile Support
+ *
+ * (C) Copyright IBM Corporation 2006
+ *
+ * Author: Maynard Johnson <maynardj@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#ifndef PR_UTIL_H
+#define PR_UTIL_H
+
+#include <linux/cpumask.h>
+#include <linux/oprofile.h>
+#include <asm/cell-pmu.h>
+#include <asm/spu.h>
+
+static inline int number_of_online_nodes(void) 
+{
+	u32 cpu; u32 tmp;
+	int nodes = 0;
+	for_each_online_cpu(cpu) {
+		tmp = cbe_cpu_to_node(cpu) + 1;
+		if (tmp > nodes)
+			nodes++;
+	}
+	return nodes;
+}
+
+/* Defines used for sync_start */
+#define SKIP_GENERIC_SYNC 0
+#define SYNC_START_ERROR -1
+#define DO_GENERIC_SYNC 1
+
+struct vma_to_fileoffset_map
+{
+	struct vma_to_fileoffset_map *next;
+	unsigned int vma;
+	unsigned int size;
+	unsigned int offset;
+	unsigned int guard_ptr;
+	unsigned int guard_val;
+};
+
+/* The three functions below are for maintaining and accessing
+ * the vma-to-fileoffset map.
+ */
+struct vma_to_fileoffset_map * create_vma_map(const struct spu * spu,
u64 objectid);
+unsigned int vma_map_lookup(struct vma_to_fileoffset_map *map, unsigned
int vma,
+			    const struct spu * aSpu);
+void vma_map_free(struct vma_to_fileoffset_map *map);
+
+/*
+ * Entry point for SPU profiling.
+ * cycles_reset is the SPU_CYCLES count value specified by the user.
+ */
+void start_spu_profiling(unsigned int cycles_reset);
+
+void stop_spu_profiling(void);
+
+ 
+/* add the necessary profiling hooks */
+int spu_sync_start(void);
+
+/* remove the hooks */
+int spu_sync_stop(void);
+ 
+/* Record SPU program counter samples to the oprofile event buffer. */
+void spu_sync_buffer(int spu_num, unsigned int * samples, 
+		     int num_samples);
+
+void set_profiling_frequency(unsigned int freq_khz, unsigned int
cycles_reset);
+
+#endif    // PR_UTIL_H 
Index: linux-2.6.20-rc1/arch/powerpc/oprofile/cell/spu_profiler.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.20-rc1/arch/powerpc/oprofile/cell/spu_profiler.c
2007-02-05 09:32:25.708937424 -0600
@@ -0,0 +1,203 @@
+/*
+ * Cell Broadband Engine OProfile Support
+ *
+ * (C) Copyright IBM Corporation 2006
+ *
+ * Authors: Maynard Johnson <maynardj@us.ibm.com>
+ *          Carl Love <carll@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/hrtimer.h>
+#include <linux/smp.h>
+#include <linux/slab.h>
+#include <asm/cell-pmu.h>
+#include <asm/time.h>
+#include "pr_util.h"
+
+#define TRACE_ARRAY_SIZE 1024
+#define SCALE_SHIFT 14 
+
+static u32 * samples;
+
+static int spu_prof_running = 0;
+static unsigned int profiling_interval = 0;
+
+extern int spu_prof_num_nodes;
+
+
+#define NUM_SPU_BITS_TRBUF 16
+#define SPUS_PER_TB_ENTRY   4
+#define SPUS_PER_NODE       8
+
+void set_profiling_frequency(unsigned int freq_khz, unsigned int
cycles_reset)
+{
+	unsigned long nsPerCyc;
+	if (!freq_khz)
+		freq_khz = ppc_proc_freq/1000;
+
+        /* To calculate a timeout in nanoseconds, the basic
+	 * formula is ns = cycles_reset * (NSEC_PER_SEC / cpu frequency).
+	 * To avoid floating point math, we use the scale math
+	 * technique as described in linux/jiffies.h.  We use
+	 * a scale factor of SCALE_SHIFT,which provides 4 decimal places
+	 * of precision, which is close enough for the purpose at hand.
+	 */
+
+	nsPerCyc = (USEC_PER_SEC << SCALE_SHIFT)/freq_khz;
+	profiling_interval = (nsPerCyc * cycles_reset) >> SCALE_SHIFT;
+
+}
+
+/*
+ * Extract SPU PC from trace buffer entry
+ */
+static void spu_pc_extract(int cpu, int entry)
+{
+        /* the trace buffer is 128 bits */
+	u64 trace_buffer[2];
+	u64 spu_pc_lower;  
+	u64 spu_pc_upper;
+	u64 spu_mask;
+	int spu;
+	int node_factor;
+	
+	spu_mask = 0xFFFF;
+	node_factor = cbe_cpu_to_node(cpu) * SPUS_PER_NODE;
+	
+	/* Each SPU PC is 16 bits; hence, four spus in each of 
+	 * the two 64-bit buffer entries that make up the
+	 * 128-bit trace_buffer entry.  Process the upper and
+	 * lower 64-bit values simultaneously.
+	 * trace[0] SPU PC contents are: 0 1 2 3
+	 * trace[1] SPU PC contents are: 4 5 6 7
+	 */
+
+	cbe_read_trace_buffer(cpu, trace_buffer);  
+
+	for (spu = SPUS_PER_TB_ENTRY-1; spu >= 0; spu--) {
+		spu_pc_lower = spu_mask & trace_buffer[0];
+		trace_buffer[0] = trace_buffer[0] >> NUM_SPU_BITS_TRBUF;
+
+		spu_pc_upper = spu_mask & trace_buffer[1];
+		trace_buffer[1] = trace_buffer[1] >> NUM_SPU_BITS_TRBUF;
+		
+		/* spu PC trace entry is upper 16 bits of the
+		 * 18 bit SPU program counter 
+		 */
+		spu_pc_lower = spu_pc_lower << 2;
+		spu_pc_upper = spu_pc_upper << 2;
+		
+		samples[((node_factor + spu) * TRACE_ARRAY_SIZE) + entry]
+			= (u32) spu_pc_lower;
+		samples[((node_factor + spu + SPUS_PER_TB_ENTRY) 
+			 * TRACE_ARRAY_SIZE) + entry] = (u32) spu_pc_upper;
+	}
+}
+
+static int cell_spu_pc_collection(int cpu)
+{
+	u32 trace_addr;
+	int entry;
+
+	/* process the collected SPU PC for the node */
+
+	entry = 0;
+
+	trace_addr = cbe_read_pm(cpu, trace_address);
+	while ((trace_addr & CBE_PM_TRACE_BUF_EMPTY) != 0x400)
+	{
+		/* there is data in the trace buffer to process */
+		spu_pc_extract(cpu, entry);
+
+		entry++;
+
+		if (entry >= TRACE_ARRAY_SIZE) 
+			/* spu_samples is full */
+			break;
+
+		trace_addr = cbe_read_pm(cpu, trace_address);
+	}
+	return(entry);
+}
+
+
+static int profile_spus(struct hrtimer * timer)
+{
+	ktime_t kt;
+	int cpu, node, k, num_samples, spu_num;
+	
+	if (!spu_prof_running)
+		goto stop;
+
+	for_each_online_cpu(cpu) {
+		if (cbe_get_hw_thread_id(cpu))
+			continue;
+
+		node = cbe_cpu_to_node(cpu);
+
+		num_samples = cell_spu_pc_collection(cpu);
+
+		if (num_samples == 0)
+			continue;
+
+		for (k = 0; k < SPUS_PER_NODE; k++) {
+			spu_num = k + (node * SPUS_PER_NODE);
+			spu_sync_buffer(spu_num, 
+					samples + (spu_num * TRACE_ARRAY_SIZE),
+					num_samples);
+		}
+	}
+	smp_wmb();
+
+	kt = ktime_set(0, profiling_interval);
+	if (!spu_prof_running)
+		goto stop;
+	hrtimer_forward(timer, timer->base->get_time(), kt);
+	return HRTIMER_RESTART;
+
+ stop:
+	printk(KERN_INFO "SPU_PROF: spu-prof timer ending\n");
+	return HRTIMER_NORESTART;
+}
+
+static struct hrtimer timer;
+/*
+ * Entry point for SPU profiling.
+ * NOTE:  SPU profiling is done system-wide, not per-CPU.
+ *
+ * cycles_reset is the count value specified by the user when
+ * setting up OProfile to count SPU_CYCLES.
+ */
+void start_spu_profiling(unsigned int cycles_reset) {
+
+	ktime_t kt;
+	
+	pr_debug("timer resolution: %lu\n", 
+		 TICK_NSEC);
+	kt = ktime_set(0, profiling_interval);
+	hrtimer_init(&timer, CLOCK_MONOTONIC, HRTIMER_REL);
+	timer.expires = kt;
+	timer.function = profile_spus;
+
+        /* Allocate arrays for collecting SPU PC samples */
+	samples = (u32 *) kzalloc(spu_prof_num_nodes * SPUS_PER_NODE * 
+				  TRACE_ARRAY_SIZE * sizeof(u32), GFP_KERNEL);
+
+	spu_prof_running = 1;
+	hrtimer_start(&timer, kt, HRTIMER_REL);
+}
+
+void stop_spu_profiling(void) 
+{
+	spu_prof_running = 0;
+	hrtimer_cancel(&timer);
+	kfree(samples);
+	pr_debug("SPU_PROF: stop_spu_profiling issued\n");
+}
+
+
Index: linux-2.6.20-rc1/arch/powerpc/oprofile/cell/spu_task_sync.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.20-rc1/arch/powerpc/oprofile/cell/spu_task_sync.c
2007-02-05 18:07:02.896962904 -0600
@@ -0,0 +1,436 @@
+/*
+ * Cell Broadband Engine OProfile Support
+ *
+ * (C) Copyright IBM Corporation 2006
+ *
+ * Author: Maynard Johnson <maynardj@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+/* The purpose of this file is to handle SPU event task switching
+ * and to record SPU context information into the OProfile
+ * event buffer. 
+ *
+ * Additionally, the spu_sync_buffer function is provided as a helper
+ * for recoding actual SPU program counter samples to the event buffer.
+ */
+
+#include <linux/notifier.h>
+#include <linux/list.h>
+#include <linux/numa.h>
+#include <linux/mm.h>
+#include <linux/dcookies.h>
+#include <linux/spinlock.h>
+#include <linux/kref.h>
+#include <linux/oprofile.h>
+#include "pr_util.h"
+
+#define RELEASE_ALL 9999
+
+static spinlock_t buffer_lock = SPIN_LOCK_UNLOCKED;
+static spinlock_t cache_lock = SPIN_LOCK_UNLOCKED;
+static int num_spu_nodes;
+int spu_prof_num_nodes;
+
+/* Container for caching information about an active SPU task.
+ * 
+ */
+struct cached_info {
+	struct vma_to_fileoffset_map * map;
+	struct spu * the_spu;   /* needed to access pointer to local_store */
+	struct kref cache_ref;
+};
+
+static struct cached_info * spu_info[MAX_NUMNODES * 8];
+
+static void destroy_cached_info(struct kref * kref)
+{
+	struct cached_info * info;
+	info = container_of(kref, struct cached_info, cache_ref);
+	vma_map_free(info->map);
+	kfree(info);
+}
+
+/* Return the cached_info for the passed SPU number.
+ * 
+ * NOTE: Clients of this function MUST call kref_put()
+ *       when finished using the returned cached_info (if the
+ *       returned value is non-null).
+ */
+static struct cached_info * get_cached_info(struct spu * the_spu, int
spu_num)
+{
+	struct cached_info * ret_info = NULL;
+	unsigned long flags = 0;
+	if (spu_num >= num_spu_nodes) {
+		printk(KERN_ERR "SPU_PROF: " 
+		       "%s, line %d: Invalid index %d into spu info cache\n",
+		       __FUNCTION__, __LINE__, spu_num); 
+		goto out;
+	}
+	spin_lock_irqsave(&cache_lock, flags);
+	if (spu_info[spu_num])
+		kref_get(&spu_info[spu_num]->cache_ref);
+	else if (the_spu)
+		spu_info[spu_num] = (struct cached_info *)
+			spu_get_profile_private(the_spu->ctx);
+
+	ret_info = spu_info[spu_num];
+	spin_unlock_irqrestore(&cache_lock, flags);
+ out:
+	return ret_info;
+}
+
+
+/* Looks for cached info for the passed spu.  If not found, the
+ * cached info is created for the passed spu.
+ * Returns 0 for success; otherwise, -1 for error.  
+ */ 
+static int
+prepare_cached_spu_info(struct spu * spu, unsigned int objectId)
+{
+	unsigned long flags = 0;
+	struct vma_to_fileoffset_map * new_map;
+	int retval = 0;
+	struct cached_info * info = get_cached_info(spu, spu->number);
+
+	if (info) {
+        /* Immedidately put back reference to cached_info since we
don't
+	 * really need it -- just checking whether we have it.
+	 */
+		kref_put(&spu_info[spu->number]->cache_ref,
+			 destroy_cached_info);
+		pr_debug("Found cached SPU info.\n");
+		goto out;
+	}
+
+	/* Create cached_info and set spu_info[spu->number] to point to it.
+	 * spu->number is a system-wide value, not a per-node value.
+	 */
+	info = kzalloc(sizeof(struct cached_info), GFP_KERNEL);
+	if (!info) {
+		printk(KERN_ERR "SPU_PROF: "
+		       "%s, line %d: create vma_map failed\n",
+		       __FUNCTION__, __LINE__);
+		goto err_alloc;
+	}
+	new_map = create_vma_map(spu, objectId);
+	if (!new_map) {
+		printk(KERN_ERR "SPU_PROF: "
+		       "%s, line %d: create vma_map failed\n",
+		       __FUNCTION__, __LINE__);
+		goto err_alloc;
+	}
+
+	pr_debug("Created vma_map\n");
+	info->map = new_map;
+	info->the_spu = spu;
+	kref_init(&info->cache_ref);
+	spin_lock_irqsave(&cache_lock, flags);
+	spu_info[spu->number] = info;
+	spin_unlock_irqrestore(&cache_lock, flags);
+	/* Increment count before passing off ref to SPUFS. */
+	kref_get(&info->cache_ref);
+	spu_set_profile_private(spu->ctx, info, &info->cache_ref,
+				destroy_cached_info);
+	goto out;
+	
+err_alloc:
+	retval = -1;
+out:
+	return retval;
+}
+
+/*
+ * NOTE:  The caller is responsible for locking the
+ *	  cache_lock prior to calling this function.
+ */
+static int release_cached_info(int spu_index)
+{
+	int index, end;
+	if (spu_index == RELEASE_ALL) {
+		end = num_spu_nodes;
+		index = 0;
+	} else {
+	        if (spu_index >= num_spu_nodes) {
+        	        printk(KERN_ERR "SPU_PROF: "
+			       "%s, line %d: Invalid index %d into spu info cache\n",
+               	               __FUNCTION__, __LINE__, spu_index);
+	                goto out;
+	        }
+		end = spu_index +1;
+		index = spu_index;
+	}
+	for (; index < end; index++) {
+		if (spu_info[index]) {
+			kref_put(&spu_info[index]->cache_ref, destroy_cached_info);
+			spu_info[index] = NULL;
+		}
+	}
+
+out:
+	return 0;
+}
+
+/* The source code for fast_get_dcookie was "borrowed"
+ * from drivers/oprofile/buffer_sync.c.
+ */
+
+/* Optimisation. We can manage without taking the dcookie sem
+ * because we cannot reach this code without at least one
+ * dcookie user still being registered (namely, the reader
+ * of the event buffer).
+ */
+static inline unsigned long fast_get_dcookie(struct dentry * dentry,
+					     struct vfsmount * vfsmnt)
+{
+	unsigned long cookie;
+
+	if (dentry->d_cookie)
+		return (unsigned long)dentry;
+	get_dcookie(dentry, vfsmnt, &cookie);
+	return cookie;
+}
+
+/* Look up the dcookie for the task's first VM_EXECUTABLE mapping,
+ * which corresponds loosely to "application name". Also, determine
+ * the offset for the SPU ELF object.  If computed offset is 
+ * non-zero, it implies an embedded SPU object; otherwise, it's a
+ * separate SPU binary, in which case we retrieve it's dcookie.
+ */
+static unsigned long 
+get_exec_dcookie_and_offset(
+	struct spu * spu, unsigned int * offsetp,
+	unsigned long * spu_bin_dcookie,
+	unsigned int spu_ref)
+{
+	unsigned long cookie = 0;
+	unsigned int my_offset = 0;
+	struct vm_area_struct * vma;
+	struct mm_struct * mm = spu->mm;
+
+	if (!mm)
+		goto out;
+
+	for (vma = mm->mmap; vma; vma = vma->vm_next) {
+		if (!vma->vm_file)
+			continue;
+		if (!(vma->vm_flags & VM_EXECUTABLE))
+			continue;
+		cookie = fast_get_dcookie(vma->vm_file->f_dentry,
+					  vma->vm_file->f_vfsmnt);
+		pr_debug("got dcookie for %s\n",
+			 vma->vm_file->f_dentry->d_name.name);
+		break;
+	}
+
+	for (vma = mm->mmap; vma; vma = vma->vm_next) {
+		if (vma->vm_start > spu_ref || vma->vm_end < spu_ref)
+			continue;
+		my_offset = spu_ref - vma->vm_start;
+		pr_debug("Found spu ELF at "
+			 " %X for file %s\n", my_offset,
+			 vma->vm_file->f_dentry->d_name.name);
+		*offsetp = my_offset;
+		if (my_offset == 0) {
+			if (!vma->vm_file) {
+				goto fail_no_spu_cookie;
+			}
+			*spu_bin_dcookie = fast_get_dcookie(
+				vma->vm_file->f_dentry,
+				vma->vm_file->f_vfsmnt);
+			pr_debug("got dcookie for %s\n",
+				 vma->vm_file->f_dentry->d_name.name);
+		}
+		break;			
+	}
+	
+ out:
+	return cookie;
+
+ fail_no_spu_cookie:
+	printk(KERN_ERR "SPU_PROF: "
+	       "%s, line %d: Cannot find dcookie for SPU binary\n",
+	       __FUNCTION__, __LINE__);
+	goto out;
+}
+
+
+
+/* This function finds or creates cached context information for the
+ * passed SPU and records SPU context information into the OProfile
+ * event buffer.
+ */
+static int process_context_switch(struct spu * spu, unsigned int
objectId)
+{
+	unsigned long flags;
+	int retval = 0;
+	unsigned int offset = 0;
+	unsigned long spu_cookie = 0, app_dcookie = 0;
+	retval = prepare_cached_spu_info(spu, objectId);
+	if (retval == -1) {
+		goto out;
+	}
+        /* Get dcookie first because a mutex_lock is taken in that
+	 * code path, so interrupts must not be disabled.
+	 */
+	app_dcookie = get_exec_dcookie_and_offset(spu, &offset,
+						  &spu_cookie, objectId);
+
+        /* Record context info in event buffer */
+	spin_lock_irqsave(&buffer_lock, flags);
+	add_event_entry(ESCAPE_CODE);
+	add_event_entry(SPU_CTX_SWITCH_CODE);
+	add_event_entry(spu->number);
+	add_event_entry(spu->pid);
+	add_event_entry(spu->tgid);
+	add_event_entry(app_dcookie);
+
+	add_event_entry(ESCAPE_CODE);
+	if (offset) {
+	  /* When offset is non-zero,  this means the SPU ELF was embedded;
+	   * otherwise, it was loaded from a separate binary file.  For the
+	   * embedded case, we record the offset of the SPU ELF into the PPU
+	   * executable; for the non-embedded case, we record a dcookie that
+	   * points to the location of the SPU binary that was loaded.
+	   */
+		add_event_entry(SPU_OFFSET_CODE);
+		add_event_entry(offset);
+	} else {
+		add_event_entry(SPU_COOKIE_CODE);
+		add_event_entry(spu_cookie);
+	}
+	spin_unlock_irqrestore(&buffer_lock, flags);
+	smp_wmb();
+out:
+	return retval;
+}
+
+/* 
+ * This function is invoked on either a bind_context or
unbind_context.  
+ * If called for an unbind_context, the val arg is 0; otherwise, 
+ * it is the object-id value for the spu context.
+ * The data arg is of type 'struct spu *'.
+ */
+static int spu_active_notify(struct notifier_block * self, unsigned
long val,
+			     void * data)
+{
+	int retval;
+	unsigned long flags = 0;
+	struct spu * the_spu = data;
+	pr_debug("SPU event notification arrived\n");
+	if (!val){
+		spin_lock_irqsave(&cache_lock, flags);
+		retval = release_cached_info(the_spu->number);
+		spin_unlock_irqrestore(&cache_lock, flags);
+	} else {
+		retval = process_context_switch(the_spu, val);
+	}
+	return retval;
+}
+
+static struct notifier_block spu_active = {
+	.notifier_call = spu_active_notify,
+};
+
+/* The main purpose of this function is to synchronize
+ * OProfile with SPUFS by registering to be notified of
+ * SPU task switches.
+ *
+ * NOTE: When profiling SPUs, we must ensure that only
+ * spu_sync_start is invoked and not the generic sync_start
+ * in drivers/oprofile/oprof.c.  A return value of
+ * SKIP_GENERIC_SYNC or SYNC_START_ERROR will
+ * accomplish this.
+ */
+int spu_sync_start(void) {
+	int ret = SKIP_GENERIC_SYNC;
+	int register_ret;
+	unsigned long flags = 0;
+	spu_prof_num_nodes = number_of_online_nodes();
+	num_spu_nodes = spu_prof_num_nodes * 8;
+
+	spin_lock_irqsave(&buffer_lock, flags);
+	add_event_entry(ESCAPE_CODE);
+	add_event_entry(SPU_PROFILING_CODE);
+	add_event_entry(num_spu_nodes);
+	spin_unlock_irqrestore(&buffer_lock, flags);
+
+        /* Register for SPU events  */
+	register_ret = spu_switch_event_register(&spu_active);
+	if (register_ret) {
+		ret = SYNC_START_ERROR;
+		goto out;
+	}
+
+	pr_debug("spu_sync_start -- running.\n");
+out:
+	return ret;	
+}
+
+/* Record SPU program counter samples to the oprofile event buffer. */
+void spu_sync_buffer(int spu_num, unsigned int * samples, 
+		     int num_samples)
+{
+	unsigned long flags = 0;
+	int i;
+	struct vma_to_fileoffset_map * map;
+	struct spu * the_spu;
+	unsigned long long spu_num_ll = spu_num;
+	unsigned long long spu_num_shifted = spu_num_ll << 32;
+	struct cached_info * c_info = get_cached_info(NULL, spu_num);
+	if (c_info == NULL) {
+        /* This legitimately happens when the SPU task ends before all 
+	 * samples are recorded.  No big deal -- so we just drop a few
samples.
+	 */
+		pr_debug("SPU_PROF: No cached SPU contex "
+			  "for SPU #%d. Dropping samples.\n", spu_num);
+		return ;
+	}
+
+	map = c_info->map;
+	the_spu = c_info->the_spu;
+	spin_lock_irqsave(&buffer_lock, flags);
+	for (i = 0; i < num_samples; i++) {
+		unsigned long long file_offset;
+		unsigned int sample = *(samples+i);
+		if (sample == 0)
+			continue;
+		file_offset = vma_map_lookup(
+			map, sample, the_spu);
+		/* For now, we'll drop samples that can't be mapped.
+		 * This can happen for generated stubs executed from
+		 * the SPU stack.  Do we need to record these somehow?
+		 */
+		if (unlikely(file_offset == -1))
+			continue;
+		add_event_entry(file_offset | spu_num_shifted);
+	}
+	spin_unlock_irqrestore(&buffer_lock, flags);
+	kref_put(&spu_info[spu_num]->cache_ref, destroy_cached_info);
+}
+
+
+int spu_sync_stop(void)
+{
+	unsigned long flags = 0;
+	int ret = spu_switch_event_unregister(&spu_active);
+	if (ret) {
+		printk(KERN_ERR "SPU_PROF: "
+		       "%s, line %d: spu_switch_event_unregister returned %d\n",
+		       __FUNCTION__, __LINE__, ret);
+		goto out;
+	} 
+
+	spin_lock_irqsave(&cache_lock, flags);
+	ret = release_cached_info(RELEASE_ALL);
+	spin_unlock_irqrestore(&cache_lock, flags);
+out:
+	pr_debug("spu_sync_stop -- done.\n");
+	return ret;
+}
+
+
Index: linux-2.6.20-rc1/arch/powerpc/oprofile/cell/vma_map.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.20-rc1/arch/powerpc/oprofile/cell/vma_map.c	2007-02-01
17:21:46.944872872 -0600
@@ -0,0 +1,229 @@
+ /*
+ * Cell Broadband Engine OProfile Support
+ *
+ * (C) Copyright IBM Corporation 2006
+ *
+ * Author: Maynard Johnson <maynardj@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+/* The code in this source file is responsible for generating
+ * vma-to-fileOffset maps for both overlay and non-overlay SPU
+ * applications.
+ */
+
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/uaccess.h>
+#include <linux/elf.h>
+#include "pr_util.h"
+
+
+void vma_map_free(struct vma_to_fileoffset_map *map)
+{
+	while (map) {
+		struct vma_to_fileoffset_map *next = map->next;
+		kfree(map);
+		map = next;
+	}
+}
+
+unsigned int
+vma_map_lookup(struct vma_to_fileoffset_map *map, unsigned int vma,
+	       const struct spu * aSpu)
+{
+	u32 offset = -1;
+	u32 ovly_grd;
+	for (; map; map = map->next) {
+		if (vma < map->vma || vma >= map->vma + map->size)
+			continue;
+
+		if (map->guard_ptr) {
+			ovly_grd = *(u32 *)(aSpu->local_store + map->guard_ptr);
+			if (ovly_grd != map->guard_val)
+				continue;
+		}
+		break;
+	}
+
+	if (likely(map != NULL)) {
+		offset = vma - map->vma + map->offset;
+	}
+	return offset;
+}
+
+static struct vma_to_fileoffset_map *
+vma_map_add(struct vma_to_fileoffset_map *map, unsigned int vma,
+	    unsigned int size, unsigned int offset, unsigned int guard_ptr,
+	    unsigned int guard_val)
+{
+	struct vma_to_fileoffset_map *new = kzalloc(sizeof(struct
vma_to_fileoffset_map), GFP_KERNEL);
+	if (!new) {
+		printk(KERN_ERR "SPU_PROF: %s, line %d: malloc failed\n",
+		       __FUNCTION__, __LINE__);
+		vma_map_free(map);
+		return NULL;
+	}
+
+	new->next = map;
+	new->vma = vma;
+	new->size = size;
+	new->offset = offset;
+	new->guard_ptr = guard_ptr;
+	new->guard_val = guard_val;
+
+	return new;
+}
+
+
+/* Parse SPE ELF header and generate a list of vma_maps.
+ * A pointer to the first vma_map in the generated list
+ * of vma_maps is returned.  */
+struct vma_to_fileoffset_map * create_vma_map(const struct spu * aSpu, 
+					      unsigned long spu_elf_start)
+{
+	static const unsigned char expected[EI_PAD] = {
+		[EI_MAG0] = ELFMAG0,
+		[EI_MAG1] = ELFMAG1,
+		[EI_MAG2] = ELFMAG2,
+		[EI_MAG3] = ELFMAG3,
+		[EI_CLASS] = ELFCLASS32,
+		[EI_DATA] = ELFDATA2MSB,
+		[EI_VERSION] = EV_CURRENT,
+		[EI_OSABI] = ELFOSABI_NONE
+	};
+
+	struct vma_to_fileoffset_map *map = NULL;
+	unsigned int overlay_tbl_offset = -1;
+	unsigned long phdr_start, shdr_start;
+	Elf32_Ehdr ehdr;
+	Elf32_Phdr phdr;
+	Elf32_Shdr shdr, shdr_str;
+	Elf32_Sym sym;
+	int i, j;
+	char name[32];
+
+	unsigned int ovly_table_sym = 0;
+	unsigned int ovly_buf_table_sym = 0;
+	unsigned int ovly_table_end_sym = 0;
+	unsigned int ovly_buf_table_end_sym = 0;
+	unsigned long ovly_table;
+	unsigned int n_ovlys;
+
+	struct {
+		unsigned int vma;
+		unsigned int size;
+		unsigned int offset;
+		unsigned int buf;
+	} ovly;
+
+	/* Get and validate ELF header.  */
+
+	copy_from_user(&ehdr, (void *) spu_elf_start, sizeof (ehdr));
+	if (memcmp(ehdr.e_ident, expected, EI_PAD) != 0) {
+		printk(KERN_ERR "SPU_PROF: "
+		       "%s, line %d: Unexpected value parsing SPU ELF\n",
+		       __FUNCTION__, __LINE__);
+		return NULL;
+	}
+	if (ehdr.e_machine != 23) {
+		printk(KERN_ERR "SPU_PROF: "
+		       "%s, line %d: Unexpected value parsing SPU ELF\n",
+		       __FUNCTION__,  __LINE__);
+
+		return NULL;
+	}
+	if (ehdr.e_type != ET_EXEC) {
+		printk(KERN_ERR "SPU_PROF: "
+		       "%s, line %d: Unexpected value parsing SPU ELF\n",
+		       __FUNCTION__, __LINE__);
+		return NULL;
+	}
+	phdr_start = spu_elf_start + ehdr.e_phoff;
+	shdr_start = spu_elf_start + ehdr.e_shoff;
+
+	/* Traverse program headers.  */
+	for (i = 0; i < ehdr.e_phnum; i++) {
+		copy_from_user(&phdr, (void *) (phdr_start + i * sizeof(phdr)), 
+			       sizeof(phdr));
+		if (phdr.p_type != PT_LOAD)
+			continue;
+		if (phdr.p_flags & (1 << 27))
+			continue;
+
+		map = vma_map_add(map, phdr.p_vaddr, phdr.p_memsz, 
+				  phdr.p_offset, 0, 0);
+		if (!map)
+			return NULL;
+	}
+
+	pr_debug("SPU_PROF: Created non-overlay maps\n");	
+	/* Traverse section table and search for overlay-related symbols.  */
+	for (i = 0; i < ehdr.e_shnum; i++) {
+		copy_from_user(&shdr, (void *) (shdr_start + i * sizeof(shdr)), 
+			       sizeof(shdr));
+		if (shdr.sh_type != SHT_SYMTAB)
+			continue;
+		if (shdr.sh_entsize != sizeof (sym))
+			continue;
+
+		copy_from_user(&shdr_str, 
+			       (void *) (shdr_start + shdr.sh_link * sizeof(shdr)),
+			       sizeof(shdr));
+		if (shdr_str.sh_type != SHT_STRTAB)
+			return NULL;
+
+		for (j = 0; j < shdr.sh_size / sizeof (sym); j++) {
+			copy_from_user(&sym, (void *) (spu_elf_start +
+						       shdr.sh_offset + j * sizeof (sym)),
+				       sizeof (sym));
+			copy_from_user(name, (void *) (spu_elf_start + shdr_str.sh_offset + 
+						       sym.st_name),
+				       20);
+			if (memcmp(name, "_ovly_table", 12) == 0)
+				ovly_table_sym = sym.st_value;
+			if (memcmp(name, "_ovly_buf_table", 16) == 0)
+				ovly_buf_table_sym = sym.st_value;
+			if (memcmp(name, "_ovly_table_end", 16) == 0)
+				ovly_table_end_sym = sym.st_value;
+			if (memcmp(name, "_ovly_buf_table_end", 20) == 0)
+				ovly_buf_table_end_sym = sym.st_value;
+		}
+	}
+
+	/* If we don't have overlays, we're done.  */
+	if (ovly_table_sym == 0 || ovly_buf_table_sym == 0
+	    || ovly_table_end_sym == 0 || ovly_buf_table_end_sym == 0) {
+		pr_debug("SPU_PROF: No overlay table found\n");
+		return map;
+	}
+	else {
+		pr_debug("SPU_PROF: Overlay table found\n");
+	}
+
+	overlay_tbl_offset = vma_map_lookup(map, ovly_table_sym, aSpu);
+	if (overlay_tbl_offset < 0) {
+		printk(KERN_ERR "SPU_PROF: "
+		       "%s, line %d: Error finding SPU overlay table\n",
+		       __FUNCTION__, __LINE__);
+		return NULL;
+	}
+	ovly_table = spu_elf_start + overlay_tbl_offset;
+	n_ovlys = (ovly_table_end_sym - ovly_table_sym) / sizeof (ovly);
+
+	/* Traverse overlay table.  */
+	for (i = 0; i < n_ovlys; i++) {
+		copy_from_user(&ovly, (void *) (ovly_table + i * sizeof (ovly)),
+			       sizeof (ovly));
+		map = vma_map_add(map, ovly.vma, ovly.size, ovly.offset,
+				   ovly_buf_table_sym + (ovly.buf - 1) * 4, i + 1);
+		if (!map)
+			return NULL;
+	}
+	
+	return map;
+}
Index: linux-2.6.20-rc1/arch/powerpc/oprofile/common.c
===================================================================
--- linux-2.6.20-rc1.orig/arch/powerpc/oprofile/common.c	2007-01-18
16:43:14.429510072 -0600
+++ linux-2.6.20-rc1/arch/powerpc/oprofile/common.c	2007-02-01
17:21:46.946872568 -0600
@@ -150,6 +150,8 @@
 #ifdef CONFIG_PPC_CELL_NATIVE
 		case PPC_OPROFILE_CELL:
 			model = &op_model_cell;
+			ops->sync_start = model->sync_start;
+			ops->sync_stop = model->sync_stop;
 			break;
 #endif
 		case PPC_OPROFILE_RS64:
Index: linux-2.6.20-rc1/arch/powerpc/oprofile/Kconfig
===================================================================
--- linux-2.6.20-rc1.orig/arch/powerpc/oprofile/Kconfig	2007-01-18
16:43:14.426510528 -0600
+++ linux-2.6.20-rc1/arch/powerpc/oprofile/Kconfig	2007-02-03
17:05:51.967892936 -0600
@@ -7,7 +7,8 @@
 
 config OPROFILE
 	tristate "OProfile system profiling (EXPERIMENTAL)"
-	depends on PROFILING
+	default m
+	depends on SPU_FS && PROFILING
 	help
 	  OProfile is a profiling system capable of profiling the
 	  whole system, include the kernel, kernel modules, libraries,
Index: linux-2.6.20-rc1/arch/powerpc/oprofile/Makefile
===================================================================
--- linux-2.6.20-rc1.orig/arch/powerpc/oprofile/Makefile	2007-01-18
16:43:14.429510072 -0600
+++ linux-2.6.20-rc1/arch/powerpc/oprofile/Makefile	2007-02-01
17:21:46.948872264 -0600
@@ -11,7 +11,8 @@
 		timer_int.o )
 
 oprofile-y := $(DRIVER_OBJS) common.o backtrace.o
-oprofile-$(CONFIG_PPC_CELL_NATIVE) += op_model_cell.o
+oprofile-$(CONFIG_PPC_CELL_NATIVE) += op_model_cell.o \
+					cell/spu_profiler.o cell/vma_map.o cell/spu_task_sync.o
 oprofile-$(CONFIG_PPC64) += op_model_rs64.o op_model_power4.o
 oprofile-$(CONFIG_FSL_BOOKE) += op_model_fsl_booke.o
 oprofile-$(CONFIG_6xx) += op_model_7450.o
Index: linux-2.6.20-rc1/arch/powerpc/oprofile/op_model_cell.c
===================================================================
--- linux-2.6.20-rc1.orig/arch/powerpc/oprofile/op_model_cell.c
2007-02-01 17:21:38.388840624 -0600
+++ linux-2.6.20-rc1/arch/powerpc/oprofile/op_model_cell.c	2007-02-03
15:59:38.555810464 -0600
@@ -37,6 +37,16 @@
 #include <asm/system.h>
 
 #include "../platforms/cell/interrupt.h"
+#include "cell/pr_util.h"
+
+/* spu_cycle_reset is the number of cycles between samples.
+ * This variable is used for SPU profiling and should ONLY be set
+ * at the beginning of cell_reg_setup; otherwise, it's read-only.
+ */
+static unsigned int spu_cycle_reset = 0;
+
+#define NUM_SPUS_PER_NODE    8
+#define SPU_CYCLES_EVENT_NUM 2        /*  event number for SPU_CYCLES
*/
 
 #define PPU_CYCLES_EVENT_NUM 1	/*  event number for CYCLES */
 #define PPU_CYCLES_GRP_NUM   1  /* special group number for identifying
@@ -50,7 +60,6 @@
 #define NUM_TRACE_BUS_WORDS 4
 #define NUM_INPUT_BUS_WORDS 2
 
-
 struct pmc_cntrl_data {
 	unsigned long vcntr;
 	unsigned long evnts;
@@ -140,12 +149,21 @@
 /*
  * Firmware interface functions
  */
+
 static int
 rtas_ibm_cbe_perftools(int subfunc, int passthru,
 		       void *address, unsigned long length)
 {
 	u64 paddr = __pa(address);
 
+	pm_rtas_token = rtas_token("ibm,cbe-perftools");  
+
+	if (pm_rtas_token == RTAS_UNKNOWN_SERVICE) {
+	  printk(KERN_ERR
+		 "%s: rtas token ibm,cbe-perftools unknown\n",
+		 __FUNCTION__);
+	}
+
 	return rtas_call(pm_rtas_token, 5, 1, NULL, subfunc, passthru,
 			 paddr >> 32, paddr & 0xffffffff, length);
 }
@@ -486,7 +504,12 @@
 	       struct op_system_config *sys, int num_ctrs)
 {
 	int i, j, cpu;
+	spu_cycle_reset = 0;
 
+	if (ctr[0].event == SPU_CYCLES_EVENT_NUM) {
+		spu_cycle_reset = ctr[0].count;
+		return;
+	}
 	pm_rtas_token = rtas_token("ibm,cbe-perftools");
 	if (pm_rtas_token == RTAS_UNKNOWN_SERVICE) {
 		printk(KERN_WARNING "%s: RTAS_UNKNOWN_SERVICE\n",
@@ -572,6 +595,8 @@
 	;
 }
 
+
+
 /* This function is called once for each cpu */
 static void cell_cpu_setup(struct op_counter_config *cntr)
 {
@@ -579,6 +604,9 @@
 	u32 num_enabled = 0;
 	int i;
 
+	if (spu_cycle_reset)
+		return;
+
 	/* There is one performance monitor per processor chip (i.e. node),
 	 * so we only need to perform this function once per node.
 	 */
@@ -613,11 +641,216 @@
 	;
 }
 
-static void cell_global_start(struct op_counter_config *ctr)
+#define size 24
+#define ENTRIES  (0x1<<8) /* 256 */
+#define MAXLFSR  0xFFFFFF
+
+int initial_lfsr[] =
+{16777215, 3797240, 13519805, 11602690, 6497030, 7614675, 2328937,
2889445,
+ 12364575, 8723156, 2450594, 16280864, 14742496, 10904589, 6434212,
4996256,
+ 5814270, 13014041, 9825245, 410260, 904096, 15151047, 15487695,
3061843,
+ 16482682, 7938572, 4893279, 9390321, 4320879, 5686402, 1711063,
10176714,
+ 4512270, 1057359, 16700434, 5731602, 2070114, 16030890, 1208230,
15603106,
+ 11857845, 6470172, 1362790, 7316876, 8534496, 1629197, 10003072,
1714539,
+ 1814669, 7106700, 5427154, 3395151, 3683327, 12950450, 16620273,
12122372,
+ 7194999, 9952750, 3608260, 13604295, 2266835, 14943567, 7079230,
777380,
+ 4516801, 1737661, 8730333, 13796927, 3247181, 9950017, 3481896,
16527555,
+ 13116123, 14505033, 9781119, 4860212, 7403253, 13264219, 12269980,
100120,
+ 664506, 607795, 8274553, 13133688, 6215305, 13208866, 16439693,
3320753,
+ 8773582, 13874619, 1784784, 4513501, 11002978, 9318515, 3038856,
14254582,
+ 15484958, 15967857, 13504461, 13657322, 14724513, 13955736, 5695315,
7330509,
+ 12630101, 6826854, 439712, 4609055, 13288878, 1309632, 4996398,
11392266,
+ 793740, 7653789, 2472670, 14641200, 5164364, 5482529, 10415855,
1629108,
+ 2012376, 13661123, 14655718, 9534083, 16637925, 2537745, 9787923,
12750103,
+ 4660370, 3283461, 14862772, 7034955, 6679872, 8918232, 6506913,
103649,
+ 6085577, 13324033, 14251613, 11058220, 11998181, 3100233, 468898,
7104918,
+ 12498413, 14408165, 1208514, 15712321, 3088687, 14778333, 3632503,
11151952,
+ 98896, 9159367, 8866146, 4780737, 4925758, 12362320, 4122783, 8543358,
+ 7056879, 10876914, 6282881, 1686625, 5100373, 4573666, 9265515,
13593840,
+ 5853060, 1188880, 4237111, 15765555, 14344137, 4608332, 6590210,
13745050,
+ 10916568, 12340402, 7145275, 4417153, 2300360, 12079643, 7608534,
15238251,
+ 4947424, 7014722, 3984546, 7168073, 10759589, 16293080, 3757181,
4577717,
+ 5163790, 2488841, 4650617, 3650022, 5440654, 1814617, 6939232,
15540909,
+ 501788, 1060986, 5058235, 5078222, 3734500, 10762065, 390862, 5172712,
+ 1070780, 7904429, 1669757, 3439997, 2956788, 14944927, 12496638,
994152,
+ 8901173, 11827497, 4268056, 15725859, 1694506, 5451950, 2892428,
1434298,
+ 9048323, 13558747, 15083840, 8154495, 15830901, 391127, 14970070,
2451434,
+ 2080347, 10775644, 14599429, 12540753, 4813943, 16140655, 2421772,
12724304,
+ 12935733, 7206473, 5697333, 10328104, 2418008, 13547986, 284246,
1732363,
+ 16375319, 8109554, 16372365, 14346072, 1835890, 13059499, 2442500,
4110674};
+
+/*
+ * The hardware uses an LFSR counting sequence to determine when to
capture
+ * the SPU PCs.  The SPU PC capture is done when the LFSR sequence
reaches the
+ * last value in the sequence.  An LFSR sequence is like a puesdo
random 
+ * number sequence where each number occurs once in the sequence but
the 
+ * sequence is not in numerical order.  To reduce the calculation time,
a 
+ * sequence of 256 precomputed values in the LFSR sequence are stored
in a
+ * table.  The nearest precomputed value is used as the initial point
from
+ * which to caculate the desired LFSR value that is n from the end of
the 
+ * sequence.  The lookup table reduces the maximum number of iterations
in 
+ * the loop from 2^24 to 2^16.
+ */
+static int calculate_lfsr(int n)
 {
-	u32 cpu;
+  int i;
+
+  int start_lfsr_index;
+  unsigned int newlfsr0;
+  unsigned int lfsr = MAXLFSR;
+  unsigned int binsize = (MAXLFSR+1)/ENTRIES;
+  unsigned int howmany; 
+
+  start_lfsr_index = (MAXLFSR - n) / binsize;
+  lfsr = initial_lfsr[start_lfsr_index];
+  howmany = (MAXLFSR - n) - (start_lfsr_index * (binsize));
+
+  for (i = 2; i < howmany+2; i++) {
+    newlfsr0 = (((lfsr >> (size - 1 - 0)) & 1) ^
+		((lfsr >> (size - 1 - 1)) & 1) ^
+		(((lfsr >> (size - 1 - 6)) & 1) ^
+		 ((lfsr >> (size - 1 - 23)) & 1)));
+
+    lfsr >>= 1;
+    lfsr = lfsr | (newlfsr0 << (size - 1)); 
+  }
+  return lfsr;
+}
+
+static void pm_rtas_activate_spu_profiling(u32 node)
+{
+	int ret, i;
+	struct pm_signal pm_signal_local[NR_PHYS_CTRS];
+
+	/* Set up the rtas call to configure the debug bus to 
+	 * route the SPU PCs.  Setup the pm_signal for each SPU */
+	for (i = 0; i < NUM_SPUS_PER_NODE; i++) {
+		pm_signal_local[i].cpu = node;
+		pm_signal_local[i].signal_group = 41;
+		pm_signal_local[i].bus_word = 1 << i / 2; /* spu i on 
+							   * word (i/2) 
+							   */
+		pm_signal_local[i].sub_unit = i;	/* spu i */
+		pm_signal_local[i].bit = 63;
+	}
+
+	pm_rtas_token = rtas_token("ibm,cbe-perftools");
+	if (pm_rtas_token == RTAS_UNKNOWN_SERVICE) {
+		printk(KERN_WARNING "%s: RTAS_UNKNOWN_SERVICE \n",
+		       __FUNCTION__);
+	}
+
+	ret = rtas_ibm_cbe_perftools(SUBFUNC_ACTIVATE, PASSTHRU_ENABLE,
+				     pm_signal_local,
+				     8 * sizeof(struct pm_signal)); //FIXME 8 to #define
+
+	if (ret)
+		printk(KERN_WARNING "%s: rtas returned: %d\n",
+		       __FUNCTION__, ret);
+
+}
+
+#ifdef CONFIG_CPU_FREQ
+static int
+oprof_cpufreq_notify(struct notifier_block *nb, unsigned long val, void
*data)
+{
+	int ret = 0;
+	struct cpufreq_freqs * frq = data;
+	if ((val == CPUFREQ_PRECHANGE && frq->old < frq->new) ||
+	    (val == CPUFREQ_POSTCHANGE && frq->old > frq->new) ||
+	    (val == CPUFREQ_RESUMECHANGE || val == CPUFREQ_SUSPENDCHANGE))
+		set_profiling_frequency(frq->new, spu_cycle_reset);
+	return ret;
+}
+
+static struct notifier_block cpu_freq_notifier_block = {
+	.notifier_call	= oprof_cpufreq_notify
+};
+#endif
+
+static void cell_global_start_spu(struct op_counter_config *ctr)
+{
+	int subfunc, rtn_value;
+	unsigned int lfsr_value;
+	int cpu;
+	int ret = 0;
+	unsigned int cpu_khzfreq = 0;
+
+	/* The SPU profiling uses time-based profiling based on
+	 * cpu frequency, so if configured with the CPU_FREQ
+	 * option, we should detect frequency changes and react
+	 * accordingly.
+	 */
+#ifdef CONFIG_CPU_FREQ
+	ret = cpufreq_register_notifier(&cpu_freq_notifier_block,
+					CPUFREQ_TRANSITION_NOTIFIER);
+	if (ret < 0)
+		printk(KERN_ERR "CPU freq change registration failed: %d\n",
+		       ret);
+	else
+		cpu_khzfreq = cpufreq_quick_get(smp_processor_id());
+#endif
+
+	set_profiling_frequency(cpu_khzfreq, spu_cycle_reset);
+
+	for_each_online_cpu(cpu) {
+		if (cbe_get_hw_thread_id(cpu))
+			continue;
+		/* Setup SPU cycle-based profiling.
+		 * Set perf_mon_control bit 0 to a zero before
+		 * enabling spu collection hardware.
+		 */
+		cbe_write_pm(cpu, pm_control, 0);
+
+		pm_rtas_activate_spu_profiling(cbe_cpu_to_node(cpu));
+
+		if (spu_cycle_reset > 0xFFFFFE) 
+				lfsr_value = calculate_lfsr(1);  /* use largest possible 
+								  *  value 
+								  */
+		else 
+		    lfsr_value = calculate_lfsr(spu_cycle_reset);
+
+		if (lfsr_value == 0) {  /* must use a non zero value.  Zero
+					 * disables data collection.
+					 */
+				lfsr_value = calculate_lfsr(1);  /* use largest possible 
+								  * value 
+								 */
+		}
+
+		lfsr_value = lfsr_value << 8; /* shift lfsr to correct 
+					       * register location
+					       */
+		
+		pm_rtas_token = rtas_token("ibm,cbe-spu-perftools");  
+
+		if (pm_rtas_token == RTAS_UNKNOWN_SERVICE) {
+			printk(KERN_ERR
+			       "%s: rtas token ibm,cbe-spu-perftools unknown\n",
+			       __FUNCTION__);
+		}
+
+		subfunc = 2;	// 2 - activate SPU tracing, 3 - deactivate
+
+		rtn_value = rtas_call(pm_rtas_token, 3, 1, NULL, subfunc,
+			  cbe_cpu_to_node(cpu), lfsr_value);
+
+		if (rtn_value != 0)
+			printk(KERN_ERR
+			       "%s: rtas call ibm,cbe-spu-perftools failed, return = %d\n",
+			       __FUNCTION__, rtn_value);
+	}
+
+	start_spu_profiling(spu_cycle_reset);
+
+	oprofile_running = 1;
+}
+
+static void cell_global_start_ppu(struct op_counter_config *ctr)
+{
+	u32 cpu, i;
 	u32 interrupt_mask = 0;
-	u32 i;
 
 	/* This routine gets called once for the system.
 	 * There is one performance monitor per node, so we
@@ -658,7 +891,61 @@
 	start_virt_cntrs();
 }
 
-static void cell_global_stop(void)
+
+static void cell_global_start(struct op_counter_config *ctr)
+{
+	if (spu_cycle_reset) {
+		cell_global_start_spu(ctr);
+	} else {
+		cell_global_start_ppu(ctr);
+	}
+}
+
+static void cell_global_stop_spu(void)
+{
+	int subfunc, rtn_value;
+	unsigned int lfsr_value;
+	int cpu;
+
+	oprofile_running = 0;
+
+#ifdef CONFIG_CPU_FREQ
+	cpufreq_unregister_notifier(&cpu_freq_notifier_block,
+				    CPUFREQ_TRANSITION_NOTIFIER);
+#endif
+
+	for_each_online_cpu(cpu) {
+		if (cbe_get_hw_thread_id(cpu))
+			continue;
+
+		pm_rtas_token = rtas_token("ibm,cbe-spu-perftools");  
+
+		if (pm_rtas_token == RTAS_UNKNOWN_SERVICE) {
+			printk(KERN_ERR
+			       "%s: rtas token ibm,cbe-spu-perftools unknown\n",
+			       __FUNCTION__);
+		}
+
+		subfunc = 3;	// 2 - activate SPU tracing, 3 - deactivate
+		lfsr_value = 0x8f100000;
+
+		rtn_value =
+		    rtas_call(pm_rtas_token, 3, 1, NULL, subfunc,
+			      cbe_cpu_to_node(cpu), lfsr_value);
+
+		if (rtn_value != 0)
+			printk
+			    ("ERROR, rtas call ibm,cbe-spu-perftools failed, return = %d\n",
+			     rtn_value);
+
+		/* Deactivate the signals */
+		pm_rtas_reset_signals(cbe_cpu_to_node(cpu));
+	}
+
+	stop_spu_profiling();
+}
+
+static void cell_global_stop_ppu(void)
 {
 	int cpu;
 
@@ -686,6 +973,16 @@
 	}
 }
 
+static void cell_global_stop(void)
+{
+	if (spu_cycle_reset) {
+		cell_global_stop_spu();
+	} else {
+		cell_global_stop_ppu();
+	}
+
+}
+
 static void
 cell_handle_interrupt(struct pt_regs *regs, struct op_counter_config
*ctr)
 {
@@ -754,10 +1051,35 @@
 	spin_unlock_irqrestore(&virt_cntr_lock, flags);
 }
 
+/* This function is called from the generic OProfile
+ * driver.  When profiling PPUs, we need to do the
+ * generic sync start; otherwise, do spu_sync_start.
+ */
+static int cell_sync_start(void)
+{
+	if (spu_cycle_reset)
+		return spu_sync_start();
+	else
+		return DO_GENERIC_SYNC;
+}
+
+static int cell_sync_stop(void)
+{
+	if (spu_cycle_reset)
+		return spu_sync_stop();
+	else
+		return 1;
+}
+
+
 struct op_powerpc_model op_model_cell = {
 	.reg_setup = cell_reg_setup,
 	.cpu_setup = cell_cpu_setup,
 	.global_start = cell_global_start,
 	.global_stop = cell_global_stop,
+	.sync_start = cell_sync_start,
+	.sync_stop = cell_sync_stop,
 	.handle_interrupt = cell_handle_interrupt,
 };
+
+
Index: linux-2.6.20-rc1/arch/powerpc/platforms/cell/spufs/sched.c
===================================================================
--- linux-2.6.20-rc1.orig/arch/powerpc/platforms/cell/spufs/sched.c
2007-02-01 17:21:41.943834416 -0600
+++ linux-2.6.20-rc1/arch/powerpc/platforms/cell/spufs/sched.c
2007-02-01 17:21:46.957870896 -0600
@@ -129,6 +129,7 @@
 	ctx->spu = spu;
 	ctx->ops = &spu_hw_ops;
 	spu->pid = current->pid;
+	spu->tgid = current->tgid;
 	spu->prio = current->prio;
 	spu->mm = ctx->owner;
 	mm_needs_global_tlbie(spu->mm);
@@ -161,6 +162,7 @@
 	spu->dma_callback = NULL;
 	spu->mm = NULL;
 	spu->pid = 0;
+	spu->tgid = 0;
 	spu->prio = MAX_PRIO;
 	ctx->ops = &spu_backing_ops;
 	ctx->spu = NULL;
Index: linux-2.6.20-rc1/drivers/oprofile/buffer_sync.c
===================================================================
--- linux-2.6.20-rc1.orig/drivers/oprofile/buffer_sync.c	2007-01-18
16:43:11.675529376 -0600
+++ linux-2.6.20-rc1/drivers/oprofile/buffer_sync.c	2007-02-01
17:21:46.960870440 -0600
@@ -26,6 +26,7 @@
 #include <linux/profile.h>
 #include <linux/module.h>
 #include <linux/fs.h>
+#include <linux/oprofile.h>
  
 #include "oprofile_stats.h"
 #include "event_buffer.h"
Index: linux-2.6.20-rc1/drivers/oprofile/event_buffer.h
===================================================================
--- linux-2.6.20-rc1.orig/drivers/oprofile/event_buffer.h	2007-01-18
16:43:11.673529680 -0600
+++ linux-2.6.20-rc1/drivers/oprofile/event_buffer.h	2007-02-01
17:21:46.962870136 -0600
@@ -19,28 +19,10 @@
  
 /* wake up the process sleeping on the event file */
 void wake_up_buffer_waiter(void);
- 
-/* Each escaped entry is prefixed by ESCAPE_CODE
- * then one of the following codes, then the
- * relevant data.
- */
-#define ESCAPE_CODE			~0UL
-#define CTX_SWITCH_CODE 		1
-#define CPU_SWITCH_CODE 		2
-#define COOKIE_SWITCH_CODE 		3
-#define KERNEL_ENTER_SWITCH_CODE	4
-#define KERNEL_EXIT_SWITCH_CODE		5
-#define MODULE_LOADED_CODE		6
-#define CTX_TGID_CODE			7
-#define TRACE_BEGIN_CODE		8
-#define TRACE_END_CODE			9
- 
+  
 #define INVALID_COOKIE ~0UL
 #define NO_COOKIE 0UL
 
-/* add data to the event buffer */
-void add_event_entry(unsigned long data);
- 
 extern struct file_operations event_buffer_fops;
  
 /* mutex between sync_cpu_buffers() and the
Index: linux-2.6.20-rc1/drivers/oprofile/oprof.c
===================================================================
--- linux-2.6.20-rc1.orig/drivers/oprofile/oprof.c	2007-01-18
16:43:11.675529376 -0600
+++ linux-2.6.20-rc1/drivers/oprofile/oprof.c	2007-02-01
17:21:46.964869832 -0600
@@ -53,9 +53,23 @@
 	 * us missing task deaths and eventually oopsing
 	 * when trying to process the event buffer.
 	 */
+	if (oprofile_ops.sync_start) {
+		int sync_ret = oprofile_ops.sync_start();
+		switch (sync_ret) {
+			case 0: goto post_sync;
+				break;
+			case 1: goto do_generic;
+				break;
+			case -1: goto out3;
+				break;
+			default: goto out3;
+		}
+	}
+do_generic:
 	if ((err = sync_start()))
 		goto out3;
 
+post_sync:
 	is_setup = 1;
 	mutex_unlock(&start_mutex);
 	return 0;
@@ -118,7 +132,19 @@
 void oprofile_shutdown(void)
 {
 	mutex_lock(&start_mutex);
+        if (oprofile_ops.sync_stop) {
+                int sync_ret = oprofile_ops.sync_stop();
+                switch (sync_ret) {
+                        case 0: goto post_sync;
+                                break;
+                        case 1: goto do_generic;
+                                break;
+			default: goto post_sync;
+                }
+        }
+do_generic:
 	sync_stop();
+post_sync:
 	if (oprofile_ops.shutdown)
 		oprofile_ops.shutdown();
 	is_setup = 0;
Index: linux-2.6.20-rc1/include/asm-powerpc/oprofile_impl.h
===================================================================
--- linux-2.6.20-rc1.orig/include/asm-powerpc/oprofile_impl.h	2007-01-18
16:43:19.315566704 -0600
+++ linux-2.6.20-rc1/include/asm-powerpc/oprofile_impl.h	2007-02-01
17:21:46.966869528 -0600
@@ -47,6 +47,8 @@
         void (*global_start) (struct op_counter_config *);
 	void (*stop) (void);
 	void (*global_stop) (void);
+	int (*sync_start)(void);
+	int (*sync_stop)(void);
 	void (*handle_interrupt) (struct pt_regs *,
 				  struct op_counter_config *);
 	int num_counters;
Index: linux-2.6.20-rc1/include/asm-powerpc/spu.h
===================================================================
--- linux-2.6.20-rc1.orig/include/asm-powerpc/spu.h	2007-02-01
17:21:41.950833352 -0600
+++ linux-2.6.20-rc1/include/asm-powerpc/spu.h	2007-02-05
08:34:38.498856800 -0600
@@ -128,6 +128,7 @@
 	struct spu_runqueue *rq;
 	unsigned long long timestamp;
 	pid_t pid;
+	pid_t tgid;
 	int prio;
 	int class_0_pending;
 	spinlock_t register_lock;
@@ -153,6 +154,11 @@
 int spu_irq_class_0_bottom(struct spu *spu);
 int spu_irq_class_1_bottom(struct spu *spu);
 void spu_irq_setaffinity(struct spu *spu, int cpu);
+void * spu_get_profile_private(struct spu_context * ctx);
+void spu_set_profile_private(struct spu_context * ctx, void *
profile_info,
+			     struct kref * prof_info_kref, 
+			     void (* prof_info_release) (struct kref * kref));
+
 
 /* system callbacks from the SPU */
 struct spu_syscall_block {
Index: linux-2.6.20-rc1/include/linux/oprofile.h
===================================================================
--- linux-2.6.20-rc1.orig/include/linux/oprofile.h	2007-01-18
16:43:18.379575976 -0600
+++ linux-2.6.20-rc1/include/linux/oprofile.h	2007-02-01
17:21:46.970868920 -0600
@@ -17,6 +17,28 @@
 #include <linux/spinlock.h>
 #include <asm/atomic.h>
  
+/* Each escaped entry is prefixed by ESCAPE_CODE
+ * then one of the following codes, then the
+ * relevant data.
+ * These #defines live in this file so that arch-specific
+ * buffer sync'ing code can access them.  
+ */
+#define ESCAPE_CODE                     ~0UL
+#define CTX_SWITCH_CODE                 1
+#define CPU_SWITCH_CODE                 2
+#define COOKIE_SWITCH_CODE              3
+#define KERNEL_ENTER_SWITCH_CODE        4
+#define KERNEL_EXIT_SWITCH_CODE         5
+#define MODULE_LOADED_CODE              6
+#define CTX_TGID_CODE                   7
+#define TRACE_BEGIN_CODE                8
+#define TRACE_END_CODE                  9
+#define XEN_ENTER_SWITCH_CODE          10
+#define SPU_PROFILING_CODE             11
+#define SPU_CTX_SWITCH_CODE            12
+#define SPU_OFFSET_CODE                13
+#define SPU_COOKIE_CODE                14
+
 struct super_block;
 struct dentry;
 struct file_operations;
@@ -35,6 +57,14 @@
 	int (*start)(void);
 	/* Stop delivering interrupts. */
 	void (*stop)(void);
+	/* Arch-specific buffer sync functions.
+	 * Return value = 0:  Success
+	 * Return value = -1: Failure
+	 * Return value = 1:  Run generic sync function
+	 */
+	int (*sync_start)(void);
+	int (*sync_stop)(void);
+
 	/* Initiate a stack backtrace. Optional. */
 	void (*backtrace)(struct pt_regs * const regs, unsigned int depth);
 	/* CPU identification string. */
@@ -56,6 +86,13 @@
 void oprofile_arch_exit(void);
 
 /**
+ * Add data to the event buffer.
+ * The data passed is free-form, but typically consists of
+ * file offsets, dcookies, context information, and ESCAPE codes.
+ */
+void add_event_entry(unsigned long data);
+ 
+/**
  * Add a sample. This may be called from any context. Pass
  * smp_processor_id() as cpu.
  */
Index: linux-2.6.20-rc1/kernel/hrtimer.c
===================================================================
--- linux-2.6.20-rc1.orig/kernel/hrtimer.c	2007-01-18 16:43:05.808489704
-0600
+++ linux-2.6.20-rc1/kernel/hrtimer.c	2007-02-01 17:21:46.973868464
-0600
@@ -335,6 +335,7 @@
 
 	return orun;
 }
+EXPORT_SYMBOL_GPL(hrtimer_forward);
 
 /*
  * enqueue_hrtimer - internal function to (re)start a timer
Index: linux-2.6.20-rc1/arch/powerpc/kernel/time.c
===================================================================
--- linux-2.6.20-rc1.orig/arch/powerpc/kernel/time.c	2007-02-02
15:47:08.624906680 -0600
+++ linux-2.6.20-rc1/arch/powerpc/kernel/time.c	2007-02-02
17:06:28.183894912 -0600
@@ -122,6 +122,7 @@
 static long timezone_offset;
 
 unsigned long ppc_proc_freq;
+EXPORT_SYMBOL(ppc_proc_freq);
 unsigned long ppc_tb_freq;
 
 static u64 tb_last_jiffy __cacheline_aligned_in_smp;
Index: linux-2.6.20-rc1/arch/powerpc/platforms/cell/spufs/spufs.h
===================================================================
--- linux-2.6.20-rc1.orig/arch/powerpc/platforms/cell/spufs/spufs.h
2007-02-01 17:21:41.945834112 -0600
+++ linux-2.6.20-rc1/arch/powerpc/platforms/cell/spufs/spufs.h
2007-02-05 08:06:01.793907392 -0600
@@ -75,6 +75,9 @@
 
 	struct list_head gang_list;
 	struct spu_gang *gang;
+	void * profile_private;		/* To be used only by profiler */
+	struct kref * prof_priv_kref;
+	void (* prof_priv_release) (struct kref *kref);
 };
 
 struct spu_gang {
Index: linux-2.6.20-rc1/arch/powerpc/platforms/cell/spufs/context.c
===================================================================
--- linux-2.6.20-rc1.orig/arch/powerpc/platforms/cell/spufs/context.c
2007-02-05 14:42:04.359859432 -0600
+++ linux-2.6.20-rc1/arch/powerpc/platforms/cell/spufs/context.c
2007-02-05 15:17:02.939929352 -0600
@@ -22,6 +22,7 @@
 
 #include <linux/fs.h>
 #include <linux/mm.h>
+#include <linux/module.h>
 #include <linux/slab.h>
 #include <asm/spu.h>
 #include <asm/spu_csa.h>
@@ -71,6 +72,7 @@
 	spu_fini_csa(&ctx->csa);
 	if (ctx->gang)
 		spu_gang_remove_ctx(ctx->gang, ctx);
+	kref_put(ctx->prof_priv_kref, ctx->prof_priv_release);
 	kfree(ctx);
 }
 
@@ -200,3 +202,31 @@
 
 	downgrade_write(&ctx->state_sema);
 }
+
+/* This interface allows a profiler (e.g., OProfile) to store
+ * spu_context information needed for profiling, allowing it to
+ * be saved across context save/restore operation.
+ *
+ * Assumes the caller has already incremented the ref count to
+ * profile_info; then spu_context_destroy must call kref_put
+ * on prof_info_kref.
+ */
+void spu_set_profile_private(struct spu_context * ctx, void *
profile_info,
+			     struct kref * prof_info_kref,
+			     void (* prof_info_release) (struct kref * kref))
+{
+	ctx->profile_private = profile_info;
+	ctx->prof_priv_kref = prof_info_kref;
+	ctx->prof_priv_release = prof_info_release;
+}
+EXPORT_SYMBOL_GPL(spu_set_profile_private);
+
+void * spu_get_profile_private(struct spu_context * ctx)
+{
+	if (ctx->profile_private)
+		kref_get(ctx->prof_priv_kref);
+	return ctx->profile_private;
+}
+EXPORT_SYMBOL_GPL(spu_get_profile_private);
+
+



^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [RFC,PATCH] CELL PPU Oprofile SPU profiling updated patch
  2007-02-06  0:28 [RFC,PATCH] CELL PPU Oprofile SPU profiling updated patch Carl Love
@ 2007-02-06  0:46 ` Paul Mackerras
  2007-02-06  0:49 ` [Cbe-oss-dev] [RFC, PATCH] " Benjamin Herrenschmidt
  2007-02-06 23:02 ` [Cbe-oss-dev] [RFC, PATCH] CELL " Carl Love
  2 siblings, 0 replies; 20+ messages in thread
From: Paul Mackerras @ 2007-02-06  0:46 UTC (permalink / raw)
  To: Carl Love; +Cc: linux-kernel, cbe-oss-dev, linuxppc-dev

Carl Love writes:

> This is the first update to the patch previously posted by Maynard
> Johnson as "PATCH 4/4. Add support to OProfile for profiling CELL".  

Is it really necessary to keep posting all these messages to three
lists?  Wouldn't cbe-oss-dev be sufficient?

Paul.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [Cbe-oss-dev] [RFC, PATCH] CELL PPU Oprofile SPU profiling updated patch
  2007-02-06  0:28 [RFC,PATCH] CELL PPU Oprofile SPU profiling updated patch Carl Love
  2007-02-06  0:46 ` Paul Mackerras
@ 2007-02-06  0:49 ` Benjamin Herrenschmidt
  2007-02-06 23:02 ` [Cbe-oss-dev] [RFC, PATCH] CELL " Carl Love
  2 siblings, 0 replies; 20+ messages in thread
From: Benjamin Herrenschmidt @ 2007-02-06  0:49 UTC (permalink / raw)
  To: Carl Love; +Cc: linux-kernel, cbe-oss-dev, linuxppc-dev

On Mon, 2007-02-05 at 16:28 -0800, Carl Love wrote:
> This is the first update to the patch previously posted by Maynard
> Johnson as "PATCH 4/4. Add support to OProfile for profiling CELL".  
> 
> There are still a few items from the comments being discussed
> specifically how to profile the dynamic code for the SPFS context
> switch
> code and how to deal with dynamic code stubs for library support.  Our
> proposal is to assign the samples from the SPFS and dynamic library
> code
> to an anonymous sample bucket.  The support for properly handling the
> symbol extraction in these cases would be deferred to a later SDK.    
> 
> There is also a bug in profiling overlay code that we are
> investigating.
> 
> Subject: Add support to OProfile for profiling Cell BE SPUs
> 
> From: Maynard Johnson <maynardj@us.ibm.com>
> 
> This patch updates the existing arch/powerpc/oprofile/op_model_cell.c
> to add in the SPU profiling capabilities.  In addition, a 'cell'
> subdirectory
> was added to arch/powerpc/oprofile to hold Cell-specific SPU profiling
> code.
> 
> Signed-off-by: Carl Love <carll@us.ibm.com>
> Signed-off-by: Maynard Johnson <mpjohn@us.ibm.com> 

Patch got mangled. You seem to be using evolution, make sure you use the
"Preformat" style when posting a patch.

Ben.



^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [Cbe-oss-dev] [RFC, PATCH] CELL Oprofile SPU profiling updated patch
  2007-02-06  0:28 [RFC,PATCH] CELL PPU Oprofile SPU profiling updated patch Carl Love
  2007-02-06  0:46 ` Paul Mackerras
  2007-02-06  0:49 ` [Cbe-oss-dev] [RFC, PATCH] " Benjamin Herrenschmidt
@ 2007-02-06 23:02 ` Carl Love
  2007-02-07 15:41   ` Maynard Johnson
  2007-02-08 14:18   ` Milton Miller
  2 siblings, 2 replies; 20+ messages in thread
From: Carl Love @ 2007-02-06 23:02 UTC (permalink / raw)
  To: linux-kernel; +Cc: cbe-oss-dev, linuxppc-dev

This is the first update to the patch previously posted by Maynard
Johnson as "PATCH 4/4. Add support to OProfile for profiling CELL".  

This repost fixes the line wrap issue that Ben mentioned.  Also the kref
handling for the cached info has been fixed and simplified.

There are still a few items from the comments being discussed
specifically how to profile the dynamic code for the SPFS context switch
code and how to deal with dynamic code stubs for library support.  Our
proposal is to assign the samples from the SPFS and dynamic library code
to an anonymous sample bucket.  The support for properly handling the
symbol extraction in these cases would be deferred to a later SDK.    

There is also a bug in profiling overlay code that we are investigating.


Subject: Add support to OProfile for profiling Cell BE SPUs

From: Maynard Johnson <maynardj@us.ibm.com>

This patch updates the existing arch/powerpc/oprofile/op_model_cell.c
to add in the SPU profiling capabilities.  In addition, a 'cell' subdirectory
was added to arch/powerpc/oprofile to hold Cell-specific SPU profiling
code.

Signed-off-by: Carl Love <carll@us.ibm.com>
Signed-off-by: Maynard Johnson <mpjohn@us.ibm.com>

Index: linux-2.6.20-rc1/arch/powerpc/configs/cell_defconfig
===================================================================
--- linux-2.6.20-rc1.orig/arch/powerpc/configs/cell_defconfig	2007-01-18 16:43:14.230540320 -0600
+++ linux-2.6.20-rc1/arch/powerpc/configs/cell_defconfig	2007-02-01 17:21:46.928875304 -0600
@@ -1403,7 +1403,7 @@
 # Instrumentation Support
 #
 CONFIG_PROFILING=y
-CONFIG_OPROFILE=y
+CONFIG_OPROFILE=m
 # CONFIG_KPROBES is not set
 
 #
Index: linux-2.6.20-rc1/arch/powerpc/oprofile/cell/pr_util.h
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.20-rc1/arch/powerpc/oprofile/cell/pr_util.h	2007-02-03 15:56:01.094856152 -0600
@@ -0,0 +1,78 @@
+ /*
+ * Cell Broadband Engine OProfile Support
+ *
+ * (C) Copyright IBM Corporation 2006
+ *
+ * Author: Maynard Johnson <maynardj@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#ifndef PR_UTIL_H
+#define PR_UTIL_H
+
+#include <linux/cpumask.h>
+#include <linux/oprofile.h>
+#include <asm/cell-pmu.h>
+#include <asm/spu.h>
+
+static inline int number_of_online_nodes(void) 
+{
+	u32 cpu; u32 tmp;
+	int nodes = 0;
+	for_each_online_cpu(cpu) {
+		tmp = cbe_cpu_to_node(cpu) + 1;
+		if (tmp > nodes)
+			nodes++;
+	}
+	return nodes;
+}
+
+/* Defines used for sync_start */
+#define SKIP_GENERIC_SYNC 0
+#define SYNC_START_ERROR -1
+#define DO_GENERIC_SYNC 1
+
+struct vma_to_fileoffset_map
+{
+	struct vma_to_fileoffset_map *next;
+	unsigned int vma;
+	unsigned int size;
+	unsigned int offset;
+	unsigned int guard_ptr;
+	unsigned int guard_val;
+};
+
+/* The three functions below are for maintaining and accessing
+ * the vma-to-fileoffset map.
+ */
+struct vma_to_fileoffset_map * create_vma_map(const struct spu * spu, u64 objectid);
+unsigned int vma_map_lookup(struct vma_to_fileoffset_map *map, unsigned int vma,
+			    const struct spu * aSpu);
+void vma_map_free(struct vma_to_fileoffset_map *map);
+
+/*
+ * Entry point for SPU profiling.
+ * cycles_reset is the SPU_CYCLES count value specified by the user.
+ */
+void start_spu_profiling(unsigned int cycles_reset);
+
+void stop_spu_profiling(void);
+
+ 
+/* add the necessary profiling hooks */
+int spu_sync_start(void);
+
+/* remove the hooks */
+int spu_sync_stop(void);
+ 
+/* Record SPU program counter samples to the oprofile event buffer. */
+void spu_sync_buffer(int spu_num, unsigned int * samples, 
+		     int num_samples);
+
+void set_profiling_frequency(unsigned int freq_khz, unsigned int cycles_reset);
+
+#endif    // PR_UTIL_H 
Index: linux-2.6.20-rc1/arch/powerpc/oprofile/cell/spu_profiler.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.20-rc1/arch/powerpc/oprofile/cell/spu_profiler.c	2007-02-05 09:32:25.708937424 -0600
@@ -0,0 +1,203 @@
+/*
+ * Cell Broadband Engine OProfile Support
+ *
+ * (C) Copyright IBM Corporation 2006
+ *
+ * Authors: Maynard Johnson <maynardj@us.ibm.com>
+ *          Carl Love <carll@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/hrtimer.h>
+#include <linux/smp.h>
+#include <linux/slab.h>
+#include <asm/cell-pmu.h>
+#include <asm/time.h>
+#include "pr_util.h"
+
+#define TRACE_ARRAY_SIZE 1024
+#define SCALE_SHIFT 14 
+
+static u32 * samples;
+
+static int spu_prof_running = 0;
+static unsigned int profiling_interval = 0;
+
+extern int spu_prof_num_nodes;
+
+
+#define NUM_SPU_BITS_TRBUF 16
+#define SPUS_PER_TB_ENTRY   4
+#define SPUS_PER_NODE       8
+
+void set_profiling_frequency(unsigned int freq_khz, unsigned int cycles_reset)
+{
+	unsigned long nsPerCyc;
+	if (!freq_khz)
+		freq_khz = ppc_proc_freq/1000;
+
+        /* To calculate a timeout in nanoseconds, the basic
+	 * formula is ns = cycles_reset * (NSEC_PER_SEC / cpu frequency).
+	 * To avoid floating point math, we use the scale math
+	 * technique as described in linux/jiffies.h.  We use
+	 * a scale factor of SCALE_SHIFT,which provides 4 decimal places
+	 * of precision, which is close enough for the purpose at hand.
+	 */
+
+	nsPerCyc = (USEC_PER_SEC << SCALE_SHIFT)/freq_khz;
+	profiling_interval = (nsPerCyc * cycles_reset) >> SCALE_SHIFT;
+
+}
+
+/*
+ * Extract SPU PC from trace buffer entry
+ */
+static void spu_pc_extract(int cpu, int entry)
+{
+        /* the trace buffer is 128 bits */
+	u64 trace_buffer[2];
+	u64 spu_pc_lower;  
+	u64 spu_pc_upper;
+	u64 spu_mask;
+	int spu;
+	int node_factor;
+	
+	spu_mask = 0xFFFF;
+	node_factor = cbe_cpu_to_node(cpu) * SPUS_PER_NODE;
+	
+	/* Each SPU PC is 16 bits; hence, four spus in each of 
+	 * the two 64-bit buffer entries that make up the
+	 * 128-bit trace_buffer entry.  Process the upper and
+	 * lower 64-bit values simultaneously.
+	 * trace[0] SPU PC contents are: 0 1 2 3
+	 * trace[1] SPU PC contents are: 4 5 6 7
+	 */
+
+	cbe_read_trace_buffer(cpu, trace_buffer);  
+
+	for (spu = SPUS_PER_TB_ENTRY-1; spu >= 0; spu--) {
+		spu_pc_lower = spu_mask & trace_buffer[0];
+		trace_buffer[0] = trace_buffer[0] >> NUM_SPU_BITS_TRBUF;
+
+		spu_pc_upper = spu_mask & trace_buffer[1];
+		trace_buffer[1] = trace_buffer[1] >> NUM_SPU_BITS_TRBUF;
+		
+		/* spu PC trace entry is upper 16 bits of the
+		 * 18 bit SPU program counter 
+		 */
+		spu_pc_lower = spu_pc_lower << 2;
+		spu_pc_upper = spu_pc_upper << 2;
+		
+		samples[((node_factor + spu) * TRACE_ARRAY_SIZE) + entry]
+			= (u32) spu_pc_lower;
+		samples[((node_factor + spu + SPUS_PER_TB_ENTRY) 
+			 * TRACE_ARRAY_SIZE) + entry] = (u32) spu_pc_upper;
+	}
+}
+
+static int cell_spu_pc_collection(int cpu)
+{
+	u32 trace_addr;
+	int entry;
+
+	/* process the collected SPU PC for the node */
+
+	entry = 0;
+
+	trace_addr = cbe_read_pm(cpu, trace_address);
+	while ((trace_addr & CBE_PM_TRACE_BUF_EMPTY) != 0x400)
+	{
+		/* there is data in the trace buffer to process */
+		spu_pc_extract(cpu, entry);
+
+		entry++;
+
+		if (entry >= TRACE_ARRAY_SIZE) 
+			/* spu_samples is full */
+			break;
+
+		trace_addr = cbe_read_pm(cpu, trace_address);
+	}
+	return(entry);
+}
+
+
+static int profile_spus(struct hrtimer * timer)
+{
+	ktime_t kt;
+	int cpu, node, k, num_samples, spu_num;
+	
+	if (!spu_prof_running)
+		goto stop;
+
+	for_each_online_cpu(cpu) {
+		if (cbe_get_hw_thread_id(cpu))
+			continue;
+
+		node = cbe_cpu_to_node(cpu);
+
+		num_samples = cell_spu_pc_collection(cpu);
+
+		if (num_samples == 0)
+			continue;
+
+		for (k = 0; k < SPUS_PER_NODE; k++) {
+			spu_num = k + (node * SPUS_PER_NODE);
+			spu_sync_buffer(spu_num, 
+					samples + (spu_num * TRACE_ARRAY_SIZE),
+					num_samples);
+		}
+	}
+	smp_wmb();
+
+	kt = ktime_set(0, profiling_interval);
+	if (!spu_prof_running)
+		goto stop;
+	hrtimer_forward(timer, timer->base->get_time(), kt);
+	return HRTIMER_RESTART;
+
+ stop:
+	printk(KERN_INFO "SPU_PROF: spu-prof timer ending\n");
+	return HRTIMER_NORESTART;
+}
+
+static struct hrtimer timer;
+/*
+ * Entry point for SPU profiling.
+ * NOTE:  SPU profiling is done system-wide, not per-CPU.
+ *
+ * cycles_reset is the count value specified by the user when
+ * setting up OProfile to count SPU_CYCLES.
+ */
+void start_spu_profiling(unsigned int cycles_reset) {
+
+	ktime_t kt;
+	
+	pr_debug("timer resolution: %lu\n", 
+		 TICK_NSEC);
+	kt = ktime_set(0, profiling_interval);
+	hrtimer_init(&timer, CLOCK_MONOTONIC, HRTIMER_REL);
+	timer.expires = kt;
+	timer.function = profile_spus;
+
+        /* Allocate arrays for collecting SPU PC samples */
+	samples = (u32 *) kzalloc(spu_prof_num_nodes * SPUS_PER_NODE * 
+				  TRACE_ARRAY_SIZE * sizeof(u32), GFP_KERNEL);
+
+	spu_prof_running = 1;
+	hrtimer_start(&timer, kt, HRTIMER_REL);
+}
+
+void stop_spu_profiling(void) 
+{
+	spu_prof_running = 0;
+	hrtimer_cancel(&timer);
+	kfree(samples);
+	pr_debug("SPU_PROF: stop_spu_profiling issued\n");
+}
+
+
Index: linux-2.6.20-rc1/arch/powerpc/oprofile/cell/spu_task_sync.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.20-rc1/arch/powerpc/oprofile/cell/spu_task_sync.c	2007-02-06 16:43:27.832908640 -0600
@@ -0,0 +1,425 @@
+/*
+ * Cell Broadband Engine OProfile Support
+ *
+ * (C) Copyright IBM Corporation 2006
+ *
+ * Author: Maynard Johnson <maynardj@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+/* The purpose of this file is to handle SPU event task switching
+ * and to record SPU context information into the OProfile
+ * event buffer. 
+ *
+ * Additionally, the spu_sync_buffer function is provided as a helper
+ * for recoding actual SPU program counter samples to the event buffer.
+ */
+
+#include <linux/notifier.h>
+#include <linux/list.h>
+#include <linux/numa.h>
+#include <linux/mm.h>
+#include <linux/dcookies.h>
+#include <linux/spinlock.h>
+#include <linux/kref.h>
+#include <linux/oprofile.h>
+#include "pr_util.h"
+
+#define RELEASE_ALL 9999
+
+static spinlock_t buffer_lock = SPIN_LOCK_UNLOCKED;
+static spinlock_t cache_lock = SPIN_LOCK_UNLOCKED;
+static int num_spu_nodes;
+int spu_prof_num_nodes;
+
+/* Container for caching information about an active SPU task.
+ * 
+ */
+struct cached_info {
+	struct vma_to_fileoffset_map * map;
+	struct spu * the_spu;   /* needed to access pointer to local_store */
+	struct kref cache_ref;
+};
+
+static struct cached_info * spu_info[MAX_NUMNODES * 8];
+
+static void destroy_cached_info(struct kref * kref)
+{
+	struct cached_info * info;
+	info = container_of(kref, struct cached_info, cache_ref);
+	vma_map_free(info->map);
+	kfree(info);
+}
+
+/* Return the cached_info for the passed SPU number.
+ * 
+ */
+static struct cached_info * get_cached_info(struct spu * the_spu, int spu_num)
+{
+	struct cached_info * ret_info = NULL;
+	unsigned long flags = 0;
+	if (spu_num >= num_spu_nodes) {
+		printk(KERN_ERR "SPU_PROF: " 
+		       "%s, line %d: Invalid index %d into spu info cache\n",
+		       __FUNCTION__, __LINE__, spu_num); 
+		goto out;
+	}
+	spin_lock_irqsave(&cache_lock, flags);
+	if (!spu_info[spu_num] && the_spu)
+		spu_info[spu_num] = (struct cached_info *)
+			spu_get_profile_private(the_spu->ctx);
+
+	ret_info = spu_info[spu_num];
+	spin_unlock_irqrestore(&cache_lock, flags);
+ out:
+	return ret_info;
+}
+
+
+/* Looks for cached info for the passed spu.  If not found, the
+ * cached info is created for the passed spu.
+ * Returns 0 for success; otherwise, -1 for error.  
+ */ 
+static int
+prepare_cached_spu_info(struct spu * spu, unsigned int objectId)
+{
+	unsigned long flags = 0;
+	struct vma_to_fileoffset_map * new_map;
+	int retval = 0;
+	struct cached_info * info = get_cached_info(spu, spu->number);
+
+	if (info) {
+		pr_debug("Found cached SPU info.\n");
+		goto out;
+	}
+
+	/* Create cached_info and set spu_info[spu->number] to point to it.
+	 * spu->number is a system-wide value, not a per-node value.
+	 */
+	info = kzalloc(sizeof(struct cached_info), GFP_KERNEL);
+	if (!info) {
+		printk(KERN_ERR "SPU_PROF: "
+		       "%s, line %d: create vma_map failed\n",
+		       __FUNCTION__, __LINE__);
+		goto err_alloc;
+	}
+	new_map = create_vma_map(spu, objectId);
+	if (!new_map) {
+		printk(KERN_ERR "SPU_PROF: "
+		       "%s, line %d: create vma_map failed\n",
+		       __FUNCTION__, __LINE__);
+		goto err_alloc;
+	}
+
+	pr_debug("Created vma_map\n");
+	info->map = new_map;
+	info->the_spu = spu;
+	kref_init(&info->cache_ref);
+	spin_lock_irqsave(&cache_lock, flags);
+	spu_info[spu->number] = info;
+	spin_unlock_irqrestore(&cache_lock, flags);
+	/* Increment count before passing off ref to SPUFS. */
+	kref_get(&info->cache_ref);
+	spu_set_profile_private(spu->ctx, info, &info->cache_ref,
+				destroy_cached_info);
+	goto out;
+	
+err_alloc:
+	retval = -1;
+out:
+	return retval;
+}
+
+/*
+ * NOTE:  The caller is responsible for locking the
+ *	  cache_lock prior to calling this function.
+ */
+static int release_cached_info(int spu_index)
+{
+	int index, end;
+	if (spu_index == RELEASE_ALL) {
+		end = num_spu_nodes;
+		index = 0;
+	} else {
+	        if (spu_index >= num_spu_nodes) {
+        	        printk(KERN_ERR "SPU_PROF: "
+			       "%s, line %d: Invalid index %d into spu info cache\n",
+               	               __FUNCTION__, __LINE__, spu_index);
+	                goto out;
+	        }
+		end = spu_index +1;
+		index = spu_index;
+	}
+	for (; index < end; index++) {
+		if (spu_info[index]) {
+			kref_put(&spu_info[index]->cache_ref, destroy_cached_info);
+			spu_info[index] = NULL;
+		}
+	}
+
+out:
+	return 0;
+}
+
+/* The source code for fast_get_dcookie was "borrowed"
+ * from drivers/oprofile/buffer_sync.c.
+ */
+
+/* Optimisation. We can manage without taking the dcookie sem
+ * because we cannot reach this code without at least one
+ * dcookie user still being registered (namely, the reader
+ * of the event buffer).
+ */
+static inline unsigned long fast_get_dcookie(struct dentry * dentry,
+					     struct vfsmount * vfsmnt)
+{
+	unsigned long cookie;
+
+	if (dentry->d_cookie)
+		return (unsigned long)dentry;
+	get_dcookie(dentry, vfsmnt, &cookie);
+	return cookie;
+}
+
+/* Look up the dcookie for the task's first VM_EXECUTABLE mapping,
+ * which corresponds loosely to "application name". Also, determine
+ * the offset for the SPU ELF object.  If computed offset is 
+ * non-zero, it implies an embedded SPU object; otherwise, it's a
+ * separate SPU binary, in which case we retrieve it's dcookie.
+ */
+static unsigned long 
+get_exec_dcookie_and_offset(
+	struct spu * spu, unsigned int * offsetp,
+	unsigned long * spu_bin_dcookie,
+	unsigned int spu_ref)
+{
+	unsigned long cookie = 0;
+	unsigned int my_offset = 0;
+	struct vm_area_struct * vma;
+	struct mm_struct * mm = spu->mm;
+
+	if (!mm)
+		goto out;
+
+	for (vma = mm->mmap; vma; vma = vma->vm_next) {
+		if (!vma->vm_file)
+			continue;
+		if (!(vma->vm_flags & VM_EXECUTABLE))
+			continue;
+		cookie = fast_get_dcookie(vma->vm_file->f_dentry,
+					  vma->vm_file->f_vfsmnt);
+		pr_debug("got dcookie for %s\n",
+			 vma->vm_file->f_dentry->d_name.name);
+		break;
+	}
+
+	for (vma = mm->mmap; vma; vma = vma->vm_next) {
+		if (vma->vm_start > spu_ref || vma->vm_end < spu_ref)
+			continue;
+		my_offset = spu_ref - vma->vm_start;
+		pr_debug("Found spu ELF at "
+			 " %X for file %s\n", my_offset,
+			 vma->vm_file->f_dentry->d_name.name);
+		*offsetp = my_offset;
+		if (my_offset == 0) {
+			if (!vma->vm_file) {
+				goto fail_no_spu_cookie;
+			}
+			*spu_bin_dcookie = fast_get_dcookie(
+				vma->vm_file->f_dentry,
+				vma->vm_file->f_vfsmnt);
+			pr_debug("got dcookie for %s\n",
+				 vma->vm_file->f_dentry->d_name.name);
+		}
+		break;			
+	}
+	
+ out:
+	return cookie;
+
+ fail_no_spu_cookie:
+	printk(KERN_ERR "SPU_PROF: "
+	       "%s, line %d: Cannot find dcookie for SPU binary\n",
+	       __FUNCTION__, __LINE__);
+	goto out;
+}
+
+
+
+/* This function finds or creates cached context information for the
+ * passed SPU and records SPU context information into the OProfile
+ * event buffer.
+ */
+static int process_context_switch(struct spu * spu, unsigned int objectId)
+{
+	unsigned long flags;
+	int retval = 0;
+	unsigned int offset = 0;
+	unsigned long spu_cookie = 0, app_dcookie = 0;
+	retval = prepare_cached_spu_info(spu, objectId);
+	if (retval == -1) {
+		goto out;
+	}
+        /* Get dcookie first because a mutex_lock is taken in that
+	 * code path, so interrupts must not be disabled.
+	 */
+	app_dcookie = get_exec_dcookie_and_offset(spu, &offset,
+						  &spu_cookie, objectId);
+
+        /* Record context info in event buffer */
+	spin_lock_irqsave(&buffer_lock, flags);
+	add_event_entry(ESCAPE_CODE);
+	add_event_entry(SPU_CTX_SWITCH_CODE);
+	add_event_entry(spu->number);
+	add_event_entry(spu->pid);
+	add_event_entry(spu->tgid);
+	add_event_entry(app_dcookie);
+
+	add_event_entry(ESCAPE_CODE);
+	if (offset) {
+	  /* When offset is non-zero,  this means the SPU ELF was embedded;
+	   * otherwise, it was loaded from a separate binary file.  For the
+	   * embedded case, we record the offset of the SPU ELF into the PPU
+	   * executable; for the non-embedded case, we record a dcookie that
+	   * points to the location of the SPU binary that was loaded.
+	   */
+		add_event_entry(SPU_OFFSET_CODE);
+		add_event_entry(offset);
+	} else {
+		add_event_entry(SPU_COOKIE_CODE);
+		add_event_entry(spu_cookie);
+	}
+	spin_unlock_irqrestore(&buffer_lock, flags);
+	smp_wmb();
+out:
+	return retval;
+}
+
+/* 
+ * This function is invoked on either a bind_context or unbind_context.  
+ * If called for an unbind_context, the val arg is 0; otherwise, 
+ * it is the object-id value for the spu context.
+ * The data arg is of type 'struct spu *'.
+ */
+static int spu_active_notify(struct notifier_block * self, unsigned long val,
+			     void * data)
+{
+	int retval;
+	unsigned long flags = 0;
+	struct spu * the_spu = data;
+	pr_debug("SPU event notification arrived\n");
+	if (!val){
+		spin_lock_irqsave(&cache_lock, flags);
+		retval = release_cached_info(the_spu->number);
+		spin_unlock_irqrestore(&cache_lock, flags);
+	} else {
+		retval = process_context_switch(the_spu, val);
+	}
+	return retval;
+}
+
+static struct notifier_block spu_active = {
+	.notifier_call = spu_active_notify,
+};
+
+/* The main purpose of this function is to synchronize
+ * OProfile with SPUFS by registering to be notified of
+ * SPU task switches.
+ *
+ * NOTE: When profiling SPUs, we must ensure that only
+ * spu_sync_start is invoked and not the generic sync_start
+ * in drivers/oprofile/oprof.c.  A return value of
+ * SKIP_GENERIC_SYNC or SYNC_START_ERROR will
+ * accomplish this.
+ */
+int spu_sync_start(void) {
+	int ret = SKIP_GENERIC_SYNC;
+	int register_ret;
+	unsigned long flags = 0;
+	spu_prof_num_nodes = number_of_online_nodes();
+	num_spu_nodes = spu_prof_num_nodes * 8;
+
+	spin_lock_irqsave(&buffer_lock, flags);
+	add_event_entry(ESCAPE_CODE);
+	add_event_entry(SPU_PROFILING_CODE);
+	add_event_entry(num_spu_nodes);
+	spin_unlock_irqrestore(&buffer_lock, flags);
+
+        /* Register for SPU events  */
+	register_ret = spu_switch_event_register(&spu_active);
+	if (register_ret) {
+		ret = SYNC_START_ERROR;
+		goto out;
+	}
+
+	pr_debug("spu_sync_start -- running.\n");
+out:
+	return ret;	
+}
+
+/* Record SPU program counter samples to the oprofile event buffer. */
+void spu_sync_buffer(int spu_num, unsigned int * samples, 
+		     int num_samples)
+{
+	unsigned long flags = 0;
+	int i;
+	struct vma_to_fileoffset_map * map;
+	struct spu * the_spu;
+	unsigned long long spu_num_ll = spu_num;
+	unsigned long long spu_num_shifted = spu_num_ll << 32;
+	struct cached_info * c_info = get_cached_info(NULL, spu_num);
+	if (c_info == NULL) {
+        /* This legitimately happens when the SPU task ends before all 
+	 * samples are recorded.  No big deal -- so we just drop a few samples.
+	 */
+		pr_debug("SPU_PROF: No cached SPU contex "
+			  "for SPU #%d. Dropping samples.\n", spu_num);
+		return ;
+	}
+
+	map = c_info->map;
+	the_spu = c_info->the_spu;
+	spin_lock_irqsave(&buffer_lock, flags);
+	for (i = 0; i < num_samples; i++) {
+		unsigned long long file_offset;
+		unsigned int sample = *(samples+i);
+		if (sample == 0)
+			continue;
+		file_offset = vma_map_lookup(
+			map, sample, the_spu);
+		/* For now, we'll drop samples that can't be mapped.
+		 * This can happen for generated stubs executed from
+		 * the SPU stack.  Do we need to record these somehow?
+		 */
+		if (unlikely(file_offset == -1))
+			continue;
+		add_event_entry(file_offset | spu_num_shifted);
+	}
+	spin_unlock_irqrestore(&buffer_lock, flags);
+}
+
+
+int spu_sync_stop(void)
+{
+	unsigned long flags = 0;
+	int ret = spu_switch_event_unregister(&spu_active);
+	if (ret) {
+		printk(KERN_ERR "SPU_PROF: "
+		       "%s, line %d: spu_switch_event_unregister returned %d\n",
+		       __FUNCTION__, __LINE__, ret);
+		goto out;
+	} 
+
+	spin_lock_irqsave(&cache_lock, flags);
+	ret = release_cached_info(RELEASE_ALL);
+	spin_unlock_irqrestore(&cache_lock, flags);
+out:
+	pr_debug("spu_sync_stop -- done.\n");
+	return ret;
+}
+
+
Index: linux-2.6.20-rc1/arch/powerpc/oprofile/cell/vma_map.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.20-rc1/arch/powerpc/oprofile/cell/vma_map.c	2007-02-01 17:21:46.944872872 -0600
@@ -0,0 +1,229 @@
+ /*
+ * Cell Broadband Engine OProfile Support
+ *
+ * (C) Copyright IBM Corporation 2006
+ *
+ * Author: Maynard Johnson <maynardj@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+/* The code in this source file is responsible for generating
+ * vma-to-fileOffset maps for both overlay and non-overlay SPU
+ * applications.
+ */
+
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/uaccess.h>
+#include <linux/elf.h>
+#include "pr_util.h"
+
+
+void vma_map_free(struct vma_to_fileoffset_map *map)
+{
+	while (map) {
+		struct vma_to_fileoffset_map *next = map->next;
+		kfree(map);
+		map = next;
+	}
+}
+
+unsigned int
+vma_map_lookup(struct vma_to_fileoffset_map *map, unsigned int vma,
+	       const struct spu * aSpu)
+{
+	u32 offset = -1;
+	u32 ovly_grd;
+	for (; map; map = map->next) {
+		if (vma < map->vma || vma >= map->vma + map->size)
+			continue;
+
+		if (map->guard_ptr) {
+			ovly_grd = *(u32 *)(aSpu->local_store + map->guard_ptr);
+			if (ovly_grd != map->guard_val)
+				continue;
+		}
+		break;
+	}
+
+	if (likely(map != NULL)) {
+		offset = vma - map->vma + map->offset;
+	}
+	return offset;
+}
+
+static struct vma_to_fileoffset_map *
+vma_map_add(struct vma_to_fileoffset_map *map, unsigned int vma,
+	    unsigned int size, unsigned int offset, unsigned int guard_ptr,
+	    unsigned int guard_val)
+{
+	struct vma_to_fileoffset_map *new = kzalloc(sizeof(struct vma_to_fileoffset_map), GFP_KERNEL);
+	if (!new) {
+		printk(KERN_ERR "SPU_PROF: %s, line %d: malloc failed\n",
+		       __FUNCTION__, __LINE__);
+		vma_map_free(map);
+		return NULL;
+	}
+
+	new->next = map;
+	new->vma = vma;
+	new->size = size;
+	new->offset = offset;
+	new->guard_ptr = guard_ptr;
+	new->guard_val = guard_val;
+
+	return new;
+}
+
+
+/* Parse SPE ELF header and generate a list of vma_maps.
+ * A pointer to the first vma_map in the generated list
+ * of vma_maps is returned.  */
+struct vma_to_fileoffset_map * create_vma_map(const struct spu * aSpu, 
+					      unsigned long spu_elf_start)
+{
+	static const unsigned char expected[EI_PAD] = {
+		[EI_MAG0] = ELFMAG0,
+		[EI_MAG1] = ELFMAG1,
+		[EI_MAG2] = ELFMAG2,
+		[EI_MAG3] = ELFMAG3,
+		[EI_CLASS] = ELFCLASS32,
+		[EI_DATA] = ELFDATA2MSB,
+		[EI_VERSION] = EV_CURRENT,
+		[EI_OSABI] = ELFOSABI_NONE
+	};
+
+	struct vma_to_fileoffset_map *map = NULL;
+	unsigned int overlay_tbl_offset = -1;
+	unsigned long phdr_start, shdr_start;
+	Elf32_Ehdr ehdr;
+	Elf32_Phdr phdr;
+	Elf32_Shdr shdr, shdr_str;
+	Elf32_Sym sym;
+	int i, j;
+	char name[32];
+
+	unsigned int ovly_table_sym = 0;
+	unsigned int ovly_buf_table_sym = 0;
+	unsigned int ovly_table_end_sym = 0;
+	unsigned int ovly_buf_table_end_sym = 0;
+	unsigned long ovly_table;
+	unsigned int n_ovlys;
+
+	struct {
+		unsigned int vma;
+		unsigned int size;
+		unsigned int offset;
+		unsigned int buf;
+	} ovly;
+
+	/* Get and validate ELF header.  */
+
+	copy_from_user(&ehdr, (void *) spu_elf_start, sizeof (ehdr));
+	if (memcmp(ehdr.e_ident, expected, EI_PAD) != 0) {
+		printk(KERN_ERR "SPU_PROF: "
+		       "%s, line %d: Unexpected value parsing SPU ELF\n",
+		       __FUNCTION__, __LINE__);
+		return NULL;
+	}
+	if (ehdr.e_machine != 23) {
+		printk(KERN_ERR "SPU_PROF: "
+		       "%s, line %d: Unexpected value parsing SPU ELF\n",
+		       __FUNCTION__,  __LINE__);
+
+		return NULL;
+	}
+	if (ehdr.e_type != ET_EXEC) {
+		printk(KERN_ERR "SPU_PROF: "
+		       "%s, line %d: Unexpected value parsing SPU ELF\n",
+		       __FUNCTION__, __LINE__);
+		return NULL;
+	}
+	phdr_start = spu_elf_start + ehdr.e_phoff;
+	shdr_start = spu_elf_start + ehdr.e_shoff;
+
+	/* Traverse program headers.  */
+	for (i = 0; i < ehdr.e_phnum; i++) {
+		copy_from_user(&phdr, (void *) (phdr_start + i * sizeof(phdr)), 
+			       sizeof(phdr));
+		if (phdr.p_type != PT_LOAD)
+			continue;
+		if (phdr.p_flags & (1 << 27))
+			continue;
+
+		map = vma_map_add(map, phdr.p_vaddr, phdr.p_memsz, 
+				  phdr.p_offset, 0, 0);
+		if (!map)
+			return NULL;
+	}
+
+	pr_debug("SPU_PROF: Created non-overlay maps\n");	
+	/* Traverse section table and search for overlay-related symbols.  */
+	for (i = 0; i < ehdr.e_shnum; i++) {
+		copy_from_user(&shdr, (void *) (shdr_start + i * sizeof(shdr)), 
+			       sizeof(shdr));
+		if (shdr.sh_type != SHT_SYMTAB)
+			continue;
+		if (shdr.sh_entsize != sizeof (sym))
+			continue;
+
+		copy_from_user(&shdr_str, 
+			       (void *) (shdr_start + shdr.sh_link * sizeof(shdr)),
+			       sizeof(shdr));
+		if (shdr_str.sh_type != SHT_STRTAB)
+			return NULL;
+
+		for (j = 0; j < shdr.sh_size / sizeof (sym); j++) {
+			copy_from_user(&sym, (void *) (spu_elf_start +
+						       shdr.sh_offset + j * sizeof (sym)),
+				       sizeof (sym));
+			copy_from_user(name, (void *) (spu_elf_start + shdr_str.sh_offset + 
+						       sym.st_name),
+				       20);
+			if (memcmp(name, "_ovly_table", 12) == 0)
+				ovly_table_sym = sym.st_value;
+			if (memcmp(name, "_ovly_buf_table", 16) == 0)
+				ovly_buf_table_sym = sym.st_value;
+			if (memcmp(name, "_ovly_table_end", 16) == 0)
+				ovly_table_end_sym = sym.st_value;
+			if (memcmp(name, "_ovly_buf_table_end", 20) == 0)
+				ovly_buf_table_end_sym = sym.st_value;
+		}
+	}
+
+	/* If we don't have overlays, we're done.  */
+	if (ovly_table_sym == 0 || ovly_buf_table_sym == 0
+	    || ovly_table_end_sym == 0 || ovly_buf_table_end_sym == 0) {
+		pr_debug("SPU_PROF: No overlay table found\n");
+		return map;
+	}
+	else {
+		pr_debug("SPU_PROF: Overlay table found\n");
+	}
+
+	overlay_tbl_offset = vma_map_lookup(map, ovly_table_sym, aSpu);
+	if (overlay_tbl_offset < 0) {
+		printk(KERN_ERR "SPU_PROF: "
+		       "%s, line %d: Error finding SPU overlay table\n",
+		       __FUNCTION__, __LINE__);
+		return NULL;
+	}
+	ovly_table = spu_elf_start + overlay_tbl_offset;
+	n_ovlys = (ovly_table_end_sym - ovly_table_sym) / sizeof (ovly);
+
+	/* Traverse overlay table.  */
+	for (i = 0; i < n_ovlys; i++) {
+		copy_from_user(&ovly, (void *) (ovly_table + i * sizeof (ovly)),
+			       sizeof (ovly));
+		map = vma_map_add(map, ovly.vma, ovly.size, ovly.offset,
+				   ovly_buf_table_sym + (ovly.buf - 1) * 4, i + 1);
+		if (!map)
+			return NULL;
+	}
+	
+	return map;
+}
Index: linux-2.6.20-rc1/arch/powerpc/oprofile/common.c
===================================================================
--- linux-2.6.20-rc1.orig/arch/powerpc/oprofile/common.c	2007-01-18 16:43:14.429510072 -0600
+++ linux-2.6.20-rc1/arch/powerpc/oprofile/common.c	2007-02-01 17:21:46.946872568 -0600
@@ -150,6 +150,8 @@
 #ifdef CONFIG_PPC_CELL_NATIVE
 		case PPC_OPROFILE_CELL:
 			model = &op_model_cell;
+			ops->sync_start = model->sync_start;
+			ops->sync_stop = model->sync_stop;
 			break;
 #endif
 		case PPC_OPROFILE_RS64:
Index: linux-2.6.20-rc1/arch/powerpc/oprofile/Kconfig
===================================================================
--- linux-2.6.20-rc1.orig/arch/powerpc/oprofile/Kconfig	2007-01-18 16:43:14.426510528 -0600
+++ linux-2.6.20-rc1/arch/powerpc/oprofile/Kconfig	2007-02-03 17:05:51.967892936 -0600
@@ -7,7 +7,8 @@
 
 config OPROFILE
 	tristate "OProfile system profiling (EXPERIMENTAL)"
-	depends on PROFILING
+	default m
+	depends on SPU_FS && PROFILING
 	help
 	  OProfile is a profiling system capable of profiling the
 	  whole system, include the kernel, kernel modules, libraries,
Index: linux-2.6.20-rc1/arch/powerpc/oprofile/Makefile
===================================================================
--- linux-2.6.20-rc1.orig/arch/powerpc/oprofile/Makefile	2007-01-18 16:43:14.429510072 -0600
+++ linux-2.6.20-rc1/arch/powerpc/oprofile/Makefile	2007-02-01 17:21:46.948872264 -0600
@@ -11,7 +11,8 @@
 		timer_int.o )
 
 oprofile-y := $(DRIVER_OBJS) common.o backtrace.o
-oprofile-$(CONFIG_PPC_CELL_NATIVE) += op_model_cell.o
+oprofile-$(CONFIG_PPC_CELL_NATIVE) += op_model_cell.o \
+					cell/spu_profiler.o cell/vma_map.o cell/spu_task_sync.o
 oprofile-$(CONFIG_PPC64) += op_model_rs64.o op_model_power4.o
 oprofile-$(CONFIG_FSL_BOOKE) += op_model_fsl_booke.o
 oprofile-$(CONFIG_6xx) += op_model_7450.o
Index: linux-2.6.20-rc1/arch/powerpc/oprofile/op_model_cell.c
===================================================================
--- linux-2.6.20-rc1.orig/arch/powerpc/oprofile/op_model_cell.c	2007-02-01 17:21:38.388840624 -0600
+++ linux-2.6.20-rc1/arch/powerpc/oprofile/op_model_cell.c	2007-02-03 15:59:38.555810464 -0600
@@ -37,6 +37,16 @@
 #include <asm/system.h>
 
 #include "../platforms/cell/interrupt.h"
+#include "cell/pr_util.h"
+
+/* spu_cycle_reset is the number of cycles between samples.
+ * This variable is used for SPU profiling and should ONLY be set
+ * at the beginning of cell_reg_setup; otherwise, it's read-only.
+ */
+static unsigned int spu_cycle_reset = 0;
+
+#define NUM_SPUS_PER_NODE    8
+#define SPU_CYCLES_EVENT_NUM 2        /*  event number for SPU_CYCLES */
 
 #define PPU_CYCLES_EVENT_NUM 1	/*  event number for CYCLES */
 #define PPU_CYCLES_GRP_NUM   1  /* special group number for identifying
@@ -50,7 +60,6 @@
 #define NUM_TRACE_BUS_WORDS 4
 #define NUM_INPUT_BUS_WORDS 2
 
-
 struct pmc_cntrl_data {
 	unsigned long vcntr;
 	unsigned long evnts;
@@ -140,12 +149,21 @@
 /*
  * Firmware interface functions
  */
+
 static int
 rtas_ibm_cbe_perftools(int subfunc, int passthru,
 		       void *address, unsigned long length)
 {
 	u64 paddr = __pa(address);
 
+	pm_rtas_token = rtas_token("ibm,cbe-perftools");  
+
+	if (pm_rtas_token == RTAS_UNKNOWN_SERVICE) {
+	  printk(KERN_ERR
+		 "%s: rtas token ibm,cbe-perftools unknown\n",
+		 __FUNCTION__);
+	}
+
 	return rtas_call(pm_rtas_token, 5, 1, NULL, subfunc, passthru,
 			 paddr >> 32, paddr & 0xffffffff, length);
 }
@@ -486,7 +504,12 @@
 	       struct op_system_config *sys, int num_ctrs)
 {
 	int i, j, cpu;
+	spu_cycle_reset = 0;
 
+	if (ctr[0].event == SPU_CYCLES_EVENT_NUM) {
+		spu_cycle_reset = ctr[0].count;
+		return;
+	}
 	pm_rtas_token = rtas_token("ibm,cbe-perftools");
 	if (pm_rtas_token == RTAS_UNKNOWN_SERVICE) {
 		printk(KERN_WARNING "%s: RTAS_UNKNOWN_SERVICE\n",
@@ -572,6 +595,8 @@
 	;
 }
 
+
+
 /* This function is called once for each cpu */
 static void cell_cpu_setup(struct op_counter_config *cntr)
 {
@@ -579,6 +604,9 @@
 	u32 num_enabled = 0;
 	int i;
 
+	if (spu_cycle_reset)
+		return;
+
 	/* There is one performance monitor per processor chip (i.e. node),
 	 * so we only need to perform this function once per node.
 	 */
@@ -613,11 +641,216 @@
 	;
 }
 
-static void cell_global_start(struct op_counter_config *ctr)
+#define size 24
+#define ENTRIES  (0x1<<8) /* 256 */
+#define MAXLFSR  0xFFFFFF
+
+int initial_lfsr[] =
+{16777215, 3797240, 13519805, 11602690, 6497030, 7614675, 2328937, 2889445,
+ 12364575, 8723156, 2450594, 16280864, 14742496, 10904589, 6434212, 4996256,
+ 5814270, 13014041, 9825245, 410260, 904096, 15151047, 15487695, 3061843,
+ 16482682, 7938572, 4893279, 9390321, 4320879, 5686402, 1711063, 10176714,
+ 4512270, 1057359, 16700434, 5731602, 2070114, 16030890, 1208230, 15603106,
+ 11857845, 6470172, 1362790, 7316876, 8534496, 1629197, 10003072, 1714539,
+ 1814669, 7106700, 5427154, 3395151, 3683327, 12950450, 16620273, 12122372,
+ 7194999, 9952750, 3608260, 13604295, 2266835, 14943567, 7079230, 777380,
+ 4516801, 1737661, 8730333, 13796927, 3247181, 9950017, 3481896, 16527555,
+ 13116123, 14505033, 9781119, 4860212, 7403253, 13264219, 12269980, 100120,
+ 664506, 607795, 8274553, 13133688, 6215305, 13208866, 16439693, 3320753,
+ 8773582, 13874619, 1784784, 4513501, 11002978, 9318515, 3038856, 14254582,
+ 15484958, 15967857, 13504461, 13657322, 14724513, 13955736, 5695315, 7330509,
+ 12630101, 6826854, 439712, 4609055, 13288878, 1309632, 4996398, 11392266,
+ 793740, 7653789, 2472670, 14641200, 5164364, 5482529, 10415855, 1629108,
+ 2012376, 13661123, 14655718, 9534083, 16637925, 2537745, 9787923, 12750103,
+ 4660370, 3283461, 14862772, 7034955, 6679872, 8918232, 6506913, 103649,
+ 6085577, 13324033, 14251613, 11058220, 11998181, 3100233, 468898, 7104918,
+ 12498413, 14408165, 1208514, 15712321, 3088687, 14778333, 3632503, 11151952,
+ 98896, 9159367, 8866146, 4780737, 4925758, 12362320, 4122783, 8543358,
+ 7056879, 10876914, 6282881, 1686625, 5100373, 4573666, 9265515, 13593840,
+ 5853060, 1188880, 4237111, 15765555, 14344137, 4608332, 6590210, 13745050,
+ 10916568, 12340402, 7145275, 4417153, 2300360, 12079643, 7608534, 15238251,
+ 4947424, 7014722, 3984546, 7168073, 10759589, 16293080, 3757181, 4577717,
+ 5163790, 2488841, 4650617, 3650022, 5440654, 1814617, 6939232, 15540909,
+ 501788, 1060986, 5058235, 5078222, 3734500, 10762065, 390862, 5172712,
+ 1070780, 7904429, 1669757, 3439997, 2956788, 14944927, 12496638, 994152,
+ 8901173, 11827497, 4268056, 15725859, 1694506, 5451950, 2892428, 1434298,
+ 9048323, 13558747, 15083840, 8154495, 15830901, 391127, 14970070, 2451434,
+ 2080347, 10775644, 14599429, 12540753, 4813943, 16140655, 2421772, 12724304,
+ 12935733, 7206473, 5697333, 10328104, 2418008, 13547986, 284246, 1732363,
+ 16375319, 8109554, 16372365, 14346072, 1835890, 13059499, 2442500, 4110674};
+
+/*
+ * The hardware uses an LFSR counting sequence to determine when to capture
+ * the SPU PCs.  The SPU PC capture is done when the LFSR sequence reaches the
+ * last value in the sequence.  An LFSR sequence is like a puesdo random 
+ * number sequence where each number occurs once in the sequence but the 
+ * sequence is not in numerical order.  To reduce the calculation time, a 
+ * sequence of 256 precomputed values in the LFSR sequence are stored in a
+ * table.  The nearest precomputed value is used as the initial point from
+ * which to caculate the desired LFSR value that is n from the end of the 
+ * sequence.  The lookup table reduces the maximum number of iterations in 
+ * the loop from 2^24 to 2^16.
+ */
+static int calculate_lfsr(int n)
 {
-	u32 cpu;
+  int i;
+
+  int start_lfsr_index;
+  unsigned int newlfsr0;
+  unsigned int lfsr = MAXLFSR;
+  unsigned int binsize = (MAXLFSR+1)/ENTRIES;
+  unsigned int howmany; 
+
+  start_lfsr_index = (MAXLFSR - n) / binsize;
+  lfsr = initial_lfsr[start_lfsr_index];
+  howmany = (MAXLFSR - n) - (start_lfsr_index * (binsize));
+
+  for (i = 2; i < howmany+2; i++) {
+    newlfsr0 = (((lfsr >> (size - 1 - 0)) & 1) ^
+		((lfsr >> (size - 1 - 1)) & 1) ^
+		(((lfsr >> (size - 1 - 6)) & 1) ^
+		 ((lfsr >> (size - 1 - 23)) & 1)));
+
+    lfsr >>= 1;
+    lfsr = lfsr | (newlfsr0 << (size - 1)); 
+  }
+  return lfsr;
+}
+
+static void pm_rtas_activate_spu_profiling(u32 node)
+{
+	int ret, i;
+	struct pm_signal pm_signal_local[NR_PHYS_CTRS];
+
+	/* Set up the rtas call to configure the debug bus to 
+	 * route the SPU PCs.  Setup the pm_signal for each SPU */
+	for (i = 0; i < NUM_SPUS_PER_NODE; i++) {
+		pm_signal_local[i].cpu = node;
+		pm_signal_local[i].signal_group = 41;
+		pm_signal_local[i].bus_word = 1 << i / 2; /* spu i on 
+							   * word (i/2) 
+							   */
+		pm_signal_local[i].sub_unit = i;	/* spu i */
+		pm_signal_local[i].bit = 63;
+	}
+
+	pm_rtas_token = rtas_token("ibm,cbe-perftools");
+	if (pm_rtas_token == RTAS_UNKNOWN_SERVICE) {
+		printk(KERN_WARNING "%s: RTAS_UNKNOWN_SERVICE \n",
+		       __FUNCTION__);
+	}
+
+	ret = rtas_ibm_cbe_perftools(SUBFUNC_ACTIVATE, PASSTHRU_ENABLE,
+				     pm_signal_local,
+				     8 * sizeof(struct pm_signal)); //FIXME 8 to #define
+
+	if (ret)
+		printk(KERN_WARNING "%s: rtas returned: %d\n",
+		       __FUNCTION__, ret);
+
+}
+
+#ifdef CONFIG_CPU_FREQ
+static int
+oprof_cpufreq_notify(struct notifier_block *nb, unsigned long val, void *data)
+{
+	int ret = 0;
+	struct cpufreq_freqs * frq = data;
+	if ((val == CPUFREQ_PRECHANGE && frq->old < frq->new) ||
+	    (val == CPUFREQ_POSTCHANGE && frq->old > frq->new) ||
+	    (val == CPUFREQ_RESUMECHANGE || val == CPUFREQ_SUSPENDCHANGE))
+		set_profiling_frequency(frq->new, spu_cycle_reset);
+	return ret;
+}
+
+static struct notifier_block cpu_freq_notifier_block = {
+	.notifier_call	= oprof_cpufreq_notify
+};
+#endif
+
+static void cell_global_start_spu(struct op_counter_config *ctr)
+{
+	int subfunc, rtn_value;
+	unsigned int lfsr_value;
+	int cpu;
+	int ret = 0;
+	unsigned int cpu_khzfreq = 0;
+
+	/* The SPU profiling uses time-based profiling based on
+	 * cpu frequency, so if configured with the CPU_FREQ
+	 * option, we should detect frequency changes and react
+	 * accordingly.
+	 */
+#ifdef CONFIG_CPU_FREQ
+	ret = cpufreq_register_notifier(&cpu_freq_notifier_block,
+					CPUFREQ_TRANSITION_NOTIFIER);
+	if (ret < 0)
+		printk(KERN_ERR "CPU freq change registration failed: %d\n",
+		       ret);
+	else
+		cpu_khzfreq = cpufreq_quick_get(smp_processor_id());
+#endif
+
+	set_profiling_frequency(cpu_khzfreq, spu_cycle_reset);
+
+	for_each_online_cpu(cpu) {
+		if (cbe_get_hw_thread_id(cpu))
+			continue;
+		/* Setup SPU cycle-based profiling.
+		 * Set perf_mon_control bit 0 to a zero before
+		 * enabling spu collection hardware.
+		 */
+		cbe_write_pm(cpu, pm_control, 0);
+
+		pm_rtas_activate_spu_profiling(cbe_cpu_to_node(cpu));
+
+		if (spu_cycle_reset > 0xFFFFFE) 
+				lfsr_value = calculate_lfsr(1);  /* use largest possible 
+								  *  value 
+								  */
+		else 
+		    lfsr_value = calculate_lfsr(spu_cycle_reset);
+
+		if (lfsr_value == 0) {  /* must use a non zero value.  Zero
+					 * disables data collection.
+					 */
+				lfsr_value = calculate_lfsr(1);  /* use largest possible 
+								  * value 
+								 */
+		}
+
+		lfsr_value = lfsr_value << 8; /* shift lfsr to correct 
+					       * register location
+					       */
+		
+		pm_rtas_token = rtas_token("ibm,cbe-spu-perftools");  
+
+		if (pm_rtas_token == RTAS_UNKNOWN_SERVICE) {
+			printk(KERN_ERR
+			       "%s: rtas token ibm,cbe-spu-perftools unknown\n",
+			       __FUNCTION__);
+		}
+
+		subfunc = 2;	// 2 - activate SPU tracing, 3 - deactivate
+
+		rtn_value = rtas_call(pm_rtas_token, 3, 1, NULL, subfunc,
+			  cbe_cpu_to_node(cpu), lfsr_value);
+
+		if (rtn_value != 0)
+			printk(KERN_ERR
+			       "%s: rtas call ibm,cbe-spu-perftools failed, return = %d\n",
+			       __FUNCTION__, rtn_value);
+	}
+
+	start_spu_profiling(spu_cycle_reset);
+
+	oprofile_running = 1;
+}
+
+static void cell_global_start_ppu(struct op_counter_config *ctr)
+{
+	u32 cpu, i;
 	u32 interrupt_mask = 0;
-	u32 i;
 
 	/* This routine gets called once for the system.
 	 * There is one performance monitor per node, so we
@@ -658,7 +891,61 @@
 	start_virt_cntrs();
 }
 
-static void cell_global_stop(void)
+
+static void cell_global_start(struct op_counter_config *ctr)
+{
+	if (spu_cycle_reset) {
+		cell_global_start_spu(ctr);
+	} else {
+		cell_global_start_ppu(ctr);
+	}
+}
+
+static void cell_global_stop_spu(void)
+{
+	int subfunc, rtn_value;
+	unsigned int lfsr_value;
+	int cpu;
+
+	oprofile_running = 0;
+
+#ifdef CONFIG_CPU_FREQ
+	cpufreq_unregister_notifier(&cpu_freq_notifier_block,
+				    CPUFREQ_TRANSITION_NOTIFIER);
+#endif
+
+	for_each_online_cpu(cpu) {
+		if (cbe_get_hw_thread_id(cpu))
+			continue;
+
+		pm_rtas_token = rtas_token("ibm,cbe-spu-perftools");  
+
+		if (pm_rtas_token == RTAS_UNKNOWN_SERVICE) {
+			printk(KERN_ERR
+			       "%s: rtas token ibm,cbe-spu-perftools unknown\n",
+			       __FUNCTION__);
+		}
+
+		subfunc = 3;	// 2 - activate SPU tracing, 3 - deactivate
+		lfsr_value = 0x8f100000;
+
+		rtn_value =
+		    rtas_call(pm_rtas_token, 3, 1, NULL, subfunc,
+			      cbe_cpu_to_node(cpu), lfsr_value);
+
+		if (rtn_value != 0)
+			printk
+			    ("ERROR, rtas call ibm,cbe-spu-perftools failed, return = %d\n",
+			     rtn_value);
+
+		/* Deactivate the signals */
+		pm_rtas_reset_signals(cbe_cpu_to_node(cpu));
+	}
+
+	stop_spu_profiling();
+}
+
+static void cell_global_stop_ppu(void)
 {
 	int cpu;
 
@@ -686,6 +973,16 @@
 	}
 }
 
+static void cell_global_stop(void)
+{
+	if (spu_cycle_reset) {
+		cell_global_stop_spu();
+	} else {
+		cell_global_stop_ppu();
+	}
+
+}
+
 static void
 cell_handle_interrupt(struct pt_regs *regs, struct op_counter_config *ctr)
 {
@@ -754,10 +1051,35 @@
 	spin_unlock_irqrestore(&virt_cntr_lock, flags);
 }
 
+/* This function is called from the generic OProfile
+ * driver.  When profiling PPUs, we need to do the
+ * generic sync start; otherwise, do spu_sync_start.
+ */
+static int cell_sync_start(void)
+{
+	if (spu_cycle_reset)
+		return spu_sync_start();
+	else
+		return DO_GENERIC_SYNC;
+}
+
+static int cell_sync_stop(void)
+{
+	if (spu_cycle_reset)
+		return spu_sync_stop();
+	else
+		return 1;
+}
+
+
 struct op_powerpc_model op_model_cell = {
 	.reg_setup = cell_reg_setup,
 	.cpu_setup = cell_cpu_setup,
 	.global_start = cell_global_start,
 	.global_stop = cell_global_stop,
+	.sync_start = cell_sync_start,
+	.sync_stop = cell_sync_stop,
 	.handle_interrupt = cell_handle_interrupt,
 };
+
+
Index: linux-2.6.20-rc1/arch/powerpc/platforms/cell/spufs/sched.c
===================================================================
--- linux-2.6.20-rc1.orig/arch/powerpc/platforms/cell/spufs/sched.c	2007-02-01 17:21:41.943834416 -0600
+++ linux-2.6.20-rc1/arch/powerpc/platforms/cell/spufs/sched.c	2007-02-01 17:21:46.957870896 -0600
@@ -129,6 +129,7 @@
 	ctx->spu = spu;
 	ctx->ops = &spu_hw_ops;
 	spu->pid = current->pid;
+	spu->tgid = current->tgid;
 	spu->prio = current->prio;
 	spu->mm = ctx->owner;
 	mm_needs_global_tlbie(spu->mm);
@@ -161,6 +162,7 @@
 	spu->dma_callback = NULL;
 	spu->mm = NULL;
 	spu->pid = 0;
+	spu->tgid = 0;
 	spu->prio = MAX_PRIO;
 	ctx->ops = &spu_backing_ops;
 	ctx->spu = NULL;
Index: linux-2.6.20-rc1/drivers/oprofile/buffer_sync.c
===================================================================
--- linux-2.6.20-rc1.orig/drivers/oprofile/buffer_sync.c	2007-01-18 16:43:11.675529376 -0600
+++ linux-2.6.20-rc1/drivers/oprofile/buffer_sync.c	2007-02-01 17:21:46.960870440 -0600
@@ -26,6 +26,7 @@
 #include <linux/profile.h>
 #include <linux/module.h>
 #include <linux/fs.h>
+#include <linux/oprofile.h>
  
 #include "oprofile_stats.h"
 #include "event_buffer.h"
Index: linux-2.6.20-rc1/drivers/oprofile/event_buffer.h
===================================================================
--- linux-2.6.20-rc1.orig/drivers/oprofile/event_buffer.h	2007-01-18 16:43:11.673529680 -0600
+++ linux-2.6.20-rc1/drivers/oprofile/event_buffer.h	2007-02-01 17:21:46.962870136 -0600
@@ -19,28 +19,10 @@
  
 /* wake up the process sleeping on the event file */
 void wake_up_buffer_waiter(void);
- 
-/* Each escaped entry is prefixed by ESCAPE_CODE
- * then one of the following codes, then the
- * relevant data.
- */
-#define ESCAPE_CODE			~0UL
-#define CTX_SWITCH_CODE 		1
-#define CPU_SWITCH_CODE 		2
-#define COOKIE_SWITCH_CODE 		3
-#define KERNEL_ENTER_SWITCH_CODE	4
-#define KERNEL_EXIT_SWITCH_CODE		5
-#define MODULE_LOADED_CODE		6
-#define CTX_TGID_CODE			7
-#define TRACE_BEGIN_CODE		8
-#define TRACE_END_CODE			9
- 
+  
 #define INVALID_COOKIE ~0UL
 #define NO_COOKIE 0UL
 
-/* add data to the event buffer */
-void add_event_entry(unsigned long data);
- 
 extern struct file_operations event_buffer_fops;
  
 /* mutex between sync_cpu_buffers() and the
Index: linux-2.6.20-rc1/drivers/oprofile/oprof.c
===================================================================
--- linux-2.6.20-rc1.orig/drivers/oprofile/oprof.c	2007-01-18 16:43:11.675529376 -0600
+++ linux-2.6.20-rc1/drivers/oprofile/oprof.c	2007-02-01 17:21:46.964869832 -0600
@@ -53,9 +53,23 @@
 	 * us missing task deaths and eventually oopsing
 	 * when trying to process the event buffer.
 	 */
+	if (oprofile_ops.sync_start) {
+		int sync_ret = oprofile_ops.sync_start();
+		switch (sync_ret) {
+			case 0: goto post_sync;
+				break;
+			case 1: goto do_generic;
+				break;
+			case -1: goto out3;
+				break;
+			default: goto out3;
+		}
+	}
+do_generic:
 	if ((err = sync_start()))
 		goto out3;
 
+post_sync:
 	is_setup = 1;
 	mutex_unlock(&start_mutex);
 	return 0;
@@ -118,7 +132,19 @@
 void oprofile_shutdown(void)
 {
 	mutex_lock(&start_mutex);
+        if (oprofile_ops.sync_stop) {
+                int sync_ret = oprofile_ops.sync_stop();
+                switch (sync_ret) {
+                        case 0: goto post_sync;
+                                break;
+                        case 1: goto do_generic;
+                                break;
+			default: goto post_sync;
+                }
+        }
+do_generic:
 	sync_stop();
+post_sync:
 	if (oprofile_ops.shutdown)
 		oprofile_ops.shutdown();
 	is_setup = 0;
Index: linux-2.6.20-rc1/include/asm-powerpc/oprofile_impl.h
===================================================================
--- linux-2.6.20-rc1.orig/include/asm-powerpc/oprofile_impl.h	2007-01-18 16:43:19.315566704 -0600
+++ linux-2.6.20-rc1/include/asm-powerpc/oprofile_impl.h	2007-02-01 17:21:46.966869528 -0600
@@ -47,6 +47,8 @@
         void (*global_start) (struct op_counter_config *);
 	void (*stop) (void);
 	void (*global_stop) (void);
+	int (*sync_start)(void);
+	int (*sync_stop)(void);
 	void (*handle_interrupt) (struct pt_regs *,
 				  struct op_counter_config *);
 	int num_counters;
Index: linux-2.6.20-rc1/include/asm-powerpc/spu.h
===================================================================
--- linux-2.6.20-rc1.orig/include/asm-powerpc/spu.h	2007-02-01 17:21:41.950833352 -0600
+++ linux-2.6.20-rc1/include/asm-powerpc/spu.h	2007-02-05 08:34:38.498856800 -0600
@@ -128,6 +128,7 @@
 	struct spu_runqueue *rq;
 	unsigned long long timestamp;
 	pid_t pid;
+	pid_t tgid;
 	int prio;
 	int class_0_pending;
 	spinlock_t register_lock;
@@ -153,6 +154,11 @@
 int spu_irq_class_0_bottom(struct spu *spu);
 int spu_irq_class_1_bottom(struct spu *spu);
 void spu_irq_setaffinity(struct spu *spu, int cpu);
+void * spu_get_profile_private(struct spu_context * ctx);
+void spu_set_profile_private(struct spu_context * ctx, void * profile_info,
+			     struct kref * prof_info_kref, 
+			     void (* prof_info_release) (struct kref * kref));
+
 
 /* system callbacks from the SPU */
 struct spu_syscall_block {
Index: linux-2.6.20-rc1/include/linux/oprofile.h
===================================================================
--- linux-2.6.20-rc1.orig/include/linux/oprofile.h	2007-01-18 16:43:18.379575976 -0600
+++ linux-2.6.20-rc1/include/linux/oprofile.h	2007-02-01 17:21:46.970868920 -0600
@@ -17,6 +17,28 @@
 #include <linux/spinlock.h>
 #include <asm/atomic.h>
  
+/* Each escaped entry is prefixed by ESCAPE_CODE
+ * then one of the following codes, then the
+ * relevant data.
+ * These #defines live in this file so that arch-specific
+ * buffer sync'ing code can access them.  
+ */
+#define ESCAPE_CODE                     ~0UL
+#define CTX_SWITCH_CODE                 1
+#define CPU_SWITCH_CODE                 2
+#define COOKIE_SWITCH_CODE              3
+#define KERNEL_ENTER_SWITCH_CODE        4
+#define KERNEL_EXIT_SWITCH_CODE         5
+#define MODULE_LOADED_CODE              6
+#define CTX_TGID_CODE                   7
+#define TRACE_BEGIN_CODE                8
+#define TRACE_END_CODE                  9
+#define XEN_ENTER_SWITCH_CODE          10
+#define SPU_PROFILING_CODE             11
+#define SPU_CTX_SWITCH_CODE            12
+#define SPU_OFFSET_CODE                13
+#define SPU_COOKIE_CODE                14
+
 struct super_block;
 struct dentry;
 struct file_operations;
@@ -35,6 +57,14 @@
 	int (*start)(void);
 	/* Stop delivering interrupts. */
 	void (*stop)(void);
+	/* Arch-specific buffer sync functions.
+	 * Return value = 0:  Success
+	 * Return value = -1: Failure
+	 * Return value = 1:  Run generic sync function
+	 */
+	int (*sync_start)(void);
+	int (*sync_stop)(void);
+
 	/* Initiate a stack backtrace. Optional. */
 	void (*backtrace)(struct pt_regs * const regs, unsigned int depth);
 	/* CPU identification string. */
@@ -56,6 +86,13 @@
 void oprofile_arch_exit(void);
 
 /**
+ * Add data to the event buffer.
+ * The data passed is free-form, but typically consists of
+ * file offsets, dcookies, context information, and ESCAPE codes.
+ */
+void add_event_entry(unsigned long data);
+ 
+/**
  * Add a sample. This may be called from any context. Pass
  * smp_processor_id() as cpu.
  */
Index: linux-2.6.20-rc1/kernel/hrtimer.c
===================================================================
--- linux-2.6.20-rc1.orig/kernel/hrtimer.c	2007-01-18 16:43:05.808489704 -0600
+++ linux-2.6.20-rc1/kernel/hrtimer.c	2007-02-01 17:21:46.973868464 -0600
@@ -335,6 +335,7 @@
 
 	return orun;
 }
+EXPORT_SYMBOL_GPL(hrtimer_forward);
 
 /*
  * enqueue_hrtimer - internal function to (re)start a timer
Index: linux-2.6.20-rc1/arch/powerpc/kernel/time.c
===================================================================
--- linux-2.6.20-rc1.orig/arch/powerpc/kernel/time.c	2007-02-02 15:47:08.624906680 -0600
+++ linux-2.6.20-rc1/arch/powerpc/kernel/time.c	2007-02-02 17:06:28.183894912 -0600
@@ -122,6 +122,7 @@
 static long timezone_offset;
 
 unsigned long ppc_proc_freq;
+EXPORT_SYMBOL(ppc_proc_freq);
 unsigned long ppc_tb_freq;
 
 static u64 tb_last_jiffy __cacheline_aligned_in_smp;
Index: linux-2.6.20-rc1/arch/powerpc/platforms/cell/spufs/spufs.h
===================================================================
--- linux-2.6.20-rc1.orig/arch/powerpc/platforms/cell/spufs/spufs.h	2007-02-01 17:21:41.945834112 -0600
+++ linux-2.6.20-rc1/arch/powerpc/platforms/cell/spufs/spufs.h	2007-02-05 08:06:01.793907392 -0600
@@ -75,6 +75,9 @@
 
 	struct list_head gang_list;
 	struct spu_gang *gang;
+	void * profile_private;		/* To be used only by profiler */
+	struct kref * prof_priv_kref;
+	void (* prof_priv_release) (struct kref *kref);
 };
 
 struct spu_gang {
Index: linux-2.6.20-rc1/arch/powerpc/platforms/cell/spufs/context.c
===================================================================
--- linux-2.6.20-rc1.orig/arch/powerpc/platforms/cell/spufs/context.c	2007-02-05 14:42:04.359859432 -0600
+++ linux-2.6.20-rc1/arch/powerpc/platforms/cell/spufs/context.c	2007-02-06 16:44:05.983965096 -0600
@@ -22,6 +22,7 @@
 
 #include <linux/fs.h>
 #include <linux/mm.h>
+#include <linux/module.h>
 #include <linux/slab.h>
 #include <asm/spu.h>
 #include <asm/spu_csa.h>
@@ -71,6 +72,8 @@
 	spu_fini_csa(&ctx->csa);
 	if (ctx->gang)
 		spu_gang_remove_ctx(ctx->gang, ctx);
+	if (ctx->prof_priv_kref)
+		kref_put(ctx->prof_priv_kref, ctx->prof_priv_release);
 	kfree(ctx);
 }
 
@@ -200,3 +203,29 @@
 
 	downgrade_write(&ctx->state_sema);
 }
+
+/* This interface allows a profiler (e.g., OProfile) to store
+ * spu_context information needed for profiling, allowing it to
+ * be saved across context save/restore operation.
+ *
+ * Assumes the caller has already incremented the ref count to
+ * profile_info; then spu_context_destroy must call kref_put
+ * on prof_info_kref.
+ */
+void spu_set_profile_private(struct spu_context * ctx, void * profile_info,
+			     struct kref * prof_info_kref,
+			     void (* prof_info_release) (struct kref * kref))
+{
+	ctx->profile_private = profile_info;
+	ctx->prof_priv_kref = prof_info_kref;
+	ctx->prof_priv_release = prof_info_release;
+}
+EXPORT_SYMBOL_GPL(spu_set_profile_private);
+
+void * spu_get_profile_private(struct spu_context * ctx)
+{
+	return ctx->profile_private;
+}
+EXPORT_SYMBOL_GPL(spu_get_profile_private);
+
+



^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [Cbe-oss-dev] [RFC, PATCH] CELL Oprofile SPU profiling updated patch
  2007-02-06 23:02 ` [Cbe-oss-dev] [RFC, PATCH] CELL " Carl Love
@ 2007-02-07 15:41   ` Maynard Johnson
  2007-02-07 22:48     ` Michael Ellerman
  2007-02-08 14:18   ` Milton Miller
  1 sibling, 1 reply; 20+ messages in thread
From: Maynard Johnson @ 2007-02-07 15:41 UTC (permalink / raw)
  To: Carl Love; +Cc: linux-kernel, linuxppc-dev, cbe-oss-dev

Carl Love wrote:

>
>Subject: Add support to OProfile for profiling Cell BE SPUs
>
>From: Maynard Johnson <maynardj@us.ibm.com>
>
>This patch updates the existing arch/powerpc/oprofile/op_model_cell.c
>to add in the SPU profiling capabilities.  In addition, a 'cell' subdirectory
>was added to arch/powerpc/oprofile to hold Cell-specific SPU profiling
>code.
>
>Signed-off-by: Carl Love <carll@us.ibm.com>
>Signed-off-by: Maynard Johnson <mpjohn@us.ibm.com>
>
>Index: linux-2.6.20-rc1/arch/powerpc/configs/cell_defconfig
>  
>
I've discovered more problems with the kref handling for the cached_info 
object that we store in the spu_context.  :-(

When the OProfile module initially determines that no cached_info yet 
exists for a given spu_context, it creates the cached_info, inits the 
cached_info's kref (which increments the refcount) and does a kref_get 
(for SPUFS' ref) before passing the cached_info reference off to SUPFS 
to store into the spu_context.  When OProfile shuts down or the SPU job 
ends, OProfile gives up its ref to the cached_info with kref_put.  Then 
when SPUFS destroys the spu_context, it also gives up its ref.  HOWEVER 
. . . . If OProfile shuts down while the SPU job is still active _and_ 
_then_ is restarted while the job is still active, OProfile will find 
that the cached_info exists for the given spu_context, so it won't go 
through the process of creating it and doing kref_init on the kref.  
Under this scenario, OProfile does not own a ref of its own to the 
cached_info, and should not be doing a kref_put when done using the 
cached_info -- but it does, and so does SPUFS when the spu_context is 
destroyed.  The end result (with the code as currently written) is that 
an extra kref_put is done when the refcount is already down to zero.  To 
fix this, OProfile needs to detect when it finds an existing cached_info 
already stored in the spu_context.  Then, instead of creating a new one, 
it sets a reminder flag to be used later when it's done using the cached 
info to indicate whether or not it needs to call kref_put.

Unfortunately, there's another problem (one that should have been 
obvious to me).  The cached_info's kref "release" function is 
destroy_cached_info(), defined in the OProfile module.  If the OProfile 
module is unloaded when SPUFS destroys the spu_context and calls 
kref_put on the cached_info's kref -- KABOOM!  The destroy_cached_info 
function (the second arg to kref_put) is not in memory, so we get a 
paging fault.  I see a couple options to solve this:
    1) Don't store the cached_info in the spu_context.  Basically, go 
back to the simplistic model of creating/deleting the cached_info on 
every SPU task activation/deactivation.
    2)  If there's some way to do this, force the OProfile module to 
stay loaded until SPUFS has destroyed its last spu_context that holds a 
cached_info object.

I thought about putting the cached_info's kref "release" function in 
SPUFS, but this just won't work. This implies that SPUFS needs to know 
about the structure of the cached_info, e.g., that it contains the 
vma_map member that needs to be freed.  But even with that information, 
it's not enough, since the vma_map member consists of list of vma_maps, 
which is why we have the vma_map_free() function.  So SPUFS would still 
need access to vma_map_free() from the OProfile module.

Opinions from others would be appreciated.

Thanks.
-Maynard

>+/* Container for caching information about an active SPU task.
>+ * 
>+ */
>+struct cached_info {
>+	struct vma_to_fileoffset_map * map;
>+	struct spu * the_spu;   /* needed to access pointer to local_store */
>+	struct kref cache_ref;
>+};
>+
>+static struct cached_info * spu_info[MAX_NUMNODES * 8];
>+
>+static void destroy_cached_info(struct kref * kref)
>+{
>+	struct cached_info * info;
>+	info = container_of(kref, struct cached_info, cache_ref);
>+	vma_map_free(info->map);
>+	kfree(info);
>+}
>+
>+/* Return the cached_info for the passed SPU number.
>+ * 
>+ */
>+static struct cached_info * get_cached_info(struct spu * the_spu, int spu_num)
>+{
>+	struct cached_info * ret_info = NULL;
>+	unsigned long flags = 0;
>+	if (spu_num >= num_spu_nodes) {
>+		printk(KERN_ERR "SPU_PROF: " 
>+		       "%s, line %d: Invalid index %d into spu info cache\n",
>+		       __FUNCTION__, __LINE__, spu_num); 
>+		goto out;
>+	}
>+	spin_lock_irqsave(&cache_lock, flags);
>+	if (!spu_info[spu_num] && the_spu)
>+		spu_info[spu_num] = (struct cached_info *)
>+			spu_get_profile_private(the_spu->ctx);
>+
>+	ret_info = spu_info[spu_num];
>+	spin_unlock_irqrestore(&cache_lock, flags);
>+ out:
>+	return ret_info;
>+}
>+
>+
>+/* Looks for cached info for the passed spu.  If not found, the
>+ * cached info is created for the passed spu.
>+ * Returns 0 for success; otherwise, -1 for error.  
>+ */ 
>+static int
>+prepare_cached_spu_info(struct spu * spu, unsigned int objectId)
>+{
>+	unsigned long flags = 0;
>+	struct vma_to_fileoffset_map * new_map;
>+	int retval = 0;
>+	struct cached_info * info = get_cached_info(spu, spu->number);
>+
>+	if (info) {
>+		pr_debug("Found cached SPU info.\n");
>+		goto out;
>+	}
>+
>+	/* Create cached_info and set spu_info[spu->number] to point to it.
>+	 * spu->number is a system-wide value, not a per-node value.
>+	 */
>+	info = kzalloc(sizeof(struct cached_info), GFP_KERNEL);
>+	if (!info) {
>+		printk(KERN_ERR "SPU_PROF: "
>+		       "%s, line %d: create vma_map failed\n",
>+		       __FUNCTION__, __LINE__);
>+		goto err_alloc;
>+	}
>+	new_map = create_vma_map(spu, objectId);
>+	if (!new_map) {
>+		printk(KERN_ERR "SPU_PROF: "
>+		       "%s, line %d: create vma_map failed\n",
>+		       __FUNCTION__, __LINE__);
>+		goto err_alloc;
>+	}
>+
>+	pr_debug("Created vma_map\n");
>+	info->map = new_map;
>+	info->the_spu = spu;
>+	kref_init(&info->cache_ref);
>+	spin_lock_irqsave(&cache_lock, flags);
>+	spu_info[spu->number] = info;
>+	spin_unlock_irqrestore(&cache_lock, flags);
>+	/* Increment count before passing off ref to SPUFS. */
>+	kref_get(&info->cache_ref);
>+	spu_set_profile_private(spu->ctx, info, &info->cache_ref,
>+				destroy_cached_info);
>+	goto out;
>+	
>+err_alloc:
>+	retval = -1;
>+out:
>+	return retval;
>+}
>+
>+/*
>+ * NOTE:  The caller is responsible for locking the
>+ *	  cache_lock prior to calling this function.
>+ */
>+static int release_cached_info(int spu_index)
>+{
>+	int index, end;
>+	if (spu_index == RELEASE_ALL) {
>+		end = num_spu_nodes;
>+		index = 0;
>+	} else {
>+	        if (spu_index >= num_spu_nodes) {
>+        	        printk(KERN_ERR "SPU_PROF: "
>+			       "%s, line %d: Invalid index %d into spu info cache\n",
>+               	               __FUNCTION__, __LINE__, spu_index);
>+	                goto out;
>+	        }
>+		end = spu_index +1;
>+		index = spu_index;
>+	}
>+	for (; index < end; index++) {
>+		if (spu_info[index]) {
>+			kref_put(&spu_info[index]->cache_ref, destroy_cached_info);
>+			spu_info[index] = NULL;
>+		}
>+	}
>+
>+out:
>+	return 0;
>+}
>+
>Index: linux-2.6.20-rc1/arch/powerpc/platforms/cell/spufs/context.c
>===================================================================
>--- linux-2.6.20-rc1.orig/arch/powerpc/platforms/cell/spufs/context.c	2007-02-05 14:42:04.359859432 -0600
>+++ linux-2.6.20-rc1/arch/powerpc/platforms/cell/spufs/context.c	2007-02-06 16:44:05.983965096 -0600
>@@ -22,6 +22,7 @@
>
> #include <linux/fs.h>
> #include <linux/mm.h>
>+#include <linux/module.h>
> #include <linux/slab.h>
> #include <asm/spu.h>
> #include <asm/spu_csa.h>
>@@ -71,6 +72,8 @@
> 	spu_fini_csa(&ctx->csa);
> 	if (ctx->gang)
> 		spu_gang_remove_ctx(ctx->gang, ctx);
>+	if (ctx->prof_priv_kref)
>+		kref_put(ctx->prof_priv_kref, ctx->prof_priv_release);
> 	kfree(ctx);
> }
>
>@@ -200,3 +203,29 @@
>
> 	downgrade_write(&ctx->state_sema);
> }
>+
>+/* This interface allows a profiler (e.g., OProfile) to store
>+ * spu_context information needed for profiling, allowing it to
>+ * be saved across context save/restore operation.
>+ *
>+ * Assumes the caller has already incremented the ref count to
>+ * profile_info; then spu_context_destroy must call kref_put
>+ * on prof_info_kref.
>+ */
>+void spu_set_profile_private(struct spu_context * ctx, void * profile_info,
>+			     struct kref * prof_info_kref,
>+			     void (* prof_info_release) (struct kref * kref))
>+{
>+	ctx->profile_private = profile_info;
>+	ctx->prof_priv_kref = prof_info_kref;
>+	ctx->prof_priv_release = prof_info_release;
>+}
>+EXPORT_SYMBOL_GPL(spu_set_profile_private);
>+
>+void * spu_get_profile_private(struct spu_context * ctx)
>+{
>+	return ctx->profile_private;
>+}
>+EXPORT_SYMBOL_GPL(spu_get_profile_private);
>+
>+
>
>
>  
>



^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [Cbe-oss-dev] [RFC, PATCH] CELL Oprofile SPU profiling updated patch
  2007-02-07 15:41   ` Maynard Johnson
@ 2007-02-07 22:48     ` Michael Ellerman
  2007-02-08 15:03       ` Maynard Johnson
  0 siblings, 1 reply; 20+ messages in thread
From: Michael Ellerman @ 2007-02-07 22:48 UTC (permalink / raw)
  To: Maynard Johnson; +Cc: Carl Love, linuxppc-dev, linux-kernel, cbe-oss-dev

[-- Attachment #1: Type: text/plain, Size: 3804 bytes --]

On Wed, 2007-02-07 at 09:41 -0600, Maynard Johnson wrote:
> Carl Love wrote:
> 
> >
> >Subject: Add support to OProfile for profiling Cell BE SPUs
> >
> >From: Maynard Johnson <maynardj@us.ibm.com>
> >
> >This patch updates the existing arch/powerpc/oprofile/op_model_cell.c
> >to add in the SPU profiling capabilities.  In addition, a 'cell' subdirectory
> >was added to arch/powerpc/oprofile to hold Cell-specific SPU profiling
> >code.
> >
> >Signed-off-by: Carl Love <carll@us.ibm.com>
> >Signed-off-by: Maynard Johnson <mpjohn@us.ibm.com>
> >
> >Index: linux-2.6.20-rc1/arch/powerpc/configs/cell_defconfig
> >  
> >
> I've discovered more problems with the kref handling for the cached_info 
> object that we store in the spu_context.  :-(
> 
> When the OProfile module initially determines that no cached_info yet 
> exists for a given spu_context, it creates the cached_info, inits the 
> cached_info's kref (which increments the refcount) and does a kref_get 
> (for SPUFS' ref) before passing the cached_info reference off to SUPFS 
> to store into the spu_context.  When OProfile shuts down or the SPU job 
> ends, OProfile gives up its ref to the cached_info with kref_put.  Then 
> when SPUFS destroys the spu_context, it also gives up its ref.  HOWEVER 
> . . . . If OProfile shuts down while the SPU job is still active _and_ 
> _then_ is restarted while the job is still active, OProfile will find 
> that the cached_info exists for the given spu_context, so it won't go 
> through the process of creating it and doing kref_init on the kref.  
> Under this scenario, OProfile does not own a ref of its own to the 
> cached_info, and should not be doing a kref_put when done using the 
> cached_info -- but it does, and so does SPUFS when the spu_context is 
> destroyed.  The end result (with the code as currently written) is that 
> an extra kref_put is done when the refcount is already down to zero.  To 
> fix this, OProfile needs to detect when it finds an existing cached_info 
> already stored in the spu_context.  Then, instead of creating a new one, 
> it sets a reminder flag to be used later when it's done using the cached 
> info to indicate whether or not it needs to call kref_put.

I think all you want to do is when oprofile finds the cached_info
already existing, it does a kref_get(). After all it doesn't have a
reference to it, so before it starts using it it must inc the ref count.

> Unfortunately, there's another problem (one that should have been 
> obvious to me).  The cached_info's kref "release" function is 
> destroy_cached_info(), defined in the OProfile module.  If the OProfile 
> module is unloaded when SPUFS destroys the spu_context and calls 
> kref_put on the cached_info's kref -- KABOOM!  The destroy_cached_info 
> function (the second arg to kref_put) is not in memory, so we get a 
> paging fault.  I see a couple options to solve this:
>     1) Don't store the cached_info in the spu_context.  Basically, go 
> back to the simplistic model of creating/deleting the cached_info on 
> every SPU task activation/deactivation.
>     2)  If there's some way to do this, force the OProfile module to 
> stay loaded until SPUFS has destroyed its last spu_context that holds a 
> cached_info object.

There is a mechanism for that, you just have each cached_info inc the
module's refcount.

Another option would be to have a mapping, in the oprofile code, from
spu_contexts to cached_infos, ie. a hash table or something.

cheers

-- 
Michael Ellerman
OzLabs, IBM Australia Development Lab

wwweb: http://michael.ellerman.id.au
phone: +61 2 6212 1183 (tie line 70 21183)

We do not inherit the earth from our ancestors,
we borrow it from our children. - S.M.A.R.T Person

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [Cbe-oss-dev] [RFC, PATCH] CELL Oprofile SPU profiling updated patch
  2007-02-06 23:02 ` [Cbe-oss-dev] [RFC, PATCH] CELL " Carl Love
  2007-02-07 15:41   ` Maynard Johnson
@ 2007-02-08 14:18   ` Milton Miller
  2007-02-08 17:21     ` Arnd Bergmann
  2007-02-08 23:59     ` Maynard Johnson
  1 sibling, 2 replies; 20+ messages in thread
From: Milton Miller @ 2007-02-08 14:18 UTC (permalink / raw)
  To: Carl Love; +Cc: LKML, linuxppc-dev, cbe-oss-dev, oprofile-list


On Feb 6, 2007, at 5:02 PM, Carl Love wrote:

> This is the first update to the patch previously posted by Maynard
> Johnson as "PATCH 4/4. Add support to OProfile for profiling CELL".
>
> This repost fixes the line wrap issue that Ben mentioned.  Also the 
> kref
> handling for the cached info has been fixed and simplified.
>
> There are still a few items from the comments being discussed
> specifically how to profile the dynamic code for the SPFS context 
> switch
> code and how to deal with dynamic code stubs for library support.  Our
> proposal is to assign the samples from the SPFS and dynamic library 
> code
> to an anonymous sample bucket.  The support for properly handling the
> symbol extraction in these cases would be deferred to a later SDK.
>
> There is also a bug in profiling overlay code that we are 
> investigating.

I'd like to talk about about both some of the concepts, including what
is logged and what the kernel API is, and provide some directed comments
on the code in the patch as it stands.

[Ok, the background and concepts portion is big enough, I'll send
this to the several lists for comment, and the code comments to
just cbe-oss-dev]

First, I'll give some background and my understanding of both the
hardware, and some of the linux interface.  I'm not consulting any
manuals, so I could be mistaken.   I'm basing this on a combination
of past reading of the kernel, this patch, a discussion with Arnd,
and my knowledge of the PowerPC processor and other IBM processors.
Hopefully this will be of use to other reviewers.

Background:

The cell broadband architecture consists of PowerPC processors
and synergistic processing units, or SPUs.  The current chip has
a multi-threaded PowerPC core and 8 SPUs.   Each SPU has an
execution core (running its own instruction set), a 256kB
private memory (local store), and DMA engine that can access
the coherent memory domain of the PowerPC processor(s).  Multiple
chips can be connected together in a single coherent SMP system.
The addresses provided to the DMA engine are effective, translated
through virtual to real address spaces like the PowerPC MMU.
The SPUs are intended to run application threads, and require
the PowerPC to perform exception handling and other operating
system tasks.

Because of the limited address space of the SPU (18
bits), the compilers (gcc) have support for code
overlays.  The ELF format defines these overlays
including file offsets, sizes, load address, and
a unique word to identify the overlay.  The toolchain
has support to check that an overlay is present and
setup the DMA engine to load it if it is not present.

In Linux, the SPUs are controlled through spufs.  Once
a SPU context is created, the local store and registers can be
modified through the file system.  A linux thread makes
a syscall requesting the context to run.  This call is
blocking to that thread; it waits for an event from the
SPU (either an exception or a reschedule). In this regard
the syscall can be thought of as a cross instruction set
function call.  The contents of the local store are
initialized and modified via read/write or mmap and memcopy
of a spufs file. Specifically, pages are not mapped into the
local store, they are copied into it.  The SPU context is
tied to the creating mm; this provides a clear context for
the DMA engine (controlled by the program in the SPU).

To assist with the use of the SPUs, and to provide some
portability to other environments, a library, libpse, is
available and widely used.   Among other items, it provides
an elf loader for SPU elf objects.  Applications may mmap
external objects or embed the elf objects into their
executable (as data).  The contained objects copied to the
SPU local store as dictated by the elf header.   To assist
other tools, spufs provides an object-id file.   Before this
patch, it has been treated as an opaque number. Although not
present in the first release of libpse, current versions write
the virtual address of the elf header to this file
when the loader is called.

This patch is trying to enable oprofile to collect and
analyze SPU execution, using the trace collection
hardware of the processor.   The trace collection hardware
consists of a LFSR (linear-feedback shift register) counter
and an array 256 bits wide and 256 entries deep.  When the
counter matches the programed value, it causes an entry
to be written to the trace array and the counter to be
reloaded.  On chip logic tracks how many entries have
been written and not yet read.  When programed to trace
the SPUs, the trace entry consists of the upper 16 bits of
the 18 bit program counter for each SPU; the 8 SPUs split
over the two words.  By their nature, LFSRs require a minimum
of hardware (typically 3 exclusive-or gates and a shift
register), but the count sequence is pseudo-random.

The oprofile system is designed to collect a stream of trace
data and context information to and provide it to user space
for post processing and analysis.   While at the lowest level
the data is a stream of words, the kernel and user space tools
agree on a protocol that provides meaning to the words by
prefixing stream elements with escape sequences to pass
infrequent context to interpret the trace data.

For most streams today, the kernel translates a hardware
collected address through the mm struct and vma list to
the backing file and offset in that file.  A given file
is hashed to a word by providing a "dcookie" that maps
through the dcache to a given file and path.  The user
space tool that collects the data from the kernel trace
buffer queries the kernel and translates the stream back to
files in the file system and offsets into those files.   The
user space analysis tools can then take that information
and display the file name, instructions, and symbolic
debug information that may be contained in the typical
elf executable or other binary format.


1) sample rate setup

    In the current patch, the user specifies a sample rate as a time 
interval.
    The kernel is (a) calling cpufreq to get the current cpu frequency, 
(b)
    converting the rate to a cycle count, (c) converting this to a 24 bit
    LFSR count, an iterative algorithm (in this patch, starting from
    one of 256 values so a max of 2^16 or 64k iterations), (d) 
calculating
    an trace unload interval.   In addition, a cpufreq notifier is 
registered
    to recalculate on frequency changes.

    The obvious problem is step (c), running a loop potentially 64 
thousand
    times in kernel space will have a noticeable impact on other threads.

    I propose instead that user space perform the above 4 steps, and 
provide
    the kernel with two inputs: (1) the value to load in the LFSR and (2)
    the periodic frequency / time interval at which to empty the hardware
    trace buffer, perform sample analysis, and send the data to the 
oprofile
    subsystem.

    There should be no security issues with this approach.   If the LFSR 
value
    is calculated incorrectly, either it will be too short, causing the 
trace
    array to overfill and data to be dropped, or it will be too long, and
    there will be fewer samples.   Likewise, the kernel periodic poll 
can be
    too long, again causing overflow, or too frequent, causing only timer
    execution overhead.

    Various data is collected by the kernel while processing the 
periodic timer,
    this approach would also allow the profiling tools to control the
    frequency of this collection.   More frequent collection results in 
more
    accurate sample data, with the linear cost of poll execution 
overhead.

    Frequency changes can be handled either by the profile code setting
    collection at a higher than necessary rate, or by interacting with 
the
    governor to limit the speeds.

    Optionally, the kernel can add a record indicating that some data was
    likely dropped if it is able to read all 256 entries without 
underflowing
    the array.  This can be used as hint to user space that the kernel 
time
    was too long for the collection rate.


Data collected

The trace hardware provides the execution address of the SPUs in their
local store at some time in the past.   The samples are read most
efficiently in batch.

To be of use to oprofile, the raw address must be translated to the
execution context, and ideally to the persistent file system object
where the code is stored.   In addition, for the case where the elf
image is embedded in another file as data, the start of the elf
image is needed.

By draining the trace array on context switch, the kernel can map the
SPU number to the SPU context and linux thread id (and hence the mm
context of the process).  However, the SPU address recorded has an
arbitrary mapping to source address in that thread's context.   Because
the SPU is executing a copy of object mapped into its private, always
writable store, the normal oprofile mm lookup is ineffective.  In 
addition,
because of the active use of overlays, the mapping of SPU address to
source vm address changes over time.   The oprofile driver is 
necessarily
reading the sample later in time.

The current patch starts tackling these translation issues for the
presently common case of a static self contained binary from a single
file, either single separate source file or embedded in the data of
the host application.   When creating the trace entry for a SPU
context switch, it records the application owner, pid, tid, and
dcookie of the main executable.   It addition, it looks up the
object-id as a virtual address and records the offset if it is non-zero,
or the dcookie of the object if it is zero.   The code then creates
a data structure by reading the elf headers from the user process
(at the address given by the object-id) and building a list of
SPU address to elf object offsets, as specified by the ELF loader
headers.   In addition to the elf loader section, it processes the
overlay headers and records the address, size, and magic number of
the overlay.

When the hardware trace entries are processed, each address is
looked up this structure and translated to the elf offset.  If
it is an overlay region, the overlay identify word is read and
the list is searched for the matching overlay.  The resulting
offset is sent to the oprofile system.

The current patch specifically identifies that only single
elf objects are handled.  There is no code to handle dynamic
linked libraries or overlays.   Nor is there any method to
present samples that may have been collected during context
switch processing, they must be discarded.


My proposal is to change what is presented to user space.  Instead
of trying to translate the SPU address to the backing file
as the samples are recorded, store the samples as the SPU
context and address.  The context switch would record tid,
pid, object id as it does now.   In addition, if this is a
new object-id, the kernel would read elf headers as it does
today.  However, it would then proceed to provide accurate
dcookie information for each loader region and overlay.  To
identify which overlays are active, (instead of the present
read on use and search the list to translate approach) the
kernel would record the location of the overlay identifiers
as it parsed the kernel, but would then read the identification
word and would record the present value as an sample from
a separate but related stream.   The kernel could maintain
the last value for each overlay and only send profile events
for the deltas.

This approach trades translation lookup overhead for each
recorded sample for a burst of data on new context activation.
In addition it exposes the sample point of the overlay identifier
vs the address collection.  This allows the ambiguity to be
exposed to user space.   In addition, with the above proposed
kernel timer vs sample collection, user space could limit the
elapsed time between the address collection and the overlay
id check.

This approach allows multiple objects by its nature.  A new
elf header could be constructed in memory that contained
the union of the elf objects load segments, and the tools
will magically work.   Alternatively the object id could
point to a new structure, identified via a new header, that
it points to other elf headers (easily differentiated by the
elf magic headers).   Other binary formats, including several
objects in a ar archive, could be supported.

If better overlay identification is required, in theory the
overlay switch code could be augmented to record the switches
(DMA reference time from the PowerPC memory and record a
relative decrementer in the SPU), this is obviously a future
item.  But it is facilitated by having user space resolve the
SPU to source file translation.

milton
--
miltonm@bga.com   Milton Miller
Speaking for myself only.


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [Cbe-oss-dev] [RFC, PATCH] CELL Oprofile SPU profiling updated patch
  2007-02-07 22:48     ` Michael Ellerman
@ 2007-02-08 15:03       ` Maynard Johnson
  0 siblings, 0 replies; 20+ messages in thread
From: Maynard Johnson @ 2007-02-08 15:03 UTC (permalink / raw)
  To: michael; +Cc: Carl Love, linuxppc-dev, linux-kernel, cbe-oss-dev

Michael,
Thanks very much for the advice.  Both issues have been solved now, with 
your help.

-Maynard

Michael Ellerman wrote:

>On Wed, 2007-02-07 at 09:41 -0600, Maynard Johnson wrote:
>  
>
>>Carl Love wrote:
>>
>>    
>>
>>>Subject: Add support to OProfile for profiling Cell BE SPUs
>>>
>>>From: Maynard Johnson <maynardj@us.ibm.com>
>>>
>>>This patch updates the existing arch/powerpc/oprofile/op_model_cell.c
>>>to add in the SPU profiling capabilities.  In addition, a 'cell' subdirectory
>>>was added to arch/powerpc/oprofile to hold Cell-specific SPU profiling
>>>code.
>>>
>>>Signed-off-by: Carl Love <carll@us.ibm.com>
>>>Signed-off-by: Maynard Johnson <mpjohn@us.ibm.com>
>>>
>>>Index: linux-2.6.20-rc1/arch/powerpc/configs/cell_defconfig
>>> 
>>>
>>>      
>>>
>>I've discovered more problems with the kref handling for the cached_info 
>>object that we store in the spu_context.  :-(
>>
>>When the OProfile module initially determines that no cached_info yet 
>>exists for a given spu_context, it creates the cached_info, inits the 
>>cached_info's kref (which increments the refcount) and does a kref_get 
>>(for SPUFS' ref) before passing the cached_info reference off to SUPFS 
>>to store into the spu_context.  When OProfile shuts down or the SPU job 
>>ends, OProfile gives up its ref to the cached_info with kref_put.  Then 
>>when SPUFS destroys the spu_context, it also gives up its ref.  HOWEVER 
>>. . . . If OProfile shuts down while the SPU job is still active _and_ 
>>_then_ is restarted while the job is still active, OProfile will find 
>>that the cached_info exists for the given spu_context, so it won't go 
>>through the process of creating it and doing kref_init on the kref.  
>>Under this scenario, OProfile does not own a ref of its own to the 
>>cached_info, and should not be doing a kref_put when done using the 
>>cached_info -- but it does, and so does SPUFS when the spu_context is 
>>destroyed.  The end result (with the code as currently written) is that 
>>an extra kref_put is done when the refcount is already down to zero.  To 
>>fix this, OProfile needs to detect when it finds an existing cached_info 
>>already stored in the spu_context.  Then, instead of creating a new one, 
>>it sets a reminder flag to be used later when it's done using the cached 
>>info to indicate whether or not it needs to call kref_put.
>>    
>>
>
>I think all you want to do is when oprofile finds the cached_info
>already existing, it does a kref_get(). After all it doesn't have a
>reference to it, so before it starts using it it must inc the ref count.
>
>  
>
>>Unfortunately, there's another problem (one that should have been 
>>obvious to me).  The cached_info's kref "release" function is 
>>destroy_cached_info(), defined in the OProfile module.  If the OProfile 
>>module is unloaded when SPUFS destroys the spu_context and calls 
>>kref_put on the cached_info's kref -- KABOOM!  The destroy_cached_info 
>>function (the second arg to kref_put) is not in memory, so we get a 
>>paging fault.  I see a couple options to solve this:
>>    1) Don't store the cached_info in the spu_context.  Basically, go 
>>back to the simplistic model of creating/deleting the cached_info on 
>>every SPU task activation/deactivation.
>>    2)  If there's some way to do this, force the OProfile module to 
>>stay loaded until SPUFS has destroyed its last spu_context that holds a 
>>cached_info object.
>>    
>>
>
>There is a mechanism for that, you just have each cached_info inc the
>module's refcount.
>
>Another option would be to have a mapping, in the oprofile code, from
>spu_contexts to cached_infos, ie. a hash table or something.
>
>cheers
>
>  
>



^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [Cbe-oss-dev] [RFC, PATCH] CELL Oprofile SPU profiling updated patch
  2007-02-08 14:18   ` Milton Miller
@ 2007-02-08 17:21     ` Arnd Bergmann
  2007-02-08 18:01       ` Adrian Reber
                         ` (2 more replies)
  2007-02-08 23:59     ` Maynard Johnson
  1 sibling, 3 replies; 20+ messages in thread
From: Arnd Bergmann @ 2007-02-08 17:21 UTC (permalink / raw)
  To: cbe-oss-dev; +Cc: Milton Miller, Carl Love, linuxppc-dev, LKML, oprofile-list

On Thursday 08 February 2007 15:18, Milton Miller wrote:

> 1) sample rate setup
> 
>     In the current patch, the user specifies a sample rate as a time 
> interval.
>     The kernel is (a) calling cpufreq to get the current cpu frequency, 
> (b)
>     converting the rate to a cycle count, (c) converting this to a 24 bit
>     LFSR count, an iterative algorithm (in this patch, starting from
>     one of 256 values so a max of 2^16 or 64k iterations), (d) 
> calculating
>     an trace unload interval.   In addition, a cpufreq notifier is 
> registered
>     to recalculate on frequency changes.
> 
>     The obvious problem is step (c), running a loop potentially 64 
> thousand
>     times in kernel space will have a noticeable impact on other threads.
> 
>     I propose instead that user space perform the above 4 steps, and 
> provide
>     the kernel with two inputs: (1) the value to load in the LFSR and (2)
>     the periodic frequency / time interval at which to empty the hardware
>     trace buffer, perform sample analysis, and send the data to the 
> oprofile
>     subsystem.
> 
>     There should be no security issues with this approach.   If the LFSR 
> value
>     is calculated incorrectly, either it will be too short, causing the 
> trace
>     array to overfill and data to be dropped, or it will be too long, and
>     there will be fewer samples.   Likewise, the kernel periodic poll 
> can be
>     too long, again causing overflow, or too frequent, causing only timer
>     execution overhead.
> 
>     Various data is collected by the kernel while processing the 
> periodic timer,
>     this approach would also allow the profiling tools to control the
>     frequency of this collection.   More frequent collection results in 
> more
>     accurate sample data, with the linear cost of poll execution 
> overhead.
> 
>     Frequency changes can be handled either by the profile code setting
>     collection at a higher than necessary rate, or by interacting with 
> the
>     governor to limit the speeds.
> 
>     Optionally, the kernel can add a record indicating that some data was
>     likely dropped if it is able to read all 256 entries without 
> underflowing
>     the array.  This can be used as hint to user space that the kernel 
> time
>     was too long for the collection rate.

Moving the sample rate computation to user space sounds like the right
idea, but why not have a more drastic version of it:

Right now, all products that support this feature run at the same clock
rate (3.2 Ghz), with cpufreq, we can reduce this to 1.6 Ghz. If I understand
this correctly, the value depends only on the frequency, so we could simply
hardcode this in the kernel, and print out a warning message if we ever
encounter a different frequency, right?

> The current patch specifically identifies that only single
> elf objects are handled.  There is no code to handle dynamic
> linked libraries or overlays.   Nor is there any method to
> present samples that may have been collected during context
> switch processing, they must be discarded.

I thought it already did handle overlays, what did I miss here?

> My proposal is to change what is presented to user space.  Instead
> of trying to translate the SPU address to the backing file
> as the samples are recorded, store the samples as the SPU
> context and address.  The context switch would record tid,
> pid, object id as it does now.   In addition, if this is a
> new object-id, the kernel would read elf headers as it does
> today.  However, it would then proceed to provide accurate
> dcookie information for each loader region and overlay.

Doing the translation in two stages in user space, as you
suggest here, definitely makes sense to me. I think it
can be done a little simpler though:

Why would you need the accurate dcookie information to be
provided by the kernel? The ELF loader is done in user
space, and the kernel only reproduces what it thinks that
came up with. If the kernel only gives the dcookie information
about the SPU ELF binary to the oprofile user space, then
that can easily recreate the same mapping.

The kernel still needs to provide the overlay identifiers
though.

> To identify which overlays are active, (instead of the present
> read on use and search the list to translate approach) the
> kernel would record the location of the overlay identifiers
> as it parsed the kernel, but would then read the identification
> word and would record the present value as an sample from
> a separate but related stream.   The kernel could maintain
> the last value for each overlay and only send profile events
> for the deltas.

right.

> This approach trades translation lookup overhead for each
> recorded sample for a burst of data on new context activation.
> In addition it exposes the sample point of the overlay identifier
> vs the address collection.  This allows the ambiguity to be
> exposed to user space.   In addition, with the above proposed
> kernel timer vs sample collection, user space could limit the
> elapsed time between the address collection and the overlay
> id check.

yes, this sounds nice. But tt does not at all help accuracy,
only performance, right?
 
> This approach allows multiple objects by its nature.  A new
> elf header could be constructed in memory that contained
> the union of the elf objects load segments, and the tools
> will magically work.   Alternatively the object id could
> point to a new structure, identified via a new header, that
> it points to other elf headers (easily differentiated by the
> elf magic headers).   Other binary formats, including several
> objects in a ar archive, could be supported.

Yes, that would be a new feature if the kernel passed dcookie
information for every section, but I doubt that it is worth
it. I have not seen any program that allows loading code
from more than one ELF file. In particular, the ELF format
on the SPU is currently lacking the relocation mechanisms
that you would need for resolving spu-side symbols at load
time.
 
> If better overlay identification is required, in theory the
> overlay switch code could be augmented to record the switches
> (DMA reference time from the PowerPC memory and record a
> relative decrementer in the SPU), this is obviously a future
> item.  But it is facilitated by having user space resolve the
> SPU to source file translation.

This seems to incur a run-time overhead on the SPU even if not
profiling, I would consider that not acceptable.

	Arnd <><

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [Cbe-oss-dev] [RFC, PATCH] CELL Oprofile SPU profiling updated patch
  2007-02-08 17:21     ` Arnd Bergmann
@ 2007-02-08 18:01       ` Adrian Reber
  2007-02-08 22:51       ` Carl Love
  2007-02-09 18:47       ` Milton Miller
  2 siblings, 0 replies; 20+ messages in thread
From: Adrian Reber @ 2007-02-08 18:01 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: cbe-oss-dev, linuxppc-dev, oprofile-list, LKML, Milton Miller, Carl Love

On Thu, Feb 08, 2007 at 06:21:56PM +0100, Arnd Bergmann wrote:
[...]
> Moving the sample rate computation to user space sounds like the right
> idea, but why not have a more drastic version of it:
> 
> Right now, all products that support this feature run at the same clock
> rate (3.2 Ghz), with cpufreq, we can reduce this to 1.6 Ghz. If I understand
> this correctly, the value depends only on the frequency, so we could simply
> hardcode this in the kernel, and print out a warning message if we ever
> encounter a different frequency, right?

Just for the record... CAB is running with 2.8 GHz. At least all the boards
I have seen.

		Adrian

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [Cbe-oss-dev] [RFC, PATCH] CELL Oprofile SPU profiling updated patch
  2007-02-08 17:21     ` Arnd Bergmann
  2007-02-08 18:01       ` Adrian Reber
@ 2007-02-08 22:51       ` Carl Love
  2007-02-09  2:46         ` Milton Miller
  2007-02-09 18:47       ` Milton Miller
  2 siblings, 1 reply; 20+ messages in thread
From: Carl Love @ 2007-02-08 22:51 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: cbe-oss-dev, Milton Miller, linuxppc-dev, LKML, oprofile-list

On Thu, 2007-02-08 at 18:21 +0100, Arnd Bergmann wrote:
> On Thursday 08 February 2007 15:18, Milton Miller wrote:
>  
> > 1) sample rate setup
> > 
> >     In the current patch, the user specifies a sample rate as a time 
> > interval.
> >     The kernel is (a) calling cpufreq to get the current cpu frequency, 
> > (b)
> >     converting the rate to a cycle count, (c) converting this to a 24 bit
> >     LFSR count, an iterative algorithm (in this patch, starting from
> >     one of 256 values so a max of 2^16 or 64k iterations), (d) 
> > calculating
> >     an trace unload interval.   In addition, a cpufreq notifier is 
> > registered
> >     to recalculate on frequency changes.

No.  The user issues the command opcontrol --event:N  where N is the
number of events (cycles, l2 cache misses, instructions retired etc)
that are to elapse between collecting the samples.  The OProfile passes
the value N to the kernel via the variable ctr[i].count.  Where i is the
performance counter entry for that event.  Specifically with SPU
profiling, we do not use performance counters because the CELL HW does
not allow the normal the PPU to read the SPU PC when a performance
counter interrupt occurs.  We are using some additional hw support in
the chip that allows us to periodically capture the SPU PC.  There is an
LFSR hardware counter that can be started at an arbitrary LFSR value.
When the last LFSR value in the sequence is reached, a sample is taken
and stored in the trace buffer.  Hence, the value of N specified by the
user must get converted to the LFSR value that is N from the end of the
sequence.  The same clock that the processor is running at is used to
control the LFSR count.  Hence the LFSR counter increments once per CPU
clock cycle regardless of the CPU frequency or changes in the frequency.
There is no calculation for the LFSR value that is a function of the
processor frequency.  There is no need to adjust the LFSR when the
processor frequency changes.  

Milton had a comment about the code 

 if (ctr[0].event == SPU_CYCLES_EVENT_NUM) {
> +             spu_cycle_reset = ctr[0].count;
> +             return;
> +     }

Well, given the above description, it is clear that if you are doing SPU
event profiling, the value N is put into the cntr[0].cnt entry since
there is only one event.  Thus in cell_global_start_spu() you use
spu_cycle_reset as the argument to the lfsr calculation routine to get
the LFSR value that is N from the end of the sequence.

> > 
> >     The obvious problem is step (c), running a loop potentially 64 
> > thousand
> >     times in kernel space will have a noticeable impact on other threads.
> > 
> >     I propose instead that user space perform the above 4 steps, and 
> > provide
> >     the kernel with two inputs: (1) the value to load in the LFSR and (2)
> >     the periodic frequency / time interval at which to empty the hardware
> >     trace buffer, perform sample analysis, and send the data to the 
> > oprofile
> >     subsystem.
> > 
> >     There should be no security issues with this approach.   If the LFSR 
> > value
> >     is calculated incorrectly, either it will be too short, causing the 
> > trace
> >     array to overfill and data to be dropped, or it will be too long, and
> >     there will be fewer samples.   Likewise, the kernel periodic poll 
> > can be
> >     too long, again causing overflow, or too frequent, causing only timer
> >     execution overhead.
> > 
> >     Various data is collected by the kernel while processing the 
> > periodic timer,
> >     this approach would also allow the profiling tools to control the
> >     frequency of this collection.   More frequent collection results in 
> > more
> >     accurate sample data, with the linear cost of poll execution 
> > overhead.
> > 
> >     Frequency changes can be handled either by the profile code setting
> >     collection at a higher than necessary rate, or by interacting with 
> > the
> >     governor to limit the speeds.
> > 
> >     Optionally, the kernel can add a record indicating that some data was
> >     likely dropped if it is able to read all 256 entries without 
> > underflowing
> >     the array.  This can be used as hint to user space that the kernel 
> > time
> >     was too long for the collection rate.
>  
> Moving the sample rate computation to user space sounds like the right
> idea, but why not have a more drastic version of it:

No, I do not agree.  The user/kernel API pass N where N is the number of
events between samples.  We are not at liberty to just change the API.
We we did do this, we fully expect that John Levon will push back saying
why make an architecture specific API change when it isn't necessary.  

Please define "drastic" in this context.  Do you mean make the table
bigger i.e. more then 256 precomputed elements?  I did 256 since Arnd
seemed to think that would be a reasonable size. Based on his example
How much kernel space are we willing to use to same some computation?
Keep in mind only one of the entries in the table will ever be used.

I think if we found the LFSR that was with in 2^10 of the desired value
that would be good enough. It would be within 1% of the minimum N the
user can specify.  That would require a table with 2^14 entries.  That
seems unreasonably large.

Anyway, the user controls how often sampling is done by setting N. 

>  
                Carl Love
> [cut]


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [Cbe-oss-dev] [RFC, PATCH] CELL Oprofile SPU profiling updated patch
  2007-02-08 14:18   ` Milton Miller
  2007-02-08 17:21     ` Arnd Bergmann
@ 2007-02-08 23:59     ` Maynard Johnson
  2007-02-09 18:03       ` Milton Miller
  1 sibling, 1 reply; 20+ messages in thread
From: Maynard Johnson @ 2007-02-08 23:59 UTC (permalink / raw)
  To: Milton Miller; +Cc: Carl Love, linuxppc-dev, LKML, oprofile-list, cbe-oss-dev

Milton,
Thank you for your comments.  Carl will reply to certain parts of your 
posting where he's more knowledgeable than I.  See my replies below.

-Maynard

Milton Miller wrote:

>On Feb 6, 2007, at 5:02 PM, Carl Love wrote:
>
>  
>
>>This is the first update to the patch previously posted by Maynard
>>Johnson as "PATCH 4/4. Add support to OProfile for profiling CELL".
>>
>>
>>    
>>
>  
>
[snip]

>
>Data collected
>
>
>The current patch starts tackling these translation issues for the
>presently common case of a static self contained binary from a single
>file, either single separate source file or embedded in the data of
>the host application.   When creating the trace entry for a SPU
>context switch, it records the application owner, pid, tid, and
>dcookie of the main executable.   It addition, it looks up the
>object-id as a virtual address and records the offset if it is non-zero,
>or the dcookie of the object if it is zero.   The code then creates
>a data structure by reading the elf headers from the user process
>(at the address given by the object-id) and building a list of
>SPU address to elf object offsets, as specified by the ELF loader
>headers.   In addition to the elf loader section, it processes the
>overlay headers and records the address, size, and magic number of
>the overlay.
>
>When the hardware trace entries are processed, each address is
>looked up this structure and translated to the elf offset.  If
>it is an overlay region, the overlay identify word is read and
>the list is searched for the matching overlay.  The resulting
>offset is sent to the oprofile system.
>
>The current patch specifically identifies that only single
>elf objects are handled.  There is no code to handle dynamic
>linked libraries or overlays.   Nor is there any method to
>  
>
Yes, we do handle overlays.  (Note: I'm looking into a bug right now in 
our overlay support.)

>present samples that may have been collected during context
>switch processing, they must be discarded.
>
>
>My proposal is to change what is presented to user space.  Instead
>of trying to translate the SPU address to the backing file
>as the samples are recorded, store the samples as the SPU
>context and address.  The context switch would record tid,
>pid, object id as it does now.   In addition, if this is a
>new object-id, the kernel would read elf headers as it does
>today.  However, it would then proceed to provide accurate
>dcookie information for each loader region and overlay.  To
>identify which overlays are active, (instead of the present
>read on use and search the list to translate approach) the
>kernel would record the location of the overlay identifiers
>as it parsed the kernel, but would then read the identification
>word and would record the present value as an sample from
>a separate but related stream.   The kernel could maintain
>the last value for each overlay and only send profile events
>for the deltas.
>  
>
Discussions on this topic in the past have resulted in the current 
implementation precisely because we're able to record the samples as 
fileoffsets, just as the userspace tools expect.   I haven't had time to 
check out how much this would impact the userspace tools, but my gut 
feel is that it would be quite significant.  If we were developing this 
module with a matching newly-created userspace tool, I would be more 
inclined to agree that this makes sense.  But you give no rationale for 
your proposal that justifies the change.  The current implementation 
works, it has no impact on normal, non-profiling behavior,  and the 
overhead during profiling is not noticeable.

>This approach trades translation lookup overhead for each
>recorded sample for a burst of data on new context activation.
>In addition it exposes the sample point of the overlay identifier
>vs the address collection.  This allows the ambiguity to be
>exposed to user space.   In addition, with the above proposed
>kernel timer vs sample collection, user space could limit the
>elapsed time between the address collection and the overlay
>id check.
>  
>
Yes, there is a window here where an overlay could occur before we 
finish processing a group of samples that were actually taken from a 
different overlay.  The obvious way to prevent that is for the kernel 
(or SPUFS) to be notified of the overlay and let OProfile know that we 
need to drain (perhaps discard would be best) our sample trace buffer.  
As you indicate above, your proposal faces the same issue, but would 
just decrease the number of bogus samples.  I contend that the relative 
number of bogus samples will be quite low in either case.  Ideally, we 
should have a mechanism to eliminate them completely so as to avoid 
confusion the user's part when they're looking at a report.  Even a few 
bogus samples in the wrong place can be troubling.  Such a mechanism 
will be a good future enhancement.

[snip]

>milton
>--
>miltonm@bga.com   Milton Miller
>Speaking for myself only.
>
>_______________________________________________
>Linuxppc-dev mailing list
>Linuxppc-dev@ozlabs.org
>https://ozlabs.org/mailman/listinfo/linuxppc-dev
>  
>



^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [Cbe-oss-dev] [RFC, PATCH] CELL Oprofile SPU profiling updated patch
  2007-02-08 22:51       ` Carl Love
@ 2007-02-09  2:46         ` Milton Miller
  2007-02-09 16:17           ` Carl Love
  0 siblings, 1 reply; 20+ messages in thread
From: Milton Miller @ 2007-02-09  2:46 UTC (permalink / raw)
  To: Carl Love; +Cc: cbe-oss-dev, Arnd Bergmann, LKML, linuxppc-dev, oprofile-list


On Feb 8, 2007, at 4:51 PM, Carl Love wrote:

> On Thu, 2007-02-08 at 18:21 +0100, Arnd Bergmann wrote:
>> On Thursday 08 February 2007 15:18, Milton Miller wrote:
>>
>>> 1) sample rate setup
>>>
>>>     In the current patch, the user specifies a sample rate as a time
>>> interval.
>>>     The kernel is (a) calling cpufreq to get the current cpu 
>>> frequency,
>>> (b)
>>>     converting the rate to a cycle count, (c) converting this to a 
>>> 24 bit
>>>     LFSR count, an iterative algorithm (in this patch, starting from
>>>     one of 256 values so a max of 2^16 or 64k iterations), (d)
>>> calculating
>>>     an trace unload interval.   In addition, a cpufreq notifier is
>>> registered
>>>     to recalculate on frequency changes.
>
> No.  The user issues the command opcontrol --event:N  where N is the
> number of events (cycles, l2 cache misses, instructions retired etc)
> that are to elapse between collecting the samples.

So you are saying that b and c are primary, and a is used to calculate
a safe value for d.   All of the above work is dont, just from a
different starting point?


> The OProfile passes
> the value N to the kernel via the variable ctr[i].count.  Where i is 
> the
> performance counter entry for that event.

Ok I haven't looked a the api closely.

> Specifically with SPU
> profiling, we do not use performance counters because the CELL HW does
> not allow the normal the PPU to read the SPU PC when a performance
> counter interrupt occurs.  We are using some additional hw support in
> the chip that allows us to periodically capture the SPU PC.  There is 
> an
> LFSR hardware counter that can be started at an arbitrary LFSR value.
> When the last LFSR value in the sequence is reached, a sample is taken
> and stored in the trace buffer.  Hence, the value of N specified by the
> user must get converted to the LFSR value that is N from the end of the
> sequence.

Ok so its arbitray load count to max vs count and compare.   A critical
detail when computing the value to load, but the net result is the
same; the value for the count it hard to determine.

> The same clock that the processor is running at is used to
> control the LFSR count.  Hence the LFSR counter increments once per CPU
> clock cycle regardless of the CPU frequency or changes in the 
> frequency.
> There is no calculation for the LFSR value that is a function of the
> processor frequency.  There is no need to adjust the LFSR when the
> processor frequency changes.



Oh, so the lfsr doesn't have to be recomputed, only the time
between unloads.

>
> Milton had a comment about the code
>
>  if (ctr[0].event == SPU_CYCLES_EVENT_NUM) {
>> +             spu_cycle_reset = ctr[0].count;
>> +             return;
>> +     }
>
> Well, given the above description, it is clear that if you are doing 
> SPU
> event profiling, the value N is put into the cntr[0].cnt entry since
> there is only one event.  Thus in cell_global_start_spu() you use
> spu_cycle_reset as the argument to the lfsr calculation routine to get
> the LFSR value that is N from the end of the sequence.

I was looking at the patch and the context was not very good.   You
might consider adding -p to your diff command, it provides the function
name after the @@.

However, in this case, I think I just need to see the final result.

>
>>>
>>>     The obvious problem is step (c), running a loop potentially 64
>>> thousand
>>>     times in kernel space will have a noticeable impact on other 
>>> threads.
>>>
>>>     I propose instead that user space perform the above 4 steps, and
>>> provide
>>>     the kernel with two inputs: (1) the value to load in the LFSR 
>>> and (2)
>>>     the periodic frequency / time interval at which to empty the 
>>> hardware
>>>     trace buffer, perform sample analysis, and send the data to the
>>> oprofile
>>>     subsystem.
>>>
>>>     There should be no security issues with this approach.   If the 
>>> LFSR
>>> value
>>>     is calculated incorrectly, either it will be too short, causing 
>>> the
>>> trace
>>>     array to overfill and data to be dropped, or it will be too 
>>> long, and
>>>     there will be fewer samples.   Likewise, the kernel periodic poll
>>> can be
>>>     too long, again causing overflow, or too frequent, causing only 
>>> timer
>>>     execution overhead.
>>>
>>>     Various data is collected by the kernel while processing the
>>> periodic timer,
>>>     this approach would also allow the profiling tools to control the
>>>     frequency of this collection.   More frequent collection results 
>>> in
>>> more
>>>     accurate sample data, with the linear cost of poll execution
>>> overhead.
>>>
>>>     Frequency changes can be handled either by the profile code 
>>> setting
>>>     collection at a higher than necessary rate, or by interacting 
>>> with
>>> the
>>>     governor to limit the speeds.
>>>
>>>     Optionally, the kernel can add a record indicating that some 
>>> data was
>>>     likely dropped if it is able to read all 256 entries without
>>> underflowing
>>>     the array.  This can be used as hint to user space that the 
>>> kernel
>>> time
>>>     was too long for the collection rate.
>>
>> Moving the sample rate computation to user space sounds like the right
>> idea, but why not have a more drastic version of it:
>
> No, I do not agree.  The user/kernel API pass N where N is the number 
> of
> events between samples.  We are not at liberty to just change the API.
> We we did do this, we fully expect that John Levon will push back 
> saying
> why make an architecture specific API change when it isn't necessary.

[So you have not asked.]

<Kludge> If you want to overlaod the existing array, one
event could be the lfsr sample rate, and another event be
the collection time.  That would stay within the framework.
But that is a kludge. </Kludge>

[Me goes and reads kernel profile driver and skims powerpc code].

You are confusing the user interface (what the user specifies on the
command line) with the kernel API.

It is somewhat hidden by the PowerPC specific common code in
arch/powerpc/oprofile/common.c.  That is where the counter
array is exposd.

The user to kernel api is not fill out an array of counter
event names and sampling intervals.

The user to kernel interface is a file system that contains a
heirachy of files.  Each file consists of the hex ascii
represntation of a unsigned long.   The filesystem interfaces
to the kernel by provding an API to create directorys and files,
specifing the name of the directory or file.  Theere are helper
routines to connect a file to a ulong and read access to an atomic_t.
The common driver (in drivers/oprofile) creates the file system
and some common files that implement the control interface, which
interfaces to the architecture specific driver through the ops
array.

The powerpc common driver creates a heiarchy exposing the
values to be placed in the performance monitor registers,
and directory of counters with the event selection.

Since this architeture code does not seem to match the
capabilitys of the hardware, it would seem that this is
the area to change.   This driver does not seem to use
the actual PMU interrput or sprs.   Lets make it its
own directory with its own controls.

I don't see how exposing the sample collection to
rate and the computation of the LFSR create a userspace
api change;  I think its totally within the framework.

> Please define "drastic" in this context.  Do you mean make the table
> bigger i.e. more then 256 precomputed elements?  I did 256 since Arnd
> seemed to think that would be a reasonable size. Based on his example
> How much kernel space are we willing to use to same some computation?
> Keep in mind only one of the entries in the table will ever be used.
>
> I think if we found the LFSR that was with in 2^10 of the desired value
> that would be good enough. It would be within 1% of the minimum N the
> user can specify.  That would require a table with 2^14 entries.  That
> seems unreasonably large.

Why does the table have to be linear?  Compute samples at various
logrimathic staring points.   Determine how many significant bits
you want to keep, and have an array for each sample length.

ie for 3 bits of significance, you could have
F00000, E00000, ... 800000,   0F0000, ......
this would take (24-n) * 2^(n-1) slots, while maintaing user
control of the range.

>
> Anyway, the user controls how often sampling is done by setting N.

When calling a user space program.   The events are converted to
a series of control register values that are communicated to the
kernel by writing to files in the file system.   The writes are
interpreted (converted from ascii hex to binary longs) and stored
until the control files are written, at which point callbacks
copy and interpret the controls and start the hardware collection.

Frankly, I would expect a lot more resistance to the event data
stream generation changes and duplicaton.


milton


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [Cbe-oss-dev] [RFC, PATCH] CELL Oprofile SPU profiling updated patch
  2007-02-09  2:46         ` Milton Miller
@ 2007-02-09 16:17           ` Carl Love
  2007-02-11 22:46             ` Milton Miller
  0 siblings, 1 reply; 20+ messages in thread
From: Carl Love @ 2007-02-09 16:17 UTC (permalink / raw)
  To: Milton Miller
  Cc: cbe-oss-dev, Arnd Bergmann, LKML, linuxppc-dev, oprofile-list

On Thu, 2007-02-08 at 20:46 -0600, Milton Miller wrote:
>  On Feb 8, 2007, at 4:51 PM, Carl Love wrote:
>  
> > On Thu, 2007-02-08 at 18:21 +0100, Arnd Bergmann wrote:
> >> On Thursday 08 February 2007 15:18, Milton Miller wrote:
> >>
> >>> 1) sample rate setup
> >>>
> >>>     In the current patch, the user specifies a sample rate as a time
> >>> interval.
> >>>     The kernel is (a) calling cpufreq to get the current cpu 
> >>> frequency,
> >>> (b)
> >>>     converting the rate to a cycle count, (c) converting this to a 
> >>> 24 bit
> >>>     LFSR count, an iterative algorithm (in this patch, starting from
> >>>     one of 256 values so a max of 2^16 or 64k iterations), (d)
> >>> calculating
> >>>     an trace unload interval.   In addition, a cpufreq notifier is
> >>> registered
> >>>     to recalculate on frequency changes.
> >
> > No.  The user issues the command opcontrol --event:N  where N is the
> > number of events (cycles, l2 cache misses, instructions retired etc)
> > that are to elapse between collecting the samples.
>  
> So you are saying that b and c are primary, and a is used to calculate
> a safe value for d.   All of the above work is dont, just from a
> different starting point?

There are two things 1) setup the LFSR to control how often the Hardware
puts samples into the trace buffer.  2) setup the kernel timer to read
the trace buffer (your d, d is a function of the cpu freq) and process
the samples.  

(c is nonsense)

The kernel timer was set with the goal the the hardware trace buffer
would not get more then half full to ensure we would not lose samples
even for the maximum rate that the hardware would be adding samples to
the trace buffer (user specified N=100,000).  

>  
> 
> > The OProfile passes
> > the value N to the kernel via the variable ctr[i].count.  Where i is 
> > the
> > performance counter entry for that event.
>  
> Ok I haven't looked a the api closely.
>  
> > Specifically with SPU
> > profiling, we do not use performance counters because the CELL HW does
> > not allow the normal the PPU to read the SPU PC when a performance
> > counter interrupt occurs.  We are using some additional hw support in
> > the chip that allows us to periodically capture the SPU PC.  There is 
> > an
> > LFSR hardware counter that can be started at an arbitrary LFSR value.
> > When the last LFSR value in the sequence is reached, a sample is taken
> > and stored in the trace buffer.  Hence, the value of N specified by the
> > user must get converted to the LFSR value that is N from the end of the
> > sequence.
>  
> Ok so its arbitray load count to max vs count and compare.   A critical
> detail when computing the value to load, but the net result is the
> same; the value for the count it hard to determine.

The above statement makes no sense to me.  

Determining the initial LFSR value that is N values from the last value
in the sequence is not easy to do.  

>  
> > The same clock that the processor is running at is used to
> > control the LFSR count.  Hence the LFSR counter increments once per CPU
> > clock cycle regardless of the CPU frequency or changes in the 
> > frequency.
> > There is no calculation for the LFSR value that is a function of the
> > processor frequency.  There is no need to adjust the LFSR when the
> > processor frequency changes.
>  
> 
> 
> Oh, so the lfsr doesn't have to be recomputed, only the time
> between unloads.

The LFSR value is computed ONCE when you start OProfile.  The value is
setup in the hardware once when OProfile starts.  The hardware will
always restart with the value given to it after it reaches the last
value in the sequence.  What you call the "time between unloads" is the
time at which you schedule the kernel routine to empty the trace buffer.
It is calculated once.  It would only need to be recomputed if the cpu
frequency changed.

>  
> >
> > Milton had a comment about the code
> >
> >  if (ctr[0].event == SPU_CYCLES_EVENT_NUM) {
> >> +             spu_cycle_reset = ctr[0].count;
> >> +             return;
> >> +     }
> >
> > Well, given the above description, it is clear that if you are doing 
> > SPU
> > event profiling, the value N is put into the cntr[0].cnt entry since
> > there is only one event.  Thus in cell_global_start_spu() you use
> > spu_cycle_reset as the argument to the lfsr calculation routine to get
> > the LFSR value that is N from the end of the sequence.
>  
> I was looking at the patch and the context was not very good.   You
> might consider adding -p to your diff command, it provides the function
> name after the @@.
>  
> However, in this case, I think I just need to see the final result.
>  
> >
> >>>
> >>>     The obvious problem is step (c), running a loop potentially 64
> >>> thousand
> >>>     times in kernel space will have a noticeable impact on other 
> >>> threads.
> >>>
> >>>     I propose instead that user space perform the above 4 steps, and
> >>> provide
> >>>     the kernel with two inputs: (1) the value to load in the LFSR 
> >>> and (2)
> >>>     the periodic frequency / time interval at which to empty the 
> >>> hardware
> >>>     trace buffer, perform sample analysis, and send the data to the
> >>> oprofile
> >>>     subsystem.
> >>>
> >>>     There should be no security issues with this approach.   If the 
> >>> LFSR
> >>> value
> >>>     is calculated incorrectly, either it will be too short, causing 
> >>> the
> >>> trace
> >>>     array to overfill and data to be dropped, or it will be too 
> >>> long, and
> >>>     there will be fewer samples.   Likewise, the kernel periodic poll
> >>> can be
> >>>     too long, again causing overflow, or too frequent, causing only 
> >>> timer
> >>>     execution overhead.
> >>>
> >>>     Various data is collected by the kernel while processing the
> >>> periodic timer,
> >>>     this approach would also allow the profiling tools to control the
> >>>     frequency of this collection.   More frequent collection results 
> >>> in
> >>> more
> >>>     accurate sample data, with the linear cost of poll execution
> >>> overhead.
> >>>
> >>>     Frequency changes can be handled either by the profile code 
> >>> setting
> >>>     collection at a higher than necessary rate, or by interacting 
> >>> with
> >>> the
> >>>     governor to limit the speeds.
> >>>
> >>>     Optionally, the kernel can add a record indicating that some 
> >>> data was
> >>>     likely dropped if it is able to read all 256 entries without
> >>> underflowing
> >>>     the array.  This can be used as hint to user space that the 
> >>> kernel
> >>> time
> >>>     was too long for the collection rate.
> >>
> >> Moving the sample rate computation to user space sounds like the right
> >> idea, but why not have a more drastic version of it:
> >
> > No, I do not agree.  The user/kernel API pass N where N is the number 
> > of
> > events between samples.  We are not at liberty to just change the API.
> > We we did do this, we fully expect that John Levon will push back 
> > saying
> > why make an architecture specific API change when it isn't necessary.
>  
> [So you have not asked.]
>  
> <Kludge> If you want to overlaod the existing array, one
> event could be the lfsr sample rate, and another event be
> the collection time.  That would stay within the framework.
> But that is a kludge. </Kludge>
>  
> [Me goes and reads kernel profile driver and skims powerpc code].
>  
> You are confusing the user interface (what the user specifies on the
> command line) with the kernel API.
>  
> It is somewhat hidden by the PowerPC specific common code in
> arch/powerpc/oprofile/common.c.  That is where the counter
> array is exposd.
>  
> The user to kernel api is not fill out an array of counter
> event names and sampling intervals.
>  
> The user to kernel interface is a file system that contains a
> heirachy of files.  Each file consists of the hex ascii
> represntation of a unsigned long.   The filesystem interfaces
> to the kernel by provding an API to create directorys and files,
> specifing the name of the directory or file.  Theere are helper
> routines to connect a file to a ulong and read access to an atomic_t.
> The common driver (in drivers/oprofile) creates the file system
> and some common files that implement the control interface, which
> interfaces to the architecture specific driver through the ops
> array.
>  
> The powerpc common driver creates a heiarchy exposing the
> values to be placed in the performance monitor registers,
> and directory of counters with the event selection.
>  
> Since this architeture code does not seem to match the
> capabilitys of the hardware, it would seem that this is
> the area to change.   This driver does not seem to use
> the actual PMU interrput or sprs.   Lets make it its
> own directory with its own controls.
>  
> I don't see how exposing the sample collection to
> rate and the computation of the LFSR create a userspace
> api change;  I think its totally within the framework.

I was being a bit simplistic in my explination.  I am well aware of the
file system.  The point is the USER specifies the rate (every N events)
that they want to have sampling done.  We are using the existing
mechanism to pass the value of N to the kernel.  So from that
standpoint, we are trying to be consistent in how it is done with the
PPU. I feel that this is best to try to handle the N value in the same
way rather then having a completely different way.  

If we were to put the LFSR into the user space, you would pass the N
into the kernel for the PPU profiling case.  To make the API clean, you
would have to create a new file entry to pass the LFSR value.  For SPU
profiling you would not pass N instead you would pass LFSR.  I think it
is a bad idea to have these two different things for PPU versus SPU
profiling.  I really feel it is best to consistent to use pass N for PPU
and SPU.  Then deal with converting N to the LFSR for the special case
of SPU profiling.  

Unlike POWER 4/5 support where we absolutely had to add entries to the
API to pass the three values for the control registers.  We already have
an established mechanism for passing N from user to kernel.  Just use
it.  We are very sure that John Levon, the OProfile user maintainer,
will say the same thing and refuse to accept adding to the API to pass
the LFSR when the whole thing can be handled in a more consistent way
that does not require an architecture specific change.  And I also feel
that we really don't need or want an additional architecture specific
API change. 

>  
> > Please define "drastic" in this context.  Do you mean make the table
> > bigger i.e. more then 256 precomputed elements?  I did 256 since Arnd
> > seemed to think that would be a reasonable size. Based on his example
> > How much kernel space are we willing to use to same some computation?
> > Keep in mind only one of the entries in the table will ever be used.
> >
> > I think if we found the LFSR that was with in 2^10 of the desired value
> > that would be good enough. It would be within 1% of the minimum N the
> > user can specify.  That would require a table with 2^14 entries.  That
> > seems unreasonably large.
>  
> Why does the table have to be linear?  Compute samples at various
> logrimathic staring points.   Determine how many significant bits
> you want to keep, and have an array for each sample length.
>  
> ie for 3 bits of significance, you could have
> F00000, E00000, ... 800000,   0F0000, ......
> this would take (24-n) * 2^(n-1) slots, while maintaing user
> control of the range.
>  

I don't see any advantage in this log approach.  If we do this with a
linear table with a reasonable number of pre calculated values.  I think
that a table with no more then 1024 entries would be reasonable.  The
overhead for calculating the desired LFSR value would be going through
the for loop 16K times, for 1024 entries in the table, is not
unreasonable.  I think this whole discussion of moving the LFSR to the
user space is not needed.  The overhead of the for loop does not justify
pushing the LFSR determination to user space.  But that is just my
opinion.  I am open to suggestions on how big to make the lookup table
in the kernel.  But I really am apposed to putting the LFSR into the
user space.

> >
> > Anyway, the user controls how often sampling is done by setting N.
>  
> When calling a user space program.   The events are converted to
> a series of control register values that are communicated to the
> kernel by writing to files in the file system.   The writes are
> interpreted (converted from ascii hex to binary longs) and stored
> until the control files are written, at which point callbacks
> copy and interpret the controls and start the hardware collection.
>  
> Frankly, I would expect a lot more resistance to the event data
> stream generation changes and duplicaton.

I don't agree.  But that is just my opinion based on my experience
working on OProfile.

>  
> 
> milton
>  


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [Cbe-oss-dev] [RFC, PATCH] CELL Oprofile SPU profiling updated patch
  2007-02-08 23:59     ` Maynard Johnson
@ 2007-02-09 18:03       ` Milton Miller
  0 siblings, 0 replies; 20+ messages in thread
From: Milton Miller @ 2007-02-09 18:03 UTC (permalink / raw)
  To: Maynard Johnson; +Cc: cbe-oss-dev, LKML, linuxppc-dev, Carl Love, oprofile-list


On Feb 8, 2007, at 5:59 PM, Maynard Johnson wrote:

> Milton,
> Thank you for your comments.  Carl will reply to certain parts of your 
> posting where he's more knowledgeable than I.  See my replies below.
>

Thanks for the pleasant tone and dialog.

> Milton Miller wrote:
>> On Feb 6, 2007, at 5:02 PM, Carl Love wrote:
>>> This is the first update to the patch previously posted by Maynard
>>> Johnson as "PATCH 4/4. Add support to OProfile for profiling CELL".
>>

>> Data collected
>>
>>
>> The current patch starts tackling these translation issues for the
>> presently common case of a static self contained binary from a single
>> file, either single separate source file or embedded in the data of
>> the host application.   When creating the trace entry for a SPU
>> context switch, it records the application owner, pid, tid, and
>> dcookie of the main executable.   It addition, it looks up the
>> object-id as a virtual address and records the offset if it is 
>> non-zero,
>> or the dcookie of the object if it is zero.   The code then creates
>> a data structure by reading the elf headers from the user process
>> (at the address given by the object-id) and building a list of
>> SPU address to elf object offsets, as specified by the ELF loader
>> headers.   In addition to the elf loader section, it processes the
>> overlay headers and records the address, size, and magic number of
>> the overlay.
>>
>> When the hardware trace entries are processed, each address is
>> looked up this structure and translated to the elf offset.  If
>> it is an overlay region, the overlay identify word is read and
>> the list is searched for the matching overlay.  The resulting
>> offset is sent to the oprofile system.
>>
>> The current patch specifically identifies that only single
>> elf objects are handled.  There is no code to handle dynamic
>> linked libraries or overlays.   Nor is there any method to
>>
> Yes, we do handle overlays.  (Note: I'm looking into a bug
> right now in our overlay support.)

I knew you handled overlays, and I did not mean to say that
you did not.   I am not sure how that got there.  I may have
been thinking of the kernel supplied context switch code
discussion, or how the code supplied dcookie or offset but
not both.  Actually, I might have been referring to the
fact that overlays are guessed rather than recorded.

>> present samples that may have been collected during context
>> switch processing, they must be discarded.
>>
>>
>> My proposal is to change what is presented to user space.  Instead
>> of trying to translate the SPU address to the backing file
>> as the samples are recorded, store the samples as the SPU
>> context and address.  The context switch would record tid,
>> pid, object id as it does now.   In addition, if this is a
>> new object-id, the kernel would read elf headers as it does
>> today.  However, it would then proceed to provide accurate
>> dcookie information for each loader region and overlay.  To
>> identify which overlays are active, (instead of the present
>> read on use and search the list to translate approach) the
>> kernel would record the location of the overlay identifiers
>> as it parsed the kernel, but would then read the identification
>> word and would record the present value as an sample from
>> a separate but related stream.   The kernel could maintain
>> the last value for each overlay and only send profile events
>> for the deltas.
>>
> Discussions on this topic in the past have resulted in the
> current implementation precisely because we're able to record
> the samples as fileoffsets, just as the userspace tools expect.

I was not part of the previous discussions, so please forgive me.

> I haven't had time to check out how much this would impact the
> userspace tools, but my gut feel is that it would be quite
> significant.  If we were developing this module with a matching
> newly-created userspace tool, I would be more inclined to agree
> that this makes sense.

I have not yet studied the user space tool.   In fact, when I
made this proposal, I had not studied the kernel oprofile code
either, although I had read the concepts and discussion of the
event buffer when the base patch was added to the kernel.

I have read and now consider myself to have some understanding
of the kernel side.  I note that the user space tool calls itself
alpha and the kernel support experimental.   I only looked at
the user space enough to determine it is written in C++.

I would hope the tool would be modular enough to insert a data
transformation pass to do this conversion that the kernel is
doing.

> But you give no rationale for your
> proposal that justifies the change.  The current implementation
> works, it has no impact on normal, non-profiling behavior,  and
> the overhead during profiling is not noticeable.

I was proposing this for several of reasons.

One, there were expressed limitations  in the current
proposal.  There is a requirement that everything be linked
into one ELF object for it to be profiled seemed significant
to me.  This implies that shared libraries (well, dynamic
linked ones) have no path forward in this framework.

Two, there was a discussion of profiling the kernel context
switch, that seemed to be deferred.

Three, I saw the amount of code added to create the buffer
stream, but had not studied it yet.   It appeared to be
creating a totally new stream to be interpreted by user space.
With that in mind, I was exploring what the right interface
should be with more freedom.

Fourth, the interpretation was being done by heuristics and not
first source data.  By this I mean that both the overlay
identification is delayed, and that the code being executed
is a copy from the file.   While existing users may leave
the code mapped into the host process, there is no inherent
requirement to do so.  Now that I understand overlays, I see
they would have to remain mapped.

Of these, I think the fourth is the most compelling argument,
although the first and the second were what caused me to
try to understand why all this new code was needed.

My proposal was an attempt to expose the raw data, providing
user space what we know vs some guess.  I was trying to provide
the information that was needed to perform the translation
in user space with at least the same accuracy, but with the
ambiguities exposed.


>> This approach trades translation lookup overhead for each
>> recorded sample for a burst of data on new context activation.
>> In addition it exposes the sample point of the overlay identifier
>> vs the address collection.  This allows the ambiguity to be
>> exposed to user space.   In addition, with the above proposed
>> kernel timer vs sample collection, user space could limit the
>> elapsed time between the address collection and the overlay
>> id check.
>>
> Yes, there is a window here where an overlay could occur before
> we finish processing a group of samples that were actually taken
> from a different overlay.  The obvious way to prevent that is
> for the kernel (or SPUFS) to be notified of the overlay and let
> OProfile know that we need to drain (perhaps discard would be
> best) our sample trace buffer.  As you indicate above, your
> proposal faces the same issue, but would just decrease the number
> of bogus samples.

Actually, with my proposal, I let user space decide how to handle
the ambiguity.   If the SPU periodically switches between two
overlays then the sampling will be out of sync.   Flags could be
passed to the user space decoder to influence the overlay; for
instance one flag might say place all samples on each overlay
in turn, or create separate buckets when the overlay was known
to change during sample collection (or just discard those samples).

> I contend that the relative number of bogus samples will be
> quite low in either case.

I think this totally depends on the frequency of overlay changes.
If an application frequently swaps between overlays then the
data could be wrong more than it is right.   If instead
the overlay is only switched when a fundamental phase change
in data processing occurs, then the ambiguity will have
little affect on the sample quality.


> Ideally, we should have a mechanism to eliminate them completely
> so as to avoid confusion the user's part when they're looking at
> a report.  Even a few bogus samples in the wrong place can be
> troubling.  Such a mechanism will be a good future enhancement.

I think the obvious most accurate to resolve this would be to have
the overlay load code record an event in a buffer with a
timestamp, the kernel record a timestamp periodically during
collection, and the user space do the correlation.   However,
this obviously involves both space and time overhead, as Arnd
pointed out.


Actually, if we record in the data stream that the overlay
changed from last read, then user space could know that samples
from that overlay are suspect, during that portion.   We
would need to mark both the beginning and end of that trace
array unload.


When I started to write this reply, I had done my review of
the existing kernel code, and thought I would severely back
off from this proposal.  However, now that I have written
my reasons for the proposal, I think I would still like to
have it explored.

Now that I have read the kernel code and slept on it, let
me also propose, in the alternative, an approach that tries
to reuse the existing sample framework be used instead of
the total re-implemntation.  This idea is still mostly
concept, so please bear with me.  That is to say I read
the headers, and have some understanding of how the stages
of data collection fit together, but have not studied the
actual impact to the implentation.

Even if we do stay with the kernel resolution of the SPU
address to source in the process vm, why do we need to
write new buffer sync code?   Instead of recording the hit
as an ELF object source, why not resolve it to the threads
context vm address, and let the existing oprofile code
lookup the dcookie and offset?

1) The  user space needs to find the elf header of embedded
objects.   Answer: on context switch pass the object-id as
a sample, with appropriate escape code.

2) THe offset will be file relative instead of elf object
relative.   Answer: user space has to account for this
offset either when reading the instructions for decode or
when doing the sample lookup.   Answer b: if the context
id dcookie is for the same file, subtract its offset.
Actually, only do this if its in range of the file size
as indicated in the ELF headers.   Hmmm.. this would imply
a artifical dcookie for the offset object-id case.

3) In addition to the additional records to record the
context-id on switch, the existing interfaces assume the
sample is being recorded from the context of the running
thread, implicitly reading cpu number and active mm.
Answer: Yes, but factoring and code reuse should be
possible.


[I wonder if any other nommu architectures use overlays
like this.  I would not be surprised if the answer is
no, they have enough address space.]

milton
--
miltonm@bga.com   Milton Miller
Speaking for myself only.


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [Cbe-oss-dev] [RFC, PATCH] CELL Oprofile SPU profiling updated patch
  2007-02-08 17:21     ` Arnd Bergmann
  2007-02-08 18:01       ` Adrian Reber
  2007-02-08 22:51       ` Carl Love
@ 2007-02-09 18:47       ` Milton Miller
  2007-02-09 19:10         ` Arnd Bergmann
  2 siblings, 1 reply; 20+ messages in thread
From: Milton Miller @ 2007-02-09 18:47 UTC (permalink / raw)
  To: Arnd Bergmann; +Cc: cbe-oss-dev, LKML, linuxppc-dev, Carl Love, oprofile-list


On Feb 8, 2007, at 11:21 AM, Arnd Bergmann wrote:

> On Thursday 08 February 2007 15:18, Milton Miller wrote:
>
>> The current patch specifically identifies that only single
>> elf objects are handled.  There is no code to handle dynamic
>> linked libraries or overlays.   Nor is there any method to
>> present samples that may have been collected during context
>> switch processing, they must be discarded.
>
> I thought it already did handle overlays, what did I miss here?

It does, see my reply to Maynard.  Not sure what I was thinking
when I wrote this, possibly I was thinking of the inaccuracies.

>
>> My proposal is to change what is presented to user space.  Instead
>> of trying to translate the SPU address to the backing file
>> as the samples are recorded, store the samples as the SPU
>> context and address.  The context switch would record tid,
>> pid, object id as it does now.   In addition, if this is a
>> new object-id, the kernel would read elf headers as it does
>> today.  However, it would then proceed to provide accurate
>> dcookie information for each loader region and overlay.
>
> Doing the translation in two stages in user space, as you
> suggest here, definitely makes sense to me. I think it
> can be done a little simpler though:
>
> Why would you need the accurate dcookie information to be
> provided by the kernel? The ELF loader is done in user
> space, and the kernel only reproduces what it thinks that
> came up with. If the kernel only gives the dcookie information
> about the SPU ELF binary to the oprofile user space, then
> that can easily recreate the same mapping.

Actually, I was trying to cover issues such as anonymous
memory.   If the kernel doesn't generate dcookies for
the load segments the information is not recoverable once
the task exits.  This would also allow the loader to create
an artifical elf header that covered both the base executable
and a dynamicly linked one.

Other alternatives exist, such as a structure for context-id
that would have its own identifing magic and an array of elf
header pointers.


>
> The kernel still needs to provide the overlay identifiers
> though.

Yes, which means it needs to parse the header (or libpse
be enhanced to write the monitor info to a spufs file).

>
>> To identify which overlays are active, (instead of the present
>> read on use and search the list to translate approach) the
>> kernel would record the location of the overlay identifiers
>> as it parsed the kernel, but would then read the identification
>> word and would record the present value as an sample from
>> a separate but related stream.   The kernel could maintain
>> the last value for each overlay and only send profile events
>> for the deltas.
>
> right.
>
>> This approach trades translation lookup overhead for each
>> recorded sample for a burst of data on new context activation.
>> In addition it exposes the sample point of the overlay identifier
>> vs the address collection.  This allows the ambiguity to be
>> exposed to user space.   In addition, with the above proposed
>> kernel timer vs sample collection, user space could limit the
>> elapsed time between the address collection and the overlay
>> id check.
>
> yes, this sounds nice. But tt does not at all help accuracy,
> only performance, right?

It allows the user space to know when the sample was taken
and  be aware of the ambiguity.   If the sample rate is
high enough in relation to the overlay switch rate, user space
could decide to discard the ambiguous samples.

>
>> This approach allows multiple objects by its nature.  A new
>> elf header could be constructed in memory that contained
>> the union of the elf objects load segments, and the tools
>> will magically work.   Alternatively the object id could
>> point to a new structure, identified via a new header, that
>> it points to other elf headers (easily differentiated by the
>> elf magic headers).   Other binary formats, including several
>> objects in a ar archive, could be supported.
>
> Yes, that would be a new feature if the kernel passed dcookie
> information for every section, but I doubt that it is worth
> it. I have not seen any program that allows loading code
> from more than one ELF file. In particular, the ELF format
> on the SPU is currently lacking the relocation mechanisms
> that you would need for resolving spu-side symbols at load
> time

Actually, It could check all load segments, and only report
those where the dcookie changes (as opposed to the offset).

> .
>
>> If better overlay identification is required, in theory the
>> overlay switch code could be augmented to record the switches
>> (DMA reference time from the PowerPC memory and record a
>> relative decrementer in the SPU), this is obviously a future
>> item.  But it is facilitated by having user space resolve the
>> SPU to source file translation.
>
> This seems to incur a run-time overhead on the SPU even if not
> profiling, I would consider that not acceptable.

It definitely is overhead.  Which means it would have to be
optional, like gprof.


milton


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [Cbe-oss-dev] [RFC, PATCH] CELL Oprofile SPU profiling updated patch
  2007-02-09 18:47       ` Milton Miller
@ 2007-02-09 19:10         ` Arnd Bergmann
  2007-02-09 19:46           ` Milton Miller
  0 siblings, 1 reply; 20+ messages in thread
From: Arnd Bergmann @ 2007-02-09 19:10 UTC (permalink / raw)
  To: Milton Miller; +Cc: cbe-oss-dev, LKML, linuxppc-dev, Carl Love, oprofile-list

On Friday 09 February 2007 19:47, Milton Miller wrote:
> On Feb 8, 2007, at 11:21 AM, Arnd Bergmann wrote:
> 

> > Doing the translation in two stages in user space, as you
> > suggest here, definitely makes sense to me. I think it
> > can be done a little simpler though:
> >
> > Why would you need the accurate dcookie information to be
> > provided by the kernel? The ELF loader is done in user
> > space, and the kernel only reproduces what it thinks that
> > came up with. If the kernel only gives the dcookie information
> > about the SPU ELF binary to the oprofile user space, then
> > that can easily recreate the same mapping.
> 
> Actually, I was trying to cover issues such as anonymous
> memory.   If the kernel doesn't generate dcookies for
> the load segments the information is not recoverable once
> the task exits.  This would also allow the loader to create
> an artifical elf header that covered both the base executable
> and a dynamicly linked one.
> 
> Other alternatives exist, such as a structure for context-id
> that would have its own identifing magic and an array of elf
> header pointers.

But _why_ do you want to solve that problem? we don't have
dynamically linked binaries and I really don't see why the loader
would want to create artificial elf headers...

> > The kernel still needs to provide the overlay identifiers
> > though.
> 
> Yes, which means it needs to parse the header (or libpse
> be enhanced to write the monitor info to a spufs file).

we thought about this in the past and discarded it because of
the complexity of an spufs interface that would handle this
correctly. 

> > yes, this sounds nice. But tt does not at all help accuracy,
> > only performance, right?
> 
> It allows the user space to know when the sample was taken
> and  be aware of the ambiguity.   If the sample rate is
> high enough in relation to the overlay switch rate, user space
> could decide to discard the ambiguous samples.

yes, good point.

> >> This approach allows multiple objects by its nature.  A new
> >> elf header could be constructed in memory that contained
> >> the union of the elf objects load segments, and the tools
> >> will magically work.   Alternatively the object id could
> >> point to a new structure, identified via a new header, that
> >> it points to other elf headers (easily differentiated by the
> >> elf magic headers).   Other binary formats, including several
> >> objects in a ar archive, could be supported.
> >
> > Yes, that would be a new feature if the kernel passed dcookie
> > information for every section, but I doubt that it is worth
> > it. I have not seen any program that allows loading code
> > from more than one ELF file. In particular, the ELF format
> > on the SPU is currently lacking the relocation mechanisms
> > that you would need for resolving spu-side symbols at load
> > time
> 
> Actually, It could check all load segments, and only report
> those where the dcookie changes (as opposed to the offset).

I'm not really following you here, but probably you misunderstood
my point as well.

> > This seems to incur a run-time overhead on the SPU even if not
> > profiling, I would consider that not acceptable.
> 
> It definitely is overhead.  Which means it would have to be
> optional, like gprof.

There is some work going on for another profiler independent
of the hardware feature that only relies on instrumenting the
spu executable for things like DMA transfers and overlay
changes. 

	Arnd <><

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [Cbe-oss-dev] [RFC, PATCH] CELL Oprofile SPU profiling updated patch
  2007-02-09 19:10         ` Arnd Bergmann
@ 2007-02-09 19:46           ` Milton Miller
  0 siblings, 0 replies; 20+ messages in thread
From: Milton Miller @ 2007-02-09 19:46 UTC (permalink / raw)
  To: Arnd Bergmann; +Cc: cbe-oss-dev, LKML, linuxppc-dev, Carl Love, oprofile-list


On Feb 9, 2007, at 1:10 PM, Arnd Bergmann wrote:

> On Friday 09 February 2007 19:47, Milton Miller wrote:
>> On Feb 8, 2007, at 11:21 AM, Arnd Bergmann wrote:
>>
>
>>> Doing the translation in two stages in user space, as you
>>> suggest here, definitely makes sense to me. I think it
>>> can be done a little simpler though:
>>>
>>> Why would you need the accurate dcookie information to be
>>> provided by the kernel? The ELF loader is done in user
>>> space, and the kernel only reproduces what it thinks that
>>> came up with. If the kernel only gives the dcookie information
>>> about the SPU ELF binary to the oprofile user space, then
>>> that can easily recreate the same mapping.
>>
>> Actually, I was trying to cover issues such as anonymous
>> memory.   If the kernel doesn't generate dcookies for
>> the load segments the information is not recoverable once
>> the task exits.  This would also allow the loader to create
>> an artifical elf header that covered both the base executable
>> and a dynamicly linked one.
>>
>> Other alternatives exist, such as a structure for context-id
>> that would have its own identifing magic and an array of elf
>> header pointers.
>
> But _why_ do you want to solve that problem? we don't have
> dynamically linked binaries and I really don't see why the loader
> would want to create artificial elf headers...

I'm explainign how they could be handled.   Actully I think
the other proposal (an identified structure that points to
sevear elf headers) would be more approprate.  As you point
out, if there are presently no dynamic libraries in use, it
doesn't have to be solved today.  I'm just trying to make
the code future proof, or at least a clear path forward.

>
>>> The kernel still needs to provide the overlay identifiers
>>> though.
>>
>> Yes, which means it needs to parse the header (or libpse
>> be enhanced to write the monitor info to a spufs file).
>
> we thought about this in the past and discarded it because of
> the complexity of an spufs interface that would handle this
> correctly.

Not sure what would be difficuult, and it would allow other
binary formats.   But parsing the headers in the kernel
means existing userspace doesn't have to be upgraded, so I
am not proposing this requirement.

>
>>> yes, this sounds nice. But tt does not at all help accuracy,
>>> only performance, right?
>>
>> It allows the user space to know when the sample was taken
>> and  be aware of the ambiguity.   If the sample rate is
>> high enough in relation to the overlay switch rate, user space
>> could decide to discard the ambiguous samples.
>
> yes, good point.
>
>>>> This approach allows multiple objects by its nature.  A new
>>>> elf header could be constructed in memory that contained
>>>> the union of the elf objects load segments, and the tools
>>>> will magically work.   Alternatively the object id could
>>>> point to a new structure, identified via a new header, that
>>>> it points to other elf headers (easily differentiated by the
>>>> elf magic headers).   Other binary formats, including several
>>>> objects in a ar archive, could be supported.
>>>
>>> Yes, that would be a new feature if the kernel passed dcookie
>>> information for every section, but I doubt that it is worth
>>> it. I have not seen any program that allows loading code
>>> from more than one ELF file. In particular, the ELF format
>>> on the SPU is currently lacking the relocation mechanisms
>>> that you would need for resolving spu-side symbols at load
>>> time
>>
>> Actually, It could check all load segments, and only report
>> those where the dcookie changes (as opposed to the offset).
>
> I'm not really following you here, but probably you misunderstood
> my point as well.

I was thinking in terms of dyanmic libraries, and totally skipped
your comment about the relocaiton info being missing.   My reply
point was that the table could be compressed to the current
entry if all hased to the same vm area.

>
>>> This seems to incur a run-time overhead on the SPU even if not
>>> profiling, I would consider that not acceptable.
>>
>> It definitely is overhead.  Which means it would have to be
>> optional, like gprof.
>
> There is some work going on for another profiler independent
> of the hardware feature that only relies on instrumenting the
> spu executable for things like DMA transfers and overlay
> changes.

Regardless, its beyond the current scope.

milton


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [Cbe-oss-dev] [RFC, PATCH] CELL Oprofile SPU profiling updated patch
  2007-02-09 16:17           ` Carl Love
@ 2007-02-11 22:46             ` Milton Miller
  2007-02-12 16:38               ` Carl Love
  0 siblings, 1 reply; 20+ messages in thread
From: Milton Miller @ 2007-02-11 22:46 UTC (permalink / raw)
  To: Carl Love; +Cc: cbe-oss-dev, Arnd Bergmann, LKML, linuxppc-dev, oprofile-list


On Feb 9, 2007, at 10:17 AM, Carl Love wrote:

> On Thu, 2007-02-08 at 20:46 -0600, Milton Miller wrote:
>>  On Feb 8, 2007, at 4:51 PM, Carl Love wrote:
>>
>>> On Thu, 2007-02-08 at 18:21 +0100, Arnd Bergmann wrote:
>>>> On Thursday 08 February 2007 15:18, Milton Miller wrote:
>>>>
>>>>> 1) sample rate setup
>>>>>
>>>>>     In the current patch, the user specifies a sample rate as a 
>>>>> time
>>>>> interval.
>>>>>     The kernel is (a) calling cpufreq to get the current cpu
>>>>> frequency,
>>>>> (b)
>>>>>     converting the rate to a cycle count, (c) converting this to a
>>>>> 24 bit
>>>>>     LFSR count, an iterative algorithm (in this patch, starting 
>>>>> from
>>>>>     one of 256 values so a max of 2^16 or 64k iterations), (d)
>>>>> calculating
>>>>>     an trace unload interval.   In addition, a cpufreq notifier is
>>>>> registered
>>>>>     to recalculate on frequency changes.
>>>
>>> No.  The user issues the command opcontrol --event:N  where N is the
>>> number of events (cycles, l2 cache misses, instructions retired etc)
>>> that are to elapse between collecting the samples.
>>
>> So you are saying that b and c are primary, and a is used to calculate
>> a safe value for d.   All of the above work is dont, just from a
>> different starting point?
>
> There are two things 1) setup the LFSR to control how often the 
> Hardware
> puts samples into the trace buffer.  2) setup the kernel timer to read
> the trace buffer (your d, d is a function of the cpu freq) and process
> the samples.
>
> (c is nonsense)


Well, its cacluate the rate from the count vs count from the rate.

>
> The kernel timer was set with the goal the the hardware trace buffer
> would not get more then half full to ensure we would not lose samples
> even for the maximum rate that the hardware would be adding samples to
> the trace buffer (user specified N=100,000).

That should be well commented.

>>
>>> The OProfile passes
>>> the value N to the kernel via the variable ctr[i].count.  Where i is
>>> the
>>> performance counter entry for that event.
>>
>> Ok I haven't looked a the api closely.


Actually, the oprofile userspace fills out files in a file system to 
tell
the kernel what it needs to know.   The powerpc code defines the 
reources
needed to use the PMU hardware, which is (1) common mux selects, for
processors that need them, and (2) a set of directories, one for each 
of the
pmu counters, each of which contain the controls for that counter (such
as enable kernel space, enable user space, event timeout to interrupt or
sample collection, etc.   The ctr[i].count is one of these files.

>>
>>> Specifically with SPU
>>> profiling, we do not use performance counters because the CELL HW 
>>> does
>>> not allow the normal the PPU to read the SPU PC when a performance
>>> counter interrupt occurs.  We are using some additional hw support in
>>> the chip that allows us to periodically capture the SPU PC.  There is
>>> an
>>> LFSR hardware counter that can be started at an arbitrary LFSR value.
>>> When the last LFSR value in the sequence is reached, a sample is 
>>> taken
>>> and stored in the trace buffer.  Hence, the value of N specified by 
>>> the
>>> user must get converted to the LFSR value that is N from the end of 
>>> the
>>> sequence.
>>
>> Ok so its arbitray load count to max vs count and compare.   A 
>> critical
>> detail when computing the value to load, but the net result is the
>> same; the value for the count it hard to determine.
>
> The above statement makes no sense to me.

I think I talked past you.  Your description of hadrware vs mine was
different in that the counter always ends at a specified point and is
loaded with the variable count, where mine had it comparing to the
count as it incremented.

However, I now see that you were referring to the fact that what the
user specifes, the count, has to be converted to a LFSR value.  My point
is this can be done in the user-space oprofile code.  It already has
to look up magic numbers for setting up event selection muxes for other
hardware, adding a lfsr calculation is not beyond resoon.  Nor is having
it provide two values in two files.

>
> Determining the initial LFSR value that is N values from the last value
> in the sequence is not easy to do.

Well, its easy, its just order(N).

>>> The same clock that the processor is running at is used to
>>> control the LFSR count.  Hence the LFSR counter increments once per 
>>> CPU
>>> clock cycle regardless of the CPU frequency or changes in the
>>> frequency.
>>> There is no calculation for the LFSR value that is a function of the
>>> processor frequency.  There is no need to adjust the LFSR when the
>>> processor frequency changes.
>>
>> Oh, so the lfsr doesn't have to be recomputed, only the time
>> between unloads.
>
> The LFSR value is computed ONCE when you start OProfile.  The value is
> setup in the hardware once when OProfile starts.  The hardware will
> always restart with the value given to it after it reaches the last
> value in the sequence.  What you call the "time between unloads" is the
> time at which you schedule the kernel routine to empty the trace 
> buffer.
> It is calculated once.  It would only need to be recomputed if the cpu
> frequency changed.



>>>>>
>>>>>     The obvious problem is step (c), running a loop potentially 64
>>>>> thousand
>>>>>     times in kernel space will have a noticeable impact on other
>>>>> threads.
>>>>>
>>>>>     I propose instead that user space perform the above 4 steps, 
>>>>> and
>>>>> provide
>>>>>     the kernel with two inputs: (1) the value to load in the LFSR
>>>>> and (2)
>>>>>     the periodic frequency / time interval at which to empty the
>>>>> hardware
>>>>>     trace buffer, perform sample analysis, and send the data to the
>>>>> oprofile
>>>>>     subsystem.
>>>>>
>>>>>     There should be no security issues with this approach.   If the
>>>>> LFSR
>>>>> value
>>>>>     is calculated incorrectly, either it will be too short, causing
>>>>> the
>>>>> trace
>>>>>     array to overfill and data to be dropped, or it will be too
>>>>> long, and
>>>>>     there will be fewer samples.   Likewise, the kernel periodic 
>>>>> poll
>>>>> can be
>>>>>     too long, again causing overflow, or too frequent, causing only
>>>>> timer
>>>>>     execution overhead.
>>>>>
>>>>>     Various data is collected by the kernel while processing the
>>>>> periodic timer,
>>>>>     this approach would also allow the profiling tools to control 
>>>>> the
>>>>>     frequency of this collection.   More frequent collection 
>>>>> results
>>>>> in
>>>>> more
>>>>>     accurate sample data, with the linear cost of poll execution
>>>>> overhead.
>>>>>
>>>>>     Frequency changes can be handled either by the profile code
>>>>> setting
>>>>>     collection at a higher than necessary rate, or by interacting
>>>>> with
>>>>> the
>>>>>     governor to limit the speeds.
>>>>>
>>>>>     Optionally, the kernel can add a record indicating that some
>>>>> data was
>>>>>     likely dropped if it is able to read all 256 entries without
>>>>> underflowing
>>>>>     the array.  This can be used as hint to user space that the
>>>>> kernel
>>>>> time
>>>>>     was too long for the collection rate.
>>>>
>>>> Moving the sample rate computation to user space sounds like the 
>>>> right
>>>> idea, but why not have a more drastic version of it:
>>>
>>> No, I do not agree.  The user/kernel API pass N where N is the number
>>> of
>>> events between samples.  We are not at liberty to just change the 
>>> API.
>>> We we did do this, we fully expect that John Levon will push back
>>> saying
>>> why make an architecture specific API change when it isn't necessary.
>>
>> [So you have not asked.]
>>
>> <Kludge> If you want to overlaod the existing array, one
>> event could be the lfsr sample rate, and another event be
>> the collection time.  That would stay within the framework.
>> But that is a kludge. </Kludge>
>>
>> [Me goes and reads kernel profile driver and skims powerpc code].
>>
>> You are confusing the user interface (what the user specifies on the
>> command line) with the kernel API.
>>
>> It is somewhat hidden by the PowerPC specific common code in
>> arch/powerpc/oprofile/common.c.  That is where the counter
>> array is exposd.
>>
>> The user to kernel api is not fill out an array of counter
>> event names and sampling intervals.
>>
>> The user to kernel interface is a file system that contains a
>> heirachy of files.  Each file consists of the hex ascii
>> represntation of a unsigned long.   The filesystem interfaces
>> to the kernel by provding an API to create directorys and files,
>> specifing the name of the directory or file.  Theere are helper
>> routines to connect a file to a ulong and read access to an atomic_t.
>> The common driver (in drivers/oprofile) creates the file system
>> and some common files that implement the control interface, which
>> interfaces to the architecture specific driver through the ops
>> array.
>>
>> The powerpc common driver creates a heiarchy exposing the
>> values to be placed in the performance monitor registers,
>> and directory of counters with the event selection.
>>
>> Since this architeture code does not seem to match the
>> capabilitys of the hardware, it would seem that this is
>> the area to change.   This driver does not seem to use
>> the actual PMU interrput or sprs.   Lets make it its
>> own directory with its own controls.
>>
>> I don't see how exposing the sample collection to
>> rate and the computation of the LFSR create a userspace
>> api change;  I think its totally within the framework.
>
> I was being a bit simplistic in my explination.  I am well aware of the
> file system.  The point is the USER specifies the rate (every N events)
> that they want to have sampling done.  We are using the existing
> mechanism to pass the value of N to the kernel.  So from that
> standpoint, we are trying to be consistent in how it is done with the
> PPU. I feel that this is best to try to handle the N value in the same
> way rather then having a completely different way.


>
> If we were to put the LFSR into the user space, you would pass the N
> into the kernel for the PPU profiling case.  To make the API clean, you
> would have to create a new file entry to pass the LFSR value.  For SPU
> profiling you would not pass N instead you would pass LFSR.  I think it
> is a bad idea to have these two different things for PPU versus SPU
> profiling.  I really feel it is best to consistent to use pass N for 
> PPU
> and SPU.  Then deal with converting N to the LFSR for the special case
> of SPU profiling.
>

As far as I understand, you are providing access to a completely new
hardware that is related to the PMU hardware by the fact that it
collects a program counter.   It doesn't use the PMU counters nor the
PMU event selection.

In fact, why can the existing op_model_cell profiling not run while
the SPU profiling runs?   Is there a shared debug bus inside the
chip?   Or just the data stream with your buffer_sync code?

> Unlike POWER 4/5 support where we absolutely had to add entries to the
> API to pass the three values for the control registers.  We already 
> have
> an established mechanism for passing N from user to kernel.  Just use
> it.  We are very sure that John Levon, the OProfile user maintainer,
> will say the same thing and refuse to accept adding to the API to pass
> the LFSR when the whole thing can be handled in a more consistent way
> that does not require an architecture specific change.  And I also feel
> that we really don't need or want an additional architecture specific
> API change.
>
>>
>>> Please define "drastic" in this context.  Do you mean make the table
>>> bigger i.e. more then 256 precomputed elements?  I did 256 since Arnd
>>> seemed to think that would be a reasonable size. Based on his example
>>> How much kernel space are we willing to use to same some computation?
>>> Keep in mind only one of the entries in the table will ever be used.
>>>
>>> I think if we found the LFSR that was with in 2^10 of the desired 
>>> value
>>> that would be good enough. It would be within 1% of the minimum N the
>>> user can specify.  That would require a table with 2^14 entries.  
>>> That
>>> seems unreasonably large.
>>
>> Why does the table have to be linear?  Compute samples at various
>> logrimathic staring points.   Determine how many significant bits
>> you want to keep, and have an array for each sample length.
>>
>> ie for 3 bits of significance, you could have
>> F00000, E00000, ... 800000,   0F0000, ......
>> this would take (24-n) * 2^(n-1) slots, while maintaing user
>> control of the range.
>>
>
> I don't see any advantage in this log approach.  If we do this with a
> linear table with a reasonable number of pre calculated values.  I 
> think
> that a table with no more then 1024 entries would be reasonable.  The
> overhead for calculating the desired LFSR value would be going through
> the for loop 16K times, for 1024 entries in the table, is not
> unreasonable.  I think this whole discussion of moving the LFSR to the
> user space is not needed.  The overhead of the for loop does not 
> justify
> pushing the LFSR determination to user space.  But that is just my
> opinion.  I am open to suggestions on how big to make the lookup table
> in the kernel.  But I really am apposed to putting the LFSR into the
> user space.
>
>>>
>>> Anyway, the user controls how often sampling is done by setting N.
>>
>> When calling a user space program.   The events are converted to
>> a series of control register values that are communicated to the
>> kernel by writing to files in the file system.   The writes are
>> interpreted (converted from ascii hex to binary longs) and stored
>> until the control files are written, at which point callbacks
>> copy and interpret the controls and start the hardware collection.
>>
>> Frankly, I would expect a lot more resistance to the event data
>> stream generation changes and duplicaton.
>
> I don't agree.  But that is just my opinion based on my experience
> working on OProfile.
>

Well, I haven't worked with oprofile in the past, but I have worked
on the kernel.  And I stay by my statement.

milton


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [Cbe-oss-dev] [RFC, PATCH] CELL Oprofile SPU profiling updated patch
  2007-02-11 22:46             ` Milton Miller
@ 2007-02-12 16:38               ` Carl Love
  0 siblings, 0 replies; 20+ messages in thread
From: Carl Love @ 2007-02-12 16:38 UTC (permalink / raw)
  To: Milton Miller
  Cc: cbe-oss-dev, Arnd Bergmann, LKML, linuxppc-dev, oprofile-list

On Sun, 2007-02-11 at 16:46 -0600, Milton Miller wrote:

[cut]

> 
> As far as I understand, you are providing access to a completely new
> hardware that is related to the PMU hardware by the fact that it
> collects a program counter.   It doesn't use the PMU counters nor the
> PMU event selection.
> 
> In fact, why can the existing op_model_cell profiling not run while
> the SPU profiling runs?   Is there a shared debug bus inside the
> chip?   Or just the data stream with your buffer_sync code?
> 

There are two reasons you cannot do SPU profiling and profiling on non
SPU events at the sametime.  1) the SPU PC values are routed on the
debug bus.  You cannot also route the signals for other non SPU events
on the debug bus since they will conflict.  Specifically, the signals
would get logically OR'd on the bus.  The exception is PPU cycles which
does not use the debug bus.  2) the hardware that captures the SPU
program counters has some shared components with the HW performance
counters.  To use the SPU program counter hardware, the performance
counter hardware must be disabled.

In summary, we cannot do SPU cycle profiling and non SPU event profiling
at the same time due to limitations in the hardware.


[cut]


> milton
> 


^ permalink raw reply	[flat|nested] 20+ messages in thread

end of thread, other threads:[~2007-02-12 16:39 UTC | newest]

Thread overview: 20+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2007-02-06  0:28 [RFC,PATCH] CELL PPU Oprofile SPU profiling updated patch Carl Love
2007-02-06  0:46 ` Paul Mackerras
2007-02-06  0:49 ` [Cbe-oss-dev] [RFC, PATCH] " Benjamin Herrenschmidt
2007-02-06 23:02 ` [Cbe-oss-dev] [RFC, PATCH] CELL " Carl Love
2007-02-07 15:41   ` Maynard Johnson
2007-02-07 22:48     ` Michael Ellerman
2007-02-08 15:03       ` Maynard Johnson
2007-02-08 14:18   ` Milton Miller
2007-02-08 17:21     ` Arnd Bergmann
2007-02-08 18:01       ` Adrian Reber
2007-02-08 22:51       ` Carl Love
2007-02-09  2:46         ` Milton Miller
2007-02-09 16:17           ` Carl Love
2007-02-11 22:46             ` Milton Miller
2007-02-12 16:38               ` Carl Love
2007-02-09 18:47       ` Milton Miller
2007-02-09 19:10         ` Arnd Bergmann
2007-02-09 19:46           ` Milton Miller
2007-02-08 23:59     ` Maynard Johnson
2007-02-09 18:03       ` Milton Miller

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).