Creation of Cybook 2416 (actually Gen4) repository
This commit is contained in:
56
kernel/Kconfig.hz
Normal file
56
kernel/Kconfig.hz
Normal file
@@ -0,0 +1,56 @@
|
||||
#
# Timer Interrupt Frequency Configuration
#

choice
	prompt "Timer frequency"
	default HZ_250
	help
	 Allows the configuration of the timer frequency. It is customary
	 to have the timer interrupt run at 1000 Hz but 100 Hz may be more
	 beneficial for servers and NUMA systems that do not need to have
	 a fast response for user interaction and that may experience bus
	 contention and cacheline bounces as a result of timer interrupts.
	 Note that the timer interrupt occurs on each processor in an SMP
	 environment leading to NR_CPUS * HZ number of timer interrupts
	 per second.


	config HZ_100
		bool "100 HZ"
	help
	  100 Hz is a typical choice for servers, SMP and NUMA systems
	  with lots of processors that may show reduced performance if
	  too many timer interrupts are occurring.

	config HZ_250
		bool "250 HZ"
	help
	 250 Hz is a good compromise choice allowing server performance
	 while also showing good interactive responsiveness even
	 on SMP and NUMA systems. If you are going to be using NTSC video
	 or multimedia, select HZ_300 instead.

	config HZ_300
		bool "300 HZ"
	help
	 300 Hz is a good compromise choice allowing server performance
	 while also showing good interactive responsiveness even
	 on SMP and NUMA systems and exactly dividing by both PAL and
	 NTSC frame rates for video and multimedia work.

	config HZ_1000
		bool "1000 HZ"
	help
	 1000 Hz is the preferred choice for desktop systems and other
	 systems requiring fast interactive responses to events.

endchoice

config HZ
	int
	default 100 if HZ_100
	default 250 if HZ_250
	default 300 if HZ_300
	default 1000 if HZ_1000
|
||||
|
||||
65
kernel/Kconfig.preempt
Normal file
65
kernel/Kconfig.preempt
Normal file
@@ -0,0 +1,65 @@
|
||||
|
||||
choice
	prompt "Preemption Model"
	default PREEMPT_NONE

config PREEMPT_NONE
	bool "No Forced Preemption (Server)"
	help
	  This is the traditional Linux preemption model, geared towards
	  throughput. It will still provide good latencies most of the
	  time, but there are no guarantees and occasional longer delays
	  are possible.

	  Select this option if you are building a kernel for a server or
	  scientific/computation system, or if you want to maximize the
	  raw processing power of the kernel, irrespective of scheduling
	  latencies.

config PREEMPT_VOLUNTARY
	bool "Voluntary Kernel Preemption (Desktop)"
	help
	  This option reduces the latency of the kernel by adding more
	  "explicit preemption points" to the kernel code. These new
	  preemption points have been selected to reduce the maximum
	  latency of rescheduling, providing faster application reactions,
	  at the cost of slightly lower throughput.

	  This allows reaction to interactive events by allowing a
	  low priority process to voluntarily preempt itself even if it
	  is in kernel mode executing a system call. This allows
	  applications to run more 'smoothly' even when the system is
	  under load.

	  Select this if you are building a kernel for a desktop system.

config PREEMPT
	bool "Preemptible Kernel (Low-Latency Desktop)"
	help
	  This option reduces the latency of the kernel by making
	  all kernel code (that is not executing in a critical section)
	  preemptible. This allows reaction to interactive events by
	  permitting a low priority process to be preempted involuntarily
	  even if it is in kernel mode executing a system call and would
	  otherwise not be about to reach a natural preemption point.
	  This allows applications to run more 'smoothly' even when the
	  system is under load, at the cost of slightly lower throughput
	  and a slight runtime overhead to kernel code.

	  Select this if you are building a kernel for a desktop or
	  embedded system with latency requirements in the milliseconds
	  range.

endchoice

config PREEMPT_BKL
	bool "Preempt The Big Kernel Lock"
	depends on SMP || PREEMPT
	default y
	help
	  This option reduces the latency of the kernel by making the
	  big kernel lock preemptible.

	  Say Y here if you are building a kernel for a desktop system.
	  Say N if you are unsure.
|
||||
|
||||
76
kernel/Makefile
Normal file
76
kernel/Makefile
Normal file
@@ -0,0 +1,76 @@
|
||||
#
# Makefile for the linux kernel.
#

obj-y     = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
	    exit.o itimer.o time.o softirq.o resource.o \
	    sysctl.o capability.o ptrace.o timer.o user.o \
	    signal.o sys.o kmod.o workqueue.o pid.o \
	    rcupdate.o extable.o params.o posix-timers.o \
	    kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
	    hrtimer.o rwsem.o latency.o nsproxy.o srcu.o

obj-$(CONFIG_STACKTRACE) += stacktrace.o
obj-y += time/
obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
obj-$(CONFIG_LOCKDEP) += lockdep.o
ifeq ($(CONFIG_PROC_FS),y)
obj-$(CONFIG_LOCKDEP) += lockdep_proc.o
endif
obj-$(CONFIG_FUTEX) += futex.o
ifeq ($(CONFIG_COMPAT),y)
obj-$(CONFIG_FUTEX) += futex_compat.o
endif
obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
obj-$(CONFIG_SMP) += cpu.o spinlock.o
obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
obj-$(CONFIG_UID16) += uid16.o
obj-$(CONFIG_MODULES) += module.o
obj-$(CONFIG_KALLSYMS) += kallsyms.o
obj-$(CONFIG_PM) += power/
obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
obj-$(CONFIG_KEXEC) += kexec.o
obj-$(CONFIG_COMPAT) += compat.o
obj-$(CONFIG_CPUSETS) += cpuset.o
obj-$(CONFIG_IKCONFIG) += configs.o
obj-$(CONFIG_STOP_MACHINE) += stop_machine.o
obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
obj-$(CONFIG_KPROBES) += kprobes.o
obj-$(CONFIG_SYSFS) += ksysfs.o
obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o
obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
obj-$(CONFIG_SECCOMP) += seccomp.o
obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
obj-$(CONFIG_RELAY) += relay.o
obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
obj-$(CONFIG_UTS_NS) += utsname.o
obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o

ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
# needed for x86 only.  Why this used to be enabled for all architectures is beyond
# me.  I suspect most platforms don't need this, but until we know that for sure
# I turn this off for IA-64 only.  Andreas Schwab says it's also needed on m68k
# to get a correct value for the wait-channel (WCHAN in ps). --davidm
CFLAGS_sched.o := $(PROFILING) -fno-omit-frame-pointer
endif

$(obj)/configs.o: $(obj)/config_data.h

# config_data.h contains the same information as ikconfig.h but gzipped.
# Info from config_data can be extracted from /proc/config*
targets += config_data.gz
$(obj)/config_data.gz: .config FORCE
	$(call if_changed,gzip)

quiet_cmd_ikconfiggz = IKCFG   $@
      cmd_ikconfiggz = (echo "static const char kernel_config_data[] = MAGIC_START"; cat $< | scripts/bin2c; echo "MAGIC_END;") > $@
targets += config_data.h
$(obj)/config_data.h: $(obj)/config_data.gz FORCE
	$(call if_changed,ikconfiggz)
|
||||
599
kernel/acct.c
Normal file
599
kernel/acct.c
Normal file
@@ -0,0 +1,599 @@
|
||||
/*
|
||||
* linux/kernel/acct.c
|
||||
*
|
||||
* BSD Process Accounting for Linux
|
||||
*
|
||||
* Author: Marco van Wieringen <mvw@planets.elm.net>
|
||||
*
|
||||
* Some code based on ideas and code from:
|
||||
* Thomas K. Dyas <tdyas@eden.rutgers.edu>
|
||||
*
|
||||
* This file implements BSD-style process accounting. Whenever any
|
||||
* process exits, an accounting record of type "struct acct" is
|
||||
* written to the file specified with the acct() system call. It is
|
||||
* up to user-level programs to do useful things with the accounting
|
||||
* log. The kernel just provides the raw accounting information.
|
||||
*
|
||||
* (C) Copyright 1995 - 1997 Marco van Wieringen - ELM Consultancy B.V.
|
||||
*
|
||||
* Plugged two leaks. 1) It didn't return acct_file into the free_filps if
|
||||
* the file happened to be read-only. 2) If the accounting was suspended
|
||||
* due to the lack of space it happily allowed to reopen it and completely
|
||||
* lost the old acct_file. 3/10/98, Al Viro.
|
||||
*
|
||||
* Now we silently close acct_file on attempt to reopen. Cleaned sys_acct().
|
||||
* XTerms and EMACS are manifestations of pure evil. 21/10/98, AV.
|
||||
*
|
||||
* Fixed a nasty interaction with with sys_umount(). If the accointing
|
||||
* was suspeneded we failed to stop it on umount(). Messy.
|
||||
* Another one: remount to readonly didn't stop accounting.
|
||||
* Question: what should we do if we have CAP_SYS_ADMIN but not
|
||||
* CAP_SYS_PACCT? Current code does the following: umount returns -EBUSY
|
||||
* unless we are messing with the root. In that case we are getting a
|
||||
* real mess with do_remount_sb(). 9/11/98, AV.
|
||||
*
|
||||
* Fixed a bunch of races (and pair of leaks). Probably not the best way,
|
||||
* but this one obviously doesn't introduce deadlocks. Later. BTW, found
|
||||
* one race (and leak) in BSD implementation.
|
||||
* OK, that's better. ANOTHER race and leak in BSD variant. There always
|
||||
* is one more bug... 10/11/98, AV.
|
||||
*
|
||||
* Oh, fsck... Oopsable SMP race in do_process_acct() - we must hold
|
||||
* ->mmap_sem to walk the vma list of current->mm. Nasty, since it leaks
|
||||
* a struct file opened for write. Fixed. 2/6/2000, AV.
|
||||
*/
|
||||
|
||||
#include <linux/mm.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/acct.h>
|
||||
#include <linux/capability.h>
|
||||
#include <linux/file.h>
|
||||
#include <linux/tty.h>
|
||||
#include <linux/security.h>
|
||||
#include <linux/vfs.h>
|
||||
#include <linux/jiffies.h>
|
||||
#include <linux/times.h>
|
||||
#include <linux/syscalls.h>
|
||||
#include <linux/mount.h>
|
||||
#include <asm/uaccess.h>
|
||||
#include <asm/div64.h>
|
||||
#include <linux/blkdev.h> /* sector_div */
|
||||
|
||||
/*
 * These constants control the amount of freespace that suspend and
 * resume the process accounting system, and the time delay between
 * each check.
 * Turned into sysctl-controllable parameters. AV, 12/11/98
 */

/* {RESUME, SUSPEND, ACCT_TIMEOUT} defaults; exposed via sysctl */
int acct_parm[3] = {4, 2, 30};
#define RESUME		(acct_parm[0])	/* >foo% free space - resume */
#define SUSPEND		(acct_parm[1])	/* <foo% free space - suspend */
#define ACCT_TIMEOUT	(acct_parm[2])	/* foo second timeout between checks */
|
||||
|
||||
/*
|
||||
* External references and all of the globals.
|
||||
*/
|
||||
static void do_acct_process(struct file *);
|
||||
|
||||
/*
|
||||
* This structure is used so that all the data protected by lock
|
||||
* can be placed in the same cache line as the lock. This primes
|
||||
* the cache line to have the data after getting the lock.
|
||||
*/
|
||||
struct acct_glbs {
|
||||
spinlock_t lock;
|
||||
volatile int active;
|
||||
volatile int needcheck;
|
||||
struct file *file;
|
||||
struct timer_list timer;
|
||||
};
|
||||
|
||||
static struct acct_glbs acct_globals __cacheline_aligned =
|
||||
{__SPIN_LOCK_UNLOCKED(acct_globals.lock)};
|
||||
|
||||
/*
|
||||
* Called whenever the timer says to check the free space.
|
||||
*/
|
||||
static void acct_timeout(unsigned long unused)
|
||||
{
|
||||
acct_globals.needcheck = 1;
|
||||
}
|
||||
|
||||
/*
|
||||
* Check the amount of free space and suspend/resume accordingly.
|
||||
*/
|
||||
static int check_free_space(struct file *file)
|
||||
{
|
||||
struct kstatfs sbuf;
|
||||
int res;
|
||||
int act;
|
||||
sector_t resume;
|
||||
sector_t suspend;
|
||||
|
||||
spin_lock(&acct_globals.lock);
|
||||
res = acct_globals.active;
|
||||
if (!file || !acct_globals.needcheck)
|
||||
goto out;
|
||||
spin_unlock(&acct_globals.lock);
|
||||
|
||||
/* May block */
|
||||
if (vfs_statfs(file->f_path.dentry, &sbuf))
|
||||
return res;
|
||||
suspend = sbuf.f_blocks * SUSPEND;
|
||||
resume = sbuf.f_blocks * RESUME;
|
||||
|
||||
sector_div(suspend, 100);
|
||||
sector_div(resume, 100);
|
||||
|
||||
if (sbuf.f_bavail <= suspend)
|
||||
act = -1;
|
||||
else if (sbuf.f_bavail >= resume)
|
||||
act = 1;
|
||||
else
|
||||
act = 0;
|
||||
|
||||
/*
|
||||
* If some joker switched acct_globals.file under us we'ld better be
|
||||
* silent and _not_ touch anything.
|
||||
*/
|
||||
spin_lock(&acct_globals.lock);
|
||||
if (file != acct_globals.file) {
|
||||
if (act)
|
||||
res = act>0;
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (acct_globals.active) {
|
||||
if (act < 0) {
|
||||
acct_globals.active = 0;
|
||||
printk(KERN_INFO "Process accounting paused\n");
|
||||
}
|
||||
} else {
|
||||
if (act > 0) {
|
||||
acct_globals.active = 1;
|
||||
printk(KERN_INFO "Process accounting resumed\n");
|
||||
}
|
||||
}
|
||||
|
||||
del_timer(&acct_globals.timer);
|
||||
acct_globals.needcheck = 0;
|
||||
acct_globals.timer.expires = jiffies + ACCT_TIMEOUT*HZ;
|
||||
add_timer(&acct_globals.timer);
|
||||
res = acct_globals.active;
|
||||
out:
|
||||
spin_unlock(&acct_globals.lock);
|
||||
return res;
|
||||
}
|
||||
|
||||
/*
|
||||
* Close the old accounting file (if currently open) and then replace
|
||||
* it with file (if non-NULL).
|
||||
*
|
||||
* NOTE: acct_globals.lock MUST be held on entry and exit.
|
||||
*/
|
||||
static void acct_file_reopen(struct file *file)
|
||||
{
|
||||
struct file *old_acct = NULL;
|
||||
|
||||
if (acct_globals.file) {
|
||||
old_acct = acct_globals.file;
|
||||
del_timer(&acct_globals.timer);
|
||||
acct_globals.active = 0;
|
||||
acct_globals.needcheck = 0;
|
||||
acct_globals.file = NULL;
|
||||
}
|
||||
if (file) {
|
||||
acct_globals.file = file;
|
||||
acct_globals.needcheck = 0;
|
||||
acct_globals.active = 1;
|
||||
/* It's been deleted if it was used before so this is safe */
|
||||
init_timer(&acct_globals.timer);
|
||||
acct_globals.timer.function = acct_timeout;
|
||||
acct_globals.timer.expires = jiffies + ACCT_TIMEOUT*HZ;
|
||||
add_timer(&acct_globals.timer);
|
||||
}
|
||||
if (old_acct) {
|
||||
mnt_unpin(old_acct->f_path.mnt);
|
||||
spin_unlock(&acct_globals.lock);
|
||||
do_acct_process(old_acct);
|
||||
filp_close(old_acct, NULL);
|
||||
spin_lock(&acct_globals.lock);
|
||||
}
|
||||
}
|
||||
|
||||
static int acct_on(char *name)
|
||||
{
|
||||
struct file *file;
|
||||
int error;
|
||||
|
||||
/* Difference from BSD - they don't do O_APPEND */
|
||||
file = filp_open(name, O_WRONLY|O_APPEND|O_LARGEFILE, 0);
|
||||
if (IS_ERR(file))
|
||||
return PTR_ERR(file);
|
||||
|
||||
if (!S_ISREG(file->f_path.dentry->d_inode->i_mode)) {
|
||||
filp_close(file, NULL);
|
||||
return -EACCES;
|
||||
}
|
||||
|
||||
if (!file->f_op->write) {
|
||||
filp_close(file, NULL);
|
||||
return -EIO;
|
||||
}
|
||||
|
||||
error = security_acct(file);
|
||||
if (error) {
|
||||
filp_close(file, NULL);
|
||||
return error;
|
||||
}
|
||||
|
||||
spin_lock(&acct_globals.lock);
|
||||
mnt_pin(file->f_path.mnt);
|
||||
acct_file_reopen(file);
|
||||
spin_unlock(&acct_globals.lock);
|
||||
|
||||
mntput(file->f_path.mnt); /* it's pinned, now give up active reference */
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* sys_acct - enable/disable process accounting
|
||||
* @name: file name for accounting records or NULL to shutdown accounting
|
||||
*
|
||||
* Returns 0 for success or negative errno values for failure.
|
||||
*
|
||||
* sys_acct() is the only system call needed to implement process
|
||||
* accounting. It takes the name of the file where accounting records
|
||||
* should be written. If the filename is NULL, accounting will be
|
||||
* shutdown.
|
||||
*/
|
||||
asmlinkage long sys_acct(const char __user *name)
|
||||
{
|
||||
int error;
|
||||
|
||||
if (!capable(CAP_SYS_PACCT))
|
||||
return -EPERM;
|
||||
|
||||
if (name) {
|
||||
char *tmp = getname(name);
|
||||
if (IS_ERR(tmp))
|
||||
return (PTR_ERR(tmp));
|
||||
error = acct_on(tmp);
|
||||
putname(tmp);
|
||||
} else {
|
||||
error = security_acct(NULL);
|
||||
if (!error) {
|
||||
spin_lock(&acct_globals.lock);
|
||||
acct_file_reopen(NULL);
|
||||
spin_unlock(&acct_globals.lock);
|
||||
}
|
||||
}
|
||||
return error;
|
||||
}
|
||||
|
||||
/**
|
||||
* acct_auto_close - turn off a filesystem's accounting if it is on
|
||||
* @m: vfsmount being shut down
|
||||
*
|
||||
* If the accounting is turned on for a file in the subtree pointed to
|
||||
* to by m, turn accounting off. Done when m is about to die.
|
||||
*/
|
||||
void acct_auto_close_mnt(struct vfsmount *m)
|
||||
{
|
||||
spin_lock(&acct_globals.lock);
|
||||
if (acct_globals.file && acct_globals.file->f_path.mnt == m)
|
||||
acct_file_reopen(NULL);
|
||||
spin_unlock(&acct_globals.lock);
|
||||
}
|
||||
|
||||
/**
|
||||
* acct_auto_close - turn off a filesystem's accounting if it is on
|
||||
* @sb: super block for the filesystem
|
||||
*
|
||||
* If the accounting is turned on for a file in the filesystem pointed
|
||||
* to by sb, turn accounting off.
|
||||
*/
|
||||
void acct_auto_close(struct super_block *sb)
|
||||
{
|
||||
spin_lock(&acct_globals.lock);
|
||||
if (acct_globals.file &&
|
||||
acct_globals.file->f_path.mnt->mnt_sb == sb) {
|
||||
acct_file_reopen(NULL);
|
||||
}
|
||||
spin_unlock(&acct_globals.lock);
|
||||
}
|
||||
|
||||
/*
|
||||
* encode an unsigned long into a comp_t
|
||||
*
|
||||
* This routine has been adopted from the encode_comp_t() function in
|
||||
* the kern_acct.c file of the FreeBSD operating system. The encoding
|
||||
* is a 13-bit fraction with a 3-bit (base 8) exponent.
|
||||
*/
|
||||
|
||||
#define MANTSIZE 13 /* 13 bit mantissa. */
|
||||
#define EXPSIZE 3 /* Base 8 (3 bit) exponent. */
|
||||
#define MAXFRACT ((1 << MANTSIZE) - 1) /* Maximum fractional value. */
|
||||
|
||||
static comp_t encode_comp_t(unsigned long value)
|
||||
{
|
||||
int exp, rnd;
|
||||
|
||||
exp = rnd = 0;
|
||||
while (value > MAXFRACT) {
|
||||
rnd = value & (1 << (EXPSIZE - 1)); /* Round up? */
|
||||
value >>= EXPSIZE; /* Base 8 exponent == 3 bit shift. */
|
||||
exp++;
|
||||
}
|
||||
|
||||
/*
|
||||
* If we need to round up, do it (and handle overflow correctly).
|
||||
*/
|
||||
if (rnd && (++value > MAXFRACT)) {
|
||||
value >>= EXPSIZE;
|
||||
exp++;
|
||||
}
|
||||
|
||||
/*
|
||||
* Clean it up and polish it off.
|
||||
*/
|
||||
exp <<= MANTSIZE; /* Shift the exponent into place */
|
||||
exp += value; /* and add on the mantissa. */
|
||||
return exp;
|
||||
}
|
||||
|
||||
#if ACCT_VERSION==1 || ACCT_VERSION==2
/*
 * encode an u64 into a comp2_t (24 bits)
 *
 * Format: 5 bit base 2 exponent, 20 bits mantissa.
 * The leading bit of the mantissa is not stored, but implied for
 * non-zero exponents.
 * Largest encodable value is 50 bits.
 */

#define MANTSIZE2       20                      /* 20 bit mantissa. */
#define EXPSIZE2        5                       /* 5 bit base 2 exponent. */
#define MAXFRACT2       ((1ul << MANTSIZE2) - 1) /* Maximum fractional value. */
#define MAXEXP2         ((1 <<EXPSIZE2) - 1)    /* Maximum exponent. */

static comp2_t encode_comp2_t(u64 value)
{
	int exp, rnd;

	exp = (value > (MAXFRACT2>>1));
	rnd = 0;
	while (value > MAXFRACT2) {
		rnd = value & 1;
		value >>= 1;
		exp++;
	}

	/*
	 * If we need to round up, do it (and handle overflow correctly).
	 */
	if (rnd && (++value > MAXFRACT2)) {
		value >>= 1;
		exp++;
	}

	if (exp > MAXEXP2) {
		/* Overflow. Return largest representable number instead. */
		return (1ul << (MANTSIZE2+EXPSIZE2-1)) - 1;
	} else {
		return (value & (MAXFRACT2>>1)) | (exp << (MANTSIZE2-1));
	}
}
#endif
|
||||
|
||||
#if ACCT_VERSION==3
/*
 * encode an u64 into a 32 bit IEEE float
 * (bit-twiddled by hand; no floating point math is used)
 */
static u32 encode_float(u64 value)
{
	unsigned exp = 190;	/* biased exponent for a 63-bit-normalized value */
	unsigned u;

	if (value==0) return 0;
	/* normalize: shift until the top (sign) bit is set */
	while ((s64)value > 0){
		value <<= 1;
		exp--;
	}
	u = (u32)(value >> 40) & 0x7fffffu;	/* keep 23 mantissa bits */
	return u | (exp << 23);
}
#endif
|
||||
|
||||
/*
|
||||
* Write an accounting entry for an exiting process
|
||||
*
|
||||
* The acct_process() call is the workhorse of the process
|
||||
* accounting system. The struct acct is built here and then written
|
||||
* into the accounting file. This function should only be called from
|
||||
* do_exit().
|
||||
*/
|
||||
|
||||
/*
|
||||
* do_acct_process does all actual work. Caller holds the reference to file.
|
||||
*/
|
||||
static void do_acct_process(struct file *file)
|
||||
{
|
||||
struct pacct_struct *pacct = ¤t->signal->pacct;
|
||||
acct_t ac;
|
||||
mm_segment_t fs;
|
||||
unsigned long flim;
|
||||
u64 elapsed;
|
||||
u64 run_time;
|
||||
struct timespec uptime;
|
||||
struct tty_struct *tty;
|
||||
|
||||
/*
|
||||
* First check to see if there is enough free_space to continue
|
||||
* the process accounting system.
|
||||
*/
|
||||
if (!check_free_space(file))
|
||||
return;
|
||||
|
||||
/*
|
||||
* Fill the accounting struct with the needed info as recorded
|
||||
* by the different kernel functions.
|
||||
*/
|
||||
memset((caddr_t)&ac, 0, sizeof(acct_t));
|
||||
|
||||
ac.ac_version = ACCT_VERSION | ACCT_BYTEORDER;
|
||||
strlcpy(ac.ac_comm, current->comm, sizeof(ac.ac_comm));
|
||||
|
||||
/* calculate run_time in nsec*/
|
||||
do_posix_clock_monotonic_gettime(&uptime);
|
||||
run_time = (u64)uptime.tv_sec*NSEC_PER_SEC + uptime.tv_nsec;
|
||||
run_time -= (u64)current->group_leader->start_time.tv_sec * NSEC_PER_SEC
|
||||
+ current->group_leader->start_time.tv_nsec;
|
||||
/* convert nsec -> AHZ */
|
||||
elapsed = nsec_to_AHZ(run_time);
|
||||
#if ACCT_VERSION==3
|
||||
ac.ac_etime = encode_float(elapsed);
|
||||
#else
|
||||
ac.ac_etime = encode_comp_t(elapsed < (unsigned long) -1l ?
|
||||
(unsigned long) elapsed : (unsigned long) -1l);
|
||||
#endif
|
||||
#if ACCT_VERSION==1 || ACCT_VERSION==2
|
||||
{
|
||||
/* new enlarged etime field */
|
||||
comp2_t etime = encode_comp2_t(elapsed);
|
||||
ac.ac_etime_hi = etime >> 16;
|
||||
ac.ac_etime_lo = (u16) etime;
|
||||
}
|
||||
#endif
|
||||
do_div(elapsed, AHZ);
|
||||
ac.ac_btime = xtime.tv_sec - elapsed;
|
||||
/* we really need to bite the bullet and change layout */
|
||||
ac.ac_uid = current->uid;
|
||||
ac.ac_gid = current->gid;
|
||||
#if ACCT_VERSION==2
|
||||
ac.ac_ahz = AHZ;
|
||||
#endif
|
||||
#if ACCT_VERSION==1 || ACCT_VERSION==2
|
||||
/* backward-compatible 16 bit fields */
|
||||
ac.ac_uid16 = current->uid;
|
||||
ac.ac_gid16 = current->gid;
|
||||
#endif
|
||||
#if ACCT_VERSION==3
|
||||
ac.ac_pid = current->tgid;
|
||||
ac.ac_ppid = current->parent->tgid;
|
||||
#endif
|
||||
|
||||
spin_lock_irq(¤t->sighand->siglock);
|
||||
tty = current->signal->tty;
|
||||
ac.ac_tty = tty ? old_encode_dev(tty_devnum(tty)) : 0;
|
||||
ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime)));
|
||||
ac.ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime)));
|
||||
ac.ac_flag = pacct->ac_flag;
|
||||
ac.ac_mem = encode_comp_t(pacct->ac_mem);
|
||||
ac.ac_minflt = encode_comp_t(pacct->ac_minflt);
|
||||
ac.ac_majflt = encode_comp_t(pacct->ac_majflt);
|
||||
ac.ac_exitcode = pacct->ac_exitcode;
|
||||
spin_unlock_irq(¤t->sighand->siglock);
|
||||
ac.ac_io = encode_comp_t(0 /* current->io_usage */); /* %% */
|
||||
ac.ac_rw = encode_comp_t(ac.ac_io / 1024);
|
||||
ac.ac_swaps = encode_comp_t(0);
|
||||
|
||||
/*
|
||||
* Kernel segment override to datasegment and write it
|
||||
* to the accounting file.
|
||||
*/
|
||||
fs = get_fs();
|
||||
set_fs(KERNEL_DS);
|
||||
/*
|
||||
* Accounting records are not subject to resource limits.
|
||||
*/
|
||||
flim = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
|
||||
current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
|
||||
file->f_op->write(file, (char *)&ac,
|
||||
sizeof(acct_t), &file->f_pos);
|
||||
current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim;
|
||||
set_fs(fs);
|
||||
}
|
||||
|
||||
/**
|
||||
* acct_init_pacct - initialize a new pacct_struct
|
||||
* @pacct: per-process accounting info struct to initialize
|
||||
*/
|
||||
void acct_init_pacct(struct pacct_struct *pacct)
|
||||
{
|
||||
memset(pacct, 0, sizeof(struct pacct_struct));
|
||||
pacct->ac_utime = pacct->ac_stime = cputime_zero;
|
||||
}
|
||||
|
||||
/**
|
||||
* acct_collect - collect accounting information into pacct_struct
|
||||
* @exitcode: task exit code
|
||||
* @group_dead: not 0, if this thread is the last one in the process.
|
||||
*/
|
||||
void acct_collect(long exitcode, int group_dead)
|
||||
{
|
||||
struct pacct_struct *pacct = ¤t->signal->pacct;
|
||||
unsigned long vsize = 0;
|
||||
|
||||
if (group_dead && current->mm) {
|
||||
struct vm_area_struct *vma;
|
||||
down_read(¤t->mm->mmap_sem);
|
||||
vma = current->mm->mmap;
|
||||
while (vma) {
|
||||
vsize += vma->vm_end - vma->vm_start;
|
||||
vma = vma->vm_next;
|
||||
}
|
||||
up_read(¤t->mm->mmap_sem);
|
||||
}
|
||||
|
||||
spin_lock_irq(¤t->sighand->siglock);
|
||||
if (group_dead)
|
||||
pacct->ac_mem = vsize / 1024;
|
||||
if (thread_group_leader(current)) {
|
||||
pacct->ac_exitcode = exitcode;
|
||||
if (current->flags & PF_FORKNOEXEC)
|
||||
pacct->ac_flag |= AFORK;
|
||||
}
|
||||
if (current->flags & PF_SUPERPRIV)
|
||||
pacct->ac_flag |= ASU;
|
||||
if (current->flags & PF_DUMPCORE)
|
||||
pacct->ac_flag |= ACORE;
|
||||
if (current->flags & PF_SIGNALED)
|
||||
pacct->ac_flag |= AXSIG;
|
||||
pacct->ac_utime = cputime_add(pacct->ac_utime, current->utime);
|
||||
pacct->ac_stime = cputime_add(pacct->ac_stime, current->stime);
|
||||
pacct->ac_minflt += current->min_flt;
|
||||
pacct->ac_majflt += current->maj_flt;
|
||||
spin_unlock_irq(¤t->sighand->siglock);
|
||||
}
|
||||
|
||||
/**
|
||||
* acct_process - now just a wrapper around do_acct_process
|
||||
* @exitcode: task exit code
|
||||
*
|
||||
* handles process accounting for an exiting task
|
||||
*/
|
||||
void acct_process(void)
|
||||
{
|
||||
struct file *file = NULL;
|
||||
|
||||
/*
|
||||
* accelerate the common fastpath:
|
||||
*/
|
||||
if (!acct_globals.file)
|
||||
return;
|
||||
|
||||
spin_lock(&acct_globals.lock);
|
||||
file = acct_globals.file;
|
||||
if (unlikely(!file)) {
|
||||
spin_unlock(&acct_globals.lock);
|
||||
return;
|
||||
}
|
||||
get_file(file);
|
||||
spin_unlock(&acct_globals.lock);
|
||||
|
||||
do_acct_process(file);
|
||||
fput(file);
|
||||
}
|
||||
1313
kernel/audit.c
Normal file
1313
kernel/audit.c
Normal file
File diff suppressed because it is too large
Load Diff
147
kernel/audit.h
Normal file
147
kernel/audit.h
Normal file
@@ -0,0 +1,147 @@
|
||||
/* audit -- definition of audit_context structure and supporting types
|
||||
*
|
||||
* Copyright 2003-2004 Red Hat, Inc.
|
||||
* Copyright 2005 Hewlett-Packard Development Company, L.P.
|
||||
* Copyright 2005 IBM Corporation
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
*/
|
||||
|
||||
#include <linux/fs.h>
#include <linux/audit.h>
#include <linux/skbuff.h>

/* 0 = no checking
   1 = put_count checking
   2 = verbose put_count checking
*/
#define AUDIT_DEBUG 0

/* At task start time, the audit_state is set in the audit_context using
   a per-task filter.  At syscall entry, the audit_state is augmented by
   the syscall filter. */
enum audit_state {
	AUDIT_DISABLED,		/* Do not create per-task audit_context.
				 * No syscall-specific audit records can
				 * be generated. */
	AUDIT_SETUP_CONTEXT,	/* Create the per-task audit_context,
				 * but don't necessarily fill it in at
				 * syscall entry time (i.e., filter
				 * instead). */
	AUDIT_BUILD_CONTEXT,	/* Create the per-task audit_context,
				 * and always fill it in at syscall
				 * entry time.  This makes a full
				 * syscall record available if some
				 * other part of the kernel decides it
				 * should be recorded. */
	AUDIT_RECORD_CONTEXT	/* Create the per-task audit_context,
				 * always fill it in at syscall entry
				 * time, and always write out the audit
				 * record at syscall exit time.  */
};

/* Rule lists */
struct audit_parent;

struct audit_watch {
	atomic_t		count;	/* reference count */
	char			*path;	/* insertion path */
	dev_t			dev;	/* associated superblock device */
	unsigned long		ino;	/* associated inode number */
	struct audit_parent	*parent; /* associated parent */
	struct list_head	wlist;	/* entry in parent->watches list */
	struct list_head	rules;	/* associated rules */
};

struct audit_field {
	u32				type;
	u32				val;
	u32				op;
	char				*se_str;
	struct selinux_audit_rule	*se_rule;
};

struct audit_krule {
	int			vers_ops;
	u32			flags;
	u32			listnr;
	u32			action;
	u32			mask[AUDIT_BITMASK_SIZE];
	u32			buflen;	/* for data alloc on list rules */
	u32			field_count;
	char			*filterkey; /* ties events to rules */
	struct audit_field	*fields;
	struct audit_field	*inode_f; /* quick access to an inode field */
	struct audit_watch	*watch;	/* associated watch */
	struct list_head	rlist;	/* entry in audit_watch.rules list */
};

struct audit_entry {
	struct list_head	list;
	struct rcu_head		rcu;
	struct audit_krule	rule;
};

extern int audit_pid;

#define AUDIT_INODE_BUCKETS	32
extern struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS];

static inline int audit_hash_ino(u32 ino)
{
	return (ino & (AUDIT_INODE_BUCKETS-1));
}

extern int audit_match_class(int class, unsigned syscall);
extern int audit_comparator(const u32 left, const u32 op, const u32 right);
extern int audit_compare_dname_path(const char *dname, const char *path,
				    int *dirlen);
extern struct sk_buff *	    audit_make_reply(int pid, int seq, int type,
					     int done, int multi,
					     void *payload, int size);
extern void		    audit_send_reply(int pid, int seq, int type,
					     int done, int multi,
					     void *payload, int size);
extern void		    audit_log_lost(const char *message);
extern void		    audit_panic(const char *message);

struct audit_netlink_list {
	int pid;
	struct sk_buff_head q;
};

int audit_send_list(void *);

struct inotify_watch;
extern void audit_free_parent(struct inotify_watch *);
extern void audit_handle_ievent(struct inotify_watch *, u32, u32, u32,
				const char *, struct inode *);
extern int selinux_audit_rule_update(void);

#ifdef CONFIG_AUDITSYSCALL
extern void __audit_signal_info(int sig, struct task_struct *t);
static inline void audit_signal_info(int sig, struct task_struct *t)
{
	if (unlikely(audit_pid && t->tgid == audit_pid))
		__audit_signal_info(sig, t);
}
extern enum audit_state audit_filter_inodes(struct task_struct *,
					    struct audit_context *);
extern void audit_set_auditable(struct audit_context *);
#else
#define audit_signal_info(s,t)
#define audit_filter_inodes(t,c) AUDIT_DISABLED
#define audit_set_auditable(c)
#endif
|
||||
1745
kernel/auditfilter.c
Normal file
1745
kernel/auditfilter.c
Normal file
File diff suppressed because it is too large
Load Diff
1937
kernel/auditsc.c
Normal file
1937
kernel/auditsc.c
Normal file
File diff suppressed because it is too large
Load Diff
253
kernel/capability.c
Normal file
253
kernel/capability.c
Normal file
@@ -0,0 +1,253 @@
|
||||
/*
|
||||
* linux/kernel/capability.c
|
||||
*
|
||||
* Copyright (C) 1997 Andrew Main <zefram@fysh.org>
|
||||
*
|
||||
* Integrated into 2.1.97+, Andrew G. Morgan <morgan@transmeta.com>
|
||||
* 30 May 2002: Cleanup, Robert M. Love <rml@tech9.net>
|
||||
*/
|
||||
|
||||
#include <linux/capability.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/security.h>
|
||||
#include <linux/syscalls.h>
|
||||
#include <asm/uaccess.h>
|
||||
|
||||
unsigned securebits = SECUREBITS_DEFAULT; /* systemwide security settings */
|
||||
kernel_cap_t cap_bset = CAP_INIT_EFF_SET;
|
||||
|
||||
EXPORT_SYMBOL(securebits);
|
||||
EXPORT_SYMBOL(cap_bset);
|
||||
|
||||
/*
|
||||
* This lock protects task->cap_* for all tasks including current.
|
||||
* Locking rule: acquire this prior to tasklist_lock.
|
||||
*/
|
||||
static DEFINE_SPINLOCK(task_capability_lock);
|
||||
|
||||
/*
|
||||
* For sys_getproccap() and sys_setproccap(), any of the three
|
||||
* capability set pointers may be NULL -- indicating that that set is
|
||||
* uninteresting and/or not to be changed.
|
||||
*/
|
||||
|
||||
/**
|
||||
* sys_capget - get the capabilities of a given process.
|
||||
* @header: pointer to struct that contains capability version and
|
||||
* target pid data
|
||||
* @dataptr: pointer to struct that contains the effective, permitted,
|
||||
* and inheritable capabilities that are returned
|
||||
*
|
||||
* Returns 0 on success and < 0 on error.
|
||||
*/
|
||||
asmlinkage long sys_capget(cap_user_header_t header, cap_user_data_t dataptr)
|
||||
{
|
||||
int ret = 0;
|
||||
pid_t pid;
|
||||
__u32 version;
|
||||
struct task_struct *target;
|
||||
struct __user_cap_data_struct data;
|
||||
|
||||
if (get_user(version, &header->version))
|
||||
return -EFAULT;
|
||||
|
||||
if (version != _LINUX_CAPABILITY_VERSION) {
|
||||
if (put_user(_LINUX_CAPABILITY_VERSION, &header->version))
|
||||
return -EFAULT;
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
if (get_user(pid, &header->pid))
|
||||
return -EFAULT;
|
||||
|
||||
if (pid < 0)
|
||||
return -EINVAL;
|
||||
|
||||
spin_lock(&task_capability_lock);
|
||||
read_lock(&tasklist_lock);
|
||||
|
||||
if (pid && pid != current->pid) {
|
||||
target = find_task_by_pid(pid);
|
||||
if (!target) {
|
||||
ret = -ESRCH;
|
||||
goto out;
|
||||
}
|
||||
} else
|
||||
target = current;
|
||||
|
||||
ret = security_capget(target, &data.effective, &data.inheritable, &data.permitted);
|
||||
|
||||
out:
|
||||
read_unlock(&tasklist_lock);
|
||||
spin_unlock(&task_capability_lock);
|
||||
|
||||
if (!ret && copy_to_user(dataptr, &data, sizeof data))
|
||||
return -EFAULT;
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* cap_set_pg - set capabilities for all processes in a given process
|
||||
* group. We call this holding task_capability_lock and tasklist_lock.
|
||||
*/
|
||||
static inline int cap_set_pg(int pgrp_nr, kernel_cap_t *effective,
|
||||
kernel_cap_t *inheritable,
|
||||
kernel_cap_t *permitted)
|
||||
{
|
||||
struct task_struct *g, *target;
|
||||
int ret = -EPERM;
|
||||
int found = 0;
|
||||
struct pid *pgrp;
|
||||
|
||||
pgrp = find_pid(pgrp_nr);
|
||||
do_each_pid_task(pgrp, PIDTYPE_PGID, g) {
|
||||
target = g;
|
||||
while_each_thread(g, target) {
|
||||
if (!security_capset_check(target, effective,
|
||||
inheritable,
|
||||
permitted)) {
|
||||
security_capset_set(target, effective,
|
||||
inheritable,
|
||||
permitted);
|
||||
ret = 0;
|
||||
}
|
||||
found = 1;
|
||||
}
|
||||
} while_each_pid_task(pgrp, PIDTYPE_PGID, g);
|
||||
|
||||
if (!found)
|
||||
ret = 0;
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* cap_set_all - set capabilities for all processes other than init
|
||||
* and self. We call this holding task_capability_lock and tasklist_lock.
|
||||
*/
|
||||
static inline int cap_set_all(kernel_cap_t *effective,
|
||||
kernel_cap_t *inheritable,
|
||||
kernel_cap_t *permitted)
|
||||
{
|
||||
struct task_struct *g, *target;
|
||||
int ret = -EPERM;
|
||||
int found = 0;
|
||||
|
||||
do_each_thread(g, target) {
|
||||
if (target == current || is_init(target))
|
||||
continue;
|
||||
found = 1;
|
||||
if (security_capset_check(target, effective, inheritable,
|
||||
permitted))
|
||||
continue;
|
||||
ret = 0;
|
||||
security_capset_set(target, effective, inheritable, permitted);
|
||||
} while_each_thread(g, target);
|
||||
|
||||
if (!found)
|
||||
ret = 0;
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**
|
||||
* sys_capset - set capabilities for a process or a group of processes
|
||||
* @header: pointer to struct that contains capability version and
|
||||
* target pid data
|
||||
* @data: pointer to struct that contains the effective, permitted,
|
||||
* and inheritable capabilities
|
||||
*
|
||||
* Set capabilities for a given process, all processes, or all
|
||||
* processes in a given process group.
|
||||
*
|
||||
* The restrictions on setting capabilities are specified as:
|
||||
*
|
||||
* [pid is for the 'target' task. 'current' is the calling task.]
|
||||
*
|
||||
* I: any raised capabilities must be a subset of the (old current) permitted
|
||||
* P: any raised capabilities must be a subset of the (old current) permitted
|
||||
* E: must be set to a subset of (new target) permitted
|
||||
*
|
||||
* Returns 0 on success and < 0 on error.
|
||||
*/
|
||||
asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data)
|
||||
{
|
||||
kernel_cap_t inheritable, permitted, effective;
|
||||
__u32 version;
|
||||
struct task_struct *target;
|
||||
int ret;
|
||||
pid_t pid;
|
||||
|
||||
if (get_user(version, &header->version))
|
||||
return -EFAULT;
|
||||
|
||||
if (version != _LINUX_CAPABILITY_VERSION) {
|
||||
if (put_user(_LINUX_CAPABILITY_VERSION, &header->version))
|
||||
return -EFAULT;
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
if (get_user(pid, &header->pid))
|
||||
return -EFAULT;
|
||||
|
||||
if (pid && pid != current->pid && !capable(CAP_SETPCAP))
|
||||
return -EPERM;
|
||||
|
||||
if (copy_from_user(&effective, &data->effective, sizeof(effective)) ||
|
||||
copy_from_user(&inheritable, &data->inheritable, sizeof(inheritable)) ||
|
||||
copy_from_user(&permitted, &data->permitted, sizeof(permitted)))
|
||||
return -EFAULT;
|
||||
|
||||
spin_lock(&task_capability_lock);
|
||||
read_lock(&tasklist_lock);
|
||||
|
||||
if (pid > 0 && pid != current->pid) {
|
||||
target = find_task_by_pid(pid);
|
||||
if (!target) {
|
||||
ret = -ESRCH;
|
||||
goto out;
|
||||
}
|
||||
} else
|
||||
target = current;
|
||||
|
||||
ret = 0;
|
||||
|
||||
/* having verified that the proposed changes are legal,
|
||||
we now put them into effect. */
|
||||
if (pid < 0) {
|
||||
if (pid == -1) /* all procs other than current and init */
|
||||
ret = cap_set_all(&effective, &inheritable, &permitted);
|
||||
|
||||
else /* all procs in process group */
|
||||
ret = cap_set_pg(-pid, &effective, &inheritable,
|
||||
&permitted);
|
||||
} else {
|
||||
ret = security_capset_check(target, &effective, &inheritable,
|
||||
&permitted);
|
||||
if (!ret)
|
||||
security_capset_set(target, &effective, &inheritable,
|
||||
&permitted);
|
||||
}
|
||||
|
||||
out:
|
||||
read_unlock(&tasklist_lock);
|
||||
spin_unlock(&task_capability_lock);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
int __capable(struct task_struct *t, int cap)
|
||||
{
|
||||
if (security_capable(t, cap) == 0) {
|
||||
t->flags |= PF_SUPERPRIV;
|
||||
return 1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
EXPORT_SYMBOL(__capable);
|
||||
|
||||
int capable(int cap)
|
||||
{
|
||||
return __capable(current, cap);
|
||||
}
|
||||
EXPORT_SYMBOL(capable);
|
||||
1084
kernel/compat.c
Normal file
1084
kernel/compat.c
Normal file
File diff suppressed because it is too large
Load Diff
117
kernel/configs.c
Normal file
117
kernel/configs.c
Normal file
@@ -0,0 +1,117 @@
|
||||
/*
|
||||
* kernel/configs.c
|
||||
* Echo the kernel .config file used to build the kernel
|
||||
*
|
||||
* Copyright (C) 2002 Khalid Aziz <khalid_aziz@hp.com>
|
||||
* Copyright (C) 2002 Randy Dunlap <rdunlap@xenotime.net>
|
||||
* Copyright (C) 2002 Al Stone <ahs3@fc.hp.com>
|
||||
* Copyright (C) 2002 Hewlett-Packard Company
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or (at
|
||||
* your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful, but
|
||||
* WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
|
||||
* NON INFRINGEMENT. See the GNU General Public License for more
|
||||
* details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
||||
*/
|
||||
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/proc_fs.h>
|
||||
#include <linux/seq_file.h>
|
||||
#include <linux/init.h>
|
||||
#include <asm/uaccess.h>
|
||||
|
||||
/**************************************************/
|
||||
/* the actual current config file */
|
||||
|
||||
/*
|
||||
* Define kernel_config_data and kernel_config_data_size, which contains the
|
||||
* wrapped and compressed configuration file. The file is first compressed
|
||||
* with gzip and then bounded by two eight byte magic numbers to allow
|
||||
* extraction from a binary kernel image:
|
||||
*
|
||||
* IKCFG_ST
|
||||
* <image>
|
||||
* IKCFG_ED
|
||||
*/
|
||||
#define MAGIC_START "IKCFG_ST"
|
||||
#define MAGIC_END "IKCFG_ED"
|
||||
#include "config_data.h"
|
||||
|
||||
|
||||
#define MAGIC_SIZE (sizeof(MAGIC_START) - 1)
|
||||
#define kernel_config_data_size \
|
||||
(sizeof(kernel_config_data) - 1 - MAGIC_SIZE * 2)
|
||||
|
||||
#ifdef CONFIG_IKCONFIG_PROC
|
||||
|
||||
/**************************************************/
|
||||
/* globals and useful constants */
|
||||
|
||||
static ssize_t
|
||||
ikconfig_read_current(struct file *file, char __user *buf,
|
||||
size_t len, loff_t * offset)
|
||||
{
|
||||
loff_t pos = *offset;
|
||||
ssize_t count;
|
||||
|
||||
if (pos >= kernel_config_data_size)
|
||||
return 0;
|
||||
|
||||
count = min(len, (size_t)(kernel_config_data_size - pos));
|
||||
if (copy_to_user(buf, kernel_config_data + MAGIC_SIZE + pos, count))
|
||||
return -EFAULT;
|
||||
|
||||
*offset += count;
|
||||
return count;
|
||||
}
|
||||
|
||||
static const struct file_operations ikconfig_file_ops = {
|
||||
.owner = THIS_MODULE,
|
||||
.read = ikconfig_read_current,
|
||||
};
|
||||
|
||||
/***************************************************/
|
||||
/* ikconfig_init: start up everything we need to */
|
||||
|
||||
static int __init ikconfig_init(void)
|
||||
{
|
||||
struct proc_dir_entry *entry;
|
||||
|
||||
/* create the current config file */
|
||||
entry = create_proc_entry("config.gz", S_IFREG | S_IRUGO,
|
||||
&proc_root);
|
||||
if (!entry)
|
||||
return -ENOMEM;
|
||||
|
||||
entry->proc_fops = &ikconfig_file_ops;
|
||||
entry->size = kernel_config_data_size;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/***************************************************/
|
||||
/* ikconfig_cleanup: clean up our mess */
|
||||
|
||||
static void __exit ikconfig_cleanup(void)
|
||||
{
|
||||
remove_proc_entry("config.gz", &proc_root);
|
||||
}
|
||||
|
||||
module_init(ikconfig_init);
|
||||
module_exit(ikconfig_cleanup);
|
||||
|
||||
MODULE_LICENSE("GPL");
|
||||
MODULE_AUTHOR("Randy Dunlap");
|
||||
MODULE_DESCRIPTION("Echo the kernel .config file used to build the kernel");
|
||||
|
||||
#endif /* CONFIG_IKCONFIG_PROC */
|
||||
327
kernel/cpu.c
Normal file
327
kernel/cpu.c
Normal file
@@ -0,0 +1,327 @@
|
||||
/* CPU control.
|
||||
* (C) 2001, 2002, 2003, 2004 Rusty Russell
|
||||
*
|
||||
* This code is licenced under the GPL.
|
||||
*/
|
||||
#include <linux/proc_fs.h>
|
||||
#include <linux/smp.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/notifier.h>
|
||||
#include <linux/sched.h>
|
||||
#include <linux/unistd.h>
|
||||
#include <linux/cpu.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/kthread.h>
|
||||
#include <linux/stop_machine.h>
|
||||
#include <linux/mutex.h>
|
||||
|
||||
/* This protects CPUs going up and down... */
|
||||
static DEFINE_MUTEX(cpu_add_remove_lock);
|
||||
static DEFINE_MUTEX(cpu_bitmask_lock);
|
||||
|
||||
static __cpuinitdata RAW_NOTIFIER_HEAD(cpu_chain);
|
||||
|
||||
/* If set, cpu_up and cpu_down will return -EBUSY and do nothing.
|
||||
* Should always be manipulated under cpu_add_remove_lock
|
||||
*/
|
||||
static int cpu_hotplug_disabled;
|
||||
|
||||
#ifdef CONFIG_HOTPLUG_CPU
|
||||
|
||||
/* Crappy recursive lock-takers in cpufreq! Complain loudly about idiots */
|
||||
static struct task_struct *recursive;
|
||||
static int recursive_depth;
|
||||
|
||||
void lock_cpu_hotplug(void)
|
||||
{
|
||||
struct task_struct *tsk = current;
|
||||
|
||||
if (tsk == recursive) {
|
||||
static int warnings = 10;
|
||||
if (warnings) {
|
||||
printk(KERN_ERR "Lukewarm IQ detected in hotplug locking\n");
|
||||
WARN_ON(1);
|
||||
warnings--;
|
||||
}
|
||||
recursive_depth++;
|
||||
return;
|
||||
}
|
||||
mutex_lock(&cpu_bitmask_lock);
|
||||
recursive = tsk;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(lock_cpu_hotplug);
|
||||
|
||||
void unlock_cpu_hotplug(void)
|
||||
{
|
||||
WARN_ON(recursive != current);
|
||||
if (recursive_depth) {
|
||||
recursive_depth--;
|
||||
return;
|
||||
}
|
||||
recursive = NULL;
|
||||
mutex_unlock(&cpu_bitmask_lock);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(unlock_cpu_hotplug);
|
||||
|
||||
#endif /* CONFIG_HOTPLUG_CPU */
|
||||
|
||||
/* Need to know about CPUs going up/down? */
|
||||
int __cpuinit register_cpu_notifier(struct notifier_block *nb)
|
||||
{
|
||||
int ret;
|
||||
mutex_lock(&cpu_add_remove_lock);
|
||||
ret = raw_notifier_chain_register(&cpu_chain, nb);
|
||||
mutex_unlock(&cpu_add_remove_lock);
|
||||
return ret;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_HOTPLUG_CPU
|
||||
|
||||
EXPORT_SYMBOL(register_cpu_notifier);
|
||||
|
||||
void unregister_cpu_notifier(struct notifier_block *nb)
|
||||
{
|
||||
mutex_lock(&cpu_add_remove_lock);
|
||||
raw_notifier_chain_unregister(&cpu_chain, nb);
|
||||
mutex_unlock(&cpu_add_remove_lock);
|
||||
}
|
||||
EXPORT_SYMBOL(unregister_cpu_notifier);
|
||||
|
||||
static inline void check_for_tasks(int cpu)
|
||||
{
|
||||
struct task_struct *p;
|
||||
|
||||
write_lock_irq(&tasklist_lock);
|
||||
for_each_process(p) {
|
||||
if (task_cpu(p) == cpu &&
|
||||
(!cputime_eq(p->utime, cputime_zero) ||
|
||||
!cputime_eq(p->stime, cputime_zero)))
|
||||
printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d\
|
||||
(state = %ld, flags = %lx) \n",
|
||||
p->comm, p->pid, cpu, p->state, p->flags);
|
||||
}
|
||||
write_unlock_irq(&tasklist_lock);
|
||||
}
|
||||
|
||||
/* Take this CPU down. */
|
||||
static int take_cpu_down(void *unused)
|
||||
{
|
||||
int err;
|
||||
|
||||
/* Ensure this CPU doesn't handle any more interrupts. */
|
||||
err = __cpu_disable();
|
||||
if (err < 0)
|
||||
return err;
|
||||
|
||||
/* Force idle task to run as soon as we yield: it should
|
||||
immediately notice cpu is offline and die quickly. */
|
||||
sched_idle_next();
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Requires cpu_add_remove_lock to be held */
|
||||
static int _cpu_down(unsigned int cpu)
|
||||
{
|
||||
int err;
|
||||
struct task_struct *p;
|
||||
cpumask_t old_allowed, tmp;
|
||||
|
||||
if (num_online_cpus() == 1)
|
||||
return -EBUSY;
|
||||
|
||||
if (!cpu_online(cpu))
|
||||
return -EINVAL;
|
||||
|
||||
err = raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE,
|
||||
(void *)(long)cpu);
|
||||
if (err == NOTIFY_BAD) {
|
||||
printk("%s: attempt to take down CPU %u failed\n",
|
||||
__FUNCTION__, cpu);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
/* Ensure that we are not runnable on dying cpu */
|
||||
old_allowed = current->cpus_allowed;
|
||||
tmp = CPU_MASK_ALL;
|
||||
cpu_clear(cpu, tmp);
|
||||
set_cpus_allowed(current, tmp);
|
||||
|
||||
mutex_lock(&cpu_bitmask_lock);
|
||||
p = __stop_machine_run(take_cpu_down, NULL, cpu);
|
||||
mutex_unlock(&cpu_bitmask_lock);
|
||||
|
||||
if (IS_ERR(p) || cpu_online(cpu)) {
|
||||
/* CPU didn't die: tell everyone. Can't complain. */
|
||||
if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED,
|
||||
(void *)(long)cpu) == NOTIFY_BAD)
|
||||
BUG();
|
||||
|
||||
if (IS_ERR(p)) {
|
||||
err = PTR_ERR(p);
|
||||
goto out_allowed;
|
||||
}
|
||||
goto out_thread;
|
||||
}
|
||||
|
||||
/* Wait for it to sleep (leaving idle task). */
|
||||
while (!idle_cpu(cpu))
|
||||
yield();
|
||||
|
||||
/* This actually kills the CPU. */
|
||||
__cpu_die(cpu);
|
||||
|
||||
/* Move it here so it can run. */
|
||||
kthread_bind(p, get_cpu());
|
||||
put_cpu();
|
||||
|
||||
/* CPU is completely dead: tell everyone. Too late to complain. */
|
||||
if (raw_notifier_call_chain(&cpu_chain, CPU_DEAD,
|
||||
(void *)(long)cpu) == NOTIFY_BAD)
|
||||
BUG();
|
||||
|
||||
check_for_tasks(cpu);
|
||||
|
||||
out_thread:
|
||||
err = kthread_stop(p);
|
||||
out_allowed:
|
||||
set_cpus_allowed(current, old_allowed);
|
||||
return err;
|
||||
}
|
||||
|
||||
int cpu_down(unsigned int cpu)
|
||||
{
|
||||
int err = 0;
|
||||
|
||||
mutex_lock(&cpu_add_remove_lock);
|
||||
if (cpu_hotplug_disabled)
|
||||
err = -EBUSY;
|
||||
else
|
||||
err = _cpu_down(cpu);
|
||||
|
||||
mutex_unlock(&cpu_add_remove_lock);
|
||||
return err;
|
||||
}
|
||||
#endif /*CONFIG_HOTPLUG_CPU*/
|
||||
|
||||
/* Requires cpu_add_remove_lock to be held */
|
||||
static int __cpuinit _cpu_up(unsigned int cpu)
|
||||
{
|
||||
int ret;
|
||||
void *hcpu = (void *)(long)cpu;
|
||||
|
||||
if (cpu_online(cpu) || !cpu_present(cpu))
|
||||
return -EINVAL;
|
||||
|
||||
ret = raw_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE, hcpu);
|
||||
if (ret == NOTIFY_BAD) {
|
||||
printk("%s: attempt to bring up CPU %u failed\n",
|
||||
__FUNCTION__, cpu);
|
||||
ret = -EINVAL;
|
||||
goto out_notify;
|
||||
}
|
||||
|
||||
/* Arch-specific enabling code. */
|
||||
mutex_lock(&cpu_bitmask_lock);
|
||||
ret = __cpu_up(cpu);
|
||||
mutex_unlock(&cpu_bitmask_lock);
|
||||
if (ret != 0)
|
||||
goto out_notify;
|
||||
BUG_ON(!cpu_online(cpu));
|
||||
|
||||
/* Now call notifier in preparation. */
|
||||
raw_notifier_call_chain(&cpu_chain, CPU_ONLINE, hcpu);
|
||||
|
||||
out_notify:
|
||||
if (ret != 0)
|
||||
raw_notifier_call_chain(&cpu_chain,
|
||||
CPU_UP_CANCELED, hcpu);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
int __cpuinit cpu_up(unsigned int cpu)
|
||||
{
|
||||
int err = 0;
|
||||
|
||||
mutex_lock(&cpu_add_remove_lock);
|
||||
if (cpu_hotplug_disabled)
|
||||
err = -EBUSY;
|
||||
else
|
||||
err = _cpu_up(cpu);
|
||||
|
||||
mutex_unlock(&cpu_add_remove_lock);
|
||||
return err;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_SUSPEND_SMP
|
||||
/* Needed to prevent the microcode driver from requesting firmware in its CPU
|
||||
* hotplug notifier during the suspend/resume.
|
||||
*/
|
||||
int suspend_cpu_hotplug;
|
||||
EXPORT_SYMBOL(suspend_cpu_hotplug);
|
||||
|
||||
static cpumask_t frozen_cpus;
|
||||
|
||||
int disable_nonboot_cpus(void)
|
||||
{
|
||||
int cpu, first_cpu, error = 0;
|
||||
|
||||
mutex_lock(&cpu_add_remove_lock);
|
||||
suspend_cpu_hotplug = 1;
|
||||
first_cpu = first_cpu(cpu_online_map);
|
||||
/* We take down all of the non-boot CPUs in one shot to avoid races
|
||||
* with the userspace trying to use the CPU hotplug at the same time
|
||||
*/
|
||||
cpus_clear(frozen_cpus);
|
||||
printk("Disabling non-boot CPUs ...\n");
|
||||
for_each_online_cpu(cpu) {
|
||||
if (cpu == first_cpu)
|
||||
continue;
|
||||
error = _cpu_down(cpu);
|
||||
if (!error) {
|
||||
cpu_set(cpu, frozen_cpus);
|
||||
printk("CPU%d is down\n", cpu);
|
||||
} else {
|
||||
printk(KERN_ERR "Error taking CPU%d down: %d\n",
|
||||
cpu, error);
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!error) {
|
||||
BUG_ON(num_online_cpus() > 1);
|
||||
/* Make sure the CPUs won't be enabled by someone else */
|
||||
cpu_hotplug_disabled = 1;
|
||||
} else {
|
||||
printk(KERN_ERR "Non-boot CPUs are not disabled\n");
|
||||
}
|
||||
suspend_cpu_hotplug = 0;
|
||||
mutex_unlock(&cpu_add_remove_lock);
|
||||
return error;
|
||||
}
|
||||
|
||||
void enable_nonboot_cpus(void)
|
||||
{
|
||||
int cpu, error;
|
||||
|
||||
/* Allow everyone to use the CPU hotplug again */
|
||||
mutex_lock(&cpu_add_remove_lock);
|
||||
cpu_hotplug_disabled = 0;
|
||||
if (cpus_empty(frozen_cpus))
|
||||
goto out;
|
||||
|
||||
suspend_cpu_hotplug = 1;
|
||||
printk("Enabling non-boot CPUs ...\n");
|
||||
for_each_cpu_mask(cpu, frozen_cpus) {
|
||||
error = _cpu_up(cpu);
|
||||
if (!error) {
|
||||
printk("CPU%d is up\n", cpu);
|
||||
continue;
|
||||
}
|
||||
printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error);
|
||||
}
|
||||
cpus_clear(frozen_cpus);
|
||||
suspend_cpu_hotplug = 0;
|
||||
out:
|
||||
mutex_unlock(&cpu_add_remove_lock);
|
||||
}
|
||||
#endif
|
||||
2671
kernel/cpuset.c
Normal file
2671
kernel/cpuset.c
Normal file
File diff suppressed because it is too large
Load Diff
165
kernel/delayacct.c
Normal file
165
kernel/delayacct.c
Normal file
@@ -0,0 +1,165 @@
|
||||
/* delayacct.c - per-task delay accounting
|
||||
*
|
||||
* Copyright (C) Shailabh Nagar, IBM Corp. 2006
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it would be useful, but
|
||||
* WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
|
||||
* the GNU General Public License for more details.
|
||||
*/
|
||||
|
||||
#include <linux/sched.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/time.h>
|
||||
#include <linux/sysctl.h>
|
||||
#include <linux/delayacct.h>
|
||||
|
||||
int delayacct_on __read_mostly = 1; /* Delay accounting turned on/off */
|
||||
struct kmem_cache *delayacct_cache;
|
||||
|
||||
static int __init delayacct_setup_disable(char *str)
|
||||
{
|
||||
delayacct_on = 0;
|
||||
return 1;
|
||||
}
|
||||
__setup("nodelayacct", delayacct_setup_disable);
|
||||
|
||||
void delayacct_init(void)
|
||||
{
|
||||
delayacct_cache = kmem_cache_create("delayacct_cache",
|
||||
sizeof(struct task_delay_info),
|
||||
0,
|
||||
SLAB_PANIC,
|
||||
NULL, NULL);
|
||||
delayacct_tsk_init(&init_task);
|
||||
}
|
||||
|
||||
void __delayacct_tsk_init(struct task_struct *tsk)
|
||||
{
|
||||
tsk->delays = kmem_cache_zalloc(delayacct_cache, GFP_KERNEL);
|
||||
if (tsk->delays)
|
||||
spin_lock_init(&tsk->delays->lock);
|
||||
}
|
||||
|
||||
/*
|
||||
* Start accounting for a delay statistic using
|
||||
* its starting timestamp (@start)
|
||||
*/
|
||||
|
||||
static inline void delayacct_start(struct timespec *start)
|
||||
{
|
||||
do_posix_clock_monotonic_gettime(start);
|
||||
}
|
||||
|
||||
/*
|
||||
* Finish delay accounting for a statistic using
|
||||
* its timestamps (@start, @end), accumalator (@total) and @count
|
||||
*/
|
||||
|
||||
static void delayacct_end(struct timespec *start, struct timespec *end,
|
||||
u64 *total, u32 *count)
|
||||
{
|
||||
struct timespec ts;
|
||||
s64 ns;
|
||||
unsigned long flags;
|
||||
|
||||
do_posix_clock_monotonic_gettime(end);
|
||||
ts = timespec_sub(*end, *start);
|
||||
ns = timespec_to_ns(&ts);
|
||||
if (ns < 0)
|
||||
return;
|
||||
|
||||
spin_lock_irqsave(¤t->delays->lock, flags);
|
||||
*total += ns;
|
||||
(*count)++;
|
||||
spin_unlock_irqrestore(¤t->delays->lock, flags);
|
||||
}
|
||||
|
||||
void __delayacct_blkio_start(void)
|
||||
{
|
||||
delayacct_start(¤t->delays->blkio_start);
|
||||
}
|
||||
|
||||
void __delayacct_blkio_end(void)
|
||||
{
|
||||
if (current->delays->flags & DELAYACCT_PF_SWAPIN)
|
||||
/* Swapin block I/O */
|
||||
delayacct_end(¤t->delays->blkio_start,
|
||||
¤t->delays->blkio_end,
|
||||
¤t->delays->swapin_delay,
|
||||
¤t->delays->swapin_count);
|
||||
else /* Other block I/O */
|
||||
delayacct_end(¤t->delays->blkio_start,
|
||||
¤t->delays->blkio_end,
|
||||
¤t->delays->blkio_delay,
|
||||
¤t->delays->blkio_count);
|
||||
}
|
||||
|
||||
int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
|
||||
{
|
||||
s64 tmp;
|
||||
unsigned long t1;
|
||||
unsigned long long t2,t3;
|
||||
unsigned long flags;
|
||||
struct timespec ts;
|
||||
|
||||
/* Though tsk->delays accessed later, early exit avoids
|
||||
* unnecessary returning of other data
|
||||
*/
|
||||
if (!tsk->delays)
|
||||
goto done;
|
||||
|
||||
tmp = (s64)d->cpu_run_real_total;
|
||||
cputime_to_timespec(tsk->utime + tsk->stime, &ts);
|
||||
tmp += timespec_to_ns(&ts);
|
||||
d->cpu_run_real_total = (tmp < (s64)d->cpu_run_real_total) ? 0 : tmp;
|
||||
|
||||
/*
|
||||
* No locking available for sched_info (and too expensive to add one)
|
||||
* Mitigate by taking snapshot of values
|
||||
*/
|
||||
t1 = tsk->sched_info.pcnt;
|
||||
t2 = tsk->sched_info.run_delay;
|
||||
t3 = tsk->sched_info.cpu_time;
|
||||
|
||||
d->cpu_count += t1;
|
||||
|
||||
tmp = (s64)d->cpu_delay_total + t2;
|
||||
d->cpu_delay_total = (tmp < (s64)d->cpu_delay_total) ? 0 : tmp;
|
||||
|
||||
tmp = (s64)d->cpu_run_virtual_total + t3;
|
||||
d->cpu_run_virtual_total =
|
||||
(tmp < (s64)d->cpu_run_virtual_total) ? 0 : tmp;
|
||||
|
||||
/* zero XXX_total, non-zero XXX_count implies XXX stat overflowed */
|
||||
|
||||
spin_lock_irqsave(&tsk->delays->lock, flags);
|
||||
tmp = d->blkio_delay_total + tsk->delays->blkio_delay;
|
||||
d->blkio_delay_total = (tmp < d->blkio_delay_total) ? 0 : tmp;
|
||||
tmp = d->swapin_delay_total + tsk->delays->swapin_delay;
|
||||
d->swapin_delay_total = (tmp < d->swapin_delay_total) ? 0 : tmp;
|
||||
d->blkio_count += tsk->delays->blkio_count;
|
||||
d->swapin_count += tsk->delays->swapin_count;
|
||||
spin_unlock_irqrestore(&tsk->delays->lock, flags);
|
||||
|
||||
done:
|
||||
return 0;
|
||||
}
|
||||
|
||||
__u64 __delayacct_blkio_ticks(struct task_struct *tsk)
|
||||
{
|
||||
__u64 ret;
|
||||
unsigned long flags;
|
||||
|
||||
spin_lock_irqsave(&tsk->delays->lock, flags);
|
||||
ret = nsec_to_clock_t(tsk->delays->blkio_delay +
|
||||
tsk->delays->swapin_delay);
|
||||
spin_unlock_irqrestore(&tsk->delays->lock, flags);
|
||||
return ret;
|
||||
}
|
||||
|
||||
166
kernel/dma.c
Normal file
166
kernel/dma.c
Normal file
@@ -0,0 +1,166 @@
|
||||
/* $Id: dma.c,v 1.1.1.1 2007/06/12 07:27:11 eyryu Exp $
|
||||
* linux/kernel/dma.c: A DMA channel allocator. Inspired by linux/kernel/irq.c.
|
||||
*
|
||||
* Written by Hennus Bergman, 1992.
|
||||
*
|
||||
* 1994/12/26: Changes by Alex Nash to fix a minor bug in /proc/dma.
|
||||
* In the previous version the reported device could end up being wrong,
|
||||
* if a device requested a DMA channel that was already in use.
|
||||
* [It also happened to remove the sizeof(char *) == sizeof(int)
|
||||
* assumption introduced because of those /proc/dma patches. -- Hennus]
|
||||
*/
|
||||
#include <linux/module.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/errno.h>
|
||||
#include <linux/spinlock.h>
|
||||
#include <linux/string.h>
|
||||
#include <linux/seq_file.h>
|
||||
#include <linux/proc_fs.h>
|
||||
#include <linux/init.h>
|
||||
#include <asm/dma.h>
|
||||
#include <asm/system.h>
|
||||
|
||||
|
||||
|
||||
/* A note on resource allocation:
|
||||
*
|
||||
* All drivers needing DMA channels, should allocate and release them
|
||||
* through the public routines `request_dma()' and `free_dma()'.
|
||||
*
|
||||
* In order to avoid problems, all processes should allocate resources in
|
||||
* the same sequence and release them in the reverse order.
|
||||
*
|
||||
* So, when allocating DMAs and IRQs, first allocate the IRQ, then the DMA.
|
||||
* When releasing them, first release the DMA, then release the IRQ.
|
||||
* If you don't, you may cause allocation requests to fail unnecessarily.
|
||||
* This doesn't really matter now, but it will once we get real semaphores
|
||||
* in the kernel.
|
||||
*/
|
||||
|
||||
|
||||
DEFINE_SPINLOCK(dma_spin_lock);
|
||||
|
||||
/*
|
||||
* If our port doesn't define this it has no PC like DMA
|
||||
*/
|
||||
|
||||
#ifdef MAX_DMA_CHANNELS
|
||||
|
||||
|
||||
/* Channel n is busy iff dma_chan_busy[n].lock != 0.
|
||||
* DMA0 used to be reserved for DRAM refresh, but apparently not any more...
|
||||
* DMA4 is reserved for cascading.
|
||||
*/
|
||||
|
||||
struct dma_chan {
|
||||
int lock;
|
||||
const char *device_id;
|
||||
};
|
||||
|
||||
static struct dma_chan dma_chan_busy[MAX_DMA_CHANNELS] = {
|
||||
[4] = { 1, "cascade" },
|
||||
};
|
||||
|
||||
|
||||
/**
|
||||
* request_dma - request and reserve a system DMA channel
|
||||
* @dmanr: DMA channel number
|
||||
* @device_id: reserving device ID string, used in /proc/dma
|
||||
*/
|
||||
int request_dma(unsigned int dmanr, const char * device_id)
|
||||
{
|
||||
if (dmanr >= MAX_DMA_CHANNELS)
|
||||
return -EINVAL;
|
||||
|
||||
if (xchg(&dma_chan_busy[dmanr].lock, 1) != 0)
|
||||
return -EBUSY;
|
||||
|
||||
dma_chan_busy[dmanr].device_id = device_id;
|
||||
|
||||
/* old flag was 0, now contains 1 to indicate busy */
|
||||
return 0;
|
||||
} /* request_dma */
|
||||
|
||||
/**
|
||||
* free_dma - free a reserved system DMA channel
|
||||
* @dmanr: DMA channel number
|
||||
*/
|
||||
void free_dma(unsigned int dmanr)
|
||||
{
|
||||
if (dmanr >= MAX_DMA_CHANNELS) {
|
||||
printk(KERN_WARNING "Trying to free DMA%d\n", dmanr);
|
||||
return;
|
||||
}
|
||||
|
||||
if (xchg(&dma_chan_busy[dmanr].lock, 0) == 0) {
|
||||
printk(KERN_WARNING "Trying to free free DMA%d\n", dmanr);
|
||||
return;
|
||||
}
|
||||
|
||||
} /* free_dma */
|
||||
|
||||
#else
|
||||
|
||||
int request_dma(unsigned int dmanr, const char *device_id)
|
||||
{
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
/* No PC-style DMA on this platform: nothing to release. */
void free_dma(unsigned int dmanr)
{
}
|
||||
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_PROC_FS
|
||||
|
||||
#ifdef MAX_DMA_CHANNELS
|
||||
static int proc_dma_show(struct seq_file *m, void *v)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0 ; i < MAX_DMA_CHANNELS ; i++) {
|
||||
if (dma_chan_busy[i].lock) {
|
||||
seq_printf(m, "%2d: %s\n", i,
|
||||
dma_chan_busy[i].device_id);
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
#else
|
||||
/* Fallback /proc/dma body when the port has no ISA-style DMA. */
static int proc_dma_show(struct seq_file *m, void *v)
{
	seq_puts(m, "No DMA\n");
	return 0;
}
|
||||
#endif /* MAX_DMA_CHANNELS */
|
||||
|
||||
static int proc_dma_open(struct inode *inode, struct file *file)
|
||||
{
|
||||
return single_open(file, proc_dma_show, NULL);
|
||||
}
|
||||
|
||||
/* File operations backing /proc/dma (single_open based seq_file). */
static const struct file_operations proc_dma_operations = {
	.open = proc_dma_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};
|
||||
|
||||
/* Register /proc/dma during boot; failure to create the entry is
 * silently tolerated (the proc file is purely informational). */
static int __init proc_dma_init(void)
{
	struct proc_dir_entry *entry;

	entry = create_proc_entry("dma", 0, NULL);
	if (entry)
		entry->proc_fops = &proc_dma_operations;

	return 0;
}
|
||||
|
||||
__initcall(proc_dma_init);
|
||||
#endif
|
||||
|
||||
EXPORT_SYMBOL(request_dma);
|
||||
EXPORT_SYMBOL(free_dma);
|
||||
EXPORT_SYMBOL(dma_spin_lock);
|
||||
209
kernel/exec_domain.c
Normal file
209
kernel/exec_domain.c
Normal file
@@ -0,0 +1,209 @@
|
||||
/*
|
||||
* Handling of different ABIs (personalities).
|
||||
*
|
||||
* We group personalities into execution domains which have their
|
||||
* own handlers for kernel entry points, signal mapping, etc...
|
||||
*
|
||||
* 2001-05-06 Complete rewrite, Christoph Hellwig (hch@infradead.org)
|
||||
*/
|
||||
|
||||
#include <linux/init.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/kmod.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/personality.h>
|
||||
#include <linux/sched.h>
|
||||
#include <linux/syscalls.h>
|
||||
#include <linux/sysctl.h>
|
||||
#include <linux/types.h>
|
||||
|
||||
|
||||
static void default_handler(int, struct pt_regs *);
|
||||
|
||||
static struct exec_domain *exec_domains = &default_exec_domain;
|
||||
static DEFINE_RWLOCK(exec_domains_lock);
|
||||
|
||||
|
||||
/* Identity signal map: signal n maps to n; used both ways for the
 * native Linux domain below. */
static u_long ident_map[32] = {
	0, 1, 2, 3, 4, 5, 6, 7,
	8, 9, 10, 11, 12, 13, 14, 15,
	16, 17, 18, 19, 20, 21, 22, 23,
	24, 25, 26, 27, 28, 29, 30, 31
};

/* The native PER_LINUX execution domain; the permanent fallback that
 * lookup_exec_domain() returns when nothing else matches. */
struct exec_domain default_exec_domain = {
	.name		= "Linux",		/* name */
	.handler	= default_handler,	/* lcall7 causes a seg fault. */
	.pers_low	= 0,			/* PER_LINUX personality. */
	.pers_high	= 0,			/* PER_LINUX personality. */
	.signal_map	= ident_map,		/* Identity map signals. */
	.signal_invmap	= ident_map,		/* - both ways. */
};
|
||||
|
||||
|
||||
static void
|
||||
default_handler(int segment, struct pt_regs *regp)
|
||||
{
|
||||
set_personality(0);
|
||||
|
||||
if (current_thread_info()->exec_domain->handler != default_handler)
|
||||
current_thread_info()->exec_domain->handler(segment, regp);
|
||||
else
|
||||
send_sig(SIGSEGV, current, 1);
|
||||
}
|
||||
|
||||
static struct exec_domain *
|
||||
lookup_exec_domain(u_long personality)
|
||||
{
|
||||
struct exec_domain * ep;
|
||||
u_long pers = personality(personality);
|
||||
|
||||
read_lock(&exec_domains_lock);
|
||||
for (ep = exec_domains; ep; ep = ep->next) {
|
||||
if (pers >= ep->pers_low && pers <= ep->pers_high)
|
||||
if (try_module_get(ep->module))
|
||||
goto out;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_KMOD
|
||||
read_unlock(&exec_domains_lock);
|
||||
request_module("personality-%ld", pers);
|
||||
read_lock(&exec_domains_lock);
|
||||
|
||||
for (ep = exec_domains; ep; ep = ep->next) {
|
||||
if (pers >= ep->pers_low && pers <= ep->pers_high)
|
||||
if (try_module_get(ep->module))
|
||||
goto out;
|
||||
}
|
||||
#endif
|
||||
|
||||
ep = &default_exec_domain;
|
||||
out:
|
||||
read_unlock(&exec_domains_lock);
|
||||
return (ep);
|
||||
}
|
||||
|
||||
int
|
||||
register_exec_domain(struct exec_domain *ep)
|
||||
{
|
||||
struct exec_domain *tmp;
|
||||
int err = -EBUSY;
|
||||
|
||||
if (ep == NULL)
|
||||
return -EINVAL;
|
||||
|
||||
if (ep->next != NULL)
|
||||
return -EBUSY;
|
||||
|
||||
write_lock(&exec_domains_lock);
|
||||
for (tmp = exec_domains; tmp; tmp = tmp->next) {
|
||||
if (tmp == ep)
|
||||
goto out;
|
||||
}
|
||||
|
||||
ep->next = exec_domains;
|
||||
exec_domains = ep;
|
||||
err = 0;
|
||||
|
||||
out:
|
||||
write_unlock(&exec_domains_lock);
|
||||
return (err);
|
||||
}
|
||||
|
||||
int
|
||||
unregister_exec_domain(struct exec_domain *ep)
|
||||
{
|
||||
struct exec_domain **epp;
|
||||
|
||||
epp = &exec_domains;
|
||||
write_lock(&exec_domains_lock);
|
||||
for (epp = &exec_domains; *epp; epp = &(*epp)->next) {
|
||||
if (ep == *epp)
|
||||
goto unregister;
|
||||
}
|
||||
write_unlock(&exec_domains_lock);
|
||||
return -EINVAL;
|
||||
|
||||
unregister:
|
||||
*epp = ep->next;
|
||||
ep->next = NULL;
|
||||
write_unlock(&exec_domains_lock);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Switch the current task to the execution domain implied by
 * @personality.  If the new domain differs from the current one, the
 * task's fs_struct is unshared first so that set_fs_altroot() below
 * cannot affect other tasks sharing it.  Returns 0 on success or
 * -ENOMEM when the fs_struct copy fails.
 */
int
__set_personality(u_long personality)
{
	struct exec_domain *ep, *oep;

	ep = lookup_exec_domain(personality);
	if (ep == current_thread_info()->exec_domain) {
		/* Same domain: just record the new personality bits and
		 * drop the extra module reference lookup took. */
		current->personality = personality;
		module_put(ep->module);
		return 0;
	}

	if (atomic_read(&current->fs->count) != 1) {
		struct fs_struct *fsp, *ofsp;

		/* fs_struct is shared: take a private copy before we
		 * touch the alt-root. */
		fsp = copy_fs_struct(current->fs);
		if (fsp == NULL) {
			module_put(ep->module);
			return -ENOMEM;
		}

		task_lock(current);
		ofsp = current->fs;
		current->fs = fsp;
		task_unlock(current);

		put_fs_struct(ofsp);
	}

	/*
	 * At that point we are guaranteed to be the sole owner of
	 * current->fs.
	 */

	current->personality = personality;
	oep = current_thread_info()->exec_domain;
	current_thread_info()->exec_domain = ep;
	set_fs_altroot();

	/* Release the old domain's module; ep's reference is kept for
	 * as long as the task stays in the new domain. */
	module_put(oep->module);
	return 0;
}
|
||||
|
||||
int
|
||||
get_exec_domain_list(char *page)
|
||||
{
|
||||
struct exec_domain *ep;
|
||||
int len = 0;
|
||||
|
||||
read_lock(&exec_domains_lock);
|
||||
for (ep = exec_domains; ep && len < PAGE_SIZE - 80; ep = ep->next)
|
||||
len += sprintf(page + len, "%d-%d\t%-16s\t[%s]\n",
|
||||
ep->pers_low, ep->pers_high, ep->name,
|
||||
module_name(ep->module));
|
||||
read_unlock(&exec_domains_lock);
|
||||
return (len);
|
||||
}
|
||||
|
||||
asmlinkage long
|
||||
sys_personality(u_long personality)
|
||||
{
|
||||
u_long old = current->personality;
|
||||
|
||||
if (personality != 0xffffffff) {
|
||||
set_personality(personality);
|
||||
if (current->personality != personality)
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
return (long)old;
|
||||
}
|
||||
|
||||
|
||||
EXPORT_SYMBOL(register_exec_domain);
|
||||
EXPORT_SYMBOL(unregister_exec_domain);
|
||||
EXPORT_SYMBOL(__set_personality);
|
||||
1662
kernel/exit.c
Normal file
1662
kernel/exit.c
Normal file
File diff suppressed because it is too large
Load Diff
67
kernel/extable.c
Normal file
67
kernel/extable.c
Normal file
@@ -0,0 +1,67 @@
|
||||
/* Rewritten by Rusty Russell, on the backs of many others...
|
||||
Copyright (C) 2001 Rusty Russell, 2002 Rusty Russell IBM.
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
*/
|
||||
#include <linux/module.h>
|
||||
#include <linux/init.h>
|
||||
#include <asm/uaccess.h>
|
||||
#include <asm/sections.h>
|
||||
|
||||
extern struct exception_table_entry __start___ex_table[];
|
||||
extern struct exception_table_entry __stop___ex_table[];
|
||||
|
||||
/* Sort the kernel's built-in exception table (the __ex_table section
 * delimited by __start/__stop markers) so it can be searched fast. */
void __init sort_main_extable(void)
{
	sort_extable(__start___ex_table, __stop___ex_table);
}
|
||||
|
||||
/* Given an address, look for it in the exception tables. */
|
||||
const struct exception_table_entry *search_exception_tables(unsigned long addr)
|
||||
{
|
||||
const struct exception_table_entry *e;
|
||||
|
||||
e = search_extable(__start___ex_table, __stop___ex_table-1, addr);
|
||||
if (!e)
|
||||
e = search_module_extables(addr);
|
||||
return e;
|
||||
}
|
||||
|
||||
int core_kernel_text(unsigned long addr)
|
||||
{
|
||||
if (addr >= (unsigned long)_stext &&
|
||||
addr <= (unsigned long)_etext)
|
||||
return 1;
|
||||
|
||||
if (addr >= (unsigned long)_sinittext &&
|
||||
addr <= (unsigned long)_einittext)
|
||||
return 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
int __kernel_text_address(unsigned long addr)
|
||||
{
|
||||
if (core_kernel_text(addr))
|
||||
return 1;
|
||||
return __module_text_address(addr) != NULL;
|
||||
}
|
||||
|
||||
int kernel_text_address(unsigned long addr)
|
||||
{
|
||||
if (core_kernel_text(addr))
|
||||
return 1;
|
||||
return module_text_address(addr) != NULL;
|
||||
}
|
||||
1752
kernel/fork.c
Normal file
1752
kernel/fork.c
Normal file
File diff suppressed because it is too large
Load Diff
1879
kernel/futex.c
Normal file
1879
kernel/futex.c
Normal file
File diff suppressed because it is too large
Load Diff
164
kernel/futex_compat.c
Normal file
164
kernel/futex_compat.c
Normal file
@@ -0,0 +1,164 @@
|
||||
/*
|
||||
* linux/kernel/futex_compat.c
|
||||
*
|
||||
* Futex compatibililty routines.
|
||||
*
|
||||
* Copyright 2006, Red Hat, Inc., Ingo Molnar
|
||||
*/
|
||||
|
||||
#include <linux/linkage.h>
|
||||
#include <linux/compat.h>
|
||||
#include <linux/futex.h>
|
||||
|
||||
#include <asm/uaccess.h>
|
||||
|
||||
|
||||
/*
|
||||
* Fetch a robust-list pointer. Bit 0 signals PI futexes:
|
||||
*/
|
||||
static inline int
|
||||
fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry,
|
||||
compat_uptr_t __user *head, int *pi)
|
||||
{
|
||||
if (get_user(*uentry, head))
|
||||
return -EFAULT;
|
||||
|
||||
*entry = compat_ptr((*uentry) & ~1);
|
||||
*pi = (unsigned int)(*uentry) & 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Walk curr->robust_list (very carefully, it's a userspace list!)
 * and mark any locks found there dead, and notify any waiters.
 *
 * We silently return on any sign of list-walking problem.
 */
void compat_exit_robust_list(struct task_struct *curr)
{
	struct compat_robust_list_head __user *head = curr->compat_robust_list;
	struct robust_list __user *entry, *pending;
	unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
	compat_uptr_t uentry, upending;
	compat_long_t futex_offset;

	/*
	 * Fetch the list head (which was registered earlier, via
	 * sys_set_robust_list()):
	 */
	if (fetch_robust_entry(&uentry, &entry, &head->list.next, &pi))
		return;
	/*
	 * Fetch the relative futex offset:
	 */
	if (get_user(futex_offset, &head->futex_offset))
		return;
	/*
	 * Fetch any possibly pending lock-add first, and handle it
	 * if it exists:
	 */
	if (fetch_robust_entry(&upending, &pending,
			       &head->list_op_pending, &pip))
		return;
	if (upending)
		handle_futex_death((void __user *)pending + futex_offset, curr, pip);

	while (compat_ptr(uentry) != &head->list) {
		/*
		 * A pending lock might already be on the list, so
		 * dont process it twice:
		 */
		if (entry != pending)
			if (handle_futex_death((void __user *)entry + futex_offset,
						curr, pi))
				return;

		/*
		 * Fetch the next entry in the list:
		 */
		if (fetch_robust_entry(&uentry, &entry,
				       (compat_uptr_t __user *)&entry->next, &pi))
			return;
		/*
		 * Avoid excessively long or circular lists:
		 */
		if (!--limit)
			break;

		/* The list is user-controlled: yield between entries. */
		cond_resched();
	}
}
|
||||
|
||||
asmlinkage long
|
||||
compat_sys_set_robust_list(struct compat_robust_list_head __user *head,
|
||||
compat_size_t len)
|
||||
{
|
||||
if (unlikely(len != sizeof(*head)))
|
||||
return -EINVAL;
|
||||
|
||||
current->compat_robust_list = head;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Copy the compat robust-list head of @pid (or of the caller when
 * @pid is 0), together with its size, to userspace.  Inspecting
 * another task requires matching euid/uid or CAP_SYS_PTRACE.
 */
asmlinkage long
compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr,
			   compat_size_t __user *len_ptr)
{
	struct compat_robust_list_head __user *head;
	unsigned long ret;

	if (!pid)
		head = current->compat_robust_list;
	else {
		struct task_struct *p;

		ret = -ESRCH;
		read_lock(&tasklist_lock);
		p = find_task_by_pid(pid);
		if (!p)
			goto err_unlock;
		ret = -EPERM;
		/* Only the same user, or a ptrace-capable one, may peek. */
		if ((current->euid != p->euid) && (current->euid != p->uid) &&
		    !capable(CAP_SYS_PTRACE))
			goto err_unlock;
		head = p->compat_robust_list;
		read_unlock(&tasklist_lock);
	}

	if (put_user(sizeof(*head), len_ptr))
		return -EFAULT;
	return put_user(ptr_to_compat(head), head_ptr);

err_unlock:
	read_unlock(&tasklist_lock);

	return ret;
}
|
||||
|
||||
/*
 * 32-bit compat entry point for the futex syscall: converts the
 * compat timespec (for ops that take one) and forwards to do_futex().
 */
asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val,
		struct compat_timespec __user *utime, u32 __user *uaddr2,
		u32 val3)
{
	struct timespec t;
	unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
	int val2 = 0;

	if (utime && (op == FUTEX_WAIT || op == FUTEX_LOCK_PI)) {
		if (get_compat_timespec(&t, utime))
			return -EFAULT;
		if (!timespec_valid(&t))
			return -EINVAL;
		if (op == FUTEX_WAIT)
			/* Relative timeout in jiffies; +1 rounds up. */
			timeout = timespec_to_jiffies(&t) + 1;
		else {
			/* FUTEX_LOCK_PI passes the raw fields through. */
			timeout = t.tv_sec;
			val2 = t.tv_nsec;
		}
	}
	/* For requeue ops, @utime actually carries an integer argument. */
	if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE)
		val2 = (int) (unsigned long) utime;

	return do_futex(uaddr, op, val, timeout, uaddr2, val2, val3);
}
|
||||
1440
kernel/hrtimer.c
Normal file
1440
kernel/hrtimer.c
Normal file
File diff suppressed because it is too large
Load Diff
5
kernel/irq/Makefile
Normal file
5
kernel/irq/Makefile
Normal file
@@ -0,0 +1,5 @@
|
||||
|
||||
obj-y := handle.o manage.o spurious.o resend.o chip.o devres.o
|
||||
obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o
|
||||
obj-$(CONFIG_PROC_FS) += proc.o
|
||||
obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o
|
||||
202
kernel/irq/autoprobe.c
Normal file
202
kernel/irq/autoprobe.c
Normal file
@@ -0,0 +1,202 @@
|
||||
/*
|
||||
* linux/kernel/irq/autoprobe.c
|
||||
*
|
||||
* Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar
|
||||
*
|
||||
* This file contains the interrupt probing code and driver APIs.
|
||||
*/
|
||||
|
||||
#include <linux/irq.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/interrupt.h>
|
||||
#include <linux/delay.h>
|
||||
|
||||
#include "internals.h"
|
||||
|
||||
/*
|
||||
* Autodetection depends on the fact that any interrupt that
|
||||
* comes in on to an unassigned handler will get stuck with
|
||||
* "IRQ_WAITING" cleared and the interrupt disabled.
|
||||
*/
|
||||
static DEFINE_MUTEX(probing_active);
|
||||
|
||||
/**
 * probe_irq_on - begin an interrupt autodetect
 *
 * Commence probing for an interrupt. The interrupts are scanned
 * and a mask of potential interrupt lines is returned.
 *
 * Takes the probing_active mutex; it is released by the matching
 * probe_irq_off() or probe_irq_mask() call.
 */
unsigned long probe_irq_on(void)
{
	struct irq_desc *desc;
	unsigned long mask;
	unsigned int i;

	mutex_lock(&probing_active);
	/*
	 * something may have generated an irq long ago and we want to
	 * flush such a longstanding irq before considering it as spurious.
	 */
	for (i = NR_IRQS-1; i > 0; i--) {
		desc = irq_desc + i;

		spin_lock_irq(&desc->lock);
		if (!desc->action && !(desc->status & IRQ_NOPROBE)) {
			/*
			 * An old-style architecture might still have
			 * the handle_bad_irq handler there:
			 */
			compat_irq_chip_set_default_handler(desc);

			/*
			 * Some chips need to know about probing in
			 * progress:
			 */
			if (desc->chip->set_type)
				desc->chip->set_type(i, IRQ_TYPE_PROBE);
			desc->chip->startup(i);
		}
		spin_unlock_irq(&desc->lock);
	}

	/* Wait for longstanding interrupts to trigger. */
	msleep(20);

	/*
	 * enable any unassigned irqs
	 * (we must startup again here because if a longstanding irq
	 * happened in the previous stage, it may have masked itself)
	 */
	for (i = NR_IRQS-1; i > 0; i--) {
		desc = irq_desc + i;

		spin_lock_irq(&desc->lock);
		if (!desc->action && !(desc->status & IRQ_NOPROBE)) {
			desc->status |= IRQ_AUTODETECT | IRQ_WAITING;
			if (desc->chip->startup(i))
				desc->status |= IRQ_PENDING;
		}
		spin_unlock_irq(&desc->lock);
	}

	/*
	 * Wait for spurious interrupts to trigger
	 */
	msleep(100);

	/*
	 * Now filter out any obviously spurious interrupts
	 */
	mask = 0;
	for (i = 0; i < NR_IRQS; i++) {
		unsigned int status;

		desc = irq_desc + i;
		spin_lock_irq(&desc->lock);
		status = desc->status;

		if (status & IRQ_AUTODETECT) {
			/* It triggered already - consider it spurious. */
			if (!(status & IRQ_WAITING)) {
				desc->status = status & ~IRQ_AUTODETECT;
				desc->chip->shutdown(i);
			} else
				/* Still waiting: a probe candidate.  Only
				 * the first 32 lines fit in the mask. */
				if (i < 32)
					mask |= 1 << i;
		}
		spin_unlock_irq(&desc->lock);
	}

	return mask;
}
EXPORT_SYMBOL(probe_irq_on);
|
||||
|
||||
/**
|
||||
* probe_irq_mask - scan a bitmap of interrupt lines
|
||||
* @val: mask of interrupts to consider
|
||||
*
|
||||
* Scan the interrupt lines and return a bitmap of active
|
||||
* autodetect interrupts. The interrupt probe logic state
|
||||
* is then returned to its previous value.
|
||||
*
|
||||
* Note: we need to scan all the irq's even though we will
|
||||
* only return autodetect irq numbers - just so that we reset
|
||||
* them all to a known state.
|
||||
*/
|
||||
unsigned int probe_irq_mask(unsigned long val)
|
||||
{
|
||||
unsigned int mask;
|
||||
int i;
|
||||
|
||||
mask = 0;
|
||||
for (i = 0; i < NR_IRQS; i++) {
|
||||
struct irq_desc *desc = irq_desc + i;
|
||||
unsigned int status;
|
||||
|
||||
spin_lock_irq(&desc->lock);
|
||||
status = desc->status;
|
||||
|
||||
if (status & IRQ_AUTODETECT) {
|
||||
if (i < 16 && !(status & IRQ_WAITING))
|
||||
mask |= 1 << i;
|
||||
|
||||
desc->status = status & ~IRQ_AUTODETECT;
|
||||
desc->chip->shutdown(i);
|
||||
}
|
||||
spin_unlock_irq(&desc->lock);
|
||||
}
|
||||
mutex_unlock(&probing_active);
|
||||
|
||||
return mask & val;
|
||||
}
|
||||
EXPORT_SYMBOL(probe_irq_mask);
|
||||
|
||||
/**
|
||||
* probe_irq_off - end an interrupt autodetect
|
||||
* @val: mask of potential interrupts (unused)
|
||||
*
|
||||
* Scans the unused interrupt lines and returns the line which
|
||||
* appears to have triggered the interrupt. If no interrupt was
|
||||
* found then zero is returned. If more than one interrupt is
|
||||
* found then minus the first candidate is returned to indicate
|
||||
* their is doubt.
|
||||
*
|
||||
* The interrupt probe logic state is returned to its previous
|
||||
* value.
|
||||
*
|
||||
* BUGS: When used in a module (which arguably shouldn't happen)
|
||||
* nothing prevents two IRQ probe callers from overlapping. The
|
||||
* results of this are non-optimal.
|
||||
*/
|
||||
int probe_irq_off(unsigned long val)
|
||||
{
|
||||
int i, irq_found = 0, nr_irqs = 0;
|
||||
|
||||
for (i = 0; i < NR_IRQS; i++) {
|
||||
struct irq_desc *desc = irq_desc + i;
|
||||
unsigned int status;
|
||||
|
||||
spin_lock_irq(&desc->lock);
|
||||
status = desc->status;
|
||||
|
||||
if (status & IRQ_AUTODETECT) {
|
||||
if (!(status & IRQ_WAITING)) {
|
||||
if (!nr_irqs)
|
||||
irq_found = i;
|
||||
nr_irqs++;
|
||||
}
|
||||
desc->status = status & ~IRQ_AUTODETECT;
|
||||
desc->chip->shutdown(i);
|
||||
}
|
||||
spin_unlock_irq(&desc->lock);
|
||||
}
|
||||
mutex_unlock(&probing_active);
|
||||
|
||||
if (nr_irqs > 1)
|
||||
irq_found = -irq_found;
|
||||
|
||||
return irq_found;
|
||||
}
|
||||
EXPORT_SYMBOL(probe_irq_off);
|
||||
|
||||
599
kernel/irq/chip.c
Normal file
599
kernel/irq/chip.c
Normal file
@@ -0,0 +1,599 @@
|
||||
/*
|
||||
* linux/kernel/irq/chip.c
|
||||
*
|
||||
* Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar
|
||||
* Copyright (C) 2005-2006, Thomas Gleixner, Russell King
|
||||
*
|
||||
* This file contains the core interrupt handling code, for irq-chip
|
||||
* based architectures.
|
||||
*
|
||||
* Detailed information is available in Documentation/DocBook/genericirq
|
||||
*/
|
||||
|
||||
#include <linux/irq.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/interrupt.h>
|
||||
#include <linux/kernel_stat.h>
|
||||
|
||||
#include "internals.h"
|
||||
|
||||
/**
 * dynamic_irq_init - initialize a dynamically allocated irq
 * @irq: irq number to initialize
 */
void dynamic_irq_init(unsigned int irq)
{
	struct irq_desc *desc;
	unsigned long flags;

	if (irq >= NR_IRQS) {
		printk(KERN_ERR "Trying to initialize invalid IRQ%d\n", irq);
		WARN_ON(1);
		return;
	}

	/* Ensure we don't have left over values from a previous use of this irq */
	desc = irq_desc + irq;
	spin_lock_irqsave(&desc->lock, flags);
	desc->status = IRQ_DISABLED;
	desc->chip = &no_irq_chip;	/* neutral chip until one is installed */
	desc->handle_irq = handle_bad_irq;
	desc->depth = 1;		/* one level of disable outstanding */
	desc->msi_desc = NULL;
	desc->handler_data = NULL;
	desc->chip_data = NULL;
	desc->action = NULL;
	desc->irq_count = 0;
	desc->irqs_unhandled = 0;
#ifdef CONFIG_SMP
	desc->affinity = CPU_MASK_ALL;
#endif
	spin_unlock_irqrestore(&desc->lock, flags);
}
|
||||
|
||||
/**
 * dynamic_irq_cleanup - cleanup a dynamically allocated irq
 * @irq: irq number to cleanup
 */
void dynamic_irq_cleanup(unsigned int irq)
{
	struct irq_desc *desc;
	unsigned long flags;

	if (irq >= NR_IRQS) {
		printk(KERN_ERR "Trying to cleanup invalid IRQ%d\n", irq);
		WARN_ON(1);
		return;
	}

	desc = irq_desc + irq;
	spin_lock_irqsave(&desc->lock, flags);
	/* Refuse to tear down an irq that still has a handler installed. */
	if (desc->action) {
		spin_unlock_irqrestore(&desc->lock, flags);
		printk(KERN_ERR "Destroying IRQ%d without calling free_irq\n",
			irq);
		WARN_ON(1);
		return;
	}
	/* Point everything back at the neutral defaults. */
	desc->msi_desc = NULL;
	desc->handler_data = NULL;
	desc->chip_data = NULL;
	desc->handle_irq = handle_bad_irq;
	desc->chip = &no_irq_chip;
	spin_unlock_irqrestore(&desc->lock, flags);
}
|
||||
|
||||
|
||||
/**
|
||||
* set_irq_chip - set the irq chip for an irq
|
||||
* @irq: irq number
|
||||
* @chip: pointer to irq chip description structure
|
||||
*/
|
||||
int set_irq_chip(unsigned int irq, struct irq_chip *chip)
|
||||
{
|
||||
struct irq_desc *desc;
|
||||
unsigned long flags;
|
||||
|
||||
if (irq >= NR_IRQS) {
|
||||
printk(KERN_ERR "Trying to install chip for IRQ%d\n", irq);
|
||||
WARN_ON(1);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
if (!chip)
|
||||
chip = &no_irq_chip;
|
||||
|
||||
desc = irq_desc + irq;
|
||||
spin_lock_irqsave(&desc->lock, flags);
|
||||
irq_chip_set_defaults(chip);
|
||||
desc->chip = chip;
|
||||
spin_unlock_irqrestore(&desc->lock, flags);
|
||||
|
||||
return 0;
|
||||
}
|
||||
EXPORT_SYMBOL(set_irq_chip);
|
||||
|
||||
/**
 * set_irq_type - set the irq type for an irq
 * @irq: irq number
 * @type: interrupt type - see include/linux/interrupt.h
 *
 * Returns the chip's set_type() result, -ENODEV for a bad irq
 * number, or -ENXIO when the chip cannot program trigger types.
 */
int set_irq_type(unsigned int irq, unsigned int type)
{
	struct irq_desc *desc;
	unsigned long flags;
	int ret = -ENXIO;

	if (irq >= NR_IRQS) {
		printk(KERN_ERR "Trying to set irq type for IRQ%d\n", irq);
		return -ENODEV;
	}

	desc = irq_desc + irq;
	if (desc->chip->set_type) {
		spin_lock_irqsave(&desc->lock, flags);
		ret = desc->chip->set_type(irq, type);
		spin_unlock_irqrestore(&desc->lock, flags);
	}
	return ret;
}
EXPORT_SYMBOL(set_irq_type);
|
||||
|
||||
/**
 * set_irq_data - set irq type data for an irq
 * @irq: Interrupt number
 * @data: Pointer to interrupt specific data
 *
 * Set the hardware irq controller data for an irq.  The store is
 * done under the descriptor lock.  Returns 0 or -EINVAL for a bad
 * irq number.
 */
int set_irq_data(unsigned int irq, void *data)
{
	struct irq_desc *desc;
	unsigned long flags;

	if (irq >= NR_IRQS) {
		printk(KERN_ERR
		       "Trying to install controller data for IRQ%d\n", irq);
		return -EINVAL;
	}

	desc = irq_desc + irq;
	spin_lock_irqsave(&desc->lock, flags);
	desc->handler_data = data;
	spin_unlock_irqrestore(&desc->lock, flags);
	return 0;
}
EXPORT_SYMBOL(set_irq_data);
|
||||
|
||||
/**
|
||||
* set_irq_data - set irq type data for an irq
|
||||
* @irq: Interrupt number
|
||||
* @entry: Pointer to MSI descriptor data
|
||||
*
|
||||
* Set the hardware irq controller data for an irq
|
||||
*/
|
||||
int set_irq_msi(unsigned int irq, struct msi_desc *entry)
|
||||
{
|
||||
struct irq_desc *desc;
|
||||
unsigned long flags;
|
||||
|
||||
if (irq >= NR_IRQS) {
|
||||
printk(KERN_ERR
|
||||
"Trying to install msi data for IRQ%d\n", irq);
|
||||
return -EINVAL;
|
||||
}
|
||||
desc = irq_desc + irq;
|
||||
spin_lock_irqsave(&desc->lock, flags);
|
||||
desc->msi_desc = entry;
|
||||
spin_unlock_irqrestore(&desc->lock, flags);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* set_irq_chip_data - set irq chip data for an irq
|
||||
* @irq: Interrupt number
|
||||
* @data: Pointer to chip specific data
|
||||
*
|
||||
* Set the hardware irq chip data for an irq
|
||||
*/
|
||||
int set_irq_chip_data(unsigned int irq, void *data)
|
||||
{
|
||||
struct irq_desc *desc = irq_desc + irq;
|
||||
unsigned long flags;
|
||||
|
||||
if (irq >= NR_IRQS || !desc->chip) {
|
||||
printk(KERN_ERR "BUG: bad set_irq_chip_data(IRQ#%d)\n", irq);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
spin_lock_irqsave(&desc->lock, flags);
|
||||
desc->chip_data = data;
|
||||
spin_unlock_irqrestore(&desc->lock, flags);
|
||||
|
||||
return 0;
|
||||
}
|
||||
EXPORT_SYMBOL(set_irq_chip_data);
|
||||
|
||||
/*
 * default enable function: unmask at the chip and clear our
 * software masked flag.
 */
static void default_enable(unsigned int irq)
{
	struct irq_desc *desc = irq_desc + irq;

	desc->chip->unmask(irq);
	desc->status &= ~IRQ_MASKED;
}
|
||||
|
||||
/*
 * default disable function — intentionally a no-op (lazy disable:
 * the flow handlers latch a stray interrupt as IRQ_PENDING instead).
 */
static void default_disable(unsigned int irq)
{
}
|
||||
|
||||
/*
 * default startup function: just enable; report "no pending irq".
 */
static unsigned int default_startup(unsigned int irq)
{
	irq_desc[irq].chip->enable(irq);

	return 0;
}
|
||||
|
||||
/*
|
||||
* Fixup enable/disable function pointers
|
||||
*/
|
||||
void irq_chip_set_defaults(struct irq_chip *chip)
|
||||
{
|
||||
if (!chip->enable)
|
||||
chip->enable = default_enable;
|
||||
if (!chip->disable)
|
||||
chip->disable = default_disable;
|
||||
if (!chip->startup)
|
||||
chip->startup = default_startup;
|
||||
if (!chip->shutdown)
|
||||
chip->shutdown = chip->disable;
|
||||
if (!chip->name)
|
||||
chip->name = chip->typename;
|
||||
if (!chip->end)
|
||||
chip->end = dummy_irq_chip.end;
|
||||
}
|
||||
|
||||
static inline void mask_ack_irq(struct irq_desc *desc, int irq)
|
||||
{
|
||||
if (desc->chip->mask_ack)
|
||||
desc->chip->mask_ack(irq);
|
||||
else {
|
||||
desc->chip->mask(irq);
|
||||
desc->chip->ack(irq);
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * handle_simple_irq - Simple and software-decoded IRQs.
 * @irq: the interrupt number
 * @desc: the interrupt description structure for this irq
 *
 * Simple interrupts are either sent from a demultiplexing interrupt
 * handler or come from hardware, where no interrupt hardware control
 * is necessary.
 *
 * Note: The caller is expected to handle the ack, clear, mask and
 * unmask issues if necessary.
 */
void fastcall
handle_simple_irq(unsigned int irq, struct irq_desc *desc)
{
	struct irqaction *action;
	irqreturn_t action_ret;
	const unsigned int cpu = smp_processor_id();

	spin_lock(&desc->lock);

	/* Already being handled (e.g. on another CPU): drop it. */
	if (unlikely(desc->status & IRQ_INPROGRESS))
		goto out_unlock;
	kstat_cpu(cpu).irqs[irq]++;

	action = desc->action;
	if (unlikely(!action || (desc->status & IRQ_DISABLED))) {
		/* No handler, or disabled: mask if we can and latch
		 * the event as pending. */
		if (desc->chip->mask)
			desc->chip->mask(irq);
		desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
		desc->status |= IRQ_PENDING;
		goto out_unlock;
	}

	desc->status &= ~(IRQ_REPLAY | IRQ_WAITING | IRQ_PENDING);
	desc->status |= IRQ_INPROGRESS;
	/* Run the action handlers with the descriptor lock dropped. */
	spin_unlock(&desc->lock);

	action_ret = handle_IRQ_event(irq, action);
	if (!noirqdebug)
		note_interrupt(irq, desc, action_ret);

	spin_lock(&desc->lock);
	desc->status &= ~IRQ_INPROGRESS;
out_unlock:
	spin_unlock(&desc->lock);
}
|
||||
|
||||
/**
 * handle_level_irq - Level type irq handler
 * @irq: the interrupt number
 * @desc: the interrupt description structure for this irq
 *
 * Level type interrupts are active as long as the hardware line has
 * the active level. This may require to mask the interrupt and unmask
 * it after the associated handler has acknowledged the device, so the
 * interrupt line is back to inactive.
 */
void fastcall
handle_level_irq(unsigned int irq, struct irq_desc *desc)
{
	unsigned int cpu = smp_processor_id();
	struct irqaction *action;
	irqreturn_t action_ret;

	spin_lock(&desc->lock);
	/* Mask + ack first: a level irq would otherwise re-trigger. */
	mask_ack_irq(desc, irq);

	if (unlikely(desc->status & IRQ_INPROGRESS))
		goto out_unlock;
	desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
	kstat_cpu(cpu).irqs[irq]++;

	/*
	 * If its disabled or no action available
	 * keep it masked and get out of here
	 */
	action = desc->action;
	if (unlikely(!action || (desc->status & IRQ_DISABLED))) {
		desc->status |= IRQ_PENDING;
		goto out_unlock;
	}

	desc->status |= IRQ_INPROGRESS;
	desc->status &= ~IRQ_PENDING;
	/* Run the handlers with the descriptor lock dropped. */
	spin_unlock(&desc->lock);

	action_ret = handle_IRQ_event(irq, action);
	if (!noirqdebug)
		note_interrupt(irq, desc, action_ret);

	spin_lock(&desc->lock);
	desc->status &= ~IRQ_INPROGRESS;
	/* Unmask again, unless the irq was disabled meanwhile. */
	if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask)
		desc->chip->unmask(irq);
out_unlock:
	spin_unlock(&desc->lock);
}
|
||||
|
||||
/**
 * handle_fasteoi_irq - irq handler for transparent controllers
 * @irq: the interrupt number
 * @desc: the interrupt description structure for this irq
 *
 * Only a single callback will be issued to the chip: an ->eoi()
 * call when the interrupt has been serviced. This enables support
 * for modern forms of interrupt handlers, which handle the flow
 * details in hardware, transparently.
 */
void fastcall
handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
{
	unsigned int cpu = smp_processor_id();
	struct irqaction *action;
	irqreturn_t action_ret;

	spin_lock(&desc->lock);

	if (unlikely(desc->status & IRQ_INPROGRESS))
		goto out;

	desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
	kstat_cpu(cpu).irqs[irq]++;

	/*
	 * If its disabled or no action available
	 * then mask it and get out of here:
	 */
	action = desc->action;
	if (unlikely(!action || (desc->status & IRQ_DISABLED))) {
		desc->status |= IRQ_PENDING;
		if (desc->chip->mask)
			desc->chip->mask(irq);
		goto out;
	}

	desc->status |= IRQ_INPROGRESS;
	desc->status &= ~IRQ_PENDING;
	/* Run the handlers with the descriptor lock dropped. */
	spin_unlock(&desc->lock);

	action_ret = handle_IRQ_event(irq, action);
	if (!noirqdebug)
		note_interrupt(irq, desc, action_ret);

	spin_lock(&desc->lock);
	desc->status &= ~IRQ_INPROGRESS;
out:
	/* ->eoi() is issued on every path, including the early exits. */
	desc->chip->eoi(irq);

	spin_unlock(&desc->lock);
}
|
||||
|
||||
/**
 *	handle_edge_irq - edge type IRQ handler
 *	@irq:	the interrupt number
 *	@desc:	the interrupt description structure for this irq
 *
 *	Interrupt occurs on the falling and/or rising edge of a hardware
 *	signal. The occurrence is latched into the irq controller hardware
 *	and must be acked in order to be reenabled. After the ack another
 *	interrupt can happen on the same source even before the first one
 *	is handled by the associated event handler. If this happens it
 *	might be necessary to disable (mask) the interrupt depending on the
 *	controller hardware. This requires to reenable the interrupt inside
 *	of the loop which handles the interrupts which have arrived while
 *	the handler was running. If all pending interrupts are handled, the
 *	loop is left.
 */
void fastcall
handle_edge_irq(unsigned int irq, struct irq_desc *desc)
{
	const unsigned int cpu = smp_processor_id();

	spin_lock(&desc->lock);

	desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);

	/*
	 * If we're currently running this IRQ, or its disabled,
	 * we shouldn't process the IRQ. Mark it pending, handle
	 * the necessary masking and go out
	 */
	if (unlikely((desc->status & (IRQ_INPROGRESS | IRQ_DISABLED)) ||
		    !desc->action)) {
		desc->status |= (IRQ_PENDING | IRQ_MASKED);
		mask_ack_irq(desc, irq);
		goto out_unlock;
	}

	kstat_cpu(cpu).irqs[irq]++;

	/* Start handling the irq */
	desc->chip->ack(irq);

	/* Mark the IRQ currently in progress.*/
	desc->status |= IRQ_INPROGRESS;

	do {
		struct irqaction *action = desc->action;
		irqreturn_t action_ret;

		/* Action vanished while the lock was dropped: mask and bail. */
		if (unlikely(!action)) {
			desc->chip->mask(irq);
			goto out_unlock;
		}

		/*
		 * When another irq arrived while we were handling
		 * one, we could have masked the irq.
		 * Re-enable it, if it was not disabled in meantime.
		 */
		if (unlikely((desc->status &
			       (IRQ_PENDING | IRQ_MASKED | IRQ_DISABLED)) ==
			      (IRQ_PENDING | IRQ_MASKED))) {
			desc->chip->unmask(irq);
			desc->status &= ~IRQ_MASKED;
		}

		desc->status &= ~IRQ_PENDING;
		/* Run the handler chain with the descriptor lock dropped. */
		spin_unlock(&desc->lock);
		action_ret = handle_IRQ_event(irq, action);
		if (!noirqdebug)
			note_interrupt(irq, desc, action_ret);
		spin_lock(&desc->lock);

		/* Loop while new edges were latched and we are not disabled. */
	} while ((desc->status & (IRQ_PENDING | IRQ_DISABLED)) == IRQ_PENDING);

	desc->status &= ~IRQ_INPROGRESS;
out_unlock:
	spin_unlock(&desc->lock);
}
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
/**
 *	handle_percpu_irq - Per CPU local irq handler
 *	@irq:	the interrupt number
 *	@desc:	the interrupt description structure for this irq
 *
 *	Per CPU interrupts on SMP machines without locking requirements
 */
void fastcall
handle_percpu_irq(unsigned int irq, struct irq_desc *desc)
{
	irqreturn_t action_ret;

	kstat_this_cpu.irqs[irq]++;

	/* ack/eoi are optional chip callbacks for this flow type. */
	if (desc->chip->ack)
		desc->chip->ack(irq);

	action_ret = handle_IRQ_event(irq, desc->action);
	if (!noirqdebug)
		note_interrupt(irq, desc, action_ret);

	if (desc->chip->eoi)
		desc->chip->eoi(irq);
}
|
||||
|
||||
#endif /* CONFIG_SMP */
|
||||
|
||||
/*
 * __set_irq_handler - install a flow handler for an irq
 * @irq:	the interrupt number
 * @handle:	flow handler function (NULL uninstalls, -> handle_bad_irq)
 * @is_chained:	nonzero for handlers of chained (cascade) interrupts,
 *		which are excluded from request_irq()/probing and are
 *		started immediately
 * @name:	name shown in /proc for this flow handler
 */
void
__set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
		  const char *name)
{
	struct irq_desc *desc;
	unsigned long flags;

	if (irq >= NR_IRQS) {
		printk(KERN_ERR
		       "Trying to install type control for IRQ%d\n", irq);
		return;
	}

	desc = irq_desc + irq;

	if (!handle)
		handle = handle_bad_irq;
	else if (desc->chip == &no_irq_chip) {
		printk(KERN_WARNING "Trying to install %sinterrupt handler "
		       "for IRQ%d\n", is_chained ? "chained " : "", irq);
		/*
		 * Some ARM implementations install a handler for really dumb
		 * interrupt hardware without setting an irq_chip. This worked
		 * with the ARM no_irq_chip but the check in setup_irq would
		 * prevent us to setup the interrupt at all. Switch it to
		 * dummy_irq_chip for easy transition.
		 */
		desc->chip = &dummy_irq_chip;
	}

	spin_lock_irqsave(&desc->lock, flags);

	/* Uninstall? */
	if (handle == handle_bad_irq) {
		if (desc->chip != &no_irq_chip)
			mask_ack_irq(desc, irq);
		desc->status |= IRQ_DISABLED;
		desc->depth = 1;
	}
	desc->handle_irq = handle;
	desc->name = name;

	/* Chained handlers are started right away and never requested. */
	if (handle != handle_bad_irq && is_chained) {
		desc->status &= ~IRQ_DISABLED;
		desc->status |= IRQ_NOREQUEST | IRQ_NOPROBE;
		desc->depth = 0;
		desc->chip->unmask(irq);
	}
	spin_unlock_irqrestore(&desc->lock, flags);
}
|
||||
|
||||
void
|
||||
set_irq_chip_and_handler(unsigned int irq, struct irq_chip *chip,
|
||||
irq_flow_handler_t handle)
|
||||
{
|
||||
set_irq_chip(irq, chip);
|
||||
__set_irq_handler(irq, handle, 0, NULL);
|
||||
}
|
||||
|
||||
/*
 * set_irq_chip_and_handler_name - set chip, flow handler and name for an irq
 * @irq:	the interrupt number
 * @chip:	irq chip structure
 * @handle:	flow handler function
 * @name:	flow handler name shown in /proc
 *
 * The chip must be installed first: __set_irq_handler() inspects
 * desc->chip to decide whether to substitute dummy_irq_chip.
 */
void
set_irq_chip_and_handler_name(unsigned int irq, struct irq_chip *chip,
			      irq_flow_handler_t handle, const char *name)
{
	set_irq_chip(irq, chip);
	__set_irq_handler(irq, handle, 0, name);
}
|
||||
88
kernel/irq/devres.c
Normal file
88
kernel/irq/devres.c
Normal file
@@ -0,0 +1,88 @@
|
||||
#include <linux/module.h>
|
||||
#include <linux/interrupt.h>
|
||||
|
||||
/*
 * Device resource management aware IRQ request/free implementation.
 */
struct irq_devres {
	unsigned int irq;	/* interrupt line passed to request_irq() */
	void *dev_id;		/* cookie passed to request_irq()/free_irq() */
};
|
||||
|
||||
static void devm_irq_release(struct device *dev, void *res)
|
||||
{
|
||||
struct irq_devres *this = res;
|
||||
|
||||
free_irq(this->irq, this->dev_id);
|
||||
}
|
||||
|
||||
static int devm_irq_match(struct device *dev, void *res, void *data)
|
||||
{
|
||||
struct irq_devres *this = res, *match = data;
|
||||
|
||||
return this->irq == match->irq && this->dev_id == match->dev_id;
|
||||
}
|
||||
|
||||
/**
 *	devm_request_irq - allocate an interrupt line for a managed device
 *	@dev: device to request interrupt for
 *	@irq: Interrupt line to allocate
 *	@handler: Function to be called when the IRQ occurs
 *	@irqflags: Interrupt type flags
 *	@devname: An ascii name for the claiming device
 *	@dev_id: A cookie passed back to the handler function
 *
 *	Except for the extra @dev argument, this function takes the
 *	same arguments and performs the same function as
 *	request_irq().  IRQs requested with this function will be
 *	automatically freed on driver detach.
 *
 *	If an IRQ allocated with this function needs to be freed
 *	separately, devm_free_irq() must be used.
 */
int devm_request_irq(struct device *dev, unsigned int irq,
		     irq_handler_t handler, unsigned long irqflags,
		     const char *devname, void *dev_id)
{
	struct irq_devres *dr;
	int rc;

	/* Allocate the devres record first so failure leaves no irq held. */
	dr = devres_alloc(devm_irq_release, sizeof(struct irq_devres),
			  GFP_KERNEL);
	if (!dr)
		return -ENOMEM;

	rc = request_irq(irq, handler, irqflags, devname, dev_id);
	if (rc) {
		devres_free(dr);
		return rc;
	}

	dr->irq = irq;
	dr->dev_id = dev_id;
	devres_add(dev, dr);

	return 0;
}
EXPORT_SYMBOL(devm_request_irq);
|
||||
EXPORT_SYMBOL(devm_request_irq);
|
||||
|
||||
/**
 *	devm_free_irq - free an interrupt
 *	@dev: device to free interrupt for
 *	@irq: Interrupt line to free
 *	@dev_id: Device identity to free
 *
 *	Except for the extra @dev argument, this function takes the
 *	same arguments and performs the same function as free_irq().
 *	This function instead of free_irq() should be used to manually
 *	free IRQs allocated with devm_request_irq().
 */
void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id)
{
	struct irq_devres match_data = { irq, dev_id };

	/* Free the irq, then drop the matching devres record. */
	free_irq(irq, dev_id);
	WARN_ON(devres_destroy(dev, devm_irq_release, devm_irq_match,
			       &match_data));
}
EXPORT_SYMBOL(devm_free_irq);
|
||||
EXPORT_SYMBOL(devm_free_irq);
|
||||
271
kernel/irq/handle.c
Normal file
271
kernel/irq/handle.c
Normal file
@@ -0,0 +1,271 @@
|
||||
/*
|
||||
* linux/kernel/irq/handle.c
|
||||
*
|
||||
* Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar
|
||||
* Copyright (C) 2005-2006, Thomas Gleixner, Russell King
|
||||
*
|
||||
* This file contains the core interrupt handling code.
|
||||
*
|
||||
* Detailed information is available in Documentation/DocBook/genericirq
|
||||
*
|
||||
*/
|
||||
|
||||
#include <linux/irq.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/random.h>
|
||||
#include <linux/interrupt.h>
|
||||
#include <linux/kernel_stat.h>
|
||||
|
||||
#include "internals.h"
|
||||
|
||||
/**
 * handle_bad_irq - handle spurious and unhandled irqs
 * @irq:       the interrupt number
 * @desc:      description of the interrupt
 *
 * Handles spurious and unhandled IRQ's. It also prints a debug message.
 */
void fastcall
handle_bad_irq(unsigned int irq, struct irq_desc *desc)
{
	print_irq_desc(irq, desc);
	kstat_this_cpu.irqs[irq]++;
	ack_bad_irq(irq);
}
|
||||
|
||||
/*
 * Linux has a controller-independent interrupt architecture.
 * Every controller has a 'controller-template', that is used
 * by the main code to do the right thing. Each driver-visible
 * interrupt source is transparently wired to the appropriate
 * controller. Thus drivers need not be aware of the
 * interrupt-controller.
 *
 * The code is designed to be easily extended with new/different
 * interrupt controllers, without having to do assembly magic or
 * having to touch the generic code.
 *
 * Controller mappings for all interrupt sources:
 */
/* Every descriptor starts disabled, chipless, and routed to handle_bad_irq. */
struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned = {
	[0 ... NR_IRQS-1] = {
		.status = IRQ_DISABLED,
		.chip = &no_irq_chip,
		.handle_irq = handle_bad_irq,
		.depth = 1,
		.lock = __SPIN_LOCK_UNLOCKED(irq_desc->lock),
#ifdef CONFIG_SMP
		.affinity = CPU_MASK_ALL
#endif
	}
};
|
||||
|
||||
/*
 * What should we do if we get a hw irq event on an illegal vector?
 * Each architecture has to answer this themselves.
 */
static void ack_bad(unsigned int irq)
{
	print_irq_desc(irq, irq_desc + irq);
	ack_bad_irq(irq);
}
|
||||
|
||||
/*
 * NOP functions
 */
/* Do-nothing chip callback. */
static void noop(unsigned int irq)
{
}

/* Do-nothing ->startup() callback; reports "no pending interrupt". */
static unsigned int noop_ret(unsigned int irq)
{
	return 0;
}
|
||||
|
||||
/*
 * Generic no controller implementation
 */
/* Placeholder chip: every callback is a no-op except ->ack, which
 * reports the event as bad, since no controller should fire here. */
struct irq_chip no_irq_chip = {
	.name		= "none",
	.startup	= noop_ret,
	.shutdown	= noop,
	.enable		= noop,
	.disable	= noop,
	.ack		= ack_bad,
	.end		= noop,
};
|
||||
|
||||
/*
 * Generic dummy implementation which can be used for
 * real dumb interrupt sources
 */
/* Unlike no_irq_chip, ->ack is a no-op too: events are expected here. */
struct irq_chip dummy_irq_chip = {
	.name		= "dummy",
	.startup	= noop_ret,
	.shutdown	= noop,
	.enable		= noop,
	.disable	= noop,
	.ack		= noop,
	.mask		= noop,
	.unmask		= noop,
	.end		= noop,
};
|
||||
|
||||
/*
 * Special, empty irq handler:
 */
/* Always reports the interrupt as not ours. */
irqreturn_t no_action(int cpl, void *dev_id)
{
	return IRQ_NONE;
}
|
||||
|
||||
/**
 * handle_IRQ_event - irq action chain handler
 * @irq:	the interrupt number
 * @action:	the interrupt action chain for this irq
 *
 * Handles the action chain of an irq event
 */
irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action)
{
	irqreturn_t ret, retval = IRQ_NONE;
	/* OR of the flags of every action that claimed the interrupt. */
	unsigned int status = 0;

	handle_dynamic_tick(action);

	/* Unless the handler asked for irqs-off, run it with irqs enabled. */
	if (!(action->flags & IRQF_DISABLED))
		local_irq_enable_in_hardirq();

	do {
		ret = action->handler(irq, action->dev_id);
		if (ret == IRQ_HANDLED)
			status |= action->flags;
		retval |= ret;
		action = action->next;
	} while (action);

	/* Feed the entropy pool only if a claiming handler requested it. */
	if (status & IRQF_SAMPLE_RANDOM)
		add_interrupt_randomness(irq);
	local_irq_disable();

	return retval;
}
|
||||
|
||||
#ifndef CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ
|
||||
/**
 * __do_IRQ - original all in one highlevel IRQ handler
 * @irq:	the interrupt number
 *
 * __do_IRQ handles all normal device IRQ's (the special
 * SMP cross-CPU interrupts have their own specific
 * handlers).
 *
 * This is the original x86 implementation which is used for every
 * interrupt type.
 */
fastcall unsigned int __do_IRQ(unsigned int irq)
{
	struct irq_desc *desc = irq_desc + irq;
	struct irqaction *action;
	unsigned int status;

	kstat_this_cpu.irqs[irq]++;
	if (CHECK_IRQ_PER_CPU(desc->status)) {
		irqreturn_t action_ret;

		/*
		 * No locking required for CPU-local interrupts:
		 */
		if (desc->chip->ack)
			desc->chip->ack(irq);
		action_ret = handle_IRQ_event(irq, desc->action);
		desc->chip->end(irq);
		return 1;
	}

	spin_lock(&desc->lock);
	if (desc->chip->ack)
		desc->chip->ack(irq);
	/*
	 * REPLAY is when Linux resends an IRQ that was dropped earlier
	 * WAITING is used by probe to mark irqs that are being tested
	 */
	status = desc->status & ~(IRQ_REPLAY | IRQ_WAITING);
	status |= IRQ_PENDING; /* we _want_ to handle it */

	/*
	 * If the IRQ is disabled for whatever reason, we cannot
	 * use the action we have.
	 */
	action = NULL;
	if (likely(!(status & (IRQ_DISABLED | IRQ_INPROGRESS)))) {
		action = desc->action;
		status &= ~IRQ_PENDING; /* we commit to handling */
		status |= IRQ_INPROGRESS; /* we are handling it */
	}
	desc->status = status;

	/*
	 * If there is no IRQ handler or it was disabled, exit early.
	 * Since we set PENDING, if another processor is handling
	 * a different instance of this same irq, the other processor
	 * will take care of it.
	 */
	if (unlikely(!action))
		goto out;

	/*
	 * Edge triggered interrupts need to remember
	 * pending events.
	 * This applies to any hw interrupts that allow a second
	 * instance of the same irq to arrive while we are in do_IRQ
	 * or in the handler. But the code here only handles the _second_
	 * instance of the irq, not the third or fourth. So it is mostly
	 * useful for irq hardware that does not mask cleanly in an
	 * SMP environment.
	 */
	for (;;) {
		irqreturn_t action_ret;

		/* Drop the lock while the handler chain runs. */
		spin_unlock(&desc->lock);

		action_ret = handle_IRQ_event(irq, action);
		if (!noirqdebug)
			note_interrupt(irq, desc, action_ret);

		spin_lock(&desc->lock);
		/* Re-run if another instance got marked pending meanwhile. */
		if (likely(!(desc->status & IRQ_PENDING)))
			break;
		desc->status &= ~IRQ_PENDING;
	}
	desc->status &= ~IRQ_INPROGRESS;

out:
	/*
	 * The ->end() handler has to deal with interrupts which got
	 * disabled while the handler was running.
	 */
	desc->chip->end(irq);
	spin_unlock(&desc->lock);

	return 1;
}
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_TRACE_IRQFLAGS
|
||||
|
||||
/*
|
||||
* lockdep: we want to handle all irq_desc locks as a single lock-class:
|
||||
*/
|
||||
static struct lock_class_key irq_desc_lock_class;
|
||||
|
||||
void early_init_irq_lock_class(void)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < NR_IRQS; i++)
|
||||
lockdep_set_class(&irq_desc[i].lock, &irq_desc_lock_class);
|
||||
}
|
||||
|
||||
#endif
|
||||
64
kernel/irq/internals.h
Normal file
64
kernel/irq/internals.h
Normal file
@@ -0,0 +1,64 @@
|
||||
/*
|
||||
* IRQ subsystem internal functions and variables:
|
||||
*/
|
||||
|
||||
extern int noirqdebug;
|
||||
|
||||
/* Set default functions for irq_chip structures: */
|
||||
extern void irq_chip_set_defaults(struct irq_chip *chip);
|
||||
|
||||
/* Set default handler: */
|
||||
extern void compat_irq_chip_set_default_handler(struct irq_desc *desc);
|
||||
|
||||
#ifdef CONFIG_PROC_FS
|
||||
extern void register_irq_proc(unsigned int irq);
|
||||
extern void register_handler_proc(unsigned int irq, struct irqaction *action);
|
||||
extern void unregister_handler_proc(unsigned int irq, struct irqaction *action);
|
||||
#else
|
||||
static inline void register_irq_proc(unsigned int irq) { }
|
||||
static inline void register_handler_proc(unsigned int irq,
|
||||
struct irqaction *action) { }
|
||||
static inline void unregister_handler_proc(unsigned int irq,
|
||||
struct irqaction *action) { }
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Debugging printout:
|
||||
*/
|
||||
|
||||
#include <linux/kallsyms.h>
|
||||
|
||||
/* Print a "<flag> set" line for each irq status bit that is on. */
#define P(f) if (desc->status & f) printk("%14s set\n", #f)

/* Dump one irq descriptor: counters, handler, chip, actions, status bits. */
static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc)
{
	printk("irq %d, desc: %p, depth: %d, count: %d, unhandled: %d\n",
		irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled);
	printk("->handle_irq():  %p, ", desc->handle_irq);
	print_symbol("%s\n", (unsigned long)desc->handle_irq);
	printk("->chip(): %p, ", desc->chip);
	print_symbol("%s\n", (unsigned long)desc->chip);
	printk("->action(): %p\n", desc->action);
	if (desc->action) {
		printk("->action->handler(): %p, ", desc->action->handler);
		print_symbol("%s\n", (unsigned long)desc->action->handler);
	}

	P(IRQ_INPROGRESS);
	P(IRQ_DISABLED);
	P(IRQ_PENDING);
	P(IRQ_REPLAY);
	P(IRQ_AUTODETECT);
	P(IRQ_WAITING);
	P(IRQ_LEVEL);
	P(IRQ_MASKED);
#ifdef CONFIG_IRQ_PER_CPU
	P(IRQ_PER_CPU);
#endif
	P(IRQ_NOPROBE);
	P(IRQ_NOREQUEST);
	P(IRQ_NOAUTOEN);
}

#undef P
|
||||
|
||||
563
kernel/irq/manage.c
Normal file
563
kernel/irq/manage.c
Normal file
@@ -0,0 +1,563 @@
|
||||
/*
|
||||
* linux/kernel/irq/manage.c
|
||||
*
|
||||
* Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar
|
||||
* Copyright (C) 2005-2006 Thomas Gleixner
|
||||
*
|
||||
* This file contains driver APIs to the irq subsystem.
|
||||
*/
|
||||
|
||||
#include <linux/irq.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/random.h>
|
||||
#include <linux/interrupt.h>
|
||||
|
||||
#include "internals.h"
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
|
||||
/**
|
||||
* synchronize_irq - wait for pending IRQ handlers (on other CPUs)
|
||||
* @irq: interrupt number to wait for
|
||||
*
|
||||
* This function waits for any pending IRQ handlers for this interrupt
|
||||
* to complete before returning. If you use this function while
|
||||
* holding a resource the IRQ handler may need you will deadlock.
|
||||
*
|
||||
* This function may be called - with care - from IRQ context.
|
||||
*/
|
||||
void synchronize_irq(unsigned int irq)
|
||||
{
|
||||
struct irq_desc *desc = irq_desc + irq;
|
||||
|
||||
if (irq >= NR_IRQS)
|
||||
return;
|
||||
|
||||
while (desc->status & IRQ_INPROGRESS)
|
||||
cpu_relax();
|
||||
}
|
||||
EXPORT_SYMBOL(synchronize_irq);
|
||||
|
||||
/**
|
||||
* irq_can_set_affinity - Check if the affinity of a given irq can be set
|
||||
* @irq: Interrupt to check
|
||||
*
|
||||
*/
|
||||
int irq_can_set_affinity(unsigned int irq)
|
||||
{
|
||||
struct irq_desc *desc = irq_desc + irq;
|
||||
|
||||
if (CHECK_IRQ_PER_CPU(desc->status) || !desc->chip ||
|
||||
!desc->chip->set_affinity)
|
||||
return 0;
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
/**
 *	irq_set_affinity - Set the irq affinity of a given irq
 *	@irq:		Interrupt to set affinity
 *	@cpumask:	cpumask
 *
 */
int irq_set_affinity(unsigned int irq, cpumask_t cpumask)
{
	struct irq_desc *desc = irq_desc + irq;

	if (!desc->chip->set_affinity)
		return -EINVAL;

	set_balance_irq_affinity(irq, cpumask);

#ifdef CONFIG_GENERIC_PENDING_IRQ
	/* Defer the migration until the irq is next handled. */
	set_pending_irq(irq, cpumask);
#else
	/* Program the chip immediately. */
	desc->affinity = cpumask;
	desc->chip->set_affinity(irq, cpumask);
#endif
	return 0;
}
|
||||
|
||||
#endif
|
||||
|
||||
/**
|
||||
* disable_irq_nosync - disable an irq without waiting
|
||||
* @irq: Interrupt to disable
|
||||
*
|
||||
* Disable the selected interrupt line. Disables and Enables are
|
||||
* nested.
|
||||
* Unlike disable_irq(), this function does not ensure existing
|
||||
* instances of the IRQ handler have completed before returning.
|
||||
*
|
||||
* This function may be called from IRQ context.
|
||||
*/
|
||||
void disable_irq_nosync(unsigned int irq)
|
||||
{
|
||||
struct irq_desc *desc = irq_desc + irq;
|
||||
unsigned long flags;
|
||||
|
||||
if (irq >= NR_IRQS)
|
||||
return;
|
||||
|
||||
spin_lock_irqsave(&desc->lock, flags);
|
||||
if (!desc->depth++) {
|
||||
desc->status |= IRQ_DISABLED;
|
||||
desc->chip->disable(irq);
|
||||
}
|
||||
spin_unlock_irqrestore(&desc->lock, flags);
|
||||
}
|
||||
EXPORT_SYMBOL(disable_irq_nosync);
|
||||
|
||||
/**
|
||||
* disable_irq - disable an irq and wait for completion
|
||||
* @irq: Interrupt to disable
|
||||
*
|
||||
* Disable the selected interrupt line. Enables and Disables are
|
||||
* nested.
|
||||
* This function waits for any pending IRQ handlers for this interrupt
|
||||
* to complete before returning. If you use this function while
|
||||
* holding a resource the IRQ handler may need you will deadlock.
|
||||
*
|
||||
* This function may be called - with care - from IRQ context.
|
||||
*/
|
||||
void disable_irq(unsigned int irq)
|
||||
{
|
||||
struct irq_desc *desc = irq_desc + irq;
|
||||
|
||||
if (irq >= NR_IRQS)
|
||||
return;
|
||||
|
||||
disable_irq_nosync(irq);
|
||||
if (desc->action)
|
||||
synchronize_irq(irq);
|
||||
}
|
||||
EXPORT_SYMBOL(disable_irq);
|
||||
|
||||
/**
|
||||
* enable_irq - enable handling of an irq
|
||||
* @irq: Interrupt to enable
|
||||
*
|
||||
* Undoes the effect of one call to disable_irq(). If this
|
||||
* matches the last disable, processing of interrupts on this
|
||||
* IRQ line is re-enabled.
|
||||
*
|
||||
* This function may be called from IRQ context.
|
||||
*/
|
||||
void enable_irq(unsigned int irq)
|
||||
{
|
||||
struct irq_desc *desc = irq_desc + irq;
|
||||
unsigned long flags;
|
||||
|
||||
if (irq >= NR_IRQS)
|
||||
return;
|
||||
|
||||
spin_lock_irqsave(&desc->lock, flags);
|
||||
switch (desc->depth) {
|
||||
case 0:
|
||||
printk(KERN_WARNING "Unbalanced enable for IRQ %d\n", irq);
|
||||
WARN_ON(1);
|
||||
break;
|
||||
case 1: {
|
||||
unsigned int status = desc->status & ~IRQ_DISABLED;
|
||||
|
||||
/* Prevent probing on this irq: */
|
||||
desc->status = status | IRQ_NOPROBE;
|
||||
check_irq_resend(desc, irq);
|
||||
/* fall-through */
|
||||
}
|
||||
default:
|
||||
desc->depth--;
|
||||
}
|
||||
spin_unlock_irqrestore(&desc->lock, flags);
|
||||
}
|
||||
EXPORT_SYMBOL(enable_irq);
|
||||
|
||||
/**
 *	set_irq_wake - control irq power management wakeup
 *	@irq:	interrupt to control
 *	@on:	enable/disable power management wakeup
 *
 *	Enable/disable power management wakeup mode, which is
 *	disabled by default.  Enables and disables must match,
 *	just as they match for non-wakeup mode support.
 *
 *	Wakeup mode lets this IRQ wake the system from sleep
 *	states like "suspend to RAM".
 */
int set_irq_wake(unsigned int irq, unsigned int on)
{
	struct irq_desc *desc = irq_desc + irq;
	unsigned long flags;
	int ret = -ENXIO;
	int (*set_wake)(unsigned, unsigned) = desc->chip->set_wake;

	/* wakeup-capable irqs can be shared between drivers that
	 * don't need to have the same sleep mode behaviors.
	 */
	spin_lock_irqsave(&desc->lock, flags);
	if (on) {
		/* Only the first enable reaches the chip. */
		if (desc->wake_depth++ == 0)
			desc->status |= IRQ_WAKEUP;
		else
			set_wake = NULL;
	} else {
		if (desc->wake_depth == 0) {
			printk(KERN_WARNING "Unbalanced IRQ %d "
					"wake disable\n", irq);
			WARN_ON(1);
		} else if (--desc->wake_depth == 0)
			/* Last disable reaches the chip. */
			desc->status &= ~IRQ_WAKEUP;
		else
			set_wake = NULL;
	}
	/*
	 * NOTE(review): if chip->set_wake() fails here, wake_depth and
	 * IRQ_WAKEUP have already been updated and are not rolled back —
	 * confirm whether callers rely on balanced accounting on failure.
	 */
	if (set_wake)
		ret = desc->chip->set_wake(irq, on);
	spin_unlock_irqrestore(&desc->lock, flags);
	return ret;
}
|
||||
EXPORT_SYMBOL(set_irq_wake);
|
||||
|
||||
/*
 * Internal function that tells the architecture code whether a
 * particular irq has been exclusively allocated or is available
 * for driver use.
 */
int can_request_irq(unsigned int irq, unsigned long irqflags)
{
	struct irqaction *action;

	if (irq >= NR_IRQS || irq_desc[irq].status & IRQ_NOREQUEST)
		return 0;

	action = irq_desc[irq].action;
	if (action)
		/* An existing action blocks us unless both sides share. */
		if (irqflags & action->flags & IRQF_SHARED)
			action = NULL;

	return !action;
}
|
||||
|
||||
void compat_irq_chip_set_default_handler(struct irq_desc *desc)
{
	/*
	 * If the architecture still has not overridden
	 * the flow handler then zap the default. This
	 * should catch incorrect flow-type setting.
	 */
	if (desc->handle_irq == &handle_bad_irq)
		desc->handle_irq = NULL;
}
|
||||
|
||||
/*
 * Internal function to register an irqaction - typically used to
 * allocate special interrupts that are part of the architecture.
 *
 * Returns 0 on success, -EINVAL for a bad irq number, -ENOSYS when
 * no chip is installed, and -EBUSY on an incompatible sharing attempt.
 */
int setup_irq(unsigned int irq, struct irqaction *new)
{
	struct irq_desc *desc = irq_desc + irq;
	struct irqaction *old, **p;
	const char *old_name = NULL;
	unsigned long flags;
	int shared = 0;

	if (irq >= NR_IRQS)
		return -EINVAL;

	if (desc->chip == &no_irq_chip)
		return -ENOSYS;
	/*
	 * Some drivers like serial.c use request_irq() heavily,
	 * so we have to be careful not to interfere with a
	 * running system.
	 */
	if (new->flags & IRQF_SAMPLE_RANDOM) {
		/*
		 * This function might sleep, we want to call it first,
		 * outside of the atomic block.
		 * Yes, this might clear the entropy pool if the wrong
		 * driver is attempted to be loaded, without actually
		 * installing a new handler, but is this really a problem,
		 * only the sysadmin is able to do this.
		 */
		rand_initialize_irq(irq);
	}

	/*
	 * The following block of code has to be executed atomically
	 */
	spin_lock_irqsave(&desc->lock, flags);
	p = &desc->action;
	old = *p;
	if (old) {
		/*
		 * Can't share interrupts unless both agree to and are
		 * the same type (level, edge, polarity). So both flag
		 * fields must have IRQF_SHARED set and the bits which
		 * set the trigger type must match.
		 */
		if (!((old->flags & new->flags) & IRQF_SHARED) ||
		    ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK)) {
			old_name = old->name;
			goto mismatch;
		}

#if defined(CONFIG_IRQ_PER_CPU)
		/* All handlers must agree on per-cpuness */
		if ((old->flags & IRQF_PERCPU) !=
		    (new->flags & IRQF_PERCPU))
			goto mismatch;
#endif

		/* add new interrupt at end of irq queue */
		do {
			p = &old->next;
			old = *p;
		} while (old);
		shared = 1;
	}

	*p = new;
#if defined(CONFIG_IRQ_PER_CPU)
	if (new->flags & IRQF_PERCPU)
		desc->status |= IRQ_PER_CPU;
#endif
	/* Exclude IRQ from balancing */
	if (new->flags & IRQF_NOBALANCING)
		desc->status |= IRQ_NO_BALANCING;

	/* First handler on the line: configure trigger type and start it. */
	if (!shared) {
		irq_chip_set_defaults(desc->chip);

		/* Setup the type (level, edge polarity) if configured: */
		if (new->flags & IRQF_TRIGGER_MASK) {
			if (desc->chip && desc->chip->set_type)
				desc->chip->set_type(irq,
						new->flags & IRQF_TRIGGER_MASK);
			else
				/*
				 * IRQF_TRIGGER_* but the PIC does not support
				 * multiple flow-types?
				 */
				printk(KERN_WARNING "No IRQF_TRIGGER set_type "
				       "function for IRQ %d (%s)\n", irq,
				       desc->chip ? desc->chip->name :
				       "unknown");
		} else
			compat_irq_chip_set_default_handler(desc);

		desc->status &= ~(IRQ_AUTODETECT | IRQ_WAITING |
				  IRQ_INPROGRESS);

		if (!(desc->status & IRQ_NOAUTOEN)) {
			desc->depth = 0;
			desc->status &= ~IRQ_DISABLED;
			if (desc->chip->startup)
				desc->chip->startup(irq);
			else
				desc->chip->enable(irq);
		} else
			/* Undo nested disables: */
			desc->depth = 1;
	}
	/* Reset broken irq detection when installing new handler */
	desc->irq_count = 0;
	desc->irqs_unhandled = 0;
	spin_unlock_irqrestore(&desc->lock, flags);

	new->irq = irq;
	register_irq_proc(irq);
	new->dir = NULL;
	register_handler_proc(irq, new);

	return 0;

mismatch:
#ifdef CONFIG_DEBUG_SHIRQ
	if (!(new->flags & IRQF_PROBE_SHARED)) {
		printk(KERN_ERR "IRQ handler type mismatch for IRQ %d\n", irq);
		if (old_name)
			printk(KERN_ERR "current handler: %s\n", old_name);
		dump_stack();
	}
#endif
	spin_unlock_irqrestore(&desc->lock, flags);
	return -EBUSY;
}
|
||||
|
||||
/**
 *	free_irq - free an interrupt
 *	@irq: Interrupt line to free
 *	@dev_id: Device identity to free
 *
 *	Remove an interrupt handler. The handler is removed and if the
 *	interrupt line is no longer in use by any driver it is disabled.
 *	On a shared IRQ the caller must ensure the interrupt is disabled
 *	on the card it drives before calling this function. The function
 *	does not return until any executing interrupts for this IRQ
 *	have completed.
 *
 *	This function must not be called from interrupt context.
 */
void free_irq(unsigned int irq, void *dev_id)
{
	struct irq_desc *desc;
	struct irqaction **p;
	unsigned long flags;
	irqreturn_t (*handler)(int, void *) = NULL;

	WARN_ON(in_interrupt());
	if (irq >= NR_IRQS)
		return;

	desc = irq_desc + irq;
	spin_lock_irqsave(&desc->lock, flags);
	p = &desc->action;
	/* Walk the action chain looking for the entry with this dev_id. */
	for (;;) {
		struct irqaction *action = *p;

		if (action) {
			struct irqaction **pp = p;

			p = &action->next;
			if (action->dev_id != dev_id)
				continue;

			/* Found it - now remove it from the list of entries */
			*pp = action->next;

			/* Currently used only by UML, might disappear one day.*/
#ifdef CONFIG_IRQ_RELEASE_METHOD
			if (desc->chip->release)
				desc->chip->release(irq, dev_id);
#endif

			/* Last action gone: shut the line down. */
			if (!desc->action) {
				desc->status |= IRQ_DISABLED;
				if (desc->chip->shutdown)
					desc->chip->shutdown(irq);
				else
					desc->chip->disable(irq);
			}
			spin_unlock_irqrestore(&desc->lock, flags);
			unregister_handler_proc(irq, action);

			/* Make sure it's not being used on another CPU */
			synchronize_irq(irq);
			if (action->flags & IRQF_SHARED)
				handler = action->handler;
			kfree(action);
			return;
		}
		printk(KERN_ERR "Trying to free already-free IRQ %d\n", irq);
		spin_unlock_irqrestore(&desc->lock, flags);
		return;
	}
#ifdef CONFIG_DEBUG_SHIRQ
	/*
	 * NOTE(review): this block appears unreachable — every path out of
	 * the loop above returns before reaching it, so the shared-irq
	 * replay below can never run. Confirm against upstream history.
	 */
	if (handler) {
		/*
		 * It's a shared IRQ -- the driver ought to be prepared for it
		 * to happen even now it's being freed, so let's make sure....
		 * We do this after actually deregistering it, to make sure that
		 * a 'real' IRQ doesn't run in parallel with our fake
		 */
		handler(irq, dev_id);
	}
#endif
}
|
||||
EXPORT_SYMBOL(free_irq);
|
||||
|
||||
/**
|
||||
* request_irq - allocate an interrupt line
|
||||
* @irq: Interrupt line to allocate
|
||||
* @handler: Function to be called when the IRQ occurs
|
||||
* @irqflags: Interrupt type flags
|
||||
* @devname: An ascii name for the claiming device
|
||||
* @dev_id: A cookie passed back to the handler function
|
||||
*
|
||||
* This call allocates interrupt resources and enables the
|
||||
* interrupt line and IRQ handling. From the point this
|
||||
* call is made your handler function may be invoked. Since
|
||||
* your handler function must clear any interrupt the board
|
||||
* raises, you must take care both to initialise your hardware
|
||||
* and to set up the interrupt handler in the right order.
|
||||
*
|
||||
* Dev_id must be globally unique. Normally the address of the
|
||||
* device data structure is used as the cookie. Since the handler
|
||||
* receives this value it makes sense to use it.
|
||||
*
|
||||
* If your interrupt is shared you must pass a non NULL dev_id
|
||||
* as this is required when freeing the interrupt.
|
||||
*
|
||||
* Flags:
|
||||
*
|
||||
* IRQF_SHARED Interrupt is shared
|
||||
* IRQF_DISABLED Disable local interrupts while processing
|
||||
* IRQF_SAMPLE_RANDOM The interrupt can be used for entropy
|
||||
*
|
||||
*/
|
||||
int request_irq(unsigned int irq, irq_handler_t handler,
|
||||
unsigned long irqflags, const char *devname, void *dev_id)
|
||||
{
|
||||
struct irqaction *action;
|
||||
int retval;
|
||||
|
||||
#ifdef CONFIG_LOCKDEP
|
||||
/*
|
||||
* Lockdep wants atomic interrupt handlers:
|
||||
*/
|
||||
irqflags |= IRQF_DISABLED;
|
||||
#endif
|
||||
/*
|
||||
* Sanity-check: shared interrupts must pass in a real dev-ID,
|
||||
* otherwise we'll have trouble later trying to figure out
|
||||
* which interrupt is which (messes up the interrupt freeing
|
||||
* logic etc).
|
||||
*/
|
||||
if ((irqflags & IRQF_SHARED) && !dev_id)
|
||||
return -EINVAL;
|
||||
if (irq >= NR_IRQS)
|
||||
return -EINVAL;
|
||||
if (irq_desc[irq].status & IRQ_NOREQUEST)
|
||||
return -EINVAL;
|
||||
if (!handler)
|
||||
return -EINVAL;
|
||||
|
||||
action = kmalloc(sizeof(struct irqaction), GFP_ATOMIC);
|
||||
if (!action)
|
||||
return -ENOMEM;
|
||||
|
||||
action->handler = handler;
|
||||
action->flags = irqflags;
|
||||
cpus_clear(action->mask);
|
||||
action->name = devname;
|
||||
action->next = NULL;
|
||||
action->dev_id = dev_id;
|
||||
|
||||
select_smp_affinity(irq);
|
||||
|
||||
#ifdef CONFIG_DEBUG_SHIRQ
|
||||
if (irqflags & IRQF_SHARED) {
|
||||
/*
|
||||
* It's a shared IRQ -- the driver ought to be prepared for it
|
||||
* to happen immediately, so let's make sure....
|
||||
* We do this before actually registering it, to make sure that
|
||||
* a 'real' IRQ doesn't run in parallel with our fake
|
||||
*/
|
||||
if (irqflags & IRQF_DISABLED) {
|
||||
unsigned long flags;
|
||||
|
||||
local_irq_save(flags);
|
||||
handler(irq, dev_id);
|
||||
local_irq_restore(flags);
|
||||
} else
|
||||
handler(irq, dev_id);
|
||||
}
|
||||
#endif
|
||||
|
||||
retval = setup_irq(irq, action);
|
||||
if (retval)
|
||||
kfree(action);
|
||||
|
||||
return retval;
|
||||
}
|
||||
EXPORT_SYMBOL(request_irq);
|
||||
75
kernel/irq/migration.c
Normal file
75
kernel/irq/migration.c
Normal file
@@ -0,0 +1,75 @@
|
||||
|
||||
#include <linux/irq.h>
|
||||
|
||||
/*
 * Mark @irq as having a pending affinity change to @mask.
 * The actual migration happens later from move_masked_irq()/
 * move_native_irq() in interrupt context.
 */
void set_pending_irq(unsigned int irq, cpumask_t mask)
{
	struct irq_desc *desc = irq_desc + irq;
	unsigned long flags;

	spin_lock_irqsave(&desc->lock, flags);
	desc->status |= IRQ_MOVE_PENDING;
	/* Consistency fix: use the already-computed desc instead of
	 * re-indexing irq_desc[irq]. */
	desc->pending_mask = mask;
	spin_unlock_irqrestore(&desc->lock, flags);
}
|
||||
|
||||
void move_masked_irq(int irq)
|
||||
{
|
||||
struct irq_desc *desc = irq_desc + irq;
|
||||
cpumask_t tmp;
|
||||
|
||||
if (likely(!(desc->status & IRQ_MOVE_PENDING)))
|
||||
return;
|
||||
|
||||
/*
|
||||
* Paranoia: cpu-local interrupts shouldn't be calling in here anyway.
|
||||
*/
|
||||
if (CHECK_IRQ_PER_CPU(desc->status)) {
|
||||
WARN_ON(1);
|
||||
return;
|
||||
}
|
||||
|
||||
desc->status &= ~IRQ_MOVE_PENDING;
|
||||
|
||||
if (unlikely(cpus_empty(irq_desc[irq].pending_mask)))
|
||||
return;
|
||||
|
||||
if (!desc->chip->set_affinity)
|
||||
return;
|
||||
|
||||
assert_spin_locked(&desc->lock);
|
||||
|
||||
cpus_and(tmp, irq_desc[irq].pending_mask, cpu_online_map);
|
||||
|
||||
/*
|
||||
* If there was a valid mask to work with, please
|
||||
* do the disable, re-program, enable sequence.
|
||||
* This is *not* particularly important for level triggered
|
||||
* but in a edge trigger case, we might be setting rte
|
||||
* when an active trigger is comming in. This could
|
||||
* cause some ioapics to mal-function.
|
||||
* Being paranoid i guess!
|
||||
*
|
||||
* For correct operation this depends on the caller
|
||||
* masking the irqs.
|
||||
*/
|
||||
if (likely(!cpus_empty(tmp))) {
|
||||
desc->chip->set_affinity(irq,tmp);
|
||||
}
|
||||
cpus_clear(irq_desc[irq].pending_mask);
|
||||
}
|
||||
|
||||
void move_native_irq(int irq)
|
||||
{
|
||||
struct irq_desc *desc = irq_desc + irq;
|
||||
|
||||
if (likely(!(desc->status & IRQ_MOVE_PENDING)))
|
||||
return;
|
||||
|
||||
if (unlikely(desc->status & IRQ_DISABLED))
|
||||
return;
|
||||
|
||||
desc->chip->mask(irq);
|
||||
move_masked_irq(irq);
|
||||
desc->chip->unmask(irq);
|
||||
}
|
||||
|
||||
150
kernel/irq/proc.c
Normal file
150
kernel/irq/proc.c
Normal file
@@ -0,0 +1,150 @@
|
||||
/*
|
||||
* linux/kernel/irq/proc.c
|
||||
*
|
||||
* Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar
|
||||
*
|
||||
* This file contains the /proc/irq/ handling code.
|
||||
*/
|
||||
|
||||
#include <linux/irq.h>
|
||||
#include <linux/proc_fs.h>
|
||||
#include <linux/interrupt.h>
|
||||
|
||||
#include "internals.h"
|
||||
|
||||
static struct proc_dir_entry *root_irq_dir;
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
|
||||
/* /proc/irq/<n>/smp_affinity read handler: print the affinity mask. */
static int irq_affinity_read_proc(char *page, char **start, off_t off,
				  int count, int *eof, void *data)
{
	int len;

	len = cpumask_scnprintf(page, count, irq_desc[(long)data].affinity);
	/* Need room for the trailing newline and NUL. */
	if (count - len < 2)
		return -EINVAL;
	len += sprintf(page + len, "\n");
	return len;
}
|
||||
|
||||
int no_irq_affinity;
|
||||
static int irq_affinity_write_proc(struct file *file, const char __user *buffer,
|
||||
unsigned long count, void *data)
|
||||
{
|
||||
unsigned int irq = (int)(long)data, full_count = count, err;
|
||||
cpumask_t new_value, tmp;
|
||||
|
||||
if (!irq_desc[irq].chip->set_affinity || no_irq_affinity ||
|
||||
irq_balancing_disabled(irq))
|
||||
return -EIO;
|
||||
|
||||
err = cpumask_parse_user(buffer, count, new_value);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
/*
|
||||
* Do not allow disabling IRQs completely - it's a too easy
|
||||
* way to make the system unusable accidentally :-) At least
|
||||
* one online CPU still has to be targeted.
|
||||
*/
|
||||
cpus_and(tmp, new_value, cpu_online_map);
|
||||
if (cpus_empty(tmp))
|
||||
/* Special case for empty set - allow the architecture
|
||||
code to set default SMP affinity. */
|
||||
return select_smp_affinity(irq) ? -EINVAL : full_count;
|
||||
|
||||
irq_set_affinity(irq, new_value);
|
||||
|
||||
return full_count;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#define MAX_NAMELEN 128
|
||||
|
||||
static int name_unique(unsigned int irq, struct irqaction *new_action)
|
||||
{
|
||||
struct irq_desc *desc = irq_desc + irq;
|
||||
struct irqaction *action;
|
||||
|
||||
for (action = desc->action ; action; action = action->next)
|
||||
if ((action != new_action) && action->name &&
|
||||
!strcmp(new_action->name, action->name))
|
||||
return 0;
|
||||
return 1;
|
||||
}
|
||||
|
||||
void register_handler_proc(unsigned int irq, struct irqaction *action)
|
||||
{
|
||||
char name [MAX_NAMELEN];
|
||||
|
||||
if (!irq_desc[irq].dir || action->dir || !action->name ||
|
||||
!name_unique(irq, action))
|
||||
return;
|
||||
|
||||
memset(name, 0, MAX_NAMELEN);
|
||||
snprintf(name, MAX_NAMELEN, "%s", action->name);
|
||||
|
||||
/* create /proc/irq/1234/handler/ */
|
||||
action->dir = proc_mkdir(name, irq_desc[irq].dir);
|
||||
}
|
||||
|
||||
#undef MAX_NAMELEN
|
||||
|
||||
#define MAX_NAMELEN 10
|
||||
|
||||
void register_irq_proc(unsigned int irq)
|
||||
{
|
||||
char name [MAX_NAMELEN];
|
||||
|
||||
if (!root_irq_dir ||
|
||||
(irq_desc[irq].chip == &no_irq_chip) ||
|
||||
irq_desc[irq].dir)
|
||||
return;
|
||||
|
||||
memset(name, 0, MAX_NAMELEN);
|
||||
sprintf(name, "%d", irq);
|
||||
|
||||
/* create /proc/irq/1234 */
|
||||
irq_desc[irq].dir = proc_mkdir(name, root_irq_dir);
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
{
|
||||
struct proc_dir_entry *entry;
|
||||
|
||||
/* create /proc/irq/<irq>/smp_affinity */
|
||||
entry = create_proc_entry("smp_affinity", 0600, irq_desc[irq].dir);
|
||||
|
||||
if (entry) {
|
||||
entry->data = (void *)(long)irq;
|
||||
entry->read_proc = irq_affinity_read_proc;
|
||||
entry->write_proc = irq_affinity_write_proc;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
#undef MAX_NAMELEN
|
||||
|
||||
void unregister_handler_proc(unsigned int irq, struct irqaction *action)
|
||||
{
|
||||
if (action->dir)
|
||||
remove_proc_entry(action->dir->name, irq_desc[irq].dir);
|
||||
}
|
||||
|
||||
void init_irq_proc(void)
|
||||
{
|
||||
int i;
|
||||
|
||||
/* create /proc/irq */
|
||||
root_irq_dir = proc_mkdir("irq", NULL);
|
||||
if (!root_irq_dir)
|
||||
return;
|
||||
|
||||
/*
|
||||
* Create entries for all existing IRQs.
|
||||
*/
|
||||
for (i = 0; i < NR_IRQS; i++)
|
||||
register_irq_proc(i);
|
||||
}
|
||||
|
||||
77
kernel/irq/resend.c
Normal file
77
kernel/irq/resend.c
Normal file
@@ -0,0 +1,77 @@
|
||||
/*
|
||||
* linux/kernel/irq/resend.c
|
||||
*
|
||||
* Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar
|
||||
* Copyright (C) 2005-2006, Thomas Gleixner
|
||||
*
|
||||
* This file contains the IRQ-resend code
|
||||
*
|
||||
* If the interrupt is waiting to be processed, we try to re-run it.
|
||||
* We can't directly run it from here since the caller might be in an
|
||||
* interrupt-protected region. Not all irq controller chips can
|
||||
* retrigger interrupts at the hardware level, so in those cases
|
||||
* we allow the resending of IRQs via a tasklet.
|
||||
*/
|
||||
|
||||
#include <linux/irq.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/random.h>
|
||||
#include <linux/interrupt.h>
|
||||
|
||||
#include "internals.h"
|
||||
|
||||
#ifdef CONFIG_HARDIRQS_SW_RESEND
|
||||
|
||||
/* Bitmap to handle software resend of interrupts: */
|
||||
static DECLARE_BITMAP(irqs_resend, NR_IRQS);
|
||||
|
||||
/*
|
||||
* Run software resends of IRQ's
|
||||
*/
|
||||
static void resend_irqs(unsigned long arg)
|
||||
{
|
||||
struct irq_desc *desc;
|
||||
int irq;
|
||||
|
||||
while (!bitmap_empty(irqs_resend, NR_IRQS)) {
|
||||
irq = find_first_bit(irqs_resend, NR_IRQS);
|
||||
clear_bit(irq, irqs_resend);
|
||||
desc = irq_desc + irq;
|
||||
local_irq_disable();
|
||||
desc->handle_irq(irq, desc);
|
||||
local_irq_enable();
|
||||
}
|
||||
}
|
||||
|
||||
/* Tasklet to handle resend: */
|
||||
static DECLARE_TASKLET(resend_tasklet, resend_irqs, 0);
|
||||
|
||||
#endif
|
||||
|
||||
/*
|
||||
* IRQ resend
|
||||
*
|
||||
* Is called with interrupts disabled and desc->lock held.
|
||||
*/
|
||||
void check_irq_resend(struct irq_desc *desc, unsigned int irq)
|
||||
{
|
||||
unsigned int status = desc->status;
|
||||
|
||||
/*
|
||||
* Make sure the interrupt is enabled, before resending it:
|
||||
*/
|
||||
desc->chip->enable(irq);
|
||||
|
||||
if ((status & (IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) {
|
||||
desc->status = (status & ~IRQ_PENDING) | IRQ_REPLAY;
|
||||
|
||||
if (!desc->chip || !desc->chip->retrigger ||
|
||||
!desc->chip->retrigger(irq)) {
|
||||
#ifdef CONFIG_HARDIRQS_SW_RESEND
|
||||
/* Set it pending and activate the softirq: */
|
||||
set_bit(irq, irqs_resend);
|
||||
tasklet_schedule(&resend_tasklet);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
}
|
||||
210
kernel/irq/spurious.c
Normal file
210
kernel/irq/spurious.c
Normal file
@@ -0,0 +1,210 @@
|
||||
/*
|
||||
* linux/kernel/irq/spurious.c
|
||||
*
|
||||
* Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar
|
||||
*
|
||||
* This file contains spurious interrupt handling.
|
||||
*/
|
||||
|
||||
#include <linux/irq.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/kallsyms.h>
|
||||
#include <linux/interrupt.h>
|
||||
|
||||
static int irqfixup __read_mostly;
|
||||
|
||||
/*
|
||||
* Recovery handler for misrouted interrupts.
|
||||
*/
|
||||
static int misrouted_irq(int irq)
|
||||
{
|
||||
int i;
|
||||
int ok = 0;
|
||||
int work = 0; /* Did we do work for a real IRQ */
|
||||
|
||||
for (i = 1; i < NR_IRQS; i++) {
|
||||
struct irq_desc *desc = irq_desc + i;
|
||||
struct irqaction *action;
|
||||
|
||||
if (i == irq) /* Already tried */
|
||||
continue;
|
||||
|
||||
spin_lock(&desc->lock);
|
||||
/* Already running on another processor */
|
||||
if (desc->status & IRQ_INPROGRESS) {
|
||||
/*
|
||||
* Already running: If it is shared get the other
|
||||
* CPU to go looking for our mystery interrupt too
|
||||
*/
|
||||
if (desc->action && (desc->action->flags & IRQF_SHARED))
|
||||
desc->status |= IRQ_PENDING;
|
||||
spin_unlock(&desc->lock);
|
||||
continue;
|
||||
}
|
||||
/* Honour the normal IRQ locking */
|
||||
desc->status |= IRQ_INPROGRESS;
|
||||
action = desc->action;
|
||||
spin_unlock(&desc->lock);
|
||||
|
||||
while (action) {
|
||||
/* Only shared IRQ handlers are safe to call */
|
||||
if (action->flags & IRQF_SHARED) {
|
||||
if (action->handler(i, action->dev_id) ==
|
||||
IRQ_HANDLED)
|
||||
ok = 1;
|
||||
}
|
||||
action = action->next;
|
||||
}
|
||||
local_irq_disable();
|
||||
/* Now clean up the flags */
|
||||
spin_lock(&desc->lock);
|
||||
action = desc->action;
|
||||
|
||||
/*
|
||||
* While we were looking for a fixup someone queued a real
|
||||
* IRQ clashing with our walk:
|
||||
*/
|
||||
while ((desc->status & IRQ_PENDING) && action) {
|
||||
/*
|
||||
* Perform real IRQ processing for the IRQ we deferred
|
||||
*/
|
||||
work = 1;
|
||||
spin_unlock(&desc->lock);
|
||||
handle_IRQ_event(i, action);
|
||||
spin_lock(&desc->lock);
|
||||
desc->status &= ~IRQ_PENDING;
|
||||
}
|
||||
desc->status &= ~IRQ_INPROGRESS;
|
||||
/*
|
||||
* If we did actual work for the real IRQ line we must let the
|
||||
* IRQ controller clean up too
|
||||
*/
|
||||
if (work && desc->chip && desc->chip->end)
|
||||
desc->chip->end(i);
|
||||
spin_unlock(&desc->lock);
|
||||
}
|
||||
/* So the caller can adjust the irq error counts */
|
||||
return ok;
|
||||
}
|
||||
|
||||
/*
|
||||
* If 99,900 of the previous 100,000 interrupts have not been handled
|
||||
* then assume that the IRQ is stuck in some manner. Drop a diagnostic
|
||||
* and try to turn the IRQ off.
|
||||
*
|
||||
* (The other 100-of-100,000 interrupts may have been a correctly
|
||||
* functioning device sharing an IRQ with the failing one)
|
||||
*
|
||||
* Called under desc->lock
|
||||
*/
|
||||
|
||||
static void
|
||||
__report_bad_irq(unsigned int irq, struct irq_desc *desc,
|
||||
irqreturn_t action_ret)
|
||||
{
|
||||
struct irqaction *action;
|
||||
|
||||
if (action_ret != IRQ_HANDLED && action_ret != IRQ_NONE) {
|
||||
printk(KERN_ERR "irq event %d: bogus return value %x\n",
|
||||
irq, action_ret);
|
||||
} else {
|
||||
printk(KERN_ERR "irq %d: nobody cared (try booting with "
|
||||
"the \"irqpoll\" option)\n", irq);
|
||||
}
|
||||
dump_stack();
|
||||
printk(KERN_ERR "handlers:\n");
|
||||
|
||||
action = desc->action;
|
||||
while (action) {
|
||||
printk(KERN_ERR "[<%p>]", action->handler);
|
||||
print_symbol(" (%s)",
|
||||
(unsigned long)action->handler);
|
||||
printk("\n");
|
||||
action = action->next;
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
report_bad_irq(unsigned int irq, struct irq_desc *desc, irqreturn_t action_ret)
|
||||
{
|
||||
static int count = 100;
|
||||
|
||||
if (count > 0) {
|
||||
count--;
|
||||
__report_bad_irq(irq, desc, action_ret);
|
||||
}
|
||||
}
|
||||
|
||||
void note_interrupt(unsigned int irq, struct irq_desc *desc,
|
||||
irqreturn_t action_ret)
|
||||
{
|
||||
if (unlikely(action_ret != IRQ_HANDLED)) {
|
||||
desc->irqs_unhandled++;
|
||||
if (unlikely(action_ret != IRQ_NONE))
|
||||
report_bad_irq(irq, desc, action_ret);
|
||||
}
|
||||
|
||||
if (unlikely(irqfixup)) {
|
||||
/* Don't punish working computers */
|
||||
if ((irqfixup == 2 && irq == 0) || action_ret == IRQ_NONE) {
|
||||
int ok = misrouted_irq(irq);
|
||||
if (action_ret == IRQ_NONE)
|
||||
desc->irqs_unhandled -= ok;
|
||||
}
|
||||
}
|
||||
|
||||
desc->irq_count++;
|
||||
if (likely(desc->irq_count < 100000))
|
||||
return;
|
||||
|
||||
desc->irq_count = 0;
|
||||
if (unlikely(desc->irqs_unhandled > 99900)) {
|
||||
/*
|
||||
* The interrupt is stuck
|
||||
*/
|
||||
__report_bad_irq(irq, desc, action_ret);
|
||||
/*
|
||||
* Now kill the IRQ
|
||||
*/
|
||||
printk(KERN_EMERG "Disabling IRQ #%d\n", irq);
|
||||
desc->status |= IRQ_DISABLED;
|
||||
desc->depth = 1;
|
||||
desc->chip->disable(irq);
|
||||
}
|
||||
desc->irqs_unhandled = 0;
|
||||
}
|
||||
|
||||
int noirqdebug __read_mostly;
|
||||
|
||||
int noirqdebug_setup(char *str)
|
||||
{
|
||||
noirqdebug = 1;
|
||||
printk(KERN_INFO "IRQ lockup detection disabled\n");
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
__setup("noirqdebug", noirqdebug_setup);
|
||||
|
||||
static int __init irqfixup_setup(char *str)
|
||||
{
|
||||
irqfixup = 1;
|
||||
printk(KERN_WARNING "Misrouted IRQ fixup support enabled.\n");
|
||||
printk(KERN_WARNING "This may impact system performance.\n");
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
__setup("irqfixup", irqfixup_setup);
|
||||
|
||||
static int __init irqpoll_setup(char *str)
|
||||
{
|
||||
irqfixup = 2;
|
||||
printk(KERN_WARNING "Misrouted IRQ fixup and polling support "
|
||||
"enabled\n");
|
||||
printk(KERN_WARNING "This may significantly impact system "
|
||||
"performance\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
__setup("irqpoll", irqpoll_setup);
|
||||
350
kernel/itimer.c
Normal file
350
kernel/itimer.c
Normal file
@@ -0,0 +1,350 @@
|
||||
/*
|
||||
* linux/kernel/itimer.c
|
||||
*
|
||||
* Copyright (C) 1992 Darren Senn
|
||||
*/
|
||||
|
||||
/* These are all the functions necessary to implement itimers */
|
||||
|
||||
#include <linux/mm.h>
|
||||
#include <linux/smp_lock.h>
|
||||
#include <linux/interrupt.h>
|
||||
#include <linux/syscalls.h>
|
||||
#include <linux/time.h>
|
||||
#include <linux/posix-timers.h>
|
||||
#include <linux/hrtimer.h>
|
||||
|
||||
#include <asm/uaccess.h>
|
||||
|
||||
/**
|
||||
* itimer_get_remtime - get remaining time for the timer
|
||||
*
|
||||
* @timer: the timer to read
|
||||
*
|
||||
* Returns the delta between the expiry time and now, which can be
|
||||
* less than zero or 1usec for an pending expired timer
|
||||
*/
|
||||
static struct timeval itimer_get_remtime(struct hrtimer *timer)
|
||||
{
|
||||
ktime_t rem = hrtimer_get_remaining(timer);
|
||||
|
||||
/*
|
||||
* Racy but safe: if the itimer expires after the above
|
||||
* hrtimer_get_remtime() call but before this condition
|
||||
* then we return 0 - which is correct.
|
||||
*/
|
||||
if (hrtimer_active(timer)) {
|
||||
if (rem.tv64 <= 0)
|
||||
rem.tv64 = NSEC_PER_USEC;
|
||||
} else
|
||||
rem.tv64 = 0;
|
||||
|
||||
return ktime_to_timeval(rem);
|
||||
}
|
||||
|
||||
int do_getitimer(int which, struct itimerval *value)
|
||||
{
|
||||
struct task_struct *tsk = current;
|
||||
cputime_t cinterval, cval;
|
||||
|
||||
switch (which) {
|
||||
case ITIMER_REAL:
|
||||
spin_lock_irq(&tsk->sighand->siglock);
|
||||
value->it_value = itimer_get_remtime(&tsk->signal->real_timer);
|
||||
value->it_interval =
|
||||
ktime_to_timeval(tsk->signal->it_real_incr);
|
||||
spin_unlock_irq(&tsk->sighand->siglock);
|
||||
break;
|
||||
case ITIMER_VIRTUAL:
|
||||
read_lock(&tasklist_lock);
|
||||
spin_lock_irq(&tsk->sighand->siglock);
|
||||
cval = tsk->signal->it_virt_expires;
|
||||
cinterval = tsk->signal->it_virt_incr;
|
||||
if (!cputime_eq(cval, cputime_zero)) {
|
||||
struct task_struct *t = tsk;
|
||||
cputime_t utime = tsk->signal->utime;
|
||||
do {
|
||||
utime = cputime_add(utime, t->utime);
|
||||
t = next_thread(t);
|
||||
} while (t != tsk);
|
||||
if (cputime_le(cval, utime)) { /* about to fire */
|
||||
cval = jiffies_to_cputime(1);
|
||||
} else {
|
||||
cval = cputime_sub(cval, utime);
|
||||
}
|
||||
}
|
||||
spin_unlock_irq(&tsk->sighand->siglock);
|
||||
read_unlock(&tasklist_lock);
|
||||
cputime_to_timeval(cval, &value->it_value);
|
||||
cputime_to_timeval(cinterval, &value->it_interval);
|
||||
break;
|
||||
case ITIMER_PROF:
|
||||
read_lock(&tasklist_lock);
|
||||
spin_lock_irq(&tsk->sighand->siglock);
|
||||
cval = tsk->signal->it_prof_expires;
|
||||
cinterval = tsk->signal->it_prof_incr;
|
||||
if (!cputime_eq(cval, cputime_zero)) {
|
||||
struct task_struct *t = tsk;
|
||||
cputime_t ptime = cputime_add(tsk->signal->utime,
|
||||
tsk->signal->stime);
|
||||
do {
|
||||
ptime = cputime_add(ptime,
|
||||
cputime_add(t->utime,
|
||||
t->stime));
|
||||
t = next_thread(t);
|
||||
} while (t != tsk);
|
||||
if (cputime_le(cval, ptime)) { /* about to fire */
|
||||
cval = jiffies_to_cputime(1);
|
||||
} else {
|
||||
cval = cputime_sub(cval, ptime);
|
||||
}
|
||||
}
|
||||
spin_unlock_irq(&tsk->sighand->siglock);
|
||||
read_unlock(&tasklist_lock);
|
||||
cputime_to_timeval(cval, &value->it_value);
|
||||
cputime_to_timeval(cinterval, &value->it_interval);
|
||||
break;
|
||||
default:
|
||||
return(-EINVAL);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* getitimer(2): copy the requested itimer value out to user space. */
asmlinkage long sys_getitimer(int which, struct itimerval __user *value)
{
	struct itimerval get_buffer;
	int error;

	if (!value)
		return -EFAULT;

	error = do_getitimer(which, &get_buffer);
	if (!error && copy_to_user(value, &get_buffer, sizeof(get_buffer)))
		error = -EFAULT;
	return error;
}
|
||||
|
||||
|
||||
/*
|
||||
* The timer is automagically restarted, when interval != 0
|
||||
*/
|
||||
enum hrtimer_restart it_real_fn(struct hrtimer *timer)
|
||||
{
|
||||
struct signal_struct *sig =
|
||||
container_of(timer, struct signal_struct, real_timer);
|
||||
|
||||
send_group_sig_info(SIGALRM, SEND_SIG_PRIV, sig->tsk);
|
||||
|
||||
return HRTIMER_NORESTART;
|
||||
}
|
||||
|
||||
/*
|
||||
* We do not care about correctness. We just sanitize the values so
|
||||
* the ktime_t operations which expect normalized values do not
|
||||
* break. This converts negative values to long timeouts similar to
|
||||
* the code in kernel versions < 2.6.16
|
||||
*
|
||||
* Print a limited number of warning messages when an invalid timeval
|
||||
* is detected.
|
||||
*/
|
||||
static void fixup_timeval(struct timeval *tv, int interval)
|
||||
{
|
||||
static int warnlimit = 10;
|
||||
unsigned long tmp;
|
||||
|
||||
if (warnlimit > 0) {
|
||||
warnlimit--;
|
||||
printk(KERN_WARNING
|
||||
"setitimer: %s (pid = %d) provided "
|
||||
"invalid timeval %s: tv_sec = %ld tv_usec = %ld\n",
|
||||
current->comm, current->pid,
|
||||
interval ? "it_interval" : "it_value",
|
||||
tv->tv_sec, (long) tv->tv_usec);
|
||||
}
|
||||
|
||||
tmp = tv->tv_usec;
|
||||
if (tmp >= USEC_PER_SEC) {
|
||||
tv->tv_usec = tmp % USEC_PER_SEC;
|
||||
tv->tv_sec += tmp / USEC_PER_SEC;
|
||||
}
|
||||
|
||||
tmp = tv->tv_sec;
|
||||
if (tmp > LONG_MAX)
|
||||
tv->tv_sec = LONG_MAX;
|
||||
}
|
||||
|
||||
/*
|
||||
* Returns true if the timeval is in canonical form
|
||||
*/
|
||||
#define timeval_valid(t) \
|
||||
(((t)->tv_sec >= 0) && (((unsigned long) (t)->tv_usec) < USEC_PER_SEC))
|
||||
|
||||
/*
|
||||
* Check for invalid timevals, sanitize them and print a limited
|
||||
* number of warnings.
|
||||
*/
|
||||
static void check_itimerval(struct itimerval *value) {
|
||||
|
||||
if (unlikely(!timeval_valid(&value->it_value)))
|
||||
fixup_timeval(&value->it_value, 0);
|
||||
|
||||
if (unlikely(!timeval_valid(&value->it_interval)))
|
||||
fixup_timeval(&value->it_interval, 1);
|
||||
}
|
||||
|
||||
int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue)
|
||||
{
|
||||
struct task_struct *tsk = current;
|
||||
struct hrtimer *timer;
|
||||
ktime_t expires;
|
||||
cputime_t cval, cinterval, nval, ninterval;
|
||||
|
||||
/*
|
||||
* Validate the timevals in value.
|
||||
*
|
||||
* Note: Although the spec requires that invalid values shall
|
||||
* return -EINVAL, we just fixup the value and print a limited
|
||||
* number of warnings in order not to break users of this
|
||||
* historical misfeature.
|
||||
*
|
||||
* Scheduled for replacement in March 2007
|
||||
*/
|
||||
check_itimerval(value);
|
||||
|
||||
switch (which) {
|
||||
case ITIMER_REAL:
|
||||
again:
|
||||
spin_lock_irq(&tsk->sighand->siglock);
|
||||
timer = &tsk->signal->real_timer;
|
||||
if (ovalue) {
|
||||
ovalue->it_value = itimer_get_remtime(timer);
|
||||
ovalue->it_interval
|
||||
= ktime_to_timeval(tsk->signal->it_real_incr);
|
||||
}
|
||||
/* We are sharing ->siglock with it_real_fn() */
|
||||
if (hrtimer_try_to_cancel(timer) < 0) {
|
||||
spin_unlock_irq(&tsk->sighand->siglock);
|
||||
goto again;
|
||||
}
|
||||
expires = timeval_to_ktime(value->it_value);
|
||||
if (expires.tv64 != 0) {
|
||||
tsk->signal->it_real_incr =
|
||||
timeval_to_ktime(value->it_interval);
|
||||
hrtimer_start(timer, expires, HRTIMER_MODE_REL);
|
||||
} else
|
||||
tsk->signal->it_real_incr.tv64 = 0;
|
||||
|
||||
spin_unlock_irq(&tsk->sighand->siglock);
|
||||
break;
|
||||
case ITIMER_VIRTUAL:
|
||||
nval = timeval_to_cputime(&value->it_value);
|
||||
ninterval = timeval_to_cputime(&value->it_interval);
|
||||
read_lock(&tasklist_lock);
|
||||
spin_lock_irq(&tsk->sighand->siglock);
|
||||
cval = tsk->signal->it_virt_expires;
|
||||
cinterval = tsk->signal->it_virt_incr;
|
||||
if (!cputime_eq(cval, cputime_zero) ||
|
||||
!cputime_eq(nval, cputime_zero)) {
|
||||
if (cputime_gt(nval, cputime_zero))
|
||||
nval = cputime_add(nval,
|
||||
jiffies_to_cputime(1));
|
||||
set_process_cpu_timer(tsk, CPUCLOCK_VIRT,
|
||||
&nval, &cval);
|
||||
}
|
||||
tsk->signal->it_virt_expires = nval;
|
||||
tsk->signal->it_virt_incr = ninterval;
|
||||
spin_unlock_irq(&tsk->sighand->siglock);
|
||||
read_unlock(&tasklist_lock);
|
||||
if (ovalue) {
|
||||
cputime_to_timeval(cval, &ovalue->it_value);
|
||||
cputime_to_timeval(cinterval, &ovalue->it_interval);
|
||||
}
|
||||
break;
|
||||
case ITIMER_PROF:
|
||||
nval = timeval_to_cputime(&value->it_value);
|
||||
ninterval = timeval_to_cputime(&value->it_interval);
|
||||
read_lock(&tasklist_lock);
|
||||
spin_lock_irq(&tsk->sighand->siglock);
|
||||
cval = tsk->signal->it_prof_expires;
|
||||
cinterval = tsk->signal->it_prof_incr;
|
||||
if (!cputime_eq(cval, cputime_zero) ||
|
||||
!cputime_eq(nval, cputime_zero)) {
|
||||
if (cputime_gt(nval, cputime_zero))
|
||||
nval = cputime_add(nval,
|
||||
jiffies_to_cputime(1));
|
||||
set_process_cpu_timer(tsk, CPUCLOCK_PROF,
|
||||
&nval, &cval);
|
||||
}
|
||||
tsk->signal->it_prof_expires = nval;
|
||||
tsk->signal->it_prof_incr = ninterval;
|
||||
spin_unlock_irq(&tsk->sighand->siglock);
|
||||
read_unlock(&tasklist_lock);
|
||||
if (ovalue) {
|
||||
cputime_to_timeval(cval, &ovalue->it_value);
|
||||
cputime_to_timeval(cinterval, &ovalue->it_interval);
|
||||
}
|
||||
break;
|
||||
default:
|
||||
return -EINVAL;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* alarm_setitimer - set alarm in seconds
|
||||
*
|
||||
* @seconds: number of seconds until alarm
|
||||
* 0 disables the alarm
|
||||
*
|
||||
* Returns the remaining time in seconds of a pending timer or 0 when
|
||||
* the timer is not active.
|
||||
*
|
||||
* On 32 bit machines the seconds value is limited to (INT_MAX/2) to avoid
|
||||
* negative timeval settings which would cause immediate expiry.
|
||||
*/
|
||||
unsigned int alarm_setitimer(unsigned int seconds)
|
||||
{
|
||||
struct itimerval it_new, it_old;
|
||||
|
||||
#if BITS_PER_LONG < 64
|
||||
if (seconds > INT_MAX)
|
||||
seconds = INT_MAX;
|
||||
#endif
|
||||
it_new.it_value.tv_sec = seconds;
|
||||
it_new.it_value.tv_usec = 0;
|
||||
it_new.it_interval.tv_sec = it_new.it_interval.tv_usec = 0;
|
||||
|
||||
do_setitimer(ITIMER_REAL, &it_new, &it_old);
|
||||
|
||||
/*
|
||||
* We can't return 0 if we have an alarm pending ... And we'd
|
||||
* better return too much than too little anyway
|
||||
*/
|
||||
if ((!it_old.it_value.tv_sec && it_old.it_value.tv_usec) ||
|
||||
it_old.it_value.tv_usec >= 500000)
|
||||
it_old.it_value.tv_sec++;
|
||||
|
||||
return it_old.it_value.tv_sec;
|
||||
}
|
||||
|
||||
asmlinkage long sys_setitimer(int which,
|
||||
struct itimerval __user *value,
|
||||
struct itimerval __user *ovalue)
|
||||
{
|
||||
struct itimerval set_buffer, get_buffer;
|
||||
int error;
|
||||
|
||||
if (value) {
|
||||
if(copy_from_user(&set_buffer, value, sizeof(set_buffer)))
|
||||
return -EFAULT;
|
||||
} else
|
||||
memset((char *) &set_buffer, 0, sizeof(set_buffer));
|
||||
|
||||
error = do_setitimer(which, &set_buffer, ovalue ? &get_buffer : NULL);
|
||||
if (error || !ovalue)
|
||||
return error;
|
||||
|
||||
if (copy_to_user(ovalue, &get_buffer, sizeof(get_buffer)))
|
||||
return -EFAULT;
|
||||
return 0;
|
||||
}
|
||||
454
kernel/kallsyms.c
Normal file
454
kernel/kallsyms.c
Normal file
@@ -0,0 +1,454 @@
|
||||
/*
|
||||
* kallsyms.c: in-kernel printing of symbolic oopses and stack traces.
|
||||
*
|
||||
* Rewritten and vastly simplified by Rusty Russell for in-kernel
|
||||
* module loader:
|
||||
* Copyright 2002 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation
|
||||
*
|
||||
* ChangeLog:
|
||||
*
|
||||
* (25/Aug/2004) Paulo Marques <pmarques@grupopie.com>
|
||||
* Changed the compression method from stem compression to "table lookup"
|
||||
* compression (see scripts/kallsyms.c for a more complete description)
|
||||
*/
|
||||
#include <linux/kallsyms.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/seq_file.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/err.h>
|
||||
#include <linux/proc_fs.h>
|
||||
#include <linux/sched.h> /* for cond_resched */
|
||||
#include <linux/mm.h>
|
||||
#include <linux/ctype.h>
|
||||
|
||||
#include <asm/sections.h>
|
||||
|
||||
#ifdef CONFIG_KALLSYMS_ALL
|
||||
#define all_var 1
|
||||
#else
|
||||
#define all_var 0
|
||||
#endif
|
||||
|
||||
/* These will be re-linked against their real values during the second link stage */
|
||||
extern const unsigned long kallsyms_addresses[] __attribute__((weak));
|
||||
extern const unsigned long kallsyms_num_syms __attribute__((weak));
|
||||
extern const u8 kallsyms_names[] __attribute__((weak));
|
||||
|
||||
extern const u8 kallsyms_token_table[] __attribute__((weak));
|
||||
extern const u16 kallsyms_token_index[] __attribute__((weak));
|
||||
|
||||
extern const unsigned long kallsyms_markers[] __attribute__((weak));
|
||||
|
||||
static inline int is_kernel_inittext(unsigned long addr)
|
||||
{
|
||||
if (addr >= (unsigned long)_sinittext
|
||||
&& addr <= (unsigned long)_einittext)
|
||||
return 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline int is_kernel_extratext(unsigned long addr)
|
||||
{
|
||||
if (addr >= (unsigned long)_sextratext
|
||||
&& addr <= (unsigned long)_eextratext)
|
||||
return 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline int is_kernel_text(unsigned long addr)
|
||||
{
|
||||
if (addr >= (unsigned long)_stext && addr <= (unsigned long)_etext)
|
||||
return 1;
|
||||
return in_gate_area_no_task(addr);
|
||||
}
|
||||
|
||||
static inline int is_kernel(unsigned long addr)
|
||||
{
|
||||
if (addr >= (unsigned long)_stext && addr <= (unsigned long)_end)
|
||||
return 1;
|
||||
return in_gate_area_no_task(addr);
|
||||
}
|
||||
|
||||
static int is_ksym_addr(unsigned long addr)
|
||||
{
|
||||
if (all_var)
|
||||
return is_kernel(addr);
|
||||
|
||||
return is_kernel_text(addr) || is_kernel_inittext(addr) ||
|
||||
is_kernel_extratext(addr);
|
||||
}
|
||||
|
||||
/* expand a compressed symbol data into the resulting uncompressed string,
   given the offset to where the symbol is in the compressed stream.
   Writes the NUL-terminated name into @result (the first token char is the
   symbol type and is skipped) and returns the offset of the next symbol. */
static unsigned int kallsyms_expand_symbol(unsigned int off, char *result)
{
	int len, skipped_first = 0;
	const u8 *tptr, *data;

	/* get the compressed symbol length from the first symbol byte */
	data = &kallsyms_names[off];
	len = *data;
	data++;

	/* update the offset to return the offset for the next symbol on
	 * the compressed stream */
	off += len + 1;

	/* for every byte on the compressed symbol data, copy the table
	   entry for that byte */
	while(len) {
		tptr = &kallsyms_token_table[ kallsyms_token_index[*data] ];
		data++;
		len--;

		while (*tptr) {
			/* the very first char encodes the symbol type, not
			 * part of the name -- skip it */
			if(skipped_first) {
				*result = *tptr;
				result++;
			} else
				skipped_first = 1;
			tptr++;
		}
	}

	*result = '\0';

	/* return to offset to the next symbol */
	return off;
}
|
||||
|
||||
/* get symbol type information. This is encoded as a single char at the
 * beginning of the symbol name */
static char kallsyms_get_symbol_type(unsigned int off)
{
	/* get just the first code, look it up in the token table, and return the
	 * first char from this token */
	return kallsyms_token_table[ kallsyms_token_index[ kallsyms_names[off+1] ] ];
}
|
||||
|
||||
|
||||
/* find the offset on the compressed stream given an index in the
 * kallsyms array */
static unsigned int get_symbol_offset(unsigned long pos)
{
	const u8 *name;
	int i;

	/* use the closest marker we have. We have markers every 256 positions,
	 * so that should be close enough */
	name = &kallsyms_names[ kallsyms_markers[pos>>8] ];

	/* sequentially scan all the symbols up to the point we're searching for.
	 * Every symbol is stored in a [<len>][<len> bytes of data] format, so we
	 * just need to add the len to the current pointer for every symbol we
	 * wish to skip */
	for(i = 0; i < (pos&0xFF); i++)
		name = name + (*name) + 1;

	return name - kallsyms_names;
}
|
||||
|
||||
/* Lookup the address for this symbol. Returns 0 if not found.
 * Linearly scans the core kernel symbol table first, then falls back
 * to the module symbol tables. */
unsigned long kallsyms_lookup_name(const char *name)
{
	char namebuf[KSYM_NAME_LEN+1];
	unsigned long i;
	unsigned int off;

	for (i = 0, off = 0; i < kallsyms_num_syms; i++) {
		/* expand advances off to the next symbol as a side effect */
		off = kallsyms_expand_symbol(off, namebuf);

		if (strcmp(namebuf, name) == 0)
			return kallsyms_addresses[i];
	}
	return module_kallsyms_lookup_name(name);
}
|
||||
|
||||
/*
 * Find the index of the symbol containing @addr in kallsyms_addresses.
 * On return *@symbolsize holds the symbol's size and *@offset the
 * distance of @addr from the symbol's start.  Aliased symbols (same
 * address) resolve to the first alias.
 */
static unsigned long get_symbol_pos(unsigned long addr,
				    unsigned long *symbolsize,
				    unsigned long *offset)
{
	unsigned long symbol_start = 0, symbol_end = 0;
	unsigned long i, low, high, mid;

	/* This kernel should never had been booted. */
	BUG_ON(!kallsyms_addresses);

	/* do a binary search on the sorted kallsyms_addresses array */
	low = 0;
	high = kallsyms_num_syms;

	while (high - low > 1) {
		mid = (low + high) / 2;
		if (kallsyms_addresses[mid] <= addr)
			low = mid;
		else
			high = mid;
	}

	/*
	 * search for the first aliased symbol. Aliased
	 * symbols are symbols with the same address
	 */
	while (low && kallsyms_addresses[low-1] == kallsyms_addresses[low])
		--low;

	symbol_start = kallsyms_addresses[low];

	/* Search for next non-aliased symbol */
	for (i = low + 1; i < kallsyms_num_syms; i++) {
		if (kallsyms_addresses[i] > symbol_start) {
			symbol_end = kallsyms_addresses[i];
			break;
		}
	}

	/* if we found no next symbol, we use the end of the section */
	if (!symbol_end) {
		if (is_kernel_inittext(addr))
			symbol_end = (unsigned long)_einittext;
		else if (all_var)
			symbol_end = (unsigned long)_end;
		else
			symbol_end = (unsigned long)_etext;
	}

	*symbolsize = symbol_end - symbol_start;
	*offset = addr - symbol_start;

	return low;
}
|
||||
|
||||
/*
|
||||
* Lookup an address but don't bother to find any names.
|
||||
*/
|
||||
int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize,
|
||||
unsigned long *offset)
|
||||
{
|
||||
if (is_ksym_addr(addr))
|
||||
return !!get_symbol_pos(addr, symbolsize, offset);
|
||||
|
||||
return !!module_address_lookup(addr, symbolsize, offset, NULL);
|
||||
}
|
||||
|
||||
/*
 * Lookup an address
 * - modname is set to NULL if it's in the kernel
 * - we guarantee that the returned name is valid until we reschedule even if
 *   it resides in a module
 * - we also guarantee that modname will be valid until rescheduled
 * Returns a pointer to the symbol name (stored in @namebuf, which must be
 * at least KSYM_NAME_LEN+1 bytes), or NULL if nothing matches.
 */
const char *kallsyms_lookup(unsigned long addr,
			    unsigned long *symbolsize,
			    unsigned long *offset,
			    char **modname, char *namebuf)
{
	const char *msym;

	/* pre-terminate in case a module name gets strncpy'd in below */
	namebuf[KSYM_NAME_LEN] = 0;
	namebuf[0] = 0;

	if (is_ksym_addr(addr)) {
		unsigned long pos;

		pos = get_symbol_pos(addr, symbolsize, offset);
		/* Grab name */
		kallsyms_expand_symbol(get_symbol_offset(pos), namebuf);
		*modname = NULL;
		return namebuf;
	}

	/* see if it's in a module */
	msym = module_address_lookup(addr, symbolsize, offset, modname);
	if (msym)
		return strncpy(namebuf, msym, KSYM_NAME_LEN);

	return NULL;
}
|
||||
|
||||
/* Replace "%s" in @fmt with the symbolic form of @address ("name+off/size
 * [module]"), falling back to the raw hex address if no symbol matches,
 * and printk the result. */
void __print_symbol(const char *fmt, unsigned long address)
{
	char *modname;
	const char *name;
	unsigned long offset, size;
	char namebuf[KSYM_NAME_LEN+1];
	/* sized for the worst case: format skeleton + name + two hex
	 * numbers (BITS_PER_LONG*3/10 digits each) + module name + NUL */
	char buffer[sizeof("%s+%#lx/%#lx [%s]") + KSYM_NAME_LEN +
		    2*(BITS_PER_LONG*3/10) + MODULE_NAME_LEN + 1];

	name = kallsyms_lookup(address, &size, &offset, &modname, namebuf);

	if (!name)
		sprintf(buffer, "0x%lx", address);
	else {
		if (modname)
			sprintf(buffer, "%s+%#lx/%#lx [%s]", name, offset,
				size, modname);
		else
			sprintf(buffer, "%s+%#lx/%#lx", name, offset, size);
	}
	printk(fmt, buffer);
}
|
||||
|
||||
/* To avoid using get_symbol_offset for every symbol, we carry prefix along. */
struct kallsym_iter
{
	loff_t pos;		/* global symbol index (core + modules) */
	struct module *owner;	/* NULL for core kernel symbols */
	unsigned long value;	/* symbol address */
	unsigned int nameoff;	/* If iterating in core kernel symbols */
	char type;		/* nm-style type character */
	char name[KSYM_NAME_LEN+1];
};
|
||||
|
||||
/* Fill @iter from the module symbol tables (positions past the core
 * kernel symbols).  Returns 0 when there are no more module symbols. */
static int get_ksymbol_mod(struct kallsym_iter *iter)
{
	iter->owner = module_get_kallsym(iter->pos - kallsyms_num_syms,
					 &iter->value, &iter->type,
					 iter->name, sizeof(iter->name));
	if (iter->owner == NULL)
		return 0;

	/* Label it "global" if it is exported, "local" if not exported. */
	iter->type = is_exported(iter->name, iter->owner)
		? toupper(iter->type) : tolower(iter->type);

	return 1;
}
|
||||
|
||||
/* Fill @iter with the core kernel symbol at iter->pos/iter->nameoff.
 * Returns space to next name. */
static unsigned long get_ksymbol_core(struct kallsym_iter *iter)
{
	unsigned off = iter->nameoff;

	iter->owner = NULL;
	iter->value = kallsyms_addresses[iter->pos];

	iter->type = kallsyms_get_symbol_type(off);

	off = kallsyms_expand_symbol(off, iter->name);

	return off - iter->nameoff;
}
|
||||
|
||||
/* Reposition @iter to core-kernel symbol index @new_pos. */
static void reset_iter(struct kallsym_iter *iter, loff_t new_pos)
{
	iter->name[0] = '\0';
	iter->nameoff = get_symbol_offset(new_pos);
	iter->pos = new_pos;
}
|
||||
|
||||
/* Advance @iter to position @pos. Returns false if pos at or past end
 * of file. */
static int update_iter(struct kallsym_iter *iter, loff_t pos)
{
	/* Module symbols can be accessed randomly. */
	if (pos >= kallsyms_num_syms) {
		iter->pos = pos;
		return get_ksymbol_mod(iter);
	}

	/* If we're not on the desired position, reset to new position.
	 * Core symbols are sequential, so a non-sequential seek must
	 * rescan from the nearest marker. */
	if (pos != iter->pos)
		reset_iter(iter, pos);

	iter->nameoff += get_ksymbol_core(iter);
	iter->pos++;

	return 1;
}
|
||||
|
||||
/* seq_file .next: step the iterator kept in m->private to the next symbol. */
static void *s_next(struct seq_file *m, void *p, loff_t *pos)
{
	(*pos)++;

	if (!update_iter(m->private, *pos))
		return NULL;
	return p;
}
|
||||
|
||||
/* seq_file .start: position the iterator at *pos; NULL at end of table. */
static void *s_start(struct seq_file *m, loff_t *pos)
{
	if (!update_iter(m->private, *pos))
		return NULL;
	return m->private;
}
|
||||
|
||||
/* seq_file .stop: nothing to release; iterator lives until release(). */
static void s_stop(struct seq_file *m, void *p)
{
}
|
||||
|
||||
/* seq_file .show: emit one /proc/kallsyms line:
 * "<address> <type> <name>[\t[module]]". */
static int s_show(struct seq_file *m, void *p)
{
	struct kallsym_iter *iter = m->private;

	/* Some debugging symbols have no name. Ignore them. */
	if (!iter->name[0])
		return 0;

	if (iter->owner)
		seq_printf(m, "%0*lx %c %s\t[%s]\n",
			   (int)(2*sizeof(void*)),
			   iter->value, iter->type, iter->name,
			   module_name(iter->owner));
	else
		seq_printf(m, "%0*lx %c %s\n",
			   (int)(2*sizeof(void*)),
			   iter->value, iter->type, iter->name);
	return 0;
}
|
||||
|
||||
/* seq_file iteration callbacks for /proc/kallsyms */
static const struct seq_operations kallsyms_op = {
	.start = s_start,
	.next = s_next,
	.stop = s_stop,
	.show = s_show
};
|
||||
|
||||
/* open() handler for /proc/kallsyms: allocate a per-open iterator and
 * attach it to the seq_file. */
static int kallsyms_open(struct inode *inode, struct file *file)
{
	/* We keep iterator in m->private, since normal case is to
	 * s_start from where we left off, so we avoid doing
	 * using get_symbol_offset for every symbol */
	struct kallsym_iter *iter;
	int ret;

	iter = kmalloc(sizeof(*iter), GFP_KERNEL);
	if (!iter)
		return -ENOMEM;
	reset_iter(iter, 0);

	ret = seq_open(file, &kallsyms_op);
	if (ret == 0)
		((struct seq_file *)file->private_data)->private = iter;
	else
		/* seq_open failed: don't leak the iterator */
		kfree(iter);
	return ret;
}
|
||||
|
||||
/* release() handler: free the iterator allocated in kallsyms_open(). */
static int kallsyms_release(struct inode *inode, struct file *file)
{
	struct seq_file *m = (struct seq_file *)file->private_data;
	kfree(m->private);
	return seq_release(inode, file);
}
|
||||
|
||||
/* file_operations for /proc/kallsyms */
static const struct file_operations kallsyms_operations = {
	.open = kallsyms_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = kallsyms_release,
};
|
||||
|
||||
/* Register /proc/kallsyms (read-only) at boot; failure to create the
 * entry is silently tolerated. */
static int __init kallsyms_init(void)
{
	struct proc_dir_entry *entry;

	entry = create_proc_entry("kallsyms", 0444, NULL);
	if (entry)
		entry->proc_fops = &kallsyms_operations;
	return 0;
}
__initcall(kallsyms_init);
|
||||
|
||||
EXPORT_SYMBOL(__print_symbol);
|
||||
1137
kernel/kexec.c
Normal file
1137
kernel/kexec.c
Normal file
File diff suppressed because it is too large
Load Diff
196
kernel/kfifo.c
Normal file
196
kernel/kfifo.c
Normal file
@@ -0,0 +1,196 @@
|
||||
/*
|
||||
* A simple kernel FIFO implementation.
|
||||
*
|
||||
* Copyright (C) 2004 Stelian Pop <stelian@popies.net>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
||||
*
|
||||
*/
|
||||
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/err.h>
|
||||
#include <linux/kfifo.h>
|
||||
|
||||
/**
 * kfifo_init - allocates a new FIFO using a preallocated buffer
 * @buffer: the preallocated buffer to be used.
 * @size: the size of the internal buffer, this have to be a power of 2.
 * @gfp_mask: get_free_pages mask, passed to kmalloc()
 * @lock: the lock to be used to protect the fifo buffer
 *
 * Returns the new fifo, or ERR_PTR(-ENOMEM) if the struct allocation
 * fails.  The caller retains ownership of @buffer.
 *
 * Do NOT pass the kfifo to kfifo_free() after use! Simply free the
 * &struct kfifo with kfree().
 */
struct kfifo *kfifo_init(unsigned char *buffer, unsigned int size,
			 gfp_t gfp_mask, spinlock_t *lock)
{
	struct kfifo *fifo;

	/* size must be a power of 2 */
	BUG_ON(size & (size - 1));

	fifo = kmalloc(sizeof(struct kfifo), gfp_mask);
	if (!fifo)
		return ERR_PTR(-ENOMEM);

	fifo->buffer = buffer;
	fifo->size = size;
	fifo->in = fifo->out = 0;
	fifo->lock = lock;

	return fifo;
}
EXPORT_SYMBOL(kfifo_init);
|
||||
|
||||
/**
 * kfifo_alloc - allocates a new FIFO and its internal buffer
 * @size: the size of the internal buffer to be allocated.
 * @gfp_mask: get_free_pages mask, passed to kmalloc()
 * @lock: the lock to be used to protect the fifo buffer
 *
 * The size will be rounded-up to a power of 2.
 * Returns the new fifo or an ERR_PTR() on allocation failure; free it
 * with kfifo_free().
 */
struct kfifo *kfifo_alloc(unsigned int size, gfp_t gfp_mask, spinlock_t *lock)
{
	unsigned char *buffer;
	struct kfifo *ret;

	/*
	 * round up to the next power of 2, since our 'let the indices
	 * wrap' technique works only in this case.
	 */
	if (size & (size - 1)) {
		BUG_ON(size > 0x80000000);
		size = roundup_pow_of_two(size);
	}

	buffer = kmalloc(size, gfp_mask);
	if (!buffer)
		return ERR_PTR(-ENOMEM);

	ret = kfifo_init(buffer, size, gfp_mask, lock);

	/* kfifo_init only fails on struct allocation; release our buffer */
	if (IS_ERR(ret))
		kfree(buffer);

	return ret;
}
EXPORT_SYMBOL(kfifo_alloc);
|
||||
|
||||
/**
 * kfifo_free - frees the FIFO
 * @fifo: the fifo to be freed.
 *
 * Only for fifos created with kfifo_alloc(); both the internal buffer
 * and the struct are released.
 */
void kfifo_free(struct kfifo *fifo)
{
	kfree(fifo->buffer);
	kfree(fifo);
}
EXPORT_SYMBOL(kfifo_free);
|
||||
|
||||
/**
 * __kfifo_put - puts some data into the FIFO, no locking version
 * @fifo: the fifo to be used.
 * @buffer: the data to be added.
 * @len: the length of the data to be added.
 *
 * This function copies at most @len bytes from the @buffer into
 * the FIFO depending on the free space, and returns the number of
 * bytes copied.
 *
 * Note that with only one concurrent reader and one concurrent
 * writer, you don't need extra locking to use these functions.
 */
unsigned int __kfifo_put(struct kfifo *fifo,
			 unsigned char *buffer, unsigned int len)
{
	unsigned int l;

	/* clamp to free space; in/out are free-running, the unsigned
	 * subtraction handles wraparound */
	len = min(len, fifo->size - fifo->in + fifo->out);

	/*
	 * Ensure that we sample the fifo->out index -before- we
	 * start putting bytes into the kfifo.
	 */

	smp_mb();

	/* first put the data starting from fifo->in to buffer end */
	l = min(len, fifo->size - (fifo->in & (fifo->size - 1)));
	memcpy(fifo->buffer + (fifo->in & (fifo->size - 1)), buffer, l);

	/* then put the rest (if any) at the beginning of the buffer */
	memcpy(fifo->buffer, buffer + l, len - l);

	/*
	 * Ensure that we add the bytes to the kfifo -before-
	 * we update the fifo->in index.
	 */

	smp_wmb();

	fifo->in += len;

	return len;
}
EXPORT_SYMBOL(__kfifo_put);
|
||||
|
||||
/**
 * __kfifo_get - gets some data from the FIFO, no locking version
 * @fifo: the fifo to be used.
 * @buffer: where the data must be copied.
 * @len: the size of the destination buffer.
 *
 * This function copies at most @len bytes from the FIFO into the
 * @buffer and returns the number of copied bytes.
 *
 * Note that with only one concurrent reader and one concurrent
 * writer, you don't need extra locking to use these functions.
 */
unsigned int __kfifo_get(struct kfifo *fifo,
			 unsigned char *buffer, unsigned int len)
{
	unsigned int l;

	/* clamp to the number of bytes available */
	len = min(len, fifo->in - fifo->out);

	/*
	 * Ensure that we sample the fifo->in index -before- we
	 * start removing bytes from the kfifo.
	 */

	smp_rmb();

	/* first get the data from fifo->out until the end of the buffer */
	l = min(len, fifo->size - (fifo->out & (fifo->size - 1)));
	memcpy(buffer, fifo->buffer + (fifo->out & (fifo->size - 1)), l);

	/* then get the rest (if any) from the beginning of the buffer */
	memcpy(buffer + l, fifo->buffer, len - l);

	/*
	 * Ensure that we remove the bytes from the kfifo -before-
	 * we update the fifo->out index.
	 */

	smp_mb();

	fifo->out += len;

	return len;
}
EXPORT_SYMBOL(__kfifo_get);
|
||||
351
kernel/kmod.c
Normal file
351
kernel/kmod.c
Normal file
@@ -0,0 +1,351 @@
|
||||
/*
|
||||
kmod, the new module loader (replaces kerneld)
|
||||
Kirk Petersen
|
||||
|
||||
Reorganized not to be a daemon by Adam Richter, with guidance
|
||||
from Greg Zornetzer.
|
||||
|
||||
Modified to avoid chroot and file sharing problems.
|
||||
Mikael Pettersson
|
||||
|
||||
Limit the concurrent number of kmod modprobes to catch loops from
|
||||
"modprobe needs a service that is in a module".
|
||||
Keith Owens <kaos@ocs.com.au> December 1999
|
||||
|
||||
Unblock all signals when we exec a usermode process.
|
||||
Shuu Yamaguchi <shuu@wondernetworkresources.com> December 2000
|
||||
|
||||
call_usermodehelper wait flag, and remove exec_usermodehelper.
|
||||
Rusty Russell <rusty@rustcorp.com.au> Jan 2003
|
||||
*/
|
||||
#include <linux/module.h>
|
||||
#include <linux/sched.h>
|
||||
#include <linux/syscalls.h>
|
||||
#include <linux/unistd.h>
|
||||
#include <linux/kmod.h>
|
||||
#include <linux/smp_lock.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/mnt_namespace.h>
|
||||
#include <linux/completion.h>
|
||||
#include <linux/file.h>
|
||||
#include <linux/workqueue.h>
|
||||
#include <linux/security.h>
|
||||
#include <linux/mount.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/resource.h>
|
||||
#include <asm/uaccess.h>
|
||||
|
||||
extern int max_threads;
|
||||
|
||||
static struct workqueue_struct *khelper_wq;
|
||||
|
||||
#ifdef CONFIG_KMOD
|
||||
|
||||
/*
|
||||
modprobe_path is set via /proc/sys.
|
||||
*/
|
||||
char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe";
|
||||
|
||||
/**
 * request_module - try to load a kernel module
 * @fmt:     printf style format string for the name of the module
 * @varargs: arguments as specified in the format string
 *
 * Load a module using the user mode module loader. The function returns
 * zero on success or a negative errno code on failure. Note that a
 * successful module load does not mean the module did not then unload
 * and exit on an error of its own. Callers must check that the service
 * they requested is now available not blindly invoke it.
 *
 * If module auto-loading support is disabled then this function
 * becomes a no-operation.
 */
int request_module(const char *fmt, ...)
{
	va_list args;
	char module_name[MODULE_NAME_LEN];
	unsigned int max_modprobes;
	int ret;
	char *argv[] = { modprobe_path, "-q", "--", module_name, NULL };
	static char *envp[] = { "HOME=/",
				"TERM=linux",
				"PATH=/sbin:/usr/sbin:/bin:/usr/bin",
				NULL };
	static atomic_t kmod_concurrent = ATOMIC_INIT(0);
#define MAX_KMOD_CONCURRENT 50	/* Completely arbitrary value - KAO */
	static int kmod_loop_msg;

	va_start(args, fmt);
	ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args);
	va_end(args);
	if (ret >= MODULE_NAME_LEN)
		return -ENAMETOOLONG;

	/* If modprobe needs a service that is in a module, we get a recursive
	 * loop.  Limit the number of running kmod threads to max_threads/2 or
	 * MAX_KMOD_CONCURRENT, whichever is the smaller.  A cleaner method
	 * would be to run the parents of this process, counting how many times
	 * kmod was invoked.  That would mean accessing the internals of the
	 * process tables to get the command line, proc_pid_cmdline is static
	 * and it is not worth changing the proc code just to handle this case.
	 * KAO.
	 *
	 * "trace the ppid" is simple, but will fail if someone's
	 * parent exits.  I think this is as good as it gets. --RR
	 */
	max_modprobes = min(max_threads/2, MAX_KMOD_CONCURRENT);
	atomic_inc(&kmod_concurrent);
	if (atomic_read(&kmod_concurrent) > max_modprobes) {
		/* We may be blaming an innocent here, but unlikely */
		if (kmod_loop_msg++ < 5)
			printk(KERN_ERR
			       "request_module: runaway loop modprobe %s\n",
			       module_name);
		atomic_dec(&kmod_concurrent);
		return -ENOMEM;
	}

	/* wait==1: block until modprobe exits so callers can retry lookup */
	ret = call_usermodehelper(modprobe_path, argv, envp, 1);
	atomic_dec(&kmod_concurrent);
	return ret;
}
EXPORT_SYMBOL(request_module);
|
||||
#endif /* CONFIG_KMOD */
|
||||
|
||||
/* Everything needed to launch one usermode helper, passed to khelper. */
struct subprocess_info {
	struct work_struct work;	/* queued on khelper_wq */
	struct completion *complete;	/* signalled when helper is done */
	char *path;			/* binary to exec */
	char **argv;
	char **envp;
	struct key *ring;		/* session keyring, may be NULL */
	int wait;			/* <0: fire-and-forget, 0: wait for
					 * exec, >0: wait for exit status */
	int retval;			/* exec/exit result for the caller */
	struct file *stdin;		/* optional pipe installed as fd 0 */
};
|
||||
|
||||
/*
 * This is the task which runs the usermode application
 */
static int ____call_usermodehelper(void *data)
{
	struct subprocess_info *sub_info = data;
	struct key *new_session, *old_session;
	int retval;

	/* Unblock all signals and set the session keyring. */
	new_session = key_get(sub_info->ring);
	flush_signals(current);
	spin_lock_irq(&current->sighand->siglock);
	old_session = __install_session_keyring(current, new_session);
	flush_signal_handlers(current, 1);
	sigemptyset(&current->blocked);
	recalc_sigpending();
	spin_unlock_irq(&current->sighand->siglock);

	key_put(old_session);

	/* Install input pipe when needed */
	if (sub_info->stdin) {
		struct files_struct *f = current->files;
		struct fdtable *fdt;
		/* no races because files should be private here */
		sys_close(0);
		fd_install(0, sub_info->stdin);
		spin_lock(&f->file_lock);
		fdt = files_fdtable(f);
		FD_SET(0, fdt->open_fds);
		FD_CLR(0, fdt->close_on_exec);
		spin_unlock(&f->file_lock);

		/* and disallow core files too */
		current->signal->rlim[RLIMIT_CORE] = (struct rlimit){0, 0};
	}

	/* We can run anywhere, unlike our parent keventd(). */
	set_cpus_allowed(current, CPU_MASK_ALL);

	retval = -EPERM;
	if (current->fs->root)
		retval = kernel_execve(sub_info->path,
				sub_info->argv, sub_info->envp);

	/* Exec failed?  (on success kernel_execve does not return here) */
	sub_info->retval = retval;
	do_exit(0);
}
|
||||
|
||||
/* Keventd can't block, but this (a child) can.  Spawns the helper and
 * reaps it, recording the exit status in sub_info->retval. */
static int wait_for_helper(void *data)
{
	struct subprocess_info *sub_info = data;
	pid_t pid;
	struct k_sigaction sa;

	/* Install a handler: if SIGCLD isn't handled sys_wait4 won't
	 * populate the status, but will return -ECHILD. */
	sa.sa.sa_handler = SIG_IGN;
	sa.sa.sa_flags = 0;
	siginitset(&sa.sa.sa_mask, sigmask(SIGCHLD));
	do_sigaction(SIGCHLD, &sa, NULL);
	allow_signal(SIGCHLD);

	pid = kernel_thread(____call_usermodehelper, sub_info, SIGCHLD);
	if (pid < 0) {
		sub_info->retval = pid;
	} else {
		int ret;

		/*
		 * Normally it is bogus to call wait4() from in-kernel because
		 * wait4() wants to write the exit code to a userspace address.
		 * But wait_for_helper() always runs as keventd, and put_user()
		 * to a kernel address works OK for kernel threads, due to their
		 * having an mm_segment_t which spans the entire address space.
		 *
		 * Thus the __user pointer cast is valid here.
		 */
		sys_wait4(pid, (int __user *)&ret, 0, NULL);

		/*
		 * If ret is 0, either ____call_usermodehelper failed and the
		 * real error code is already in sub_info->retval or
		 * sub_info->retval is 0 anyway, so don't mess with it then.
		 */
		if (ret)
			sub_info->retval = ret;
	}

	/* wait < 0: nobody is waiting; we own sub_info and must free it */
	if (sub_info->wait < 0)
		kfree(sub_info);
	else
		complete(sub_info->complete);
	return 0;
}
|
||||
|
||||
/* This is run by khelper thread  */
static void __call_usermodehelper(struct work_struct *work)
{
	struct subprocess_info *sub_info =
		container_of(work, struct subprocess_info, work);
	pid_t pid;
	int wait = sub_info->wait;

	/* CLONE_VFORK: wait until the usermode helper has execve'd
	 * successfully We need the data structures to stay around
	 * until that is done.  */
	if (wait)
		/* indirect via wait_for_helper so we can reap exit status */
		pid = kernel_thread(wait_for_helper, sub_info,
				    CLONE_FS | CLONE_FILES | SIGCHLD);
	else
		pid = kernel_thread(____call_usermodehelper, sub_info,
				    CLONE_VFORK | SIGCHLD);

	/* wait < 0: fire-and-forget; wait_for_helper frees sub_info and
	 * nobody is blocked on the completion */
	if (wait < 0)
		return;

	if (pid < 0) {
		sub_info->retval = pid;
		complete(sub_info->complete);
	} else if (!wait)
		complete(sub_info->complete);
}
|
||||
|
||||
/**
 * call_usermodehelper_keys - start a usermode application
 * @path: pathname for the application
 * @argv: null-terminated argument list
 * @envp: null-terminated environment list
 * @session_keyring: session keyring for process (NULL for an empty keyring)
 * @wait: wait for the application to finish and return status.
 *        when -1 don't wait at all, but you get no useful error back when
 *        the program couldn't be exec'ed. This makes it safe to call
 *        from interrupt context.
 *
 * Runs a user-space application. The application is started
 * asynchronously if wait is not set, and runs as a child of keventd.
 * (ie. it runs with full root capabilities).
 *
 * Must be called from process context.  Returns a negative error code
 * if program was not execed successfully, or 0.
 */
int call_usermodehelper_keys(char *path, char **argv, char **envp,
			     struct key *session_keyring, int wait)
{
	DECLARE_COMPLETION_ONSTACK(done);
	struct subprocess_info *sub_info;
	int retval;

	if (!khelper_wq)
		return -EBUSY;

	if (path[0] == '\0')
		return 0;

	/* GFP_ATOMIC: may be called with wait==-1 from atomic context */
	sub_info = kzalloc(sizeof(struct subprocess_info), GFP_ATOMIC);
	if (!sub_info)
		return -ENOMEM;

	INIT_WORK(&sub_info->work, __call_usermodehelper);
	sub_info->complete = &done;
	sub_info->path = path;
	sub_info->argv = argv;
	sub_info->envp = envp;
	sub_info->ring = session_keyring;
	sub_info->wait = wait;

	queue_work(khelper_wq, &sub_info->work);
	if (wait < 0) /* task has freed sub_info */
		return 0;
	wait_for_completion(&done);
	retval = sub_info->retval;
	kfree(sub_info);
	return retval;
}
EXPORT_SYMBOL(call_usermodehelper_keys);
|
||||
|
||||
/*
 * call_usermodehelper_pipe - start a usermode helper with a pipe on stdin
 * @path: pathname for the application
 * @argv: null-terminated argument list
 * @envp: null-terminated environment list
 * @filp: out: write end of the pipe feeding the helper's fd 0
 *
 * Waits for the helper to finish and returns its status.  On success
 * the caller owns *@filp and must close it when done writing.
 */
int call_usermodehelper_pipe(char *path, char **argv, char **envp,
			     struct file **filp)
{
	DECLARE_COMPLETION(done);
	struct subprocess_info sub_info = {
		.work		= __WORK_INITIALIZER(sub_info.work,
						     __call_usermodehelper),
		.complete	= &done,
		.path		= path,
		.argv		= argv,
		.envp		= envp,
		.retval		= 0,
	};
	struct file *f;

	if (!khelper_wq)
		return -EBUSY;

	if (path[0] == '\0')
		return 0;

	f = create_write_pipe();
	if (IS_ERR(f))
		return PTR_ERR(f);
	*filp = f;

	f = create_read_pipe(f);
	if (IS_ERR(f)) {
		free_write_pipe(*filp);
		return PTR_ERR(f);
	}
	/* read end becomes the helper's stdin (installed as fd 0) */
	sub_info.stdin = f;

	queue_work(khelper_wq, &sub_info.work);
	wait_for_completion(&done);
	return sub_info.retval;
}
EXPORT_SYMBOL(call_usermodehelper_pipe);
|
||||
|
||||
/* Boot-time setup: create the single-threaded workqueue all usermode
 * helpers are launched from. */
void __init usermodehelper_init(void)
{
	khelper_wq = create_singlethread_workqueue("khelper");
	BUG_ON(!khelper_wq);
}
|
||||
936
kernel/kprobes.c
Normal file
936
kernel/kprobes.c
Normal file
@@ -0,0 +1,936 @@
|
||||
/*
|
||||
* Kernel Probes (KProbes)
|
||||
* kernel/kprobes.c
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||
*
|
||||
* Copyright (C) IBM Corporation, 2002, 2004
|
||||
*
|
||||
* 2002-Oct Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel
|
||||
* Probes initial implementation (includes suggestions from
|
||||
* Rusty Russell).
|
||||
* 2004-Aug Updated by Prasanna S Panchamukhi <prasanna@in.ibm.com> with
|
||||
* hlists and exceptions notifier as suggested by Andi Kleen.
|
||||
* 2004-July Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes
|
||||
* interface to access function arguments.
|
||||
* 2004-Sep Prasanna S Panchamukhi <prasanna@in.ibm.com> Changed Kprobes
|
||||
* exceptions notifier to be first on the priority list.
|
||||
* 2005-May Hien Nguyen <hien@us.ibm.com>, Jim Keniston
|
||||
* <jkenisto@us.ibm.com> and Prasanna S Panchamukhi
|
||||
* <prasanna@in.ibm.com> added function-return probes.
|
||||
*/
|
||||
#include <linux/kprobes.h>
|
||||
#include <linux/hash.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/moduleloader.h>
|
||||
#include <linux/kallsyms.h>
|
||||
#include <linux/freezer.h>
|
||||
#include <linux/seq_file.h>
|
||||
#include <linux/debugfs.h>
|
||||
#include <asm-generic/sections.h>
|
||||
#include <asm/cacheflush.h>
|
||||
#include <asm/errno.h>
|
||||
#include <asm/kdebug.h>
|
||||
|
||||
#define KPROBE_HASH_BITS 6
|
||||
#define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS)
|
||||
|
||||
|
||||
/*
|
||||
* Some oddball architectures like 64bit powerpc have function descriptors
|
||||
* so this must be overridable.
|
||||
*/
|
||||
#ifndef kprobe_lookup_name
|
||||
#define kprobe_lookup_name(name, addr) \
|
||||
addr = ((kprobe_opcode_t *)(kallsyms_lookup_name(name)))
|
||||
#endif
|
||||
|
||||
static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE];
|
||||
static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
|
||||
static atomic_t kprobe_count;
|
||||
|
||||
DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */
|
||||
DEFINE_SPINLOCK(kretprobe_lock); /* Protects kretprobe_inst_table */
|
||||
static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
|
||||
|
||||
static struct notifier_block kprobe_page_fault_nb = {
|
||||
.notifier_call = kprobe_exceptions_notify,
|
||||
.priority = 0x7fffffff /* we need to notified first */
|
||||
};
|
||||
|
||||
#ifdef __ARCH_WANT_KPROBES_INSN_SLOT
|
||||
/*
|
||||
* kprobe->ainsn.insn points to the copy of the instruction to be
|
||||
* single-stepped. x86_64, POWER4 and above have no-exec support and
|
||||
* stepping on the instruction on a vmalloced/kmalloced/data page
|
||||
* is a recipe for disaster
|
||||
*/
|
||||
#define INSNS_PER_PAGE (PAGE_SIZE/(MAX_INSN_SIZE * sizeof(kprobe_opcode_t)))
|
||||
|
||||
struct kprobe_insn_page {
|
||||
struct hlist_node hlist;
|
||||
kprobe_opcode_t *insns; /* Page of instruction slots */
|
||||
char slot_used[INSNS_PER_PAGE];
|
||||
int nused;
|
||||
int ngarbage;
|
||||
};
|
||||
|
||||
enum kprobe_slot_state {
|
||||
SLOT_CLEAN = 0,
|
||||
SLOT_DIRTY = 1,
|
||||
SLOT_USED = 2,
|
||||
};
|
||||
|
||||
static struct hlist_head kprobe_insn_pages;
|
||||
static int kprobe_garbage_slots;
|
||||
static int collect_garbage_slots(void);
|
||||
|
||||
static int __kprobes check_safety(void)
|
||||
{
|
||||
int ret = 0;
|
||||
#if defined(CONFIG_PREEMPT) && defined(CONFIG_PM)
|
||||
ret = freeze_processes();
|
||||
if (ret == 0) {
|
||||
struct task_struct *p, *q;
|
||||
do_each_thread(p, q) {
|
||||
if (p != current && p->state == TASK_RUNNING &&
|
||||
p->pid != 0) {
|
||||
printk("Check failed: %s is running\n",p->comm);
|
||||
ret = -1;
|
||||
goto loop_end;
|
||||
}
|
||||
} while_each_thread(p, q);
|
||||
}
|
||||
loop_end:
|
||||
thaw_processes();
|
||||
#else
|
||||
synchronize_sched();
|
||||
#endif
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**
|
||||
* get_insn_slot() - Find a slot on an executable page for an instruction.
|
||||
* We allocate an executable page if there's no room on existing ones.
|
||||
*/
|
||||
kprobe_opcode_t __kprobes *get_insn_slot(void)
|
||||
{
|
||||
struct kprobe_insn_page *kip;
|
||||
struct hlist_node *pos;
|
||||
|
||||
retry:
|
||||
hlist_for_each(pos, &kprobe_insn_pages) {
|
||||
kip = hlist_entry(pos, struct kprobe_insn_page, hlist);
|
||||
if (kip->nused < INSNS_PER_PAGE) {
|
||||
int i;
|
||||
for (i = 0; i < INSNS_PER_PAGE; i++) {
|
||||
if (kip->slot_used[i] == SLOT_CLEAN) {
|
||||
kip->slot_used[i] = SLOT_USED;
|
||||
kip->nused++;
|
||||
return kip->insns + (i * MAX_INSN_SIZE);
|
||||
}
|
||||
}
|
||||
/* Surprise! No unused slots. Fix kip->nused. */
|
||||
kip->nused = INSNS_PER_PAGE;
|
||||
}
|
||||
}
|
||||
|
||||
/* If there are any garbage slots, collect it and try again. */
|
||||
if (kprobe_garbage_slots && collect_garbage_slots() == 0) {
|
||||
goto retry;
|
||||
}
|
||||
/* All out of space. Need to allocate a new page. Use slot 0. */
|
||||
kip = kmalloc(sizeof(struct kprobe_insn_page), GFP_KERNEL);
|
||||
if (!kip) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
* Use module_alloc so this page is within +/- 2GB of where the
|
||||
* kernel image and loaded module images reside. This is required
|
||||
* so x86_64 can correctly handle the %rip-relative fixups.
|
||||
*/
|
||||
kip->insns = module_alloc(PAGE_SIZE);
|
||||
if (!kip->insns) {
|
||||
kfree(kip);
|
||||
return NULL;
|
||||
}
|
||||
INIT_HLIST_NODE(&kip->hlist);
|
||||
hlist_add_head(&kip->hlist, &kprobe_insn_pages);
|
||||
memset(kip->slot_used, SLOT_CLEAN, INSNS_PER_PAGE);
|
||||
kip->slot_used[0] = SLOT_USED;
|
||||
kip->nused = 1;
|
||||
kip->ngarbage = 0;
|
||||
return kip->insns;
|
||||
}
|
||||
|
||||
/* Return 1 if all garbages are collected, otherwise 0. */
|
||||
static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx)
|
||||
{
|
||||
kip->slot_used[idx] = SLOT_CLEAN;
|
||||
kip->nused--;
|
||||
if (kip->nused == 0) {
|
||||
/*
|
||||
* Page is no longer in use. Free it unless
|
||||
* it's the last one. We keep the last one
|
||||
* so as not to have to set it up again the
|
||||
* next time somebody inserts a probe.
|
||||
*/
|
||||
hlist_del(&kip->hlist);
|
||||
if (hlist_empty(&kprobe_insn_pages)) {
|
||||
INIT_HLIST_NODE(&kip->hlist);
|
||||
hlist_add_head(&kip->hlist,
|
||||
&kprobe_insn_pages);
|
||||
} else {
|
||||
module_free(NULL, kip->insns);
|
||||
kfree(kip);
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int __kprobes collect_garbage_slots(void)
|
||||
{
|
||||
struct kprobe_insn_page *kip;
|
||||
struct hlist_node *pos, *next;
|
||||
|
||||
/* Ensure no-one is preepmted on the garbages */
|
||||
if (check_safety() != 0)
|
||||
return -EAGAIN;
|
||||
|
||||
hlist_for_each_safe(pos, next, &kprobe_insn_pages) {
|
||||
int i;
|
||||
kip = hlist_entry(pos, struct kprobe_insn_page, hlist);
|
||||
if (kip->ngarbage == 0)
|
||||
continue;
|
||||
kip->ngarbage = 0; /* we will collect all garbages */
|
||||
for (i = 0; i < INSNS_PER_PAGE; i++) {
|
||||
if (kip->slot_used[i] == SLOT_DIRTY &&
|
||||
collect_one_slot(kip, i))
|
||||
break;
|
||||
}
|
||||
}
|
||||
kprobe_garbage_slots = 0;
|
||||
return 0;
|
||||
}
|
||||
|
||||
void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty)
|
||||
{
|
||||
struct kprobe_insn_page *kip;
|
||||
struct hlist_node *pos;
|
||||
|
||||
hlist_for_each(pos, &kprobe_insn_pages) {
|
||||
kip = hlist_entry(pos, struct kprobe_insn_page, hlist);
|
||||
if (kip->insns <= slot &&
|
||||
slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) {
|
||||
int i = (slot - kip->insns) / MAX_INSN_SIZE;
|
||||
if (dirty) {
|
||||
kip->slot_used[i] = SLOT_DIRTY;
|
||||
kip->ngarbage++;
|
||||
} else {
|
||||
collect_one_slot(kip, i);
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (dirty && (++kprobe_garbage_slots > INSNS_PER_PAGE)) {
|
||||
collect_garbage_slots();
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
/* We have preemption disabled.. so it is safe to use __ versions */
|
||||
static inline void set_kprobe_instance(struct kprobe *kp)
|
||||
{
|
||||
__get_cpu_var(kprobe_instance) = kp;
|
||||
}
|
||||
|
||||
static inline void reset_kprobe_instance(void)
|
||||
{
|
||||
__get_cpu_var(kprobe_instance) = NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
* This routine is called either:
|
||||
* - under the kprobe_mutex - during kprobe_[un]register()
|
||||
* OR
|
||||
* - with preemption disabled - from arch/xxx/kernel/kprobes.c
|
||||
*/
|
||||
struct kprobe __kprobes *get_kprobe(void *addr)
|
||||
{
|
||||
struct hlist_head *head;
|
||||
struct hlist_node *node;
|
||||
struct kprobe *p;
|
||||
|
||||
head = &kprobe_table[hash_ptr(addr, KPROBE_HASH_BITS)];
|
||||
hlist_for_each_entry_rcu(p, node, head, hlist) {
|
||||
if (p->addr == addr)
|
||||
return p;
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
* Aggregate handlers for multiple kprobes support - these handlers
|
||||
* take care of invoking the individual kprobe handlers on p->list
|
||||
*/
|
||||
static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs)
|
||||
{
|
||||
struct kprobe *kp;
|
||||
|
||||
list_for_each_entry_rcu(kp, &p->list, list) {
|
||||
if (kp->pre_handler) {
|
||||
set_kprobe_instance(kp);
|
||||
if (kp->pre_handler(kp, regs))
|
||||
return 1;
|
||||
}
|
||||
reset_kprobe_instance();
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
|
||||
unsigned long flags)
|
||||
{
|
||||
struct kprobe *kp;
|
||||
|
||||
list_for_each_entry_rcu(kp, &p->list, list) {
|
||||
if (kp->post_handler) {
|
||||
set_kprobe_instance(kp);
|
||||
kp->post_handler(kp, regs, flags);
|
||||
reset_kprobe_instance();
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs,
|
||||
int trapnr)
|
||||
{
|
||||
struct kprobe *cur = __get_cpu_var(kprobe_instance);
|
||||
|
||||
/*
|
||||
* if we faulted "during" the execution of a user specified
|
||||
* probe handler, invoke just that probe's fault handler
|
||||
*/
|
||||
if (cur && cur->fault_handler) {
|
||||
if (cur->fault_handler(cur, regs, trapnr))
|
||||
return 1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs)
|
||||
{
|
||||
struct kprobe *cur = __get_cpu_var(kprobe_instance);
|
||||
int ret = 0;
|
||||
|
||||
if (cur && cur->break_handler) {
|
||||
if (cur->break_handler(cur, regs))
|
||||
ret = 1;
|
||||
}
|
||||
reset_kprobe_instance();
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* Walks the list and increments nmissed count for multiprobe case */
|
||||
void __kprobes kprobes_inc_nmissed_count(struct kprobe *p)
|
||||
{
|
||||
struct kprobe *kp;
|
||||
if (p->pre_handler != aggr_pre_handler) {
|
||||
p->nmissed++;
|
||||
} else {
|
||||
list_for_each_entry_rcu(kp, &p->list, list)
|
||||
kp->nmissed++;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
/* Called with kretprobe_lock held */
|
||||
struct kretprobe_instance __kprobes *get_free_rp_inst(struct kretprobe *rp)
|
||||
{
|
||||
struct hlist_node *node;
|
||||
struct kretprobe_instance *ri;
|
||||
hlist_for_each_entry(ri, node, &rp->free_instances, uflist)
|
||||
return ri;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* Called with kretprobe_lock held */
|
||||
static struct kretprobe_instance __kprobes *get_used_rp_inst(struct kretprobe
|
||||
*rp)
|
||||
{
|
||||
struct hlist_node *node;
|
||||
struct kretprobe_instance *ri;
|
||||
hlist_for_each_entry(ri, node, &rp->used_instances, uflist)
|
||||
return ri;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* Called with kretprobe_lock held */
|
||||
void __kprobes add_rp_inst(struct kretprobe_instance *ri)
|
||||
{
|
||||
/*
|
||||
* Remove rp inst off the free list -
|
||||
* Add it back when probed function returns
|
||||
*/
|
||||
hlist_del(&ri->uflist);
|
||||
|
||||
/* Add rp inst onto table */
|
||||
INIT_HLIST_NODE(&ri->hlist);
|
||||
hlist_add_head(&ri->hlist,
|
||||
&kretprobe_inst_table[hash_ptr(ri->task, KPROBE_HASH_BITS)]);
|
||||
|
||||
/* Also add this rp inst to the used list. */
|
||||
INIT_HLIST_NODE(&ri->uflist);
|
||||
hlist_add_head(&ri->uflist, &ri->rp->used_instances);
|
||||
}
|
||||
|
||||
/* Called with kretprobe_lock held */
|
||||
void __kprobes recycle_rp_inst(struct kretprobe_instance *ri,
|
||||
struct hlist_head *head)
|
||||
{
|
||||
/* remove rp inst off the rprobe_inst_table */
|
||||
hlist_del(&ri->hlist);
|
||||
if (ri->rp) {
|
||||
/* remove rp inst off the used list */
|
||||
hlist_del(&ri->uflist);
|
||||
/* put rp inst back onto the free list */
|
||||
INIT_HLIST_NODE(&ri->uflist);
|
||||
hlist_add_head(&ri->uflist, &ri->rp->free_instances);
|
||||
} else
|
||||
/* Unregistering */
|
||||
hlist_add_head(&ri->hlist, head);
|
||||
}
|
||||
|
||||
struct hlist_head __kprobes *kretprobe_inst_table_head(struct task_struct *tsk)
|
||||
{
|
||||
return &kretprobe_inst_table[hash_ptr(tsk, KPROBE_HASH_BITS)];
|
||||
}
|
||||
|
||||
/*
|
||||
* This function is called from finish_task_switch when task tk becomes dead,
|
||||
* so that we can recycle any function-return probe instances associated
|
||||
* with this task. These left over instances represent probed functions
|
||||
* that have been called but will never return.
|
||||
*/
|
||||
void __kprobes kprobe_flush_task(struct task_struct *tk)
|
||||
{
|
||||
struct kretprobe_instance *ri;
|
||||
struct hlist_head *head, empty_rp;
|
||||
struct hlist_node *node, *tmp;
|
||||
unsigned long flags = 0;
|
||||
|
||||
INIT_HLIST_HEAD(&empty_rp);
|
||||
spin_lock_irqsave(&kretprobe_lock, flags);
|
||||
head = kretprobe_inst_table_head(tk);
|
||||
hlist_for_each_entry_safe(ri, node, tmp, head, hlist) {
|
||||
if (ri->task == tk)
|
||||
recycle_rp_inst(ri, &empty_rp);
|
||||
}
|
||||
spin_unlock_irqrestore(&kretprobe_lock, flags);
|
||||
|
||||
hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) {
|
||||
hlist_del(&ri->hlist);
|
||||
kfree(ri);
|
||||
}
|
||||
}
|
||||
|
||||
static inline void free_rp_inst(struct kretprobe *rp)
|
||||
{
|
||||
struct kretprobe_instance *ri;
|
||||
while ((ri = get_free_rp_inst(rp)) != NULL) {
|
||||
hlist_del(&ri->uflist);
|
||||
kfree(ri);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Keep all fields in the kprobe consistent
|
||||
*/
|
||||
static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p)
|
||||
{
|
||||
memcpy(&p->opcode, &old_p->opcode, sizeof(kprobe_opcode_t));
|
||||
memcpy(&p->ainsn, &old_p->ainsn, sizeof(struct arch_specific_insn));
|
||||
}
|
||||
|
||||
/*
|
||||
* Add the new probe to old_p->list. Fail if this is the
|
||||
* second jprobe at the address - two jprobes can't coexist
|
||||
*/
|
||||
static int __kprobes add_new_kprobe(struct kprobe *old_p, struct kprobe *p)
|
||||
{
|
||||
if (p->break_handler) {
|
||||
if (old_p->break_handler)
|
||||
return -EEXIST;
|
||||
list_add_tail_rcu(&p->list, &old_p->list);
|
||||
old_p->break_handler = aggr_break_handler;
|
||||
} else
|
||||
list_add_rcu(&p->list, &old_p->list);
|
||||
if (p->post_handler && !old_p->post_handler)
|
||||
old_p->post_handler = aggr_post_handler;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Fill in the required fields of the "manager kprobe". Replace the
|
||||
* earlier kprobe in the hlist with the manager kprobe
|
||||
*/
|
||||
static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
|
||||
{
|
||||
copy_kprobe(p, ap);
|
||||
flush_insn_slot(ap);
|
||||
ap->addr = p->addr;
|
||||
ap->pre_handler = aggr_pre_handler;
|
||||
ap->fault_handler = aggr_fault_handler;
|
||||
if (p->post_handler)
|
||||
ap->post_handler = aggr_post_handler;
|
||||
if (p->break_handler)
|
||||
ap->break_handler = aggr_break_handler;
|
||||
|
||||
INIT_LIST_HEAD(&ap->list);
|
||||
list_add_rcu(&p->list, &ap->list);
|
||||
|
||||
hlist_replace_rcu(&p->hlist, &ap->hlist);
|
||||
}
|
||||
|
||||
/*
|
||||
* This is the second or subsequent kprobe at the address - handle
|
||||
* the intricacies
|
||||
*/
|
||||
static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
|
||||
struct kprobe *p)
|
||||
{
|
||||
int ret = 0;
|
||||
struct kprobe *ap;
|
||||
|
||||
if (old_p->pre_handler == aggr_pre_handler) {
|
||||
copy_kprobe(old_p, p);
|
||||
ret = add_new_kprobe(old_p, p);
|
||||
} else {
|
||||
ap = kzalloc(sizeof(struct kprobe), GFP_KERNEL);
|
||||
if (!ap)
|
||||
return -ENOMEM;
|
||||
add_aggr_kprobe(ap, old_p);
|
||||
copy_kprobe(ap, p);
|
||||
ret = add_new_kprobe(ap, p);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int __kprobes in_kprobes_functions(unsigned long addr)
|
||||
{
|
||||
if (addr >= (unsigned long)__kprobes_text_start
|
||||
&& addr < (unsigned long)__kprobes_text_end)
|
||||
return -EINVAL;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int __kprobes __register_kprobe(struct kprobe *p,
|
||||
unsigned long called_from)
|
||||
{
|
||||
int ret = 0;
|
||||
struct kprobe *old_p;
|
||||
struct module *probed_mod;
|
||||
|
||||
/*
|
||||
* If we have a symbol_name argument look it up,
|
||||
* and add it to the address. That way the addr
|
||||
* field can either be global or relative to a symbol.
|
||||
*/
|
||||
if (p->symbol_name) {
|
||||
if (p->addr)
|
||||
return -EINVAL;
|
||||
kprobe_lookup_name(p->symbol_name, p->addr);
|
||||
}
|
||||
|
||||
if (!p->addr)
|
||||
return -EINVAL;
|
||||
p->addr = (kprobe_opcode_t *)(((char *)p->addr)+ p->offset);
|
||||
|
||||
if ((!kernel_text_address((unsigned long) p->addr)) ||
|
||||
in_kprobes_functions((unsigned long) p->addr))
|
||||
return -EINVAL;
|
||||
|
||||
p->mod_refcounted = 0;
|
||||
/* Check are we probing a module */
|
||||
if ((probed_mod = module_text_address((unsigned long) p->addr))) {
|
||||
struct module *calling_mod = module_text_address(called_from);
|
||||
/* We must allow modules to probe themself and
|
||||
* in this case avoid incrementing the module refcount,
|
||||
* so as to allow unloading of self probing modules.
|
||||
*/
|
||||
if (calling_mod && (calling_mod != probed_mod)) {
|
||||
if (unlikely(!try_module_get(probed_mod)))
|
||||
return -EINVAL;
|
||||
p->mod_refcounted = 1;
|
||||
} else
|
||||
probed_mod = NULL;
|
||||
}
|
||||
|
||||
p->nmissed = 0;
|
||||
mutex_lock(&kprobe_mutex);
|
||||
old_p = get_kprobe(p->addr);
|
||||
if (old_p) {
|
||||
ret = register_aggr_kprobe(old_p, p);
|
||||
if (!ret)
|
||||
atomic_inc(&kprobe_count);
|
||||
goto out;
|
||||
}
|
||||
|
||||
if ((ret = arch_prepare_kprobe(p)) != 0)
|
||||
goto out;
|
||||
|
||||
INIT_HLIST_NODE(&p->hlist);
|
||||
hlist_add_head_rcu(&p->hlist,
|
||||
&kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]);
|
||||
|
||||
if (atomic_add_return(1, &kprobe_count) == \
|
||||
(ARCH_INACTIVE_KPROBE_COUNT + 1))
|
||||
register_page_fault_notifier(&kprobe_page_fault_nb);
|
||||
|
||||
arch_arm_kprobe(p);
|
||||
|
||||
out:
|
||||
mutex_unlock(&kprobe_mutex);
|
||||
|
||||
if (ret && probed_mod)
|
||||
module_put(probed_mod);
|
||||
return ret;
|
||||
}
|
||||
|
||||
int __kprobes register_kprobe(struct kprobe *p)
|
||||
{
|
||||
return __register_kprobe(p,
|
||||
(unsigned long)__builtin_return_address(0));
|
||||
}
|
||||
|
||||
void __kprobes unregister_kprobe(struct kprobe *p)
|
||||
{
|
||||
struct module *mod;
|
||||
struct kprobe *old_p, *list_p;
|
||||
int cleanup_p;
|
||||
|
||||
mutex_lock(&kprobe_mutex);
|
||||
old_p = get_kprobe(p->addr);
|
||||
if (unlikely(!old_p)) {
|
||||
mutex_unlock(&kprobe_mutex);
|
||||
return;
|
||||
}
|
||||
if (p != old_p) {
|
||||
list_for_each_entry_rcu(list_p, &old_p->list, list)
|
||||
if (list_p == p)
|
||||
/* kprobe p is a valid probe */
|
||||
goto valid_p;
|
||||
mutex_unlock(&kprobe_mutex);
|
||||
return;
|
||||
}
|
||||
valid_p:
|
||||
if ((old_p == p) || ((old_p->pre_handler == aggr_pre_handler) &&
|
||||
(p->list.next == &old_p->list) &&
|
||||
(p->list.prev == &old_p->list))) {
|
||||
/* Only probe on the hash list */
|
||||
arch_disarm_kprobe(p);
|
||||
hlist_del_rcu(&old_p->hlist);
|
||||
cleanup_p = 1;
|
||||
} else {
|
||||
list_del_rcu(&p->list);
|
||||
cleanup_p = 0;
|
||||
}
|
||||
|
||||
mutex_unlock(&kprobe_mutex);
|
||||
|
||||
synchronize_sched();
|
||||
if (p->mod_refcounted &&
|
||||
(mod = module_text_address((unsigned long)p->addr)))
|
||||
module_put(mod);
|
||||
|
||||
if (cleanup_p) {
|
||||
if (p != old_p) {
|
||||
list_del_rcu(&p->list);
|
||||
kfree(old_p);
|
||||
}
|
||||
arch_remove_kprobe(p);
|
||||
} else {
|
||||
mutex_lock(&kprobe_mutex);
|
||||
if (p->break_handler)
|
||||
old_p->break_handler = NULL;
|
||||
if (p->post_handler){
|
||||
list_for_each_entry_rcu(list_p, &old_p->list, list){
|
||||
if (list_p->post_handler){
|
||||
cleanup_p = 2;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (cleanup_p == 0)
|
||||
old_p->post_handler = NULL;
|
||||
}
|
||||
mutex_unlock(&kprobe_mutex);
|
||||
}
|
||||
|
||||
/* Call unregister_page_fault_notifier()
|
||||
* if no probes are active
|
||||
*/
|
||||
mutex_lock(&kprobe_mutex);
|
||||
if (atomic_add_return(-1, &kprobe_count) == \
|
||||
ARCH_INACTIVE_KPROBE_COUNT)
|
||||
unregister_page_fault_notifier(&kprobe_page_fault_nb);
|
||||
mutex_unlock(&kprobe_mutex);
|
||||
return;
|
||||
}
|
||||
|
||||
static struct notifier_block kprobe_exceptions_nb = {
|
||||
.notifier_call = kprobe_exceptions_notify,
|
||||
.priority = 0x7fffffff /* we need to be notified first */
|
||||
};
|
||||
|
||||
|
||||
int __kprobes register_jprobe(struct jprobe *jp)
|
||||
{
|
||||
/* Todo: Verify probepoint is a function entry point */
|
||||
jp->kp.pre_handler = setjmp_pre_handler;
|
||||
jp->kp.break_handler = longjmp_break_handler;
|
||||
|
||||
return __register_kprobe(&jp->kp,
|
||||
(unsigned long)__builtin_return_address(0));
|
||||
}
|
||||
|
||||
void __kprobes unregister_jprobe(struct jprobe *jp)
|
||||
{
|
||||
unregister_kprobe(&jp->kp);
|
||||
}
|
||||
|
||||
#ifdef ARCH_SUPPORTS_KRETPROBES
|
||||
|
||||
/*
|
||||
* This kprobe pre_handler is registered with every kretprobe. When probe
|
||||
* hits it will set up the return probe.
|
||||
*/
|
||||
static int __kprobes pre_handler_kretprobe(struct kprobe *p,
|
||||
struct pt_regs *regs)
|
||||
{
|
||||
struct kretprobe *rp = container_of(p, struct kretprobe, kp);
|
||||
unsigned long flags = 0;
|
||||
|
||||
/*TODO: consider to only swap the RA after the last pre_handler fired */
|
||||
spin_lock_irqsave(&kretprobe_lock, flags);
|
||||
arch_prepare_kretprobe(rp, regs);
|
||||
spin_unlock_irqrestore(&kretprobe_lock, flags);
|
||||
return 0;
|
||||
}
|
||||
|
||||
int __kprobes register_kretprobe(struct kretprobe *rp)
|
||||
{
|
||||
int ret = 0;
|
||||
struct kretprobe_instance *inst;
|
||||
int i;
|
||||
|
||||
rp->kp.pre_handler = pre_handler_kretprobe;
|
||||
rp->kp.post_handler = NULL;
|
||||
rp->kp.fault_handler = NULL;
|
||||
rp->kp.break_handler = NULL;
|
||||
|
||||
/* Pre-allocate memory for max kretprobe instances */
|
||||
if (rp->maxactive <= 0) {
|
||||
#ifdef CONFIG_PREEMPT
|
||||
rp->maxactive = max(10, 2 * NR_CPUS);
|
||||
#else
|
||||
rp->maxactive = NR_CPUS;
|
||||
#endif
|
||||
}
|
||||
INIT_HLIST_HEAD(&rp->used_instances);
|
||||
INIT_HLIST_HEAD(&rp->free_instances);
|
||||
for (i = 0; i < rp->maxactive; i++) {
|
||||
inst = kmalloc(sizeof(struct kretprobe_instance), GFP_KERNEL);
|
||||
if (inst == NULL) {
|
||||
free_rp_inst(rp);
|
||||
return -ENOMEM;
|
||||
}
|
||||
INIT_HLIST_NODE(&inst->uflist);
|
||||
hlist_add_head(&inst->uflist, &rp->free_instances);
|
||||
}
|
||||
|
||||
rp->nmissed = 0;
|
||||
/* Establish function entry probe point */
|
||||
if ((ret = __register_kprobe(&rp->kp,
|
||||
(unsigned long)__builtin_return_address(0))) != 0)
|
||||
free_rp_inst(rp);
|
||||
return ret;
|
||||
}
|
||||
|
||||
#else /* ARCH_SUPPORTS_KRETPROBES */
|
||||
|
||||
int __kprobes register_kretprobe(struct kretprobe *rp)
|
||||
{
|
||||
return -ENOSYS;
|
||||
}
|
||||
|
||||
static int __kprobes pre_handler_kretprobe(struct kprobe *p,
|
||||
struct pt_regs *regs)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif /* ARCH_SUPPORTS_KRETPROBES */
|
||||
|
||||
void __kprobes unregister_kretprobe(struct kretprobe *rp)
|
||||
{
|
||||
unsigned long flags;
|
||||
struct kretprobe_instance *ri;
|
||||
|
||||
unregister_kprobe(&rp->kp);
|
||||
/* No race here */
|
||||
spin_lock_irqsave(&kretprobe_lock, flags);
|
||||
while ((ri = get_used_rp_inst(rp)) != NULL) {
|
||||
ri->rp = NULL;
|
||||
hlist_del(&ri->uflist);
|
||||
}
|
||||
spin_unlock_irqrestore(&kretprobe_lock, flags);
|
||||
free_rp_inst(rp);
|
||||
}
|
||||
|
||||
static int __init init_kprobes(void)
|
||||
{
|
||||
int i, err = 0;
|
||||
|
||||
/* FIXME allocate the probe table, currently defined statically */
|
||||
/* initialize all list heads */
|
||||
for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
|
||||
INIT_HLIST_HEAD(&kprobe_table[i]);
|
||||
INIT_HLIST_HEAD(&kretprobe_inst_table[i]);
|
||||
}
|
||||
atomic_set(&kprobe_count, 0);
|
||||
|
||||
err = arch_init_kprobes();
|
||||
if (!err)
|
||||
err = register_die_notifier(&kprobe_exceptions_nb);
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_DEBUG_FS
|
||||
static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p,
|
||||
const char *sym, int offset,char *modname)
|
||||
{
|
||||
char *kprobe_type;
|
||||
|
||||
if (p->pre_handler == pre_handler_kretprobe)
|
||||
kprobe_type = "r";
|
||||
else if (p->pre_handler == setjmp_pre_handler)
|
||||
kprobe_type = "j";
|
||||
else
|
||||
kprobe_type = "k";
|
||||
if (sym)
|
||||
seq_printf(pi, "%p %s %s+0x%x %s\n", p->addr, kprobe_type,
|
||||
sym, offset, (modname ? modname : " "));
|
||||
else
|
||||
seq_printf(pi, "%p %s %p\n", p->addr, kprobe_type, p->addr);
|
||||
}
|
||||
|
||||
static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos)
|
||||
{
|
||||
return (*pos < KPROBE_TABLE_SIZE) ? pos : NULL;
|
||||
}
|
||||
|
||||
static void __kprobes *kprobe_seq_next(struct seq_file *f, void *v, loff_t *pos)
|
||||
{
|
||||
(*pos)++;
|
||||
if (*pos >= KPROBE_TABLE_SIZE)
|
||||
return NULL;
|
||||
return pos;
|
||||
}
|
||||
|
||||
static void __kprobes kprobe_seq_stop(struct seq_file *f, void *v)
|
||||
{
|
||||
/* Nothing to do */
|
||||
}
|
||||
|
||||
static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v)
|
||||
{
|
||||
struct hlist_head *head;
|
||||
struct hlist_node *node;
|
||||
struct kprobe *p, *kp;
|
||||
const char *sym = NULL;
|
||||
unsigned int i = *(loff_t *) v;
|
||||
unsigned long size, offset = 0;
|
||||
char *modname, namebuf[128];
|
||||
|
||||
head = &kprobe_table[i];
|
||||
preempt_disable();
|
||||
hlist_for_each_entry_rcu(p, node, head, hlist) {
|
||||
sym = kallsyms_lookup((unsigned long)p->addr, &size,
|
||||
&offset, &modname, namebuf);
|
||||
if (p->pre_handler == aggr_pre_handler) {
|
||||
list_for_each_entry_rcu(kp, &p->list, list)
|
||||
report_probe(pi, kp, sym, offset, modname);
|
||||
} else
|
||||
report_probe(pi, p, sym, offset, modname);
|
||||
}
|
||||
preempt_enable();
|
||||
return 0;
|
||||
}
|
||||
|
||||
static struct seq_operations kprobes_seq_ops = {
|
||||
.start = kprobe_seq_start,
|
||||
.next = kprobe_seq_next,
|
||||
.stop = kprobe_seq_stop,
|
||||
.show = show_kprobe_addr
|
||||
};
|
||||
|
||||
static int __kprobes kprobes_open(struct inode *inode, struct file *filp)
|
||||
{
|
||||
return seq_open(filp, &kprobes_seq_ops);
|
||||
}
|
||||
|
||||
static struct file_operations debugfs_kprobes_operations = {
|
||||
.open = kprobes_open,
|
||||
.read = seq_read,
|
||||
.llseek = seq_lseek,
|
||||
.release = seq_release,
|
||||
};
|
||||
|
||||
static int __kprobes debugfs_kprobe_init(void)
|
||||
{
|
||||
struct dentry *dir, *file;
|
||||
|
||||
dir = debugfs_create_dir("kprobes", NULL);
|
||||
if (!dir)
|
||||
return -ENOMEM;
|
||||
|
||||
file = debugfs_create_file("list", 0444, dir , 0 ,
|
||||
&debugfs_kprobes_operations);
|
||||
if (!file) {
|
||||
debugfs_remove(dir);
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
late_initcall(debugfs_kprobe_init);
|
||||
#endif /* CONFIG_DEBUG_FS */
|
||||
|
||||
module_init(init_kprobes);
|
||||
|
||||
EXPORT_SYMBOL_GPL(register_kprobe);
|
||||
EXPORT_SYMBOL_GPL(unregister_kprobe);
|
||||
EXPORT_SYMBOL_GPL(register_jprobe);
|
||||
EXPORT_SYMBOL_GPL(unregister_jprobe);
|
||||
EXPORT_SYMBOL_GPL(jprobe_return);
|
||||
EXPORT_SYMBOL_GPL(register_kretprobe);
|
||||
EXPORT_SYMBOL_GPL(unregister_kretprobe);
|
||||
94
kernel/ksysfs.c
Normal file
94
kernel/ksysfs.c
Normal file
@@ -0,0 +1,94 @@
|
||||
/*
|
||||
* kernel/ksysfs.c - sysfs attributes in /sys/kernel, which
|
||||
* are not related to any other subsystem
|
||||
*
|
||||
* Copyright (C) 2004 Kay Sievers <kay.sievers@vrfy.org>
|
||||
*
|
||||
* This file is release under the GPLv2
|
||||
*
|
||||
*/
|
||||
|
||||
#include <linux/kobject.h>
|
||||
#include <linux/string.h>
|
||||
#include <linux/sysfs.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/kexec.h>
|
||||
|
||||
#define KERNEL_ATTR_RO(_name) \
|
||||
static struct subsys_attribute _name##_attr = __ATTR_RO(_name)
|
||||
|
||||
#define KERNEL_ATTR_RW(_name) \
|
||||
static struct subsys_attribute _name##_attr = \
|
||||
__ATTR(_name, 0644, _name##_show, _name##_store)
|
||||
|
||||
#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET)
|
||||
/* current uevent sequence number */
|
||||
static ssize_t uevent_seqnum_show(struct subsystem *subsys, char *page)
|
||||
{
|
||||
return sprintf(page, "%llu\n", (unsigned long long)uevent_seqnum);
|
||||
}
|
||||
KERNEL_ATTR_RO(uevent_seqnum);
|
||||
|
||||
/* uevent helper program, used during early boo */
|
||||
static ssize_t uevent_helper_show(struct subsystem *subsys, char *page)
|
||||
{
|
||||
return sprintf(page, "%s\n", uevent_helper);
|
||||
}
|
||||
static ssize_t uevent_helper_store(struct subsystem *subsys, const char *page, size_t count)
|
||||
{
|
||||
if (count+1 > UEVENT_HELPER_PATH_LEN)
|
||||
return -ENOENT;
|
||||
memcpy(uevent_helper, page, count);
|
||||
uevent_helper[count] = '\0';
|
||||
if (count && uevent_helper[count-1] == '\n')
|
||||
uevent_helper[count-1] = '\0';
|
||||
return count;
|
||||
}
|
||||
KERNEL_ATTR_RW(uevent_helper);
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_KEXEC
|
||||
static ssize_t kexec_loaded_show(struct subsystem *subsys, char *page)
|
||||
{
|
||||
return sprintf(page, "%d\n", !!kexec_image);
|
||||
}
|
||||
KERNEL_ATTR_RO(kexec_loaded);
|
||||
|
||||
static ssize_t kexec_crash_loaded_show(struct subsystem *subsys, char *page)
|
||||
{
|
||||
return sprintf(page, "%d\n", !!kexec_crash_image);
|
||||
}
|
||||
KERNEL_ATTR_RO(kexec_crash_loaded);
|
||||
#endif /* CONFIG_KEXEC */
|
||||
|
||||
decl_subsys(kernel, NULL, NULL);
|
||||
EXPORT_SYMBOL_GPL(kernel_subsys);
|
||||
|
||||
static struct attribute * kernel_attrs[] = {
|
||||
#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET)
|
||||
&uevent_seqnum_attr.attr,
|
||||
&uevent_helper_attr.attr,
|
||||
#endif
|
||||
#ifdef CONFIG_KEXEC
|
||||
&kexec_loaded_attr.attr,
|
||||
&kexec_crash_loaded_attr.attr,
|
||||
#endif
|
||||
NULL
|
||||
};
|
||||
|
||||
static struct attribute_group kernel_attr_group = {
|
||||
.attrs = kernel_attrs,
|
||||
};
|
||||
|
||||
/*
 * Register the "kernel" subsystem and attach its attribute group.
 * Returns 0 on success or the first error encountered.
 */
static int __init ksysfs_init(void)
{
	int error;

	error = subsystem_register(&kernel_subsys);
	if (error)
		return error;

	return sysfs_create_group(&kernel_subsys.kset.kobj,
				  &kernel_attr_group);
}

core_initcall(ksysfs_init);
|
||||
256
kernel/kthread.c
Normal file
256
kernel/kthread.c
Normal file
@@ -0,0 +1,256 @@
|
||||
/* Kernel thread helper functions.
|
||||
* Copyright (C) 2004 IBM Corporation, Rusty Russell.
|
||||
*
|
||||
* Creation is done via keventd, so that we get a clean environment
|
||||
* even if we're invoked from userspace (think modprobe, hotplug cpu,
|
||||
* etc.).
|
||||
*/
|
||||
#include <linux/sched.h>
|
||||
#include <linux/kthread.h>
|
||||
#include <linux/completion.h>
|
||||
#include <linux/err.h>
|
||||
#include <linux/unistd.h>
|
||||
#include <linux/file.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/mutex.h>
|
||||
#include <asm/semaphore.h>
|
||||
|
||||
/*
|
||||
* We don't want to execute off keventd since it might
|
||||
* hold a semaphore our callers hold too:
|
||||
*/
|
||||
static struct workqueue_struct *helper_wq;
|
||||
|
||||
struct kthread_create_info
|
||||
{
|
||||
/* Information passed to kthread() from keventd. */
|
||||
int (*threadfn)(void *data);
|
||||
void *data;
|
||||
struct completion started;
|
||||
|
||||
/* Result passed back to kthread_create() from keventd. */
|
||||
struct task_struct *result;
|
||||
struct completion done;
|
||||
|
||||
struct work_struct work;
|
||||
};
|
||||
|
||||
struct kthread_stop_info
|
||||
{
|
||||
struct task_struct *k;
|
||||
int err;
|
||||
struct completion done;
|
||||
};
|
||||
|
||||
/* Thread stopping is done by setting this var: lock serializes
|
||||
* multiple kthread_stop calls. */
|
||||
static DEFINE_MUTEX(kthread_stop_lock);
|
||||
static struct kthread_stop_info kthread_stop_info;
|
||||
|
||||
/**
|
||||
* kthread_should_stop - should this kthread return now?
|
||||
*
|
||||
* When someone calls kthread_stop() on your kthread, it will be woken
|
||||
* and this will return true. You should then return, and your return
|
||||
* value will be passed through to kthread_stop().
|
||||
*/
|
||||
int kthread_should_stop(void)
|
||||
{
|
||||
return (kthread_stop_info.k == current);
|
||||
}
|
||||
EXPORT_SYMBOL(kthread_should_stop);
|
||||
|
||||
static void kthread_exit_files(void)
|
||||
{
|
||||
struct fs_struct *fs;
|
||||
struct task_struct *tsk = current;
|
||||
|
||||
exit_fs(tsk); /* current->fs->count--; */
|
||||
fs = init_task.fs;
|
||||
tsk->fs = fs;
|
||||
atomic_inc(&fs->count);
|
||||
exit_files(tsk);
|
||||
current->files = init_task.files;
|
||||
atomic_inc(&tsk->files->count);
|
||||
}
|
||||
|
||||
static int kthread(void *_create)
|
||||
{
|
||||
struct kthread_create_info *create = _create;
|
||||
int (*threadfn)(void *data);
|
||||
void *data;
|
||||
sigset_t blocked;
|
||||
int ret = -EINTR;
|
||||
|
||||
kthread_exit_files();
|
||||
|
||||
/* Copy data: it's on keventd's stack */
|
||||
threadfn = create->threadfn;
|
||||
data = create->data;
|
||||
|
||||
/* Block and flush all signals (in case we're not from keventd). */
|
||||
sigfillset(&blocked);
|
||||
sigprocmask(SIG_BLOCK, &blocked, NULL);
|
||||
flush_signals(current);
|
||||
|
||||
/* By default we can run anywhere, unlike keventd. */
|
||||
set_cpus_allowed(current, CPU_MASK_ALL);
|
||||
|
||||
/* OK, tell user we're spawned, wait for stop or wakeup */
|
||||
__set_current_state(TASK_INTERRUPTIBLE);
|
||||
complete(&create->started);
|
||||
schedule();
|
||||
|
||||
if (!kthread_should_stop())
|
||||
ret = threadfn(data);
|
||||
|
||||
/* It might have exited on its own, w/o kthread_stop. Check. */
|
||||
if (kthread_should_stop()) {
|
||||
kthread_stop_info.err = ret;
|
||||
complete(&kthread_stop_info.done);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* We are keventd: create a thread. */
|
||||
static void keventd_create_kthread(struct work_struct *work)
|
||||
{
|
||||
struct kthread_create_info *create =
|
||||
container_of(work, struct kthread_create_info, work);
|
||||
int pid;
|
||||
|
||||
/* We want our own signal handler (we take no signals by default). */
|
||||
pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD);
|
||||
if (pid < 0) {
|
||||
create->result = ERR_PTR(pid);
|
||||
} else {
|
||||
wait_for_completion(&create->started);
|
||||
read_lock(&tasklist_lock);
|
||||
create->result = find_task_by_pid(pid);
|
||||
read_unlock(&tasklist_lock);
|
||||
}
|
||||
complete(&create->done);
|
||||
}
|
||||
|
||||
/**
|
||||
* kthread_create - create a kthread.
|
||||
* @threadfn: the function to run until signal_pending(current).
|
||||
* @data: data ptr for @threadfn.
|
||||
* @namefmt: printf-style name for the thread.
|
||||
*
|
||||
* Description: This helper function creates and names a kernel
|
||||
* thread. The thread will be stopped: use wake_up_process() to start
|
||||
* it. See also kthread_run(), kthread_create_on_cpu().
|
||||
*
|
||||
* When woken, the thread will run @threadfn() with @data as its
|
||||
* argument. @threadfn() can either call do_exit() directly if it is a
|
||||
* standalone thread for which noone will call kthread_stop(), or
|
||||
* return when 'kthread_should_stop()' is true (which means
|
||||
* kthread_stop() has been called). The return value should be zero
|
||||
* or a negative error number; it will be passed to kthread_stop().
|
||||
*
|
||||
* Returns a task_struct or ERR_PTR(-ENOMEM).
|
||||
*/
|
||||
struct task_struct *kthread_create(int (*threadfn)(void *data),
|
||||
void *data,
|
||||
const char namefmt[],
|
||||
...)
|
||||
{
|
||||
struct kthread_create_info create;
|
||||
|
||||
create.threadfn = threadfn;
|
||||
create.data = data;
|
||||
init_completion(&create.started);
|
||||
init_completion(&create.done);
|
||||
INIT_WORK(&create.work, keventd_create_kthread);
|
||||
|
||||
/*
|
||||
* The workqueue needs to start up first:
|
||||
*/
|
||||
if (!helper_wq)
|
||||
create.work.func(&create.work);
|
||||
else {
|
||||
queue_work(helper_wq, &create.work);
|
||||
wait_for_completion(&create.done);
|
||||
}
|
||||
if (!IS_ERR(create.result)) {
|
||||
va_list args;
|
||||
va_start(args, namefmt);
|
||||
vsnprintf(create.result->comm, sizeof(create.result->comm),
|
||||
namefmt, args);
|
||||
va_end(args);
|
||||
}
|
||||
|
||||
return create.result;
|
||||
}
|
||||
EXPORT_SYMBOL(kthread_create);
|
||||
|
||||
/**
|
||||
* kthread_bind - bind a just-created kthread to a cpu.
|
||||
* @k: thread created by kthread_create().
|
||||
* @cpu: cpu (might not be online, must be possible) for @k to run on.
|
||||
*
|
||||
* Description: This function is equivalent to set_cpus_allowed(),
|
||||
* except that @cpu doesn't need to be online, and the thread must be
|
||||
* stopped (i.e., just returned from kthread_create()).
|
||||
*/
|
||||
void kthread_bind(struct task_struct *k, unsigned int cpu)
|
||||
{
|
||||
BUG_ON(k->state != TASK_INTERRUPTIBLE);
|
||||
/* Must have done schedule() in kthread() before we set_task_cpu */
|
||||
wait_task_inactive(k);
|
||||
set_task_cpu(k, cpu);
|
||||
k->cpus_allowed = cpumask_of_cpu(cpu);
|
||||
}
|
||||
EXPORT_SYMBOL(kthread_bind);
|
||||
|
||||
/**
|
||||
* kthread_stop - stop a thread created by kthread_create().
|
||||
* @k: thread created by kthread_create().
|
||||
*
|
||||
* Sets kthread_should_stop() for @k to return true, wakes it, and
|
||||
* waits for it to exit. Your threadfn() must not call do_exit()
|
||||
* itself if you use this function! This can also be called after
|
||||
* kthread_create() instead of calling wake_up_process(): the thread
|
||||
* will exit without calling threadfn().
|
||||
*
|
||||
* Returns the result of threadfn(), or %-EINTR if wake_up_process()
|
||||
* was never called.
|
||||
*/
|
||||
int kthread_stop(struct task_struct *k)
|
||||
{
|
||||
int ret;
|
||||
|
||||
mutex_lock(&kthread_stop_lock);
|
||||
|
||||
/* It could exit after stop_info.k set, but before wake_up_process. */
|
||||
get_task_struct(k);
|
||||
|
||||
/* Must init completion *before* thread sees kthread_stop_info.k */
|
||||
init_completion(&kthread_stop_info.done);
|
||||
smp_wmb();
|
||||
|
||||
/* Now set kthread_should_stop() to true, and wake it up. */
|
||||
kthread_stop_info.k = k;
|
||||
wake_up_process(k);
|
||||
put_task_struct(k);
|
||||
|
||||
/* Once it dies, reset stop ptr, gather result and we're done. */
|
||||
wait_for_completion(&kthread_stop_info.done);
|
||||
kthread_stop_info.k = NULL;
|
||||
ret = kthread_stop_info.err;
|
||||
mutex_unlock(&kthread_stop_lock);
|
||||
|
||||
return ret;
|
||||
}
|
||||
EXPORT_SYMBOL(kthread_stop);
|
||||
|
||||
static __init int helper_init(void)
|
||||
{
|
||||
helper_wq = create_singlethread_workqueue("kthread");
|
||||
BUG_ON(!helper_wq);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
core_initcall(helper_init);
|
||||
280
kernel/latency.c
Normal file
280
kernel/latency.c
Normal file
@@ -0,0 +1,280 @@
|
||||
/*
|
||||
* latency.c: Explicit system-wide latency-expectation infrastructure
|
||||
*
|
||||
* The purpose of this infrastructure is to allow device drivers to set
|
||||
* latency constraint they have and to collect and summarize these
|
||||
* expectations globally. The cumulative result can then be used by
|
||||
* power management and similar users to make decisions that have
|
||||
* tradeoffs with a latency component.
|
||||
*
|
||||
* An example user of this are the x86 C-states; each higher C state saves
|
||||
* more power, but has a higher exit latency. For the idle loop power
|
||||
* code to make a good decision which C-state to use, information about
|
||||
* acceptable latencies is required.
|
||||
*
|
||||
* An example announcer of latency is an audio driver that knows it
|
||||
* will get an interrupt when the hardware has 200 usec of samples
|
||||
* left in the DMA buffer; in that case the driver can set a latency
|
||||
* constraint of, say, 150 usec.
|
||||
*
|
||||
* Multiple drivers can each announce their maximum accepted latency,
|
||||
* to keep these apart, a string based identifier is used.
|
||||
*
|
||||
*
|
||||
* (C) Copyright 2006 Intel Corporation
|
||||
* Author: Arjan van de Ven <arjan@linux.intel.com>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation; version 2
|
||||
* of the License.
|
||||
*/
|
||||
|
||||
#include <linux/latency.h>
|
||||
#include <linux/list.h>
|
||||
#include <linux/spinlock.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/notifier.h>
|
||||
#include <linux/jiffies.h>
|
||||
#include <asm/atomic.h>
|
||||
|
||||
struct latency_info {
|
||||
struct list_head list;
|
||||
int usecs;
|
||||
char *identifier;
|
||||
};
|
||||
|
||||
/*
|
||||
* locking rule: all modifications to current_max_latency and
|
||||
* latency_list need to be done while holding the latency_lock.
|
||||
* latency_lock needs to be taken _irqsave.
|
||||
*/
|
||||
static atomic_t current_max_latency;
|
||||
static DEFINE_SPINLOCK(latency_lock);
|
||||
|
||||
static LIST_HEAD(latency_list);
|
||||
static BLOCKING_NOTIFIER_HEAD(latency_notifier);
|
||||
|
||||
/*
|
||||
* This function returns the maximum latency allowed, which
|
||||
* happens to be the minimum of all maximum latencies on the
|
||||
* list.
|
||||
*/
|
||||
static int __find_max_latency(void)
|
||||
{
|
||||
int min = INFINITE_LATENCY;
|
||||
struct latency_info *info;
|
||||
|
||||
list_for_each_entry(info, &latency_list, list) {
|
||||
if (info->usecs < min)
|
||||
min = info->usecs;
|
||||
}
|
||||
return min;
|
||||
}
|
||||
|
||||
/**
|
||||
* set_acceptable_latency - sets the maximum latency acceptable
|
||||
* @identifier: string that identifies this driver
|
||||
* @usecs: maximum acceptable latency for this driver
|
||||
*
|
||||
* This function informs the kernel that this device(driver)
|
||||
* can accept at most usecs latency. This setting is used for
|
||||
* power management and similar tradeoffs.
|
||||
*
|
||||
* This function sleeps and can only be called from process
|
||||
* context.
|
||||
* Calling this function with an existing identifier is valid
|
||||
* and will cause the existing latency setting to be changed.
|
||||
*/
|
||||
void set_acceptable_latency(char *identifier, int usecs)
|
||||
{
|
||||
struct latency_info *info, *iter;
|
||||
unsigned long flags;
|
||||
int found_old = 0;
|
||||
|
||||
info = kzalloc(sizeof(struct latency_info), GFP_KERNEL);
|
||||
if (!info)
|
||||
return;
|
||||
info->usecs = usecs;
|
||||
info->identifier = kstrdup(identifier, GFP_KERNEL);
|
||||
if (!info->identifier)
|
||||
goto free_info;
|
||||
|
||||
spin_lock_irqsave(&latency_lock, flags);
|
||||
list_for_each_entry(iter, &latency_list, list) {
|
||||
if (strcmp(iter->identifier, identifier)==0) {
|
||||
found_old = 1;
|
||||
iter->usecs = usecs;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!found_old)
|
||||
list_add(&info->list, &latency_list);
|
||||
|
||||
if (usecs < atomic_read(¤t_max_latency))
|
||||
atomic_set(¤t_max_latency, usecs);
|
||||
|
||||
spin_unlock_irqrestore(&latency_lock, flags);
|
||||
|
||||
blocking_notifier_call_chain(&latency_notifier,
|
||||
atomic_read(¤t_max_latency), NULL);
|
||||
|
||||
/*
|
||||
* if we inserted the new one, we're done; otherwise there was
|
||||
* an existing one so we need to free the redundant data
|
||||
*/
|
||||
if (!found_old)
|
||||
return;
|
||||
|
||||
kfree(info->identifier);
|
||||
free_info:
|
||||
kfree(info);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(set_acceptable_latency);
|
||||
|
||||
/**
|
||||
* modify_acceptable_latency - changes the maximum latency acceptable
|
||||
* @identifier: string that identifies this driver
|
||||
* @usecs: maximum acceptable latency for this driver
|
||||
*
|
||||
* This function informs the kernel that this device(driver)
|
||||
* can accept at most usecs latency. This setting is used for
|
||||
* power management and similar tradeoffs.
|
||||
*
|
||||
* This function does not sleep and can be called in any context.
|
||||
* Trying to use a non-existing identifier silently gets ignored.
|
||||
*
|
||||
* Due to the atomic nature of this function, the modified latency
|
||||
* value will only be used for future decisions; past decisions
|
||||
* can still lead to longer latencies in the near future.
|
||||
*/
|
||||
void modify_acceptable_latency(char *identifier, int usecs)
|
||||
{
|
||||
struct latency_info *iter;
|
||||
unsigned long flags;
|
||||
|
||||
spin_lock_irqsave(&latency_lock, flags);
|
||||
list_for_each_entry(iter, &latency_list, list) {
|
||||
if (strcmp(iter->identifier, identifier) == 0) {
|
||||
iter->usecs = usecs;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (usecs < atomic_read(¤t_max_latency))
|
||||
atomic_set(¤t_max_latency, usecs);
|
||||
spin_unlock_irqrestore(&latency_lock, flags);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(modify_acceptable_latency);
|
||||
|
||||
/**
|
||||
* remove_acceptable_latency - removes the maximum latency acceptable
|
||||
* @identifier: string that identifies this driver
|
||||
*
|
||||
* This function removes a previously set maximum latency setting
|
||||
* for the driver and frees up any resources associated with the
|
||||
* bookkeeping needed for this.
|
||||
*
|
||||
* This function does not sleep and can be called in any context.
|
||||
* Trying to use a non-existing identifier silently gets ignored.
|
||||
*/
|
||||
void remove_acceptable_latency(char *identifier)
|
||||
{
|
||||
unsigned long flags;
|
||||
int newmax = 0;
|
||||
struct latency_info *iter, *temp;
|
||||
|
||||
spin_lock_irqsave(&latency_lock, flags);
|
||||
|
||||
list_for_each_entry_safe(iter, temp, &latency_list, list) {
|
||||
if (strcmp(iter->identifier, identifier) == 0) {
|
||||
list_del(&iter->list);
|
||||
newmax = iter->usecs;
|
||||
kfree(iter->identifier);
|
||||
kfree(iter);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/* If we just deleted the system wide value, we need to
|
||||
* recalculate with a full search
|
||||
*/
|
||||
if (newmax == atomic_read(¤t_max_latency)) {
|
||||
newmax = __find_max_latency();
|
||||
atomic_set(¤t_max_latency, newmax);
|
||||
}
|
||||
spin_unlock_irqrestore(&latency_lock, flags);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(remove_acceptable_latency);
|
||||
|
||||
/**
|
||||
* system_latency_constraint - queries the system wide latency maximum
|
||||
*
|
||||
* This function returns the system wide maximum latency in
|
||||
* microseconds.
|
||||
*
|
||||
* This function does not sleep and can be called in any context.
|
||||
*/
|
||||
int system_latency_constraint(void)
|
||||
{
|
||||
return atomic_read(¤t_max_latency);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(system_latency_constraint);
|
||||
|
||||
/**
|
||||
* synchronize_acceptable_latency - recalculates all latency decisions
|
||||
*
|
||||
* This function will cause a callback to various kernel pieces that
|
||||
* will make those pieces rethink their latency decisions. This implies
|
||||
* that if there are overlong latencies in hardware state already, those
|
||||
* latencies get taken right now. When this call completes no overlong
|
||||
* latency decisions should be active anymore.
|
||||
*
|
||||
* Typical usecase of this is after a modify_acceptable_latency() call,
|
||||
* which in itself is non-blocking and non-synchronizing.
|
||||
*
|
||||
* This function blocks and should not be called with locks held.
|
||||
*/
|
||||
|
||||
void synchronize_acceptable_latency(void)
|
||||
{
|
||||
blocking_notifier_call_chain(&latency_notifier,
|
||||
atomic_read(¤t_max_latency), NULL);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(synchronize_acceptable_latency);
|
||||
|
||||
/*
|
||||
* Latency notifier: this notifier gets called when a non-atomic new
|
||||
* latency value gets set. The expectation of the caller of the
|
||||
* non-atomic set is that when the call returns, future latencies
|
||||
* are within bounds, so the functions on the notifier list are
|
||||
* expected to take the overlong latencies immediately, inside the
|
||||
* callback, and not make a overlong latency decision anymore.
|
||||
*
|
||||
* The callback gets called when the new latency value is made
|
||||
* active so system_latency_constraint() returns the new latency.
|
||||
*/
|
||||
int register_latency_notifier(struct notifier_block * nb)
|
||||
{
|
||||
return blocking_notifier_chain_register(&latency_notifier, nb);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(register_latency_notifier);
|
||||
|
||||
int unregister_latency_notifier(struct notifier_block * nb)
|
||||
{
|
||||
return blocking_notifier_chain_unregister(&latency_notifier, nb);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(unregister_latency_notifier);
|
||||
|
||||
static __init int latency_init(void)
|
||||
{
|
||||
atomic_set(¤t_max_latency, INFINITE_LATENCY);
|
||||
/*
|
||||
* we don't want by default to have longer latencies than 2 ticks,
|
||||
* since that would cause lost ticks
|
||||
*/
|
||||
set_acceptable_latency("kernel", 2*1000000/HZ);
|
||||
return 0;
|
||||
}
|
||||
|
||||
module_init(latency_init);
|
||||
2799
kernel/lockdep.c
Normal file
2799
kernel/lockdep.c
Normal file
File diff suppressed because it is too large
Load Diff
78
kernel/lockdep_internals.h
Normal file
78
kernel/lockdep_internals.h
Normal file
@@ -0,0 +1,78 @@
|
||||
/*
|
||||
* kernel/lockdep_internals.h
|
||||
*
|
||||
* Runtime locking correctness validator
|
||||
*
|
||||
* lockdep subsystem internal functions and variables.
|
||||
*/
|
||||
|
||||
/*
|
||||
* MAX_LOCKDEP_ENTRIES is the maximum number of lock dependencies
|
||||
* we track.
|
||||
*
|
||||
* We use the per-lock dependency maps in two ways: we grow it by adding
|
||||
* every to-be-taken lock to all currently held lock's own dependency
|
||||
* table (if it's not there yet), and we check it for lock order
|
||||
* conflicts and deadlocks.
|
||||
*/
|
||||
#define MAX_LOCKDEP_ENTRIES 8192UL
|
||||
|
||||
#define MAX_LOCKDEP_KEYS_BITS 11
|
||||
#define MAX_LOCKDEP_KEYS (1UL << MAX_LOCKDEP_KEYS_BITS)
|
||||
|
||||
#define MAX_LOCKDEP_CHAINS_BITS 14
|
||||
#define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS)
|
||||
|
||||
/*
|
||||
* Stack-trace: tightly packed array of stack backtrace
|
||||
* addresses. Protected by the hash_lock.
|
||||
*/
|
||||
#define MAX_STACK_TRACE_ENTRIES 262144UL
|
||||
|
||||
extern struct list_head all_lock_classes;
|
||||
|
||||
extern void
|
||||
get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3, char *c4);
|
||||
|
||||
extern const char * __get_key_name(struct lockdep_subclass_key *key, char *str);
|
||||
|
||||
extern unsigned long nr_lock_classes;
|
||||
extern unsigned long nr_list_entries;
|
||||
extern unsigned long nr_lock_chains;
|
||||
extern unsigned long nr_stack_trace_entries;
|
||||
|
||||
extern unsigned int nr_hardirq_chains;
|
||||
extern unsigned int nr_softirq_chains;
|
||||
extern unsigned int nr_process_chains;
|
||||
extern unsigned int max_lockdep_depth;
|
||||
extern unsigned int max_recursion_depth;
|
||||
|
||||
#ifdef CONFIG_DEBUG_LOCKDEP
|
||||
/*
|
||||
* Various lockdep statistics:
|
||||
*/
|
||||
extern atomic_t chain_lookup_hits;
|
||||
extern atomic_t chain_lookup_misses;
|
||||
extern atomic_t hardirqs_on_events;
|
||||
extern atomic_t hardirqs_off_events;
|
||||
extern atomic_t redundant_hardirqs_on;
|
||||
extern atomic_t redundant_hardirqs_off;
|
||||
extern atomic_t softirqs_on_events;
|
||||
extern atomic_t softirqs_off_events;
|
||||
extern atomic_t redundant_softirqs_on;
|
||||
extern atomic_t redundant_softirqs_off;
|
||||
extern atomic_t nr_unused_locks;
|
||||
extern atomic_t nr_cyclic_checks;
|
||||
extern atomic_t nr_cyclic_check_recursions;
|
||||
extern atomic_t nr_find_usage_forwards_checks;
|
||||
extern atomic_t nr_find_usage_forwards_recursions;
|
||||
extern atomic_t nr_find_usage_backwards_checks;
|
||||
extern atomic_t nr_find_usage_backwards_recursions;
|
||||
# define debug_atomic_inc(ptr) atomic_inc(ptr)
|
||||
# define debug_atomic_dec(ptr) atomic_dec(ptr)
|
||||
# define debug_atomic_read(ptr) atomic_read(ptr)
|
||||
#else
|
||||
# define debug_atomic_inc(ptr) do { } while (0)
|
||||
# define debug_atomic_dec(ptr) do { } while (0)
|
||||
# define debug_atomic_read(ptr) 0
|
||||
#endif
|
||||
361
kernel/lockdep_proc.c
Normal file
361
kernel/lockdep_proc.c
Normal file
@@ -0,0 +1,361 @@
|
||||
/*
|
||||
* kernel/lockdep_proc.c
|
||||
*
|
||||
* Runtime locking correctness validator
|
||||
*
|
||||
* Started by Ingo Molnar:
|
||||
*
|
||||
* Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
|
||||
*
|
||||
* Code for /proc/lockdep and /proc/lockdep_stats:
|
||||
*
|
||||
*/
|
||||
#include <linux/module.h>
|
||||
#include <linux/proc_fs.h>
|
||||
#include <linux/seq_file.h>
|
||||
#include <linux/kallsyms.h>
|
||||
#include <linux/debug_locks.h>
|
||||
|
||||
#include "lockdep_internals.h"
|
||||
|
||||
/* seq_file iterator: advance to the next lock class, or NULL at the end. */
static void *l_next(struct seq_file *m, void *v, loff_t *pos)
{
	struct lock_class *class = v;

	(*pos)++;

	if (class->lock_entry.next == &all_lock_classes)
		class = NULL;
	else
		class = list_entry(class->lock_entry.next, struct lock_class,
				   lock_entry);
	m->private = class;

	return class;
}
|
||||
|
||||
/* seq_file iterator start: print the header when positioned at the first class. */
static void *l_start(struct seq_file *m, loff_t *pos)
{
	struct lock_class *class = m->private;

	if (&class->lock_entry == all_lock_classes.next)
		seq_printf(m, "all lock classes:\n");

	return class;
}
|
||||
|
||||
/* seq_file iterator stop: nothing to clean up. */
static void l_stop(struct seq_file *m, void *v)
{
}
|
||||
|
||||
static unsigned long count_forward_deps(struct lock_class *class)
|
||||
{
|
||||
struct lock_list *entry;
|
||||
unsigned long ret = 1;
|
||||
|
||||
/*
|
||||
* Recurse this class's dependency list:
|
||||
*/
|
||||
list_for_each_entry(entry, &class->locks_after, entry)
|
||||
ret += count_forward_deps(entry->class);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static unsigned long count_backward_deps(struct lock_class *class)
|
||||
{
|
||||
struct lock_list *entry;
|
||||
unsigned long ret = 1;
|
||||
|
||||
/*
|
||||
* Recurse this class's dependency list:
|
||||
*/
|
||||
list_for_each_entry(entry, &class->locks_before, entry)
|
||||
ret += count_backward_deps(entry->class);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void print_name(struct seq_file *m, struct lock_class *class)
|
||||
{
|
||||
char str[128];
|
||||
const char *name = class->name;
|
||||
|
||||
if (!name) {
|
||||
name = __get_key_name(class->key, str);
|
||||
seq_printf(m, "%s", name);
|
||||
} else{
|
||||
seq_printf(m, "%s", name);
|
||||
if (class->name_version > 1)
|
||||
seq_printf(m, "#%d", class->name_version);
|
||||
if (class->subclass)
|
||||
seq_printf(m, "/%d", class->subclass);
|
||||
}
|
||||
}
|
||||
|
||||
static int l_show(struct seq_file *m, void *v)
|
||||
{
|
||||
unsigned long nr_forward_deps, nr_backward_deps;
|
||||
struct lock_class *class = m->private;
|
||||
struct lock_list *entry;
|
||||
char c1, c2, c3, c4;
|
||||
|
||||
seq_printf(m, "%p", class->key);
|
||||
#ifdef CONFIG_DEBUG_LOCKDEP
|
||||
seq_printf(m, " OPS:%8ld", class->ops);
|
||||
#endif
|
||||
nr_forward_deps = count_forward_deps(class);
|
||||
seq_printf(m, " FD:%5ld", nr_forward_deps);
|
||||
|
||||
nr_backward_deps = count_backward_deps(class);
|
||||
seq_printf(m, " BD:%5ld", nr_backward_deps);
|
||||
|
||||
get_usage_chars(class, &c1, &c2, &c3, &c4);
|
||||
seq_printf(m, " %c%c%c%c", c1, c2, c3, c4);
|
||||
|
||||
seq_printf(m, ": ");
|
||||
print_name(m, class);
|
||||
seq_puts(m, "\n");
|
||||
|
||||
list_for_each_entry(entry, &class->locks_after, entry) {
|
||||
if (entry->distance == 1) {
|
||||
seq_printf(m, " -> [%p] ", entry->class);
|
||||
print_name(m, entry->class);
|
||||
seq_puts(m, "\n");
|
||||
}
|
||||
}
|
||||
seq_puts(m, "\n");
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static const struct seq_operations lockdep_ops = {
|
||||
.start = l_start,
|
||||
.next = l_next,
|
||||
.stop = l_stop,
|
||||
.show = l_show,
|
||||
};
|
||||
|
||||
static int lockdep_open(struct inode *inode, struct file *file)
|
||||
{
|
||||
int res = seq_open(file, &lockdep_ops);
|
||||
if (!res) {
|
||||
struct seq_file *m = file->private_data;
|
||||
|
||||
if (!list_empty(&all_lock_classes))
|
||||
m->private = list_entry(all_lock_classes.next,
|
||||
struct lock_class, lock_entry);
|
||||
else
|
||||
m->private = NULL;
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
static const struct file_operations proc_lockdep_operations = {
|
||||
.open = lockdep_open,
|
||||
.read = seq_read,
|
||||
.llseek = seq_lseek,
|
||||
.release = seq_release,
|
||||
};
|
||||
|
||||
/*
 * Dump the CONFIG_DEBUG_LOCKDEP-only statistics counters into @m.
 * Compiles to an empty function when lockdep debugging is off.
 */
static void lockdep_stats_debug_show(struct seq_file *m)
{
#ifdef CONFIG_DEBUG_LOCKDEP
	unsigned int hi1 = debug_atomic_read(&hardirqs_on_events),
		     hi2 = debug_atomic_read(&hardirqs_off_events),
		     hr1 = debug_atomic_read(&redundant_hardirqs_on),
		     hr2 = debug_atomic_read(&redundant_hardirqs_off),
		     si1 = debug_atomic_read(&softirqs_on_events),
		     si2 = debug_atomic_read(&softirqs_off_events),
		     sr1 = debug_atomic_read(&redundant_softirqs_on),
		     sr2 = debug_atomic_read(&redundant_softirqs_off);

	seq_printf(m, " chain lookup misses:           %11u\n",
		debug_atomic_read(&chain_lookup_misses));
	seq_printf(m, " chain lookup hits:             %11u\n",
		debug_atomic_read(&chain_lookup_hits));
	seq_printf(m, " cyclic checks:                 %11u\n",
		debug_atomic_read(&nr_cyclic_checks));
	seq_printf(m, " cyclic-check recursions:       %11u\n",
		debug_atomic_read(&nr_cyclic_check_recursions));
	seq_printf(m, " find-mask forwards checks:     %11u\n",
		debug_atomic_read(&nr_find_usage_forwards_checks));
	seq_printf(m, " find-mask forwards recursions: %11u\n",
		debug_atomic_read(&nr_find_usage_forwards_recursions));
	seq_printf(m, " find-mask backwards checks:    %11u\n",
		debug_atomic_read(&nr_find_usage_backwards_checks));
	seq_printf(m, " find-mask backwards recursions:%11u\n",
		debug_atomic_read(&nr_find_usage_backwards_recursions));

	seq_printf(m, " hardirq on events:             %11u\n", hi1);
	seq_printf(m, " hardirq off events:            %11u\n", hi2);
	seq_printf(m, " redundant hardirq ons:         %11u\n", hr1);
	seq_printf(m, " redundant hardirq offs:        %11u\n", hr2);
	seq_printf(m, " softirq on events:             %11u\n", si1);
	seq_printf(m, " softirq off events:            %11u\n", si2);
	seq_printf(m, " redundant softirq ons:         %11u\n", sr1);
	seq_printf(m, " redundant softirq offs:        %11u\n", sr2);
#endif
}
|
||||
|
||||
static int lockdep_stats_show(struct seq_file *m, void *v)
|
||||
{
|
||||
struct lock_class *class;
|
||||
unsigned long nr_unused = 0, nr_uncategorized = 0,
|
||||
nr_irq_safe = 0, nr_irq_unsafe = 0,
|
||||
nr_softirq_safe = 0, nr_softirq_unsafe = 0,
|
||||
nr_hardirq_safe = 0, nr_hardirq_unsafe = 0,
|
||||
nr_irq_read_safe = 0, nr_irq_read_unsafe = 0,
|
||||
nr_softirq_read_safe = 0, nr_softirq_read_unsafe = 0,
|
||||
nr_hardirq_read_safe = 0, nr_hardirq_read_unsafe = 0,
|
||||
sum_forward_deps = 0, factor = 0;
|
||||
|
||||
list_for_each_entry(class, &all_lock_classes, lock_entry) {
|
||||
|
||||
if (class->usage_mask == 0)
|
||||
nr_unused++;
|
||||
if (class->usage_mask == LOCKF_USED)
|
||||
nr_uncategorized++;
|
||||
if (class->usage_mask & LOCKF_USED_IN_IRQ)
|
||||
nr_irq_safe++;
|
||||
if (class->usage_mask & LOCKF_ENABLED_IRQS)
|
||||
nr_irq_unsafe++;
|
||||
if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ)
|
||||
nr_softirq_safe++;
|
||||
if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS)
|
||||
nr_softirq_unsafe++;
|
||||
if (class->usage_mask & LOCKF_USED_IN_HARDIRQ)
|
||||
nr_hardirq_safe++;
|
||||
if (class->usage_mask & LOCKF_ENABLED_HARDIRQS)
|
||||
nr_hardirq_unsafe++;
|
||||
if (class->usage_mask & LOCKF_USED_IN_IRQ_READ)
|
||||
nr_irq_read_safe++;
|
||||
if (class->usage_mask & LOCKF_ENABLED_IRQS_READ)
|
||||
nr_irq_read_unsafe++;
|
||||
if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ_READ)
|
||||
nr_softirq_read_safe++;
|
||||
if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS_READ)
|
||||
nr_softirq_read_unsafe++;
|
||||
if (class->usage_mask & LOCKF_USED_IN_HARDIRQ_READ)
|
||||
nr_hardirq_read_safe++;
|
||||
if (class->usage_mask & LOCKF_ENABLED_HARDIRQS_READ)
|
||||
nr_hardirq_read_unsafe++;
|
||||
|
||||
sum_forward_deps += count_forward_deps(class);
|
||||
}
|
||||
#ifdef CONFIG_DEBUG_LOCKDEP
|
||||
DEBUG_LOCKS_WARN_ON(debug_atomic_read(&nr_unused_locks) != nr_unused);
|
||||
#endif
|
||||
seq_printf(m, " lock-classes: %11lu [max: %lu]\n",
|
||||
nr_lock_classes, MAX_LOCKDEP_KEYS);
|
||||
seq_printf(m, " direct dependencies: %11lu [max: %lu]\n",
|
||||
nr_list_entries, MAX_LOCKDEP_ENTRIES);
|
||||
seq_printf(m, " indirect dependencies: %11lu\n",
|
||||
sum_forward_deps);
|
||||
|
||||
/*
|
||||
* Total number of dependencies:
|
||||
*
|
||||
* All irq-safe locks may nest inside irq-unsafe locks,
|
||||
* plus all the other known dependencies:
|
||||
*/
|
||||
seq_printf(m, " all direct dependencies: %11lu\n",
|
||||
nr_irq_unsafe * nr_irq_safe +
|
||||
nr_hardirq_unsafe * nr_hardirq_safe +
|
||||
nr_list_entries);
|
||||
|
||||
/*
|
||||
* Estimated factor between direct and indirect
|
||||
* dependencies:
|
||||
*/
|
||||
if (nr_list_entries)
|
||||
factor = sum_forward_deps / nr_list_entries;
|
||||
|
||||
seq_printf(m, " dependency chains: %11lu [max: %lu]\n",
|
||||
nr_lock_chains, MAX_LOCKDEP_CHAINS);
|
||||
|
||||
#ifdef CONFIG_TRACE_IRQFLAGS
|
||||
seq_printf(m, " in-hardirq chains: %11u\n",
|
||||
nr_hardirq_chains);
|
||||
seq_printf(m, " in-softirq chains: %11u\n",
|
||||
nr_softirq_chains);
|
||||
#endif
|
||||
seq_printf(m, " in-process chains: %11u\n",
|
||||
nr_process_chains);
|
||||
seq_printf(m, " stack-trace entries: %11lu [max: %lu]\n",
|
||||
nr_stack_trace_entries, MAX_STACK_TRACE_ENTRIES);
|
||||
seq_printf(m, " combined max dependencies: %11u\n",
|
||||
(nr_hardirq_chains + 1) *
|
||||
(nr_softirq_chains + 1) *
|
||||
(nr_process_chains + 1)
|
||||
);
|
||||
seq_printf(m, " hardirq-safe locks: %11lu\n",
|
||||
nr_hardirq_safe);
|
||||
seq_printf(m, " hardirq-unsafe locks: %11lu\n",
|
||||
nr_hardirq_unsafe);
|
||||
seq_printf(m, " softirq-safe locks: %11lu\n",
|
||||
nr_softirq_safe);
|
||||
seq_printf(m, " softirq-unsafe locks: %11lu\n",
|
||||
nr_softirq_unsafe);
|
||||
seq_printf(m, " irq-safe locks: %11lu\n",
|
||||
nr_irq_safe);
|
||||
seq_printf(m, " irq-unsafe locks: %11lu\n",
|
||||
nr_irq_unsafe);
|
||||
|
||||
seq_printf(m, " hardirq-read-safe locks: %11lu\n",
|
||||
nr_hardirq_read_safe);
|
||||
seq_printf(m, " hardirq-read-unsafe locks: %11lu\n",
|
||||
nr_hardirq_read_unsafe);
|
||||
seq_printf(m, " softirq-read-safe locks: %11lu\n",
|
||||
nr_softirq_read_safe);
|
||||
seq_printf(m, " softirq-read-unsafe locks: %11lu\n",
|
||||
nr_softirq_read_unsafe);
|
||||
seq_printf(m, " irq-read-safe locks: %11lu\n",
|
||||
nr_irq_read_safe);
|
||||
seq_printf(m, " irq-read-unsafe locks: %11lu\n",
|
||||
nr_irq_read_unsafe);
|
||||
|
||||
seq_printf(m, " uncategorized locks: %11lu\n",
|
||||
nr_uncategorized);
|
||||
seq_printf(m, " unused locks: %11lu\n",
|
||||
nr_unused);
|
||||
seq_printf(m, " max locking depth: %11u\n",
|
||||
max_lockdep_depth);
|
||||
seq_printf(m, " max recursion depth: %11u\n",
|
||||
max_recursion_depth);
|
||||
lockdep_stats_debug_show(m);
|
||||
seq_printf(m, " debug_locks: %11u\n",
|
||||
debug_locks);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int lockdep_stats_open(struct inode *inode, struct file *file)
|
||||
{
|
||||
return single_open(file, lockdep_stats_show, NULL);
|
||||
}
|
||||
|
||||
static const struct file_operations proc_lockdep_stats_operations = {
|
||||
.open = lockdep_stats_open,
|
||||
.read = seq_read,
|
||||
.llseek = seq_lseek,
|
||||
.release = seq_release,
|
||||
};
|
||||
|
||||
/*
 * Boot-time creation of the lockdep /proc entries.  A failed
 * create_proc_entry() is tolerated silently: lockdep itself keeps
 * working, only the corresponding /proc view is missing.
 */
static int __init lockdep_proc_init(void)
{
	struct proc_dir_entry *entry;

	entry = create_proc_entry("lockdep", S_IRUSR, NULL);
	if (entry)
		entry->proc_fops = &proc_lockdep_operations;

	entry = create_proc_entry("lockdep_stats", S_IRUSR, NULL);
	if (entry)
		entry->proc_fops = &proc_lockdep_stats_operations;

	return 0;
}
|
||||
|
||||
__initcall(lockdep_proc_init);
|
||||
|
||||
2440
kernel/module.c
Normal file
2440
kernel/module.c
Normal file
File diff suppressed because it is too large
Load Diff
116
kernel/mutex-debug.c
Normal file
116
kernel/mutex-debug.c
Normal file
@@ -0,0 +1,116 @@
|
||||
/*
|
||||
* kernel/mutex-debug.c
|
||||
*
|
||||
* Debugging code for mutexes
|
||||
*
|
||||
* Started by Ingo Molnar:
|
||||
*
|
||||
* Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
|
||||
*
|
||||
* lock debugging, locking tree, deadlock detection started by:
|
||||
*
|
||||
* Copyright (C) 2004, LynuxWorks, Inc., Igor Manyilov, Bill Huey
|
||||
* Released under the General Public License (GPL).
|
||||
*/
|
||||
#include <linux/mutex.h>
|
||||
#include <linux/delay.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/poison.h>
|
||||
#include <linux/spinlock.h>
|
||||
#include <linux/kallsyms.h>
|
||||
#include <linux/interrupt.h>
|
||||
#include <linux/debug_locks.h>
|
||||
|
||||
#include "mutex-debug.h"
|
||||
|
||||
/*
|
||||
* Must be called with lock->wait_lock held.
|
||||
*/
|
||||
void debug_mutex_set_owner(struct mutex *lock, struct thread_info *new_owner)
|
||||
{
|
||||
lock->owner = new_owner;
|
||||
}
|
||||
|
||||
void debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter)
|
||||
{
|
||||
memset(waiter, MUTEX_DEBUG_INIT, sizeof(*waiter));
|
||||
waiter->magic = waiter;
|
||||
INIT_LIST_HEAD(&waiter->list);
|
||||
}
|
||||
|
||||
void debug_mutex_wake_waiter(struct mutex *lock, struct mutex_waiter *waiter)
|
||||
{
|
||||
SMP_DEBUG_LOCKS_WARN_ON(!spin_is_locked(&lock->wait_lock));
|
||||
DEBUG_LOCKS_WARN_ON(list_empty(&lock->wait_list));
|
||||
DEBUG_LOCKS_WARN_ON(waiter->magic != waiter);
|
||||
DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list));
|
||||
}
|
||||
|
||||
void debug_mutex_free_waiter(struct mutex_waiter *waiter)
|
||||
{
|
||||
DEBUG_LOCKS_WARN_ON(!list_empty(&waiter->list));
|
||||
memset(waiter, MUTEX_DEBUG_FREE, sizeof(*waiter));
|
||||
}
|
||||
|
||||
void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter,
|
||||
struct thread_info *ti)
|
||||
{
|
||||
SMP_DEBUG_LOCKS_WARN_ON(!spin_is_locked(&lock->wait_lock));
|
||||
|
||||
/* Mark the current thread as blocked on the lock: */
|
||||
ti->task->blocked_on = waiter;
|
||||
waiter->lock = lock;
|
||||
}
|
||||
|
||||
void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
|
||||
struct thread_info *ti)
|
||||
{
|
||||
DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list));
|
||||
DEBUG_LOCKS_WARN_ON(waiter->task != ti->task);
|
||||
DEBUG_LOCKS_WARN_ON(ti->task->blocked_on != waiter);
|
||||
ti->task->blocked_on = NULL;
|
||||
|
||||
list_del_init(&waiter->list);
|
||||
waiter->task = NULL;
|
||||
}
|
||||
|
||||
void debug_mutex_unlock(struct mutex *lock)
|
||||
{
|
||||
if (unlikely(!debug_locks))
|
||||
return;
|
||||
|
||||
DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info());
|
||||
DEBUG_LOCKS_WARN_ON(lock->magic != lock);
|
||||
DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next);
|
||||
DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info());
|
||||
}
|
||||
|
||||
void debug_mutex_init(struct mutex *lock, const char *name,
|
||||
struct lock_class_key *key)
|
||||
{
|
||||
#ifdef CONFIG_DEBUG_LOCK_ALLOC
|
||||
/*
|
||||
* Make sure we are not reinitializing a held lock:
|
||||
*/
|
||||
debug_check_no_locks_freed((void *)lock, sizeof(*lock));
|
||||
lockdep_init_map(&lock->dep_map, name, key, 0);
|
||||
#endif
|
||||
lock->owner = NULL;
|
||||
lock->magic = lock;
|
||||
}
|
||||
|
||||
/***
|
||||
* mutex_destroy - mark a mutex unusable
|
||||
* @lock: the mutex to be destroyed
|
||||
*
|
||||
* This function marks the mutex uninitialized, and any subsequent
|
||||
* use of the mutex is forbidden. The mutex must not be locked when
|
||||
* this function is called.
|
||||
*/
|
||||
void fastcall mutex_destroy(struct mutex *lock)
|
||||
{
|
||||
DEBUG_LOCKS_WARN_ON(mutex_is_locked(lock));
|
||||
lock->magic = NULL;
|
||||
}
|
||||
|
||||
EXPORT_SYMBOL_GPL(mutex_destroy);
|
||||
53
kernel/mutex-debug.h
Normal file
53
kernel/mutex-debug.h
Normal file
@@ -0,0 +1,53 @@
|
||||
/*
|
||||
* Mutexes: blocking mutual exclusion locks
|
||||
*
|
||||
* started by Ingo Molnar:
|
||||
*
|
||||
* Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
|
||||
*
|
||||
* This file contains mutex debugging related internal declarations,
|
||||
* prototypes and inline functions, for the CONFIG_DEBUG_MUTEXES case.
|
||||
* More details are in kernel/mutex-debug.c.
|
||||
*/
|
||||
|
||||
/*
|
||||
* This must be called with lock->wait_lock held.
|
||||
*/
|
||||
extern void
|
||||
debug_mutex_set_owner(struct mutex *lock, struct thread_info *new_owner);
|
||||
|
||||
static inline void debug_mutex_clear_owner(struct mutex *lock)
|
||||
{
|
||||
lock->owner = NULL;
|
||||
}
|
||||
|
||||
extern void debug_mutex_lock_common(struct mutex *lock,
|
||||
struct mutex_waiter *waiter);
|
||||
extern void debug_mutex_wake_waiter(struct mutex *lock,
|
||||
struct mutex_waiter *waiter);
|
||||
extern void debug_mutex_free_waiter(struct mutex_waiter *waiter);
|
||||
extern void debug_mutex_add_waiter(struct mutex *lock,
|
||||
struct mutex_waiter *waiter,
|
||||
struct thread_info *ti);
|
||||
extern void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
|
||||
struct thread_info *ti);
|
||||
extern void debug_mutex_unlock(struct mutex *lock);
|
||||
extern void debug_mutex_init(struct mutex *lock, const char *name,
|
||||
struct lock_class_key *key);
|
||||
|
||||
/*
 * Acquire the mutex-internal wait_lock in the DEBUG case.
 *
 * Interrupts are disabled by hand and the lock is taken through
 * __raw_spin_lock(), bypassing the generic spinlock wrappers; the
 * container_of() trick recovers the embedding mutex so its magic
 * cookie can be checked under the lock.
 */
#define spin_lock_mutex(lock, flags)					\
	do {								\
		struct mutex *l = container_of(lock, struct mutex, wait_lock); \
									\
		DEBUG_LOCKS_WARN_ON(in_interrupt());			\
		local_irq_save(flags);					\
		__raw_spin_lock(&(lock)->raw_lock);			\
		DEBUG_LOCKS_WARN_ON(l->magic != l);			\
	} while (0)
|
||||
|
||||
/*
 * Release the mutex-internal wait_lock, restore interrupts and
 * give preemption a chance to run (counterpart of spin_lock_mutex).
 */
#define spin_unlock_mutex(lock, flags)				\
	do {							\
		__raw_spin_unlock(&(lock)->raw_lock);		\
		local_irq_restore(flags);			\
		preempt_check_resched();			\
	} while (0)
|
||||
347
kernel/mutex.c
Normal file
347
kernel/mutex.c
Normal file
@@ -0,0 +1,347 @@
|
||||
/*
|
||||
* kernel/mutex.c
|
||||
*
|
||||
* Mutexes: blocking mutual exclusion locks
|
||||
*
|
||||
* Started by Ingo Molnar:
|
||||
*
|
||||
* Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
|
||||
*
|
||||
* Many thanks to Arjan van de Ven, Thomas Gleixner, Steven Rostedt and
|
||||
* David Howells for suggestions and improvements.
|
||||
*
|
||||
* Also see Documentation/mutex-design.txt.
|
||||
*/
|
||||
#include <linux/mutex.h>
|
||||
#include <linux/sched.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/spinlock.h>
|
||||
#include <linux/interrupt.h>
|
||||
#include <linux/debug_locks.h>
|
||||
|
||||
/*
|
||||
* In the DEBUG case we are using the "NULL fastpath" for mutexes,
|
||||
* which forces all calls into the slowpath:
|
||||
*/
|
||||
#ifdef CONFIG_DEBUG_MUTEXES
|
||||
# include "mutex-debug.h"
|
||||
# include <asm-generic/mutex-null.h>
|
||||
#else
|
||||
# include "mutex.h"
|
||||
# include <asm/mutex.h>
|
||||
#endif
|
||||
|
||||
/***
|
||||
* mutex_init - initialize the mutex
|
||||
* @lock: the mutex to be initialized
|
||||
*
|
||||
* Initialize the mutex to unlocked state.
|
||||
*
|
||||
* It is not allowed to initialize an already locked mutex.
|
||||
*/
|
||||
void
|
||||
__mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
|
||||
{
|
||||
atomic_set(&lock->count, 1);
|
||||
spin_lock_init(&lock->wait_lock);
|
||||
INIT_LIST_HEAD(&lock->wait_list);
|
||||
|
||||
debug_mutex_init(lock, name, key);
|
||||
}
|
||||
|
||||
EXPORT_SYMBOL(__mutex_init);
|
||||
|
||||
/*
|
||||
* We split the mutex lock/unlock logic into separate fastpath and
|
||||
* slowpath functions, to reduce the register pressure on the fastpath.
|
||||
* We also put the fastpath first in the kernel image, to make sure the
|
||||
* branch is predicted by the CPU as default-untaken.
|
||||
*/
|
||||
static void fastcall noinline __sched
|
||||
__mutex_lock_slowpath(atomic_t *lock_count);
|
||||
|
||||
/***
|
||||
* mutex_lock - acquire the mutex
|
||||
* @lock: the mutex to be acquired
|
||||
*
|
||||
* Lock the mutex exclusively for this task. If the mutex is not
|
||||
* available right now, it will sleep until it can get it.
|
||||
*
|
||||
* The mutex must later on be released by the same task that
|
||||
* acquired it. Recursive locking is not allowed. The task
|
||||
* may not exit without first unlocking the mutex. Also, kernel
|
||||
* memory where the mutex resides mutex must not be freed with
|
||||
* the mutex still locked. The mutex must first be initialized
|
||||
* (or statically defined) before it can be locked. memset()-ing
|
||||
* the mutex to 0 is not allowed.
|
||||
*
|
||||
* ( The CONFIG_DEBUG_MUTEXES .config option turns on debugging
|
||||
* checks that will enforce the restrictions and will also do
|
||||
* deadlock debugging. )
|
||||
*
|
||||
* This function is similar to (but not equivalent to) down().
|
||||
*/
|
||||
void inline fastcall __sched mutex_lock(struct mutex *lock)
|
||||
{
|
||||
might_sleep();
|
||||
/*
|
||||
* The locking fastpath is the 1->0 transition from
|
||||
* 'unlocked' into 'locked' state.
|
||||
*/
|
||||
__mutex_fastpath_lock(&lock->count, __mutex_lock_slowpath);
|
||||
}
|
||||
|
||||
EXPORT_SYMBOL(mutex_lock);
|
||||
|
||||
static void fastcall noinline __sched
|
||||
__mutex_unlock_slowpath(atomic_t *lock_count);
|
||||
|
||||
/***
|
||||
* mutex_unlock - release the mutex
|
||||
* @lock: the mutex to be released
|
||||
*
|
||||
* Unlock a mutex that has been locked by this task previously.
|
||||
*
|
||||
* This function must not be used in interrupt context. Unlocking
|
||||
* of a not locked mutex is not allowed.
|
||||
*
|
||||
* This function is similar to (but not equivalent to) up().
|
||||
*/
|
||||
void fastcall __sched mutex_unlock(struct mutex *lock)
|
||||
{
|
||||
/*
|
||||
* The unlocking fastpath is the 0->1 transition from 'locked'
|
||||
* into 'unlocked' state:
|
||||
*/
|
||||
__mutex_fastpath_unlock(&lock->count, __mutex_unlock_slowpath);
|
||||
}
|
||||
|
||||
EXPORT_SYMBOL(mutex_unlock);
|
||||
|
||||
/*
|
||||
* Lock a mutex (possibly interruptible), slowpath:
|
||||
*/
|
||||
static inline int __sched
|
||||
__mutex_lock_common(struct mutex *lock, long state, unsigned int subclass)
|
||||
{
|
||||
struct task_struct *task = current;
|
||||
struct mutex_waiter waiter;
|
||||
unsigned int old_val;
|
||||
unsigned long flags;
|
||||
|
||||
spin_lock_mutex(&lock->wait_lock, flags);
|
||||
|
||||
debug_mutex_lock_common(lock, &waiter);
|
||||
mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
|
||||
debug_mutex_add_waiter(lock, &waiter, task->thread_info);
|
||||
|
||||
/* add waiting tasks to the end of the waitqueue (FIFO): */
|
||||
list_add_tail(&waiter.list, &lock->wait_list);
|
||||
waiter.task = task;
|
||||
|
||||
for (;;) {
|
||||
/*
|
||||
* Lets try to take the lock again - this is needed even if
|
||||
* we get here for the first time (shortly after failing to
|
||||
* acquire the lock), to make sure that we get a wakeup once
|
||||
* it's unlocked. Later on, if we sleep, this is the
|
||||
* operation that gives us the lock. We xchg it to -1, so
|
||||
* that when we release the lock, we properly wake up the
|
||||
* other waiters:
|
||||
*/
|
||||
old_val = atomic_xchg(&lock->count, -1);
|
||||
if (old_val == 1)
|
||||
break;
|
||||
|
||||
/*
|
||||
* got a signal? (This code gets eliminated in the
|
||||
* TASK_UNINTERRUPTIBLE case.)
|
||||
*/
|
||||
if (unlikely(state == TASK_INTERRUPTIBLE &&
|
||||
signal_pending(task))) {
|
||||
mutex_remove_waiter(lock, &waiter, task->thread_info);
|
||||
mutex_release(&lock->dep_map, 1, _RET_IP_);
|
||||
spin_unlock_mutex(&lock->wait_lock, flags);
|
||||
|
||||
debug_mutex_free_waiter(&waiter);
|
||||
return -EINTR;
|
||||
}
|
||||
__set_task_state(task, state);
|
||||
|
||||
/* didnt get the lock, go to sleep: */
|
||||
spin_unlock_mutex(&lock->wait_lock, flags);
|
||||
schedule();
|
||||
spin_lock_mutex(&lock->wait_lock, flags);
|
||||
}
|
||||
|
||||
/* got the lock - rejoice! */
|
||||
mutex_remove_waiter(lock, &waiter, task->thread_info);
|
||||
debug_mutex_set_owner(lock, task->thread_info);
|
||||
|
||||
/* set it to 0 if there are no waiters left: */
|
||||
if (likely(list_empty(&lock->wait_list)))
|
||||
atomic_set(&lock->count, 0);
|
||||
|
||||
spin_unlock_mutex(&lock->wait_lock, flags);
|
||||
|
||||
debug_mutex_free_waiter(&waiter);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Uninterruptible slowpath, called from the lock fastpath on contention. */
static void fastcall noinline __sched
__mutex_lock_slowpath(atomic_t *lock_count)
{
	struct mutex *lock = container_of(lock_count, struct mutex, count);

	__mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0);
}
|
||||
|
||||
#ifdef CONFIG_DEBUG_LOCK_ALLOC
/*
 * Lockdep-annotated variants: @subclass distinguishes nested
 * acquisitions of locks belonging to the same lock class.
 */
void __sched
mutex_lock_nested(struct mutex *lock, unsigned int subclass)
{
	might_sleep();
	__mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass);
}

EXPORT_SYMBOL_GPL(mutex_lock_nested);

int __sched
mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass)
{
	might_sleep();
	return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, subclass);
}

EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested);
#endif
|
||||
|
||||
/*
|
||||
* Release the lock, slowpath:
|
||||
*/
|
||||
static fastcall inline void
|
||||
__mutex_unlock_common_slowpath(atomic_t *lock_count, int nested)
|
||||
{
|
||||
struct mutex *lock = container_of(lock_count, struct mutex, count);
|
||||
unsigned long flags;
|
||||
|
||||
spin_lock_mutex(&lock->wait_lock, flags);
|
||||
mutex_release(&lock->dep_map, nested, _RET_IP_);
|
||||
debug_mutex_unlock(lock);
|
||||
|
||||
/*
|
||||
* some architectures leave the lock unlocked in the fastpath failure
|
||||
* case, others need to leave it locked. In the later case we have to
|
||||
* unlock it here
|
||||
*/
|
||||
if (__mutex_slowpath_needs_to_unlock())
|
||||
atomic_set(&lock->count, 1);
|
||||
|
||||
if (!list_empty(&lock->wait_list)) {
|
||||
/* get the first entry from the wait-list: */
|
||||
struct mutex_waiter *waiter =
|
||||
list_entry(lock->wait_list.next,
|
||||
struct mutex_waiter, list);
|
||||
|
||||
debug_mutex_wake_waiter(lock, waiter);
|
||||
|
||||
wake_up_process(waiter->task);
|
||||
}
|
||||
|
||||
debug_mutex_clear_owner(lock);
|
||||
|
||||
spin_unlock_mutex(&lock->wait_lock, flags);
|
||||
}
|
||||
|
||||
/*
|
||||
* Release the lock, slowpath:
|
||||
*/
|
||||
static fastcall noinline void
|
||||
__mutex_unlock_slowpath(atomic_t *lock_count)
|
||||
{
|
||||
__mutex_unlock_common_slowpath(lock_count, 1);
|
||||
}
|
||||
|
||||
/*
|
||||
* Here come the less common (and hence less performance-critical) APIs:
|
||||
* mutex_lock_interruptible() and mutex_trylock().
|
||||
*/
|
||||
static int fastcall noinline __sched
|
||||
__mutex_lock_interruptible_slowpath(atomic_t *lock_count);
|
||||
|
||||
/***
|
||||
* mutex_lock_interruptible - acquire the mutex, interruptable
|
||||
* @lock: the mutex to be acquired
|
||||
*
|
||||
* Lock the mutex like mutex_lock(), and return 0 if the mutex has
|
||||
* been acquired or sleep until the mutex becomes available. If a
|
||||
* signal arrives while waiting for the lock then this function
|
||||
* returns -EINTR.
|
||||
*
|
||||
* This function is similar to (but not equivalent to) down_interruptible().
|
||||
*/
|
||||
int fastcall __sched mutex_lock_interruptible(struct mutex *lock)
|
||||
{
|
||||
might_sleep();
|
||||
return __mutex_fastpath_lock_retval
|
||||
(&lock->count, __mutex_lock_interruptible_slowpath);
|
||||
}
|
||||
|
||||
EXPORT_SYMBOL(mutex_lock_interruptible);
|
||||
|
||||
/* Interruptible slowpath; returns 0 or -EINTR from the common path. */
static int fastcall noinline __sched
__mutex_lock_interruptible_slowpath(atomic_t *lock_count)
{
	struct mutex *lock = container_of(lock_count, struct mutex, count);

	return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0);
}
|
||||
|
||||
/*
|
||||
* Spinlock based trylock, we take the spinlock and check whether we
|
||||
* can get the lock:
|
||||
*/
|
||||
static inline int __mutex_trylock_slowpath(atomic_t *lock_count)
|
||||
{
|
||||
struct mutex *lock = container_of(lock_count, struct mutex, count);
|
||||
unsigned long flags;
|
||||
int prev;
|
||||
|
||||
spin_lock_mutex(&lock->wait_lock, flags);
|
||||
|
||||
prev = atomic_xchg(&lock->count, -1);
|
||||
if (likely(prev == 1)) {
|
||||
debug_mutex_set_owner(lock, current_thread_info());
|
||||
mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);
|
||||
}
|
||||
/* Set it back to 0 if there are no waiters: */
|
||||
if (likely(list_empty(&lock->wait_list)))
|
||||
atomic_set(&lock->count, 0);
|
||||
|
||||
spin_unlock_mutex(&lock->wait_lock, flags);
|
||||
|
||||
return prev == 1;
|
||||
}
|
||||
|
||||
/***
|
||||
* mutex_trylock - try acquire the mutex, without waiting
|
||||
* @lock: the mutex to be acquired
|
||||
*
|
||||
* Try to acquire the mutex atomically. Returns 1 if the mutex
|
||||
* has been acquired successfully, and 0 on contention.
|
||||
*
|
||||
* NOTE: this function follows the spin_trylock() convention, so
|
||||
* it is negated to the down_trylock() return values! Be careful
|
||||
* about this when converting semaphore users to mutexes.
|
||||
*
|
||||
* This function must not be used in interrupt context. The
|
||||
* mutex must be released by the same task that acquired it.
|
||||
*/
|
||||
int fastcall __sched mutex_trylock(struct mutex *lock)
|
||||
{
|
||||
return __mutex_fastpath_trylock(&lock->count,
|
||||
__mutex_trylock_slowpath);
|
||||
}
|
||||
|
||||
EXPORT_SYMBOL(mutex_trylock);
|
||||
30
kernel/mutex.h
Normal file
30
kernel/mutex.h
Normal file
@@ -0,0 +1,30 @@
|
||||
/*
|
||||
* Mutexes: blocking mutual exclusion locks
|
||||
*
|
||||
* started by Ingo Molnar:
|
||||
*
|
||||
* Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
|
||||
*
|
||||
* This file contains mutex debugging related internal prototypes, for the
|
||||
* !CONFIG_DEBUG_MUTEXES case. Most of them are NOPs:
|
||||
*/
|
||||
|
||||
/*
 * !CONFIG_DEBUG_MUTEXES variants: the wait_lock is a plain spinlock
 * and all the debug hooks compile away to nothing.
 */
#define spin_lock_mutex(lock, flags) \
		do { spin_lock(lock); (void)(flags); } while (0)
#define spin_unlock_mutex(lock, flags) \
		do { spin_unlock(lock); (void)(flags); } while (0)
#define mutex_remove_waiter(lock, waiter, ti) \
		__list_del((waiter)->list.prev, (waiter)->list.next)

#define debug_mutex_set_owner(lock, new_owner)		do { } while (0)
#define debug_mutex_clear_owner(lock)			do { } while (0)
#define debug_mutex_wake_waiter(lock, waiter)		do { } while (0)
#define debug_mutex_free_waiter(waiter)			do { } while (0)
#define debug_mutex_add_waiter(lock, waiter, ti)	do { } while (0)
#define debug_mutex_unlock(lock)			do { } while (0)
#define debug_mutex_init(lock, name, key)		do { } while (0)

/* A function (not a macro) so unused-variable warnings are avoided. */
static inline void
debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter)
{
}
|
||||
149
kernel/nsproxy.c
Normal file
149
kernel/nsproxy.c
Normal file
@@ -0,0 +1,149 @@
|
||||
/*
|
||||
* Copyright (C) 2006 IBM Corporation
|
||||
*
|
||||
* Author: Serge Hallyn <serue@us.ibm.com>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License as
|
||||
* published by the Free Software Foundation, version 2 of the
|
||||
* License.
|
||||
*
|
||||
* Jun 2006 - namespaces support
|
||||
* OpenVZ, SWsoft Inc.
|
||||
* Pavel Emelianov <xemul@openvz.org>
|
||||
*/
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/version.h>
|
||||
#include <linux/nsproxy.h>
|
||||
#include <linux/init_task.h>
|
||||
#include <linux/mnt_namespace.h>
|
||||
#include <linux/utsname.h>
|
||||
#include <linux/pid_namespace.h>
|
||||
|
||||
struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy);
|
||||
|
||||
static inline void get_nsproxy(struct nsproxy *ns)
|
||||
{
|
||||
atomic_inc(&ns->count);
|
||||
}
|
||||
|
||||
void get_task_namespaces(struct task_struct *tsk)
|
||||
{
|
||||
struct nsproxy *ns = tsk->nsproxy;
|
||||
if (ns) {
|
||||
get_nsproxy(ns);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* creates a copy of "orig" with refcount 1.
|
||||
* This does not grab references to the contained namespaces,
|
||||
* so that needs to be done by dup_namespaces.
|
||||
*/
|
||||
static inline struct nsproxy *clone_namespaces(struct nsproxy *orig)
|
||||
{
|
||||
struct nsproxy *ns;
|
||||
|
||||
ns = kmemdup(orig, sizeof(struct nsproxy), GFP_KERNEL);
|
||||
if (ns)
|
||||
atomic_set(&ns->count, 1);
|
||||
return ns;
|
||||
}
|
||||
|
||||
/*
|
||||
* copies the nsproxy, setting refcount to 1, and grabbing a
|
||||
* reference to all contained namespaces. Called from
|
||||
* sys_unshare()
|
||||
*/
|
||||
struct nsproxy *dup_namespaces(struct nsproxy *orig)
|
||||
{
|
||||
struct nsproxy *ns = clone_namespaces(orig);
|
||||
|
||||
if (ns) {
|
||||
if (ns->mnt_ns)
|
||||
get_mnt_ns(ns->mnt_ns);
|
||||
if (ns->uts_ns)
|
||||
get_uts_ns(ns->uts_ns);
|
||||
if (ns->ipc_ns)
|
||||
get_ipc_ns(ns->ipc_ns);
|
||||
if (ns->pid_ns)
|
||||
get_pid_ns(ns->pid_ns);
|
||||
}
|
||||
|
||||
return ns;
|
||||
}
|
||||
|
||||
/*
|
||||
* called from clone. This now handles copy for nsproxy and all
|
||||
* namespaces therein.
|
||||
*/
|
||||
int copy_namespaces(int flags, struct task_struct *tsk)
|
||||
{
|
||||
struct nsproxy *old_ns = tsk->nsproxy;
|
||||
struct nsproxy *new_ns;
|
||||
int err = 0;
|
||||
|
||||
if (!old_ns)
|
||||
return 0;
|
||||
|
||||
get_nsproxy(old_ns);
|
||||
|
||||
if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC)))
|
||||
return 0;
|
||||
|
||||
new_ns = clone_namespaces(old_ns);
|
||||
if (!new_ns) {
|
||||
err = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
|
||||
tsk->nsproxy = new_ns;
|
||||
|
||||
err = copy_mnt_ns(flags, tsk);
|
||||
if (err)
|
||||
goto out_ns;
|
||||
|
||||
err = copy_utsname(flags, tsk);
|
||||
if (err)
|
||||
goto out_uts;
|
||||
|
||||
err = copy_ipcs(flags, tsk);
|
||||
if (err)
|
||||
goto out_ipc;
|
||||
|
||||
err = copy_pid_ns(flags, tsk);
|
||||
if (err)
|
||||
goto out_pid;
|
||||
|
||||
out:
|
||||
put_nsproxy(old_ns);
|
||||
return err;
|
||||
|
||||
out_pid:
|
||||
if (new_ns->ipc_ns)
|
||||
put_ipc_ns(new_ns->ipc_ns);
|
||||
out_ipc:
|
||||
if (new_ns->uts_ns)
|
||||
put_uts_ns(new_ns->uts_ns);
|
||||
out_uts:
|
||||
if (new_ns->mnt_ns)
|
||||
put_mnt_ns(new_ns->mnt_ns);
|
||||
out_ns:
|
||||
tsk->nsproxy = old_ns;
|
||||
kfree(new_ns);
|
||||
goto out;
|
||||
}
|
||||
|
||||
void free_nsproxy(struct nsproxy *ns)
|
||||
{
|
||||
if (ns->mnt_ns)
|
||||
put_mnt_ns(ns->mnt_ns);
|
||||
if (ns->uts_ns)
|
||||
put_uts_ns(ns->uts_ns);
|
||||
if (ns->ipc_ns)
|
||||
put_ipc_ns(ns->ipc_ns);
|
||||
if (ns->pid_ns)
|
||||
put_pid_ns(ns->pid_ns);
|
||||
kfree(ns);
|
||||
}
|
||||
286
kernel/panic.c
Normal file
286
kernel/panic.c
Normal file
@@ -0,0 +1,286 @@
|
||||
/*
|
||||
* linux/kernel/panic.c
|
||||
*
|
||||
* Copyright (C) 1991, 1992 Linus Torvalds
|
||||
*/
|
||||
|
||||
/*
|
||||
* This function is used through-out the kernel (including mm and fs)
|
||||
* to indicate a major problem.
|
||||
*/
|
||||
#include <linux/module.h>
|
||||
#include <linux/sched.h>
|
||||
#include <linux/delay.h>
|
||||
#include <linux/reboot.h>
|
||||
#include <linux/notifier.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/sysrq.h>
|
||||
#include <linux/interrupt.h>
|
||||
#include <linux/nmi.h>
|
||||
#include <linux/kexec.h>
|
||||
#include <linux/debug_locks.h>
|
||||
|
||||
int panic_on_oops;
|
||||
int tainted;
|
||||
static int pause_on_oops;
|
||||
static int pause_on_oops_flag;
|
||||
static DEFINE_SPINLOCK(pause_on_oops_lock);
|
||||
|
||||
int panic_timeout;
|
||||
|
||||
ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
|
||||
|
||||
EXPORT_SYMBOL(panic_notifier_list);
|
||||
|
||||
static int __init panic_setup(char *str)
|
||||
{
|
||||
panic_timeout = simple_strtoul(str, NULL, 0);
|
||||
return 1;
|
||||
}
|
||||
__setup("panic=", panic_setup);
|
||||
|
||||
/* Default panic_blink implementation: blink nothing, waited 0 ms. */
static long no_blink(long time)
{
	return 0;
}
|
||||
|
||||
/* Returns how long it waited in ms */
long (*panic_blink)(long time);
EXPORT_SYMBOL(panic_blink);
||||
|
||||
/**
|
||||
* panic - halt the system
|
||||
* @fmt: The text string to print
|
||||
*
|
||||
* Display a message, then perform cleanups.
|
||||
*
|
||||
* This function never returns.
|
||||
*/
|
||||
|
||||
NORET_TYPE void panic(const char * fmt, ...)
|
||||
{
|
||||
long i;
|
||||
static char buf[1024];
|
||||
va_list args;
|
||||
#if defined(CONFIG_S390)
|
||||
unsigned long caller = (unsigned long) __builtin_return_address(0);
|
||||
#endif
|
||||
|
||||
/*
|
||||
* It's possible to come here directly from a panic-assertion and not
|
||||
* have preempt disabled. Some functions called from here want
|
||||
* preempt to be disabled. No point enabling it later though...
|
||||
*/
|
||||
preempt_disable();
|
||||
|
||||
bust_spinlocks(1);
|
||||
va_start(args, fmt);
|
||||
vsnprintf(buf, sizeof(buf), fmt, args);
|
||||
va_end(args);
|
||||
printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf);
|
||||
bust_spinlocks(0);
|
||||
|
||||
/*
|
||||
* If we have crashed and we have a crash kernel loaded let it handle
|
||||
* everything else.
|
||||
* Do we want to call this before we try to display a message?
|
||||
*/
|
||||
crash_kexec(NULL);
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
/*
|
||||
* Note smp_send_stop is the usual smp shutdown function, which
|
||||
* unfortunately means it may not be hardened to work in a panic
|
||||
* situation.
|
||||
*/
|
||||
smp_send_stop();
|
||||
#endif
|
||||
|
||||
atomic_notifier_call_chain(&panic_notifier_list, 0, buf);
|
||||
|
||||
if (!panic_blink)
|
||||
panic_blink = no_blink;
|
||||
|
||||
if (panic_timeout > 0) {
|
||||
/*
|
||||
* Delay timeout seconds before rebooting the machine.
|
||||
* We can't use the "normal" timers since we just panicked..
|
||||
*/
|
||||
printk(KERN_EMERG "Rebooting in %d seconds..",panic_timeout);
|
||||
for (i = 0; i < panic_timeout*1000; ) {
|
||||
touch_nmi_watchdog();
|
||||
i += panic_blink(i);
|
||||
mdelay(1);
|
||||
i++;
|
||||
}
|
||||
/* This will not be a clean reboot, with everything
|
||||
* shutting down. But if there is a chance of
|
||||
* rebooting the system it will be rebooted.
|
||||
*/
|
||||
emergency_restart();
|
||||
}
|
||||
#ifdef __sparc__
|
||||
{
|
||||
extern int stop_a_enabled;
|
||||
/* Make sure the user can actually press Stop-A (L1-A) */
|
||||
stop_a_enabled = 1;
|
||||
printk(KERN_EMERG "Press Stop-A (L1-A) to return to the boot prom\n");
|
||||
}
|
||||
#endif
|
||||
#if defined(CONFIG_S390)
|
||||
disabled_wait(caller);
|
||||
#endif
|
||||
local_irq_enable();
|
||||
for (i = 0;;) {
|
||||
touch_softlockup_watchdog();
|
||||
i += panic_blink(i);
|
||||
mdelay(1);
|
||||
i++;
|
||||
}
|
||||
}
|
||||
|
||||
EXPORT_SYMBOL(panic);
|
||||
|
||||
/**
|
||||
* print_tainted - return a string to represent the kernel taint state.
|
||||
*
|
||||
* 'P' - Proprietary module has been loaded.
|
||||
* 'F' - Module has been forcibly loaded.
|
||||
* 'S' - SMP with CPUs not designed for SMP.
|
||||
* 'R' - User forced a module unload.
|
||||
* 'M' - Machine had a machine check experience.
|
||||
* 'B' - System has hit bad_page.
|
||||
* 'U' - Userspace-defined naughtiness.
|
||||
*
|
||||
* The string is overwritten by the next call to print_taint().
|
||||
*/
|
||||
|
||||
const char *print_tainted(void)
|
||||
{
|
||||
static char buf[20];
|
||||
if (tainted) {
|
||||
snprintf(buf, sizeof(buf), "Tainted: %c%c%c%c%c%c%c",
|
||||
tainted & TAINT_PROPRIETARY_MODULE ? 'P' : 'G',
|
||||
tainted & TAINT_FORCED_MODULE ? 'F' : ' ',
|
||||
tainted & TAINT_UNSAFE_SMP ? 'S' : ' ',
|
||||
tainted & TAINT_FORCED_RMMOD ? 'R' : ' ',
|
||||
tainted & TAINT_MACHINE_CHECK ? 'M' : ' ',
|
||||
tainted & TAINT_BAD_PAGE ? 'B' : ' ',
|
||||
tainted & TAINT_USER ? 'U' : ' ');
|
||||
}
|
||||
else
|
||||
snprintf(buf, sizeof(buf), "Not tainted");
|
||||
return(buf);
|
||||
}
|
||||
|
||||
void add_taint(unsigned flag)
|
||||
{
|
||||
debug_locks = 0; /* can't trust the integrity of the kernel anymore */
|
||||
tainted |= flag;
|
||||
}
|
||||
EXPORT_SYMBOL(add_taint);
|
||||
|
||||
static int __init pause_on_oops_setup(char *str)
|
||||
{
|
||||
pause_on_oops = simple_strtoul(str, NULL, 0);
|
||||
return 1;
|
||||
}
|
||||
__setup("pause_on_oops=", pause_on_oops_setup);
|
||||
|
||||
/*
 * Busy-wait for @msecs milliseconds, poking the NMI watchdog each
 * millisecond so it does not fire during the pause.
 */
static void spin_msec(int msecs)
{
	int i;

	for (i = 0; i < msecs; i++) {
		touch_nmi_watchdog();
		mdelay(1);
	}
}
|
||||
|
||||
/*
 * It just happens that oops_enter() and oops_exit() are identically
 * implemented...
 *
 * First oopsing CPU sets pause_on_oops_flag and proceeds; later CPUs
 * stall here so the first oops is not scrolled off the screen.
 */
static void do_oops_enter_exit(void)
{
	unsigned long flags;
	static int spin_counter;	/* seconds left; owned by the first staller */

	if (!pause_on_oops)
		return;

	spin_lock_irqsave(&pause_on_oops_lock, flags);
	if (pause_on_oops_flag == 0) {
		/* This CPU may now print the oops message */
		pause_on_oops_flag = 1;
	} else {
		/* We need to stall this CPU */
		if (!spin_counter) {
			/* This CPU gets to do the counting */
			spin_counter = pause_on_oops;
			do {
				/* Drop the lock while delaying so other
				 * CPUs can observe spin_counter. */
				spin_unlock(&pause_on_oops_lock);
				spin_msec(MSEC_PER_SEC);
				spin_lock(&pause_on_oops_lock);
			} while (--spin_counter);
			pause_on_oops_flag = 0;
		} else {
			/* This CPU waits for a different one */
			while (spin_counter) {
				spin_unlock(&pause_on_oops_lock);
				spin_msec(1);
				spin_lock(&pause_on_oops_lock);
			}
		}
	}
	spin_unlock_irqrestore(&pause_on_oops_lock, flags);
}
|
||||
|
||||
/*
|
||||
* Return true if the calling CPU is allowed to print oops-related info. This
|
||||
* is a bit racy..
|
||||
*/
|
||||
int oops_may_print(void)
|
||||
{
|
||||
return pause_on_oops_flag == 0;
|
||||
}
|
||||
|
||||
/*
 * Called when the architecture enters its oops handler, before it prints
 * anything.  If this is the first CPU to oops, and it's oopsing the first time
 * then let it proceed.
 *
 * This is all enabled by the pause_on_oops kernel boot option.  We do all this
 * to ensure that oopses don't scroll off the screen.  It has the side-effect
 * of preventing later-oopsing CPUs from mucking up the display, too.
 *
 * It turns out that the CPU which is allowed to print ends up pausing for the
 * right duration, whereas all the other CPUs pause for twice as long: once in
 * oops_enter(), once in oops_exit().
 */
void oops_enter(void)
{
	/* An oopsing kernel may hold locks in an unknown state. */
	debug_locks_off(); /* can't trust the integrity of the kernel anymore */
	do_oops_enter_exit();
}
|
||||
|
||||
/*
 * Called when the architecture exits its oops handler, after printing
 * everything.  Pairs with oops_enter(); see do_oops_enter_exit() for the
 * stall logic shared by both.
 */
void oops_exit(void)
{
	do_oops_enter_exit();
}
|
||||
|
||||
#ifdef CONFIG_CC_STACKPROTECTOR
|
||||
/*
 * Called when gcc's -fstack-protector feature is used, and
 * gcc detects corruption of the on-stack canary value.
 * Never returns: panics immediately, since the stack is untrustworthy.
 */
void __stack_chk_fail(void)
{
	panic("stack-protector: Kernel stack is corrupted");
}
EXPORT_SYMBOL(__stack_chk_fail);
|
||||
#endif
|
||||
751
kernel/params.c
Normal file
751
kernel/params.c
Normal file
@@ -0,0 +1,751 @@
|
||||
/* Helpers for initial module or kernel cmdline parsing
|
||||
Copyright (C) 2001 Rusty Russell.
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
*/
|
||||
#include <linux/moduleparam.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/string.h>
|
||||
#include <linux/errno.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/device.h>
|
||||
#include <linux/err.h>
|
||||
#include <linux/slab.h>
|
||||
|
||||
#if 0
|
||||
#define DEBUGP printk
|
||||
#else
|
||||
#define DEBUGP(fmt, a...)
|
||||
#endif
|
||||
|
||||
/* Map '-' to '_' so "foo-bar" and "foo_bar" name the same parameter. */
static inline char dash2underscore(char c)
{
	return (c == '-') ? '_' : c;
}
|
||||
|
||||
/* Compare a user-supplied name against a parameter name, treating '-'
 * in @input as equivalent to '_'.  Returns 1 on match, 0 otherwise. */
static inline int parameq(const char *input, const char *paramname)
{
	unsigned int i = 0;

	while (dash2underscore(input[i]) == paramname[i]) {
		if (input[i] == '\0')
			return 1;
		i++;
	}
	return 0;
}
|
||||
|
||||
static int parse_one(char *param,
|
||||
char *val,
|
||||
struct kernel_param *params,
|
||||
unsigned num_params,
|
||||
int (*handle_unknown)(char *param, char *val))
|
||||
{
|
||||
unsigned int i;
|
||||
|
||||
/* Find parameter */
|
||||
for (i = 0; i < num_params; i++) {
|
||||
if (parameq(param, params[i].name)) {
|
||||
DEBUGP("They are equal! Calling %p\n",
|
||||
params[i].set);
|
||||
return params[i].set(val, ¶ms[i]);
|
||||
}
|
||||
}
|
||||
|
||||
if (handle_unknown) {
|
||||
DEBUGP("Unknown argument: calling %p\n", handle_unknown);
|
||||
return handle_unknown(param, val);
|
||||
}
|
||||
|
||||
DEBUGP("Unknown argument `%s'\n", param);
|
||||
return -ENOENT;
|
||||
}
|
||||
|
||||
/* You can use " around spaces, but can't escape ". */
/* Hyphens and underscores equivalent in parameter names. */
/*
 * Split the next "name[=value]" token off @args, NUL-terminating it in
 * place (mutates @args).  *param points at the name, *val at the value
 * or NULL when there is no '='.  Returns a pointer past the token with
 * trailing spaces skipped.
 */
static char *next_arg(char *args, char **param, char **val)
{
	unsigned int i, equals = 0;
	int in_quote = 0, quoted = 0;
	char *next;

	/* A leading quote protects spaces in the parameter name itself. */
	if (*args == '"') {
		args++;
		in_quote = 1;
		quoted = 1;
	}

	/* Scan to the first unquoted space, remembering the first '='. */
	for (i = 0; args[i]; i++) {
		if (args[i] == ' ' && !in_quote)
			break;
		if (equals == 0) {
			if (args[i] == '=')
				equals = i;
		}
		if (args[i] == '"')
			in_quote = !in_quote;
	}

	*param = args;
	if (!equals)
		*val = NULL;
	else {
		args[equals] = '\0';
		*val = args + equals + 1;

		/* Don't include quotes in value. */
		if (**val == '"') {
			(*val)++;
			if (args[i-1] == '"')
				args[i-1] = '\0';
		}
		if (quoted && args[i-1] == '"')
			args[i-1] = '\0';
	}

	/* Terminate the token and step past it (if not at end of string). */
	if (args[i]) {
		args[i] = '\0';
		next = args + i + 1;
	} else
		next = args + i;

	/* Chew up trailing spaces. */
	while (*next == ' ')
		next++;
	return next;
}
|
||||
|
||||
/* Args looks like "foo=bar,bar2 baz=fuz wiz". */
|
||||
int parse_args(const char *name,
|
||||
char *args,
|
||||
struct kernel_param *params,
|
||||
unsigned num,
|
||||
int (*unknown)(char *param, char *val))
|
||||
{
|
||||
char *param, *val;
|
||||
|
||||
DEBUGP("Parsing ARGS: %s\n", args);
|
||||
|
||||
/* Chew leading spaces */
|
||||
while (*args == ' ')
|
||||
args++;
|
||||
|
||||
while (*args) {
|
||||
int ret;
|
||||
int irq_was_disabled;
|
||||
|
||||
args = next_arg(args, ¶m, &val);
|
||||
irq_was_disabled = irqs_disabled();
|
||||
ret = parse_one(param, val, params, num, unknown);
|
||||
if (irq_was_disabled && !irqs_disabled()) {
|
||||
printk(KERN_WARNING "parse_args(): option '%s' enabled "
|
||||
"irq's!\n", param);
|
||||
}
|
||||
switch (ret) {
|
||||
case -ENOENT:
|
||||
printk(KERN_ERR "%s: Unknown parameter `%s'\n",
|
||||
name, param);
|
||||
return ret;
|
||||
case -ENOSPC:
|
||||
printk(KERN_ERR
|
||||
"%s: `%s' too large for parameter `%s'\n",
|
||||
name, val ?: "", param);
|
||||
return ret;
|
||||
case 0:
|
||||
break;
|
||||
default:
|
||||
printk(KERN_ERR
|
||||
"%s: `%s' invalid for parameter `%s'\n",
|
||||
name, val ?: "", param);
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
|
||||
/* All parsed OK. */
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Lazy bastard, eh? */
/*
 * Stamp out param_set_NAME()/param_get_NAME() pairs for the basic
 * integer types.  set parses with @strtolfn into the wider @tmptype and
 * rejects values that do not round-trip through @type; get formats with
 * @format.  NOTE(review): byte uses "%c", so param_get_byte prints the
 * raw character rather than a number -- looks intentional here, confirm.
 */
#define STANDARD_PARAM_DEF(name, type, format, tmptype, strtolfn)	\
	int param_set_##name(const char *val, struct kernel_param *kp)	\
	{								\
		char *endp;						\
		tmptype l;						\
									\
		if (!val) return -EINVAL;				\
		l = strtolfn(val, &endp, 0);				\
		if (endp == val || ((type)l != l))			\
			return -EINVAL;					\
		*((type *)kp->arg) = l;					\
		return 0;						\
	}								\
	int param_get_##name(char *buffer, struct kernel_param *kp)	\
	{								\
		return sprintf(buffer, format, *((type *)kp->arg));	\
	}

STANDARD_PARAM_DEF(byte, unsigned char, "%c", unsigned long, simple_strtoul);
STANDARD_PARAM_DEF(short, short, "%hi", long, simple_strtol);
STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", unsigned long, simple_strtoul);
STANDARD_PARAM_DEF(int, int, "%i", long, simple_strtol);
STANDARD_PARAM_DEF(uint, unsigned int, "%u", unsigned long, simple_strtoul);
STANDARD_PARAM_DEF(long, long, "%li", long, simple_strtol);
STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, simple_strtoul);
|
||||
|
||||
/*
 * Set a char* parameter.  The string is NOT copied: kp->arg ends up
 * aliasing @val, so the caller's buffer must outlive the parameter.
 * Rejects NULL (-EINVAL) and strings over 1024 bytes (-ENOSPC).
 */
int param_set_charp(const char *val, struct kernel_param *kp)
{
	if (!val) {
		printk(KERN_ERR "%s: string parameter expected\n",
		       kp->name);
		return -EINVAL;
	}

	/* Arbitrary sanity cap on parameter string length. */
	if (strlen(val) > 1024) {
		printk(KERN_ERR "%s: string parameter too long\n",
		       kp->name);
		return -ENOSPC;
	}

	*(char **)kp->arg = (char *)val;
	return 0;
}
|
||||
|
||||
/* Format a char* parameter into @buffer; returns the byte count.
 * NOTE(review): unbounded sprintf -- assumes buffer is page-sized as
 * elsewhere in this file. */
int param_get_charp(char *buffer, struct kernel_param *kp)
{
	return sprintf(buffer, "%s", *((char **)kp->arg));
}
|
||||
|
||||
int param_set_bool(const char *val, struct kernel_param *kp)
|
||||
{
|
||||
/* No equals means "set"... */
|
||||
if (!val) val = "1";
|
||||
|
||||
/* One of =[yYnN01] */
|
||||
switch (val[0]) {
|
||||
case 'y': case 'Y': case '1':
|
||||
*(int *)kp->arg = 1;
|
||||
return 0;
|
||||
case 'n': case 'N': case '0':
|
||||
*(int *)kp->arg = 0;
|
||||
return 0;
|
||||
}
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
/* Format an int-backed boolean as a single 'Y' or 'N' character. */
int param_get_bool(char *buffer, struct kernel_param *kp)
{
	/* Y and N chosen as being relatively non-coder friendly */
	return sprintf(buffer, "%c", (*(int *)kp->arg) ? 'Y' : 'N');
}
|
||||
|
||||
/* Set an inverted boolean: parse @val like param_set_bool, then store
 * the logical negation.  Parsing goes through a dummy kernel_param so
 * error messages and semantics stay identical to param_set_bool. */
int param_set_invbool(const char *val, struct kernel_param *kp)
{
	int boolval, ret;
	struct kernel_param dummy = { .arg = &boolval };

	ret = param_set_bool(val, &dummy);
	if (ret == 0)
		*(int *)kp->arg = !boolval;
	return ret;
}
|
||||
|
||||
/* Format an inverted boolean: prints 'Y' when the stored value is 0. */
int param_get_invbool(char *buffer, struct kernel_param *kp)
{
	int val;
	struct kernel_param dummy = { .arg = &val };

	/* Negate first, then let param_get_bool do the formatting. */
	val = !*(int *)kp->arg;
	return param_get_bool(buffer, &dummy);
}
|
||||
|
||||
/* We cheat here and temporarily mangle the string. */
/*
 * Parse a comma-separated list in @val, calling @set once per element
 * into successive @elemsize slots starting at @elem.  Enforces between
 * @min and @max elements; *num receives the count.  Each element is
 * NUL-terminated in place by casting away const (hence "mangle").
 */
static int param_array(const char *name,
		       const char *val,
		       unsigned int min, unsigned int max,
		       void *elem, int elemsize,
		       int (*set)(const char *, struct kernel_param *kp),
		       int *num)
{
	int ret;
	struct kernel_param kp;
	char save;

	/* Get the name right for errors. */
	kp.name = name;
	kp.arg = elem;

	/* No equals sign? */
	if (!val) {
		printk(KERN_ERR "%s: expects arguments\n", name);
		return -EINVAL;
	}

	*num = 0;
	/* We expect a comma-separated list of values. */
	do {
		int len;

		if (*num == max) {
			printk(KERN_ERR "%s: can only take %i arguments\n",
			       name, max);
			return -EINVAL;
		}
		len = strcspn(val, ",");

		/* nul-terminate and parse */
		save = val[len];
		((char *)val)[len] = '\0';
		ret = set(val, &kp);

		if (ret != 0)
			return ret;
		/* void* arithmetic: GCC extension, treated as char*. */
		kp.arg += elemsize;
		val += len+1;
		(*num)++;
	} while (save == ',');

	if (*num < min) {
		printk(KERN_ERR "%s: needs at least %i arguments\n",
		       name, min);
		return -EINVAL;
	}
	return 0;
}
|
||||
|
||||
/* set handler for module_param_array(): forwards to param_array using
 * the kparam_array descriptor stored in kp->arg.  temp_num is a dummy
 * sink when the array was declared without a count variable. */
int param_array_set(const char *val, struct kernel_param *kp)
{
	struct kparam_array *arr = kp->arg;
	unsigned int temp_num;

	return param_array(kp->name, val, 1, arr->max, arr->elem,
			   arr->elemsize, arr->set, arr->num ?: &temp_num);
}
|
||||
|
||||
/* get handler for module_param_array(): formats each element with the
 * per-element get handler, comma-separated, and returns total length.
 * NOTE(review): no bound on @buffer -- assumed page-sized by sysfs. */
int param_array_get(char *buffer, struct kernel_param *kp)
{
	int i, off, ret;
	struct kparam_array *arr = kp->arg;
	struct kernel_param p;

	/* Copy kp so we can repoint .arg at each element in turn. */
	p = *kp;
	for (i = off = 0; i < (arr->num ? *arr->num : arr->max); i++) {
		if (i)
			buffer[off++] = ',';
		p.arg = arr->elem + arr->elemsize * i;
		ret = arr->get(buffer + off, &p);
		if (ret < 0)
			return ret;
		off += ret;
	}
	buffer[off] = '\0';
	return off;
}
|
||||
|
||||
/* Set a fixed-size string parameter by copying @val into the
 * kparam_string buffer (unlike param_set_charp, which aliases).
 * Rejects NULL and strings that do not fit including the NUL. */
int param_set_copystring(const char *val, struct kernel_param *kp)
{
	struct kparam_string *kps = kp->arg;

	if (!val) {
		printk(KERN_ERR "%s: missing param set value\n", kp->name);
		return -EINVAL;
	}
	if (strlen(val)+1 > kps->maxlen) {
		printk(KERN_ERR "%s: string doesn't fit in %u chars.\n",
		       kp->name, kps->maxlen-1);
		return -ENOSPC;
	}
	/* Length was verified above, so plain strcpy is safe here. */
	strcpy(kps->string, val);
	return 0;
}
|
||||
|
||||
/* Copy a kparam_string parameter into @buffer (bounded by maxlen);
 * returns the source string length, per strlcpy semantics. */
int param_get_string(char *buffer, struct kernel_param *kp)
{
	struct kparam_string *kps = kp->arg;
	return strlcpy(buffer, kps->string, kps->maxlen);
}
|
||||
|
||||
/* sysfs output in /sys/modules/XYZ/parameters/ */
|
||||
|
||||
extern struct kernel_param __start___param[], __stop___param[];
|
||||
|
||||
#define MAX_KBUILD_MODNAME KOBJ_NAME_LEN
|
||||
|
||||
/* One sysfs attribute per exported parameter: binds a module_attribute
 * (show/store entry points) to its backing kernel_param. */
struct param_attribute
{
	struct module_attribute mattr;
	struct kernel_param *param;
};

/* Per-module bundle: the sysfs "parameters" group followed by a
 * flexible array of its param_attributes (allocated in one chunk). */
struct module_param_attrs
{
	struct attribute_group grp;
	struct param_attribute attrs[0];
};
|
||||
|
||||
#ifdef CONFIG_SYSFS
|
||||
/* No trailing semicolon: the macro must expand to a plain expression
 * so it can be used anywhere an expression is valid. */
#define to_param_attr(n) container_of(n, struct param_attribute, mattr)
|
||||
|
||||
/* sysfs show: format the parameter via its get handler and append a
 * newline.  -EPERM for write-only parameters (no get handler). */
static ssize_t param_attr_show(struct module_attribute *mattr,
			       struct module *mod, char *buf)
{
	int count;
	struct param_attribute *attribute = to_param_attr(mattr);

	if (!attribute->param->get)
		return -EPERM;

	count = attribute->param->get(buf, attribute->param);
	if (count > 0) {
		/* sysfs convention: values end with a newline. */
		strcat(buf, "\n");
		++count;
	}
	return count;
}
|
||||
|
||||
/* sysfs always hands a nul-terminated string in buf. We rely on that. */
/* sysfs store: parse @buf via the parameter's set handler.
 * -EPERM for read-only parameters; on success consume all of @len. */
static ssize_t param_attr_store(struct module_attribute *mattr,
				struct module *owner,
				const char *buf, size_t len)
{
	int err;
	struct param_attribute *attribute = to_param_attr(mattr);

	if (!attribute->param->set)
		return -EPERM;

	err = attribute->param->set(buf, attribute->param);
	if (!err)
		return len;
	return err;
}
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_MODULES
|
||||
#define __modinit
|
||||
#else
|
||||
#define __modinit __init
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_SYSFS
|
||||
/*
 * param_sysfs_setup - setup sysfs support for one module or KBUILD_MODNAME
 * @mk: struct module_kobject (contains parent kobject)
 * @kparam: array of struct kernel_param, the actual parameter definitions
 * @num_params: number of entries in array
 * @name_skip: offset where the parameter name start in kparam[].name. Needed for built-in "modules"
 *
 * Create a kobject for a (per-module) group of parameters, and create files
 * in sysfs. A pointer to the param_kobject is returned on success,
 * NULL if there's no parameter to export, or other ERR_PTR(err).
 */
static __modinit struct module_param_attrs *
param_sysfs_setup(struct module_kobject *mk,
		  struct kernel_param *kparam,
		  unsigned int num_params,
		  unsigned int name_skip)
{
	struct module_param_attrs *mp;
	unsigned int valid_attrs = 0;
	unsigned int i, size[2];
	struct param_attribute *pattr;
	struct attribute **gattr;
	int err;

	/* Only parameters with non-zero permissions get sysfs files. */
	for (i=0; i<num_params; i++) {
		if (kparam[i].perm)
			valid_attrs++;
	}

	if (!valid_attrs)
		return NULL;

	/* Single allocation: [module_param_attrs + attrs[]][attr ptr array].
	 * size[0] is the struct + flexible array, aligned for the pointer
	 * array; size[1] is the NULL-terminated pointer array. */
	size[0] = ALIGN(sizeof(*mp) +
			valid_attrs * sizeof(mp->attrs[0]),
			sizeof(mp->grp.attrs[0]));
	size[1] = (valid_attrs + 1) * sizeof(mp->grp.attrs[0]);

	mp = kmalloc(size[0] + size[1], GFP_KERNEL);
	if (!mp)
		return ERR_PTR(-ENOMEM);

	mp->grp.name = "parameters";
	mp->grp.attrs = (void *)mp + size[0];

	pattr = &mp->attrs[0];
	gattr = &mp->grp.attrs[0];
	for (i = 0; i < num_params; i++) {
		struct kernel_param *kp = &kparam[i];
		if (kp->perm) {
			pattr->param = kp;
			pattr->mattr.show = param_attr_show;
			pattr->mattr.store = param_attr_store;
			/* Skip the "modname." prefix for built-ins. */
			pattr->mattr.attr.name = (char *)&kp->name[name_skip];
			pattr->mattr.attr.owner = mk->mod;
			pattr->mattr.attr.mode = kp->perm;
			*(gattr++) = &(pattr++)->mattr.attr;
		}
	}
	*gattr = NULL;	/* attribute list is NULL-terminated */

	if ((err = sysfs_create_group(&mk->kobj, &mp->grp))) {
		kfree(mp);
		return ERR_PTR(err);
	}
	return mp;
}
|
||||
|
||||
#ifdef CONFIG_MODULES
|
||||
/*
 * module_param_sysfs_setup - setup sysfs support for one module
 * @mod: module
 * @kparam: module parameters (array)
 * @num_params: number of module parameters
 *
 * Adds sysfs entries for module parameters, and creates a link from
 * /sys/module/[mod->name]/parameters to /sys/parameters/[mod->name]/
 *
 * Returns 0 on success (including the no-exported-params case, where
 * param_sysfs_setup returns NULL), or a negative errno.
 */
int module_param_sysfs_setup(struct module *mod,
			     struct kernel_param *kparam,
			     unsigned int num_params)
{
	struct module_param_attrs *mp;

	mp = param_sysfs_setup(&mod->mkobj, kparam, num_params, 0);
	if (IS_ERR(mp))
		return PTR_ERR(mp);

	mod->param_attrs = mp;
	return 0;
}
|
||||
|
||||
/*
 * module_param_sysfs_remove - remove sysfs support for one module
 * @mod: module
 *
 * Remove sysfs entries for module parameters and the corresponding
 * kobject.  Safe to call when the module exported no parameters
 * (param_attrs is NULL).
 */
void module_param_sysfs_remove(struct module *mod)
{
	if (mod->param_attrs) {
		sysfs_remove_group(&mod->mkobj.kobj,
				   &mod->param_attrs->grp);
		/* We are positive that no one is using any param
		 * attrs at this point.  Deallocate immediately. */
		kfree(mod->param_attrs);
		mod->param_attrs = NULL;
	}
}
|
||||
#endif
|
||||
|
||||
/*
 * kernel_param_sysfs_setup - wrapper for built-in params support
 *
 * Creates a module kobject named @name under the module subsystem and
 * attaches the parameter group for a built-in "module".  Boot-time
 * only; allocation/registration failure is fatal (BUG_ON).
 */
static void __init kernel_param_sysfs_setup(const char *name,
					    struct kernel_param *kparam,
					    unsigned int num_params,
					    unsigned int name_skip)
{
	struct module_kobject *mk;
	int ret;

	mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL);
	BUG_ON(!mk);

	mk->mod = THIS_MODULE;
	kobj_set_kset_s(mk, module_subsys);
	kobject_set_name(&mk->kobj, name);
	kobject_init(&mk->kobj);
	ret = kobject_add(&mk->kobj);
	BUG_ON(ret < 0);
	/* NOTE(review): return value (possibly ERR_PTR) is ignored here,
	 * unlike the module path -- confirm this is intentional. */
	param_sysfs_setup(mk, kparam, num_params, name_skip);
	kobject_uevent(&mk->kobj, KOBJ_ADD);
}
|
||||
|
||||
/*
 * param_sysfs_builtin - add contents in /sys/parameters for built-in modules
 *
 * Add module_parameters to sysfs for "modules" built into the kernel.
 *
 * The "module" name (KBUILD_MODNAME) is stored before a dot, the
 * "parameter" name is stored behind a dot in kernel_param->name. So,
 * extract the "module" name for all built-in kernel_param-eters,
 * and for all who have the same, call kernel_param_sysfs_setup.
 *
 * Relies on the __param section being sorted so that parameters of the
 * same built-in module are contiguous (kp_begin..kp_begin+count).
 */
static void __init param_sysfs_builtin(void)
{
	struct kernel_param *kp, *kp_begin = NULL;
	unsigned int i, name_len, count = 0;
	char modname[MAX_KBUILD_MODNAME + 1] = "";

	for (i=0; i < __stop___param - __start___param; i++) {
		char *dot;

		kp = &__start___param[i];

		/* We do not handle args without periods. */
		dot = memchr(kp->name, '.', MAX_KBUILD_MODNAME);
		if (!dot) {
			DEBUGP("couldn't find period in %s\n", kp->name);
			continue;
		}
		name_len = dot - kp->name;

		/* new kbuild_modname? */
		if (strlen(modname) != name_len
		    || strncmp(modname, kp->name, name_len) != 0) {
			/* add a new kobject for previous kernel_params. */
			if (count)
				kernel_param_sysfs_setup(modname,
							 kp_begin,
							 count,
							 strlen(modname)+1);

			strncpy(modname, kp->name, name_len);
			modname[name_len] = '\0';
			count = 0;
			kp_begin = kp;
		}
		count++;
	}

	/* last kernel_params need to be registered as well */
	if (count)
		kernel_param_sysfs_setup(modname, kp_begin, count,
					 strlen(modname)+1);
}
|
||||
|
||||
|
||||
/* module-related sysfs stuff */
|
||||
|
||||
/* No trailing semicolons: these must expand to plain expressions so
 * they can be used anywhere an expression is valid. */
#define to_module_attr(n) container_of(n, struct module_attribute, attr)
#define to_module_kobject(n) container_of(n, struct module_kobject, kobj)
|
||||
|
||||
static ssize_t module_attr_show(struct kobject *kobj,
|
||||
struct attribute *attr,
|
||||
char *buf)
|
||||
{
|
||||
struct module_attribute *attribute;
|
||||
struct module_kobject *mk;
|
||||
int ret;
|
||||
|
||||
attribute = to_module_attr(attr);
|
||||
mk = to_module_kobject(kobj);
|
||||
|
||||
if (!attribute->show)
|
||||
return -EIO;
|
||||
|
||||
ret = attribute->show(attribute, mk->mod, buf);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* Generic sysfs store for module attributes: dispatch to the
 * module_attribute's own store handler, or -EIO if it has none. */
static ssize_t module_attr_store(struct kobject *kobj,
				 struct attribute *attr,
				 const char *buf, size_t len)
{
	struct module_attribute *attribute;
	struct module_kobject *mk;

	attribute = to_module_attr(attr);
	mk = to_module_kobject(kobj);

	if (!attribute->store)
		return -EIO;

	return attribute->store(attribute, mk->mod, buf, len);
}
|
||||
|
||||
static struct sysfs_ops module_sysfs_ops = {
|
||||
.show = module_attr_show,
|
||||
.store = module_attr_store,
|
||||
};
|
||||
|
||||
static struct kobj_type module_ktype;
|
||||
|
||||
static int uevent_filter(struct kset *kset, struct kobject *kobj)
|
||||
{
|
||||
struct kobj_type *ktype = get_ktype(kobj);
|
||||
|
||||
if (ktype == &module_ktype)
|
||||
return 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static struct kset_uevent_ops module_uevent_ops = {
|
||||
.filter = uevent_filter,
|
||||
};
|
||||
|
||||
decl_subsys(module, &module_ktype, &module_uevent_ops);
|
||||
|
||||
static struct kobj_type module_ktype = {
|
||||
.sysfs_ops = &module_sysfs_ops,
|
||||
};
|
||||
|
||||
/*
 * param_sysfs_init - wrapper for built-in params support
 *
 * Registers the "module" subsystem in sysfs, then populates it with
 * the parameters of built-in modules.  Runs as a subsys initcall so it
 * precedes device/module initcalls that may look at /sys/module.
 */
static int __init param_sysfs_init(void)
{
	int ret;

	ret = subsystem_register(&module_subsys);
	if (ret < 0) {
		printk(KERN_WARNING "%s (%d): subsystem_register error: %d\n",
			__FILE__, __LINE__, ret);
		return ret;
	}

	param_sysfs_builtin();

	return 0;
}
subsys_initcall(param_sysfs_init);
|
||||
|
||||
#else
|
||||
#if 0
|
||||
static struct sysfs_ops module_sysfs_ops = {
|
||||
.show = NULL,
|
||||
.store = NULL,
|
||||
};
|
||||
#endif
|
||||
#endif
|
||||
|
||||
EXPORT_SYMBOL(param_set_byte);
|
||||
EXPORT_SYMBOL(param_get_byte);
|
||||
EXPORT_SYMBOL(param_set_short);
|
||||
EXPORT_SYMBOL(param_get_short);
|
||||
EXPORT_SYMBOL(param_set_ushort);
|
||||
EXPORT_SYMBOL(param_get_ushort);
|
||||
EXPORT_SYMBOL(param_set_int);
|
||||
EXPORT_SYMBOL(param_get_int);
|
||||
EXPORT_SYMBOL(param_set_uint);
|
||||
EXPORT_SYMBOL(param_get_uint);
|
||||
EXPORT_SYMBOL(param_set_long);
|
||||
EXPORT_SYMBOL(param_get_long);
|
||||
EXPORT_SYMBOL(param_set_ulong);
|
||||
EXPORT_SYMBOL(param_get_ulong);
|
||||
EXPORT_SYMBOL(param_set_charp);
|
||||
EXPORT_SYMBOL(param_get_charp);
|
||||
EXPORT_SYMBOL(param_set_bool);
|
||||
EXPORT_SYMBOL(param_get_bool);
|
||||
EXPORT_SYMBOL(param_set_invbool);
|
||||
EXPORT_SYMBOL(param_get_invbool);
|
||||
EXPORT_SYMBOL(param_array_set);
|
||||
EXPORT_SYMBOL(param_array_get);
|
||||
EXPORT_SYMBOL(param_set_copystring);
|
||||
EXPORT_SYMBOL(param_get_string);
|
||||
418
kernel/pid.c
Normal file
418
kernel/pid.c
Normal file
@@ -0,0 +1,418 @@
|
||||
/*
|
||||
* Generic pidhash and scalable, time-bounded PID allocator
|
||||
*
|
||||
* (C) 2002-2003 William Irwin, IBM
|
||||
* (C) 2004 William Irwin, Oracle
|
||||
* (C) 2002-2004 Ingo Molnar, Red Hat
|
||||
*
|
||||
* pid-structures are backing objects for tasks sharing a given ID to chain
|
||||
* against. There is very little to them aside from hashing them and
|
||||
* parking tasks using given ID's on a list.
|
||||
*
|
||||
* The hash is always changed with the tasklist_lock write-acquired,
|
||||
* and the hash is only accessed with the tasklist_lock at least
|
||||
* read-acquired, so there's no additional SMP locking needed here.
|
||||
*
|
||||
* We have a list of bitmap pages, which bitmaps represent the PID space.
|
||||
* Allocating and freeing PIDs is completely lockless. The worst-case
|
||||
* allocation scenario when all but one out of 1 million PIDs possible are
|
||||
* allocated already: the scanning of 32 list entries and at most PAGE_SIZE
|
||||
* bytes. The typical fastpath is a single successful setbit. Freeing is O(1).
|
||||
*/
|
||||
|
||||
#include <linux/mm.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/bootmem.h>
|
||||
#include <linux/hash.h>
|
||||
#include <linux/pid_namespace.h>
|
||||
|
||||
#define pid_hashfn(nr) hash_long((unsigned long)nr, pidhash_shift)
|
||||
static struct hlist_head *pid_hash;
|
||||
static int pidhash_shift;
|
||||
static struct kmem_cache *pid_cachep;
|
||||
|
||||
int pid_max = PID_MAX_DEFAULT;
|
||||
|
||||
#define RESERVED_PIDS 300
|
||||
|
||||
int pid_max_min = RESERVED_PIDS + 1;
|
||||
int pid_max_max = PID_MAX_LIMIT;
|
||||
|
||||
#define BITS_PER_PAGE (PAGE_SIZE*8)
|
||||
#define BITS_PER_PAGE_MASK (BITS_PER_PAGE-1)
|
||||
|
||||
/* Convert a (bitmap page, bit offset) pair back into a PID number:
 * each pidmap entry covers BITS_PER_PAGE consecutive PIDs. */
static inline int mk_pid(struct pid_namespace *pid_ns,
		struct pidmap *map, int off)
{
	return (map - pid_ns->pidmap)*BITS_PER_PAGE + off;
}
|
||||
|
||||
#define find_next_offset(map, off) \
|
||||
find_next_zero_bit((map)->page, BITS_PER_PAGE, off)
|
||||
|
||||
/*
 * PID-map pages start out as NULL, they get allocated upon
 * first use and are never deallocated. This way a low pid_max
 * value does not cause lots of bitmaps to be allocated, but
 * the scheme scales to up to 4 million PIDs, runtime.
 */
struct pid_namespace init_pid_ns = {
	.kref = {
		/* refcount 2: the namespace itself plus init's reference */
		.refcount       = ATOMIC_INIT(2),
	},
	.pidmap = {
		/* every page starts fully free and unallocated */
		[ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL }
	},
	.last_pid = 0,
	.child_reaper = &init_task
};
|
||||
|
||||
/*
|
||||
* Note: disable interrupts while the pidmap_lock is held as an
|
||||
* interrupt might come in and do read_lock(&tasklist_lock).
|
||||
*
|
||||
* If we don't disable interrupts there is a nasty deadlock between
|
||||
* detach_pid()->free_pid() and another cpu that does
|
||||
* spin_lock(&pidmap_lock) followed by an interrupt routine that does
|
||||
* read_lock(&tasklist_lock);
|
||||
*
|
||||
* After we clean up the tasklist_lock and know there are no
|
||||
* irq handlers that take it we can leave the interrupts enabled.
|
||||
* For now it is easier to be safe than to prove it can't happen.
|
||||
*/
|
||||
|
||||
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
|
||||
|
||||
/* Return @pid to the bitmap: clear its bit and bump the page's free
 * count.  Lockless -- relies on atomic bitop + atomic counter. */
static fastcall void free_pidmap(struct pid_namespace *pid_ns, int pid)
{
	struct pidmap *map = pid_ns->pidmap + pid / BITS_PER_PAGE;
	int offset = pid & BITS_PER_PAGE_MASK;

	clear_bit(offset, map->page);
	atomic_inc(&map->nr_free);
}
|
||||
|
||||
/* Allocate a free PID from the namespace's bitmap, scanning circularly
 * from last_pid+1 and wrapping to RESERVED_PIDS (PIDs below that are
 * kept for system daemons).  Returns the PID, or -1 if none free.
 * Bitmap pages are allocated lazily on first use. */
static int alloc_pidmap(struct pid_namespace *pid_ns)
{
	int i, offset, max_scan, pid, last = pid_ns->last_pid;
	struct pidmap *map;

	pid = last + 1;
	if (pid >= pid_max)
		pid = RESERVED_PIDS;
	offset = pid & BITS_PER_PAGE_MASK;
	map = &pid_ns->pidmap[pid/BITS_PER_PAGE];
	/* Number of bitmap pages to visit; the first page is only a
	 * partial scan when we start mid-page (offset != 0). */
	max_scan = (pid_max + BITS_PER_PAGE - 1)/BITS_PER_PAGE - !offset;
	for (i = 0; i <= max_scan; ++i) {
		if (unlikely(!map->page)) {
			void *page = kzalloc(PAGE_SIZE, GFP_KERNEL);
			/*
			 * Free the page if someone raced with us
			 * installing it:
			 */
			spin_lock_irq(&pidmap_lock);
			if (map->page)
				kfree(page);
			else
				map->page = page;
			spin_unlock_irq(&pidmap_lock);
			if (unlikely(!map->page))
				break;
		}
		if (likely(atomic_read(&map->nr_free))) {
			do {
				if (!test_and_set_bit(offset, map->page)) {
					atomic_dec(&map->nr_free);
					pid_ns->last_pid = pid;
					return pid;
				}
				offset = find_next_offset(map, offset);
				pid = mk_pid(pid_ns, map, offset);
			/*
			 * find_next_offset() found a bit, the pid from it
			 * is in-bounds, and if we fell back to the last
			 * bitmap block and the final block was the same
			 * as the starting point, pid is before last_pid.
			 */
			} while (offset < BITS_PER_PAGE && pid < pid_max &&
					(i != max_scan || pid < last ||
					    !((last+1) & BITS_PER_PAGE_MASK)));
		}
		/* Advance to the next bitmap page, wrapping back to the
		 * start (skipping reserved PIDs) at the end. */
		if (map < &pid_ns->pidmap[(pid_max-1)/BITS_PER_PAGE]) {
			++map;
			offset = 0;
		} else {
			map = &pid_ns->pidmap[0];
			offset = RESERVED_PIDS;
			if (unlikely(last == offset))
				break;	/* wrapped all the way around */
		}
		pid = mk_pid(pid_ns, map, offset);
	}
	return -1;
}
|
||||
|
||||
/* Find the smallest allocated PID strictly greater than @last, or -1
 * if there is none.  Skips bitmap pages never allocated. */
static int next_pidmap(struct pid_namespace *pid_ns, int last)
{
	int offset;
	struct pidmap *map, *end;

	offset = (last + 1) & BITS_PER_PAGE_MASK;
	map = &pid_ns->pidmap[(last + 1)/BITS_PER_PAGE];
	end = &pid_ns->pidmap[PIDMAP_ENTRIES];
	for (; map < end; map++, offset = 0) {
		if (unlikely(!map->page))
			continue;
		offset = find_next_bit((map)->page, BITS_PER_PAGE, offset);
		if (offset < BITS_PER_PAGE)
			return mk_pid(pid_ns, map, offset);
	}
	return -1;
}
|
||||
|
||||
/* Drop a reference on @pid; frees it when the last reference goes.
 * The count==1 fast path skips the atomic RMW when we hold the only
 * reference -- NOTE(review): safe only if no concurrent get is
 * possible at count 1, which the pid lifetime rules should guarantee. */
fastcall void put_pid(struct pid *pid)
{
	if (!pid)
		return;
	if ((atomic_read(&pid->count) == 1) ||
	     atomic_dec_and_test(&pid->count))
		kmem_cache_free(pid_cachep, pid);
}
EXPORT_SYMBOL_GPL(put_pid);
|
||||
|
||||
/* RCU callback: drop the pid reference after a grace period, so RCU
 * readers traversing the hash can no longer see it (see free_pid()). */
static void delayed_put_pid(struct rcu_head *rhp)
{
	struct pid *pid = container_of(rhp, struct pid, rcu);
	put_pid(pid);
}
|
||||
|
||||
/* Unhash @pid, return its number to the bitmap, and release the
 * reference after an RCU grace period (readers may still hold it). */
fastcall void free_pid(struct pid *pid)
{
	/* We can be called with write_lock_irq(&tasklist_lock) held */
	unsigned long flags;

	spin_lock_irqsave(&pidmap_lock, flags);
	hlist_del_rcu(&pid->pid_chain);
	spin_unlock_irqrestore(&pidmap_lock, flags);

	free_pidmap(&init_pid_ns, pid->nr);
	call_rcu(&pid->rcu, delayed_put_pid);
}
|
||||
|
||||
/* Allocate a struct pid with a fresh PID number from the current
 * task's pid namespace and hash it.  Returns NULL on allocation or
 * pidmap exhaustion.  Caller receives the initial reference. */
struct pid *alloc_pid(void)
{
	struct pid *pid;
	enum pid_type type;
	int nr = -1;

	pid = kmem_cache_alloc(pid_cachep, GFP_KERNEL);
	if (!pid)
		goto out;

	nr = alloc_pidmap(current->nsproxy->pid_ns);
	if (nr < 0)
		goto out_free;

	atomic_set(&pid->count, 1);
	pid->nr = nr;
	/* Empty task lists for each pid type (PID, PGID, SID). */
	for (type = 0; type < PIDTYPE_MAX; ++type)
		INIT_HLIST_HEAD(&pid->tasks[type]);

	/* Publish in the hash; lookups are RCU, updates take the lock. */
	spin_lock_irq(&pidmap_lock);
	hlist_add_head_rcu(&pid->pid_chain, &pid_hash[pid_hashfn(pid->nr)]);
	spin_unlock_irq(&pidmap_lock);

out:
	return pid;

out_free:
	kmem_cache_free(pid_cachep, pid);
	pid = NULL;
	goto out;
}
|
||||
|
||||
struct pid * fastcall find_pid(int nr)
|
||||
{
|
||||
struct hlist_node *elem;
|
||||
struct pid *pid;
|
||||
|
||||
hlist_for_each_entry_rcu(pid, elem,
|
||||
&pid_hash[pid_hashfn(nr)], pid_chain) {
|
||||
if (pid->nr == nr)
|
||||
return pid;
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(find_pid);
|
||||
|
||||
int fastcall attach_pid(struct task_struct *task, enum pid_type type, int nr)
|
||||
{
|
||||
struct pid_link *link;
|
||||
struct pid *pid;
|
||||
|
||||
link = &task->pids[type];
|
||||
link->pid = pid = find_pid(nr);
|
||||
hlist_add_head_rcu(&link->node, &pid->tasks[type]);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
void fastcall detach_pid(struct task_struct *task, enum pid_type type)
|
||||
{
|
||||
struct pid_link *link;
|
||||
struct pid *pid;
|
||||
int tmp;
|
||||
|
||||
link = &task->pids[type];
|
||||
pid = link->pid;
|
||||
|
||||
hlist_del_rcu(&link->node);
|
||||
link->pid = NULL;
|
||||
|
||||
for (tmp = PIDTYPE_MAX; --tmp >= 0; )
|
||||
if (!hlist_empty(&pid->tasks[tmp]))
|
||||
return;
|
||||
|
||||
free_pid(pid);
|
||||
}
|
||||
|
||||
/* transfer_pid is an optimization of attach_pid(new), detach_pid(old) */
|
||||
void fastcall transfer_pid(struct task_struct *old, struct task_struct *new,
|
||||
enum pid_type type)
|
||||
{
|
||||
new->pids[type].pid = old->pids[type].pid;
|
||||
hlist_replace_rcu(&old->pids[type].node, &new->pids[type].node);
|
||||
old->pids[type].pid = NULL;
|
||||
}
|
||||
|
||||
struct task_struct * fastcall pid_task(struct pid *pid, enum pid_type type)
|
||||
{
|
||||
struct task_struct *result = NULL;
|
||||
if (pid) {
|
||||
struct hlist_node *first;
|
||||
first = rcu_dereference(pid->tasks[type].first);
|
||||
if (first)
|
||||
result = hlist_entry(first, struct task_struct, pids[(type)].node);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/*
|
||||
* Must be called under rcu_read_lock() or with tasklist_lock read-held.
|
||||
*/
|
||||
struct task_struct *find_task_by_pid_type(int type, int nr)
|
||||
{
|
||||
return pid_task(find_pid(nr), type);
|
||||
}
|
||||
|
||||
EXPORT_SYMBOL(find_task_by_pid_type);
|
||||
|
||||
struct pid *get_task_pid(struct task_struct *task, enum pid_type type)
|
||||
{
|
||||
struct pid *pid;
|
||||
rcu_read_lock();
|
||||
pid = get_pid(task->pids[type].pid);
|
||||
rcu_read_unlock();
|
||||
return pid;
|
||||
}
|
||||
|
||||
struct task_struct *fastcall get_pid_task(struct pid *pid, enum pid_type type)
|
||||
{
|
||||
struct task_struct *result;
|
||||
rcu_read_lock();
|
||||
result = pid_task(pid, type);
|
||||
if (result)
|
||||
get_task_struct(result);
|
||||
rcu_read_unlock();
|
||||
return result;
|
||||
}
|
||||
|
||||
struct pid *find_get_pid(pid_t nr)
|
||||
{
|
||||
struct pid *pid;
|
||||
|
||||
rcu_read_lock();
|
||||
pid = get_pid(find_pid(nr));
|
||||
rcu_read_unlock();
|
||||
|
||||
return pid;
|
||||
}
|
||||
|
||||
/*
|
||||
* Used by proc to find the first pid that is greater then or equal to nr.
|
||||
*
|
||||
* If there is a pid at nr this function is exactly the same as find_pid.
|
||||
*/
|
||||
struct pid *find_ge_pid(int nr)
|
||||
{
|
||||
struct pid *pid;
|
||||
|
||||
do {
|
||||
pid = find_pid(nr);
|
||||
if (pid)
|
||||
break;
|
||||
nr = next_pidmap(current->nsproxy->pid_ns, nr);
|
||||
} while (nr > 0);
|
||||
|
||||
return pid;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(find_get_pid);
|
||||
|
||||
int copy_pid_ns(int flags, struct task_struct *tsk)
|
||||
{
|
||||
struct pid_namespace *old_ns = tsk->nsproxy->pid_ns;
|
||||
int err = 0;
|
||||
|
||||
if (!old_ns)
|
||||
return 0;
|
||||
|
||||
get_pid_ns(old_ns);
|
||||
return err;
|
||||
}
|
||||
|
||||
void free_pid_ns(struct kref *kref)
|
||||
{
|
||||
struct pid_namespace *ns;
|
||||
|
||||
ns = container_of(kref, struct pid_namespace, kref);
|
||||
kfree(ns);
|
||||
}
|
||||
|
||||
/*
|
||||
* The pid hash table is scaled according to the amount of memory in the
|
||||
* machine. From a minimum of 16 slots up to 4096 slots at one gigabyte or
|
||||
* more.
|
||||
*/
|
||||
void __init pidhash_init(void)
|
||||
{
|
||||
int i, pidhash_size;
|
||||
unsigned long megabytes = nr_kernel_pages >> (20 - PAGE_SHIFT);
|
||||
|
||||
pidhash_shift = max(4, fls(megabytes * 4));
|
||||
pidhash_shift = min(12, pidhash_shift);
|
||||
pidhash_size = 1 << pidhash_shift;
|
||||
|
||||
printk("PID hash table entries: %d (order: %d, %Zd bytes)\n",
|
||||
pidhash_size, pidhash_shift,
|
||||
pidhash_size * sizeof(struct hlist_head));
|
||||
|
||||
pid_hash = alloc_bootmem(pidhash_size * sizeof(*(pid_hash)));
|
||||
if (!pid_hash)
|
||||
panic("Could not alloc pidhash!\n");
|
||||
for (i = 0; i < pidhash_size; i++)
|
||||
INIT_HLIST_HEAD(&pid_hash[i]);
|
||||
}
|
||||
|
||||
void __init pidmap_init(void)
|
||||
{
|
||||
init_pid_ns.pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
|
||||
/* Reserve PID 0. We never call free_pidmap(0) */
|
||||
set_bit(0, init_pid_ns.pidmap[0].page);
|
||||
atomic_dec(&init_pid_ns.pidmap[0].nr_free);
|
||||
|
||||
pid_cachep = kmem_cache_create("pid", sizeof(struct pid),
|
||||
__alignof__(struct pid),
|
||||
SLAB_PANIC, NULL, NULL);
|
||||
}
|
||||
1629
kernel/posix-cpu-timers.c
Normal file
1629
kernel/posix-cpu-timers.c
Normal file
File diff suppressed because it is too large
Load Diff
996
kernel/posix-timers.c
Normal file
996
kernel/posix-timers.c
Normal file
@@ -0,0 +1,996 @@
|
||||
/*
|
||||
* linux/kernel/posix-timers.c
|
||||
*
|
||||
*
|
||||
* 2002-10-15 Posix Clocks & timers
|
||||
* by George Anzinger george@mvista.com
|
||||
*
|
||||
* Copyright (C) 2002 2003 by MontaVista Software.
|
||||
*
|
||||
* 2004-06-01 Fix CLOCK_REALTIME clock/timer TIMER_ABSTIME bug.
|
||||
* Copyright (C) 2004 Boris Hu
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or (at
|
||||
* your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful, but
|
||||
* WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
||||
*
|
||||
* MontaVista Software | 1237 East Arques Avenue | Sunnyvale | CA 94085 | USA
|
||||
*/
|
||||
|
||||
/* These are all the functions necessary to implement
|
||||
* POSIX clocks & timers
|
||||
*/
|
||||
#include <linux/mm.h>
|
||||
#include <linux/smp_lock.h>
|
||||
#include <linux/interrupt.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/time.h>
|
||||
#include <linux/mutex.h>
|
||||
|
||||
#include <asm/uaccess.h>
|
||||
#include <asm/semaphore.h>
|
||||
#include <linux/list.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/compiler.h>
|
||||
#include <linux/idr.h>
|
||||
#include <linux/posix-timers.h>
|
||||
#include <linux/syscalls.h>
|
||||
#include <linux/wait.h>
|
||||
#include <linux/workqueue.h>
|
||||
#include <linux/module.h>
|
||||
|
||||
/*
|
||||
* Management arrays for POSIX timers. Timers are kept in slab memory
|
||||
* Timer ids are allocated by an external routine that keeps track of the
|
||||
* id and the timer. The external interface is:
|
||||
*
|
||||
* void *idr_find(struct idr *idp, int id); to find timer_id <id>
|
||||
* int idr_get_new(struct idr *idp, void *ptr); to get a new id and
|
||||
* related it to <ptr>
|
||||
* void idr_remove(struct idr *idp, int id); to release <id>
|
||||
* void idr_init(struct idr *idp); to initialize <idp>
|
||||
* which we supply.
|
||||
* The idr_get_new *may* call slab for more memory so it must not be
|
||||
* called under a spin lock. Likewise idr_remore may release memory
|
||||
* (but it may be ok to do this under a lock...).
|
||||
* idr_find is just a memory look up and is quite fast. A -1 return
|
||||
* indicates that the requested id does not exist.
|
||||
*/
|
||||
|
||||
/*
|
||||
* Lets keep our timers in a slab cache :-)
|
||||
*/
|
||||
static struct kmem_cache *posix_timers_cache;
|
||||
static struct idr posix_timers_id;
|
||||
static DEFINE_SPINLOCK(idr_lock);
|
||||
|
||||
/*
|
||||
* we assume that the new SIGEV_THREAD_ID shares no bits with the other
|
||||
* SIGEV values. Here we put out an error if this assumption fails.
|
||||
*/
|
||||
#if SIGEV_THREAD_ID != (SIGEV_THREAD_ID & \
|
||||
~(SIGEV_SIGNAL | SIGEV_NONE | SIGEV_THREAD))
|
||||
#error "SIGEV_THREAD_ID must not share bit with other SIGEV values!"
|
||||
#endif
|
||||
|
||||
|
||||
/*
|
||||
* The timer ID is turned into a timer address by idr_find().
|
||||
* Verifying a valid ID consists of:
|
||||
*
|
||||
* a) checking that idr_find() returns other than -1.
|
||||
* b) checking that the timer id matches the one in the timer itself.
|
||||
* c) that the timer owner is in the callers thread group.
|
||||
*/
|
||||
|
||||
/*
|
||||
* CLOCKs: The POSIX standard calls for a couple of clocks and allows us
|
||||
* to implement others. This structure defines the various
|
||||
* clocks and allows the possibility of adding others. We
|
||||
* provide an interface to add clocks to the table and expect
|
||||
* the "arch" code to add at least one clock that is high
|
||||
* resolution. Here we define the standard CLOCK_REALTIME as a
|
||||
* 1/HZ resolution clock.
|
||||
*
|
||||
* RESOLUTION: Clock resolution is used to round up timer and interval
|
||||
* times, NOT to report clock times, which are reported with as
|
||||
* much resolution as the system can muster. In some cases this
|
||||
* resolution may depend on the underlying clock hardware and
|
||||
* may not be quantifiable until run time, and only then is the
|
||||
* necessary code is written. The standard says we should say
|
||||
* something about this issue in the documentation...
|
||||
*
|
||||
* FUNCTIONS: The CLOCKs structure defines possible functions to handle
|
||||
* various clock functions. For clocks that use the standard
|
||||
* system timer code these entries should be NULL. This will
|
||||
* allow dispatch without the overhead of indirect function
|
||||
* calls. CLOCKS that depend on other sources (e.g. WWV or GPS)
|
||||
* must supply functions here, even if the function just returns
|
||||
* ENOSYS. The standard POSIX timer management code assumes the
|
||||
* following: 1.) The k_itimer struct (sched.h) is used for the
|
||||
* timer. 2.) The list, it_lock, it_clock, it_id and it_process
|
||||
* fields are not modified by timer code.
|
||||
*
|
||||
* At this time all functions EXCEPT clock_nanosleep can be
|
||||
* redirected by the CLOCKS structure. Clock_nanosleep is in
|
||||
* there, but the code ignores it.
|
||||
*
|
||||
* Permissions: It is assumed that the clock_settime() function defined
|
||||
* for each clock will take care of permission checks. Some
|
||||
* clocks may be set able by any user (i.e. local process
|
||||
* clocks) others not. Currently the only set able clock we
|
||||
* have is CLOCK_REALTIME and its high res counter part, both of
|
||||
* which we beg off on and pass to do_sys_settimeofday().
|
||||
*/
|
||||
|
||||
static struct k_clock posix_clocks[MAX_CLOCKS];
|
||||
|
||||
/*
|
||||
* These ones are defined below.
|
||||
*/
|
||||
static int common_nsleep(const clockid_t, int flags, struct timespec *t,
|
||||
struct timespec __user *rmtp);
|
||||
static void common_timer_get(struct k_itimer *, struct itimerspec *);
|
||||
static int common_timer_set(struct k_itimer *, int,
|
||||
struct itimerspec *, struct itimerspec *);
|
||||
static int common_timer_del(struct k_itimer *timer);
|
||||
|
||||
static enum hrtimer_restart posix_timer_fn(struct hrtimer *data);
|
||||
|
||||
static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags);
|
||||
|
||||
static inline void unlock_timer(struct k_itimer *timr, unsigned long flags)
|
||||
{
|
||||
spin_unlock_irqrestore(&timr->it_lock, flags);
|
||||
}
|
||||
|
||||
/*
|
||||
* Call the k_clock hook function if non-null, or the default function.
|
||||
*/
|
||||
#define CLOCK_DISPATCH(clock, call, arglist) \
|
||||
((clock) < 0 ? posix_cpu_##call arglist : \
|
||||
(posix_clocks[clock].call != NULL \
|
||||
? (*posix_clocks[clock].call) arglist : common_##call arglist))
|
||||
|
||||
/*
|
||||
* Default clock hook functions when the struct k_clock passed
|
||||
* to register_posix_clock leaves a function pointer null.
|
||||
*
|
||||
* The function common_CALL is the default implementation for
|
||||
* the function pointer CALL in struct k_clock.
|
||||
*/
|
||||
|
||||
static inline int common_clock_getres(const clockid_t which_clock,
|
||||
struct timespec *tp)
|
||||
{
|
||||
tp->tv_sec = 0;
|
||||
tp->tv_nsec = posix_clocks[which_clock].res;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Get real time for posix timers
|
||||
*/
|
||||
static int common_clock_get(clockid_t which_clock, struct timespec *tp)
|
||||
{
|
||||
ktime_get_real_ts(tp);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline int common_clock_set(const clockid_t which_clock,
|
||||
struct timespec *tp)
|
||||
{
|
||||
return do_sys_settimeofday(tp, NULL);
|
||||
}
|
||||
|
||||
static int common_timer_create(struct k_itimer *new_timer)
|
||||
{
|
||||
hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock, 0);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Return nonzero if we know a priori this clockid_t value is bogus.
|
||||
*/
|
||||
static inline int invalid_clockid(const clockid_t which_clock)
|
||||
{
|
||||
if (which_clock < 0) /* CPU clock, posix_cpu_* will check it */
|
||||
return 0;
|
||||
if ((unsigned) which_clock >= MAX_CLOCKS)
|
||||
return 1;
|
||||
if (posix_clocks[which_clock].clock_getres != NULL)
|
||||
return 0;
|
||||
if (posix_clocks[which_clock].res != 0)
|
||||
return 0;
|
||||
return 1;
|
||||
}
|
||||
|
||||
/*
|
||||
* Get monotonic time for posix timers
|
||||
*/
|
||||
static int posix_ktime_get_ts(clockid_t which_clock, struct timespec *tp)
|
||||
{
|
||||
ktime_get_ts(tp);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Initialize everything, well, just everything in Posix clocks/timers ;)
|
||||
*/
|
||||
static __init int init_posix_timers(void)
|
||||
{
|
||||
struct k_clock clock_realtime = {
|
||||
.clock_getres = hrtimer_get_res,
|
||||
};
|
||||
struct k_clock clock_monotonic = {
|
||||
.clock_getres = hrtimer_get_res,
|
||||
.clock_get = posix_ktime_get_ts,
|
||||
.clock_set = do_posix_clock_nosettime,
|
||||
};
|
||||
|
||||
register_posix_clock(CLOCK_REALTIME, &clock_realtime);
|
||||
register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic);
|
||||
|
||||
posix_timers_cache = kmem_cache_create("posix_timers_cache",
|
||||
sizeof (struct k_itimer), 0, 0, NULL, NULL);
|
||||
idr_init(&posix_timers_id);
|
||||
return 0;
|
||||
}
|
||||
|
||||
__initcall(init_posix_timers);
|
||||
|
||||
static void schedule_next_timer(struct k_itimer *timr)
|
||||
{
|
||||
struct hrtimer *timer = &timr->it.real.timer;
|
||||
|
||||
if (timr->it.real.interval.tv64 == 0)
|
||||
return;
|
||||
|
||||
timr->it_overrun += hrtimer_forward(timer, timer->base->get_time(),
|
||||
timr->it.real.interval);
|
||||
|
||||
timr->it_overrun_last = timr->it_overrun;
|
||||
timr->it_overrun = -1;
|
||||
++timr->it_requeue_pending;
|
||||
hrtimer_restart(timer);
|
||||
}
|
||||
|
||||
/*
|
||||
* This function is exported for use by the signal deliver code. It is
|
||||
* called just prior to the info block being released and passes that
|
||||
* block to us. It's function is to update the overrun entry AND to
|
||||
* restart the timer. It should only be called if the timer is to be
|
||||
* restarted (i.e. we have flagged this in the sys_private entry of the
|
||||
* info block).
|
||||
*
|
||||
* To protect aginst the timer going away while the interrupt is queued,
|
||||
* we require that the it_requeue_pending flag be set.
|
||||
*/
|
||||
void do_schedule_next_timer(struct siginfo *info)
|
||||
{
|
||||
struct k_itimer *timr;
|
||||
unsigned long flags;
|
||||
|
||||
timr = lock_timer(info->si_tid, &flags);
|
||||
|
||||
if (timr && timr->it_requeue_pending == info->si_sys_private) {
|
||||
if (timr->it_clock < 0)
|
||||
posix_cpu_timer_schedule(timr);
|
||||
else
|
||||
schedule_next_timer(timr);
|
||||
|
||||
info->si_overrun = timr->it_overrun_last;
|
||||
}
|
||||
|
||||
if (timr)
|
||||
unlock_timer(timr, flags);
|
||||
}
|
||||
|
||||
int posix_timer_event(struct k_itimer *timr,int si_private)
|
||||
{
|
||||
memset(&timr->sigq->info, 0, sizeof(siginfo_t));
|
||||
timr->sigq->info.si_sys_private = si_private;
|
||||
/* Send signal to the process that owns this timer.*/
|
||||
|
||||
timr->sigq->info.si_signo = timr->it_sigev_signo;
|
||||
timr->sigq->info.si_errno = 0;
|
||||
timr->sigq->info.si_code = SI_TIMER;
|
||||
timr->sigq->info.si_tid = timr->it_id;
|
||||
timr->sigq->info.si_value = timr->it_sigev_value;
|
||||
|
||||
if (timr->it_sigev_notify & SIGEV_THREAD_ID) {
|
||||
struct task_struct *leader;
|
||||
int ret = send_sigqueue(timr->it_sigev_signo, timr->sigq,
|
||||
timr->it_process);
|
||||
|
||||
if (likely(ret >= 0))
|
||||
return ret;
|
||||
|
||||
timr->it_sigev_notify = SIGEV_SIGNAL;
|
||||
leader = timr->it_process->group_leader;
|
||||
put_task_struct(timr->it_process);
|
||||
timr->it_process = leader;
|
||||
}
|
||||
|
||||
return send_group_sigqueue(timr->it_sigev_signo, timr->sigq,
|
||||
timr->it_process);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(posix_timer_event);
|
||||
|
||||
/*
|
||||
* This function gets called when a POSIX.1b interval timer expires. It
|
||||
* is used as a callback from the kernel internal timer. The
|
||||
* run_timer_list code ALWAYS calls with interrupts on.
|
||||
|
||||
* This code is for CLOCK_REALTIME* and CLOCK_MONOTONIC* timers.
|
||||
*/
|
||||
static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer)
|
||||
{
|
||||
struct k_itimer *timr;
|
||||
unsigned long flags;
|
||||
int si_private = 0;
|
||||
enum hrtimer_restart ret = HRTIMER_NORESTART;
|
||||
|
||||
timr = container_of(timer, struct k_itimer, it.real.timer);
|
||||
spin_lock_irqsave(&timr->it_lock, flags);
|
||||
|
||||
if (timr->it.real.interval.tv64 != 0)
|
||||
si_private = ++timr->it_requeue_pending;
|
||||
|
||||
if (posix_timer_event(timr, si_private)) {
|
||||
/*
|
||||
* signal was not sent because of sig_ignor
|
||||
* we will not get a call back to restart it AND
|
||||
* it should be restarted.
|
||||
*/
|
||||
if (timr->it.real.interval.tv64 != 0) {
|
||||
timr->it_overrun +=
|
||||
hrtimer_forward(timer,
|
||||
hrtimer_cb_get_time(timer),
|
||||
timr->it.real.interval);
|
||||
ret = HRTIMER_RESTART;
|
||||
++timr->it_requeue_pending;
|
||||
}
|
||||
}
|
||||
|
||||
unlock_timer(timr, flags);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static struct task_struct * good_sigevent(sigevent_t * event)
|
||||
{
|
||||
struct task_struct *rtn = current->group_leader;
|
||||
|
||||
if ((event->sigev_notify & SIGEV_THREAD_ID ) &&
|
||||
(!(rtn = find_task_by_pid(event->sigev_notify_thread_id)) ||
|
||||
rtn->tgid != current->tgid ||
|
||||
(event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_SIGNAL))
|
||||
return NULL;
|
||||
|
||||
if (((event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) &&
|
||||
((event->sigev_signo <= 0) || (event->sigev_signo > SIGRTMAX)))
|
||||
return NULL;
|
||||
|
||||
return rtn;
|
||||
}
|
||||
|
||||
void register_posix_clock(const clockid_t clock_id, struct k_clock *new_clock)
|
||||
{
|
||||
if ((unsigned) clock_id >= MAX_CLOCKS) {
|
||||
printk("POSIX clock register failed for clock_id %d\n",
|
||||
clock_id);
|
||||
return;
|
||||
}
|
||||
|
||||
posix_clocks[clock_id] = *new_clock;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(register_posix_clock);
|
||||
|
||||
static struct k_itimer * alloc_posix_timer(void)
|
||||
{
|
||||
struct k_itimer *tmr;
|
||||
tmr = kmem_cache_zalloc(posix_timers_cache, GFP_KERNEL);
|
||||
if (!tmr)
|
||||
return tmr;
|
||||
if (unlikely(!(tmr->sigq = sigqueue_alloc()))) {
|
||||
kmem_cache_free(posix_timers_cache, tmr);
|
||||
tmr = NULL;
|
||||
}
|
||||
return tmr;
|
||||
}
|
||||
|
||||
#define IT_ID_SET 1
|
||||
#define IT_ID_NOT_SET 0
|
||||
static void release_posix_timer(struct k_itimer *tmr, int it_id_set)
|
||||
{
|
||||
if (it_id_set) {
|
||||
unsigned long flags;
|
||||
spin_lock_irqsave(&idr_lock, flags);
|
||||
idr_remove(&posix_timers_id, tmr->it_id);
|
||||
spin_unlock_irqrestore(&idr_lock, flags);
|
||||
}
|
||||
sigqueue_free(tmr->sigq);
|
||||
if (unlikely(tmr->it_process) &&
|
||||
tmr->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
|
||||
put_task_struct(tmr->it_process);
|
||||
kmem_cache_free(posix_timers_cache, tmr);
|
||||
}
|
||||
|
||||
/* Create a POSIX.1b interval timer. */
|
||||
|
||||
asmlinkage long
|
||||
sys_timer_create(const clockid_t which_clock,
|
||||
struct sigevent __user *timer_event_spec,
|
||||
timer_t __user * created_timer_id)
|
||||
{
|
||||
int error = 0;
|
||||
struct k_itimer *new_timer = NULL;
|
||||
int new_timer_id;
|
||||
struct task_struct *process = NULL;
|
||||
unsigned long flags;
|
||||
sigevent_t event;
|
||||
int it_id_set = IT_ID_NOT_SET;
|
||||
|
||||
if (invalid_clockid(which_clock))
|
||||
return -EINVAL;
|
||||
|
||||
new_timer = alloc_posix_timer();
|
||||
if (unlikely(!new_timer))
|
||||
return -EAGAIN;
|
||||
|
||||
spin_lock_init(&new_timer->it_lock);
|
||||
retry:
|
||||
if (unlikely(!idr_pre_get(&posix_timers_id, GFP_KERNEL))) {
|
||||
error = -EAGAIN;
|
||||
goto out;
|
||||
}
|
||||
spin_lock_irq(&idr_lock);
|
||||
error = idr_get_new(&posix_timers_id, (void *) new_timer,
|
||||
&new_timer_id);
|
||||
spin_unlock_irq(&idr_lock);
|
||||
if (error == -EAGAIN)
|
||||
goto retry;
|
||||
else if (error) {
|
||||
/*
|
||||
* Wierd looking, but we return EAGAIN if the IDR is
|
||||
* full (proper POSIX return value for this)
|
||||
*/
|
||||
error = -EAGAIN;
|
||||
goto out;
|
||||
}
|
||||
|
||||
it_id_set = IT_ID_SET;
|
||||
new_timer->it_id = (timer_t) new_timer_id;
|
||||
new_timer->it_clock = which_clock;
|
||||
new_timer->it_overrun = -1;
|
||||
error = CLOCK_DISPATCH(which_clock, timer_create, (new_timer));
|
||||
if (error)
|
||||
goto out;
|
||||
|
||||
/*
|
||||
* return the timer_id now. The next step is hard to
|
||||
* back out if there is an error.
|
||||
*/
|
||||
if (copy_to_user(created_timer_id,
|
||||
&new_timer_id, sizeof (new_timer_id))) {
|
||||
error = -EFAULT;
|
||||
goto out;
|
||||
}
|
||||
if (timer_event_spec) {
|
||||
if (copy_from_user(&event, timer_event_spec, sizeof (event))) {
|
||||
error = -EFAULT;
|
||||
goto out;
|
||||
}
|
||||
new_timer->it_sigev_notify = event.sigev_notify;
|
||||
new_timer->it_sigev_signo = event.sigev_signo;
|
||||
new_timer->it_sigev_value = event.sigev_value;
|
||||
|
||||
read_lock(&tasklist_lock);
|
||||
if ((process = good_sigevent(&event))) {
|
||||
/*
|
||||
* We may be setting up this process for another
|
||||
* thread. It may be exiting. To catch this
|
||||
* case the we check the PF_EXITING flag. If
|
||||
* the flag is not set, the siglock will catch
|
||||
* him before it is too late (in exit_itimers).
|
||||
*
|
||||
* The exec case is a bit more invloved but easy
|
||||
* to code. If the process is in our thread
|
||||
* group (and it must be or we would not allow
|
||||
* it here) and is doing an exec, it will cause
|
||||
* us to be killed. In this case it will wait
|
||||
* for us to die which means we can finish this
|
||||
* linkage with our last gasp. I.e. no code :)
|
||||
*/
|
||||
spin_lock_irqsave(&process->sighand->siglock, flags);
|
||||
if (!(process->flags & PF_EXITING)) {
|
||||
new_timer->it_process = process;
|
||||
list_add(&new_timer->list,
|
||||
&process->signal->posix_timers);
|
||||
spin_unlock_irqrestore(&process->sighand->siglock, flags);
|
||||
if (new_timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
|
||||
get_task_struct(process);
|
||||
} else {
|
||||
spin_unlock_irqrestore(&process->sighand->siglock, flags);
|
||||
process = NULL;
|
||||
}
|
||||
}
|
||||
read_unlock(&tasklist_lock);
|
||||
if (!process) {
|
||||
error = -EINVAL;
|
||||
goto out;
|
||||
}
|
||||
} else {
|
||||
new_timer->it_sigev_notify = SIGEV_SIGNAL;
|
||||
new_timer->it_sigev_signo = SIGALRM;
|
||||
new_timer->it_sigev_value.sival_int = new_timer->it_id;
|
||||
process = current->group_leader;
|
||||
spin_lock_irqsave(&process->sighand->siglock, flags);
|
||||
new_timer->it_process = process;
|
||||
list_add(&new_timer->list, &process->signal->posix_timers);
|
||||
spin_unlock_irqrestore(&process->sighand->siglock, flags);
|
||||
}
|
||||
|
||||
/*
|
||||
* In the case of the timer belonging to another task, after
|
||||
* the task is unlocked, the timer is owned by the other task
|
||||
* and may cease to exist at any time. Don't use or modify
|
||||
* new_timer after the unlock call.
|
||||
*/
|
||||
|
||||
out:
|
||||
if (error)
|
||||
release_posix_timer(new_timer, it_id_set);
|
||||
|
||||
return error;
|
||||
}
|
||||
|
||||
/*
|
||||
* Locking issues: We need to protect the result of the id look up until
|
||||
* we get the timer locked down so it is not deleted under us. The
|
||||
* removal is done under the idr spinlock so we use that here to bridge
|
||||
* the find to the timer lock. To avoid a dead lock, the timer id MUST
|
||||
* be release with out holding the timer lock.
|
||||
*/
|
||||
static struct k_itimer * lock_timer(timer_t timer_id, unsigned long *flags)
|
||||
{
|
||||
struct k_itimer *timr;
|
||||
/*
|
||||
* Watch out here. We do a irqsave on the idr_lock and pass the
|
||||
* flags part over to the timer lock. Must not let interrupts in
|
||||
* while we are moving the lock.
|
||||
*/
|
||||
|
||||
spin_lock_irqsave(&idr_lock, *flags);
|
||||
timr = (struct k_itimer *) idr_find(&posix_timers_id, (int) timer_id);
|
||||
if (timr) {
|
||||
spin_lock(&timr->it_lock);
|
||||
spin_unlock(&idr_lock);
|
||||
|
||||
if ((timr->it_id != timer_id) || !(timr->it_process) ||
|
||||
timr->it_process->tgid != current->tgid) {
|
||||
unlock_timer(timr, *flags);
|
||||
timr = NULL;
|
||||
}
|
||||
} else
|
||||
spin_unlock_irqrestore(&idr_lock, *flags);
|
||||
|
||||
return timr;
|
||||
}
|
||||
|
||||
/*
|
||||
* Get the time remaining on a POSIX.1b interval timer. This function
|
||||
* is ALWAYS called with spin_lock_irq on the timer, thus it must not
|
||||
* mess with irq.
|
||||
*
|
||||
* We have a couple of messes to clean up here. First there is the case
|
||||
* of a timer that has a requeue pending. These timers should appear to
|
||||
* be in the timer list with an expiry as if we were to requeue them
|
||||
* now.
|
||||
*
|
||||
* The second issue is the SIGEV_NONE timer which may be active but is
|
||||
* not really ever put in the timer list (to save system resources).
|
||||
* This timer may be expired, and if so, we will do it here. Otherwise
|
||||
* it is the same as a requeue pending timer WRT to what we should
|
||||
* report.
|
||||
*/
|
||||
static void
|
||||
common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting)
|
||||
{
|
||||
ktime_t now, remaining, iv;
|
||||
struct hrtimer *timer = &timr->it.real.timer;
|
||||
|
||||
memset(cur_setting, 0, sizeof(struct itimerspec));
|
||||
|
||||
iv = timr->it.real.interval;
|
||||
|
||||
/* interval timer ? */
|
||||
if (iv.tv64)
|
||||
cur_setting->it_interval = ktime_to_timespec(iv);
|
||||
else if (!hrtimer_active(timer) &&
|
||||
(timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE)
|
||||
return;
|
||||
|
||||
now = timer->base->get_time();
|
||||
|
||||
/*
|
||||
* When a requeue is pending or this is a SIGEV_NONE
|
||||
* timer move the expiry time forward by intervals, so
|
||||
* expiry is > now.
|
||||
*/
|
||||
if (iv.tv64 && (timr->it_requeue_pending & REQUEUE_PENDING ||
|
||||
(timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE))
|
||||
timr->it_overrun += hrtimer_forward(timer, now, iv);
|
||||
|
||||
remaining = ktime_sub(timer->expires, now);
|
||||
/* Return 0 only, when the timer is expired and not pending */
|
||||
if (remaining.tv64 <= 0) {
|
||||
/*
|
||||
* A single shot SIGEV_NONE timer must return 0, when
|
||||
* it is expired !
|
||||
*/
|
||||
if ((timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE)
|
||||
cur_setting->it_value.tv_nsec = 1;
|
||||
} else
|
||||
cur_setting->it_value = ktime_to_timespec(remaining);
|
||||
}
|
||||
|
||||
/* Get the time remaining on a POSIX.1b interval timer. */
|
||||
asmlinkage long
|
||||
sys_timer_gettime(timer_t timer_id, struct itimerspec __user *setting)
|
||||
{
|
||||
struct k_itimer *timr;
|
||||
struct itimerspec cur_setting;
|
||||
unsigned long flags;
|
||||
|
||||
timr = lock_timer(timer_id, &flags);
|
||||
if (!timr)
|
||||
return -EINVAL;
|
||||
|
||||
CLOCK_DISPATCH(timr->it_clock, timer_get, (timr, &cur_setting));
|
||||
|
||||
unlock_timer(timr, flags);
|
||||
|
||||
if (copy_to_user(setting, &cur_setting, sizeof (cur_setting)))
|
||||
return -EFAULT;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Get the number of overruns of a POSIX.1b interval timer. This is to
|
||||
* be the overrun of the timer last delivered. At the same time we are
|
||||
* accumulating overruns on the next timer. The overrun is frozen when
|
||||
* the signal is delivered, either at the notify time (if the info block
|
||||
* is not queued) or at the actual delivery time (as we are informed by
|
||||
* the call back to do_schedule_next_timer(). So all we need to do is
|
||||
* to pick up the frozen overrun.
|
||||
*/
|
||||
asmlinkage long
|
||||
sys_timer_getoverrun(timer_t timer_id)
|
||||
{
|
||||
struct k_itimer *timr;
|
||||
int overrun;
|
||||
long flags;
|
||||
|
||||
timr = lock_timer(timer_id, &flags);
|
||||
if (!timr)
|
||||
return -EINVAL;
|
||||
|
||||
overrun = timr->it_overrun_last;
|
||||
unlock_timer(timr, flags);
|
||||
|
||||
return overrun;
|
||||
}
|
||||
|
||||
/* Set a POSIX.1b interval timer. */
|
||||
/* timr->it_lock is taken. */
|
||||
static int
|
||||
common_timer_set(struct k_itimer *timr, int flags,
|
||||
struct itimerspec *new_setting, struct itimerspec *old_setting)
|
||||
{
|
||||
struct hrtimer *timer = &timr->it.real.timer;
|
||||
enum hrtimer_mode mode;
|
||||
|
||||
if (old_setting)
|
||||
common_timer_get(timr, old_setting);
|
||||
|
||||
/* disable the timer */
|
||||
timr->it.real.interval.tv64 = 0;
|
||||
/*
|
||||
* careful here. If smp we could be in the "fire" routine which will
|
||||
* be spinning as we hold the lock. But this is ONLY an SMP issue.
|
||||
*/
|
||||
if (hrtimer_try_to_cancel(timer) < 0)
|
||||
return TIMER_RETRY;
|
||||
|
||||
timr->it_requeue_pending = (timr->it_requeue_pending + 2) &
|
||||
~REQUEUE_PENDING;
|
||||
timr->it_overrun_last = 0;
|
||||
|
||||
/* switch off the timer when it_value is zero */
|
||||
if (!new_setting->it_value.tv_sec && !new_setting->it_value.tv_nsec)
|
||||
return 0;
|
||||
|
||||
mode = flags & TIMER_ABSTIME ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL;
|
||||
hrtimer_init(&timr->it.real.timer, timr->it_clock, mode);
|
||||
timr->it.real.timer.function = posix_timer_fn;
|
||||
|
||||
timer->expires = timespec_to_ktime(new_setting->it_value);
|
||||
|
||||
/* Convert interval */
|
||||
timr->it.real.interval = timespec_to_ktime(new_setting->it_interval);
|
||||
|
||||
/* SIGEV_NONE timers are not queued ! See common_timer_get */
|
||||
if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) {
|
||||
/* Setup correct expiry time for relative timers */
|
||||
if (mode == HRTIMER_MODE_REL)
|
||||
timer->expires = ktime_add(timer->expires,
|
||||
timer->base->get_time());
|
||||
return 0;
|
||||
}
|
||||
|
||||
hrtimer_start(timer, timer->expires, mode);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Set a POSIX.1b interval timer */
|
||||
asmlinkage long
|
||||
sys_timer_settime(timer_t timer_id, int flags,
|
||||
const struct itimerspec __user *new_setting,
|
||||
struct itimerspec __user *old_setting)
|
||||
{
|
||||
struct k_itimer *timr;
|
||||
struct itimerspec new_spec, old_spec;
|
||||
int error = 0;
|
||||
long flag;
|
||||
struct itimerspec *rtn = old_setting ? &old_spec : NULL;
|
||||
|
||||
if (!new_setting)
|
||||
return -EINVAL;
|
||||
|
||||
if (copy_from_user(&new_spec, new_setting, sizeof (new_spec)))
|
||||
return -EFAULT;
|
||||
|
||||
if (!timespec_valid(&new_spec.it_interval) ||
|
||||
!timespec_valid(&new_spec.it_value))
|
||||
return -EINVAL;
|
||||
retry:
|
||||
timr = lock_timer(timer_id, &flag);
|
||||
if (!timr)
|
||||
return -EINVAL;
|
||||
|
||||
error = CLOCK_DISPATCH(timr->it_clock, timer_set,
|
||||
(timr, flags, &new_spec, rtn));
|
||||
|
||||
unlock_timer(timr, flag);
|
||||
if (error == TIMER_RETRY) {
|
||||
rtn = NULL; // We already got the old time...
|
||||
goto retry;
|
||||
}
|
||||
|
||||
if (old_setting && !error &&
|
||||
copy_to_user(old_setting, &old_spec, sizeof (old_spec)))
|
||||
error = -EFAULT;
|
||||
|
||||
return error;
|
||||
}
|
||||
|
||||
static inline int common_timer_del(struct k_itimer *timer)
|
||||
{
|
||||
timer->it.real.interval.tv64 = 0;
|
||||
|
||||
if (hrtimer_try_to_cancel(&timer->it.real.timer) < 0)
|
||||
return TIMER_RETRY;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline int timer_delete_hook(struct k_itimer *timer)
|
||||
{
|
||||
return CLOCK_DISPATCH(timer->it_clock, timer_del, (timer));
|
||||
}
|
||||
|
||||
/* Delete a POSIX.1b interval timer. */
|
||||
asmlinkage long
|
||||
sys_timer_delete(timer_t timer_id)
|
||||
{
|
||||
struct k_itimer *timer;
|
||||
long flags;
|
||||
|
||||
retry_delete:
|
||||
timer = lock_timer(timer_id, &flags);
|
||||
if (!timer)
|
||||
return -EINVAL;
|
||||
|
||||
if (timer_delete_hook(timer) == TIMER_RETRY) {
|
||||
unlock_timer(timer, flags);
|
||||
goto retry_delete;
|
||||
}
|
||||
|
||||
spin_lock(¤t->sighand->siglock);
|
||||
list_del(&timer->list);
|
||||
spin_unlock(¤t->sighand->siglock);
|
||||
/*
|
||||
* This keeps any tasks waiting on the spin lock from thinking
|
||||
* they got something (see the lock code above).
|
||||
*/
|
||||
if (timer->it_process) {
|
||||
if (timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
|
||||
put_task_struct(timer->it_process);
|
||||
timer->it_process = NULL;
|
||||
}
|
||||
unlock_timer(timer, flags);
|
||||
release_posix_timer(timer, IT_ID_SET);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* return timer owned by the process, used by exit_itimers
|
||||
*/
|
||||
static void itimer_delete(struct k_itimer *timer)
|
||||
{
|
||||
unsigned long flags;
|
||||
|
||||
retry_delete:
|
||||
spin_lock_irqsave(&timer->it_lock, flags);
|
||||
|
||||
if (timer_delete_hook(timer) == TIMER_RETRY) {
|
||||
unlock_timer(timer, flags);
|
||||
goto retry_delete;
|
||||
}
|
||||
list_del(&timer->list);
|
||||
/*
|
||||
* This keeps any tasks waiting on the spin lock from thinking
|
||||
* they got something (see the lock code above).
|
||||
*/
|
||||
if (timer->it_process) {
|
||||
if (timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
|
||||
put_task_struct(timer->it_process);
|
||||
timer->it_process = NULL;
|
||||
}
|
||||
unlock_timer(timer, flags);
|
||||
release_posix_timer(timer, IT_ID_SET);
|
||||
}
|
||||
|
||||
/*
|
||||
* This is called by do_exit or de_thread, only when there are no more
|
||||
* references to the shared signal_struct.
|
||||
*/
|
||||
void exit_itimers(struct signal_struct *sig)
|
||||
{
|
||||
struct k_itimer *tmr;
|
||||
|
||||
while (!list_empty(&sig->posix_timers)) {
|
||||
tmr = list_entry(sig->posix_timers.next, struct k_itimer, list);
|
||||
itimer_delete(tmr);
|
||||
}
|
||||
}
|
||||
|
||||
/* Not available / possible... functions */
|
||||
int do_posix_clock_nosettime(const clockid_t clockid, struct timespec *tp)
|
||||
{
|
||||
return -EINVAL;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(do_posix_clock_nosettime);
|
||||
|
||||
/* Stub for clocks that do not support clock_nanosleep(). */
int do_posix_clock_nonanosleep(const clockid_t clock, int flags,
			       struct timespec *t, struct timespec __user *r)
{
#ifndef ENOTSUP
	return -EOPNOTSUPP;	/* aka ENOTSUP in userland for POSIX */
#else
	/* parisc does define it separately. */
	return -ENOTSUP;
#endif
}
EXPORT_SYMBOL_GPL(do_posix_clock_nonanosleep);
|
||||
|
||||
asmlinkage long sys_clock_settime(const clockid_t which_clock,
|
||||
const struct timespec __user *tp)
|
||||
{
|
||||
struct timespec new_tp;
|
||||
|
||||
if (invalid_clockid(which_clock))
|
||||
return -EINVAL;
|
||||
if (copy_from_user(&new_tp, tp, sizeof (*tp)))
|
||||
return -EFAULT;
|
||||
|
||||
return CLOCK_DISPATCH(which_clock, clock_set, (which_clock, &new_tp));
|
||||
}
|
||||
|
||||
asmlinkage long
|
||||
sys_clock_gettime(const clockid_t which_clock, struct timespec __user *tp)
|
||||
{
|
||||
struct timespec kernel_tp;
|
||||
int error;
|
||||
|
||||
if (invalid_clockid(which_clock))
|
||||
return -EINVAL;
|
||||
error = CLOCK_DISPATCH(which_clock, clock_get,
|
||||
(which_clock, &kernel_tp));
|
||||
if (!error && copy_to_user(tp, &kernel_tp, sizeof (kernel_tp)))
|
||||
error = -EFAULT;
|
||||
|
||||
return error;
|
||||
|
||||
}
|
||||
|
||||
asmlinkage long
|
||||
sys_clock_getres(const clockid_t which_clock, struct timespec __user *tp)
|
||||
{
|
||||
struct timespec rtn_tp;
|
||||
int error;
|
||||
|
||||
if (invalid_clockid(which_clock))
|
||||
return -EINVAL;
|
||||
|
||||
error = CLOCK_DISPATCH(which_clock, clock_getres,
|
||||
(which_clock, &rtn_tp));
|
||||
|
||||
if (!error && tp && copy_to_user(tp, &rtn_tp, sizeof (rtn_tp))) {
|
||||
error = -EFAULT;
|
||||
}
|
||||
|
||||
return error;
|
||||
}
|
||||
|
||||
/*
|
||||
* nanosleep for monotonic and realtime clocks
|
||||
*/
|
||||
static int common_nsleep(const clockid_t which_clock, int flags,
|
||||
struct timespec *tsave, struct timespec __user *rmtp)
|
||||
{
|
||||
return hrtimer_nanosleep(tsave, rmtp, flags & TIMER_ABSTIME ?
|
||||
HRTIMER_MODE_ABS : HRTIMER_MODE_REL,
|
||||
which_clock);
|
||||
}
|
||||
|
||||
asmlinkage long
|
||||
sys_clock_nanosleep(const clockid_t which_clock, int flags,
|
||||
const struct timespec __user *rqtp,
|
||||
struct timespec __user *rmtp)
|
||||
{
|
||||
struct timespec t;
|
||||
|
||||
if (invalid_clockid(which_clock))
|
||||
return -EINVAL;
|
||||
|
||||
if (copy_from_user(&t, rqtp, sizeof (struct timespec)))
|
||||
return -EFAULT;
|
||||
|
||||
if (!timespec_valid(&t))
|
||||
return -EINVAL;
|
||||
|
||||
return CLOCK_DISPATCH(which_clock, nsleep,
|
||||
(which_clock, flags, &t, rmtp));
|
||||
}
|
||||
|
||||
/*
 * nanosleep_restart for monotonic and realtime clocks
 */
static int common_nsleep_restart(struct restart_block *restart_block)
{
	return hrtimer_nanosleep_restart(restart_block);
}
|
||||
|
||||
/*
|
||||
* This will restart clock_nanosleep. This is required only by
|
||||
* compat_clock_nanosleep_restart for now.
|
||||
*/
|
||||
long
|
||||
clock_nanosleep_restart(struct restart_block *restart_block)
|
||||
{
|
||||
clockid_t which_clock = restart_block->arg0;
|
||||
|
||||
return CLOCK_DISPATCH(which_clock, nsleep_restart,
|
||||
(restart_block));
|
||||
}
|
||||
173
kernel/power/Kconfig
Normal file
173
kernel/power/Kconfig
Normal file
@@ -0,0 +1,173 @@
|
||||
config PM
|
||||
bool "Power Management support"
|
||||
depends on !IA64_HP_SIM
|
||||
---help---
|
||||
"Power Management" means that parts of your computer are shut
|
||||
off or put into a power conserving "sleep" mode if they are not
|
||||
being used. There are two competing standards for doing this: APM
|
||||
and ACPI. If you want to use either one, say Y here and then also
|
||||
to the requisite support below.
|
||||
|
||||
Power Management is most important for battery powered laptop
|
||||
computers; if you have a laptop, check out the Linux Laptop home
|
||||
page on the WWW at <http://www.linux-on-laptops.com/> or
|
||||
Tuxmobil - Linux on Mobile Computers at <http://www.tuxmobil.org/>
|
||||
and the Battery Powered Linux mini-HOWTO, available from
|
||||
<http://www.tldp.org/docs.html#howto>.
|
||||
|
||||
Note that, even if you say N here, Linux on the x86 architecture
|
||||
will issue the hlt instruction if nothing is to be done, thereby
|
||||
sending the processor to sleep and saving power.
|
||||
|
||||
config PM_LEGACY
|
||||
bool "Legacy Power Management API (DEPRECATED)"
|
||||
depends on PM
|
||||
default n
|
||||
---help---
|
||||
Support for pm_register() and friends. This old API is obsoleted
|
||||
by the driver model.
|
||||
|
||||
If unsure, say N.
|
||||
|
||||
config PM_CPU_MODE
|
||||
bool "PM_CPU_MODE"
|
||||
depends on PM
|
||||
default y
|
||||
---help---
|
||||
This option enables CPU low-power mode support to reduce power consumption.
|
||||
|
||||
config PM_DEBUG
|
||||
bool "Power Management Debug Support"
|
||||
depends on PM
|
||||
---help---
|
||||
This option enables verbose debugging support in the Power Management
|
||||
code. This is helpful when debugging and reporting various PM bugs,
|
||||
like suspend support.
|
||||
|
||||
config DISABLE_CONSOLE_SUSPEND
|
||||
bool "Keep console(s) enabled during suspend/resume (DANGEROUS)"
|
||||
depends on PM && PM_DEBUG
|
||||
default n
|
||||
---help---
|
||||
This option turns off the console suspend mechanism that prevents
|
||||
debug messages from reaching the console during the suspend/resume
|
||||
operations. This may be helpful when debugging device drivers'
|
||||
suspend/resume routines, but may itself lead to problems, for example
|
||||
if netconsole is used.
|
||||
|
||||
config PM_TRACE
|
||||
bool "Suspend/resume event tracing"
|
||||
depends on PM && PM_DEBUG && X86_32 && EXPERIMENTAL
|
||||
default n
|
||||
---help---
|
||||
This enables some cheesy code to save the last PM event point in the
|
||||
RTC across reboots, so that you can debug a machine that just hangs
|
||||
during suspend (or more commonly, during resume).
|
||||
|
||||
To use this debugging feature you should attempt to suspend the machine,
|
||||
then reboot it, then run
|
||||
|
||||
dmesg -s 1000000 | grep 'hash matches'
|
||||
|
||||
CAUTION: this option will cause your machine's real-time clock to be
|
||||
set to an invalid time after a resume.
|
||||
|
||||
config PM_SYSFS_DEPRECATED
|
||||
bool "Driver model /sys/devices/.../power/state files (DEPRECATED)"
|
||||
depends on PM && SYSFS
|
||||
default n
|
||||
help
|
||||
The driver model started out with a sysfs file intended to provide
|
||||
a userspace hook for device power management. This feature has never
|
||||
worked very well, except for limited testing purposes, and so it will
|
||||
be removed. It's not clear that a generic mechanism could really
|
||||
handle the wide variability of device power states; any replacements
|
||||
are likely to be bus or driver specific.
|
||||
|
||||
config SOFTWARE_SUSPEND
|
||||
bool "Software Suspend"
|
||||
depends on PM && SWAP && ((X86 && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP))
|
||||
---help---
|
||||
Enable the suspend to disk (STD) functionality.
|
||||
|
||||
You can suspend your machine with 'echo disk > /sys/power/state'.
|
||||
Alternatively, you can use the additional userland tools available
|
||||
from <http://suspend.sf.net>.
|
||||
|
||||
In principle it does not require ACPI or APM, although for example
|
||||
ACPI will be used if available.
|
||||
|
||||
It creates an image which is saved in your active swap. Upon the next
|
||||
boot, pass the 'resume=/dev/swappartition' argument to the kernel to
|
||||
have it detect the saved image, restore memory state from it, and
|
||||
continue to run as before. If you do not want the previous state to
|
||||
be reloaded, then use the 'noresume' kernel command line argument.
|
||||
Note, however, that fsck will be run on your filesystems and you will
|
||||
need to run mkswap against the swap partition used for the suspend.
|
||||
|
||||
It also works with swap files to a limited extent (for details see
|
||||
<file:Documentation/power/swsusp-and-swap-files.txt>).
|
||||
|
||||
Right now you may boot without resuming and resume later but in the
|
||||
meantime you cannot use the swap partition(s)/file(s) involved in
|
||||
suspending. Also in this case you must not use the filesystems
|
||||
that were mounted before the suspend. In particular, you MUST NOT
|
||||
MOUNT any journaled filesystems mounted before the suspend or they
|
||||
will get corrupted in a nasty way.
|
||||
|
||||
For more information take a look at <file:Documentation/power/swsusp.txt>.
|
||||
|
||||
config PM_STD_PARTITION
|
||||
string "Default resume partition"
|
||||
depends on SOFTWARE_SUSPEND
|
||||
default ""
|
||||
---help---
|
||||
The default resume partition is the partition that the suspend-
|
||||
to-disk implementation will look for a suspended disk image.
|
||||
|
||||
The partition specified here will be different for almost every user.
|
||||
It should be a valid swap partition (at least for now) that is turned
|
||||
on before suspending.
|
||||
|
||||
The partition specified can be overridden by specifying:
|
||||
|
||||
resume=/dev/<other device>
|
||||
|
||||
which will set the resume partition to the device specified.
|
||||
|
||||
Note there is currently not a way to specify which device to save the
|
||||
suspended image to. It will simply pick the first available swap
|
||||
device.
|
||||
|
||||
config SUSPEND_SMP
|
||||
bool
|
||||
depends on HOTPLUG_CPU && X86 && PM
|
||||
default y
|
||||
|
||||
config APM_EMULATION
|
||||
tristate "Advanced Power Management Emulation"
|
||||
depends on PM && SYS_SUPPORTS_APM_EMULATION
|
||||
help
|
||||
APM is a BIOS specification for saving power using several different
|
||||
techniques. This is mostly useful for battery powered laptops with
|
||||
APM compliant BIOSes. If you say Y here, the system time will be
|
||||
reset after a RESUME operation, the /proc/apm device will provide
|
||||
battery status information, and user-space programs will receive
|
||||
notification of APM "events" (e.g. battery status change).
|
||||
|
||||
In order to use APM, you will need supporting software. For location
|
||||
and more information, read <file:Documentation/pm.txt> and the
|
||||
Battery Powered Linux mini-HOWTO, available from
|
||||
<http://www.tldp.org/docs.html#howto>.
|
||||
|
||||
This driver does not spin down disk drives (see the hdparm(8)
|
||||
manpage ("man 8 hdparm") for that), and it doesn't turn off
|
||||
VESA-compliant "green" monitors.
|
||||
|
||||
Generally, if you don't have a battery in your machine, there isn't
|
||||
much point in using this driver and you should say N. If you get
|
||||
random kernel OOPSes or reboots that don't seem to be related to
|
||||
anything, try disabling/enabling this option (or disabling/enabling
|
||||
APM in your BIOS).
|
||||
|
||||
source "drivers/char/s3c-dvfs/Kconfig"
|
||||
10
kernel/power/Makefile
Normal file
10
kernel/power/Makefile
Normal file
@@ -0,0 +1,10 @@
|
||||
|
||||
ifeq ($(CONFIG_PM_DEBUG),y)
|
||||
EXTRA_CFLAGS += -DDEBUG
|
||||
endif
|
||||
|
||||
obj-y := main.o process.o console.o
|
||||
obj-$(CONFIG_PM_LEGACY) += pm.o
|
||||
obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o disk.o snapshot.o swap.o user.o
|
||||
|
||||
obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o
|
||||
58
kernel/power/console.c
Normal file
58
kernel/power/console.c
Normal file
@@ -0,0 +1,58 @@
|
||||
/*
|
||||
* kernel/power/console.c - Functions for saving/restoring console.
|
||||
*
|
||||
* Originally from swsusp.
|
||||
*/
|
||||
|
||||
#include <linux/vt_kern.h>
|
||||
#include <linux/kbd_kern.h>
|
||||
#include <linux/console.h>
|
||||
#include "power.h"
|
||||
|
||||
#if defined(CONFIG_VT) && defined(CONFIG_VT_CONSOLE)
|
||||
#define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1)
|
||||
|
||||
static int orig_fgconsole, orig_kmsg;
|
||||
|
||||
int pm_prepare_console(void)
|
||||
{
|
||||
acquire_console_sem();
|
||||
|
||||
orig_fgconsole = fg_console;
|
||||
|
||||
if (vc_allocate(SUSPEND_CONSOLE)) {
|
||||
/* we can't have a free VC for now. Too bad,
|
||||
* we don't want to mess the screen for now. */
|
||||
release_console_sem();
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (set_console(SUSPEND_CONSOLE)) {
|
||||
/*
|
||||
* We're unable to switch to the SUSPEND_CONSOLE.
|
||||
* Let the calling function know so it can decide
|
||||
* what to do.
|
||||
*/
|
||||
release_console_sem();
|
||||
return 1;
|
||||
}
|
||||
release_console_sem();
|
||||
|
||||
if (vt_waitactive(SUSPEND_CONSOLE)) {
|
||||
pr_debug("Suspend: Can't switch VCs.");
|
||||
return 1;
|
||||
}
|
||||
orig_kmsg = kmsg_redirect;
|
||||
kmsg_redirect = SUSPEND_CONSOLE;
|
||||
return 0;
|
||||
}
|
||||
|
||||
void pm_restore_console(void)
|
||||
{
|
||||
acquire_console_sem();
|
||||
set_console(orig_fgconsole);
|
||||
release_console_sem();
|
||||
kmsg_redirect = orig_kmsg;
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
465
kernel/power/disk.c
Normal file
465
kernel/power/disk.c
Normal file
@@ -0,0 +1,465 @@
|
||||
/*
|
||||
* kernel/power/disk.c - Suspend-to-disk support.
|
||||
*
|
||||
* Copyright (c) 2003 Patrick Mochel
|
||||
* Copyright (c) 2003 Open Source Development Lab
|
||||
* Copyright (c) 2004 Pavel Machek <pavel@suse.cz>
|
||||
*
|
||||
* This file is released under the GPLv2.
|
||||
*
|
||||
*/
|
||||
|
||||
#include <linux/suspend.h>
|
||||
#include <linux/syscalls.h>
|
||||
#include <linux/reboot.h>
|
||||
#include <linux/string.h>
|
||||
#include <linux/device.h>
|
||||
#include <linux/delay.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/mount.h>
|
||||
#include <linux/pm.h>
|
||||
#include <linux/console.h>
|
||||
#include <linux/cpu.h>
|
||||
#include <linux/freezer.h>
|
||||
|
||||
#include "power.h"
|
||||
|
||||
|
||||
static int noresume = 0;
|
||||
char resume_file[256] = CONFIG_PM_STD_PARTITION;
|
||||
dev_t swsusp_resume_device;
|
||||
sector_t swsusp_resume_block;
|
||||
|
||||
/**
|
||||
* platform_prepare - prepare the machine for hibernation using the
|
||||
* platform driver if so configured and return an error code if it fails
|
||||
*/
|
||||
|
||||
static inline int platform_prepare(void)
|
||||
{
|
||||
int error = 0;
|
||||
|
||||
if (pm_disk_mode == PM_DISK_PLATFORM) {
|
||||
if (pm_ops && pm_ops->prepare)
|
||||
error = pm_ops->prepare(PM_SUSPEND_DISK);
|
||||
}
|
||||
return error;
|
||||
}
|
||||
|
||||
/**
|
||||
* power_down - Shut machine down for hibernate.
|
||||
* @mode: Suspend-to-disk mode
|
||||
*
|
||||
* Use the platform driver, if configured so, and return gracefully if it
|
||||
* fails.
|
||||
* Otherwise, try to power off and reboot. If they fail, halt the machine,
|
||||
* there ain't no turning back.
|
||||
*/
|
||||
|
||||
static void power_down(suspend_disk_method_t mode)
|
||||
{
|
||||
switch(mode) {
|
||||
case PM_DISK_PLATFORM:
|
||||
if (pm_ops && pm_ops->enter) {
|
||||
kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK);
|
||||
pm_ops->enter(PM_SUSPEND_DISK);
|
||||
break;
|
||||
}
|
||||
case PM_DISK_SHUTDOWN:
|
||||
kernel_power_off();
|
||||
break;
|
||||
case PM_DISK_REBOOT:
|
||||
kernel_restart(NULL);
|
||||
break;
|
||||
}
|
||||
kernel_halt();
|
||||
/* Valid image is on the disk, if we continue we risk serious data corruption
|
||||
after resume. */
|
||||
printk(KERN_CRIT "Please power me down manually\n");
|
||||
while(1);
|
||||
}
|
||||
|
||||
static inline void platform_finish(void)
|
||||
{
|
||||
if (pm_disk_mode == PM_DISK_PLATFORM) {
|
||||
if (pm_ops && pm_ops->finish)
|
||||
pm_ops->finish(PM_SUSPEND_DISK);
|
||||
}
|
||||
}
|
||||
|
||||
/* Thaw all frozen tasks and give the console back to userspace. */
static void unprepare_processes(void)
{
	thaw_processes();
	pm_restore_console();
}
|
||||
|
||||
static int prepare_processes(void)
|
||||
{
|
||||
int error = 0;
|
||||
|
||||
pm_prepare_console();
|
||||
if (freeze_processes()) {
|
||||
error = -EBUSY;
|
||||
unprepare_processes();
|
||||
}
|
||||
return error;
|
||||
}
|
||||
|
||||
/**
|
||||
* pm_suspend_disk - The granpappy of hibernation power management.
|
||||
*
|
||||
* If we're going through the firmware, then get it over with quickly.
|
||||
*
|
||||
* If not, then call swsusp to do its thing, then figure out how
|
||||
* to power down the system.
|
||||
*/
|
||||
|
||||
int pm_suspend_disk(void)
|
||||
{
|
||||
int error;
|
||||
|
||||
error = prepare_processes();
|
||||
if (error)
|
||||
return error;
|
||||
|
||||
if (pm_disk_mode == PM_DISK_TESTPROC) {
|
||||
printk("swsusp debug: Waiting for 5 seconds.\n");
|
||||
mdelay(5000);
|
||||
goto Thaw;
|
||||
}
|
||||
/* Free memory before shutting down devices. */
|
||||
error = swsusp_shrink_memory();
|
||||
if (error)
|
||||
goto Thaw;
|
||||
|
||||
error = platform_prepare();
|
||||
if (error)
|
||||
goto Thaw;
|
||||
|
||||
suspend_console();
|
||||
error = device_suspend(PMSG_FREEZE);
|
||||
if (error) {
|
||||
printk(KERN_ERR "PM: Some devices failed to suspend\n");
|
||||
goto Resume_devices;
|
||||
}
|
||||
error = disable_nonboot_cpus();
|
||||
if (error)
|
||||
goto Enable_cpus;
|
||||
|
||||
if (pm_disk_mode == PM_DISK_TEST) {
|
||||
printk("swsusp debug: Waiting for 5 seconds.\n");
|
||||
mdelay(5000);
|
||||
goto Enable_cpus;
|
||||
}
|
||||
|
||||
pr_debug("PM: snapshotting memory.\n");
|
||||
in_suspend = 1;
|
||||
error = swsusp_suspend();
|
||||
if (error)
|
||||
goto Enable_cpus;
|
||||
|
||||
if (in_suspend) {
|
||||
enable_nonboot_cpus();
|
||||
platform_finish();
|
||||
device_resume();
|
||||
resume_console();
|
||||
pr_debug("PM: writing image.\n");
|
||||
error = swsusp_write();
|
||||
if (!error)
|
||||
power_down(pm_disk_mode);
|
||||
else {
|
||||
swsusp_free();
|
||||
goto Thaw;
|
||||
}
|
||||
} else {
|
||||
pr_debug("PM: Image restored successfully.\n");
|
||||
}
|
||||
|
||||
swsusp_free();
|
||||
Enable_cpus:
|
||||
enable_nonboot_cpus();
|
||||
Resume_devices:
|
||||
platform_finish();
|
||||
device_resume();
|
||||
resume_console();
|
||||
Thaw:
|
||||
unprepare_processes();
|
||||
return error;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* software_resume - Resume from a saved image.
|
||||
*
|
||||
* Called as a late_initcall (so all devices are discovered and
|
||||
* initialized), we call swsusp to see if we have a saved image or not.
|
||||
* If so, we quiesce devices, the restore the saved image. We will
|
||||
* return above (in pm_suspend_disk() ) if everything goes well.
|
||||
* Otherwise, we fail gracefully and return to the normally
|
||||
* scheduled program.
|
||||
*
|
||||
*/
|
||||
|
||||
static int software_resume(void)
|
||||
{
|
||||
int error;
|
||||
|
||||
mutex_lock(&pm_mutex);
|
||||
if (!swsusp_resume_device) {
|
||||
if (!strlen(resume_file)) {
|
||||
mutex_unlock(&pm_mutex);
|
||||
return -ENOENT;
|
||||
}
|
||||
swsusp_resume_device = name_to_dev_t(resume_file);
|
||||
pr_debug("swsusp: Resume From Partition %s\n", resume_file);
|
||||
} else {
|
||||
pr_debug("swsusp: Resume From Partition %d:%d\n",
|
||||
MAJOR(swsusp_resume_device), MINOR(swsusp_resume_device));
|
||||
}
|
||||
|
||||
if (noresume) {
|
||||
/**
|
||||
* FIXME: If noresume is specified, we need to find the partition
|
||||
* and reset it back to normal swap space.
|
||||
*/
|
||||
mutex_unlock(&pm_mutex);
|
||||
return 0;
|
||||
}
|
||||
|
||||
pr_debug("PM: Checking swsusp image.\n");
|
||||
|
||||
error = swsusp_check();
|
||||
if (error)
|
||||
goto Done;
|
||||
|
||||
pr_debug("PM: Preparing processes for restore.\n");
|
||||
|
||||
error = prepare_processes();
|
||||
if (error) {
|
||||
swsusp_close();
|
||||
goto Done;
|
||||
}
|
||||
|
||||
pr_debug("PM: Reading swsusp image.\n");
|
||||
|
||||
error = swsusp_read();
|
||||
if (error) {
|
||||
swsusp_free();
|
||||
goto Thaw;
|
||||
}
|
||||
|
||||
pr_debug("PM: Preparing devices for restore.\n");
|
||||
|
||||
suspend_console();
|
||||
error = device_suspend(PMSG_PRETHAW);
|
||||
if (error)
|
||||
goto Free;
|
||||
|
||||
error = disable_nonboot_cpus();
|
||||
if (!error)
|
||||
swsusp_resume();
|
||||
|
||||
enable_nonboot_cpus();
|
||||
Free:
|
||||
swsusp_free();
|
||||
device_resume();
|
||||
resume_console();
|
||||
Thaw:
|
||||
printk(KERN_ERR "PM: Restore failed, recovering.\n");
|
||||
unprepare_processes();
|
||||
Done:
|
||||
/* For success case, the suspend path will release the lock */
|
||||
mutex_unlock(&pm_mutex);
|
||||
pr_debug("PM: Resume from disk failed.\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
late_initcall(software_resume);
|
||||
|
||||
|
||||
static const char * const pm_disk_modes[] = {
|
||||
[PM_DISK_FIRMWARE] = "firmware",
|
||||
[PM_DISK_PLATFORM] = "platform",
|
||||
[PM_DISK_SHUTDOWN] = "shutdown",
|
||||
[PM_DISK_REBOOT] = "reboot",
|
||||
[PM_DISK_TEST] = "test",
|
||||
[PM_DISK_TESTPROC] = "testproc",
|
||||
};
|
||||
|
||||
/**
|
||||
* disk - Control suspend-to-disk mode
|
||||
*
|
||||
* Suspend-to-disk can be handled in several ways. The greatest
|
||||
* distinction is who writes memory to disk - the firmware or the OS.
|
||||
* If the firmware does it, we assume that it also handles suspending
|
||||
* the system.
|
||||
* If the OS does it, then we have three options for putting the system
|
||||
* to sleep - using the platform driver (e.g. ACPI or other PM registers),
|
||||
* powering off the system or rebooting the system (for testing).
|
||||
*
|
||||
* The system will support either 'firmware' or 'platform', and that is
|
||||
* known a priori (and encoded in pm_ops). But, the user may choose
|
||||
* 'shutdown' or 'reboot' as alternatives.
|
||||
*
|
||||
* show() will display what the mode is currently set to.
|
||||
* store() will accept one of
|
||||
*
|
||||
* 'firmware'
|
||||
* 'platform'
|
||||
* 'shutdown'
|
||||
* 'reboot'
|
||||
*
|
||||
* It will only change to 'firmware' or 'platform' if the system
|
||||
* supports it (as determined from pm_ops->pm_disk_mode).
|
||||
*/
|
||||
|
||||
static ssize_t disk_show(struct subsystem * subsys, char * buf)
|
||||
{
|
||||
return sprintf(buf, "%s\n", pm_disk_modes[pm_disk_mode]);
|
||||
}
|
||||
|
||||
|
||||
/* sysfs store: select the suspend-to-disk mode by name. */
static ssize_t disk_store(struct subsystem * s, const char * buf, size_t n)
{
	suspend_disk_method_t mode = 0;
	int error = 0;
	char *p;
	int len;
	int i;

	/* Ignore a trailing newline when matching mode names. */
	p = memchr(buf, '\n', n);
	len = p ? p - buf : n;

	mutex_lock(&pm_mutex);
	for (i = PM_DISK_FIRMWARE; i < PM_DISK_MAX; i++) {
		if (!strncmp(buf, pm_disk_modes[i], len)) {
			mode = i;
			break;
		}
	}
	if (mode) {
		if (mode == PM_DISK_SHUTDOWN || mode == PM_DISK_REBOOT ||
		    mode == PM_DISK_TEST || mode == PM_DISK_TESTPROC) {
			pm_disk_mode = mode;
		} else {
			/* 'firmware'/'platform' only if pm_ops supports it. */
			if (pm_ops && pm_ops->enter &&
			    (mode == pm_ops->pm_disk_mode))
				pm_disk_mode = mode;
			else
				error = -EINVAL;
		}
	} else {
		error = -EINVAL;
	}

	pr_debug("PM: suspend-to-disk mode set to '%s'\n",
		 pm_disk_modes[mode]);
	mutex_unlock(&pm_mutex);
	return error ? error : n;
}
|
||||
|
||||
power_attr(disk);
|
||||
|
||||
static ssize_t resume_show(struct subsystem * subsys, char *buf)
|
||||
{
|
||||
return sprintf(buf,"%d:%d\n", MAJOR(swsusp_resume_device),
|
||||
MINOR(swsusp_resume_device));
|
||||
}
|
||||
|
||||
/* sysfs store: set the resume device ("major:minor") and try a resume. */
static ssize_t resume_store(struct subsystem *subsys, const char *buf, size_t n)
{
	unsigned int maj, min;
	int ret = -EINVAL;
	dev_t res;

	if (sscanf(buf, "%u:%u", &maj, &min) != 2)
		goto out;

	res = MKDEV(maj, min);
	/* Reject numbers that do not round-trip through MKDEV(). */
	if (maj != MAJOR(res) || min != MINOR(res))
		goto out;

	mutex_lock(&pm_mutex);
	swsusp_resume_device = res;
	mutex_unlock(&pm_mutex);
	printk("Attempting manual resume\n");
	noresume = 0;
	software_resume();
	ret = n;
 out:
	return ret;
}
|
||||
|
||||
power_attr(resume);
|
||||
|
||||
static ssize_t image_size_show(struct subsystem * subsys, char *buf)
|
||||
{
|
||||
return sprintf(buf, "%lu\n", image_size);
|
||||
}
|
||||
|
||||
/* sysfs store: set the target hibernation image size (decimal bytes). */
static ssize_t image_size_store(struct subsystem * subsys, const char * buf, size_t n)
{
	unsigned long size;

	if (sscanf(buf, "%lu", &size) != 1)
		return -EINVAL;

	image_size = size;
	return n;
}
|
||||
|
||||
power_attr(image_size);
|
||||
|
||||
static struct attribute * g[] = {
|
||||
&disk_attr.attr,
|
||||
&resume_attr.attr,
|
||||
&image_size_attr.attr,
|
||||
NULL,
|
||||
};
|
||||
|
||||
|
||||
static struct attribute_group attr_group = {
|
||||
.attrs = g,
|
||||
};
|
||||
|
||||
|
||||
/* Register the suspend-to-disk attributes under /sys/power. */
static int __init pm_disk_init(void)
{
	return sysfs_create_group(&power_subsys.kset.kobj, &attr_group);
}
|
||||
|
||||
core_initcall(pm_disk_init);
|
||||
|
||||
|
||||
/* "resume=" boot option: name of the partition holding the image. */
static int __init resume_setup(char *str)
{
	if (!noresume)
		strncpy(resume_file, str, 255);

	return 1;
}
|
||||
|
||||
/* "resume_offset=" boot option: block offset of the swap image. */
static int __init resume_offset_setup(char *str)
{
	unsigned long long offset;

	if (!noresume && sscanf(str, "%llu", &offset) == 1)
		swsusp_resume_block = offset;

	return 1;
}
|
||||
|
||||
/* "noresume" boot option: do not look for a saved image at boot. */
static int __init noresume_setup(char *str)
{
	noresume = 1;
	return 1;
}
|
||||
|
||||
__setup("noresume", noresume_setup);
|
||||
__setup("resume_offset=", resume_offset_setup);
|
||||
__setup("resume=", resume_setup);
|
||||
451
kernel/power/main.c
Normal file
451
kernel/power/main.c
Normal file
@@ -0,0 +1,451 @@
|
||||
/*
|
||||
* kernel/power/main.c - PM subsystem core functionality.
|
||||
*
|
||||
* Copyright (c) 2003 Patrick Mochel
|
||||
* Copyright (c) 2003 Open Source Development Lab
|
||||
*
|
||||
* This file is released under the GPLv2
|
||||
*
|
||||
*/
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/suspend.h>
|
||||
#include <linux/kobject.h>
|
||||
#include <linux/string.h>
|
||||
#include <linux/delay.h>
|
||||
#include <linux/errno.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/pm.h>
|
||||
#include <linux/console.h>
|
||||
#include <linux/cpu.h>
|
||||
#include <linux/resume-trace.h>
|
||||
#include <linux/freezer.h>
|
||||
#include <linux/vmstat.h>
|
||||
#include "power.h"
|
||||
#if 0
|
||||
/* Qisda, ShiYong Lin, 2009/07/18, Send message when sleep{*/
|
||||
#include <linux/input.h>
|
||||
/* Qisda, ShiYong Lin, 2009/07/18, Send message when sleep}*/
|
||||
#endif
|
||||
#ifdef CONFIG_PM_CPU_MODE
|
||||
extern unsigned char pm_cpu_mode;
|
||||
#endif
|
||||
|
||||
/*This is just an arbitrary number */
|
||||
#define FREE_PAGE_NUMBER (100)
|
||||
|
||||
DEFINE_MUTEX(pm_mutex);
|
||||
|
||||
struct pm_ops *pm_ops;
|
||||
suspend_disk_method_t pm_disk_mode = PM_DISK_PLATFORM;
|
||||
/* Qisda, ShiYong Lin, 2009/07/18, Send message when sleep{*/
|
||||
extern void pm_keypad_message_to_ap (void);
|
||||
/* Qisda, ShiYong Lin, 2009/07/18, Send message when sleep}*/
|
||||
|
||||
/**
|
||||
* pm_set_ops - Set the global power method table.
|
||||
* @ops: Pointer to ops structure.
|
||||
*/
|
||||
|
||||
void pm_set_ops(struct pm_ops * ops)
{
	/* Take pm_mutex so the table cannot change under a suspend
	 * transition that is already in progress. */
	mutex_lock(&pm_mutex);
	pm_ops = ops;
	mutex_unlock(&pm_mutex);
}
|
||||
|
||||
/* Invoke the platform's optional ->finish() hook after wakeup.
 * NOTE(review): pm_ops is dereferenced unconditionally here; callers
 * are expected to have checked pm_ops != NULL beforehand. */
static inline void pm_finish(suspend_state_t state)
{
	if (pm_ops->finish)
		pm_ops->finish(state);
}
|
||||
|
||||
/**
|
||||
* suspend_prepare - Do prep work before entering low-power state.
|
||||
* @state: State we're entering.
|
||||
*
|
||||
* This is common code that is called for each state that we're
|
||||
* entering. Allocate a console, stop all processes, then make sure
|
||||
* the platform can enter the requested state.
|
||||
*/
|
||||
|
||||
static int suspend_prepare(suspend_state_t state)
{
	int error;
	unsigned int free_pages;

	/* Without a platform ->enter() hook there is nothing to enter. */
	if (!pm_ops || !pm_ops->enter)
		return -EPERM;

	pm_prepare_console();

	if (freeze_processes()) {
		error = -EAGAIN;
		goto Thaw;
	}

	/* Make sure at least FREE_PAGE_NUMBER pages are free before the
	 * transition; try to reclaim memory if they are not. */
	if ((free_pages = global_page_state(NR_FREE_PAGES))
			< FREE_PAGE_NUMBER) {
		pr_debug("PM: free some memory\n");
		shrink_all_memory(FREE_PAGE_NUMBER - free_pages);
		if (nr_free_pages() < FREE_PAGE_NUMBER) {
			error = -ENOMEM;
			printk(KERN_ERR "PM: Not enough memory\n");
			goto Thaw;
		}
	}

	if (pm_ops->prepare) {
		if ((error = pm_ops->prepare(state)))
			goto Thaw;
	}

	suspend_console();
	error = device_suspend(PMSG_SUSPEND);
	if (error) {
		printk(KERN_ERR "Some devices failed to suspend\n");
		goto Resume_devices;
	}
	error = disable_nonboot_cpus();
	if (!error)
		return 0;

	/* Unwind in reverse order of the steps above. */
	enable_nonboot_cpus();
 Resume_devices:
	pm_finish(state);
	device_resume();
	resume_console();
 Thaw:
	thaw_processes();
	pm_restore_console();
	return error;
}
|
||||
|
||||
|
||||
/* Final step of entering a sleep state: with interrupts disabled,
 * power the devices down, call the platform ->enter() hook, and power
 * everything back up on return. */
int suspend_enter(suspend_state_t state)
{
	int error = 0;
	unsigned long flags;

	local_irq_save(flags);

	if ((error = device_power_down(PMSG_SUSPEND))) {
		printk(KERN_ERR "Some devices failed to power down\n");
		goto Done;
	}
	/* pm_ops->enter() does not return until the system wakes up. */
	error = pm_ops->enter(state);
	device_power_up();
 Done:
	local_irq_restore(flags);
	return error;
}
|
||||
|
||||
|
||||
/**
|
||||
* suspend_finish - Do final work before exiting suspend sequence.
|
||||
* @state: State we're coming out of.
|
||||
*
|
||||
* Call platform code to clean up, restart processes, and free the
|
||||
* console that we've allocated. This is not called for suspend-to-disk.
|
||||
*/
|
||||
|
||||
static void suspend_finish(suspend_state_t state)
{
	/* Mirror image of suspend_prepare(): undo its steps in reverse. */
	enable_nonboot_cpus();
	pm_finish(state);
	device_resume();
	resume_console();
	thaw_processes();
	pm_restore_console();
}
|
||||
|
||||
|
||||
|
||||
|
||||
/* sysfs names for the sleep states, indexed by suspend_state_t.
 * Slots not listed (e.g. PM_SUSPEND_ON) stay NULL and are skipped by
 * state_show()/state_store(). */
static const char * const pm_states[PM_SUSPEND_MAX] = {
	[PM_SUSPEND_STANDBY]	= "standby",
	[PM_SUSPEND_MEM]	= "mem",
#ifdef CONFIG_PM_CPU_MODE
	[PM_SUSPEND_CPU_MODE]	= "cpu",
#endif
#ifdef CONFIG_SOFTWARE_SUSPEND
	[PM_SUSPEND_DISK]	= "disk",
#endif
};
|
||||
|
||||
/* Return nonzero if @state can be entered on this platform. */
static inline int valid_state(suspend_state_t state)
{
	/* Suspend-to-disk can always fall back to reboot mode, so it
	 * needs no low-level platform support. */
	if (state == PM_SUSPEND_DISK)
		return 1;

	/* Everything else needs low-level support.  A platform without
	 * a ->valid callback accepts all states. */
	if (!pm_ops)
		return 0;
	if (pm_ops->valid && !pm_ops->valid(state))
		return 0;
	return 1;
}
|
||||
|
||||
#ifdef CONFIG_PM_CPU_MODE
|
||||
/* CONFIG_PM_CPU_MODE prepare step; currently just delegates to the
 * regular suspend_prepare(). */
static int suspend_pm_cpu_mode_prepare(suspend_state_t state)
{
	return suspend_prepare(state);
}
|
||||
|
||||
|
||||
/*
 * suspend_pm_cpu_mode_enter - enter the CPU-mode sleep state.
 * @state: target state, passed through to pm_ops->enter().
 *
 * CPU-mode counterpart of suspend_enter().  device_power_down() is
 * deliberately skipped on this path (it was disabled in the original
 * code), yet device_power_up() still runs on the way out --
 * NOTE(review): confirm this asymmetry is intentional.
 *
 * Fix: the orphaned "Done:" label (its only goto lived in the removed
 * dead code) triggered -Wunused-label; drop it.
 */
int suspend_pm_cpu_mode_enter(suspend_state_t state)
{
	int error;
	unsigned long flags;

	printk("suspend_cpu_mode_enter\n");
	local_irq_save(flags);

	error = pm_ops->enter(state);

	device_power_up();
	local_irq_restore(flags);
	return error;
}
|
||||
|
||||
|
||||
/* Wakeup cleanup for the CPU-mode path; same sequence as
 * suspend_finish(). */
static void suspend_pm_cpu_mode_finish(suspend_state_t state)
{
	enable_nonboot_cpus();
	pm_finish(state);
	device_resume();
//	s3c24xx_cpu_mode_serial_resume();
	resume_console();
	thaw_processes();
	pm_restore_console();
}
|
||||
|
||||
/* CPU-mode counterpart of enter_state(): same prepare/enter/finish
 * sequence, using the suspend_pm_cpu_mode_* variants, and notifying
 * the keypad driver once the transition is over. */
static int enter_pm_cpu_mode(suspend_state_t state)
{
	int error;

	if (!valid_state(state))
		return -ENODEV;

	/* Only one sleep transition at a time; fail instead of waiting. */
	if (!mutex_trylock(&pm_mutex))
		return -EBUSY;

	if (state == PM_SUSPEND_DISK) {
		error = pm_suspend_disk();
		goto Unlock;
	}

	pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]);
	if ((error = suspend_pm_cpu_mode_prepare(state)))
		goto Unlock;

//	pr_debug("PM: Entering %s sleep\n", pm_states[state]);
	error = suspend_pm_cpu_mode_enter(state);
	pr_debug("PM: Finishing wakeup.\n");
	suspend_pm_cpu_mode_finish(state);
 Unlock:
	mutex_unlock(&pm_mutex);
	/* Qisda, ShiYong Lin, 2009/09/28, Add the sleep event message when sleep {*/
	s3c_keypad_pm_sleep_message_to_ap(0);
//	printk(KERN_ERR "Sleep end enter_pm_cpu_mode, %d\n", state);
	/* } Qisda, ShiYong Lin, 2009/09/28, Add the sleep event message when sleep */
	return error;
}
|
||||
#endif
|
||||
|
||||
|
||||
/**
|
||||
* enter_state - Do common work of entering low-power state.
|
||||
* @state: pm_state structure for state we're entering.
|
||||
*
|
||||
* Make sure we're the only ones trying to enter a sleep state. Fail
|
||||
* if someone has beat us to it, since we don't want anything weird to
|
||||
* happen when we wake up.
|
||||
* Then, do the setup for suspend, enter the state, and cleaup (after
|
||||
* we've woken up).
|
||||
*/
|
||||
|
||||
static int enter_state(suspend_state_t state)
{
	int error;

	if (!valid_state(state))
		return -ENODEV;
	/* Only one sleep transition at a time; fail instead of waiting. */
	if (!mutex_trylock(&pm_mutex))
		return -EBUSY;

	/* Suspend-to-disk has its own complete code path. */
	if (state == PM_SUSPEND_DISK) {
		error = pm_suspend_disk();
		goto Unlock;
	}

	pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]);
	if ((error = suspend_prepare(state)))
		goto Unlock;

	pr_debug("PM: Entering %s sleep\n", pm_states[state]);
	error = suspend_enter(state);

	/* suspend_finish() runs even when suspend_enter() failed, to
	 * undo the work done by suspend_prepare(). */
	pr_debug("PM: Finishing wakeup.\n");
	suspend_finish(state);
 Unlock:
	mutex_unlock(&pm_mutex);
	/* Qisda, ShiYong Lin, 2009/09/28, Add the sleep event message when sleep {*/
	s3c_keypad_pm_sleep_message_to_ap(0);
	/* } Qisda, ShiYong Lin, 2009/09/28, Add the sleep event message when sleep */
	return error;
}
|
||||
|
||||
/*
|
||||
* This is main interface to the outside world. It needs to be
|
||||
* called from process context.
|
||||
*/
|
||||
int software_suspend(void)
{
	/* Suspend-to-disk entry point for external callers; process
	 * context required (enter_state() may sleep). */
	return enter_state(PM_SUSPEND_DISK);
}
|
||||
|
||||
|
||||
/**
|
||||
* pm_suspend - Externally visible function for suspending system.
|
||||
* @state: Enumarted value of state to enter.
|
||||
*
|
||||
* Determine whether or not value is within range, get state
|
||||
* structure, and enter (above).
|
||||
*/
|
||||
|
||||
/* Validate the requested state's range, then hand off to enter_state(). */
int pm_suspend(suspend_state_t state)
{
	if (state <= PM_SUSPEND_ON || state > PM_SUSPEND_MAX)
		return -EINVAL;

	return enter_state(state);
}
|
||||
|
||||
EXPORT_SYMBOL(pm_suspend);
|
||||
|
||||
decl_subsys(power,NULL,NULL);
|
||||
|
||||
|
||||
/**
|
||||
* state - control system power state.
|
||||
*
|
||||
* show() returns what states are supported, which is hard-coded to
|
||||
* 'standby' (Power-On Suspend), 'mem' (Suspend-to-RAM), and
|
||||
* 'disk' (Suspend-to-Disk).
|
||||
*
|
||||
* store() accepts one of those strings, translates it into the
|
||||
* proper enumerated value, and initiates a suspend transition.
|
||||
*/
|
||||
|
||||
static ssize_t state_show(struct subsystem * subsys, char * buf)
|
||||
{
|
||||
int i;
|
||||
char * s = buf;
|
||||
|
||||
for (i = 0; i < PM_SUSPEND_MAX; i++) {
|
||||
if (pm_states[i] && valid_state(i))
|
||||
s += sprintf(s,"%s ", pm_states[i]);
|
||||
}
|
||||
s += sprintf(s,"\n");
|
||||
return (s - buf);
|
||||
}
|
||||
|
||||
/* Translate the written string into a suspend_state_t and start the
 * transition.  With CONFIG_PM_CPU_MODE, "cpu" and "mem" are routed to
 * the CPU-mode path when the pm_cpu_mode flag is set. */
static ssize_t state_store(struct subsystem * subsys, const char * buf, size_t n)
{
	suspend_state_t state = PM_SUSPEND_STANDBY;
	const char * const *s;
	char *p;
	int error;
	int len;

	/* Compare only up to a trailing newline, if any. */
	p = memchr(buf, '\n', n);
	len = p ? p - buf : n;

	/* Scan pm_states[] for a matching name; on exit 'state' is the
	 * matching index, or PM_SUSPEND_MAX if nothing matched. */
	for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) {
		if (*s && !strncmp(buf, *s, len))
			break;
	}

#ifdef CONFIG_PM_CPU_MODE
	printk(KERN_ERR "state_store, %d\n", state);
	if (state < PM_SUSPEND_MAX && (state == PM_SUSPEND_CPU_MODE ||
		state == PM_SUSPEND_MEM) ){
		/* pm_cpu_mode selects the CPU-mode sleep implementation. */
		if(pm_cpu_mode){
//			state = 0;
			error = enter_pm_cpu_mode(state);
		}
		else{
			error = enter_state(state);
		}
	}
	else{
		error = -EINVAL;
	}
	printk(KERN_ERR "end, leave state_store, %d\n", state);
#else
	if (state < PM_SUSPEND_MAX && *s)
		error = enter_state(state);
	else
		error = -EINVAL;
#endif
	return error ? error : n;
}
|
||||
|
||||
power_attr(state);
|
||||
|
||||
#ifdef CONFIG_PM_TRACE
|
||||
int pm_trace_enabled;
|
||||
|
||||
/* Report the current pm_trace flag (0 or 1). */
static ssize_t pm_trace_show(struct subsystem * subsys, char * buf)
{
	return sprintf(buf, "%d\n", pm_trace_enabled);
}
|
||||
|
||||
static ssize_t
|
||||
pm_trace_store(struct subsystem * subsys, const char * buf, size_t n)
|
||||
{
|
||||
int val;
|
||||
|
||||
if (sscanf(buf, "%d", &val) == 1) {
|
||||
pm_trace_enabled = !!val;
|
||||
return n;
|
||||
}
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
power_attr(pm_trace);
|
||||
|
||||
/* Attributes exported under /sys/power; pm_trace only exists when
 * CONFIG_PM_TRACE is enabled (the #ifdef opens earlier in the file). */
static struct attribute * g[] = {
	&state_attr.attr,
	&pm_trace_attr.attr,
	NULL,
};
#else
static struct attribute * g[] = {
	&state_attr.attr,
	NULL,
};
#endif /* CONFIG_PM_TRACE */

static struct attribute_group attr_group = {
	.attrs = g,
};
|
||||
|
||||
|
||||
/* Register the 'power' subsystem, then its sysfs attributes. */
static int __init pm_init(void)
{
	int error;

	error = subsystem_register(&power_subsys);
	if (error)
		return error;

	return sysfs_create_group(&power_subsys.kset.kobj, &attr_group);
}
|
||||
|
||||
core_initcall(pm_init);
|
||||
209
kernel/power/pm.c
Normal file
209
kernel/power/pm.c
Normal file
@@ -0,0 +1,209 @@
|
||||
/*
|
||||
* pm.c - Power management interface
|
||||
*
|
||||
* Copyright (C) 2000 Andrew Henroid
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
*/
|
||||
#include <linux/init.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/spinlock.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/pm.h>
|
||||
#include <linux/pm_legacy.h>
|
||||
#include <linux/interrupt.h>
|
||||
#include <linux/mutex.h>
|
||||
|
||||
int pm_active;
|
||||
|
||||
/*
|
||||
* Locking notes:
|
||||
* pm_devs_lock can be a semaphore providing pm ops are not called
|
||||
* from an interrupt handler (already a bad idea so no change here). Each
|
||||
* change must be protected so that an unlink of an entry doesn't clash
|
||||
* with a pm send - which is permitted to sleep in the current architecture
|
||||
*
|
||||
* Module unloads clashing with pm events now work out safely, the module
|
||||
* unload path will block until the event has been sent. It may well block
|
||||
* until a resume but that will be fine.
|
||||
*/
|
||||
|
||||
static DEFINE_MUTEX(pm_devs_lock);
|
||||
static LIST_HEAD(pm_devs);
|
||||
|
||||
/**
|
||||
* pm_register - register a device with power management
|
||||
* @type: device type
|
||||
* @id: device ID
|
||||
* @callback: callback function
|
||||
*
|
||||
* Add a device to the list of devices that wish to be notified about
|
||||
* power management events. A &pm_dev structure is returned on success,
|
||||
* on failure the return is %NULL.
|
||||
*
|
||||
* The callback function will be called in process context and
|
||||
* it may sleep.
|
||||
*/
|
||||
|
||||
struct pm_dev *pm_register(pm_dev_t type,
|
||||
unsigned long id,
|
||||
pm_callback callback)
|
||||
{
|
||||
struct pm_dev *dev = kzalloc(sizeof(struct pm_dev), GFP_KERNEL);
|
||||
if (dev) {
|
||||
dev->type = type;
|
||||
dev->id = id;
|
||||
dev->callback = callback;
|
||||
|
||||
mutex_lock(&pm_devs_lock);
|
||||
list_add(&dev->entry, &pm_devs);
|
||||
mutex_unlock(&pm_devs_lock);
|
||||
}
|
||||
return dev;
|
||||
}
|
||||
|
||||
/**
|
||||
* pm_send - send request to a single device
|
||||
* @dev: device to send to
|
||||
* @rqst: power management request
|
||||
* @data: data for the callback
|
||||
*
|
||||
* Issue a power management request to a given device. The
|
||||
* %PM_SUSPEND and %PM_RESUME events are handled specially. The
|
||||
* data field must hold the intended next state. No call is made
|
||||
* if the state matches.
|
||||
*
|
||||
* BUGS: what stops two power management requests occurring in parallel
|
||||
* and conflicting.
|
||||
*
|
||||
* WARNING: Calling pm_send directly is not generally recommended, in
|
||||
* particular there is no locking against the pm_dev going away. The
|
||||
* caller must maintain all needed locking or have 'inside knowledge'
|
||||
* on the safety. Also remember that this function is not locked against
|
||||
* pm_unregister. This means that you must handle SMP races on callback
|
||||
* execution and unload yourself.
|
||||
*/
|
||||
|
||||
static int pm_send(struct pm_dev *dev, pm_request_t rqst, void *data)
{
	int status = 0;
	unsigned long prev_state, next_state;

	/* Callbacks may sleep; calling from interrupt context is a bug.
	 * BUG_ON() is the idiomatic form of "if (cond) BUG();". */
	BUG_ON(in_interrupt());

	switch (rqst) {
	case PM_SUSPEND:
	case PM_RESUME:
		/* For state transitions, 'data' carries the target state;
		 * skip the callback when the device is already there. */
		prev_state = dev->state;
		next_state = (unsigned long) data;
		if (prev_state != next_state) {
			if (dev->callback)
				status = (*dev->callback)(dev, rqst, data);
			if (!status) {
				dev->state = next_state;
				dev->prev_state = prev_state;
			}
		} else {
			dev->prev_state = prev_state;
		}
		break;
	default:
		/* Other requests are forwarded unconditionally. */
		if (dev->callback)
			status = (*dev->callback)(dev, rqst, data);
		break;
	}
	return status;
}
|
||||
|
||||
/*
|
||||
* Undo incomplete request
|
||||
*/
|
||||
/* Walk backwards from @last toward the list head, returning every
 * device that actually changed state to its previous state. */
static void pm_undo_all(struct pm_dev *last)
{
	struct list_head *entry = last->entry.prev;
	while (entry != &pm_devs) {
		struct pm_dev *dev = list_entry(entry, struct pm_dev, entry);
		if (dev->state != dev->prev_state) {
			/* previous state was zero (running) resume or
			 * previous state was non-zero (suspended) suspend
			 */
			pm_request_t undo = (dev->prev_state
					? PM_SUSPEND:PM_RESUME);
			pm_send(dev, undo, (void*) dev->prev_state);
		}
		entry = entry->prev;
	}
}
|
||||
|
||||
/**
|
||||
* pm_send_all - send request to all managed devices
|
||||
* @rqst: power management request
|
||||
* @data: data for the callback
|
||||
*
|
||||
* Issue a power management request to a all devices. The
|
||||
* %PM_SUSPEND events are handled specially. Any device is
|
||||
* permitted to fail a suspend by returning a non zero (error)
|
||||
* value from its callback function. If any device vetoes a
|
||||
* suspend request then all other devices that have suspended
|
||||
* during the processing of this request are restored to their
|
||||
* previous state.
|
||||
*
|
||||
* WARNING: This function takes the pm_devs_lock. The lock is not dropped until
|
||||
* the callbacks have completed. This prevents races against pm locking
|
||||
* functions, races against module unload pm_unregister code. It does
|
||||
* mean however that you must not issue pm_ functions within the callback
|
||||
* or you will deadlock and users will hate you.
|
||||
*
|
||||
* Zero is returned on success. If a suspend fails then the status
|
||||
* from the device that vetoes the suspend is returned.
|
||||
*
|
||||
* BUGS: what stops two power management requests occurring in parallel
|
||||
* and conflicting.
|
||||
*/
|
||||
|
||||
int pm_send_all(pm_request_t rqst, void *data)
{
	struct list_head *entry;

	/* pm_devs_lock is held across the callbacks (see the warning
	 * above): callbacks must not call pm_* functions themselves. */
	mutex_lock(&pm_devs_lock);
	entry = pm_devs.next;
	while (entry != &pm_devs) {
		struct pm_dev *dev = list_entry(entry, struct pm_dev, entry);
		if (dev->callback) {
			int status = pm_send(dev, rqst, data);
			if (status) {
				/* return devices to previous state on
				 * failed suspend request
				 */
				if (rqst == PM_SUSPEND)
					pm_undo_all(dev);
				mutex_unlock(&pm_devs_lock);
				return status;
			}
		}
		entry = entry->next;
	}
	mutex_unlock(&pm_devs_lock);
	return 0;
}
|
||||
|
||||
EXPORT_SYMBOL(pm_register);
|
||||
EXPORT_SYMBOL(pm_send_all);
|
||||
EXPORT_SYMBOL(pm_active);
|
||||
|
||||
|
||||
179
kernel/power/power.h
Normal file
179
kernel/power/power.h
Normal file
@@ -0,0 +1,179 @@
|
||||
#include <linux/suspend.h>
|
||||
#include <linux/utsname.h>
|
||||
|
||||
/* Header of the hibernation image, page-aligned so it occupies its
 * own page on the swap device. */
struct swsusp_info {
	struct new_utsname	uts;		/* kernel identification strings */
	u32			version_code;	/* LINUX_VERSION_CODE of the saving kernel -- TODO confirm */
	unsigned long		num_physpages;
	int			cpus;
	unsigned long		image_pages;
	unsigned long		pages;
	unsigned long		size;
} __attribute__((aligned(PAGE_SIZE)));
|
||||
|
||||
|
||||
|
||||
#ifdef CONFIG_SOFTWARE_SUSPEND
|
||||
extern int pm_suspend_disk(void);
|
||||
|
||||
#else
|
||||
static inline int pm_suspend_disk(void)
|
||||
{
|
||||
return -EPERM;
|
||||
}
|
||||
#endif
|
||||
|
||||
extern struct mutex pm_mutex;
|
||||
|
||||
/*
 * power_attr(_name) - declare a 0644 sysfs attribute named "_name",
 * wired to the _name##_show() / _name##_store() functions, which must
 * already be defined at the point of use.
 */
#define power_attr(_name) \
static struct subsys_attribute _name##_attr = {	\
	.attr	= {				\
		.name = __stringify(_name),	\
		.mode = 0644,			\
	},					\
	.show	= _name##_show,			\
	.store	= _name##_store,		\
}
|
||||
|
||||
extern struct subsystem power_subsys;
|
||||
|
||||
/* References to section boundaries */
|
||||
extern const void __nosave_begin, __nosave_end;
|
||||
|
||||
/* Preferred image size in bytes (default 500 MB) */
|
||||
extern unsigned long image_size;
|
||||
extern int in_suspend;
|
||||
extern dev_t swsusp_resume_device;
|
||||
extern sector_t swsusp_resume_block;
|
||||
|
||||
extern asmlinkage int swsusp_arch_suspend(void);
|
||||
extern asmlinkage int swsusp_arch_resume(void);
|
||||
|
||||
extern unsigned int count_data_pages(void);
|
||||
|
||||
/**
|
||||
* Auxiliary structure used for reading the snapshot image data and
|
||||
* metadata from and writing them to the list of page backup entries
|
||||
* (PBEs) which is the main data structure of swsusp.
|
||||
*
|
||||
* Using struct snapshot_handle we can transfer the image, including its
|
||||
* metadata, as a continuous sequence of bytes with the help of
|
||||
* snapshot_read_next() and snapshot_write_next().
|
||||
*
|
||||
* The code that writes the image to a storage or transfers it to
|
||||
* the user land is required to use snapshot_read_next() for this
|
||||
* purpose and it should not make any assumptions regarding the internal
|
||||
* structure of the image. Similarly, the code that reads the image from
|
||||
* a storage or transfers it from the user land is required to use
|
||||
* snapshot_write_next().
|
||||
*
|
||||
* This may allow us to change the internal structure of the image
|
||||
* in the future with considerably less effort.
|
||||
*/
|
||||
|
||||
/* Cursor state for the byte-stream view of the snapshot image (see the
 * explanatory comment above); block counts are in PAGE_SIZE units. */
struct snapshot_handle {
	loff_t		offset;	/* number of the last byte ready for reading
				 * or writing in the sequence
				 */
	unsigned int	cur;	/* number of the block of PAGE_SIZE bytes the
				 * next operation will refer to (ie. current)
				 */
	unsigned int	cur_offset;	/* offset with respect to the current
					 * block (for the next operation)
					 */
	unsigned int	prev;	/* number of the block of PAGE_SIZE bytes that
				 * was the current one previously
				 */
	void		*buffer;	/* address of the block to read from
					 * or write to
					 */
	unsigned int	buf_offset;	/* location to read from or write to,
					 * given as a displacement from 'buffer'
					 */
	int		sync_read;	/* Set to one to notify the caller of
					 * snapshot_write_next() that it may
					 * need to call wait_on_bio_chain()
					 */
};
|
||||
|
||||
/* This macro returns the address from/to which the caller of
|
||||
* snapshot_read_next()/snapshot_write_next() is allowed to
|
||||
* read/write data after the function returns
|
||||
*/
|
||||
#define data_of(handle) ((handle).buffer + (handle).buf_offset)
|
||||
|
||||
extern unsigned int snapshot_additional_pages(struct zone *zone);
|
||||
extern int snapshot_read_next(struct snapshot_handle *handle, size_t count);
|
||||
extern int snapshot_write_next(struct snapshot_handle *handle, size_t count);
|
||||
extern void snapshot_write_finalize(struct snapshot_handle *handle);
|
||||
extern int snapshot_image_loaded(struct snapshot_handle *handle);
|
||||
|
||||
/*
|
||||
* This structure is used to pass the values needed for the identification
|
||||
* of the resume swap area from a user space to the kernel via the
|
||||
* SNAPSHOT_SET_SWAP_AREA ioctl
|
||||
*/
|
||||
/* Passed from user space via SNAPSHOT_SET_SWAP_AREA to identify the
 * resume swap area; packed because the layout is a user/kernel ABI. */
struct resume_swap_area {
	loff_t offset;		/* offset of the swap header within 'dev' */
	u_int32_t dev;		/* device number of the swap partition */
} __attribute__((packed));
|
||||
|
||||
#define SNAPSHOT_IOC_MAGIC '3'
|
||||
#define SNAPSHOT_FREEZE _IO(SNAPSHOT_IOC_MAGIC, 1)
|
||||
#define SNAPSHOT_UNFREEZE _IO(SNAPSHOT_IOC_MAGIC, 2)
|
||||
#define SNAPSHOT_ATOMIC_SNAPSHOT _IOW(SNAPSHOT_IOC_MAGIC, 3, void *)
|
||||
#define SNAPSHOT_ATOMIC_RESTORE _IO(SNAPSHOT_IOC_MAGIC, 4)
|
||||
#define SNAPSHOT_FREE _IO(SNAPSHOT_IOC_MAGIC, 5)
|
||||
#define SNAPSHOT_SET_IMAGE_SIZE _IOW(SNAPSHOT_IOC_MAGIC, 6, unsigned long)
|
||||
#define SNAPSHOT_AVAIL_SWAP _IOR(SNAPSHOT_IOC_MAGIC, 7, void *)
|
||||
#define SNAPSHOT_GET_SWAP_PAGE _IOR(SNAPSHOT_IOC_MAGIC, 8, void *)
|
||||
#define SNAPSHOT_FREE_SWAP_PAGES _IO(SNAPSHOT_IOC_MAGIC, 9)
|
||||
#define SNAPSHOT_SET_SWAP_FILE _IOW(SNAPSHOT_IOC_MAGIC, 10, unsigned int)
|
||||
#define SNAPSHOT_S2RAM _IO(SNAPSHOT_IOC_MAGIC, 11)
|
||||
#define SNAPSHOT_PMOPS _IOW(SNAPSHOT_IOC_MAGIC, 12, unsigned int)
|
||||
#define SNAPSHOT_SET_SWAP_AREA _IOW(SNAPSHOT_IOC_MAGIC, 13, \
|
||||
struct resume_swap_area)
|
||||
#define SNAPSHOT_IOC_MAXNR 13
|
||||
|
||||
#define PMOPS_PREPARE 1
|
||||
#define PMOPS_ENTER 2
|
||||
#define PMOPS_FINISH 3
|
||||
|
||||
/**
|
||||
* The bitmap is used for tracing allocated swap pages
|
||||
*
|
||||
* The entire bitmap consists of a number of bitmap_page
|
||||
* structures linked with the help of the .next member.
|
||||
* Thus each page can be allocated individually, so we only
|
||||
* need to make 0-order memory allocations to create
|
||||
* the bitmap.
|
||||
*/
|
||||
|
||||
#define BITMAP_PAGE_SIZE (PAGE_SIZE - sizeof(void *))
|
||||
#define BITMAP_PAGE_CHUNKS (BITMAP_PAGE_SIZE / sizeof(long))
|
||||
#define BITS_PER_CHUNK (sizeof(long) * 8)
|
||||
#define BITMAP_PAGE_BITS (BITMAP_PAGE_CHUNKS * BITS_PER_CHUNK)
|
||||
|
||||
/* One page of the allocated-swap-pages bitmap (see comment above);
 * pages are chained through ->next so each is a 0-order allocation. */
struct bitmap_page {
	unsigned long		chunks[BITMAP_PAGE_CHUNKS];	/* the bits */
	struct bitmap_page	*next;	/* next page of the bitmap, or NULL */
};
|
||||
|
||||
extern void free_bitmap(struct bitmap_page *bitmap);
|
||||
extern struct bitmap_page *alloc_bitmap(unsigned int nr_bits);
|
||||
extern sector_t alloc_swapdev_block(int swap, struct bitmap_page *bitmap);
|
||||
extern void free_all_swap_pages(int swap, struct bitmap_page *bitmap);
|
||||
|
||||
extern int swsusp_check(void);
|
||||
extern int swsusp_shrink_memory(void);
|
||||
extern void swsusp_free(void);
|
||||
extern int swsusp_suspend(void);
|
||||
extern int swsusp_resume(void);
|
||||
extern int swsusp_read(void);
|
||||
extern int swsusp_write(void);
|
||||
extern void swsusp_close(void);
|
||||
extern int suspend_enter(suspend_state_t state);
|
||||
|
||||
struct timeval;
|
||||
extern void swsusp_show_speed(struct timeval *, struct timeval *,
|
||||
unsigned int, char *);
|
||||
44
kernel/power/poweroff.c
Normal file
44
kernel/power/poweroff.c
Normal file
@@ -0,0 +1,44 @@
|
||||
/*
|
||||
* poweroff.c - sysrq handler to gracefully power down machine.
|
||||
*
|
||||
* This file is released under the GPL v2
|
||||
*/
|
||||
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/sysrq.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/pm.h>
|
||||
#include <linux/workqueue.h>
|
||||
#include <linux/reboot.h>
|
||||
|
||||
/*
|
||||
* When the user hits Sys-Rq o to power down the machine this is the
|
||||
* callback we use.
|
||||
*/
|
||||
|
||||
/* Workqueue callback that performs the actual shutdown; runs in
 * process context. */
static void do_poweroff(struct work_struct *dummy)
{
	kernel_power_off();
}
|
||||
|
||||
static DECLARE_WORK(poweroff_work, do_poweroff);
|
||||
|
||||
/* Sys-Rq 'o' handler: defer to a workqueue so the actual power-off
 * runs in process context rather than in the sysrq callback. */
static void handle_poweroff(int key, struct tty_struct *tty)
{
	schedule_work(&poweroff_work);
}
|
||||
|
||||
/* Sys-Rq key operation: enabled even during boot (SYSRQ_ENABLE_BOOT). */
static struct sysrq_key_op	sysrq_poweroff_op = {
	.handler        = handle_poweroff,
	.help_msg       = "powerOff",
	.action_msg     = "Power Off",
	.enable_mask	= SYSRQ_ENABLE_BOOT,
};
|
||||
|
||||
static int pm_sysrq_init(void)
{
	/* Bind Sys-Rq 'o' to the graceful power-off handler. */
	register_sysrq_key('o', &sysrq_poweroff_op);
	return 0;
}
|
||||
|
||||
subsys_initcall(pm_sysrq_init);
|
||||
219
kernel/power/process.c
Normal file
219
kernel/power/process.c
Normal file
@@ -0,0 +1,219 @@
|
||||
/*
|
||||
* drivers/power/process.c - Functions for starting/stopping processes on
|
||||
* suspend transitions.
|
||||
*
|
||||
* Originally from swsusp.
|
||||
*/
|
||||
|
||||
|
||||
#undef DEBUG
|
||||
|
||||
#include <linux/smp_lock.h>
|
||||
#include <linux/interrupt.h>
|
||||
#include <linux/suspend.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/syscalls.h>
|
||||
#include <linux/freezer.h>
|
||||
|
||||
/*
|
||||
* Timeout for stopping processes
|
||||
*/
|
||||
#define TIMEOUT (20 * HZ)
|
||||
|
||||
#define FREEZER_KERNEL_THREADS 0
|
||||
#define FREEZER_USER_SPACE 1
|
||||
|
||||
static inline int freezeable(struct task_struct * p)
|
||||
{
|
||||
if ((p == current) ||
|
||||
(p->flags & PF_NOFREEZE) ||
|
||||
(p->exit_state == EXIT_ZOMBIE) ||
|
||||
(p->exit_state == EXIT_DEAD))
|
||||
return 0;
|
||||
return 1;
|
||||
}
|
||||
|
||||
/* Refrigerator is place where frozen processes are stored :-). */
|
||||
void refrigerator(void)
|
||||
{
|
||||
/* Hmm, should we be allowed to suspend when there are realtime
|
||||
processes around? */
|
||||
long save;
|
||||
save = current->state;
|
||||
pr_debug("%s entered refrigerator\n", current->comm);
|
||||
|
||||
frozen_process(current);
|
||||
spin_lock_irq(¤t->sighand->siglock);
|
||||
recalc_sigpending(); /* We sent fake signal, clean it up */
|
||||
spin_unlock_irq(¤t->sighand->siglock);
|
||||
|
||||
while (frozen(current)) {
|
||||
current->state = TASK_UNINTERRUPTIBLE;
|
||||
schedule();
|
||||
}
|
||||
pr_debug("%s left refrigerator\n", current->comm);
|
||||
current->state = save;
|
||||
}
|
||||
|
||||
/* Ask @p to enter the refrigerator: set its freeze flag and wake it
 * with a fake signal so it notices.  No-op if @p is already freezing
 * or frozen. */
static inline void freeze_process(struct task_struct *p)
{
	unsigned long flags;

	if (!freezing(p)) {
		rmb();	/* NOTE(review): presumably orders this check against
			 * the flag updates made by the thawing path -- confirm */
		if (!frozen(p)) {
			/* A stopped task would never run to see the freeze
			 * request; force a SIGSTOP so it wakes to handle it. */
			if (p->state == TASK_STOPPED)
				force_sig_specific(SIGSTOP, p);

			freeze(p);
			spin_lock_irqsave(&p->sighand->siglock, flags);
			signal_wake_up(p, p->state == TASK_STOPPED);
			spin_unlock_irqrestore(&p->sighand->siglock, flags);
		}
	}
}
|
||||
|
||||
/* Withdraw a pending freeze request from @p and clear the fake-signal
 * state it may have left behind. */
static void cancel_freezing(struct task_struct *p)
{
	unsigned long flags;

	if (freezing(p)) {
		pr_debug(" clean up: %s\n", p->comm);
		do_not_freeze(p);
		spin_lock_irqsave(&p->sighand->siglock, flags);
		recalc_sigpending_tsk(p);
		spin_unlock_irqrestore(&p->sighand->siglock, flags);
	}
}
|
||||
|
||||
/* True if @p owns a real user-space mm, i.e. it is not a kernel
 * thread (and not one that merely borrowed an mm). */
static inline int is_user_space(struct task_struct *p)
{
	return p->mm && !(p->flags & PF_BORROWED_MM);
}
|
||||
|
||||
/* Repeatedly sweep the task list, sending freeze requests to either
 * user-space tasks or kernel threads (per @freeze_user_space), until
 * every candidate is frozen or TIMEOUT expires.  Returns the number
 * of tasks that refused to freeze (0 on success). */
static unsigned int try_to_freeze_tasks(int freeze_user_space)
{
	struct task_struct *g, *p;
	unsigned long end_time;
	unsigned int todo;

	end_time = jiffies + TIMEOUT;
	do {
		todo = 0;
		read_lock(&tasklist_lock);
		do_each_thread(g, p) {
			if (!freezeable(p))
				continue;

			if (frozen(p))
				continue;

			/* A traced task whose tracer is already frozen can
			 * never respond; give up on it. */
			if (p->state == TASK_TRACED && frozen(p->parent)) {
				cancel_freezing(p);
				continue;
			}
			if (is_user_space(p)) {
				if (!freeze_user_space)
					continue;

				/* Freeze the task unless there is a vfork
				 * completion pending
				 */
				if (!p->vfork_done)
					freeze_process(p);
			} else {
				if (freeze_user_space)
					continue;

				freeze_process(p);
			}
			/* Count every task we had to send a request to;
			 * the outer loop retries while any remain. */
			todo++;
		} while_each_thread(g, p);
		read_unlock(&tasklist_lock);
		yield();			/* Yield is okay here */
		if (todo && time_after(jiffies, end_time))
			break;
	} while (todo);

	if (todo) {
		/* This does not unfreeze processes that are already frozen
		 * (we have slightly ugly calling convention in that respect,
		 * and caller must call thaw_processes() if something fails),
		 * but it cleans up leftover PF_FREEZE requests.
		 */
		printk("\n");
		printk(KERN_ERR "Stopping %s timed out after %d seconds "
				"(%d tasks refusing to freeze):\n",
				freeze_user_space ? "user space processes" :
					"kernel threads",
				TIMEOUT / HZ, todo);
		read_lock(&tasklist_lock);
		do_each_thread(g, p) {
			if (is_user_space(p) == !freeze_user_space)
				continue;

			if (freezeable(p) && !frozen(p))
				printk(KERN_ERR " %s\n", p->comm);

			cancel_freezing(p);
		} while_each_thread(g, p);
		read_unlock(&tasklist_lock);
	}

	return todo;
}
|
||||
|
||||
/**
 *	freeze_processes - tell processes to enter the refrigerator
 *
 *	Returns 0 on success, or the number of processes that didn't freeze,
 *	although they were told to.
 */
int freeze_processes(void)
{
	unsigned int nr_unfrozen;

	printk("Stopping tasks ... ");
	/* Freeze user space first; kernel threads keep running below so
	 * that sys_sync() can still do filesystem I/O. */
	nr_unfrozen = try_to_freeze_tasks(FREEZER_USER_SPACE);
	if (nr_unfrozen)
		return nr_unfrozen;

	/* Flush dirty data while kernel threads are still unfrozen */
	sys_sync();
	nr_unfrozen = try_to_freeze_tasks(FREEZER_KERNEL_THREADS);
	if (nr_unfrozen)
		return nr_unfrozen;

	printk("done.\n");
	BUG_ON(in_atomic());
	return 0;
}
|
||||
|
||||
/* Thaw every freezeable task of the selected class (user space when
 * @thaw_user_space is non-zero, kernel threads otherwise). */
static void thaw_tasks(int thaw_user_space)
{
	struct task_struct *g, *p;

	read_lock(&tasklist_lock);
	do_each_thread(g, p) {
		if (!freezeable(p))
			continue;

		/* Skip tasks of the other class */
		if (is_user_space(p) == !thaw_user_space)
			continue;

		/* thaw_process() returns 0 if the task was not frozen */
		if (!thaw_process(p))
			printk(KERN_WARNING " Strange, %s not stopped\n",
				p->comm );
	} while_each_thread(g, p);
	read_unlock(&tasklist_lock);
}
|
||||
|
||||
/* Thaw everything frozen by freeze_processes(), in reverse order:
 * kernel threads first, then user space. */
void thaw_processes(void)
{
	printk("Restarting tasks ... ");
	thaw_tasks(FREEZER_KERNEL_THREADS);
	thaw_tasks(FREEZER_USER_SPACE);
	schedule();
	printk("done.\n");
}
|
||||
|
||||
EXPORT_SYMBOL(refrigerator);
|
||||
1739
kernel/power/snapshot.c
Normal file
1739
kernel/power/snapshot.c
Normal file
File diff suppressed because it is too large
Load Diff
634
kernel/power/swap.c
Normal file
634
kernel/power/swap.c
Normal file
@@ -0,0 +1,634 @@
|
||||
/*
|
||||
* linux/kernel/power/swap.c
|
||||
*
|
||||
* This file provides functions for reading the suspend image from
|
||||
* and writing it to a swap partition.
|
||||
*
|
||||
* Copyright (C) 1998,2001-2005 Pavel Machek <pavel@suse.cz>
|
||||
* Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
|
||||
*
|
||||
* This file is released under the GPLv2.
|
||||
*
|
||||
*/
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/smp_lock.h>
|
||||
#include <linux/file.h>
|
||||
#include <linux/utsname.h>
|
||||
#include <linux/version.h>
|
||||
#include <linux/delay.h>
|
||||
#include <linux/bitops.h>
|
||||
#include <linux/genhd.h>
|
||||
#include <linux/device.h>
|
||||
#include <linux/buffer_head.h>
|
||||
#include <linux/bio.h>
|
||||
#include <linux/blkdev.h>
|
||||
#include <linux/swap.h>
|
||||
#include <linux/swapops.h>
|
||||
#include <linux/pm.h>
|
||||
|
||||
#include "power.h"
|
||||
|
||||
extern char resume_file[];
|
||||
|
||||
#define SWSUSP_SIG "S1SUSPEND"
|
||||
|
||||
/* On-disk swsusp signature page.  reserved[] pads the structure so that
 * .image, .orig_sig and .sig occupy the final 20 + sizeof(sector_t)
 * bytes of the page; the whole structure is exactly PAGE_SIZE. */
static struct swsusp_header {
	char reserved[PAGE_SIZE - 20 - sizeof(sector_t)];
	sector_t image;		/* sector of the image's first swap-map page */
	char orig_sig[10];	/* original swap signature, restored on resume */
	char sig[10];		/* SWSUSP_SIG while an image is present */
} __attribute__((packed, aligned(PAGE_SIZE))) swsusp_header;
|
||||
|
||||
/*
|
||||
* General things
|
||||
*/
|
||||
|
||||
/* Swap type (index) of the resume device; initialized to 0xffff until
 * swsusp_swap_check() determines the real value */
static unsigned short root_swap = 0xffff;
/* Block device the image is written to and read from */
static struct block_device *resume_bdev;
|
||||
|
||||
/**
 *	submit - submit BIO request.
 *	@rw:	READ or WRITE.
 *	@page_off:	physical offset of page.
 *	@page:	page we're reading or writing.
 *	@bio_chain:	list of pending biod (for async reading)
 *
 *	Straight from the textbook - allocate and initialize the bio.
 *	If we're reading, make sure the page is marked as dirty.
 *	Then submit it and, if @bio_chain == NULL, wait.
 */
static int submit(int rw, pgoff_t page_off, struct page *page,
			struct bio **bio_chain)
{
	struct bio *bio;

	bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1);
	if (!bio)
		return -ENOMEM;
	/* Convert page index to a 512-byte sector number */
	bio->bi_sector = page_off * (PAGE_SIZE >> 9);
	bio->bi_bdev = resume_bdev;
	/* NOTE(review): end_swap_bio_read is installed as the completion
	 * handler for WRITE bios too — confirm it is direction-agnostic. */
	bio->bi_end_io = end_swap_bio_read;

	if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
		printk("swsusp: ERROR: adding page to bio at %ld\n", page_off);
		bio_put(bio);
		return -EFAULT;
	}

	lock_page(page);
	bio_get(bio);

	if (bio_chain == NULL) {
		/* Synchronous path: the completion handler presumably
		 * unlocks the page, so waiting on the lock waits for I/O */
		submit_bio(rw | (1 << BIO_RW_SYNC), bio);
		wait_on_page_locked(page);
		if (rw == READ)
			bio_set_pages_dirty(bio);
		bio_put(bio);
	} else {
		if (rw == READ)
			get_page(page);	/* These pages are freed later */
		/* Link the bio onto the chain; wait_on_bio_chain() reaps it */
		bio->bi_private = *bio_chain;
		*bio_chain = bio;
		submit_bio(rw | (1 << BIO_RW_SYNC), bio);
	}
	return 0;
}
|
||||
|
||||
/* Read one page at swap offset @page_off into @addr; synchronous when
 * @bio_chain is NULL, otherwise queued on the chain. */
static int bio_read_page(pgoff_t page_off, void *addr, struct bio **bio_chain)
{
	return submit(READ, page_off, virt_to_page(addr), bio_chain);
}
|
||||
|
||||
/* Write one page from @addr to swap offset @page_off; synchronous when
 * @bio_chain is NULL, otherwise queued on the chain. */
static int bio_write_page(pgoff_t page_off, void *addr, struct bio **bio_chain)
{
	return submit(WRITE, page_off, virt_to_page(addr), bio_chain);
}
|
||||
|
||||
static int wait_on_bio_chain(struct bio **bio_chain)
|
||||
{
|
||||
struct bio *bio;
|
||||
struct bio *next_bio;
|
||||
int ret = 0;
|
||||
|
||||
if (bio_chain == NULL)
|
||||
return 0;
|
||||
|
||||
bio = *bio_chain;
|
||||
if (bio == NULL)
|
||||
return 0;
|
||||
while (bio) {
|
||||
struct page *page;
|
||||
|
||||
next_bio = bio->bi_private;
|
||||
page = bio->bi_io_vec[0].bv_page;
|
||||
wait_on_page_locked(page);
|
||||
if (!PageUptodate(page) || PageError(page))
|
||||
ret = -EIO;
|
||||
put_page(page);
|
||||
bio_put(bio);
|
||||
bio = next_bio;
|
||||
}
|
||||
*bio_chain = NULL;
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Saving part
|
||||
*/
|
||||
|
||||
/* Stamp SWSUSP_SIG into the resume swap's header page, recording @start
 * as the sector of the image's first swap-map page.  The original swap
 * signature is preserved in .orig_sig so resume can restore it. */
static int mark_swapfiles(sector_t start)
{
	int error;

	/* Re-read the on-disk header so only the signature fields change */
	bio_read_page(swsusp_resume_block, &swsusp_header, NULL);
	if (!memcmp("SWAP-SPACE",swsusp_header.sig, 10) ||
	    !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) {
		memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10);
		memcpy(swsusp_header.sig,SWSUSP_SIG, 10);
		swsusp_header.image = start;
		error = bio_write_page(swsusp_resume_block,
				&swsusp_header, NULL);
	} else {
		printk(KERN_ERR "swsusp: Swap header not found!\n");
		error = -ENODEV;
	}
	return error;
}
|
||||
|
||||
/**
 *	swsusp_swap_check - check if the resume device is a swap device
 *	and get its index (if so)
 *
 *	On success root_swap holds the swap type and resume_bdev is open
 *	for writing with a PAGE_SIZE block size.  Returns 0 or a negative
 *	error code.
 */

static int swsusp_swap_check(void) /* This is called before saving image */
{
	int res;

	res = swap_type_of(swsusp_resume_device, swsusp_resume_block,
			&resume_bdev);
	if (res < 0)
		return res;

	/* Remember the swap type; later allocations use root_swap */
	root_swap = res;
	res = blkdev_get(resume_bdev, FMODE_WRITE, O_RDWR);
	if (res)
		return res;

	res = set_blocksize(resume_bdev, PAGE_SIZE);
	if (res < 0)
		/* Undo blkdev_get() on failure */
		blkdev_put(resume_bdev);

	return res;
}
|
||||
|
||||
/**
 *	write_page - Write one page to given swap location.
 *	@buf:		Address we're writing.
 *	@offset:	Offset of the swap page we're writing to.
 *	@bio_chain:	Link the next write BIO here
 *
 *	Returns -ENOSPC for a zero @offset (failed swap allocation),
 *	otherwise the result of bio_write_page().
 */

static int write_page(void *buf, sector_t offset, struct bio **bio_chain)
{
	void *src;

	if (!offset)
		return -ENOSPC;

	if (bio_chain) {
		/* Async path: copy the data into a private page so the
		 * caller may reuse @buf before the I/O completes */
		src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH);
		if (src) {
			memcpy(src, buf, PAGE_SIZE);
		} else {
			WARN_ON_ONCE(1);
			bio_chain = NULL;	/* Go synchronous */
			src = buf;
		}
	} else {
		src = buf;
	}
	return bio_write_page(offset, src, bio_chain);
}
|
||||
|
||||
/*
|
||||
* The swap map is a data structure used for keeping track of each page
|
||||
* written to a swap partition. It consists of many swap_map_page
|
||||
* structures that contain each an array of MAP_PAGE_SIZE swap entries.
|
||||
* These structures are stored on the swap and linked together with the
|
||||
* help of the .next_swap member.
|
||||
*
|
||||
* The swap map is created during suspend. The swap map pages are
|
||||
* allocated and populated one at a time, so we only need one memory
|
||||
* page to set up the entire structure.
|
||||
*
|
||||
* During resume we also only need to use one swap_map_page structure
|
||||
* at a time.
|
||||
*/
|
||||
|
||||
/* Entries per swap-map page; the page's last sector_t slot is reserved
 * for the .next_swap link. */
#define MAP_PAGE_ENTRIES	(PAGE_SIZE / sizeof(sector_t) - 1)

struct swap_map_page {
	sector_t entries[MAP_PAGE_ENTRIES];	/* sectors of the data pages */
	sector_t next_swap;	/* sector of the next map page, 0 = end */
};
|
||||
|
||||
/**
 *	The swap_map_handle structure is used for handling swap in
 *	a file-alike way
 */

struct swap_map_handle {
	struct swap_map_page *cur;	/* in-memory copy of the current map page */
	sector_t cur_swap;		/* sector where *cur is/will be stored */
	struct bitmap_page *bitmap;	/* allocated swap slots (write side only) */
	unsigned int k;			/* next free index into cur->entries */
};
|
||||
|
||||
static void release_swap_writer(struct swap_map_handle *handle)
|
||||
{
|
||||
if (handle->cur)
|
||||
free_page((unsigned long)handle->cur);
|
||||
handle->cur = NULL;
|
||||
if (handle->bitmap)
|
||||
free_bitmap(handle->bitmap);
|
||||
handle->bitmap = NULL;
|
||||
}
|
||||
|
||||
/* Initialize @handle for writing: allocate the first map page, the slot
 * bitmap, and the swap slot where the first map page will be stored.
 * NOTE(review): if the very first allocation fails, ->bitmap is left
 * untouched — callers must zero the handle before calling this. */
static int get_swap_writer(struct swap_map_handle *handle)
{
	handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL);
	if (!handle->cur)
		return -ENOMEM;
	/* One bit per swap page on the resume device */
	handle->bitmap = alloc_bitmap(count_swap_pages(root_swap, 0));
	if (!handle->bitmap) {
		release_swap_writer(handle);
		return -ENOMEM;
	}
	handle->cur_swap = alloc_swapdev_block(root_swap, handle->bitmap);
	if (!handle->cur_swap) {
		release_swap_writer(handle);
		return -ENOSPC;
	}
	handle->k = 0;
	return 0;
}
|
||||
|
||||
/* Write one data page through the swap map: allocate a slot, write @buf
 * there (async when @bio_chain is given), and record the slot in the
 * current map page.  A full map page is flushed synchronously and a new
 * one is started. */
static int swap_write_page(struct swap_map_handle *handle, void *buf,
				struct bio **bio_chain)
{
	int error = 0;
	sector_t offset;

	if (!handle->cur)
		return -EINVAL;
	offset = alloc_swapdev_block(root_swap, handle->bitmap);
	/* write_page() maps a zero offset (failed allocation) to -ENOSPC */
	error = write_page(buf, offset, bio_chain);
	if (error)
		return error;
	handle->cur->entries[handle->k++] = offset;
	if (handle->k >= MAP_PAGE_ENTRIES) {
		/* Map page full: drain outstanding writes, link and flush it */
		error = wait_on_bio_chain(bio_chain);
		if (error)
			goto out;
		offset = alloc_swapdev_block(root_swap, handle->bitmap);
		if (!offset)
			return -ENOSPC;
		handle->cur->next_swap = offset;
		/* Written synchronously so ->cur can be reset and reused */
		error = write_page(handle->cur, handle->cur_swap, NULL);
		if (error)
			goto out;
		memset(handle->cur, 0, PAGE_SIZE);
		handle->cur_swap = offset;
		handle->k = 0;
	}
out:
	return error;
}
|
||||
|
||||
static int flush_swap_writer(struct swap_map_handle *handle)
|
||||
{
|
||||
if (handle->cur && handle->cur_swap)
|
||||
return write_page(handle->cur, handle->cur_swap, NULL);
|
||||
else
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
/**
 *	save_image - save the suspend image data
 *	@handle:	swap writer to send the pages through
 *	@snapshot:	source of the image pages
 *	@nr_to_write:	expected number of data pages (for progress/speed)
 */

static int save_image(struct swap_map_handle *handle,
			struct snapshot_handle *snapshot,
			unsigned int nr_to_write)
{
	unsigned int m;
	int ret;
	int error = 0;
	int nr_pages;
	int err2;
	struct bio *bio;
	struct timeval start;
	struct timeval stop;

	printk("Saving image data pages (%u pages) ... ", nr_to_write);
	/* One progress step per percent, at least one page */
	m = nr_to_write / 100;
	if (!m)
		m = 1;
	nr_pages = 0;
	bio = NULL;
	do_gettimeofday(&start);
	do {
		ret = snapshot_read_next(snapshot, PAGE_SIZE);
		if (ret > 0) {
			/* Queue the page for asynchronous writing */
			error = swap_write_page(handle, data_of(*snapshot),
						&bio);
			if (error)
				break;
			if (!(nr_pages % m))
				printk("\b\b\b\b%3d%%", nr_pages / m);
			nr_pages++;
		}
	} while (ret > 0);
	/* Drain all queued writes before reporting success */
	err2 = wait_on_bio_chain(&bio);
	do_gettimeofday(&stop);
	if (!error)
		error = err2;
	if (!error)
		printk("\b\b\b\bdone\n");
	swsusp_show_speed(&start, &stop, nr_to_write, "Wrote");
	return error;
}
|
||||
|
||||
/**
|
||||
* enough_swap - Make sure we have enough swap to save the image.
|
||||
*
|
||||
* Returns TRUE or FALSE after checking the total amount of swap
|
||||
* space avaiable from the resume partition.
|
||||
*/
|
||||
|
||||
static int enough_swap(unsigned int nr_pages)
|
||||
{
|
||||
unsigned int free_swap = count_swap_pages(root_swap, 1);
|
||||
|
||||
pr_debug("swsusp: free swap pages: %u\n", free_swap);
|
||||
return free_swap > nr_pages + PAGES_FOR_IO;
|
||||
}
|
||||
|
||||
/**
|
||||
* swsusp_write - Write entire image and metadata.
|
||||
*
|
||||
* It is important _NOT_ to umount filesystems at this point. We want
|
||||
* them synced (in case something goes wrong) but we DO not want to mark
|
||||
* filesystem clean: it is not. (And it does not matter, if we resume
|
||||
* correctly, we'll mark system clean, anyway.)
|
||||
*/
|
||||
|
||||
int swsusp_write(void)
|
||||
{
|
||||
struct swap_map_handle handle;
|
||||
struct snapshot_handle snapshot;
|
||||
struct swsusp_info *header;
|
||||
int error;
|
||||
|
||||
error = swsusp_swap_check();
|
||||
if (error) {
|
||||
printk(KERN_ERR "swsusp: Cannot find swap device, try "
|
||||
"swapon -a.\n");
|
||||
return error;
|
||||
}
|
||||
memset(&snapshot, 0, sizeof(struct snapshot_handle));
|
||||
error = snapshot_read_next(&snapshot, PAGE_SIZE);
|
||||
if (error < PAGE_SIZE) {
|
||||
if (error >= 0)
|
||||
error = -EFAULT;
|
||||
|
||||
goto out;
|
||||
}
|
||||
header = (struct swsusp_info *)data_of(snapshot);
|
||||
if (!enough_swap(header->pages)) {
|
||||
printk(KERN_ERR "swsusp: Not enough free swap\n");
|
||||
error = -ENOSPC;
|
||||
goto out;
|
||||
}
|
||||
error = get_swap_writer(&handle);
|
||||
if (!error) {
|
||||
sector_t start = handle.cur_swap;
|
||||
|
||||
error = swap_write_page(&handle, header, NULL);
|
||||
if (!error)
|
||||
error = save_image(&handle, &snapshot,
|
||||
header->pages - 1);
|
||||
|
||||
if (!error) {
|
||||
flush_swap_writer(&handle);
|
||||
printk("S");
|
||||
error = mark_swapfiles(start);
|
||||
printk("|\n");
|
||||
}
|
||||
}
|
||||
if (error)
|
||||
free_all_swap_pages(root_swap, handle.bitmap);
|
||||
release_swap_writer(&handle);
|
||||
out:
|
||||
swsusp_close();
|
||||
return error;
|
||||
}
|
||||
|
||||
/**
|
||||
* The following functions allow us to read data using a swap map
|
||||
* in a file-alike way
|
||||
*/
|
||||
|
||||
static void release_swap_reader(struct swap_map_handle *handle)
|
||||
{
|
||||
if (handle->cur)
|
||||
free_page((unsigned long)handle->cur);
|
||||
handle->cur = NULL;
|
||||
}
|
||||
|
||||
/* Initialize @handle for reading: load the first swap-map page from
 * sector @start (taken from the on-disk swsusp header). */
static int get_swap_reader(struct swap_map_handle *handle, sector_t start)
{
	int error;

	if (!start)
		return -EINVAL;

	handle->cur = (struct swap_map_page *)get_zeroed_page(__GFP_WAIT | __GFP_HIGH);
	if (!handle->cur)
		return -ENOMEM;

	/* Synchronously read the first map page */
	error = bio_read_page(start, handle->cur, NULL);
	if (error) {
		release_swap_reader(handle);
		return error;
	}
	handle->k = 0;
	return 0;
}
|
||||
|
||||
/* Read the next data page of the image into @buf (async when @bio_chain
 * is given).  When the current map page is exhausted, drain outstanding
 * reads and synchronously load the next map page; a zero next_swap link
 * marks the end of the map. */
static int swap_read_page(struct swap_map_handle *handle, void *buf,
				struct bio **bio_chain)
{
	sector_t offset;
	int error;

	if (!handle->cur)
		return -EINVAL;
	offset = handle->cur->entries[handle->k];
	if (!offset)
		return -EFAULT;
	error = bio_read_page(offset, buf, bio_chain);
	if (error)
		return error;
	if (++handle->k >= MAP_PAGE_ENTRIES) {
		/* Map page exhausted: finish pending reads, then advance */
		error = wait_on_bio_chain(bio_chain);
		handle->k = 0;
		offset = handle->cur->next_swap;
		if (!offset)
			release_swap_reader(handle);
		else if (!error)
			error = bio_read_page(offset, handle->cur, NULL);
	}
	return error;
}
|
||||
|
||||
/**
 *	load_image - load the image using the swap map handle
 *	@handle and the snapshot handle @snapshot
 *	(assume there are @nr_pages pages to load)
 */

static int load_image(struct swap_map_handle *handle,
			struct snapshot_handle *snapshot,
			unsigned int nr_to_read)
{
	unsigned int m;
	int error = 0;
	struct timeval start;
	struct timeval stop;
	struct bio *bio;
	int err2;
	unsigned nr_pages;

	printk("Loading image data pages (%u pages) ... ", nr_to_read);
	/* One progress step per percent, at least one page */
	m = nr_to_read / 100;
	if (!m)
		m = 1;
	nr_pages = 0;
	bio = NULL;
	do_gettimeofday(&start);
	for ( ; ; ) {
		error = snapshot_write_next(snapshot, PAGE_SIZE);
		if (error <= 0)
			break;
		error = swap_read_page(handle, data_of(*snapshot), &bio);
		if (error)
			break;
		/* The snapshot layer may require this page before the
		 * next one can be placed — drain the chain if so */
		if (snapshot->sync_read)
			error = wait_on_bio_chain(&bio);
		if (error)
			break;
		if (!(nr_pages % m))
			printk("\b\b\b\b%3d%%", nr_pages / m);
		nr_pages++;
	}
	err2 = wait_on_bio_chain(&bio);
	do_gettimeofday(&stop);
	if (!error)
		error = err2;
	if (!error) {
		printk("\b\b\b\bdone\n");
		snapshot_write_finalize(snapshot);
		/* Refuse a truncated image */
		if (!snapshot_image_loaded(snapshot))
			error = -ENODATA;
	}
	swsusp_show_speed(&start, &stop, nr_to_read, "Read");
	return error;
}
|
||||
|
||||
/* Read the entire suspend image from swap into the snapshot layer.
 * Expects resume_bdev to have been opened by swsusp_check(). */
int swsusp_read(void)
{
	int error;
	struct swap_map_handle handle;
	struct snapshot_handle snapshot;
	struct swsusp_info *header;

	if (IS_ERR(resume_bdev)) {
		pr_debug("swsusp: block device not initialised\n");
		return PTR_ERR(resume_bdev);
	}

	memset(&snapshot, 0, sizeof(struct snapshot_handle));
	/* The first snapshot page receives the image header */
	error = snapshot_write_next(&snapshot, PAGE_SIZE);
	if (error < PAGE_SIZE)
		return error < 0 ? error : -EFAULT;
	header = (struct swsusp_info *)data_of(snapshot);
	/* swsusp_header.image was read from disk by swsusp_check() */
	error = get_swap_reader(&handle, swsusp_header.image);
	if (!error)
		error = swap_read_page(&handle, header, NULL);
	if (!error)
		error = load_image(&handle, &snapshot, header->pages - 1);
	release_swap_reader(&handle);

	blkdev_put(resume_bdev);

	if (!error)
		pr_debug("swsusp: Reading resume file was successful\n");
	else
		pr_debug("swsusp: Error %d resuming\n", error);
	return error;
}
|
||||
|
||||
/**
|
||||
* swsusp_check - Check for swsusp signature in the resume device
|
||||
*/
|
||||
|
||||
int swsusp_check(void)
|
||||
{
|
||||
int error;
|
||||
|
||||
resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ);
|
||||
if (!IS_ERR(resume_bdev)) {
|
||||
set_blocksize(resume_bdev, PAGE_SIZE);
|
||||
memset(&swsusp_header, 0, sizeof(swsusp_header));
|
||||
error = bio_read_page(swsusp_resume_block,
|
||||
&swsusp_header, NULL);
|
||||
if (error)
|
||||
return error;
|
||||
|
||||
if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) {
|
||||
memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10);
|
||||
/* Reset swap signature now */
|
||||
error = bio_write_page(swsusp_resume_block,
|
||||
&swsusp_header, NULL);
|
||||
} else {
|
||||
return -EINVAL;
|
||||
}
|
||||
if (error)
|
||||
blkdev_put(resume_bdev);
|
||||
else
|
||||
pr_debug("swsusp: Signature found, resuming\n");
|
||||
} else {
|
||||
error = PTR_ERR(resume_bdev);
|
||||
}
|
||||
|
||||
if (error)
|
||||
pr_debug("swsusp: Error %d check for resume file\n", error);
|
||||
|
||||
return error;
|
||||
}
|
||||
|
||||
/**
|
||||
* swsusp_close - close swap device.
|
||||
*/
|
||||
|
||||
void swsusp_close(void)
|
||||
{
|
||||
if (IS_ERR(resume_bdev)) {
|
||||
pr_debug("swsusp: block device not initialised\n");
|
||||
return;
|
||||
}
|
||||
|
||||
blkdev_put(resume_bdev);
|
||||
}
|
||||
330
kernel/power/swsusp.c
Normal file
330
kernel/power/swsusp.c
Normal file
@@ -0,0 +1,330 @@
|
||||
/*
|
||||
* linux/kernel/power/swsusp.c
|
||||
*
|
||||
* This file provides code to write suspend image to swap and read it back.
|
||||
*
|
||||
* Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu>
|
||||
* Copyright (C) 1998,2001-2005 Pavel Machek <pavel@suse.cz>
|
||||
*
|
||||
* This file is released under the GPLv2.
|
||||
*
|
||||
* I'd like to thank the following people for their work:
|
||||
*
|
||||
* Pavel Machek <pavel@ucw.cz>:
|
||||
* Modifications, defectiveness pointing, being with me at the very beginning,
|
||||
* suspend to swap space, stop all tasks. Port to 2.4.18-ac and 2.5.17.
|
||||
*
|
||||
* Steve Doddi <dirk@loth.demon.co.uk>:
|
||||
* Support the possibility of hardware state restoring.
|
||||
*
|
||||
* Raph <grey.havens@earthling.net>:
|
||||
* Support for preserving states of network devices and virtual console
|
||||
* (including X and svgatextmode)
|
||||
*
|
||||
* Kurt Garloff <garloff@suse.de>:
|
||||
* Straightened the critical function in order to prevent compilers from
|
||||
* playing tricks with local variables.
|
||||
*
|
||||
* Andreas Mohr <a.mohr@mailto.de>
|
||||
*
|
||||
* Alex Badea <vampire@go.ro>:
|
||||
* Fixed runaway init
|
||||
*
|
||||
* Rafael J. Wysocki <rjw@sisk.pl>
|
||||
* Reworked the freeing of memory and the handling of swap
|
||||
*
|
||||
* More state savers are welcome. Especially for the scsi layer...
|
||||
*
|
||||
* For TODOs,FIXMEs also look in Documentation/power/swsusp.txt
|
||||
*/
|
||||
|
||||
#include <linux/mm.h>
|
||||
#include <linux/suspend.h>
|
||||
#include <linux/spinlock.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/major.h>
|
||||
#include <linux/swap.h>
|
||||
#include <linux/pm.h>
|
||||
#include <linux/swapops.h>
|
||||
#include <linux/bootmem.h>
|
||||
#include <linux/syscalls.h>
|
||||
#include <linux/highmem.h>
|
||||
#include <linux/time.h>
|
||||
|
||||
#include "power.h"
|
||||
|
||||
/*
|
||||
* Preferred image size in bytes (tunable via /sys/power/image_size).
|
||||
* When it is set to N, swsusp will do its best to ensure the image
|
||||
* size will not exceed N bytes, but if that is impossible, it will
|
||||
* try to create the smallest image possible.
|
||||
*/
|
||||
unsigned long image_size = 500 * 1024 * 1024;
|
||||
|
||||
int in_suspend __nosavedata = 0;
|
||||
|
||||
#ifdef CONFIG_HIGHMEM
/* Provided elsewhere when CONFIG_HIGHMEM is set */
unsigned int count_highmem_pages(void);
int restore_highmem(void);
#else
/* Without highmem both operations are trivial no-ops */
static inline int restore_highmem(void) { return 0; }
static inline unsigned int count_highmem_pages(void) { return 0; }
#endif
|
||||
|
||||
/**
|
||||
* The following functions are used for tracing the allocated
|
||||
* swap pages, so that they can be freed in case of an error.
|
||||
*
|
||||
* The functions operate on a linked bitmap structure defined
|
||||
* in power.h
|
||||
*/
|
||||
|
||||
void free_bitmap(struct bitmap_page *bitmap)
|
||||
{
|
||||
struct bitmap_page *bp;
|
||||
|
||||
while (bitmap) {
|
||||
bp = bitmap->next;
|
||||
free_page((unsigned long)bitmap);
|
||||
bitmap = bp;
|
||||
}
|
||||
}
|
||||
|
||||
struct bitmap_page *alloc_bitmap(unsigned int nr_bits)
|
||||
{
|
||||
struct bitmap_page *bitmap, *bp;
|
||||
unsigned int n;
|
||||
|
||||
if (!nr_bits)
|
||||
return NULL;
|
||||
|
||||
bitmap = (struct bitmap_page *)get_zeroed_page(GFP_KERNEL);
|
||||
bp = bitmap;
|
||||
for (n = BITMAP_PAGE_BITS; n < nr_bits; n += BITMAP_PAGE_BITS) {
|
||||
bp->next = (struct bitmap_page *)get_zeroed_page(GFP_KERNEL);
|
||||
bp = bp->next;
|
||||
if (!bp) {
|
||||
free_bitmap(bitmap);
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
return bitmap;
|
||||
}
|
||||
|
||||
static int bitmap_set(struct bitmap_page *bitmap, unsigned long bit)
|
||||
{
|
||||
unsigned int n;
|
||||
|
||||
n = BITMAP_PAGE_BITS;
|
||||
while (bitmap && n <= bit) {
|
||||
n += BITMAP_PAGE_BITS;
|
||||
bitmap = bitmap->next;
|
||||
}
|
||||
if (!bitmap)
|
||||
return -EINVAL;
|
||||
n -= BITMAP_PAGE_BITS;
|
||||
bit -= n;
|
||||
n = 0;
|
||||
while (bit >= BITS_PER_CHUNK) {
|
||||
bit -= BITS_PER_CHUNK;
|
||||
n++;
|
||||
}
|
||||
bitmap->chunks[n] |= (1UL << bit);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Allocate one swap page on device @swap, record it in @bitmap, and
 * return its sector; returns 0 when no page could be allocated or the
 * bitmap cannot track it (the page is released again in that case). */
sector_t alloc_swapdev_block(int swap, struct bitmap_page *bitmap)
{
	unsigned long offset;

	offset = swp_offset(get_swap_page_of_type(swap));
	if (offset) {
		if (bitmap_set(bitmap, offset))
			/* Cannot track the slot — give it back */
			swap_free(swp_entry(swap, offset));
		else
			return swapdev_block(swap, offset);
	}
	return 0;
}
|
||||
|
||||
/* Release every swap page whose bit is set in @bitmap back to device
 * @swap.  @bit tracks the absolute bit position across the whole list. */
void free_all_swap_pages(int swap, struct bitmap_page *bitmap)
{
	unsigned int bit, n;
	unsigned long test;

	bit = 0;
	while (bitmap) {
		for (n = 0; n < BITMAP_PAGE_CHUNKS; n++)
			/* Scan each chunk one bit at a time */
			for (test = 1UL; test; test <<= 1) {
				if (bitmap->chunks[n] & test)
					swap_free(swp_entry(swap, bit));
				bit++;
			}
		bitmap = bitmap->next;
	}
}
|
||||
|
||||
/**
 *	swsusp_show_speed - print the time elapsed between two events represented by
 *	@start and @stop
 *
 *	@nr_pages -	number of pages processed between @start and @stop
 *	@msg -		introductory message to print
 */

void swsusp_show_speed(struct timeval *start, struct timeval *stop,
			unsigned nr_pages, char *msg)
{
	s64 elapsed_centisecs64;
	int centisecs;
	int k;
	int kps;

	elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start);
	/* Convert nanoseconds to centiseconds in place */
	do_div(elapsed_centisecs64, NSEC_PER_SEC / 100);
	centisecs = elapsed_centisecs64;
	if (centisecs == 0)
		centisecs = 1;	/* avoid div-by-zero */
	k = nr_pages * (PAGE_SIZE / 1024);	/* kilobytes processed */
	kps = (k * 100) / centisecs;
	printk("%s %d kbytes in %d.%02d seconds (%d.%02d MB/s)\n", msg, k,
			centisecs / 100, centisecs % 100,
			kps / 1000, (kps % 1000) / 10);
}
|
||||
|
||||
/**
|
||||
* swsusp_shrink_memory - Try to free as much memory as needed
|
||||
*
|
||||
* ... but do not OOM-kill anyone
|
||||
*
|
||||
* Notice: all userland should be stopped before it is called, or
|
||||
* livelock is possible.
|
||||
*/
|
||||
|
||||
#define SHRINK_BITE 10000
/* Reclaim at most SHRINK_BITE pages per call */
static inline unsigned long __shrink_memory(long tmp)
{
	return shrink_all_memory(tmp > SHRINK_BITE ? SHRINK_BITE : tmp);
}
|
||||
|
||||
int swsusp_shrink_memory(void)
{
	long tmp;
	struct zone *zone;
	unsigned long pages = 0;
	unsigned int i = 0;
	char *p = "-\\|/";	/* progress spinner characters */
	struct timeval start, stop;

	printk("Shrinking memory... ");
	do_gettimeofday(&start);
	do {
		long size, highmem_size;

		/* Estimate how much memory the image will need */
		highmem_size = count_highmem_pages();
		size = count_data_pages() + PAGES_FOR_IO;
		tmp = size;
		size += highmem_size;
		for_each_zone (zone)
			if (populated_zone(zone)) {
				tmp += snapshot_additional_pages(zone);
				if (is_highmem(zone)) {
					highmem_size -=
					zone_page_state(zone, NR_FREE_PAGES);
				} else {
					tmp -= zone_page_state(zone, NR_FREE_PAGES);
					tmp += zone->lowmem_reserve[ZONE_NORMAL];
				}
			}

		if (highmem_size < 0)
			highmem_size = 0;

		tmp += highmem_size;
		if (tmp > 0) {
			/* Not enough free memory to hold the image */
			tmp = __shrink_memory(tmp);
			if (!tmp)
				return -ENOMEM;
			pages += tmp;
		} else if (size > image_size / PAGE_SIZE) {
			/* Image exceeds the preferred size — trim further */
			tmp = __shrink_memory(size - (image_size / PAGE_SIZE));
			pages += tmp;
		}
		printk("\b%c", p[i++%4]);
	} while (tmp > 0);
	do_gettimeofday(&stop);
	printk("\bdone (%lu pages freed)\n", pages);
	swsusp_show_speed(&start, &stop, pages, "Freed");

	return 0;
}
|
||||
|
||||
/* Power down devices, save CPU state and create the snapshot via
 * swsusp_arch_suspend(); execution resumes here on both the suspend
 * and the resume path.  Called with devices already suspended. */
int swsusp_suspend(void)
{
	int error;

	if ((error = arch_prepare_suspend()))
		return error;

	local_irq_disable();
	/* At this point, device_suspend() has been called, but *not*
	 * device_power_down(). We *must* device_power_down() now.
	 * Otherwise, drivers for some devices (e.g. interrupt controllers)
	 * become desynchronized with the actual state of the hardware
	 * at resume time, and evil weirdness ensues.
	 */
	if ((error = device_power_down(PMSG_FREEZE))) {
		printk(KERN_ERR "Some devices failed to power down, aborting suspend\n");
		goto Enable_irqs;
	}

	save_processor_state();
	if ((error = swsusp_arch_suspend()))
		printk(KERN_ERR "Error %d suspending\n", error);
	/* Restore control flow magically appears here */
	restore_processor_state();
	/* NOTE: device_power_up() is just a resume() for devices
	 * that suspended with irqs off ... no overall powerup.
	 */
	device_power_up();
 Enable_irqs:
	local_irq_enable();
	return error;
}
|
||||
|
||||
/* Switch to the loaded image via swsusp_arch_resume(); on success
 * execution continues where swsusp_arch_suspend() was called, so the
 * code after the call only runs on failure. */
int swsusp_resume(void)
{
	int error;

	local_irq_disable();
	/* NOTE: device_power_down() is just a suspend() with irqs off;
	 * it has no special "power things down" semantics
	 */
	if (device_power_down(PMSG_PRETHAW))
		printk(KERN_ERR "Some devices failed to power down, very bad\n");
	/* We'll ignore saved state, but this gets preempt count (etc) right */
	save_processor_state();
	error = restore_highmem();
	if (!error) {
		error = swsusp_arch_resume();
		/* The code below is only ever reached in case of a failure.
		 * Otherwise execution continues at place where
		 * swsusp_arch_suspend() was called
		 */
		BUG_ON(!error);
		/* This call to restore_highmem() undos the previous one */
		restore_highmem();
	}
	/* The only reason why swsusp_arch_resume() can fail is memory being
	 * very tight, so we have to free it as soon as we can to avoid
	 * subsequent failures
	 */
	swsusp_free();
	restore_processor_state();
	touch_softlockup_watchdog();
	device_power_up();
	local_irq_enable();
	return error;
}
|
||||
481
kernel/power/user.c
Normal file
481
kernel/power/user.c
Normal file
@@ -0,0 +1,481 @@
|
||||
/*
|
||||
* linux/kernel/power/user.c
|
||||
*
|
||||
* This file provides the user space interface for software suspend/resume.
|
||||
*
|
||||
* Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
|
||||
*
|
||||
* This file is released under the GPLv2.
|
||||
*
|
||||
*/
|
||||
|
||||
#include <linux/suspend.h>
|
||||
#include <linux/syscalls.h>
|
||||
#include <linux/reboot.h>
|
||||
#include <linux/string.h>
|
||||
#include <linux/device.h>
|
||||
#include <linux/miscdevice.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/swap.h>
|
||||
#include <linux/swapops.h>
|
||||
#include <linux/pm.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/console.h>
|
||||
#include <linux/cpu.h>
|
||||
#include <linux/freezer.h>
|
||||
|
||||
#include <asm/uaccess.h>
|
||||
|
||||
#include "power.h"
|
||||
|
||||
#define SNAPSHOT_MINOR 231
|
||||
|
||||
/* Per-open state of the snapshot device (single opener only). */
static struct snapshot_data {
	struct snapshot_handle handle;	/* image read/write cursor */
	int swap;			/* swap type for the image, -1 = none */
	struct bitmap_page *bitmap;	/* swap slots allocated for the image */
	int mode;			/* O_RDONLY (resume) or O_WRONLY (suspend) */
	char frozen;			/* non-zero once processes are frozen */
	char ready;	/* presumably set once a snapshot exists — set by
			 * ioctl code outside this chunk, confirm there */
	char platform_suspend;	/* presumably selects the platform suspend
				 * mode — set by ioctl code, confirm there */
} snapshot_state;
|
||||
|
||||
static atomic_t device_available = ATOMIC_INIT(1);
|
||||
|
||||
static int snapshot_open(struct inode *inode, struct file *filp)
|
||||
{
|
||||
struct snapshot_data *data;
|
||||
|
||||
if (!atomic_add_unless(&device_available, -1, 0))
|
||||
return -EBUSY;
|
||||
|
||||
if ((filp->f_flags & O_ACCMODE) == O_RDWR)
|
||||
return -ENOSYS;
|
||||
|
||||
nonseekable_open(inode, filp);
|
||||
data = &snapshot_state;
|
||||
filp->private_data = data;
|
||||
memset(&data->handle, 0, sizeof(struct snapshot_handle));
|
||||
if ((filp->f_flags & O_ACCMODE) == O_RDONLY) {
|
||||
data->swap = swsusp_resume_device ?
|
||||
swap_type_of(swsusp_resume_device, 0, NULL) : -1;
|
||||
data->mode = O_RDONLY;
|
||||
} else {
|
||||
data->swap = -1;
|
||||
data->mode = O_WRONLY;
|
||||
}
|
||||
data->bitmap = NULL;
|
||||
data->frozen = 0;
|
||||
data->ready = 0;
|
||||
data->platform_suspend = 0;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * snapshot_release - last close of /dev/snapshot.
 *
 * Frees the snapshot image and any swap pages handed out through
 * SNAPSHOT_GET_SWAP_PAGE, thaws user space if it is still frozen,
 * and returns the single-opener slot.
 */
static int snapshot_release(struct inode *inode, struct file *filp)
{
	struct snapshot_data *data;

	swsusp_free();
	data = filp->private_data;
	free_all_swap_pages(data->swap, data->bitmap);
	free_bitmap(data->bitmap);
	if (data->frozen) {
		/* Mirror SNAPSHOT_UNFREEZE: thaw under pm_mutex. */
		mutex_lock(&pm_mutex);
		thaw_processes();
		enable_nonboot_cpus();
		mutex_unlock(&pm_mutex);
	}
	atomic_inc(&device_available);
	return 0;
}
|
||||
|
||||
/*
 * snapshot_read - copy the next chunk of the snapshot image to user
 * space.  Returns the number of bytes transferred, 0 at end of image,
 * or a negative error code.
 */
static ssize_t snapshot_read(struct file *filp, char __user *buf,
                             size_t count, loff_t *offp)
{
	struct snapshot_data *data = filp->private_data;
	ssize_t ret = snapshot_read_next(&data->handle, count);

	if (ret <= 0)
		return ret;

	if (copy_to_user(buf, data_of(data->handle), ret))
		return -EFAULT;

	/* Keep the file offset in sync with the image position. */
	*offp = data->handle.offset;
	return ret;
}
|
||||
|
||||
/*
 * snapshot_write - accept the next chunk of a snapshot image from user
 * space.  Returns the number of bytes consumed, 0 when the image is
 * complete, or a negative error code.
 */
static ssize_t snapshot_write(struct file *filp, const char __user *buf,
                              size_t count, loff_t *offp)
{
	struct snapshot_data *data = filp->private_data;
	ssize_t ret = snapshot_write_next(&data->handle, count);

	if (ret <= 0)
		return ret;

	if (copy_from_user(data_of(data->handle), buf, ret))
		return -EFAULT;

	/* Keep the file offset in sync with the image position. */
	*offp = data->handle.offset;
	return ret;
}
|
||||
|
||||
static inline int platform_prepare(void)
|
||||
{
|
||||
int error = 0;
|
||||
|
||||
if (pm_ops && pm_ops->prepare)
|
||||
error = pm_ops->prepare(PM_SUSPEND_DISK);
|
||||
|
||||
return error;
|
||||
}
|
||||
|
||||
static inline void platform_finish(void)
|
||||
{
|
||||
if (pm_ops && pm_ops->finish)
|
||||
pm_ops->finish(PM_SUSPEND_DISK);
|
||||
}
|
||||
|
||||
/*
 * snapshot_suspend - create the hibernation image.
 *
 * @platform_suspend: nonzero to bracket the operation with the
 *	pm_ops platform prepare/finish hooks (SNAPSHOT_PMOPS mode).
 *
 * Runs the full atomic-snapshot sequence under pm_mutex: shrink
 * memory, suspend devices and consoles, take non-boot CPUs offline,
 * then call swsusp_suspend().  On any failure the steps already taken
 * are unwound in reverse order via the labels below.  Returns 0 on
 * success or a negative error code.
 */
static inline int snapshot_suspend(int platform_suspend)
{
	int error;

	mutex_lock(&pm_mutex);
	/* Free memory before shutting down devices. */
	error = swsusp_shrink_memory();
	if (error)
		goto Finish;

	if (platform_suspend) {
		error = platform_prepare();
		if (error)
			goto Finish;
	}
	suspend_console();
	error = device_suspend(PMSG_FREEZE);
	if (error)
		goto Resume_devices;

	error = disable_nonboot_cpus();
	if (!error) {
		/* swsusp_suspend() returns twice; in_suspend tells which leg. */
		in_suspend = 1;
		error = swsusp_suspend();
	}
	enable_nonboot_cpus();
 Resume_devices:
	if (platform_suspend)
		platform_finish();

	device_resume();
	resume_console();
 Finish:
	mutex_unlock(&pm_mutex);
	return error;
}
|
||||
|
||||
static inline int snapshot_restore(int platform_suspend)
|
||||
{
|
||||
int error;
|
||||
|
||||
mutex_lock(&pm_mutex);
|
||||
pm_prepare_console();
|
||||
if (platform_suspend) {
|
||||
error = platform_prepare();
|
||||
if (error)
|
||||
goto Finish;
|
||||
}
|
||||
suspend_console();
|
||||
error = device_suspend(PMSG_PRETHAW);
|
||||
if (error)
|
||||
goto Resume_devices;
|
||||
|
||||
error = disable_nonboot_cpus();
|
||||
if (!error)
|
||||
error = swsusp_resume();
|
||||
|
||||
enable_nonboot_cpus();
|
||||
Resume_devices:
|
||||
if (platform_suspend)
|
||||
platform_finish();
|
||||
|
||||
device_resume();
|
||||
resume_console();
|
||||
Finish:
|
||||
pm_restore_console();
|
||||
mutex_unlock(&pm_mutex);
|
||||
return error;
|
||||
}
|
||||
|
||||
/*
 * snapshot_ioctl - control interface of /dev/snapshot.
 *
 * Implements the SNAPSHOT_* ioctls used by the user-space suspend
 * tools: freezing/thawing tasks, creating and restoring the atomic
 * snapshot, allocating swap pages for the image, selecting the swap
 * device, entering S3, and driving the platform (ACPI) hibernation
 * hooks.  Requires CAP_SYS_ADMIN.  Returns 0 or a negative errno.
 */
static int snapshot_ioctl(struct inode *inode, struct file *filp,
                          unsigned int cmd, unsigned long arg)
{
	int error = 0;
	struct snapshot_data *data;
	loff_t avail;
	sector_t offset;

	/* Reject ioctls that do not belong to this device. */
	if (_IOC_TYPE(cmd) != SNAPSHOT_IOC_MAGIC)
		return -ENOTTY;
	if (_IOC_NR(cmd) > SNAPSHOT_IOC_MAXNR)
		return -ENOTTY;
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	data = filp->private_data;

	switch (cmd) {

	case SNAPSHOT_FREEZE:
		/* Freeze user space; idempotent if already frozen. */
		if (data->frozen)
			break;
		mutex_lock(&pm_mutex);
		if (freeze_processes()) {
			/* Partial freeze: undo before reporting failure. */
			thaw_processes();
			error = -EBUSY;
		}
		mutex_unlock(&pm_mutex);
		if (!error)
			data->frozen = 1;
		break;

	case SNAPSHOT_UNFREEZE:
		if (!data->frozen)
			break;
		mutex_lock(&pm_mutex);
		thaw_processes();
		mutex_unlock(&pm_mutex);
		data->frozen = 0;
		break;

	case SNAPSHOT_ATOMIC_SNAPSHOT:
		/* Image creation requires a frozen, read-mode, imageless state. */
		if (data->mode != O_RDONLY || !data->frozen || data->ready) {
			error = -EPERM;
			break;
		}
		error = snapshot_suspend(data->platform_suspend);
		if (!error)
			/* Tell user space whether we are suspending or resuming. */
			error = put_user(in_suspend, (unsigned int __user *)arg);
		if (!error)
			data->ready = 1;
		break;

	case SNAPSHOT_ATOMIC_RESTORE:
		snapshot_write_finalize(&data->handle);
		/* Only restore a fully loaded image in write mode while frozen. */
		if (data->mode != O_WRONLY || !data->frozen ||
		    !snapshot_image_loaded(&data->handle)) {
			error = -EPERM;
			break;
		}
		error = snapshot_restore(data->platform_suspend);
		break;

	case SNAPSHOT_FREE:
		/* Drop the image and reset the read/write position. */
		swsusp_free();
		memset(&data->handle, 0, sizeof(struct snapshot_handle));
		data->ready = 0;
		break;

	case SNAPSHOT_SET_IMAGE_SIZE:
		/* Preferred maximum image size, in bytes. */
		image_size = arg;
		break;

	case SNAPSHOT_AVAIL_SWAP:
		/* Report free swap on the selected device, in bytes. */
		avail = count_swap_pages(data->swap, 1);
		avail <<= PAGE_SHIFT;
		error = put_user(avail, (loff_t __user *)arg);
		break;

	case SNAPSHOT_GET_SWAP_PAGE:
		if (data->swap < 0 || data->swap >= MAX_SWAPFILES) {
			error = -ENODEV;
			break;
		}
		/* Lazily allocate the bitmap tracking pages we hand out. */
		if (!data->bitmap) {
			data->bitmap = alloc_bitmap(count_swap_pages(data->swap, 0));
			if (!data->bitmap) {
				error = -ENOMEM;
				break;
			}
		}
		offset = alloc_swapdev_block(data->swap, data->bitmap);
		if (offset) {
			/* Return the byte offset of the allocated swap page. */
			offset <<= PAGE_SHIFT;
			error = put_user(offset, (sector_t __user *)arg);
		} else {
			error = -ENOSPC;
		}
		break;

	case SNAPSHOT_FREE_SWAP_PAGES:
		if (data->swap < 0 || data->swap >= MAX_SWAPFILES) {
			error = -ENODEV;
			break;
		}
		free_all_swap_pages(data->swap, data->bitmap);
		free_bitmap(data->bitmap);
		data->bitmap = NULL;
		break;

	case SNAPSHOT_SET_SWAP_FILE:
		/* Refuse to switch devices once swap pages are outstanding. */
		if (!data->bitmap) {
			/*
			 * User space encodes device types as two-byte values,
			 * so we need to recode them
			 */
			if (old_decode_dev(arg)) {
				data->swap = swap_type_of(old_decode_dev(arg),
							  0, NULL);
				if (data->swap < 0)
					error = -ENODEV;
			} else {
				data->swap = -1;
				error = -EINVAL;
			}
		} else {
			error = -EPERM;
		}
		break;

	case SNAPSHOT_S2RAM:
		/* Suspend-to-RAM while frozen (suspend-to-both support). */
		if (!pm_ops) {
			error = -ENOSYS;
			break;
		}

		if (!data->frozen) {
			error = -EPERM;
			break;
		}

		if (!mutex_trylock(&pm_mutex)) {
			error = -EBUSY;
			break;
		}

		if (pm_ops->prepare) {
			error = pm_ops->prepare(PM_SUSPEND_MEM);
			if (error)
				goto OutS3;
		}

		/* Put devices to sleep */
		suspend_console();
		error = device_suspend(PMSG_SUSPEND);
		if (error) {
			printk(KERN_ERR "Failed to suspend some devices.\n");
		} else {
			error = disable_nonboot_cpus();
			if (!error) {
				/* Enter S3, system is already frozen */
				suspend_enter(PM_SUSPEND_MEM);
				enable_nonboot_cpus();
			}
			/* Wake up devices */
			device_resume();
		}
		resume_console();
		if (pm_ops->finish)
			pm_ops->finish(PM_SUSPEND_MEM);

 OutS3:
		mutex_unlock(&pm_mutex);
		break;

	case SNAPSHOT_PMOPS:
		/* Drive the platform (e.g. ACPI S4) hibernation sequence. */
		error = -EINVAL;

		switch (arg) {

		case PMOPS_PREPARE:
			if (pm_ops && pm_ops->enter) {
				data->platform_suspend = 1;
				error = 0;
			} else {
				error = -ENOSYS;
			}
			break;

		case PMOPS_ENTER:
			if (data->platform_suspend) {
				kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK);
				error = pm_ops->enter(PM_SUSPEND_DISK);
				/*
				 * NOTE(review): the assignment below discards
				 * the return value of pm_ops->enter() and
				 * always reports success — presumably
				 * intentional (->enter only returns on
				 * failure-to-power-off), but worth confirming.
				 */
				error = 0;
			}
			break;

		case PMOPS_FINISH:
			if (data->platform_suspend)
				error = 0;

			break;

		default:
			printk(KERN_ERR "SNAPSHOT_PMOPS: invalid argument %ld\n", arg);

		}
		break;

	case SNAPSHOT_SET_SWAP_AREA:
		if (data->bitmap) {
			/* Too late: swap pages already allocated. */
			error = -EPERM;
		} else {
			struct resume_swap_area swap_area;
			dev_t swdev;

			error = copy_from_user(&swap_area, (void __user *)arg,
					       sizeof(struct resume_swap_area));
			if (error) {
				error = -EFAULT;
				break;
			}

			/*
			 * User space encodes device types as two-byte values,
			 * so we need to recode them
			 */
			swdev = old_decode_dev(swap_area.dev);
			if (swdev) {
				offset = swap_area.offset;
				data->swap = swap_type_of(swdev, offset, NULL);
				if (data->swap < 0)
					error = -ENODEV;
			} else {
				data->swap = -1;
				error = -EINVAL;
			}
		}
		break;

	default:
		error = -ENOTTY;

	}

	return error;
}
|
||||
|
||||
/* File operations backing /dev/snapshot. */
static const struct file_operations snapshot_fops = {
	.open = snapshot_open,
	.release = snapshot_release,
	.read = snapshot_read,
	.write = snapshot_write,
	.llseek = no_llseek,	/* image is a stream; seeking is meaningless */
	.ioctl = snapshot_ioctl,
};
|
||||
|
||||
/* Misc-device descriptor: registers /dev/snapshot (minor 231). */
static struct miscdevice snapshot_device = {
	.minor = SNAPSHOT_MINOR,
	.name = "snapshot",
	.fops = &snapshot_fops,
};
|
||||
|
||||
/*
 * snapshot_device_init - register /dev/snapshot at boot.
 * (Fix: dropped the stray semicolon that followed the function body.)
 */
static int __init snapshot_device_init(void)
{
	return misc_register(&snapshot_device);
}

device_initcall(snapshot_device_init);
|
||||
1134
kernel/printk.c
Normal file
1134
kernel/printk.c
Normal file
File diff suppressed because it is too large
Load Diff
592
kernel/profile.c
Normal file
592
kernel/profile.c
Normal file
@@ -0,0 +1,592 @@
|
||||
/*
|
||||
* linux/kernel/profile.c
|
||||
* Simple profiling. Manages a direct-mapped profile hit count buffer,
|
||||
* with configurable resolution, support for restricting the cpus on
|
||||
* which profiling is done, and switching between cpu time and
|
||||
* schedule() calls via kernel command line parameters passed at boot.
|
||||
*
|
||||
* Scheduler profiling support, Arjan van de Ven and Ingo Molnar,
|
||||
* Red Hat, July 2004
|
||||
* Consolidation of architecture support code for profiling,
|
||||
* William Irwin, Oracle, July 2004
|
||||
* Amortized hit count accounting via per-cpu open-addressed hashtables
|
||||
* to resolve timer interrupt livelocks, William Irwin, Oracle, 2004
|
||||
*/
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/profile.h>
|
||||
#include <linux/bootmem.h>
|
||||
#include <linux/notifier.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/cpumask.h>
|
||||
#include <linux/cpu.h>
|
||||
#include <linux/profile.h>
|
||||
#include <linux/highmem.h>
|
||||
#include <linux/mutex.h>
|
||||
#include <asm/sections.h>
|
||||
#include <asm/semaphore.h>
|
||||
#include <asm/irq_regs.h>
|
||||
|
||||
/*
 * One pending profile hit: a program-counter slot and the number of
 * hits accumulated for it in the per-CPU hash table.
 */
struct profile_hit {
	u32 pc, hits;
};
/* Open-addressing probe group: 2^3 = 8 consecutive entries per probe. */
#define PROFILE_GRPSHIFT 3
#define PROFILE_GRPSZ (1 << PROFILE_GRPSHIFT)
/* Each per-CPU hash table occupies exactly one page. */
#define NR_PROFILE_HIT (PAGE_SIZE/sizeof(struct profile_hit))
#define NR_PROFILE_GRP (NR_PROFILE_HIT/PROFILE_GRPSZ)
|
||||
|
||||
/* Oprofile timer tick hook */
int (*timer_hook)(struct pt_regs *) __read_mostly;

/* Global profile buffer and its geometry (set up in profile_init()). */
static atomic_t *prof_buffer;
static unsigned long prof_len, prof_shift;

/* Profiling mode (CPU_/SCHED_/SLEEP_/KVM_PROFILING); 0 = disabled. */
int prof_on __read_mostly;
EXPORT_SYMBOL_GPL(prof_on);

/* CPUs whose ticks feed the profile; writable via /proc/irq/prof_cpu_mask. */
static cpumask_t prof_cpu_mask = CPU_MASK_ALL;
#ifdef CONFIG_SMP
/* Two hash tables per CPU, alternated by profile_flip_buffers(). */
static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits);
static DEFINE_PER_CPU(int, cpu_profile_flip);
static DEFINE_MUTEX(profile_flip_mutex);
#endif /* CONFIG_SMP */
|
||||
|
||||
static int __init profile_setup(char * str)
|
||||
{
|
||||
static char __initdata schedstr[] = "schedule";
|
||||
static char __initdata sleepstr[] = "sleep";
|
||||
static char __initdata kvmstr[] = "kvm";
|
||||
int par;
|
||||
|
||||
if (!strncmp(str, sleepstr, strlen(sleepstr))) {
|
||||
prof_on = SLEEP_PROFILING;
|
||||
if (str[strlen(sleepstr)] == ',')
|
||||
str += strlen(sleepstr) + 1;
|
||||
if (get_option(&str, &par))
|
||||
prof_shift = par;
|
||||
printk(KERN_INFO
|
||||
"kernel sleep profiling enabled (shift: %ld)\n",
|
||||
prof_shift);
|
||||
} else if (!strncmp(str, schedstr, strlen(schedstr))) {
|
||||
prof_on = SCHED_PROFILING;
|
||||
if (str[strlen(schedstr)] == ',')
|
||||
str += strlen(schedstr) + 1;
|
||||
if (get_option(&str, &par))
|
||||
prof_shift = par;
|
||||
printk(KERN_INFO
|
||||
"kernel schedule profiling enabled (shift: %ld)\n",
|
||||
prof_shift);
|
||||
} else if (!strncmp(str, kvmstr, strlen(kvmstr))) {
|
||||
prof_on = KVM_PROFILING;
|
||||
if (str[strlen(kvmstr)] == ',')
|
||||
str += strlen(kvmstr) + 1;
|
||||
if (get_option(&str, &par))
|
||||
prof_shift = par;
|
||||
printk(KERN_INFO
|
||||
"kernel KVM profiling enabled (shift: %ld)\n",
|
||||
prof_shift);
|
||||
} else if (get_option(&str, &par)) {
|
||||
prof_shift = par;
|
||||
prof_on = CPU_PROFILING;
|
||||
printk(KERN_INFO "kernel profiling enabled (shift: %ld)\n",
|
||||
prof_shift);
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
__setup("profile=", profile_setup);
|
||||
|
||||
|
||||
/*
 * profile_init - allocate the global profile buffer at boot.
 * One atomic_t counter per (1 << prof_shift)-byte slice of kernel text.
 */
void __init profile_init(void)
{
	if (!prof_on)
		return;

	/* only text is profiled */
	prof_len = (_etext - _stext) >> prof_shift;
	prof_buffer = alloc_bootmem(prof_len*sizeof(atomic_t));
}
|
||||
|
||||
/* Profile event notifications */
|
||||
|
||||
#ifdef CONFIG_PROFILING
|
||||
|
||||
/* Notifier chains fired by the profile_* event hooks below.
 * task_free_notifier is atomic because it may fire in atomic context. */
static BLOCKING_NOTIFIER_HEAD(task_exit_notifier);
static ATOMIC_NOTIFIER_HEAD(task_free_notifier);
static BLOCKING_NOTIFIER_HEAD(munmap_notifier);
|
||||
|
||||
/* Notify listeners that @task is exiting. */
void profile_task_exit(struct task_struct * task)
{
	blocking_notifier_call_chain(&task_exit_notifier, 0, task);
}
|
||||
|
||||
int profile_handoff_task(struct task_struct * task)
|
||||
{
|
||||
int ret;
|
||||
ret = atomic_notifier_call_chain(&task_free_notifier, 0, task);
|
||||
return (ret == NOTIFY_OK) ? 1 : 0;
|
||||
}
|
||||
|
||||
/* Notify listeners that the mapping at @addr is being unmapped. */
void profile_munmap(unsigned long addr)
{
	blocking_notifier_call_chain(&munmap_notifier, 0, (void *)addr);
}
|
||||
|
||||
/* Subscribe @n to task-free handoff notifications. */
int task_handoff_register(struct notifier_block * n)
{
	return atomic_notifier_chain_register(&task_free_notifier, n);
}
|
||||
|
||||
/* Unsubscribe @n from task-free handoff notifications. */
int task_handoff_unregister(struct notifier_block * n)
{
	return atomic_notifier_chain_unregister(&task_free_notifier, n);
}
|
||||
|
||||
int profile_event_register(enum profile_type type, struct notifier_block * n)
|
||||
{
|
||||
int err = -EINVAL;
|
||||
|
||||
switch (type) {
|
||||
case PROFILE_TASK_EXIT:
|
||||
err = blocking_notifier_chain_register(
|
||||
&task_exit_notifier, n);
|
||||
break;
|
||||
case PROFILE_MUNMAP:
|
||||
err = blocking_notifier_chain_register(
|
||||
&munmap_notifier, n);
|
||||
break;
|
||||
}
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
|
||||
int profile_event_unregister(enum profile_type type, struct notifier_block * n)
|
||||
{
|
||||
int err = -EINVAL;
|
||||
|
||||
switch (type) {
|
||||
case PROFILE_TASK_EXIT:
|
||||
err = blocking_notifier_chain_unregister(
|
||||
&task_exit_notifier, n);
|
||||
break;
|
||||
case PROFILE_MUNMAP:
|
||||
err = blocking_notifier_chain_unregister(
|
||||
&munmap_notifier, n);
|
||||
break;
|
||||
}
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
/*
 * register_timer_hook - install the (single) oprofile timer-tick hook.
 * Returns -EBUSY if a hook is already installed.
 * NOTE(review): the check-then-set is not atomic; presumably callers
 * are serialized externally — confirm before relying on it.
 */
int register_timer_hook(int (*hook)(struct pt_regs *))
{
	if (timer_hook)
		return -EBUSY;
	timer_hook = hook;
	return 0;
}
|
||||
|
||||
/*
 * unregister_timer_hook - remove the timer-tick hook and wait until no
 * CPU can still be executing it.
 */
void unregister_timer_hook(int (*hook)(struct pt_regs *))
{
	WARN_ON(hook != timer_hook);
	timer_hook = NULL;
	/* make sure all CPUs see the NULL hook */
	synchronize_sched();  /* Allow ongoing interrupts to complete. */
}
|
||||
|
||||
EXPORT_SYMBOL_GPL(register_timer_hook);
|
||||
EXPORT_SYMBOL_GPL(unregister_timer_hook);
|
||||
EXPORT_SYMBOL_GPL(task_handoff_register);
|
||||
EXPORT_SYMBOL_GPL(task_handoff_unregister);
|
||||
|
||||
#endif /* CONFIG_PROFILING */
|
||||
|
||||
EXPORT_SYMBOL_GPL(profile_event_register);
|
||||
EXPORT_SYMBOL_GPL(profile_event_unregister);
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
/*
|
||||
* Each cpu has a pair of open-addressed hashtables for pending
|
||||
* profile hits. read_profile() IPI's all cpus to request them
|
||||
* to flip buffers and flushes their contents to prof_buffer itself.
|
||||
* Flip requests are serialized by the profile_flip_mutex. The sole
|
||||
* use of having a second hashtable is for avoiding cacheline
|
||||
* contention that would otherwise happen during flushes of pending
|
||||
* profile hits required for the accuracy of reported profile hits
|
||||
* and so resurrect the interrupt livelock issue.
|
||||
*
|
||||
* The open-addressed hashtables are indexed by profile buffer slot
|
||||
* and hold the number of pending hits to that profile buffer slot on
|
||||
* a cpu in an entry. When the hashtable overflows, all pending hits
|
||||
* are accounted to their corresponding profile buffer slots with
|
||||
* atomic_add() and the hashtable emptied. As numerous pending hits
|
||||
* may be accounted to a profile buffer slot in a hashtable entry,
|
||||
* this amortizes a number of atomic profile buffer increments likely
|
||||
* to be far larger than the number of entries in the hashtable,
|
||||
* particularly given that the number of distinct profile buffer
|
||||
* positions to which hits are accounted during short intervals (e.g.
|
||||
* several seconds) is usually very small. Exclusion from buffer
|
||||
* flipping is provided by interrupt disablement (note that for
|
||||
* SCHED_PROFILING or SLEEP_PROFILING profile_hit() may be called from
|
||||
* process context).
|
||||
* The hash function is meant to be lightweight as opposed to strong,
|
||||
* and was vaguely inspired by ppc64 firmware-supported inverted
|
||||
* pagetable hash functions, but uses a full hashtable full of finite
|
||||
* collision chains, not just pairs of them.
|
||||
*
|
||||
* -- wli
|
||||
*/
|
||||
static void __profile_flip_buffers(void *unused)
|
||||
{
|
||||
int cpu = smp_processor_id();
|
||||
|
||||
per_cpu(cpu_profile_flip, cpu) = !per_cpu(cpu_profile_flip, cpu);
|
||||
}
|
||||
|
||||
/*
 * profile_flip_buffers - retire the currently-active per-CPU hash
 * tables and flush their contents into the global prof_buffer.
 *
 * After the cross-CPU flip (IPI), table @j on every CPU is no longer
 * written to, so it can be drained without racing profile_hits().
 * Serialized against concurrent flips by profile_flip_mutex.
 */
static void profile_flip_buffers(void)
{
	int i, j, cpu;

	mutex_lock(&profile_flip_mutex);
	/* Remember which table was active before the flip: that one drains. */
	j = per_cpu(cpu_profile_flip, get_cpu());
	put_cpu();
	on_each_cpu(__profile_flip_buffers, NULL, 0, 1);
	for_each_online_cpu(cpu) {
		struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[j];
		for (i = 0; i < NR_PROFILE_HIT; ++i) {
			if (!hits[i].hits) {
				/* Empty entry; clear any stale pc marker. */
				if (hits[i].pc)
					hits[i].pc = 0;
				continue;
			}
			atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]);
			hits[i].hits = hits[i].pc = 0;
		}
	}
	mutex_unlock(&profile_flip_mutex);
}
|
||||
|
||||
/*
 * profile_discard_flip_buffers - throw away all pending per-CPU hits
 * instead of flushing them (used when the profile is being reset).
 * Flips first so the table being wiped is no longer written to.
 */
static void profile_discard_flip_buffers(void)
{
	int i, cpu;

	mutex_lock(&profile_flip_mutex);
	i = per_cpu(cpu_profile_flip, get_cpu());
	put_cpu();
	on_each_cpu(__profile_flip_buffers, NULL, 0, 1);
	for_each_online_cpu(cpu) {
		struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[i];
		memset(hits, 0, NR_PROFILE_HIT*sizeof(struct profile_hit));
	}
	mutex_unlock(&profile_flip_mutex);
}
|
||||
|
||||
/*
 * profile_hits - record @nr_hits profile hits at program counter @__pc
 * (SMP version).
 *
 * Hits are coalesced in the local CPU's active open-addressed hash
 * table; only on a full-table collision-chain miss are they flushed to
 * the global prof_buffer.  Interrupts are disabled to exclude
 * profile_flip_buffers()' IPI while the table is being modified.
 */
void profile_hits(int type, void *__pc, unsigned int nr_hits)
{
	unsigned long primary, secondary, flags, pc = (unsigned long)__pc;
	int i, j, cpu;
	struct profile_hit *hits;

	if (prof_on != type || !prof_buffer)
		return;
	/* Clamp out-of-range PCs into the last profile slot. */
	pc = min((pc - (unsigned long)_stext) >> prof_shift, prof_len - 1);
	i = primary = (pc & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT;
	secondary = (~(pc << 1) & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT;
	cpu = get_cpu();
	hits = per_cpu(cpu_profile_hits, cpu)[per_cpu(cpu_profile_flip, cpu)];
	if (!hits) {
		put_cpu();
		return;
	}
	/*
	 * We buffer the global profiler buffer into a per-CPU
	 * queue and thus reduce the number of global (and possibly
	 * NUMA-alien) accesses. The write-queue is self-coalescing:
	 */
	local_irq_save(flags);
	do {
		for (j = 0; j < PROFILE_GRPSZ; ++j) {
			if (hits[i + j].pc == pc) {
				/* Existing entry: just bump the count. */
				hits[i + j].hits += nr_hits;
				goto out;
			} else if (!hits[i + j].hits) {
				/* Free entry: claim it for this pc. */
				hits[i + j].pc = pc;
				hits[i + j].hits = nr_hits;
				goto out;
			}
		}
		/* Probe the next group (secondary stride). */
		i = (i + secondary) & (NR_PROFILE_HIT - 1);
	} while (i != primary);

	/*
	 * Add the current hit(s) and flush the write-queue out
	 * to the global buffer:
	 */
	atomic_add(nr_hits, &prof_buffer[pc]);
	for (i = 0; i < NR_PROFILE_HIT; ++i) {
		atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]);
		hits[i].pc = hits[i].hits = 0;
	}
out:
	local_irq_restore(flags);
	put_cpu();
}
|
||||
|
||||
/*
 * profile_cpu_callback - CPU-hotplug notifier.
 *
 * CPU_UP_PREPARE: allocate the two per-CPU hash-table pages on the
 * CPU's home node (both or neither — the second failure path frees
 * the first page via out_free).  CPU_ONLINE: start profiling the CPU.
 * CPU_UP_CANCELED/CPU_DEAD: stop profiling it and free its pages.
 */
static int __devinit profile_cpu_callback(struct notifier_block *info,
					unsigned long action, void *__cpu)
{
	int node, cpu = (unsigned long)__cpu;
	struct page *page;

	switch (action) {
	case CPU_UP_PREPARE:
		node = cpu_to_node(cpu);
		per_cpu(cpu_profile_flip, cpu) = 0;
		if (!per_cpu(cpu_profile_hits, cpu)[1]) {
			page = alloc_pages_node(node,
					GFP_KERNEL | __GFP_ZERO | GFP_THISNODE,
					0);
			if (!page)
				return NOTIFY_BAD;
			per_cpu(cpu_profile_hits, cpu)[1] = page_address(page);
		}
		if (!per_cpu(cpu_profile_hits, cpu)[0]) {
			page = alloc_pages_node(node,
					GFP_KERNEL | __GFP_ZERO | GFP_THISNODE,
					0);
			if (!page)
				goto out_free;
			per_cpu(cpu_profile_hits, cpu)[0] = page_address(page);
		}
		break;
	out_free:
		/* Second allocation failed: release the first page too. */
		page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]);
		per_cpu(cpu_profile_hits, cpu)[1] = NULL;
		__free_page(page);
		return NOTIFY_BAD;
	case CPU_ONLINE:
		cpu_set(cpu, prof_cpu_mask);
		break;
	case CPU_UP_CANCELED:
	case CPU_DEAD:
		cpu_clear(cpu, prof_cpu_mask);
		if (per_cpu(cpu_profile_hits, cpu)[0]) {
			page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[0]);
			per_cpu(cpu_profile_hits, cpu)[0] = NULL;
			__free_page(page);
		}
		if (per_cpu(cpu_profile_hits, cpu)[1]) {
			page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]);
			per_cpu(cpu_profile_hits, cpu)[1] = NULL;
			__free_page(page);
		}
		break;
	}
	return NOTIFY_OK;
}
|
||||
#else /* !CONFIG_SMP */
/* On UP there is nothing to flip or discard, and no hotplug callback. */
#define profile_flip_buffers()		do { } while (0)
#define profile_discard_flip_buffers()	do { } while (0)
#define profile_cpu_callback		NULL

/*
 * profile_hits - record @nr_hits at program counter @__pc (UP version):
 * add straight into the global buffer, clamping out-of-range PCs into
 * the last slot.
 */
void profile_hits(int type, void *__pc, unsigned int nr_hits)
{
	unsigned long pc;

	if (prof_on != type || !prof_buffer)
		return;
	pc = ((unsigned long)__pc - (unsigned long)_stext) >> prof_shift;
	atomic_add(nr_hits, &prof_buffer[min(pc, prof_len - 1)]);
}
#endif /* !CONFIG_SMP */
|
||||
|
||||
EXPORT_SYMBOL_GPL(profile_hits);
|
||||
|
||||
void profile_tick(int type)
|
||||
{
|
||||
struct pt_regs *regs = get_irq_regs();
|
||||
|
||||
if (type == CPU_PROFILING && timer_hook)
|
||||
timer_hook(regs);
|
||||
if (!user_mode(regs) && cpu_isset(smp_processor_id(), prof_cpu_mask))
|
||||
profile_hit(type, (void *)profile_pc(regs));
|
||||
}
|
||||
|
||||
#ifdef CONFIG_PROC_FS
|
||||
#include <linux/proc_fs.h>
|
||||
#include <asm/uaccess.h>
|
||||
#include <asm/ptrace.h>
|
||||
|
||||
/*
 * prof_cpu_mask_read_proc - /proc read handler: format prof_cpu_mask
 * as a hex cpumask followed by a newline.  Needs two spare bytes in
 * @page for "\n" plus the terminator.
 */
static int prof_cpu_mask_read_proc (char *page, char **start, off_t off,
			int count, int *eof, void *data)
{
	int len = cpumask_scnprintf(page, count, *(cpumask_t *)data);
	if (count - len < 2)
		return -EINVAL;
	len += sprintf(page + len, "\n");
	return len;
}
|
||||
|
||||
/*
 * prof_cpu_mask_write_proc - /proc write handler: parse a cpumask from
 * user space and install it as prof_cpu_mask.  Returns the number of
 * bytes consumed or a negative errno.
 */
static int prof_cpu_mask_write_proc (struct file *file, const char __user *buffer,
					unsigned long count, void *data)
{
	cpumask_t *mask = (cpumask_t *)data;
	unsigned long full_count = count;
	/*
	 * Fix: cpumask_parse_user() returns a (possibly negative) int;
	 * keeping it in an unsigned long only worked by accident of the
	 * truncation back to the int return type.
	 */
	int err;
	cpumask_t new_value;

	err = cpumask_parse_user(buffer, count, new_value);
	if (err)
		return err;

	*mask = new_value;
	return full_count;
}
|
||||
|
||||
void create_prof_cpu_mask(struct proc_dir_entry *root_irq_dir)
|
||||
{
|
||||
struct proc_dir_entry *entry;
|
||||
|
||||
/* create /proc/irq/prof_cpu_mask */
|
||||
if (!(entry = create_proc_entry("prof_cpu_mask", 0600, root_irq_dir)))
|
||||
return;
|
||||
entry->data = (void *)&prof_cpu_mask;
|
||||
entry->read_proc = prof_cpu_mask_read_proc;
|
||||
entry->write_proc = prof_cpu_mask_write_proc;
|
||||
}
|
||||
|
||||
/*
|
||||
* This function accesses profiling information. The returned data is
|
||||
* binary: the sampling step and the actual contents of the profile
|
||||
* buffer. Use of the program readprofile is recommended in order to
|
||||
* get meaningful info out of these data.
|
||||
*/
|
||||
/*
 * This function accesses profiling information. The returned data is
 * binary: the sampling step and the actual contents of the profile
 * buffer. Use of the program readprofile is recommended in order to
 * get meaningful info out of these data.
 *
 * The virtual file layout is: one unsigned int holding sample_step,
 * followed by prof_len atomic_t counters.  The first loop serves bytes
 * out of sample_step for offsets < sizeof(unsigned int); the rest is a
 * straight copy from prof_buffer (note the assumption that atomic_t
 * and unsigned int have the same size here).
 */
static ssize_t
read_profile(struct file *file, char __user *buf, size_t count, loff_t *ppos)
{
	unsigned long p = *ppos;
	ssize_t read;
	char * pnt;
	unsigned int sample_step = 1 << prof_shift;

	/* Flush pending per-CPU hits so the snapshot is current. */
	profile_flip_buffers();
	if (p >= (prof_len+1)*sizeof(unsigned int))
		return 0;
	if (count > (prof_len+1)*sizeof(unsigned int) - p)
		count = (prof_len+1)*sizeof(unsigned int) - p;
	read = 0;

	/* Serve the sample_step header byte by byte. */
	while (p < sizeof(unsigned int) && count > 0) {
		if (put_user(*((char *)(&sample_step)+p),buf))
			return -EFAULT;
		buf++; p++; count--; read++;
	}
	/* Then the counters, offset back by the header size. */
	pnt = (char *)prof_buffer + p - sizeof(atomic_t);
	if (copy_to_user(buf,(void *)pnt,count))
		return -EFAULT;
	read += count;
	*ppos += read;
	return read;
}
|
||||
|
||||
/*
|
||||
* Writing to /proc/profile resets the counters
|
||||
*
|
||||
* Writing a 'profiling multiplier' value into it also re-sets the profiling
|
||||
* interrupt frequency, on architectures that support this.
|
||||
*/
|
||||
/*
 * Writing to /proc/profile resets the counters
 *
 * Writing a 'profiling multiplier' value into it also re-sets the profiling
 * interrupt frequency, on architectures that support this.
 */
static ssize_t write_profile(struct file *file, const char __user *buf,
			     size_t count, loff_t *ppos)
{
#ifdef CONFIG_SMP
	extern int setup_profiling_timer (unsigned int multiplier);

	/* An exactly int-sized write is interpreted as a new multiplier. */
	if (count == sizeof(int)) {
		unsigned int multiplier;

		if (copy_from_user(&multiplier, buf, sizeof(int)))
			return -EFAULT;

		if (setup_profiling_timer(multiplier))
			return -EINVAL;
	}
#endif
	/* Drop pending per-CPU hits, then zero the global buffer. */
	profile_discard_flip_buffers();
	memset(prof_buffer, 0, prof_len * sizeof(atomic_t));
	return count;
}
|
||||
|
||||
/* File operations for /proc/profile. */
static const struct file_operations proc_profile_operations = {
	.read		= read_profile,
	.write		= write_profile,
};
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
/* Empty IPI target: used only to drain in-flight profile interrupts. */
static void __init profile_nop(void *unused)
{
}
|
||||
|
||||
/*
 * create_hash_tables - allocate both per-CPU hash-table pages for every
 * online CPU (node-local, zeroed).  On any failure, profiling is turned
 * off, in-flight profile interrupts are drained via an empty IPI, and
 * all pages allocated so far are released.  Returns 0 or -1.
 */
static int __init create_hash_tables(void)
{
	int cpu;

	for_each_online_cpu(cpu) {
		int node = cpu_to_node(cpu);
		struct page *page;

		page = alloc_pages_node(node,
				GFP_KERNEL | __GFP_ZERO | GFP_THISNODE,
				0);
		if (!page)
			goto out_cleanup;
		per_cpu(cpu_profile_hits, cpu)[1]
				= (struct profile_hit *)page_address(page);
		page = alloc_pages_node(node,
				GFP_KERNEL | __GFP_ZERO | GFP_THISNODE,
				0);
		if (!page)
			goto out_cleanup;
		per_cpu(cpu_profile_hits, cpu)[0]
				= (struct profile_hit *)page_address(page);
	}
	return 0;
out_cleanup:
	/* Disable profiling before freeing, so no CPU touches the pages. */
	prof_on = 0;
	smp_mb();
	on_each_cpu(profile_nop, NULL, 0, 1);
	for_each_online_cpu(cpu) {
		struct page *page;

		if (per_cpu(cpu_profile_hits, cpu)[0]) {
			page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[0]);
			per_cpu(cpu_profile_hits, cpu)[0] = NULL;
			__free_page(page);
		}
		if (per_cpu(cpu_profile_hits, cpu)[1]) {
			page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]);
			per_cpu(cpu_profile_hits, cpu)[1] = NULL;
			__free_page(page);
		}
	}
	return -1;
}
|
||||
#else
|
||||
#define create_hash_tables() ({ 0; })
|
||||
#endif
|
||||
|
||||
/*
 * create_proc_profile - boot-time setup of /proc/profile.
 * No-op unless "profile=" was given.  Allocates the per-CPU hash
 * tables, creates the proc entry (header + counters sized), and
 * registers the CPU-hotplug callback.
 */
static int __init create_proc_profile(void)
{
	struct proc_dir_entry *entry;

	if (!prof_on)
		return 0;
	if (create_hash_tables())
		return -1;
	if (!(entry = create_proc_entry("profile", S_IWUSR | S_IRUGO, NULL)))
		return 0;
	entry->proc_fops = &proc_profile_operations;
	/* One header word plus one counter per profiled text slice. */
	entry->size = (1+prof_len) * sizeof(atomic_t);
	hotcpu_notifier(profile_cpu_callback, 0);
	return 0;
}
module_init(create_proc_profile);
|
||||
#endif /* CONFIG_PROC_FS */
|
||||
490
kernel/ptrace.c
Normal file
490
kernel/ptrace.c
Normal file
@@ -0,0 +1,490 @@
|
||||
/*
|
||||
* linux/kernel/ptrace.c
|
||||
*
|
||||
* (C) Copyright 1999 Linus Torvalds
|
||||
*
|
||||
* Common interfaces for "ptrace()" which we do not want
|
||||
* to continually duplicate across every architecture.
|
||||
*/
|
||||
|
||||
#include <linux/capability.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/sched.h>
|
||||
#include <linux/errno.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/highmem.h>
|
||||
#include <linux/pagemap.h>
|
||||
#include <linux/smp_lock.h>
|
||||
#include <linux/ptrace.h>
|
||||
#include <linux/security.h>
|
||||
#include <linux/signal.h>
|
||||
|
||||
#include <asm/pgtable.h>
|
||||
#include <asm/uaccess.h>
|
||||
|
||||
/*
|
||||
* ptrace a task: make the debugger its new parent and
|
||||
* move it to the ptrace list.
|
||||
*
|
||||
* Must be called with the tasklist lock write-held.
|
||||
*/
|
||||
/*
 * ptrace a task: make the debugger its new parent and
 * move it to the ptrace list.
 *
 * Must be called with the tasklist lock write-held.
 */
void __ptrace_link(struct task_struct *child, struct task_struct *new_parent)
{
	BUG_ON(!list_empty(&child->ptrace_list));
	/* Already reparented to the tracer (it is the real parent). */
	if (child->parent == new_parent)
		return;
	/* Remember the original parent via the ptrace_children list. */
	list_add(&child->ptrace_list, &child->parent->ptrace_children);
	remove_parent(child);
	child->parent = new_parent;
	add_parent(child);
}
|
||||
|
||||
/*
|
||||
* Turn a tracing stop into a normal stop now, since with no tracer there
|
||||
* would be no way to wake it up with SIGCONT or SIGKILL. If there was a
|
||||
* signal sent that would resume the child, but didn't because it was in
|
||||
* TASK_TRACED, resume it now.
|
||||
* Requires that irqs be disabled.
|
||||
*/
|
||||
/*
 * Turn a tracing stop into a normal stop now, since with no tracer there
 * would be no way to wake it up with SIGCONT or SIGKILL. If there was a
 * signal sent that would resume the child, but didn't because it was in
 * TASK_TRACED, resume it now.
 * Requires that irqs be disabled.
 */
void ptrace_untrace(struct task_struct *child)
{
	spin_lock(&child->sighand->siglock);
	if (child->state == TASK_TRACED) {
		if (child->signal->flags & SIGNAL_STOP_STOPPED) {
			/* Group stop still in effect: become ordinarily stopped. */
			child->state = TASK_STOPPED;
		} else {
			/* A resuming signal is pending: let the child run. */
			signal_wake_up(child, 1);
		}
	}
	spin_unlock(&child->sighand->siglock);
}
|
||||
|
||||
/*
|
||||
* unptrace a task: move it back to its original parent and
|
||||
* remove it from the ptrace list.
|
||||
*
|
||||
* Must be called with the tasklist lock write-held.
|
||||
*/
|
||||
void __ptrace_unlink(struct task_struct *child)
|
||||
{
|
||||
BUG_ON(!child->ptrace);
|
||||
|
||||
child->ptrace = 0;
|
||||
if (!list_empty(&child->ptrace_list)) {
|
||||
list_del_init(&child->ptrace_list);
|
||||
remove_parent(child);
|
||||
child->parent = child->real_parent;
|
||||
add_parent(child);
|
||||
}
|
||||
|
||||
if (child->state == TASK_TRACED)
|
||||
ptrace_untrace(child);
|
||||
}
|
||||
|
||||
/*
|
||||
* Check that we have indeed attached to the thing..
|
||||
*/
|
||||
int ptrace_check_attach(struct task_struct *child, int kill)
|
||||
{
|
||||
int ret = -ESRCH;
|
||||
|
||||
/*
|
||||
* We take the read lock around doing both checks to close a
|
||||
* possible race where someone else was tracing our child and
|
||||
* detached between these two checks. After this locked check,
|
||||
* we are sure that this is our traced child and that can only
|
||||
* be changed by us so it's not changing right after this.
|
||||
*/
|
||||
read_lock(&tasklist_lock);
|
||||
if ((child->ptrace & PT_PTRACED) && child->parent == current &&
|
||||
(!(child->ptrace & PT_ATTACHED) || child->real_parent != current)
|
||||
&& child->signal != NULL) {
|
||||
ret = 0;
|
||||
spin_lock_irq(&child->sighand->siglock);
|
||||
if (child->state == TASK_STOPPED) {
|
||||
child->state = TASK_TRACED;
|
||||
} else if (child->state != TASK_TRACED && !kill) {
|
||||
ret = -ESRCH;
|
||||
}
|
||||
spin_unlock_irq(&child->sighand->siglock);
|
||||
}
|
||||
read_unlock(&tasklist_lock);
|
||||
|
||||
if (!ret && !kill) {
|
||||
wait_task_inactive(child);
|
||||
}
|
||||
|
||||
/* All systems go.. */
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int may_attach(struct task_struct *task)
|
||||
{
|
||||
/* May we inspect the given task?
|
||||
* This check is used both for attaching with ptrace
|
||||
* and for allowing access to sensitive information in /proc.
|
||||
*
|
||||
* ptrace_attach denies several cases that /proc allows
|
||||
* because setting up the necessary parent/child relationship
|
||||
* or halting the specified task is impossible.
|
||||
*/
|
||||
int dumpable = 0;
|
||||
/* Don't let security modules deny introspection */
|
||||
if (task == current)
|
||||
return 0;
|
||||
if (((current->uid != task->euid) ||
|
||||
(current->uid != task->suid) ||
|
||||
(current->uid != task->uid) ||
|
||||
(current->gid != task->egid) ||
|
||||
(current->gid != task->sgid) ||
|
||||
(current->gid != task->gid)) && !capable(CAP_SYS_PTRACE))
|
||||
return -EPERM;
|
||||
smp_rmb();
|
||||
if (task->mm)
|
||||
dumpable = task->mm->dumpable;
|
||||
if (!dumpable && !capable(CAP_SYS_PTRACE))
|
||||
return -EPERM;
|
||||
|
||||
return security_ptrace(current, task);
|
||||
}
|
||||
|
||||
int ptrace_may_attach(struct task_struct *task)
{
	int err;

	task_lock(task);
	err = may_attach(task);
	task_unlock(task);

	/* may_attach() returns 0 on success: convert to a boolean. */
	return !err;
}
|
||||
|
||||
int ptrace_attach(struct task_struct *task)
|
||||
{
|
||||
int retval;
|
||||
|
||||
retval = -EPERM;
|
||||
if (task->pid <= 1)
|
||||
goto out;
|
||||
if (task->tgid == current->tgid)
|
||||
goto out;
|
||||
|
||||
repeat:
|
||||
/*
|
||||
* Nasty, nasty.
|
||||
*
|
||||
* We want to hold both the task-lock and the
|
||||
* tasklist_lock for writing at the same time.
|
||||
* But that's against the rules (tasklist_lock
|
||||
* is taken for reading by interrupts on other
|
||||
* cpu's that may have task_lock).
|
||||
*/
|
||||
task_lock(task);
|
||||
local_irq_disable();
|
||||
if (!write_trylock(&tasklist_lock)) {
|
||||
local_irq_enable();
|
||||
task_unlock(task);
|
||||
do {
|
||||
cpu_relax();
|
||||
} while (!write_can_lock(&tasklist_lock));
|
||||
goto repeat;
|
||||
}
|
||||
|
||||
if (!task->mm)
|
||||
goto bad;
|
||||
/* the same process cannot be attached many times */
|
||||
if (task->ptrace & PT_PTRACED)
|
||||
goto bad;
|
||||
retval = may_attach(task);
|
||||
if (retval)
|
||||
goto bad;
|
||||
|
||||
/* Go */
|
||||
task->ptrace |= PT_PTRACED | ((task->real_parent != current)
|
||||
? PT_ATTACHED : 0);
|
||||
if (capable(CAP_SYS_PTRACE))
|
||||
task->ptrace |= PT_PTRACE_CAP;
|
||||
|
||||
__ptrace_link(task, current);
|
||||
|
||||
force_sig_specific(SIGSTOP, task);
|
||||
|
||||
bad:
|
||||
write_unlock_irq(&tasklist_lock);
|
||||
task_unlock(task);
|
||||
out:
|
||||
return retval;
|
||||
}
|
||||
|
||||
static inline void __ptrace_detach(struct task_struct *child, unsigned int data)
|
||||
{
|
||||
child->exit_code = data;
|
||||
/* .. re-parent .. */
|
||||
__ptrace_unlink(child);
|
||||
/* .. and wake it up. */
|
||||
if (child->exit_state != EXIT_ZOMBIE)
|
||||
wake_up_process(child);
|
||||
}
|
||||
|
||||
int ptrace_detach(struct task_struct *child, unsigned int data)
|
||||
{
|
||||
if (!valid_signal(data))
|
||||
return -EIO;
|
||||
|
||||
/* Architecture-specific hardware disable .. */
|
||||
ptrace_disable(child);
|
||||
|
||||
write_lock_irq(&tasklist_lock);
|
||||
/* protect against de_thread()->release_task() */
|
||||
if (child->ptrace)
|
||||
__ptrace_detach(child, data);
|
||||
write_unlock_irq(&tasklist_lock);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len)
{
	int copied = 0;

	/* Move the target's memory to userspace in bounded chunks. */
	while (len > 0) {
		char chunk[128];
		int want, got;

		want = (len > sizeof(chunk)) ? sizeof(chunk) : len;
		got = access_process_vm(tsk, src, chunk, want, 0);
		if (!got) {
			/* Report a short read; -EIO only if nothing came. */
			if (copied)
				break;
			return -EIO;
		}
		if (copy_to_user(dst, chunk, got))
			return -EFAULT;
		copied += got;
		src += got;
		dst += got;
		len -= got;
	}
	return copied;
}
|
||||
|
||||
int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long dst, int len)
{
	int copied = 0;

	/* Move userspace data into the target's memory in bounded chunks. */
	while (len > 0) {
		char chunk[128];
		int want, wrote;

		want = (len > sizeof(chunk)) ? sizeof(chunk) : len;
		if (copy_from_user(chunk, src, want))
			return -EFAULT;
		wrote = access_process_vm(tsk, dst, chunk, want, 1);
		if (!wrote) {
			/* Report a short write; -EIO only if nothing went. */
			if (copied)
				break;
			return -EIO;
		}
		copied += wrote;
		src += wrote;
		dst += wrote;
		len -= wrote;
	}
	return copied;
}
|
||||
|
||||
static int ptrace_setoptions(struct task_struct *child, long data)
|
||||
{
|
||||
child->ptrace &= ~PT_TRACE_MASK;
|
||||
|
||||
if (data & PTRACE_O_TRACESYSGOOD)
|
||||
child->ptrace |= PT_TRACESYSGOOD;
|
||||
|
||||
if (data & PTRACE_O_TRACEFORK)
|
||||
child->ptrace |= PT_TRACE_FORK;
|
||||
|
||||
if (data & PTRACE_O_TRACEVFORK)
|
||||
child->ptrace |= PT_TRACE_VFORK;
|
||||
|
||||
if (data & PTRACE_O_TRACECLONE)
|
||||
child->ptrace |= PT_TRACE_CLONE;
|
||||
|
||||
if (data & PTRACE_O_TRACEEXEC)
|
||||
child->ptrace |= PT_TRACE_EXEC;
|
||||
|
||||
if (data & PTRACE_O_TRACEVFORKDONE)
|
||||
child->ptrace |= PT_TRACE_VFORK_DONE;
|
||||
|
||||
if (data & PTRACE_O_TRACEEXIT)
|
||||
child->ptrace |= PT_TRACE_EXIT;
|
||||
|
||||
return (data & ~PTRACE_O_MASK) ? -EINVAL : 0;
|
||||
}
|
||||
|
||||
static int ptrace_getsiginfo(struct task_struct *child, siginfo_t __user * data)
{
	siginfo_t lastinfo;
	int error = -ESRCH;

	/*
	 * Snapshot child->last_siginfo under the siglock, then do the
	 * (possibly faulting) copy to userspace outside all locks.
	 */
	read_lock(&tasklist_lock);
	if (likely(child->sighand != NULL)) {
		error = -EINVAL;
		spin_lock_irq(&child->sighand->siglock);
		if (likely(child->last_siginfo != NULL)) {
			lastinfo = *child->last_siginfo;
			error = 0;
		}
		spin_unlock_irq(&child->sighand->siglock);
	}
	read_unlock(&tasklist_lock);
	if (!error)
		return copy_siginfo_to_user(data, &lastinfo);
	return error;
}
|
||||
|
||||
static int ptrace_setsiginfo(struct task_struct *child, siginfo_t __user * data)
{
	siginfo_t newinfo;
	int error = -ESRCH;

	/* Pull the new siginfo in before taking any locks. */
	if (copy_from_user(&newinfo, data, sizeof (siginfo_t)))
		return -EFAULT;

	read_lock(&tasklist_lock);
	if (likely(child->sighand != NULL)) {
		error = -EINVAL;
		spin_lock_irq(&child->sighand->siglock);
		if (likely(child->last_siginfo != NULL)) {
			*child->last_siginfo = newinfo;
			error = 0;
		}
		spin_unlock_irq(&child->sighand->siglock);
	}
	read_unlock(&tasklist_lock);
	return error;
}
|
||||
|
||||
int ptrace_request(struct task_struct *child, long request,
|
||||
long addr, long data)
|
||||
{
|
||||
int ret = -EIO;
|
||||
|
||||
switch (request) {
|
||||
#ifdef PTRACE_OLDSETOPTIONS
|
||||
case PTRACE_OLDSETOPTIONS:
|
||||
#endif
|
||||
case PTRACE_SETOPTIONS:
|
||||
ret = ptrace_setoptions(child, data);
|
||||
break;
|
||||
case PTRACE_GETEVENTMSG:
|
||||
ret = put_user(child->ptrace_message, (unsigned long __user *) data);
|
||||
break;
|
||||
case PTRACE_GETSIGINFO:
|
||||
ret = ptrace_getsiginfo(child, (siginfo_t __user *) data);
|
||||
break;
|
||||
case PTRACE_SETSIGINFO:
|
||||
ret = ptrace_setsiginfo(child, (siginfo_t __user *) data);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**
|
||||
* ptrace_traceme -- helper for PTRACE_TRACEME
|
||||
*
|
||||
* Performs checks and sets PT_PTRACED.
|
||||
* Should be used by all ptrace implementations for PTRACE_TRACEME.
|
||||
*/
|
||||
int ptrace_traceme(void)
|
||||
{
|
||||
int ret = -EPERM;
|
||||
|
||||
/*
|
||||
* Are we already being traced?
|
||||
*/
|
||||
task_lock(current);
|
||||
if (!(current->ptrace & PT_PTRACED)) {
|
||||
ret = security_ptrace(current->parent, current);
|
||||
/*
|
||||
* Set the ptrace bit in the process ptrace flags.
|
||||
*/
|
||||
if (!ret)
|
||||
current->ptrace |= PT_PTRACED;
|
||||
}
|
||||
task_unlock(current);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**
|
||||
* ptrace_get_task_struct -- grab a task struct reference for ptrace
|
||||
* @pid: process id to grab a task_struct reference of
|
||||
*
|
||||
* This function is a helper for ptrace implementations. It checks
|
||||
* permissions and then grabs a task struct for use of the actual
|
||||
* ptrace implementation.
|
||||
*
|
||||
* Returns the task_struct for @pid or an ERR_PTR() on failure.
|
||||
*/
|
||||
struct task_struct *ptrace_get_task_struct(pid_t pid)
{
	struct task_struct *child;

	/*
	 * Tracing init is not allowed.
	 */
	if (pid == 1)
		return ERR_PTR(-EPERM);

	/* Look the task up and take a reference under tasklist_lock. */
	read_lock(&tasklist_lock);
	child = find_task_by_pid(pid);
	if (child)
		get_task_struct(child);
	read_unlock(&tasklist_lock);

	if (!child)
		return ERR_PTR(-ESRCH);
	return child;
}
|
||||
|
||||
#ifndef __ARCH_SYS_PTRACE
|
||||
asmlinkage long sys_ptrace(long request, long pid, long addr, long data)
|
||||
{
|
||||
struct task_struct *child;
|
||||
long ret;
|
||||
|
||||
/*
|
||||
* This lock_kernel fixes a subtle race with suid exec
|
||||
*/
|
||||
lock_kernel();
|
||||
if (request == PTRACE_TRACEME) {
|
||||
ret = ptrace_traceme();
|
||||
goto out;
|
||||
}
|
||||
|
||||
child = ptrace_get_task_struct(pid);
|
||||
if (IS_ERR(child)) {
|
||||
ret = PTR_ERR(child);
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (request == PTRACE_ATTACH) {
|
||||
ret = ptrace_attach(child);
|
||||
goto out_put_task_struct;
|
||||
}
|
||||
|
||||
ret = ptrace_check_attach(child, request == PTRACE_KILL);
|
||||
if (ret < 0)
|
||||
goto out_put_task_struct;
|
||||
|
||||
ret = arch_ptrace(child, request, addr, data);
|
||||
if (ret < 0)
|
||||
goto out_put_task_struct;
|
||||
|
||||
out_put_task_struct:
|
||||
put_task_struct(child);
|
||||
out:
|
||||
unlock_kernel();
|
||||
return ret;
|
||||
}
|
||||
#endif /* __ARCH_SYS_PTRACE */
|
||||
635
kernel/rcupdate.c
Normal file
635
kernel/rcupdate.c
Normal file
@@ -0,0 +1,635 @@
|
||||
/*
|
||||
* Read-Copy Update mechanism for mutual exclusion
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||
*
|
||||
* Copyright (C) IBM Corporation, 2001
|
||||
*
|
||||
* Authors: Dipankar Sarma <dipankar@in.ibm.com>
|
||||
* Manfred Spraul <manfred@colorfullife.com>
|
||||
*
|
||||
* Based on the original work by Paul McKenney <paulmck@us.ibm.com>
|
||||
* and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
|
||||
* Papers:
|
||||
* http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf
|
||||
* http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001)
|
||||
*
|
||||
* For detailed explanation of Read-Copy Update mechanism see -
|
||||
* http://lse.sourceforge.net/locking/rcupdate.html
|
||||
*
|
||||
*/
|
||||
#include <linux/types.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/spinlock.h>
|
||||
#include <linux/smp.h>
|
||||
#include <linux/rcupdate.h>
|
||||
#include <linux/interrupt.h>
|
||||
#include <linux/sched.h>
|
||||
#include <asm/atomic.h>
|
||||
#include <linux/bitops.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/completion.h>
|
||||
#include <linux/moduleparam.h>
|
||||
#include <linux/percpu.h>
|
||||
#include <linux/notifier.h>
|
||||
#include <linux/rcupdate.h>
|
||||
#include <linux/cpu.h>
|
||||
#include <linux/mutex.h>
|
||||
|
||||
/* Definition for rcupdate control block. */
|
||||
static struct rcu_ctrlblk rcu_ctrlblk = {
|
||||
.cur = -300,
|
||||
.completed = -300,
|
||||
.lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock),
|
||||
.cpumask = CPU_MASK_NONE,
|
||||
};
|
||||
static struct rcu_ctrlblk rcu_bh_ctrlblk = {
|
||||
.cur = -300,
|
||||
.completed = -300,
|
||||
.lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock),
|
||||
.cpumask = CPU_MASK_NONE,
|
||||
};
|
||||
|
||||
DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L };
|
||||
DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L };
|
||||
|
||||
/* Fake initialization required by compiler */
|
||||
static DEFINE_PER_CPU(struct tasklet_struct, rcu_tasklet) = {NULL};
|
||||
static int blimit = 10;
|
||||
static int qhimark = 10000;
|
||||
static int qlowmark = 100;
|
||||
|
||||
static atomic_t rcu_barrier_cpu_count;
|
||||
static DEFINE_MUTEX(rcu_barrier_mutex);
|
||||
static struct completion rcu_barrier_completion;
|
||||
|
||||
#ifdef CONFIG_SMP
static void force_quiescent_state(struct rcu_data *rdp,
			struct rcu_ctrlblk *rcp)
{
	int cpu;
	cpumask_t cpumask;

	set_need_resched();
	if (unlikely(!rcp->signaled)) {
		rcp->signaled = 1;
		/*
		 * Kick every cpu still holding up the grace period,
		 * except ourselves: with irqs disabled, rdp->cpu is
		 * the current cpu, and set_need_resched() above has
		 * already dealt with it.
		 */
		cpumask = rcp->cpumask;
		cpu_clear(rdp->cpu, cpumask);
		for_each_cpu_mask(cpu, cpumask)
			smp_send_reschedule(cpu);
	}
}
#else
/* UP: rescheduling this cpu is all that is ever needed. */
static inline void force_quiescent_state(struct rcu_data *rdp,
			struct rcu_ctrlblk *rcp)
{
	set_need_resched();
}
#endif
|
||||
|
||||
/**
|
||||
* call_rcu - Queue an RCU callback for invocation after a grace period.
|
||||
* @head: structure to be used for queueing the RCU updates.
|
||||
* @func: actual update function to be invoked after the grace period
|
||||
*
|
||||
* The update function will be invoked some time after a full grace
|
||||
* period elapses, in other words after all currently executing RCU
|
||||
* read-side critical sections have completed. RCU read-side critical
|
||||
* sections are delimited by rcu_read_lock() and rcu_read_unlock(),
|
||||
* and may be nested.
|
||||
*/
|
||||
void fastcall call_rcu(struct rcu_head *head,
				void (*func)(struct rcu_head *rcu))
{
	unsigned long flags;
	struct rcu_data *rdp;

	head->func = func;
	head->next = NULL;

	/* Append to this cpu's "next" queue with irqs off. */
	local_irq_save(flags);
	rdp = &__get_cpu_var(rcu_data);
	*rdp->nxttail = head;
	rdp->nxttail = &head->next;
	if (unlikely(++rdp->qlen > qhimark)) {
		/* Queue is getting long: lift the batch limit and push
		 * the other cpus through a quiescent state. */
		rdp->blimit = INT_MAX;
		force_quiescent_state(rdp, &rcu_ctrlblk);
	}
	local_irq_restore(flags);
}
|
||||
|
||||
/**
|
||||
* call_rcu_bh - Queue an RCU for invocation after a quicker grace period.
|
||||
* @head: structure to be used for queueing the RCU updates.
|
||||
* @func: actual update function to be invoked after the grace period
|
||||
*
|
||||
* The update function will be invoked some time after a full grace
|
||||
* period elapses, in other words after all currently executing RCU
|
||||
* read-side critical sections have completed. call_rcu_bh() assumes
|
||||
* that the read-side critical sections end on completion of a softirq
|
||||
* handler. This means that read-side critical sections in process
|
||||
* context must not be interrupted by softirqs. This interface is to be
|
||||
* used when most of the read-side critical sections are in softirq context.
|
||||
* RCU read-side critical sections are delimited by rcu_read_lock() and
|
||||
* rcu_read_unlock(), * if in interrupt context or rcu_read_lock_bh()
|
||||
* and rcu_read_unlock_bh(), if in process context. These may be nested.
|
||||
*/
|
||||
void fastcall call_rcu_bh(struct rcu_head *head,
				void (*func)(struct rcu_head *rcu))
{
	unsigned long flags;
	struct rcu_data *rdp;

	head->func = func;
	head->next = NULL;

	/* Same as call_rcu(), but on the softirq-flavoured queue. */
	local_irq_save(flags);
	rdp = &__get_cpu_var(rcu_bh_data);
	*rdp->nxttail = head;
	rdp->nxttail = &head->next;

	if (unlikely(++rdp->qlen > qhimark)) {
		rdp->blimit = INT_MAX;
		force_quiescent_state(rdp, &rcu_bh_ctrlblk);
	}

	local_irq_restore(flags);
}
|
||||
|
||||
/*
|
||||
* Return the number of RCU batches processed thus far. Useful
|
||||
* for debug and statistics.
|
||||
*/
|
||||
long rcu_batches_completed(void)
|
||||
{
|
||||
return rcu_ctrlblk.completed;
|
||||
}
|
||||
|
||||
/*
|
||||
* Return the number of RCU batches processed thus far. Useful
|
||||
* for debug and statistics.
|
||||
*/
|
||||
long rcu_batches_completed_bh(void)
|
||||
{
|
||||
return rcu_bh_ctrlblk.completed;
|
||||
}
|
||||
|
||||
static void rcu_barrier_callback(struct rcu_head *notused)
|
||||
{
|
||||
if (atomic_dec_and_test(&rcu_barrier_cpu_count))
|
||||
complete(&rcu_barrier_completion);
|
||||
}
|
||||
|
||||
/*
|
||||
* Called with preemption disabled, and from cross-cpu IRQ context.
|
||||
*/
|
||||
static void rcu_barrier_func(void *notused)
|
||||
{
|
||||
int cpu = smp_processor_id();
|
||||
struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
|
||||
struct rcu_head *head;
|
||||
|
||||
head = &rdp->barrier;
|
||||
atomic_inc(&rcu_barrier_cpu_count);
|
||||
call_rcu(head, rcu_barrier_callback);
|
||||
}
|
||||
|
||||
/**
|
||||
* rcu_barrier - Wait until all the in-flight RCUs are complete.
|
||||
*/
|
||||
void rcu_barrier(void)
|
||||
{
|
||||
BUG_ON(in_interrupt());
|
||||
/* Take cpucontrol mutex to protect against CPU hotplug */
|
||||
mutex_lock(&rcu_barrier_mutex);
|
||||
init_completion(&rcu_barrier_completion);
|
||||
atomic_set(&rcu_barrier_cpu_count, 0);
|
||||
on_each_cpu(rcu_barrier_func, NULL, 0, 1);
|
||||
wait_for_completion(&rcu_barrier_completion);
|
||||
mutex_unlock(&rcu_barrier_mutex);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(rcu_barrier);
|
||||
|
||||
/*
|
||||
* Invoke the completed RCU callbacks. They are expected to be in
|
||||
* a per-cpu list.
|
||||
*/
|
||||
static void rcu_do_batch(struct rcu_data *rdp)
|
||||
{
|
||||
struct rcu_head *next, *list;
|
||||
int count = 0;
|
||||
|
||||
list = rdp->donelist;
|
||||
while (list) {
|
||||
next = list->next;
|
||||
prefetch(next);
|
||||
list->func(list);
|
||||
list = next;
|
||||
if (++count >= rdp->blimit)
|
||||
break;
|
||||
}
|
||||
rdp->donelist = list;
|
||||
|
||||
local_irq_disable();
|
||||
rdp->qlen -= count;
|
||||
local_irq_enable();
|
||||
if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark)
|
||||
rdp->blimit = blimit;
|
||||
|
||||
if (!rdp->donelist)
|
||||
rdp->donetail = &rdp->donelist;
|
||||
else
|
||||
tasklet_schedule(&per_cpu(rcu_tasklet, rdp->cpu));
|
||||
}
|
||||
|
||||
/*
|
||||
* Grace period handling:
|
||||
* The grace period handling consists out of two steps:
|
||||
* - A new grace period is started.
|
||||
* This is done by rcu_start_batch. The start is not broadcasted to
|
||||
* all cpus, they must pick this up by comparing rcp->cur with
|
||||
* rdp->quiescbatch. All cpus are recorded in the
|
||||
* rcu_ctrlblk.cpumask bitmap.
|
||||
* - All cpus must go through a quiescent state.
|
||||
* Since the start of the grace period is not broadcasted, at least two
|
||||
* calls to rcu_check_quiescent_state are required:
|
||||
* The first call just notices that a new grace period is running. The
|
||||
* following calls check if there was a quiescent state since the beginning
|
||||
* of the grace period. If so, it updates rcu_ctrlblk.cpumask. If
|
||||
* the bitmap is empty, then the grace period is completed.
|
||||
* rcu_check_quiescent_state calls rcu_start_batch(0) to start the next grace
|
||||
* period (if necessary).
|
||||
*/
|
||||
/*
|
||||
* Register a new batch of callbacks, and start it up if there is currently no
|
||||
* active batch and the batch to be registered has not already occurred.
|
||||
* Caller must hold rcu_ctrlblk.lock.
|
||||
*/
|
||||
static void rcu_start_batch(struct rcu_ctrlblk *rcp)
|
||||
{
|
||||
if (rcp->next_pending &&
|
||||
rcp->completed == rcp->cur) {
|
||||
rcp->next_pending = 0;
|
||||
/*
|
||||
* next_pending == 0 must be visible in
|
||||
* __rcu_process_callbacks() before it can see new value of cur.
|
||||
*/
|
||||
smp_wmb();
|
||||
rcp->cur++;
|
||||
|
||||
/*
|
||||
* Accessing nohz_cpu_mask before incrementing rcp->cur needs a
|
||||
* Barrier Otherwise it can cause tickless idle CPUs to be
|
||||
* included in rcp->cpumask, which will extend graceperiods
|
||||
* unnecessarily.
|
||||
*/
|
||||
smp_mb();
|
||||
cpus_andnot(rcp->cpumask, cpu_online_map, nohz_cpu_mask);
|
||||
|
||||
rcp->signaled = 0;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* cpu went through a quiescent state since the beginning of the grace period.
|
||||
* Clear it from the cpu mask and complete the grace period if it was the last
|
||||
* cpu. Start another grace period if someone has further entries pending
|
||||
*/
|
||||
static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp)
|
||||
{
|
||||
cpu_clear(cpu, rcp->cpumask);
|
||||
if (cpus_empty(rcp->cpumask)) {
|
||||
/* batch completed ! */
|
||||
rcp->completed = rcp->cur;
|
||||
rcu_start_batch(rcp);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Check if the cpu has gone through a quiescent state (say context
|
||||
* switch). If so and if it already hasn't done so in this RCU
|
||||
* quiescent cycle, then indicate that it has done so.
|
||||
*/
|
||||
static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp,
|
||||
struct rcu_data *rdp)
|
||||
{
|
||||
if (rdp->quiescbatch != rcp->cur) {
|
||||
/* start new grace period: */
|
||||
rdp->qs_pending = 1;
|
||||
rdp->passed_quiesc = 0;
|
||||
rdp->quiescbatch = rcp->cur;
|
||||
return;
|
||||
}
|
||||
|
||||
/* Grace period already completed for this cpu?
|
||||
* qs_pending is checked instead of the actual bitmap to avoid
|
||||
* cacheline trashing.
|
||||
*/
|
||||
if (!rdp->qs_pending)
|
||||
return;
|
||||
|
||||
/*
|
||||
* Was there a quiescent state since the beginning of the grace
|
||||
* period? If no, then exit and wait for the next call.
|
||||
*/
|
||||
if (!rdp->passed_quiesc)
|
||||
return;
|
||||
rdp->qs_pending = 0;
|
||||
|
||||
spin_lock(&rcp->lock);
|
||||
/*
|
||||
* rdp->quiescbatch/rcp->cur and the cpu bitmap can come out of sync
|
||||
* during cpu startup. Ignore the quiescent state.
|
||||
*/
|
||||
if (likely(rdp->quiescbatch == rcp->cur))
|
||||
cpu_quiet(rdp->cpu, rcp);
|
||||
|
||||
spin_unlock(&rcp->lock);
|
||||
}
|
||||
|
||||
|
||||
#ifdef CONFIG_HOTPLUG_CPU
|
||||
|
||||
/* warning! helper for rcu_offline_cpu. do not use elsewhere without reviewing
|
||||
* locking requirements, the list it's pulling from has to belong to a cpu
|
||||
* which is dead and hence not processing interrupts.
|
||||
*/
|
||||
static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list,
|
||||
struct rcu_head **tail)
|
||||
{
|
||||
local_irq_disable();
|
||||
*this_rdp->nxttail = list;
|
||||
if (list)
|
||||
this_rdp->nxttail = tail;
|
||||
local_irq_enable();
|
||||
}
|
||||
|
||||
static void __rcu_offline_cpu(struct rcu_data *this_rdp,
|
||||
struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
|
||||
{
|
||||
/* if the cpu going offline owns the grace period
|
||||
* we can block indefinitely waiting for it, so flush
|
||||
* it here
|
||||
*/
|
||||
spin_lock_bh(&rcp->lock);
|
||||
if (rcp->cur != rcp->completed)
|
||||
cpu_quiet(rdp->cpu, rcp);
|
||||
spin_unlock_bh(&rcp->lock);
|
||||
rcu_move_batch(this_rdp, rdp->curlist, rdp->curtail);
|
||||
rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail);
|
||||
rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail);
|
||||
}
|
||||
|
||||
static void rcu_offline_cpu(int cpu)
|
||||
{
|
||||
struct rcu_data *this_rdp = &get_cpu_var(rcu_data);
|
||||
struct rcu_data *this_bh_rdp = &get_cpu_var(rcu_bh_data);
|
||||
|
||||
__rcu_offline_cpu(this_rdp, &rcu_ctrlblk,
|
||||
&per_cpu(rcu_data, cpu));
|
||||
__rcu_offline_cpu(this_bh_rdp, &rcu_bh_ctrlblk,
|
||||
&per_cpu(rcu_bh_data, cpu));
|
||||
put_cpu_var(rcu_data);
|
||||
put_cpu_var(rcu_bh_data);
|
||||
tasklet_kill_immediate(&per_cpu(rcu_tasklet, cpu), cpu);
|
||||
}
|
||||
|
||||
#else

/* Without CPU hotplug a cpu can never go away: nothing to migrate. */
static void rcu_offline_cpu(int cpu)
{
}

#endif
|
||||
|
||||
/*
|
||||
* This does the RCU processing work from tasklet context.
|
||||
*/
|
||||
static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp,
|
||||
struct rcu_data *rdp)
|
||||
{
|
||||
if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) {
|
||||
*rdp->donetail = rdp->curlist;
|
||||
rdp->donetail = rdp->curtail;
|
||||
rdp->curlist = NULL;
|
||||
rdp->curtail = &rdp->curlist;
|
||||
}
|
||||
|
||||
if (rdp->nxtlist && !rdp->curlist) {
|
||||
local_irq_disable();
|
||||
rdp->curlist = rdp->nxtlist;
|
||||
rdp->curtail = rdp->nxttail;
|
||||
rdp->nxtlist = NULL;
|
||||
rdp->nxttail = &rdp->nxtlist;
|
||||
local_irq_enable();
|
||||
|
||||
/*
|
||||
* start the next batch of callbacks
|
||||
*/
|
||||
|
||||
/* determine batch number */
|
||||
rdp->batch = rcp->cur + 1;
|
||||
/* see the comment and corresponding wmb() in
|
||||
* the rcu_start_batch()
|
||||
*/
|
||||
smp_rmb();
|
||||
|
||||
if (!rcp->next_pending) {
|
||||
/* and start it/schedule start if it's a new batch */
|
||||
spin_lock(&rcp->lock);
|
||||
rcp->next_pending = 1;
|
||||
rcu_start_batch(rcp);
|
||||
spin_unlock(&rcp->lock);
|
||||
}
|
||||
}
|
||||
|
||||
rcu_check_quiescent_state(rcp, rdp);
|
||||
if (rdp->donelist)
|
||||
rcu_do_batch(rdp);
|
||||
}
|
||||
|
||||
static void rcu_process_callbacks(unsigned long unused)
|
||||
{
|
||||
__rcu_process_callbacks(&rcu_ctrlblk, &__get_cpu_var(rcu_data));
|
||||
__rcu_process_callbacks(&rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data));
|
||||
}
|
||||
|
||||
static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
|
||||
{
|
||||
/* This cpu has pending rcu entries and the grace period
|
||||
* for them has completed.
|
||||
*/
|
||||
if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch))
|
||||
return 1;
|
||||
|
||||
/* This cpu has no pending entries, but there are new entries */
|
||||
if (!rdp->curlist && rdp->nxtlist)
|
||||
return 1;
|
||||
|
||||
/* This cpu has finished callbacks to invoke */
|
||||
if (rdp->donelist)
|
||||
return 1;
|
||||
|
||||
/* The rcu core waits for a quiescent state from the cpu */
|
||||
if (rdp->quiescbatch != rcp->cur || rdp->qs_pending)
|
||||
return 1;
|
||||
|
||||
/* nothing to do */
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Check to see if there is any immediate RCU-related work to be done
|
||||
* by the current CPU, returning 1 if so. This function is part of the
|
||||
* RCU implementation; it is -not- an exported member of the RCU API.
|
||||
*/
|
||||
int rcu_pending(int cpu)
|
||||
{
|
||||
return __rcu_pending(&rcu_ctrlblk, &per_cpu(rcu_data, cpu)) ||
|
||||
__rcu_pending(&rcu_bh_ctrlblk, &per_cpu(rcu_bh_data, cpu));
|
||||
}
|
||||
|
||||
/*
|
||||
* Check to see if any future RCU-related work will need to be done
|
||||
* by the current CPU, even if none need be done immediately, returning
|
||||
* 1 if so. This function is part of the RCU implementation; it is -not-
|
||||
* an exported member of the RCU API.
|
||||
*/
|
||||
int rcu_needs_cpu(int cpu)
|
||||
{
|
||||
struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
|
||||
struct rcu_data *rdp_bh = &per_cpu(rcu_bh_data, cpu);
|
||||
|
||||
return (!!rdp->curlist || !!rdp_bh->curlist || rcu_pending(cpu));
|
||||
}
|
||||
|
||||
void rcu_check_callbacks(int cpu, int user)
|
||||
{
|
||||
if (user ||
|
||||
(idle_cpu(cpu) && !in_softirq() &&
|
||||
hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
|
||||
rcu_qsctr_inc(cpu);
|
||||
rcu_bh_qsctr_inc(cpu);
|
||||
} else if (!in_softirq())
|
||||
rcu_bh_qsctr_inc(cpu);
|
||||
tasklet_schedule(&per_cpu(rcu_tasklet, cpu));
|
||||
}
|
||||
|
||||
static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp,
|
||||
struct rcu_data *rdp)
|
||||
{
|
||||
memset(rdp, 0, sizeof(*rdp));
|
||||
rdp->curtail = &rdp->curlist;
|
||||
rdp->nxttail = &rdp->nxtlist;
|
||||
rdp->donetail = &rdp->donelist;
|
||||
rdp->quiescbatch = rcp->completed;
|
||||
rdp->qs_pending = 0;
|
||||
rdp->cpu = cpu;
|
||||
rdp->blimit = blimit;
|
||||
}
|
||||
|
||||
static void __devinit rcu_online_cpu(int cpu)
{
	struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
	struct rcu_data *bh_rdp = &per_cpu(rcu_bh_data, cpu);

	/* Initialise both flavours' per-cpu state and the tasklet
	 * that will process callbacks on this cpu. */
	rcu_init_percpu_data(cpu, &rcu_ctrlblk, rdp);
	rcu_init_percpu_data(cpu, &rcu_bh_ctrlblk, bh_rdp);
	tasklet_init(&per_cpu(rcu_tasklet, cpu), rcu_process_callbacks, 0UL);
}
|
||||
|
||||
static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
				unsigned long action, void *hcpu)
{
	long cpu = (long)hcpu;

	switch (action) {
	case CPU_UP_PREPARE:
		rcu_online_cpu(cpu);
		break;
	case CPU_DEAD:
		rcu_offline_cpu(cpu);
		break;
	default:
		break;
	}
	return NOTIFY_OK;
}
|
||||
|
||||
static struct notifier_block __cpuinitdata rcu_nb = {
|
||||
.notifier_call = rcu_cpu_notify,
|
||||
};
|
||||
|
||||
/*
|
||||
* Initializes rcu mechanism. Assumed to be called early.
|
||||
* That is before local timer(SMP) or jiffie timer (uniproc) is setup.
|
||||
* Note that rcu_qsctr and friends are implicitly
|
||||
* initialized due to the choice of ``0'' for RCU_CTR_INVALID.
|
||||
*/
|
||||
void __init rcu_init(void)
{
	/* Bring up RCU on the boot cpu directly... */
	rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE,
			(void *)(long)smp_processor_id());
	/* ...and register the notifier for the non-boot cpus. */
	register_cpu_notifier(&rcu_nb);
}
|
||||
|
||||
/*
 * On-stack helper pairing an RCU callback head with a completion so a
 * caller can sleep until a grace period has elapsed.
 */
struct rcu_synchronize {
	struct rcu_head head;		/* queued via call_rcu() */
	struct completion completion;	/* signalled by wakeme_after_rcu() */
};
|
||||
|
||||
/* Because of FASTCALL declaration of complete, we use this wrapper */
|
||||
static void wakeme_after_rcu(struct rcu_head *head)
|
||||
{
|
||||
struct rcu_synchronize *rcu;
|
||||
|
||||
rcu = container_of(head, struct rcu_synchronize, head);
|
||||
complete(&rcu->completion);
|
||||
}
|
||||
|
||||
/**
 * synchronize_rcu - wait until a grace period has elapsed.
 *
 * Control will return to the caller some time after a full grace
 * period has elapsed, in other words after all currently executing RCU
 * read-side critical sections have completed. RCU read-side critical
 * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
 * and may be nested.
 *
 * If your read-side code is not protected by rcu_read_lock(), do -not-
 * use synchronize_rcu().
 */
void synchronize_rcu(void)
{
	struct rcu_synchronize rcu;	/* on-stack: we block until done */

	init_completion(&rcu.completion);
	/* Will wake me after RCU finished */
	call_rcu(&rcu.head, wakeme_after_rcu);

	/* Wait for it */
	wait_for_completion(&rcu.completion);
}
|
||||
|
||||
/*
 * Module parameters tuning RCU callback batching (blimit/qhimark/qlowmark
 * are defined earlier in this file), plus the public RCU entry points.
 */
module_param(blimit, int, 0);
module_param(qhimark, int, 0);
module_param(qlowmark, int, 0);
EXPORT_SYMBOL_GPL(rcu_batches_completed);
EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
EXPORT_SYMBOL_GPL(call_rcu);
EXPORT_SYMBOL_GPL(call_rcu_bh);
EXPORT_SYMBOL_GPL(synchronize_rcu);
|
||||
1004
kernel/rcutorture.c
Normal file
1004
kernel/rcutorture.c
Normal file
File diff suppressed because it is too large
Load Diff
1097
kernel/relay.c
Normal file
1097
kernel/relay.c
Normal file
File diff suppressed because it is too large
Load Diff
712
kernel/resource.c
Normal file
712
kernel/resource.c
Normal file
@@ -0,0 +1,712 @@
|
||||
/*
|
||||
* linux/kernel/resource.c
|
||||
*
|
||||
* Copyright (C) 1999 Linus Torvalds
|
||||
* Copyright (C) 1999 Martin Mares <mj@ucw.cz>
|
||||
*
|
||||
* Arbitrary resource management.
|
||||
*/
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/errno.h>
|
||||
#include <linux/ioport.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/spinlock.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/proc_fs.h>
|
||||
#include <linux/seq_file.h>
|
||||
#include <linux/device.h>
|
||||
#include <asm/io.h>
|
||||
|
||||
|
||||
/* Root of the I/O-port resource tree. */
struct resource ioport_resource = {
	.name	= "PCI IO",
	.start	= 0,
	.end	= IO_SPACE_LIMIT,
	.flags	= IORESOURCE_IO,
};
EXPORT_SYMBOL(ioport_resource);

/* Root of the memory-mapped I/O resource tree (.end = -1 wraps to max). */
struct resource iomem_resource = {
	.name	= "PCI mem",
	.start	= 0,
	.end	= -1,
	.flags	= IORESOURCE_MEM,
};
EXPORT_SYMBOL(iomem_resource);

/* Protects both resource trees: readers for iteration, writer for edits. */
static DEFINE_RWLOCK(resource_lock);
|
||||
|
||||
#ifdef CONFIG_PROC_FS
|
||||
|
||||
enum { MAX_IORES_LEVEL = 5 };
|
||||
|
||||
static void *r_next(struct seq_file *m, void *v, loff_t *pos)
|
||||
{
|
||||
struct resource *p = v;
|
||||
(*pos)++;
|
||||
if (p->child)
|
||||
return p->child;
|
||||
while (!p->sibling && p->parent)
|
||||
p = p->parent;
|
||||
return p->sibling;
|
||||
}
|
||||
|
||||
static void *r_start(struct seq_file *m, loff_t *pos)
|
||||
__acquires(resource_lock)
|
||||
{
|
||||
struct resource *p = m->private;
|
||||
loff_t l = 0;
|
||||
read_lock(&resource_lock);
|
||||
for (p = p->child; p && l < *pos; p = r_next(m, p, &l))
|
||||
;
|
||||
return p;
|
||||
}
|
||||
|
||||
static void r_stop(struct seq_file *m, void *v)
|
||||
__releases(resource_lock)
|
||||
{
|
||||
read_unlock(&resource_lock);
|
||||
}
|
||||
|
||||
static int r_show(struct seq_file *m, void *v)
|
||||
{
|
||||
struct resource *root = m->private;
|
||||
struct resource *r = v, *p;
|
||||
int width = root->end < 0x10000 ? 4 : 8;
|
||||
int depth;
|
||||
|
||||
for (depth = 0, p = r; depth < MAX_IORES_LEVEL; depth++, p = p->parent)
|
||||
if (p->parent == root)
|
||||
break;
|
||||
seq_printf(m, "%*s%0*llx-%0*llx : %s\n",
|
||||
depth * 2, "",
|
||||
width, (unsigned long long) r->start,
|
||||
width, (unsigned long long) r->end,
|
||||
r->name ? r->name : "<BAD>");
|
||||
return 0;
|
||||
}
|
||||
|
||||
static const struct seq_operations resource_op = {
|
||||
.start = r_start,
|
||||
.next = r_next,
|
||||
.stop = r_stop,
|
||||
.show = r_show,
|
||||
};
|
||||
|
||||
static int ioports_open(struct inode *inode, struct file *file)
|
||||
{
|
||||
int res = seq_open(file, &resource_op);
|
||||
if (!res) {
|
||||
struct seq_file *m = file->private_data;
|
||||
m->private = &ioport_resource;
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
static int iomem_open(struct inode *inode, struct file *file)
|
||||
{
|
||||
int res = seq_open(file, &resource_op);
|
||||
if (!res) {
|
||||
struct seq_file *m = file->private_data;
|
||||
m->private = &iomem_resource;
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
static const struct file_operations proc_ioports_operations = {
|
||||
.open = ioports_open,
|
||||
.read = seq_read,
|
||||
.llseek = seq_lseek,
|
||||
.release = seq_release,
|
||||
};
|
||||
|
||||
static const struct file_operations proc_iomem_operations = {
|
||||
.open = iomem_open,
|
||||
.read = seq_read,
|
||||
.llseek = seq_lseek,
|
||||
.release = seq_release,
|
||||
};
|
||||
|
||||
static int __init ioresources_init(void)
|
||||
{
|
||||
struct proc_dir_entry *entry;
|
||||
|
||||
entry = create_proc_entry("ioports", 0, NULL);
|
||||
if (entry)
|
||||
entry->proc_fops = &proc_ioports_operations;
|
||||
entry = create_proc_entry("iomem", 0, NULL);
|
||||
if (entry)
|
||||
entry->proc_fops = &proc_iomem_operations;
|
||||
return 0;
|
||||
}
|
||||
__initcall(ioresources_init);
|
||||
|
||||
#endif /* CONFIG_PROC_FS */
|
||||
|
||||
/* Return the conflict entry if you can't request it */
static struct resource * __request_resource(struct resource *root, struct resource *new)
{
	resource_size_t start = new->start;
	resource_size_t end = new->end;
	struct resource *tmp, **p;

	/* Reject inverted ranges and ranges outside the root; returning
	 * root itself signals "can never fit" to callers. */
	if (end < start)
		return root;
	if (start < root->start)
		return root;
	if (end > root->end)
		return root;
	/* Walk the sorted child list; insert before the first child that
	 * starts past our end, or report the overlapping child. */
	p = &root->child;
	for (;;) {
		tmp = *p;
		if (!tmp || tmp->start > end) {
			new->sibling = tmp;
			*p = new;
			new->parent = root;
			return NULL;	/* success */
		}
		p = &tmp->sibling;
		if (tmp->end < start)
			continue;	/* entirely before us: keep walking */
		return tmp;		/* overlap: conflict */
	}
}
|
||||
|
||||
static int __release_resource(struct resource *old)
|
||||
{
|
||||
struct resource *tmp, **p;
|
||||
|
||||
p = &old->parent->child;
|
||||
for (;;) {
|
||||
tmp = *p;
|
||||
if (!tmp)
|
||||
break;
|
||||
if (tmp == old) {
|
||||
*p = tmp->sibling;
|
||||
old->parent = NULL;
|
||||
return 0;
|
||||
}
|
||||
p = &tmp->sibling;
|
||||
}
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
/**
 * request_resource - request and reserve an I/O or memory resource
 * @root: root resource descriptor
 * @new: resource descriptor desired by caller
 *
 * Returns 0 for success, negative error code on error.
 */
int request_resource(struct resource *root, struct resource *new)
{
	struct resource *conflict;

	write_lock(&resource_lock);
	conflict = __request_resource(root, new);
	write_unlock(&resource_lock);
	/* The conflicting entry itself is discarded; use
	 * ____request_resource() if the caller needs it. */
	return conflict ? -EBUSY : 0;
}

EXPORT_SYMBOL(request_resource);
|
||||
|
||||
/**
 * ____request_resource - reserve a resource, with resource conflict returned
 * @root: root resource descriptor
 * @new: resource descriptor desired by caller
 *
 * Returns:
 * On success, NULL is returned.
 * On error, a pointer to the conflicting resource is returned.
 */
struct resource *____request_resource(struct resource *root, struct resource *new)
{
	struct resource *conflict;

	write_lock(&resource_lock);
	conflict = __request_resource(root, new);
	write_unlock(&resource_lock);
	return conflict;
}

EXPORT_SYMBOL(____request_resource);
|
||||
|
||||
/**
 * release_resource - release a previously reserved resource
 * @old: resource pointer
 *
 * Returns 0 on success, -EINVAL if @old was not found in its
 * parent's child list.
 */
int release_resource(struct resource *old)
{
	int retval;

	write_lock(&resource_lock);
	retval = __release_resource(old);
	write_unlock(&resource_lock);
	return retval;
}

EXPORT_SYMBOL(release_resource);
|
||||
|
||||
#ifdef CONFIG_MEMORY_HOTPLUG
|
||||
/*
|
||||
* Finds the lowest memory reosurce exists within [res->start.res->end)
|
||||
* the caller must specify res->start, res->end, res->flags.
|
||||
* If found, returns 0, res is overwritten, if not found, returns -1.
|
||||
*/
|
||||
int find_next_system_ram(struct resource *res)
|
||||
{
|
||||
resource_size_t start, end;
|
||||
struct resource *p;
|
||||
|
||||
BUG_ON(!res);
|
||||
|
||||
start = res->start;
|
||||
end = res->end;
|
||||
BUG_ON(start >= end);
|
||||
|
||||
read_lock(&resource_lock);
|
||||
for (p = iomem_resource.child; p ; p = p->sibling) {
|
||||
/* system ram is just marked as IORESOURCE_MEM */
|
||||
if (p->flags != res->flags)
|
||||
continue;
|
||||
if (p->start > end) {
|
||||
p = NULL;
|
||||
break;
|
||||
}
|
||||
if ((p->end >= start) && (p->start < end))
|
||||
break;
|
||||
}
|
||||
read_unlock(&resource_lock);
|
||||
if (!p)
|
||||
return -1;
|
||||
/* copy data */
|
||||
if (res->start < p->start)
|
||||
res->start = p->start;
|
||||
if (res->end > p->end)
|
||||
res->end = p->end;
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
/*
 * Find empty slot in the resource tree given range and alignment.
 * On success *new is filled in with the chosen [start,end] and 0 is
 * returned; -EBUSY if no gap fits.  Caller holds resource_lock.
 */
static int find_resource(struct resource *root, struct resource *new,
			 resource_size_t size, resource_size_t min,
			 resource_size_t max, resource_size_t align,
			 void (*alignf)(void *, struct resource *,
					resource_size_t, resource_size_t),
			 void *alignf_data)
{
	struct resource *this = root->child;

	new->start = root->start;
	/*
	 * Skip past an allocated resource that starts at 0, since the assignment
	 * of this->start - 1 to new->end below would cause an underflow.
	 */
	if (this && this->start == 0) {
		new->start = this->end + 1;
		this = this->sibling;
	}
	/* Try each gap between consecutive children (and after the last). */
	for(;;) {
		if (this)
			new->end = this->start - 1;
		else
			new->end = root->end;
		/* Clamp the candidate gap to [min,max] and align it. */
		if (new->start < min)
			new->start = min;
		if (new->end > max)
			new->end = max;
		new->start = ALIGN(new->start, align);
		if (alignf)
			alignf(alignf_data, new, size, align);
		if (new->start < new->end && new->end - new->start >= size - 1) {
			new->end = new->start + size - 1;
			return 0;
		}
		if (!this)
			break;
		/* Gap too small: move past this child and try the next gap. */
		new->start = this->end + 1;
		this = this->sibling;
	}
	return -EBUSY;
}
|
||||
|
||||
/**
 * allocate_resource - allocate empty slot in the resource tree given range & alignment
 * @root: root resource descriptor
 * @new: resource descriptor desired by caller
 * @size: requested resource region size
 * @min: minimum boundary to allocate within (address, not a size)
 * @max: maximum boundary to allocate within (address, not a size)
 * @align: alignment requested, in bytes
 * @alignf: alignment function, optional, called if not NULL
 * @alignf_data: arbitrary data to pass to the @alignf function
 *
 * Returns 0 on success with @new filled in and inserted under @root,
 * -EBUSY if no suitable gap exists.
 */
int allocate_resource(struct resource *root, struct resource *new,
		      resource_size_t size, resource_size_t min,
		      resource_size_t max, resource_size_t align,
		      void (*alignf)(void *, struct resource *,
				     resource_size_t, resource_size_t),
		      void *alignf_data)
{
	int err;

	write_lock(&resource_lock);
	err = find_resource(root, new, size, min, max, align, alignf, alignf_data);
	if (err >= 0 && __request_resource(root, new))
		err = -EBUSY;
	write_unlock(&resource_lock);
	return err;
}

EXPORT_SYMBOL(allocate_resource);
|
||||
|
||||
/**
 * insert_resource - Inserts a resource in the resource tree
 * @parent: parent of the new resource
 * @new: new resource to insert
 *
 * Returns 0 on success, -EBUSY if the resource can't be inserted.
 *
 * This function is equivalent to request_resource when no conflict
 * happens. If a conflict happens, and the conflicting resources
 * entirely fit within the range of the new resource, then the new
 * resource is inserted and the conflicting resources become children of
 * the new resource.
 */
int insert_resource(struct resource *parent, struct resource *new)
{
	int result;
	struct resource *first, *next;

	write_lock(&resource_lock);

	/* Descend until we either insert cleanly or find the conflict
	 * that new must absorb as children. */
	for (;; parent = first) {
		result = 0;
		first = __request_resource(parent, new);
		if (!first)
			goto out;	/* no conflict: inserted */

		result = -EBUSY;
		if (first == parent)
			goto out;	/* new doesn't fit inside parent */

		if ((first->start > new->start) || (first->end < new->end))
			break;
		if ((first->start == new->start) && (first->end == new->end))
			break;
	}

	/* Collect the run of siblings that new will swallow; any partial
	 * overlap makes the insertion impossible. */
	for (next = first; ; next = next->sibling) {
		/* Partial overlap? Bad, and unfixable */
		if (next->start < new->start || next->end > new->end)
			goto out;
		if (!next->sibling)
			break;
		if (next->sibling->start > new->end)
			break;
	}

	result = 0;

	/* Splice new into parent's child list, re-parenting first..next. */
	new->parent = parent;
	new->sibling = next->sibling;
	new->child = first;

	next->sibling = NULL;
	for (next = first; next; next = next->sibling)
		next->parent = new;

	if (parent->child == first) {
		parent->child = new;
	} else {
		next = parent->child;
		while (next->sibling != first)
			next = next->sibling;
		next->sibling = new;
	}

 out:
	write_unlock(&resource_lock);
	return result;
}
|
||||
|
||||
/**
 * adjust_resource - modify a resource's start and size
 * @res: resource to modify
 * @start: new start value
 * @size: new size
 *
 * Given an existing resource, change its start and size to match the
 * arguments.  Returns 0 on success, -EBUSY if it can't fit.
 * Existing children of the resource are assumed to be immutable.
 */
int adjust_resource(struct resource *res, resource_size_t start, resource_size_t size)
{
	struct resource *tmp, *parent = res->parent;
	resource_size_t end = start + size - 1;
	int result = -EBUSY;

	write_lock(&resource_lock);

	/* New range must stay within the parent ... */
	if ((start < parent->start) || (end > parent->end))
		goto out;

	/* ... still cover every existing child ... */
	for (tmp = res->child; tmp; tmp = tmp->sibling) {
		if ((tmp->start < start) || (tmp->end > end))
			goto out;
	}

	/* ... and not run into the following sibling ... */
	if (res->sibling && (res->sibling->start <= end))
		goto out;

	/* ... nor back into the preceding sibling. */
	tmp = parent->child;
	if (tmp != res) {
		while (tmp->sibling != res)
			tmp = tmp->sibling;
		if (start <= tmp->end)
			goto out;
	}

	res->start = start;
	res->end = end;
	result = 0;

 out:
	write_unlock(&resource_lock);
	return result;
}

EXPORT_SYMBOL(adjust_resource);
|
||||
|
||||
/*
 * This is compatibility stuff for IO resources.
 *
 * Note how this, unlike the above, knows about
 * the IO flag meanings (busy etc).
 *
 * request_region creates a new busy region.
 *
 * check_region returns non-zero if the area is already busy.
 *
 * release_region releases a matching busy region.
 */

/**
 * __request_region - create a new busy resource region
 * @parent: parent resource descriptor
 * @start: resource start address
 * @n: resource region size
 * @name: reserving caller's ID string
 *
 * Returns the new resource on success, NULL on allocation failure or
 * if a busy conflict exists.  @name is referenced, not copied; the
 * caller must keep it alive as long as the region is held.
 */
struct resource * __request_region(struct resource *parent,
				   resource_size_t start, resource_size_t n,
				   const char *name)
{
	struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL);

	if (res) {
		res->name = name;
		res->start = start;
		res->end = start + n - 1;
		res->flags = IORESOURCE_BUSY;

		write_lock(&resource_lock);

		for (;;) {
			struct resource *conflict;

			conflict = __request_resource(parent, res);
			if (!conflict)
				break;
			if (conflict != parent) {
				parent = conflict;
				/* Non-busy conflicts are containers:
				 * retry inside them. */
				if (!(conflict->flags & IORESOURCE_BUSY))
					continue;
			}

			/* Uhhuh, that didn't work out.. */
			kfree(res);
			res = NULL;
			break;
		}
		write_unlock(&resource_lock);
	}
	return res;
}
EXPORT_SYMBOL(__request_region);
|
||||
|
||||
/**
 * __check_region - check if a resource region is busy or free
 * @parent: parent resource descriptor
 * @start: resource start address
 * @n: resource region size
 *
 * Returns 0 if the region is free at the moment it is checked,
 * returns %-EBUSY if the region is busy.
 *
 * NOTE:
 * This function is deprecated because its use is racy.
 * Even if it returns 0, a subsequent call to request_region()
 * may fail because another driver etc. just allocated the region.
 * Do NOT use it.  It will be removed from the kernel.
 */
int __check_region(struct resource *parent, resource_size_t start,
			resource_size_t n)
{
	struct resource * res;

	/* Probe by briefly requesting and immediately releasing. */
	res = __request_region(parent, start, n, "check-region");
	if (!res)
		return -EBUSY;

	release_resource(res);
	kfree(res);
	return 0;
}
EXPORT_SYMBOL(__check_region);
|
||||
|
||||
/**
 * __release_region - release a previously reserved resource region
 * @parent: parent resource descriptor
 * @start: resource start address
 * @n: resource region size
 *
 * The described resource region must match a currently busy region
 * exactly; the matching resource is unlinked and freed.  A warning is
 * printed if no such region exists.
 */
void __release_region(struct resource *parent, resource_size_t start,
			resource_size_t n)
{
	struct resource **p;
	resource_size_t end;

	p = &parent->child;
	end = start + n - 1;

	write_lock(&resource_lock);

	for (;;) {
		struct resource *res = *p;

		if (!res)
			break;
		if (res->start <= start && res->end >= end) {
			/* Non-busy resources are containers: descend. */
			if (!(res->flags & IORESOURCE_BUSY)) {
				p = &res->child;
				continue;
			}
			if (res->start != start || res->end != end)
				break;	/* busy but not an exact match */
			*p = res->sibling;
			write_unlock(&resource_lock);
			kfree(res);
			return;
		}
		p = &res->sibling;
	}

	write_unlock(&resource_lock);

	printk(KERN_WARNING "Trying to free nonexistent resource "
		"<%016llx-%016llx>\n", (unsigned long long)start,
		(unsigned long long)end);
}
EXPORT_SYMBOL(__release_region);
|
||||
|
||||
/*
 * Managed region resource: devres-backed request_region variants whose
 * regions are released automatically when the owning device is unbound.
 */
struct region_devres {
	struct resource *parent;	/* tree the region was taken from */
	resource_size_t start;
	resource_size_t n;
};

/* devres release hook: undo the __request_region(). */
static void devm_region_release(struct device *dev, void *res)
{
	struct region_devres *this = res;

	__release_region(this->parent, this->start, this->n);
}

/* devres match hook: identify a region by (parent, start, n). */
static int devm_region_match(struct device *dev, void *res, void *match_data)
{
	struct region_devres *this = res, *match = match_data;

	return this->parent == match->parent &&
		this->start == match->start && this->n == match->n;
}

/*
 * Managed __request_region(): on success the region is registered with
 * @dev's devres list so it is released automatically on driver detach.
 */
struct resource * __devm_request_region(struct device *dev,
				struct resource *parent, resource_size_t start,
				resource_size_t n, const char *name)
{
	struct region_devres *dr = NULL;
	struct resource *res;

	dr = devres_alloc(devm_region_release, sizeof(struct region_devres),
			  GFP_KERNEL);
	if (!dr)
		return NULL;

	dr->parent = parent;
	dr->start = start;
	dr->n = n;

	res = __request_region(parent, start, n, name);
	if (res)
		devres_add(dev, dr);
	else
		devres_free(dr);

	return res;
}
EXPORT_SYMBOL(__devm_request_region);

/*
 * Managed counterpart of __release_region(): release the region and
 * drop the matching devres entry (warn if none was registered).
 */
void __devm_release_region(struct device *dev, struct resource *parent,
			   resource_size_t start, resource_size_t n)
{
	struct region_devres match_data = { parent, start, n };

	__release_region(parent, start, n);
	WARN_ON(devres_destroy(dev, devm_region_release, devm_region_match,
			       &match_data));
}
EXPORT_SYMBOL(__devm_release_region);
|
||||
|
||||
/*
 * Called from init/main.c to reserve IO ports.
 * Parses "reserve=start,count[,start,count...]" from the command line
 * and marks up to MAXRESERVE regions busy.
 */
#define MAXRESERVE 4
static int __init reserve_setup(char *str)
{
	static int reserved;
	static struct resource reserve[MAXRESERVE];

	for (;;) {
		int io_start, io_num;
		int x = reserved;

		/* get_option() returns 2 when a comma follows, i.e. a
		 * second value (the count) is present. */
		if (get_option (&str, &io_start) != 2)
			break;
		if (get_option (&str, &io_num)   == 0)
			break;
		if (x < MAXRESERVE) {
			struct resource *res = reserve + x;
			res->name = "reserved";
			res->start = io_start;
			res->end = io_start + io_num - 1;
			res->flags = IORESOURCE_BUSY;
			res->child = NULL;
			/* Addresses >= 64K can't be I/O ports: treat as
			 * memory. */
			if (request_resource(res->start >= 0x10000 ? &iomem_resource : &ioport_resource, res) == 0)
				reserved = x+1;
		}
	}
	return 1;
}

__setup("reserve=", reserve_setup);
|
||||
241
kernel/rtmutex-debug.c
Normal file
241
kernel/rtmutex-debug.c
Normal file
@@ -0,0 +1,241 @@
|
||||
/*
|
||||
* RT-Mutexes: blocking mutual exclusion locks with PI support
|
||||
*
|
||||
* started by Ingo Molnar and Thomas Gleixner:
|
||||
*
|
||||
* Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
|
||||
* Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
|
||||
*
|
||||
* This code is based on the rt.c implementation in the preempt-rt tree.
|
||||
* Portions of said code are
|
||||
*
|
||||
* Copyright (C) 2004 LynuxWorks, Inc., Igor Manyilov, Bill Huey
|
||||
* Copyright (C) 2006 Esben Nielsen
|
||||
* Copyright (C) 2006 Kihon Technologies Inc.,
|
||||
* Steven Rostedt <rostedt@goodmis.org>
|
||||
*
|
||||
* See rt.c in preempt-rt for proper credits and further information
|
||||
*/
|
||||
#include <linux/sched.h>
|
||||
#include <linux/delay.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/spinlock.h>
|
||||
#include <linux/kallsyms.h>
|
||||
#include <linux/syscalls.h>
|
||||
#include <linux/interrupt.h>
|
||||
#include <linux/plist.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/debug_locks.h>
|
||||
|
||||
#include "rtmutex_common.h"
|
||||
|
||||
#ifdef CONFIG_DEBUG_RT_MUTEXES
|
||||
# include "rtmutex-debug.h"
|
||||
#else
|
||||
# include "rtmutex.h"
|
||||
#endif
|
||||
|
||||
/*
 * Debug trace helpers.  TRACE_OFF() disables further rt-mutex tracing
 * after the first detected problem, makes the console verbose, and
 * drops current's pi_lock if held so the error path can report safely.
 *
 * Note: the `&current` references below were restored from HTML-entity
 * corruption ("&curren" had been decoded to the currency sign).
 */
# define TRACE_WARN_ON(x)			WARN_ON(x)
# define TRACE_BUG_ON(x)			BUG_ON(x)

# define TRACE_OFF()						\
do {								\
	if (rt_trace_on) {					\
		rt_trace_on = 0;				\
		console_verbose();				\
		if (spin_is_locked(&current->pi_lock))		\
			spin_unlock(&current->pi_lock);		\
	}							\
} while (0)

/* Variant for contexts where pi_lock is known not to be held. */
# define TRACE_OFF_NOLOCK()					\
do {								\
	if (rt_trace_on) {					\
		rt_trace_on = 0;				\
		console_verbose();				\
	}							\
} while (0)

# define TRACE_BUG_LOCKED()			\
do {						\
	TRACE_OFF();				\
	BUG();					\
} while (0)

# define TRACE_WARN_ON_LOCKED(c)		\
do {						\
	if (unlikely(c)) {			\
		TRACE_OFF();			\
		WARN_ON(1);			\
	}					\
} while (0)

# define TRACE_BUG_ON_LOCKED(c)			\
do {						\
	if (unlikely(c))			\
		TRACE_BUG_LOCKED();		\
} while (0)

#ifdef CONFIG_SMP
# define SMP_TRACE_BUG_ON_LOCKED(c)	TRACE_BUG_ON_LOCKED(c)
#else
# define SMP_TRACE_BUG_ON_LOCKED(c)	do { } while (0)
#endif
|
||||
|
||||
/*
 * deadlock detection flag. We turn it off when we detect
 * the first problem because we dont want to recurse back
 * into the tracing code when doing error printk or
 * executing a BUG():
 */
int rt_trace_on = 1;

/* Externally-callable switch to disable deadlock tracing. */
void deadlock_trace_off(void)
{
	rt_trace_on = 0;
}
|
||||
|
||||
static void printk_task(struct task_struct *p)
|
||||
{
|
||||
if (p)
|
||||
printk("%16s:%5d [%p, %3d]", p->comm, p->pid, p, p->prio);
|
||||
else
|
||||
printk("<none>");
|
||||
}
|
||||
|
||||
/*
 * Print an identification of @lock: its name if one was given at init
 * time, otherwise the file:line where it was defined; optionally also
 * the current owner task.
 */
static void printk_lock(struct rt_mutex *lock, int print_owner)
{
	if (lock->name)
		printk(" [%p] {%s}\n",
			lock, lock->name);
	else
		printk(" [%p] {%s:%d}\n",
			lock, lock->file, lock->line);

	if (print_owner && rt_mutex_owner(lock)) {
		printk(".. ->owner: %p\n", lock->owner);
		printk(".. held by:  ");
		printk_task(rt_mutex_owner(lock));
		printk("\n");
	}
}
|
||||
|
||||
/*
 * Sanity check on task teardown: a dying task must not still have PI
 * waiters queued on it nor be blocked on an rt-mutex.
 */
void rt_mutex_debug_task_free(struct task_struct *task)
{
	WARN_ON(!plist_head_empty(&task->pi_waiters));
	WARN_ON(task->pi_blocked_on);
}
|
||||
|
||||
/*
 * We fill out the fields in the waiter to store the information about
 * the deadlock. We print when we return. act_waiter can be NULL in
 * case of a remove waiter operation.
 */
void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *act_waiter,
			     struct rt_mutex *lock)
{
	struct task_struct *task;

	/* Only record when tracing is on, detection is off and we have
	 * an active waiter to annotate. */
	if (!rt_trace_on || detect || !act_waiter)
		return;

	task = rt_mutex_owner(act_waiter->lock);
	if (task && task != current) {
		act_waiter->deadlock_task_pid = task->pid;
		act_waiter->deadlock_lock = lock;
	}
}
|
||||
|
||||
/*
 * Report a deadlock previously recorded by debug_rt_mutex_deadlock():
 * dump both locks, both tasks' held locks and stacks, then disable
 * tracing and local interrupts to freeze the scene.
 */
void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter)
{
	struct task_struct *task;

	if (!waiter->deadlock_lock || !rt_trace_on)
		return;

	task = find_task_by_pid(waiter->deadlock_task_pid);
	if (!task)
		return;

	/* pi_lock is not held here, so use the NOLOCK variant. */
	TRACE_OFF_NOLOCK();

	printk("\n============================================\n");
	printk(  "[ BUG: circular locking deadlock detected! ]\n");
	printk(  "--------------------------------------------\n");
	printk("%s/%d is deadlocking current task %s/%d\n\n",
	       task->comm, task->pid, current->comm, current->pid);

	printk("\n1) %s/%d is trying to acquire this lock:\n",
	       current->comm, current->pid);
	printk_lock(waiter->lock, 1);

	printk("\n2) %s/%d is blocked on this lock:\n", task->comm, task->pid);
	printk_lock(waiter->deadlock_lock, 1);

	debug_show_held_locks(current);
	debug_show_held_locks(task);

	printk("\n%s/%d's [blocked] stackdump:\n\n", task->comm, task->pid);
	show_stack(task, NULL);
	printk("\n%s/%d's [current] stackdump:\n\n",
	       current->comm, current->pid);
	dump_stack();
	debug_show_all_locks();

	printk("[ turning off deadlock detection."
	       "Please report this trace. ]\n\n");
	local_irq_disable();
}
|
||||
|
||||
/* Debug hook on lock acquisition: nothing to check here. */
void debug_rt_mutex_lock(struct rt_mutex *lock)
{
}

/* Debug hook on unlock: only the owner may release the lock. */
void debug_rt_mutex_unlock(struct rt_mutex *lock)
{
	TRACE_WARN_ON_LOCKED(rt_mutex_owner(lock) != current);
}

/* Debug hook on proxy-lock (locking on behalf of @powner): no checks. */
void
debug_rt_mutex_proxy_lock(struct rt_mutex *lock, struct task_struct *powner)
{
}

/* Debug hook on proxy-unlock: the lock must currently have an owner. */
void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock)
{
	TRACE_WARN_ON_LOCKED(!rt_mutex_owner(lock));
}

/*
 * Poison a fresh waiter with 0x11 so stale reads are recognizable,
 * then initialize its plist nodes at lowest priority.
 */
void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
{
	memset(waiter, 0x11, sizeof(*waiter));
	plist_node_init(&waiter->list_entry, MAX_PRIO);
	plist_node_init(&waiter->pi_list_entry, MAX_PRIO);
}

/*
 * Check the waiter is fully detached, then poison it with 0x22 so
 * use-after-free is distinguishable from use-before-init (0x11).
 */
void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter)
{
	TRACE_WARN_ON(!plist_node_empty(&waiter->list_entry));
	TRACE_WARN_ON(!plist_node_empty(&waiter->pi_list_entry));
	TRACE_WARN_ON(waiter->task);
	memset(waiter, 0x22, sizeof(*waiter));
}

void debug_rt_mutex_init(struct rt_mutex *lock, const char *name)
{
	/*
	 * Make sure we are not reinitializing a held lock:
	 */
	debug_check_no_locks_freed((void *)lock, sizeof(*lock));
	lock->name = name;
}

/* Deadlock accounting hooks: no-ops in this configuration. */
void
rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task)
{
}

void rt_mutex_deadlock_account_unlock(struct task_struct *task)
{
}
|
||||
|
||||
33
kernel/rtmutex-debug.h
Normal file
33
kernel/rtmutex-debug.h
Normal file
@@ -0,0 +1,33 @@
|
||||
/*
 * RT-Mutexes: blocking mutual exclusion locks with PI support
 *
 * started by Ingo Molnar and Thomas Gleixner:
 *
 *  Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
 *  Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
 *
 * This file contains macros used solely by rtmutex.c. Debug version.
 */

/* Debug hook declarations implemented in rtmutex-debug.c. */
extern void
rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task);
extern void rt_mutex_deadlock_account_unlock(struct task_struct *task);
extern void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter);
extern void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter);
extern void debug_rt_mutex_init(struct rt_mutex *lock, const char *name);
extern void debug_rt_mutex_lock(struct rt_mutex *lock);
extern void debug_rt_mutex_unlock(struct rt_mutex *lock);
extern void debug_rt_mutex_proxy_lock(struct rt_mutex *lock,
				      struct task_struct *powner);
extern void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock);
extern void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *waiter,
				    struct rt_mutex *lock);
extern void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter);
/* Clear recorded deadlock info when a waiter is reused. */
# define debug_rt_mutex_reset_waiter(w)			\
	do { (w)->deadlock_lock = NULL; } while (0)

/* Debug build: always run full deadlock detection when there is a waiter. */
static inline int debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter,
						 int detect)
{
	return (waiter != NULL);
}
|
||||
441
kernel/rtmutex-tester.c
Normal file
441
kernel/rtmutex-tester.c
Normal file
@@ -0,0 +1,441 @@
|
||||
/*
|
||||
* RT-Mutex-tester: scriptable tester for rt mutexes
|
||||
*
|
||||
* started by Thomas Gleixner:
|
||||
*
|
||||
* Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
|
||||
*
|
||||
*/
|
||||
#include <linux/kthread.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/sched.h>
|
||||
#include <linux/smp_lock.h>
|
||||
#include <linux/spinlock.h>
|
||||
#include <linux/sysdev.h>
|
||||
#include <linux/timer.h>
|
||||
#include <linux/freezer.h>
|
||||
|
||||
#include "rtmutex.h"
|
||||
|
||||
/* Size limits of the scriptable tester: up to 8 threads and 8 mutexes. */
#define MAX_RT_TEST_THREADS 8
#define MAX_RT_TEST_MUTEXES 8

/* Serializes the status read-out in sysfs_test_status(). */
static spinlock_t rttest_lock;
/* Global event sequence counter, bumped around each lock/unlock step. */
static atomic_t rttest_event;

/* Per test-thread state, exposed through one sysdev node per thread. */
struct test_thread_data {
	int opcode;			/* command to run (enum test_opcodes); <= 0 means idle/last result */
	int opdata;			/* command argument: lock index, priority, thread id, ... */
	int mutexes[MAX_RT_TEST_MUTEXES]; /* per-mutex progress state (0 = free ... 4 = held, see handle_op) */
	int bkl;			/* BKL progress state (0 = free, 1 = acquiring, 4 = held) */
	int event;			/* last value sampled from rttest_event */
	struct sys_device sysdev;	/* sysfs handle; sysdev.id is the thread index */
};

static struct test_thread_data thread_data[MAX_RT_TEST_THREADS];
static struct task_struct *threads[MAX_RT_TEST_THREADS];
static struct rt_mutex mutexes[MAX_RT_TEST_MUTEXES];
|
||||
|
||||
/*
 * Commands accepted by the tester, written via sysfs as "opcode:data".
 */
enum test_opcodes {
	RTTEST_NOP = 0,
	RTTEST_SCHEDOT,		/* 1 Sched other, data = nice */
	RTTEST_SCHEDRT,		/* 2 Sched fifo, data = prio */
	RTTEST_LOCK,		/* 3 Lock uninterruptible, data = lockindex */
	RTTEST_LOCKNOWAIT,	/* 4 Lock uninterruptible no wait in wakeup, data = lockindex */
	RTTEST_LOCKINT,		/* 5 Lock interruptible, data = lockindex */
	RTTEST_LOCKINTNOWAIT,	/* 6 Lock interruptible no wait in wakeup, data = lockindex */
	RTTEST_LOCKCONT,	/* 7 Continue locking after the wakeup delay */
	RTTEST_UNLOCK,		/* 8 Unlock, data = lockindex */
	RTTEST_LOCKBKL,		/* 9 Lock BKL */
	RTTEST_UNLOCKBKL,	/* 10 Unlock BKL */
	RTTEST_SIGNAL,		/* 11 Signal other test thread, data = thread id */
	RTTEST_RESETEVENT = 98,	/* 98 Reset event counter */
	RTTEST_RESET = 99,	/* 99 Reset all pending operations */
};
|
||||
|
||||
/*
 * Execute one tester command for thread @td.
 *
 * @lockwakeup != 0 means we are called from the wakeup path inside
 * schedule_rt_mutex_test(); only the commands of the first switch are
 * allowed in that context.  Returns 0 on success, -EINVAL for unknown
 * or invalid commands, -EINTR when an interruptible lock attempt was
 * interrupted.
 */
static int handle_op(struct test_thread_data *td, int lockwakeup)
{
	int i, id, ret = -EINVAL;

	/* Commands that are legal in any context: */
	switch(td->opcode) {

	case RTTEST_NOP:
		return 0;

	case RTTEST_LOCKCONT:
		/* Re-arm the lock state so the blocked thread proceeds. */
		td->mutexes[td->opdata] = 1;
		td->event = atomic_add_return(1, &rttest_event);
		return 0;

	case RTTEST_RESET:
		/* Drop every mutex this thread still holds (state 4). */
		for (i = 0; i < MAX_RT_TEST_MUTEXES; i++) {
			if (td->mutexes[i] == 4) {
				rt_mutex_unlock(&mutexes[i]);
				td->mutexes[i] = 0;
			}
		}

		/* Release a held BKL, but not from the wakeup path. */
		if (!lockwakeup && td->bkl == 4) {
			unlock_kernel();
			td->bkl = 0;
		}
		return 0;

	case RTTEST_RESETEVENT:
		atomic_set(&rttest_event, 0);
		return 0;

	default:
		/* Everything below is invalid from the wakeup path. */
		if (lockwakeup)
			return ret;
	}

	/* Commands that may block: */
	switch(td->opcode) {

	case RTTEST_LOCK:
	case RTTEST_LOCKNOWAIT:
		id = td->opdata;
		if (id < 0 || id >= MAX_RT_TEST_MUTEXES)
			return ret;

		/* 1 = acquiring, 4 = held; events bracket the lock call. */
		td->mutexes[id] = 1;
		td->event = atomic_add_return(1, &rttest_event);
		rt_mutex_lock(&mutexes[id]);
		td->event = atomic_add_return(1, &rttest_event);
		td->mutexes[id] = 4;
		return 0;

	case RTTEST_LOCKINT:
	case RTTEST_LOCKINTNOWAIT:
		id = td->opdata;
		if (id < 0 || id >= MAX_RT_TEST_MUTEXES)
			return ret;

		td->mutexes[id] = 1;
		td->event = atomic_add_return(1, &rttest_event);
		ret = rt_mutex_lock_interruptible(&mutexes[id], 0);
		td->event = atomic_add_return(1, &rttest_event);
		/* Interrupted attempts fall back to the free state. */
		td->mutexes[id] = ret ? 0 : 4;
		return ret ? -EINTR : 0;

	case RTTEST_UNLOCK:
		id = td->opdata;
		/* Only a mutex we actually hold (state 4) may be unlocked. */
		if (id < 0 || id >= MAX_RT_TEST_MUTEXES || td->mutexes[id] != 4)
			return ret;

		td->event = atomic_add_return(1, &rttest_event);
		rt_mutex_unlock(&mutexes[id]);
		td->event = atomic_add_return(1, &rttest_event);
		td->mutexes[id] = 0;
		return 0;

	case RTTEST_LOCKBKL:
		if (td->bkl)
			return 0;
		td->bkl = 1;
		lock_kernel();
		td->bkl = 4;
		return 0;

	case RTTEST_UNLOCKBKL:
		if (td->bkl != 4)
			break;
		unlock_kernel();
		td->bkl = 0;
		return 0;

	default:
		break;
	}
	return ret;
}
|
||||
|
||||
/*
 * Schedule replacement for rtsem_down(). Only called for threads with
 * PF_MUTEX_TESTER set.
 *
 * This allows us to have finegrained control over the event flow.
 *
 * The mutex state codes used below: 1 = acquiring, 2 = blocked in
 * schedule, 3 = blocked and parked waiting for RTTEST_LOCKCONT.
 */
void schedule_rt_mutex_test(struct rt_mutex *mutex)
{
	int tid, op, dat;
	struct test_thread_data *td;

	/* We have to lookup the task */
	for (tid = 0; tid < MAX_RT_TEST_THREADS; tid++) {
		if (threads[tid] == current)
			break;
	}

	BUG_ON(tid == MAX_RT_TEST_THREADS);

	td = &thread_data[tid];

	op = td->opcode;
	dat = td->opdata;

	/* Before blocking: advance state 1 -> 2 for the lock we block on. */
	switch (op) {
	case RTTEST_LOCK:
	case RTTEST_LOCKINT:
	case RTTEST_LOCKNOWAIT:
	case RTTEST_LOCKINTNOWAIT:
		if (mutex != &mutexes[dat])
			break;

		if (td->mutexes[dat] != 1)
			break;

		td->mutexes[dat] = 2;
		td->event = atomic_add_return(1, &rttest_event);
		break;

	case RTTEST_LOCKBKL:
	default:
		break;
	}

	schedule();

	/* After wakeup: decide whether to park (3) or resume (1). */
	switch (op) {
	case RTTEST_LOCK:
	case RTTEST_LOCKINT:
		if (mutex != &mutexes[dat])
			return;

		if (td->mutexes[dat] != 2)
			return;

		td->mutexes[dat] = 3;
		td->event = atomic_add_return(1, &rttest_event);
		break;

	case RTTEST_LOCKNOWAIT:
	case RTTEST_LOCKINTNOWAIT:
		if (mutex != &mutexes[dat])
			return;

		if (td->mutexes[dat] != 2)
			return;

		/* No wait in wakeup: go straight back to acquiring. */
		td->mutexes[dat] = 1;
		td->event = atomic_add_return(1, &rttest_event);
		return;

	case RTTEST_LOCKBKL:
		return;
	default:
		return;
	}

	td->opcode = 0;

	/*
	 * Parked: keep servicing wakeup-safe commands until
	 * RTTEST_LOCKCONT releases us.
	 */
	for (;;) {
		set_current_state(TASK_INTERRUPTIBLE);

		if (td->opcode > 0) {
			int ret;

			set_current_state(TASK_RUNNING);
			ret = handle_op(td, 1);
			set_current_state(TASK_INTERRUPTIBLE);
			if (td->opcode == RTTEST_LOCKCONT)
				break;
			td->opcode = ret;
		}

		/* Wait for the next command to be executed */
		schedule();
	}

	/* Restore previous command and data */
	td->opcode = op;
	td->opdata = dat;
}
|
||||
|
||||
/*
 * Main loop of a tester kthread: sleep until a command is posted via
 * sysfs, execute it with handle_op(), publish the result in td->opcode
 * and go back to sleep.  SIGHUP (sent via RTTEST_SIGNAL) is allowed so
 * interruptible lock attempts can be interrupted.
 */
static int test_func(void *data)
{
	struct test_thread_data *td = data;
	int ret;

	/* Route this thread's schedule() through schedule_rt_mutex_test. */
	current->flags |= PF_MUTEX_TESTER;
	allow_signal(SIGHUP);

	for(;;) {

		set_current_state(TASK_INTERRUPTIBLE);

		if (td->opcode > 0) {
			set_current_state(TASK_RUNNING);
			ret = handle_op(td, 0);
			set_current_state(TASK_INTERRUPTIBLE);
			/* Publish the result (<= 0) so sysfs can poll it. */
			td->opcode = ret;
		}

		/* Wait for the next command to be executed */
		schedule();
		try_to_freeze();

		/* Discard the SIGHUPs used to interrupt lock waits. */
		if (signal_pending(current))
			flush_signals(current);

		if(kthread_should_stop())
			break;
	}
	return 0;
}
|
||||
|
||||
/**
|
||||
* sysfs_test_command - interface for test commands
|
||||
* @dev: thread reference
|
||||
* @buf: command for actual step
|
||||
* @count: length of buffer
|
||||
*
|
||||
* command syntax:
|
||||
*
|
||||
* opcode:data
|
||||
*/
|
||||
static ssize_t sysfs_test_command(struct sys_device *dev, const char *buf,
|
||||
size_t count)
|
||||
{
|
||||
struct sched_param schedpar;
|
||||
struct test_thread_data *td;
|
||||
char cmdbuf[32];
|
||||
int op, dat, tid, ret;
|
||||
|
||||
td = container_of(dev, struct test_thread_data, sysdev);
|
||||
tid = td->sysdev.id;
|
||||
|
||||
/* strings from sysfs write are not 0 terminated! */
|
||||
if (count >= sizeof(cmdbuf))
|
||||
return -EINVAL;
|
||||
|
||||
/* strip of \n: */
|
||||
if (buf[count-1] == '\n')
|
||||
count--;
|
||||
if (count < 1)
|
||||
return -EINVAL;
|
||||
|
||||
memcpy(cmdbuf, buf, count);
|
||||
cmdbuf[count] = 0;
|
||||
|
||||
if (sscanf(cmdbuf, "%d:%d", &op, &dat) != 2)
|
||||
return -EINVAL;
|
||||
|
||||
switch (op) {
|
||||
case RTTEST_SCHEDOT:
|
||||
schedpar.sched_priority = 0;
|
||||
ret = sched_setscheduler(threads[tid], SCHED_NORMAL, &schedpar);
|
||||
if (ret)
|
||||
return ret;
|
||||
set_user_nice(current, 0);
|
||||
break;
|
||||
|
||||
case RTTEST_SCHEDRT:
|
||||
schedpar.sched_priority = dat;
|
||||
ret = sched_setscheduler(threads[tid], SCHED_FIFO, &schedpar);
|
||||
if (ret)
|
||||
return ret;
|
||||
break;
|
||||
|
||||
case RTTEST_SIGNAL:
|
||||
send_sig(SIGHUP, threads[tid], 0);
|
||||
break;
|
||||
|
||||
default:
|
||||
if (td->opcode > 0)
|
||||
return -EBUSY;
|
||||
td->opdata = dat;
|
||||
td->opcode = op;
|
||||
wake_up_process(threads[tid]);
|
||||
}
|
||||
|
||||
return count;
|
||||
}
|
||||
|
||||
/**
 * sysfs_test_status - sysfs interface for rt tester
 * @dev:	thread to query
 * @buf:	char buffer to be filled with thread status info
 *
 * Prints opcode, event count, task state, (reversed) prio and
 * normal_prio, pi_blocked_on, BKL state and all mutex states, followed
 * by the task pointer and the owner of this thread's same-index mutex.
 * Returns the number of bytes written to @buf.
 */
static ssize_t sysfs_test_status(struct sys_device *dev, char *buf)
{
	struct test_thread_data *td;
	struct task_struct *tsk;
	char *curr = buf;
	int i;

	td = container_of(dev, struct test_thread_data, sysdev);
	tsk = threads[td->sysdev.id];

	/* Keep the snapshot of the thread state consistent. */
	spin_lock(&rttest_lock);

	curr += sprintf(curr,
		"O: %4d, E:%8d, S: 0x%08lx, P: %4d, N: %4d, B: %p, K: %d, M:",
		td->opcode, td->event, tsk->state,
			(MAX_RT_PRIO - 1) - tsk->prio,
			(MAX_RT_PRIO - 1) - tsk->normal_prio,
		tsk->pi_blocked_on, td->bkl);

	/* Mutex states, highest index first. */
	for (i = MAX_RT_TEST_MUTEXES - 1; i >=0 ; i--)
		curr += sprintf(curr, "%d", td->mutexes[i]);

	spin_unlock(&rttest_lock);

	curr += sprintf(curr, ", T: %p, R: %p\n", tsk,
			mutexes[td->sysdev.id].owner);

	return curr - buf;
}
|
||||
|
||||
/* Per-thread sysfs attributes: "status" is read-only, "command" write-only. */
static SYSDEV_ATTR(status, 0600, sysfs_test_status, NULL);
static SYSDEV_ATTR(command, 0600, NULL, sysfs_test_command);

/* All tester devices appear under the "rttest" sysdev class. */
static struct sysdev_class rttest_sysclass = {
	set_kset_name("rttest"),
};
|
||||
|
||||
static int init_test_thread(int id)
|
||||
{
|
||||
thread_data[id].sysdev.cls = &rttest_sysclass;
|
||||
thread_data[id].sysdev.id = id;
|
||||
|
||||
threads[id] = kthread_run(test_func, &thread_data[id], "rt-test-%d", id);
|
||||
if (IS_ERR(threads[id]))
|
||||
return PTR_ERR(threads[id]);
|
||||
|
||||
return sysdev_register(&thread_data[id].sysdev);
|
||||
}
|
||||
|
||||
static int init_rttest(void)
|
||||
{
|
||||
int ret, i;
|
||||
|
||||
spin_lock_init(&rttest_lock);
|
||||
|
||||
for (i = 0; i < MAX_RT_TEST_MUTEXES; i++)
|
||||
rt_mutex_init(&mutexes[i]);
|
||||
|
||||
ret = sysdev_class_register(&rttest_sysclass);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
for (i = 0; i < MAX_RT_TEST_THREADS; i++) {
|
||||
ret = init_test_thread(i);
|
||||
if (ret)
|
||||
break;
|
||||
ret = sysdev_create_file(&thread_data[i].sysdev, &attr_status);
|
||||
if (ret)
|
||||
break;
|
||||
ret = sysdev_create_file(&thread_data[i].sysdev, &attr_command);
|
||||
if (ret)
|
||||
break;
|
||||
}
|
||||
|
||||
printk("Initializing RT-Tester: %s\n", ret ? "Failed" : "OK" );
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
device_initcall(init_rttest);
|
||||
990
kernel/rtmutex.c
Normal file
990
kernel/rtmutex.c
Normal file
@@ -0,0 +1,990 @@
|
||||
/*
|
||||
* RT-Mutexes: simple blocking mutual exclusion locks with PI support
|
||||
*
|
||||
* started by Ingo Molnar and Thomas Gleixner.
|
||||
*
|
||||
* Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
|
||||
* Copyright (C) 2005-2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
|
||||
* Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt
|
||||
* Copyright (C) 2006 Esben Nielsen
|
||||
*
|
||||
* See Documentation/rt-mutex-design.txt for details.
|
||||
*/
|
||||
#include <linux/spinlock.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/sched.h>
|
||||
#include <linux/timer.h>
|
||||
|
||||
#include "rtmutex_common.h"
|
||||
|
||||
#ifdef CONFIG_DEBUG_RT_MUTEXES
|
||||
# include "rtmutex-debug.h"
|
||||
#else
|
||||
# include "rtmutex.h"
|
||||
#endif
|
||||
|
||||
/*
|
||||
* lock->owner state tracking:
|
||||
*
|
||||
* lock->owner holds the task_struct pointer of the owner. Bit 0 and 1
|
||||
* are used to keep track of the "owner is pending" and "lock has
|
||||
* waiters" state.
|
||||
*
|
||||
* owner bit1 bit0
|
||||
* NULL 0 0 lock is free (fast acquire possible)
|
||||
* NULL 0 1 invalid state
|
||||
* NULL 1 0 Transitional State*
|
||||
* NULL 1 1 invalid state
|
||||
* taskpointer 0 0 lock is held (fast release possible)
|
||||
* taskpointer 0 1 task is pending owner
|
||||
* taskpointer 1 0 lock is held and has waiters
|
||||
* taskpointer 1 1 task is pending owner and lock has more waiters
|
||||
*
|
||||
* Pending ownership is assigned to the top (highest priority)
|
||||
* waiter of the lock, when the lock is released. The thread is woken
|
||||
* up and can now take the lock. Until the lock is taken (bit 0
|
||||
* cleared) a competing higher priority thread can steal the lock
|
||||
* which puts the woken up thread back on the waiters list.
|
||||
*
|
||||
* The fast atomic compare exchange based acquire and release is only
|
||||
* possible when bit 0 and 1 of lock->owner are 0.
|
||||
*
|
||||
* (*) There's a small time where the owner can be NULL and the
|
||||
* "lock has waiters" bit is set. This can happen when grabbing the lock.
|
||||
* To prevent a cmpxchg of the owner releasing the lock, we need to set this
|
||||
* bit before looking at the lock, hence the reason this is a transitional
|
||||
* state.
|
||||
*/
|
||||
|
||||
static void
|
||||
rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner,
|
||||
unsigned long mask)
|
||||
{
|
||||
unsigned long val = (unsigned long)owner | mask;
|
||||
|
||||
if (rt_mutex_has_waiters(lock))
|
||||
val |= RT_MUTEX_HAS_WAITERS;
|
||||
|
||||
lock->owner = (struct task_struct *)val;
|
||||
}
|
||||
|
||||
static inline void clear_rt_mutex_waiters(struct rt_mutex *lock)
|
||||
{
|
||||
lock->owner = (struct task_struct *)
|
||||
((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS);
|
||||
}
|
||||
|
||||
/* Drop a stale "has waiters" bit once the wait list is empty again. */
static void fixup_rt_mutex_waiters(struct rt_mutex *lock)
{
	if (rt_mutex_has_waiters(lock))
		return;

	clear_rt_mutex_waiters(lock);
}
|
||||
|
||||
/*
 * We can speed up the acquire/release, if the architecture
 * supports cmpxchg and if there's no debugging state to be set up
 */
#if defined(__HAVE_ARCH_CMPXCHG) && !defined(CONFIG_DEBUG_RT_MUTEXES)
# define rt_mutex_cmpxchg(l,c,n)	(cmpxchg(&l->owner, c, n) == c)
/* Atomically set the "has waiters" bit without disturbing the owner. */
static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
{
	unsigned long owner, *p = (unsigned long *) &lock->owner;

	/* Retry until the cmpxchg lands on an unchanged owner word. */
	do {
		owner = *p;
	} while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner);
}
#else
/* No cmpxchg: the fast path is disabled and wait_lock serializes us. */
# define rt_mutex_cmpxchg(l,c,n)	(0)
static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
{
	lock->owner = (struct task_struct *)
			((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS);
}
#endif
|
||||
|
||||
/*
|
||||
* Calculate task priority from the waiter list priority
|
||||
*
|
||||
* Return task->normal_prio when the waiter list is empty or when
|
||||
* the waiter is not allowed to do priority boosting
|
||||
*/
|
||||
int rt_mutex_getprio(struct task_struct *task)
|
||||
{
|
||||
if (likely(!task_has_pi_waiters(task)))
|
||||
return task->normal_prio;
|
||||
|
||||
return min(task_top_pi_waiter(task)->pi_list_entry.prio,
|
||||
task->normal_prio);
|
||||
}
|
||||
|
||||
/*
|
||||
* Adjust the priority of a task, after its pi_waiters got modified.
|
||||
*
|
||||
* This can be both boosting and unboosting. task->pi_lock must be held.
|
||||
*/
|
||||
static void __rt_mutex_adjust_prio(struct task_struct *task)
|
||||
{
|
||||
int prio = rt_mutex_getprio(task);
|
||||
|
||||
if (task->prio != prio)
|
||||
rt_mutex_setprio(task, prio);
|
||||
}
|
||||
|
||||
/*
|
||||
* Adjust task priority (undo boosting). Called from the exit path of
|
||||
* rt_mutex_slowunlock() and rt_mutex_slowlock().
|
||||
*
|
||||
* (Note: We do this outside of the protection of lock->wait_lock to
|
||||
* allow the lock to be taken while or before we readjust the priority
|
||||
* of task. We do not use the spin_xx_mutex() variants here as we are
|
||||
* outside of the debug path.)
|
||||
*/
|
||||
static void rt_mutex_adjust_prio(struct task_struct *task)
|
||||
{
|
||||
unsigned long flags;
|
||||
|
||||
spin_lock_irqsave(&task->pi_lock, flags);
|
||||
__rt_mutex_adjust_prio(task);
|
||||
spin_unlock_irqrestore(&task->pi_lock, flags);
|
||||
}
|
||||
|
||||
/*
 * Max number of times we'll walk the boosting chain:
 * (bounds the again-loop in rt_mutex_adjust_prio_chain)
 */
int max_lock_depth = 1024;
|
||||
|
||||
/*
 * Adjust the priority chain. Also used for deadlock detection.
 * Decreases task's usage by one - may thus free the task.
 * Returns 0 or -EDEADLK.
 *
 * @task:		owner to start walking from (reference already held)
 * @deadlock_detect:	caller requested deadlock detection
 * @orig_lock:		lock the walk started from (deadlock reference)
 * @orig_waiter:	waiter of top_task on orig_lock
 * @top_task:		task that triggered the walk
 */
static int rt_mutex_adjust_prio_chain(struct task_struct *task,
				      int deadlock_detect,
				      struct rt_mutex *orig_lock,
				      struct rt_mutex_waiter *orig_waiter,
				      struct task_struct *top_task)
{
	struct rt_mutex *lock;
	struct rt_mutex_waiter *waiter, *top_waiter = orig_waiter;
	int detect_deadlock, ret = 0, depth = 0;
	unsigned long flags;

	detect_deadlock = debug_rt_mutex_detect_deadlock(orig_waiter,
							 deadlock_detect);

	/*
	 * The (de)boosting is a step by step approach with a lot of
	 * pitfalls. We want this to be preemptible and we want hold a
	 * maximum of two locks per step. So we have to check
	 * carefully whether things change under us.
	 */
 again:
	if (++depth > max_lock_depth) {
		static int prev_max;

		/*
		 * Print this only once. If the admin changes the limit,
		 * print a new message when reaching the limit again.
		 */
		if (prev_max != max_lock_depth) {
			prev_max = max_lock_depth;
			printk(KERN_WARNING "Maximum lock depth %d reached "
			       "task: %s (%d)\n", max_lock_depth,
			       top_task->comm, top_task->pid);
		}
		put_task_struct(task);

		return deadlock_detect ? -EDEADLK : 0;
	}
 retry:
	/*
	 * Task can not go away as we did a get_task() before !
	 */
	spin_lock_irqsave(&task->pi_lock, flags);

	waiter = task->pi_blocked_on;
	/*
	 * Check whether the end of the boosting chain has been
	 * reached or the state of the chain has changed while we
	 * dropped the locks.
	 */
	if (!waiter || !waiter->task)
		goto out_unlock_pi;

	/* The recorded top waiter is no longer the task's top PI waiter. */
	if (top_waiter && (!task_has_pi_waiters(task) ||
			   top_waiter != task_top_pi_waiter(task)))
		goto out_unlock_pi;

	/*
	 * When deadlock detection is off then we check, if further
	 * priority adjustment is necessary.
	 */
	if (!detect_deadlock && waiter->list_entry.prio == task->prio)
		goto out_unlock_pi;

	/*
	 * Lock ordering: wait_lock nests inside pi_lock here, so only a
	 * trylock is legal; on contention drop pi_lock and start over.
	 */
	lock = waiter->lock;
	if (!spin_trylock(&lock->wait_lock)) {
		spin_unlock_irqrestore(&task->pi_lock, flags);
		cpu_relax();
		goto retry;
	}

	/* Deadlock detection */
	if (lock == orig_lock || rt_mutex_owner(lock) == top_task) {
		debug_rt_mutex_deadlock(deadlock_detect, orig_waiter, lock);
		spin_unlock(&lock->wait_lock);
		ret = deadlock_detect ? -EDEADLK : 0;
		goto out_unlock_pi;
	}

	top_waiter = rt_mutex_top_waiter(lock);

	/* Requeue the waiter */
	plist_del(&waiter->list_entry, &lock->wait_list);
	waiter->list_entry.prio = task->prio;
	plist_add(&waiter->list_entry, &lock->wait_list);

	/* Release the task */
	spin_unlock_irqrestore(&task->pi_lock, flags);
	put_task_struct(task);

	/* Grab the next task */
	task = rt_mutex_owner(lock);
	get_task_struct(task);
	spin_lock_irqsave(&task->pi_lock, flags);

	if (waiter == rt_mutex_top_waiter(lock)) {
		/* Boost the owner */
		plist_del(&top_waiter->pi_list_entry, &task->pi_waiters);
		waiter->pi_list_entry.prio = waiter->list_entry.prio;
		plist_add(&waiter->pi_list_entry, &task->pi_waiters);
		__rt_mutex_adjust_prio(task);

	} else if (top_waiter == waiter) {
		/* Deboost the owner */
		plist_del(&waiter->pi_list_entry, &task->pi_waiters);
		waiter = rt_mutex_top_waiter(lock);
		waiter->pi_list_entry.prio = waiter->list_entry.prio;
		plist_add(&waiter->pi_list_entry, &task->pi_waiters);
		__rt_mutex_adjust_prio(task);
	}

	spin_unlock_irqrestore(&task->pi_lock, flags);

	top_waiter = rt_mutex_top_waiter(lock);
	spin_unlock(&lock->wait_lock);

	/* Without deadlock detection, stop once the chain stops changing. */
	if (!detect_deadlock && waiter != top_waiter)
		goto out_put_task;

	goto again;

 out_unlock_pi:
	spin_unlock_irqrestore(&task->pi_lock, flags);
 out_put_task:
	put_task_struct(task);

	return ret;
}
|
||||
|
||||
/*
 * Optimization: check if we can steal the lock from the
 * assigned pending owner [which might not have taken the
 * lock yet]:
 *
 * Returns 1 when current may take the lock (it is the pending owner
 * itself, or it outranks the pending owner), 0 otherwise.
 * Called with lock->wait_lock held.
 */
static inline int try_to_steal_lock(struct rt_mutex *lock)
{
	struct task_struct *pendowner = rt_mutex_owner(lock);
	struct rt_mutex_waiter *next;
	unsigned long flags;

	/* A real (non-pending) owner can never be robbed. */
	if (!rt_mutex_owner_pending(lock))
		return 0;

	if (pendowner == current)
		return 1;

	/* Only a strictly higher priority task may steal. */
	spin_lock_irqsave(&pendowner->pi_lock, flags);
	if (current->prio >= pendowner->prio) {
		spin_unlock_irqrestore(&pendowner->pi_lock, flags);
		return 0;
	}

	/*
	 * Check if a waiter is enqueued on the pending owners
	 * pi_waiters list. Remove it and readjust pending owners
	 * priority.
	 */
	if (likely(!rt_mutex_has_waiters(lock))) {
		spin_unlock_irqrestore(&pendowner->pi_lock, flags);
		return 1;
	}

	/* No chain handling, pending owner is not blocked on anything: */
	next = rt_mutex_top_waiter(lock);
	plist_del(&next->pi_list_entry, &pendowner->pi_waiters);
	__rt_mutex_adjust_prio(pendowner);
	spin_unlock_irqrestore(&pendowner->pi_lock, flags);

	/*
	 * We are going to steal the lock and a waiter was
	 * enqueued on the pending owners pi_waiters queue. So
	 * we have to enqueue this waiter into
	 * current->pi_waiters list. This covers the case,
	 * where current is boosted because it holds another
	 * lock and gets unboosted because the booster is
	 * interrupted, so we would delay a waiter with higher
	 * priority as current->normal_prio.
	 *
	 * Note: in the rare case of a SCHED_OTHER task changing
	 * its priority and thus stealing the lock, next->task
	 * might be current:
	 */
	if (likely(next->task != current)) {
		spin_lock_irqsave(&current->pi_lock, flags);
		plist_add(&next->pi_list_entry, &current->pi_waiters);
		__rt_mutex_adjust_prio(current);
		spin_unlock_irqrestore(&current->pi_lock, flags);
	}
	return 1;
}
|
||||
|
||||
/*
 * Try to take an rt-mutex
 *
 * This fails
 * - when the lock has a real owner
 * - when a different pending owner exists and has higher priority than current
 *
 * Must be called with lock->wait_lock held.
 * Returns 1 when the lock was acquired by current, 0 otherwise.
 */
static int try_to_take_rt_mutex(struct rt_mutex *lock)
{
	/*
	 * We have to be careful here if the atomic speedups are
	 * enabled, such that, when
	 *  - no other waiter is on the lock
	 *  - the lock has been released since we did the cmpxchg
	 * the lock can be released or taken while we are doing the
	 * checks and marking the lock with RT_MUTEX_HAS_WAITERS.
	 *
	 * The atomic acquire/release aware variant of
	 * mark_rt_mutex_waiters uses a cmpxchg loop. After setting
	 * the WAITERS bit, the atomic release / acquire can not
	 * happen anymore and lock->wait_lock protects us from the
	 * non-atomic case.
	 *
	 * Note, that this might set lock->owner =
	 * RT_MUTEX_HAS_WAITERS in the case the lock is not contended
	 * any more. This is fixed up when we take the ownership.
	 * This is the transitional state explained at the top of this file.
	 */
	mark_rt_mutex_waiters(lock);

	/* Held by somebody else and not stealable: give up. */
	if (rt_mutex_owner(lock) && !try_to_steal_lock(lock))
		return 0;

	/* We got the lock. */
	debug_rt_mutex_lock(lock);

	/* mask == 0: real (non-pending) ownership. */
	rt_mutex_set_owner(lock, current, 0);

	rt_mutex_deadlock_account_lock(lock, current);

	return 1;
}
|
||||
|
||||
/*
 * Task blocks on lock.
 *
 * Prepare waiter and propagate pi chain
 *
 * This must be called with lock->wait_lock held.
 *
 * Returns 0, or the result of rt_mutex_adjust_prio_chain() when a
 * chain walk was required.  Note: wait_lock is dropped and re-taken
 * around the chain walk.
 */
static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
				   struct rt_mutex_waiter *waiter,
				   int detect_deadlock)
{
	struct task_struct *owner = rt_mutex_owner(lock);
	struct rt_mutex_waiter *top_waiter = waiter;
	unsigned long flags;
	int chain_walk = 0, res;

	/* Enqueue current as a waiter at its current priority. */
	spin_lock_irqsave(&current->pi_lock, flags);
	__rt_mutex_adjust_prio(current);
	waiter->task = current;
	waiter->lock = lock;
	plist_node_init(&waiter->list_entry, current->prio);
	plist_node_init(&waiter->pi_list_entry, current->prio);

	/* Get the top priority waiter on the lock */
	if (rt_mutex_has_waiters(lock))
		top_waiter = rt_mutex_top_waiter(lock);
	plist_add(&waiter->list_entry, &lock->wait_list);

	current->pi_blocked_on = waiter;

	spin_unlock_irqrestore(&current->pi_lock, flags);

	/*
	 * If we became the top waiter, replace the previous top waiter
	 * in the owner's pi_waiters and boost the owner.
	 */
	if (waiter == rt_mutex_top_waiter(lock)) {
		spin_lock_irqsave(&owner->pi_lock, flags);
		plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters);
		plist_add(&waiter->pi_list_entry, &owner->pi_waiters);

		__rt_mutex_adjust_prio(owner);
		/* Owner itself is blocked: the boost must be propagated. */
		if (owner->pi_blocked_on)
			chain_walk = 1;
		spin_unlock_irqrestore(&owner->pi_lock, flags);
	}
	else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock))
		chain_walk = 1;

	if (!chain_walk)
		return 0;

	/*
	 * The owner can't disappear while holding a lock,
	 * so the owner struct is protected by wait_lock.
	 * Gets dropped in rt_mutex_adjust_prio_chain()!
	 */
	get_task_struct(owner);

	spin_unlock(&lock->wait_lock);

	res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter,
					 current);

	spin_lock(&lock->wait_lock);

	return res;
}
|
||||
|
||||
/*
 * Wake up the next waiter on the lock.
 *
 * Remove the top waiter from the current tasks waiter list and from
 * the lock waiter list. Set it as pending owner. Then wake it up.
 *
 * Called with lock->wait_lock held.
 */
static void wakeup_next_waiter(struct rt_mutex *lock)
{
	struct rt_mutex_waiter *waiter;
	struct task_struct *pendowner;
	unsigned long flags;

	spin_lock_irqsave(&current->pi_lock, flags);

	waiter = rt_mutex_top_waiter(lock);
	plist_del(&waiter->list_entry, &lock->wait_list);

	/*
	 * Remove it from current->pi_waiters. We do not adjust a
	 * possible priority boost right now. We execute wakeup in the
	 * boosted mode and go back to normal after releasing
	 * lock->wait_lock.
	 */
	plist_del(&waiter->pi_list_entry, &current->pi_waiters);
	pendowner = waiter->task;
	/* waiter->task == NULL signals "woken/ownership handed over". */
	waiter->task = NULL;

	rt_mutex_set_owner(lock, pendowner, RT_MUTEX_OWNER_PENDING);

	spin_unlock_irqrestore(&current->pi_lock, flags);

	/*
	 * Clear the pi_blocked_on variable and enqueue a possible
	 * waiter into the pi_waiters list of the pending owner. This
	 * prevents that in case the pending owner gets unboosted a
	 * waiter with higher priority than pending-owner->normal_prio
	 * is blocked on the unboosted (pending) owner.
	 */
	spin_lock_irqsave(&pendowner->pi_lock, flags);

	WARN_ON(!pendowner->pi_blocked_on);
	WARN_ON(pendowner->pi_blocked_on != waiter);
	WARN_ON(pendowner->pi_blocked_on->lock != lock);

	pendowner->pi_blocked_on = NULL;

	if (rt_mutex_has_waiters(lock)) {
		struct rt_mutex_waiter *next;

		next = rt_mutex_top_waiter(lock);
		plist_add(&next->pi_list_entry, &pendowner->pi_waiters);
	}
	spin_unlock_irqrestore(&pendowner->pi_lock, flags);

	wake_up_process(pendowner);
}
|
||||
|
||||
/*
 * Remove a waiter from a lock
 *
 * Must be called with lock->wait_lock held
 * (dropped and re-taken around the chain walk, if one is needed).
 */
static void remove_waiter(struct rt_mutex *lock,
			  struct rt_mutex_waiter *waiter)
{
	int first = (waiter == rt_mutex_top_waiter(lock));
	struct task_struct *owner = rt_mutex_owner(lock);
	unsigned long flags;
	int chain_walk = 0;

	spin_lock_irqsave(&current->pi_lock, flags);
	plist_del(&waiter->list_entry, &lock->wait_list);
	waiter->task = NULL;
	current->pi_blocked_on = NULL;
	spin_unlock_irqrestore(&current->pi_lock, flags);

	/*
	 * We were the top waiter of a foreign owner: take ourselves out
	 * of its pi_waiters, promote the new top waiter and deboost.
	 */
	if (first && owner != current) {

		spin_lock_irqsave(&owner->pi_lock, flags);

		plist_del(&waiter->pi_list_entry, &owner->pi_waiters);

		if (rt_mutex_has_waiters(lock)) {
			struct rt_mutex_waiter *next;

			next = rt_mutex_top_waiter(lock);
			plist_add(&next->pi_list_entry, &owner->pi_waiters);
		}
		__rt_mutex_adjust_prio(owner);

		/* Owner is itself blocked: propagate the deboost. */
		if (owner->pi_blocked_on)
			chain_walk = 1;

		spin_unlock_irqrestore(&owner->pi_lock, flags);
	}

	WARN_ON(!plist_node_empty(&waiter->pi_list_entry));

	if (!chain_walk)
		return;

	/* gets dropped in rt_mutex_adjust_prio_chain()! */
	get_task_struct(owner);

	spin_unlock(&lock->wait_lock);

	rt_mutex_adjust_prio_chain(owner, 0, lock, NULL, current);

	spin_lock(&lock->wait_lock);
}
|
||||
|
||||
/*
 * Recheck the pi chain, in case we got a priority setting
 *
 * Called from sched_setscheduler
 */
void rt_mutex_adjust_pi(struct task_struct *task)
{
	struct rt_mutex_waiter *waiter;
	unsigned long flags;

	spin_lock_irqsave(&task->pi_lock, flags);

	/* Nothing to do if not blocked, or the queued prio is current. */
	waiter = task->pi_blocked_on;
	if (!waiter || waiter->list_entry.prio == task->prio) {
		spin_unlock_irqrestore(&task->pi_lock, flags);
		return;
	}

	spin_unlock_irqrestore(&task->pi_lock, flags);

	/* gets dropped in rt_mutex_adjust_prio_chain()! */
	get_task_struct(task);
	rt_mutex_adjust_prio_chain(task, 0, NULL, NULL, task);
}
|
||||
|
||||
/*
|
||||
* Slow path lock function:
|
||||
*/
|
||||
static int __sched
|
||||
rt_mutex_slowlock(struct rt_mutex *lock, int state,
|
||||
struct hrtimer_sleeper *timeout,
|
||||
int detect_deadlock)
|
||||
{
|
||||
struct rt_mutex_waiter waiter;
|
||||
int ret = 0;
|
||||
|
||||
debug_rt_mutex_init_waiter(&waiter);
|
||||
waiter.task = NULL;
|
||||
|
||||
spin_lock(&lock->wait_lock);
|
||||
|
||||
/* Try to acquire the lock again: */
|
||||
if (try_to_take_rt_mutex(lock)) {
|
||||
spin_unlock(&lock->wait_lock);
|
||||
return 0;
|
||||
}
|
||||
|
||||
set_current_state(state);
|
||||
|
||||
/* Setup the timer, when timeout != NULL */
|
||||
if (unlikely(timeout))
|
||||
hrtimer_start(&timeout->timer, timeout->timer.expires,
|
||||
HRTIMER_MODE_ABS);
|
||||
|
||||
for (;;) {
|
||||
/* Try to acquire the lock: */
|
||||
if (try_to_take_rt_mutex(lock))
|
||||
break;
|
||||
|
||||
/*
|
||||
* TASK_INTERRUPTIBLE checks for signals and
|
||||
* timeout. Ignored otherwise.
|
||||
*/
|
||||
if (unlikely(state == TASK_INTERRUPTIBLE)) {
|
||||
/* Signal pending? */
|
||||
if (signal_pending(current))
|
||||
ret = -EINTR;
|
||||
if (timeout && !timeout->task)
|
||||
ret = -ETIMEDOUT;
|
||||
if (ret)
|
||||
break;
|
||||
}
|
||||
|
||||
/*
|
||||
* waiter.task is NULL the first time we come here and
|
||||
* when we have been woken up by the previous owner
|
||||
* but the lock got stolen by a higher prio task.
|
||||
*/
|
||||
if (!waiter.task) {
|
||||
ret = task_blocks_on_rt_mutex(lock, &waiter,
|
||||
detect_deadlock);
|
||||
/*
|
||||
* If we got woken up by the owner then start loop
|
||||
* all over without going into schedule to try
|
||||
* to get the lock now:
|
||||
*/
|
||||
if (unlikely(!waiter.task))
|
||||
continue;
|
||||
|
||||
if (unlikely(ret))
|
||||
break;
|
||||
}
|
||||
|
||||
spin_unlock(&lock->wait_lock);
|
||||
|
||||
debug_rt_mutex_print_deadlock(&waiter);
|
||||
|
||||
if (waiter.task)
|
||||
schedule_rt_mutex(lock);
|
||||
|
||||
spin_lock(&lock->wait_lock);
|
||||
set_current_state(state);
|
||||
}
|
||||
|
||||
set_current_state(TASK_RUNNING);
|
||||
|
||||
if (unlikely(waiter.task))
|
||||
remove_waiter(lock, &waiter);
|
||||
|
||||
/*
|
||||
* try_to_take_rt_mutex() sets the waiter bit
|
||||
* unconditionally. We might have to fix that up.
|
||||
*/
|
||||
fixup_rt_mutex_waiters(lock);
|
||||
|
||||
spin_unlock(&lock->wait_lock);
|
||||
|
||||
/* Remove pending timer: */
|
||||
if (unlikely(timeout))
|
||||
hrtimer_cancel(&timeout->timer);
|
||||
|
||||
/*
|
||||
* Readjust priority, when we did not get the lock. We might
|
||||
* have been the pending owner and boosted. Since we did not
|
||||
* take the lock, the PI boost has to go.
|
||||
*/
|
||||
if (unlikely(ret))
|
||||
rt_mutex_adjust_prio(current);
|
||||
|
||||
debug_rt_mutex_free_waiter(&waiter);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Slow path try-lock function:
|
||||
*/
|
||||
static inline int
|
||||
rt_mutex_slowtrylock(struct rt_mutex *lock)
|
||||
{
|
||||
int ret = 0;
|
||||
|
||||
spin_lock(&lock->wait_lock);
|
||||
|
||||
if (likely(rt_mutex_owner(lock) != current)) {
|
||||
|
||||
ret = try_to_take_rt_mutex(lock);
|
||||
/*
|
||||
* try_to_take_rt_mutex() sets the lock waiters
|
||||
* bit unconditionally. Clean this up.
|
||||
*/
|
||||
fixup_rt_mutex_waiters(lock);
|
||||
}
|
||||
|
||||
spin_unlock(&lock->wait_lock);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Slow path to release a rt-mutex:
|
||||
*/
|
||||
static void __sched
|
||||
rt_mutex_slowunlock(struct rt_mutex *lock)
|
||||
{
|
||||
spin_lock(&lock->wait_lock);
|
||||
|
||||
debug_rt_mutex_unlock(lock);
|
||||
|
||||
rt_mutex_deadlock_account_unlock(current);
|
||||
|
||||
if (!rt_mutex_has_waiters(lock)) {
|
||||
lock->owner = NULL;
|
||||
spin_unlock(&lock->wait_lock);
|
||||
return;
|
||||
}
|
||||
|
||||
wakeup_next_waiter(lock);
|
||||
|
||||
spin_unlock(&lock->wait_lock);
|
||||
|
||||
/* Undo pi boosting if necessary: */
|
||||
rt_mutex_adjust_prio(current);
|
||||
}
|
||||
|
||||
/*
|
||||
* debug aware fast / slowpath lock,trylock,unlock
|
||||
*
|
||||
* The atomic acquire/release ops are compiled away, when either the
|
||||
* architecture does not support cmpxchg or when debugging is enabled.
|
||||
*/
|
||||
static inline int
|
||||
rt_mutex_fastlock(struct rt_mutex *lock, int state,
|
||||
int detect_deadlock,
|
||||
int (*slowfn)(struct rt_mutex *lock, int state,
|
||||
struct hrtimer_sleeper *timeout,
|
||||
int detect_deadlock))
|
||||
{
|
||||
if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) {
|
||||
rt_mutex_deadlock_account_lock(lock, current);
|
||||
return 0;
|
||||
} else
|
||||
return slowfn(lock, state, NULL, detect_deadlock);
|
||||
}
|
||||
|
||||
static inline int
|
||||
rt_mutex_timed_fastlock(struct rt_mutex *lock, int state,
|
||||
struct hrtimer_sleeper *timeout, int detect_deadlock,
|
||||
int (*slowfn)(struct rt_mutex *lock, int state,
|
||||
struct hrtimer_sleeper *timeout,
|
||||
int detect_deadlock))
|
||||
{
|
||||
if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) {
|
||||
rt_mutex_deadlock_account_lock(lock, current);
|
||||
return 0;
|
||||
} else
|
||||
return slowfn(lock, state, timeout, detect_deadlock);
|
||||
}
|
||||
|
||||
static inline int
|
||||
rt_mutex_fasttrylock(struct rt_mutex *lock,
|
||||
int (*slowfn)(struct rt_mutex *lock))
|
||||
{
|
||||
if (likely(rt_mutex_cmpxchg(lock, NULL, current))) {
|
||||
rt_mutex_deadlock_account_lock(lock, current);
|
||||
return 1;
|
||||
}
|
||||
return slowfn(lock);
|
||||
}
|
||||
|
||||
static inline void
|
||||
rt_mutex_fastunlock(struct rt_mutex *lock,
|
||||
void (*slowfn)(struct rt_mutex *lock))
|
||||
{
|
||||
if (likely(rt_mutex_cmpxchg(lock, current, NULL)))
|
||||
rt_mutex_deadlock_account_unlock(current);
|
||||
else
|
||||
slowfn(lock);
|
||||
}
|
||||
|
||||
/**
|
||||
* rt_mutex_lock - lock a rt_mutex
|
||||
*
|
||||
* @lock: the rt_mutex to be locked
|
||||
*/
|
||||
void __sched rt_mutex_lock(struct rt_mutex *lock)
|
||||
{
|
||||
might_sleep();
|
||||
|
||||
rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, 0, rt_mutex_slowlock);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(rt_mutex_lock);
|
||||
|
||||
/**
|
||||
* rt_mutex_lock_interruptible - lock a rt_mutex interruptible
|
||||
*
|
||||
* @lock: the rt_mutex to be locked
|
||||
* @detect_deadlock: deadlock detection on/off
|
||||
*
|
||||
* Returns:
|
||||
* 0 on success
|
||||
* -EINTR when interrupted by a signal
|
||||
* -EDEADLK when the lock would deadlock (when deadlock detection is on)
|
||||
*/
|
||||
int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock,
|
||||
int detect_deadlock)
|
||||
{
|
||||
might_sleep();
|
||||
|
||||
return rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE,
|
||||
detect_deadlock, rt_mutex_slowlock);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
|
||||
|
||||
/**
|
||||
* rt_mutex_lock_interruptible_ktime - lock a rt_mutex interruptible
|
||||
* the timeout structure is provided
|
||||
* by the caller
|
||||
*
|
||||
* @lock: the rt_mutex to be locked
|
||||
* @timeout: timeout structure or NULL (no timeout)
|
||||
* @detect_deadlock: deadlock detection on/off
|
||||
*
|
||||
* Returns:
|
||||
* 0 on success
|
||||
* -EINTR when interrupted by a signal
|
||||
* -ETIMEOUT when the timeout expired
|
||||
* -EDEADLK when the lock would deadlock (when deadlock detection is on)
|
||||
*/
|
||||
int
|
||||
rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout,
|
||||
int detect_deadlock)
|
||||
{
|
||||
might_sleep();
|
||||
|
||||
return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
|
||||
detect_deadlock, rt_mutex_slowlock);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(rt_mutex_timed_lock);
|
||||
|
||||
/**
|
||||
* rt_mutex_trylock - try to lock a rt_mutex
|
||||
*
|
||||
* @lock: the rt_mutex to be locked
|
||||
*
|
||||
* Returns 1 on success and 0 on contention
|
||||
*/
|
||||
int __sched rt_mutex_trylock(struct rt_mutex *lock)
|
||||
{
|
||||
return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(rt_mutex_trylock);
|
||||
|
||||
/**
|
||||
* rt_mutex_unlock - unlock a rt_mutex
|
||||
*
|
||||
* @lock: the rt_mutex to be unlocked
|
||||
*/
|
||||
void __sched rt_mutex_unlock(struct rt_mutex *lock)
|
||||
{
|
||||
rt_mutex_fastunlock(lock, rt_mutex_slowunlock);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(rt_mutex_unlock);
|
||||
|
||||
/***
|
||||
* rt_mutex_destroy - mark a mutex unusable
|
||||
* @lock: the mutex to be destroyed
|
||||
*
|
||||
* This function marks the mutex uninitialized, and any subsequent
|
||||
* use of the mutex is forbidden. The mutex must not be locked when
|
||||
* this function is called.
|
||||
*/
|
||||
void rt_mutex_destroy(struct rt_mutex *lock)
|
||||
{
|
||||
WARN_ON(rt_mutex_is_locked(lock));
|
||||
#ifdef CONFIG_DEBUG_RT_MUTEXES
|
||||
lock->magic = NULL;
|
||||
#endif
|
||||
}
|
||||
|
||||
EXPORT_SYMBOL_GPL(rt_mutex_destroy);
|
||||
|
||||
/**
|
||||
* __rt_mutex_init - initialize the rt lock
|
||||
*
|
||||
* @lock: the rt lock to be initialized
|
||||
*
|
||||
* Initialize the rt lock to unlocked state.
|
||||
*
|
||||
* Initializing of a locked rt lock is not allowed
|
||||
*/
|
||||
void __rt_mutex_init(struct rt_mutex *lock, const char *name)
|
||||
{
|
||||
lock->owner = NULL;
|
||||
spin_lock_init(&lock->wait_lock);
|
||||
plist_head_init(&lock->wait_list, &lock->wait_lock);
|
||||
|
||||
debug_rt_mutex_init(lock, name);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(__rt_mutex_init);
|
||||
|
||||
/**
|
||||
* rt_mutex_init_proxy_locked - initialize and lock a rt_mutex on behalf of a
|
||||
* proxy owner
|
||||
*
|
||||
* @lock: the rt_mutex to be locked
|
||||
* @proxy_owner:the task to set as owner
|
||||
*
|
||||
* No locking. Caller has to do serializing itself
|
||||
* Special API call for PI-futex support
|
||||
*/
|
||||
void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
|
||||
struct task_struct *proxy_owner)
|
||||
{
|
||||
__rt_mutex_init(lock, NULL);
|
||||
debug_rt_mutex_proxy_lock(lock, proxy_owner);
|
||||
rt_mutex_set_owner(lock, proxy_owner, 0);
|
||||
rt_mutex_deadlock_account_lock(lock, proxy_owner);
|
||||
}
|
||||
|
||||
/**
|
||||
* rt_mutex_proxy_unlock - release a lock on behalf of owner
|
||||
*
|
||||
* @lock: the rt_mutex to be locked
|
||||
*
|
||||
* No locking. Caller has to do serializing itself
|
||||
* Special API call for PI-futex support
|
||||
*/
|
||||
void rt_mutex_proxy_unlock(struct rt_mutex *lock,
|
||||
struct task_struct *proxy_owner)
|
||||
{
|
||||
debug_rt_mutex_proxy_unlock(lock);
|
||||
rt_mutex_set_owner(lock, NULL, 0);
|
||||
rt_mutex_deadlock_account_unlock(proxy_owner);
|
||||
}
|
||||
|
||||
/**
|
||||
* rt_mutex_next_owner - return the next owner of the lock
|
||||
*
|
||||
* @lock: the rt lock query
|
||||
*
|
||||
* Returns the next owner of the lock or NULL
|
||||
*
|
||||
* Caller has to serialize against other accessors to the lock
|
||||
* itself.
|
||||
*
|
||||
* Special API call for PI-futex support
|
||||
*/
|
||||
struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock)
|
||||
{
|
||||
if (!rt_mutex_has_waiters(lock))
|
||||
return NULL;
|
||||
|
||||
return rt_mutex_top_waiter(lock)->task;
|
||||
}
|
||||
26
kernel/rtmutex.h
Normal file
26
kernel/rtmutex.h
Normal file
@@ -0,0 +1,26 @@
|
||||
/*
|
||||
* RT-Mutexes: blocking mutual exclusion locks with PI support
|
||||
*
|
||||
* started by Ingo Molnar and Thomas Gleixner:
|
||||
*
|
||||
* Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
|
||||
* Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
|
||||
*
|
||||
* This file contains macros used solely by rtmutex.c.
|
||||
* Non-debug version.
|
||||
*/
|
||||
|
||||
#define rt_mutex_deadlock_check(l) (0)
|
||||
#define rt_mutex_deadlock_account_lock(m, t) do { } while (0)
|
||||
#define rt_mutex_deadlock_account_unlock(l) do { } while (0)
|
||||
#define debug_rt_mutex_init_waiter(w) do { } while (0)
|
||||
#define debug_rt_mutex_free_waiter(w) do { } while (0)
|
||||
#define debug_rt_mutex_lock(l) do { } while (0)
|
||||
#define debug_rt_mutex_proxy_lock(l,p) do { } while (0)
|
||||
#define debug_rt_mutex_proxy_unlock(l) do { } while (0)
|
||||
#define debug_rt_mutex_unlock(l) do { } while (0)
|
||||
#define debug_rt_mutex_init(m, n) do { } while (0)
|
||||
#define debug_rt_mutex_deadlock(d, a ,l) do { } while (0)
|
||||
#define debug_rt_mutex_print_deadlock(w) do { } while (0)
|
||||
#define debug_rt_mutex_detect_deadlock(w,d) (d)
|
||||
#define debug_rt_mutex_reset_waiter(w) do { } while (0)
|
||||
123
kernel/rtmutex_common.h
Normal file
123
kernel/rtmutex_common.h
Normal file
@@ -0,0 +1,123 @@
|
||||
/*
|
||||
* RT Mutexes: blocking mutual exclusion locks with PI support
|
||||
*
|
||||
* started by Ingo Molnar and Thomas Gleixner:
|
||||
*
|
||||
* Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
|
||||
* Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
|
||||
*
|
||||
* This file contains the private data structure and API definitions.
|
||||
*/
|
||||
|
||||
#ifndef __KERNEL_RTMUTEX_COMMON_H
|
||||
#define __KERNEL_RTMUTEX_COMMON_H
|
||||
|
||||
#include <linux/rtmutex.h>
|
||||
|
||||
/*
|
||||
* The rtmutex in kernel tester is independent of rtmutex debugging. We
|
||||
* call schedule_rt_mutex_test() instead of schedule() for the tasks which
|
||||
* belong to the tester. That way we can delay the wakeup path of those
|
||||
* threads to provoke lock stealing and testing of complex boosting scenarios.
|
||||
*/
|
||||
#ifdef CONFIG_RT_MUTEX_TESTER
|
||||
|
||||
extern void schedule_rt_mutex_test(struct rt_mutex *lock);
|
||||
|
||||
#define schedule_rt_mutex(_lock) \
|
||||
do { \
|
||||
if (!(current->flags & PF_MUTEX_TESTER)) \
|
||||
schedule(); \
|
||||
else \
|
||||
schedule_rt_mutex_test(_lock); \
|
||||
} while (0)
|
||||
|
||||
#else
|
||||
# define schedule_rt_mutex(_lock) schedule()
|
||||
#endif
|
||||
|
||||
/*
|
||||
* This is the control structure for tasks blocked on a rt_mutex,
|
||||
* which is allocated on the kernel stack on of the blocked task.
|
||||
*
|
||||
* @list_entry: pi node to enqueue into the mutex waiters list
|
||||
* @pi_list_entry: pi node to enqueue into the mutex owner waiters list
|
||||
* @task: task reference to the blocked task
|
||||
*/
|
||||
struct rt_mutex_waiter {
|
||||
struct plist_node list_entry;
|
||||
struct plist_node pi_list_entry;
|
||||
struct task_struct *task;
|
||||
struct rt_mutex *lock;
|
||||
#ifdef CONFIG_DEBUG_RT_MUTEXES
|
||||
unsigned long ip;
|
||||
pid_t deadlock_task_pid;
|
||||
struct rt_mutex *deadlock_lock;
|
||||
#endif
|
||||
};
|
||||
|
||||
/*
|
||||
* Various helpers to access the waiters-plist:
|
||||
*/
|
||||
static inline int rt_mutex_has_waiters(struct rt_mutex *lock)
|
||||
{
|
||||
return !plist_head_empty(&lock->wait_list);
|
||||
}
|
||||
|
||||
static inline struct rt_mutex_waiter *
|
||||
rt_mutex_top_waiter(struct rt_mutex *lock)
|
||||
{
|
||||
struct rt_mutex_waiter *w;
|
||||
|
||||
w = plist_first_entry(&lock->wait_list, struct rt_mutex_waiter,
|
||||
list_entry);
|
||||
BUG_ON(w->lock != lock);
|
||||
|
||||
return w;
|
||||
}
|
||||
|
||||
static inline int task_has_pi_waiters(struct task_struct *p)
|
||||
{
|
||||
return !plist_head_empty(&p->pi_waiters);
|
||||
}
|
||||
|
||||
static inline struct rt_mutex_waiter *
|
||||
task_top_pi_waiter(struct task_struct *p)
|
||||
{
|
||||
return plist_first_entry(&p->pi_waiters, struct rt_mutex_waiter,
|
||||
pi_list_entry);
|
||||
}
|
||||
|
||||
/*
|
||||
* lock->owner state tracking:
|
||||
*/
|
||||
#define RT_MUTEX_OWNER_PENDING 1UL
|
||||
#define RT_MUTEX_HAS_WAITERS 2UL
|
||||
#define RT_MUTEX_OWNER_MASKALL 3UL
|
||||
|
||||
static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock)
|
||||
{
|
||||
return (struct task_struct *)
|
||||
((unsigned long)lock->owner & ~RT_MUTEX_OWNER_MASKALL);
|
||||
}
|
||||
|
||||
static inline struct task_struct *rt_mutex_real_owner(struct rt_mutex *lock)
|
||||
{
|
||||
return (struct task_struct *)
|
||||
((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS);
|
||||
}
|
||||
|
||||
static inline unsigned long rt_mutex_owner_pending(struct rt_mutex *lock)
|
||||
{
|
||||
return (unsigned long)lock->owner & RT_MUTEX_OWNER_PENDING;
|
||||
}
|
||||
|
||||
/*
|
||||
* PI-futex support (proxy locking functions, etc.):
|
||||
*/
|
||||
extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock);
|
||||
extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
|
||||
struct task_struct *proxy_owner);
|
||||
extern void rt_mutex_proxy_unlock(struct rt_mutex *lock,
|
||||
struct task_struct *proxy_owner);
|
||||
#endif
|
||||
147
kernel/rwsem.c
Normal file
147
kernel/rwsem.c
Normal file
@@ -0,0 +1,147 @@
|
||||
/* kernel/rwsem.c: R/W semaphores, public implementation
|
||||
*
|
||||
* Written by David Howells (dhowells@redhat.com).
|
||||
* Derived from asm-i386/semaphore.h
|
||||
*/
|
||||
|
||||
#include <linux/types.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/rwsem.h>
|
||||
|
||||
#include <asm/system.h>
|
||||
#include <asm/atomic.h>
|
||||
|
||||
/*
|
||||
* lock for reading
|
||||
*/
|
||||
void down_read(struct rw_semaphore *sem)
|
||||
{
|
||||
might_sleep();
|
||||
rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);
|
||||
|
||||
__down_read(sem);
|
||||
}
|
||||
|
||||
EXPORT_SYMBOL(down_read);
|
||||
|
||||
/*
|
||||
* trylock for reading -- returns 1 if successful, 0 if contention
|
||||
*/
|
||||
int down_read_trylock(struct rw_semaphore *sem)
|
||||
{
|
||||
int ret = __down_read_trylock(sem);
|
||||
|
||||
if (ret == 1)
|
||||
rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_);
|
||||
return ret;
|
||||
}
|
||||
|
||||
EXPORT_SYMBOL(down_read_trylock);
|
||||
|
||||
/*
|
||||
* lock for writing
|
||||
*/
|
||||
void down_write(struct rw_semaphore *sem)
|
||||
{
|
||||
might_sleep();
|
||||
rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
|
||||
|
||||
__down_write(sem);
|
||||
}
|
||||
|
||||
EXPORT_SYMBOL(down_write);
|
||||
|
||||
/*
|
||||
* trylock for writing -- returns 1 if successful, 0 if contention
|
||||
*/
|
||||
int down_write_trylock(struct rw_semaphore *sem)
|
||||
{
|
||||
int ret = __down_write_trylock(sem);
|
||||
|
||||
if (ret == 1)
|
||||
rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
|
||||
return ret;
|
||||
}
|
||||
|
||||
EXPORT_SYMBOL(down_write_trylock);
|
||||
|
||||
/*
|
||||
* release a read lock
|
||||
*/
|
||||
void up_read(struct rw_semaphore *sem)
|
||||
{
|
||||
rwsem_release(&sem->dep_map, 1, _RET_IP_);
|
||||
|
||||
__up_read(sem);
|
||||
}
|
||||
|
||||
EXPORT_SYMBOL(up_read);
|
||||
|
||||
/*
|
||||
* release a write lock
|
||||
*/
|
||||
void up_write(struct rw_semaphore *sem)
|
||||
{
|
||||
rwsem_release(&sem->dep_map, 1, _RET_IP_);
|
||||
|
||||
__up_write(sem);
|
||||
}
|
||||
|
||||
EXPORT_SYMBOL(up_write);
|
||||
|
||||
/*
|
||||
* downgrade write lock to read lock
|
||||
*/
|
||||
void downgrade_write(struct rw_semaphore *sem)
|
||||
{
|
||||
/*
|
||||
* lockdep: a downgraded write will live on as a write
|
||||
* dependency.
|
||||
*/
|
||||
__downgrade_write(sem);
|
||||
}
|
||||
|
||||
EXPORT_SYMBOL(downgrade_write);
|
||||
|
||||
#ifdef CONFIG_DEBUG_LOCK_ALLOC
|
||||
|
||||
void down_read_nested(struct rw_semaphore *sem, int subclass)
|
||||
{
|
||||
might_sleep();
|
||||
rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_);
|
||||
|
||||
__down_read(sem);
|
||||
}
|
||||
|
||||
EXPORT_SYMBOL(down_read_nested);
|
||||
|
||||
void down_read_non_owner(struct rw_semaphore *sem)
|
||||
{
|
||||
might_sleep();
|
||||
|
||||
__down_read(sem);
|
||||
}
|
||||
|
||||
EXPORT_SYMBOL(down_read_non_owner);
|
||||
|
||||
void down_write_nested(struct rw_semaphore *sem, int subclass)
|
||||
{
|
||||
might_sleep();
|
||||
rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_);
|
||||
|
||||
__down_write_nested(sem, subclass);
|
||||
}
|
||||
|
||||
EXPORT_SYMBOL(down_write_nested);
|
||||
|
||||
void up_read_non_owner(struct rw_semaphore *sem)
|
||||
{
|
||||
__up_read(sem);
|
||||
}
|
||||
|
||||
EXPORT_SYMBOL(up_read_non_owner);
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
6462
kernel/sched.c
Normal file
6462
kernel/sched.c
Normal file
File diff suppressed because it is too large
Load Diff
276
kernel/sched_debug.c
Normal file
276
kernel/sched_debug.c
Normal file
@@ -0,0 +1,276 @@
|
||||
/*
|
||||
* kernel/time/sched_debug.c
|
||||
*
|
||||
* Print the CFS rbtree
|
||||
*
|
||||
* Copyright(C) 2007, Red Hat, Inc., Ingo Molnar
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License version 2 as
|
||||
* published by the Free Software Foundation.
|
||||
*/
|
||||
|
||||
#include <linux/proc_fs.h>
|
||||
#include <linux/sched.h>
|
||||
#include <linux/seq_file.h>
|
||||
#include <linux/kallsyms.h>
|
||||
#include <linux/utsname.h>
|
||||
|
||||
typedef void (*print_fn_t)(struct seq_file *m, unsigned int *classes);
|
||||
|
||||
/*
|
||||
* This allows printing both to /proc/sched_debug and
|
||||
* to the console
|
||||
*/
|
||||
#define SEQ_printf(m, x...) \
|
||||
do { \
|
||||
if (m) \
|
||||
seq_printf(m, x); \
|
||||
else \
|
||||
printk(x); \
|
||||
} while (0)
|
||||
|
||||
static void
|
||||
print_task(struct seq_file *m, struct rq *rq, struct task_struct *p, u64 now)
|
||||
{
|
||||
if (rq->curr == p)
|
||||
SEQ_printf(m, "R");
|
||||
else
|
||||
SEQ_printf(m, " ");
|
||||
|
||||
SEQ_printf(m, "%15s %5d %15Ld %13Ld %13Ld %9Ld %5d "
|
||||
"%15Ld %15Ld %15Ld %15Ld %15Ld\n",
|
||||
p->comm, p->pid,
|
||||
(long long)p->se.fair_key,
|
||||
(long long)(p->se.fair_key - rq->cfs.fair_clock),
|
||||
(long long)p->se.wait_runtime,
|
||||
(long long)(p->nvcsw + p->nivcsw),
|
||||
p->prio,
|
||||
(long long)p->se.sum_exec_runtime,
|
||||
(long long)p->se.sum_wait_runtime,
|
||||
(long long)p->se.sum_sleep_runtime,
|
||||
(long long)p->se.wait_runtime_overruns,
|
||||
(long long)p->se.wait_runtime_underruns);
|
||||
}
|
||||
|
||||
static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu, u64 now)
|
||||
{
|
||||
struct task_struct *g, *p;
|
||||
|
||||
SEQ_printf(m,
|
||||
"\nrunnable tasks:\n"
|
||||
" task PID tree-key delta waiting"
|
||||
" switches prio"
|
||||
" sum-exec sum-wait sum-sleep"
|
||||
" wait-overrun wait-underrun\n"
|
||||
"------------------------------------------------------------------"
|
||||
"----------------"
|
||||
"------------------------------------------------"
|
||||
"--------------------------------\n");
|
||||
|
||||
read_lock_irq(&tasklist_lock);
|
||||
|
||||
do_each_thread(g, p) {
|
||||
if (!p->se.on_rq || task_cpu(p) != rq_cpu)
|
||||
continue;
|
||||
|
||||
print_task(m, rq, p, now);
|
||||
} while_each_thread(g, p);
|
||||
|
||||
read_unlock_irq(&tasklist_lock);
|
||||
}
|
||||
|
||||
static void
|
||||
print_cfs_rq_runtime_sum(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
|
||||
{
|
||||
s64 wait_runtime_rq_sum = 0;
|
||||
struct task_struct *p;
|
||||
struct rb_node *curr;
|
||||
unsigned long flags;
|
||||
struct rq *rq = &per_cpu(runqueues, cpu);
|
||||
|
||||
spin_lock_irqsave(&rq->lock, flags);
|
||||
curr = first_fair(cfs_rq);
|
||||
while (curr) {
|
||||
p = rb_entry(curr, struct task_struct, se.run_node);
|
||||
wait_runtime_rq_sum += p->se.wait_runtime;
|
||||
|
||||
curr = rb_next(curr);
|
||||
}
|
||||
spin_unlock_irqrestore(&rq->lock, flags);
|
||||
|
||||
SEQ_printf(m, " .%-30s: %Ld\n", "wait_runtime_rq_sum",
|
||||
(long long)wait_runtime_rq_sum);
|
||||
}
|
||||
|
||||
void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq, u64 now)
|
||||
{
|
||||
SEQ_printf(m, "\ncfs_rq %p\n", cfs_rq);
|
||||
|
||||
#define P(x) \
|
||||
SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(cfs_rq->x))
|
||||
|
||||
P(fair_clock);
|
||||
P(exec_clock);
|
||||
P(wait_runtime);
|
||||
P(wait_runtime_overruns);
|
||||
P(wait_runtime_underruns);
|
||||
P(sleeper_bonus);
|
||||
#undef P
|
||||
|
||||
print_cfs_rq_runtime_sum(m, cpu, cfs_rq);
|
||||
}
|
||||
|
||||
static void print_cpu(struct seq_file *m, int cpu, u64 now)
|
||||
{
|
||||
struct rq *rq = &per_cpu(runqueues, cpu);
|
||||
|
||||
#ifdef CONFIG_X86
|
||||
{
|
||||
unsigned int freq = cpu_khz ? : 1;
|
||||
|
||||
SEQ_printf(m, "\ncpu#%d, %u.%03u MHz\n",
|
||||
cpu, freq / 1000, (freq % 1000));
|
||||
}
|
||||
#else
|
||||
SEQ_printf(m, "\ncpu#%d\n", cpu);
|
||||
#endif
|
||||
|
||||
#define P(x) \
|
||||
SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rq->x))
|
||||
|
||||
P(nr_running);
|
||||
SEQ_printf(m, " .%-30s: %lu\n", "load",
|
||||
rq->ls.load.weight);
|
||||
P(ls.delta_fair);
|
||||
P(ls.delta_exec);
|
||||
P(nr_switches);
|
||||
P(nr_load_updates);
|
||||
P(nr_uninterruptible);
|
||||
SEQ_printf(m, " .%-30s: %lu\n", "jiffies", jiffies);
|
||||
P(next_balance);
|
||||
P(curr->pid);
|
||||
P(clock);
|
||||
P(prev_clock_raw);
|
||||
P(clock_warps);
|
||||
P(clock_overflows);
|
||||
P(clock_unstable_events);
|
||||
P(clock_max_delta);
|
||||
P(cpu_load[0]);
|
||||
P(cpu_load[1]);
|
||||
P(cpu_load[2]);
|
||||
P(cpu_load[3]);
|
||||
P(cpu_load[4]);
|
||||
#undef P
|
||||
|
||||
print_cfs_stats(m, cpu, now);
|
||||
|
||||
print_rq(m, rq, cpu, now);
|
||||
}
|
||||
|
||||
static int sched_debug_show(struct seq_file *m, void *v)
|
||||
{
|
||||
u64 now = ktime_to_ns(ktime_get());
|
||||
int cpu;
|
||||
|
||||
SEQ_printf(m, "Sched Debug Version: v0.04, cfs-v19, %s %.*s\n",
|
||||
init_utsname()->release,
|
||||
(int)strcspn(init_utsname()->version, " "),
|
||||
init_utsname()->version);
|
||||
|
||||
SEQ_printf(m, "now at %Lu nsecs\n", (unsigned long long)now);
|
||||
|
||||
for_each_online_cpu(cpu)
|
||||
print_cpu(m, cpu, now);
|
||||
|
||||
SEQ_printf(m, "\n");
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
void sysrq_sched_debug_show(void)
|
||||
{
|
||||
sched_debug_show(NULL, NULL);
|
||||
}
|
||||
|
||||
static int sched_debug_open(struct inode *inode, struct file *filp)
|
||||
{
|
||||
return single_open(filp, sched_debug_show, NULL);
|
||||
}
|
||||
|
||||
static struct file_operations sched_debug_fops = {
|
||||
.open = sched_debug_open,
|
||||
.read = seq_read,
|
||||
.llseek = seq_lseek,
|
||||
.release = seq_release,
|
||||
};
|
||||
|
||||
static int __init init_sched_debug_procfs(void)
|
||||
{
|
||||
struct proc_dir_entry *pe;
|
||||
|
||||
pe = create_proc_entry("sched_debug", 0644, NULL);
|
||||
if (!pe)
|
||||
return -ENOMEM;
|
||||
|
||||
pe->proc_fops = &sched_debug_fops;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
__initcall(init_sched_debug_procfs);
|
||||
|
||||
void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
|
||||
{
|
||||
unsigned long flags;
|
||||
int num_threads = 1;
|
||||
|
||||
rcu_read_lock();
|
||||
if (lock_task_sighand(p, &flags)) {
|
||||
num_threads = atomic_read(&p->signal->count);
|
||||
unlock_task_sighand(p, &flags);
|
||||
}
|
||||
rcu_read_unlock();
|
||||
|
||||
SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid, num_threads);
|
||||
SEQ_printf(m, "----------------------------------------------\n");
|
||||
#define P(F) \
|
||||
SEQ_printf(m, "%-25s:%20Ld\n", #F, (long long)p->F)
|
||||
|
||||
P(se.wait_start);
|
||||
P(se.wait_start_fair);
|
||||
P(se.exec_start);
|
||||
P(se.sleep_start);
|
||||
P(se.sleep_start_fair);
|
||||
P(se.block_start);
|
||||
P(se.sleep_max);
|
||||
P(se.block_max);
|
||||
P(se.exec_max);
|
||||
P(se.wait_max);
|
||||
P(se.wait_runtime);
|
||||
P(se.wait_runtime_overruns);
|
||||
P(se.wait_runtime_underruns);
|
||||
P(se.sum_wait_runtime);
|
||||
P(se.sum_exec_runtime);
|
||||
SEQ_printf(m, "%-25s:%20Ld\n",
|
||||
"nr_switches", (long long)(p->nvcsw + p->nivcsw));
|
||||
P(se.load.weight);
|
||||
P(policy);
|
||||
P(prio);
|
||||
#undef P
|
||||
|
||||
{
|
||||
u64 t0, t1;
|
||||
|
||||
t0 = sched_clock();
|
||||
t1 = sched_clock();
|
||||
SEQ_printf(m, "%-25s:%20Ld\n", "clock-delta", (long long)(t1-t0));
|
||||
}
|
||||
}
|
||||
|
||||
void proc_sched_set_task(struct task_struct *p)
|
||||
{
|
||||
p->se.sleep_max = p->se.block_max = p->se.exec_max = p->se.wait_max = 0;
|
||||
p->se.wait_runtime_overruns = p->se.wait_runtime_underruns = 0;
|
||||
p->se.sum_exec_runtime = 0;
|
||||
}
|
||||
1107
kernel/sched_fair.c
Normal file
1107
kernel/sched_fair.c
Normal file
File diff suppressed because it is too large
Load Diff
71
kernel/sched_idletask.c
Normal file
71
kernel/sched_idletask.c
Normal file
@@ -0,0 +1,71 @@
|
||||
/*
|
||||
* idle-task scheduling class.
|
||||
*
|
||||
* (NOTE: these are not related to SCHED_IDLE tasks which are
|
||||
* handled in sched_fair.c)
|
||||
*/
|
||||
|
||||
/*
|
||||
* Idle tasks are unconditionally rescheduled:
|
||||
*/
|
||||
static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p)
|
||||
{
|
||||
resched_task(rq->idle);
|
||||
}
|
||||
|
||||
static struct task_struct *pick_next_task_idle(struct rq *rq, u64 now)
|
||||
{
|
||||
schedstat_inc(rq, sched_goidle);
|
||||
|
||||
return rq->idle;
|
||||
}
|
||||
|
||||
/*
|
||||
* It is not legal to sleep in the idle task - print a warning
|
||||
* message if some code attempts to do it:
|
||||
*/
|
||||
static void
|
||||
dequeue_task_idle(struct rq *rq, struct task_struct *p, int sleep, u64 now)
|
||||
{
|
||||
spin_unlock_irq(&rq->lock);
|
||||
printk(KERN_ERR "bad: scheduling from the idle thread!\n");
|
||||
dump_stack();
|
||||
spin_lock_irq(&rq->lock);
|
||||
}
|
||||
|
||||
static void put_prev_task_idle(struct rq *rq, struct task_struct *prev, u64 now)
|
||||
{
|
||||
}
|
||||
|
||||
static int
|
||||
load_balance_idle(struct rq *this_rq, int this_cpu, struct rq *busiest,
|
||||
unsigned long max_nr_move, unsigned long max_load_move,
|
||||
struct sched_domain *sd, enum cpu_idle_type idle,
|
||||
int *all_pinned, unsigned long *total_load_moved)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void task_tick_idle(struct rq *rq, struct task_struct *curr)
|
||||
{
|
||||
}
|
||||
|
||||
/*
|
||||
* Simple, special scheduling class for the per-CPU idle tasks:
|
||||
*/
|
||||
static struct sched_class idle_sched_class __read_mostly = {
|
||||
/* no enqueue/yield_task for idle tasks */
|
||||
|
||||
/* dequeue is not valid, we print a debug message there: */
|
||||
.dequeue_task = dequeue_task_idle,
|
||||
|
||||
.check_preempt_curr = check_preempt_curr_idle,
|
||||
|
||||
.pick_next_task = pick_next_task_idle,
|
||||
.put_prev_task = put_prev_task_idle,
|
||||
|
||||
.load_balance = load_balance_idle,
|
||||
|
||||
.task_tick = task_tick_idle,
|
||||
/* no .task_new for idle tasks */
|
||||
};
|
||||
255
kernel/sched_rt.c
Normal file
255
kernel/sched_rt.c
Normal file
@@ -0,0 +1,255 @@
|
||||
/*
|
||||
* Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
|
||||
* policies)
|
||||
*/
|
||||
|
||||
/*
|
||||
* Update the current task's runtime statistics. Skip current tasks that
|
||||
* are not in our scheduling class.
|
||||
*/
|
||||
static inline void update_curr_rt(struct rq *rq, u64 now)
|
||||
{
|
||||
struct task_struct *curr = rq->curr;
|
||||
u64 delta_exec;
|
||||
|
||||
if (!task_has_rt_policy(curr))
|
||||
return;
|
||||
|
||||
delta_exec = now - curr->se.exec_start;
|
||||
if (unlikely((s64)delta_exec < 0))
|
||||
delta_exec = 0;
|
||||
if (unlikely(delta_exec > curr->se.exec_max))
|
||||
curr->se.exec_max = delta_exec;
|
||||
|
||||
curr->se.sum_exec_runtime += delta_exec;
|
||||
curr->se.exec_start = now;
|
||||
}
|
||||
|
||||
static void
|
||||
enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup, u64 now)
|
||||
{
|
||||
struct prio_array *array = &rq->rt.active;
|
||||
|
||||
list_add_tail(&p->run_list, array->queue + p->prio);
|
||||
__set_bit(p->prio, array->bitmap);
|
||||
}
|
||||
|
||||
/*
|
||||
* Adding/removing a task to/from a priority array:
|
||||
*/
|
||||
static void
|
||||
dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep, u64 now)
|
||||
{
|
||||
struct prio_array *array = &rq->rt.active;
|
||||
|
||||
update_curr_rt(rq, now);
|
||||
|
||||
list_del(&p->run_list);
|
||||
if (list_empty(array->queue + p->prio))
|
||||
__clear_bit(p->prio, array->bitmap);
|
||||
}
|
||||
|
||||
/*
|
||||
* Put task to the end of the run list without the overhead of dequeue
|
||||
* followed by enqueue.
|
||||
*/
|
||||
static void requeue_task_rt(struct rq *rq, struct task_struct *p)
|
||||
{
|
||||
struct prio_array *array = &rq->rt.active;
|
||||
|
||||
list_move_tail(&p->run_list, array->queue + p->prio);
|
||||
}
|
||||
|
||||
static void
|
||||
yield_task_rt(struct rq *rq, struct task_struct *p)
|
||||
{
|
||||
requeue_task_rt(rq, p);
|
||||
}
|
||||
|
||||
/*
|
||||
* Preempt the current task with a newly woken task if needed:
|
||||
*/
|
||||
static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p)
|
||||
{
|
||||
if (p->prio < rq->curr->prio)
|
||||
resched_task(rq->curr);
|
||||
}
|
||||
|
||||
static struct task_struct * pick_next_task_rt(struct rq *rq, u64 now)
|
||||
{
|
||||
struct prio_array *array = &rq->rt.active;
|
||||
struct task_struct *next;
|
||||
struct list_head *queue;
|
||||
int idx;
|
||||
|
||||
idx = sched_find_first_bit(array->bitmap);
|
||||
if (idx >= MAX_RT_PRIO)
|
||||
return NULL;
|
||||
|
||||
queue = array->queue + idx;
|
||||
next = list_entry(queue->next, struct task_struct, run_list);
|
||||
|
||||
next->se.exec_start = now;
|
||||
|
||||
return next;
|
||||
}
|
||||
|
||||
static void put_prev_task_rt(struct rq *rq, struct task_struct *p, u64 now)
|
||||
{
|
||||
update_curr_rt(rq, now);
|
||||
p->se.exec_start = 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Load-balancing iterator. Note: while the runqueue stays locked
|
||||
* during the whole iteration, the current task might be
|
||||
* dequeued so the iterator has to be dequeue-safe. Here we
|
||||
* achieve that by always pre-iterating before returning
|
||||
* the current task:
|
||||
*/
|
||||
static struct task_struct *load_balance_start_rt(void *arg)
|
||||
{
|
||||
struct rq *rq = arg;
|
||||
struct prio_array *array = &rq->rt.active;
|
||||
struct list_head *head, *curr;
|
||||
struct task_struct *p;
|
||||
int idx;
|
||||
|
||||
idx = sched_find_first_bit(array->bitmap);
|
||||
if (idx >= MAX_RT_PRIO)
|
||||
return NULL;
|
||||
|
||||
head = array->queue + idx;
|
||||
curr = head->prev;
|
||||
|
||||
p = list_entry(curr, struct task_struct, run_list);
|
||||
|
||||
curr = curr->prev;
|
||||
|
||||
rq->rt.rt_load_balance_idx = idx;
|
||||
rq->rt.rt_load_balance_head = head;
|
||||
rq->rt.rt_load_balance_curr = curr;
|
||||
|
||||
return p;
|
||||
}
|
||||
|
||||
static struct task_struct *load_balance_next_rt(void *arg)
|
||||
{
|
||||
struct rq *rq = arg;
|
||||
struct prio_array *array = &rq->rt.active;
|
||||
struct list_head *head, *curr;
|
||||
struct task_struct *p;
|
||||
int idx;
|
||||
|
||||
idx = rq->rt.rt_load_balance_idx;
|
||||
head = rq->rt.rt_load_balance_head;
|
||||
curr = rq->rt.rt_load_balance_curr;
|
||||
|
||||
/*
|
||||
* If we arrived back to the head again then
|
||||
* iterate to the next queue (if any):
|
||||
*/
|
||||
if (unlikely(head == curr)) {
|
||||
int next_idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1);
|
||||
|
||||
if (next_idx >= MAX_RT_PRIO)
|
||||
return NULL;
|
||||
|
||||
idx = next_idx;
|
||||
head = array->queue + idx;
|
||||
curr = head->prev;
|
||||
|
||||
rq->rt.rt_load_balance_idx = idx;
|
||||
rq->rt.rt_load_balance_head = head;
|
||||
}
|
||||
|
||||
p = list_entry(curr, struct task_struct, run_list);
|
||||
|
||||
curr = curr->prev;
|
||||
|
||||
rq->rt.rt_load_balance_curr = curr;
|
||||
|
||||
return p;
|
||||
}
|
||||
|
||||
static int
|
||||
load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
|
||||
unsigned long max_nr_move, unsigned long max_load_move,
|
||||
struct sched_domain *sd, enum cpu_idle_type idle,
|
||||
int *all_pinned, unsigned long *load_moved)
|
||||
{
|
||||
int this_best_prio, best_prio, best_prio_seen = 0;
|
||||
int nr_moved;
|
||||
struct rq_iterator rt_rq_iterator;
|
||||
|
||||
best_prio = sched_find_first_bit(busiest->rt.active.bitmap);
|
||||
this_best_prio = sched_find_first_bit(this_rq->rt.active.bitmap);
|
||||
|
||||
/*
|
||||
* Enable handling of the case where there is more than one task
|
||||
* with the best priority. If the current running task is one
|
||||
* of those with prio==best_prio we know it won't be moved
|
||||
* and therefore it's safe to override the skip (based on load)
|
||||
* of any task we find with that prio.
|
||||
*/
|
||||
if (busiest->curr->prio == best_prio)
|
||||
best_prio_seen = 1;
|
||||
|
||||
rt_rq_iterator.start = load_balance_start_rt;
|
||||
rt_rq_iterator.next = load_balance_next_rt;
|
||||
/* pass 'busiest' rq argument into
|
||||
* load_balance_[start|next]_rt iterators
|
||||
*/
|
||||
rt_rq_iterator.arg = busiest;
|
||||
|
||||
nr_moved = balance_tasks(this_rq, this_cpu, busiest, max_nr_move,
|
||||
max_load_move, sd, idle, all_pinned, load_moved,
|
||||
this_best_prio, best_prio, best_prio_seen,
|
||||
&rt_rq_iterator);
|
||||
|
||||
return nr_moved;
|
||||
}
|
||||
|
||||
static void task_tick_rt(struct rq *rq, struct task_struct *p)
|
||||
{
|
||||
/*
|
||||
* RR tasks need a special form of timeslice management.
|
||||
* FIFO tasks have no timeslices.
|
||||
*/
|
||||
if (p->policy != SCHED_RR)
|
||||
return;
|
||||
|
||||
if (--p->time_slice)
|
||||
return;
|
||||
|
||||
p->time_slice = static_prio_timeslice(p->static_prio);
|
||||
set_tsk_need_resched(p);
|
||||
|
||||
/* put it at the end of the queue: */
|
||||
requeue_task_rt(rq, p);
|
||||
}
|
||||
|
||||
/*
|
||||
* No parent/child timeslice management necessary for RT tasks,
|
||||
* just activate them:
|
||||
*/
|
||||
static void task_new_rt(struct rq *rq, struct task_struct *p)
|
||||
{
|
||||
activate_task(rq, p, 1);
|
||||
}
|
||||
|
||||
static struct sched_class rt_sched_class __read_mostly = {
|
||||
.enqueue_task = enqueue_task_rt,
|
||||
.dequeue_task = dequeue_task_rt,
|
||||
.yield_task = yield_task_rt,
|
||||
|
||||
.check_preempt_curr = check_preempt_curr_rt,
|
||||
|
||||
.pick_next_task = pick_next_task_rt,
|
||||
.put_prev_task = put_prev_task_rt,
|
||||
|
||||
.load_balance = load_balance_rt,
|
||||
|
||||
.task_tick = task_tick_rt,
|
||||
.task_new = task_new_rt,
|
||||
};
|
||||
235
kernel/sched_stats.h
Normal file
235
kernel/sched_stats.h
Normal file
@@ -0,0 +1,235 @@
|
||||
|
||||
#ifdef CONFIG_SCHEDSTATS
|
||||
/*
|
||||
* bump this up when changing the output format or the meaning of an existing
|
||||
* format, so that tools can adapt (or abort)
|
||||
*/
|
||||
#define SCHEDSTAT_VERSION 14
|
||||
|
||||
static int show_schedstat(struct seq_file *seq, void *v)
|
||||
{
|
||||
int cpu;
|
||||
|
||||
seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
|
||||
seq_printf(seq, "timestamp %lu\n", jiffies);
|
||||
for_each_online_cpu(cpu) {
|
||||
struct rq *rq = cpu_rq(cpu);
|
||||
#ifdef CONFIG_SMP
|
||||
struct sched_domain *sd;
|
||||
int dcnt = 0;
|
||||
#endif
|
||||
|
||||
/* runqueue-specific stats */
|
||||
seq_printf(seq,
|
||||
"cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %llu %llu %lu",
|
||||
cpu, rq->yld_both_empty,
|
||||
rq->yld_act_empty, rq->yld_exp_empty, rq->yld_cnt,
|
||||
rq->sched_switch, rq->sched_cnt, rq->sched_goidle,
|
||||
rq->ttwu_cnt, rq->ttwu_local,
|
||||
rq->rq_sched_info.cpu_time,
|
||||
rq->rq_sched_info.run_delay, rq->rq_sched_info.pcnt);
|
||||
|
||||
seq_printf(seq, "\n");
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
/* domain-specific stats */
|
||||
preempt_disable();
|
||||
for_each_domain(cpu, sd) {
|
||||
enum cpu_idle_type itype;
|
||||
char mask_str[NR_CPUS];
|
||||
|
||||
cpumask_scnprintf(mask_str, NR_CPUS, sd->span);
|
||||
seq_printf(seq, "domain%d %s", dcnt++, mask_str);
|
||||
for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES;
|
||||
itype++) {
|
||||
seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu "
|
||||
"%lu",
|
||||
sd->lb_cnt[itype],
|
||||
sd->lb_balanced[itype],
|
||||
sd->lb_failed[itype],
|
||||
sd->lb_imbalance[itype],
|
||||
sd->lb_gained[itype],
|
||||
sd->lb_hot_gained[itype],
|
||||
sd->lb_nobusyq[itype],
|
||||
sd->lb_nobusyg[itype]);
|
||||
}
|
||||
seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu"
|
||||
" %lu %lu %lu\n",
|
||||
sd->alb_cnt, sd->alb_failed, sd->alb_pushed,
|
||||
sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed,
|
||||
sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed,
|
||||
sd->ttwu_wake_remote, sd->ttwu_move_affine,
|
||||
sd->ttwu_move_balance);
|
||||
}
|
||||
preempt_enable();
|
||||
#endif
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int schedstat_open(struct inode *inode, struct file *file)
|
||||
{
|
||||
unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32);
|
||||
char *buf = kmalloc(size, GFP_KERNEL);
|
||||
struct seq_file *m;
|
||||
int res;
|
||||
|
||||
if (!buf)
|
||||
return -ENOMEM;
|
||||
res = single_open(file, show_schedstat, NULL);
|
||||
if (!res) {
|
||||
m = file->private_data;
|
||||
m->buf = buf;
|
||||
m->size = size;
|
||||
} else
|
||||
kfree(buf);
|
||||
return res;
|
||||
}
|
||||
|
||||
const struct file_operations proc_schedstat_operations = {
|
||||
.open = schedstat_open,
|
||||
.read = seq_read,
|
||||
.llseek = seq_lseek,
|
||||
.release = single_release,
|
||||
};
|
||||
|
||||
/*
|
||||
* Expects runqueue lock to be held for atomicity of update
|
||||
*/
|
||||
static inline void
|
||||
rq_sched_info_arrive(struct rq *rq, unsigned long long delta)
|
||||
{
|
||||
if (rq) {
|
||||
rq->rq_sched_info.run_delay += delta;
|
||||
rq->rq_sched_info.pcnt++;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Expects runqueue lock to be held for atomicity of update
|
||||
*/
|
||||
static inline void
|
||||
rq_sched_info_depart(struct rq *rq, unsigned long long delta)
|
||||
{
|
||||
if (rq)
|
||||
rq->rq_sched_info.cpu_time += delta;
|
||||
}
|
||||
# define schedstat_inc(rq, field) do { (rq)->field++; } while (0)
|
||||
# define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0)
|
||||
#else /* !CONFIG_SCHEDSTATS */
|
||||
static inline void
|
||||
rq_sched_info_arrive(struct rq *rq, unsigned long long delta)
|
||||
{}
|
||||
static inline void
|
||||
rq_sched_info_depart(struct rq *rq, unsigned long long delta)
|
||||
{}
|
||||
# define schedstat_inc(rq, field) do { } while (0)
|
||||
# define schedstat_add(rq, field, amt) do { } while (0)
|
||||
#endif
|
||||
|
||||
#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
|
||||
/*
|
||||
* Called when a process is dequeued from the active array and given
|
||||
* the cpu. We should note that with the exception of interactive
|
||||
* tasks, the expired queue will become the active queue after the active
|
||||
* queue is empty, without explicitly dequeuing and requeuing tasks in the
|
||||
* expired queue. (Interactive tasks may be requeued directly to the
|
||||
* active queue, thus delaying tasks in the expired queue from running;
|
||||
* see scheduler_tick()).
|
||||
*
|
||||
* This function is only called from sched_info_arrive(), rather than
|
||||
* dequeue_task(). Even though a task may be queued and dequeued multiple
|
||||
* times as it is shuffled about, we're really interested in knowing how
|
||||
* long it was from the *first* time it was queued to the time that it
|
||||
* finally hit a cpu.
|
||||
*/
|
||||
static inline void sched_info_dequeued(struct task_struct *t)
|
||||
{
|
||||
t->sched_info.last_queued = 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Called when a task finally hits the cpu. We can now calculate how
|
||||
* long it was waiting to run. We also note when it began so that we
|
||||
* can keep stats on how long its timeslice is.
|
||||
*/
|
||||
static void sched_info_arrive(struct task_struct *t)
|
||||
{
|
||||
unsigned long long now = sched_clock(), delta = 0;
|
||||
|
||||
if (t->sched_info.last_queued)
|
||||
delta = now - t->sched_info.last_queued;
|
||||
sched_info_dequeued(t);
|
||||
t->sched_info.run_delay += delta;
|
||||
t->sched_info.last_arrival = now;
|
||||
t->sched_info.pcnt++;
|
||||
|
||||
rq_sched_info_arrive(task_rq(t), delta);
|
||||
}
|
||||
|
||||
/*
|
||||
* Called when a process is queued into either the active or expired
|
||||
* array. The time is noted and later used to determine how long we
|
||||
* had to wait for us to reach the cpu. Since the expired queue will
|
||||
* become the active queue after active queue is empty, without dequeuing
|
||||
* and requeuing any tasks, we are interested in queuing to either. It
|
||||
* is unusual but not impossible for tasks to be dequeued and immediately
|
||||
* requeued in the same or another array: this can happen in sched_yield(),
|
||||
* set_user_nice(), and even load_balance() as it moves tasks from runqueue
|
||||
* to runqueue.
|
||||
*
|
||||
* This function is only called from enqueue_task(), but also only updates
|
||||
* the timestamp if it is already not set. It's assumed that
|
||||
* sched_info_dequeued() will clear that stamp when appropriate.
|
||||
*/
|
||||
static inline void sched_info_queued(struct task_struct *t)
|
||||
{
|
||||
if (unlikely(sched_info_on()))
|
||||
if (!t->sched_info.last_queued)
|
||||
t->sched_info.last_queued = sched_clock();
|
||||
}
|
||||
|
||||
/*
|
||||
* Called when a process ceases being the active-running process, either
|
||||
* voluntarily or involuntarily. Now we can calculate how long we ran.
|
||||
*/
|
||||
static inline void sched_info_depart(struct task_struct *t)
|
||||
{
|
||||
unsigned long long delta = sched_clock() - t->sched_info.last_arrival;
|
||||
|
||||
t->sched_info.cpu_time += delta;
|
||||
rq_sched_info_depart(task_rq(t), delta);
|
||||
}
|
||||
|
||||
/*
|
||||
* Called when tasks are switched involuntarily due, typically, to expiring
|
||||
* their time slice. (This may also be called when switching to or from
|
||||
* the idle task.) We are only called when prev != next.
|
||||
*/
|
||||
static inline void
|
||||
__sched_info_switch(struct task_struct *prev, struct task_struct *next)
|
||||
{
|
||||
struct rq *rq = task_rq(prev);
|
||||
|
||||
/*
|
||||
* prev now departs the cpu. It's not interesting to record
|
||||
* stats about how efficient we were at scheduling the idle
|
||||
* process, however.
|
||||
*/
|
||||
if (prev != rq->idle)
|
||||
sched_info_depart(prev);
|
||||
|
||||
if (next != rq->idle)
|
||||
sched_info_arrive(next);
|
||||
}
|
||||
static inline void
|
||||
sched_info_switch(struct task_struct *prev, struct task_struct *next)
|
||||
{
|
||||
if (unlikely(sched_info_on()))
|
||||
__sched_info_switch(prev, next);
|
||||
}
|
||||
#else
|
||||
#define sched_info_queued(t) do { } while (0)
|
||||
#define sched_info_switch(t, next) do { } while (0)
|
||||
#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
|
||||
|
||||
56
kernel/seccomp.c
Normal file
56
kernel/seccomp.c
Normal file
@@ -0,0 +1,56 @@
|
||||
/*
|
||||
* linux/kernel/seccomp.c
|
||||
*
|
||||
* Copyright 2004-2005 Andrea Arcangeli <andrea@cpushare.com>
|
||||
*
|
||||
* This defines a simple but solid secure-computing mode.
|
||||
*/
|
||||
|
||||
#include <linux/seccomp.h>
|
||||
#include <linux/sched.h>
|
||||
|
||||
/* #define SECCOMP_DEBUG 1 */
|
||||
|
||||
/*
|
||||
* Secure computing mode 1 allows only read/write/exit/sigreturn.
|
||||
* To be fully secure this must be combined with rlimit
|
||||
* to limit the stack allocations too.
|
||||
*/
|
||||
static int mode1_syscalls[] = {
|
||||
__NR_seccomp_read, __NR_seccomp_write, __NR_seccomp_exit, __NR_seccomp_sigreturn,
|
||||
0, /* null terminated */
|
||||
};
|
||||
|
||||
#ifdef TIF_32BIT
|
||||
static int mode1_syscalls_32[] = {
|
||||
__NR_seccomp_read_32, __NR_seccomp_write_32, __NR_seccomp_exit_32, __NR_seccomp_sigreturn_32,
|
||||
0, /* null terminated */
|
||||
};
|
||||
#endif
|
||||
|
||||
void __secure_computing(int this_syscall)
|
||||
{
|
||||
int mode = current->seccomp.mode;
|
||||
int * syscall;
|
||||
|
||||
switch (mode) {
|
||||
case 1:
|
||||
syscall = mode1_syscalls;
|
||||
#ifdef TIF_32BIT
|
||||
if (test_thread_flag(TIF_32BIT))
|
||||
syscall = mode1_syscalls_32;
|
||||
#endif
|
||||
do {
|
||||
if (*syscall == this_syscall)
|
||||
return;
|
||||
} while (*++syscall);
|
||||
break;
|
||||
default:
|
||||
BUG();
|
||||
}
|
||||
|
||||
#ifdef SECCOMP_DEBUG
|
||||
dump_stack();
|
||||
#endif
|
||||
do_exit(SIGKILL);
|
||||
}
|
||||
2644
kernel/signal.c
Normal file
2644
kernel/signal.c
Normal file
File diff suppressed because it is too large
Load Diff
656
kernel/softirq.c
Normal file
656
kernel/softirq.c
Normal file
@@ -0,0 +1,656 @@
|
||||
/*
|
||||
* linux/kernel/softirq.c
|
||||
*
|
||||
* Copyright (C) 1992 Linus Torvalds
|
||||
*
|
||||
* Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903)
|
||||
*/
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/kernel_stat.h>
|
||||
#include <linux/interrupt.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/notifier.h>
|
||||
#include <linux/percpu.h>
|
||||
#include <linux/cpu.h>
|
||||
#include <linux/kthread.h>
|
||||
#include <linux/rcupdate.h>
|
||||
#include <linux/smp.h>
|
||||
#include <linux/tick.h>
|
||||
|
||||
#include <asm/irq.h>
|
||||
/*
|
||||
- No shared variables, all the data are CPU local.
|
||||
- If a softirq needs serialization, let it serialize itself
|
||||
by its own spinlocks.
|
||||
- Even if softirq is serialized, only local cpu is marked for
|
||||
execution. Hence, we get something sort of weak cpu binding.
|
||||
Though it is still not clear, will it result in better locality
|
||||
or will not.
|
||||
|
||||
Examples:
|
||||
- NET RX softirq. It is multithreaded and does not require
|
||||
any global serialization.
|
||||
- NET TX softirq. It kicks software netdevice queues, hence
|
||||
it is logically serialized per device, but this serialization
|
||||
is invisible to common code.
|
||||
- Tasklets: serialized wrt itself.
|
||||
*/
|
||||
|
||||
#ifndef __ARCH_IRQ_STAT
|
||||
irq_cpustat_t irq_stat[NR_CPUS] ____cacheline_aligned;
|
||||
EXPORT_SYMBOL(irq_stat);
|
||||
#endif
|
||||
|
||||
static struct softirq_action softirq_vec[32] __cacheline_aligned_in_smp;
|
||||
|
||||
static DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
|
||||
|
||||
/*
|
||||
* we cannot loop indefinitely here to avoid userspace starvation,
|
||||
* but we also don't want to introduce a worst case 1/HZ latency
|
||||
* to the pending events, so lets the scheduler to balance
|
||||
* the softirq load for us.
|
||||
*/
|
||||
static inline void wakeup_softirqd(void)
|
||||
{
|
||||
/* Interrupts are disabled: no need to stop preemption */
|
||||
struct task_struct *tsk = __get_cpu_var(ksoftirqd);
|
||||
|
||||
if (tsk && tsk->state != TASK_RUNNING)
|
||||
wake_up_process(tsk);
|
||||
}
|
||||
|
||||
/*
|
||||
* This one is for softirq.c-internal use,
|
||||
* where hardirqs are disabled legitimately:
|
||||
*/
|
||||
#ifdef CONFIG_TRACE_IRQFLAGS
|
||||
static void __local_bh_disable(unsigned long ip)
|
||||
{
|
||||
unsigned long flags;
|
||||
|
||||
WARN_ON_ONCE(in_irq());
|
||||
|
||||
raw_local_irq_save(flags);
|
||||
add_preempt_count(SOFTIRQ_OFFSET);
|
||||
/*
|
||||
* Were softirqs turned off above:
|
||||
*/
|
||||
if (softirq_count() == SOFTIRQ_OFFSET)
|
||||
trace_softirqs_off(ip);
|
||||
raw_local_irq_restore(flags);
|
||||
}
|
||||
#else /* !CONFIG_TRACE_IRQFLAGS */
|
||||
static inline void __local_bh_disable(unsigned long ip)
|
||||
{
|
||||
add_preempt_count(SOFTIRQ_OFFSET);
|
||||
barrier();
|
||||
}
|
||||
#endif /* CONFIG_TRACE_IRQFLAGS */
|
||||
|
||||
void local_bh_disable(void)
|
||||
{
|
||||
__local_bh_disable((unsigned long)__builtin_return_address(0));
|
||||
}
|
||||
|
||||
EXPORT_SYMBOL(local_bh_disable);
|
||||
|
||||
void __local_bh_enable(void)
|
||||
{
|
||||
WARN_ON_ONCE(in_irq());
|
||||
|
||||
/*
|
||||
* softirqs should never be enabled by __local_bh_enable(),
|
||||
* it always nests inside local_bh_enable() sections:
|
||||
*/
|
||||
WARN_ON_ONCE(softirq_count() == SOFTIRQ_OFFSET);
|
||||
|
||||
sub_preempt_count(SOFTIRQ_OFFSET);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(__local_bh_enable);
|
||||
|
||||
/*
|
||||
* Special-case - softirqs can safely be enabled in
|
||||
* cond_resched_softirq(), or by __do_softirq(),
|
||||
* without processing still-pending softirqs:
|
||||
*/
|
||||
void _local_bh_enable(void)
|
||||
{
|
||||
WARN_ON_ONCE(in_irq());
|
||||
WARN_ON_ONCE(!irqs_disabled());
|
||||
|
||||
if (softirq_count() == SOFTIRQ_OFFSET)
|
||||
trace_softirqs_on((unsigned long)__builtin_return_address(0));
|
||||
sub_preempt_count(SOFTIRQ_OFFSET);
|
||||
}
|
||||
|
||||
EXPORT_SYMBOL(_local_bh_enable);
|
||||
|
||||
void local_bh_enable(void)
|
||||
{
|
||||
#ifdef CONFIG_TRACE_IRQFLAGS
|
||||
unsigned long flags;
|
||||
|
||||
WARN_ON_ONCE(in_irq());
|
||||
#endif
|
||||
WARN_ON_ONCE(irqs_disabled());
|
||||
|
||||
#ifdef CONFIG_TRACE_IRQFLAGS
|
||||
local_irq_save(flags);
|
||||
#endif
|
||||
/*
|
||||
* Are softirqs going to be turned on now:
|
||||
*/
|
||||
if (softirq_count() == SOFTIRQ_OFFSET)
|
||||
trace_softirqs_on((unsigned long)__builtin_return_address(0));
|
||||
/*
|
||||
* Keep preemption disabled until we are done with
|
||||
* softirq processing:
|
||||
*/
|
||||
sub_preempt_count(SOFTIRQ_OFFSET - 1);
|
||||
|
||||
if (unlikely(!in_interrupt() && local_softirq_pending()))
|
||||
do_softirq();
|
||||
|
||||
dec_preempt_count();
|
||||
#ifdef CONFIG_TRACE_IRQFLAGS
|
||||
local_irq_restore(flags);
|
||||
#endif
|
||||
preempt_check_resched();
|
||||
}
|
||||
EXPORT_SYMBOL(local_bh_enable);
|
||||
|
||||
void local_bh_enable_ip(unsigned long ip)
|
||||
{
|
||||
#ifdef CONFIG_TRACE_IRQFLAGS
|
||||
unsigned long flags;
|
||||
|
||||
WARN_ON_ONCE(in_irq());
|
||||
|
||||
local_irq_save(flags);
|
||||
#endif
|
||||
/*
|
||||
* Are softirqs going to be turned on now:
|
||||
*/
|
||||
if (softirq_count() == SOFTIRQ_OFFSET)
|
||||
trace_softirqs_on(ip);
|
||||
/*
|
||||
* Keep preemption disabled until we are done with
|
||||
* softirq processing:
|
||||
*/
|
||||
sub_preempt_count(SOFTIRQ_OFFSET - 1);
|
||||
|
||||
if (unlikely(!in_interrupt() && local_softirq_pending()))
|
||||
do_softirq();
|
||||
|
||||
dec_preempt_count();
|
||||
#ifdef CONFIG_TRACE_IRQFLAGS
|
||||
local_irq_restore(flags);
|
||||
#endif
|
||||
preempt_check_resched();
|
||||
}
|
||||
EXPORT_SYMBOL(local_bh_enable_ip);
|
||||
|
||||
/*
|
||||
* We restart softirq processing MAX_SOFTIRQ_RESTART times,
|
||||
* and we fall back to softirqd after that.
|
||||
*
|
||||
* This number has been established via experimentation.
|
||||
* The two things to balance is latency against fairness -
|
||||
* we want to handle softirqs as soon as possible, but they
|
||||
* should not be able to lock up the box.
|
||||
*/
|
||||
#define MAX_SOFTIRQ_RESTART 10
|
||||
|
||||
asmlinkage void __do_softirq(void)
|
||||
{
|
||||
struct softirq_action *h;
|
||||
__u32 pending;
|
||||
int max_restart = MAX_SOFTIRQ_RESTART;
|
||||
int cpu;
|
||||
|
||||
pending = local_softirq_pending();
|
||||
account_system_vtime(current);
|
||||
|
||||
__local_bh_disable((unsigned long)__builtin_return_address(0));
|
||||
trace_softirq_enter();
|
||||
|
||||
cpu = smp_processor_id();
|
||||
restart:
|
||||
/* Reset the pending bitmask before enabling irqs */
|
||||
set_softirq_pending(0);
|
||||
|
||||
local_irq_enable();
|
||||
|
||||
h = softirq_vec;
|
||||
|
||||
do {
|
||||
if (pending & 1) {
|
||||
h->action(h);
|
||||
rcu_bh_qsctr_inc(cpu);
|
||||
}
|
||||
h++;
|
||||
pending >>= 1;
|
||||
} while (pending);
|
||||
|
||||
local_irq_disable();
|
||||
|
||||
pending = local_softirq_pending();
|
||||
if (pending && --max_restart)
|
||||
goto restart;
|
||||
|
||||
if (pending)
|
||||
wakeup_softirqd();
|
||||
|
||||
trace_softirq_exit();
|
||||
|
||||
account_system_vtime(current);
|
||||
_local_bh_enable();
|
||||
}
|
||||
|
||||
#ifndef __ARCH_HAS_DO_SOFTIRQ
|
||||
|
||||
asmlinkage void do_softirq(void)
|
||||
{
|
||||
__u32 pending;
|
||||
unsigned long flags;
|
||||
|
||||
if (in_interrupt())
|
||||
return;
|
||||
|
||||
local_irq_save(flags);
|
||||
|
||||
pending = local_softirq_pending();
|
||||
|
||||
if (pending)
|
||||
__do_softirq();
|
||||
|
||||
local_irq_restore(flags);
|
||||
}
|
||||
|
||||
EXPORT_SYMBOL(do_softirq);
|
||||
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Enter an interrupt context.
|
||||
*/
|
||||
void irq_enter(void)
|
||||
{
|
||||
__irq_enter();
|
||||
#ifdef CONFIG_NO_HZ
|
||||
if (idle_cpu(smp_processor_id()))
|
||||
tick_nohz_update_jiffies();
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED
|
||||
# define invoke_softirq() __do_softirq()
|
||||
#else
|
||||
# define invoke_softirq() do_softirq()
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Exit an interrupt context. Process softirqs if needed and possible:
|
||||
*/
|
||||
void irq_exit(void)
|
||||
{
|
||||
account_system_vtime(current);
|
||||
trace_hardirq_exit();
|
||||
sub_preempt_count(IRQ_EXIT_OFFSET);
|
||||
if (!in_interrupt() && local_softirq_pending())
|
||||
invoke_softirq();
|
||||
|
||||
#ifdef CONFIG_NO_HZ
|
||||
/* Make sure that timer wheel updates are propagated */
|
||||
if (!in_interrupt() && idle_cpu(smp_processor_id()) && !need_resched())
|
||||
tick_nohz_stop_sched_tick();
|
||||
#endif
|
||||
preempt_enable_no_resched();
|
||||
}
|
||||
|
||||
/*
|
||||
* This function must run with irqs disabled!
|
||||
*/
|
||||
inline fastcall void raise_softirq_irqoff(unsigned int nr)
|
||||
{
|
||||
__raise_softirq_irqoff(nr);
|
||||
|
||||
/*
|
||||
* If we're in an interrupt or softirq, we're done
|
||||
* (this also catches softirq-disabled code). We will
|
||||
* actually run the softirq once we return from
|
||||
* the irq or softirq.
|
||||
*
|
||||
* Otherwise we wake up ksoftirqd to make sure we
|
||||
* schedule the softirq soon.
|
||||
*/
|
||||
if (!in_interrupt())
|
||||
wakeup_softirqd();
|
||||
}
|
||||
|
||||
EXPORT_SYMBOL(raise_softirq_irqoff);
|
||||
|
||||
void fastcall raise_softirq(unsigned int nr)
|
||||
{
|
||||
unsigned long flags;
|
||||
|
||||
local_irq_save(flags);
|
||||
raise_softirq_irqoff(nr);
|
||||
local_irq_restore(flags);
|
||||
}
|
||||
|
||||
void open_softirq(int nr, void (*action)(struct softirq_action*), void *data)
|
||||
{
|
||||
softirq_vec[nr].data = data;
|
||||
softirq_vec[nr].action = action;
|
||||
}
|
||||
|
||||
/* Tasklets */
|
||||
struct tasklet_head
|
||||
{
|
||||
struct tasklet_struct *list;
|
||||
};
|
||||
|
||||
/* Some compilers disobey section attribute on statics when not
|
||||
initialized -- RR */
|
||||
static DEFINE_PER_CPU(struct tasklet_head, tasklet_vec) = { NULL };
|
||||
static DEFINE_PER_CPU(struct tasklet_head, tasklet_hi_vec) = { NULL };
|
||||
|
||||
void fastcall __tasklet_schedule(struct tasklet_struct *t)
|
||||
{
|
||||
unsigned long flags;
|
||||
|
||||
local_irq_save(flags);
|
||||
t->next = __get_cpu_var(tasklet_vec).list;
|
||||
__get_cpu_var(tasklet_vec).list = t;
|
||||
raise_softirq_irqoff(TASKLET_SOFTIRQ);
|
||||
local_irq_restore(flags);
|
||||
}
|
||||
|
||||
EXPORT_SYMBOL(__tasklet_schedule);
|
||||
|
||||
void fastcall __tasklet_hi_schedule(struct tasklet_struct *t)
|
||||
{
|
||||
unsigned long flags;
|
||||
|
||||
local_irq_save(flags);
|
||||
t->next = __get_cpu_var(tasklet_hi_vec).list;
|
||||
__get_cpu_var(tasklet_hi_vec).list = t;
|
||||
raise_softirq_irqoff(HI_SOFTIRQ);
|
||||
local_irq_restore(flags);
|
||||
}
|
||||
|
||||
EXPORT_SYMBOL(__tasklet_hi_schedule);
|
||||
|
||||
static void tasklet_action(struct softirq_action *a)
|
||||
{
|
||||
struct tasklet_struct *list;
|
||||
|
||||
local_irq_disable();
|
||||
list = __get_cpu_var(tasklet_vec).list;
|
||||
__get_cpu_var(tasklet_vec).list = NULL;
|
||||
local_irq_enable();
|
||||
|
||||
while (list) {
|
||||
struct tasklet_struct *t = list;
|
||||
|
||||
list = list->next;
|
||||
|
||||
if (tasklet_trylock(t)) {
|
||||
if (!atomic_read(&t->count)) {
|
||||
if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state))
|
||||
BUG();
|
||||
t->func(t->data);
|
||||
tasklet_unlock(t);
|
||||
continue;
|
||||
}
|
||||
tasklet_unlock(t);
|
||||
}
|
||||
|
||||
local_irq_disable();
|
||||
t->next = __get_cpu_var(tasklet_vec).list;
|
||||
__get_cpu_var(tasklet_vec).list = t;
|
||||
__raise_softirq_irqoff(TASKLET_SOFTIRQ);
|
||||
local_irq_enable();
|
||||
}
|
||||
}
|
||||
|
||||
static void tasklet_hi_action(struct softirq_action *a)
|
||||
{
|
||||
struct tasklet_struct *list;
|
||||
|
||||
local_irq_disable();
|
||||
list = __get_cpu_var(tasklet_hi_vec).list;
|
||||
__get_cpu_var(tasklet_hi_vec).list = NULL;
|
||||
local_irq_enable();
|
||||
|
||||
while (list) {
|
||||
struct tasklet_struct *t = list;
|
||||
|
||||
list = list->next;
|
||||
|
||||
if (tasklet_trylock(t)) {
|
||||
if (!atomic_read(&t->count)) {
|
||||
if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state))
|
||||
BUG();
|
||||
t->func(t->data);
|
||||
tasklet_unlock(t);
|
||||
continue;
|
||||
}
|
||||
tasklet_unlock(t);
|
||||
}
|
||||
|
||||
local_irq_disable();
|
||||
t->next = __get_cpu_var(tasklet_hi_vec).list;
|
||||
__get_cpu_var(tasklet_hi_vec).list = t;
|
||||
__raise_softirq_irqoff(HI_SOFTIRQ);
|
||||
local_irq_enable();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void tasklet_init(struct tasklet_struct *t,
|
||||
void (*func)(unsigned long), unsigned long data)
|
||||
{
|
||||
t->next = NULL;
|
||||
t->state = 0;
|
||||
atomic_set(&t->count, 0);
|
||||
t->func = func;
|
||||
t->data = data;
|
||||
}
|
||||
|
||||
EXPORT_SYMBOL(tasklet_init);
|
||||
|
||||
void tasklet_kill(struct tasklet_struct *t)
|
||||
{
|
||||
if (in_interrupt())
|
||||
printk("Attempt to kill tasklet from interrupt\n");
|
||||
|
||||
while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) {
|
||||
do
|
||||
yield();
|
||||
while (test_bit(TASKLET_STATE_SCHED, &t->state));
|
||||
}
|
||||
tasklet_unlock_wait(t);
|
||||
clear_bit(TASKLET_STATE_SCHED, &t->state);
|
||||
}
|
||||
|
||||
EXPORT_SYMBOL(tasklet_kill);
|
||||
|
||||
void __init softirq_init(void)
|
||||
{
|
||||
open_softirq(TASKLET_SOFTIRQ, tasklet_action, NULL);
|
||||
open_softirq(HI_SOFTIRQ, tasklet_hi_action, NULL);
|
||||
}
|
||||
|
||||
static int ksoftirqd(void * __bind_cpu)
|
||||
{
|
||||
current->flags |= PF_NOFREEZE;
|
||||
|
||||
set_current_state(TASK_INTERRUPTIBLE);
|
||||
|
||||
while (!kthread_should_stop()) {
|
||||
preempt_disable();
|
||||
if (!local_softirq_pending()) {
|
||||
preempt_enable_no_resched();
|
||||
schedule();
|
||||
preempt_disable();
|
||||
}
|
||||
|
||||
__set_current_state(TASK_RUNNING);
|
||||
|
||||
while (local_softirq_pending()) {
|
||||
/* Preempt disable stops cpu going offline.
|
||||
If already offline, we'll be on wrong CPU:
|
||||
don't process */
|
||||
if (cpu_is_offline((long)__bind_cpu))
|
||||
goto wait_to_die;
|
||||
do_softirq();
|
||||
preempt_enable_no_resched();
|
||||
cond_resched();
|
||||
preempt_disable();
|
||||
}
|
||||
preempt_enable();
|
||||
set_current_state(TASK_INTERRUPTIBLE);
|
||||
}
|
||||
__set_current_state(TASK_RUNNING);
|
||||
return 0;
|
||||
|
||||
wait_to_die:
|
||||
preempt_enable();
|
||||
/* Wait for kthread_stop */
|
||||
set_current_state(TASK_INTERRUPTIBLE);
|
||||
while (!kthread_should_stop()) {
|
||||
schedule();
|
||||
set_current_state(TASK_INTERRUPTIBLE);
|
||||
}
|
||||
__set_current_state(TASK_RUNNING);
|
||||
return 0;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_HOTPLUG_CPU
|
||||
/*
|
||||
* tasklet_kill_immediate is called to remove a tasklet which can already be
|
||||
* scheduled for execution on @cpu.
|
||||
*
|
||||
* Unlike tasklet_kill, this function removes the tasklet
|
||||
* _immediately_, even if the tasklet is in TASKLET_STATE_SCHED state.
|
||||
*
|
||||
* When this function is called, @cpu must be in the CPU_DEAD state.
|
||||
*/
|
||||
/*
 * Unlink @t from the dead @cpu's pending-tasklet list without waiting.
 * Caller must guarantee @cpu is offline (CPU_DEAD) and the tasklet is
 * not currently running.
 */
void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu)
{
	struct tasklet_struct **i;

	BUG_ON(cpu_online(cpu));
	BUG_ON(test_bit(TASKLET_STATE_RUN, &t->state));

	/* Not scheduled: nothing to unlink. */
	if (!test_bit(TASKLET_STATE_SCHED, &t->state))
		return;

	/* CPU is dead, so no lock needed. */
	for (i = &per_cpu(tasklet_vec, cpu).list; *i; i = &(*i)->next) {
		if (*i == t) {
			*i = t->next;	/* unlink from singly-linked list */
			return;
		}
	}
	/* SCHED bit set but tasklet not on the list: state is corrupted. */
	BUG();
}
|
||||
|
||||
/*
 * Splice a dead CPU's pending tasklet lists (normal and high-prio)
 * onto the end of this CPU's lists and raise the softirqs so they
 * get run here.  Called from the CPU_DEAD notifier.
 */
static void takeover_tasklets(unsigned int cpu)
{
	struct tasklet_struct **i;

	/* CPU is dead, so no lock needed. */
	local_irq_disable();

	/* Find end, append list for that CPU. */
	for (i = &__get_cpu_var(tasklet_vec).list; *i; i = &(*i)->next);
	*i = per_cpu(tasklet_vec, cpu).list;
	per_cpu(tasklet_vec, cpu).list = NULL;
	raise_softirq_irqoff(TASKLET_SOFTIRQ);

	/* Same for the high-priority list. */
	for (i = &__get_cpu_var(tasklet_hi_vec).list; *i; i = &(*i)->next);
	*i = per_cpu(tasklet_hi_vec, cpu).list;
	per_cpu(tasklet_hi_vec, cpu).list = NULL;
	raise_softirq_irqoff(HI_SOFTIRQ);

	local_irq_enable();
}
|
||||
#endif /* CONFIG_HOTPLUG_CPU */
|
||||
|
||||
/*
 * CPU hotplug notifier: create, wake, rebind and reap the per-CPU
 * ksoftirqd thread as CPUs come and go.
 */
static int __cpuinit cpu_callback(struct notifier_block *nfb,
				  unsigned long action,
				  void *hcpu)
{
	int hotcpu = (unsigned long)hcpu;
	struct task_struct *p;

	switch (action) {
	case CPU_UP_PREPARE:
		/* Create the thread, bound to the incoming CPU, but do not
		   run it until CPU_ONLINE. */
		p = kthread_create(ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu);
		if (IS_ERR(p)) {
			printk("ksoftirqd for %i failed\n", hotcpu);
			return NOTIFY_BAD;
		}
		kthread_bind(p, hotcpu);
		per_cpu(ksoftirqd, hotcpu) = p;
		break;
	case CPU_ONLINE:
		wake_up_process(per_cpu(ksoftirqd, hotcpu));
		break;
#ifdef CONFIG_HOTPLUG_CPU
	case CPU_UP_CANCELED:
		if (!per_cpu(ksoftirqd, hotcpu))
			break;
		/* Unbind so it can run.  Fall thru. */
		kthread_bind(per_cpu(ksoftirqd, hotcpu),
			     any_online_cpu(cpu_online_map));
		/* fallthrough: shared teardown with CPU_DEAD */
	case CPU_DEAD:
		p = per_cpu(ksoftirqd, hotcpu);
		per_cpu(ksoftirqd, hotcpu) = NULL;
		kthread_stop(p);
		takeover_tasklets(hotcpu);
		break;
#endif /* CONFIG_HOTPLUG_CPU */
	}
	return NOTIFY_OK;
}
|
||||
|
||||
static struct notifier_block __cpuinitdata cpu_nfb = {
|
||||
.notifier_call = cpu_callback
|
||||
};
|
||||
|
||||
__init int spawn_ksoftirqd(void)
|
||||
{
|
||||
void *cpu = (void *)(long)smp_processor_id();
|
||||
int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
|
||||
|
||||
BUG_ON(err == NOTIFY_BAD);
|
||||
cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
|
||||
register_cpu_notifier(&cpu_nfb);
|
||||
return 0;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
/*
|
||||
* Call a function on all processors
|
||||
*/
|
||||
/*
 * Run @func on every online CPU: remotely via smp_call_function(),
 * and directly (with interrupts off, matching IPI context) on this CPU.
 * Returns the result of smp_call_function().
 */
int on_each_cpu(void (*func) (void *info), void *info, int retry, int wait)
{
	int err;

	preempt_disable();
	err = smp_call_function(func, info, retry, wait);
	local_irq_disable();
	func(info);
	local_irq_enable();
	preempt_enable();
	return err;
}
|
||||
EXPORT_SYMBOL(on_each_cpu);
|
||||
#endif
|
||||
174
kernel/softlockup.c
Normal file
174
kernel/softlockup.c
Normal file
@@ -0,0 +1,174 @@
|
||||
/*
|
||||
* Detect Soft Lockups
|
||||
*
|
||||
* started by Ingo Molnar, Copyright (C) 2005, 2006 Red Hat, Inc.
|
||||
*
|
||||
* this code detects soft lockups: incidents in where on a CPU
|
||||
* the kernel does not reschedule for 10 seconds or more.
|
||||
*/
|
||||
#include <linux/mm.h>
|
||||
#include <linux/cpu.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/delay.h>
|
||||
#include <linux/kthread.h>
|
||||
#include <linux/notifier.h>
|
||||
#include <linux/module.h>
|
||||
|
||||
static DEFINE_SPINLOCK(print_lock);
|
||||
|
||||
static DEFINE_PER_CPU(unsigned long, touch_timestamp);
|
||||
static DEFINE_PER_CPU(unsigned long, print_timestamp);
|
||||
static DEFINE_PER_CPU(struct task_struct *, watchdog_task);
|
||||
|
||||
static int did_panic = 0;
|
||||
|
||||
static int
|
||||
softlock_panic(struct notifier_block *this, unsigned long event, void *ptr)
|
||||
{
|
||||
did_panic = 1;
|
||||
|
||||
return NOTIFY_DONE;
|
||||
}
|
||||
|
||||
static struct notifier_block panic_block = {
|
||||
.notifier_call = softlock_panic,
|
||||
};
|
||||
|
||||
/*
 * Record "this CPU made progress just now".  Read back by
 * softlockup_tick() to decide whether to warn.
 */
void touch_softlockup_watchdog(void)
{
	__raw_get_cpu_var(touch_timestamp) = jiffies;
}
|
||||
EXPORT_SYMBOL(touch_softlockup_watchdog);
|
||||
|
||||
void touch_all_softlockup_watchdogs(void)
|
||||
{
|
||||
int cpu;
|
||||
|
||||
/* Cause each CPU to re-update its timestamp rather than complain */
|
||||
for_each_online_cpu(cpu)
|
||||
per_cpu(touch_timestamp, cpu) = 0;
|
||||
}
|
||||
EXPORT_SYMBOL(touch_all_softlockup_watchdogs);
|
||||
|
||||
/*
|
||||
* This callback runs from the timer interrupt, and checks
|
||||
* whether the watchdog thread has hung or not:
|
||||
*/
|
||||
/*
 * Timer-interrupt hook: check whether this CPU's watchdog thread has
 * run recently.  Wakes the watchdog every second and reports a soft
 * lockup once the timestamp is more than 10 seconds stale.
 */
void softlockup_tick(void)
{
	int this_cpu = smp_processor_id();
	unsigned long touch_timestamp = per_cpu(touch_timestamp, this_cpu);

	/* Zero means "forced re-arm" (see touch_all_softlockup_watchdogs). */
	if (touch_timestamp == 0) {
		touch_softlockup_watchdog();
		return;
	}

	/* prevent double reports: */
	if (per_cpu(print_timestamp, this_cpu) == touch_timestamp ||
		did_panic ||
			!per_cpu(watchdog_task, this_cpu))
		return;

	/* do not print during early bootup: */
	if (unlikely(system_state != SYSTEM_RUNNING)) {
		touch_softlockup_watchdog();
		return;
	}

	/* Wake up the high-prio watchdog task every second: */
	if (time_after(jiffies, touch_timestamp + HZ))
		wake_up_process(per_cpu(watchdog_task, this_cpu));

	/* Warn about unreasonable 10+ seconds delays: */
	if (time_after(jiffies, touch_timestamp + 10*HZ)) {
		/* Record the timestamp so this incident is reported once. */
		per_cpu(print_timestamp, this_cpu) = touch_timestamp;

		spin_lock(&print_lock);
		printk(KERN_ERR "BUG: soft lockup detected on CPU#%d!\n",
			this_cpu);
		dump_stack();
		spin_unlock(&print_lock);
	}
}
|
||||
|
||||
/*
|
||||
* The watchdog thread - runs every second and touches the timestamp.
|
||||
*/
|
||||
/*
 * The watchdog thread - runs every second and touches the timestamp.
 * SCHED_FIFO prio 99 so that only a CPU that truly stops scheduling
 * (not mere load) can keep it from running.
 */
static int watchdog(void * __bind_cpu)
{
	struct sched_param param = { .sched_priority = 99 };

	sched_setscheduler(current, SCHED_FIFO, &param);
	current->flags |= PF_NOFREEZE;	/* must keep running during suspend */

	/*
	 * Run briefly once per second to reset the softlockup timestamp.
	 * If this gets delayed for more than 10 seconds then the
	 * debug-printout triggers in softlockup_tick().
	 */
	while (!kthread_should_stop()) {
		set_current_state(TASK_INTERRUPTIBLE);
		touch_softlockup_watchdog();
		schedule();	/* woken once a second by softlockup_tick() */
	}

	return 0;
}
|
||||
|
||||
/*
|
||||
* Create/destroy watchdog threads as CPUs come and go:
|
||||
*/
|
||||
/*
 * CPU hotplug notifier: create, wake, rebind and reap the per-CPU
 * softlockup watchdog thread as CPUs come and go.
 */
static int __cpuinit
cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
	int hotcpu = (unsigned long)hcpu;
	struct task_struct *p;

	switch (action) {
	case CPU_UP_PREPARE:
		BUG_ON(per_cpu(watchdog_task, hotcpu));
		p = kthread_create(watchdog, hcpu, "watchdog/%d", hotcpu);
		if (IS_ERR(p)) {
			printk("watchdog for %i failed\n", hotcpu);
			return NOTIFY_BAD;
		}
		/* Arm the timestamp before the thread exists publicly. */
		per_cpu(touch_timestamp, hotcpu) = jiffies;
		per_cpu(watchdog_task, hotcpu) = p;
		kthread_bind(p, hotcpu);
		break;
	case CPU_ONLINE:
		wake_up_process(per_cpu(watchdog_task, hotcpu));
		break;
#ifdef CONFIG_HOTPLUG_CPU
	case CPU_UP_CANCELED:
		if (!per_cpu(watchdog_task, hotcpu))
			break;
		/* Unbind so it can run.  Fall thru. */
		kthread_bind(per_cpu(watchdog_task, hotcpu),
			     any_online_cpu(cpu_online_map));
		/* fallthrough: shared teardown with CPU_DEAD */
	case CPU_DEAD:
		p = per_cpu(watchdog_task, hotcpu);
		per_cpu(watchdog_task, hotcpu) = NULL;
		kthread_stop(p);
		break;
#endif /* CONFIG_HOTPLUG_CPU */
	}
	return NOTIFY_OK;
}
|
||||
|
||||
static struct notifier_block __cpuinitdata cpu_nfb = {
|
||||
.notifier_call = cpu_callback
|
||||
};
|
||||
|
||||
__init void spawn_softlockup_task(void)
|
||||
{
|
||||
void *cpu = (void *)(long)smp_processor_id();
|
||||
int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
|
||||
|
||||
BUG_ON(err == NOTIFY_BAD);
|
||||
cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
|
||||
register_cpu_notifier(&cpu_nfb);
|
||||
|
||||
atomic_notifier_chain_register(&panic_notifier_list, &panic_block);
|
||||
}
|
||||
448
kernel/spinlock.c
Normal file
448
kernel/spinlock.c
Normal file
@@ -0,0 +1,448 @@
|
||||
/*
|
||||
* Copyright (2004) Linus Torvalds
|
||||
*
|
||||
* Author: Zwane Mwaikambo <zwane@fsmlabs.com>
|
||||
*
|
||||
* Copyright (2004, 2005) Ingo Molnar
|
||||
*
|
||||
* This file contains the spinlock/rwlock implementations for the
|
||||
* SMP and the DEBUG_SPINLOCK cases. (UP-nondebug inlines them)
|
||||
*
|
||||
* Note that some architectures have special knowledge about the
|
||||
* stack frames of these functions in their profile_pc. If you
|
||||
* change anything significant here that could change the stack
|
||||
* frame contact the architecture maintainers.
|
||||
*/
|
||||
|
||||
#include <linux/linkage.h>
|
||||
#include <linux/preempt.h>
|
||||
#include <linux/spinlock.h>
|
||||
#include <linux/interrupt.h>
|
||||
#include <linux/debug_locks.h>
|
||||
#include <linux/module.h>
|
||||
|
||||
/*
 * Try to take @lock without spinning.  Returns 1 with preemption
 * disabled on success, 0 with preemption restored on failure.
 */
int __lockfunc _spin_trylock(spinlock_t *lock)
{
	preempt_disable();
	if (!_raw_spin_trylock(lock)) {
		preempt_enable();
		return 0;
	}
	spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
	return 1;
}
|
||||
EXPORT_SYMBOL(_spin_trylock);
|
||||
|
||||
/*
 * Try to take @lock for reading without spinning.  Returns 1 with
 * preemption disabled on success, 0 with preemption restored otherwise.
 */
int __lockfunc _read_trylock(rwlock_t *lock)
{
	preempt_disable();
	if (!_raw_read_trylock(lock)) {
		preempt_enable();
		return 0;
	}
	rwlock_acquire_read(&lock->dep_map, 0, 1, _RET_IP_);
	return 1;
}
|
||||
EXPORT_SYMBOL(_read_trylock);
|
||||
|
||||
/*
 * Try to take @lock for writing without spinning.  Returns 1 with
 * preemption disabled on success, 0 with preemption restored otherwise.
 */
int __lockfunc _write_trylock(rwlock_t *lock)
{
	preempt_disable();
	if (!_raw_write_trylock(lock)) {
		preempt_enable();
		return 0;
	}
	rwlock_acquire(&lock->dep_map, 0, 1, _RET_IP_);
	return 1;
}
|
||||
EXPORT_SYMBOL(_write_trylock);
|
||||
|
||||
/*
|
||||
* If lockdep is enabled then we use the non-preemption spin-ops
|
||||
* even on CONFIG_PREEMPT, because lockdep assumes that interrupts are
|
||||
* not re-enabled during lock-acquire (which the preempt-spin-ops do):
|
||||
*/
|
||||
#if !defined(CONFIG_PREEMPT) || !defined(CONFIG_SMP) || \
|
||||
defined(CONFIG_DEBUG_LOCK_ALLOC)
|
||||
|
||||
void __lockfunc _read_lock(rwlock_t *lock)
|
||||
{
|
||||
preempt_disable();
|
||||
rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
|
||||
_raw_read_lock(lock);
|
||||
}
|
||||
EXPORT_SYMBOL(_read_lock);
|
||||
|
||||
unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock)
|
||||
{
|
||||
unsigned long flags;
|
||||
|
||||
local_irq_save(flags);
|
||||
preempt_disable();
|
||||
spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
|
||||
/*
|
||||
* On lockdep we dont want the hand-coded irq-enable of
|
||||
* _raw_spin_lock_flags() code, because lockdep assumes
|
||||
* that interrupts are not re-enabled during lock-acquire:
|
||||
*/
|
||||
#ifdef CONFIG_PROVE_LOCKING
|
||||
_raw_spin_lock(lock);
|
||||
#else
|
||||
_raw_spin_lock_flags(lock, &flags);
|
||||
#endif
|
||||
return flags;
|
||||
}
|
||||
EXPORT_SYMBOL(_spin_lock_irqsave);
|
||||
|
||||
void __lockfunc _spin_lock_irq(spinlock_t *lock)
|
||||
{
|
||||
local_irq_disable();
|
||||
preempt_disable();
|
||||
spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
|
||||
_raw_spin_lock(lock);
|
||||
}
|
||||
EXPORT_SYMBOL(_spin_lock_irq);
|
||||
|
||||
void __lockfunc _spin_lock_bh(spinlock_t *lock)
|
||||
{
|
||||
local_bh_disable();
|
||||
preempt_disable();
|
||||
spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
|
||||
_raw_spin_lock(lock);
|
||||
}
|
||||
EXPORT_SYMBOL(_spin_lock_bh);
|
||||
|
||||
unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock)
|
||||
{
|
||||
unsigned long flags;
|
||||
|
||||
local_irq_save(flags);
|
||||
preempt_disable();
|
||||
rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
|
||||
_raw_read_lock(lock);
|
||||
return flags;
|
||||
}
|
||||
EXPORT_SYMBOL(_read_lock_irqsave);
|
||||
|
||||
void __lockfunc _read_lock_irq(rwlock_t *lock)
|
||||
{
|
||||
local_irq_disable();
|
||||
preempt_disable();
|
||||
rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
|
||||
_raw_read_lock(lock);
|
||||
}
|
||||
EXPORT_SYMBOL(_read_lock_irq);
|
||||
|
||||
void __lockfunc _read_lock_bh(rwlock_t *lock)
|
||||
{
|
||||
local_bh_disable();
|
||||
preempt_disable();
|
||||
rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
|
||||
_raw_read_lock(lock);
|
||||
}
|
||||
EXPORT_SYMBOL(_read_lock_bh);
|
||||
|
||||
unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock)
|
||||
{
|
||||
unsigned long flags;
|
||||
|
||||
local_irq_save(flags);
|
||||
preempt_disable();
|
||||
rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
|
||||
_raw_write_lock(lock);
|
||||
return flags;
|
||||
}
|
||||
EXPORT_SYMBOL(_write_lock_irqsave);
|
||||
|
||||
void __lockfunc _write_lock_irq(rwlock_t *lock)
|
||||
{
|
||||
local_irq_disable();
|
||||
preempt_disable();
|
||||
rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
|
||||
_raw_write_lock(lock);
|
||||
}
|
||||
EXPORT_SYMBOL(_write_lock_irq);
|
||||
|
||||
void __lockfunc _write_lock_bh(rwlock_t *lock)
|
||||
{
|
||||
local_bh_disable();
|
||||
preempt_disable();
|
||||
rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
|
||||
_raw_write_lock(lock);
|
||||
}
|
||||
EXPORT_SYMBOL(_write_lock_bh);
|
||||
|
||||
void __lockfunc _spin_lock(spinlock_t *lock)
|
||||
{
|
||||
preempt_disable();
|
||||
spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
|
||||
_raw_spin_lock(lock);
|
||||
}
|
||||
|
||||
EXPORT_SYMBOL(_spin_lock);
|
||||
|
||||
void __lockfunc _write_lock(rwlock_t *lock)
|
||||
{
|
||||
preempt_disable();
|
||||
rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
|
||||
_raw_write_lock(lock);
|
||||
}
|
||||
|
||||
EXPORT_SYMBOL(_write_lock);
|
||||
|
||||
#else /* CONFIG_PREEMPT: */
|
||||
|
||||
/*
|
||||
* This could be a long-held lock. We both prepare to spin for a long
|
||||
* time (making _this_ CPU preemptable if possible), and we also signal
|
||||
* towards that other CPU that it should break the lock ASAP.
|
||||
*
|
||||
* (We do this in a function because inlining it would be excessive.)
|
||||
*/
|
||||
|
||||
#define BUILD_LOCK_OPS(op, locktype) \
|
||||
void __lockfunc _##op##_lock(locktype##_t *lock) \
|
||||
{ \
|
||||
for (;;) { \
|
||||
preempt_disable(); \
|
||||
if (likely(_raw_##op##_trylock(lock))) \
|
||||
break; \
|
||||
preempt_enable(); \
|
||||
\
|
||||
if (!(lock)->break_lock) \
|
||||
(lock)->break_lock = 1; \
|
||||
while (!op##_can_lock(lock) && (lock)->break_lock) \
|
||||
_raw_##op##_relax(&lock->raw_lock); \
|
||||
} \
|
||||
(lock)->break_lock = 0; \
|
||||
} \
|
||||
\
|
||||
EXPORT_SYMBOL(_##op##_lock); \
|
||||
\
|
||||
unsigned long __lockfunc _##op##_lock_irqsave(locktype##_t *lock) \
|
||||
{ \
|
||||
unsigned long flags; \
|
||||
\
|
||||
for (;;) { \
|
||||
preempt_disable(); \
|
||||
local_irq_save(flags); \
|
||||
if (likely(_raw_##op##_trylock(lock))) \
|
||||
break; \
|
||||
local_irq_restore(flags); \
|
||||
preempt_enable(); \
|
||||
\
|
||||
if (!(lock)->break_lock) \
|
||||
(lock)->break_lock = 1; \
|
||||
while (!op##_can_lock(lock) && (lock)->break_lock) \
|
||||
_raw_##op##_relax(&lock->raw_lock); \
|
||||
} \
|
||||
(lock)->break_lock = 0; \
|
||||
return flags; \
|
||||
} \
|
||||
\
|
||||
EXPORT_SYMBOL(_##op##_lock_irqsave); \
|
||||
\
|
||||
void __lockfunc _##op##_lock_irq(locktype##_t *lock) \
|
||||
{ \
|
||||
_##op##_lock_irqsave(lock); \
|
||||
} \
|
||||
\
|
||||
EXPORT_SYMBOL(_##op##_lock_irq); \
|
||||
\
|
||||
void __lockfunc _##op##_lock_bh(locktype##_t *lock) \
|
||||
{ \
|
||||
unsigned long flags; \
|
||||
\
|
||||
/* */ \
|
||||
/* Careful: we must exclude softirqs too, hence the */ \
|
||||
/* irq-disabling. We use the generic preemption-aware */ \
|
||||
/* function: */ \
|
||||
/**/ \
|
||||
flags = _##op##_lock_irqsave(lock); \
|
||||
local_bh_disable(); \
|
||||
local_irq_restore(flags); \
|
||||
} \
|
||||
\
|
||||
EXPORT_SYMBOL(_##op##_lock_bh)
|
||||
|
||||
/*
|
||||
* Build preemption-friendly versions of the following
|
||||
* lock-spinning functions:
|
||||
*
|
||||
* _[spin|read|write]_lock()
|
||||
* _[spin|read|write]_lock_irq()
|
||||
* _[spin|read|write]_lock_irqsave()
|
||||
* _[spin|read|write]_lock_bh()
|
||||
*/
|
||||
BUILD_LOCK_OPS(spin, spinlock);
|
||||
BUILD_LOCK_OPS(read, rwlock);
|
||||
BUILD_LOCK_OPS(write, rwlock);
|
||||
|
||||
#endif /* CONFIG_PREEMPT */
|
||||
|
||||
#ifdef CONFIG_DEBUG_LOCK_ALLOC
|
||||
|
||||
void __lockfunc _spin_lock_nested(spinlock_t *lock, int subclass)
|
||||
{
|
||||
preempt_disable();
|
||||
spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
|
||||
_raw_spin_lock(lock);
|
||||
}
|
||||
|
||||
EXPORT_SYMBOL(_spin_lock_nested);
|
||||
/*
 * Like _spin_lock_irqsave() but annotates the acquisition with a lockdep
 * @subclass, allowing legitimate nested locking of same-class locks.
 * Returns the saved interrupt flags for _spin_unlock_irqrestore().
 */
unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, int subclass)
{
	unsigned long flags;

	local_irq_save(flags);
	preempt_disable();
	spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
	/*
	 * On lockdep we dont want the hand-coded irq-enable of
	 * _raw_spin_lock_flags() code, because lockdep assumes
	 * that interrupts are not re-enabled during lock-acquire:
	 *
	 * BUGFIX: this previously tested CONFIG_PROVE_SPIN_LOCKING, a
	 * config symbol that does not exist, so the lockdep-safe branch
	 * was never compiled in.  The correct symbol is
	 * CONFIG_PROVE_LOCKING (cf. _spin_lock_irqsave() above).
	 */
#ifdef CONFIG_PROVE_LOCKING
	_raw_spin_lock(lock);
#else
	_raw_spin_lock_flags(lock, &flags);
#endif
	return flags;
}
|
||||
|
||||
EXPORT_SYMBOL(_spin_lock_irqsave_nested);
|
||||
|
||||
#endif
|
||||
|
||||
void __lockfunc _spin_unlock(spinlock_t *lock)
|
||||
{
|
||||
spin_release(&lock->dep_map, 1, _RET_IP_);
|
||||
_raw_spin_unlock(lock);
|
||||
preempt_enable();
|
||||
}
|
||||
EXPORT_SYMBOL(_spin_unlock);
|
||||
|
||||
void __lockfunc _write_unlock(rwlock_t *lock)
|
||||
{
|
||||
rwlock_release(&lock->dep_map, 1, _RET_IP_);
|
||||
_raw_write_unlock(lock);
|
||||
preempt_enable();
|
||||
}
|
||||
EXPORT_SYMBOL(_write_unlock);
|
||||
|
||||
void __lockfunc _read_unlock(rwlock_t *lock)
|
||||
{
|
||||
rwlock_release(&lock->dep_map, 1, _RET_IP_);
|
||||
_raw_read_unlock(lock);
|
||||
preempt_enable();
|
||||
}
|
||||
EXPORT_SYMBOL(_read_unlock);
|
||||
|
||||
void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags)
|
||||
{
|
||||
spin_release(&lock->dep_map, 1, _RET_IP_);
|
||||
_raw_spin_unlock(lock);
|
||||
local_irq_restore(flags);
|
||||
preempt_enable();
|
||||
}
|
||||
EXPORT_SYMBOL(_spin_unlock_irqrestore);
|
||||
|
||||
void __lockfunc _spin_unlock_irq(spinlock_t *lock)
|
||||
{
|
||||
spin_release(&lock->dep_map, 1, _RET_IP_);
|
||||
_raw_spin_unlock(lock);
|
||||
local_irq_enable();
|
||||
preempt_enable();
|
||||
}
|
||||
EXPORT_SYMBOL(_spin_unlock_irq);
|
||||
|
||||
void __lockfunc _spin_unlock_bh(spinlock_t *lock)
|
||||
{
|
||||
spin_release(&lock->dep_map, 1, _RET_IP_);
|
||||
_raw_spin_unlock(lock);
|
||||
preempt_enable_no_resched();
|
||||
local_bh_enable_ip((unsigned long)__builtin_return_address(0));
|
||||
}
|
||||
EXPORT_SYMBOL(_spin_unlock_bh);
|
||||
|
||||
void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
|
||||
{
|
||||
rwlock_release(&lock->dep_map, 1, _RET_IP_);
|
||||
_raw_read_unlock(lock);
|
||||
local_irq_restore(flags);
|
||||
preempt_enable();
|
||||
}
|
||||
EXPORT_SYMBOL(_read_unlock_irqrestore);
|
||||
|
||||
void __lockfunc _read_unlock_irq(rwlock_t *lock)
|
||||
{
|
||||
rwlock_release(&lock->dep_map, 1, _RET_IP_);
|
||||
_raw_read_unlock(lock);
|
||||
local_irq_enable();
|
||||
preempt_enable();
|
||||
}
|
||||
EXPORT_SYMBOL(_read_unlock_irq);
|
||||
|
||||
void __lockfunc _read_unlock_bh(rwlock_t *lock)
|
||||
{
|
||||
rwlock_release(&lock->dep_map, 1, _RET_IP_);
|
||||
_raw_read_unlock(lock);
|
||||
preempt_enable_no_resched();
|
||||
local_bh_enable_ip((unsigned long)__builtin_return_address(0));
|
||||
}
|
||||
EXPORT_SYMBOL(_read_unlock_bh);
|
||||
|
||||
void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
|
||||
{
|
||||
rwlock_release(&lock->dep_map, 1, _RET_IP_);
|
||||
_raw_write_unlock(lock);
|
||||
local_irq_restore(flags);
|
||||
preempt_enable();
|
||||
}
|
||||
EXPORT_SYMBOL(_write_unlock_irqrestore);
|
||||
|
||||
void __lockfunc _write_unlock_irq(rwlock_t *lock)
|
||||
{
|
||||
rwlock_release(&lock->dep_map, 1, _RET_IP_);
|
||||
_raw_write_unlock(lock);
|
||||
local_irq_enable();
|
||||
preempt_enable();
|
||||
}
|
||||
EXPORT_SYMBOL(_write_unlock_irq);
|
||||
|
||||
void __lockfunc _write_unlock_bh(rwlock_t *lock)
|
||||
{
|
||||
rwlock_release(&lock->dep_map, 1, _RET_IP_);
|
||||
_raw_write_unlock(lock);
|
||||
preempt_enable_no_resched();
|
||||
local_bh_enable_ip((unsigned long)__builtin_return_address(0));
|
||||
}
|
||||
EXPORT_SYMBOL(_write_unlock_bh);
|
||||
|
||||
int __lockfunc _spin_trylock_bh(spinlock_t *lock)
|
||||
{
|
||||
local_bh_disable();
|
||||
preempt_disable();
|
||||
if (_raw_spin_trylock(lock)) {
|
||||
spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
|
||||
return 1;
|
||||
}
|
||||
|
||||
preempt_enable_no_resched();
|
||||
local_bh_enable_ip((unsigned long)__builtin_return_address(0));
|
||||
return 0;
|
||||
}
|
||||
EXPORT_SYMBOL(_spin_trylock_bh);
|
||||
|
||||
/*
 * Return non-zero if @addr falls inside the __lockfunc text section
 * (used by profile_pc() to skip lock-implementation frames).
 */
int in_lock_functions(unsigned long addr)
{
	/* Linker adds these: start and end of __lockfunc functions */
	extern char __lock_text_start[], __lock_text_end[];
	unsigned long start = (unsigned long)__lock_text_start;
	unsigned long end = (unsigned long)__lock_text_end;

	return addr >= start && addr < end;
}
|
||||
EXPORT_SYMBOL(in_lock_functions);
|
||||
258
kernel/srcu.c
Normal file
258
kernel/srcu.c
Normal file
@@ -0,0 +1,258 @@
|
||||
/*
|
||||
* Sleepable Read-Copy Update mechanism for mutual exclusion.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||
*
|
||||
* Copyright (C) IBM Corporation, 2006
|
||||
*
|
||||
* Author: Paul McKenney <paulmck@us.ibm.com>
|
||||
*
|
||||
* For detailed explanation of Read-Copy Update mechanism see -
|
||||
* Documentation/RCU/ *.txt
|
||||
*
|
||||
*/
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/mutex.h>
|
||||
#include <linux/percpu.h>
|
||||
#include <linux/preempt.h>
|
||||
#include <linux/rcupdate.h>
|
||||
#include <linux/sched.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/smp.h>
|
||||
#include <linux/srcu.h>
|
||||
|
||||
/**
|
||||
* init_srcu_struct - initialize a sleep-RCU structure
|
||||
* @sp: structure to initialize.
|
||||
*
|
||||
* Must invoke this on a given srcu_struct before passing that srcu_struct
|
||||
* to any other function. Each srcu_struct represents a separate domain
|
||||
* of SRCU protection.
|
||||
*/
|
||||
int init_srcu_struct(struct srcu_struct *sp)
|
||||
{
|
||||
sp->completed = 0;
|
||||
mutex_init(&sp->mutex);
|
||||
sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array);
|
||||
return (sp->per_cpu_ref ? 0 : -ENOMEM);
|
||||
}
|
||||
|
||||
/*
|
||||
* srcu_readers_active_idx -- returns approximate number of readers
|
||||
* active on the specified rank of per-CPU counters.
|
||||
*/
|
||||
|
||||
static int srcu_readers_active_idx(struct srcu_struct *sp, int idx)
|
||||
{
|
||||
int cpu;
|
||||
int sum;
|
||||
|
||||
sum = 0;
|
||||
for_each_possible_cpu(cpu)
|
||||
sum += per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx];
|
||||
return sum;
|
||||
}
|
||||
|
||||
/**
|
||||
* srcu_readers_active - returns approximate number of readers.
|
||||
* @sp: which srcu_struct to count active readers (holding srcu_read_lock).
|
||||
*
|
||||
* Note that this is not an atomic primitive, and can therefore suffer
|
||||
* severe errors when invoked on an active srcu_struct. That said, it
|
||||
* can be useful as an error check at cleanup time.
|
||||
*/
|
||||
/*
 * Approximate total of active readers across both counter ranks.
 * Not atomic; intended as a cleanup-time sanity check.
 */
int srcu_readers_active(struct srcu_struct *sp)
{
	return srcu_readers_active_idx(sp, 0) +
	       srcu_readers_active_idx(sp, 1);
}
|
||||
|
||||
/**
|
||||
* cleanup_srcu_struct - deconstruct a sleep-RCU structure
|
||||
* @sp: structure to clean up.
|
||||
*
|
||||
* Must invoke this after you are finished using a given srcu_struct that
|
||||
* was initialized via init_srcu_struct(), else you leak memory.
|
||||
*/
|
||||
void cleanup_srcu_struct(struct srcu_struct *sp)
|
||||
{
|
||||
int sum;
|
||||
|
||||
sum = srcu_readers_active(sp);
|
||||
WARN_ON(sum); /* Leakage unless caller handles error. */
|
||||
if (sum != 0)
|
||||
return;
|
||||
free_percpu(sp->per_cpu_ref);
|
||||
sp->per_cpu_ref = NULL;
|
||||
}
|
||||
|
||||
/**
|
||||
* srcu_read_lock - register a new reader for an SRCU-protected structure.
|
||||
* @sp: srcu_struct in which to register the new reader.
|
||||
*
|
||||
* Counts the new reader in the appropriate per-CPU element of the
|
||||
* srcu_struct. Must be called from process context.
|
||||
* Returns an index that must be passed to the matching srcu_read_unlock().
|
||||
*/
|
||||
/*
 * Enter an SRCU read-side critical section: pick the currently-active
 * counter rank and count ourselves in on this CPU.  Returns the rank,
 * which must be handed back to srcu_read_unlock().
 */
int srcu_read_lock(struct srcu_struct *sp)
{
	int idx;

	preempt_disable();
	idx = sp->completed & 0x1;
	barrier();  /* ensure compiler looks -once- at sp->completed. */
	per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]++;
	srcu_barrier();  /* ensure compiler won't misorder critical section. */
	preempt_enable();
	return idx;
}
|
||||
|
||||
/**
|
||||
* srcu_read_unlock - unregister a old reader from an SRCU-protected structure.
|
||||
* @sp: srcu_struct in which to unregister the old reader.
|
||||
* @idx: return value from corresponding srcu_read_lock().
|
||||
*
|
||||
* Removes the count for the old reader from the appropriate per-CPU
|
||||
* element of the srcu_struct. Note that this may well be a different
|
||||
* CPU than that which was incremented by the corresponding srcu_read_lock().
|
||||
* Must be called from process context.
|
||||
*/
|
||||
/*
 * Leave an SRCU read-side critical section by decrementing the rank
 * @idx counter on the current CPU -- which may differ from the CPU
 * that incremented it in srcu_read_lock(); only the cross-CPU sum
 * matters (see srcu_readers_active_idx()).
 */
void srcu_read_unlock(struct srcu_struct *sp, int idx)
{
	preempt_disable();
	srcu_barrier();  /* ensure compiler won't misorder critical section. */
	per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]--;
	preempt_enable();
}
|
||||
|
||||
/**
|
||||
* synchronize_srcu - wait for prior SRCU read-side critical-section completion
|
||||
* @sp: srcu_struct with which to synchronize.
|
||||
*
|
||||
* Flip the completed counter, and wait for the old count to drain to zero.
|
||||
* As with classic RCU, the updater must use some separate means of
|
||||
* synchronizing concurrent updates. Can block; must be called from
|
||||
* process context.
|
||||
*
|
||||
* Note that it is illegal to call synchornize_srcu() from the corresponding
|
||||
* SRCU read-side critical section; doing so will result in deadlock.
|
||||
* However, it is perfectly legal to call synchronize_srcu() on one
|
||||
* srcu_struct from some other srcu_struct's read-side critical section.
|
||||
*/
|
||||
/*
 * Wait for all pre-existing SRCU readers of @sp to finish: flip the
 * active counter rank, then poll the old rank until it drains.  The
 * surrounding synchronize_sched() calls provide the cross-CPU memory
 * ordering; their placement is load-bearing -- see the comments below.
 */
void synchronize_srcu(struct srcu_struct *sp)
{
	int idx;

	/* Snapshot ->completed before taking the mutex so the check
	   below can detect grace periods that elapsed while we slept. */
	idx = sp->completed;
	mutex_lock(&sp->mutex);

	/*
	 * Check to see if someone else did the work for us while we were
	 * waiting to acquire the lock.  We need -two- advances of
	 * the counter, not just one.  If there was but one, we might have
	 * shown up -after- our helper's first synchronize_sched(), thus
	 * having failed to prevent CPU-reordering races with concurrent
	 * srcu_read_unlock()s on other CPUs (see comment below).  So we
	 * either (1) wait for two or (2) supply the second ourselves.
	 */

	if ((sp->completed - idx) >= 2) {
		mutex_unlock(&sp->mutex);
		return;
	}

	synchronize_sched(); /* Force memory barrier on all CPUs. */

	/*
	 * The preceding synchronize_sched() ensures that any CPU that
	 * sees the new value of sp->completed will also see any preceding
	 * changes to data structures made by this CPU.  This prevents
	 * some other CPU from reordering the accesses in its SRCU
	 * read-side critical section to precede the corresponding
	 * srcu_read_lock() -- ensuring that such references will in
	 * fact be protected.
	 *
	 * So it is now safe to do the flip.
	 */

	idx = sp->completed & 0x1;	/* rank held by pre-flip readers */
	sp->completed++;		/* new readers use the other rank */

	synchronize_sched(); /* Force memory barrier on all CPUs. */

	/*
	 * At this point, because of the preceding synchronize_sched(),
	 * all srcu_read_lock() calls using the old counters have completed.
	 * Their corresponding critical sections might well be still
	 * executing, but the srcu_read_lock() primitives themselves
	 * will have finished executing.
	 */

	while (srcu_readers_active_idx(sp, idx))
		schedule_timeout_interruptible(1);	/* poll until drained */

	synchronize_sched(); /* Force memory barrier on all CPUs. */

	/*
	 * The preceding synchronize_sched() forces all srcu_read_unlock()
	 * primitives that were executing concurrently with the preceding
	 * for_each_possible_cpu() loop to have completed by this point.
	 * More importantly, it also forces the corresponding SRCU read-side
	 * critical sections to have also completed, and the corresponding
	 * references to SRCU-protected data items to be dropped.
	 *
	 * Note:
	 *
	 *	Despite what you might think at first glance, the
	 *	preceding synchronize_sched() -must- be within the
	 *	critical section ended by the following mutex_unlock().
	 *	Otherwise, a task taking the early exit can race
	 *	with a srcu_read_unlock(), which might have executed
	 *	just before the preceding srcu_readers_active() check,
	 *	and whose CPU might have reordered the srcu_read_unlock()
	 *	with the preceding critical section.  In this case, there
	 *	is nothing preventing the synchronize_sched() task that is
	 *	taking the early exit from freeing a data structure that
	 *	is still being referenced (out of order) by the task
	 *	doing the srcu_read_unlock().
	 *
	 *	Alternatively, the comparison with "2" on the early exit
	 *	could be changed to "3", but this increases synchronize_srcu()
	 *	latency for bulk loads.  So the current code is preferred.
	 */

	mutex_unlock(&sp->mutex);
}
|
||||
|
||||
/**
|
||||
* srcu_batches_completed - return batches completed.
|
||||
* @sp: srcu_struct on which to report batch completion.
|
||||
*
|
||||
* Report the number of batches, correlated with, but not necessarily
|
||||
* precisely the same as, the number of grace periods that have elapsed.
|
||||
*/
|
||||
|
||||
long srcu_batches_completed(struct srcu_struct *sp)
|
||||
{
|
||||
return sp->completed;
|
||||
}
|
||||
|
||||
EXPORT_SYMBOL_GPL(init_srcu_struct);
|
||||
EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
|
||||
EXPORT_SYMBOL_GPL(srcu_read_lock);
|
||||
EXPORT_SYMBOL_GPL(srcu_read_unlock);
|
||||
EXPORT_SYMBOL_GPL(synchronize_srcu);
|
||||
EXPORT_SYMBOL_GPL(srcu_batches_completed);
|
||||
EXPORT_SYMBOL_GPL(srcu_readers_active);
|
||||
24
kernel/stacktrace.c
Normal file
24
kernel/stacktrace.c
Normal file
@@ -0,0 +1,24 @@
|
||||
/*
|
||||
* kernel/stacktrace.c
|
||||
*
|
||||
* Stack trace management functions
|
||||
*
|
||||
* Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
|
||||
*/
|
||||
#include <linux/sched.h>
|
||||
#include <linux/kallsyms.h>
|
||||
#include <linux/stacktrace.h>
|
||||
|
||||
void print_stack_trace(struct stack_trace *trace, int spaces)
|
||||
{
|
||||
int i, j;
|
||||
|
||||
for (i = 0; i < trace->nr_entries; i++) {
|
||||
unsigned long ip = trace->entries[i];
|
||||
|
||||
for (j = 0; j < spaces + 1; j++)
|
||||
printk(" ");
|
||||
print_ip_sym(ip);
|
||||
}
|
||||
}
|
||||
|
||||
210
kernel/stop_machine.c
Normal file
210
kernel/stop_machine.c
Normal file
@@ -0,0 +1,210 @@
|
||||
/* Copyright 2005 Rusty Russell rusty@rustcorp.com.au IBM Corporation.
|
||||
* GPL v2 and any later version.
|
||||
*/
|
||||
#include <linux/stop_machine.h>
|
||||
#include <linux/kthread.h>
|
||||
#include <linux/sched.h>
|
||||
#include <linux/cpu.h>
|
||||
#include <linux/err.h>
|
||||
#include <linux/syscalls.h>
|
||||
#include <asm/atomic.h>
|
||||
#include <asm/semaphore.h>
|
||||
#include <asm/uaccess.h>
|
||||
|
||||
/* Since we effect priority and affinity (both of which are visible
 * to, and settable by outside processes) we do indirection via a
 * kthread. */

/* Thread to stop each CPU in user context. */
enum stopmachine_state {
	STOPMACHINE_WAIT,		/* threads created; yield until told otherwise */
	STOPMACHINE_PREPARE,		/* each thread disables preemption to hold its CPU */
	STOPMACHINE_DISABLE_IRQ,	/* each thread disables local interrupts */
	STOPMACHINE_EXIT,		/* release all threads */
};

/* Current phase of the protocol; written only by the initiating task. */
static enum stopmachine_state stopmachine_state;
/* Number of stopmachine() threads successfully spawned. */
static unsigned int stopmachine_num_threads;
/* Count of threads that have acknowledged the current phase. */
static atomic_t stopmachine_thread_ack;
/* Serializes concurrent __stop_machine_run() callers. */
static DECLARE_MUTEX(stopmachine_mutex);
|
||||
|
||||
/*
 * Per-CPU stop thread.  @cpu is the CPU number (cast through long to
 * void *) this thread binds itself to.  It walks the stopmachine_state
 * machine, acknowledging each phase via stopmachine_thread_ack, and
 * restores irq/preempt state before returning 0.
 */
static int stopmachine(void *cpu)
{
	int irqs_disabled = 0;	/* set once we enter the DISABLE_IRQ phase */
	int prepared = 0;	/* set once we enter the PREPARE phase */

	set_cpus_allowed(current, cpumask_of_cpu((int)(long)cpu));

	/* Ack: we are alive */
	smp_mb(); /* Theoretically the ack = 0 might not be on this CPU yet. */
	atomic_inc(&stopmachine_thread_ack);

	/* Simple state machine */
	while (stopmachine_state != STOPMACHINE_EXIT) {
		if (stopmachine_state == STOPMACHINE_DISABLE_IRQ
		    && !irqs_disabled) {
			local_irq_disable();
			irqs_disabled = 1;
			/* Ack: irqs disabled. */
			smp_mb(); /* Must read state first. */
			atomic_inc(&stopmachine_thread_ack);
		} else if (stopmachine_state == STOPMACHINE_PREPARE
			   && !prepared) {
			/* Everyone is in place, hold CPU. */
			preempt_disable();
			prepared = 1;
			smp_mb(); /* Must read state first. */
			atomic_inc(&stopmachine_thread_ack);
		}
		/* Yield in first stage: migration threads need to
		 * help our sisters onto their CPUs. */
		if (!prepared && !irqs_disabled)
			yield();
		else
			cpu_relax();
	}

	/* Ack: we are exiting. */
	smp_mb(); /* Must read state first. */
	atomic_inc(&stopmachine_thread_ack);

	if (irqs_disabled)
		local_irq_enable();
	if (prepared)
		preempt_enable();

	return 0;
}
|
||||
|
||||
/* Change the thread state */
/*
 * Publish @state to every stop thread and spin until all
 * stopmachine_num_threads of them have acknowledged it.  The ack
 * counter must be zeroed before the new state becomes visible, hence
 * the smp_wmb() between the two stores.
 */
static void stopmachine_set_state(enum stopmachine_state state)
{
	atomic_set(&stopmachine_thread_ack, 0);
	smp_wmb();	/* order ack reset before the state change */
	stopmachine_state = state;
	while (atomic_read(&stopmachine_thread_ack) != stopmachine_num_threads)
		cpu_relax();
}
|
||||
|
||||
static int stop_machine(void)
|
||||
{
|
||||
int i, ret = 0;
|
||||
struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
|
||||
|
||||
/* One high-prio thread per cpu. We'll do this one. */
|
||||
sched_setscheduler(current, SCHED_FIFO, ¶m);
|
||||
|
||||
atomic_set(&stopmachine_thread_ack, 0);
|
||||
stopmachine_num_threads = 0;
|
||||
stopmachine_state = STOPMACHINE_WAIT;
|
||||
|
||||
for_each_online_cpu(i) {
|
||||
if (i == raw_smp_processor_id())
|
||||
continue;
|
||||
ret = kernel_thread(stopmachine, (void *)(long)i,CLONE_KERNEL);
|
||||
if (ret < 0)
|
||||
break;
|
||||
stopmachine_num_threads++;
|
||||
}
|
||||
|
||||
/* Wait for them all to come to life. */
|
||||
while (atomic_read(&stopmachine_thread_ack) != stopmachine_num_threads)
|
||||
yield();
|
||||
|
||||
/* If some failed, kill them all. */
|
||||
if (ret < 0) {
|
||||
stopmachine_set_state(STOPMACHINE_EXIT);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* Now they are all started, make them hold the CPUs, ready. */
|
||||
preempt_disable();
|
||||
stopmachine_set_state(STOPMACHINE_PREPARE);
|
||||
|
||||
/* Make them disable irqs. */
|
||||
local_irq_disable();
|
||||
stopmachine_set_state(STOPMACHINE_DISABLE_IRQ);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Undo stop_machine(): release every stop thread, then re-enable local
 * interrupts and preemption (both disabled at the end of stop_machine()).
 */
static void restart_machine(void)
{
	stopmachine_set_state(STOPMACHINE_EXIT);
	local_irq_enable();
	preempt_enable_no_resched();	/* re-enable preemption without forcing a resched */
}
|
||||
|
||||
/* Arguments handed from __stop_machine_run() to the do_stop() kthread. */
struct stop_machine_data
{
	int (*fn)(void *);		/* function to run while the machine is stopped */
	void *data;			/* opaque argument passed to @fn */
	struct completion done;		/* signalled once do_stop() has finished its work */
};
|
||||
|
||||
/*
 * Body of the "kstopmachine" kthread: stop the machine, run the
 * caller's function, restart the machine, then park until
 * kthread_stop().  The return value (fn's result, or stop_machine()'s
 * error) is collected via kthread_stop() in stop_machine_run().
 */
static int do_stop(void *_smdata)
{
	struct stop_machine_data *smdata = _smdata;
	int ret;

	ret = stop_machine();
	if (ret == 0) {
		ret = smdata->fn(smdata->data);
		restart_machine();
	}

	/* We're done: you can kthread_stop us now */
	complete(&smdata->done);

	/* Wait for kthread_stop */
	set_current_state(TASK_INTERRUPTIBLE);
	while (!kthread_should_stop()) {
		schedule();
		set_current_state(TASK_INTERRUPTIBLE);
	}
	__set_current_state(TASK_RUNNING);
	return ret;
}
|
||||
|
||||
/*
 * Create the "kstopmachine" kthread bound to @cpu (or the current CPU
 * when @cpu == NR_CPUS), have it run @fn(@data) with the machine
 * stopped, and wait for it to finish.  Returns the (still parked) task
 * — the caller must kthread_stop() it to collect fn's result — or the
 * ERR_PTR from kthread_create().  Serialized by stopmachine_mutex.
 */
struct task_struct *__stop_machine_run(int (*fn)(void *), void *data,
				       unsigned int cpu)
{
	struct stop_machine_data smdata;
	struct task_struct *p;

	smdata.fn = fn;
	smdata.data = data;
	init_completion(&smdata.done);

	down(&stopmachine_mutex);

	/* If they don't care which CPU fn runs on, bind to any online one. */
	if (cpu == NR_CPUS)
		cpu = raw_smp_processor_id();

	p = kthread_create(do_stop, &smdata, "kstopmachine");
	if (!IS_ERR(p)) {
		kthread_bind(p, cpu);
		wake_up_process(p);
		/* smdata lives on our stack: wait before it goes out of scope */
		wait_for_completion(&smdata.done);
	}
	up(&stopmachine_mutex);
	return p;
}
|
||||
|
||||
/*
 * Run @fn(@data) on @cpu while every other CPU spins with interrupts
 * disabled.  CPU hotplug is blocked for the duration.  Returns @fn's
 * result, or a negative errno if the stop thread could not be created.
 */
int stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu)
{
	struct task_struct *p;
	int ret;

	/* No CPUs can come up or down during this. */
	lock_cpu_hotplug();
	p = __stop_machine_run(fn, data, cpu);
	ret = IS_ERR(p) ? PTR_ERR(p) : kthread_stop(p);
	unlock_cpu_hotplug();

	return ret;
}
|
||||
2210
kernel/sys.c
Normal file
2210
kernel/sys.c
Normal file
File diff suppressed because it is too large
Load Diff
143
kernel/sys_ni.c
Normal file
143
kernel/sys_ni.c
Normal file
@@ -0,0 +1,143 @@
|
||||
|
||||
#include <linux/linkage.h>
|
||||
#include <linux/errno.h>
|
||||
|
||||
#include <asm/unistd.h>
|
||||
|
||||
/*
 * Non-implemented system calls get redirected here.
 */
asmlinkage long sys_ni_syscall(void)
{
	return -ENOSYS;	/* "Function not implemented" */
}
|
||||
|
||||
/*
 * NOTE(review): cond_syscall() (from <asm/unistd.h>) presumably makes
 * each entry a weak reference so that sys_ni_syscall() is used when the
 * corresponding subsystem is configured out — verify against the arch's
 * definition.
 */
cond_syscall(sys_nfsservctl);
cond_syscall(sys_quotactl);
cond_syscall(sys_acct);
cond_syscall(sys_lookup_dcookie);
cond_syscall(sys_swapon);
cond_syscall(sys_swapoff);
cond_syscall(sys_kexec_load);
cond_syscall(compat_sys_kexec_load);
cond_syscall(sys_init_module);
cond_syscall(sys_delete_module);
cond_syscall(sys_socketpair);
cond_syscall(sys_bind);
cond_syscall(sys_listen);
cond_syscall(sys_accept);
cond_syscall(sys_connect);
cond_syscall(sys_getsockname);
cond_syscall(sys_getpeername);
cond_syscall(sys_sendto);
cond_syscall(sys_send);
cond_syscall(sys_recvfrom);
cond_syscall(sys_recv);
cond_syscall(sys_socket);
cond_syscall(sys_setsockopt);
cond_syscall(sys_getsockopt);
cond_syscall(sys_shutdown);
cond_syscall(sys_sendmsg);
cond_syscall(sys_recvmsg);
cond_syscall(sys_socketcall);
cond_syscall(sys_futex);
cond_syscall(compat_sys_futex);
cond_syscall(sys_set_robust_list);
cond_syscall(compat_sys_set_robust_list);
cond_syscall(sys_get_robust_list);
cond_syscall(compat_sys_get_robust_list);
cond_syscall(sys_epoll_create);
cond_syscall(sys_epoll_ctl);
cond_syscall(sys_epoll_wait);
cond_syscall(sys_epoll_pwait);
cond_syscall(sys_semget);
cond_syscall(sys_semop);
cond_syscall(sys_semtimedop);
cond_syscall(sys_semctl);
cond_syscall(sys_msgget);
cond_syscall(sys_msgsnd);
cond_syscall(sys_msgrcv);
cond_syscall(sys_msgctl);
cond_syscall(sys_shmget);
cond_syscall(sys_shmat);
cond_syscall(sys_shmdt);
cond_syscall(sys_shmctl);
cond_syscall(sys_mq_open);
cond_syscall(sys_mq_unlink);
cond_syscall(sys_mq_timedsend);
cond_syscall(sys_mq_timedreceive);
cond_syscall(sys_mq_notify);
cond_syscall(sys_mq_getsetattr);
cond_syscall(compat_sys_mq_open);
cond_syscall(compat_sys_mq_timedsend);
cond_syscall(compat_sys_mq_timedreceive);
cond_syscall(compat_sys_mq_notify);
cond_syscall(compat_sys_mq_getsetattr);
cond_syscall(sys_mbind);
cond_syscall(sys_get_mempolicy);
cond_syscall(sys_set_mempolicy);
cond_syscall(compat_sys_mbind);
cond_syscall(compat_sys_get_mempolicy);
cond_syscall(compat_sys_set_mempolicy);
cond_syscall(sys_add_key);
cond_syscall(sys_request_key);
cond_syscall(sys_keyctl);
cond_syscall(compat_sys_keyctl);
cond_syscall(compat_sys_socketcall);
cond_syscall(sys_inotify_init);
cond_syscall(sys_inotify_add_watch);
cond_syscall(sys_inotify_rm_watch);
cond_syscall(sys_migrate_pages);
cond_syscall(sys_move_pages);
cond_syscall(sys_chown16);
cond_syscall(sys_fchown16);
cond_syscall(sys_getegid16);
cond_syscall(sys_geteuid16);
cond_syscall(sys_getgid16);
cond_syscall(sys_getgroups16);
cond_syscall(sys_getresgid16);
cond_syscall(sys_getresuid16);
cond_syscall(sys_getuid16);
cond_syscall(sys_lchown16);
cond_syscall(sys_setfsgid16);
cond_syscall(sys_setfsuid16);
cond_syscall(sys_setgid16);
cond_syscall(sys_setgroups16);
cond_syscall(sys_setregid16);
cond_syscall(sys_setresgid16);
cond_syscall(sys_setresuid16);
cond_syscall(sys_setreuid16);
cond_syscall(sys_setuid16);
cond_syscall(sys_vm86old);
cond_syscall(sys_vm86);
cond_syscall(compat_sys_ipc);
cond_syscall(compat_sys_sysctl);

/* arch-specific weak syscall entries */
cond_syscall(sys_pciconfig_read);
cond_syscall(sys_pciconfig_write);
cond_syscall(sys_pciconfig_iobase);
cond_syscall(sys32_ipc);
cond_syscall(sys32_sysctl);
cond_syscall(ppc_rtas);
cond_syscall(sys_spu_run);
cond_syscall(sys_spu_create);

/* mmu depending weak syscall entries */
cond_syscall(sys_mprotect);
cond_syscall(sys_msync);
cond_syscall(sys_mlock);
cond_syscall(sys_munlock);
cond_syscall(sys_mlockall);
cond_syscall(sys_munlockall);
cond_syscall(sys_mincore);
cond_syscall(sys_madvise);
cond_syscall(sys_mremap);
cond_syscall(sys_remap_file_pages);
cond_syscall(compat_sys_move_pages);
cond_syscall(compat_sys_migrate_pages);

/* block-layer dependent */
cond_syscall(sys_bdflush);
cond_syscall(sys_ioprio_set);
cond_syscall(sys_ioprio_get);
||||
2448
kernel/sysctl.c
Normal file
2448
kernel/sysctl.c
Normal file
File diff suppressed because it is too large
Load Diff
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user